diff --git a/rocclr/.clang-format b/rocclr/.clang-format new file mode 100644 index 0000000000..5572a72cdd --- /dev/null +++ b/rocclr/.clang-format @@ -0,0 +1,10 @@ +Language: Cpp +BasedOnStyle: Google +AlignEscapedNewlinesLeft: false +AlignOperands: false +ColumnLimit: 100 +AlwaysBreakTemplateDeclarations: false +DerivePointerAlignment: false +IndentFunctionDeclarationAfterType: false +MaxEmptyLinesToKeep: 2 +SortIncludes: false diff --git a/rocclr/runtime/device/appprofile.cpp b/rocclr/runtime/device/appprofile.cpp index b9181478ca..5fd61619fe 100644 --- a/rocclr/runtime/device/appprofile.cpp +++ b/rocclr/runtime/device/appprofile.cpp @@ -11,239 +11,219 @@ #include #ifdef BRAHMA -extern int SearchProfileOfAnApplication(const wchar_t* fileName, ADLApplicationProfile ** lppProfile); -#endif //BRAHMA +extern int SearchProfileOfAnApplication(const wchar_t* fileName, + ADLApplicationProfile** lppProfile); +#endif // BRAHMA -static void* __stdcall adlMallocCallback(int n) -{ - return malloc(n); -} +static void* __stdcall adlMallocCallback(int n) { return malloc(n); } -#define GETPROCADDRESS(_adltype_, _adlfunc_) (_adltype_)amd::Os::getSymbol(adlHandle_, #_adlfunc_); +#define GETPROCADDRESS(_adltype_, _adlfunc_) (_adltype_) amd::Os::getSymbol(adlHandle_, #_adlfunc_); namespace amd { #ifndef BRAHMA class ADL { -public: - ADL(); - ~ADL(); + public: + ADL(); + ~ADL(); - bool init(); + bool init(); - void* adlHandle() const { return adlHandle_; }; - ADL_CONTEXT_HANDLE adlContext() const { return adlContext_; } + void* adlHandle() const { return adlHandle_; }; + ADL_CONTEXT_HANDLE adlContext() const { return adlContext_; } - typedef int (*Adl2MainControlCreate)(ADL_MAIN_MALLOC_CALLBACK callback, - int iEnumConnectedAdapters, - ADL_CONTEXT_HANDLE* context); - typedef int (*Adl2MainControlDestroy)(ADL_CONTEXT_HANDLE context); - typedef int (*Adl2ConsoleModeFileDescriptorSet)(ADL_CONTEXT_HANDLE context, int fileDescriptor); - typedef int 
(*Adl2MainControlRefresh)(ADL_CONTEXT_HANDLE context); - typedef int (*Adl2ApplicationProfilesSystemReload)(ADL_CONTEXT_HANDLE context); - typedef int (*Adl2ApplicationProfilesProfileOfApplicationx2Search)(ADL_CONTEXT_HANDLE context, - const wchar_t* fileName, - const wchar_t* path, - const wchar_t* version, - const wchar_t* appProfileArea, - ADLApplicationProfile** lppProfile); + typedef int (*Adl2MainControlCreate)(ADL_MAIN_MALLOC_CALLBACK callback, + int iEnumConnectedAdapters, ADL_CONTEXT_HANDLE* context); + typedef int (*Adl2MainControlDestroy)(ADL_CONTEXT_HANDLE context); + typedef int (*Adl2ConsoleModeFileDescriptorSet)(ADL_CONTEXT_HANDLE context, int fileDescriptor); + typedef int (*Adl2MainControlRefresh)(ADL_CONTEXT_HANDLE context); + typedef int (*Adl2ApplicationProfilesSystemReload)(ADL_CONTEXT_HANDLE context); + typedef int (*Adl2ApplicationProfilesProfileOfApplicationx2Search)( + ADL_CONTEXT_HANDLE context, const wchar_t* fileName, const wchar_t* path, + const wchar_t* version, const wchar_t* appProfileArea, ADLApplicationProfile** lppProfile); - Adl2MainControlCreate adl2MainControlCreate; - Adl2MainControlDestroy adl2MainControlDestroy; - Adl2ConsoleModeFileDescriptorSet adl2ConsoleModeFileDescriptorSet; - Adl2MainControlRefresh adl2MainControlRefresh; - Adl2ApplicationProfilesSystemReload adl2ApplicationProfilesSystemReload; - Adl2ApplicationProfilesProfileOfApplicationx2Search adl2ApplicationProfilesProfileOfApplicationx2Search; + Adl2MainControlCreate adl2MainControlCreate; + Adl2MainControlDestroy adl2MainControlDestroy; + Adl2ConsoleModeFileDescriptorSet adl2ConsoleModeFileDescriptorSet; + Adl2MainControlRefresh adl2MainControlRefresh; + Adl2ApplicationProfilesSystemReload adl2ApplicationProfilesSystemReload; + Adl2ApplicationProfilesProfileOfApplicationx2Search + adl2ApplicationProfilesProfileOfApplicationx2Search; -private: - void* adlHandle_; - ADL_CONTEXT_HANDLE adlContext_; + private: + void* adlHandle_; + ADL_CONTEXT_HANDLE adlContext_; 
}; -ADL::ADL() : adlHandle_(NULL), - adlContext_(NULL) -{ - adl2MainControlCreate = NULL; - adl2MainControlDestroy = NULL; - adl2ConsoleModeFileDescriptorSet = NULL; - adl2MainControlRefresh = NULL; - adl2ApplicationProfilesSystemReload = NULL; - adl2ApplicationProfilesProfileOfApplicationx2Search = NULL; +ADL::ADL() : adlHandle_(NULL), adlContext_(NULL) { + adl2MainControlCreate = NULL; + adl2MainControlDestroy = NULL; + adl2ConsoleModeFileDescriptorSet = NULL; + adl2MainControlRefresh = NULL; + adl2ApplicationProfilesSystemReload = NULL; + adl2ApplicationProfilesProfileOfApplicationx2Search = NULL; } -ADL::~ADL() -{ - if (adl2MainControlDestroy != NULL) { - adl2MainControlDestroy(adlContext_); - } - adlContext_ = NULL; +ADL::~ADL() { + if (adl2MainControlDestroy != NULL) { + adl2MainControlDestroy(adlContext_); + } + adlContext_ = NULL; } -bool ADL::init() -{ - if (!adlHandle_) { - adlHandle_ = amd::Os::loadLibrary("atiadl" LP64_SWITCH(LINUX_SWITCH("xx", "xy"), "xx")); - } +bool ADL::init() { + if (!adlHandle_) { + adlHandle_ = amd::Os::loadLibrary("atiadl" LP64_SWITCH(LINUX_SWITCH("xx", "xy"), "xx")); + } - if (!adlHandle_) { + if (!adlHandle_) { + return false; + } + + adl2MainControlCreate = GETPROCADDRESS(Adl2MainControlCreate, ADL2_Main_Control_Create); + adl2MainControlDestroy = GETPROCADDRESS(Adl2MainControlDestroy, ADL2_Main_Control_Destroy); + adl2ConsoleModeFileDescriptorSet = + GETPROCADDRESS(Adl2ConsoleModeFileDescriptorSet, ADL2_ConsoleMode_FileDescriptor_Set); + adl2MainControlRefresh = GETPROCADDRESS(Adl2MainControlRefresh, ADL2_Main_Control_Refresh); + adl2ApplicationProfilesSystemReload = + GETPROCADDRESS(Adl2ApplicationProfilesSystemReload, ADL2_ApplicationProfiles_System_Reload); + adl2ApplicationProfilesProfileOfApplicationx2Search = + GETPROCADDRESS(Adl2ApplicationProfilesProfileOfApplicationx2Search, + ADL2_ApplicationProfiles_ProfileOfAnApplicationX2_Search); + + if (adl2MainControlCreate == NULL || adl2MainControlDestroy == NULL || + 
adl2MainControlRefresh == NULL || adl2ApplicationProfilesSystemReload == NULL || + adl2ApplicationProfilesProfileOfApplicationx2Search == NULL) { + return false; + } + + int result = adl2MainControlCreate(adlMallocCallback, 1, &adlContext_); + if (result != ADL_OK) { + // ADL2 is expected to return ADL_ERR_NO_XDISPLAY in Linux Console mode environment + if (result == ADL_ERR_NO_XDISPLAY) { + if (adl2ConsoleModeFileDescriptorSet == NULL || + adl2ConsoleModeFileDescriptorSet(adlContext_, ADL_UNSET) != ADL_OK) { return false; + } + adl2MainControlRefresh(adlContext_); + } else { + return false; } + } - adl2MainControlCreate = GETPROCADDRESS(Adl2MainControlCreate, ADL2_Main_Control_Create); - adl2MainControlDestroy = GETPROCADDRESS(Adl2MainControlDestroy, ADL2_Main_Control_Destroy); - adl2ConsoleModeFileDescriptorSet = GETPROCADDRESS(Adl2ConsoleModeFileDescriptorSet, ADL2_ConsoleMode_FileDescriptor_Set); - adl2MainControlRefresh = GETPROCADDRESS(Adl2MainControlRefresh, ADL2_Main_Control_Refresh); - adl2ApplicationProfilesSystemReload = GETPROCADDRESS(Adl2ApplicationProfilesSystemReload, - ADL2_ApplicationProfiles_System_Reload); - adl2ApplicationProfilesProfileOfApplicationx2Search = GETPROCADDRESS(Adl2ApplicationProfilesProfileOfApplicationx2Search, - ADL2_ApplicationProfiles_ProfileOfAnApplicationX2_Search); + // Reload is disabled in ADL with the change list 1198904 and ticket + // SWDEV-59442 - The ADL_ApplicationProfiles_System_Reload Function is not Re-entrant + // Returned value is ADL_ERR_NOT_SUPPORTED on Windows. 
+ adl2ApplicationProfilesSystemReload(adlContext_); - if (adl2MainControlCreate == NULL - || adl2MainControlDestroy == NULL - || adl2MainControlRefresh == NULL - || adl2ApplicationProfilesSystemReload == NULL - || adl2ApplicationProfilesProfileOfApplicationx2Search == NULL) { - return false; - } - - int result = adl2MainControlCreate(adlMallocCallback, 1, &adlContext_); - if (result != ADL_OK) { - // ADL2 is expected to return ADL_ERR_NO_XDISPLAY in Linux Console mode environment - if (result == ADL_ERR_NO_XDISPLAY) { - if(adl2ConsoleModeFileDescriptorSet == NULL - || adl2ConsoleModeFileDescriptorSet(adlContext_, ADL_UNSET) != ADL_OK) { - return false; - } - adl2MainControlRefresh(adlContext_); - } - else { - return false; - } - } - - // Reload is disabled in ADL with the change list 1198904 and ticket - // SWDEV-59442 - The ADL_ApplicationProfiles_System_Reload Function is not Re-entrant - // Returned value is ADL_ERR_NOT_SUPPORTED on Windows. - adl2ApplicationProfilesSystemReload(adlContext_); - - return true; + return true; } -#endif //BRAHMA +#endif // BRAHMA -AppProfile::AppProfile(): gpuvmHighAddr_(false), - profileOverridesAllSettings_(false) -{ - appFileName_ = amd::Os::getAppFileName(); - propertyDataMap_.insert(DataMap::value_type("BuildOptsAppend", - PropertyData(DataType_String, &buildOptsAppend_))); +AppProfile::AppProfile() : gpuvmHighAddr_(false), profileOverridesAllSettings_(false) { + appFileName_ = amd::Os::getAppFileName(); + propertyDataMap_.insert( + DataMap::value_type("BuildOptsAppend", PropertyData(DataType_String, &buildOptsAppend_))); } -AppProfile::~AppProfile() -{ +AppProfile::~AppProfile() {} +bool AppProfile::init() { + if (appFileName_.empty()) { + return false; + } + + // Convert appName to wide char for X2_Search ADL interface + size_t strLength = appFileName_.length() + 1; + wchar_t* appName = new wchar_t[strLength]; + + size_t success = mbstowcs(appName, appFileName_.c_str(), strLength); + if (success > 0) { + // mbstowcs was able 
to convert to wide character successfully. + appName[strLength - 1] = L'\0'; + } + + wsAppFileName_ = appName; + + delete appName; + + ParseApplicationProfile(); + + return true; } -bool AppProfile::init() -{ - if (appFileName_.empty()){ - return false; - } - - // Convert appName to wide char for X2_Search ADL interface - size_t strLength = appFileName_.length() + 1; - wchar_t *appName = new wchar_t[strLength]; - - size_t success = mbstowcs(appName, appFileName_.c_str(), strLength); - if (success > 0) { - // mbstowcs was able to convert to wide character successfully. - appName[strLength - 1] = L'\0'; - } - - wsAppFileName_ = appName; - - delete appName; - - ParseApplicationProfile(); - - return true; -} - -bool AppProfile::ParseApplicationProfile() -{ - ADLApplicationProfile* pProfile = NULL; +bool AppProfile::ParseApplicationProfile() { + ADLApplicationProfile* pProfile = NULL; #ifndef BRAHMA - amd::ADL* adl = new amd::ADL; - - if ((adl == NULL) || !adl->init()) { - delete adl; - return false; - } - - // Apply blb configurations - int result = adl->adl2ApplicationProfilesProfileOfApplicationx2Search( - adl->adlContext(), wsAppFileName_.c_str(), NULL, NULL, - L"OCL", &pProfile); + amd::ADL* adl = new amd::ADL; + if ((adl == NULL) || !adl->init()) { delete adl; + return false; + } -#else //BRAHMA + // Apply blb configurations + int result = adl->adl2ApplicationProfilesProfileOfApplicationx2Search( + adl->adlContext(), wsAppFileName_.c_str(), NULL, NULL, L"OCL", &pProfile); - if (!SearchProfileOfAnApplication(wsAppFileName_.c_str(), &pProfile)) { - return false; + delete adl; + +#else // BRAHMA + + if (!SearchProfileOfAnApplication(wsAppFileName_.c_str(), &pProfile)) { + return false; + } + +#endif // BRAHMA + + if (pProfile == NULL) { + return false; + } + + PropertyRecord* firstProperty = pProfile->record; + uint32_t valueOffset = 0; + const int BUFSIZE = 1024; + wchar_t wbuffer[BUFSIZE]; + char buffer[2 * BUFSIZE]; + + for (int index = 0; index < 
pProfile->iCount; index++) { + PropertyRecord* profileProperty = + reinterpret_cast((reinterpret_cast(firstProperty)) + valueOffset); + + // Get property name + char* propertyName = profileProperty->strName; + auto entry = propertyDataMap_.find(std::string(propertyName)); + if (entry == propertyDataMap_.end()) { + // unexpected name + valueOffset += (sizeof(PropertyRecord) + profileProperty->iDataSize - 4); + continue; } -#endif //BRAHMA - - if (pProfile == NULL) { - return false; + // Get the property value + switch (entry->second.type_) { + case DataType_Boolean: + *(reinterpret_cast(entry->second.data_)) = profileProperty->uData[0] ? true : false; + break; + case DataType_String: { + assert((size_t)(profileProperty->iDataSize) < sizeof(wbuffer) - 2 && + "app profile string too long"); + memset(wbuffer, 0, sizeof(wbuffer)); + memcpy(wbuffer, profileProperty->uData, profileProperty->iDataSize); + size_t len = wcstombs(buffer, wbuffer, sizeof(buffer)); + assert(len < sizeof(buffer) - 1 && "app profile string too long"); + *(reinterpret_cast(entry->second.data_)) = buffer; + break; + } + default: + break; } + valueOffset += (sizeof(PropertyRecord) + profileProperty->iDataSize - 4); + } - PropertyRecord* firstProperty = pProfile->record; - uint32_t valueOffset = 0; - const int BUFSIZE = 1024; - wchar_t wbuffer[BUFSIZE]; - char buffer[2 * BUFSIZE]; - - for (int index = 0; index < pProfile->iCount; index++) { - PropertyRecord* profileProperty = reinterpret_cast - ((reinterpret_cast(firstProperty)) + valueOffset); - - // Get property name - char* propertyName = profileProperty->strName; - auto entry = propertyDataMap_.find(std::string(propertyName)); - if (entry == propertyDataMap_.end()) { - // unexpected name - valueOffset += (sizeof(PropertyRecord) + profileProperty->iDataSize - 4); - continue; - } - - // Get the property value - switch (entry->second.type_) { - case DataType_Boolean: - *(reinterpret_cast(entry->second.data_)) = - profileProperty->uData[0] ? 
true : false; - break; - case DataType_String: { - assert((size_t)(profileProperty->iDataSize) < sizeof(wbuffer) - 2 && - "app profile string too long"); - memset(wbuffer, 0, sizeof(wbuffer)); - memcpy(wbuffer, profileProperty->uData, profileProperty->iDataSize); - size_t len = wcstombs(buffer, wbuffer, sizeof(buffer)); - assert(len < sizeof(buffer) - 1 && "app profile string too long"); - *(reinterpret_cast(entry->second.data_)) = buffer; - break; - } - default: - break; - } - valueOffset += (sizeof(PropertyRecord) + profileProperty->iDataSize - 4); - } - - free(pProfile); - return true; + free(pProfile); + return true; } - } diff --git a/rocclr/runtime/device/appprofile.hpp b/rocclr/runtime/device/appprofile.hpp index c328419447..e2bca4a408 100644 --- a/rocclr/runtime/device/appprofile.hpp +++ b/rocclr/runtime/device/appprofile.hpp @@ -10,43 +10,40 @@ namespace amd { class AppProfile { -public: - AppProfile(); - virtual ~AppProfile(); + public: + AppProfile(); + virtual ~AppProfile(); - bool init(); + bool init(); - const std::string& GetBuildOptsAppend() const { return buildOptsAppend_; } + const std::string& GetBuildOptsAppend() const { return buildOptsAppend_; } - const std::string& appFileName() const { return appFileName_; } + const std::string& appFileName() const { return appFileName_; } -protected: - enum DataTypes - { - DataType_Unknown = 0, - DataType_Boolean, - DataType_String, - }; + protected: + enum DataTypes { + DataType_Unknown = 0, + DataType_Boolean, + DataType_String, + }; - struct PropertyData { - PropertyData(DataTypes type, void* data): type_(type), data_(data) {} - DataTypes type_; //!< Data type - void* data_; //!< Pointer to the data - }; + struct PropertyData { + PropertyData(DataTypes type, void* data) : type_(type), data_(data) {} + DataTypes type_; //!< Data type + void* data_; //!< Pointer to the data + }; - typedef std::map DataMap; + typedef std::map DataMap; - DataMap propertyDataMap_; - std::string appFileName_; // without 
extension - std::wstring wsAppFileName_; + DataMap propertyDataMap_; + std::string appFileName_; // without extension + std::wstring wsAppFileName_; - virtual bool ParseApplicationProfile(); + virtual bool ParseApplicationProfile(); - bool gpuvmHighAddr_; // Currently not used. - bool profileOverridesAllSettings_; // Overrides hint flags and env.var. - std::string buildOptsAppend_; + bool gpuvmHighAddr_; // Currently not used. + bool profileOverridesAllSettings_; // Overrides hint flags and env.var. + std::string buildOptsAppend_; }; - } #endif - diff --git a/rocclr/runtime/device/blit.cpp b/rocclr/runtime/device/blit.cpp index b2fb94c278..dfb03972dc 100644 --- a/rocclr/runtime/device/blit.cpp +++ b/rocclr/runtime/device/blit.cpp @@ -10,766 +10,653 @@ namespace device { HostBlitManager::HostBlitManager(VirtualDevice& vDev, Setup setup) - : BlitManager(setup) - , vDev_(vDev) - , dev_(vDev.device()) -{ } + : BlitManager(setup), vDev_(vDev), dev_(vDev.device()) {} -bool -HostBlitManager::readBuffer( - device::Memory& srcMemory, - void* dstHost, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire) const -{ - // Map the device memory to CPU visible - void* src = srcMemory.cpuMap(vDev_, Memory::CpuReadOnly); - if (NULL == src) { - LogError("Couldn't map device memory for host read"); - return false; - } +bool HostBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, + const amd::Coord3D& origin, const amd::Coord3D& size, + bool entire) const { + // Map the device memory to CPU visible + void* src = srcMemory.cpuMap(vDev_, Memory::CpuReadOnly); + if (NULL == src) { + LogError("Couldn't map device memory for host read"); + return false; + } - // Copy memory - amd::Os::fastMemcpy(dstHost, - reinterpret_cast(src) + origin[0], size[0]); + // Copy memory + amd::Os::fastMemcpy(dstHost, reinterpret_cast(src) + origin[0], size[0]); - // Unmap device memory - srcMemory.cpuUnmap(vDev_); + // Unmap device memory + srcMemory.cpuUnmap(vDev_); - return 
true; + return true; } -bool -HostBlitManager::readBufferRect( - device::Memory& srcMemory, - void* dstHost, - const amd::BufferRect& bufRect, - const amd::BufferRect& hostRect, - const amd::Coord3D& size, - bool entire) const -{ - // Map source memory - void *src = srcMemory.cpuMap(vDev_, Memory::CpuReadOnly); - if (src == NULL) { - LogError("Couldn't map source memory"); - return false; +bool HostBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost, + const amd::BufferRect& bufRect, + const amd::BufferRect& hostRect, const amd::Coord3D& size, + bool entire) const { + // Map source memory + void* src = srcMemory.cpuMap(vDev_, Memory::CpuReadOnly); + if (src == NULL) { + LogError("Couldn't map source memory"); + return false; + } + + size_t srcOffset; + size_t dstOffset; + + for (size_t z = 0; z < size[2]; ++z) { + for (size_t y = 0; y < size[1]; ++y) { + srcOffset = bufRect.offset(0, y, z); + dstOffset = hostRect.offset(0, y, z); + + // Copy memory line by line + amd::Os::fastMemcpy((reinterpret_cast
(dstHost) + dstOffset), + (reinterpret_cast(src) + srcOffset), size[0]); } + } - size_t srcOffset; - size_t dstOffset; + // Unmap source memory + srcMemory.cpuUnmap(vDev_); - for (size_t z = 0; z < size[2]; ++z) { - for (size_t y = 0; y < size[1]; ++y) { - srcOffset = bufRect.offset(0, y, z); - dstOffset = hostRect.offset(0, y, z); - - // Copy memory line by line - amd::Os::fastMemcpy( - (reinterpret_cast
(dstHost) + dstOffset), - (reinterpret_cast(src) + srcOffset), - size[0]); - } - } - - // Unmap source memory - srcMemory.cpuUnmap(vDev_); - - return true; + return true; } -bool -HostBlitManager::readImage( - device::Memory& srcMemory, - void* dstHost, - const amd::Coord3D& origin, - const amd::Coord3D& size, - size_t rowPitch, - size_t slicePitch, - bool entire) const -{ - size_t startLayer = origin[2]; - size_t numLayers = size[2]; - if (srcMemory.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) { - startLayer = origin[1]; - numLayers = size[1]; - } +bool HostBlitManager::readImage(device::Memory& srcMemory, void* dstHost, + const amd::Coord3D& origin, const amd::Coord3D& size, + size_t rowPitch, size_t slicePitch, bool entire) const { + size_t startLayer = origin[2]; + size_t numLayers = size[2]; + if (srcMemory.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) { + startLayer = origin[1]; + numLayers = size[1]; + } - // rowPitch and slicePitch in bytes - size_t srcRowPitch; - size_t srcSlicePitch; + // rowPitch and slicePitch in bytes + size_t srcRowPitch; + size_t srcSlicePitch; - // Get physical GPU memmory - void* src = srcMemory.cpuMap(vDev_, Memory::CpuReadOnly, - startLayer, numLayers, &srcRowPitch, &srcSlicePitch); - if (NULL == src) { - LogError("Couldn't map GPU memory for host read"); - return false; - } + // Get physical GPU memmory + void* src = srcMemory.cpuMap(vDev_, Memory::CpuReadOnly, startLayer, numLayers, &srcRowPitch, + &srcSlicePitch); + if (NULL == src) { + LogError("Couldn't map GPU memory for host read"); + return false; + } - size_t elementSize = srcMemory.owner()->asImage()->getImageFormat().getElementSize(); - size_t srcOffsBase = origin[0] * elementSize; - size_t copySize = size[0] * elementSize; - size_t srcOffs; - size_t dstOffs = 0; + size_t elementSize = srcMemory.owner()->asImage()->getImageFormat().getElementSize(); + size_t srcOffsBase = origin[0] * elementSize; + size_t copySize = size[0] * elementSize; + size_t srcOffs; + 
size_t dstOffs = 0; - // Make sure we use the right pitch if it's not specified - if (rowPitch == 0) { - rowPitch = size[0] * elementSize; - } + // Make sure we use the right pitch if it's not specified + if (rowPitch == 0) { + rowPitch = size[0] * elementSize; + } - // Make sure we use the right slice if it's not specified - if (slicePitch == 0) { - slicePitch = size[0] * size[1] * elementSize; - } + // Make sure we use the right slice if it's not specified + if (slicePitch == 0) { + slicePitch = size[0] * size[1] * elementSize; + } - // Adjust destination offset with Y dimension - srcOffsBase += srcRowPitch * origin[1]; + // Adjust destination offset with Y dimension + srcOffsBase += srcRowPitch * origin[1]; - // Adjust the destination offset with Z dimension - srcOffsBase += srcSlicePitch * origin[2]; + // Adjust the destination offset with Z dimension + srcOffsBase += srcSlicePitch * origin[2]; + + // Copy memory line by line + for (size_t slice = 0; slice < size[2]; ++slice) { + srcOffs = srcOffsBase + slice * srcSlicePitch; + dstOffs = slice * slicePitch; // Copy memory line by line - for (size_t slice = 0; slice < size[2]; ++slice) { - srcOffs = srcOffsBase + slice * srcSlicePitch; - dstOffs = slice * slicePitch; + for (size_t row = 0; row < size[1]; ++row) { + // Copy memory + amd::Os::fastMemcpy((reinterpret_cast
(dstHost) + dstOffs), + (reinterpret_cast(src) + srcOffs), copySize); - // Copy memory line by line - for (size_t row = 0; row < size[1]; ++row) { - // Copy memory - amd::Os::fastMemcpy( - (reinterpret_cast
(dstHost) + dstOffs), - (reinterpret_cast(src) + srcOffs), - copySize); - - srcOffs += srcRowPitch; - dstOffs += rowPitch; - } + srcOffs += srcRowPitch; + dstOffs += rowPitch; } + } - // Unmap the device memory - srcMemory.cpuUnmap(vDev_); + // Unmap the device memory + srcMemory.cpuUnmap(vDev_); - return true; + return true; } -bool -HostBlitManager::writeBuffer( - const void* srcHost, - device::Memory& dstMemory, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire) const -{ - uint flags = 0; - if (entire) { - flags = Memory::CpuWriteOnly; - } +bool HostBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory, + const amd::Coord3D& origin, const amd::Coord3D& size, + bool entire) const { + uint flags = 0; + if (entire) { + flags = Memory::CpuWriteOnly; + } - // Map the device memory to CPU visible - void* dst = dstMemory.cpuMap(vDev_, flags); - if (NULL == dst) { - LogError("Couldn't map GPU memory for host write"); - return false; - } + // Map the device memory to CPU visible + void* dst = dstMemory.cpuMap(vDev_, flags); + if (NULL == dst) { + LogError("Couldn't map GPU memory for host write"); + return false; + } - // Copy memory - amd::Os::fastMemcpy( - reinterpret_cast
(dst) + origin[0], srcHost, size[0]); + // Copy memory + amd::Os::fastMemcpy(reinterpret_cast
(dst) + origin[0], srcHost, size[0]); - // Unmap the device memory - dstMemory.cpuUnmap(vDev_); + // Unmap the device memory + dstMemory.cpuUnmap(vDev_); - return true; + return true; } -bool -HostBlitManager::writeBufferRect( - const void* srcHost, - device::Memory& dstMemory, - const amd::BufferRect& hostRect, - const amd::BufferRect& bufRect, - const amd::Coord3D& size, - bool entire) const -{ - // Map destination memory - void *dst = dstMemory.cpuMap(vDev_, (entire) ? Memory::CpuWriteOnly : 0); - if (dst == NULL) { - LogError("Couldn't map destination memory"); - return false; +bool HostBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMemory, + const amd::BufferRect& hostRect, + const amd::BufferRect& bufRect, const amd::Coord3D& size, + bool entire) const { + // Map destination memory + void* dst = dstMemory.cpuMap(vDev_, (entire) ? Memory::CpuWriteOnly : 0); + if (dst == NULL) { + LogError("Couldn't map destination memory"); + return false; + } + + size_t srcOffset; + size_t dstOffset; + + for (size_t z = 0; z < size[2]; ++z) { + for (size_t y = 0; y < size[1]; ++y) { + srcOffset = hostRect.offset(0, y, z); + dstOffset = bufRect.offset(0, y, z); + + // Copy memory line by line + amd::Os::fastMemcpy((reinterpret_cast
(dst) + dstOffset), + (reinterpret_cast(srcHost) + srcOffset), size[0]); } + } - size_t srcOffset; - size_t dstOffset; + // Unmap destination memory + dstMemory.cpuUnmap(vDev_); - for (size_t z = 0; z < size[2]; ++z) { - for (size_t y = 0; y < size[1]; ++y) { - srcOffset = hostRect.offset(0, y, z); - dstOffset = bufRect.offset(0, y, z); - - // Copy memory line by line - amd::Os::fastMemcpy( - (reinterpret_cast
(dst) + dstOffset), - (reinterpret_cast(srcHost) + srcOffset), - size[0]); - } - } - - // Unmap destination memory - dstMemory.cpuUnmap(vDev_); - - return true; + return true; } -bool -HostBlitManager::writeImage( - const void* srcHost, - device::Memory& dstMemory, - const amd::Coord3D& origin, - const amd::Coord3D& size, - size_t rowPitch, - size_t slicePitch, - bool entire) const -{ - uint flags = 0; - if (entire) { - flags = Memory::CpuWriteOnly; +bool HostBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory, + const amd::Coord3D& origin, const amd::Coord3D& size, + size_t rowPitch, size_t slicePitch, bool entire) const { + uint flags = 0; + if (entire) { + flags = Memory::CpuWriteOnly; + } + + size_t startLayer = origin[2]; + size_t numLayers = size[2]; + if (dstMemory.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) { + startLayer = origin[1]; + numLayers = size[1]; + } + + // rowPitch and slicePitch in bytes + size_t dstRowPitch; + size_t dstSlicePitch; + // Map the device memory to CPU visible + void* dst = dstMemory.cpuMap(vDev_, flags, startLayer, numLayers, &dstRowPitch, &dstSlicePitch); + if (NULL == dst) { + LogError("Couldn't map GPU memory for host write"); + return false; + } + + size_t elementSize = dstMemory.owner()->asImage()->getImageFormat().getElementSize(); + size_t srcOffs = 0; + size_t copySize = size[0] * elementSize; + size_t dstOffsBase = origin[0] * elementSize; + size_t dstOffs; + + // Make sure we use the right pitch if it's not specified + if (rowPitch == 0) { + rowPitch = size[0] * elementSize; + } + + // Make sure we use the right slice if it's not specified + if (slicePitch == 0) { + slicePitch = size[0] * size[1] * elementSize; + } + + // Adjust the destination offset with Y dimension + dstOffsBase += dstRowPitch * origin[1]; + + // Adjust the destination offset with Z dimension + dstOffsBase += dstSlicePitch * origin[2]; + + // Copy memory slice by slice + for (size_t slice = 0; slice < size[2]; ++slice) { + 
dstOffs = dstOffsBase + slice * dstSlicePitch; + srcOffs = slice * slicePitch; + + // Copy memory line by line + for (size_t row = 0; row < size[1]; ++row) { + // Copy memory + amd::Os::fastMemcpy((reinterpret_cast
(dst) + dstOffs), + (reinterpret_cast(srcHost) + srcOffs), copySize); + + dstOffs += dstRowPitch; + srcOffs += rowPitch; } + } - size_t startLayer = origin[2]; - size_t numLayers = size[2]; - if (dstMemory.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) { - startLayer = origin[1]; - numLayers = size[1]; - } + // Unmap the device memory + dstMemory.cpuUnmap(vDev_); - // rowPitch and slicePitch in bytes - size_t dstRowPitch; - size_t dstSlicePitch; - // Map the device memory to CPU visible - void* dst = dstMemory.cpuMap(vDev_, flags, - startLayer, numLayers, &dstRowPitch, &dstSlicePitch); - if (NULL == dst) { - LogError("Couldn't map GPU memory for host write"); - return false; - } - - size_t elementSize = dstMemory.owner()->asImage()->getImageFormat().getElementSize(); - size_t srcOffs = 0; - size_t copySize = size[0] * elementSize; - size_t dstOffsBase = origin[0] * elementSize; - size_t dstOffs; - - // Make sure we use the right pitch if it's not specified - if (rowPitch == 0) { - rowPitch = size[0] * elementSize; - } - - // Make sure we use the right slice if it's not specified - if (slicePitch == 0) { - slicePitch = size[0] * size[1] * elementSize; - } - - // Adjust the destination offset with Y dimension - dstOffsBase += dstRowPitch * origin[1]; - - // Adjust the destination offset with Z dimension - dstOffsBase += dstSlicePitch * origin[2]; - - // Copy memory slice by slice - for (size_t slice = 0; slice < size[2]; ++slice) { - dstOffs = dstOffsBase + slice * dstSlicePitch; - srcOffs = slice * slicePitch; - - // Copy memory line by line - for (size_t row = 0; row < size[1]; ++row) { - // Copy memory - amd::Os::fastMemcpy( - (reinterpret_cast
(dst) + dstOffs), - (reinterpret_cast(srcHost) + srcOffs), - copySize); - - dstOffs += dstRowPitch; - srcOffs += rowPitch; - } - } - - // Unmap the device memory - dstMemory.cpuUnmap(vDev_); - - return true; + return true; } -bool -HostBlitManager::copyBuffer( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire) const -{ - // Map source memory - void *src = srcMemory.cpuMap(vDev_, - // Overlap detection - (&srcMemory == &dstMemory) ? 0 : Memory::CpuReadOnly); - if (src == NULL) { - LogError("Couldn't map source memory"); - return false; - } +bool HostBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, bool entire) const { + // Map source memory + void* src = srcMemory.cpuMap(vDev_, + // Overlap detection + (&srcMemory == &dstMemory) ? 0 : Memory::CpuReadOnly); + if (src == NULL) { + LogError("Couldn't map source memory"); + return false; + } - // Map destination memory - void *dst = dstMemory.cpuMap(vDev_, (entire) ? Memory::CpuWriteOnly : 0); - if (dst == NULL) { - LogError("Couldn't map destination memory"); - return false; - } + // Map destination memory + void* dst = dstMemory.cpuMap(vDev_, (entire) ? Memory::CpuWriteOnly : 0); + if (dst == NULL) { + LogError("Couldn't map destination memory"); + return false; + } - // Straight forward buffer copy - amd::Os::fastMemcpy( - (reinterpret_cast
(dst) + dstOrigin[0]), - (reinterpret_cast(src) + srcOrigin[0]), - size[0]); + // Straight forward buffer copy + amd::Os::fastMemcpy((reinterpret_cast
(dst) + dstOrigin[0]), + (reinterpret_cast(src) + srcOrigin[0]), size[0]); - // Unmap source and destination memory - dstMemory.cpuUnmap(vDev_); - srcMemory.cpuUnmap(vDev_); + // Unmap source and destination memory + dstMemory.cpuUnmap(vDev_); + srcMemory.cpuUnmap(vDev_); - return true; + return true; } -bool -HostBlitManager::copyBufferRect( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::BufferRect& srcRect, - const amd::BufferRect& dstRect, - const amd::Coord3D& size, - bool entire) const -{ - // Map source memory - void *src = srcMemory.cpuMap(vDev_, - // Overlap detection - (&srcMemory == &dstMemory) ? 0 : Memory::CpuReadOnly); - if (src == NULL) { - LogError("Couldn't map source memory"); - return false; +bool HostBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::BufferRect& srcRect, const amd::BufferRect& dstRect, + const amd::Coord3D& size, bool entire) const { + // Map source memory + void* src = srcMemory.cpuMap(vDev_, + // Overlap detection + (&srcMemory == &dstMemory) ? 0 : Memory::CpuReadOnly); + if (src == NULL) { + LogError("Couldn't map source memory"); + return false; + } + + // Map destination memory + void* dst = dstMemory.cpuMap(vDev_, (entire) ? Memory::CpuWriteOnly : 0); + if (dst == NULL) { + LogError("Couldn't map destination memory"); + return false; + } + + for (size_t z = 0; z < size[2]; ++z) { + for (size_t y = 0; y < size[1]; ++y) { + size_t srcOffset = srcRect.offset(0, y, z); + size_t dstOffset = dstRect.offset(0, y, z); + + // Copy memory line by line + amd::Os::fastMemcpy((reinterpret_cast
(dst) + dstOffset), + (reinterpret_cast(src) + srcOffset), size[0]); } + } - // Map destination memory - void *dst = dstMemory.cpuMap(vDev_, (entire) ? Memory::CpuWriteOnly : 0); - if (dst == NULL) { - LogError("Couldn't map destination memory"); - return false; - } + // Unmap source and destination memory + dstMemory.cpuUnmap(vDev_); + srcMemory.cpuUnmap(vDev_); - for (size_t z = 0; z < size[2]; ++z) { - for (size_t y = 0; y < size[1]; ++y) { - size_t srcOffset = srcRect.offset(0, y, z); - size_t dstOffset = dstRect.offset(0, y, z); - - // Copy memory line by line - amd::Os::fastMemcpy( - (reinterpret_cast
(dst) + dstOffset), - (reinterpret_cast(src) + srcOffset), - size[0]); - } - } - - // Unmap source and destination memory - dstMemory.cpuUnmap(vDev_); - srcMemory.cpuUnmap(vDev_); - - return true; + return true; } -bool -HostBlitManager::copyImageToBuffer( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire, - size_t rowPitch, - size_t slicePitch) const -{ - size_t startLayer = srcOrigin[2]; - size_t numLayers = size[2]; - if (srcMemory.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) { - startLayer = srcOrigin[1]; - numLayers = size[1]; +bool HostBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, const amd::Coord3D& size, + bool entire, size_t rowPitch, size_t slicePitch) const { + size_t startLayer = srcOrigin[2]; + size_t numLayers = size[2]; + if (srcMemory.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) { + startLayer = srcOrigin[1]; + numLayers = size[1]; + } + // rowPitch and slicePitch in bytes + size_t srcRowPitch; + size_t srcSlicePitch; + // Map source memory + void* src = srcMemory.cpuMap(vDev_, Memory::CpuReadOnly, startLayer, numLayers, &srcRowPitch, + &srcSlicePitch); + if (src == NULL) { + LogError("Couldn't map source memory"); + return false; + } + size_t elementSize = srcMemory.owner()->asImage()->getImageFormat().getElementSize(); + + // Map destination memory + void* dst = dstMemory.cpuMap(vDev_, (entire) ? 
Memory::CpuWriteOnly : 0); + if (dst == NULL) { + LogError("Couldn't map destination memory"); + return false; + } + + size_t srcOffs = srcOrigin[0]; + size_t dstOffs = dstOrigin[0]; + size_t srcOffsOrg; + size_t copySize = size[0]; + + // Calculate the offset in bytes + srcOffs *= elementSize; + copySize *= elementSize; + + // Adjust source offset with Y and Z dimensions + srcOffs += srcRowPitch * srcOrigin[1]; + srcOffs += srcSlicePitch * srcOrigin[2]; + + srcOffsOrg = srcOffs; + + // Copy memory slice by slice + for (size_t slice = 0; slice < size[2]; ++slice) { + srcOffs = srcOffsOrg + slice * srcSlicePitch; + + // Copy memory line by line + for (size_t rows = 0; rows < size[1]; ++rows) { + amd::Os::fastMemcpy((reinterpret_cast
(dst) + dstOffs), + (reinterpret_cast(src) + srcOffs), copySize); + + srcOffs += srcRowPitch; + dstOffs += copySize; } - // rowPitch and slicePitch in bytes - size_t srcRowPitch; - size_t srcSlicePitch; - // Map source memory - void *src = srcMemory.cpuMap(vDev_, Memory::CpuReadOnly, - startLayer, numLayers, &srcRowPitch, &srcSlicePitch); - if (src == NULL) { - LogError("Couldn't map source memory"); - return false; - } - size_t elementSize = srcMemory.owner()->asImage()->getImageFormat().getElementSize(); + } - // Map destination memory - void *dst = dstMemory.cpuMap(vDev_, (entire) ? Memory::CpuWriteOnly : 0); - if (dst == NULL) { - LogError("Couldn't map destination memory"); - return false; - } + // Unmap source and destination memory + srcMemory.cpuUnmap(vDev_); + dstMemory.cpuUnmap(vDev_); - size_t srcOffs = srcOrigin[0]; - size_t dstOffs = dstOrigin[0]; - size_t srcOffsOrg; - size_t copySize = size[0]; - - // Calculate the offset in bytes - srcOffs *= elementSize; - copySize *= elementSize; - - // Adjust source offset with Y and Z dimensions - srcOffs += srcRowPitch * srcOrigin[1]; - srcOffs += srcSlicePitch * srcOrigin[2]; - - srcOffsOrg = srcOffs; - - // Copy memory slice by slice - for (size_t slice = 0; slice < size[2]; ++slice) { - srcOffs = srcOffsOrg + slice * srcSlicePitch; - - // Copy memory line by line - for (size_t rows = 0; rows < size[1]; ++rows) { - amd::Os::fastMemcpy( - (reinterpret_cast
(dst) + dstOffs), - (reinterpret_cast(src) + srcOffs), - copySize); - - srcOffs += srcRowPitch; - dstOffs += copySize; - } - } - - // Unmap source and destination memory - srcMemory.cpuUnmap(vDev_); - dstMemory.cpuUnmap(vDev_); - - return true; + return true; } -bool -HostBlitManager::copyBufferToImage( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire, - size_t rowPitch, - size_t slicePitch) const -{ - // Map source memory - void *src = srcMemory.cpuMap(vDev_, Memory::CpuReadOnly); - if (src == NULL) { - LogError("Couldn't map source memory"); - return false; +bool HostBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, const amd::Coord3D& size, + bool entire, size_t rowPitch, size_t slicePitch) const { + // Map source memory + void* src = srcMemory.cpuMap(vDev_, Memory::CpuReadOnly); + if (src == NULL) { + LogError("Couldn't map source memory"); + return false; + } + + size_t startLayer = dstOrigin[2]; + size_t numLayers = size[2]; + if (dstMemory.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) { + startLayer = dstOrigin[1]; + numLayers = size[1]; + } + // rowPitch and slicePitch in bytes + size_t dstRowPitch; + size_t dstSlicePitch; + // Map destination memory + void* dst = dstMemory.cpuMap(vDev_, (entire) ? 
Memory::CpuWriteOnly : 0, startLayer, numLayers, + &dstRowPitch, &dstSlicePitch); + if (dst == NULL) { + LogError("Couldn't map destination memory"); + return false; + } + + size_t elementSize = dstMemory.owner()->asImage()->getImageFormat().getElementSize(); + size_t srcOffs = srcOrigin[0]; + size_t dstOffs = dstOrigin[0]; + size_t dstOffsOrg; + size_t copySize = size[0]; + + // Calculate the offset in bytes + dstOffs *= elementSize; + copySize *= elementSize; + + // Adjust destination offset with Y and Z dimension + dstOffs += dstRowPitch * dstOrigin[1]; + dstOffs += dstSlicePitch * dstOrigin[2]; + + dstOffsOrg = dstOffs; + + // Copy memory slice by slice + for (size_t slice = 0; slice < size[2]; ++slice) { + dstOffs = dstOffsOrg + slice * dstSlicePitch; + + // Copy memory line by line + for (size_t rows = 0; rows < size[1]; ++rows) { + amd::Os::fastMemcpy((reinterpret_cast
(dst) + dstOffs), + (reinterpret_cast(src) + srcOffs), copySize); + + srcOffs += copySize; + dstOffs += dstRowPitch; } + } - size_t startLayer = dstOrigin[2]; - size_t numLayers = size[2]; - if (dstMemory.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) { - startLayer = dstOrigin[1]; - numLayers = size[1]; - } - // rowPitch and slicePitch in bytes - size_t dstRowPitch; - size_t dstSlicePitch; - // Map destination memory - void *dst = dstMemory.cpuMap(vDev_, (entire) ? Memory::CpuWriteOnly : 0, - startLayer, numLayers, &dstRowPitch, &dstSlicePitch); - if (dst == NULL) { - LogError("Couldn't map destination memory"); - return false; - } + // Unmap source and destination memory + srcMemory.cpuUnmap(vDev_); + dstMemory.cpuUnmap(vDev_); - size_t elementSize = dstMemory.owner()->asImage()->getImageFormat().getElementSize(); - size_t srcOffs = srcOrigin[0]; - size_t dstOffs = dstOrigin[0]; - size_t dstOffsOrg; - size_t copySize = size[0]; - - // Calculate the offset in bytes - dstOffs *= elementSize; - copySize *= elementSize; - - // Adjust destination offset with Y and Z dimension - dstOffs += dstRowPitch * dstOrigin[1]; - dstOffs += dstSlicePitch * dstOrigin[2]; - - dstOffsOrg = dstOffs; - - // Copy memory slice by slice - for (size_t slice = 0; slice < size[2]; ++slice) { - dstOffs = dstOffsOrg + slice * dstSlicePitch; - - // Copy memory line by line - for (size_t rows = 0; rows < size[1]; ++rows) { - amd::Os::fastMemcpy( - (reinterpret_cast
(dst) + dstOffs), - (reinterpret_cast(src) + srcOffs), - copySize); - - srcOffs += copySize; - dstOffs += dstRowPitch; - } - } - - // Unmap source and destination memory - srcMemory.cpuUnmap(vDev_); - dstMemory.cpuUnmap(vDev_); - - return true; + return true; } -bool -HostBlitManager::copyImage( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire) const -{ - size_t startLayer = srcOrigin[2]; - size_t numLayers = size[2]; - if (srcMemory.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) { - startLayer = srcOrigin[1]; - numLayers = size[1]; - } - // rowPitch and slicePitch in bytes - size_t srcRowPitch; - size_t srcSlicePitch; - // Map source memory - void *src = srcMemory.cpuMap(vDev_, Memory::CpuReadOnly, - startLayer, numLayers, &srcRowPitch, &srcSlicePitch); - if (src == NULL) { - LogError("Couldn't map source memory"); - return false; - } - if (dstMemory.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) { - startLayer = dstOrigin[1]; - numLayers = size[1]; - } - else { - startLayer = dstOrigin[2]; - numLayers = size[2]; +bool HostBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, bool entire) const { + size_t startLayer = srcOrigin[2]; + size_t numLayers = size[2]; + if (srcMemory.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) { + startLayer = srcOrigin[1]; + numLayers = size[1]; + } + // rowPitch and slicePitch in bytes + size_t srcRowPitch; + size_t srcSlicePitch; + // Map source memory + void* src = srcMemory.cpuMap(vDev_, Memory::CpuReadOnly, startLayer, numLayers, &srcRowPitch, + &srcSlicePitch); + if (src == NULL) { + LogError("Couldn't map source memory"); + return false; + } + if (dstMemory.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) { + startLayer = dstOrigin[1]; + numLayers = size[1]; + } else { + startLayer = 
dstOrigin[2]; + numLayers = size[2]; + } + + // rowPitch and slicePitch in bytes + size_t dstRowPitch; + size_t dstSlicePitch; + // Map destination memory + void* dst = dstMemory.cpuMap(vDev_, (entire) ? Memory::CpuWriteOnly : 0, startLayer, numLayers, + &dstRowPitch, &dstSlicePitch); + if (dst == NULL) { + LogError("Couldn't map destination memory"); + return false; + } + + size_t elementSize = dstMemory.owner()->asImage()->getImageFormat().getElementSize(); + assert(elementSize == srcMemory.owner()->asImage()->getImageFormat().getElementSize()); + + size_t srcOffs = srcOrigin[0]; + size_t dstOffs = dstOrigin[0]; + size_t srcOffsOrg; + size_t dstOffsOrg; + size_t copySize = size[0]; + + // Calculate the offsets in bytes + srcOffs *= elementSize; + dstOffs *= elementSize; + copySize *= elementSize; + + // Adjust destination and sorce offsets with Y dimension + srcOffs += srcRowPitch * srcOrigin[1]; + dstOffs += dstRowPitch * dstOrigin[1]; + + // Adjust destination and sorce offsets with Z dimension + srcOffs += srcSlicePitch * srcOrigin[2]; + dstOffs += dstSlicePitch * dstOrigin[2]; + + srcOffsOrg = srcOffs; + dstOffsOrg = dstOffs; + + // Copy memory slice by slice + for (size_t slice = 0; slice < size[2]; ++slice) { + srcOffs = srcOffsOrg + slice * srcSlicePitch; + dstOffs = dstOffsOrg + slice * dstSlicePitch; + + // Copy memory line by line + for (size_t rows = 0; rows < size[1]; ++rows) { + amd::Os::fastMemcpy((reinterpret_cast
(dst) + dstOffs), + (reinterpret_cast(src) + srcOffs), copySize); + + srcOffs += srcRowPitch; + dstOffs += dstRowPitch; } + } - // rowPitch and slicePitch in bytes - size_t dstRowPitch; - size_t dstSlicePitch; - // Map destination memory - void *dst = dstMemory.cpuMap(vDev_, (entire) ? Memory::CpuWriteOnly : 0, - startLayer, numLayers, &dstRowPitch, &dstSlicePitch); - if (dst == NULL) { - LogError("Couldn't map destination memory"); - return false; - } + // Unmap source and destination memory + srcMemory.cpuUnmap(vDev_); + dstMemory.cpuUnmap(vDev_); - size_t elementSize = dstMemory.owner()->asImage()->getImageFormat().getElementSize(); - assert(elementSize == srcMemory.owner()->asImage()->getImageFormat().getElementSize()); - - size_t srcOffs = srcOrigin[0]; - size_t dstOffs = dstOrigin[0]; - size_t srcOffsOrg; - size_t dstOffsOrg; - size_t copySize = size[0]; - - // Calculate the offsets in bytes - srcOffs *= elementSize; - dstOffs *= elementSize; - copySize *= elementSize; - - // Adjust destination and sorce offsets with Y dimension - srcOffs += srcRowPitch * srcOrigin[1]; - dstOffs += dstRowPitch * dstOrigin[1]; - - // Adjust destination and sorce offsets with Z dimension - srcOffs += srcSlicePitch * srcOrigin[2]; - dstOffs += dstSlicePitch * dstOrigin[2]; - - srcOffsOrg = srcOffs; - dstOffsOrg = dstOffs; - - // Copy memory slice by slice - for (size_t slice = 0; slice < size[2]; ++slice) { - srcOffs = srcOffsOrg + slice * srcSlicePitch; - dstOffs = dstOffsOrg + slice * dstSlicePitch; - - // Copy memory line by line - for (size_t rows = 0; rows < size[1]; ++rows) { - amd::Os::fastMemcpy( - (reinterpret_cast
(dst) + dstOffs), - (reinterpret_cast(src) + srcOffs), - copySize); - - srcOffs += srcRowPitch; - dstOffs += dstRowPitch; - } - } - - // Unmap source and destination memory - srcMemory.cpuUnmap(vDev_); - dstMemory.cpuUnmap(vDev_); - - return true; + return true; } -bool -HostBlitManager::fillBuffer( - device::Memory& memory, - const void* pattern, - size_t patternSize, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire - ) const -{ - // Map memory - void* fillMem = memory.cpuMap(vDev_, (entire) ? Memory::CpuWriteOnly : 0); - if (fillMem == NULL) { - LogError("Couldn't map destination memory"); - return false; - } +bool HostBlitManager::fillBuffer(device::Memory& memory, const void* pattern, size_t patternSize, + const amd::Coord3D& origin, const amd::Coord3D& size, + bool entire) const { + // Map memory + void* fillMem = memory.cpuMap(vDev_, (entire) ? Memory::CpuWriteOnly : 0); + if (fillMem == NULL) { + LogError("Couldn't map destination memory"); + return false; + } - size_t offset = origin[0]; - size_t fillSize = size[0]; + size_t offset = origin[0]; + size_t fillSize = size[0]; - if ((fillSize % patternSize) != 0) { - LogError("Misaligned buffer size and pattern size!"); - } + if ((fillSize % patternSize) != 0) { + LogError("Misaligned buffer size and pattern size!"); + } - // Fill the buffer memory with a pattern - for (size_t i = 0; i < (fillSize / patternSize); i++) { - memcpy( - (reinterpret_cast
(fillMem) + offset), - (reinterpret_cast(pattern)), - patternSize - ); - offset += patternSize; - } + // Fill the buffer memory with a pattern + for (size_t i = 0; i < (fillSize / patternSize); i++) { + memcpy((reinterpret_cast
(fillMem) + offset), + (reinterpret_cast(pattern)), patternSize); + offset += patternSize; + } - // Unmap source and destination memory - memory.cpuUnmap(vDev_); + // Unmap source and destination memory + memory.cpuUnmap(vDev_); - return true; + return true; } -bool -HostBlitManager::fillImage( - device::Memory& memory, - const void* pattern, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire - ) const -{ - size_t startLayer = origin[2]; - size_t numLayers = size[2]; - if (memory.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) { - startLayer = origin[1]; - numLayers = size[1]; +bool HostBlitManager::fillImage(device::Memory& memory, const void* pattern, + const amd::Coord3D& origin, const amd::Coord3D& size, + bool entire) const { + size_t startLayer = origin[2]; + size_t numLayers = size[2]; + if (memory.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) { + startLayer = origin[1]; + numLayers = size[1]; + } + // rowPitch and slicePitch in bytes + size_t devRowPitch; + size_t devSlicePitch; + + void* newpattern = const_cast(pattern); + cl_float4 fFillColor; + + // Converting a linear RGB floating-point color value to a normalized 8-bit unsigned integer sRGB + // value so that the cpu path can treat sRGB as RGB for host transfer. + if (memory.owner()->asImage()->getImageFormat().image_channel_order == CL_sRGBA) { + float* fColor = static_cast(newpattern); + fFillColor.s[0] = sRGBmap(fColor[0]) / 255.0f; + fFillColor.s[1] = sRGBmap(fColor[1]) / 255.0f; + fFillColor.s[2] = sRGBmap(fColor[2]) / 255.0f; + fFillColor.s[3] = fColor[3]; + newpattern = static_cast(&fFillColor); + } + + // Map memory + void* fillMem = memory.cpuMap(vDev_, (entire) ? 
Memory::CpuWriteOnly : 0, startLayer, numLayers, + &devRowPitch, &devSlicePitch); + if (fillMem == NULL) { + LogError("Couldn't map destination memory"); + return false; + } + + float fillValue[4]; + memset(fillValue, 0, sizeof(fillValue)); + memory.owner()->asImage()->getImageFormat().formatColor(newpattern, fillValue); + + size_t elementSize = memory.owner()->asImage()->getImageFormat().getElementSize(); + size_t offset = origin[0] * elementSize; + size_t offsetOrg; + + // Adjust offset with Y dimension + offset += devRowPitch * origin[1]; + + // Adjust offset with Z dimension + offset += devSlicePitch * origin[2]; + + offsetOrg = offset; + + // Fill the image memory with a pattern + for (size_t slice = 0; slice < size[2]; ++slice) { + offset = offsetOrg + slice * devSlicePitch; + + for (size_t rows = 0; rows < size[1]; ++rows) { + size_t pixOffset = offset; + + // Copy memory pixel by pixel + for (size_t column = 0; column < size[0]; ++column) { + memcpy((reinterpret_cast
(fillMem) + pixOffset), + (reinterpret_cast(fillValue)), elementSize); + pixOffset += elementSize; + } + + offset += devRowPitch; } - // rowPitch and slicePitch in bytes - size_t devRowPitch; - size_t devSlicePitch; + } - void *newpattern = const_cast(pattern); - cl_float4 fFillColor; + // Unmap memory + memory.cpuUnmap(vDev_); - // Converting a linear RGB floating-point color value to a normalized 8-bit unsigned integer sRGB value so that the cpu path can treat sRGB as RGB for host transfer. - if (memory.owner()->asImage()->getImageFormat().image_channel_order == CL_sRGBA) { - float *fColor = static_cast(newpattern); - fFillColor.s[0] = sRGBmap(fColor[0]) / 255.0f; - fFillColor.s[1] = sRGBmap(fColor[1]) / 255.0f; - fFillColor.s[2] = sRGBmap(fColor[2]) / 255.0f; - fFillColor.s[3] = fColor[3]; - newpattern = static_cast(&fFillColor); - } - - // Map memory - void* fillMem = memory.cpuMap(vDev_, (entire) ? Memory::CpuWriteOnly : 0, - startLayer, numLayers, &devRowPitch, &devSlicePitch); - if (fillMem == NULL) { - LogError("Couldn't map destination memory"); - return false; - } - - float fillValue[4]; - memset(fillValue, 0, sizeof(fillValue)); - memory.owner()->asImage()->getImageFormat().formatColor(newpattern, fillValue); - - size_t elementSize = memory.owner()->asImage()->getImageFormat().getElementSize(); - size_t offset = origin[0] * elementSize; - size_t offsetOrg; - - // Adjust offset with Y dimension - offset += devRowPitch * origin[1]; - - // Adjust offset with Z dimension - offset += devSlicePitch * origin[2]; - - offsetOrg = offset; - - // Fill the image memory with a pattern - for (size_t slice = 0; slice < size[2]; ++slice) { - offset = offsetOrg + slice * devSlicePitch; - - for (size_t rows = 0; rows < size[1]; ++rows) { - size_t pixOffset = offset; - - // Copy memory pixel by pixel - for (size_t column = 0; column < size[0]; ++column) { - memcpy( - (reinterpret_cast
(fillMem) + pixOffset), - (reinterpret_cast(fillValue)), - elementSize - ); - pixOffset += elementSize; - } - - offset += devRowPitch; - } - } - - // Unmap memory - memory.cpuUnmap(vDev_); - - return true; + return true; } -cl_uint -HostBlitManager::sRGBmap(float fc) const -{ - double c = (double)fc; +cl_uint HostBlitManager::sRGBmap(float fc) const { + double c = (double)fc; #ifdef ATI_OS_LINUX - if (isnan(c)) - c = 0.0; + if (isnan(c)) c = 0.0; #else - if (_isnan(c)) - c = 0.0; + if (_isnan(c)) c = 0.0; #endif - if (c > 1.0) - c = 1.0; - else if (c < 0.0) - c = 0.0; - else if (c < 0.0031308) - c = 12.92 * c; - else - c = (1055.0/1000.0) * pow(c, 5.0/12.0) - (55.0/1000.0); + if (c > 1.0) + c = 1.0; + else if (c < 0.0) + c = 0.0; + else if (c < 0.0031308) + c = 12.92 * c; + else + c = (1055.0 / 1000.0) * pow(c, 5.0 / 12.0) - (55.0 / 1000.0); - return (cl_uint)(c * 255.0 + 0.5); + return (cl_uint)(c * 255.0 + 0.5); } -} // namespace gpu +} // namespace gpu diff --git a/rocclr/runtime/device/blit.hpp b/rocclr/runtime/device/blit.hpp index 226637f653..386a979e92 100644 --- a/rocclr/runtime/device/blit.hpp +++ b/rocclr/runtime/device/blit.hpp @@ -17,354 +17,325 @@ namespace device { //! Blit Manager Abstraction class -class BlitManager : public amd::HeapObject -{ -public: - //! 
HW accelerated setup - union Setup { - struct { - uint disableReadBuffer_ : 1; - uint disableReadBufferRect_ : 1; - uint disableReadImage_ : 1; - uint disableWriteBuffer_ : 1; - uint disableWriteBufferRect_ : 1; - uint disableWriteImage_ : 1; - uint disableCopyBuffer_ : 1; - uint disableCopyBufferRect_ : 1; - uint disableCopyImageToBuffer_ : 1; - uint disableCopyBufferToImage_ : 1; - uint disableCopyImage_ : 1; - uint disableFillBuffer_ : 1; - uint disableFillImage_ : 1; - uint disableCopyBufferToImageOpt_: 1; - uint disableHwlCopyBuffer_ : 1; - }; - uint32_t value_; - Setup() : value_(0) {} - void disableAll() { value_ = 0xffffffff; } +class BlitManager : public amd::HeapObject { + public: + //! HW accelerated setup + union Setup { + struct { + uint disableReadBuffer_ : 1; + uint disableReadBufferRect_ : 1; + uint disableReadImage_ : 1; + uint disableWriteBuffer_ : 1; + uint disableWriteBufferRect_ : 1; + uint disableWriteImage_ : 1; + uint disableCopyBuffer_ : 1; + uint disableCopyBufferRect_ : 1; + uint disableCopyImageToBuffer_ : 1; + uint disableCopyBufferToImage_ : 1; + uint disableCopyImage_ : 1; + uint disableFillBuffer_ : 1; + uint disableFillImage_ : 1; + uint disableCopyBufferToImageOpt_ : 1; + uint disableHwlCopyBuffer_ : 1; }; + uint32_t value_; + Setup() : value_(0) {} + void disableAll() { value_ = 0xffffffff; } + }; -public: - //! Constructor - BlitManager( - Setup setup = Setup() //!< Specifies HW accelerated blits - ) : setup_(setup), syncOperation_(false) {} + public: + //! Constructor + BlitManager(Setup setup = Setup() //!< Specifies HW accelerated blits + ) + : setup_(setup), syncOperation_(false) {} - //! Destructor - virtual ~BlitManager() { } + //! Destructor + virtual ~BlitManager() {} - //! Creates HostBlitManager object - virtual bool create(amd::Device& device) { return true; } + //! Creates HostBlitManager object + virtual bool create(amd::Device& device) { return true; } - //! 
Copies a buffer object to system memory - virtual bool readBuffer( - Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destination host memory - const amd::Coord3D& origin, //!< Source origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const = 0; + //! Copies a buffer object to system memory + virtual bool readBuffer(Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destination host memory + const amd::Coord3D& origin, //!< Source origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const = 0; - //! Copies a buffer object to system memory - virtual bool readBufferRect( - Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destinaiton host memory - const amd::BufferRect& bufRect, //!< Source rectangle - const amd::BufferRect& hostRect, //!< Destination rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const = 0; + //! Copies a buffer object to system memory + virtual bool readBufferRect(Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destinaiton host memory + const amd::BufferRect& bufRect, //!< Source rectangle + const amd::BufferRect& hostRect, //!< Destination rectangle + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const = 0; - //! Copies an image object to system memory - virtual bool readImage( - Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destination host memory - const amd::Coord3D& origin, //!< Source origin - const amd::Coord3D& size, //!< Size of the copy region - size_t rowPitch, //!< Row pitch for host memory - size_t slicePitch, //!< Slice pitch for host memory - bool entire = false //!< Entire buffer will be updated - ) const = 0; + //! 
Copies an image object to system memory + virtual bool readImage(Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destination host memory + const amd::Coord3D& origin, //!< Source origin + const amd::Coord3D& size, //!< Size of the copy region + size_t rowPitch, //!< Row pitch for host memory + size_t slicePitch, //!< Slice pitch for host memory + bool entire = false //!< Entire buffer will be updated + ) const = 0; - //! Copies system memory to a buffer object - virtual bool writeBuffer( - const void* srcHost, //!< Source host memory - Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const = 0; + //! Copies system memory to a buffer object + virtual bool writeBuffer(const void* srcHost, //!< Source host memory + Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const = 0; - //! Copies system memory to a buffer object - virtual bool writeBufferRect( - const void* srcHost, //!< Source host memory - Memory& dstMemory, //!< Destination memory object - const amd::BufferRect& hostRect, //!< Destination rectangle - const amd::BufferRect& bufRect, //!< Source rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const = 0; + //! Copies system memory to a buffer object + virtual bool writeBufferRect(const void* srcHost, //!< Source host memory + Memory& dstMemory, //!< Destination memory object + const amd::BufferRect& hostRect, //!< Destination rectangle + const amd::BufferRect& bufRect, //!< Source rectangle + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const = 0; - //! 
Copies system memory to an image object - virtual bool writeImage( - const void* srcHost, //!< Source host memory - Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - size_t rowPitch, //!< Row pitch for host memory - size_t slicePitch, //!< Slice pitch for host memory - bool entire = false //!< Entire buffer will be updated - ) const = 0; + //! Copies system memory to an image object + virtual bool writeImage(const void* srcHost, //!< Source host memory + Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + size_t rowPitch, //!< Row pitch for host memory + size_t slicePitch, //!< Slice pitch for host memory + bool entire = false //!< Entire buffer will be updated + ) const = 0; - //! Copies a buffer object to another buffer object - virtual bool copyBuffer( - Memory& srcMemory, //!< Source memory object - Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const = 0; + //! Copies a buffer object to another buffer object + virtual bool copyBuffer(Memory& srcMemory, //!< Source memory object + Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const = 0; - //! 
Copies a buffer object to another buffer object - virtual bool copyBufferRect( - Memory& srcMemory, //!< Source memory object - Memory& dstMemory, //!< Destination memory object - const amd::BufferRect& srcRect, //!< Source rectangle - const amd::BufferRect& dstRect, //!< Destination rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const = 0; + //! Copies a buffer object to another buffer object + virtual bool copyBufferRect(Memory& srcMemory, //!< Source memory object + Memory& dstMemory, //!< Destination memory object + const amd::BufferRect& srcRect, //!< Source rectangle + const amd::BufferRect& dstRect, //!< Destination rectangle + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const = 0; - //! Copies an image object to a buffer object - virtual bool copyImageToBuffer( - Memory& srcMemory, //!< Source memory object - Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const = 0; + //! Copies an image object to a buffer object + virtual bool copyImageToBuffer(Memory& srcMemory, //!< Source memory object + Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const = 0; - //! 
Copies a buffer object to an image object - virtual bool copyBufferToImage( - Memory& srcMemory, //!< Source memory object - Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const = 0; + //! Copies a buffer object to an image object + virtual bool copyBufferToImage(Memory& srcMemory, //!< Source memory object + Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const = 0; - //! Copies an image object to another image object - virtual bool copyImage( - Memory& srcMemory, //!< Source memory object - Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const = 0; + //! Copies an image object to another image object + virtual bool copyImage(Memory& srcMemory, //!< Source memory object + Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const = 0; - //! 
Fills a buffer memory with a pattern data - virtual bool fillBuffer( - Memory& memory, //!< Memory object to fill with pattern - const void* pattern, //!< Pattern data - size_t patternSize, //!< Pattern size - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const = 0; + //! Fills a buffer memory with a pattern data + virtual bool fillBuffer(Memory& memory, //!< Memory object to fill with pattern + const void* pattern, //!< Pattern data + size_t patternSize, //!< Pattern size + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const = 0; - //! Fills an image memory with a pattern data - virtual bool fillImage( - Memory& dstMemory, //!< Memory object to fill with pattern - const void* pattern, //!< Pattern data - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const = 0; + //! Fills an image memory with a pattern data + virtual bool fillImage(Memory& dstMemory, //!< Memory object to fill with pattern + const void* pattern, //!< Pattern data + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const = 0; - //! Enables synchronization on blit operations - void enableSynchronization() { syncOperation_ = true; } + //! Enables synchronization on blit operations + void enableSynchronization() { syncOperation_ = true; } -protected: - const Setup setup_; //!< HW accelerated blit requested - bool syncOperation_; //!< Blit operations are synchronized + protected: + const Setup setup_; //!< HW accelerated blit requested + bool syncOperation_; //!< Blit operations are synchronized -private: - //! 
Disable copy constructor - BlitManager(const BlitManager&); + private: + //! Disable copy constructor + BlitManager(const BlitManager&); - //! Disable operator= - BlitManager& operator=(const BlitManager&); + //! Disable operator= + BlitManager& operator=(const BlitManager&); }; //! Host Blit Manager -class HostBlitManager : public device::BlitManager -{ -public: - //! Constructor - HostBlitManager( - VirtualDevice& vdev, //!< Virtual GPU to be used for blits - Setup setup = Setup() //!< Specifies HW accelerated blits - ); +class HostBlitManager : public device::BlitManager { + public: + //! Constructor + HostBlitManager(VirtualDevice& vdev, //!< Virtual GPU to be used for blits + Setup setup = Setup() //!< Specifies HW accelerated blits + ); - //! Destructor - virtual ~HostBlitManager() { } + //! Destructor + virtual ~HostBlitManager() {} - //! Creates HostBlitManager object - virtual bool create(amd::Device& device) { return true; } + //! Creates HostBlitManager object + virtual bool create(amd::Device& device) { return true; } - //! Copies a buffer object to system memory - virtual bool readBuffer( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destination host memory - const amd::Coord3D& origin, //!< Source origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies a buffer object to system memory + virtual bool readBuffer(device::Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destination host memory + const amd::Coord3D& origin, //!< Source origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! 
Copies a buffer object to system memory - virtual bool readBufferRect( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destinaiton host memory - const amd::BufferRect& bufRect, //!< Source rectangle - const amd::BufferRect& hostRect, //!< Destination rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies a buffer object to system memory + virtual bool readBufferRect(device::Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destinaiton host memory + const amd::BufferRect& bufRect, //!< Source rectangle + const amd::BufferRect& hostRect, //!< Destination rectangle + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies an image object to system memory - virtual bool readImage( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destination host memory - const amd::Coord3D& origin, //!< Source origin - const amd::Coord3D& size, //!< Size of the copy region - size_t rowPitch, //!< Row pitch for host memory - size_t slicePitch, //!< Slice pitch for host memory - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies an image object to system memory + virtual bool readImage(device::Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destination host memory + const amd::Coord3D& origin, //!< Source origin + const amd::Coord3D& size, //!< Size of the copy region + size_t rowPitch, //!< Row pitch for host memory + size_t slicePitch, //!< Slice pitch for host memory + bool entire = false //!< Entire buffer will be updated + ) const; - //! 
Copies system memory to a buffer object - virtual bool writeBuffer( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies system memory to a buffer object + virtual bool writeBuffer(const void* srcHost, //!< Source host memory + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies system memory to a buffer object - virtual bool writeBufferRect( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::BufferRect& hostRect, //!< Destination rectangle - const amd::BufferRect& bufRect, //!< Source rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies system memory to a buffer object + virtual bool writeBufferRect(const void* srcHost, //!< Source host memory + device::Memory& dstMemory, //!< Destination memory object + const amd::BufferRect& hostRect, //!< Destination rectangle + const amd::BufferRect& bufRect, //!< Source rectangle + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! 
Copies system memory to an image object - virtual bool writeImage( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - size_t rowPitch, //!< Row pitch for host memory - size_t slicePitch, //!< Slice pitch for host memory - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies system memory to an image object + virtual bool writeImage(const void* srcHost, //!< Source host memory + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + size_t rowPitch, //!< Row pitch for host memory + size_t slicePitch, //!< Slice pitch for host memory + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies a buffer object to another buffer object - virtual bool copyBuffer( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies a buffer object to another buffer object + virtual bool copyBuffer(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! 
Copies a buffer object to another buffer object - virtual bool copyBufferRect( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::BufferRect& srcRect, //!< Source rectangle - const amd::BufferRect& dstRect, //!< Destination rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies a buffer object to another buffer object + virtual bool copyBufferRect(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::BufferRect& srcRect, //!< Source rectangle + const amd::BufferRect& dstRect, //!< Destination rectangle + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies an image object to a buffer object - virtual bool copyImageToBuffer( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const; + //! Copies an image object to a buffer object + virtual bool copyImageToBuffer(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const; - //! 
Copies a buffer object to an image object - virtual bool copyBufferToImage( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const; + //! Copies a buffer object to an image object + virtual bool copyBufferToImage(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const; - //! Copies an image object to another image object - virtual bool copyImage( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies an image object to another image object + virtual bool copyImage(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! 
Fills a buffer memory with a pattern data - virtual bool fillBuffer( - device::Memory& memory, //!< Memory object to fill with pattern - const void* pattern, //!< Pattern data - size_t patternSize, //!< Pattern size - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Fills a buffer memory with a pattern data + virtual bool fillBuffer(device::Memory& memory, //!< Memory object to fill with pattern + const void* pattern, //!< Pattern data + size_t patternSize, //!< Pattern size + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Fills an image memory with a pattern data - virtual bool fillImage( - device::Memory& dstMemory, //!< Memory object to fill with pattern - const void* pattern, //!< Pattern data - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Fills an image memory with a pattern data + virtual bool fillImage(device::Memory& dstMemory, //!< Memory object to fill with pattern + const void* pattern, //!< Pattern data + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - cl_uint sRGBmap(float fc) const; + cl_uint sRGBmap(float fc) const; -protected: - VirtualDevice& vDev_; //!< Virtual device object - const amd::Device& dev_; //!< Physical device + protected: + VirtualDevice& vDev_; //!< Virtual device object + const amd::Device& dev_; //!< Physical device -private: - //! Disable copy constructor - HostBlitManager(const HostBlitManager&); + private: + //! Disable copy constructor + HostBlitManager(const HostBlitManager&); - //! 
Disable operator= - HostBlitManager& operator=(const HostBlitManager&); + //! Disable operator= + HostBlitManager& operator=(const HostBlitManager&); }; /*@}*/} // namespace device diff --git a/rocclr/runtime/device/blitcl.cpp b/rocclr/runtime/device/blitcl.cpp index 853ebab465..1d2c409eaf 100644 --- a/rocclr/runtime/device/blitcl.cpp +++ b/rocclr/runtime/device/blitcl.cpp @@ -6,161 +6,86 @@ namespace device { #define BLIT_KERNELS(...) #__VA_ARGS__ -const char* BlitSourceCode = -BLIT_KERNELS( -extern void __amd_copyBufferRect( - __global uchar*, __global uchar*, - ulong4, ulong4, ulong4); +const char* BlitSourceCode = BLIT_KERNELS( + extern void __amd_copyBufferRect(__global uchar*, __global uchar*, ulong4, ulong4, ulong4); -extern void __amd_copyBufferRectAligned( - __global uint*, __global uint*, - ulong4, ulong4, ulong4); + extern void __amd_copyBufferRectAligned(__global uint*, __global uint*, ulong4, ulong4, ulong4); -extern void __amd_copyBuffer( - __global uchar*, __global uchar*, - ulong, ulong, ulong, uint); + extern void __amd_copyBuffer(__global uchar*, __global uchar*, ulong, ulong, ulong, uint); -extern void __amd_copyBufferAligned( - __global uint*, __global uint*, - ulong, ulong, ulong, uint); + extern void __amd_copyBufferAligned(__global uint*, __global uint*, ulong, ulong, ulong, uint); -extern void __amd_fillBuffer( - __global uchar*, __global uint*, __constant uchar*, - uint, ulong, ulong); + extern void __amd_fillBuffer(__global uchar*, __global uint*, __constant uchar*, uint, ulong, + ulong); -__kernel void copyBufferRect( - __global uchar* src, - __global uchar* dst, - ulong4 srcRect, - ulong4 dstRect, - ulong4 size) -{ - __amd_copyBufferRect(src, dst, srcRect, dstRect, size); -} + __kernel void copyBufferRect(__global uchar* src, __global uchar* dst, ulong4 srcRect, + ulong4 dstRect, ulong4 size) { + __amd_copyBufferRect(src, dst, srcRect, dstRect, size); + } -__kernel void copyBufferRectAligned( - __global uint* src, - __global uint* dst, - 
ulong4 srcRect, - ulong4 dstRect, - ulong4 size) -{ - __amd_copyBufferRectAligned(src, dst, srcRect, dstRect, size); -} + __kernel void copyBufferRectAligned(__global uint* src, __global uint* dst, ulong4 srcRect, + ulong4 dstRect, ulong4 size) { + __amd_copyBufferRectAligned(src, dst, srcRect, dstRect, size); + } -__kernel void copyBuffer( - __global uchar* srcI, - __global uchar* dstI, - ulong srcOrigin, - ulong dstOrigin, - ulong size, - uint remain) -{ - __amd_copyBuffer(srcI, dstI, srcOrigin, dstOrigin, size, remain); -} + __kernel void copyBuffer(__global uchar* srcI, __global uchar* dstI, ulong srcOrigin, + ulong dstOrigin, ulong size, uint remain) { + __amd_copyBuffer(srcI, dstI, srcOrigin, dstOrigin, size, remain); + } -__kernel void copyBufferAligned( - __global uint* src, - __global uint* dst, - ulong srcOrigin, - ulong dstOrigin, - ulong size, - uint alignment) -{ - __amd_copyBufferAligned(src, dst, srcOrigin, dstOrigin, size, alignment); -} + __kernel void copyBufferAligned(__global uint* src, __global uint* dst, ulong srcOrigin, + ulong dstOrigin, ulong size, uint alignment) { + __amd_copyBufferAligned(src, dst, srcOrigin, dstOrigin, size, alignment); + } -__kernel void fillBuffer( - __global uchar* bufUChar, - __global uint* bufUInt, - __constant uchar* pattern, - uint patternSize, - ulong offset, - ulong size) -{ - __amd_fillBuffer(bufUChar, bufUInt, pattern, patternSize, offset, size); -} -extern void __amd_copyBufferToImage( - __global uint*, __write_only image2d_array_t, ulong4, - int4, int4, uint4, ulong4); + __kernel void fillBuffer(__global uchar* bufUChar, __global uint* bufUInt, + __constant uchar* pattern, uint patternSize, ulong offset, + ulong size) { + __amd_fillBuffer(bufUChar, bufUInt, pattern, patternSize, offset, size); + } extern void __amd_copyBufferToImage(__global uint*, __write_only image2d_array_t, ulong4, + int4, int4, uint4, ulong4); -extern void __amd_copyImageToBuffer( - __read_only image2d_array_t, __global uint*, __global 
ushort*, - __global uchar*, int4, ulong4, int4, uint4, ulong4); + extern void __amd_copyImageToBuffer(__read_only image2d_array_t, __global uint*, + __global ushort*, __global uchar*, int4, ulong4, int4, + uint4, ulong4); -extern void __amd_copyImage( - __read_only image2d_array_t, __write_only image2d_array_t, - int4, int4, int4); + extern void __amd_copyImage(__read_only image2d_array_t, __write_only image2d_array_t, int4, + int4, int4); -extern void __amd_copyImage1DA( - __read_only image2d_array_t, __write_only image2d_array_t, - int4, int4, int4); + extern void __amd_copyImage1DA(__read_only image2d_array_t, __write_only image2d_array_t, int4, + int4, int4); -extern void __amd_fillImage( - __write_only image2d_array_t, - float4, int4, uint4, int4, int4, uint); + extern void __amd_fillImage(__write_only image2d_array_t, float4, int4, uint4, int4, int4, + uint); -__kernel void copyBufferToImage( - __global uint* src, - __write_only image2d_array_t dst, - ulong4 srcOrigin, - int4 dstOrigin, - int4 size, - uint4 format, - ulong4 pitch) -{ - __amd_copyBufferToImage(src, dst, srcOrigin, dstOrigin, size, format, pitch); -} + __kernel void copyBufferToImage(__global uint* src, __write_only image2d_array_t dst, + ulong4 srcOrigin, int4 dstOrigin, int4 size, uint4 format, + ulong4 pitch) { + __amd_copyBufferToImage(src, dst, srcOrigin, dstOrigin, size, format, pitch); + } -__kernel void copyImageToBuffer( - __read_only image2d_array_t src, - __global uint* dstUInt, - __global ushort* dstUShort, - __global uchar* dstUChar, - int4 srcOrigin, - ulong4 dstOrigin, - int4 size, - uint4 format, - ulong4 pitch) -{ - __amd_copyImageToBuffer(src, dstUInt, dstUShort, dstUChar, - srcOrigin, dstOrigin, size, format, pitch); -} + __kernel void copyImageToBuffer(__read_only image2d_array_t src, __global uint* dstUInt, + __global ushort* dstUShort, __global uchar* dstUChar, + int4 srcOrigin, ulong4 dstOrigin, int4 size, uint4 format, + ulong4 pitch) { + __amd_copyImageToBuffer(src, 
dstUInt, dstUShort, dstUChar, srcOrigin, dstOrigin, size, format, + pitch); + } -__kernel void copyImage( - __read_only image2d_array_t src, - __write_only image2d_array_t dst, - int4 srcOrigin, - int4 dstOrigin, - int4 size) -{ - __amd_copyImage(src, dst, srcOrigin, dstOrigin, size); -} + __kernel void copyImage(__read_only image2d_array_t src, __write_only image2d_array_t dst, + int4 srcOrigin, int4 dstOrigin, + int4 size) { __amd_copyImage(src, dst, srcOrigin, dstOrigin, size); } -__kernel void copyImage1DA( - __read_only image2d_array_t src, - __write_only image2d_array_t dst, - int4 srcOrigin, - int4 dstOrigin, - int4 size) -{ - __amd_copyImage1DA(src, dst, srcOrigin, dstOrigin, size); -} + __kernel void copyImage1DA(__read_only image2d_array_t src, __write_only image2d_array_t dst, + int4 srcOrigin, int4 dstOrigin, int4 size) { + __amd_copyImage1DA(src, dst, srcOrigin, dstOrigin, size); + } -__kernel void fillImage( - __write_only image2d_array_t image, - float4 patternFLOAT4, - int4 patternINT4, - uint4 patternUINT4, - int4 origin, - int4 size, - uint type) -{ - __amd_fillImage(image, patternFLOAT4, patternINT4, patternUINT4, - origin, size, type); -} -) -; + __kernel void fillImage(__write_only image2d_array_t image, float4 patternFLOAT4, + int4 patternINT4, uint4 patternUINT4, int4 origin, int4 size, + uint type) { + __amd_fillImage(image, patternFLOAT4, patternINT4, patternUINT4, origin, size, type); + }); -} // namespace device +} // namespace device diff --git a/rocclr/runtime/device/cpu/cpubinary.cpp b/rocclr/runtime/device/cpu/cpubinary.cpp index b35d70b3bb..e8cf8496c2 100644 --- a/rocclr/runtime/device/cpu/cpubinary.cpp +++ b/rocclr/runtime/device/cpu/cpubinary.cpp @@ -16,74 +16,69 @@ namespace cpu { -ClBinary::FeatureCheckResult -ClBinary::checkFeatures() -{ - /* Validate that all cpu features of loaded binary target (i.e. elf_target) exists in current target. 
- * If some of elf_target features doesn't exist in current target we fail the build since we assume that elf LLVM-IR and binary are - * target specific and can't be recompiled to current target*/ - uint16_t target = (uint16_t)dev().settings().cpuFeatures_; - uint16_t elf_target; - amd::OclElf::oclElfPlatform platform; - if (!elfIn()->getTarget(elf_target, platform)){ - LogError("Loading OCL CPU binary: incorrect format"); - return ERROR; +ClBinary::FeatureCheckResult ClBinary::checkFeatures() { + /* Validate that all cpu features of loaded binary target (i.e. elf_target) exists in current + * target. + * If some of elf_target features doesn't exist in current target we fail the build since we + * assume that elf LLVM-IR and binary are + * target specific and can't be recompiled to current target*/ + uint16_t target = (uint16_t)dev().settings().cpuFeatures_; + uint16_t elf_target; + amd::OclElf::oclElfPlatform platform; + if (!elfIn()->getTarget(elf_target, platform)) { + LogError("Loading OCL CPU binary: incorrect format"); + return ERROR; + } + uint64_t chip_options = 0x0; + if (platform == amd::OclElf::COMPLIB_PLATFORM) { + // BIF 3.0 + uint32_t flag; + if (!elfIn()->getFlags(flag)) { + LogError("Loading OCL CPU binary: incorrect format"); + return ERROR; } - uint64_t chip_options=0x0; - if (platform == amd::OclElf::COMPLIB_PLATFORM) { - // BIF 3.0 - uint32_t flag; - if (!elfIn()->getFlags(flag)) { - LogError("Loading OCL CPU binary: incorrect format"); - return ERROR; - } - aclTargetInfo tgtInfo = aclGetTargetInfoFromChipID(LP64_SWITCH("x86", "x86-64"), flag, NULL); - chip_options = aclGetChipOptions(tgtInfo) ; - if (((target & chip_options) != chip_options) || - ((elf_target == EM_386) && (strcmp(LP64_SWITCH("x86", "x86-64"), "x86") != 0)) || - ((elf_target == EM_X86_64) && (strcmp(LP64_SWITCH("x86", "x86-64"), "x86-64") != 0))){ - LogError("Loading OCL CPU binary: different target"); - return ERROR; - } + aclTargetInfo tgtInfo = 
aclGetTargetInfoFromChipID(LP64_SWITCH("x86", "x86-64"), flag, NULL); + chip_options = aclGetChipOptions(tgtInfo); + if (((target & chip_options) != chip_options) || + ((elf_target == EM_386) && (strcmp(LP64_SWITCH("x86", "x86-64"), "x86") != 0)) || + ((elf_target == EM_X86_64) && (strcmp(LP64_SWITCH("x86", "x86-64"), "x86-64") != 0))) { + LogError("Loading OCL CPU binary: different target"); + return ERROR; } - else { - // BIF 2.0 - if ((platform != amd::OclElf::CPU_PLATFORM) || - ((target & elf_target) != elf_target)) { - LogError("Loading OCL CPU binary: different target"); - return ERROR; - } + } else { + // BIF 2.0 + if ((platform != amd::OclElf::CPU_PLATFORM) || ((target & elf_target) != elf_target)) { + LogError("Loading OCL CPU binary: different target"); + return ERROR; } - char* section; - size_t sz; + } + char* section; + size_t sz; - /* If current target has more cpu features than the one for which the binary was (notice it must have all features as in elf_target - * due to previous check), we can benefit from recompiling the LLVM-IR if exists in binary (if there are errors, ignore them !).*/ - if (((platform == amd::OclElf::CPU_PLATFORM) && - ((target ^ elf_target) != 0)) || - ((platform == amd::OclElf::COMPLIB_PLATFORM) && - ((target ^ chip_options) != 0))) { - if (elfIn_->getSection(amd::OclElf::LLVMIR, §ion, &sz)) { - if ((section != NULL) && (sz > 0)) { - // hasDLL being false to force recompiling - RECOMPILE; - } - } + /* If current target has more cpu features than the one for which the binary was (notice it must + * have all features as in elf_target + * due to previous check), we can benefit from recompiling the LLVM-IR if exists in binary (if + * there are errors, ignore them !).*/ + if (((platform == amd::OclElf::CPU_PLATFORM) && ((target ^ elf_target) != 0)) || + ((platform == amd::OclElf::COMPLIB_PLATFORM) && ((target ^ chip_options) != 0))) { + if (elfIn_->getSection(amd::OclElf::LLVMIR, §ion, &sz)) { + if ((section != NULL) && (sz > 0)) { 
+ // hasDLL being false to force recompiling + RECOMPILE; + } } - return OK; + } + return OK; } -bool -ClBinary::loadX86(Program& program, std::string& dllName, bool& hasDLL) -{ - hasDLL = false; +bool ClBinary::loadX86(Program& program, std::string& dllName, bool& hasDLL) { + hasDLL = false; - std::string tempName = amd::Os::getTempFileName(); + std::string tempName = amd::Os::getTempFileName(); - dllName = tempName - + "." WINDOWS_SWITCH("dll",MACOS_SWITCH("dyld","so")); + dllName = tempName + "." WINDOWS_SWITCH("dll", MACOS_SWITCH("dyld", "so")); - switch (checkFeatures()) { + switch (checkFeatures()) { case ERROR: return false; case RECOMPILE: @@ -91,67 +86,63 @@ ClBinary::loadX86(Program& program, std::string& dllName, bool& hasDLL) case OK: // Fallthrough break; - } + } - char* section; - size_t sz; + char* section; + size_t sz; - if (!elfIn_->getSection(amd::OclElf::DLL, §ion, &sz)) { - LogError("Loading OCL CPU binary: error occured!"); - return false; - } + if (!elfIn_->getSection(amd::OclElf::DLL, §ion, &sz)) { + LogError("Loading OCL CPU binary: error occured!"); + return false; + } - if ((section == NULL) || (sz == 0)) { - // hasDLL being false to force recompiling - return true; - } + if ((section == NULL) || (sz == 0)) { + // hasDLL being false to force recompiling + return true; + } - std::fstream f; - f.open(dllName.c_str(), (std::fstream::out | std::fstream::binary)); + std::fstream f; + f.open(dllName.c_str(), (std::fstream::out | std::fstream::binary)); - if (!f.is_open()) { + if (!f.is_open()) { #ifdef _WIN32 - amd::Os::unlink(tempName.c_str()); -#endif // _WIN32 - LogError("Loading OCL CPU binary: cannot open a file!"); - return false; - } - f.write(section, sz); - f.close(); + amd::Os::unlink(tempName.c_str()); +#endif // _WIN32 + LogError("Loading OCL CPU binary: cannot open a file!"); + return false; + } + f.write(section, sz); + f.close(); - hasDLL = true; - return true; + hasDLL = true; + return true; } -bool -ClBinary::storeX86(Program& 
program, std::string& dllName) -{ - std::fstream f; - f.open(dllName.c_str(), (std::fstream::in | std::fstream::binary)); - if (!f.is_open()) { - return false; - } +bool ClBinary::storeX86(Program& program, std::string& dllName) { + std::fstream f; + f.open(dllName.c_str(), (std::fstream::in | std::fstream::binary)); + if (!f.is_open()) { + return false; + } - f.seekg(0, std::fstream::end); - size_t x86CodeSize = f.tellg(); - f.seekg(0, std::fstream::beg); + f.seekg(0, std::fstream::end); + size_t x86CodeSize = f.tellg(); + f.seekg(0, std::fstream::beg); - if (saveISA()) { - char* x86Code = new char[x86CodeSize]; - f.read(x86Code, x86CodeSize); - elfOut_->addSection(amd::OclElf::DLL, x86Code, x86CodeSize); - delete [] x86Code; - } - f.close(); - return true; + if (saveISA()) { + char* x86Code = new char[x86CodeSize]; + f.read(x86Code, x86CodeSize); + elfOut_->addSection(amd::OclElf::DLL, x86Code, x86CodeSize); + delete[] x86Code; + } + f.close(); + return true; } -bool -ClBinary::loadX86JIT(Program& program, bool& hasJITBinary) -{ +bool ClBinary::loadX86JIT(Program& program, bool& hasJITBinary) { hasJITBinary = false; - switch (checkFeatures()) { + switch (checkFeatures()) { case ERROR: return false; case RECOMPILE: @@ -159,66 +150,62 @@ ClBinary::loadX86JIT(Program& program, bool& hasJITBinary) case OK: // Fallthrough break; - } + } - char* section; - size_t sz; + char* section; + size_t sz; - if (!elfIn_->getSection(amd::OclElf::JITBINARY, §ion, &sz)) { - LogError("Loading OCL CPU JIT binary: error occured!"); - return false; - } + if (!elfIn_->getSection(amd::OclElf::JITBINARY, §ion, &sz)) { + LogError("Loading OCL CPU JIT binary: error occured!"); + return false; + } - if ((section == NULL) || (sz == 0)) { - // force recompiling - return true; - } - acl_error err = ACL_SUCCESS; - program.setJITBinary(aclJITObjectImageCopy(program.compiler(), section, sz, &err)); - if (err != ACL_SUCCESS) { - LogWarning("aclJITObjectImageCopy failed"); - return false; - } - 
hasJITBinary = true; + if ((section == NULL) || (sz == 0)) { + // force recompiling return true; + } + acl_error err = ACL_SUCCESS; + program.setJITBinary(aclJITObjectImageCopy(program.compiler(), section, sz, &err)); + if (err != ACL_SUCCESS) { + LogWarning("aclJITObjectImageCopy failed"); + return false; + } + hasJITBinary = true; + return true; } void checkDifference(const char* buf1, const char* buf2, size_t size) { - for(size_t i = 0; i < size; ++i) { - if(buf1[i] != buf2[i]) { - printf("Index %d different",(int)i); + for (size_t i = 0; i < size; ++i) { + if (buf1[i] != buf2[i]) { + printf("Index %d different", (int)i); return; } } } -bool -ClBinary::storeX86JIT(Program& program) -{ +bool ClBinary::storeX86JIT(Program& program) { if (saveISA()) { acl_error err = ACL_SUCCESS; aclJITObjectImage objectImage = program.getJITBinary(); size_t x86CodeSize = aclJITObjectImageSize(program.compiler(), objectImage, &err); if (err != ACL_SUCCESS) { - LogWarning("aclJITObjectImageSize failed"); - return false; + LogWarning("aclJITObjectImageSize failed"); + return false; } const char* x86CodePtr = aclJITObjectImageData(program.compiler(), objectImage, &err); if (err != ACL_SUCCESS) { - LogWarning("aclJITObjectImageData failed"); - return false; + LogWarning("aclJITObjectImageData failed"); + return false; } elfOut_->addSection(amd::OclElf::JITBINARY, x86CodePtr, x86CodeSize); } return true; } -bool -ClBinary::storeX86Asm(const char* buffer, size_t size) -{ +bool ClBinary::storeX86Asm(const char* buffer, size_t size) { if (saveAS()) { elfOut_->addSection(amd::OclElf::ASTEXT, buffer, size); } return true; } -} // namespace cpu +} // namespace cpu diff --git a/rocclr/runtime/device/cpu/cpubinary.hpp b/rocclr/runtime/device/cpu/cpubinary.hpp index 1fa9bcf21e..79ce0bdc34 100644 --- a/rocclr/runtime/device/cpu/cpubinary.hpp +++ b/rocclr/runtime/device/cpu/cpubinary.hpp @@ -17,69 +17,59 @@ class Device; class Program; //! 
\class CPU binary -class ClBinary : public device::ClBinary -{ -public: - //! Constructor - ClBinary(const Device& dev) : device::ClBinary(dev) {} +class ClBinary : public device::ClBinary { + public: + //! Constructor + ClBinary(const Device& dev) : device::ClBinary(dev) {} - //! Destructor - ~ClBinary() {} + //! Destructor + ~ClBinary() {} - //! Loads x86 executable code - bool loadX86( - Program& prorgam, //!< CPU Program object - std::string& dllName, //!< Dll name of the CPU binary - bool& hasDLL //!< indicate if the OCL binary has DLL - ); + //! Loads x86 executable code + bool loadX86(Program& prorgam, //!< CPU Program object + std::string& dllName, //!< Dll name of the CPU binary + bool& hasDLL //!< indicate if the OCL binary has DLL + ); - //! Stores x86 executable code - bool storeX86( - Program& program, //!< CPU Program object - std::string& dllName //!< Dll name for the binary - ); + //! Stores x86 executable code + bool storeX86(Program& program, //!< CPU Program object + std::string& dllName //!< Dll name for the binary + ); - //! Loads x86 executable in-memory code - bool loadX86JIT( - Program& prorgam, //!< CPU Program object - bool& hasJITBin //!< indicate if the OCL binary has JIT binary - ); + //! Loads x86 executable in-memory code + bool loadX86JIT(Program& prorgam, //!< CPU Program object + bool& hasJITBin //!< indicate if the OCL binary has JIT binary + ); - //! Stores x86 executable in-memory code - bool storeX86JIT( - Program& program //!< CPU Program object - ); + //! Stores x86 executable in-memory code + bool storeX86JIT(Program& program //!< CPU Program object + ); - //! Set elf header information for CPU target - bool setElfTarget() { - uint32_t target = dev().settings().cpuFeatures_; - assert (((0xFFFF8000 & target) == 0) && "ASIC target ID >= 2^15"); - uint16_t elf_target = (uint16_t)(0x7FFF & target); - return elfOut()->setTarget(elf_target, amd::OclElf::CPU_PLATFORM); - } + //! 
Set elf header information for CPU target + bool setElfTarget() { + uint32_t target = dev().settings().cpuFeatures_; + assert(((0xFFFF8000 & target) == 0) && "ASIC target ID >= 2^15"); + uint16_t elf_target = (uint16_t)(0x7FFF & target); + return elfOut()->setTarget(elf_target, amd::OclElf::CPU_PLATFORM); + } bool storeX86Asm(const char* buffer, size_t size); -private: + private: + enum FeatureCheckResult { ERROR, RECOMPILE, OK }; - enum FeatureCheckResult { - ERROR, - RECOMPILE, - OK - }; + FeatureCheckResult checkFeatures(); - FeatureCheckResult checkFeatures(); + //! Disable default copy constructor + ClBinary(const ClBinary&); - //! Disable default copy constructor - ClBinary(const ClBinary&); + //! Disable default operator= + ClBinary& operator=(const ClBinary&); - //! Disable default operator= - ClBinary& operator=(const ClBinary&); - - //! Returns the GPU device for this object - const Device& dev() { return static_cast(dev_); } + //! Returns the GPU device for this object + const Device& dev() { return static_cast(dev_); } }; -} // namespace cpu +} // namespace cpu -#endif // CPUBINARY_HPP_ +#endif // CPUBINARY_HPP_ diff --git a/rocclr/runtime/device/cpu/cpubuiltins.cpp b/rocclr/runtime/device/cpu/cpubuiltins.cpp index a0896c97c5..b366134ad9 100644 --- a/rocclr/runtime/device/cpu/cpubuiltins.cpp +++ b/rocclr/runtime/device/cpu/cpubuiltins.cpp @@ -6,52 +6,47 @@ #include "device/cpu/cpucommand.hpp" #include -#include // for printf +#include // for printf #include #define BUF_SIZE_PRINTF 4095 -//In the current implementation of printf in gcc 4.5.2 runtime libraries,inf/infinity and nan are not supported -//The [-]infinity value is printed as [-]1.#INF00 -//The [-]nan value is printed as [-]1.#INF00 -//bufOutUpdate converts the all printed instanced of [-]1.#INF00 to inf,and +// In the current implementation of printf in gcc 4.5.2 runtime libraries,inf/infinity and nan are +// not supported +// The [-]infinity value is printed as [-]1.#INF00 +// The [-]nan 
value is printed as [-]1.#INF00 +// bufOutUpdate converts the all printed instanced of [-]1.#INF00 to inf,and // all printed instanced of [-]1.#IND00 to nan -void bufOutUpdate(std::string& sBufOut,const char* strToReplace,const char* strReplace) -{ - size_t foundIdx = 0; - while ((foundIdx = sBufOut.find(strToReplace,foundIdx)) != std::string::npos) { - sBufOut.replace(foundIdx,strlen(strToReplace),strReplace,strlen(strReplace)); - foundIdx += 3; - } +void bufOutUpdate(std::string& sBufOut, const char* strToReplace, const char* strReplace) { + size_t foundIdx = 0; + while ((foundIdx = sBufOut.find(strToReplace, foundIdx)) != std::string::npos) { + sBufOut.replace(foundIdx, strlen(strToReplace), strReplace, strlen(strReplace)); + foundIdx += 3; + } } -int cpuprintf(const char* format,...) -{ - char cBufOut[BUF_SIZE_PRINTF]; - std::string sBufOut; - va_list args; - va_start(args, format); - //write to the buffer - vsprintf(cBufOut,format,args); - sBufOut = cBufOut; +int cpuprintf(const char* format, ...) 
{ + char cBufOut[BUF_SIZE_PRINTF]; + std::string sBufOut; + va_list args; + va_start(args, format); + // write to the buffer + vsprintf(cBufOut, format, args); + sBufOut = cBufOut; - //convert to correct infinity/nan representation - bufOutUpdate(sBufOut,"1.#INF00","inf"); - bufOutUpdate(sBufOut,"1.#IND00","nan"); - bufOutUpdate(sBufOut,"1.#QNAN0","nan"); - int ret = amd::Os::printf("%s",sBufOut.c_str()); - fflush(stdout); - va_end (args); - return ret; + // convert to correct infinity/nan representation + bufOutUpdate(sBufOut, "1.#INF00", "inf"); + bufOutUpdate(sBufOut, "1.#IND00", "nan"); + bufOutUpdate(sBufOut, "1.#QNAN0", "nan"); + int ret = amd::Os::printf("%s", sBufOut.c_str()); + fflush(stdout); + va_end(args); + return ret; } namespace cpu { -const clk_builtins_t -Builtins::dispatchTable_ = -{ +const clk_builtins_t Builtins::dispatchTable_ = { /* Synchronization functions */ &WorkItem::barrier, /* AMD Only builtins: FIXME_lmoriche: remove or add an extension */ - NULL, - cpuprintf -}; + NULL, cpuprintf}; -} // namespace cpu +} // namespace cpu diff --git a/rocclr/runtime/device/cpu/cpubuiltins.hpp b/rocclr/runtime/device/cpu/cpubuiltins.hpp index 2723730857..f50854e7b1 100644 --- a/rocclr/runtime/device/cpu/cpubuiltins.hpp +++ b/rocclr/runtime/device/cpu/cpubuiltins.hpp @@ -10,11 +10,10 @@ namespace cpu { -struct Builtins : public amd::AllStatic -{ - static const clk_builtins_t dispatchTable_; +struct Builtins : public amd::AllStatic { + static const clk_builtins_t dispatchTable_; }; -} // namespace cpu +} // namespace cpu #endif /*BUILTINS_HPP_*/ diff --git a/rocclr/runtime/device/cpu/cpucommand.cpp b/rocclr/runtime/device/cpu/cpucommand.cpp index e8c3488b8c..95ca23fa70 100644 --- a/rocclr/runtime/device/cpu/cpucommand.cpp +++ b/rocclr/runtime/device/cpu/cpucommand.cpp @@ -21,676 +21,589 @@ namespace cpu { -#define CPU_WORKER_THREAD_TOTAL_STACK_SIZE (CPU_WORKER_THREAD_STACK_SIZE + \ - CLK_PRIVATE_MEMORY_SIZE * (CPU_MAX_WORKGROUP_SIZE + 1)) +#define 
CPU_WORKER_THREAD_TOTAL_STACK_SIZE \ + (CPU_WORKER_THREAD_STACK_SIZE + CLK_PRIVATE_MEMORY_SIZE * (CPU_MAX_WORKGROUP_SIZE + 1)) -WorkerThread::WorkerThread(const cpu::Device& device) : - Thread("CPU Worker Thread", CPU_WORKER_THREAD_TOTAL_STACK_SIZE), - queueLock_("WorkerThread::queueLock"), waitingOp_(0), terminated_(false) -{ - localDataSize_ = (size_t) device.info().localMemSize_; - localDataStorage_ = (address) amd::AlignedMemory::allocate( - localDataSize_ + __CPU_SCRATCH_SIZE, sizeof(cl_long16)); +WorkerThread::WorkerThread(const cpu::Device& device) + : Thread("CPU Worker Thread", CPU_WORKER_THREAD_TOTAL_STACK_SIZE), + queueLock_("WorkerThread::queueLock"), + waitingOp_(0), + terminated_(false) { + localDataSize_ = (size_t)device.info().localMemSize_; + localDataStorage_ = + (address)amd::AlignedMemory::allocate(localDataSize_ + __CPU_SCRATCH_SIZE, sizeof(cl_long16)); #if defined(__linux__) && defined(NUMA_SUPPORT) - const nodemask_t* numaMask = device.getNumaMask(); - if (numaMask != NULL) { - numa_bind(numaMask); - } + const nodemask_t* numaMask = device.getNumaMask(); + if (numaMask != NULL) { + numa_bind(numaMask); + } #endif } -WorkerThread::~WorkerThread() -{ - guarantee(Thread::current() != this && "thread suicide!"); - amd::AlignedMemory::deallocate(localDataStorage_); +WorkerThread::~WorkerThread() { + guarantee(Thread::current() != this && "thread suicide!"); + amd::AlignedMemory::deallocate(localDataStorage_); } -bool -WorkerThread::terminate() -{ - terminated_ = true; +bool WorkerThread::terminate() { + terminated_ = true; - if (Thread::current() != this) { - // FIXME_lmoriche: fix termination handshake - while (state() < Thread::FINISHED) { - flush(); - amd::Os::yield(); - } + if (Thread::current() != this) { + // FIXME_lmoriche: fix termination handshake + while (state() < Thread::FINISHED) { + flush(); + amd::Os::yield(); } + } - return true; + return true; } -void -WorkerThread::enqueue(Operation& op) -{ - while (waitingOp_ != 0) { - 
amd::Os::yield(); - } - op.clone(operation()); - ++waitingOp_; +void WorkerThread::enqueue(Operation& op) { + while (waitingOp_ != 0) { + amd::Os::yield(); + } + op.clone(operation()); + ++waitingOp_; } -void -WorkerThread::loop() -{ - baseWorkItemsStack_ = amd::alignDown(stackBase() - - CPU_WORKER_THREAD_STACK_SIZE, CLK_PRIVATE_MEMORY_SIZE); +void WorkerThread::loop() { + baseWorkItemsStack_ = + amd::alignDown(stackBase() - CPU_WORKER_THREAD_STACK_SIZE, CLK_PRIVATE_MEMORY_SIZE); #if defined(WIN32) - amd::Os::touchStackPages(baseWorkItemsStack_, amd::Os::currentStackPtr()); -#endif // WINDOWS - Operation *op = operation(); + amd::Os::touchStackPages(baseWorkItemsStack_, amd::Os::currentStackPtr()); +#endif // WINDOWS + Operation* op = operation(); - queueLock_.lock(); - while (true) { - while (waitingOp_ == 0) { - if (terminated_) { - break; - } - queueLock_.wait(); - } - if (terminated_) { - break; - } - op->command().setStatus(CL_RUNNING); - op->execute(); - op->cleanup(); - --waitingOp_; + queueLock_.lock(); + while (true) { + while (waitingOp_ == 0) { + if (terminated_) { + break; + } + queueLock_.wait(); } - queueLock_.unlock(); + if (terminated_) { + break; + } + op->command().setStatus(CL_RUNNING); + op->execute(); + op->cleanup(); + --waitingOp_; + } + queueLock_.unlock(); } -void -NativeFn::execute() -{ - cl_int status = static_cast(command()).invoke(); - command().setStatus(status); +void NativeFn::execute() { + cl_int status = static_cast(command()).invoke(); + command().setStatus(status); } -static void -nop() { /*Do nothing*/ } +static void nop() { /*Do nothing*/ +} template -class NDRangeKernelBatchMode : public NDRangeKernelBatch -{ -private: - void executeWorkGroup(WorkGroup& wg) - { - if (NATURE == NATURE_WG_LEVEL_EXEC) { - wg.executeWorkItem(); - } - else if ((NATURE == NATURE_1_WORK_ITEM) || - (wg.getNumWorkItems() == 1)) { - wg.executeWorkItem(); - } - else { - wg.getBaseWorkItem()->setNext(&wg.getWorkerThread().mainFiber()); - if (NATURE == 
NATURE_WITHOUT_BARRIER) { - wg.executeWithoutBarrier(); - } - else { // NATURE == NATURE_WITH_BARRIER - wg.executeWithBarrier(); - } - } - // Yield at the end of each workgroup to avoid starving GPU device - amd::Os::yield(); + NDRangeKernelBatch::ExecutionOrder ORDER = NDRangeKernelBatch::ORDER_DEFAULT> +class NDRangeKernelBatchMode : public NDRangeKernelBatch { + private: + void executeWorkGroup(WorkGroup& wg) { + if (NATURE == NATURE_WG_LEVEL_EXEC) { + wg.executeWorkItem(); + } else if ((NATURE == NATURE_1_WORK_ITEM) || (wg.getNumWorkItems() == 1)) { + wg.executeWorkItem(); + } else { + wg.getBaseWorkItem()->setNext(&wg.getWorkerThread().mainFiber()); + if (NATURE == NATURE_WITHOUT_BARRIER) { + wg.executeWithoutBarrier(); + } else { // NATURE == NATURE_WITH_BARRIER + wg.executeWithBarrier(); + } + } + // Yield at the end of each workgroup to avoid starving GPU device + amd::Os::yield(); + } + + public: + void executeMode(WorkGroup& wg) { + const amd::NDRange& offset = static_cast(command_).sizes().offset(); + WorkItem* workItem0 = wg.getBaseWorkItem(); + clk_builtins_t tableTask; + size_t prevOpId = 0, opId = (size_t)-1; + + if (NATURE == NATURE_1_WORK_ITEM) { + tableTask = Builtins::dispatchTable_; + + // If local size == 1 then barrier() becomes a nop. + tableTask.barrier_ptr = (void (*)(cl_mem_fence_flags))nop; + workItem0->infoBlock().builtins = &tableTask; + workItem0->setNext(&wg.getWorkerThread().mainFiber()); } -public: - void executeMode(WorkGroup& wg) - { - const amd::NDRange& offset = - static_cast(command_).sizes().offset(); - WorkItem* workItem0 = wg.getBaseWorkItem(); - clk_builtins_t tableTask; - size_t prevOpId = 0, opId = (size_t)-1; - - if (NATURE == NATURE_1_WORK_ITEM) { - tableTask = Builtins::dispatchTable_; - - // If local size == 1 then barrier() becomes a nop. 
- tableTask.barrier_ptr = (void (*)(cl_mem_fence_flags)) nop; - workItem0->infoBlock().builtins = &tableTask; - workItem0->setNext(&wg.getWorkerThread().mainFiber()); - } - - while (getNextOperationId(opId)) { - workItem0->incrementGroupId(groupIds_, offset, opId - prevOpId); - uint workDims = workItem0->infoBlock().work_dim; - size_t numWorkItems = workItem0->infoBlock().local_size[0] * - (workDims >= 2 ? workItem0->infoBlock().local_size[1] : 1) * - (workDims >= 3 ? workItem0->infoBlock().local_size[2] : 1); - wg.setNumWorkItems(numWorkItems); - if(numWorkItems == 1) { - tableTask = Builtins::dispatchTable_; - tableTask.barrier_ptr = (void (*)(cl_mem_fence_flags)) nop; - workItem0->infoBlock().builtins = &tableTask; - workItem0->setNext(&wg.getWorkerThread().mainFiber()); - executeWorkGroup(wg); - tableTask.barrier_ptr = &WorkItem::barrier; - } else { - executeWorkGroup(wg); - } - prevOpId = opId; - } + while (getNextOperationId(opId)) { + workItem0->incrementGroupId(groupIds_, offset, opId - prevOpId); + uint workDims = workItem0->infoBlock().work_dim; + size_t numWorkItems = workItem0->infoBlock().local_size[0] * + (workDims >= 2 ? workItem0->infoBlock().local_size[1] : 1) * + (workDims >= 3 ? 
workItem0->infoBlock().local_size[2] : 1); + wg.setNumWorkItems(numWorkItems); + if (numWorkItems == 1) { + tableTask = Builtins::dispatchTable_; + tableTask.barrier_ptr = (void (*)(cl_mem_fence_flags))nop; + workItem0->infoBlock().builtins = &tableTask; + workItem0->setNext(&wg.getWorkerThread().mainFiber()); + executeWorkGroup(wg); + tableTask.barrier_ptr = &WorkItem::barrier; + } else { + executeWorkGroup(wg); + } + prevOpId = opId; + } //#define DISABLE_TASK_STEALING #if !defined(DISABLE_TASK_STEALING) && 0 - size_t maxId = numCores_; - size_t stolenId = coreId_ + 1; - NDRangeKernelBatch* workingBatch = this; - size_t numStolenIds = 1; - const size_t maxStealingSize = 3; - const size_t minAdaptiveStealingDiff = numCores_ * maxStealingSize; - - while (true) { - for (; stolenId < maxId; ++stolenId) { - WorkerThread* worker = virtualDevice_.getWorkerThread(stolenId); + size_t maxId = numCores_; + size_t stolenId = coreId_ + 1; + NDRangeKernelBatch* workingBatch = this; + size_t numStolenIds = 1; + const size_t maxStealingSize = 3; + const size_t minAdaptiveStealingDiff = numCores_ * maxStealingSize; - // In case were we have less operations than Worker Threads - if (worker->isOperationValid()) { - workingBatch = static_cast( - worker->operation()); + while (true) { + for (; stolenId < maxId; ++stolenId) { + WorkerThread* worker = virtualDevice_.getWorkerThread(stolenId); - numStolenIds = - workingBatch->getNextOperationIds(opId, numStolenIds); - if (numStolenIds > 0) { - do { - for (size_t i = 0; i < numStolenIds; ++i) { - workItem0->setGroupId(groupIds_, offset, opId); - executeWorkGroup(wg); - opId += numCores_; - } + // In case were we have less operations than Worker Threads + if (worker->isOperationValid()) { + workingBatch = static_cast(worker->operation()); - // adaptive stealing - if (numWorkGroups_ - opId > minAdaptiveStealingDiff) { - numStolenIds = maxStealingSize; - } - else { - while (workingBatch->getNextOperationId(opId)) { - 
workItem0->setGroupId(groupIds_, offset, opId); - executeWorkGroup(wg); - } - break; - } - numStolenIds = workingBatch->getNextOperationIds( - opId, numStolenIds); - } while (numStolenIds > 0); - } - numStolenIds = 1; + numStolenIds = workingBatch->getNextOperationIds(opId, numStolenIds); + if (numStolenIds > 0) { + do { + for (size_t i = 0; i < numStolenIds; ++i) { + workItem0->setGroupId(groupIds_, offset, opId); + executeWorkGroup(wg); + opId += numCores_; + } + + // adaptive stealing + if (numWorkGroups_ - opId > minAdaptiveStealingDiff) { + numStolenIds = maxStealingSize; + } else { + while (workingBatch->getNextOperationId(opId)) { + workItem0->setGroupId(groupIds_, offset, opId); + executeWorkGroup(wg); } - } // for (stolenId..maxId) - - if (stolenId == coreId_) { break; - } + } + numStolenIds = workingBatch->getNextOperationIds(opId, numStolenIds); + } while (numStolenIds > 0); + } + numStolenIds = 1; + } + } // for (stolenId..maxId) - stolenId = 0; - maxId = coreId_; - } // while (true) + if (stolenId == coreId_) { + break; + } + + stolenId = 0; + maxId = coreId_; + } // while (true) #endif - } + } }; -inline bool -NDRangeKernelBatch::getNextOperationId(size_t& opId) -{ - if (currentOpId_ >= numWorkGroups_) { - return false; - } - opId = amd::AtomicOperation::add(numCores_, ¤tOpId_); - return opId < numWorkGroups_; +inline bool NDRangeKernelBatch::getNextOperationId(size_t& opId) { + if (currentOpId_ >= numWorkGroups_) { + return false; + } + opId = amd::AtomicOperation::add(numCores_, ¤tOpId_); + return opId < numWorkGroups_; } -inline size_t -NDRangeKernelBatch::getNextOperationIds(size_t& opId, size_t count) -{ - size_t topId = numCores_ * count; - if (currentOpId_ >= numWorkGroups_) { - return 0; - } +inline size_t NDRangeKernelBatch::getNextOperationIds(size_t& opId, size_t count) { + size_t topId = numCores_ * count; + if (currentOpId_ >= numWorkGroups_) { + return 0; + } - opId = amd::AtomicOperation::add(topId, ¤tOpId_); - const size_t 
numWorkGroups = numWorkGroups_; - if (opId >= numWorkGroups) { - return 0; - } + opId = amd::AtomicOperation::add(topId, ¤tOpId_); + const size_t numWorkGroups = numWorkGroups_; + if (opId >= numWorkGroups) { + return 0; + } - topId += opId; - if (topId >= (numWorkGroups + numCores_)) { - count -= (topId - numWorkGroups) / numCores_; - } + topId += opId; + if (topId >= (numWorkGroups + numCores_)) { + count -= (topId - numWorkGroups) / numCores_; + } - return count; + return count; } // Process the parameters, allocate LDS. -bool -NDRangeKernelBatch::patchParameters( - const cpu::Kernel& cpuKernel, - address params, - address& localMemPtr, - const address localMemLimit, - size_t localMemSize) const -{ - amd::NDRangeKernelCommand& command = - static_cast(command_); +bool NDRangeKernelBatch::patchParameters(const cpu::Kernel& cpuKernel, address params, + address& localMemPtr, const address localMemLimit, + size_t localMemSize) const { + amd::NDRangeKernelCommand& command = static_cast(command_); - const amd::Device& device = command.queue()->device(); + const amd::Device& device = command.queue()->device(); - const amd::Kernel& kernel = command.kernel(); - const amd::KernelSignature& signature = kernel.signature(); - const amd::KernelParameters& kernelParam = kernel.parameters(); + const amd::Kernel& kernel = command.kernel(); + const amd::KernelSignature& signature = kernel.signature(); + const amd::KernelParameters& kernelParam = kernel.parameters(); - const_address cmdParams = command.parameters(); + const_address cmdParams = command.parameters(); - unsigned effectiveOffset = 0; + unsigned effectiveOffset = 0; - // DD -- on CPU device, real effective offset is NATIVELY aligned - // Here all source arguments are in place, so we're safe just iterating - for (size_t i = 0; i < signature.numParameters(); ++i) { - const amd::KernelParameterDescriptor& desc = signature.at(i); - const void* cmdParam = cmdParams + desc.offset_; - void *param; - size_t prmSize = 
cpuKernel.getArgSize(i); + // DD -- on CPU device, real effective offset is NATIVELY aligned + // Here all source arguments are in place, so we're safe just iterating + for (size_t i = 0; i < signature.numParameters(); ++i) { + const amd::KernelParameterDescriptor& desc = signature.at(i); + const void* cmdParam = cmdParams + desc.offset_; + void* param; + size_t prmSize = cpuKernel.getArgSize(i); - // Align i'th parameter on multiple of its size. Parameter size is power of 2. - size_t alignment = cpuKernel.getArgAlignment(i); - effectiveOffset = amd::alignUp(effectiveOffset, std::min(alignment, size_t(16))); - param = params + effectiveOffset; - if (desc.size_ == 0) { - // __local memory parameter - localMemPtr = amd::alignUp(localMemPtr, sizeof(cl_long16)); + // Align i'th parameter on multiple of its size. Parameter size is power of 2. + size_t alignment = cpuKernel.getArgAlignment(i); + effectiveOffset = amd::alignUp(effectiveOffset, std::min(alignment, size_t(16))); + param = params + effectiveOffset; + if (desc.size_ == 0) { + // __local memory parameter + localMemPtr = amd::alignUp(localMemPtr, sizeof(cl_long16)); - size_t length = *static_cast(cmdParam); - *static_cast(param) = localMemPtr; - localMemPtr += length; + size_t length = *static_cast(cmdParam); + *static_cast(param) = localMemPtr; + localMemPtr += length; - if (localMemPtr > localMemLimit) { - command.setException(CL_MEM_OBJECT_ALLOCATION_FAILURE); - return false; - } - } - else if (desc.type_ == T_POINTER) { - // __global memory parameter - cl_mem_object_type pointer_type = CL_MEM_OBJECT_BUFFER; - if (kernelParam.boundToSvmPointer(device, cmdParams, i)) { - *reinterpret_cast(param) = - *reinterpret_cast(cmdParam); - } - else { - void* hostMemPtr = NULL; - amd::Memory* memArg = - *reinterpret_cast(cmdParam); - if (memArg != NULL) { - hostMemPtr = memArg->getHostMem(); - if (hostMemPtr == NULL) { - command.setException(CL_MEM_OBJECT_ALLOCATION_FAILURE); - return false; - } - pointer_type = 
memArg->getType(); - } - // For images on CPU devices, pass "struct {int4 p0; int4 p1}". - // That allows an obvious implementation for - // __amdil_get_image[23]d_params[01]. - // That makes the rest of the .bc implementation for - // images relatively straight forward. - if (pointer_type == CL_MEM_OBJECT_IMAGE1D || - pointer_type == CL_MEM_OBJECT_IMAGE2D || - pointer_type == CL_MEM_OBJECT_IMAGE3D || - pointer_type == CL_MEM_OBJECT_IMAGE1D_ARRAY || - pointer_type == CL_MEM_OBJECT_IMAGE1D_BUFFER || - pointer_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) { - amd::Image::Impl& impl = memArg->asImage()->getImpl(); - impl.reserved_ = hostMemPtr; - *reinterpret_cast(param) = (void*)&impl; - } else { - *reinterpret_cast(param) = hostMemPtr; - } - } - } - else if (desc.type_ == T_SAMPLER) { - // Switch from an Amd::Sampler to the 32bit integer - // variable that is a clk_sampler. - amd::Sampler* samplerArg = - *reinterpret_cast(cmdParam); - *reinterpret_cast(param) = (uint32_t)samplerArg->state(); - } - else { - //Using HCtoDCmap - HCtoDCmap arg_map = cpuKernel.getHCtoDCmap(i); - unsigned int arg_offset = effectiveOffset; - int err_code = 0; - int inStruct = 0; - int sys_64bit = LP64_SWITCH(0, 1); // Mapping only required for 32 bit targets - if (CPU_USE_ALIGNMENT_MAP == 0 && !sys_64bit) { - effectiveOffset += arg_map.copy_params(param, cmdParam, arg_offset, err_code, inStruct); - if (err_code) { - return false; - } - prmSize = arg_map.dc_size; - } - else { - ::memcpy(param, cmdParam, desc.size_); - } - } - effectiveOffset += prmSize; - } - - localMemPtr = amd::alignUp(localMemPtr, sizeof(cl_long16)); - if ((localMemPtr + localMemSize) > localMemLimit) { + if (localMemPtr > localMemLimit) { command.setException(CL_MEM_OBJECT_ALLOCATION_FAILURE); return false; - } - - return true; -} - -void -NDRangeKernelBatch::execute() -{ - amd::NDRangeKernelCommand& command = - static_cast(command_); - - const cpu::Kernel& kernel = static_cast( - 
*command.kernel().getDeviceKernel(command.queue()->device())); - - WorkerThread& thread = *WorkerThread::current(); - - const size_t numWorkItems = command.sizes().local().product(); - - address params = thread.baseWorkItemsStack(); - address baseLocalMemPtr = thread.localDataStorage(); - address patchedLocalMemPtr = thread.localDataStorage() + __CPU_SCRATCH_SIZE; - if (!patchParameters(kernel, params, - patchedLocalMemPtr, patchedLocalMemPtr + thread.localDataSize(), - kernel.workGroupInfo()->localMemSize_)) { - return; - } - - WorkItem* workItem0 = ::new((WorkItem*)params - 1) WorkItem( - command.sizes(), baseLocalMemPtr, patchedLocalMemPtr); - - WorkGroup wg(command, kernel, thread, params, workItem0, numWorkItems); - - if (numWorkItems == 1) { - static_cast*>(this)-> - executeMode(wg); - } - else if (kernel.hasBarrier()) { - static_cast*>(this)-> - executeMode(wg); - } - else { - static_cast*>(this)-> - executeMode(wg); - } -} - -void -WorkGroup::executeWorkItem() -{ - callKernel((kernelentrypoint_t)kernel_.getEntryPoint(), workItem0_->nativeStackPtr()); -} - -void -WorkGroup::executeWithBarrier() -{ - kernelentrypoint_t entryPoint = (kernelentrypoint_t)kernel_.getEntryPoint(); - - workingFiber_ = workItem0_; - address workGroupStackPtr = workItem0_->nativeStackPtr(); - - // Save the current stack context in case we execute a barrier. 
- volatile size_t threadCounter = 0; - bool barrier = !thread_.mainFiber().save(); - - size_t tid = threadCounter++; - WorkItem* workItem = (WorkItem*)((char*) workItem0_ - - tid * CLK_PRIVATE_MEMORY_SIZE); - - if (barrier) { - WorkItem* prev = (WorkItem*)((char*) workItem - + CLK_PRIVATE_MEMORY_SIZE); - - WINDOWS_ONLY(amd::Os::touchStackPages( - (address) (workItem + 1), (address) prev)); - ::memcpy(workItem, prev, sizeof(WorkItem)); - - clk_thread_info_block_t& tib = workItem->infoBlock(); - ++tib.local_id[0]; - if (unlikely(tib.local_id[0] >= tib.local_size[0])) { - // - // Compiling for Windows 64bit (only in release) introduces a bug, - // which uses the same register for saving threadCounter and the - // 0 value. Therefore "tib.local_id[i] = 0" was actually translated - // to "tib.local_id[0] = threadCounter". To avoid this issue, and - // still be able to store a 0 into tib.local_id[i], we trick the - // compiler, by using the value in tib.local_id[3], which is always - // initialized to 0. - // - tib.local_id[0] = tib.local_id[3]; - - ++tib.local_id[1]; - if (unlikely(tib.local_id[1] >= tib.local_size[1])) { - tib.local_id[1] = tib.local_id[3]; - - ++tib.local_id[2]; - } + } + } else if (desc.type_ == T_POINTER) { + // __global memory parameter + cl_mem_object_type pointer_type = CL_MEM_OBJECT_BUFFER; + if (kernelParam.boundToSvmPointer(device, cmdParams, i)) { + *reinterpret_cast(param) = *reinterpret_cast(cmdParam); + } else { + void* hostMemPtr = NULL; + amd::Memory* memArg = *reinterpret_cast(cmdParam); + if (memArg != NULL) { + hostMemPtr = memArg->getHostMem(); + if (hostMemPtr == NULL) { + command.setException(CL_MEM_OBJECT_ALLOCATION_FAILURE); + return false; + } + pointer_type = memArg->getType(); } - - // Link the previous workitem to this one. - prev->setNext(workItem); - // If this is the last workitem, complete the ring. 
- if (tid >= numWorkItems_ - 1) { - workItem->setNext(workItem0_); + // For images on CPU devices, pass "struct {int4 p0; int4 p1}". + // That allows an obvious implementation for + // __amdil_get_image[23]d_params[01]. + // That makes the rest of the .bc implementation for + // images relatively straight forward. + if (pointer_type == CL_MEM_OBJECT_IMAGE1D || pointer_type == CL_MEM_OBJECT_IMAGE2D || + pointer_type == CL_MEM_OBJECT_IMAGE3D || pointer_type == CL_MEM_OBJECT_IMAGE1D_ARRAY || + pointer_type == CL_MEM_OBJECT_IMAGE1D_BUFFER || + pointer_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) { + amd::Image::Impl& impl = memArg->asImage()->getImpl(); + impl.reserved_ = hostMemPtr; + *reinterpret_cast(param) = (void*)&impl; + } else { + *reinterpret_cast(param) = hostMemPtr; } + } + } else if (desc.type_ == T_SAMPLER) { + // Switch from an Amd::Sampler to the 32bit integer + // variable that is a clk_sampler. + amd::Sampler* samplerArg = *reinterpret_cast(cmdParam); + *reinterpret_cast(param) = (uint32_t)samplerArg->state(); + } else { + // Using HCtoDCmap + HCtoDCmap arg_map = cpuKernel.getHCtoDCmap(i); + unsigned int arg_offset = effectiveOffset; + int err_code = 0; + int inStruct = 0; + int sys_64bit = LP64_SWITCH(0, 1); // Mapping only required for 32 bit targets + if (CPU_USE_ALIGNMENT_MAP == 0 && !sys_64bit) { + effectiveOffset += arg_map.copy_params(param, cmdParam, arg_offset, err_code, inStruct); + if (err_code) { + return false; + } + prmSize = arg_map.dc_size; + } else { + ::memcpy(param, cmdParam, desc.size_); + } } + effectiveOffset += prmSize; + } - // Execute thread0 + localMemPtr = amd::alignUp(localMemPtr, sizeof(cl_long16)); + if ((localMemPtr + localMemSize) > localMemLimit) { + command.setException(CL_MEM_OBJECT_ALLOCATION_FAILURE); + return false; + } - address workItemStackPtr = workItem->nativeStackPtr(); - callKernelProtectedReturn(entryPoint, workItemStackPtr); - - // Check if thread0 executed a barrier() - if (threadCounter > 1) { - workItem = 
(WorkItem*)workingFiber_; - workingFiber_ = workingFiber_->next(); - - tid = ((address)workItem0_ - (address)workItem) - / CLK_PRIVATE_MEMORY_SIZE; - if (tid == (numWorkItems_ - 1)) { - // If we get here, we are done! - return; - } - if (workItem->next() == &thread_.mainFiber()) { - // Detected a deadlock - command_.setException(CL_INVALID_KERNEL); - return; - } - - // Schedule the next workitem. - workItem->next()->restore(); - ShouldNotReachHere(); - } - - // Execute thread1...threadN - callKernelRange(entryPoint, workItemStackPtr, workItem->infoBlock()); + return true; } -void -WorkGroup::executeWithoutBarrier() -{ - kernelentrypoint_t entryPoint = (kernelentrypoint_t)kernel_.getEntryPoint(); - address workItemStackPtr = workItem0_->nativeStackPtr(); +void NDRangeKernelBatch::execute() { + amd::NDRangeKernelCommand& command = static_cast(command_); - // Execute thread0 - callKernel(entryPoint, workItemStackPtr); + const cpu::Kernel& kernel = + static_cast(*command.kernel().getDeviceKernel(command.queue()->device())); - // Execute thread1...threadN - callKernelRange(entryPoint, workItemStackPtr, workItem0_->infoBlock()); + WorkerThread& thread = *WorkerThread::current(); + + const size_t numWorkItems = command.sizes().local().product(); + + address params = thread.baseWorkItemsStack(); + address baseLocalMemPtr = thread.localDataStorage(); + address patchedLocalMemPtr = thread.localDataStorage() + __CPU_SCRATCH_SIZE; + if (!patchParameters(kernel, params, patchedLocalMemPtr, + patchedLocalMemPtr + thread.localDataSize(), + kernel.workGroupInfo()->localMemSize_)) { + return; + } + + WorkItem* workItem0 = + ::new ((WorkItem*)params - 1) WorkItem(command.sizes(), baseLocalMemPtr, patchedLocalMemPtr); + + WorkGroup wg(command, kernel, thread, params, workItem0, numWorkItems); + + if (numWorkItems == 1) { + static_cast*>(this)->executeMode(wg); + } else if (kernel.hasBarrier()) { + static_cast*>(this)->executeMode(wg); + } else { + 
static_cast*>(this)->executeMode(wg); + } } -void -WorkGroup::callKernelRange(kernelentrypoint_t entryPoint, - address stackPtr, - clk_thread_info_block_t& tib) -{ - while (true) { - ++tib.local_id[0]; - if (unlikely(tib.local_id[0] >= tib.local_size[0])) { - tib.local_id[0] = 0; +void WorkGroup::executeWorkItem() { + callKernel((kernelentrypoint_t)kernel_.getEntryPoint(), workItem0_->nativeStackPtr()); +} - ++tib.local_id[1]; - if (unlikely(tib.local_id[1] >= tib.local_size[1])) { - tib.local_id[1] = 0; +void WorkGroup::executeWithBarrier() { + kernelentrypoint_t entryPoint = (kernelentrypoint_t)kernel_.getEntryPoint(); - ++tib.local_id[2]; - if (unlikely(tib.local_id[2] >= tib.local_size[2])) { - tib.local_id[2] = 0; + workingFiber_ = workItem0_; + address workGroupStackPtr = workItem0_->nativeStackPtr(); - return; - } - } - } + // Save the current stack context in case we execute a barrier. + volatile size_t threadCounter = 0; + bool barrier = !thread_.mainFiber().save(); - callKernel(entryPoint, stackPtr); + size_t tid = threadCounter++; + WorkItem* workItem = (WorkItem*)((char*)workItem0_ - tid * CLK_PRIVATE_MEMORY_SIZE); + + if (barrier) { + WorkItem* prev = (WorkItem*)((char*)workItem + CLK_PRIVATE_MEMORY_SIZE); + + WINDOWS_ONLY(amd::Os::touchStackPages((address)(workItem + 1), (address)prev)); + ::memcpy(workItem, prev, sizeof(WorkItem)); + + clk_thread_info_block_t& tib = workItem->infoBlock(); + ++tib.local_id[0]; + if (unlikely(tib.local_id[0] >= tib.local_size[0])) { + // + // Compiling for Windows 64bit (only in release) introduces a bug, + // which uses the same register for saving threadCounter and the + // 0 value. Therefore "tib.local_id[i] = 0" was actually translated + // to "tib.local_id[0] = threadCounter". To avoid this issue, and + // still be able to store a 0 into tib.local_id[i], we trick the + // compiler, by using the value in tib.local_id[3], which is always + // initialized to 0. 
+ // + tib.local_id[0] = tib.local_id[3]; + + ++tib.local_id[1]; + if (unlikely(tib.local_id[1] >= tib.local_size[1])) { + tib.local_id[1] = tib.local_id[3]; + + ++tib.local_id[2]; + } } + + // Link the previous workitem to this one. + prev->setNext(workItem); + // If this is the last workitem, complete the ring. + if (tid >= numWorkItems_ - 1) { + workItem->setNext(workItem0_); + } + } + + // Execute thread0 + + address workItemStackPtr = workItem->nativeStackPtr(); + callKernelProtectedReturn(entryPoint, workItemStackPtr); + + // Check if thread0 executed a barrier() + if (threadCounter > 1) { + workItem = (WorkItem*)workingFiber_; + workingFiber_ = workingFiber_->next(); + + tid = ((address)workItem0_ - (address)workItem) / CLK_PRIVATE_MEMORY_SIZE; + if (tid == (numWorkItems_ - 1)) { + // If we get here, we are done! + return; + } + if (workItem->next() == &thread_.mainFiber()) { + // Detected a deadlock + command_.setException(CL_INVALID_KERNEL); + return; + } + + // Schedule the next workitem. 
+ workItem->next()->restore(); + ShouldNotReachHere(); + } + + // Execute thread1...threadN + callKernelRange(entryPoint, workItemStackPtr, workItem->infoBlock()); } -WorkItem::WorkItem(const amd::NDRangeContainer& sizes, - void* scratchMemPtr, - void* localMemPtr) -{ - const amd::NDRange& local = sizes.local(); - const amd::NDRange& global = sizes.global(); - const amd::NDRange& offset = sizes.offset(); - const size_t dims = sizes.dimensions(); +void WorkGroup::executeWithoutBarrier() { + kernelentrypoint_t entryPoint = (kernelentrypoint_t)kernel_.getEntryPoint(); + address workItemStackPtr = workItem0_->nativeStackPtr(); - tib_.builtins = &Builtins::dispatchTable_; - tib_.local_mem_base = localMemPtr; - tib_.local_scratch = scratchMemPtr; - tib_.table_base = (const void *)cpuTables; - tib_.work_dim = (cl_uint) sizes.dimensions(); + // Execute thread0 + callKernel(entryPoint, workItemStackPtr); - for (size_t i = 0; i < dims; ++i) { + // Execute thread1...threadN + callKernelRange(entryPoint, workItemStackPtr, workItem0_->infoBlock()); +} + +void WorkGroup::callKernelRange(kernelentrypoint_t entryPoint, address stackPtr, + clk_thread_info_block_t& tib) { + while (true) { + ++tib.local_id[0]; + if (unlikely(tib.local_id[0] >= tib.local_size[0])) { + tib.local_id[0] = 0; + + ++tib.local_id[1]; + if (unlikely(tib.local_id[1] >= tib.local_size[1])) { + tib.local_id[1] = 0; + + ++tib.local_id[2]; + if (unlikely(tib.local_id[2] >= tib.local_size[2])) { + tib.local_id[2] = 0; + + return; + } + } + } + + callKernel(entryPoint, stackPtr); + } +} + +WorkItem::WorkItem(const amd::NDRangeContainer& sizes, void* scratchMemPtr, void* localMemPtr) { + const amd::NDRange& local = sizes.local(); + const amd::NDRange& global = sizes.global(); + const amd::NDRange& offset = sizes.offset(); + const size_t dims = sizes.dimensions(); + + tib_.builtins = &Builtins::dispatchTable_; + tib_.local_mem_base = localMemPtr; + tib_.local_scratch = scratchMemPtr; + tib_.table_base = (const 
void*)cpuTables; + tib_.work_dim = (cl_uint)sizes.dimensions(); + + for (size_t i = 0; i < dims; ++i) { + tib_.global_offset[i] = offset[i]; + tib_.global_size[i] = global[i]; + tib_.local_size[i] = local[i]; + tib_.enqueued_local_size[i] = local[i]; + tib_.local_id[i] = 0; + tib_.group_id[i] = 0; + } + + // Fill the remaining dimensions. + for (size_t i = dims; i < sizeof(tib_.global_size) / sizeof(size_t); ++i) { + tib_.global_offset[i] = 0; + tib_.global_size[i] = 1; + tib_.local_size[i] = 1; + tib_.enqueued_local_size[i] = 1; + tib_.local_id[i] = 0; + tib_.group_id[i] = 0; + } +} + +ALWAYSINLINE void WorkItem::setGroupId(const amd::NDRange& rangeLimits, const amd::NDRange& offset, + size_t n) { + const size_t dims = rangeLimits.dimensions(); + for (size_t i = 0; i < dims; ++i) { + size_t lim = rangeLimits[i]; + size_t& val = tib_.group_id[i]; + val = n; + if (n < lim) { + tib_.global_offset[i] = offset[i] + val * tib_.enqueued_local_size[i]; + tib_.local_id[i] = 0; + tib_.local_size[i] = std::min(tib_.enqueued_local_size[i], + tib_.global_size[i] - (val * tib_.enqueued_local_size[i])); + + ++i; + for (; i < dims; ++i) { tib_.global_offset[i] = offset[i]; - tib_.global_size[i] = global[i]; - tib_.local_size[i] = local[i]; - tib_.enqueued_local_size[i] = local[i]; - tib_.local_id[i] = 0; - tib_.group_id[i] = 0; - } - - // Fill the remaining dimensions. 
- for (size_t i = dims; i < sizeof(tib_.global_size)/sizeof(size_t); ++i) { - tib_.global_offset[i] = 0; - tib_.global_size[i] = 1; - tib_.local_size[i] = 1; - tib_.enqueued_local_size[i] = 1; tib_.local_id[i] = 0; tib_.group_id[i] = 0; + } + break; + } else { + n /= lim; + val -= n * lim; + tib_.global_offset[i] = offset[i] + val * tib_.enqueued_local_size[i]; + tib_.local_id[i] = 0; + tib_.local_size[i] = std::min(tib_.enqueued_local_size[i], + tib_.global_size[i] - (val * tib_.enqueued_local_size[i])); } + } } -ALWAYSINLINE void -WorkItem::setGroupId( - const amd::NDRange& rangeLimits, - const amd::NDRange& offset, - size_t n) -{ - const size_t dims = rangeLimits.dimensions(); - for (size_t i = 0; i < dims; ++i) { - size_t lim = rangeLimits[i]; - size_t& val = tib_.group_id[i]; - val = n; - if (n < lim) { - tib_.global_offset[i] = - offset[i] + val * tib_.enqueued_local_size[i]; - tib_.local_id[i] = 0; - tib_.local_size[i] = - std::min(tib_.enqueued_local_size[i], - tib_.global_size[i] - (val * tib_.enqueued_local_size[i])); - - ++i; - for (; i < dims; ++i) { - tib_.global_offset[i] = offset[i]; - tib_.local_id[i] = 0; - tib_.group_id[i] = 0; - } - break; - } - else { - n /= lim; - val -= n * lim; - tib_.global_offset[i] = - offset[i] + val * tib_.enqueued_local_size[i]; - tib_.local_id[i] = 0; - tib_.local_size[i] = - std::min(tib_.enqueued_local_size[i], - tib_.global_size[i] - (val * tib_.enqueued_local_size[i])); - } +ALWAYSINLINE void WorkItem::incrementGroupId(const amd::NDRange& rangeLimits, + const amd::NDRange& offset, size_t n) { + const size_t dims = rangeLimits.dimensions(); + for (size_t i = 0; i < dims; ++i) { + size_t lim = rangeLimits[i]; + size_t& val = tib_.group_id[i]; + val += n; + if (val < lim) { + tib_.global_offset[i] = offset[i] + val * tib_.enqueued_local_size[i]; + tib_.local_id[i] = 0; + tib_.local_size[i] = std::min(tib_.enqueued_local_size[i], + tib_.global_size[i] - (val * tib_.enqueued_local_size[i])); + break; + } else { + n = 
val / lim; + val -= n * lim; + tib_.global_offset[i] = offset[i] + val * tib_.enqueued_local_size[i]; + tib_.local_id[i] = 0; + tib_.local_size[i] = std::min(tib_.enqueued_local_size[i], + tib_.global_size[i] - (val * tib_.enqueued_local_size[i])); } + } } -ALWAYSINLINE void -WorkItem::incrementGroupId( - const amd::NDRange& rangeLimits, - const amd::NDRange& offset, - size_t n) -{ - const size_t dims = rangeLimits.dimensions(); - for (size_t i = 0; i < dims; ++i) { - size_t lim = rangeLimits[i]; - size_t& val = tib_.group_id[i]; - val += n; - if (val < lim) { - tib_.global_offset[i] = - offset[i] + val * tib_.enqueued_local_size[i]; - tib_.local_id[i] = 0; - tib_.local_size[i] = - std::min(tib_.enqueued_local_size[i], - tib_.global_size[i] - (val * tib_.enqueued_local_size[i])); - break; - } - else { - n = val / lim; - val -= n * lim; - tib_.global_offset[i] = - offset[i] + val * tib_.enqueued_local_size[i]; - tib_.local_id[i] = 0; - tib_.local_size[i] = - std::min(tib_.enqueued_local_size[i], - tib_.global_size[i] - (val * tib_.enqueued_local_size[i])); - } - } +void WorkItem::barrier(cl_mem_fence_flags flags) { + WorkItem* workItem = WorkItem::current(); + workItem->swap(workItem->next()); } -void -WorkItem::barrier(cl_mem_fence_flags flags) -{ - WorkItem* workItem = WorkItem::current(); - workItem->swap(workItem->next()); +void Operation::cleanup() { + cl_int lastException = command().exception(); + cl_int status = (lastException != 0) ? lastException : CL_COMPLETE; + + Counter* counter = reinterpret_cast(command().data()); + if (counter == NULL) { + command().setStatus(status); + } else if (counter->decrement() == 0) { + counter->event().setStatus(status); + } } -void Operation::cleanup() -{ - cl_int lastException = command().exception(); - cl_int status = (lastException != 0) ? 
lastException : CL_COMPLETE; - - Counter* counter = reinterpret_cast(command().data()); - if (counter == NULL) { - command().setStatus(status); - } - else if (counter->decrement() == 0) { - counter->event().setStatus(status); - } -} - -} // namespace cpu +} // namespace cpu diff --git a/rocclr/runtime/device/cpu/cpucommand.hpp b/rocclr/runtime/device/cpu/cpucommand.hpp index abd195da0b..d5b4e35e83 100644 --- a/rocclr/runtime/device/cpu/cpucommand.hpp +++ b/rocclr/runtime/device/cpu/cpucommand.hpp @@ -15,7 +15,7 @@ #if defined(ATI_ARCH_ARM) #include -#endif // ATI_ARCH_ARM +#endif // ATI_ARCH_ARM namespace cpu { @@ -27,94 +27,91 @@ namespace cpu { */ //! A saved stack context -class StackContext : public amd::StackObject -{ -private: +class StackContext : public amd::StackObject { + private: #if defined(ATI_ARCH_ARM) - jmp_buf env_; + jmp_buf env_; #elif defined(_WIN64) - intptr_t __declspec(align(16)) regs_[32]; -#else // !_WIN64 - intptr_t regs_[LP64_SWITCH(6,8)]; -#endif // !_WIN64 + intptr_t __declspec(align(16)) regs_[32]; +#else // !_WIN64 + intptr_t regs_[LP64_SWITCH(6, 8)]; +#endif // !_WIN64 -public: - //! Save the stack context. Return 0 if returning directly. - inline intptr_t setjmp(); + public: + //! Save the stack context. Return 0 if returning directly. + inline intptr_t setjmp(); - //! Restore the stack context - inline void longjmp(intptr_t val) const; + //! Restore the stack context + inline void longjmp(intptr_t val) const; }; //! A thread fiber -class Fiber : public amd::StackObject -{ -private: - //! Next fiber in the thread. - Fiber* next_; +class Fiber : public amd::StackObject { + private: + //! Next fiber in the thread. + Fiber* next_; - //! This fiber's saved state. - StackContext context_; + //! This fiber's saved state. + StackContext context_; -public: - //! Construct a new Fiber - Fiber() : next_(NULL) { } + public: + //! Construct a new Fiber + Fiber() : next_(NULL) {} - //! Return the next fiber in the current thread. 
- const Fiber* next() const { return next_; } - //! Set the next fiber in the current thread. - void setNext(Fiber* next) { next_ = next; } + //! Return the next fiber in the current thread. + const Fiber* next() const { return next_; } + //! Set the next fiber in the current thread. + void setNext(Fiber* next) { next_ = next; } - //! Save the state of this fiber. Return true if directly returning. - ALWAYSINLINE bool save() { return context_.setjmp() == 0; } - //! Restore this fiber from the saved context. - void restore() const { context_.longjmp(1); } + //! Save the state of this fiber. Return true if directly returning. + ALWAYSINLINE bool save() { return context_.setjmp() == 0; } + //! Restore this fiber from the saved context. + void restore() const { context_.longjmp(1); } - //! Switch to the given fiber. - void swap(const Fiber* fiber) { if (save()) { fiber->restore(); } } + //! Switch to the given fiber. + void swap(const Fiber* fiber) { + if (save()) { + fiber->restore(); + } + } }; - //! A CPU core operation (enqueued in the worker thread queue) -class Operation : public amd::HeapObject -{ -public: - //! An atomic counter - class Counter - { - // FIXME_lmoriche: recycle the counters, implement a thread local pool. - private: - amd::Event& event_; - //! The atomic counter value. - amd::Atomic counter_; +class Operation : public amd::HeapObject { + public: + //! An atomic counter + class Counter { + // FIXME_lmoriche: recycle the counters, implement a thread local pool. + private: + amd::Event& event_; + //! The atomic counter value. + amd::Atomic counter_; - public: - //! Initialize the counter with the given initial value. - Counter(amd::Event& event, size_t initialValue) : - event_(event), counter_(initialValue) { } - //! Return the event associated with this counter. - amd::Event& event() { return event_; } - //! Decrement the counter and return the new value. - size_t decrement() { return --counter_; } - }; + public: + //! 
Initialize the counter with the given initial value. + Counter(amd::Event& event, size_t initialValue) : event_(event), counter_(initialValue) {} + //! Return the event associated with this counter. + amd::Event& event() { return event_; } + //! Decrement the counter and return the new value. + size_t decrement() { return --counter_; } + }; -protected: - amd::Command& command_; + protected: + amd::Command& command_; -public: - Operation(amd::Command& command) : command_(command) - { } + public: + Operation(amd::Command& command) : command_(command) {} - virtual ~Operation() {}; + virtual ~Operation(){}; - virtual void clone(Operation* buf) = 0; + virtual void clone(Operation* buf) = 0; - void cleanup(); + void cleanup(); - amd::Command& command() { return command_;} + amd::Command& command() { return command_; } - virtual void execute() = 0; + virtual void execute() = 0; }; /*! @} @@ -123,243 +120,210 @@ public: */ //! A work item instance -class WorkItem : public Fiber -{ -private: - //! Thread info block (must be the last field). - clk_thread_info_block_t tib_; +class WorkItem : public Fiber { + private: + //! Thread info block (must be the last field). + clk_thread_info_block_t tib_; -private: - //! Cannot be deleted (allocated with placement new). - void operator delete(void*) { ShouldNotCallThis(); } + private: + //! Cannot be deleted (allocated with placement new). + void operator delete(void*) { ShouldNotCallThis(); } -public: - //! Initialize this workgroup. - WorkItem( - const amd::NDRangeContainer& size, - void* scratchMemPtr, - void* localMemPtr); + public: + //! Initialize this workgroup. + WorkItem(const amd::NDRangeContainer& size, void* scratchMemPtr, void* localMemPtr); - //! Return the current WorkItem (based of the current stack pointer). - static WorkItem* current() { - return (WorkItem*)amd::alignUp((intptr_t) amd::Os::currentStackPtr(), - CLK_PRIVATE_MEMORY_SIZE) - 1; - } + //! Return the current WorkItem (based of the current stack pointer). 
+ static WorkItem* current() { + return (WorkItem*)amd::alignUp((intptr_t)amd::Os::currentStackPtr(), CLK_PRIVATE_MEMORY_SIZE) - + 1; + } - clk_thread_info_block_t& infoBlock() { return tib_; } + clk_thread_info_block_t& infoBlock() { return tib_; } - //! Return the native stack pointer base for this workitem. - address nativeStackPtr() const { - address newSp = amd::alignDown((address) this - CPUKERNEL_STACK_ALIGN, - CPUKERNEL_STACK_ALIGN); - WINDOWS_ONLY(NOT_WIN64(newSp += sizeof(void*))); - return newSp; - } + //! Return the native stack pointer base for this workitem. + address nativeStackPtr() const { + address newSp = amd::alignDown((address) this - CPUKERNEL_STACK_ALIGN, CPUKERNEL_STACK_ALIGN); + WINDOWS_ONLY(NOT_WIN64(newSp += sizeof(void*))); + return newSp; + } - //! These functions are mapping "n" from 1d index to the required dimension - inline void setGroupId( - const amd::NDRange& rangeLimits, - const amd::NDRange& offset, - size_t n); - inline void incrementGroupId( - const amd::NDRange& rangeLimits, - const amd::NDRange& offset, - size_t n); + //! These functions are mapping "n" from 1d index to the required dimension + inline void setGroupId(const amd::NDRange& rangeLimits, const amd::NDRange& offset, size_t n); + inline void incrementGroupId(const amd::NDRange& rangeLimits, const amd::NDRange& offset, + size_t n); - //! Execute a thread synchronization barrier. - static void barrier(cl_mem_fence_flags flags); + //! Execute a thread synchronization barrier. + static void barrier(cl_mem_fence_flags flags); }; typedef void (*kernelentrypoint_t)(const void*); //! Execute a workgroup (work-items). 
-class WorkGroup -{ -private: - amd::NDRangeKernelCommand& command_; - const cpu::Kernel& kernel_; - WorkerThread& thread_; - address params_; - WorkItem* const workItem0_; - const Fiber* workingFiber_; - size_t numWorkItems_; +class WorkGroup { + private: + amd::NDRangeKernelCommand& command_; + const cpu::Kernel& kernel_; + WorkerThread& thread_; + address params_; + WorkItem* const workItem0_; + const Fiber* workingFiber_; + size_t numWorkItems_; -public: - WorkGroup( - amd::NDRangeKernelCommand& parent, - const cpu::Kernel& kernel, - WorkerThread& thread, - address params, - WorkItem* workItem0, - const size_t numWorkItems) : - command_(parent), - kernel_(kernel), - thread_(thread), - params_(params), - workItem0_(workItem0), - numWorkItems_(numWorkItems) - { } + public: + WorkGroup(amd::NDRangeKernelCommand& parent, const cpu::Kernel& kernel, WorkerThread& thread, + address params, WorkItem* workItem0, const size_t numWorkItems) + : command_(parent), + kernel_(kernel), + thread_(thread), + params_(params), + workItem0_(workItem0), + numWorkItems_(numWorkItems) {} - WorkItem* getBaseWorkItem() { return workItem0_; } - WorkerThread& getWorkerThread() { return thread_; } + WorkItem* getBaseWorkItem() { return workItem0_; } + WorkerThread& getWorkerThread() { return thread_; } - void executeWorkItem(); // In case of 1 WorkItem - void executeWithBarrier(); - void executeWithoutBarrier(); + void executeWorkItem(); // In case of 1 WorkItem + void executeWithBarrier(); + void executeWithoutBarrier(); - void setNumWorkItems(size_t workItems) { numWorkItems_ = workItems; } - size_t getNumWorkItems() { return numWorkItems_; } -private: - void callKernelRange( - kernelentrypoint_t entryPoint, - address stackPtr, - clk_thread_info_block_t& tib); - inline void callKernel( - kernelentrypoint_t entryPoint, - address stackPtr); - inline void callKernelProtectedReturn( - kernelentrypoint_t entryPoint, - address stackPtr); + void setNumWorkItems(size_t workItems) { numWorkItems_ 
= workItems; } + size_t getNumWorkItems() { return numWorkItems_; } + + private: + void callKernelRange(kernelentrypoint_t entryPoint, address stackPtr, + clk_thread_info_block_t& tib); + inline void callKernel(kernelentrypoint_t entryPoint, address stackPtr); + inline void callKernelProtectedReturn(kernelentrypoint_t entryPoint, address stackPtr); }; -class NDRangeKernelBatch : public Operation -{ -protected: - size_t coreId_; - const size_t numWorkGroups_; - const size_t numCores_; - volatile size_t currentOpId_; - const amd::NDRange groupIds_; //!< Number of groups in each dimensions - VirtualCPU& virtualDevice_; +class NDRangeKernelBatch : public Operation { + protected: + size_t coreId_; + const size_t numWorkGroups_; + const size_t numCores_; + volatile size_t currentOpId_; + const amd::NDRange groupIds_; //!< Number of groups in each dimensions + VirtualCPU& virtualDevice_; -public: - enum ExecutionOrder { - ORDER_DEFAULT, - ORDER_ROUND_ROBIN = ORDER_DEFAULT, - //ORDER_LINEAR - }; + public: + enum ExecutionOrder { + ORDER_DEFAULT, + ORDER_ROUND_ROBIN = ORDER_DEFAULT, + // ORDER_LINEAR + }; - enum ExecutionNature { - NATURE_WITH_BARRIER, - NATURE_WITHOUT_BARRIER, - NATURE_1_WORK_ITEM, - NATURE_WG_LEVEL_EXEC - }; + enum ExecutionNature { + NATURE_WITH_BARRIER, + NATURE_WITHOUT_BARRIER, + NATURE_1_WORK_ITEM, + NATURE_WG_LEVEL_EXEC + }; - NDRangeKernelBatch( - amd::NDRangeKernelCommand& parent, - VirtualCPU& virtualDevice, - const amd::NDRange& groupIds, size_t numCores) : - Operation(parent), - coreId_(0), - numWorkGroups_(groupIds.product()), - numCores_(numCores), - currentOpId_(0), - groupIds_(groupIds), - virtualDevice_(virtualDevice) - { } + NDRangeKernelBatch(amd::NDRangeKernelCommand& parent, VirtualCPU& virtualDevice, + const amd::NDRange& groupIds, size_t numCores) + : Operation(parent), + coreId_(0), + numWorkGroups_(groupIds.product()), + numCores_(numCores), + currentOpId_(0), + groupIds_(groupIds), + virtualDevice_(virtualDevice) {} - virtual void 
clone(Operation* buf) - { - ::new(buf) NDRangeKernelBatch(static_cast(command_), - virtualDevice_, groupIds_, numCores_); - static_cast(buf)->setCoreId(coreId_); - } + virtual void clone(Operation* buf) { + ::new (buf) NDRangeKernelBatch(static_cast(command_), + virtualDevice_, groupIds_, numCores_); + static_cast(buf)->setCoreId(coreId_); + } - virtual void execute(); + virtual void execute(); - void setCoreId(size_t coreId) { coreId_ = coreId; currentOpId_ = coreId; } + void setCoreId(size_t coreId) { + coreId_ = coreId; + currentOpId_ = coreId; + } - inline bool getNextOperationId(size_t& opId); - inline size_t getNextOperationIds(size_t& opId, size_t count); + inline bool getNextOperationId(size_t& opId); + inline size_t getNextOperationIds(size_t& opId, size_t count); -private: - bool patchParameters( - const cpu::Kernel& kernel, - address params, - address& localMemPtr, - const address localMemLimit, - size_t localMemSize) const; + private: + bool patchParameters(const cpu::Kernel& kernel, address params, address& localMemPtr, + const address localMemLimit, size_t localMemSize) const; }; -class NativeFn : public Operation -{ -public: - NativeFn(amd::NativeFnCommand& parent) : Operation(parent) - { } +class NativeFn : public Operation { + public: + NativeFn(amd::NativeFnCommand& parent) : Operation(parent) {} - virtual void clone(Operation* buf) - { - ::new(buf) NativeFn(static_cast(command_)); - } + virtual void clone(Operation* buf) { + ::new (buf) NativeFn(static_cast(command_)); + } - virtual void execute(); + virtual void execute(); }; #ifndef MAX -#define MAX(x,y) ((x)>=(y) ?(x) : (y)) -#endif //MAX +#define MAX(x, y) ((x) >= (y) ? (x) : (y)) +#endif // MAX #define MAX_OPERATION_ALLOC_SIZE (MAX(sizeof(NDRangeKernelBatch), sizeof(NativeFn))) //! A thread bound to a cpu core. -class WorkerThread : public amd::Thread -{ -private: - Fiber mainFiber_; //!< main fiber for this worker thread. 
+class WorkerThread : public amd::Thread { + private: + Fiber mainFiber_; //!< main fiber for this worker thread. - amd::Monitor queueLock_; //!< lock protecting the queue. - volatile int waitingOp_; - bool terminated_; //!< true if the thread is shutting down. - - //! Local memory storage - address localDataStorage_; - //! Size of the local memory. - size_t localDataSize_; + amd::Monitor queueLock_; //!< lock protecting the queue. + volatile int waitingOp_; + bool terminated_; //!< true if the thread is shutting down. - char operation_[MAX_OPERATION_ALLOC_SIZE]; + //! Local memory storage + address localDataStorage_; + //! Size of the local memory. + size_t localDataSize_; - address baseWorkItemsStack_; -private: - //! Awaits operations and execute them as they become ready. - void loop(); + char operation_[MAX_OPERATION_ALLOC_SIZE]; -public: - //! Construct a new WorkerThread. - WorkerThread(const cpu::Device& device); - //! Destroy the worker thread. - virtual ~WorkerThread(); - //! Cleanup the thread before termination. - bool terminate(); + address baseWorkItemsStack_; - //! Return the main fiber for this thread. - Fiber& mainFiber() { return mainFiber_; } - //! Return the LDS for this thread - address localDataStorage() const { return localDataStorage_; } - //! Return the size of the local memory for this thread. - size_t localDataSize() const { return localDataSize_; } + private: + //! Awaits operations and execute them as they become ready. + void loop(); - address baseWorkItemsStack() { return baseWorkItemsStack_; } + public: + //! Construct a new WorkerThread. + WorkerThread(const cpu::Device& device); + //! Destroy the worker thread. + virtual ~WorkerThread(); + //! Cleanup the thread before termination. + bool terminate(); - Operation* operation() { return reinterpret_cast(operation_); } - bool isOperationValid() { return waitingOp_ > 0; } + //! Return the main fiber for this thread. + Fiber& mainFiber() { return mainFiber_; } + //! 
Return the LDS for this thread + address localDataStorage() const { return localDataStorage_; } + //! Return the size of the local memory for this thread. + size_t localDataSize() const { return localDataSize_; } - //! Enqueue a new operation to execute in this thread. - void enqueue(Operation& op); - //! Signal to start processing the commands in the queue. - void flush() { amd::ScopedLock sl(queueLock_); queueLock_.notify(); } + address baseWorkItemsStack() { return baseWorkItemsStack_; } - //! This thread's execution engine. - void run(void* data) { - loop(); - } + Operation* operation() { return reinterpret_cast(operation_); } + bool isOperationValid() { return waitingOp_ > 0; } - bool isWorkerThread() const { return true; } + //! Enqueue a new operation to execute in this thread. + void enqueue(Operation& op); + //! Signal to start processing the commands in the queue. + void flush() { + amd::ScopedLock sl(queueLock_); + queueLock_.notify(); + } - //! Return the currently executing WorkerThread's instance. - static WorkerThread* current() - { - return static_cast(Thread::current()); - } + //! This thread's execution engine. + void run(void* data) { loop(); } + + bool isWorkerThread() const { return true; } + + //! Return the currently executing WorkerThread's instance. + static WorkerThread* current() { return static_cast(Thread::current()); } }; /*! 
@} @@ -371,59 +335,43 @@ extern "C" intptr_t _StackContext_setjmp(intptr_t* regs); #if !defined(ATI_ARCH_ARM) ALWAYSINLINE #endif -intptr_t -StackContext::setjmp() -{ +intptr_t StackContext::setjmp() { #if defined(ATI_ARCH_ARM) - return ::setjmp(env_); + return ::setjmp(env_); #else - return _StackContext_setjmp(regs_); + return _StackContext_setjmp(regs_); #endif } extern "C" void _StackContext_longjmp(const intptr_t* env, intptr_t val); -ALWAYSINLINE void -StackContext::longjmp(intptr_t val) const -{ +ALWAYSINLINE void StackContext::longjmp(intptr_t val) const { #if defined(ATI_ARCH_ARM) - return ::longjmp(*const_cast(&env_), val); + return ::longjmp(*const_cast(&env_), val); #else - return _StackContext_longjmp(regs_, val); + return _StackContext_longjmp(regs_, val); #endif } +extern "C" void _WorkGroup_callKernel(address params, kernelentrypoint_t entryPoint, + address stackPtr); -extern "C" void _WorkGroup_callKernel( - address params, - kernelentrypoint_t entryPoint, - address stackPtr); - -extern "C" void _WorkGroup_callKernelProtectedReturn( - address params, - kernelentrypoint_t entryPoint, - address stackPtr); +extern "C" void _WorkGroup_callKernelProtectedReturn(address params, kernelentrypoint_t entryPoint, + address stackPtr); -ALWAYSINLINE void -WorkGroup::callKernel( - kernelentrypoint_t entryPoint, - address stackPtr) -{ - _WorkGroup_callKernel(params_, entryPoint, stackPtr); +ALWAYSINLINE void WorkGroup::callKernel(kernelentrypoint_t entryPoint, address stackPtr) { + _WorkGroup_callKernel(params_, entryPoint, stackPtr); } // This version support the case of changing the stack for fibers. 
-ALWAYSINLINE void -WorkGroup::callKernelProtectedReturn( - kernelentrypoint_t entryPoint, - address stackPtr) -{ - _WorkGroup_callKernelProtectedReturn(params_, entryPoint, stackPtr); +ALWAYSINLINE void WorkGroup::callKernelProtectedReturn(kernelentrypoint_t entryPoint, + address stackPtr) { + _WorkGroup_callKernelProtectedReturn(params_, entryPoint, stackPtr); } -} // namespace cpu +} // namespace cpu #endif /*OPERATION_HPP_*/ diff --git a/rocclr/runtime/device/cpu/cpudevice.cpp b/rocclr/runtime/device/cpu/cpudevice.cpp index 5314689e62..0270c9432b 100644 --- a/rocclr/runtime/device/cpu/cpudevice.cpp +++ b/rocclr/runtime/device/cpu/cpudevice.cpp @@ -18,16 +18,16 @@ #if defined(__linux__) #if !defined(ATI_ARCH_ARM) #include -#endif // ATI_ARCH_ARM +#endif // ATI_ARCH_ARM #include #endif #if defined(_WIN32) -# include -# include +#include +#include -extern BOOL (WINAPI *pfnGetNumaNodeProcessorMaskEx)(USHORT,PGROUP_AFFINITY); -#endif // _WIN32 +extern BOOL(WINAPI* pfnGetNumaNodeProcessorMaskEx)(USHORT, PGROUP_AFFINITY); +#endif // _WIN32 namespace cpu { @@ -35,1133 +35,1012 @@ aclCompiler* Device::compiler_; size_t Device::maxWorkerThreads_ = (size_t)-1; -Device::~Device() -{ +Device::~Device() { #if defined(__linux__) && defined(NUMA_SUPPORT) - if (getNumaMask() != NULL) { - if (numaMask_ != NULL) { - delete numaMask_; - } + if (getNumaMask() != NULL) { + if (numaMask_ != NULL) { + delete numaMask_; } - else + } else #endif - if (workerThreadsAffinity_ != NULL) { - delete workerThreadsAffinity_; - } + if (workerThreadsAffinity_ != NULL) { + delete workerThreadsAffinity_; + } } -void -Device::tearDown() -{ - amd::Os::uninstallSigfpeHandler(); - aclCompilerFini(compiler_); +void Device::tearDown() { + amd::Os::uninstallSigfpeHandler(); + aclCompilerFini(compiler_); } -bool -Device::init() -{ - // Allow disabling of the CPU device - if (CPU_MAX_COMPUTE_UNITS == 0) - return false; +bool Device::init() { + // Allow disabling of the CPU device + if (CPU_MAX_COMPUTE_UNITS 
== 0) return false; - if(!amd::Os::installSigfpeHandler()) - return false; + if (!amd::Os::installSigfpeHandler()) return false; - const char *library = getenv("COMPILER_LIBRARY"); - aclCompilerOptions opts = { - sizeof(aclCompilerOptions_0_8), - (library || CPU_OPENCL_VERSION >= 200) - ? library : LINUX_ONLY("lib") "amdocl12cl" \ - LP64_SWITCH(LINUX_SWITCH("32",""),"64") LINUX_SWITCH(".so",".dll"), - NULL, - NULL, - NULL, - NULL, - NULL, - NULL - }; - acl_error error; - compiler_ = aclCompilerInit(&opts, &error); - if (error != ACL_SUCCESS) { - LogError("Error initializing the compiler"); - return false; - } + const char* library = getenv("COMPILER_LIBRARY"); + aclCompilerOptions opts = {sizeof(aclCompilerOptions_0_8), (library || CPU_OPENCL_VERSION >= 200) + ? library + : LINUX_ONLY("lib") "amdocl12cl" LP64_SWITCH( + LINUX_SWITCH("32", ""), "64") LINUX_SWITCH(".so", ".dll"), + NULL, NULL, NULL, NULL, NULL, NULL}; + acl_error error; + compiler_ = aclCompilerInit(&opts, &error); + if (error != ACL_SUCCESS) { + LogError("Error initializing the compiler"); + return false; + } - device::Info info; - ::memset(&info, '\0', sizeof(info)); + device::Info info; + ::memset(&info, '\0', sizeof(info)); - info.type_ = CL_DEVICE_TYPE_CPU; - info.vendorId_ = 0x1002; + info.type_ = CL_DEVICE_TYPE_CPU; + info.vendorId_ = 0x1002; - int systemProcessorCount = amd::Os::processorCount(); - info.maxComputeUnits_ = systemProcessorCount; - if (!flagIsDefault(CPU_MAX_COMPUTE_UNITS)) { - if ((CPU_MAX_COMPUTE_UNITS <= 0) || (CPU_MAX_COMPUTE_UNITS > systemProcessorCount)) - info.maxComputeUnits_ = systemProcessorCount; - else - info.maxComputeUnits_ = CPU_MAX_COMPUTE_UNITS; - } + int systemProcessorCount = amd::Os::processorCount(); + info.maxComputeUnits_ = systemProcessorCount; + if (!flagIsDefault(CPU_MAX_COMPUTE_UNITS)) { + if ((CPU_MAX_COMPUTE_UNITS <= 0) || (CPU_MAX_COMPUTE_UNITS > systemProcessorCount)) + info.maxComputeUnits_ = systemProcessorCount; + else + info.maxComputeUnits_ = 
CPU_MAX_COMPUTE_UNITS; + } - info.maxWorkItemDimensions_ = 3; - info.maxWorkGroupSize_ = CPU_MAX_WORKGROUP_SIZE; - info.maxWorkItemSizes_[0] = info.maxWorkGroupSize_; - info.maxWorkItemSizes_[1] = info.maxWorkGroupSize_; - info.maxWorkItemSizes_[2] = info.maxWorkGroupSize_; + info.maxWorkItemDimensions_ = 3; + info.maxWorkGroupSize_ = CPU_MAX_WORKGROUP_SIZE; + info.maxWorkItemSizes_[0] = info.maxWorkGroupSize_; + info.maxWorkItemSizes_[1] = info.maxWorkGroupSize_; + info.maxWorkItemSizes_[2] = info.maxWorkGroupSize_; - info.addressBits_ = LP64_SWITCH(32,64); + info.addressBits_ = LP64_SWITCH(32, 64); - if (CPU_IMAGE_SUPPORT) { - info.imageSupport_ = CL_TRUE; - info.maxReadImageArgs_ = MaxReadImage; - info.maxWriteImageArgs_ = MaxWriteImage; - info.image2DMaxWidth_ = 8 * Ki; - info.image2DMaxHeight_ = 8 * Ki; - info.image3DMaxWidth_ = 2 * Ki; - info.image3DMaxHeight_ = 2 * Ki; - info.image3DMaxDepth_ = 2 * Ki; - info.maxSamplers_ = MaxSamplers; + if (CPU_IMAGE_SUPPORT) { + info.imageSupport_ = CL_TRUE; + info.maxReadImageArgs_ = MaxReadImage; + info.maxWriteImageArgs_ = MaxWriteImage; + info.image2DMaxWidth_ = 8 * Ki; + info.image2DMaxHeight_ = 8 * Ki; + info.image3DMaxWidth_ = 2 * Ki; + info.image3DMaxHeight_ = 2 * Ki; + info.image3DMaxDepth_ = 2 * Ki; + info.maxSamplers_ = MaxSamplers; - // OpenCL 1.2 device info fields - info.imageMaxBufferSize_ = 64 * Ki; - info.imageMaxArraySize_ = 2 * Ki; + // OpenCL 1.2 device info fields + info.imageMaxBufferSize_ = 64 * Ki; + info.imageMaxArraySize_ = 2 * Ki; - info.imagePitchAlignment_ = 0; - info.imageBaseAddressAlignment_ = 0; - info.bufferFromImageSupport_ = CL_FALSE; - } + info.imagePitchAlignment_ = 0; + info.imageBaseAddressAlignment_ = 0; + info.bufferFromImageSupport_ = CL_FALSE; + } - info.maxParameterSize_ = 4*Ki; + info.maxParameterSize_ = 4 * Ki; - info.memBaseAddrAlign_ = 8 * (flagIsDefault(MEMOBJ_BASE_ADDR_ALIGN) ? 
- sizeof(cl_long16) : MEMOBJ_BASE_ADDR_ALIGN); - info.minDataTypeAlignSize_ = sizeof(cl_long16); + info.memBaseAddrAlign_ = + 8 * (flagIsDefault(MEMOBJ_BASE_ADDR_ALIGN) ? sizeof(cl_long16) : MEMOBJ_BASE_ADDR_ALIGN); + info.minDataTypeAlignSize_ = sizeof(cl_long16); - info.singleFPConfig_ = - CL_FP_DENORM | CL_FP_INF_NAN | - CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO | - CL_FP_ROUND_TO_INF | CL_FP_FMA; + info.singleFPConfig_ = CL_FP_DENORM | CL_FP_INF_NAN | CL_FP_ROUND_TO_NEAREST | + CL_FP_ROUND_TO_ZERO | CL_FP_ROUND_TO_INF | CL_FP_FMA; - info.doubleFPConfig_ = info.singleFPConfig_; - info.singleFPConfig_ |= CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT; + info.doubleFPConfig_ = info.singleFPConfig_; + info.singleFPConfig_ |= CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT; - info.affinityDomain_.value_ = 0; - info.affinityDomain_.next_ = 1; + info.affinityDomain_.value_ = 0; + info.affinityDomain_.next_ = 1; - info.globalMemCacheType_ = CL_READ_WRITE_CACHE; + info.globalMemCacheType_ = CL_READ_WRITE_CACHE; #if defined(__linux__) - info.globalMemCacheLineSize_ = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); - info.globalMemCacheSize_ = sysconf(_SC_LEVEL1_DCACHE_SIZE); - info.affinityDomain_.cacheL1_ = 1; + info.globalMemCacheLineSize_ = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); + info.globalMemCacheSize_ = sysconf(_SC_LEVEL1_DCACHE_SIZE); + info.affinityDomain_.cacheL1_ = 1; - if (sysconf(_SC_LEVEL2_CACHE_SIZE) > 0) { - info.affinityDomain_.cacheL2_ = 1; - } - if (sysconf(_SC_LEVEL3_CACHE_SIZE) > 0) { - info.affinityDomain_.cacheL3_ = 1; - } - if (sysconf(_SC_LEVEL4_CACHE_SIZE) > 0) { - info.affinityDomain_.cacheL4_ = 1; - } + if (sysconf(_SC_LEVEL2_CACHE_SIZE) > 0) { + info.affinityDomain_.cacheL2_ = 1; + } + if (sysconf(_SC_LEVEL3_CACHE_SIZE) > 0) { + info.affinityDomain_.cacheL3_ = 1; + } + if (sysconf(_SC_LEVEL4_CACHE_SIZE) > 0) { + info.affinityDomain_.cacheL4_ = 1; + } #if defined(NUMA_SUPPORT) - if (numa_available() != -1 && numa_max_node() => 0) { - info.affinityDomain_.numa_ = 1; - } + if 
(numa_available() != -1 && numa_max_node() = > 0) { + info.affinityDomain_.numa_ = 1; + } #endif -#else // win32 +#else // win32 - DWORD length = 0; - ::GetLogicalProcessorInformation(NULL, &length); + DWORD length = 0; + ::GetLogicalProcessorInformation(NULL, &length); - PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = - (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION) malloc(length); + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = + (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(length); - if (buffer != NULL && ::GetLogicalProcessorInformation(buffer, &length)) { - bool found = false; - PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr, limit = - &buffer[length / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION)]; - for (ptr = buffer; ptr < limit; ++ptr) { - PCACHE_DESCRIPTOR cache = &ptr->Cache; - if (ptr->Relationship == RelationCache && cache->Type != CacheInstruction) { - info.affinityDomain_.value_ |= - (device::AffinityDomain::AFFINITY_DOMAIN_L1_CACHE << 1) >> - cache->Level; + if (buffer != NULL && ::GetLogicalProcessorInformation(buffer, &length)) { + bool found = false; + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr, + limit = &buffer[length / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION)]; + for (ptr = buffer; ptr < limit; ++ptr) { + PCACHE_DESCRIPTOR cache = &ptr->Cache; + if (ptr->Relationship == RelationCache && cache->Type != CacheInstruction) { + info.affinityDomain_.value_ |= + (device::AffinityDomain::AFFINITY_DOMAIN_L1_CACHE << 1) >> cache->Level; - if (!found && cache->Level == 1) { - info.globalMemCacheLineSize_ = cache->LineSize; - info.globalMemCacheSize_ = cache->Size; - found = true; - } - } + if (!found && cache->Level == 1) { + info.globalMemCacheLineSize_ = cache->LineSize; + info.globalMemCacheSize_ = cache->Size; + found = true; } + } } + } - free(buffer); + free(buffer); - ULONG highestNuma = 0; - if (::GetNumaHighestNodeNumber(&highestNuma) && highestNuma != 0) { - info.affinityDomain_.numa_ = 1; - } + ULONG highestNuma = 0; + if 
(::GetNumaHighestNodeNumber(&highestNuma) && highestNuma != 0) { + info.affinityDomain_.numa_ = 1; + } #endif - uintptr_t virtualMemSize; + uintptr_t virtualMemSize; #if defined(__linux__) #if !defined(ATI_ARCH_ARM) - struct sysinfo si; + struct sysinfo si; - if (sysinfo(&si) != 0) { - return false; - } - if (si.mem_unit == 0) { - // Linux kernels prior to 2.3.23 return sizes in bytes. - si.mem_unit = 1; - } - info.globalMemSize_ = (cl_ulong) si.totalram * si.mem_unit; + if (sysinfo(&si) != 0) { + return false; + } + if (si.mem_unit == 0) { + // Linux kernels prior to 2.3.23 return sizes in bytes. + si.mem_unit = 1; + } + info.globalMemSize_ = (cl_ulong)si.totalram * si.mem_unit; #else - info.globalMemSize_ = 0; + info.globalMemSize_ = 0; #endif - virtualMemSize = (uintptr_t) info.globalMemSize_; + virtualMemSize = (uintptr_t)info.globalMemSize_; #else - MEMORYSTATUSEX statex; - statex.dwLength = sizeof (statex); + MEMORYSTATUSEX statex; + statex.dwLength = sizeof(statex); - if (GlobalMemoryStatusEx (&statex) == 0) { - return false; - } - info.globalMemSize_ = (cl_ulong) statex.ullTotalPhys; - virtualMemSize = - (uintptr_t) std::min(statex.ullTotalPageFile, statex.ullTotalVirtual); + if (GlobalMemoryStatusEx(&statex) == 0) { + return false; + } + info.globalMemSize_ = (cl_ulong)statex.ullTotalPhys; + virtualMemSize = (uintptr_t)std::min(statex.ullTotalPageFile, statex.ullTotalVirtual); #endif - //disable CPU device if system memory is equal to or less than 2GB - if (info.globalMemSize_ <= OCL_SYSMEM_REQUIREMENT * Gi) { - return true; - } + // disable CPU device if system memory is equal to or less than 2GB + if (info.globalMemSize_ <= OCL_SYSMEM_REQUIREMENT * Gi) { + return true; + } - maxWorkerThreads_ = (size_t) (virtualMemSize / - (uintptr_t) ((CPU_WORKER_THREAD_STACK_SIZE + - CLK_PRIVATE_MEMORY_SIZE * (CPU_MAX_WORKGROUP_SIZE + 1))) * - 7 / 10); + maxWorkerThreads_ = (size_t)( + virtualMemSize / (uintptr_t)((CPU_WORKER_THREAD_STACK_SIZE + + 
CLK_PRIVATE_MEMORY_SIZE * (CPU_MAX_WORKGROUP_SIZE + 1))) * + 7 / 10); #if defined(_LP64) - // Cap at 8TiB for 64-bit - const cl_ulong maxGlobalMemSize = 8ULL*Ki*Gi; + // Cap at 8TiB for 64-bit + const cl_ulong maxGlobalMemSize = 8ULL * Ki * Gi; #elif defined(_WIN32) - // Cap at 2GiB (see http://msdn.microsoft.com/en-us/library/aa366778.aspx) - const cl_ulong maxGlobalMemSize = 2ULL*Gi; -#else // linux - // Cap at 3.5GiB - const cl_ulong maxGlobalMemSize = 3584ULL*Mi; + // Cap at 2GiB (see http://msdn.microsoft.com/en-us/library/aa366778.aspx) + const cl_ulong maxGlobalMemSize = 2ULL * Gi; +#else // linux + // Cap at 3.5GiB + const cl_ulong maxGlobalMemSize = 3584ULL * Mi; #endif - info.globalMemSize_ = std::min(info.globalMemSize_, maxGlobalMemSize); + info.globalMemSize_ = std::min(info.globalMemSize_, maxGlobalMemSize); - info.maxMemAllocSize_ = info.globalMemSize_ * CPU_MAX_ALLOC_PERCENT / 100; - if (flagIsDefault(CPU_MAX_ALLOC_PERCENT)) { - const cl_ulong minAllocSize = LP64_SWITCH(1ULL*Gi, 2ULL*Gi); - info.maxMemAllocSize_ = std::max(info.maxMemAllocSize_, - std::min(info.globalMemSize_, minAllocSize)); - } + info.maxMemAllocSize_ = info.globalMemSize_ * CPU_MAX_ALLOC_PERCENT / 100; + if (flagIsDefault(CPU_MAX_ALLOC_PERCENT)) { + const cl_ulong minAllocSize = LP64_SWITCH(1ULL * Gi, 2ULL * Gi); + info.maxMemAllocSize_ = + std::max(info.maxMemAllocSize_, std::min(info.globalMemSize_, minAllocSize)); + } - info.maxConstantBufferSize_ = 64*Ki; - info.maxConstantArgs_ = 8; + info.maxConstantBufferSize_ = 64 * Ki; + info.maxConstantArgs_ = 8; - info.localMemType_ = CL_GLOBAL; - info.localMemSize_ = std::max((cl_ulong)32*Ki, info.globalMemCacheSize_/2); + info.localMemType_ = CL_GLOBAL; + info.localMemSize_ = std::max((cl_ulong)32 * Ki, info.globalMemCacheSize_ / 2); - info.errorCorrectionSupport_ = CL_FALSE; - info.hostUnifiedMemory_ = CL_TRUE; - info.profilingTimerResolution_ = (size_t)amd::Os::timerResolutionNanos(); - info.profilingTimerOffset_ = 
amd::Os::offsetToEpochNanos(); - info.littleEndian_ = CL_TRUE; - info.available_ = CL_TRUE; - info.compilerAvailable_ = CL_TRUE; - info.linkerAvailable_ = CL_TRUE; + info.errorCorrectionSupport_ = CL_FALSE; + info.hostUnifiedMemory_ = CL_TRUE; + info.profilingTimerResolution_ = (size_t)amd::Os::timerResolutionNanos(); + info.profilingTimerOffset_ = amd::Os::offsetToEpochNanos(); + info.littleEndian_ = CL_TRUE; + info.available_ = CL_TRUE; + info.compilerAvailable_ = CL_TRUE; + info.linkerAvailable_ = CL_TRUE; - info.executionCapabilities_ = CL_EXEC_KERNEL | CL_EXEC_NATIVE_KERNEL; - // Enable SVM only for OpenCL 2.0 - if (((OPENCL_MAJOR >= 2) && (CPU_OPENCL_VERSION >= 200)) || OCL_FORCE_CPU_SVM) { - info.svmCapabilities_ = CL_DEVICE_SVM_COARSE_GRAIN_BUFFER | - CL_DEVICE_SVM_FINE_GRAIN_BUFFER | - CL_DEVICE_SVM_FINE_GRAIN_SYSTEM | - CL_DEVICE_SVM_ATOMICS; - } - info.preferredPlatformAtomicAlignment_ = 0; - info.preferredGlobalAtomicAlignment_ = 0; - info.preferredLocalAtomicAlignment_ = 0; - info.queueProperties_ = CL_QUEUE_PROFILING_ENABLE; + info.executionCapabilities_ = CL_EXEC_KERNEL | CL_EXEC_NATIVE_KERNEL; + // Enable SVM only for OpenCL 2.0 + if (((OPENCL_MAJOR >= 2) && (CPU_OPENCL_VERSION >= 200)) || OCL_FORCE_CPU_SVM) { + info.svmCapabilities_ = CL_DEVICE_SVM_COARSE_GRAIN_BUFFER | CL_DEVICE_SVM_FINE_GRAIN_BUFFER | + CL_DEVICE_SVM_FINE_GRAIN_SYSTEM | CL_DEVICE_SVM_ATOMICS; + } + info.preferredPlatformAtomicAlignment_ = 0; + info.preferredGlobalAtomicAlignment_ = 0; + info.preferredLocalAtomicAlignment_ = 0; + info.queueProperties_ = CL_QUEUE_PROFILING_ENABLE; - info.platform_ = AMD_PLATFORM; + info.platform_ = AMD_PLATFORM; #if defined(__linux__) - std::ifstream ifs("/proc/cpuinfo", std::ios::in); - if (ifs.is_open()) { - std::string line; - bool vendor = false; - bool name = false; - bool freq = false; + std::ifstream ifs("/proc/cpuinfo", std::ios::in); + if (ifs.is_open()) { + std::string line; + bool vendor = false; + bool name = false; + bool freq = false; 
- while (std::getline(ifs, line) && !(vendor && name && freq)) { - if (!vendor && (line.find("vendor_id\t: ") - != std::string::npos)) { - ::strcpy( - info.vendor_, - line.substr(line.find_first_of(':') + 2).c_str()); - vendor = true; - } - else if (!name && (line.find("model name\t: ") != std::string::npos - || line.find("Processor\t: ") != std::string::npos)) { - ::strcpy( - info.name_, - line.substr(line.find_first_of(':') + 2).c_str()); - name = true; - } - else if (!freq && (line.find("cpu MHz\t\t: ") - != std::string::npos)) { - info.maxClockFrequency_ = - ::atoi(line.substr(line.find_first_of(':') + 2).c_str()); - freq = true; - } - } - ifs.close(); + while (std::getline(ifs, line) && !(vendor && name && freq)) { + if (!vendor && (line.find("vendor_id\t: ") != std::string::npos)) { + ::strcpy(info.vendor_, line.substr(line.find_first_of(':') + 2).c_str()); + vendor = true; + } else if (!name && (line.find("model name\t: ") != std::string::npos || + line.find("Processor\t: ") != std::string::npos)) { + ::strcpy(info.name_, line.substr(line.find_first_of(':') + 2).c_str()); + name = true; + } else if (!freq && (line.find("cpu MHz\t\t: ") != std::string::npos)) { + info.maxClockFrequency_ = ::atoi(line.substr(line.find_first_of(':') + 2).c_str()); + freq = true; + } } + ifs.close(); + } #elif defined(_WIN32) - int CPUInfo[4] = {-1}; - int nRet = 0; - unsigned nIds, nExIds, i; + int CPUInfo[4] = {-1}; + int nRet = 0; + unsigned nIds, nExIds, i; - // cpuid with an InfoType argument of 0 returns the number of - // valid Ids in CPUInfo[0] and the CPU identification string in - // the other three array elements. The CPU identification string is - // not in linear order. The code below arranges the information - // in a human readable form. 
- amd::Os::cpuid(CPUInfo, 0); - nIds = CPUInfo[0]; - memset(info.vendor_, 0, sizeof(info.vendor_)); - *((int*)(info.vendor_+0)) = CPUInfo[1]; - *((int*)(info.vendor_+4)) = CPUInfo[3]; - *((int*)(info.vendor_+8)) = CPUInfo[2]; + // cpuid with an InfoType argument of 0 returns the number of + // valid Ids in CPUInfo[0] and the CPU identification string in + // the other three array elements. The CPU identification string is + // not in linear order. The code below arranges the information + // in a human readable form. + amd::Os::cpuid(CPUInfo, 0); + nIds = CPUInfo[0]; + memset(info.vendor_, 0, sizeof(info.vendor_)); + *((int*)(info.vendor_ + 0)) = CPUInfo[1]; + *((int*)(info.vendor_ + 4)) = CPUInfo[3]; + *((int*)(info.vendor_ + 8)) = CPUInfo[2]; - // Calling cpuid with 0x80000000 as the InfoType argument - // gets the number of valid extended IDs. - amd::Os::cpuid(CPUInfo, 0x80000000); - nExIds = CPUInfo[0]; - memset(info.name_, 0, sizeof(info.name_)); - sprintf(info.name_, "Unknown Processor"); + // Calling cpuid with 0x80000000 as the InfoType argument + // gets the number of valid extended IDs. + amd::Os::cpuid(CPUInfo, 0x80000000); + nExIds = CPUInfo[0]; + memset(info.name_, 0, sizeof(info.name_)); + sprintf(info.name_, "Unknown Processor"); - // Get the information associated with each extended ID. - for (i=0x80000000; i<=nExIds; ++i) - { - amd::Os::cpuid(CPUInfo, i); - // Interpret CPU brand string and cache information. - if (i == 0x80000002) - memcpy(info.name_, CPUInfo, sizeof(CPUInfo)); - else if (i == 0x80000003) - memcpy(info.name_ + 16, CPUInfo, sizeof(CPUInfo)); - else if (i == 0x80000004) - memcpy(info.name_ + 32, CPUInfo, sizeof(CPUInfo)); - } + // Get the information associated with each extended ID. + for (i = 0x80000000; i <= nExIds; ++i) { + amd::Os::cpuid(CPUInfo, i); + // Interpret CPU brand string and cache information. 
+ if (i == 0x80000002) + memcpy(info.name_, CPUInfo, sizeof(CPUInfo)); + else if (i == 0x80000003) + memcpy(info.name_ + 16, CPUInfo, sizeof(CPUInfo)); + else if (i == 0x80000004) + memcpy(info.name_ + 32, CPUInfo, sizeof(CPUInfo)); + } - info.maxClockFrequency_ = 0; - HKEY hKey; + info.maxClockFrequency_ = 0; + HKEY hKey; - // Open the key - if (RegOpenKeyEx( - HKEY_LOCAL_MACHINE, - "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0\\", - 0, KEY_QUERY_VALUE, &hKey) == ERROR_SUCCESS) { + // Open the key + if (RegOpenKeyEx(HKEY_LOCAL_MACHINE, "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0\\", 0, + KEY_QUERY_VALUE, &hKey) == ERROR_SUCCESS) { + // Read the value + DWORD dwLen = 4; + RegQueryValueEx(hKey, "~MHz", NULL, NULL, (LPBYTE)&info.maxClockFrequency_, &dwLen); - // Read the value - DWORD dwLen = 4; - RegQueryValueEx( - hKey, "~MHz", NULL, NULL, - (LPBYTE)&info.maxClockFrequency_, &dwLen); - - // Cleanup and return - RegCloseKey(hKey); - } + // Cleanup and return + RegCloseKey(hKey); + } #else - ::strcpy(info.name_, "Unknown Processor"); - ::strcpy(info.vendor_, "Unknown Vendor"); - info.maxClockFrequency_ = 0; + ::strcpy(info.name_, "Unknown Processor"); + ::strcpy(info.vendor_, "Unknown Vendor"); + info.maxClockFrequency_ = 0; #endif #define OPENCL_VERSION_STR XSTR(OPENCL_MAJOR) "." 
XSTR(OPENCL_MINOR) - info.profile_ = "FULL_PROFILE"; - if (CPU_OPENCL_VERSION < 200) { - info.version_ = "OpenCL 1.2 " AMD_PLATFORM_INFO; - info.oclcVersion_ = "OpenCL C 1.2 "; - } - else { - info.version_ = "OpenCL " OPENCL_VERSION_STR " " AMD_PLATFORM_INFO; - info.oclcVersion_ = "OpenCL C " OPENCL_VERSION_STR " "; - } - info.spirVersions_ = "1.2"; + info.profile_ = "FULL_PROFILE"; + if (CPU_OPENCL_VERSION < 200) { + info.version_ = "OpenCL 1.2 " AMD_PLATFORM_INFO; + info.oclcVersion_ = "OpenCL C 1.2 "; + } else { + info.version_ = "OpenCL " OPENCL_VERSION_STR " " AMD_PLATFORM_INFO; + info.oclcVersion_ = "OpenCL C " OPENCL_VERSION_STR " "; + } + info.spirVersions_ = "1.2"; - info.partitionCreateInfo_.type_.value_ = 0; - info.partitionProperties_.value_ = 0; - if (info.maxComputeUnits_ > 1) { - info.partitionProperties_.equally_ = 1; - info.partitionProperties_.byCounts_ = 1; - if (info.affinityDomain_.value_ != 0) { - info.partitionProperties_.byAffinityDomain_ = 1; - } - } - else { - info.affinityDomain_.value_ = 0; + info.partitionCreateInfo_.type_.value_ = 0; + info.partitionProperties_.value_ = 0; + if (info.maxComputeUnits_ > 1) { + info.partitionProperties_.equally_ = 1; + info.partitionProperties_.byCounts_ = 1; + if (info.affinityDomain_.value_ != 0) { + info.partitionProperties_.byAffinityDomain_ = 1; } + } else { + info.affinityDomain_.value_ = 0; + } - // Copy the name into the boardName data member for CPU implementation. -// ::strncpy(info.boardName_, info.name_, sizeof(info.boardName_)); - memset(info.boardName_, 0, sizeof(info.boardName_)); + // Copy the name into the boardName data member for CPU implementation. 
+ // ::strncpy(info.boardName_, info.name_, sizeof(info.boardName_)); + memset(info.boardName_, 0, sizeof(info.boardName_)); - Device* device = new Device(); + Device* device = new Device(); - if (device == NULL || !device->create()) { - delete device; - return false; - } + if (device == NULL || !device->create()) { + delete device; + return false; + } - ::snprintf(info.driverVersion_, sizeof(info.driverVersion_) - 1, - "%s (%s%s%s)", AMD_BUILD_STRING, + ::snprintf(info.driverVersion_, sizeof(info.driverVersion_) - 1, "%s (%s%s%s)", AMD_BUILD_STRING, #if defined(ATI_ARCH_X86) - "sse2", -#else // !ATI_ARCH_X86 - "", -#endif // !ATI_ARCH_X86 - device->hasAVXInstructions() ? ",avx" : "", - device->hasFMA4Instructions() ? ",fma4" : ""); + "sse2", +#else // !ATI_ARCH_X86 + "", +#endif // !ATI_ARCH_X86 + device->hasAVXInstructions() ? ",avx" : "", + device->hasFMA4Instructions() ? ",fma4" : ""); - // These will need to change for AVX2 - info.preferredVectorWidthChar_ = 16; - info.preferredVectorWidthShort_ = 8; - info.preferredVectorWidthInt_ = 4; - info.preferredVectorWidthLong_ = 2; - if (device->hasAVXInstructions()) { - info.preferredVectorWidthFloat_ = 8; - info.preferredVectorWidthDouble_ = 4; - } else { - info.preferredVectorWidthFloat_ = 4; - info.preferredVectorWidthDouble_ = 2; - } - info.preferredVectorWidthHalf_ = 0; // no half support + // These will need to change for AVX2 + info.preferredVectorWidthChar_ = 16; + info.preferredVectorWidthShort_ = 8; + info.preferredVectorWidthInt_ = 4; + info.preferredVectorWidthLong_ = 2; + if (device->hasAVXInstructions()) { + info.preferredVectorWidthFloat_ = 8; + info.preferredVectorWidthDouble_ = 4; + } else { + info.preferredVectorWidthFloat_ = 4; + info.preferredVectorWidthDouble_ = 2; + } + info.preferredVectorWidthHalf_ = 0; // no half support - // Same here, will need to change for AVX2 - info.nativeVectorWidthChar_ = 16; - info.nativeVectorWidthShort_ = 8; - info.nativeVectorWidthInt_ = 4; - 
info.nativeVectorWidthLong_ = 2; - if (device->hasAVXInstructions()) { - info.nativeVectorWidthFloat_ = 8; - info.nativeVectorWidthDouble_ = 4; - } else { - info.nativeVectorWidthFloat_ = 4; - info.nativeVectorWidthDouble_ = 2; - } - info.nativeVectorWidthHalf_ = 0; // no half support + // Same here, will need to change for AVX2 + info.nativeVectorWidthChar_ = 16; + info.nativeVectorWidthShort_ = 8; + info.nativeVectorWidthInt_ = 4; + info.nativeVectorWidthLong_ = 2; + if (device->hasAVXInstructions()) { + info.nativeVectorWidthFloat_ = 8; + info.nativeVectorWidthDouble_ = 4; + } else { + info.nativeVectorWidthFloat_ = 4; + info.nativeVectorWidthDouble_ = 2; + } + info.nativeVectorWidthHalf_ = 0; // no half support - // Find all supported device extensions - info.extensions_ = device->getExtensionString(); + // Find all supported device extensions + info.extensions_ = device->getExtensionString(); - // OpenCL 1.2 device info fields - info.builtInKernels_ = ""; - info.preferredInteropUserSync_ = true; - info.printfBufferSize_ = 64*Ki; + // OpenCL 1.2 device info fields + info.builtInKernels_ = ""; + info.preferredInteropUserSync_ = true; + info.printfBufferSize_ = 64 * Ki; - info.maxPipePacketSize_ = info.maxMemAllocSize_; - info.maxPipeActiveReservations_ = 16; - info.maxPipeArgs_ = 16; - info.maxReadWriteImageArgs_ = MaxReadWriteImage; + info.maxPipePacketSize_ = info.maxMemAllocSize_; + info.maxPipeActiveReservations_ = 16; + info.maxPipeArgs_ = 16; + info.maxReadWriteImageArgs_ = MaxReadWriteImage; - // Max size should not be bigger than 1.75 GB - const cl_ulong maxSize = std::min(static_cast((Gi/4)*7), - info.maxMemAllocSize_); - info.maxGlobalVariableSize_ = static_cast(maxSize); - info.globalVariablePreferredTotalSize_ = static_cast(maxSize); + // Max size should not be bigger than 1.75 GB + const cl_ulong maxSize = std::min(static_cast((Gi / 4) * 7), info.maxMemAllocSize_); + info.maxGlobalVariableSize_ = static_cast(maxSize); + 
info.globalVariablePreferredTotalSize_ = static_cast(maxSize); - device->info_ = info; - device->registerDevice(); + device->info_ = info; + device->registerDevice(); - return true; + return true; } -bool -Device::create() -{ - // Create CPU settings - settings_ = new cpu::Settings(); - cpu::Settings* cpuSettings = reinterpret_cast(settings_); +bool Device::create() { + // Create CPU settings + settings_ = new cpu::Settings(); + cpu::Settings* cpuSettings = reinterpret_cast(settings_); - if ((cpuSettings == NULL) || !cpuSettings->create()) { - return false; - } + if ((cpuSettings == NULL) || !cpuSettings->create()) { + return false; + } #if defined(ATI_ARCH_X86) - // Check that we have at least SSE2 - if (settings().cpuFeatures_ == 0) { - return false; - } + // Check that we have at least SSE2 + if (settings().cpuFeatures_ == 0) { + return false; + } #endif - return true; + return true; } -bool -Device::initSubDevice( - device::Info& info, - cl_uint maxComputeUnits, - const device::CreateSubDevicesInfo& create_info) -{ +bool Device::initSubDevice(device::Info& info, cl_uint maxComputeUnits, + const device::CreateSubDevicesInfo& create_info) { + if (workerThreadsAffinity_ == NULL) { + workerThreadsAffinity_ = new amd::Os::ThreadAffinityMask; if (workerThreadsAffinity_ == NULL) { - workerThreadsAffinity_ = new amd::Os::ThreadAffinityMask; - if (workerThreadsAffinity_ == NULL) { - return false; - } + return false; } + } - info_ = info; - info_.maxComputeUnits_ = maxComputeUnits; - info_.partitionCreateInfo_ = create_info.p_; - if (create_info.p_.type_.value_ == device::PartitionType::BY_COUNTS) { - cl_uint* countsList = new cl_uint[create_info.p_.byCounts_.listSize_]; - if (countsList == NULL) { - return false; - } - for (size_t i = 0; i < create_info.p_.byCounts_.listSize_; ++i) { - countsList[i] = create_info.countsListAt(i); - } - info_.partitionCreateInfo_.byCounts_.countsList_ = countsList; + info_ = info; + info_.maxComputeUnits_ = maxComputeUnits; + 
info_.partitionCreateInfo_ = create_info.p_; + if (create_info.p_.type_.value_ == device::PartitionType::BY_COUNTS) { + cl_uint* countsList = new cl_uint[create_info.p_.byCounts_.listSize_]; + if (countsList == NULL) { + return false; } + for (size_t i = 0; i < create_info.p_.byCounts_.listSize_; ++i) { + countsList[i] = create_info.countsListAt(i); + } + info_.partitionCreateInfo_.byCounts_.countsList_ = countsList; + } - // The device cannot be partitioned further - if (maxComputeUnits == 1) { - info_.partitionProperties_.value_ = 0; - info_.affinityDomain_.value_ = 0; - } - return true; + // The device cannot be partitioned further + if (maxComputeUnits == 1) { + info_.partitionProperties_.value_ = 0; + info_.affinityDomain_.value_ = 0; + } + return true; } -void -Device::setWorkerThreadsAffinity( - cl_uint numWorkerThreads, - const amd::Os::ThreadAffinityMask* threadsAffinityMask, - uint& baseCoreId) -{ - uint coreId = baseCoreId; - if (threadsAffinityMask == NULL) { - for (cl_uint i = 0; i < numWorkerThreads; ++i) { - ++coreId; - workerThreadsAffinity_->set(coreId); - } +void Device::setWorkerThreadsAffinity(cl_uint numWorkerThreads, + const amd::Os::ThreadAffinityMask* threadsAffinityMask, + uint& baseCoreId) { + uint coreId = baseCoreId; + if (threadsAffinityMask == NULL) { + for (cl_uint i = 0; i < numWorkerThreads; ++i) { + ++coreId; + workerThreadsAffinity_->set(coreId); } - else { // Already has affinity, so filter accordingly - for (cl_uint i = 0; i < numWorkerThreads; ++i) { - coreId = threadsAffinityMask->getNextSet(coreId); - workerThreadsAffinity_->set(coreId); - } + } else { // Already has affinity, so filter accordingly + for (cl_uint i = 0; i < numWorkerThreads; ++i) { + coreId = threadsAffinityMask->getNextSet(coreId); + workerThreadsAffinity_->set(coreId); } - baseCoreId = coreId; + } + baseCoreId = coreId; } -cl_int -Device::createSubDevices( - device::CreateSubDevicesInfo& create_info, - cl_uint num_entries, - cl_device_id* devices, - 
cl_uint* num_devices) -{ - switch (create_info.p_.type_.value_) { +cl_int Device::createSubDevices(device::CreateSubDevicesInfo& create_info, cl_uint num_entries, + cl_device_id* devices, cl_uint* num_devices) { + switch (create_info.p_.type_.value_) { case device::PartitionType::EQUALLY: - return partitionEqually( - create_info, num_entries, devices, num_devices); + return partitionEqually(create_info, num_entries, devices, num_devices); case device::PartitionType::BY_COUNTS: - return partitionByCounts( - create_info, num_entries, devices, num_devices); + return partitionByCounts(create_info, num_entries, devices, num_devices); case device::PartitionType::BY_AFFINITY_DOMAIN: - if (info_.affinityDomain_.value_ == 0) { - return CL_DEVICE_PARTITION_FAILED; - } - - if (create_info.p_.byAffinityDomain_.next_) { - create_info.p_.byAffinityDomain_.next_ = 0; - create_info.p_.byAffinityDomain_.value_ = - (1 << amd::leastBitSet(info_.affinityDomain_.value_)); - } - else { - if ((create_info.p_.byAffinityDomain_.value_ & - info_.affinityDomain_.value_) == 0) { - return CL_INVALID_VALUE; - } - } - - if (create_info.p_.byAffinityDomain_.numa_) { - return partitionByAffinityDomainNUMA( - create_info, num_entries, devices, num_devices); - } - else { - return partitionByAffinityDomainCacheLevel( - create_info, num_entries, devices, num_devices); - } - default: - return CL_INVALID_VALUE; - } - return CL_SUCCESS; -} - -cl_int -Device::partitionEqually( - const device::CreateSubDevicesInfo& create_info, - cl_uint num_entries, - cl_device_id* devices, - cl_uint* num_devices) -{ - cl_uint subComputeUnits = - (cl_uint)create_info.p_.equally_.numComputeUnits_; - if (subComputeUnits == 0) { - return CL_INVALID_VALUE; - } - - cl_uint numSubDevices = info_.maxComputeUnits_ / subComputeUnits; - if (numSubDevices == 0) { + if (info_.affinityDomain_.value_ == 0) { return CL_DEVICE_PARTITION_FAILED; - } + } - if (num_devices != NULL) { - *num_devices = numSubDevices; - } - - if (devices != 
NULL) { - if (num_entries < numSubDevices) { - return CL_INVALID_VALUE; + if (create_info.p_.byAffinityDomain_.next_) { + create_info.p_.byAffinityDomain_.next_ = 0; + create_info.p_.byAffinityDomain_.value_ = + (1 << amd::leastBitSet(info_.affinityDomain_.value_)); + } else { + if ((create_info.p_.byAffinityDomain_.value_ & info_.affinityDomain_.value_) == 0) { + return CL_INVALID_VALUE; } - uint coreId = (uint)-1; - while (numSubDevices-- > 0) { - Device* device = new Device(this); - if (device == NULL) { - return CL_OUT_OF_HOST_MEMORY; - } + } - if (!device->create() || - !device->initSubDevice(info_, subComputeUnits, create_info)) { - device->release(); - return CL_OUT_OF_HOST_MEMORY; - } - - device->setWorkerThreadsAffinity( - subComputeUnits, workerThreadsAffinity_, coreId); - *devices++ = as_cl(static_cast(device)); - } - } - - return CL_SUCCESS; + if (create_info.p_.byAffinityDomain_.numa_) { + return partitionByAffinityDomainNUMA(create_info, num_entries, devices, num_devices); + } else { + return partitionByAffinityDomainCacheLevel(create_info, num_entries, devices, num_devices); + } + default: + return CL_INVALID_VALUE; + } + return CL_SUCCESS; } -cl_int -Device::partitionByCounts( - const device::CreateSubDevicesInfo& create_info, - cl_uint num_entries, - cl_device_id* devices, - cl_uint* num_devices) -{ - cl_uint maxComputeUnits = 0; - cl_uint numSubDevices = (cl_uint)create_info.p_.byCounts_.listSize_; - for (size_t i = (size_t)numSubDevices; i > 0; --i) { - maxComputeUnits += create_info.countsListAt(i); +cl_int Device::partitionEqually(const device::CreateSubDevicesInfo& create_info, + cl_uint num_entries, cl_device_id* devices, cl_uint* num_devices) { + cl_uint subComputeUnits = (cl_uint)create_info.p_.equally_.numComputeUnits_; + if (subComputeUnits == 0) { + return CL_INVALID_VALUE; + } + + cl_uint numSubDevices = info_.maxComputeUnits_ / subComputeUnits; + if (numSubDevices == 0) { + return CL_DEVICE_PARTITION_FAILED; + } + + if (num_devices != 
NULL) { + *num_devices = numSubDevices; + } + + if (devices != NULL) { + if (num_entries < numSubDevices) { + return CL_INVALID_VALUE; } - if (numSubDevices == 0 || maxComputeUnits > info_.maxComputeUnits_) { - return CL_INVALID_DEVICE_PARTITION_COUNT; + uint coreId = (uint)-1; + while (numSubDevices-- > 0) { + Device* device = new Device(this); + if (device == NULL) { + return CL_OUT_OF_HOST_MEMORY; + } + + if (!device->create() || !device->initSubDevice(info_, subComputeUnits, create_info)) { + device->release(); + return CL_OUT_OF_HOST_MEMORY; + } + + device->setWorkerThreadsAffinity(subComputeUnits, workerThreadsAffinity_, coreId); + *devices++ = as_cl(static_cast(device)); } + } - if (num_devices != NULL) { - *num_devices = numSubDevices; - } - - if (devices != NULL) { - if (num_entries < numSubDevices) { - return CL_INVALID_VALUE; - } - uint coreId = (uint)-1; - while (numSubDevices-- > 0) { - Device* device = new Device(this); - if (device == NULL) { - return CL_OUT_OF_HOST_MEMORY; - } - - cl_uint subComputeUnits = - create_info.countsListAt((size_t)numSubDevices); - if (!device->create() || - !device->initSubDevice(info_, subComputeUnits, create_info)) { - device->release(); - return CL_OUT_OF_HOST_MEMORY; - } - - device->setWorkerThreadsAffinity( - subComputeUnits, workerThreadsAffinity_, coreId); - *devices++ = as_cl(static_cast(device)); - } - } - - return CL_SUCCESS; + return CL_SUCCESS; } -cl_int -Device::partitionByAffinityDomainNUMA( - const device::CreateSubDevicesInfo& create_info, - cl_uint num_entries, - cl_device_id* devices, - cl_uint* num_devices) -{ - cl_uint numSubDevices = 0; +cl_int Device::partitionByCounts(const device::CreateSubDevicesInfo& create_info, + cl_uint num_entries, cl_device_id* devices, cl_uint* num_devices) { + cl_uint maxComputeUnits = 0; + cl_uint numSubDevices = (cl_uint)create_info.p_.byCounts_.listSize_; + for (size_t i = (size_t)numSubDevices; i > 0; --i) { + maxComputeUnits += create_info.countsListAt(i); + } + if 
(numSubDevices == 0 || maxComputeUnits > info_.maxComputeUnits_) { + return CL_INVALID_DEVICE_PARTITION_COUNT; + } + + if (num_devices != NULL) { + *num_devices = numSubDevices; + } + + if (devices != NULL) { + if (num_entries < numSubDevices) { + return CL_INVALID_VALUE; + } + uint coreId = (uint)-1; + while (numSubDevices-- > 0) { + Device* device = new Device(this); + if (device == NULL) { + return CL_OUT_OF_HOST_MEMORY; + } + + cl_uint subComputeUnits = create_info.countsListAt((size_t)numSubDevices); + if (!device->create() || !device->initSubDevice(info_, subComputeUnits, create_info)) { + device->release(); + return CL_OUT_OF_HOST_MEMORY; + } + + device->setWorkerThreadsAffinity(subComputeUnits, workerThreadsAffinity_, coreId); + *devices++ = as_cl(static_cast(device)); + } + } + + return CL_SUCCESS; +} + +cl_int Device::partitionByAffinityDomainNUMA(const device::CreateSubDevicesInfo& create_info, + cl_uint num_entries, cl_device_id* devices, + cl_uint* num_devices) { + cl_uint numSubDevices = 0; #if defined(__linux__) #if !defined(NUMA_SUPPORT) - return CL_INVALID_VALUE; + return CL_INVALID_VALUE; #else - int highestNuma = numa_max_node(); - if (highestNuma < 0) { + int highestNuma = numa_max_node(); + if (highestNuma < 0) { + return CL_INVALID_VALUE; + } + + numSubDevices = (cl_uint)highestNuma; + if (devices != NULL) { + for (int node = 0; node <= highestNuma; ++node) { + cl_uint subComputeUnits = 0; + int len = 1; + while (true) { + ulong* cpus = alloca(sizeof(ulong) * len); + if (numa_node_to_cpus(node, cpus, len * sizeof(ulong)) < 0) { + if (errno != ERANGE) { + return CL_INVALID_VALUE; + } + len *= 2; + } else { + len *= sizeof(ulong) * 8; + for (int i = 0; i < len; i++) { + if (test_bit(i, cpus)) { + ++subComputeUnits; + } + } + break; + } + } + + if (subComputeUnits == 0) { return CL_INVALID_VALUE; + } + + Device* device = new Device(this); + if (device == NULL) { + return CL_OUT_OF_HOST_MEMORY; + } + + if (!device->create() || NULL == 
(device->numaMask_ = new nodemask_t)) { + device->release(); + return CL_OUT_OF_HOST_MEMORY; + } + + + if (!device->initSubDevice(info_, subComputeUnits, create_info)) { + delete device->numaMask_; + device->numaMask_ = NULL; + device->release(); + return CL_OUT_OF_HOST_MEMORY; + } + + nodemask_zero(device->numaMask_); + nodemask_set(device->numaMask_, node); + // Need to remove this domain type + device->info_.affinityDomain_.numa_ = 0; + *devices++ = as_cl(static_cast(device)); + } + } +#endif // NUMA_SUPPORT + +#else // win32 + GROUP_AFFINITY numaNodeMask; + ULONG highestNuma = 0; + if (!::GetNumaHighestNodeNumber(&highestNuma)) { + return CL_INVALID_VALUE; + } + + for (ULONG node = 0; node <= highestNuma; ++node) { + if (pfnGetNumaNodeProcessorMaskEx != NULL) { + if (!pfnGetNumaNodeProcessorMaskEx((USHORT)node, &numaNodeMask)) { + // Highet NUMA node number is not guaranteed to be the + // number of nodes. + continue; + } + } else { + ULONGLONG tmpMask; + if (!::GetNumaNodeProcessorMask((UCHAR)node, &tmpMask)) { + // Highet NUMA node number is not guaranteed to be the + // number of nodes. 
+ continue; + } + numaNodeMask.Group = 0; + numaNodeMask.Mask = (KAFFINITY)tmpMask; } - numSubDevices = (cl_uint)highestNuma; - if (devices != NULL) { - for (int node = 0; node <= highestNuma; ++node) { - cl_uint subComputeUnits = 0; - int len = 1; - while (true) { - ulong* cpus = alloca(sizeof(ulong)*len); - if (numa_node_to_cpus(node, cpus, len * sizeof(ulong)) < 0) { - if (errno != ERANGE) { - return CL_INVALID_VALUE; - } - len *= 2; - } - else { - len *= sizeof(ulong) * 8; - for (int i = 0; i < len; i++) { - if (test_bit(i, cpus)) { - ++subComputeUnits; - } - } - break; - } - } - - if (subComputeUnits == 0) { - return CL_INVALID_VALUE; - } - - Device* device = new Device(this); - if (device == NULL) { - return CL_OUT_OF_HOST_MEMORY; - } - - if (!device->create() || NULL == (device->numaMask_ = new nodemask_t)) { - device->release(); - return CL_OUT_OF_HOST_MEMORY; - } - - - if (!device->initSubDevice( - info_, subComputeUnits, create_info)) { - delete device->numaMask_; - device->numaMask_ = NULL; - device->release(); - return CL_OUT_OF_HOST_MEMORY; - } - - nodemask_zero(device->numaMask_); - nodemask_set(device->numaMask_, node); - // Need to remove this domain type - device->info_.affinityDomain_.numa_ = 0; - *devices++ = as_cl(static_cast(device)); - } - } -#endif // NUMA_SUPPORT - -#else // win32 - GROUP_AFFINITY numaNodeMask; - ULONG highestNuma = 0; - if (!::GetNumaHighestNodeNumber(&highestNuma)) { - return CL_INVALID_VALUE; - } - - for (ULONG node = 0; node <= highestNuma; ++node) { - if (pfnGetNumaNodeProcessorMaskEx != NULL) { - if (!pfnGetNumaNodeProcessorMaskEx((USHORT)node, &numaNodeMask)) { - // Highet NUMA node number is not guaranteed to be the - // number of nodes. - continue; - } - } - else { - ULONGLONG tmpMask; - if (!::GetNumaNodeProcessorMask((UCHAR)node, &tmpMask)) { - // Highet NUMA node number is not guaranteed to be the - // number of nodes. 
- continue; - } - numaNodeMask.Group = 0; - numaNodeMask.Mask = (KAFFINITY)tmpMask; - } - - if (workerThreadsAffinity_ != NULL) { - workerThreadsAffinity_->adjust(0, numaNodeMask.Mask); - } - if (numaNodeMask.Mask == 0) { - continue; - } - - if (devices != NULL) { - Device* device = new Device(this); - if (device == NULL) { - return CL_OUT_OF_HOST_MEMORY; - } - - if (!device->create() || !device->initSubDevice(info_, - (cl_uint)amd::countBitsSet(numaNodeMask.Mask), create_info)) { - device->release(); - return CL_OUT_OF_HOST_MEMORY; - } - - device->workerThreadsAffinity_->set( - numaNodeMask.Group, numaNodeMask.Mask); - // Need to remove this domain type - device->info_.affinityDomain_.numa_ = 0; - *devices++ = as_cl(static_cast(device)); - } - numSubDevices++; - } - -#endif // win32 - - if (num_devices != NULL) { - *num_devices = numSubDevices; - } - - // Could not get a processor mask for any of the nodes - if (numSubDevices == 0) { - return CL_INVALID_VALUE; - } - return CL_SUCCESS; -} - -#if defined(__linux__) -static bool -readFileString(const char* file, char* buf, size_t bufSize) -{ - int fd = open(file, O_RDONLY); - if (fd < 0) { - return false; - } - - struct stat st; - if (fstat(fd, &st) < 0) { - close(fd); - return false; - } - - if ((size_t)st.st_size < bufSize) { - bufSize = (size_t)st.st_size; - } - - ssize_t n = read(fd, buf, bufSize); - close(fd); - - if (n <= 0) { - return false; - } - - if (n >= (ssize_t)bufSize) { - n = (ssize_t)bufSize - 1; - } - buf[n] = '\0'; - return true; -} - -static void -parseSharedCpuMap(const char* cpuMap, cpu_set_t& mask) -{ - CPU_ZERO(&mask); - uint32_t* bits = (uint32_t*)mask.__bits; - const char* s = cpuMap + strlen(cpuMap); - while (true) { - s = (const char*)memrchr(cpuMap, ',', s - cpuMap); - if (!s) { - s = cpuMap; - } - else { - s++; - } - - *bits++ = strtoul(s, NULL, 16); - - if (s == cpuMap) { - return; - } - - --s; - } -} -#endif // linux - -cl_int -Device::partitionByAffinityDomainCacheLevel( - const 
device::CreateSubDevicesInfo& create_info, - cl_uint num_entries, - cl_device_id* devices, - cl_uint* num_devices) -{ - cl_uint cacheLevel = 0; - switch (create_info.p_.byAffinityDomain_.value_) { - case device::AffinityDomain::AFFINITY_DOMAIN_L4_CACHE: - cacheLevel = 4; - break; - case device::AffinityDomain::AFFINITY_DOMAIN_L3_CACHE: - cacheLevel = 3; - break; - case device::AffinityDomain::AFFINITY_DOMAIN_L2_CACHE: - cacheLevel = 2; - break; - case device::AffinityDomain::AFFINITY_DOMAIN_L1_CACHE: - cacheLevel = 1; - break; - default: - return CL_INVALID_VALUE; - } - - const uint negAffinityDomain = - ~create_info.p_.byAffinityDomain_.value_; - cl_uint numSubDevices = 0; - -#if defined(__linux__) - - amd::Os::ThreadAffinityMask affinityMask; if (workerThreadsAffinity_ != NULL) { - affinityMask = *workerThreadsAffinity_; + workerThreadsAffinity_->adjust(0, numaNodeMask.Mask); } - else { - for (uint cpuId = 0; cpuId < (uint)info_.maxComputeUnits_; ++cpuId) { - affinityMask.set(cpuId); - } + if (numaNodeMask.Mask == 0) { + continue; } - amd::Os::ThreadAffinityMask currentMask; - char buf[1024]; - for (uint cpuId = affinityMask.getFirstSet(); - cpuId != (uint)-1; - cpuId = affinityMask.getNextSet(cpuId)) { + if (devices != NULL) { + Device* device = new Device(this); + if (device == NULL) { + return CL_OUT_OF_HOST_MEMORY; + } - sprintf(buf, - "/sys/devices/system/cpu/cpu%u/cache/index%u/shared_cpu_map", - cpuId, cacheLevel); + if (!device->create() || + !device->initSubDevice(info_, (cl_uint)amd::countBitsSet(numaNodeMask.Mask), + create_info)) { + device->release(); + return CL_OUT_OF_HOST_MEMORY; + } + + device->workerThreadsAffinity_->set(numaNodeMask.Group, numaNodeMask.Mask); + // Need to remove this domain type + device->info_.affinityDomain_.numa_ = 0; + *devices++ = as_cl(static_cast(device)); + } + numSubDevices++; + } + +#endif // win32 + + if (num_devices != NULL) { + *num_devices = numSubDevices; + } + + // Could not get a processor mask for any of the 
nodes + if (numSubDevices == 0) { + return CL_INVALID_VALUE; + } + return CL_SUCCESS; +} + +#if defined(__linux__) +static bool readFileString(const char* file, char* buf, size_t bufSize) { + int fd = open(file, O_RDONLY); + if (fd < 0) { + return false; + } + + struct stat st; + if (fstat(fd, &st) < 0) { + close(fd); + return false; + } + + if ((size_t)st.st_size < bufSize) { + bufSize = (size_t)st.st_size; + } + + ssize_t n = read(fd, buf, bufSize); + close(fd); + + if (n <= 0) { + return false; + } + + if (n >= (ssize_t)bufSize) { + n = (ssize_t)bufSize - 1; + } + buf[n] = '\0'; + return true; +} + +static void parseSharedCpuMap(const char* cpuMap, cpu_set_t& mask) { + CPU_ZERO(&mask); + uint32_t* bits = (uint32_t*)mask.__bits; + const char* s = cpuMap + strlen(cpuMap); + while (true) { + s = (const char*)memrchr(cpuMap, ',', s - cpuMap); + if (!s) { + s = cpuMap; + } else { + s++; + } + + *bits++ = strtoul(s, NULL, 16); + + if (s == cpuMap) { + return; + } + + --s; + } +} +#endif // linux + +cl_int Device::partitionByAffinityDomainCacheLevel(const device::CreateSubDevicesInfo& create_info, + cl_uint num_entries, cl_device_id* devices, + cl_uint* num_devices) { + cl_uint cacheLevel = 0; + switch (create_info.p_.byAffinityDomain_.value_) { + case device::AffinityDomain::AFFINITY_DOMAIN_L4_CACHE: + cacheLevel = 4; + break; + case device::AffinityDomain::AFFINITY_DOMAIN_L3_CACHE: + cacheLevel = 3; + break; + case device::AffinityDomain::AFFINITY_DOMAIN_L2_CACHE: + cacheLevel = 2; + break; + case device::AffinityDomain::AFFINITY_DOMAIN_L1_CACHE: + cacheLevel = 1; + break; + default: + return CL_INVALID_VALUE; + } + + const uint negAffinityDomain = ~create_info.p_.byAffinityDomain_.value_; + cl_uint numSubDevices = 0; + +#if defined(__linux__) + + amd::Os::ThreadAffinityMask affinityMask; + if (workerThreadsAffinity_ != NULL) { + affinityMask = *workerThreadsAffinity_; + } else { + for (uint cpuId = 0; cpuId < (uint)info_.maxComputeUnits_; ++cpuId) { + 
affinityMask.set(cpuId); + } + } + + amd::Os::ThreadAffinityMask currentMask; + char buf[1024]; + for (uint cpuId = affinityMask.getFirstSet(); cpuId != (uint)-1; + cpuId = affinityMask.getNextSet(cpuId)) { + sprintf(buf, "/sys/devices/system/cpu/cpu%u/cache/index%u/shared_cpu_map", cpuId, cacheLevel); + + if (!readFileString(buf, buf, sizeof(buf))) { + return CL_INVALID_VALUE; + } + + parseSharedCpuMap(buf, currentMask.getNative()); + affinityMask.adjust(currentMask.getNative()); + if (currentMask.isEmpty()) { + continue; + } + + cl_uint maxComputeUnits; + if (cacheLevel > 1) { + maxComputeUnits = 0; + amd::Os::ThreadAffinityMask currentMaskSub; + cl_uint cacheLevelSub = cacheLevel - 1; + for (uint cpuIdSub = affinityMask.getFirstSet(); cpuIdSub != (uint)-1; + cpuIdSub = affinityMask.getNextSet(cpuIdSub)) { + sprintf(buf, "/sys/devices/system/cpu/cpu%u/cache/index%u/shared_cpu_map", cpuIdSub, + cacheLevelSub); if (!readFileString(buf, buf, sizeof(buf))) { - return CL_INVALID_VALUE; + return CL_INVALID_VALUE; } - parseSharedCpuMap(buf, currentMask.getNative()); - affinityMask.adjust(currentMask.getNative()); - if (currentMask.isEmpty()) { + parseSharedCpuMap(buf, currentMaskSub.getNative()); + currentMask.adjust(currentMaskSub.getNative()); + if (!currentMaskSub.isEmpty()) { + ++maxComputeUnits; + } + } + + if (maxComputeUnits == 0) { + continue; + } + } else { + maxComputeUnits = 1; + } + + if (devices != NULL) { + Device* device = new Device(this); + if (device == NULL) { + return CL_OUT_OF_HOST_MEMORY; + } + + if (!device->create() || !device->initSubDevice(info_, maxComputeUnits, create_info)) { + device->release(); + return CL_OUT_OF_HOST_MEMORY; + } + + device->workerThreadsAffinity_->set(currentMask.getNative()); + // Need to remove this domain type + device->info_.affinityDomain_.value_ &= negAffinityDomain; + *devices++ = as_cl(static_cast(device)); + } + numSubDevices++; + affinityMask.clear(currentMask.getNative()); + } + +#else // win32 + DWORD length = 
0; + ::GetLogicalProcessorInformation(NULL, &length); + + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = + (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(length); + + if (buffer != NULL && ::GetLogicalProcessorInformation(buffer, &length)) { + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr, + limit = &buffer[length / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION)]; + + for (ptr = buffer; ptr < limit; ++ptr) { + PCACHE_DESCRIPTOR cache = &ptr->Cache; + if (ptr->Relationship == RelationCache && cache->Type != CacheInstruction) { + if (cache->Level == cacheLevel) { + KAFFINITY affinityMask = (KAFFINITY)ptr->ProcessorMask; + if (workerThreadsAffinity_ != NULL) { + workerThreadsAffinity_->adjust(0, affinityMask); + } + if (affinityMask == 0) { continue; - } + } - cl_uint maxComputeUnits; - if (cacheLevel > 1) { + cl_uint maxComputeUnits; + if (cacheLevel > 1) { maxComputeUnits = 0; - amd::Os::ThreadAffinityMask currentMaskSub; cl_uint cacheLevelSub = cacheLevel - 1; - for (uint cpuIdSub = affinityMask.getFirstSet(); - cpuIdSub != (uint)-1; - cpuIdSub = affinityMask.getNextSet(cpuIdSub)) { - - sprintf(buf, - "/sys/devices/system/cpu/cpu%u/cache/index%u/shared_cpu_map", - cpuIdSub, cacheLevelSub); - - if (!readFileString(buf, buf, sizeof(buf))) { - return CL_INVALID_VALUE; - } - - parseSharedCpuMap(buf, currentMaskSub.getNative()); - currentMask.adjust(currentMaskSub.getNative()); - if (!currentMaskSub.isEmpty()) { - ++maxComputeUnits; + for (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptrSub = buffer; ptrSub < limit; ++ptrSub) { + PCACHE_DESCRIPTOR cacheSub = &ptrSub->Cache; + if (ptrSub->Relationship == RelationCache && cacheSub->Type != CacheInstruction) { + if (cacheSub->Level == cacheLevelSub && + ((affinityMask & (KAFFINITY)ptrSub->ProcessorMask) != 0)) { + ++maxComputeUnits; } + } } if (maxComputeUnits == 0) { - continue; + continue; } - } - else { + } else { maxComputeUnits = 1; - } + } - if (devices != NULL) { + if (devices != NULL) { Device* device = new Device(this); if 
(device == NULL) { - return CL_OUT_OF_HOST_MEMORY; + free(buffer); + return CL_OUT_OF_HOST_MEMORY; } - if (!device->create() || - !device->initSubDevice(info_, maxComputeUnits, create_info)) { - device->release(); - return CL_OUT_OF_HOST_MEMORY; + if (!device->create() || !device->initSubDevice(info_, maxComputeUnits, create_info)) { + free(buffer); + device->release(); + return CL_OUT_OF_HOST_MEMORY; } - device->workerThreadsAffinity_->set(currentMask.getNative()); + device->workerThreadsAffinity_->set(0, affinityMask); // Need to remove this domain type device->info_.affinityDomain_.value_ &= negAffinityDomain; *devices++ = as_cl(static_cast(device)); + } + numSubDevices++; + if (numSubDevices >= info_.maxComputeUnits_) { + break; + } } - numSubDevices++; - affinityMask.clear(currentMask.getNative()); + } } + } -#else // win32 - DWORD length = 0; - ::GetLogicalProcessorInformation(NULL, &length); - - PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = - (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION) malloc(length); - - if (buffer != NULL && ::GetLogicalProcessorInformation(buffer, &length)) { - PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr, limit = - &buffer[length / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION)]; - - for (ptr = buffer; ptr < limit; ++ptr) { - PCACHE_DESCRIPTOR cache = &ptr->Cache; - if (ptr->Relationship == RelationCache && cache->Type != CacheInstruction) { - if (cache->Level == cacheLevel) { - KAFFINITY affinityMask = (KAFFINITY)ptr->ProcessorMask; - if (workerThreadsAffinity_ != NULL) { - workerThreadsAffinity_->adjust(0, affinityMask); - } - if (affinityMask == 0) { - continue; - } - - cl_uint maxComputeUnits; - if (cacheLevel > 1) { - maxComputeUnits = 0; - cl_uint cacheLevelSub = cacheLevel - 1; - for (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION - ptrSub = buffer; ptrSub < limit; ++ptrSub) { - - PCACHE_DESCRIPTOR cacheSub = &ptrSub->Cache; - if (ptrSub->Relationship == RelationCache && - cacheSub->Type != CacheInstruction) { - if (cacheSub->Level == 
cacheLevelSub && - ((affinityMask & (KAFFINITY)ptrSub->ProcessorMask) != 0)) { - ++maxComputeUnits; - } - } - } - - if (maxComputeUnits == 0) { - continue; - } - } - else { - maxComputeUnits = 1; - } - - if (devices != NULL) { - Device* device = new Device(this); - if (device == NULL) { - free(buffer); - return CL_OUT_OF_HOST_MEMORY; - } - - if (!device->create() || !device->initSubDevice(info_, - maxComputeUnits, create_info)) { - free(buffer); - device->release(); - return CL_OUT_OF_HOST_MEMORY; - } - - device->workerThreadsAffinity_->set(0, affinityMask); - // Need to remove this domain type - device->info_.affinityDomain_.value_ &= negAffinityDomain; - *devices++ = as_cl(static_cast(device)); - } - numSubDevices++; - if (numSubDevices >= info_.maxComputeUnits_) { - break; - } - } - } - } - } - - free(buffer); + free(buffer); #endif - if (num_devices != NULL) { - *num_devices = numSubDevices; - } + if (num_devices != NULL) { + *num_devices = numSubDevices; + } - if (numSubDevices == 0) { - return CL_INVALID_VALUE; - } + if (numSubDevices == 0) { + return CL_INVALID_VALUE; + } - return CL_SUCCESS; + return CL_SUCCESS; } -device::Program* -Device::createProgram(amd::option::Options* options) -{ - Program* cpuProgram = new Program(*this); - if (cpuProgram == NULL) { - LogError("We failed memory allocation for program!"); - } +device::Program* Device::createProgram(amd::option::Options* options) { + Program* cpuProgram = new Program(*this); + if (cpuProgram == NULL) { + LogError("We failed memory allocation for program!"); + } - return cpuProgram; + return cpuProgram; } -void* -Device::allocMapTarget( - amd::Memory& mem, - const amd::Coord3D& origin, - const amd::Coord3D& region, - uint mapFlags, - size_t* rowPitch, - size_t* slicePitch) -{ - if (mem.asImage() != NULL) { - amd::Image * image = mem.asImage(); - size_t elementSize = image->getImageFormat().getElementSize(); - size_t rp = image->getRowPitch(); - size_t sp = image->getSlicePitch(); - *rowPitch = rp; - 
if (slicePitch) { - *slicePitch = sp; - } - return (address) image->getHostMem() - + (origin[0] * elementSize + origin[1] * rp + origin[2] * sp); - } - else if (mem.asBuffer() != NULL) { - return (address) mem.getHostMem() + origin[0]; +void* Device::allocMapTarget(amd::Memory& mem, const amd::Coord3D& origin, + const amd::Coord3D& region, uint mapFlags, size_t* rowPitch, + size_t* slicePitch) { + if (mem.asImage() != NULL) { + amd::Image* image = mem.asImage(); + size_t elementSize = image->getImageFormat().getElementSize(); + size_t rp = image->getRowPitch(); + size_t sp = image->getSlicePitch(); + *rowPitch = rp; + if (slicePitch) { + *slicePitch = sp; } + return (address)image->getHostMem() + + (origin[0] * elementSize + origin[1] * rp + origin[2] * sp); + } else if (mem.asBuffer() != NULL) { + return (address)mem.getHostMem() + origin[0]; + } - return NULL; + return NULL; } -void -Device::freeMapTarget(amd::Memory& mem, void* target) -{ - // nop for CPU +void Device::freeMapTarget(amd::Memory& mem, void* target) { + // nop for CPU } -} // namespace cpu +} // namespace cpu diff --git a/rocclr/runtime/device/cpu/cpudevice.hpp b/rocclr/runtime/device/cpu/cpudevice.hpp index 7399721aeb..16df72d541 100644 --- a/rocclr/runtime/device/cpu/cpudevice.hpp +++ b/rocclr/runtime/device/cpu/cpudevice.hpp @@ -21,9 +21,9 @@ namespace cpu { //! Maximum number of the supported samplers -const static uint32_t MaxSamplers = 16; +const static uint32_t MaxSamplers = 16; //! Maximum number of supported read images -const static uint32_t MaxReadImage = 128; +const static uint32_t MaxReadImage = 128; //! Maximum number of supported write images const static uint32_t MaxWriteImage = 64; //! Maximum number of supported read/write images @@ -40,203 +40,169 @@ const static uint32_t MaxReadWriteImage = 64; */ //! 
A CPU device ordinal -class Device : public amd::Device -{ -protected: - static aclCompiler* compiler_; -public: - aclCompiler* compiler() const { return compiler_; } +class Device : public amd::Device { + protected: + static aclCompiler* compiler_; -public: - static bool init(void); + public: + aclCompiler* compiler() const { return compiler_; } - //! Shutdown CPU device - static void tearDown(); + public: + static bool init(void); - //! Construct a new identifier - Device(Device* parent = NULL) : - amd::Device(parent), - workerThreadsAffinity_(NULL) - {} + //! Shutdown CPU device + static void tearDown(); - virtual ~Device(); + //! Construct a new identifier + Device(Device* parent = NULL) : amd::Device(parent), workerThreadsAffinity_(NULL) {} - bool create(); + virtual ~Device(); - virtual cl_int createSubDevices( - device::CreateSubDevicesInfo& create_info, - cl_uint num_entries, - cl_device_id* devices, - cl_uint* num_devices); + bool create(); - //! Instantiate a new virtual device - virtual device::VirtualDevice* createVirtualDevice( - amd::CommandQueue* queue = NULL - ) - { - VirtualCPU* virtualCpu = new VirtualCPU(*this); - if (virtualCpu != NULL && !virtualCpu->acceptingCommands()) { - virtualCpu->terminate(); - delete virtualCpu; - virtualCpu = NULL; - } - return virtualCpu; + virtual cl_int createSubDevices(device::CreateSubDevicesInfo& create_info, cl_uint num_entries, + cl_device_id* devices, cl_uint* num_devices); + + //! Instantiate a new virtual device + virtual device::VirtualDevice* createVirtualDevice(amd::CommandQueue* queue = NULL) { + VirtualCPU* virtualCpu = new VirtualCPU(*this); + if (virtualCpu != NULL && !virtualCpu->acceptingCommands()) { + virtualCpu->terminate(); + delete virtualCpu; + virtualCpu = NULL; } + return virtualCpu; + } - //! Compile the given source code. - virtual device::Program* createProgram(amd::option::Options* options = NULL); + //! Compile the given source code. 
+ virtual device::Program* createProgram(amd::option::Options* options = NULL); - //! Just returns NULL as CPU devices use the host memory - virtual device::Memory* createMemory(amd::Memory& owner) const - { - return NULL; - } + //! Just returns NULL as CPU devices use the host memory + virtual device::Memory* createMemory(amd::Memory& owner) const { return NULL; } - //! Sampler object allocation - virtual bool createSampler( - const amd::Sampler& owner, //!< abstraction layer sampler object - device::Sampler** sampler //!< device sampler object - ) const - { - // Just return NULL on CPU device - *sampler = NULL; - return true; - } + //! Sampler object allocation + virtual bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object + device::Sampler** sampler //!< device sampler object + ) const { + // Just return NULL on CPU device + *sampler = NULL; + return true; + } - //! Reallocates device memory obje - virtual bool reallocMemory(amd::Memory& owner) const - { - return true; - } + //! Reallocates device memory obje + virtual bool reallocMemory(amd::Memory& owner) const { return true; } - //! Just returns NULL as CPU devices use the host memory - virtual device::Memory* createView( - amd::Memory& owner, //!< Owner memory object - const device::Memory& parent //!< Parent device memory object for the view - ) const - { - return NULL; - } + //! Just returns NULL as CPU devices use the host memory + virtual device::Memory* createView( + amd::Memory& owner, //!< Owner memory object + const device::Memory& parent //!< Parent device memory object for the view + ) const { + return NULL; + } - //! Acquire external graphics API object in the host thread - //! Needed for OpenGL objects on CPU device + //! Acquire external graphics API object in the host thread + //! Needed for OpenGL objects on CPU device - //! 
Return true if initialized interoperability, otherwise false - virtual bool bindExternalDevice(uint flags, void* const pDevice[], void* pContext, bool validateOnly) - { - return true; // On CPU always avail if pD3DDevice is not NULL - } + //! Return true if initialized interoperability, otherwise false + virtual bool bindExternalDevice(uint flags, void* const pDevice[], void* pContext, + bool validateOnly) { + return true; // On CPU always avail if pD3DDevice is not NULL + } - virtual bool unbindExternalDevice(uint flags, void* const pDevice[], void* pContext, bool validateOnly) - { - return true; - } + virtual bool unbindExternalDevice(uint flags, void* const pDevice[], void* pContext, + bool validateOnly) { + return true; + } - //! Gets a pointer to a region of host-visible memory for use as the target - //! of a non-blocking map for a given memory object - virtual void* allocMapTarget( - amd::Memory& mem, //!< Abstraction layer memory object - const amd::Coord3D& origin, //!< The map location in memory - const amd::Coord3D& region, //!< The map region in memory - uint mapFlags, //!< Map flags - size_t* rowPitch = NULL, //!< Row pitch for the mapped memory - size_t* slicePitch = NULL //!< Slice for the mapped memory - ); + //! Gets a pointer to a region of host-visible memory for use as the target + //! of a non-blocking map for a given memory object + virtual void* allocMapTarget(amd::Memory& mem, //!< Abstraction layer memory object + const amd::Coord3D& origin, //!< The map location in memory + const amd::Coord3D& region, //!< The map region in memory + uint mapFlags, //!< Map flags + size_t* rowPitch = NULL, //!< Row pitch for the mapped memory + size_t* slicePitch = NULL //!< Slice for the mapped memory + ); - //! Releases non-blocking map target memory - virtual void freeMapTarget(amd::Memory& mem, void* target); + //! Releases non-blocking map target memory + virtual void freeMapTarget(amd::Memory& mem, void* target); - //! 
Empty implementation on a CPU device - virtual bool globalFreeMemory(size_t* freeMemory) const { return false; } + //! Empty implementation on a CPU device + virtual bool globalFreeMemory(size_t* freeMemory) const { return false; } - //! Get CPU device settings - const cpu::Settings& settings() const - { return reinterpret_cast(*settings_); } + //! Get CPU device settings + const cpu::Settings& settings() const { return reinterpret_cast(*settings_); } - bool hasAVXInstructions() const - { return (settings().cpuFeatures_ & Settings::AVXInstructions) ? true : false; } + bool hasAVXInstructions() const { + return (settings().cpuFeatures_ & Settings::AVXInstructions) ? true : false; + } - bool hasFMA4Instructions() const - { return (settings().cpuFeatures_ & Settings::FMA4Instructions) ? true : false; } + bool hasFMA4Instructions() const { + return (settings().cpuFeatures_ & Settings::FMA4Instructions) ? true : false; + } - static size_t getMaxWorkerThreadsNumber() { return maxWorkerThreads_; } + static size_t getMaxWorkerThreadsNumber() { return maxWorkerThreads_; } - void setWorkerThreadsAffinity( - cl_uint numWorkerThreads, - const amd::Os::ThreadAffinityMask* threadsAffinityMask, - uint& baseCoreId); + void setWorkerThreadsAffinity(cl_uint numWorkerThreads, + const amd::Os::ThreadAffinityMask* threadsAffinityMask, + uint& baseCoreId); - const amd::Os::ThreadAffinityMask* getWorkerThreadsAffinity() const - { - return workerThreadsAffinity_; - } - //! host memory alloc - virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags, void* svmPtr) const - { - return NULL; - } + const amd::Os::ThreadAffinityMask* getWorkerThreadsAffinity() const { + return workerThreadsAffinity_; + } + //! host memory alloc + virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment, + cl_svm_mem_flags flags, void* svmPtr) const { + return NULL; + } - //! 
host memory deallocation - virtual void svmFree(void* ptr) const - { - return; - } -private: - bool initSubDevice( - device::Info& info, - cl_uint maxComputeUnits, - const device::CreateSubDevicesInfo& create_info); + //! host memory deallocation + virtual void svmFree(void* ptr) const { return; } - cl_int partitionEqually( - const device::CreateSubDevicesInfo& create_info, - cl_uint num_entries, - cl_device_id* devices, - cl_uint* num_devices); + private: + bool initSubDevice(device::Info& info, cl_uint maxComputeUnits, + const device::CreateSubDevicesInfo& create_info); - cl_int partitionByCounts( - const device::CreateSubDevicesInfo& create_info, - cl_uint num_entries, - cl_device_id* devices, - cl_uint* num_devices); + cl_int partitionEqually(const device::CreateSubDevicesInfo& create_info, cl_uint num_entries, + cl_device_id* devices, cl_uint* num_devices); - cl_int partitionByAffinityDomainNUMA( - const device::CreateSubDevicesInfo& create_info, - cl_uint num_entries, - cl_device_id* devices, - cl_uint* num_devices); + cl_int partitionByCounts(const device::CreateSubDevicesInfo& create_info, cl_uint num_entries, + cl_device_id* devices, cl_uint* num_devices); - cl_int partitionByAffinityDomainCacheLevel( - const device::CreateSubDevicesInfo& create_info, - cl_uint num_entries, - cl_device_id* devices, - cl_uint* num_devices); + cl_int partitionByAffinityDomainNUMA(const device::CreateSubDevicesInfo& create_info, + cl_uint num_entries, cl_device_id* devices, + cl_uint* num_devices); -private: + cl_int partitionByAffinityDomainCacheLevel(const device::CreateSubDevicesInfo& create_info, + cl_uint num_entries, cl_device_id* devices, + cl_uint* num_devices); + + private: #if defined(__linux__) && defined(NUMA_SUPPORT) -public: - const nodemask_t* getNumaMask() const - { - return (info_.partitionCreateInfo_.type_ == device::PartitionType::BY_AFFINITY_DOMAIN && - info_.partitionCreateInfo_.byAffinityDomain_.numa_) ? 
- numaMask_ : NULL; - } + public: + const nodemask_t* getNumaMask() const { + return (info_.partitionCreateInfo_.type_ == device::PartitionType::BY_AFFINITY_DOMAIN && + info_.partitionCreateInfo_.byAffinityDomain_.numa_) + ? numaMask_ + : NULL; + } -private: - union { - nodemask_t* numaMask_; - amd::Os::ThreadAffinityMask* workerThreadsAffinity_; //!< As the number of compute units. - }; + private: + union { + nodemask_t* numaMask_; + amd::Os::ThreadAffinityMask* workerThreadsAffinity_; //!< As the number of compute units. + }; #else - amd::Os::ThreadAffinityMask* workerThreadsAffinity_; //!< As the number of compute units. + amd::Os::ThreadAffinityMask* workerThreadsAffinity_; //!< As the number of compute units. #endif - static size_t maxWorkerThreads_; //!< Maximum number of Worker Threads + static size_t maxWorkerThreads_; //!< Maximum number of Worker Threads }; /*! @} * @} */ -} // namespace cpu +} // namespace cpu -#endif // CPUDEVICE_HPP_ +#endif // CPUDEVICE_HPP_ diff --git a/rocclr/runtime/device/cpu/cpufeat.hpp b/rocclr/runtime/device/cpu/cpufeat.hpp index f97f8a3b7f..75ffceaedb 100644 --- a/rocclr/runtime/device/cpu/cpufeat.hpp +++ b/rocclr/runtime/device/cpu/cpufeat.hpp @@ -24,4 +24,4 @@ #define CPUFEAT_DX_SSE (1 < 25) #define CPUFEAT_DX_SSE2 (1 << 26) -#endif // CPUFEAT_HPP +#endif // CPUFEAT_HPP diff --git a/rocclr/runtime/device/cpu/cpukernel.hpp b/rocclr/runtime/device/cpu/cpukernel.hpp index 5bfa1ff02b..a2544850ce 100644 --- a/rocclr/runtime/device/cpu/cpukernel.hpp +++ b/rocclr/runtime/device/cpu/cpukernel.hpp @@ -15,91 +15,79 @@ namespace cpu { //! 
\class CPU kernel -class Kernel : public device::Kernel -{ -private: - const void* entryPoint_; //!< entry for the kernel +class Kernel : public device::Kernel { + private: + const void* entryPoint_; //!< entry for the kernel - std::vector< std::pair > args_; - std::vector< std::pair < HCtoDCmap, size_t> > HCtoDCmaps_; - std::vector< HCtoDCmap > internal_maps_; -public: - uint nature_; //!< kernel's nature - uint privateSize_; //!< WorkItem's private memory size (in bytes) + std::vector > args_; + std::vector > HCtoDCmaps_; + std::vector internal_maps_; -private: - //! Disable default copy constructor - Kernel(const Kernel&); - //! Disable operator= - Kernel& operator=(const Kernel&); + public: + uint nature_; //!< kernel's nature + uint privateSize_; //!< WorkItem's private memory size (in bytes) -public: - void addArg(size_t size, size_t alignment) { - args_.push_back(std::pair(size, alignment)); - } + private: + //! Disable default copy constructor + Kernel(const Kernel&); + //! Disable operator= + Kernel& operator=(const Kernel&); - size_t getArgSize(int argIndex) const { - return args_[argIndex].first; - } + public: + void addArg(size_t size, size_t alignment) { + args_.push_back(std::pair(size, alignment)); + } - size_t getArgAlignment(int argIndex) const { - return args_[argIndex].second; - } + size_t getArgSize(int argIndex) const { return args_[argIndex].first; } - void addInternalMap(HCtoDCmap *new_map) { - if (new_map != NULL) { - internal_maps_.push_back(*new_map); - this->addInternalMap(new_map->internal_field_map); - this->addInternalMap(new_map->next_field_map); - } - else - return; - } + size_t getArgAlignment(int argIndex) const { return args_[argIndex].second; } - void addHCtoDCmap(HCtoDCmap *new_map) { - if (new_map != NULL) { - if (HCtoDCmaps_.size() > 0) - HCtoDCmaps_.push_back(std::pair< HCtoDCmap, size_t >(*new_map, HCtoDCmaps_.back().second)); - else - HCtoDCmaps_.push_back(std::pair< HCtoDCmap, size_t >(*new_map, 0)); - } - else - return; - 
} + void addInternalMap(HCtoDCmap* new_map) { + if (new_map != NULL) { + internal_maps_.push_back(*new_map); + this->addInternalMap(new_map->internal_field_map); + this->addInternalMap(new_map->next_field_map); + } else + return; + } - HCtoDCmap getHCtoDCmap(int mapIndex) const { - return HCtoDCmaps_[mapIndex].first; - } + void addHCtoDCmap(HCtoDCmap* new_map) { + if (new_map != NULL) { + if (HCtoDCmaps_.size() > 0) + HCtoDCmaps_.push_back(std::pair(*new_map, HCtoDCmaps_.back().second)); + else + HCtoDCmaps_.push_back(std::pair(*new_map, 0)); + } else + return; + } + + HCtoDCmap getHCtoDCmap(int mapIndex) const { return HCtoDCmaps_[mapIndex].first; } - uint getArgNumber() { - return HCtoDCmaps_.size(); - } + uint getArgNumber() { return HCtoDCmaps_.size(); } - //! Default constructor - Kernel(const std::string& name) - : device::Kernel(name), entryPoint_(NULL), nature_(0), - privateSize_(CLK_PRIVATE_MEMORY_SIZE) - { - workGroupInfo_.size_ = CPU_MAX_WORKGROUP_SIZE; - } + //! Default constructor + Kernel(const std::string& name) + : device::Kernel(name), entryPoint_(NULL), nature_(0), privateSize_(CLK_PRIVATE_MEMORY_SIZE) { + workGroupInfo_.size_ = CPU_MAX_WORKGROUP_SIZE; + } - //! Default destructor - ~Kernel() {} + //! Default destructor + ~Kernel() {} - //! Returns the CPU kernel entry point - const void* getEntryPoint() const { return entryPoint_; } + //! Returns the CPU kernel entry point + const void* getEntryPoint() const { return entryPoint_; } - //! Sets the CPU kernel entry point - void setEntryPoint(const void* entryPoint) { entryPoint_ = entryPoint; } + //! Sets the CPU kernel entry point + void setEntryPoint(const void* entryPoint) { entryPoint_ = entryPoint; } - //! Returns true if the kernel has a call to barrier - bool hasBarrier() const { return 0 != (nature_ & KN_HAS_BARRIER); } + //! Returns true if the kernel has a call to barrier + bool hasBarrier() const { return 0 != (nature_ & KN_HAS_BARRIER); } - //! 
Returns the private memory size of a single WorkItem - uint getWorkItemPrivateMemSize() const { return privateSize_; } + //! Returns the private memory size of a single WorkItem + uint getWorkItemPrivateMemSize() const { return privateSize_; } }; -} // namespace cpu +} // namespace cpu -#endif // CPUKERNEL_HPP_ +#endif // CPUKERNEL_HPP_ diff --git a/rocclr/runtime/device/cpu/cpumapping.cpp b/rocclr/runtime/device/cpu/cpumapping.cpp index 3ef0572635..43b3279fae 100644 --- a/rocclr/runtime/device/cpu/cpumapping.cpp +++ b/rocclr/runtime/device/cpu/cpumapping.cpp @@ -25,354 +25,403 @@ using std::min; using std::max; namespace cpu { - HCtoDCmap::HCtoDCmap(const clk_parameter_descriptor_t* desc, unsigned int level_alignment, unsigned int index, unsigned int init_offset) - { - level_alignment = std::max(level_alignment, 1u); // Minimal possible alignment is 1 and alignment is used as a divisor below. - //Initialize fields - hc_offset = 0; - hc_size = 0; - dc_offset = 0; - dc_size = 0; - hc_alignment = level_alignment; - dc_alignment = level_alignment; - internal_field_map = NULL; - next_field_map = NULL; - return; - } +HCtoDCmap::HCtoDCmap(const clk_parameter_descriptor_t* desc, unsigned int level_alignment, + unsigned int index, unsigned int init_offset) { + level_alignment = + std::max(level_alignment, + 1u); // Minimal possible alignment is 1 and alignment is used as a divisor below. 
+ // Initialize fields + hc_offset = 0; + hc_size = 0; + dc_offset = 0; + dc_size = 0; + hc_alignment = level_alignment; + dc_alignment = level_alignment; + internal_field_map = NULL; + next_field_map = NULL; + return; +} - HCtoDCmap::~HCtoDCmap() - { - return; - } +HCtoDCmap::~HCtoDCmap() { return; } - //Helper to find sizes of each scalar type - size_t HCtoDCmap::getHostScalarParamSize(const clk_value_type_t type) const - { - size_t size = 0; - switch (type) { - case T_CHAR: - size = 1; - break; - case T_SHORT: case T_CHAR2: - size = 2; - break; - case T_FLOAT: case T_INT: case T_CHAR4: - case T_SHORT2: case T_CHAR3: - size = 4; - break; - case T_SAMPLER: - size = 4; - break; - case T_LONG: case T_DOUBLE: case T_CHAR8: - case T_SHORT4: case T_INT2: case T_FLOAT2: - case T_SHORT3: - size = 8; - break; - case T_INT3: case T_FLOAT3: - case T_CHAR16: case T_SHORT8: case T_INT4: - case T_FLOAT4: case T_LONG2: case T_DOUBLE2: - size = 16; - break; - case T_LONG3: case T_DOUBLE3: - case T_SHORT16: case T_INT8: case T_FLOAT8: - case T_LONG4: case T_DOUBLE4: - size = 32; - break; - case T_INT16: case T_FLOAT16: case T_LONG8: - case T_DOUBLE8: - size = 64; - break; - case T_LONG16: case T_DOUBLE16: - size = 128; - break; - case T_POINTER: case T_VOID: - size = sizeof(void*); - break; - default: - assert(0 && "unknown scalar parameter size"); - break; - } - return size; - } +// Helper to find sizes of each scalar type +size_t HCtoDCmap::getHostScalarParamSize(const clk_value_type_t type) const { + size_t size = 0; + switch (type) { + case T_CHAR: + size = 1; + break; + case T_SHORT: + case T_CHAR2: + size = 2; + break; + case T_FLOAT: + case T_INT: + case T_CHAR4: + case T_SHORT2: + case T_CHAR3: + size = 4; + break; + case T_SAMPLER: + size = 4; + break; + case T_LONG: + case T_DOUBLE: + case T_CHAR8: + case T_SHORT4: + case T_INT2: + case T_FLOAT2: + case T_SHORT3: + size = 8; + break; + case T_INT3: + case T_FLOAT3: + case T_CHAR16: + case T_SHORT8: + case T_INT4: + case 
T_FLOAT4: + case T_LONG2: + case T_DOUBLE2: + size = 16; + break; + case T_LONG3: + case T_DOUBLE3: + case T_SHORT16: + case T_INT8: + case T_FLOAT8: + case T_LONG4: + case T_DOUBLE4: + size = 32; + break; + case T_INT16: + case T_FLOAT16: + case T_LONG8: + case T_DOUBLE8: + size = 64; + break; + case T_LONG16: + case T_DOUBLE16: + size = 128; + break; + case T_POINTER: + case T_VOID: + size = sizeof(void*); + break; + default: + assert(0 && "unknown scalar parameter size"); + break; + } + return size; +} - size_t HCtoDCmap::getScalarAlignment(const clk_value_type_t type, bool isHost) const - { - size_t align = 0; - switch (type) { - case T_CHAR: - align = 1; - break; - case T_SHORT: case T_CHAR2: - align = 2; - break; - case T_FLOAT: case T_INT: case T_CHAR4: - case T_SHORT2: case T_CHAR3: - align = 4; - break; - case T_SAMPLER: - align = sizeof(uint32_t); - break; - case T_LONG: - #if defined(_WIN32) - align = 8; - #else - align = isHost? 8 : LP64_SWITCH(4, 8); - #endif - break; - case T_DOUBLE: - #if defined(_WIN32) - align = 8; - #else - align = LP64_SWITCH(4, 8); - #endif - break; - case T_CHAR8: - case T_SHORT4: case T_INT2: case T_FLOAT2: - case T_SHORT3: - align = 4; - break; - case T_INT3: case T_FLOAT3: - case T_CHAR16: case T_SHORT8: case T_INT4: - case T_FLOAT4: case T_LONG2: case T_DOUBLE2: - case T_LONG3: case T_DOUBLE3: - case T_SHORT16: case T_INT8: case T_FLOAT8: - case T_LONG4: case T_DOUBLE4: - case T_INT16: case T_FLOAT16: case T_LONG8: - case T_DOUBLE8: - case T_LONG16: case T_DOUBLE16: - align = LP64_SWITCH(4, 8); - break; - case T_POINTER: case T_VOID: - align = sizeof(void*); - break; - default: - assert(0 && "unknown scalar parameter alignment"); - break; - } - return align; - } +size_t HCtoDCmap::getScalarAlignment(const clk_value_type_t type, bool isHost) const { + size_t align = 0; + switch (type) { + case T_CHAR: + align = 1; + break; + case T_SHORT: + case T_CHAR2: + align = 2; + break; + case T_FLOAT: + case T_INT: + case T_CHAR4: + 
case T_SHORT2: + case T_CHAR3: + align = 4; + break; + case T_SAMPLER: + align = sizeof(uint32_t); + break; + case T_LONG: +#if defined(_WIN32) + align = 8; +#else + align = isHost ? 8 : LP64_SWITCH(4, 8); +#endif + break; + case T_DOUBLE: +#if defined(_WIN32) + align = 8; +#else + align = LP64_SWITCH(4, 8); +#endif + break; + case T_CHAR8: + case T_SHORT4: + case T_INT2: + case T_FLOAT2: + case T_SHORT3: + align = 4; + break; + case T_INT3: + case T_FLOAT3: + case T_CHAR16: + case T_SHORT8: + case T_INT4: + case T_FLOAT4: + case T_LONG2: + case T_DOUBLE2: + case T_LONG3: + case T_DOUBLE3: + case T_SHORT16: + case T_INT8: + case T_FLOAT8: + case T_LONG4: + case T_DOUBLE4: + case T_INT16: + case T_FLOAT16: + case T_LONG8: + case T_DOUBLE8: + case T_LONG16: + case T_DOUBLE16: + align = LP64_SWITCH(4, 8); + break; + case T_POINTER: + case T_VOID: + align = sizeof(void*); + break; + default: + assert(0 && "unknown scalar parameter alignment"); + break; + } + return align; +} - // Align up arguments within each map, return the size of current map parameter - // Input current alignment of the parameter, size of outer struct if it exists - void HCtoDCmap::align_map(unsigned outer_hc_alignment, unsigned outer_dc_alignment, unsigned &outer_hc_size, unsigned &outer_dc_size, int &inStruct) - { - unsigned map_param_size = 0; - if (internal_field_map != NULL) { - hc_size = 0; //Recalculate size to account for internal offsets - inStruct++; - internal_field_map->align_map(hc_alignment, dc_alignment, hc_size, dc_size, inStruct); // align internal struct, might alter size of this struct - if (hc_alignment != 1 && hc_size%hc_alignment) - hc_size = max(hc_size, hc_size - (hc_size%hc_alignment) + hc_alignment); - if (dc_alignment != 1 && dc_size%dc_alignment) - dc_size = max(dc_size, dc_size - (dc_size%dc_alignment) + dc_alignment); - } - // Use map_param_size to store current parameter size after adjusting alignment - if (hc_alignment != 1 && hc_size % hc_alignment != 0) { - 
map_param_size = max(hc_alignment, hc_size - (hc_size%hc_alignment) + hc_alignment); - } - else { - map_param_size = max(hc_alignment, hc_size); - } - if (next_field_map != NULL) { - next_field_map->hc_offset = this->next_offset(hc_offset, map_param_size, inStruct); - next_field_map->align_map(outer_hc_alignment, outer_dc_alignment, outer_hc_size, outer_dc_size, inStruct); - // Reset parameter size for char padding - if (next_field_map->type == T_CHAR) - map_param_size = 1; - } - else - { - // Moving out of struct - if (inStruct > 0) - inStruct--; - if (type == T_CHAR) - map_param_size = 1; - } - outer_hc_size = max(outer_hc_size, hc_offset+map_param_size); - outer_dc_size = max(outer_dc_size, dc_offset+dc_size); - return; - } +// Align up arguments within each map, return the size of current map parameter +// Input current alignment of the parameter, size of outer struct if it exists +void HCtoDCmap::align_map(unsigned outer_hc_alignment, unsigned outer_dc_alignment, + unsigned& outer_hc_size, unsigned& outer_dc_size, int& inStruct) { + unsigned map_param_size = 0; + if (internal_field_map != NULL) { + hc_size = 0; // Recalculate size to account for internal offsets + inStruct++; + internal_field_map->align_map( + hc_alignment, dc_alignment, hc_size, dc_size, + inStruct); // align internal struct, might alter size of this struct + if (hc_alignment != 1 && hc_size % hc_alignment) + hc_size = max(hc_size, hc_size - (hc_size % hc_alignment) + hc_alignment); + if (dc_alignment != 1 && dc_size % dc_alignment) + dc_size = max(dc_size, dc_size - (dc_size % dc_alignment) + dc_alignment); + } + // Use map_param_size to store current parameter size after adjusting alignment + if (hc_alignment != 1 && hc_size % hc_alignment != 0) { + map_param_size = max(hc_alignment, hc_size - (hc_size % hc_alignment) + hc_alignment); + } else { + map_param_size = max(hc_alignment, hc_size); + } + if (next_field_map != NULL) { + next_field_map->hc_offset = this->next_offset(hc_offset, 
map_param_size, inStruct); + next_field_map->align_map(outer_hc_alignment, outer_dc_alignment, outer_hc_size, outer_dc_size, + inStruct); + // Reset parameter size for char padding + if (next_field_map->type == T_CHAR) map_param_size = 1; + } else { + // Moving out of struct + if (inStruct > 0) inStruct--; + if (type == T_CHAR) map_param_size = 1; + } + outer_hc_size = max(outer_hc_size, hc_offset + map_param_size); + outer_dc_size = max(outer_dc_size, dc_offset + dc_size); + return; +} - // Return current size of map, calculate internal maps and process next args if in struct. - // Alignment: alignment flag for members in case of structs, alignment of scalar otherwise. - int HCtoDCmap::compute_map(const clk_parameter_descriptor_t* desc, unsigned int &outer_hc_alignment, unsigned int &outer_dc_alignment, unsigned int init_offset, int& inStruct, int& index_out) - { - unsigned internal_index; +// Return current size of map, calculate internal maps and process next args if in struct. +// Alignment: alignment flag for members in case of structs, alignment of scalar otherwise. 
+int HCtoDCmap::compute_map(const clk_parameter_descriptor_t* desc, unsigned int& outer_hc_alignment, + unsigned int& outer_dc_alignment, unsigned int init_offset, + int& inStruct, int& index_out) { + unsigned internal_index; + internal_index = index_out; + unsigned int next_offset = init_offset; + unsigned struct_size = 0; + type = desc[internal_index].type; + + if (desc[internal_index].type == T_STRUCT) { + // Moving into struct, go to next index + inStruct++; + hc_offset = init_offset; + if (desc[index_out + 1].type != T_VOID) { + index_out++; + internal_index = index_out; + internal_field_map = new HCtoDCmap(desc, 0, internal_index, init_offset); + hc_size = internal_field_map->compute_map(desc, hc_alignment, dc_alignment, next_offset, + inStruct, index_out); + hc_alignment = + max(hc_alignment, + internal_field_map->hc_alignment); // Adjust alignment to biggest member alignment + struct_size = hc_size; + internal_index = index_out; + outer_hc_alignment = max(outer_hc_alignment, hc_alignment); + if (inStruct > 0) { + if (desc[index_out + 1].type != T_VOID) { + // Still inside struct and not done + index_out++; + internal_index = index_out; + next_field_map = new HCtoDCmap(desc, 0, internal_index, next_offset); + struct_size = hc_size; + struct_size += next_field_map->compute_map(desc, outer_hc_alignment, outer_dc_alignment, + next_offset, inStruct, index_out); + next_offset = max(next_field_map->hc_offset + next_field_map->hc_size, + next_field_map->hc_offset + hc_alignment); + // running count of strucdc_size = hc_size + size of next member + return struct_size; + } else { + // Moving out of struct, go to next index + index_out++; + internal_index = index_out; + inStruct--; + return hc_size; // return last struct member size + } + } + } + } else if (desc[internal_index].type == T_PAD) { + // Struct has padding + hc_offset = init_offset; + if (desc[index_out + 1].type != T_VOID) { + index_out++; + internal_index = index_out; + internal_field_map = new 
HCtoDCmap(desc, 0, internal_index, init_offset); + hc_size = internal_field_map->compute_map(desc, hc_alignment, dc_alignment, next_offset, + inStruct, index_out); + // Adjust alignment to biggest member alignment + hc_alignment = 1; + dc_alignment = 1; + unsigned pad_size = hc_size; + internal_index = index_out; + if (desc[index_out + 1].type != T_VOID) { + // Still inside padding and not done + index_out++; internal_index = index_out; - unsigned int next_offset = init_offset; - unsigned struct_size = 0; - type = desc[internal_index].type; - - if (desc[internal_index].type == T_STRUCT) { - //Moving into struct, go to next index - inStruct++; - hc_offset = init_offset; - if (desc[index_out+1].type != T_VOID) { - index_out++; - internal_index = index_out; - internal_field_map = new HCtoDCmap(desc, 0, internal_index, init_offset); - hc_size = internal_field_map->compute_map(desc, hc_alignment, dc_alignment, next_offset, inStruct, index_out); - hc_alignment = max(hc_alignment, internal_field_map->hc_alignment); // Adjust alignment to biggest member alignment - struct_size = hc_size; - internal_index = index_out; - outer_hc_alignment = max(outer_hc_alignment, hc_alignment); - if (inStruct > 0) { - if (desc[index_out+1].type != T_VOID) { - //Still inside struct and not done - index_out++; - internal_index = index_out; - next_field_map = new HCtoDCmap(desc, 0, internal_index, next_offset); - struct_size = hc_size; - struct_size += next_field_map->compute_map(desc, outer_hc_alignment, outer_dc_alignment, next_offset, inStruct, index_out); - next_offset = max(next_field_map->hc_offset+next_field_map->hc_size, next_field_map->hc_offset+hc_alignment); - // running count of strucdc_size = hc_size + size of next member - return struct_size; - } - else { - //Moving out of struct, go to next index - index_out++; - internal_index = index_out; - inStruct--; - return hc_size; //return last struct member size - } - } - } - } - else if (desc[internal_index].type == T_PAD) { - 
//Struct has padding - hc_offset = init_offset; - if (desc[index_out+1].type != T_VOID) { - index_out++; - internal_index = index_out; - internal_field_map = new HCtoDCmap(desc, 0, internal_index, init_offset); - hc_size = internal_field_map->compute_map(desc, hc_alignment, dc_alignment, next_offset, inStruct, index_out); - // Adjust alignment to biggest member alignment - hc_alignment = 1; - dc_alignment = 1; - unsigned pad_size = hc_size; - internal_index = index_out; - if (desc[index_out+1].type != T_VOID) { - //Still inside padding and not done - index_out++; - internal_index = index_out; - next_field_map = new HCtoDCmap(desc, 0, internal_index, next_offset); - pad_size = hc_size; - pad_size += next_field_map->compute_map(desc, outer_hc_alignment, outer_dc_alignment, next_offset, inStruct, index_out); - next_offset = max(next_field_map->hc_offset+next_field_map->hc_size, next_field_map->hc_offset+hc_alignment); - // running count of padding dc_size = hc_size + size of next member - return pad_size; - } - else { - //Moving out of struct, go to next index - index_out++; - internal_index = index_out; - return hc_size; //return last padding member size - } - } - } - else { - //Scalar parameter - hc_offset = init_offset; - hc_size = getHostScalarParamSize(desc[internal_index].type); - dc_size = hc_size; - hc_alignment = getScalarAlignment(desc[internal_index].type, true); - dc_alignment = getScalarAlignment(desc[internal_index].type, false); - outer_hc_alignment = max(outer_hc_alignment, hc_alignment); //Adjust alignment of upper level struct if necessary, upper level alignment = max alignment of members - outer_dc_alignment = max(outer_dc_alignment, dc_alignment); //Adjust alignment of upper level struct if necessary, upper level alignment = max alignment of members - if (inStruct > 0) { - if (desc[index_out+1].type != T_VOID) { - //Still inside struct and not done - index_out++; - next_field_map = new HCtoDCmap(desc, outer_hc_alignment, internal_index, 
next_offset); - struct_size = hc_size; - struct_size += next_field_map->compute_map(desc, outer_hc_alignment, outer_dc_alignment, next_offset, inStruct, index_out); - next_offset = hc_offset+hc_alignment; - outer_hc_alignment = max(outer_hc_alignment, next_field_map->hc_alignment); - outer_dc_alignment = max(outer_dc_alignment, next_field_map->dc_alignment); - // running count of strucdc_size = hc_size + size of next member - return struct_size; - } - else { - //Moving out of struct, go to next index - index_out++; - inStruct--; - return hc_size; //return last struct member size - } - } - } - return hc_size; + next_field_map = new HCtoDCmap(desc, 0, internal_index, next_offset); + pad_size = hc_size; + pad_size += next_field_map->compute_map(desc, outer_hc_alignment, outer_dc_alignment, + next_offset, inStruct, index_out); + next_offset = max(next_field_map->hc_offset + next_field_map->hc_size, + next_field_map->hc_offset + hc_alignment); + // running count of padding dc_size = hc_size + size of next member + return pad_size; + } else { + // Moving out of struct, go to next index + index_out++; + internal_index = index_out; + return hc_size; // return last padding member size + } } - - // Adjust offset for source and target, return next source offset - unsigned HCtoDCmap::next_offset(unsigned current_offset, unsigned &map_param_size, int& inStruct_flag) - { - unsigned next_offset = current_offset; - if (next_field_map == NULL) { - assert(0 && "invalid next struct field map"); - return next_offset; - } - else { - // Ignore alignment when a char occurs to account for padding - if (type == T_PAD) { - next_field_map->dc_offset = dc_offset + dc_size; - next_offset = current_offset + hc_size; - } - else { - if ((dc_offset + dc_size) % next_field_map->dc_alignment != 0) { - this->next_field_map->dc_offset = dc_offset + dc_size - (dc_size % next_field_map->dc_alignment) + next_field_map->dc_alignment; - } - else { - this->next_field_map->dc_offset = dc_offset + 
max(dc_size, next_field_map->dc_alignment); - } - if ((hc_offset + hc_size) % next_field_map->hc_alignment != 0) { - next_offset = hc_offset + hc_size - (hc_size % next_field_map->hc_alignment) + next_field_map->hc_alignment; - } - else { - next_offset = hc_offset + max(next_field_map->hc_alignment, map_param_size); - } - } - return next_offset; - } + } else { + // Scalar parameter + hc_offset = init_offset; + hc_size = getHostScalarParamSize(desc[internal_index].type); + dc_size = hc_size; + hc_alignment = getScalarAlignment(desc[internal_index].type, true); + dc_alignment = getScalarAlignment(desc[internal_index].type, false); + outer_hc_alignment = max(outer_hc_alignment, hc_alignment); // Adjust alignment of upper level + // struct if necessary, upper level + // alignment = max alignment of + // members + outer_dc_alignment = max(outer_dc_alignment, dc_alignment); // Adjust alignment of upper level + // struct if necessary, upper level + // alignment = max alignment of + // members + if (inStruct > 0) { + if (desc[index_out + 1].type != T_VOID) { + // Still inside struct and not done + index_out++; + next_field_map = new HCtoDCmap(desc, outer_hc_alignment, internal_index, next_offset); + struct_size = hc_size; + struct_size += next_field_map->compute_map(desc, outer_hc_alignment, outer_dc_alignment, + next_offset, inStruct, index_out); + next_offset = hc_offset + hc_alignment; + outer_hc_alignment = max(outer_hc_alignment, next_field_map->hc_alignment); + outer_dc_alignment = max(outer_dc_alignment, next_field_map->dc_alignment); + // running count of strucdc_size = hc_size + size of next member + return struct_size; + } else { + // Moving out of struct, go to next index + index_out++; + inStruct--; + return hc_size; // return last struct member size + } } + } + return hc_size; +} - // Copy memory according to mapping - unsigned int HCtoDCmap::copy_params(void *dst, const void *src, unsigned int arg_offset, int& error_code, int &inStruct) const - { - unsigned 
int padding = 0; - // Pad offset to be aligned by 8 if parameter is double, not as struct field - if ((arg_offset) % 8 != 0 && (type == T_DOUBLE) && inStruct == 0) - padding = hc_alignment-((arg_offset+dc_offset)%hc_alignment); - #if defined(_WIN32) - // In windows, double is aligned by 8, add padding to struct if it contains double - if ((arg_offset+dc_offset) % 8 != 0 && hc_alignment == 8) - padding = hc_alignment-((arg_offset+dc_offset)%hc_alignment); - #endif - ::memcpy(reinterpret_cast(reinterpret_cast(dst)+padding), src, hc_size); - #if defined(_WIN32) - if (internal_field_map != NULL) { - inStruct++; - void *internal_dst = reinterpret_cast(reinterpret_cast(dst)+padding); - internal_field_map->copy_params(internal_dst, src, arg_offset+padding, error_code, inStruct); - inStruct--; - } - if (next_field_map != NULL) { - void *next_dst = reinterpret_cast(reinterpret_cast(dst)+next_field_map->dc_offset); // Next field starts with padding - const void *next_src = reinterpret_cast(reinterpret_cast(src)+next_field_map->hc_offset); - next_field_map->copy_params(next_dst, next_src, arg_offset+next_field_map->dc_offset, error_code, inStruct); - } - #else - if (internal_field_map != NULL) { - inStruct++; - internal_field_map->copy_params(dst, src, arg_offset, error_code, inStruct); - inStruct--; - } - if (next_field_map != NULL) { - void *next_dst = reinterpret_cast(reinterpret_cast(dst)+next_field_map->dc_offset); - const void *next_src = reinterpret_cast(reinterpret_cast(src)+next_field_map->hc_offset); - next_field_map->copy_params(next_dst, next_src, arg_offset, error_code, inStruct); - } - #endif - return padding; +// Adjust offset for source and target, return next source offset +unsigned HCtoDCmap::next_offset(unsigned current_offset, unsigned& map_param_size, + int& inStruct_flag) { + unsigned next_offset = current_offset; + if (next_field_map == NULL) { + assert(0 && "invalid next struct field map"); + return next_offset; + } else { + // Ignore alignment when a 
char occurs to account for padding + if (type == T_PAD) { + next_field_map->dc_offset = dc_offset + dc_size; + next_offset = current_offset + hc_size; + } else { + if ((dc_offset + dc_size) % next_field_map->dc_alignment != 0) { + this->next_field_map->dc_offset = dc_offset + dc_size - + (dc_size % next_field_map->dc_alignment) + next_field_map->dc_alignment; + } else { + this->next_field_map->dc_offset = dc_offset + max(dc_size, next_field_map->dc_alignment); + } + if ((hc_offset + hc_size) % next_field_map->hc_alignment != 0) { + next_offset = hc_offset + hc_size - (hc_size % next_field_map->hc_alignment) + + next_field_map->hc_alignment; + } else { + next_offset = hc_offset + max(next_field_map->hc_alignment, map_param_size); + } } + return next_offset; + } +} -} //namespace cpu \ No newline at end of file +// Copy memory according to mapping +unsigned int HCtoDCmap::copy_params(void* dst, const void* src, unsigned int arg_offset, + int& error_code, int& inStruct) const { + unsigned int padding = 0; + // Pad offset to be aligned by 8 if parameter is double, not as struct field + if ((arg_offset) % 8 != 0 && (type == T_DOUBLE) && inStruct == 0) + padding = hc_alignment - ((arg_offset + dc_offset) % hc_alignment); +#if defined(_WIN32) + // In windows, double is aligned by 8, add padding to struct if it contains double + if ((arg_offset + dc_offset) % 8 != 0 && hc_alignment == 8) + padding = hc_alignment - ((arg_offset + dc_offset) % hc_alignment); +#endif + ::memcpy(reinterpret_cast(reinterpret_cast(dst) + padding), src, hc_size); +#if defined(_WIN32) + if (internal_field_map != NULL) { + inStruct++; + void* internal_dst = reinterpret_cast(reinterpret_cast(dst) + padding); + internal_field_map->copy_params(internal_dst, src, arg_offset + padding, error_code, inStruct); + inStruct--; + } + if (next_field_map != NULL) { + void* next_dst = + reinterpret_cast(reinterpret_cast(dst) + + next_field_map->dc_offset); // Next field starts with padding + const void* next_src 
= reinterpret_cast( + reinterpret_cast(src) + next_field_map->hc_offset); + next_field_map->copy_params(next_dst, next_src, arg_offset + next_field_map->dc_offset, + error_code, inStruct); + } +#else + if (internal_field_map != NULL) { + inStruct++; + internal_field_map->copy_params(dst, src, arg_offset, error_code, inStruct); + inStruct--; + } + if (next_field_map != NULL) { + void* next_dst = + reinterpret_cast(reinterpret_cast(dst) + next_field_map->dc_offset); + const void* next_src = reinterpret_cast( + reinterpret_cast(src) + next_field_map->hc_offset); + next_field_map->copy_params(next_dst, next_src, arg_offset, error_code, inStruct); + } +#endif + return padding; +} + +} // namespace cpu \ No newline at end of file diff --git a/rocclr/runtime/device/cpu/cpumapping.hpp b/rocclr/runtime/device/cpu/cpumapping.hpp index 0ea5d2f350..00531ef8ea 100644 --- a/rocclr/runtime/device/cpu/cpumapping.hpp +++ b/rocclr/runtime/device/cpu/cpumapping.hpp @@ -9,36 +9,37 @@ namespace cpu { -class HCtoDCmap -{ +class HCtoDCmap { + public: + unsigned int hc_offset, hc_size; // Offset and size of this parameter in host compiler + unsigned int dc_offset, dc_size; // Offset and size of this parameter in device compiler + unsigned int hc_alignment; // Alignment of parameter in host compiler + unsigned int dc_alignment; // Alignment of parameter in device compiler + clk_value_type_t type; // Type of parameter + HCtoDCmap* + internal_field_map; // Pointer to internal mapping when current parameter is of type T_STRUCT + HCtoDCmap* + next_field_map; // Pointer to next struct field when current parameter is a struct member -public: - unsigned int hc_offset, hc_size; // Offset and size of this parameter in host compiler - unsigned int dc_offset, dc_size; // Offset and size of this parameter in device compiler - unsigned int hc_alignment; // Alignment of parameter in host compiler - unsigned int dc_alignment; // Alignment of parameter in device compiler - clk_value_type_t type; // Type 
of parameter - HCtoDCmap *internal_field_map; // Pointer to internal mapping when current parameter is of type T_STRUCT - HCtoDCmap *next_field_map; // Pointer to next struct field when current parameter is a struct member + HCtoDCmap(const clk_parameter_descriptor_t*, unsigned int, unsigned int, unsigned int); + virtual ~HCtoDCmap(); + int compute_map(const clk_parameter_descriptor_t*, unsigned int&, unsigned int&, unsigned int, + int&, int&); + unsigned next_offset(unsigned, unsigned&, int&); + size_t getHostScalarParamSize(const clk_value_type_t) const; + size_t getScalarAlignment(const clk_value_type_t, bool) const; + void align_map(unsigned, unsigned, unsigned&, unsigned&, int&); + unsigned int copy_params(void*, const void*, unsigned int, int&, int&) const; - HCtoDCmap(const clk_parameter_descriptor_t*, unsigned int, unsigned int, unsigned int); - virtual ~HCtoDCmap(); - int compute_map(const clk_parameter_descriptor_t*, unsigned int &, unsigned int &, unsigned int, int&, int&); - unsigned next_offset(unsigned, unsigned &, int &); - size_t getHostScalarParamSize(const clk_value_type_t) const; - size_t getScalarAlignment(const clk_value_type_t, bool) const; - void align_map(unsigned, unsigned, unsigned&, unsigned&, int&); - unsigned int copy_params(void *, const void *, unsigned int, int&, int&) const; - -private: + private: }; -} // namespace cpu +} // namespace cpu -#endif // CPUMAPPING_HPP_ -// Mapping rule -// Long types are treated with 8 byte alignment in runtime when passed in as arguments -// but they are treated with 4 byte alignment in compiler -// Double members have 8 byte alignment when passed as scalar argument -// but have 4 byte alignment as a field inside a struct \ No newline at end of file +#endif // CPUMAPPING_HPP_ + // Mapping rule + // Long types are treated with 8 byte alignment in runtime when passed in as arguments + // but they are treated with 4 byte alignment in compiler + // Double members have 8 byte alignment when passed as 
scalar argument + // but have 4 byte alignment as a field inside a struct \ No newline at end of file diff --git a/rocclr/runtime/device/cpu/cpuprogram.cpp b/rocclr/runtime/device/cpu/cpuprogram.cpp index 7f56866dac..179c483083 100644 --- a/rocclr/runtime/device/cpu/cpuprogram.cpp +++ b/rocclr/runtime/device/cpu/cpuprogram.cpp @@ -18,7 +18,7 @@ #include #if defined(_WIN32) -# include +#include #endif // amdrt.o @@ -26,609 +26,582 @@ #include "amdrt.inc" #endif -//CLC_IN_PROCESS_CHANGE +// CLC_IN_PROCESS_CHANGE extern int openclFrontEnd(const char* cmdline, std::string*, std::string* typInfo = NULL); namespace cpu { -static inline bool -isScalar(clk_value_type_t type) -{ - switch (type) { - case T_CHAR: case T_SHORT: case T_INT: - case T_LONG: case T_FLOAT: case T_DOUBLE: - case T_POINTER: - return true; - default: - return false; - } -} - - -static cl_kernel_arg_address_qualifier -getParamAddressQualifier(const clk_parameter_descriptor_t* desc) -{ - switch (desc->space) { - case A_LOCAL: - return CL_KERNEL_ARG_ADDRESS_LOCAL; - break; - case A_CONSTANT: - return CL_KERNEL_ARG_ADDRESS_CONSTANT; - break; - case A_GLOBAL: - return CL_KERNEL_ARG_ADDRESS_GLOBAL; - break; - default: - return CL_KERNEL_ARG_ADDRESS_PRIVATE; - break; - } -} - -static cl_kernel_arg_type_qualifier -getParamTypeQualifier(const clk_parameter_descriptor_t* desc) -{ - cl_kernel_arg_type_qualifier typeQualifier = CL_KERNEL_ARG_TYPE_NONE; - - if (desc->space == A_CONSTANT) { - typeQualifier |= CL_KERNEL_ARG_TYPE_CONST; - } - - if ((desc->qualifier & Q_CONST) != 0) { - typeQualifier |= CL_KERNEL_ARG_TYPE_CONST; - } - if ((desc->qualifier & Q_RESTRICT) != 0) { - typeQualifier |= CL_KERNEL_ARG_TYPE_RESTRICT; - } - if ((desc->qualifier & Q_VOLATILE) != 0) { - typeQualifier |= CL_KERNEL_ARG_TYPE_VOLATILE; - } - - if ((desc->qualifier & Q_PIPE) != 0) { - typeQualifier = CL_KERNEL_ARG_TYPE_PIPE; - } - - return typeQualifier; -} - -static cl_kernel_arg_access_qualifier -getParamAccessQualifier(const 
clk_parameter_descriptor_t* desc) -{ - uint access = desc->qualifier & (Q_READ | Q_WRITE); - switch (access) { - case Q_READ: - return CL_KERNEL_ARG_ACCESS_READ_ONLY; - break; - case Q_WRITE: - return CL_KERNEL_ARG_ACCESS_WRITE_ONLY; - break; - case (Q_READ | Q_WRITE): - return CL_KERNEL_ARG_ACCESS_READ_WRITE; - break; - default: - return CL_KERNEL_ARG_ACCESS_NONE; - break; - } -} - -static size_t -getScalarParamSize(bool cpuLayer, const clk_value_type_t type, - cl_kernel_arg_address_qualifier qualifier) -{ - size_t size = 0; - - if (qualifier == CL_KERNEL_ARG_ADDRESS_LOCAL) { - return cpuLayer ? sizeof(void*) : 0; - } - - switch (type) { +static inline bool isScalar(clk_value_type_t type) { + switch (type) { case T_CHAR: - size = 1; - break; - case T_SHORT: case T_CHAR2: - size = 2; - break; - case T_FLOAT: case T_INT: case T_CHAR4: - case T_SHORT2: case T_CHAR3: - size = 4; - break; - case T_SAMPLER: - size = cpuLayer ? sizeof(uint32_t) : sizeof(cl_sampler); - break; - case T_LONG: case T_DOUBLE: case T_CHAR8: - case T_SHORT4: case T_INT2: case T_FLOAT2: - case T_SHORT3: - size = 8; - break; - case T_INT3: case T_FLOAT3: - case T_CHAR16: case T_SHORT8: case T_INT4: - case T_FLOAT4: case T_LONG2: case T_DOUBLE2: - size = 16; - break; - case T_LONG3: case T_DOUBLE3: - case T_SHORT16: case T_INT8: case T_FLOAT8: - case T_LONG4: case T_DOUBLE4: - size = 32; - break; - case T_INT16: case T_FLOAT16: case T_LONG8: - case T_DOUBLE8: - size = 64; - break; - case T_LONG16: case T_DOUBLE16: - size = 128; - break; - case T_POINTER: case T_VOID: - size = sizeof(void*); - break; + case T_SHORT: + case T_INT: + case T_LONG: + case T_FLOAT: + case T_DOUBLE: + case T_POINTER: + return true; default: - ShouldNotReachHere(); - break; - } - return size; + return false; + } } -static size_t -getParamSizeImpl(bool cpuLayer, const clk_parameter_descriptor_t* desc, - unsigned index, cl_kernel_arg_address_qualifier qualifier, - size_t* alignment, unsigned* index_out) -{ - size_t size = 
0; - if(desc[index].type == T_STRUCT || desc[index].type == T_PAD) { - size_t maxAlignment = 0; - size_t structSize = 0; - size_t structAlignment = 0; - index++; - while(desc[index].type != T_VOID) { - size_t elementAlignment = 0; - size_t elementSize = - getParamSizeImpl(cpuLayer, desc, index, qualifier, - &elementAlignment, index_out); - #if defined(_WIN32) - maxAlignment = std::max(maxAlignment, elementAlignment); - #else - // In Linux, the alignment of long field is 4 for GCC, - // but it is 8 on LLVM side - if (desc[index].type == T_LONG) - structAlignment = cpuLayer? LP64_SWITCH(4, 8) : 8; - else - structAlignment = std::max(maxAlignment, elementAlignment); - maxAlignment = std::max(maxAlignment, structAlignment); - #endif - index = *index_out; - structSize = - amd::alignUp(structSize, - std::min(elementAlignment, size_t(16))) + - elementSize; - } - *index_out = index + 1; - *alignment = maxAlignment; - size = amd::alignUp(structSize, std::min(maxAlignment, size_t(16))); + +static cl_kernel_arg_address_qualifier getParamAddressQualifier( + const clk_parameter_descriptor_t* desc) { + switch (desc->space) { + case A_LOCAL: + return CL_KERNEL_ARG_ADDRESS_LOCAL; + break; + case A_CONSTANT: + return CL_KERNEL_ARG_ADDRESS_CONSTANT; + break; + case A_GLOBAL: + return CL_KERNEL_ARG_ADDRESS_GLOBAL; + break; + default: + return CL_KERNEL_ARG_ADDRESS_PRIVATE; + break; + } +} + +static cl_kernel_arg_type_qualifier getParamTypeQualifier(const clk_parameter_descriptor_t* desc) { + cl_kernel_arg_type_qualifier typeQualifier = CL_KERNEL_ARG_TYPE_NONE; + + if (desc->space == A_CONSTANT) { + typeQualifier |= CL_KERNEL_ARG_TYPE_CONST; + } + + if ((desc->qualifier & Q_CONST) != 0) { + typeQualifier |= CL_KERNEL_ARG_TYPE_CONST; + } + if ((desc->qualifier & Q_RESTRICT) != 0) { + typeQualifier |= CL_KERNEL_ARG_TYPE_RESTRICT; + } + if ((desc->qualifier & Q_VOLATILE) != 0) { + typeQualifier |= CL_KERNEL_ARG_TYPE_VOLATILE; + } + + if ((desc->qualifier & Q_PIPE) != 0) { + typeQualifier 
= CL_KERNEL_ARG_TYPE_PIPE; + } + + return typeQualifier; +} + +static cl_kernel_arg_access_qualifier getParamAccessQualifier( + const clk_parameter_descriptor_t* desc) { + uint access = desc->qualifier & (Q_READ | Q_WRITE); + switch (access) { + case Q_READ: + return CL_KERNEL_ARG_ACCESS_READ_ONLY; + break; + case Q_WRITE: + return CL_KERNEL_ARG_ACCESS_WRITE_ONLY; + break; + case (Q_READ | Q_WRITE): + return CL_KERNEL_ARG_ACCESS_READ_WRITE; + break; + default: + return CL_KERNEL_ARG_ACCESS_NONE; + break; + } +} + +static size_t getScalarParamSize(bool cpuLayer, const clk_value_type_t type, + cl_kernel_arg_address_qualifier qualifier) { + size_t size = 0; + + if (qualifier == CL_KERNEL_ARG_ADDRESS_LOCAL) { + return cpuLayer ? sizeof(void*) : 0; + } + + switch (type) { + case T_CHAR: + size = 1; + break; + case T_SHORT: + case T_CHAR2: + size = 2; + break; + case T_FLOAT: + case T_INT: + case T_CHAR4: + case T_SHORT2: + case T_CHAR3: + size = 4; + break; + case T_SAMPLER: + size = cpuLayer ? 
sizeof(uint32_t) : sizeof(cl_sampler); + break; + case T_LONG: + case T_DOUBLE: + case T_CHAR8: + case T_SHORT4: + case T_INT2: + case T_FLOAT2: + case T_SHORT3: + size = 8; + break; + case T_INT3: + case T_FLOAT3: + case T_CHAR16: + case T_SHORT8: + case T_INT4: + case T_FLOAT4: + case T_LONG2: + case T_DOUBLE2: + size = 16; + break; + case T_LONG3: + case T_DOUBLE3: + case T_SHORT16: + case T_INT8: + case T_FLOAT8: + case T_LONG4: + case T_DOUBLE4: + size = 32; + break; + case T_INT16: + case T_FLOAT16: + case T_LONG8: + case T_DOUBLE8: + size = 64; + break; + case T_LONG16: + case T_DOUBLE16: + size = 128; + break; + case T_POINTER: + case T_VOID: + size = sizeof(void*); + break; + default: + ShouldNotReachHere(); + break; + } + return size; +} + +static size_t getParamSizeImpl(bool cpuLayer, const clk_parameter_descriptor_t* desc, + unsigned index, cl_kernel_arg_address_qualifier qualifier, + size_t* alignment, unsigned* index_out) { + size_t size = 0; + if (desc[index].type == T_STRUCT || desc[index].type == T_PAD) { + size_t maxAlignment = 0; + size_t structSize = 0; + size_t structAlignment = 0; + index++; + while (desc[index].type != T_VOID) { + size_t elementAlignment = 0; + size_t elementSize = + getParamSizeImpl(cpuLayer, desc, index, qualifier, &elementAlignment, index_out); +#if defined(_WIN32) + maxAlignment = std::max(maxAlignment, elementAlignment); +#else + // In Linux, the alignment of long field is 4 for GCC, + // but it is 8 on LLVM side + if (desc[index].type == T_LONG) + structAlignment = cpuLayer ? 
LP64_SWITCH(4, 8) : 8; + else + structAlignment = std::max(maxAlignment, elementAlignment); + maxAlignment = std::max(maxAlignment, structAlignment); +#endif + index = *index_out; + structSize = amd::alignUp(structSize, std::min(elementAlignment, size_t(16))) + elementSize; + } + *index_out = index + 1; + *alignment = maxAlignment; + size = amd::alignUp(structSize, std::min(maxAlignment, size_t(16))); + } else { + size = getScalarParamSize(cpuLayer, desc[index].type, qualifier); + if (desc[index].type == T_DOUBLE) { +#if defined(_WIN32) + *alignment = 8; +#else + *alignment = LP64_SWITCH(4, 8); +#endif + } else if (desc[index].type == T_LONG) { + *alignment = 8; } else { - size = getScalarParamSize(cpuLayer, desc[index].type, qualifier); - if (desc[index].type == T_DOUBLE) { - #if defined(_WIN32) - *alignment = 8; - #else - *alignment = LP64_SWITCH(4, 8); - #endif - } else if (desc[index].type == T_LONG) { - *alignment = 8; - } else { - *alignment = size; - } - *index_out = index + 1; + *alignment = size; } - return size; + *index_out = index + 1; + } + return size; } -size_t -getParamSize(bool cpuLayer, const clk_parameter_descriptor_t* desc, - cl_kernel_arg_address_qualifier qualifier, - size_t* alignment) -{ - unsigned index_out = 0; - return getParamSizeImpl(cpuLayer, desc, 0, qualifier, alignment, - &index_out); +size_t getParamSize(bool cpuLayer, const clk_parameter_descriptor_t* desc, + cl_kernel_arg_address_qualifier qualifier, size_t* alignment) { + unsigned index_out = 0; + return getParamSizeImpl(cpuLayer, desc, 0, qualifier, alignment, &index_out); } -static unsigned -getNumTypeDescs(const clk_parameter_descriptor_t* desc) -{ +static unsigned getNumTypeDescs(const clk_parameter_descriptor_t* desc) { int numStruct = 0; unsigned i; - for(i = 0; desc[i].type != T_VOID || numStruct > 0; ++i) { - if (desc[i].type == T_STRUCT || desc[i].type == T_PAD) - numStruct++; - if (desc[i].type == T_VOID) - numStruct--; - } - return i + 1; + for (i = 0; desc[i].type != 
T_VOID || numStruct > 0; ++i) { + if (desc[i].type == T_STRUCT || desc[i].type == T_PAD) numStruct++; + if (desc[i].type == T_VOID) numStruct--; + } + return i + 1; } -static clk_value_type_t -getFirstScalarType(const clk_parameter_descriptor_t* desc) -{ +static clk_value_type_t getFirstScalarType(const clk_parameter_descriptor_t* desc) { int i = 0; - while(desc[i].type == T_STRUCT) - i++; + while (desc[i].type == T_STRUCT) i++; return desc[i].type; } -static const clk_value_type_t -getParamType(const clk_parameter_descriptor_t* desc, - const clk_parameter_descriptor_t** desc_out, - const char** type_name) -{ - unsigned numDescs = getNumTypeDescs(desc); - *desc_out = desc + numDescs; - *type_name = desc[numDescs-1].name; - // Use old behaviour and return first scalar type in case of a struct. - return getFirstScalarType(desc); - +static const clk_value_type_t getParamType(const clk_parameter_descriptor_t* desc, + const clk_parameter_descriptor_t** desc_out, + const char** type_name) { + unsigned numDescs = getNumTypeDescs(desc); + *desc_out = desc + numDescs; + *type_name = desc[numDescs - 1].name; + // Use old behaviour and return first scalar type in case of a struct. 
+ return getFirstScalarType(desc); } -static amd::KernelParameterDescriptor -getParam(bool cpuLayer, const clk_parameter_descriptor_t* desc, - size_t offset_in, const clk_parameter_descriptor_t ** desc_out) -{ - size_t alignment; +static amd::KernelParameterDescriptor getParam(bool cpuLayer, + const clk_parameter_descriptor_t* desc, + size_t offset_in, + const clk_parameter_descriptor_t** desc_out) { + size_t alignment; - amd::KernelParameterDescriptor param; - param.name_ = desc->name; - param.type_ = getParamType(desc, desc_out, &(param.typeName_)); - param.addressQualifier_ = getParamAddressQualifier(desc); - param.typeQualifier_ = getParamTypeQualifier(desc); - param.accessQualifier_ = getParamAccessQualifier(desc); - param.size_ = getParamSize(cpuLayer, desc, param.addressQualifier_, - &alignment); - if(param.size_ == 0) { - param.offset_ = amd::alignUp(offset_in, - std::min(sizeof(cl_mem), size_t(16))); - } else { - param.offset_ = amd::alignUp(offset_in, - std::min(alignment, size_t(16))); - } - return param; + amd::KernelParameterDescriptor param; + param.name_ = desc->name; + param.type_ = getParamType(desc, desc_out, &(param.typeName_)); + param.addressQualifier_ = getParamAddressQualifier(desc); + param.typeQualifier_ = getParamTypeQualifier(desc); + param.accessQualifier_ = getParamAccessQualifier(desc); + param.size_ = getParamSize(cpuLayer, desc, param.addressQualifier_, &alignment); + if (param.size_ == 0) { + param.offset_ = amd::alignUp(offset_in, std::min(sizeof(cl_mem), size_t(16))); + } else { + param.offset_ = amd::alignUp(offset_in, std::min(alignment, size_t(16))); + } + return param; } -static bool -setKernelInfoCallback(std::string symbol, const void* value, void* data) -{ - cpu::Program* program = reinterpret_cast(data); - device::Program::kernels_t& kernels = program->kernels(); - const char __OpenCL_[] = "__OpenCL_"; - const char _kernel[] = "_stub"; - const char _data[] = "_metadata"; - const char _nature[] = "_nature"; - - const size_t 
offset = sizeof(__OpenCL_) - 1; - if (symbol.compare(0, offset, __OpenCL_) != 0) { - return false; - } - - size_t suffixPos = symbol.rfind('_'); - if (suffixPos == std::string::npos) { - return false; - } - - std::string name = symbol.substr(offset, suffixPos - offset); - cpu::Kernel* kernel = reinterpret_cast(kernels[name]); - if (NULL == kernel) { - kernel = new Kernel(name); - kernels[name] = kernel; - } - - if (symbol.compare(suffixPos, sizeof(_kernel) - 1, _kernel) == 0) { - kernel->setEntryPoint(value); - return true; - } - else if (symbol.compare(suffixPos, sizeof(_data) - 1, _data) == 0) { - device::Kernel::parameters_t params; - - size_t* recordPtr = (size_t*) value; - size_t* recordEnd = recordPtr + (*recordPtr)/sizeof(size_t); - ++recordPtr; // skip struct_length - - kernel->setLocalMemSize(*recordPtr++); - kernel->setPreferredSizeMultiple(1); - - kernel->setUniformWorkGroupSize(program->getCompilerOptions() - ->oVariables->UniformWorkGroupSize); - - kernel->setReqdWorkGroupSize(recordPtr[0], recordPtr[1], recordPtr[2]); - recordPtr += 3; - - kernel->setWorkGroupSizeHint(recordPtr[0], recordPtr[1], recordPtr[2]); - recordPtr += 3; - - const clk_parameter_descriptor_t* desc = - reinterpret_cast(recordPtr); - - size_t offset = 0; - while (desc->type != T_VOID) { - const clk_parameter_descriptor_t* next_desc = NULL; - amd::KernelParameterDescriptor param = getParam(false, desc, offset, - &next_desc); - - size_t cpuSize, cpuAlignment; - cpuSize = - getParamSize(true, desc, param.addressQualifier_, &cpuAlignment); - kernel->addArg(cpuSize, cpuAlignment); - - //Init for HCtoDCmap - unsigned int init_offset = 0; - unsigned int align = 0; - int inStruct = 0; - int end_index = 0; - HCtoDCmap *map_p = new HCtoDCmap(desc, align, 0, init_offset); - map_p->dc_size = map_p->compute_map(desc, map_p->hc_alignment, map_p->dc_alignment, init_offset, inStruct, end_index); - map_p->align_map(map_p->hc_alignment, map_p->dc_alignment, map_p->hc_size, map_p->dc_size, 
inStruct); - if (CPU_USE_ALIGNMENT_MAP == 0) { - kernel->addHCtoDCmap(map_p); - if (map_p->internal_field_map != NULL) { - kernel->addInternalMap(map_p->internal_field_map); - } - } - else { - delete(map_p); - } - //End of HCtoDCmap - - desc = next_desc; - params.push_back(param); - size_t size = param.size_ == 0 ? sizeof(cl_mem) : param.size_; -#if defined(USE_NATIVE_ABI) - size = amd::alignUp(size, sizeof(size_t)); -#endif // USE_NATIVE_ABI - offset = param.offset_ + size; - } - - // retrieve vector type hint metadata - const clk_parameter_descriptor_t* vth_desc = NULL; - getParam(false, desc, offset, &vth_desc); - const size_t* vthPtr = reinterpret_cast(vth_desc); - if (vthPtr < recordEnd && *vthPtr != 0) { - const char* vecTypeHint = reinterpret_cast(*vthPtr); - kernel->setVecTypeHint(vecTypeHint); - } - - if (kernel->createSignature(params)) { - return true; - } - } - else if (symbol.compare(suffixPos, sizeof(_nature) - 1, _nature) == 0) { - uint32_t* recordPtr = (uint32_t*) value; - kernel->nature_ = (uint)recordPtr[0]; - kernel->privateSize_ = (uint)recordPtr[1]; - return true; - } +static bool setKernelInfoCallback(std::string symbol, const void* value, void* data) { + cpu::Program* program = reinterpret_cast(data); + device::Program::kernels_t& kernels = program->kernels(); + const char __OpenCL_[] = "__OpenCL_"; + const char _kernel[] = "_stub"; + const char _data[] = "_metadata"; + const char _nature[] = "_nature"; + const size_t offset = sizeof(__OpenCL_) - 1; + if (symbol.compare(0, offset, __OpenCL_) != 0) { return false; + } + + size_t suffixPos = symbol.rfind('_'); + if (suffixPos == std::string::npos) { + return false; + } + + std::string name = symbol.substr(offset, suffixPos - offset); + cpu::Kernel* kernel = reinterpret_cast(kernels[name]); + if (NULL == kernel) { + kernel = new Kernel(name); + kernels[name] = kernel; + } + + if (symbol.compare(suffixPos, sizeof(_kernel) - 1, _kernel) == 0) { + kernel->setEntryPoint(value); + return true; + } 
else if (symbol.compare(suffixPos, sizeof(_data) - 1, _data) == 0) { + device::Kernel::parameters_t params; + + size_t* recordPtr = (size_t*)value; + size_t* recordEnd = recordPtr + (*recordPtr) / sizeof(size_t); + ++recordPtr; // skip struct_length + + kernel->setLocalMemSize(*recordPtr++); + kernel->setPreferredSizeMultiple(1); + + kernel->setUniformWorkGroupSize( + program->getCompilerOptions()->oVariables->UniformWorkGroupSize); + + kernel->setReqdWorkGroupSize(recordPtr[0], recordPtr[1], recordPtr[2]); + recordPtr += 3; + + kernel->setWorkGroupSizeHint(recordPtr[0], recordPtr[1], recordPtr[2]); + recordPtr += 3; + + const clk_parameter_descriptor_t* desc = + reinterpret_cast(recordPtr); + + size_t offset = 0; + while (desc->type != T_VOID) { + const clk_parameter_descriptor_t* next_desc = NULL; + amd::KernelParameterDescriptor param = getParam(false, desc, offset, &next_desc); + + size_t cpuSize, cpuAlignment; + cpuSize = getParamSize(true, desc, param.addressQualifier_, &cpuAlignment); + kernel->addArg(cpuSize, cpuAlignment); + + // Init for HCtoDCmap + unsigned int init_offset = 0; + unsigned int align = 0; + int inStruct = 0; + int end_index = 0; + HCtoDCmap* map_p = new HCtoDCmap(desc, align, 0, init_offset); + map_p->dc_size = map_p->compute_map(desc, map_p->hc_alignment, map_p->dc_alignment, + init_offset, inStruct, end_index); + map_p->align_map(map_p->hc_alignment, map_p->dc_alignment, map_p->hc_size, map_p->dc_size, + inStruct); + if (CPU_USE_ALIGNMENT_MAP == 0) { + kernel->addHCtoDCmap(map_p); + if (map_p->internal_field_map != NULL) { + kernel->addInternalMap(map_p->internal_field_map); + } + } else { + delete (map_p); + } + // End of HCtoDCmap + + desc = next_desc; + params.push_back(param); + size_t size = param.size_ == 0 ? 
sizeof(cl_mem) : param.size_; +#if defined(USE_NATIVE_ABI) + size = amd::alignUp(size, sizeof(size_t)); +#endif // USE_NATIVE_ABI + offset = param.offset_ + size; + } + + // retrieve vector type hint metadata + const clk_parameter_descriptor_t* vth_desc = NULL; + getParam(false, desc, offset, &vth_desc); + const size_t* vthPtr = reinterpret_cast(vth_desc); + if (vthPtr < recordEnd && *vthPtr != 0) { + const char* vecTypeHint = reinterpret_cast(*vthPtr); + kernel->setVecTypeHint(vecTypeHint); + } + + if (kernel->createSignature(params)) { + return true; + } + } else if (symbol.compare(suffixPos, sizeof(_nature) - 1, _nature) == 0) { + uint32_t* recordPtr = (uint32_t*)value; + kernel->nature_ = (uint)recordPtr[0]; + kernel->privateSize_ = (uint)recordPtr[1]; + return true; + } + + return false; } -static bool -setKernelInfoCallbackCStr(const char* symbol, const void* value, void* data) { +static bool setKernelInfoCallbackCStr(const char* symbol, const void* value, void* data) { std::string symbolString(symbol); return setKernelInfoCallback(symbolString, value, data); } -static bool -setSymbolsCallback(std::string symbol, const void* value, void* data) -{ - device::ClBinary* clbinary = (device::ClBinary*) data; - const char __OpenCL_[] = "__OpenCL_"; - const char _stub[] = "_stub"; - const char _kernel[] = "_kernel"; - const char _data[] = "_metadata"; +static bool setSymbolsCallback(std::string symbol, const void* value, void* data) { + device::ClBinary* clbinary = (device::ClBinary*)data; + const char __OpenCL_[] = "__OpenCL_"; + const char _stub[] = "_stub"; + const char _kernel[] = "_kernel"; + const char _data[] = "_metadata"; - const size_t offset = sizeof(__OpenCL_) - 1; - if (symbol.compare(0, offset, __OpenCL_) != 0) { - return false; - } - - size_t suffixPos = symbol.rfind('_'); - if (suffixPos == std::string::npos) { - return false; - } - - if ((symbol.compare(suffixPos, sizeof(_stub) - 1, _stub) == 0) || - (symbol.compare(suffixPos, sizeof(_kernel) - 1, 
_kernel) == 0) || - (symbol.compare(suffixPos, sizeof(_data) - 1, _data) == 0)) { - - return clbinary->elfOut()->addSymbol(amd::OclElf::DLL, - const_cast(symbol.c_str()), - 0, false); - } + const size_t offset = sizeof(__OpenCL_) - 1; + if (symbol.compare(0, offset, __OpenCL_) != 0) { return false; + } + + size_t suffixPos = symbol.rfind('_'); + if (suffixPos == std::string::npos) { + return false; + } + + if ((symbol.compare(suffixPos, sizeof(_stub) - 1, _stub) == 0) || + (symbol.compare(suffixPos, sizeof(_kernel) - 1, _kernel) == 0) || + (symbol.compare(suffixPos, sizeof(_data) - 1, _data) == 0)) { + return clbinary->elfOut()->addSymbol(amd::OclElf::DLL, const_cast(symbol.c_str()), 0, + false); + } + return false; } -static bool -setSymbolsCallbackCStr(const char* symbol, const void* value, void* data) { +static bool setSymbolsCallbackCStr(const char* symbol, const void* value, void* data) { std::string symbolString(symbol); return setSymbolsCallback(symbolString, value, data); } // Some helper functions to simplify testing the disassembler struct DisasData { -public: - DisasData(std::stringstream *stream, - aclJITObjectImage im, aclCompiler* cmpl) - : asmstream(stream), image(im), compiler(cmpl) {}; - std::stringstream *asmstream; + public: + DisasData(std::stringstream* stream, aclJITObjectImage im, aclCompiler* cmpl) + : asmstream(stream), image(im), compiler(cmpl){}; + std::stringstream* asmstream; aclJITObjectImage image; aclCompiler* compiler; }; #if defined(LEGACY_COMPLIB) -static bool -disasSymbolsCallback(std::string symbol, const void* value, void* data) -{ - DisasData* disasData = (DisasData*) data; - std::stringstream &asmstream = *(disasData->asmstream); - aclJITObjectImage image = disasData->image; - aclCompiler* compiler = disasData->compiler; - const char __OpenCL_[] = "__OpenCL_"; - const char _stub[] = "_stub"; - const char _kernel[] = "_kernel"; - const char _data[] = "_metadata"; +static bool disasSymbolsCallback(std::string symbol, const 
void* value, void* data) { + DisasData* disasData = (DisasData*)data; + std::stringstream& asmstream = *(disasData->asmstream); + aclJITObjectImage image = disasData->image; + aclCompiler* compiler = disasData->compiler; + const char __OpenCL_[] = "__OpenCL_"; + const char _stub[] = "_stub"; + const char _kernel[] = "_kernel"; + const char _data[] = "_metadata"; - const size_t offset = sizeof(__OpenCL_) - 1; - if (symbol.compare(0, offset, __OpenCL_) != 0) { - return false; - } - - size_t suffixPos = symbol.rfind('_'); - if (suffixPos == std::string::npos) { - return false; - } - - if ((symbol.compare(suffixPos, sizeof(_stub) - 1, _stub) == 0) || - (symbol.compare(suffixPos, sizeof(_kernel) - 1, _kernel) == 0)) { - acl_error err = ACL_SUCCESS; - char* kernelDisas = - aclJITObjectImageDisassembleKernel(compiler, image, symbol.c_str(), &err); - if (err != ACL_SUCCESS) { - LogWarning("aclJITObjectImageDisassembleKernel failed"); - return false; - } - asmstream << kernelDisas; - free(kernelDisas); - } + const size_t offset = sizeof(__OpenCL_) - 1; + if (symbol.compare(0, offset, __OpenCL_) != 0) { return false; + } + + size_t suffixPos = symbol.rfind('_'); + if (suffixPos == std::string::npos) { + return false; + } + + if ((symbol.compare(suffixPos, sizeof(_stub) - 1, _stub) == 0) || + (symbol.compare(suffixPos, sizeof(_kernel) - 1, _kernel) == 0)) { + acl_error err = ACL_SUCCESS; + char* kernelDisas = aclJITObjectImageDisassembleKernel(compiler, image, symbol.c_str(), &err); + if (err != ACL_SUCCESS) { + LogWarning("aclJITObjectImageDisassembleKernel failed"); + return false; + } + asmstream << kernelDisas; + free(kernelDisas); + } + return false; } -static bool -disasSymbolsCallbackCStr(const char* symbol, const void* value, void* data) { +static bool disasSymbolsCallbackCStr(const char* symbol, const void* value, void* data) { std::string symbolString(symbol); return disasSymbolsCallback(symbolString, value, data); } #endif -bool 
-Program::compileBinaryToISA(amd::option::Options* options) -{ - const bool has_avx = !options->oVariables->DisableAVX - && device().hasAVXInstructions(); - const bool has_fma4 = device().hasFMA4Instructions(); +bool Program::compileBinaryToISA(amd::option::Options* options) { + const bool has_avx = !options->oVariables->DisableAVX && device().hasAVXInstructions(); + const bool has_fma4 = device().hasFMA4Instructions(); #if defined(WITH_ONLINE_COMPILER) - std::string tempName = amd::Os::getTempFileName(); - dllFileName_ = tempName + "dbg" + "." IF(IS_WINDOWS, "dll", "so"); + std::string tempName = amd::Os::getTempFileName(); + dllFileName_ = tempName + "dbg" + "." IF(IS_WINDOWS, "dll", "so"); - acl_error err = ACL_SUCCESS; - aclTargetInfo aclinfo = info(has_avx ? - /*has_fma4 ? "Bulldozer" :*/ - "Corei7_AVX" : - "Athlon64"); + acl_error err = ACL_SUCCESS; + aclTargetInfo aclinfo = info(has_avx ? + /*has_fma4 ? "Bulldozer" :*/ + "Corei7_AVX" + : "Athlon64"); - aclBinaryOptions binOpts = {0}; - binOpts.struct_size = sizeof(binOpts); - binOpts.elfclass = aclinfo.arch_id == aclX64 ? ELFCLASS64 : ELFCLASS32; - binOpts.bitness = ELFDATA2LSB; - binOpts.alloc = &::malloc; - binOpts.dealloc = &::free; + aclBinaryOptions binOpts = {0}; + binOpts.struct_size = sizeof(binOpts); + binOpts.elfclass = aclinfo.arch_id == aclX64 ? 
ELFCLASS64 : ELFCLASS32; + binOpts.bitness = ELFDATA2LSB; + binOpts.alloc = &::malloc; + binOpts.dealloc = &::free; - aclBinary* bin = aclBinaryInit(sizeof(aclBinary), &aclinfo, &binOpts, &err); - if (err != ACL_SUCCESS) { - buildLog_ += "Internal error: Setting up input OpenCL binary failed!\n"; - LogWarning("aclBinaryInit failed"); - return false; - } + aclBinary* bin = aclBinaryInit(sizeof(aclBinary), &aclinfo, &binOpts, &err); + if (err != ACL_SUCCESS) { + buildLog_ += "Internal error: Setting up input OpenCL binary failed!\n"; + LogWarning("aclBinaryInit failed"); + return false; + } - aclSections_0_8 spirFlag; - _acl_type_enum_0_8 aclTypeBinaryUsed; - if (std::string::npos != options->clcOptions.find("--spirv") - || elfSectionType_ == amd::OclElf::SPIRV) { - spirFlag = aclSPIRV; - aclTypeBinaryUsed = ACL_TYPE_SPIRV_BINARY; - } else if (std::string::npos != options->clcOptions.find("--spir") - || elfSectionType_ == amd::OclElf::SPIR) { - spirFlag = aclSPIR; - aclTypeBinaryUsed = ACL_TYPE_SPIR_BINARY; - } else { - spirFlag = aclLLVMIR; - aclTypeBinaryUsed = ACL_TYPE_LLVMIR_BINARY; - } + aclSections_0_8 spirFlag; + _acl_type_enum_0_8 aclTypeBinaryUsed; + if (std::string::npos != options->clcOptions.find("--spirv") || + elfSectionType_ == amd::OclElf::SPIRV) { + spirFlag = aclSPIRV; + aclTypeBinaryUsed = ACL_TYPE_SPIRV_BINARY; + } else if (std::string::npos != options->clcOptions.find("--spir") || + elfSectionType_ == amd::OclElf::SPIR) { + spirFlag = aclSPIR; + aclTypeBinaryUsed = ACL_TYPE_SPIR_BINARY; + } else { + spirFlag = aclLLVMIR; + aclTypeBinaryUsed = ACL_TYPE_LLVMIR_BINARY; + } - if (ACL_SUCCESS != aclInsertSection(compiler(), bin, - llvmBinary_.data(), llvmBinary_.size(), spirFlag)) { - LogWarning("aclInsertSection failed"); - aclBinaryFini(bin); - return false; - } + if (ACL_SUCCESS != + aclInsertSection(compiler(), bin, llvmBinary_.data(), llvmBinary_.size(), spirFlag)) { + LogWarning("aclInsertSection failed"); + aclBinaryFini(bin); + return false; + 
} - // temporary solution to synchronize buildNo between runtime and complib - // until we move runtime inside complib - ((amd::option::Options*)bin->options)->setBuildNo(options->getBuildNo()); + // temporary solution to synchronize buildNo between runtime and complib + // until we move runtime inside complib + ((amd::option::Options*)bin->options)->setBuildNo(options->getBuildNo()); - err = aclCompile(compiler(), bin, options->origOptionStr.c_str(), - aclTypeBinaryUsed, ACL_TYPE_ISA, NULL); + err = aclCompile(compiler(), bin, options->origOptionStr.c_str(), aclTypeBinaryUsed, ACL_TYPE_ISA, + NULL); - buildLog_ += aclGetCompilerLog(compiler()); + buildLog_ += aclGetCompilerLog(compiler()); - if (err != ACL_SUCCESS) { - LogWarning("aclCompile failed"); - aclBinaryFini(bin); - return false; - } + if (err != ACL_SUCCESS) { + LogWarning("aclCompile failed"); + aclBinaryFini(bin); + return false; + } - if (options->oVariables->BinBIF30) { - if (!createBIFBinary(bin)) { - aclBinaryFini(bin); - return false; - } - } - - if (options->oVariables->BinAS && !options->oVariables->UseJIT) { - size_t len = 0; - const char* asmtext = - static_cast(aclExtractSection(compiler(), bin, - &len, aclCODEGEN, &err)); - if (err != ACL_SUCCESS) { - LogWarning("aclExtractSection failed"); - aclBinaryFini(bin); - return false; - } - - // Store the Asm text in ASTEXT section unless the JIT is used - - if (!clBinary()->storeX86Asm(asmtext, len)) { - buildLog_ += "Internal Error: Storing X86 ASM failed!\n"; - return false; - } - } - - size_t len = 0; - const void* isa = aclExtractSection(compiler(), bin, - &len, aclTEXT, &err); - if (err != ACL_SUCCESS) { - LogWarning("aclExtractSection failed"); - aclBinaryFini(bin); - return false; - } - - if (options->oVariables->UseJIT) { - // printf("Using the jit!\n"); - aclJITObjectImage objectImage = aclJITObjectImageCreate(compiler(), isa, len, bin, &err); - if (err != ACL_SUCCESS) { - LogWarning("aclJITObjectImageCreate failed"); - 
aclBinaryFini(bin); - return false; - } - err = aclJITObjectImageFinalize(compiler(), objectImage); - if (err != ACL_SUCCESS) { - LogWarning("aclJITObjectImageFinalize failed"); - aclBinaryFini(bin); - return false; - } - setJITBinary(objectImage); + if (options->oVariables->BinBIF30) { + if (!createBIFBinary(bin)) { aclBinaryFini(bin); + return false; + } + } + + if (options->oVariables->BinAS && !options->oVariables->UseJIT) { + size_t len = 0; + const char* asmtext = + static_cast(aclExtractSection(compiler(), bin, &len, aclCODEGEN, &err)); + if (err != ACL_SUCCESS) { + LogWarning("aclExtractSection failed"); + aclBinaryFini(bin); + return false; + } + + // Store the Asm text in ASTEXT section unless the JIT is used + + if (!clBinary()->storeX86Asm(asmtext, len)) { + buildLog_ += "Internal Error: Storing X86 ASM failed!\n"; + return false; + } + } + + size_t len = 0; + const void* isa = aclExtractSection(compiler(), bin, &len, aclTEXT, &err); + if (err != ACL_SUCCESS) { + LogWarning("aclExtractSection failed"); + aclBinaryFini(bin); + return false; + } + + if (options->oVariables->UseJIT) { + // printf("Using the jit!\n"); + aclJITObjectImage objectImage = aclJITObjectImageCreate(compiler(), isa, len, bin, &err); + if (err != ACL_SUCCESS) { + LogWarning("aclJITObjectImageCreate failed"); + aclBinaryFini(bin); + return false; + } + err = aclJITObjectImageFinalize(compiler(), objectImage); + if (err != ACL_SUCCESS) { + LogWarning("aclJITObjectImageFinalize failed"); + aclBinaryFini(bin); + return false; + } + setJITBinary(objectImage); + aclBinaryFini(bin); // Store the object image binary in the CL binary; - if (!clBinary()->storeX86JIT(*this)) { - buildLog_ += "Internal Error: Storing X86 DLL failed!\n"; - return false; + if (!clBinary()->storeX86JIT(*this)) { + buildLog_ += "Internal Error: Storing X86 DLL failed!\n"; + return false; } #if 0 @@ -646,466 +619,443 @@ Program::compileBinaryToISA(amd::option::Options* options) asmtext.str().c_str()); #endif - 
return true; - } - - std::fstream f; - f.open(dllFileName_.c_str(), std::fstream::out | std::fstream::binary); - f.write(static_cast(isa), len); - f.close(); - - aclBinaryFini(bin); - - if (f.fail() || f.bad()) { - buildLog_ += "Internal error: fail to create an internal file!\n"; - return false; - } - - // Store the dll binary in the CL binary; - if (!clBinary()->storeX86(*this, dllFileName_)) { - buildLog_ += "Internal Error: Storing X86 DLL failed!\n"; - return false; - } - return true; -#endif // WITH_ONLINE_COMPILER + } + + std::fstream f; + f.open(dllFileName_.c_str(), std::fstream::out | std::fstream::binary); + f.write(static_cast(isa), len); + f.close(); + + aclBinaryFini(bin); + + if (f.fail() || f.bad()) { + buildLog_ += "Internal error: fail to create an internal file!\n"; return false; + } + + // Store the dll binary in the CL binary; + if (!clBinary()->storeX86(*this, dllFileName_)) { + buildLog_ += "Internal Error: Storing X86 DLL failed!\n"; + return false; + } + + return true; +#endif // WITH_ONLINE_COMPILER + return false; } -bool -Program::initBuild(amd::option::Options* options) -{ - if (!this->::device::Program::initBuild(options)) { - return false; - } +bool Program::initBuild(amd::option::Options* options) { + if (!this->::device::Program::initBuild(options)) { + return false; + } - options->setPerBuildInfo("cpu", - clBinary()->getEncryptCode(), false); + options->setPerBuildInfo("cpu", clBinary()->getEncryptCode(), false); - /* - -f[no-]bin-source : control .source - -f[no-]bin-llvmir : control .llvmir - -f[no-]bin-amdil : control .amdil - -f[no-]bin-exe : control .text + /* + -f[no-]bin-source : control .source + -f[no-]bin-llvmir : control .llvmir + -f[no-]bin-amdil : control .amdil + -f[no-]bin-exe : control .text - Default: -fno-bin-source -fbin-llvmir -fno-bin-amdil -fbin-exe - */ - // Elf Binary setup - clBinary()->init(options); + Default: -fno-bin-source -fbin-llvmir -fno-bin-amdil -fbin-exe + */ + // Elf Binary setup + 
clBinary()->init(options); - std::string outFileName; - if (options->isDumpFlagSet(amd::option::DUMP_BIF)) { - outFileName = options->getDumpFileName(".bin"); - } - if (!clBinary()->setElfOut(LP64_SWITCH(ELFCLASS32, ELFCLASS64), - (outFileName.size() > 0) - ? outFileName.c_str() : NULL)) { - LogError("setup elfout for CPU failed"); - return false; - } + std::string outFileName; + if (options->isDumpFlagSet(amd::option::DUMP_BIF)) { + outFileName = options->getDumpFileName(".bin"); + } + if (!clBinary()->setElfOut(LP64_SWITCH(ELFCLASS32, ELFCLASS64), + (outFileName.size() > 0) ? outFileName.c_str() : NULL)) { + LogError("setup elfout for CPU failed"); + return false; + } - return true; + return true; } -bool -Program::finiBuild(bool isBuildGood) -{ - clBinary()->resetElfOut(); - clBinary()->resetElfIn(); +bool Program::finiBuild(bool isBuildGood) { + clBinary()->resetElfOut(); + clBinary()->resetElfIn(); - if (!isBuildGood) { - // Prevent the encrypted binary form leaking out - clBinary()->setBinary(NULL, 0); - } + if (!isBuildGood) { + // Prevent the encrypted binary form leaking out + clBinary()->setBinary(NULL, 0); + } - return this->::device::Program::finiBuild(isBuildGood); + return this->::device::Program::finiBuild(isBuildGood); } -bool -Program::compileImpl( - const std::string& sourceCode, - const std::vector& headers, - const char** headerIncludeNames, - amd::option::Options* options) -{ +bool Program::compileImpl(const std::string& sourceCode, + const std::vector& headers, + const char** headerIncludeNames, amd::option::Options* options) { #if defined(WITH_ONLINE_COMPILER) - std::string tempFolder = amd::Os::getTempPath(); + std::string tempFolder = amd::Os::getTempPath(); - std::fstream f; - std::vector headerFileNames(headers.size()); - std::vector newDirs; - for (size_t i = 0; i < headers.size(); ++i) { - std::string headerPath = tempFolder; - std::string headerIncludeName(headerIncludeNames[i]); - // replace / in path with current os's file separator 
- if (amd::Os::fileSeparator() != '/') { - for (std::string::iterator it = headerIncludeName.begin(), - end = headerIncludeName.end(); - it != end; - ++it) { - if (*it == '/') *it = amd::Os::fileSeparator(); - } - } - size_t pos = headerIncludeName.rfind(amd::Os::fileSeparator()); - if (pos != std::string::npos) { - headerPath += amd::Os::fileSeparator(); - headerPath += headerIncludeName.substr(0, pos); - headerIncludeName = headerIncludeName.substr(pos+1); - } - if (!amd::Os::pathExists(headerPath)) { - bool ret = amd::Os::createPath(headerPath); - assert(ret && "failed creating path!"); - newDirs.push_back(headerPath); - } - std::string headerFullName - = headerPath + amd::Os::fileSeparator() + headerIncludeName; - headerFileNames[i] = headerFullName; - f.open(headerFullName.c_str(), std::fstream::out); - assert(!f.fail() && "failed creating header file!"); - f.write(headers[i]->c_str(), headers[i]->length()); - f.close(); + std::fstream f; + std::vector headerFileNames(headers.size()); + std::vector newDirs; + for (size_t i = 0; i < headers.size(); ++i) { + std::string headerPath = tempFolder; + std::string headerIncludeName(headerIncludeNames[i]); + // replace / in path with current os's file separator + if (amd::Os::fileSeparator() != '/') { + for (std::string::iterator it = headerIncludeName.begin(), end = headerIncludeName.end(); + it != end; ++it) { + if (*it == '/') *it = amd::Os::fileSeparator(); + } } - - acl_error err = ACL_SUCCESS; - aclTargetInfo aclinfo = info(); - - aclBinaryOptions binOpts = {0}; - binOpts.struct_size = sizeof(binOpts); - binOpts.elfclass = aclinfo.arch_id == aclX64 ? 
ELFCLASS64 : ELFCLASS32; - binOpts.bitness = ELFDATA2LSB; - binOpts.alloc = &::malloc; - binOpts.dealloc = &::free; - - aclBinary* bin = aclBinaryInit(sizeof(aclBinary), &aclinfo, &binOpts, &err); - if (err != ACL_SUCCESS) { - buildLog_ += "Internal error: Setting up input OpenCL binary failed!\n"; - LogWarning("aclBinaryInit failed"); - return false; + size_t pos = headerIncludeName.rfind(amd::Os::fileSeparator()); + if (pos != std::string::npos) { + headerPath += amd::Os::fileSeparator(); + headerPath += headerIncludeName.substr(0, pos); + headerIncludeName = headerIncludeName.substr(pos + 1); } - - if (ACL_SUCCESS != aclInsertSection(compiler(), bin, - sourceCode.c_str(), sourceCode.size(), aclSOURCE)) { - LogWarning("aclInsertSection failed"); - aclBinaryFini(bin); - return false; + if (!amd::Os::pathExists(headerPath)) { + bool ret = amd::Os::createPath(headerPath); + assert(ret && "failed creating path!"); + newDirs.push_back(headerPath); } + std::string headerFullName = headerPath + amd::Os::fileSeparator() + headerIncludeName; + headerFileNames[i] = headerFullName; + f.open(headerFullName.c_str(), std::fstream::out); + assert(!f.fail() && "failed creating header file!"); + f.write(headers[i]->c_str(), headers[i]->length()); + f.close(); + } - // temporary solution to synchronize buildNo between runtime and complib - // until we move runtime inside complib - ((amd::option::Options*)bin->options)->setBuildNo(options->getBuildNo()); + acl_error err = ACL_SUCCESS; + aclTargetInfo aclinfo = info(); - std::stringstream opts; - std::string token; - opts << options->origOptionStr.c_str(); + aclBinaryOptions binOpts = {0}; + binOpts.struct_size = sizeof(binOpts); + binOpts.elfclass = aclinfo.arch_id == aclX64 ? 
ELFCLASS64 : ELFCLASS32; + binOpts.bitness = ELFDATA2LSB; + binOpts.alloc = &::malloc; + binOpts.dealloc = &::free; - if (options->origOptionStr.find("-cl-std=CL") == std::string::npos) { - switch(OPENCL_MAJOR*100 + OPENCL_MINOR*10) { - case 100: opts << " -cl-std=CL1.0"; break; - case 110: opts << " -cl-std=CL1.1"; break; - case 200: default: - case 120: opts << " -cl-std=CL1.2"; break; - } - } - - //Add only for CL2.0 and later - bool spirFlag = false; - if (options->oVariables->CLStd[2] >= '2') { - opts << " -D" << "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE=" - << device().info().maxGlobalVariableSize_; - spirFlag = true; - } - - // FIXME: Should we prefix everything with -Wf,? - std::istringstream iss(options->clcOptions); - while (getline(iss, token, ' ')) { - if (!token.empty()) { - // Check if this is a -D option - if (token.compare("-D") == 0) { - // It is, skip payload - getline(iss, token, ' '); - continue; - } - opts << " -Wf," << token; - } - } - - if (!headers.empty()) { - opts << " -I" << tempFolder; - } - - if (device().info().imageSupport_) { - opts << " -D__IMAGE_SUPPORT__=1"; - } - if (device().hasFMA4Instructions()) { - opts << " -DFP_FAST_FMA=1 -DFP_FAST_FMAF=1"; - } - - iss.clear(); - iss.str(device().info().extensions_); - while (getline(iss, token, ' ')) { - if (!token.empty()) { - opts << " -D" << token << "=1"; - } - } - - std::string newOpt = opts.str(); - size_t pos = newOpt.find("-fno-bin-llvmir"); - while (pos != std::string::npos) { - newOpt.erase(pos, 15); - pos = newOpt.find("-fno-bin-llvmir"); - } - - err = aclCompile(compiler(), bin, newOpt.c_str(), - ACL_TYPE_OPENCL, spirFlag ? 
ACL_TYPE_SPIR_BINARY : ACL_TYPE_LLVMIR_BINARY, NULL); - - buildLog_ += aclGetCompilerLog(compiler()); - - if (err != ACL_SUCCESS) { - LogWarning("aclCompile failed"); - aclBinaryFini(bin); - return false; - } - - size_t size = 0; - const void* llvmir = aclExtractSection(compiler(), bin, - &size, aclLLVMIR, &err); - if (err != ACL_SUCCESS) { - LogWarning("aclExtractSection failed"); - aclBinaryFini(bin); - return false; - } - - llvmBinary_.assign(reinterpret_cast(llvmir), size); - elfSectionType_ = amd::OclElf::LLVMIR; - aclBinaryFini(bin); - - if (clBinary()->saveSOURCE()) { - clBinary()->elfOut()->addSection( - amd::OclElf::SOURCE, sourceCode.data(), sourceCode.length()); - } - if (clBinary()->saveLLVMIR()) { - clBinary()->elfOut()->addSection( - amd::OclElf::LLVMIR, llvmBinary_.data(), llvmBinary_.size(), false); - // store the original compile options - clBinary()->storeCompileOptions(compileOptions_); - } - - return true; -#else // WITH_ONLINE_COMPILER + aclBinary* bin = aclBinaryInit(sizeof(aclBinary), &aclinfo, &binOpts, &err); + if (err != ACL_SUCCESS) { + buildLog_ += "Internal error: Setting up input OpenCL binary failed!\n"; + LogWarning("aclBinaryInit failed"); return false; + } + + if (ACL_SUCCESS != + aclInsertSection(compiler(), bin, sourceCode.c_str(), sourceCode.size(), aclSOURCE)) { + LogWarning("aclInsertSection failed"); + aclBinaryFini(bin); + return false; + } + + // temporary solution to synchronize buildNo between runtime and complib + // until we move runtime inside complib + ((amd::option::Options*)bin->options)->setBuildNo(options->getBuildNo()); + + std::stringstream opts; + std::string token; + opts << options->origOptionStr.c_str(); + + if (options->origOptionStr.find("-cl-std=CL") == std::string::npos) { + switch (OPENCL_MAJOR * 100 + OPENCL_MINOR * 10) { + case 100: + opts << " -cl-std=CL1.0"; + break; + case 110: + opts << " -cl-std=CL1.1"; + break; + case 200: + default: + case 120: + opts << " -cl-std=CL1.2"; + break; + } + } + + 
// Add only for CL2.0 and later + bool spirFlag = false; + if (options->oVariables->CLStd[2] >= '2') { + opts << " -D" + << "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE=" << device().info().maxGlobalVariableSize_; + spirFlag = true; + } + + // FIXME: Should we prefix everything with -Wf,? + std::istringstream iss(options->clcOptions); + while (getline(iss, token, ' ')) { + if (!token.empty()) { + // Check if this is a -D option + if (token.compare("-D") == 0) { + // It is, skip payload + getline(iss, token, ' '); + continue; + } + opts << " -Wf," << token; + } + } + + if (!headers.empty()) { + opts << " -I" << tempFolder; + } + + if (device().info().imageSupport_) { + opts << " -D__IMAGE_SUPPORT__=1"; + } + if (device().hasFMA4Instructions()) { + opts << " -DFP_FAST_FMA=1 -DFP_FAST_FMAF=1"; + } + + iss.clear(); + iss.str(device().info().extensions_); + while (getline(iss, token, ' ')) { + if (!token.empty()) { + opts << " -D" << token << "=1"; + } + } + + std::string newOpt = opts.str(); + size_t pos = newOpt.find("-fno-bin-llvmir"); + while (pos != std::string::npos) { + newOpt.erase(pos, 15); + pos = newOpt.find("-fno-bin-llvmir"); + } + + err = aclCompile(compiler(), bin, newOpt.c_str(), ACL_TYPE_OPENCL, + spirFlag ? 
ACL_TYPE_SPIR_BINARY : ACL_TYPE_LLVMIR_BINARY, NULL); + + buildLog_ += aclGetCompilerLog(compiler()); + + if (err != ACL_SUCCESS) { + LogWarning("aclCompile failed"); + aclBinaryFini(bin); + return false; + } + + size_t size = 0; + const void* llvmir = aclExtractSection(compiler(), bin, &size, aclLLVMIR, &err); + if (err != ACL_SUCCESS) { + LogWarning("aclExtractSection failed"); + aclBinaryFini(bin); + return false; + } + + llvmBinary_.assign(reinterpret_cast(llvmir), size); + elfSectionType_ = amd::OclElf::LLVMIR; + aclBinaryFini(bin); + + if (clBinary()->saveSOURCE()) { + clBinary()->elfOut()->addSection(amd::OclElf::SOURCE, sourceCode.data(), sourceCode.length()); + } + if (clBinary()->saveLLVMIR()) { + clBinary()->elfOut()->addSection(amd::OclElf::LLVMIR, llvmBinary_.data(), llvmBinary_.size(), + false); + // store the original compile options + clBinary()->storeCompileOptions(compileOptions_); + } + + return true; +#else // WITH_ONLINE_COMPILER + return false; #endif } -bool -Program::loadDllCode(amd::option::Options* options, bool addElfSymbols) -{ - if(options->oVariables->UseJIT) { - acl_error err = ACL_SUCCESS; - aclJITObjectImage objectImage = getJITBinary(); - err = aclJITObjectImageIterateSymbols(compiler(), objectImage, - setKernelInfoCallbackCStr, this); - if (err != ACL_SUCCESS) { +bool Program::loadDllCode(amd::option::Options* options, bool addElfSymbols) { + if (options->oVariables->UseJIT) { + acl_error err = ACL_SUCCESS; + aclJITObjectImage objectImage = getJITBinary(); + err = aclJITObjectImageIterateSymbols(compiler(), objectImage, setKernelInfoCallbackCStr, this); + if (err != ACL_SUCCESS) { + LogWarning("aclJITObjectImageIterateSymbols failed"); + return false; + } + err = aclJITObjectImageIterateSymbols(compiler(), objectImage, setSymbolsCallbackCStr, + clBinary()); + if (err != ACL_SUCCESS) { + LogWarning("aclJITObjectImageIterateSymbols failed"); + return false; + } + size_t size = aclJITObjectImageGetGlobalsSize(compiler(), objectImage, 
&err); + if (err != ACL_SUCCESS) { + LogWarning("aclJITObjectImageGetGlobalsSize failed"); + return false; + } + setGlobalVariableTotalSize(size); + return true; + } +// Check if we have a URI +#if defined(_WIN32) + UINT prevMode = ::SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOGPFAULTERRORBOX); + + handle_ = ::LoadLibraryEx(dllFileName_.c_str(), NULL, DONT_RESOLVE_DLL_REFERENCES); + + ::SetErrorMode(prevMode); +#else + handle_ = amd::Os::loadLibrary(dllFileName_.c_str()); +#endif + if (!handle_) { + return false; + } + + if (!amd::Os::iterateSymbols(handle_, setKernelInfoCallback, this)) { + return false; + } + + // Add cpu symbols into elf + if (addElfSymbols) { + if (!amd::Os::iterateSymbols(handle_, setSymbolsCallback, clBinary())) { + return false; + } + } + + return true; +} + +bool Program::linkImpl(amd::option::Options* options) { +#if defined(WITH_ONLINE_COMPILER) + // If we don't have LLVM binary then attempt to use OCL binary + if (llvmBinary_.empty()) { + // Load ISA + // For elf format, setup elfIn() and this elfIn() will be released + // at the end of build by finiBuild(). 
+ if (!clBinary()->setElfIn(LP64_SWITCH(ELFCLASS32, ELFCLASS64))) { + buildLog_ += "Internal error: Setting up input OpenCL binary failed!\n"; + LogError("Setting up input binary failed"); + return false; + } + + if (options->oVariables->UseJIT) { + bool hasJITBinary; + if (!clBinary()->loadX86JIT(*this, hasJITBinary)) { + return false; + } else if (hasJITBinary) { + aclJITObjectImage objectImage = getJITBinary(); + acl_error err = aclJITObjectImageIterateSymbols(compiler(), objectImage, + setKernelInfoCallbackCStr, this); + if (err != ACL_SUCCESS) { LogWarning("aclJITObjectImageIterateSymbols failed"); return false; - } - err = aclJITObjectImageIterateSymbols(compiler(), objectImage, - setSymbolsCallbackCStr, clBinary()); - if (err != ACL_SUCCESS) { + } + err = aclJITObjectImageIterateSymbols(compiler(), objectImage, setSymbolsCallbackCStr, + clBinary()); + if (err != ACL_SUCCESS) { LogWarning("aclJITObjectImageIterateSymbols failed"); return false; - } - size_t size = aclJITObjectImageGetGlobalsSize(compiler(), objectImage, &err); - if (err != ACL_SUCCESS) { + } + size_t size = aclJITObjectImageGetGlobalsSize(compiler(), objectImage, &err); + if (err != ACL_SUCCESS) { LogWarning("aclJITObjectImageGetGlobalsSize failed"); return false; - } - setGlobalVariableTotalSize(size); - return true; - } - // Check if we have a URI -#if defined(_WIN32) - UINT prevMode = ::SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOGPFAULTERRORBOX); - - handle_ = ::LoadLibraryEx( - dllFileName_.c_str(), NULL,DONT_RESOLVE_DLL_REFERENCES); - - ::SetErrorMode(prevMode); -#else - handle_ = amd::Os::loadLibrary(dllFileName_.c_str()); -#endif - if (!handle_) { - return false; - } - - if (!amd::Os::iterateSymbols(handle_, setKernelInfoCallback, this)) { - return false; - } - - // Add cpu symbols into elf - if (addElfSymbols) { - if (!amd::Os::iterateSymbols(handle_, setSymbolsCallback, clBinary())) { - return false; - } - } - - return true; -} - -bool -Program::linkImpl(amd::option::Options* 
options) -{ -#if defined(WITH_ONLINE_COMPILER) - // If we don't have LLVM binary then attempt to use OCL binary - if (llvmBinary_.empty()) { - // Load ISA - // For elf format, setup elfIn() and this elfIn() will be released - // at the end of build by finiBuild(). - if (!clBinary()->setElfIn(LP64_SWITCH(ELFCLASS32, ELFCLASS64))) { - buildLog_ += "Internal error: Setting up input OpenCL binary failed!\n"; - LogError("Setting up input binary failed"); - return false; - } - - if (options->oVariables->UseJIT) { - bool hasJITBinary; - if (!clBinary()->loadX86JIT(*this, hasJITBinary)) { - return false; - } else if (hasJITBinary) { - aclJITObjectImage objectImage = getJITBinary(); - acl_error err = aclJITObjectImageIterateSymbols(compiler(), objectImage, - setKernelInfoCallbackCStr, this); - if (err != ACL_SUCCESS) { - LogWarning("aclJITObjectImageIterateSymbols failed"); - return false; - } - err = aclJITObjectImageIterateSymbols(compiler(), objectImage, - setSymbolsCallbackCStr, clBinary()); - if (err != ACL_SUCCESS) { - LogWarning("aclJITObjectImageIterateSymbols failed"); - return false; - } - size_t size = aclJITObjectImageGetGlobalsSize(compiler(), objectImage, &err); - if (err != ACL_SUCCESS) { - LogWarning("aclJITObjectImageGetGlobalsSize failed"); - return false; - } - setGlobalVariableTotalSize(size); - return true; - } - // Fall-through to recompile - } else { - // Trying to load DLL that was generated by out-process as/ld before - bool hasDLL = false; - bool loadSuccess = clBinary()->loadX86(*this, dllFileName_, hasDLL); - if (!loadSuccess) { - buildLog_ += "Error: loading a kernel from OpenCL binary failed!\n"; - return false; - } - else if (hasDLL) { - if (loadDllCode(options)) { - // No rebuid and use the original binary. Release any new binary if there is. 
- clBinary()->restoreOrigBinary(); - return true; - } - } - // Fall-through to recompile - } - - // Need to try recompile, check to see if if LLVM IR is present - if (clBinary()->loadLlvmBinary(llvmBinary_, elfSectionType_) && - clBinary()->isRecompilable(llvmBinary_, amd::OclElf::CPU_PLATFORM)) { - // Copy both .source and .llvmir into the elfout_ - char *section; - size_t sz; - if (clBinary()->saveSOURCE() && - clBinary()->elfIn()->getSection(amd::OclElf::SOURCE, §ion, &sz)) { - if ((section != NULL) && (sz > 0)) { - clBinary()->elfOut()->addSection(amd::OclElf::SOURCE, section, sz); - } - } - - if (clBinary()->saveLLVMIR()) { - clBinary()->elfOut()->addSection(elfSectionType_, - llvmBinary_.data(), - llvmBinary_.size(), false); - } - } - // We failed kernels loading (wrong ASIC?) - else { - buildLog_ += "Error: Runtime failed to load kernels from OCL binary!\n"; - LogError(buildLog_.c_str()); - return false; - } - } - - // Do we have llvm binary? - if (!llvmBinary_.empty()) { - // Compile llvm binary to x86 source code - if (!compileBinaryToISA(options)) { - LogError("We failed to compile LLVMIR binary to ASM text!"); - return false; - } - } - - setType(TYPE_EXECUTABLE); - - ///////////////////////////////////////////////////////////// - //////////////// check, there is a good place to finish elf objects - ////////////////////////////////////////////////////////////// - - // Load dll executable - if (loadDllCode(options, clBinary()->saveISA())) { - if (!createBinary(options)) { - buildLog_ += "Internal Error: creating OpenCL binary failed!\n"; - return false; } + setGlobalVariableTotalSize(size); return true; + } + // Fall-through to recompile + } else { + // Trying to load DLL that was generated by out-process as/ld before + bool hasDLL = false; + bool loadSuccess = clBinary()->loadX86(*this, dllFileName_, hasDLL); + if (!loadSuccess) { + buildLog_ += "Error: loading a kernel from OpenCL binary failed!\n"; + return false; + } else if (hasDLL) { + if 
(loadDllCode(options)) { + // No rebuid and use the original binary. Release any new binary if there is. + clBinary()->restoreOrigBinary(); + return true; + } + } + // Fall-through to recompile } - buildLog_ += "Internal Error: loading shared library failed!\n"; -#endif // WITH_ONLINE_COMPILER - return false; + + // Need to try recompile, check to see if if LLVM IR is present + if (clBinary()->loadLlvmBinary(llvmBinary_, elfSectionType_) && + clBinary()->isRecompilable(llvmBinary_, amd::OclElf::CPU_PLATFORM)) { + // Copy both .source and .llvmir into the elfout_ + char* section; + size_t sz; + if (clBinary()->saveSOURCE() && + clBinary()->elfIn()->getSection(amd::OclElf::SOURCE, §ion, &sz)) { + if ((section != NULL) && (sz > 0)) { + clBinary()->elfOut()->addSection(amd::OclElf::SOURCE, section, sz); + } + } + + if (clBinary()->saveLLVMIR()) { + clBinary()->elfOut()->addSection(elfSectionType_, llvmBinary_.data(), llvmBinary_.size(), + false); + } + } + // We failed kernels loading (wrong ASIC?) + else { + buildLog_ += "Error: Runtime failed to load kernels from OCL binary!\n"; + LogError(buildLog_.c_str()); + return false; + } + } + + // Do we have llvm binary? 
+ if (!llvmBinary_.empty()) { + // Compile llvm binary to x86 source code + if (!compileBinaryToISA(options)) { + LogError("We failed to compile LLVMIR binary to ASM text!"); + return false; + } + } + + setType(TYPE_EXECUTABLE); + + ///////////////////////////////////////////////////////////// + //////////////// check, there is a good place to finish elf objects + ////////////////////////////////////////////////////////////// + + // Load dll executable + if (loadDllCode(options, clBinary()->saveISA())) { + if (!createBinary(options)) { + buildLog_ += "Internal Error: creating OpenCL binary failed!\n"; + return false; + } + return true; + } + buildLog_ += "Internal Error: loading shared library failed!\n"; +#endif // WITH_ONLINE_COMPILER + return false; } -bool -Program::linkImpl( - const std::vector& inputPrograms, - amd::option::Options* options, - bool createLibrary) -{ +bool Program::linkImpl(const std::vector& inputPrograms, + amd::option::Options* options, bool createLibrary) { #if defined(WITH_ONLINE_COMPILER) - std::vector llvmBinaries(inputPrograms.size()); - std::vector elfSectionType(inputPrograms.size()); - std::vector::const_iterator it - = inputPrograms.begin(); - std::vector::const_iterator itEnd - = inputPrograms.end(); - for (size_t i = 0; it != itEnd; ++it, ++i) { - Program* program = (Program*)*it; + std::vector llvmBinaries(inputPrograms.size()); + std::vector elfSectionType(inputPrograms.size()); + std::vector::const_iterator it = inputPrograms.begin(); + std::vector::const_iterator itEnd = inputPrograms.end(); + for (size_t i = 0; it != itEnd; ++it, ++i) { + Program* program = (Program*)*it; - if (program->llvmBinary_.empty()) { - if (program->clBinary() == NULL) { - buildLog_ += "Internal error: Input program not compiled!\n"; - LogError("Loading compiled input object failed"); - return false; - } + if (program->llvmBinary_.empty()) { + if (program->clBinary() == NULL) { + buildLog_ += "Internal error: Input program not compiled!\n"; + 
LogError("Loading compiled input object failed"); + return false; + } - // If we don't have LLVM binary then attempt to use OCL binary - // Load ISA - // For elf format, setup elfIn() and this elfIn() will be released - // at the end of build by finiBuild(). - if (!program->clBinary()->setElfIn(LP64_SWITCH(ELFCLASS32, - ELFCLASS64))) { - buildLog_ += "Internal error: Setting up input OpenCL binary" - " failed!\n"; - LogError("Setting up input binary failed"); - return false; - } + // If we don't have LLVM binary then attempt to use OCL binary + // Load ISA + // For elf format, setup elfIn() and this elfIn() will be released + // at the end of build by finiBuild(). + if (!program->clBinary()->setElfIn(LP64_SWITCH(ELFCLASS32, ELFCLASS64))) { + buildLog_ += + "Internal error: Setting up input OpenCL binary" + " failed!\n"; + LogError("Setting up input binary failed"); + return false; + } - // Need to try recompile, check to see if if LLVM IR is present - if (program->clBinary()->loadLlvmBinary(program->llvmBinary_, program->elfSectionType_) && - program->clBinary()->isRecompilable(program->llvmBinary_, - amd::OclElf::CPU_PLATFORM)) { - // Copy both .source and .llvmir into the elfout_ + // Need to try recompile, check to see if if LLVM IR is present + if (program->clBinary()->loadLlvmBinary(program->llvmBinary_, program->elfSectionType_) && + program->clBinary()->isRecompilable(program->llvmBinary_, amd::OclElf::CPU_PLATFORM)) { +// Copy both .source and .llvmir into the elfout_ #if 0 // TODO: copy source into .source section of elfout_ char *section; @@ -1117,222 +1067,211 @@ Program::linkImpl( } } #endif - } - // We failed kernels loading (wrong ASIC?) - else { - buildLog_ += "Error: Runtime failed to load kernels from OCL " - "binary!\n"; - LogError(buildLog_.c_str()); - return false; - } - } - - llvmBinaries[i] = &program->llvmBinary_; - elfSectionType[i] = program->elfSectionType_; + } + // We failed kernels loading (wrong ASIC?) 
+ else { + buildLog_ += + "Error: Runtime failed to load kernels from OCL " + "binary!\n"; + LogError(buildLog_.c_str()); + return false; + } } - acl_error err = ACL_SUCCESS; - aclTargetInfo aclinfo = info(); + llvmBinaries[i] = &program->llvmBinary_; + elfSectionType[i] = program->elfSectionType_; + } - aclBinaryOptions binOpts = {0}; - binOpts.struct_size = sizeof(binOpts); - binOpts.elfclass = aclinfo.arch_id == aclX64 ? ELFCLASS64 : ELFCLASS32; - binOpts.bitness = ELFDATA2LSB; - binOpts.alloc = &::malloc; - binOpts.dealloc = &::free; + acl_error err = ACL_SUCCESS; + aclTargetInfo aclinfo = info(); - std::vector libs(llvmBinaries.size(), NULL); - for (size_t i = 0; i < libs.size(); ++i) { - libs[i] = aclBinaryInit(sizeof(aclBinary), &aclinfo, &binOpts, &err); - if (err != ACL_SUCCESS) { - buildLog_ += "Internal error: Setting up input OpenCL binary failed!\n"; - LogWarning("aclBinaryInit failed"); - break; - } + aclBinaryOptions binOpts = {0}; + binOpts.struct_size = sizeof(binOpts); + binOpts.elfclass = aclinfo.arch_id == aclX64 ? 
ELFCLASS64 : ELFCLASS32; + binOpts.bitness = ELFDATA2LSB; + binOpts.alloc = &::malloc; + binOpts.dealloc = &::free; - _bif_sections_enum_0_8 aclTypeUsed; - if (elfSectionType[i] == amd::OclElf::SPIRV) { - aclTypeUsed = aclSPIRV; - } else if (elfSectionType[i] == amd::OclElf::SPIR) { - aclTypeUsed = aclSPIR; - } else { - aclTypeUsed = aclLLVMIR; - } - err = aclInsertSection(compiler(), libs[i], - llvmBinaries[i]->data(), llvmBinaries[i]->size(), aclTypeUsed); - - if (err != ACL_SUCCESS) { - LogWarning("aclInsertSection failed"); - break; - } - - // temporary solution to synchronize buildNo between runtime and complib - // until we move runtime inside complib - ((amd::option::Options*)libs[i]->options)->setBuildNo( - options->getBuildNo()); + std::vector libs(llvmBinaries.size(), NULL); + for (size_t i = 0; i < libs.size(); ++i) { + libs[i] = aclBinaryInit(sizeof(aclBinary), &aclinfo, &binOpts, &err); + if (err != ACL_SUCCESS) { + buildLog_ += "Internal error: Setting up input OpenCL binary failed!\n"; + LogWarning("aclBinaryInit failed"); + break; } - if (libs.size() > 0 && err == ACL_SUCCESS) do { - unsigned int numLibs = libs.size() - 1; - - if (numLibs > 0) { - err = aclLink(compiler(), libs[0], libs.size() - 1, &libs[1], - ACL_TYPE_LLVMIR_BINARY, "-create-library", NULL); - - buildLog_ += aclGetCompilerLog(compiler()); - - if (err != ACL_SUCCESS) { - LogWarning("aclLink failed"); - break; - } - } - - size_t size = 0; - _bif_sections_enum_0_8 aclTypeUsed; - if (elfSectionType[0] == amd::OclElf::SPIRV && numLibs == 0) { - aclTypeUsed = aclSPIRV; - } else if (elfSectionType[0] == amd::OclElf::SPIR && numLibs == 0) { - aclTypeUsed = aclSPIR; - } else { - aclTypeUsed = aclLLVMIR; - } - const void* llvmir = aclExtractSection(compiler(), libs[0], - &size, aclTypeUsed, &err); - if (err != ACL_SUCCESS) { - LogWarning("aclExtractSection failed"); - break; - } - - llvmBinary_.assign(reinterpret_cast(llvmir), size); - } while(0); - - std::for_each(libs.begin(), libs.end(), 
std::ptr_fun(aclBinaryFini)); + _bif_sections_enum_0_8 aclTypeUsed; + if (elfSectionType[i] == amd::OclElf::SPIRV) { + aclTypeUsed = aclSPIRV; + } else if (elfSectionType[i] == amd::OclElf::SPIR) { + aclTypeUsed = aclSPIR; + } else { + aclTypeUsed = aclLLVMIR; + } + err = aclInsertSection(compiler(), libs[i], llvmBinaries[i]->data(), llvmBinaries[i]->size(), + aclTypeUsed); if (err != ACL_SUCCESS) { - buildLog_ += "Error: linking llvm modules failed!"; - return false; + LogWarning("aclInsertSection failed"); + break; } - if (clBinary()->saveLLVMIR()) { - clBinary()->elfOut()->addSection(elfSectionType_, - llvmBinary_.data(), - llvmBinary_.size(), - false); - // store the original link options - clBinary()->storeLinkOptions(linkOptions_); - clBinary()->storeCompileOptions(compileOptions_); - } + // temporary solution to synchronize buildNo between runtime and complib + // until we move runtime inside complib + ((amd::option::Options*)libs[i]->options)->setBuildNo(options->getBuildNo()); + } - // skip the rest if we are building an opencl library - if (createLibrary) { - setType(TYPE_LIBRARY); - if (!createBinary(options)) { - buildLog_ += "Intenral error: creating OpenCL binary failed\n"; - return false; + if (libs.size() > 0 && err == ACL_SUCCESS) do { + unsigned int numLibs = libs.size() - 1; + + if (numLibs > 0) { + err = aclLink(compiler(), libs[0], libs.size() - 1, &libs[1], ACL_TYPE_LLVMIR_BINARY, + "-create-library", NULL); + + buildLog_ += aclGetCompilerLog(compiler()); + + if (err != ACL_SUCCESS) { + LogWarning("aclLink failed"); + break; } + } - return true; - } + size_t size = 0; + _bif_sections_enum_0_8 aclTypeUsed; + if (elfSectionType[0] == amd::OclElf::SPIRV && numLibs == 0) { + aclTypeUsed = aclSPIRV; + } else if (elfSectionType[0] == amd::OclElf::SPIR && numLibs == 0) { + aclTypeUsed = aclSPIR; + } else { + aclTypeUsed = aclLLVMIR; + } + const void* llvmir = aclExtractSection(compiler(), libs[0], &size, aclTypeUsed, &err); + if (err != ACL_SUCCESS) 
{ + LogWarning("aclExtractSection failed"); + break; + } - // Compile llvm binary to x86 source code - if (!compileBinaryToISA(options)) { - LogError("We failed to compile LLVMIR binary to ASM text!"); - return false; - } + llvmBinary_.assign(reinterpret_cast(llvmir), size); + } while (0); - setType(TYPE_EXECUTABLE); + std::for_each(libs.begin(), libs.end(), std::ptr_fun(aclBinaryFini)); - ///////////////////////////////////////////////////////////// - //////////////// check, there is a good place to finish elf objects - ////////////////////////////////////////////////////////////// - - // Load dll executable - if (loadDllCode(options, clBinary()->saveISA())) { - if (!createBinary(options)) { - buildLog_ += "Internal Error: creating OpenCL binary failed!\n"; - return false; - } - return true; - } - buildLog_ += "Internal Error: loading shared library failed!\n"; -#endif // WITH_ONLINE_COMPILER + if (err != ACL_SUCCESS) { + buildLog_ += "Error: linking llvm modules failed!"; return false; + } + + if (clBinary()->saveLLVMIR()) { + clBinary()->elfOut()->addSection(elfSectionType_, llvmBinary_.data(), llvmBinary_.size(), + false); + // store the original link options + clBinary()->storeLinkOptions(linkOptions_); + clBinary()->storeCompileOptions(compileOptions_); + } + + // skip the rest if we are building an opencl library + if (createLibrary) { + setType(TYPE_LIBRARY); + if (!createBinary(options)) { + buildLog_ += "Intenral error: creating OpenCL binary failed\n"; + return false; + } + + return true; + } + + // Compile llvm binary to x86 source code + if (!compileBinaryToISA(options)) { + LogError("We failed to compile LLVMIR binary to ASM text!"); + return false; + } + + setType(TYPE_EXECUTABLE); + + ///////////////////////////////////////////////////////////// + //////////////// check, there is a good place to finish elf objects + ////////////////////////////////////////////////////////////// + + // Load dll executable + if (loadDllCode(options, 
clBinary()->saveISA())) { + if (!createBinary(options)) { + buildLog_ += "Internal Error: creating OpenCL binary failed!\n"; + return false; + } + return true; + } + buildLog_ += "Internal Error: loading shared library failed!\n"; +#endif // WITH_ONLINE_COMPILER + return false; } -bool -Program::initClBinary() -{ +bool Program::initClBinary() { + if (clBinary_ == NULL) { + clBinary_ = new ClBinary(device()); if (clBinary_ == NULL) { - clBinary_ = new ClBinary(device()); - if (clBinary_ == NULL) { - return false; - } + return false; } + } + return true; +} + +void Program::releaseClBinary() { + if (clBinary_ != NULL) { + delete clBinary_; + clBinary_ = NULL; + } +} + +bool Program::createBinary(amd::option::Options* options) { + if (options->oVariables->BinBIF30) { return true; + } + + if (!clBinary()->createElfBinary(options->oVariables->BinEncrypt, type())) { + buildLog_ += "Internal Error: creating OpenCL binary failed!\n"; + LogError("Failed to create ELF binary image!"); + return false; + } + return true; } -void -Program::releaseClBinary() -{ - if (clBinary_ != NULL) { - delete clBinary_; - clBinary_ = NULL; - } +const aclTargetInfo& Program::info(const char* str) { + acl_error err = ACL_SUCCESS; + info_ = aclGetTargetInfo(LP64_SWITCH("x86", "x86-64"), (str && str[0] == '\0' ? 
"Generic" : str), + &err); + if (err != ACL_SUCCESS) { + LogWarning("aclGetTargetInfo failed"); + } + return info_; } -bool -Program::createBinary(amd::option::Options* options) -{ - if (options->oVariables->BinBIF30) { - return true; - } +Program::~Program() { + if (getJITBinary() != NULL) { + aclJITObjectImageDestroy(compiler(), getJITBinary()); + } - if (!clBinary()->createElfBinary(options->oVariables->BinEncrypt, - type())) { - buildLog_ += "Internal Error: creating OpenCL binary failed!\n"; - LogError("Failed to create ELF binary image!"); - return false; - } - return true; -} + if (!sourceFileName_.empty()) { + amd::Os::unlink(sourceFileName_.c_str()); + } -const aclTargetInfo & -Program::info(const char * str) { - acl_error err = ACL_SUCCESS; - info_ = aclGetTargetInfo(LP64_SWITCH("x86", "x86-64"), ( str && str[0] == '\0' ? "Generic" : str ), &err); - if (err != ACL_SUCCESS) { - LogWarning("aclGetTargetInfo failed"); - } - return info_; -} - -Program::~Program() -{ - if(getJITBinary() != NULL) { - aclJITObjectImageDestroy(compiler(), getJITBinary()); - } - - if (!sourceFileName_.empty()) { - amd::Os::unlink(sourceFileName_.c_str()); - } - - if (handle_ != NULL) { - amd::Os::unloadLibrary(handle_); - amd::Os::unlink(dllFileName_); - char dllName[256]; + if (handle_ != NULL) { + amd::Os::unloadLibrary(handle_); + amd::Os::unlink(dllFileName_); + char dllName[256]; #ifdef _WIN32 - memcpy(dllName, dllFileName_.data(), dllFileName_.size()); - char* tempName = strrchr(dllName, '.'); - if (tempName != NULL) { - *tempName = '\0'; - amd::Os::unlink(dllName); - } -#endif // _WIN32 + memcpy(dllName, dllFileName_.data(), dllFileName_.size()); + char* tempName = strrchr(dllName, '.'); + if (tempName != NULL) { + *tempName = '\0'; + amd::Os::unlink(dllName); } +#endif // _WIN32 + } #if defined(WITH_ONLINE_COMPILER) - releaseClBinary(); + releaseClBinary(); #endif } -} // namespace cpu +} // namespace cpu diff --git a/rocclr/runtime/device/cpu/cpuprogram.hpp 
b/rocclr/runtime/device/cpu/cpuprogram.hpp index 47fec428ba..908956617a 100644 --- a/rocclr/runtime/device/cpu/cpuprogram.hpp +++ b/rocclr/runtime/device/cpu/cpuprogram.hpp @@ -14,106 +14,94 @@ namespace amd { namespace option { class Options; -} // option -} // amd +} // option +} // amd //! \namespace cpu CPU Device Implementation namespace cpu { //! \class CPU program -class Program : public device::Program -{ -private: - aclJITObjectImage JITBinary; - std::string sourceFileName_; //!< The source image. - void* handle_; // @todo: remove me +class Program : public device::Program { + private: + aclJITObjectImage JITBinary; + std::string sourceFileName_; //!< The source image. + void* handle_; // @todo: remove me -public: - //! Default constructor - Program(Device& cpuDev) - : device::Program(cpuDev), JITBinary(NULL), handle_(NULL) {} + public: + //! Default constructor + Program(Device& cpuDev) : device::Program(cpuDev), JITBinary(NULL), handle_(NULL) {} - //! Default destructor - ~Program(); + //! Default destructor + ~Program(); - //! pre-compile setup for CPU - virtual bool initBuild(amd::option::Options* options); + //! pre-compile setup for CPU + virtual bool initBuild(amd::option::Options* options); - //! post-compile setup for CPU - virtual bool finiBuild(bool isBuildGood); + //! post-compile setup for CPU + virtual bool finiBuild(bool isBuildGood); - //! Compiles CPU program - virtual bool compileImpl( - const std::string& sourceCode, - const std::vector& headers, - const char** headerIncludeNames, - amd::option::Options* options ); + //! Compiles CPU program + virtual bool compileImpl(const std::string& sourceCode, + const std::vector& headers, + const char** headerIncludeNames, amd::option::Options* options); - //! Links CPU program - virtual bool linkImpl(amd::option::Options* options = NULL); + //! Links CPU program + virtual bool linkImpl(amd::option::Options* options = NULL); - //! 
Links CPU programs - virtual bool linkImpl( - const std::vector& inputPrograms, - amd::option::Options* options = NULL, - bool createLibrary = false); + //! Links CPU programs + virtual bool linkImpl(const std::vector& inputPrograms, + amd::option::Options* options = NULL, bool createLibrary = false); - virtual bool createBinary(amd::option::Options* options); + virtual bool createBinary(amd::option::Options* options); - //! Returns the device object, associated with this program. - const Device& device() { - return static_cast(device::Program::device()); - } + //! Returns the device object, associated with this program. + const Device& device() { return static_cast(device::Program::device()); } - /*! \brief Invokes the LLC compiler for the LLVM binary compilation - * to x86 ASM text source code and ISA binary - * - * \return True if we successefully compiled a CPU program - */ - bool compileBinaryToISA( - amd::option::Options* options //!< options for compilation - ); + /*! \brief Invokes the LLC compiler for the LLVM binary compilation + * to x86 ASM text source code and ISA binary + * + * \return True if we successefully compiled a CPU program + */ + bool compileBinaryToISA(amd::option::Options* options //!< options for compilation + ); - //! Load the library into memory - bool loadDllCode(amd::option::Options* options, bool addElfSymbols=false); + //! Load the library into memory + bool loadDllCode(amd::option::Options* options, bool addElfSymbols = false); - //! Initialize binary for CPU - virtual bool initClBinary(); + //! Initialize binary for CPU + virtual bool initClBinary(); - //! Release binary for CPU - virtual void releaseClBinary(); + //! 
Release binary for CPU + virtual void releaseClBinary(); - ClBinary* clBinary() { - return static_cast(device::Program::clBinary()); - } - const ClBinary* clBinary() const { - return static_cast(device::Program::clBinary()); - } + ClBinary* clBinary() { return static_cast(device::Program::clBinary()); } + const ClBinary* clBinary() const { + return static_cast(device::Program::clBinary()); + } - aclJITObjectImage getJITBinary() { return this->JITBinary; } - void setJITBinary(aclJITObjectImage JITBinary) { this->JITBinary = JITBinary; } + aclJITObjectImage getJITBinary() { return this->JITBinary; } + void setJITBinary(aclJITObjectImage JITBinary) { this->JITBinary = JITBinary; } - //! Returns the pointer to the Compiler struct - //! Became public (prev. private) due to use in cpubinary for aclJIT functionality - aclCompiler* compiler() { return static_cast(device()).compiler(); } + //! Returns the pointer to the Compiler struct + //! Became public (prev. private) due to use in cpubinary for aclJIT functionality + aclCompiler* compiler() { return static_cast(device()).compiler(); } -private: + private: + //! Disable default copy constructor + Program(const Program&); - //! Disable default copy constructor - Program(const Program&); + //! Disable operator= + Program& operator=(const Program&); - //! 
Disable operator= - Program& operator=(const Program&); + std::string dllFileName_; //!< File name of the dll with kernels + protected: + virtual bool isElf(const char* bin) const { + return amd::isElfHeader(bin, LP64_SWITCH(ELFCLASS32, ELFCLASS64)); + } - std::string dllFileName_; //!< File name of the dll with kernels -protected: - virtual bool isElf(const char* bin) const { - return amd::isElfHeader(bin, LP64_SWITCH(ELFCLASS32, ELFCLASS64)); - } - - virtual const aclTargetInfo & info(const char * str = ""); + virtual const aclTargetInfo& info(const char* str = ""); }; -} // namespace cpu +} // namespace cpu -#endif // CPUPROGRAM_HPP_ +#endif // CPUPROGRAM_HPP_ diff --git a/rocclr/runtime/device/cpu/cpusettings.cpp b/rocclr/runtime/device/cpu/cpusettings.cpp index da4fb15344..1389f235a9 100644 --- a/rocclr/runtime/device/cpu/cpusettings.cpp +++ b/rocclr/runtime/device/cpu/cpusettings.cpp @@ -7,99 +7,92 @@ namespace cpu { -bool -Settings::create() -{ - // This code is temporary until cl_khr_fp64 is unconditional - if (flagIsDefault(CL_KHR_FP64) || CL_KHR_FP64) { - enableExtension(ClKhrFp64); - } +bool Settings::create() { + // This code is temporary until cl_khr_fp64 is unconditional + if (flagIsDefault(CL_KHR_FP64) || CL_KHR_FP64) { + enableExtension(ClKhrFp64); + } - enableExtension(ClAmdFp64); - enableExtension(ClKhrGlobalInt32BaseAtomics); - enableExtension(ClKhrGlobalInt32ExtendedAtomics); - enableExtension(ClKhrLocalInt32BaseAtomics); - enableExtension(ClKhrLocalInt32ExtendedAtomics); + enableExtension(ClAmdFp64); + enableExtension(ClKhrGlobalInt32BaseAtomics); + enableExtension(ClKhrGlobalInt32ExtendedAtomics); + enableExtension(ClKhrLocalInt32BaseAtomics); + enableExtension(ClKhrLocalInt32ExtendedAtomics); #ifdef _LP64 - enableExtension(ClKhrInt64BaseAtomics); - enableExtension(ClKhrInt64ExtendedAtomics); -#endif // _LP64 - enableExtension(ClKhrByteAddressableStore); - enableExtension(ClKhrGlSharing); - enableExtension(ClKhrGlEvent); - 
enableExtension(ClExtDeviceFission); - enableExtension(ClAmdDeviceAttributeQuery); - enableExtension(ClAmdVec3); - enableExtension(ClAmdMediaOps); - enableExtension(ClAmdMediaOps2); - enableExtension(ClAmdPopcnt); - enableExtension(ClAmdPrintf); + enableExtension(ClKhrInt64BaseAtomics); + enableExtension(ClKhrInt64ExtendedAtomics); +#endif // _LP64 + enableExtension(ClKhrByteAddressableStore); + enableExtension(ClKhrGlSharing); + enableExtension(ClKhrGlEvent); + enableExtension(ClExtDeviceFission); + enableExtension(ClAmdDeviceAttributeQuery); + enableExtension(ClAmdVec3); + enableExtension(ClAmdMediaOps); + enableExtension(ClAmdMediaOps2); + enableExtension(ClAmdPopcnt); + enableExtension(ClAmdPrintf); - // enableExtension(ClKhrSelectFpRoundingMode); - enableExtension(ClKhr3DImageWrites); + // enableExtension(ClKhrSelectFpRoundingMode); + enableExtension(ClKhr3DImageWrites); - // enableExtension(ClKhrFp16); +// enableExtension(ClKhrFp16); #if defined(_WIN32) - enableExtension(ClKhrD3d10Sharing); -#endif // _WIN32 - enableExtension(ClKhrSpir); - // ToDo: enable this after conformance test is updated to accept it - // enableExtension(ClKhrIlProgram); + enableExtension(ClKhrD3d10Sharing); +#endif // _WIN32 + enableExtension(ClKhrSpir); + // ToDo: enable this after conformance test is updated to accept it + // enableExtension(ClKhrIlProgram); - // Enable some OpenCL 2.0 extensions - if ((OPENCL_MAJOR >= 2) && (CPU_OPENCL_VERSION >= 200)) { - partialDispatch_ = true; - enableExtension(ClKhrSubGroups); - supportDepthsRGB_ = true; - enableExtension(ClKhrDepthImages); - } + // Enable some OpenCL 2.0 extensions + if ((OPENCL_MAJOR >= 2) && (CPU_OPENCL_VERSION >= 200)) { + partialDispatch_ = true; + enableExtension(ClKhrSubGroups); + supportDepthsRGB_ = true; + enableExtension(ClKhrDepthImages); + } - // Map CPUID feature bits to our own feature bits - const int sse2_features = CPUFEAT_DX_SSE | CPUFEAT_DX_SSE2; - const int avx_features = CPUFEAT_CX_SSE3 | CPUFEAT_CX_SSSE3 | 
- CPUFEAT_CX_SSE4_1 | CPUFEAT_CX_SSE4_2 | - CPUFEAT_CX_POPCNT | CPUFEAT_CX_AVX | - CPUFEAT_CX_OSXSAVE; - const int fma3_features = INTEL_CPUFEAT_CX_FMA3; - const int fma4_features = AMD_CPUFEAT_CX_FMA4 | AMD_CPUFEAT_CX_XOP; - int regs[4]; + // Map CPUID feature bits to our own feature bits + const int sse2_features = CPUFEAT_DX_SSE | CPUFEAT_DX_SSE2; + const int avx_features = CPUFEAT_CX_SSE3 | CPUFEAT_CX_SSSE3 | CPUFEAT_CX_SSE4_1 | + CPUFEAT_CX_SSE4_2 | CPUFEAT_CX_POPCNT | CPUFEAT_CX_AVX | CPUFEAT_CX_OSXSAVE; + const int fma3_features = INTEL_CPUFEAT_CX_FMA3; + const int fma4_features = AMD_CPUFEAT_CX_FMA4 | AMD_CPUFEAT_CX_XOP; + int regs[4]; #if defined(ATI_ARCH_X86) - amd::Os::cpuid(regs, 0x0); - bool isAmd = regs[1] == ('A' | ('u' << 8) | ('t' << 16) | ('h' << 24)); - bool isIntel = regs[1] == ('G' | ('e' << 8) | ('n' << 16) | ('u' << 24)); + amd::Os::cpuid(regs, 0x0); + bool isAmd = regs[1] == ('A' | ('u' << 8) | ('t' << 16) | ('h' << 24)); + bool isIntel = regs[1] == ('G' | ('e' << 8) | ('n' << 16) | ('u' << 24)); - amd::Os::cpuid(regs, 0x1); + amd::Os::cpuid(regs, 0x1); - cpuFeatures_ = (regs[3] & sse2_features) == sse2_features ? - SSE2Instructions : 0; + cpuFeatures_ = (regs[3] & sse2_features) == sse2_features ? SSE2Instructions : 0; - if ((regs[2] & avx_features) == avx_features) { - // Check for state support - uint64_t xcr0 = amd::Os::xgetbv(0); + if ((regs[2] & avx_features) == avx_features) { + // Check for state support + uint64_t xcr0 = amd::Os::xgetbv(0); - // Check for SSE and YMM bits (1 and 2) - if (((uint32_t)xcr0 & 0x6U) == 0x6U) { - cpuFeatures_ |= AVXInstructions; + // Check for SSE and YMM bits (1 and 2) + if (((uint32_t)xcr0 & 0x6U) == 0x6U) { + cpuFeatures_ |= AVXInstructions; - // Now check for FMA and XOP - if (isIntel) { - cpuFeatures_ |= (regs[2] & fma3_features) == fma3_features ? - FMA3Instructions : 0; - } + // Now check for FMA and XOP + if (isIntel) { + cpuFeatures_ |= (regs[2] & fma3_features) == fma3_features ? 
FMA3Instructions : 0; + } - if (isAmd) { - amd::Os::cpuid(regs, 0x80000001); - cpuFeatures_ |= (regs[2] & fma4_features) == fma4_features ? - FMA4Instructions : 0; - } - } + if (isAmd) { + amd::Os::cpuid(regs, 0x80000001); + cpuFeatures_ |= (regs[2] & fma4_features) == fma4_features ? FMA4Instructions : 0; + } } -#endif // ATI_ARCH_X86 + } +#endif // ATI_ARCH_X86 - return true; + return true; } -} // namespace cpu +} // namespace cpu diff --git a/rocclr/runtime/device/cpu/cpusettings.hpp b/rocclr/runtime/device/cpu/cpusettings.hpp index 6f15780c2d..a5958d8900 100644 --- a/rocclr/runtime/device/cpu/cpusettings.hpp +++ b/rocclr/runtime/device/cpu/cpusettings.hpp @@ -13,32 +13,31 @@ namespace cpu { //! Device settings -class Settings : public device::Settings -{ -public: - enum CpuFeatures { - SSE2Instructions = 0x01, - AVXInstructions = 0x02, // Processor reports SSSE3, SSE4_1, SSE4_2 - // POPCNT and AVX - FMA3Instructions = 0x04, // Intel processor reports FMA3 - FMA4Instructions = 0x08 // AMD processor reports FMA4 and XOP - }; - uint32_t cpuFeatures_; //!< CPU features +class Settings : public device::Settings { + public: + enum CpuFeatures { + SSE2Instructions = 0x01, + AVXInstructions = 0x02, // Processor reports SSSE3, SSE4_1, SSE4_2 + // POPCNT and AVX + FMA3Instructions = 0x04, // Intel processor reports FMA3 + FMA4Instructions = 0x08 // AMD processor reports FMA4 and XOP + }; + uint32_t cpuFeatures_; //!< CPU features - //! Default constructor - Settings() { cpuFeatures_ = 0; } + //! Default constructor + Settings() { cpuFeatures_ = 0; } - //! Creates settings - bool create(); + //! Creates settings + bool create(); -private: - //! Disable copy constructor - Settings(const Settings&); + private: + //! Disable copy constructor + Settings(const Settings&); - //! Disable assignment - Settings& operator=(const Settings&); + //! 
Disable assignment + Settings& operator=(const Settings&); }; -} // namespace cpu +} // namespace cpu -#endif // CPUSETTINGS_HPP_ +#endif // CPUSETTINGS_HPP_ diff --git a/rocclr/runtime/device/cpu/cputables.hpp b/rocclr/runtime/device/cpu/cputables.hpp index 3e98d24878..31f02a489a 100644 --- a/rocclr/runtime/device/cpu/cputables.hpp +++ b/rocclr/runtime/device/cpu/cputables.hpp @@ -3,2013 +3,1008 @@ #define TABLES_HPP_ static const uint32_t __ALIGNED__(32) cpuTables[8024] = { - 0x00000000, 0x3faff55b, 0xbd38db2c, 0x3e56e59f, - 0x00000000, 0x3fb0f99e, 0x54dedf96, 0x3e64e3aa, - 0x00000000, 0x3fb1f86d, 0xab1bda88, 0x3e67e105, - 0x00000000, 0x3fb2f719, 0x4d013fd0, 0x3e48c525, - 0x00000000, 0x3fb3f59f, 0x3ad62670, 0x3e2cf8ab, - 0x00000000, 0x3fb4f3fd, 0xbec80468, 0x3e59dca4, - 0x00000000, 0x3fb5f232, 0xec98a8da, 0x3e53f4b5, - 0x00000000, 0x3fb6f03b, 0x619d81fe, 0x3e6b9d49, - 0x00000000, 0x3fb7ee18, 0x87460934, 0x3e430178, - 0x00000000, 0x3fb8ebc5, 0xca0b9944, 0x3e511e3e, - 0x00000000, 0x3fb9e941, 0x3c5a332e, 0x3e54f3f7, - 0x00000000, 0x3fbae68a, 0xae0e00a6, 0x3e5c71c8, - 0x00000000, 0x3fbbe39e, 0xf86fbdc7, 0x3e67cde0, - 0x00000000, 0x3fbce07c, 0x8c889c72, 0x3e570f32, - 0x00000000, 0x3fbddd21, 0x9b994efe, 0x3e5c07ae, - 0x00000000, 0x3fbed98c, 0x1d7b1698, 0x3e40c802, - 0x00000000, 0x3fbfd5ba, 0xedb8cb22, 0x3e635585, - 0x00000000, 0x3fc068d5, 0x67b30e96, 0x3e708425, - 0x00000000, 0x3fc0e6ad, 0x1031472e, 0x3e799e81, - 0x00000000, 0x3fc16465, 0x1416bcee, 0x3e604182, - 0x00000000, 0x3fc1e1fa, 0xe4dc96f4, 0x3e7f6086, - 0x00000000, 0x3fc25f6e, 0x5c5f1b58, 0x3e471a53, - 0x00000000, 0x3fc2dcbd, 0x3fe63ca1, 0x3e765f74, - 0x00000000, 0x3fc359e8, 0x3472d014, 0x3e7dbd73, - 0x00000000, 0x3fc3d6ee, 0x4d8b0d1d, 0x3e7d18cc, - 0x00000000, 0x3fc453ce, 0x53c8fb29, 0x3e78c125, - 0x00000000, 0x3fc4d087, 0xe2e8f991, 0x3e753b49, - 0x00000000, 0x3fc54d18, 0xe148c141, 0x3e77422a, - 0x00000000, 0x3fc5c981, 0x69df56a8, 0x3e4e3ec2, - 0x00000000, 0x3fc645bf, 0x4e7e0ac9, 0x3e7ff675, - 0x00000000, 
0x3fc6c1d4, 0x7b1b5aad, 0x3e713126, - 0x00000000, 0x3fc73dbd, 0x403a94bc, 0x3e7d14fa, - 0x00000000, 0x3fc7b97b, 0xc089a3d8, 0x3e62f396, - 0x00000000, 0x3fc8350b, 0x78fa95bb, 0x3e7c731d, - 0x00000000, 0x3fc8b06e, 0x85177399, 0x3e7c50f3, - 0x00000000, 0x3fc92ba3, 0x9c6f2c20, 0x3e6f4140, - 0x00000000, 0x3fc9a6a8, 0xc4c39ec0, 0x3e7d2d90, - 0x00000000, 0x3fca217e, 0x696f2106, 0x3e680420, - 0x00000000, 0x3fca9c23, 0x7943a2e8, 0x3e4b4032, - 0x00000000, 0x3fcb1696, 0x02f3d2a2, 0x3e65d35e, - 0x00000000, 0x3fcb90d7, 0x288117b0, 0x3e64a498, - 0x00000000, 0x3fcc0ae5, 0x19afb324, 0x3e635da1, - 0x00000000, 0x3fcc84bf, 0xcdb9a908, 0x3e714e85, - 0x00000000, 0x3fccfe65, 0xe5547b9a, 0x3e638754, - 0x00000000, 0x3fcd77d5, 0xe6ce3246, 0x3e7be40a, - 0x00000000, 0x3fcdf110, 0xb3bea7e7, 0x3e70c993, - 0x00000000, 0x3fce6a14, 0x89ac3359, 0x3e71d2dd, - 0x00000000, 0x3fcee2e1, 0x03332c46, 0x3e614766, - 0x00000000, 0x3fcf5b75, 0x1bac55b7, 0x3e7f2590, - 0x00000000, 0x3fcfd3d1, 0x7c826e28, 0x3e7f881b, - 0x00000000, 0x3fd025fa, 0x6d698d20, 0x3e744199, - 0x00000000, 0x3fd061ee, 0x521ea089, 0x3e8407ac, - 0x00000000, 0x3fd09dc5, 0x6c4b1723, 0x3e82fb0c, - 0x00000000, 0x3fd0d97e, 0x966a3e18, 0x3e8ca135, - 0x00000000, 0x3fd1151a, 0xe4d646e4, 0x3e6b1218, - 0x00000000, 0x3fd15097, 0xa350d288, 0x3e6d4e72, - 0x00000000, 0x3fd18bf5, 0x2f04c329, 0x3e84617e, - 0x00000000, 0x3fd1c735, 0x41e82650, 0x3e6096ec, - 0x00000000, 0x3fd20255, 0x25773e6e, 0x3e79f91f, - 0x00000000, 0x3fd23d56, 0x20f1d674, 0x3e659c08, - 0x00000000, 0x3fd27837, 0xa2df1064, 0x3e602bf7, - 0x00000000, 0x3fd2b2f7, 0xfc40508f, 0x3e8fb36b, - 0x00000000, 0x3fd2ed98, 0x3f8dc892, 0x3e7ea08f, - 0x00000000, 0x3fd32818, 0x54656a0e, 0x3e73ed62, - 0x00000000, 0x3fd36277, 0xe5e69c58, 0x3e6b83f5, - 0x00000000, 0x3fd39cb4, 0xaf768592, 0x3e8d6ec2, - 0x00000000, 0x3fd3d6d1, 0x9a226f94, 0x3e649388, - 0x00000000, 0x3fd410cb, 0xa65279ba, 0x3e85ad8f, - 0x00000000, 0x3fd44aa4, 0x84d45434, 0x3e6b6157, - 0x00000000, 0x3fd4845a, 0x4368f145, 0x3e809a18, - 0x00000000, 
0x3fd4bdee, 0x39b0d91c, 0x3e761a24, - 0x00000000, 0x3fd4f75f, 0x5e39a978, 0x3e7ce1a6, - 0x00000000, 0x3fd530ad, 0xa93b6a66, 0x3e832a39, - 0x00000000, 0x3fd569d8, 0x9af804e7, 0x3e81c369, - 0x00000000, 0x3fd5a2e0, 0x4e44ede8, 0x3e575e0f, - 0x00000000, 0x3fd5dbc3, 0xd1a7a83b, 0x3e8f77ce, - 0x00000000, 0x3fd61484, 0x0cb1b500, 0x3e284e7f, - 0x00000000, 0x3fd64d1f, 0x38b02dfe, 0x3e8ec6b8, - 0x00000000, 0x3fd68597, 0xdfbeda87, 0x3e83ebf4, - 0x00000000, 0x3fd6bdea, 0xed9cb475, 0x3e89397a, - 0x00000000, 0x3fd6f619, 0xbc239c54, 0x3e707937, - 0x00000000, 0x3fd72e22, 0x553131b6, 0x3e8aa754, - 0x00000000, 0x3fd76607, 0x407c45dc, 0x3e74a05d, - 0x00000000, 0x3fd79dc6, 0x1a206dd0, 0x3e813223, - 0x00000000, 0x3fd7d560, 0xfdd69c88, 0x3e72d8ec, - 0x00000000, 0x3fd80cd4, 0x74218606, 0x3e7a852c, - 0x00000000, 0x3fd84422, 0xbaeebb50, 0x3e871bf2, - 0x00000000, 0x3fd87b4b, 0xb7491820, 0x3e483d7d, - 0x00000000, 0x3fd8b24d, 0x92b6da14, 0x3e6ca50d, - 0x00000000, 0x3fd8e929, 0xe8530298, 0x3e56f5cd, - 0x00000000, 0x3fd91fde, 0x98910740, 0x3e7f3431, - 0x00000000, 0x3fd9566d, 0x41ccd80a, 0x3e70e8d2, - 0x00000000, 0x3fd98cd5, 0xc619e6c8, 0x3e71535a, - 0x00000000, 0x3fd9c316, 0x41c36cd2, 0x3e773160, - 0x00000000, 0x3fd9f930, 0x00637d8e, 0x3e7985a0, - 0x00000000, 0x3fda2f23, 0x858c0a68, 0x3e6f2f29, - 0x00000000, 0x3fda64ee, 0x7f96d909, 0x3e887984, - 0x00000000, 0x3fda9a92, 0x19e12e42, 0x3e8ab3d3, - 0x00000000, 0x3fdad00f, 0x62dfc4c2, 0x3e750881, - 0x00000000, 0x3fdb0564, 0xa1cd9d8c, 0x3e605749, - 0x00000000, 0x3fdb3a91, 0x6c6b8618, 0x3e5da65c, - 0x00000000, 0x3fdb6f96, 0x7df1ad64, 0x3e6739bf, - 0x00000000, 0x3fdba473, 0x52aa3340, 0x3e6bc312, - 0x00000000, 0x3fdbd928, 0x91ad3aa8, 0x3e5e5281, - 0x00000000, 0x3fdc0db4, 0x3df19f18, 0x3e8929d9, - 0x00000000, 0x3fdc4219, 0xb693a080, 0x3e5ff11e, - 0x00000000, 0x3fdc7655, 0xf145a3a0, 0x3e455ae3, - 0x00000000, 0x3fdcaa68, 0xc6c0ca82, 0x3e7cbcd8, - 0x00000000, 0x3fdcde53, 0xd425d304, 0x3e70cb04, - 0x00000000, 0x3fdd1215, 0xab5be678, 0x3e79adfc, - 0x00000000, 
0x3fdd45ae, 0xc5662508, 0x3e893d90, - 0x00000000, 0x3fdd791f, 0xbd35ff40, 0x3e768489, - 0x00000000, 0x3fddac67, 0x3da2b7e0, 0x3e3586ed, - 0x00000000, 0x3fdddf85, 0x2e850eee, 0x3e87604d, - 0x00000000, 0x3fde127b, 0x2bfb53d8, 0x3e7ac1d1, - 0x00000000, 0x3fde4548, 0x68274740, 0x3e39b3d4, - 0x00000000, 0x3fde77eb, 0x8d10e53c, 0x3e7fc5d6, - 0x00000000, 0x3fdeaa65, 0x1884becb, 0x3e88f9e5, - 0x00000000, 0x3fdedcb6, 0x869c06d1, 0x3e8a87f0, - 0x00000000, 0x3fdf0ede, 0x79f685fa, 0x3e831e72, - 0x00000000, 0x3fdf40dd, 0x2f9719b0, 0x3e46a828, - 0x00000000, 0x3fdf72b2, 0x4a8a44e0, 0x3e60d272, - 0x00000000, 0x3fdfa45d, 0x4b11ad4e, 0x3e8a6052, - 0x00000000, 0x3fdfd5e0, 0x832750f0, 0x3e575fdf, - 0x00000000, 0x3fe0039c, 0x02e4cd36, 0x3e8cf069, - 0x00000000, 0x3fe01c34, 0x2d4f6d10, 0x3e6e8242, - 0x00000000, 0x3fe034b7, 0x1063e6c0, 0x3e524a09, - 0x00000000, 0x3fe04d25, 0x72dc6f38, 0x3e78a1a1, - 0x00000000, 0x3fe0657e, 0x19f8a92d, 0x3e929b66, - 0x00000000, 0x3fe07dc3, 0x9c1b70c8, 0x3e79274d, - 0x00000000, 0x3fe095f3, 0x1fbb7930, 0x3e50c34b, - 0x00000000, 0x3fe0ae0e, 0x6c20eb50, 0x3e663986, - 0x00000000, 0x3fe0c614, 0xf6832e9e, 0x3e86d6d0, - 0x00000000, 0x3fe0de05, 0xef99f25e, 0x3e9af54d, - 0x00000000, 0x3fe0f5e2, 0x52a00262, 0x3e916cfc, - 0x00000000, 0x3fe10daa, 0x83569c32, 0x3e8dcc1e, - 0x00000000, 0x3fe1255d, 0x551ed425, 0x3e937f7a, - 0x00000000, 0x3fe13cfb, 0xadc98887, 0x3e9f6360, - 0x00000000, 0x3fe15485, 0x8d35a2c1, 0x3e92c6ec, - 0x00000000, 0x3fe16bfa, 0xf84cb036, 0x3e8bd44d, - 0x00000000, 0x3fe1835a, 0x826e310e, 0x3e9117cf, - 0x00000000, 0x3fe19aa5, 0xf332cfc9, 0x3e9ca533, - 0x00000000, 0x3fe1b1dc, 0x509dbc2e, 0x3e90f208, - 0x00000000, 0x3fe1c8fe, 0x93c945de, 0x3e8cd07d, - 0x00000000, 0x3fe1e00b, 0xd67e6d72, 0x3e957bdf, - 0x00000000, 0x3fe1f704, 0xc516c658, 0x3e7aab89, - 0x00000000, 0x3fe20de8, 0xb1a1b8a0, 0x3e63e823, - 0x00000000, 0x3fe224b7, 0x4a9d6d3c, 0x3e830746, - 0x00000000, 0x3fe23b71, 0xcd438843, 0x3e9c5993, - 0x00000000, 0x3fe25217, 0xa02ab554, 0x3e9ba2fc, - 0x00000000, 
0x3fe268a9, 0x6983a268, 0x3e801a5b, - 0x00000000, 0x3fe27f26, 0xb350efc8, 0x3e6273d1, - 0x00000000, 0x3fe2958e, 0x8c37b0c6, 0x3e864c23, - 0x00000000, 0x3fe2abe2, 0x7370a300, 0x3e6aded0, - 0x00000000, 0x3fe2c221, 0x197eb47e, 0x3e878091, - 0x00000000, 0x3fe2d84c, 0x45e0dabc, 0x3e74b0f2, - 0x00000000, 0x3fe2ee62, 0x794e2eaf, 0x3e9080d9, - 0x00000000, 0x3fe30464, 0x42b60c76, 0x3e8d4ec2, - 0x00000000, 0x3fe31a52, 0xf940caa0, 0x3e4221d2, - 0x00000000, 0x3fe3302b, 0x2b2bba5c, 0x3e7cdbc4, - 0x00000000, 0x3fe345f0, 0xbb440840, 0x3e6cce37, - 0x00000000, 0x3fe35ba0, 0x99cf1dd0, 0x3e96c1d9, - 0x00000000, 0x3fe3713d, 0x07eb0870, 0x3e5bed8a, - 0x00000000, 0x3fe386c5, 0x8f490e3c, 0x3e769ed8, - 0x00000000, 0x3fe39c39, 0x19b73ef0, 0x3e6cd417, - 0x00000000, 0x3fe3b198, 0xc95b41b7, 0x3e9cbc4a, - 0x00000000, 0x3fe3c6e4, 0xb890f5d7, 0x3e9238f1, - 0x00000000, 0x3fe3dc1c, 0x82259cc4, 0x3e750c42, - 0x00000000, 0x3fe3f13f, 0xde87b3e2, 0x3e9713d2, - 0x00000000, 0x3fe4064f, 0xd2255276, 0x3e81d5a7, - 0x00000000, 0x3fe41b4a, 0x48227ac1, 0x3e9c0dfd, - 0x00000000, 0x3fe43032, 0xdab76753, 0x3e91c964, - 0x00000000, 0x3fe44506, 0xd5704496, 0x3e86de56, - 0x00000000, 0x3fe459c6, 0x1fd19968, 0x3e84aeb7, - 0x00000000, 0x3fe46e72, 0xc57b1918, 0x3e8fbf91, - 0x00000000, 0x3fe4830a, 0x7fbe5d9a, 0x3e9d6bef, - 0x00000000, 0x3fe4978f, 0xdc249066, 0x3e9464d3, - 0x00000000, 0x3fe4ac00, 0xec4d9073, 0x3e9638e2, - 0x00000000, 0x3fe4c05e, 0x7247ea7c, 0x3e716f4a, - 0x00000000, 0x3fe4d4a8, 0x40f1d440, 0x3e31a0a7, - 0x00000000, 0x3fe4e8de, 0x0114a33c, 0x3e86edbb, - 0x00000000, 0x3fe4fd01, 0xbf1d513c, 0x3e7dbee8, - 0x00000000, 0x3fe51110, 0xb0248f73, 0x3e95b8bd, - 0x00000000, 0x3fe5250c, 0x3f5eac64, 0x3e97de3d, - 0x00000000, 0x3fe538f5, 0x87ae448a, 0x3e8ee241, - 0x00000000, 0x3fe54cca, 0x91ec5192, 0x3e9e06c5, - 0x00000000, 0x3fe5608d, 0x1a332738, 0x3e74e386, - 0x00000000, 0x3fe5743c, 0xdcc2bfe4, 0x3e7a9599, - 0x00000000, 0x3fe587d8, 0xbad43468, 0x3e6f732f, - 0x00000000, 0x3fe59b60, 0x73b727d9, 0x3e9eb9f5, - 0x00000000, 
0x3fe5aed6, 0xa2eb9897, 0x3e98b212, - 0x00000000, 0x3fe5c239, 0x4c167215, 0x3e938488, - 0x00000000, 0x3fe5d589, 0x63020051, 0x3e90e2d3, - 0x00000000, 0x3fe5e8c6, 0x79fbd022, 0x3e928208, - 0x00000000, 0x3fe5fbf0, 0x893e4b30, 0x3e9a1ab9, - 0x00000000, 0x3fe60f08, 0x17a24478, 0x3e82d1b8, - 0x00000000, 0x3fe6220d, 0x8ded4878, 0x3e615d7b, - 0x00000000, 0x3fe634ff, 0x9db3a5e4, 0x3e78968f, - 0x00000000, 0x3fe647de, 0x71fe135f, 0x3e971c41, - 0x00000000, 0x3fe65aab, 0x605d0d8c, 0x3e96d80f, - 0x00000000, 0x3fe66d66, 0x43691590, 0x3e7c91f0, - 0x00000000, 0x3fe6800e, 0x15fce2b2, 0x3e839f8a, - 0x00000000, 0x3fe692a4, 0xa9d94b80, 0x3e455bed, - 0x00000000, 0x3fe6a527, 0x5d60949a, 0x3e8b12c1, - 0x00000000, 0x3fe6b798, 0xb312bfe3, 0x3e924167, - 0x00000000, 0x3fe6c9f7, 0x33070277, 0x3e90ab86, - 0x00000000, 0x3fe6dc44, 0xebbc80ee, 0x3e854554, - 0x00000000, 0x3fe6ee7f, 0xef5a4bb8, 0x3e60204a, - 0x00000000, 0x3fe700a7, 0xc679cf2c, 0x3e98af08, - 0x00000000, 0x3fe712be, 0x330ae6c8, 0x3e90852a, - 0x00000000, 0x3fe724c3, 0x9ec32916, 0x3e86d3eb, - 0x00000000, 0x3fe736b6, 0x7fcbbafe, 0x3e8685cb, - 0x00000000, 0x3fe74897, 0xc1e0bd95, 0x3e91f751, - 0x00000000, 0x3fe75a67, 0xb0f72560, 0x3e5705b1, - 0x00000000, 0x3fe76c24, 0xd808ca92, 0x3e9b98d8, - 0x00000000, 0x3fe77dd1, 0xc75cc980, 0x3e62ea22, - 0x00000000, 0x3fe78f6b, 0x2bca0350, 0x3e97aba6, - 0x00000000, 0x3fe7a0f4, 0x3442278c, 0x3e9d7383, - 0x00000000, 0x3fe7b26c, 0x1fb18bf9, 0x3e95a5ca, - 0x00000000, 0x3fe7c3d3, 0x2b6ecf28, 0x3e61a609, - 0x00000000, 0x3fe7d528, 0x49aac104, 0x3e744fd0, - 0x00000000, 0x3fe7e66c, 0xd8df5180, 0x3e2c114f, - 0x00000000, 0x3fe7f79e, 0x130feae5, 0x3e95972f, - 0x00000000, 0x3fe808c0, 0xa55fe198, 0x3e7ca034, - 0x00000000, 0x3fe819d0, 0x49990227, 0x3e96e2b1, - 0x00000000, 0x3fe82ad0, 0x0294592c, 0x3e7b0000, - 0x00000000, 0x3fe83bbe, 0xc442620e, 0x3e98b9bd, - 0x00000000, 0x3fe84c9c, 0xfabf3e4e, 0x3e8d94fd, - 0x00000000, 0x3fe85d69, 0xb145ad9a, 0x3e85db30, - 0x00000000, 0x3fe86e25, 0xb95022b0, 0x3e8e3e1e, - 0x00000000, 
0x3fe87ed0, 0x45442bd6, 0x3e9d5b8b, - 0x00000000, 0x3fe88f6b, 0x231ecd2e, 0x3e97a046, - 0x00000000, 0x3fe89ff5, 0x3ef55232, 0x3e9feafe, - 0x00000000, 0x3fe8b06f, 0xbfd78267, 0x3e9839e7, - 0x00000000, 0x3fe8c0d9, 0x9d6fa900, 0x3e645cf4, - 0x00000000, 0x3fe8d132, 0x2b27f380, 0x3e4be313, - 0x00000000, 0x3fe8e17a, 0x0bb84f9f, 0x3e953398, - 0x00000000, 0x3fe8f1b3, 0xce3ba390, 0x3e5889e2, - 0x00000000, 0x3fe901db, 0xc3ad0cc8, 0x3e7f7778, - 0x00000000, 0x3fe911f3, 0xcec4eba2, 0x3e846660, - 0x00000000, 0x3fe921fb, 0x4611a626, 0x3e85110b, - 0x00000000, 0x3ff00000, 0x00000000, 0x00000000, - 0x80000000, 0x3ff00553, 0xc81e4294, 0x3e6e6a24, - 0x90000000, 0x3ff00aa3, 0x11e3a785, 0x3e585485, - 0x10000000, 0x3ff00ff0, 0x36ec07f6, 0x3e64eb93, - 0x20000000, 0x3ff01539, 0xb8b750e1, 0x3e40ea64, - 0xb0000000, 0x3ff01a7e, 0xcff8a53c, 0x3e461637, - 0xd0000000, 0x3ff01fc0, 0xf7bd1943, 0x3e40733b, - 0x80000000, 0x3ff024ff, 0x1345cced, 0x3e566691, - 0xd0000000, 0x3ff02a3a, 0x3f592f14, 0x3e477b7a, - 0xb0000000, 0x3ff02f72, 0xdd1a5402, 0x3e6f18d3, - 0x50000000, 0x3ff034a7, 0xa58ee9a4, 0x3e2be2f5, - 0x80000000, 0x3ff039d8, 0x8f085fa7, 0x3e68901f, - 0x70000000, 0x3ff03f06, 0xcd5b5d69, 0x3e5c68b8, - 0x10000000, 0x3ff04431, 0x8624be42, 0x3e5a6b0e, - 0x70000000, 0x3ff04958, 0xb06f68e7, 0x3dbc4b22, - 0x80000000, 0x3ff04e7c, 0xafcabe9b, 0x3e60f3f0, - 0x60000000, 0x3ff0539d, 0xbca4e1b7, 0x3e548495, - 0x00000000, 0x3ff058bb, 0x1abdfdc3, 0x3e66107f, - 0x70000000, 0x3ff05dd5, 0x1878288a, 0x3e6e6726, - 0xc0000000, 0x3ff062ec, 0x55286f1e, 0x3e5a6bc1, - 0xe0000000, 0x3ff06800, 0xc64a85f2, 0x3e58a759, - 0xe0000000, 0x3ff06d11, 0x0a4a8d09, 0x3e45fce7, - 0xc0000000, 0x3ff0721f, 0xf373fe1d, 0x3e32f9cb, - 0x80000000, 0x3ff0772a, 0xce4ac359, 0x3e590564, - 0x30000000, 0x3ff07c32, 0xe761b02f, 0x3e5ac29c, - 0xd0000000, 0x3ff08136, 0xf497381c, 0x3e5cb752, - 0x60000000, 0x3ff08638, 0x1cfb35e0, 0x3e68bb9e, - 0xf0000000, 0x3ff08b36, 0x7099de90, 0x3e65b491, - 0x80000000, 0x3ff09032, 0xc9c65ef2, 0x3e5cc77a, - 0x10000000, 
0x3ff0952b, 0xe7be3dba, 0x3e57a0f3, - 0xa0000000, 0x3ff09a20, 0x1ee0c16f, 0x3e66ec85, - 0x40000000, 0x3ff09f13, 0xbf2946da, 0x3e689449, - 0xf0000000, 0x3ff0a402, 0x301ba223, 0x3e698f25, - 0xc0000000, 0x3ff0a8ef, 0xc651f549, 0x3e347d5e, - 0x90000000, 0x3ff0add9, 0x9a86007a, 0x3e6c33ec, - 0x90000000, 0x3ff0b2c0, 0x53e92649, 0x3e5e0b66, - 0xb0000000, 0x3ff0b7a4, 0xc09d755f, 0x3e3bd64a, - 0xf0000000, 0x3ff0bc85, 0x06f78167, 0x3e2f5375, - 0x50000000, 0x3ff0c164, 0xd1b3735e, 0x3e62c382, - 0xe0000000, 0x3ff0c63f, 0x659f99e1, 0x3e6e20ed, - 0xb0000000, 0x3ff0cb18, 0x3a9c182a, 0x3e586b63, - 0xb0000000, 0x3ff0cfee, 0x5a65e777, 0x3e445cfd, - 0xe0000000, 0x3ff0d4c1, 0x0f58bca4, 0x3e60c877, - 0x50000000, 0x3ff0d992, 0x4b0933c5, 0x3e6739e4, - 0x10000000, 0x3ff0de60, 0xd9ce7bd8, 0x3e027dc3, - 0x00000000, 0x3ff0e32b, 0x7c5a7b64, 0x3e63c53c, - 0x40000000, 0x3ff0e7f3, 0x83830cec, 0x3e696696, - 0xd0000000, 0x3ff0ecb8, 0xc39bdcc4, 0x3e68d772, - 0xb0000000, 0x3ff0f17b, 0x8bcf6d7b, 0x3e69b000, - 0xf0000000, 0x3ff0f63b, 0x5825ce4f, 0x3e3bbb30, - 0x70000000, 0x3ff0faf9, 0xaf13a406, 0x3e6da3f4, - 0x60000000, 0x3ff0ffb4, 0x6f74ce86, 0x3e5f36b9, - 0xb0000000, 0x3ff1046c, 0x2303f790, 0x3e165c00, - 0x50000000, 0x3ff10922, 0x095ba7d5, 0x3e682f84, - 0x60000000, 0x3ff10dd5, 0x3541b2c6, 0x3e6d4643, - 0xe0000000, 0x3ff11285, 0x56e93a89, 0x3e671c3d, - 0xd0000000, 0x3ff11733, 0xf4e40012, 0x3e598dce, - 0x30000000, 0x3ff11bdf, 0xef17fe03, 0x3e4530eb, - 0x00000000, 0x3ff12088, 0xa3715066, 0x3e4e8b8f, - 0x40000000, 0x3ff1252e, 0xb3b211dc, 0x3e6ab26e, - 0x10000000, 0x3ff129d2, 0xdc906307, 0x3e454dd4, - 0x50000000, 0x3ff12e73, 0x2387984e, 0x3e5c9f96, - 0x10000000, 0x3ff13312, 0x59afec09, 0x3e6c62a9, - 0x60000000, 0x3ff137ae, 0xac6a866a, 0x3e6638d9, - 0x40000000, 0x3ff13c48, 0xeca8a22d, 0x3e338704, - 0xa0000000, 0x3ff140df, 0x1db14f8f, 0x3e4e6c9e, - 0x90000000, 0x3ff14574, 0x7f9c9eaa, 0x3e58744b, - 0x10000000, 0x3ff14a07, 0x3486373b, 0x3e66c289, - 0x30000000, 0x3ff14e97, 0xe31699b7, 0x3e5b36bc, - 0xe0000000, 
0x3ff15324, 0x13d200c7, 0x3e671e38, - 0x30000000, 0x3ff157b0, 0xab40aa88, 0x3e699755, - 0x20000000, 0x3ff15c39, 0x0e4bcfc0, 0x3e6b45ca, - 0xc0000000, 0x3ff160bf, 0x0d869c5d, 0x3e32dd09, - 0xf0000000, 0x3ff16543, 0x16b917da, 0x3e64fe05, - 0xd0000000, 0x3ff169c5, 0x226317a2, 0x3e694563, - 0x60000000, 0x3ff16e45, 0xafc2c851, 0x3e653d8f, - 0xa0000000, 0x3ff172c2, 0x1fbd41a3, 0x3e5dcbd4, - 0x90000000, 0x3ff1773d, 0x5285f59c, 0x3e5862ff, - 0x30000000, 0x3ff17bb6, 0xa97a1e1c, 0x3e63072e, - 0x90000000, 0x3ff1802c, 0x75184805, 0x3e528390, - 0xa0000000, 0x3ff184a0, 0x3e9eff42, 0x3e64b032, - 0x70000000, 0x3ff18912, 0x93c45484, 0x3e6b1588, - 0x10000000, 0x3ff18d82, 0x0fc35826, 0x3e3149ef, - 0x60000000, 0x3ff191ef, 0xea96acaa, 0x3e5f2e77, - 0x80000000, 0x3ff1965a, 0x4c471a95, 0x3e520007, - 0x60000000, 0x3ff19ac3, 0x517f6f04, 0x3e63f8cc, - 0x10000000, 0x3ff19f2a, 0xe311bb55, 0x3e660ba2, - 0x90000000, 0x3ff1a38e, 0x730bbec3, 0x3e64b788, - 0xe0000000, 0x3ff1a7f0, 0x795ee20c, 0x3e657090, - 0x00000000, 0x3ff1ac51, 0x983670b1, 0x3e6d9ffe, - 0x00000000, 0x3ff1b0af, 0xff61bfda, 0x3e62a463, - 0xd0000000, 0x3ff1b50a, 0x6a5e65cf, 0x3e69d1bc, - 0x80000000, 0x3ff1b964, 0xbaa9e922, 0x3e68718a, - 0x10000000, 0x3ff1bdbc, 0x2ffa342e, 0x3e63c2f5, - 0x80000000, 0x3ff1c211, 0x3ff42c80, 0x3e60fae1, - 0xd0000000, 0x3ff1c664, 0x0ef00d57, 0x3e65440f, - 0x10000000, 0x3ff1cab6, 0x2d4e3c1e, 0x3e46fcd2, - 0x30000000, 0x3ff1cf05, 0xb409e863, 0x3e4e0c60, - 0x30000000, 0x3ff1d352, 0x5a5f0333, 0x3e6f9cab, - 0x30000000, 0x3ff1d79d, 0x744c333d, 0x3e630f24, - 0x20000000, 0x3ff1dbe6, 0x2a76b2fe, 0x3e4b5062, - 0xf0000000, 0x3ff1e02c, 0xba595375, 0x3e6fdb94, - 0xd0000000, 0x3ff1e471, 0xb945a171, 0x3e3861b9, - 0x90000000, 0x3ff1e8b4, 0x015188c4, 0x3e654348, - 0x50000000, 0x3ff1ecf5, 0x49865523, 0x3e6b54d1, - 0x10000000, 0x3ff1f134, 0x83d9de33, 0x3e6a0bb7, - 0xd0000000, 0x3ff1f570, 0x2b1a2157, 0x3e6629d1, - 0x90000000, 0x3ff1f9ab, 0x35d179df, 0x3e6467fe, - 0x50000000, 0x3ff1fde4, 0x3e26c8f7, 0x3e69763f, - 0x20000000, 
0x3ff2021b, 0xbb9f7679, 0x3e53f798, - 0xf0000000, 0x3ff2064f, 0x7e855898, 0x3e552e57, - 0xc0000000, 0x3ff20a82, 0xe5502c3a, 0x3e6fde47, - 0xb0000000, 0x3ff20eb3, 0x548d96a0, 0x3e5cbd0b, - 0xa0000000, 0x3ff212e2, 0xf7be8de8, 0x3e6a9cd9, - 0xb0000000, 0x3ff2170f, 0x704886de, 0x3e522bbe, - 0xc0000000, 0x3ff21b3a, 0x8317f020, 0x3e6e3dea, - 0xf0000000, 0x3ff21f63, 0x85ac8855, 0x3e6e8120, - 0x40000000, 0x3ff2238b, 0x4f24cb07, 0x3e5c8714, - 0xa0000000, 0x3ff227b0, 0xee311fa2, 0x3e61e128, - 0x20000000, 0x3ff22bd4, 0x3d61a2d3, 0x3e5b5c16, - 0xc0000000, 0x3ff22ff5, 0x7fb90633, 0x3e47d97e, - 0x70000000, 0x3ff23415, 0x9d50f6a7, 0x3e6efe89, - 0x50000000, 0x3ff23833, 0xeb75de5a, 0x3e6d0333, - 0x60000000, 0x3ff23c4f, 0xbe73a573, 0x3e40e590, - 0x80000000, 0x3ff24069, 0xcac3cdd2, 0x3e68ce8d, - 0xd0000000, 0x3ff24481, 0x8954064b, 0x3e6ee8a4, - 0x50000000, 0x3ff24898, 0x18461e09, 0x3e6aa62f, - 0x00000000, 0x3ff24cad, 0x40986a15, 0x3e601e59, - 0xe0000000, 0x3ff250bf, 0x4f9b8d4c, 0x3e3b082f, - 0xe0000000, 0x3ff254d0, 0xe5527f5a, 0x3e6876e0, - 0x20000000, 0x3ff258e0, 0x80831e6b, 0x3e636170, - 0x90000000, 0x3ff25ced, 0xe34aa4a2, 0x3e681b26, - 0x40000000, 0x3ff260f9, 0x6dfab0c1, 0x3e552ee6, - 0x20000000, 0x3ff26503, 0x329e8819, 0x3e5d85a5, - 0x40000000, 0x3ff2690b, 0xb646b5d1, 0x3e5105c1, - 0x90000000, 0x3ff26d11, 0x0c1a379c, 0x3e6bb669, - 0x30000000, 0x3ff27116, 0xa73ce3a9, 0x3e586aeb, - 0x00000000, 0x3ff27519, 0x98294dd4, 0x3e6dd161, - 0x20000000, 0x3ff2791a, 0x75775e83, 0x3e6454e6, - 0x80000000, 0x3ff27d19, 0x026197ea, 0x3e63842e, - 0x20000000, 0x3ff28117, 0xe70c44d2, 0x3e6f1ce0, - 0x10000000, 0x3ff28513, 0x441a5627, 0x3e6ad636, - 0x50000000, 0x3ff2890d, 0xd7212abb, 0x3e54c205, - 0xd0000000, 0x3ff28d05, 0x6c116419, 0x3e6167c8, - 0xa0000000, 0x3ff290fc, 0xef16e294, 0x3e638ec3, - 0xc0000000, 0x3ff294f1, 0xeace9321, 0x3e6473fc, - 0x30000000, 0x3ff298e5, 0xa836dba7, 0x3e67af53, - 0x00000000, 0x3ff29cd7, 0xc383b652, 0x3e1a51f3, - 0x10000000, 0x3ff2a0c7, 0xa190822d, 0x3e63696d, - 0x80000000, 
0x3ff2a4b5, 0xec77074b, 0x3e62f9ad, - 0x50000000, 0x3ff2a8a2, 0xd5bee55f, 0x3e38190f, - 0x70000000, 0x3ff2ac8d, 0xfac68e55, 0x3e4bfee8, - 0xf0000000, 0x3ff2b076, 0x6bc5f68a, 0x3e331c9d, - 0xc0000000, 0x3ff2b45e, 0x23737edf, 0x3e689d05, - 0x00000000, 0x3ff2b845, 0x43bf47bb, 0x3e5a2959, - 0xa0000000, 0x3ff2bc29, 0x2e5b3207, 0x3e396be3, - 0x90000000, 0x3ff2c00c, 0xd909fa0e, 0x3e6e44c7, - 0x00000000, 0x3ff2c3ee, 0xda94d9ea, 0x3e2b2505, - 0xc0000000, 0x3ff2c7cd, 0xf46c9c98, 0x3e60c851, - 0xf0000000, 0x3ff2cbab, 0x7d9aa3b7, 0x3e5da71f, - 0x80000000, 0x3ff2cf88, 0x5d019ef1, 0x3e6f1b60, - 0x90000000, 0x3ff2d363, 0xa2189563, 0x3e4386e8, - 0x00000000, 0x3ff2d73d, 0x5d306ba7, 0x3e3b19fa, - 0xd0000000, 0x3ff2db14, 0xb67aef76, 0x3e6dd749, - 0x20000000, 0x3ff2deeb, 0xf1dc04b0, 0x3e676ff6, - 0xe0000000, 0x3ff2e2bf, 0xd0b232a6, 0x3e635a33, - 0x10000000, 0x3ff2e693, 0x0024a4e1, 0x3e64bdc8, - 0xb0000000, 0x3ff2ea64, 0x770fd723, 0x3e6ebd61, - 0xd0000000, 0x3ff2ee34, 0xc537264d, 0x3e64769f, - 0x60000000, 0x3ff2f203, 0x429f3b98, 0x3e69021f, - 0x70000000, 0x3ff2f5d0, 0x3efbd606, 0x3e5ee708, - 0xf0000000, 0x3ff2f99b, 0x552a6b1a, 0x3e6ad985, - 0xf0000000, 0x3ff2fd65, 0x78772160, 0x3e6e3df7, - 0x70000000, 0x3ff3012e, 0x6ddc9b34, 0x3e6ca5d7, - 0x70000000, 0x3ff304f5, 0xffdbaf74, 0x3e691154, - 0xf0000000, 0x3ff308ba, 0x57fb306a, 0x3e667bdd, - 0xf0000000, 0x3ff30c7e, 0x5ac40886, 0x3e67dc25, - 0x80000000, 0x3ff31041, 0x8e8afafe, 0x3df219f3, - 0x80000000, 0x3ff31402, 0xf9669a04, 0x3e62416b, - 0x10000000, 0x3ff317c2, 0xb2b3987f, 0x3e611c96, - 0x20000000, 0x3ff31b80, 0x447e1177, 0x3e6f99ed, - 0xd0000000, 0x3ff31f3c, 0x26328a11, 0x3e132458, - 0xf0000000, 0x3ff322f7, 0xd1e645f8, 0x3e66f56d, - 0xb0000000, 0x3ff326b1, 0x46945535, 0x3e461649, - 0xf0000000, 0x3ff32a69, 0x9d190028, 0x3e5e37d5, - 0xc0000000, 0x3ff32e20, 0xf12bf828, 0x3e668671, - 0x20000000, 0x3ff331d6, 0xca6aabbd, 0x3e6e8ecb, - 0x20000000, 0x3ff3358a, 0x109a5912, 0x3e53f49e, - 0xa0000000, 0x3ff3393c, 0x11ec3043, 0x3e6b8a0e, - 0xc0000000, 
0x3ff33ced, 0x0aed691a, 0x3e65fae0, - 0x70000000, 0x3ff3409d, 0xbece3e4a, 0x3e6c0569, - 0xc0000000, 0x3ff3444b, 0x744efbfe, 0x3e605e26, - 0xa0000000, 0x3ff347f8, 0xa94be5c5, 0x3e65b570, - 0x20000000, 0x3ff34ba4, 0x6ea0e063, 0x3e5d6f15, - 0x30000000, 0x3ff34f4e, 0x612fc484, 0x3e6e0ca7, - 0xf0000000, 0x3ff352f6, 0x27b25258, 0x3e4963c9, - 0x40000000, 0x3ff3569e, 0xaa725a5c, 0x3e547930, - 0x30000000, 0x3ff35a44, 0xe3af43b3, 0x3e58a79f, - 0xc0000000, 0x3ff35de8, 0x9c41bdaf, 0x3e5e6dc2, - 0xf0000000, 0x3ff3618b, 0x76f863a5, 0x3e657a2e, - 0xd0000000, 0x3ff3652d, 0x1716354d, 0x3e2ae3b6, - 0x40000000, 0x3ff368ce, 0xdf6906b1, 0x3e665fb5, - 0x60000000, 0x3ff36c6d, 0x7f588f7b, 0x3e66177d, - 0x30000000, 0x3ff3700b, 0xbd091b67, 0x3e3ad55a, - 0xa0000000, 0x3ff373a7, 0xb2422d76, 0x3e155337, - 0xb0000000, 0x3ff37742, 0xe86972d5, 0x3e6084eb, - 0x70000000, 0x3ff37adc, 0x808e1ea3, 0x3e656395, - 0xe0000000, 0x3ff37e74, 0x1b40fba7, 0x3e61bce2, - 0x00000000, 0x3ff3820c, 0x4605b515, 0x3e5006f9, - 0xc0000000, 0x3ff385a1, 0xaceb1f7d, 0x3e6aa676, - 0x40000000, 0x3ff38936, 0x76554ce6, 0x3e58229f, - 0x60000000, 0x3ff38cc9, 0x6cf57330, 0x3e6eabfc, - 0x40000000, 0x3ff3905b, 0x9c0ce8bc, 0x3e64daed, - 0xd0000000, 0x3ff393eb, 0x68237141, 0x3e60ff17, - 0x10000000, 0x3ff3977b, 0x3051b085, 0x3e6575f8, - 0x10000000, 0x3ff39b09, 0xeb523e29, 0x3e42667d, - 0xc0000000, 0x3ff39e95, 0x6954f4fd, 0x3e181699, - 0x20000000, 0x3ff3a221, 0xcf4d9cd4, 0x3e587cfc, - 0x40000000, 0x3ff3a5ab, 0x18198353, 0x3e52c5d0, - 0x10000000, 0x3ff3a934, 0x8dcc34aa, 0x3e6a7a89, - 0xb0000000, 0x3ff3acbb, 0xdadc36d1, 0x3e2cead6, - 0x00000000, 0x3ff3b042, 0x9c498bdf, 0x3e2a5575, - 0x00000000, 0x3ff3b3c7, 0x9ef6de04, 0x3e6c414a, - 0xd0000000, 0x3ff3b74a, 0x8a6e58fa, 0x3e63e210, - 0x60000000, 0x3ff3bacd, 0x7643d77c, 0x3e5587fd, - 0xb0000000, 0x3ff3be4e, 0x1d3ff3df, 0x3e3901eb, - 0xb0000000, 0x3ff3c1ce, 0x7c812fc6, 0x3e6f2ccd, - 0x90000000, 0x3ff3c54d, 0x70a01049, 0x3e21c8ee, - 0x20000000, 0x3ff3c8cb, 0x02831eec, 0x3e563e8d, - 0x70000000, 
0x3ff3cc47, 0x2a92c7ff, 0x3e6f61a4, - 0xa0000000, 0x3ff3cfc2, 0x99c84d24, 0x3dda9173, - 0x80000000, 0x3ff3d33c, 0xc8eec2f0, 0x3e5e9197, - 0x30000000, 0x3ff3d6b5, 0x2f5a1378, 0x3e5e6f84, - 0xb0000000, 0x3ff3da2c, 0x2a90a0fc, 0x3e2fac24, - 0xf0000000, 0x3ff3dda2, 0x26610227, 0x3e535ed7, - 0x00000000, 0x3ff3e118, 0x4804b15b, 0x3e50e0d6, - 0xe0000000, 0x3ff3e48b, 0x5daba814, 0x3e056067, - 0x80000000, 0x3ff3e7fe, 0xc8768032, 0x3e637388, - 0x00000000, 0x3ff3eb70, 0x9f9e01f5, 0x3e3ee3c8, - 0x40000000, 0x3ff3eee0, 0x0d09747c, 0x3e639f6f, - 0x60000000, 0x3ff3f24f, 0x27abb8f0, 0x3e4322c3, - 0x40000000, 0x3ff3f5bd, 0x47c8ac80, 0x3e6961b3, - 0x00000000, 0x3ff3f92a, 0xbbd0f118, 0x3e63711f, - 0x90000000, 0x3ff3fc95, 0xd7718ffb, 0x3e64fad8, - 0xf0000000, 0x3ff3ffff, 0xffffffff, 0x3e6fffff, - 0x30000000, 0x3ff40369, 0x79ec35b4, 0x3e667efa, - 0x40000000, 0x3ff406d1, 0x87a254a8, 0x3e6a7376, - 0x30000000, 0x3ff40a38, 0xf87d924d, 0x3e5bace0, - 0xf0000000, 0x3ff40d9d, 0xc237e392, 0x3e629e37, - 0x90000000, 0x3ff41102, 0xac3f3012, 0x3e557ce7, - 0x00000000, 0x3ff41466, 0x359f8fbd, 0x3e682829, - 0x50000000, 0x3ff417c8, 0x42d14676, 0x3e6cc9be, - 0x80000000, 0x3ff41b29, 0x1c137d0b, 0x3e6a8f00, - 0x90000000, 0x3ff41e89, 0x687dda05, 0x3e636127, - 0x80000000, 0x3ff421e8, 0x322646f0, 0x3e524dba, - 0x40000000, 0x3ff42546, 0x1ed210b4, 0x3e6dc43f, - 0xf0000000, 0x3ff428a2, 0x15c447bb, 0x3e631ae5, - 0xf0000000, 0x3fe428a2, 0x15c447bb, 0x3e531ae5, - 0xa0000000, 0x3fe965fe, 0xf20ac166, 0x3e44f5b8, - 0x00000000, 0x3ff00000, 0x00000000, 0x00000000, - 0xf0000000, 0x3ff428a2, 0x15c447bb, 0x3e631ae5, - 0xa0000000, 0x3ff965fe, 0xf20ac166, 0x3e54f5b8, - 0x00000000, 0x3ff00000, 0x00000000, 0x00000000, - 0x30000000, 0x3ff02c9a, 0xc1dcdef9, 0x3e6cef00, - 0xd0000000, 0x3ff059b0, 0xa1d73e2a, 0x3e48ac2b, - 0x10000000, 0x3ff08745, 0x901186be, 0x3e60eb37, - 0x60000000, 0x3ff0b558, 0x1ec53172, 0x3e69f312, - 0x30000000, 0x3ff0e3ec, 0x10103a17, 0x3e469e8d, - 0xd0000000, 0x3ff11301, 0xa4ebbf1a, 0x3df25b50, - 0xa0000000, 
0x3ff1429a, 0xbf668203, 0x3e6d525b, - 0x30000000, 0x3ff172b8, 0xf5b9bef9, 0x3e68faa2, - 0xe0000000, 0x3ff1a35b, 0xea796d31, 0x3e66df96, - 0x30000000, 0x3ff1d487, 0xa7805b80, 0x3e368b9a, - 0x80000000, 0x3ff2063b, 0xac771dd6, 0x3e60c519, - 0x60000000, 0x3ff2387a, 0x70cd83f5, 0x3e6ceac4, - 0x60000000, 0x3ff26b45, 0x7495e99c, 0x3e5789f3, - 0xf0000000, 0x3ff29e9d, 0x84b09745, 0x3e547f7b, - 0xa0000000, 0x3ff2d285, 0x2d002475, 0x3e5b900c, - 0x00000000, 0x3ff306fe, 0x2a5bd1ab, 0x3e64636e, - 0xb0000000, 0x3ff33c08, 0xfa64e430, 0x3e4320b7, - 0x30000000, 0x3ff371a7, 0x2a9c5154, 0x3e5ceaa7, - 0x30000000, 0x3ff3a7db, 0xdba86f24, 0x3e53967f, - 0x40000000, 0x3ff3dea6, 0x446b6824, 0x3e682468, - 0x20000000, 0x3ff4160a, 0x9f84325b, 0x3e3f72e2, - 0x60000000, 0x3ff44e08, 0x40c4dbd0, 0x3e18624b, - 0xb0000000, 0x3ff486a2, 0x404f068e, 0x3e5704f3, - 0xd0000000, 0x3ff4bfda, 0x9c750e5e, 0x3e54d8a8, - 0x70000000, 0x3ff4f9b2, 0x9ab4cf62, 0x3e5a74b2, - 0x50000000, 0x3ff5342b, 0x077c2a0f, 0x3e5a753e, - 0x30000000, 0x3ff56f47, 0x699bb2c0, 0x3e5ad49f, - 0xd0000000, 0x3ff5ab07, 0x52b19260, 0x3e6a90a8, - 0x10000000, 0x3ff5e76f, 0x21ba6f93, 0x3e56b485, - 0xb0000000, 0x3ff6247e, 0x58f87d03, 0x3e0d2ac2, - 0x80000000, 0x3ff66238, 0x24893ecf, 0x3e42a911, - 0x60000000, 0x3ff6a09e, 0x32422cbe, 0x3e59fcef, - 0x30000000, 0x3ff6dfb2, 0x5de441c5, 0x3e68ca34, - 0xe0000000, 0x3ff71f75, 0xe7ba46e1, 0x3e61d8be, - 0x50000000, 0x3ff75feb, 0x22fdba6a, 0x3e59099f, - 0x70000000, 0x3ff7a114, 0x36bea881, 0x3e4f580c, - 0x30000000, 0x3ff7e2f3, 0x8841740a, 0x3e5b3d39, - 0x90000000, 0x3ff82589, 0x25159f11, 0x3e62999c, - 0x90000000, 0x3ff868d9, 0xd901c83b, 0x3e668925, - 0x40000000, 0x3ff8ace5, 0xdadd3e2a, 0x3e415506, - 0x90000000, 0x3ff8f1ae, 0x6c57304e, 0x3e622aee, - 0xb0000000, 0x3ff93737, 0x9e8a0387, 0x3e29b8bc, - 0x90000000, 0x3ff97d82, 0x9f173d24, 0x3e6fbc9c, - 0x80000000, 0x3ff9c491, 0x80e3e235, 0x3e451f84, - 0x70000000, 0x3ffa0c66, 0xc96535b5, 0x3e66bbca, - 0xb0000000, 0x3ffa5503, 0xe45a1224, 0x3e41f12a, - 0x50000000, 
0x3ffa9e6b, 0xfd0fac90, 0x3e55e7f6, - 0x90000000, 0x3ffae89f, 0x5abd0e69, 0x3e62b5a7, - 0xb0000000, 0x3ffb33a2, 0xf5ed7fa1, 0x3e609e2b, - 0xf0000000, 0x3ffb7f76, 0x37553d84, 0x3e47daf2, - 0x90000000, 0x3ffbcc1e, 0x891ee83d, 0x3e12f074, - 0xd0000000, 0x3ffc199b, 0x38444196, 0x3e6b0aa5, - 0x20000000, 0x3ffc67f1, 0x9694426f, 0x3e6cafa2, - 0xd0000000, 0x3ffcb720, 0xd22a0797, 0x3e69df20, - 0x40000000, 0x3ffd072d, 0xf71a1e45, 0x3e640f12, - 0xd0000000, 0x3ffd5818, 0x0e4bb40b, 0x3e69f749, - 0x00000000, 0x3ffda9e6, 0x2b84600d, 0x3e4ed994, - 0x30000000, 0x3ffdfc97, 0xf5cb4656, 0x3e4bdcda, - 0xe0000000, 0x3ffe502e, 0xd89cf44c, 0x3e5e2cff, - 0xa0000000, 0x3ffea4af, 0xcc2c7b9d, 0x3e452486, - 0xe0000000, 0x3ffefa1b, 0x4eee3fa4, 0x3e6cc2b4, - 0x50000000, 0x3fff5076, 0x80ce9f09, 0x3e66dc8a, - 0x80000000, 0x3fffa7c1, 0x82e90a7e, 0x3e39e90d, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x3f8fc0a8, 0x7c79f3db, 0x3e361f80, - 0x00000000, 0x3f9f8298, 0x980267c8, 0x3e6873c1, - 0x00000000, 0x3fa77458, 0x9f88c69e, 0x3e5ec65b, - 0x00000000, 0x3faf0a30, 0x54cc2f99, 0x3e58022c, - 0x00000000, 0x3fb341d7, 0x3a125330, 0x3e62c37a, - 0x00000000, 0x3fb6f0d2, 0x69737c93, 0x3e615cad, - 0x00000000, 0x3fba926d, 0xb1b285e9, 0x3e4d256a, - 0x00000000, 0x3fbe2707, 0xb97a7aa2, 0x3e5b8abc, - 0x00000000, 0x3fc0d77e, 0x9659a5dc, 0x3e6f3423, - 0x80000000, 0x3fc29552, 0x48d30177, 0x3e6e07fd, - 0x00000000, 0x3fc44d2b, 0x4799f4f6, 0x3e6b32df, - 0x00000000, 0x3fc5ff30, 0xf4f21cf8, 0x3e6c29e4, - 0x00000000, 0x3fc7ab89, 0x48df1b59, 0x3e1086c8, - 0x80000000, 0x3fc9525a, 0xb4764130, 0x3e4cf456, - 0x00000000, 0x3fcaf3c9, 0xfcb63398, 0x3e63a02f, - 0x80000000, 0x3fcc8ff7, 0x886b0976, 0x3e61e6a6, - 0x00000000, 0x3fce2707, 0xb97a7aa2, 0x3e6b8abc, - 0x00000000, 0x3fcfb918, 0x8aa35552, 0x3e6b578f, - 0xc0000000, 0x3fd0a324, 0x71afb9fc, 0x3e6139c8, - 0x80000000, 0x3fd1675c, 0x0701ce64, 0x3e65d5d3, - 0xc0000000, 0x3fd22941, 0xb2d12142, 0x3e6de7bc, - 0x80000000, 0x3fd2e8e2, 0x984e1664, 0x3e6d708e, - 0x40000000, 
0x3fd3a64c, 0xe9c72f36, 0x3e556945, - 0xc0000000, 0x3fd4618b, 0x13e85bda, 0x3e20e2f6, - 0x80000000, 0x3fd51aad, 0xb42724f6, 0x3e3cb7e0, - 0x80000000, 0x3fd5d1bd, 0xe52846c7, 0x3e6fac04, - 0x00000000, 0x3fd686c8, 0xaec442be, 0x3e5e9b14, - 0xc0000000, 0x3fd739d7, 0x034e7126, 0x3e6b5de8, - 0x00000000, 0x3fd7eaf8, 0xe1b259d3, 0x3e6dc157, - 0x80000000, 0x3fd89a33, 0x6ad69c62, 0x3e3b0509, - 0x00000000, 0x3fd94794, 0xfaba4cdd, 0x3e5c2116, - 0xc0000000, 0x3fd9f323, 0x25f95b47, 0x3e665fcc, - 0x80000000, 0x3fda9cec, 0x498d4850, 0x3e5a9a08, - 0x40000000, 0x3fdb44f7, 0xb1465f77, 0x3e6de647, - 0x80000000, 0x3fdbeb4d, 0x7bf7861d, 0x3e5da71b, - 0xc0000000, 0x3fdc8ff7, 0x86b09760, 0x3e3e6a68, - 0x40000000, 0x3fdd32fe, 0xeab0ef64, 0x3e6f0075, - 0x00000000, 0x3fddd46a, 0x82fb989b, 0x3e330712, - 0x40000000, 0x3fde7442, 0xc3f1bed2, 0x3e60eb43, - 0x40000000, 0x3fdf128f, 0xecb35c84, 0x3e5faf06, - 0x80000000, 0x3fdfaf58, 0x3db35f68, 0x3e4ef1e6, - 0xa0000000, 0x3fe02552, 0xfb1a71a5, 0x3e469743, - 0x40000000, 0x3fe0723e, 0x404e5796, 0x3e6c1cdf, - 0xe0000000, 0x3fe0be72, 0x0ada625e, 0x3e4094aa, - 0x80000000, 0x3fe109f3, 0x96fde3ec, 0x3e6e2d4c, - 0xc0000000, 0x3fe154c3, 0xe9a98f34, 0x3e62f4d5, - 0xa0000000, 0x3fe19ee6, 0x6ecc5cbe, 0x3e6467c9, - 0x40000000, 0x3fe1e85f, 0xd03dec5a, 0x3e6e7040, - 0xc0000000, 0x3fe23130, 0x4282de36, 0x3e67bebf, - 0x00000000, 0x3fe2795e, 0x1aeb783f, 0x3e6289b1, - 0xe0000000, 0x3fe2c0e9, 0x1772f538, 0x3e5a891d, - 0x20000000, 0x3fe307d7, 0xbe1fb591, 0x3e634f10, - 0x80000000, 0x3fe34e28, 0xd316eb93, 0x3e6d9ce1, - 0xc0000000, 0x3fe393e0, 0x19a9c442, 0x3e63562a, - 0x60000000, 0x3fe3d902, 0xf548084c, 0x3e54e2ad, - 0xe0000000, 0x3fe41d8f, 0x5cc8c97a, 0x3e508ce5, - 0xc0000000, 0x3fe4618b, 0x13e85bda, 0x3e30e2f6, - 0x40000000, 0x3fe4a4f8, 0xbb0227bf, 0x3e6db03e, - 0x00000000, 0x3fe4e7d8, 0xb09cb098, 0x3e61b75b, - 0x20000000, 0x3fe52a2d, 0xabb9df22, 0x3e496f16, - 0xc0000000, 0x3fe56bf9, 0x99411c62, 0x3e65b3f3, - 0x40000000, 0x3fe5ad40, 0x59f65355, 0x3e586b3e, - 0xa0000000, 
0x3fe5ee02, 0xeae1ac12, 0x3e52482c, - 0xe0000000, 0x3fe62e42, 0xef35793c, 0x3e6efa39, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0xa0000000, 0x3f6ff00a, 0x0250435a, 0x3db5885e, - 0x60000000, 0x3f7fe02a, 0x11f86ed2, 0x3de620cf, - 0x50000000, 0x3f87dc47, 0xedba4a25, 0x3dff0214, - 0xb0000000, 0x3f8fc0a8, 0x79f3db4e, 0x3dbf807c, - 0x40000000, 0x3f93cea4, 0xa779a52b, 0x3dea352b, - 0x00000000, 0x3f97b91b, 0x6aa49fd5, 0x3dff56c4, - 0x20000000, 0x3f9b9fc0, 0x5fef5196, 0x3dfebe46, - 0x00000000, 0x3f9f829b, 0x0099f1f8, 0x3e0cf066, - 0x80000000, 0x3fa1b0d9, 0xff85945d, 0x3e1247b2, - 0xb0000000, 0x3fa39e87, 0xbf5202b6, 0x3e13fd7a, - 0xa0000000, 0x3fa58a5b, 0xa918d51e, 0x3e1f91c9, - 0xf0000000, 0x3fa77458, 0xf118d3ca, 0x3e08cb73, - 0x00000000, 0x3fa95c83, 0xd6fad074, 0x3e1d91c7, - 0x70000000, 0x3fab42dd, 0xec28d14c, 0x3de1971b, - 0x80000000, 0x3fad276b, 0xa423c78a, 0x3e15b616, - 0xc0000000, 0x3faf0a30, 0x617cc971, 0x3da162a6, - 0x30000000, 0x3fb07598, 0xc4c06d29, 0x3e166391, - 0xe0000000, 0x3fb16536, 0xc1d0c4b8, 0x3e2d46f5, - 0x20000000, 0x3fb253f6, 0x2df1f6d3, 0x3e2e1428, - 0x90000000, 0x3fb341d7, 0x424a660d, 0x3e186f47, - 0xb0000000, 0x3fb42edc, 0xe077753e, 0x3e2d4c8d, - 0x30000000, 0x3fb51b07, 0x7ed24f1c, 0x3e2e0c30, - 0xa0000000, 0x3fb60658, 0x8763bdd3, 0x3e226ea1, - 0x80000000, 0x3fb6f0d2, 0x9737c933, 0x3e25cad6, - 0x60000000, 0x3fb7da76, 0x99088901, 0x3e2af625, - 0xd0000000, 0x3fb8c345, 0x83d6b2d0, 0x3e18c66c, - 0x40000000, 0x3fb9ab42, 0xb36fb30f, 0x3e1880ce, - 0x30000000, 0x3fba926d, 0xc6ca17a4, 0x3e2495aa, - 0x20000000, 0x3fbb78c8, 0x4210878c, 0x3e2761db, - 0x80000000, 0x3fbc5e54, 0x862bac2f, 0x3e2eb78e, - 0xd0000000, 0x3fbd4313, 0x75790dd9, 0x3e19b2cd, - 0x60000000, 0x3fbe2707, 0xcbd3d50f, 0x3e2c55e5, - 0xc0000000, 0x3fbf0a30, 0x617cc971, 0x3db162a6, - 0x30000000, 0x3fbfec91, 0xaaa2e519, 0x3dfdbeab, - 0x10000000, 0x3fc06715, 0x7150c647, 0x3e1652cb, - 0x70000000, 0x3fc0d77e, 0xb2cd2ee2, 0x3e39a11c, - 0x80000000, 0x3fc14785, 0xb1a28813, 0x3e219d0a, - 0xd0000000, 
0x3fc1b72a, 0x80a41811, 0x3e24bd9e, - 0x10000000, 0x3fc2266f, 0x96faa3df, 0x3e3214b5, - 0xf0000000, 0x3fc29552, 0x46980bb8, 0x3e303fea, - 0x10000000, 0x3fc303d7, 0xa5fd28c7, 0x3e31c8ff, - 0x20000000, 0x3fc371fc, 0x3bcd96c5, 0x3dce8f74, - 0xb0000000, 0x3fc3dfc2, 0x395315c6, 0x3dfd98c5, - 0x60000000, 0x3fc44d2b, 0x3ccfa7b2, 0x3e3996fa, - 0xf0000000, 0x3fc4ba36, 0x2ad13037, 0x3e1cd2af, - 0xe0000000, 0x3fc526e5, 0xbd17200e, 0x3e1d0da1, - 0xd0000000, 0x3fc59338, 0x0ba68b75, 0x3e333041, - 0x70000000, 0x3fc5ff30, 0x790e7c41, 0x3df4f27a, - 0x40000000, 0x3fc66acd, 0x86f6ff1b, 0x3e13956a, - 0xe0000000, 0x3fc6d60f, 0x723551d9, 0x3e2c6748, - 0xf0000000, 0x3fc740f8, 0x9326cdfc, 0x3e2500de, - 0x00000000, 0x3fc7ab89, 0x48df1b59, 0x3e1086c8, - 0xa0000000, 0x3fc815c0, 0xad6836ff, 0x3e04357e, - 0x60000000, 0x3fc87fa0, 0x42408024, 0x3e248324, - 0xd0000000, 0x3fc8e928, 0x8154b13d, 0x3e3d10da, - 0x90000000, 0x3fc9525a, 0x68ec8260, 0x3e39e8ad, - 0x20000000, 0x3fc9bb36, 0x06abaf18, 0x3e3cfbf7, - 0x10000000, 0x3fca23bc, 0xc6326e23, 0x3e3fc56a, - 0xf0000000, 0x3fca8bec, 0x3185cf21, 0x3e39105e, - 0x40000000, 0x3fcaf3c9, 0xe5b19cc0, 0x3e3d017f, - 0x90000000, 0x3fcb5b51, 0x48dd13fe, 0x3e3d1f6b, - 0x70000000, 0x3fcbc286, 0x58a7e73a, 0x3e20b633, - 0x50000000, 0x3fcc2968, 0x028c211c, 0x3e263063, - 0xc0000000, 0x3fcc8ff7, 0x86b09760, 0x3e2e6a68, - 0x40000000, 0x3fccf635, 0xb891cd03, 0x3e3c138b, - 0x60000000, 0x3fcd5c21, 0x22b7221a, 0x3e369f77, - 0xa0000000, 0x3fcdc1bc, 0xac1a628c, 0x3df57d8f, - 0x60000000, 0x3fce2707, 0xcbd3d50f, 0x3e3c55e5, - 0x50000000, 0x3fce8c02, 0xff48fe2e, 0x3e1552d2, - 0xc0000000, 0x3fcef0ad, 0x6ca431bc, 0x3e37b8b2, - 0x50000000, 0x3fcf550a, 0xdc1c5f6d, 0x3e292dec, - 0x60000000, 0x3fcfb918, 0x551aaa8c, 0x3e3abc7c, - 0x40000000, 0x3fd00e6c, 0x731a354b, 0x3e36b540, - 0x90000000, 0x3fd04025, 0x036b89ef, 0x3e32d341, - 0x50000000, 0x3fd071b8, 0x1a3a2e0f, 0x3e4f9ab2, - 0xe0000000, 0x3fd0a324, 0x1afb9fbd, 0x3e239c87, - 0x50000000, 0x3fd0d46b, 0x2c81f640, 0x3e3e6add, - 0xf0000000, 
0x3fd1058b, 0xaa313f41, 0x3e435c95, - 0x00000000, 0x3fd13687, 0x82f6cc53, 0x3e249d45, - 0xa0000000, 0x3fd1675c, 0x1c07398f, 0x3e47574c, - 0x20000000, 0x3fd1980d, 0xdece9e8d, 0x3e4ba846, - 0xc0000000, 0x3fd1c898, 0xafbc68e7, 0x3e16999f, - 0x90000000, 0x3fd1f8ff, 0xe51b0103, 0x3e4c9145, - 0xf0000000, 0x3fd22941, 0xcb44850a, 0x3e479ef2, - 0x10000000, 0x3fd25960, 0x3de11275, 0x3e0beec7, - 0x10000000, 0x3fd2895a, 0x1af5a498, 0x3e2ef435, - 0x30000000, 0x3fd2b930, 0x493b4a50, 0x3e45713a, - 0xb0000000, 0x3fd2e8e2, 0x61385992, 0x3e45c23a, - 0xc0000000, 0x3fd31871, 0x09f57299, 0x3e42a883, - 0x90000000, 0x3fd347dd, 0xa9ac8ace, 0x3e4530fa, - 0x60000000, 0x3fd37726, 0xd792a758, 0x3e25fec2, - 0x50000000, 0x3fd3a64c, 0xa71cbcd7, 0x3e35a517, - 0xa0000000, 0x3fd3d54f, 0x3e1cd9a3, 0x3e3707dc, - 0x80000000, 0x3fd40430, 0x8ef43049, 0x3e3a1a9f, - 0x20000000, 0x3fd432ef, 0x276b3674, 0x3e4409d0, - 0xc0000000, 0x3fd4618b, 0x13e85bd9, 0x3e20e2f6, - 0x80000000, 0x3fd49006, 0x33001e5f, 0x3df00274, - 0x90000000, 0x3fd4be5f, 0x836d3265, 0x3e35dde2, - 0x30000000, 0x3fd4ec97, 0x4d7aaf04, 0x3e230013, - 0x80000000, 0x3fd51aad, 0xb42724f5, 0x3e3cb7e0, - 0xc0000000, 0x3fd548a2, 0x167e6308, 0x3e2d6e93, - 0x10000000, 0x3fd57677, 0xb1526adb, 0x3e3d1569, - 0xb0000000, 0x3fd5a42a, 0x338a1a41, 0x3e0e99fc, - 0xb0000000, 0x3fd5d1bd, 0x94a11b1c, 0x3e4eb013, - 0x70000000, 0x3fd5ff30, 0x790e7c41, 0x3e04f27a, - 0xf0000000, 0x3fd62c82, 0xa97b7af9, 0x3e25ce3c, - 0x70000000, 0x3fd659b5, 0x940ed857, 0x3e281f0f, - 0x10000000, 0x3fd686c8, 0x5d88857c, 0x3e4d3629, - 0x20000000, 0x3fd6b3bb, 0xec4af526, 0x3e21aca1, - 0xa0000000, 0x3fd6e08e, 0xc7182726, 0x3e445743, - 0xe0000000, 0x3fd70d42, 0xaead337e, 0x3e23c491, - 0xf0000000, 0x3fd739d7, 0x1a738931, 0x3e3aef40, - 0x10000000, 0x3fd7664e, 0x76092a29, 0x3e21cede, - 0x50000000, 0x3fd792a5, 0x44f82bb4, 0x3e4fba8f, - 0x00000000, 0x3fd7bede, 0x7f3c3e1a, 0x3e446f5f, - 0x30000000, 0x3fd7eaf8, 0x86c9674b, 0x3e47055f, - 0x10000000, 0x3fd816f4, 0x2b6b6e1a, 0x3e4b41a9, - 0xd0000000, 
0x3fd842d1, 0x2e927628, 0x3e443d16, - 0x90000000, 0x3fd86e91, 0x4013f9b1, 0x3e446617, - 0x80000000, 0x3fd89a33, 0x6ad69c62, 0x3e3b0509, - 0xc0000000, 0x3fd8c5b7, 0x150faa58, 0x3e40b169, - 0x80000000, 0x3fd8f11e, 0x1df85da7, 0x3e3cd98b, - 0xe0000000, 0x3fd91c67, 0x7b0f8fa8, 0x3e468b50, - 0x10000000, 0x3fd94794, 0xf57499ba, 0x3e48422d, - 0x40000000, 0x3fd972a3, 0x86970274, 0x3e113515, - 0x80000000, 0x3fd99d95, 0xacba92ee, 0x3e117e08, - 0x00000000, 0x3fd9c86b, 0x14dd0229, 0x3e26e043, - 0xe0000000, 0x3fd9f323, 0x97e56d1a, 0x3e497f30, - 0x60000000, 0x3fda1dc0, 0x55901286, 0x3e3356e6, - 0x90000000, 0x3fda4840, 0x457f94d6, 0x3e0cb761, - 0x90000000, 0x3fda72a4, 0xa85a9dac, 0x3e39af67, - 0x90000000, 0x3fda9cec, 0x931a909f, 0x3e453410, - 0xc0000000, 0x3fdac718, 0x206058f5, 0x3e22c587, - 0x30000000, 0x3fdaf129, 0x58899c22, 0x3e223bc3, - 0x00000000, 0x3fdb1b1e, 0xb6d223cb, 0x3e4d7bf8, - 0x70000000, 0x3fdb44f7, 0xc5197ddb, 0x3e47991e, - 0x90000000, 0x3fdb6eb5, 0xbb3a9219, 0x3e4a79e6, - 0x90000000, 0x3fdb9858, 0xed663ec5, 0x3e3a4c43, - 0x80000000, 0x3fdbc1e0, 0x1484f438, 0x3e461b5a, - 0x90000000, 0x3fdbeb4d, 0xf7ef0c3a, 0x3e4b4e36, - 0xf0000000, 0x3fdc149f, 0x6acd0d1b, 0x3e115f02, - 0xa0000000, 0x3fdc3dd7, 0x35cecf05, 0x3e3f36b5, - 0xe0000000, 0x3fdc66f4, 0xbf3eb5c6, 0x3e2ffb7f, - 0xc0000000, 0x3fdc8ff7, 0x86b09760, 0x3e3e6a68, - 0x70000000, 0x3fdcb8e0, 0x27f5bbc3, 0x3e3135eb, - 0x00000000, 0x3fdce1af, 0xd6f6fa57, 0x3e470be7, - 0xa0000000, 0x3fdd0a63, 0xc84ab338, 0x3e4ce43c, - 0x70000000, 0x3fdd32fe, 0xaac3bd91, 0x3e4c01d7, - 0x90000000, 0x3fdd5b7f, 0x07961060, 0x3e45c58d, - 0x20000000, 0x3fdd83e7, 0xf941456e, 0x3e3628bc, - 0x30000000, 0x3fddac35, 0xa8461cd2, 0x3e4c58b2, - 0x00000000, 0x3fddd46a, 0x82fb989a, 0x3e330712, - 0x90000000, 0x3fddfc85, 0x6a80f09c, 0x3e420dab, - 0x10000000, 0x3fde2488, 0x4c397b1e, 0x3e44f8d8, - 0xa0000000, 0x3fde4c71, 0x08599e48, 0x3e40d0ee, - 0x60000000, 0x3fde7442, 0x7e37da36, 0x3e1d6878, - 0x60000000, 0x3fde9bfa, 0xd591bafc, 0x3e366187, - 0xd0000000, 
0x3fdec399, 0x00bae772, 0x3e223466, - 0xc0000000, 0x3fdeeb20, 0xd0d61b8e, 0x3e390377, - 0x50000000, 0x3fdf128f, 0xd966b907, 0x3e4f5e0d, - 0xb0000000, 0x3fdf39e5, 0xb79a00e2, 0x3e49023c, - 0xf0000000, 0x3fdf6123, 0x58c28ad8, 0x3e44e051, - 0x30000000, 0x3fdf884a, 0x08b18ae4, 0x3e3bfa7b, - 0x80000000, 0x3fdfaf58, 0x3db35f67, 0x3e4ef1e6, - 0x20000000, 0x3fdfd64f, 0x39493d4f, 0x3e0ec2ae, - 0x00000000, 0x3fdffd2e, 0x30ab2fa0, 0x3e40afe9, - 0xb0000000, 0x3fe011fa, 0xa1810dd4, 0x3e225ff8, - 0xa0000000, 0x3fe02552, 0xfb1a71a5, 0x3e469743, - 0xe0000000, 0x3fe0389e, 0x76785571, 0x3e5f9cc6, - 0x90000000, 0x3fe04bdf, 0xa4cbf982, 0x3e5b524d, - 0xb0000000, 0x3fe05f14, 0x381535b8, 0x3e5a4c8b, - 0x50000000, 0x3fe0723e, 0x809caf2c, 0x3e5839be, - 0x80000000, 0x3fe0855c, 0x1cb82c13, 0x3e50968a, - 0x40000000, 0x3fe0986f, 0x41723fb5, 0x3e5eae6a, - 0xb0000000, 0x3fe0ab76, 0xa380a4db, 0x3e5d9c29, - 0xe0000000, 0x3fe0be72, 0x0ada625e, 0x3e4094aa, - 0xc0000000, 0x3fe0d163, 0x6fc108ca, 0x3e5973ad, - 0x80000000, 0x3fe0e449, 0x2fdbab97, 0x3e474732, - 0x10000000, 0x3fe0f724, 0xfa9d4221, 0x3e593692, - 0x90000000, 0x3fe109f3, 0x2dfbc7d9, 0x3e5c5a99, - 0x10000000, 0x3fe11cb8, 0xe102387a, 0x3e4e1f33, - 0x90000000, 0x3fe12f71, 0xf14c048c, 0x3e464fbe, - 0x20000000, 0x3fe14220, 0x13ca5e3b, 0x3e4490f5, - 0xd0000000, 0x3fe154c3, 0x4d4c799d, 0x3e37a6af, - 0xa0000000, 0x3fe1675c, 0x1c07398f, 0x3e57574c, - 0xb0000000, 0x3fe179ea, 0x417f8c1c, 0x3e57b133, - 0x00000000, 0x3fe18c6e, 0x0c176514, 0x3e5feb9e, - 0xb0000000, 0x3fe19ee6, 0xbb3172f7, 0x3e419f25, - 0xb0000000, 0x3fe1b154, 0x7bbfb852, 0x3e45f68a, - 0x10000000, 0x3fe1c3b8, 0x497929f1, 0x3e5ee278, - 0xf0000000, 0x3fe1d610, 0x06109d58, 0x3e5ccee0, - 0x50000000, 0x3fe1e85f, 0xa07bd8b3, 0x3e5ce081, - 0x40000000, 0x3fe1faa3, 0x981817b8, 0x3e570e12, - 0xd0000000, 0x3fe20cdc, 0xd93503d0, 0x3e292ab6, - 0xf0000000, 0x3fe21f0b, 0xd7c3b61e, 0x3e58cb7d, - 0xd0000000, 0x3fe23130, 0x0a0b78da, 0x3e4efafd, - 0x60000000, 0x3fe2434b, 0x67c4288e, 0x3e5e9072, - 0xc0000000, 
0x3fe2555b, 0x96780875, 0x3e5d31ef, - 0x00000000, 0x3fe26762, 0xfcd2ad50, 0x3e23430d, - 0x10000000, 0x3fe2795e, 0xd75bc1f9, 0x3e344d88, - 0x00000000, 0x3fe28b50, 0x055e04fc, 0x3e5bec0f, - 0xf0000000, 0x3fe29d37, 0x1590b9ad, 0x3e5d8561, - 0xf0000000, 0x3fe2af15, 0x8e583229, 0x3df32056, - 0xe0000000, 0x3fe2c0e9, 0x1772f538, 0x3e5a891d, - 0x00000000, 0x3fe2d2b4, 0xdabba74d, 0x3e22edc9, - 0x30000000, 0x3fe2e474, 0xa1015086, 0x3e4b9009, - 0x90000000, 0x3fe2f62a, 0x8c5b1a19, 0x3e52a12a, - 0x30000000, 0x3fe307d7, 0xf0fdac85, 0x3e3a7885, - 0x00000000, 0x3fe3197a, 0xd43ac691, 0x3e5f4ffc, - 0x30000000, 0x3fe32b13, 0xe2640aad, 0x3e52243a, - 0xb0000000, 0x3fe33ca2, 0x299035d3, 0x3e546513, - 0x90000000, 0x3fe34e28, 0xa62dd725, 0x3e5b39c3, - 0xe0000000, 0x3fe35fa4, 0x40049f51, 0x3e5ba6dd, - 0xb0000000, 0x3fe37117, 0xd7177409, 0x3e451d1e, - 0xf0000000, 0x3fe38280, 0xfd7f5216, 0x3e5cb0f2, - 0xd0000000, 0x3fe393e0, 0xcd4e2213, 0x3e3ab150, - 0x30000000, 0x3fe3a537, 0xf3193844, 0x3e5cfd7b, - 0x40000000, 0x3fe3b684, 0x455f1dbd, 0x3e53fff8, - 0xf0000000, 0x3fe3c7c7, 0x0b905fc9, 0x3e5fee64, - 0x60000000, 0x3fe3d902, 0xf548084c, 0x3e54e2ad, - 0x90000000, 0x3fe3ea33, 0xdc1ecdd2, 0x3e3b597a, - 0x80000000, 0x3fe3fb5b, 0x096d3a75, 0x3e4345bd, - 0x40000000, 0x3fe40c7a, 0xd2453c8b, 0x3e5101b9, - 0xe0000000, 0x3fe41d8f, 0x5cc8c979, 0x3e508ce5, - 0x60000000, 0x3fe42e9c, 0x7e595f71, 0x3e5bbf01, - 0xe0000000, 0x3fe43f9f, 0x3bd393dc, 0x3e37ce73, - 0x50000000, 0x3fe4509a, 0xa503f8a1, 0x3e233bb0, - 0xc0000000, 0x3fe4618b, 0x13e85bd9, 0x3e30e2f6, - 0x30000000, 0x3fe47274, 0x5a635b3c, 0x3e5e6755, - 0xd0000000, 0x3fe48353, 0xf73d5e8b, 0x3e2ea88d, - 0x80000000, 0x3fe4942a, 0x3bda18a8, 0x3e3d17e0, - 0x50000000, 0x3fe4a4f8, 0x76044f7e, 0x3e5b607d, - 0x60000000, 0x3fe4b5bd, 0xe71bc2fc, 0x3e52adc4, - 0xa0000000, 0x3fe4c679, 0x7362d1d9, 0x3e5f99dc, - 0x30000000, 0x3fe4d72d, 0x008e6a6a, 0x3e5473fa, - 0x10000000, 0x3fe4e7d8, 0x09cb0985, 0x3e2b75bb, - 0x30000000, 0x3fe4f87a, 0xd10b9aba, 0x3e5ea04d, - 0xc0000000, 
0x3fe50913, 0xd6979674, 0x3e5802d0, - 0xc0000000, 0x3fe519a4, 0xccd99094, 0x3e174688, - 0x20000000, 0x3fe52a2d, 0xabb9df22, 0x3e496f16, - 0x00000000, 0x3fe53aad, 0xf2aa374f, 0x3e46e66d, - 0x60000000, 0x3fe54b24, 0x5ea4550a, 0x3e4e6652, - 0x50000000, 0x3fe55b93, 0x34f20cbd, 0x3e42d02f, - 0xd0000000, 0x3fe56bf9, 0x65047188, 0x3e46cfce, - 0xf0000000, 0x3fe57c57, 0x842d58b8, 0x3e39b78c, - 0xb0000000, 0x3fe58cad, 0x24c24bc9, 0x3e4735e6, - 0x20000000, 0x3fe59cfb, 0xf7dd1adf, 0x3e47eba1, - 0x40000000, 0x3fe5ad40, 0x59f65355, 0x3e586b3e, - 0x30000000, 0x3fe5bd7d, 0x637f1b4d, 0x3e1ce38e, - 0xd0000000, 0x3fe5cdb1, 0xc919edc7, 0x3e58d82e, - 0x50000000, 0x3fe5ddde, 0x8ddcfa37, 0x3e4c5264, - 0xa0000000, 0x3fe5ee02, 0xeae1ac12, 0x3e52482c, - 0xd0000000, 0x3fe5fe1e, 0x311aba4f, 0x3e55a312, - 0xf0000000, 0x3fe60e32, 0x6329f225, 0x3e411e23, - 0xf0000000, 0x3fe61e3e, 0xcd2f246c, 0x3e5b48c8, - 0xe0000000, 0x3fe62e42, 0xef35793c, 0x3e6efa39, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x40000000, 0x00000000, 0x00000000, - 0x00000000, 0x3fffe000, 0xe01fe020, 0x3effe01f, - 0x00000000, 0x3fffc000, 0x01fc07f0, 0x3f1fc07f, - 0x00000000, 0x3fffa000, 0x1fa11caa, 0x3f31caa0, - 0x00000000, 0x3fff8000, 0x1f81f820, 0x3f3f81f8, - 0x00000000, 0x3fff6000, 0x06ddaba6, 0x3f488565, - 0x00000000, 0x3fff4000, 0x2909c560, 0x3f519679, - 0x00000000, 0x3fff2000, 0x8c2ad433, 0x3f57d910, - 0x00000000, 0x3fff0000, 0xf07c1f08, 0x3f5f07c1, - 0x00000000, 0x3ffee000, 0x8b1c03dd, 0x3f638ff0, - 0x00000000, 0x3ffec000, 0x03d980f6, 0x3f680f66, - 0x00000000, 0x3ffea000, 0x7403d5d0, 0x3f6d00f5, - 0x00000000, 0x3ffe9000, 0x0b7672a0, 0x3f331abf, - 0x00000000, 0x3ffe7000, 0x5d43919b, 0x3f506a96, - 0x00000000, 0x3ffe5000, 0x0795ceb2, 0x3f5ceb24, - 0x00000000, 0x3ffe3000, 0xb834e67f, 0x3f6522f3, - 0x00000000, 0x3ffe1000, 0x3c3c3c3c, 0x3f6c3c3c, - 0x00000000, 0x3ffe0000, 0x1e01e01e, 0x3f3e01e0, - 0x00000000, 0x3ffde000, 0xe21a291c, 0x3f575b8f, - 0x00000000, 0x3ffdc000, 0x403b9404, 0x3f6403b9, - 0x00000000, 
0x3ffda000, 0x7303b5cc, 0x3f6cc0ed, - 0x00000000, 0x3ffd9000, 0xf3fc4da2, 0x3f479118, - 0x00000000, 0x3ffd7000, 0xe0b0ce46, 0x3f5ed952, - 0x00000000, 0x3ffd5000, 0xeae56404, 0x3f695900, - 0x00000000, 0x3ffd4000, 0x1d41d41d, 0x3f3d41d4, - 0x00000000, 0x3ffd2000, 0xf16c69ae, 0x3f5cb28f, - 0x00000000, 0x3ffd0000, 0xdd80e866, 0x3f696b1e, - 0x00000000, 0x3ffcf000, 0x25fe30d9, 0x3f4372e2, - 0x00000000, 0x3ffcd000, 0x073615a2, 0x3f60ad12, - 0x00000000, 0x3ffcb000, 0x0397cdb3, 0x3f6cdb2c, - 0x00000000, 0x3ffca000, 0x7b864407, 0x3f52cc15, - 0x00000000, 0x3ffc8000, 0xf7148404, 0x3f664cb5, - 0x00000000, 0x3ffc7000, 0x1c71c71c, 0x3f3c71c7, - 0x00000000, 0x3ffc5000, 0x1a930b84, 0x3f6129a2, - 0x00000000, 0x3ffc3000, 0x87f1e038, 0x3f6f1e03, - 0x00000000, 0x3ffc2000, 0xba80709b, 0x3f5ad4e4, - 0x00000000, 0x3ffc0000, 0x0381c0e0, 0x3f6c0e07, - 0x00000000, 0x3ffbf000, 0x1a362bb0, 0x3f560fba, - 0x00000000, 0x3ffbd000, 0x280dee96, 0x3f6a5713, - 0x00000000, 0x3ffbc000, 0x20f9ece9, 0x3f53f596, - 0x00000000, 0x3ffba000, 0x83759f23, 0x3f69f229, - 0x00000000, 0x3ffb9000, 0x63fc8d5c, 0x3f5478ac, - 0x00000000, 0x3ffb7000, 0xb4671656, 0x3f6ad87b, - 0x00000000, 0x3ffb6000, 0xfbb8148c, 0x3f578b8e, - 0x00000000, 0x3ffb4000, 0xd0369d03, 0x3f6d0369, - 0x00000000, 0x3ffb3000, 0x601b3748, 0x3f5d212b, - 0x00000000, 0x3ffb2000, 0x406c80d9, 0x3f0b2036, - 0x00000000, 0x3ffb0000, 0xb24547d1, 0x3f629663, - 0x00000000, 0x3ffaf000, 0x0d79435e, 0x3f4435e5, - 0x00000000, 0x3ffad000, 0x2920bc03, 0x3f67d0ff, - 0x00000000, 0x3ffac000, 0x15c06b16, 0x3f55c06b, - 0x00000000, 0x3ffaa000, 0x0fd7f954, 0x3f6e3a5f, - 0x00000000, 0x3ffa9000, 0xd4c77b03, 0x3f61dec0, - 0x00000000, 0x3ffa8000, 0x870ac52e, 0x3f473289, - 0x00000000, 0x3ffa6000, 0xa034da03, 0x3f6a034d, - 0x00000000, 0x3ffa5000, 0xa2292856, 0x3f5d041d, - 0x00000000, 0x3ffa4000, 0x1a41a41a, 0x3f3a41a4, - 0x00000000, 0x3ffa2000, 0x8a39409d, 0x3f68550f, - 0x00000000, 0x3ffa1000, 0xe92c0686, 0x3f5b4fe5, - 0x00000000, 0x3ffa0000, 0x1a01a01a, 0x3f3a01a0, - 0x00000000, 
0x3ff9e000, 0x2067b23a, 0x3f691d2a, - 0x00000000, 0x3ff9d000, 0xada0b4e5, 0x3f5e7c5d, - 0x00000000, 0x3ff9c000, 0x25080ce1, 0x3f468a77, - 0x00000000, 0x3ff9a000, 0xaa21b490, 0x3f6c49d4, - 0x00000000, 0x3ff99000, 0x33333333, 0x3f633333, - 0x00000000, 0x3ff98000, 0x3b03fccf, 0x3f54bc36, - 0x00000000, 0x3ff97000, 0x970e4f81, 0x3f2c9f01, - 0x00000000, 0x3ff95000, 0xc6ef5b25, 0x3f697617, - 0x00000000, 0x3ff94000, 0xadd3c0ca, 0x3f6161f9, - 0x00000000, 0x3ff93000, 0x6cb39806, 0x3f5319fe, - 0x00000000, 0x3ff92000, 0x1c451ab3, 0x3f2f693a, - 0x00000000, 0x3ff90000, 0x0321a9e2, 0x3f6a9e24, - 0x00000000, 0x3ff8f000, 0x3831f383, 0x3f63831f, - 0x00000000, 0x3ff8e000, 0xc4dcfc1c, 0x3f5949eb, - 0x00000000, 0x3ff8d000, 0x80c6980c, 0x3f480c69, - 0x00000000, 0x3ff8b000, 0xc5fe7403, 0x3f6f9d00, - 0x00000000, 0x3ff8a000, 0xd7e75347, 0x3f69721e, - 0x00000000, 0x3ff89000, 0x0313381f, 0x3f6381ec, - 0x00000000, 0x3ff88000, 0xaec12653, 0x3f5b97c2, - 0x00000000, 0x3ff87000, 0x024ae3ba, 0x3f509ef3, - 0x00000000, 0x3ff86000, 0x18618618, 0x3f386186, - 0x00000000, 0x3ff84000, 0xf00c2780, 0x3f6e0184, - 0x00000000, 0x3ff83000, 0x657dba52, 0x3f692ef5, - 0x00000000, 0x3ff82000, 0x05494030, 0x3f649403, - 0x00000000, 0x3ff81000, 0x30303030, 0x3f603030, - 0x00000000, 0x3ff80000, 0x80601806, 0x3f580601, - 0x00000000, 0x3ff7f000, 0x05fd017f, 0x3f5017f4, - 0x00000000, 0x3ff7e000, 0xd278e8dd, 0x3f412a8a, - 0x00000000, 0x3ff7d000, 0x417d05f4, 0x3f17d05f, - 0x00000000, 0x3ff7b000, 0x5c02f7d6, 0x3f6d6724, - 0x00000000, 0x3ff7a000, 0xc1d986a9, 0x3f6a4411, - 0x00000000, 0x3ff79000, 0x6c7316df, 0x3f6754d7, - 0x00000000, 0x3ff78000, 0xf149902f, 0x3f649902, - 0x00000000, 0x3ff77000, 0x358c1a68, 0x3f621023, - 0x00000000, 0x3ff76000, 0xd2a6c406, 0x3f5f7390, - 0x00000000, 0x3ff75000, 0x05d5b2b1, 0x3f5b2b08, - 0x00000000, 0x3ff74000, 0x745d1746, 0x3f5745d1, - 0x00000000, 0x3ff73000, 0x07fa32c4, 0x3f53c315, - 0x00000000, 0x3ff72000, 0x1b7af017, 0x3f50a1fd, - 0x00000000, 0x3ff71000, 0xe3e0453a, 0x3f4bc36c, - 0x00000000, 
0x3ff70000, 0x5c0b8170, 0x3f4702e0, - 0x00000000, 0x3ff6f000, 0x9300b793, 0x3f4300b7, - 0x00000000, 0x3ff6e000, 0x337c6cb1, 0x3f3f76b4, - 0x00000000, 0x3ff6d000, 0x1c860fb0, 0x3f3a6268, - 0x00000000, 0x3ff6c000, 0x16c16c17, 0x3f36c16c, - 0x00000000, 0x3ff6b000, 0x31a3cfc7, 0x3f3490aa, - 0x00000000, 0x3ff6a000, 0x3729043e, 0x3f33cd15, - 0x00000000, 0x3ff69000, 0x8d0bfd2e, 0x3f3473a8, - 0x00000000, 0x3ff68000, 0x16816817, 0x3f368168, - 0x00000000, 0x3ff67000, 0x16719f36, 0x3f39f360, - 0x00000000, 0x3ff66000, 0x122f9016, 0x3f3ec6a5, - 0x00000000, 0x3ff65000, 0xda5519cf, 0x3f427c29, - 0x00000000, 0x3ff64000, 0x590b2164, 0x3f4642c8, - 0x00000000, 0x3ff63000, 0x5606f00b, 0x3f4ab5c4, - 0x00000000, 0x3ff62000, 0x0b11fd3c, 0x3f4fd3b8, - 0x00000000, 0x3ff61000, 0xc6ba4eaa, 0x3f52cda0, - 0x00000000, 0x3ff60000, 0x60581606, 0x3f560581, - 0x00000000, 0x3ff5f000, 0xa4b7ef87, 0x3f5990d0, - 0x00000000, 0x3ff5e000, 0x40579d6f, 0x3f5d6ee3, - 0x00000000, 0x3ff5d000, 0xd9c54a69, 0x3f60cf87, - 0x00000000, 0x3ff5c000, 0x2620ae4c, 0x3f631057, - 0x00000000, 0x3ff5b000, 0x8ff522a2, 0x3f65798c, - 0x00000000, 0x3ff5a000, 0x02b580ad, 0x3f680ad6, - 0x00000000, 0x3ff59000, 0x4799546f, 0x3f6ac3e2, - 0x00000000, 0x3ff58000, 0x02b1da46, 0x3f6da461, - 0x00000000, 0x3ff58000, 0x01580560, 0x3f158056, - 0x00000000, 0x3ff57000, 0x06b39a23, 0x3f3ed3c5, - 0x00000000, 0x3ff56000, 0xe2970f60, 0x3f4cbdd3, - 0x00000000, 0x3ff55000, 0x55555555, 0x3f555555, - 0x00000000, 0x3ff54000, 0xee0bf805, 0x3f5c979a, - 0x00000000, 0x3ff53000, 0xe81fd58e, 0x3f621291, - 0x00000000, 0x3ff52000, 0x500a9580, 0x3f65fead, - 0x00000000, 0x3ff51000, 0xc5f02a3a, 0x3f6a0fd5, - 0x00000000, 0x3ff50000, 0x23898adc, 0x3f6e45c2, - 0x00000000, 0x3ff50000, 0x15015015, 0x3f350150, - 0x00000000, 0x3ff4f000, 0xea64d422, 0x3f4c7b16, - 0x00000000, 0x3ff4e000, 0xbc14e5e1, 0x3f57829c, - 0x00000000, 0x3ff4d000, 0xb8589720, 0x3f60877d, - 0x00000000, 0x3ff4c000, 0x4b5edcea, 0x3f65710e, - 0x00000000, 0x3ff4b000, 0x4d1fc1c8, 0x3f6a7dbb, - 0x00000000, 
0x3ff4a000, 0xa57eb503, 0x3f6fad40, - 0x00000000, 0x3ff4a000, 0xb00a5140, 0x3f43fd6b, - 0x00000000, 0x3ff49000, 0xcb419ba9, 0x3f54e78e, - 0x00000000, 0x3ff48000, 0x029100a4, 0x3f600a44, - 0x00000000, 0x3ff47000, 0x5c28f5c3, 0x3f65c28f, - 0x00000000, 0x3ff46000, 0xb2c0cc4a, 0x3f6b9c68, - 0x00000000, 0x3ff46000, 0xb9f34381, 0x3f2978fe, - 0x00000000, 0x3ff45000, 0x3bb6500a, 0x3f4ecf16, - 0x00000000, 0x3ff44000, 0x8b67ebb9, 0x3f5be195, - 0x00000000, 0x3ff43000, 0x57dc9a3b, 0x3f644e61, - 0x00000000, 0x3ff42000, 0xaa3f0ddf, 0x3f6acc4b, - 0x00000000, 0x3ff42000, 0xcb2a247b, 0x3f26a4cb, - 0x00000000, 0x3ff41000, 0x50505050, 0x3f505050, - 0x00000000, 0x3ff40000, 0x39959819, 0x3f5e0b44, - 0x00000000, 0x3ff3f000, 0x6027f602, 0x3f66027f, - 0x00000000, 0x3ff3e000, 0x4b5e0db4, 0x3f6d1e85, - 0x00000000, 0x3ff3e000, 0x254813e2, 0x3f4165e7, - 0x00000000, 0x3ff3d000, 0xa9d716ef, 0x3f576646, - 0x00000000, 0x3ff3c000, 0xf757ce88, 0x3f632b48, - 0x00000000, 0x3ff3b000, 0x4652a906, 0x3f6ac1b2, - 0x00000000, 0x3ff3b000, 0x13b13b14, 0x3f33b13b, - 0x00000000, 0x3ff3a000, 0xeb208984, 0x3f5490e1, - 0x00000000, 0x3ff39000, 0x30fec66e, 0x3f623858, - 0x00000000, 0x3ff38000, 0xcc111b7e, 0x3f6a45a6, - 0x00000000, 0x3ff38000, 0x13813814, 0x3f338138, - 0x00000000, 0x3ff37000, 0x2517b708, 0x3f556f47, - 0x00000000, 0x3ff36000, 0xbc0e8f2a, 0x3f631be7, - 0x00000000, 0x3ff35000, 0x3e55f044, 0x3f6b9cbf, - 0x00000000, 0x3ff35000, 0x5bc609a9, 0x3f40e7d9, - 0x00000000, 0x3ff34000, 0x804d19e7, 0x3f59e6b3, - 0x00000000, 0x3ff33000, 0xaf7963c2, 0x3f65c8b6, - 0x00000000, 0x3ff32000, 0xd43bf402, 0x3f6eb9da, - 0x00000000, 0x3ff32000, 0x5885fb37, 0x3f4f1a51, - 0x00000000, 0x3ff31000, 0xd3d76c02, 0x3f60eeb1, - 0x00000000, 0x3ff30000, 0x61a32026, 0x3f6a3202, - 0x00000000, 0x3ff30000, 0x40260390, 0x3f3c82ac, - 0x00000000, 0x3ff2f000, 0x84bda12f, 0x3f5a12f6, - 0x00000000, 0x3ff2e000, 0xfda2962c, 0x3f669d43, - 0x00000000, 0x3ff2e000, 0xc04b8097, 0x3f02e025, - 0x00000000, 0x3ff2d000, 0xb542804b, 0x3f542804, - 0x00000000, 
0x3ff2c000, 0x02593f6a, 0x3f63f69b, - 0x00000000, 0x3ff2b000, 0xb46e21fa, 0x3f6df31c, - 0x00000000, 0x3ff2b000, 0x04ad012b, 0x3f5012b4, - 0x00000000, 0x3ff2a000, 0xe7820a7f, 0x3f623925, - 0x00000000, 0x3ff29000, 0xc8253c82, 0x3f6c8253, - 0x00000000, 0x3ff29000, 0xc02526e5, 0x3f4b92dd, - 0x00000000, 0x3ff28000, 0x11602511, 0x3f616025, - 0x00000000, 0x3ff27000, 0x439c9adf, 0x3f6bf471, - 0x00000000, 0x3ff27000, 0x0939a85c, 0x3f4a85c4, - 0x00000000, 0x3ff26000, 0xac024d16, 0x3f6166f9, - 0x00000000, 0x3ff25000, 0x0125e227, 0x3f6c44e1, - 0x00000000, 0x3ff25000, 0x8bbd90e5, 0x3f4cebf4, - 0x00000000, 0x3ff24000, 0x92492492, 0x3f624924, - 0x00000000, 0x3ff23000, 0x2ec0b673, 0x3f6d6f2e, - 0x00000000, 0x3ff23000, 0x6af37c05, 0x3f5159e2, - 0x00000000, 0x3ff22000, 0x40245402, 0x3f640245, - 0x00000000, 0x3ff21000, 0x43f6f024, 0x3f6f6f02, - 0x00000000, 0x3ff21000, 0x21579805, 0x3f55e601, - 0x00000000, 0x3ff20000, 0xcf81b10f, 0x3f668e18, - 0x00000000, 0x3ff20000, 0x12012012, 0x3f320120, - 0x00000000, 0x3ff1f000, 0x047dc11f, 0x3f5c11f7, - 0x00000000, 0x3ff1e000, 0xff70985e, 0x3f69e878, - 0x00000000, 0x3ff1e000, 0xfdc3a219, 0x3f4779d9, - 0x00000000, 0x3ff1d000, 0x5c957907, 0x3f61eace, - 0x00000000, 0x3ff1c000, 0x450239e1, 0x3f6e0d5b, - 0x00000000, 0x3ff1c000, 0x73816367, 0x3f548bf0, - 0x00000000, 0x3ff1b000, 0x8dda5202, 0x3f669480, - 0x00000000, 0x3ff1b000, 0x2bae2b21, 0x3f37c67f, - 0x00000000, 0x3ff1a000, 0x69ee5847, 0x3f5ee584, - 0x00000000, 0x3ff19000, 0xc0233c02, 0x3f6c0233, - 0x00000000, 0x3ff19000, 0x328a7012, 0x3f514e02, - 0x00000000, 0x3ff18000, 0x2057b573, 0x3f656107, - 0x00000000, 0x3ff18000, 0x11811812, 0x3f318118, - 0x00000000, 0x3ff17000, 0x6f5a1060, 0x3f5e2864, - 0x00000000, 0x3ff16000, 0x84e6f1d7, 0x3f6c0d12, - 0x00000000, 0x3ff16000, 0xf0c80459, 0x3f523543, - 0x00000000, 0x3ff15000, 0xea4e1a09, 0x3f663cbe, - 0x00000000, 0x3ff15000, 0xdd5c8cb8, 0x3f3b9a3f, - 0x00000000, 0x3ff14000, 0x159a76d2, 0x3f60be1c, - 0x00000000, 0x3ff13000, 0x688e4838, 0x3f6e1d1a, - 0x00000000, 
0x3ff13000, 0xd72044d7, 0x3f572044, - 0x00000000, 0x3ff12000, 0xdb81577b, 0x3f691713, - 0x00000000, 0x3ff12000, 0xe9819b50, 0x3f4ac73a, - 0x00000000, 0x3ff11000, 0x4e904cf6, 0x3f646033, - 0x00000000, 0x3ff11000, 0x11111111, 0x3f311111, - 0x00000000, 0x3ff10000, 0x0441fef0, 0x3f5feef8, - 0x00000000, 0x3ff0f000, 0xfde021fe, 0x3f6de021, - 0x00000000, 0x3ff0f000, 0xcc9686a0, 0x3f57b7ea, - 0x00000000, 0x3ff0e000, 0xcd391fbc, 0x3f69ead7, - 0x00000000, 0x3ff0e000, 0x09804390, 0x3f501956, - 0x00000000, 0x3ff0d000, 0x1e8d2b32, 0x3f664151, - 0x00000000, 0x3ff0d000, 0xacf1ce96, 0x3f4222b1, - 0x00000000, 0x3ff0c000, 0x79b47582, 0x3f62e29f, - 0x00000000, 0x3ff0c000, 0x682e11cd, 0x3f24f0d1, - 0x00000000, 0x3ff0b000, 0x96771e4d, 0x3f5f9bb0, - 0x00000000, 0x3ff0a000, 0x5dd96ae2, 0x3f6e5ee4, - 0x00000000, 0x3ff0a000, 0xa0429a04, 0x3f5a0429, - 0x00000000, 0x3ff09000, 0x5f06c021, 0x3f6bb74d, - 0x00000000, 0x3ff09000, 0x04254fce, 0x3f54fce4, - 0x00000000, 0x3ff08000, 0xeacbc402, 0x3f695766, - 0x00000000, 0x3ff08000, 0x08421084, 0x3f508421, - 0x00000000, 0x3ff07000, 0x71d5c338, 0x3f673e53, - 0x00000000, 0x3ff07000, 0x3fbe3368, 0x3f493052, - 0x00000000, 0x3ff06000, 0xf225f6c4, 0x3f656b38, - 0x00000000, 0x3ff06000, 0x8d4fdf3b, 0x3f426e97, - 0x00000000, 0x3ff05000, 0xe4eb0cc6, 0x3f63dd40, - 0x00000000, 0x3ff05000, 0x73404146, 0x3f397f7d, - 0x00000000, 0x3ff04000, 0x2cc98af1, 0x3f629398, - 0x00000000, 0x3ff04000, 0x10410410, 0x3f304104, - 0x00000000, 0x3ff03000, 0x048ff7e4, 0x3f618d6f, - 0x00000000, 0x3ff03000, 0xebc349de, 0x3f2236a3, - 0x00000000, 0x3ff02000, 0xee53d18c, 0x3f60c9f8, - 0x00000000, 0x3ff02000, 0x81020408, 0x3f102040, - 0x00000000, 0x3ff01000, 0xa2f46ea6, 0x3f60486c, - 0x00000000, 0x3ff01000, 0x10101010, 0x3ef01010, - 0x00000000, 0x3ff00000, 0x02010080, 0x3f600804, - 0x00000000, 0x3ff00000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0xc0000000, 0x3ff2cd9f, 0x096a0092, 0x3e513ae6, - 0x60000000, 
0x400d03cf, 0xfb79a640, 0x3e5db70c, - 0xe0000000, 0x40240926, 0xb66dc067, 0x3e8c2526, - 0x00000000, 0x403b4a38, 0x8647f380, 0x3e8b81b1, - 0x60000000, 0x40528d01, 0xd1e1eb08, 0x3ebbc1cd, - 0x28000000, 0x406936d2, 0x1534fb09, 0x3ecd9f20, - 0x68000000, 0x40812287, 0x4a4e9954, 0x3edd1c06, - 0x50000000, 0x409749ea, 0x5d06ea74, 0x3ed4eca6, - 0x70000000, 0x40afa715, 0xbcc0ecc5, 0x3f00c259, - 0xc8000000, 0x40c5829d, 0x47cf9016, 0x3f2b5a66, - 0x88000000, 0x40dd3c44, 0xdefb0870, 0x3f09691a, - 0x50000000, 0x40f3de16, 0xc29cde38, 0x3f53410f, - 0x90000000, 0x410b00b5, 0x50b6fb3c, 0x3f46a31a, - 0x48000000, 0x412259ac, 0x71805c40, 0x3f57defc, - 0xa8000000, 0x4138f0cc, 0xd80e0bab, 0x3f9eb49f, - 0xd0000000, 0x4150f2eb, 0x7bcd5920, 0x3f84fffc, - 0x88000000, 0x41670934, 0xb6c63435, 0x3fc03a93, - 0x08000000, 0x417f4f22, 0xb255fd1c, 0x3fb1940b, - 0xf8000000, 0x419546d8, 0x14260b50, 0x3fded26e, - 0x88000000, 0x41aceb08, 0x1fc9f2a2, 0x3ffb4740, - 0xf8000000, 0x41c3a6e1, 0xf55634f1, 0x40267bb3, - 0xb8000000, 0x41dab5ad, 0xf8194ddc, 0x401c435f, - 0x30000000, 0x41f226af, 0x052ba63a, 0x404d8fee, - 0xb0000000, 0x4208ab7f, 0xdccde3f6, 0x40651d7e, - 0x90000000, 0x4220c3d3, 0x44557d1a, 0x40704b16, - 0x68000000, 0x4236c932, 0xca0a9dc4, 0x4076a6b5, - 0xf0000000, 0x424ef822, 0x72249aba, 0x40afd9cc, - 0x30000000, 0x42650bba, 0x693edab5, 0x40ce58de, - 0x40000000, 0x427c9aae, 0x58ac6363, 0x40d8c701, - 0x08000000, 0x42937047, 0x64f43e20, 0x40e76147, - 0x58000000, 0x42aa6b76, 0xb36fc718, 0x4106337d, - 0xc8000000, 0x42c1f43f, 0xb1f611e2, 0x41212d98, - 0x48000000, 0x42d866f3, 0x108b37cc, 0x412392bc, - 0x28000000, 0x42f0953e, 0xdc3473dc, 0x415ce87b, - 0x20000000, 0x430689e2, 0xae99ad14, 0x414bc8d5, - 0xa0000000, 0x431ea215, 0x6744835c, 0x415d20d7, - 0x00000000, 0x3ff00000, 0x00000000, 0x00000000, - 0x50000000, 0x3ff8b075, 0x04c2bd28, 0x3e3d9f55, - 0x08000000, 0x400e18fa, 0xf0a4c9fd, 0x3e67cb66, - 0x90000000, 0x402422a4, 0x7928e588, 0x3e8f5861, - 0x58000000, 0x403b4ee8, 0x00c38d48, 0x3e6bc7d0, - 0xc8000000, 
0x40528d6f, 0x4e329998, 0x3eaf7f9d, - 0x78000000, 0x406936e6, 0x64885269, 0x3ec6e6e4, - 0x48000000, 0x40812289, 0xb946c154, 0x3ecba3a8, - 0xa8000000, 0x409749ea, 0x6110d5a4, 0x3ed3f4e7, - 0x80000000, 0x40afa715, 0x515a3e2b, 0x3f017622, - 0xd0000000, 0x40c5829d, 0x528af3d0, 0x3ee4dc4b, - 0x88000000, 0x40dd3c44, 0x78615e10, 0x3f111562, - 0x50000000, 0x40f3de16, 0x0ed821f5, 0x3f535ad5, - 0x90000000, 0x410b00b5, 0x55f2935c, 0x3f46b610, - 0x48000000, 0x412259ac, 0x4a601240, 0x3f57e279, - 0xa8000000, 0x4138f0cc, 0x5f6aadd3, 0x3f9eb4b4, - 0xd0000000, 0x4150f2eb, 0x967b3698, 0x3f85000b, - 0x88000000, 0x41670934, 0x0fadc092, 0x3fc03a94, - 0x08000000, 0x417f4f22, 0xf3bf874c, 0x3fb1940b, - 0xf8000000, 0x419546d8, 0x1a2a2110, 0x3fded26e, - 0x88000000, 0x41aceb08, 0x205796d6, 0x3ffb4740, - 0xf8000000, 0x41c3a6e1, 0xf55cb85d, 0x40267bb3, - 0xb8000000, 0x41dab5ad, 0xf81e18ac, 0x401c435f, - 0x30000000, 0x41f226af, 0x052bdea4, 0x404d8fee, - 0xb0000000, 0x4208ab7f, 0xdccde926, 0x40651d7e, - 0x90000000, 0x4220c3d3, 0x44557e0e, 0x40704b16, - 0x68000000, 0x4236c932, 0xca0a9e1c, 0x4076a6b5, - 0xf0000000, 0x424ef822, 0x72249abe, 0x40afd9cc, - 0x30000000, 0x42650bba, 0x693edab5, 0x40ce58de, - 0x40000000, 0x427c9aae, 0x58ac6364, 0x40d8c701, - 0x08000000, 0x42937047, 0x64f43e20, 0x40e76147, - 0x58000000, 0x42aa6b76, 0xb36fc718, 0x4106337d, - 0xc8000000, 0x42c1f43f, 0xb1f611e2, 0x41212d98, - 0x48000000, 0x42d866f3, 0x108b37cc, 0x412392bc, - 0x28000000, 0x42f0953e, 0xdc3473dc, 0x415ce87b, - 0x20000000, 0x430689e2, 0xae99ad14, 0x414bc8d5, - 0xa0000000, 0x431ea215, 0x6744835c, 0x415d20d7, - 0x00000000, 0x40000000, 0xe01fe020, 0x3fffe01f, - 0x01fc07f0, 0x3fffc07f, 0xaa01fa12, 0x3fffa11c, - 0x1f81f820, 0x3fff81f8, 0xaca0dbb5, 0x3fff6310, - 0x9e4a4271, 0x3fff4465, 0x44230ab5, 0x3fff25f6, - 0xf07c1f08, 0x3fff07c1, 0xf8458e02, 0x3ffee9c7, - 0xb301ecc0, 0x3ffecc07, 0x7aba01eb, 0x3ffeae80, - 0xabf0b767, 0x3ffe9131, 0xa59750e4, 0x3ffe741a, - 0xc901e574, 0x3ffe573a, 0x79dc1a73, 0x3ffe3a91, - 0x1e1e1e1e, 
0x3ffe1e1e, 0x1e01e01e, 0x3ffe01e0, - 0xe3f8868a, 0x3ffde5d6, 0xdca01dca, 0x3ffdca01, - 0x76b981db, 0x3ffdae60, 0x231e7f8a, 0x3ffd92f2, - 0x54b82c34, 0x3ffd77b6, 0x807572b2, 0x3ffd5cac, - 0x1d41d41d, 0x3ffd41d4, 0xa3fc5b1a, 0x3ffd272c, - 0x8f6ec074, 0x3ffd0cb5, 0x5c44bfc6, 0x3ffcf26e, - 0x89039b0b, 0x3ffcd856, 0x9601cbe7, 0x3ffcbe6d, - 0x055ee191, 0x3ffca4b3, 0x5afb8a42, 0x3ffc8b26, - 0x1c71c71c, 0x3ffc71c7, 0xd10d4986, 0x3ffc5894, - 0x01c3f8f0, 0x3ffc3f8f, 0x392ea01c, 0x3ffc26b5, - 0x0381c0e0, 0x3ffc0e07, 0xee868d8b, 0x3ffbf583, - 0x899406f7, 0x3ffbdd2b, 0x65883e7b, 0x3ffbc4fd, - 0x14c1bad0, 0x3ffbacf9, 0x2b18ff23, 0x3ffb951e, - 0x3dda338b, 0x3ffb7d6c, 0xe3beee05, 0x3ffb65e2, - 0xb4e81b4f, 0x3ffb4e81, 0x4ad806ce, 0x3ffb3748, - 0x406c80d9, 0x3ffb2036, 0x31d922a4, 0x3ffb094b, - 0xbca1af28, 0x3ffaf286, 0x7f94905e, 0x3ffadbe8, - 0x1ac5701b, 0x3ffac570, 0x2f87ebfd, 0x3ffaaf1d, - 0x606a63be, 0x3ffa98ef, 0x5130e159, 0x3ffa82e6, - 0xa6d01a6d, 0x3ffa6d01, 0x07688a4a, 0x3ffa5741, - 0x1a41a41a, 0x3ffa41a4, 0x87c51ca0, 0x3ffa2c2a, - 0xf97a4b02, 0x3ffa16d3, 0x1a01a01a, 0x3ffa01a0, - 0x951033d9, 0x3ff9ec8e, 0x176b682d, 0x3ff9d79f, - 0x4ee4a102, 0x3ff9c2d1, 0xea5510da, 0x3ff9ae24, - 0x9999999a, 0x3ff99999, 0x0d8ec0ff, 0x3ff9852f, - 0xf80cb872, 0x3ff970e4, 0x0be377ae, 0x3ff95cbb, - 0xfcd6e9e0, 0x3ff948b0, 0x7f9b2ce6, 0x3ff934c6, - 0x49d0e229, 0x3ff920fb, 0x120190d5, 0x3ff90d4f, - 0x8f9c18fa, 0x3ff8f9c1, 0x7af1373f, 0x3ff8e652, - 0x8d3018d3, 0x3ff8d301, 0x8062ff3a, 0x3ff8bfce, - 0x0f6bf3aa, 0x3ff8acb9, 0xf601899c, 0x3ff899c0, - 0xf0abb04a, 0x3ff886e5, 0xbcc092b9, 0x3ff87427, - 0x18618618, 0x3ff86186, 0xc2780614, 0x3ff84f00, - 0x7ab2bedd, 0x3ff83c97, 0x0182a4a0, 0x3ff82a4a, - 0x18181818, 0x3ff81818, 0x80601806, 0x3ff80601, - 0xfd017f40, 0x3ff7f405, 0x515a4f1d, 0x3ff7e225, - 0x417d05f4, 0x3ff7d05f, 0x922e017c, 0x3ff7beb3, - 0x08e0ecc3, 0x3ff7ad22, 0x6bb6398b, 0x3ff79baa, - 0x8178a4c8, 0x3ff78a4c, 0x119ac60d, 0x3ff77908, - 0xe434a9b1, 0x3ff767dc, 0xc201756d, 0x3ff756ca, - 0x745d1746, 
0x3ff745d1, 0xc541fe8d, 0x3ff734f0, - 0x7f46debc, 0x3ff72428, 0x6d9c7c09, 0x3ff71378, - 0x5c0b8170, 0x3ff702e0, 0x16f26017, 0x3ff6f260, - 0x6b4337c7, 0x3ff6e1f7, 0x2681c861, 0x3ff6d1a6, - 0x16c16c17, 0x3ff6c16c, 0x0aa31a3d, 0x3ff6b149, - 0xd1537290, 0x3ff6a13c, 0x3a88d0c0, 0x3ff69147, - 0x16816817, 0x3ff68168, 0x3601671a, 0x3ff6719f, - 0x6a5122f9, 0x3ff661ec, 0x853b4aa3, 0x3ff6524f, - 0x590b2164, 0x3ff642c8, 0xb88ac0de, 0x3ff63356, - 0x77016240, 0x3ff623fa, 0x6831ae94, 0x3ff614b3, - 0x60581606, 0x3ff60581, 0x34292dfc, 0x3ff5f664, - 0xb8d015e7, 0x3ff5e75b, 0xc3ece2a5, 0x3ff5d867, - 0x2b931057, 0x3ff5c988, 0xc647fa91, 0x3ff5babc, - 0x6b015ac0, 0x3ff5ac05, 0xf123ccaa, 0x3ff59d61, - 0x308158ed, 0x3ff58ed2, 0x01580560, 0x3ff58056, - 0x3c506b3a, 0x3ff571ed, 0xba7c52e2, 0x3ff56397, - 0x55555555, 0x3ff55555, 0xe6bb82fe, 0x3ff54725, - 0x48f40feb, 0x3ff53909, 0x56a8054b, 0x3ff52aff, - 0xeae2f815, 0x3ff51d07, 0xe111c4c5, 0x3ff50f22, - 0x15015015, 0x3ff50150, 0x62dd4c9b, 0x3ff4f38f, - 0xa72f0539, 0x3ff4e5e0, 0xbedc2c4c, 0x3ff4d843, - 0x8725af6e, 0x3ff4cab8, 0xdda68fe1, 0x3ff4bd3e, - 0xa052bf5b, 0x3ff4afd6, 0xad76014a, 0x3ff4a27f, - 0xe3b2d067, 0x3ff49539, 0x22014880, 0x3ff48805, - 0x47ae147b, 0x3ff47ae1, 0x34596066, 0x3ff46dce, - 0xc7f5cf9a, 0x3ff460cb, 0xe2c776ca, 0x3ff453d9, - 0x6562d9fb, 0x3ff446f8, 0x30abee4d, 0x3ff43a27, - 0x25d51f87, 0x3ff42d66, 0x265e5951, 0x3ff420b5, - 0x14141414, 0x3ff41414, 0xd10e6566, 0x3ff40782, - 0x3fb013fb, 0x3ff3fb01, 0x42a5af07, 0x3ff3ee8f, - 0xbce4a902, 0x3ff3e22c, 0x91aa75c6, 0x3ff3d5d9, - 0xa47babe7, 0x3ff3c995, 0xd9232955, 0x3ff3bd60, - 0x13b13b14, 0x3ff3b13b, 0x387ac822, 0x3ff3a524, - 0x2c187f63, 0x3ff3991c, 0xd366088e, 0x3ff38d22, - 0x13813814, 0x3ff38138, 0xd1c945ee, 0x3ff3755b, - 0xf3de0748, 0x3ff3698d, 0x5f9f2af8, 0x3ff35dce, - 0xfb2b78c1, 0x3ff3521c, 0xace01346, 0x3ff34679, - 0x5b57bcb2, 0x3ff33ae4, 0xed6a1dfa, 0x3ff32f5c, - 0x4a2b10bf, 0x3ff323e3, 0x58e9ebb6, 0x3ff31877, - 0x0130d190, 0x3ff30d19, 0x2ac40260, 0x3ff301c8, - 0xbda12f68, 
0x3ff2f684, 0xa1fed14b, 0x3ff2eb4e, - 0xc04b8097, 0x3ff2e025, 0x012d50a0, 0x3ff2d50a, - 0x4d812ca0, 0x3ff2c9fb, 0x8e5a3711, 0x3ff2bef9, - 0xad012b40, 0x3ff2b404, 0x92f3c105, 0x3ff2a91c, - 0x29e4129e, 0x3ff29e41, 0x5bb804a5, 0x3ff29372, - 0x1288b013, 0x3ff288b0, 0x38a1ce4d, 0x3ff27dfa, - 0xb8812735, 0x3ff27350, 0x7cd60127, 0x3ff268b3, - 0x708092f1, 0x3ff25e22, 0x7e9177b2, 0x3ff2539d, - 0x92492492, 0x3ff24924, 0x9717605b, 0x3ff23eb7, - 0x789abcdf, 0x3ff23456, 0x22a0122a, 0x3ff22a01, - 0x8121fb78, 0x3ff21fb7, 0x804855e6, 0x3ff21579, - 0x0c67c0d9, 0x3ff20b47, 0x12012012, 0x3ff20120, - 0x7dc11f70, 0x3ff1f704, 0x3c7fb84c, 0x3ff1ecf4, - 0x3b3fb874, 0x3ff1e2ef, 0x672e4abd, 0x3ff1d8f5, - 0xada2811d, 0x3ff1cf06, 0xfc1ce059, 0x3ff1c522, - 0x4046ed29, 0x3ff1bb4a, 0x67f2bae3, 0x3ff1b17c, - 0x611a7b96, 0x3ff1a7b9, 0x19e0119e, 0x3ff19e01, - 0x808ca29c, 0x3ff19453, 0x83902bdb, 0x3ff18ab0, - 0x11811812, 0x3ff18118, 0x191bd684, 0x3ff1778a, - 0x89427379, 0x3ff16e06, 0x50fc3201, 0x3ff1648d, - 0x5f75270d, 0x3ff15b1e, 0xa3fdd5c9, 0x3ff151b9, - 0x0e0acd3b, 0x3ff1485f, 0x8d344724, 0x3ff13f0e, - 0x1135c811, 0x3ff135c8, 0x89edc0ac, 0x3ff12c8b, - 0xe75d3033, 0x3ff12358, 0x19a74826, 0x3ff11a30, - 0x11111111, 0x3ff11111, 0xbe011080, 0x3ff107fb, - 0x10fef011, 0x3ff0fef0, 0xfab325a2, 0x3ff0f5ed, - 0x6be69c90, 0x3ff0ecf5, 0x55826011, 0x3ff0e406, - 0xa88f4696, 0x3ff0db20, 0x56359e3a, 0x3ff0d244, - 0x4fbcda3b, 0x3ff0c971, 0x868b4171, 0x3ff0c0a7, - 0xec259dc8, 0x3ff0b7e6, 0x722eecb5, 0x3ff0af2f, - 0x0a6810a7, 0x3ff0a681, 0xa6af8360, 0x3ff09ddb, - 0x39010954, 0x3ff0953f, 0xb37565e2, 0x3ff08cab, - 0x08421084, 0x3ff08421, 0x29b8eae2, 0x3ff07b9f, - 0x0a47f7c6, 0x3ff07326, 0x9c7912fb, 0x3ff06ab5, - 0xd2f1a9fc, 0x3ff0624d, 0xa0727586, 0x3ff059ee, - 0xf7d73404, 0x3ff05197, 0xcc1664c5, 0x3ff04949, - 0x10410410, 0x3ff04104, 0xb78247fc, 0x3ff038c6, - 0xb51f5e1a, 0x3ff03091, 0xfc7729e9, 0x3ff02864, - 0x81020408, 0x3ff02040, 0x36517a37, 0x3ff01824, - 0x10101010, 0x3ff01010, 0x02010080, 0x3ff00804, - 0x00000000, 
0x3ff00000, 0x87ec3637, 0x3dd06139, - 0x8db86d61, 0xbe138a4b, 0xe5bcc98d, 0x3e4f7b72, - 0x634fb0e4, 0xbe85f1f0, 0x954a9137, 0x3ebb9df2, - 0x3a67dc3c, 0xbeef4d1e, 0x1dd3898b, 0x3f1f9a32, - 0x3db06d60, 0xbf4c02db, 0xd0dbc1ad, 0x3f7565bc, - 0x31284e9c, 0xbf9b82ce, 0x1a042b32, 0x3fbce2f2, - 0x6b0379e6, 0xbfd81274, 0x50429b6d, 0x3ff20dd7, - 0x1438dcf6, 0x3dba1606, 0x016becfe, 0xbdff4342, - 0x75b4cde8, 0x3e395b1b, 0xf9c2a481, 0xbe71d468, - 0xb7a0966a, 0x3ea6ae6b, 0xfda1afb2, 0xbeda0df5, - 0xd9475b18, 0x3f0ac250, 0xc8a35a21, 0xbf384227, - 0xceb7b0e9, 0x3f631cd0, 0x054eefef, 0xbf89b3c1, - 0xbc5785f9, 0x3facf6d2, 0x451254ef, 0xbfcb5db0, - 0xa741088b, 0x3feaf767, 0x6a2dd61d, 0x3da4e37d, - 0xe1edf74c, 0xbde92a56, 0x4377a2ac, 0x3e2491d9, - 0x39f489a1, 0xbe5d38a6, 0x1dce54b4, 0x3e92d22f, - 0xc5671218, 0xbec5f6c1, 0x3a1bb9bd, 0x3ef70561, - 0x46c610de, 0xbf25712a, 0xbf0e3574, 0x3f518a17, - 0xdb5a4b51, 0xbf78ea53, 0x50e230dd, 0x3f9e962f, - 0x85d30895, 0xbfc0b60e, 0x95f8c2b3, 0x3fe5990d, - 0x45479566, 0x3d90d511, 0x59127e17, 0xbdd4667c, - 0x3e09cefe, 0x3e10d308, 0xa495656e, 0xbe482e4c, - 0xa05c8f4f, 0x3e7f9bcf, 0x66933f59, 0xbeb2ca0d, - 0xf3cce792, 0x3ee42cd2, 0x99c503dd, 0xbf136902, - 0x532c5193, 0x3f409c2f, 0xdf44f35e, 0xbf6931da, - 0x1aa68815, 0x3f911720, 0x1d221312, 0xbfb5e25e, - 0xd6c7e25c, 0x3fe235fd, 0xc24c08dd, 0x3d7b51ef, - 0x4c2104f1, 0xbdc0ab73, 0x866c02a1, 0x3dfbc79c, - 0x72027ff7, 0xbe343c2f, 0x21752c63, 0x3e6ae853, - 0x76229e21, 0xbea0593b, 0x32c6dda4, 0x3ed2106d, - 0xf8633468, 0xbf020fd5, 0xc5ffc392, 0x3f304e0c, - 0x0a3b61e7, 0xbf5ab23d, 0x9b1a01ca, 0x3f844016, - 0x759a9c41, 0xbfae8712, 0x142795e3, 0x3fdfd9ae, - 0xcee056b0, 0xbdf46eed, 0xbf31cd83, 0x3e272c71, - 0x3414494b, 0xbe3b858c, 0x150492a9, 0xbe51e878, - 0x4f9d39bd, 0xbe8d551c, 0x081cc18c, 0x3ec65597, - 0x3a3f547c, 0x3ef0e91e, 0x5416a98f, 0xbf30c9b6, - 0x4dfcdccd, 0x3f2428f4, 0xe4d81e00, 0x3f80b3bd, - 0x46178bfd, 0xbf8cb801, 0x675394d2, 0xbfc946a9, - 0x791dc8fb, 0x3fe6e23d, 0xd42d57ef, 0xbdb573c8, - 0xed1a27a8, 
0xbdf632bf, 0x6b1145de, 0x3e367087, - 0xbdc2436d, 0x3e2d7559, 0xa688c6ab, 0xbe9801bb, - 0xf4ea88f0, 0x3e8ad7ec, 0xf8bce95a, 0x3efc2cfa, - 0x2fa996ef, 0xbf1f0bdd, 0x22b44655, 0xbf4a8a14, - 0x66397f44, 0x3f7b13bf, 0xbd4c13b8, 0x3f82a08f, - 0x7a57ce71, 0xbfc9ce0d, 0x517103fe, 0x3fe05fd2, - 0x0e7c1431, 0x3dc14df8, 0xad2781f0, 0xbe00a71c, - 0x333bdac6, 0x3e05e0fa, 0x8ecd3732, 0x3e62222b, - 0x9d8f65fe, 0xbe84bcf8, 0xe8101ae5, 0xbec10eb4, - 0xc2c15462, 0x3ef46afc, 0x0af95711, 0x3efded8b, - 0x8fc77467, 0xbf50a3f2, 0x2397107c, 0x3f663734, - 0x2190992e, 0x3f97d6f6, 0x75be54c3, 0xbfc56874, - 0xacd147f5, 0x3fd4cb2b, 0x7100d5fc, 0x3db9806d, - 0xc3b2ca73, 0xbdd1537c, 0x60cdaaa7, 0xbe26c0f4, - 0x7eb23d7f, 0x3e5604f1, 0x4c3adb34, 0x3e7e86ae, - 0x9b1fe971, 0xbec1b7ba, 0xc100d516, 0x3eceefdc, - 0x7287d24a, 0x3f19c090, 0xf860db94, 0xbf4594c1, - 0x906b63ac, 0xbf48ab6e, 0xabea8a44, 0x3f9a798f, - 0x314c81f1, 0xbfbdca7c, 0x462cb19e, 0x3fc761d7, - 0xaa177cb2, 0xbd902cf0, 0x207e29b4, 0x3decb11b, - 0x5fac5489, 0xbe209b21, 0xb5bdf6e3, 0xbe32b95b, - 0xca4c97b7, 0x3e882b27, 0xc0a47266, 0xbeaa520c, - 0x67f1145d, 0xbedc5422, 0x3e7fc487, 0x3f161837, - 0x7eb92091, 0xbf27130d, 0x8d1e6d3f, 0xbf63a37e, - 0x8a1743b5, 0x3f9522fd, 0x334d0c36, 0xbfb1ada1, - 0xae493c1d, 0x3fb741a2, 0x3f800000, 0x00000000, - 0x3f804000, 0x3a28e585, 0x3f80a000, 0x399c910f, - 0x3f80e000, 0x3a703484, 0x3f814000, 0x3a0eb4bc, - 0x3f81a000, 0x392750df, 0x3f81e000, 0x3a419dc7, - 0x3f824000, 0x39ac3801, 0x3f828000, 0x3a675948, - 0x3f82e000, 0x39eabf9a, 0x3f834000, 0x356629d6, - 0x3f838000, 0x3a07f04c, 0x3f83e000, 0x3848dac3, - 0x3f842000, 0x3a0e1b17, 0x3f848000, 0x384a1cc7, - 0x3f84c000, 0x3a082ade, 0x3f852000, 0x363f31e5, - 0x3f856000, 0x39eccf0d, 0x3f85a000, 0x3a692c6f, - 0x3f860000, 0x39b22cb1, 0x3f864000, 0x3a462d87, - 0x3f86a000, 0x3941e864, 0x3f86e000, 0x3a180409, - 0x3f872000, 0x3a7cd32d, 0x3f878000, 0x39bdde6c, - 0x3f87c000, 0x3a3e5fb4, 0x3f882000, 0x38d960b3, - 0x3f886000, 0x39eab752, 0x3f88a000, 0x3a4cf599, - 0x3f890000, 
0x390803d1, 0x3f894000, 0x39e90955, - 0x3f898000, 0x3a44878c, 0x3f89e000, 0x38908271, - 0x3f8a2000, 0x39ba4b0f, 0x3f8a6000, 0x3a25cdb3, - 0x3f8aa000, 0x3a6c0f33, 0x3f8b0000, 0x393fc12e, - 0x3f8b4000, 0x39e2ee51, 0x3f8b8000, 0x3a30a9dd, - 0x3f8bc000, 0x3a6d8e61, 0x3f8c2000, 0x3920aa58, - 0x3f8c6000, 0x39c1088a, 0x3f8ca000, 0x3a16a120, - 0x3f8ce000, 0x3a4a86c1, 0x3f8d2000, 0x3a7c3aae, - 0x3f8d8000, 0x392f0952, 0x3f8dc000, 0x39b2461c, - 0x3f8e0000, 0x3a04621f, 0x3f8e4000, 0x3a2d84b8, - 0x3f8e8000, 0x3a548ff4, 0x3f8ec000, 0x3a7988db, - 0x3f8f2000, 0x38e3a30c, 0x3f8f6000, 0x39755daa, - 0x3f8fa000, 0x39b86d8a, 0x3f8fe000, 0x39f22e5e, - 0x3f902000, 0x3a13fd53, 0x3f906000, 0x3a2cedcc, - 0x3f90a000, 0x3a43ed23, 0x3f90e000, 0x3a58ffd0, - 0x3f912000, 0x3a6c2a3c, 0x3f916000, 0x3a7d70bf, - 0x3f91c000, 0x384d7a06, 0x3f920000, 0x38d318cf, - 0x3f924000, 0x39185d53, 0x3f928000, 0x393fe1b1, - 0x3f92c000, 0x396029b1, 0x3f930000, 0x3979454c, - 0x3f934000, 0x3985a221, 0x3f938000, 0x398b1b0d, - 0x3f93c000, 0x398d1515, 0x3f940000, 0x398b97c7, - 0x3f944000, 0x3986aa98, 0x3f948000, 0x397ca9c7, - 0x3f94c000, 0x39653bd8, 0x3f950000, 0x394719b5, - 0x3f954000, 0x39225182, 0x3f958000, 0x38ede264, - 0x3f95c000, 0x388a0d15, 0x3f960000, 0x3749f226, - 0x3f962000, 0x3a737219, 0x3f966000, 0x3a6223e3, - 0x3f96a000, 0x3a4f406c, 0x3f96e000, 0x3a3acaee, - 0x3f972000, 0x3a24c698, 0x3f976000, 0x3a0d368f, - 0x3f97a000, 0x39e83bdd, 0x3f97e000, 0x39b2ff8f, - 0x3f982000, 0x39757c89, 0x3f986000, 0x38fdf7dc, - 0x3f98a000, 0x3622482d, 0x3f98c000, 0x3a600bf3, - 0x3f990000, 0x3a3dfedf, 0x3f994000, 0x3a1a7de3, - 0x3f998000, 0x39eb17a4, 0x3f99c000, 0x399e56e3, - 0x3f9a0000, 0x391d7e03, 0x3f9a2000, 0x3a7e2ab7, - 0x3f9a6000, 0x3a538fc2, 0x3f9aa000, 0x3a279148, - 0x3f9ae000, 0x39f463ce, 0x3f9b2000, 0x3996e86c, - 0x3f9b6000, 0x38dad617, 0x3f9b8000, 0x3a69e815, - 0x3f9bc000, 0x3a371eac, 0x3f9c0000, 0x3a030100, - 0x3f9c4000, 0x399b2304, 0x3f9c8000, 0x38b694db, - 0x3f9ca000, 0x3a5ec6af, 0x3f9ce000, 0x3a257018, - 0x3f9d2000, 
0x39d5a259, 0x3f9d6000, 0x393bb0e7, - 0x3f9d8000, 0x3a71c388, 0x3f9dc000, 0x3a335958, - 0x3f9e0000, 0x39e75fcb, 0x3f9e4000, 0x394b2590, - 0x3f9e6000, 0x3a70a802, 0x3f9ea000, 0x3a2d4de7, - 0x3f9ee000, 0x39d17a6c, 0x3f9f2000, 0x390be02b, - 0x3f9f4000, 0x3a5c007c, 0x3f9f8000, 0x3a13d899, - 0x3f9fc000, 0x399504dc, 0x3fa00000, 0x00000000, - 0x3fa02000, 0x3a34534e, 0x3fa06000, 0x39cefca8, - 0x3fa0a000, 0x38cc1828, 0x3fa0c000, 0x3a4a6352, - 0x3fa10000, 0x39f4424a, 0x3fa14000, 0x3922f98d, - 0x3f214000, 0x38a2f98d, 0x3f4b2000, 0x397f529f, - 0x3f800000, 0x00000000, 0x3fa14000, 0x3922f98d, - 0x3fcb2000, 0x39ff529f, 0x3f800000, 0x00000000, - 0x3f816000, 0x391a3e77, 0x3f82c000, 0x39d8698a, - 0x3f842000, 0x3a51461d, 0x3f85a000, 0x39ac367c, - 0x3f870000, 0x3a7b0cb4, 0x3f888000, 0x3a407404, - 0x3f8a0000, 0x3a26abaa, 0x3f8b8000, 0x3a2e0f1f, - 0x3f8d0000, 0x3a56fadb, 0x3f8ea000, 0x39073168, - 0x3f902000, 0x3a0ee218, 0x3f91c000, 0x38f4dcea, - 0x3f934000, 0x3a515978, 0x3f94e000, 0x3a277d47, - 0x3f968000, 0x3a2169b9, 0x3f982000, 0x3a3f828c, - 0x3f99e000, 0x370b2641, 0x3f9b8000, 0x39d39b9d, - 0x3f9d2000, 0x3a76cd39, 0x3f9ee000, 0x3a299304, - 0x3fa0a000, 0x3a02887d, 0x3fa26000, 0x3a021818, - 0x3fa42000, 0x3a28ad70, 0x3fa5e000, 0x3a76b54d, - 0x3fa7c000, 0x39d93b4e, 0x3fa9a000, 0x382d5a75, - 0x3fab6000, 0x3a51cdad, 0x3fad4000, 0x3a41f752, - 0x3faf2000, 0x3a5bc56b, 0x3fb12000, 0x38fd6074, - 0x3fb30000, 0x3a0e2095, 0x3fb50000, 0x391e667f, - 0x3fb6e000, 0x3a6c8f19, 0x3fb8e000, 0x3a5d7a3b, - 0x3fbae000, 0x3a7ad590, 0x3fbd0000, 0x398a39f5, - 0x3fbf0000, 0x3a3ccdb3, 0x3fc12000, 0x39c4cca6, - 0x3fc34000, 0x39599b44, 0x3fc56000, 0x3965422a, - 0x3fc78000, 0x39d74c8a, 0x3fc9a000, 0x3a4dec33, - 0x3fcbe000, 0x39c14fef, 0x3fce2000, 0x391182a3, - 0x3fd06000, 0x38ccf6bb, 0x3fd2a000, 0x3981d91f, - 0x3fd4e000, 0x3a1ad55e, 0x3fd74000, 0x391f995a, - 0x3fd98000, 0x3a68ae13, 0x3fdbe000, 0x3a5dbcbe, - 0x3fde6000, 0x37f4825e, 0x3fe0c000, 0x39cdeec2, - 0x3fe32000, 0x3a7c4b95, 0x3fe5a000, 0x3a48373b, - 0x3fe82000, 
0x3a4b5281, 0x3feac000, 0x37c6e7dd, - 0x3fed4000, 0x39f301ed, 0x3fefe000, 0x3917337b, - 0x3ff28000, 0x383b9e2c, 0x3ff52000, 0x392fa2a4, - 0x3ff7c000, 0x3a06fb98, 0x3ffa8000, 0x38ecb6dc, - 0x3ffd2000, 0x3a706067, 0x40000000, 0x00000000, - 0x00000000, 0x00000000, 0x3c37c000, 0x374a16dd, - 0x3cb70000, 0x37f2d0b8, 0x3d08c000, 0x381a3aa2, - 0x3d35c000, 0x37b4dd63, 0x3d624000, 0x383f5721, - 0x3d874000, 0x384e27e8, 0x3d9d4000, 0x380bf749, - 0x3db30000, 0x387dbeb2, 0x3dc8c000, 0x37216e46, - 0x3dde4000, 0x3684815b, 0x3df38000, 0x383b045f, - 0x3e044000, 0x390b119b, 0x3e0ec000, 0x391a32ea, - 0x3e194000, 0x38ba789e, 0x3e238000, 0x39553f30, - 0x3e2e0000, 0x3651cfde, 0x3e380000, 0x39685a9d, - 0x3e424000, 0x39057a05, 0x3e4c4000, 0x395ba0ef, - 0x3e564000, 0x396bc5b6, 0x3e604000, 0x3936d9bb, - 0x3e6a4000, 0x38772619, 0x3e740000, 0x39017ce9, - 0x3e7dc000, 0x3902d720, 0x3e83c000, 0x38856dd8, - 0x3e888000, 0x3941f6b4, 0x3e8d4000, 0x3980b652, - 0x3e920000, 0x3980f561, 0x3e96c000, 0x39443f13, - 0x3e9b8000, 0x38926752, 0x3ea00000, 0x39c8c763, - 0x3ea4c000, 0x391e12f3, 0x3ea94000, 0x39b7bf89, - 0x3eae0000, 0x36d1cfde, 0x3eb28000, 0x38c7f233, - 0x3eb70000, 0x39087367, 0x3ebb8000, 0x38e95d3f, - 0x3ec00000, 0x38256316, 0x3ec44000, 0x39d38e5c, - 0x3ec8c000, 0x396ea247, 0x3ecd4000, 0x350e4788, - 0x3ed18000, 0x395d829f, 0x3ed5c000, 0x39c30f2f, - 0x3eda0000, 0x39fd7ee7, 0x3ede8000, 0x3872e9e7, - 0x3ee2c000, 0x3897d694, 0x3ee70000, 0x3824923a, - 0x3eeb0000, 0x39ea7c06, 0x3eef4000, 0x39a7fa88, - 0x3ef38000, 0x391aa879, 0x3ef78000, 0x39dace65, - 0x3efbc000, 0x39215a32, 0x3effc000, 0x39af3350, - 0x3f01c000, 0x3a7b5172, 0x3f040000, 0x389cf27f, - 0x3f060000, 0x3902806b, 0x3f080000, 0x3909d8a9, - 0x3f0a0000, 0x38c9faa1, 0x3f0c0000, 0x37a33dca, - 0x3f0dc000, 0x3a6623d2, 0x3f0fc000, 0x3a3c7a61, - 0x3f11c000, 0x3a083a84, 0x3f13c000, 0x39930161, - 0x3f15c000, 0x35d1cfde, 0x3f178000, 0x3a2d0ebd, - 0x3f198000, 0x399f1aad, 0x3f1b4000, 0x3a67ff6d, - 0x3f1d4000, 0x39ecfea8, 0x3f1f0000, 0x3a7b26f3, - 0x3f210000, 
0x39ec1fa6, 0x3f22c000, 0x3a675314, - 0x3f24c000, 0x399e12f3, 0x3f268000, 0x3a2d4b66, - 0x3f288000, 0x370c3845, 0x3f2a4000, 0x399ba329, - 0x3f2c0000, 0x3a1044d3, 0x3f2dc000, 0x3a49a196, - 0x3f2f8000, 0x3a79fe83, 0x3f318000, 0x3905c7aa, - 0x3f334000, 0x39802391, 0x3f350000, 0x39abe796, - 0x3f36c000, 0x39c65a9d, 0x3f388000, 0x39cfa6c5, - 0x3f3a4000, 0x39c7f593, 0x3f3c0000, 0x39af6ff7, - 0x3f3dc000, 0x39863e4d, 0x3f3f8000, 0x391910c1, - 0x3f414000, 0x369d5be7, 0x3f42c000, 0x3a541616, - 0x3f448000, 0x3a1ee960, 0x3f464000, 0x39c38ed2, - 0x3f480000, 0x38e61600, 0x3f498000, 0x3a4fedb4, - 0x3f4b4000, 0x39f6b4ab, 0x3f4d0000, 0x38f8d3b0, - 0x3f4e8000, 0x3a3b3faa, 0x3f504000, 0x399fb693, - 0x3f51c000, 0x3a5cfe71, 0x3f538000, 0x39c5740b, - 0x3f550000, 0x3a611eb0, 0x3f56c000, 0x39b079c4, - 0x3f584000, 0x3a4824d7, 0x3f5a0000, 0x39439a54, - 0x3f5b8000, 0x3a1291ea, 0x3f5d0000, 0x3a6d3673, - 0x3f5ec000, 0x3981c731, 0x3f604000, 0x3a0da88f, - 0x3f61c000, 0x3a53945c, 0x3f638000, 0x3895ae91, - 0x3f650000, 0x3996372a, 0x3f668000, 0x39f9a832, - 0x3f680000, 0x3a27eda4, 0x3f698000, 0x3a4c764f, - 0x3f6b0000, 0x3a6a7c06, 0x3f6cc000, 0x370321eb, - 0x3f6e4000, 0x3899ab3f, 0x3f6fc000, 0x38f02086, - 0x3f714000, 0x390a1707, 0x3f72c000, 0x39031e44, - 0x3f744000, 0x38c6b362, 0x3f75c000, 0x382bf195, - 0x3f770000, 0x3a768e36, 0x3f788000, 0x3a5c503b, - 0x3f7a0000, 0x3a3c1179, 0x3f7b8000, 0x3a15de1d, - 0x3f7d0000, 0x39d3845d, 0x3f7e8000, 0x395f263f, - 0x3f800000, 0x00000000, 0x00000000, 0x00000000, - 0x3b5d4000, 0x367a8e44, 0x3bdc8000, 0x368ed49f, - 0x3c24c000, 0x36c21451, 0x3c5ac000, 0x375211d6, - 0x3c884000, 0x3720ea11, 0x3ca2c000, 0x37e9eb59, - 0x3cbd4000, 0x37b87be7, 0x3cd78000, 0x37bf2560, - 0x3cf1c000, 0x33d597a0, 0x3d05c000, 0x37806a05, - 0x3d128000, 0x3820581f, 0x3d1f4000, 0x38223334, - 0x3d2c0000, 0x378e3bac, 0x3d388000, 0x3810684f, - 0x3d450000, 0x37feb7ae, 0x3d518000, 0x36a9d609, - 0x3d5dc000, 0x37a68163, 0x3d6a0000, 0x376a8b27, - 0x3d760000, 0x384c8fd6, 0x3d810000, 0x3885183e, - 0x3d870000, 
0x3874a760, 0x3d8d0000, 0x380d1154, - 0x3d92c000, 0x38ea42bd, 0x3d98c000, 0x384c1571, - 0x3d9e8000, 0x38ba66b8, 0x3da44000, 0x38e7da3b, - 0x3daa0000, 0x38eee632, 0x3dafc000, 0x38d00911, - 0x3db58000, 0x388bbede, 0x3dbb4000, 0x378a0512, - 0x3dc0c000, 0x3894c7a0, 0x3dc64000, 0x38e30710, - 0x3dcc0000, 0x36db2829, 0x3dd18000, 0x3729d609, - 0x3dd6c000, 0x38fa0e82, 0x3ddc4000, 0x38bc9a75, - 0x3de1c000, 0x383a9297, 0x3de70000, 0x38dc83c8, - 0x3dec8000, 0x37eac335, 0x3df1c000, 0x38706ac3, - 0x3df70000, 0x389574c2, 0x3dfc4000, 0x3892d068, - 0x3e00c000, 0x38615032, 0x3e034000, 0x3917acf4, - 0x3e05c000, 0x3967a126, 0x3e088000, 0x38217840, - 0x3e0b0000, 0x38b420ab, 0x3e0d8000, 0x38f9c7b2, - 0x3e100000, 0x391103bd, 0x3e128000, 0x39169a6b, - 0x3e150000, 0x390dd194, 0x3e178000, 0x38eda471, - 0x3e1a0000, 0x38a38950, 0x3e1c8000, 0x37f6844a, - 0x3e1ec000, 0x395e1cdb, 0x3e214000, 0x390fcffc, - 0x3e23c000, 0x38503e9d, 0x3e260000, 0x394b00fd, - 0x3e288000, 0x38a9910a, 0x3e2ac000, 0x39518a31, - 0x3e2d4000, 0x3882d2c2, 0x3e2f8000, 0x392488e4, - 0x3e31c000, 0x397b0aff, 0x3e344000, 0x388a22d8, - 0x3e368000, 0x3902bd5e, 0x3e38c000, 0x39342f85, - 0x3e3b0000, 0x39598811, 0x3e3d4000, 0x3972e6b1, - 0x3e3fc000, 0x34d53654, 0x3e420000, 0x360ca25e, - 0x3e440000, 0x39785cc0, 0x3e464000, 0x39630710, - 0x3e488000, 0x39424ed7, 0x3e4ac000, 0x39165101, - 0x3e4d0000, 0x38be5421, 0x3e4f4000, 0x37e7b0c0, - 0x3e514000, 0x394fd0c3, 0x3e538000, 0x38efaaaa, - 0x3e55c000, 0x37a8f566, 0x3e57c000, 0x3927c744, - 0x3e5a0000, 0x383fa4d5, 0x3e5c0000, 0x392d9e39, - 0x3e5e4000, 0x3803feae, 0x3e604000, 0x390a268c, - 0x3e624000, 0x39692b80, 0x3e648000, 0x38789b4f, - 0x3e668000, 0x3909307d, 0x3e688000, 0x394a601c, - 0x3e6ac000, 0x35e67edc, 0x3e6cc000, 0x383e386d, - 0x3e6ec000, 0x38a7743d, 0x3e70c000, 0x38dccec3, - 0x3e72c000, 0x38ff57e0, 0x3e74c000, 0x39079d8b, - 0x3e76c000, 0x390651a6, 0x3e78c000, 0x38f7bad9, - 0x3e7ac000, 0x38d0ab82, 0x3e7cc000, 0x38979e7d, - 0x3e7ec000, 0x381978ee, 0x3e804000, 0x397816c8, - 0x3e814000, 
0x39410cb2, 0x3e824000, 0x39015384, - 0x3e834000, 0x3863fa28, 0x3e840000, 0x39f41065, - 0x3e850000, 0x39c7668a, 0x3e860000, 0x39968afa, - 0x3e870000, 0x39430db9, 0x3e880000, 0x38a18cf3, - 0x3e88c000, 0x39eb2907, 0x3e89c000, 0x39a9e10c, - 0x3e8ac000, 0x39492800, 0x3e8bc000, 0x385a53d1, - 0x3e8c8000, 0x39ce0cf7, 0x3e8d8000, 0x3979c7b2, - 0x3e8e8000, 0x389f5d99, 0x3e8f4000, 0x39ceefcb, - 0x3e904000, 0x39646a39, 0x3e914000, 0x380d7a9b, - 0x3e920000, 0x39ad6650, 0x3e930000, 0x390ac3b8, - 0x3e93c000, 0x39d9a9a8, 0x3e94c000, 0x39548a99, - 0x3e958000, 0x39f73c4b, 0x3e968000, 0x3980960e, - 0x3e978000, 0x374b3d5a, 0x3e984000, 0x39888f1e, - 0x3e994000, 0x37679a07, 0x3e9a0000, 0x39826a13, - 0x00000000, 0x00000000, 0x3bff0000, 0x3429ac41, - 0x3c7e0000, 0x35a8b0fc, 0x3cbdc000, 0x368d83ea, - 0x3cfc1000, 0x361b0e78, 0x3d1cf000, 0x3687b9fe, - 0x3d3ba000, 0x3631ec65, 0x3d5a1000, 0x36dd7119, - 0x3d785000, 0x35c30045, 0x3d8b2000, 0x379b7751, - 0x3d9a0000, 0x37ebcb0d, 0x3da8d000, 0x37839f83, - 0x3db78000, 0x37528ae5, 0x3dc61000, 0x37a2eb18, - 0x3dd49000, 0x36da7495, 0x3de2f000, 0x36a91eb7, - 0x3df13000, 0x3783b715, 0x3dff6000, 0x371131db, - 0x3e06b000, 0x383f3e68, 0x3e0db000, 0x38156a97, - 0x3e14a000, 0x38297c0f, 0x3e1b8000, 0x387e100f, - 0x3e226000, 0x3815b665, 0x3e293000, 0x37e5e3a1, - 0x3e2ff000, 0x38183853, 0x3e36b000, 0x35fe719d, - 0x3e3d5000, 0x38448108, 0x3e43f000, 0x38503290, - 0x3e4a9000, 0x373539e8, 0x3e511000, 0x385e0ff1, - 0x3e579000, 0x3864a740, 0x3e5e1000, 0x3786742d, - 0x3e647000, 0x387be3cd, 0x3e6ae000, 0x3685ad3e, - 0x3e713000, 0x3803b715, 0x3e778000, 0x37adcbdc, - 0x3e7dc000, 0x380c36af, 0x3e820000, 0x371652d3, - 0x3e851000, 0x38927139, 0x3e882000, 0x38c5fcd7, - 0x3e8b3000, 0x38ae55d5, 0x3e8e4000, 0x3818c169, - 0x3e914000, 0x38a0fde7, 0x3e944000, 0x38ad09ef, - 0x3e974000, 0x3862bae1, 0x3e9a3000, 0x38eecd4c, - 0x3e9d3000, 0x3798aad2, 0x3ea02000, 0x37421a1a, - 0x3ea30000, 0x38c5e10e, 0x3ea5f000, 0x37bf2aee, - 0x3ea8d000, 0x382d872d, 0x3eabb000, 0x37ee2e8a, - 0x3eae8000, 
0x38dedfac, 0x3eb16000, 0x3802f2b9, - 0x3eb43000, 0x38481e9b, 0x3eb70000, 0x380eaa2b, - 0x3eb9c000, 0x38ebfb5d, 0x3ebc9000, 0x38255fdd, - 0x3ebf5000, 0x38783b82, 0x3ec21000, 0x3851da1e, - 0x3ec4d000, 0x374e1b05, 0x3ec78000, 0x388f439b, - 0x3eca3000, 0x38ca0e10, 0x3ecce000, 0x38cac08b, - 0x3ecf9000, 0x3891f65f, 0x3ed24000, 0x378121cb, - 0x3ed4e000, 0x386c9a9a, 0x3ed78000, 0x38949923, - 0x3eda2000, 0x38777bcc, 0x3edcc000, 0x37b12d26, - 0x3edf5000, 0x38a6ced3, 0x3ee1e000, 0x38ebd3e6, - 0x3ee47000, 0x38fbe3cd, 0x3ee70000, 0x38d785c2, - 0x3ee99000, 0x387e7e00, 0x3eec1000, 0x38f392c5, - 0x3eeea000, 0x37d40983, 0x3ef12000, 0x38081a7c, - 0x3ef3a000, 0x3784c3ad, 0x3ef61000, 0x38cce923, - 0x3ef89000, 0x380f5faf, 0x3efb0000, 0x3891fd38, - 0x3efd7000, 0x38ac47bc, 0x3effe000, 0x3897042b, - 0x3f012000, 0x392952d2, 0x3f025000, 0x396fced4, - 0x3f039000, 0x37f97073, 0x3f04c000, 0x385e9eae, - 0x3f05f000, 0x3865c84a, 0x3f072000, 0x38130ba3, - 0x3f084000, 0x3979cf16, 0x3f097000, 0x3938cac9, - 0x3f0aa000, 0x38c3d2f4, 0x3f0bc000, 0x39755dec, - 0x3f0cf000, 0x38e6b467, 0x3f0e1000, 0x395c0fb8, - 0x3f0f4000, 0x383ebce0, 0x3f106000, 0x38dcd192, - 0x3f118000, 0x39186bdf, 0x3f12a000, 0x392de74c, - 0x3f13c000, 0x392f0944, 0x3f14e000, 0x391bff61, - 0x3f160000, 0x38e9ed44, 0x3f172000, 0x38686dc8, - 0x3f183000, 0x396b99a7, 0x3f195000, 0x39099c89, - 0x3f1a7000, 0x37a27673, 0x3f1b8000, 0x390bdaa3, - 0x3f1c9000, 0x397069ab, 0x3f1db000, 0x388449ff, - 0x3f1ec000, 0x39013538, 0x3f1fd000, 0x392dc268, - 0x3f20e000, 0x3947f423, 0x3f21f000, 0x394ff17c, - 0x3f230000, 0x3945e10e, 0x3f241000, 0x3929e8f5, - 0x3f252000, 0x38f85db0, 0x3f263000, 0x38735f99, - 0x3f273000, 0x396c08db, 0x3f284000, 0x3909e600, - 0x3f295000, 0x37b4996f, 0x3f2a5000, 0x391233cc, - 0x3f2b5000, 0x397cead9, 0x3f2c6000, 0x38adb5cd, - 0x3f2d6000, 0x3920261a, 0x3f2e6000, 0x3958ee36, - 0x3f2f7000, 0x35aa4905, 0x3f307000, 0x37cbd11e, - 0x3f317000, 0x3805fdf4, 0x40000000, 0x00000000, - 0x3ffe0000, 0x38fe03f8, 0x3ffc0000, 0x39fc0fc1, - 0x3ffa0000, 
0x3a8cb3c9, 0x3ff80000, 0x3af83e10, - 0x3ff60000, 0x3b407b30, 0x3ff40000, 0x3b898d60, - 0x3ff20000, 0x3bb9d648, 0x3ff00000, 0x3bf0f0f1, - 0x3fef0000, 0x3abadc7f, 0x3fed0000, 0x3b66076c, - 0x3feb0000, 0x3bbdb2a6, 0x3fea0000, 0x39ea0ea1, - 0x3fe80000, 0x3b4b58f7, 0x3fe60000, 0x3bc2b448, - 0x3fe50000, 0x3a9660ac, 0x3fe30000, 0x3b8e38e4, - 0x3fe10000, 0x3bfc780e, 0x3fe00000, 0x3b607038, - 0x3fde0000, 0x3be95c4d, 0x3fdd0000, 0x3b4f914c, - 0x3fdb0000, 0x3beb61ef, 0x3fda0000, 0x3b681b4f, - 0x3fd90000, 0x385901b2, 0x3fd70000, 0x3b9435e5, - 0x3fd60000, 0x3aae0359, 0x3fd40000, 0x3bc77b03, - 0x3fd30000, 0x3b501a6d, 0x3fd20000, 0x39d20d21, - 0x3fd00000, 0x3bb69fcc, 0x3fcf0000, 0x3b48e951, - 0x3fce0000, 0x3a3453b9, 0x3fcc0000, 0x3bcccccd, - 0x3fcb0000, 0x3b8727c0, 0x3fca0000, 0x3b0b0fcd, - 0x3fc90000, 0x397b49d1, 0x3fc70000, 0x3bce0c7d, - 0x3fc60000, 0x3b980c6a, 0x3fc50000, 0x3b4b90f7, - 0x3fc40000, 0x3adcbe15, 0x3fc30000, 0x39c30c31, - 0x3fc10000, 0x3be4bbd6, 0x3fc00000, 0x3bc0c0c1, - 0x3fbf0000, 0x3ba02fe8, 0x3fbe0000, 0x3b82fa0c, - 0x3fbd0000, 0x3b52208e, 0x3fbc0000, 0x3b24c818, - 0x3fbb0000, 0x3afb9c87, 0x3fba0000, 0x3aba2e8c, - 0x3fb90000, 0x3a850fe9, 0x3fb80000, 0x3a381703, - 0x3fb70000, 0x39fbb5a2, 0x3fb60000, 0x39b60b61, - 0x3fb50000, 0x399e68aa, 0x3fb40000, 0x39b40b41, - 0x3fb30000, 0x39f63529, 0x3fb20000, 0x3a321643, - 0x3fb10000, 0x3a7e9dc0, 0x3fb00000, 0x3ab02c0b, - 0x3faf0000, 0x3aeb771a, 0x3fae0000, 0x3b1882b9, - 0x3fad0000, 0x3b4056b0, 0x3fac0000, 0x3b6d2308, - 0x3fab0000, 0x3b8f69e3, 0x3faa0000, 0x3baaaaab, - 0x3fa90000, 0x3bc84a48, 0x3fa80000, 0x3be83f57, - 0x3fa80000, 0x39a80a81, 0x3fa70000, 0x3abc14e6, - 0x3fa60000, 0x3b2b8872, 0x3fa50000, 0x3b7d6a05, - 0x3fa40000, 0x3ba9cf1e, 0x3fa30000, 0x3bd70a3d, - 0x3fa30000, 0x394bc7f6, 0x3fa20000, 0x3adf0cac, - 0x3fa10000, 0x3b56625d, 0x3fa00000, 0x3ba0a0a1, - 0x3f9f0000, 0x3bd809fe, 0x3f9f0000, 0x3a0b2f39, - 0x3f9e0000, 0x3b195a48, 0x3f9d0000, 0x3b89d89e, - 0x3f9c0000, 0x3bc8e161, 0x3f9c0000, 0x399c09c1, - 0x3f9b0000, 
0x3b18df3e, 0x3f9a0000, 0x3b90e7d9, - 0x3f990000, 0x3bd722db, 0x3f990000, 0x3a78d28b, - 0x3f980000, 0x3b519013, 0x3f970000, 0x3bb425ed, - 0x3f970000, 0x3817012e, 0x3f960000, 0x3b1fb4d8, - 0x3f950000, 0x3ba02568, 0x3f940000, 0x3bf2094f, - 0x3f940000, 0x3b0b0129, 0x3f930000, 0x3b9a85c4, - 0x3f920000, 0x3bf11384, 0x3f920000, 0x3b124925, - 0x3f910000, 0x3ba2b3c5, 0x3f900000, 0x3bfdbc09, - 0x3f900000, 0x3b3470c6, 0x3f8f0000, 0x3bb823ee, - 0x3f8f0000, 0x3a3bced0, 0x3f8e0000, 0x3b706ada, - 0x3f8d0000, 0x3bda5202, 0x3f8d0000, 0x3af72c23, - 0x3f8c0000, 0x3ba29c04, 0x3f8c0000, 0x398c08c1, - 0x3f8b0000, 0x3b606894, 0x3f8a0000, 0x3bd8f2fc, - 0x3f8a0000, 0x3b05f0e1, 0x3f890000, 0x3bae408a, - 0x3f890000, 0x3a5639d7, 0x3f880000, 0x3b888889, - 0x3f870000, 0x3bf78088, 0x3f870000, 0x3b4f56be, - 0x3f860000, 0x3bd90544, 0x3f860000, 0x3b1714fc, - 0x3f850000, 0x3bbf3761, 0x3f850000, 0x3ad0214d, - 0x3f840000, 0x3ba9f9c8, 0x3f840000, 0x3a842108, - 0x3f830000, 0x3b993052, 0x3f830000, 0x3a1374bc, - 0x3f820000, 0x3b8cbfbf, 0x3f820000, 0x39820821, - 0x3f810000, 0x3b848da9, 0x3f810000, 0x38810204, - 0x3f800000, 0x3b808081, 0x3f800000, 0x00000000, - 0x00000000, 0x3f800000, 0x3f966cfe, 0x3fc583ab, - 0x40681e7b, 0x4070c7d0, 0x41204937, 0x41211525, - 0x41da51c0, 0x41da7743, 0x4294680b, 0x42946b7e, - 0x4349b691, 0x4349b734, 0x4409143b, 0x4409144a, - 0x44ba4f53, 0x44ba4f55, 0x457d38ac, 0x457d38ac, - 0x462c14ee, 0x462c14ef, 0x46e9e224, 0x46e9e224, - 0x479ef0b3, 0x479ef0b3, 0x485805ad, 0x485805ad, - 0x4912cd62, 0x4912cd62, 0x49c78665, 0x49c78665, - 0x4a87975f, 0x4a87975f, 0x4b3849a4, 0x4b3849a4, - 0x4bfa7910, 0x4bfa7910, 0x4caa36c8, 0x4caa36c8, - 0x4d675844, 0x4d675844, 0x4e1d3710, 0x4e1d3710, - 0x4ed5ad6e, 0x4ed5ad6e, 0x4f91357a, 0x4f91357a, - 0x50455bfe, 0x50455bfe, 0x51061e9d, 0x51061e9d, - 0x51b64993, 0x51b64993, 0x5277c118, 0x5277c118, - 0x53285dd2, 0x53285dd2, 0x53e4d572, 0x53e4d572, - 0x549b8238, 0x549b8238, 0x55535bb3, 0x55535bb3, - 0x560fa1fe, 0x560fa1fe, 0x56c3379a, 0x56c3379a, - 0x5784a9f1, 
0x5784a9f1, 0x58344f11, 0x58344f11, - 0x58f510ad, 0x58f510ad, 0x3d7faade, 0x3d87ccf5, - 0x3d8fc36e, 0x3d97b8ca, 0x3d9facf8, 0x3da79feb, - 0x3daf9192, 0x3db781df, 0x3dbf70c1, 0x3dc75e2a, - 0x3dcf4a0b, 0x3dd73454, 0x3ddf1cf6, 0x3de703e3, - 0x3deee90c, 0x3df6cc61, 0x3dfeadd5, 0x3e0346ac, - 0x3e07356e, 0x3e0b232a, 0x3e0f0fd8, 0x3e12fb71, - 0x3e16e5ee, 0x3e1acf47, 0x3e1eb777, 0x3e229e76, - 0x3e26843d, 0x3e2a68c6, 0x3e2e4c09, 0x3e322e00, - 0x3e360ea4, 0x3e39edef, 0x3e3dcbda, 0x3e41a85f, - 0x3e458377, 0x3e495d1c, 0x3e4d3547, 0x3e510bf3, - 0x3e54e119, 0x3e58b4b3, 0x3e5c86bb, 0x3e60572a, - 0x3e6425fc, 0x3e67f32a, 0x3e6bbeaf, 0x3e6f8884, - 0x3e7350a4, 0x3e77170a, 0x3e7adbb0, 0x3e7e9e90, - 0x3e812fd3, 0x3e830f75, 0x3e84ee2d, 0x3e86cbf7, - 0x3e88a8d2, 0x3e8a84ba, 0x3e8c5fad, 0x3e8e39a9, - 0x3e9012ab, 0x3e91eab1, 0x3e93c1b9, 0x3e9597c0, - 0x3e976cc4, 0x3e9940c2, 0x3e9b13ba, 0x3e9ce5a7, - 0x3e9eb689, 0x3ea0865d, 0x3ea25522, 0x3ea422d4, - 0x3ea5ef73, 0x3ea7bafc, 0x3ea9856d, 0x3eab4ec4, - 0x3ead1701, 0x3eaede20, 0x3eb0a420, 0x3eb26900, - 0x3eb42cbd, 0x3eb5ef56, 0x3eb7b0ca, 0x3eb97117, - 0x3ebb303b, 0x3ebcee34, 0x3ebeab02, 0x3ec066a3, - 0x3ec22116, 0x3ec3da58, 0x3ec5926a, 0x3ec74949, - 0x3ec8fef4, 0x3ecab36a, 0x3ecc66aa, 0x3ece18b3, - 0x3ecfc983, 0x3ed1791a, 0x3ed32776, 0x3ed4d497, - 0x3ed6807b, 0x3ed82b21, 0x3ed9d489, 0x3edb7cb1, - 0x3edd239a, 0x3edec941, 0x3ee06da6, 0x3ee210c9, - 0x3ee3b2a8, 0x3ee55344, 0x3ee6f29a, 0x3ee890ab, - 0x3eea2d76, 0x3eebc8fb, 0x3eed6338, 0x3eeefc2e, - 0x3ef093db, 0x3ef22a40, 0x3ef3bf5c, 0x3ef5532e, - 0x3ef6e5b7, 0x3ef876f5, 0x3efa06e8, 0x3efb9591, - 0x3efd22ef, 0x3efeaf01, 0x3f001ce4, 0x3f00e1a1, - 0x3f01a5b8, 0x3f02692a, 0x3f032bf5, 0x3f03ee1a, - 0x3f04af98, 0x3f057071, 0x3f0630a3, 0x3f06f02f, - 0x3f07af14, 0x3f086d54, 0x3f092aed, 0x3f09e7e0, - 0x3f0aa42d, 0x3f0b5fd3, 0x3f0c1ad4, 0x3f0cd52f, - 0x3f0d8ee4, 0x3f0e47f4, 0x3f0f005d, 0x3f0fb822, - 0x3f106f41, 0x3f1125ba, 0x3f11db8f, 0x3f1290bf, - 0x3f13454a, 0x3f13f931, 0x3f14ac73, 0x3f155f11, - 0x3f16110b, 
0x3f16c261, 0x3f177314, 0x3f182324, - 0x3f18d290, 0x3f19815a, 0x3f1a2f81, 0x3f1add06, - 0x3f1b89e8, 0x3f1c3629, 0x3f1ce1c9, 0x3f1d8cc7, - 0x3f1e3725, 0x3f1ee0e1, 0x3f1f89fe, 0x3f20327a, - 0x3f20da57, 0x3f218194, 0x3f222833, 0x3f22ce33, - 0x3f237394, 0x3f241857, 0x3f24bc7d, 0x3f256006, - 0x3f2602f1, 0x3f26a540, 0x3f2746f3, 0x3f27e80a, - 0x3f288885, 0x3f292866, 0x3f29c7ac, 0x3f2a6658, - 0x3f2b0469, 0x3f2ba1e2, 0x3f2c3ec1, 0x3f2cdb08, - 0x3f2d76b6, 0x3f2e11cd, 0x3f2eac4c, 0x3f2f4635, - 0x3f2fdf87, 0x3f307842, 0x3f311069, 0x3f31a7fa, - 0x3f323ef6, 0x3f32d55e, 0x3f336b32, 0x3f340072, - 0x3f349520, 0x3f35293b, 0x3f35bcc5, 0x3f364fbc, - 0x3f36e223, 0x3f3773f9, 0x3f38053e, 0x3f3895f4, - 0x3f39261b, 0x3f39b5b3, 0x3f3a44bc, 0x3f3ad338, - 0x3f3b6127, 0x3f3bee89, 0x3f3c7b5e, 0x3f3d07a7, - 0x3f3d9365, 0x3f3e1e99, 0x3f3ea941, 0x3f3f3360, - 0x3f3fbcf5, 0x3f404602, 0x3f40ce86, 0x3f415682, - 0x3f41ddf6, 0x3f4264e4, 0x3f42eb4b, 0x3f43712c, - 0x3f43f687, 0x3f447b5e, 0x3f44ffb0, 0x3f45837e, - 0x3f4606c9, 0x3f468990, 0x3f470bd5, 0x3f478d98, - 0x3f480eda, 0x3f488f9b, 0x3f490fdb, 0x3f800000, - 0x3f8164d2, 0x3f82cd87, 0x3f843a29, 0x3f85aac3, - 0x3f871f62, 0x3f88980f, 0x3f8a14d5, 0x3f8b95c2, - 0x3f8d1adf, 0x3f8ea43a, 0x3f9031dc, 0x3f91c3d3, - 0x3f935a2b, 0x3f94f4f0, 0x3f96942d, 0x3f9837f0, - 0x3f99e046, 0x3f9b8d3a, 0x3f9d3eda, 0x3f9ef532, - 0x3fa0b051, 0x3fa27043, 0x3fa43516, 0x3fa5fed7, - 0x3fa7cd94, 0x3fa9a15b, 0x3fab7a3a, 0x3fad583f, - 0x3faf3b79, 0x3fb123f6, 0x3fb311c4, 0x3fb504f3, - 0x3fb6fd92, 0x3fb8fbaf, 0x3fbaff5b, 0x3fbd08a4, - 0x3fbf179a, 0x3fc12c4d, 0x3fc346cd, 0x3fc5672a, - 0x3fc78d75, 0x3fc9b9be, 0x3fcbec15, 0x3fce248c, - 0x3fd06334, 0x3fd2a81e, 0x3fd4f35b, 0x3fd744fd, - 0x3fd99d16, 0x3fdbfbb8, 0x3fde60f5, 0x3fe0ccdf, - 0x3fe33f89, 0x3fe5b907, 0x3fe8396a, 0x3feac0c7, - 0x3fed4f30, 0x3fefe4ba, 0x3ff28177, 0x3ff5257d, - 0x3ff7d0df, 0x3ffa83b3, 0x3ffd3e0c, 0x40000000, - 0x40000000, 0x3ffe03f8, 0x3ffc0fc1, 0x3ffa232d, - 0x3ff83e10, 0x3ff6603e, 0x3ff4898d, 0x3ff2b9d6, - 0x3ff0f0f1, 
0x3fef2eb7, 0x3fed7304, 0x3febbdb3, - 0x3fea0ea1, 0x3fe865ac, 0x3fe6c2b4, 0x3fe52598, - 0x3fe38e39, 0x3fe1fc78, 0x3fe07038, 0x3fdee95c, - 0x3fdd67c9, 0x3fdbeb62, 0x3fda740e, 0x3fd901b2, - 0x3fd79436, 0x3fd62b81, 0x3fd4c77b, 0x3fd3680d, - 0x3fd20d21, 0x3fd0b6a0, 0x3fcf6475, 0x3fce168a, - 0x3fcccccd, 0x3fcb8728, 0x3fca4588, 0x3fc907da, - 0x3fc7ce0c, 0x3fc6980c, 0x3fc565c8, 0x3fc43730, - 0x3fc30c31, 0x3fc1e4bc, 0x3fc0c0c1, 0x3fbfa030, - 0x3fbe82fa, 0x3fbd6910, 0x3fbc5264, 0x3fbb3ee7, - 0x3fba2e8c, 0x3fb92144, 0x3fb81703, 0x3fb70fbb, - 0x3fb60b61, 0x3fb509e7, 0x3fb40b41, 0x3fb30f63, - 0x3fb21643, 0x3fb11fd4, 0x3fb02c0b, 0x3faf3ade, - 0x3fae4c41, 0x3fad602b, 0x3fac7692, 0x3fab8f6a, - 0x3faaaaab, 0x3fa9c84a, 0x3fa8e83f, 0x3fa80a81, - 0x3fa72f05, 0x3fa655c4, 0x3fa57eb5, 0x3fa4a9cf, - 0x3fa3d70a, 0x3fa3065e, 0x3fa237c3, 0x3fa16b31, - 0x3fa0a0a1, 0x3f9fd80a, 0x3f9f1166, 0x3f9e4cad, - 0x3f9d89d9, 0x3f9cc8e1, 0x3f9c09c1, 0x3f9b4c70, - 0x3f9a90e8, 0x3f99d723, 0x3f991f1a, 0x3f9868c8, - 0x3f97b426, 0x3f97012e, 0x3f964fda, 0x3f95a025, - 0x3f94f209, 0x3f944581, 0x3f939a86, 0x3f92f114, - 0x3f924925, 0x3f91a2b4, 0x3f90fdbc, 0x3f905a38, - 0x3f8fb824, 0x3f8f177a, 0x3f8e7835, 0x3f8dda52, - 0x3f8d3dcb, 0x3f8ca29c, 0x3f8c08c1, 0x3f8b7034, - 0x3f8ad8f3, 0x3f8a42f8, 0x3f89ae41, 0x3f891ac7, - 0x3f888889, 0x3f87f781, 0x3f8767ab, 0x3f86d905, - 0x3f864b8a, 0x3f85bf37, 0x3f853408, 0x3f84a9fa, - 0x3f842108, 0x3f839930, 0x3f83126f, 0x3f828cc0, - 0x3f820821, 0x3f81848e, 0x3f810204, 0x3f808081, - 0x3f800000, 0x00000000, 0x399f22b4, 0x3a1f22b4, - 0x3a6eb40e, 0x3a9f22b4, 0x3ac6eb61, 0x3aeeb40e, - 0x3b0b3e5d, 0x3b1f22b4, 0x3b33070a, 0x3b46eb61, - 0x3b5b518e, 0x3b70f18f, 0x3b83e1c6, 0x3b8fe616, - 0x3b9c87fd, 0x3ba9c9b6, 0x3bb7ad6f, 0x3bc6354a, - 0x3bd56360, 0x3be539c1, 0x3bf5ba71, 0x3c0373b6, - 0x3c0c6153, 0x3c15a705, 0x3c1f45be, 0x3c293e6b, - 0x3c3391f7, 0x3c3e4149, 0x3c494d44, 0x3c54b6c9, - 0x3c607eb4, 0x3c6ca5df, 0x3c792d22, 0x3c830aa9, - 0x3c89af9f, 0x3c9085dc, 0x3c978dc6, 0x3c9ec7c2, - 0x3ca63433, 
0x3cadd37d, 0x3cb5a602, 0x3cbdac21, - 0x3cc5e63a, 0x3cce54ac, 0x3cd6f7d5, 0x3cdfd010, - 0x3ce8ddba, 0x3cf2212d, 0x3cfb9ac3, 0x3d02a56a, - 0x3d0798dd, 0x3d0ca7e6, 0x3d11d2af, 0x3d171964, - 0x3d1c7c30, 0x3d21fb3c, 0x3d2796b2, 0x3d2d4ebb, - 0x3d332381, 0x3d39152b, 0x3d3f23e4, 0x3d454fd2, - 0x3d4b991d, 0x3d51ffec, 0x3d588468, 0x3d5f26b6, - 0x3d65e6fd, 0x3d6cc563, 0x3d73c20e, 0x3d7add24, - 0x3d810b65, 0x3d84b793, 0x3d88732e, 0x3d8c3e48, - 0x3d9018f4, 0x3d940344, 0x3d97fd49, 0x3d9c0715, - 0x3da020ba, 0x3da44a4a, 0x3da883d6, 0x3daccd6f, - 0x3db12727, 0x3db5910f, 0x3dba0b38, 0x3dbe95b3, - 0x3dc33090, 0x3dc7dbe0, 0x3dcc97b4, 0x3dd1641d, - 0x3dd6412b, 0x3ddb2eee, 0x3de02d76, 0x3de53cd4, - 0x3dea5d18, 0x3def8e51, 0x3df4d090, 0x3dfa23e5, - 0x3dff885e, 0x3e027f06, 0x3e05427f, 0x3e080ea2, - 0x3e0ae377, 0x3e0dc104, 0x3e10a753, 0x3e13966a, - 0x3e168e51, 0x3e198f0f, 0x3e1c98ac, 0x3e1fab30, - 0x3e22c6a1, 0x3e25eb07, 0x3e29186a, 0x3e2c4ed0, - 0x3e2f8e42, 0x3e32d6c5, 0x3e362862, 0x3e39831f, - 0x3e3ce703, 0x3e405417, 0x3e43ca60, 0x3e4749e6, - 0x3e4ad2af, 0x3e4e64c3, 0x3e520029, 0x3e55a4e7, - 0x3e595305, 0x3e5d0a89, 0x3e60cb7a, 0x3e6495df, - 0x3e6869be, 0x3e6c471f, 0x3e702e07, 0x3e741e7e, - 0x3e78188b, 0x3e7c1c33, 0x3e8014bf, 0x3e822039, - 0x3e84308b, 0x3e8645b8, 0x3e885fc3, 0x3e8a7eb0, - 0x3e8ca281, 0x3e8ecb3b, 0x3e90f8df, 0x3e932b72, - 0x3e9562f6, 0x3e979f6f, 0x3e99e0e0, 0x3e9c274c, - 0x3e9e72b6, 0x3ea0c321, 0x3ea31890, 0x3ea57307, - 0x3ea7d288, 0x3eaa3716, 0x3eaca0b6, 0x3eaf0f68, - 0x3eb18332, 0x3eb3fc15, 0x3eb67a14, 0x3eb8fd34, - 0x3ebb8576, 0x3ebe12de, 0x3ec0a56e, 0x3ec33d2a, - 0x3ec5da14, 0x3ec87c30, 0x3ecb2380, 0x3ecdd008, - 0x3ed081ca, 0x3ed338c9, 0x3ed5f508, 0x3ed8b68a, - 0x3edb7d52, 0x3ede4963, 0x3ee11abf, 0x3ee3f169, - 0x3ee6cd65, 0x3ee9aeb5, 0x3eec955b, 0x3eef815c, - 0x3ef272b8, 0x3ef56974, 0x3ef86593, 0x3efb6716, - 0x3efe6e00, 0x3f00bd2b, 0x3f02460c, 0x3f03d1a5, - 0x3f055ff7, 0x3f06f104, 0x3f0884cd, 0x3f0a1b54, - 0x3f0bb499, 0x3f0d509f, 0x3f0eef65, 0x3f1090ef, - 0x3f12353d, 
0x3f13dc50, 0x3f15862a, 0x3f1732cc, - 0x3f18e237, 0x3f1a946e, 0x3f1c4970, 0x3f1e0140, - 0x3f1fbbde, 0x3f21794d, 0x3f23398c, 0x3f24fc9f, - 0x3f26c285, 0x3f288b41, 0x3f2a56d2, 0x3f2c253c, - 0x3f2df67f, 0x3f2fca9c, 0x3f31a194, 0x3f337b6a, - 0x3f35581d, 0x3f3737b0, 0x3f391a24, 0x3f3aff7a, - 0x3f3ce7b2, 0x3f3ed2cf, 0x3f40c0d2, 0x3f42b1bc, - 0x3f44a58e, 0x3f469c49, 0x3f4895ef, 0x3f4a9280, - 0x3f4c91ff, 0x3f4e946c, 0x3f5099c9, 0x3f52a216, - 0x3f54ad56, 0x3f56bb88, 0x3f58ccaf, 0x3f5ae0cc, - 0x3f5cf7df, 0x3f5f11ea, 0x3f612eef, 0x3f634eee, - 0x3f6571e9, 0x3f6797e0, 0x3f69c0d5, 0x3f6becca, - 0x3f6e1bbf, 0x3f704db5, 0x3f7282ae, 0x3f74baab, - 0x3f76f5ae, 0x3f7933b6, 0x3f7b74c6, 0x3f7db8de, - 0x3f800000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x000000a2, - 0x000003e6, 0x00000036, 0x00000393, 0x00000244, - 0x00000054, 0x0000029f, 0x00000309, 0x00000357, - 0x00000347, 0x00000353, 0x00000137, 0x000001c0, - 0x0000036d, 0x00000229, 0x00000166, 0x0000013c, - 0x0000010e, 0x00000104, 0x0000007f, 0x00000251, - 0x0000018e, 0x000002bd, 0x000003ae, 0x000003c5, - 0x00000186, 0x00000372, 0x0000011b, 0x0000023a, - 0x00000109, 0x000000dd, 0x000000b8, 0x00000006, - 0x00000124, 0x000002ee, 0x00000282, 0x000001d1, - 0x00000248, 0x000001cf, 0x00000387, 0x000001eb, - 0x00000072, 0x00000312, 0x00000269, 0x0000033e, - 0x000003a2, 0x00000023, 0x0000017d, 0x0000012e, - 0x000002ed, 0x00000048, 0x0000013a, 0x0000019c, - 0x000001c0, 0x0000026b, 0x00000117, 0x0000037e, - 0x00000104, 0x00000399, 0x00000075, 0x00000239, - 0x0000020d, 0x00000133, 0x0000027d, 0x0000009c, - 0x00000211, 0x000001f8, 0x000002ef, 0x000001f9, - 0x000000a0, 0x000003b1, 0x000003fe, 0x00000097, - 0x000003ff, 0x000001e0, 0x00000166, 0x0000000f, - 0x000003bc, 0x000002f1, 0x00000062, 0x0000035a, - 0x00000029, 0x000002d1, 0x000003db, 0x00000136, - 0x000001fb, 0x000000f2, 0x000001f2, 0x00000309, - 0x000002dd, 0x000000f4, 0x0000018f, 0x00000366, - 0x00000279, 0x000001fe, 0x0000028b, 0x00000175, - 0x0000009e, 
0x000003ac, 0x000001fa, 0x000003e5, - 0x000003c5, 0x000003b3, 0x00000341, 0x00000339, - 0x000003de, 0x000000a5, 0x000000a4, 0x000002ea, - 0x000001af, 0x000003b5, 0x000003ec, 0x0000011f, - 0x00000235, 0x000001d0, 0x00000215, 0x00000203, - 0x000000c1, 0x0000006f, 0x0000031e, 0xc11bf1e0, - 0x7421580c, 0x7ec47e35, 0x4ba9afed, 0xe7de294a, - 0xc5ecf41c, 0xeb1faf97, 0xa8b5d49e, 0xfd9a797f, - 0x26dd3d18, 0xfb3c9f2c, 0xb47db4d9, 0x462d6829, - 0x603fbcbc, 0x5fff7816, 0xa0ec7fe2, 0x7e2ef7e4, - 0xe7d27211, 0x58e60d4c, 0xf904e647, 0xc09ad17d, - 0x1213a671, 0xd7d4baed, 0x9cfba208, 0xac72c4a6, - 0x4873f877, 0xbba82746, 0x4b801924, 0xb8e90937, - 0x1586dc91, 0x8eaf7aef, 0x4107f945, 0x5664f10e, - 0x77036d8a, 0x5f47d4d3, 0x54a7f09d, 0x0db93910, - 0x00028be6, 0x00000000, 0x00000000, 0x00000000 -}; - -#endif // TABLES_HPP_ + 0x00000000, 0x3faff55b, 0xbd38db2c, 0x3e56e59f, 0x00000000, 0x3fb0f99e, 0x54dedf96, 0x3e64e3aa, + 0x00000000, 0x3fb1f86d, 0xab1bda88, 0x3e67e105, 0x00000000, 0x3fb2f719, 0x4d013fd0, 0x3e48c525, + 0x00000000, 0x3fb3f59f, 0x3ad62670, 0x3e2cf8ab, 0x00000000, 0x3fb4f3fd, 0xbec80468, 0x3e59dca4, + 0x00000000, 0x3fb5f232, 0xec98a8da, 0x3e53f4b5, 0x00000000, 0x3fb6f03b, 0x619d81fe, 0x3e6b9d49, + 0x00000000, 0x3fb7ee18, 0x87460934, 0x3e430178, 0x00000000, 0x3fb8ebc5, 0xca0b9944, 0x3e511e3e, + 0x00000000, 0x3fb9e941, 0x3c5a332e, 0x3e54f3f7, 0x00000000, 0x3fbae68a, 0xae0e00a6, 0x3e5c71c8, + 0x00000000, 0x3fbbe39e, 0xf86fbdc7, 0x3e67cde0, 0x00000000, 0x3fbce07c, 0x8c889c72, 0x3e570f32, + 0x00000000, 0x3fbddd21, 0x9b994efe, 0x3e5c07ae, 0x00000000, 0x3fbed98c, 0x1d7b1698, 0x3e40c802, + 0x00000000, 0x3fbfd5ba, 0xedb8cb22, 0x3e635585, 0x00000000, 0x3fc068d5, 0x67b30e96, 0x3e708425, + 0x00000000, 0x3fc0e6ad, 0x1031472e, 0x3e799e81, 0x00000000, 0x3fc16465, 0x1416bcee, 0x3e604182, + 0x00000000, 0x3fc1e1fa, 0xe4dc96f4, 0x3e7f6086, 0x00000000, 0x3fc25f6e, 0x5c5f1b58, 0x3e471a53, + 0x00000000, 0x3fc2dcbd, 0x3fe63ca1, 0x3e765f74, 0x00000000, 0x3fc359e8, 0x3472d014, 0x3e7dbd73, + 
0x00000000, 0x3fc3d6ee, 0x4d8b0d1d, 0x3e7d18cc, 0x00000000, 0x3fc453ce, 0x53c8fb29, 0x3e78c125, + 0x00000000, 0x3fc4d087, 0xe2e8f991, 0x3e753b49, 0x00000000, 0x3fc54d18, 0xe148c141, 0x3e77422a, + 0x00000000, 0x3fc5c981, 0x69df56a8, 0x3e4e3ec2, 0x00000000, 0x3fc645bf, 0x4e7e0ac9, 0x3e7ff675, + 0x00000000, 0x3fc6c1d4, 0x7b1b5aad, 0x3e713126, 0x00000000, 0x3fc73dbd, 0x403a94bc, 0x3e7d14fa, + 0x00000000, 0x3fc7b97b, 0xc089a3d8, 0x3e62f396, 0x00000000, 0x3fc8350b, 0x78fa95bb, 0x3e7c731d, + 0x00000000, 0x3fc8b06e, 0x85177399, 0x3e7c50f3, 0x00000000, 0x3fc92ba3, 0x9c6f2c20, 0x3e6f4140, + 0x00000000, 0x3fc9a6a8, 0xc4c39ec0, 0x3e7d2d90, 0x00000000, 0x3fca217e, 0x696f2106, 0x3e680420, + 0x00000000, 0x3fca9c23, 0x7943a2e8, 0x3e4b4032, 0x00000000, 0x3fcb1696, 0x02f3d2a2, 0x3e65d35e, + 0x00000000, 0x3fcb90d7, 0x288117b0, 0x3e64a498, 0x00000000, 0x3fcc0ae5, 0x19afb324, 0x3e635da1, + 0x00000000, 0x3fcc84bf, 0xcdb9a908, 0x3e714e85, 0x00000000, 0x3fccfe65, 0xe5547b9a, 0x3e638754, + 0x00000000, 0x3fcd77d5, 0xe6ce3246, 0x3e7be40a, 0x00000000, 0x3fcdf110, 0xb3bea7e7, 0x3e70c993, + 0x00000000, 0x3fce6a14, 0x89ac3359, 0x3e71d2dd, 0x00000000, 0x3fcee2e1, 0x03332c46, 0x3e614766, + 0x00000000, 0x3fcf5b75, 0x1bac55b7, 0x3e7f2590, 0x00000000, 0x3fcfd3d1, 0x7c826e28, 0x3e7f881b, + 0x00000000, 0x3fd025fa, 0x6d698d20, 0x3e744199, 0x00000000, 0x3fd061ee, 0x521ea089, 0x3e8407ac, + 0x00000000, 0x3fd09dc5, 0x6c4b1723, 0x3e82fb0c, 0x00000000, 0x3fd0d97e, 0x966a3e18, 0x3e8ca135, + 0x00000000, 0x3fd1151a, 0xe4d646e4, 0x3e6b1218, 0x00000000, 0x3fd15097, 0xa350d288, 0x3e6d4e72, + 0x00000000, 0x3fd18bf5, 0x2f04c329, 0x3e84617e, 0x00000000, 0x3fd1c735, 0x41e82650, 0x3e6096ec, + 0x00000000, 0x3fd20255, 0x25773e6e, 0x3e79f91f, 0x00000000, 0x3fd23d56, 0x20f1d674, 0x3e659c08, + 0x00000000, 0x3fd27837, 0xa2df1064, 0x3e602bf7, 0x00000000, 0x3fd2b2f7, 0xfc40508f, 0x3e8fb36b, + 0x00000000, 0x3fd2ed98, 0x3f8dc892, 0x3e7ea08f, 0x00000000, 0x3fd32818, 0x54656a0e, 0x3e73ed62, + 0x00000000, 0x3fd36277, 0xe5e69c58, 
0x3e6b83f5, 0x00000000, 0x3fd39cb4, 0xaf768592, 0x3e8d6ec2, + 0x00000000, 0x3fd3d6d1, 0x9a226f94, 0x3e649388, 0x00000000, 0x3fd410cb, 0xa65279ba, 0x3e85ad8f, + 0x00000000, 0x3fd44aa4, 0x84d45434, 0x3e6b6157, 0x00000000, 0x3fd4845a, 0x4368f145, 0x3e809a18, + 0x00000000, 0x3fd4bdee, 0x39b0d91c, 0x3e761a24, 0x00000000, 0x3fd4f75f, 0x5e39a978, 0x3e7ce1a6, + 0x00000000, 0x3fd530ad, 0xa93b6a66, 0x3e832a39, 0x00000000, 0x3fd569d8, 0x9af804e7, 0x3e81c369, + 0x00000000, 0x3fd5a2e0, 0x4e44ede8, 0x3e575e0f, 0x00000000, 0x3fd5dbc3, 0xd1a7a83b, 0x3e8f77ce, + 0x00000000, 0x3fd61484, 0x0cb1b500, 0x3e284e7f, 0x00000000, 0x3fd64d1f, 0x38b02dfe, 0x3e8ec6b8, + 0x00000000, 0x3fd68597, 0xdfbeda87, 0x3e83ebf4, 0x00000000, 0x3fd6bdea, 0xed9cb475, 0x3e89397a, + 0x00000000, 0x3fd6f619, 0xbc239c54, 0x3e707937, 0x00000000, 0x3fd72e22, 0x553131b6, 0x3e8aa754, + 0x00000000, 0x3fd76607, 0x407c45dc, 0x3e74a05d, 0x00000000, 0x3fd79dc6, 0x1a206dd0, 0x3e813223, + 0x00000000, 0x3fd7d560, 0xfdd69c88, 0x3e72d8ec, 0x00000000, 0x3fd80cd4, 0x74218606, 0x3e7a852c, + 0x00000000, 0x3fd84422, 0xbaeebb50, 0x3e871bf2, 0x00000000, 0x3fd87b4b, 0xb7491820, 0x3e483d7d, + 0x00000000, 0x3fd8b24d, 0x92b6da14, 0x3e6ca50d, 0x00000000, 0x3fd8e929, 0xe8530298, 0x3e56f5cd, + 0x00000000, 0x3fd91fde, 0x98910740, 0x3e7f3431, 0x00000000, 0x3fd9566d, 0x41ccd80a, 0x3e70e8d2, + 0x00000000, 0x3fd98cd5, 0xc619e6c8, 0x3e71535a, 0x00000000, 0x3fd9c316, 0x41c36cd2, 0x3e773160, + 0x00000000, 0x3fd9f930, 0x00637d8e, 0x3e7985a0, 0x00000000, 0x3fda2f23, 0x858c0a68, 0x3e6f2f29, + 0x00000000, 0x3fda64ee, 0x7f96d909, 0x3e887984, 0x00000000, 0x3fda9a92, 0x19e12e42, 0x3e8ab3d3, + 0x00000000, 0x3fdad00f, 0x62dfc4c2, 0x3e750881, 0x00000000, 0x3fdb0564, 0xa1cd9d8c, 0x3e605749, + 0x00000000, 0x3fdb3a91, 0x6c6b8618, 0x3e5da65c, 0x00000000, 0x3fdb6f96, 0x7df1ad64, 0x3e6739bf, + 0x00000000, 0x3fdba473, 0x52aa3340, 0x3e6bc312, 0x00000000, 0x3fdbd928, 0x91ad3aa8, 0x3e5e5281, + 0x00000000, 0x3fdc0db4, 0x3df19f18, 0x3e8929d9, 0x00000000, 0x3fdc4219, 
0xb693a080, 0x3e5ff11e, + 0x00000000, 0x3fdc7655, 0xf145a3a0, 0x3e455ae3, 0x00000000, 0x3fdcaa68, 0xc6c0ca82, 0x3e7cbcd8, + 0x00000000, 0x3fdcde53, 0xd425d304, 0x3e70cb04, 0x00000000, 0x3fdd1215, 0xab5be678, 0x3e79adfc, + 0x00000000, 0x3fdd45ae, 0xc5662508, 0x3e893d90, 0x00000000, 0x3fdd791f, 0xbd35ff40, 0x3e768489, + 0x00000000, 0x3fddac67, 0x3da2b7e0, 0x3e3586ed, 0x00000000, 0x3fdddf85, 0x2e850eee, 0x3e87604d, + 0x00000000, 0x3fde127b, 0x2bfb53d8, 0x3e7ac1d1, 0x00000000, 0x3fde4548, 0x68274740, 0x3e39b3d4, + 0x00000000, 0x3fde77eb, 0x8d10e53c, 0x3e7fc5d6, 0x00000000, 0x3fdeaa65, 0x1884becb, 0x3e88f9e5, + 0x00000000, 0x3fdedcb6, 0x869c06d1, 0x3e8a87f0, 0x00000000, 0x3fdf0ede, 0x79f685fa, 0x3e831e72, + 0x00000000, 0x3fdf40dd, 0x2f9719b0, 0x3e46a828, 0x00000000, 0x3fdf72b2, 0x4a8a44e0, 0x3e60d272, + 0x00000000, 0x3fdfa45d, 0x4b11ad4e, 0x3e8a6052, 0x00000000, 0x3fdfd5e0, 0x832750f0, 0x3e575fdf, + 0x00000000, 0x3fe0039c, 0x02e4cd36, 0x3e8cf069, 0x00000000, 0x3fe01c34, 0x2d4f6d10, 0x3e6e8242, + 0x00000000, 0x3fe034b7, 0x1063e6c0, 0x3e524a09, 0x00000000, 0x3fe04d25, 0x72dc6f38, 0x3e78a1a1, + 0x00000000, 0x3fe0657e, 0x19f8a92d, 0x3e929b66, 0x00000000, 0x3fe07dc3, 0x9c1b70c8, 0x3e79274d, + 0x00000000, 0x3fe095f3, 0x1fbb7930, 0x3e50c34b, 0x00000000, 0x3fe0ae0e, 0x6c20eb50, 0x3e663986, + 0x00000000, 0x3fe0c614, 0xf6832e9e, 0x3e86d6d0, 0x00000000, 0x3fe0de05, 0xef99f25e, 0x3e9af54d, + 0x00000000, 0x3fe0f5e2, 0x52a00262, 0x3e916cfc, 0x00000000, 0x3fe10daa, 0x83569c32, 0x3e8dcc1e, + 0x00000000, 0x3fe1255d, 0x551ed425, 0x3e937f7a, 0x00000000, 0x3fe13cfb, 0xadc98887, 0x3e9f6360, + 0x00000000, 0x3fe15485, 0x8d35a2c1, 0x3e92c6ec, 0x00000000, 0x3fe16bfa, 0xf84cb036, 0x3e8bd44d, + 0x00000000, 0x3fe1835a, 0x826e310e, 0x3e9117cf, 0x00000000, 0x3fe19aa5, 0xf332cfc9, 0x3e9ca533, + 0x00000000, 0x3fe1b1dc, 0x509dbc2e, 0x3e90f208, 0x00000000, 0x3fe1c8fe, 0x93c945de, 0x3e8cd07d, + 0x00000000, 0x3fe1e00b, 0xd67e6d72, 0x3e957bdf, 0x00000000, 0x3fe1f704, 0xc516c658, 0x3e7aab89, + 0x00000000, 
0x3fe20de8, 0xb1a1b8a0, 0x3e63e823, 0x00000000, 0x3fe224b7, 0x4a9d6d3c, 0x3e830746, + 0x00000000, 0x3fe23b71, 0xcd438843, 0x3e9c5993, 0x00000000, 0x3fe25217, 0xa02ab554, 0x3e9ba2fc, + 0x00000000, 0x3fe268a9, 0x6983a268, 0x3e801a5b, 0x00000000, 0x3fe27f26, 0xb350efc8, 0x3e6273d1, + 0x00000000, 0x3fe2958e, 0x8c37b0c6, 0x3e864c23, 0x00000000, 0x3fe2abe2, 0x7370a300, 0x3e6aded0, + 0x00000000, 0x3fe2c221, 0x197eb47e, 0x3e878091, 0x00000000, 0x3fe2d84c, 0x45e0dabc, 0x3e74b0f2, + 0x00000000, 0x3fe2ee62, 0x794e2eaf, 0x3e9080d9, 0x00000000, 0x3fe30464, 0x42b60c76, 0x3e8d4ec2, + 0x00000000, 0x3fe31a52, 0xf940caa0, 0x3e4221d2, 0x00000000, 0x3fe3302b, 0x2b2bba5c, 0x3e7cdbc4, + 0x00000000, 0x3fe345f0, 0xbb440840, 0x3e6cce37, 0x00000000, 0x3fe35ba0, 0x99cf1dd0, 0x3e96c1d9, + 0x00000000, 0x3fe3713d, 0x07eb0870, 0x3e5bed8a, 0x00000000, 0x3fe386c5, 0x8f490e3c, 0x3e769ed8, + 0x00000000, 0x3fe39c39, 0x19b73ef0, 0x3e6cd417, 0x00000000, 0x3fe3b198, 0xc95b41b7, 0x3e9cbc4a, + 0x00000000, 0x3fe3c6e4, 0xb890f5d7, 0x3e9238f1, 0x00000000, 0x3fe3dc1c, 0x82259cc4, 0x3e750c42, + 0x00000000, 0x3fe3f13f, 0xde87b3e2, 0x3e9713d2, 0x00000000, 0x3fe4064f, 0xd2255276, 0x3e81d5a7, + 0x00000000, 0x3fe41b4a, 0x48227ac1, 0x3e9c0dfd, 0x00000000, 0x3fe43032, 0xdab76753, 0x3e91c964, + 0x00000000, 0x3fe44506, 0xd5704496, 0x3e86de56, 0x00000000, 0x3fe459c6, 0x1fd19968, 0x3e84aeb7, + 0x00000000, 0x3fe46e72, 0xc57b1918, 0x3e8fbf91, 0x00000000, 0x3fe4830a, 0x7fbe5d9a, 0x3e9d6bef, + 0x00000000, 0x3fe4978f, 0xdc249066, 0x3e9464d3, 0x00000000, 0x3fe4ac00, 0xec4d9073, 0x3e9638e2, + 0x00000000, 0x3fe4c05e, 0x7247ea7c, 0x3e716f4a, 0x00000000, 0x3fe4d4a8, 0x40f1d440, 0x3e31a0a7, + 0x00000000, 0x3fe4e8de, 0x0114a33c, 0x3e86edbb, 0x00000000, 0x3fe4fd01, 0xbf1d513c, 0x3e7dbee8, + 0x00000000, 0x3fe51110, 0xb0248f73, 0x3e95b8bd, 0x00000000, 0x3fe5250c, 0x3f5eac64, 0x3e97de3d, + 0x00000000, 0x3fe538f5, 0x87ae448a, 0x3e8ee241, 0x00000000, 0x3fe54cca, 0x91ec5192, 0x3e9e06c5, + 0x00000000, 0x3fe5608d, 0x1a332738, 0x3e74e386, 
0x00000000, 0x3fe5743c, 0xdcc2bfe4, 0x3e7a9599, + 0x00000000, 0x3fe587d8, 0xbad43468, 0x3e6f732f, 0x00000000, 0x3fe59b60, 0x73b727d9, 0x3e9eb9f5, + 0x00000000, 0x3fe5aed6, 0xa2eb9897, 0x3e98b212, 0x00000000, 0x3fe5c239, 0x4c167215, 0x3e938488, + 0x00000000, 0x3fe5d589, 0x63020051, 0x3e90e2d3, 0x00000000, 0x3fe5e8c6, 0x79fbd022, 0x3e928208, + 0x00000000, 0x3fe5fbf0, 0x893e4b30, 0x3e9a1ab9, 0x00000000, 0x3fe60f08, 0x17a24478, 0x3e82d1b8, + 0x00000000, 0x3fe6220d, 0x8ded4878, 0x3e615d7b, 0x00000000, 0x3fe634ff, 0x9db3a5e4, 0x3e78968f, + 0x00000000, 0x3fe647de, 0x71fe135f, 0x3e971c41, 0x00000000, 0x3fe65aab, 0x605d0d8c, 0x3e96d80f, + 0x00000000, 0x3fe66d66, 0x43691590, 0x3e7c91f0, 0x00000000, 0x3fe6800e, 0x15fce2b2, 0x3e839f8a, + 0x00000000, 0x3fe692a4, 0xa9d94b80, 0x3e455bed, 0x00000000, 0x3fe6a527, 0x5d60949a, 0x3e8b12c1, + 0x00000000, 0x3fe6b798, 0xb312bfe3, 0x3e924167, 0x00000000, 0x3fe6c9f7, 0x33070277, 0x3e90ab86, + 0x00000000, 0x3fe6dc44, 0xebbc80ee, 0x3e854554, 0x00000000, 0x3fe6ee7f, 0xef5a4bb8, 0x3e60204a, + 0x00000000, 0x3fe700a7, 0xc679cf2c, 0x3e98af08, 0x00000000, 0x3fe712be, 0x330ae6c8, 0x3e90852a, + 0x00000000, 0x3fe724c3, 0x9ec32916, 0x3e86d3eb, 0x00000000, 0x3fe736b6, 0x7fcbbafe, 0x3e8685cb, + 0x00000000, 0x3fe74897, 0xc1e0bd95, 0x3e91f751, 0x00000000, 0x3fe75a67, 0xb0f72560, 0x3e5705b1, + 0x00000000, 0x3fe76c24, 0xd808ca92, 0x3e9b98d8, 0x00000000, 0x3fe77dd1, 0xc75cc980, 0x3e62ea22, + 0x00000000, 0x3fe78f6b, 0x2bca0350, 0x3e97aba6, 0x00000000, 0x3fe7a0f4, 0x3442278c, 0x3e9d7383, + 0x00000000, 0x3fe7b26c, 0x1fb18bf9, 0x3e95a5ca, 0x00000000, 0x3fe7c3d3, 0x2b6ecf28, 0x3e61a609, + 0x00000000, 0x3fe7d528, 0x49aac104, 0x3e744fd0, 0x00000000, 0x3fe7e66c, 0xd8df5180, 0x3e2c114f, + 0x00000000, 0x3fe7f79e, 0x130feae5, 0x3e95972f, 0x00000000, 0x3fe808c0, 0xa55fe198, 0x3e7ca034, + 0x00000000, 0x3fe819d0, 0x49990227, 0x3e96e2b1, 0x00000000, 0x3fe82ad0, 0x0294592c, 0x3e7b0000, + 0x00000000, 0x3fe83bbe, 0xc442620e, 0x3e98b9bd, 0x00000000, 0x3fe84c9c, 0xfabf3e4e, 
0x3e8d94fd, + 0x00000000, 0x3fe85d69, 0xb145ad9a, 0x3e85db30, 0x00000000, 0x3fe86e25, 0xb95022b0, 0x3e8e3e1e, + 0x00000000, 0x3fe87ed0, 0x45442bd6, 0x3e9d5b8b, 0x00000000, 0x3fe88f6b, 0x231ecd2e, 0x3e97a046, + 0x00000000, 0x3fe89ff5, 0x3ef55232, 0x3e9feafe, 0x00000000, 0x3fe8b06f, 0xbfd78267, 0x3e9839e7, + 0x00000000, 0x3fe8c0d9, 0x9d6fa900, 0x3e645cf4, 0x00000000, 0x3fe8d132, 0x2b27f380, 0x3e4be313, + 0x00000000, 0x3fe8e17a, 0x0bb84f9f, 0x3e953398, 0x00000000, 0x3fe8f1b3, 0xce3ba390, 0x3e5889e2, + 0x00000000, 0x3fe901db, 0xc3ad0cc8, 0x3e7f7778, 0x00000000, 0x3fe911f3, 0xcec4eba2, 0x3e846660, + 0x00000000, 0x3fe921fb, 0x4611a626, 0x3e85110b, 0x00000000, 0x3ff00000, 0x00000000, 0x00000000, + 0x80000000, 0x3ff00553, 0xc81e4294, 0x3e6e6a24, 0x90000000, 0x3ff00aa3, 0x11e3a785, 0x3e585485, + 0x10000000, 0x3ff00ff0, 0x36ec07f6, 0x3e64eb93, 0x20000000, 0x3ff01539, 0xb8b750e1, 0x3e40ea64, + 0xb0000000, 0x3ff01a7e, 0xcff8a53c, 0x3e461637, 0xd0000000, 0x3ff01fc0, 0xf7bd1943, 0x3e40733b, + 0x80000000, 0x3ff024ff, 0x1345cced, 0x3e566691, 0xd0000000, 0x3ff02a3a, 0x3f592f14, 0x3e477b7a, + 0xb0000000, 0x3ff02f72, 0xdd1a5402, 0x3e6f18d3, 0x50000000, 0x3ff034a7, 0xa58ee9a4, 0x3e2be2f5, + 0x80000000, 0x3ff039d8, 0x8f085fa7, 0x3e68901f, 0x70000000, 0x3ff03f06, 0xcd5b5d69, 0x3e5c68b8, + 0x10000000, 0x3ff04431, 0x8624be42, 0x3e5a6b0e, 0x70000000, 0x3ff04958, 0xb06f68e7, 0x3dbc4b22, + 0x80000000, 0x3ff04e7c, 0xafcabe9b, 0x3e60f3f0, 0x60000000, 0x3ff0539d, 0xbca4e1b7, 0x3e548495, + 0x00000000, 0x3ff058bb, 0x1abdfdc3, 0x3e66107f, 0x70000000, 0x3ff05dd5, 0x1878288a, 0x3e6e6726, + 0xc0000000, 0x3ff062ec, 0x55286f1e, 0x3e5a6bc1, 0xe0000000, 0x3ff06800, 0xc64a85f2, 0x3e58a759, + 0xe0000000, 0x3ff06d11, 0x0a4a8d09, 0x3e45fce7, 0xc0000000, 0x3ff0721f, 0xf373fe1d, 0x3e32f9cb, + 0x80000000, 0x3ff0772a, 0xce4ac359, 0x3e590564, 0x30000000, 0x3ff07c32, 0xe761b02f, 0x3e5ac29c, + 0xd0000000, 0x3ff08136, 0xf497381c, 0x3e5cb752, 0x60000000, 0x3ff08638, 0x1cfb35e0, 0x3e68bb9e, + 0xf0000000, 0x3ff08b36, 
0x7099de90, 0x3e65b491, 0x80000000, 0x3ff09032, 0xc9c65ef2, 0x3e5cc77a, + 0x10000000, 0x3ff0952b, 0xe7be3dba, 0x3e57a0f3, 0xa0000000, 0x3ff09a20, 0x1ee0c16f, 0x3e66ec85, + 0x40000000, 0x3ff09f13, 0xbf2946da, 0x3e689449, 0xf0000000, 0x3ff0a402, 0x301ba223, 0x3e698f25, + 0xc0000000, 0x3ff0a8ef, 0xc651f549, 0x3e347d5e, 0x90000000, 0x3ff0add9, 0x9a86007a, 0x3e6c33ec, + 0x90000000, 0x3ff0b2c0, 0x53e92649, 0x3e5e0b66, 0xb0000000, 0x3ff0b7a4, 0xc09d755f, 0x3e3bd64a, + 0xf0000000, 0x3ff0bc85, 0x06f78167, 0x3e2f5375, 0x50000000, 0x3ff0c164, 0xd1b3735e, 0x3e62c382, + 0xe0000000, 0x3ff0c63f, 0x659f99e1, 0x3e6e20ed, 0xb0000000, 0x3ff0cb18, 0x3a9c182a, 0x3e586b63, + 0xb0000000, 0x3ff0cfee, 0x5a65e777, 0x3e445cfd, 0xe0000000, 0x3ff0d4c1, 0x0f58bca4, 0x3e60c877, + 0x50000000, 0x3ff0d992, 0x4b0933c5, 0x3e6739e4, 0x10000000, 0x3ff0de60, 0xd9ce7bd8, 0x3e027dc3, + 0x00000000, 0x3ff0e32b, 0x7c5a7b64, 0x3e63c53c, 0x40000000, 0x3ff0e7f3, 0x83830cec, 0x3e696696, + 0xd0000000, 0x3ff0ecb8, 0xc39bdcc4, 0x3e68d772, 0xb0000000, 0x3ff0f17b, 0x8bcf6d7b, 0x3e69b000, + 0xf0000000, 0x3ff0f63b, 0x5825ce4f, 0x3e3bbb30, 0x70000000, 0x3ff0faf9, 0xaf13a406, 0x3e6da3f4, + 0x60000000, 0x3ff0ffb4, 0x6f74ce86, 0x3e5f36b9, 0xb0000000, 0x3ff1046c, 0x2303f790, 0x3e165c00, + 0x50000000, 0x3ff10922, 0x095ba7d5, 0x3e682f84, 0x60000000, 0x3ff10dd5, 0x3541b2c6, 0x3e6d4643, + 0xe0000000, 0x3ff11285, 0x56e93a89, 0x3e671c3d, 0xd0000000, 0x3ff11733, 0xf4e40012, 0x3e598dce, + 0x30000000, 0x3ff11bdf, 0xef17fe03, 0x3e4530eb, 0x00000000, 0x3ff12088, 0xa3715066, 0x3e4e8b8f, + 0x40000000, 0x3ff1252e, 0xb3b211dc, 0x3e6ab26e, 0x10000000, 0x3ff129d2, 0xdc906307, 0x3e454dd4, + 0x50000000, 0x3ff12e73, 0x2387984e, 0x3e5c9f96, 0x10000000, 0x3ff13312, 0x59afec09, 0x3e6c62a9, + 0x60000000, 0x3ff137ae, 0xac6a866a, 0x3e6638d9, 0x40000000, 0x3ff13c48, 0xeca8a22d, 0x3e338704, + 0xa0000000, 0x3ff140df, 0x1db14f8f, 0x3e4e6c9e, 0x90000000, 0x3ff14574, 0x7f9c9eaa, 0x3e58744b, + 0x10000000, 0x3ff14a07, 0x3486373b, 0x3e66c289, 0x30000000, 
0x3ff14e97, 0xe31699b7, 0x3e5b36bc, + 0xe0000000, 0x3ff15324, 0x13d200c7, 0x3e671e38, 0x30000000, 0x3ff157b0, 0xab40aa88, 0x3e699755, + 0x20000000, 0x3ff15c39, 0x0e4bcfc0, 0x3e6b45ca, 0xc0000000, 0x3ff160bf, 0x0d869c5d, 0x3e32dd09, + 0xf0000000, 0x3ff16543, 0x16b917da, 0x3e64fe05, 0xd0000000, 0x3ff169c5, 0x226317a2, 0x3e694563, + 0x60000000, 0x3ff16e45, 0xafc2c851, 0x3e653d8f, 0xa0000000, 0x3ff172c2, 0x1fbd41a3, 0x3e5dcbd4, + 0x90000000, 0x3ff1773d, 0x5285f59c, 0x3e5862ff, 0x30000000, 0x3ff17bb6, 0xa97a1e1c, 0x3e63072e, + 0x90000000, 0x3ff1802c, 0x75184805, 0x3e528390, 0xa0000000, 0x3ff184a0, 0x3e9eff42, 0x3e64b032, + 0x70000000, 0x3ff18912, 0x93c45484, 0x3e6b1588, 0x10000000, 0x3ff18d82, 0x0fc35826, 0x3e3149ef, + 0x60000000, 0x3ff191ef, 0xea96acaa, 0x3e5f2e77, 0x80000000, 0x3ff1965a, 0x4c471a95, 0x3e520007, + 0x60000000, 0x3ff19ac3, 0x517f6f04, 0x3e63f8cc, 0x10000000, 0x3ff19f2a, 0xe311bb55, 0x3e660ba2, + 0x90000000, 0x3ff1a38e, 0x730bbec3, 0x3e64b788, 0xe0000000, 0x3ff1a7f0, 0x795ee20c, 0x3e657090, + 0x00000000, 0x3ff1ac51, 0x983670b1, 0x3e6d9ffe, 0x00000000, 0x3ff1b0af, 0xff61bfda, 0x3e62a463, + 0xd0000000, 0x3ff1b50a, 0x6a5e65cf, 0x3e69d1bc, 0x80000000, 0x3ff1b964, 0xbaa9e922, 0x3e68718a, + 0x10000000, 0x3ff1bdbc, 0x2ffa342e, 0x3e63c2f5, 0x80000000, 0x3ff1c211, 0x3ff42c80, 0x3e60fae1, + 0xd0000000, 0x3ff1c664, 0x0ef00d57, 0x3e65440f, 0x10000000, 0x3ff1cab6, 0x2d4e3c1e, 0x3e46fcd2, + 0x30000000, 0x3ff1cf05, 0xb409e863, 0x3e4e0c60, 0x30000000, 0x3ff1d352, 0x5a5f0333, 0x3e6f9cab, + 0x30000000, 0x3ff1d79d, 0x744c333d, 0x3e630f24, 0x20000000, 0x3ff1dbe6, 0x2a76b2fe, 0x3e4b5062, + 0xf0000000, 0x3ff1e02c, 0xba595375, 0x3e6fdb94, 0xd0000000, 0x3ff1e471, 0xb945a171, 0x3e3861b9, + 0x90000000, 0x3ff1e8b4, 0x015188c4, 0x3e654348, 0x50000000, 0x3ff1ecf5, 0x49865523, 0x3e6b54d1, + 0x10000000, 0x3ff1f134, 0x83d9de33, 0x3e6a0bb7, 0xd0000000, 0x3ff1f570, 0x2b1a2157, 0x3e6629d1, + 0x90000000, 0x3ff1f9ab, 0x35d179df, 0x3e6467fe, 0x50000000, 0x3ff1fde4, 0x3e26c8f7, 0x3e69763f, + 
0x20000000, 0x3ff2021b, 0xbb9f7679, 0x3e53f798, 0xf0000000, 0x3ff2064f, 0x7e855898, 0x3e552e57, + 0xc0000000, 0x3ff20a82, 0xe5502c3a, 0x3e6fde47, 0xb0000000, 0x3ff20eb3, 0x548d96a0, 0x3e5cbd0b, + 0xa0000000, 0x3ff212e2, 0xf7be8de8, 0x3e6a9cd9, 0xb0000000, 0x3ff2170f, 0x704886de, 0x3e522bbe, + 0xc0000000, 0x3ff21b3a, 0x8317f020, 0x3e6e3dea, 0xf0000000, 0x3ff21f63, 0x85ac8855, 0x3e6e8120, + 0x40000000, 0x3ff2238b, 0x4f24cb07, 0x3e5c8714, 0xa0000000, 0x3ff227b0, 0xee311fa2, 0x3e61e128, + 0x20000000, 0x3ff22bd4, 0x3d61a2d3, 0x3e5b5c16, 0xc0000000, 0x3ff22ff5, 0x7fb90633, 0x3e47d97e, + 0x70000000, 0x3ff23415, 0x9d50f6a7, 0x3e6efe89, 0x50000000, 0x3ff23833, 0xeb75de5a, 0x3e6d0333, + 0x60000000, 0x3ff23c4f, 0xbe73a573, 0x3e40e590, 0x80000000, 0x3ff24069, 0xcac3cdd2, 0x3e68ce8d, + 0xd0000000, 0x3ff24481, 0x8954064b, 0x3e6ee8a4, 0x50000000, 0x3ff24898, 0x18461e09, 0x3e6aa62f, + 0x00000000, 0x3ff24cad, 0x40986a15, 0x3e601e59, 0xe0000000, 0x3ff250bf, 0x4f9b8d4c, 0x3e3b082f, + 0xe0000000, 0x3ff254d0, 0xe5527f5a, 0x3e6876e0, 0x20000000, 0x3ff258e0, 0x80831e6b, 0x3e636170, + 0x90000000, 0x3ff25ced, 0xe34aa4a2, 0x3e681b26, 0x40000000, 0x3ff260f9, 0x6dfab0c1, 0x3e552ee6, + 0x20000000, 0x3ff26503, 0x329e8819, 0x3e5d85a5, 0x40000000, 0x3ff2690b, 0xb646b5d1, 0x3e5105c1, + 0x90000000, 0x3ff26d11, 0x0c1a379c, 0x3e6bb669, 0x30000000, 0x3ff27116, 0xa73ce3a9, 0x3e586aeb, + 0x00000000, 0x3ff27519, 0x98294dd4, 0x3e6dd161, 0x20000000, 0x3ff2791a, 0x75775e83, 0x3e6454e6, + 0x80000000, 0x3ff27d19, 0x026197ea, 0x3e63842e, 0x20000000, 0x3ff28117, 0xe70c44d2, 0x3e6f1ce0, + 0x10000000, 0x3ff28513, 0x441a5627, 0x3e6ad636, 0x50000000, 0x3ff2890d, 0xd7212abb, 0x3e54c205, + 0xd0000000, 0x3ff28d05, 0x6c116419, 0x3e6167c8, 0xa0000000, 0x3ff290fc, 0xef16e294, 0x3e638ec3, + 0xc0000000, 0x3ff294f1, 0xeace9321, 0x3e6473fc, 0x30000000, 0x3ff298e5, 0xa836dba7, 0x3e67af53, + 0x00000000, 0x3ff29cd7, 0xc383b652, 0x3e1a51f3, 0x10000000, 0x3ff2a0c7, 0xa190822d, 0x3e63696d, + 0x80000000, 0x3ff2a4b5, 0xec77074b, 
0x3e62f9ad, 0x50000000, 0x3ff2a8a2, 0xd5bee55f, 0x3e38190f, + 0x70000000, 0x3ff2ac8d, 0xfac68e55, 0x3e4bfee8, 0xf0000000, 0x3ff2b076, 0x6bc5f68a, 0x3e331c9d, + 0xc0000000, 0x3ff2b45e, 0x23737edf, 0x3e689d05, 0x00000000, 0x3ff2b845, 0x43bf47bb, 0x3e5a2959, + 0xa0000000, 0x3ff2bc29, 0x2e5b3207, 0x3e396be3, 0x90000000, 0x3ff2c00c, 0xd909fa0e, 0x3e6e44c7, + 0x00000000, 0x3ff2c3ee, 0xda94d9ea, 0x3e2b2505, 0xc0000000, 0x3ff2c7cd, 0xf46c9c98, 0x3e60c851, + 0xf0000000, 0x3ff2cbab, 0x7d9aa3b7, 0x3e5da71f, 0x80000000, 0x3ff2cf88, 0x5d019ef1, 0x3e6f1b60, + 0x90000000, 0x3ff2d363, 0xa2189563, 0x3e4386e8, 0x00000000, 0x3ff2d73d, 0x5d306ba7, 0x3e3b19fa, + 0xd0000000, 0x3ff2db14, 0xb67aef76, 0x3e6dd749, 0x20000000, 0x3ff2deeb, 0xf1dc04b0, 0x3e676ff6, + 0xe0000000, 0x3ff2e2bf, 0xd0b232a6, 0x3e635a33, 0x10000000, 0x3ff2e693, 0x0024a4e1, 0x3e64bdc8, + 0xb0000000, 0x3ff2ea64, 0x770fd723, 0x3e6ebd61, 0xd0000000, 0x3ff2ee34, 0xc537264d, 0x3e64769f, + 0x60000000, 0x3ff2f203, 0x429f3b98, 0x3e69021f, 0x70000000, 0x3ff2f5d0, 0x3efbd606, 0x3e5ee708, + 0xf0000000, 0x3ff2f99b, 0x552a6b1a, 0x3e6ad985, 0xf0000000, 0x3ff2fd65, 0x78772160, 0x3e6e3df7, + 0x70000000, 0x3ff3012e, 0x6ddc9b34, 0x3e6ca5d7, 0x70000000, 0x3ff304f5, 0xffdbaf74, 0x3e691154, + 0xf0000000, 0x3ff308ba, 0x57fb306a, 0x3e667bdd, 0xf0000000, 0x3ff30c7e, 0x5ac40886, 0x3e67dc25, + 0x80000000, 0x3ff31041, 0x8e8afafe, 0x3df219f3, 0x80000000, 0x3ff31402, 0xf9669a04, 0x3e62416b, + 0x10000000, 0x3ff317c2, 0xb2b3987f, 0x3e611c96, 0x20000000, 0x3ff31b80, 0x447e1177, 0x3e6f99ed, + 0xd0000000, 0x3ff31f3c, 0x26328a11, 0x3e132458, 0xf0000000, 0x3ff322f7, 0xd1e645f8, 0x3e66f56d, + 0xb0000000, 0x3ff326b1, 0x46945535, 0x3e461649, 0xf0000000, 0x3ff32a69, 0x9d190028, 0x3e5e37d5, + 0xc0000000, 0x3ff32e20, 0xf12bf828, 0x3e668671, 0x20000000, 0x3ff331d6, 0xca6aabbd, 0x3e6e8ecb, + 0x20000000, 0x3ff3358a, 0x109a5912, 0x3e53f49e, 0xa0000000, 0x3ff3393c, 0x11ec3043, 0x3e6b8a0e, + 0xc0000000, 0x3ff33ced, 0x0aed691a, 0x3e65fae0, 0x70000000, 0x3ff3409d, 
0xbece3e4a, 0x3e6c0569, + 0xc0000000, 0x3ff3444b, 0x744efbfe, 0x3e605e26, 0xa0000000, 0x3ff347f8, 0xa94be5c5, 0x3e65b570, + 0x20000000, 0x3ff34ba4, 0x6ea0e063, 0x3e5d6f15, 0x30000000, 0x3ff34f4e, 0x612fc484, 0x3e6e0ca7, + 0xf0000000, 0x3ff352f6, 0x27b25258, 0x3e4963c9, 0x40000000, 0x3ff3569e, 0xaa725a5c, 0x3e547930, + 0x30000000, 0x3ff35a44, 0xe3af43b3, 0x3e58a79f, 0xc0000000, 0x3ff35de8, 0x9c41bdaf, 0x3e5e6dc2, + 0xf0000000, 0x3ff3618b, 0x76f863a5, 0x3e657a2e, 0xd0000000, 0x3ff3652d, 0x1716354d, 0x3e2ae3b6, + 0x40000000, 0x3ff368ce, 0xdf6906b1, 0x3e665fb5, 0x60000000, 0x3ff36c6d, 0x7f588f7b, 0x3e66177d, + 0x30000000, 0x3ff3700b, 0xbd091b67, 0x3e3ad55a, 0xa0000000, 0x3ff373a7, 0xb2422d76, 0x3e155337, + 0xb0000000, 0x3ff37742, 0xe86972d5, 0x3e6084eb, 0x70000000, 0x3ff37adc, 0x808e1ea3, 0x3e656395, + 0xe0000000, 0x3ff37e74, 0x1b40fba7, 0x3e61bce2, 0x00000000, 0x3ff3820c, 0x4605b515, 0x3e5006f9, + 0xc0000000, 0x3ff385a1, 0xaceb1f7d, 0x3e6aa676, 0x40000000, 0x3ff38936, 0x76554ce6, 0x3e58229f, + 0x60000000, 0x3ff38cc9, 0x6cf57330, 0x3e6eabfc, 0x40000000, 0x3ff3905b, 0x9c0ce8bc, 0x3e64daed, + 0xd0000000, 0x3ff393eb, 0x68237141, 0x3e60ff17, 0x10000000, 0x3ff3977b, 0x3051b085, 0x3e6575f8, + 0x10000000, 0x3ff39b09, 0xeb523e29, 0x3e42667d, 0xc0000000, 0x3ff39e95, 0x6954f4fd, 0x3e181699, + 0x20000000, 0x3ff3a221, 0xcf4d9cd4, 0x3e587cfc, 0x40000000, 0x3ff3a5ab, 0x18198353, 0x3e52c5d0, + 0x10000000, 0x3ff3a934, 0x8dcc34aa, 0x3e6a7a89, 0xb0000000, 0x3ff3acbb, 0xdadc36d1, 0x3e2cead6, + 0x00000000, 0x3ff3b042, 0x9c498bdf, 0x3e2a5575, 0x00000000, 0x3ff3b3c7, 0x9ef6de04, 0x3e6c414a, + 0xd0000000, 0x3ff3b74a, 0x8a6e58fa, 0x3e63e210, 0x60000000, 0x3ff3bacd, 0x7643d77c, 0x3e5587fd, + 0xb0000000, 0x3ff3be4e, 0x1d3ff3df, 0x3e3901eb, 0xb0000000, 0x3ff3c1ce, 0x7c812fc6, 0x3e6f2ccd, + 0x90000000, 0x3ff3c54d, 0x70a01049, 0x3e21c8ee, 0x20000000, 0x3ff3c8cb, 0x02831eec, 0x3e563e8d, + 0x70000000, 0x3ff3cc47, 0x2a92c7ff, 0x3e6f61a4, 0xa0000000, 0x3ff3cfc2, 0x99c84d24, 0x3dda9173, + 0x80000000, 
0x3ff3d33c, 0xc8eec2f0, 0x3e5e9197, 0x30000000, 0x3ff3d6b5, 0x2f5a1378, 0x3e5e6f84, + 0xb0000000, 0x3ff3da2c, 0x2a90a0fc, 0x3e2fac24, 0xf0000000, 0x3ff3dda2, 0x26610227, 0x3e535ed7, + 0x00000000, 0x3ff3e118, 0x4804b15b, 0x3e50e0d6, 0xe0000000, 0x3ff3e48b, 0x5daba814, 0x3e056067, + 0x80000000, 0x3ff3e7fe, 0xc8768032, 0x3e637388, 0x00000000, 0x3ff3eb70, 0x9f9e01f5, 0x3e3ee3c8, + 0x40000000, 0x3ff3eee0, 0x0d09747c, 0x3e639f6f, 0x60000000, 0x3ff3f24f, 0x27abb8f0, 0x3e4322c3, + 0x40000000, 0x3ff3f5bd, 0x47c8ac80, 0x3e6961b3, 0x00000000, 0x3ff3f92a, 0xbbd0f118, 0x3e63711f, + 0x90000000, 0x3ff3fc95, 0xd7718ffb, 0x3e64fad8, 0xf0000000, 0x3ff3ffff, 0xffffffff, 0x3e6fffff, + 0x30000000, 0x3ff40369, 0x79ec35b4, 0x3e667efa, 0x40000000, 0x3ff406d1, 0x87a254a8, 0x3e6a7376, + 0x30000000, 0x3ff40a38, 0xf87d924d, 0x3e5bace0, 0xf0000000, 0x3ff40d9d, 0xc237e392, 0x3e629e37, + 0x90000000, 0x3ff41102, 0xac3f3012, 0x3e557ce7, 0x00000000, 0x3ff41466, 0x359f8fbd, 0x3e682829, + 0x50000000, 0x3ff417c8, 0x42d14676, 0x3e6cc9be, 0x80000000, 0x3ff41b29, 0x1c137d0b, 0x3e6a8f00, + 0x90000000, 0x3ff41e89, 0x687dda05, 0x3e636127, 0x80000000, 0x3ff421e8, 0x322646f0, 0x3e524dba, + 0x40000000, 0x3ff42546, 0x1ed210b4, 0x3e6dc43f, 0xf0000000, 0x3ff428a2, 0x15c447bb, 0x3e631ae5, + 0xf0000000, 0x3fe428a2, 0x15c447bb, 0x3e531ae5, 0xa0000000, 0x3fe965fe, 0xf20ac166, 0x3e44f5b8, + 0x00000000, 0x3ff00000, 0x00000000, 0x00000000, 0xf0000000, 0x3ff428a2, 0x15c447bb, 0x3e631ae5, + 0xa0000000, 0x3ff965fe, 0xf20ac166, 0x3e54f5b8, 0x00000000, 0x3ff00000, 0x00000000, 0x00000000, + 0x30000000, 0x3ff02c9a, 0xc1dcdef9, 0x3e6cef00, 0xd0000000, 0x3ff059b0, 0xa1d73e2a, 0x3e48ac2b, + 0x10000000, 0x3ff08745, 0x901186be, 0x3e60eb37, 0x60000000, 0x3ff0b558, 0x1ec53172, 0x3e69f312, + 0x30000000, 0x3ff0e3ec, 0x10103a17, 0x3e469e8d, 0xd0000000, 0x3ff11301, 0xa4ebbf1a, 0x3df25b50, + 0xa0000000, 0x3ff1429a, 0xbf668203, 0x3e6d525b, 0x30000000, 0x3ff172b8, 0xf5b9bef9, 0x3e68faa2, + 0xe0000000, 0x3ff1a35b, 0xea796d31, 0x3e66df96, 
0x30000000, 0x3ff1d487, 0xa7805b80, 0x3e368b9a, + 0x80000000, 0x3ff2063b, 0xac771dd6, 0x3e60c519, 0x60000000, 0x3ff2387a, 0x70cd83f5, 0x3e6ceac4, + 0x60000000, 0x3ff26b45, 0x7495e99c, 0x3e5789f3, 0xf0000000, 0x3ff29e9d, 0x84b09745, 0x3e547f7b, + 0xa0000000, 0x3ff2d285, 0x2d002475, 0x3e5b900c, 0x00000000, 0x3ff306fe, 0x2a5bd1ab, 0x3e64636e, + 0xb0000000, 0x3ff33c08, 0xfa64e430, 0x3e4320b7, 0x30000000, 0x3ff371a7, 0x2a9c5154, 0x3e5ceaa7, + 0x30000000, 0x3ff3a7db, 0xdba86f24, 0x3e53967f, 0x40000000, 0x3ff3dea6, 0x446b6824, 0x3e682468, + 0x20000000, 0x3ff4160a, 0x9f84325b, 0x3e3f72e2, 0x60000000, 0x3ff44e08, 0x40c4dbd0, 0x3e18624b, + 0xb0000000, 0x3ff486a2, 0x404f068e, 0x3e5704f3, 0xd0000000, 0x3ff4bfda, 0x9c750e5e, 0x3e54d8a8, + 0x70000000, 0x3ff4f9b2, 0x9ab4cf62, 0x3e5a74b2, 0x50000000, 0x3ff5342b, 0x077c2a0f, 0x3e5a753e, + 0x30000000, 0x3ff56f47, 0x699bb2c0, 0x3e5ad49f, 0xd0000000, 0x3ff5ab07, 0x52b19260, 0x3e6a90a8, + 0x10000000, 0x3ff5e76f, 0x21ba6f93, 0x3e56b485, 0xb0000000, 0x3ff6247e, 0x58f87d03, 0x3e0d2ac2, + 0x80000000, 0x3ff66238, 0x24893ecf, 0x3e42a911, 0x60000000, 0x3ff6a09e, 0x32422cbe, 0x3e59fcef, + 0x30000000, 0x3ff6dfb2, 0x5de441c5, 0x3e68ca34, 0xe0000000, 0x3ff71f75, 0xe7ba46e1, 0x3e61d8be, + 0x50000000, 0x3ff75feb, 0x22fdba6a, 0x3e59099f, 0x70000000, 0x3ff7a114, 0x36bea881, 0x3e4f580c, + 0x30000000, 0x3ff7e2f3, 0x8841740a, 0x3e5b3d39, 0x90000000, 0x3ff82589, 0x25159f11, 0x3e62999c, + 0x90000000, 0x3ff868d9, 0xd901c83b, 0x3e668925, 0x40000000, 0x3ff8ace5, 0xdadd3e2a, 0x3e415506, + 0x90000000, 0x3ff8f1ae, 0x6c57304e, 0x3e622aee, 0xb0000000, 0x3ff93737, 0x9e8a0387, 0x3e29b8bc, + 0x90000000, 0x3ff97d82, 0x9f173d24, 0x3e6fbc9c, 0x80000000, 0x3ff9c491, 0x80e3e235, 0x3e451f84, + 0x70000000, 0x3ffa0c66, 0xc96535b5, 0x3e66bbca, 0xb0000000, 0x3ffa5503, 0xe45a1224, 0x3e41f12a, + 0x50000000, 0x3ffa9e6b, 0xfd0fac90, 0x3e55e7f6, 0x90000000, 0x3ffae89f, 0x5abd0e69, 0x3e62b5a7, + 0xb0000000, 0x3ffb33a2, 0xf5ed7fa1, 0x3e609e2b, 0xf0000000, 0x3ffb7f76, 0x37553d84, 
0x3e47daf2, + 0x90000000, 0x3ffbcc1e, 0x891ee83d, 0x3e12f074, 0xd0000000, 0x3ffc199b, 0x38444196, 0x3e6b0aa5, + 0x20000000, 0x3ffc67f1, 0x9694426f, 0x3e6cafa2, 0xd0000000, 0x3ffcb720, 0xd22a0797, 0x3e69df20, + 0x40000000, 0x3ffd072d, 0xf71a1e45, 0x3e640f12, 0xd0000000, 0x3ffd5818, 0x0e4bb40b, 0x3e69f749, + 0x00000000, 0x3ffda9e6, 0x2b84600d, 0x3e4ed994, 0x30000000, 0x3ffdfc97, 0xf5cb4656, 0x3e4bdcda, + 0xe0000000, 0x3ffe502e, 0xd89cf44c, 0x3e5e2cff, 0xa0000000, 0x3ffea4af, 0xcc2c7b9d, 0x3e452486, + 0xe0000000, 0x3ffefa1b, 0x4eee3fa4, 0x3e6cc2b4, 0x50000000, 0x3fff5076, 0x80ce9f09, 0x3e66dc8a, + 0x80000000, 0x3fffa7c1, 0x82e90a7e, 0x3e39e90d, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x3f8fc0a8, 0x7c79f3db, 0x3e361f80, 0x00000000, 0x3f9f8298, 0x980267c8, 0x3e6873c1, + 0x00000000, 0x3fa77458, 0x9f88c69e, 0x3e5ec65b, 0x00000000, 0x3faf0a30, 0x54cc2f99, 0x3e58022c, + 0x00000000, 0x3fb341d7, 0x3a125330, 0x3e62c37a, 0x00000000, 0x3fb6f0d2, 0x69737c93, 0x3e615cad, + 0x00000000, 0x3fba926d, 0xb1b285e9, 0x3e4d256a, 0x00000000, 0x3fbe2707, 0xb97a7aa2, 0x3e5b8abc, + 0x00000000, 0x3fc0d77e, 0x9659a5dc, 0x3e6f3423, 0x80000000, 0x3fc29552, 0x48d30177, 0x3e6e07fd, + 0x00000000, 0x3fc44d2b, 0x4799f4f6, 0x3e6b32df, 0x00000000, 0x3fc5ff30, 0xf4f21cf8, 0x3e6c29e4, + 0x00000000, 0x3fc7ab89, 0x48df1b59, 0x3e1086c8, 0x80000000, 0x3fc9525a, 0xb4764130, 0x3e4cf456, + 0x00000000, 0x3fcaf3c9, 0xfcb63398, 0x3e63a02f, 0x80000000, 0x3fcc8ff7, 0x886b0976, 0x3e61e6a6, + 0x00000000, 0x3fce2707, 0xb97a7aa2, 0x3e6b8abc, 0x00000000, 0x3fcfb918, 0x8aa35552, 0x3e6b578f, + 0xc0000000, 0x3fd0a324, 0x71afb9fc, 0x3e6139c8, 0x80000000, 0x3fd1675c, 0x0701ce64, 0x3e65d5d3, + 0xc0000000, 0x3fd22941, 0xb2d12142, 0x3e6de7bc, 0x80000000, 0x3fd2e8e2, 0x984e1664, 0x3e6d708e, + 0x40000000, 0x3fd3a64c, 0xe9c72f36, 0x3e556945, 0xc0000000, 0x3fd4618b, 0x13e85bda, 0x3e20e2f6, + 0x80000000, 0x3fd51aad, 0xb42724f6, 0x3e3cb7e0, 0x80000000, 0x3fd5d1bd, 0xe52846c7, 0x3e6fac04, + 0x00000000, 0x3fd686c8, 
0xaec442be, 0x3e5e9b14, 0xc0000000, 0x3fd739d7, 0x034e7126, 0x3e6b5de8, + 0x00000000, 0x3fd7eaf8, 0xe1b259d3, 0x3e6dc157, 0x80000000, 0x3fd89a33, 0x6ad69c62, 0x3e3b0509, + 0x00000000, 0x3fd94794, 0xfaba4cdd, 0x3e5c2116, 0xc0000000, 0x3fd9f323, 0x25f95b47, 0x3e665fcc, + 0x80000000, 0x3fda9cec, 0x498d4850, 0x3e5a9a08, 0x40000000, 0x3fdb44f7, 0xb1465f77, 0x3e6de647, + 0x80000000, 0x3fdbeb4d, 0x7bf7861d, 0x3e5da71b, 0xc0000000, 0x3fdc8ff7, 0x86b09760, 0x3e3e6a68, + 0x40000000, 0x3fdd32fe, 0xeab0ef64, 0x3e6f0075, 0x00000000, 0x3fddd46a, 0x82fb989b, 0x3e330712, + 0x40000000, 0x3fde7442, 0xc3f1bed2, 0x3e60eb43, 0x40000000, 0x3fdf128f, 0xecb35c84, 0x3e5faf06, + 0x80000000, 0x3fdfaf58, 0x3db35f68, 0x3e4ef1e6, 0xa0000000, 0x3fe02552, 0xfb1a71a5, 0x3e469743, + 0x40000000, 0x3fe0723e, 0x404e5796, 0x3e6c1cdf, 0xe0000000, 0x3fe0be72, 0x0ada625e, 0x3e4094aa, + 0x80000000, 0x3fe109f3, 0x96fde3ec, 0x3e6e2d4c, 0xc0000000, 0x3fe154c3, 0xe9a98f34, 0x3e62f4d5, + 0xa0000000, 0x3fe19ee6, 0x6ecc5cbe, 0x3e6467c9, 0x40000000, 0x3fe1e85f, 0xd03dec5a, 0x3e6e7040, + 0xc0000000, 0x3fe23130, 0x4282de36, 0x3e67bebf, 0x00000000, 0x3fe2795e, 0x1aeb783f, 0x3e6289b1, + 0xe0000000, 0x3fe2c0e9, 0x1772f538, 0x3e5a891d, 0x20000000, 0x3fe307d7, 0xbe1fb591, 0x3e634f10, + 0x80000000, 0x3fe34e28, 0xd316eb93, 0x3e6d9ce1, 0xc0000000, 0x3fe393e0, 0x19a9c442, 0x3e63562a, + 0x60000000, 0x3fe3d902, 0xf548084c, 0x3e54e2ad, 0xe0000000, 0x3fe41d8f, 0x5cc8c97a, 0x3e508ce5, + 0xc0000000, 0x3fe4618b, 0x13e85bda, 0x3e30e2f6, 0x40000000, 0x3fe4a4f8, 0xbb0227bf, 0x3e6db03e, + 0x00000000, 0x3fe4e7d8, 0xb09cb098, 0x3e61b75b, 0x20000000, 0x3fe52a2d, 0xabb9df22, 0x3e496f16, + 0xc0000000, 0x3fe56bf9, 0x99411c62, 0x3e65b3f3, 0x40000000, 0x3fe5ad40, 0x59f65355, 0x3e586b3e, + 0xa0000000, 0x3fe5ee02, 0xeae1ac12, 0x3e52482c, 0xe0000000, 0x3fe62e42, 0xef35793c, 0x3e6efa39, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xa0000000, 0x3f6ff00a, 0x0250435a, 0x3db5885e, + 0x60000000, 0x3f7fe02a, 0x11f86ed2, 0x3de620cf, 0x50000000, 
0x3f87dc47, 0xedba4a25, 0x3dff0214, + 0xb0000000, 0x3f8fc0a8, 0x79f3db4e, 0x3dbf807c, 0x40000000, 0x3f93cea4, 0xa779a52b, 0x3dea352b, + 0x00000000, 0x3f97b91b, 0x6aa49fd5, 0x3dff56c4, 0x20000000, 0x3f9b9fc0, 0x5fef5196, 0x3dfebe46, + 0x00000000, 0x3f9f829b, 0x0099f1f8, 0x3e0cf066, 0x80000000, 0x3fa1b0d9, 0xff85945d, 0x3e1247b2, + 0xb0000000, 0x3fa39e87, 0xbf5202b6, 0x3e13fd7a, 0xa0000000, 0x3fa58a5b, 0xa918d51e, 0x3e1f91c9, + 0xf0000000, 0x3fa77458, 0xf118d3ca, 0x3e08cb73, 0x00000000, 0x3fa95c83, 0xd6fad074, 0x3e1d91c7, + 0x70000000, 0x3fab42dd, 0xec28d14c, 0x3de1971b, 0x80000000, 0x3fad276b, 0xa423c78a, 0x3e15b616, + 0xc0000000, 0x3faf0a30, 0x617cc971, 0x3da162a6, 0x30000000, 0x3fb07598, 0xc4c06d29, 0x3e166391, + 0xe0000000, 0x3fb16536, 0xc1d0c4b8, 0x3e2d46f5, 0x20000000, 0x3fb253f6, 0x2df1f6d3, 0x3e2e1428, + 0x90000000, 0x3fb341d7, 0x424a660d, 0x3e186f47, 0xb0000000, 0x3fb42edc, 0xe077753e, 0x3e2d4c8d, + 0x30000000, 0x3fb51b07, 0x7ed24f1c, 0x3e2e0c30, 0xa0000000, 0x3fb60658, 0x8763bdd3, 0x3e226ea1, + 0x80000000, 0x3fb6f0d2, 0x9737c933, 0x3e25cad6, 0x60000000, 0x3fb7da76, 0x99088901, 0x3e2af625, + 0xd0000000, 0x3fb8c345, 0x83d6b2d0, 0x3e18c66c, 0x40000000, 0x3fb9ab42, 0xb36fb30f, 0x3e1880ce, + 0x30000000, 0x3fba926d, 0xc6ca17a4, 0x3e2495aa, 0x20000000, 0x3fbb78c8, 0x4210878c, 0x3e2761db, + 0x80000000, 0x3fbc5e54, 0x862bac2f, 0x3e2eb78e, 0xd0000000, 0x3fbd4313, 0x75790dd9, 0x3e19b2cd, + 0x60000000, 0x3fbe2707, 0xcbd3d50f, 0x3e2c55e5, 0xc0000000, 0x3fbf0a30, 0x617cc971, 0x3db162a6, + 0x30000000, 0x3fbfec91, 0xaaa2e519, 0x3dfdbeab, 0x10000000, 0x3fc06715, 0x7150c647, 0x3e1652cb, + 0x70000000, 0x3fc0d77e, 0xb2cd2ee2, 0x3e39a11c, 0x80000000, 0x3fc14785, 0xb1a28813, 0x3e219d0a, + 0xd0000000, 0x3fc1b72a, 0x80a41811, 0x3e24bd9e, 0x10000000, 0x3fc2266f, 0x96faa3df, 0x3e3214b5, + 0xf0000000, 0x3fc29552, 0x46980bb8, 0x3e303fea, 0x10000000, 0x3fc303d7, 0xa5fd28c7, 0x3e31c8ff, + 0x20000000, 0x3fc371fc, 0x3bcd96c5, 0x3dce8f74, 0xb0000000, 0x3fc3dfc2, 0x395315c6, 0x3dfd98c5, + 
0x60000000, 0x3fc44d2b, 0x3ccfa7b2, 0x3e3996fa, 0xf0000000, 0x3fc4ba36, 0x2ad13037, 0x3e1cd2af, + 0xe0000000, 0x3fc526e5, 0xbd17200e, 0x3e1d0da1, 0xd0000000, 0x3fc59338, 0x0ba68b75, 0x3e333041, + 0x70000000, 0x3fc5ff30, 0x790e7c41, 0x3df4f27a, 0x40000000, 0x3fc66acd, 0x86f6ff1b, 0x3e13956a, + 0xe0000000, 0x3fc6d60f, 0x723551d9, 0x3e2c6748, 0xf0000000, 0x3fc740f8, 0x9326cdfc, 0x3e2500de, + 0x00000000, 0x3fc7ab89, 0x48df1b59, 0x3e1086c8, 0xa0000000, 0x3fc815c0, 0xad6836ff, 0x3e04357e, + 0x60000000, 0x3fc87fa0, 0x42408024, 0x3e248324, 0xd0000000, 0x3fc8e928, 0x8154b13d, 0x3e3d10da, + 0x90000000, 0x3fc9525a, 0x68ec8260, 0x3e39e8ad, 0x20000000, 0x3fc9bb36, 0x06abaf18, 0x3e3cfbf7, + 0x10000000, 0x3fca23bc, 0xc6326e23, 0x3e3fc56a, 0xf0000000, 0x3fca8bec, 0x3185cf21, 0x3e39105e, + 0x40000000, 0x3fcaf3c9, 0xe5b19cc0, 0x3e3d017f, 0x90000000, 0x3fcb5b51, 0x48dd13fe, 0x3e3d1f6b, + 0x70000000, 0x3fcbc286, 0x58a7e73a, 0x3e20b633, 0x50000000, 0x3fcc2968, 0x028c211c, 0x3e263063, + 0xc0000000, 0x3fcc8ff7, 0x86b09760, 0x3e2e6a68, 0x40000000, 0x3fccf635, 0xb891cd03, 0x3e3c138b, + 0x60000000, 0x3fcd5c21, 0x22b7221a, 0x3e369f77, 0xa0000000, 0x3fcdc1bc, 0xac1a628c, 0x3df57d8f, + 0x60000000, 0x3fce2707, 0xcbd3d50f, 0x3e3c55e5, 0x50000000, 0x3fce8c02, 0xff48fe2e, 0x3e1552d2, + 0xc0000000, 0x3fcef0ad, 0x6ca431bc, 0x3e37b8b2, 0x50000000, 0x3fcf550a, 0xdc1c5f6d, 0x3e292dec, + 0x60000000, 0x3fcfb918, 0x551aaa8c, 0x3e3abc7c, 0x40000000, 0x3fd00e6c, 0x731a354b, 0x3e36b540, + 0x90000000, 0x3fd04025, 0x036b89ef, 0x3e32d341, 0x50000000, 0x3fd071b8, 0x1a3a2e0f, 0x3e4f9ab2, + 0xe0000000, 0x3fd0a324, 0x1afb9fbd, 0x3e239c87, 0x50000000, 0x3fd0d46b, 0x2c81f640, 0x3e3e6add, + 0xf0000000, 0x3fd1058b, 0xaa313f41, 0x3e435c95, 0x00000000, 0x3fd13687, 0x82f6cc53, 0x3e249d45, + 0xa0000000, 0x3fd1675c, 0x1c07398f, 0x3e47574c, 0x20000000, 0x3fd1980d, 0xdece9e8d, 0x3e4ba846, + 0xc0000000, 0x3fd1c898, 0xafbc68e7, 0x3e16999f, 0x90000000, 0x3fd1f8ff, 0xe51b0103, 0x3e4c9145, + 0xf0000000, 0x3fd22941, 0xcb44850a, 
0x3e479ef2, 0x10000000, 0x3fd25960, 0x3de11275, 0x3e0beec7, + 0x10000000, 0x3fd2895a, 0x1af5a498, 0x3e2ef435, 0x30000000, 0x3fd2b930, 0x493b4a50, 0x3e45713a, + 0xb0000000, 0x3fd2e8e2, 0x61385992, 0x3e45c23a, 0xc0000000, 0x3fd31871, 0x09f57299, 0x3e42a883, + 0x90000000, 0x3fd347dd, 0xa9ac8ace, 0x3e4530fa, 0x60000000, 0x3fd37726, 0xd792a758, 0x3e25fec2, + 0x50000000, 0x3fd3a64c, 0xa71cbcd7, 0x3e35a517, 0xa0000000, 0x3fd3d54f, 0x3e1cd9a3, 0x3e3707dc, + 0x80000000, 0x3fd40430, 0x8ef43049, 0x3e3a1a9f, 0x20000000, 0x3fd432ef, 0x276b3674, 0x3e4409d0, + 0xc0000000, 0x3fd4618b, 0x13e85bd9, 0x3e20e2f6, 0x80000000, 0x3fd49006, 0x33001e5f, 0x3df00274, + 0x90000000, 0x3fd4be5f, 0x836d3265, 0x3e35dde2, 0x30000000, 0x3fd4ec97, 0x4d7aaf04, 0x3e230013, + 0x80000000, 0x3fd51aad, 0xb42724f5, 0x3e3cb7e0, 0xc0000000, 0x3fd548a2, 0x167e6308, 0x3e2d6e93, + 0x10000000, 0x3fd57677, 0xb1526adb, 0x3e3d1569, 0xb0000000, 0x3fd5a42a, 0x338a1a41, 0x3e0e99fc, + 0xb0000000, 0x3fd5d1bd, 0x94a11b1c, 0x3e4eb013, 0x70000000, 0x3fd5ff30, 0x790e7c41, 0x3e04f27a, + 0xf0000000, 0x3fd62c82, 0xa97b7af9, 0x3e25ce3c, 0x70000000, 0x3fd659b5, 0x940ed857, 0x3e281f0f, + 0x10000000, 0x3fd686c8, 0x5d88857c, 0x3e4d3629, 0x20000000, 0x3fd6b3bb, 0xec4af526, 0x3e21aca1, + 0xa0000000, 0x3fd6e08e, 0xc7182726, 0x3e445743, 0xe0000000, 0x3fd70d42, 0xaead337e, 0x3e23c491, + 0xf0000000, 0x3fd739d7, 0x1a738931, 0x3e3aef40, 0x10000000, 0x3fd7664e, 0x76092a29, 0x3e21cede, + 0x50000000, 0x3fd792a5, 0x44f82bb4, 0x3e4fba8f, 0x00000000, 0x3fd7bede, 0x7f3c3e1a, 0x3e446f5f, + 0x30000000, 0x3fd7eaf8, 0x86c9674b, 0x3e47055f, 0x10000000, 0x3fd816f4, 0x2b6b6e1a, 0x3e4b41a9, + 0xd0000000, 0x3fd842d1, 0x2e927628, 0x3e443d16, 0x90000000, 0x3fd86e91, 0x4013f9b1, 0x3e446617, + 0x80000000, 0x3fd89a33, 0x6ad69c62, 0x3e3b0509, 0xc0000000, 0x3fd8c5b7, 0x150faa58, 0x3e40b169, + 0x80000000, 0x3fd8f11e, 0x1df85da7, 0x3e3cd98b, 0xe0000000, 0x3fd91c67, 0x7b0f8fa8, 0x3e468b50, + 0x10000000, 0x3fd94794, 0xf57499ba, 0x3e48422d, 0x40000000, 0x3fd972a3, 
0x86970274, 0x3e113515, + 0x80000000, 0x3fd99d95, 0xacba92ee, 0x3e117e08, 0x00000000, 0x3fd9c86b, 0x14dd0229, 0x3e26e043, + 0xe0000000, 0x3fd9f323, 0x97e56d1a, 0x3e497f30, 0x60000000, 0x3fda1dc0, 0x55901286, 0x3e3356e6, + 0x90000000, 0x3fda4840, 0x457f94d6, 0x3e0cb761, 0x90000000, 0x3fda72a4, 0xa85a9dac, 0x3e39af67, + 0x90000000, 0x3fda9cec, 0x931a909f, 0x3e453410, 0xc0000000, 0x3fdac718, 0x206058f5, 0x3e22c587, + 0x30000000, 0x3fdaf129, 0x58899c22, 0x3e223bc3, 0x00000000, 0x3fdb1b1e, 0xb6d223cb, 0x3e4d7bf8, + 0x70000000, 0x3fdb44f7, 0xc5197ddb, 0x3e47991e, 0x90000000, 0x3fdb6eb5, 0xbb3a9219, 0x3e4a79e6, + 0x90000000, 0x3fdb9858, 0xed663ec5, 0x3e3a4c43, 0x80000000, 0x3fdbc1e0, 0x1484f438, 0x3e461b5a, + 0x90000000, 0x3fdbeb4d, 0xf7ef0c3a, 0x3e4b4e36, 0xf0000000, 0x3fdc149f, 0x6acd0d1b, 0x3e115f02, + 0xa0000000, 0x3fdc3dd7, 0x35cecf05, 0x3e3f36b5, 0xe0000000, 0x3fdc66f4, 0xbf3eb5c6, 0x3e2ffb7f, + 0xc0000000, 0x3fdc8ff7, 0x86b09760, 0x3e3e6a68, 0x70000000, 0x3fdcb8e0, 0x27f5bbc3, 0x3e3135eb, + 0x00000000, 0x3fdce1af, 0xd6f6fa57, 0x3e470be7, 0xa0000000, 0x3fdd0a63, 0xc84ab338, 0x3e4ce43c, + 0x70000000, 0x3fdd32fe, 0xaac3bd91, 0x3e4c01d7, 0x90000000, 0x3fdd5b7f, 0x07961060, 0x3e45c58d, + 0x20000000, 0x3fdd83e7, 0xf941456e, 0x3e3628bc, 0x30000000, 0x3fddac35, 0xa8461cd2, 0x3e4c58b2, + 0x00000000, 0x3fddd46a, 0x82fb989a, 0x3e330712, 0x90000000, 0x3fddfc85, 0x6a80f09c, 0x3e420dab, + 0x10000000, 0x3fde2488, 0x4c397b1e, 0x3e44f8d8, 0xa0000000, 0x3fde4c71, 0x08599e48, 0x3e40d0ee, + 0x60000000, 0x3fde7442, 0x7e37da36, 0x3e1d6878, 0x60000000, 0x3fde9bfa, 0xd591bafc, 0x3e366187, + 0xd0000000, 0x3fdec399, 0x00bae772, 0x3e223466, 0xc0000000, 0x3fdeeb20, 0xd0d61b8e, 0x3e390377, + 0x50000000, 0x3fdf128f, 0xd966b907, 0x3e4f5e0d, 0xb0000000, 0x3fdf39e5, 0xb79a00e2, 0x3e49023c, + 0xf0000000, 0x3fdf6123, 0x58c28ad8, 0x3e44e051, 0x30000000, 0x3fdf884a, 0x08b18ae4, 0x3e3bfa7b, + 0x80000000, 0x3fdfaf58, 0x3db35f67, 0x3e4ef1e6, 0x20000000, 0x3fdfd64f, 0x39493d4f, 0x3e0ec2ae, + 0x00000000, 
0x3fdffd2e, 0x30ab2fa0, 0x3e40afe9, 0xb0000000, 0x3fe011fa, 0xa1810dd4, 0x3e225ff8, + 0xa0000000, 0x3fe02552, 0xfb1a71a5, 0x3e469743, 0xe0000000, 0x3fe0389e, 0x76785571, 0x3e5f9cc6, + 0x90000000, 0x3fe04bdf, 0xa4cbf982, 0x3e5b524d, 0xb0000000, 0x3fe05f14, 0x381535b8, 0x3e5a4c8b, + 0x50000000, 0x3fe0723e, 0x809caf2c, 0x3e5839be, 0x80000000, 0x3fe0855c, 0x1cb82c13, 0x3e50968a, + 0x40000000, 0x3fe0986f, 0x41723fb5, 0x3e5eae6a, 0xb0000000, 0x3fe0ab76, 0xa380a4db, 0x3e5d9c29, + 0xe0000000, 0x3fe0be72, 0x0ada625e, 0x3e4094aa, 0xc0000000, 0x3fe0d163, 0x6fc108ca, 0x3e5973ad, + 0x80000000, 0x3fe0e449, 0x2fdbab97, 0x3e474732, 0x10000000, 0x3fe0f724, 0xfa9d4221, 0x3e593692, + 0x90000000, 0x3fe109f3, 0x2dfbc7d9, 0x3e5c5a99, 0x10000000, 0x3fe11cb8, 0xe102387a, 0x3e4e1f33, + 0x90000000, 0x3fe12f71, 0xf14c048c, 0x3e464fbe, 0x20000000, 0x3fe14220, 0x13ca5e3b, 0x3e4490f5, + 0xd0000000, 0x3fe154c3, 0x4d4c799d, 0x3e37a6af, 0xa0000000, 0x3fe1675c, 0x1c07398f, 0x3e57574c, + 0xb0000000, 0x3fe179ea, 0x417f8c1c, 0x3e57b133, 0x00000000, 0x3fe18c6e, 0x0c176514, 0x3e5feb9e, + 0xb0000000, 0x3fe19ee6, 0xbb3172f7, 0x3e419f25, 0xb0000000, 0x3fe1b154, 0x7bbfb852, 0x3e45f68a, + 0x10000000, 0x3fe1c3b8, 0x497929f1, 0x3e5ee278, 0xf0000000, 0x3fe1d610, 0x06109d58, 0x3e5ccee0, + 0x50000000, 0x3fe1e85f, 0xa07bd8b3, 0x3e5ce081, 0x40000000, 0x3fe1faa3, 0x981817b8, 0x3e570e12, + 0xd0000000, 0x3fe20cdc, 0xd93503d0, 0x3e292ab6, 0xf0000000, 0x3fe21f0b, 0xd7c3b61e, 0x3e58cb7d, + 0xd0000000, 0x3fe23130, 0x0a0b78da, 0x3e4efafd, 0x60000000, 0x3fe2434b, 0x67c4288e, 0x3e5e9072, + 0xc0000000, 0x3fe2555b, 0x96780875, 0x3e5d31ef, 0x00000000, 0x3fe26762, 0xfcd2ad50, 0x3e23430d, + 0x10000000, 0x3fe2795e, 0xd75bc1f9, 0x3e344d88, 0x00000000, 0x3fe28b50, 0x055e04fc, 0x3e5bec0f, + 0xf0000000, 0x3fe29d37, 0x1590b9ad, 0x3e5d8561, 0xf0000000, 0x3fe2af15, 0x8e583229, 0x3df32056, + 0xe0000000, 0x3fe2c0e9, 0x1772f538, 0x3e5a891d, 0x00000000, 0x3fe2d2b4, 0xdabba74d, 0x3e22edc9, + 0x30000000, 0x3fe2e474, 0xa1015086, 0x3e4b9009, 
0x90000000, 0x3fe2f62a, 0x8c5b1a19, 0x3e52a12a, + 0x30000000, 0x3fe307d7, 0xf0fdac85, 0x3e3a7885, 0x00000000, 0x3fe3197a, 0xd43ac691, 0x3e5f4ffc, + 0x30000000, 0x3fe32b13, 0xe2640aad, 0x3e52243a, 0xb0000000, 0x3fe33ca2, 0x299035d3, 0x3e546513, + 0x90000000, 0x3fe34e28, 0xa62dd725, 0x3e5b39c3, 0xe0000000, 0x3fe35fa4, 0x40049f51, 0x3e5ba6dd, + 0xb0000000, 0x3fe37117, 0xd7177409, 0x3e451d1e, 0xf0000000, 0x3fe38280, 0xfd7f5216, 0x3e5cb0f2, + 0xd0000000, 0x3fe393e0, 0xcd4e2213, 0x3e3ab150, 0x30000000, 0x3fe3a537, 0xf3193844, 0x3e5cfd7b, + 0x40000000, 0x3fe3b684, 0x455f1dbd, 0x3e53fff8, 0xf0000000, 0x3fe3c7c7, 0x0b905fc9, 0x3e5fee64, + 0x60000000, 0x3fe3d902, 0xf548084c, 0x3e54e2ad, 0x90000000, 0x3fe3ea33, 0xdc1ecdd2, 0x3e3b597a, + 0x80000000, 0x3fe3fb5b, 0x096d3a75, 0x3e4345bd, 0x40000000, 0x3fe40c7a, 0xd2453c8b, 0x3e5101b9, + 0xe0000000, 0x3fe41d8f, 0x5cc8c979, 0x3e508ce5, 0x60000000, 0x3fe42e9c, 0x7e595f71, 0x3e5bbf01, + 0xe0000000, 0x3fe43f9f, 0x3bd393dc, 0x3e37ce73, 0x50000000, 0x3fe4509a, 0xa503f8a1, 0x3e233bb0, + 0xc0000000, 0x3fe4618b, 0x13e85bd9, 0x3e30e2f6, 0x30000000, 0x3fe47274, 0x5a635b3c, 0x3e5e6755, + 0xd0000000, 0x3fe48353, 0xf73d5e8b, 0x3e2ea88d, 0x80000000, 0x3fe4942a, 0x3bda18a8, 0x3e3d17e0, + 0x50000000, 0x3fe4a4f8, 0x76044f7e, 0x3e5b607d, 0x60000000, 0x3fe4b5bd, 0xe71bc2fc, 0x3e52adc4, + 0xa0000000, 0x3fe4c679, 0x7362d1d9, 0x3e5f99dc, 0x30000000, 0x3fe4d72d, 0x008e6a6a, 0x3e5473fa, + 0x10000000, 0x3fe4e7d8, 0x09cb0985, 0x3e2b75bb, 0x30000000, 0x3fe4f87a, 0xd10b9aba, 0x3e5ea04d, + 0xc0000000, 0x3fe50913, 0xd6979674, 0x3e5802d0, 0xc0000000, 0x3fe519a4, 0xccd99094, 0x3e174688, + 0x20000000, 0x3fe52a2d, 0xabb9df22, 0x3e496f16, 0x00000000, 0x3fe53aad, 0xf2aa374f, 0x3e46e66d, + 0x60000000, 0x3fe54b24, 0x5ea4550a, 0x3e4e6652, 0x50000000, 0x3fe55b93, 0x34f20cbd, 0x3e42d02f, + 0xd0000000, 0x3fe56bf9, 0x65047188, 0x3e46cfce, 0xf0000000, 0x3fe57c57, 0x842d58b8, 0x3e39b78c, + 0xb0000000, 0x3fe58cad, 0x24c24bc9, 0x3e4735e6, 0x20000000, 0x3fe59cfb, 0xf7dd1adf, 
0x3e47eba1, + 0x40000000, 0x3fe5ad40, 0x59f65355, 0x3e586b3e, 0x30000000, 0x3fe5bd7d, 0x637f1b4d, 0x3e1ce38e, + 0xd0000000, 0x3fe5cdb1, 0xc919edc7, 0x3e58d82e, 0x50000000, 0x3fe5ddde, 0x8ddcfa37, 0x3e4c5264, + 0xa0000000, 0x3fe5ee02, 0xeae1ac12, 0x3e52482c, 0xd0000000, 0x3fe5fe1e, 0x311aba4f, 0x3e55a312, + 0xf0000000, 0x3fe60e32, 0x6329f225, 0x3e411e23, 0xf0000000, 0x3fe61e3e, 0xcd2f246c, 0x3e5b48c8, + 0xe0000000, 0x3fe62e42, 0xef35793c, 0x3e6efa39, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x40000000, 0x00000000, 0x00000000, 0x00000000, 0x3fffe000, 0xe01fe020, 0x3effe01f, + 0x00000000, 0x3fffc000, 0x01fc07f0, 0x3f1fc07f, 0x00000000, 0x3fffa000, 0x1fa11caa, 0x3f31caa0, + 0x00000000, 0x3fff8000, 0x1f81f820, 0x3f3f81f8, 0x00000000, 0x3fff6000, 0x06ddaba6, 0x3f488565, + 0x00000000, 0x3fff4000, 0x2909c560, 0x3f519679, 0x00000000, 0x3fff2000, 0x8c2ad433, 0x3f57d910, + 0x00000000, 0x3fff0000, 0xf07c1f08, 0x3f5f07c1, 0x00000000, 0x3ffee000, 0x8b1c03dd, 0x3f638ff0, + 0x00000000, 0x3ffec000, 0x03d980f6, 0x3f680f66, 0x00000000, 0x3ffea000, 0x7403d5d0, 0x3f6d00f5, + 0x00000000, 0x3ffe9000, 0x0b7672a0, 0x3f331abf, 0x00000000, 0x3ffe7000, 0x5d43919b, 0x3f506a96, + 0x00000000, 0x3ffe5000, 0x0795ceb2, 0x3f5ceb24, 0x00000000, 0x3ffe3000, 0xb834e67f, 0x3f6522f3, + 0x00000000, 0x3ffe1000, 0x3c3c3c3c, 0x3f6c3c3c, 0x00000000, 0x3ffe0000, 0x1e01e01e, 0x3f3e01e0, + 0x00000000, 0x3ffde000, 0xe21a291c, 0x3f575b8f, 0x00000000, 0x3ffdc000, 0x403b9404, 0x3f6403b9, + 0x00000000, 0x3ffda000, 0x7303b5cc, 0x3f6cc0ed, 0x00000000, 0x3ffd9000, 0xf3fc4da2, 0x3f479118, + 0x00000000, 0x3ffd7000, 0xe0b0ce46, 0x3f5ed952, 0x00000000, 0x3ffd5000, 0xeae56404, 0x3f695900, + 0x00000000, 0x3ffd4000, 0x1d41d41d, 0x3f3d41d4, 0x00000000, 0x3ffd2000, 0xf16c69ae, 0x3f5cb28f, + 0x00000000, 0x3ffd0000, 0xdd80e866, 0x3f696b1e, 0x00000000, 0x3ffcf000, 0x25fe30d9, 0x3f4372e2, + 0x00000000, 0x3ffcd000, 0x073615a2, 0x3f60ad12, 0x00000000, 0x3ffcb000, 0x0397cdb3, 0x3f6cdb2c, + 0x00000000, 0x3ffca000, 
0x7b864407, 0x3f52cc15, 0x00000000, 0x3ffc8000, 0xf7148404, 0x3f664cb5, + 0x00000000, 0x3ffc7000, 0x1c71c71c, 0x3f3c71c7, 0x00000000, 0x3ffc5000, 0x1a930b84, 0x3f6129a2, + 0x00000000, 0x3ffc3000, 0x87f1e038, 0x3f6f1e03, 0x00000000, 0x3ffc2000, 0xba80709b, 0x3f5ad4e4, + 0x00000000, 0x3ffc0000, 0x0381c0e0, 0x3f6c0e07, 0x00000000, 0x3ffbf000, 0x1a362bb0, 0x3f560fba, + 0x00000000, 0x3ffbd000, 0x280dee96, 0x3f6a5713, 0x00000000, 0x3ffbc000, 0x20f9ece9, 0x3f53f596, + 0x00000000, 0x3ffba000, 0x83759f23, 0x3f69f229, 0x00000000, 0x3ffb9000, 0x63fc8d5c, 0x3f5478ac, + 0x00000000, 0x3ffb7000, 0xb4671656, 0x3f6ad87b, 0x00000000, 0x3ffb6000, 0xfbb8148c, 0x3f578b8e, + 0x00000000, 0x3ffb4000, 0xd0369d03, 0x3f6d0369, 0x00000000, 0x3ffb3000, 0x601b3748, 0x3f5d212b, + 0x00000000, 0x3ffb2000, 0x406c80d9, 0x3f0b2036, 0x00000000, 0x3ffb0000, 0xb24547d1, 0x3f629663, + 0x00000000, 0x3ffaf000, 0x0d79435e, 0x3f4435e5, 0x00000000, 0x3ffad000, 0x2920bc03, 0x3f67d0ff, + 0x00000000, 0x3ffac000, 0x15c06b16, 0x3f55c06b, 0x00000000, 0x3ffaa000, 0x0fd7f954, 0x3f6e3a5f, + 0x00000000, 0x3ffa9000, 0xd4c77b03, 0x3f61dec0, 0x00000000, 0x3ffa8000, 0x870ac52e, 0x3f473289, + 0x00000000, 0x3ffa6000, 0xa034da03, 0x3f6a034d, 0x00000000, 0x3ffa5000, 0xa2292856, 0x3f5d041d, + 0x00000000, 0x3ffa4000, 0x1a41a41a, 0x3f3a41a4, 0x00000000, 0x3ffa2000, 0x8a39409d, 0x3f68550f, + 0x00000000, 0x3ffa1000, 0xe92c0686, 0x3f5b4fe5, 0x00000000, 0x3ffa0000, 0x1a01a01a, 0x3f3a01a0, + 0x00000000, 0x3ff9e000, 0x2067b23a, 0x3f691d2a, 0x00000000, 0x3ff9d000, 0xada0b4e5, 0x3f5e7c5d, + 0x00000000, 0x3ff9c000, 0x25080ce1, 0x3f468a77, 0x00000000, 0x3ff9a000, 0xaa21b490, 0x3f6c49d4, + 0x00000000, 0x3ff99000, 0x33333333, 0x3f633333, 0x00000000, 0x3ff98000, 0x3b03fccf, 0x3f54bc36, + 0x00000000, 0x3ff97000, 0x970e4f81, 0x3f2c9f01, 0x00000000, 0x3ff95000, 0xc6ef5b25, 0x3f697617, + 0x00000000, 0x3ff94000, 0xadd3c0ca, 0x3f6161f9, 0x00000000, 0x3ff93000, 0x6cb39806, 0x3f5319fe, + 0x00000000, 0x3ff92000, 0x1c451ab3, 0x3f2f693a, 0x00000000, 
0x3ff90000, 0x0321a9e2, 0x3f6a9e24, + 0x00000000, 0x3ff8f000, 0x3831f383, 0x3f63831f, 0x00000000, 0x3ff8e000, 0xc4dcfc1c, 0x3f5949eb, + 0x00000000, 0x3ff8d000, 0x80c6980c, 0x3f480c69, 0x00000000, 0x3ff8b000, 0xc5fe7403, 0x3f6f9d00, + 0x00000000, 0x3ff8a000, 0xd7e75347, 0x3f69721e, 0x00000000, 0x3ff89000, 0x0313381f, 0x3f6381ec, + 0x00000000, 0x3ff88000, 0xaec12653, 0x3f5b97c2, 0x00000000, 0x3ff87000, 0x024ae3ba, 0x3f509ef3, + 0x00000000, 0x3ff86000, 0x18618618, 0x3f386186, 0x00000000, 0x3ff84000, 0xf00c2780, 0x3f6e0184, + 0x00000000, 0x3ff83000, 0x657dba52, 0x3f692ef5, 0x00000000, 0x3ff82000, 0x05494030, 0x3f649403, + 0x00000000, 0x3ff81000, 0x30303030, 0x3f603030, 0x00000000, 0x3ff80000, 0x80601806, 0x3f580601, + 0x00000000, 0x3ff7f000, 0x05fd017f, 0x3f5017f4, 0x00000000, 0x3ff7e000, 0xd278e8dd, 0x3f412a8a, + 0x00000000, 0x3ff7d000, 0x417d05f4, 0x3f17d05f, 0x00000000, 0x3ff7b000, 0x5c02f7d6, 0x3f6d6724, + 0x00000000, 0x3ff7a000, 0xc1d986a9, 0x3f6a4411, 0x00000000, 0x3ff79000, 0x6c7316df, 0x3f6754d7, + 0x00000000, 0x3ff78000, 0xf149902f, 0x3f649902, 0x00000000, 0x3ff77000, 0x358c1a68, 0x3f621023, + 0x00000000, 0x3ff76000, 0xd2a6c406, 0x3f5f7390, 0x00000000, 0x3ff75000, 0x05d5b2b1, 0x3f5b2b08, + 0x00000000, 0x3ff74000, 0x745d1746, 0x3f5745d1, 0x00000000, 0x3ff73000, 0x07fa32c4, 0x3f53c315, + 0x00000000, 0x3ff72000, 0x1b7af017, 0x3f50a1fd, 0x00000000, 0x3ff71000, 0xe3e0453a, 0x3f4bc36c, + 0x00000000, 0x3ff70000, 0x5c0b8170, 0x3f4702e0, 0x00000000, 0x3ff6f000, 0x9300b793, 0x3f4300b7, + 0x00000000, 0x3ff6e000, 0x337c6cb1, 0x3f3f76b4, 0x00000000, 0x3ff6d000, 0x1c860fb0, 0x3f3a6268, + 0x00000000, 0x3ff6c000, 0x16c16c17, 0x3f36c16c, 0x00000000, 0x3ff6b000, 0x31a3cfc7, 0x3f3490aa, + 0x00000000, 0x3ff6a000, 0x3729043e, 0x3f33cd15, 0x00000000, 0x3ff69000, 0x8d0bfd2e, 0x3f3473a8, + 0x00000000, 0x3ff68000, 0x16816817, 0x3f368168, 0x00000000, 0x3ff67000, 0x16719f36, 0x3f39f360, + 0x00000000, 0x3ff66000, 0x122f9016, 0x3f3ec6a5, 0x00000000, 0x3ff65000, 0xda5519cf, 0x3f427c29, + 
0x00000000, 0x3ff64000, 0x590b2164, 0x3f4642c8, 0x00000000, 0x3ff63000, 0x5606f00b, 0x3f4ab5c4, + 0x00000000, 0x3ff62000, 0x0b11fd3c, 0x3f4fd3b8, 0x00000000, 0x3ff61000, 0xc6ba4eaa, 0x3f52cda0, + 0x00000000, 0x3ff60000, 0x60581606, 0x3f560581, 0x00000000, 0x3ff5f000, 0xa4b7ef87, 0x3f5990d0, + 0x00000000, 0x3ff5e000, 0x40579d6f, 0x3f5d6ee3, 0x00000000, 0x3ff5d000, 0xd9c54a69, 0x3f60cf87, + 0x00000000, 0x3ff5c000, 0x2620ae4c, 0x3f631057, 0x00000000, 0x3ff5b000, 0x8ff522a2, 0x3f65798c, + 0x00000000, 0x3ff5a000, 0x02b580ad, 0x3f680ad6, 0x00000000, 0x3ff59000, 0x4799546f, 0x3f6ac3e2, + 0x00000000, 0x3ff58000, 0x02b1da46, 0x3f6da461, 0x00000000, 0x3ff58000, 0x01580560, 0x3f158056, + 0x00000000, 0x3ff57000, 0x06b39a23, 0x3f3ed3c5, 0x00000000, 0x3ff56000, 0xe2970f60, 0x3f4cbdd3, + 0x00000000, 0x3ff55000, 0x55555555, 0x3f555555, 0x00000000, 0x3ff54000, 0xee0bf805, 0x3f5c979a, + 0x00000000, 0x3ff53000, 0xe81fd58e, 0x3f621291, 0x00000000, 0x3ff52000, 0x500a9580, 0x3f65fead, + 0x00000000, 0x3ff51000, 0xc5f02a3a, 0x3f6a0fd5, 0x00000000, 0x3ff50000, 0x23898adc, 0x3f6e45c2, + 0x00000000, 0x3ff50000, 0x15015015, 0x3f350150, 0x00000000, 0x3ff4f000, 0xea64d422, 0x3f4c7b16, + 0x00000000, 0x3ff4e000, 0xbc14e5e1, 0x3f57829c, 0x00000000, 0x3ff4d000, 0xb8589720, 0x3f60877d, + 0x00000000, 0x3ff4c000, 0x4b5edcea, 0x3f65710e, 0x00000000, 0x3ff4b000, 0x4d1fc1c8, 0x3f6a7dbb, + 0x00000000, 0x3ff4a000, 0xa57eb503, 0x3f6fad40, 0x00000000, 0x3ff4a000, 0xb00a5140, 0x3f43fd6b, + 0x00000000, 0x3ff49000, 0xcb419ba9, 0x3f54e78e, 0x00000000, 0x3ff48000, 0x029100a4, 0x3f600a44, + 0x00000000, 0x3ff47000, 0x5c28f5c3, 0x3f65c28f, 0x00000000, 0x3ff46000, 0xb2c0cc4a, 0x3f6b9c68, + 0x00000000, 0x3ff46000, 0xb9f34381, 0x3f2978fe, 0x00000000, 0x3ff45000, 0x3bb6500a, 0x3f4ecf16, + 0x00000000, 0x3ff44000, 0x8b67ebb9, 0x3f5be195, 0x00000000, 0x3ff43000, 0x57dc9a3b, 0x3f644e61, + 0x00000000, 0x3ff42000, 0xaa3f0ddf, 0x3f6acc4b, 0x00000000, 0x3ff42000, 0xcb2a247b, 0x3f26a4cb, + 0x00000000, 0x3ff41000, 0x50505050, 
0x3f505050, 0x00000000, 0x3ff40000, 0x39959819, 0x3f5e0b44, + 0x00000000, 0x3ff3f000, 0x6027f602, 0x3f66027f, 0x00000000, 0x3ff3e000, 0x4b5e0db4, 0x3f6d1e85, + 0x00000000, 0x3ff3e000, 0x254813e2, 0x3f4165e7, 0x00000000, 0x3ff3d000, 0xa9d716ef, 0x3f576646, + 0x00000000, 0x3ff3c000, 0xf757ce88, 0x3f632b48, 0x00000000, 0x3ff3b000, 0x4652a906, 0x3f6ac1b2, + 0x00000000, 0x3ff3b000, 0x13b13b14, 0x3f33b13b, 0x00000000, 0x3ff3a000, 0xeb208984, 0x3f5490e1, + 0x00000000, 0x3ff39000, 0x30fec66e, 0x3f623858, 0x00000000, 0x3ff38000, 0xcc111b7e, 0x3f6a45a6, + 0x00000000, 0x3ff38000, 0x13813814, 0x3f338138, 0x00000000, 0x3ff37000, 0x2517b708, 0x3f556f47, + 0x00000000, 0x3ff36000, 0xbc0e8f2a, 0x3f631be7, 0x00000000, 0x3ff35000, 0x3e55f044, 0x3f6b9cbf, + 0x00000000, 0x3ff35000, 0x5bc609a9, 0x3f40e7d9, 0x00000000, 0x3ff34000, 0x804d19e7, 0x3f59e6b3, + 0x00000000, 0x3ff33000, 0xaf7963c2, 0x3f65c8b6, 0x00000000, 0x3ff32000, 0xd43bf402, 0x3f6eb9da, + 0x00000000, 0x3ff32000, 0x5885fb37, 0x3f4f1a51, 0x00000000, 0x3ff31000, 0xd3d76c02, 0x3f60eeb1, + 0x00000000, 0x3ff30000, 0x61a32026, 0x3f6a3202, 0x00000000, 0x3ff30000, 0x40260390, 0x3f3c82ac, + 0x00000000, 0x3ff2f000, 0x84bda12f, 0x3f5a12f6, 0x00000000, 0x3ff2e000, 0xfda2962c, 0x3f669d43, + 0x00000000, 0x3ff2e000, 0xc04b8097, 0x3f02e025, 0x00000000, 0x3ff2d000, 0xb542804b, 0x3f542804, + 0x00000000, 0x3ff2c000, 0x02593f6a, 0x3f63f69b, 0x00000000, 0x3ff2b000, 0xb46e21fa, 0x3f6df31c, + 0x00000000, 0x3ff2b000, 0x04ad012b, 0x3f5012b4, 0x00000000, 0x3ff2a000, 0xe7820a7f, 0x3f623925, + 0x00000000, 0x3ff29000, 0xc8253c82, 0x3f6c8253, 0x00000000, 0x3ff29000, 0xc02526e5, 0x3f4b92dd, + 0x00000000, 0x3ff28000, 0x11602511, 0x3f616025, 0x00000000, 0x3ff27000, 0x439c9adf, 0x3f6bf471, + 0x00000000, 0x3ff27000, 0x0939a85c, 0x3f4a85c4, 0x00000000, 0x3ff26000, 0xac024d16, 0x3f6166f9, + 0x00000000, 0x3ff25000, 0x0125e227, 0x3f6c44e1, 0x00000000, 0x3ff25000, 0x8bbd90e5, 0x3f4cebf4, + 0x00000000, 0x3ff24000, 0x92492492, 0x3f624924, 0x00000000, 0x3ff23000, 
0x2ec0b673, 0x3f6d6f2e, + 0x00000000, 0x3ff23000, 0x6af37c05, 0x3f5159e2, 0x00000000, 0x3ff22000, 0x40245402, 0x3f640245, + 0x00000000, 0x3ff21000, 0x43f6f024, 0x3f6f6f02, 0x00000000, 0x3ff21000, 0x21579805, 0x3f55e601, + 0x00000000, 0x3ff20000, 0xcf81b10f, 0x3f668e18, 0x00000000, 0x3ff20000, 0x12012012, 0x3f320120, + 0x00000000, 0x3ff1f000, 0x047dc11f, 0x3f5c11f7, 0x00000000, 0x3ff1e000, 0xff70985e, 0x3f69e878, + 0x00000000, 0x3ff1e000, 0xfdc3a219, 0x3f4779d9, 0x00000000, 0x3ff1d000, 0x5c957907, 0x3f61eace, + 0x00000000, 0x3ff1c000, 0x450239e1, 0x3f6e0d5b, 0x00000000, 0x3ff1c000, 0x73816367, 0x3f548bf0, + 0x00000000, 0x3ff1b000, 0x8dda5202, 0x3f669480, 0x00000000, 0x3ff1b000, 0x2bae2b21, 0x3f37c67f, + 0x00000000, 0x3ff1a000, 0x69ee5847, 0x3f5ee584, 0x00000000, 0x3ff19000, 0xc0233c02, 0x3f6c0233, + 0x00000000, 0x3ff19000, 0x328a7012, 0x3f514e02, 0x00000000, 0x3ff18000, 0x2057b573, 0x3f656107, + 0x00000000, 0x3ff18000, 0x11811812, 0x3f318118, 0x00000000, 0x3ff17000, 0x6f5a1060, 0x3f5e2864, + 0x00000000, 0x3ff16000, 0x84e6f1d7, 0x3f6c0d12, 0x00000000, 0x3ff16000, 0xf0c80459, 0x3f523543, + 0x00000000, 0x3ff15000, 0xea4e1a09, 0x3f663cbe, 0x00000000, 0x3ff15000, 0xdd5c8cb8, 0x3f3b9a3f, + 0x00000000, 0x3ff14000, 0x159a76d2, 0x3f60be1c, 0x00000000, 0x3ff13000, 0x688e4838, 0x3f6e1d1a, + 0x00000000, 0x3ff13000, 0xd72044d7, 0x3f572044, 0x00000000, 0x3ff12000, 0xdb81577b, 0x3f691713, + 0x00000000, 0x3ff12000, 0xe9819b50, 0x3f4ac73a, 0x00000000, 0x3ff11000, 0x4e904cf6, 0x3f646033, + 0x00000000, 0x3ff11000, 0x11111111, 0x3f311111, 0x00000000, 0x3ff10000, 0x0441fef0, 0x3f5feef8, + 0x00000000, 0x3ff0f000, 0xfde021fe, 0x3f6de021, 0x00000000, 0x3ff0f000, 0xcc9686a0, 0x3f57b7ea, + 0x00000000, 0x3ff0e000, 0xcd391fbc, 0x3f69ead7, 0x00000000, 0x3ff0e000, 0x09804390, 0x3f501956, + 0x00000000, 0x3ff0d000, 0x1e8d2b32, 0x3f664151, 0x00000000, 0x3ff0d000, 0xacf1ce96, 0x3f4222b1, + 0x00000000, 0x3ff0c000, 0x79b47582, 0x3f62e29f, 0x00000000, 0x3ff0c000, 0x682e11cd, 0x3f24f0d1, + 0x00000000, 
0x3ff0b000, 0x96771e4d, 0x3f5f9bb0, 0x00000000, 0x3ff0a000, 0x5dd96ae2, 0x3f6e5ee4, + 0x00000000, 0x3ff0a000, 0xa0429a04, 0x3f5a0429, 0x00000000, 0x3ff09000, 0x5f06c021, 0x3f6bb74d, + 0x00000000, 0x3ff09000, 0x04254fce, 0x3f54fce4, 0x00000000, 0x3ff08000, 0xeacbc402, 0x3f695766, + 0x00000000, 0x3ff08000, 0x08421084, 0x3f508421, 0x00000000, 0x3ff07000, 0x71d5c338, 0x3f673e53, + 0x00000000, 0x3ff07000, 0x3fbe3368, 0x3f493052, 0x00000000, 0x3ff06000, 0xf225f6c4, 0x3f656b38, + 0x00000000, 0x3ff06000, 0x8d4fdf3b, 0x3f426e97, 0x00000000, 0x3ff05000, 0xe4eb0cc6, 0x3f63dd40, + 0x00000000, 0x3ff05000, 0x73404146, 0x3f397f7d, 0x00000000, 0x3ff04000, 0x2cc98af1, 0x3f629398, + 0x00000000, 0x3ff04000, 0x10410410, 0x3f304104, 0x00000000, 0x3ff03000, 0x048ff7e4, 0x3f618d6f, + 0x00000000, 0x3ff03000, 0xebc349de, 0x3f2236a3, 0x00000000, 0x3ff02000, 0xee53d18c, 0x3f60c9f8, + 0x00000000, 0x3ff02000, 0x81020408, 0x3f102040, 0x00000000, 0x3ff01000, 0xa2f46ea6, 0x3f60486c, + 0x00000000, 0x3ff01000, 0x10101010, 0x3ef01010, 0x00000000, 0x3ff00000, 0x02010080, 0x3f600804, + 0x00000000, 0x3ff00000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xc0000000, 0x3ff2cd9f, 0x096a0092, 0x3e513ae6, + 0x60000000, 0x400d03cf, 0xfb79a640, 0x3e5db70c, 0xe0000000, 0x40240926, 0xb66dc067, 0x3e8c2526, + 0x00000000, 0x403b4a38, 0x8647f380, 0x3e8b81b1, 0x60000000, 0x40528d01, 0xd1e1eb08, 0x3ebbc1cd, + 0x28000000, 0x406936d2, 0x1534fb09, 0x3ecd9f20, 0x68000000, 0x40812287, 0x4a4e9954, 0x3edd1c06, + 0x50000000, 0x409749ea, 0x5d06ea74, 0x3ed4eca6, 0x70000000, 0x40afa715, 0xbcc0ecc5, 0x3f00c259, + 0xc8000000, 0x40c5829d, 0x47cf9016, 0x3f2b5a66, 0x88000000, 0x40dd3c44, 0xdefb0870, 0x3f09691a, + 0x50000000, 0x40f3de16, 0xc29cde38, 0x3f53410f, 0x90000000, 0x410b00b5, 0x50b6fb3c, 0x3f46a31a, + 0x48000000, 0x412259ac, 0x71805c40, 0x3f57defc, 0xa8000000, 0x4138f0cc, 0xd80e0bab, 0x3f9eb49f, + 0xd0000000, 0x4150f2eb, 0x7bcd5920, 0x3f84fffc, 
0x88000000, 0x41670934, 0xb6c63435, 0x3fc03a93, + 0x08000000, 0x417f4f22, 0xb255fd1c, 0x3fb1940b, 0xf8000000, 0x419546d8, 0x14260b50, 0x3fded26e, + 0x88000000, 0x41aceb08, 0x1fc9f2a2, 0x3ffb4740, 0xf8000000, 0x41c3a6e1, 0xf55634f1, 0x40267bb3, + 0xb8000000, 0x41dab5ad, 0xf8194ddc, 0x401c435f, 0x30000000, 0x41f226af, 0x052ba63a, 0x404d8fee, + 0xb0000000, 0x4208ab7f, 0xdccde3f6, 0x40651d7e, 0x90000000, 0x4220c3d3, 0x44557d1a, 0x40704b16, + 0x68000000, 0x4236c932, 0xca0a9dc4, 0x4076a6b5, 0xf0000000, 0x424ef822, 0x72249aba, 0x40afd9cc, + 0x30000000, 0x42650bba, 0x693edab5, 0x40ce58de, 0x40000000, 0x427c9aae, 0x58ac6363, 0x40d8c701, + 0x08000000, 0x42937047, 0x64f43e20, 0x40e76147, 0x58000000, 0x42aa6b76, 0xb36fc718, 0x4106337d, + 0xc8000000, 0x42c1f43f, 0xb1f611e2, 0x41212d98, 0x48000000, 0x42d866f3, 0x108b37cc, 0x412392bc, + 0x28000000, 0x42f0953e, 0xdc3473dc, 0x415ce87b, 0x20000000, 0x430689e2, 0xae99ad14, 0x414bc8d5, + 0xa0000000, 0x431ea215, 0x6744835c, 0x415d20d7, 0x00000000, 0x3ff00000, 0x00000000, 0x00000000, + 0x50000000, 0x3ff8b075, 0x04c2bd28, 0x3e3d9f55, 0x08000000, 0x400e18fa, 0xf0a4c9fd, 0x3e67cb66, + 0x90000000, 0x402422a4, 0x7928e588, 0x3e8f5861, 0x58000000, 0x403b4ee8, 0x00c38d48, 0x3e6bc7d0, + 0xc8000000, 0x40528d6f, 0x4e329998, 0x3eaf7f9d, 0x78000000, 0x406936e6, 0x64885269, 0x3ec6e6e4, + 0x48000000, 0x40812289, 0xb946c154, 0x3ecba3a8, 0xa8000000, 0x409749ea, 0x6110d5a4, 0x3ed3f4e7, + 0x80000000, 0x40afa715, 0x515a3e2b, 0x3f017622, 0xd0000000, 0x40c5829d, 0x528af3d0, 0x3ee4dc4b, + 0x88000000, 0x40dd3c44, 0x78615e10, 0x3f111562, 0x50000000, 0x40f3de16, 0x0ed821f5, 0x3f535ad5, + 0x90000000, 0x410b00b5, 0x55f2935c, 0x3f46b610, 0x48000000, 0x412259ac, 0x4a601240, 0x3f57e279, + 0xa8000000, 0x4138f0cc, 0x5f6aadd3, 0x3f9eb4b4, 0xd0000000, 0x4150f2eb, 0x967b3698, 0x3f85000b, + 0x88000000, 0x41670934, 0x0fadc092, 0x3fc03a94, 0x08000000, 0x417f4f22, 0xf3bf874c, 0x3fb1940b, + 0xf8000000, 0x419546d8, 0x1a2a2110, 0x3fded26e, 0x88000000, 0x41aceb08, 0x205796d6, 
0x3ffb4740, + 0xf8000000, 0x41c3a6e1, 0xf55cb85d, 0x40267bb3, 0xb8000000, 0x41dab5ad, 0xf81e18ac, 0x401c435f, + 0x30000000, 0x41f226af, 0x052bdea4, 0x404d8fee, 0xb0000000, 0x4208ab7f, 0xdccde926, 0x40651d7e, + 0x90000000, 0x4220c3d3, 0x44557e0e, 0x40704b16, 0x68000000, 0x4236c932, 0xca0a9e1c, 0x4076a6b5, + 0xf0000000, 0x424ef822, 0x72249abe, 0x40afd9cc, 0x30000000, 0x42650bba, 0x693edab5, 0x40ce58de, + 0x40000000, 0x427c9aae, 0x58ac6364, 0x40d8c701, 0x08000000, 0x42937047, 0x64f43e20, 0x40e76147, + 0x58000000, 0x42aa6b76, 0xb36fc718, 0x4106337d, 0xc8000000, 0x42c1f43f, 0xb1f611e2, 0x41212d98, + 0x48000000, 0x42d866f3, 0x108b37cc, 0x412392bc, 0x28000000, 0x42f0953e, 0xdc3473dc, 0x415ce87b, + 0x20000000, 0x430689e2, 0xae99ad14, 0x414bc8d5, 0xa0000000, 0x431ea215, 0x6744835c, 0x415d20d7, + 0x00000000, 0x40000000, 0xe01fe020, 0x3fffe01f, 0x01fc07f0, 0x3fffc07f, 0xaa01fa12, 0x3fffa11c, + 0x1f81f820, 0x3fff81f8, 0xaca0dbb5, 0x3fff6310, 0x9e4a4271, 0x3fff4465, 0x44230ab5, 0x3fff25f6, + 0xf07c1f08, 0x3fff07c1, 0xf8458e02, 0x3ffee9c7, 0xb301ecc0, 0x3ffecc07, 0x7aba01eb, 0x3ffeae80, + 0xabf0b767, 0x3ffe9131, 0xa59750e4, 0x3ffe741a, 0xc901e574, 0x3ffe573a, 0x79dc1a73, 0x3ffe3a91, + 0x1e1e1e1e, 0x3ffe1e1e, 0x1e01e01e, 0x3ffe01e0, 0xe3f8868a, 0x3ffde5d6, 0xdca01dca, 0x3ffdca01, + 0x76b981db, 0x3ffdae60, 0x231e7f8a, 0x3ffd92f2, 0x54b82c34, 0x3ffd77b6, 0x807572b2, 0x3ffd5cac, + 0x1d41d41d, 0x3ffd41d4, 0xa3fc5b1a, 0x3ffd272c, 0x8f6ec074, 0x3ffd0cb5, 0x5c44bfc6, 0x3ffcf26e, + 0x89039b0b, 0x3ffcd856, 0x9601cbe7, 0x3ffcbe6d, 0x055ee191, 0x3ffca4b3, 0x5afb8a42, 0x3ffc8b26, + 0x1c71c71c, 0x3ffc71c7, 0xd10d4986, 0x3ffc5894, 0x01c3f8f0, 0x3ffc3f8f, 0x392ea01c, 0x3ffc26b5, + 0x0381c0e0, 0x3ffc0e07, 0xee868d8b, 0x3ffbf583, 0x899406f7, 0x3ffbdd2b, 0x65883e7b, 0x3ffbc4fd, + 0x14c1bad0, 0x3ffbacf9, 0x2b18ff23, 0x3ffb951e, 0x3dda338b, 0x3ffb7d6c, 0xe3beee05, 0x3ffb65e2, + 0xb4e81b4f, 0x3ffb4e81, 0x4ad806ce, 0x3ffb3748, 0x406c80d9, 0x3ffb2036, 0x31d922a4, 0x3ffb094b, + 0xbca1af28, 0x3ffaf286, 
0x7f94905e, 0x3ffadbe8, 0x1ac5701b, 0x3ffac570, 0x2f87ebfd, 0x3ffaaf1d, + 0x606a63be, 0x3ffa98ef, 0x5130e159, 0x3ffa82e6, 0xa6d01a6d, 0x3ffa6d01, 0x07688a4a, 0x3ffa5741, + 0x1a41a41a, 0x3ffa41a4, 0x87c51ca0, 0x3ffa2c2a, 0xf97a4b02, 0x3ffa16d3, 0x1a01a01a, 0x3ffa01a0, + 0x951033d9, 0x3ff9ec8e, 0x176b682d, 0x3ff9d79f, 0x4ee4a102, 0x3ff9c2d1, 0xea5510da, 0x3ff9ae24, + 0x9999999a, 0x3ff99999, 0x0d8ec0ff, 0x3ff9852f, 0xf80cb872, 0x3ff970e4, 0x0be377ae, 0x3ff95cbb, + 0xfcd6e9e0, 0x3ff948b0, 0x7f9b2ce6, 0x3ff934c6, 0x49d0e229, 0x3ff920fb, 0x120190d5, 0x3ff90d4f, + 0x8f9c18fa, 0x3ff8f9c1, 0x7af1373f, 0x3ff8e652, 0x8d3018d3, 0x3ff8d301, 0x8062ff3a, 0x3ff8bfce, + 0x0f6bf3aa, 0x3ff8acb9, 0xf601899c, 0x3ff899c0, 0xf0abb04a, 0x3ff886e5, 0xbcc092b9, 0x3ff87427, + 0x18618618, 0x3ff86186, 0xc2780614, 0x3ff84f00, 0x7ab2bedd, 0x3ff83c97, 0x0182a4a0, 0x3ff82a4a, + 0x18181818, 0x3ff81818, 0x80601806, 0x3ff80601, 0xfd017f40, 0x3ff7f405, 0x515a4f1d, 0x3ff7e225, + 0x417d05f4, 0x3ff7d05f, 0x922e017c, 0x3ff7beb3, 0x08e0ecc3, 0x3ff7ad22, 0x6bb6398b, 0x3ff79baa, + 0x8178a4c8, 0x3ff78a4c, 0x119ac60d, 0x3ff77908, 0xe434a9b1, 0x3ff767dc, 0xc201756d, 0x3ff756ca, + 0x745d1746, 0x3ff745d1, 0xc541fe8d, 0x3ff734f0, 0x7f46debc, 0x3ff72428, 0x6d9c7c09, 0x3ff71378, + 0x5c0b8170, 0x3ff702e0, 0x16f26017, 0x3ff6f260, 0x6b4337c7, 0x3ff6e1f7, 0x2681c861, 0x3ff6d1a6, + 0x16c16c17, 0x3ff6c16c, 0x0aa31a3d, 0x3ff6b149, 0xd1537290, 0x3ff6a13c, 0x3a88d0c0, 0x3ff69147, + 0x16816817, 0x3ff68168, 0x3601671a, 0x3ff6719f, 0x6a5122f9, 0x3ff661ec, 0x853b4aa3, 0x3ff6524f, + 0x590b2164, 0x3ff642c8, 0xb88ac0de, 0x3ff63356, 0x77016240, 0x3ff623fa, 0x6831ae94, 0x3ff614b3, + 0x60581606, 0x3ff60581, 0x34292dfc, 0x3ff5f664, 0xb8d015e7, 0x3ff5e75b, 0xc3ece2a5, 0x3ff5d867, + 0x2b931057, 0x3ff5c988, 0xc647fa91, 0x3ff5babc, 0x6b015ac0, 0x3ff5ac05, 0xf123ccaa, 0x3ff59d61, + 0x308158ed, 0x3ff58ed2, 0x01580560, 0x3ff58056, 0x3c506b3a, 0x3ff571ed, 0xba7c52e2, 0x3ff56397, + 0x55555555, 0x3ff55555, 0xe6bb82fe, 0x3ff54725, 0x48f40feb, 
0x3ff53909, 0x56a8054b, 0x3ff52aff, + 0xeae2f815, 0x3ff51d07, 0xe111c4c5, 0x3ff50f22, 0x15015015, 0x3ff50150, 0x62dd4c9b, 0x3ff4f38f, + 0xa72f0539, 0x3ff4e5e0, 0xbedc2c4c, 0x3ff4d843, 0x8725af6e, 0x3ff4cab8, 0xdda68fe1, 0x3ff4bd3e, + 0xa052bf5b, 0x3ff4afd6, 0xad76014a, 0x3ff4a27f, 0xe3b2d067, 0x3ff49539, 0x22014880, 0x3ff48805, + 0x47ae147b, 0x3ff47ae1, 0x34596066, 0x3ff46dce, 0xc7f5cf9a, 0x3ff460cb, 0xe2c776ca, 0x3ff453d9, + 0x6562d9fb, 0x3ff446f8, 0x30abee4d, 0x3ff43a27, 0x25d51f87, 0x3ff42d66, 0x265e5951, 0x3ff420b5, + 0x14141414, 0x3ff41414, 0xd10e6566, 0x3ff40782, 0x3fb013fb, 0x3ff3fb01, 0x42a5af07, 0x3ff3ee8f, + 0xbce4a902, 0x3ff3e22c, 0x91aa75c6, 0x3ff3d5d9, 0xa47babe7, 0x3ff3c995, 0xd9232955, 0x3ff3bd60, + 0x13b13b14, 0x3ff3b13b, 0x387ac822, 0x3ff3a524, 0x2c187f63, 0x3ff3991c, 0xd366088e, 0x3ff38d22, + 0x13813814, 0x3ff38138, 0xd1c945ee, 0x3ff3755b, 0xf3de0748, 0x3ff3698d, 0x5f9f2af8, 0x3ff35dce, + 0xfb2b78c1, 0x3ff3521c, 0xace01346, 0x3ff34679, 0x5b57bcb2, 0x3ff33ae4, 0xed6a1dfa, 0x3ff32f5c, + 0x4a2b10bf, 0x3ff323e3, 0x58e9ebb6, 0x3ff31877, 0x0130d190, 0x3ff30d19, 0x2ac40260, 0x3ff301c8, + 0xbda12f68, 0x3ff2f684, 0xa1fed14b, 0x3ff2eb4e, 0xc04b8097, 0x3ff2e025, 0x012d50a0, 0x3ff2d50a, + 0x4d812ca0, 0x3ff2c9fb, 0x8e5a3711, 0x3ff2bef9, 0xad012b40, 0x3ff2b404, 0x92f3c105, 0x3ff2a91c, + 0x29e4129e, 0x3ff29e41, 0x5bb804a5, 0x3ff29372, 0x1288b013, 0x3ff288b0, 0x38a1ce4d, 0x3ff27dfa, + 0xb8812735, 0x3ff27350, 0x7cd60127, 0x3ff268b3, 0x708092f1, 0x3ff25e22, 0x7e9177b2, 0x3ff2539d, + 0x92492492, 0x3ff24924, 0x9717605b, 0x3ff23eb7, 0x789abcdf, 0x3ff23456, 0x22a0122a, 0x3ff22a01, + 0x8121fb78, 0x3ff21fb7, 0x804855e6, 0x3ff21579, 0x0c67c0d9, 0x3ff20b47, 0x12012012, 0x3ff20120, + 0x7dc11f70, 0x3ff1f704, 0x3c7fb84c, 0x3ff1ecf4, 0x3b3fb874, 0x3ff1e2ef, 0x672e4abd, 0x3ff1d8f5, + 0xada2811d, 0x3ff1cf06, 0xfc1ce059, 0x3ff1c522, 0x4046ed29, 0x3ff1bb4a, 0x67f2bae3, 0x3ff1b17c, + 0x611a7b96, 0x3ff1a7b9, 0x19e0119e, 0x3ff19e01, 0x808ca29c, 0x3ff19453, 0x83902bdb, 0x3ff18ab0, + 
0x11811812, 0x3ff18118, 0x191bd684, 0x3ff1778a, 0x89427379, 0x3ff16e06, 0x50fc3201, 0x3ff1648d, + 0x5f75270d, 0x3ff15b1e, 0xa3fdd5c9, 0x3ff151b9, 0x0e0acd3b, 0x3ff1485f, 0x8d344724, 0x3ff13f0e, + 0x1135c811, 0x3ff135c8, 0x89edc0ac, 0x3ff12c8b, 0xe75d3033, 0x3ff12358, 0x19a74826, 0x3ff11a30, + 0x11111111, 0x3ff11111, 0xbe011080, 0x3ff107fb, 0x10fef011, 0x3ff0fef0, 0xfab325a2, 0x3ff0f5ed, + 0x6be69c90, 0x3ff0ecf5, 0x55826011, 0x3ff0e406, 0xa88f4696, 0x3ff0db20, 0x56359e3a, 0x3ff0d244, + 0x4fbcda3b, 0x3ff0c971, 0x868b4171, 0x3ff0c0a7, 0xec259dc8, 0x3ff0b7e6, 0x722eecb5, 0x3ff0af2f, + 0x0a6810a7, 0x3ff0a681, 0xa6af8360, 0x3ff09ddb, 0x39010954, 0x3ff0953f, 0xb37565e2, 0x3ff08cab, + 0x08421084, 0x3ff08421, 0x29b8eae2, 0x3ff07b9f, 0x0a47f7c6, 0x3ff07326, 0x9c7912fb, 0x3ff06ab5, + 0xd2f1a9fc, 0x3ff0624d, 0xa0727586, 0x3ff059ee, 0xf7d73404, 0x3ff05197, 0xcc1664c5, 0x3ff04949, + 0x10410410, 0x3ff04104, 0xb78247fc, 0x3ff038c6, 0xb51f5e1a, 0x3ff03091, 0xfc7729e9, 0x3ff02864, + 0x81020408, 0x3ff02040, 0x36517a37, 0x3ff01824, 0x10101010, 0x3ff01010, 0x02010080, 0x3ff00804, + 0x00000000, 0x3ff00000, 0x87ec3637, 0x3dd06139, 0x8db86d61, 0xbe138a4b, 0xe5bcc98d, 0x3e4f7b72, + 0x634fb0e4, 0xbe85f1f0, 0x954a9137, 0x3ebb9df2, 0x3a67dc3c, 0xbeef4d1e, 0x1dd3898b, 0x3f1f9a32, + 0x3db06d60, 0xbf4c02db, 0xd0dbc1ad, 0x3f7565bc, 0x31284e9c, 0xbf9b82ce, 0x1a042b32, 0x3fbce2f2, + 0x6b0379e6, 0xbfd81274, 0x50429b6d, 0x3ff20dd7, 0x1438dcf6, 0x3dba1606, 0x016becfe, 0xbdff4342, + 0x75b4cde8, 0x3e395b1b, 0xf9c2a481, 0xbe71d468, 0xb7a0966a, 0x3ea6ae6b, 0xfda1afb2, 0xbeda0df5, + 0xd9475b18, 0x3f0ac250, 0xc8a35a21, 0xbf384227, 0xceb7b0e9, 0x3f631cd0, 0x054eefef, 0xbf89b3c1, + 0xbc5785f9, 0x3facf6d2, 0x451254ef, 0xbfcb5db0, 0xa741088b, 0x3feaf767, 0x6a2dd61d, 0x3da4e37d, + 0xe1edf74c, 0xbde92a56, 0x4377a2ac, 0x3e2491d9, 0x39f489a1, 0xbe5d38a6, 0x1dce54b4, 0x3e92d22f, + 0xc5671218, 0xbec5f6c1, 0x3a1bb9bd, 0x3ef70561, 0x46c610de, 0xbf25712a, 0xbf0e3574, 0x3f518a17, + 0xdb5a4b51, 0xbf78ea53, 0x50e230dd, 
0x3f9e962f, 0x85d30895, 0xbfc0b60e, 0x95f8c2b3, 0x3fe5990d, + 0x45479566, 0x3d90d511, 0x59127e17, 0xbdd4667c, 0x3e09cefe, 0x3e10d308, 0xa495656e, 0xbe482e4c, + 0xa05c8f4f, 0x3e7f9bcf, 0x66933f59, 0xbeb2ca0d, 0xf3cce792, 0x3ee42cd2, 0x99c503dd, 0xbf136902, + 0x532c5193, 0x3f409c2f, 0xdf44f35e, 0xbf6931da, 0x1aa68815, 0x3f911720, 0x1d221312, 0xbfb5e25e, + 0xd6c7e25c, 0x3fe235fd, 0xc24c08dd, 0x3d7b51ef, 0x4c2104f1, 0xbdc0ab73, 0x866c02a1, 0x3dfbc79c, + 0x72027ff7, 0xbe343c2f, 0x21752c63, 0x3e6ae853, 0x76229e21, 0xbea0593b, 0x32c6dda4, 0x3ed2106d, + 0xf8633468, 0xbf020fd5, 0xc5ffc392, 0x3f304e0c, 0x0a3b61e7, 0xbf5ab23d, 0x9b1a01ca, 0x3f844016, + 0x759a9c41, 0xbfae8712, 0x142795e3, 0x3fdfd9ae, 0xcee056b0, 0xbdf46eed, 0xbf31cd83, 0x3e272c71, + 0x3414494b, 0xbe3b858c, 0x150492a9, 0xbe51e878, 0x4f9d39bd, 0xbe8d551c, 0x081cc18c, 0x3ec65597, + 0x3a3f547c, 0x3ef0e91e, 0x5416a98f, 0xbf30c9b6, 0x4dfcdccd, 0x3f2428f4, 0xe4d81e00, 0x3f80b3bd, + 0x46178bfd, 0xbf8cb801, 0x675394d2, 0xbfc946a9, 0x791dc8fb, 0x3fe6e23d, 0xd42d57ef, 0xbdb573c8, + 0xed1a27a8, 0xbdf632bf, 0x6b1145de, 0x3e367087, 0xbdc2436d, 0x3e2d7559, 0xa688c6ab, 0xbe9801bb, + 0xf4ea88f0, 0x3e8ad7ec, 0xf8bce95a, 0x3efc2cfa, 0x2fa996ef, 0xbf1f0bdd, 0x22b44655, 0xbf4a8a14, + 0x66397f44, 0x3f7b13bf, 0xbd4c13b8, 0x3f82a08f, 0x7a57ce71, 0xbfc9ce0d, 0x517103fe, 0x3fe05fd2, + 0x0e7c1431, 0x3dc14df8, 0xad2781f0, 0xbe00a71c, 0x333bdac6, 0x3e05e0fa, 0x8ecd3732, 0x3e62222b, + 0x9d8f65fe, 0xbe84bcf8, 0xe8101ae5, 0xbec10eb4, 0xc2c15462, 0x3ef46afc, 0x0af95711, 0x3efded8b, + 0x8fc77467, 0xbf50a3f2, 0x2397107c, 0x3f663734, 0x2190992e, 0x3f97d6f6, 0x75be54c3, 0xbfc56874, + 0xacd147f5, 0x3fd4cb2b, 0x7100d5fc, 0x3db9806d, 0xc3b2ca73, 0xbdd1537c, 0x60cdaaa7, 0xbe26c0f4, + 0x7eb23d7f, 0x3e5604f1, 0x4c3adb34, 0x3e7e86ae, 0x9b1fe971, 0xbec1b7ba, 0xc100d516, 0x3eceefdc, + 0x7287d24a, 0x3f19c090, 0xf860db94, 0xbf4594c1, 0x906b63ac, 0xbf48ab6e, 0xabea8a44, 0x3f9a798f, + 0x314c81f1, 0xbfbdca7c, 0x462cb19e, 0x3fc761d7, 0xaa177cb2, 0xbd902cf0, 
0x207e29b4, 0x3decb11b, + 0x5fac5489, 0xbe209b21, 0xb5bdf6e3, 0xbe32b95b, 0xca4c97b7, 0x3e882b27, 0xc0a47266, 0xbeaa520c, + 0x67f1145d, 0xbedc5422, 0x3e7fc487, 0x3f161837, 0x7eb92091, 0xbf27130d, 0x8d1e6d3f, 0xbf63a37e, + 0x8a1743b5, 0x3f9522fd, 0x334d0c36, 0xbfb1ada1, 0xae493c1d, 0x3fb741a2, 0x3f800000, 0x00000000, + 0x3f804000, 0x3a28e585, 0x3f80a000, 0x399c910f, 0x3f80e000, 0x3a703484, 0x3f814000, 0x3a0eb4bc, + 0x3f81a000, 0x392750df, 0x3f81e000, 0x3a419dc7, 0x3f824000, 0x39ac3801, 0x3f828000, 0x3a675948, + 0x3f82e000, 0x39eabf9a, 0x3f834000, 0x356629d6, 0x3f838000, 0x3a07f04c, 0x3f83e000, 0x3848dac3, + 0x3f842000, 0x3a0e1b17, 0x3f848000, 0x384a1cc7, 0x3f84c000, 0x3a082ade, 0x3f852000, 0x363f31e5, + 0x3f856000, 0x39eccf0d, 0x3f85a000, 0x3a692c6f, 0x3f860000, 0x39b22cb1, 0x3f864000, 0x3a462d87, + 0x3f86a000, 0x3941e864, 0x3f86e000, 0x3a180409, 0x3f872000, 0x3a7cd32d, 0x3f878000, 0x39bdde6c, + 0x3f87c000, 0x3a3e5fb4, 0x3f882000, 0x38d960b3, 0x3f886000, 0x39eab752, 0x3f88a000, 0x3a4cf599, + 0x3f890000, 0x390803d1, 0x3f894000, 0x39e90955, 0x3f898000, 0x3a44878c, 0x3f89e000, 0x38908271, + 0x3f8a2000, 0x39ba4b0f, 0x3f8a6000, 0x3a25cdb3, 0x3f8aa000, 0x3a6c0f33, 0x3f8b0000, 0x393fc12e, + 0x3f8b4000, 0x39e2ee51, 0x3f8b8000, 0x3a30a9dd, 0x3f8bc000, 0x3a6d8e61, 0x3f8c2000, 0x3920aa58, + 0x3f8c6000, 0x39c1088a, 0x3f8ca000, 0x3a16a120, 0x3f8ce000, 0x3a4a86c1, 0x3f8d2000, 0x3a7c3aae, + 0x3f8d8000, 0x392f0952, 0x3f8dc000, 0x39b2461c, 0x3f8e0000, 0x3a04621f, 0x3f8e4000, 0x3a2d84b8, + 0x3f8e8000, 0x3a548ff4, 0x3f8ec000, 0x3a7988db, 0x3f8f2000, 0x38e3a30c, 0x3f8f6000, 0x39755daa, + 0x3f8fa000, 0x39b86d8a, 0x3f8fe000, 0x39f22e5e, 0x3f902000, 0x3a13fd53, 0x3f906000, 0x3a2cedcc, + 0x3f90a000, 0x3a43ed23, 0x3f90e000, 0x3a58ffd0, 0x3f912000, 0x3a6c2a3c, 0x3f916000, 0x3a7d70bf, + 0x3f91c000, 0x384d7a06, 0x3f920000, 0x38d318cf, 0x3f924000, 0x39185d53, 0x3f928000, 0x393fe1b1, + 0x3f92c000, 0x396029b1, 0x3f930000, 0x3979454c, 0x3f934000, 0x3985a221, 0x3f938000, 0x398b1b0d, + 0x3f93c000, 
0x398d1515, 0x3f940000, 0x398b97c7, 0x3f944000, 0x3986aa98, 0x3f948000, 0x397ca9c7, + 0x3f94c000, 0x39653bd8, 0x3f950000, 0x394719b5, 0x3f954000, 0x39225182, 0x3f958000, 0x38ede264, + 0x3f95c000, 0x388a0d15, 0x3f960000, 0x3749f226, 0x3f962000, 0x3a737219, 0x3f966000, 0x3a6223e3, + 0x3f96a000, 0x3a4f406c, 0x3f96e000, 0x3a3acaee, 0x3f972000, 0x3a24c698, 0x3f976000, 0x3a0d368f, + 0x3f97a000, 0x39e83bdd, 0x3f97e000, 0x39b2ff8f, 0x3f982000, 0x39757c89, 0x3f986000, 0x38fdf7dc, + 0x3f98a000, 0x3622482d, 0x3f98c000, 0x3a600bf3, 0x3f990000, 0x3a3dfedf, 0x3f994000, 0x3a1a7de3, + 0x3f998000, 0x39eb17a4, 0x3f99c000, 0x399e56e3, 0x3f9a0000, 0x391d7e03, 0x3f9a2000, 0x3a7e2ab7, + 0x3f9a6000, 0x3a538fc2, 0x3f9aa000, 0x3a279148, 0x3f9ae000, 0x39f463ce, 0x3f9b2000, 0x3996e86c, + 0x3f9b6000, 0x38dad617, 0x3f9b8000, 0x3a69e815, 0x3f9bc000, 0x3a371eac, 0x3f9c0000, 0x3a030100, + 0x3f9c4000, 0x399b2304, 0x3f9c8000, 0x38b694db, 0x3f9ca000, 0x3a5ec6af, 0x3f9ce000, 0x3a257018, + 0x3f9d2000, 0x39d5a259, 0x3f9d6000, 0x393bb0e7, 0x3f9d8000, 0x3a71c388, 0x3f9dc000, 0x3a335958, + 0x3f9e0000, 0x39e75fcb, 0x3f9e4000, 0x394b2590, 0x3f9e6000, 0x3a70a802, 0x3f9ea000, 0x3a2d4de7, + 0x3f9ee000, 0x39d17a6c, 0x3f9f2000, 0x390be02b, 0x3f9f4000, 0x3a5c007c, 0x3f9f8000, 0x3a13d899, + 0x3f9fc000, 0x399504dc, 0x3fa00000, 0x00000000, 0x3fa02000, 0x3a34534e, 0x3fa06000, 0x39cefca8, + 0x3fa0a000, 0x38cc1828, 0x3fa0c000, 0x3a4a6352, 0x3fa10000, 0x39f4424a, 0x3fa14000, 0x3922f98d, + 0x3f214000, 0x38a2f98d, 0x3f4b2000, 0x397f529f, 0x3f800000, 0x00000000, 0x3fa14000, 0x3922f98d, + 0x3fcb2000, 0x39ff529f, 0x3f800000, 0x00000000, 0x3f816000, 0x391a3e77, 0x3f82c000, 0x39d8698a, + 0x3f842000, 0x3a51461d, 0x3f85a000, 0x39ac367c, 0x3f870000, 0x3a7b0cb4, 0x3f888000, 0x3a407404, + 0x3f8a0000, 0x3a26abaa, 0x3f8b8000, 0x3a2e0f1f, 0x3f8d0000, 0x3a56fadb, 0x3f8ea000, 0x39073168, + 0x3f902000, 0x3a0ee218, 0x3f91c000, 0x38f4dcea, 0x3f934000, 0x3a515978, 0x3f94e000, 0x3a277d47, + 0x3f968000, 0x3a2169b9, 0x3f982000, 0x3a3f828c, 
0x3f99e000, 0x370b2641, 0x3f9b8000, 0x39d39b9d, + 0x3f9d2000, 0x3a76cd39, 0x3f9ee000, 0x3a299304, 0x3fa0a000, 0x3a02887d, 0x3fa26000, 0x3a021818, + 0x3fa42000, 0x3a28ad70, 0x3fa5e000, 0x3a76b54d, 0x3fa7c000, 0x39d93b4e, 0x3fa9a000, 0x382d5a75, + 0x3fab6000, 0x3a51cdad, 0x3fad4000, 0x3a41f752, 0x3faf2000, 0x3a5bc56b, 0x3fb12000, 0x38fd6074, + 0x3fb30000, 0x3a0e2095, 0x3fb50000, 0x391e667f, 0x3fb6e000, 0x3a6c8f19, 0x3fb8e000, 0x3a5d7a3b, + 0x3fbae000, 0x3a7ad590, 0x3fbd0000, 0x398a39f5, 0x3fbf0000, 0x3a3ccdb3, 0x3fc12000, 0x39c4cca6, + 0x3fc34000, 0x39599b44, 0x3fc56000, 0x3965422a, 0x3fc78000, 0x39d74c8a, 0x3fc9a000, 0x3a4dec33, + 0x3fcbe000, 0x39c14fef, 0x3fce2000, 0x391182a3, 0x3fd06000, 0x38ccf6bb, 0x3fd2a000, 0x3981d91f, + 0x3fd4e000, 0x3a1ad55e, 0x3fd74000, 0x391f995a, 0x3fd98000, 0x3a68ae13, 0x3fdbe000, 0x3a5dbcbe, + 0x3fde6000, 0x37f4825e, 0x3fe0c000, 0x39cdeec2, 0x3fe32000, 0x3a7c4b95, 0x3fe5a000, 0x3a48373b, + 0x3fe82000, 0x3a4b5281, 0x3feac000, 0x37c6e7dd, 0x3fed4000, 0x39f301ed, 0x3fefe000, 0x3917337b, + 0x3ff28000, 0x383b9e2c, 0x3ff52000, 0x392fa2a4, 0x3ff7c000, 0x3a06fb98, 0x3ffa8000, 0x38ecb6dc, + 0x3ffd2000, 0x3a706067, 0x40000000, 0x00000000, 0x00000000, 0x00000000, 0x3c37c000, 0x374a16dd, + 0x3cb70000, 0x37f2d0b8, 0x3d08c000, 0x381a3aa2, 0x3d35c000, 0x37b4dd63, 0x3d624000, 0x383f5721, + 0x3d874000, 0x384e27e8, 0x3d9d4000, 0x380bf749, 0x3db30000, 0x387dbeb2, 0x3dc8c000, 0x37216e46, + 0x3dde4000, 0x3684815b, 0x3df38000, 0x383b045f, 0x3e044000, 0x390b119b, 0x3e0ec000, 0x391a32ea, + 0x3e194000, 0x38ba789e, 0x3e238000, 0x39553f30, 0x3e2e0000, 0x3651cfde, 0x3e380000, 0x39685a9d, + 0x3e424000, 0x39057a05, 0x3e4c4000, 0x395ba0ef, 0x3e564000, 0x396bc5b6, 0x3e604000, 0x3936d9bb, + 0x3e6a4000, 0x38772619, 0x3e740000, 0x39017ce9, 0x3e7dc000, 0x3902d720, 0x3e83c000, 0x38856dd8, + 0x3e888000, 0x3941f6b4, 0x3e8d4000, 0x3980b652, 0x3e920000, 0x3980f561, 0x3e96c000, 0x39443f13, + 0x3e9b8000, 0x38926752, 0x3ea00000, 0x39c8c763, 0x3ea4c000, 0x391e12f3, 0x3ea94000, 
0x39b7bf89, + 0x3eae0000, 0x36d1cfde, 0x3eb28000, 0x38c7f233, 0x3eb70000, 0x39087367, 0x3ebb8000, 0x38e95d3f, + 0x3ec00000, 0x38256316, 0x3ec44000, 0x39d38e5c, 0x3ec8c000, 0x396ea247, 0x3ecd4000, 0x350e4788, + 0x3ed18000, 0x395d829f, 0x3ed5c000, 0x39c30f2f, 0x3eda0000, 0x39fd7ee7, 0x3ede8000, 0x3872e9e7, + 0x3ee2c000, 0x3897d694, 0x3ee70000, 0x3824923a, 0x3eeb0000, 0x39ea7c06, 0x3eef4000, 0x39a7fa88, + 0x3ef38000, 0x391aa879, 0x3ef78000, 0x39dace65, 0x3efbc000, 0x39215a32, 0x3effc000, 0x39af3350, + 0x3f01c000, 0x3a7b5172, 0x3f040000, 0x389cf27f, 0x3f060000, 0x3902806b, 0x3f080000, 0x3909d8a9, + 0x3f0a0000, 0x38c9faa1, 0x3f0c0000, 0x37a33dca, 0x3f0dc000, 0x3a6623d2, 0x3f0fc000, 0x3a3c7a61, + 0x3f11c000, 0x3a083a84, 0x3f13c000, 0x39930161, 0x3f15c000, 0x35d1cfde, 0x3f178000, 0x3a2d0ebd, + 0x3f198000, 0x399f1aad, 0x3f1b4000, 0x3a67ff6d, 0x3f1d4000, 0x39ecfea8, 0x3f1f0000, 0x3a7b26f3, + 0x3f210000, 0x39ec1fa6, 0x3f22c000, 0x3a675314, 0x3f24c000, 0x399e12f3, 0x3f268000, 0x3a2d4b66, + 0x3f288000, 0x370c3845, 0x3f2a4000, 0x399ba329, 0x3f2c0000, 0x3a1044d3, 0x3f2dc000, 0x3a49a196, + 0x3f2f8000, 0x3a79fe83, 0x3f318000, 0x3905c7aa, 0x3f334000, 0x39802391, 0x3f350000, 0x39abe796, + 0x3f36c000, 0x39c65a9d, 0x3f388000, 0x39cfa6c5, 0x3f3a4000, 0x39c7f593, 0x3f3c0000, 0x39af6ff7, + 0x3f3dc000, 0x39863e4d, 0x3f3f8000, 0x391910c1, 0x3f414000, 0x369d5be7, 0x3f42c000, 0x3a541616, + 0x3f448000, 0x3a1ee960, 0x3f464000, 0x39c38ed2, 0x3f480000, 0x38e61600, 0x3f498000, 0x3a4fedb4, + 0x3f4b4000, 0x39f6b4ab, 0x3f4d0000, 0x38f8d3b0, 0x3f4e8000, 0x3a3b3faa, 0x3f504000, 0x399fb693, + 0x3f51c000, 0x3a5cfe71, 0x3f538000, 0x39c5740b, 0x3f550000, 0x3a611eb0, 0x3f56c000, 0x39b079c4, + 0x3f584000, 0x3a4824d7, 0x3f5a0000, 0x39439a54, 0x3f5b8000, 0x3a1291ea, 0x3f5d0000, 0x3a6d3673, + 0x3f5ec000, 0x3981c731, 0x3f604000, 0x3a0da88f, 0x3f61c000, 0x3a53945c, 0x3f638000, 0x3895ae91, + 0x3f650000, 0x3996372a, 0x3f668000, 0x39f9a832, 0x3f680000, 0x3a27eda4, 0x3f698000, 0x3a4c764f, + 0x3f6b0000, 0x3a6a7c06, 
0x3f6cc000, 0x370321eb, 0x3f6e4000, 0x3899ab3f, 0x3f6fc000, 0x38f02086, + 0x3f714000, 0x390a1707, 0x3f72c000, 0x39031e44, 0x3f744000, 0x38c6b362, 0x3f75c000, 0x382bf195, + 0x3f770000, 0x3a768e36, 0x3f788000, 0x3a5c503b, 0x3f7a0000, 0x3a3c1179, 0x3f7b8000, 0x3a15de1d, + 0x3f7d0000, 0x39d3845d, 0x3f7e8000, 0x395f263f, 0x3f800000, 0x00000000, 0x00000000, 0x00000000, + 0x3b5d4000, 0x367a8e44, 0x3bdc8000, 0x368ed49f, 0x3c24c000, 0x36c21451, 0x3c5ac000, 0x375211d6, + 0x3c884000, 0x3720ea11, 0x3ca2c000, 0x37e9eb59, 0x3cbd4000, 0x37b87be7, 0x3cd78000, 0x37bf2560, + 0x3cf1c000, 0x33d597a0, 0x3d05c000, 0x37806a05, 0x3d128000, 0x3820581f, 0x3d1f4000, 0x38223334, + 0x3d2c0000, 0x378e3bac, 0x3d388000, 0x3810684f, 0x3d450000, 0x37feb7ae, 0x3d518000, 0x36a9d609, + 0x3d5dc000, 0x37a68163, 0x3d6a0000, 0x376a8b27, 0x3d760000, 0x384c8fd6, 0x3d810000, 0x3885183e, + 0x3d870000, 0x3874a760, 0x3d8d0000, 0x380d1154, 0x3d92c000, 0x38ea42bd, 0x3d98c000, 0x384c1571, + 0x3d9e8000, 0x38ba66b8, 0x3da44000, 0x38e7da3b, 0x3daa0000, 0x38eee632, 0x3dafc000, 0x38d00911, + 0x3db58000, 0x388bbede, 0x3dbb4000, 0x378a0512, 0x3dc0c000, 0x3894c7a0, 0x3dc64000, 0x38e30710, + 0x3dcc0000, 0x36db2829, 0x3dd18000, 0x3729d609, 0x3dd6c000, 0x38fa0e82, 0x3ddc4000, 0x38bc9a75, + 0x3de1c000, 0x383a9297, 0x3de70000, 0x38dc83c8, 0x3dec8000, 0x37eac335, 0x3df1c000, 0x38706ac3, + 0x3df70000, 0x389574c2, 0x3dfc4000, 0x3892d068, 0x3e00c000, 0x38615032, 0x3e034000, 0x3917acf4, + 0x3e05c000, 0x3967a126, 0x3e088000, 0x38217840, 0x3e0b0000, 0x38b420ab, 0x3e0d8000, 0x38f9c7b2, + 0x3e100000, 0x391103bd, 0x3e128000, 0x39169a6b, 0x3e150000, 0x390dd194, 0x3e178000, 0x38eda471, + 0x3e1a0000, 0x38a38950, 0x3e1c8000, 0x37f6844a, 0x3e1ec000, 0x395e1cdb, 0x3e214000, 0x390fcffc, + 0x3e23c000, 0x38503e9d, 0x3e260000, 0x394b00fd, 0x3e288000, 0x38a9910a, 0x3e2ac000, 0x39518a31, + 0x3e2d4000, 0x3882d2c2, 0x3e2f8000, 0x392488e4, 0x3e31c000, 0x397b0aff, 0x3e344000, 0x388a22d8, + 0x3e368000, 0x3902bd5e, 0x3e38c000, 0x39342f85, 0x3e3b0000, 
0x39598811, 0x3e3d4000, 0x3972e6b1, + 0x3e3fc000, 0x34d53654, 0x3e420000, 0x360ca25e, 0x3e440000, 0x39785cc0, 0x3e464000, 0x39630710, + 0x3e488000, 0x39424ed7, 0x3e4ac000, 0x39165101, 0x3e4d0000, 0x38be5421, 0x3e4f4000, 0x37e7b0c0, + 0x3e514000, 0x394fd0c3, 0x3e538000, 0x38efaaaa, 0x3e55c000, 0x37a8f566, 0x3e57c000, 0x3927c744, + 0x3e5a0000, 0x383fa4d5, 0x3e5c0000, 0x392d9e39, 0x3e5e4000, 0x3803feae, 0x3e604000, 0x390a268c, + 0x3e624000, 0x39692b80, 0x3e648000, 0x38789b4f, 0x3e668000, 0x3909307d, 0x3e688000, 0x394a601c, + 0x3e6ac000, 0x35e67edc, 0x3e6cc000, 0x383e386d, 0x3e6ec000, 0x38a7743d, 0x3e70c000, 0x38dccec3, + 0x3e72c000, 0x38ff57e0, 0x3e74c000, 0x39079d8b, 0x3e76c000, 0x390651a6, 0x3e78c000, 0x38f7bad9, + 0x3e7ac000, 0x38d0ab82, 0x3e7cc000, 0x38979e7d, 0x3e7ec000, 0x381978ee, 0x3e804000, 0x397816c8, + 0x3e814000, 0x39410cb2, 0x3e824000, 0x39015384, 0x3e834000, 0x3863fa28, 0x3e840000, 0x39f41065, + 0x3e850000, 0x39c7668a, 0x3e860000, 0x39968afa, 0x3e870000, 0x39430db9, 0x3e880000, 0x38a18cf3, + 0x3e88c000, 0x39eb2907, 0x3e89c000, 0x39a9e10c, 0x3e8ac000, 0x39492800, 0x3e8bc000, 0x385a53d1, + 0x3e8c8000, 0x39ce0cf7, 0x3e8d8000, 0x3979c7b2, 0x3e8e8000, 0x389f5d99, 0x3e8f4000, 0x39ceefcb, + 0x3e904000, 0x39646a39, 0x3e914000, 0x380d7a9b, 0x3e920000, 0x39ad6650, 0x3e930000, 0x390ac3b8, + 0x3e93c000, 0x39d9a9a8, 0x3e94c000, 0x39548a99, 0x3e958000, 0x39f73c4b, 0x3e968000, 0x3980960e, + 0x3e978000, 0x374b3d5a, 0x3e984000, 0x39888f1e, 0x3e994000, 0x37679a07, 0x3e9a0000, 0x39826a13, + 0x00000000, 0x00000000, 0x3bff0000, 0x3429ac41, 0x3c7e0000, 0x35a8b0fc, 0x3cbdc000, 0x368d83ea, + 0x3cfc1000, 0x361b0e78, 0x3d1cf000, 0x3687b9fe, 0x3d3ba000, 0x3631ec65, 0x3d5a1000, 0x36dd7119, + 0x3d785000, 0x35c30045, 0x3d8b2000, 0x379b7751, 0x3d9a0000, 0x37ebcb0d, 0x3da8d000, 0x37839f83, + 0x3db78000, 0x37528ae5, 0x3dc61000, 0x37a2eb18, 0x3dd49000, 0x36da7495, 0x3de2f000, 0x36a91eb7, + 0x3df13000, 0x3783b715, 0x3dff6000, 0x371131db, 0x3e06b000, 0x383f3e68, 0x3e0db000, 0x38156a97, + 
0x3e14a000, 0x38297c0f, 0x3e1b8000, 0x387e100f, 0x3e226000, 0x3815b665, 0x3e293000, 0x37e5e3a1, + 0x3e2ff000, 0x38183853, 0x3e36b000, 0x35fe719d, 0x3e3d5000, 0x38448108, 0x3e43f000, 0x38503290, + 0x3e4a9000, 0x373539e8, 0x3e511000, 0x385e0ff1, 0x3e579000, 0x3864a740, 0x3e5e1000, 0x3786742d, + 0x3e647000, 0x387be3cd, 0x3e6ae000, 0x3685ad3e, 0x3e713000, 0x3803b715, 0x3e778000, 0x37adcbdc, + 0x3e7dc000, 0x380c36af, 0x3e820000, 0x371652d3, 0x3e851000, 0x38927139, 0x3e882000, 0x38c5fcd7, + 0x3e8b3000, 0x38ae55d5, 0x3e8e4000, 0x3818c169, 0x3e914000, 0x38a0fde7, 0x3e944000, 0x38ad09ef, + 0x3e974000, 0x3862bae1, 0x3e9a3000, 0x38eecd4c, 0x3e9d3000, 0x3798aad2, 0x3ea02000, 0x37421a1a, + 0x3ea30000, 0x38c5e10e, 0x3ea5f000, 0x37bf2aee, 0x3ea8d000, 0x382d872d, 0x3eabb000, 0x37ee2e8a, + 0x3eae8000, 0x38dedfac, 0x3eb16000, 0x3802f2b9, 0x3eb43000, 0x38481e9b, 0x3eb70000, 0x380eaa2b, + 0x3eb9c000, 0x38ebfb5d, 0x3ebc9000, 0x38255fdd, 0x3ebf5000, 0x38783b82, 0x3ec21000, 0x3851da1e, + 0x3ec4d000, 0x374e1b05, 0x3ec78000, 0x388f439b, 0x3eca3000, 0x38ca0e10, 0x3ecce000, 0x38cac08b, + 0x3ecf9000, 0x3891f65f, 0x3ed24000, 0x378121cb, 0x3ed4e000, 0x386c9a9a, 0x3ed78000, 0x38949923, + 0x3eda2000, 0x38777bcc, 0x3edcc000, 0x37b12d26, 0x3edf5000, 0x38a6ced3, 0x3ee1e000, 0x38ebd3e6, + 0x3ee47000, 0x38fbe3cd, 0x3ee70000, 0x38d785c2, 0x3ee99000, 0x387e7e00, 0x3eec1000, 0x38f392c5, + 0x3eeea000, 0x37d40983, 0x3ef12000, 0x38081a7c, 0x3ef3a000, 0x3784c3ad, 0x3ef61000, 0x38cce923, + 0x3ef89000, 0x380f5faf, 0x3efb0000, 0x3891fd38, 0x3efd7000, 0x38ac47bc, 0x3effe000, 0x3897042b, + 0x3f012000, 0x392952d2, 0x3f025000, 0x396fced4, 0x3f039000, 0x37f97073, 0x3f04c000, 0x385e9eae, + 0x3f05f000, 0x3865c84a, 0x3f072000, 0x38130ba3, 0x3f084000, 0x3979cf16, 0x3f097000, 0x3938cac9, + 0x3f0aa000, 0x38c3d2f4, 0x3f0bc000, 0x39755dec, 0x3f0cf000, 0x38e6b467, 0x3f0e1000, 0x395c0fb8, + 0x3f0f4000, 0x383ebce0, 0x3f106000, 0x38dcd192, 0x3f118000, 0x39186bdf, 0x3f12a000, 0x392de74c, + 0x3f13c000, 0x392f0944, 0x3f14e000, 
0x391bff61, 0x3f160000, 0x38e9ed44, 0x3f172000, 0x38686dc8, + 0x3f183000, 0x396b99a7, 0x3f195000, 0x39099c89, 0x3f1a7000, 0x37a27673, 0x3f1b8000, 0x390bdaa3, + 0x3f1c9000, 0x397069ab, 0x3f1db000, 0x388449ff, 0x3f1ec000, 0x39013538, 0x3f1fd000, 0x392dc268, + 0x3f20e000, 0x3947f423, 0x3f21f000, 0x394ff17c, 0x3f230000, 0x3945e10e, 0x3f241000, 0x3929e8f5, + 0x3f252000, 0x38f85db0, 0x3f263000, 0x38735f99, 0x3f273000, 0x396c08db, 0x3f284000, 0x3909e600, + 0x3f295000, 0x37b4996f, 0x3f2a5000, 0x391233cc, 0x3f2b5000, 0x397cead9, 0x3f2c6000, 0x38adb5cd, + 0x3f2d6000, 0x3920261a, 0x3f2e6000, 0x3958ee36, 0x3f2f7000, 0x35aa4905, 0x3f307000, 0x37cbd11e, + 0x3f317000, 0x3805fdf4, 0x40000000, 0x00000000, 0x3ffe0000, 0x38fe03f8, 0x3ffc0000, 0x39fc0fc1, + 0x3ffa0000, 0x3a8cb3c9, 0x3ff80000, 0x3af83e10, 0x3ff60000, 0x3b407b30, 0x3ff40000, 0x3b898d60, + 0x3ff20000, 0x3bb9d648, 0x3ff00000, 0x3bf0f0f1, 0x3fef0000, 0x3abadc7f, 0x3fed0000, 0x3b66076c, + 0x3feb0000, 0x3bbdb2a6, 0x3fea0000, 0x39ea0ea1, 0x3fe80000, 0x3b4b58f7, 0x3fe60000, 0x3bc2b448, + 0x3fe50000, 0x3a9660ac, 0x3fe30000, 0x3b8e38e4, 0x3fe10000, 0x3bfc780e, 0x3fe00000, 0x3b607038, + 0x3fde0000, 0x3be95c4d, 0x3fdd0000, 0x3b4f914c, 0x3fdb0000, 0x3beb61ef, 0x3fda0000, 0x3b681b4f, + 0x3fd90000, 0x385901b2, 0x3fd70000, 0x3b9435e5, 0x3fd60000, 0x3aae0359, 0x3fd40000, 0x3bc77b03, + 0x3fd30000, 0x3b501a6d, 0x3fd20000, 0x39d20d21, 0x3fd00000, 0x3bb69fcc, 0x3fcf0000, 0x3b48e951, + 0x3fce0000, 0x3a3453b9, 0x3fcc0000, 0x3bcccccd, 0x3fcb0000, 0x3b8727c0, 0x3fca0000, 0x3b0b0fcd, + 0x3fc90000, 0x397b49d1, 0x3fc70000, 0x3bce0c7d, 0x3fc60000, 0x3b980c6a, 0x3fc50000, 0x3b4b90f7, + 0x3fc40000, 0x3adcbe15, 0x3fc30000, 0x39c30c31, 0x3fc10000, 0x3be4bbd6, 0x3fc00000, 0x3bc0c0c1, + 0x3fbf0000, 0x3ba02fe8, 0x3fbe0000, 0x3b82fa0c, 0x3fbd0000, 0x3b52208e, 0x3fbc0000, 0x3b24c818, + 0x3fbb0000, 0x3afb9c87, 0x3fba0000, 0x3aba2e8c, 0x3fb90000, 0x3a850fe9, 0x3fb80000, 0x3a381703, + 0x3fb70000, 0x39fbb5a2, 0x3fb60000, 0x39b60b61, 0x3fb50000, 0x399e68aa, 
0x3fb40000, 0x39b40b41, + 0x3fb30000, 0x39f63529, 0x3fb20000, 0x3a321643, 0x3fb10000, 0x3a7e9dc0, 0x3fb00000, 0x3ab02c0b, + 0x3faf0000, 0x3aeb771a, 0x3fae0000, 0x3b1882b9, 0x3fad0000, 0x3b4056b0, 0x3fac0000, 0x3b6d2308, + 0x3fab0000, 0x3b8f69e3, 0x3faa0000, 0x3baaaaab, 0x3fa90000, 0x3bc84a48, 0x3fa80000, 0x3be83f57, + 0x3fa80000, 0x39a80a81, 0x3fa70000, 0x3abc14e6, 0x3fa60000, 0x3b2b8872, 0x3fa50000, 0x3b7d6a05, + 0x3fa40000, 0x3ba9cf1e, 0x3fa30000, 0x3bd70a3d, 0x3fa30000, 0x394bc7f6, 0x3fa20000, 0x3adf0cac, + 0x3fa10000, 0x3b56625d, 0x3fa00000, 0x3ba0a0a1, 0x3f9f0000, 0x3bd809fe, 0x3f9f0000, 0x3a0b2f39, + 0x3f9e0000, 0x3b195a48, 0x3f9d0000, 0x3b89d89e, 0x3f9c0000, 0x3bc8e161, 0x3f9c0000, 0x399c09c1, + 0x3f9b0000, 0x3b18df3e, 0x3f9a0000, 0x3b90e7d9, 0x3f990000, 0x3bd722db, 0x3f990000, 0x3a78d28b, + 0x3f980000, 0x3b519013, 0x3f970000, 0x3bb425ed, 0x3f970000, 0x3817012e, 0x3f960000, 0x3b1fb4d8, + 0x3f950000, 0x3ba02568, 0x3f940000, 0x3bf2094f, 0x3f940000, 0x3b0b0129, 0x3f930000, 0x3b9a85c4, + 0x3f920000, 0x3bf11384, 0x3f920000, 0x3b124925, 0x3f910000, 0x3ba2b3c5, 0x3f900000, 0x3bfdbc09, + 0x3f900000, 0x3b3470c6, 0x3f8f0000, 0x3bb823ee, 0x3f8f0000, 0x3a3bced0, 0x3f8e0000, 0x3b706ada, + 0x3f8d0000, 0x3bda5202, 0x3f8d0000, 0x3af72c23, 0x3f8c0000, 0x3ba29c04, 0x3f8c0000, 0x398c08c1, + 0x3f8b0000, 0x3b606894, 0x3f8a0000, 0x3bd8f2fc, 0x3f8a0000, 0x3b05f0e1, 0x3f890000, 0x3bae408a, + 0x3f890000, 0x3a5639d7, 0x3f880000, 0x3b888889, 0x3f870000, 0x3bf78088, 0x3f870000, 0x3b4f56be, + 0x3f860000, 0x3bd90544, 0x3f860000, 0x3b1714fc, 0x3f850000, 0x3bbf3761, 0x3f850000, 0x3ad0214d, + 0x3f840000, 0x3ba9f9c8, 0x3f840000, 0x3a842108, 0x3f830000, 0x3b993052, 0x3f830000, 0x3a1374bc, + 0x3f820000, 0x3b8cbfbf, 0x3f820000, 0x39820821, 0x3f810000, 0x3b848da9, 0x3f810000, 0x38810204, + 0x3f800000, 0x3b808081, 0x3f800000, 0x00000000, 0x00000000, 0x3f800000, 0x3f966cfe, 0x3fc583ab, + 0x40681e7b, 0x4070c7d0, 0x41204937, 0x41211525, 0x41da51c0, 0x41da7743, 0x4294680b, 0x42946b7e, + 0x4349b691, 
0x4349b734, 0x4409143b, 0x4409144a, 0x44ba4f53, 0x44ba4f55, 0x457d38ac, 0x457d38ac, + 0x462c14ee, 0x462c14ef, 0x46e9e224, 0x46e9e224, 0x479ef0b3, 0x479ef0b3, 0x485805ad, 0x485805ad, + 0x4912cd62, 0x4912cd62, 0x49c78665, 0x49c78665, 0x4a87975f, 0x4a87975f, 0x4b3849a4, 0x4b3849a4, + 0x4bfa7910, 0x4bfa7910, 0x4caa36c8, 0x4caa36c8, 0x4d675844, 0x4d675844, 0x4e1d3710, 0x4e1d3710, + 0x4ed5ad6e, 0x4ed5ad6e, 0x4f91357a, 0x4f91357a, 0x50455bfe, 0x50455bfe, 0x51061e9d, 0x51061e9d, + 0x51b64993, 0x51b64993, 0x5277c118, 0x5277c118, 0x53285dd2, 0x53285dd2, 0x53e4d572, 0x53e4d572, + 0x549b8238, 0x549b8238, 0x55535bb3, 0x55535bb3, 0x560fa1fe, 0x560fa1fe, 0x56c3379a, 0x56c3379a, + 0x5784a9f1, 0x5784a9f1, 0x58344f11, 0x58344f11, 0x58f510ad, 0x58f510ad, 0x3d7faade, 0x3d87ccf5, + 0x3d8fc36e, 0x3d97b8ca, 0x3d9facf8, 0x3da79feb, 0x3daf9192, 0x3db781df, 0x3dbf70c1, 0x3dc75e2a, + 0x3dcf4a0b, 0x3dd73454, 0x3ddf1cf6, 0x3de703e3, 0x3deee90c, 0x3df6cc61, 0x3dfeadd5, 0x3e0346ac, + 0x3e07356e, 0x3e0b232a, 0x3e0f0fd8, 0x3e12fb71, 0x3e16e5ee, 0x3e1acf47, 0x3e1eb777, 0x3e229e76, + 0x3e26843d, 0x3e2a68c6, 0x3e2e4c09, 0x3e322e00, 0x3e360ea4, 0x3e39edef, 0x3e3dcbda, 0x3e41a85f, + 0x3e458377, 0x3e495d1c, 0x3e4d3547, 0x3e510bf3, 0x3e54e119, 0x3e58b4b3, 0x3e5c86bb, 0x3e60572a, + 0x3e6425fc, 0x3e67f32a, 0x3e6bbeaf, 0x3e6f8884, 0x3e7350a4, 0x3e77170a, 0x3e7adbb0, 0x3e7e9e90, + 0x3e812fd3, 0x3e830f75, 0x3e84ee2d, 0x3e86cbf7, 0x3e88a8d2, 0x3e8a84ba, 0x3e8c5fad, 0x3e8e39a9, + 0x3e9012ab, 0x3e91eab1, 0x3e93c1b9, 0x3e9597c0, 0x3e976cc4, 0x3e9940c2, 0x3e9b13ba, 0x3e9ce5a7, + 0x3e9eb689, 0x3ea0865d, 0x3ea25522, 0x3ea422d4, 0x3ea5ef73, 0x3ea7bafc, 0x3ea9856d, 0x3eab4ec4, + 0x3ead1701, 0x3eaede20, 0x3eb0a420, 0x3eb26900, 0x3eb42cbd, 0x3eb5ef56, 0x3eb7b0ca, 0x3eb97117, + 0x3ebb303b, 0x3ebcee34, 0x3ebeab02, 0x3ec066a3, 0x3ec22116, 0x3ec3da58, 0x3ec5926a, 0x3ec74949, + 0x3ec8fef4, 0x3ecab36a, 0x3ecc66aa, 0x3ece18b3, 0x3ecfc983, 0x3ed1791a, 0x3ed32776, 0x3ed4d497, + 0x3ed6807b, 0x3ed82b21, 0x3ed9d489, 0x3edb7cb1, 
0x3edd239a, 0x3edec941, 0x3ee06da6, 0x3ee210c9, + 0x3ee3b2a8, 0x3ee55344, 0x3ee6f29a, 0x3ee890ab, 0x3eea2d76, 0x3eebc8fb, 0x3eed6338, 0x3eeefc2e, + 0x3ef093db, 0x3ef22a40, 0x3ef3bf5c, 0x3ef5532e, 0x3ef6e5b7, 0x3ef876f5, 0x3efa06e8, 0x3efb9591, + 0x3efd22ef, 0x3efeaf01, 0x3f001ce4, 0x3f00e1a1, 0x3f01a5b8, 0x3f02692a, 0x3f032bf5, 0x3f03ee1a, + 0x3f04af98, 0x3f057071, 0x3f0630a3, 0x3f06f02f, 0x3f07af14, 0x3f086d54, 0x3f092aed, 0x3f09e7e0, + 0x3f0aa42d, 0x3f0b5fd3, 0x3f0c1ad4, 0x3f0cd52f, 0x3f0d8ee4, 0x3f0e47f4, 0x3f0f005d, 0x3f0fb822, + 0x3f106f41, 0x3f1125ba, 0x3f11db8f, 0x3f1290bf, 0x3f13454a, 0x3f13f931, 0x3f14ac73, 0x3f155f11, + 0x3f16110b, 0x3f16c261, 0x3f177314, 0x3f182324, 0x3f18d290, 0x3f19815a, 0x3f1a2f81, 0x3f1add06, + 0x3f1b89e8, 0x3f1c3629, 0x3f1ce1c9, 0x3f1d8cc7, 0x3f1e3725, 0x3f1ee0e1, 0x3f1f89fe, 0x3f20327a, + 0x3f20da57, 0x3f218194, 0x3f222833, 0x3f22ce33, 0x3f237394, 0x3f241857, 0x3f24bc7d, 0x3f256006, + 0x3f2602f1, 0x3f26a540, 0x3f2746f3, 0x3f27e80a, 0x3f288885, 0x3f292866, 0x3f29c7ac, 0x3f2a6658, + 0x3f2b0469, 0x3f2ba1e2, 0x3f2c3ec1, 0x3f2cdb08, 0x3f2d76b6, 0x3f2e11cd, 0x3f2eac4c, 0x3f2f4635, + 0x3f2fdf87, 0x3f307842, 0x3f311069, 0x3f31a7fa, 0x3f323ef6, 0x3f32d55e, 0x3f336b32, 0x3f340072, + 0x3f349520, 0x3f35293b, 0x3f35bcc5, 0x3f364fbc, 0x3f36e223, 0x3f3773f9, 0x3f38053e, 0x3f3895f4, + 0x3f39261b, 0x3f39b5b3, 0x3f3a44bc, 0x3f3ad338, 0x3f3b6127, 0x3f3bee89, 0x3f3c7b5e, 0x3f3d07a7, + 0x3f3d9365, 0x3f3e1e99, 0x3f3ea941, 0x3f3f3360, 0x3f3fbcf5, 0x3f404602, 0x3f40ce86, 0x3f415682, + 0x3f41ddf6, 0x3f4264e4, 0x3f42eb4b, 0x3f43712c, 0x3f43f687, 0x3f447b5e, 0x3f44ffb0, 0x3f45837e, + 0x3f4606c9, 0x3f468990, 0x3f470bd5, 0x3f478d98, 0x3f480eda, 0x3f488f9b, 0x3f490fdb, 0x3f800000, + 0x3f8164d2, 0x3f82cd87, 0x3f843a29, 0x3f85aac3, 0x3f871f62, 0x3f88980f, 0x3f8a14d5, 0x3f8b95c2, + 0x3f8d1adf, 0x3f8ea43a, 0x3f9031dc, 0x3f91c3d3, 0x3f935a2b, 0x3f94f4f0, 0x3f96942d, 0x3f9837f0, + 0x3f99e046, 0x3f9b8d3a, 0x3f9d3eda, 0x3f9ef532, 0x3fa0b051, 0x3fa27043, 0x3fa43516, 
0x3fa5fed7, + 0x3fa7cd94, 0x3fa9a15b, 0x3fab7a3a, 0x3fad583f, 0x3faf3b79, 0x3fb123f6, 0x3fb311c4, 0x3fb504f3, + 0x3fb6fd92, 0x3fb8fbaf, 0x3fbaff5b, 0x3fbd08a4, 0x3fbf179a, 0x3fc12c4d, 0x3fc346cd, 0x3fc5672a, + 0x3fc78d75, 0x3fc9b9be, 0x3fcbec15, 0x3fce248c, 0x3fd06334, 0x3fd2a81e, 0x3fd4f35b, 0x3fd744fd, + 0x3fd99d16, 0x3fdbfbb8, 0x3fde60f5, 0x3fe0ccdf, 0x3fe33f89, 0x3fe5b907, 0x3fe8396a, 0x3feac0c7, + 0x3fed4f30, 0x3fefe4ba, 0x3ff28177, 0x3ff5257d, 0x3ff7d0df, 0x3ffa83b3, 0x3ffd3e0c, 0x40000000, + 0x40000000, 0x3ffe03f8, 0x3ffc0fc1, 0x3ffa232d, 0x3ff83e10, 0x3ff6603e, 0x3ff4898d, 0x3ff2b9d6, + 0x3ff0f0f1, 0x3fef2eb7, 0x3fed7304, 0x3febbdb3, 0x3fea0ea1, 0x3fe865ac, 0x3fe6c2b4, 0x3fe52598, + 0x3fe38e39, 0x3fe1fc78, 0x3fe07038, 0x3fdee95c, 0x3fdd67c9, 0x3fdbeb62, 0x3fda740e, 0x3fd901b2, + 0x3fd79436, 0x3fd62b81, 0x3fd4c77b, 0x3fd3680d, 0x3fd20d21, 0x3fd0b6a0, 0x3fcf6475, 0x3fce168a, + 0x3fcccccd, 0x3fcb8728, 0x3fca4588, 0x3fc907da, 0x3fc7ce0c, 0x3fc6980c, 0x3fc565c8, 0x3fc43730, + 0x3fc30c31, 0x3fc1e4bc, 0x3fc0c0c1, 0x3fbfa030, 0x3fbe82fa, 0x3fbd6910, 0x3fbc5264, 0x3fbb3ee7, + 0x3fba2e8c, 0x3fb92144, 0x3fb81703, 0x3fb70fbb, 0x3fb60b61, 0x3fb509e7, 0x3fb40b41, 0x3fb30f63, + 0x3fb21643, 0x3fb11fd4, 0x3fb02c0b, 0x3faf3ade, 0x3fae4c41, 0x3fad602b, 0x3fac7692, 0x3fab8f6a, + 0x3faaaaab, 0x3fa9c84a, 0x3fa8e83f, 0x3fa80a81, 0x3fa72f05, 0x3fa655c4, 0x3fa57eb5, 0x3fa4a9cf, + 0x3fa3d70a, 0x3fa3065e, 0x3fa237c3, 0x3fa16b31, 0x3fa0a0a1, 0x3f9fd80a, 0x3f9f1166, 0x3f9e4cad, + 0x3f9d89d9, 0x3f9cc8e1, 0x3f9c09c1, 0x3f9b4c70, 0x3f9a90e8, 0x3f99d723, 0x3f991f1a, 0x3f9868c8, + 0x3f97b426, 0x3f97012e, 0x3f964fda, 0x3f95a025, 0x3f94f209, 0x3f944581, 0x3f939a86, 0x3f92f114, + 0x3f924925, 0x3f91a2b4, 0x3f90fdbc, 0x3f905a38, 0x3f8fb824, 0x3f8f177a, 0x3f8e7835, 0x3f8dda52, + 0x3f8d3dcb, 0x3f8ca29c, 0x3f8c08c1, 0x3f8b7034, 0x3f8ad8f3, 0x3f8a42f8, 0x3f89ae41, 0x3f891ac7, + 0x3f888889, 0x3f87f781, 0x3f8767ab, 0x3f86d905, 0x3f864b8a, 0x3f85bf37, 0x3f853408, 0x3f84a9fa, + 0x3f842108, 0x3f839930, 
0x3f83126f, 0x3f828cc0, 0x3f820821, 0x3f81848e, 0x3f810204, 0x3f808081, + 0x3f800000, 0x00000000, 0x399f22b4, 0x3a1f22b4, 0x3a6eb40e, 0x3a9f22b4, 0x3ac6eb61, 0x3aeeb40e, + 0x3b0b3e5d, 0x3b1f22b4, 0x3b33070a, 0x3b46eb61, 0x3b5b518e, 0x3b70f18f, 0x3b83e1c6, 0x3b8fe616, + 0x3b9c87fd, 0x3ba9c9b6, 0x3bb7ad6f, 0x3bc6354a, 0x3bd56360, 0x3be539c1, 0x3bf5ba71, 0x3c0373b6, + 0x3c0c6153, 0x3c15a705, 0x3c1f45be, 0x3c293e6b, 0x3c3391f7, 0x3c3e4149, 0x3c494d44, 0x3c54b6c9, + 0x3c607eb4, 0x3c6ca5df, 0x3c792d22, 0x3c830aa9, 0x3c89af9f, 0x3c9085dc, 0x3c978dc6, 0x3c9ec7c2, + 0x3ca63433, 0x3cadd37d, 0x3cb5a602, 0x3cbdac21, 0x3cc5e63a, 0x3cce54ac, 0x3cd6f7d5, 0x3cdfd010, + 0x3ce8ddba, 0x3cf2212d, 0x3cfb9ac3, 0x3d02a56a, 0x3d0798dd, 0x3d0ca7e6, 0x3d11d2af, 0x3d171964, + 0x3d1c7c30, 0x3d21fb3c, 0x3d2796b2, 0x3d2d4ebb, 0x3d332381, 0x3d39152b, 0x3d3f23e4, 0x3d454fd2, + 0x3d4b991d, 0x3d51ffec, 0x3d588468, 0x3d5f26b6, 0x3d65e6fd, 0x3d6cc563, 0x3d73c20e, 0x3d7add24, + 0x3d810b65, 0x3d84b793, 0x3d88732e, 0x3d8c3e48, 0x3d9018f4, 0x3d940344, 0x3d97fd49, 0x3d9c0715, + 0x3da020ba, 0x3da44a4a, 0x3da883d6, 0x3daccd6f, 0x3db12727, 0x3db5910f, 0x3dba0b38, 0x3dbe95b3, + 0x3dc33090, 0x3dc7dbe0, 0x3dcc97b4, 0x3dd1641d, 0x3dd6412b, 0x3ddb2eee, 0x3de02d76, 0x3de53cd4, + 0x3dea5d18, 0x3def8e51, 0x3df4d090, 0x3dfa23e5, 0x3dff885e, 0x3e027f06, 0x3e05427f, 0x3e080ea2, + 0x3e0ae377, 0x3e0dc104, 0x3e10a753, 0x3e13966a, 0x3e168e51, 0x3e198f0f, 0x3e1c98ac, 0x3e1fab30, + 0x3e22c6a1, 0x3e25eb07, 0x3e29186a, 0x3e2c4ed0, 0x3e2f8e42, 0x3e32d6c5, 0x3e362862, 0x3e39831f, + 0x3e3ce703, 0x3e405417, 0x3e43ca60, 0x3e4749e6, 0x3e4ad2af, 0x3e4e64c3, 0x3e520029, 0x3e55a4e7, + 0x3e595305, 0x3e5d0a89, 0x3e60cb7a, 0x3e6495df, 0x3e6869be, 0x3e6c471f, 0x3e702e07, 0x3e741e7e, + 0x3e78188b, 0x3e7c1c33, 0x3e8014bf, 0x3e822039, 0x3e84308b, 0x3e8645b8, 0x3e885fc3, 0x3e8a7eb0, + 0x3e8ca281, 0x3e8ecb3b, 0x3e90f8df, 0x3e932b72, 0x3e9562f6, 0x3e979f6f, 0x3e99e0e0, 0x3e9c274c, + 0x3e9e72b6, 0x3ea0c321, 0x3ea31890, 0x3ea57307, 0x3ea7d288, 
0x3eaa3716, 0x3eaca0b6, 0x3eaf0f68, + 0x3eb18332, 0x3eb3fc15, 0x3eb67a14, 0x3eb8fd34, 0x3ebb8576, 0x3ebe12de, 0x3ec0a56e, 0x3ec33d2a, + 0x3ec5da14, 0x3ec87c30, 0x3ecb2380, 0x3ecdd008, 0x3ed081ca, 0x3ed338c9, 0x3ed5f508, 0x3ed8b68a, + 0x3edb7d52, 0x3ede4963, 0x3ee11abf, 0x3ee3f169, 0x3ee6cd65, 0x3ee9aeb5, 0x3eec955b, 0x3eef815c, + 0x3ef272b8, 0x3ef56974, 0x3ef86593, 0x3efb6716, 0x3efe6e00, 0x3f00bd2b, 0x3f02460c, 0x3f03d1a5, + 0x3f055ff7, 0x3f06f104, 0x3f0884cd, 0x3f0a1b54, 0x3f0bb499, 0x3f0d509f, 0x3f0eef65, 0x3f1090ef, + 0x3f12353d, 0x3f13dc50, 0x3f15862a, 0x3f1732cc, 0x3f18e237, 0x3f1a946e, 0x3f1c4970, 0x3f1e0140, + 0x3f1fbbde, 0x3f21794d, 0x3f23398c, 0x3f24fc9f, 0x3f26c285, 0x3f288b41, 0x3f2a56d2, 0x3f2c253c, + 0x3f2df67f, 0x3f2fca9c, 0x3f31a194, 0x3f337b6a, 0x3f35581d, 0x3f3737b0, 0x3f391a24, 0x3f3aff7a, + 0x3f3ce7b2, 0x3f3ed2cf, 0x3f40c0d2, 0x3f42b1bc, 0x3f44a58e, 0x3f469c49, 0x3f4895ef, 0x3f4a9280, + 0x3f4c91ff, 0x3f4e946c, 0x3f5099c9, 0x3f52a216, 0x3f54ad56, 0x3f56bb88, 0x3f58ccaf, 0x3f5ae0cc, + 0x3f5cf7df, 0x3f5f11ea, 0x3f612eef, 0x3f634eee, 0x3f6571e9, 0x3f6797e0, 0x3f69c0d5, 0x3f6becca, + 0x3f6e1bbf, 0x3f704db5, 0x3f7282ae, 0x3f74baab, 0x3f76f5ae, 0x3f7933b6, 0x3f7b74c6, 0x3f7db8de, + 0x3f800000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000000a2, + 0x000003e6, 0x00000036, 0x00000393, 0x00000244, 0x00000054, 0x0000029f, 0x00000309, 0x00000357, + 0x00000347, 0x00000353, 0x00000137, 0x000001c0, 0x0000036d, 0x00000229, 0x00000166, 0x0000013c, + 0x0000010e, 0x00000104, 0x0000007f, 0x00000251, 0x0000018e, 0x000002bd, 0x000003ae, 0x000003c5, + 0x00000186, 0x00000372, 0x0000011b, 0x0000023a, 0x00000109, 0x000000dd, 0x000000b8, 0x00000006, + 0x00000124, 0x000002ee, 0x00000282, 0x000001d1, 0x00000248, 0x000001cf, 0x00000387, 0x000001eb, + 0x00000072, 0x00000312, 0x00000269, 0x0000033e, 0x000003a2, 0x00000023, 0x0000017d, 0x0000012e, + 0x000002ed, 0x00000048, 0x0000013a, 0x0000019c, 0x000001c0, 0x0000026b, 0x00000117, 0x0000037e, + 
0x00000104, 0x00000399, 0x00000075, 0x00000239, 0x0000020d, 0x00000133, 0x0000027d, 0x0000009c, + 0x00000211, 0x000001f8, 0x000002ef, 0x000001f9, 0x000000a0, 0x000003b1, 0x000003fe, 0x00000097, + 0x000003ff, 0x000001e0, 0x00000166, 0x0000000f, 0x000003bc, 0x000002f1, 0x00000062, 0x0000035a, + 0x00000029, 0x000002d1, 0x000003db, 0x00000136, 0x000001fb, 0x000000f2, 0x000001f2, 0x00000309, + 0x000002dd, 0x000000f4, 0x0000018f, 0x00000366, 0x00000279, 0x000001fe, 0x0000028b, 0x00000175, + 0x0000009e, 0x000003ac, 0x000001fa, 0x000003e5, 0x000003c5, 0x000003b3, 0x00000341, 0x00000339, + 0x000003de, 0x000000a5, 0x000000a4, 0x000002ea, 0x000001af, 0x000003b5, 0x000003ec, 0x0000011f, + 0x00000235, 0x000001d0, 0x00000215, 0x00000203, 0x000000c1, 0x0000006f, 0x0000031e, 0xc11bf1e0, + 0x7421580c, 0x7ec47e35, 0x4ba9afed, 0xe7de294a, 0xc5ecf41c, 0xeb1faf97, 0xa8b5d49e, 0xfd9a797f, + 0x26dd3d18, 0xfb3c9f2c, 0xb47db4d9, 0x462d6829, 0x603fbcbc, 0x5fff7816, 0xa0ec7fe2, 0x7e2ef7e4, + 0xe7d27211, 0x58e60d4c, 0xf904e647, 0xc09ad17d, 0x1213a671, 0xd7d4baed, 0x9cfba208, 0xac72c4a6, + 0x4873f877, 0xbba82746, 0x4b801924, 0xb8e90937, 0x1586dc91, 0x8eaf7aef, 0x4107f945, 0x5664f10e, + 0x77036d8a, 0x5f47d4d3, 0x54a7f09d, 0x0db93910, 0x00028be6, 0x00000000, 0x00000000, 0x00000000}; +#endif // TABLES_HPP_ diff --git a/rocclr/runtime/device/cpu/cpuvirtual.cpp b/rocclr/runtime/device/cpu/cpuvirtual.cpp index 0f94cbbb24..15ab049177 100644 --- a/rocclr/runtime/device/cpu/cpuvirtual.cpp +++ b/rocclr/runtime/device/cpu/cpuvirtual.cpp @@ -17,367 +17,328 @@ namespace cpu { amd::Atomic VirtualCPU::numWorkerThreads_(0); -VirtualCPU::VirtualCPU(Device& device) - : device::VirtualDevice(device), acceptingCommands_(false) -{ - const size_t numCores = device.info().maxComputeUnits_; +VirtualCPU::VirtualCPU(Device& device) : device::VirtualDevice(device), acceptingCommands_(false) { + const size_t numCores = device.info().maxComputeUnits_; - if ((numWorkerThreads_ += numCores) >= 
Device::getMaxWorkerThreadsNumber()) { - numWorkerThreads_ -= numCores; - cores_ = NULL; - return; - } - - cores_ = new(std::nothrow) WorkerThread*[numCores]; - if (cores_ == NULL) { - return; - } - - // Clear memory for the worker threads - memset(cores_, 0, numCores * sizeof(WorkerThread*)); - -#if defined(__linux__) - const bool isNuma = -#if defined(NUMA_SUPPORT) - device.getNumaMask() == NULL; -#else - false; -#endif // NUMA_SUPPORT - const amd::Os::ThreadAffinityMask* affinityMask = isNuma ? NULL : -#else - const amd::Os::ThreadAffinityMask* affinityMask = -#endif - device.getWorkerThreadsAffinity(); - - uint coreId = affinityMask != NULL ? affinityMask->getFirstSet() : (uint)-1; - - for (size_t i = 0; i < numCores; ++i) { - WorkerThread* thread = cores_[i] = new WorkerThread(device); - if (thread == NULL) { - for (size_t j = 0; j < i; ++j) { - cores_[j]->resume(); - } - return; - } - - if (thread->state() != amd::Thread::INITIALIZED) { - return; - } - -#if defined(__linux__) - if (!isNuma) { - if (coreId == (uint)-1) { - thread->setAffinity((uint) i); - } - else { - thread->setAffinity(coreId); - coreId = affinityMask->getNextSet(coreId); - } - } -#else // On Windows we set an affinity mask and not a specific ID. 
- if (coreId != (uint)-1) { - thread->setAffinity(*affinityMask); - } -#endif - thread->start(); - } - - blitMgr_ = new device::HostBlitManager(*this); - if ((NULL == blitMgr_) || !blitMgr_->create(device)) { - LogError("Could not create BlitManager!"); - return; - } - - acceptingCommands_ = true; -} - -VirtualCPU::~VirtualCPU() -{ - if (cores_ == NULL) { - return; - } - - delete blitMgr_; - - const size_t numCores = device().info().maxComputeUnits_; - for (size_t i = 0; i < numCores; ++i) { - delete cores_[i]; - } + if ((numWorkerThreads_ += numCores) >= Device::getMaxWorkerThreadsNumber()) { numWorkerThreads_ -= numCores; - delete[] cores_; + cores_ = NULL; + return; + } + + cores_ = new (std::nothrow) WorkerThread*[numCores]; + if (cores_ == NULL) { + return; + } + + // Clear memory for the worker threads + memset(cores_, 0, numCores * sizeof(WorkerThread*)); + +#if defined(__linux__) + const bool isNuma = +#if defined(NUMA_SUPPORT) + device.getNumaMask() == NULL; +#else + false; +#endif // NUMA_SUPPORT + const amd::Os::ThreadAffinityMask* affinityMask = isNuma ? NULL : +#else + const amd::Os::ThreadAffinityMask* affinityMask = +#endif + device.getWorkerThreadsAffinity(); + + uint coreId = affinityMask != NULL ? affinityMask->getFirstSet() : (uint)-1; + + for (size_t i = 0; i < numCores; ++i) { + WorkerThread* thread = cores_[i] = new WorkerThread(device); + if (thread == NULL) { + for (size_t j = 0; j < i; ++j) { + cores_[j]->resume(); + } + return; + } + + if (thread->state() != amd::Thread::INITIALIZED) { + return; + } + +#if defined(__linux__) + if (!isNuma) { + if (coreId == (uint)-1) { + thread->setAffinity((uint)i); + } else { + thread->setAffinity(coreId); + coreId = affinityMask->getNextSet(coreId); + } + } +#else // On Windows we set an affinity mask and not a specific ID. 
+ if (coreId != (uint)-1) { + thread->setAffinity(*affinityMask); + } +#endif + thread->start(); + } + + blitMgr_ = new device::HostBlitManager(*this); + if ((NULL == blitMgr_) || !blitMgr_->create(device)) { + LogError("Could not create BlitManager!"); + return; + } + + acceptingCommands_ = true; } -bool -VirtualCPU::terminate() -{ - if (cores_ == NULL) { - return true; - } +VirtualCPU::~VirtualCPU() { + if (cores_ == NULL) { + return; + } - const size_t numCores = device().info().maxComputeUnits_; - for (size_t i = 0; i < numCores; ++i) { - if (cores_[i]) { - cores_[i]->terminate(); - } - } + delete blitMgr_; + + const size_t numCores = device().info().maxComputeUnits_; + for (size_t i = 0; i < numCores; ++i) { + delete cores_[i]; + } + numWorkerThreads_ -= numCores; + delete[] cores_; +} + +bool VirtualCPU::terminate() { + if (cores_ == NULL) { return true; + } + + const size_t numCores = device().info().maxComputeUnits_; + for (size_t i = 0; i < numCores; ++i) { + if (cores_[i]) { + cores_[i]->terminate(); + } + } + return true; } -void -VirtualCPU::submitReadMemory(amd::ReadMemoryCommand& vcmd) -{ - vcmd.setStatus(CL_RUNNING); +void VirtualCPU::submitReadMemory(amd::ReadMemoryCommand& vcmd) { + vcmd.setStatus(CL_RUNNING); - bool result = false; - device::Memory memory(vcmd.source()); + bool result = false; + device::Memory memory(vcmd.source()); - // Ensure memory up-to-date - vcmd.source().cacheWriteBack(); + // Ensure memory up-to-date + vcmd.source().cacheWriteBack(); - switch (vcmd.type()) { + switch (vcmd.type()) { case CL_COMMAND_READ_BUFFER: - result = blitMgr().readBuffer(memory, vcmd.destination(), - vcmd.origin(), vcmd.size(), vcmd.isEntireMemory()); - break; + result = blitMgr().readBuffer(memory, vcmd.destination(), vcmd.origin(), vcmd.size(), + vcmd.isEntireMemory()); + break; case CL_COMMAND_READ_BUFFER_RECT: - result = blitMgr().readBufferRect(memory, - vcmd.destination(), vcmd.bufRect(), vcmd.hostRect(), vcmd.size(), - vcmd.isEntireMemory()); - 
break; + result = blitMgr().readBufferRect(memory, vcmd.destination(), vcmd.bufRect(), vcmd.hostRect(), + vcmd.size(), vcmd.isEntireMemory()); + break; case CL_COMMAND_READ_IMAGE: - result = blitMgr().readImage(memory, vcmd.destination(), - vcmd.origin(), vcmd.size(), vcmd.rowPitch(), vcmd.slicePitch(), - vcmd.isEntireMemory()); - break; + result = blitMgr().readImage(memory, vcmd.destination(), vcmd.origin(), vcmd.size(), + vcmd.rowPitch(), vcmd.slicePitch(), vcmd.isEntireMemory()); + break; default: - LogError("Unsupported type for the read command"); - break; - } + LogError("Unsupported type for the read command"); + break; + } - if (!result) { - LogError("submitReadMemory failed!"); - vcmd.setStatus(CL_INVALID_OPERATION); - } - else { - vcmd.setStatus(CL_COMPLETE); - } + if (!result) { + LogError("submitReadMemory failed!"); + vcmd.setStatus(CL_INVALID_OPERATION); + } else { + vcmd.setStatus(CL_COMPLETE); + } } -void -VirtualCPU::submitWriteMemory(amd::WriteMemoryCommand& vcmd) -{ - vcmd.setStatus(CL_RUNNING); +void VirtualCPU::submitWriteMemory(amd::WriteMemoryCommand& vcmd) { + vcmd.setStatus(CL_RUNNING); - bool result = false; - device::Memory memory(vcmd.destination()); + bool result = false; + device::Memory memory(vcmd.destination()); - // Ensure memory up-to-date - vcmd.destination().cacheWriteBack(); + // Ensure memory up-to-date + vcmd.destination().cacheWriteBack(); - // Process different write commands - switch (vcmd.type()) { + // Process different write commands + switch (vcmd.type()) { case CL_COMMAND_WRITE_BUFFER: - result = blitMgr().writeBuffer(vcmd.source(), memory, - vcmd.origin(), vcmd.size(), vcmd.isEntireMemory()); - break; + result = blitMgr().writeBuffer(vcmd.source(), memory, vcmd.origin(), vcmd.size(), + vcmd.isEntireMemory()); + break; case CL_COMMAND_WRITE_BUFFER_RECT: - result = blitMgr().writeBufferRect(vcmd.source(), memory, - vcmd.hostRect(), vcmd.bufRect(), vcmd.size(), - vcmd.isEntireMemory()); - break; + result = 
blitMgr().writeBufferRect(vcmd.source(), memory, vcmd.hostRect(), vcmd.bufRect(), + vcmd.size(), vcmd.isEntireMemory()); + break; case CL_COMMAND_WRITE_IMAGE: - result = blitMgr().writeImage(vcmd.source(), memory, - vcmd.origin(), vcmd.size(), vcmd.rowPitch(), vcmd.slicePitch(), - vcmd.isEntireMemory()); - break; + result = blitMgr().writeImage(vcmd.source(), memory, vcmd.origin(), vcmd.size(), + vcmd.rowPitch(), vcmd.slicePitch(), vcmd.isEntireMemory()); + break; default: - LogError("Unsupported type for the write command"); - break; - } + LogError("Unsupported type for the write command"); + break; + } - // Mark cache as clean (CPU works directly on backing store) - vcmd.destination().signalWrite(NULL); + // Mark cache as clean (CPU works directly on backing store) + vcmd.destination().signalWrite(NULL); - if (!result) { - LogError("submitWriteMemory failed!"); - vcmd.setStatus(CL_INVALID_OPERATION); - } - else { - vcmd.setStatus(CL_COMPLETE); - } + if (!result) { + LogError("submitWriteMemory failed!"); + vcmd.setStatus(CL_INVALID_OPERATION); + } else { + vcmd.setStatus(CL_COMPLETE); + } } -void -VirtualCPU::submitCopyMemory(amd::CopyMemoryCommand& vcmd) -{ - vcmd.setStatus(CL_RUNNING); +void VirtualCPU::submitCopyMemory(amd::CopyMemoryCommand& vcmd) { + vcmd.setStatus(CL_RUNNING); - // Ensure memory up-to-date - vcmd.source().cacheWriteBack(); - vcmd.destination().cacheWriteBack(); + // Ensure memory up-to-date + vcmd.source().cacheWriteBack(); + vcmd.destination().cacheWriteBack(); - // Translate memory references and ensure cache up-to-date - device::Memory dstMemory(vcmd.destination()); - device::Memory srcMemory(vcmd.source()); + // Translate memory references and ensure cache up-to-date + device::Memory dstMemory(vcmd.destination()); + device::Memory srcMemory(vcmd.source()); - bool result = false; + bool result = false; - // Check if HW can be used for memory copy - switch (vcmd.type()) { + // Check if HW can be used for memory copy + switch (vcmd.type()) 
{ case CL_COMMAND_COPY_BUFFER: - result = blitMgr().copyBuffer(srcMemory, dstMemory, - vcmd.srcOrigin(), vcmd.dstOrigin(), vcmd.size(), - vcmd.isEntireMemory()); - break; + result = blitMgr().copyBuffer(srcMemory, dstMemory, vcmd.srcOrigin(), vcmd.dstOrigin(), + vcmd.size(), vcmd.isEntireMemory()); + break; case CL_COMMAND_COPY_BUFFER_RECT: - result = blitMgr().copyBufferRect(srcMemory, dstMemory, - vcmd.srcRect(), vcmd.dstRect(), vcmd.size(), - vcmd.isEntireMemory()); - break; + result = blitMgr().copyBufferRect(srcMemory, dstMemory, vcmd.srcRect(), vcmd.dstRect(), + vcmd.size(), vcmd.isEntireMemory()); + break; case CL_COMMAND_COPY_IMAGE_TO_BUFFER: - result = blitMgr().copyImageToBuffer(srcMemory, dstMemory, - vcmd.srcOrigin(), vcmd.dstOrigin(), vcmd.size(), - vcmd.isEntireMemory()); - break; + result = blitMgr().copyImageToBuffer(srcMemory, dstMemory, vcmd.srcOrigin(), vcmd.dstOrigin(), + vcmd.size(), vcmd.isEntireMemory()); + break; case CL_COMMAND_COPY_BUFFER_TO_IMAGE: - result = blitMgr().copyBufferToImage(srcMemory, dstMemory, - vcmd.srcOrigin(), vcmd.dstOrigin(), vcmd.size(), - vcmd.isEntireMemory()); - break; + result = blitMgr().copyBufferToImage(srcMemory, dstMemory, vcmd.srcOrigin(), vcmd.dstOrigin(), + vcmd.size(), vcmd.isEntireMemory()); + break; case CL_COMMAND_COPY_IMAGE: - result = blitMgr().copyImage(srcMemory, dstMemory, - vcmd.srcOrigin(), vcmd.dstOrigin(), vcmd.size(), - vcmd.isEntireMemory()); - break; + result = blitMgr().copyImage(srcMemory, dstMemory, vcmd.srcOrigin(), vcmd.dstOrigin(), + vcmd.size(), vcmd.isEntireMemory()); + break; default: - LogError("Unsupported command type for memory copy!"); - break; - } + LogError("Unsupported command type for memory copy!"); + break; + } - // Mark cache as clean (CPU works directly on backing store) - vcmd.destination().signalWrite(NULL); + // Mark cache as clean (CPU works directly on backing store) + vcmd.destination().signalWrite(NULL); - if (!result) { - LogError("submitCopyMemory failed!"); - 
vcmd.setStatus(CL_INVALID_OPERATION); - } - else { - vcmd.setStatus(CL_COMPLETE); - } + if (!result) { + LogError("submitCopyMemory failed!"); + vcmd.setStatus(CL_INVALID_OPERATION); + } else { + vcmd.setStatus(CL_COMPLETE); + } } -void -VirtualCPU::submitMapMemory(amd::MapMemoryCommand& cmd) -{ - cmd.setStatus(CL_RUNNING); +void VirtualCPU::submitMapMemory(amd::MapMemoryCommand& cmd) { + cmd.setStatus(CL_RUNNING); - if (cmd.mapFlags() & CL_MAP_READ - || cmd.mapFlags() & CL_MAP_WRITE) { - LogInfo("cpu::VirtualCPU::submitMapMemory() CL_MAP_READ and CL_MAP_WRITE ignored"); - } + if (cmd.mapFlags() & CL_MAP_READ || cmd.mapFlags() & CL_MAP_WRITE) { + LogInfo("cpu::VirtualCPU::submitMapMemory() CL_MAP_READ and CL_MAP_WRITE ignored"); + } - // Ensure memory up-to-date - cmd.memory().cacheWriteBack(); + // Ensure memory up-to-date + cmd.memory().cacheWriteBack(); - cmd.setStatus(CL_COMPLETE); + cmd.setStatus(CL_COMPLETE); } -void -VirtualCPU::submitUnmapMemory(amd::UnmapMemoryCommand& cmd) -{ - cmd.setStatus(CL_RUNNING); +void VirtualCPU::submitUnmapMemory(amd::UnmapMemoryCommand& cmd) { + cmd.setStatus(CL_RUNNING); - // Mark cache as clean (CPU works directly on backing store) - cmd.memory().signalWrite(NULL); + // Mark cache as clean (CPU works directly on backing store) + cmd.memory().signalWrite(NULL); - //! @todo:dgladdin: strictly speaking we should check that the mem object was mapped - cmd.setStatus(CL_COMPLETE); + //! 
@todo:dgladdin: strictly speaking we should check that the mem object was mapped + cmd.setStatus(CL_COMPLETE); } -void -VirtualCPU::submitFillMemory(amd::FillMemoryCommand& vcmd) -{ - vcmd.setStatus(CL_RUNNING); +void VirtualCPU::submitFillMemory(amd::FillMemoryCommand& vcmd) { + vcmd.setStatus(CL_RUNNING); - device::Memory memory(vcmd.memory()); + device::Memory memory(vcmd.memory()); - vcmd.memory().cacheWriteBack(); + vcmd.memory().cacheWriteBack(); - bool result = false; + bool result = false; - // Find the the right fill operation - switch (vcmd.type()) { + // Find the the right fill operation + switch (vcmd.type()) { case CL_COMMAND_FILL_BUFFER: - result = blitMgr().fillBuffer(memory, vcmd.pattern(), - vcmd.patternSize(), vcmd.origin(), vcmd.size(), - vcmd.isEntireMemory()); - break; + result = blitMgr().fillBuffer(memory, vcmd.pattern(), vcmd.patternSize(), vcmd.origin(), + vcmd.size(), vcmd.isEntireMemory()); + break; case CL_COMMAND_FILL_IMAGE: - result = blitMgr().fillImage(memory, vcmd.pattern(), - vcmd.origin(), vcmd.size(), vcmd.isEntireMemory()); - break; + result = blitMgr().fillImage(memory, vcmd.pattern(), vcmd.origin(), vcmd.size(), + vcmd.isEntireMemory()); + break; default: - LogError("Unsupported command type for FillMemory!"); - break; - } + LogError("Unsupported command type for FillMemory!"); + break; + } - vcmd.memory().signalWrite(NULL); + vcmd.memory().signalWrite(NULL); - if (!result) { - LogError("submitFillMemory failed!"); - vcmd.setStatus(CL_INVALID_OPERATION); - } - else { - vcmd.setStatus(CL_COMPLETE); - } + if (!result) { + LogError("submitFillMemory failed!"); + vcmd.setStatus(CL_INVALID_OPERATION); + } else { + vcmd.setStatus(CL_COMPLETE); + } } //! 
Helper function for forcing a cache sync for all kernel parameters -static void syncAllParams(amd::NDRangeKernelCommand& cmd) -{ - const amd::Kernel& kernel = cmd.kernel(); - const amd::KernelParameters& kernelParam = kernel.parameters(); - const amd::KernelSignature& signature = kernel.signature(); - const amd::Device& device = cmd.queue()->device(); +static void syncAllParams(amd::NDRangeKernelCommand& cmd) { + const amd::Kernel& kernel = cmd.kernel(); + const amd::KernelParameters& kernelParam = kernel.parameters(); + const amd::KernelSignature& signature = kernel.signature(); + const amd::Device& device = cmd.queue()->device(); - for (size_t i = 0; i < signature.numParameters(); ++i) { - const amd::KernelParameterDescriptor& desc = signature.at(i); - if (desc.type_ == T_POINTER && desc.size_ > 0 && - !kernelParam.boundToSvmPointer(device, cmd.parameters(), i)) { - address ptr = (address) (cmd.parameters() + desc.offset_); - amd::Memory* memArg = *(amd::Memory**)ptr; + for (size_t i = 0; i < signature.numParameters(); ++i) { + const amd::KernelParameterDescriptor& desc = signature.at(i); + if (desc.type_ == T_POINTER && desc.size_ > 0 && + !kernelParam.boundToSvmPointer(device, cmd.parameters(), i)) { + address ptr = (address)(cmd.parameters() + desc.offset_); + amd::Memory* memArg = *(amd::Memory**)ptr; - if (memArg != NULL) { - memArg->cacheWriteBack(); - memArg->signalWrite(NULL); - } - } + if (memArg != NULL) { + memArg->cacheWriteBack(); + memArg->signalWrite(NULL); + } } + } } -void -VirtualCPU::computeLocalSizes(amd::NDRangeKernelCommand& command, - amd::NDRange& local) { - bool uniformSize = (OPENCL_MAJOR < 2) || - command.kernel().getDeviceKernel(device())->getUniformWorkGroupSize(); +void VirtualCPU::computeLocalSizes(amd::NDRangeKernelCommand& command, amd::NDRange& local) { + bool uniformSize = + (OPENCL_MAJOR < 2) || command.kernel().getDeviceKernel(device())->getUniformWorkGroupSize(); const amd::NDRangeContainer& sizes = command.sizes(); const 
size_t numCores = device().info().maxComputeUnits_; const size_t globalSize1D = sizes.global().product(); - const size_t targetNumOperations = - std::min(globalSize1D, numCores * 4); - size_t localSize1D = - std::min(globalSize1D / targetNumOperations, - device().info().maxWorkGroupSize_); - + const size_t targetNumOperations = std::min(globalSize1D, numCores * 4); + size_t localSize1D = + std::min(globalSize1D / targetNumOperations, device().info().maxWorkGroupSize_); + for (size_t i = 0; i < local.dimensions(); ++i) { const size_t globalSize = sizes.global()[i]; size_t localSize = - std::min(std::min(localSize1D, globalSize), - device().info().maxWorkItemSizes_[i]); - + std::min(std::min(localSize1D, globalSize), device().info().maxWorkItemSizes_[i]); + // local must exactly divide global if uniform size is required // For non uniform size, we could use the work group size hint if (uniformSize && globalSize % localSize != 0) { @@ -395,222 +356,181 @@ VirtualCPU::computeLocalSizes(amd::NDRangeKernelCommand& command, } -static -amd::NDRange computeRemainders(const amd::NDRange& global, - const amd::NDRange& local) -{ +static amd::NDRange computeRemainders(const amd::NDRange& global, const amd::NDRange& local) { amd::NDRange remainders(local.dimensions()); for (size_t i = 0; i < local.dimensions(); ++i) { - remainders[i] = (global[i] % local[i] != 0) ? 1 : 0; + remainders[i] = (global[i] % local[i] != 0) ? 
1 : 0; } return remainders; } -void -VirtualCPU::submitKernel(amd::NDRangeKernelCommand& command) -{ - const amd::NDRangeContainer& sizes = command.sizes(); - const size_t numCores = device().info().maxComputeUnits_; +void VirtualCPU::submitKernel(amd::NDRangeKernelCommand& command) { + const amd::NDRangeContainer& sizes = command.sizes(); + const size_t numCores = device().info().maxComputeUnits_; - amd::NDRange local = sizes.local(); + amd::NDRange local = sizes.local(); - if (local == 0) { - computeLocalSizes(command, local); - } - amd::NDRange remainders = computeRemainders(sizes.global(), local); + if (local == 0) { + computeLocalSizes(command, local); + } + amd::NDRange remainders = computeRemainders(sizes.global(), local); - // number of groups in each dimensions - const amd::NDRange numGroups = (sizes.global() / local) + remainders; + // number of groups in each dimensions + const amd::NDRange numGroups = (sizes.global() / local) + remainders; - size_t numOperations = numGroups.product(); - if (numOperations == 0) { - command.setStatus(CL_COMPLETE); - return; - } - - syncAllParams(command); - // retain the command here instead of retaining in NDRangeKernelBatch' ctor - command.retain(); - - size_t batchCount = std::min(numOperations, numCores); - NDRangeKernelBatch batch(command, *this, numGroups, batchCount); - - Operation::Counter counter(command, batchCount); - command.setData(&counter); - - for (size_t coreId = 0; coreId < batchCount; ++coreId) { - batch.setCoreId(coreId); - cores_[coreId]->enqueue(batch); - cores_[coreId]->flush(); - } - - command.awaitCompletion(); - command.release(); -} - -void -VirtualCPU::submitNativeFn(amd::NativeFnCommand& command) -{ - NativeFn fn(command); - cores_[0]->enqueue(fn); - cores_[0]->flush(); - command.awaitCompletion(); -} - -void -VirtualCPU::submitMarker(amd::Marker& command) -{ + size_t numOperations = numGroups.product(); + if (numOperations == 0) { command.setStatus(CL_COMPLETE); + return; + } + + 
syncAllParams(command); + // retain the command here instead of retaining in NDRangeKernelBatch' ctor + command.retain(); + + size_t batchCount = std::min(numOperations, numCores); + NDRangeKernelBatch batch(command, *this, numGroups, batchCount); + + Operation::Counter counter(command, batchCount); + command.setData(&counter); + + for (size_t coreId = 0; coreId < batchCount; ++coreId) { + batch.setCoreId(coreId); + cores_[coreId]->enqueue(batch); + cores_[coreId]->flush(); + } + + command.awaitCompletion(); + command.release(); } -void -VirtualCPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& cmd) -{ - //! @todo [odintsov]: create an AcquireExtObjectsOperation and enqueue it - //! to a core when a core scheduler is around. - // - // cores_[0]->enqueue(new AcquireExtObjectsOperation(cmd)); - // the code below will be moved to AcquireExtObjectsOperation::execute() - cmd.setStatus(CL_RUNNING); - - // - // AcquireExtObjects execution starts here - // - bool bError = false; - - //! Go through ext objects by one and call member function to execute - //! a sequence of external graphics API commands for each external object - for(std::vector::const_iterator itr = cmd.getMemList().begin(); - itr != cmd.getMemList().end(); itr++) { - if(*itr) { - bError |= !((*itr)->mapExtObjectInCQThread()); - } - } - if(bError) { - cmd.setStatus(CL_INVALID_OPERATION); - } - else { - cmd.setStatus(CL_COMPLETE); - } +void VirtualCPU::submitNativeFn(amd::NativeFnCommand& command) { + NativeFn fn(command); + cores_[0]->enqueue(fn); + cores_[0]->flush(); + command.awaitCompletion(); } -void -VirtualCPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& cmd) -{ - //! @todo [odintsov]: create a ReleaseExtObjectsOperation and enqueue it - //! to a core when a core scheduler is around. 
- // - // cores_[i]->enqueue(new ReleaseExtObjectsOperation(cmd)); - // the code below will be moved to ReleaseExtObjectsOperation::execute() - cmd.setStatus(CL_RUNNING); +void VirtualCPU::submitMarker(amd::Marker& command) { command.setStatus(CL_COMPLETE); } - bool bError = false; +void VirtualCPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& cmd) { + //! @todo [odintsov]: create an AcquireExtObjectsOperation and enqueue it + //! to a core when a core scheduler is around. + // + // cores_[0]->enqueue(new AcquireExtObjectsOperation(cmd)); + // the code below will be moved to AcquireExtObjectsOperation::execute() + cmd.setStatus(CL_RUNNING); - for(std::vector::const_iterator itr = cmd.getMemList().begin(); - itr != cmd.getMemList().end(); itr++) { - if(*itr) { - bError |= !((*itr)->unmapExtObjectInCQThread()); - } - } - if(bError) { - cmd.setStatus(CL_INVALID_OPERATION); - } - else { - cmd.setStatus(CL_COMPLETE); - } -} + // + // AcquireExtObjects execution starts here + // + bool bError = false; -void VirtualCPU::submitPerfCounter(amd::PerfCounterCommand& cmd) -{ - cmd.setStatus(CL_RUNNING); - LogError("We don't support HW perf counters on CPU"); + //! Go through ext objects by one and call member function to execute + //! a sequence of external graphics API commands for each external object + for (std::vector::const_iterator itr = cmd.getMemList().begin(); + itr != cmd.getMemList().end(); itr++) { + if (*itr) { + bError |= !((*itr)->mapExtObjectInCQThread()); + } + } + if (bError) { cmd.setStatus(CL_INVALID_OPERATION); + } else { + cmd.setStatus(CL_COMPLETE); + } } -void VirtualCPU::submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd) -{ - cmd.setStatus(CL_RUNNING); - LogError("We don't support thread trace on CPU"); - cmd.setStatus(CL_INVALID_OPERATION); -} +void VirtualCPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& cmd) { + //! @todo [odintsov]: create a ReleaseExtObjectsOperation and enqueue it + //! 
to a core when a core scheduler is around. + // + // cores_[i]->enqueue(new ReleaseExtObjectsOperation(cmd)); + // the code below will be moved to ReleaseExtObjectsOperation::execute() + cmd.setStatus(CL_RUNNING); -void VirtualCPU::submitThreadTrace(amd::ThreadTraceCommand& cmd) -{ - cmd.setStatus(CL_RUNNING); - LogError("We don't support thread trace on CPU"); - cmd.setStatus(CL_INVALID_OPERATION); -} + bool bError = false; -void -VirtualCPU::flush(amd::Command* list, bool wait) -{ - amd::Command* head = list; - - // Release all commands from the link list - while (head != NULL) { - amd::Command * it = head->getNext(); - head->release(); - head = it; + for (std::vector::const_iterator itr = cmd.getMemList().begin(); + itr != cmd.getMemList().end(); itr++) { + if (*itr) { + bError |= !((*itr)->unmapExtObjectInCQThread()); } -} - -void -VirtualCPU::submitSignal(amd::SignalCommand & cmd) -{ + } + if (bError) { cmd.setStatus(CL_INVALID_OPERATION); + } else { + cmd.setStatus(CL_COMPLETE); + } } -void -VirtualCPU::submitMakeBuffersResident(amd::MakeBuffersResidentCommand & cmd) -{ - cmd.setStatus(CL_INVALID_OPERATION); +void VirtualCPU::submitPerfCounter(amd::PerfCounterCommand& cmd) { + cmd.setStatus(CL_RUNNING); + LogError("We don't support HW perf counters on CPU"); + cmd.setStatus(CL_INVALID_OPERATION); } -void -VirtualCPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd) -{ - cmd.setStatus(CL_RUNNING); - if (cmd.pfnFreeFunc() == NULL) { - // pointers allocated using clSVMAlloc - for (cl_uint i = 0; i < cmd.svmPointers().size(); i++) { - amd::SvmBuffer::free(cmd.context(), cmd.svmPointers()[i]); - } +void VirtualCPU::submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd) { + cmd.setStatus(CL_RUNNING); + LogError("We don't support thread trace on CPU"); + cmd.setStatus(CL_INVALID_OPERATION); +} + +void VirtualCPU::submitThreadTrace(amd::ThreadTraceCommand& cmd) { + cmd.setStatus(CL_RUNNING); + LogError("We don't support thread trace on CPU"); + 
cmd.setStatus(CL_INVALID_OPERATION); +} + +void VirtualCPU::flush(amd::Command* list, bool wait) { + amd::Command* head = list; + + // Release all commands from the link list + while (head != NULL) { + amd::Command* it = head->getNext(); + head->release(); + head = it; + } +} + +void VirtualCPU::submitSignal(amd::SignalCommand& cmd) { cmd.setStatus(CL_INVALID_OPERATION); } + +void VirtualCPU::submitMakeBuffersResident(amd::MakeBuffersResidentCommand& cmd) { + cmd.setStatus(CL_INVALID_OPERATION); +} + +void VirtualCPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd) { + cmd.setStatus(CL_RUNNING); + if (cmd.pfnFreeFunc() == NULL) { + // pointers allocated using clSVMAlloc + for (cl_uint i = 0; i < cmd.svmPointers().size(); i++) { + amd::SvmBuffer::free(cmd.context(), cmd.svmPointers()[i]); } - else { - cmd.pfnFreeFunc()(as_cl(cmd.queue()->asCommandQueue()), cmd.svmPointers().size(), - (void**) (&(cmd.svmPointers()[0])), cmd.userData()); - } - cmd.setStatus(CL_COMPLETE); + } else { + cmd.pfnFreeFunc()(as_cl(cmd.queue()->asCommandQueue()), cmd.svmPointers().size(), + (void**)(&(cmd.svmPointers()[0])), cmd.userData()); + } + cmd.setStatus(CL_COMPLETE); } -void -VirtualCPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) -{ - cmd.setStatus(CL_RUNNING); - amd::SvmBuffer::memFill(cmd.dst(), cmd.src(), cmd.srcSize(), 1); - cmd.setStatus(CL_COMPLETE); +void VirtualCPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) { + cmd.setStatus(CL_RUNNING); + amd::SvmBuffer::memFill(cmd.dst(), cmd.src(), cmd.srcSize(), 1); + cmd.setStatus(CL_COMPLETE); } -void -VirtualCPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) -{ - cmd.setStatus(CL_RUNNING); - amd::SvmBuffer::memFill(cmd.dst(), cmd.pattern(), cmd.patternSize(), cmd.times()); - cmd.setStatus(CL_COMPLETE); +void VirtualCPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) { + cmd.setStatus(CL_RUNNING); + amd::SvmBuffer::memFill(cmd.dst(), cmd.pattern(), cmd.patternSize(), cmd.times()); + 
cmd.setStatus(CL_COMPLETE); } -void -VirtualCPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) -{ - cmd.setStatus(CL_COMPLETE); +void VirtualCPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) { cmd.setStatus(CL_COMPLETE); } + +void VirtualCPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) { + cmd.setStatus(CL_COMPLETE); } -void -VirtualCPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) -{ - cmd.setStatus(CL_COMPLETE); -} - -} // namespace cpu +} // namespace cpu diff --git a/rocclr/runtime/device/cpu/cpuvirtual.hpp b/rocclr/runtime/device/cpu/cpuvirtual.hpp index 5a1ac86b70..3bb2d3394f 100644 --- a/rocclr/runtime/device/cpu/cpuvirtual.hpp +++ b/rocclr/runtime/device/cpu/cpuvirtual.hpp @@ -17,60 +17,52 @@ namespace cpu { class WorkerThread; class Device; -class VirtualCPU : public device::VirtualDevice -{ -private: - WorkerThread** cores_; //!< Pointer to array of Worker threads - static amd::Atomic numWorkerThreads_; //!< Current Worker Threads number - bool acceptingCommands_; +class VirtualCPU : public device::VirtualDevice { + private: + WorkerThread** cores_; //!< Pointer to array of Worker threads + static amd::Atomic numWorkerThreads_; //!< Current Worker Threads number + bool acceptingCommands_; -public: - VirtualCPU(cpu::Device& device); - ~VirtualCPU(); - bool terminate(); + public: + VirtualCPU(cpu::Device& device); + ~VirtualCPU(); + bool terminate(); - WorkerThread* getWorkerThread(size_t id) { return cores_[id]; } + WorkerThread* getWorkerThread(size_t id) { return cores_[id]; } - bool acceptingCommands() const { return acceptingCommands_; } + bool acceptingCommands() const { return acceptingCommands_; } - virtual void submitReadMemory(amd::ReadMemoryCommand& command); - virtual void submitWriteMemory(amd::WriteMemoryCommand& command); - virtual void submitCopyMemory(amd::CopyMemoryCommand& command); - virtual void submitMapMemory(amd::MapMemoryCommand& command); - virtual void submitUnmapMemory(amd::UnmapMemoryCommand& 
command); - virtual void submitKernel(amd::NDRangeKernelCommand& command); - virtual void submitNativeFn(amd::NativeFnCommand& command); - virtual void submitMarker(amd::Marker& command); - virtual void submitFillMemory(amd::FillMemoryCommand& command); - virtual void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd) {} - virtual void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& cmd); - virtual void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& cmd); - virtual void submitPerfCounter(amd::PerfCounterCommand& cmd); - virtual void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd); - virtual void submitThreadTrace(amd::ThreadTraceCommand& cmd); - virtual void flush(amd::Command* list = NULL, bool wait = false); - virtual void submitSignal(amd::SignalCommand & cmd); - virtual void submitMakeBuffersResident(amd::MakeBuffersResidentCommand & cmd); - virtual void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd); - virtual void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd); - virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd); - virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd); - virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd); + virtual void submitReadMemory(amd::ReadMemoryCommand& command); + virtual void submitWriteMemory(amd::WriteMemoryCommand& command); + virtual void submitCopyMemory(amd::CopyMemoryCommand& command); + virtual void submitMapMemory(amd::MapMemoryCommand& command); + virtual void submitUnmapMemory(amd::UnmapMemoryCommand& command); + virtual void submitKernel(amd::NDRangeKernelCommand& command); + virtual void submitNativeFn(amd::NativeFnCommand& command); + virtual void submitMarker(amd::Marker& command); + virtual void submitFillMemory(amd::FillMemoryCommand& command); + virtual void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd) {} + virtual void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& cmd); + virtual void 
submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& cmd); + virtual void submitPerfCounter(amd::PerfCounterCommand& cmd); + virtual void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd); + virtual void submitThreadTrace(amd::ThreadTraceCommand& cmd); + virtual void flush(amd::Command* list = NULL, bool wait = false); + virtual void submitSignal(amd::SignalCommand& cmd); + virtual void submitMakeBuffersResident(amd::MakeBuffersResidentCommand& cmd); + virtual void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd); + virtual void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd); + virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd); + virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd); + virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd); - virtual void computeLocalSizes(amd::NDRangeKernelCommand& command, - amd::NDRange& local); + virtual void computeLocalSizes(amd::NDRangeKernelCommand& command, amd::NDRange& local); - static bool fillImage( - amd::Image& image, - address fillMem, - const void* pattern, - const amd::Coord3D& origin, - const amd::Coord3D& region, - size_t rowPitch, - size_t slicePitch, - size_t elementSize); + static bool fillImage(amd::Image& image, address fillMem, const void* pattern, + const amd::Coord3D& origin, const amd::Coord3D& region, size_t rowPitch, + size_t slicePitch, size_t elementSize); }; -} // namespace cpu +} // namespace cpu -#endif // CPUVIRTUAL_HPP_ +#endif // CPUVIRTUAL_HPP_ diff --git a/rocclr/runtime/device/device.cpp b/rocclr/runtime/device/device.cpp index 17e5454f0f..62322d9af7 100644 --- a/rocclr/runtime/device/device.cpp +++ b/rocclr/runtime/device/device.cpp @@ -13,26 +13,26 @@ extern amd::AppProfile* rocCreateAppProfile(); #if defined(WITH_CPU_DEVICE) #include "device/cpu/cpudevice.hpp" -#endif // WITH_CPU_DEVICE +#endif // WITH_CPU_DEVICE #if defined(WITH_PAL_DEVICE) -//namespace pal { +// namespace pal { extern bool PalDeviceLoad(); extern void 
PalDeviceUnload(); //} -#endif // WITH_PAL_DEVICE +#endif // WITH_PAL_DEVICE #if defined(WITH_GPU_DEVICE) extern bool DeviceLoad(); extern void DeviceUnload(); -#endif // WITH_GPU_DEVICE +#endif // WITH_GPU_DEVICE #include "platform/runtime.hpp" #include "platform/program.hpp" #include "thread/monitor.hpp" #include "amdocl/cl_common.hpp" #include "utils/options.hpp" -#include "utils/versions.hpp" // AMD_PLATFORM_INFO +#include "utils/versions.hpp" // AMD_PLATFORM_INFO #if defined(HAVE_BLOWFISH_H) #include "blowfish/oclcrypt.hpp" @@ -53,1924 +53,1694 @@ extern void DeviceUnload(); #include - namespace device { extern const char* BlitSourceCode; } namespace amd { -std::vector *Device::devices_ = NULL; +std::vector* Device::devices_ = NULL; AppProfile Device::appProfile_; amd::Monitor SvmManager::AllocatedLock_("Guards SVM allocation list"); std::map SvmManager::svmBufferMap_; -size_t -SvmManager::size() -{ - amd::ScopedLock lock(AllocatedLock_); - return svmBufferMap_.size(); +size_t SvmManager::size() { + amd::ScopedLock lock(AllocatedLock_); + return svmBufferMap_.size(); } -void -SvmManager::AddSvmBuffer(const void* k, amd::Memory* v) -{ - amd::ScopedLock lock(AllocatedLock_); - svmBufferMap_.insert(std::pair(reinterpret_cast(k), v)); +void SvmManager::AddSvmBuffer(const void* k, amd::Memory* v) { + amd::ScopedLock lock(AllocatedLock_); + svmBufferMap_.insert(std::pair(reinterpret_cast(k), v)); } -void -SvmManager::RemoveSvmBuffer(const void* k) -{ - amd::ScopedLock lock(AllocatedLock_); - svmBufferMap_.erase(reinterpret_cast(k)); +void SvmManager::RemoveSvmBuffer(const void* k) { + amd::ScopedLock lock(AllocatedLock_); + svmBufferMap_.erase(reinterpret_cast(k)); } -amd::Memory* -SvmManager::FindSvmBuffer(const void* k) -{ - amd::ScopedLock lock(AllocatedLock_); - uintptr_t key = reinterpret_cast(k); - std::map::iterator it = svmBufferMap_.upper_bound(key); - if (it == svmBufferMap_.begin()) { - return NULL; - } +amd::Memory* SvmManager::FindSvmBuffer(const void* 
k) { + amd::ScopedLock lock(AllocatedLock_); + uintptr_t key = reinterpret_cast(k); + std::map::iterator it = svmBufferMap_.upper_bound(key); + if (it == svmBufferMap_.begin()) { + return NULL; + } - --it; - amd::Memory* mem = it->second; - if (key >= it->first && key < (it->first + mem->getSize())) { - //the k is in the range - return mem; - } - else { - return NULL; - } + --it; + amd::Memory* mem = it->second; + if (key >= it->first && key < (it->first + mem->getSize())) { + // the k is in the range + return mem; + } else { + return NULL; + } } -Device::BlitProgram::~BlitProgram() -{ - if (program_ != NULL) { - program_->release(); - } +Device::BlitProgram::~BlitProgram() { + if (program_ != NULL) { + program_->release(); + } } -bool -Device::BlitProgram::create(amd::Device* device, - const char* extraKernels, const char* extraOptions) -{ - std::vector devices; - devices.push_back(device); - std::string kernels(device::BlitSourceCode); +bool Device::BlitProgram::create(amd::Device* device, const char* extraKernels, + const char* extraOptions) { + std::vector devices; + devices.push_back(device); + std::string kernels(device::BlitSourceCode); - if (extraKernels != NULL) { - kernels += extraKernels; - } + if (extraKernels != NULL) { + kernels += extraKernels; + } - // Create a program with all blit kernels - program_ = new Program(*context_, kernels.c_str()); - if (program_ == NULL) { - return false; - } + // Create a program with all blit kernels + program_ = new Program(*context_, kernels.c_str()); + if (program_ == NULL) { + return false; + } - // Build all kernels - std::string opt = "-cl-internal-kernel " + // Build all kernels + std::string opt = + "-cl-internal-kernel " #if !defined(WITH_LIGHTNING_COMPILER) - "-Wf,--force_disable_spir -fno-lib-no-inline " \ - "-fno-sc-keep-calls " -#endif // !defined(WITH_LIGHTNING_COMPILER) - ; + "-Wf,--force_disable_spir -fno-lib-no-inline " + "-fno-sc-keep-calls " +#endif // !defined(WITH_LIGHTNING_COMPILER) + ; - if 
(extraOptions != NULL) { - opt += extraOptions; - } - if (!GPU_DUMP_BLIT_KERNELS) { - opt += " -fno-enable-dump"; - } - if (CL_SUCCESS != program_->build(devices, opt.c_str(), - NULL, NULL, GPU_DUMP_BLIT_KERNELS)) { - return false; - } + if (extraOptions != NULL) { + opt += extraOptions; + } + if (!GPU_DUMP_BLIT_KERNELS) { + opt += " -fno-enable-dump"; + } + if (CL_SUCCESS != program_->build(devices, opt.c_str(), NULL, NULL, GPU_DUMP_BLIT_KERNELS)) { + return false; + } - return true; + return true; } -bool -Device::init() -{ - assert(!Runtime::initialized() && "initialize only once"); - bool ret = false; - devices_ = NULL; - appProfile_.init(); +bool Device::init() { + assert(!Runtime::initialized() && "initialize only once"); + bool ret = false; + devices_ = NULL; + appProfile_.init(); // IMPORTANT: Note that we are initialiing HSA stack first and then // GPU stack. The order of initialization is signiicant and if changed // amd::Device::registerDevice() must be accordingly modified. #if defined(WITH_HSA_DEVICE) - // Return value of roc::Device::init() - // If returned false, error initializing HSA stack. - // If returned true, either HSA not installed or HSA stack - // successfully initialized. - if (!roc::Device::init() ) { - // abort() commentted because this is the only indication - // that KFD is not installed. - // Ignore the failure and assume KFD is not installed. - //abort(); - } - ret |= roc::NullDevice::init(); -#endif // WITH_HSA_DEVICE + // Return value of roc::Device::init() + // If returned false, error initializing HSA stack. + // If returned true, either HSA not installed or HSA stack + // successfully initialized. + if (!roc::Device::init()) { + // abort() commentted because this is the only indication + // that KFD is not installed. + // Ignore the failure and assume KFD is not installed. 
+ // abort(); + } + ret |= roc::NullDevice::init(); +#endif // WITH_HSA_DEVICE #if defined(WITH_GPU_DEVICE) - if (GPU_ENABLE_PAL != 1) { - ret |= DeviceLoad(); - } -#endif // WITH_GPU_DEVICE + if (GPU_ENABLE_PAL != 1) { + ret |= DeviceLoad(); + } +#endif // WITH_GPU_DEVICE #if defined(WITH_PAL_DEVICE) - if (GPU_ENABLE_PAL != 0) { - ret |= PalDeviceLoad(); - } -#endif // WITH_PAL_DEVICE + if (GPU_ENABLE_PAL != 0) { + ret |= PalDeviceLoad(); + } +#endif // WITH_PAL_DEVICE #if defined(WITH_CPU_DEVICE) - ret |= cpu::Device::init(); -#endif // WITH_CPU_DEVICE - return ret; + ret |= cpu::Device::init(); +#endif // WITH_CPU_DEVICE + return ret; } -void -Device::tearDown() -{ - if (devices_ != NULL) { - for (uint i = 0; i < devices_->size(); ++i) { - delete devices_->at(i); - } - devices_->clear(); - delete devices_; +void Device::tearDown() { + if (devices_ != NULL) { + for (uint i = 0; i < devices_->size(); ++i) { + delete devices_->at(i); } + devices_->clear(); + delete devices_; + } #if defined(WITH_HSA_DEVICE) - roc::Device::tearDown(); -#endif // WITH_HSA_DEVICE + roc::Device::tearDown(); +#endif // WITH_HSA_DEVICE #if defined(WITH_GPU_DEVICE) - if (GPU_ENABLE_PAL != 1) { - DeviceUnload(); - } -#endif // WITH_GPU_DEVICE + if (GPU_ENABLE_PAL != 1) { + DeviceUnload(); + } +#endif // WITH_GPU_DEVICE #if defined(WITH_PAL_DEVICE) - if (GPU_ENABLE_PAL != 0) { - PalDeviceUnload(); - } -#endif // WITH_PAL_DEVICE + if (GPU_ENABLE_PAL != 0) { + PalDeviceUnload(); + } +#endif // WITH_PAL_DEVICE #if defined(WITH_CPU_DEVICE) - cpu::Device::tearDown(); -#endif // WITH_CPU_DEVICE + cpu::Device::tearDown(); +#endif // WITH_CPU_DEVICE } Device::Device(Device* parent) - : settings_(NULL) - , online_(true) - , blitProgram_(NULL) - , hwDebugMgr_(NULL) - , parent_(parent) - , vaCacheAccess_(nullptr) - , vaCacheMap_(nullptr) -{ - memset(&info_, '\0', sizeof(info_)); - if (parent_ != NULL) { - parent_->retain(); - } + : settings_(NULL), + online_(true), + blitProgram_(NULL), + 
hwDebugMgr_(NULL), + parent_(parent), + vaCacheAccess_(nullptr), + vaCacheMap_(nullptr) { + memset(&info_, '\0', sizeof(info_)); + if (parent_ != NULL) { + parent_->retain(); + } } -Device::~Device() -{ - CondLog((vaCacheMap_ != nullptr) && - (vaCacheMap_->size() != 0), "Application didn't unmap all host memory!"); - delete vaCacheMap_; - delete vaCacheAccess_; +Device::~Device() { + CondLog((vaCacheMap_ != nullptr) && (vaCacheMap_->size() != 0), + "Application didn't unmap all host memory!"); + delete vaCacheMap_; + delete vaCacheAccess_; - // Destroy device settings - if (settings_ != NULL) { - delete settings_; - } + // Destroy device settings + if (settings_ != NULL) { + delete settings_; + } - if (parent_ != NULL) { - parent_->release(); - } - else { - if (info_.extensions_ != NULL) { - delete [] info_.extensions_; - } + if (parent_ != NULL) { + parent_->release(); + } else { + if (info_.extensions_ != NULL) { + delete[] info_.extensions_; } + } - if (info_.partitionCreateInfo_.type_.byCounts_ && - info_.partitionCreateInfo_.byCounts_.countsList_ != NULL) { - delete [] info_.partitionCreateInfo_.byCounts_.countsList_; - } + if (info_.partitionCreateInfo_.type_.byCounts_ && + info_.partitionCreateInfo_.byCounts_.countsList_ != NULL) { + delete[] info_.partitionCreateInfo_.byCounts_.countsList_; + } } -bool -Device::create() -{ - vaCacheAccess_ = new amd::Monitor("VA Cache Ops Lock", true); - if (NULL == vaCacheAccess_) { - return false; - } - vaCacheMap_ = new std::map(); - if (NULL == vaCacheMap_) { - return false; - } - return true; -} - -bool -Device::isAncestor(const Device* sub) const -{ - for (const Device* d = sub->parent_; d != NULL; d = d->parent_) { - if (d == this) { - return true; - } - } +bool Device::create() { + vaCacheAccess_ = new amd::Monitor("VA Cache Ops Lock", true); + if (NULL == vaCacheAccess_) { return false; + } + vaCacheMap_ = new std::map(); + if (NULL == vaCacheMap_) { + return false; + } + return true; } -void 
-Device::registerDevice() -{ - assert(Runtime::singleThreaded() && "this is not thread-safe"); - - static bool defaultIsAssigned = false; - - if (devices_ == NULL) { - devices_ = new std::vector; +bool Device::isAncestor(const Device* sub) const { + for (const Device* d = sub->parent_; d != NULL; d = d->parent_) { + if (d == this) { + return true; } - - if (info_.available_) { - if (!defaultIsAssigned) { - defaultIsAssigned = true; - info_.type_ |= CL_DEVICE_TYPE_DEFAULT; - } - } - devices_->push_back(this); + } + return false; } -void -Device::addVACache(device::Memory* memory) const -{ - // Make sure system memory has direct access - if (memory->isHostMemDirectAccess()) { - // VA cache access must be serialised - amd::ScopedLock lk(*vaCacheAccess_); - void* start = memory->owner()->getHostMem(); - size_t offset; - device::Memory* doubleMap = findMemoryFromVA(start, &offset); +void Device::registerDevice() { + assert(Runtime::singleThreaded() && "this is not thread-safe"); - if (doubleMap == nullptr) { - // Insert the new entry - vaCacheMap_->insert(std::pair - (reinterpret_cast(start), memory)); - } - else { - LogError("Unexpected double map() call from the app!"); - } + static bool defaultIsAssigned = false; + + if (devices_ == NULL) { + devices_ = new std::vector; + } + + if (info_.available_) { + if (!defaultIsAssigned) { + defaultIsAssigned = true; + info_.type_ |= CL_DEVICE_TYPE_DEFAULT; } + } + devices_->push_back(this); } -void -Device::removeVACache(const device::Memory* memory) const -{ - // Make sure system memory has direct access - if (memory->isHostMemDirectAccess() && memory->owner()) { - // VA cache access must be serialised - amd::ScopedLock lk(*vaCacheAccess_); - void* start = memory->owner()->getHostMem(); - vaCacheMap_->erase(reinterpret_cast(start)); - } -} - -device::Memory* -Device::findMemoryFromVA(const void* ptr, size_t* offset) const -{ +void Device::addVACache(device::Memory* memory) const { + // Make sure system memory has direct 
access + if (memory->isHostMemDirectAccess()) { // VA cache access must be serialised amd::ScopedLock lk(*vaCacheAccess_); + void* start = memory->owner()->getHostMem(); + size_t offset; + device::Memory* doubleMap = findMemoryFromVA(start, &offset); - uintptr_t key = reinterpret_cast(ptr); - std::map::iterator it = vaCacheMap_->upper_bound( - reinterpret_cast(ptr)); - if (it == vaCacheMap_->begin()) { - return nullptr; + if (doubleMap == nullptr) { + // Insert the new entry + vaCacheMap_->insert( + std::pair(reinterpret_cast(start), memory)); + } else { + LogError("Unexpected double map() call from the app!"); } + } +} - --it; - device::Memory* mem = it->second; - if (key >= it->first && key < (it->first + mem->size())) { - // ptr is in the range - *offset = key - it->first; - return mem; - } +void Device::removeVACache(const device::Memory* memory) const { + // Make sure system memory has direct access + if (memory->isHostMemDirectAccess() && memory->owner()) { + // VA cache access must be serialised + amd::ScopedLock lk(*vaCacheAccess_); + void* start = memory->owner()->getHostMem(); + vaCacheMap_->erase(reinterpret_cast(start)); + } +} + +device::Memory* Device::findMemoryFromVA(const void* ptr, size_t* offset) const { + // VA cache access must be serialised + amd::ScopedLock lk(*vaCacheAccess_); + + uintptr_t key = reinterpret_cast(ptr); + std::map::iterator it = + vaCacheMap_->upper_bound(reinterpret_cast(ptr)); + if (it == vaCacheMap_->begin()) { return nullptr; + } + + --it; + device::Memory* mem = it->second; + if (key >= it->first && key < (it->first + mem->size())) { + // ptr is in the range + *offset = key - it->first; + return mem; + } + return nullptr; } bool Device::IsTypeMatching(cl_device_type type, bool offlineDevices) { - if (!(isOnline() || offlineDevices)) { - return false; - } + if (!(isOnline() || offlineDevices)) { + return false; + } - return (info_.type_ & type) != 0; + return (info_.type_ & type) != 0; } -std::vector 
-Device::getDevices(cl_device_type type, bool offlineDevices) -{ - std::vector result; - - if (devices_ == NULL) { - return result; - } - - // Create the list of available devices - for (device_iterator it = devices_->begin(); it != devices_->end(); ++it) { - // Check if the device type is matched - if ((*it)->IsTypeMatching(type, offlineDevices)) { - result.push_back(*it); - } - } +std::vector Device::getDevices(cl_device_type type, bool offlineDevices) { + std::vector result; + if (devices_ == NULL) { return result; + } + + // Create the list of available devices + for (device_iterator it = devices_->begin(); it != devices_->end(); ++it) { + // Check if the device type is matched + if ((*it)->IsTypeMatching(type, offlineDevices)) { + result.push_back(*it); + } + } + + return result; } -size_t -Device::numDevices(cl_device_type type, bool offlineDevices) -{ - size_t result = 0; +size_t Device::numDevices(cl_device_type type, bool offlineDevices) { + size_t result = 0; - if (devices_ == NULL) { - return 0; + if (devices_ == NULL) { + return 0; + } + + for (device_iterator it = devices_->begin(); it != devices_->end(); ++it) { + // Check if the device type is matched + if ((*it)->IsTypeMatching(type, offlineDevices)) { + ++result; } + } - for (device_iterator it = devices_->begin(); it != devices_->end(); ++it) { - // Check if the device type is matched - if ((*it)->IsTypeMatching(type, offlineDevices)) { - ++result; - } - } - - return result; + return result; } -bool -Device::getDeviceIDs( - cl_device_type deviceType, - cl_uint numEntries, - cl_device_id* devices, - cl_uint* numDevices, - bool offlineDevices) -{ - if (numDevices != NULL && devices == NULL) { - *numDevices = - (cl_uint)amd::Device::numDevices(deviceType, offlineDevices); - return (*numDevices > 0) ? 
true : false; - } - assert(devices != NULL && "check the code above"); +bool Device::getDeviceIDs(cl_device_type deviceType, cl_uint numEntries, cl_device_id* devices, + cl_uint* numDevices, bool offlineDevices) { + if (numDevices != NULL && devices == NULL) { + *numDevices = (cl_uint)amd::Device::numDevices(deviceType, offlineDevices); + return (*numDevices > 0) ? true : false; + } + assert(devices != NULL && "check the code above"); - std::vector ret = - amd::Device::getDevices(deviceType, offlineDevices); - if (ret.size() == 0) { - *not_null(numDevices) = 0; - return false; - } + std::vector ret = amd::Device::getDevices(deviceType, offlineDevices); + if (ret.size() == 0) { + *not_null(numDevices) = 0; + return false; + } - std::vector::iterator it = ret.begin(); - cl_uint count = std::min(numEntries, (cl_uint)ret.size()); + std::vector::iterator it = ret.begin(); + cl_uint count = std::min(numEntries, (cl_uint)ret.size()); - while (count--) { - *devices++ = as_cl(*it++); - --numEntries; - } - while (numEntries--) { - *devices++ = (cl_device_id) 0; - } + while (count--) { + *devices++ = as_cl(*it++); + --numEntries; + } + while (numEntries--) { + *devices++ = (cl_device_id)0; + } - *not_null(numDevices) = (cl_uint)ret.size(); - return true; + *not_null(numDevices) = (cl_uint)ret.size(); + return true; } -char* -Device::getExtensionString() -{ - std::stringstream extStream; - size_t size; - char* result = NULL; +char* Device::getExtensionString() { + std::stringstream extStream; + size_t size; + char* result = NULL; - // Generate the extension string - for (uint i = 0; i < ClExtTotal; ++i) { - if (settings().checkExtension(i)) { - extStream << OclExtensionsString[i]; - } + // Generate the extension string + for (uint i = 0; i < ClExtTotal; ++i) { + if (settings().checkExtension(i)) { + extStream << OclExtensionsString[i]; } + } - size = extStream.str().size() + 1; + size = extStream.str().size() + 1; - // Create a single string with all extensions - result = new 
char[size]; - if (result != NULL) { - memcpy(result, extStream.str().data(), (size - 1)); - result[size - 1] = 0; - } + // Create a single string with all extensions + result = new char[size]; + if (result != NULL) { + memcpy(result, extStream.str().data(), (size - 1)); + result[size - 1] = 0; + } - return result; + return result; } -void* -Device::allocMapTarget( - amd::Memory& mem, - const amd::Coord3D& origin, - const amd::Coord3D& region, - uint mapFlags, - size_t* rowPitch, - size_t* slicePitch) -{ - // Translate memory references - device::Memory* devMem = mem.getDeviceMemory(*this); - if (devMem == NULL) { - LogError("allocMapTarget failed. Can't allocate video memory"); - return NULL; - } +void* Device::allocMapTarget(amd::Memory& mem, const amd::Coord3D& origin, + const amd::Coord3D& region, uint mapFlags, size_t* rowPitch, + size_t* slicePitch) { + // Translate memory references + device::Memory* devMem = mem.getDeviceMemory(*this); + if (devMem == NULL) { + LogError("allocMapTarget failed. 
Can't allocate video memory"); + return NULL; + } - // Pass request over to memory - return devMem->allocMapTarget(origin, region, mapFlags, rowPitch, slicePitch); + // Pass request over to memory + return devMem->allocMapTarget(origin, region, mapFlags, rowPitch, slicePitch); } #if defined(WITH_LIGHTNING_COMPILER) -CacheCompilation::CacheCompilation(std::string targetStr, std::string postfix, bool enableCache, bool resetCache) - : codeCache_ ( targetStr, 0, AMD_PLATFORM_BUILD_NUMBER, postfix ) - , isCodeCacheEnabled_ (enableCache) -{ - if (resetCache) { - // clean up the cached data of the target device - StringCache emptyCache(targetStr, 0, 0, postfix); - } +CacheCompilation::CacheCompilation(std::string targetStr, std::string postfix, bool enableCache, + bool resetCache) + : codeCache_(targetStr, 0, AMD_PLATFORM_BUILD_NUMBER, postfix), + isCodeCacheEnabled_(enableCache) { + if (resetCache) { + // clean up the cached data of the target device + StringCache emptyCache(targetStr, 0, 0, postfix); + } } -bool -CacheCompilation::linkLLVMBitcode(amd::opencl_driver::Compiler* C, - std::vector& inputs, - amd::opencl_driver::Buffer* output, - std::vector& options, - std::string& buildLog) -{ - std::string cacheOpt; - cacheOpt = std::accumulate(begin(options), end(options), cacheOpt); - - bool ret = false; - bool cachedCodeExist = false; - std::vector bcSet; - if (isCodeCacheEnabled_) { - using namespace amd::opencl_driver; - - for (auto &input : inputs) { - assert(input->Type() == DT_LLVM_BC); - - BufferReference* bc = reinterpret_cast(input); - StringCache::CachedData cachedData = { bc->Ptr(), bc->Size() }; - bcSet.push_back(cachedData); - } - - std::string dstData = ""; - if (codeCache_.getCacheEntry(isCodeCacheEnabled_, bcSet.data(), bcSet.size(), - cacheOpt, dstData, "Link LLVM Bitcodes")) { - std::copy(dstData.begin(), dstData.end(), std::back_inserter(output->Buf())); - cachedCodeExist = true; - } - } - - if (!cachedCodeExist) { - if (!C->LinkLLVMBitcode(inputs, 
output, options)) { - return false; - } - - if (isCodeCacheEnabled_) { - std::string dstData(output->Buf().data(), output->Buf().size()); - if (!codeCache_.makeCacheEntry(bcSet.data(), bcSet.size(), cacheOpt, dstData)) { - buildLog += "Warning: Failed to caching codes.\n"; - LogWarning("Caching codes failed!"); - } - } - } - - return true; -} - -bool -CacheCompilation::compileToLLVMBitcode(amd::opencl_driver::Compiler* C, +bool CacheCompilation::linkLLVMBitcode(amd::opencl_driver::Compiler* C, std::vector& inputs, amd::opencl_driver::Buffer* output, - std::vector& options, - std::string& buildLog) -{ - std::string cacheOpt; - for (uint i=0; i < options.size(); i++) { - // skip the header file option, which is associated with the -cl-std= option - if (options[i].compare("-include-pch") == 0) { - i++; - continue; - } - cacheOpt += options[i]; + std::vector& options, std::string& buildLog) { + std::string cacheOpt; + cacheOpt = std::accumulate(begin(options), end(options), cacheOpt); + + bool ret = false; + bool cachedCodeExist = false; + std::vector bcSet; + if (isCodeCacheEnabled_) { + using namespace amd::opencl_driver; + + for (auto& input : inputs) { + assert(input->Type() == DT_LLVM_BC); + + BufferReference* bc = reinterpret_cast(input); + StringCache::CachedData cachedData = {bc->Ptr(), bc->Size()}; + bcSet.push_back(cachedData); + } + + std::string dstData = ""; + if (codeCache_.getCacheEntry(isCodeCacheEnabled_, bcSet.data(), bcSet.size(), cacheOpt, dstData, + "Link LLVM Bitcodes")) { + std::copy(dstData.begin(), dstData.end(), std::back_inserter(output->Buf())); + cachedCodeExist = true; + } + } + + if (!cachedCodeExist) { + if (!C->LinkLLVMBitcode(inputs, output, options)) { + return false; } - bool ret = false; - bool cachedCodeExist = false; - std::vector bcSet; if (isCodeCacheEnabled_) { - using namespace amd::opencl_driver; - - bool checkCache = true; - for (auto &input : inputs) { - if (input->Type() == DT_CL) { - BufferReference* bc = 
reinterpret_cast(input); - StringCache::CachedData cachedData = { bc->Ptr(), bc->Size() }; - bcSet.push_back(cachedData); - } - else if (input->Type() == DT_CL_HEADER) { - FileReference* bcFile = reinterpret_cast(input); - std::string bc; - bcFile->ReadToString(bc); - StringCache::CachedData cachedData = { bc.c_str(), bc.size() }; - bcSet.push_back(cachedData); - } - else { - buildLog += "Error: unsupported bitcode type for checking cache.\n"; - checkCache = false; - break; - } - } - - std::string dstData = ""; - if (checkCache && - codeCache_.getCacheEntry(isCodeCacheEnabled_, bcSet.data(), bcSet.size(), - cacheOpt, dstData, "Compile to LLVM Bitcodes")) { - std::copy(dstData.begin(), dstData.end(), std::back_inserter(output->Buf())); - cachedCodeExist = true; - } + std::string dstData(output->Buf().data(), output->Buf().size()); + if (!codeCache_.makeCacheEntry(bcSet.data(), bcSet.size(), cacheOpt, dstData)) { + buildLog += "Warning: Failed to caching codes.\n"; + LogWarning("Caching codes failed!"); + } } + } - if (!cachedCodeExist) { - if (!C->CompileToLLVMBitcode(inputs, output, options)) { - return false; - } - - if (isCodeCacheEnabled_) { - std::string dstData(output->Buf().data(), output->Buf().size()); - if (!codeCache_.makeCacheEntry(bcSet.data(), bcSet.size(), cacheOpt, dstData)) { - buildLog += "Warning: Failed to caching codes.\n"; - LogWarning("Caching codes failed!"); - } - } - } - - return true; + return true; } -bool -CacheCompilation::compileAndLinkExecutable(amd::opencl_driver::Compiler* C, - std::vector& inputs, - amd::opencl_driver::Buffer* output, - std::vector& options, - std::string& buildLog) -{ - std::string cacheOpt; - cacheOpt = std::accumulate(begin(options), end(options), cacheOpt); +bool CacheCompilation::compileToLLVMBitcode(amd::opencl_driver::Compiler* C, + std::vector& inputs, + amd::opencl_driver::Buffer* output, + std::vector& options, + std::string& buildLog) { + std::string cacheOpt; + for (uint i = 0; i < options.size(); i++) 
{ + // skip the header file option, which is associated with the -cl-std= option + if (options[i].compare("-include-pch") == 0) { + i++; + continue; + } + cacheOpt += options[i]; + } + + bool ret = false; + bool cachedCodeExist = false; + std::vector bcSet; + if (isCodeCacheEnabled_) { + using namespace amd::opencl_driver; + + bool checkCache = true; + for (auto& input : inputs) { + if (input->Type() == DT_CL) { + BufferReference* bc = reinterpret_cast(input); + StringCache::CachedData cachedData = {bc->Ptr(), bc->Size()}; + bcSet.push_back(cachedData); + } else if (input->Type() == DT_CL_HEADER) { + FileReference* bcFile = reinterpret_cast(input); + std::string bc; + bcFile->ReadToString(bc); + StringCache::CachedData cachedData = {bc.c_str(), bc.size()}; + bcSet.push_back(cachedData); + } else { + buildLog += "Error: unsupported bitcode type for checking cache.\n"; + checkCache = false; + break; + } + } + + std::string dstData = ""; + if (checkCache && + codeCache_.getCacheEntry(isCodeCacheEnabled_, bcSet.data(), bcSet.size(), cacheOpt, dstData, + "Compile to LLVM Bitcodes")) { + std::copy(dstData.begin(), dstData.end(), std::back_inserter(output->Buf())); + cachedCodeExist = true; + } + } + + if (!cachedCodeExist) { + if (!C->CompileToLLVMBitcode(inputs, output, options)) { + return false; + } - bool ret = false; - bool cachedCodeExist = false; - std::vector bcSet; if (isCodeCacheEnabled_) { - for (auto &input : inputs) { - assert(input->Type() == amd::opencl_driver::DT_LLVM_BC); + std::string dstData(output->Buf().data(), output->Buf().size()); + if (!codeCache_.makeCacheEntry(bcSet.data(), bcSet.size(), cacheOpt, dstData)) { + buildLog += "Warning: Failed to caching codes.\n"; + LogWarning("Caching codes failed!"); + } + } + } - amd::opencl_driver::Buffer* bc = (amd::opencl_driver::Buffer*) input; - StringCache::CachedData cachedData = { bc->Buf().data(), bc->Size() }; - bcSet.push_back(cachedData); - } + return true; +} - std::string dstData = ""; - if 
(codeCache_.getCacheEntry(isCodeCacheEnabled_, bcSet.data(), bcSet.size(), - cacheOpt, dstData, "Compile and Link Executable")) { - std::copy(dstData.begin(), dstData.end(), std::back_inserter(output->Buf())); - cachedCodeExist = true; - } +bool CacheCompilation::compileAndLinkExecutable(amd::opencl_driver::Compiler* C, + std::vector& inputs, + amd::opencl_driver::Buffer* output, + std::vector& options, + std::string& buildLog) { + std::string cacheOpt; + cacheOpt = std::accumulate(begin(options), end(options), cacheOpt); + + bool ret = false; + bool cachedCodeExist = false; + std::vector bcSet; + if (isCodeCacheEnabled_) { + for (auto& input : inputs) { + assert(input->Type() == amd::opencl_driver::DT_LLVM_BC); + + amd::opencl_driver::Buffer* bc = (amd::opencl_driver::Buffer*)input; + StringCache::CachedData cachedData = {bc->Buf().data(), bc->Size()}; + bcSet.push_back(cachedData); } - if (!cachedCodeExist) { - if (!C->CompileAndLinkExecutable(inputs, output, options)) { - return false; - } + std::string dstData = ""; + if (codeCache_.getCacheEntry(isCodeCacheEnabled_, bcSet.data(), bcSet.size(), cacheOpt, dstData, + "Compile and Link Executable")) { + std::copy(dstData.begin(), dstData.end(), std::back_inserter(output->Buf())); + cachedCodeExist = true; + } + } - if (isCodeCacheEnabled_) { - std::string dstData(output->Buf().data(), output->Buf().size()); - if (!codeCache_.makeCacheEntry(bcSet.data(), bcSet.size(), cacheOpt, dstData)) { - buildLog += "Warning: Failed to caching codes.\n"; - LogWarning("Caching codes failed!"); - } - } + if (!cachedCodeExist) { + if (!C->CompileAndLinkExecutable(inputs, output, options)) { + return false; } - return true; + if (isCodeCacheEnabled_) { + std::string dstData(output->Buf().data(), output->Buf().size()); + if (!codeCache_.makeCacheEntry(bcSet.data(), bcSet.size(), cacheOpt, dstData)) { + buildLog += "Warning: Failed to caching codes.\n"; + LogWarning("Caching codes failed!"); + } + } + } + + return true; } #endif -} 
// namespace amd +} // namespace amd namespace device { -Settings::Settings() -{ - assert((ClExtTotal < (8 * sizeof(extensions_))) && "Too many extensions!"); - extensions_ = 0; - partialDispatch_ = false; - supportRA_ = true; - customHostAllocator_ = false; - waitCommand_ = AMD_OCL_WAIT_COMMAND; - supportDepthsRGB_ = false; - enableHwDebug_ = false; - commandQueues_ = 200; //!< Field value set to maximum number - //!< concurrent Virtual GPUs for default +Settings::Settings() { + assert((ClExtTotal < (8 * sizeof(extensions_))) && "Too many extensions!"); + extensions_ = 0; + partialDispatch_ = false; + supportRA_ = true; + customHostAllocator_ = false; + waitCommand_ = AMD_OCL_WAIT_COMMAND; + supportDepthsRGB_ = false; + enableHwDebug_ = false; + commandQueues_ = 200; //!< Field value set to maximum number + //!< concurrent Virtual GPUs for default } -bool -Kernel::createSignature(const parameters_t& params) -{ - std::stringstream attribs; - if (workGroupInfo_.compileSize_[0] != 0) { - attribs << "reqd_work_group_size("; - for (size_t i = 0; i < 3; ++i) { - if (i != 0) { - attribs << ","; - } +bool Kernel::createSignature(const parameters_t& params) { + std::stringstream attribs; + if (workGroupInfo_.compileSize_[0] != 0) { + attribs << "reqd_work_group_size("; + for (size_t i = 0; i < 3; ++i) { + if (i != 0) { + attribs << ","; + } - attribs << workGroupInfo_.compileSize_[i]; - } - attribs << ")"; + attribs << workGroupInfo_.compileSize_[i]; } - if (workGroupInfo_.compileSizeHint_[0] != 0) { - attribs << " work_group_size_hint("; - for (size_t i = 0; i < 3; ++i) { - if (i != 0) { - attribs << ","; - } + attribs << ")"; + } + if (workGroupInfo_.compileSizeHint_[0] != 0) { + attribs << " work_group_size_hint("; + for (size_t i = 0; i < 3; ++i) { + if (i != 0) { + attribs << ","; + } - attribs << workGroupInfo_.compileSizeHint_[i]; - } - attribs << ")"; + attribs << workGroupInfo_.compileSizeHint_[i]; } + attribs << ")"; + } - if 
(!workGroupInfo_.compileVecTypeHint_.empty()) { - attribs << " vec_type_hint(" - << workGroupInfo_.compileVecTypeHint_ - << ")"; - } + if (!workGroupInfo_.compileVecTypeHint_.empty()) { + attribs << " vec_type_hint(" << workGroupInfo_.compileVecTypeHint_ << ")"; + } - // Destroy old signature if it was allocated before - // (offline devices path) - delete signature_; - signature_ = new amd::KernelSignature(params, attribs.str()); - if (NULL != signature_) { - return true; - } - return false; + // Destroy old signature if it was allocated before + // (offline devices path) + delete signature_; + signature_ = new amd::KernelSignature(params, attribs.str()); + if (NULL != signature_) { + return true; + } + return false; } -Kernel::~Kernel() -{ - delete signature_; +Kernel::~Kernel() { delete signature_; } + +std::string Kernel::openclMangledName(const std::string& name) { + const oclBIFSymbolStruct* bifSym = findBIF30SymStruct(symOpenclKernel); + assert(bifSym && "symbol not found"); + return std::string("&") + bifSym->str[bif::PRE] + name + bifSym->str[bif::POST]; } -std::string -Kernel::openclMangledName(const std::string& name) -{ - const oclBIFSymbolStruct* bifSym = findBIF30SymStruct(symOpenclKernel); - assert(bifSym && "symbol not found"); - return std::string("&") + bifSym->str[bif::PRE] + name + bifSym->str[bif::POST]; -} +void Memory::saveMapInfo(const void* mapAddress, const amd::Coord3D origin, + const amd::Coord3D region, uint mapFlags, bool entire, + amd::Image* baseMip) { + // Map/Unmap must be serialized. + amd::ScopedLock lock(owner()->lockMemoryOps()); -void -Memory::saveMapInfo( - const void* mapAddress, - const amd::Coord3D origin, - const amd::Coord3D region, - uint mapFlags, - bool entire, - amd::Image* baseMip) -{ - // Map/Unmap must be serialized. 
- amd::ScopedLock lock(owner()->lockMemoryOps()); + WriteMapInfo info = {}; + WriteMapInfo* pInfo = &info; + auto it = writeMapInfo_.find(mapAddress); + if (it != writeMapInfo_.end()) { + LogWarning("Double map of the same or overlapped region!"); + pInfo = &it->second; + } - WriteMapInfo info = {}; - WriteMapInfo* pInfo = &info; - auto it = writeMapInfo_.find(mapAddress); - if (it != writeMapInfo_.end()) { - LogWarning("Double map of the same or overlapped region!"); - pInfo = &it->second; - } - - if (mapFlags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION)) { - pInfo->origin_ = origin; - pInfo->region_ = region; - pInfo->entire_ = entire; - pInfo->unmapWrite_ = true; - } - if (mapFlags & CL_MAP_READ) { - pInfo->unmapRead_ = true; - } - pInfo->baseMip_ = baseMip; - - // Insert into the map if it's the first region - if (++pInfo->count_ == 1) { - writeMapInfo_.insert(std::pair(mapAddress, info)); - } + if (mapFlags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION)) { + pInfo->origin_ = origin; + pInfo->region_ = region; + pInfo->entire_ = entire; + pInfo->unmapWrite_ = true; + } + if (mapFlags & CL_MAP_READ) { + pInfo->unmapRead_ = true; + } + pInfo->baseMip_ = baseMip; + // Insert into the map if it's the first region + if (++pInfo->count_ == 1) { + writeMapInfo_.insert(std::pair(mapAddress, info)); + } } Program::Program(amd::Device& device) - : device_(device) - , type_(TYPE_NONE) - , clBinary_(NULL) - , llvmBinary_() - , elfSectionType_(amd::OclElf::LLVMIR) - , compileOptions_() - , linkOptions_() - , lastBuildOptionsArg_() - , buildStatus_(CL_BUILD_NONE) - , buildError_(CL_SUCCESS) - , globalVariableTotalSize_(0) - , programOptions(NULL) -{ } + : device_(device), + type_(TYPE_NONE), + clBinary_(NULL), + llvmBinary_(), + elfSectionType_(amd::OclElf::LLVMIR), + compileOptions_(), + linkOptions_(), + lastBuildOptionsArg_(), + buildStatus_(CL_BUILD_NONE), + buildError_(CL_SUCCESS), + globalVariableTotalSize_(0), + programOptions(NULL) {} -Program::~Program() -{ 
- clear(); +Program::~Program() { clear(); } + +void Program::clear() { + // Destroy all device kernels + kernels_t::const_iterator it; + for (it = kernels_.begin(); it != kernels_.end(); ++it) { + delete it->second; + } + kernels_.clear(); } -void -Program::clear() -{ - // Destroy all device kernels - kernels_t::const_iterator it; - for (it = kernels_.begin(); it != kernels_.end(); ++it) { - delete it->second; - } - kernels_.clear(); +bool Program::initBuild(amd::option::Options* options) { + programOptions = options; + + if (options->oVariables->DumpFlags > 0) { + static amd::Atomic build_num = 0; + options->setBuildNo(build_num++); + } + buildLog_.clear(); + if (!initClBinary()) { + return false; + } + return true; } -bool -Program::initBuild(amd::option::Options* options) -{ - programOptions = options; +bool Program::finiBuild(bool isBuildGood) { return true; } - if (options->oVariables->DumpFlags > 0) { - static amd::Atomic build_num = 0; - options->setBuildNo(build_num++); +cl_int Program::compile(const std::string& sourceCode, + const std::vector& headers, + const char** headerIncludeNames, const char* origOptions, + amd::option::Options* options) { + uint64_t start_time = 0; + if (options->oVariables->EnableBuildTiming) { + buildLog_ = "\nStart timing major build components.....\n\n"; + start_time = amd::Os::timeNanos(); + } + + lastBuildOptionsArg_ = origOptions ? 
origOptions : ""; + if (options) { + compileOptions_ = options->origOptionStr; + } + + buildStatus_ = CL_BUILD_IN_PROGRESS; + if (!initBuild(options)) { + buildStatus_ = CL_BUILD_ERROR; + if (buildLog_.empty()) { + buildLog_ = "Internal error: Compilation init failed."; } - buildLog_.clear(); - if (!initClBinary()) { - return false; + } + + if (options->oVariables->FP32RoundDivideSqrt && + !(device().info().singleFPConfig_ & CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT)) { + buildStatus_ = CL_BUILD_ERROR; + buildLog_ += + "Error: -cl-fp32-correctly-rounded-divide-sqrt " + "specified without device support"; + } + + // Compile the source code if any + if ((buildStatus_ == CL_BUILD_IN_PROGRESS) && !sourceCode.empty() && + !compileImpl(sourceCode, headers, headerIncludeNames, options)) { + buildStatus_ = CL_BUILD_ERROR; + if (buildLog_.empty()) { + buildLog_ = "Internal error: Compilation failed."; } - return true; + } + + setType(TYPE_COMPILED); + + if ((buildStatus_ == CL_BUILD_IN_PROGRESS) && !createBinary(options)) { + buildLog_ += "Internal Error: creating OpenCL binary failed!\n"; + } + + if (!finiBuild(buildStatus_ == CL_BUILD_IN_PROGRESS)) { + buildStatus_ = CL_BUILD_ERROR; + if (buildLog_.empty()) { + buildLog_ = "Internal error: Compilation fini failed."; + } + } + + if (buildStatus_ == CL_BUILD_IN_PROGRESS) { + buildStatus_ = CL_BUILD_SUCCESS; + } else { + buildError_ = CL_COMPILE_PROGRAM_FAILURE; + } + + if (options->oVariables->EnableBuildTiming) { + std::stringstream tmp_ss; + tmp_ss << "\nTotal Compile Time: " << (amd::Os::timeNanos() - start_time) / 1000ULL << " us\n"; + buildLog_ += tmp_ss.str(); + } + + if (options->oVariables->BuildLog && !buildLog_.empty()) { + if (strcmp(options->oVariables->BuildLog, "stderr") == 0) { + fprintf(stderr, "%s\n", options->optionsLog().c_str()); + fprintf(stderr, "%s\n", buildLog_.c_str()); + } else if (strcmp(options->oVariables->BuildLog, "stdout") == 0) { + printf("%s\n", options->optionsLog().c_str()); + printf("%s\n", 
buildLog_.c_str()); + } else { + std::fstream f; + std::stringstream tmp_ss; + std::string logs = options->optionsLog() + buildLog_; + tmp_ss << options->oVariables->BuildLog << "." << options->getBuildNo(); + f.open(tmp_ss.str().c_str(), (std::fstream::out | std::fstream::binary)); + f.write(logs.data(), logs.size()); + f.close(); + } + LogError(buildLog_.c_str()); + } + + return buildError(); } -bool -Program::finiBuild(bool isBuildGood) -{ - return true; +cl_int Program::link(const std::vector& inputPrograms, const char* origLinkOptions, + amd::option::Options* linkOptions) { + lastBuildOptionsArg_ = origLinkOptions ? origLinkOptions : ""; + if (linkOptions) { + linkOptions_ = linkOptions->origOptionStr; + } + + buildStatus_ = CL_BUILD_IN_PROGRESS; + + amd::option::Options options; + if (!getCompileOptionsAtLinking(inputPrograms, linkOptions)) { + buildStatus_ = CL_BUILD_ERROR; + if (buildLog_.empty()) { + buildLog_ += "Internal error: Get compile options failed."; + } + } else { + if (!amd::option::parseAllOptions(compileOptions_, options)) { + buildStatus_ = CL_BUILD_ERROR; + buildLog_ += options.optionsLog(); + LogError("Parsing compile options failed."); + } + } + + uint64_t start_time = 0; + if (options.oVariables->EnableBuildTiming) { + buildLog_ = "\nStart timing major build components.....\n\n"; + start_time = amd::Os::timeNanos(); + } + + // initBuild() will clear buildLog_, so store it in a temporary variable + std::string tmpBuildLog = buildLog_; + + if ((buildStatus_ == CL_BUILD_IN_PROGRESS) && !initBuild(&options)) { + buildStatus_ = CL_BUILD_ERROR; + if (buildLog_.empty()) { + buildLog_ += "Internal error: Compilation init failed."; + } + } + + buildLog_ += tmpBuildLog; + + if (options.oVariables->FP32RoundDivideSqrt && + !(device().info().singleFPConfig_ & CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT)) { + buildStatus_ = CL_BUILD_ERROR; + buildLog_ += + "Error: -cl-fp32-correctly-rounded-divide-sqrt " + "specified without device support"; + } + + bool 
createLibrary = linkOptions ? linkOptions->oVariables->clCreateLibrary : false; + if ((buildStatus_ == CL_BUILD_IN_PROGRESS) && !linkImpl(inputPrograms, &options, createLibrary)) { + buildStatus_ = CL_BUILD_ERROR; + if (buildLog_.empty()) { + buildLog_ += "Internal error: Link failed.\n"; + buildLog_ += "Make sure the system setup is correct."; + } + } + + if (!finiBuild(buildStatus_ == CL_BUILD_IN_PROGRESS)) { + buildStatus_ = CL_BUILD_ERROR; + if (buildLog_.empty()) { + buildLog_ = "Internal error: Compilation fini failed."; + } + } + + if (buildStatus_ == CL_BUILD_IN_PROGRESS) { + buildStatus_ = CL_BUILD_SUCCESS; + } else { + buildError_ = CL_LINK_PROGRAM_FAILURE; + } + + if (options.oVariables->EnableBuildTiming) { + std::stringstream tmp_ss; + tmp_ss << "\nTotal Link Time: " << (amd::Os::timeNanos() - start_time) / 1000ULL << " us\n"; + buildLog_ += tmp_ss.str(); + } + + if (options.oVariables->BuildLog && !buildLog_.empty()) { + if (strcmp(options.oVariables->BuildLog, "stderr") == 0) { + fprintf(stderr, "%s\n", options.optionsLog().c_str()); + fprintf(stderr, "%s\n", buildLog_.c_str()); + } else if (strcmp(options.oVariables->BuildLog, "stdout") == 0) { + printf("%s\n", options.optionsLog().c_str()); + printf("%s\n", buildLog_.c_str()); + } else { + std::fstream f; + std::stringstream tmp_ss; + std::string logs = options.optionsLog() + buildLog_; + tmp_ss << options.oVariables->BuildLog << "." 
<< options.getBuildNo(); + f.open(tmp_ss.str().c_str(), (std::fstream::out | std::fstream::binary)); + f.write(logs.data(), logs.size()); + f.close(); + } + } + + if (!buildLog_.empty()) { + LogError(buildLog_.c_str()); + } + + return buildError(); } -cl_int -Program::compile(const std::string& sourceCode, - const std::vector& headers, - const char** headerIncludeNames, - const char* origOptions, - amd::option::Options* options) -{ - uint64_t start_time = 0; - if (options->oVariables->EnableBuildTiming) { - buildLog_ = "\nStart timing major build components.....\n\n"; - start_time = amd::Os::timeNanos(); - } +cl_int Program::build(const std::string& sourceCode, const char* origOptions, + amd::option::Options* options) { + uint64_t start_time = 0; + if (options->oVariables->EnableBuildTiming) { + buildLog_ = "\nStart timing major build components.....\n\n"; + start_time = amd::Os::timeNanos(); + } - lastBuildOptionsArg_ = origOptions ? origOptions : ""; - if (options) { - compileOptions_ = options->origOptionStr; - } + lastBuildOptionsArg_ = origOptions ? 
origOptions : ""; + if (options) { + compileOptions_ = options->origOptionStr; + } - buildStatus_ = CL_BUILD_IN_PROGRESS; - if (!initBuild(options)) { - buildStatus_ = CL_BUILD_ERROR; - if (buildLog_.empty()) { - buildLog_ = "Internal error: Compilation init failed."; - } + buildStatus_ = CL_BUILD_IN_PROGRESS; + if (!initBuild(options)) { + buildStatus_ = CL_BUILD_ERROR; + if (buildLog_.empty()) { + buildLog_ = "Internal error: Compilation init failed."; } + } - if (options->oVariables->FP32RoundDivideSqrt && - !(device().info().singleFPConfig_ - & CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT)) { - buildStatus_ = CL_BUILD_ERROR; - buildLog_ += "Error: -cl-fp32-correctly-rounded-divide-sqrt "\ - "specified without device support"; + if (options->oVariables->FP32RoundDivideSqrt && + !(device().info().singleFPConfig_ & CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT)) { + buildStatus_ = CL_BUILD_ERROR; + buildLog_ += + "Error: -cl-fp32-correctly-rounded-divide-sqrt " + "specified without device support"; + } + + // Compile the source code if any + std::vector headers; + if ((buildStatus_ == CL_BUILD_IN_PROGRESS) && !sourceCode.empty() && + !compileImpl(sourceCode, headers, NULL, options)) { + buildStatus_ = CL_BUILD_ERROR; + if (buildLog_.empty()) { + buildLog_ = "Internal error: Compilation failed."; } + } - // Compile the source code if any - if ((buildStatus_ == CL_BUILD_IN_PROGRESS) && - !sourceCode.empty() && - !compileImpl(sourceCode, headers, headerIncludeNames, options)) { - buildStatus_ = CL_BUILD_ERROR; - if (buildLog_.empty()) { - buildLog_ = "Internal error: Compilation failed."; - } + if ((buildStatus_ == CL_BUILD_IN_PROGRESS) && !linkImpl(options)) { + buildStatus_ = CL_BUILD_ERROR; + if (buildLog_.empty()) { + buildLog_ += "Internal error: Link failed.\n"; + buildLog_ += "Make sure the system setup is correct."; } + } - setType(TYPE_COMPILED); - - if ((buildStatus_ == CL_BUILD_IN_PROGRESS) && - !createBinary(options)) { - buildLog_ += "Internal Error: creating OpenCL binary 
failed!\n"; + if (!finiBuild(buildStatus_ == CL_BUILD_IN_PROGRESS)) { + buildStatus_ = CL_BUILD_ERROR; + if (buildLog_.empty()) { + buildLog_ = "Internal error: Compilation fini failed."; } + } - if (!finiBuild(buildStatus_ == CL_BUILD_IN_PROGRESS)) { - buildStatus_ = CL_BUILD_ERROR; - if (buildLog_.empty()) { - buildLog_ = "Internal error: Compilation fini failed."; - } - } + if (buildStatus_ == CL_BUILD_IN_PROGRESS) { + buildStatus_ = CL_BUILD_SUCCESS; + } else { + buildError_ = CL_BUILD_PROGRAM_FAILURE; + } - if (buildStatus_ == CL_BUILD_IN_PROGRESS) { - buildStatus_ = CL_BUILD_SUCCESS; - } - else { - buildError_ = CL_COMPILE_PROGRAM_FAILURE; - } + if (options->oVariables->EnableBuildTiming) { + std::stringstream tmp_ss; + tmp_ss << "\nTotal Build Time: " << (amd::Os::timeNanos() - start_time) / 1000ULL << " us\n"; + buildLog_ += tmp_ss.str(); + } - if (options->oVariables->EnableBuildTiming) { - std::stringstream tmp_ss; - tmp_ss << "\nTotal Compile Time: " - << (amd::Os::timeNanos() - start_time)/1000ULL - << " us\n"; - buildLog_ += tmp_ss.str(); + if (options->oVariables->BuildLog && !buildLog_.empty()) { + if (strcmp(options->oVariables->BuildLog, "stderr") == 0) { + fprintf(stderr, "%s\n", options->optionsLog().c_str()); + fprintf(stderr, "%s\n", buildLog_.c_str()); + } else if (strcmp(options->oVariables->BuildLog, "stdout") == 0) { + printf("%s\n", options->optionsLog().c_str()); + printf("%s\n", buildLog_.c_str()); + } else { + std::fstream f; + std::stringstream tmp_ss; + std::string logs = options->optionsLog() + buildLog_; + tmp_ss << options->oVariables->BuildLog << "." 
<< options->getBuildNo(); + f.open(tmp_ss.str().c_str(), (std::fstream::out | std::fstream::binary)); + f.write(logs.data(), logs.size()); + f.close(); } + } - if (options->oVariables->BuildLog && !buildLog_.empty()) { - if (strcmp(options->oVariables->BuildLog, "stderr") == 0) { - fprintf(stderr, "%s\n", options->optionsLog().c_str()); - fprintf(stderr, "%s\n", buildLog_.c_str()); - } - else if (strcmp(options->oVariables->BuildLog, "stdout") == 0) { - printf("%s\n", options->optionsLog().c_str()); - printf("%s\n", buildLog_.c_str()); - } - else { - std::fstream f; - std::stringstream tmp_ss; - std::string logs = options->optionsLog() + buildLog_; - tmp_ss << options->oVariables->BuildLog - << "." << options->getBuildNo(); - f.open(tmp_ss.str().c_str(), - (std::fstream::out | std::fstream::binary)); - f.write(logs.data(), logs.size()); - f.close(); - } - LogError(buildLog_.c_str()); - } + if (!buildLog_.empty()) { + LogError(buildLog_.c_str()); + } - return buildError(); + return buildError(); } -cl_int Program::link(const std::vector& inputPrograms, - const char* origLinkOptions, - amd::option::Options* linkOptions) -{ - lastBuildOptionsArg_ = origLinkOptions ? origLinkOptions : ""; - if (linkOptions) { - linkOptions_ = linkOptions->origOptionStr; +bool Program::getCompileOptionsAtLinking(const std::vector& inputPrograms, + const amd::option::Options* linkOptions) { + amd::option::Options compileOptions; + std::vector::const_iterator it = inputPrograms.begin(); + std::vector::const_iterator itEnd = inputPrograms.end(); + for (size_t i = 0; it != itEnd; ++it, ++i) { + Program* program = *it; + + amd::option::Options compileOptions2; + amd::option::Options* thisCompileOptions = i == 0 ? 
&compileOptions : &compileOptions2; + if (!amd::option::parseAllOptions(program->compileOptions_, *thisCompileOptions)) { + buildLog_ += thisCompileOptions->optionsLog(); + LogError("Parsing compile options failed."); + return false; } - buildStatus_ = CL_BUILD_IN_PROGRESS; + if (i == 0) compileOptions_ = program->compileOptions_; - amd::option::Options options; - if (!getCompileOptionsAtLinking(inputPrograms, linkOptions)) { - buildStatus_ = CL_BUILD_ERROR; - if (buildLog_.empty()) { - buildLog_ += "Internal error: Get compile options failed."; + // if we are linking a program executable, and if "program" is a + // compiled module or a library created with "-enable-link-options", + // we can overwrite "program"'s compile options with linking options + if (!linkOptions_.empty() && !linkOptions->oVariables->clCreateLibrary) { + bool linkOptsCanOverwrite = false; + if (program->type() != TYPE_LIBRARY) { + linkOptsCanOverwrite = true; + } else { + amd::option::Options thisLinkOptions; + if (!amd::option::parseLinkOptions(program->linkOptions_, thisLinkOptions)) { + buildLog_ += thisLinkOptions.optionsLog(); + LogError("Parsing link options failed."); + return false; } - } - else { - if (!amd::option::parseAllOptions(compileOptions_, options)) { - buildStatus_ = CL_BUILD_ERROR; - buildLog_ += options.optionsLog(); - LogError("Parsing compile options failed."); + if (thisLinkOptions.oVariables->clEnableLinkOptions) linkOptsCanOverwrite = true; + } + if (linkOptsCanOverwrite) { + if (!thisCompileOptions->setOptionVariablesAs(*linkOptions)) { + buildLog_ += thisCompileOptions->optionsLog(); + LogError("Setting link options failed."); + return false; } + } + if (i == 0) compileOptions_ += " " + linkOptions_; } - - uint64_t start_time = 0; - if (options.oVariables->EnableBuildTiming) { - buildLog_ = "\nStart timing major build components.....\n\n"; - start_time = amd::Os::timeNanos(); + // warn if input modules have inconsistent compile options + if (i > 0) { + if 
(!compileOptions.equals(*thisCompileOptions, true /*ignore clc options*/)) { + buildLog_ += + "Warning: Input OpenCL binaries has inconsistent" + " compile options. Using compile options from" + " the first input binary!\n"; + } } - - // initBuild() will clear buildLog_, so store it in a temporary variable - std::string tmpBuildLog = buildLog_; - - if ((buildStatus_ == CL_BUILD_IN_PROGRESS) - && !initBuild(&options)) { - buildStatus_ = CL_BUILD_ERROR; - if (buildLog_.empty()) { - buildLog_ += "Internal error: Compilation init failed."; - } - } - - buildLog_ += tmpBuildLog; - - if (options.oVariables->FP32RoundDivideSqrt && - !(device().info().singleFPConfig_ - & CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT)) { - buildStatus_ = CL_BUILD_ERROR; - buildLog_ += "Error: -cl-fp32-correctly-rounded-divide-sqrt "\ - "specified without device support"; - } - - bool createLibrary - = linkOptions ? linkOptions->oVariables->clCreateLibrary : false; - if ((buildStatus_ == CL_BUILD_IN_PROGRESS) - && !linkImpl(inputPrograms, &options, createLibrary)) { - buildStatus_ = CL_BUILD_ERROR; - if (buildLog_.empty()) { - buildLog_ += "Internal error: Link failed.\n"; - buildLog_ += "Make sure the system setup is correct."; - } - } - - if (!finiBuild(buildStatus_ == CL_BUILD_IN_PROGRESS)) { - buildStatus_ = CL_BUILD_ERROR; - if (buildLog_.empty()) { - buildLog_ = "Internal error: Compilation fini failed."; - } - } - - if (buildStatus_ == CL_BUILD_IN_PROGRESS) { - buildStatus_ = CL_BUILD_SUCCESS; - } - else { - buildError_ = CL_LINK_PROGRAM_FAILURE; - } - - if (options.oVariables->EnableBuildTiming) { - std::stringstream tmp_ss; - tmp_ss << "\nTotal Link Time: " - << (amd::Os::timeNanos() - start_time)/1000ULL - << " us\n"; - buildLog_ += tmp_ss.str(); - } - - if (options.oVariables->BuildLog && !buildLog_.empty()) { - if (strcmp(options.oVariables->BuildLog, "stderr") == 0) { - fprintf(stderr, "%s\n", options.optionsLog().c_str()); - fprintf(stderr, "%s\n", buildLog_.c_str()); - } - else if 
(strcmp(options.oVariables->BuildLog, "stdout") == 0) { - printf("%s\n", options.optionsLog().c_str()); - printf("%s\n", buildLog_.c_str()); - } - else { - std::fstream f; - std::stringstream tmp_ss; - std::string logs = options.optionsLog() + buildLog_; - tmp_ss << options.oVariables->BuildLog - << "." << options.getBuildNo(); - f.open(tmp_ss.str().c_str(), - (std::fstream::out | std::fstream::binary)); - f.write(logs.data(), logs.size()); - f.close(); - } - } - - if (!buildLog_.empty()) { - LogError(buildLog_.c_str()); - } - - return buildError(); + } + return true; } -cl_int -Program::build(const std::string& sourceCode, - const char* origOptions, - amd::option::Options* options) -{ - uint64_t start_time = 0; - if (options->oVariables->EnableBuildTiming) { - buildLog_ = "\nStart timing major build components.....\n\n"; - start_time = amd::Os::timeNanos(); - } +bool Program::initClBinary(char* binaryIn, size_t size) { + if (!initClBinary()) { + return false; + } - lastBuildOptionsArg_ = origOptions ? 
origOptions : ""; - if (options) { - compileOptions_ = options->origOptionStr; - } + // Save the original binary that isn't owned by ClBinary + clBinary()->saveOrigBinary(binaryIn, size); - buildStatus_ = CL_BUILD_IN_PROGRESS; - if (!initBuild(options)) { - buildStatus_ = CL_BUILD_ERROR; - if (buildLog_.empty()) { - buildLog_ = "Internal error: Compilation init failed."; - } - } + char* bin = binaryIn; + size_t sz = size; - if (options->oVariables->FP32RoundDivideSqrt && - !(device().info().singleFPConfig_ & CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT)) { - buildStatus_ = CL_BUILD_ERROR; - buildLog_ += "Error: -cl-fp32-correctly-rounded-divide-sqrt "\ - "specified without device support"; - } - - // Compile the source code if any - std::vector headers; - if ((buildStatus_ == CL_BUILD_IN_PROGRESS) && - !sourceCode.empty() && !compileImpl(sourceCode, headers, NULL, options)) { - buildStatus_ = CL_BUILD_ERROR; - if (buildLog_.empty()) { - buildLog_ = "Internal error: Compilation failed."; - } - } - - if ((buildStatus_ == CL_BUILD_IN_PROGRESS) && !linkImpl(options)) { - buildStatus_ = CL_BUILD_ERROR; - if (buildLog_.empty()) { - buildLog_ += "Internal error: Link failed.\n"; - buildLog_ += "Make sure the system setup is correct."; - } - } - - if (!finiBuild(buildStatus_ == CL_BUILD_IN_PROGRESS)) { - buildStatus_ = CL_BUILD_ERROR; - if (buildLog_.empty()) { - buildLog_ = "Internal error: Compilation fini failed."; - } - } - - if (buildStatus_ == CL_BUILD_IN_PROGRESS) { - buildStatus_ = CL_BUILD_SUCCESS; - } - else { - buildError_ = CL_BUILD_PROGRAM_FAILURE; - } - - if (options->oVariables->EnableBuildTiming) { - std::stringstream tmp_ss; - tmp_ss << "\nTotal Build Time: " - << (amd::Os::timeNanos() - start_time)/1000ULL - << " us\n"; - buildLog_ += tmp_ss.str(); - } - - if (options->oVariables->BuildLog && !buildLog_.empty()) { - if (strcmp(options->oVariables->BuildLog, "stderr") == 0) { - fprintf(stderr, "%s\n", options->optionsLog().c_str()); - fprintf(stderr, "%s\n", 
buildLog_.c_str()); - } - else if (strcmp(options->oVariables->BuildLog, "stdout") == 0) { - printf("%s\n", options->optionsLog().c_str()); - printf("%s\n", buildLog_.c_str()); - } - else { - std::fstream f; - std::stringstream tmp_ss; - std::string logs = options->optionsLog() + buildLog_; - tmp_ss << options->oVariables->BuildLog << "." << options->getBuildNo(); - f.open(tmp_ss.str().c_str(), (std::fstream::out | std::fstream::binary)); - f.write(logs.data(), logs.size()); - f.close(); - } - } - - if (!buildLog_.empty()) { - LogError(buildLog_.c_str()); - } - - return buildError(); -} - -bool -Program::getCompileOptionsAtLinking(const std::vector& inputPrograms, - const amd::option::Options* linkOptions) -{ - amd::option::Options compileOptions; - std::vector::const_iterator it - = inputPrograms.begin(); - std::vector::const_iterator itEnd - = inputPrograms.end(); - for (size_t i = 0; it != itEnd; ++it, ++i) { - Program* program = *it; - - amd::option::Options compileOptions2; - amd::option::Options* thisCompileOptions - = i == 0 ? 
&compileOptions : &compileOptions2; - if (!amd::option::parseAllOptions(program->compileOptions_, - *thisCompileOptions)) { - buildLog_ += thisCompileOptions->optionsLog(); - LogError("Parsing compile options failed."); - return false; - } - - if (i == 0) - compileOptions_ = program->compileOptions_; - - // if we are linking a program executable, and if "program" is a - // compiled module or a library created with "-enable-link-options", - // we can overwrite "program"'s compile options with linking options - if (!linkOptions_.empty() - && !linkOptions->oVariables->clCreateLibrary) { - bool linkOptsCanOverwrite = false; - if (program->type() != TYPE_LIBRARY) { - linkOptsCanOverwrite = true; - } - else { - amd::option::Options thisLinkOptions; - if (!amd::option::parseLinkOptions(program->linkOptions_, - thisLinkOptions)) { - buildLog_ += thisLinkOptions.optionsLog(); - LogError("Parsing link options failed."); - return false; - } - if (thisLinkOptions.oVariables->clEnableLinkOptions) - linkOptsCanOverwrite = true; - } - if (linkOptsCanOverwrite) { - if (!thisCompileOptions->setOptionVariablesAs(*linkOptions)) { - buildLog_ += thisCompileOptions->optionsLog(); - LogError("Setting link options failed."); - return false; - } - } - if (i == 0) - compileOptions_ += " " + linkOptions_; - } - // warn if input modules have inconsistent compile options - if (i > 0) { - if (!compileOptions.equals(*thisCompileOptions, - true/*ignore clc options*/)) { - buildLog_ += "Warning: Input OpenCL binaries has inconsistent" - " compile options. 
Using compile options from" - " the first input binary!\n"; - } - } - } - return true; -} - -bool -Program::initClBinary(char* binaryIn, size_t size) -{ - if (!initClBinary()) { - return false; - } - - // Save the original binary that isn't owned by ClBinary - clBinary()->saveOrigBinary(binaryIn, size); - - char* bin = binaryIn; - size_t sz = size; - - //unencrypted - int encryptCode = 0; - char* decryptedBin = NULL; + // unencrypted + int encryptCode = 0; + char* decryptedBin = NULL; #if !defined(WITH_LIGHTNING_COMPILER) - bool isSPIRV = isSPIRVMagic(binaryIn, size); - if (isSPIRV || isBcMagic(binaryIn)) - { - acl_error err = ACL_SUCCESS; - aclBinaryOptions binOpts = {0}; - binOpts.struct_size = sizeof(binOpts); - binOpts.elfclass - = (info().arch_id == aclX64 || info().arch_id == aclAMDIL64 || - info().arch_id == aclHSAIL64) - ? ELFCLASS64 : ELFCLASS32; - binOpts.bitness = ELFDATA2LSB; - binOpts.alloc = &::malloc; - binOpts.dealloc = &::free; - aclBinary* aclbin_v30 = aclBinaryInit(sizeof(aclBinary), &info(), &binOpts, &err); - if (err != ACL_SUCCESS) { - LogWarning("aclBinaryInit failed"); - aclBinaryFini(aclbin_v30); - return false; - } - err = aclInsertSection(device().compiler(), aclbin_v30, binaryIn, size, - isSPIRV?aclSPIRV:aclSPIR); - if (ACL_SUCCESS != err) { - LogWarning("aclInsertSection failed"); - aclBinaryFini(aclbin_v30); - return false; - } - if (info().arch_id == aclHSAIL || info().arch_id == aclHSAIL64) { - err = aclWriteToMem(aclbin_v30, reinterpret_cast(&bin), &sz); - if (err != ACL_SUCCESS) { - LogWarning("aclWriteToMem failed"); - aclBinaryFini(aclbin_v30); - return false; - } - aclBinaryFini(aclbin_v30); - } - else { - aclBinary* aclbin_v21 = aclCreateFromBinary(aclbin_v30,aclBIFVersion21); - err = aclWriteToMem(aclbin_v21, reinterpret_cast(&bin), &sz); - if (err != ACL_SUCCESS) { - LogWarning("aclWriteToMem failed"); - aclBinaryFini(aclbin_v30); - aclBinaryFini(aclbin_v21); - return false; - } - aclBinaryFini(aclbin_v30); - 
aclBinaryFini(aclbin_v21); - } + bool isSPIRV = isSPIRVMagic(binaryIn, size); + if (isSPIRV || isBcMagic(binaryIn)) { + acl_error err = ACL_SUCCESS; + aclBinaryOptions binOpts = {0}; + binOpts.struct_size = sizeof(binOpts); + binOpts.elfclass = + (info().arch_id == aclX64 || info().arch_id == aclAMDIL64 || info().arch_id == aclHSAIL64) + ? ELFCLASS64 + : ELFCLASS32; + binOpts.bitness = ELFDATA2LSB; + binOpts.alloc = &::malloc; + binOpts.dealloc = &::free; + aclBinary* aclbin_v30 = aclBinaryInit(sizeof(aclBinary), &info(), &binOpts, &err); + if (err != ACL_SUCCESS) { + LogWarning("aclBinaryInit failed"); + aclBinaryFini(aclbin_v30); + return false; } - else -#endif // defined(WITH_LIGHTNING_COMPILER) - { - size_t decryptedSize; - if (!clBinary()->decryptElf(binaryIn,size, - &decryptedBin,&decryptedSize,&encryptCode)) { - return false; - } - if (decryptedBin != NULL) { - // It is decrypted binary. - bin = decryptedBin; - sz = decryptedSize; - } - - if (!isElf(bin)) { - // Invalid binary. - if (decryptedBin != NULL) { - delete [] decryptedBin; - } - return false; - } + err = aclInsertSection(device().compiler(), aclbin_v30, binaryIn, size, + isSPIRV ? 
aclSPIRV : aclSPIR); + if (ACL_SUCCESS != err) { + LogWarning("aclInsertSection failed"); + aclBinaryFini(aclbin_v30); + return false; + } + if (info().arch_id == aclHSAIL || info().arch_id == aclHSAIL64) { + err = aclWriteToMem(aclbin_v30, reinterpret_cast(&bin), &sz); + if (err != ACL_SUCCESS) { + LogWarning("aclWriteToMem failed"); + aclBinaryFini(aclbin_v30); + return false; + } + aclBinaryFini(aclbin_v30); + } else { + aclBinary* aclbin_v21 = aclCreateFromBinary(aclbin_v30, aclBIFVersion21); + err = aclWriteToMem(aclbin_v21, reinterpret_cast(&bin), &sz); + if (err != ACL_SUCCESS) { + LogWarning("aclWriteToMem failed"); + aclBinaryFini(aclbin_v30); + aclBinaryFini(aclbin_v21); + return false; + } + aclBinaryFini(aclbin_v30); + aclBinaryFini(aclbin_v21); + } + } else +#endif // defined(WITH_LIGHTNING_COMPILER) + { + size_t decryptedSize; + if (!clBinary()->decryptElf(binaryIn, size, &decryptedBin, &decryptedSize, &encryptCode)) { + return false; + } + if (decryptedBin != NULL) { + // It is decrypted binary. + bin = decryptedBin; + sz = decryptedSize; } - clBinary()->setFlags(encryptCode); + if (!isElf(bin)) { + // Invalid binary. 
+ if (decryptedBin != NULL) { + delete[] decryptedBin; + } + return false; + } + } - return clBinary()->setBinary(bin, sz, (decryptedBin != NULL)); + clBinary()->setFlags(encryptCode); + + return clBinary()->setBinary(bin, sz, (decryptedBin != NULL)); } -bool -Program::setBinary(char* binaryIn, size_t size) -{ - if (!initClBinary(binaryIn, size)) { - return false; - } +bool Program::setBinary(char* binaryIn, size_t size) { + if (!initClBinary(binaryIn, size)) { + return false; + } #if defined(WITH_LIGHTNING_COMPILER) - if (!clBinary()->setElfIn(ELFCLASS64)) { -#else // !defined(WITH_LIGHTNING_COMPILER) - if (!clBinary()->setElfIn(ELFCLASS32)) { -#endif // !defined(WITH_LIGHTNING_COMPILER) - LogError("Setting input OCL binary failed"); - return false; + if (!clBinary()->setElfIn(ELFCLASS64)) { +#else // !defined(WITH_LIGHTNING_COMPILER) + if (!clBinary()->setElfIn(ELFCLASS32)) { +#endif // !defined(WITH_LIGHTNING_COMPILER) + LogError("Setting input OCL binary failed"); + return false; + } + uint16_t type; + if (!clBinary()->elfIn()->getType(type)) { + LogError("Bad OCL Binary: error loading ELF type!"); + return false; + } + switch (type) { + case ET_NONE: { + setType(TYPE_NONE); + break; } - uint16_t type; - if (!clBinary()->elfIn()->getType(type)) { - LogError("Bad OCL Binary: error loading ELF type!"); - return false; - } - switch (type) { - case ET_NONE: - { - setType(TYPE_NONE); - break; - } - case ET_REL: - { - if (clBinary()->isSPIR() || clBinary()->isSPIRV()) { - setType(TYPE_INTERMEDIATE); - } else { - setType(TYPE_COMPILED); - } - break; - } - case ET_DYN: - { - setType(TYPE_LIBRARY); - break; - } - case ET_EXEC: - { - setType(TYPE_EXECUTABLE); - break; - } - default: - LogError("Bad OCL Binary: bad ELF type!"); - return false; - } - - clBinary()->loadCompileOptions(compileOptions_); - clBinary()->loadLinkOptions(linkOptions_); -#if defined(WITH_LIGHTNING_COMPILER) - //TODO: Remove this once BIF is no longer used as we should have a machinasm in - // place 
to get the binary type correctly from above. - // It is a workaround for executable build from the library. The code object - // binary does not have the type information. - - char *sect = NULL; - size_t sz = 0; - if (clBinary()->elfIn()->getSection(amd::OclElf::TEXT, §, &sz) && sect && sz > 0) { - setType(TYPE_EXECUTABLE); - } - - sect = NULL; - sz = 0; - if (type != ET_DYN && // binary is not a library - (clBinary()->elfIn()->getSection(amd::OclElf::LLVMIR, §, &sz) && sect && sz > 0)) - { + case ET_REL: { + if (clBinary()->isSPIR() || clBinary()->isSPIRV()) { + setType(TYPE_INTERMEDIATE); + } else { setType(TYPE_COMPILED); + } + break; } + case ET_DYN: { + setType(TYPE_LIBRARY); + break; + } + case ET_EXEC: { + setType(TYPE_EXECUTABLE); + break; + } + default: + LogError("Bad OCL Binary: bad ELF type!"); + return false; + } + + clBinary()->loadCompileOptions(compileOptions_); + clBinary()->loadLinkOptions(linkOptions_); +#if defined(WITH_LIGHTNING_COMPILER) + // TODO: Remove this once BIF is no longer used as we should have a machinasm in + // place to get the binary type correctly from above. + // It is a workaround for executable build from the library. The code object + // binary does not have the type information. 
+ + char* sect = NULL; + size_t sz = 0; + if (clBinary()->elfIn()->getSection(amd::OclElf::TEXT, §, &sz) && sect && sz > 0) { + setType(TYPE_EXECUTABLE); + } + + sect = NULL; + sz = 0; + if (type != ET_DYN && // binary is not a library + (clBinary()->elfIn()->getSection(amd::OclElf::LLVMIR, §, &sz) && sect && sz > 0)) { + setType(TYPE_COMPILED); + } #endif - clBinary()->resetElfIn(); - return true; + clBinary()->resetElfIn(); + return true; } -bool -Program::createBIFBinary(aclBinary* bin) -{ +bool Program::createBIFBinary(aclBinary* bin) { #if defined(WITH_LIGHTNING_COMPILER) - assert(!"createBIFBinary() should not be called when using LC"); + assert(!"createBIFBinary() should not be called when using LC"); + return false; +#else // defined(WITH_LIGHTNING_COMPILER) + acl_error err; + char* binaryIn = NULL; + size_t size; + err = aclWriteToMem(bin, reinterpret_cast(&binaryIn), &size); + if (err != ACL_SUCCESS) { + LogWarning("aclWriteToMem failed"); return false; -#else // defined(WITH_LIGHTNING_COMPILER) - acl_error err; - char *binaryIn = NULL; - size_t size; - err = aclWriteToMem(bin, reinterpret_cast(&binaryIn), &size); - if (err != ACL_SUCCESS) { - LogWarning("aclWriteToMem failed"); - return false; - } - clBinary()->saveBIFBinary(binaryIn, size); - aclFreeMem(bin, binaryIn); - return true; -#endif // defined(WITH_LIGHTNING_COMPILER) + } + clBinary()->saveBIFBinary(binaryIn, size); + aclFreeMem(bin, binaryIn); + return true; +#endif // defined(WITH_LIGHTNING_COMPILER) } ClBinary::ClBinary(const amd::Device& dev, BinaryImageFormat bifVer) - : dev_(dev) - , binary_(NULL) - , size_(0) - , flags_(0) - , origBinary_(NULL) - , origSize_(0) - , encryptCode_ (0) - , elfIn_(NULL) - , elfOut_(NULL) - , format_(bifVer) -{ + : dev_(dev), + binary_(NULL), + size_(0), + flags_(0), + origBinary_(NULL), + origSize_(0), + encryptCode_(0), + elfIn_(NULL), + elfOut_(NULL), + format_(bifVer) {} + +ClBinary::~ClBinary() { + release(); + + if (elfIn_) { + delete elfIn_; + } + if 
(elfOut_) { + delete elfOut_; + } } -ClBinary::~ClBinary() -{ - release(); - - if (elfIn_) { - delete elfIn_; - } - if (elfOut_) { - delete elfOut_; - } -} - -std::string -ClBinary::getBIFSymbol(unsigned int symbolID) const -{ - size_t nSymbols = 0; - // Due to PRE & POST defines in bif_section_labels.hpp conflict with - // PRE & POST struct members in sp3-si-chip-registers.h - // unable to include bif_section_labels.hpp in device.hpp - //! @todo: resolve conflict by renaming defines, - // then include bif_section_labels.hpp in device.hpp & - // use oclBIFSymbolID instead of unsigned int as a parameter - const oclBIFSymbolID symID = static_cast(symbolID); - switch (format_) { +std::string ClBinary::getBIFSymbol(unsigned int symbolID) const { + size_t nSymbols = 0; + // Due to PRE & POST defines in bif_section_labels.hpp conflict with + // PRE & POST struct members in sp3-si-chip-registers.h + // unable to include bif_section_labels.hpp in device.hpp + //! @todo: resolve conflict by renaming defines, + // then include bif_section_labels.hpp in device.hpp & + // use oclBIFSymbolID instead of unsigned int as a parameter + const oclBIFSymbolID symID = static_cast(symbolID); + switch (format_) { case BIF_VERSION2: { - nSymbols = sizeof(BIF20)/sizeof(oclBIFSymbolStruct); - const oclBIFSymbolStruct* symb = findBIFSymbolStruct(BIF20, nSymbols, symID); - assert(symb && "BIF20 symbol with symbolID not found"); - if (symb) { - return std::string(symb->str[bif::PRE]) + std::string(symb->str[bif::POST]); - } - break; + nSymbols = sizeof(BIF20) / sizeof(oclBIFSymbolStruct); + const oclBIFSymbolStruct* symb = findBIFSymbolStruct(BIF20, nSymbols, symID); + assert(symb && "BIF20 symbol with symbolID not found"); + if (symb) { + return std::string(symb->str[bif::PRE]) + std::string(symb->str[bif::POST]); + } + break; } case BIF_VERSION3: { - nSymbols = sizeof(BIF30)/sizeof(oclBIFSymbolStruct); - const oclBIFSymbolStruct* symb = findBIFSymbolStruct(BIF30, nSymbols, symID); - 
assert(symb && "BIF30 symbol with symbolID not found"); - if (symb) { - return std::string(symb->str[bif::PRE]) + std::string(symb->str[bif::POST]); - } - break; + nSymbols = sizeof(BIF30) / sizeof(oclBIFSymbolStruct); + const oclBIFSymbolStruct* symb = findBIFSymbolStruct(BIF30, nSymbols, symID); + assert(symb && "BIF30 symbol with symbolID not found"); + if (symb) { + return std::string(symb->str[bif::PRE]) + std::string(symb->str[bif::POST]); + } + break; } default: - assert(0 && "unexpected BIF type"); - return ""; - } - return ""; + assert(0 && "unexpected BIF type"); + return ""; + } + return ""; } -void -ClBinary::init(amd::option::Options* optionsObj, bool amdilRequired) -{ - // option has higher priority than environment variable. - if ((flags_ & BinarySourceMask) != BinaryRemoveSource) { - // set to zero - flags_ = (flags_ & (~BinarySourceMask)); +void ClBinary::init(amd::option::Options* optionsObj, bool amdilRequired) { + // option has higher priority than environment variable. + if ((flags_ & BinarySourceMask) != BinaryRemoveSource) { + // set to zero + flags_ = (flags_ & (~BinarySourceMask)); - flags_ |= (optionsObj->oVariables->BinSOURCE - ? BinarySaveSource : BinaryNoSaveSource); - } + flags_ |= (optionsObj->oVariables->BinSOURCE ? BinarySaveSource : BinaryNoSaveSource); + } - if ((flags_ & BinaryLlvmirMask) != BinaryRemoveLlvmir) { - // set to zero - flags_ = (flags_ & (~BinaryLlvmirMask)); + if ((flags_ & BinaryLlvmirMask) != BinaryRemoveLlvmir) { + // set to zero + flags_ = (flags_ & (~BinaryLlvmirMask)); - flags_ |= (optionsObj->oVariables->BinLLVMIR - ? BinarySaveLlvmir : BinaryNoSaveLlvmir); - } + flags_ |= (optionsObj->oVariables->BinLLVMIR ? 
BinarySaveLlvmir : BinaryNoSaveLlvmir); + } - // If amdilRequired is true, force to save AMDIL (for correctness) - if ((flags_ & BinaryAmdilMask) != BinaryRemoveAmdil || - amdilRequired) { - // set to zero - flags_ = (flags_ & (~BinaryAmdilMask)); - flags_ |= ((optionsObj->oVariables->BinAMDIL || amdilRequired) - ? BinarySaveAmdil : BinaryNoSaveAmdil); - } + // If amdilRequired is true, force to save AMDIL (for correctness) + if ((flags_ & BinaryAmdilMask) != BinaryRemoveAmdil || amdilRequired) { + // set to zero + flags_ = (flags_ & (~BinaryAmdilMask)); + flags_ |= + ((optionsObj->oVariables->BinAMDIL || amdilRequired) ? BinarySaveAmdil : BinaryNoSaveAmdil); + } - if ((flags_ & BinaryIsaMask) != BinaryRemoveIsa) { - // set to zero - flags_ = (flags_ & (~BinaryIsaMask)); - flags_ |= ((optionsObj->oVariables->BinEXE) - ? BinarySaveIsa : BinaryNoSaveIsa); - } + if ((flags_ & BinaryIsaMask) != BinaryRemoveIsa) { + // set to zero + flags_ = (flags_ & (~BinaryIsaMask)); + flags_ |= ((optionsObj->oVariables->BinEXE) ? BinarySaveIsa : BinaryNoSaveIsa); + } - if ((flags_ & BinaryASMask) != BinaryRemoveAS) { - // set to zero - flags_ = (flags_ & (~BinaryASMask)); - flags_ |= ((optionsObj->oVariables->BinAS) - ? BinarySaveAS : BinaryNoSaveAS); - } + if ((flags_ & BinaryASMask) != BinaryRemoveAS) { + // set to zero + flags_ = (flags_ & (~BinaryASMask)); + flags_ |= ((optionsObj->oVariables->BinAS) ? BinarySaveAS : BinaryNoSaveAS); + } } -bool -ClBinary::isRecompilable(std::string& llvmBinary, - amd::OclElf::oclElfPlatform thePlatform) -{ - /* It is recompilable if there is llvmir that was generated for - the same platform (CPU or GPU) and with the same bitness. - - Note: the bitness has been checked in initClBinary(), no need - to check it here. 
- */ - if (llvmBinary.empty() ) { - return false; - } - - uint16_t elf_target; - amd::OclElf::oclElfPlatform platform; - if (elfIn()->getTarget(elf_target, platform)){ - if (platform == thePlatform){ - return true; - } - if ((platform == amd::OclElf::COMPLIB_PLATFORM) && - (((thePlatform == amd::OclElf::CAL_PLATFORM) && - ((elf_target == (uint16_t)EM_AMDIL) || - (elf_target == (uint16_t)EM_HSAIL) || - (elf_target == (uint16_t)EM_HSAIL_64))) || - ((thePlatform == amd::OclElf::CPU_PLATFORM) && - ((elf_target == (uint16_t)EM_386) || - (elf_target == (uint16_t)EM_X86_64))))){ - return true; - } - } +bool ClBinary::isRecompilable(std::string& llvmBinary, amd::OclElf::oclElfPlatform thePlatform) { + /* It is recompilable if there is llvmir that was generated for + the same platform (CPU or GPU) and with the same bitness. + Note: the bitness has been checked in initClBinary(), no need + to check it here. + */ + if (llvmBinary.empty()) { return false; -} + } -void -ClBinary::release() -{ - if (isBinaryAllocated() && (binary_ != NULL)) { - delete [] binary_; - binary_ = NULL; - flags_ &= ~BinaryAllocated; + uint16_t elf_target; + amd::OclElf::oclElfPlatform platform; + if (elfIn()->getTarget(elf_target, platform)) { + if (platform == thePlatform) { + return true; } + if ((platform == amd::OclElf::COMPLIB_PLATFORM) && + (((thePlatform == amd::OclElf::CAL_PLATFORM) && + ((elf_target == (uint16_t)EM_AMDIL) || (elf_target == (uint16_t)EM_HSAIL) || + (elf_target == (uint16_t)EM_HSAIL_64))) || + ((thePlatform == amd::OclElf::CPU_PLATFORM) && + ((elf_target == (uint16_t)EM_386) || (elf_target == (uint16_t)EM_X86_64))))) { + return true; + } + } + + return false; } -void -ClBinary::saveBIFBinary(char* binaryIn, size_t size) -{ - char *image = new char[size]; - memcpy(image, binaryIn, size); - - setBinary(image, size, true); - return; +void ClBinary::release() { + if (isBinaryAllocated() && (binary_ != NULL)) { + delete[] binary_; + binary_ = NULL; + flags_ &= ~BinaryAllocated; + } 
} -bool -ClBinary::createElfBinary(bool doencrypt, Program::type_t type) -{ +void ClBinary::saveBIFBinary(char* binaryIn, size_t size) { + char* image = new char[size]; + memcpy(image, binaryIn, size); + + setBinary(image, size, true); + return; +} + +bool ClBinary::createElfBinary(bool doencrypt, Program::type_t type) { #if 0 if (!saveISA() && !saveAMDIL() && !saveLLVMIR() && !saveSOURCE()) { return true; } #endif - release(); + release(); - size_t imageSize; - char* image; - assert (elfOut_ && "elfOut_ should be initialized in ClBinary::data()"); + size_t imageSize; + char* image; + assert(elfOut_ && "elfOut_ should be initialized in ClBinary::data()"); - // Insert Version string that builds this binary into .comment section - const device::Info& devInfo = dev_.info(); - std::string buildVerInfo("@(#) "); - if (devInfo.version_ != NULL) { - buildVerInfo.append(devInfo.version_); - buildVerInfo.append(". Driver version: "); - buildVerInfo.append(devInfo.driverVersion_); + // Insert Version string that builds this binary into .comment section + const device::Info& devInfo = dev_.info(); + std::string buildVerInfo("@(#) "); + if (devInfo.version_ != NULL) { + buildVerInfo.append(devInfo.version_); + buildVerInfo.append(". 
Driver version: "); + buildVerInfo.append(devInfo.driverVersion_); + } else { + // char OpenCLVersion[256]; + // size_t sz; + // cl_int ret= clGetPlatformInfo(AMD_PLATFORM, CL_PLATFORM_VERSION, 256, OpenCLVersion, &sz); + // if (ret == CL_SUCCESS) { + // buildVerInfo.append(OpenCLVersion, sz); + // } + + // If CAL is unavailable, just hard-code the OpenCL driver version + buildVerInfo.append("OpenCL 1.1" AMD_PLATFORM_INFO); + } + + elfOut_->addSection(amd::OclElf::COMMENT, buildVerInfo.data(), buildVerInfo.size()); + switch (type) { + case Program::TYPE_NONE: { + elfOut_->setType(ET_NONE); + break; } - else { - // char OpenCLVersion[256]; - // size_t sz; - // cl_int ret= clGetPlatformInfo(AMD_PLATFORM, CL_PLATFORM_VERSION, 256, OpenCLVersion, &sz); - // if (ret == CL_SUCCESS) { - // buildVerInfo.append(OpenCLVersion, sz); - // } - - // If CAL is unavailable, just hard-code the OpenCL driver version - buildVerInfo.append("OpenCL 1.1" AMD_PLATFORM_INFO); + case Program::TYPE_COMPILED: { + elfOut_->setType(ET_REL); + break; } - - elfOut_->addSection(amd::OclElf::COMMENT, buildVerInfo.data(), buildVerInfo.size()); - switch (type) { - case Program::TYPE_NONE: - { - elfOut_->setType(ET_NONE); - break; - } - case Program::TYPE_COMPILED: - { - elfOut_->setType(ET_REL); - break; - } - case Program::TYPE_LIBRARY: - { - elfOut_->setType(ET_DYN); - break; - } - case Program::TYPE_EXECUTABLE: - { - elfOut_->setType(ET_EXEC); - break; - } - default: - assert(0 && "unexpected elf type"); + case Program::TYPE_LIBRARY: { + elfOut_->setType(ET_DYN); + break; } - - if (!elfOut_->dumpImage(&image, &imageSize)) { - return false; + case Program::TYPE_EXECUTABLE: { + elfOut_->setType(ET_EXEC); + break; } + default: + assert(0 && "unexpected elf type"); + } + + if (!elfOut_->dumpImage(&image, &imageSize)) { + return false; + } #if defined(HAVE_BLOWFISH_H) - if (doencrypt) { - // Increase the size by 64 to accomodate extra headers - int outBufSize = (int)(imageSize + 64); - char * outBuf = 
new char[outBufSize]; - if (outBuf == NULL) { - return false; - } - memset(outBuf, '\0', outBufSize); - - int outBytes = 0; - bool success = amd::oclEncrypt(0, image, imageSize, outBuf, outBufSize, &outBytes); - delete [] image; - if (!success) { - delete [] outBuf; - return false; - } - image = outBuf; - imageSize = outBytes; + if (doencrypt) { + // Increase the size by 64 to accomodate extra headers + int outBufSize = (int)(imageSize + 64); + char* outBuf = new char[outBufSize]; + if (outBuf == NULL) { + return false; } + memset(outBuf, '\0', outBufSize); + + int outBytes = 0; + bool success = amd::oclEncrypt(0, image, imageSize, outBuf, outBufSize, &outBytes); + delete[] image; + if (!success) { + delete[] outBuf; + return false; + } + image = outBuf; + imageSize = outBytes; + } #endif - setBinary(image, imageSize, true); - return true; + setBinary(image, imageSize, true); + return true; } -Program::binary_t ClBinary::data() const -{ - return std::make_pair(binary_, size_); +Program::binary_t ClBinary::data() const { return std::make_pair(binary_, size_); } + +bool ClBinary::setBinary(char* theBinary, size_t theBinarySize, bool allocated) { + release(); + + size_ = theBinarySize; + binary_ = theBinary; + if (allocated) { + flags_ |= BinaryAllocated; + } + return true; } -bool -ClBinary::setBinary(char* theBinary, size_t theBinarySize, bool allocated) -{ - release(); - - size_ = theBinarySize; - binary_ = theBinary; - if (allocated) { - flags_ |= BinaryAllocated; - } - return true; +void ClBinary::setFlags(int encryptCode) { + encryptCode_ = encryptCode; + if (encryptCode != 0) { + flags_ = + (flags_ & + (~(BinarySourceMask | BinaryLlvmirMask | BinaryAmdilMask | BinaryIsaMask | BinaryASMask))); + flags_ |= (BinaryRemoveSource | BinaryRemoveLlvmir | BinaryRemoveAmdil | BinarySaveIsa | + BinaryRemoveAS); + } } -void -ClBinary::setFlags(int encryptCode) -{ - encryptCode_ = encryptCode; - if (encryptCode != 0) { - flags_ = (flags_ & (~(BinarySourceMask | 
BinaryLlvmirMask | - BinaryAmdilMask | BinaryIsaMask | - BinaryASMask))); - flags_ |= (BinaryRemoveSource | BinaryRemoveLlvmir | - BinaryRemoveAmdil | BinarySaveIsa | - BinaryRemoveAS); - } -} - -bool -ClBinary::decryptElf(char* binaryIn, size_t size, - char** decryptBin, size_t* decryptSize, int* encryptCode) -{ - *decryptBin = NULL; +bool ClBinary::decryptElf(char* binaryIn, size_t size, char** decryptBin, size_t* decryptSize, + int* encryptCode) { + *decryptBin = NULL; #if defined(HAVE_BLOWFISH_H) - int outBufSize = 0; - if (amd::isEncryptedBIF(binaryIn, (int)size, &outBufSize)) { - char* outBuf = new (std::nothrow) char[outBufSize]; - if (outBuf == NULL) { - return false; - } - - // Decrypt - int outDataSize = 0; - if (!amd::oclDecrypt(binaryIn, (int)size, outBuf, outBufSize, &outDataSize)) { - delete [] outBuf; - return false; - } - - *decryptBin = reinterpret_cast(outBuf); - *decryptSize = outDataSize; - *encryptCode = 1; + int outBufSize = 0; + if (amd::isEncryptedBIF(binaryIn, (int)size, &outBufSize)) { + char* outBuf = new (std::nothrow) char[outBufSize]; + if (outBuf == NULL) { + return false; } + + // Decrypt + int outDataSize = 0; + if (!amd::oclDecrypt(binaryIn, (int)size, outBuf, outBufSize, &outDataSize)) { + delete[] outBuf; + return false; + } + + *decryptBin = reinterpret_cast(outBuf); + *decryptSize = outDataSize; + *encryptCode = 1; + } #endif - return true; + return true; } -bool -ClBinary::setElfIn(unsigned char eclass) -{ - if (elfIn_) return true; +bool ClBinary::setElfIn(unsigned char eclass) { + if (elfIn_) return true; - if (binary_ == NULL) { - return false; - } - elfIn_ = new amd::OclElf(eclass, binary_, size_, NULL, ELF_C_READ); - if ( (elfIn_ == NULL)|| elfIn_->hasError() ) { - if (elfIn_) { - delete elfIn_; - elfIn_ = NULL; - } - LogError("Creating input ELF object failed"); - return false; - } - - return true; -} - -void ClBinary::resetElfIn() -{ + if (binary_ == NULL) { + return false; + } + elfIn_ = new amd::OclElf(eclass, 
binary_, size_, NULL, ELF_C_READ); + if ((elfIn_ == NULL) || elfIn_->hasError()) { if (elfIn_) { - delete elfIn_; - elfIn_ = NULL; + delete elfIn_; + elfIn_ = NULL; } + LogError("Creating input ELF object failed"); + return false; + } + + return true; } -bool ClBinary::setElfOut(unsigned char eclass, const char* outFile) -{ - elfOut_ = new amd::OclElf(eclass, NULL, 0, outFile, ELF_C_WRITE); - if ( (elfOut_ == NULL) || elfOut_->hasError() ) { - if (elfOut_) { - delete elfOut_; - elfOut_ = NULL; - } - LogError("Creating ouput ELF object failed"); - return false; - } - - return setElfTarget(); +void ClBinary::resetElfIn() { + if (elfIn_) { + delete elfIn_; + elfIn_ = NULL; + } } -void ClBinary::resetElfOut() -{ +bool ClBinary::setElfOut(unsigned char eclass, const char* outFile) { + elfOut_ = new amd::OclElf(eclass, NULL, 0, outFile, ELF_C_WRITE); + if ((elfOut_ == NULL) || elfOut_->hasError()) { if (elfOut_) { - delete elfOut_; - elfOut_ = NULL; + delete elfOut_; + elfOut_ = NULL; } -} - -bool -ClBinary::loadLlvmBinary(std::string& llvmBinary, amd::OclElf::oclElfSections& elfSectionType) const -{ - // Check if current binary already has LLVMIR - char *section = NULL; - size_t sz = 0; - const amd::OclElf::oclElfSections SectionTypes[] = - {amd::OclElf::LLVMIR, amd::OclElf::SPIR, amd::OclElf::SPIRV}; - - for (int i = 0; i < 3; ++i){ - if (elfIn_->getSection(SectionTypes[i], §ion, &sz) && section && sz > 0) { - llvmBinary.append(section, sz); - elfSectionType = SectionTypes[i]; - return true; - } - } - + LogError("Creating ouput ELF object failed"); return false; + } + + return setElfTarget(); } -bool ClBinary::loadCompileOptions(std::string& compileOptions) const -{ - char *options = NULL; - size_t sz; - compileOptions.clear(); - if (elfIn_->getSymbol(amd::OclElf::COMMENT, - getBIFSymbol(symOpenclCompilerOptions).c_str(), &options, &sz)) { - if (sz > 0) { - compileOptions.append(options, sz); - } - return true; +void ClBinary::resetElfOut() { + if (elfOut_) { + delete 
elfOut_; + elfOut_ = NULL; + } +} + +bool ClBinary::loadLlvmBinary(std::string& llvmBinary, + amd::OclElf::oclElfSections& elfSectionType) const { + // Check if current binary already has LLVMIR + char* section = NULL; + size_t sz = 0; + const amd::OclElf::oclElfSections SectionTypes[] = {amd::OclElf::LLVMIR, amd::OclElf::SPIR, + amd::OclElf::SPIRV}; + + for (int i = 0; i < 3; ++i) { + if (elfIn_->getSection(SectionTypes[i], §ion, &sz) && section && sz > 0) { + llvmBinary.append(section, sz); + elfSectionType = SectionTypes[i]; + return true; } - return false; + } + + return false; } -bool ClBinary::loadLinkOptions(std::string& linkOptions) const -{ - char *options = NULL; - size_t sz; - linkOptions.clear(); - if (elfIn_->getSymbol(amd::OclElf::COMMENT, - getBIFSymbol(symOpenclLinkerOptions).c_str(), &options, &sz)) { - if (sz > 0) { - linkOptions.append(options, sz); - } - return true; +bool ClBinary::loadCompileOptions(std::string& compileOptions) const { + char* options = NULL; + size_t sz; + compileOptions.clear(); + if (elfIn_->getSymbol(amd::OclElf::COMMENT, getBIFSymbol(symOpenclCompilerOptions).c_str(), + &options, &sz)) { + if (sz > 0) { + compileOptions.append(options, sz); } - return false; + return true; + } + return false; } -void ClBinary::storeCompileOptions(const std::string& compileOptions) -{ - elfOut()->addSymbol(amd::OclElf::COMMENT, - getBIFSymbol(symOpenclCompilerOptions).c_str(), - compileOptions.c_str(), compileOptions.length()); -} - -void ClBinary::storeLinkOptions(const std::string& linkOptions) -{ - elfOut()->addSymbol(amd::OclElf::COMMENT, - getBIFSymbol(symOpenclLinkerOptions).c_str(), - linkOptions.c_str(), linkOptions.length()); -} - -bool -ClBinary::isSPIR() const -{ - char *section = NULL; - size_t sz = 0; - if (elfIn_->getSection(amd::OclElf::LLVMIR, §ion, &sz) && section && sz > 0) - return false; - - if (elfIn_->getSection(amd::OclElf::SPIR, §ion, &sz) && section && sz > 0) - return true; - - return false; -} - -bool 
-ClBinary::isSPIRV() const -{ - char *section = NULL; - size_t sz = 0; - - if (elfIn_->getSection(amd::OclElf::SPIRV, §ion, &sz) && section && sz > 0) { - return true; +bool ClBinary::loadLinkOptions(std::string& linkOptions) const { + char* options = NULL; + size_t sz; + linkOptions.clear(); + if (elfIn_->getSymbol(amd::OclElf::COMMENT, getBIFSymbol(symOpenclLinkerOptions).c_str(), + &options, &sz)) { + if (sz > 0) { + linkOptions.append(options, sz); } - return false; + return true; + } + return false; } -cl_device_partition_property -PartitionType::toCL() const -{ - static cl_device_partition_property conv[] = { - CL_DEVICE_PARTITION_EQUALLY, - CL_DEVICE_PARTITION_BY_COUNTS, - CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN - }; - return conv[amd::leastBitSet(value_)]; +void ClBinary::storeCompileOptions(const std::string& compileOptions) { + elfOut()->addSymbol(amd::OclElf::COMMENT, getBIFSymbol(symOpenclCompilerOptions).c_str(), + compileOptions.c_str(), compileOptions.length()); } -size_t -PartitionType::toCL(cl_device_partition_property* types) const -{ - size_t i = 0; - if (equally_) { - types[i++] = CL_DEVICE_PARTITION_EQUALLY; - } - if (byCounts_) { - types[i++] = CL_DEVICE_PARTITION_BY_COUNTS; - } - if (byAffinityDomain_) { - types[i++] = CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN; - } - return i; +void ClBinary::storeLinkOptions(const std::string& linkOptions) { + elfOut()->addSymbol(amd::OclElf::COMMENT, getBIFSymbol(symOpenclLinkerOptions).c_str(), + linkOptions.c_str(), linkOptions.length()); } -cl_device_affinity_domain -AffinityDomain::toCL() const -{ - return (cl_device_affinity_domain)value_; +bool ClBinary::isSPIR() const { + char* section = NULL; + size_t sz = 0; + if (elfIn_->getSection(amd::OclElf::LLVMIR, §ion, &sz) && section && sz > 0) return false; + + if (elfIn_->getSection(amd::OclElf::SPIR, §ion, &sz) && section && sz > 0) return true; + + return false; } +bool ClBinary::isSPIRV() const { + char* section = NULL; + size_t sz = 0; + + if 
(elfIn_->getSection(amd::OclElf::SPIRV, §ion, &sz) && section && sz > 0) { + return true; + } + return false; +} + +cl_device_partition_property PartitionType::toCL() const { + static cl_device_partition_property conv[] = {CL_DEVICE_PARTITION_EQUALLY, + CL_DEVICE_PARTITION_BY_COUNTS, + CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN}; + return conv[amd::leastBitSet(value_)]; +} + +size_t PartitionType::toCL(cl_device_partition_property* types) const { + size_t i = 0; + if (equally_) { + types[i++] = CL_DEVICE_PARTITION_EQUALLY; + } + if (byCounts_) { + types[i++] = CL_DEVICE_PARTITION_BY_COUNTS; + } + if (byAffinityDomain_) { + types[i++] = CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN; + } + return i; +} + +cl_device_affinity_domain AffinityDomain::toCL() const { return (cl_device_affinity_domain)value_; } + #ifdef cl_ext_device_fission -cl_device_partition_property_ext -PartitionType::toCLExt() const -{ - static cl_device_partition_property_ext conv[] = { - CL_DEVICE_PARTITION_EQUALLY_EXT, - CL_DEVICE_PARTITION_BY_COUNTS_EXT, - CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT - }; - return conv[amd::leastBitSet(value_)]; +cl_device_partition_property_ext PartitionType::toCLExt() const { + static cl_device_partition_property_ext conv[] = {CL_DEVICE_PARTITION_EQUALLY_EXT, + CL_DEVICE_PARTITION_BY_COUNTS_EXT, + CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT}; + return conv[amd::leastBitSet(value_)]; } -size_t -PartitionType::toCLExt(cl_device_partition_property_ext* types) const -{ - size_t i = 0; - if (equally_) { - types[i++] = CL_DEVICE_PARTITION_EQUALLY_EXT; - } - if (byCounts_) { - types[i++] = CL_DEVICE_PARTITION_BY_COUNTS_EXT; - } - if (byAffinityDomain_) { - types[i++] = CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT; - } - return i; +size_t PartitionType::toCLExt(cl_device_partition_property_ext* types) const { + size_t i = 0; + if (equally_) { + types[i++] = CL_DEVICE_PARTITION_EQUALLY_EXT; + } + if (byCounts_) { + types[i++] = CL_DEVICE_PARTITION_BY_COUNTS_EXT; + } + if 
(byAffinityDomain_) { + types[i++] = CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT; + } + return i; } -cl_device_partition_property_ext -AffinityDomain::toCLExt() const -{ - static cl_device_partition_property_ext conv[] = { - CL_AFFINITY_DOMAIN_NUMA_EXT, - CL_AFFINITY_DOMAIN_L4_CACHE_EXT, - CL_AFFINITY_DOMAIN_L3_CACHE_EXT, - CL_AFFINITY_DOMAIN_L2_CACHE_EXT, - CL_AFFINITY_DOMAIN_L1_CACHE_EXT, - CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT - }; - return conv[amd::leastBitSet(value_)]; +cl_device_partition_property_ext AffinityDomain::toCLExt() const { + static cl_device_partition_property_ext conv[] = { + CL_AFFINITY_DOMAIN_NUMA_EXT, CL_AFFINITY_DOMAIN_L4_CACHE_EXT, + CL_AFFINITY_DOMAIN_L3_CACHE_EXT, CL_AFFINITY_DOMAIN_L2_CACHE_EXT, + CL_AFFINITY_DOMAIN_L1_CACHE_EXT, CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT}; + return conv[amd::leastBitSet(value_)]; } -size_t -AffinityDomain::toCLExt(cl_device_partition_property_ext* affinities) const -{ - size_t i = 0; - if (numa_) { - affinities[i++] = CL_AFFINITY_DOMAIN_NUMA_EXT; - } - if (cacheL4_) { - affinities[i++] = CL_AFFINITY_DOMAIN_L4_CACHE_EXT; - } - if (cacheL3_) { - affinities[i++] = CL_AFFINITY_DOMAIN_L3_CACHE_EXT; - } - if (cacheL2_) { - affinities[i++] = CL_AFFINITY_DOMAIN_L2_CACHE_EXT; - } - if (cacheL1_) { - affinities[i++] = CL_AFFINITY_DOMAIN_L1_CACHE_EXT; - } - if (next_) { - affinities[i++] = CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT; - } - return i; +size_t AffinityDomain::toCLExt(cl_device_partition_property_ext* affinities) const { + size_t i = 0; + if (numa_) { + affinities[i++] = CL_AFFINITY_DOMAIN_NUMA_EXT; + } + if (cacheL4_) { + affinities[i++] = CL_AFFINITY_DOMAIN_L4_CACHE_EXT; + } + if (cacheL3_) { + affinities[i++] = CL_AFFINITY_DOMAIN_L3_CACHE_EXT; + } + if (cacheL2_) { + affinities[i++] = CL_AFFINITY_DOMAIN_L2_CACHE_EXT; + } + if (cacheL1_) { + affinities[i++] = CL_AFFINITY_DOMAIN_L1_CACHE_EXT; + } + if (next_) { + affinities[i++] = CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT; + } + return i; } -#endif // 
cl_ext_device_fission +#endif // cl_ext_device_fission -} // namespace device +} // namespace device diff --git a/rocclr/runtime/device/device.hpp b/rocclr/runtime/device/device.hpp index 28b8dde668..5a9871878a 100644 --- a/rocclr/runtime/device/device.hpp +++ b/rocclr/runtime/device/device.hpp @@ -19,7 +19,7 @@ #if defined(WITH_LIGHTNING_COMPILER) #include "caching/cache.hpp" #include "driver/AmdCompiler.h" -#endif // defined(WITH_LIGHTNING_COMPILER) +#endif // defined(WITH_LIGHTNING_COMPILER) #include "acl.h" #include "hwdebug.hpp" @@ -71,1824 +71,1702 @@ struct Coord3D; namespace option { class Options; -} // option +} // option -struct ProfilingCallback: public amd::HeapObject { - virtual void callback (ulong duration) = 0; +struct ProfilingCallback : public amd::HeapObject { + virtual void callback(ulong duration) = 0; }; } enum OclExtensions { - ClKhrFp64 = 0, - ClAmdFp64, - ClKhrSelectFpRoundingMode, - ClKhrGlobalInt32BaseAtomics, - ClKhrGlobalInt32ExtendedAtomics, - ClKhrLocalInt32BaseAtomics, - ClKhrLocalInt32ExtendedAtomics, - ClKhrInt64BaseAtomics, - ClKhrInt64ExtendedAtomics, - ClKhr3DImageWrites, - ClKhrByteAddressableStore, - ClKhrFp16, - ClKhrGlSharing, - ClKhrGLDepthImages, - ClExtDeviceFission, - ClAmdDeviceAttributeQuery, - ClAmdVec3, - ClAmdPrintf, - ClAmdMediaOps, - ClAmdMediaOps2, - ClAmdPopcnt, + ClKhrFp64 = 0, + ClAmdFp64, + ClKhrSelectFpRoundingMode, + ClKhrGlobalInt32BaseAtomics, + ClKhrGlobalInt32ExtendedAtomics, + ClKhrLocalInt32BaseAtomics, + ClKhrLocalInt32ExtendedAtomics, + ClKhrInt64BaseAtomics, + ClKhrInt64ExtendedAtomics, + ClKhr3DImageWrites, + ClKhrByteAddressableStore, + ClKhrFp16, + ClKhrGlSharing, + ClKhrGLDepthImages, + ClExtDeviceFission, + ClAmdDeviceAttributeQuery, + ClAmdVec3, + ClAmdPrintf, + ClAmdMediaOps, + ClAmdMediaOps2, + ClAmdPopcnt, #if defined(_WIN32) - ClKhrD3d10Sharing, - ClKhrD3d11Sharing, - ClKhrD3d9Sharing, + ClKhrD3d10Sharing, + ClKhrD3d11Sharing, + ClKhrD3d9Sharing, #endif - ClKhrImage2dFromBuffer, - 
ClAmdSemaphore, - ClAMDBusAddressableMemory, - ClAMDC11Atomics, - ClKhrSpir, - ClKhrSubGroups, - ClKhrGlEvent, - ClKhrDepthImages, - ClKhrMipMapImage, - ClKhrMipMapImageWrites, - ClKhrIlProgram, - ClAMDLiquidFlash, - ClExtTotal + ClKhrImage2dFromBuffer, + ClAmdSemaphore, + ClAMDBusAddressableMemory, + ClAMDC11Atomics, + ClKhrSpir, + ClKhrSubGroups, + ClKhrGlEvent, + ClKhrDepthImages, + ClKhrMipMapImage, + ClKhrMipMapImageWrites, + ClKhrIlProgram, + ClAMDLiquidFlash, + ClExtTotal }; -static const char* -OclExtensionsString[] = { - "cl_khr_fp64 ", "cl_amd_fp64 ", - "cl_khr_select_fprounding_mode ", - "cl_khr_global_int32_base_atomics ", - "cl_khr_global_int32_extended_atomics ", - "cl_khr_local_int32_base_atomics ", - "cl_khr_local_int32_extended_atomics ", - "cl_khr_int64_base_atomics ", - "cl_khr_int64_extended_atomics ", - "cl_khr_3d_image_writes ", - "cl_khr_byte_addressable_store ", - "cl_khr_fp16 ", - "cl_khr_gl_sharing ", - "cl_khr_gl_depth_images ", - "cl_ext_device_fission ", - "cl_amd_device_attribute_query ", - "cl_amd_vec3 ", - "cl_amd_printf ", - "cl_amd_media_ops ", - "cl_amd_media_ops2 ", - "cl_amd_popcnt ", +static const char* OclExtensionsString[] = {"cl_khr_fp64 ", + "cl_amd_fp64 ", + "cl_khr_select_fprounding_mode ", + "cl_khr_global_int32_base_atomics ", + "cl_khr_global_int32_extended_atomics ", + "cl_khr_local_int32_base_atomics ", + "cl_khr_local_int32_extended_atomics ", + "cl_khr_int64_base_atomics ", + "cl_khr_int64_extended_atomics ", + "cl_khr_3d_image_writes ", + "cl_khr_byte_addressable_store ", + "cl_khr_fp16 ", + "cl_khr_gl_sharing ", + "cl_khr_gl_depth_images ", + "cl_ext_device_fission ", + "cl_amd_device_attribute_query ", + "cl_amd_vec3 ", + "cl_amd_printf ", + "cl_amd_media_ops ", + "cl_amd_media_ops2 ", + "cl_amd_popcnt ", #if defined(_WIN32) - "cl_khr_d3d10_sharing ", - "cl_khr_d3d11_sharing ", - "cl_khr_dx9_media_sharing ", + "cl_khr_d3d10_sharing ", + "cl_khr_d3d11_sharing ", + "cl_khr_dx9_media_sharing ", #endif - 
"cl_khr_image2d_from_buffer ", - "", - "cl_amd_bus_addressable_memory ", - "cl_amd_c11_atomics ", - "cl_khr_spir ", - "cl_khr_subgroups ", - "cl_khr_gl_event ", - "cl_khr_depth_images ", - "cl_khr_mipmap_image ", - "cl_khr_mipmap_image_writes ", - "", - (IS_LINUX) ? "" : "cl_amd_liquid_flash ", - NULL -}; + "cl_khr_image2d_from_buffer ", + "", + "cl_amd_bus_addressable_memory ", + "cl_amd_c11_atomics ", + "cl_khr_spir ", + "cl_khr_subgroups ", + "cl_khr_gl_event ", + "cl_khr_depth_images ", + "cl_khr_mipmap_image ", + "cl_khr_mipmap_image_writes ", + "", + (IS_LINUX) ? "" : "cl_amd_liquid_flash ", + NULL}; namespace device { class ClBinary; class BlitManager; -struct PartitionType : public amd::EmbeddedObject -{ - enum { - EQUALLY = (1 << 0), - BY_COUNTS = (1 << 1), - BY_AFFINITY_DOMAIN = (1 << 2) +struct PartitionType : public amd::EmbeddedObject { + enum { EQUALLY = (1 << 0), BY_COUNTS = (1 << 1), BY_AFFINITY_DOMAIN = (1 << 2) }; + + union { + struct { + uint equally_ : 1; + uint byCounts_ : 1; + uint byAffinityDomain_ : 1; }; + uint value_; + }; - union { - struct { - uint equally_ : 1; - uint byCounts_ : 1; - uint byAffinityDomain_ : 1; - }; - uint value_; - }; + size_t getNumSet() const { return (size_t)amd::countBitsSet(value_); } - size_t getNumSet() const { return (size_t)amd::countBitsSet(value_); } - - cl_device_partition_property toCL() const; - size_t toCL(cl_device_partition_property* types) const; + cl_device_partition_property toCL() const; + size_t toCL(cl_device_partition_property* types) const; #ifdef cl_ext_device_fission - cl_device_partition_property_ext toCLExt() const; - size_t toCLExt(cl_device_partition_property_ext* types) const; + cl_device_partition_property_ext toCLExt() const; + size_t toCLExt(cl_device_partition_property_ext* types) const; #endif }; -struct AffinityDomain : public amd::EmbeddedObject -{ - enum { - AFFINITY_DOMAIN_NUMA = (1 << 0), - AFFINITY_DOMAIN_L4_CACHE = (1 << 1), - AFFINITY_DOMAIN_L3_CACHE = (1 << 2), - 
AFFINITY_DOMAIN_L2_CACHE = (1 << 3), - AFFINITY_DOMAIN_L1_CACHE = (1 << 4), - AFFINITY_DOMAIN_NEXT_PARTITIONABLE = (1 << 5) +struct AffinityDomain : public amd::EmbeddedObject { + enum { + AFFINITY_DOMAIN_NUMA = (1 << 0), + AFFINITY_DOMAIN_L4_CACHE = (1 << 1), + AFFINITY_DOMAIN_L3_CACHE = (1 << 2), + AFFINITY_DOMAIN_L2_CACHE = (1 << 3), + AFFINITY_DOMAIN_L1_CACHE = (1 << 4), + AFFINITY_DOMAIN_NEXT_PARTITIONABLE = (1 << 5) + }; + + union { + struct { + uint numa_ : 1; + uint cacheL4_ : 1; + uint cacheL3_ : 1; + uint cacheL2_ : 1; + uint cacheL1_ : 1; + uint next_ : 1; }; + uint value_; + }; - union { - struct { - uint numa_ : 1; - uint cacheL4_ : 1; - uint cacheL3_ : 1; - uint cacheL2_ : 1; - uint cacheL1_ : 1; - uint next_ : 1; - }; - uint value_; - }; + size_t getNumSet() const { return (size_t)amd::countBitsSet(value_); } - size_t getNumSet() const { return (size_t)amd::countBitsSet(value_); } - - cl_device_affinity_domain toCL() const; + cl_device_affinity_domain toCL() const; #ifdef cl_ext_device_fission - cl_device_partition_property_ext toCLExt() const; - size_t toCLExt(cl_device_partition_property_ext* affinities) const; + cl_device_partition_property_ext toCLExt() const; + size_t toCLExt(cl_device_partition_property_ext* affinities) const; #endif }; //! Device partition properties. -struct PartitionInfo : public amd::EmbeddedObject -{ - PartitionType type_; - union { - struct { - size_t numComputeUnits_; - } equally_; +struct PartitionInfo : public amd::EmbeddedObject { + PartitionType type_; + union { + struct { + size_t numComputeUnits_; + } equally_; - AffinityDomain byAffinityDomain_; + AffinityDomain byAffinityDomain_; - struct { - const cl_uint* countsList_; - size_t listSize_; - } byCounts_; - }; + struct { + const cl_uint* countsList_; + size_t listSize_; + } byCounts_; + }; }; //! Create Sub-Devices request properties. 
-struct CreateSubDevicesInfo : public amd::HeapObject -{ - PartitionInfo p_; - virtual cl_uint countsListAt(size_t i) const = 0; - virtual ~CreateSubDevicesInfo() {} +struct CreateSubDevicesInfo : public amd::HeapObject { + PartitionInfo p_; + virtual cl_uint countsListAt(size_t i) const = 0; + virtual ~CreateSubDevicesInfo() {} }; -template -struct CreateSubDevicesInfoT : public CreateSubDevicesInfo -{ - virtual cl_uint countsListAt(size_t i) const { - return (cl_uint)reinterpret_cast(p_.byCounts_.countsList_)[i]; - } +template struct CreateSubDevicesInfoT : public CreateSubDevicesInfo { + virtual cl_uint countsListAt(size_t i) const { + return (cl_uint) reinterpret_cast(p_.byCounts_.countsList_)[i]; + } - void initCountsList(const PROP_T* props) { - p_.byCounts_.countsList_ = reinterpret_cast(props); - p_.byCounts_.listSize_ = 0; - for (; *props != ((PROP_T)0); ++props) { - ++p_.byCounts_.listSize_; - } + void initCountsList(const PROP_T* props) { + p_.byCounts_.countsList_ = reinterpret_cast(props); + p_.byCounts_.listSize_ = 0; + for (; *props != ((PROP_T)0); ++props) { + ++p_.byCounts_.listSize_; } + } }; //! Physical device properties. -struct Info : public amd::EmbeddedObject -{ - //! The OpenCL device type. - cl_device_type type_; +struct Info : public amd::EmbeddedObject { + //! The OpenCL device type. + cl_device_type type_; - //! A unique device vendor identifier. - cl_uint vendorId_; + //! A unique device vendor identifier. + cl_uint vendorId_; - //! The number of parallel compute cores on the compute device. - cl_uint maxComputeUnits_; + //! The number of parallel compute cores on the compute device. + cl_uint maxComputeUnits_; - //! Maximum dimensions that specify the global and local work-item IDs - // used by the data-parallel execution model. - cl_uint maxWorkItemDimensions_; + //! Maximum dimensions that specify the global and local work-item IDs + // used by the data-parallel execution model. + cl_uint maxWorkItemDimensions_; - //! 
Maximum number of work-items that can be specified in each dimension - // to clEnqueueNDRangeKernel. - size_t maxWorkItemSizes_[3]; + //! Maximum number of work-items that can be specified in each dimension + // to clEnqueueNDRangeKernel. + size_t maxWorkItemSizes_[3]; - //! Maximum number of work-items in a work-group executing a kernel - // using the data-parallel execution model. - size_t maxWorkGroupSize_; + //! Maximum number of work-items in a work-group executing a kernel + // using the data-parallel execution model. + size_t maxWorkGroupSize_; - //! Number of shader engines in physical GPU - size_t numberOfShaderEngines; + //! Number of shader engines in physical GPU + size_t numberOfShaderEngines; - //! cl_uint Preferred native vector width size for built-in scalar types - // that can be put into vectors. - cl_uint preferredVectorWidthChar_; - cl_uint preferredVectorWidthShort_; - cl_uint preferredVectorWidthInt_; - cl_uint preferredVectorWidthLong_; - cl_uint preferredVectorWidthFloat_; - cl_uint preferredVectorWidthDouble_; - cl_uint preferredVectorWidthHalf_; + //! cl_uint Preferred native vector width size for built-in scalar types + // that can be put into vectors. + cl_uint preferredVectorWidthChar_; + cl_uint preferredVectorWidthShort_; + cl_uint preferredVectorWidthInt_; + cl_uint preferredVectorWidthLong_; + cl_uint preferredVectorWidthFloat_; + cl_uint preferredVectorWidthDouble_; + cl_uint preferredVectorWidthHalf_; - //! Returns the native ISA vector width. The vector width is defined as the - // number of scalar elements that can be stored in the vector. - cl_uint nativeVectorWidthChar_; - cl_uint nativeVectorWidthShort_; - cl_uint nativeVectorWidthInt_; - cl_uint nativeVectorWidthLong_; - cl_uint nativeVectorWidthFloat_; - cl_uint nativeVectorWidthDouble_; - cl_uint nativeVectorWidthHalf_; + //! Returns the native ISA vector width. The vector width is defined as the + // number of scalar elements that can be stored in the vector. 
+ cl_uint nativeVectorWidthChar_; + cl_uint nativeVectorWidthShort_; + cl_uint nativeVectorWidthInt_; + cl_uint nativeVectorWidthLong_; + cl_uint nativeVectorWidthFloat_; + cl_uint nativeVectorWidthDouble_; + cl_uint nativeVectorWidthHalf_; - //! Maximum configured clock frequency of the device in MHz. - cl_uint maxClockFrequency_; + //! Maximum configured clock frequency of the device in MHz. + cl_uint maxClockFrequency_; - //! Describes the address spaces supported by the device. - cl_uint addressBits_; + //! Describes the address spaces supported by the device. + cl_uint addressBits_; - //! Max number of simultaneous image objects that can be read by a - // kernel. - cl_uint maxReadImageArgs_; + //! Max number of simultaneous image objects that can be read by a + // kernel. + cl_uint maxReadImageArgs_; - //! Max number of simultaneous image objects that can be written to - // by a kernel. - cl_uint maxWriteImageArgs_; + //! Max number of simultaneous image objects that can be written to + // by a kernel. + cl_uint maxWriteImageArgs_; - //! Max number of simultaneous image objects that can be read/written to - // by a kernel. - cl_uint maxReadWriteImageArgs_; + //! Max number of simultaneous image objects that can be read/written to + // by a kernel. + cl_uint maxReadWriteImageArgs_; - //! Max size of memory object allocation in bytes. - cl_ulong maxMemAllocSize_; + //! Max size of memory object allocation in bytes. + cl_ulong maxMemAllocSize_; - //! Max width of 2D image in pixels. - size_t image2DMaxWidth_; + //! Max width of 2D image in pixels. + size_t image2DMaxWidth_; - //! Max height of 2D image in pixels. - size_t image2DMaxHeight_; + //! Max height of 2D image in pixels. + size_t image2DMaxHeight_; - //! Max width of 3D image in pixels. - size_t image3DMaxWidth_; + //! Max width of 3D image in pixels. + size_t image3DMaxWidth_; - //! Max height of 3D image in pixels. - size_t image3DMaxHeight_; + //! Max height of 3D image in pixels. 
+ size_t image3DMaxHeight_; - //! Max depth of 3D image in pixels. - size_t image3DMaxDepth_; + //! Max depth of 3D image in pixels. + size_t image3DMaxDepth_; - //! Describes whether images are supported - cl_bool imageSupport_; + //! Describes whether images are supported + cl_bool imageSupport_; - //! Max size in bytes of the arguments that can be passed to a kernel. - size_t maxParameterSize_; + //! Max size in bytes of the arguments that can be passed to a kernel. + size_t maxParameterSize_; - //! Maximum number of samplers that can be used in a kernel. - cl_uint maxSamplers_; + //! Maximum number of samplers that can be used in a kernel. + cl_uint maxSamplers_; - //! Describes the alignment in bits of the base address of any - // allocated memory object. - cl_uint memBaseAddrAlign_; + //! Describes the alignment in bits of the base address of any + // allocated memory object. + cl_uint memBaseAddrAlign_; - //! The smallest alignment in bytes which can be used for any data type. - cl_uint minDataTypeAlignSize_; + //! The smallest alignment in bytes which can be used for any data type. + cl_uint minDataTypeAlignSize_; - //! Describes single precision floating point capability of the device. - cl_device_fp_config halfFPConfig_; - cl_device_fp_config singleFPConfig_; - cl_device_fp_config doubleFPConfig_; + //! Describes single precision floating point capability of the device. + cl_device_fp_config halfFPConfig_; + cl_device_fp_config singleFPConfig_; + cl_device_fp_config doubleFPConfig_; - //! Type of global memory cache supported. - cl_device_mem_cache_type globalMemCacheType_; + //! Type of global memory cache supported. + cl_device_mem_cache_type globalMemCacheType_; - //! Size of global memory cache line in bytes. - cl_uint globalMemCacheLineSize_; + //! Size of global memory cache line in bytes. + cl_uint globalMemCacheLineSize_; - //! Size of global memory cache in bytes. - cl_ulong globalMemCacheSize_; + //! Size of global memory cache in bytes. 
+ cl_ulong globalMemCacheSize_; - //! Size of global device memory in bytes. - cl_ulong globalMemSize_; + //! Size of global device memory in bytes. + cl_ulong globalMemSize_; - //! Max size in bytes of a constant buffer allocation. - cl_ulong maxConstantBufferSize_; + //! Max size in bytes of a constant buffer allocation. + cl_ulong maxConstantBufferSize_; - //! Max number of arguments declared - cl_uint maxConstantArgs_; + //! Max number of arguments declared + cl_uint maxConstantArgs_; - //! This is used to determine the type of local memory that is available - cl_device_local_mem_type localMemType_; + //! This is used to determine the type of local memory that is available + cl_device_local_mem_type localMemType_; - //! Size of local memory arena in bytes. - cl_ulong localMemSize_; + //! Size of local memory arena in bytes. + cl_ulong localMemSize_; - //! If enabled, implies that all the memories, caches, registers etc. in - // the device implement error correction. - cl_bool errorCorrectionSupport_; + //! If enabled, implies that all the memories, caches, registers etc. in + // the device implement error correction. + cl_bool errorCorrectionSupport_; - //! CL_TRUE if the device and the host have a unified memory subsystem and - // is CL_FALSE otherwise. - cl_bool hostUnifiedMemory_; + //! CL_TRUE if the device and the host have a unified memory subsystem and + // is CL_FALSE otherwise. + cl_bool hostUnifiedMemory_; - //! Describes the resolution of device timer. - size_t profilingTimerResolution_; + //! Describes the resolution of device timer. + size_t profilingTimerResolution_; - //! Timer starting point offset to Epoch. - cl_ulong profilingTimerOffset_; + //! Timer starting point offset to Epoch. + cl_ulong profilingTimerOffset_; - //! CL_TRUE if device is a little endian device. - cl_bool littleEndian_; + //! CL_TRUE if device is a little endian device. + cl_bool littleEndian_; - //! 
If enabled, implies that commands can be submitted to command-queues - // created on this device. - cl_bool available_; + //! If enabled, implies that commands can be submitted to command-queues + // created on this device. + cl_bool available_; - //! if the implementation does not have a compiler available to compile - // the program source. - cl_bool compilerAvailable_; + //! if the implementation does not have a compiler available to compile + // the program source. + cl_bool compilerAvailable_; - //! Describes the execution capabilities of the device. - cl_device_exec_capabilities executionCapabilities_; + //! Describes the execution capabilities of the device. + cl_device_exec_capabilities executionCapabilities_; - //! Describes the SVM capabilities of the device. - cl_device_svm_capabilities svmCapabilities_; + //! Describes the SVM capabilities of the device. + cl_device_svm_capabilities svmCapabilities_; - //! Preferred alignment for OpenCL fine-grained SVM atomic types. - cl_uint preferredPlatformAtomicAlignment_; + //! Preferred alignment for OpenCL fine-grained SVM atomic types. + cl_uint preferredPlatformAtomicAlignment_; - //! Preferred alignment for OpenCL global atomic types. - cl_uint preferredGlobalAtomicAlignment_; + //! Preferred alignment for OpenCL global atomic types. + cl_uint preferredGlobalAtomicAlignment_; - //! Preferred alignment for OpenCL local atomic types. - cl_uint preferredLocalAtomicAlignment_; + //! Preferred alignment for OpenCL local atomic types. + cl_uint preferredLocalAtomicAlignment_; - //! Describes the command-queue properties supported of the host queue. - cl_command_queue_properties queueProperties_; + //! Describes the command-queue properties supported of the host queue. + cl_command_queue_properties queueProperties_; - //! The platform associated with this device - cl_platform_id platform_; + //! The platform associated with this device + cl_platform_id platform_; - //! Device name string - char name_[0x40]; + //! 
Device name string + char name_[0x40]; - //! Vendor name string - char vendor_[0x20]; + //! Vendor name string + char vendor_[0x20]; - //! OpenCL software driver version string in the form major.minor - char driverVersion_[0x20]; + //! OpenCL software driver version string in the form major.minor + char driverVersion_[0x20]; - //! Returns the profile name supported by the device. - const char* profile_; + //! Returns the profile name supported by the device. + const char* profile_; - //! Returns the OpenCL version supported by the device. - const char* version_; + //! Returns the OpenCL version supported by the device. + const char* version_; - //! The highest OpenCL C version supported by the compiler for this device. - const char* oclcVersion_; + //! The highest OpenCL C version supported by the compiler for this device. + const char* oclcVersion_; - //! Returns a space separated list of extension names. - const char* extensions_; + //! Returns a space separated list of extension names. + const char* extensions_; - //! Returns if device linker is available - cl_bool linkerAvailable_; + //! Returns if device linker is available + cl_bool linkerAvailable_; - //! Returns the list of built-in kernels, supported by the device - const char* builtInKernels_; - - //! Returns max number of pixels for a 1D image created from a buffer object - size_t imageMaxBufferSize_; + //! Returns the list of built-in kernels, supported by the device + const char* builtInKernels_; + + //! Returns max number of pixels for a 1D image created from a buffer object + size_t imageMaxBufferSize_; - //! Returns max number of images in a 1D or 2D image array - size_t imageMaxArraySize_; - - //! Returns the list of partition types supported by device - PartitionType partitionProperties_; - - //! Returns the list of supported affinity domains for - //! partitioning the device using CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN - AffinityDomain affinityDomain_; - - //! 
Returns the properties argument specified in clCreateSubDevices - //! if device is a subdevice. - PartitionInfo partitionCreateInfo_; - - //! Returns CL_TRUE if the devices preference is for the user to be - //! responsible for synchronization - cl_bool preferredInteropUserSync_; - - //! Returns maximum size of the internal buffer that holds the output - //! of printf calls from a kernel - size_t printfBufferSize_; - - //! Indicates maximum number of supported global atomic counters - cl_uint maxAtomicCounters_; - - //! Returns the topology for the device - cl_device_topology_amd deviceTopology_; - - //! Semaphore information - cl_uint maxSemaphores_; - cl_uint maxSemaphoreSize_; - - //! Returns the SKU board name for the device - char boardName_[128]; - - //! Number of SIMD (Single Instruction Multiple Data) units per compute unit - //! that execute in parallel. All work items from the same work group must be - //! executed by SIMDs in the same compute unit. - cl_uint simdPerCU_; - //! The maximum number of work items from the same work group that can be - //! executed by a SIMD in parallel - cl_uint simdWidth_; - //! The number of instructions that a SIMD can execute in parallel - cl_uint simdInstructionWidth_; - //! The number of workitems per wavefront - cl_uint wavefrontWidth_; - //! Number of global memory channels - cl_uint globalMemChannels_; - //! Number of banks in each global memory channel - cl_uint globalMemChannelBanks_; - //! Width in bytes of each of global memory bank - cl_uint globalMemChannelBankWidth_; - //! Local memory size per CU - cl_uint localMemSizePerCU_; - //! Number of banks of local memory - cl_uint localMemBanks_; - //! The core engine GFXIP version - cl_uint gfxipVersion_; - //! Number of available async queues - cl_uint numAsyncQueues_; - //! Number of available real time queues - cl_uint numRTQueues_; - //! Number of available real time compute units - cl_uint numRTCUs_; - //! 
Thread trace enable - cl_bool threadTraceEnable_; - - //! Image pitch alignment for image2d_from_buffer - cl_uint imagePitchAlignment_; - //! Image base address alignment for image2d_from_buffer - cl_uint imageBaseAddressAlignment_; - - //! Describes whether buffers from images are supported - cl_bool bufferFromImageSupport_; - - //! Returns the supported SPIR versions for the device - const char* spirVersions_; - - //! OpenCL20 device info fields: - - //! The max number of pipe objects that can be passed as arguments to a kernel - cl_uint maxPipeArgs_; - //! The max number of reservations that can be active for a pipe per work-item in a kernel - cl_uint maxPipeActiveReservations_; - //! The max size of pipe packet in bytes - cl_uint maxPipePacketSize_; - - //! The command-queue properties supported of the device queue. - cl_command_queue_properties queueOnDeviceProperties_; - //! The preferred size of the device queue in bytes - cl_uint queueOnDevicePreferredSize_; - //! The max size of the device queue in bytes - cl_uint queueOnDeviceMaxSize_; - //! The maximum number of device queues - cl_uint maxOnDeviceQueues_; - //! The maximum number of events in use on a device queue - cl_uint maxOnDeviceEvents_; - - //! The maximum size of global scope variables - size_t maxGlobalVariableSize_; - size_t globalVariablePreferredTotalSize_; - //! Driver store location - char driverStore_[200]; + //! Returns max number of images in a 1D or 2D image array + size_t imageMaxArraySize_; + + //! Returns the list of partition types supported by device + PartitionType partitionProperties_; + + //! Returns the list of supported affinity domains for + //! partitioning the device using CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN + AffinityDomain affinityDomain_; + + //! Returns the properties argument specified in clCreateSubDevices + //! if device is a subdevice. + PartitionInfo partitionCreateInfo_; + + //! Returns CL_TRUE if the devices preference is for the user to be + //! 
responsible for synchronization + cl_bool preferredInteropUserSync_; + + //! Returns maximum size of the internal buffer that holds the output + //! of printf calls from a kernel + size_t printfBufferSize_; + + //! Indicates maximum number of supported global atomic counters + cl_uint maxAtomicCounters_; + + //! Returns the topology for the device + cl_device_topology_amd deviceTopology_; + + //! Semaphore information + cl_uint maxSemaphores_; + cl_uint maxSemaphoreSize_; + + //! Returns the SKU board name for the device + char boardName_[128]; + + //! Number of SIMD (Single Instruction Multiple Data) units per compute unit + //! that execute in parallel. All work items from the same work group must be + //! executed by SIMDs in the same compute unit. + cl_uint simdPerCU_; + //! The maximum number of work items from the same work group that can be + //! executed by a SIMD in parallel + cl_uint simdWidth_; + //! The number of instructions that a SIMD can execute in parallel + cl_uint simdInstructionWidth_; + //! The number of workitems per wavefront + cl_uint wavefrontWidth_; + //! Number of global memory channels + cl_uint globalMemChannels_; + //! Number of banks in each global memory channel + cl_uint globalMemChannelBanks_; + //! Width in bytes of each of global memory bank + cl_uint globalMemChannelBankWidth_; + //! Local memory size per CU + cl_uint localMemSizePerCU_; + //! Number of banks of local memory + cl_uint localMemBanks_; + //! The core engine GFXIP version + cl_uint gfxipVersion_; + //! Number of available async queues + cl_uint numAsyncQueues_; + //! Number of available real time queues + cl_uint numRTQueues_; + //! Number of available real time compute units + cl_uint numRTCUs_; + //! Thread trace enable + cl_bool threadTraceEnable_; + + //! Image pitch alignment for image2d_from_buffer + cl_uint imagePitchAlignment_; + //! Image base address alignment for image2d_from_buffer + cl_uint imageBaseAddressAlignment_; + + //! 
Describes whether buffers from images are supported + cl_bool bufferFromImageSupport_; + + //! Returns the supported SPIR versions for the device + const char* spirVersions_; + + //! OpenCL20 device info fields: + + //! The max number of pipe objects that can be passed as arguments to a kernel + cl_uint maxPipeArgs_; + //! The max number of reservations that can be active for a pipe per work-item in a kernel + cl_uint maxPipeActiveReservations_; + //! The max size of pipe packet in bytes + cl_uint maxPipePacketSize_; + + //! The command-queue properties supported of the device queue. + cl_command_queue_properties queueOnDeviceProperties_; + //! The preferred size of the device queue in bytes + cl_uint queueOnDevicePreferredSize_; + //! The max size of the device queue in bytes + cl_uint queueOnDeviceMaxSize_; + //! The maximum number of device queues + cl_uint maxOnDeviceQueues_; + //! The maximum number of events in use on a device queue + cl_uint maxOnDeviceEvents_; + + //! The maximum size of global scope variables + size_t maxGlobalVariableSize_; + size_t globalVariablePreferredTotalSize_; + //! Driver store location + char driverStore_[200]; }; //! 
Device settings -class Settings : public amd::HeapObject -{ -public: - uint64_t extensions_; //!< Supported OCL extensions - union { - struct { - uint partialDispatch_: 1; //!< Enables partial dispatch - uint supportRA_: 1; //!< Support RA channel order format - uint waitCommand_: 1; //!< Enables a wait for every submitted command - uint customHostAllocator_: 1;//!< True if device has custom host allocator - // that replaces generic OS allocation routines - uint supportDepthsRGB_: 1; //!< Support DEPTH and sRGB channel order format - uint enableHwDebug_: 1; //!< Enable HW debug support - uint reserved_: 26; - }; - uint value_; +class Settings : public amd::HeapObject { + public: + uint64_t extensions_; //!< Supported OCL extensions + union { + struct { + uint partialDispatch_ : 1; //!< Enables partial dispatch + uint supportRA_ : 1; //!< Support RA channel order format + uint waitCommand_ : 1; //!< Enables a wait for every submitted command + uint customHostAllocator_ : 1; //!< True if device has custom host allocator + // that replaces generic OS allocation routines + uint supportDepthsRGB_ : 1; //!< Support DEPTH and sRGB channel order format + uint enableHwDebug_ : 1; //!< Enable HW debug support + uint reserved_ : 26; }; + uint value_; + }; - uint commandQueues_; //!< Field value for maximum number - //!< concurrent Virtual GPUs for each backend - //! Default constructor - Settings(); + uint commandQueues_; //!< Field value for maximum number + //!< concurrent Virtual GPUs for each backend + //! Default constructor + Settings(); - //! Check the specified extension - bool checkExtension(uint name) const - { return (extensions_ & (static_cast(1) << name)) ? true : false; } + //! Check the specified extension + bool checkExtension(uint name) const { + return (extensions_ & (static_cast(1) << name)) ? true : false; + } - //! Enable the specified extension - void enableExtension(uint name) { extensions_ |= static_cast(1) << name; } + //! 
Enable the specified extension + void enableExtension(uint name) { extensions_ |= static_cast(1) << name; } -private: - //! Disable copy constructor - Settings(const Settings&); + private: + //! Disable copy constructor + Settings(const Settings&); - //! Disable assignment - Settings& operator=(const Settings&); + //! Disable assignment + Settings& operator=(const Settings&); }; //! Device-independent cache memory, base class for the device-specific //! memories. One Memory instance refers to one or more of these. -class Memory : public amd::HeapObject -{ -public: - //! Resource map flags - enum CpuMapFlags - { - CpuReadWrite = 0x00000000, //!< Lock for CPU read/Write - CpuReadOnly = 0x00000001, //!< Lock for CPU read only operation - CpuWriteOnly = 0x00000002, //!< Lock for CPU write only operation +class Memory : public amd::HeapObject { + public: + //! Resource map flags + enum CpuMapFlags { + CpuReadWrite = 0x00000000, //!< Lock for CPU read/Write + CpuReadOnly = 0x00000001, //!< Lock for CPU read only operation + CpuWriteOnly = 0x00000002, //!< Lock for CPU write only operation + }; + + union SyncFlags { + struct { + uint skipParent_ : 1; //!< Skip parent synchronization + uint skipViews_ : 1; //!< Skip views synchronization + uint skipEntire_ : 1; //!< Skip entire synchronization + }; + uint value_; + SyncFlags() : value_(0) {} + }; + + struct WriteMapInfo : public amd::HeapObject { + amd::Coord3D origin_; //!< Origin of the map location + amd::Coord3D region_; //!< Mapped region + amd::Image* baseMip_; //!< The base mip level for images + union { + struct { + uint32_t count_ : 8; //!< The same map region counter + uint32_t unmapWrite_ : 1; //!< Unmap write operation + uint32_t unmapRead_ : 1; //!< Unmap read operation + uint32_t entire_ : 1; //!< Process the entire memory + }; + uint32_t flags_; }; - union SyncFlags { - struct { - uint skipParent_ : 1; //!< Skip parent synchronization - uint skipViews_ : 1; //!< Skip views synchronization - uint skipEntire_ 
: 1; //!< Skip entire synchronization - }; - uint value_; - SyncFlags(): value_(0) {} - }; + //! Returns the state of entire map + bool isEntire() const { return (entire_) ? true : false; } - struct WriteMapInfo: public amd::HeapObject - { - amd::Coord3D origin_; //!< Origin of the map location - amd::Coord3D region_; //!< Mapped region - amd::Image* baseMip_; //!< The base mip level for images - union { - struct { - uint32_t count_: 8; //!< The same map region counter - uint32_t unmapWrite_: 1; //!< Unmap write operation - uint32_t unmapRead_: 1; //!< Unmap read operation - uint32_t entire_: 1; //!< Process the entire memory - }; - uint32_t flags_; - }; + //! Returns the state of map write flag + bool isUnmapWrite() const { return (unmapWrite_) ? true : false; } - //! Returns the state of entire map - bool isEntire() const { return (entire_) ? true : false; } + //! Returns the state of map read flag + bool isUnmapRead() const { return (unmapRead_) ? true : false; } - //! Returns the state of map write flag - bool isUnmapWrite() const { return (unmapWrite_) ? true : false; } + WriteMapInfo() : origin_(0, 0, 0), region_(0, 0, 0), baseMip_(NULL), flags_(0) {} + }; - //! Returns the state of map read flag - bool isUnmapRead() const { return (unmapRead_) ? true : false; } + //! Constructor (from an amd::Memory object). + Memory(amd::Memory& owner) + : flags_(0), owner_(&owner), version_(0), mapMemory_(NULL), indirectMapCount_(0) { + size_ = owner.getSize(); + } - WriteMapInfo(): origin_(0, 0, 0), region_(0, 0, 0), baseMip_(NULL), flags_(0) {} - }; + //! Constructor (no owner), always eager allocation. + Memory(size_t size) + : flags_(0), owner_(NULL), version_(0), mapMemory_(NULL), indirectMapCount_(0), size_(size) {} - //! Constructor (from an amd::Memory object). 
- Memory(amd::Memory& owner) - : flags_(0) - , owner_(&owner) - , version_(0) - , mapMemory_(NULL) - , indirectMapCount_(0) - { - size_ = owner.getSize(); + enum GLResourceOP { + GLDecompressResource = 0, // orders the GL driver to decompress any depth-stencil or MSAA + // resource to be sampled by a CL kernel. + GLInvalidateFBO // orders the GL driver to invalidate any FBO the resource may be bound to, + // since the resource internal state changed. + }; + + //! Default destructor for the device memory object + virtual ~Memory(){}; + + //! Releases virtual objects associated with this memory + void releaseVirtual(); + + //! Read the size + size_t size() const { return size_; } + + //! Gets the owner Memory instance + amd::Memory* owner() const { return owner_; } + + //! Immediate blocking write from device cache to owners's backing store. + //! Marks owner as "current" by resetting the last writer to NULL. + virtual void syncHostFromCache(SyncFlags syncFlags = SyncFlags()) {} + + //! Allocate memory for API-level maps + virtual void* allocMapTarget(const amd::Coord3D& origin, //!< The map location in memory + const amd::Coord3D& region, //!< The map region in memory + uint mapFlags, //!< Map flags + size_t* rowPitch = NULL, //!< Row pitch for the mapped memory + size_t* slicePitch = NULL //!< Slice for the mapped memory + ) { + return NULL; + } + + virtual bool pinSystemMemory(void* hostPtr, //!< System memory address + size_t size //!< Size of allocated system memory + ) { + return true; + } + + //! Releases indirect map surface + virtual void releaseIndirectMap() {} + //! decompress any MSAA/depth-stencil interop surfaces. + //! notify GL to invalidate any surfaces touched by a CL kernel + virtual bool processGLResource(GLResourceOP operation) { return false; } + + //! 
Map the device memory to CPU visible + virtual void* cpuMap(VirtualDevice& vDev, //!< Virtual device for map operaiton + uint flags = 0, //!< flags for the map operation + // Optimization for multilayer map/unmap + uint startLayer = 0, //!< Start layer for multilayer map + uint numLayers = 0, //!< End layer for multilayer map + size_t* rowPitch = NULL, //!< Row pitch for the device memory + size_t* slicePitch = NULL //!< Slice pitch for the device memory + ) { + amd::Image* image = owner()->asImage(); + if (image != NULL) { + *rowPitch = image->getRowPitch(); + *slicePitch = image->getSlicePitch(); } + // Default behavior uses preallocated host mem for CPU + return owner()->getHostMem(); + } - //! Constructor (no owner), always eager allocation. - Memory(size_t size) - : flags_(0) - , owner_(NULL) - , version_(0) - , mapMemory_(NULL) - , indirectMapCount_(0) - , size_(size) - { + //! Unmap the device memory + virtual void cpuUnmap(VirtualDevice& vDev //!< Virtual device for unmap operaiton + ) {} + + //! Saves map info for this object + //! @note: It's not a thread safe operation, the app must implement + //! synchronization for the multiple write maps if necessary + void saveMapInfo(const void* mapAddress, //!< Map cpu address + const amd::Coord3D origin, //!< Origin of the map location + const amd::Coord3D region, //!< Mapped region + uint mapFlags, //!< Map flags + bool entire, //!< True if the enitre memory was mapped + amd::Image* baseMip = nullptr //!< The base mip level for map + ); + + const WriteMapInfo* writeMapInfo(const void* mapAddress) const { + // Unmap must be serialized. 
+ amd::ScopedLock lock(owner()->lockMemoryOps()); + + auto it = writeMapInfo_.find(mapAddress); + if (it == writeMapInfo_.end()) { + if (writeMapInfo_.size() == 0) { + LogError("Unmap is a NOP!"); + return nullptr; + } + LogWarning("Unknown unmap signature!"); + // Get the first map info + it = writeMapInfo_.begin(); } + return &it->second; + } - enum GLResourceOP - { - GLDecompressResource = 0, // orders the GL driver to decompress any depth-stencil or MSAA resource to be sampled by a CL kernel. - GLInvalidateFBO // orders the GL driver to invalidate any FBO the resource may be bound to, since the resource internal state changed. - }; - - //! Default destructor for the device memory object - virtual ~Memory() {}; - - //! Releases virtual objects associated with this memory - void releaseVirtual(); - - //! Read the size - size_t size() const {return size_;} - - //! Gets the owner Memory instance - amd::Memory* owner() const { return owner_; } - - //! Immediate blocking write from device cache to owners's backing store. - //! Marks owner as "current" by resetting the last writer to NULL. - virtual void syncHostFromCache( - SyncFlags syncFlags = SyncFlags() - ) {} - - //! Allocate memory for API-level maps - virtual void* allocMapTarget( - const amd::Coord3D& origin, //!< The map location in memory - const amd::Coord3D& region, //!< The map region in memory - uint mapFlags, //!< Map flags - size_t* rowPitch = NULL, //!< Row pitch for the mapped memory - size_t* slicePitch = NULL //!< Slice for the mapped memory - ) { return NULL; } - - virtual bool pinSystemMemory( - void* hostPtr, //!< System memory address - size_t size //!< Size of allocated system memory - ) { return true; } - - //! Releases indirect map surface - virtual void releaseIndirectMap() {} - //! decompress any MSAA/depth-stencil interop surfaces. - //! notify GL to invalidate any surfaces touched by a CL kernel - virtual bool processGLResource(GLResourceOP operation) { return false;} - - //! 
Map the device memory to CPU visible - virtual void* cpuMap( - VirtualDevice& vDev, //!< Virtual device for map operaiton - uint flags = 0, //!< flags for the map operation - // Optimization for multilayer map/unmap - uint startLayer = 0, //!< Start layer for multilayer map - uint numLayers = 0, //!< End layer for multilayer map - size_t* rowPitch = NULL,//!< Row pitch for the device memory - size_t* slicePitch = NULL //!< Slice pitch for the device memory - ) - { - amd::Image* image = owner()->asImage(); - if (image != NULL) { - *rowPitch = image->getRowPitch(); - *slicePitch = image->getSlicePitch(); - } - // Default behavior uses preallocated host mem for CPU - return owner()->getHostMem(); - } - - //! Unmap the device memory - virtual void cpuUnmap( - VirtualDevice& vDev //!< Virtual device for unmap operaiton - ) {} - - //! Saves map info for this object - //! @note: It's not a thread safe operation, the app must implement - //! synchronization for the multiple write maps if necessary - void saveMapInfo( - const void* mapAddress, //!< Map cpu address - const amd::Coord3D origin, //!< Origin of the map location - const amd::Coord3D region, //!< Mapped region - uint mapFlags, //!< Map flags - bool entire, //!< True if the enitre memory was mapped - amd::Image* baseMip = nullptr //!< The base mip level for map - ); - - const WriteMapInfo* writeMapInfo(const void* mapAddress) const - { - // Unmap must be serialized. - amd::ScopedLock lock(owner()->lockMemoryOps()); - - auto it = writeMapInfo_.find(mapAddress); - if (it == writeMapInfo_.end()) { - if (writeMapInfo_.size() == 0) { - LogError("Unmap is a NOP!"); - return nullptr; - } - LogWarning("Unknown unmap signature!"); - // Get the first map info - it = writeMapInfo_.begin(); - } - return &it->second; + //! Clear memory object as mapped read only + void clearUnmapInfo(const void* mapAddress) { + // Unmap must be serialized. 
+ amd::ScopedLock lock(owner()->lockMemoryOps()); + auto it = writeMapInfo_.find(mapAddress); + if (it == writeMapInfo_.end()) { + // Get the first map info + it = writeMapInfo_.begin(); } - - //! Clear memory object as mapped read only - void clearUnmapInfo(const void* mapAddress) - { - // Unmap must be serialized. - amd::ScopedLock lock(owner()->lockMemoryOps()); - auto it = writeMapInfo_.find(mapAddress); - if (it == writeMapInfo_.end()) { - // Get the first map info - it = writeMapInfo_.begin(); - } - if (--it->second.count_ == 0) { - writeMapInfo_.erase(it); - } + if (--it->second.count_ == 0) { + writeMapInfo_.erase(it); } + } - //! Returns state of memory direct access flag - bool isHostMemDirectAccess() const - { return (flags_ & HostMemoryDirectAccess) ? true : false; } + //! Returns state of memory direct access flag + bool isHostMemDirectAccess() const { return (flags_ & HostMemoryDirectAccess) ? true : false; } - //! Returns state of host memory registration flag - bool isHostMemoryRegistered() const - { return (flags_ & HostMemoryRegistered) ? true : false; } + //! Returns state of host memory registration flag + bool isHostMemoryRegistered() const { return (flags_ & HostMemoryRegistered) ? 
true : false; } -protected: - enum Flags { - HostMemoryDirectAccess = 0x00000001, //!< GPU has direct access to the host memory - MapResourceAlloced = 0x00000002, //!< Map resource was allocated - PinnedMemoryAlloced = 0x00000004, //!< An extra pinned resource was allocated - SubMemoryObject = 0x00000008, //!< Memory is sub-memory - HostMemoryRegistered = 0x00000010, //!< Host memory was registered - }; - uint flags_; //!< Memory object flags + protected: + enum Flags { + HostMemoryDirectAccess = 0x00000001, //!< GPU has direct access to the host memory + MapResourceAlloced = 0x00000002, //!< Map resource was allocated + PinnedMemoryAlloced = 0x00000004, //!< An extra pinned resource was allocated + SubMemoryObject = 0x00000008, //!< Memory is sub-memory + HostMemoryRegistered = 0x00000010, //!< Host memory was registered + }; + uint flags_; //!< Memory object flags - amd::Memory* owner_; //!< The Memory instance that we cache, - //!< or NULL if we're device-private workspace. + amd::Memory* owner_; //!< The Memory instance that we cache, + //!< or NULL if we're device-private workspace. - volatile size_t version_; //!< The version we're currently shadowing + volatile size_t version_; //!< The version we're currently shadowing - //! NB, the map data below is for an API-level map (from clEnqueueMapBuffer), - //! not a physical map. When a memory object does not use USE_HOST_PTR we - //! can use a remote resource and DMA, avoiding the additional CPU memcpy. - amd::Memory* mapMemory_; //!< Memory used as map target buffer - volatile size_t indirectMapCount_; //!< Number of maps - std::map writeMapInfo_; //!< Saved write map info for partial unmap + //! NB, the map data below is for an API-level map (from clEnqueueMapBuffer), + //! not a physical map. When a memory object does not use USE_HOST_PTR we + //! can use a remote resource and DMA, avoiding the additional CPU memcpy. 
+ amd::Memory* mapMemory_; //!< Memory used as map target buffer + volatile size_t indirectMapCount_; //!< Number of maps + std::map writeMapInfo_; //!< Saved write map info for partial unmap - //! Increment map count - void incIndMapCount() { ++indirectMapCount_; } + //! Increment map count + void incIndMapCount() { ++indirectMapCount_; } - //! Decrement map count - virtual void decIndMapCount() {} + //! Decrement map count + virtual void decIndMapCount() {} -private: - //! Disable default copy constructor - Memory& operator=(const Memory&); + private: + //! Disable default copy constructor + Memory& operator=(const Memory&); - //! Disable operator= - Memory(const Memory&); + //! Disable operator= + Memory(const Memory&); - //! Our size - size_t size_; + //! Our size + size_t size_; }; -class Sampler : public amd::HeapObject -{ -public: - //! Constructor - Sampler() {} +class Sampler : public amd::HeapObject { + public: + //! Constructor + Sampler() {} - //! Default destructor for the device memory object - virtual ~Sampler() {}; + //! Default destructor for the device memory object + virtual ~Sampler(){}; - //! Returns device specific HW state for the sampler - uint64_t hwSrd() const { return hwSrd_; } + //! Returns device specific HW state for the sampler + uint64_t hwSrd() const { return hwSrd_; } -protected: - uint64_t hwSrd_; //!< Device specific HW state for the sampler + protected: + uint64_t hwSrd_; //!< Device specific HW state for the sampler -private: - //! Disable default copy constructor - Sampler& operator=(const Sampler&); + private: + //! Disable default copy constructor + Sampler& operator=(const Sampler&); - //! Disable operator= - Sampler(const Sampler&); + //! Disable operator= + Sampler(const Sampler&); }; //! 
\class DeviceKernel, which will contain the common fields for any device -class Kernel : public amd::HeapObject -{ -public: - typedef std::vector parameters_t; +class Kernel : public amd::HeapObject { + public: + typedef std::vector parameters_t; - //! \struct The device kernel workgroup info structure - struct WorkGroupInfo : public amd::EmbeddedObject - { - size_t size_; //!< kernel workgroup size - size_t compileSize_[3]; //!< kernel compiled workgroup size - cl_ulong localMemSize_; //!< amount of used local memory - size_t preferredSizeMultiple_; //!< preferred multiple for launch - cl_ulong privateMemSize_; //!< amount of used private memory - size_t scratchRegs_; //!< amount of used scratch registers - size_t wavefrontPerSIMD_; //!< number of wavefronts per SIMD - size_t wavefrontSize_; //!< number of threads per wavefront - size_t availableGPRs_; //!< GPRs available to the program - size_t usedGPRs_; //!< GPRs used by the program - size_t availableSGPRs_; //!< SGPRs available to the program - size_t usedSGPRs_; //!< SGPRs used by the program - size_t availableVGPRs_; //!< VGPRs available to the program - size_t usedVGPRs_; //!< VGPRs used by the program - size_t availableLDSSize_; //!< available LDS size - size_t usedLDSSize_; //!< used LDS size - size_t availableStackSize_; //!< available stack size - size_t usedStackSize_; //!< used stack size - size_t compileSizeHint_[3]; //!< kernel compiled workgroup size hint - std::string compileVecTypeHint_; //!< kernel compiled vector type hint - bool uniformWorkGroupSize_; //!< uniform work group size option - size_t wavesPerSimdHint_; //!< waves per simd hit - }; + //! 
\struct The device kernel workgroup info structure + struct WorkGroupInfo : public amd::EmbeddedObject { + size_t size_; //!< kernel workgroup size + size_t compileSize_[3]; //!< kernel compiled workgroup size + cl_ulong localMemSize_; //!< amount of used local memory + size_t preferredSizeMultiple_; //!< preferred multiple for launch + cl_ulong privateMemSize_; //!< amount of used private memory + size_t scratchRegs_; //!< amount of used scratch registers + size_t wavefrontPerSIMD_; //!< number of wavefronts per SIMD + size_t wavefrontSize_; //!< number of threads per wavefront + size_t availableGPRs_; //!< GPRs available to the program + size_t usedGPRs_; //!< GPRs used by the program + size_t availableSGPRs_; //!< SGPRs available to the program + size_t usedSGPRs_; //!< SGPRs used by the program + size_t availableVGPRs_; //!< VGPRs available to the program + size_t usedVGPRs_; //!< VGPRs used by the program + size_t availableLDSSize_; //!< available LDS size + size_t usedLDSSize_; //!< used LDS size + size_t availableStackSize_; //!< available stack size + size_t usedStackSize_; //!< used stack size + size_t compileSizeHint_[3]; //!< kernel compiled workgroup size hint + std::string compileVecTypeHint_; //!< kernel compiled vector type hint + bool uniformWorkGroupSize_; //!< uniform work group size option + size_t wavesPerSimdHint_; //!< waves per simd hit + }; - //! 
Default constructor - Kernel(const std::string& name): name_(name), signature_(NULL), hsa_(false) - { - // Instead of memset(&workGroupInfo_, '\0', sizeof(workGroupInfo_)); - // Due to std::string not being able to be memset to 0 - workGroupInfo_.size_ = 0; - workGroupInfo_.compileSize_[0] = 0; - workGroupInfo_.compileSize_[1] = 0; - workGroupInfo_.compileSize_[2] = 0; - workGroupInfo_.localMemSize_ = 0; - workGroupInfo_.preferredSizeMultiple_ = 0; - workGroupInfo_.privateMemSize_ = 0; - workGroupInfo_.scratchRegs_ = 0; - workGroupInfo_.wavefrontPerSIMD_ = 0; - workGroupInfo_.wavefrontSize_ = 0; - workGroupInfo_.availableGPRs_ = 0; - workGroupInfo_.usedGPRs_ = 0; - workGroupInfo_.availableSGPRs_ = 0; - workGroupInfo_.usedSGPRs_ = 0; - workGroupInfo_.availableVGPRs_ = 0; - workGroupInfo_.usedVGPRs_ = 0; - workGroupInfo_.availableLDSSize_ = 0; - workGroupInfo_.usedLDSSize_ = 0; - workGroupInfo_.availableStackSize_ = 0; - workGroupInfo_.usedStackSize_ = 0; - workGroupInfo_.compileSizeHint_[0] = 0; - workGroupInfo_.compileSizeHint_[1] = 0; - workGroupInfo_.compileSizeHint_[2] = 0; - workGroupInfo_.compileVecTypeHint_ = ""; - workGroupInfo_.uniformWorkGroupSize_ = false; - workGroupInfo_.wavesPerSimdHint_ = 0; - } + //! 
Default constructor + Kernel(const std::string& name) : name_(name), signature_(NULL), hsa_(false) { + // Instead of memset(&workGroupInfo_, '\0', sizeof(workGroupInfo_)); + // Due to std::string not being able to be memset to 0 + workGroupInfo_.size_ = 0; + workGroupInfo_.compileSize_[0] = 0; + workGroupInfo_.compileSize_[1] = 0; + workGroupInfo_.compileSize_[2] = 0; + workGroupInfo_.localMemSize_ = 0; + workGroupInfo_.preferredSizeMultiple_ = 0; + workGroupInfo_.privateMemSize_ = 0; + workGroupInfo_.scratchRegs_ = 0; + workGroupInfo_.wavefrontPerSIMD_ = 0; + workGroupInfo_.wavefrontSize_ = 0; + workGroupInfo_.availableGPRs_ = 0; + workGroupInfo_.usedGPRs_ = 0; + workGroupInfo_.availableSGPRs_ = 0; + workGroupInfo_.usedSGPRs_ = 0; + workGroupInfo_.availableVGPRs_ = 0; + workGroupInfo_.usedVGPRs_ = 0; + workGroupInfo_.availableLDSSize_ = 0; + workGroupInfo_.usedLDSSize_ = 0; + workGroupInfo_.availableStackSize_ = 0; + workGroupInfo_.usedStackSize_ = 0; + workGroupInfo_.compileSizeHint_[0] = 0; + workGroupInfo_.compileSizeHint_[1] = 0; + workGroupInfo_.compileSizeHint_[2] = 0; + workGroupInfo_.compileVecTypeHint_ = ""; + workGroupInfo_.uniformWorkGroupSize_ = false; + workGroupInfo_.wavesPerSimdHint_ = 0; + } - //! Default destructor - virtual ~Kernel(); + //! Default destructor + virtual ~Kernel(); - //! Validates memory argument - virtual bool validateMemory( - uint idx, //!< Argument's index - amd::Memory* amdMem //!< memory object for validation - ) const { return true; } + //! Validates memory argument + virtual bool validateMemory(uint idx, //!< Argument's index + amd::Memory* amdMem //!< memory object for validation + ) const { + return true; + } - //! Returns the kernel info structure - const WorkGroupInfo* workGroupInfo() const { return &workGroupInfo_; } + //! Returns the kernel info structure + const WorkGroupInfo* workGroupInfo() const { return &workGroupInfo_; } - //! 
Returns the kernel signature - const amd::KernelSignature& signature() const { return *signature_; } + //! Returns the kernel signature + const amd::KernelSignature& signature() const { return *signature_; } - //! Returns the kernel name - const std::string& name() const { return name_; } + //! Returns the kernel name + const std::string& name() const { return name_; } - //! Initializes the kernel parameters for the abstraction layer - bool createSignature(const parameters_t& params); + //! Initializes the kernel parameters for the abstraction layer + bool createSignature(const parameters_t& params); - //! Returns TRUE if it's a HSA kernel - bool hsa() const { return hsa_; } + //! Returns TRUE if it's a HSA kernel + bool hsa() const { return hsa_; } - void setUniformWorkGroupSize(bool u) - { - workGroupInfo_.uniformWorkGroupSize_ = u; - } + void setUniformWorkGroupSize(bool u) { workGroupInfo_.uniformWorkGroupSize_ = u; } - bool getUniformWorkGroupSize() const { - return workGroupInfo_.uniformWorkGroupSize_; - } + bool getUniformWorkGroupSize() const { return workGroupInfo_.uniformWorkGroupSize_; } - void setReqdWorkGroupSize(size_t x, size_t y, size_t z) - { - workGroupInfo_.compileSize_[0] = x; - workGroupInfo_.compileSize_[1] = y; - workGroupInfo_.compileSize_[2] = z; - } + void setReqdWorkGroupSize(size_t x, size_t y, size_t z) { + workGroupInfo_.compileSize_[0] = x; + workGroupInfo_.compileSize_[1] = y; + workGroupInfo_.compileSize_[2] = z; + } - size_t getReqdWorkGroupSize(int dim) { - return workGroupInfo_.compileSize_[dim]; - } + size_t getReqdWorkGroupSize(int dim) { return workGroupInfo_.compileSize_[dim]; } - void setWorkGroupSizeHint(size_t x, size_t y, size_t z) - { - workGroupInfo_.compileSizeHint_[0] = x; - workGroupInfo_.compileSizeHint_[1] = y; - workGroupInfo_.compileSizeHint_[2] = z; - } + void setWorkGroupSizeHint(size_t x, size_t y, size_t z) { + workGroupInfo_.compileSizeHint_[0] = x; + workGroupInfo_.compileSizeHint_[1] = y; + 
workGroupInfo_.compileSizeHint_[2] = z; + } - size_t getWorkGroupSizeHint(int dim) const { - return workGroupInfo_.compileSizeHint_[dim]; - } + size_t getWorkGroupSizeHint(int dim) const { return workGroupInfo_.compileSizeHint_[dim]; } - //! Get profiling callback object - virtual amd::ProfilingCallback* getProfilingCallback( - const device::VirtualDevice *vdv) { - return NULL; - } + //! Get profiling callback object + virtual amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdv) { + return NULL; + } - void setVecTypeHint(const std::string& hint) - { - workGroupInfo_.compileVecTypeHint_ = hint; - } + void setVecTypeHint(const std::string& hint) { workGroupInfo_.compileVecTypeHint_ = hint; } - void setLocalMemSize(size_t size) - { - workGroupInfo_.localMemSize_ = size; - } + void setLocalMemSize(size_t size) { workGroupInfo_.localMemSize_ = size; } - void setPreferredSizeMultiple(size_t size) - { - workGroupInfo_.preferredSizeMultiple_ = size; - } + void setPreferredSizeMultiple(size_t size) { workGroupInfo_.preferredSizeMultiple_ = size; } - //! Return the build log - const std::string& buildLog() const { return buildLog_; } + //! Return the build log + const std::string& buildLog() const { return buildLog_; } - static std::string openclMangledName(const std::string& name); + static std::string openclMangledName(const std::string& name); -protected: - std::string name_; //!< kernel name - WorkGroupInfo workGroupInfo_; //!< device kernel info structure - amd::KernelSignature* signature_; //!< kernel signature - bool hsa_; //!< True if HSA kernel on GPU - std::string buildLog_; //!< build log -private: - //! Disable default copy constructor - Kernel(const Kernel&); + protected: + std::string name_; //!< kernel name + WorkGroupInfo workGroupInfo_; //!< device kernel info structure + amd::KernelSignature* signature_; //!< kernel signature + bool hsa_; //!< True if HSA kernel on GPU + std::string buildLog_; //!< build log + private: + //! 
Disable default copy constructor + Kernel(const Kernel&); - //! Disable operator= - Kernel& operator=(const Kernel&); + //! Disable operator= + Kernel& operator=(const Kernel&); }; //! A program object for a specific device. -class Program : public amd::HeapObject -{ -public: - typedef std::pair binary_t; - typedef std::map kernels_t; - // type of the program - typedef enum { - TYPE_NONE = 0, // uncompiled - TYPE_COMPILED, // compiled - TYPE_LIBRARY, // linked library - TYPE_EXECUTABLE, // linked executable - TYPE_INTERMEDIATE // intermediate - } type_t; +class Program : public amd::HeapObject { + public: + typedef std::pair binary_t; + typedef std::map kernels_t; + // type of the program + typedef enum { + TYPE_NONE = 0, // uncompiled + TYPE_COMPILED, // compiled + TYPE_LIBRARY, // linked library + TYPE_EXECUTABLE, // linked executable + TYPE_INTERMEDIATE // intermediate + } type_t; -private: - //! The device target for this binary. - amd::SharedReference device_; + private: + //! The device target for this binary. + amd::SharedReference device_; - kernels_t kernels_; //!< The kernel entry points this binary. + kernels_t kernels_; //!< The kernel entry points this binary. - type_t type_; //!< type of this program + type_t type_; //!< type of this program -protected: - ClBinary* clBinary_; //!< The CL program binary file - std::string llvmBinary_; //!< LLVM IR binary code - amd::OclElf::oclElfSections elfSectionType_; //!< LLVM IR binary code is in SPIR format - std::string compileOptions_;//!< compile/build options. - std::string linkOptions_; //!< link options. - //!< the option arg passed in to clCompileProgram(), clLinkProgram(), - //! or clBuildProgram(), whichever is called last - std::string lastBuildOptionsArg_; - std::string buildLog_; //!< build log. - cl_int buildStatus_; //!< build status. - cl_int buildError_; //!< build error - //! The info target for this binary. 
- aclTargetInfo info_; - size_t globalVariableTotalSize_; + protected: + ClBinary* clBinary_; //!< The CL program binary file + std::string llvmBinary_; //!< LLVM IR binary code + amd::OclElf::oclElfSections elfSectionType_; //!< LLVM IR binary code is in SPIR format + std::string compileOptions_; //!< compile/build options. + std::string linkOptions_; //!< link options. + //!< the option arg passed in to clCompileProgram(), clLinkProgram(), + //! or clBuildProgram(), whichever is called last + std::string lastBuildOptionsArg_; + std::string buildLog_; //!< build log. + cl_int buildStatus_; //!< build status. + cl_int buildError_; //!< build error + //! The info target for this binary. + aclTargetInfo info_; + size_t globalVariableTotalSize_; -public: - //! Construct a section. - Program(amd::Device& device); + public: + //! Construct a section. + Program(amd::Device& device); - //! Destroy this binary image. - virtual ~Program(); + //! Destroy this binary image. + virtual ~Program(); - //! Destroy all the kernels - void clear(); + //! Destroy all the kernels + void clear(); - //! Return the compiler options passed to build this program - amd::option::Options* getCompilerOptions() const { return programOptions; } + //! Return the compiler options passed to build this program + amd::option::Options* getCompilerOptions() const { return programOptions; } - //! Compile the device program. - cl_int compile(const std::string& sourceCode, - const std::vector& headers, - const char** headerIncludeNames, - const char* origOptions, - amd::option::Options* options); - - //! Builds the device program. - cl_int link(const std::vector& inputPrograms, - const char* origOptions, - amd::option::Options* options); - - //! Builds the device program. - cl_int build(const std::string& sourceCode, - const char* origOptions, + //! Compile the device program. 
+ cl_int compile(const std::string& sourceCode, const std::vector& headers, + const char** headerIncludeNames, const char* origOptions, amd::option::Options* options); - //! Returns the device object, associated with this program. - const amd::Device& device() const { return device_(); } + //! Builds the device program. + cl_int link(const std::vector& inputPrograms, const char* origOptions, + amd::option::Options* options); - //! Return the compiler options used to build the program. - const std::string& compileOptions() const { return compileOptions_; } + //! Builds the device program. + cl_int build(const std::string& sourceCode, const char* origOptions, + amd::option::Options* options); - //! Return the option arg passed in to clCompileProgram(), clLinkProgram(), - //! or clBuildProgram(), whichever is called last - const std::string lastBuildOptionsArg() const { - return lastBuildOptionsArg_; - } + //! Returns the device object, associated with this program. + const amd::Device& device() const { return device_(); } - //! Return the build log. - const std::string& buildLog() const { return buildLog_; } + //! Return the compiler options used to build the program. + const std::string& compileOptions() const { return compileOptions_; } - //! Return the build status. - cl_build_status buildStatus() const { return buildStatus_; } + //! Return the option arg passed in to clCompileProgram(), clLinkProgram(), + //! or clBuildProgram(), whichever is called last + const std::string lastBuildOptionsArg() const { return lastBuildOptionsArg_; } - //! Return the build error. - cl_int buildError() const { return buildError_; } + //! Return the build log. + const std::string& buildLog() const { return buildLog_; } - //! Return the symbols vector. - const kernels_t& kernels() const { return kernels_; } - kernels_t& kernels() { return kernels_; } + //! Return the build status. + cl_build_status buildStatus() const { return buildStatus_; } - //! Return the binary image. 
- inline const binary_t binary() const; - inline binary_t binary(); + //! Return the build error. + cl_int buildError() const { return buildError_; } - //! Returns the CL program binary file - ClBinary* clBinary() { return clBinary_; } - const ClBinary* clBinary() const { return clBinary_; } + //! Return the symbols vector. + const kernels_t& kernels() const { return kernels_; } + kernels_t& kernels() { return kernels_; } - bool setBinary(char* binaryIn, size_t size); + //! Return the binary image. + inline const binary_t binary() const; + inline binary_t binary(); - type_t type() const { return type_; } + //! Returns the CL program binary file + ClBinary* clBinary() { return clBinary_; } + const ClBinary* clBinary() const { return clBinary_; } - void setGlobalVariableTotalSize(size_t size) { - globalVariableTotalSize_ = size; - } + bool setBinary(char* binaryIn, size_t size); - size_t globalVariableTotalSize() const { - return globalVariableTotalSize_; - } + type_t type() const { return type_; } -protected: - //! pre-compile setup - virtual bool initBuild(amd::option::Options* options); + void setGlobalVariableTotalSize(size_t size) { globalVariableTotalSize_ = size; } - //! post-compile cleanup - virtual bool finiBuild(bool isBuildGood); + size_t globalVariableTotalSize() const { return globalVariableTotalSize_; } - //! Compile the device program. - virtual bool compileImpl(const std::string& sourceCode, - const std::vector& headers, - const char** headerIncludeNames, - amd::option::Options* options) = 0; + protected: + //! pre-compile setup + virtual bool initBuild(amd::option::Options* options); - //! Link the device program. - virtual bool linkImpl(amd::option::Options* options) = 0; + //! post-compile cleanup + virtual bool finiBuild(bool isBuildGood); - //! Link the device programs. - virtual bool linkImpl(const std::vector& inputPrograms, - amd::option::Options* options, - bool createLibrary) = 0; + //! Compile the device program. 
+ virtual bool compileImpl(const std::string& sourceCode, + const std::vector& headers, + const char** headerIncludeNames, amd::option::Options* options) = 0; - virtual bool createBinary(amd::option::Options* options) = 0; + //! Link the device program. + virtual bool linkImpl(amd::option::Options* options) = 0; - virtual bool createBIFBinary(aclBinary* bin); + //! Link the device programs. + virtual bool linkImpl(const std::vector& inputPrograms, amd::option::Options* options, + bool createLibrary) = 0; - //! Initialize Binary (used only for clCreateProgramWithBinary()). - bool initClBinary(char* binaryIn, size_t size); + virtual bool createBinary(amd::option::Options* options) = 0; - //! Initialize Binary - virtual bool initClBinary() = 0; + virtual bool createBIFBinary(aclBinary* bin); - //! Release the Binary - virtual void releaseClBinary() = 0; + //! Initialize Binary (used only for clCreateProgramWithBinary()). + bool initClBinary(char* binaryIn, size_t size); - //! return target info - virtual const aclTargetInfo & info(const char * str = "") = 0; + //! Initialize Binary + virtual bool initClBinary() = 0; - virtual bool isElf(const char* bin) const = 0; + //! Release the Binary + virtual void releaseClBinary() = 0; - //! At linking time, get the set of compile options to be used from - //! the set of input program, warn if they have inconsisten compile - //! options. - bool getCompileOptionsAtLinking(const std::vector& inputPrograms, - const amd::option::Options* linkOptions); + //! return target info + virtual const aclTargetInfo& info(const char* str = "") = 0; - void setType(type_t newType) { type_ = newType; } + virtual bool isElf(const char* bin) const = 0; -private: - //! Disable default copy constructor - Program(const Program&); + //! At linking time, get the set of compile options to be used from + //! the set of input program, warn if they have inconsisten compile + //! options. 
+ bool getCompileOptionsAtLinking(const std::vector& inputPrograms, + const amd::option::Options* linkOptions); - //! Disable operator= - Program& operator=(const Program&); + void setType(type_t newType) { type_ = newType; } -public: - amd::option::Options* programOptions; + private: + //! Disable default copy constructor + Program(const Program&); + + //! Disable operator= + Program& operator=(const Program&); + + public: + amd::option::Options* programOptions; }; -class ClBinary : public amd::HeapObject -{ -public: +class ClBinary : public amd::HeapObject { + public: + enum BinaryImageFormat { + BIF_VERSION2 = 0, //!< Binary Image Format version 2.0 (ELF) + BIF_VERSION3 //!< Binary Image Format version 3.0 (ELF) + }; - enum BinaryImageFormat { - BIF_VERSION2 = 0, //!< Binary Image Format version 2.0 (ELF) - BIF_VERSION3 //!< Binary Image Format version 3.0 (ELF) - }; + //! Constructor + ClBinary(const amd::Device& dev, BinaryImageFormat bifVer = BIF_VERSION2); - //! Constructor - ClBinary(const amd::Device& dev, BinaryImageFormat bifVer = BIF_VERSION2); + //! Destructor + virtual ~ClBinary(); - //! Destructor - virtual ~ClBinary(); + void init(amd::option::Options* optionsObj, bool amdilRequired = false); - void init(amd::option::Options* optionsObj, bool amdilRequired = false); + /** called only in loading image routines, + never called in storing routines */ + bool setBinary(char* theBinary, size_t theBinarySize, bool allocated = false); - /** called only in loading image routines, - never called in storing routines */ - bool setBinary(char* theBinary, size_t theBinarySize, bool allocated=false); + //! setin elfIn_ + bool setElfIn(unsigned char eclass); + void resetElfIn(); - //! setin elfIn_ - bool setElfIn(unsigned char eclass); - void resetElfIn(); + //! set out elf + bool setElfOut(unsigned char eclass, const char* outFile); + void resetElfOut(); - //! set out elf - bool setElfOut(unsigned char eclass, const char* outFile); - void resetElfOut(); + //! 
Set elf header information + virtual bool setElfTarget() = 0; - //! Set elf header information - virtual bool setElfTarget() = 0; + // class used in for loading images in new format + amd::OclElf* elfIn() { return elfIn_; } - // class used in for loading images in new format - amd::OclElf* elfIn() { return elfIn_; } + // classes used storing and loading images in new format + amd::OclElf* elfOut() { return elfOut_; } + void elfOut(amd::OclElf* v) { elfOut_ = v; } - // classes used storing and loading images in new format - amd::OclElf* elfOut() { return elfOut_; } - void elfOut(amd::OclElf* v) { elfOut_ = v; } + //! Create and save ELF binary image + bool createElfBinary(bool doencrypt, Program::type_t type); - //! Create and save ELF binary image - bool createElfBinary(bool doencrypt, Program::type_t type); + // save BIF binary image + void saveBIFBinary(char* binaryIn, size_t size); - //save BIF binary image - void saveBIFBinary(char* binaryIn, size_t size); + bool decryptElf(char* binaryIn, size_t size, char** decryptBin, size_t* decryptSize, + int* encryptCode); - bool decryptElf(char* binaryIn, size_t size, - char** decryptBin, size_t* decryptSize, int* encryptCode); + //! Returns the binary pair for the abstraction layer + Program::binary_t data() const; - //! Returns the binary pair for the abstraction layer - Program::binary_t data() const; + //! Loads llvmir binary from OCL binary file + bool loadLlvmBinary( + std::string& llvmBinary, //!< LLVMIR binary code + amd::OclElf::oclElfSections& elfSectionType //!< LLVMIR binary is in SPIR format + ) const; - //! Loads llvmir binary from OCL binary file - bool loadLlvmBinary( - std::string& llvmBinary, //!< LLVMIR binary code - amd::OclElf::oclElfSections& elfSectionType //!< LLVMIR binary is in SPIR format - ) const; + //! Loads compile options from OCL binary file + bool loadCompileOptions(std::string& compileOptions //!< return the compile options loaded + ) const; - //! 
Loads compile options from OCL binary file - bool loadCompileOptions( - std::string& compileOptions //!< return the compile options loaded - ) const; + //! Loads link options from OCL binary file + bool loadLinkOptions(std::string& linkOptions //!< return the link options loaded + ) const; - //! Loads link options from OCL binary file - bool loadLinkOptions( - std::string& linkOptions //!< return the link options loaded - ) const; + //! Store compile options into OCL binary file + void storeCompileOptions(const std::string& compileOptions //!< the compile options to be stored + ); - //! Store compile options into OCL binary file - void storeCompileOptions( - const std::string& compileOptions //!< the compile options to be stored - ); + //! Store link options into OCL binary file + void storeLinkOptions(const std::string& linkOptions //!< the link options to be stored + ); - //! Store link options into OCL binary file - void storeLinkOptions( - const std::string& linkOptions //!< the link options to be stored - ); + //! Check if the binary is recompilable + bool isRecompilable(std::string& llvmBinary, amd::OclElf::oclElfPlatform thePlatform); - //! Check if the binary is recompilable - bool isRecompilable(std::string& llvmBinary, - amd::OclElf::oclElfPlatform thePlatform); + void saveOrigBinary(char* origBinary, size_t origSize) { + origBinary_ = origBinary; + origSize_ = origSize; + } - void saveOrigBinary(char* origBinary, size_t origSize) { - origBinary_ = origBinary; - origSize_ = origSize; + void restoreOrigBinary() { + if (origBinary_ != NULL) { + (void)setBinary(origBinary_, origSize_, false); } + } - void restoreOrigBinary() { - if (origBinary_ != NULL) { - (void)setBinary(origBinary_, origSize_, false); - } - } + //! Set Binary flags + void setFlags(int encryptCode); - //! 
Set Binary flags - void setFlags(int encryptCode); + bool saveSOURCE() { return ((flags_ & BinarySourceMask) == BinarySaveSource); } + bool saveLLVMIR() { return ((flags_ & BinaryLlvmirMask) == BinarySaveLlvmir); } + bool saveAMDIL() { return ((flags_ & BinaryAmdilMask) == BinarySaveAmdil); } + bool saveISA() { return ((flags_ & BinaryIsaMask) == BinarySaveIsa); } - bool saveSOURCE () { - return ((flags_ & BinarySourceMask) == BinarySaveSource); - } - bool saveLLVMIR () { - return ((flags_ & BinaryLlvmirMask) == BinarySaveLlvmir); - } - bool saveAMDIL () { - return ((flags_ & BinaryAmdilMask) == BinarySaveAmdil); - } - bool saveISA () { - return ((flags_ & BinaryIsaMask) == BinarySaveIsa); - } + bool saveAS() { return ((flags_ & BinaryASMask) == BinarySaveAS); } - bool saveAS () { - return ((flags_ & BinaryASMask) == BinarySaveAS); - } + // Return the encrypt code for this input binary ( "> 0" means encrypted) + int getEncryptCode() { return encryptCode_; } - // Return the encrypt code for this input binary ( "> 0" means encrypted) - int getEncryptCode() { return encryptCode_; } + // Returns TRUE of binary file is SPIR + bool isSPIR() const; + // Returns TRUE of binary file is SPIRV + bool isSPIRV() const; - // Returns TRUE of binary file is SPIR - bool isSPIR() const; - // Returns TRUE of binary file is SPIRV - bool isSPIRV() const; -protected: - enum Flags { - BinaryAllocated = 0x1, //!< Binary was created + protected: + enum Flags { + BinaryAllocated = 0x1, //!< Binary was created - // Source control - BinaryNoSaveSource = 0x0, // 0: default - BinaryRemoveSource = 0x2, // for encrypted binary - BinarySaveSource = 0x4, - BinarySourceMask = 0x6, + // Source control + BinaryNoSaveSource = 0x0, // 0: default + BinaryRemoveSource = 0x2, // for encrypted binary + BinarySaveSource = 0x4, + BinarySourceMask = 0x6, - // LLVMIR control - BinarySaveLlvmir = 0x0, // 0: default - BinaryRemoveLlvmir = 0x8, // for encrypted binary - BinaryNoSaveLlvmir = 0x10, - 
BinaryLlvmirMask = 0x18, + // LLVMIR control + BinarySaveLlvmir = 0x0, // 0: default + BinaryRemoveLlvmir = 0x8, // for encrypted binary + BinaryNoSaveLlvmir = 0x10, + BinaryLlvmirMask = 0x18, - // AMDIL control - BinarySaveAmdil = 0x0, // 0: default - BinaryRemoveAmdil = 0x20, // for encrypted binary - BinaryNoSaveAmdil = 0x40, - BinaryAmdilMask = 0x60, + // AMDIL control + BinarySaveAmdil = 0x0, // 0: default + BinaryRemoveAmdil = 0x20, // for encrypted binary + BinaryNoSaveAmdil = 0x40, + BinaryAmdilMask = 0x60, - // ISA control - BinarySaveIsa = 0x0, // 0: default - BinaryRemoveIsa = 0x80, // for encrypted binary - BinaryNoSaveIsa = 0x100, - BinaryIsaMask = 0x180, + // ISA control + BinarySaveIsa = 0x0, // 0: default + BinaryRemoveIsa = 0x80, // for encrypted binary + BinaryNoSaveIsa = 0x100, + BinaryIsaMask = 0x180, - // AS control - BinaryNoSaveAS = 0x0, // 0: default - BinaryRemoveAS = 0x200, // for encrypted binary - BinarySaveAS = 0x400, - BinaryASMask = 0x600 - }; + // AS control + BinaryNoSaveAS = 0x0, // 0: default + BinaryRemoveAS = 0x200, // for encrypted binary + BinarySaveAS = 0x400, + BinaryASMask = 0x600 + }; - //! Returns TRUE if binary file was allocated - bool isBinaryAllocated() const - { return (flags_ & BinaryAllocated) ? true : false; } + //! Returns TRUE if binary file was allocated + bool isBinaryAllocated() const { return (flags_ & BinaryAllocated) ? true : false; } - //! Returns BIF symbol name by symbolID, - //! returns empty string if not found or if BIF version is unsupported - std::string getBIFSymbol(unsigned int symbolID) const; + //! Returns BIF symbol name by symbolID, + //! returns empty string if not found or if BIF version is unsupported + std::string getBIFSymbol(unsigned int symbolID) const; -protected: - const amd::Device& dev_; //!< Device object + protected: + const amd::Device& dev_; //!< Device object -private: - //! Disable default copy constructor - ClBinary(const ClBinary&); + private: + //! 
Disable default copy constructor + ClBinary(const ClBinary&); - //! Disable default operator= - ClBinary& operator=(const ClBinary&); + //! Disable default operator= + ClBinary& operator=(const ClBinary&); - //! Releases the binary data store - void release(); + //! Releases the binary data store + void release(); - char* binary_; //!< binary data - size_t size_; //!< binary size - uint flags_; //!< CL binary object flags + char* binary_; //!< binary data + size_t size_; //!< binary size + uint flags_; //!< CL binary object flags - char* origBinary_; //!< original binary data - size_t origSize_; //!< original binary size + char* origBinary_; //!< original binary data + size_t origSize_; //!< original binary size - int encryptCode_; //!< Encryption Code for input binary (0 for not encrypted) + int encryptCode_; //!< Encryption Code for input binary (0 for not encrypted) -protected: - amd::OclElf *elfIn_; //!< ELF object for input ELF binary - amd::OclElf *elfOut_; //!< ELF object for output ELF binary - BinaryImageFormat format_; //!< which binary image format to use + protected: + amd::OclElf* elfIn_; //!< ELF object for input ELF binary + amd::OclElf* elfOut_; //!< ELF object for output ELF binary + BinaryImageFormat format_; //!< which binary image format to use }; inline const Program::binary_t Program::binary() const { - if (clBinary() == NULL) { - return std::make_pair((const void*)0, 0); - } - return clBinary()->data(); + if (clBinary() == NULL) { + return std::make_pair((const void*)0, 0); + } + return clBinary()->data(); } inline Program::binary_t Program::binary() { - if (clBinary() == NULL) { - return std::make_pair((const void*)0, 0); - } - return clBinary()->data(); + if (clBinary() == NULL) { + return std::make_pair((const void*)0, 0); + } + return clBinary()->data(); } /*! \class PerfCounter * * \brief The device interface class for the performance counters */ -class PerfCounter : public amd::HeapObject -{ -public: - //! 
Constructor for the device performance - PerfCounter() {} +class PerfCounter : public amd::HeapObject { + public: + //! Constructor for the device performance + PerfCounter() {} - //! Get the performance counter info - virtual uint64_t getInfo(uint64_t infoType) const = 0; + //! Get the performance counter info + virtual uint64_t getInfo(uint64_t infoType) const = 0; - //! Destructor for PerfCounter class - virtual ~PerfCounter() {} + //! Destructor for PerfCounter class + virtual ~PerfCounter() {} -private: - //! Disable default copy constructor - PerfCounter(const PerfCounter&); + private: + //! Disable default copy constructor + PerfCounter(const PerfCounter&); - //! Disable default operator= - PerfCounter& operator=(const PerfCounter&); + //! Disable default operator= + PerfCounter& operator=(const PerfCounter&); }; /*! \class ThreadTrace * * \brief The device interface class for the performance counters */ -class ThreadTrace : public amd::HeapObject -{ -public: - //! Constructor for the device performance - ThreadTrace() {} - //! Update ThreadTrace status to true/false if new buffer was binded/unbinded respectively - virtual void setNewBufferBinded(bool) = 0; - //! Get the performance counter info - virtual bool info(uint infoType, uint* info,uint infoSize) const = 0; - //! Destructor for PerfCounter class - virtual ~ThreadTrace() {} +class ThreadTrace : public amd::HeapObject { + public: + //! Constructor for the device performance + ThreadTrace() {} + //! Update ThreadTrace status to true/false if new buffer was binded/unbinded respectively + virtual void setNewBufferBinded(bool) = 0; + //! Get the performance counter info + virtual bool info(uint infoType, uint* info, uint infoSize) const = 0; + //! Destructor for PerfCounter class + virtual ~ThreadTrace() {} -private: - //! Disable default copy constructor - ThreadTrace(const ThreadTrace&); + private: + //! Disable default copy constructor + ThreadTrace(const ThreadTrace&); - //! 
Disable default operator= - ThreadTrace& operator=(const ThreadTrace&); + //! Disable default operator= + ThreadTrace& operator=(const ThreadTrace&); }; //! A device execution environment. -class VirtualDevice : public amd::HeapObject -{ -public: - //! Construct a new virtual device for the given physical device. - VirtualDevice(amd::Device& device) - : device_(device), blitMgr_(NULL) - { } +class VirtualDevice : public amd::HeapObject { + public: + //! Construct a new virtual device for the given physical device. + VirtualDevice(amd::Device& device) : device_(device), blitMgr_(NULL) {} - //! Destroy this virtual device. - virtual ~VirtualDevice() - { } + //! Destroy this virtual device. + virtual ~VirtualDevice() {} - //! Prepare this virtual device for destruction. - virtual bool terminate() = 0; + //! Prepare this virtual device for destruction. + virtual bool terminate() = 0; - //! Return the physical device for this virtual device. - const amd::Device& device() const { return device_(); } + //! Return the physical device for this virtual device. 
+ const amd::Device& device() const { return device_(); } - virtual void submitReadMemory(amd::ReadMemoryCommand& cmd) = 0; - virtual void submitWriteMemory(amd::WriteMemoryCommand& cmd) = 0; - virtual void submitCopyMemory(amd::CopyMemoryCommand& cmd) = 0; - virtual void submitMapMemory(amd::MapMemoryCommand& cmd) = 0; - virtual void submitUnmapMemory(amd::UnmapMemoryCommand& cmd) = 0; - virtual void submitKernel(amd::NDRangeKernelCommand& command) = 0; - virtual void submitNativeFn(amd::NativeFnCommand& cmd) = 0; - virtual void submitMarker(amd::Marker& cmd) = 0; - virtual void submitFillMemory(amd::FillMemoryCommand& cmd) = 0; - virtual void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd) = 0; - virtual void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& cmd) = 0; - virtual void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& cmd) = 0; - virtual void submitPerfCounter(amd::PerfCounterCommand& cmd) = 0; - virtual void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd) = 0; - virtual void submitThreadTrace(amd::ThreadTraceCommand& cmd) = 0; - virtual void flush(amd::Command* list = NULL, bool wait = false) = 0; - virtual void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd) = 0; - virtual void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) = 0; - virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) = 0; - virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) = 0; - virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) = 0; - /// Optional extensions - virtual void submitSignal(amd::SignalCommand & cmd) = 0; - virtual void submitMakeBuffersResident(amd::MakeBuffersResidentCommand & cmd) = 0; - virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd) { ShouldNotReachHere(); } + virtual void submitReadMemory(amd::ReadMemoryCommand& cmd) = 0; + virtual void submitWriteMemory(amd::WriteMemoryCommand& cmd) = 0; + virtual void 
submitCopyMemory(amd::CopyMemoryCommand& cmd) = 0; + virtual void submitMapMemory(amd::MapMemoryCommand& cmd) = 0; + virtual void submitUnmapMemory(amd::UnmapMemoryCommand& cmd) = 0; + virtual void submitKernel(amd::NDRangeKernelCommand& command) = 0; + virtual void submitNativeFn(amd::NativeFnCommand& cmd) = 0; + virtual void submitMarker(amd::Marker& cmd) = 0; + virtual void submitFillMemory(amd::FillMemoryCommand& cmd) = 0; + virtual void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd) = 0; + virtual void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& cmd) = 0; + virtual void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& cmd) = 0; + virtual void submitPerfCounter(amd::PerfCounterCommand& cmd) = 0; + virtual void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd) = 0; + virtual void submitThreadTrace(amd::ThreadTraceCommand& cmd) = 0; + virtual void flush(amd::Command* list = NULL, bool wait = false) = 0; + virtual void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd) = 0; + virtual void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) = 0; + virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) = 0; + virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) = 0; + virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) = 0; + /// Optional extensions + virtual void submitSignal(amd::SignalCommand& cmd) = 0; + virtual void submitMakeBuffersResident(amd::MakeBuffersResidentCommand& cmd) = 0; + virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd) { + ShouldNotReachHere(); + } - //! Get the blit manager object - device::BlitManager& blitMgr() const { return *blitMgr_; } + //! Get the blit manager object + device::BlitManager& blitMgr() const { return *blitMgr_; } -private: - //! Disable default copy constructor - VirtualDevice& operator=(const VirtualDevice&); + private: + //! 
Disable default copy constructor + VirtualDevice& operator=(const VirtualDevice&); - //! Disable operator= - VirtualDevice(const VirtualDevice&); + //! Disable operator= + VirtualDevice(const VirtualDevice&); - //! The physical device that this virtual device utilizes - amd::SharedReference device_; + //! The physical device that this virtual device utilizes + amd::SharedReference device_; -protected: - device::BlitManager* blitMgr_; //!< Blit manager + protected: + device::BlitManager* blitMgr_; //!< Blit manager }; -} // namespace device +} // namespace device namespace amd { //! SvmManager class -class SvmManager : public AllStatic -{ -public: - - static size_t size(); //!< obtain the size of the container - static void AddSvmBuffer(const void* k, amd::Memory* v); //!< add the svm pointer and buffer in the container - static void RemoveSvmBuffer(const void* k); //!< Remove an entry of svm info from the container - static amd::Memory* FindSvmBuffer(const void* k); //!< find the svm buffer based on the input pointer -private: - static std::map svmBufferMap_; //!< the svm space information container - static amd::Monitor AllocatedLock_; //!< amd monitor locker +class SvmManager : public AllStatic { + public: + static size_t size(); //!< obtain the size of the container + static void AddSvmBuffer(const void* k, + amd::Memory* v); //!< add the svm pointer and buffer in the container + static void RemoveSvmBuffer(const void* k); //!< Remove an entry of svm info from the container + static amd::Memory* FindSvmBuffer( + const void* k); //!< find the svm buffer based on the input pointer + private: + static std::map svmBufferMap_; //!< the svm space information container + static amd::Monitor AllocatedLock_; //!< amd monitor locker }; - /*! \addtogroup Runtime - * @{ - * - * \addtogroup Device Device Abstraction - * @{ - */ -class Device : public RuntimeObject -{ -protected: +/*! 
\addtogroup Runtime +* @{ +* +* \addtogroup Device Device Abstraction +* @{ +*/ +class Device : public RuntimeObject { + protected: #if defined(WITH_LIGHTNING_COMPILER) - typedef amd::opencl_driver::Compiler Compiler; -#else // !defined(WITH_LIGHTNING_COMPILER) - typedef aclCompiler Compiler; -#endif // !defined(WITH_LIGHTNING_COMPILER) + typedef amd::opencl_driver::Compiler Compiler; +#else // !defined(WITH_LIGHTNING_COMPILER) + typedef aclCompiler Compiler; +#endif // !defined(WITH_LIGHTNING_COMPILER) -public: - typedef std::list CommandQueues; + public: + typedef std::list CommandQueues; - struct BlitProgram : public amd::HeapObject - { - Program* program_; //!< GPU program obejct - Context* context_; //!< A dummy context + struct BlitProgram : public amd::HeapObject { + Program* program_; //!< GPU program obejct + Context* context_; //!< A dummy context - BlitProgram(Context* context): program_(NULL), context_(context) {} - ~BlitProgram(); + BlitProgram(Context* context) : program_(NULL), context_(context) {} + ~BlitProgram(); - //! Creates blit program for this device - bool create(Device* device, //!< Device object - const char* extraKernel = NULL, //!< Extra kernels from the device layer - const char* extraOptions = NULL //!< Extra compilation options - ); - }; + //! Creates blit program for this device + bool create(Device* device, //!< Device object + const char* extraKernel = NULL, //!< Extra kernels from the device layer + const char* extraOptions = NULL //!< Extra compilation options + ); + }; - virtual Compiler* compiler() const = 0; + virtual Compiler* compiler() const = 0; - Device(Device* parent = NULL); - virtual ~Device(); + Device(Device* parent = NULL); + virtual ~Device(); - //! Initializes abstraction layer device object - bool create(); + //! Initializes abstraction layer device object + bool create(); - //! Increment the reference count - uint retain() { - // Only increment the reference count of sub-devices - return !isRootDevice() ? 
RuntimeObject::retain() : 0u; + //! Increment the reference count + uint retain() { + // Only increment the reference count of sub-devices + return !isRootDevice() ? RuntimeObject::retain() : 0u; + } + + //! Decrement the reference count + uint release() { + // Only decrement the reference count of sub-devices + return !isRootDevice() ? RuntimeObject::release() : 0u; + } + + //! Register a device as available + void registerDevice(); + + //! Initialize the device layer (enumerate known devices) + static bool init(); + + //! Shutdown the device layer + static void tearDown(); + + static std::vector getDevices(cl_device_type type, //!< Device type + bool offlineDevices //!< Enable offline devices + ); + + static size_t numDevices(cl_device_type type, //!< Device type + bool offlineDevices //!< Enable offline devices + ); + + static bool getDeviceIDs(cl_device_type deviceType, //!< Device type + cl_uint numEntries, //!< Number of entries in the array + cl_device_id* devices, //!< Array of the device ID(s) + cl_uint* numDevices, //!< Number of available devices + bool offlineDevices //!< Report offline devices + ); + + const device::Info& info() const { return info_; } + + //! Return svm support capability. + bool svmSupport() const { + return (info().svmCapabilities_ & + (CL_DEVICE_SVM_COARSE_GRAIN_BUFFER | CL_DEVICE_SVM_FINE_GRAIN_BUFFER | + CL_DEVICE_SVM_FINE_GRAIN_SYSTEM)) != 0 + ? true + : false; + } + + //! check svm FGS support capability. + inline bool isFineGrainedSystem(bool FGSOPT = false) const { + return FGSOPT && (info().svmCapabilities_ & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) != 0 ? true + : false; + } + + //! Return this device's type. + cl_device_type type() const { return info().type_ & ~(CL_DEVICE_TYPE_DEFAULT); } + + //! Create sub-devices according to the given partition scheme. + virtual cl_int createSubDevices(device::CreateSubDevicesInfo& create_info, cl_uint num_entries, + cl_device_id* devices, cl_uint* num_devices) = 0; + + //! 
Create a new virtual device environment. + virtual device::VirtualDevice* createVirtualDevice(CommandQueue* queue = NULL) = 0; + + //! Create a program for device. + virtual device::Program* createProgram(option::Options* options = NULL) = 0; + + //! Allocate a chunk of device memory as a cache for a CL memory object + virtual device::Memory* createMemory(Memory& owner) const = 0; + + //! Allocate a device sampler object + virtual bool createSampler(const Sampler&, device::Sampler**) const = 0; + + //! Allocates a view object from the device memory + virtual device::Memory* createView( + amd::Memory& owner, //!< Owner memory object + const device::Memory& parent //!< Parent device memory object for the view + ) const = 0; + + //! Reallocates device memory object + virtual bool reallocMemory(Memory& owner) const = 0; + + //! Return true if initialized external API interop, otherwise false + virtual bool bindExternalDevice( + uint flags, //!< Enum val. for ext.API type: GL, D3D10, etc. + void* const pDevice[], //!< D3D device do D3D, HDC/Display handle of X Window for GL + void* pContext, //!< HGLRC/GLXContext handle + bool validateOnly //! Only validate if the device can inter-operate with pDevice/pContext, do + //! not bind. + ) = 0; + + virtual bool unbindExternalDevice( + uint flags, //!< Enum val. for ext.API type: GL, D3D10, etc. + void* const pDevice[], //!< D3D device do D3D, HDC/Display handle of X Window for GL + void* pContext, //!< HGLRC/GLXContext handle + bool validateOnly //! Only validate if the device can inter-operate with pDevice/pContext, do + //! not bind. + ) = 0; + + //! resolves GL depth/msaa buffer + virtual bool resolveGLMemory(device::Memory*) const { return true; } + + //! Gets a pointer to a region of host-visible memory for use as the target + //! 
of an indirect map for a given memory object + virtual void* allocMapTarget(amd::Memory& mem, //!< Abstraction layer memory object + const amd::Coord3D& origin, //!< The map location in memory + const amd::Coord3D& region, //!< The map region in memory + uint mapFlags, //!< Map flags + size_t* rowPitch = NULL, //!< Row pitch for the mapped memory + size_t* slicePitch = NULL //!< Slice for the mapped memory + ); + + //! Gets free memory on a GPU device + virtual bool globalFreeMemory(size_t* freeMemory //!< Free memory information on a GPU device + ) const = 0; + + /** + * @return True if the device has its own custom host allocator to be used + * instead of the generic OS allocation routines + */ + bool customHostAllocator() const { return settings().customHostAllocator_ == 1; } + + /** + * @copydoc amd::Context::hostAlloc + */ + virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const { + ShouldNotCallThis(); + return NULL; + } + + /** + * @copydoc amd::Context::hostFree + */ + virtual void hostFree(void* ptr, size_t size = 0) const { ShouldNotCallThis(); } + + /** + * @copydoc amd::Context::svmAlloc + */ + virtual void* svmAlloc(Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags, + void* svmPtr) const = 0; + /** + * @copydoc amd::Context::svmFree + */ + virtual void svmFree(void* ptr) const = 0; + + //! Validate kernel + virtual bool validateKernel(const amd::Kernel& kernel, const device::VirtualDevice* vdev) { + return true; + }; + + //! Returns TRUE if the device is available for computations + bool isOnline() const { return online_; } + //! Returns TRUE if the device is a root device (as opposed to sub-device) + bool isRootDevice() const { return parent_ == NULL; } + //! Returns TRUE if 'this' is an ancestor of the given sub-device. + bool isAncestor(const Device* sub) const; + + //! Return the parent device. + Device* parent() const { return parent_; } + + //! 
Return the root device for this instance; + Device& rootDevice() { + Device* root = this; + while (!root->isRootDevice()) { + root = root->parent_; } + return *root; + } - //! Decrement the reference count - uint release() { - // Only decrement the reference count of sub-devices - return !isRootDevice() ? RuntimeObject::release() : 0u; + const Device& rootDevice() const { + const Device* root = this; + while (!root->isRootDevice()) { + root = root->parent_; } + return *root; + } - //! Register a device as available - void registerDevice(); + //! Returns device settings + const device::Settings& settings() const { return *settings_; } - //! Initialize the device layer (enumerate known devices) - static bool init(); + //! Returns blit program info structure + BlitProgram* blitProgram() const { return blitProgram_; } - //! Shutdown the device layer - static void tearDown(); + //! RTTI internal implementation + virtual ObjectType objectType() const { return ObjectTypeDevice; } - static std::vector - getDevices( - cl_device_type type, //!< Device type - bool offlineDevices //!< Enable offline devices - ); + //! Returns app profile + static const AppProfile* appProfile() { return &appProfile_; } - static size_t - numDevices( - cl_device_type type, //!< Device type - bool offlineDevices //!< Enable offline devices - ); + //! Register a hardware debugger manager + HwDebugManager* hwDebugMgr() const { return hwDebugMgr_; } - static bool - getDeviceIDs( - cl_device_type deviceType, //!< Device type - cl_uint numEntries, //!< Number of entries in the array - cl_device_id* devices, //!< Array of the device ID(s) - cl_uint* numDevices, //!< Number of available devices - bool offlineDevices //!< Report offline devices - ); + //! Initialize the Hardware Debug Manager + virtual cl_int hwDebugManagerInit(amd::Context* context, uintptr_t messageStorage) { + return CL_SUCCESS; + } - const device::Info& info() const { return info_; } + //! 
Remove the Hardware Debug Manager + virtual void hwDebugManagerRemove() {} - //! Return svm support capability. - bool svmSupport() const { - return (info().svmCapabilities_ & - (CL_DEVICE_SVM_COARSE_GRAIN_BUFFER | - CL_DEVICE_SVM_FINE_GRAIN_BUFFER | - CL_DEVICE_SVM_FINE_GRAIN_SYSTEM)) != 0 ? true : false; - } + //! Adds GPU memory to the VA cache list + void addVACache(device::Memory* memory) const; - //! check svm FGS support capability. - inline bool isFineGrainedSystem(bool FGSOPT = false) const { - return FGSOPT && (info().svmCapabilities_ & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) != 0 ? true : false; - } + //! Removes GPU memory from the VA cache list + void removeVACache(const device::Memory* memory) const; - //! Return this device's type. - cl_device_type type() const { - return info().type_ & ~(CL_DEVICE_TYPE_DEFAULT); - } + //! Finds GPU memory from virtual address + device::Memory* findMemoryFromVA(const void* ptr, size_t* offset) const; - //! Create sub-devices according to the given partition scheme. - virtual cl_int createSubDevices( - device::CreateSubDevicesInfo& create_info, - cl_uint num_entries, - cl_device_id* devices, - cl_uint* num_devices) = 0; + protected: + //! Enable the specified extension + char* getExtensionString(); - //! Create a new virtual device environment. - virtual device::VirtualDevice* createVirtualDevice( - CommandQueue* queue = NULL - ) = 0; + device::Info info_; //!< Device info structure + device::Settings* settings_; //!< Device settings + bool online_; //!< The device in online + BlitProgram* blitProgram_; //!< Blit program info + static AppProfile appProfile_; //!< application profile + HwDebugManager* hwDebugMgr_; //!< Hardware Debug manager - //! Create a program for device. - virtual device::Program* createProgram(option::Options* options = NULL) = 0; - - //! Allocate a chunk of device memory as a cache for a CL memory object - virtual device::Memory* createMemory(Memory& owner) const = 0; - - //! 
Allocate a device sampler object - virtual bool createSampler(const Sampler&, device::Sampler**) const = 0; - - //! Allocates a view object from the device memory - virtual device::Memory* createView( - amd::Memory& owner, //!< Owner memory object - const device::Memory& parent //!< Parent device memory object for the view - ) const = 0; - - //! Reallocates device memory object - virtual bool reallocMemory(Memory& owner) const = 0; - - //! Return true if initialized external API interop, otherwise false - virtual bool bindExternalDevice( - uint flags, //!< Enum val. for ext.API type: GL, D3D10, etc. - void* const pDevice[], //!< D3D device do D3D, HDC/Display handle of X Window for GL - void* pContext, //!< HGLRC/GLXContext handle - bool validateOnly //! Only validate if the device can inter-operate with pDevice/pContext, do not bind. - ) = 0; - - virtual bool unbindExternalDevice( - uint flags, //!< Enum val. for ext.API type: GL, D3D10, etc. - void* const pDevice[], //!< D3D device do D3D, HDC/Display handle of X Window for GL - void* pContext, //!< HGLRC/GLXContext handle - bool validateOnly //! Only validate if the device can inter-operate with pDevice/pContext, do not bind. - ) = 0; - - //! resolves GL depth/msaa buffer - virtual bool resolveGLMemory(device::Memory*) const { return true; } - - //! Gets a pointer to a region of host-visible memory for use as the target - //! of an indirect map for a given memory object - virtual void* allocMapTarget( - amd::Memory& mem, //!< Abstraction layer memory object - const amd::Coord3D& origin, //!< The map location in memory - const amd::Coord3D& region, //!< The map region in memory - uint mapFlags, //!< Map flags - size_t* rowPitch = NULL, //!< Row pitch for the mapped memory - size_t* slicePitch = NULL //!< Slice for the mapped memory - ); - - //! 
Gets free memory on a GPU device - virtual bool globalFreeMemory( - size_t* freeMemory //!< Free memory information on a GPU device - ) const = 0; - - /** - * @return True if the device has its own custom host allocator to be used - * instead of the generic OS allocation routines - */ - bool customHostAllocator() const { - return settings().customHostAllocator_ == 1; - } - - /** - * @copydoc amd::Context::hostAlloc - */ - virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const - { - ShouldNotCallThis(); - return NULL; - } - - /** - * @copydoc amd::Context::hostFree - */ - virtual void hostFree(void* ptr, size_t size = 0) const - { - ShouldNotCallThis(); - } - - /** - * @copydoc amd::Context::svmAlloc - */ - virtual void* svmAlloc(Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags, void* svmPtr) const = 0; - /** - * @copydoc amd::Context::svmFree - */ - virtual void svmFree(void* ptr) const = 0; - - //! Validate kernel - virtual bool validateKernel( - const amd::Kernel& kernel, - const device::VirtualDevice* vdev) { return true; }; - - //! Returns TRUE if the device is available for computations - bool isOnline() const { return online_; } - //! Returns TRUE if the device is a root device (as opposed to sub-device) - bool isRootDevice() const { return parent_ == NULL; } - //! Returns TRUE if 'this' is an ancestor of the given sub-device. - bool isAncestor(const Device* sub) const; - - //! Return the parent device. - Device* parent() const { return parent_; } - - //! Return the root device for this instance; - Device& rootDevice() { - Device* root = this; - while (!root->isRootDevice()) { - root = root->parent_; - } - return *root; - } - - const Device& rootDevice() const { - const Device* root = this; - while (!root->isRootDevice()) { - root = root->parent_; - } - return *root; - } - - //! Returns device settings - const device::Settings& settings() const { return *settings_; } - - //! 
Returns blit program info structure - BlitProgram* blitProgram() const { return blitProgram_; } - - //! RTTI internal implementation - virtual ObjectType objectType() const {return ObjectTypeDevice;} - - //! Returns app profile - static const AppProfile* appProfile() {return &appProfile_;} - - //! Register a hardware debugger manager - HwDebugManager* hwDebugMgr() const { return hwDebugMgr_; } - - //! Initialize the Hardware Debug Manager - virtual cl_int hwDebugManagerInit(amd::Context *context, uintptr_t messageStorage) { return CL_SUCCESS; } - - //! Remove the Hardware Debug Manager - virtual void hwDebugManagerRemove() {} - - //! Adds GPU memory to the VA cache list - void addVACache(device::Memory* memory) const; - - //! Removes GPU memory from the VA cache list - void removeVACache(const device::Memory* memory) const; - - //! Finds GPU memory from virtual address - device::Memory* findMemoryFromVA(const void* ptr, size_t* offset) const; - -protected: - //! Enable the specified extension - char* getExtensionString(); - - device::Info info_; //!< Device info structure - device::Settings* settings_; //!< Device settings - bool online_; //!< The device in online - BlitProgram* blitProgram_; //!< Blit program info - static AppProfile appProfile_; //!< application profile - HwDebugManager* hwDebugMgr_; //!< Hardware Debug manager - -private: - bool IsTypeMatching(cl_device_type type, bool offlineDevices); + private: + bool IsTypeMatching(cl_device_type type, bool offlineDevices); #if defined(WITH_HSA_DEVICE) - static AppProfile* rocAppProfile_; + static AppProfile* rocAppProfile_; #endif - typedef std::vector::iterator device_iterator; - static std::vector* devices_; //!< All known devices + typedef std::vector::iterator device_iterator; + static std::vector* devices_; //!< All known devices - Device* parent_; //!< This device's parent - Monitor* vaCacheAccess_; //!< Lock to serialize VA caching access - std::map* vaCacheMap_; //!< VA cache map + Device* parent_; 
//!< This device's parent + Monitor* vaCacheAccess_; //!< Lock to serialize VA caching access + std::map* vaCacheMap_; //!< VA cache map }; -struct KernelParameterDescriptor -{ - const char* name_; //!< The parameter's name in the source - clk_value_type_t type_; //!< The parameter's type - size_t offset_; //!< Its offset in the parameter's stack - size_t size_; //!< Its size in bytes - //! Argument's address qualifier - cl_kernel_arg_address_qualifier addressQualifier_; - //! Argument's access qualifier - cl_kernel_arg_access_qualifier accessQualifier_; - //! Argument's type qualifier - cl_kernel_arg_type_qualifier typeQualifier_; - const char* typeName_; //!< Argument's type name +struct KernelParameterDescriptor { + const char* name_; //!< The parameter's name in the source + clk_value_type_t type_; //!< The parameter's type + size_t offset_; //!< Its offset in the parameter's stack + size_t size_; //!< Its size in bytes + //! Argument's address qualifier + cl_kernel_arg_address_qualifier addressQualifier_; + //! Argument's access qualifier + cl_kernel_arg_access_qualifier accessQualifier_; + //! Argument's type qualifier + cl_kernel_arg_type_qualifier typeQualifier_; + const char* typeName_; //!< Argument's type name }; #if defined(WITH_LIGHTNING_COMPILER) //! Compilation process with cache support. -class CacheCompilation : public amd::HeapObject -{ -public: +class CacheCompilation : public amd::HeapObject { + public: + enum COMPILER_OPERATION { LINK_LLVM_BITCODES = 0, COMPILE_TO_LLVM, COMPILE_AND_LINK_EXEC }; - enum COMPILER_OPERATION { - LINK_LLVM_BITCODES = 0, - COMPILE_TO_LLVM, - COMPILE_AND_LINK_EXEC - }; + //! Constructor + CacheCompilation(std::string targetStr, std::string postfix, bool enableCache, bool resetCache); - //! Constructor - CacheCompilation(std::string targetStr, - std::string postfix, - bool enableCache, - bool resetCache); + //! NB, the cacheOpt argument is used for specifying the operation + //! 
condition, normally would be the same as the options argument. + //! However, the cacheOpt argument should not include any option + //! that would be modified each time but not affect the operation, + //! e.g. output file name. - //! NB, the cacheOpt argument is used for specifying the operation - //! condition, normally would be the same as the options argument. - //! However, the cacheOpt argument should not include any option - //! that would be modified each time but not affect the operation, - //! e.g. output file name. + //! Link LLVM bitcode + bool linkLLVMBitcode(amd::opencl_driver::Compiler* C, + std::vector& inputs, + amd::opencl_driver::Buffer* output, std::vector& options, + std::string& buildLog); - //! Link LLVM bitcode - bool linkLLVMBitcode(amd::opencl_driver::Compiler* C, - std::vector& inputs, - amd::opencl_driver::Buffer* output, - std::vector& options, - std::string& buildLog); + //! Compile to LLVM bitcode + bool compileToLLVMBitcode(amd::opencl_driver::Compiler* C, + std::vector& inputs, + amd::opencl_driver::Buffer* output, std::vector& options, + std::string& buildLog); - //! Compile to LLVM bitcode - bool compileToLLVMBitcode(amd::opencl_driver::Compiler* C, - std::vector& inputs, - amd::opencl_driver::Buffer* output, - std::vector& options, - std::string& buildLog); + //! Compile and link executable + bool compileAndLinkExecutable(amd::opencl_driver::Compiler* C, + std::vector& inputs, + amd::opencl_driver::Buffer* output, + std::vector& options, std::string& buildLog); - //! Compile and link executable - bool compileAndLinkExecutable(amd::opencl_driver::Compiler* C, - std::vector& inputs, - amd::opencl_driver::Buffer* output, - std::vector& options, - std::string& buildLog); - -private: - StringCache codeCache_; //! Cached codes - const bool isCodeCacheEnabled_; //! Code cache enable + private: + StringCache codeCache_; //! Cached codes + const bool isCodeCacheEnabled_; //! 
Code cache enable }; #endif @@ -1897,6 +1775,6 @@ private: * @} */ -} // namespace amd +} // namespace amd #endif /*DEVICE_HPP_*/ diff --git a/rocclr/runtime/device/gpu/gpuappprofile.cpp b/rocclr/runtime/device/gpu/gpuappprofile.cpp index a244d2e22e..5225788e65 100644 --- a/rocclr/runtime/device/gpu/gpuappprofile.cpp +++ b/rocclr/runtime/device/gpu/gpuappprofile.cpp @@ -10,16 +10,11 @@ namespace gpu { AppProfile::AppProfile() - : amd::AppProfile() - , enableHighPerformanceState_(true) - , reportAsOCL12Device_(false) -{ - propertyDataMap_.insert(DataMap::value_type("HighPerfState", - PropertyData(DataType_Boolean, &enableHighPerformanceState_))); + : amd::AppProfile(), enableHighPerformanceState_(true), reportAsOCL12Device_(false) { + propertyDataMap_.insert(DataMap::value_type( + "HighPerfState", PropertyData(DataType_Boolean, &enableHighPerformanceState_))); - propertyDataMap_.insert(DataMap::value_type("OCL12Device", - PropertyData(DataType_Boolean, &reportAsOCL12Device_))); + propertyDataMap_.insert( + DataMap::value_type("OCL12Device", PropertyData(DataType_Boolean, &reportAsOCL12Device_))); } - } - diff --git a/rocclr/runtime/device/gpu/gpuappprofile.hpp b/rocclr/runtime/device/gpu/gpuappprofile.hpp index 348d847642..2216d2b44c 100644 --- a/rocclr/runtime/device/gpu/gpuappprofile.hpp +++ b/rocclr/runtime/device/gpu/gpuappprofile.hpp @@ -10,21 +10,18 @@ namespace gpu { -class AppProfile : public amd::AppProfile -{ -public: - AppProfile(); +class AppProfile : public amd::AppProfile { + public: + AppProfile(); - //! return the value of enableHighPerformanceState_ - bool enableHighPerformanceState() const { return enableHighPerformanceState_; } - bool reportAsOCL12Device() const { return reportAsOCL12Device_; } + //! 
return the value of enableHighPerformanceState_ + bool enableHighPerformanceState() const { return enableHighPerformanceState_; } + bool reportAsOCL12Device() const { return reportAsOCL12Device_; } -private: - - bool enableHighPerformanceState_; - bool reportAsOCL12Device_; + private: + bool enableHighPerformanceState_; + bool reportAsOCL12Device_; }; - } #endif diff --git a/rocclr/runtime/device/gpu/gpubinary.cpp b/rocclr/runtime/device/gpu/gpubinary.cpp index 37ead10c09..267aa9f255 100644 --- a/rocclr/runtime/device/gpu/gpubinary.cpp +++ b/rocclr/runtime/device/gpu/gpubinary.cpp @@ -12,537 +12,496 @@ namespace { -enum { - NDX_KERNEL = 0, - NDX_METADATA = 1, - NDX_HEADER = 2, - NDX_AMDIL = 3, - NDX_LAST -}; +enum { NDX_KERNEL = 0, NDX_METADATA = 1, NDX_HEADER = 2, NDX_AMDIL = 3, NDX_LAST }; typedef struct { - bool IsKernel; // whether the entry is for kernel + bool IsKernel; // whether the entry is for kernel - /* - SymInfo[NDX_KERNEL] : SymbolInfo for kernel isa (cal image) - SymInfo[NDX_METADATA] : SymbolInfo for kernel metadata - SymInfo[NDX_HEADER] : SymbolInfo for kernel header - SymInfo[NDX_AMDIL] : SymbolInfo for kernel's amdil - */ - amd::OclElf::SymbolInfo SymInfo[NDX_LAST]; + /* + SymInfo[NDX_KERNEL] : SymbolInfo for kernel isa (cal image) + SymInfo[NDX_METADATA] : SymbolInfo for kernel metadata + SymInfo[NDX_HEADER] : SymbolInfo for kernel header + SymInfo[NDX_AMDIL] : SymbolInfo for kernel's amdil + */ + amd::OclElf::SymbolInfo SymInfo[NDX_LAST]; } ElfSymbol_t; - } namespace gpu { -bool -ClBinary::loadKernels(NullProgram& program, bool* hasRecompiled) -{ - const char __OpenCL_[] = "__OpenCL_"; - const char _kernel[] = "_kernel"; - const char _data[] = "_metadata"; // metadata for kernel function - const char _fdata[] = "_fmetadata"; // metadata for non-kernel function - const char _header[] = "_header"; - const char _amdil[] = "_amdil"; +bool ClBinary::loadKernels(NullProgram& program, bool* hasRecompiled) { + const char __OpenCL_[] = "__OpenCL_"; + 
const char _kernel[] = "_kernel"; + const char _data[] = "_metadata"; // metadata for kernel function + const char _fdata[] = "_fmetadata"; // metadata for non-kernel function + const char _header[] = "_header"; + const char _amdil[] = "_amdil"; - *hasRecompiled = false; + *hasRecompiled = false; - // TODO : jugu - // Target should be 15 bit maximum. Should check this somewhere. - uint32_t target = static_cast(dev().calTarget()); - uint16_t elf_target; - amd::OclElf::oclElfPlatform platform; - if (!elfIn()->getTarget(elf_target, platform)) { - LogError("The OCL binary image loading failed: incorrect format"); - return false; + // TODO : jugu + // Target should be 15 bit maximum. Should check this somewhere. + uint32_t target = static_cast(dev().calTarget()); + uint16_t elf_target; + amd::OclElf::oclElfPlatform platform; + if (!elfIn()->getTarget(elf_target, platform)) { + LogError("The OCL binary image loading failed: incorrect format"); + return false; + } + if (platform == amd::OclElf::COMPLIB_PLATFORM) { + // BIF 3.0 + uint32_t flag; + aclTargetInfo tgtInfo = aclGetTargetInfo("amdil", dev().hwInfo()->targetName_, NULL); + if (!elfIn()->getFlags(flag)) { + LogError("The OCL binary image loading failed: incorrect format"); + return false; } - if (platform == amd::OclElf::COMPLIB_PLATFORM) { - // BIF 3.0 - uint32_t flag; - aclTargetInfo tgtInfo = aclGetTargetInfo("amdil", dev().hwInfo()->targetName_, NULL); - if (!elfIn()->getFlags(flag)){ - LogError("The OCL binary image loading failed: incorrect format"); - return false; - } - if ((elf_target != EM_AMDIL) || - (tgtInfo.chip_id != flag)) { - LogError("The OCL binary image loading failed: different target"); - return false; - } + if ((elf_target != EM_AMDIL) || (tgtInfo.chip_id != flag)) { + LogError("The OCL binary image loading failed: different target"); + return false; } - else { - if (((platform != amd::OclElf::CAL_PLATFORM) || - ((uint32_t)target != elf_target))) { - LogError("The OCL binary image loading 
failed: different target"); - return false; - } + } else { + if (((platform != amd::OclElf::CAL_PLATFORM) || ((uint32_t)target != elf_target))) { + LogError("The OCL binary image loading failed: different target"); + return false; } + } - /* Using class so that dtor() can be invoked to do clean-up */ - class TempWrapper { - public: - /* - functionNameMap[] maps from a function name (linkage name in the generated code) - to ElfSymbol_t, which is defined as above. - */ - std::map functionNameMap; - - // Keep all kernel ILs if -use-debugil is present (gpu debugging) - std::map kernelILs; - - ~TempWrapper () { - std::map::iterator - I, IB = functionNameMap.begin(), IE = functionNameMap.end(); - for (I = IB; I != IE; ++I) { - delete [] (*I).second; - } - - kernelILs.clear(); - } - } tempObj; - + /* Using class so that dtor() can be invoked to do clean-up */ + class TempWrapper { + public: /* - If usedebugil is true, we will load IL from .debugil section. We will ignore - _kernel, _amdil, _header in the binary. + functionNameMap[] maps from a function name (linkage name in the generated code) + to ElfSymbol_t, which is defined as above. */ - bool usedebugil = program.getCompilerOptions()->oVariables->UseDebugIL; + std::map functionNameMap; - for (amd::Sym_Handle sym = elfIn()->nextSymbol(NULL); - sym != NULL; - sym = elfIn()->nextSymbol(sym)) { - amd::OclElf::SymbolInfo symInfo; - if (!elfIn()->getSymbolInfo(sym, &symInfo)) { - LogError("LoadKernelFromElf: getSymbolInfo() fails"); - return false; - } + // Keep all kernel ILs if -use-debugil is present (gpu debugging) + std::map kernelILs; - std::string elfSymName(symInfo.sym_name); - const size_t offset = sizeof(__OpenCL_) - 1; - if (elfSymName.compare(0, offset, __OpenCL_) != 0) { - continue; - } + ~TempWrapper() { + std::map::iterator I, IB = functionNameMap.begin(), + IE = functionNameMap.end(); + for (I = IB; I != IE; ++I) { + delete[](*I).second; + } - // Assume this elfSymName is associated with a kernel name. 
The following code will adjust - // it if it isn't. - const size_t suffixPos = elfSymName.rfind('_'); - bool isKernel = true; // assume it is a kernel - std::string FName = elfSymName.substr(0, suffixPos); - FName.append("_kernel"); // make the kernel's linkage name + kernelILs.clear(); + } + } tempObj; - ElfSymbol_t* elfsymbol = tempObj.functionNameMap[FName]; - amd::OclElf::SymbolInfo* sinfo = (elfsymbol != NULL) ? &(elfsymbol->SymInfo[0]) : NULL; + /* + If usedebugil is true, we will load IL from .debugil section. We will ignore + _kernel, _amdil, _header in the binary. + */ + bool usedebugil = program.getCompilerOptions()->oVariables->UseDebugIL; - // Add info for this elf symbol into tempobj's functionNameMap[] - int index = -1; - if (!usedebugil && - (elfSymName.compare(suffixPos, sizeof(_kernel) - 1, _kernel) == 0)) { - index = NDX_KERNEL; - assert (((sinfo == NULL) || (sinfo[index].size == 0)) && - "More than one kernel symbol for the same kernel"); - } - else if (!usedebugil && - (elfSymName.compare(suffixPos, sizeof(_header) - 1, _header) == 0)) { - index = NDX_HEADER; - assert (((sinfo == NULL) || (sinfo[index].size == 0)) && - "More than one header symbol for a kernel"); - } - else if (!usedebugil && - (elfSymName.compare(suffixPos, sizeof(_amdil) - 1, _amdil) == 0)) { - index = NDX_AMDIL; - assert (((sinfo == NULL) || (sinfo[index].size == 0)) && - "More than one amdil symbol for a kernel"); - } - else if (elfSymName.compare(suffixPos, sizeof(_data) - 1, _data) == 0) { - index = NDX_METADATA; - assert (((sinfo == NULL) || (sinfo[index].size == 0)) && - "More than one metadata symbol for the same kernel"); - } - else if (elfSymName.compare(suffixPos, sizeof(_fdata) - 1, _fdata) == 0) { - index = NDX_METADATA; - isKernel = false; - - FName = elfSymName.substr(offset, suffixPos - offset); - - elfsymbol = tempObj.functionNameMap[FName]; - sinfo = (elfsymbol != NULL) ? 
&(elfsymbol->SymInfo[0]) : NULL; - - assert (((sinfo == NULL) || (sinfo[index].size == 0)) && - "More than one metadata symbol for a non-kernel function"); - } - - if (index >= 0) { - if (elfsymbol == NULL) { - elfsymbol = new ElfSymbol_t(); - sinfo = &(elfsymbol->SymInfo[0]); - ::memset(sinfo, 0, NDX_LAST * sizeof(amd::OclElf::SymbolInfo)); - tempObj.functionNameMap[FName] = elfsymbol; - - elfsymbol->IsKernel = isKernel; - } - sinfo[index] = symInfo; - } + for (amd::Sym_Handle sym = elfIn()->nextSymbol(NULL); sym != NULL; + sym = elfIn()->nextSymbol(sym)) { + amd::OclElf::SymbolInfo symInfo; + if (!elfIn()->getSymbolInfo(sym, &symInfo)) { + LogError("LoadKernelFromElf: getSymbolInfo() fails"); + return false; } - std::string programil; - if (usedebugil) { - char *section; - size_t sz; - - if (elfIn_->getSection(amd::OclElf::ILDEBUG, §ion, &sz)) { - // Get debugIL - programil.append(section, sz); - } - else { - LogError("LoadKernelFromElf(): reading .debugil failed"); - return false; - } - - // Append all function metadata to debugIL - std::map::iterator - I, IB = tempObj.functionNameMap.begin(), IE = tempObj.functionNameMap.end(); - for (I = IB; I != IE; ++I) { - ElfSymbol_t* elfsymbol = (*I).second; - if (elfsymbol == NULL) { - // Not valid, skip - continue; - } - if ( (elfsymbol->SymInfo[NDX_METADATA].address != 0) && - (elfsymbol->SymInfo[NDX_METADATA].size > 0) ) { - std::string mdString = std::string(elfsymbol->SymInfo[NDX_METADATA].address, - elfsymbol->SymInfo[NDX_METADATA].size); - assert ((mdString.find_first_of('\0') == std::string::npos) && - "Metadata string has NULL inside !"); - programil.append(mdString); - } - } - - const char* ilKernelName = - program.getCompilerOptions()->oVariables->JustKernel; - if (!program.getAllKernelILs(tempObj.kernelILs, programil, ilKernelName)) { - LogError("LoadKernelFromElf(): MDParser failed generating kernel ILs"); - return false; - } - - // Now, patch the IL from debugIL into functionNameMap[] - std::map::iterator - 
KI, KIB = tempObj.kernelILs.begin(), KIE = tempObj.kernelILs.end(); - for (KI = KIB; KI != KIE; ++KI) { - const std::string& kn = (*KI).first; - const std::string& ilstr = (*KI).second; - - ElfSymbol_t* elfsymbol = tempObj.functionNameMap[kn]; - if (elfsymbol == NULL) { - elfsymbol = new ElfSymbol_t(); - ::memset(elfsymbol->SymInfo, 0, NDX_LAST * sizeof(amd::OclElf::SymbolInfo)); - tempObj.functionNameMap[kn] = elfsymbol; - } - amd::OclElf::SymbolInfo* sinfo = &(elfsymbol->SymInfo[0]); - - elfsymbol->IsKernel = true; - sinfo[NDX_AMDIL].address = const_cast(ilstr.data()); - sinfo[NDX_AMDIL].size = ilstr.size(); - // All the other fields in SymInfo is unused - } + std::string elfSymName(symInfo.sym_name); + const size_t offset = sizeof(__OpenCL_) - 1; + if (elfSymName.compare(0, offset, __OpenCL_) != 0) { + continue; } - bool recompiled = false; - bool hasKernels = false; - std::map::iterator - I, IB = tempObj.functionNameMap.begin(), IE = tempObj.functionNameMap.end(); + // Assume this elfSymName is associated with a kernel name. The following code will adjust + // it if it isn't. + const size_t suffixPos = elfSymName.rfind('_'); + bool isKernel = true; // assume it is a kernel + std::string FName = elfSymName.substr(0, suffixPos); + FName.append("_kernel"); // make the kernel's linkage name + + ElfSymbol_t* elfsymbol = tempObj.functionNameMap[FName]; + amd::OclElf::SymbolInfo* sinfo = (elfsymbol != NULL) ? 
&(elfsymbol->SymInfo[0]) : NULL; + + // Add info for this elf symbol into tempobj's functionNameMap[] + int index = -1; + if (!usedebugil && (elfSymName.compare(suffixPos, sizeof(_kernel) - 1, _kernel) == 0)) { + index = NDX_KERNEL; + assert(((sinfo == NULL) || (sinfo[index].size == 0)) && + "More than one kernel symbol for the same kernel"); + } else if (!usedebugil && (elfSymName.compare(suffixPos, sizeof(_header) - 1, _header) == 0)) { + index = NDX_HEADER; + assert(((sinfo == NULL) || (sinfo[index].size == 0)) && + "More than one header symbol for a kernel"); + } else if (!usedebugil && (elfSymName.compare(suffixPos, sizeof(_amdil) - 1, _amdil) == 0)) { + index = NDX_AMDIL; + assert(((sinfo == NULL) || (sinfo[index].size == 0)) && + "More than one amdil symbol for a kernel"); + } else if (elfSymName.compare(suffixPos, sizeof(_data) - 1, _data) == 0) { + index = NDX_METADATA; + assert(((sinfo == NULL) || (sinfo[index].size == 0)) && + "More than one metadata symbol for the same kernel"); + } else if (elfSymName.compare(suffixPos, sizeof(_fdata) - 1, _fdata) == 0) { + index = NDX_METADATA; + isKernel = false; + + FName = elfSymName.substr(offset, suffixPos - offset); + + elfsymbol = tempObj.functionNameMap[FName]; + sinfo = (elfsymbol != NULL) ? 
&(elfsymbol->SymInfo[0]) : NULL; + + assert(((sinfo == NULL) || (sinfo[index].size == 0)) && + "More than one metadata symbol for a non-kernel function"); + } + + if (index >= 0) { + if (elfsymbol == NULL) { + elfsymbol = new ElfSymbol_t(); + sinfo = &(elfsymbol->SymInfo[0]); + ::memset(sinfo, 0, NDX_LAST * sizeof(amd::OclElf::SymbolInfo)); + tempObj.functionNameMap[FName] = elfsymbol; + + elfsymbol->IsKernel = isKernel; + } + sinfo[index] = symInfo; + } + } + + std::string programil; + if (usedebugil) { + char* section; + size_t sz; + + if (elfIn_->getSection(amd::OclElf::ILDEBUG, §ion, &sz)) { + // Get debugIL + programil.append(section, sz); + } else { + LogError("LoadKernelFromElf(): reading .debugil failed"); + return false; + } + + // Append all function metadata to debugIL + std::map::iterator I, IB = tempObj.functionNameMap.begin(), + IE = tempObj.functionNameMap.end(); for (I = IB; I != IE; ++I) { - ElfSymbol_t* elfsymbol = (*I).second; - if (elfsymbol == NULL) { - // Not valid, skip - continue; - } - else if (!elfsymbol->IsKernel) { - // Not a kernel. Add its metadata to the OCL binary in case recompilation happens - // and the new binary is needed. - if (saveAMDIL()&& - (elfsymbol->SymInfo[NDX_METADATA].size > 0)) { - std::string fmetadata = "__OpenCL_"; - fmetadata.append((*I).first); - fmetadata.append("_fmetadata"); - - if (!elfOut()->addSymbol(amd::OclElf::RODATA, fmetadata.c_str(), - elfsymbol->SymInfo[NDX_METADATA].address, - elfsymbol->SymInfo[NDX_METADATA].size)) { - LogError ("AddSymbol() failed to add fmetadata"); - return false; - } - } - continue; - } - amd::OclElf::SymbolInfo* sinfo = &(elfsymbol->SymInfo[0]); - std::string FName = (*I).first; - - // For this kernel, get the demangled kernel name, which is used to identify each kernel. 
- const size_t name_sz = FName.size() - (sizeof(_kernel) - 1) - (sizeof(__OpenCL_) - 1); - std::string demangledKName = FName.substr(sizeof(__OpenCL_) - 1, name_sz); - - // Check if the current entry is valid - if (((sinfo[NDX_HEADER].size <= 0) || (sinfo[NDX_KERNEL].size <= 0)) && - (sinfo[NDX_AMDIL].size <= 0)) { - std::string tlog = "Warning: both IL and CAL Image are not available for kernel " + - demangledKName; - LogWarning (tlog.c_str()); - continue; - } - hasKernels = true; - - Kernel::InitData initData = {0}; - std::string ilSource(sinfo[NDX_AMDIL].address, sinfo[NDX_AMDIL].size); - std::string metadata(sinfo[NDX_METADATA].address, sinfo[NDX_METADATA].size); - if ((sinfo[NDX_HEADER].size <= 0) || (sinfo[NDX_KERNEL].size <= 0)) { - // IL recompilation - // TODO: global data recompilation as well. - // 1) parse IL; 2) parse metadata to set up kernel header - size_t pos; - if (!program.findAllILFuncs((programil.size() ? programil : ilSource), pos)) { - program.freeAllILFuncs(); - return false; - } - - bool isFailed = false; - for (uint32_t i=0; i < program.funcs_.size(); ++i) { - ILFunc *func = program.funcs_[i]; - ElfSymbol_t *sym = tempObj.functionNameMap[func->name_]; - if (sym == NULL) { - // No metadata for this function. 
- continue; - } - - assert ((func->metadata_.end_ == 0) && "ILFunc init failed"); - amd::OclElf::SymbolInfo* si = &(sym->SymInfo[0]); - if (si[NDX_METADATA].size > 0) { - std::string meta(si[NDX_METADATA].address, si[NDX_METADATA].size); - if (!program.parseFuncMetadata(meta, 0, std::string::npos)) { - isFailed = true; - break; - } - if (func->metadata_.end_ != std::string::npos) { - assert( false && "ILFunc name and index does not match"); - isFailed = true; - break; - } - - // Accumulate all emulated local, region and private sizes, - // necessary for the kernel execution - initData.localSize_ += func->localSize_; - initData.privateSize_ += func->privateSize_; - - // Accumulate all HW local, region and private sizes, - // necessary for the kernel execution - initData.hwLocalSize_ += func->hwLocalSize_; - initData.hwPrivateSize_ += func->hwPrivateSize_; - initData.flags_ |= func->flags_; - } - } - - program.freeAllILFuncs(); - if (isFailed) { - return false; - } - } - else { - KernelHeaderSymbol kHeader = {0}; - ::memcpy(&kHeader, sinfo[NDX_HEADER].address, - (sizeof(kHeader) < sinfo[NDX_HEADER].size) - ? 
sizeof(kHeader) - : sinfo[NDX_HEADER].size); - - if (kHeader.version_ > VERSION_CURRENT) { - LogError("LoadKernelFromElf: cannot handle the newer version of the binary"); - return false; - } - - // VERSION_0 - initData.localSize_ = kHeader.localSize_; - initData.hwLocalSize_ = kHeader.hwLocalSize_; - initData.privateSize_ = kHeader.privateSize_; - initData.hwPrivateSize_ = kHeader.hwPrivateSize_; - initData.flags_ = kHeader.flags_; - } - - bool created; - NullKernel* gpuKernel = program.createKernel(demangledKName, &initData, ilSource, metadata, - &created, sinfo[NDX_KERNEL].address, sinfo[NDX_KERNEL].size); - if (!created) { - std::string tlog = "Error: Creating kernel during loading OCL binary " + - demangledKName + " failed!"; - LogError(tlog.c_str()); - return false; - } - - recompiled = recompiled || (sinfo[NDX_KERNEL].size == 0); - - // Add the current kernel to the OCL binary in case recompilation happens and - // the new binary is needed. - if (!storeKernel(demangledKName, gpuKernel, &initData, metadata, ilSource)) { - return false; - } + ElfSymbol_t* elfsymbol = (*I).second; + if (elfsymbol == NULL) { + // Not valid, skip + continue; + } + if ((elfsymbol->SymInfo[NDX_METADATA].address != 0) && + (elfsymbol->SymInfo[NDX_METADATA].size > 0)) { + std::string mdString = std::string(elfsymbol->SymInfo[NDX_METADATA].address, + elfsymbol->SymInfo[NDX_METADATA].size); + assert((mdString.find_first_of('\0') == std::string::npos) && + "Metadata string has NULL inside !"); + programil.append(mdString); + } } - *hasRecompiled = recompiled; - return hasKernels; -} - -bool -ClBinary::storeKernel( - const std::string& name, - const NullKernel* nullKernel, - Kernel::InitData* initData, - const std::string& metadata, - const std::string& ilSource) -{ - if (!saveISA() && !saveAMDIL()) { - return true; + const char* ilKernelName = program.getCompilerOptions()->oVariables->JustKernel; + if (!program.getAllKernelILs(tempObj.kernelILs, programil, ilKernelName)) { + 
LogError("LoadKernelFromElf(): MDParser failed generating kernel ILs"); + return false; } - // should we save kernel metadata only under saveAMDIL()? - bool kernelMetaStored = false; + // Now, patch the IL from debugIL into functionNameMap[] + std::map::iterator KI, KIB = tempObj.kernelILs.begin(), + KIE = tempObj.kernelILs.end(); + for (KI = KIB; KI != KIE; ++KI) { + const std::string& kn = (*KI).first; + const std::string& ilstr = (*KI).second; - if (saveAMDIL() && (ilSource.size() > 0)) { - // Save IL (this is the per-kernel IL) - std::string ilName = "__OpenCL_" + name + "_amdil"; - if (!elfOut()->addSymbol(amd::OclElf::ILTEXT, ilName.c_str(), - ilSource.data(), ilSource.size())) { - LogError ("AddElfSymbol failed"); - return false; - } + ElfSymbol_t* elfsymbol = tempObj.functionNameMap[kn]; + if (elfsymbol == NULL) { + elfsymbol = new ElfSymbol_t(); + ::memset(elfsymbol->SymInfo, 0, NDX_LAST * sizeof(amd::OclElf::SymbolInfo)); + tempObj.functionNameMap[kn] = elfsymbol; + } + amd::OclElf::SymbolInfo* sinfo = &(elfsymbol->SymInfo[0]); - std::string metaName = "__OpenCL_" + name + "_metadata"; - // Save metadata symbols in .rodata - if (!elfOut()->addSymbol(amd::OclElf::RODATA, metaName.c_str(), - metadata.data(), metadata.size())) { - LogError ("AddElfSymbol failed"); - return false; - } - kernelMetaStored = true; + elfsymbol->IsKernel = true; + sinfo[NDX_AMDIL].address = const_cast(ilstr.data()); + sinfo[NDX_AMDIL].size = ilstr.size(); + // All the other fields in SymInfo is unused } + } - if (!saveISA()) { - return true; + bool recompiled = false; + bool hasKernels = false; + std::map::iterator I, IB = tempObj.functionNameMap.begin(), + IE = tempObj.functionNameMap.end(); + for (I = IB; I != IE; ++I) { + ElfSymbol_t* elfsymbol = (*I).second; + if (elfsymbol == NULL) { + // Not valid, skip + continue; + } else if (!elfsymbol->IsKernel) { + // Not a kernel. Add its metadata to the OCL binary in case recompilation happens + // and the new binary is needed. 
+ if (saveAMDIL() && (elfsymbol->SymInfo[NDX_METADATA].size > 0)) { + std::string fmetadata = "__OpenCL_"; + fmetadata.append((*I).first); + fmetadata.append("_fmetadata"); + + if (!elfOut()->addSymbol(amd::OclElf::RODATA, fmetadata.c_str(), + elfsymbol->SymInfo[NDX_METADATA].address, + elfsymbol->SymInfo[NDX_METADATA].size)) { + LogError("AddSymbol() failed to add fmetadata"); + return false; + } + } + continue; } + amd::OclElf::SymbolInfo* sinfo = &(elfsymbol->SymInfo[0]); + std::string FName = (*I).first; - size_t binarySize = (nullKernel != NULL) ? nullKernel->getCalBinarySize() : 0; - if (binarySize != 0) { - if (!kernelMetaStored) { - std::string metaName = "__OpenCL_" + name + "_metadata"; - // Save metadata symbols in .rodata - if (!elfOut()->addSymbol(amd::OclElf::RODATA, metaName.c_str(), - metadata.data(), metadata.size())) { - LogError ("AddSymbol failed"); - return false; - } - } - // Save kernel symbol that is associated with GPU ISA - std::string kernelName = "__OpenCL_" + name + "_kernel"; - uint8_t* isacode = new uint8_t[binarySize]; - if (!nullKernel->getCalBinary( - reinterpret_cast(isacode), binarySize)) { - LogError("Failed to read GPU kernel isa"); - delete [] isacode; - return false; - } - if (!elfOut()->addSymbol(amd::OclElf::CAL, kernelName.c_str(), - isacode, binarySize)) { - LogError ("AddElfSymbol failed"); - return false; - } - delete [] isacode; + // For this kernel, get the demangled kernel name, which is used to identify each kernel. 
+ const size_t name_sz = FName.size() - (sizeof(_kernel) - 1) - (sizeof(__OpenCL_) - 1); + std::string demangledKName = FName.substr(sizeof(__OpenCL_) - 1, name_sz); - // Save kernel header information into a pseudo symbol - // __OpenCL__header - // for example, given a kernel foo, this pseudo symbol - // would be __OpenCL_foo_header - std::string headerName = "__OpenCL_" + name + "_header"; - KernelHeaderSymbol kHeader; - // VERSION_0 - kHeader.privateSize_ = initData->privateSize_; - kHeader.localSize_ = initData->localSize_; - kHeader.regionSize_ = 0; - kHeader.hwPrivateSize_ = initData->hwPrivateSize_; - kHeader.hwLocalSize_ = initData->hwLocalSize_; - kHeader.hwRegionSize_ = 0; - kHeader.flags_ = initData->flags_; - - // VERSION_1 - kHeader.version_ = VERSION_CURRENT; - - if (!elfOut()->addSymbol(amd::OclElf::RODATA, headerName.c_str(), - &kHeader, sizeof(kHeader))) { - LogError("AddElfSymbol failed"); - return false; - } + // Check if the current entry is valid + if (((sinfo[NDX_HEADER].size <= 0) || (sinfo[NDX_KERNEL].size <= 0)) && + (sinfo[NDX_AMDIL].size <= 0)) { + std::string tlog = + "Warning: both IL and CAL Image are not available for kernel " + demangledKName; + LogWarning(tlog.c_str()); + continue; } - return true; -} + hasKernels = true; -bool -ClBinary::loadGlobalData(Program& program) -{ - const char __OpenCL_[] = "__OpenCL_"; - const char _global[] = "_global"; - - for (amd::Sym_Handle sym = elfIn()->nextSymbol(NULL); - sym != NULL; - sym = elfIn()->nextSymbol(sym)) { - amd::OclElf::SymbolInfo symInfo; - if (!elfIn()->getSymbolInfo(sym, &symInfo)) { - LogError("LoadGlobalDataFromElf: getSymbolInfo() fails"); - return false; - } - - std::string globalName(symInfo.sym_name); - const size_t offset = sizeof(__OpenCL_) - 1; - if (globalName.compare(0, offset, __OpenCL_) != 0) { - continue; - } - const size_t suffixPos = globalName.rfind('_'); - if (globalName.compare(suffixPos, sizeof(_global) - 1, _global) != 0) { - continue; - } - - // Get index 
for this global - std::string indexString = globalName.substr(offset, suffixPos - offset); - uint index = ::atoi(indexString.c_str()); - - if (!program.allocGlobalData(symInfo.address, symInfo.size, index)) { - LogError("Couldn't load global data"); - return false; - } - } - - return true; -} - -bool -ClBinary::storeGlobalData(const void* globalData, size_t dataSize, uint index) -{ - // For each global, use "__OpenCL_" as its name - // Since there is no name in amdil, just use "__OpenCL__global" for now. - std::stringstream glbName; - glbName << "__OpenCL_" << index << "_global"; - - if (!elfOut()->addSymbol(amd::OclElf::RODATA, glbName.str().c_str(), - globalData, dataSize)) { - LogError("addSymbol() failed"); + Kernel::InitData initData = {0}; + std::string ilSource(sinfo[NDX_AMDIL].address, sinfo[NDX_AMDIL].size); + std::string metadata(sinfo[NDX_METADATA].address, sinfo[NDX_METADATA].size); + if ((sinfo[NDX_HEADER].size <= 0) || (sinfo[NDX_KERNEL].size <= 0)) { + // IL recompilation + // TODO: global data recompilation as well. + // 1) parse IL; 2) parse metadata to set up kernel header + size_t pos; + if (!program.findAllILFuncs((programil.size() ? programil : ilSource), pos)) { + program.freeAllILFuncs(); return false; - } - return true; -} + } -bool -ClBinary::clearElfOut() -{ - // Recreate libelf elf object - if (!elfOut()->Clear()) { + bool isFailed = false; + for (uint32_t i = 0; i < program.funcs_.size(); ++i) { + ILFunc* func = program.funcs_[i]; + ElfSymbol_t* sym = tempObj.functionNameMap[func->name_]; + if (sym == NULL) { + // No metadata for this function. 
+ continue; + } + + assert((func->metadata_.end_ == 0) && "ILFunc init failed"); + amd::OclElf::SymbolInfo* si = &(sym->SymInfo[0]); + if (si[NDX_METADATA].size > 0) { + std::string meta(si[NDX_METADATA].address, si[NDX_METADATA].size); + if (!program.parseFuncMetadata(meta, 0, std::string::npos)) { + isFailed = true; + break; + } + if (func->metadata_.end_ != std::string::npos) { + assert(false && "ILFunc name and index does not match"); + isFailed = true; + break; + } + + // Accumulate all emulated local, region and private sizes, + // necessary for the kernel execution + initData.localSize_ += func->localSize_; + initData.privateSize_ += func->privateSize_; + + // Accumulate all HW local, region and private sizes, + // necessary for the kernel execution + initData.hwLocalSize_ += func->hwLocalSize_; + initData.hwPrivateSize_ += func->hwPrivateSize_; + initData.flags_ |= func->flags_; + } + } + + program.freeAllILFuncs(); + if (isFailed) { return false; + } + } else { + KernelHeaderSymbol kHeader = {0}; + ::memcpy(&kHeader, sinfo[NDX_HEADER].address, (sizeof(kHeader) < sinfo[NDX_HEADER].size) + ? 
sizeof(kHeader) + : sinfo[NDX_HEADER].size); + + if (kHeader.version_ > VERSION_CURRENT) { + LogError("LoadKernelFromElf: cannot handle the newer version of the binary"); + return false; + } + + // VERSION_0 + initData.localSize_ = kHeader.localSize_; + initData.hwLocalSize_ = kHeader.hwLocalSize_; + initData.privateSize_ = kHeader.privateSize_; + initData.hwPrivateSize_ = kHeader.hwPrivateSize_; + initData.flags_ = kHeader.flags_; } - // Need to re-setup target - return setElfTarget(); + bool created; + NullKernel* gpuKernel = + program.createKernel(demangledKName, &initData, ilSource, metadata, &created, + sinfo[NDX_KERNEL].address, sinfo[NDX_KERNEL].size); + if (!created) { + std::string tlog = + "Error: Creating kernel during loading OCL binary " + demangledKName + " failed!"; + LogError(tlog.c_str()); + return false; + } + + recompiled = recompiled || (sinfo[NDX_KERNEL].size == 0); + + // Add the current kernel to the OCL binary in case recompilation happens and + // the new binary is needed. + if (!storeKernel(demangledKName, gpuKernel, &initData, metadata, ilSource)) { + return false; + } + } + + *hasRecompiled = recompiled; + return hasKernels; } -} // namespace gpu +bool ClBinary::storeKernel(const std::string& name, const NullKernel* nullKernel, + Kernel::InitData* initData, const std::string& metadata, + const std::string& ilSource) { + if (!saveISA() && !saveAMDIL()) { + return true; + } + + // should we save kernel metadata only under saveAMDIL()? 
+ bool kernelMetaStored = false; + + if (saveAMDIL() && (ilSource.size() > 0)) { + // Save IL (this is the per-kernel IL) + std::string ilName = "__OpenCL_" + name + "_amdil"; + if (!elfOut()->addSymbol(amd::OclElf::ILTEXT, ilName.c_str(), ilSource.data(), + ilSource.size())) { + LogError("AddElfSymbol failed"); + return false; + } + + std::string metaName = "__OpenCL_" + name + "_metadata"; + // Save metadata symbols in .rodata + if (!elfOut()->addSymbol(amd::OclElf::RODATA, metaName.c_str(), metadata.data(), + metadata.size())) { + LogError("AddElfSymbol failed"); + return false; + } + kernelMetaStored = true; + } + + if (!saveISA()) { + return true; + } + + size_t binarySize = (nullKernel != NULL) ? nullKernel->getCalBinarySize() : 0; + if (binarySize != 0) { + if (!kernelMetaStored) { + std::string metaName = "__OpenCL_" + name + "_metadata"; + // Save metadata symbols in .rodata + if (!elfOut()->addSymbol(amd::OclElf::RODATA, metaName.c_str(), metadata.data(), + metadata.size())) { + LogError("AddSymbol failed"); + return false; + } + } + // Save kernel symbol that is associated with GPU ISA + std::string kernelName = "__OpenCL_" + name + "_kernel"; + uint8_t* isacode = new uint8_t[binarySize]; + if (!nullKernel->getCalBinary(reinterpret_cast(isacode), binarySize)) { + LogError("Failed to read GPU kernel isa"); + delete[] isacode; + return false; + } + if (!elfOut()->addSymbol(amd::OclElf::CAL, kernelName.c_str(), isacode, binarySize)) { + LogError("AddElfSymbol failed"); + return false; + } + delete[] isacode; + + // Save kernel header information into a pseudo symbol + // __OpenCL__header + // for example, given a kernel foo, this pseudo symbol + // would be __OpenCL_foo_header + std::string headerName = "__OpenCL_" + name + "_header"; + KernelHeaderSymbol kHeader; + // VERSION_0 + kHeader.privateSize_ = initData->privateSize_; + kHeader.localSize_ = initData->localSize_; + kHeader.regionSize_ = 0; + kHeader.hwPrivateSize_ = initData->hwPrivateSize_; + 
kHeader.hwLocalSize_ = initData->hwLocalSize_; + kHeader.hwRegionSize_ = 0; + kHeader.flags_ = initData->flags_; + + // VERSION_1 + kHeader.version_ = VERSION_CURRENT; + + if (!elfOut()->addSymbol(amd::OclElf::RODATA, headerName.c_str(), &kHeader, sizeof(kHeader))) { + LogError("AddElfSymbol failed"); + return false; + } + } + return true; +} + +bool ClBinary::loadGlobalData(Program& program) { + const char __OpenCL_[] = "__OpenCL_"; + const char _global[] = "_global"; + + for (amd::Sym_Handle sym = elfIn()->nextSymbol(NULL); sym != NULL; + sym = elfIn()->nextSymbol(sym)) { + amd::OclElf::SymbolInfo symInfo; + if (!elfIn()->getSymbolInfo(sym, &symInfo)) { + LogError("LoadGlobalDataFromElf: getSymbolInfo() fails"); + return false; + } + + std::string globalName(symInfo.sym_name); + const size_t offset = sizeof(__OpenCL_) - 1; + if (globalName.compare(0, offset, __OpenCL_) != 0) { + continue; + } + const size_t suffixPos = globalName.rfind('_'); + if (globalName.compare(suffixPos, sizeof(_global) - 1, _global) != 0) { + continue; + } + + // Get index for this global + std::string indexString = globalName.substr(offset, suffixPos - offset); + uint index = ::atoi(indexString.c_str()); + + if (!program.allocGlobalData(symInfo.address, symInfo.size, index)) { + LogError("Couldn't load global data"); + return false; + } + } + + return true; +} + +bool ClBinary::storeGlobalData(const void* globalData, size_t dataSize, uint index) { + // For each global, use "__OpenCL_" as its name + // Since there is no name in amdil, just use "__OpenCL__global" for now. 
+ std::stringstream glbName; + glbName << "__OpenCL_" << index << "_global"; + + if (!elfOut()->addSymbol(amd::OclElf::RODATA, glbName.str().c_str(), globalData, dataSize)) { + LogError("addSymbol() failed"); + return false; + } + return true; +} + +bool ClBinary::clearElfOut() { + // Recreate libelf elf object + if (!elfOut()->Clear()) { + return false; + } + + // Need to re-setup target + return setElfTarget(); +} + +} // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpubinary.hpp b/rocclr/runtime/device/gpu/gpubinary.hpp index 199db839a4..071d6fda98 100644 --- a/rocclr/runtime/device/gpu/gpubinary.hpp +++ b/rocclr/runtime/device/gpu/gpubinary.hpp @@ -10,134 +10,117 @@ namespace gpu { -class ClBinary : public device::ClBinary -{ -public: - +class ClBinary : public device::ClBinary { + public: #pragma pack(push, 8) - // Kernel version in the ELF header symbol - enum KernelVersions { - VERSION_0 = 0, - VERSION_1, - VERSION_CURRENT = VERSION_1 - }; + // Kernel version in the ELF header symbol + enum KernelVersions { VERSION_0 = 0, VERSION_1, VERSION_CURRENT = VERSION_1 }; - /* This is the ELF header symbol */ - struct KernelHeaderSymbol { - /* VERSION_0 - Version 0 has 8 uint32_t (32 bytes), top 5 are used, the rest zero'ed. - In Version_0, KernelHeaderSymbol is the same as KernelHeader - */ - uint32_t privateSize_; //!< Emulated private memory size - uint32_t localSize_; //!< Emulated local memory size - uint32_t hwPrivateSize_; //!< HW private memory size - uint32_t hwLocalSize_; //!< HW local memory size - uint32_t flags_; //!< Kernel's flags + /* This is the ELF header symbol */ + struct KernelHeaderSymbol { + /* VERSION_0 + Version 0 has 8 uint32_t (32 bytes), top 5 are used, the rest zero'ed. 
+ In Version_0, KernelHeaderSymbol is the same as KernelHeader + */ + uint32_t privateSize_; //!< Emulated private memory size + uint32_t localSize_; //!< Emulated local memory size + uint32_t hwPrivateSize_; //!< HW private memory size + uint32_t hwLocalSize_; //!< HW local memory size + uint32_t flags_; //!< Kernel's flags - /* VERSION_1 - VERSION_1 has 6 uint32_t. - */ - uint32_t version_; //!< Kernel's version - uint32_t regionSize_; //!< Region memory size - uint32_t hwRegionSize_; //!< HW region memory size + /* VERSION_1 + VERSION_1 has 6 uint32_t. + */ + uint32_t version_; //!< Kernel's version + uint32_t regionSize_; //!< Region memory size + uint32_t hwRegionSize_; //!< HW region memory size - /* New entries can be added here, do not change the previous entries */ - }; + /* New entries can be added here, do not change the previous entries */ + }; #pragma pack(pop) - //! Constructor - ClBinary(const NullDevice& dev) - : device::ClBinary(dev) - {} + //! Constructor + ClBinary(const NullDevice& dev) : device::ClBinary(dev) {} - //! Destructor - ~ClBinary() {} + //! Destructor + ~ClBinary() {} - //! Creates and loads kernels from the OCL ELF binary file into the program - bool loadKernels( - NullProgram& program, //!< Program object with the binary - bool* hasRecompiled //!< Recompile amdil to isa. - ); + //! Creates and loads kernels from the OCL ELF binary file into the program + bool loadKernels(NullProgram& program, //!< Program object with the binary + bool* hasRecompiled //!< Recompile amdil to isa. + ); - //! Stores compiled kernel into the OCL ELF binary file - bool storeKernel( - const std::string& name, //!< Kernel's name - const NullKernel* nullKernel, //!< The kernel to add - Kernel::InitData* initData, //!< Kernel init data - const std::string& metadata, //!< Kernel's metadata - const std::string& ilSource //!< IL source text - ); + //! 
Stores compiled kernel into the OCL ELF binary file + bool storeKernel(const std::string& name, //!< Kernel's name + const NullKernel* nullKernel, //!< The kernel to add + Kernel::InitData* initData, //!< Kernel init data + const std::string& metadata, //!< Kernel's metadata + const std::string& ilSource //!< IL source text + ); - //! Loads the program's global data - bool loadGlobalData( - Program& program //!< The program object for the global data load - ); + //! Loads the program's global data + bool loadGlobalData(Program& program //!< The program object for the global data load + ); - //! Stores the program's global data - bool storeGlobalData( - const void* globalData, //!< The program global data - size_t dataSize, //!< The program global data size - uint index //!< The global data storage index - ); + //! Stores the program's global data + bool storeGlobalData(const void* globalData, //!< The program global data + size_t dataSize, //!< The program global data size + uint index //!< The global data storage index + ); - //! Set elf header information for GPU target - bool setElfTarget() { - uint32_t target = static_cast(dev().calTarget()); - assert (((0xFFFF8000 & target) == 0) && "ASIC target ID >= 2^15"); - uint16_t elf_target = (uint16_t)(0x7FFF & target); - return elfOut()->setTarget(elf_target, amd::OclElf::CAL_PLATFORM); - } + //! Set elf header information for GPU target + bool setElfTarget() { + uint32_t target = static_cast(dev().calTarget()); + assert(((0xFFFF8000 & target) == 0) && "ASIC target ID >= 2^15"); + uint16_t elf_target = (uint16_t)(0x7FFF & target); + return elfOut()->setTarget(elf_target, amd::OclElf::CAL_PLATFORM); + } - //! Clear elf out. - bool clearElfOut(); + //! Clear elf out. + bool clearElfOut(); -private: - //! Disable default copy constructor - ClBinary(const ClBinary&); + private: + //! Disable default copy constructor + ClBinary(const ClBinary&); - //! 
Disable default operator= - ClBinary& operator=(const ClBinary&); - - //! Returns the GPU device for this object - const NullDevice& dev() const { return static_cast(dev_); } + //! Disable default operator= + ClBinary& operator=(const ClBinary&); + //! Returns the GPU device for this object + const NullDevice& dev() const { return static_cast(dev_); } }; -class ClBinaryHsa : public device::ClBinary -{ -public: - ClBinaryHsa(const Device& dev, BinaryImageFormat bifVer = BIF_VERSION3) - : device::ClBinary(dev, bifVer) - {} +class ClBinaryHsa : public device::ClBinary { + public: + ClBinaryHsa(const Device& dev, BinaryImageFormat bifVer = BIF_VERSION3) + : device::ClBinary(dev, bifVer) {} - //! Destructor - ~ClBinaryHsa() {} + //! Destructor + ~ClBinaryHsa() {} -protected: - bool setElfTarget() { - uint32_t target = static_cast(21);//dev().calTarget()); - assert (((0xFFFF8000 & target) == 0) && "ASIC target ID >= 2^15"); - uint16_t elf_target = (uint16_t)(0x7FFF & target); - return elfOut()->setTarget(elf_target, amd::OclElf::CAL_PLATFORM); - return true; - } + protected: + bool setElfTarget() { + uint32_t target = static_cast(21); // dev().calTarget()); + assert(((0xFFFF8000 & target) == 0) && "ASIC target ID >= 2^15"); + uint16_t elf_target = (uint16_t)(0x7FFF & target); + return elfOut()->setTarget(elf_target, amd::OclElf::CAL_PLATFORM); + return true; + } -private: - //! Disable default copy constructor - ClBinaryHsa(const ClBinaryHsa&); + private: + //! Disable default copy constructor + ClBinaryHsa(const ClBinaryHsa&); - //! Disable default operator= - ClBinaryHsa& operator=(const ClBinaryHsa&); - - //! Returns the HSA device for this object - const Device& dev() const { return static_cast(dev_); } + //! Disable default operator= + ClBinaryHsa& operator=(const ClBinaryHsa&); + //! 
Returns the HSA device for this object + const Device& dev() const { return static_cast(dev_); } }; -} // namespace gpu - -#endif // GPUBINARY_HPP_ +} // namespace gpu +#endif // GPUBINARY_HPP_ diff --git a/rocclr/runtime/device/gpu/gpublit.cpp b/rocclr/runtime/device/gpu/gpublit.cpp index c6f3f05095..9dd416e822 100644 --- a/rocclr/runtime/device/gpu/gpublit.cpp +++ b/rocclr/runtime/device/gpu/gpublit.cpp @@ -13,2795 +13,2406 @@ namespace gpu { DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup) - : HostBlitManager(gpu, setup) - , MinSizeForPinnedTransfer(dev().settings().pinnedMinXferSize_) - , completeOperation_(false) - , context_(NULL) -{ + : HostBlitManager(gpu, setup), + MinSizeForPinnedTransfer(dev().settings().pinnedMinXferSize_), + completeOperation_(false), + context_(NULL) {} + +inline void DmaBlitManager::synchronize() const { + if (syncOperation_) { + gpu().waitAllEngines(); + gpu().releaseMemObjects(); + } } -inline void -DmaBlitManager::synchronize() const -{ - if (syncOperation_) { - gpu().waitAllEngines(); - gpu().releaseMemObjects(); - } +inline Memory& DmaBlitManager::gpuMem(device::Memory& mem) const { + return static_cast(mem); } -inline Memory& -DmaBlitManager::gpuMem(device::Memory& mem) const -{ - return static_cast(mem); -} +bool DmaBlitManager::readMemoryStaged(Memory& srcMemory, void* dstHost, Memory** xferBuf, + size_t origin, size_t& offset, size_t& totalSize, + size_t xferSize) const { + amd::Coord3D dst(0, 0, 0); + size_t tmpSize; + uint idxWrite = 0; + uint idxRead = 0; + size_t chunkSize; + static const bool CopyRect = false; + // Flush DMA for ASYNC copy + static const bool FlushDMA = true; -bool -DmaBlitManager::readMemoryStaged( - Memory& srcMemory, - void* dstHost, - Memory** xferBuf, - size_t origin, - size_t& offset, - size_t& totalSize, - size_t xferSize) const -{ - amd::Coord3D dst(0, 0, 0); - size_t tmpSize; - uint idxWrite = 0; - uint idxRead = 0; - size_t chunkSize; - static const bool CopyRect = false; - // 
Flush DMA for ASYNC copy - static const bool FlushDMA = true; + if (dev().xferRead().bufSize() < 128 * Ki) { + chunkSize = dev().xferRead().bufSize(); + } else { + chunkSize = std::min(amd::alignUp(xferSize / 4, 256), dev().xferRead().bufSize()); + chunkSize = std::max(chunkSize, 128 * Ki); + } - if (dev().xferRead().bufSize() < 128 * Ki) { - chunkSize = dev().xferRead().bufSize(); - } - else { - chunkSize = std::min(amd::alignUp(xferSize / 4, 256), - dev().xferRead().bufSize()); - chunkSize = std::max(chunkSize, 128 * Ki); - } + // Find the partial transfer size + tmpSize = std::min(chunkSize, xferSize); + amd::Coord3D srcLast(origin + offset, 0, 0); + amd::Coord3D copySizeLast(tmpSize, 0, 0); + + // Copy data into the temporary surface + if (!srcMemory.partialMemCopyTo(gpu(), srcLast, dst, copySizeLast, *xferBuf[idxWrite], CopyRect, + FlushDMA)) { + return false; + } + + totalSize -= tmpSize; + xferSize -= tmpSize; + offset += tmpSize; + + while (xferSize != 0) { // Find the partial transfer size tmpSize = std::min(chunkSize, xferSize); - amd::Coord3D srcLast(origin + offset, 0, 0); - amd::Coord3D copySizeLast(tmpSize, 0, 0); + amd::Coord3D src(origin + offset, 0, 0); + amd::Coord3D copySize(tmpSize, 0, 0); + idxWrite = (idxWrite + 1) % 2; // Copy data into the temporary surface - if (!srcMemory.partialMemCopyTo(gpu(), srcLast, dst, copySizeLast, - *xferBuf[idxWrite], CopyRect, FlushDMA)) { - return false; + if (!srcMemory.partialMemCopyTo(gpu(), src, dst, copySize, *xferBuf[idxWrite], CopyRect, + FlushDMA)) { + return false; } + // Read previous buffer + if (!xferBuf[idxRead]->hostRead(&gpu(), + reinterpret_cast(dstHost) + offset - copySizeLast[0], + dst, copySizeLast)) { + return false; + } + idxRead = (idxRead + 1) % 2; + copySizeLast = copySize; + totalSize -= tmpSize; xferSize -= tmpSize; offset += tmpSize; + } - while (xferSize != 0) { - // Find the partial transfer size - tmpSize = std::min(chunkSize, xferSize); + // Last read + if 
(!xferBuf[idxRead]->hostRead( + &gpu(), reinterpret_cast(dstHost) + offset - copySizeLast[0], dst, copySizeLast)) { + return false; + } - amd::Coord3D src(origin + offset, 0, 0); - amd::Coord3D copySize(tmpSize, 0, 0); + return true; +} - idxWrite = (idxWrite + 1) % 2; - // Copy data into the temporary surface - if (!srcMemory.partialMemCopyTo(gpu(), src, dst, copySize, - *xferBuf[idxWrite], CopyRect, FlushDMA)) { - return false; +bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, + const amd::Coord3D& origin, const amd::Coord3D& size, + bool entire) const { + // Use host copy if memory has direct access + if (setup_.disableReadBuffer_ || + (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { + return HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire); + } else { + size_t srcSize = size[0]; + size_t offset = 0; + size_t pinSize = dev().settings().pinnedXferSize_; + pinSize = std::min(pinSize, srcSize); + + // Check if a pinned transfer can be executed + if (pinSize && (srcSize > MinSizeForPinnedTransfer)) { + // Allign offset to 4K boundary (Vista/Win7 limitation) + char* tmpHost = const_cast( + amd::alignDown(reinterpret_cast(dstHost), PinnedMemoryAlignment)); + + // Find the partial size for unaligned copy + size_t partial = reinterpret_cast(dstHost) - tmpHost; + + amd::Memory* pinned = NULL; + bool first = true; + size_t tmpSize; + size_t pinAllocSize; + + // Copy memory, using pinning + while (srcSize > 0) { + // If it's the first iterarion, then readjust the copy size + // to include alignment + if (first) { + pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment); + tmpSize = std::min(pinAllocSize - partial, srcSize); + first = false; + } else { + tmpSize = std::min(pinSize, srcSize); + pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment); + partial = 0; } + amd::Coord3D dst(partial, 0, 0); + amd::Coord3D srcPin(origin[0] + offset, 0, 0); + amd::Coord3D 
copySizePin(tmpSize, 0, 0); + size_t partial2; - // Read previous buffer - if (!xferBuf[idxRead]->hostRead(&gpu(), - reinterpret_cast(dstHost) + offset - copySizeLast[0], - dst, copySizeLast)) { - return false; + // Allocate a GPU resource for pinning + pinned = pinHostMemory(tmpHost, pinAllocSize, partial2); + + if (pinned != NULL) { + // Get device memory for this virtual device + Memory* dstMemory = dev().getGpuMemory(pinned); + + if (!gpuMem(srcMemory).partialMemCopyTo(gpu(), srcPin, dst, copySizePin, *dstMemory)) { + LogWarning("DmaBlitManager::readBuffer failed a pinned copy!"); + gpu().addPinnedMem(pinned); + break; + } + gpu().addPinnedMem(pinned); + } else { + LogWarning("DmaBlitManager::readBuffer failed to pin a resource!"); + break; } - idxRead = (idxRead + 1) % 2; - copySizeLast = copySize; - - totalSize -= tmpSize; - xferSize -= tmpSize; + srcSize -= tmpSize; offset += tmpSize; + tmpHost = reinterpret_cast(tmpHost) + tmpSize + partial; + } } - // Last read - if (!xferBuf[idxRead]->hostRead(&gpu(), - reinterpret_cast(dstHost) + offset - copySizeLast[0], dst, copySizeLast)) { + if (0 != srcSize) { + Memory& xferBuf0 = dev().xferRead().acquire(); + Memory& xferBuf1 = dev().xferRead().acquire(); + Memory* xferBuf[2] = {&xferBuf0, &xferBuf1}; + + // Read memory using a staged resource + if (!readMemoryStaged(gpuMem(srcMemory), dstHost, xferBuf, origin[0], offset, srcSize, + srcSize)) { + LogError("DmaBlitManager::readBuffer failed!"); return false; - } + } - return true; + dev().xferRead().release(gpu(), xferBuf1); + dev().xferRead().release(gpu(), xferBuf0); + } + } + + return true; } -bool -DmaBlitManager::readBuffer( - device::Memory& srcMemory, - void* dstHost, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire) const -{ - // Use host copy if memory has direct access - if (setup_.disableReadBuffer_ || - (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { - return HostBlitManager::readBuffer( - 
srcMemory, dstHost, origin, size, entire); - } - else { - size_t srcSize = size[0]; - size_t offset = 0; - size_t pinSize = dev().settings().pinnedXferSize_; - pinSize = std::min(pinSize, srcSize); +bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost, + const amd::BufferRect& bufRect, const amd::BufferRect& hostRect, + const amd::Coord3D& size, bool entire) const { + // Use host copy if memory has direct access + if (setup_.disableReadBufferRect_ || + (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { + return HostBlitManager::readBufferRect(srcMemory, dstHost, bufRect, hostRect, size, entire); + } else { + Memory& xferBuf = dev().xferRead().acquire(); - // Check if a pinned transfer can be executed - if (pinSize && (srcSize > MinSizeForPinnedTransfer)) { - // Allign offset to 4K boundary (Vista/Win7 limitation) - char* tmpHost = const_cast( - amd::alignDown(reinterpret_cast(dstHost), - PinnedMemoryAlignment)); + amd::Coord3D dst(0, 0, 0); + size_t tmpSize = 0; + size_t bufOffset; + size_t hostOffset; + size_t srcSize; - // Find the partial size for unaligned copy - size_t partial = reinterpret_cast(dstHost) - tmpHost; + for (size_t z = 0; z < size[2]; ++z) { + for (size_t y = 0; y < size[1]; ++y) { + srcSize = size[0]; + bufOffset = bufRect.offset(0, y, z); + hostOffset = hostRect.offset(0, y, z); - amd::Memory* pinned = NULL; - bool first = true; - size_t tmpSize; - size_t pinAllocSize; + while (srcSize != 0) { + // Find the partial transfer size + tmpSize = std::min(dev().xferRead().bufSize(), srcSize); - // Copy memory, using pinning - while (srcSize > 0) { - // If it's the first iterarion, then readjust the copy size - // to include alignment - if (first) { - pinAllocSize = amd::alignUp(pinSize + partial, - PinnedMemoryAlignment); - tmpSize = std::min(pinAllocSize - partial, srcSize); - first = false; - } - else { - tmpSize = std::min(pinSize, srcSize); - pinAllocSize = amd::alignUp(tmpSize, 
PinnedMemoryAlignment); - partial = 0; - } - amd::Coord3D dst(partial, 0, 0); - amd::Coord3D srcPin(origin[0] + offset, 0, 0); - amd::Coord3D copySizePin(tmpSize, 0, 0); - size_t partial2; + amd::Coord3D src(bufOffset, 0, 0); + amd::Coord3D copySize(tmpSize, 0, 0); - // Allocate a GPU resource for pinning - pinned = pinHostMemory(tmpHost, pinAllocSize, partial2); + // Copy data into the temporary surface + if (!gpuMem(srcMemory).partialMemCopyTo(gpu(), src, dst, copySize, xferBuf, true)) { + LogError("DmaBlitManager::readBufferRect failed!"); + return false; + } - if (pinned != NULL) { - // Get device memory for this virtual device - Memory* dstMemory = dev().getGpuMemory(pinned); + if (!xferBuf.hostRead(&gpu(), reinterpret_cast(dstHost) + hostOffset, dst, + copySize)) { + LogError("DmaBlitManager::readBufferRect failed!"); + return false; + } - if (!gpuMem(srcMemory).partialMemCopyTo( - gpu(), srcPin, dst, copySizePin, *dstMemory)) { - LogWarning("DmaBlitManager::readBuffer failed a pinned copy!"); - gpu().addPinnedMem(pinned); - break; - } - gpu().addPinnedMem(pinned); - } - else { - LogWarning("DmaBlitManager::readBuffer failed to pin a resource!"); - break; - } - srcSize -= tmpSize; - offset += tmpSize; - tmpHost = reinterpret_cast(tmpHost) + tmpSize + partial; - } + srcSize -= tmpSize; + bufOffset += tmpSize; + hostOffset += tmpSize; } + } + } + dev().xferRead().release(gpu(), xferBuf); + } - if (0 != srcSize) { - Memory& xferBuf0 = dev().xferRead().acquire(); - Memory& xferBuf1 = dev().xferRead().acquire(); - Memory* xferBuf[2] = { &xferBuf0, &xferBuf1 }; + return true; +} - // Read memory using a staged resource - if (!readMemoryStaged(gpuMem(srcMemory), dstHost, xferBuf, origin[0], - offset, srcSize, srcSize)) { - LogError("DmaBlitManager::readBuffer failed!"); - return false; - } +bool DmaBlitManager::readImage(device::Memory& srcMemory, void* dstHost, const amd::Coord3D& origin, + const amd::Coord3D& size, size_t rowPitch, size_t slicePitch, + bool 
entire) const { + if (setup_.disableReadImage_) { + return HostBlitManager::readImage(srcMemory, dstHost, origin, size, rowPitch, slicePitch, + entire); + } else { + //! @todo Add HW accelerated path + return HostBlitManager::readImage(srcMemory, dstHost, origin, size, rowPitch, slicePitch, + entire); + } - dev().xferRead().release(gpu(), xferBuf1); - dev().xferRead().release(gpu(), xferBuf0); + return true; +} + +bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, Memory& xferBuf, + size_t origin, size_t& offset, size_t& totalSize, + size_t xferSize) const { + amd::Coord3D src(0, 0, 0); + size_t tmpSize; + size_t chunkSize; + static const bool CopyRect = false; + // Flush DMA for ASYNC copy + // @todo Blocking write requires a flush to start earlier, + // but currently VDI doesn't provide that info + static const bool FlushDMA = false; + + if (dev().xferRead().bufSize() < 128 * Ki) { + chunkSize = dev().xferWrite().bufSize(); + } else { + chunkSize = std::min(amd::alignUp(xferSize / 4, 256), dev().xferWrite().bufSize()); + chunkSize = std::max(chunkSize, 128 * Ki); + } + + while (xferSize != 0) { + // Find the partial transfer size + tmpSize = std::min(chunkSize, xferSize); + amd::Coord3D dst(origin + offset, 0, 0); + amd::Coord3D copySize(tmpSize, 0, 0); + + // Copy data into the temporary buffer, using CPU + if (!xferBuf.hostWrite(&gpu(), reinterpret_cast(srcHost) + offset, src, copySize, + Resource::Discard)) { + return false; + } + + // Copy data into the original destination memory + if (!xferBuf.partialMemCopyTo(gpu(), src, dst, copySize, dstMemory, CopyRect, FlushDMA)) { + return false; + } + + totalSize -= tmpSize; + offset += tmpSize; + xferSize -= tmpSize; + } + + return true; +} + +bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory, + const amd::Coord3D& origin, const amd::Coord3D& size, + bool entire) const { + // Use host copy if memory has direct access or it's persistent + if 
(setup_.disableWriteBuffer_ || gpuMem(dstMemory).isHostMemDirectAccess() || + gpuMem(dstMemory).isPersistentDirectMap()) { + return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire); + } else { + size_t dstSize = size[0]; + size_t tmpSize = 0; + size_t offset = 0; + size_t pinSize = dev().settings().pinnedXferSize_; + pinSize = std::min(pinSize, dstSize); + + // Check if a pinned transfer can be executed + if (pinSize && (dstSize > MinSizeForPinnedTransfer)) { + // Allign offset to 4K boundary (Vista/Win7 limitation) + char* tmpHost = const_cast( + amd::alignDown(reinterpret_cast(srcHost), PinnedMemoryAlignment)); + + // Find the partial size for unaligned copy + size_t partial = reinterpret_cast(srcHost) - tmpHost; + + amd::Memory* pinned = NULL; + bool first = true; + size_t tmpSize; + size_t pinAllocSize; + + // Copy memory, using pinning + while (dstSize > 0) { + // If it's the first iterarion, then readjust the copy size + // to include alignment + if (first) { + pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment); + tmpSize = std::min(pinAllocSize - partial, dstSize); + first = false; + } else { + tmpSize = std::min(pinSize, dstSize); + pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment); + partial = 0; } - } + amd::Coord3D src(partial, 0, 0); + amd::Coord3D dstPin(origin[0] + offset, 0, 0); + amd::Coord3D copySizePin(tmpSize, 0, 0); + size_t partial2; - return true; -} + // Allocate a GPU resource for pinning + pinned = pinHostMemory(tmpHost, pinAllocSize, partial2); -bool -DmaBlitManager::readBufferRect( - device::Memory& srcMemory, - void* dstHost, - const amd::BufferRect& bufRect, - const amd::BufferRect& hostRect, - const amd::Coord3D& size, - bool entire) const -{ - // Use host copy if memory has direct access - if (setup_.disableReadBufferRect_ || - (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { - return HostBlitManager::readBufferRect( - srcMemory, dstHost, bufRect, 
hostRect, size, entire); - } - else { - Memory& xferBuf = dev().xferRead().acquire(); + if (pinned != NULL) { + // Get device memory for this virtual device + Memory* srcMemory = dev().getGpuMemory(pinned); - amd::Coord3D dst(0, 0, 0); - size_t tmpSize = 0; - size_t bufOffset; - size_t hostOffset; - size_t srcSize; - - for (size_t z = 0; z < size[2]; ++z) { - for (size_t y = 0; y < size[1]; ++y) { - srcSize = size[0]; - bufOffset = bufRect.offset(0, y, z); - hostOffset = hostRect.offset(0, y, z); - - while (srcSize != 0) { - // Find the partial transfer size - tmpSize = std::min(dev().xferRead().bufSize(), srcSize); - - amd::Coord3D src(bufOffset, 0, 0); - amd::Coord3D copySize(tmpSize, 0, 0); - - // Copy data into the temporary surface - if (!gpuMem(srcMemory).partialMemCopyTo( - gpu(), src, dst, copySize, xferBuf, true)) { - LogError("DmaBlitManager::readBufferRect failed!"); - return false; - } - - if (!xferBuf.hostRead(&gpu(), - reinterpret_cast(dstHost) + hostOffset, - dst, copySize)) { - LogError("DmaBlitManager::readBufferRect failed!"); - return false; - } - - srcSize -= tmpSize; - bufOffset += tmpSize; - hostOffset += tmpSize; - } - } + if (!srcMemory->partialMemCopyTo(gpu(), src, dstPin, copySizePin, gpuMem(dstMemory))) { + LogWarning("DmaBlitManager::writeBuffer failed a pinned copy!"); + gpu().addPinnedMem(pinned); + break; + } + gpu().addPinnedMem(pinned); + } else { + LogWarning("DmaBlitManager::writeBuffer failed to pin a resource!"); + break; } - dev().xferRead().release(gpu(), xferBuf); + dstSize -= tmpSize; + offset += tmpSize; + tmpHost = reinterpret_cast(tmpHost) + tmpSize + partial; + } } - return true; + + if (dstSize != 0) { + Memory& xferBuf = dev().xferWrite().acquire(); + + // Write memory using a staged resource + if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), xferBuf, origin[0], offset, dstSize, + dstSize)) { + LogError("DmaBlitManager::writeBuffer failed!"); + return false; + } + + gpu().addXferWrite(xferBuf); + } + } + + return 
true; } -bool -DmaBlitManager::readImage( - device::Memory& srcMemory, - void* dstHost, - const amd::Coord3D& origin, - const amd::Coord3D& size, - size_t rowPitch, - size_t slicePitch, - bool entire) const -{ - if (setup_.disableReadImage_) { - return HostBlitManager::readImage(srcMemory, dstHost, - origin, size, rowPitch, slicePitch, entire); - } - else { - //! @todo Add HW accelerated path - return HostBlitManager::readImage(srcMemory, dstHost, - origin, size, rowPitch, slicePitch, entire); - } +bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMemory, + const amd::BufferRect& hostRect, + const amd::BufferRect& bufRect, const amd::Coord3D& size, + bool entire) const { + // Use host copy if memory has direct access or it's persistent + if (setup_.disableWriteBufferRect_ || dstMemory.isHostMemDirectAccess() || + gpuMem(dstMemory).isPersistentDirectMap()) { + return HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire); + } else { + Memory& xferBuf = dev().xferWrite().acquire(); - return true; -} - -bool -DmaBlitManager::writeMemoryStaged( - const void* srcHost, - Memory& dstMemory, - Memory& xferBuf, - size_t origin, - size_t& offset, - size_t& totalSize, - size_t xferSize) const -{ amd::Coord3D src(0, 0, 0); - size_t tmpSize; - size_t chunkSize; - static const bool CopyRect = false; - // Flush DMA for ASYNC copy - // @todo Blocking write requires a flush to start earlier, - // but currently VDI doesn't provide that info - static const bool FlushDMA = false; + size_t tmpSize = 0; + size_t bufOffset; + size_t hostOffset; + size_t dstSize; - if (dev().xferRead().bufSize() < 128 * Ki) { - chunkSize = dev().xferWrite().bufSize(); - } - else { - chunkSize = std::min(amd::alignUp(xferSize / 4, 256), - dev().xferWrite().bufSize()); - chunkSize = std::max(chunkSize, 128 * Ki); - } + for (size_t z = 0; z < size[2]; ++z) { + for (size_t y = 0; y < size[1]; ++y) { + dstSize = size[0]; + bufOffset = bufRect.offset(0, 
y, z); + hostOffset = hostRect.offset(0, y, z); - while (xferSize != 0) { - // Find the partial transfer size - tmpSize = std::min(chunkSize, xferSize); - amd::Coord3D dst(origin + offset, 0, 0); - amd::Coord3D copySize(tmpSize, 0, 0); + while (dstSize != 0) { + // Find the partial transfer size + tmpSize = std::min(dev().xferWrite().bufSize(), dstSize); - // Copy data into the temporary buffer, using CPU - if (!xferBuf.hostWrite(&gpu(), - reinterpret_cast(srcHost) + offset, - src, copySize, Resource::Discard)) { + amd::Coord3D dst(bufOffset, 0, 0); + amd::Coord3D copySize(tmpSize, 0, 0); + + // Copy data into the temporary buffer, using CPU + if (!xferBuf.hostWrite(&gpu(), reinterpret_cast(srcHost) + hostOffset, src, + copySize, Resource::Discard)) { + LogError("DmaBlitManager::writeBufferRect failed!"); return false; - } + } - // Copy data into the original destination memory - if (!xferBuf.partialMemCopyTo( - gpu(), src, dst, copySize, dstMemory, CopyRect, FlushDMA)) { + // Copy data into the original destination memory + if (!xferBuf.partialMemCopyTo(gpu(), src, dst, copySize, gpuMem(dstMemory))) { + LogError("DmaBlitManager::writeBufferRect failed!"); return false; + } + + dstSize -= tmpSize; + bufOffset += tmpSize; + hostOffset += tmpSize; } - - totalSize -= tmpSize; - offset += tmpSize; - xferSize -= tmpSize; + } } + gpu().addXferWrite(xferBuf); + } - return true; + return true; } -bool -DmaBlitManager::writeBuffer( - const void* srcHost, - device::Memory& dstMemory, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire) const -{ - // Use host copy if memory has direct access or it's persistent - if (setup_.disableWriteBuffer_ || - gpuMem(dstMemory).isHostMemDirectAccess() || - gpuMem(dstMemory).isPersistentDirectMap()) { - return HostBlitManager::writeBuffer( - srcHost, dstMemory, origin, size, entire); - } - else { +bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory, + const amd::Coord3D& origin, const 
amd::Coord3D& size, + size_t rowPitch, size_t slicePitch, bool entire) const { + if (setup_.disableWriteImage_) { + return HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch, + entire); + } else { + //! @todo Add HW accelerated path + return HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch, + entire); + } - size_t dstSize = size[0]; - size_t tmpSize = 0; - size_t offset = 0; - size_t pinSize = dev().settings().pinnedXferSize_; - pinSize = std::min(pinSize, dstSize); - - // Check if a pinned transfer can be executed - if (pinSize && (dstSize > MinSizeForPinnedTransfer)) { - // Allign offset to 4K boundary (Vista/Win7 limitation) - char* tmpHost = const_cast( - amd::alignDown(reinterpret_cast(srcHost), - PinnedMemoryAlignment)); - - // Find the partial size for unaligned copy - size_t partial = reinterpret_cast(srcHost) - tmpHost; - - amd::Memory* pinned = NULL; - bool first = true; - size_t tmpSize; - size_t pinAllocSize; - - // Copy memory, using pinning - while (dstSize > 0) { - // If it's the first iterarion, then readjust the copy size - // to include alignment - if (first) { - pinAllocSize = amd::alignUp(pinSize + partial, - PinnedMemoryAlignment); - tmpSize = std::min(pinAllocSize - partial, dstSize); - first = false; - } - else { - tmpSize = std::min(pinSize, dstSize); - pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment); - partial = 0; - } - amd::Coord3D src(partial, 0, 0); - amd::Coord3D dstPin(origin[0] + offset, 0, 0); - amd::Coord3D copySizePin(tmpSize, 0, 0); - size_t partial2; - - // Allocate a GPU resource for pinning - pinned = pinHostMemory(tmpHost, pinAllocSize, partial2); - - if (pinned != NULL) { - // Get device memory for this virtual device - Memory* srcMemory = dev().getGpuMemory(pinned); - - if (!srcMemory->partialMemCopyTo( - gpu(), src, dstPin, copySizePin, gpuMem(dstMemory))) { - LogWarning("DmaBlitManager::writeBuffer failed a pinned copy!"); - 
gpu().addPinnedMem(pinned); - break; - } - gpu().addPinnedMem(pinned); - } - else { - LogWarning("DmaBlitManager::writeBuffer failed to pin a resource!"); - break; - } - dstSize -= tmpSize; - offset += tmpSize; - tmpHost = reinterpret_cast(tmpHost) + tmpSize + partial; - } - } - - - if (dstSize != 0) { - Memory& xferBuf = dev().xferWrite().acquire(); - - // Write memory using a staged resource - if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), xferBuf, origin[0], - offset, dstSize, dstSize)) { - LogError("DmaBlitManager::writeBuffer failed!"); - return false; - } - - gpu().addXferWrite(xferBuf); - } - } - - return true; + return true; } -bool -DmaBlitManager::writeBufferRect( - const void* srcHost, - device::Memory& dstMemory, - const amd::BufferRect& hostRect, - const amd::BufferRect& bufRect, - const amd::Coord3D& size, - bool entire) const -{ - // Use host copy if memory has direct access or it's persistent - if (setup_.disableWriteBufferRect_ || - dstMemory.isHostMemDirectAccess() || - gpuMem(dstMemory).isPersistentDirectMap()) { - return HostBlitManager::writeBufferRect( - srcHost, dstMemory, hostRect, bufRect, size, entire); - } - else { - Memory& xferBuf = dev().xferWrite().acquire(); +bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, bool entire) const { + if (setup_.disableCopyBuffer_ || + (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable() && + !dev().settings().apuSystem_ && gpuMem(dstMemory).isHostMemDirectAccess())) { + return HostBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size); + } else { + return gpuMem(srcMemory).partialMemCopyTo(gpu(), srcOrigin, dstOrigin, size, gpuMem(dstMemory)); + } - amd::Coord3D src(0, 0, 0); - size_t tmpSize = 0; - size_t bufOffset; - size_t hostOffset; - size_t dstSize; - - for (size_t z = 0; z < size[2]; ++z) { - for (size_t y = 0; y < size[1]; 
++y) { - dstSize = size[0]; - bufOffset = bufRect.offset(0, y, z); - hostOffset = hostRect.offset(0, y, z); - - while (dstSize != 0) { - // Find the partial transfer size - tmpSize = std::min(dev().xferWrite().bufSize(), dstSize); - - amd::Coord3D dst(bufOffset, 0, 0); - amd::Coord3D copySize(tmpSize, 0, 0); - - // Copy data into the temporary buffer, using CPU - if (!xferBuf.hostWrite(&gpu(), - reinterpret_cast(srcHost) + hostOffset, - src, copySize, Resource::Discard)) { - LogError("DmaBlitManager::writeBufferRect failed!"); - return false; - } - - // Copy data into the original destination memory - if (!xferBuf.partialMemCopyTo( - gpu(), src, dst, copySize, gpuMem(dstMemory))) { - LogError("DmaBlitManager::writeBufferRect failed!"); - return false; - } - - dstSize -= tmpSize; - bufOffset += tmpSize; - hostOffset += tmpSize; - } - } - } - gpu().addXferWrite(xferBuf); - } - - return true; + return true; } -bool -DmaBlitManager::writeImage( - const void* srcHost, - device::Memory& dstMemory, - const amd::Coord3D& origin, - const amd::Coord3D& size, - size_t rowPitch, - size_t slicePitch, - bool entire) const -{ - if (setup_.disableWriteImage_) { - return HostBlitManager::writeImage( - srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); - } - else { - //! 
@todo Add HW accelerated path - return HostBlitManager::writeImage( - srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); +bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::BufferRect& srcRect, const amd::BufferRect& dstRect, + const amd::Coord3D& size, bool entire) const { + if (setup_.disableCopyBufferRect_ || + (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable() && + gpuMem(dstMemory).isHostMemDirectAccess())) { + return HostBlitManager::copyBufferRect(srcMemory, dstMemory, srcRect, dstRect, size, entire); + } else { + size_t srcOffset; + size_t dstOffset; + + uint bytesPerElement = 16; + bool optimalElementSize = false; + bool subWindowRectCopy = true; + + srcOffset = srcRect.offset(0, 0, 0); + dstOffset = dstRect.offset(0, 0, 0); + + while (bytesPerElement >= 1) { + if (((srcOffset % 4) == 0) && ((dstOffset % 4) == 0) && ((size[0] % bytesPerElement) == 0) && + ((srcRect.rowPitch_ % bytesPerElement) == 0) && + ((srcRect.slicePitch_ % bytesPerElement) == 0) && + ((dstRect.rowPitch_ % bytesPerElement) == 0) && + ((dstRect.slicePitch_ % bytesPerElement) == 0)) { + optimalElementSize = true; + break; + } + bytesPerElement = bytesPerElement >> 1; } - return true; + // 19 bit limit in HW in SI and 16 bit limit in CI+(we adjust the ElementSize to 4bytes but the + // packet still has 14bits) + size_t pitchLimit = dev().settings().ciPlus_ ? (0x3FFF * bytesPerElement) | 0xF : 0x7FFFF; + size_t sizeLimit = dev().settings().ciPlus_ ? 
(0x3FFF * bytesPerElement) | 0xF : 0x3FFF; + + if (!optimalElementSize || (srcRect.rowPitch_ > pitchLimit) || + (dstRect.rowPitch_ > pitchLimit) || (size[0] > sizeLimit) || // See above + (size[1] > 0x3fff) || // 14 bits limit in HW + (size[2] > 0x7ff)) { // 11 bits limit in HW + // Restriction with rectLinearDRMDMA packet + subWindowRectCopy = false; + } + + if (subWindowRectCopy) { + // Copy data with subwindow copy packet + if (!gpuMem(srcMemory).partialMemCopyTo( + gpu(), amd::Coord3D(srcOffset, srcRect.rowPitch_, srcRect.slicePitch_), + amd::Coord3D(dstOffset, dstRect.rowPitch_, dstRect.slicePitch_), size, + gpuMem(dstMemory), true, false, bytesPerElement)) { + LogError("copyBufferRect failed!"); + return false; + } + } else { + for (size_t z = 0; z < size[2]; ++z) { + for (size_t y = 0; y < size[1]; ++y) { + srcOffset = srcRect.offset(0, y, z); + dstOffset = dstRect.offset(0, y, z); + + amd::Coord3D src(srcOffset, 0, 0); + amd::Coord3D dst(dstOffset, 0, 0); + amd::Coord3D copySize(size[0], 0, 0); + + // Copy data + if (!gpuMem(srcMemory).partialMemCopyTo(gpu(), src, dst, copySize, gpuMem(dstMemory))) { + LogError("copyBufferRect failed!"); + return false; + } + } + } + } + } + return true; } -bool -DmaBlitManager::copyBuffer( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire) const -{ - if (setup_.disableCopyBuffer_ || - (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable() && - !dev().settings().apuSystem_ && - gpuMem(dstMemory).isHostMemDirectAccess())) { - return HostBlitManager::copyBuffer( - srcMemory, dstMemory, srcOrigin, dstOrigin, size); - } - else { - return gpuMem(srcMemory).partialMemCopyTo(gpu(), - srcOrigin, dstOrigin, size, gpuMem(dstMemory)); - } +bool DmaBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, + 
const amd::Coord3D& size, bool entire, size_t rowPitch, + size_t slicePitch) const { + bool result = false; - return true; + if (setup_.disableCopyImageToBuffer_) { + result = HostBlitManager::copyImageToBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire, rowPitch, slicePitch); + } else { + // Use CAL path for a transfer + result = + gpuMem(srcMemory).partialMemCopyTo(gpu(), srcOrigin, dstOrigin, size, gpuMem(dstMemory)); + + // Check if a HostBlit transfer is required + if (completeOperation_ && !result) { + result = HostBlitManager::copyImageToBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire, rowPitch, slicePitch); + } + } + + return result; } -bool -DmaBlitManager::copyBufferRect( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::BufferRect& srcRect, - const amd::BufferRect& dstRect, - const amd::Coord3D& size, - bool entire) const -{ - if (setup_.disableCopyBufferRect_ || - (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable() && - gpuMem(dstMemory).isHostMemDirectAccess())) { - return HostBlitManager::copyBufferRect( - srcMemory, dstMemory, srcRect, dstRect, size, entire); +bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, bool entire, size_t rowPitch, + size_t slicePitch) const { + bool result = false; + + if (setup_.disableCopyBufferToImage_) { + result = HostBlitManager::copyBufferToImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire, rowPitch, slicePitch); + } else { + // Use CAL path for a transfer + result = + gpuMem(srcMemory).partialMemCopyTo(gpu(), srcOrigin, dstOrigin, size, gpuMem(dstMemory)); + + // Check if a HostBlit transfer is required + if (completeOperation_ && !result) { + result = HostBlitManager::copyBufferToImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire, rowPitch, slicePitch); } - else { - size_t 
srcOffset; - size_t dstOffset; + } - uint bytesPerElement = 16; - bool optimalElementSize = false; - bool subWindowRectCopy = true; - - srcOffset = srcRect.offset(0, 0, 0); - dstOffset = dstRect.offset(0, 0, 0); - - while (bytesPerElement >= 1) { - if (((srcOffset % 4) == 0) && - ((dstOffset % 4) == 0) && - ((size[0] % bytesPerElement) == 0) && - ((srcRect.rowPitch_ % bytesPerElement) == 0) && - ((srcRect.slicePitch_ % bytesPerElement) == 0) && - ((dstRect.rowPitch_ % bytesPerElement) == 0) && - ((dstRect.slicePitch_ % bytesPerElement) == 0)) { - optimalElementSize = true; - break; - } - bytesPerElement = bytesPerElement >> 1; - } - - // 19 bit limit in HW in SI and 16 bit limit in CI+(we adjust the ElementSize to 4bytes but the packet still has 14bits) - size_t pitchLimit = dev().settings().ciPlus_ ? (0x3FFF * bytesPerElement) | 0xF : 0x7FFFF; - size_t sizeLimit = dev().settings().ciPlus_ ? (0x3FFF * bytesPerElement) | 0xF : 0x3FFF; - - if (!optimalElementSize || - (srcRect.rowPitch_ > pitchLimit) || - (dstRect.rowPitch_ > pitchLimit) || - (size[0] > sizeLimit) || // See above - (size[1] > 0x3fff) || // 14 bits limit in HW - (size[2] > 0x7ff)) { // 11 bits limit in HW - // Restriction with rectLinearDRMDMA packet - subWindowRectCopy = false; - } - - if (subWindowRectCopy) { - // Copy data with subwindow copy packet - if (!gpuMem(srcMemory).partialMemCopyTo(gpu(), - amd::Coord3D(srcOffset, srcRect.rowPitch_, srcRect.slicePitch_), - amd::Coord3D(dstOffset, dstRect.rowPitch_, dstRect.slicePitch_), - size, gpuMem(dstMemory), true, false, bytesPerElement)) { - LogError("copyBufferRect failed!"); - return false; - } - } - else { - for (size_t z = 0; z < size[2]; ++z) { - for (size_t y = 0; y < size[1]; ++y) { - srcOffset = srcRect.offset(0, y, z); - dstOffset = dstRect.offset(0, y, z); - - amd::Coord3D src(srcOffset, 0, 0); - amd::Coord3D dst(dstOffset, 0, 0); - amd::Coord3D copySize(size[0], 0, 0); - - // Copy data - if (!gpuMem(srcMemory).partialMemCopyTo( - gpu(), 
src, dst, copySize, gpuMem(dstMemory))) { - LogError("copyBufferRect failed!"); - return false; - } - } - } - } - } - return true; + return result; } -bool -DmaBlitManager::copyImageToBuffer( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire, - size_t rowPitch, - size_t slicePitch) const -{ - bool result = false; +bool DmaBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, bool entire) const { + bool result = false; - if (setup_.disableCopyImageToBuffer_) { - result = HostBlitManager::copyImageToBuffer(srcMemory, dstMemory, - srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); - } - else { - // Use CAL path for a transfer - result = gpuMem(srcMemory).partialMemCopyTo( - gpu(), srcOrigin, dstOrigin, size, gpuMem(dstMemory)); + if (setup_.disableCopyImage_) { + return HostBlitManager::copyImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire); + } else { + //! 
@todo Add HW accelerated path + return HostBlitManager::copyImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire); + } - // Check if a HostBlit transfer is required - if (completeOperation_ && !result) { - result = HostBlitManager::copyImageToBuffer(srcMemory, - dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); - } - } - - return result; + return result; } -bool -DmaBlitManager::copyBufferToImage( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire, - size_t rowPitch, - size_t slicePitch) const -{ - bool result = false; +KernelBlitManager::KernelBlitManager(VirtualGPU& gpu, Setup setup) + : DmaBlitManager(gpu, setup), + program_(NULL), + constantBuffer_(NULL), + xferBufferSize_(0), + lockXferOps_(NULL) { + for (uint i = 0; i < BlitTotal; ++i) { + kernels_[i] = NULL; + } - if (setup_.disableCopyBufferToImage_) { - result = HostBlitManager::copyBufferToImage(srcMemory, - dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); - } - else { - // Use CAL path for a transfer - result = gpuMem(srcMemory).partialMemCopyTo( - gpu(), srcOrigin, dstOrigin, size, gpuMem(dstMemory)); + for (uint i = 0; i < MaxXferBuffers; ++i) { + xferBuffers_[i] = NULL; + } - // Check if a HostBlit transfer is required - if (completeOperation_ && !result) { - result = HostBlitManager::copyBufferToImage(srcMemory, - dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); - } - } - - return result; + completeOperation_ = false; } -bool -DmaBlitManager::copyImage( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire) const -{ - bool result = false; - - if (setup_.disableCopyImage_) { - return HostBlitManager::copyImage(srcMemory, dstMemory, - srcOrigin, dstOrigin, size, entire); - } - else { - //! 
@todo Add HW accelerated path - return HostBlitManager::copyImage(srcMemory, dstMemory, - srcOrigin, dstOrigin, size, entire); +KernelBlitManager::~KernelBlitManager() { + for (uint i = 0; i < BlitTotal; ++i) { + if (NULL != kernels_[i]) { + kernels_[i]->release(); } + } + if (NULL != program_) { + program_->release(); + } - return result; + if (NULL != context_) { + // Release a dummy context + context_->release(); + } + + if (NULL != constantBuffer_) { + constantBuffer_->release(); + } + + for (uint i = 0; i < MaxXferBuffers; ++i) { + if (NULL != xferBuffers_[i]) { + xferBuffers_[i]->release(); + } + } + + delete lockXferOps_; } -KernelBlitManager::KernelBlitManager( - VirtualGPU& gpu, Setup setup) - : DmaBlitManager(gpu, setup) - , program_(NULL) - , constantBuffer_(NULL) - , xferBufferSize_(0) - , lockXferOps_(NULL) -{ +bool KernelBlitManager::create(amd::Device& device) { + if (!createProgram(static_cast(device))) { + return false; + } + return true; +} + +bool KernelBlitManager::createProgram(Device& device) { + std::vector devices; + devices.push_back(&device); + + // Save context and program for this device + context_ = device.blitProgram()->context_; + context_->retain(); + program_ = device.blitProgram()->program_; + program_->retain(); + + bool result = false; + do { + // Create kernel objects for all blits for (uint i = 0; i < BlitTotal; ++i) { - kernels_[i] = NULL; + const amd::Symbol* symbol = program_->findSymbol(BlitName[i]); + if (symbol == NULL) { + break; + } + kernels_[i] = new amd::Kernel(*program_, *symbol, BlitName[i]); + if (kernels_[i] == NULL) { + break; + } + // Validate blit kernels for the scratch memory usage (pre SI) + if (!device.validateKernel(*kernels_[i], &gpu())) { + break; + } } + result = true; + } while (!result); + + // Create an internal constant buffer + constantBuffer_ = new (*context_) amd::Buffer(*context_, CL_MEM_ALLOC_HOST_PTR, 4 * Ki); + + if ((constantBuffer_ != NULL) && !constantBuffer_->create(NULL)) { + 
constantBuffer_->release(); + constantBuffer_ = NULL; + return false; + } else if (constantBuffer_ == NULL) { + return false; + } + + // Assign the constant buffer to the current virtual GPU + constantBuffer_->setVirtualDevice(&gpu()); + + if (dev().settings().xferBufSize_ > 0) { + xferBufferSize_ = dev().settings().xferBufSize_; for (uint i = 0; i < MaxXferBuffers; ++i) { + // Create internal xfer buffers for image copy optimization + xferBuffers_[i] = new (*context_) amd::Buffer(*context_, 0, xferBufferSize_); + + if ((xferBuffers_[i] != NULL) && !xferBuffers_[i]->create(NULL)) { + xferBuffers_[i]->release(); xferBuffers_[i] = NULL; - } - - completeOperation_ = false; -} - -KernelBlitManager::~KernelBlitManager() -{ - for (uint i = 0; i < BlitTotal; ++i) { - if (NULL != kernels_[i]) { - kernels_[i]->release(); - } - } - if (NULL != program_) { - program_->release(); - } - - if (NULL != context_) { - // Release a dummy context - context_->release(); - } - - if (NULL != constantBuffer_) { - constantBuffer_->release(); - } - - for (uint i = 0; i < MaxXferBuffers; ++i) { - if (NULL != xferBuffers_[i]) { - xferBuffers_[i]->release(); - } - } - - delete lockXferOps_; -} - -bool -KernelBlitManager::create(amd::Device& device) -{ - if (!createProgram(static_cast(device))) { return false; - } - return true; -} - -bool -KernelBlitManager::createProgram(Device& device) -{ - std::vector devices; - devices.push_back(&device); - - // Save context and program for this device - context_ = device.blitProgram()->context_; - context_->retain(); - program_ = device.blitProgram()->program_; - program_->retain(); - - bool result = false; - do { - // Create kernel objects for all blits - for (uint i = 0; i < BlitTotal; ++i) { - const amd::Symbol* symbol = program_->findSymbol(BlitName[i]); - if (symbol == NULL) { - break; - } - kernels_[i] = new amd::Kernel(*program_, *symbol, BlitName[i]); - if (kernels_[i] == NULL) { - break; - } - // Validate blit kernels for the scratch memory 
usage (pre SI) - if (!device.validateKernel(*kernels_[i], &gpu())) { - break; - } - } - - result = true; - } while(!result); - - // Create an internal constant buffer - constantBuffer_ = new (*context_) - amd::Buffer(*context_, CL_MEM_ALLOC_HOST_PTR, 4 * Ki); - - if ((constantBuffer_ != NULL) && !constantBuffer_->create(NULL)) { - constantBuffer_->release(); - constantBuffer_ = NULL; - return false; - } - else if (constantBuffer_ == NULL) { + } else if (xferBuffers_[i] == NULL) { return false; + } + + // Assign the xfer buffer to the current virtual GPU + xferBuffers_[i]->setVirtualDevice(&gpu()); + //! @note Workaround for conformance allocation test. + //! Force GPU mem alloc. + //! Unaligned images require xfer optimization, + //! but deferred memory allocation can cause + //! virtual heap fragmentation for big allocations and + //! then fail the following test with 32 bit ISA, because + //! runtime runs out of 4GB space. + dev().getGpuMemory(xferBuffers_[i]); } + } - // Assign the constant buffer to the current virtual GPU - constantBuffer_->setVirtualDevice(&gpu()); + lockXferOps_ = new amd::Monitor("Transfer Ops Lock", true); + if (NULL == lockXferOps_) { + return false; + } - if (dev().settings().xferBufSize_ > 0) { - xferBufferSize_ = dev().settings().xferBufSize_; - for (uint i = 0; i < MaxXferBuffers; ++i) { - // Create internal xfer buffers for image copy optimization - xferBuffers_[i] = new (*context_) - amd::Buffer(*context_, 0, xferBufferSize_); - - if ((xferBuffers_[i] != NULL) && !xferBuffers_[i]->create(NULL)) { - xferBuffers_[i]->release(); - xferBuffers_[i] = NULL; - return false; - } - else if (xferBuffers_[i] == NULL) { - return false; - } - - // Assign the xfer buffer to the current virtual GPU - xferBuffers_[i]->setVirtualDevice(&gpu()); - //! @note Workaround for conformance allocation test. - //! Force GPU mem alloc. - //! Unaligned images require xfer optimization, - //! but deferred memory allocation can cause - //! 
virtual heap fragmentation for big allocations and - //! then fail the following test with 32 bit ISA, because - //! runtime runs out of 4GB space. - dev().getGpuMemory(xferBuffers_[i]); - } - } - - lockXferOps_ = new amd::Monitor("Transfer Ops Lock", true); - if (NULL == lockXferOps_) { - return false; - } - - return result; + return result; } // The following data structures will be used for the view creations. // Some formats has to be converted before a kernel blit operation struct FormatConvertion { - cl_uint clOldType_; - cl_uint clNewType_; + cl_uint clOldType_; + cl_uint clNewType_; }; // The list of rejected data formats and corresponding conversion -static const FormatConvertion RejectedData[] = -{ - { CL_UNORM_INT8, CL_UNSIGNED_INT8 }, - { CL_UNORM_INT16, CL_UNSIGNED_INT16 }, - { CL_SNORM_INT8, CL_UNSIGNED_INT8 }, - { CL_SNORM_INT16, CL_UNSIGNED_INT16 }, - { CL_HALF_FLOAT, CL_UNSIGNED_INT16 }, - { CL_FLOAT, CL_UNSIGNED_INT32 }, - { CL_SIGNED_INT8, CL_UNSIGNED_INT8 }, - { CL_SIGNED_INT16, CL_UNSIGNED_INT16 }, - { CL_UNORM_INT_101010, CL_UNSIGNED_INT8 }, - { CL_SIGNED_INT32, CL_UNSIGNED_INT32 } -}; +static const FormatConvertion RejectedData[] = { + {CL_UNORM_INT8, CL_UNSIGNED_INT8}, {CL_UNORM_INT16, CL_UNSIGNED_INT16}, + {CL_SNORM_INT8, CL_UNSIGNED_INT8}, {CL_SNORM_INT16, CL_UNSIGNED_INT16}, + {CL_HALF_FLOAT, CL_UNSIGNED_INT16}, {CL_FLOAT, CL_UNSIGNED_INT32}, + {CL_SIGNED_INT8, CL_UNSIGNED_INT8}, {CL_SIGNED_INT16, CL_UNSIGNED_INT16}, + {CL_UNORM_INT_101010, CL_UNSIGNED_INT8}, {CL_SIGNED_INT32, CL_UNSIGNED_INT32}}; // The list of rejected channel's order and corresponding conversion -static const FormatConvertion RejectedOrder[] = -{ - { CL_A, CL_R }, - { CL_RA, CL_RG }, - { CL_LUMINANCE, CL_R }, - { CL_INTENSITY, CL_R }, - { CL_RGB, CL_RGBA }, - { CL_BGRA, CL_RGBA }, - { CL_ARGB, CL_RGBA }, - { CL_sRGB, CL_RGBA }, - { CL_sRGBx, CL_RGBA }, - { CL_sRGBA, CL_RGBA }, - { CL_sBGRA, CL_RGBA } -}; +static const FormatConvertion RejectedOrder[] = { + {CL_A, 
CL_R}, {CL_RA, CL_RG}, {CL_LUMINANCE, CL_R}, {CL_INTENSITY, CL_R}, + {CL_RGB, CL_RGBA}, {CL_BGRA, CL_RGBA}, {CL_ARGB, CL_RGBA}, {CL_sRGB, CL_RGBA}, + {CL_sRGBx, CL_RGBA}, {CL_sRGBA, CL_RGBA}, {CL_sBGRA, CL_RGBA}}; -const uint RejectedFormatDataTotal = - sizeof(RejectedData) / sizeof(FormatConvertion); -const uint RejectedFormatChannelTotal = - sizeof(RejectedOrder) / sizeof(FormatConvertion); +const uint RejectedFormatDataTotal = sizeof(RejectedData) / sizeof(FormatConvertion); +const uint RejectedFormatChannelTotal = sizeof(RejectedOrder) / sizeof(FormatConvertion); -bool -KernelBlitManager::copyBufferToImage( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire, - size_t rowPitch, - size_t slicePitch) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - static const bool CopyRect = false; - // Flush DMA for ASYNC copy - static const bool FlushDMA = true; - size_t imgRowPitch = size[0] * gpuMem(dstMemory).elementSize(); - size_t imgSlicePitch = imgRowPitch * size[1]; - - if (setup_.disableCopyBufferToImage_) { - result = DmaBlitManager::copyBufferToImage( - srcMemory, dstMemory, srcOrigin, dstOrigin, size, - entire, rowPitch, slicePitch); - synchronize(); - return result; - } - // Check if buffer is in system memory with direct access - else if (gpuMem(srcMemory).isHostMemDirectAccess() && - (((rowPitch == 0) && (slicePitch == 0)) || - ((rowPitch == imgRowPitch) && - ((slicePitch == 0) || (slicePitch == imgSlicePitch))))) { - // First attempt to do this all with DMA, - // but there are restriciton with older hardware - if (dev().settings().imageDMA_) { - result = DmaBlitManager::copyBufferToImage( - srcMemory, dstMemory, srcOrigin, dstOrigin, size, - entire, rowPitch, slicePitch); - if (result) { - synchronize(); - return result; - } - } - - if (!setup_.disableCopyBufferToImageOpt_) { - // Find the overall copy size - size_t copySize = 
size[0] * size[1] * size[2] * gpuMem(dstMemory).elementSize(); - - // Check if double copy was requested - if (xferBufferSize_ != 0) { - amd::Coord3D src(srcOrigin); - amd::Coord3D xferSrc(0, 0, 0); - amd::Coord3D dst(dstOrigin); - amd::Coord3D xferRect(size); - // Find transfer size in pixels - size_t xferSizePix = xferBufferSize_ / gpuMem(dstMemory).elementSize(); - bool transfer = true; - - // Find transfer rectangle - if (xferRect[0] > xferSizePix) { - // The algorithm can't break a line. - // It requires multiple rectangles tracking - transfer = false; - } - else { - xferRect.c[1] = xferSizePix / xferRect[0]; - } - // Check if we exceeded the original size boundary in Y - if (xferRect[1] > size[1]) { - xferRect.c[1] = size[1]; - xferRect.c[2] = xferSizePix / (xferRect[0] * xferRect[1]); - } - else { - xferRect.c[2] = 1; - } - // Check if we exceeded the original size boundary in Z - if (xferRect[2] > size[2]) { - xferRect.c[2] = size[2]; - } - // Make sure size in Y dimension is divided by the rectangle size - if (size[2] > 1) { - while ((size[1] % xferRect[1]) != 0) { - xferRect.c[1]--; - } - } - - // Find one step copy size, based on the copy rectange - amd::Coord3D oneStepSize( - xferRect[0] * xferRect[1] * xferRect[2] * gpuMem(dstMemory).elementSize()); - - // Initialize transfer buffer array - Memory* xferBuf[MaxXferBuffers]; - for (uint i = 0; i < MaxXferBuffers; ++i) { - xferBuf[i] = dev().getGpuMemory(xferBuffers_[i]); - if (xferBuf[i] == NULL) { - transfer = false; - break; - } - } - - // Loop until we transfer all data - while (transfer && (copySize > 0)) { - size_t copySizeTmp = copySize; - amd::Coord3D srcTmp(src); - amd::Coord3D oneStepSizeTmp(oneStepSize); - // Step 1. 
Initiate DRM transfer with all staging buffers - for (uint i = 0; i < MaxXferBuffers; ++i) { - // Make sure we don't transfer more than copy size - if (copySizeTmp > 0) { - if (!gpuMem(srcMemory).partialMemCopyTo(gpu(), srcTmp, - xferSrc, oneStepSizeTmp, *xferBuf[i], CopyRect, FlushDMA)) { - transfer = false; - break; - } - - copySizeTmp -= oneStepSizeTmp[0]; - // Change buffer offset - srcTmp.c[0] += oneStepSizeTmp[0]; - - if (copySizeTmp < oneStepSizeTmp[0]) { - oneStepSizeTmp.c[0] = copySizeTmp; - } - } - else { - break; - } - } - - // Step 2. Initiate compute transfer with all staging buffers - for (uint i = 0; i < MaxXferBuffers; ++i) { - if (copySize > 0) { - if (!copyBufferToImageKernel( - *xferBuf[i], dstMemory, - xferSrc, dst, xferRect, false)) { - transfer = false; - break; - } - gpu().flushDMA(MainEngine); - - copySize -= oneStepSize[0]; - // Change buffer offset - src.c[0] += oneStepSize[0]; - // Change image offset, ignore X offset - for (uint j = 1; j < 3; ++j) { - dst.c[j] += xferRect[j]; - if ((dst[j] - dstOrigin[j]) >= size[j]) { - dst.c[j] = dstOrigin[j]; - } - else { - break; - } - } - // Recalculate rectangle size if the remain data is smaller - if (copySize < oneStepSize[0]) { - for (uint j = 0; j < 3; ++j) { - xferRect.c[j] = size[j] - (dst[j] - dstOrigin[j]); - } - oneStepSize.c[0] = copySize; - } - } - else { - break; - } - } - } - - if (copySize == 0) { - result = true; - } - else { - LogWarning("2 step transfer in copyBufferToImage failed"); - } - } - } - } - - if (!result) { - result = copyBufferToImageKernel(srcMemory, - dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); - } +bool KernelBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, const amd::Coord3D& size, + bool entire, size_t rowPitch, size_t slicePitch) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + static const bool CopyRect = false; + // 
Flush DMA for ASYNC copy + static const bool FlushDMA = true; + size_t imgRowPitch = size[0] * gpuMem(dstMemory).elementSize(); + size_t imgSlicePitch = imgRowPitch * size[1]; + if (setup_.disableCopyBufferToImage_) { + result = DmaBlitManager::copyBufferToImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire, rowPitch, slicePitch); synchronize(); - return result; -} - -void -CalcRowSlicePitches( - cl_ulong* pitch, const cl_int* copySize, - size_t rowPitch, size_t slicePitch, const Memory& mem) -{ - size_t memFmtSize = memoryFormatSize(mem.cal()->format_).size_; - bool img1Darray = (mem.cal()->dimension_ == GSL_MOA_TEXTURE_1D_ARRAY) ? true : false; - - if (rowPitch == 0) { - pitch[0] = copySize[0]; - } - else { - pitch[0] = rowPitch / memFmtSize; - } - if (slicePitch == 0) { - pitch[1] = pitch[0] * (img1Darray ? 1 : copySize[1]); - } - else { - pitch[1] = slicePitch / memFmtSize; - } - assert((pitch[0] <= pitch[1]) && "rowPitch must be <= slicePitch"); - - if (img1Darray) { - // For 1D array rowRitch = slicePitch - pitch[0] = pitch[1]; - } -} - -static void -setArgument(amd::Kernel* kernel, size_t index, size_t size, const void* value) -{ - const amd::KernelParameterDescriptor& desc = kernel->signature().at(index); - - void* param = kernel->parameters().values() + desc.offset_; - assert((desc.type_ == T_POINTER || value != NULL || desc.size_ == 0) && - "not a valid local mem arg"); - - uint32_t uint32_value = 0; - uint64_t uint64_value = 0; - - if (desc.type_ == T_POINTER && desc.size_ != 0) { - if ((value == NULL) || (static_cast(value) == NULL)) { - LP64_SWITCH(uint32_value, uint64_value) = 0; - } - else { - // convert cl_mem to amd::Memory*, return false if invalid. - LP64_SWITCH(uint32_value, uint64_value) = - (uintptr_t)(*static_cast(value)); - } - } - else if (desc.type_ == T_SAMPLER) { - assert(false && "No sampler support in blit manager! 
Use internal samplers!"); - } - else switch (desc.size_) { - case 1: uint32_value = *static_cast(value); break; - case 2: uint32_value = *static_cast(value); break; - case 4: uint32_value = *static_cast(value); break; - case 8: uint64_value = *static_cast(value); break; - default: break; - } - - switch (desc.size_) { - case 0 /*local mem*/ : *static_cast(param) = size; break; - case sizeof(uint32_t): *static_cast(param) = uint32_value; break; - case sizeof(uint64_t): *static_cast(param) = uint64_value; break; - default: ::memcpy(param, value, size); break; - } -} - -bool -KernelBlitManager::copyBufferToImageKernel( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire, - size_t rowPitch, - size_t slicePitch) const -{ - bool rejected = false; - Memory* dstView = &gpuMem(dstMemory); - bool releaseView = false; - bool result = false; - CalFormat imgFormat; - imgFormat.channelOrder_ = gpuMem(dstMemory).cal()->channelOrder_; - imgFormat.type_ = gpuMem(dstMemory).cal()->format_; - amd::Image::Format newFormat(dev().getOclFormat(imgFormat)); - - // Find unsupported formats - for (uint i = 0; i < RejectedFormatDataTotal; ++i) { - if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { - newFormat.image_channel_data_type = RejectedData[i].clNewType_; - rejected = true; - break; - } - } - - // Find unsupported channel's order - for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { - if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { - newFormat.image_channel_order = RejectedOrder[i].clNewType_; - rejected = true; - break; - } - } - - // If the image format was rejected, then attempt to create a view - if (rejected) { - dstView = createView(gpuMem(dstMemory), dev().getCalFormat(newFormat)); - if (dstView != NULL) { - rejected = false; - releaseView = true; - } - } - - // Fall into the host path if the image format was rejected - if 
(rejected) { - return HostBlitManager::copyBufferToImage( - srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire); - } - - // Use a common blit type with three dimensions by default - uint blitType = BlitCopyBufferToImage; - size_t dim = 0; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - size_t globalWorkSize[3]; - size_t localWorkSize[3]; - - // Program the kernels workload depending on the blit dimensions - dim = 3; - if (gpuMem(dstMemory).cal()->dimSize_ == 1) { - globalWorkSize[0] = amd::alignUp(size[0], 256); - globalWorkSize[1] = amd::alignUp(size[1], 1); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = 256; - localWorkSize[1] = localWorkSize[2] = 1; - } - else if (gpuMem(dstMemory).cal()->dimSize_ == 2) { - globalWorkSize[0] = amd::alignUp(size[0], 16); - globalWorkSize[1] = amd::alignUp(size[1], 16); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = localWorkSize[1] = 16; - localWorkSize[2] = 1; - } - else { - globalWorkSize[0] = amd::alignUp(size[0], 8); - globalWorkSize[1] = amd::alignUp(size[1], 8); - globalWorkSize[2] = amd::alignUp(size[2], 4); - localWorkSize[0] = localWorkSize[1] = 8; - localWorkSize[2] = 4; - } - - // Program kernels arguments for the blit operation - Memory* mem = &gpuMem(srcMemory); - setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); - mem = dstView; - setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); - const MemFormatStruct& memFmt = memoryFormatSize(gpuMem(dstMemory).cal()->format_); - - // 1 element granularity for writes by default - cl_int granularity = 1; - if (memFmt.size_ == 2) { - granularity = 2; - } - else if (memFmt.size_ >= 4) { - granularity = 4; - } - CondLog(((srcOrigin[0] % granularity) != 0), "Unaligned offset in blit!"); - cl_ulong srcOrg[4] = { srcOrigin[0] / granularity, - srcOrigin[1], - srcOrigin[2], 0 }; - setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg); - - cl_int dstOrg[4] = { (cl_int)dstOrigin[0], - (cl_int)dstOrigin[1], - 
(cl_int)dstOrigin[2], 0 }; - cl_int copySize[4] = { (cl_int)size[0], - (cl_int)size[1], - (cl_int)size[2], 0 }; - - setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg); - setArgument(kernels_[blitType], 4, sizeof(copySize), copySize); - - // Program memory format - uint multiplier = memFmt.size_ / sizeof(uint32_t); - multiplier = (multiplier == 0) ? 1 : multiplier; - cl_uint format[4] = { memFmt.components_, - memFmt.size_ / memFmt.components_, - multiplier, 0 }; - setArgument(kernels_[blitType], 5, sizeof(format), format); - - // Program row and slice pitches - cl_ulong pitch[4] = { 0 }; - CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, gpuMem(dstMemory)); - setArgument(kernels_[blitType], 6, sizeof(pitch), pitch); - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(dim, - globalWorkOffset, globalWorkSize, localWorkSize); - - // Execute the blit - address parameters = kernels_[blitType]->parameters().values(); - result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters); - if (releaseView) { - delete dstView; - } - - return result; -} - -bool -KernelBlitManager::copyImageToBuffer( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire, - size_t rowPitch, - size_t slicePitch) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - static const bool CopyRect = false; - // Flush DMA for ASYNC copy - static const bool FlushDMA = true; - size_t imgRowPitch = size[0] * gpuMem(srcMemory).elementSize(); - size_t imgSlicePitch = imgRowPitch * size[1]; - - if (setup_.disableCopyImageToBuffer_) { - result = HostBlitManager::copyImageToBuffer( - srcMemory, dstMemory, srcOrigin, dstOrigin, - size, entire, rowPitch, slicePitch); + } + // Check if buffer is in system memory with direct access + else if (gpuMem(srcMemory).isHostMemDirectAccess() && + (((rowPitch == 0) && (slicePitch == 
0)) || + ((rowPitch == imgRowPitch) && ((slicePitch == 0) || (slicePitch == imgSlicePitch))))) { + // First attempt to do this all with DMA, + // but there are restriciton with older hardware + if (dev().settings().imageDMA_) { + result = DmaBlitManager::copyBufferToImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire, rowPitch, slicePitch); + if (result) { synchronize(); return result; + } } - // Check if buffer is in system memory with direct access - else if (gpuMem(dstMemory).isHostMemDirectAccess() && - (((rowPitch == 0) && (slicePitch == 0)) || - ((rowPitch == imgRowPitch) && - ((slicePitch == 0) || (slicePitch == imgSlicePitch))))) { - // First attempt to do this all with DMA, - // but there are restriciton with older hardware - // If the dest buffer is external physical(SDI), copy two step as - // single step SDMA is causing corruption and the cause is under investigation - if (dev().settings().imageDMA_ && - gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical) { - result = DmaBlitManager::copyImageToBuffer( - srcMemory, dstMemory, srcOrigin, dstOrigin, - size, entire, rowPitch, slicePitch); - if (result) { - synchronize(); - return result; - } + + if (!setup_.disableCopyBufferToImageOpt_) { + // Find the overall copy size + size_t copySize = size[0] * size[1] * size[2] * gpuMem(dstMemory).elementSize(); + + // Check if double copy was requested + if (xferBufferSize_ != 0) { + amd::Coord3D src(srcOrigin); + amd::Coord3D xferSrc(0, 0, 0); + amd::Coord3D dst(dstOrigin); + amd::Coord3D xferRect(size); + // Find transfer size in pixels + size_t xferSizePix = xferBufferSize_ / gpuMem(dstMemory).elementSize(); + bool transfer = true; + + // Find transfer rectangle + if (xferRect[0] > xferSizePix) { + // The algorithm can't break a line. 
+ // It requires multiple rectangles tracking + transfer = false; + } else { + xferRect.c[1] = xferSizePix / xferRect[0]; + } + // Check if we exceeded the original size boundary in Y + if (xferRect[1] > size[1]) { + xferRect.c[1] = size[1]; + xferRect.c[2] = xferSizePix / (xferRect[0] * xferRect[1]); + } else { + xferRect.c[2] = 1; + } + // Check if we exceeded the original size boundary in Z + if (xferRect[2] > size[2]) { + xferRect.c[2] = size[2]; + } + // Make sure size in Y dimension is divided by the rectangle size + if (size[2] > 1) { + while ((size[1] % xferRect[1]) != 0) { + xferRect.c[1]--; + } } - // Find the overall copy size - size_t copySize = size[0] * size[1] * size[2] * gpuMem(srcMemory).elementSize(); + // Find one step copy size, based on the copy rectange + amd::Coord3D oneStepSize(xferRect[0] * xferRect[1] * xferRect[2] * + gpuMem(dstMemory).elementSize()); - // Check if double copy was requested - if (xferBufferSize_ != 0) { - amd::Coord3D src(srcOrigin); - amd::Coord3D dst(dstOrigin); - amd::Coord3D xferDst(0, 0, 0); - amd::Coord3D xferRect(size); - // Find transfer size in pixels - size_t xferSizePix = xferBufferSize_ / gpuMem(srcMemory).elementSize(); - bool transfer = true; + // Initialize transfer buffer array + Memory* xferBuf[MaxXferBuffers]; + for (uint i = 0; i < MaxXferBuffers; ++i) { + xferBuf[i] = dev().getGpuMemory(xferBuffers_[i]); + if (xferBuf[i] == NULL) { + transfer = false; + break; + } + } - // Find transfer rectangle - if (xferRect[0] > xferSizePix) { - // The algorithm can't break a line. - // It requires multiple rectangles tracking + // Loop until we transfer all data + while (transfer && (copySize > 0)) { + size_t copySizeTmp = copySize; + amd::Coord3D srcTmp(src); + amd::Coord3D oneStepSizeTmp(oneStepSize); + // Step 1. 
Initiate DRM transfer with all staging buffers + for (uint i = 0; i < MaxXferBuffers; ++i) { + // Make sure we don't transfer more than copy size + if (copySizeTmp > 0) { + if (!gpuMem(srcMemory).partialMemCopyTo(gpu(), srcTmp, xferSrc, oneStepSizeTmp, + *xferBuf[i], CopyRect, FlushDMA)) { transfer = false; + break; + } + + copySizeTmp -= oneStepSizeTmp[0]; + // Change buffer offset + srcTmp.c[0] += oneStepSizeTmp[0]; + + if (copySizeTmp < oneStepSizeTmp[0]) { + oneStepSizeTmp.c[0] = copySizeTmp; + } + } else { + break; } - else { - xferRect.c[1] = xferSizePix / xferRect[0]; - } - // Check if we exceeded the original size boundary in Y - if (xferRect[1] > size[1]) { - xferRect.c[1] = size[1]; - xferRect.c[2] = xferSizePix / (xferRect[0] * xferRect[1]); - } - else { - xferRect.c[2] = 1; - } - // Check if we exceeded the original size boundary in Z - if (xferRect[2] > size[2]) { - xferRect.c[2] = size[2]; - } - // Make sure size in Y dimension is divided by the rectangle size - if (size[2] > 1) { - while ((size[1] % xferRect[1]) != 0) { - xferRect.c[1]--; + } + + // Step 2. 
Initiate compute transfer with all staging buffers + for (uint i = 0; i < MaxXferBuffers; ++i) { + if (copySize > 0) { + if (!copyBufferToImageKernel(*xferBuf[i], dstMemory, xferSrc, dst, xferRect, false)) { + transfer = false; + break; + } + gpu().flushDMA(MainEngine); + + copySize -= oneStepSize[0]; + // Change buffer offset + src.c[0] += oneStepSize[0]; + // Change image offset, ignore X offset + for (uint j = 1; j < 3; ++j) { + dst.c[j] += xferRect[j]; + if ((dst[j] - dstOrigin[j]) >= size[j]) { + dst.c[j] = dstOrigin[j]; + } else { + break; } - } - - // Find one step copy size, based on the copy rectange - amd::Coord3D oneStepSize( - xferRect[0] * xferRect[1] * xferRect[2] * gpuMem(srcMemory).elementSize()); - - // Initialize transfer buffer array - Memory* xferBuf[MaxXferBuffers]; - for (uint i = 0; i < MaxXferBuffers; ++i) { - xferBuf[i] = dev().getGpuMemory(xferBuffers_[i]); - if (xferBuf[i] == NULL) { - transfer = false; - break; + } + // Recalculate rectangle size if the remain data is smaller + if (copySize < oneStepSize[0]) { + for (uint j = 0; j < 3; ++j) { + xferRect.c[j] = size[j] - (dst[j] - dstOrigin[j]); } + oneStepSize.c[0] = copySize; + } + } else { + break; } - - // Loop until we transfer all data - while (transfer && (copySize > 0)) { - size_t copySizeTmp = copySize; - amd::Coord3D srcTmp(src); - amd::Coord3D oneStepSizeTmp(oneStepSize); - amd::Coord3D xferRectTmp(xferRect); - - // Step 1. 
Initiate compute transfer with all staging buffers - for (uint i = 0; i < MaxXferBuffers; ++i) { - if (copySizeTmp > 0) { - if (!copyImageToBufferKernel( - srcMemory, *xferBuf[i], - srcTmp, xferDst, xferRectTmp, false)) { - transfer = false; - break; - } - gpu().flushDMA(MainEngine); - - copySizeTmp -= oneStepSizeTmp[0]; - // Change image offset, ignore X offset - for (uint j = 1; j < 3; ++j) { - srcTmp.c[j] += xferRectTmp[j]; - if ((srcTmp[j] - srcOrigin[j]) >= size[j]) { - srcTmp.c[j] = srcOrigin[j]; - } - else { - break; - } - } - // Recalculate rectangle size if the remain data is smaller - if (copySizeTmp < oneStepSizeTmp[0]) { - for (uint j = 0; j < 3; ++j) { - xferRectTmp.c[j] = size[j] - (srcTmp[j] - srcOrigin[j]); - } - } - } - else { - break; - } - } - - // Step 2. Initiate DRM transfer with all staging buffers - for (uint i = 0; i < MaxXferBuffers; ++i) { - // Make sure we don't transfer more than copy size - if (copySize > 0) { - if (!xferBuf[i]->partialMemCopyTo(gpu(), xferDst, dst, - oneStepSize, gpuMem(dstMemory), CopyRect, FlushDMA)) { - transfer = false; - break; - } - - copySize -= oneStepSize[0]; - // Change buffer offset - dst.c[0] += oneStepSize[0]; - // Change image offset, ignore X offset - for (uint j = 1; j < 3; ++j) { - src.c[j] += xferRect[j]; - if ((src[j] - srcOrigin[j]) >= size[j]) { - src.c[j] = srcOrigin[j]; - } - else { - break; - } - } - // Recalculate rectangle size if the remain data is smaller - if (copySize < oneStepSize[0]) { - for (uint j = 0; j < 3; ++j) { - xferRect.c[j] = size[j] - (src[j] - srcOrigin[j]); - } - oneStepSize.c[0] = copySize; - } - } - else { - break; - } - } - } - - if (copySize == 0) { - result = true; - } - else { - LogWarning("2 step transfer in copyBufferToImage failed"); - } + } } + + if (copySize == 0) { + result = true; + } else { + LogWarning("2 step transfer in copyBufferToImage failed"); + } + } } + } - if (!result) { - result = copyImageToBufferKernel(srcMemory, - dstMemory, srcOrigin, dstOrigin, 
size, entire, rowPitch, slicePitch); - } + if (!result) { + result = copyBufferToImageKernel(srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire, + rowPitch, slicePitch); + } - synchronize(); + synchronize(); - return result; + return result; } -bool -KernelBlitManager::copyImageToBufferKernel( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire, - size_t rowPitch, - size_t slicePitch) const -{ - bool rejected = false; - Memory* srcView = &gpuMem(srcMemory); - bool releaseView = false; - bool result = false; - CalFormat imgFormat; - imgFormat.channelOrder_ = gpuMem(srcMemory).cal()->channelOrder_; - imgFormat.type_ = gpuMem(srcMemory).cal()->format_; - amd::Image::Format newFormat(dev().getOclFormat(imgFormat)); +void CalcRowSlicePitches(cl_ulong* pitch, const cl_int* copySize, size_t rowPitch, + size_t slicePitch, const Memory& mem) { + size_t memFmtSize = memoryFormatSize(mem.cal()->format_).size_; + bool img1Darray = (mem.cal()->dimension_ == GSL_MOA_TEXTURE_1D_ARRAY) ? true : false; - // Find unsupported formats - for (uint i = 0; i < RejectedFormatDataTotal; ++i) { - if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { - newFormat.image_channel_data_type = RejectedData[i].clNewType_; - rejected = true; - break; - } + if (rowPitch == 0) { + pitch[0] = copySize[0]; + } else { + pitch[0] = rowPitch / memFmtSize; + } + if (slicePitch == 0) { + pitch[1] = pitch[0] * (img1Darray ? 
1 : copySize[1]); + } else { + pitch[1] = slicePitch / memFmtSize; + } + assert((pitch[0] <= pitch[1]) && "rowPitch must be <= slicePitch"); + + if (img1Darray) { + // For 1D array rowRitch = slicePitch + pitch[0] = pitch[1]; + } +} + +static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const void* value) { + const amd::KernelParameterDescriptor& desc = kernel->signature().at(index); + + void* param = kernel->parameters().values() + desc.offset_; + assert((desc.type_ == T_POINTER || value != NULL || desc.size_ == 0) && + "not a valid local mem arg"); + + uint32_t uint32_value = 0; + uint64_t uint64_value = 0; + + if (desc.type_ == T_POINTER && desc.size_ != 0) { + if ((value == NULL) || (static_cast(value) == NULL)) { + LP64_SWITCH(uint32_value, uint64_value) = 0; + } else { + // convert cl_mem to amd::Memory*, return false if invalid. + LP64_SWITCH(uint32_value, uint64_value) = (uintptr_t)(*static_cast(value)); + } + } else if (desc.type_ == T_SAMPLER) { + assert(false && "No sampler support in blit manager! 
Use internal samplers!"); + } else + switch (desc.size_) { + case 1: + uint32_value = *static_cast(value); + break; + case 2: + uint32_value = *static_cast(value); + break; + case 4: + uint32_value = *static_cast(value); + break; + case 8: + uint64_value = *static_cast(value); + break; + default: + break; } - // Find unsupported channel's order + switch (desc.size_) { + case 0 /*local mem*/: + *static_cast(param) = size; + break; + case sizeof(uint32_t): + *static_cast(param) = uint32_value; + break; + case sizeof(uint64_t): + *static_cast(param) = uint64_value; + break; + default: + ::memcpy(param, value, size); + break; + } +} + +bool KernelBlitManager::copyBufferToImageKernel(device::Memory& srcMemory, + device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, bool entire, + size_t rowPitch, size_t slicePitch) const { + bool rejected = false; + Memory* dstView = &gpuMem(dstMemory); + bool releaseView = false; + bool result = false; + CalFormat imgFormat; + imgFormat.channelOrder_ = gpuMem(dstMemory).cal()->channelOrder_; + imgFormat.type_ = gpuMem(dstMemory).cal()->format_; + amd::Image::Format newFormat(dev().getOclFormat(imgFormat)); + + // Find unsupported formats + for (uint i = 0; i < RejectedFormatDataTotal; ++i) { + if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { + newFormat.image_channel_data_type = RejectedData[i].clNewType_; + rejected = true; + break; + } + } + + // Find unsupported channel's order + for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { + if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { + newFormat.image_channel_order = RejectedOrder[i].clNewType_; + rejected = true; + break; + } + } + + // If the image format was rejected, then attempt to create a view + if (rejected) { + dstView = createView(gpuMem(dstMemory), dev().getCalFormat(newFormat)); + if (dstView != NULL) { + rejected = false; + releaseView = true; + } + } + + // Fall 
into the host path if the image format was rejected + if (rejected) { + return HostBlitManager::copyBufferToImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire); + } + + // Use a common blit type with three dimensions by default + uint blitType = BlitCopyBufferToImage; + size_t dim = 0; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3]; + size_t localWorkSize[3]; + + // Program the kernels workload depending on the blit dimensions + dim = 3; + if (gpuMem(dstMemory).cal()->dimSize_ == 1) { + globalWorkSize[0] = amd::alignUp(size[0], 256); + globalWorkSize[1] = amd::alignUp(size[1], 1); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = 256; + localWorkSize[1] = localWorkSize[2] = 1; + } else if (gpuMem(dstMemory).cal()->dimSize_ == 2) { + globalWorkSize[0] = amd::alignUp(size[0], 16); + globalWorkSize[1] = amd::alignUp(size[1], 16); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = localWorkSize[1] = 16; + localWorkSize[2] = 1; + } else { + globalWorkSize[0] = amd::alignUp(size[0], 8); + globalWorkSize[1] = amd::alignUp(size[1], 8); + globalWorkSize[2] = amd::alignUp(size[2], 4); + localWorkSize[0] = localWorkSize[1] = 8; + localWorkSize[2] = 4; + } + + // Program kernels arguments for the blit operation + Memory* mem = &gpuMem(srcMemory); + setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); + mem = dstView; + setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); + const MemFormatStruct& memFmt = memoryFormatSize(gpuMem(dstMemory).cal()->format_); + + // 1 element granularity for writes by default + cl_int granularity = 1; + if (memFmt.size_ == 2) { + granularity = 2; + } else if (memFmt.size_ >= 4) { + granularity = 4; + } + CondLog(((srcOrigin[0] % granularity) != 0), "Unaligned offset in blit!"); + cl_ulong srcOrg[4] = {srcOrigin[0] / granularity, srcOrigin[1], srcOrigin[2], 0}; + setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg); + + cl_int dstOrg[4] = {(cl_int)dstOrigin[0], 
(cl_int)dstOrigin[1], (cl_int)dstOrigin[2], 0}; + cl_int copySize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0}; + + setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg); + setArgument(kernels_[blitType], 4, sizeof(copySize), copySize); + + // Program memory format + uint multiplier = memFmt.size_ / sizeof(uint32_t); + multiplier = (multiplier == 0) ? 1 : multiplier; + cl_uint format[4] = {memFmt.components_, memFmt.size_ / memFmt.components_, multiplier, 0}; + setArgument(kernels_[blitType], 5, sizeof(format), format); + + // Program row and slice pitches + cl_ulong pitch[4] = {0}; + CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, gpuMem(dstMemory)); + setArgument(kernels_[blitType], 6, sizeof(pitch), pitch); + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize); + + // Execute the blit + address parameters = kernels_[blitType]->parameters().values(); + result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters); + if (releaseView) { + delete dstView; + } + + return result; +} + +bool KernelBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, const amd::Coord3D& size, + bool entire, size_t rowPitch, size_t slicePitch) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + static const bool CopyRect = false; + // Flush DMA for ASYNC copy + static const bool FlushDMA = true; + size_t imgRowPitch = size[0] * gpuMem(srcMemory).elementSize(); + size_t imgSlicePitch = imgRowPitch * size[1]; + + if (setup_.disableCopyImageToBuffer_) { + result = HostBlitManager::copyImageToBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire, rowPitch, slicePitch); + synchronize(); + return result; + } + // Check if buffer is in system memory with direct access + else if (gpuMem(dstMemory).isHostMemDirectAccess() && + (((rowPitch 
== 0) && (slicePitch == 0)) || + ((rowPitch == imgRowPitch) && ((slicePitch == 0) || (slicePitch == imgSlicePitch))))) { + // First attempt to do this all with DMA, + // but there are restriciton with older hardware + // If the dest buffer is external physical(SDI), copy two step as + // single step SDMA is causing corruption and the cause is under investigation + if (dev().settings().imageDMA_ && + gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical) { + result = DmaBlitManager::copyImageToBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire, rowPitch, slicePitch); + if (result) { + synchronize(); + return result; + } + } + + // Find the overall copy size + size_t copySize = size[0] * size[1] * size[2] * gpuMem(srcMemory).elementSize(); + + // Check if double copy was requested + if (xferBufferSize_ != 0) { + amd::Coord3D src(srcOrigin); + amd::Coord3D dst(dstOrigin); + amd::Coord3D xferDst(0, 0, 0); + amd::Coord3D xferRect(size); + // Find transfer size in pixels + size_t xferSizePix = xferBufferSize_ / gpuMem(srcMemory).elementSize(); + bool transfer = true; + + // Find transfer rectangle + if (xferRect[0] > xferSizePix) { + // The algorithm can't break a line. 
+ // It requires multiple rectangles tracking + transfer = false; + } else { + xferRect.c[1] = xferSizePix / xferRect[0]; + } + // Check if we exceeded the original size boundary in Y + if (xferRect[1] > size[1]) { + xferRect.c[1] = size[1]; + xferRect.c[2] = xferSizePix / (xferRect[0] * xferRect[1]); + } else { + xferRect.c[2] = 1; + } + // Check if we exceeded the original size boundary in Z + if (xferRect[2] > size[2]) { + xferRect.c[2] = size[2]; + } + // Make sure size in Y dimension is divided by the rectangle size + if (size[2] > 1) { + while ((size[1] % xferRect[1]) != 0) { + xferRect.c[1]--; + } + } + + // Find one step copy size, based on the copy rectange + amd::Coord3D oneStepSize(xferRect[0] * xferRect[1] * xferRect[2] * + gpuMem(srcMemory).elementSize()); + + // Initialize transfer buffer array + Memory* xferBuf[MaxXferBuffers]; + for (uint i = 0; i < MaxXferBuffers; ++i) { + xferBuf[i] = dev().getGpuMemory(xferBuffers_[i]); + if (xferBuf[i] == NULL) { + transfer = false; + break; + } + } + + // Loop until we transfer all data + while (transfer && (copySize > 0)) { + size_t copySizeTmp = copySize; + amd::Coord3D srcTmp(src); + amd::Coord3D oneStepSizeTmp(oneStepSize); + amd::Coord3D xferRectTmp(xferRect); + + // Step 1. 
Initiate compute transfer with all staging buffers + for (uint i = 0; i < MaxXferBuffers; ++i) { + if (copySizeTmp > 0) { + if (!copyImageToBufferKernel(srcMemory, *xferBuf[i], srcTmp, xferDst, xferRectTmp, + false)) { + transfer = false; + break; + } + gpu().flushDMA(MainEngine); + + copySizeTmp -= oneStepSizeTmp[0]; + // Change image offset, ignore X offset + for (uint j = 1; j < 3; ++j) { + srcTmp.c[j] += xferRectTmp[j]; + if ((srcTmp[j] - srcOrigin[j]) >= size[j]) { + srcTmp.c[j] = srcOrigin[j]; + } else { + break; + } + } + // Recalculate rectangle size if the remain data is smaller + if (copySizeTmp < oneStepSizeTmp[0]) { + for (uint j = 0; j < 3; ++j) { + xferRectTmp.c[j] = size[j] - (srcTmp[j] - srcOrigin[j]); + } + } + } else { + break; + } + } + + // Step 2. Initiate DRM transfer with all staging buffers + for (uint i = 0; i < MaxXferBuffers; ++i) { + // Make sure we don't transfer more than copy size + if (copySize > 0) { + if (!xferBuf[i]->partialMemCopyTo(gpu(), xferDst, dst, oneStepSize, gpuMem(dstMemory), + CopyRect, FlushDMA)) { + transfer = false; + break; + } + + copySize -= oneStepSize[0]; + // Change buffer offset + dst.c[0] += oneStepSize[0]; + // Change image offset, ignore X offset + for (uint j = 1; j < 3; ++j) { + src.c[j] += xferRect[j]; + if ((src[j] - srcOrigin[j]) >= size[j]) { + src.c[j] = srcOrigin[j]; + } else { + break; + } + } + // Recalculate rectangle size if the remain data is smaller + if (copySize < oneStepSize[0]) { + for (uint j = 0; j < 3; ++j) { + xferRect.c[j] = size[j] - (src[j] - srcOrigin[j]); + } + oneStepSize.c[0] = copySize; + } + } else { + break; + } + } + } + + if (copySize == 0) { + result = true; + } else { + LogWarning("2 step transfer in copyBufferToImage failed"); + } + } + } + + if (!result) { + result = copyImageToBufferKernel(srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire, + rowPitch, slicePitch); + } + + synchronize(); + + return result; +} + +bool 
KernelBlitManager::copyImageToBufferKernel(device::Memory& srcMemory, + device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, bool entire, + size_t rowPitch, size_t slicePitch) const { + bool rejected = false; + Memory* srcView = &gpuMem(srcMemory); + bool releaseView = false; + bool result = false; + CalFormat imgFormat; + imgFormat.channelOrder_ = gpuMem(srcMemory).cal()->channelOrder_; + imgFormat.type_ = gpuMem(srcMemory).cal()->format_; + amd::Image::Format newFormat(dev().getOclFormat(imgFormat)); + + // Find unsupported formats + for (uint i = 0; i < RejectedFormatDataTotal; ++i) { + if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { + newFormat.image_channel_data_type = RejectedData[i].clNewType_; + rejected = true; + break; + } + } + + // Find unsupported channel's order + for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { + if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { + newFormat.image_channel_order = RejectedOrder[i].clNewType_; + rejected = true; + break; + } + } + + // If the image format was rejected, then attempt to create a view + if (rejected) { + srcView = createView(gpuMem(srcMemory), dev().getCalFormat(newFormat)); + if (srcView != NULL) { + rejected = false; + releaseView = true; + } + } + + // Fall into the host path if the image format was rejected + if (rejected) { + return HostBlitManager::copyImageToBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire); + } + + uint blitType = BlitCopyImageToBuffer; + size_t dim = 0; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3]; + size_t localWorkSize[3]; + + // Program the kernels workload depending on the blit dimensions + dim = 3; + // Find the current blit type + if (gpuMem(srcMemory).cal()->dimSize_ == 1) { + globalWorkSize[0] = amd::alignUp(size[0], 256); + globalWorkSize[1] = amd::alignUp(size[1], 1); + globalWorkSize[2] = amd::alignUp(size[2], 1); + 
localWorkSize[0] = 256; + localWorkSize[1] = localWorkSize[2] = 1; + } else if (gpuMem(srcMemory).cal()->dimSize_ == 2) { + globalWorkSize[0] = amd::alignUp(size[0], 16); + globalWorkSize[1] = amd::alignUp(size[1], 16); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = localWorkSize[1] = 16; + localWorkSize[2] = 1; + } else { + globalWorkSize[0] = amd::alignUp(size[0], 8); + globalWorkSize[1] = amd::alignUp(size[1], 8); + globalWorkSize[2] = amd::alignUp(size[2], 4); + localWorkSize[0] = localWorkSize[1] = 8; + localWorkSize[2] = 4; + } + + // Program kernels arguments for the blit operation + Memory* mem = srcView; + setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); + mem = &gpuMem(dstMemory); + setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); + + // Update extra paramters for USHORT and UBYTE pointers. + // Only then compiler can optimize the kernel to use + // UAV Raw for other writes + setArgument(kernels_[blitType], 2, sizeof(cl_mem), &mem); + setArgument(kernels_[blitType], 3, sizeof(cl_mem), &mem); + + cl_int srcOrg[4] = {(cl_int)srcOrigin[0], (cl_int)srcOrigin[1], (cl_int)srcOrigin[2], 0}; + cl_int copySize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0}; + setArgument(kernels_[blitType], 4, sizeof(srcOrg), srcOrg); + const MemFormatStruct& memFmt = memoryFormatSize(gpuMem(srcMemory).cal()->format_); + + // 1 element granularity for writes by default + cl_int granularity = 1; + if (memFmt.size_ == 2) { + granularity = 2; + } else if (memFmt.size_ >= 4) { + granularity = 4; + } + CondLog(((dstOrigin[0] % granularity) != 0), "Unaligned offset in blit!"); + cl_ulong dstOrg[4] = {dstOrigin[0] / granularity, dstOrigin[1], dstOrigin[2], 0}; + setArgument(kernels_[blitType], 5, sizeof(dstOrg), dstOrg); + setArgument(kernels_[blitType], 6, sizeof(copySize), copySize); + + // Program memory format + uint multiplier = memFmt.size_ / sizeof(uint32_t); + multiplier = (multiplier == 0) ? 
1 : multiplier; + cl_uint format[4] = {memFmt.components_, memFmt.size_ / memFmt.components_, multiplier, 0}; + setArgument(kernels_[blitType], 7, sizeof(format), format); + + // Program row and slice pitches + cl_ulong pitch[4] = {0}; + CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, gpuMem(srcMemory)); + setArgument(kernels_[blitType], 8, sizeof(pitch), pitch); + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize); + + // Execute the blit + address parameters = kernels_[blitType]->parameters().values(); + result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters); + if (releaseView) { + delete srcView; + } + + return result; +} + +bool KernelBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool rejected = false; + Memory* srcView = &gpuMem(srcMemory); + Memory* dstView = &gpuMem(dstMemory); + bool releaseView = false; + bool result = false; + CalFormat imgFormat; + imgFormat.channelOrder_ = gpuMem(srcMemory).cal()->channelOrder_; + imgFormat.type_ = gpuMem(srcMemory).cal()->format_; + amd::Image::Format newFormat(dev().getOclFormat(imgFormat)); + + // Find unsupported formats + for (uint i = 0; i < RejectedFormatDataTotal; ++i) { + if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { + newFormat.image_channel_data_type = RejectedData[i].clNewType_; + rejected = true; + break; + } + } + + // Search for the rejected channel's order only if the format was rejected + // Note: Image blit is independent from the channel order + if (rejected) { for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { - if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { - newFormat.image_channel_order = RejectedOrder[i].clNewType_; - rejected = true; - break; - } 
+ if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { + newFormat.image_channel_order = RejectedOrder[i].clNewType_; + rejected = true; + break; + } } + } - // If the image format was rejected, then attempt to create a view - if (rejected) { - srcView = createView(gpuMem(srcMemory), dev().getCalFormat(newFormat)); - if (srcView != NULL) { - rejected = false; - releaseView = true; - } - } - - // Fall into the host path if the image format was rejected - if (rejected) { - return HostBlitManager::copyImageToBuffer( - srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire); - } - - uint blitType = BlitCopyImageToBuffer; - size_t dim = 0; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - size_t globalWorkSize[3]; - size_t localWorkSize[3]; - - // Program the kernels workload depending on the blit dimensions - dim = 3; - // Find the current blit type - if (gpuMem(srcMemory).cal()->dimSize_ == 1) { - globalWorkSize[0] = amd::alignUp(size[0], 256); - globalWorkSize[1] = amd::alignUp(size[1], 1); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = 256; - localWorkSize[1] = localWorkSize[2] = 1; - } - else if (gpuMem(srcMemory).cal()->dimSize_ == 2) { - globalWorkSize[0] = amd::alignUp(size[0], 16); - globalWorkSize[1] = amd::alignUp(size[1], 16); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = localWorkSize[1] = 16; - localWorkSize[2] = 1; - } - else { - globalWorkSize[0] = amd::alignUp(size[0], 8); - globalWorkSize[1] = amd::alignUp(size[1], 8); - globalWorkSize[2] = amd::alignUp(size[2], 4); - localWorkSize[0] = localWorkSize[1] = 8; - localWorkSize[2] = 4; - } - - // Program kernels arguments for the blit operation - Memory* mem = srcView; - setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); - mem = &gpuMem(dstMemory); - setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); - - // Update extra paramters for USHORT and UBYTE pointers. 
- // Only then compiler can optimize the kernel to use - // UAV Raw for other writes - setArgument(kernels_[blitType], 2, sizeof(cl_mem), &mem); - setArgument(kernels_[blitType], 3, sizeof(cl_mem), &mem); - - cl_int srcOrg[4] = { (cl_int)srcOrigin[0], - (cl_int)srcOrigin[1], - (cl_int)srcOrigin[2], 0 }; - cl_int copySize[4] = { (cl_int)size[0], - (cl_int)size[1], - (cl_int)size[2], 0 }; - setArgument(kernels_[blitType], 4, sizeof(srcOrg), srcOrg); - const MemFormatStruct& memFmt = memoryFormatSize(gpuMem(srcMemory).cal()->format_); - - // 1 element granularity for writes by default - cl_int granularity = 1; - if (memFmt.size_ == 2) { - granularity = 2; - } - else if (memFmt.size_ >= 4) { - granularity = 4; - } - CondLog(((dstOrigin[0] % granularity) != 0), "Unaligned offset in blit!"); - cl_ulong dstOrg[4] = { dstOrigin[0] / granularity, - dstOrigin[1], - dstOrigin[2], 0 }; - setArgument(kernels_[blitType], 5, sizeof(dstOrg), dstOrg); - setArgument(kernels_[blitType], 6, sizeof(copySize), copySize); - - // Program memory format - uint multiplier = memFmt.size_ / sizeof(uint32_t); - multiplier = (multiplier == 0) ? 
1 : multiplier; - cl_uint format[4] = { memFmt.components_, - memFmt.size_ / memFmt.components_, - multiplier, 0 }; - setArgument(kernels_[blitType], 7, sizeof(format), format); - - // Program row and slice pitches - cl_ulong pitch[4] = { 0 }; - CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, gpuMem(srcMemory)); - setArgument(kernels_[blitType], 8, sizeof(pitch), pitch); - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(dim, - globalWorkOffset, globalWorkSize, localWorkSize); - - // Execute the blit - address parameters = kernels_[blitType]->parameters().values(); - result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters); - if (releaseView) { + // Attempt to create a view if the format was rejected + if (rejected) { + srcView = createView(gpuMem(srcMemory), dev().getCalFormat(newFormat)); + if (srcView != NULL) { + dstView = createView(gpuMem(dstMemory), dev().getCalFormat(newFormat)); + if (dstView != NULL) { + rejected = false; + releaseView = true; + } else { delete srcView; + } } + } + // Fall into the host path for the entire 2D copy or + // if the image format was rejected + if (rejected) { + result = HostBlitManager::copyImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire); + synchronize(); return result; + } + + uint blitType = BlitCopyImage; + size_t dim = 0; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3]; + size_t localWorkSize[3]; + + // Program the kernels workload depending on the blit dimensions + dim = 3; + // Find the current blit type + if ((gpuMem(srcMemory).cal()->dimSize_ == 1) || (gpuMem(dstMemory).cal()->dimSize_ == 1)) { + globalWorkSize[0] = amd::alignUp(size[0], 256); + globalWorkSize[1] = amd::alignUp(size[1], 1); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = 256; + localWorkSize[1] = localWorkSize[2] = 1; + } else if ((gpuMem(srcMemory).cal()->dimSize_ == 2) || (gpuMem(dstMemory).cal()->dimSize_ == 2)) { + 
globalWorkSize[0] = amd::alignUp(size[0], 16); + globalWorkSize[1] = amd::alignUp(size[1], 16); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = localWorkSize[1] = 16; + localWorkSize[2] = 1; + } else { + globalWorkSize[0] = amd::alignUp(size[0], 8); + globalWorkSize[1] = amd::alignUp(size[1], 8); + globalWorkSize[2] = amd::alignUp(size[2], 4); + localWorkSize[0] = localWorkSize[1] = 8; + localWorkSize[2] = 4; + } + + // The current OpenCL spec allows "copy images from a 1D image + // array object to a 1D image array object" only. + if ((gpuMem(srcMemory).cal()->dimension_ == GSL_MOA_TEXTURE_1D_ARRAY) || + (gpuMem(dstMemory).cal()->dimension_ == GSL_MOA_TEXTURE_1D_ARRAY)) { + blitType = BlitCopyImage1DA; + } + + // Program kernels arguments for the blit operation + Memory* mem = srcView; + setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); + mem = dstView; + setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); + + // Program source origin + cl_int srcOrg[4] = {(cl_int)srcOrigin[0], (cl_int)srcOrigin[1], (cl_int)srcOrigin[2], 0}; + setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg); + + // Program destinaiton origin + cl_int dstOrg[4] = {(cl_int)dstOrigin[0], (cl_int)dstOrigin[1], (cl_int)dstOrigin[2], 0}; + setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg); + + cl_int copySize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0}; + setArgument(kernels_[blitType], 4, sizeof(copySize), copySize); + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize); + + // Execute the blit + address parameters = kernels_[blitType]->parameters().values(); + result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters); + if (releaseView) { + delete srcView; + delete dstView; + } + + synchronize(); + + return result; } -bool -KernelBlitManager::copyImage( - device::Memory& srcMemory, - device::Memory& dstMemory, - const 
amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire) const -{ - amd::ScopedLock k(lockXferOps_); - bool rejected = false; - Memory* srcView = &gpuMem(srcMemory); - Memory* dstView = &gpuMem(dstMemory); - bool releaseView = false; - bool result = false; - CalFormat imgFormat; - imgFormat.channelOrder_ = gpuMem(srcMemory).cal()->channelOrder_; - imgFormat.type_ = gpuMem(srcMemory).cal()->format_; - amd::Image::Format newFormat(dev().getOclFormat(imgFormat)); +void FindPinSize(size_t& pinSize, const amd::Coord3D& size, size_t& rowPitch, size_t& slicePitch, + const Memory& mem) { + pinSize = size[0] * mem.elementSize(); + if ((rowPitch == 0) || (rowPitch == pinSize)) { + rowPitch = 0; + } else { + pinSize = rowPitch; + } - // Find unsupported formats - for (uint i = 0; i < RejectedFormatDataTotal; ++i) { - if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { - newFormat.image_channel_data_type = RejectedData[i].clNewType_; - rejected = true; - break; + // Calculate the pin size, which should be equal to the copy size + for (uint i = 1; i < mem.cal()->dimSize_; ++i) { + pinSize *= size[i]; + if (i == 1) { + if ((slicePitch == 0) || (slicePitch == pinSize)) { + slicePitch = 0; + } else { + if (mem.cal()->dimension_ != GSL_MOA_TEXTURE_1D_ARRAY) { + pinSize = slicePitch; + } else { + pinSize = slicePitch * size[i]; } + } + } + } +} + +bool KernelBlitManager::readImage(device::Memory& srcMemory, void* dstHost, + const amd::Coord3D& origin, const amd::Coord3D& size, + size_t rowPitch, size_t slicePitch, bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host copy if memory has direct access or it's persistent + if (setup_.disableReadImage_ || + (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { + result = + HostBlitManager::readImage(srcMemory, dstHost, origin, size, rowPitch, slicePitch, entire); + synchronize(); + return result; + } else { 
+ size_t pinSize; + FindPinSize(pinSize, size, rowPitch, slicePitch, gpuMem(srcMemory)); + + size_t partial; + amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial); + + if (amdMemory == NULL) { + // Force SW copy + result = HostBlitManager::readImage(srcMemory, dstHost, origin, size, rowPitch, slicePitch, + entire); + synchronize(); + return result; } - // Search for the rejected channel's order only if the format was rejected - // Note: Image blit is independent from the channel order - if (rejected) { - for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { - if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { - newFormat.image_channel_order = RejectedOrder[i].clNewType_; - rejected = true; - break; - } - } + // Readjust destination offset + const amd::Coord3D dstOrigin(partial); + + // Get device memory for this virtual device + Memory* dstMemory = dev().getGpuMemory(amdMemory); + + // Copy image to buffer + result = copyImageToBuffer(srcMemory, *dstMemory, origin, dstOrigin, size, entire, rowPitch, + slicePitch); + + // Add pinned memory for a later release + gpu().addPinnedMem(amdMemory); + } + + synchronize(); + + return result; +} + +bool KernelBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory, + const amd::Coord3D& origin, const amd::Coord3D& size, + size_t rowPitch, size_t slicePitch, bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host copy if memory has direct access or it's persistent + if (setup_.disableWriteImage_ || gpuMem(dstMemory).isHostMemDirectAccess() || + gpuMem(dstMemory).isPersistentDirectMap()) { + result = + HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); + synchronize(); + return result; + } else { + size_t pinSize; + FindPinSize(pinSize, size, rowPitch, slicePitch, gpuMem(dstMemory)); + + size_t partial; + amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial); + + if (amdMemory == NULL) { + 
// Force SW copy + result = HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch, + entire); + synchronize(); + return result; } - // Attempt to create a view if the format was rejected - if (rejected) { - srcView = createView(gpuMem(srcMemory), dev().getCalFormat(newFormat)); - if (srcView != NULL) { - dstView = createView(gpuMem(dstMemory), dev().getCalFormat(newFormat)); - if (dstView != NULL) { - rejected = false; - releaseView = true; - } - else { - delete srcView; - } - } - } + // Readjust destination offset + const amd::Coord3D srcOrigin(partial); - // Fall into the host path for the entire 2D copy or - // if the image format was rejected - if (rejected) { - result = HostBlitManager::copyImage(srcMemory, dstMemory, - srcOrigin, dstOrigin, size, entire); + // Get device memory for this virtual device + Memory* srcMemory = dev().getGpuMemory(amdMemory); + + // Copy image to buffer + result = copyBufferToImage(*srcMemory, dstMemory, srcOrigin, origin, size, entire, rowPitch, + slicePitch); + + // Add pinned memory for a later release + gpu().addPinnedMem(amdMemory); + } + + synchronize(); + + return result; +} + +bool KernelBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::BufferRect& srcRectIn, + const amd::BufferRect& dstRectIn, const amd::Coord3D& sizeIn, + bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + bool rejected = false; + + // Fall into the CAL path for rejected transfers + if (setup_.disableCopyBufferRect_ || gpuMem(srcMemory).isHostMemDirectAccess() || + gpuMem(dstMemory).isHostMemDirectAccess()) { + result = + DmaBlitManager::copyBufferRect(srcMemory, dstMemory, srcRectIn, dstRectIn, sizeIn, entire); + + if (result) { + synchronize(); + return result; + } + } + + uint blitType = BlitCopyBufferRect; + size_t dim = 3; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3]; + size_t localWorkSize[3]; + + const static uint 
CopyRectAlignment[3] = {16, 4, 1}; + + bool aligned; + uint i; + for (i = 0; i < sizeof(CopyRectAlignment) / sizeof(uint); i++) { + // Check source alignments + aligned = ((srcRectIn.rowPitch_ % CopyRectAlignment[i]) == 0); + aligned &= ((srcRectIn.slicePitch_ % CopyRectAlignment[i]) == 0); + aligned &= ((srcRectIn.start_ % CopyRectAlignment[i]) == 0); + + // Check destination alignments + aligned &= ((dstRectIn.rowPitch_ % CopyRectAlignment[i]) == 0); + aligned &= ((dstRectIn.slicePitch_ % CopyRectAlignment[i]) == 0); + aligned &= ((dstRectIn.start_ % CopyRectAlignment[i]) == 0); + + // Check copy size alignment in the first dimension + aligned &= ((sizeIn[0] % CopyRectAlignment[i]) == 0); + + if (aligned) { + if (CopyRectAlignment[i] != 1) { + blitType = BlitCopyBufferRectAligned; + } + break; + } + } + + amd::BufferRect srcRect; + amd::BufferRect dstRect; + amd::Coord3D size(sizeIn[0], sizeIn[1], sizeIn[2]); + + srcRect.rowPitch_ = srcRectIn.rowPitch_ / CopyRectAlignment[i]; + srcRect.slicePitch_ = srcRectIn.slicePitch_ / CopyRectAlignment[i]; + srcRect.start_ = srcRectIn.start_ / CopyRectAlignment[i]; + srcRect.end_ = srcRectIn.end_ / CopyRectAlignment[i]; + + dstRect.rowPitch_ = dstRectIn.rowPitch_ / CopyRectAlignment[i]; + dstRect.slicePitch_ = dstRectIn.slicePitch_ / CopyRectAlignment[i]; + dstRect.start_ = dstRectIn.start_ / CopyRectAlignment[i]; + dstRect.end_ = dstRectIn.end_ / CopyRectAlignment[i]; + + size.c[0] /= CopyRectAlignment[i]; + + // Program the kernel's workload depending on the transfer dimensions + if ((size[1] == 1) && (size[2] == 1)) { + globalWorkSize[0] = amd::alignUp(size[0], 256); + globalWorkSize[1] = 1; + globalWorkSize[2] = 1; + localWorkSize[0] = 256; + localWorkSize[1] = 1; + localWorkSize[2] = 1; + } else if (size[2] == 1) { + globalWorkSize[0] = amd::alignUp(size[0], 16); + globalWorkSize[1] = amd::alignUp(size[1], 16); + globalWorkSize[2] = 1; + localWorkSize[0] = localWorkSize[1] = 16; + localWorkSize[2] = 1; + } else { + 
globalWorkSize[0] = amd::alignUp(size[0], 8); + globalWorkSize[1] = amd::alignUp(size[1], 8); + globalWorkSize[2] = amd::alignUp(size[2], 4); + localWorkSize[0] = localWorkSize[1] = 8; + localWorkSize[2] = 4; + } + + + // Program kernels arguments for the blit operation + Memory* mem = &gpuMem(srcMemory); + setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); + mem = &gpuMem(dstMemory); + setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); + cl_ulong src[4] = {srcRect.rowPitch_, srcRect.slicePitch_, srcRect.start_, 0}; + setArgument(kernels_[blitType], 2, sizeof(src), src); + cl_ulong dst[4] = {dstRect.rowPitch_, dstRect.slicePitch_, dstRect.start_, 0}; + setArgument(kernels_[blitType], 3, sizeof(dst), dst); + cl_ulong copySize[4] = {size[0], size[1], size[2], CopyRectAlignment[i]}; + setArgument(kernels_[blitType], 4, sizeof(copySize), copySize); + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize); + + // Execute the blit + address parameters = kernels_[blitType]->parameters().values(); + result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters); + + synchronize(); + + return result; +} + +bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, + const amd::Coord3D& origin, const amd::Coord3D& size, + bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + // Use host copy if memory has direct access + if (setup_.disableReadBuffer_ || + (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { + result = HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire); + synchronize(); + return result; + } else { + size_t pinSize = size[0]; + // Check if a pinned transfer can be executed with a single pin + if ((pinSize <= dev().settings().pinnedXferSize_) && (pinSize > MinSizeForPinnedTransfer)) { + size_t partial; + amd::Memory* amdMemory = pinHostMemory(dstHost, 
pinSize, partial); + + if (amdMemory == NULL) { + // Force SW copy + result = HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire); synchronize(); return result; + } + + // Readjust host mem offset + amd::Coord3D dstOrigin(partial); + + // Get device memory for this virtual device + Memory* dstMemory = dev().getGpuMemory(amdMemory); + + // Copy image to buffer + result = copyBuffer(srcMemory, *dstMemory, origin, dstOrigin, size, entire); + + // Add pinned memory for a later release + gpu().addPinnedMem(amdMemory); + } else { + result = DmaBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire); + } + } + + synchronize(); + + return result; +} + +bool KernelBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost, + const amd::BufferRect& bufRect, + const amd::BufferRect& hostRect, const amd::Coord3D& size, + bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host copy if memory has direct access + if (setup_.disableReadBufferRect_ || + (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { + result = HostBlitManager::readBufferRect(srcMemory, dstHost, bufRect, hostRect, size, entire); + synchronize(); + return result; + } else { + size_t pinSize = hostRect.start_ + hostRect.end_; + size_t partial; + amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial); + + if (amdMemory == NULL) { + // Force SW copy + result = HostBlitManager::readBufferRect(srcMemory, dstHost, bufRect, hostRect, size, entire); + synchronize(); + return result; } - uint blitType = BlitCopyImage; - size_t dim = 0; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - size_t globalWorkSize[3]; - size_t localWorkSize[3]; + // Readjust host mem offset + amd::BufferRect rect; + rect.rowPitch_ = hostRect.rowPitch_; + rect.slicePitch_ = hostRect.slicePitch_; + rect.start_ = hostRect.start_ + partial; + rect.end_ = hostRect.end_; - // Program the kernels workload depending on the blit dimensions - 
dim = 3; - // Find the current blit type - if ((gpuMem(srcMemory).cal()->dimSize_ == 1) || - (gpuMem(dstMemory).cal()->dimSize_ == 1)) { - globalWorkSize[0] = amd::alignUp(size[0], 256); - globalWorkSize[1] = amd::alignUp(size[1], 1); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = 256; - localWorkSize[1] = localWorkSize[2] = 1; + // Get device memory for this virtual device + Memory* dstMemory = dev().getGpuMemory(amdMemory); + + // Copy image to buffer + result = copyBufferRect(srcMemory, *dstMemory, bufRect, rect, size, entire); + + // Add pinned memory for a later release + gpu().addPinnedMem(amdMemory); + } + + synchronize(); + + return result; +} + +bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory, + const amd::Coord3D& origin, const amd::Coord3D& size, + bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host copy if memory has direct access or it's persistent + if (setup_.disableWriteBuffer_ || gpuMem(dstMemory).isHostMemDirectAccess() || + (gpuMem(dstMemory).memoryType() == Resource::Persistent)) { + result = HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire); + synchronize(); + return result; + } else { + size_t pinSize = size[0]; + + // Check if a pinned transfer can be executed with a single pin + if ((pinSize <= dev().settings().pinnedXferSize_) && (pinSize > MinSizeForPinnedTransfer)) { + size_t partial; + amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial); + + if (amdMemory == NULL) { + // Force SW copy + result = HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire); + synchronize(); + return result; + } + + // Readjust destination offset + const amd::Coord3D srcOrigin(partial); + + // Get device memory for this virtual device + Memory* srcMemory = dev().getGpuMemory(amdMemory); + + // Copy buffer rect + result = copyBuffer(*srcMemory, dstMemory, srcOrigin, origin, size, entire); + + // Add pinned memory 
for a later release + gpu().addPinnedMem(amdMemory); + } else { + result = DmaBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire); } - else if ((gpuMem(srcMemory).cal()->dimSize_ == 2) || - (gpuMem(dstMemory).cal()->dimSize_ == 2)) { - globalWorkSize[0] = amd::alignUp(size[0], 16); - globalWorkSize[1] = amd::alignUp(size[1], 16); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = localWorkSize[1] = 16; - localWorkSize[2] = 1; - } - else { - globalWorkSize[0] = amd::alignUp(size[0], 8); - globalWorkSize[1] = amd::alignUp(size[1], 8); - globalWorkSize[2] = amd::alignUp(size[2], 4); - localWorkSize[0] = localWorkSize[1] = 8; - localWorkSize[2] = 4; + } + + synchronize(); + + + return result; +} + +bool KernelBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMemory, + const amd::BufferRect& hostRect, + const amd::BufferRect& bufRect, const amd::Coord3D& size, + bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host copy if memory has direct access or it's persistent + if (setup_.disableWriteBufferRect_ || gpuMem(dstMemory).isHostMemDirectAccess() || + gpuMem(dstMemory).isPersistentDirectMap()) { + result = HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire); + synchronize(); + return result; + } else { + size_t pinSize = hostRect.start_ + hostRect.end_; + size_t partial; + amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial); + + if (amdMemory == NULL) { + // Force SW copy + result = + HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire); + synchronize(); + return result; } - // The current OpenCL spec allows "copy images from a 1D image - // array object to a 1D image array object" only. 
- if ((gpuMem(srcMemory).cal()->dimension_ == GSL_MOA_TEXTURE_1D_ARRAY) || - (gpuMem(dstMemory).cal()->dimension_ == GSL_MOA_TEXTURE_1D_ARRAY)) { - blitType = BlitCopyImage1DA; + // Readjust destination offset + const amd::Coord3D srcOrigin(partial); + + // Get device memory for this virtual device + Memory* srcMemory = dev().getGpuMemory(amdMemory); + + // Readjust host mem offset + amd::BufferRect rect; + rect.rowPitch_ = hostRect.rowPitch_; + rect.slicePitch_ = hostRect.slicePitch_; + rect.start_ = hostRect.start_ + partial; + rect.end_ = hostRect.end_; + + // Copy buffer rect + result = copyBufferRect(*srcMemory, dstMemory, rect, bufRect, size, entire); + + // Add pinned memory for a later release + gpu().addPinnedMem(amdMemory); + } + + synchronize(); + + return result; +} + +bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern, size_t patternSize, + const amd::Coord3D& origin, const amd::Coord3D& size, + bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host fill if memory has direct access + if (setup_.disableFillBuffer_ || gpuMem(memory).isHostMemDirectAccess()) { + result = HostBlitManager::fillBuffer(memory, pattern, patternSize, origin, size, entire); + synchronize(); + return result; + } else { + uint fillType = FillBuffer; + size_t globalWorkOffset[3] = {0, 0, 0}; + cl_ulong fillSize = size[0] / patternSize; + size_t globalWorkSize = amd::alignUp(fillSize, 256); + size_t localWorkSize = 256; + bool dwordAligned = ((patternSize % sizeof(uint32_t)) == 0) ? 
true : false; + + // Program kernels arguments for the fill operation + Memory* mem = &gpuMem(memory); + if (dwordAligned) { + setArgument(kernels_[fillType], 0, sizeof(cl_mem), NULL); + setArgument(kernels_[fillType], 1, sizeof(cl_mem), &mem); + } else { + setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem); + setArgument(kernels_[fillType], 1, sizeof(cl_mem), NULL); } - - // Program kernels arguments for the blit operation - Memory* mem = srcView; - setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); - mem = dstView; - setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); - - // Program source origin - cl_int srcOrg[4] = { (cl_int)srcOrigin[0], - (cl_int)srcOrigin[1], - (cl_int)srcOrigin[2], 0 }; - setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg); - - // Program destinaiton origin - cl_int dstOrg[4] = { (cl_int)dstOrigin[0], - (cl_int)dstOrigin[1], - (cl_int)dstOrigin[2], 0 }; - setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg); - - cl_int copySize[4] = { (cl_int)size[0], - (cl_int)size[1], - (cl_int)size[2], 0 }; - setArgument(kernels_[blitType], 4, sizeof(copySize), copySize); + Memory* gpuCB = dev().getGpuMemory(constantBuffer_); + if (gpuCB == NULL) { + return false; + } + void* constBuf = gpuCB->map(&gpu(), Resource::WriteOnly); + memcpy(constBuf, pattern, patternSize); + gpuCB->unmap(&gpu()); + setArgument(kernels_[fillType], 2, sizeof(cl_mem), &gpuCB); + cl_ulong offset = origin[0]; + if (dwordAligned) { + patternSize /= sizeof(uint32_t); + offset /= sizeof(uint32_t); + } + setArgument(kernels_[fillType], 3, sizeof(cl_uint), &patternSize); + setArgument(kernels_[fillType], 4, sizeof(offset), &offset); + setArgument(kernels_[fillType], 5, sizeof(fillSize), &fillSize); // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(dim, - globalWorkOffset, globalWorkSize, localWorkSize); - - // Execute the blit - address parameters = kernels_[blitType]->parameters().values(); - result = 
gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters); - if (releaseView) { - delete srcView; - delete dstView; - } - - synchronize(); - - return result; -} - -void -FindPinSize( - size_t& pinSize, const amd::Coord3D& size, - size_t& rowPitch, size_t& slicePitch, const Memory& mem) -{ - pinSize = size[0] * mem.elementSize(); - if ((rowPitch == 0) || (rowPitch == pinSize)) { - rowPitch = 0; - } - else { - pinSize = rowPitch; - } - - // Calculate the pin size, which should be equal to the copy size - for (uint i = 1; i < mem.cal()->dimSize_; ++i) { - pinSize *= size[i]; - if (i == 1) { - if ((slicePitch == 0) || (slicePitch == pinSize)) { - slicePitch = 0; - } - else { - if (mem.cal()->dimension_ != GSL_MOA_TEXTURE_1D_ARRAY) { - pinSize = slicePitch; - } - else { - pinSize = slicePitch * size[i]; - } - } - } - } -} - -bool -KernelBlitManager::readImage( - device::Memory& srcMemory, - void* dstHost, - const amd::Coord3D& origin, - const amd::Coord3D& size, - size_t rowPitch, - size_t slicePitch, - bool entire) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - - // Use host copy if memory has direct access or it's persistent - if (setup_.disableReadImage_ || - (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { - result = HostBlitManager::readImage(srcMemory, dstHost, - origin, size, rowPitch, slicePitch, entire); - synchronize(); - return result; - } - else { - size_t pinSize; - FindPinSize(pinSize, size, rowPitch, slicePitch, gpuMem(srcMemory)); - - size_t partial; - amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial); - - if (amdMemory == NULL) { - // Force SW copy - result = HostBlitManager::readImage(srcMemory, dstHost, - origin, size, rowPitch, slicePitch, entire); - synchronize(); - return result; - } - - // Readjust destination offset - const amd::Coord3D dstOrigin(partial); - - // Get device memory for this virtual device - Memory* dstMemory = dev().getGpuMemory(amdMemory); - - // 
Copy image to buffer - result = copyImageToBuffer(srcMemory, *dstMemory, - origin, dstOrigin, size, entire, rowPitch, slicePitch); - - // Add pinned memory for a later release - gpu().addPinnedMem(amdMemory); - } - - synchronize(); - - return result; -} - -bool -KernelBlitManager::writeImage( - const void* srcHost, - device::Memory& dstMemory, - const amd::Coord3D& origin, - const amd::Coord3D& size, - size_t rowPitch, - size_t slicePitch, - bool entire) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - - // Use host copy if memory has direct access or it's persistent - if (setup_.disableWriteImage_|| - gpuMem(dstMemory).isHostMemDirectAccess() || - gpuMem(dstMemory).isPersistentDirectMap()) { - result = HostBlitManager::writeImage( - srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); - synchronize(); - return result; - } - else { - size_t pinSize; - FindPinSize(pinSize, size, rowPitch, slicePitch, gpuMem(dstMemory)); - - size_t partial; - amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial); - - if (amdMemory == NULL) { - // Force SW copy - result = HostBlitManager::writeImage( - srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); - synchronize(); - return result; - } - - // Readjust destination offset - const amd::Coord3D srcOrigin(partial); - - // Get device memory for this virtual device - Memory* srcMemory = dev().getGpuMemory(amdMemory); - - // Copy image to buffer - result = copyBufferToImage(*srcMemory, dstMemory, - srcOrigin, origin, size, entire, rowPitch, slicePitch); - - // Add pinned memory for a later release - gpu().addPinnedMem(amdMemory); - } - - synchronize(); - - return result; -} - -bool -KernelBlitManager::copyBufferRect( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::BufferRect& srcRectIn, - const amd::BufferRect& dstRectIn, - const amd::Coord3D& sizeIn, - bool entire) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - bool rejected = false; - 
- // Fall into the CAL path for rejected transfers - if (setup_.disableCopyBufferRect_ || - gpuMem(srcMemory).isHostMemDirectAccess() || gpuMem(dstMemory).isHostMemDirectAccess()) { - result = DmaBlitManager::copyBufferRect(srcMemory, dstMemory, - srcRectIn, dstRectIn, sizeIn, entire); - - if (result) { - synchronize(); - return result; - } - } - - uint blitType = BlitCopyBufferRect; - size_t dim = 3; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - size_t globalWorkSize[3]; - size_t localWorkSize[3]; - - const static uint CopyRectAlignment[3] = { 16, 4, 1 }; - - bool aligned; - uint i; - for (i = 0; i < sizeof(CopyRectAlignment) / sizeof(uint); i++) { - // Check source alignments - aligned = ((srcRectIn.rowPitch_ % CopyRectAlignment[i]) == 0); - aligned &= ((srcRectIn.slicePitch_ % CopyRectAlignment[i]) == 0); - aligned &= ((srcRectIn.start_ % CopyRectAlignment[i]) == 0); - - // Check destination alignments - aligned &= ((dstRectIn.rowPitch_ % CopyRectAlignment[i]) == 0); - aligned &= ((dstRectIn.slicePitch_ % CopyRectAlignment[i]) == 0); - aligned &= ((dstRectIn.start_ % CopyRectAlignment[i]) == 0); - - // Check copy size alignment in the first dimension - aligned &= ((sizeIn[0] % CopyRectAlignment[i]) == 0); - - if (aligned) { - if (CopyRectAlignment[i] != 1) { - blitType = BlitCopyBufferRectAligned; - } - break; - } - } - - amd::BufferRect srcRect; - amd::BufferRect dstRect; - amd::Coord3D size(sizeIn[0], sizeIn[1], sizeIn[2]); - - srcRect.rowPitch_ = srcRectIn.rowPitch_ / CopyRectAlignment[i]; - srcRect.slicePitch_ = srcRectIn.slicePitch_ / CopyRectAlignment[i]; - srcRect.start_ = srcRectIn.start_ / CopyRectAlignment[i]; - srcRect.end_ = srcRectIn.end_ / CopyRectAlignment[i]; - - dstRect.rowPitch_ = dstRectIn.rowPitch_ / CopyRectAlignment[i]; - dstRect.slicePitch_ = dstRectIn.slicePitch_ / CopyRectAlignment[i]; - dstRect.start_ = dstRectIn.start_ / CopyRectAlignment[i]; - dstRect.end_ = dstRectIn.end_ / CopyRectAlignment[i]; - - size.c[0] /= CopyRectAlignment[i]; 
- - // Program the kernel's workload depending on the transfer dimensions - if ((size[1] == 1) && (size[2] == 1)) { - globalWorkSize[0] = amd::alignUp(size[0], 256); - globalWorkSize[1] = 1; - globalWorkSize[2] = 1; - localWorkSize[0] = 256; - localWorkSize[1] = 1; - localWorkSize[2] = 1; - } - else if (size[2] == 1) { - globalWorkSize[0] = amd::alignUp(size[0], 16); - globalWorkSize[1] = amd::alignUp(size[1], 16); - globalWorkSize[2] = 1; - localWorkSize[0] = localWorkSize[1] = 16; - localWorkSize[2] = 1; - } - else { - globalWorkSize[0] = amd::alignUp(size[0], 8); - globalWorkSize[1] = amd::alignUp(size[1], 8); - globalWorkSize[2] = amd::alignUp(size[2], 4); - localWorkSize[0] = localWorkSize[1] = 8; - localWorkSize[2] = 4; - } - - - // Program kernels arguments for the blit operation - Memory* mem = &gpuMem(srcMemory); - setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); - mem = &gpuMem(dstMemory); - setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); - cl_ulong src[4] = { srcRect.rowPitch_, - srcRect.slicePitch_, - srcRect.start_, 0 }; - setArgument(kernels_[blitType], 2, sizeof(src), src); - cl_ulong dst[4] = { dstRect.rowPitch_, - dstRect.slicePitch_, - dstRect.start_, 0 }; - setArgument(kernels_[blitType], 3, sizeof(dst), dst); - cl_ulong copySize[4] = { size[0], size[1], size[2], CopyRectAlignment[i] }; - setArgument(kernels_[blitType], 4, sizeof(copySize), copySize); - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(dim, - globalWorkOffset, globalWorkSize, localWorkSize); - - // Execute the blit - address parameters = kernels_[blitType]->parameters().values(); - result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters); - - synchronize(); - - return result; -} - -bool -KernelBlitManager::readBuffer( - device::Memory& srcMemory, - void* dstHost, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - 
// Use host copy if memory has direct access - if (setup_.disableReadBuffer_ || - (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { - result = HostBlitManager::readBuffer( - srcMemory, dstHost, origin, size, entire); - synchronize(); - return result; - } - else { - size_t pinSize = size[0]; - // Check if a pinned transfer can be executed with a single pin - if ((pinSize <= dev().settings().pinnedXferSize_) && - (pinSize > MinSizeForPinnedTransfer)) { - size_t partial; - amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial); - - if (amdMemory == NULL) { - // Force SW copy - result = HostBlitManager::readBuffer( - srcMemory, dstHost, origin, size, entire); - synchronize(); - return result; - } - - // Readjust host mem offset - amd::Coord3D dstOrigin(partial); - - // Get device memory for this virtual device - Memory* dstMemory = dev().getGpuMemory(amdMemory); - - // Copy image to buffer - result = copyBuffer(srcMemory, *dstMemory, - origin, dstOrigin, size, entire); - - // Add pinned memory for a later release - gpu().addPinnedMem(amdMemory); - } - else { - result = DmaBlitManager::readBuffer( - srcMemory, dstHost, origin, size, entire); - } - } - - synchronize(); - - return result; -} - -bool -KernelBlitManager::readBufferRect( - device::Memory& srcMemory, - void* dstHost, - const amd::BufferRect& bufRect, - const amd::BufferRect& hostRect, - const amd::Coord3D& size, - bool entire) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - - // Use host copy if memory has direct access - if (setup_.disableReadBufferRect_ || - (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { - result = HostBlitManager::readBufferRect( - srcMemory, dstHost, bufRect, hostRect, size, entire); - synchronize(); - return result; - } - else { - size_t pinSize = hostRect.start_ + hostRect.end_; - size_t partial; - amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial); - - if (amdMemory == 
NULL) { - // Force SW copy - result = HostBlitManager::readBufferRect( - srcMemory, dstHost, bufRect, hostRect, size, entire); - synchronize(); - return result; - } - - // Readjust host mem offset - amd::BufferRect rect; - rect.rowPitch_ = hostRect.rowPitch_; - rect.slicePitch_ = hostRect.slicePitch_; - rect.start_ = hostRect.start_ + partial; - rect.end_ = hostRect.end_; - - // Get device memory for this virtual device - Memory* dstMemory = dev().getGpuMemory(amdMemory); - - // Copy image to buffer - result = copyBufferRect(srcMemory, *dstMemory, - bufRect, rect, size, entire); - - // Add pinned memory for a later release - gpu().addPinnedMem(amdMemory); - } - - synchronize(); - - return result; -} - -bool -KernelBlitManager::writeBuffer( - const void* srcHost, - device::Memory& dstMemory, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - - // Use host copy if memory has direct access or it's persistent - if (setup_.disableWriteBuffer_ || - gpuMem(dstMemory).isHostMemDirectAccess() || - (gpuMem(dstMemory).memoryType() == Resource::Persistent)) { - result = HostBlitManager::writeBuffer( - srcHost, dstMemory, origin, size, entire); - synchronize(); - return result; - } - else { - size_t pinSize = size[0]; - - // Check if a pinned transfer can be executed with a single pin - if ((pinSize <= dev().settings().pinnedXferSize_) && - (pinSize > MinSizeForPinnedTransfer)) { - size_t partial; - amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial); - - if (amdMemory == NULL) { - // Force SW copy - result = HostBlitManager::writeBuffer( - srcHost, dstMemory, origin, size, entire); - synchronize(); - return result; - } - - // Readjust destination offset - const amd::Coord3D srcOrigin(partial); - - // Get device memory for this virtual device - Memory* srcMemory = dev().getGpuMemory(amdMemory); - - // Copy buffer rect - result = copyBuffer(*srcMemory, dstMemory, - srcOrigin, 
origin, size, entire); - - // Add pinned memory for a later release - gpu().addPinnedMem(amdMemory); - } - else { - result = DmaBlitManager::writeBuffer( - srcHost, dstMemory, origin, size, entire); - } - } - - synchronize(); - - - return result; -} - -bool -KernelBlitManager::writeBufferRect( - const void* srcHost, - device::Memory& dstMemory, - const amd::BufferRect& hostRect, - const amd::BufferRect& bufRect, - const amd::Coord3D& size, - bool entire) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - - // Use host copy if memory has direct access or it's persistent - if (setup_.disableWriteBufferRect_ || - gpuMem(dstMemory).isHostMemDirectAccess() || - gpuMem(dstMemory).isPersistentDirectMap()) { - result = HostBlitManager::writeBufferRect( - srcHost, dstMemory, hostRect, bufRect, size, entire); - synchronize(); - return result; - } - else { - size_t pinSize = hostRect.start_ + hostRect.end_; - size_t partial; - amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial); - - if (amdMemory == NULL) { - // Force SW copy - result = HostBlitManager::writeBufferRect( - srcHost, dstMemory, hostRect, bufRect, size, entire); - synchronize(); - return result; - } - - // Readjust destination offset - const amd::Coord3D srcOrigin(partial); - - // Get device memory for this virtual device - Memory* srcMemory = dev().getGpuMemory(amdMemory); - - // Readjust host mem offset - amd::BufferRect rect; - rect.rowPitch_ = hostRect.rowPitch_; - rect.slicePitch_ = hostRect.slicePitch_; - rect.start_ = hostRect.start_ + partial; - rect.end_ = hostRect.end_; - - // Copy buffer rect - result = copyBufferRect(*srcMemory, dstMemory, - rect, bufRect, size, entire); - - // Add pinned memory for a later release - gpu().addPinnedMem(amdMemory); - } - - synchronize(); - - return result; -} - -bool -KernelBlitManager::fillBuffer( - device::Memory& memory, - const void* pattern, - size_t patternSize, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool 
entire - ) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - - // Use host fill if memory has direct access - if (setup_.disableFillBuffer_ || - gpuMem(memory).isHostMemDirectAccess()) { - result = HostBlitManager::fillBuffer( - memory, pattern, patternSize, origin, size, entire); - synchronize(); - return result; - } - else { - uint fillType = FillBuffer; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - cl_ulong fillSize = size[0] / patternSize; - size_t globalWorkSize = amd::alignUp(fillSize, 256); - size_t localWorkSize = 256; - bool dwordAligned = - ((patternSize % sizeof(uint32_t)) == 0) ? true : false; - - // Program kernels arguments for the fill operation - Memory* mem = &gpuMem(memory); - if (dwordAligned) { - setArgument(kernels_[fillType], 0, sizeof(cl_mem), NULL); - setArgument(kernels_[fillType], 1, sizeof(cl_mem), &mem); - } - else { - setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem); - setArgument(kernels_[fillType], 1, sizeof(cl_mem), NULL); - } - Memory* gpuCB = dev().getGpuMemory(constantBuffer_); - if (gpuCB == NULL) { - return false; - } - void* constBuf = gpuCB->map(&gpu(), Resource::WriteOnly); - memcpy(constBuf, pattern, patternSize); - gpuCB->unmap(&gpu()); - setArgument(kernels_[fillType], 2, sizeof(cl_mem), &gpuCB); - cl_ulong offset = origin[0]; - if (dwordAligned) { - patternSize /= sizeof(uint32_t); - offset /= sizeof(uint32_t); - } - setArgument(kernels_[fillType], 3, sizeof(cl_uint), &patternSize); - setArgument(kernels_[fillType], 4, sizeof(offset), &offset); - setArgument(kernels_[fillType], 5, sizeof(fillSize), &fillSize); - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(1, - globalWorkOffset, &globalWorkSize, &localWorkSize); - - // Execute the blit - address parameters = kernels_[fillType]->parameters().values(); - result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters); - } - - synchronize(); - - return result; -} - -bool 
-KernelBlitManager::copyBuffer( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& sizeIn, - bool entire) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - - if (!gpuMem(srcMemory).isHostMemDirectAccess() && - !gpuMem(dstMemory).isHostMemDirectAccess()) { - uint blitType = BlitCopyBuffer; - size_t dim = 1; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - size_t globalWorkSize = 0; - size_t localWorkSize = 0; - - const static uint CopyBuffAlignment[3] = { 16, 4, 1 }; - amd::Coord3D size(sizeIn[0], sizeIn[1], sizeIn[2]); - - bool aligned; - uint i; - for (i = 0; i < sizeof(CopyBuffAlignment) / sizeof(uint); i++) { - // Check source alignments - aligned = ((srcOrigin[0] % CopyBuffAlignment[i]) == 0); - // Check destination alignments - aligned &= ((dstOrigin[0] % CopyBuffAlignment[i]) == 0); - // Check copy size alignment in the first dimension - aligned &= ((sizeIn[0] % CopyBuffAlignment[i]) == 0); - - if (aligned) { - if (CopyBuffAlignment[i] != 1) { - blitType = BlitCopyBufferAligned; - } - break; - } - } - - cl_uint remain; - if (blitType == BlitCopyBufferAligned) { - size.c[0] /= CopyBuffAlignment[i]; - } - else { - if (dev().settings().ciPlus_) { - remain = size[0] % 4; - size.c[0] /= 4; - size.c[0] += 1; - } - else { - // Check if offsets are aligned - aligned = ((srcOrigin[0] % sizeof(uint32_t)) == 0); - aligned &= ((dstOrigin[0] % sizeof(uint32_t)) == 0); - if (aligned) { - remain = size[0] % 4; - size.c[0] /= 4; - size.c[0] += 1; - } - else { - remain = 8; - } - } - } - - // Program the dispatch dimensions - localWorkSize = 256; - globalWorkSize = amd::alignUp(size[0] , 256); - - // Program kernels arguments for the blit operation - Memory* mem = &gpuMem(srcMemory); - setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); - mem = &gpuMem(dstMemory); - setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); - // Program source origin - cl_ulong 
srcOffset = srcOrigin[0] / CopyBuffAlignment[i];; - setArgument(kernels_[blitType], 2, sizeof(srcOffset), &srcOffset); - - // Program destinaiton origin - cl_ulong dstOffset = dstOrigin[0] / CopyBuffAlignment[i];; - setArgument(kernels_[blitType], 3, sizeof(dstOffset), &dstOffset); - - cl_ulong copySize = size[0]; - setArgument(kernels_[blitType], 4, sizeof(copySize), ©Size); - - if (blitType == BlitCopyBufferAligned) { - cl_int alignment = CopyBuffAlignment[i]; - setArgument(kernels_[blitType], 5, sizeof(alignment), &alignment); - } - else { - setArgument(kernels_[blitType], 5, sizeof(remain), &remain); - } - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(1, - globalWorkOffset, &globalWorkSize, &localWorkSize); - - // Execute the blit - address parameters = kernels_[blitType]->parameters().values(); - result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters); - } - else { - result = DmaBlitManager::copyBuffer( - srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire); - } - - synchronize(); - - return result; -} - -bool -KernelBlitManager::fillImage( - device::Memory& memory, - const void* pattern, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire - ) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - - // Use host fill if memory has direct access - if (setup_.disableFillImage_ || - gpuMem(memory).isHostMemDirectAccess()) { - result = HostBlitManager::fillImage( - memory, pattern, origin, size, entire); - synchronize(); - return result; - } - - uint fillType; - size_t dim = 0; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - size_t globalWorkSize[3]; - size_t localWorkSize[3]; - Memory* memView = &gpuMem(memory); - amd::Image::Format newFormat(gpuMem(memory).owner()->asImage()->getImageFormat()); - - // Program the kernels workload depending on the fill dimensions - fillType = FillImage; - dim = 3; - - void *newpattern = const_cast(pattern); - cl_uint4 iFillColor; 
- - bool rejected = false; - bool releaseView = false; - // For depth, we need to create a view - if ((memView->cal()->format_ == CM_SURF_FMT_DEPTH32F) || - (memView->cal()->format_ == CM_SURF_FMT_RGBA8_SRGB) || - (memView->cal()->format_ == CM_SURF_FMT_DEPTH16)) { - - // Find unsupported data type - for (uint i = 0; i < RejectedFormatDataTotal; ++i) { - if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { - newFormat.image_channel_data_type = RejectedData[i].clNewType_; - rejected = true; - break; - } - } - - // Below may not be correct. We need to find why unsigned int view doesn't work for DEPTH16. - if (gpuMem(memory).cal()->format_ == CM_SURF_FMT_DEPTH16) { - newFormat.image_channel_data_type = CL_UNORM_INT16; - } - - if (gpuMem(memory).cal()->format_ == CM_SURF_FMT_RGBA8_SRGB) { - // Converting a linear RGB floating-point color value to a 8-bit unsigned integer sRGB value because hw is not support write_imagef for sRGB. - float *fColor = static_cast(newpattern); - iFillColor.s[0] = sRGBmap(fColor[0]); - iFillColor.s[1] = sRGBmap(fColor[1]); - iFillColor.s[2] = sRGBmap(fColor[2]); - iFillColor.s[3] = (cl_uint)(fColor[3]*255.0f); - newpattern = static_cast(&iFillColor); - for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { - if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { - newFormat.image_channel_order = RejectedOrder[i].clNewType_; - rejected = true; - break; - } - } - } - } - // If the image format was rejected, then attempt to create a view - if (rejected) { - memView = createView(gpuMem(memory), dev().getCalFormat(newFormat)); - if (memView != NULL) { - rejected = false; - releaseView = true; - } - } - - // Perform workload split to allow multiple operations in a single thread - globalWorkSize[0] = (size[0] + TransferSplitSize - 1) / TransferSplitSize; - // Find the current blit type - if (memView->cal()->dimSize_ == 1) { - globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 256); - globalWorkSize[1] = 
amd::alignUp(size[1], 1); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = 256; - localWorkSize[1] = localWorkSize[2] = 1; - } - else if (memView->cal()->dimSize_ == 2) { - globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 16); - globalWorkSize[1] = amd::alignUp(size[1], 16); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = localWorkSize[1] = 16; - localWorkSize[2] = 1; - } - else { - globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 8); - globalWorkSize[1] = amd::alignUp(size[1], 8); - globalWorkSize[2] = amd::alignUp(size[2], 4); - localWorkSize[0] = localWorkSize[1] = 8; - localWorkSize[2] = 4; - } - - // Program kernels arguments for the blit operation - Memory* mem = memView; - setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem); - setArgument(kernels_[fillType], 1, sizeof(cl_float4), newpattern); - setArgument(kernels_[fillType], 2, sizeof(cl_int4), newpattern); - setArgument(kernels_[fillType], 3, sizeof(cl_uint4), newpattern); - - cl_int fillOrigin[4] = { (cl_int)origin[0], - (cl_int)origin[1], - (cl_int)origin[2], 0 }; - cl_int fillSize[4] = { (cl_int)size[0], - (cl_int)size[1], - (cl_int)size[2], 0 }; - setArgument(kernels_[fillType], 4, sizeof(fillOrigin), fillOrigin); - setArgument(kernels_[fillType], 5, sizeof(fillSize), fillSize); - - // Find the type of image - uint32_t type = 0; - switch (newFormat.image_channel_data_type) { - case CL_SNORM_INT8: - case CL_SNORM_INT16: - case CL_UNORM_INT8: - case CL_UNORM_INT16: - case CL_UNORM_SHORT_565: - case CL_UNORM_SHORT_555: - case CL_UNORM_INT_101010: - case CL_HALF_FLOAT: - case CL_FLOAT: - type = 0; - break; - case CL_SIGNED_INT8: - case CL_SIGNED_INT16: - case CL_SIGNED_INT32: - type = 1; - break; - case CL_UNSIGNED_INT8: - case CL_UNSIGNED_INT16: - case CL_UNSIGNED_INT32: - type = 2; - break; - } - setArgument(kernels_[fillType], 6, sizeof(type), &type); - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(dim, - 
globalWorkOffset, globalWorkSize, localWorkSize); + amd::NDRangeContainer ndrange(1, globalWorkOffset, &globalWorkSize, &localWorkSize); // Execute the blit address parameters = kernels_[fillType]->parameters().values(); result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters); - if (releaseView) { - delete memView; - } + } - synchronize(); + synchronize(); - return result; + return result; } -bool -KernelBlitManager::runScheduler( - device::Memory& vqueue, - device::Memory& params, - uint paramIdx, - uint threads - ) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; +bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, + const amd::Coord3D& sizeIn, bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; - size_t dim = 1; - size_t globalWorkOffset[1] = { 0 }; - size_t globalWorkSize[1] = { threads }; - size_t localWorkSize[1] = { 1 }; + if (!gpuMem(srcMemory).isHostMemDirectAccess() && !gpuMem(dstMemory).isHostMemDirectAccess()) { + uint blitType = BlitCopyBuffer; + size_t dim = 1; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize = 0; + size_t localWorkSize = 0; - // Program kernels arguments - Memory* q = &gpuMem(vqueue); - Memory* p = &gpuMem(params); - setArgument(kernels_[Scheduler], 0, sizeof(cl_mem), &q); - setArgument(kernels_[Scheduler], 1, sizeof(cl_mem), &p); - setArgument(kernels_[Scheduler], 2, sizeof(uint), ¶mIdx); + const static uint CopyBuffAlignment[3] = {16, 4, 1}; + amd::Coord3D size(sizeIn[0], sizeIn[1], sizeIn[2]); + + bool aligned; + uint i; + for (i = 0; i < sizeof(CopyBuffAlignment) / sizeof(uint); i++) { + // Check source alignments + aligned = ((srcOrigin[0] % CopyBuffAlignment[i]) == 0); + // Check destination alignments + aligned &= ((dstOrigin[0] % CopyBuffAlignment[i]) == 0); + // Check copy size alignment in the first dimension + aligned &= ((sizeIn[0] % 
CopyBuffAlignment[i]) == 0); + + if (aligned) { + if (CopyBuffAlignment[i] != 1) { + blitType = BlitCopyBufferAligned; + } + break; + } + } + + cl_uint remain; + if (blitType == BlitCopyBufferAligned) { + size.c[0] /= CopyBuffAlignment[i]; + } else { + if (dev().settings().ciPlus_) { + remain = size[0] % 4; + size.c[0] /= 4; + size.c[0] += 1; + } else { + // Check if offsets are aligned + aligned = ((srcOrigin[0] % sizeof(uint32_t)) == 0); + aligned &= ((dstOrigin[0] % sizeof(uint32_t)) == 0); + if (aligned) { + remain = size[0] % 4; + size.c[0] /= 4; + size.c[0] += 1; + } else { + remain = 8; + } + } + } + + // Program the dispatch dimensions + localWorkSize = 256; + globalWorkSize = amd::alignUp(size[0], 256); + + // Program kernels arguments for the blit operation + Memory* mem = &gpuMem(srcMemory); + setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); + mem = &gpuMem(dstMemory); + setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); + // Program source origin + cl_ulong srcOffset = srcOrigin[0] / CopyBuffAlignment[i]; + ; + setArgument(kernels_[blitType], 2, sizeof(srcOffset), &srcOffset); + + // Program destinaiton origin + cl_ulong dstOffset = dstOrigin[0] / CopyBuffAlignment[i]; + ; + setArgument(kernels_[blitType], 3, sizeof(dstOffset), &dstOffset); + + cl_ulong copySize = size[0]; + setArgument(kernels_[blitType], 4, sizeof(copySize), ©Size); + + if (blitType == BlitCopyBufferAligned) { + cl_int alignment = CopyBuffAlignment[i]; + setArgument(kernels_[blitType], 5, sizeof(alignment), &alignment); + } else { + setArgument(kernels_[blitType], 5, sizeof(remain), &remain); + } // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(1, - globalWorkOffset, globalWorkSize, localWorkSize); + amd::NDRangeContainer ndrange(1, globalWorkOffset, &globalWorkSize, &localWorkSize); // Execute the blit - address parameters = kernels_[Scheduler]->parameters().values(); - result = gpu().submitKernelInternal(ndrange, 
*kernels_[Scheduler], parameters); + address parameters = kernels_[blitType]->parameters().values(); + result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters); + } else { + result = DmaBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire); + } + synchronize(); + + return result; +} + +bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern, + const amd::Coord3D& origin, const amd::Coord3D& size, + bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host fill if memory has direct access + if (setup_.disableFillImage_ || gpuMem(memory).isHostMemDirectAccess()) { + result = HostBlitManager::fillImage(memory, pattern, origin, size, entire); synchronize(); - return result; + } + + uint fillType; + size_t dim = 0; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3]; + size_t localWorkSize[3]; + Memory* memView = &gpuMem(memory); + amd::Image::Format newFormat(gpuMem(memory).owner()->asImage()->getImageFormat()); + + // Program the kernels workload depending on the fill dimensions + fillType = FillImage; + dim = 3; + + void* newpattern = const_cast(pattern); + cl_uint4 iFillColor; + + bool rejected = false; + bool releaseView = false; + // For depth, we need to create a view + if ((memView->cal()->format_ == CM_SURF_FMT_DEPTH32F) || + (memView->cal()->format_ == CM_SURF_FMT_RGBA8_SRGB) || + (memView->cal()->format_ == CM_SURF_FMT_DEPTH16)) { + // Find unsupported data type + for (uint i = 0; i < RejectedFormatDataTotal; ++i) { + if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { + newFormat.image_channel_data_type = RejectedData[i].clNewType_; + rejected = true; + break; + } + } + + // Below may not be correct. We need to find why unsigned int view doesn't work for DEPTH16. 
+ if (gpuMem(memory).cal()->format_ == CM_SURF_FMT_DEPTH16) { + newFormat.image_channel_data_type = CL_UNORM_INT16; + } + + if (gpuMem(memory).cal()->format_ == CM_SURF_FMT_RGBA8_SRGB) { + // Converting a linear RGB floating-point color value to a 8-bit unsigned integer sRGB value + // because hw is not support write_imagef for sRGB. + float* fColor = static_cast(newpattern); + iFillColor.s[0] = sRGBmap(fColor[0]); + iFillColor.s[1] = sRGBmap(fColor[1]); + iFillColor.s[2] = sRGBmap(fColor[2]); + iFillColor.s[3] = (cl_uint)(fColor[3] * 255.0f); + newpattern = static_cast(&iFillColor); + for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { + if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { + newFormat.image_channel_order = RejectedOrder[i].clNewType_; + rejected = true; + break; + } + } + } + } + // If the image format was rejected, then attempt to create a view + if (rejected) { + memView = createView(gpuMem(memory), dev().getCalFormat(newFormat)); + if (memView != NULL) { + rejected = false; + releaseView = true; + } + } + + // Perform workload split to allow multiple operations in a single thread + globalWorkSize[0] = (size[0] + TransferSplitSize - 1) / TransferSplitSize; + // Find the current blit type + if (memView->cal()->dimSize_ == 1) { + globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 256); + globalWorkSize[1] = amd::alignUp(size[1], 1); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = 256; + localWorkSize[1] = localWorkSize[2] = 1; + } else if (memView->cal()->dimSize_ == 2) { + globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 16); + globalWorkSize[1] = amd::alignUp(size[1], 16); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = localWorkSize[1] = 16; + localWorkSize[2] = 1; + } else { + globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 8); + globalWorkSize[1] = amd::alignUp(size[1], 8); + globalWorkSize[2] = amd::alignUp(size[2], 4); + localWorkSize[0] = localWorkSize[1] = 8; + 
localWorkSize[2] = 4; + } + + // Program kernels arguments for the blit operation + Memory* mem = memView; + setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem); + setArgument(kernels_[fillType], 1, sizeof(cl_float4), newpattern); + setArgument(kernels_[fillType], 2, sizeof(cl_int4), newpattern); + setArgument(kernels_[fillType], 3, sizeof(cl_uint4), newpattern); + + cl_int fillOrigin[4] = {(cl_int)origin[0], (cl_int)origin[1], (cl_int)origin[2], 0}; + cl_int fillSize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0}; + setArgument(kernels_[fillType], 4, sizeof(fillOrigin), fillOrigin); + setArgument(kernels_[fillType], 5, sizeof(fillSize), fillSize); + + // Find the type of image + uint32_t type = 0; + switch (newFormat.image_channel_data_type) { + case CL_SNORM_INT8: + case CL_SNORM_INT16: + case CL_UNORM_INT8: + case CL_UNORM_INT16: + case CL_UNORM_SHORT_565: + case CL_UNORM_SHORT_555: + case CL_UNORM_INT_101010: + case CL_HALF_FLOAT: + case CL_FLOAT: + type = 0; + break; + case CL_SIGNED_INT8: + case CL_SIGNED_INT16: + case CL_SIGNED_INT32: + type = 1; + break; + case CL_UNSIGNED_INT8: + case CL_UNSIGNED_INT16: + case CL_UNSIGNED_INT32: + type = 2; + break; + } + setArgument(kernels_[fillType], 6, sizeof(type), &type); + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize); + + // Execute the blit + address parameters = kernels_[fillType]->parameters().values(); + result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters); + if (releaseView) { + delete memView; + } + + synchronize(); + + return result; } -amd::Memory* -DmaBlitManager::pinHostMemory( - const void* hostMem, - size_t pinSize, - size_t& partial) const -{ - size_t pinAllocSize; - const static bool SysMem = true; - amd::Memory* amdMemory; +bool KernelBlitManager::runScheduler(device::Memory& vqueue, device::Memory& params, uint paramIdx, + uint threads) const { + amd::ScopedLock 
k(lockXferOps_); + bool result = false; - // Allign offset to 4K boundary (Vista/Win7 limitation) - char* tmpHost = const_cast( - amd::alignDown(reinterpret_cast(hostMem), - PinnedMemoryAlignment)); + size_t dim = 1; + size_t globalWorkOffset[1] = {0}; + size_t globalWorkSize[1] = {threads}; + size_t localWorkSize[1] = {1}; - // Find the partial size for unaligned copy - partial = reinterpret_cast(hostMem) - tmpHost; + // Program kernels arguments + Memory* q = &gpuMem(vqueue); + Memory* p = &gpuMem(params); + setArgument(kernels_[Scheduler], 0, sizeof(cl_mem), &q); + setArgument(kernels_[Scheduler], 1, sizeof(cl_mem), &p); + setArgument(kernels_[Scheduler], 2, sizeof(uint), ¶mIdx); - // Recalculate pin memory size - pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment); + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(1, globalWorkOffset, globalWorkSize, localWorkSize); - amdMemory = gpu().findPinnedMem(tmpHost, pinAllocSize); + // Execute the blit + address parameters = kernels_[Scheduler]->parameters().values(); + result = gpu().submitKernelInternal(ndrange, *kernels_[Scheduler], parameters); - if (NULL != amdMemory) { - return amdMemory; - } + synchronize(); - amdMemory = new(*context_) - amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, pinAllocSize); - if ((amdMemory != NULL) && !amdMemory->create(tmpHost, SysMem)) { - amdMemory->release(); - return NULL; - } + return result; +} - // Get device memory for this virtual device - // @note: This will force real memory pinning - amdMemory->setVirtualDevice(&gpu()); - Memory* srcMemory = dev().getGpuMemory(amdMemory); +amd::Memory* DmaBlitManager::pinHostMemory(const void* hostMem, size_t pinSize, + size_t& partial) const { + size_t pinAllocSize; + const static bool SysMem = true; + amd::Memory* amdMemory; - if (srcMemory == NULL) { - // Release all pinned memory and attempt pinning again - gpu().releasePinnedMem(); - srcMemory = dev().getGpuMemory(amdMemory); - if 
(srcMemory == NULL) { - // Release memory - amdMemory->release(); - amdMemory = NULL; - } - } + // Allign offset to 4K boundary (Vista/Win7 limitation) + char* tmpHost = const_cast( + amd::alignDown(reinterpret_cast(hostMem), PinnedMemoryAlignment)); + // Find the partial size for unaligned copy + partial = reinterpret_cast(hostMem) - tmpHost; + + // Recalculate pin memory size + pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment); + + amdMemory = gpu().findPinnedMem(tmpHost, pinAllocSize); + + if (NULL != amdMemory) { return amdMemory; -} + } -Memory* -KernelBlitManager::createView( - const Memory& parent, - const CalFormat& format -) const -{ - assert(!parent.cal()->buffer_ && "View supports images only"); - gpu::Memory* gpuImage = NULL; + amdMemory = new (*context_) amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, pinAllocSize); + if ((amdMemory != NULL) && !amdMemory->create(tmpHost, SysMem)) { + amdMemory->release(); + return NULL; + } - gpuImage = new gpu::Image(dev(), parent.size(), - parent.cal()->width_, - parent.cal()->height_, - parent.cal()->depth_, - format.type_, - format.channelOrder_, - parent.cal()->imageType_, - 1); + // Get device memory for this virtual device + // @note: This will force real memory pinning + amdMemory->setVirtualDevice(&gpu()); + Memory* srcMemory = dev().getGpuMemory(amdMemory); - // Create resource - if (NULL != gpuImage) { - bool result = false; - Resource::ImageViewParams params; - const Memory& gpuMem = static_cast(parent); - - params.owner_ = parent.owner(); - params.level_ = 0; - params.layer_ = 0; - params.resource_ = &gpuMem; - params.memory_ = &gpuMem; - params.gpu_ = &gpu(); - - // Create memory object - result = gpuImage->create(Resource::ImageView, ¶ms); - if (!result) { - delete gpuImage; - return NULL; - } + if (srcMemory == NULL) { + // Release all pinned memory and attempt pinning again + gpu().releasePinnedMem(); + srcMemory = dev().getGpuMemory(amdMemory); + if (srcMemory == NULL) { + // Release 
memory + amdMemory->release(); + amdMemory = NULL; } + } - return gpuImage; + return amdMemory; } -} // namespace gpu +Memory* KernelBlitManager::createView(const Memory& parent, const CalFormat& format) const { + assert(!parent.cal()->buffer_ && "View supports images only"); + gpu::Memory* gpuImage = NULL; + + gpuImage = new gpu::Image(dev(), parent.size(), parent.cal()->width_, parent.cal()->height_, + parent.cal()->depth_, format.type_, format.channelOrder_, + parent.cal()->imageType_, 1); + + // Create resource + if (NULL != gpuImage) { + bool result = false; + Resource::ImageViewParams params; + const Memory& gpuMem = static_cast(parent); + + params.owner_ = parent.owner(); + params.level_ = 0; + params.layer_ = 0; + params.resource_ = &gpuMem; + params.memory_ = &gpuMem; + params.gpu_ = &gpu(); + + // Create memory object + result = gpuImage->create(Resource::ImageView, ¶ms); + if (!result) { + delete gpuImage; + return NULL; + } + } + + return gpuImage; +} + +} // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpublit.hpp b/rocclr/runtime/device/gpu/gpublit.hpp index 87dbab3c50..1288bb6584 100644 --- a/rocclr/runtime/device/gpu/gpublit.hpp +++ b/rocclr/runtime/device/gpu/gpublit.hpp @@ -24,429 +24,385 @@ class Memory; class VirtualGPU; //! DMA Blit Manager -class DmaBlitManager : public device::HostBlitManager -{ -public: - //! Constructor - DmaBlitManager( - VirtualGPU& gpu, //!< Virtual GPU to be used for blits - Setup setup = Setup() //!< Specifies HW accelerated blits - ); +class DmaBlitManager : public device::HostBlitManager { + public: + //! Constructor + DmaBlitManager(VirtualGPU& gpu, //!< Virtual GPU to be used for blits + Setup setup = Setup() //!< Specifies HW accelerated blits + ); - //! Destructor - virtual ~DmaBlitManager() {} + //! Destructor + virtual ~DmaBlitManager() {} - //! Creates DmaBlitManager object - virtual bool create(amd::Device& device) { return true; } + //! 
Creates DmaBlitManager object + virtual bool create(amd::Device& device) { return true; } - //! Copies a buffer object to system memory - virtual bool readBuffer( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destination host memory - const amd::Coord3D& origin, //!< Source origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies a buffer object to system memory + virtual bool readBuffer(device::Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destination host memory + const amd::Coord3D& origin, //!< Source origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies a buffer object to system memory - virtual bool readBufferRect( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destinaiton host memory - const amd::BufferRect& bufRect, //!< Source rectangle - const amd::BufferRect& hostRect, //!< Destination rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies a buffer object to system memory + virtual bool readBufferRect(device::Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destinaiton host memory + const amd::BufferRect& bufRect, //!< Source rectangle + const amd::BufferRect& hostRect, //!< Destination rectangle + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! 
Copies an image object to system memory - virtual bool readImage( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destination host memory - const amd::Coord3D& origin, //!< Source origin - const amd::Coord3D& size, //!< Size of the copy region - size_t rowPitch, //!< Row pitch for host memory - size_t slicePitch, //!< Slice pitch for host memory - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies an image object to system memory + virtual bool readImage(device::Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destination host memory + const amd::Coord3D& origin, //!< Source origin + const amd::Coord3D& size, //!< Size of the copy region + size_t rowPitch, //!< Row pitch for host memory + size_t slicePitch, //!< Slice pitch for host memory + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies system memory to a buffer object - virtual bool writeBuffer( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies system memory to a buffer object + virtual bool writeBuffer(const void* srcHost, //!< Source host memory + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! 
Copies system memory to a buffer object - virtual bool writeBufferRect( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::BufferRect& hostRect, //!< Destination rectangle - const amd::BufferRect& bufRect, //!< Source rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies system memory to a buffer object + virtual bool writeBufferRect(const void* srcHost, //!< Source host memory + device::Memory& dstMemory, //!< Destination memory object + const amd::BufferRect& hostRect, //!< Destination rectangle + const amd::BufferRect& bufRect, //!< Source rectangle + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies system memory to an image object - virtual bool writeImage( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - size_t rowPitch, //!< Row pitch for host memory - size_t slicePitch, //!< Slice pitch for host memory - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies system memory to an image object + virtual bool writeImage(const void* srcHost, //!< Source host memory + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + size_t rowPitch, //!< Row pitch for host memory + size_t slicePitch, //!< Slice pitch for host memory + bool entire = false //!< Entire buffer will be updated + ) const; - //! 
Copies a buffer object to another buffer object - virtual bool copyBuffer( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies a buffer object to another buffer object + virtual bool copyBuffer(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies a buffer object to another buffer object - virtual bool copyBufferRect( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::BufferRect& srcRect, //!< Source rectangle - const amd::BufferRect& dstRect, //!< Destination rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies a buffer object to another buffer object + virtual bool copyBufferRect(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::BufferRect& srcRect, //!< Source rectangle + const amd::BufferRect& dstRect, //!< Destination rectangle + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! 
Copies an image object to a buffer object - virtual bool copyImageToBuffer( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const; + //! Copies an image object to a buffer object + virtual bool copyImageToBuffer(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const; - //! Copies a buffer object to an image object - virtual bool copyBufferToImage( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const; + //! 
Copies a buffer object to an image object + virtual bool copyBufferToImage(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const; - //! Copies an image object to another image object - virtual bool copyImage( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies an image object to another image object + virtual bool copyImage(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; -protected: - const static uint MaxPinnedBuffers = 4; + protected: + const static uint MaxPinnedBuffers = 4; - //! Synchronizes the blit operations if necessary - inline void synchronize() const; + //! Synchronizes the blit operations if necessary + inline void synchronize() const; - //! Returns the virtual GPU object - VirtualGPU& gpu() const { return static_cast(vDev_); } + //! Returns the virtual GPU object + VirtualGPU& gpu() const { return static_cast(vDev_); } - //! Returns the GPU device object - const Device& dev() const { return static_cast(dev_); }; + //! 
Returns the GPU device object + const Device& dev() const { return static_cast(dev_); }; - inline Memory& gpuMem(device::Memory& mem) const; + inline Memory& gpuMem(device::Memory& mem) const; - //! Pins host memory for GPU access - amd::Memory* pinHostMemory( - const void* hostMem, //!< Host memory pointer - size_t pinSize, //!< Host memory size - size_t& partial //!< Extra offset for memory alignment - ) const; + //! Pins host memory for GPU access + amd::Memory* pinHostMemory(const void* hostMem, //!< Host memory pointer + size_t pinSize, //!< Host memory size + size_t& partial //!< Extra offset for memory alignment + ) const; - const size_t MinSizeForPinnedTransfer; - bool completeOperation_; //!< DMA blit manager must complete operation - amd::Context* context_; //!< A dummy context + const size_t MinSizeForPinnedTransfer; + bool completeOperation_; //!< DMA blit manager must complete operation + amd::Context* context_; //!< A dummy context -private: + private: + //! Disable copy constructor + DmaBlitManager(const DmaBlitManager&); - //! Disable copy constructor - DmaBlitManager(const DmaBlitManager&); + //! Disable operator= + DmaBlitManager& operator=(const DmaBlitManager&); - //! Disable operator= - DmaBlitManager& operator=(const DmaBlitManager&); + //! Reads video memory, using a staged buffer + bool readMemoryStaged(Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destination host memory + Memory** xferBuf, //!< Staged buffer for read + size_t origin, //!< Original offset in the source memory + size_t& offset, //!< Offset for the current copy pointer + size_t& totalSize, //!< Total size for copy region + size_t xferSize //!< Transfer size + ) const; - //! 
Reads video memory, using a staged buffer - bool readMemoryStaged( - Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destination host memory - Memory** xferBuf, //!< Staged buffer for read - size_t origin, //!< Original offset in the source memory - size_t& offset, //!< Offset for the current copy pointer - size_t& totalSize, //!< Total size for copy region - size_t xferSize //!< Transfer size - ) const; - - //! Write into video memory, using a staged buffer - bool writeMemoryStaged( - const void* srcHost, //!< Source host memory - Memory& dstMemory, //!< Destination memory object - Memory& xferBuf, //!< Staged buffer for write - size_t origin, //!< Original offset in the destination memory - size_t& offset, //!< Offset for the current copy pointer - size_t& totalSize, //!< Total size for the copy region - size_t xferSize //!< Transfer size - ) const; + //! Write into video memory, using a staged buffer + bool writeMemoryStaged(const void* srcHost, //!< Source host memory + Memory& dstMemory, //!< Destination memory object + Memory& xferBuf, //!< Staged buffer for write + size_t origin, //!< Original offset in the destination memory + size_t& offset, //!< Offset for the current copy pointer + size_t& totalSize, //!< Total size for the copy region + size_t xferSize //!< Transfer size + ) const; }; //! Kernel Blit Manager -class KernelBlitManager : public DmaBlitManager -{ -public: - enum { - BlitCopyImage = 0, - BlitCopyImage1DA, - BlitCopyImageToBuffer, - BlitCopyBufferToImage, - BlitCopyBufferRect, - BlitCopyBufferRectAligned, - BlitCopyBuffer, - BlitCopyBufferAligned, - FillBuffer, - FillImage, - Scheduler, - BlitTotal - }; +class KernelBlitManager : public DmaBlitManager { + public: + enum { + BlitCopyImage = 0, + BlitCopyImage1DA, + BlitCopyImageToBuffer, + BlitCopyBufferToImage, + BlitCopyBufferRect, + BlitCopyBufferRectAligned, + BlitCopyBuffer, + BlitCopyBufferAligned, + FillBuffer, + FillImage, + Scheduler, + BlitTotal + }; - //! 
Constructor - KernelBlitManager( - VirtualGPU& gpu, //!< Virtual GPU to be used for blits - Setup setup = Setup() //!< Specifies HW accelerated blits - ); + //! Constructor + KernelBlitManager(VirtualGPU& gpu, //!< Virtual GPU to be used for blits + Setup setup = Setup() //!< Specifies HW accelerated blits + ); - //! Destructor - virtual ~KernelBlitManager(); + //! Destructor + virtual ~KernelBlitManager(); - //! Creates DmaBlitManager object - virtual bool create(amd::Device& device); + //! Creates DmaBlitManager object + virtual bool create(amd::Device& device); - //! Copies a buffer object to another buffer object - virtual bool copyBufferRect( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::BufferRect& srcRectIn, //!< Source rectangle - const amd::BufferRect& dstRectIn, //!< Destination rectangle - const amd::Coord3D& sizeIn, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies a buffer object to another buffer object + virtual bool copyBufferRect(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::BufferRect& srcRectIn, //!< Source rectangle + const amd::BufferRect& dstRectIn, //!< Destination rectangle + const amd::Coord3D& sizeIn, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies a buffer object to system memory - virtual bool readBuffer( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destination host memory - const amd::Coord3D& origin, //!< Source origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! 
Copies a buffer object to system memory + virtual bool readBuffer(device::Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destination host memory + const amd::Coord3D& origin, //!< Source origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies a buffer object to system memory - virtual bool readBufferRect( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destinaiton host memory - const amd::BufferRect& bufRect, //!< Source rectangle - const amd::BufferRect& hostRect, //!< Destination rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies a buffer object to system memory + virtual bool readBufferRect(device::Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destinaiton host memory + const amd::BufferRect& bufRect, //!< Source rectangle + const amd::BufferRect& hostRect, //!< Destination rectangle + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies system memory to a buffer object - virtual bool writeBuffer( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies system memory to a buffer object + virtual bool writeBuffer(const void* srcHost, //!< Source host memory + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! 
Copies system memory to a buffer object - virtual bool writeBufferRect( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::BufferRect& hostRect, //!< Destination rectangle - const amd::BufferRect& bufRect, //!< Source rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies system memory to a buffer object + virtual bool writeBufferRect(const void* srcHost, //!< Source host memory + device::Memory& dstMemory, //!< Destination memory object + const amd::BufferRect& hostRect, //!< Destination rectangle + const amd::BufferRect& bufRect, //!< Source rectangle + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies a buffer object to an image object - virtual bool copyBuffer( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies a buffer object to an image object + virtual bool copyBuffer(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! 
Copies a buffer object to an image object - virtual bool copyBufferToImage( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const; + //! Copies a buffer object to an image object + virtual bool copyBufferToImage(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const; - //! Copies an image object to a buffer object - virtual bool copyImageToBuffer( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const; + //! 
Copies an image object to a buffer object + virtual bool copyImageToBuffer(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const; - //! Copies an image object to another image object - virtual bool copyImage( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies an image object to another image object + virtual bool copyImage(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies an image object to system memory - virtual bool readImage( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destination host memory - const amd::Coord3D& origin, //!< Source origin - const amd::Coord3D& size, //!< Size of the copy region - size_t rowPitch, //!< Row pitch for host memory - size_t slicePitch, //!< Slice pitch for host memory - bool entire = false //!< Entire buffer will be updated - ) const; + //! 
Copies an image object to system memory + virtual bool readImage(device::Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destination host memory + const amd::Coord3D& origin, //!< Source origin + const amd::Coord3D& size, //!< Size of the copy region + size_t rowPitch, //!< Row pitch for host memory + size_t slicePitch, //!< Slice pitch for host memory + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies system memory to an image object - virtual bool writeImage( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - size_t rowPitch, //!< Row pitch for host memory - size_t slicePitch, //!< Slice pitch for host memory - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies system memory to an image object + virtual bool writeImage(const void* srcHost, //!< Source host memory + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + size_t rowPitch, //!< Row pitch for host memory + size_t slicePitch, //!< Slice pitch for host memory + bool entire = false //!< Entire buffer will be updated + ) const; - //! Fills a buffer memory with a pattern data - virtual bool fillBuffer( - device::Memory& memory, //!< Memory object to fill with pattern - const void* pattern, //!< Pattern data - size_t patternSize, //!< Pattern size - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! 
Fills a buffer memory with a pattern data + virtual bool fillBuffer(device::Memory& memory, //!< Memory object to fill with pattern + const void* pattern, //!< Pattern data + size_t patternSize, //!< Pattern size + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Fills an image memory with a pattern data - virtual bool fillImage( - device::Memory& dstMemory, //!< Memory object to fill with pattern - const void* pattern, //!< Pattern data - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Fills an image memory with a pattern data + virtual bool fillImage(device::Memory& dstMemory, //!< Memory object to fill with pattern + const void* pattern, //!< Pattern data + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Fills an image memory with a pattern data - virtual bool runScheduler( - device::Memory& vqueue, //!< Memory object for virtual queue - device::Memory& params, //!< Extra arguments for the scheduler - uint paramIdx, //!< Parameter index - uint threads //!< Number of scheduling threads - ) const; + //! Fills an image memory with a pattern data + virtual bool runScheduler(device::Memory& vqueue, //!< Memory object for virtual queue + device::Memory& params, //!< Extra arguments for the scheduler + uint paramIdx, //!< Parameter index + uint threads //!< Number of scheduling threads + ) const; -private: - static const size_t MaxXferBuffers = 2; - static const uint TransferSplitSize = 3; + private: + static const size_t MaxXferBuffers = 2; + static const uint TransferSplitSize = 3; - //! 
Copies a buffer object to an image object - bool copyBufferToImageKernel( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const; + //! Copies a buffer object to an image object + bool copyBufferToImageKernel(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const; - //! Copies an image object to a buffer object - bool copyImageToBufferKernel( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const; + //! 
Copies an image object to a buffer object + bool copyImageToBufferKernel(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const; - //! Creates a program for all blit operations - bool createProgram( - Device& device //!< Device object - ); + //! Creates a program for all blit operations + bool createProgram(Device& device //!< Device object + ); - //! Creates a view memory object - Memory* createView( - const Memory& parent, //!< Parent memory object - const CalFormat& format //!< The new format for a view - ) const; + //! Creates a view memory object + Memory* createView(const Memory& parent, //!< Parent memory object + const CalFormat& format //!< The new format for a view + ) const; - //! Disable copy constructor - KernelBlitManager(const KernelBlitManager&); + //! Disable copy constructor + KernelBlitManager(const KernelBlitManager&); - //! Disable operator= - KernelBlitManager& operator=(const KernelBlitManager&); + //! 
Disable operator= + KernelBlitManager& operator=(const KernelBlitManager&); - amd::Program* program_; //!< GPU program obejct - amd::Kernel* kernels_[BlitTotal]; //!< GPU kernels for blit - amd::Memory* constantBuffer_; //!< An internal CB for blits - amd::Memory* xferBuffers_[MaxXferBuffers]; //!< Transfer buffers for images - size_t xferBufferSize_; //!< Transfer buffer size - amd::Monitor* lockXferOps_; //!< Lock transfer operation + amd::Program* program_; //!< GPU program obejct + amd::Kernel* kernels_[BlitTotal]; //!< GPU kernels for blit + amd::Memory* constantBuffer_; //!< An internal CB for blits + amd::Memory* xferBuffers_[MaxXferBuffers]; //!< Transfer buffers for images + size_t xferBufferSize_; //!< Transfer buffer size + amd::Monitor* lockXferOps_; //!< Lock transfer operation }; static const char* BlitName[KernelBlitManager::BlitTotal] = { - "copyImage", - "copyImage1DA", - "copyImageToBuffer", - "copyBufferToImage", - "copyBufferRect", - "copyBufferRectAligned", - "copyBuffer", - "copyBufferAligned", - "fillBuffer", - "fillImage", - "scheduler", - }; + "copyImage", "copyImage1DA", "copyImageToBuffer", + "copyBufferToImage", "copyBufferRect", "copyBufferRectAligned", + "copyBuffer", "copyBufferAligned", "fillBuffer", + "fillImage", "scheduler", +}; /*@}*/} // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpucompiler.cpp b/rocclr/runtime/device/gpu/gpucompiler.cpp index edad392eef..34276ee460 100644 --- a/rocclr/runtime/device/gpu/gpucompiler.cpp +++ b/rocclr/runtime/device/gpu/gpucompiler.cpp @@ -14,444 +14,427 @@ #include "utils/options.hpp" #include -//CLC_IN_PROCESS_CHANGE +// CLC_IN_PROCESS_CHANGE extern int openclFrontEnd(const char* cmdline, std::string*, std::string* typeInfo = NULL); namespace gpu { static int programsCount = 0; -bool -NullProgram::compileImpl(const std::string& src, - const std::vector& headers, - const char** headerIncludeNames, - amd::option::Options* options) -{ - std::string sourceCode = src; +bool 
NullProgram::compileImpl(const std::string& src, + const std::vector& headers, + const char** headerIncludeNames, amd::option::Options* options) { + std::string sourceCode = src; - if (dev().settings().debugFlags_ & Settings::CheckForILSource) { - size_t inc = sourceCode.find("il_cs_", 0); - if (inc != std::string::npos) { - // CL program is an IL program - ilProgram_ = sourceCode; - return true; - } + if (dev().settings().debugFlags_ & Settings::CheckForILSource) { + size_t inc = sourceCode.find("il_cs_", 0); + if (inc != std::string::npos) { + // CL program is an IL program + ilProgram_ = sourceCode; + return true; } + } - std::string tempFolder = amd::Os::getTempPath(); - std::string tempFileName = amd::Os::getTempFileName(); + std::string tempFolder = amd::Os::getTempPath(); + std::string tempFileName = amd::Os::getTempFileName(); - if (dev().settings().debugFlags_ & Settings::StubCLPrograms) { - std::stringstream fileName; - std::fstream stubRead; - // Dump the IL function - fileName << "program_" << programsCount++ << ".cl"; - stubRead.open(fileName.str().c_str(), (std::fstream::in | std::fstream::binary)); - // Check if we have OpenCL program - if (stubRead.is_open()) { - // Find the stream size - stubRead.seekg(0, std::fstream::end); - size_t size = stubRead.tellg(); - stubRead.seekg(0, std::ios::beg); + if (dev().settings().debugFlags_ & Settings::StubCLPrograms) { + std::stringstream fileName; + std::fstream stubRead; + // Dump the IL function + fileName << "program_" << programsCount++ << ".cl"; + stubRead.open(fileName.str().c_str(), (std::fstream::in | std::fstream::binary)); + // Check if we have OpenCL program + if (stubRead.is_open()) { + // Find the stream size + stubRead.seekg(0, std::fstream::end); + size_t size = stubRead.tellg(); + stubRead.seekg(0, std::ios::beg); - char* data = new char[size]; - stubRead.read(data, size); - stubRead.close(); + char* data = new char[size]; + stubRead.read(data, size); + stubRead.close(); - 
sourceCode.assign(data, size); - delete[] data; - } - else { - std::fstream stubWrite; - stubWrite.open(fileName.str().c_str(), - (std::fstream::out | std::fstream::binary)); - stubWrite << sourceCode; - stubWrite.close(); - } + sourceCode.assign(data, size); + delete[] data; + } else { + std::fstream stubWrite; + stubWrite.open(fileName.str().c_str(), (std::fstream::out | std::fstream::binary)); + stubWrite << sourceCode; + stubWrite.close(); } + } - std::fstream f; - std::vector headerFileNames(headers.size()); - std::vector newDirs; - for (size_t i = 0; i < headers.size(); ++i) { - std::string headerPath = tempFolder; - std::string headerIncludeName(headerIncludeNames[i]); - // replace / in path with current os's file separator - if ( amd::Os::fileSeparator() != '/') { - for (std::string::iterator it = headerIncludeName.begin(), - end = headerIncludeName.end(); - it != end; - ++it) { - if (*it == '/') *it = amd::Os::fileSeparator(); - } - } - size_t pos = headerIncludeName.rfind(amd::Os::fileSeparator()); - if (pos != std::string::npos) { - headerPath += amd::Os::fileSeparator(); - headerPath += headerIncludeName.substr(0, pos); - headerIncludeName = headerIncludeName.substr(pos+1); - } - if (!amd::Os::pathExists(headerPath)) { - bool ret = amd::Os::createPath(headerPath); - assert(ret && "failed creating path!"); - newDirs.push_back(headerPath); - } - std::string headerFullName - = headerPath + amd::Os::fileSeparator() + headerIncludeName; - headerFileNames[i] = headerFullName; - f.open(headerFullName.c_str(), std::fstream::out); - assert(!f.fail() && "failed creating header file!"); - f.write(headers[i]->c_str(), headers[i]->length()); - f.close(); + std::fstream f; + std::vector headerFileNames(headers.size()); + std::vector newDirs; + for (size_t i = 0; i < headers.size(); ++i) { + std::string headerPath = tempFolder; + std::string headerIncludeName(headerIncludeNames[i]); + // replace / in path with current os's file separator + if (amd::Os::fileSeparator() 
!= '/') { + for (std::string::iterator it = headerIncludeName.begin(), end = headerIncludeName.end(); + it != end; ++it) { + if (*it == '/') *it = amd::Os::fileSeparator(); + } } - - acl_error err; - const aclTargetInfo& targInfo = info(); - - aclBinaryOptions binOpts = {0}; - binOpts.struct_size = sizeof(binOpts); - binOpts.elfclass = targInfo.arch_id == aclAMDIL64 ? ELFCLASS64 : ELFCLASS32; - binOpts.bitness = ELFDATA2LSB; - binOpts.alloc = &::malloc; - binOpts.dealloc = &::free; - - aclBinary* bin - = aclBinaryInit(sizeof(aclBinary), &targInfo, &binOpts, &err); - if (err != ACL_SUCCESS) { - LogWarning("aclBinaryInit failed"); - return false; + size_t pos = headerIncludeName.rfind(amd::Os::fileSeparator()); + if (pos != std::string::npos) { + headerPath += amd::Os::fileSeparator(); + headerPath += headerIncludeName.substr(0, pos); + headerIncludeName = headerIncludeName.substr(pos + 1); } - - if (ACL_SUCCESS != aclInsertSection(dev().compiler(), bin, - sourceCode.c_str(), sourceCode.size(), aclSOURCE)) { - LogWarning("aclInsertSection failed"); - aclBinaryFini(bin); - return false; + if (!amd::Os::pathExists(headerPath)) { + bool ret = amd::Os::createPath(headerPath); + assert(ret && "failed creating path!"); + newDirs.push_back(headerPath); } + std::string headerFullName = headerPath + amd::Os::fileSeparator() + headerIncludeName; + headerFileNames[i] = headerFullName; + f.open(headerFullName.c_str(), std::fstream::out); + assert(!f.fail() && "failed creating header file!"); + f.write(headers[i]->c_str(), headers[i]->length()); + f.close(); + } - // temporary solution to synchronize buildNo between runtime and complib - // until we move runtime inside complib - ((amd::option::Options*)bin->options)->setBuildNo(options->getBuildNo()); + acl_error err; + const aclTargetInfo& targInfo = info(); - std::stringstream opts; - std::string token; - opts << options->origOptionStr.c_str(); + aclBinaryOptions binOpts = {0}; + binOpts.struct_size = sizeof(binOpts); + 
binOpts.elfclass = targInfo.arch_id == aclAMDIL64 ? ELFCLASS64 : ELFCLASS32; + binOpts.bitness = ELFDATA2LSB; + binOpts.alloc = &::malloc; + binOpts.dealloc = &::free; - if (options->origOptionStr.find("-cl-std=CL") == std::string::npos) { - switch(dev().settings().oclVersion_) { - case OpenCL10: opts << " -cl-std=CL1.0"; break; - case OpenCL11: opts << " -cl-std=CL1.1"; break; - case OpenCL20: default: - case OpenCL12: opts << " -cl-std=CL1.2"; break; - } - } + aclBinary* bin = aclBinaryInit(sizeof(aclBinary), &targInfo, &binOpts, &err); + if (err != ACL_SUCCESS) { + LogWarning("aclBinaryInit failed"); + return false; + } - // FIXME: Should we prefix everything with -Wf,? - std::istringstream iss(options->clcOptions); - while (getline(iss, token, ' ')) { - if (!token.empty()) { - // Check if this is a -D option - if (token.compare("-D") == 0) { - // It is, skip payload - getline(iss, token, ' '); - continue; - } - opts << " -Wf," << token; - } - } - - if (!headers.empty()) { - opts << " -I" << tempFolder; - } - - if (!dev().settings().imageSupport_) { - opts << " -fno-image-support"; - } - - if (dev().settings().reportFMAF_) { - opts << " -mfast-fmaf"; - } - - if (dev().settings().reportFMA_) { - opts << " -mfast-fma"; - } - - iss.clear(); - iss.str(device().info().extensions_); - while (getline(iss, token, ' ')) { - if (!token.empty()) { - opts << " -D" << token << "=1"; - } - } - - std::string newOpt = opts.str(); - size_t pos = newOpt.find("-fno-bin-llvmir"); - while (pos != std::string::npos) { - newOpt.erase(pos, 15); - pos = newOpt.find("-fno-bin-llvmir"); - } - - err = aclCompile(dev().compiler(), bin, newOpt.c_str(), - ACL_TYPE_OPENCL, ACL_TYPE_LLVMIR_BINARY, NULL); - - buildLog_ += aclGetCompilerLog(dev().compiler()); - - if (err != ACL_SUCCESS) { - LogWarning("aclCompile failed"); - aclBinaryFini(bin); - return false; - } - - size_t len = 0; - const void* ir = aclExtractSection(dev().compiler(), bin, - &len, aclLLVMIR, &err); - if (err != ACL_SUCCESS) { 
- LogWarning("aclExtractSection failed"); - aclBinaryFini(bin); - return false; - } - - llvmBinary_.assign(reinterpret_cast(ir), len); - elfSectionType_ = amd::OclElf::LLVMIR; + if (ACL_SUCCESS != + aclInsertSection(dev().compiler(), bin, sourceCode.c_str(), sourceCode.size(), aclSOURCE)) { + LogWarning("aclInsertSection failed"); aclBinaryFini(bin); + return false; + } - for (size_t i = 0; i < headerFileNames.size(); ++i) { - amd::Os::unlink(headerFileNames[i].c_str()); + // temporary solution to synchronize buildNo between runtime and complib + // until we move runtime inside complib + ((amd::option::Options*)bin->options)->setBuildNo(options->getBuildNo()); + + std::stringstream opts; + std::string token; + opts << options->origOptionStr.c_str(); + + if (options->origOptionStr.find("-cl-std=CL") == std::string::npos) { + switch (dev().settings().oclVersion_) { + case OpenCL10: + opts << " -cl-std=CL1.0"; + break; + case OpenCL11: + opts << " -cl-std=CL1.1"; + break; + case OpenCL20: + default: + case OpenCL12: + opts << " -cl-std=CL1.2"; + break; } - for (size_t i = 0; i < newDirs.size(); ++i) { - amd::Os::removePath(newDirs[i]); + } + + // FIXME: Should we prefix everything with -Wf,? 
+ std::istringstream iss(options->clcOptions); + while (getline(iss, token, ' ')) { + if (!token.empty()) { + // Check if this is a -D option + if (token.compare("-D") == 0) { + // It is, skip payload + getline(iss, token, ' '); + continue; + } + opts << " -Wf," << token; } + } + + if (!headers.empty()) { + opts << " -I" << tempFolder; + } + + if (!dev().settings().imageSupport_) { + opts << " -fno-image-support"; + } + + if (dev().settings().reportFMAF_) { + opts << " -mfast-fmaf"; + } + + if (dev().settings().reportFMA_) { + opts << " -mfast-fma"; + } + + iss.clear(); + iss.str(device().info().extensions_); + while (getline(iss, token, ' ')) { + if (!token.empty()) { + opts << " -D" << token << "=1"; + } + } + + std::string newOpt = opts.str(); + size_t pos = newOpt.find("-fno-bin-llvmir"); + while (pos != std::string::npos) { + newOpt.erase(pos, 15); + pos = newOpt.find("-fno-bin-llvmir"); + } + + err = aclCompile(dev().compiler(), bin, newOpt.c_str(), ACL_TYPE_OPENCL, ACL_TYPE_LLVMIR_BINARY, + NULL); + + buildLog_ += aclGetCompilerLog(dev().compiler()); + + if (err != ACL_SUCCESS) { + LogWarning("aclCompile failed"); + aclBinaryFini(bin); + return false; + } + + size_t len = 0; + const void* ir = aclExtractSection(dev().compiler(), bin, &len, aclLLVMIR, &err); + if (err != ACL_SUCCESS) { + LogWarning("aclExtractSection failed"); + aclBinaryFini(bin); + return false; + } + + llvmBinary_.assign(reinterpret_cast(ir), len); + elfSectionType_ = amd::OclElf::LLVMIR; + aclBinaryFini(bin); + + for (size_t i = 0; i < headerFileNames.size(); ++i) { + amd::Os::unlink(headerFileNames[i].c_str()); + } + for (size_t i = 0; i < newDirs.size(); ++i) { + amd::Os::removePath(newDirs[i]); + } #ifdef _WIN32 - amd::Os::unlink(tempFileName); + amd::Os::unlink(tempFileName); #endif - if (clBinary()->saveSOURCE()) { - clBinary()->elfOut()->addSection( - amd::OclElf::SOURCE, sourceCode.data(), sourceCode.size()); - } - if (clBinary()->saveLLVMIR()) { - clBinary()->elfOut()->addSection( 
- amd::OclElf::LLVMIR, llvmBinary_.data(), llvmBinary_.size(), false); - // store the original compile options - clBinary()->storeCompileOptions(compileOptions_); - } + if (clBinary()->saveSOURCE()) { + clBinary()->elfOut()->addSection(amd::OclElf::SOURCE, sourceCode.data(), sourceCode.size()); + } + if (clBinary()->saveLLVMIR()) { + clBinary()->elfOut()->addSection(amd::OclElf::LLVMIR, llvmBinary_.data(), llvmBinary_.size(), + false); + // store the original compile options + clBinary()->storeCompileOptions(compileOptions_); + } - return true; + return true; } -int -NullProgram::compileBinaryToIL(amd::option::Options* options) -{ - acl_error err; - const aclTargetInfo& targInfo = info(); +int NullProgram::compileBinaryToIL(amd::option::Options* options) { + acl_error err; + const aclTargetInfo& targInfo = info(); - aclBinaryOptions binOpts = {0}; - binOpts.struct_size = sizeof(binOpts); - binOpts.elfclass = targInfo.arch_id == aclAMDIL64 ? ELFCLASS64 : ELFCLASS32; - binOpts.bitness = ELFDATA2LSB; - binOpts.alloc = &::malloc; - binOpts.dealloc = &::free; + aclBinaryOptions binOpts = {0}; + binOpts.struct_size = sizeof(binOpts); + binOpts.elfclass = targInfo.arch_id == aclAMDIL64 ? 
ELFCLASS64 : ELFCLASS32; + binOpts.bitness = ELFDATA2LSB; + binOpts.alloc = &::malloc; + binOpts.dealloc = &::free; - aclBinary* bin - = aclBinaryInit(sizeof(aclBinary), &targInfo, &binOpts, &err); - if (err != ACL_SUCCESS) { - LogWarning("aclBinaryInit failed"); - return CL_BUILD_PROGRAM_FAILURE; - } - aclSections_0_8 spirFlag; - _acl_type_enum_0_8 aclTypeBinaryUsed; - if (std::string::npos != options->clcOptions.find("--spirv") - || elfSectionType_ == amd::OclElf::SPIRV) { - spirFlag = aclSPIRV; - aclTypeBinaryUsed = ACL_TYPE_SPIRV_BINARY; - } else if (std::string::npos != options->clcOptions.find("--spir") - || elfSectionType_ == amd::OclElf::SPIR) { - spirFlag = aclSPIR; - aclTypeBinaryUsed = ACL_TYPE_SPIR_BINARY; - } else { - spirFlag = aclLLVMIR; - aclTypeBinaryUsed = ACL_TYPE_LLVMIR_BINARY; - } + aclBinary* bin = aclBinaryInit(sizeof(aclBinary), &targInfo, &binOpts, &err); + if (err != ACL_SUCCESS) { + LogWarning("aclBinaryInit failed"); + return CL_BUILD_PROGRAM_FAILURE; + } + aclSections_0_8 spirFlag; + _acl_type_enum_0_8 aclTypeBinaryUsed; + if (std::string::npos != options->clcOptions.find("--spirv") || + elfSectionType_ == amd::OclElf::SPIRV) { + spirFlag = aclSPIRV; + aclTypeBinaryUsed = ACL_TYPE_SPIRV_BINARY; + } else if (std::string::npos != options->clcOptions.find("--spir") || + elfSectionType_ == amd::OclElf::SPIR) { + spirFlag = aclSPIR; + aclTypeBinaryUsed = ACL_TYPE_SPIR_BINARY; + } else { + spirFlag = aclLLVMIR; + aclTypeBinaryUsed = ACL_TYPE_LLVMIR_BINARY; + } - if (ACL_SUCCESS != aclInsertSection(dev().compiler(), bin, - llvmBinary_.data(), llvmBinary_.size(), spirFlag)) { - LogWarning("aclInsertSection failed"); - aclBinaryFini(bin); - return CL_BUILD_PROGRAM_FAILURE; - } - - // pass kernel argument alignment info to compiler lib through option str - std::string optionStr = options->origOptionStr; - if (options->origOptionStr.find("kernel-arg-alignment") - == std::string::npos) { - char s[256]; - sprintf(s, " -Wb,-kernel-arg-alignment=%d", 
- dev().info().memBaseAddrAlign_ / 8); - optionStr += s; - } - - // temporary solution to synchronize buildNo between runtime and complib - // until we move runtime inside complib - ((amd::option::Options*)bin->options)->setBuildNo(options->getBuildNo()); - - aclType type = ACL_TYPE_CG ; - // If option bin-bif30 is set, generate BIF 3.0 binary - if (options->oVariables->BinBIF30) { - type = ACL_TYPE_ISA; - } - - err = aclCompile(dev().compiler(), bin, optionStr.c_str(), - aclTypeBinaryUsed, type, NULL); - buildLog_ += aclGetCompilerLog(dev().compiler()); - - if (err != ACL_SUCCESS) { - LogWarning("aclCompile failed"); - aclBinaryFini(bin); - return CL_BUILD_PROGRAM_FAILURE; - } - - if (options->oVariables->BinBIF30) { - if (!createBIFBinary(bin)) { - aclBinaryFini(bin); - return CL_BUILD_PROGRAM_FAILURE; - } - } - - size_t len = 0; - const void* amdil = aclExtractSection(dev().compiler(), bin, - &len, aclCODEGEN, &err); - if (err != ACL_SUCCESS) { - LogWarning("aclExtractSection failed"); - aclBinaryFini(bin); - return CL_BUILD_PROGRAM_FAILURE; - } - - ilProgram_.assign(reinterpret_cast(amdil), len); + if (ACL_SUCCESS != + aclInsertSection(dev().compiler(), bin, llvmBinary_.data(), llvmBinary_.size(), spirFlag)) { + LogWarning("aclInsertSection failed"); aclBinaryFini(bin); + return CL_BUILD_PROGRAM_FAILURE; + } - return CL_SUCCESS; + // pass kernel argument alignment info to compiler lib through option str + std::string optionStr = options->origOptionStr; + if (options->origOptionStr.find("kernel-arg-alignment") == std::string::npos) { + char s[256]; + sprintf(s, " -Wb,-kernel-arg-alignment=%d", dev().info().memBaseAddrAlign_ / 8); + optionStr += s; + } + + // temporary solution to synchronize buildNo between runtime and complib + // until we move runtime inside complib + ((amd::option::Options*)bin->options)->setBuildNo(options->getBuildNo()); + + aclType type = ACL_TYPE_CG; + // If option bin-bif30 is set, generate BIF 3.0 binary + if 
(options->oVariables->BinBIF30) { + type = ACL_TYPE_ISA; + } + + err = aclCompile(dev().compiler(), bin, optionStr.c_str(), aclTypeBinaryUsed, type, NULL); + buildLog_ += aclGetCompilerLog(dev().compiler()); + + if (err != ACL_SUCCESS) { + LogWarning("aclCompile failed"); + aclBinaryFini(bin); + return CL_BUILD_PROGRAM_FAILURE; + } + + if (options->oVariables->BinBIF30) { + if (!createBIFBinary(bin)) { + aclBinaryFini(bin); + return CL_BUILD_PROGRAM_FAILURE; + } + } + + size_t len = 0; + const void* amdil = aclExtractSection(dev().compiler(), bin, &len, aclCODEGEN, &err); + if (err != ACL_SUCCESS) { + LogWarning("aclExtractSection failed"); + aclBinaryFini(bin); + return CL_BUILD_PROGRAM_FAILURE; + } + + ilProgram_.assign(reinterpret_cast(amdil), len); + aclBinaryFini(bin); + + return CL_SUCCESS; } -bool -HSAILProgram::compileImpl( - const std::string& sourceCode, - const std::vector& headers, - const char** headerIncludeNames, - amd::option::Options* options) -{ - acl_error errorCode; - aclTargetInfo target; +bool HSAILProgram::compileImpl(const std::string& sourceCode, + const std::vector& headers, + const char** headerIncludeNames, amd::option::Options* options) { + acl_error errorCode; + aclTargetInfo target; - std::string arch = "hsail"; - if (dev().settings().use64BitPtr_) { - arch += "64"; + std::string arch = "hsail"; + if (dev().settings().use64BitPtr_) { + arch += "64"; + } + target = aclGetTargetInfo(arch.c_str(), dev().hwInfo()->targetName_, &errorCode); + + // end if asic info is ready + // We dump the source code for each program (param: headers) + // into their filenames (headerIncludeNames) into the TEMP + // folder specific to the OS and add the include path while + // compiling + + // Find the temp folder for the OS + std::string tempFolder = amd::Os::getTempPath(); + std::string tempFileName = amd::Os::getTempFileName(); + + // Iterate through each source code and dump it into tmp + std::fstream f; + std::vector headerFileNames(headers.size()); + 
std::vector newDirs; + for (size_t i = 0; i < headers.size(); ++i) { + std::string headerPath = tempFolder; + std::string headerIncludeName(headerIncludeNames[i]); + // replace / in path with current os's file separator + if (amd::Os::fileSeparator() != '/') { + for (std::string::iterator it = headerIncludeName.begin(), end = headerIncludeName.end(); + it != end; ++it) { + if (*it == '/') *it = amd::Os::fileSeparator(); + } } - target = aclGetTargetInfo(arch.c_str(), - dev().hwInfo()->targetName_, &errorCode); - - // end if asic info is ready - // We dump the source code for each program (param: headers) - // into their filenames (headerIncludeNames) into the TEMP - // folder specific to the OS and add the include path while - // compiling - - // Find the temp folder for the OS - std::string tempFolder = amd::Os::getTempPath(); - std::string tempFileName = amd::Os::getTempFileName(); - - // Iterate through each source code and dump it into tmp - std::fstream f; - std::vector headerFileNames(headers.size()); - std::vector newDirs; - for (size_t i = 0; i < headers.size(); ++i) { - std::string headerPath = tempFolder; - std::string headerIncludeName(headerIncludeNames[i]); - // replace / in path with current os's file separator - if (amd::Os::fileSeparator() != '/') { - for (std::string::iterator it = headerIncludeName.begin(), - end = headerIncludeName.end(); it != end; ++it) { - if (*it == '/') *it = amd::Os::fileSeparator(); - } - } - size_t pos = headerIncludeName.rfind(amd::Os::fileSeparator()); - if (pos != std::string::npos) { - headerPath += amd::Os::fileSeparator(); - headerPath += headerIncludeName.substr(0, pos); - headerIncludeName = headerIncludeName.substr(pos+1); - } - if (!amd::Os::pathExists(headerPath)) { - bool ret = amd::Os::createPath(headerPath); - assert(ret && "failed creating path!"); - newDirs.push_back(headerPath); - } - std::string headerFullName = - headerPath + amd::Os::fileSeparator() + headerIncludeName; - headerFileNames[i] = 
headerFullName; - f.open(headerFullName.c_str(), std::fstream::out); - // Should we allow asserts - assert(!f.fail() && "failed creating header file!"); - f.write(headers[i]->c_str(), headers[i]->length()); - f.close(); + size_t pos = headerIncludeName.rfind(amd::Os::fileSeparator()); + if (pos != std::string::npos) { + headerPath += amd::Os::fileSeparator(); + headerPath += headerIncludeName.substr(0, pos); + headerIncludeName = headerIncludeName.substr(pos + 1); } - - // Create Binary - binaryElf_ = aclBinaryInit(sizeof(aclBinary), - &target, &binOpts_, &errorCode); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error: aclBinary init failure\n"; - LogWarning("aclBinaryInit failed"); - return false; + if (!amd::Os::pathExists(headerPath)) { + bool ret = amd::Os::createPath(headerPath); + assert(ret && "failed creating path!"); + newDirs.push_back(headerPath); } + std::string headerFullName = headerPath + amd::Os::fileSeparator() + headerIncludeName; + headerFileNames[i] = headerFullName; + f.open(headerFullName.c_str(), std::fstream::out); + // Should we allow asserts + assert(!f.fail() && "failed creating header file!"); + f.write(headers[i]->c_str(), headers[i]->length()); + f.close(); + } - // Insert opencl into binary - errorCode = aclInsertSection(dev().hsaCompiler(), binaryElf_, - sourceCode.c_str(), strlen(sourceCode.c_str()), aclSOURCE); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error: Inserting openCl Source to binary\n"; - } + // Create Binary + binaryElf_ = aclBinaryInit(sizeof(aclBinary), &target, &binOpts_, &errorCode); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error: aclBinary init failure\n"; + LogWarning("aclBinaryInit failed"); + return false; + } - // Set the options for the compiler - // Set the include path for the temp folder that contains the includes - if (!headers.empty()) { - compileOptions_.append(" -I"); - compileOptions_.append(tempFolder); - } + // Insert opencl into binary + errorCode = 
aclInsertSection(dev().hsaCompiler(), binaryElf_, sourceCode.c_str(), + strlen(sourceCode.c_str()), aclSOURCE); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error: Inserting openCl Source to binary\n"; + } - //Add only for CL2.0 and above - if (options->oVariables->CLStd[2] >= '2') { - std::stringstream opts; - opts << " -D" << "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE=" - << device().info().maxGlobalVariableSize_; - compileOptions_.append(opts.str()); - } + // Set the options for the compiler + // Set the include path for the temp folder that contains the includes + if (!headers.empty()) { + compileOptions_.append(" -I"); + compileOptions_.append(tempFolder); + } + + // Add only for CL2.0 and above + if (options->oVariables->CLStd[2] >= '2') { + std::stringstream opts; + opts << " -D" + << "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE=" << device().info().maxGlobalVariableSize_; + compileOptions_.append(opts.str()); + } #if !defined(_LP64) && defined(ATI_OS_LINUX) - if (options->origOptionStr.find("-cl-std=CL2.0") != std::string::npos && !dev().settings().force32BitOcl20_) { - errorCode = ACL_UNSUPPORTED; - LogWarning("aclCompile failed"); - return false; - } + if (options->origOptionStr.find("-cl-std=CL2.0") != std::string::npos && + !dev().settings().force32BitOcl20_) { + errorCode = ACL_UNSUPPORTED; + LogWarning("aclCompile failed"); + return false; + } #endif - // Compile source to IR - compileOptions_.append(hsailOptions()); - errorCode = aclCompile(dev().hsaCompiler(), binaryElf_, compileOptions_.c_str(), - ACL_TYPE_OPENCL, ACL_TYPE_LLVMIR_BINARY, NULL); - buildLog_ += aclGetCompilerLog(dev().hsaCompiler()); - if (errorCode != ACL_SUCCESS) { - LogWarning("aclCompile failed"); - buildLog_ += "Error: Compiling CL to IR\n"; - return false; - } + // Compile source to IR + compileOptions_.append(hsailOptions()); + errorCode = aclCompile(dev().hsaCompiler(), binaryElf_, compileOptions_.c_str(), ACL_TYPE_OPENCL, + ACL_TYPE_LLVMIR_BINARY, NULL); + buildLog_ += 
aclGetCompilerLog(dev().hsaCompiler()); + if (errorCode != ACL_SUCCESS) { + LogWarning("aclCompile failed"); + buildLog_ += "Error: Compiling CL to IR\n"; + return false; + } - clBinary()->storeCompileOptions(compileOptions_); + clBinary()->storeCompileOptions(compileOptions_); - // Save the binary in the interface class - saveBinaryAndSetType(TYPE_COMPILED); + // Save the binary in the interface class + saveBinaryAndSetType(TYPE_COMPILED); - return true; + return true; } -} // namespace gpu +} // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpuconstbuf.cpp b/rocclr/runtime/device/gpu/gpuconstbuf.cpp index ccf7d1f104..70b301d63e 100644 --- a/rocclr/runtime/device/gpu/gpuconstbuf.cpp +++ b/rocclr/runtime/device/gpu/gpuconstbuf.cpp @@ -9,81 +9,74 @@ namespace gpu { -ConstBuffer::ConstBuffer( - VirtualGPU& gpu, - size_t size) - : Memory(const_cast(gpu.dev()), size * VectorSize) - , gpu_(gpu) - , size_(size * VectorSize) - , wrtOffset_(0) - , lastWrtSize_(0) - , wrtAddress_(NULL) -{ +ConstBuffer::ConstBuffer(VirtualGPU& gpu, size_t size) + : Memory(const_cast(gpu.dev()), size * VectorSize), + gpu_(gpu), + size_(size * VectorSize), + wrtOffset_(0), + lastWrtSize_(0), + wrtAddress_(NULL) {} + +ConstBuffer::~ConstBuffer() { + if (wrtAddress_ != NULL) { + unmap(&gpu_); + } + + amd::AlignedMemory::deallocate(sysMemCopy_); } -ConstBuffer::~ConstBuffer() -{ +bool ConstBuffer::create() { + // Create sysmem copy for the constant buffer + sysMemCopy_ = reinterpret_cast
(amd::AlignedMemory::allocate(size_, 256)); + if (sysMemCopy_ == NULL) { + LogPrintfError( + "We couldn't allocate sysmem copy for constant buffer,\ + size(%d)!", + size_); + return false; + } + memset(sysMemCopy_, 0, size_); + + if (!Memory::create(Resource::RemoteUSWC)) { + LogPrintfError("We couldn't create HW constant buffer, size(%d)!", size_); + return false; + } + + // Constant buffer warm-up + warmUpRenames(gpu_); + + wrtAddress_ = map(&gpu_, Resource::Discard); + if (wrtAddress_ == NULL) { + LogPrintfError("We couldn't map HW constant buffer, size(%d)!", size_); + return false; + } + + return true; +} + +bool ConstBuffer::uploadDataToHw(size_t size) { + static const size_t HwCbAlignment = 256; + + // Align copy size on the vector's boundary + size_t count = amd::alignUp(size, VectorSize); + wrtOffset_ += lastWrtSize_; + + // Check if CB has enough space for copy + if ((wrtOffset_ + count) > size_) { if (wrtAddress_ != NULL) { - unmap(&gpu_); + unmap(&gpu_); } - - amd::AlignedMemory::deallocate(sysMemCopy_); -} - -bool -ConstBuffer::create() -{ - // Create sysmem copy for the constant buffer - sysMemCopy_ = reinterpret_cast
(amd::AlignedMemory::allocate(size_, 256)); - if (sysMemCopy_ == NULL) { - LogPrintfError("We couldn't allocate sysmem copy for constant buffer,\ - size(%d)!", size_); - return false; - } - memset(sysMemCopy_, 0, size_); - - if (!Memory::create(Resource::RemoteUSWC)) { - LogPrintfError("We couldn't create HW constant buffer, size(%d)!", size_); - return false; - } - - // Constant buffer warm-up - warmUpRenames(gpu_); - wrtAddress_ = map(&gpu_, Resource::Discard); - if (wrtAddress_ == NULL) { - LogPrintfError("We couldn't map HW constant buffer, size(%d)!", size_); - return false; - } + wrtOffset_ = 0; + lastWrtSize_ = 0; + } - return true; + // Update memory with new CB data + memcpy((reinterpret_cast(wrtAddress_) + wrtOffset_), sysMemCopy_, count); + + // Adjust the size by the HW CB buffer alignment + lastWrtSize_ = amd::alignUp(size, HwCbAlignment); + return true; } -bool -ConstBuffer::uploadDataToHw(size_t size) -{ - static const size_t HwCbAlignment = 256; - - // Align copy size on the vector's boundary - size_t count = amd::alignUp(size, VectorSize); - wrtOffset_ += lastWrtSize_; - - // Check if CB has enough space for copy - if ((wrtOffset_ + count) > size_) { - if (wrtAddress_ != NULL) { - unmap(&gpu_); - } - wrtAddress_ = map(&gpu_, Resource::Discard); - wrtOffset_ = 0; - lastWrtSize_ = 0; - } - - // Update memory with new CB data - memcpy((reinterpret_cast(wrtAddress_) + wrtOffset_), sysMemCopy_, count); - - // Adjust the size by the HW CB buffer alignment - lastWrtSize_ = amd::alignUp(size, HwCbAlignment); - return true; -} - -} // namespace gpu +} // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpuconstbuf.hpp b/rocclr/runtime/device/gpu/gpuconstbuf.hpp index aeb129e2a2..68999a99f7 100644 --- a/rocclr/runtime/device/gpu/gpuconstbuf.hpp +++ b/rocclr/runtime/device/gpu/gpuconstbuf.hpp @@ -11,57 +11,54 @@ namespace gpu { //! Cconstant buffer -class ConstBuffer : public Memory -{ -public: - //! 
Vector size of the constant buffer - static const size_t VectorSize = 16; +class ConstBuffer : public Memory { + public: + //! Vector size of the constant buffer + static const size_t VectorSize = 16; - //! Constructor for the ConstBuffer class - ConstBuffer( - VirtualGPU& gpu, //!< Virtual GPU device object - size_t size //!< size of the constant buffer in vectors - ); + //! Constructor for the ConstBuffer class + ConstBuffer(VirtualGPU& gpu, //!< Virtual GPU device object + size_t size //!< size of the constant buffer in vectors + ); - //! Destructor for the ConstBuffer class - ~ConstBuffer(); + //! Destructor for the ConstBuffer class + ~ConstBuffer(); - //! Creates the real HW constant buffer - bool create(); + //! Creates the real HW constant buffer + bool create(); - /*! \brief Uploads current constant buffer data from sysMemCopy_ to HW - * - * \return True if the data upload was succesful - */ - bool uploadDataToHw( - size_t size //!< real data size for upload - ); + /*! \brief Uploads current constant buffer data from sysMemCopy_ to HW + * + * \return True if the data upload was succesful + */ + bool uploadDataToHw(size_t size //!< real data size for upload + ); - //! Returns a pointer to the system memory copy for CB - address sysMemCopy() const { return sysMemCopy_; } + //! Returns a pointer to the system memory copy for CB + address sysMemCopy() const { return sysMemCopy_; } - //! Returns CB size - size_t size() const { return size_; } + //! Returns CB size + size_t size() const { return size_; } - //! Returns current write offset for the constant buffer - size_t wrtOffset() const { return wrtOffset_; } + //! Returns current write offset for the constant buffer + size_t wrtOffset() const { return wrtOffset_; } - //! Returns last write size for the constant buffer - size_t lastWrtSize() const { return lastWrtSize_; } + //! Returns last write size for the constant buffer + size_t lastWrtSize() const { return lastWrtSize_; } -private: - //! 
Disable copy constructor - ConstBuffer(const ConstBuffer&); + private: + //! Disable copy constructor + ConstBuffer(const ConstBuffer&); - //! Disable operator= - ConstBuffer& operator=(const ConstBuffer&); + //! Disable operator= + ConstBuffer& operator=(const ConstBuffer&); - VirtualGPU& gpu_; //!< Virtual GPU object - address sysMemCopy_; //!< System memory copy - size_t size_; //!< Constant buffer size - size_t wrtOffset_; //!< Current write offset - size_t lastWrtSize_; //!< Last write size - void* wrtAddress_; //!< Write address in CB + VirtualGPU& gpu_; //!< Virtual GPU object + address sysMemCopy_; //!< System memory copy + size_t size_; //!< Constant buffer size + size_t wrtOffset_; //!< Current write offset + size_t lastWrtSize_; //!< Last write size + void* wrtAddress_; //!< Write address in CB }; diff --git a/rocclr/runtime/device/gpu/gpucounters.cpp b/rocclr/runtime/device/gpu/gpucounters.cpp index 5ca7f82c2e..7059b78f9f 100644 --- a/rocclr/runtime/device/gpu/gpucounters.cpp +++ b/rocclr/runtime/device/gpu/gpucounters.cpp @@ -10,79 +10,72 @@ namespace gpu { CalCounterReference::~CalCounterReference() { - // The counter object is always associated with a particular queue, - // so we have to lock just this queue - amd::ScopedLock lock(gpu_.execution()); + // The counter object is always associated with a particular queue, + // so we have to lock just this queue + amd::ScopedLock lock(gpu_.execution()); - if (0 != counter_) { - gpu().cs()->destroyQuery(gslCounter()); - } + if (0 != counter_) { + gpu().cs()->destroyQuery(gslCounter()); + } } -bool -CalCounterReference::growResultArray(uint index) { - if (results_ != NULL) { - delete [] results_; - } - results_ = new uint64_t [index + 1]; - if (results_ == NULL) { - return false; - } - return true; +bool CalCounterReference::growResultArray(uint index) { + if (results_ != NULL) { + delete[] results_; + } + results_ = new uint64_t[index + 1]; + if (results_ == NULL) { + return false; + } + return true; } 
-PerfCounter::~PerfCounter() -{ - if (calRef_ == NULL) { - return; - } +PerfCounter::~PerfCounter() { + if (calRef_ == NULL) { + return; + } - // Release the counter reference object - calRef_->release(); + // Release the counter reference object + calRef_->release(); } -bool -PerfCounter::create( - CalCounterReference* calRef) -{ - assert(&gpu() == &calRef->gpu()); +bool PerfCounter::create(CalCounterReference* calRef) { + assert(&gpu() == &calRef->gpu()); - calRef_ = calRef; - counter_ = calRef->gslCounter(); - index_ = calRef->retain() - 2; - calRef->growResultArray(index_); + calRef_ = calRef; + counter_ = calRef->gslCounter(); + index_ = calRef->retain() - 2; + calRef->growResultArray(index_); - // Initialize the counter - gslCounter()->getAsPerformanceQueryObject()->setCounterState( - info()->blockIndex_, info()->counterIndex_, info()->eventIndex_); + // Initialize the counter + gslCounter()->getAsPerformanceQueryObject()->setCounterState( + info()->blockIndex_, info()->counterIndex_, info()->eventIndex_); - return true; + return true; } -uint64_t -PerfCounter::getInfo(uint64_t infoType) const -{ - switch (infoType) { +uint64_t PerfCounter::getInfo(uint64_t infoType) const { + switch (infoType) { case CL_PERFCOUNTER_GPU_BLOCK_INDEX: { - // Return the GPU block index - return info()->blockIndex_; + // Return the GPU block index + return info()->blockIndex_; } case CL_PERFCOUNTER_GPU_COUNTER_INDEX: { - // Return the GPU counter index - return info()->counterIndex_; + // Return the GPU counter index + return info()->counterIndex_; } case CL_PERFCOUNTER_GPU_EVENT_INDEX: { - // Return the GPU event index - return info()->eventIndex_; + // Return the GPU event index + return info()->eventIndex_; } case CL_PERFCOUNTER_DATA: { - gslCounter()->GetResult(gpu().cs(), reinterpret_cast(calRef_->results())); - return calRef_->results()[index_]; + gslCounter()->GetResult(gpu().cs(), reinterpret_cast(calRef_->results())); + return calRef_->results()[index_]; } default: - 
LogError("Wrong PerfCounter::getInfo parameter"); - } - return 0; + LogError("Wrong PerfCounter::getInfo parameter"); + } + return 0; } -} // namespace gpu +} // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpucounters.hpp b/rocclr/runtime/device/gpu/gpucounters.hpp index df4a06ce55..f4421b927c 100644 --- a/rocclr/runtime/device/gpu/gpucounters.hpp +++ b/rocclr/runtime/device/gpu/gpucounters.hpp @@ -12,129 +12,112 @@ namespace gpu { class VirtualGPU; -class CalCounterReference : public amd::ReferenceCountedObject -{ -public: - //! Default constructor - CalCounterReference( - VirtualGPU& gpu, //!< Virtual GPU device object - gslQueryObject gslCounter) - : gpu_(gpu) - , counter_(gslCounter) - , results_(NULL) {} +class CalCounterReference : public amd::ReferenceCountedObject { + public: + //! Default constructor + CalCounterReference(VirtualGPU& gpu, //!< Virtual GPU device object + gslQueryObject gslCounter) + : gpu_(gpu), counter_(gslCounter), results_(NULL) {} - //! Get CAL counter - gslQueryObject gslCounter() const { return counter_; } + //! Get CAL counter + gslQueryObject gslCounter() const { return counter_; } - //! Returns the virtual GPU device - const VirtualGPU& gpu() const { return gpu_; } + //! Returns the virtual GPU device + const VirtualGPU& gpu() const { return gpu_; } - //! Increases the results array for this CAL counter(container) - bool growResultArray( - uint maxIndex //!< the maximum HW counter index in the CAL counter - ); + //! Increases the results array for this CAL counter(container) + bool growResultArray(uint maxIndex //!< the maximum HW counter index in the CAL counter + ); - //! Returns the CAL counter results - uint64_t* results() const { return results_; } + //! Returns the CAL counter results + uint64_t* results() const { return results_; } -protected: - //! Default destructor - ~CalCounterReference(); + protected: + //! Default destructor + ~CalCounterReference(); -private: - //! 
Disable copy constructor - CalCounterReference(const CalCounterReference&); + private: + //! Disable copy constructor + CalCounterReference(const CalCounterReference&); - //! Disable operator= - CalCounterReference& operator=(const CalCounterReference&); + //! Disable operator= + CalCounterReference& operator=(const CalCounterReference&); - VirtualGPU& gpu_; //!< The virtual GPU device object - gslQueryObject counter_; //!< GSL object counter - uint64_t* results_; //!< CAL counter results + VirtualGPU& gpu_; //!< The virtual GPU device object + gslQueryObject counter_; //!< GSL object counter + uint64_t* results_; //!< CAL counter results }; //! Performance counter implementation on GPU -class PerfCounter : public device::PerfCounter -{ -public: - //! The performance counter info - struct Info : public amd::EmbeddedObject - { - uint blockIndex_; //!< Index of the block to configure - uint counterIndex_; //!< Index of the hardware counter - uint eventIndex_; //!< Event you wish to count with the counter - }; +class PerfCounter : public device::PerfCounter { + public: + //! The performance counter info + struct Info : public amd::EmbeddedObject { + uint blockIndex_; //!< Index of the block to configure + uint counterIndex_; //!< Index of the hardware counter + uint eventIndex_; //!< Event you wish to count with the counter + }; - //! The PerfCounter flags - enum Flags - { - BeginIssued = 0x00000001, - EndIssued = 0x00000002, - ResultReady = 0x00000004 - }; + //! The PerfCounter flags + enum Flags { BeginIssued = 0x00000001, EndIssued = 0x00000002, ResultReady = 0x00000004 }; - //! 
Constructor for the GPU PerfCounter object - PerfCounter( - const Device& device, //!< A GPU device object - const VirtualGPU& gpu, //!< Virtual GPU device object - cl_uint blockIndex, //!< HW block index - cl_uint counterIndex, //!< Counter index within the block - cl_uint eventIndex) //!< Event index for profiling - : gpuDevice_(device) - , gpu_(gpu) - , calRef_(NULL) - , flags_(0) - , counter_(0) - , index_(0) - { - info_.blockIndex_ = blockIndex; - info_.counterIndex_ = counterIndex; - info_.eventIndex_ = eventIndex; - } + //! Constructor for the GPU PerfCounter object + PerfCounter(const Device& device, //!< A GPU device object + const VirtualGPU& gpu, //!< Virtual GPU device object + cl_uint blockIndex, //!< HW block index + cl_uint counterIndex, //!< Counter index within the block + cl_uint eventIndex) //!< Event index for profiling + : gpuDevice_(device), + gpu_(gpu), + calRef_(NULL), + flags_(0), + counter_(0), + index_(0) { + info_.blockIndex_ = blockIndex; + info_.counterIndex_ = counterIndex; + info_.eventIndex_ = eventIndex; + } - //! Destructor for the GPU PerfCounter object - virtual ~PerfCounter(); + //! Destructor for the GPU PerfCounter object + virtual ~PerfCounter(); - //! Creates the current object - bool create( - CalCounterReference* calRef //!< Reference counter - ); + //! Creates the current object + bool create(CalCounterReference* calRef //!< Reference counter + ); - //! Returns the specific information about the counter - uint64_t getInfo( - uint64_t infoType //!< The type of returned information - ) const; + //! Returns the specific information about the counter + uint64_t getInfo(uint64_t infoType //!< The type of returned information + ) const; - //! Returns the GPU device, associated with the current object - const Device& dev() const { return gpuDevice_; } + //! Returns the GPU device, associated with the current object + const Device& dev() const { return gpuDevice_; } - //! 
Returns the virtual GPU device - const VirtualGPU& gpu() const { return gpu_; } + //! Returns the virtual GPU device + const VirtualGPU& gpu() const { return gpu_; } - //! Returns the CAL performance counter descriptor - const Info* info() const { return &info_; } + //! Returns the CAL performance counter descriptor + const Info* info() const { return &info_; } - //! Returns the Info structure for performance counter - gslQueryObject gslCounter() const { return counter_; } + //! Returns the Info structure for performance counter + gslQueryObject gslCounter() const { return counter_; } -private: - //! Disable default copy constructor - PerfCounter(const PerfCounter&); + private: + //! Disable default copy constructor + PerfCounter(const PerfCounter&); - //! Disable default operator= - PerfCounter& operator=(const PerfCounter&); + //! Disable default operator= + PerfCounter& operator=(const PerfCounter&); - const Device& gpuDevice_; //!< The backend device - const VirtualGPU& gpu_; //!< The virtual GPU device object + const Device& gpuDevice_; //!< The backend device + const VirtualGPU& gpu_; //!< The virtual GPU device object - CalCounterReference* calRef_; //!< Reference counter - uint flags_; //!< The perfcounter object state - Info info_; //!< The info structure for perfcounter - gslQueryObject counter_; //!< GSL counter object - uint index_; //!< Counter index in the CAL container + CalCounterReference* calRef_; //!< Reference counter + uint flags_; //!< The perfcounter object state + Info info_; //!< The info structure for perfcounter + gslQueryObject counter_; //!< GSL counter object + uint index_; //!< Counter index in the CAL container }; -} // namespace gpu - -#endif // GPUCOUNTERS_HPP_ +} // namespace gpu +#endif // GPUCOUNTERS_HPP_ diff --git a/rocclr/runtime/device/gpu/gpudebugger.hpp b/rocclr/runtime/device/gpu/gpudebugger.hpp index 36eab4ad28..c2c7d2e5b9 100644 --- a/rocclr/runtime/device/gpu/gpudebugger.hpp +++ 
b/rocclr/runtime/device/gpu/gpudebugger.hpp @@ -34,82 +34,77 @@ namespace gpu { * * This structure contains the packet information for kernel dispatch */ -struct PacketAmdInfo -{ - uint32_t trapReservedVgprIndex_; //!< reserved VGPR index, -1 when they are not valid - uint32_t scratchBufferWaveOffset_; //!< scratch buffer wave offset, -1 when no scratch buffer - void* pointerToIsaBuffer_; //!< pointer to the buffer containing ISA - size_t sizeOfIsaBuffer_; //!< size of the ISA buffer - uint32_t numberOfVgprs_; //!< number of VGPRs used by the kernel - uint32_t numberOfSgprs_; //!< number of SGPRs used by the kernel - size_t sizeOfStaticGroupMemory_; //!< Static local memory used by the kernel +struct PacketAmdInfo { + uint32_t trapReservedVgprIndex_; //!< reserved VGPR index, -1 when they are not valid + uint32_t scratchBufferWaveOffset_; //!< scratch buffer wave offset, -1 when no scratch buffer + void* pointerToIsaBuffer_; //!< pointer to the buffer containing ISA + size_t sizeOfIsaBuffer_; //!< size of the ISA buffer + uint32_t numberOfVgprs_; //!< number of VGPRs used by the kernel + uint32_t numberOfSgprs_; //!< number of SGPRs used by the kernel + size_t sizeOfStaticGroupMemory_; //!< Static local memory used by the kernel }; /*! 
\brief Cache mask for invalidation */ -struct HwDbgGpuCacheMask -{ - HwDbgGpuCacheMask() :ui32All_(0) {} +struct HwDbgGpuCacheMask { + HwDbgGpuCacheMask() : ui32All_(0) {} - HwDbgGpuCacheMask(uint32_t mask) :ui32All_(mask) {} + HwDbgGpuCacheMask(uint32_t mask) : ui32All_(mask) {} - union { - struct { - uint32_t sqICache_ : 1; //!< Instruction cache - uint32_t sqKCache_ : 1; //!< Data cache - uint32_t tcL1_ : 1; //!< tcL1 cache - uint32_t tcL2_ : 1; //!< tcL2 cache - uint32_t reserved_ : 28; - }; - uint32_t ui32All_; + union { + struct { + uint32_t sqICache_ : 1; //!< Instruction cache + uint32_t sqKCache_ : 1; //!< Data cache + uint32_t tcL1_ : 1; //!< tcL1 cache + uint32_t tcL2_ : 1; //!< tcL2 cache + uint32_t reserved_ : 28; }; + uint32_t ui32All_; + }; }; /*! \brief Address watch information * * Information about each watch point - address, mask, mode and event */ -struct HwDbgAddressWatch -{ - void* watchAddress_; //! The address of watch point - uint64_t watchMask_; //! The mask for watch point (lower 24 bits) - cl_dbg_address_watch_mode_amd watchMode_; //! The watch mode for this watch - DebugEvent event_; //! Event of the watch point (not used for now) +struct HwDbgAddressWatch { + void* watchAddress_; //! The address of watch point + uint64_t watchMask_; //! The mask for watch point (lower 24 bits) + cl_dbg_address_watch_mode_amd watchMode_; //! The watch mode for this watch + DebugEvent event_; //! Event of the watch point (not used for now) }; /*! \brief Runtime structure used to communicate debug information * between Ocl services and core for a kernel dispatch. */ -struct DebugToolInfo -{ - uint64_t scratchAddress_; //! Scratch memory address - size_t scratchSize_; //! Scratch memory size - uint64_t globalAddress_; //! Global memory address - uint32_t cacheDisableMask_; //! Cache mask, indicating caches disabled - uint32_t exceptionMask_; //! Exception mask - uint32_t reservedCuNum_; //! Number of reserved CUs for display, - //! 
which ranges from 0 to 7 in the current implementation. - bool monitorMode_; //! Debug or profiler mode - bool gpuSingleStepMode_; //! SQ debug mode - amd::Memory* trapHandler_; //! Trap handler address - amd::Memory* trapBuffer_; //! Trap buffer address - bool sqPerfcounterEnable_; //! whether SQ perf counters are enabled - aclBinary* aclBinary_; //! pointer of the kernel ACL binary - amd::Event* event_; //! pointer of the kernel event in the enqueue command +struct DebugToolInfo { + uint64_t scratchAddress_; //! Scratch memory address + size_t scratchSize_; //! Scratch memory size + uint64_t globalAddress_; //! Global memory address + uint32_t cacheDisableMask_; //! Cache mask, indicating caches disabled + uint32_t exceptionMask_; //! Exception mask + uint32_t reservedCuNum_; //! Number of reserved CUs for display, + //! which ranges from 0 to 7 in the current implementation. + bool monitorMode_; //! Debug or profiler mode + bool gpuSingleStepMode_; //! SQ debug mode + amd::Memory* trapHandler_; //! Trap handler address + amd::Memory* trapBuffer_; //! Trap buffer address + bool sqPerfcounterEnable_; //! whether SQ perf counters are enabled + aclBinary* aclBinary_; //! pointer of the kernel ACL binary + amd::Event* event_; //! pointer of the kernel event in the enqueue command }; /*! \brief Message used by the KFD wave control for CI * * Structure indicates the various information used by the wave control function. */ -struct HwDebugWaveAddr -{ - uint32_t VMID_ : 4; //! Virtual memory id - uint32_t wave_ : 4; //! Wave id - uint32_t SIMD_ : 2; //! SIMD id - uint32_t CU_ : 4; //! Compute unit - uint32_t SH_ : 1; //! Shader array - uint32_t SE_ : 1; //! Shader engine +struct HwDebugWaveAddr { + uint32_t VMID_ : 4; //! Virtual memory id + uint32_t wave_ : 4; //! Wave id + uint32_t SIMD_ : 2; //! SIMD id + uint32_t CU_ : 4; //! Compute unit + uint32_t SH_ : 1; //! Shader array + uint32_t SE_ : 1; //! Shader engine }; /*! 
\brief Kernel code information @@ -117,10 +112,9 @@ struct HwDebugWaveAddr * This structure contains the pointer of mapped kernel code for host access * and its size (in bytes) */ -struct AqlCodeInfo -{ - amd_kernel_code_t * aqlCode_; //! pointer of AQL code to allow host access - uint32_t aqlCodeSize_; //! size of AQL code +struct AqlCodeInfo { + amd_kernel_code_t* aqlCode_; //! pointer of AQL code to allow host access + uint32_t aqlCodeSize_; //! size of AQL code }; /**@}*/ diff --git a/rocclr/runtime/device/gpu/gpudebugmanager.cpp b/rocclr/runtime/device/gpu/gpudebugmanager.cpp index 95b74769b7..6ef53f4e16 100644 --- a/rocclr/runtime/device/gpu/gpudebugmanager.cpp +++ b/rocclr/runtime/device/gpu/gpudebugmanager.cpp @@ -33,376 +33,313 @@ class Memory; */ GpuDebugManager::GpuDebugManager(amd::Device* device) - : HwDebugManager(device) - , vGpu_(NULL) - , debugMessages_(0) - , addressWatch_(NULL) - , addressWatchSize_(0) - , oclEventHandle_(NULL) -{ - // Initialize the exception info and the kernel execution mode - excpPolicy_.exceptionMask = 0x0; - excpPolicy_.waveAction = CL_DBG_WAVES_RESUME; - excpPolicy_.hostAction = CL_DBG_HOST_IGNORE; - excpPolicy_.waveMode = CL_DBG_WAVEMODE_BROADCAST; + : HwDebugManager(device), + vGpu_(NULL), + debugMessages_(0), + addressWatch_(NULL), + addressWatchSize_(0), + oclEventHandle_(NULL) { + // Initialize the exception info and the kernel execution mode + excpPolicy_.exceptionMask = 0x0; + excpPolicy_.waveAction = CL_DBG_WAVES_RESUME; + excpPolicy_.hostAction = CL_DBG_HOST_IGNORE; + excpPolicy_.waveMode = CL_DBG_WAVEMODE_BROADCAST; - execMode_.ui32All = 0; + execMode_.ui32All = 0; - rtTrapHandlerInfo_.trap_.trapHandler_ = NULL; - rtTrapHandlerInfo_.trap_.trapBuffer_ = NULL; + rtTrapHandlerInfo_.trap_.trapHandler_ = NULL; + rtTrapHandlerInfo_.trap_.trapBuffer_ = NULL; - aqlPacket_ = (hsa_kernel_dispatch_packet_t *) NULL; + aqlPacket_ = (hsa_kernel_dispatch_packet_t*)NULL; - return; + return; } -GpuDebugManager::~GpuDebugManager() 
-{ - if (NULL != addressWatch_) { - delete [] addressWatch_; - } +GpuDebugManager::~GpuDebugManager() { + if (NULL != addressWatch_) { + delete[] addressWatch_; + } } -void -GpuDebugManager::executePreDispatchCallBack(void* aqlPacket, - void* toolInfo) -{ - DebugToolInfo* info = reinterpret_cast(toolInfo); +void GpuDebugManager::executePreDispatchCallBack(void* aqlPacket, void* toolInfo) { + DebugToolInfo* info = reinterpret_cast(toolInfo); - aqlPacket_ = reinterpret_cast(aqlPacket); + aqlPacket_ = reinterpret_cast(aqlPacket); - // Only if the pre-dispatch callback is set, will we update cache - // flush configuration and build the memory descriptor. - if (NULL != preDispatchCallBackFunc_) { - // Build the scratch memory descriptor - device()->gslCtx()->BuildScratchBufferResource(debugInfo_.scratchMemoryDescriptor_, - info->scratchAddress_, - info->scratchSize_); + // Only if the pre-dispatch callback is set, will we update cache + // flush configuration and build the memory descriptor. 
+ if (NULL != preDispatchCallBackFunc_) { + // Build the scratch memory descriptor + device()->gslCtx()->BuildScratchBufferResource(debugInfo_.scratchMemoryDescriptor_, + info->scratchAddress_, info->scratchSize_); - // Build the global memory descriptor - device()->gslCtx()->BuildHeapBufferResource(debugInfo_.globalMemoryDescriptor_, - info->globalAddress_); + // Build the global memory descriptor + device()->gslCtx()->BuildHeapBufferResource(debugInfo_.globalMemoryDescriptor_, + info->globalAddress_); -// // for invalidate cache (BuildEndOfKernelNotifyCommands) -// aqlPacket->release_fence_scope = 2; + // // for invalidate cache (BuildEndOfKernelNotifyCommands) + // aqlPacket->release_fence_scope = 2; - aclBinary_ = reinterpret_cast(info->aclBinary_); - oclEventHandle_ = reinterpret_cast(as_cl(info->event_)); + aclBinary_ = reinterpret_cast(info->aclBinary_); + oclEventHandle_ = reinterpret_cast(as_cl(info->event_)); - cl_device_id clDeviceId = as_cl(device_); - preDispatchCallBackFunc_(clDeviceId, - oclEventHandle_, - aqlPacket_, - aclBinary_, - preDispatchCallBackArgs_); - } + cl_device_id clDeviceId = as_cl(device_); + preDispatchCallBackFunc_(clDeviceId, oclEventHandle_, aqlPacket_, aclBinary_, + preDispatchCallBackArgs_); + } - // setup the trap handler information only if the debugger has been registered - if (isRegistered()) { - // Copy the various info set by the debugger/profiler to the tool info structure - setupTrapInformation(info); - } + // setup the trap handler information only if the debugger has been registered + if (isRegistered()) { + // Copy the various info set by the debugger/profiler to the tool info structure + setupTrapInformation(info); + } } -void -GpuDebugManager::executePostDispatchCallBack() -{ - if (NULL != postDispatchCallBackFunc_) { - cl_device_id clDeviceId = as_cl(device_); - postDispatchCallBackFunc_(clDeviceId, - aqlPacket_->completion_signal.handle, - postDispatchCallBackArgs_); - } +void 
GpuDebugManager::executePostDispatchCallBack() { + if (NULL != postDispatchCallBackFunc_) { + cl_device_id clDeviceId = as_cl(device_); + postDispatchCallBackFunc_(clDeviceId, aqlPacket_->completion_signal.handle, + postDispatchCallBackArgs_); + } } //! Map the kernel code for host access -void -GpuDebugManager::mapKernelCode(void* aqlCodeInfo) const -{ - AqlCodeInfo* codeInfo = reinterpret_cast(aqlCodeInfo); +void GpuDebugManager::mapKernelCode(void* aqlCodeInfo) const { + AqlCodeInfo* codeInfo = reinterpret_cast(aqlCodeInfo); - codeInfo->aqlCode_ = reinterpret_cast(aqlCodeAddr_); - codeInfo->aqlCodeSize_ = aqlCodeSize_; + codeInfo->aqlCode_ = reinterpret_cast(aqlCodeAddr_); + codeInfo->aqlCodeSize_ = aqlCodeSize_; } -cl_int -GpuDebugManager::registerDebugger(amd::Context* context, uintptr_t messageStorage) +cl_int GpuDebugManager::registerDebugger(amd::Context* context, uintptr_t messageStorage) { + if (!device()->settings().enableHwDebug_) { + LogError("debugmanager: Register debugger error - HW DEBUG is not enable"); + return CL_DEBUGGER_REGISTER_FAILURE_AMD; + } + + // first time register - set the message storage, flush queue and enable hw debug + if (!isRegistered()) { + debugMessages_ = messageStorage; + if (!device()->gslCtx()->registerHwDebugger(debugMessages_)) { + LogError("debugmanager: Register debugger failed"); + return CL_OUT_OF_RESOURCES; + } + + isRegistered_ = true; + + if (CL_SUCCESS != createRuntimeTrapHandler()) { + LogError("debugmanager: Create runtime trap handler failed"); + return CL_OUT_OF_RESOURCES; + } + } + + context_ = context; + + return CL_SUCCESS; +} + +void GpuDebugManager::unregisterDebugger() { + if (isRegistered()) { + // reset the debugger registration flag + isRegistered_ = false; + context_ = NULL; + } +} + +void GpuDebugManager::flushCache(uint32_t mask) { + HwDbgGpuCacheMask cacheMask(mask); + device()->xferQueue()->flushCuCaches(cacheMask); +} + + +void GpuDebugManager::setupTrapInformation(DebugToolInfo* toolInfo) { + 
toolInfo->scratchAddress_ = 0; + toolInfo->scratchSize_ = 0; + toolInfo->globalAddress_ = 0; + toolInfo->sqPerfcounterEnable_ = false; + + // Set up trap related info in the kernel info structure to be + // used in the kernel dispatch. + toolInfo->exceptionMask_ = excpPolicy_.exceptionMask; + toolInfo->gpuSingleStepMode_ = execMode_.gpuSingleStepMode; + toolInfo->monitorMode_ = execMode_.monitorMode; + + // The order of these three bits is determined by the definition + // of the register COMPUTE_DISPATCH_INITIATOR + toolInfo->cacheDisableMask_ = ((execMode_.disableL1Scalar << 2) | + (execMode_.disableL2Cache << 1) | (execMode_.disableL1Vector)); + + toolInfo->reservedCuNum_ = execMode_.reservedCuNum; + + toolInfo->trapHandler_ = rtTrapInfo_[kDebugTrapHandlerLocation]; + toolInfo->trapBuffer_ = rtTrapInfo_[kDebugTrapBufferLocation]; +} + +void GpuDebugManager::getPacketAmdInfo(const void* aqlCodeInfo, void* packetInfo) const + { - if (!device()->settings().enableHwDebug_) { - LogError("debugmanager: Register debugger error - HW DEBUG is not enable"); - return CL_DEBUGGER_REGISTER_FAILURE_AMD; + const AqlCodeInfo* codeInfo = reinterpret_cast(aqlCodeInfo); + + const amd_kernel_code_t* hostAqlCode = codeInfo->aqlCode_; + + PacketAmdInfo* packet = reinterpret_cast(packetInfo); + + const amd_kernel_code_t* akc = hostAqlCode; + + packet->numberOfSgprs_ = akc->wavefront_sgpr_count; + packet->numberOfVgprs_ = akc->workitem_vgpr_count; + + // use mapped kernel_object_address for host accessing of ISA buffer + packet->pointerToIsaBuffer_ = (char*)(hostAqlCode) + akc->kernel_code_entry_byte_offset; + + packet->scratchBufferWaveOffset_ = akc->debug_wavefront_private_segment_offset_sgpr; + + packet->sizeOfIsaBuffer_ = codeInfo->aqlCodeSize_; + + packet->sizeOfStaticGroupMemory_ = akc->workgroup_group_segment_byte_size; + + // The trap_reserved_vgpr_index will be 4 less the original + // This value must be used only by the debugger + packet->trapReservedVgprIndex_ = 
akc->workitem_vgpr_count - NumberReserveVgprs; +} + +DebugEvent GpuDebugManager::createDebugEvent(const bool autoReset) { + // create the event object + osEventHandle shaderEvent = osEventCreate(!autoReset); + + // event object has been created, set the initial state + if (shaderEvent != 0) { + osEventReset(shaderEvent); // initial state is non-signaled + + if (device()->gslCtx()->exceptionNotification(shaderEvent)) { + return shaderEvent; } + } - // first time register - set the message storage, flush queue and enable hw debug - if (!isRegistered()) { - debugMessages_ = messageStorage; - if (!device()->gslCtx()->registerHwDebugger(debugMessages_)) { - LogError("debugmanager: Register debugger failed"); - return CL_OUT_OF_RESOURCES; - } - - isRegistered_ = true; - - if (CL_SUCCESS != createRuntimeTrapHandler()) { - LogError("debugmanager: Create runtime trap handler failed"); - return CL_OUT_OF_RESOURCES; - } - } - - context_ = context; + return 0; +} +cl_int GpuDebugManager::waitDebugEvent(DebugEvent pEvent, uint32_t timeOut) const { + if (osEventTimedWait(pEvent, timeOut)) { return CL_SUCCESS; + } else { + return CL_EVENT_TIMEOUT_AMD; + } } -void -GpuDebugManager::unregisterDebugger() -{ - if (isRegistered()) { - // reset the debugger registration flag - isRegistered_ = false; - context_ = NULL; - } +void GpuDebugManager::destroyDebugEvent(DebugEvent* pEvent) { + osEventDestroy(*pEvent); + *pEvent = 0; + + device()->gslCtx()->exceptionNotification(0); } -void -GpuDebugManager::flushCache(uint32_t mask) -{ - HwDbgGpuCacheMask cacheMask(mask); - device()->xferQueue()->flushCuCaches(cacheMask); +void GpuDebugManager::wavefrontControl(uint32_t waveAction, uint32_t waveMode, uint32_t trapId, + void* waveAddr) const { + device()->gslCtx()->executeSqCommand(waveAction, waveMode, trapId, waveAddr); } +void GpuDebugManager::setAddressWatch(uint32_t numWatchPoints, void** watchAddress, + uint64_t* watchMask, uint64_t* watchMode, DebugEvent* event) { + size_t requiredSize = 
numWatchPoints * sizeof(HwDbgAddressWatch); -void -GpuDebugManager::setupTrapInformation(DebugToolInfo* toolInfo) -{ - toolInfo->scratchAddress_ = 0; - toolInfo->scratchSize_ = 0; - toolInfo->globalAddress_ = 0; - toolInfo->sqPerfcounterEnable_ = false; + // previously allocated size is not big enough, allocate new memory + if (addressWatchSize_ < requiredSize) { + if (NULL != addressWatch_) { // free the smaller address watch storage + delete[] addressWatch_; + } + addressWatch_ = new HwDbgAddressWatch[numWatchPoints]; + addressWatchSize_ = requiredSize; + } - // Set up trap related info in the kernel info structure to be - // used in the kernel dispatch. - toolInfo->exceptionMask_ = excpPolicy_.exceptionMask; - toolInfo->gpuSingleStepMode_ = execMode_.gpuSingleStepMode; - toolInfo->monitorMode_ = execMode_.monitorMode; + // fill in the address watch structure + memset(addressWatch_, 0, addressWatchSize_); - // The order of these three bits is determined by the definition - // of the register COMPUTE_DISPATCH_INITIATOR - toolInfo->cacheDisableMask_ = ((execMode_.disableL1Scalar << 2) - | (execMode_.disableL2Cache << 1) - | (execMode_.disableL1Vector)); + for (uint32_t i = 0; i < numWatchPoints; i++) { + amd::Memory* watchMem = as_amd(reinterpret_cast(watchAddress[i])); + Memory* watchMemAddress = device()->getGpuMemory(watchMem); - toolInfo->reservedCuNum_ = execMode_.reservedCuNum; + addressWatch_[i].watchAddress_ = reinterpret_cast(watchMemAddress->vmAddress()); + addressWatch_[i].watchMask_ = watchMask[i]; + addressWatch_[i].watchMode_ = (cl_dbg_address_watch_mode_amd)watchMode[i]; + addressWatch_[i].event_ = (0 != event) ? 
event[i] : 0; + } - toolInfo->trapHandler_ = rtTrapInfo_[kDebugTrapHandlerLocation]; - toolInfo->trapBuffer_ = rtTrapInfo_[kDebugTrapBufferLocation]; + // setup the watch addresses + device()->gslCtx()->setAddressWatch(numWatchPoints, (void*)addressWatch_); } -void -GpuDebugManager::getPacketAmdInfo( - const void* aqlCodeInfo, - void* packetInfo) const +void GpuDebugManager::setGlobalMemory(amd::Memory* memObj, uint32_t offset, void* srcPtr, + uint32_t size) { + gpu::Memory* globalMem = device()->getGpuMemory(memObj); -{ - const AqlCodeInfo* codeInfo = - reinterpret_cast(aqlCodeInfo); + address mappedMem = static_cast
(globalMem->map(NULL, 0)); + assert(mappedMem != 0); - const amd_kernel_code_t* hostAqlCode = codeInfo->aqlCode_; + void* dest_ptr = reinterpret_cast(mappedMem + offset); + memcpy(dest_ptr, srcPtr, size); - PacketAmdInfo* packet = - reinterpret_cast(packetInfo); - - const amd_kernel_code_t* akc = hostAqlCode; - - packet->numberOfSgprs_ = akc->wavefront_sgpr_count; - packet->numberOfVgprs_ = akc->workitem_vgpr_count; - - // use mapped kernel_object_address for host accessing of ISA buffer - packet->pointerToIsaBuffer_ = (char*) (hostAqlCode) + - akc->kernel_code_entry_byte_offset; - - packet->scratchBufferWaveOffset_ = - akc->debug_wavefront_private_segment_offset_sgpr; - - packet->sizeOfIsaBuffer_ = codeInfo->aqlCodeSize_; - - packet->sizeOfStaticGroupMemory_ = akc->workgroup_group_segment_byte_size; - - // The trap_reserved_vgpr_index will be 4 less the original - // This value must be used only by the debugger - packet->trapReservedVgprIndex_ = akc->workitem_vgpr_count - NumberReserveVgprs; + globalMem->unmap(NULL); } -DebugEvent -GpuDebugManager::createDebugEvent( - const bool autoReset) -{ - // create the event object - osEventHandle shaderEvent = osEventCreate(!autoReset); +cl_int GpuDebugManager::createRuntimeTrapHandler() { + size_t codeSize = 0; + const uint32_t* rtTrapCode = NULL; - // event object has been created, set the initial state - if (shaderEvent != 0) { + if (device()->settings().viPlus_) { + codeSize = sizeof(RuntimeTrapCodeVi); + rtTrapCode = RuntimeTrapCodeVi; + } else { + codeSize = sizeof(RuntimeTrapCode); + rtTrapCode = RuntimeTrapCode; + } - osEventReset(shaderEvent); // initial state is non-signaled + uint32_t numCodes = codeSize / sizeof(uint32_t); - if (device()->gslCtx()->exceptionNotification(shaderEvent)) { - return shaderEvent; - } - } + // Handle TMA corruption hw bug workaround - + // The trap handler buffer has extra 256 bytes allocated, the TMA address + // is stored in the first two DWORDs and the actual trap handler code + // 
is stored starting at the location of 256 bytes (TbaStartOffset). + // + // allocate memory for the runtime trap handler (TBA) + TMA address + uint32_t allocSize = codeSize + TbaStartOffset; - return 0; -} + Memory* rtTBA = new Memory(*device(), allocSize); + runtimeTBA_ = rtTBA; -cl_int -GpuDebugManager::waitDebugEvent( - DebugEvent pEvent, - uint32_t timeOut) const -{ - if (osEventTimedWait(pEvent, timeOut)) { - return CL_SUCCESS; - } - else { - return CL_EVENT_TIMEOUT_AMD; - } -} + if ((rtTBA == NULL) || !rtTBA->create(Resource::RemoteUSWC)) { + return CL_OUT_OF_RESOURCES; + } + address tbaAddress = reinterpret_cast
(rtTBA->map(NULL)); -void -GpuDebugManager::destroyDebugEvent(DebugEvent* pEvent) -{ - osEventDestroy(*pEvent); - *pEvent = 0; + // allocate buffer for the runtime trap handler buffer (TMA) + uint32_t tmaSize = 0x100; + Memory* rtTMA = new Memory(*device(), tmaSize); + runtimeTMA_ = rtTMA; - device()->gslCtx()->exceptionNotification(0); + if ((rtTMA == NULL) || !rtTMA->create(Resource::RemoteUSWC)) { + return CL_OUT_OF_RESOURCES; + } -} + uint64_t rtTmaAddress = rtTMA->vmAddress(); + if ((rtTBA->vmAddress() & 0xFF) != 0 || (rtTmaAddress & 0xFF) != 0) { + LogError("debugmanager: Trap handler/buffer is not 256-byte aligned"); + return CL_INVALID_VALUE; + } -void -GpuDebugManager::wavefrontControl( - uint32_t waveAction, - uint32_t waveMode, - uint32_t trapId, - void* waveAddr) const -{ - device()->gslCtx()->executeSqCommand(waveAction, waveMode, trapId, waveAddr); -} + // store the TMA address at the beginning of trap handler buffer + uint64_t* tbaStorage = reinterpret_cast(tbaAddress); + tbaStorage[0] = rtTmaAddress; -void -GpuDebugManager::setAddressWatch( - uint32_t numWatchPoints, - void** watchAddress, - uint64_t* watchMask, - uint64_t* watchMode, - DebugEvent* event) -{ - size_t requiredSize = numWatchPoints * sizeof(HwDbgAddressWatch); + // save the trap handler code + uint32_t* trapHandlerPtr = (uint32_t*)(tbaAddress + TbaStartOffset); + for (uint32_t i = 0; i < numCodes; i++) { + trapHandlerPtr[i] = rtTrapCode[i]; + } - // previously allocated size is not big enough, allocate new memory - if (addressWatchSize_ < requiredSize) { - if (NULL != addressWatch_) { // free the smaller address watch storage - delete [] addressWatch_; - } - addressWatch_ = new HwDbgAddressWatch[numWatchPoints]; - addressWatchSize_ = requiredSize; - } + rtTBA->unmap(NULL); - // fill in the address watch structure - memset(addressWatch_, 0, addressWatchSize_); - - for (uint32_t i = 0; i < numWatchPoints; i++) - { - amd::Memory* watchMem = as_amd(reinterpret_cast(watchAddress[i])); - 
Memory* watchMemAddress = device()->getGpuMemory(watchMem); - - addressWatch_[i].watchAddress_ = reinterpret_cast(watchMemAddress->vmAddress()); - addressWatch_[i].watchMask_ = watchMask[i]; - addressWatch_[i].watchMode_ = (cl_dbg_address_watch_mode_amd) watchMode[i]; - addressWatch_[i].event_ = (0 != event) ? event[i] : 0; - } - - // setup the watch addresses - device()->gslCtx()->setAddressWatch(numWatchPoints, (void*) addressWatch_); - -} - -void -GpuDebugManager::setGlobalMemory( - amd::Memory* memObj, - uint32_t offset, - void* srcPtr, - uint32_t size) -{ - gpu::Memory* globalMem = device()->getGpuMemory(memObj); - - address mappedMem = static_cast
(globalMem->map(NULL,0)); - assert(mappedMem != 0); - - void* dest_ptr = reinterpret_cast(mappedMem + offset); - memcpy(dest_ptr, srcPtr, size); - - globalMem->unmap(NULL); -} - -cl_int -GpuDebugManager::createRuntimeTrapHandler() -{ - size_t codeSize = 0; - const uint32_t* rtTrapCode = NULL; - - if (device()->settings().viPlus_) { - codeSize = sizeof(RuntimeTrapCodeVi); - rtTrapCode = RuntimeTrapCodeVi; - } - else { - codeSize = sizeof(RuntimeTrapCode); - rtTrapCode = RuntimeTrapCode; - } - - uint32_t numCodes = codeSize / sizeof(uint32_t); - - // Handle TMA corruption hw bug workaround - - // The trap handler buffer has extra 256 bytes allocated, the TMA address - // is stored in the first two DWORDs and the actual trap handler code - // is stored starting at the location of 256 bytes (TbaStartOffset). - // - // allocate memory for the runtime trap handler (TBA) + TMA address - uint32_t allocSize = codeSize + TbaStartOffset; - - Memory* rtTBA = new Memory(*device(), allocSize); - runtimeTBA_ = rtTBA; - - if ((rtTBA == NULL) || !rtTBA->create(Resource::RemoteUSWC)) { - return CL_OUT_OF_RESOURCES; - } - address tbaAddress = reinterpret_cast
(rtTBA->map(NULL)); - - // allocate buffer for the runtime trap handler buffer (TMA) - uint32_t tmaSize = 0x100; - Memory* rtTMA = new Memory(*device(), tmaSize); - runtimeTMA_ = rtTMA; - - if ((rtTMA == NULL) || !rtTMA->create(Resource::RemoteUSWC)) { - return CL_OUT_OF_RESOURCES; - } - - uint64_t rtTmaAddress = rtTMA->vmAddress(); - if ((rtTBA->vmAddress() & 0xFF) != 0 || (rtTmaAddress & 0xFF) != 0) { - LogError("debugmanager: Trap handler/buffer is not 256-byte aligned"); - return CL_INVALID_VALUE; - } - - // store the TMA address at the beginning of trap handler buffer - uint64_t* tbaStorage = reinterpret_cast(tbaAddress); - tbaStorage[0] = rtTmaAddress; - - // save the trap handler code - uint32_t* trapHandlerPtr = (uint32_t*)(tbaAddress + TbaStartOffset); - for (uint32_t i = 0; i < numCodes; i++) { - trapHandlerPtr[i] = rtTrapCode[i]; - } - - rtTBA->unmap(NULL); - - return CL_SUCCESS; + return CL_SUCCESS; } } // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpudebugmanager.hpp b/rocclr/runtime/device/gpu/gpudebugmanager.hpp index 1b0b12307a..d11569bc80 100644 --- a/rocclr/runtime/device/gpu/gpudebugmanager.hpp +++ b/rocclr/runtime/device/gpu/gpudebugmanager.hpp @@ -36,96 +36,85 @@ class Memory; * */ class GpuDebugManager : public amd::HwDebugManager { -public: + public: + //! Constructor of the debug manager class + GpuDebugManager(amd::Device* device); - //! Constructor of the debug manager class - GpuDebugManager(amd::Device* device); + //! Destructor of the debug manager class + ~GpuDebugManager(); - //! Destructor of the debug manager class - ~GpuDebugManager(); + //! Get the single instance of the GpuDebugManager class + static GpuDebugManager* getDefaultInstance(); - //! Get the single instance of the GpuDebugManager class - static GpuDebugManager* getDefaultInstance(); + //! Destroy the GpuDebugManager class object + static void destroyInstances(); - //! Destroy the GpuDebugManager class object - static void destroyInstances(); + //! 
Flush cache + void flushCache(uint32_t mask); - //! Flush cache - void flushCache(uint32_t mask); + //! Create the debug event + DebugEvent createDebugEvent(const bool autoReset); - //! Create the debug event - DebugEvent createDebugEvent(const bool autoReset); + //! Wait for the debug event + cl_int waitDebugEvent(DebugEvent pEvent, uint32_t timeOut) const; - //! Wait for the debug event - cl_int waitDebugEvent(DebugEvent pEvent, uint32_t timeOut) const; + //! Destroy the debug event + void destroyDebugEvent(DebugEvent* pEvent); - //! Destroy the debug event - void destroyDebugEvent(DebugEvent* pEvent); + //! Register the debugger + cl_int registerDebugger(amd::Context* context, uintptr_t messageStorage); - //! Register the debugger - cl_int registerDebugger(amd::Context*context, uintptr_t messageStorage); + //! Unregister the debugger + void unregisterDebugger(); - //! Unregister the debugger - void unregisterDebugger(); + //! Send the wavefront control cmmand + void wavefrontControl(uint32_t waveAction, uint32_t waveMode, uint32_t trapId, + void* waveAddr) const; - //! Send the wavefront control cmmand - void wavefrontControl(uint32_t waveAction, - uint32_t waveMode, - uint32_t trapId, - void* waveAddr) const; + //! Set address watching point + void setAddressWatch(uint32_t numWatchPoints, void** watchAddress, uint64_t* watchMask, + uint64_t* watchMode, DebugEvent* pEvent); - //! Set address watching point - void setAddressWatch(uint32_t numWatchPoints, - void** watchAddress, - uint64_t* watchMask, - uint64_t* watchMode, - DebugEvent* pEvent); + //! Map the kernel code for host access + void mapKernelCode(void* aqlCodeInfo) const; - //! Map the kernel code for host access - void mapKernelCode(void* aqlCodeInfo) const; + //! Get the packet information for dispatch + void getPacketAmdInfo(const void* aqlCodeInfo, void* packetInfo) const; - //! Get the packet information for dispatch - void getPacketAmdInfo(const void* aqlCodeInfo, void* packetInfo) const; + //! 
Set global memory values + void setGlobalMemory(amd::Memory* memObj, uint32_t offset, void* srcPtr, uint32_t size); - //! Set global memory values - void setGlobalMemory(amd::Memory* memObj, uint32_t offset, void* srcPtr, uint32_t size); + //! Execute the post-dispatch callback function + void executePostDispatchCallBack(); - //! Execute the post-dispatch callback function - void executePostDispatchCallBack(); + //! Execute the pre-dispatch callback function + void executePreDispatchCallBack(void* aqlPacket, void* toolInfo); - //! Execute the pre-dispatch callback function - void executePreDispatchCallBack(void* aqlPacket, - void* toolInfo); + private: + //! Setup trap handler info for kernel execution + void setupTrapInformation(DebugToolInfo* toolInfo); -private: + //! Create runtime trap handler + cl_int createRuntimeTrapHandler(); - //! Setup trap handler info for kernel execution - void setupTrapInformation(DebugToolInfo* toolInfo); + protected: + const VirtualGPU* vGpu() const { return vGpu_; } - //! Create runtime trap handler - cl_int createRuntimeTrapHandler(); + private: + const gpu::Device* device() const { return reinterpret_cast(device_); } -protected: + VirtualGPU* vGpu_; //!< the virtual GPU - const VirtualGPU* vGpu() const { return vGpu_; } + uintptr_t debugMessages_; //!< Pointer to a SHARED_DEBUG_MESSAGES pass to the KMD -private: + HwDbgAddressWatch* addressWatch_; //!< Address watch data + size_t addressWatchSize_; //!< Size of address watch data - const gpu::Device* device() const { - return reinterpret_cast(device_); } - - VirtualGPU* vGpu_; //!< the virtual GPU - - uintptr_t debugMessages_; //!< Pointer to a SHARED_DEBUG_MESSAGES pass to the KMD - - HwDbgAddressWatch* addressWatch_; //!< Address watch data - size_t addressWatchSize_; //!< Size of address watch data - - //! Arguments used by the callback function - void* oclEventHandle_; //!< event handler - const hsa_kernel_dispatch_packet_t* aqlPacket_; //!< AQL packet + //! 
Arguments used by the callback function + void* oclEventHandle_; //!< event handler + const hsa_kernel_dispatch_packet_t* aqlPacket_; //!< AQL packet }; } // namespace gpu -#endif // HWDBG_DEBUGMANAGER_H__ +#endif // HWDBG_DEBUGMANAGER_H__ diff --git a/rocclr/runtime/device/gpu/gpudefs.hpp b/rocclr/runtime/device/gpu/gpudefs.hpp index db1bb016c6..d6c08bf992 100644 --- a/rocclr/runtime/device/gpu/gpudefs.hpp +++ b/rocclr/runtime/device/gpu/gpudefs.hpp @@ -13,7 +13,7 @@ #include "GSLDevice.h" #include "GSLContext.h" -extern bool getFuncInfoFromImage(CALimage image, CALfuncInfo *pFuncInfo); +extern bool getFuncInfoFromImage(CALimage image, CALfuncInfo* pFuncInfo); /*! \addtogroup GPU * @{ @@ -26,9 +26,9 @@ namespace gpu { //! Maximum number of the supported global atomic counters const static uint MaxAtomicCounters = 8; //! Maximum number of the supported samplers -const static uint MaxSamplers = 16; +const static uint MaxSamplers = 16; //! Maximum number of supported read images -const static uint MaxReadImage = 128; +const static uint MaxReadImage = 128; //! Maximum number of supported write images const static uint MaxWriteImage = 8; //! Maximum number of supported read/write images for OCL20 @@ -64,84 +64,111 @@ const static uint HsaSamplerObjectAlignment = 16; const static uint DeviceQueueMaskSize = 32; //! 
Defines all supported ASIC families -enum AsicFamilies { - Family7xx, - Family8xx, - FamilyTotal -}; +enum AsicFamilies { Family7xx, Family8xx, FamilyTotal }; struct AMDDeviceInfo { - uint machine_; //!< Machine target ID - const char* targetName_; //!< Target name - const char* machineTarget_; //!< Machine target - uint simdPerCU_; //!< Number of SIMDs per CU - uint simdWidth_; //!< Number of workitems processed per SIMD - uint simdInstructionWidth_; //!< Number of instructions processed per SIMD - uint memChannelBankWidth_; //!< Memory channel bank width - uint localMemSizePerCU_; //!< Local memory size per CU - uint localMemBanks_; //!< Number of banks of local memory - uint gfxipVersion_; //!< The core engine GFXIP version + uint machine_; //!< Machine target ID + const char* targetName_; //!< Target name + const char* machineTarget_; //!< Machine target + uint simdPerCU_; //!< Number of SIMDs per CU + uint simdWidth_; //!< Number of workitems processed per SIMD + uint simdInstructionWidth_; //!< Number of instructions processed per SIMD + uint memChannelBankWidth_; //!< Memory channel bank width + uint localMemSizePerCU_; //!< Local memory size per CU + uint localMemBanks_; //!< Number of banks of local memory + uint gfxipVersion_; //!< The core engine GFXIP version }; static const AMDDeviceInfo DeviceInfo[] = { - // Machine targetName machineTarget -/* CAL_TARGET_600 */ { ED_ATI_CAL_MACHINE_R600_ISA, "", "", 0, 0, 0, 0, 0, 0, 0 }, -/* CAL_TARGET_610 */ { ED_ATI_CAL_MACHINE_R610_ISA, "", "", 0, 0, 0, 0, 0, 0, 0 }, -/* CAL_TARGET_630 */ { ED_ATI_CAL_MACHINE_R630_ISA, "", "", 0, 0, 0, 0, 0, 0, 0 }, -/* CAL_TARGET_670 */ { ED_ATI_CAL_MACHINE_R670_ISA, "", "", 0, 0, 0, 0, 0, 0, 0 }, -/* CAL_TARGET_7XX */ { ED_ATI_CAL_MACHINE_R770_ISA, "", "", 0, 0, 0, 0, 0, 0, 0 }, -/* CAL_TARGET_770 */ { ED_ATI_CAL_MACHINE_R770_ISA, "", "", 0, 0, 0, 0, 0, 0, 0 }, -/* CAL_TARGET_710 */ { ED_ATI_CAL_MACHINE_R710_ISA, "", "", 0, 0, 0, 0, 0, 0, 0 }, -/* CAL_TARGET_730 */ { 
ED_ATI_CAL_MACHINE_R730_ISA, "", "", 0, 0, 0, 0, 0, 0, 0 }, -/* CAL_TARGET_CYPRESS */ { ED_ATI_CAL_MACHINE_CYPRESS_ISA, "", "", 1, 16, 5, 256, 32 * Ki, 32, 400 }, -/* CAL_TARGET_JUNIPER */ { ED_ATI_CAL_MACHINE_JUNIPER_ISA, "", "", 1, 16, 5, 256, 32 * Ki, 32, 400 }, -/* CAL_TARGET_REDWOOD */ { ED_ATI_CAL_MACHINE_REDWOOD_ISA, "", "", 1, 16, 5, 256, 32 * Ki, 16, 400 }, -/* CAL_TARGET_CEDAR */ { ED_ATI_CAL_MACHINE_CEDAR_ISA, "", "", 1, 8, 5, 256, 32 * Ki, 16, 400 }, -/* CAL_TARGET_SUMO */ { ED_ATI_CAL_MACHINE_SUMO_ISA, "", "", 1, 16, 5, 256, 32 * Ki, 16, 400 }, -/* CAL_TARGET_SUPERSUMO*/ { ED_ATI_CAL_MACHINE_SUPERSUMO_ISA, "", "", 1, 16, 5, 256, 32 * Ki, 16, 400 }, -/* CAL_TARGET_WRESTLER*/ { ED_ATI_CAL_MACHINE_WRESTLER_ISA, "", "", 1, 8, 5, 256, 32 * Ki, 16, 400 }, -/* CAL_TARGET_CAYMAN */ { ED_ATI_CAL_MACHINE_CAYMAN_ISA, "", "", 1, 16, 4, 256, 32 * Ki, 32, 500 }, -/* CAL_TARGET_KAUAI */ { ED_ATI_CAL_MACHINE_KAUAI_ISA, "", "", 1, 16, 5, 256, 32 * Ki, 32, 400 }, -/* CAL_TARGET_BARTS */ { ED_ATI_CAL_MACHINE_BARTS_ISA , "", "", 1, 16, 5, 256, 32 * Ki, 32, 400 }, -/* CAL_TARGET_TURKS */ { ED_ATI_CAL_MACHINE_TURKS_ISA , "", "", 1, 16, 5, 256, 32 * Ki, 32, 400 }, -/* CAL_TARGET_CAICOS */ { ED_ATI_CAL_MACHINE_CAICOS_ISA, "", "", 1, 16, 5, 256, 32 * Ki, 32, 400 }, -/* CAL_TARGET_TAHITI */ { ED_ATI_CAL_MACHINE_TAHITI_ISA, "Tahiti", "tahiti", 4, 16, 1, 256, 64 * Ki, 32, 600 }, -/* CAL_TARGET_PITCAIRN */ { ED_ATI_CAL_MACHINE_PITCAIRN_ISA, "Pitcairn", "pitcairn", 4, 16, 1, 256, 64 * Ki, 32, 600 }, -/* CAL_TARGET_CAPEVERDE */ { ED_ATI_CAL_MACHINE_CAPEVERDE_ISA, "Capeverde", "capeverde", 4, 16, 1, 256, 64 * Ki, 32, 600 }, -/* CAL_TARGET_DEVASTATOR */ { ED_ATI_CAL_MACHINE_DEVASTATOR_ISA,"", "", 1, 16, 4, 256, 32 * Ki, 32, 500 }, -/* CAL_TARGET_SCRAPPER */ { ED_ATI_CAL_MACHINE_SCRAPPER_ISA, "", "", 1, 16, 4, 256, 32 * Ki, 32, 500 }, -/* CAL_TARGET_OLAND */ { ED_ATI_CAL_MACHINE_OLAND_ISA, "Oland", "oland", 4, 16, 1, 256, 64 * Ki, 32, 600 }, -/* CAL_TARGET_BONAIRE */ { 
ED_ATI_CAL_MACHINE_BONAIRE_ISA, "Bonaire", "bonaire", 4, 16, 1, 256, 64 * Ki, 32, 702 }, -/* CAL_TARGET_SPECTRE */ { ED_ATI_CAL_MACHINE_SPECTRE_ISA, "Spectre", "spectre", 4, 16, 1, 256, 64 * Ki, 32, 701 }, -/* CAL_TARGET_SPOOKY */ { ED_ATI_CAL_MACHINE_SPOOKY_ISA, "Spooky", "spooky", 4, 16, 1, 256, 64 * Ki, 32, 701 }, -/* CAL_TARGET_KALINDI */ { ED_ATI_CAL_MACHINE_KALINDI_ISA, "Kalindi", "kalindi", 4, 16, 1, 256, 64 * Ki, 32, 702 }, -/* CAL_TARGET_HAINAN */ { ED_ATI_CAL_MACHINE_HAINAN_ISA, "Hainan", "hainan", 4, 16, 1, 256, 64 * Ki, 32, 600 }, -/* CAL_TARGET_HAWAII */ { ED_ATI_CAL_MACHINE_HAWAII_ISA, "Hawaii", "hawaii", 4, 16, 1, 256, 64 * Ki, 32, 702 }, -/* CAL_TARGET_ICELAND */ { ED_ATI_CAL_MACHINE_ICELAND_ISA, "Iceland", "iceland", 4, 16, 1, 256, 64 * Ki, 32, 800 }, -/* CAL_TARGET_TONGA */ { ED_ATI_CAL_MACHINE_TONGA_ISA, "Tonga", "tonga", 4, 16, 1, 256, 64 * Ki, 32, 800 }, -/* CAL_TARGET_MULLINS */ { ED_ATI_CAL_MACHINE_GODAVARI_ISA, "Mullins", "mullins", 4, 16, 1, 256, 64 * Ki, 32, 702 }, -/* CAL_TARGET_FIJI */ { ED_ATI_CAL_MACHINE_FIJI_ISA, "Fiji", "fiji", 4, 16, 1, 256, 64 * Ki, 32, 800 }, -/* CAL_TARGET_CARRIZO */ { ED_ATI_CAL_MACHINE_CARRIZO_ISA, "Carrizo" , "carrizo", 4, 16, 1, 256, 64 * Ki, 32, 800 }, -/* CAL_TARGET_ELLESMERE */ { ED_ATI_CAL_MACHINE_ELLESMERE_ISA, "Ellesmere", "ellesmere", 4, 16, 1, 256, 64 * Ki, 32, 800 }, -/* CAL_TARGET_BAFFIN */ { ED_ATI_CAL_MACHINE_BAFFIN_ISA, "Baffin", "baffin", 4, 16, 1, 256, 64 * Ki, 32, 800 }, -/* CAL_TARGET_GREENLAND */ { ED_ATI_CAL_MACHINE_GREENLAND_ISA, IF(IS_BRAHMA,"","gfx900"), IF(IS_BRAHMA,"","gfx900"), 4, 16, 1, 256, 64 * Ki, 32, 900 }, -/* CAL_TARGET_STONEY */ { ED_ATI_CAL_MACHINE_STONEY_ISA, "Stoney", "stoney", 4, 16, 1, 256, 64 * Ki, 32, 800 }, -/* CAL_TARGET_LEXA */ { ED_ATI_CAL_MACHINE_LEXA_ISA, IF(IS_BRAHMA,"","gfx804"), IF(IS_BRAHMA,"","gfx804"), 4, 16, 1, 256, 64 * Ki, 32, 800 }, -/* CAL_TARGET_RAVEN */ { ED_ATI_CAL_MACHINE_RAVEN_ISA, IF(IS_BRAHMA,"","gfx901"), IF(IS_BRAHMA,"","gfx901"), 4, 16, 1, 
256, 64 * Ki, 32, 900 }, -/* CAL_TARGET_POLARIS22 */ { ED_ATI_CAL_MACHINE_POLARIS22_ISA, IF(IS_BRAHMA,"","gfx804"), IF(IS_BRAHMA,"","gfx804"), 4, 16, 1, 256, 64 * Ki, 32, 800 }, + // Machine targetName machineTarget + /* CAL_TARGET_600 */ {ED_ATI_CAL_MACHINE_R600_ISA, "", "", 0, 0, 0, 0, 0, 0, 0}, + /* CAL_TARGET_610 */ {ED_ATI_CAL_MACHINE_R610_ISA, "", "", 0, 0, 0, 0, 0, 0, 0}, + /* CAL_TARGET_630 */ {ED_ATI_CAL_MACHINE_R630_ISA, "", "", 0, 0, 0, 0, 0, 0, 0}, + /* CAL_TARGET_670 */ {ED_ATI_CAL_MACHINE_R670_ISA, "", "", 0, 0, 0, 0, 0, 0, 0}, + /* CAL_TARGET_7XX */ {ED_ATI_CAL_MACHINE_R770_ISA, "", "", 0, 0, 0, 0, 0, 0, 0}, + /* CAL_TARGET_770 */ {ED_ATI_CAL_MACHINE_R770_ISA, "", "", 0, 0, 0, 0, 0, 0, 0}, + /* CAL_TARGET_710 */ {ED_ATI_CAL_MACHINE_R710_ISA, "", "", 0, 0, 0, 0, 0, 0, 0}, + /* CAL_TARGET_730 */ {ED_ATI_CAL_MACHINE_R730_ISA, "", "", 0, 0, 0, 0, 0, 0, 0}, + /* CAL_TARGET_CYPRESS */ {ED_ATI_CAL_MACHINE_CYPRESS_ISA, "", "", 1, 16, 5, 256, 32 * Ki, 32, + 400}, + /* CAL_TARGET_JUNIPER */ {ED_ATI_CAL_MACHINE_JUNIPER_ISA, "", "", 1, 16, 5, 256, 32 * Ki, 32, + 400}, + /* CAL_TARGET_REDWOOD */ {ED_ATI_CAL_MACHINE_REDWOOD_ISA, "", "", 1, 16, 5, 256, 32 * Ki, 16, + 400}, + /* CAL_TARGET_CEDAR */ {ED_ATI_CAL_MACHINE_CEDAR_ISA, "", "", 1, 8, 5, 256, 32 * Ki, 16, 400}, + /* CAL_TARGET_SUMO */ {ED_ATI_CAL_MACHINE_SUMO_ISA, "", "", 1, 16, 5, 256, 32 * Ki, 16, 400}, + /* CAL_TARGET_SUPERSUMO*/ {ED_ATI_CAL_MACHINE_SUPERSUMO_ISA, "", "", 1, 16, 5, 256, 32 * Ki, 16, + 400}, + /* CAL_TARGET_WRESTLER*/ {ED_ATI_CAL_MACHINE_WRESTLER_ISA, "", "", 1, 8, 5, 256, 32 * Ki, 16, + 400}, + /* CAL_TARGET_CAYMAN */ {ED_ATI_CAL_MACHINE_CAYMAN_ISA, "", "", 1, 16, 4, 256, 32 * Ki, 32, + 500}, + /* CAL_TARGET_KAUAI */ {ED_ATI_CAL_MACHINE_KAUAI_ISA, "", "", 1, 16, 5, 256, 32 * Ki, 32, 400}, + /* CAL_TARGET_BARTS */ {ED_ATI_CAL_MACHINE_BARTS_ISA, "", "", 1, 16, 5, 256, 32 * Ki, 32, 400}, + /* CAL_TARGET_TURKS */ {ED_ATI_CAL_MACHINE_TURKS_ISA, "", "", 1, 16, 5, 256, 32 * Ki, 32, 400}, + /* 
CAL_TARGET_CAICOS */ {ED_ATI_CAL_MACHINE_CAICOS_ISA, "", "", 1, 16, 5, 256, 32 * Ki, 32, + 400}, + /* CAL_TARGET_TAHITI */ {ED_ATI_CAL_MACHINE_TAHITI_ISA, "Tahiti", "tahiti", 4, 16, 1, 256, + 64 * Ki, 32, 600}, + /* CAL_TARGET_PITCAIRN */ {ED_ATI_CAL_MACHINE_PITCAIRN_ISA, "Pitcairn", "pitcairn", 4, 16, 1, + 256, 64 * Ki, 32, 600}, + /* CAL_TARGET_CAPEVERDE */ {ED_ATI_CAL_MACHINE_CAPEVERDE_ISA, "Capeverde", "capeverde", 4, 16, + 1, 256, 64 * Ki, 32, 600}, + /* CAL_TARGET_DEVASTATOR */ {ED_ATI_CAL_MACHINE_DEVASTATOR_ISA, "", "", 1, 16, 4, 256, 32 * Ki, + 32, 500}, + /* CAL_TARGET_SCRAPPER */ {ED_ATI_CAL_MACHINE_SCRAPPER_ISA, "", "", 1, 16, 4, 256, 32 * Ki, 32, + 500}, + /* CAL_TARGET_OLAND */ {ED_ATI_CAL_MACHINE_OLAND_ISA, "Oland", "oland", 4, 16, 1, 256, 64 * Ki, + 32, 600}, + /* CAL_TARGET_BONAIRE */ {ED_ATI_CAL_MACHINE_BONAIRE_ISA, "Bonaire", "bonaire", 4, 16, 1, 256, + 64 * Ki, 32, 702}, + /* CAL_TARGET_SPECTRE */ {ED_ATI_CAL_MACHINE_SPECTRE_ISA, "Spectre", "spectre", 4, 16, 1, 256, + 64 * Ki, 32, 701}, + /* CAL_TARGET_SPOOKY */ {ED_ATI_CAL_MACHINE_SPOOKY_ISA, "Spooky", "spooky", 4, 16, 1, 256, + 64 * Ki, 32, 701}, + /* CAL_TARGET_KALINDI */ {ED_ATI_CAL_MACHINE_KALINDI_ISA, "Kalindi", "kalindi", 4, 16, 1, 256, + 64 * Ki, 32, 702}, + /* CAL_TARGET_HAINAN */ {ED_ATI_CAL_MACHINE_HAINAN_ISA, "Hainan", "hainan", 4, 16, 1, 256, + 64 * Ki, 32, 600}, + /* CAL_TARGET_HAWAII */ {ED_ATI_CAL_MACHINE_HAWAII_ISA, "Hawaii", "hawaii", 4, 16, 1, 256, + 64 * Ki, 32, 702}, + /* CAL_TARGET_ICELAND */ {ED_ATI_CAL_MACHINE_ICELAND_ISA, "Iceland", "iceland", 4, 16, 1, 256, + 64 * Ki, 32, 800}, + /* CAL_TARGET_TONGA */ {ED_ATI_CAL_MACHINE_TONGA_ISA, "Tonga", "tonga", 4, 16, 1, 256, 64 * Ki, + 32, 800}, + /* CAL_TARGET_MULLINS */ {ED_ATI_CAL_MACHINE_GODAVARI_ISA, "Mullins", "mullins", 4, 16, 1, 256, + 64 * Ki, 32, 702}, + /* CAL_TARGET_FIJI */ {ED_ATI_CAL_MACHINE_FIJI_ISA, "Fiji", "fiji", 4, 16, 1, 256, 64 * Ki, 32, + 800}, + /* CAL_TARGET_CARRIZO */ {ED_ATI_CAL_MACHINE_CARRIZO_ISA, 
"Carrizo", "carrizo", 4, 16, 1, 256, + 64 * Ki, 32, 800}, + /* CAL_TARGET_ELLESMERE */ {ED_ATI_CAL_MACHINE_ELLESMERE_ISA, "Ellesmere", "ellesmere", 4, 16, + 1, 256, 64 * Ki, 32, 800}, + /* CAL_TARGET_BAFFIN */ {ED_ATI_CAL_MACHINE_BAFFIN_ISA, "Baffin", "baffin", 4, 16, 1, 256, + 64 * Ki, 32, 800}, + /* CAL_TARGET_GREENLAND */ {ED_ATI_CAL_MACHINE_GREENLAND_ISA, IF(IS_BRAHMA, "", "gfx900"), + IF(IS_BRAHMA, "", "gfx900"), 4, 16, 1, 256, 64 * Ki, 32, 900}, + /* CAL_TARGET_STONEY */ {ED_ATI_CAL_MACHINE_STONEY_ISA, "Stoney", "stoney", 4, 16, 1, 256, + 64 * Ki, 32, 800}, + /* CAL_TARGET_LEXA */ {ED_ATI_CAL_MACHINE_LEXA_ISA, IF(IS_BRAHMA, "", "gfx804"), + IF(IS_BRAHMA, "", "gfx804"), 4, 16, 1, 256, 64 * Ki, 32, 800}, + /* CAL_TARGET_RAVEN */ {ED_ATI_CAL_MACHINE_RAVEN_ISA, IF(IS_BRAHMA, "", "gfx901"), + IF(IS_BRAHMA, "", "gfx901"), 4, 16, 1, 256, 64 * Ki, 32, 900}, + /* CAL_TARGET_POLARIS22 */ {ED_ATI_CAL_MACHINE_POLARIS22_ISA, IF(IS_BRAHMA, "", "gfx804"), + IF(IS_BRAHMA, "", "gfx804"), 4, 16, 1, 256, 64 * Ki, 32, 800}, }; enum gfx_handle { - gfx700 = 700, - gfx701 = 701, - gfx702 = 702, - gfx800 = 800, - gfx801 = 801, - gfx804 = 804, - gfx810 = 810, - gfx900 = 900, - gfx901 = 901 + gfx700 = 700, + gfx701 = 701, + gfx702 = 702, + gfx800 = 800, + gfx801 = 801, + gfx804 = 804, + gfx810 = 810, + gfx900 = 900, + gfx901 = 901 }; static const char* Gfx700 = "AMD:AMDGPU:7:0:0"; @@ -154,334 +181,272 @@ static const char* Gfx900 = "AMD:AMDGPU:9:0:0"; static const char* Gfx901 = "AMD:AMDGPU:9:0:1"; // Supported OpenCL versions -enum OclVersion { - OpenCL10, - OpenCL11, - OpenCL12, - OpenCL20 -}; +enum OclVersion { OpenCL10, OpenCL11, OpenCL12, OpenCL20 }; struct CalFormat { - gslChannelOrder channelOrder_; //!< Texel/pixel GSL channel order - cmSurfFmt type_; //!< Texel/pixel CAL format + gslChannelOrder channelOrder_; //!< Texel/pixel GSL channel order + cmSurfFmt type_; //!< Texel/pixel CAL format }; struct MemoryFormat { - cl_image_format clFormat_; //!< CL image format - CalFormat 
calFormat_; //!< CAL image format + cl_image_format clFormat_; //!< CL image format + CalFormat calFormat_; //!< CAL image format }; -static const MemoryFormat -MemoryFormatMap[] = { +static const MemoryFormat MemoryFormatMap[] = { // R - { { CL_R, CL_UNORM_INT8 }, - { GSL_CHANNEL_ORDER_R, CM_SURF_FMT_INTENSITY8 } }, - { { CL_R, CL_UNORM_INT16 }, - { GSL_CHANNEL_ORDER_R, CM_SURF_FMT_R16 } }, + {{CL_R, CL_UNORM_INT8}, {GSL_CHANNEL_ORDER_R, CM_SURF_FMT_INTENSITY8}}, + {{CL_R, CL_UNORM_INT16}, {GSL_CHANNEL_ORDER_R, CM_SURF_FMT_R16}}, - { { CL_R, CL_SNORM_INT8 }, - { GSL_CHANNEL_ORDER_R, CM_SURF_FMT_sR8 } }, - { { CL_R, CL_SNORM_INT16 }, - { GSL_CHANNEL_ORDER_R, CM_SURF_FMT_sU16 } }, + {{CL_R, CL_SNORM_INT8}, {GSL_CHANNEL_ORDER_R, CM_SURF_FMT_sR8}}, + {{CL_R, CL_SNORM_INT16}, {GSL_CHANNEL_ORDER_R, CM_SURF_FMT_sU16}}, - { { CL_R, CL_SIGNED_INT8 }, - { GSL_CHANNEL_ORDER_R, CM_SURF_FMT_sR8I } }, - { { CL_R, CL_SIGNED_INT16 }, - { GSL_CHANNEL_ORDER_R, CM_SURF_FMT_sR16I } }, - { { CL_R, CL_SIGNED_INT32}, - { GSL_CHANNEL_ORDER_R, CM_SURF_FMT_sR32I } }, - { { CL_R, CL_UNSIGNED_INT8 }, - { GSL_CHANNEL_ORDER_R, CM_SURF_FMT_R8I } }, - { { CL_R, CL_UNSIGNED_INT16 }, - { GSL_CHANNEL_ORDER_R, CM_SURF_FMT_R16I } }, - { { CL_R, CL_UNSIGNED_INT32}, - { GSL_CHANNEL_ORDER_R , CM_SURF_FMT_R32I } }, + {{CL_R, CL_SIGNED_INT8}, {GSL_CHANNEL_ORDER_R, CM_SURF_FMT_sR8I}}, + {{CL_R, CL_SIGNED_INT16}, {GSL_CHANNEL_ORDER_R, CM_SURF_FMT_sR16I}}, + {{CL_R, CL_SIGNED_INT32}, {GSL_CHANNEL_ORDER_R, CM_SURF_FMT_sR32I}}, + {{CL_R, CL_UNSIGNED_INT8}, {GSL_CHANNEL_ORDER_R, CM_SURF_FMT_R8I}}, + {{CL_R, CL_UNSIGNED_INT16}, {GSL_CHANNEL_ORDER_R, CM_SURF_FMT_R16I}}, + {{CL_R, CL_UNSIGNED_INT32}, {GSL_CHANNEL_ORDER_R, CM_SURF_FMT_R32I}}, - { { CL_R, CL_HALF_FLOAT }, - { GSL_CHANNEL_ORDER_R, CM_SURF_FMT_R16F } }, - { { CL_R, CL_FLOAT }, - { GSL_CHANNEL_ORDER_R, CM_SURF_FMT_R32F } }, + {{CL_R, CL_HALF_FLOAT}, {GSL_CHANNEL_ORDER_R, CM_SURF_FMT_R16F}}, + {{CL_R, CL_FLOAT}, {GSL_CHANNEL_ORDER_R, CM_SURF_FMT_R32F}}, 
// A - { { CL_A, CL_UNORM_INT8 }, - { GSL_CHANNEL_ORDER_A, CM_SURF_FMT_INTENSITY8 } }, - { { CL_A, CL_UNORM_INT16 }, - { GSL_CHANNEL_ORDER_A, CM_SURF_FMT_R16 } }, + {{CL_A, CL_UNORM_INT8}, {GSL_CHANNEL_ORDER_A, CM_SURF_FMT_INTENSITY8}}, + {{CL_A, CL_UNORM_INT16}, {GSL_CHANNEL_ORDER_A, CM_SURF_FMT_R16}}, - { { CL_A, CL_SNORM_INT8 }, - { GSL_CHANNEL_ORDER_A, CM_SURF_FMT_sR8 } }, - { { CL_A, CL_SNORM_INT16 }, - { GSL_CHANNEL_ORDER_A, CM_SURF_FMT_sU16 } }, + {{CL_A, CL_SNORM_INT8}, {GSL_CHANNEL_ORDER_A, CM_SURF_FMT_sR8}}, + {{CL_A, CL_SNORM_INT16}, {GSL_CHANNEL_ORDER_A, CM_SURF_FMT_sU16}}, - { { CL_A, CL_SIGNED_INT8 }, - { GSL_CHANNEL_ORDER_A, CM_SURF_FMT_sR8I } }, - { { CL_A, CL_SIGNED_INT16 }, - { GSL_CHANNEL_ORDER_A, CM_SURF_FMT_sR16I } }, - { { CL_A, CL_SIGNED_INT32}, - { GSL_CHANNEL_ORDER_A, CM_SURF_FMT_sR32I } }, - { { CL_A, CL_UNSIGNED_INT8 }, - { GSL_CHANNEL_ORDER_A, CM_SURF_FMT_R8I } }, - { { CL_A, CL_UNSIGNED_INT16 }, - { GSL_CHANNEL_ORDER_A, CM_SURF_FMT_R16I } }, - { { CL_A, CL_UNSIGNED_INT32}, - { GSL_CHANNEL_ORDER_A , CM_SURF_FMT_R32I } }, + {{CL_A, CL_SIGNED_INT8}, {GSL_CHANNEL_ORDER_A, CM_SURF_FMT_sR8I}}, + {{CL_A, CL_SIGNED_INT16}, {GSL_CHANNEL_ORDER_A, CM_SURF_FMT_sR16I}}, + {{CL_A, CL_SIGNED_INT32}, {GSL_CHANNEL_ORDER_A, CM_SURF_FMT_sR32I}}, + {{CL_A, CL_UNSIGNED_INT8}, {GSL_CHANNEL_ORDER_A, CM_SURF_FMT_R8I}}, + {{CL_A, CL_UNSIGNED_INT16}, {GSL_CHANNEL_ORDER_A, CM_SURF_FMT_R16I}}, + {{CL_A, CL_UNSIGNED_INT32}, {GSL_CHANNEL_ORDER_A, CM_SURF_FMT_R32I}}, - { { CL_A, CL_HALF_FLOAT }, - { GSL_CHANNEL_ORDER_A, CM_SURF_FMT_R16F } }, - { { CL_A, CL_FLOAT }, - { GSL_CHANNEL_ORDER_A, CM_SURF_FMT_R32F } }, + {{CL_A, CL_HALF_FLOAT}, {GSL_CHANNEL_ORDER_A, CM_SURF_FMT_R16F}}, + {{CL_A, CL_FLOAT}, {GSL_CHANNEL_ORDER_A, CM_SURF_FMT_R32F}}, // RG - { { CL_RG, CL_UNORM_INT8 }, - { GSL_CHANNEL_ORDER_RG, CM_SURF_FMT_RG8 } }, - { { CL_RG, CL_UNORM_INT16 }, - { GSL_CHANNEL_ORDER_RG, CM_SURF_FMT_RG16 } }, + {{CL_RG, CL_UNORM_INT8}, {GSL_CHANNEL_ORDER_RG, CM_SURF_FMT_RG8}}, 
+ {{CL_RG, CL_UNORM_INT16}, {GSL_CHANNEL_ORDER_RG, CM_SURF_FMT_RG16}}, - { { CL_RG, CL_SNORM_INT8 }, - { GSL_CHANNEL_ORDER_RG, CM_SURF_FMT_sRG8 } }, - { { CL_RG, CL_SNORM_INT16 }, - { GSL_CHANNEL_ORDER_RG, CM_SURF_FMT_sUV16 } }, + {{CL_RG, CL_SNORM_INT8}, {GSL_CHANNEL_ORDER_RG, CM_SURF_FMT_sRG8}}, + {{CL_RG, CL_SNORM_INT16}, {GSL_CHANNEL_ORDER_RG, CM_SURF_FMT_sUV16}}, - { { CL_RG, CL_SIGNED_INT8 }, - { GSL_CHANNEL_ORDER_RG, CM_SURF_FMT_sRG8I } }, - { { CL_RG, CL_SIGNED_INT16 }, - { GSL_CHANNEL_ORDER_RG, CM_SURF_FMT_sRG16I } }, - { { CL_RG, CL_SIGNED_INT32}, - { GSL_CHANNEL_ORDER_RG, CM_SURF_FMT_sRG32I } }, - { { CL_RG, CL_UNSIGNED_INT8 }, - { GSL_CHANNEL_ORDER_RG, CM_SURF_FMT_RG8I } }, - { { CL_RG, CL_UNSIGNED_INT16 }, - { GSL_CHANNEL_ORDER_RG, CM_SURF_FMT_RG16I } }, - { { CL_RG, CL_UNSIGNED_INT32}, - { GSL_CHANNEL_ORDER_RG , CM_SURF_FMT_RG32I } }, + {{CL_RG, CL_SIGNED_INT8}, {GSL_CHANNEL_ORDER_RG, CM_SURF_FMT_sRG8I}}, + {{CL_RG, CL_SIGNED_INT16}, {GSL_CHANNEL_ORDER_RG, CM_SURF_FMT_sRG16I}}, + {{CL_RG, CL_SIGNED_INT32}, {GSL_CHANNEL_ORDER_RG, CM_SURF_FMT_sRG32I}}, + {{CL_RG, CL_UNSIGNED_INT8}, {GSL_CHANNEL_ORDER_RG, CM_SURF_FMT_RG8I}}, + {{CL_RG, CL_UNSIGNED_INT16}, {GSL_CHANNEL_ORDER_RG, CM_SURF_FMT_RG16I}}, + {{CL_RG, CL_UNSIGNED_INT32}, {GSL_CHANNEL_ORDER_RG, CM_SURF_FMT_RG32I}}, - { { CL_RG, CL_HALF_FLOAT }, - { GSL_CHANNEL_ORDER_RG, CM_SURF_FMT_RG16F } }, - { { CL_RG, CL_FLOAT }, - { GSL_CHANNEL_ORDER_RG, CM_SURF_FMT_RG32F } }, + {{CL_RG, CL_HALF_FLOAT}, {GSL_CHANNEL_ORDER_RG, CM_SURF_FMT_RG16F}}, + {{CL_RG, CL_FLOAT}, {GSL_CHANNEL_ORDER_RG, CM_SURF_FMT_RG32F}}, // RA - { { CL_RA, CL_UNORM_INT8 }, - { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG8 } }, - { { CL_RA, CL_UNORM_INT16 }, - { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG16 } }, + {{CL_RA, CL_UNORM_INT8}, {GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG8}}, + {{CL_RA, CL_UNORM_INT16}, {GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG16}}, - { { CL_RA, CL_SNORM_INT8 }, - { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sRG8 } }, - { { CL_RA, 
CL_SNORM_INT16 }, - { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sUV16 } }, + {{CL_RA, CL_SNORM_INT8}, {GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sRG8}}, + {{CL_RA, CL_SNORM_INT16}, {GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sUV16}}, - { { CL_RA, CL_SIGNED_INT8 }, - { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sRG8I } }, - { { CL_RA, CL_SIGNED_INT16 }, - { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sRG16I } }, - { { CL_RA, CL_SIGNED_INT32}, - { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sRG32I } }, - { { CL_RA, CL_UNSIGNED_INT8 }, - { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG8I } }, - { { CL_RA, CL_UNSIGNED_INT16 }, - { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG16I } }, - { { CL_RA, CL_UNSIGNED_INT32}, - { GSL_CHANNEL_ORDER_RA , CM_SURF_FMT_RG32I } }, + {{CL_RA, CL_SIGNED_INT8}, {GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sRG8I}}, + {{CL_RA, CL_SIGNED_INT16}, {GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sRG16I}}, + {{CL_RA, CL_SIGNED_INT32}, {GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sRG32I}}, + {{CL_RA, CL_UNSIGNED_INT8}, {GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG8I}}, + {{CL_RA, CL_UNSIGNED_INT16}, {GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG16I}}, + {{CL_RA, CL_UNSIGNED_INT32}, {GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG32I}}, - { { CL_RA, CL_HALF_FLOAT }, - { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG16F } }, - { { CL_RA, CL_FLOAT }, - { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG32F } }, + {{CL_RA, CL_HALF_FLOAT}, {GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG16F}}, + {{CL_RA, CL_FLOAT}, {GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG32F}}, // RGB - { { CL_RGB, CL_UNORM_INT_101010 }, - { GSL_CHANNEL_ORDER_RGB, CM_SURF_FMT_BGR10_X2 } }, - { { CL_RGB, CL_UNSIGNED_INT8 }, // This is used only by blit kernel - { GSL_CHANNEL_ORDER_RGBA, CM_SURF_FMT_RGBA8UI } }, + {{CL_RGB, CL_UNORM_INT_101010}, {GSL_CHANNEL_ORDER_RGB, CM_SURF_FMT_BGR10_X2}}, + {{CL_RGB, CL_UNSIGNED_INT8}, // This is used only by blit kernel + {GSL_CHANNEL_ORDER_RGBA, CM_SURF_FMT_RGBA8UI}}, // RGBA - { { CL_RGBA, CL_UNORM_INT8 }, - { GSL_CHANNEL_ORDER_RGBA, CM_SURF_FMT_RGBA8 } }, - { { CL_RGBA, CL_UNORM_INT16 }, - { 
GSL_CHANNEL_ORDER_RGBA, CM_SURF_FMT_RGBA16 } }, + {{CL_RGBA, CL_UNORM_INT8}, {GSL_CHANNEL_ORDER_RGBA, CM_SURF_FMT_RGBA8}}, + {{CL_RGBA, CL_UNORM_INT16}, {GSL_CHANNEL_ORDER_RGBA, CM_SURF_FMT_RGBA16}}, - { { CL_RGBA, CL_SNORM_INT8 }, - { GSL_CHANNEL_ORDER_RGBA, CM_SURF_FMT_sRGBA8 } }, - { { CL_RGBA, CL_SNORM_INT16 }, - { GSL_CHANNEL_ORDER_RGBA, CM_SURF_FMT_sUVWQ16 } }, + {{CL_RGBA, CL_SNORM_INT8}, {GSL_CHANNEL_ORDER_RGBA, CM_SURF_FMT_sRGBA8}}, + {{CL_RGBA, CL_SNORM_INT16}, {GSL_CHANNEL_ORDER_RGBA, CM_SURF_FMT_sUVWQ16}}, - { { CL_RGBA, CL_SIGNED_INT8 }, - { GSL_CHANNEL_ORDER_RGBA, CM_SURF_FMT_sRGBA8I } }, - { { CL_RGBA, CL_SIGNED_INT16 }, - { GSL_CHANNEL_ORDER_RGBA, CM_SURF_FMT_sRGBA16I } }, - { { CL_RGBA, CL_SIGNED_INT32}, - { GSL_CHANNEL_ORDER_RGBA, CM_SURF_FMT_sRGBA32I } }, - { { CL_RGBA, CL_UNSIGNED_INT8 }, - { GSL_CHANNEL_ORDER_RGBA, CM_SURF_FMT_RGBA8UI } }, - { { CL_RGBA, CL_UNSIGNED_INT16 }, - { GSL_CHANNEL_ORDER_RGBA, CM_SURF_FMT_RGBA16UI } }, - { { CL_RGBA, CL_UNSIGNED_INT32}, - { GSL_CHANNEL_ORDER_RGBA , CM_SURF_FMT_RGBA32UI } }, + {{CL_RGBA, CL_SIGNED_INT8}, {GSL_CHANNEL_ORDER_RGBA, CM_SURF_FMT_sRGBA8I}}, + {{CL_RGBA, CL_SIGNED_INT16}, {GSL_CHANNEL_ORDER_RGBA, CM_SURF_FMT_sRGBA16I}}, + {{CL_RGBA, CL_SIGNED_INT32}, {GSL_CHANNEL_ORDER_RGBA, CM_SURF_FMT_sRGBA32I}}, + {{CL_RGBA, CL_UNSIGNED_INT8}, {GSL_CHANNEL_ORDER_RGBA, CM_SURF_FMT_RGBA8UI}}, + {{CL_RGBA, CL_UNSIGNED_INT16}, {GSL_CHANNEL_ORDER_RGBA, CM_SURF_FMT_RGBA16UI}}, + {{CL_RGBA, CL_UNSIGNED_INT32}, {GSL_CHANNEL_ORDER_RGBA, CM_SURF_FMT_RGBA32UI}}, - { { CL_RGBA, CL_HALF_FLOAT }, - { GSL_CHANNEL_ORDER_RGBA, CM_SURF_FMT_RGBA16F } }, - { { CL_RGBA, CL_FLOAT }, - { GSL_CHANNEL_ORDER_RGBA, CM_SURF_FMT_RGBA32F } }, + {{CL_RGBA, CL_HALF_FLOAT}, {GSL_CHANNEL_ORDER_RGBA, CM_SURF_FMT_RGBA16F}}, + {{CL_RGBA, CL_FLOAT}, {GSL_CHANNEL_ORDER_RGBA, CM_SURF_FMT_RGBA32F}}, // ARGB - { { CL_ARGB, CL_UNORM_INT8 }, - { GSL_CHANNEL_ORDER_ARGB, CM_SURF_FMT_RGBA8 } }, - { { CL_ARGB, CL_SNORM_INT8 }, - { GSL_CHANNEL_ORDER_ARGB, 
CM_SURF_FMT_sRGBA8 } }, - { { CL_ARGB, CL_SIGNED_INT8 }, - { GSL_CHANNEL_ORDER_ARGB, CM_SURF_FMT_sRGBA8I } }, - { { CL_ARGB, CL_UNSIGNED_INT8 }, - { GSL_CHANNEL_ORDER_ARGB, CM_SURF_FMT_RGBA8UI } }, + {{CL_ARGB, CL_UNORM_INT8}, {GSL_CHANNEL_ORDER_ARGB, CM_SURF_FMT_RGBA8}}, + {{CL_ARGB, CL_SNORM_INT8}, {GSL_CHANNEL_ORDER_ARGB, CM_SURF_FMT_sRGBA8}}, + {{CL_ARGB, CL_SIGNED_INT8}, {GSL_CHANNEL_ORDER_ARGB, CM_SURF_FMT_sRGBA8I}}, + {{CL_ARGB, CL_UNSIGNED_INT8}, {GSL_CHANNEL_ORDER_ARGB, CM_SURF_FMT_RGBA8UI}}, // BGRA - { { CL_BGRA, CL_UNORM_INT8 }, - { GSL_CHANNEL_ORDER_BGRA, CM_SURF_FMT_RGBA8 } }, - { { CL_BGRA, CL_SNORM_INT8 }, - { GSL_CHANNEL_ORDER_BGRA, CM_SURF_FMT_sRGBA8 } }, - { { CL_BGRA, CL_SIGNED_INT8 }, - { GSL_CHANNEL_ORDER_BGRA, CM_SURF_FMT_sRGBA8I } }, - { { CL_BGRA, CL_UNSIGNED_INT8 }, - { GSL_CHANNEL_ORDER_BGRA, CM_SURF_FMT_RGBA8UI } }, + {{CL_BGRA, CL_UNORM_INT8}, {GSL_CHANNEL_ORDER_BGRA, CM_SURF_FMT_RGBA8}}, + {{CL_BGRA, CL_SNORM_INT8}, {GSL_CHANNEL_ORDER_BGRA, CM_SURF_FMT_sRGBA8}}, + {{CL_BGRA, CL_SIGNED_INT8}, {GSL_CHANNEL_ORDER_BGRA, CM_SURF_FMT_sRGBA8I}}, + {{CL_BGRA, CL_UNSIGNED_INT8}, {GSL_CHANNEL_ORDER_BGRA, CM_SURF_FMT_RGBA8UI}}, // LUMINANCE - { {CL_LUMINANCE, CL_SNORM_INT8}, - { GSL_CHANNEL_ORDER_LUMINANCE,CM_SURF_FMT_sR8 } }, - { {CL_LUMINANCE, CL_SNORM_INT16}, - { GSL_CHANNEL_ORDER_LUMINANCE,CM_SURF_FMT_sU16 } }, - { {CL_LUMINANCE, CL_UNORM_INT8}, - { GSL_CHANNEL_ORDER_LUMINANCE,CM_SURF_FMT_INTENSITY8 } }, - { {CL_LUMINANCE, CL_UNORM_INT16}, - { GSL_CHANNEL_ORDER_LUMINANCE,CM_SURF_FMT_R16 } }, - { {CL_LUMINANCE, CL_HALF_FLOAT}, - { GSL_CHANNEL_ORDER_LUMINANCE,CM_SURF_FMT_R16F } }, - { {CL_LUMINANCE, CL_FLOAT}, - { GSL_CHANNEL_ORDER_LUMINANCE,CM_SURF_FMT_R32F } }, + {{CL_LUMINANCE, CL_SNORM_INT8}, {GSL_CHANNEL_ORDER_LUMINANCE, CM_SURF_FMT_sR8}}, + {{CL_LUMINANCE, CL_SNORM_INT16}, {GSL_CHANNEL_ORDER_LUMINANCE, CM_SURF_FMT_sU16}}, + {{CL_LUMINANCE, CL_UNORM_INT8}, {GSL_CHANNEL_ORDER_LUMINANCE, CM_SURF_FMT_INTENSITY8}}, + {{CL_LUMINANCE, 
CL_UNORM_INT16}, {GSL_CHANNEL_ORDER_LUMINANCE, CM_SURF_FMT_R16}}, + {{CL_LUMINANCE, CL_HALF_FLOAT}, {GSL_CHANNEL_ORDER_LUMINANCE, CM_SURF_FMT_R16F}}, + {{CL_LUMINANCE, CL_FLOAT}, {GSL_CHANNEL_ORDER_LUMINANCE, CM_SURF_FMT_R32F}}, // INTENSITY - { {CL_INTENSITY, CL_SNORM_INT8}, - { GSL_CHANNEL_ORDER_INTENSITY,CM_SURF_FMT_sR8 } }, - { {CL_INTENSITY, CL_SNORM_INT16}, - { GSL_CHANNEL_ORDER_INTENSITY,CM_SURF_FMT_sU16 } }, - { {CL_INTENSITY, CL_UNORM_INT8}, - { GSL_CHANNEL_ORDER_INTENSITY,CM_SURF_FMT_INTENSITY8 } }, - { {CL_INTENSITY, CL_UNORM_INT16}, - { GSL_CHANNEL_ORDER_INTENSITY,CM_SURF_FMT_R16 } }, - { {CL_INTENSITY, CL_HALF_FLOAT}, - { GSL_CHANNEL_ORDER_INTENSITY,CM_SURF_FMT_R16F } }, - { {CL_INTENSITY, CL_FLOAT}, - { GSL_CHANNEL_ORDER_INTENSITY,CM_SURF_FMT_R32F } }, + {{CL_INTENSITY, CL_SNORM_INT8}, {GSL_CHANNEL_ORDER_INTENSITY, CM_SURF_FMT_sR8}}, + {{CL_INTENSITY, CL_SNORM_INT16}, {GSL_CHANNEL_ORDER_INTENSITY, CM_SURF_FMT_sU16}}, + {{CL_INTENSITY, CL_UNORM_INT8}, {GSL_CHANNEL_ORDER_INTENSITY, CM_SURF_FMT_INTENSITY8}}, + {{CL_INTENSITY, CL_UNORM_INT16}, {GSL_CHANNEL_ORDER_INTENSITY, CM_SURF_FMT_R16}}, + {{CL_INTENSITY, CL_HALF_FLOAT}, {GSL_CHANNEL_ORDER_INTENSITY, CM_SURF_FMT_R16F}}, + {{CL_INTENSITY, CL_FLOAT}, {GSL_CHANNEL_ORDER_INTENSITY, CM_SURF_FMT_R32F}}, // sRBGA - { {CL_sRGBA ,CL_UNORM_INT8}, - { GSL_CHANNEL_ORDER_SRGBA, CM_SURF_FMT_RGBA8_SRGB } }, - { {CL_sRGBA ,CL_UNSIGNED_INT8}, // This is used only by blit kernel - { GSL_CHANNEL_ORDER_SRGBA, CM_SURF_FMT_RGBA8UI } }, + {{CL_sRGBA, CL_UNORM_INT8}, {GSL_CHANNEL_ORDER_SRGBA, CM_SURF_FMT_RGBA8_SRGB}}, + {{CL_sRGBA, CL_UNSIGNED_INT8}, // This is used only by blit kernel + {GSL_CHANNEL_ORDER_SRGBA, CM_SURF_FMT_RGBA8UI}}, // sRBG - { {CL_sRGB ,CL_UNORM_INT8}, - { GSL_CHANNEL_ORDER_SRGB, CM_SURF_FMT_RGBX8UI } }, - { {CL_sRGB ,CL_UNSIGNED_INT8}, // This is used only by blit kernel - { GSL_CHANNEL_ORDER_SRGB, CM_SURF_FMT_RGBA8UI } }, + {{CL_sRGB, CL_UNORM_INT8}, {GSL_CHANNEL_ORDER_SRGB, CM_SURF_FMT_RGBX8UI}}, + 
{{CL_sRGB, CL_UNSIGNED_INT8}, // This is used only by blit kernel + {GSL_CHANNEL_ORDER_SRGB, CM_SURF_FMT_RGBA8UI}}, // sRBGx - { {CL_sRGBx ,CL_UNORM_INT8}, - { GSL_CHANNEL_ORDER_SRGBX, CM_SURF_FMT_RGBX8UI } }, - { {CL_sRGBx ,CL_UNSIGNED_INT8}, // This is used only by blit kernel - { GSL_CHANNEL_ORDER_SRGBX, CM_SURF_FMT_RGBA8UI } }, + {{CL_sRGBx, CL_UNORM_INT8}, {GSL_CHANNEL_ORDER_SRGBX, CM_SURF_FMT_RGBX8UI}}, + {{CL_sRGBx, CL_UNSIGNED_INT8}, // This is used only by blit kernel + {GSL_CHANNEL_ORDER_SRGBX, CM_SURF_FMT_RGBA8UI}}, // sBGRA - { {CL_sBGRA ,CL_UNORM_INT8}, - { GSL_CHANNEL_ORDER_SBGRA, CM_SURF_FMT_RGBA8 } }, - { {CL_sBGRA ,CL_UNSIGNED_INT8}, // This is used only by blit kernel - { GSL_CHANNEL_ORDER_SBGRA, CM_SURF_FMT_RGBA8UI } }, + {{CL_sBGRA, CL_UNORM_INT8}, {GSL_CHANNEL_ORDER_SBGRA, CM_SURF_FMT_RGBA8}}, + {{CL_sBGRA, CL_UNSIGNED_INT8}, // This is used only by blit kernel + {GSL_CHANNEL_ORDER_SBGRA, CM_SURF_FMT_RGBA8UI}}, // DEPTH - { {CL_DEPTH ,CL_FLOAT}, - {GSL_CHANNEL_ORDER_REPLICATE_R ,CM_SURF_FMT_DEPTH32F}}, - { {CL_DEPTH ,CL_UNSIGNED_INT32}, // This is used only by blit kernel - {GSL_CHANNEL_ORDER_REPLICATE_R ,CM_SURF_FMT_R32I}}, + {{CL_DEPTH, CL_FLOAT}, {GSL_CHANNEL_ORDER_REPLICATE_R, CM_SURF_FMT_DEPTH32F}}, + {{CL_DEPTH, CL_UNSIGNED_INT32}, // This is used only by blit kernel + {GSL_CHANNEL_ORDER_REPLICATE_R, CM_SURF_FMT_R32I}}, - { {CL_DEPTH ,CL_UNORM_INT16}, - {GSL_CHANNEL_ORDER_REPLICATE_R ,CM_SURF_FMT_DEPTH16}}, - { {CL_DEPTH ,CL_UNSIGNED_INT16}, // This is used only by blit kernel - {GSL_CHANNEL_ORDER_REPLICATE_R ,CM_SURF_FMT_R16I}}, + {{CL_DEPTH, CL_UNORM_INT16}, {GSL_CHANNEL_ORDER_REPLICATE_R, CM_SURF_FMT_DEPTH16}}, + {{CL_DEPTH, CL_UNSIGNED_INT16}, // This is used only by blit kernel + {GSL_CHANNEL_ORDER_REPLICATE_R, CM_SURF_FMT_R16I}}, - { {CL_DEPTH_STENCIL ,CL_UNORM_INT24}, - {GSL_CHANNEL_ORDER_REPLICATE_R ,CM_SURF_FMT_DEPTH24_STEN8}}, - { {CL_DEPTH_STENCIL ,CL_FLOAT}, - {GSL_CHANNEL_ORDER_REPLICATE_R ,CM_SURF_FMT_DEPTH32F_X24_STEN8}} + 
{{CL_DEPTH_STENCIL, CL_UNORM_INT24}, + {GSL_CHANNEL_ORDER_REPLICATE_R, CM_SURF_FMT_DEPTH24_STEN8}}, + {{CL_DEPTH_STENCIL, CL_FLOAT}, {GSL_CHANNEL_ORDER_REPLICATE_R, CM_SURF_FMT_DEPTH32F_X24_STEN8}} }; struct MemFormatStruct { - cmSurfFmt format_; - uint size_; - uint components_; + cmSurfFmt format_; + uint size_; + uint components_; }; -static const MemFormatStruct -MemoryFormatSize[] = { - { CM_SURF_FMT_INTENSITY8, 1, 1 },/**< 1 component, normalized unsigned 8-bit integer value per component */ - { CM_SURF_FMT_RG8, 2, 2 }, /**< 2 component, normalized unsigned 8-bit integer value per component */ - { CM_SURF_FMT_RGBA8, 4, 4 }, /**< 4 component, normalized unsigned 8-bit integer value per component */ - { CM_SURF_FMT_RGBA8_SRGB, 4, 4 }, /**< 4 component, normalized unsigned 8-bit integer value per component */ - { CM_SURF_FMT_R16, 2, 1 }, /**< 1 component, normalized unsigned 16-bit integer value per component */ - { CM_SURF_FMT_RG16, 4, 2 }, /**< 2 component, normalized unsigned 16-bit integer value per component */ - { CM_SURF_FMT_RGBA16, 8, 4 }, /**< 4 component, normalized unsigned 16-bit integer value per component */ - { CM_SURF_FMT_sRGBA8, 4, 4 }, /**< 4 component, normalized signed 8-bit integer value per component */ - { CM_SURF_FMT_sU16, 2, 1 }, /**< 1 component, normalized signed 16-bit integer value per component */ - { CM_SURF_FMT_sUV16, 4, 2 }, /**< 2 component, normalized signed 16-bit integer value per component */ - { CM_SURF_FMT_sUVWQ16, 8, 4 }, /**< 4 component, normalized signed 16-bit integer value per component */ - { CM_SURF_FMT_R32F, 4, 1 }, /**< A 1 component, 32-bit float value per component */ - { CM_SURF_FMT_RG32F, 8, 2 }, /**< A 2 component, 32-bit float value per component */ - { CM_SURF_FMT_RGBA32F, 16, 4 }, /**< A 4 component, 32-bit float value per component */ - { CM_SURF_FMT_sR8, 1, 1 }, /**< 1 component, normalized signed 8-bit integer value per component */ - { CM_SURF_FMT_sRG8, 2, 2 }, /**< 2 component, normalized signed 
8-bit integer value per component */ +static const MemFormatStruct MemoryFormatSize[] = { + {CM_SURF_FMT_INTENSITY8, 1, + 1}, /**< 1 component, normalized unsigned 8-bit integer value per component */ + {CM_SURF_FMT_RG8, 2, + 2}, /**< 2 component, normalized unsigned 8-bit integer value per component */ + {CM_SURF_FMT_RGBA8, 4, + 4}, /**< 4 component, normalized unsigned 8-bit integer value per component */ + {CM_SURF_FMT_RGBA8_SRGB, 4, + 4}, /**< 4 component, normalized unsigned 8-bit integer value per component */ + {CM_SURF_FMT_R16, 2, + 1}, /**< 1 component, normalized unsigned 16-bit integer value per component */ + {CM_SURF_FMT_RG16, 4, + 2}, /**< 2 component, normalized unsigned 16-bit integer value per component */ + {CM_SURF_FMT_RGBA16, 8, + 4}, /**< 4 component, normalized unsigned 16-bit integer value per component */ + {CM_SURF_FMT_sRGBA8, 4, + 4}, /**< 4 component, normalized signed 8-bit integer value per component */ + {CM_SURF_FMT_sU16, 2, + 1}, /**< 1 component, normalized signed 16-bit integer value per component */ + {CM_SURF_FMT_sUV16, 4, + 2}, /**< 2 component, normalized signed 16-bit integer value per component */ + {CM_SURF_FMT_sUVWQ16, 8, + 4}, /**< 4 component, normalized signed 16-bit integer value per component */ + {CM_SURF_FMT_R32F, 4, 1}, /**< A 1 component, 32-bit float value per component */ + {CM_SURF_FMT_RG32F, 8, 2}, /**< A 2 component, 32-bit float value per component */ + {CM_SURF_FMT_RGBA32F, 16, 4}, /**< A 4 component, 32-bit float value per component */ + {CM_SURF_FMT_sR8, 1, + 1}, /**< 1 component, normalized signed 8-bit integer value per component */ + {CM_SURF_FMT_sRG8, 2, + 2}, /**< 2 component, normalized signed 8-bit integer value per component */ - { CM_SURF_FMT_R8I, 1, 1 }, /**< 1 component, unnormalized unsigned 8-bit integer value per component */ - { CM_SURF_FMT_RG8I, 2, 2 }, /**< 2 component, unnormalized unsigned 8-bit integer value per component */ - { CM_SURF_FMT_RGBA8UI, 4, 4 }, /**< 4 component, 
unnormalized unsigned 8-bit integer value per component */ - { CM_SURF_FMT_RGBX8UI, 4, 4 }, /**< 4 component, unnormalized unsigned 8-bit integer value per component */ - { CM_SURF_FMT_sR8I, 1, 1 }, /**< 1 component, unnormalized signed 8-bit integer value per component */ - { CM_SURF_FMT_sRG8I, 2, 2 }, /**< 2 component, unnormalized signed 8-bit integer value per component */ - { CM_SURF_FMT_sRGBA8I, 4, 4 }, /**< 4 component, unnormalized signed 8-bit integer value per component */ - { CM_SURF_FMT_R16I, 2, 1 }, /**< 1 component, unnormalized unsigned 16-bit integer value per component */ - { CM_SURF_FMT_RG16I, 4, 2 }, /**< 2 component, unnormalized unsigned 16-bit integer value per component */ - { CM_SURF_FMT_RGBA16UI, 8, 4 }, /**< 4 component, unnormalized unsigned 16-bit integer value per component */ - { CM_SURF_FMT_sR16I, 2, 1 }, /**< 1 component, unnormalized signed 16-bit integer value per component */ - { CM_SURF_FMT_sRG16I, 4, 2 }, /**< 2 component, unnormalized signed 16-bit integer value per component */ - { CM_SURF_FMT_sRGBA16I, 8, 4 }, /**< 4 component, unnormalized signed 16-bit integer value per component */ - { CM_SURF_FMT_R32I, 4, 1 }, /**< 1 component, unnormalized unsigned 32-bit integer value per component */ - { CM_SURF_FMT_RG32I, 8, 2 }, /**< 2 component, unnormalized unsigned 32-bit integer value per component */ - { CM_SURF_FMT_RGBA32UI, 16, 4 }, /**< 4 component, unnormalized unsigned 32-bit integer value per component */ - { CM_SURF_FMT_sR32I, 4, 1 }, /**< 1 component, unnormalized signed 32-bit integer value per component */ - { CM_SURF_FMT_sRG32I, 8, 2 }, /**< 2 component, unnormalized signed 32-bit integer value per component */ - { CM_SURF_FMT_sRGBA32I, 16, 4 }, /**< 4 component, unnormalized signed 32-bit integer value per component */ + {CM_SURF_FMT_R8I, 1, + 1}, /**< 1 component, unnormalized unsigned 8-bit integer value per component */ + {CM_SURF_FMT_RG8I, 2, + 2}, /**< 2 component, unnormalized unsigned 8-bit integer value per 
component */ + {CM_SURF_FMT_RGBA8UI, 4, + 4}, /**< 4 component, unnormalized unsigned 8-bit integer value per component */ + {CM_SURF_FMT_RGBX8UI, 4, + 4}, /**< 4 component, unnormalized unsigned 8-bit integer value per component */ + {CM_SURF_FMT_sR8I, 1, + 1}, /**< 1 component, unnormalized signed 8-bit integer value per component */ + {CM_SURF_FMT_sRG8I, 2, + 2}, /**< 2 component, unnormalized signed 8-bit integer value per component */ + {CM_SURF_FMT_sRGBA8I, 4, + 4}, /**< 4 component, unnormalized signed 8-bit integer value per component */ + {CM_SURF_FMT_R16I, 2, + 1}, /**< 1 component, unnormalized unsigned 16-bit integer value per component */ + {CM_SURF_FMT_RG16I, 4, + 2}, /**< 2 component, unnormalized unsigned 16-bit integer value per component */ + {CM_SURF_FMT_RGBA16UI, 8, + 4}, /**< 4 component, unnormalized unsigned 16-bit integer value per component */ + {CM_SURF_FMT_sR16I, 2, + 1}, /**< 1 component, unnormalized signed 16-bit integer value per component */ + {CM_SURF_FMT_sRG16I, 4, + 2}, /**< 2 component, unnormalized signed 16-bit integer value per component */ + {CM_SURF_FMT_sRGBA16I, 8, + 4}, /**< 4 component, unnormalized signed 16-bit integer value per component */ + {CM_SURF_FMT_R32I, 4, + 1}, /**< 1 component, unnormalized unsigned 32-bit integer value per component */ + {CM_SURF_FMT_RG32I, 8, + 2}, /**< 2 component, unnormalized unsigned 32-bit integer value per component */ + {CM_SURF_FMT_RGBA32UI, 16, + 4}, /**< 4 component, unnormalized unsigned 32-bit integer value per component */ + {CM_SURF_FMT_sR32I, 4, + 1}, /**< 1 component, unnormalized signed 32-bit integer value per component */ + {CM_SURF_FMT_sRG32I, 8, + 2}, /**< 2 component, unnormalized signed 32-bit integer value per component */ + {CM_SURF_FMT_sRGBA32I, 16, + 4}, /**< 4 component, unnormalized signed 32-bit integer value per component */ - { CM_SURF_FMT_R16F, 2, 1 }, /**< A 1 component, 16-bit float value per component */ - { CM_SURF_FMT_RG16F, 4, 2 }, /**< A 2 component, 
16-bit float value per component */ - { CM_SURF_FMT_RGBA16F, 8, 4 }, /**< A 4 component, 16-bit float value per component */ + {CM_SURF_FMT_R16F, 2, 1}, /**< A 1 component, 16-bit float value per component */ + {CM_SURF_FMT_RG16F, 4, 2}, /**< A 2 component, 16-bit float value per component */ + {CM_SURF_FMT_RGBA16F, 8, 4}, /**< A 4 component, 16-bit float value per component */ - { CM_SURF_FMT_BGR10_X2, 4, 4 }, /**< 4 component, unnormalized signed 10-bit integer value per component packed as (@c XXRRRRRRRRRRGGGGGGGGGGBBBBBBBBBB)*/ - { CM_SURF_FMT_DEPTH32F, 4, 1 }, /**< A one component, 32 float value per component */ - { CM_SURF_FMT_DEPTH16 , 2, 1 }, /**< A one component, 16 unsigned int value per component */ - { CM_SURF_FMT_DEPTH24_STEN8 , 4 ,1}, /**< A one component, 32 float value per component */ - { CM_SURF_FMT_DEPTH32F_X24_STEN8 , 8 ,2} /**< depth + stencil, 64 bits per element packed as (@c XXXXXXXXXXXXXXXXXXXXXXXXSSSSSSSSDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD) */ + {CM_SURF_FMT_BGR10_X2, 4, 4}, /**< 4 component, unnormalized signed 10-bit integer value per + component packed as (@c XXRRRRRRRRRRGGGGGGGGGGBBBBBBBBBB)*/ + {CM_SURF_FMT_DEPTH32F, 4, 1}, /**< A one component, 32 float value per component */ + {CM_SURF_FMT_DEPTH16, 2, 1}, /**< A one component, 16 unsigned int value per component */ + {CM_SURF_FMT_DEPTH24_STEN8, 4, 1}, /**< A one component, 32 float value per component */ + {CM_SURF_FMT_DEPTH32F_X24_STEN8, 8, + 2} /**< depth + stencil, 64 bits per element packed as (@c + XXXXXXXXXXXXXXXXXXXXXXXXSSSSSSSSDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD) */ }; -__inline const MemFormatStruct& -memoryFormatSize(cmSurfFmt fmt) -{ - for (uint i = 0; i < sizeof(MemoryFormatSize) / sizeof(MemFormatStruct); ++i) { - if (MemoryFormatSize[i].format_ == fmt) { - return MemoryFormatSize[i]; - } +__inline const MemFormatStruct& memoryFormatSize(cmSurfFmt fmt) { + for (uint i = 0; i < sizeof(MemoryFormatSize) / sizeof(MemFormatStruct); ++i) { + if (MemoryFormatSize[i].format_ == 
fmt) { + return MemoryFormatSize[i]; } - assert (!"Unknown GSL memory format!"); - return MemoryFormatSize[0]; + } + assert(!"Unknown GSL memory format!"); + return MemoryFormatSize[0]; } -} // namespace gpu +} // namespace gpu -#endif // GPUDEFS_HPP_ +#endif // GPUDEFS_HPP_ diff --git a/rocclr/runtime/device/gpu/gpudevice.cpp b/rocclr/runtime/device/gpu/gpudevice.cpp index b061ae9df2..1460bd1d78 100644 --- a/rocclr/runtime/device/gpu/gpudevice.cpp +++ b/rocclr/runtime/device/gpu/gpudevice.cpp @@ -30,9 +30,9 @@ #include "CL/cl_d3d10.h" #include "CL/cl_d3d11.h" #include "CL/cl_dx9_media_sharing.h" -#endif // _WIN32 +#endif // _WIN32 -#include "os_if.h" // for osInit() +#include "os_if.h" // for osInit() #include #include @@ -43,22 +43,18 @@ #include "gpudebugmanager.hpp" -bool DeviceLoad() -{ - bool ret = false; +bool DeviceLoad() { + bool ret = false; - // Create online devices - ret |= gpu::Device::init(); - // Create offline GPU devices - ret |= gpu::NullDevice::init(); + // Create online devices + ret |= gpu::Device::init(); + // Create offline GPU devices + ret |= gpu::NullDevice::init(); - return ret; + return ret; } -void DeviceUnload() -{ - gpu::Device::tearDown(); -} +void DeviceUnload() { gpu::Device::tearDown(); } namespace gpu { @@ -67,79 +63,68 @@ aclCompiler* NullDevice::hsaCompiler_; AppProfile Device::appProfile_; NullDevice::NullDevice() - : amd::Device(NULL) - , calTarget_(static_cast(0)) - , hwInfo_(NULL) -{ -} + : amd::Device(NULL), calTarget_(static_cast(0)), hwInfo_(NULL) {} -bool -NullDevice::init() -{ - std::vector devices; +bool NullDevice::init() { + std::vector devices; - devices = getDevices(CL_DEVICE_TYPE_GPU, false); + devices = getDevices(CL_DEVICE_TYPE_GPU, false); - // Loop through all supported devices and create each of them - for (uint id = CAL_TARGET_TAHITI; id <= CAL_TARGET_LAST; ++id) { - bool foundActive = false; + // Loop through all supported devices and create each of them + for (uint id = CAL_TARGET_TAHITI; id <= 
CAL_TARGET_LAST; ++id) { + bool foundActive = false; - if (gpu::DeviceInfo[id].targetName_[0] == '\0') { - continue; - } - - // Loop through all active devices and see if we match one - for (uint i = 0; i < devices.size(); ++i) { - if (static_cast(devices[i])->calTarget() == - static_cast(id)) { - foundActive = true; - break; - } - } - - // Don't report an offline device if it's active - if (foundActive) { - continue; - } - - NullDevice* dev = new NullDevice(); - if (NULL != dev) { - if (!dev->create(static_cast(id))) { - delete dev; - } - else { - dev->registerDevice(); - } - } + if (gpu::DeviceInfo[id].targetName_[0] == '\0') { + continue; } - return true; + // Loop through all active devices and see if we match one + for (uint i = 0; i < devices.size(); ++i) { + if (static_cast(devices[i])->calTarget() == static_cast(id)) { + foundActive = true; + break; + } + } + + // Don't report an offline device if it's active + if (foundActive) { + continue; + } + + NullDevice* dev = new NullDevice(); + if (NULL != dev) { + if (!dev->create(static_cast(id))) { + delete dev; + } else { + dev->registerDevice(); + } + } + } + + return true; } -bool -NullDevice::create(CALtarget target) -{ - CALdeviceattribs calAttr = {0}; - gslMemInfo memInfo = {0}; +bool NullDevice::create(CALtarget target) { + CALdeviceattribs calAttr = {0}; + gslMemInfo memInfo = {0}; - online_ = false; + online_ = false; - calTarget_ = calAttr.target = target; - hwInfo_ = &DeviceInfo[calTarget_]; + calTarget_ = calAttr.target = target; + hwInfo_ = &DeviceInfo[calTarget_]; - assert((target >= CAL_TARGET_TAHITI) && - (target != CAL_TARGET_SCRAPPER) && - (target != CAL_TARGET_DEVASTATOR)); + assert((target >= CAL_TARGET_TAHITI) && (target != CAL_TARGET_SCRAPPER) && + (target != CAL_TARGET_DEVASTATOR)); - // Force double if it could be supported - switch (target) { + // Force double if it could be supported + switch (target) { case CAL_TARGET_PITCAIRN: case CAL_TARGET_CAPEVERDE: case CAL_TARGET_TAHITI: case 
CAL_TARGET_OLAND: case CAL_TARGET_HAINAN: - calAttr.doublePrecision = CAL_TRUE; - break; + calAttr.doublePrecision = CAL_TRUE; + break; case CAL_TARGET_BONAIRE: case CAL_TARGET_SPECTRE: case CAL_TARGET_SPOOKY: @@ -157,2325 +142,2115 @@ NullDevice::create(CALtarget target) case CAL_TARGET_LEXA: case CAL_TARGET_RAVEN: case CAL_TARGET_POLARIS22: - calAttr.doublePrecision = CAL_TRUE; - calAttr.isOpenCL200Device = CAL_TRUE; - break; + calAttr.doublePrecision = CAL_TRUE; + calAttr.isOpenCL200Device = CAL_TRUE; + break; default: - break; - } + break; + } - settings_ = new gpu::Settings(); - gpu::Settings* gpuSettings = reinterpret_cast(settings_); - // Create setting for the offline target - if ((gpuSettings == NULL) || !gpuSettings->create(calAttr - )) { - return false; - } + settings_ = new gpu::Settings(); + gpu::Settings* gpuSettings = reinterpret_cast(settings_); + // Create setting for the offline target + if ((gpuSettings == NULL) || !gpuSettings->create(calAttr)) { + return false; + } - // Report 512MB for all offline devices - memInfo.cardMemAvailableBytes = 512 * Mi; - memInfo.cardLargestFreeBlockBytes = 512 * Mi; - calAttr.localRAM = 512; + // Report 512MB for all offline devices + memInfo.cardMemAvailableBytes = 512 * Mi; + memInfo.cardLargestFreeBlockBytes = 512 * Mi; + calAttr.localRAM = 512; - // Fill the device info structure - fillDeviceInfo(calAttr, memInfo, 4096, 1, 0); + // Fill the device info structure + fillDeviceInfo(calAttr, memInfo, 4096, 1, 0); - if (NULL == compiler_) { + if (NULL == compiler_) { #if !defined(ATI_OS_LINUX) - char CompilerLibrary[220] = ""; - strcat_s(CompilerLibrary, "amdocl12cl" LP64_SWITCH("", "64") ".dll"); + char CompilerLibrary[220] = ""; + strcat_s(CompilerLibrary, "amdocl12cl" LP64_SWITCH("", "64") ".dll"); #endif - const char *library = getenv("COMPILER_LIBRARY"); - aclCompilerOptions opts = { - sizeof(aclCompilerOptions_0_8), + const char* library = getenv("COMPILER_LIBRARY"); + aclCompilerOptions opts = { + 
sizeof(aclCompilerOptions_0_8), #if defined(ATI_OS_LINUX) - library ? library : LINUX_ONLY("lib") "amdocl12cl" \ - LP64_SWITCH(LINUX_SWITCH("32",""),"64") LINUX_SWITCH(".so",".dll"), + library ? library : LINUX_ONLY("lib") "amdocl12cl" LP64_SWITCH(LINUX_SWITCH("32", ""), "64") + LINUX_SWITCH(".so", ".dll"), #else - library ? library : CompilerLibrary, + library ? library : CompilerLibrary, #endif - NULL, - NULL, - NULL, - NULL, - NULL, - AMD_OCL_SC_LIB - }; - compiler_ = aclCompilerInit(&opts, NULL); - } + NULL, + NULL, + NULL, + NULL, + NULL, + AMD_OCL_SC_LIB + }; + compiler_ = aclCompilerInit(&opts, NULL); + } - if (settings().hsail_ || (settings().oclVersion_ == OpenCL20)) { - // Runtime doesn't know what local size could be on the real board - info_.maxGlobalVariableSize_ = static_cast(512 * Mi); + if (settings().hsail_ || (settings().oclVersion_ == OpenCL20)) { + // Runtime doesn't know what local size could be on the real board + info_.maxGlobalVariableSize_ = static_cast(512 * Mi); - if (NULL == hsaCompiler_) { - const char* library = getenv("HSA_COMPILER_LIBRARY"); - aclCompilerOptions opts = { - sizeof(aclCompilerOptions_0_8), - library, - NULL, - NULL, - NULL, - NULL, - NULL, - AMD_OCL_SC_LIB - }; - // Initialize the compiler handle - acl_error error; - hsaCompiler_ = aclCompilerInit(&opts, &error); - if (error != ACL_SUCCESS) { - LogError("Error initializing the compiler"); - return false; - } - } - } - - return true; -} - -bool -NullDevice::isHsailProgram(amd::option::Options* options) { - bool isCIPlus = settings().ciPlus_; - bool isHSAILcapable = settings().hsail_; - bool isBlit = false; - bool isSPIRV = false; - bool isLangExt = false; - bool isClang = false; - bool isEDG = false; - bool isLegacy = false; - bool isOCL20 = false; - std::vector optvec; - bool isInputOptions = false; - if (options != NULL) { - optvec.push_back(options); - isInputOptions = true; - } - amd::option::Options parsedOptions; - if (!amd::Program::ParseAllOptions("", 
parsedOptions)) { - return NULL; - } - optvec.push_back(&parsedOptions); - for (auto const op : optvec) { - // TODO: Remove isOCL20 related code from this function along with switching HSAIL by default - if (isCIPlus && amd::Program::GetOclCVersion(op->oVariables->CLStd) >= 20) { - isOCL20 = true; - } - if (op->oVariables->clInternalKernel) { - isBlit = true; - break; - } - if (!isLegacy) { - isLegacy = op->oVariables->Legacy; - } - if (!isLangExt) { - isLangExt = op->isCStrOptionsEqual(op->oVariables->XLang, "clc++") || - op->isCStrOptionsEqual(op->oVariables->XLang, "spir"); - } - // Checks Frontend option only from input *options, not from Env, - // because they might be only calculated by RT based on the binaries to link. - // -frontend is being queried now instead of -cl-std=CL2.0, because the last one - // is not an indicator for HSAIL path anymore. - // TODO: Revise these binary's target checks - // and possibly remove them after switching to HSAIL by default. - if (isInputOptions) { - if (!isClang) { - isClang = op->isCStrOptionsEqual(op->oVariables->Frontend, "clang"); - } - if (!isEDG) { - isEDG = op->isCStrOptionsEqual(op->oVariables->Frontend, "edg"); - } - } - if (!isSPIRV) { - isSPIRV = op->oVariables->BinaryIsSpirv; - } - isInputOptions = false; - } - if (isSPIRV || (isBlit && isCIPlus && isHSAILcapable) || isClang || isOCL20) { - return true; - } - if (isLegacy || !isHSAILcapable || isEDG || isLangExt) { + if (NULL == hsaCompiler_) { + const char* library = getenv("HSA_COMPILER_LIBRARY"); + aclCompilerOptions opts = { + sizeof(aclCompilerOptions_0_8), library, NULL, NULL, NULL, NULL, NULL, AMD_OCL_SC_LIB}; + // Initialize the compiler handle + acl_error error; + hsaCompiler_ = aclCompilerInit(&opts, &error); + if (error != ACL_SUCCESS) { + LogError("Error initializing the compiler"); return false; + } } + } + + return true; +} + +bool NullDevice::isHsailProgram(amd::option::Options* options) { + bool isCIPlus = settings().ciPlus_; + bool 
isHSAILcapable = settings().hsail_; + bool isBlit = false; + bool isSPIRV = false; + bool isLangExt = false; + bool isClang = false; + bool isEDG = false; + bool isLegacy = false; + bool isOCL20 = false; + std::vector optvec; + bool isInputOptions = false; + if (options != NULL) { + optvec.push_back(options); + isInputOptions = true; + } + amd::option::Options parsedOptions; + if (!amd::Program::ParseAllOptions("", parsedOptions)) { + return NULL; + } + optvec.push_back(&parsedOptions); + for (auto const op : optvec) { + // TODO: Remove isOCL20 related code from this function along with switching HSAIL by default + if (isCIPlus && amd::Program::GetOclCVersion(op->oVariables->CLStd) >= 20) { + isOCL20 = true; + } + if (op->oVariables->clInternalKernel) { + isBlit = true; + break; + } + if (!isLegacy) { + isLegacy = op->oVariables->Legacy; + } + if (!isLangExt) { + isLangExt = op->isCStrOptionsEqual(op->oVariables->XLang, "clc++") || + op->isCStrOptionsEqual(op->oVariables->XLang, "spir"); + } + // Checks Frontend option only from input *options, not from Env, + // because they might be only calculated by RT based on the binaries to link. + // -frontend is being queried now instead of -cl-std=CL2.0, because the last one + // is not an indicator for HSAIL path anymore. + // TODO: Revise these binary's target checks + // and possibly remove them after switching to HSAIL by default. 
+ if (isInputOptions) { + if (!isClang) { + isClang = op->isCStrOptionsEqual(op->oVariables->Frontend, "clang"); + } + if (!isEDG) { + isEDG = op->isCStrOptionsEqual(op->oVariables->Frontend, "edg"); + } + } + if (!isSPIRV) { + isSPIRV = op->oVariables->BinaryIsSpirv; + } + isInputOptions = false; + } + if (isSPIRV || (isBlit && isCIPlus && isHSAILcapable) || isClang || isOCL20) { return true; + } + if (isLegacy || !isHSAILcapable || isEDG || isLangExt) { + return false; + } + return true; } -device::Program* -NullDevice::createProgram(amd::option::Options* options) -{ - if (isHsailProgram(options)) { - return new HSAILProgram(*this); - } - return new NullProgram(*this); +device::Program* NullDevice::createProgram(amd::option::Options* options) { + if (isHsailProgram(options)) { + return new HSAILProgram(*this); + } + return new NullProgram(*this); } -void -NullDevice::fillDeviceInfo( - const CALdeviceattribs& calAttr, - const gslMemInfo& memInfo, - size_t maxTextureSize, - uint numComputeRings, - uint numComputeRingsRT - ) -{ - info_.type_ = CL_DEVICE_TYPE_GPU; - info_.vendorId_ = 0x1002; - info_.maxComputeUnits_ = calAttr.numberOfSIMD; - info_.maxWorkItemDimensions_ = 3; - info_.numberOfShaderEngines = calAttr.numberOfShaderEngines; +void NullDevice::fillDeviceInfo(const CALdeviceattribs& calAttr, const gslMemInfo& memInfo, + size_t maxTextureSize, uint numComputeRings, + uint numComputeRingsRT) { + info_.type_ = CL_DEVICE_TYPE_GPU; + info_.vendorId_ = 0x1002; + info_.maxComputeUnits_ = calAttr.numberOfSIMD; + info_.maxWorkItemDimensions_ = 3; + info_.numberOfShaderEngines = calAttr.numberOfShaderEngines; - // SI parts are scalar. Also, reads don't need to be 128-bits to get peak rates. - // For example, float4 is not faster than float as long as all threads fetch the same - // amount of data and the reads are coalesced. This is from the H/W team and confirmed - // through experimentation. May also be true on EG/NI, but no point in confusing - // developers now. 
- info_.nativeVectorWidthChar_ = info_.preferredVectorWidthChar_ = 4; - info_.nativeVectorWidthShort_ = info_.preferredVectorWidthShort_ = 2; - info_.nativeVectorWidthInt_ = info_.preferredVectorWidthInt_ = 1; - info_.nativeVectorWidthLong_ = info_.preferredVectorWidthLong_ = 1; - info_.nativeVectorWidthFloat_ = info_.preferredVectorWidthFloat_ = 1; - info_.nativeVectorWidthDouble_ = info_.preferredVectorWidthDouble_ = - (settings().checkExtension(ClKhrFp64)) ? 1 : 0; - info_.nativeVectorWidthHalf_ = info_.preferredVectorWidthHalf_ = 0; // no half support + // SI parts are scalar. Also, reads don't need to be 128-bits to get peak rates. + // For example, float4 is not faster than float as long as all threads fetch the same + // amount of data and the reads are coalesced. This is from the H/W team and confirmed + // through experimentation. May also be true on EG/NI, but no point in confusing + // developers now. + info_.nativeVectorWidthChar_ = info_.preferredVectorWidthChar_ = 4; + info_.nativeVectorWidthShort_ = info_.preferredVectorWidthShort_ = 2; + info_.nativeVectorWidthInt_ = info_.preferredVectorWidthInt_ = 1; + info_.nativeVectorWidthLong_ = info_.preferredVectorWidthLong_ = 1; + info_.nativeVectorWidthFloat_ = info_.preferredVectorWidthFloat_ = 1; + info_.nativeVectorWidthDouble_ = info_.preferredVectorWidthDouble_ = + (settings().checkExtension(ClKhrFp64)) ? 1 : 0; + info_.nativeVectorWidthHalf_ = info_.preferredVectorWidthHalf_ = 0; // no half support - info_.maxClockFrequency_ = (calAttr.engineClock != 0) ? calAttr.engineClock : 555; - info_.maxParameterSize_ = 1024; - info_.minDataTypeAlignSize_ = sizeof(cl_long16); - info_.singleFPConfig_ = CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO - | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_FMA; + info_.maxClockFrequency_ = (calAttr.engineClock != 0) ? 
calAttr.engineClock : 555; + info_.maxParameterSize_ = 1024; + info_.minDataTypeAlignSize_ = sizeof(cl_long16); + info_.singleFPConfig_ = + CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_FMA; - if (settings().singleFpDenorm_) { - info_.singleFPConfig_ |= CL_FP_DENORM; - } + if (settings().singleFpDenorm_) { + info_.singleFPConfig_ |= CL_FP_DENORM; + } - if (settings().checkExtension(ClKhrFp64)) { - info_.doubleFPConfig_ = info_.singleFPConfig_ | CL_FP_DENORM; - } + if (settings().checkExtension(ClKhrFp64)) { + info_.doubleFPConfig_ = info_.singleFPConfig_ | CL_FP_DENORM; + } - if (settings().reportFMA_) { - info_.singleFPConfig_ |= CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT; - } + if (settings().reportFMA_) { + info_.singleFPConfig_ |= CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT; + } - info_.globalMemCacheLineSize_ = settings().cacheLineSize_; - info_.globalMemCacheSize_ = settings().cacheSize_; - if ((settings().cacheLineSize_ != 0) || (settings().cacheSize_ != 0)) { - info_.globalMemCacheType_ = CL_READ_WRITE_CACHE; - } - else { - info_.globalMemCacheType_ = CL_NONE; - } + info_.globalMemCacheLineSize_ = settings().cacheLineSize_; + info_.globalMemCacheSize_ = settings().cacheSize_; + if ((settings().cacheLineSize_ != 0) || (settings().cacheSize_ != 0)) { + info_.globalMemCacheType_ = CL_READ_WRITE_CACHE; + } else { + info_.globalMemCacheType_ = CL_NONE; + } #if defined(ATI_OS_LINUX) - info_.globalMemSize_ = - (static_cast(std::min(GPU_MAX_HEAP_SIZE, 100u)) * - // globalMemSize is the actual available size for app on Linux - // Because Linux base driver doesn't support paging - static_cast(memInfo.cardMemAvailableBytes + memInfo.cardExtMemAvailableBytes) / 100u); + info_.globalMemSize_ = + (static_cast(std::min(GPU_MAX_HEAP_SIZE, 100u)) * + // globalMemSize is the actual available size for app on Linux + // Because Linux base driver doesn't support paging + static_cast(memInfo.cardMemAvailableBytes + memInfo.cardExtMemAvailableBytes) / 
+ 100u); #else - info_.globalMemSize_ = - (static_cast(std::min(GPU_MAX_HEAP_SIZE, 100u)) * - static_cast(calAttr.localRAM) / 100u) * Mi; + info_.globalMemSize_ = (static_cast(std::min(GPU_MAX_HEAP_SIZE, 100u)) * + static_cast(calAttr.localRAM) / 100u) * + Mi; #endif - int uswcPercentAvailable = (calAttr.uncachedRemoteRAM > 1536 && IS_WINDOWS) ? 75 : 50; - if (settings().apuSystem_) { - info_.globalMemSize_ += - (static_cast(calAttr.uncachedRemoteRAM) * Mi * uswcPercentAvailable)/100; - } + int uswcPercentAvailable = (calAttr.uncachedRemoteRAM > 1536 && IS_WINDOWS) ? 75 : 50; + if (settings().apuSystem_) { + info_.globalMemSize_ += + (static_cast(calAttr.uncachedRemoteRAM) * Mi * uswcPercentAvailable) / 100; + } - // We try to calculate the largest available memory size from - // the largest available block in either heap. In theory this - // should be the size we can actually allocate at application - // start. Note that it may not be a guarantee still as the - // application progresses. +// We try to calculate the largest available memory size from +// the largest available block in either heap. In theory this +// should be the size we can actually allocate at application +// start. Note that it may not be a guarantee still as the +// application progresses. 
#if defined(BRAHMA) && defined(ATI_BITS_64) - info_.maxMemAllocSize_ = std::max( - cl_ulong(memInfo.cardMemAvailableBytes), - cl_ulong(memInfo.cardExtMemAvailableBytes)); + info_.maxMemAllocSize_ = + std::max(cl_ulong(memInfo.cardMemAvailableBytes), cl_ulong(memInfo.cardExtMemAvailableBytes)); #else - info_.maxMemAllocSize_ = std::max( - cl_ulong(memInfo.cardLargestFreeBlockBytes), - cl_ulong(memInfo.cardExtLargestFreeBlockBytes)); + info_.maxMemAllocSize_ = std::max(cl_ulong(memInfo.cardLargestFreeBlockBytes), + cl_ulong(memInfo.cardExtLargestFreeBlockBytes)); #endif - if (settings().apuSystem_) { - info_.maxMemAllocSize_ = std::max( - (static_cast(calAttr.uncachedRemoteRAM) * Mi * uswcPercentAvailable)/100, - info_.maxMemAllocSize_); + if (settings().apuSystem_) { + info_.maxMemAllocSize_ = std::max( + (static_cast(calAttr.uncachedRemoteRAM) * Mi * uswcPercentAvailable) / 100, + info_.maxMemAllocSize_); + } + info_.maxMemAllocSize_ = + cl_ulong(info_.maxMemAllocSize_ * std::min(GPU_SINGLE_ALLOC_PERCENT, 100u) / 100u); + + //! \note Force max single allocation size. + //! 4GB limit for the blit kernels and 64 bit optimizations. 
+ info_.maxMemAllocSize_ = + std::min(info_.maxMemAllocSize_, static_cast(settings().maxAllocSize_)); + + if (info_.maxMemAllocSize_ < cl_ulong(128 * Mi)) { + LogError( + "We are unable to get a heap large enough to support the OpenCL minimum " + "requirement for FULL_PROFILE"); + } + + info_.maxMemAllocSize_ = std::max(cl_ulong(128 * Mi), info_.maxMemAllocSize_); + + // Clamp max single alloc size to the globalMemSize since it's + // reduced by default + info_.maxMemAllocSize_ = std::min(info_.maxMemAllocSize_, info_.globalMemSize_); + + // We need to verify that we are not reporting more global memory + // that 4x single alloc + info_.globalMemSize_ = std::min(4 * info_.maxMemAllocSize_, info_.globalMemSize_); + + // Use 64 bit pointers + if (settings().use64BitPtr_) { + info_.addressBits_ = 64; + } else { + info_.addressBits_ = 32; + // Limit total size with 3GB for 32 bit + info_.globalMemSize_ = std::min(info_.globalMemSize_, cl_ulong(3 * Gi)); + } + + // Alignment in BITS of the base address of any allocated memory object + static const size_t MemBaseAlignment = 256; + //! @note Force 256 bytes alignment, since currently + //! calAttr.surface_alignment returns 4KB. For pinned memory runtime + //! should be able to create a view with 256 bytes alignement + info_.memBaseAddrAlign_ = 8 * MemBaseAlignment; + + info_.maxConstantBufferSize_ = (settings().ciPlus_) ? 
info_.maxMemAllocSize_ : 64 * Ki; + info_.maxConstantArgs_ = MaxConstArguments; + + // Image support fields + if (settings().imageSupport_) { + info_.imageSupport_ = CL_TRUE; + info_.maxSamplers_ = MaxSamplers; + info_.maxReadImageArgs_ = MaxReadImage; + info_.maxWriteImageArgs_ = MaxWriteImage; + info_.image2DMaxWidth_ = maxTextureSize; + info_.image2DMaxHeight_ = maxTextureSize; + info_.image3DMaxWidth_ = std::min(2 * Ki, maxTextureSize); + info_.image3DMaxHeight_ = std::min(2 * Ki, maxTextureSize); + info_.image3DMaxDepth_ = std::min(2 * Ki, maxTextureSize); + + info_.imagePitchAlignment_ = 256; // XXX: 256 pixel pitch alignment for now + info_.imageBaseAddressAlignment_ = 256; // XXX: 256 byte base address alignment for now + + info_.bufferFromImageSupport_ = CL_TRUE; + } + + info_.errorCorrectionSupport_ = CL_FALSE; + + if (settings().apuSystem_) { + info_.hostUnifiedMemory_ = CL_TRUE; + } + + info_.profilingTimerResolution_ = 1; + info_.profilingTimerOffset_ = amd::Os::offsetToEpochNanos(); + info_.littleEndian_ = CL_TRUE; + info_.available_ = CL_TRUE; + info_.compilerAvailable_ = CL_TRUE; + info_.linkerAvailable_ = CL_TRUE; + + info_.executionCapabilities_ = CL_EXEC_KERNEL; + info_.preferredPlatformAtomicAlignment_ = 0; + info_.preferredGlobalAtomicAlignment_ = 0; + info_.preferredLocalAtomicAlignment_ = 0; + info_.queueProperties_ = CL_QUEUE_PROFILING_ENABLE; + + info_.platform_ = AMD_PLATFORM; + + if ((calTarget() == CAL_TARGET_CARRIZO) && ASICREV_IS_CARRIZO_BRISTOL(calAttr.asicRevision)) { + const static char* bristol = "Bristol Ridge"; + ::strcpy(info_.name_, bristol); + } else { + ::strcpy(info_.name_, hwInfo()->targetName_); + } + ::strcpy(info_.vendor_, "Advanced Micro Devices, Inc."); + ::snprintf(info_.driverVersion_, sizeof(info_.driverVersion_) - 1, AMD_BUILD_STRING); + + info_.profile_ = "FULL_PROFILE"; + if (settings().oclVersion_ == OpenCL20) { + info_.version_ = "OpenCL 2.0 " AMD_PLATFORM_INFO; + info_.oclcVersion_ = "OpenCL C 2.0 "; + 
info_.spirVersions_ = "1.2"; + } else if (settings().oclVersion_ == OpenCL12) { + info_.version_ = "OpenCL 1.2 " AMD_PLATFORM_INFO; + info_.oclcVersion_ = "OpenCL C 1.2 "; + info_.spirVersions_ = "1.2"; + } else { + info_.version_ = "OpenCL 1.0 " AMD_PLATFORM_INFO; + info_.oclcVersion_ = "OpenCL C 1.0 "; + info_.spirVersions_ = ""; + LogError("Unknown version for support"); + } + + // Fill workgroup info size + info_.maxWorkGroupSize_ = settings().maxWorkGroupSize_; + info_.maxWorkItemSizes_[0] = info_.maxWorkGroupSize_; + info_.maxWorkItemSizes_[1] = info_.maxWorkGroupSize_; + info_.maxWorkItemSizes_[2] = info_.maxWorkGroupSize_; + + if (settings().hwLDSSize_ != 0) { + info_.localMemType_ = CL_LOCAL; + info_.localMemSize_ = settings().hwLDSSize_; + } else { + info_.localMemType_ = CL_GLOBAL; + info_.localMemSize_ = 16 * Ki; + } + + info_.extensions_ = getExtensionString(); + + info_.deviceTopology_.pcie.type = CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD; + info_.deviceTopology_.pcie.bus = (calAttr.pciTopologyInformation & (0xFF << 8)) >> 8; + info_.deviceTopology_.pcie.device = (calAttr.pciTopologyInformation & (0x1F << 3)) >> 3; + info_.deviceTopology_.pcie.function = (calAttr.pciTopologyInformation & 0x07); + + ::strncpy(info_.boardName_, calAttr.boardName, sizeof(info_.boardName_)); + ::strncpy(info_.driverStore_, calAttr.driverStore, sizeof(info_.driverStore_)); + + // OpenCL1.2 device info fields + info_.builtInKernels_ = ""; + info_.imageMaxBufferSize_ = MaxImageBufferSize; + info_.imageMaxArraySize_ = MaxImageArraySize; + info_.preferredInteropUserSync_ = true; + info_.printfBufferSize_ = PrintfDbg::WorkitemDebugSize * info().maxWorkGroupSize_; + + if (settings().oclVersion_ >= OpenCL20) { + info_.svmCapabilities_ = (CL_DEVICE_SVM_COARSE_GRAIN_BUFFER | CL_DEVICE_SVM_FINE_GRAIN_BUFFER); + if (settings().svmAtomics_) { + info_.svmCapabilities_ |= CL_DEVICE_SVM_ATOMICS; } - info_.maxMemAllocSize_ = cl_ulong(info_.maxMemAllocSize_ * - std::min(GPU_SINGLE_ALLOC_PERCENT, 
100u) / 100u); - - //! \note Force max single allocation size. - //! 4GB limit for the blit kernels and 64 bit optimizations. - info_.maxMemAllocSize_ = std::min(info_.maxMemAllocSize_, - static_cast(settings().maxAllocSize_)); - - if (info_.maxMemAllocSize_ < cl_ulong(128 * Mi)) { - LogError("We are unable to get a heap large enough to support the OpenCL minimum "\ - "requirement for FULL_PROFILE"); + if (settings().svmFineGrainSystem_) { + info_.svmCapabilities_ |= CL_DEVICE_SVM_FINE_GRAIN_SYSTEM; } + // OpenCL2.0 device info fields + info_.maxWriteImageArgs_ = MaxReadWriteImage; //!< For compatibility + info_.maxReadWriteImageArgs_ = MaxReadWriteImage; - info_.maxMemAllocSize_ = std::max(cl_ulong(128 * Mi), info_.maxMemAllocSize_); + info_.maxPipePacketSize_ = info_.maxMemAllocSize_; + info_.maxPipeActiveReservations_ = 16; + info_.maxPipeArgs_ = 16; - // Clamp max single alloc size to the globalMemSize since it's - // reduced by default - info_.maxMemAllocSize_ = std::min(info_.maxMemAllocSize_, info_.globalMemSize_); + info_.queueOnDeviceProperties_ = + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE; + info_.queueOnDevicePreferredSize_ = 256 * Ki; + info_.queueOnDeviceMaxSize_ = 8 * Mi; + info_.maxOnDeviceQueues_ = 1; + info_.maxOnDeviceEvents_ = settings().numDeviceEvents_; + info_.globalVariablePreferredTotalSize_ = static_cast(info_.globalMemSize_); + //! \todo Remove % calculation. + //! Use 90% of max single alloc size. + //! 
Boards with max single alloc size around 4GB will fail allocations + info_.maxGlobalVariableSize_ = + static_cast(amd::alignDown(info_.maxMemAllocSize_ * 9 / 10, 256)); + } - // We need to verify that we are not reporting more global memory - // that 4x single alloc - info_.globalMemSize_ = std::min( 4 * info_.maxMemAllocSize_, info_.globalMemSize_); - - // Use 64 bit pointers - if (settings().use64BitPtr_) { - info_.addressBits_ = 64; - } - else { - info_.addressBits_ = 32; - // Limit total size with 3GB for 32 bit - info_.globalMemSize_ = std::min(info_.globalMemSize_, cl_ulong(3 * Gi)); - } - - // Alignment in BITS of the base address of any allocated memory object - static const size_t MemBaseAlignment = 256; - //! @note Force 256 bytes alignment, since currently - //! calAttr.surface_alignment returns 4KB. For pinned memory runtime - //! should be able to create a view with 256 bytes alignement - info_.memBaseAddrAlign_ = 8 * MemBaseAlignment; - - info_.maxConstantBufferSize_ = - (settings().ciPlus_) ? 
info_.maxMemAllocSize_ : 64 * Ki; - info_.maxConstantArgs_ = MaxConstArguments; - - // Image support fields - if (settings().imageSupport_) { - info_.imageSupport_ = CL_TRUE; - info_.maxSamplers_ = MaxSamplers; - info_.maxReadImageArgs_ = MaxReadImage; - info_.maxWriteImageArgs_ = MaxWriteImage; - info_.image2DMaxWidth_ = maxTextureSize; - info_.image2DMaxHeight_ = maxTextureSize; - info_.image3DMaxWidth_ = std::min(2 * Ki, maxTextureSize); - info_.image3DMaxHeight_ = std::min(2 * Ki, maxTextureSize); - info_.image3DMaxDepth_ = std::min(2 * Ki, maxTextureSize); - - info_.imagePitchAlignment_ = 256; // XXX: 256 pixel pitch alignment for now - info_.imageBaseAddressAlignment_ = 256; // XXX: 256 byte base address alignment for now - - info_.bufferFromImageSupport_ = CL_TRUE; - } - - info_.errorCorrectionSupport_ = CL_FALSE; - - if (settings().apuSystem_) { - info_.hostUnifiedMemory_ = CL_TRUE; - } - - info_.profilingTimerResolution_ = 1; - info_.profilingTimerOffset_ = amd::Os::offsetToEpochNanos(); - info_.littleEndian_ = CL_TRUE; - info_.available_ = CL_TRUE; - info_.compilerAvailable_ = CL_TRUE; - info_.linkerAvailable_ = CL_TRUE; - - info_.executionCapabilities_ = CL_EXEC_KERNEL; - info_.preferredPlatformAtomicAlignment_ = 0; - info_.preferredGlobalAtomicAlignment_ = 0; - info_.preferredLocalAtomicAlignment_ = 0; - info_.queueProperties_ = CL_QUEUE_PROFILING_ENABLE; - - info_.platform_ = AMD_PLATFORM; - - if ((calTarget() == CAL_TARGET_CARRIZO) && - ASICREV_IS_CARRIZO_BRISTOL(calAttr.asicRevision)) { - const static char* bristol = "Bristol Ridge"; - ::strcpy(info_.name_, bristol); - } - else { - ::strcpy(info_.name_, hwInfo()->targetName_); - } - ::strcpy(info_.vendor_, "Advanced Micro Devices, Inc."); - ::snprintf(info_.driverVersion_, sizeof(info_.driverVersion_) - 1, - AMD_BUILD_STRING); - - info_.profile_ = "FULL_PROFILE"; - if (settings().oclVersion_ == OpenCL20) { - info_.version_ = "OpenCL 2.0 " AMD_PLATFORM_INFO; - info_.oclcVersion_ = "OpenCL C 2.0 "; - 
info_.spirVersions_ = "1.2"; - } - else if (settings().oclVersion_ == OpenCL12) { - info_.version_ = "OpenCL 1.2 " AMD_PLATFORM_INFO; - info_.oclcVersion_ = "OpenCL C 1.2 "; - info_.spirVersions_ = "1.2"; - } - else { - info_.version_ = "OpenCL 1.0 " AMD_PLATFORM_INFO; - info_.oclcVersion_ = "OpenCL C 1.0 "; - info_.spirVersions_ = ""; - LogError("Unknown version for support"); - } - - // Fill workgroup info size - info_.maxWorkGroupSize_ = settings().maxWorkGroupSize_; - info_.maxWorkItemSizes_[0] = info_.maxWorkGroupSize_; - info_.maxWorkItemSizes_[1] = info_.maxWorkGroupSize_; - info_.maxWorkItemSizes_[2] = info_.maxWorkGroupSize_; - - if (settings().hwLDSSize_ != 0) { - info_.localMemType_ = CL_LOCAL; - info_.localMemSize_ = settings().hwLDSSize_; - } - else { - info_.localMemType_ = CL_GLOBAL; - info_.localMemSize_ = 16 * Ki; - } - - info_.extensions_ = getExtensionString(); - - info_.deviceTopology_.pcie.type = CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD; - info_.deviceTopology_.pcie.bus = (calAttr.pciTopologyInformation&(0xFF<<8))>>8; - info_.deviceTopology_.pcie.device = (calAttr.pciTopologyInformation&(0x1F<<3))>>3; - info_.deviceTopology_.pcie.function = (calAttr.pciTopologyInformation&0x07); - - ::strncpy(info_.boardName_, calAttr.boardName, sizeof(info_.boardName_)); - ::strncpy(info_.driverStore_, calAttr.driverStore, sizeof(info_.driverStore_)); - - // OpenCL1.2 device info fields - info_.builtInKernels_ = ""; - info_.imageMaxBufferSize_ = MaxImageBufferSize; - info_.imageMaxArraySize_ = MaxImageArraySize; - info_.preferredInteropUserSync_ = true; - info_.printfBufferSize_ = PrintfDbg::WorkitemDebugSize * info().maxWorkGroupSize_; - - if (settings().oclVersion_ >= OpenCL20) { - info_.svmCapabilities_ = - (CL_DEVICE_SVM_COARSE_GRAIN_BUFFER | CL_DEVICE_SVM_FINE_GRAIN_BUFFER); - if (settings().svmAtomics_) { - info_.svmCapabilities_ |= CL_DEVICE_SVM_ATOMICS; - } - if (settings().svmFineGrainSystem_) { - info_.svmCapabilities_ |= CL_DEVICE_SVM_FINE_GRAIN_SYSTEM; - } 
- // OpenCL2.0 device info fields - info_.maxWriteImageArgs_ = MaxReadWriteImage; //!< For compatibility - info_.maxReadWriteImageArgs_ = MaxReadWriteImage; - - info_.maxPipePacketSize_ = info_.maxMemAllocSize_; - info_.maxPipeActiveReservations_ = 16; - info_.maxPipeArgs_ = 16; - - info_.queueOnDeviceProperties_ = - CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE; - info_.queueOnDevicePreferredSize_ = 256 * Ki; - info_.queueOnDeviceMaxSize_ = 8 * Mi; - info_.maxOnDeviceQueues_ = 1; - info_.maxOnDeviceEvents_ = settings().numDeviceEvents_; - info_.globalVariablePreferredTotalSize_ = static_cast(info_.globalMemSize_); - //! \todo Remove % calculation. - //! Use 90% of max single alloc size. - //! Boards with max single alloc size around 4GB will fail allocations - info_.maxGlobalVariableSize_ = static_cast( - amd::alignDown(info_.maxMemAllocSize_ * 9 / 10, 256)); - } - - if (settings().checkExtension(ClAmdDeviceAttributeQuery)) { - info_.simdPerCU_ = hwInfo()->simdPerCU_; - info_.simdWidth_ = hwInfo()->simdWidth_; - info_.simdInstructionWidth_ = hwInfo()->simdInstructionWidth_; - info_.wavefrontWidth_ = calAttr.wavefrontSize; - info_.globalMemChannels_ = calAttr.memBusWidth / 32; - info_.globalMemChannelBanks_ = calAttr.numMemBanks; - info_.globalMemChannelBankWidth_ = hwInfo()->memChannelBankWidth_; - info_.localMemSizePerCU_ = hwInfo()->localMemSizePerCU_; - info_.localMemBanks_ = hwInfo()->localMemBanks_; - info_.gfxipVersion_ = hwInfo()->gfxipVersion_; - info_.numAsyncQueues_ = numComputeRings; - info_.numRTQueues_ = numComputeRingsRT; - info_.numRTCUs_ = calAttr.maxRTCUs; - info_.threadTraceEnable_ = settings().threadTraceEnable_; - } + if (settings().checkExtension(ClAmdDeviceAttributeQuery)) { + info_.simdPerCU_ = hwInfo()->simdPerCU_; + info_.simdWidth_ = hwInfo()->simdWidth_; + info_.simdInstructionWidth_ = hwInfo()->simdInstructionWidth_; + info_.wavefrontWidth_ = calAttr.wavefrontSize; + info_.globalMemChannels_ = calAttr.memBusWidth / 
32; + info_.globalMemChannelBanks_ = calAttr.numMemBanks; + info_.globalMemChannelBankWidth_ = hwInfo()->memChannelBankWidth_; + info_.localMemSizePerCU_ = hwInfo()->localMemSizePerCU_; + info_.localMemBanks_ = hwInfo()->localMemBanks_; + info_.gfxipVersion_ = hwInfo()->gfxipVersion_; + info_.numAsyncQueues_ = numComputeRings; + info_.numRTQueues_ = numComputeRingsRT; + info_.numRTCUs_ = calAttr.maxRTCUs; + info_.threadTraceEnable_ = settings().threadTraceEnable_; + } } -bool -Device::Heap::create(Device& device) -{ - // Create global GPU heap - resource_ = new Memory(device, 0); - if (resource_ == NULL) { - return false; - } +bool Device::Heap::create(Device& device) { + // Create global GPU heap + resource_ = new Memory(device, 0); + if (resource_ == NULL) { + return false; + } - if (!resource_->create(Resource::Heap)) { - return false; - } + if (!resource_->create(Resource::Heap)) { + return false; + } - baseAddress_ = resource_->gslResource()->getSurfaceAddress(); - return true; + baseAddress_ = resource_->gslResource()->getSurfaceAddress(); + return true; } -void -Device::Engines::create(uint num, gslEngineDescriptor* desc, uint maxNumComputeRings) -{ - numComputeRings_ = 0; - numComputeRingsRT_ = 0; - numDmaEngines_ = 0; +void Device::Engines::create(uint num, gslEngineDescriptor* desc, uint maxNumComputeRings) { + numComputeRings_ = 0; + numComputeRingsRT_ = 0; + numDmaEngines_ = 0; - for (uint i = 0; i < num; ++i) { - desc_[desc[i].id] = desc[i]; - desc_[desc[i].id].priority = GSL_ENGINEPRIORITY_NEUTRAL; + for (uint i = 0; i < num; ++i) { + desc_[desc[i].id] = desc[i]; + desc_[desc[i].id].priority = GSL_ENGINEPRIORITY_NEUTRAL; - if (desc[i].id >= GSL_ENGINEID_COMPUTE0 && - desc[i].id <= GSL_ENGINEID_COMPUTE7) { - numComputeRings_++; - } - - if (desc[i].id == GSL_ENGINEID_COMPUTE_RT) { - numComputeRingsRT_++; - } - if (desc[i].id == GSL_ENGINEID_COMPUTE_MEDIUM_PRIORITY) { - numComputeRingsRT_++; - } - - if (desc[i].id >= GSL_ENGINEID_DRMDMA0 && - desc[i].id 
<= GSL_ENGINEID_DRMDMA1) { - numDmaEngines_++; - } + if (desc[i].id >= GSL_ENGINEID_COMPUTE0 && desc[i].id <= GSL_ENGINEID_COMPUTE7) { + numComputeRings_++; } - numComputeRings_ = std::min(numComputeRings_, maxNumComputeRings); -} - -uint -Device::Engines::getRequested(uint engines, gslEngineDescriptor* desc) const -{ - uint slot = 0; - for (uint i = 0; i < GSL_ENGINEID_MAX; ++i) { - if ((engines & getMask(static_cast(i))) && - (desc_[i].id == static_cast(i))) { - desc[slot] = desc_[i]; - engines &= ~getMask(static_cast(i)); - slot++; - } + if (desc[i].id == GSL_ENGINEID_COMPUTE_RT) { + numComputeRingsRT_++; } - return (engines == 0) ? slot : 0; -} - -Device::XferBuffers::~XferBuffers() -{ - // Destroy temporary buffer for reads - for (const auto& buf : freeBuffers_) { - // CPU optimization: unmap staging buffer just once - if (!buf->cal()->cardMemory_) { - buf->unmap(NULL); - } - delete buf; + if (desc[i].id == GSL_ENGINEID_COMPUTE_MEDIUM_PRIORITY) { + numComputeRingsRT_++; } - freeBuffers_.clear(); + + if (desc[i].id >= GSL_ENGINEID_DRMDMA0 && desc[i].id <= GSL_ENGINEID_DRMDMA1) { + numDmaEngines_++; + } + } + + numComputeRings_ = std::min(numComputeRings_, maxNumComputeRings); } -bool -Device::XferBuffers::create() -{ - Memory* xferBuf = NULL; - bool result = false; - // Create a buffer object +uint Device::Engines::getRequested(uint engines, gslEngineDescriptor* desc) const { + uint slot = 0; + for (uint i = 0; i < GSL_ENGINEID_MAX; ++i) { + if ((engines & getMask(static_cast(i))) && + (desc_[i].id == static_cast(i))) { + desc[slot] = desc_[i]; + engines &= ~getMask(static_cast(i)); + slot++; + } + } + return (engines == 0) ? 
slot : 0; +} + +Device::XferBuffers::~XferBuffers() { + // Destroy temporary buffer for reads + for (const auto& buf : freeBuffers_) { + // CPU optimization: unmap staging buffer just once + if (!buf->cal()->cardMemory_) { + buf->unmap(NULL); + } + delete buf; + } + freeBuffers_.clear(); +} + +bool Device::XferBuffers::create() { + Memory* xferBuf = NULL; + bool result = false; + // Create a buffer object + xferBuf = new Memory(dev(), bufSize_); + + // Try to allocate memory for the transfer buffer + if ((NULL == xferBuf) || !xferBuf->create(type_)) { + delete xferBuf; + xferBuf = NULL; + LogError("Couldn't allocate a transfer buffer!"); + } else { + result = true; + freeBuffers_.push_back(xferBuf); + // CPU optimization: map staging buffer just once + if (!xferBuf->cal()->cardMemory_) { + xferBuf->map(NULL); + } + } + + return result; +} + +Memory& Device::XferBuffers::acquire() { + Memory* xferBuf = NULL; + size_t listSize; + + // Lock the operations with the staged buffer list + amd::ScopedLock l(lock_); + listSize = freeBuffers_.size(); + + // If the list is empty, then attempt to allocate a staged buffer + if (listSize == 0) { + // Allocate memory xferBuf = new Memory(dev(), bufSize_); - // Try to allocate memory for the transfer buffer + // Allocate memory for the transfer buffer if ((NULL == xferBuf) || !xferBuf->create(type_)) { - delete xferBuf; - xferBuf = NULL; - LogError("Couldn't allocate a transfer buffer!"); - } - else { - result = true; - freeBuffers_.push_back(xferBuf); - // CPU optimization: map staging buffer just once - if (!xferBuf->cal()->cardMemory_) { - xferBuf->map(NULL); - } + delete xferBuf; + xferBuf = NULL; + LogError("Couldn't allocate a transfer buffer!"); + } else { + ++acquiredCnt_; + // CPU optimization: map staging buffer just once + if (!xferBuf->cal()->cardMemory_) { + xferBuf->map(NULL); + } } + } - return result; + if (xferBuf == NULL) { + xferBuf = *(freeBuffers_.begin()); + freeBuffers_.erase(freeBuffers_.begin()); + 
++acquiredCnt_; + } + + return *xferBuf; } -Memory& -Device::XferBuffers::acquire() -{ - Memory* xferBuf = NULL; - size_t listSize; - - // Lock the operations with the staged buffer list - amd::ScopedLock l(lock_); - listSize = freeBuffers_.size(); - - // If the list is empty, then attempt to allocate a staged buffer - if (listSize == 0) { - // Allocate memory - xferBuf = new Memory(dev(), bufSize_); - - // Allocate memory for the transfer buffer - if ((NULL == xferBuf) || !xferBuf->create(type_)) { - delete xferBuf; - xferBuf = NULL; - LogError("Couldn't allocate a transfer buffer!"); - } - else { - ++acquiredCnt_; - // CPU optimization: map staging buffer just once - if (!xferBuf->cal()->cardMemory_) { - xferBuf->map(NULL); - } - } - } - - if (xferBuf == NULL) { - xferBuf = *(freeBuffers_.begin()); - freeBuffers_.erase(freeBuffers_.begin()); - ++acquiredCnt_; - } - - return *xferBuf; -} - -void -Device::XferBuffers::release(VirtualGPU& gpu, Memory& buffer) -{ - // Make sure buffer isn't busy on the current VirtualGPU, because - // the next aquire can come from different queue - buffer.wait(gpu); - // Lock the operations with the staged buffer list - amd::ScopedLock l(lock_); - freeBuffers_.push_back(&buffer); - --acquiredCnt_; +void Device::XferBuffers::release(VirtualGPU& gpu, Memory& buffer) { + // Make sure buffer isn't busy on the current VirtualGPU, because + // the next aquire can come from different queue + buffer.wait(gpu); + // Lock the operations with the staged buffer list + amd::ScopedLock l(lock_); + freeBuffers_.push_back(&buffer); + --acquiredCnt_; } -Device::ScopedLockVgpus::ScopedLockVgpus(const Device& dev) - : dev_(dev) -{ - // Lock the virtual GPU list - dev_.vgpusAccess()->lock(); +Device::ScopedLockVgpus::ScopedLockVgpus(const Device& dev) : dev_(dev) { + // Lock the virtual GPU list + dev_.vgpusAccess()->lock(); - // Find all available virtual GPUs and lock them - // from the execution of commands - for (uint idx = 0; idx < 
dev_.vgpus().size(); ++idx) { - dev_.vgpus()[idx]->execution().lock(); - } + // Find all available virtual GPUs and lock them + // from the execution of commands + for (uint idx = 0; idx < dev_.vgpus().size(); ++idx) { + dev_.vgpus()[idx]->execution().lock(); + } } -Device::ScopedLockVgpus::~ScopedLockVgpus() -{ - // Find all available virtual GPUs and unlock them - // for the execution of commands - for (uint idx = 0; idx < dev_.vgpus().size(); ++idx) { - dev_.vgpus()[idx]->execution().unlock(); - } +Device::ScopedLockVgpus::~ScopedLockVgpus() { + // Find all available virtual GPUs and unlock them + // for the execution of commands + for (uint idx = 0; idx < dev_.vgpus().size(); ++idx) { + dev_.vgpus()[idx]->execution().unlock(); + } - // Unock the virtual GPU list - dev_.vgpusAccess()->unlock(); + // Unock the virtual GPU list + dev_.vgpusAccess()->unlock(); } Device::Device() - : NullDevice() - , CALGSLDevice() - , numOfVgpus_(0) - , context_(NULL) - , heap_() - , dummyPage_(NULL) - , lockAsyncOps_(NULL) - , lockAsyncOpsForInitHeap_(NULL) - , vgpusAccess_(NULL) - , scratchAlloc_(NULL) - , mapCacheOps_(NULL) - , xferRead_(NULL) - , xferWrite_(NULL) - , mapCache_(NULL) - , resourceCache_(NULL) - , heapInitComplete_(false) - , xferQueue_(NULL) - , globalScratchBuf_(NULL) - , srdManager_(NULL) -{ -} + : NullDevice(), + CALGSLDevice(), + numOfVgpus_(0), + context_(NULL), + heap_(), + dummyPage_(NULL), + lockAsyncOps_(NULL), + lockAsyncOpsForInitHeap_(NULL), + vgpusAccess_(NULL), + scratchAlloc_(NULL), + mapCacheOps_(NULL), + xferRead_(NULL), + xferWrite_(NULL), + mapCache_(NULL), + resourceCache_(NULL), + heapInitComplete_(false), + xferQueue_(NULL), + globalScratchBuf_(NULL), + srdManager_(NULL) {} -Device::~Device() -{ - // remove the HW debug manager - delete hwDebugMgr_; - hwDebugMgr_ = NULL; +Device::~Device() { + // remove the HW debug manager + delete hwDebugMgr_; + hwDebugMgr_ = NULL; - delete srdManager_; + delete srdManager_; - for (uint s = 0; s < 
scratch_.size(); ++s) { - delete scratch_[s]; - scratch_[s] = NULL; + for (uint s = 0; s < scratch_.size(); ++s) { + delete scratch_[s]; + scratch_[s] = NULL; + } + + delete globalScratchBuf_; + globalScratchBuf_ = NULL; + + // Destroy transfer queue + delete xferQueue_; + + // Destroy blit program + delete blitProgram_; + + // Release cached map targets + for (uint i = 0; mapCache_ != NULL && i < mapCache_->size(); ++i) { + if ((*mapCache_)[i] != NULL) { + (*mapCache_)[i]->release(); } + } + delete mapCache_; - delete globalScratchBuf_; - globalScratchBuf_ = NULL; + // Destroy temporary buffers for read/write + delete xferRead_; + delete xferWrite_; - // Destroy transfer queue - delete xferQueue_; + if (dummyPage_ != NULL) { + dummyPage_->release(); + } - // Destroy blit program - delete blitProgram_; + // Destroy resource cache + delete resourceCache_; - // Release cached map targets - for (uint i = 0; mapCache_ != NULL && i < mapCache_->size(); ++i) { - if ((*mapCache_)[i] != NULL) { - (*mapCache_)[i]->release(); - } - } - delete mapCache_; + delete lockAsyncOps_; + delete lockAsyncOpsForInitHeap_; + delete vgpusAccess_; + delete scratchAlloc_; + delete mapCacheOps_; - // Destroy temporary buffers for read/write - delete xferRead_; - delete xferWrite_; + if (context_ != NULL) { + context_->release(); + } - if (dummyPage_ != NULL) { - dummyPage_->release(); - } - - // Destroy resource cache - delete resourceCache_; - - delete lockAsyncOps_; - delete lockAsyncOpsForInitHeap_; - delete vgpusAccess_; - delete scratchAlloc_; - delete mapCacheOps_; - - if (context_ != NULL) { - context_->release(); - } - - // Close the active device - close(); + // Close the active device + close(); } extern const char* SchedulerSourceCode; -bool -Device::create(CALuint ordinal, CALuint numOfDevices) -{ - if (!amd::Device::create()) { - return false; - } +bool Device::create(CALuint ordinal, CALuint numOfDevices) { + if (!amd::Device::create()) { + return false; + } - 
appProfile_.init(); + appProfile_.init(); - bool smallMemSystem = false; - if (amd::Os::hostTotalPhysicalMemory() < OCL_SYSMEM_REQUIREMENT * Gi) { - smallMemSystem = true; - } + bool smallMemSystem = false; + if (amd::Os::hostTotalPhysicalMemory() < OCL_SYSMEM_REQUIREMENT * Gi) { + smallMemSystem = true; + } - // Open GSL device - if (!open(ordinal, appProfile_.enableHighPerformanceState(), - smallMemSystem || appProfile_.reportAsOCL12Device() || (OPENCL_VERSION < 200))) { - return false; - } + // Open GSL device + if (!open(ordinal, appProfile_.enableHighPerformanceState(), + smallMemSystem || appProfile_.reportAsOCL12Device() || (OPENCL_VERSION < 200))) { + return false; + } - // Update CAL target - calTarget_ = getAttribs().target; - hwInfo_ = &DeviceInfo[calTarget_]; + // Update CAL target + calTarget_ = getAttribs().target; + hwInfo_ = &DeviceInfo[calTarget_]; - if ((GPU_ENABLE_PAL == 2) && (calTarget_ == CAL_TARGET_GREENLAND - || calTarget_ == CAL_TARGET_RAVEN)) { - return false; - } + if ((GPU_ENABLE_PAL == 2) && + (calTarget_ == CAL_TARGET_GREENLAND || calTarget_ == CAL_TARGET_RAVEN)) { + return false; + } #if defined(BRAHMA) - if (calTarget_ == CAL_TARGET_GREENLAND || - calTarget_ == CAL_TARGET_LEXA || - calTarget_ == CAL_TARGET_RAVEN || - calTarget_ == CAL_TARGET_POLARIS22) { - return false; - } + if (calTarget_ == CAL_TARGET_GREENLAND || calTarget_ == CAL_TARGET_LEXA || + calTarget_ == CAL_TARGET_RAVEN || calTarget_ == CAL_TARGET_POLARIS22) { + return false; + } #endif - // Creates device settings - settings_ = new gpu::Settings(); - gpu::Settings* gpuSettings = reinterpret_cast(settings_); - if ((gpuSettings == NULL) || !gpuSettings->create(getAttribs() - , appProfile_.reportAsOCL12Device(), smallMemSystem - )) { - return false; - } + // Creates device settings + settings_ = new gpu::Settings(); + gpu::Settings* gpuSettings = reinterpret_cast(settings_); + if ((gpuSettings == NULL) || + !gpuSettings->create(getAttribs(), 
appProfile_.reportAsOCL12Device(), smallMemSystem)) { + return false; + } - engines_.create(m_nEngines, m_engines, settings().numComputeRings_); + engines_.create(m_nEngines, m_engines, settings().numComputeRings_); - amd::Context::Info info = {0}; - std::vector devices; - devices.push_back(this); + amd::Context::Info info = {0}; + std::vector devices; + devices.push_back(this); - // Create a dummy context - context_ = new amd::Context(devices, info); - if (context_ == NULL) { - return false; - } + // Create a dummy context + context_ = new amd::Context(devices, info); + if (context_ == NULL) { + return false; + } - // Create the locks - lockAsyncOps_ = new amd::Monitor("Device Async Ops Lock", true); - if (NULL == lockAsyncOps_) { - return false; - } + // Create the locks + lockAsyncOps_ = new amd::Monitor("Device Async Ops Lock", true); + if (NULL == lockAsyncOps_) { + return false; + } - lockAsyncOpsForInitHeap_ = new amd::Monitor("Async Ops Lock For Initialization of Heap Resource", true); - if (NULL == lockAsyncOpsForInitHeap_) { - return false; - } + lockAsyncOpsForInitHeap_ = + new amd::Monitor("Async Ops Lock For Initialization of Heap Resource", true); + if (NULL == lockAsyncOpsForInitHeap_) { + return false; + } - vgpusAccess_ = new amd::Monitor("Virtual GPU List Ops Lock", true); - if (NULL == vgpusAccess_) { - return false; - } + vgpusAccess_ = new amd::Monitor("Virtual GPU List Ops Lock", true); + if (NULL == vgpusAccess_) { + return false; + } - scratchAlloc_ = new amd::Monitor("Scratch Allocation Lock", true); - if (NULL == scratchAlloc_) { - return false; - } + scratchAlloc_ = new amd::Monitor("Scratch Allocation Lock", true); + if (NULL == scratchAlloc_) { + return false; + } - mapCacheOps_ = new amd::Monitor("Map Cache Lock", true); - if (NULL == mapCacheOps_) { - return false; - } + mapCacheOps_ = new amd::Monitor("Map Cache Lock", true); + if (NULL == mapCacheOps_) { + return false; + } - mapCache_ = new std::vector(); - if (mapCache_ == NULL) { 
- return false; - } - // Use just 1 entry by default for the map cache - mapCache_->push_back(NULL); + mapCache_ = new std::vector(); + if (mapCache_ == NULL) { + return false; + } + // Use just 1 entry by default for the map cache + mapCache_->push_back(NULL); - size_t resourceCacheSize = settings().resourceCacheSize_; + size_t resourceCacheSize = settings().resourceCacheSize_; #ifdef DEBUG - std::stringstream message; - if (settings().remoteAlloc_) { - message << "Using *Remote* memory"; - } - else { - message << "Using *Local* memory"; - } + std::stringstream message; + if (settings().remoteAlloc_) { + message << "Using *Remote* memory"; + } else { + message << "Using *Local* memory"; + } - message << std::endl; - LogInfo(message.str().c_str()); -#endif // DEBUG + message << std::endl; + LogInfo(message.str().c_str()); +#endif // DEBUG - // Create resource cache. - // \note Cache must be created before any resource creation to avoid NULL check - resourceCache_ = new ResourceCache(resourceCacheSize); - if (NULL == resourceCache_) { - return false; - } + // Create resource cache. 
+ // \note Cache must be created before any resource creation to avoid NULL check + resourceCache_ = new ResourceCache(resourceCacheSize); + if (NULL == resourceCache_) { + return false; + } - // Fill the device info structure - fillDeviceInfo(getAttribs(), getMemInfo(), - static_cast(getMaxTextureSize()), - engines().numComputeRings(), engines().numComputeRingsRT()); + // Fill the device info structure + fillDeviceInfo(getAttribs(), getMemInfo(), static_cast(getMaxTextureSize()), + engines().numComputeRings(), engines().numComputeRingsRT()); - if (NULL == compiler_) { + if (NULL == compiler_) { #if !defined(ATI_OS_LINUX) - char CompilerLibrary[220] = ""; - strcat_s(CompilerLibrary, "amdocl12cl" LP64_SWITCH("", "64") ".dll"); + char CompilerLibrary[220] = ""; + strcat_s(CompilerLibrary, "amdocl12cl" LP64_SWITCH("", "64") ".dll"); #endif - const char *library = getenv("COMPILER_LIBRARY"); - aclCompilerOptions opts = { - sizeof(aclCompilerOptions_0_8), + const char* library = getenv("COMPILER_LIBRARY"); + aclCompilerOptions opts = { + sizeof(aclCompilerOptions_0_8), #if defined(ATI_OS_LINUX) - library ? library : LINUX_ONLY("lib") "amdocl12cl" \ - LP64_SWITCH(LINUX_SWITCH("32",""),"64") LINUX_SWITCH(".so",".dll"), + library ? library : LINUX_ONLY("lib") "amdocl12cl" LP64_SWITCH(LINUX_SWITCH("32", ""), "64") + LINUX_SWITCH(".so", ".dll"), #else - library ? library : CompilerLibrary, + library ? 
library : CompilerLibrary, #endif - NULL, - NULL, - NULL, - NULL, - NULL, - AMD_OCL_SC_LIB - }; - compiler_ = aclCompilerInit(&opts, NULL); - } + NULL, + NULL, + NULL, + NULL, + NULL, + AMD_OCL_SC_LIB + }; + compiler_ = aclCompilerInit(&opts, NULL); + } - if (settings().hsail_ || (settings().oclVersion_ == OpenCL20)) { - if (NULL == hsaCompiler_) { - const char* library = getenv("HSA_COMPILER_LIBRARY"); - aclCompilerOptions opts = { - sizeof(aclCompilerOptions_0_8), - library, - NULL, - NULL, - NULL, - NULL, - NULL, - AMD_OCL_SC_LIB - }; - // Initialize the compiler handle - acl_error error; - hsaCompiler_ = aclCompilerInit(&opts, &error); - if (error != ACL_SUCCESS) { - LogError("Error initializing the compiler"); - return false; - } - } - } - else { - blitProgram_ = new BlitProgram(context_); - // Create blit programs - if (blitProgram_ == NULL || !blitProgram_->create(this)) { - delete blitProgram_; - blitProgram_ = NULL; - LogError("Couldn't create blit kernels!"); - return false; - } - } - - // Allocate SRD manager - srdManager_ = new SrdManager(*this, - std::max(HsaImageObjectSize, HsaSamplerObjectSize), 64 * Ki); - if (srdManager_ == NULL) { + if (settings().hsail_ || (settings().oclVersion_ == OpenCL20)) { + if (NULL == hsaCompiler_) { + const char* library = getenv("HSA_COMPILER_LIBRARY"); + aclCompilerOptions opts = { + sizeof(aclCompilerOptions_0_8), library, NULL, NULL, NULL, NULL, NULL, AMD_OCL_SC_LIB}; + // Initialize the compiler handle + acl_error error; + hsaCompiler_ = aclCompilerInit(&opts, &error); + if (error != ACL_SUCCESS) { + LogError("Error initializing the compiler"); return false; + } } - - // create the HW debug manager if needed - if (settings().enableHwDebug_) { - hwDebugMgr_ = new GpuDebugManager(this); + } else { + blitProgram_ = new BlitProgram(context_); + // Create blit programs + if (blitProgram_ == NULL || !blitProgram_->create(this)) { + delete blitProgram_; + blitProgram_ = NULL; + LogError("Couldn't create blit kernels!"); + 
return false; } + } - return true; + // Allocate SRD manager + srdManager_ = new SrdManager(*this, std::max(HsaImageObjectSize, HsaSamplerObjectSize), 64 * Ki); + if (srdManager_ == NULL) { + return false; + } + + // create the HW debug manager if needed + if (settings().enableHwDebug_) { + hwDebugMgr_ = new GpuDebugManager(this); + } + + return true; } -bool -Device::initializeHeapResources() -{ - amd::ScopedLock k(lockAsyncOpsForInitHeap_); - if (!heapInitComplete_) { - heapInitComplete_ = true; +bool Device::initializeHeapResources() { + amd::ScopedLock k(lockAsyncOpsForInitHeap_); + if (!heapInitComplete_) { + heapInitComplete_ = true; - PerformFullInitialization(); + PerformFullInitialization(); - uint numComputeRings = engines_.numComputeRings() + engines_.numComputeRingsRT(); - scratch_.resize((settings().useSingleScratch_) ? 1 : (numComputeRings ? numComputeRings : 1)); + uint numComputeRings = engines_.numComputeRings() + engines_.numComputeRingsRT(); + scratch_.resize((settings().useSingleScratch_) ? 1 : (numComputeRings ? 
numComputeRings : 1)); - // Initialize the number of mem object for the scratch buffer - for (uint s = 0; s < scratch_.size(); ++s) { - scratch_[s] = new ScratchBuffer(); - if (NULL == scratch_[s]) { - return false; - } - } - - // Complete initialization of the heap and other buffers - if (!heap_.create(*this)) { - LogError("Failed GPU heap creation"); - return false; - } - - size_t dummySize = amd::Os::pageSize(); - - // Allocate a dummy page for NULL pointer processing - dummyPage_ = new(*context_) amd::Buffer(*context_, 0, dummySize); - if ((dummyPage_ != NULL) && !dummyPage_->create()) { - dummyPage_->release(); - return false; - } - - Memory* devMemory = reinterpret_cast(dummyPage_->getDeviceMemory(*this)); - if (devMemory == NULL) { - // Release memory - dummyPage_->release(); - dummyPage_ = NULL; - return false; - } - - if (settings().stagedXferSize_ != 0) { - // Initialize staged write buffers - if (settings().stagedXferWrite_) { - Resource::MemoryType type; - if (settings().stagingWritePersistent_ && !settings().disablePersistent_) { - type = Resource::Persistent; - } else { - type = Resource::RemoteUSWC; - } - xferWrite_ = new XferBuffers(*this, type, - amd::alignUp(settings().stagedXferSize_, 4 * Ki)); - if ((xferWrite_ == NULL) || !xferWrite_->create()) { - LogError("Couldn't allocate transfer buffer objects for read"); - return false; - } - } - - // Initialize staged read buffers - if (settings().stagedXferRead_) { - xferRead_ = new XferBuffers(*this, Resource::Remote, - amd::alignUp(settings().stagedXferSize_, 4 * Ki)); - if ((xferRead_ == NULL) || !xferRead_->create()) { - LogError("Couldn't allocate transfer buffer objects for write"); - return false; - } - } - } - - // Delay compilation due to brig_loader memory allocation - if (settings().ciPlus_) { - const char* CL20extraBlits = NULL; - const char* ocl20 = NULL; - if (settings().oclVersion_ == OpenCL20) { - CL20extraBlits = SchedulerSourceCode; - ocl20 = "-cl-std=CL2.0"; - } - blitProgram_ = new 
BlitProgram(context_); - // Create blit programs - if (blitProgram_ == NULL || - !blitProgram_->create(this, CL20extraBlits, ocl20)) { - delete blitProgram_; - blitProgram_ = NULL; - LogError("Couldn't create blit kernels!"); - return false; - } - } - - // Create a synchronized transfer queue - xferQueue_ = new VirtualGPU(*this); - if (!(xferQueue_ && xferQueue_->create( - false - ))) { - delete xferQueue_; - xferQueue_ = NULL; - } - if (NULL == xferQueue_) { - LogError("Couldn't create the device transfer manager!"); - return false; - } - xferQueue_->enableSyncedBlit(); + // Initialize the number of mem object for the scratch buffer + for (uint s = 0; s < scratch_.size(); ++s) { + scratch_[s] = new ScratchBuffer(); + if (NULL == scratch_[s]) { + return false; + } } - return true; + + // Complete initialization of the heap and other buffers + if (!heap_.create(*this)) { + LogError("Failed GPU heap creation"); + return false; + } + + size_t dummySize = amd::Os::pageSize(); + + // Allocate a dummy page for NULL pointer processing + dummyPage_ = new (*context_) amd::Buffer(*context_, 0, dummySize); + if ((dummyPage_ != NULL) && !dummyPage_->create()) { + dummyPage_->release(); + return false; + } + + Memory* devMemory = reinterpret_cast(dummyPage_->getDeviceMemory(*this)); + if (devMemory == NULL) { + // Release memory + dummyPage_->release(); + dummyPage_ = NULL; + return false; + } + + if (settings().stagedXferSize_ != 0) { + // Initialize staged write buffers + if (settings().stagedXferWrite_) { + Resource::MemoryType type; + if (settings().stagingWritePersistent_ && !settings().disablePersistent_) { + type = Resource::Persistent; + } else { + type = Resource::RemoteUSWC; + } + xferWrite_ = new XferBuffers(*this, type, amd::alignUp(settings().stagedXferSize_, 4 * Ki)); + if ((xferWrite_ == NULL) || !xferWrite_->create()) { + LogError("Couldn't allocate transfer buffer objects for read"); + return false; + } + } + + // Initialize staged read buffers + if 
(settings().stagedXferRead_) { + xferRead_ = new XferBuffers(*this, Resource::Remote, + amd::alignUp(settings().stagedXferSize_, 4 * Ki)); + if ((xferRead_ == NULL) || !xferRead_->create()) { + LogError("Couldn't allocate transfer buffer objects for write"); + return false; + } + } + } + + // Delay compilation due to brig_loader memory allocation + if (settings().ciPlus_) { + const char* CL20extraBlits = NULL; + const char* ocl20 = NULL; + if (settings().oclVersion_ == OpenCL20) { + CL20extraBlits = SchedulerSourceCode; + ocl20 = "-cl-std=CL2.0"; + } + blitProgram_ = new BlitProgram(context_); + // Create blit programs + if (blitProgram_ == NULL || !blitProgram_->create(this, CL20extraBlits, ocl20)) { + delete blitProgram_; + blitProgram_ = NULL; + LogError("Couldn't create blit kernels!"); + return false; + } + } + + // Create a synchronized transfer queue + xferQueue_ = new VirtualGPU(*this); + if (!(xferQueue_ && xferQueue_->create(false))) { + delete xferQueue_; + xferQueue_ = NULL; + } + if (NULL == xferQueue_) { + LogError("Couldn't create the device transfer manager!"); + return false; + } + xferQueue_->enableSyncedBlit(); + } + return true; } -device::VirtualDevice* -Device::createVirtualDevice( - amd::CommandQueue* queue - ) -{ - bool profiling = false; - bool interopQueue = false; - uint rtCUs = amd::CommandQueue::RealTimeDisabled; - uint deviceQueueSize = 0; +device::VirtualDevice* Device::createVirtualDevice(amd::CommandQueue* queue) { + bool profiling = false; + bool interopQueue = false; + uint rtCUs = amd::CommandQueue::RealTimeDisabled; + uint deviceQueueSize = 0; - if (queue != NULL) { - profiling = queue->properties().test(CL_QUEUE_PROFILING_ENABLE); - if (queue->asHostQueue() != NULL) { - interopQueue = (0 != (queue->context().info().flags_ & - (amd::Context::GLDeviceKhr | - amd::Context::D3D10DeviceKhr | - amd::Context::D3D11DeviceKhr))); - rtCUs = queue->rtCUs(); - } - else if (queue->asDeviceQueue() != NULL) { - deviceQueueSize = 
queue->asDeviceQueue()->size(); - } + if (queue != NULL) { + profiling = queue->properties().test(CL_QUEUE_PROFILING_ENABLE); + if (queue->asHostQueue() != NULL) { + interopQueue = (0 != (queue->context().info().flags_ & + (amd::Context::GLDeviceKhr | amd::Context::D3D10DeviceKhr | + amd::Context::D3D11DeviceKhr))); + rtCUs = queue->rtCUs(); + } else if (queue->asDeviceQueue() != NULL) { + deviceQueueSize = queue->asDeviceQueue()->size(); } + } - // Not safe to add a queue. So lock the device - amd::ScopedLock k(lockAsyncOps()); - amd::ScopedLock lock(vgpusAccess()); + // Not safe to add a queue. So lock the device + amd::ScopedLock k(lockAsyncOps()); + amd::ScopedLock lock(vgpusAccess()); - // Initialization of heap and other resources occur during the command queue creation time. - if (!initializeHeapResources()) { - return NULL; - } + // Initialization of heap and other resources occur during the command queue creation time. + if (!initializeHeapResources()) { + return NULL; + } - VirtualGPU* vgpu = new VirtualGPU(*this); - if (vgpu && vgpu->create(profiling, rtCUs, deviceQueueSize, queue->priority())) { - return vgpu; - } else { - delete vgpu; - return NULL; - } + VirtualGPU* vgpu = new VirtualGPU(*this); + if (vgpu && vgpu->create(profiling, rtCUs, deviceQueueSize, queue->priority())) { + return vgpu; + } else { + delete vgpu; + return NULL; + } } -device::Program* -Device::createProgram(amd::option::Options* options) -{ - if (isHsailProgram(options)) { - return new HSAILProgram(*this); - } - return new Program(*this); +device::Program* Device::createProgram(amd::option::Options* options) { + if (isHsailProgram(options)) { + return new HSAILProgram(*this); + } + return new Program(*this); } //! Requested devices list as configured by the GPU_DEVICE_ORDINAL typedef std::map requestedDevices_t; //! Parses the requested list of devices to be exposed to the user. 
-static void -parseRequestedDeviceList(requestedDevices_t &requestedDevices) { - char *pch = NULL; - int requestedDeviceCount = 0; - const char* requestedDeviceList = GPU_DEVICE_ORDINAL; +static void parseRequestedDeviceList(requestedDevices_t& requestedDevices) { + char* pch = NULL; + int requestedDeviceCount = 0; + const char* requestedDeviceList = GPU_DEVICE_ORDINAL; - pch = strtok(const_cast(requestedDeviceList), ","); - while (pch != NULL) { - bool deviceIdValid = true; - int currentDeviceIndex = atoi(pch); - // Validate device index. - for (size_t i = 0; i < strlen(pch); i++) { - if (!isdigit(pch[i])) { - deviceIdValid = false; - break; - } - } - if (currentDeviceIndex < 0) { - deviceIdValid = false; - } - // Get next token. - pch = strtok(NULL, ","); - if (!deviceIdValid) { - continue; - } - - // Requested device is valid. - requestedDevices[currentDeviceIndex] = true; + pch = strtok(const_cast(requestedDeviceList), ","); + while (pch != NULL) { + bool deviceIdValid = true; + int currentDeviceIndex = atoi(pch); + // Validate device index. + for (size_t i = 0; i < strlen(pch); i++) { + if (!isdigit(pch[i])) { + deviceIdValid = false; + break; + } } + if (currentDeviceIndex < 0) { + deviceIdValid = false; + } + // Get next token. + pch = strtok(NULL, ","); + if (!deviceIdValid) { + continue; + } + + // Requested device is valid. 
+ requestedDevices[currentDeviceIndex] = true; + } } -#if defined(_WIN32) && defined (DEBUG) +#if defined(_WIN32) && defined(DEBUG) #include #include -static int reportHook(int reportType, char *message, int *returnValue) -{ - fprintf(stderr, "%s", message); - ::exit(3); - return 1; +static int reportHook(int reportType, char* message, int* returnValue) { + fprintf(stderr, "%s", message); + ::exit(3); + return 1; } -#endif // _WIN32 & DEBUG +#endif // _WIN32 & DEBUG -bool -Device::init() -{ - CALuint numDevices = 0; - bool useDeviceList = false; - requestedDevices_t requestedDevices; +bool Device::init() { + CALuint numDevices = 0; + bool useDeviceList = false; + requestedDevices_t requestedDevices; - hsaCompiler_ = NULL; - compiler_ = NULL; + hsaCompiler_ = NULL; + compiler_ = NULL; #if defined(_WIN32) && !defined(_WIN64) - // @toto: FIXME: remove this when CAL is fixed!!! - unsigned int old, ignored; - _controlfp_s(&old, 0, 0); -#endif // _WIN32 && !_WIN64 - // FIXME_lmoriche: needs cleanup - osInit(); + // @toto: FIXME: remove this when CAL is fixed!!! 
+ unsigned int old, ignored; + _controlfp_s(&old, 0, 0); +#endif // _WIN32 && !_WIN64 + // FIXME_lmoriche: needs cleanup + osInit(); #if defined(_WIN32) - //osAssertSetStyle(OSASSERT_STYLE_LOGANDEXIT); -#endif // WIN32 +// osAssertSetStyle(OSASSERT_STYLE_LOGANDEXIT); +#endif // WIN32 -#if defined(_WIN32) && defined (DEBUG) - if (::getenv("AMD_OCL_SUPPRESS_MESSAGE_BOX")) - { - _CrtSetReportHook(reportHook); - _set_error_mode(_OUT_TO_STDERR); - } -#endif // _WIN32 & DEBUG +#if defined(_WIN32) && defined(DEBUG) + if (::getenv("AMD_OCL_SUPPRESS_MESSAGE_BOX")) { + _CrtSetReportHook(reportHook); + _set_error_mode(_OUT_TO_STDERR); + } +#endif // _WIN32 & DEBUG - gslInit(); + gslInit(); #if defined(_WIN32) && !defined(_WIN64) - _controlfp_s(&ignored, old, _MCW_RC | _MCW_PC); -#endif // _WIN32 && !_WIN64 + _controlfp_s(&ignored, old, _MCW_RC | _MCW_PC); +#endif // _WIN32 && !_WIN64 - // Get the total number of active devices - // Count up all the devices in the system. - numDevices = gsAdaptor::enumerateAdaptors(); + // Get the total number of active devices + // Count up all the devices in the system. 
+ numDevices = gsAdaptor::enumerateAdaptors(); - CALuint ordinal = 0; - const char* selectDeviceByName = NULL; - if (!flagIsDefault(GPU_DEVICE_ORDINAL)) { - useDeviceList = true; - parseRequestedDeviceList(requestedDevices); - } - else if (!flagIsDefault(GPU_DEVICE_NAME)) { - selectDeviceByName = GPU_DEVICE_NAME; - } + CALuint ordinal = 0; + const char* selectDeviceByName = NULL; + if (!flagIsDefault(GPU_DEVICE_ORDINAL)) { + useDeviceList = true; + parseRequestedDeviceList(requestedDevices); + } else if (!flagIsDefault(GPU_DEVICE_NAME)) { + selectDeviceByName = GPU_DEVICE_NAME; + } - // Loop through all active devices and initialize the device info structure - for (; ordinal < numDevices; ++ordinal) { - // Create the GPU device object - Device *d = new Device(); - bool result = (NULL != d) && d->create(ordinal, numDevices); - if (useDeviceList) { - result &= (requestedDevices.find(ordinal) != requestedDevices.end()); - } - if (result && - ((NULL == selectDeviceByName) || ('\0' == selectDeviceByName[0]) || - (strstr(selectDeviceByName, d->info().name_) != NULL))) { - d->registerDevice(); - } - else { - delete d; - } + // Loop through all active devices and initialize the device info structure + for (; ordinal < numDevices; ++ordinal) { + // Create the GPU device object + Device* d = new Device(); + bool result = (NULL != d) && d->create(ordinal, numDevices); + if (useDeviceList) { + result &= (requestedDevices.find(ordinal) != requestedDevices.end()); } - return true; + if (result && ((NULL == selectDeviceByName) || ('\0' == selectDeviceByName[0]) || + (strstr(selectDeviceByName, d->info().name_) != NULL))) { + d->registerDevice(); + } else { + delete d; + } + } + return true; } -void -Device::tearDown() -{ - osExit(); - gslExit(); - aclCompilerFini(compiler_); - if (hsaCompiler_ != NULL) { - aclCompilerFini(hsaCompiler_); +void Device::tearDown() { + osExit(); + gslExit(); + aclCompilerFini(compiler_); + if (hsaCompiler_ != NULL) { + aclCompilerFini(hsaCompiler_); 
+ } +} + +gpu::Memory* Device::getGpuMemory(amd::Memory* mem) const { + return static_cast(mem->getDeviceMemory(*this)); +} + +const device::BlitManager& Device::xferMgr() const { return xferQueue_->blitMgr(); } + +CalFormat Device::getCalFormat(const amd::Image::Format& format) const { + // Find CAL format + for (uint i = 0; i < sizeof(MemoryFormatMap) / sizeof(MemoryFormat); ++i) { + if ((format.image_channel_data_type == MemoryFormatMap[i].clFormat_.image_channel_data_type) && + (format.image_channel_order == MemoryFormatMap[i].clFormat_.image_channel_order)) { + return MemoryFormatMap[i].calFormat_; } + } + osAssert(0 && "We didn't find CAL resource format!"); + return MemoryFormatMap[0].calFormat_; } -gpu::Memory* -Device::getGpuMemory(amd::Memory* mem) const -{ - return static_cast(mem->getDeviceMemory(*this)); -} - -const device::BlitManager& -Device::xferMgr() const -{ - return xferQueue_->blitMgr(); -} - -CalFormat -Device::getCalFormat(const amd::Image::Format& format) const -{ - // Find CAL format - for (uint i = 0; i < sizeof(MemoryFormatMap) / sizeof(MemoryFormat); ++i) { - if ((format.image_channel_data_type == - MemoryFormatMap[i].clFormat_.image_channel_data_type) && - (format.image_channel_order == - MemoryFormatMap[i].clFormat_.image_channel_order)) { - return MemoryFormatMap[i].calFormat_; - } +amd::Image::Format Device::getOclFormat(const CalFormat& format) const { + // Find CL format + for (uint i = 0; i < sizeof(MemoryFormatMap) / sizeof(MemoryFormat); ++i) { + if ((format.type_ == MemoryFormatMap[i].calFormat_.type_) && + (format.channelOrder_ == MemoryFormatMap[i].calFormat_.channelOrder_)) { + return MemoryFormatMap[i].clFormat_; } - osAssert(0 && "We didn't find CAL resource format!"); - return MemoryFormatMap[0].calFormat_; -} - -amd::Image::Format -Device::getOclFormat(const CalFormat& format) const -{ - // Find CL format - for (uint i = 0; i < sizeof(MemoryFormatMap) / sizeof(MemoryFormat); ++i) { - if ((format.type_ == - 
MemoryFormatMap[i].calFormat_.type_) && - (format.channelOrder_ == - MemoryFormatMap[i].calFormat_.channelOrder_)) { - return MemoryFormatMap[i].clFormat_; - } - } - osAssert(0 && "We didn't find OCL resource format!"); - return MemoryFormatMap[0].clFormat_; + } + osAssert(0 && "We didn't find OCL resource format!"); + return MemoryFormatMap[0].clFormat_; } // Create buffer without an owner (merge common code with createBuffer() ?) -gpu::Memory* -Device::createScratchBuffer(size_t size) const -{ - Memory* gpuMemory = NULL; +gpu::Memory* Device::createScratchBuffer(size_t size) const { + Memory* gpuMemory = NULL; - // Create a memory object - gpuMemory = new gpu::Memory(*this, size); - if (NULL == gpuMemory || !gpuMemory->create(Resource::Local)) { - delete gpuMemory; - gpuMemory = NULL; - } + // Create a memory object + gpuMemory = new gpu::Memory(*this, size); + if (NULL == gpuMemory || !gpuMemory->create(Resource::Local)) { + delete gpuMemory; + gpuMemory = NULL; + } - return gpuMemory; + return gpuMemory; } -gpu::Memory* -Device::createBuffer( - amd::Memory& owner, - bool directAccess) const -{ - size_t size = owner.getSize(); - gpu::Memory* gpuMemory; +gpu::Memory* Device::createBuffer(amd::Memory& owner, bool directAccess) const { + size_t size = owner.getSize(); + gpu::Memory* gpuMemory; - // Create resource - bool result = false; + // Create resource + bool result = false; - if (owner.getType() == CL_MEM_OBJECT_PIPE) { - // directAccess isnt needed as Pipes shouldnt be host accessible for GPU - directAccess = false; + if (owner.getType() == CL_MEM_OBJECT_PIPE) { + // directAccess isnt needed as Pipes shouldnt be host accessible for GPU + directAccess = false; + } + + if (NULL != owner.parent()) { + gpu::Memory* gpuParent = getGpuMemory(owner.parent()); + if (NULL == gpuParent) { + LogError("Can't get the owner object for subbuffer allocation"); + return NULL; } - if (NULL != owner.parent()) { - gpu::Memory* gpuParent = getGpuMemory(owner.parent()); - if 
(NULL == gpuParent) { - LogError("Can't get the owner object for subbuffer allocation"); + if (nullptr != owner.parent()->getSvmPtr()) { + amd::Memory* amdParent = owner.parent(); + { + // Lock memory object, so only one commitment will occur + amd::ScopedLock lock(amdParent->lockMemoryOps()); + amdParent->commitSvmMemory(); + amdParent->setHostMem(amdParent->getSvmPtr()); + } + // Ignore a possible pinning error. Runtime will fallback to SW emulation + // bool ok = gpuParent->pinSystemMemory( + // amdParent->getHostMem(), amdParent->getSize()); + } + return gpuParent->createBufferView(owner); + } + + Resource::MemoryType type = + (owner.forceSysMemAlloc() || (owner.getMemFlags() & CL_MEM_SVM_FINE_GRAIN_BUFFER)) + ? Resource::Remote + : Resource::Local; + + if (owner.getMemFlags() & CL_MEM_BUS_ADDRESSABLE_AMD) { + type = Resource::BusAddressable; + } else if (owner.getMemFlags() & CL_MEM_EXTERNAL_PHYSICAL_AMD) { + type = Resource::ExternalPhysical; + } + + // Use direct access if it's possible + bool remoteAlloc = false; + // Internal means VirtualDevice!=NULL + bool internalAlloc = + ((owner.getMemFlags() & CL_MEM_USE_HOST_PTR) && (owner.getVirtualDevice() != NULL)) ? 
true + : false; + + // Create a memory object + gpuMemory = new gpu::Buffer(*this, owner, owner.getSize()); + if (NULL == gpuMemory) { + return NULL; + } + + // Check if owner is interop memory + if (owner.isInterop()) { + result = gpuMemory->createInterop(Memory::InteropDirectAccess); + } else if (owner.getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) { + // Attempt to allocate from persistent heap + result = gpuMemory->create(Resource::Persistent); + } else if (directAccess || (type == Resource::Remote)) { + // Check for system memory allocations + if ((owner.getMemFlags() & (CL_MEM_ALLOC_HOST_PTR | CL_MEM_USE_HOST_PTR)) || + (settings().remoteAlloc_)) { + // Allocate remote memory if AHP allocation and context has just 1 device + if ((owner.getMemFlags() & CL_MEM_ALLOC_HOST_PTR) && + (owner.getContext().devices().size() == 1)) { + if (owner.getMemFlags() & + (CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) { + // GPU will be reading from this host memory buffer, + // so assume Host write into it + type = Resource::RemoteUSWC; + remoteAlloc = true; + } + } + // Make sure owner has a valid hostmem pointer and it's not COPY + if (!remoteAlloc && (owner.getHostMem() != NULL)) { + Resource::PinnedParams params; + params.owner_ = &owner; + params.gpu_ = reinterpret_cast(owner.getVirtualDevice()); + + params.hostMemRef_ = owner.getHostMemRef(); + params.size_ = owner.getHostMemRef()->size(); + if (0 == params.size_) { + params.size_ = owner.getSize(); + } + // Create memory object + result = gpuMemory->create(Resource::Pinned, ¶ms); + + // If direct access failed + if (!result) { + // Don't use cached allocation + // if size is biger than max single alloc + if (owner.getSize() > info().maxMemAllocSize_) { + delete gpuMemory; return NULL; + } } + } + } + } - if (nullptr != owner.parent()->getSvmPtr()) { - amd::Memory* amdParent = owner.parent(); - { - // Lock memory object, so only one commitment will occur - amd::ScopedLock 
lock(amdParent->lockMemoryOps()); - amdParent->commitSvmMemory(); - amdParent->setHostMem(amdParent->getSvmPtr()); - } - // Ignore a possible pinning error. Runtime will fallback to SW emulation - //bool ok = gpuParent->pinSystemMemory( - // amdParent->getHostMem(), amdParent->getSize()); + if (!result && + // Make sure it's not internal alloc + !internalAlloc) { + Resource::CreateParams params; + params.owner_ = &owner; + params.gpu_ = static_cast(owner.getVirtualDevice()); + + // Create memory object + result = gpuMemory->create(type, ¶ms); + + // If allocation was successful + if (result) { + // Initialize if the memory is a pipe object + if (owner.getType() == CL_MEM_OBJECT_PIPE) { + // Pipe initialize in order read_idx, write_idx, end_idx. Refer clk_pipe_t structure. + // Init with 3 DWORDS for 32bit addressing and 6 DWORDS for 64bit + size_t pipeInit[3] = {0, 0, owner.asPipe()->getMaxNumPackets()}; + gpuMemory->writeRawData(*xferQueue_, sizeof(pipeInit), pipeInit, true); + } + // If memory has direct access from host, then get CPU address + if (gpuMemory->isHostMemDirectAccess() && (type != Resource::ExternalPhysical)) { + void* address = gpuMemory->map(NULL); + if (address != NULL) { + // Copy saved memory + if (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) { + memcpy(address, owner.getHostMem(), owner.getSize()); + } + // It should be safe to change the host memory pointer, + // because it's lock protected from the upper caller + owner.setHostMem(address); + } else { + result = false; } - return gpuParent->createBufferView(owner); + } + // An optimization for CHP. 
Copy memory and destroy sysmem allocation + else if ((gpuMemory->memoryType() != Resource::Pinned) && + (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) && + (owner.getContext().devices().size() == 1)) { + amd::Coord3D origin(0, 0, 0); + amd::Coord3D region(owner.getSize()); + static const bool Entire = true; + if (xferMgr().writeBuffer(owner.getHostMem(), *gpuMemory, origin, region, Entire)) { + // Clear CHP memory + owner.setHostMem(NULL); + } + } } + } - Resource::MemoryType type = (owner.forceSysMemAlloc() || - (owner.getMemFlags() & CL_MEM_SVM_FINE_GRAIN_BUFFER)) ? - Resource::Remote : Resource::Local; + if (!result) { + delete gpuMemory; + return NULL; + } - if (owner.getMemFlags() & CL_MEM_BUS_ADDRESSABLE_AMD) { - type = Resource::BusAddressable; + return gpuMemory; +} + +gpu::Memory* Device::createImage(amd::Memory& owner, bool directAccess) const { + size_t size = owner.getSize(); + amd::Image& image = *owner.asImage(); + gpu::Memory* gpuImage = NULL; + CalFormat format = getCalFormat(image.getImageFormat()); + + if ((NULL != owner.parent()) && (owner.parent()->asImage() != NULL)) { + device::Memory* devParent = owner.parent()->getDeviceMemory(*this); + if (NULL == devParent) { + LogError("Can't get the owner object for image view allocation"); + return NULL; } - else if (owner.getMemFlags() & CL_MEM_EXTERNAL_PHYSICAL_AMD) { - type = Resource::ExternalPhysical; + // Create a view on the specified device + gpuImage = (gpu::Memory*)createView(owner, *devParent); + if ((NULL != gpuImage) && (gpuImage->owner() != NULL)) { + gpuImage->owner()->setHostMem((address)(owner.parent()->getHostMem()) + + gpuImage->owner()->getOrigin()); } + return gpuImage; + } - // Use direct access if it's possible - bool remoteAlloc = false; - // Internal means VirtualDevice!=NULL - bool internalAlloc = ((owner.getMemFlags() & CL_MEM_USE_HOST_PTR) && - (owner.getVirtualDevice() != NULL)) ? 
true : false; + gpuImage = + new gpu::Image(*this, owner, image.getWidth(), image.getHeight(), image.getDepth(), + format.type_, format.channelOrder_, image.getType(), image.getMipLevels()); - // Create a memory object - gpuMemory = new gpu::Buffer(*this, owner, owner.getSize()); - if (NULL == gpuMemory) { - return NULL; - } + // Create resource + if (NULL != gpuImage) { + const bool imageBuffer = ((owner.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) || + ((owner.getType() == CL_MEM_OBJECT_IMAGE2D) && + (owner.parent() != NULL) && (owner.parent()->asBuffer() != NULL))); + bool result = false; // Check if owner is interop memory if (owner.isInterop()) { - result = gpuMemory->createInterop(Memory::InteropDirectAccess); + result = gpuImage->createInterop(Memory::InteropDirectAccess); + } else if (imageBuffer) { + Resource::ImageBufferParams params; + gpu::Memory* buffer = reinterpret_cast(image.parent()->getDeviceMemory(*this)); + if (buffer == NULL) { + LogError("Buffer creation for ImageBuffer failed!"); + delete gpuImage; + return NULL; + } + params.owner_ = &owner; + params.resource_ = buffer; + params.memory_ = buffer; + + // Create memory object + result = gpuImage->create(Resource::ImageBuffer, ¶ms); + } else if (directAccess && (owner.getMemFlags() & CL_MEM_ALLOC_HOST_PTR)) { + Resource::PinnedParams params; + params.owner_ = &owner; + params.hostMemRef_ = owner.getHostMemRef(); + params.size_ = owner.getHostMemRef()->size(); + + // Create memory object + result = gpuImage->create(Resource::Pinned, ¶ms); } - else if (owner.getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) { + + if (!result && !owner.isInterop()) { + if (owner.getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) { // Attempt to allocate from persistent heap - result = gpuMemory->create(Resource::Persistent); - } - else if (directAccess || (type == Resource::Remote)) { - // Check for system memory allocations - if ((owner.getMemFlags() & (CL_MEM_ALLOC_HOST_PTR | CL_MEM_USE_HOST_PTR)) - || 
(settings().remoteAlloc_)) { - // Allocate remote memory if AHP allocation and context has just 1 device - if ((owner.getMemFlags() & CL_MEM_ALLOC_HOST_PTR) && - (owner.getContext().devices().size() == 1)) { - if (owner.getMemFlags() & (CL_MEM_READ_ONLY | - CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) { - // GPU will be reading from this host memory buffer, - // so assume Host write into it - type = Resource::RemoteUSWC; - remoteAlloc = true; - } - } - // Make sure owner has a valid hostmem pointer and it's not COPY - if (!remoteAlloc && (owner.getHostMem() != NULL)) { - Resource::PinnedParams params; - params.owner_ = &owner; - params.gpu_ = - reinterpret_cast(owner.getVirtualDevice()); - - params.hostMemRef_ = owner.getHostMemRef(); - params.size_ = owner.getHostMemRef()->size(); - if (0 == params.size_) { - params.size_ = owner.getSize(); - } - // Create memory object - result = gpuMemory->create(Resource::Pinned, ¶ms); - - // If direct access failed - if (!result) { - // Don't use cached allocation - // if size is biger than max single alloc - if (owner.getSize() > info().maxMemAllocSize_) { - delete gpuMemory; - return NULL; - } - } - } - } - } - - if (!result && - // Make sure it's not internal alloc - !internalAlloc) { - Resource::CreateParams params; - params.owner_ = &owner; - params.gpu_ = static_cast(owner.getVirtualDevice()); - + result = gpuImage->create(Resource::Persistent); + } else { + Resource::MemoryType type = + (owner.forceSysMemAlloc()) ? Resource::RemoteUSWC : Resource::Local; // Create memory object - result = gpuMemory->create(type, ¶ms); - - // If allocation was successful - if (result) { - // Initialize if the memory is a pipe object - if (owner.getType() == CL_MEM_OBJECT_PIPE) { - // Pipe initialize in order read_idx, write_idx, end_idx. Refer clk_pipe_t structure. 
- // Init with 3 DWORDS for 32bit addressing and 6 DWORDS for 64bit - size_t pipeInit[3] = {0 , 0, owner.asPipe()->getMaxNumPackets()}; - gpuMemory->writeRawData(*xferQueue_, sizeof(pipeInit), pipeInit, true); - } - // If memory has direct access from host, then get CPU address - if (gpuMemory->isHostMemDirectAccess() && - (type != Resource::ExternalPhysical)) { - void* address = gpuMemory->map(NULL); - if (address != NULL) { - // Copy saved memory - if (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) { - memcpy(address, owner.getHostMem(), owner.getSize()); - } - // It should be safe to change the host memory pointer, - // because it's lock protected from the upper caller - owner.setHostMem(address); - } - else { - result = false; - } - } - // An optimization for CHP. Copy memory and destroy sysmem allocation - else if ((gpuMemory->memoryType() != Resource::Pinned) && - (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) && - (owner.getContext().devices().size() == 1)) { - amd::Coord3D origin(0, 0, 0); - amd::Coord3D region(owner.getSize()); - static const bool Entire = true; - if (xferMgr().writeBuffer(owner.getHostMem(), - *gpuMemory, origin, region, Entire)) { - // Clear CHP memory - owner.setHostMem(NULL); - } - } - } + result = gpuImage->create(type); + } } if (!result) { - delete gpuMemory; - return NULL; + delete gpuImage; + return NULL; + } else if ((gpuImage->memoryType() != Resource::Pinned) && + (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) && + (owner.getContext().devices().size() == 1)) { + // Ignore copy for image1D_buffer, since it was already done for buffer + if (imageBuffer) { + // Clear CHP memory + owner.setHostMem(NULL); + } else { + amd::Coord3D origin(0, 0, 0); + static const bool Entire = true; + if (xferMgr().writeImage(owner.getHostMem(), *gpuImage, origin, image.getRegion(), 0, 0, + Entire)) { + // Clear CHP memory + owner.setHostMem(NULL); + } + } } - return gpuMemory; -} - -gpu::Memory* -Device::createImage(amd::Memory& owner, bool directAccess) 
const -{ - size_t size = owner.getSize(); - amd::Image& image = *owner.asImage(); - gpu::Memory* gpuImage = NULL; - CalFormat format = getCalFormat(image.getImageFormat()); - - if ((NULL != owner.parent()) && (owner.parent()->asImage() != NULL)) { - device::Memory* devParent = owner.parent()->getDeviceMemory(*this); - if (NULL == devParent) { - LogError("Can't get the owner object for image view allocation"); - return NULL; - } - // Create a view on the specified device - gpuImage = (gpu::Memory*)createView(owner, *devParent); - if ((NULL != gpuImage) && (gpuImage->owner() != NULL)) { - gpuImage->owner()->setHostMem((address)(owner.parent()->getHostMem()) + gpuImage->owner()->getOrigin()); - } - return gpuImage; + if (result) { + gslMemObject temp = gpuImage->gslResource(); + size_t bytePitch = gpuImage->elementSize() * temp->getPitch(); + image.setBytePitch(bytePitch); } + } - gpuImage = new gpu::Image(*this, owner, - image.getWidth(), - image.getHeight(), - image.getDepth(), - format.type_, - format.channelOrder_, - image.getType(), - image.getMipLevels()); - - // Create resource - if (NULL != gpuImage) { - const bool imageBuffer = - ((owner.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) || - ((owner.getType() == CL_MEM_OBJECT_IMAGE2D) && - (owner.parent() != NULL) && - (owner.parent()->asBuffer() != NULL))); - bool result = false; - - // Check if owner is interop memory - if (owner.isInterop()) { - result = gpuImage->createInterop(Memory::InteropDirectAccess); - } - else if (imageBuffer) { - Resource::ImageBufferParams params; - gpu::Memory* buffer = reinterpret_cast - (image.parent()->getDeviceMemory(*this)); - if (buffer == NULL) { - LogError("Buffer creation for ImageBuffer failed!"); - delete gpuImage; - return NULL; - } - params.owner_ = &owner; - params.resource_ = buffer; - params.memory_ = buffer; - - // Create memory object - result = gpuImage->create(Resource::ImageBuffer, ¶ms); - } - else if (directAccess && (owner.getMemFlags() & CL_MEM_ALLOC_HOST_PTR)) 
{ - Resource::PinnedParams params; - params.owner_ = &owner; - params.hostMemRef_ = owner.getHostMemRef(); - params.size_ = owner.getHostMemRef()->size(); - - // Create memory object - result = gpuImage->create(Resource::Pinned, ¶ms); - } - - if (!result && !owner.isInterop()) { - if (owner.getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) { - // Attempt to allocate from persistent heap - result = gpuImage->create(Resource::Persistent); - } - else { - Resource::MemoryType type = (owner.forceSysMemAlloc()) ? - Resource::RemoteUSWC : Resource::Local; - // Create memory object - result = gpuImage->create(type); - } - } - - if (!result) { - delete gpuImage; - return NULL; - } - else if ((gpuImage->memoryType() != Resource::Pinned) && - (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) && - (owner.getContext().devices().size() == 1)) { - // Ignore copy for image1D_buffer, since it was already done for buffer - if (imageBuffer) { - // Clear CHP memory - owner.setHostMem(NULL); - } - else { - amd::Coord3D origin(0, 0, 0); - static const bool Entire = true; - if (xferMgr().writeImage(owner.getHostMem(), - *gpuImage, origin, image.getRegion(), 0, 0, Entire)) { - // Clear CHP memory - owner.setHostMem(NULL); - } - } - } - - if (result) { - gslMemObject temp = gpuImage->gslResource(); - size_t bytePitch = gpuImage->elementSize() * temp->getPitch(); - image.setBytePitch(bytePitch); - } - } - - return gpuImage; + return gpuImage; } //! Allocates cache memory on the card -device::Memory* -Device::createMemory( - amd::Memory& owner) const -{ - bool directAccess = false; - gpu::Memory* memory = NULL; +device::Memory* Device::createMemory(amd::Memory& owner) const { + bool directAccess = false; + gpu::Memory* memory = NULL; - if (owner.asBuffer()) { - directAccess = (settings().hostMemDirectAccess_ & Settings::HostMemBuffer) - ? 
true : false; - memory = createBuffer(owner, directAccess); - } - else if (owner.asImage()) { - directAccess = (settings().hostMemDirectAccess_ & Settings::HostMemImage) - ? true : false; - memory = createImage(owner, directAccess); - } - else { - LogError("Unknown memory type!"); - } + if (owner.asBuffer()) { + directAccess = (settings().hostMemDirectAccess_ & Settings::HostMemBuffer) ? true : false; + memory = createBuffer(owner, directAccess); + } else if (owner.asImage()) { + directAccess = (settings().hostMemDirectAccess_ & Settings::HostMemImage) ? true : false; + memory = createImage(owner, directAccess); + } else { + LogError("Unknown memory type!"); + } - // Attempt to pin system memory if runtime didn't use direct access - if ((memory != NULL) && - (memory->memoryType() != Resource::Pinned) && - (memory->memoryType() != Resource::Remote) && - (memory->memoryType() != Resource::RemoteUSWC) && - (memory->memoryType() != Resource::ExternalPhysical) && - ((owner.getHostMem() != NULL) || - ((NULL != owner.parent()) && (owner.getHostMem() != NULL)))) { - bool ok = memory->pinSystemMemory( - owner.getHostMem(), (owner.getHostMemRef()->size()) ? - owner.getHostMemRef()->size() : owner.getSize()); - //! \note: Ignore the pinning result for now - } + // Attempt to pin system memory if runtime didn't use direct access + if ((memory != NULL) && (memory->memoryType() != Resource::Pinned) && + (memory->memoryType() != Resource::Remote) && + (memory->memoryType() != Resource::RemoteUSWC) && + (memory->memoryType() != Resource::ExternalPhysical) && + ((owner.getHostMem() != NULL) || + ((NULL != owner.parent()) && (owner.getHostMem() != NULL)))) { + bool ok = memory->pinSystemMemory(owner.getHostMem(), (owner.getHostMemRef()->size()) + ? owner.getHostMemRef()->size() + : owner.getSize()); + //! 
\note: Ignore the pinning result for now + } - return memory; + return memory; } -bool -Device::createSampler(const amd::Sampler& owner, device::Sampler** sampler) const -{ - *sampler = NULL; - if (settings().hsail_ || (settings().oclVersion_ >= OpenCL20)) { - Sampler* gpuSampler = new Sampler(*this); - if ((NULL == gpuSampler) || !gpuSampler->create(owner)) { - delete gpuSampler; - return false; - } - *sampler = gpuSampler; +bool Device::createSampler(const amd::Sampler& owner, device::Sampler** sampler) const { + *sampler = NULL; + if (settings().hsail_ || (settings().oclVersion_ >= OpenCL20)) { + Sampler* gpuSampler = new Sampler(*this); + if ((NULL == gpuSampler) || !gpuSampler->create(owner)) { + delete gpuSampler; + return false; } - return true; + *sampler = gpuSampler; + } + return true; } //! \note reallocMemory() must be called only from outside of //! VirtualGPU submit commands methods. //! Otherwise a deadlock in lockVgpus() is possible -bool -Device::reallocMemory(amd::Memory& owner) const -{ - bool directAccess = false; +bool Device::reallocMemory(amd::Memory& owner) const { + bool directAccess = false; - // For now we have to serialize reallocation code - amd::ScopedLock lk(*lockAsyncOps_); - - // Read device memory after the lock, - // since realloc from another thread can replace the pointer - gpu::Memory* gpuMemory = getGpuMemory(&owner); - if (gpuMemory == NULL) { - return false; - } - - if (gpuMemory->pinOffset() == 0) { - return true; - } - else if (NULL != owner.parent()) { - if (!reallocMemory(*owner.parent())) { - return false; - } - } - - if (owner.asBuffer()) { - gpuMemory = createBuffer(owner, directAccess); - } - else if (owner.asImage()) { - return true; - } - else { - LogError("Unknown memory type!"); - } - - if (gpuMemory != NULL) { - gpu::Memory* newMemory = gpuMemory; - gpu::Memory* oldMemory = getGpuMemory(&owner); - - // Transfer the object - if (oldMemory != NULL) { - if (!oldMemory->moveTo(*newMemory)) { - delete newMemory; - 
return false; - } - } - - // Attempt to pin system memory - if ((newMemory->memoryType() != Resource::Pinned) && - ((owner.getHostMem() != NULL) || - ((NULL != owner.parent()) && (owner.getHostMem() != NULL)))) { - bool ok = newMemory->pinSystemMemory( - owner.getHostMem(), (owner.getHostMemRef()->size()) ? - owner.getHostMemRef()->size() : owner.getSize()); - //! \note: Ignore the pinning result for now - } - - return true; - } + // For now we have to serialize reallocation code + amd::ScopedLock lk(*lockAsyncOps_); + // Read device memory after the lock, + // since realloc from another thread can replace the pointer + gpu::Memory* gpuMemory = getGpuMemory(&owner); + if (gpuMemory == NULL) { return false; + } + + if (gpuMemory->pinOffset() == 0) { + return true; + } else if (NULL != owner.parent()) { + if (!reallocMemory(*owner.parent())) { + return false; + } + } + + if (owner.asBuffer()) { + gpuMemory = createBuffer(owner, directAccess); + } else if (owner.asImage()) { + return true; + } else { + LogError("Unknown memory type!"); + } + + if (gpuMemory != NULL) { + gpu::Memory* newMemory = gpuMemory; + gpu::Memory* oldMemory = getGpuMemory(&owner); + + // Transfer the object + if (oldMemory != NULL) { + if (!oldMemory->moveTo(*newMemory)) { + delete newMemory; + return false; + } + } + + // Attempt to pin system memory + if ((newMemory->memoryType() != Resource::Pinned) && + ((owner.getHostMem() != NULL) || + ((NULL != owner.parent()) && (owner.getHostMem() != NULL)))) { + bool ok = newMemory->pinSystemMemory(owner.getHostMem(), (owner.getHostMemRef()->size()) + ? owner.getHostMemRef()->size() + : owner.getSize()); + //! 
\note: Ignore the pinning result for now + } + + return true; + } + + return false; } -device::Memory* -Device::createView(amd::Memory& owner, const device::Memory& parent) const -{ - size_t size = owner.getSize(); - assert((owner.asImage() != NULL) && "View supports images only"); - const amd::Image& image = *owner.asImage(); - gpu::Memory* gpuImage = NULL; - CalFormat format = getCalFormat(image.getImageFormat()); +device::Memory* Device::createView(amd::Memory& owner, const device::Memory& parent) const { + size_t size = owner.getSize(); + assert((owner.asImage() != NULL) && "View supports images only"); + const amd::Image& image = *owner.asImage(); + gpu::Memory* gpuImage = NULL; + CalFormat format = getCalFormat(image.getImageFormat()); - gpuImage = new gpu::Image(*this, owner, - image.getWidth(), - image.getHeight(), - image.getDepth(), - format.type_, - format.channelOrder_, - image.getType(), - image.getMipLevels()); + gpuImage = + new gpu::Image(*this, owner, image.getWidth(), image.getHeight(), image.getDepth(), + format.type_, format.channelOrder_, image.getType(), image.getMipLevels()); - // Create resource - if (NULL != gpuImage) { - bool result = false; - Resource::ImageViewParams params; - const gpu::Memory& gpuMem = static_cast(parent); + // Create resource + if (NULL != gpuImage) { + bool result = false; + Resource::ImageViewParams params; + const gpu::Memory& gpuMem = static_cast(parent); - params.owner_ = &owner; - params.level_ = image.getBaseMipLevel(); - params.layer_ = 0; - params.resource_ = &gpuMem; - params.gpu_ = reinterpret_cast(owner.getVirtualDevice()); - params.memory_ = &gpuMem; + params.owner_ = &owner; + params.level_ = image.getBaseMipLevel(); + params.layer_ = 0; + params.resource_ = &gpuMem; + params.gpu_ = reinterpret_cast(owner.getVirtualDevice()); + params.memory_ = &gpuMem; - // Create memory object - result = gpuImage->create(Resource::ImageView, ¶ms); - if (!result) { - delete gpuImage; - return NULL; - } + // Create 
memory object + result = gpuImage->create(Resource::ImageView, ¶ms); + if (!result) { + delete gpuImage; + return NULL; } + } - return gpuImage; + return gpuImage; } //! Attempt to bind with external graphics API's device/context -bool -Device::bindExternalDevice( - uint flags, void* const pDevice[], void* pContext, bool validateOnly) -{ - assert(pDevice); +bool Device::bindExternalDevice(uint flags, void* const pDevice[], void* pContext, + bool validateOnly) { + assert(pDevice); - if (flags & amd::Context::Flags::GLDeviceKhr) { - // There is no need to perform full initialization here - // if the GSLDevice is still uninitialized. - // Only adapter initialization is required to validate - // GL interoperability. - PerformAdapterInitialization(validateOnly); + if (flags & amd::Context::Flags::GLDeviceKhr) { + // There is no need to perform full initialization here + // if the GSLDevice is still uninitialized. + // Only adapter initialization is required to validate + // GL interoperability. + PerformAdapterInitialization(validateOnly); - // Attempt to associate GSL-OGL - if (!glAssociate((CALvoid*)pContext, pDevice[amd::Context::DeviceFlagIdx::GLDeviceKhrIdx])) { - CloseInitializedAdapter(validateOnly); - LogError("Failed gslGLAssociate()"); - return false; - } - - CloseInitializedAdapter(validateOnly); + // Attempt to associate GSL-OGL + if (!glAssociate((CALvoid*)pContext, pDevice[amd::Context::DeviceFlagIdx::GLDeviceKhrIdx])) { + CloseInitializedAdapter(validateOnly); + LogError("Failed gslGLAssociate()"); + return false; } + CloseInitializedAdapter(validateOnly); + } + #ifdef _WIN32 - if (flags & amd::Context::Flags::D3D10DeviceKhr) { - // There is no need to perform full initialization here - // if the GSLDevice is still uninitialized. - // Only adapter initialization is required - // to validate D3D10 interoperability. 
- PerformAdapterInitialization(validateOnly); + if (flags & amd::Context::Flags::D3D10DeviceKhr) { + // There is no need to perform full initialization here + // if the GSLDevice is still uninitialized. + // Only adapter initialization is required + // to validate D3D10 interoperability. + PerformAdapterInitialization(validateOnly); - // Associate GSL-D3D - if (!associateD3D10Device( - reinterpret_cast(pDevice[amd::Context::DeviceFlagIdx::D3D10DeviceKhrIdx]))) { - CloseInitializedAdapter(validateOnly); - LogError("Failed gslD3D10Associate()"); - return false; - } - - CloseInitializedAdapter(validateOnly); + // Associate GSL-D3D + if (!associateD3D10Device(reinterpret_cast( + pDevice[amd::Context::DeviceFlagIdx::D3D10DeviceKhrIdx]))) { + CloseInitializedAdapter(validateOnly); + LogError("Failed gslD3D10Associate()"); + return false; } - if (flags & amd::Context::Flags::D3D11DeviceKhr) { - // There is no need to perform full initialization here - // if the GSLDevice is still uninitialized. - // Only adapter initialization is required to validate - // D3D11 interoperability. - PerformAdapterInitialization(validateOnly); + CloseInitializedAdapter(validateOnly); + } - // Associate GSL-D3D - if (!associateD3D11Device( - reinterpret_cast(pDevice[amd::Context::DeviceFlagIdx::D3D11DeviceKhrIdx]))) { - CloseInitializedAdapter(validateOnly); - LogError("Failed gslD3D11Associate()"); - return false; - } + if (flags & amd::Context::Flags::D3D11DeviceKhr) { + // There is no need to perform full initialization here + // if the GSLDevice is still uninitialized. + // Only adapter initialization is required to validate + // D3D11 interoperability. 
+ PerformAdapterInitialization(validateOnly); - CloseInitializedAdapter(validateOnly); + // Associate GSL-D3D + if (!associateD3D11Device(reinterpret_cast( + pDevice[amd::Context::DeviceFlagIdx::D3D11DeviceKhrIdx]))) { + CloseInitializedAdapter(validateOnly); + LogError("Failed gslD3D11Associate()"); + return false; } - if (flags & amd::Context::Flags::D3D9DeviceKhr) { - PerformAdapterInitialization(validateOnly); + CloseInitializedAdapter(validateOnly); + } - // Associate GSL-D3D - if (!associateD3D9Device( - reinterpret_cast(pDevice[amd::Context::DeviceFlagIdx::D3D9DeviceKhrIdx]))) { - CloseInitializedAdapter(validateOnly); - LogWarning("D3D9<->OpenCL adapter mismatch or D3D9Associate() failure"); - return false; - } + if (flags & amd::Context::Flags::D3D9DeviceKhr) { + PerformAdapterInitialization(validateOnly); - CloseInitializedAdapter(validateOnly); + // Associate GSL-D3D + if (!associateD3D9Device(reinterpret_cast( + pDevice[amd::Context::DeviceFlagIdx::D3D9DeviceKhrIdx]))) { + CloseInitializedAdapter(validateOnly); + LogWarning("D3D9<->OpenCL adapter mismatch or D3D9Associate() failure"); + return false; } - if (flags & amd::Context::Flags::D3D9DeviceEXKhr) { - PerformAdapterInitialization(validateOnly); + CloseInitializedAdapter(validateOnly); + } - // Associate GSL-D3D - if (!associateD3D9Device( - reinterpret_cast(pDevice[amd::Context::DeviceFlagIdx::D3D9DeviceEXKhrIdx]))) { - CloseInitializedAdapter(validateOnly); - LogWarning("D3D9<->OpenCL adapter mismatch or D3D9Associate() failure"); - return false; - } + if (flags & amd::Context::Flags::D3D9DeviceEXKhr) { + PerformAdapterInitialization(validateOnly); - CloseInitializedAdapter(validateOnly); + // Associate GSL-D3D + if (!associateD3D9Device(reinterpret_cast( + pDevice[amd::Context::DeviceFlagIdx::D3D9DeviceEXKhrIdx]))) { + CloseInitializedAdapter(validateOnly); + LogWarning("D3D9<->OpenCL adapter mismatch or D3D9Associate() failure"); + return false; } - if (flags & 
amd::Context::Flags::D3D9DeviceVAKhr) { - } -#endif //_WIN32 + CloseInitializedAdapter(validateOnly); + } + + if (flags & amd::Context::Flags::D3D9DeviceVAKhr) { + } +#endif //_WIN32 + return true; +} + +bool Device::unbindExternalDevice(uint flags, void* const pDevice[], void* pContext, + bool validateOnly) { + if ((flags & amd::Context::Flags::GLDeviceKhr) == 0) { return true; + } + + void* glDevice = pDevice[amd::Context::DeviceFlagIdx::GLDeviceKhrIdx]; + if (glDevice != NULL) { + // Dissociate GSL-OGL + if (true != glDissociate(pContext, glDevice)) { + if (validateOnly) { + LogWarning("Failed gslGLDiassociate()"); + } + return false; + } + } + return true; } -bool -Device::unbindExternalDevice(uint flags, void* const pDevice[], void* pContext, bool validateOnly) -{ - if ((flags & amd::Context::Flags::GLDeviceKhr) == 0) { - return true; - } +bool Device::globalFreeMemory(size_t* freeMemory) const { + const uint TotalFreeMemory = 0; + const uint LargestFreeBlock = 1; - void * glDevice = pDevice[amd::Context::DeviceFlagIdx::GLDeviceKhrIdx]; - if (glDevice != NULL) { - // Dissociate GSL-OGL - if (true != glDissociate(pContext, glDevice)) { - if (validateOnly) { - LogWarning("Failed gslGLDiassociate()"); - } - return false; - } + // Initialization of heap and other resources because getMemInfo needs it. 
+ if (!(const_cast(this)->initializeHeapResources())) { + return false; + } + + gslMemInfo memInfo = {0}; + gslCtx()->getMemInfo(&memInfo, GSL_MEMINFO_BASIC); + + // Fill free memory info + freeMemory[TotalFreeMemory] = + (memInfo.cardMemAvailableBytes + memInfo.cardExtMemAvailableBytes) / Ki; + freeMemory[LargestFreeBlock] = + std::max(memInfo.cardLargestFreeBlockBytes, memInfo.cardExtLargestFreeBlockBytes) / Ki; + if (settings().apuSystem_) { + if (settings().viPlus_) { + // for viPlus_, OCL is using remote instead remoteUSWC to avoid extra copy + freeMemory[TotalFreeMemory] += memInfo.agpMemAvailableCacheableBytes / Ki; + freeMemory[LargestFreeBlock] += memInfo.agpCacheableLargestFreeBlockBytes / Ki; + } else { + freeMemory[TotalFreeMemory] += memInfo.agpMemAvailableBytes / Ki; + freeMemory[LargestFreeBlock] += memInfo.agpLargestFreeBlockBytes / Ki; } - return true; + } + + return true; } -bool -Device::globalFreeMemory(size_t* freeMemory) const -{ - const uint TotalFreeMemory = 0; - const uint LargestFreeBlock = 1; +amd::Memory* Device::findMapTarget(size_t size) const { + // Must be serialised for access + amd::ScopedLock lk(*mapCacheOps_); - // Initialization of heap and other resources because getMemInfo needs it. 
- if (!(const_cast(this)->initializeHeapResources())) { - return false; - } + amd::Memory* map = NULL; + size_t minSize = 0; + size_t maxSize = 0; + uint mapId = mapCache_->size(); + uint releaseId = mapCache_->size(); - gslMemInfo memInfo = {0}; - gslCtx()->getMemInfo(&memInfo, GSL_MEMINFO_BASIC); - - // Fill free memory info - freeMemory[TotalFreeMemory] = (memInfo.cardMemAvailableBytes + - memInfo.cardExtMemAvailableBytes) / Ki; - freeMemory[LargestFreeBlock] = std::max(memInfo.cardLargestFreeBlockBytes, - memInfo.cardExtLargestFreeBlockBytes) / Ki; - if (settings().apuSystem_) { - if (settings().viPlus_) { - // for viPlus_, OCL is using remote instead remoteUSWC to avoid extra copy - freeMemory[TotalFreeMemory] += memInfo.agpMemAvailableCacheableBytes / Ki; - freeMemory[LargestFreeBlock] += memInfo.agpCacheableLargestFreeBlockBytes / Ki; + // Find if the list has a map target of appropriate size + for (uint i = 0; i < mapCache_->size(); i++) { + if ((*mapCache_)[i] != NULL) { + // Requested size is smaller than the entry size + if (size < (*mapCache_)[i]->getSize()) { + if ((minSize == 0) || (minSize > (*mapCache_)[i]->getSize())) { + minSize = (*mapCache_)[i]->getSize(); + mapId = i; } - else { - freeMemory[TotalFreeMemory] += memInfo.agpMemAvailableBytes / Ki; - freeMemory[LargestFreeBlock] += memInfo.agpLargestFreeBlockBytes / Ki; + } + // Requeted size matches the entry size + else if (size == (*mapCache_)[i]->getSize()) { + mapId = i; + break; + } else { + // Find the biggest map target in the list + if (maxSize < (*mapCache_)[i]->getSize()) { + maxSize = (*mapCache_)[i]->getSize(); + releaseId = i; } + } } + } - return true; + // Check if we found any map target + if (mapId < mapCache_->size()) { + map = (*mapCache_)[mapId]; + (*mapCache_)[mapId] = NULL; + Memory* gpuMemory = reinterpret_cast(map->getDeviceMemory(*this)); + + // Get the base pointer for the map resource + if ((gpuMemory == NULL) || (NULL == gpuMemory->map(NULL))) { + 
(*mapCache_)[mapId]->release(); + map = NULL; + } + } + // If cache is full, then release the biggest map target + else if (releaseId < mapCache_->size()) { + (*mapCache_)[releaseId]->release(); + (*mapCache_)[releaseId] = NULL; + } + + return map; } -amd::Memory* -Device::findMapTarget(size_t size) const -{ - // Must be serialised for access - amd::ScopedLock lk(*mapCacheOps_); +bool Device::addMapTarget(amd::Memory* memory) const { + // Must be serialised for access + amd::ScopedLock lk(*mapCacheOps_); - amd::Memory* map = NULL; - size_t minSize = 0; - size_t maxSize = 0; - uint mapId = mapCache_->size(); - uint releaseId = mapCache_->size(); - - // Find if the list has a map target of appropriate size - for (uint i = 0; i < mapCache_->size(); i++) { - if ((*mapCache_)[i] != NULL) { - // Requested size is smaller than the entry size - if (size < (*mapCache_)[i]->getSize()) { - if ((minSize == 0) || - (minSize > (*mapCache_)[i]->getSize())) { - minSize = (*mapCache_)[i]->getSize(); - mapId = i; - } - } - // Requeted size matches the entry size - else if (size == (*mapCache_)[i]->getSize()) { - mapId = i; - break; - } - else { - // Find the biggest map target in the list - if (maxSize < (*mapCache_)[i]->getSize()) { - maxSize = (*mapCache_)[i]->getSize(); - releaseId = i; - } - } - } + // the svm memory shouldn't be cached + if (!memory->canBeCached()) { + return false; + } + // Find if the list has a map target of appropriate size + for (uint i = 0; i < mapCache_->size(); ++i) { + if ((*mapCache_)[i] == NULL) { + (*mapCache_)[i] = memory; + return true; } + } - // Check if we found any map target - if (mapId < mapCache_->size()) { - map = (*mapCache_)[mapId]; - (*mapCache_)[mapId] = NULL; - Memory* gpuMemory = reinterpret_cast - (map->getDeviceMemory(*this)); + // Add a new entry + mapCache_->push_back(memory); - // Get the base pointer for the map resource - if ((gpuMemory == NULL) || (NULL == gpuMemory->map(NULL))) { - (*mapCache_)[mapId]->release(); - map = 
NULL; - } - } - // If cache is full, then release the biggest map target - else if (releaseId < mapCache_->size()) { - (*mapCache_)[releaseId]->release(); - (*mapCache_)[releaseId] = NULL; - } - - return map; + return true; } -bool -Device::addMapTarget(amd::Memory* memory) const -{ - // Must be serialised for access - amd::ScopedLock lk(*mapCacheOps_); +Device::ScratchBuffer::~ScratchBuffer() { destroyMemory(); } - //the svm memory shouldn't be cached - if (!memory->canBeCached()) { - return false; +void Device::ScratchBuffer::destroyMemory() { + // Release memory object + delete memObj_; + memObj_ = NULL; +} + +bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu) { + if (regNum > 0) { + // Serialize the scratch buffer allocation code + amd::ScopedLock lk(*scratchAlloc_); + uint sb = vgpu->hwRing(); + + static const uint WaveSizeLimit = ((1 << 21) - 256); + const uint threadSizeLimit = WaveSizeLimit / getAttribs().wavefrontSize; + if (regNum > threadSizeLimit) { + LogError("Requested private memory is bigger than HW supports!"); + regNum = threadSizeLimit; } - // Find if the list has a map target of appropriate size - for (uint i = 0; i < mapCache_->size(); ++i) { - if ((*mapCache_)[i] == NULL) { - (*mapCache_)[i] = memory; - return true; + + // Check if the current buffer isn't big enough + if (regNum > scratch_[sb]->regNum_) { + // Stall all command queues, since runtime will reallocate memory + ScopedLockVgpus lock(*this); + + scratch_[sb]->regNum_ = regNum; + uint64_t size = 0; + uint64_t offset = 0; + + // Destroy all views + for (uint s = 0; s < scratch_.size(); ++s) { + ScratchBuffer* scratchBuf = scratch_[s]; + if (scratchBuf->regNum_ > 0) { + scratchBuf->destroyMemory(); + // Calculate the size of the scratch buffer for a queue + scratchBuf->size_ = calcScratchBufferSize(scratchBuf->regNum_); + scratchBuf->size_ = std::min(scratchBuf->size_, info().maxMemAllocSize_); + scratchBuf->size_ = std::min(scratchBuf->size_, uint64_t(3 * Gi)); + 
scratchBuf->size_ = amd::alignUp(scratchBuf->size_, 0xFFFF); + scratchBuf->offset_ = offset; + size += scratchBuf->size_; + offset += scratchBuf->size_; } - } + } - // Add a new entry - mapCache_->push_back(memory); + delete globalScratchBuf_; - return true; -} - -Device::ScratchBuffer::~ScratchBuffer() -{ - destroyMemory(); -} - -void -Device::ScratchBuffer::destroyMemory() -{ - // Release memory object - delete memObj_; - memObj_ = NULL; -} - -bool -Device::allocScratch(uint regNum, const VirtualGPU* vgpu) -{ - if (regNum > 0) { - // Serialize the scratch buffer allocation code - amd::ScopedLock lk(*scratchAlloc_); - uint sb = vgpu->hwRing(); - - static const uint WaveSizeLimit = ((1 << 21) - 256); - const uint threadSizeLimit = WaveSizeLimit / getAttribs().wavefrontSize; - if (regNum > threadSizeLimit) { - LogError("Requested private memory is bigger than HW supports!"); - regNum = threadSizeLimit; - } - - // Check if the current buffer isn't big enough - if (regNum > scratch_[sb]->regNum_) { - // Stall all command queues, since runtime will reallocate memory - ScopedLockVgpus lock(*this); - - scratch_[sb]->regNum_ = regNum; - uint64_t size = 0; - uint64_t offset = 0; - - // Destroy all views - for (uint s = 0; s < scratch_.size(); ++s) { - ScratchBuffer* scratchBuf = scratch_[s]; - if (scratchBuf->regNum_ > 0) { - scratchBuf->destroyMemory(); - // Calculate the size of the scratch buffer for a queue - scratchBuf->size_ = calcScratchBufferSize(scratchBuf->regNum_); - scratchBuf->size_ = std::min(scratchBuf->size_, info().maxMemAllocSize_); - scratchBuf->size_ = std::min(scratchBuf->size_, uint64_t(3 * Gi)); - scratchBuf->size_ = amd::alignUp(scratchBuf->size_, 0xFFFF); - scratchBuf->offset_ = offset; - size += scratchBuf->size_; - offset += scratchBuf->size_; - } - } - - delete globalScratchBuf_; - - // Allocate new buffer. 
- globalScratchBuf_ = new gpu::Memory(*this, static_cast(size)); - if ((globalScratchBuf_ == NULL) || - !globalScratchBuf_->create(Resource::Scratch)) { - LogError("Couldn't allocate scratch memory"); - for (uint s = 0; s < scratch_.size(); ++s) { - scratch_[s]->regNum_ = 0; - } - return false; - } - - for (uint s = 0; s < scratch_.size(); ++s) { - // Loop through all memory objects and reallocate them - if (scratch_[s]->regNum_ > 0) { - // Allocate new buffer - scratch_[s]->memObj_ = new gpu::Memory(*this, scratch_[s]->size_); - Resource::ViewParams view; - view.resource_ = globalScratchBuf_; - view.offset_ = scratch_[s]->offset_; - view.size_ = scratch_[s]->size_; - if ((scratch_[s]->memObj_ == NULL) || - !scratch_[s]->memObj_->create(Resource::View, &view)) { - LogError("Couldn't allocate a scratch view"); - delete scratch_[s]->memObj_; - scratch_[s]->regNum_ = 0; - return false; - } - } - } - } - } - return true; -} - -bool -Device::validateKernel(const amd::Kernel& kernel, const device::VirtualDevice* vdev) -{ - // Find the number of scratch registers used in the kernel - const device::Kernel* devKernel = kernel.getDeviceKernel(*this); - uint regNum = static_cast(devKernel->workGroupInfo()->scratchRegs_); - const VirtualGPU* vgpu = static_cast(vdev); - - if (!allocScratch(regNum, vgpu)) { - return false; - } - - if (devKernel->hsa()) { - const HSAILKernel* hsaKernel = static_cast(devKernel); - if (hsaKernel->dynamicParallelism()) { - amd::DeviceQueue* defQueue = - kernel.program().context().defDeviceQueue(*this); - if (defQueue != NULL) { - vgpu = static_cast(defQueue->vDev()); - if (!allocScratch(hsaKernel->prog().maxScratchRegs(), vgpu)) { - return false; - } - } - else { - return false; - } - } - } - - return true; -} - -void -Device::destroyScratchBuffers() -{ - if (globalScratchBuf_ != NULL) { + // Allocate new buffer. 
+ globalScratchBuf_ = new gpu::Memory(*this, static_cast(size)); + if ((globalScratchBuf_ == NULL) || !globalScratchBuf_->create(Resource::Scratch)) { + LogError("Couldn't allocate scratch memory"); for (uint s = 0; s < scratch_.size(); ++s) { - scratch_[s]->destroyMemory(); + scratch_[s]->regNum_ = 0; + } + return false; + } + + for (uint s = 0; s < scratch_.size(); ++s) { + // Loop through all memory objects and reallocate them + if (scratch_[s]->regNum_ > 0) { + // Allocate new buffer + scratch_[s]->memObj_ = new gpu::Memory(*this, scratch_[s]->size_); + Resource::ViewParams view; + view.resource_ = globalScratchBuf_; + view.offset_ = scratch_[s]->offset_; + view.size_ = scratch_[s]->size_; + if ((scratch_[s]->memObj_ == NULL) || + !scratch_[s]->memObj_->create(Resource::View, &view)) { + LogError("Couldn't allocate a scratch view"); + delete scratch_[s]->memObj_; scratch_[s]->regNum_ = 0; + return false; + } } - delete globalScratchBuf_; - globalScratchBuf_ = NULL; + } } + } + return true; } -void -Device::fillHwSampler( - uint32_t state, void* hwState, uint32_t hwStateSize, - uint32_t mipFilter, float minLod, float maxLod) const -{ - // All GSL sampler's parameters are in floats - uint32_t gslAddress = GSL_CLAMP_TO_BORDER; - uint32_t gslMinFilter = GSL_MIN_NEAREST; - uint32_t gslMagFilter = GSL_MAG_NEAREST; - bool unnorm = !(state & amd::Sampler::StateNormalizedCoordsMask); +bool Device::validateKernel(const amd::Kernel& kernel, const device::VirtualDevice* vdev) { + // Find the number of scratch registers used in the kernel + const device::Kernel* devKernel = kernel.getDeviceKernel(*this); + uint regNum = static_cast(devKernel->workGroupInfo()->scratchRegs_); + const VirtualGPU* vgpu = static_cast(vdev); - state &= ~amd::Sampler::StateNormalizedCoordsMask; + if (!allocScratch(regNum, vgpu)) { + return false; + } - // Program the sampler address mode - switch (state & amd::Sampler::StateAddressMask) { - case amd::Sampler::StateAddressRepeat: - gslAddress = 
GSL_REPEAT; - break; - case amd::Sampler::StateAddressClampToEdge: - gslAddress = GSL_CLAMP_TO_EDGE; - break; - case amd::Sampler::StateAddressMirroredRepeat: - gslAddress = GSL_MIRRORED_REPEAT; - break; - case amd::Sampler::StateAddressClamp: - case amd::Sampler::StateAddressNone: - default: - break; - } - state &= ~amd::Sampler::StateAddressMask; - - // Program texture filter mode - if (state == amd::Sampler::StateFilterLinear) { - gslMinFilter = GSL_MIN_LINEAR; - gslMagFilter = GSL_MAG_LINEAR; - } - - if (mipFilter == CL_FILTER_NEAREST) { - if (gslMinFilter == GSL_MIN_NEAREST) { - gslMinFilter = GSL_MIN_NEAREST_MIPMAP_NEAREST; - } - else { - gslMinFilter = GSL_MIN_LINEAR_MIPMAP_NEAREST; - } - } - else if (mipFilter == CL_FILTER_LINEAR) { - if (gslMinFilter == GSL_MIN_NEAREST) { - gslMinFilter = GSL_MIN_NEAREST_MIPMAP_LINEAR; - } - else { - gslMinFilter = GSL_MIN_LINEAR_MIPMAP_LINEAR; + if (devKernel->hsa()) { + const HSAILKernel* hsaKernel = static_cast(devKernel); + if (hsaKernel->dynamicParallelism()) { + amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(*this); + if (defQueue != NULL) { + vgpu = static_cast(defQueue->vDev()); + if (!allocScratch(hsaKernel->prog().maxScratchRegs(), vgpu)) { + return false; } + } else { + return false; + } } + } - fillSamplerHwState(unnorm, gslMinFilter, gslMagFilter, - gslAddress, minLod, maxLod, hwState, hwStateSize); + return true; } -void* -Device::hostAlloc(size_t size, size_t alignment, bool atomics) const -{ - //for discrete gpu, we only reserve,no commit yet. 
- return amd::Os::reserveMemory(NULL, size, alignment, amd::Os::MEM_PROT_NONE); -} - -void -Device::hostFree(void* ptr, size_t size) const -{ - //If we allocate the host memory, we need free, or we have to release - amd::Os::releaseMemory(ptr, size); -} - -void* -Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags, void* svmPtr) const -{ - alignment = std::max(alignment, static_cast(info_.memBaseAddrAlign_)); - - //VAM for GPU needs 64K alignment for Tahiti and CI+, will pull idnfo from gsl later - size_t vmBigK = 64 * Ki; - alignment = (alignment < vmBigK) ? vmBigK : alignment; - - size = amd::alignUp(size, alignment); - amd::Memory* mem = NULL; - if (NULL == svmPtr) { - if (isFineGrainedSystem()) { - return amd::Os::alignedMalloc(size, alignment); - } - - //create a hidden buffer, which will allocated on the device later - mem = new (context)amd::Buffer(context, flags, size, reinterpret_cast(1)); - if (mem == NULL) { - LogError("failed to create a svm mem object!"); - return NULL; - } - - if (!mem->create(NULL, false)) { - LogError("failed to create a svm hidden buffer!"); - mem->release(); - return NULL; - } - //if the device supports SVM FGS, return the committed CPU address directly. - gpu::Memory* gpuMem = getGpuMemory(mem); - - //add the information to context so that we can use it later. - amd::SvmManager::AddSvmBuffer(mem->getSvmPtr(), mem); - svmPtr = mem->getSvmPtr(); +void Device::destroyScratchBuffers() { + if (globalScratchBuf_ != NULL) { + for (uint s = 0; s < scratch_.size(); ++s) { + scratch_[s]->destroyMemory(); + scratch_[s]->regNum_ = 0; } - else { - //find the existing amd::mem object - mem = amd::SvmManager::FindSvmBuffer(svmPtr); - if (NULL == mem) { - return NULL; - } - //commit the CPU memory for FGS device. 
- if (isFineGrainedSystem()) { - mem->commitSvmMemory(); - } - else { - gpu::Memory* gpuMem = getGpuMemory(mem); - } - svmPtr = mem->getSvmPtr(); - } - return svmPtr; + delete globalScratchBuf_; + globalScratchBuf_ = NULL; + } } -void -Device::svmFree(void *ptr) const -{ +void Device::fillHwSampler(uint32_t state, void* hwState, uint32_t hwStateSize, uint32_t mipFilter, + float minLod, float maxLod) const { + // All GSL sampler's parameters are in floats + uint32_t gslAddress = GSL_CLAMP_TO_BORDER; + uint32_t gslMinFilter = GSL_MIN_NEAREST; + uint32_t gslMagFilter = GSL_MAG_NEAREST; + bool unnorm = !(state & amd::Sampler::StateNormalizedCoordsMask); + + state &= ~amd::Sampler::StateNormalizedCoordsMask; + + // Program the sampler address mode + switch (state & amd::Sampler::StateAddressMask) { + case amd::Sampler::StateAddressRepeat: + gslAddress = GSL_REPEAT; + break; + case amd::Sampler::StateAddressClampToEdge: + gslAddress = GSL_CLAMP_TO_EDGE; + break; + case amd::Sampler::StateAddressMirroredRepeat: + gslAddress = GSL_MIRRORED_REPEAT; + break; + case amd::Sampler::StateAddressClamp: + case amd::Sampler::StateAddressNone: + default: + break; + } + state &= ~amd::Sampler::StateAddressMask; + + // Program texture filter mode + if (state == amd::Sampler::StateFilterLinear) { + gslMinFilter = GSL_MIN_LINEAR; + gslMagFilter = GSL_MAG_LINEAR; + } + + if (mipFilter == CL_FILTER_NEAREST) { + if (gslMinFilter == GSL_MIN_NEAREST) { + gslMinFilter = GSL_MIN_NEAREST_MIPMAP_NEAREST; + } else { + gslMinFilter = GSL_MIN_LINEAR_MIPMAP_NEAREST; + } + } else if (mipFilter == CL_FILTER_LINEAR) { + if (gslMinFilter == GSL_MIN_NEAREST) { + gslMinFilter = GSL_MIN_NEAREST_MIPMAP_LINEAR; + } else { + gslMinFilter = GSL_MIN_LINEAR_MIPMAP_LINEAR; + } + } + + fillSamplerHwState(unnorm, gslMinFilter, gslMagFilter, gslAddress, minLod, maxLod, hwState, + hwStateSize); +} + +void* Device::hostAlloc(size_t size, size_t alignment, bool atomics) const { + // for discrete gpu, we only reserve,no 
commit yet. + return amd::Os::reserveMemory(NULL, size, alignment, amd::Os::MEM_PROT_NONE); +} + +void Device::hostFree(void* ptr, size_t size) const { + // If we allocate the host memory, we need free, or we have to release + amd::Os::releaseMemory(ptr, size); +} + +void* Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags, + void* svmPtr) const { + alignment = std::max(alignment, static_cast(info_.memBaseAddrAlign_)); + + // VAM for GPU needs 64K alignment for Tahiti and CI+, will pull idnfo from gsl later + size_t vmBigK = 64 * Ki; + alignment = (alignment < vmBigK) ? vmBigK : alignment; + + size = amd::alignUp(size, alignment); + amd::Memory* mem = NULL; + if (NULL == svmPtr) { if (isFineGrainedSystem()) { - amd::Os::alignedFree(ptr); + return amd::Os::alignedMalloc(size, alignment); } - else { - amd::Memory * svmMem = NULL; - svmMem = amd::SvmManager::FindSvmBuffer(ptr); - if (NULL != svmMem) { - svmMem->release(); - amd::SvmManager::RemoveSvmBuffer(ptr); - } + + // create a hidden buffer, which will allocated on the device later + mem = new (context) amd::Buffer(context, flags, size, reinterpret_cast(1)); + if (mem == NULL) { + LogError("failed to create a svm mem object!"); + return NULL; } + + if (!mem->create(NULL, false)) { + LogError("failed to create a svm hidden buffer!"); + mem->release(); + return NULL; + } + // if the device supports SVM FGS, return the committed CPU address directly. + gpu::Memory* gpuMem = getGpuMemory(mem); + + // add the information to context so that we can use it later. + amd::SvmManager::AddSvmBuffer(mem->getSvmPtr(), mem); + svmPtr = mem->getSvmPtr(); + } else { + // find the existing amd::mem object + mem = amd::SvmManager::FindSvmBuffer(svmPtr); + if (NULL == mem) { + return NULL; + } + // commit the CPU memory for FGS device. 
+ if (isFineGrainedSystem()) { + mem->commitSvmMemory(); + } else { + gpu::Memory* gpuMem = getGpuMemory(mem); + } + svmPtr = mem->getSvmPtr(); + } + return svmPtr; +} + +void Device::svmFree(void* ptr) const { + if (isFineGrainedSystem()) { + amd::Os::alignedFree(ptr); + } else { + amd::Memory* svmMem = NULL; + svmMem = amd::SvmManager::FindSvmBuffer(ptr); + if (NULL != svmMem) { + svmMem->release(); + amd::SvmManager::RemoveSvmBuffer(ptr); + } + } } -Device::SrdManager::~SrdManager() -{ - for (uint i = 0; i < pool_.size(); ++i) { - pool_[i].buf_->unmap(NULL); - delete pool_[i].buf_; - delete pool_[i].flags_; - } +Device::SrdManager::~SrdManager() { + for (uint i = 0; i < pool_.size(); ++i) { + pool_[i].buf_->unmap(NULL); + delete pool_[i].buf_; + delete pool_[i].flags_; + } } -bool -Sampler::create(uint32_t oclSamplerState) -{ - hwSrd_ = dev_.srds().allocSrdSlot(&hwState_); - if (0 == hwSrd_) { - return false; - } - dev_.fillHwSampler(oclSamplerState, hwState_, HsaSamplerObjectSize); - return true; +bool Sampler::create(uint32_t oclSamplerState) { + hwSrd_ = dev_.srds().allocSrdSlot(&hwState_); + if (0 == hwSrd_) { + return false; + } + dev_.fillHwSampler(oclSamplerState, hwState_, HsaSamplerObjectSize); + return true; } -bool -Sampler::create(const amd::Sampler& owner) -{ - hwSrd_ = dev_.srds().allocSrdSlot(&hwState_); - if (0 == hwSrd_) { - return false; - } - dev_.fillHwSampler(owner.state(), hwState_, HsaSamplerObjectSize, - owner.mipFilter(), owner.minLod(), owner.maxLod()); - return true; +bool Sampler::create(const amd::Sampler& owner) { + hwSrd_ = dev_.srds().allocSrdSlot(&hwState_); + if (0 == hwSrd_) { + return false; + } + dev_.fillHwSampler(owner.state(), hwState_, HsaSamplerObjectSize, owner.mipFilter(), + owner.minLod(), owner.maxLod()); + return true; } -Sampler::~Sampler() -{ - dev_.srds().freeSrdSlot(hwSrd_); +Sampler::~Sampler() { dev_.srds().freeSrdSlot(hwSrd_); } + +uint64_t Device::SrdManager::allocSrdSlot(address* cpuAddr) { + 
amd::ScopedLock lock(ml_); + // Check all buffers in the pool of chunks + for (uint i = 0; i < pool_.size(); ++i) { + const Chunk& ch = pool_[i]; + // Search for an empty slot + for (uint s = 0; s < numFlags_; ++s) { + uint mask = ch.flags_[s]; + // Check if there is an empty slot in this group + if (mask != 0) { + uint idx; + // Find the first empty index + for (idx = 0; (mask & 0x1) == 0; mask >>= 1, ++idx) + ; + // Mark the slot as busy + ch.flags_[s] &= ~(1 << idx); + // Calculate SRD offset in the buffer + uint offset = (s * MaskBits + idx) * srdSize_; + *cpuAddr = ch.buf_->data() + offset; + return ch.buf_->vmAddress() + offset; + } + } + } + // At this point the manager doesn't have empty slots + // and has to allocate a new chunk + Chunk chunk; + chunk.flags_ = new uint[numFlags_]; + if (chunk.flags_ == NULL) { + return 0; + } + chunk.buf_ = new Memory(dev_, bufSize_); + if (chunk.buf_ == NULL || !chunk.buf_->create(Resource::Remote) || + (NULL == chunk.buf_->map(NULL))) { + delete[] chunk.flags_; + delete chunk.buf_; + return 0; + } + // All slots in the chunk are in "free" state + memset(chunk.flags_, 0xff, numFlags_ * sizeof(uint)); + // Take the first one... 
+ chunk.flags_[0] &= ~0x1; + pool_.push_back(chunk); + *cpuAddr = chunk.buf_->data(); + return chunk.buf_->vmAddress(); } -uint64_t -Device::SrdManager::allocSrdSlot(address* cpuAddr) -{ - amd::ScopedLock lock(ml_); - // Check all buffers in the pool of chunks - for (uint i = 0; i < pool_.size(); ++i) { - const Chunk& ch = pool_[i]; - // Search for an empty slot - for (uint s = 0; s < numFlags_; ++s) { - uint mask = ch.flags_[s]; - // Check if there is an empty slot in this group - if (mask != 0) { - uint idx; - // Find the first empty index - for (idx = 0; (mask & 0x1) == 0; mask >>= 1, ++idx); - // Mark the slot as busy - ch.flags_[s] &= ~(1 << idx); - // Calculate SRD offset in the buffer - uint offset = (s * MaskBits + idx) * srdSize_; - *cpuAddr = ch.buf_->data() + offset; - return ch.buf_->vmAddress() + offset; - } - } +void Device::SrdManager::freeSrdSlot(uint64_t addr) { + amd::ScopedLock lock(ml_); + // Check all buffers in the pool of chunks + for (uint i = 0; i < pool_.size(); ++i) { + Chunk* ch = &pool_[i]; + // Find the offset + int64_t offs = static_cast(addr) - static_cast(ch->buf_->vmAddress()); + // Check if the offset inside the chunk buffer + if ((offs >= 0) && (offs < bufSize_)) { + // Find the index in the chunk + uint idx = offs / srdSize_; + uint s = idx / MaskBits; + // Free the slot + ch->flags_[s] |= 1 << (idx % MaskBits); + return; } - // At this point the manager doesn't have empty slots - // and has to allocate a new chunk - Chunk chunk; - chunk.flags_ = new uint[numFlags_]; - if (chunk.flags_ == NULL) { - return 0; - } - chunk.buf_ = new Memory(dev_, bufSize_); - if (chunk.buf_ == NULL || !chunk.buf_->create(Resource::Remote) || - (NULL == chunk.buf_->map(NULL))) { - delete [] chunk.flags_; - delete chunk.buf_; - return 0; - } - // All slots in the chunk are in "free" state - memset(chunk.flags_, 0xff, numFlags_ * sizeof(uint)); - // Take the first one... 
- chunk.flags_[0] &= ~0x1; - pool_.push_back(chunk); - *cpuAddr = chunk.buf_->data(); - return chunk.buf_->vmAddress(); + } + assert(false && "Wrong slot address!"); } -void -Device::SrdManager::freeSrdSlot(uint64_t addr) { - amd::ScopedLock lock(ml_); - // Check all buffers in the pool of chunks - for (uint i = 0; i < pool_.size(); ++i) { - Chunk* ch = &pool_[i]; - // Find the offset - int64_t offs = static_cast(addr) - - static_cast(ch->buf_->vmAddress()); - // Check if the offset inside the chunk buffer - if ((offs >= 0) && (offs < bufSize_)) { - // Find the index in the chunk - uint idx = offs / srdSize_; - uint s = idx / MaskBits; - // Free the slot - ch->flags_[s] |= 1 << (idx % MaskBits); - return; - } - } - assert(false && "Wrong slot address!"); +void Device::SrdManager::fillResourceList(std::vector& memList) { + for (uint i = 0; i < pool_.size(); ++i) { + memList.push_back(pool_[i].buf_); + } } -void -Device::SrdManager::fillResourceList(std::vector& memList) -{ - for (uint i = 0; i < pool_.size(); ++i) { - memList.push_back(pool_[i].buf_); - } +cl_int Device::hwDebugManagerInit(amd::Context* context, uintptr_t messageStorage) { + cl_int status = hwDebugMgr_->registerDebugger(context, messageStorage); + + if (CL_SUCCESS != status) { + delete hwDebugMgr_; + hwDebugMgr_ = NULL; + } + + return status; } -cl_int -Device::hwDebugManagerInit(amd::Context *context, uintptr_t messageStorage) -{ - cl_int status = hwDebugMgr_->registerDebugger(context, messageStorage); - - if (CL_SUCCESS != status) { - delete hwDebugMgr_; - hwDebugMgr_ = NULL; - } - - return status; -} - -} // namespace gpu +} // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpudevice.hpp b/rocclr/runtime/device/gpu/gpudevice.hpp index 05bd7daaf1..6001440a81 100644 --- a/rocclr/runtime/device/gpu/gpudevice.hpp +++ b/rocclr/runtime/device/gpu/gpudevice.hpp @@ -33,105 +33,106 @@ namespace gpu { //! 
A nil device object -class NullDevice : public amd::Device -{ -protected: - static aclCompiler* compiler_; - static aclCompiler* hsaCompiler_; -public: - aclCompiler* compiler() const { return compiler_; } - aclCompiler* hsaCompiler() const { return hsaCompiler_; } +class NullDevice : public amd::Device { + protected: + static aclCompiler* compiler_; + static aclCompiler* hsaCompiler_; -public: - static bool init(void); + public: + aclCompiler* compiler() const { return compiler_; } + aclCompiler* hsaCompiler() const { return hsaCompiler_; } - //! Construct a new identifier - NullDevice(); + public: + static bool init(void); - //! Creates an offline device with the specified target - bool create( - CALtarget target //!< GPU device identifier - ); + //! Construct a new identifier + NullDevice(); - virtual cl_int createSubDevices( - device::CreateSubDevicesInfo& create_info, - cl_uint num_entries, - cl_device_id* devices, - cl_uint* num_devices) { - return CL_INVALID_VALUE; - } + //! Creates an offline device with the specified target + bool create(CALtarget target //!< GPU device identifier + ); - //! Instantiate a new virtual device - virtual device::VirtualDevice* createVirtualDevice( - amd::CommandQueue* queue = NULL - ) { return NULL; } + virtual cl_int createSubDevices(device::CreateSubDevicesInfo& create_info, cl_uint num_entries, + cl_device_id* devices, cl_uint* num_devices) { + return CL_INVALID_VALUE; + } - //! Create the device program. - virtual device::Program* createProgram(amd::option::Options* options = NULL); + //! Instantiate a new virtual device + virtual device::VirtualDevice* createVirtualDevice(amd::CommandQueue* queue = NULL) { + return NULL; + } - //! Just returns NULL for the dummy device - virtual device::Memory* createMemory(amd::Memory& owner) const { return NULL; } + //! Create the device program. + virtual device::Program* createProgram(amd::option::Options* options = NULL); - //! 
Sampler object allocation - virtual bool createSampler( - const amd::Sampler& owner, //!< abstraction layer sampler object - device::Sampler** sampler //!< device sampler object - ) const - { - ShouldNotReachHere(); - return true; - } + //! Just returns NULL for the dummy device + virtual device::Memory* createMemory(amd::Memory& owner) const { return NULL; } - //! Just returns NULL for the dummy device - virtual device::Memory* createView( - amd::Memory& owner, //!< Owner memory object - const device::Memory& parent //!< Parent device memory object for the view - ) const { return NULL; } + //! Sampler object allocation + virtual bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object + device::Sampler** sampler //!< device sampler object + ) const { + ShouldNotReachHere(); + return true; + } - //! Reallocates the provided buffer object - virtual bool reallocMemory(amd::Memory& owner) const { return true; } + //! Just returns NULL for the dummy device + virtual device::Memory* createView( + amd::Memory& owner, //!< Owner memory object + const device::Memory& parent //!< Parent device memory object for the view + ) const { + return NULL; + } - //! Acquire external graphics API object in the host thread - //! Needed for OpenGL objects on CPU device + //! Reallocates the provided buffer object + virtual bool reallocMemory(amd::Memory& owner) const { return true; } - virtual bool bindExternalDevice( - uint flags, void* const pDevice[], void* pContext, bool validateOnly) { return true; } + //! Acquire external graphics API object in the host thread + //! Needed for OpenGL objects on CPU device - virtual bool unbindExternalDevice( - uint flags, void* const pDevice[], void* pContext, bool validateOnly) { return true; } + virtual bool bindExternalDevice(uint flags, void* const pDevice[], void* pContext, + bool validateOnly) { + return true; + } - //! 
Releases non-blocking map target memory - virtual void freeMapTarget(amd::Memory& mem, void* target) {} + virtual bool unbindExternalDevice(uint flags, void* const pDevice[], void* pContext, + bool validateOnly) { + return true; + } - CALtarget calTarget() const { return calTarget_; } + //! Releases non-blocking map target memory + virtual void freeMapTarget(amd::Memory& mem, void* target) {} - const AMDDeviceInfo* hwInfo() const { return hwInfo_; } + CALtarget calTarget() const { return calTarget_; } - //! Empty implementation on Null device - virtual bool globalFreeMemory(size_t* freeMemory) const { return false; } + const AMDDeviceInfo* hwInfo() const { return hwInfo_; } - //! Get GPU device settings - const gpu::Settings& settings() const - { return reinterpret_cast(*settings_); } - virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags, void* svmPtr) const { return NULL; } - virtual void svmFree(void* ptr) const {return;} + //! Empty implementation on Null device + virtual bool globalFreeMemory(size_t* freeMemory) const { return false; } -protected: - CALtarget calTarget_; //!< GPU device identifier - const AMDDeviceInfo* hwInfo_; //!< Device HW info structure + //! Get GPU device settings + const gpu::Settings& settings() const { return reinterpret_cast(*settings_); } + virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment, + cl_svm_mem_flags flags, void* svmPtr) const { + return NULL; + } + virtual void svmFree(void* ptr) const { return; } - //! Answer the question: "Should HSAIL Program be created?", - //! based on the given options. - bool isHsailProgram(amd::option::Options* options = NULL); + protected: + CALtarget calTarget_; //!< GPU device identifier + const AMDDeviceInfo* hwInfo_; //!< Device HW info structure - //! 
Fills OpenCL device info structure - void fillDeviceInfo( - const CALdeviceattribs& calAttr, //!< CAL device attributes info - const gslMemInfo& memInfo, //!< GSL mem info - size_t maxTextureSize, //!< Maximum texture size supported in HW - uint numComputeRings, //!< Number of compute rings - uint numComputeRingsRT //!< Number of RT compute rings - ); + //! Answer the question: "Should HSAIL Program be created?", + //! based on the given options. + bool isHsailProgram(amd::option::Options* options = NULL); + + //! Fills OpenCL device info structure + void fillDeviceInfo(const CALdeviceattribs& calAttr, //!< CAL device attributes info + const gslMemInfo& memInfo, //!< GSL mem info + size_t maxTextureSize, //!< Maximum texture size supported in HW + uint numComputeRings, //!< Number of compute rings + uint numComputeRingsRT //!< Number of RT compute rings + ); }; //! Forward declarations @@ -152,463 +153,426 @@ class ThreadTrace; #define CL_FILTER_NONE 0x1142 #endif -class Sampler : public device::Sampler -{ -public: - //! Constructor - Sampler(const Device& dev): dev_(dev) {} +class Sampler : public device::Sampler { + public: + //! Constructor + Sampler(const Device& dev) : dev_(dev) {} - //! Default destructor for the device memory object - virtual ~Sampler(); + //! Default destructor for the device memory object + virtual ~Sampler(); - //! Creates a device sampler from the OCL sampler state - bool create( - uint32_t oclSamplerState //!< OCL sampler state - ); + //! Creates a device sampler from the OCL sampler state + bool create(uint32_t oclSamplerState //!< OCL sampler state + ); - //! Creates a device sampler from the OCL sampler state - bool create( - const amd::Sampler& owner //!< AMD sampler object - ); + //! Creates a device sampler from the OCL sampler state + bool create(const amd::Sampler& owner //!< AMD sampler object + ); - const void* hwState() const { return hwState_; } + const void* hwState() const { return hwState_; } -private: - //! 
Disable default copy constructor - Sampler& operator=(const Sampler&); + private: + //! Disable default copy constructor + Sampler& operator=(const Sampler&); - //! Disable operator= - Sampler(const Sampler&); + //! Disable operator= + Sampler(const Sampler&); - const Device& dev_; //!< Device object associated with the sampler - address hwState_; //!< GPU HW state (\todo legacy path) + const Device& dev_; //!< Device object associated with the sampler + address hwState_; //!< GPU HW state (\todo legacy path) }; //! A GPU device ordinal (physical GPU device) -class Device : public NullDevice, public CALGSLDevice -{ -public: - class Heap : public amd::EmbeddedObject - { - public: - //! The size of a heap element in bytes - static const size_t ElementSize = 4; +class Device : public NullDevice, public CALGSLDevice { + public: + class Heap : public amd::EmbeddedObject { + public: + //! The size of a heap element in bytes + static const size_t ElementSize = 4; - //! The type of a heap element in bytes - static const cmSurfFmt ElementType = CM_SURF_FMT_R32I; + //! The type of a heap element in bytes + static const cmSurfFmt ElementType = CM_SURF_FMT_R32I; - Heap(): resource_(NULL), baseAddress_(0) {} + Heap() : resource_(NULL), baseAddress_(0) {} - bool create( - Device& device //!< GPU device object - ); - - //! Gets the GPU resource associated with the global heap - const Memory& resource() const { return *resource_; } - - //! Returns the base virtual address of the heap - uint64_t baseAddress() const { return baseAddress_; } - - protected: - Memory* resource_; //!< GPU resource referencing the heap memory - uint64_t baseAddress_; //!< Virtual heap base address - }; - - //! Locks any access to the virtual GPUs - class ScopedLockVgpus : public amd::StackObject { - public: - //! Default constructor - ScopedLockVgpus(const Device& dev); - - //! Destructor - ~ScopedLockVgpus(); - - private: - const Device& dev_; //! Device object - }; - - //! 
Interop emulation flags - enum InteropEmulationFlags - { - D3D10Device = 0x00000001, - GLContext = 0x00000002, - }; - - class Engines : public amd::EmbeddedObject - { - public: - //! Default constructor - Engines() - : numComputeRings_(0) - , numComputeRingsRT_(0) - , numDmaEngines_(0) - { memset(desc_, 0xff, sizeof(desc_)); } - - //! Creates engine descriptor for this class - void create(uint num, gslEngineDescriptor* desc, uint maxNumComputeRings); - - //! Gets engine type mask - uint getMask(gslEngineID id) const { return (1 << id); } - - //! Gets a descriptor for the requested engines - uint getRequested(uint engines, gslEngineDescriptor* desc) const; - - //! Returns the number of available compute rings - uint numComputeRings() const { return numComputeRings_; } - - //! Returns the number of available real time compute rings - uint numComputeRingsRT() const { return numComputeRingsRT_; } - - //! Returns the number of available DMA engines - uint numDMAEngines() const { return numDmaEngines_; } - - private: - uint numComputeRings_; - uint numComputeRingsRT_; - uint numDmaEngines_; - gslEngineDescriptor desc_[GSL_ENGINEID_MAX]; //!< Engine descriptor - }; - - //! Transfer buffers - class XferBuffers : public amd::HeapObject - { - public: - static const size_t MaxXferBufListSize = 8; - - //! Default constructor - XferBuffers(const Device& device, Resource::MemoryType type, size_t bufSize) - : type_(type) - , bufSize_(bufSize) - , acquiredCnt_(0) - , gpuDevice_(device) - {} - - //! Default destructor - ~XferBuffers(); - - //! Creates the xfer buffers object - bool create(); - - //! Acquires an instance of the transfer buffers - Memory& acquire(); - - //! Releases transfer buffer - void release( - VirtualGPU& gpu, //!< Virual GPU object used with the buffer - Memory& buffer //!< Transfer buffer for release - ); - - //! Returns the buffer's size for transfer - size_t bufSize() const { return bufSize_; } - - private: - //! 
Disable copy constructor - XferBuffers(const XferBuffers&); - - //! Disable assignment operator - XferBuffers& operator=(const XferBuffers&); - - //! Get device object - const Device& dev() const { return gpuDevice_; } - - Resource::MemoryType type_; //!< The buffer's type - size_t bufSize_; //!< Staged buffer size - std::list freeBuffers_; //!< The list of free buffers - amd::Atomic acquiredCnt_; //!< The total number of acquired buffers - amd::Monitor lock_; //!< Stgaed buffer acquire/release lock - const Device& gpuDevice_; //!< GPU device object - }; - - struct ScratchBuffer : public amd::HeapObject - { - uint regNum_; //!< The number of used scratch registers - Memory* memObj_; //!< Memory objects for scratch buffers - uint64_t offset_; //!< Offset from the global scratch store - uint64_t size_; //!< Scratch buffer size on this queue - - //! Default constructor - ScratchBuffer(): regNum_(0), memObj_(NULL), offset_(0) {} - - //! Default constructor - ~ScratchBuffer(); - - //! Destroys memory objects - void destroyMemory(); - }; - - - class SrdManager : public amd::HeapObject { - public: - SrdManager(const Device& dev, uint srdSize, uint bufSize) - : dev_(dev) - , numFlags_(bufSize / (srdSize * MaskBits)) - , srdSize_(srdSize) - , bufSize_(bufSize) {} - ~SrdManager(); - - //! Allocates a new SRD slot for a resource - uint64_t allocSrdSlot(address* cpuAddr); - - //! Frees a SRD slot - void freeSrdSlot(uint64_t addr); - - // Fills the memory list for VidMM KMD - void fillResourceList(std::vector& memList); - - private: - //! Disable copy constructor - SrdManager(const SrdManager&); - - //! 
Disable assignment operator - SrdManager& operator=(const SrdManager&); - - struct Chunk { - Memory* buf_; - uint* flags_; - Chunk(): buf_(NULL), flags_(NULL) {} - }; - - static const uint MaskBits = 32; - const Device& dev_; //!< GPU device for the chunk manager - amd::Monitor ml_; //!< Global lock for the SRD manager - std::vector pool_; //!< Pool of SRD buffers - uint numFlags_; //!< Total number of flags in array - uint srdSize_; //!< SRD size - uint bufSize_; //!< Buffer size that holds SRDs - }; - - //! Initialise the whole GPU device subsystem (CAL init, device enumeration, etc). - static bool init(); - - //! Shutdown the whole GPU device subsystem (CAL shutdown). - static void tearDown(); - - //! Construct a new physical GPU device - Device(); - - //! Initialise a device (i.e. all parts of the constructor that could - //! potentially fail) - bool create( - CALuint ordinal, //!< GPU device ordinal index. Starts from 0 - CALuint numOfDevices //!< number of GPU devices in the system - ); - - //! Destructor for the physical GPU device - virtual ~Device(); - - //! Instantiate a new virtual device - device::VirtualDevice* createVirtualDevice( - amd::CommandQueue* queue = NULL - ); - - //! Memory allocation - virtual device::Memory* createMemory( - amd::Memory& owner //!< abstraction layer memory object - ) const; - - //! Sampler object allocation - virtual bool createSampler( - const amd::Sampler& owner, //!< abstraction layer sampler object - device::Sampler** sampler //!< device sampler object - ) const; - - //! Reallocates the provided buffer object - virtual bool reallocMemory( - amd::Memory& owner //!< Buffer for reallocation - ) const; - - //! Allocates a view object from the device memory - virtual device::Memory* createView( - amd::Memory& owner, //!< Owner memory object - const device::Memory& parent //!< Parent device memory object for the view - ) const; - - //! Create the device program. 
- virtual device::Program* createProgram(amd::option::Options* options = NULL); - - //! Attempt to bind with external graphics API's device/context - virtual bool bindExternalDevice( - uint flags, - void* const pDevice[], - void* pContext, - bool validateOnly); - - //! Attempt to unbind with external graphics API's device/context - virtual bool unbindExternalDevice( - uint flags, - void* const pDevice[], - void* pContext, - bool validateOnly); - - //! Validates kernel before execution - virtual bool validateKernel( - const amd::Kernel& kernel, //!< AMD kernel object - const device::VirtualDevice* vdev - ); - - //! Retrieves information about free memory on a GPU device - virtual bool globalFreeMemory(size_t* freeMemory) const; - - //! Returns a GPU memory object from AMD memory object - gpu::Memory* getGpuMemory( - amd::Memory* mem //!< Pointer to AMD memory object - ) const; + bool create(Device& device //!< GPU device object + ); //! Gets the GPU resource associated with the global heap - const Memory& globalMem() const { return heap_.resource(); } + const Memory& resource() const { return *resource_; } - //! Gets the device context object - amd::Context& context() const { return *context_; } + //! Returns the base virtual address of the heap + uint64_t baseAddress() const { return baseAddress_; } - //! Gets the global heap object - const Heap& heap() const { return heap_; } + protected: + Memory* resource_; //!< GPU resource referencing the heap memory + uint64_t baseAddress_; //!< Virtual heap base address + }; - //! Gets the memory object for the dummy page - amd::Memory* dummyPage() const { return dummyPage_; } + //! Locks any access to the virtual GPUs + class ScopedLockVgpus : public amd::StackObject { + public: + //! Default constructor + ScopedLockVgpus(const Device& dev); - amd::Monitor& lockAsyncOps() const { return *lockAsyncOps_; } + //! Destructor + ~ScopedLockVgpus(); - //! 
Returns the lock object for the virtual gpus list - amd::Monitor* vgpusAccess() const { return vgpusAccess_; } + private: + const Device& dev_; //! Device object + }; - //! Returns the number of virtual GPUs allocated on this device - uint numOfVgpus() const { return numOfVgpus_; } - uint numOfVgpus_; //!< The number of virtual GPUs (lock protected) + //! Interop emulation flags + enum InteropEmulationFlags { + D3D10Device = 0x00000001, + GLContext = 0x00000002, + }; - typedef std::vector VirtualGPUs; + class Engines : public amd::EmbeddedObject { + public: + //! Default constructor + Engines() : numComputeRings_(0), numComputeRingsRT_(0), numDmaEngines_(0) { + memset(desc_, 0xff, sizeof(desc_)); + } - //! Returns the list of all virtual GPUs running on this device - const VirtualGPUs& vgpus() const { return vgpus_; } - VirtualGPUs vgpus_; //!< The list of all running virtual gpus (lock protected) + //! Creates engine descriptor for this class + void create(uint num, gslEngineDescriptor* desc, uint maxNumComputeRings); - //! Scratch buffer allocation - gpu::Memory* createScratchBuffer( - size_t size //!< Size of buffer - ) const; + //! Gets engine type mask + uint getMask(gslEngineID id) const { return (1 << id); } - //! Returns transfer buffer object - XferBuffers& xferWrite() const { return *xferWrite_; } + //! Gets a descriptor for the requested engines + uint getRequested(uint engines, gslEngineDescriptor* desc) const; - //! Returns transfer buffer object - XferBuffers& xferRead() const { return *xferRead_; } + //! Returns the number of available compute rings + uint numComputeRings() const { return numComputeRings_; } - //! Finds an appropriate map target - amd::Memory* findMapTarget(size_t size) const; + //! Returns the number of available real time compute rings + uint numComputeRingsRT() const { return numComputeRingsRT_; } - //! Adds a map target to the cache - bool addMapTarget(amd::Memory* memory) const; + //! 
Returns the number of available DMA engines + uint numDMAEngines() const { return numDmaEngines_; } - //! Returns resource cache object - ResourceCache& resourceCache() const { return *resourceCache_; } + private: + uint numComputeRings_; + uint numComputeRingsRT_; + uint numDmaEngines_; + gslEngineDescriptor desc_[GSL_ENGINEID_MAX]; //!< Engine descriptor + }; - //! Returns engines object - const Engines& engines() const { return engines_; } + //! Transfer buffers + class XferBuffers : public amd::HeapObject { + public: + static const size_t MaxXferBufListSize = 8; - //! Returns engines object - const device::BlitManager& xferMgr() const; + //! Default constructor + XferBuffers(const Device& device, Resource::MemoryType type, size_t bufSize) + : type_(type), bufSize_(bufSize), acquiredCnt_(0), gpuDevice_(device) {} - VirtualGPU* xferQueue() const { return xferQueue_; } + //! Default destructor + ~XferBuffers(); - //! Retrieves the internal format from the OCL format - CalFormat getCalFormat( - const amd::Image::Format& format //! OCL image format - ) const; + //! Creates the xfer buffers object + bool create(); - //! Retrieves the OCL format from the internal image format - amd::Image::Format getOclFormat( - const CalFormat& format //! Internal image format - ) const; + //! Acquires an instance of the transfer buffers + Memory& acquire(); - const ScratchBuffer* scratch(uint idx) const { return scratch_[idx]; } + //! Releases transfer buffer + void release(VirtualGPU& gpu, //!< Virual GPU object used with the buffer + Memory& buffer //!< Transfer buffer for release + ); - //! Returns the global scratch buffer - Memory* globalScratchBuf() const { return globalScratchBuf_; }; + //! Returns the buffer's size for transfer + size_t bufSize() const { return bufSize_; } - //! Destroys scratch buffer memory - void destroyScratchBuffers(); - - //! Initialize heap resources if uninitialized - bool initializeHeapResources(); - - //! 
Set GSL sampler to the specified state - void fillHwSampler( - uint32_t state, //!< Sampler's OpenCL state - void* hwState, //!< Sampler's HW state - uint32_t hwStateSize, //!< Size of sampler's HW state - uint32_t mipFilter = CL_FILTER_NONE, //!< Mip filter - float minLod = 0.f, //!< Min level of detail - float maxLod = CL_MAXFLOAT //!< Max level of detail - ) const; - - //! host memory alloc - virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const; - - //! SVM allocation - virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags, void* svmPtr) const; - - //! Free host SVM memory - void hostFree(void* ptr, size_t size) const; - - //! SVM free - virtual void svmFree(void* ptr) const; - - //! Returns SRD manger object - SrdManager& srds() const { return *srdManager_; } - - //! Initial the Hardware Debug Manager - cl_int hwDebugManagerInit(amd::Context *context, uintptr_t messageStorage); - -private: + private: //! Disable copy constructor - Device(const Device&); + XferBuffers(const XferBuffers&); - //! Disable assignment - Device& operator=(const Device&); + //! Disable assignment operator + XferBuffers& operator=(const XferBuffers&); - //! Sends the stall command to all queues - bool stallQueues(); + //! Get device object + const Device& dev() const { return gpuDevice_; } - //! Buffer allocation - gpu::Memory* createBuffer( - amd::Memory& owner, //!< Abstraction layer memory object - bool directAccess //!< Use direct host memory access - ) const; + Resource::MemoryType type_; //!< The buffer's type + size_t bufSize_; //!< Staged buffer size + std::list freeBuffers_; //!< The list of free buffers + amd::Atomic acquiredCnt_; //!< The total number of acquired buffers + amd::Monitor lock_; //!< Stgaed buffer acquire/release lock + const Device& gpuDevice_; //!< GPU device object + }; - //! 
Image allocation - gpu::Memory* createImage( - amd::Memory& owner, //!< Abstraction layer memory object - bool directAccess //!< Use direct host memory access - ) const; + struct ScratchBuffer : public amd::HeapObject { + uint regNum_; //!< The number of used scratch registers + Memory* memObj_; //!< Memory objects for scratch buffers + uint64_t offset_; //!< Offset from the global scratch store + uint64_t size_; //!< Scratch buffer size on this queue - //! Allocates/reallocates the scratch buffer, according to the usage - bool allocScratch( - uint regNum, //!< Number of the scratch registers - const VirtualGPU* vgpu //!< Virtual GPU for the allocation - ); + //! Default constructor + ScratchBuffer() : regNum_(0), memObj_(NULL), offset_(0) {} - amd::Context* context_; //!< A dummy context for internal allocations - Heap heap_; //!< GPU global heap - amd::Memory* dummyPage_; //!< A dummy page for NULL pointer + //! Default constructor + ~ScratchBuffer(); - amd::Monitor* lockAsyncOps_; //!< Lock to serialise all async ops on this device - amd::Monitor* lockAsyncOpsForInitHeap_; //!< Lock to serialise all async ops on initialization heap operation - amd::Monitor* vgpusAccess_; //!< Lock to serialise virtual gpu list access - amd::Monitor* scratchAlloc_; //!< Lock to serialise scratch allocation - amd::Monitor* mapCacheOps_; //!< Lock to serialise cache for the map resources + //! 
Destroys memory objects + void destroyMemory(); + }; - XferBuffers* xferRead_; //!< Transfer buffers read - XferBuffers* xferWrite_; //!< Transfer buffers write - std::vector* mapCache_; //!< Map cache info structure - ResourceCache* resourceCache_; //!< Resource cache - Engines engines_; //!< Available engines on device - bool heapInitComplete_; //!< Keep track of initialization status of heap resources - VirtualGPU* xferQueue_; //!< Transfer queue - std::vector scratch_; //!< Scratch buffers for kernels - Memory* globalScratchBuf_; //!< Global scratch buffer - SrdManager* srdManager_; //!< SRD manager object + class SrdManager : public amd::HeapObject { + public: + SrdManager(const Device& dev, uint srdSize, uint bufSize) + : dev_(dev), + numFlags_(bufSize / (srdSize * MaskBits)), + srdSize_(srdSize), + bufSize_(bufSize) {} + ~SrdManager(); - static AppProfile appProfile_; //!< application profile + //! Allocates a new SRD slot for a resource + uint64_t allocSrdSlot(address* cpuAddr); + + //! Frees a SRD slot + void freeSrdSlot(uint64_t addr); + + // Fills the memory list for VidMM KMD + void fillResourceList(std::vector& memList); + + private: + //! Disable copy constructor + SrdManager(const SrdManager&); + + //! Disable assignment operator + SrdManager& operator=(const SrdManager&); + + struct Chunk { + Memory* buf_; + uint* flags_; + Chunk() : buf_(NULL), flags_(NULL) {} + }; + + static const uint MaskBits = 32; + const Device& dev_; //!< GPU device for the chunk manager + amd::Monitor ml_; //!< Global lock for the SRD manager + std::vector pool_; //!< Pool of SRD buffers + uint numFlags_; //!< Total number of flags in array + uint srdSize_; //!< SRD size + uint bufSize_; //!< Buffer size that holds SRDs + }; + + //! Initialise the whole GPU device subsystem (CAL init, device enumeration, etc). + static bool init(); + + //! Shutdown the whole GPU device subsystem (CAL shutdown). + static void tearDown(); + + //! 
Construct a new physical GPU device + Device(); + + //! Initialise a device (i.e. all parts of the constructor that could + //! potentially fail) + bool create(CALuint ordinal, //!< GPU device ordinal index. Starts from 0 + CALuint numOfDevices //!< number of GPU devices in the system + ); + + //! Destructor for the physical GPU device + virtual ~Device(); + + //! Instantiate a new virtual device + device::VirtualDevice* createVirtualDevice(amd::CommandQueue* queue = NULL); + + //! Memory allocation + virtual device::Memory* createMemory(amd::Memory& owner //!< abstraction layer memory object + ) const; + + //! Sampler object allocation + virtual bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object + device::Sampler** sampler //!< device sampler object + ) const; + + //! Reallocates the provided buffer object + virtual bool reallocMemory(amd::Memory& owner //!< Buffer for reallocation + ) const; + + //! Allocates a view object from the device memory + virtual device::Memory* createView( + amd::Memory& owner, //!< Owner memory object + const device::Memory& parent //!< Parent device memory object for the view + ) const; + + //! Create the device program. + virtual device::Program* createProgram(amd::option::Options* options = NULL); + + //! Attempt to bind with external graphics API's device/context + virtual bool bindExternalDevice(uint flags, void* const pDevice[], void* pContext, + bool validateOnly); + + //! Attempt to unbind with external graphics API's device/context + virtual bool unbindExternalDevice(uint flags, void* const pDevice[], void* pContext, + bool validateOnly); + + //! Validates kernel before execution + virtual bool validateKernel(const amd::Kernel& kernel, //!< AMD kernel object + const device::VirtualDevice* vdev); + + //! Retrieves information about free memory on a GPU device + virtual bool globalFreeMemory(size_t* freeMemory) const; + + //! 
Returns a GPU memory object from AMD memory object + gpu::Memory* getGpuMemory(amd::Memory* mem //!< Pointer to AMD memory object + ) const; + + //! Gets the GPU resource associated with the global heap + const Memory& globalMem() const { return heap_.resource(); } + + //! Gets the device context object + amd::Context& context() const { return *context_; } + + //! Gets the global heap object + const Heap& heap() const { return heap_; } + + //! Gets the memory object for the dummy page + amd::Memory* dummyPage() const { return dummyPage_; } + + amd::Monitor& lockAsyncOps() const { return *lockAsyncOps_; } + + //! Returns the lock object for the virtual gpus list + amd::Monitor* vgpusAccess() const { return vgpusAccess_; } + + //! Returns the number of virtual GPUs allocated on this device + uint numOfVgpus() const { return numOfVgpus_; } + uint numOfVgpus_; //!< The number of virtual GPUs (lock protected) + + typedef std::vector VirtualGPUs; + + //! Returns the list of all virtual GPUs running on this device + const VirtualGPUs& vgpus() const { return vgpus_; } + VirtualGPUs vgpus_; //!< The list of all running virtual gpus (lock protected) + + //! Scratch buffer allocation + gpu::Memory* createScratchBuffer(size_t size //!< Size of buffer + ) const; + + //! Returns transfer buffer object + XferBuffers& xferWrite() const { return *xferWrite_; } + + //! Returns transfer buffer object + XferBuffers& xferRead() const { return *xferRead_; } + + //! Finds an appropriate map target + amd::Memory* findMapTarget(size_t size) const; + + //! Adds a map target to the cache + bool addMapTarget(amd::Memory* memory) const; + + //! Returns resource cache object + ResourceCache& resourceCache() const { return *resourceCache_; } + + //! Returns engines object + const Engines& engines() const { return engines_; } + + //! Returns engines object + const device::BlitManager& xferMgr() const; + + VirtualGPU* xferQueue() const { return xferQueue_; } + + //! 
Retrieves the internal format from the OCL format + CalFormat getCalFormat(const amd::Image::Format& format //! OCL image format + ) const; + + //! Retrieves the OCL format from the internal image format + amd::Image::Format getOclFormat(const CalFormat& format //! Internal image format + ) const; + + const ScratchBuffer* scratch(uint idx) const { return scratch_[idx]; } + + //! Returns the global scratch buffer + Memory* globalScratchBuf() const { return globalScratchBuf_; }; + + //! Destroys scratch buffer memory + void destroyScratchBuffers(); + + //! Initialize heap resources if uninitialized + bool initializeHeapResources(); + + //! Set GSL sampler to the specified state + void fillHwSampler(uint32_t state, //!< Sampler's OpenCL state + void* hwState, //!< Sampler's HW state + uint32_t hwStateSize, //!< Size of sampler's HW state + uint32_t mipFilter = CL_FILTER_NONE, //!< Mip filter + float minLod = 0.f, //!< Min level of detail + float maxLod = CL_MAXFLOAT //!< Max level of detail + ) const; + + //! host memory alloc + virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const; + + //! SVM allocation + virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment, + cl_svm_mem_flags flags, void* svmPtr) const; + + //! Free host SVM memory + void hostFree(void* ptr, size_t size) const; + + //! SVM free + virtual void svmFree(void* ptr) const; + + //! Returns SRD manger object + SrdManager& srds() const { return *srdManager_; } + + //! Initial the Hardware Debug Manager + cl_int hwDebugManagerInit(amd::Context* context, uintptr_t messageStorage); + + private: + //! Disable copy constructor + Device(const Device&); + + //! Disable assignment + Device& operator=(const Device&); + + //! Sends the stall command to all queues + bool stallQueues(); + + //! 
Buffer allocation + gpu::Memory* createBuffer(amd::Memory& owner, //!< Abstraction layer memory object + bool directAccess //!< Use direct host memory access + ) const; + + //! Image allocation + gpu::Memory* createImage(amd::Memory& owner, //!< Abstraction layer memory object + bool directAccess //!< Use direct host memory access + ) const; + + //! Allocates/reallocates the scratch buffer, according to the usage + bool allocScratch(uint regNum, //!< Number of the scratch registers + const VirtualGPU* vgpu //!< Virtual GPU for the allocation + ); + + amd::Context* context_; //!< A dummy context for internal allocations + Heap heap_; //!< GPU global heap + amd::Memory* dummyPage_; //!< A dummy page for NULL pointer + + amd::Monitor* lockAsyncOps_; //!< Lock to serialise all async ops on this device + amd::Monitor* lockAsyncOpsForInitHeap_; //!< Lock to serialise all async ops on initialization + //!heap operation + amd::Monitor* vgpusAccess_; //!< Lock to serialise virtual gpu list access + amd::Monitor* scratchAlloc_; //!< Lock to serialise scratch allocation + amd::Monitor* mapCacheOps_; //!< Lock to serialise cache for the map resources + + XferBuffers* xferRead_; //!< Transfer buffers read + XferBuffers* xferWrite_; //!< Transfer buffers write + + std::vector* mapCache_; //!< Map cache info structure + ResourceCache* resourceCache_; //!< Resource cache + Engines engines_; //!< Available engines on device + bool heapInitComplete_; //!< Keep track of initialization status of heap resources + VirtualGPU* xferQueue_; //!< Transfer queue + std::vector scratch_; //!< Scratch buffers for kernels + Memory* globalScratchBuf_; //!< Global scratch buffer + SrdManager* srdManager_; //!< SRD manager object + + static AppProfile appProfile_; //!< application profile }; /*@}*/} // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpukernel.cpp b/rocclr/runtime/device/gpu/gpukernel.cpp index 2a149e67f7..1d3cb4c136 100644 --- a/rocclr/runtime/device/gpu/gpukernel.cpp +++ 
b/rocclr/runtime/device/gpu/gpukernel.cpp @@ -24,382 +24,390 @@ namespace gpu { -const MetaDataConst ArgState[ArgStateTotal] = -{ -// Note: the order is important +const MetaDataConst ArgState[ArgStateTotal] = { + // Note: the order is important // Name Type Properties -// Kernel description (special properties) -{ "memory:compilerwrite", KernelArg::PrivateFixed, { 0, 0, 0, 0, 0, 0, 0 } }, - { "uniqueid:", KernelArg::None, { 0, 0, 0, 0, 0, 0, 0 } }, - { "memory:private:", KernelArg::PrivateSize, { 0, 0, 0, 0, 0, 0, 0 } }, - { "memory:local:", KernelArg::LocalSize, { 0, 0, 0, 0, 0, 0, 0 } }, - { "memory:hwprivate:", KernelArg::HwPrivateSize, { 0, 0, 0, 0, 0, 0, 0 } }, - { "memory:uavprivate:", KernelArg::HwPrivateSize, { 0, 0, 0, 0, 0, 0, 0 } }, - { "memory:hwlocal:", KernelArg::HwLocalSize, { 0, 0, 0, 0, 0, 0, 0 } }, - { "memory:64bitABI", KernelArg::ABI64Bit, { 0, 0, 0, 0, 0, 0, 0 } }, - { "limitgroupsize", KernelArg::Wavefront, { 0, 0, 0, 0, 0, 0, 0 } }, - { "function:", KernelArg::None, { 1, 1, 0, 0, 0, 0, 0 } }, - { "intrinsic:", KernelArg::None, { 1, 0, 0, 0, 0, 0, 0 } }, - { "error:", KernelArg::ErrorMessage, { 0, 0, 0, 0, 0, 0, 0 } }, - { "warning:", KernelArg::WarningMessage, { 0, 0, 0, 0, 0, 0, 0 } }, - { "printf_fmt:", KernelArg::PrintfFormatStr, { 0, 0, 0, 0, 0, 0, 0 } }, - { "version:", KernelArg::MetadataVersion, { 0, 0, 0, 0, 0, 0, 0 } }, -// Kernel basic types - { "pointer:", KernelArg::PointerGlobal, { 1, 1, 1, 1, 1, 1, 0 } }, - { "value:", KernelArg::Value, { 1, 1, 1, 1, 1, 0, 0 } }, - { "image:", KernelArg::Image, { 1, 1, 1, 1, 1, 0, 0 } }, - { "sampler:", KernelArg::Sampler, { 0, 1, 0, 0, 0, 0, 0 } }, - { "counter:", KernelArg::Counter, { 1, 1, 0, 1, 1, 0, 0 } }, - { "cws:", KernelArg::Grouping, { 0, 0, 0, 0, 0, 0, 0 } }, - { "lws:", KernelArg::WrkgrpSize, { 0, 0, 0, 0, 0, 0, 0 } }, - { "uavid:", KernelArg::UavId, { 0, 0, 0, 0, 0, 0, 0 } }, - { "reflection:", KernelArg::Reflection, { 0, 0, 0, 0, 0, 0, 0 } }, - { "constarg:", KernelArg::ConstArg, 
{ 0, 0, 0, 0, 0, 0, 0 } }, - { "cbid:", KernelArg::ConstBufId, { 0, 0, 0, 0, 0, 0, 0 } }, - { "printfid:", KernelArg::PrintfBufId, { 0, 0, 0, 0, 0, 0, 0 } }, - { "wsh:", KernelArg::GroupingHint, { 0, 0, 0, 0, 0, 0, 0 } }, - { "vth:", KernelArg::VecTypeHint, { 0, 0, 0, 0, 0, 0, 0 } }, - { "WavesPerSimdHint:", KernelArg::WavesPerSimdHint,{ 0, 0, 0, 0, 0, 0, 0 } }, + // Kernel description (special properties) + {"memory:compilerwrite", KernelArg::PrivateFixed, {0, 0, 0, 0, 0, 0, 0}}, + {"uniqueid:", KernelArg::None, {0, 0, 0, 0, 0, 0, 0}}, + {"memory:private:", KernelArg::PrivateSize, {0, 0, 0, 0, 0, 0, 0}}, + {"memory:local:", KernelArg::LocalSize, {0, 0, 0, 0, 0, 0, 0}}, + {"memory:hwprivate:", KernelArg::HwPrivateSize, {0, 0, 0, 0, 0, 0, 0}}, + {"memory:uavprivate:", KernelArg::HwPrivateSize, {0, 0, 0, 0, 0, 0, 0}}, + {"memory:hwlocal:", KernelArg::HwLocalSize, {0, 0, 0, 0, 0, 0, 0}}, + {"memory:64bitABI", KernelArg::ABI64Bit, {0, 0, 0, 0, 0, 0, 0}}, + {"limitgroupsize", KernelArg::Wavefront, {0, 0, 0, 0, 0, 0, 0}}, + {"function:", KernelArg::None, {1, 1, 0, 0, 0, 0, 0}}, + {"intrinsic:", KernelArg::None, {1, 0, 0, 0, 0, 0, 0}}, + {"error:", KernelArg::ErrorMessage, {0, 0, 0, 0, 0, 0, 0}}, + {"warning:", KernelArg::WarningMessage, {0, 0, 0, 0, 0, 0, 0}}, + {"printf_fmt:", KernelArg::PrintfFormatStr, {0, 0, 0, 0, 0, 0, 0}}, + {"version:", KernelArg::MetadataVersion, {0, 0, 0, 0, 0, 0, 0}}, + // Kernel basic types + {"pointer:", KernelArg::PointerGlobal, {1, 1, 1, 1, 1, 1, 0}}, + {"value:", KernelArg::Value, {1, 1, 1, 1, 1, 0, 0}}, + {"image:", KernelArg::Image, {1, 1, 1, 1, 1, 0, 0}}, + {"sampler:", KernelArg::Sampler, {0, 1, 0, 0, 0, 0, 0}}, + {"counter:", KernelArg::Counter, {1, 1, 0, 1, 1, 0, 0}}, + {"cws:", KernelArg::Grouping, {0, 0, 0, 0, 0, 0, 0}}, + {"lws:", KernelArg::WrkgrpSize, {0, 0, 0, 0, 0, 0, 0}}, + {"uavid:", KernelArg::UavId, {0, 0, 0, 0, 0, 0, 0}}, + {"reflection:", KernelArg::Reflection, {0, 0, 0, 0, 0, 0, 0}}, + {"constarg:", KernelArg::ConstArg, 
{0, 0, 0, 0, 0, 0, 0}}, + {"cbid:", KernelArg::ConstBufId, {0, 0, 0, 0, 0, 0, 0}}, + {"printfid:", KernelArg::PrintfBufId, {0, 0, 0, 0, 0, 0, 0}}, + {"wsh:", KernelArg::GroupingHint, {0, 0, 0, 0, 0, 0, 0}}, + {"vth:", KernelArg::VecTypeHint, {0, 0, 0, 0, 0, 0, 0}}, + {"WavesPerSimdHint:", KernelArg::WavesPerSimdHint, {0, 0, 0, 0, 0, 0, 0}}, }; -const DataTypeConst DataType[] = -{ - { "i8:", KernelArg::Char, }, - { "i16:", KernelArg::Short, }, - { "i32:", KernelArg::Int, }, - { "i64:", KernelArg::Long, }, - { "u8:", KernelArg::UChar, }, - { "u16:", KernelArg::UShort, }, - { "u32:", KernelArg::UInt, }, - { "u64:", KernelArg::ULong, }, - { "float:", KernelArg::Float, }, - { "double:", KernelArg::Double, }, - { "struct:", KernelArg::Struct, }, - { "union:", KernelArg::Union, }, - { "1D:", KernelArg::Image1D, }, - { "2D:", KernelArg::Image2D, }, - { "3D:", KernelArg::Image3D, }, - { "1DB:", KernelArg::Image1DB, }, - { "1DA:", KernelArg::Image1DA, }, - { "2DA:", KernelArg::Image2DA, }, - { "opaque:", KernelArg::Opaque, }, - { "event:", KernelArg::Event, }, - { "sampler:", KernelArg::Sampler, }, - { "half:", KernelArg::Half, }, +const DataTypeConst DataType[] = { + { + "i8:", KernelArg::Char, + }, + { + "i16:", KernelArg::Short, + }, + { + "i32:", KernelArg::Int, + }, + { + "i64:", KernelArg::Long, + }, + { + "u8:", KernelArg::UChar, + }, + { + "u16:", KernelArg::UShort, + }, + { + "u32:", KernelArg::UInt, + }, + { + "u64:", KernelArg::ULong, + }, + { + "float:", KernelArg::Float, + }, + { + "double:", KernelArg::Double, + }, + { + "struct:", KernelArg::Struct, + }, + { + "union:", KernelArg::Union, + }, + { + "1D:", KernelArg::Image1D, + }, + { + "2D:", KernelArg::Image2D, + }, + { + "3D:", KernelArg::Image3D, + }, + { + "1DB:", KernelArg::Image1DB, + }, + { + "1DA:", KernelArg::Image1DA, + }, + { + "2DA:", KernelArg::Image2DA, + }, + { + "opaque:", KernelArg::Opaque, + }, + { + "event:", KernelArg::Event, + }, + { + "sampler:", KernelArg::Sampler, + }, + { + "half:", 
KernelArg::Half, + }, }; const uint DataTypeTotal = sizeof(DataType) / sizeof(DataTypeConst); -struct BufDataConst -{ - const char* tagName_; //!< buffer's name - KernelArg::ArgumentType type_; //!< type of argument - struct - { - uint number_ : 1; //!< buffer's number - uint alignment_ : 1; //!< buffer's alignment - uint attribute_ : 1; //!< buffer's read/write attribute - uint reserved : 29; //!< reserved - }; +struct BufDataConst { + const char* tagName_; //!< buffer's name + KernelArg::ArgumentType type_; //!< type of argument + struct { + uint number_ : 1; //!< buffer's number + uint alignment_ : 1; //!< buffer's alignment + uint attribute_ : 1; //!< buffer's read/write attribute + uint reserved : 29; //!< reserved + }; }; -static const BufDataConst BufType[] = -{ - { "g", KernelArg::PointerGlobal, { 1, 0, 0, 0 } }, - { "p", KernelArg::PointerPrivate, { 1, 1, 1, 0 } }, - { "l", KernelArg::PointerLocal, { 1, 1, 1, 0 } }, - { "uav", KernelArg::PointerGlobal, { 1, 1, 1, 0 } }, - { "c", KernelArg::PointerConst, { 1, 1, 1, 0 } }, - { "hl", KernelArg::PointerHwLocal, { 1, 1, 1, 0 } }, - { "hp", KernelArg::PointerHwPrivate,{ 1, 1, 1, 0 } }, - { "hc", KernelArg::PointerHwConst, { 1, 1, 1, 0 } } -}; +static const BufDataConst BufType[] = {{"g", KernelArg::PointerGlobal, {1, 0, 0, 0}}, + {"p", KernelArg::PointerPrivate, {1, 1, 1, 0}}, + {"l", KernelArg::PointerLocal, {1, 1, 1, 0}}, + {"uav", KernelArg::PointerGlobal, {1, 1, 1, 0}}, + {"c", KernelArg::PointerConst, {1, 1, 1, 0}}, + {"hl", KernelArg::PointerHwLocal, {1, 1, 1, 0}}, + {"hp", KernelArg::PointerHwPrivate, {1, 1, 1, 0}}, + {"hc", KernelArg::PointerHwConst, {1, 1, 1, 0}}}; static const uint BufTypeTotal = sizeof(BufType) / sizeof(BufDataConst); //! 
The mathlib constants for each kernel execution -static const float MathLibConst[4] = { 0.0f, 0.5f, 1.0f, 2.0f }; +static const float MathLibConst[4] = {0.0f, 0.5f, 1.0f, 2.0f}; -bool -expect(const std::string& str, size_t* pos, const std::string& sym) -{ - bool result = true; - uint i; +bool expect(const std::string& str, size_t* pos, const std::string& sym) { + bool result = true; + uint i; - if (*pos == std::string::npos) { - return false; + if (*pos == std::string::npos) { + return false; + } + + // Check if we have expected symbols + for (i = 0; i < sym.size(); ++i) { + char deb = str[*pos + i]; + if (deb != sym[i]) { + result = false; + break; } + } - // Check if we have expected symbols - for (i = 0; i < sym.size(); ++i) { - char deb = str[*pos + i]; - if (deb != sym[i]) { - result = false; - break; - } - } + if (result) *pos += i; - if (result) *pos += i; - - return result; + return result; } -bool -getword(const std::string& str, size_t* pos, std::string& sym) -{ - if (*pos == std::string::npos) { - return false; - } +bool getword(const std::string& str, size_t* pos, std::string& sym) { + if (*pos == std::string::npos) { + return false; + } - *pos = str.find_first_not_of(" \n\r", *pos); - size_t posEnd = str.find_first_of(": \n\r;", *pos); - size_t count = posEnd - *pos; + *pos = str.find_first_not_of(" \n\r", *pos); + size_t posEnd = str.find_first_of(": \n\r;", *pos); + size_t count = posEnd - *pos; - if (count != 0) { - sym = str.substr(*pos, count); - } - sym[count] = 0; - *pos = posEnd + 1; - return true; + if (count != 0) { + sym = str.substr(*pos, count); + } + sym[count] = 0; + *pos = posEnd + 1; + return true; } -bool -getstring(const std::string& str, size_t* pos, std::string* out) -{ - if (*pos == std::string::npos) { - return false; - } +bool getstring(const std::string& str, size_t* pos, std::string* out) { + if (*pos == std::string::npos) { + return false; + } - *pos = str.find_first_not_of(" \n\r", *pos); - size_t posEnd = 
str.find_first_of(":\n\r;", *pos); - size_t count = posEnd - *pos; + *pos = str.find_first_not_of(" \n\r", *pos); + size_t posEnd = str.find_first_of(":\n\r;", *pos); + size_t count = posEnd - *pos; - char* sym = new char[count + 1]; - if (count != 0) { - if (!str.copy(sym, count, *pos)) { - return false; - } + char* sym = new char[count + 1]; + if (count != 0) { + if (!str.copy(sym, count, *pos)) { + return false; } - sym[count] = 0; - *out = sym; - delete [] sym; - *pos = posEnd + 1; - return true; + } + sym[count] = 0; + *out = sym; + delete[] sym; + *pos = posEnd + 1; + return true; } -bool -getuint(const std::string& str, size_t* pos, uint* val) -{ - if (*pos == std::string::npos) { - return false; - } +bool getuint(const std::string& str, size_t* pos, uint* val) { + if (*pos == std::string::npos) { + return false; + } - char sym[16]; - *pos = str.find_first_not_of(" \n\r", *pos); - size_t posEnd = str.find_first_of(": \n\r;)", *pos); + char sym[16]; + *pos = str.find_first_not_of(" \n\r", *pos); + size_t posEnd = str.find_first_of(": \n\r;)", *pos); - if (!str.copy(sym, posEnd - *pos, *pos)) { - return false; - } - *val = 0; - for (size_t i = 0; i < (posEnd - *pos); ++i) { - *val = (*val * 10) + (sym[i] - 0x30); - } - *pos = posEnd + 1; - return true; + if (!str.copy(sym, posEnd - *pos, *pos)) { + return false; + } + *val = 0; + for (size_t i = 0; i < (posEnd - *pos); ++i) { + *val = (*val * 10) + (sym[i] - 0x30); + } + *pos = posEnd + 1; + return true; } -bool -getuintHex(const std::string& str, size_t* pos, uint* val) -{ - if (*pos == std::string::npos) { - return false; - } +bool getuintHex(const std::string& str, size_t* pos, uint* val) { + if (*pos == std::string::npos) { + return false; + } - char sym[16]; - *pos = str.find_first_not_of(" \n\r", *pos); - size_t posEnd = str.find_first_of(": \n\r;)", *pos); + char sym[16]; + *pos = str.find_first_not_of(" \n\r", *pos); + size_t posEnd = str.find_first_of(": \n\r;)", *pos); - if (!str.copy(sym, posEnd - 
*pos, *pos)) { - return false; + if (!str.copy(sym, posEnd - *pos, *pos)) { + return false; + } + *val = 0; + for (size_t i = 0; i < (posEnd - *pos); ++i) { + if (sym[i] >= '0' && sym[i] <= 'F') { + *val = (*val * 16) + (sym[i] - '0'); + } else if (sym[i] >= 'a' && sym[i] <= 'f') { + *val = (*val * 16) + (sym[i] - 'a' + 10); + } else { + return false; } - *val = 0; - for (size_t i = 0; i < (posEnd - *pos); ++i) { - if (sym[i] >= '0' && sym[i] <= 'F') { - *val = (*val * 16) + (sym[i] - '0'); - } - else if (sym[i] >= 'a' && sym[i] <= 'f') { - *val = (*val * 16) + (sym[i] - 'a' + 10); - } - else { - return false; - } - } - *pos = posEnd + 1; - return true; + } + *pos = posEnd + 1; + return true; } -bool -getuint64Hex(const std::string& str, size_t* pos, uint64_t* val) -{ - if (*pos == std::string::npos) { - return false; - } +bool getuint64Hex(const std::string& str, size_t* pos, uint64_t* val) { + if (*pos == std::string::npos) { + return false; + } - char sym[16]; - *pos = str.find_first_not_of(" \n\r", *pos); - size_t posEnd = str.find_first_of(": \n\r;)", *pos); + char sym[16]; + *pos = str.find_first_not_of(" \n\r", *pos); + size_t posEnd = str.find_first_of(": \n\r;)", *pos); - if (!str.copy(sym, posEnd - *pos, *pos)) { - return false; + if (!str.copy(sym, posEnd - *pos, *pos)) { + return false; + } + *val = 0; + for (size_t i = 0; i < (posEnd - *pos); ++i) { + if (sym[i] >= '0' && sym[i] <= 'F') { + *val = (*val * 16) + (sym[i] - '0'); + } else if (sym[i] >= 'a' && sym[i] <= 'f') { + *val = (*val * 16) + (sym[i] - 'a' + 10); + } else { + return false; } - *val = 0; - for (size_t i = 0; i < (posEnd - *pos); ++i) { - if (sym[i] >= '0' && sym[i] <= 'F') { - *val = (*val * 16) + (sym[i] - '0'); - } - else if (sym[i] >= 'a' && sym[i] <= 'f') { - *val = (*val * 16) + (sym[i] - 'a' + 10); - } - else { - return false; - } - } - *pos = posEnd + 1; - return true; + } + *pos = posEnd + 1; + return true; } -void -intToStr(size_t value, char* str, size_t size) -{ - static 
const uint MaxDigits32bit = 10; - char result[MaxDigits32bit]; - uint idx = MaxDigits32bit; +void intToStr(size_t value, char* str, size_t size) { + static const uint MaxDigits32bit = 10; + char result[MaxDigits32bit]; + uint idx = MaxDigits32bit; - do { - idx--; - result[idx] = static_cast((value % 10) + '0'); - value /= 10; - } while ((value != 0) && (idx > 0)); - size_t len = MaxDigits32bit - idx; - size_t n = std::min(len, size-1); - memcpy(str, &result[idx], n); - str[n] = '\0'; + do { + idx--; + result[idx] = static_cast((value % 10) + '0'); + value /= 10; + } while ((value != 0) && (idx > 0)); + size_t len = MaxDigits32bit - idx; + size_t n = std::min(len, size - 1); + memcpy(str, &result[idx], n); + str[n] = '\0'; } //! Default destructor -CalImageReference::~CalImageReference() -{ - // Free CAL image - free(image_); +CalImageReference::~CalImageReference() { + // Free CAL image + free(image_); } KernelArg::KernelArg() - : type_(KernelArg::None) - , size_(0) - , cbIdx_(0) - , cbPos_(0) - , index_(0) - , alignment_(1) - , dataType_(KernelArg::None) -{ - name_ = ""; - buf_ = ""; - memory_.value_ = 0; - typeQualifier_ = CL_KERNEL_ARG_TYPE_NONE; + : type_(KernelArg::None), + size_(0), + cbIdx_(0), + cbPos_(0), + index_(0), + alignment_(1), + dataType_(KernelArg::None) { + name_ = ""; + buf_ = ""; + memory_.value_ = 0; + typeQualifier_ = CL_KERNEL_ARG_TYPE_NONE; } -KernelArg::KernelArg(const KernelArg& data) -{ - // Fill the new object - *this = data; +KernelArg::KernelArg(const KernelArg& data) { + // Fill the new object + *this = data; } -KernelArg& -KernelArg::operator=(const KernelArg& data) -{ - // Fill the fields of the current object - name_ = data.name_; - typeName_ = data.typeName_; - typeQualifier_ = data.typeQualifier_; - type_ = data.type_; - size_ = data.size_; - cbIdx_ = data.cbIdx_; - cbPos_ = data.cbPos_; - buf_ = data.buf_; - index_ = data.index_; - alignment_ = data.alignment_; - dataType_ = data.dataType_; - memory_.value_ = 
data.memory_.value_; - return *this; +KernelArg& KernelArg::operator=(const KernelArg& data) { + // Fill the fields of the current object + name_ = data.name_; + typeName_ = data.typeName_; + typeQualifier_ = data.typeQualifier_; + type_ = data.type_; + size_ = data.size_; + cbIdx_ = data.cbIdx_; + cbPos_ = data.cbPos_; + buf_ = data.buf_; + index_ = data.index_; + alignment_ = data.alignment_; + dataType_ = data.dataType_; + memory_.value_ = data.memory_.value_; + return *this; } -bool -KernelArg::isCbNeeded() const -{ - //! \note not a safe way - bool result = ((type_ > None) && (type_ < Sampler)) ? true : false; - if ((type_ == Sampler) && (location_ == 0)) { - // Sampler is defined outside the kernel - result = true; - } - return result; +bool KernelArg::isCbNeeded() const { + //! \note not a safe way + bool result = ((type_ > None) && (type_ < Sampler)) ? true : false; + if ((type_ == Sampler) && (location_ == 0)) { + // Sampler is defined outside the kernel + result = true; + } + return result; } -size_t -KernelArg::size(bool gpuLayer)const -{ - switch (type_) { +size_t KernelArg::size(bool gpuLayer) const { + switch (type_) { case None: - return 0; + return 0; case PointerConst: case PointerHwConst: case PointerGlobal: - return (gpuLayer) ? sizeof(uint32_t) * size_ : sizeof(cl_mem); + return (gpuLayer) ? sizeof(uint32_t) * size_ : sizeof(cl_mem); case Image1D: case Image2D: case Image3D: case Image1DB: case Image1DA: case Image2DA: - return (gpuLayer) ? sizeof(ImageConstants) : sizeof(cl_mem); + return (gpuLayer) ? sizeof(ImageConstants) : sizeof(cl_mem); case Sampler: - return (gpuLayer) ? 2 * sizeof(uint32_t) : sizeof(cl_sampler); + return (gpuLayer) ? 2 * sizeof(uint32_t) : sizeof(cl_sampler); case Counter: - return (gpuLayer) ? 0 : sizeof(cl_mem); + return (gpuLayer) ? 0 : sizeof(cl_mem); case PointerLocal: case PointerHwLocal: - return (gpuLayer) ? sizeof(uint32_t) * size_ : 0; + return (gpuLayer) ? 
sizeof(uint32_t) * size_ : 0; case PointerPrivate: case PointerHwPrivate: - return (gpuLayer) ? sizeof(uint32_t) * size_ : 0; + return (gpuLayer) ? sizeof(uint32_t) * size_ : 0; case Float: - return sizeof(cl_float) * amd::nextPowerOfTwo(size_); + return sizeof(cl_float) * amd::nextPowerOfTwo(size_); case Double: - return sizeof(cl_double) * amd::nextPowerOfTwo(size_); + return sizeof(cl_double) * amd::nextPowerOfTwo(size_); case Char: case UChar: - return sizeof(cl_char) * amd::nextPowerOfTwo(size_); + return sizeof(cl_char) * amd::nextPowerOfTwo(size_); case Short: case UShort: - return sizeof(cl_short) * amd::nextPowerOfTwo(size_); + return sizeof(cl_short) * amd::nextPowerOfTwo(size_); case Int: case UInt: - return sizeof(cl_uint) * amd::nextPowerOfTwo(size_); + return sizeof(cl_uint) * amd::nextPowerOfTwo(size_); case Long: case ULong: - return sizeof(cl_ulong) * amd::nextPowerOfTwo(size_); + return sizeof(cl_ulong) * amd::nextPowerOfTwo(size_); case Struct: case Union: - return (gpuLayer) ? amd::alignUp(size_, 16) : size_; + return (gpuLayer) ? 
amd::alignUp(size_, 16) : size_; default: - return 0; - } + return 0; + } } -cl_kernel_arg_address_qualifier -KernelArg::addressQualifier() const -{ - switch (type_) { +cl_kernel_arg_address_qualifier KernelArg::addressQualifier() const { + switch (type_) { case PointerGlobal: case Image1D: case Image2D: @@ -407,65 +415,57 @@ KernelArg::addressQualifier() const case Image1DB: case Image1DA: case Image2DA: - return CL_KERNEL_ARG_ADDRESS_GLOBAL; + return CL_KERNEL_ARG_ADDRESS_GLOBAL; case PointerLocal: case PointerHwLocal: - return CL_KERNEL_ARG_ADDRESS_LOCAL; + return CL_KERNEL_ARG_ADDRESS_LOCAL; case PointerConst: case PointerHwConst: - return CL_KERNEL_ARG_ADDRESS_CONSTANT; + return CL_KERNEL_ARG_ADDRESS_CONSTANT; default: - return CL_KERNEL_ARG_ADDRESS_PRIVATE; - } + return CL_KERNEL_ARG_ADDRESS_PRIVATE; + } } -cl_kernel_arg_access_qualifier -KernelArg::accessQualifier() const -{ - switch (type_) { +cl_kernel_arg_access_qualifier KernelArg::accessQualifier() const { + switch (type_) { case Image1D: case Image2D: case Image3D: case Image1DB: case Image1DA: case Image2DA: - if (memory_.readOnly_) { - return CL_KERNEL_ARG_ACCESS_READ_ONLY; - } - else if (memory_.writeOnly_) { - return CL_KERNEL_ARG_ACCESS_WRITE_ONLY; - } - else if (memory_.readWrite_) { - return CL_KERNEL_ARG_ACCESS_READ_WRITE; - } - // Fall through ... + if (memory_.readOnly_) { + return CL_KERNEL_ARG_ACCESS_READ_ONLY; + } else if (memory_.writeOnly_) { + return CL_KERNEL_ARG_ACCESS_WRITE_ONLY; + } else if (memory_.readWrite_) { + return CL_KERNEL_ARG_ACCESS_READ_WRITE; + } + // Fall through ... default: - return CL_KERNEL_ARG_ACCESS_NONE; - } + return CL_KERNEL_ARG_ACCESS_NONE; + } } //! 
temporary solution for the vectors handling in compiler -size_t -KernelArg::specialVector() const -{ - if (size_ > VectorSizeLimit) { - switch (type_) { - case Char: - case UChar: - return sizeof(cl_char); - case Short: - case UShort: - return sizeof(cl_short); - default: - return 0; - } - } - return 0; -} -clk_value_type_t -KernelArg::type()const -{ +size_t KernelArg::specialVector() const { + if (size_ > VectorSizeLimit) { switch (type_) { + case Char: + case UChar: + return sizeof(cl_char); + case Short: + case UShort: + return sizeof(cl_short); + default: + return 0; + } + } + return 0; +} +clk_value_type_t KernelArg::type() const { + switch (type_) { case PointerGlobal: case PointerLocal: case PointerHwLocal: @@ -478,1480 +478,1323 @@ KernelArg::type()const case Image1DA: case Image2DA: case Counter: - return T_POINTER; + return T_POINTER; case Float: - return T_FLOAT; + return T_FLOAT; case Double: - return T_DOUBLE; + return T_DOUBLE; case Char: case UChar: - return T_CHAR; + return T_CHAR; case Short: case UShort: - return T_SHORT; + return T_SHORT; case Int: - return T_INT; + return T_INT; case UInt: - //! \note No UINT type - return T_INT; + //! \note No UINT type + return T_INT; case Long: - return T_LONG; + return T_LONG; case ULong: - //! \note No ULONG type - return T_LONG; + //! \note No ULONG type + return T_LONG; case Struct: case Union: - //! @todo What should we report? - return T_CHAR; + //! @todo What should we report? 
+ return T_CHAR; case Sampler: - return T_SAMPLER; + return T_SAMPLER; case PointerPrivate: case PointerHwPrivate: case None: default: - return T_VOID; - } + return T_VOID; + } } -NullKernel::NullKernel( - const std::string& name, - const NullDevice& gpuNullDev, - const NullProgram& nullprog) - : device::Kernel(name) - , buildError_(CL_BUILD_PROGRAM_FAILURE) - , gpuDev_(gpuNullDev) - , prog_(nullprog) - , calRef_(NULL) - , internal_(false) - , flags_(0) - , cbSizes_(NULL) - , numCb_(0) - , rwAttributes_(false) - , instructionCnt_(4) -{ - // UAV raw index will be detected - uavRaw_ = UavIdUndefined; - // CB index will be detected - cbId_ = UavIdUndefined; - // Printf index will be detected - printfId_ = UavIdUndefined; +NullKernel::NullKernel(const std::string& name, const NullDevice& gpuNullDev, + const NullProgram& nullprog) + : device::Kernel(name), + buildError_(CL_BUILD_PROGRAM_FAILURE), + gpuDev_(gpuNullDev), + prog_(nullprog), + calRef_(NULL), + internal_(false), + flags_(0), + cbSizes_(NULL), + numCb_(0), + rwAttributes_(false), + instructionCnt_(4) { + // UAV raw index will be detected + uavRaw_ = UavIdUndefined; + // CB index will be detected + cbId_ = UavIdUndefined; + // Printf index will be detected + printfId_ = UavIdUndefined; } -NullKernel::~NullKernel() -{ - uint idx; +NullKernel::~NullKernel() { + uint idx; - if (calRef_ == NULL) { - return; - } - calRef_->release(); + if (calRef_ == NULL) { + return; + } + calRef_->release(); - // Destroy all kernel arguments - for (idx = 0; idx < arguments_.size(); ++idx) { - delete arguments_[idx]; - } - arguments_.clear(); + // Destroy all kernel arguments + for (idx = 0; idx < arguments_.size(); ++idx) { + delete arguments_[idx]; + } + arguments_.clear(); - // Destroy all sampler kernel arguments - for (idx = 0; idx < intSamplers_.size(); ++idx) { - delete intSamplers_[idx]; - } - intSamplers_.clear(); + // Destroy all sampler kernel arguments + for (idx = 0; idx < intSamplers_.size(); ++idx) { + delete 
intSamplers_[idx]; + } + intSamplers_.clear(); } -static int -scComponentToArrayIndex(E_SC_COMPONENT dstComp) -{ - switch (dstComp) { - case SC_COMPONENT_X: - return 0; - case SC_COMPONENT_Y: - return 1; - case SC_COMPONENT_Z: - return 2; - case SC_COMPONENT_W: - return 3; - } +static int scComponentToArrayIndex(E_SC_COMPONENT dstComp) { + switch (dstComp) { + case SC_COMPONENT_X: + return 0; + case SC_COMPONENT_Y: + return 1; + case SC_COMPONENT_Z: + return 2; + case SC_COMPONENT_W: + return 3; + } - return 0; + return 0; } -static void -addLoopConst(const SC_HWSHADER* shader, AMUabiAddEncoding& encoding) -{ - uint count = shader->dep.NumIntrlIConstants; - encoding.litConstsCount = shader->dep.NumIntrlIConstants; +static void addLoopConst(const SC_HWSHADER* shader, AMUabiAddEncoding& encoding) { + uint count = shader->dep.NumIntrlIConstants; + encoding.litConstsCount = shader->dep.NumIntrlIConstants; - // only suppport loop consts (int consts) - if (count) { - AMUabiLiteralConst* allocatedconsts = encoding.litConsts; - memset(allocatedconsts, 0, count * sizeof(AMUabiLiteralConst)); - uint usedConsts = 0; - for (uint i = 0; i < count; ++i) { - uint currentConst; - for (currentConst = 0; currentConst < usedConsts; ++currentConst) { - if (allocatedconsts[currentConst].addr == - HWSHADER_Get(shader, dep.IntrlIConstants)[i].uDstNumber) { - break; - } - } - if (currentConst == usedConsts) { - usedConsts++; - assert(usedConsts <= count); - } - allocatedconsts[currentConst].addr = HWSHADER_Get(shader, dep.IntrlIConstants)[i].uDstNumber; - allocatedconsts[currentConst].type = AMU_ABI_INT32; - allocatedconsts[currentConst].value. 
- int32[scComponentToArrayIndex(HWSHADER_Get(shader, dep.IntrlIConstants)[i].eDstComp)] = - HWSHADER_Get(shader, dep.IntrlIConstants)[i].iValue; + // only suppport loop consts (int consts) + if (count) { + AMUabiLiteralConst* allocatedconsts = encoding.litConsts; + memset(allocatedconsts, 0, count * sizeof(AMUabiLiteralConst)); + uint usedConsts = 0; + for (uint i = 0; i < count; ++i) { + uint currentConst; + for (currentConst = 0; currentConst < usedConsts; ++currentConst) { + if (allocatedconsts[currentConst].addr == + HWSHADER_Get(shader, dep.IntrlIConstants)[i].uDstNumber) { + break; } - encoding.litConstsCount = usedConsts; + } + if (currentConst == usedConsts) { + usedConsts++; + assert(usedConsts <= count); + } + allocatedconsts[currentConst].addr = HWSHADER_Get(shader, dep.IntrlIConstants)[i].uDstNumber; + allocatedconsts[currentConst].type = AMU_ABI_INT32; + allocatedconsts[currentConst].value.int32[scComponentToArrayIndex( + HWSHADER_Get(shader, dep.IntrlIConstants)[i].eDstComp)] = + HWSHADER_Get(shader, dep.IntrlIConstants)[i].iValue; } + encoding.litConstsCount = usedConsts; + } } -bool -NullKernel::create( - const std::string& code, - const std::string& metadata, - const void* binaryCode, - size_t binarySize) -{ - std::auto_ptr uavRefCount (new uint[MaxUavArguments]); - if (NULL == uavRefCount.get()) { - return false; - } +bool NullKernel::create(const std::string& code, const std::string& metadata, + const void* binaryCode, size_t binarySize) { + std::auto_ptr uavRefCount(new uint[MaxUavArguments]); + if (NULL == uavRefCount.get()) { + return false; + } - // Set all ref counts to 0 - memset(uavRefCount.get(), 0, sizeof(uavRefCount.get()[0]) * MaxUavArguments); + // Set all ref counts to 0 + memset(uavRefCount.get(), 0, sizeof(uavRefCount.get()[0]) * MaxUavArguments); - // parse the metadata fields - if (!parseArguments(metadata, uavRefCount.get())) { - return false; - } + // parse the metadata fields + if (!parseArguments(metadata, uavRefCount.get())) 
{ + return false; + } - CALimage calImage; - // Save source if DEBUG build + CALimage calImage; +// Save source if DEBUG build #if DEBUG - ilSource_ = code; -#endif // DEBUG + ilSource_ = code; +#endif // DEBUG - amd::option::Options *options = nullProg().getCompilerOptions(); - internal_ = options->oVariables->clInternalKernel; + amd::option::Options* options = nullProg().getCompilerOptions(); + internal_ = options->oVariables->clInternalKernel; - if ((binaryCode == NULL) && (binarySize == 0) && !code.empty()) { - acl_error err; - std::string arch = GPU_TARGET_INFO_ARCH; - if (nullDev().settings().use64BitPtr_) { - arch += "64"; - } - aclTargetInfo info = aclGetTargetInfo( - arch.c_str(), nullDev().hwInfo()->targetName_, &err); - if (err != ACL_SUCCESS) { - LogWarning("aclGetTargetInfo failed"); - return false; - } - - aclBinaryOptions binOpts = {0}; - binOpts.struct_size = sizeof(binOpts); - binOpts.elfclass = info.arch_id == aclAMDIL64 ? ELFCLASS64 : ELFCLASS32; - binOpts.bitness = ELFDATA2LSB; - binOpts.alloc = &::malloc; - binOpts.dealloc = &::free; - - aclBinary* bin = aclBinaryInit(sizeof(aclBinary), &info, &binOpts, &err); - if (err != ACL_SUCCESS) { - LogWarning("aclBinaryInit failed"); - return false; - } - - if (ACL_SUCCESS != aclInsertSection(nullDev().compiler(), bin, - code.data(), code.size(), aclSOURCE)) { - LogWarning("aclInsertSection failed"); - aclBinaryFini(bin); - return false; - } - - amd::option::Options* Opts = (amd::option::Options*)bin->options; - - // Append an option so that we can selectively enable a SCOption on CZ - // whenever IOMMUv2 is enabled. 
- if (nullDev().settings().svmFineGrainSystem_) { - options->origOptionStr.append(" -sc-xnack-iommu"); - } - // temporary solution to synchronize buildNo between runtime and complib - // until we move runtime inside complib - Opts->setBuildNo(options->getBuildNo()); - - // pass kernel name to compiler - Opts->setCurrKernelName(name().c_str()); - - err = aclCompile(nullDev().compiler(), bin, options->origOptionStr.c_str(), - ACL_TYPE_AMDIL_TEXT, ACL_TYPE_ISA, NULL); - - buildLog_ += aclGetCompilerLog(nullDev().compiler()); - - if (err != ACL_SUCCESS) { - LogWarning("aclCompile failed"); - aclBinaryFini(bin); - return false; - } - if (!options->oVariables->BinEXE) { - // Early exit if binary doesn't contain EXE - aclBinaryFini(bin); - return true; - } - size_t len; - const void* isa = aclExtractSection(nullDev().compiler(), bin, - &len, aclTEXT, &err); - if (err != ACL_SUCCESS) { - LogWarning("aclExtractSection failed"); - aclBinaryFini(bin); - return false; - } - - uint calImageSize; - if (!createMultiBinary( - &calImageSize, reinterpret_cast(&calImage), isa)) { - LogWarning("initSrcEncoding failed"); - aclBinaryFini(bin); - return false; - } - - aclBinaryFini(bin); + if ((binaryCode == NULL) && (binarySize == 0) && !code.empty()) { + acl_error err; + std::string arch = GPU_TARGET_INFO_ARCH; + if (nullDev().settings().use64BitPtr_) { + arch += "64"; } - else if ((binaryCode != NULL) && (binarySize != 0)) { - uint size = 0; - if (!amuABIMultiBinaryGetSize(&size, const_cast(binaryCode)) - || size > binarySize) { - buildLog_ += "Invalid binary image"; - LogError("amuABIMultiBinaryGetSize failed!"); - return false; - } - - calImage = static_cast(malloc(size)); - ::memcpy(calImage, binaryCode, size); - } - else { - LogError("Incorrect initialization parameters!"); - return false; + aclTargetInfo info = aclGetTargetInfo(arch.c_str(), nullDev().hwInfo()->targetName_, &err); + if (err != ACL_SUCCESS) { + LogWarning("aclGetTargetInfo failed"); + return false; } - calRef_ = 
new CalImageReference(calImage); - if (calRef_ == NULL) { - LogError("Memory allocation failure!"); - // Free CAL image - free(calImage); - return false; + aclBinaryOptions binOpts = {0}; + binOpts.struct_size = sizeof(binOpts); + binOpts.elfclass = info.arch_id == aclAMDIL64 ? ELFCLASS64 : ELFCLASS32; + binOpts.bitness = ELFDATA2LSB; + binOpts.alloc = &::malloc; + binOpts.dealloc = &::free; + + aclBinary* bin = aclBinaryInit(sizeof(aclBinary), &info, &binOpts, &err); + if (err != ACL_SUCCESS) { + LogWarning("aclBinaryInit failed"); + return false; } - CALfuncInfo calFuncInfo; - - // Get kernel compiled information - getFuncInfoFromImage(calImage, &calFuncInfo); - if (calFuncInfo.maxScratchRegsNeeded > 0) { - LogPrintfInfo("%s kernel has register spilling." - "Lower performance is expected.", name().c_str()); + if (ACL_SUCCESS != + aclInsertSection(nullDev().compiler(), bin, code.data(), code.size(), aclSOURCE)) { + LogWarning("aclInsertSection failed"); + aclBinaryFini(bin); + return false; } - workGroupInfo_.scratchRegs_ = calFuncInfo.maxScratchRegsNeeded; - workGroupInfo_.wavefrontPerSIMD_ = calFuncInfo.numWavefrontPerSIMD; - workGroupInfo_.wavefrontSize_ = calFuncInfo.wavefrontSize; - workGroupInfo_.availableGPRs_ = calFuncInfo.numGPRsAvailable; - workGroupInfo_.usedGPRs_ = calFuncInfo.numGPRsUsed; - workGroupInfo_.availableSGPRs_ = calFuncInfo.numSGPRsAvailable; - workGroupInfo_.usedSGPRs_ = calFuncInfo.numSGPRsUsed; - workGroupInfo_.availableVGPRs_ = calFuncInfo.numVGPRsAvailable; - workGroupInfo_.usedVGPRs_ = calFuncInfo.numVGPRsUsed; - workGroupInfo_.availableLDSSize_ = calFuncInfo.LDSSizeAvailable; - workGroupInfo_.usedLDSSize_ = calFuncInfo.LDSSizeUsed; - workGroupInfo_.availableStackSize_ = calFuncInfo.stackSizeAvailable; - workGroupInfo_.usedStackSize_ = calFuncInfo.stackSizeUsed; + amd::option::Options* Opts = (amd::option::Options*)bin->options; - device::Kernel::parameters_t params; - if (!createSignature(params)) { - return false; + // Append an 
option so that we can selectively enable a SCOption on CZ + // whenever IOMMUv2 is enabled. + if (nullDev().settings().svmFineGrainSystem_) { + options->origOptionStr.append(" -sc-xnack-iommu"); + } + // temporary solution to synchronize buildNo between runtime and complib + // until we move runtime inside complib + Opts->setBuildNo(options->getBuildNo()); + + // pass kernel name to compiler + Opts->setCurrKernelName(name().c_str()); + + err = aclCompile(nullDev().compiler(), bin, options->origOptionStr.c_str(), ACL_TYPE_AMDIL_TEXT, + ACL_TYPE_ISA, NULL); + + buildLog_ += aclGetCompilerLog(nullDev().compiler()); + + if (err != ACL_SUCCESS) { + LogWarning("aclCompile failed"); + aclBinaryFini(bin); + return false; + } + if (!options->oVariables->BinEXE) { + // Early exit if binary doesn't contain EXE + aclBinaryFini(bin); + return true; + } + size_t len; + const void* isa = aclExtractSection(nullDev().compiler(), bin, &len, aclTEXT, &err); + if (err != ACL_SUCCESS) { + LogWarning("aclExtractSection failed"); + aclBinaryFini(bin); + return false; } - return true; + uint calImageSize; + if (!createMultiBinary(&calImageSize, reinterpret_cast(&calImage), isa)) { + LogWarning("initSrcEncoding failed"); + aclBinaryFini(bin); + return false; + } + + aclBinaryFini(bin); + } else if ((binaryCode != NULL) && (binarySize != 0)) { + uint size = 0; + if (!amuABIMultiBinaryGetSize(&size, const_cast(binaryCode)) || size > binarySize) { + buildLog_ += "Invalid binary image"; + LogError("amuABIMultiBinaryGetSize failed!"); + return false; + } + + calImage = static_cast(malloc(size)); + ::memcpy(calImage, binaryCode, size); + } else { + LogError("Incorrect initialization parameters!"); + return false; + } + + calRef_ = new CalImageReference(calImage); + if (calRef_ == NULL) { + LogError("Memory allocation failure!"); + // Free CAL image + free(calImage); + return false; + } + + CALfuncInfo calFuncInfo; + + // Get kernel compiled information + getFuncInfoFromImage(calImage, 
&calFuncInfo); + if (calFuncInfo.maxScratchRegsNeeded > 0) { + LogPrintfInfo( + "%s kernel has register spilling." + "Lower performance is expected.", + name().c_str()); + } + + workGroupInfo_.scratchRegs_ = calFuncInfo.maxScratchRegsNeeded; + workGroupInfo_.wavefrontPerSIMD_ = calFuncInfo.numWavefrontPerSIMD; + workGroupInfo_.wavefrontSize_ = calFuncInfo.wavefrontSize; + workGroupInfo_.availableGPRs_ = calFuncInfo.numGPRsAvailable; + workGroupInfo_.usedGPRs_ = calFuncInfo.numGPRsUsed; + workGroupInfo_.availableSGPRs_ = calFuncInfo.numSGPRsAvailable; + workGroupInfo_.usedSGPRs_ = calFuncInfo.numSGPRsUsed; + workGroupInfo_.availableVGPRs_ = calFuncInfo.numVGPRsAvailable; + workGroupInfo_.usedVGPRs_ = calFuncInfo.numVGPRsUsed; + workGroupInfo_.availableLDSSize_ = calFuncInfo.LDSSizeAvailable; + workGroupInfo_.usedLDSSize_ = calFuncInfo.LDSSizeUsed; + workGroupInfo_.availableStackSize_ = calFuncInfo.stackSizeAvailable; + workGroupInfo_.usedStackSize_ = calFuncInfo.stackSizeUsed; + + device::Kernel::parameters_t params; + if (!createSignature(params)) { + return false; + } + + return true; } -size_t -NullKernel::getCalBinarySize() const -{ - CALuint imageSize; - if (!amuABIMultiBinaryGetSize(&imageSize, calImage())) { - LogError("Failed to get the image size!"); - return 0; - } - return static_cast(imageSize); +size_t NullKernel::getCalBinarySize() const { + CALuint imageSize; + if (!amuABIMultiBinaryGetSize(&imageSize, calImage())) { + LogError("Failed to get the image size!"); + return 0; + } + return static_cast(imageSize); } -bool -NullKernel::getCalBinary(void* binary, size_t size) const -{ - uint calImageSize = 0; - if (!amuABIMultiBinaryGetSize(&calImageSize, calImage()) - || size < calImageSize) { - LogError("CAL failed to save the kernel binary!"); - return false; - } - ::memcpy(binary, calImage(), calImageSize); +bool NullKernel::getCalBinary(void* binary, size_t size) const { + uint calImageSize = 0; + if (!amuABIMultiBinaryGetSize(&calImageSize, calImage()) 
|| size < calImageSize) { + LogError("CAL failed to save the kernel binary!"); + return false; + } + ::memcpy(binary, calImage(), calImageSize); - return true; + return true; } -bool -Kernel::create( - const std::string& code, - const std::string& metadata, - const void* binaryCode, - size_t binarySize) -{ - setPreferredSizeMultiple(dev().getAttribs().wavefrontSize); +bool Kernel::create(const std::string& code, const std::string& metadata, const void* binaryCode, + size_t binarySize) { + setPreferredSizeMultiple(dev().getAttribs().wavefrontSize); - if (!NullKernel::create(code, metadata, binaryCode, binarySize)) { - return false; - } + if (!NullKernel::create(code, metadata, binaryCode, binarySize)) { + return false; + } - // initialize constant buffer sizes - if (!initConstBuffers()) { - return false; - } + // initialize constant buffer sizes + if (!initConstBuffers()) { + return false; + } - // Initialize the kernel parameters - bool result = initParameters(); + // Initialize the kernel parameters + bool result = initParameters(); - // Wave limiter needs to be initialized after kernel metadata is parsed - // Since it depends on it. - waveLimiter_.enable(dev().settings().ciPlus_); + // Wave limiter needs to be initialized after kernel metadata is parsed + // Since it depends on it. 
+ waveLimiter_.enable(dev().settings().ciPlus_); - if (result) { - buildError_ = CL_SUCCESS; - } - else { - result = false; - } + if (result) { + buildError_ = CL_SUCCESS; + } else { + result = false; + } - return result; + return result; } -Kernel::Kernel( - const std::string& name, - const Device& gpuDev, - const Program& prog, - const InitData* initData) - : NullKernel(name, gpuDev, prog) - , waveLimiter_(this, dev().getAttribs().numberOfCUsperShaderArray * dev().hwInfo()->simdPerCU_) -{ - hwPrivateSize_ = 0; - if (NULL != initData) { - flags_ = initData->flags_; - hwPrivateSize_ = initData->hwPrivateSize_; - hwLocalSize_ = initData->hwLocalSize_; - } - // Workgroup info private memory size - workGroupInfo_.privateMemSize_ = hwPrivateSize_; - // Default wavesPerSimdHint_ - workGroupInfo_.wavesPerSimdHint_ = ~0U; - hsa_ = false; +Kernel::Kernel(const std::string& name, const Device& gpuDev, const Program& prog, + const InitData* initData) + : NullKernel(name, gpuDev, prog), + waveLimiter_(this, + dev().getAttribs().numberOfCUsperShaderArray * dev().hwInfo()->simdPerCU_) { + hwPrivateSize_ = 0; + if (NULL != initData) { + flags_ = initData->flags_; + hwPrivateSize_ = initData->hwPrivateSize_; + hwLocalSize_ = initData->hwLocalSize_; + } + // Workgroup info private memory size + workGroupInfo_.privateMemSize_ = hwPrivateSize_; + // Default wavesPerSimdHint_ + workGroupInfo_.wavesPerSimdHint_ = ~0U; + hsa_ = false; } -Kernel::~Kernel() -{ - if (calRef_ == NULL) { - return; +Kernel::~Kernel() { + if (calRef_ == NULL) { + return; + } + + { + Device::ScopedLockVgpus lock(dev()); + + // Release all virtual image objects on all virtual GPUs + for (uint idx = 0; idx < dev().vgpus().size(); ++idx) { + dev().vgpus()[idx]->releaseKernel(calImage()); } + } - { - Device::ScopedLockVgpus lock(dev()); + if (0 != numCb_) { + delete[] cbSizes_; + } +} - // Release all virtual image objects on all virtual GPUs - for (uint idx = 0; idx < dev().vgpus().size(); ++idx) { - 
dev().vgpus()[idx]->releaseKernel(calImage()); +const Device& Kernel::dev() const { return reinterpret_cast(gpuDev_); } + +const Program& Kernel::prog() const { return reinterpret_cast(prog_); } + +bool NullKernel::createMultiBinary(uint* imageSize, void** image, const void* isa) { + const SC_HWSHADER* shader = reinterpret_cast(isa); + + bool result = false; + AMUabiAddEncoding encoding; + memset(&encoding, 0, sizeof(AMUabiAddEncoding)); + + size_t allocSize = sizeof(uint) * MaxReadImage + sizeof(CALUavEntry) * MaxUavArguments + + sizeof(CALSamplerMapEntry) * MaxSamplers + sizeof(CALConstantBufferMask) * MaxConstBuffers + + sizeof(AMUabiLiteralConst) * shader->dep.NumIntrlIConstants; + char* tmpMem = new char[allocSize]; + if (tmpMem == NULL) { + LogError("Error allocating memory"); + return false; + } + + CalcPtr(encoding.inputs, tmpMem, 0, 0); + CalcPtr(encoding.uav, encoding.inputs, sizeof(uint), MaxReadImage); + CalcPtr(encoding.inputSamplerMaps, encoding.uav, sizeof(CALUavEntry), MaxUavArguments); + CalcPtr(encoding.constBuffers, encoding.inputSamplerMaps, sizeof(CALSamplerMapEntry), + MaxSamplers); + if (shader->dep.NumIntrlIConstants != 0) { + CalcPtr(encoding.litConsts, encoding.constBuffers, sizeof(CALConstantBufferMask), + MaxConstBuffers); + } + AMUabiMultiBinary amuBinary; + amuABIMultiBinaryCreate(&amuBinary); + + result = siCreateHwInfo(shader, encoding); + if (!result) { + delete[] tmpMem; + LogWarning("Error Creating program info"); + return false; + } + + addLoopConst(shader, encoding); + + unsigned int outputCount = 0, condOut = 0, earlyExit = 0, globalCount = 0, persistentCount = 0; + unsigned int symbolCount = 0; + CALOutputEntry* outputs = 0; + unsigned int* globalBuffers = 0; + unsigned int* persistentBuffers = 0; + AMUabiUserSymbol* symbols = 0; + + CALSamplerMapEntry* inputSamplers = encoding.inputSamplerMaps; + CALConstantBufferMask* constBuffers = encoding.constBuffers; + uint* inputResources = encoding.inputs; + CALUavEntry* uav = 
encoding.uav; + + uint inputSamplerCount = samplerSize(); + for (uint i = 0; i < inputSamplerCount; ++i) { + inputSamplers[i].resource = 0; + inputSamplers[i].sampler = sampler(i)->index_; + } + + uint constBufferCount = 2; + + constBuffers[0].index = 0; + constBuffers[1].index = 1; + + uint inputResourceCount = 0; + + uint uavCount = 0; + bool cbBound = false; + bool printfBound = false; + for (uint i = 0; i < arguments_.size(); ++i) { + const KernelArg* arg = argument(i); + switch (arg->type_) { + case KernelArg::PointerConst: + case KernelArg::PointerHwConst: + constBuffers[constBufferCount++].index = arg->index_; + break; + case KernelArg::PointerGlobal: + uav[uavCount].offset = arg->index_; + uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPELESS; + uav[uavCount].dimension = AMU_ABI_DIM_BUFFER; + uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS; + uavCount++; + break; + case KernelArg::ConstBufId: + if (!cbBound) { + uav[uavCount].offset = cbId_; + uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW; + uav[uavCount].dimension = AMU_ABI_DIM_BUFFER; + uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS; + uavCount++; } - } - - if (0 != numCb_) { - delete [] cbSizes_; - } -} - -const Device& -Kernel::dev() const -{ - return reinterpret_cast(gpuDev_); -} - -const Program& -Kernel::prog() const -{ - return reinterpret_cast(prog_); -} - -bool -NullKernel::createMultiBinary(uint* imageSize, void** image, const void* isa) -{ - const SC_HWSHADER* shader = reinterpret_cast(isa); - - bool result = false; - AMUabiAddEncoding encoding; - memset(&encoding, 0, sizeof(AMUabiAddEncoding)); - - size_t allocSize = - sizeof(uint) * MaxReadImage + - sizeof(CALUavEntry) * MaxUavArguments + - sizeof(CALSamplerMapEntry) * MaxSamplers + - sizeof(CALConstantBufferMask) * MaxConstBuffers + - sizeof(AMUabiLiteralConst) * shader->dep.NumIntrlIConstants; - char* tmpMem = new char[allocSize]; - if (tmpMem == NULL) { - LogError("Error allocating memory"); - return false; - } - - CalcPtr(encoding.inputs, 
tmpMem, 0, 0); - CalcPtr(encoding.uav, encoding.inputs, sizeof(uint), MaxReadImage); - CalcPtr(encoding.inputSamplerMaps, encoding.uav, sizeof(CALUavEntry), MaxUavArguments); - CalcPtr(encoding.constBuffers, encoding.inputSamplerMaps, sizeof(CALSamplerMapEntry), MaxSamplers); - if (shader->dep.NumIntrlIConstants != 0) { - CalcPtr(encoding.litConsts, encoding.constBuffers, sizeof(CALConstantBufferMask), MaxConstBuffers); - } - AMUabiMultiBinary amuBinary; - amuABIMultiBinaryCreate(&amuBinary); - - result = siCreateHwInfo(shader, encoding); - if (!result) { - delete [] tmpMem; - LogWarning("Error Creating program info"); - return false; - } - - addLoopConst(shader, encoding); - - unsigned int outputCount=0, condOut=0, earlyExit=0, globalCount=0, persistentCount=0; - unsigned int symbolCount=0; - CALOutputEntry* outputs=0; - unsigned int* globalBuffers=0; - unsigned int* persistentBuffers=0; - AMUabiUserSymbol* symbols=0; - - CALSamplerMapEntry* inputSamplers = encoding.inputSamplerMaps; - CALConstantBufferMask* constBuffers = encoding.constBuffers; - uint* inputResources = encoding.inputs; - CALUavEntry* uav = encoding.uav; - - uint inputSamplerCount = samplerSize(); - for (uint i = 0; i < inputSamplerCount; ++i) { - inputSamplers[i].resource = 0; - inputSamplers[i].sampler = sampler(i)->index_; - } - - uint constBufferCount = 2; - - constBuffers[0].index = 0; - constBuffers[1].index = 1; - - uint inputResourceCount = 0; - - uint uavCount = 0; - bool cbBound = false; - bool printfBound = false; - for (uint i = 0; i < arguments_.size(); ++i) { - const KernelArg* arg = argument(i); - switch (arg->type_) { - case KernelArg::PointerConst: - case KernelArg::PointerHwConst: - constBuffers[constBufferCount++].index = arg->index_; - break; - case KernelArg::PointerGlobal: - uav[uavCount].offset = arg->index_; - uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPELESS; + cbBound = true; + break; + case KernelArg::PrintfBufId: + if (!printfBound) { + uav[uavCount].offset = printfId_; + 
uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW; + uav[uavCount].dimension = AMU_ABI_DIM_BUFFER; + uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS; + uavCount++; + } + printfBound = true; + break; + case KernelArg::UavId: + if ((UavIdUndefined != uavRaw_) && !(flags() & PrintfOutput)) { + uav[uavCount].offset = arg->index_; + uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPELESS; + uav[uavCount].dimension = AMU_ABI_DIM_BUFFER; + uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS; + uavCount++; + } else { + if (UavIdUndefined != uavRaw_) { + uav[uavCount].offset = uavRaw_; + uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW; uav[uavCount].dimension = AMU_ABI_DIM_BUFFER; uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS; uavCount++; - break; - case KernelArg::ConstBufId: - if (!cbBound) { - uav[uavCount].offset = cbId_; - uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW; - uav[uavCount].dimension = AMU_ABI_DIM_BUFFER; - uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS; - uavCount++; - } - cbBound = true; - break; - case KernelArg::PrintfBufId: - if (!printfBound) { - uav[uavCount].offset = printfId_; - uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW; - uav[uavCount].dimension = AMU_ABI_DIM_BUFFER; - uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS; - uavCount++; - } - printfBound = true; - break; - case KernelArg::UavId: - if ((UavIdUndefined != uavRaw_) && - !(flags() & PrintfOutput)) { - uav[uavCount].offset = arg->index_; - uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPELESS; - uav[uavCount].dimension = AMU_ABI_DIM_BUFFER; - uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS; - uavCount++; - } - else { - if (UavIdUndefined != uavRaw_) { - uav[uavCount].offset = uavRaw_; - uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW; - uav[uavCount].dimension = AMU_ABI_DIM_BUFFER; - uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS; - uavCount++; - } - } - break; - case KernelArg::Sampler: - inputSamplers[inputSamplerCount].resource = 0; - inputSamplers[inputSamplerCount].sampler = arg->index_; - 
inputSamplerCount++; - break; - case KernelArg::Image1D: - case KernelArg::Image2D: - case KernelArg::Image3D: - case KernelArg::Image1DB: - case KernelArg::Image1DA: - case KernelArg::Image2DA: - if (arg->memory_.readOnly_) { - inputResources[inputResourceCount++] = arg->index_; - } - else { - uav[uavCount].offset = arg->index_; - uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPED; - uav[uavCount].dimension = AMU_ABI_DIM_2D; - uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS; - uavCount++; - } - break; - default: - break; + } } + break; + case KernelArg::Sampler: + inputSamplers[inputSamplerCount].resource = 0; + inputSamplers[inputSamplerCount].sampler = arg->index_; + inputSamplerCount++; + break; + case KernelArg::Image1D: + case KernelArg::Image2D: + case KernelArg::Image3D: + case KernelArg::Image1DB: + case KernelArg::Image1DA: + case KernelArg::Image2DA: + if (arg->memory_.readOnly_) { + inputResources[inputResourceCount++] = arg->index_; + } else { + uav[uavCount].offset = arg->index_; + uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPED; + uav[uavCount].dimension = AMU_ABI_DIM_2D; + uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS; + uavCount++; + } + break; + default: + break; } + } - for (uint i = 0; i < nullProg().glbCb().size(); ++i) { - constBuffers[constBufferCount++].index = nullProg().glbCb()[i]; - } + for (uint i = 0; i < nullProg().glbCb().size(); ++i) { + constBuffers[constBufferCount++].index = nullProg().glbCb()[i]; + } - encoding.machine = nullDev().hwInfo()->machine_; - encoding.type = ED_ATI_CAL_TYPE_COMPUTE; - encoding.inputCount = inputResourceCount; - encoding.outputCount = outputCount; - encoding.outputs = outputs; - encoding.condOut = condOut; - encoding.earlyExit = earlyExit; - encoding.globalBuffersCount = globalCount; - encoding.globalBuffers = globalBuffers; - encoding.persistentBuffersCount = persistentCount; - encoding.persistentBuffers = persistentBuffers; - encoding.constBuffersCount = constBufferCount; - encoding.inputSamplerMapCount = 
inputSamplerCount; - encoding.symbolsCount = symbolCount; - encoding.symbols = symbols; - encoding.uavCount = uavCount; + encoding.machine = nullDev().hwInfo()->machine_; + encoding.type = ED_ATI_CAL_TYPE_COMPUTE; + encoding.inputCount = inputResourceCount; + encoding.outputCount = outputCount; + encoding.outputs = outputs; + encoding.condOut = condOut; + encoding.earlyExit = earlyExit; + encoding.globalBuffersCount = globalCount; + encoding.globalBuffers = globalBuffers; + encoding.persistentBuffersCount = persistentCount; + encoding.persistentBuffers = persistentBuffers; + encoding.constBuffersCount = constBufferCount; + encoding.inputSamplerMapCount = inputSamplerCount; + encoding.symbolsCount = symbolCount; + encoding.symbols = symbols; + encoding.uavCount = uavCount; - amuABIMultiBinaryAddEncoding(amuBinary, &encoding); + amuABIMultiBinaryAddEncoding(amuBinary, &encoding); - uint success = amuABIMultiBinaryPack(imageSize, image, amuBinary); + uint success = amuABIMultiBinaryPack(imageSize, image, amuBinary); - amuABIMultiBinaryDestroy(amuBinary); + amuABIMultiBinaryDestroy(amuBinary); - delete [] tmpMem; - delete [] encoding.progInfos; + delete[] tmpMem; + delete[] encoding.progInfos; - return (success == 0) ? false : true; + return (success == 0) ? 
false : true; } -void -Kernel::findLocalWorkSize( - size_t workDim, - const amd::NDRange& gblWorkSize, - amd::NDRange& lclWorkSize) const -{ - // Initialize the default workgoup info - // Check if the kernel has the compiled sizes - if (workGroupInfo()->compileSize_[0] == 0) { - // Find the default local workgroup size, if it wasn't specified - if (lclWorkSize[0] == 0) { - size_t thrPerGrp; - bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE); - bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) || - !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y); - bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) || - !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) || - !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z); +void Kernel::findLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize, + amd::NDRange& lclWorkSize) const { + // Initialize the default workgoup info + // Check if the kernel has the compiled sizes + if (workGroupInfo()->compileSize_[0] == 0) { + // Find the default local workgroup size, if it wasn't specified + if (lclWorkSize[0] == 0) { + size_t thrPerGrp; + bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE); + bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) || + !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y); + bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) || + !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) || + !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z); - bool overrideSet = ((workDim == 1) && b1DOverrideSet) || - ((workDim == 2) && b2DOverrideSet) || - ((workDim == 3) && b3DOverrideSet); - if (!overrideSet) { - // Find threads per group - thrPerGrp = workGroupInfo()->size_; + bool overrideSet = ((workDim == 1) && b1DOverrideSet) || ((workDim == 2) && b2DOverrideSet) || + ((workDim == 3) && b3DOverrideSet); + if (!overrideSet) { + // Find threads per group + thrPerGrp = workGroupInfo()->size_; - // Check if kernel uses images - if ((flags() & ImageEnable) && - // and thread group 
is a multiple value of wavefronts - ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) && - // and it's 2 or 3-dimensional workload - (workDim > 1) && - ((dev().settings().partialDispatch_) || - (((gblWorkSize[0] % 16) == 0) && - ((gblWorkSize[1] % 16) == 0)))) { - // Use 8x8 workgroup size if kernel has image writes - if ((flags() & ImageWrite) || - (thrPerGrp != nullDev().info().maxWorkGroupSize_)) { - lclWorkSize[0] = 8; - lclWorkSize[1] = 8; - } - else { - lclWorkSize[0] = 16; - lclWorkSize[1] = 16; - } - if (workDim == 3) { - lclWorkSize[2] = 1; - } - } - else { - size_t tmp = thrPerGrp; - // Split the local workgroup into the most efficient way - for (uint d = 0; d < workDim; ++d) { - size_t div = tmp; - for (; (gblWorkSize[d] % div) != 0; div--); - lclWorkSize[d] = div; - tmp /= div; - } - // Assuming DWORD access - const uint cacheLineMatch = dev().settings().cacheLineSize_ >> 2; + // Check if kernel uses images + if ((flags() & ImageEnable) && + // and thread group is a multiple value of wavefronts + ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) && + // and it's 2 or 3-dimensional workload + (workDim > 1) && ((dev().settings().partialDispatch_) || + (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0)))) { + // Use 8x8 workgroup size if kernel has image writes + if ((flags() & ImageWrite) || (thrPerGrp != nullDev().info().maxWorkGroupSize_)) { + lclWorkSize[0] = 8; + lclWorkSize[1] = 8; + } else { + lclWorkSize[0] = 16; + lclWorkSize[1] = 16; + } + if (workDim == 3) { + lclWorkSize[2] = 1; + } + } else { + size_t tmp = thrPerGrp; + // Split the local workgroup into the most efficient way + for (uint d = 0; d < workDim; ++d) { + size_t div = tmp; + for (; (gblWorkSize[d] % div) != 0; div--) + ; + lclWorkSize[d] = div; + tmp /= div; + } + // Assuming DWORD access + const uint cacheLineMatch = dev().settings().cacheLineSize_ >> 2; - // Check if partial dispatch is enabled and - if (dev().settings().partialDispatch_ && - // we couldn't find 
optimal workload - (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) || - // or size is too small for the cache line - (lclWorkSize[0] < cacheLineMatch))) { - size_t maxSize = 0; - size_t maxDim = 0; - for (uint d = 0; d < workDim; ++d) { - if (maxSize < gblWorkSize[d]) { - maxSize = gblWorkSize[d]; - maxDim = d; - } - } - // Use X dimension as high priority. Runtime will assume that - // X dimension is more important for the address calculation - if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) { - lclWorkSize[0] = cacheLineMatch; - thrPerGrp /= cacheLineMatch; - lclWorkSize[maxDim] = thrPerGrp; - for (uint d = 1; d < workDim; ++d) { - if (d != maxDim) { - lclWorkSize[d] = 1; - } - } - } - else { - // Check if a local workgroup has the most optimal size - if (thrPerGrp > maxSize) { - thrPerGrp = maxSize; - } - lclWorkSize[maxDim] = thrPerGrp; - for (uint d = 0; d < workDim; ++d) { - if (d != maxDim) { - lclWorkSize[d] = 1; - } - } - } - } - } + // Check if partial dispatch is enabled and + if (dev().settings().partialDispatch_ && + // we couldn't find optimal workload + (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) || + // or size is too small for the cache line + (lclWorkSize[0] < cacheLineMatch))) { + size_t maxSize = 0; + size_t maxDim = 0; + for (uint d = 0; d < workDim; ++d) { + if (maxSize < gblWorkSize[d]) { + maxSize = gblWorkSize[d]; + maxDim = d; + } } - else { - // Use overrides when app doesn't provide workgroup dimensions - if (workDim == 1) { - lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE; + // Use X dimension as high priority. 
Runtime will assume that + // X dimension is more important for the address calculation + if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) { + lclWorkSize[0] = cacheLineMatch; + thrPerGrp /= cacheLineMatch; + lclWorkSize[maxDim] = thrPerGrp; + for (uint d = 1; d < workDim; ++d) { + if (d != maxDim) { + lclWorkSize[d] = 1; } - else if (workDim == 2) { - lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X; - lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y; - } - else if (workDim == 3) { - lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X; - lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y; - lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z; - } - else - { - assert(0 && "Invalid workDim!"); + } + } else { + // Check if a local workgroup has the most optimal size + if (thrPerGrp > maxSize) { + thrPerGrp = maxSize; + } + lclWorkSize[maxDim] = thrPerGrp; + for (uint d = 0; d < workDim; ++d) { + if (d != maxDim) { + lclWorkSize[d] = 1; } + } } + } } - } - else { - for (uint d = 0; d < workDim; ++d) { - lclWorkSize[d] = workGroupInfo()->compileSize_[d]; + } else { + // Use overrides when app doesn't provide workgroup dimensions + if (workDim == 1) { + lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE; + } else if (workDim == 2) { + lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X; + lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y; + } else if (workDim == 3) { + lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X; + lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y; + lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z; + } else { + assert(0 && "Invalid workDim!"); } + } } + } else { + for (uint d = 0; d < workDim; ++d) { + lclWorkSize[d] = workGroupInfo()->compileSize_[d]; + } + } } -void -Kernel::setupProgramGrid( - VirtualGPU& gpu, - size_t workDim, - const amd::NDRange& glbWorkOffset, - const amd::NDRange& gblWorkSize, - amd::NDRange& lclWorkSize, - const amd::NDRange& groupOffset, - const amd::NDRange& glbWorkOffsetOrg, - const amd::NDRange& glbWorkSizeOrg - ) const -{ - // ABI is always in CB0 - address cbBuf = 
gpu.cb(0)->sysMemCopy(); - uint* pGlobalSize = reinterpret_cast - (cbBuf + GlobalWorkitemOffset * ConstBuffer::VectorSize); - uint* pLocalSize = reinterpret_cast - (cbBuf + LocalWorkitemOffset * ConstBuffer::VectorSize); - uint* pNumGroups = reinterpret_cast - (cbBuf + GroupsOffset * ConstBuffer::VectorSize); - uint* pGlobalOffset = reinterpret_cast - (cbBuf + GlobalWorkOffsetOffset * ConstBuffer::VectorSize); - uint* pGroupOffset = reinterpret_cast - (cbBuf + GroupWorkOffsetOffset * ConstBuffer::VectorSize); - uint32_t* debugInfo = reinterpret_cast - (cbBuf + DebugOffset * ConstBuffer::VectorSize); - uint* pNDRangeGlobalOffset = reinterpret_cast - (cbBuf + NDRangeGlobalWorkOffsetOffset * ConstBuffer::VectorSize); +void Kernel::setupProgramGrid(VirtualGPU& gpu, size_t workDim, const amd::NDRange& glbWorkOffset, + const amd::NDRange& gblWorkSize, amd::NDRange& lclWorkSize, + const amd::NDRange& groupOffset, const amd::NDRange& glbWorkOffsetOrg, + const amd::NDRange& glbWorkSizeOrg) const { + // ABI is always in CB0 + address cbBuf = gpu.cb(0)->sysMemCopy(); + uint* pGlobalSize = + reinterpret_cast(cbBuf + GlobalWorkitemOffset * ConstBuffer::VectorSize); + uint* pLocalSize = reinterpret_cast(cbBuf + LocalWorkitemOffset * ConstBuffer::VectorSize); + uint* pNumGroups = reinterpret_cast(cbBuf + GroupsOffset * ConstBuffer::VectorSize); + uint* pGlobalOffset = + reinterpret_cast(cbBuf + GlobalWorkOffsetOffset * ConstBuffer::VectorSize); + uint* pGroupOffset = + reinterpret_cast(cbBuf + GroupWorkOffsetOffset * ConstBuffer::VectorSize); + uint32_t* debugInfo = reinterpret_cast(cbBuf + DebugOffset * ConstBuffer::VectorSize); + uint* pNDRangeGlobalOffset = + reinterpret_cast(cbBuf + NDRangeGlobalWorkOffsetOffset * ConstBuffer::VectorSize); - // Check for 64-bit metadata - uint glbABIShift = (abi64Bit()) ? 1 : 0; + // Check for 64-bit metadata + uint glbABIShift = (abi64Bit()) ? 
1 : 0; - VirtualGPU::CalVirtualDesc* progGrid = &gpu.cal_; + VirtualGPU::CalVirtualDesc* progGrid = &gpu.cal_; - // Finds local workgroup size - findLocalWorkSize(workDim, gblWorkSize, lclWorkSize); + // Finds local workgroup size + findLocalWorkSize(workDim, gblWorkSize, lclWorkSize); - // Initialize the execution grid block and size/offset - pGlobalSize[0] = pGlobalSize[1] = pGlobalSize[2] = 1; - pGlobalSize[3] = static_cast(workDim); + // Initialize the execution grid block and size/offset + pGlobalSize[0] = pGlobalSize[1] = pGlobalSize[2] = 1; + pGlobalSize[3] = static_cast(workDim); - pLocalSize[0] = pLocalSize[1] = pLocalSize[2] = 1; - pLocalSize[3] = 0; + pLocalSize[0] = pLocalSize[1] = pLocalSize[2] = 1; + pLocalSize[3] = 0; - pNumGroups[0] = pNumGroups[1] = pNumGroups[2] = 1; - pNumGroups[3] = 0; + pNumGroups[0] = pNumGroups[1] = pNumGroups[2] = 1; + pNumGroups[3] = 0; - pGlobalOffset[2] = pGlobalOffset[1] = pGlobalOffset[0] = 0; - pGroupOffset[2] = pGroupOffset[1] = pGroupOffset[0] = 0; + pGlobalOffset[2] = pGlobalOffset[1] = pGlobalOffset[0] = 0; + pGroupOffset[2] = pGroupOffset[1] = pGroupOffset[0] = 0; - progGrid->gridBlock.width = - progGrid->gridBlock.height = - progGrid->gridBlock.depth = 1; + progGrid->gridBlock.width = progGrid->gridBlock.height = progGrid->gridBlock.depth = 1; - progGrid->gridSize.width = - progGrid->gridSize.height = - progGrid->gridSize.depth = 1; + progGrid->gridSize.width = progGrid->gridSize.height = progGrid->gridSize.depth = 1; - progGrid->partialGridBlock.width = - progGrid->partialGridBlock.height = - progGrid->partialGridBlock.depth = 1; + progGrid->partialGridBlock.width = progGrid->partialGridBlock.height = + progGrid->partialGridBlock.depth = 1; - bool partialGrid = false; + bool partialGrid = false; - // Fill the right values, based on the application request - switch (workDim) { + // Fill the right values, based on the application request + switch (workDim) { case 3: - pLocalSize[2] = - progGrid->gridBlock.depth = 
static_cast(lclWorkSize[2]); + pLocalSize[2] = progGrid->gridBlock.depth = static_cast(lclWorkSize[2]); - pGlobalSize[2] = static_cast(glbWorkSizeOrg[2]); - progGrid->gridSize.depth = static_cast(gblWorkSize[2]); - progGrid->gridSize.depth /= progGrid->gridBlock.depth; - pNumGroups[2] = pGlobalSize[2] / progGrid->gridBlock.depth; + pGlobalSize[2] = static_cast(glbWorkSizeOrg[2]); + progGrid->gridSize.depth = static_cast(gblWorkSize[2]); + progGrid->gridSize.depth /= progGrid->gridBlock.depth; + pNumGroups[2] = pGlobalSize[2] / progGrid->gridBlock.depth; - pGlobalOffset[2] = glbWorkOffset[2]; - pGroupOffset[2] = groupOffset[2]; - pNDRangeGlobalOffset[2 + glbABIShift] = glbWorkOffsetOrg[2]; + pGlobalOffset[2] = glbWorkOffset[2]; + pGroupOffset[2] = groupOffset[2]; + pNDRangeGlobalOffset[2 + glbABIShift] = glbWorkOffsetOrg[2]; - if (dev().settings().partialDispatch_) { - // Check if partial workgroup dispatch is required - progGrid->partialGridBlock.depth = gblWorkSize[2] % lclWorkSize[2]; - if (progGrid->partialGridBlock.depth != 0) { - partialGrid = true; - // Increment the number of groups - progGrid->gridSize.depth++; - pNumGroups[2]++; - } - else { - progGrid->partialGridBlock.depth = lclWorkSize[2]; - } + if (dev().settings().partialDispatch_) { + // Check if partial workgroup dispatch is required + progGrid->partialGridBlock.depth = gblWorkSize[2] % lclWorkSize[2]; + if (progGrid->partialGridBlock.depth != 0) { + partialGrid = true; + // Increment the number of groups + progGrid->gridSize.depth++; + pNumGroups[2]++; + } else { + progGrid->partialGridBlock.depth = lclWorkSize[2]; } - // Fall through to fill 2D and 1D dimensions... + } + // Fall through to fill 2D and 1D dimensions... 
case 2: - pLocalSize[1] = - progGrid->gridBlock.height = static_cast(lclWorkSize[1]); + pLocalSize[1] = progGrid->gridBlock.height = static_cast(lclWorkSize[1]); - pGlobalSize[1] = static_cast(glbWorkSizeOrg[1]); - progGrid->gridSize.height = static_cast(gblWorkSize[1]); - progGrid->gridSize.height /= progGrid->gridBlock.height; - pNumGroups[1] = pGlobalSize[1] / progGrid->gridBlock.height; + pGlobalSize[1] = static_cast(glbWorkSizeOrg[1]); + progGrid->gridSize.height = static_cast(gblWorkSize[1]); + progGrid->gridSize.height /= progGrid->gridBlock.height; + pNumGroups[1] = pGlobalSize[1] / progGrid->gridBlock.height; - pGlobalOffset[1] = glbWorkOffset[1]; - pGroupOffset[1] = groupOffset[1]; - pNDRangeGlobalOffset[1 + glbABIShift] = glbWorkOffsetOrg[1]; + pGlobalOffset[1] = glbWorkOffset[1]; + pGroupOffset[1] = groupOffset[1]; + pNDRangeGlobalOffset[1 + glbABIShift] = glbWorkOffsetOrg[1]; - if (dev().settings().partialDispatch_) { - // Check if partial workgroup dispatch is required - progGrid->partialGridBlock.height = gblWorkSize[1] % lclWorkSize[1]; - if (progGrid->partialGridBlock.height != 0) { - partialGrid = true; - // Increment the number of groups - progGrid->gridSize.height++; - pNumGroups[1]++; - } - else { - progGrid->partialGridBlock.height = lclWorkSize[1]; - } + if (dev().settings().partialDispatch_) { + // Check if partial workgroup dispatch is required + progGrid->partialGridBlock.height = gblWorkSize[1] % lclWorkSize[1]; + if (progGrid->partialGridBlock.height != 0) { + partialGrid = true; + // Increment the number of groups + progGrid->gridSize.height++; + pNumGroups[1]++; + } else { + progGrid->partialGridBlock.height = lclWorkSize[1]; } - // Fall through to fill 1D dimension... + } + // Fall through to fill 1D dimension... 
case 1: - pLocalSize[0] = - progGrid->gridBlock.width = static_cast(lclWorkSize[0]); + pLocalSize[0] = progGrid->gridBlock.width = static_cast(lclWorkSize[0]); - pGlobalSize[0] = static_cast(glbWorkSizeOrg[0]); - progGrid->gridSize.width = static_cast(gblWorkSize[0]); - progGrid->gridSize.width /= progGrid->gridBlock.width; - pNumGroups[0] = pGlobalSize[0] / progGrid->gridBlock.width; + pGlobalSize[0] = static_cast(glbWorkSizeOrg[0]); + progGrid->gridSize.width = static_cast(gblWorkSize[0]); + progGrid->gridSize.width /= progGrid->gridBlock.width; + pNumGroups[0] = pGlobalSize[0] / progGrid->gridBlock.width; - pGlobalOffset[0] = glbWorkOffset[0]; - pGroupOffset[0] = groupOffset[0]; - pNDRangeGlobalOffset[0 + glbABIShift] = glbWorkOffsetOrg[0]; + pGlobalOffset[0] = glbWorkOffset[0]; + pGroupOffset[0] = groupOffset[0]; + pNDRangeGlobalOffset[0 + glbABIShift] = glbWorkOffsetOrg[0]; - if (dev().settings().partialDispatch_) { - // Check if partial workgroup dispatch is required - progGrid->partialGridBlock.width = gblWorkSize[0] % lclWorkSize[0]; - if (progGrid->partialGridBlock.width != 0) { - partialGrid = true; - // Increment the number of groups - progGrid->gridSize.width++; - pNumGroups[0]++; - } - else { - progGrid->partialGridBlock.width = lclWorkSize[0]; - } + if (dev().settings().partialDispatch_) { + // Check if partial workgroup dispatch is required + progGrid->partialGridBlock.width = gblWorkSize[0] % lclWorkSize[0]; + if (progGrid->partialGridBlock.width != 0) { + partialGrid = true; + // Increment the number of groups + progGrid->gridSize.width++; + pNumGroups[0]++; + } else { + progGrid->partialGridBlock.width = lclWorkSize[0]; } - break; + } + break; default: - LogWarning("Wrong dimensions. Force to 1x1x1!"); - break; - } + LogWarning("Wrong dimensions. 
Force to 1x1x1!"); + break; + } - if (!partialGrid) { - progGrid->partialGridBlock.width = - progGrid->partialGridBlock.height = + if (!partialGrid) { + progGrid->partialGridBlock.width = progGrid->partialGridBlock.height = progGrid->partialGridBlock.depth = 0; - } + } - // Calculate the total number of workitems and workgroups - pGlobalOffset[3] = pGroupOffset[3] = 1; - for (uint i = 0; i < workDim; ++i) { - pGlobalOffset[3] *= pGlobalOffset[i]; - pGroupOffset[3] *= pGroupOffset[i]; + // Calculate the total number of workitems and workgroups + pGlobalOffset[3] = pGroupOffset[3] = 1; + for (uint i = 0; i < workDim; ++i) { + pGlobalOffset[3] *= pGlobalOffset[i]; + pGroupOffset[3] *= pGroupOffset[i]; + } + + // Setup debug output buffer (if printf is active) + if (flags() & PrintfOutput) { + if (abi64Bit()) { + // Setup the debug info in constant buffer + reinterpret_cast(debugInfo)[1] = gpu.printfDbg().bufOffset(); + // Size in DWORDs + debugInfo[4] = static_cast(gpu.printfDbg().wiDbgSize()); + debugInfo[4] /= sizeof(uint32_t); + } else { + // Setup the debug info in constant buffer + debugInfo[1] = static_cast(gpu.printfDbg().bufOffset()); + // Size in DWORDs + debugInfo[2] = static_cast(gpu.printfDbg().wiDbgSize()); + debugInfo[2] /= sizeof(uint32_t); + } + } +} + +bool Kernel::initParameters() { + size_t offset = 0; + device::Kernel::parameters_t params; + amd::KernelParameterDescriptor desc; + + for (uint i = 0; i < arguments_.size(); ++i) { + const KernelArg* arg = argument(i); + + // Initialize the arguments for the abstraction layer + if (arg->isCbNeeded()) { + desc.name_ = arg->name_.data(); + desc.type_ = arg->type(); + desc.size_ = arg->size(false); + desc.addressQualifier_ = arg->addressQualifier(); + desc.accessQualifier_ = arg->accessQualifier(); + desc.typeName_ = arg->typeName(); + desc.typeQualifier_ = arg->typeQualifier(); + + // Make offset alignment to match CPU metadata, since + // in multidevice config abstraction layer has a single signature + 
// and CPU sends the paramaters as they are allocated in memory + size_t size = desc.size_; + if (size == 0) { + // Local memory for CPU + size = sizeof(cl_mem); + } + offset = amd::alignUp(offset, std::min(size, size_t(16))); + desc.offset_ = offset; + offset += amd::alignUp(size, sizeof(uint32_t)); + params.push_back(desc); + } + } + + // Report the allocated local memory size (emulated and hw) + if (hwLocalSize_ != 0) { + CondLog((dev().info().localMemSize_ < hwLocalSize_), + "Requested local size is bigger than reported"); + workGroupInfo_.localMemSize_ = hwLocalSize_; + } + + if (!createSignature(params)) { + return false; + } + + return true; +} + +bool Kernel::bindGlobalHwCb(VirtualGPU& gpu, VirtualGPU::GslKernelDesc* desc) const { + bool result = true; + + // Bind HW constant buffers used for the global data store + const Program::HwConstBuffers& gds = prog().glbHwCb(); + for (Program::HwConstBuffers::const_iterator it = gds.begin(); (it != gds.end() && result); + ++it) { + uint idx = it->first; + result = bindResource(gpu, *(it->second), idx, ConstantBuffer, idx); + } + + return result; +} + +bool Kernel::bindConstantBuffers(VirtualGPU& gpu) const { + bool result = true; + + assert((numCb_ <= MaxConstBuffersArguments) && "Runtime doesn't support more CBs for arguments!"); + + // Upload the parameters to HW and bind all constant buffers + for (uint i = 0; i < numCb_; i++) { + ConstBuffer* cb = gpu.constBufs_[i]; + result &= cb->uploadDataToHw(cbSizes_[i]) && + bindResource(gpu, *cb, i, ConstantBuffer, i, cb->wrtOffset()); + } + + return result; +} + +void Kernel::processMemObjects(VirtualGPU& gpu, const amd::Kernel& kernel, const_address params, + bool nativeMem) const { + VirtualGPU::MemoryDependency& dependecy = gpu.memoryDependency(); + + // Mark the tracker with a new kernel, + // so we can avoid checks of the aliased objects + gpu.memoryDependency().newKernel(); + + // Check all parameters for the current kernel + const amd::KernelSignature& signature 
= kernel.signature(); + for (size_t i = 0; i < signature.numParameters(); ++i) { + const amd::KernelParameterDescriptor& desc = signature.at(i); + const KernelArg* arg = argument(i); + Memory* memory = NULL; + + // Find if current argument is a buffer + if ((desc.type_ == T_POINTER) && (arg->type_ != KernelArg::PointerLocal) && + (arg->type_ != KernelArg::PointerHwLocal)) { + if (nativeMem) { + memory = *reinterpret_cast(params + desc.offset_); + } else if (*reinterpret_cast(params + desc.offset_) != NULL) { + memory = dev().getGpuMemory(*reinterpret_cast(params + desc.offset_)); + // Synchronize data with other memory instances if necessary + memory->syncCacheFromHost(gpu); + } + + if (memory != NULL) { + // Validate memory for a dependency in the queue + gpu.memoryDependency().validate(gpu, memory, arg->memory_.readOnly_); + } + } + } +} + +bool Kernel::loadParameters(VirtualGPU& gpu, const amd::Kernel& kernel, const_address params, + bool nativeMem) const { + bool result = true; + uint i; + + // Initialize local private ranges + if (!initLocalPrivateRanges(gpu)) { + return false; + } + + if ((UavIdUndefined != uavRaw_) && (!(flags() & PrintfOutput) || (printfId_ != UavIdUndefined))) { + Memory* gpuMemory = dev().getGpuMemory(dev().dummyPage()); + // Bind a buffer for a dummy read + result = bindResource(gpu, *gpuMemory, 0, ArgumentUavID, uavRaw_); + } + + // Find all parameters for the current kernel + const amd::KernelSignature& signature = kernel.signature(); + for (i = 0; i != signature.numParameters(); ++i) { + const amd::KernelParameterDescriptor& desc = signature.at(i); + // Set current argument + if (!setArgument(gpu, i, params + desc.offset_, desc.size_, nativeMem)) { + result = false; + break; + } + } + + if (result) { + // Update the ring ranges and math constant + setLocalPrivateRanges(gpu); + + result = bindConstantBuffers(gpu); + + if (flags() & PrivateFixed) { + result &= bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_); } // Setup 
debug output buffer (if printf is active) if (flags() & PrintfOutput) { - if (abi64Bit()) { - // Setup the debug info in constant buffer - reinterpret_cast(debugInfo)[1] = - gpu.printfDbg().bufOffset(); - // Size in DWORDs - debugInfo[4] = static_cast(gpu.printfDbg().wiDbgSize()); - debugInfo[4] /= sizeof(uint32_t); - } - else { - // Setup the debug info in constant buffer - debugInfo[1] = static_cast(gpu.printfDbg().bufOffset()); - // Size in DWORDs - debugInfo[2] = static_cast(gpu.printfDbg().wiDbgSize()); - debugInfo[2] /= sizeof(uint32_t); - } + gpu.addVmMemory(gpu.printfDbg().dbgBuffer()); } + } + + return result; } -bool -Kernel::initParameters() -{ - size_t offset = 0; - device::Kernel::parameters_t params; - amd::KernelParameterDescriptor desc; +bool Kernel::run(VirtualGPU& gpu, GpuEvent* calEvent, bool lastRun, bool lastDoppCmd, + bool pfpaDoppCmd) const { + const VirtualGPU::CalVirtualDesc* dispatch = gpu.cal(); - for (uint i = 0; i < arguments_.size(); ++i) { - const KernelArg* arg = argument(i); + auto compProg = static_cast(gpu.gslKernelDesc()->func_); + compProg->setWavesPerSH(waveLimiter_.getWavesPerSH(&gpu)); - // Initialize the arguments for the abstraction layer - if (arg->isCbNeeded()) { - desc.name_ = arg->name_.data(); - desc.type_ = arg->type(); - desc.size_ = arg->size(false); - desc.addressQualifier_ = arg->addressQualifier(); - desc.accessQualifier_ = arg->accessQualifier(); - desc.typeName_ = arg->typeName(); - desc.typeQualifier_ = arg->typeQualifier(); + gpu.eventBegin(MainEngine); + gpu.rs()->Dispatch(gpu.cs(), &dispatch->gridBlock, &dispatch->partialGridBlock, + &dispatch->gridSize, dispatch->localSize, gpu.vmMems(), dispatch->memCount_, + lastDoppCmd, pfpaDoppCmd); + gpu.eventEnd(MainEngine, *calEvent); - // Make offset alignment to match CPU metadata, since - // in multidevice config abstraction layer has a single signature - // and CPU sends the paramaters as they are allocated in memory - size_t size = desc.size_; - if (size == 0) 
{ - // Local memory for CPU - size = sizeof(cl_mem); - } - offset = amd::alignUp(offset, std::min(size, size_t(16))); - desc.offset_ = offset; - offset += amd::alignUp(size, sizeof(uint32_t)); - params.push_back(desc); - } - } + // Unbind all resources + unbindResources(gpu, *calEvent, lastRun); - // Report the allocated local memory size (emulated and hw) - if (hwLocalSize_ != 0) { - CondLog((dev().info().localMemSize_ < hwLocalSize_), - "Requested local size is bigger than reported"); - workGroupInfo_.localMemSize_ = hwLocalSize_; - } - - if (!createSignature(params)) { - return false; - } - - return true; -} - -bool -Kernel::bindGlobalHwCb( - VirtualGPU& gpu, - VirtualGPU::GslKernelDesc* desc) const -{ - bool result = true; - - // Bind HW constant buffers used for the global data store - const Program::HwConstBuffers& gds = prog().glbHwCb(); - for (Program::HwConstBuffers::const_iterator it = gds.begin(); - (it != gds.end() && result); ++it) { - uint idx = it->first; - result = bindResource(gpu, *(it->second), idx, ConstantBuffer, idx); - } - - return result; -} - -bool -Kernel::bindConstantBuffers(VirtualGPU& gpu) const -{ - bool result = true; - - assert((numCb_ <= MaxConstBuffersArguments) && - "Runtime doesn't support more CBs for arguments!"); - - // Upload the parameters to HW and bind all constant buffers - for (uint i = 0; i < numCb_; i++) { - ConstBuffer* cb = gpu.constBufs_[i]; - result &= cb->uploadDataToHw(cbSizes_[i]) && - bindResource(gpu, *cb, i, ConstantBuffer, i, cb->wrtOffset()); - } - - return result; -} - -void -Kernel::processMemObjects( - VirtualGPU& gpu, - const amd::Kernel& kernel, - const_address params, - bool nativeMem) const -{ - VirtualGPU::MemoryDependency& dependecy = gpu.memoryDependency(); - - // Mark the tracker with a new kernel, - // so we can avoid checks of the aliased objects - gpu.memoryDependency().newKernel(); - - // Check all parameters for the current kernel - const amd::KernelSignature& signature = kernel.signature(); 
- for (size_t i = 0; i < signature.numParameters(); ++i) { - const amd::KernelParameterDescriptor& desc = signature.at(i); - const KernelArg* arg = argument(i); - Memory* memory = NULL; - - // Find if current argument is a buffer - if ((desc.type_ == T_POINTER) && - (arg->type_ != KernelArg::PointerLocal) && - (arg->type_ != KernelArg::PointerHwLocal)) { - if (nativeMem) { - memory = *reinterpret_cast(params + desc.offset_); - } - else if (*reinterpret_cast - (params + desc.offset_) != NULL) { - memory = dev().getGpuMemory(*reinterpret_cast - (params + desc.offset_)); - // Synchronize data with other memory instances if necessary - memory->syncCacheFromHost(gpu); - } - - if (memory != NULL) { - // Validate memory for a dependency in the queue - gpu.memoryDependency().validate(gpu, memory, arg->memory_.readOnly_); - } - } - } -} - -bool -Kernel::loadParameters( - VirtualGPU& gpu, - const amd::Kernel& kernel, - const_address params, - bool nativeMem) const -{ - bool result = true; - uint i; - - // Initialize local private ranges - if (!initLocalPrivateRanges(gpu)) { - return false; - } - - if ((UavIdUndefined != uavRaw_) && - (!(flags() & PrintfOutput) || (printfId_ != UavIdUndefined))) { - Memory* gpuMemory = dev().getGpuMemory(dev().dummyPage()); - // Bind a buffer for a dummy read - result = bindResource(gpu, *gpuMemory, 0, - ArgumentUavID, uavRaw_); - } - - // Find all parameters for the current kernel - const amd::KernelSignature& signature = kernel.signature(); - for (i = 0; i != signature.numParameters(); ++i) { - const amd::KernelParameterDescriptor& desc = signature.at(i); - // Set current argument - if (!setArgument(gpu, i, params + desc.offset_, desc.size_, nativeMem)) { - result = false; - break; - } - } - - if (result) { - // Update the ring ranges and math constant - setLocalPrivateRanges(gpu); - - result = bindConstantBuffers(gpu); - - if (flags() & PrivateFixed) { - result &= bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_); - } - - // 
Setup debug output buffer (if printf is active) - if (flags() & PrintfOutput) { - gpu.addVmMemory(gpu.printfDbg().dbgBuffer()); - } - } - - return result; -} - -bool -Kernel::run(VirtualGPU& gpu, GpuEvent* calEvent, bool lastRun, bool lastDoppCmd, bool pfpaDoppCmd) const -{ - const VirtualGPU::CalVirtualDesc* dispatch = gpu.cal(); - - auto compProg = static_cast(gpu.gslKernelDesc()->func_); - compProg->setWavesPerSH(waveLimiter_.getWavesPerSH(&gpu)); - - gpu.eventBegin(MainEngine); - gpu.rs()->Dispatch(gpu.cs(), &dispatch->gridBlock, &dispatch->partialGridBlock, - &dispatch->gridSize, dispatch->localSize, gpu.vmMems(), dispatch->memCount_, lastDoppCmd, pfpaDoppCmd); - gpu.eventEnd(MainEngine, *calEvent); - - // Unbind all resources - unbindResources(gpu, *calEvent, lastRun); - - return true; + return true; } static size_t counter = 0; -void -Kernel::debug(VirtualGPU& gpu) const -{ - std::fstream stubWrite; - address src = NULL; +void Kernel::debug(VirtualGPU& gpu) const { + std::fstream stubWrite; + address src = NULL; - std::cerr << "--- " << name_ << " ---" << std::endl; - for (uint i = 0; i < arguments_.size(); ++i) { - const KernelArg* arg = argument(i); - const Memory* gpuMem = gpu.slots_[i].memory_; - std::stringstream fileName; - bool bufferObj = - ((arg->type_ == KernelArg::PointerGlobal) || - (arg->type_ == KernelArg::PointerConst) || - (arg->type_ == KernelArg::PointerHwConst)); + std::cerr << "--- " << name_ << " ---" << std::endl; + for (uint i = 0; i < arguments_.size(); ++i) { + const KernelArg* arg = argument(i); + const Memory* gpuMem = gpu.slots_[i].memory_; + std::stringstream fileName; + bool bufferObj = + ((arg->type_ == KernelArg::PointerGlobal) || (arg->type_ == KernelArg::PointerConst) || + (arg->type_ == KernelArg::PointerHwConst)); - if ((src != NULL) && arg->isCbNeeded() && bufferObj) { - address memory = gpu.cb(arg->cbIdx_)->sysMemCopy(); - std::cerr.setf(std::ios::hex); - uint* location = reinterpret_cast - (src + 
*reinterpret_cast(memory + arg->cbPos_)); - std::cerr << " > " << arg->name_ << ": 0x" << location << std::endl; + if ((src != NULL) && arg->isCbNeeded() && bufferObj) { + address memory = gpu.cb(arg->cbIdx_)->sysMemCopy(); + std::cerr.setf(std::ios::hex); + uint* location = + reinterpret_cast(src + *reinterpret_cast(memory + arg->cbPos_)); + std::cerr << " > " << arg->name_ << ": 0x" << location << std::endl; - // Dump the data - fileName << counter << "_kernel_" << name() << - "_" << arg->name_ << "_" << location << ".bin"; - stubWrite.open(fileName.str().c_str(), - (std::fstream::out | std::fstream::binary)); + // Dump the data + fileName << counter << "_kernel_" << name() << "_" << arg->name_ << "_" << location << ".bin"; + stubWrite.open(fileName.str().c_str(), (std::fstream::out | std::fstream::binary)); - // Write data to a file - if (stubWrite.is_open()) { - stubWrite.write( - reinterpret_cast(location), gpuMem->size()); - stubWrite.close(); - } - } - if (((arg->type_ >= KernelArg::Image1D) && - (arg->type_ <= KernelArg::Image3D)) || - ((src == NULL) && bufferObj)) { - //@todo Replace the current map - Memory* resource = const_cast(gpu.slots_[i].memory_); - void* memory = resource->map(&gpu); - uint* location = reinterpret_cast(memory); - std::cerr << " > " << arg->name_ << (bufferObj ? 
": buffer" : ": image") << std::endl; - // Dump the data - fileName << counter << "_kernel_" << name() << - "_" << arg->name_ << "_" << location << ".bin"; - stubWrite.open(fileName.str().c_str(), - (std::fstream::out | std::fstream::binary)); - - // Write data to a file - if (stubWrite.is_open()) { - stubWrite.write( - reinterpret_cast(location), gpuMem->size()); - stubWrite.close(); - } - resource->unmap(&gpu); - } + // Write data to a file + if (stubWrite.is_open()) { + stubWrite.write(reinterpret_cast(location), gpuMem->size()); + stubWrite.close(); + } } + if (((arg->type_ >= KernelArg::Image1D) && (arg->type_ <= KernelArg::Image3D)) || + ((src == NULL) && bufferObj)) { + //@todo Replace the current map + Memory* resource = const_cast(gpu.slots_[i].memory_); + void* memory = resource->map(&gpu); + uint* location = reinterpret_cast(memory); + std::cerr << " > " << arg->name_ << (bufferObj ? ": buffer" : ": image") << std::endl; + // Dump the data + fileName << counter << "_kernel_" << name() << "_" << arg->name_ << "_" << location << ".bin"; + stubWrite.open(fileName.str().c_str(), (std::fstream::out | std::fstream::binary)); - for (uint i = 0; i < gpu.constBufs_.size(); ++i) { - std::stringstream fileName; - fileName << counter++ << "_kernel_" << name() << "_const" << i << ".bin"; - stubWrite.open(fileName.str().c_str(), - (std::fstream::out | std::fstream::binary)); - if (stubWrite.is_open()) { - address memory = reinterpret_cast
(gpu.constBufs_[i]->map(&gpu, Resource::ReadOnly)); - // Check if we have OpenCL program - stubWrite.write(reinterpret_cast(memory+gpu.cb(i)->wrtOffset()), gpu.cb(i)->lastWrtSize()); - gpu.constBufs_[i]->unmap(&gpu); - stubWrite.close(); - } + // Write data to a file + if (stubWrite.is_open()) { + stubWrite.write(reinterpret_cast(location), gpuMem->size()); + stubWrite.close(); + } + resource->unmap(&gpu); } - const Program::HwConstBuffers& gds = prog().glbHwCb(); - for (Program::HwConstBuffers::const_iterator it = gds.begin(); it != gds.end(); ++it) { - uint idx = it->first; - std::stringstream fileName; - fileName << counter++ << "_kernel_" << name() << "_const" << idx << ".bin"; - stubWrite.open(fileName.str().c_str(), - (std::fstream::out | std::fstream::binary)); - if (stubWrite.is_open()) { - address memory = reinterpret_cast
((it->second)->map(&gpu, Resource::ReadOnly)); - // Check if we have OpenCL program - stubWrite.write(reinterpret_cast(memory), (it->second)->size()); - (it->second)->unmap(&gpu); - stubWrite.close(); - } + } + + for (uint i = 0; i < gpu.constBufs_.size(); ++i) { + std::stringstream fileName; + fileName << counter++ << "_kernel_" << name() << "_const" << i << ".bin"; + stubWrite.open(fileName.str().c_str(), (std::fstream::out | std::fstream::binary)); + if (stubWrite.is_open()) { + address memory = reinterpret_cast
(gpu.constBufs_[i]->map(&gpu, Resource::ReadOnly)); + // Check if we have OpenCL program + stubWrite.write(reinterpret_cast(memory + gpu.cb(i)->wrtOffset()), + gpu.cb(i)->lastWrtSize()); + gpu.constBufs_[i]->unmap(&gpu); + stubWrite.close(); } + } + const Program::HwConstBuffers& gds = prog().glbHwCb(); + for (Program::HwConstBuffers::const_iterator it = gds.begin(); it != gds.end(); ++it) { + uint idx = it->first; + std::stringstream fileName; + fileName << counter++ << "_kernel_" << name() << "_const" << idx << ".bin"; + stubWrite.open(fileName.str().c_str(), (std::fstream::out | std::fstream::binary)); + if (stubWrite.is_open()) { + address memory = reinterpret_cast
((it->second)->map(&gpu, Resource::ReadOnly)); + // Check if we have OpenCL program + stubWrite.write(reinterpret_cast(memory), (it->second)->size()); + (it->second)->unmap(&gpu); + stubWrite.close(); + } + } } -bool -Kernel::initConstBuffers() -{ - bool result = true; - size_t i; +bool Kernel::initConstBuffers() { + bool result = true; + size_t i; - assert((numCb_ != 0) && "We have 0 constant buffers!"); + assert((numCb_ != 0) && "We have 0 constant buffers!"); - // Allocate an array for CB sizes - cbSizes_ = new size_t[numCb_]; - if (cbSizes_ == NULL) { - return false; + // Allocate an array for CB sizes + cbSizes_ = new size_t[numCb_]; + if (cbSizes_ == NULL) { + return false; + } + memset(cbSizes_, 0, sizeof(size_t) * numCb_); + + // CB0 is reserved for ABI data + cbSizes_[0] = TotalABIVectors * ConstBuffer::VectorSize; + + // Find sizes of all constant buffers + for (i = 0; i < arguments_.size(); ++i) { + const KernelArg* arg = argument(i); + size_t size = arg->cbPos_ + arg->size(true); + size_t specVec = arg->specialVector(); + if (specVec != 0) { + size = arg->cbPos_ + (arg->size_ / KernelArg::VectorSizeLimit) * ConstBuffer::VectorSize; } - memset(cbSizes_, 0, sizeof(size_t) * numCb_); - - // CB0 is reserved for ABI data - cbSizes_[0] = TotalABIVectors * ConstBuffer::VectorSize; - - // Find sizes of all constant buffers - for (i = 0; i < arguments_.size(); ++i) { - const KernelArg* arg = argument(i); - size_t size = arg->cbPos_ + arg->size(true); - size_t specVec = arg->specialVector(); - if (specVec != 0) { - size = arg->cbPos_ + (arg->size_ / KernelArg::VectorSizeLimit) * - ConstBuffer::VectorSize; - } - // Do we need a CB? - if (arg->isCbNeeded() && (cbSizes_[arg->cbIdx_] < size)) { - cbSizes_[arg->cbIdx_] = size; - } + // Do we need a CB? 
+ if (arg->isCbNeeded() && (cbSizes_[arg->cbIdx_] < size)) { + cbSizes_[arg->cbIdx_] = size; } + } - return result; + return result; } -bool -Kernel::setInternalSamplers(VirtualGPU& gpu) const -{ - for (uint i = 0; i < samplerSize(); ++i) { - const KernelArg* arg = sampler(i); - uint state = arg->cbPos_; - uint idx = arg->index_; +bool Kernel::setInternalSamplers(VirtualGPU& gpu) const { + for (uint i = 0; i < samplerSize(); ++i) { + const KernelArg* arg = sampler(i); + uint state = arg->cbPos_; + uint idx = arg->index_; - if (gpu.cal()->samplersState_[idx] != state) { - setSampler(gpu, state, idx); - gpu.cal_.samplersState_[idx] = state; - } + if (gpu.cal()->samplersState_[idx] != state) { + setSampler(gpu, state, idx); + gpu.cal_.samplersState_[idx] = state; } + } - return true; + return true; } -bool -Kernel::setArgument( - VirtualGPU& gpu, - uint idx, - const void* param, - size_t size, - bool nativeMem) const -{ - bool result = true; - const KernelArg* arg; - address memory; - size_t argSize; - static const bool waitOnBusyEngine = true; +bool Kernel::setArgument(VirtualGPU& gpu, uint idx, const void* param, size_t size, + bool nativeMem) const { + bool result = true; + const KernelArg* arg; + address memory; + size_t argSize; + static const bool waitOnBusyEngine = true; - assert((idx < arguments_.size()) && "Param index is out of range!"); + assert((idx < arguments_.size()) && "Param index is out of range!"); - arg = argument(idx); - assert((arg->cbIdx_ == 1) && "Runtime supports CB1 only for the arguments buffer!"); - memory = gpu.cb(1)->sysMemCopy(); - argSize = arg->size(true); + arg = argument(idx); + assert((arg->cbIdx_ == 1) && "Runtime supports CB1 only for the arguments buffer!"); + memory = gpu.cb(1)->sysMemCopy(); + argSize = arg->size(true); - // Bind the global heap for emulation mode - switch (arg->type_) { + // Bind the global heap for emulation mode + switch (arg->type_) { case KernelArg::PointerLocal: case KernelArg::PointerPrivate: - if 
(!bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_)) { - return false; - } - // Fall through ... + if (!bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_)) { + return false; + } + // Fall through ... default: - break; - } + break; + } - switch (arg->type_) { + switch (arg->type_) { case KernelArg::PointerConst: case KernelArg::PointerHwConst: - case KernelArg::PointerGlobal: - { - gpu::Memory* gpuMem = NULL; - if (nativeMem) { - gpuMem = *reinterpret_cast(param); - } - else if (*reinterpret_cast(param) != NULL) { - gpuMem = dev().getGpuMemory(*reinterpret_cast(param)); - } - bool forceZeroOffset = false; + case KernelArg::PointerGlobal: { + gpu::Memory* gpuMem = NULL; + if (nativeMem) { + gpuMem = *reinterpret_cast(param); + } else if (*reinterpret_cast(param) != NULL) { + gpuMem = dev().getGpuMemory(*reinterpret_cast(param)); + } + bool forceZeroOffset = false; - if (gpuMem == NULL) { - forceZeroOffset = true; - gpuMem = dev().getGpuMemory(dev().dummyPage()); - } - uint64_t offset = gpuMem->pinOffset(); + if (gpuMem == NULL) { + forceZeroOffset = true; + gpuMem = dev().getGpuMemory(dev().dummyPage()); + } + uint64_t offset = gpuMem->pinOffset(); - // Make sure the passed argument is a buffer object - if (!gpuMem->cal()->buffer_) { - LogError("The kernel buffer argument isn't a buffer object!"); - return false; - } + // Make sure the passed argument is a buffer object + if (!gpuMem->cal()->buffer_) { + LogError("The kernel buffer argument isn't a buffer object!"); + return false; + } - if (arg->type_ == KernelArg::PointerHwConst) { - // Bind current memory object with the kernel - if (!bindResource(gpu, *gpuMem, idx, - ArgumentConstBuffer, arg->index_)) { - return false; - } - assert((offset == 0) && "No offset for HW CB"); - // Add a fake offset to make sure (ptr != NULL) is TRUE - offset = 1; - } - else { - ResourceType type = ArgumentHeapBuffer; - - // Check if kernel expects UAV binding - if (arg->memory_.uavBuf_) { - type = 
ArgumentBuffer; - } - else { - // Bind global buffer to UAV this buffer is bound to - if (!bindResource(gpu, dev().globalMem(), 0, - GlobalBuffer, uavRaw_)) { - return false; - } - } - - // Bind current memory object with the kernel - // Note: it's a fake binding, if the buffer is part of - // the global heap - if (!bindResource(gpu, *gpuMem, idx, type, arg->index_)) { - return false; - } - - // Update offset only if we bind HeapBuffer or - // it's global address space in UAV setup on SI+ - offset += gpuMem->hbOffset(); - if (!forceZeroOffset) { - assert((offset != 0) && "Offset 0 with a real allocation!"); - } - gpu.addVmMemory(gpuMem); - } - - // Wait for resource if it was used on an inactive engine - //! \note syncCache may call DRM transfer - gpuMem->wait(gpu, waitOnBusyEngine); - - if (forceZeroOffset) { - offset = 0; - } - - // Copy memory offset into the constant buffer - if (abi64Bit()) { - *(reinterpret_cast(memory + arg->cbPos_)) = offset; - } - else { - *(reinterpret_cast(memory + arg->cbPos_)) = - static_cast(offset); - } + if (arg->type_ == KernelArg::PointerHwConst) { + // Bind current memory object with the kernel + if (!bindResource(gpu, *gpuMem, idx, ArgumentConstBuffer, arg->index_)) { + return false; } - break; + assert((offset == 0) && "No offset for HW CB"); + // Add a fake offset to make sure (ptr != NULL) is TRUE + offset = 1; + } else { + ResourceType type = ArgumentHeapBuffer; + + // Check if kernel expects UAV binding + if (arg->memory_.uavBuf_) { + type = ArgumentBuffer; + } else { + // Bind global buffer to UAV this buffer is bound to + if (!bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_)) { + return false; + } + } + + // Bind current memory object with the kernel + // Note: it's a fake binding, if the buffer is part of + // the global heap + if (!bindResource(gpu, *gpuMem, idx, type, arg->index_)) { + return false; + } + + // Update offset only if we bind HeapBuffer or + // it's global address space in UAV setup on SI+ + 
offset += gpuMem->hbOffset(); + if (!forceZeroOffset) { + assert((offset != 0) && "Offset 0 with a real allocation!"); + } + gpu.addVmMemory(gpuMem); + } + + // Wait for resource if it was used on an inactive engine + //! \note syncCache may call DRM transfer + gpuMem->wait(gpu, waitOnBusyEngine); + + if (forceZeroOffset) { + offset = 0; + } + + // Copy memory offset into the constant buffer + if (abi64Bit()) { + *(reinterpret_cast(memory + arg->cbPos_)) = offset; + } else { + *(reinterpret_cast(memory + arg->cbPos_)) = static_cast(offset); + } + } break; case KernelArg::Image1D: case KernelArg::Image2D: case KernelArg::Image3D: case KernelArg::Image1DB: case KernelArg::Image1DA: - case KernelArg::Image2DA: - { - gpu::Memory* gpuMem = NULL; - if (nativeMem) { - gpuMem = *reinterpret_cast(param); - } - else if (*reinterpret_cast(param) != NULL) { - gpuMem = dev().getGpuMemory(*reinterpret_cast(param)); - } + case KernelArg::Image2DA: { + gpu::Memory* gpuMem = NULL; + if (nativeMem) { + gpuMem = *reinterpret_cast(param); + } else if (*reinterpret_cast(param) != NULL) { + gpuMem = dev().getGpuMemory(*reinterpret_cast(param)); + } - if (gpuMem == NULL) { - return false; - } - // Make sure the passed argument is an image object - if (gpuMem->cal()->buffer_) { - LogError("The kernel image argument isn't an image object!"); - return false; - } + if (gpuMem == NULL) { + return false; + } + // Make sure the passed argument is an image object + if (gpuMem->cal()->buffer_) { + LogError("The kernel image argument isn't an image object!"); + return false; + } - ResourceType resType = arg->memory_.readOnly_ ? - ArgumentImageRead : ArgumentImageWrite; + ResourceType resType = arg->memory_.readOnly_ ? ArgumentImageRead : ArgumentImageWrite; - // Bind current memory object with the shader. - if (!bindResource(gpu, *gpuMem, idx, - resType, arg->index_)) { - return false; - } + // Bind current memory object with the shader. 
+ if (!bindResource(gpu, *gpuMem, idx, resType, arg->index_)) { + return false; + } - // Wait for resource if it was used on an inactive engine - //! \note syncCache may call DRM transfer - gpuMem->wait(gpu, waitOnBusyEngine); + // Wait for resource if it was used on an inactive engine + //! \note syncCache may call DRM transfer + gpuMem->wait(gpu, waitOnBusyEngine); - // Copy image constants into the constant buffer - if (gpuMem->owner() != NULL) { - copyImageConstants(gpuMem->owner()->asImage(), - reinterpret_cast(memory + arg->cbPos_)); - } + // Copy image constants into the constant buffer + if (gpuMem->owner() != NULL) { + copyImageConstants(gpuMem->owner()->asImage(), + reinterpret_cast(memory + arg->cbPos_)); + } - // Handle DOPP texture resource - gslMemObject gslMem = gpuMem->gslResource(); - if (gslMem->getAttribs().isDOPPDesktopTexture) { - gpu.addVmMemory(gpuMem); - } - } - break; - case KernelArg::Sampler: - { - amd::Sampler* amdSampler = - *reinterpret_cast(param); - uint idx = arg->index_; - uint32_t state = amdSampler->state(); + // Handle DOPP texture resource + gslMemObject gslMem = gpuMem->gslResource(); + if (gslMem->getAttribs().isDOPPDesktopTexture) { + gpu.addVmMemory(gpuMem); + } + } break; + case KernelArg::Sampler: { + amd::Sampler* amdSampler = *reinterpret_cast(param); + uint idx = arg->index_; + uint32_t state = amdSampler->state(); - if (state != gpu.cal()->samplersState_[idx]) { - setSampler(gpu, state, idx); - gpu.cal_.samplersState_[idx] = state; - } + if (state != gpu.cal()->samplersState_[idx]) { + setSampler(gpu, state, idx); + gpu.cal_.samplersState_[idx] = state; + } - // Copy sampler state into the constant buffer - *(reinterpret_cast(memory + arg->cbPos_)) = state; - } - break; - case KernelArg::Counter: - { - gpu::Memory* gpuMem = NULL; - if (nativeMem) { - gpuMem = *reinterpret_cast(param); - } - else if (*reinterpret_cast(param) != NULL) { - gpuMem = dev().getGpuMemory(*reinterpret_cast(param)); - } + // Copy sampler state 
into the constant buffer + *(reinterpret_cast(memory + arg->cbPos_)) = state; + } break; + case KernelArg::Counter: { + gpu::Memory* gpuMem = NULL; + if (nativeMem) { + gpuMem = *reinterpret_cast(param); + } else if (*reinterpret_cast(param) != NULL) { + gpuMem = dev().getGpuMemory(*reinterpret_cast(param)); + } - // Wait for resource if it was used on an inactive engine - //! \note syncCache may call DRM transfer - gpuMem->wait(gpu, waitOnBusyEngine); + // Wait for resource if it was used on an inactive engine + //! \note syncCache may call DRM transfer + gpuMem->wait(gpu, waitOnBusyEngine); - // Bind current memory object with the shader. - if (!bindResource(gpu, *gpuMem, idx, - ArgumentCounter, idx)) { - return false; - } - } - break; - case KernelArg::PointerHwLocal: - { - // Calculate current offset in the local ring - uint offset = gpu.cal_.localSize; - uint extra = amd::alignUp(offset, arg->alignment_) - offset; + // Bind current memory object with the shader. + if (!bindResource(gpu, *gpuMem, idx, ArgumentCounter, idx)) { + return false; + } + } break; + case KernelArg::PointerHwLocal: { + // Calculate current offset in the local ring + uint offset = gpu.cal_.localSize; + uint extra = amd::alignUp(offset, arg->alignment_) - offset; - offset = amd::alignUp(offset, arg->alignment_); - size_t memSize = *static_cast(param); + offset = amd::alignUp(offset, arg->alignment_); + size_t memSize = *static_cast(param); - // Allocate new memory from the local ring - gpu.cal_.localSize += static_cast(memSize) + extra; - // Copy current local argument's offset into the CB - *(reinterpret_cast(memory + arg->cbPos_)) = offset; + // Allocate new memory from the local ring + gpu.cal_.localSize += static_cast(memSize) + extra; + // Copy current local argument's offset into the CB + *(reinterpret_cast(memory + arg->cbPos_)) = offset; - CondLog((gpu.cal_.localSize > dev().info().localMemSize_), - "Requested local size is bigger than reported!"); - } - break; + 
CondLog((gpu.cal_.localSize > dev().info().localMemSize_), + "Requested local size is bigger than reported!"); + } break; case KernelArg::Float: case KernelArg::Double: case KernelArg::Char: @@ -1962,416 +1805,365 @@ Kernel::setArgument( case KernelArg::UInt: case KernelArg::Long: case KernelArg::ULong: - if (size != argSize) { - LogWarning("Parameter's sizes are unmatched!"); - } - // Fall through ... + if (size != argSize) { + LogWarning("Parameter's sizes are unmatched!"); + } + // Fall through ... case KernelArg::Struct: case KernelArg::Union: { - size_t specVec = arg->specialVector(); - if (specVec != 0) { - uint iter = (arg->size_ / KernelArg::VectorSizeLimit); - for (uint i = 0; i < iter; ++i) { - amd::Os::fastMemcpy((memory + arg->cbPos_ + - i * ConstBuffer::VectorSize), - reinterpret_cast(param) + - i * KernelArg::VectorSizeLimit * specVec, - specVec * KernelArg::VectorSizeLimit); - } + size_t specVec = arg->specialVector(); + if (specVec != 0) { + uint iter = (arg->size_ / KernelArg::VectorSizeLimit); + for (uint i = 0; i < iter; ++i) { + amd::Os::fastMemcpy( + (memory + arg->cbPos_ + i * ConstBuffer::VectorSize), + reinterpret_cast(param) + i * KernelArg::VectorSizeLimit * specVec, + specVec * KernelArg::VectorSizeLimit); } - else { - // Copy data into the CB - amd::Os::fastMemcpy((memory + arg->cbPos_), param, size); - } - } - break; + } else { + // Copy data into the CB + amd::Os::fastMemcpy((memory + arg->cbPos_), param, size); + } + } break; default: - LogError("Unhandled argument's type!"); - break; - } + LogError("Unhandled argument's type!"); + break; + } - return result; + return result; } -bool -Kernel::initLocalPrivateRanges(VirtualGPU& gpu) const -{ - // Initialize HW local - gpu.cal_.localSize = hwLocalSize_; +bool Kernel::initLocalPrivateRanges(VirtualGPU& gpu) const { + // Initialize HW local + gpu.cal_.localSize = hwLocalSize_; - // Bind the global buffer if emulated local or private memory - // was allocated by the kernel - if ((flags() & 
PrintfOutput && (printfId_ == UavIdUndefined)) && - (uavRaw_ != UavIdUndefined)) { - if (!bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_)) { - return false; - } + // Bind the global buffer if emulated local or private memory + // was allocated by the kernel + if ((flags() & PrintfOutput && (printfId_ == UavIdUndefined)) && (uavRaw_ != UavIdUndefined)) { + if (!bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_)) { + return false; } + } - // Bind the global buffer if emulated constant buffers are enabled - if (cbId_ != UavIdUndefined) { - if (!bindResource(gpu, dev().globalMem(), 0, ArgumentCbID, cbId_)) { - return false; - } + // Bind the global buffer if emulated constant buffers are enabled + if (cbId_ != UavIdUndefined) { + if (!bindResource(gpu, dev().globalMem(), 0, ArgumentCbID, cbId_)) { + return false; } + } - // Bind the printf buffer - if (printfId_ != UavIdUndefined) { - if (!bindResource(gpu, dev().globalMem(), 0, ArgumentPrintfID, printfId_)) { - return false; - } + // Bind the printf buffer + if (printfId_ != UavIdUndefined) { + if (!bindResource(gpu, dev().globalMem(), 0, ArgumentPrintfID, printfId_)) { + return false; } - // Initialize the iterations count - gpu.cal_.iterations_ = 1; + } + // Initialize the iterations count + gpu.cal_.iterations_ = 1; - return true; + return true; } -void -Kernel::setLocalPrivateRanges(VirtualGPU& gpu) const -{ - address cbBuf = gpu.cb(0)->sysMemCopy(); - uint* data; - uint gridSize = - gpu.cal()->gridSize.width * - gpu.cal()->gridSize.height * - gpu.cal()->gridSize.depth; - uint blockSize = - gpu.cal()->gridBlock.width * - gpu.cal()->gridBlock.height * - gpu.cal()->gridBlock.depth; +void Kernel::setLocalPrivateRanges(VirtualGPU& gpu) const { + address cbBuf = gpu.cb(0)->sysMemCopy(); + uint* data; + uint gridSize = + gpu.cal()->gridSize.width * gpu.cal()->gridSize.height * gpu.cal()->gridSize.depth; + uint blockSize = + gpu.cal()->gridBlock.width * gpu.cal()->gridBlock.height * 
gpu.cal()->gridBlock.depth; - //! \todo validate if the compiler still generates PrivateFixed - if (flags() & PrivateFixed) { - // Update private ring - data = reinterpret_cast - (cbBuf + PrivateRingOffset * ConstBuffer::VectorSize); - Memory* gpuMemory = dev().getGpuMemory(dev().dummyPage()); + //! \todo validate if the compiler still generates PrivateFixed + if (flags() & PrivateFixed) { + // Update private ring + data = reinterpret_cast(cbBuf + PrivateRingOffset * ConstBuffer::VectorSize); + Memory* gpuMemory = dev().getGpuMemory(dev().dummyPage()); - if (abi64Bit()) { - reinterpret_cast(data)[0] = gpuMemory->hbOffset(); - data[2] = 0; - data[3] = 0; - } - else { - data[0] = static_cast(gpuMemory->hbOffset()); - data[1] = 0; - data[2] = data[3] = 0; - } - gpu.addVmMemory(gpuMemory); + if (abi64Bit()) { + reinterpret_cast(data)[0] = gpuMemory->hbOffset(); + data[2] = 0; + data[3] = 0; + } else { + data[0] = static_cast(gpuMemory->hbOffset()); + data[1] = 0; + data[2] = data[3] = 0; } + gpu.addVmMemory(gpuMemory); + } - // Copy the math lib constants - amd::Os::fastMemcpy( - (cbBuf + MathLibOffset * ConstBuffer::VectorSize), - MathLibConst, sizeof(MathLibConst)); + // Copy the math lib constants + amd::Os::fastMemcpy((cbBuf + MathLibOffset * ConstBuffer::VectorSize), MathLibConst, + sizeof(MathLibConst)); - // Update the offset to the global data - if (prog().glbData() != NULL) { - gpu.addVmMemory(prog().glbData()); - uint64_t glbDataOffset = prog().glbData()->hbOffset(); - if (abi64Bit()) { - *reinterpret_cast(cbBuf + GlobalDataStoreOffset * - ConstBuffer::VectorSize) = glbDataOffset; - } - else { - *reinterpret_cast(cbBuf + GlobalDataStoreOffset * - ConstBuffer::VectorSize) = static_cast(glbDataOffset); - } + // Update the offset to the global data + if (prog().glbData() != NULL) { + gpu.addVmMemory(prog().glbData()); + uint64_t glbDataOffset = prog().glbData()->hbOffset(); + if (abi64Bit()) { + *reinterpret_cast(cbBuf + GlobalDataStoreOffset * 
ConstBuffer::VectorSize) = + glbDataOffset; + } else { + *reinterpret_cast(cbBuf + GlobalDataStoreOffset * ConstBuffer::VectorSize) = + static_cast(glbDataOffset); } + } - // Split workload if it was requested - if ((gpu.cal_.iterations_ < 2) && - gpu.dmaFlushMgmt().dispatchSplitSize() != 0) { - uint totalSize = gridSize * blockSize; - if (totalSize > gpu.dmaFlushMgmt().dispatchSplitSize()) { - gpu.cal_.iterations_ = std::max(gpu.cal_.iterations_, - (totalSize / gpu.dmaFlushMgmt().dispatchSplitSize())); - } + // Split workload if it was requested + if ((gpu.cal_.iterations_ < 2) && gpu.dmaFlushMgmt().dispatchSplitSize() != 0) { + uint totalSize = gridSize * blockSize; + if (totalSize > gpu.dmaFlushMgmt().dispatchSplitSize()) { + gpu.cal_.iterations_ = + std::max(gpu.cal_.iterations_, (totalSize / gpu.dmaFlushMgmt().dispatchSplitSize())); } + } - // Initialize the number of iterations to the grid size - if (flags() & PrintfOutput) { - gpu.cal_.iterations_ = gridSize; - } + // Initialize the number of iterations to the grid size + if (flags() & PrintfOutput) { + gpu.cal_.iterations_ = gridSize; + } } -void -Kernel::setSampler( - VirtualGPU& gpu, - uint32_t state, - uint physUnit -) const -{ - // All CAL sampler's parameters are in floats - float gslAddress = GSL_CLAMP_TO_BORDER; - float gslMinFilter = GSL_MIN_NEAREST; - float gslMagFilter = GSL_MAG_NEAREST; +void Kernel::setSampler(VirtualGPU& gpu, uint32_t state, uint physUnit) const { + // All CAL sampler's parameters are in floats + float gslAddress = GSL_CLAMP_TO_BORDER; + float gslMinFilter = GSL_MIN_NEAREST; + float gslMagFilter = GSL_MAG_NEAREST; - state &= ~amd::Sampler::StateNormalizedCoordsMask; + state &= ~amd::Sampler::StateNormalizedCoordsMask; - // Program the sampler address mode - switch (state & amd::Sampler::StateAddressMask) { - case amd::Sampler::StateAddressRepeat: - gslAddress = GSL_REPEAT; - break; - case amd::Sampler::StateAddressClampToEdge: - gslAddress = GSL_CLAMP_TO_EDGE; - break; - case 
amd::Sampler::StateAddressMirroredRepeat: - gslAddress = GSL_MIRRORED_REPEAT; - break; - case amd::Sampler::StateAddressClamp: - case amd::Sampler::StateAddressNone: - default: - break; - } - state &= ~amd::Sampler::StateAddressMask; + // Program the sampler address mode + switch (state & amd::Sampler::StateAddressMask) { + case amd::Sampler::StateAddressRepeat: + gslAddress = GSL_REPEAT; + break; + case amd::Sampler::StateAddressClampToEdge: + gslAddress = GSL_CLAMP_TO_EDGE; + break; + case amd::Sampler::StateAddressMirroredRepeat: + gslAddress = GSL_MIRRORED_REPEAT; + break; + case amd::Sampler::StateAddressClamp: + case amd::Sampler::StateAddressNone: + default: + break; + } + state &= ~amd::Sampler::StateAddressMask; - gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_S, &gslAddress); - gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_T, &gslAddress); - gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_R, &gslAddress); + gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_S, &gslAddress); + gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_T, &gslAddress); + gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_R, &gslAddress); - // Program texture filter mode - if (state == amd::Sampler::StateFilterLinear) { - gslMinFilter = GSL_MIN_LINEAR; - gslMagFilter = GSL_MAG_LINEAR; - } + // Program texture filter mode + if (state == amd::Sampler::StateFilterLinear) { + gslMinFilter = GSL_MIN_LINEAR; + gslMagFilter = GSL_MAG_LINEAR; + } - gpu.setSamplerParameter(physUnit, GSL_TEXTURE_MIN_FILTER, &gslMinFilter); - gpu.setSamplerParameter(physUnit, GSL_TEXTURE_MAG_FILTER, &gslMagFilter); + gpu.setSamplerParameter(physUnit, GSL_TEXTURE_MIN_FILTER, &gslMinFilter); + gpu.setSamplerParameter(physUnit, GSL_TEXTURE_MAG_FILTER, &gslMagFilter); } -bool -Kernel::bindResource( - VirtualGPU& gpu, - const Memory& memory, - uint paramIdx, - ResourceType type, - uint physUnit, - size_t offset) const -{ - gslUAVType uavType = GSL_UAV_TYPE_UNKNOWN; +bool Kernel::bindResource(VirtualGPU& 
gpu, const Memory& memory, uint paramIdx, ResourceType type, + uint physUnit, size_t offset) const { + gslUAVType uavType = GSL_UAV_TYPE_UNKNOWN; - // Find the original resource name from the IL program - switch (type) { + // Find the original resource name from the IL program + switch (type) { case GlobalBuffer: - if (gpu.state_.boundGlobal_) { - return true; - } - gpu.state_.boundGlobal_ = true; - physUnit = uavRaw_; - uavType = GSL_UAV_TYPE_TYPELESS; - break; + if (gpu.state_.boundGlobal_) { + return true; + } + gpu.state_.boundGlobal_ = true; + physUnit = uavRaw_; + uavType = GSL_UAV_TYPE_TYPELESS; + break; case ArgumentCbID: - if (gpu.state_.boundCb_) { - return true; - } - gpu.state_.boundCb_ = true; - physUnit = cbId_; - uavType = GSL_UAV_TYPE_TYPELESS; - break; + if (gpu.state_.boundCb_) { + return true; + } + gpu.state_.boundCb_ = true; + physUnit = cbId_; + uavType = GSL_UAV_TYPE_TYPELESS; + break; case ArgumentPrintfID: - if (gpu.state_.boundPrintf_) { - return true; - } - gpu.state_.boundPrintf_ = true; - physUnit = printfId_; - uavType = GSL_UAV_TYPE_TYPELESS; - break; + if (gpu.state_.boundPrintf_) { + return true; + } + gpu.state_.boundPrintf_ = true; + physUnit = printfId_; + uavType = GSL_UAV_TYPE_TYPELESS; + break; case ArgumentHeapBuffer: case ArgumentBuffer: case ArgumentImageRead: case ArgumentImageWrite: case ArgumentConstBuffer: case ArgumentCounter: - // Early exit if resource is bound already - if (gpu.slots_[paramIdx].state_.bound_) { - return true; - } + // Early exit if resource is bound already + if (gpu.slots_[paramIdx].state_.bound_) { + return true; + } - // Associate resource with the slot - gpu.slots_[paramIdx].memory_ = &memory; + // Associate resource with the slot + gpu.slots_[paramIdx].memory_ = &memory; - // Mark resource as bound - gpu.slots_[paramIdx].state_.bound_ = true; + // Mark resource as bound + gpu.slots_[paramIdx].state_.bound_ = true; - if (type == ArgumentCounter) { - GpuEvent calEvent; + if (type == 
ArgumentCounter) { + GpuEvent calEvent; - // Bind memory with atomic counter - gpu.cs()->bindAtomicCounter(argument(paramIdx)->index_, - memory.gslResource()); + // Bind memory with atomic counter + gpu.cs()->bindAtomicCounter(argument(paramIdx)->index_, memory.gslResource()); - // Copy the counter value into GDS - gpu.eventBegin(MainEngine); - gpu.cs()->syncAtomicCounter(argument(paramIdx)->index_, false); - gpu.eventEnd(MainEngine, calEvent); + // Copy the counter value into GDS + gpu.eventBegin(MainEngine); + gpu.cs()->syncAtomicCounter(argument(paramIdx)->index_, false); + gpu.eventEnd(MainEngine, calEvent); - // Mark resource as busy - memory.setBusy(gpu, calEvent); - return true; - } - else if (type == ArgumentHeapBuffer) { - // We return here, since we just have to bind the global heap - return true; - } - else if (type == ArgumentConstBuffer) { - gpu.slots_[paramIdx].state_.constant_ = true; - } - break; + // Mark resource as busy + memory.setBusy(gpu, calEvent); + return true; + } else if (type == ArgumentHeapBuffer) { + // We return here, since we just have to bind the global heap + return true; + } else if (type == ArgumentConstBuffer) { + gpu.slots_[paramIdx].state_.constant_ = true; + } + break; case ArgumentUavID: case ConstantBuffer: - break; + break; default: - LogPrintfError("Unspecified argument type ()!", type); - return false; - } + LogPrintfError("Unspecified argument type ()!", type); + return false; + } - gslMemObject gslMem = NULL; - // Use global address space on SI+ for UAV setup - if ((type == ArgumentBuffer) || (type == ArgumentCbID) || - (type == ArgumentUavID) || (type == ArgumentPrintfID)) { - gslMem = dev().heap().resource().gslResource(); - } - else { - gslMem = memory.gslResource(); - } + gslMemObject gslMem = NULL; + // Use global address space on SI+ for UAV setup + if ((type == ArgumentBuffer) || (type == ArgumentCbID) || (type == ArgumentUavID) || + (type == ArgumentPrintfID)) { + gslMem = dev().heap().resource().gslResource(); 
+ } else { + gslMem = memory.gslResource(); + } - // Associate memory with the physical unit, the actual binding - bool result = true; - switch (type) { + // Associate memory with the physical unit, the actual binding + bool result = true; + switch (type) { case GlobalBuffer: case ArgumentBuffer: case ArgumentImageWrite: case ArgumentUavID: case ArgumentCbID: case ArgumentPrintfID: - if (type == ArgumentImageWrite) { - uavType = GSL_UAV_TYPE_TYPED; - } - else if ((type == ArgumentBuffer) || (type == ArgumentUavID)) { - uavType = GSL_UAV_TYPE_TYPELESS; - } - if (gpu.cal_.uavs_[physUnit] != gslMem) { - result = gpu.setUAVBuffer(physUnit, gslMem, uavType); - gpu.setUAVChannelOrder(physUnit, gslMem); - gpu.cal_.uavs_[physUnit] = gslMem; - } - break; + if (type == ArgumentImageWrite) { + uavType = GSL_UAV_TYPE_TYPED; + } else if ((type == ArgumentBuffer) || (type == ArgumentUavID)) { + uavType = GSL_UAV_TYPE_TYPELESS; + } + if (gpu.cal_.uavs_[physUnit] != gslMem) { + result = gpu.setUAVBuffer(physUnit, gslMem, uavType); + gpu.setUAVChannelOrder(physUnit, gslMem); + gpu.cal_.uavs_[physUnit] = gslMem; + } + break; case ConstantBuffer: case ArgumentConstBuffer: - if ((gpu.cal_.constBuffers_[physUnit] != gslMem) || (offset != 0)) { - result = gpu.setConstantBuffer(physUnit, - gslMem, offset, memory.hbSize()); - gpu.cal_.constBuffers_[physUnit] = gslMem; - } - break; + if ((gpu.cal_.constBuffers_[physUnit] != gslMem) || (offset != 0)) { + result = gpu.setConstantBuffer(physUnit, gslMem, offset, memory.hbSize()); + gpu.cal_.constBuffers_[physUnit] = gslMem; + } + break; case ArgumentImageRead: - if (gpu.cal_.readImages_[physUnit] != gslMem) { - result = gpu.setInput(physUnit, gslMem); - gpu.cal_.readImages_[physUnit] = gslMem; - } - break; + if (gpu.cal_.readImages_[physUnit] != gslMem) { + result = gpu.setInput(physUnit, gslMem); + gpu.cal_.readImages_[physUnit] = gslMem; + } + break; default: - result = false; - assert(false); - break; - } - if (!result) { - 
LogPrintfError("setMem failed unit:%d mem:0x%08x!", physUnit, gslMem); - return false; - } + result = false; + assert(false); + break; + } + if (!result) { + LogPrintfError("setMem failed unit:%d mem:0x%08x!", physUnit, gslMem); + return false; + } - return true; + return true; } -void -Kernel::unbindResources( - VirtualGPU& gpu, - GpuEvent calEvent, - bool lastRun) const -{ - // Make sure unbind occurs on the last run, in case the execution had a split - if (lastRun) { - for (uint i = 0; i < arguments_.size(); ++i) { - if (gpu.slots_[i].state_.bound_) { - GpuEvent calEventTmp = calEvent; +void Kernel::unbindResources(VirtualGPU& gpu, GpuEvent calEvent, bool lastRun) const { + // Make sure unbind occurs on the last run, in case the execution had a split + if (lastRun) { + for (uint i = 0; i < arguments_.size(); ++i) { + if (gpu.slots_[i].state_.bound_) { + GpuEvent calEventTmp = calEvent; - if (KernelArg::Counter == argument(i)->type_) { - // Copy the counter value from GDS - gpu.eventBegin(MainEngine); - gpu.cs()->syncAtomicCounter(argument(i)->index_, true); - gpu.eventEnd(MainEngine, calEventTmp); - } - else if (!(gpu.slots_[i].state_.constant_ || - argument(i)->memory_.readOnly_)) { - // Signal the abstraction layer that GPU memory is dirty - if (gpu.slots_[i].memory_->owner() != NULL) { - gpu.slots_[i].memory_->owner()->signalWrite(&gpu.dev()); - } - } - // Mark resource as busy - gpu.slots_[i].memory_->setBusy(gpu, calEventTmp); - - gpu.slots_[i].state_.value_ = 0; - } + if (KernelArg::Counter == argument(i)->type_) { + // Copy the counter value from GDS + gpu.eventBegin(MainEngine); + gpu.cs()->syncAtomicCounter(argument(i)->index_, true); + gpu.eventEnd(MainEngine, calEventTmp); + } else if (!(gpu.slots_[i].state_.constant_ || argument(i)->memory_.readOnly_)) { + // Signal the abstraction layer that GPU memory is dirty + if (gpu.slots_[i].memory_->owner() != NULL) { + gpu.slots_[i].memory_->owner()->signalWrite(&gpu.dev()); + } } + // Mark resource as busy 
+ gpu.slots_[i].memory_->setBusy(gpu, calEventTmp); - // Unbind the global buffer - gpu.state_.boundGlobal_ = false; - - // Unbind the constant buffer - gpu.state_.boundCb_ = false; - - // Unbind the pritnf buffer - gpu.state_.boundPrintf_ = false; + gpu.slots_[i].state_.value_ = 0; + } } - // Mark CB busy - for (uint i = 0; i < numCb_; ++i) { - gpu.constBufs_[i]->setBusy(gpu, calEvent); - } + // Unbind the global buffer + gpu.state_.boundGlobal_ = false; - // Set the event object for the scratch buffer - if (workGroupInfo()->scratchRegs_ > 0) { - dev().scratch(gpu.hwRing())->memObj_->setBusy(gpu, calEvent); - } + // Unbind the constant buffer + gpu.state_.boundCb_ = false; + + // Unbind the pritnf buffer + gpu.state_.boundPrintf_ = false; + } + + // Mark CB busy + for (uint i = 0; i < numCb_; ++i) { + gpu.constBufs_[i]->setBusy(gpu, calEvent); + } + + // Set the event object for the scratch buffer + if (workGroupInfo()->scratchRegs_ > 0) { + dev().scratch(gpu.hwRing())->memObj_->setBusy(gpu, calEvent); + } } -void -Kernel::copyImageConstants( - const amd::Image* amdImage, - ImageConstants* imageData - ) const -{ - imageData->width_ = static_cast(amdImage->getWidth()); - imageData->height_ = static_cast(amdImage->getHeight()); - imageData->depth_ = static_cast(amdImage->getDepth()); - imageData->dataType_ = - static_cast(amdImage->getImageFormat().image_channel_data_type); +void Kernel::copyImageConstants(const amd::Image* amdImage, ImageConstants* imageData) const { + imageData->width_ = static_cast(amdImage->getWidth()); + imageData->height_ = static_cast(amdImage->getHeight()); + imageData->depth_ = static_cast(amdImage->getDepth()); + imageData->dataType_ = static_cast(amdImage->getImageFormat().image_channel_data_type); - imageData->widthFloat_ = 1.f / static_cast(amdImage->getWidth()); - imageData->heightFloat_ = 1.f / static_cast(amdImage->getHeight()); - imageData->depthFloat_ = 1.f / static_cast(amdImage->getDepth()); - imageData->channelOrder_ = - 
static_cast(amdImage->getImageFormat().image_channel_order); + imageData->widthFloat_ = 1.f / static_cast(amdImage->getWidth()); + imageData->heightFloat_ = 1.f / static_cast(amdImage->getHeight()); + imageData->depthFloat_ = 1.f / static_cast(amdImage->getDepth()); + imageData->channelOrder_ = static_cast(amdImage->getImageFormat().image_channel_order); } union MetadataVersion { - struct { - uint64_t revision_: 16; //!< LLVM metadata revision - uint64_t minorVersion_: 16; //!< LLVM metadata minor verison - uint64_t majorVersion_: 16; //!< LLVM metadata major version - }; - uint64_t value_; - MetadataVersion(uint mj, uint mi, uint rev): value_(0) - { - revision_ = rev; - minorVersion_ = mi; - majorVersion_ = mj; - } - MetadataVersion(): value_(0) {} + struct { + uint64_t revision_ : 16; //!< LLVM metadata revision + uint64_t minorVersion_ : 16; //!< LLVM metadata minor verison + uint64_t majorVersion_ : 16; //!< LLVM metadata major version + }; + uint64_t value_; + MetadataVersion(uint mj, uint mi, uint rev) : value_(0) { + revision_ = rev; + minorVersion_ = mi; + majorVersion_ = mj; + } + MetadataVersion() : value_(0) {} }; //! Version of metadata with buffer attributes @@ -2380,1641 +2172,1533 @@ const MetadataVersion MetadataBufferAttributes = MetadataVersion(2, 0, 88); //! 
Version of metadata with type qualifiers const MetadataVersion MetadataTypeQualifiers = MetadataVersion(3, 1, 103); -bool -NullKernel::parseArguments(const std::string& metaData, uint* uavRefCount) -{ - // Initialize workgroup info - workGroupInfo_.size_ = nullDev().info().maxWorkGroupSize_; - MetadataVersion mdVersion; +bool NullKernel::parseArguments(const std::string& metaData, uint* uavRefCount) { + // Initialize workgroup info + workGroupInfo_.size_ = nullDev().info().maxWorkGroupSize_; + MetadataVersion mdVersion; - // Find first tag - size_t pos = metaData.find(";"); + // Find first tag + size_t pos = metaData.find(";"); - // Loop through all provided program arguments - while (pos != std::string::npos) { - KernelArg arg; + // Loop through all provided program arguments + while (pos != std::string::npos) { + KernelArg arg; - if (!expect(metaData, &pos, ";")) { + if (!expect(metaData, &pos, ";")) { + break; + } + + arg.type_ = KernelArg::None; + + // Loop through all available metadata types + for (uint i = 0; i < ArgStateTotal; ++i) { + uint tmpValue; + // Find the name tag + if (expect(metaData, &pos, ArgState[i].typeName_)) { + switch (ArgState[i].type_) { + case KernelArg::None: + // Process next ... 
+ continue; + case KernelArg::Reflection: { + uint argIdx; + // Read the argument's index + if (!getuint(metaData, &pos, &argIdx)) { + LogWarning("Couldn't get the argument index!"); + return false; + } + KernelArg* tmpArg = arguments_[argIdx]; + if (!getstring(metaData, &pos, &tmpArg->typeName_)) { + LogWarning("Couldn't get the argument type!"); + return false; + } + } + continue; + case KernelArg::ConstArg: { + uint argIdx; + // Read the argument's index + if (!getuint(metaData, &pos, &argIdx)) { + LogWarning("Couldn't get the argument index!"); + return false; + } + KernelArg* tmpArg = arguments_[argIdx]; + tmpArg->typeQualifier_ |= CL_KERNEL_ARG_TYPE_CONST; + } + continue; + case KernelArg::Grouping: + for (uint j = 0; j < 3; ++j) { + uint temp; + // Read the compile workgroup size + if (!getuint(metaData, &pos, &temp)) { + LogWarning("Couldn't get the compile workgroup size!"); + return false; + } + workGroupInfo_.compileSize_[j] = temp; + } + // Process next ... + continue; + case KernelArg::WrkgrpSize: { + uint temp; + // Read the workgroup size + if (!getuint(metaData, &pos, &temp)) { + LogWarning("Couldn't get the workgroup size!"); + return false; + } + workGroupInfo_.size_ = temp; + } + // Process next ... + continue; + case KernelArg::Wavefront: + // Process next ... 
+ continue; + case KernelArg::UavId: + // Read index + if (!getuint(metaData, &pos, &arg.index_)) { + return false; + } + break; + case KernelArg::ConstBufId: + // Read index + if (!getuint(metaData, &pos, &cbId_)) { + return false; + } + continue; + case KernelArg::PrintfBufId: + // Read index + if (!getuint(metaData, &pos, &printfId_)) { + return false; + } + continue; + case KernelArg::MetadataVersion: + // Read metadata version + if (!getuint(metaData, &pos, &tmpValue)) { + return false; + } + mdVersion.majorVersion_ = tmpValue; + if (!getuint(metaData, &pos, &tmpValue)) { + return false; + } + mdVersion.minorVersion_ = tmpValue; + if (!getuint(metaData, &pos, &tmpValue)) { + return false; + } + mdVersion.revision_ = tmpValue; + // Process next ... + continue; + case KernelArg::GroupingHint: + for (uint j = 0; j < 3; ++j) { + uint temp; + // Read the compile workgroup size hint + if (!getuint(metaData, &pos, &temp)) { + LogWarning("Couldn't get the compile workgroup size hint!"); + return false; + } + workGroupInfo_.compileSizeHint_[j] = temp; + } + // Process next ... + continue; + case KernelArg::VecTypeHint: { + std::string temp; + // Read the compile vector type hint + if (!getstring(metaData, &pos, &temp)) { + LogWarning("Couldn't get the compile vector type hint!"); + return false; + } + workGroupInfo_.compileVecTypeHint_ = temp; + } + // Process next ... + continue; + case KernelArg::WavesPerSimdHint: { + uint tmp; + if (!getuint(metaData, &pos, &tmp)) { + return false; + } + workGroupInfo_.wavesPerSimdHint_ = tmp; + } + continue; + default: break; } - arg.type_ = KernelArg::None; + std::string argName; + // Save the argument type + arg.type_ = ArgState[i].type_; - // Loop through all available metadata types - for (uint i = 0; i < ArgStateTotal; ++i) { - uint tmpValue; - // Find the name tag - if (expect(metaData, &pos, ArgState[i].typeName_)) { - switch (ArgState[i].type_) { - case KernelArg::None: - // Process next ... 
- continue; - case KernelArg::Reflection: { - uint argIdx; - // Read the argument's index - if (!getuint(metaData, &pos, &argIdx)) { - LogWarning("Couldn't get the argument index!"); - return false; - } - KernelArg* tmpArg = arguments_[argIdx]; - if (!getstring(metaData, &pos, &tmpArg->typeName_)) { - LogWarning("Couldn't get the argument type!"); - return false; - } - } - continue; - case KernelArg::ConstArg: { - uint argIdx; - // Read the argument's index - if (!getuint(metaData, &pos, &argIdx)) { - LogWarning("Couldn't get the argument index!"); - return false; - } - KernelArg* tmpArg = arguments_[argIdx]; - tmpArg->typeQualifier_ |= CL_KERNEL_ARG_TYPE_CONST; - } - continue; - case KernelArg::Grouping: - for (uint j = 0; j < 3; ++j) { - uint temp; - // Read the compile workgroup size - if (!getuint(metaData, &pos, &temp)) { - LogWarning("Couldn't get the compile workgroup size!"); - return false; - } - workGroupInfo_.compileSize_[j] = temp; - } - // Process next ... - continue; - case KernelArg::WrkgrpSize: { - uint temp; - // Read the workgroup size - if (!getuint(metaData, &pos, &temp)) { - LogWarning("Couldn't get the workgroup size!"); - return false; - } - workGroupInfo_.size_ = temp; - } - // Process next ... - continue; - case KernelArg::Wavefront: - // Process next ... 
- continue; - case KernelArg::UavId: - // Read index - if (!getuint(metaData, &pos, &arg.index_)) { - return false; - } - break; - case KernelArg::ConstBufId: - // Read index - if (!getuint(metaData, &pos, &cbId_)) { - return false; - } - continue; - case KernelArg::PrintfBufId: - // Read index - if (!getuint(metaData, &pos, &printfId_)) { - return false; - } - continue; - case KernelArg::MetadataVersion: - // Read metadata version - if (!getuint(metaData, &pos, &tmpValue)) { - return false; - } - mdVersion.majorVersion_ = tmpValue; - if (!getuint(metaData, &pos, &tmpValue)) { - return false; - } - mdVersion.minorVersion_ = tmpValue; - if (!getuint(metaData, &pos, &tmpValue)) { - return false; - } - mdVersion.revision_ = tmpValue; - // Process next ... - continue; - case KernelArg::GroupingHint: - for (uint j = 0; j < 3; ++j) { - uint temp; - // Read the compile workgroup size hint - if (!getuint(metaData, &pos, &temp)) { - LogWarning("Couldn't get the compile workgroup size hint!"); - return false; - } - workGroupInfo_.compileSizeHint_[j] = temp; - } - // Process next ... - continue; - case KernelArg::VecTypeHint: - { - std::string temp; - // Read the compile vector type hint - if (!getstring(metaData, &pos, &temp)) { - LogWarning("Couldn't get the compile vector type hint!"); - return false; - } - workGroupInfo_.compileVecTypeHint_ = temp; - } - // Process next ... 
- continue; - case KernelArg::WavesPerSimdHint: - { - uint tmp; - if (!getuint(metaData, &pos, &tmp)) { - return false; - } - workGroupInfo_.wavesPerSimdHint_ = tmp; - } - continue; - default: - break; - } - - std::string argName; - // Save the argument type - arg.type_ = ArgState[i].type_; - - // Check if we should expect the name - if (ArgState[i].name_) { - // Read the parameter's name - if (!getword(metaData, &pos, argName)) { - LogWarning("Couldn't get a kernel argument!"); - return false; - } - arg.name_ = argName; - } - - if (arg.type_ == KernelArg::Sampler) { - if (!getuint(metaData, &pos, &arg.index_)) { - LogWarning("Couldn't get a kernel argument!"); - return false; - } - if (!getuint(metaData, &pos, &arg.location_)) { - LogWarning("Couldn't get a kernel argument!"); - return false; - } - if (!getuint(metaData, &pos, &arg.cbPos_)) { - LogWarning("Couldn't get a kernel argument!"); - return false; - } - } - - // Check if we should expect the resource data type - if (ArgState[i].resType_) { - uint k; - // Search for the data type - for (k = 0; k < DataTypeTotal; k++) { - if (expect(metaData, &pos, DataType[k].tagName_)) { - arg.dataType_ = DataType[k].type_; - if (arg.type_ == KernelArg::Image) { - flags_ |= ImageEnable; - if (expect(metaData, &pos, "RO:")) { - arg.memory_.readOnly_ = 1; - } - else if (expect(metaData, &pos, "RW:")) { - arg.memory_.readWrite_ = 1; - flags_ |= ImageWrite; - } - else if (expect(metaData, &pos, "WO:")) { - arg.memory_.writeOnly_ = 1; - flags_ |= ImageWrite; - } - } - else if (arg.type_ == KernelArg::Value) { - arg.type_ = DataType[k].type_; - } - break; - } - } - if (k == DataTypeTotal) { - LogWarning("We couldn't find the argument's type."); - if ((arg.type_ == KernelArg::Value) || - !getword(metaData, &pos, argName)) { - LogWarning("Couldn't get a kernel argument!"); - return false; - } - } - //! 
@todo temporary condition - if ((arg.type_ == KernelArg::Opaque) || - (arg.type_ == KernelArg::Sampler)) { - assert(false); - continue; - } - } - - // Check if we should expect the data size - if (ArgState[i].size_) { - uint tmpData; - // Read the data size - if (!getuint(metaData, &pos, &tmpData)) { - LogWarning("Couldn't get a kernel argument!"); - return false; - } - if (arg.type_ == KernelArg::Image) { - arg.type_ = arg.dataType_; - arg.index_ = tmpData; - } - else { - arg.size_ = tmpData; - } - } - - if (arg.type_ == KernelArg::Counter) { - // Read a counter index - if (!getuint(metaData, &pos, &arg.index_)) { - LogWarning("Couldn't get a counter index!"); - return false; - } - } - - // Check if we should expect a resource index - if (ArgState[i].cbIdx_) { - // Read resource index - if (!getuint(metaData, &pos, &arg.cbIdx_)) { - LogWarning("Couldn't get a kernel argument!"); - return false; - } - - if (arg.isCbNeeded() && (numCb_ < arg.cbIdx_)) { - numCb_ = arg.cbIdx_; - } - } - // Check if we should expect the CB offset - if (ArgState[i].cbPos_) { - // Read position in the constant buffer - if (!getuint(metaData, &pos, &arg.cbPos_)) { - LogWarning("Couldn't get a kernel argument!"); - return false; - } - } - // Check if we should expect the buffer type - if (ArgState[i].buf_) { - // Read the buffer type - if (!getword(metaData, &pos, argName)) { - LogWarning("Couldn't get a kernel argument!"); - return false; - } - arg.buf_ = argName; - - for (uint k = 0; k < BufTypeTotal; ++k) { - if (0 == arg.buf_.compare(BufType[k].tagName_)) { - // Update the parameter type - arg.type_ = BufType[k].type_; - // Check if we should expect a buffer index - if (BufType[k].number_) { - // Read a buffer index - if (!getuint(metaData, &pos, &arg.index_)) { - LogWarning("Couldn't get a kernel argument!"); - return false; - } - } - // Check for the required alignment - if (BufType[k].alignment_) { - // Read data alignment - if (!getuint(metaData, &pos, &arg.alignment_)) { - 
LogWarning("Couldn't get a kernel argument!"); - return false; - } - } - // Check for the buffer's attribute - if ((mdVersion.value_ >= MetadataBufferAttributes.value_) && - BufType[k].attribute_) { - if (expect(metaData, &pos, "RO")) { - arg.memory_.readOnly_ = 1; - } - else if (expect(metaData, &pos, "RW")) { - arg.memory_.readWrite_ = 1; - } - else if (expect(metaData, &pos, "WO")) { - arg.memory_.writeOnly_ = 1; - } - } - // Check for the type qualifier - if ((mdVersion.value_ >= MetadataTypeQualifiers.value_) && - BufType[k].attribute_) { - uint tmp; - pos += 1; - if (!getuint(metaData, &pos, &tmp)) { - LogWarning("Couldn't get volatile type!"); - return false; - } - if (tmp == 1) { - arg.typeQualifier_ |= CL_KERNEL_ARG_TYPE_VOLATILE; - } - if (!getuint(metaData, &pos, &tmp)) { - LogWarning("Couldn't get restrict type!"); - return false; - } - if (tmp == 1) { - arg.typeQualifier_ |= CL_KERNEL_ARG_TYPE_RESTRICT; - } - } - } - } - } - // Find multiple UAV references - switch (arg.type_) { - case KernelArg::PointerGlobal: - case KernelArg::PointerConst: - case KernelArg::PointerLocal: - case KernelArg::PointerPrivate: - case KernelArg::UavId: - uavRefCount[arg.index_]++; - break; - default: - break; - } - // Check if this argument will be passed in constant buffer - if (arg.isCbNeeded() || (arg.type_ == KernelArg::UavId)) { - if (arg.type_ == KernelArg::Sampler) { - // Serach for the passed by value sampler - for (uint i = 0; i < argSize(); ++i) { - KernelArg* value = arguments_[i]; - if (0 == value->name_.compare(arg.name_)) { - value->type_ = arg.type_; - value->index_ = arg.index_; - value->location_ = 0; - break; - } - } - } - else { - KernelArg* argument = new KernelArg(arg); - if (argument != NULL) { - addArgument(argument); - } - else { - LogError("Couldn't allocate memory!"); - return false; - } - } - } - // Check if we have a pre-defined sampler - else if (arg.type_ == KernelArg::Sampler) { - KernelArg* sampler = new KernelArg(arg); - if (sampler != 
NULL) { - addSampler(sampler); - } - else { - LogError("Couldn't allocate memory!"); - return false; - } - } - break; - } + // Check if we should expect the name + if (ArgState[i].name_) { + // Read the parameter's name + if (!getword(metaData, &pos, argName)) { + LogWarning("Couldn't get a kernel argument!"); + return false; + } + arg.name_ = argName; } - // Next argument - pos = metaData.find(";", pos); + if (arg.type_ == KernelArg::Sampler) { + if (!getuint(metaData, &pos, &arg.index_)) { + LogWarning("Couldn't get a kernel argument!"); + return false; + } + if (!getuint(metaData, &pos, &arg.location_)) { + LogWarning("Couldn't get a kernel argument!"); + return false; + } + if (!getuint(metaData, &pos, &arg.cbPos_)) { + LogWarning("Couldn't get a kernel argument!"); + return false; + } + } + + // Check if we should expect the resource data type + if (ArgState[i].resType_) { + uint k; + // Search for the data type + for (k = 0; k < DataTypeTotal; k++) { + if (expect(metaData, &pos, DataType[k].tagName_)) { + arg.dataType_ = DataType[k].type_; + if (arg.type_ == KernelArg::Image) { + flags_ |= ImageEnable; + if (expect(metaData, &pos, "RO:")) { + arg.memory_.readOnly_ = 1; + } else if (expect(metaData, &pos, "RW:")) { + arg.memory_.readWrite_ = 1; + flags_ |= ImageWrite; + } else if (expect(metaData, &pos, "WO:")) { + arg.memory_.writeOnly_ = 1; + flags_ |= ImageWrite; + } + } else if (arg.type_ == KernelArg::Value) { + arg.type_ = DataType[k].type_; + } + break; + } + } + if (k == DataTypeTotal) { + LogWarning("We couldn't find the argument's type."); + if ((arg.type_ == KernelArg::Value) || !getword(metaData, &pos, argName)) { + LogWarning("Couldn't get a kernel argument!"); + return false; + } + } + //! 
@todo temporary condition + if ((arg.type_ == KernelArg::Opaque) || (arg.type_ == KernelArg::Sampler)) { + assert(false); + continue; + } + } + + // Check if we should expect the data size + if (ArgState[i].size_) { + uint tmpData; + // Read the data size + if (!getuint(metaData, &pos, &tmpData)) { + LogWarning("Couldn't get a kernel argument!"); + return false; + } + if (arg.type_ == KernelArg::Image) { + arg.type_ = arg.dataType_; + arg.index_ = tmpData; + } else { + arg.size_ = tmpData; + } + } + + if (arg.type_ == KernelArg::Counter) { + // Read a counter index + if (!getuint(metaData, &pos, &arg.index_)) { + LogWarning("Couldn't get a counter index!"); + return false; + } + } + + // Check if we should expect a resource index + if (ArgState[i].cbIdx_) { + // Read resource index + if (!getuint(metaData, &pos, &arg.cbIdx_)) { + LogWarning("Couldn't get a kernel argument!"); + return false; + } + + if (arg.isCbNeeded() && (numCb_ < arg.cbIdx_)) { + numCb_ = arg.cbIdx_; + } + } + // Check if we should expect the CB offset + if (ArgState[i].cbPos_) { + // Read position in the constant buffer + if (!getuint(metaData, &pos, &arg.cbPos_)) { + LogWarning("Couldn't get a kernel argument!"); + return false; + } + } + // Check if we should expect the buffer type + if (ArgState[i].buf_) { + // Read the buffer type + if (!getword(metaData, &pos, argName)) { + LogWarning("Couldn't get a kernel argument!"); + return false; + } + arg.buf_ = argName; + + for (uint k = 0; k < BufTypeTotal; ++k) { + if (0 == arg.buf_.compare(BufType[k].tagName_)) { + // Update the parameter type + arg.type_ = BufType[k].type_; + // Check if we should expect a buffer index + if (BufType[k].number_) { + // Read a buffer index + if (!getuint(metaData, &pos, &arg.index_)) { + LogWarning("Couldn't get a kernel argument!"); + return false; + } + } + // Check for the required alignment + if (BufType[k].alignment_) { + // Read data alignment + if (!getuint(metaData, &pos, &arg.alignment_)) { + 
LogWarning("Couldn't get a kernel argument!"); + return false; + } + } + // Check for the buffer's attribute + if ((mdVersion.value_ >= MetadataBufferAttributes.value_) && BufType[k].attribute_) { + if (expect(metaData, &pos, "RO")) { + arg.memory_.readOnly_ = 1; + } else if (expect(metaData, &pos, "RW")) { + arg.memory_.readWrite_ = 1; + } else if (expect(metaData, &pos, "WO")) { + arg.memory_.writeOnly_ = 1; + } + } + // Check for the type qualifier + if ((mdVersion.value_ >= MetadataTypeQualifiers.value_) && BufType[k].attribute_) { + uint tmp; + pos += 1; + if (!getuint(metaData, &pos, &tmp)) { + LogWarning("Couldn't get volatile type!"); + return false; + } + if (tmp == 1) { + arg.typeQualifier_ |= CL_KERNEL_ARG_TYPE_VOLATILE; + } + if (!getuint(metaData, &pos, &tmp)) { + LogWarning("Couldn't get restrict type!"); + return false; + } + if (tmp == 1) { + arg.typeQualifier_ |= CL_KERNEL_ARG_TYPE_RESTRICT; + } + } + } + } + } + // Find multiple UAV references + switch (arg.type_) { + case KernelArg::PointerGlobal: + case KernelArg::PointerConst: + case KernelArg::PointerLocal: + case KernelArg::PointerPrivate: + case KernelArg::UavId: + uavRefCount[arg.index_]++; + break; + default: + break; + } + // Check if this argument will be passed in constant buffer + if (arg.isCbNeeded() || (arg.type_ == KernelArg::UavId)) { + if (arg.type_ == KernelArg::Sampler) { + // Serach for the passed by value sampler + for (uint i = 0; i < argSize(); ++i) { + KernelArg* value = arguments_[i]; + if (0 == value->name_.compare(arg.name_)) { + value->type_ = arg.type_; + value->index_ = arg.index_; + value->location_ = 0; + break; + } + } + } else { + KernelArg* argument = new KernelArg(arg); + if (argument != NULL) { + addArgument(argument); + } else { + LogError("Couldn't allocate memory!"); + return false; + } + } + } + // Check if we have a pre-defined sampler + else if (arg.type_ == KernelArg::Sampler) { + KernelArg* sampler = new KernelArg(arg); + if (sampler != NULL) { + 
addSampler(sampler); + } else { + LogError("Couldn't allocate memory!"); + return false; + } + } + break; + } } - // Find arguments that will require a reallocation + // Next argument + pos = metaData.find(";", pos); + } + + // Find arguments that will require a reallocation + for (uint i = 0; i < arguments_.size(); ++i) { + KernelArg* arg = arguments_[i]; + switch (arg->type_) { + case KernelArg::PointerGlobal: + case KernelArg::PointerConst: + case KernelArg::PointerLocal: + case KernelArg::PointerPrivate: + // Check if can't use a dedicated UAV, + // so realloc memory in the heap + arg->memory_.realloc_ = false; + arg->memory_.uavBuf_ = true; + break; + case KernelArg::PointerHwConst: + arg->memory_.realloc_ = true; + break; + case KernelArg::UavId: + uavRaw_ = arg->index_; + break; + default: + break; + } + // If argument marked with the const qualifier, then overwrite + // Read-Write attributes, since compiler doesn't mark it properly + if (arg->typeQualifier() & CL_KERNEL_ARG_TYPE_CONST) { + arg->memory_.readOnly_ = 1; + arg->memory_.readWrite_ = 0; + arg->memory_.writeOnly_ = 0; + } + } + + if ((uavRaw_ != UavIdUndefined) && !(flags() & PrintfOutput)) { + // Find if default UAV is already assigned to an argument for (uint i = 0; i < arguments_.size(); ++i) { - KernelArg* arg = arguments_[i]; - switch (arg->type_) { + KernelArg* arg = arguments_[i]; + switch (arg->type_) { case KernelArg::PointerGlobal: case KernelArg::PointerConst: case KernelArg::PointerLocal: case KernelArg::PointerPrivate: - // Check if can't use a dedicated UAV, - // so realloc memory in the heap - arg->memory_.realloc_ = false; - arg->memory_.uavBuf_ = true; - break; - case KernelArg::PointerHwConst: - arg->memory_.realloc_ = true; - break; - case KernelArg::UavId: - uavRaw_ = arg->index_; - break; + if (uavRaw_ == arg->index_) { + uavRaw_ = UavIdUndefined; + } + break; default: - break; - } - // If argument marked with the const qualifier, then overwrite - // Read-Write attributes, 
since compiler doesn't mark it properly - if (arg->typeQualifier() & CL_KERNEL_ARG_TYPE_CONST) { - arg->memory_.readOnly_ = 1; - arg->memory_.readWrite_ = 0; - arg->memory_.writeOnly_ = 0; - } + break; + } } + } - if ((uavRaw_ != UavIdUndefined) && - !(flags() & PrintfOutput)) { - // Find if default UAV is already assigned to an argument - for (uint i = 0; i < arguments_.size(); ++i) { - KernelArg* arg = arguments_[i]; - switch (arg->type_) { - case KernelArg::PointerGlobal: - case KernelArg::PointerConst: - case KernelArg::PointerLocal: - case KernelArg::PointerPrivate: - if (uavRaw_ == arg->index_) { - uavRaw_ = UavIdUndefined; - } - break; - default: - break; + // There is always 1 constant buffer, associated with the kernel + numCb_++; + assert((numCb_ <= MaxConstBuffersArguments) && + "Runtime doesn't support more than max CBs for arguments!"); - } - } + // Limit workgroup size if requested + if ((flags() & LimitWorkgroup) && (GPU_MAX_WORKGROUP_SIZE == 0)) { + size_t temp = 1; + workGroupInfo_.size_ = workGroupInfo()->wavefrontSize_; + for (uint j = 0; j < 3; ++j) { + if (workGroupInfo()->compileSize_[j] != 0) { + temp *= workGroupInfo_.compileSize_[j]; + } } - - // There is always 1 constant buffer, associated with the kernel - numCb_++; - assert((numCb_ <= MaxConstBuffersArguments) && - "Runtime doesn't support more than max CBs for arguments!"); - - // Limit workgroup size if requested - if ((flags() & LimitWorkgroup) && (GPU_MAX_WORKGROUP_SIZE == 0)) { - size_t temp = 1; - workGroupInfo_.size_ = workGroupInfo()->wavefrontSize_; - for (uint j = 0; j < 3; ++j) { - if (workGroupInfo()->compileSize_[j] != 0) { - temp *= workGroupInfo_.compileSize_[j]; - } - } - // Report a compilation error if requested compile size doesn't - // match the required workgroup size - if (workGroupInfo()->size_ < temp) { - char str[8]; - intToStr(workGroupInfo_.size_, str, 8); - buildError_ = CL_OUT_OF_RESOURCES; - buildLog_ += - "Error: Requested compile size is bigger than the 
required workgroup size of "; - buildLog_ += str; - buildLog_ += " elements\n"; - LogError(buildLog().c_str()); - return false; - } + // Report a compilation error if requested compile size doesn't + // match the required workgroup size + if (workGroupInfo()->size_ < temp) { + char str[8]; + intToStr(workGroupInfo_.size_, str, 8); + buildError_ = CL_OUT_OF_RESOURCES; + buildLog_ += "Error: Requested compile size is bigger than the required workgroup size of "; + buildLog_ += str; + buildLog_ += " elements\n"; + LogError(buildLog().c_str()); + return false; } + } - // Read/Write attributes are provided in metadata - if (mdVersion.value_ >= MetadataBufferAttributes.value_) { - rwAttributes_ = true; - } + // Read/Write attributes are provided in metadata + if (mdVersion.value_ >= MetadataBufferAttributes.value_) { + rwAttributes_ = true; + } - return true; + return true; } -bool -Kernel::validateMemory(uint idx, amd::Memory* amdMem) const -{ - // Check if memory doesn't require reallocation - bool noRealloc = (!argument(idx)->memory_.realloc_ || - amdMem->reallocedDeviceMemory(&dev())); +bool Kernel::validateMemory(uint idx, amd::Memory* amdMem) const { + // Check if memory doesn't require reallocation + bool noRealloc = (!argument(idx)->memory_.realloc_ || amdMem->reallocedDeviceMemory(&dev())); - return noRealloc; + return noRealloc; } -inline static HSAIL_ARG_TYPE -GetHSAILArgType(const aclArgData* argInfo) -{ - switch (argInfo->type) { - case ARG_TYPE_POINTER: - return HSAIL_ARGTYPE_POINTER; - case ARG_TYPE_QUEUE: - return HSAIL_ARGTYPE_QUEUE; - case ARG_TYPE_VALUE: - return HSAIL_ARGTYPE_VALUE; - case ARG_TYPE_IMAGE: - return HSAIL_ARGTYPE_IMAGE; - case ARG_TYPE_SAMPLER: - return HSAIL_ARGTYPE_SAMPLER; - case ARG_TYPE_ERROR: - default: - return HSAIL_ARGTYPE_ERROR; - } +inline static HSAIL_ARG_TYPE GetHSAILArgType(const aclArgData* argInfo) { + switch (argInfo->type) { + case ARG_TYPE_POINTER: + return HSAIL_ARGTYPE_POINTER; + case ARG_TYPE_QUEUE: + return 
HSAIL_ARGTYPE_QUEUE; + case ARG_TYPE_VALUE: + return HSAIL_ARGTYPE_VALUE; + case ARG_TYPE_IMAGE: + return HSAIL_ARGTYPE_IMAGE; + case ARG_TYPE_SAMPLER: + return HSAIL_ARGTYPE_SAMPLER; + case ARG_TYPE_ERROR: + default: + return HSAIL_ARGTYPE_ERROR; + } } -inline static size_t -GetHSAILArgAlignment(const aclArgData* argInfo) -{ - switch (argInfo->type) { - case ARG_TYPE_POINTER: - return argInfo->arg.pointer.align; - default: - return 1; - } +inline static size_t GetHSAILArgAlignment(const aclArgData* argInfo) { + switch (argInfo->type) { + case ARG_TYPE_POINTER: + return argInfo->arg.pointer.align; + default: + return 1; + } } -inline static HSAIL_ACCESS_TYPE -GetHSAILArgAccessType(const aclArgData* argInfo) -{ - if (argInfo->type == ARG_TYPE_POINTER) { - switch (argInfo->arg.pointer.type) { - case ACCESS_TYPE_RO: - return HSAIL_ACCESS_TYPE_RO; - case ACCESS_TYPE_WO: - return HSAIL_ACCESS_TYPE_WO; - case ACCESS_TYPE_RW: - default: - return HSAIL_ACCESS_TYPE_RW; - } +inline static HSAIL_ACCESS_TYPE GetHSAILArgAccessType(const aclArgData* argInfo) { + if (argInfo->type == ARG_TYPE_POINTER) { + switch (argInfo->arg.pointer.type) { + case ACCESS_TYPE_RO: + return HSAIL_ACCESS_TYPE_RO; + case ACCESS_TYPE_WO: + return HSAIL_ACCESS_TYPE_WO; + case ACCESS_TYPE_RW: + default: + return HSAIL_ACCESS_TYPE_RW; } - return HSAIL_ACCESS_TYPE_NONE; + } + return HSAIL_ACCESS_TYPE_NONE; } -inline static HSAIL_ADDRESS_QUALIFIER -GetHSAILAddrQual(const aclArgData* argInfo) -{ - if (argInfo->type == ARG_TYPE_POINTER) { - switch (argInfo->arg.pointer.memory) { - case PTR_MT_CONSTANT_EMU: - case PTR_MT_CONSTANT: - case PTR_MT_UAV: - case PTR_MT_GLOBAL: - return HSAIL_ADDRESS_GLOBAL; - case PTR_MT_LDS_EMU: - case PTR_MT_LDS: - return HSAIL_ADDRESS_LOCAL; - case PTR_MT_SCRATCH_EMU: - return HSAIL_ADDRESS_GLOBAL; - case PTR_MT_ERROR: - default: - LogError("Unsupported address type"); - return HSAIL_ADDRESS_ERROR; - } - } - else if ((argInfo->type == ARG_TYPE_IMAGE) || - (argInfo->type == 
ARG_TYPE_SAMPLER)) { +inline static HSAIL_ADDRESS_QUALIFIER GetHSAILAddrQual(const aclArgData* argInfo) { + if (argInfo->type == ARG_TYPE_POINTER) { + switch (argInfo->arg.pointer.memory) { + case PTR_MT_CONSTANT_EMU: + case PTR_MT_CONSTANT: + case PTR_MT_UAV: + case PTR_MT_GLOBAL: return HSAIL_ADDRESS_GLOBAL; - } - else if (argInfo->type == ARG_TYPE_QUEUE) { + case PTR_MT_LDS_EMU: + case PTR_MT_LDS: + return HSAIL_ADDRESS_LOCAL; + case PTR_MT_SCRATCH_EMU: return HSAIL_ADDRESS_GLOBAL; + case PTR_MT_ERROR: + default: + LogError("Unsupported address type"); + return HSAIL_ADDRESS_ERROR; } - return HSAIL_ADDRESS_ERROR; + } else if ((argInfo->type == ARG_TYPE_IMAGE) || (argInfo->type == ARG_TYPE_SAMPLER)) { + return HSAIL_ADDRESS_GLOBAL; + } else if (argInfo->type == ARG_TYPE_QUEUE) { + return HSAIL_ADDRESS_GLOBAL; + } + return HSAIL_ADDRESS_ERROR; } /* f16 returns f32 - workaround due to comp lib */ -inline static HSAIL_DATA_TYPE -GetHSAILDataType(const aclArgData* argInfo) -{ - aclArgDataType dataType; +inline static HSAIL_DATA_TYPE GetHSAILDataType(const aclArgData* argInfo) { + aclArgDataType dataType; - if (argInfo->type == ARG_TYPE_POINTER) { - dataType = argInfo->arg.pointer.data; - } - else if (argInfo->type == ARG_TYPE_VALUE) { - dataType = argInfo->arg.value.data; - } - else { - return HSAIL_DATATYPE_ERROR; - } - switch (dataType) { - case DATATYPE_i1: - return HSAIL_DATATYPE_B1; - case DATATYPE_i8: - return HSAIL_DATATYPE_S8; - case DATATYPE_i16: - return HSAIL_DATATYPE_S16; - case DATATYPE_i32: - return HSAIL_DATATYPE_S32; - case DATATYPE_i64: - return HSAIL_DATATYPE_S64; - case DATATYPE_u8: - return HSAIL_DATATYPE_U8; - case DATATYPE_u16: - return HSAIL_DATATYPE_U16; - case DATATYPE_u32: - return HSAIL_DATATYPE_U32; - case DATATYPE_u64: - return HSAIL_DATATYPE_U64; - case DATATYPE_f16: - return HSAIL_DATATYPE_F32; - case DATATYPE_f32: - return HSAIL_DATATYPE_F32; - case DATATYPE_f64: - return HSAIL_DATATYPE_F64; - case DATATYPE_struct: - return 
HSAIL_DATATYPE_STRUCT; - case DATATYPE_opaque: - return HSAIL_DATATYPE_OPAQUE; - case DATATYPE_ERROR: - default: - return HSAIL_DATATYPE_ERROR; - } + if (argInfo->type == ARG_TYPE_POINTER) { + dataType = argInfo->arg.pointer.data; + } else if (argInfo->type == ARG_TYPE_VALUE) { + dataType = argInfo->arg.value.data; + } else { + return HSAIL_DATATYPE_ERROR; + } + switch (dataType) { + case DATATYPE_i1: + return HSAIL_DATATYPE_B1; + case DATATYPE_i8: + return HSAIL_DATATYPE_S8; + case DATATYPE_i16: + return HSAIL_DATATYPE_S16; + case DATATYPE_i32: + return HSAIL_DATATYPE_S32; + case DATATYPE_i64: + return HSAIL_DATATYPE_S64; + case DATATYPE_u8: + return HSAIL_DATATYPE_U8; + case DATATYPE_u16: + return HSAIL_DATATYPE_U16; + case DATATYPE_u32: + return HSAIL_DATATYPE_U32; + case DATATYPE_u64: + return HSAIL_DATATYPE_U64; + case DATATYPE_f16: + return HSAIL_DATATYPE_F32; + case DATATYPE_f32: + return HSAIL_DATATYPE_F32; + case DATATYPE_f64: + return HSAIL_DATATYPE_F64; + case DATATYPE_struct: + return HSAIL_DATATYPE_STRUCT; + case DATATYPE_opaque: + return HSAIL_DATATYPE_OPAQUE; + case DATATYPE_ERROR: + default: + return HSAIL_DATATYPE_ERROR; + } } -inline static int -GetHSAILArgSize(const aclArgData *argInfo) -{ - switch (argInfo->type) { - case ARG_TYPE_VALUE: - switch (GetHSAILDataType(argInfo)) { - case HSAIL_DATATYPE_B1: - return 1; - case HSAIL_DATATYPE_B8: - case HSAIL_DATATYPE_S8: - case HSAIL_DATATYPE_U8: - return 1; - case HSAIL_DATATYPE_B16: - case HSAIL_DATATYPE_U16: - case HSAIL_DATATYPE_S16: - case HSAIL_DATATYPE_F16: - return 2; - case HSAIL_DATATYPE_B32: - case HSAIL_DATATYPE_U32: - case HSAIL_DATATYPE_S32: - case HSAIL_DATATYPE_F32: - return 4; - case HSAIL_DATATYPE_B64: - case HSAIL_DATATYPE_U64: - case HSAIL_DATATYPE_S64: - case HSAIL_DATATYPE_F64: - return 8; - case HSAIL_DATATYPE_STRUCT: - return argInfo->arg.value.numElements; - default: - return -1; - } - case ARG_TYPE_POINTER: - case ARG_TYPE_IMAGE: - case ARG_TYPE_SAMPLER: - case ARG_TYPE_QUEUE: 
- return sizeof(void*); +inline static int GetHSAILArgSize(const aclArgData* argInfo) { + switch (argInfo->type) { + case ARG_TYPE_VALUE: + switch (GetHSAILDataType(argInfo)) { + case HSAIL_DATATYPE_B1: + return 1; + case HSAIL_DATATYPE_B8: + case HSAIL_DATATYPE_S8: + case HSAIL_DATATYPE_U8: + return 1; + case HSAIL_DATATYPE_B16: + case HSAIL_DATATYPE_U16: + case HSAIL_DATATYPE_S16: + case HSAIL_DATATYPE_F16: + return 2; + case HSAIL_DATATYPE_B32: + case HSAIL_DATATYPE_U32: + case HSAIL_DATATYPE_S32: + case HSAIL_DATATYPE_F32: + return 4; + case HSAIL_DATATYPE_B64: + case HSAIL_DATATYPE_U64: + case HSAIL_DATATYPE_S64: + case HSAIL_DATATYPE_F64: + return 8; + case HSAIL_DATATYPE_STRUCT: + return argInfo->arg.value.numElements; default: - return -1; - } + return -1; + } + case ARG_TYPE_POINTER: + case ARG_TYPE_IMAGE: + case ARG_TYPE_SAMPLER: + case ARG_TYPE_QUEUE: + return sizeof(void*); + default: + return -1; + } } -inline static clk_value_type_t -GetOclType(const aclArgData* argInfo) -{ - static const clk_value_type_t ClkValueMapType[6][6] = { - { T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16 }, - { T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16 }, - { T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16 }, - { T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16 }, - { T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16 }, - { T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16 }, - }; +inline static clk_value_type_t GetOclType(const aclArgData* argInfo) { + static const clk_value_type_t ClkValueMapType[6][6] = { + {T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16}, + {T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16}, + {T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16}, + {T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16}, + {T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16}, + {T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16}, + }; - uint sizeType; - if (argInfo->type == 
ARG_TYPE_QUEUE) { - return T_QUEUE; - } - if ((argInfo->type == ARG_TYPE_POINTER) || (argInfo->type == ARG_TYPE_IMAGE)) { - return T_POINTER; - } - else if (argInfo->type == ARG_TYPE_VALUE) { - switch (argInfo->arg.value.data) { - case DATATYPE_i8: - case DATATYPE_u8: - sizeType = 0; - break; - case DATATYPE_i16: - case DATATYPE_u16: - sizeType = 1; - break; - case DATATYPE_i32: - case DATATYPE_u32: - sizeType = 2; - break; - case DATATYPE_i64: - case DATATYPE_u64: - sizeType = 3; - break; - case DATATYPE_f16: - case DATATYPE_f32: - sizeType = 4; - break; - case DATATYPE_f64: - sizeType = 5; - break; - default: - return T_VOID; - } - switch (argInfo->arg.value.numElements) { - case 1: return ClkValueMapType[sizeType][0]; - case 2: return ClkValueMapType[sizeType][1]; - case 3: return ClkValueMapType[sizeType][2]; - case 4: return ClkValueMapType[sizeType][3]; - case 8: return ClkValueMapType[sizeType][4]; - case 16: return ClkValueMapType[sizeType][5]; - default: return T_VOID; - } - } - else if (argInfo->type == ARG_TYPE_SAMPLER) { - return T_SAMPLER; - } - else { + uint sizeType; + if (argInfo->type == ARG_TYPE_QUEUE) { + return T_QUEUE; + } + if ((argInfo->type == ARG_TYPE_POINTER) || (argInfo->type == ARG_TYPE_IMAGE)) { + return T_POINTER; + } else if (argInfo->type == ARG_TYPE_VALUE) { + switch (argInfo->arg.value.data) { + case DATATYPE_i8: + case DATATYPE_u8: + sizeType = 0; + break; + case DATATYPE_i16: + case DATATYPE_u16: + sizeType = 1; + break; + case DATATYPE_i32: + case DATATYPE_u32: + sizeType = 2; + break; + case DATATYPE_i64: + case DATATYPE_u64: + sizeType = 3; + break; + case DATATYPE_f16: + case DATATYPE_f32: + sizeType = 4; + break; + case DATATYPE_f64: + sizeType = 5; + break; + default: return T_VOID; } + switch (argInfo->arg.value.numElements) { + case 1: + return ClkValueMapType[sizeType][0]; + case 2: + return ClkValueMapType[sizeType][1]; + case 3: + return ClkValueMapType[sizeType][2]; + case 4: + return ClkValueMapType[sizeType][3]; + 
case 8: + return ClkValueMapType[sizeType][4]; + case 16: + return ClkValueMapType[sizeType][5]; + default: + return T_VOID; + } + } else if (argInfo->type == ARG_TYPE_SAMPLER) { + return T_SAMPLER; + } else { + return T_VOID; + } } -inline static cl_kernel_arg_address_qualifier -GetOclAddrQual(const aclArgData* argInfo) -{ - if (argInfo->type == ARG_TYPE_POINTER) { - switch (argInfo->arg.pointer.memory) { - case PTR_MT_UAV: - case PTR_MT_GLOBAL: - return CL_KERNEL_ARG_ADDRESS_GLOBAL; - case PTR_MT_CONSTANT: - case PTR_MT_UAV_CONSTANT: - case PTR_MT_CONSTANT_EMU: - return CL_KERNEL_ARG_ADDRESS_CONSTANT; - case PTR_MT_LDS_EMU: - case PTR_MT_LDS: - return CL_KERNEL_ARG_ADDRESS_LOCAL; - default: - return CL_KERNEL_ARG_ADDRESS_PRIVATE; - } - } - else if (argInfo->type == ARG_TYPE_IMAGE) { +inline static cl_kernel_arg_address_qualifier GetOclAddrQual(const aclArgData* argInfo) { + if (argInfo->type == ARG_TYPE_POINTER) { + switch (argInfo->arg.pointer.memory) { + case PTR_MT_UAV: + case PTR_MT_GLOBAL: return CL_KERNEL_ARG_ADDRESS_GLOBAL; + case PTR_MT_CONSTANT: + case PTR_MT_UAV_CONSTANT: + case PTR_MT_CONSTANT_EMU: + return CL_KERNEL_ARG_ADDRESS_CONSTANT; + case PTR_MT_LDS_EMU: + case PTR_MT_LDS: + return CL_KERNEL_ARG_ADDRESS_LOCAL; + default: + return CL_KERNEL_ARG_ADDRESS_PRIVATE; } - //default for all other cases - return CL_KERNEL_ARG_ADDRESS_PRIVATE; + } else if (argInfo->type == ARG_TYPE_IMAGE) { + return CL_KERNEL_ARG_ADDRESS_GLOBAL; + } + // default for all other cases + return CL_KERNEL_ARG_ADDRESS_PRIVATE; } -inline static cl_kernel_arg_access_qualifier -GetOclAccessQual(const aclArgData* argInfo) -{ - if (argInfo->type == ARG_TYPE_IMAGE) { - switch (argInfo->arg.image.type) { - case ACCESS_TYPE_RO: - return CL_KERNEL_ARG_ACCESS_READ_ONLY; - case ACCESS_TYPE_WO: - return CL_KERNEL_ARG_ACCESS_WRITE_ONLY; - case ACCESS_TYPE_RW: - return CL_KERNEL_ARG_ACCESS_READ_WRITE; +inline static cl_kernel_arg_access_qualifier GetOclAccessQual(const aclArgData* argInfo) { 
+ if (argInfo->type == ARG_TYPE_IMAGE) { + switch (argInfo->arg.image.type) { + case ACCESS_TYPE_RO: + return CL_KERNEL_ARG_ACCESS_READ_ONLY; + case ACCESS_TYPE_WO: + return CL_KERNEL_ARG_ACCESS_WRITE_ONLY; + case ACCESS_TYPE_RW: + return CL_KERNEL_ARG_ACCESS_READ_WRITE; + default: + return CL_KERNEL_ARG_ACCESS_NONE; + } + } + return CL_KERNEL_ARG_ACCESS_NONE; +} + +inline static cl_kernel_arg_type_qualifier GetOclTypeQual(const aclArgData* argInfo) { + cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE; + if (argInfo->type == ARG_TYPE_POINTER) { + if (argInfo->arg.pointer.isVolatile) { + rv |= CL_KERNEL_ARG_TYPE_VOLATILE; + } + if (argInfo->arg.pointer.isRestrict) { + rv |= CL_KERNEL_ARG_TYPE_RESTRICT; + } + if (argInfo->arg.pointer.isPipe) { + rv |= CL_KERNEL_ARG_TYPE_PIPE; + } + if (argInfo->isConst) { + rv |= CL_KERNEL_ARG_TYPE_CONST; + } + switch (argInfo->arg.pointer.memory) { + case PTR_MT_CONSTANT: + case PTR_MT_UAV_CONSTANT: + case PTR_MT_CONSTANT_EMU: + rv |= CL_KERNEL_ARG_TYPE_CONST; + break; + default: + break; + } + } + return rv; +} + +static int GetOclSize(const aclArgData* argInfo) { + switch (argInfo->type) { + case ARG_TYPE_POINTER: + return sizeof(void*); + case ARG_TYPE_VALUE: + //! \note OCL 6.1.5. For 3-component vector data types, + //! the size of the data type is 4 * sizeof(component). 
+ switch (argInfo->arg.value.data) { + case DATATYPE_struct: + return 1 * argInfo->arg.value.numElements; + case DATATYPE_i8: + case DATATYPE_u8: + return 1 * amd::nextPowerOfTwo(argInfo->arg.value.numElements); + case DATATYPE_u16: + case DATATYPE_i16: + case DATATYPE_f16: + return 2 * amd::nextPowerOfTwo(argInfo->arg.value.numElements); + case DATATYPE_u32: + case DATATYPE_i32: + case DATATYPE_f32: + return 4 * amd::nextPowerOfTwo(argInfo->arg.value.numElements); + case DATATYPE_i64: + case DATATYPE_u64: + case DATATYPE_f64: + return 8 * amd::nextPowerOfTwo(argInfo->arg.value.numElements); + case DATATYPE_ERROR: default: - return CL_KERNEL_ARG_ACCESS_NONE; - } - } - return CL_KERNEL_ARG_ACCESS_NONE; + return -1; + } + case ARG_TYPE_IMAGE: + return sizeof(cl_mem); + case ARG_TYPE_SAMPLER: + return sizeof(cl_sampler); + case ARG_TYPE_QUEUE: + return sizeof(cl_command_queue); + default: + return -1; + } } -inline static cl_kernel_arg_type_qualifier -GetOclTypeQual(const aclArgData* argInfo) -{ - cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE; - if (argInfo->type == ARG_TYPE_POINTER) { - if (argInfo->arg.pointer.isVolatile) { - rv |= CL_KERNEL_ARG_TYPE_VOLATILE; - } - if (argInfo->arg.pointer.isRestrict) { - rv |= CL_KERNEL_ARG_TYPE_RESTRICT; - } - if (argInfo->arg.pointer.isPipe) { - rv |= CL_KERNEL_ARG_TYPE_PIPE; - } - if (argInfo->isConst) { - rv |= CL_KERNEL_ARG_TYPE_CONST; - } - switch (argInfo->arg.pointer.memory) { - case PTR_MT_CONSTANT: - case PTR_MT_UAV_CONSTANT: - case PTR_MT_CONSTANT_EMU: - rv |= CL_KERNEL_ARG_TYPE_CONST; +void HSAILKernel::initArgList(const aclArgData* aclArg) { + // Initialize the hsail argument list too + initHsailArgs(aclArg); + + // Iterate through the arguments and insert into parameterList + device::Kernel::parameters_t params; + amd::KernelParameterDescriptor desc; + size_t offset = 0; + + // Reserved arguments for HSAIL launch + aclArg += MaxExtraArgumentsNum; + for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) 
{ + desc.name_ = arguments_[i]->name_.c_str(); + desc.type_ = GetOclType(aclArg); + desc.addressQualifier_ = GetOclAddrQual(aclArg); + desc.accessQualifier_ = GetOclAccessQual(aclArg); + desc.typeQualifier_ = GetOclTypeQual(aclArg); + desc.typeName_ = arguments_[i]->typeName_.c_str(); + + // Make a check if it is local or global + if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) { + desc.size_ = 0; + } else { + desc.size_ = GetOclSize(aclArg); + } + + // Make offset alignment to match CPU metadata, since + // in multidevice config abstraction layer has a single signature + // and CPU sends the paramaters as they are allocated in memory + size_t size = desc.size_; + if (size == 0) { + // Local memory for CPU + size = sizeof(cl_mem); + } + offset = amd::alignUp(offset, std::min(size, size_t(16))); + desc.offset_ = offset; + offset += amd::alignUp(size, sizeof(uint32_t)); + params.push_back(desc); + + if (arguments_[i]->type_ == HSAIL_ARGTYPE_IMAGE) { + flags_.imageEna_ = true; + if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) { + flags_.imageWriteEna_ = true; + } + } + } + + createSignature(params); +} + +void HSAILKernel::initHsailArgs(const aclArgData* aclArg) { + int offset = 0; + + // Reserved arguments for HSAIL launch + aclArg += MaxExtraArgumentsNum; + + // Iterate through the each kernel argument + for (; aclArg->struct_size != 0; aclArg++) { + Argument* arg = new Argument; + // Initialize HSAIL kernel argument + arg->name_ = aclArg->argStr; + arg->typeName_ = aclArg->typeStr; + arg->size_ = GetHSAILArgSize(aclArg); + arg->offset_ = offset; + arg->type_ = GetHSAILArgType(aclArg); + arg->addrQual_ = GetHSAILAddrQual(aclArg); + arg->dataType_ = GetHSAILDataType(aclArg); + // If vector of args we add additional arguments to flatten it out + arg->numElem_ = + ((aclArg->type == ARG_TYPE_VALUE) && (aclArg->arg.value.data != DATATYPE_struct)) + ? 
aclArg->arg.value.numElements + : 1; + arg->alignment_ = GetHSAILArgAlignment(aclArg); + arg->access_ = GetHSAILArgAccessType(aclArg); + offset += GetHSAILArgSize(aclArg); + arguments_.push_back(arg); + } +} + +void HSAILKernel::initPrintf(const aclPrintfFmt* aclPrintf) { + PrintfInfo info; + uint index = 0; + for (; aclPrintf->struct_size != 0; aclPrintf++) { + index = aclPrintf->ID; + if (printf_.size() <= index) { + printf_.resize(index + 1); + } + std::string pfmt = aclPrintf->fmtStr; + info.fmtString_.clear(); + size_t pos = 0; + bool need_nl = true; + for (size_t pos = 0; pos < pfmt.size(); ++pos) { + char symbol = pfmt[pos]; + need_nl = true; + if (symbol == '\\') { + // Rest of the C escape sequences (e.g. \') are handled correctly + // by the MDParser, we are not sure exactly how! + switch (pfmt[pos + 1]) { + case 'a': + pos++; + symbol = '\a'; break; - default: + case 'b': + pos++; + symbol = '\b'; + break; + case 'f': + pos++; + symbol = '\f'; + break; + case 'n': + pos++; + symbol = '\n'; + need_nl = false; + break; + case 'r': + pos++; + symbol = '\r'; + break; + case 'v': + pos++; + symbol = '\v'; + break; + case '7': + if (pfmt[pos + 2] == '2') { + pos += 2; + symbol = '\72'; + } + break; + default: break; } + } + info.fmtString_.push_back(symbol); } - return rv; + if (need_nl) { + info.fmtString_ += "\n"; + } + uint32_t* tmp_ptr = const_cast(aclPrintf->argSizes); + for (uint i = 0; i < aclPrintf->numSizes; i++, tmp_ptr++) { + info.arguments_.push_back(*tmp_ptr); + } + printf_[index] = info; + info.arguments_.clear(); + } } -static int -GetOclSize(const aclArgData* argInfo) -{ - switch (argInfo->type) { - case ARG_TYPE_POINTER: return sizeof(void *); - case ARG_TYPE_VALUE: - //! \note OCL 6.1.5. For 3-component vector data types, - //! the size of the data type is 4 * sizeof(component). 
- switch (argInfo->arg.value.data) { - case DATATYPE_struct: - return 1 * argInfo->arg.value.numElements; - case DATATYPE_i8: - case DATATYPE_u8: - return 1 * amd::nextPowerOfTwo(argInfo->arg.value.numElements); - case DATATYPE_u16: - case DATATYPE_i16: - case DATATYPE_f16: - return 2 * amd::nextPowerOfTwo(argInfo->arg.value.numElements); - case DATATYPE_u32: - case DATATYPE_i32: - case DATATYPE_f32: - return 4 * amd::nextPowerOfTwo(argInfo->arg.value.numElements); - case DATATYPE_i64: - case DATATYPE_u64: - case DATATYPE_f64: - return 8 * amd::nextPowerOfTwo(argInfo->arg.value.numElements); - case DATATYPE_ERROR: - default: return -1; - } - case ARG_TYPE_IMAGE: return sizeof(cl_mem); - case ARG_TYPE_SAMPLER: return sizeof(cl_sampler); - case ARG_TYPE_QUEUE: return sizeof(cl_command_queue); - default: return -1; - } +HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions, + uint extraArgsNum) + : device::Kernel(name), + compileOptions_(compileOptions), + dev_(prog->dev()), + prog_(*prog), + index_(0), + code_(NULL), + codeSize_(0), + hwMetaData_(NULL), + extraArgumentsNum_(extraArgsNum), + waveLimiter_(this, (prog->isNull() ? 
1 : dev().getAttribs().numberOfCUsperShaderArray) * + dev().hwInfo()->simdPerCU_) { + hsa_ = true; } -void -HSAILKernel::initArgList(const aclArgData* aclArg) -{ - // Initialize the hsail argument list too - initHsailArgs(aclArg); +HSAILKernel::~HSAILKernel() { + while (!arguments_.empty()) { + Argument* arg = arguments_.back(); + delete arg; + arguments_.pop_back(); + } - // Iterate through the arguments and insert into parameterList - device::Kernel::parameters_t params; - amd::KernelParameterDescriptor desc; - size_t offset = 0; + delete[] hwMetaData_; - // Reserved arguments for HSAIL launch - aclArg += MaxExtraArgumentsNum; - for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) { - desc.name_ = arguments_[i]->name_.c_str(); - desc.type_ = GetOclType(aclArg); - desc.addressQualifier_ = GetOclAddrQual(aclArg); - desc.accessQualifier_ = GetOclAccessQual(aclArg); - desc.typeQualifier_ = GetOclTypeQual(aclArg); - desc.typeName_ = arguments_[i]->typeName_.c_str(); - - // Make a check if it is local or global - if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) { - desc.size_ = 0; - } - else { - desc.size_ = GetOclSize(aclArg); - } - - // Make offset alignment to match CPU metadata, since - // in multidevice config abstraction layer has a single signature - // and CPU sends the paramaters as they are allocated in memory - size_t size = desc.size_; - if (size == 0) { - // Local memory for CPU - size = sizeof(cl_mem); - } - offset = amd::alignUp(offset, std::min(size, size_t(16))); - desc.offset_ = offset; - offset += amd::alignUp(size, sizeof(uint32_t)); - params.push_back(desc); - - if (arguments_[i]->type_ == HSAIL_ARGTYPE_IMAGE) { - flags_.imageEna_ = true; - if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) { - flags_.imageWriteEna_ = true; - } - } - } - - createSignature(params); + delete code_; } -void -HSAILKernel::initHsailArgs(const aclArgData* aclArg) -{ - int offset = 0; - - // Reserved arguments for HSAIL launch - aclArg += 
MaxExtraArgumentsNum; - - // Iterate through the each kernel argument - for (; aclArg->struct_size != 0; aclArg++) { - Argument* arg = new Argument; - // Initialize HSAIL kernel argument - arg->name_ = aclArg->argStr; - arg->typeName_ = aclArg->typeStr; - arg->size_ = GetHSAILArgSize(aclArg); - arg->offset_ = offset; - arg->type_ = GetHSAILArgType(aclArg); - arg->addrQual_ = GetHSAILAddrQual(aclArg); - arg->dataType_ = GetHSAILDataType(aclArg); - // If vector of args we add additional arguments to flatten it out - arg->numElem_ = ((aclArg->type == ARG_TYPE_VALUE) && - (aclArg->arg.value.data != DATATYPE_struct)) ? - aclArg->arg.value.numElements : 1; - arg->alignment_ = GetHSAILArgAlignment(aclArg); - arg->access_ = GetHSAILArgAccessType(aclArg); - offset += GetHSAILArgSize(aclArg); - arguments_.push_back(arg); +bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) { + if (extraArgumentsNum_ > MaxExtraArgumentsNum) { + LogError("Failed to initialize kernel: extra arguments number is bigger than is supported"); + return false; + } + acl_error error = ACL_SUCCESS; + std::string openClKernelName = openclMangledName(name()); + flags_.internalKernel_ = + (compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false; + // compile kernel down to ISA + if (finalize) { + std::string options(compileOptions_.c_str()); + options.append(" -just-kernel="); + options.append(openClKernelName.c_str()); + // Append an option so that we can selectively enable a SCOption on CZ + // whenever IOMMUv2 is enabled. 
+ if (dev().settings().svmFineGrainSystem_) { + options.append(" -sc-xnack-iommu"); } + error = aclCompile(dev().hsaCompiler(), prog().binaryElf(), options.c_str(), ACL_TYPE_CG, + ACL_TYPE_ISA, NULL); + buildLog_ += aclGetCompilerLog(dev().hsaCompiler()); + if (error != ACL_SUCCESS) { + LogError("Failed to finalize kernel"); + return false; + } + } + + aqlCreateHWInfo(sym); + + // Pull out metadata from the ELF + size_t sizeOfArgList; + error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY, + openClKernelName.c_str(), NULL, &sizeOfArgList); + if (error != ACL_SUCCESS) { + return false; + } + + char* aclArgList = new char[sizeOfArgList]; + if (NULL == aclArgList) { + return false; + } + error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY, + openClKernelName.c_str(), aclArgList, &sizeOfArgList); + if (error != ACL_SUCCESS) { + return false; + } + + size_t sizeOfWorkGroupSize; + error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE, + openClKernelName.c_str(), NULL, &sizeOfWorkGroupSize); + if (error != ACL_SUCCESS) { + return false; + } + error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE, + openClKernelName.c_str(), workGroupInfo_.compileSize_, &sizeOfWorkGroupSize); + if (error != ACL_SUCCESS) { + return false; + } + + // Copy wavefront size + workGroupInfo_.wavefrontSize_ = prog().isNull() ? 
64 : dev().getAttribs().wavefrontSize; + + // Find total workgroup size + if (workGroupInfo_.compileSize_[0] != 0) { + workGroupInfo_.size_ = workGroupInfo_.compileSize_[0] * workGroupInfo_.compileSize_[1] * + workGroupInfo_.compileSize_[2]; + } else { + workGroupInfo_.size_ = dev().info().maxWorkGroupSize_; + } + + // Pull out printf metadata from the ELF + size_t sizeOfPrintfList; + error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY, + openClKernelName.c_str(), NULL, &sizeOfPrintfList); + if (error != ACL_SUCCESS) { + return false; + } + + // Make sure kernel has any printf info + if (0 != sizeOfPrintfList) { + char* aclPrintfList = new char[sizeOfPrintfList]; + if (NULL == aclPrintfList) { + return false; + } + error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY, + openClKernelName.c_str(), aclPrintfList, &sizeOfPrintfList); + if (error != ACL_SUCCESS) { + return false; + } + + // Set the PrintfList + initPrintf(reinterpret_cast(aclPrintfList)); + delete[] aclPrintfList; + } + + aclMetadata md; + md.enqueue_kernel = false; + size_t sizeOfDeviceEnqueue = sizeof(md.enqueue_kernel); + error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_DEVICE_ENQUEUE, + openClKernelName.c_str(), &md.enqueue_kernel, &sizeOfDeviceEnqueue); + if (error != ACL_SUCCESS) { + return false; + } + flags_.dynamicParallelism_ = md.enqueue_kernel; + + md.kernel_index = -1; + size_t sizeOfIndex = sizeof(md.kernel_index); + error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_KERNEL_INDEX, + openClKernelName.c_str(), &md.kernel_index, &sizeOfIndex); + if (error != ACL_SUCCESS) { + return false; + } + index_ = md.kernel_index; + + size_t sizeOfWavesPerSimdHint = sizeof(workGroupInfo_.wavesPerSimdHint_); + error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_WAVES_PER_SIMD_HINT, + openClKernelName.c_str(), &workGroupInfo_.wavesPerSimdHint_, + &sizeOfWavesPerSimdHint); + if (error != ACL_SUCCESS) { + 
return false; + } + + waveLimiter_.enable(dev().settings().ciPlus_); + + size_t sizeOfWorkGroupSizeHint = sizeof(workGroupInfo_.compileSizeHint_); + error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE_HINT, + openClKernelName.c_str(), workGroupInfo_.compileSizeHint_, + &sizeOfWorkGroupSizeHint); + if (error != ACL_SUCCESS) { + return false; + } + + size_t sizeOfVecTypeHint; + error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_VEC_TYPE_HINT, + openClKernelName.c_str(), NULL, &sizeOfVecTypeHint); + if (error != ACL_SUCCESS) { + return false; + } + + if (0 != sizeOfVecTypeHint) { + char* VecTypeHint = new char[sizeOfVecTypeHint + 1]; + if (NULL == VecTypeHint) { + return false; + } + error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_VEC_TYPE_HINT, + openClKernelName.c_str(), VecTypeHint, &sizeOfVecTypeHint); + if (error != ACL_SUCCESS) { + return false; + } + VecTypeHint[sizeOfVecTypeHint] = '\0'; + workGroupInfo_.compileVecTypeHint_ = std::string(VecTypeHint); + delete[] VecTypeHint; + } + + // Set the argList + initArgList(reinterpret_cast(aclArgList)); + delete[] aclArgList; + + return true; } -void -HSAILKernel::initPrintf(const aclPrintfFmt* aclPrintf) -{ - PrintfInfo info; - uint index = 0; - for (; aclPrintf->struct_size != 0; aclPrintf++) { - index = aclPrintf->ID; - if (printf_.size() <= index) { - printf_.resize(index + 1); - } - std::string pfmt = aclPrintf->fmtStr; - info.fmtString_.clear(); - size_t pos = 0; - bool need_nl = true; - for (size_t pos = 0; pos < pfmt.size(); ++pos) { - char symbol = pfmt[pos]; - need_nl = true; - if (symbol == '\\') { - // Rest of the C escape sequences (e.g. \') are handled correctly - // by the MDParser, we are not sure exactly how! 
- switch (pfmt[pos+1]) { - case 'a': - pos++; - symbol = '\a'; - break; - case 'b': - pos++; - symbol = '\b'; - break; - case 'f': - pos++; - symbol = '\f'; - break; - case 'n': - pos++; - symbol = '\n'; - need_nl = false; - break; - case 'r': - pos++; - symbol = '\r'; - break; - case 'v': - pos++; - symbol = '\v'; - break; - case '7': - if (pfmt[pos+2] == '2') { - pos += 2; - symbol = '\72'; +bool HSAILKernel::validateMemory(uint idx, amd::Memory* amdMem) const { + // Check if memory doesn't require reallocation + bool noRealloc = true; + // amdMem->reallocedDeviceMemory(&dev())); + + return noRealloc; +} + +const Device& HSAILKernel::dev() const { return reinterpret_cast(dev_); } + +const HSAILProgram& HSAILKernel::prog() const { + return reinterpret_cast(prog_); +} + +void HSAILKernel::findLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize, + amd::NDRange& lclWorkSize) const { + // Initialize the default workgoup info + // Check if the kernel has the compiled sizes + if (workGroupInfo()->compileSize_[0] == 0) { + // Find the default local workgroup size, if it wasn't specified + if (lclWorkSize[0] == 0) { + size_t thrPerGrp; + bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE); + bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) || + !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y); + bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) || + !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) || + !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z); + + bool overrideSet = ((workDim == 1) && b1DOverrideSet) || ((workDim == 2) && b2DOverrideSet) || + ((workDim == 3) && b3DOverrideSet); + if (!overrideSet) { + // Find threads per group + thrPerGrp = workGroupInfo()->size_; + + // Check if kernel uses images + if (flags_.imageEna_ && + // and thread group is a multiple value of wavefronts + ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) && + // and it's 2 or 3-dimensional workload + (workDim > 1) && 
((dev().settings().partialDispatch_) || + (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0)))) { + // Use 8x8 workgroup size if kernel has image writes + if (flags_.imageWriteEna_ || (thrPerGrp != dev().info().maxWorkGroupSize_)) { + lclWorkSize[0] = 8; + lclWorkSize[1] = 8; + } else { + lclWorkSize[0] = 16; + lclWorkSize[1] = 16; + } + if (workDim == 3) { + lclWorkSize[2] = 1; + } + } else { + size_t tmp = thrPerGrp; + // Split the local workgroup into the most efficient way + for (uint d = 0; d < workDim; ++d) { + size_t div = tmp; + for (; (gblWorkSize[d] % div) != 0; div--) + ; + lclWorkSize[d] = div; + tmp /= div; + } + + // Check if partial dispatch is enabled and + if (dev().settings().partialDispatch_ && + // we couldn't find optimal workload + (lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) { + size_t maxSize = 0; + size_t maxDim = 0; + for (uint d = 0; d < workDim; ++d) { + if (maxSize < gblWorkSize[d]) { + maxSize = gblWorkSize[d]; + maxDim = d; + } + } + // Check if a local workgroup has the most optimal size + if (thrPerGrp > maxSize) { + thrPerGrp = maxSize; + } + lclWorkSize[maxDim] = thrPerGrp; + for (uint d = 0; d < workDim; ++d) { + if (d != maxDim) { + lclWorkSize[d] = 1; } - break; - default: - break; } } - info.fmtString_.push_back(symbol); } - if (need_nl) { - info.fmtString_ += "\n"; + } else { + // Use overrides when app doesn't provide workgroup dimensions + if (workDim == 1) { + lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE; + } else if (workDim == 2) { + lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X; + lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y; + } else if (workDim == 3) { + lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X; + lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y; + lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z; + } else { + assert(0 && "Invalid workDim!"); } - uint32_t *tmp_ptr = const_cast(aclPrintf->argSizes); - for (uint i = 0; i < aclPrintf->numSizes; i++ , tmp_ptr++) { - info.arguments_.push_back(*tmp_ptr); 
- } - printf_[index] = info; - info.arguments_.clear(); + } } + } else { + for (uint d = 0; d < workDim; ++d) { + lclWorkSize[d] = workGroupInfo()->compileSize_[d]; + } + } } -HSAILKernel::HSAILKernel(std::string name, - HSAILProgram* prog, - std::string compileOptions, - uint extraArgsNum) - : device::Kernel(name) - , compileOptions_(compileOptions) - , dev_(prog->dev()) - , prog_(*prog) - , index_(0) - , code_(NULL) - , codeSize_(0) - , hwMetaData_(NULL) - , extraArgumentsNum_(extraArgsNum) - , waveLimiter_(this, (prog->isNull() ? 1 : - dev().getAttribs().numberOfCUsperShaderArray) * dev().hwInfo()->simdPerCU_) -{ - hsa_ = true; +inline static void WriteAqlArg( + unsigned char** dst, //!< The write pointer to the buffer + const void* src, //!< The source pointer + uint size, //!< The size in bytes to copy + uint alignment = 0 //!< The alignment to follow while writing to the buffer + ) { + if (alignment == 0) { + *dst = amd::alignUp(*dst, size); + } else { + *dst = amd::alignUp(*dst, alignment); + } + memcpy(*dst, src, size); + *dst += size; } -HSAILKernel::~HSAILKernel() -{ - while (!arguments_.empty()) { - Argument* arg = arguments_.back(); - delete arg; - arguments_.pop_back(); - } - - delete [] hwMetaData_; - - delete code_; -} - -bool -HSAILKernel::init(amd::hsa::loader::Symbol *sym, bool finalize) -{ - if (extraArgumentsNum_ > MaxExtraArgumentsNum) { - LogError("Failed to initialize kernel: extra arguments number is bigger than is supported"); - return false; - } - acl_error error = ACL_SUCCESS; - std::string openClKernelName = openclMangledName(name()); - flags_.internalKernel_ = (compileOptions_.find("-cl-internal-kernel") != - std::string::npos) ? true: false; - //compile kernel down to ISA - if (finalize) { - std::string options(compileOptions_.c_str()); - options.append(" -just-kernel="); - options.append(openClKernelName.c_str()); - // Append an option so that we can selectively enable a SCOption on CZ - // whenever IOMMUv2 is enabled. 
- if (dev().settings().svmFineGrainSystem_) { - options.append(" -sc-xnack-iommu"); - } - error = aclCompile(dev().hsaCompiler(), prog().binaryElf(), - options.c_str(), ACL_TYPE_CG, ACL_TYPE_ISA, NULL); - buildLog_ += aclGetCompilerLog(dev().hsaCompiler()); - if (error != ACL_SUCCESS) { - LogError("Failed to finalize kernel"); - return false; - } - } - - aqlCreateHWInfo(sym); - - // Pull out metadata from the ELF - size_t sizeOfArgList; - error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), - RT_ARGUMENT_ARRAY, openClKernelName.c_str(), NULL, &sizeOfArgList); - if (error != ACL_SUCCESS) { - return false; - } - - char* aclArgList = new char[sizeOfArgList]; - if (NULL == aclArgList) { - return false; - } - error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), - RT_ARGUMENT_ARRAY, openClKernelName.c_str(), aclArgList, &sizeOfArgList); - if (error != ACL_SUCCESS) { - return false; - } - - size_t sizeOfWorkGroupSize; - error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), - RT_WORK_GROUP_SIZE, openClKernelName.c_str(), NULL, &sizeOfWorkGroupSize); - if (error != ACL_SUCCESS) { - return false; - } - error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), - RT_WORK_GROUP_SIZE, openClKernelName.c_str(), - workGroupInfo_.compileSize_, &sizeOfWorkGroupSize); - if (error != ACL_SUCCESS) { - return false; - } - - // Copy wavefront size - workGroupInfo_.wavefrontSize_ = prog().isNull() ? 
64 : dev().getAttribs().wavefrontSize; - - // Find total workgroup size - if (workGroupInfo_.compileSize_[0] != 0) { - workGroupInfo_.size_ = - workGroupInfo_.compileSize_[0] * - workGroupInfo_.compileSize_[1] * - workGroupInfo_.compileSize_[2]; - } - else { - workGroupInfo_.size_ = dev().info().maxWorkGroupSize_; - } - - // Pull out printf metadata from the ELF - size_t sizeOfPrintfList; - error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), - RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(), NULL, &sizeOfPrintfList); - if (error != ACL_SUCCESS) { - return false; - } - - // Make sure kernel has any printf info - if (0 != sizeOfPrintfList) { - char* aclPrintfList = new char[sizeOfPrintfList]; - if (NULL == aclPrintfList) { - return false; - } - error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), - RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(), aclPrintfList, - &sizeOfPrintfList); - if (error != ACL_SUCCESS) { - return false; - } - - // Set the PrintfList - initPrintf(reinterpret_cast(aclPrintfList)); - delete [] aclPrintfList; - } - - aclMetadata md; - md.enqueue_kernel = false; - size_t sizeOfDeviceEnqueue = sizeof(md.enqueue_kernel); - error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), - RT_DEVICE_ENQUEUE, openClKernelName.c_str(), - &md.enqueue_kernel, &sizeOfDeviceEnqueue); - if (error != ACL_SUCCESS) { - return false; - } - flags_.dynamicParallelism_ = md.enqueue_kernel; - - md.kernel_index = -1; - size_t sizeOfIndex = sizeof(md.kernel_index); - error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), - RT_KERNEL_INDEX, openClKernelName.c_str(), - &md.kernel_index, &sizeOfIndex); - if (error != ACL_SUCCESS) { - return false; - } - index_ = md.kernel_index; - - size_t sizeOfWavesPerSimdHint = sizeof(workGroupInfo_.wavesPerSimdHint_); - error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), - RT_WAVES_PER_SIMD_HINT, openClKernelName.c_str(), - &workGroupInfo_.wavesPerSimdHint_, &sizeOfWavesPerSimdHint); - if (error != 
ACL_SUCCESS) { - return false; - } - - waveLimiter_.enable(dev().settings().ciPlus_); - - size_t sizeOfWorkGroupSizeHint = sizeof(workGroupInfo_.compileSizeHint_); - error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), - RT_WORK_GROUP_SIZE_HINT, openClKernelName.c_str(), - workGroupInfo_.compileSizeHint_, &sizeOfWorkGroupSizeHint); - if (error != ACL_SUCCESS) { - return false; - } - - size_t sizeOfVecTypeHint; - error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), - RT_VEC_TYPE_HINT, openClKernelName.c_str(), - NULL, &sizeOfVecTypeHint); - if (error != ACL_SUCCESS) { - return false; - } - - if (0 != sizeOfVecTypeHint) { - char* VecTypeHint = new char[sizeOfVecTypeHint + 1]; - if (NULL == VecTypeHint) { - return false; - } - error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), - RT_VEC_TYPE_HINT, openClKernelName.c_str(), - VecTypeHint, &sizeOfVecTypeHint); - if (error != ACL_SUCCESS) { - return false; - } - VecTypeHint[sizeOfVecTypeHint] = '\0'; - workGroupInfo_.compileVecTypeHint_ = std::string(VecTypeHint); - delete [] VecTypeHint; - } - - // Set the argList - initArgList(reinterpret_cast(aclArgList)); - delete [] aclArgList; - - return true; -} - -bool -HSAILKernel::validateMemory(uint idx, amd::Memory* amdMem) const -{ - // Check if memory doesn't require reallocation - bool noRealloc = true; - //amdMem->reallocedDeviceMemory(&dev())); - - return noRealloc; -} - -const Device& -HSAILKernel::dev() const -{ - return reinterpret_cast(dev_); -} - -const HSAILProgram& -HSAILKernel::prog() const -{ - return reinterpret_cast(prog_); -} - -void -HSAILKernel::findLocalWorkSize( - size_t workDim, - const amd::NDRange& gblWorkSize, - amd::NDRange& lclWorkSize) const -{ - // Initialize the default workgoup info - // Check if the kernel has the compiled sizes - if (workGroupInfo()->compileSize_[0] == 0) { - // Find the default local workgroup size, if it wasn't specified - if (lclWorkSize[0] == 0) { - size_t thrPerGrp; - bool b1DOverrideSet = 
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE); - bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) || - !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y); - bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) || - !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) || - !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z); - - bool overrideSet = ((workDim == 1) && b1DOverrideSet) || - ((workDim == 2) && b2DOverrideSet) || - ((workDim == 3) && b3DOverrideSet); - if (!overrideSet) { - // Find threads per group - thrPerGrp = workGroupInfo()->size_; - - // Check if kernel uses images - if (flags_.imageEna_ && - // and thread group is a multiple value of wavefronts - ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) && - // and it's 2 or 3-dimensional workload - (workDim > 1) && - ((dev().settings().partialDispatch_) || - (((gblWorkSize[0] % 16) == 0) && - ((gblWorkSize[1] % 16) == 0)))) { - // Use 8x8 workgroup size if kernel has image writes - if (flags_.imageWriteEna_ || - (thrPerGrp != dev().info().maxWorkGroupSize_)) { - lclWorkSize[0] = 8; - lclWorkSize[1] = 8; - } - else { - lclWorkSize[0] = 16; - lclWorkSize[1] = 16; - } - if (workDim == 3) { - lclWorkSize[2] = 1; - } - } - else { - size_t tmp = thrPerGrp; - // Split the local workgroup into the most efficient way - for (uint d = 0; d < workDim; ++d) { - size_t div = tmp; - for (; (gblWorkSize[d] % div) != 0; div--); - lclWorkSize[d] = div; - tmp /= div; - } - - // Check if partial dispatch is enabled and - if (dev().settings().partialDispatch_ && - // we couldn't find optimal workload - (lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) { - size_t maxSize = 0; - size_t maxDim = 0; - for (uint d = 0; d < workDim; ++d) { - if (maxSize < gblWorkSize[d]) { - maxSize = gblWorkSize[d]; - maxDim = d; - } - } - // Check if a local workgroup has the most optimal size - if (thrPerGrp > maxSize) { - thrPerGrp = maxSize; - } - lclWorkSize[maxDim] = thrPerGrp; - for (uint d = 0; d < workDim; ++d) { - if (d 
!= maxDim) { - lclWorkSize[d] = 1; - } - } - } - } - } - else { - // Use overrides when app doesn't provide workgroup dimensions - if (workDim == 1) { - lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE; - } - else if (workDim == 2) { - lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X; - lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y; - } - else if (workDim == 3) { - lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X; - lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y; - lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z; - } - else - { - assert(0 && "Invalid workDim!"); - } - } - } - } - else { - for (uint d = 0; d < workDim; ++d) { - lclWorkSize[d] = workGroupInfo()->compileSize_[d]; - } - } -} - -inline static void -WriteAqlArg( - unsigned char** dst,//!< The write pointer to the buffer - const void* src, //!< The source pointer - uint size, //!< The size in bytes to copy - uint alignment = 0 //!< The alignment to follow while writing to the buffer - ) -{ - if (alignment == 0) { - *dst = amd::alignUp(*dst, size); - } - else { - *dst = amd::alignUp(*dst, alignment); - } - memcpy(*dst, src, size); - *dst += size; -} - -const uint16_t kDispatchPacketHeader = - (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | +const uint16_t kDispatchPacketHeader = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); -hsa_kernel_dispatch_packet_t* -HSAILKernel::loadArguments( - VirtualGPU& gpu, - const amd::Kernel& kernel, - const amd::NDRangeContainer& sizes, - const_address parameters, - bool nativeMem, - uint64_t vmDefQueue, - uint64_t* vmParentWrap, - std::vector& memList) const -{ - static const bool WaitOnBusyEngine = true; - uint64_t ldsAddress = ldsSize(); - address aqlArgBuf = gpu.cb(0)->sysMemCopy(); - address aqlStruct = gpu.cb(1)->sysMemCopy(); - bool srdResource = false; +hsa_kernel_dispatch_packet_t* 
HSAILKernel::loadArguments( + VirtualGPU& gpu, const amd::Kernel& kernel, const amd::NDRangeContainer& sizes, + const_address parameters, bool nativeMem, uint64_t vmDefQueue, uint64_t* vmParentWrap, + std::vector& memList) const { + static const bool WaitOnBusyEngine = true; + uint64_t ldsAddress = ldsSize(); + address aqlArgBuf = gpu.cb(0)->sysMemCopy(); + address aqlStruct = gpu.cb(1)->sysMemCopy(); + bool srdResource = false; - if (extraArgumentsNum_ > 0) { - assert(MaxExtraArgumentsNum >= 6 && "MaxExtraArgumentsNum has changed, the below algorithm should be changed accordingly"); - size_t extraArgs[MaxExtraArgumentsNum] = { 0, 0, 0, 0, 0, 0 }; - // The HLC generates up to 3 additional arguments for the global offsets - for (uint i = 0; i < sizes.dimensions(); ++i) { - extraArgs[i] = sizes.offset()[i]; - } - // Check if the kernel may have printf output - if ((printfInfo().size() > 0) && - // and printf buffer was allocated - (gpu.printfDbgHSA().dbgBuffer() != NULL)) { - // and set the fourth argument as the printf_buffer pointer - extraArgs[3] = static_cast(gpu.printfDbgHSA().dbgBuffer()->vmAddress()); - memList.push_back(gpu.printfDbgHSA().dbgBuffer()); - } - if (dynamicParallelism()) { - // Provide the host parent AQL wrap object to the kernel - AmdAqlWrap* wrap = reinterpret_cast(aqlStruct); - memset(wrap, 0, sizeof(AmdAqlWrap)); - wrap->state = AQL_WRAP_BUSY; - ConstBuffer* cb = gpu.constBufs_[1]; - cb->uploadDataToHw(sizeof(AmdAqlWrap)); - *vmParentWrap = cb->vmAddress() + cb->wrtOffset(); - // and set 5th & 6th arguments - extraArgs[4] = vmDefQueue; - extraArgs[5] = *vmParentWrap; - memList.push_back(cb); - } - WriteAqlArg(&aqlArgBuf, extraArgs, sizeof(size_t)*extraArgumentsNum_, sizeof(size_t)); + if (extraArgumentsNum_ > 0) { + assert(MaxExtraArgumentsNum >= 6 && + "MaxExtraArgumentsNum has changed, the below algorithm should be changed accordingly"); + size_t extraArgs[MaxExtraArgumentsNum] = {0, 0, 0, 0, 0, 0}; + // The HLC generates up to 3 
additional arguments for the global offsets + for (uint i = 0; i < sizes.dimensions(); ++i) { + extraArgs[i] = sizes.offset()[i]; } + // Check if the kernel may have printf output + if ((printfInfo().size() > 0) && + // and printf buffer was allocated + (gpu.printfDbgHSA().dbgBuffer() != NULL)) { + // and set the fourth argument as the printf_buffer pointer + extraArgs[3] = static_cast(gpu.printfDbgHSA().dbgBuffer()->vmAddress()); + memList.push_back(gpu.printfDbgHSA().dbgBuffer()); + } + if (dynamicParallelism()) { + // Provide the host parent AQL wrap object to the kernel + AmdAqlWrap* wrap = reinterpret_cast(aqlStruct); + memset(wrap, 0, sizeof(AmdAqlWrap)); + wrap->state = AQL_WRAP_BUSY; + ConstBuffer* cb = gpu.constBufs_[1]; + cb->uploadDataToHw(sizeof(AmdAqlWrap)); + *vmParentWrap = cb->vmAddress() + cb->wrtOffset(); + // and set 5th & 6th arguments + extraArgs[4] = vmDefQueue; + extraArgs[5] = *vmParentWrap; + memList.push_back(cb); + } + WriteAqlArg(&aqlArgBuf, extraArgs, sizeof(size_t) * extraArgumentsNum_, sizeof(size_t)); + } - const amd::KernelSignature& signature = kernel.signature(); - const amd::KernelParameters& kernelParams = kernel.parameters(); + const amd::KernelSignature& signature = kernel.signature(); + const amd::KernelParameters& kernelParams = kernel.parameters(); - // Find all parameters for the current kernel - for (uint i = 0; i != signature.numParameters(); ++i) { - const HSAILKernel::Argument* arg = argument(i); - const amd::KernelParameterDescriptor& desc = signature.at(i); - const_address paramaddr = parameters + desc.offset_; + // Find all parameters for the current kernel + for (uint i = 0; i != signature.numParameters(); ++i) { + const HSAILKernel::Argument* arg = argument(i); + const amd::KernelParameterDescriptor& desc = signature.at(i); + const_address paramaddr = parameters + desc.offset_; - switch (arg->type_) { - case HSAIL_ARGTYPE_POINTER: - // If it is a global pointer - if (arg->addrQual_ == HSAIL_ADDRESS_GLOBAL) { + 
switch (arg->type_) { + case HSAIL_ARGTYPE_POINTER: + // If it is a global pointer + if (arg->addrQual_ == HSAIL_ADDRESS_GLOBAL) { + Memory* gpuMem = NULL; + amd::Memory* mem = NULL; - Memory* gpuMem = NULL; - amd::Memory* mem = NULL; - - if (kernelParams.boundToSvmPointer(dev(), parameters, i)) { - WriteAqlArg(&aqlArgBuf, paramaddr, sizeof(paramaddr)); - mem = amd::SvmManager::FindSvmBuffer(*reinterpret_cast(paramaddr)); - if (mem != NULL) { - gpuMem = dev().getGpuMemory(mem); - gpuMem->wait(gpu, WaitOnBusyEngine); - if ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0) { - mem->signalWrite(&dev()); - } - memList.push_back(gpuMem); - } - // If finegrainsystem is present then the pointer can be malloced by the app and - // passed to kernel directly. If so copy the pointer location to aqlArgBuf - else if (!dev().isFineGrainedSystem(true)) { - return NULL; - } - break; - } - if (nativeMem) { - gpuMem = *reinterpret_cast(paramaddr); - if (NULL != gpuMem) { - mem = gpuMem->owner(); - } - } - else { - mem = *reinterpret_cast(paramaddr); - if (mem != NULL) { - gpuMem = dev().getGpuMemory(mem); - } - } - if (gpuMem == NULL) { - WriteAqlArg(&aqlArgBuf, &gpuMem, sizeof(void*)); - break; - } - - //! @todo 64 bit isn't supported with 32 bit binary - uint64_t globalAddress = gpuMem->vmAddress() + gpuMem->pinOffset(); - WriteAqlArg(&aqlArgBuf, &globalAddress, sizeof(void*)); - - // Wait for resource if it was used on an inactive engine - //! \note syncCache may call DRM transfer - gpuMem->wait(gpu, WaitOnBusyEngine); - - //! 
@todo Compiler has to return read/write attributes - if ((NULL != mem) && - ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) { - mem->signalWrite(&dev()); - } - memList.push_back(gpuMem); - - // save the memory object pointer to allow global memory access - if (NULL != dev().hwDebugMgr()) { - dev().hwDebugMgr()->assignKernelParamMem(i, gpuMem->owner()); - } - } - // If it is a local pointer - else { - assert((arg->addrQual_ == HSAIL_ADDRESS_LOCAL) && - "Unsupported address type"); - ldsAddress = amd::alignUp(ldsAddress, arg->alignment_); - WriteAqlArg(&aqlArgBuf, &ldsAddress, sizeof(size_t)); - ldsAddress += *reinterpret_cast(paramaddr); - } - break; - case HSAIL_ARGTYPE_VALUE: - // Special case for structrues - if (arg->dataType_ == HSAIL_DATATYPE_STRUCT) { - // Copy the current structre into CB1 - memcpy(aqlStruct, paramaddr, arg->size_); - ConstBuffer* cb = gpu.constBufs_[1]; - cb->uploadDataToHw(arg->size_); - // Then use a pointer in aqlArgBuffer to CB1 - uint64_t gpuPtr = cb->vmAddress() + cb->wrtOffset(); - WriteAqlArg(&aqlArgBuf, &gpuPtr, sizeof(void*)); - memList.push_back(cb); - } - else { - WriteAqlArg(&aqlArgBuf, paramaddr, - arg->numElem_ * arg->size_, arg->size_); - } - break; - case HSAIL_ARGTYPE_IMAGE: { - Image* image = NULL; - amd::Memory* mem = NULL; - if (nativeMem) { - image = static_cast(*reinterpret_cast(paramaddr)); - } - else { - mem = *reinterpret_cast(paramaddr); - if (mem == NULL) { - LogError( "The kernel image argument isn't an image object!"); - return nullptr; - } - image = static_cast(dev().getGpuMemory(mem)); - } - - // Wait for resource if it was used on an inactive engine - //! \note syncCache may call DRM transfer - image->wait(gpu, WaitOnBusyEngine); - - //! \note Special case for the image views. - //! Copy SRD to CB1, so blit manager will be able to release - //! this view without a wait for SRD resource. 
- if (image->memoryType() == Resource::ImageView) { - // Copy the current structre into CB1 - memcpy(aqlStruct, image->hwState(), HsaImageObjectSize); - ConstBuffer* cb = gpu.constBufs_[1]; - cb->uploadDataToHw(HsaImageObjectSize); - // Then use a pointer in aqlArgBuffer to CB1 - uint64_t srd = cb->vmAddress() + cb->wrtOffset(); - WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); - memList.push_back(cb); - } - else { - uint64_t srd = image->hwSrd(); - WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); - srdResource = true; - } - - //! @todo Compiler has to return read/write attributes - if ((NULL != mem) && - ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) { + if (kernelParams.boundToSvmPointer(dev(), parameters, i)) { + WriteAqlArg(&aqlArgBuf, paramaddr, sizeof(paramaddr)); + mem = amd::SvmManager::FindSvmBuffer(*reinterpret_cast(paramaddr)); + if (mem != NULL) { + gpuMem = dev().getGpuMemory(mem); + gpuMem->wait(gpu, WaitOnBusyEngine); + if ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0) { mem->signalWrite(&dev()); + } + memList.push_back(gpuMem); } + // If finegrainsystem is present then the pointer can be malloced by the app and + // passed to kernel directly. 
If so copy the pointer location to aqlArgBuf + else if (!dev().isFineGrainedSystem(true)) { + return NULL; + } + break; + } + if (nativeMem) { + gpuMem = *reinterpret_cast(paramaddr); + if (NULL != gpuMem) { + mem = gpuMem->owner(); + } + } else { + mem = *reinterpret_cast(paramaddr); + if (mem != NULL) { + gpuMem = dev().getGpuMemory(mem); + } + } + if (gpuMem == NULL) { + WriteAqlArg(&aqlArgBuf, &gpuMem, sizeof(void*)); + break; + } - memList.push_back(image); - break; - } - case HSAIL_ARGTYPE_SAMPLER: { - const amd::Sampler* sampler = - *reinterpret_cast(paramaddr); - const Sampler* gpuSampler = static_cast - (sampler->getDeviceSampler(dev())); - uint64_t srd = gpuSampler->hwSrd(); - WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); - srdResource = true; - break; - } - case HSAIL_ARGTYPE_QUEUE: { - const amd::DeviceQueue* queue = - *reinterpret_cast(paramaddr); - VirtualGPU* gpuQueue = static_cast(queue->vDev()); - uint64_t vmQueue; - if (dev().settings().useDeviceQueue_) { - vmQueue = gpuQueue->vQueue()->vmAddress(); - } - else { - if (!gpu.createVirtualQueue(queue->size())) { - LogError("Virtual queue creation failed!"); - return nullptr; - } - vmQueue = gpu.vQueue()->vmAddress(); - } - WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(void*)); - break; - } - default: - LogError(" Unsupported address type "); - return NULL; - } - } + //! @todo 64 bit isn't supported with 32 bit binary + uint64_t globalAddress = gpuMem->vmAddress() + gpuMem->pinOffset(); + WriteAqlArg(&aqlArgBuf, &globalAddress, sizeof(void*)); - if (ldsAddress > dev().info().localMemSize_) { - LogError("No local memory available\n"); + // Wait for resource if it was used on an inactive engine + //! \note syncCache may call DRM transfer + gpuMem->wait(gpu, WaitOnBusyEngine); + + //! 
@todo Compiler has to return read/write attributes + if ((NULL != mem) && ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) { + mem->signalWrite(&dev()); + } + memList.push_back(gpuMem); + + // save the memory object pointer to allow global memory access + if (NULL != dev().hwDebugMgr()) { + dev().hwDebugMgr()->assignKernelParamMem(i, gpuMem->owner()); + } + } + // If it is a local pointer + else { + assert((arg->addrQual_ == HSAIL_ADDRESS_LOCAL) && "Unsupported address type"); + ldsAddress = amd::alignUp(ldsAddress, arg->alignment_); + WriteAqlArg(&aqlArgBuf, &ldsAddress, sizeof(size_t)); + ldsAddress += *reinterpret_cast(paramaddr); + } + break; + case HSAIL_ARGTYPE_VALUE: + // Special case for structrues + if (arg->dataType_ == HSAIL_DATATYPE_STRUCT) { + // Copy the current structre into CB1 + memcpy(aqlStruct, paramaddr, arg->size_); + ConstBuffer* cb = gpu.constBufs_[1]; + cb->uploadDataToHw(arg->size_); + // Then use a pointer in aqlArgBuffer to CB1 + uint64_t gpuPtr = cb->vmAddress() + cb->wrtOffset(); + WriteAqlArg(&aqlArgBuf, &gpuPtr, sizeof(void*)); + memList.push_back(cb); + } else { + WriteAqlArg(&aqlArgBuf, paramaddr, arg->numElem_ * arg->size_, arg->size_); + } + break; + case HSAIL_ARGTYPE_IMAGE: { + Image* image = NULL; + amd::Memory* mem = NULL; + if (nativeMem) { + image = static_cast(*reinterpret_cast(paramaddr)); + } else { + mem = *reinterpret_cast(paramaddr); + if (mem == NULL) { + LogError("The kernel image argument isn't an image object!"); + return nullptr; + } + image = static_cast(dev().getGpuMemory(mem)); + } + + // Wait for resource if it was used on an inactive engine + //! \note syncCache may call DRM transfer + image->wait(gpu, WaitOnBusyEngine); + + //! \note Special case for the image views. + //! Copy SRD to CB1, so blit manager will be able to release + //! this view without a wait for SRD resource. 
+ if (image->memoryType() == Resource::ImageView) { + // Copy the current structre into CB1 + memcpy(aqlStruct, image->hwState(), HsaImageObjectSize); + ConstBuffer* cb = gpu.constBufs_[1]; + cb->uploadDataToHw(HsaImageObjectSize); + // Then use a pointer in aqlArgBuffer to CB1 + uint64_t srd = cb->vmAddress() + cb->wrtOffset(); + WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); + memList.push_back(cb); + } else { + uint64_t srd = image->hwSrd(); + WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); + srdResource = true; + } + + //! @todo Compiler has to return read/write attributes + if ((NULL != mem) && ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) { + mem->signalWrite(&dev()); + } + + memList.push_back(image); + break; + } + case HSAIL_ARGTYPE_SAMPLER: { + const amd::Sampler* sampler = *reinterpret_cast(paramaddr); + const Sampler* gpuSampler = static_cast(sampler->getDeviceSampler(dev())); + uint64_t srd = gpuSampler->hwSrd(); + WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); + srdResource = true; + break; + } + case HSAIL_ARGTYPE_QUEUE: { + const amd::DeviceQueue* queue = *reinterpret_cast(paramaddr); + VirtualGPU* gpuQueue = static_cast(queue->vDev()); + uint64_t vmQueue; + if (dev().settings().useDeviceQueue_) { + vmQueue = gpuQueue->vQueue()->vmAddress(); + } else { + if (!gpu.createVirtualQueue(queue->size())) { + LogError("Virtual queue creation failed!"); + return nullptr; + } + vmQueue = gpu.vQueue()->vmAddress(); + } + WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(void*)); + break; + } + default: + LogError(" Unsupported address type "); return NULL; } + } - // HSAIL kernarg segment size is rounded up to multiple of 16. 
- aqlArgBuf = amd::alignUp(aqlArgBuf, 16); - assert((aqlArgBuf == (gpu.cb(0)->sysMemCopy() + argsBufferSize())) && - "Size and the number of arguments don't match!"); - hsa_kernel_dispatch_packet_t* hsaDisp = - reinterpret_cast(aqlArgBuf); + if (ldsAddress > dev().info().localMemSize_) { + LogError("No local memory available\n"); + return NULL; + } - amd::NDRange local(sizes.local()); - const amd::NDRange& global = sizes.global(); + // HSAIL kernarg segment size is rounded up to multiple of 16. + aqlArgBuf = amd::alignUp(aqlArgBuf, 16); + assert((aqlArgBuf == (gpu.cb(0)->sysMemCopy() + argsBufferSize())) && + "Size and the number of arguments don't match!"); + hsa_kernel_dispatch_packet_t* hsaDisp = + reinterpret_cast(aqlArgBuf); - // Check if runtime has to find local workgroup size - findLocalWorkSize(sizes.dimensions(), sizes.global(), local); + amd::NDRange local(sizes.local()); + const amd::NDRange& global = sizes.global(); - hsaDisp->header = kDispatchPacketHeader; - hsaDisp->setup = sizes.dimensions(); + // Check if runtime has to find local workgroup size + findLocalWorkSize(sizes.dimensions(), sizes.global(), local); - hsaDisp->workgroup_size_x = local[0]; - hsaDisp->workgroup_size_y = (sizes.dimensions() > 1) ? local[1] : 1; - hsaDisp->workgroup_size_z = (sizes.dimensions() > 2) ? local[2] : 1; + hsaDisp->header = kDispatchPacketHeader; + hsaDisp->setup = sizes.dimensions(); - hsaDisp->grid_size_x = global[0]; - hsaDisp->grid_size_y = (sizes.dimensions() > 1) ? global[1] : 1; - hsaDisp->grid_size_z = (sizes.dimensions() > 2) ? global[2] : 1; - hsaDisp->reserved2 = 0; + hsaDisp->workgroup_size_x = local[0]; + hsaDisp->workgroup_size_y = (sizes.dimensions() > 1) ? local[1] : 1; + hsaDisp->workgroup_size_z = (sizes.dimensions() > 2) ? 
local[2] : 1; - // Initialize kernel ISA and execution buffer requirements - hsaDisp->private_segment_size = spillSegSize(); - hsaDisp->group_segment_size = ldsAddress - ldsSize(); - hsaDisp->kernel_object = gpuAqlCode()->vmAddress(); + hsaDisp->grid_size_x = global[0]; + hsaDisp->grid_size_y = (sizes.dimensions() > 1) ? global[1] : 1; + hsaDisp->grid_size_z = (sizes.dimensions() > 2) ? global[2] : 1; + hsaDisp->reserved2 = 0; - ConstBuffer* cb = gpu.constBufs_[0]; - cb->uploadDataToHw(argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t)); - uint64_t argList = cb->vmAddress() + cb->wrtOffset(); + // Initialize kernel ISA and execution buffer requirements + hsaDisp->private_segment_size = spillSegSize(); + hsaDisp->group_segment_size = ldsAddress - ldsSize(); + hsaDisp->kernel_object = gpuAqlCode()->vmAddress(); - hsaDisp->kernarg_address = reinterpret_cast(argList); - hsaDisp->reserved2 = 0; - hsaDisp->completion_signal.handle = 0; + ConstBuffer* cb = gpu.constBufs_[0]; + cb->uploadDataToHw(argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t)); + uint64_t argList = cb->vmAddress() + cb->wrtOffset(); - memList.push_back(cb); - memList.push_back(gpuAqlCode()); - for (gpu::Memory * mem : prog().globalStores()) { - memList.push_back(mem); - } - if (AMD_HSA_BITS_GET(cpuAqlCode_->kernel_code_properties, - AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) { - memList.push_back(gpu.hsaQueueMem()); - } + hsaDisp->kernarg_address = reinterpret_cast(argList); + hsaDisp->reserved2 = 0; + hsaDisp->completion_signal.handle = 0; - if (srdResource || prog().isStaticSampler()) { - dev().srds().fillResourceList(memList); - } + memList.push_back(cb); + memList.push_back(gpuAqlCode()); + for (gpu::Memory* mem : prog().globalStores()) { + memList.push_back(mem); + } + if (AMD_HSA_BITS_GET(cpuAqlCode_->kernel_code_properties, + AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) { + memList.push_back(gpu.hsaQueueMem()); + } - return hsaDisp; + if (srdResource || 
prog().isStaticSampler()) { + dev().srds().fillResourceList(memList); + } + + return hsaDisp; } -} // namespace gpu +} // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpukernel.hpp b/rocclr/runtime/device/gpu/gpukernel.hpp index b865a7a1de..1e0fd7f6f2 100644 --- a/rocclr/runtime/device/gpu/gpukernel.hpp +++ b/rocclr/runtime/device/gpu/gpukernel.hpp @@ -22,9 +22,9 @@ namespace amd { namespace hsa { namespace loader { class Symbol; -} // loader -} // hsa -} // amd +} // loader +} // hsa +} // amd //! \namespace gpu GPU Device Implementation namespace gpu { @@ -34,22 +34,17 @@ class Device; class NullDevice; class HSAILProgram; -struct HWSHADER_Helper -{ - template - static T Get(S base, T offset) { - return reinterpret_cast(reinterpret_cast(base) - + reinterpret_cast(offset)); - } +struct HWSHADER_Helper { + template static T Get(S base, T offset) { + return reinterpret_cast(reinterpret_cast(base) + reinterpret_cast(offset)); + } }; -#define HWSHADER_Get(shader, field) \ - HWSHADER_Helper::Get((shader), (shader)->field) +#define HWSHADER_Get(shader, field) HWSHADER_Helper::Get((shader), (shader)->field) template static void CalcPtr(D& dst, const S src, size_t structSize, size_t size) { - dst = reinterpret_cast(reinterpret_cast(src) - + structSize * size); + dst = reinterpret_cast(reinterpret_cast(src) + structSize * size); } /*! \addtogroup GPU GPU Device Implementation @@ -61,270 +56,255 @@ static void CalcPtr(D& dst, const S src, size_t structSize, size_t size) { * * \return True if we found the entry of the symbols */ -bool expect( - const std::string& str, //!< The original std::string - size_t* pos, //!< Position to start - const std::string& sym //!< The sympols to expect - ); +bool expect(const std::string& str, //!< The original std::string + size_t* pos, //!< Position to start + const std::string& sym //!< The sympols to expect + ); /*! \brief Helper function for the std::string processing. 
* Gets a word from the std::string * * \return True if we successfully received a word */ -bool getword( - const std::string& str, //!< The original std::string - size_t* pos, //!< Position to start - std::string& sym //!< Returned word - ); +bool getword(const std::string& str, //!< The original std::string + size_t* pos, //!< Position to start + std::string& sym //!< Returned word + ); /*! \brief Helper function for the std::string processing. * Loads numbers from the metadata * * \return True if we loaded a number */ -bool getuint( - const std::string& str, //!< The original std::string - size_t* pos, //!< Position to start - uint* val //!< Returned number - ); +bool getuint(const std::string& str, //!< The original std::string + size_t* pos, //!< Position to start + uint* val //!< Returned number + ); /*! \brief Helper function for the std::string processing. * Loads numbers from the metadata in HEX format * * \return True if we loaded a number */ -bool getuintHex( - const std::string& str, //!< The original std::string - size_t* pos, //!< Position to start - uint* val //!< Returned number - ); +bool getuintHex(const std::string& str, //!< The original std::string + size_t* pos, //!< Position to start + uint* val //!< Returned number + ); /*! \brief Helper function for the std::string processing. * Loads numbers from the metadata in HEX format * * \return True if we loaded a number */ -bool getuint64Hex( - const std::string& str, //!< The original std::string - size_t* pos, //!< Position to start - uint64_t* val //!< Returned number - ); +bool getuint64Hex(const std::string& str, //!< The original std::string + size_t* pos, //!< Position to start + uint64_t* val //!< Returned number + ); /*! \brief Helper function for the std::string processing. 
* Converts unsigned integer to string * * \return None */ -void intToStr( - size_t value, //!< Value for conversion - char* str, //!< Pointer to the converted string - size_t size //!< String size - ); +void intToStr(size_t value, //!< Value for conversion + char* str, //!< Pointer to the converted string + size_t size //!< String size + ); //! Image constant data from ABI specification -struct ImageConstants : public amd::EmbeddedObject -{ - uint32_t width_; //!< Image surface width - uint32_t height_; //!< Image surface height - uint32_t depth_; //!< Image surface depth (1 for 2D images) - uint32_t dataType_; //!< Image surface data type - float widthFloat_; //!< Image surface width - float heightFloat_; //!< Image surface height - float depthFloat_; //!< Image surface depth (1 for 2D images) - uint32_t channelOrder_; //!< Image surface texels channel order +struct ImageConstants : public amd::EmbeddedObject { + uint32_t width_; //!< Image surface width + uint32_t height_; //!< Image surface height + uint32_t depth_; //!< Image surface depth (1 for 2D images) + uint32_t dataType_; //!< Image surface data type + float widthFloat_; //!< Image surface width + float heightFloat_; //!< Image surface height + float depthFloat_; //!< Image surface depth (1 for 2D images) + uint32_t channelOrder_; //!< Image surface texels channel order }; //! Kernel arguments -struct KernelArg : public amd::HeapObject -{ -public: - //! 
\enum Kernel argument type - enum ArgumentType - { - None = 0, - PointerGlobal, - Value, - Image, - PointerLocal, - PointerHwLocal, - PointerPrivate, - PointerHwPrivate, - PointerConst, - PointerHwConst, - Float, - Double, - Half, - Char, - UChar, - Short, - UShort, - Int, - UInt, - Long, - ULong, - Struct, - Union, - Opaque, - Event, - Image1D, //!< first image - Image2D, - Image1DB, - Image1DA, - Image2DA, - Image3D, //!< last image - Counter, - Sampler, - PrivateSize, - LocalSize, - HwPrivateSize, - HwLocalSize, - Grouping, - WrkgrpSize, - Wavefront, - PrivateFixed, - ErrorMessage, - WarningMessage, - PrintfFormatStr, - MetadataVersion, - UavId, - ABI64Bit, - GWS, - SWGWS, - Reflection, - ConstArg, - ConstBufId, - PrintfBufId, - GroupingHint, - VecTypeHint, - WavesPerSimdHint, - TotalTypes +struct KernelArg : public amd::HeapObject { + public: + //! \enum Kernel argument type + enum ArgumentType { + None = 0, + PointerGlobal, + Value, + Image, + PointerLocal, + PointerHwLocal, + PointerPrivate, + PointerHwPrivate, + PointerConst, + PointerHwConst, + Float, + Double, + Half, + Char, + UChar, + Short, + UShort, + Int, + UInt, + Long, + ULong, + Struct, + Union, + Opaque, + Event, + Image1D, //!< first image + Image2D, + Image1DB, + Image1DA, + Image2DA, + Image3D, //!< last image + Counter, + Sampler, + PrivateSize, + LocalSize, + HwPrivateSize, + HwLocalSize, + Grouping, + WrkgrpSize, + Wavefront, + PrivateFixed, + ErrorMessage, + WarningMessage, + PrintfFormatStr, + MetadataVersion, + UavId, + ABI64Bit, + GWS, + SWGWS, + Reflection, + ConstArg, + ConstBufId, + PrintfBufId, + GroupingHint, + VecTypeHint, + WavesPerSimdHint, + TotalTypes + }; + + // The compiler metadata fields + std::string name_; //!< parameters name + ArgumentType type_; //!< type of argument + union { + uint size_; //!< number of arguments (for values and pointers only) + uint location_; //!< sampler's location (for samplers only) + }; + uint cbIdx_; //!< constant buffer index + uint cbPos_; 
//!< dword address in CB for the argument + std::string buf_; //!< buffer tag + uint index_; //!< buffer/image/sampler index + uint alignment_; //!< the required argument's alignment + ArgumentType dataType_; //!< data type of the argument + union { + struct { + uint uavBuf_ : 1; //!< UAV memory, no global heap + uint realloc_ : 1; //!< argument has to be reallocatedin the global heap + uint readOnly_ : 1; //!< Read only memory object + uint writeOnly_ : 1; //!< Write only memory object + uint readWrite_ : 1; //!< Read/Write memory object }; + uint value_; + } memory_; - // The compiler metadata fields - std::string name_; //!< parameters name - ArgumentType type_; //!< type of argument - union { - uint size_; //!< number of arguments (for values and pointers only) - uint location_; //!< sampler's location (for samplers only) - }; - uint cbIdx_; //!< constant buffer index - uint cbPos_; //!< dword address in CB for the argument - std::string buf_; //!< buffer tag - uint index_; //!< buffer/image/sampler index - uint alignment_; //!< the required argument's alignment - ArgumentType dataType_; //!< data type of the argument - union { - struct { - uint uavBuf_ : 1; //!< UAV memory, no global heap - uint realloc_ : 1; //!< argument has to be reallocatedin the global heap - uint readOnly_ : 1; //!< Read only memory object - uint writeOnly_ : 1; //!< Write only memory object - uint readWrite_ : 1; //!< Read/Write memory object - }; - uint value_; - } memory_; + std::string typeName_; //!< argument's type name + uint typeQualifier_; //!< argument's type qualifier - std::string typeName_; //!< argument's type name - uint typeQualifier_; //!< argument's type qualifier + //! Default constructor for the kernel argument + KernelArg(); - //! Default constructor for the kernel argument - KernelArg(); + //! Copy constructor for the kernel argument + KernelArg(const KernelArg& data); - //! Copy constructor for the kernel argument - KernelArg(const KernelArg& data); + //! 
Overloads operator= + KernelArg& operator=(const KernelArg& data); - //! Overloads operator= - KernelArg& operator=(const KernelArg& data); + //! Destructor of the kernel argument + ~KernelArg() { name_.clear(); } - //! Destructor of the kernel argument - ~KernelArg() { name_.clear(); } + /*! \brief Checks if this arguments requires a place in constant buffer + * + * \return True if we need CB + */ + bool isCbNeeded() const; - /*! \brief Checks if this arguments requires a place in constant buffer - * - * \return True if we need CB - */ - bool isCbNeeded() const; + /*! \brief Retrieves the argument's size + * + * \return Size of the current argument + */ + size_t size(bool gpuLayer //!< True if we want the argument's size for the GPU layer + ) const; - /*! \brief Retrieves the argument's size - * - * \return Size of the current argument - */ - size_t size( - bool gpuLayer //!< True if we want the argument's size for the GPU layer - ) const; + /*! \brief Retrieves the argument's type for the abstraction layer + * + * \return The argument's type in the abstraction layer format + */ + clk_value_type_t type() const; - /*! \brief Retrieves the argument's type for the abstraction layer - * - * \return The argument's type in the abstraction layer format - */ - clk_value_type_t type() const; + /*! \brief Retrieves the argument's address qualifier for the abstraction layer + * + * \return The argument's address qualifier in the abstraction layer format + */ + cl_kernel_arg_address_qualifier addressQualifier() const; - /*! \brief Retrieves the argument's address qualifier for the abstraction layer - * - * \return The argument's address qualifier in the abstraction layer format - */ - cl_kernel_arg_address_qualifier addressQualifier() const; + /*! \brief Retrieves the argument's access qualifier for the abstraction layer + * + * \return The argument's access qualifier in the abstraction layer format + */ + cl_kernel_arg_access_qualifier accessQualifier() const; - /*! 
\brief Retrieves the argument's access qualifier for the abstraction layer - * - * \return The argument's access qualifier in the abstraction layer format - */ - cl_kernel_arg_access_qualifier accessQualifier() const; + /*! \brief Retrieves the argument's type name for the abstraction layer + * + * \return The argument's type name + */ + const char* typeName() const { return typeName_.c_str(); } - /*! \brief Retrieves the argument's type name for the abstraction layer - * - * \return The argument's type name - */ - const char* typeName() const { return typeName_.c_str(); } - - /*! \brief Retrieves the argument's type qualifier for the abstraction layer - * - * \return The argument's type qualifier - */ - cl_kernel_arg_type_qualifier typeQualifier() const - { - switch (type_) { - case PointerConst: - case PointerHwConst: - return static_cast(typeQualifier_ | - CL_KERNEL_ARG_TYPE_CONST); - default: - return static_cast(typeQualifier_); - } + /*! \brief Retrieves the argument's type qualifier for the abstraction layer + * + * \return The argument's type qualifier + */ + cl_kernel_arg_type_qualifier typeQualifier() const { + switch (type_) { + case PointerConst: + case PointerHwConst: + return static_cast(typeQualifier_ | CL_KERNEL_ARG_TYPE_CONST); + default: + return static_cast(typeQualifier_); } + } - //! Special case for vectors with component size <= 16bit - const static uint VectorSizeLimit = 4; - size_t specialVector() const; + //! Special case for vectors with component size <= 16bit + const static uint VectorSizeLimit = 4; + size_t specialVector() const; }; -struct DataTypeConst -{ - const char* tagName_; //!< data type's name - KernelArg::ArgumentType type_; //!< data type +struct DataTypeConst { + const char* tagName_; //!< data type's name + KernelArg::ArgumentType type_; //!< data type }; //! 
Metadata description for parsing -struct MetaDataConst -{ - const char* typeName_; //!< parameters name - KernelArg::ArgumentType type_; //!< type of argument - struct - { - uint size_ : 1; //!< number of arguments - uint name_ : 1; //!< argument's name - uint resType_: 1; //!< argument's type - uint cbIdx_ : 1; //!< resource index CB, sampler or image - uint cbPos_ : 1; //!< dword address in CB for the argument - uint buf_ : 1; //!< buffer tag - uint reserved: 26; //!< reserved - }; +struct MetaDataConst { + const char* typeName_; //!< parameters name + KernelArg::ArgumentType type_; //!< type of argument + struct { + uint size_ : 1; //!< number of arguments + uint name_ : 1; //!< argument's name + uint resType_ : 1; //!< argument's type + uint cbIdx_ : 1; //!< resource index CB, sampler or image + uint cbPos_ : 1; //!< dword address in CB for the argument + uint buf_ : 1; //!< buffer tag + uint reserved : 26; //!< reserved + }; }; -const uint DescTotal = 15; -const uint BasicTypeTotal = 15; -const uint ArgStateTotal = DescTotal + BasicTypeTotal; +const uint DescTotal = 15; +const uint BasicTypeTotal = 15; +const uint ArgStateTotal = DescTotal + BasicTypeTotal; //! The constant array that describes different metadata properties extern const MetaDataConst ArgState[ArgStateTotal]; @@ -337,652 +317,603 @@ extern const uint DataTypeTotal; class Program; class NullProgram; -class CalImageReference : public amd::ReferenceCountedObject -{ -public: - //! Default constructor - CalImageReference(CALimage calImage): image_(calImage) {} +class CalImageReference : public amd::ReferenceCountedObject { + public: + //! Default constructor + CalImageReference(CALimage calImage) : image_(calImage) {} - //! Get CAL image - CALimage calImage() const { return image_; } + //! Get CAL image + CALimage calImage() const { return image_; } -protected: - //! Default destructor - ~CalImageReference(); + protected: + //! Default destructor + ~CalImageReference(); -private: - //! 
Disable copy constructor - CalImageReference(const CalImageReference&); + private: + //! Disable copy constructor + CalImageReference(const CalImageReference&); - //! Disable operator= - CalImageReference& operator=(const CalImageReference&); + //! Disable operator= + CalImageReference& operator=(const CalImageReference&); - CALimage image_; //!< CAL kernel image + CALimage image_; //!< CAL kernel image }; //! \class GPU NullKernel - Kernel for offline device -class NullKernel : public device::Kernel -{ -public: - typedef std::vector arguments_t; +class NullKernel : public device::Kernel { + public: + typedef std::vector arguments_t; - const static uint UavIdUndefined = 0xffff; + const static uint UavIdUndefined = 0xffff; - enum Flags { - LimitWorkgroup = 1 << 0, //!< Limits the workgroup size - PrintfOutput = 1 << 1, //!< Kernel has printf output - PrivateFixed = 1 << 2, //!< Kernel has printf output - ABI64bit = 1 << 3, //!< Kernel has 64 bit ABI - Unused0 = 1 << 4, //!< Unused - Unused1 = 1 << 5, //!< Unused - ImageEnable = 1 << 6, //!< Kernel uses images - ImageWrite = 1 << 7, //!< Kernel writes images - }; + enum Flags { + LimitWorkgroup = 1 << 0, //!< Limits the workgroup size + PrintfOutput = 1 << 1, //!< Kernel has printf output + PrivateFixed = 1 << 2, //!< Kernel has printf output + ABI64bit = 1 << 3, //!< Kernel has 64 bit ABI + Unused0 = 1 << 4, //!< Unused + Unused1 = 1 << 5, //!< Unused + ImageEnable = 1 << 6, //!< Kernel uses images + ImageWrite = 1 << 7, //!< Kernel writes images + }; - //! 
\enum Resource type for binding - enum ResourceType - { - Undefined = 0x00000000, //!< resource type will be detected - ConstantBuffer = 0x00000001, //!< resource is a constant buffer - GlobalBuffer = 0x00000002, //!< resource is a global buffer - ArgumentHeapBuffer = 0x00000004, //!< resource is an argument buffer - ArgumentBuffer = 0x00000005, //!< resource is an argument buffer - ArgumentImageRead = 0x00000006, //!< resource is an argument image read - ArgumentImageWrite = 0x00000007, //!< resource is an argument image write - ArgumentConstBuffer = 0x00000008, //!< resource is an argument const buffer - ArgumentCounter = 0x00000009, //!< resource is a global counter - ArgumentUavID = 0x0000000a, //!< resource is a dummy ID read - ArgumentCbID = 0x0000000b, //!< resource is a constant buffer - ArgumentPrintfID = 0x0000000c, //!< resource is a printf buffer - }; + //! \enum Resource type for binding + enum ResourceType { + Undefined = 0x00000000, //!< resource type will be detected + ConstantBuffer = 0x00000001, //!< resource is a constant buffer + GlobalBuffer = 0x00000002, //!< resource is a global buffer + ArgumentHeapBuffer = 0x00000004, //!< resource is an argument buffer + ArgumentBuffer = 0x00000005, //!< resource is an argument buffer + ArgumentImageRead = 0x00000006, //!< resource is an argument image read + ArgumentImageWrite = 0x00000007, //!< resource is an argument image write + ArgumentConstBuffer = 0x00000008, //!< resource is an argument const buffer + ArgumentCounter = 0x00000009, //!< resource is a global counter + ArgumentUavID = 0x0000000a, //!< resource is a dummy ID read + ArgumentCbID = 0x0000000b, //!< resource is a constant buffer + ArgumentPrintfID = 0x0000000c, //!< resource is a printf buffer + }; - //! GPU kernel constructor - NullKernel( - const std::string& name, //!< The kernel's name - const NullDevice& gpuNullDev, //!< GPU device object - const NullProgram& nullProg //!< Reference to the program - ); + //! 
GPU kernel constructor + NullKernel(const std::string& name, //!< The kernel's name + const NullDevice& gpuNullDev, //!< GPU device object + const NullProgram& nullProg //!< Reference to the program + ); - virtual ~NullKernel(); + virtual ~NullKernel(); - /*! \brief Creates a GPU kernel in CAL - * - * \return True if we successfully created a kernel in CAL - */ - bool create( - const std::string& code, //!< IL source code - const std::string& metadata, //!< the kernel metadata structure - const void* binaryCode = NULL, //!< binary machine code for CAL - size_t binarySize = 0 //!< the machine code size - ); + /*! \brief Creates a GPU kernel in CAL + * + * \return True if we successfully created a kernel in CAL + */ + bool create(const std::string& code, //!< IL source code + const std::string& metadata, //!< the kernel metadata structure + const void* binaryCode = NULL, //!< binary machine code for CAL + size_t binarySize = 0 //!< the machine code size + ); - //! Returns CAL function descriptor - CALimage calImage() const { return calRef_->calImage(); } + //! Returns CAL function descriptor + CALimage calImage() const { return calRef_->calImage(); } - //! Returns TRUE if we successfully retrieved the binary from CAL - bool getCalBinary( - void* binary, //!< ISA binary code - size_t size //!< ISA binary size - ) const; + //! Returns TRUE if we successfully retrieved the binary from CAL + bool getCalBinary(void* binary, //!< ISA binary code + size_t size //!< ISA binary size + ) const; - //! Returns CAL image size - size_t getCalBinarySize() const; + //! Returns CAL image size + size_t getCalBinarySize() const; - //! Returns GPU device object, associated with this kernel - const NullDevice& nullDev() const { return gpuDev_; } + //! Returns GPU device object, associated with this kernel + const NullDevice& nullDev() const { return gpuDev_; } - //! Returns GPU device object, associated with this kernel - const NullProgram& nullProg() const { return prog_; } + //! 
Returns GPU device object, associated with this kernel + const NullProgram& nullProg() const { return prog_; } - //! Returns the kernel's build error - const cl_int buildError() const { return buildError_; } + //! Returns the kernel's build error + const cl_int buildError() const { return buildError_; } - //! Returns the kernel's flags - uint flags() const { return flags_; } + //! Returns the kernel's flags + uint flags() const { return flags_; } - //! Returns TRUE if ABI is for 64 bits - bool abi64Bit() const { return (flags_ & ABI64bit) ? true : false; } + //! Returns TRUE if ABI is for 64 bits + bool abi64Bit() const { return (flags_ & ABI64bit) ? true : false; } - //! Returns the total number of all arguments - size_t argSize() const { return arguments_.size(); } + //! Returns the total number of all arguments + size_t argSize() const { return arguments_.size(); } - //! Returns instruction count of the current kernel - uint instructionCnt() const { return instructionCnt_; } + //! Returns instruction count of the current kernel + uint instructionCnt() const { return instructionCnt_; } -protected: - /*! \brief Parses the metadata structure for the kernel, - * provided by the OpenCL compiler - * - * \return True if we succefully parsed all arguments - */ - bool parseArguments( - const std::string& metaData, //!< the program for parsing - uint* uavRefCount //!< an array of reference counters for used UAVs - ); + protected: + /*! \brief Parses the metadata structure for the kernel, + * provided by the OpenCL compiler + * + * \return True if we succefully parsed all arguments + */ + bool parseArguments(const std::string& metaData, //!< the program for parsing + uint* uavRefCount //!< an array of reference counters for used UAVs + ); - //! Returns the argument for the specified index - const KernelArg* argument(uint idx) const { return arguments_[idx]; } + //! 
Returns the argument for the specified index + const KernelArg* argument(uint idx) const { return arguments_[idx]; } - //! Adds the kernel argument into the list - void addArgument(KernelArg* arg) { arguments_.push_back(arg); } + //! Adds the kernel argument into the list + void addArgument(KernelArg* arg) { arguments_.push_back(arg); } - //! Returns the argument for the specified sampler's index - const KernelArg* sampler(uint idx) const { return intSamplers_[idx]; } + //! Returns the argument for the specified sampler's index + const KernelArg* sampler(uint idx) const { return intSamplers_[idx]; } - //! Returns the total number of all internal samplers - size_t samplerSize() const { return intSamplers_.size(); } + //! Returns the total number of all internal samplers + size_t samplerSize() const { return intSamplers_.size(); } - //! Adds the kernel sampler into the sampler's list - void addSampler(KernelArg* arg) { intSamplers_.push_back(arg); } + //! Adds the kernel sampler into the sampler's list + void addSampler(KernelArg* arg) { intSamplers_.push_back(arg); } - //! Returns UAV raw index for this kernel - uint uavRaw() const { return uavRaw_; } + //! 
Returns UAV raw index for this kernel + uint uavRaw() const { return uavRaw_; } - cl_int buildError_; //!< Kernel's build error - std::string ilSource_; //!< IL source code of this kernel + cl_int buildError_; //!< Kernel's build error + std::string ilSource_; //!< IL source code of this kernel - const NullDevice& gpuDev_; //!< GPU device object - const NullProgram& prog_; //!< Reference to the parent program + const NullDevice& gpuDev_; //!< GPU device object + const NullProgram& prog_; //!< Reference to the parent program - CalImageReference* calRef_; //!< CAL image reference for this kernel - bool internal_; //!< Runtime internal ker + CalImageReference* calRef_; //!< CAL image reference for this kernel + bool internal_; //!< Runtime internal ker - uint flags_; //!< kernel object flags - arguments_t arguments_; //!< kernel arguments for the execution - arguments_t intSamplers_; //!< predefined intenal kernel samplers + uint flags_; //!< kernel object flags + arguments_t arguments_; //!< kernel arguments for the execution + arguments_t intSamplers_; //!< predefined intenal kernel samplers - size_t* cbSizes_; //!< real constant buffer sizes for this kernel - uint numCb_; //!< total number of constant buffers + size_t* cbSizes_; //!< real constant buffer sizes for this kernel + uint numCb_; //!< total number of constant buffers - uint uavRaw_; //!< UAV used for RAW access + uint uavRaw_; //!< UAV used for RAW access - bool rwAttributes_; //!< backend provides RW attributes for arguments + bool rwAttributes_; //!< backend provides RW attributes for arguments - uint instructionCnt_;//!< Instruction count + uint instructionCnt_; //!< Instruction count - uint cbId_; //!< UAV used for constant buffer access - uint printfId_; //!< UAV used for printf buffer access + uint cbId_; //!< UAV used for constant buffer access + uint printfId_; //!< UAV used for printf buffer access -private: - //! Disable copy constructor - NullKernel(const NullKernel&); + private: + //! 
Disable copy constructor + NullKernel(const NullKernel&); - //! Disable operator= - NullKernel& operator=(const NullKernel&); + //! Disable operator= + NullKernel& operator=(const NullKernel&); - //! Creates a filename for ISA/IL dumps - std::string mkDumpName( - const char* extension //!< File extension to append - ) const; + //! Creates a filename for ISA/IL dumps + std::string mkDumpName(const char* extension //!< File extension to append + ) const; - bool createMultiBinary( - uint* imageSize, //!< Multibinary image size - void** image, //!< Multibinary image - const void* isa //!< Kernel HW info - ); + bool createMultiBinary(uint* imageSize, //!< Multibinary image size + void** image, //!< Multibinary image + const void* isa //!< Kernel HW info + ); - //! SI HW specific setup for kernels - bool siCreateHwInfo( - const void* shader, //!< HW info shader - AMUabiAddEncoding& encoding //!< ABI encoding structure - ); + //! SI HW specific setup for kernels + bool siCreateHwInfo(const void* shader, //!< HW info shader + AMUabiAddEncoding& encoding //!< ABI encoding structure + ); - //! r800 HW specific setup for kernels - bool r800CreateHwInfo( - const void* shader, //!< HW info shader - AMUabiAddEncoding& encoding //!< ABI encoding structure - ); + //! r800 HW specific setup for kernels + bool r800CreateHwInfo(const void* shader, //!< HW info shader + AMUabiAddEncoding& encoding //!< ABI encoding structure + ); }; //! 
\class GPU kernel -class Kernel : public NullKernel -{ -public: - struct InitData { - uint privateSize_; //!< Private ring initial size - uint localSize_; //!< Local ring initial size - uint hwPrivateSize_; //!< HW private ring initial size - uint hwLocalSize_; //!< HW local ring initial size - uint flags_; //!< Kernel initialization flags - }; +class Kernel : public NullKernel { + public: + struct InitData { + uint privateSize_; //!< Private ring initial size + uint localSize_; //!< Local ring initial size + uint hwPrivateSize_; //!< HW private ring initial size + uint hwLocalSize_; //!< HW local ring initial size + uint flags_; //!< Kernel initialization flags + }; - //! GPU kernel constructor - Kernel( - const std::string& name, //!< The kernel's name - const Device& gpuDev, //!< GPU device object - const Program& prog, //!< Reference to the program - const InitData* initData_ //!< Initialization data - ); + //! GPU kernel constructor + Kernel(const std::string& name, //!< The kernel's name + const Device& gpuDev, //!< GPU device object + const Program& prog, //!< Reference to the program + const InitData* initData_ //!< Initialization data + ); - //! GPU kernel destructor - virtual ~Kernel(); + //! GPU kernel destructor + virtual ~Kernel(); - /*! \brief Creates a GPU kernel in CAL - * - * \return True if we successfully created a kernel in CAL - */ - bool create( - const std::string& code, //!< IL source code - const std::string& metadata, //!< the kernel metadata structure - const void* binaryCode = NULL, //!< binary machine code for CAL - size_t binarySize = 0 //!< the machine code size - ); + /*! \brief Creates a GPU kernel in CAL + * + * \return True if we successfully created a kernel in CAL + */ + bool create(const std::string& code, //!< IL source code + const std::string& metadata, //!< the kernel metadata structure + const void* binaryCode = NULL, //!< binary machine code for CAL + size_t binarySize = 0 //!< the machine code size + ); - //! 
Validates memory argument - virtual bool validateMemory( - uint idx, //!< Argument's index - amd::Memory* amdMem //!< AMD memory object for validation - ) const ; + //! Validates memory argument + virtual bool validateMemory(uint idx, //!< Argument's index + amd::Memory* amdMem //!< AMD memory object for validation + ) const; - //! Initializes the CAL program grid for the kernel execution - void setupProgramGrid( - VirtualGPU& gpu, //!< virtual GPU device object - size_t workDim, //!< work dimension - const amd::NDRange& glbWorkOffset, //!< global work offset - const amd::NDRange& gblWorkSize, //!< global work size - amd::NDRange& lclWorkSize, //!< local work size - const amd::NDRange& groupOffset, //!< group offsets - const amd::NDRange& glbWorkOffsetOrg, - const amd::NDRange& glbWorkSizeOrg //!< original global work size - ) const; + //! Initializes the CAL program grid for the kernel execution + void setupProgramGrid(VirtualGPU& gpu, //!< virtual GPU device object + size_t workDim, //!< work dimension + const amd::NDRange& glbWorkOffset, //!< global work offset + const amd::NDRange& gblWorkSize, //!< global work size + amd::NDRange& lclWorkSize, //!< local work size + const amd::NDRange& groupOffset, //!< group offsets + const amd::NDRange& glbWorkOffsetOrg, + const amd::NDRange& glbWorkSizeOrg //!< original global work size + ) const; - /*! \brief Detects if runtime has to disable cache optimization and - * recompiles the kernel - * - * \return True if aliases were detected in the kernel arguments - */ - void processMemObjects( - VirtualGPU& gpu, //!< Virtual GPU objects - queue - const amd::Kernel& kernel, //!< AMD kernel object for execution - const_address params, //!< pointer to the param's store - bool nativeMem //!< Native memory objects - ) const; + /*! 
\brief Detects if runtime has to disable cache optimization and + * recompiles the kernel + * + * \return True if aliases were detected in the kernel arguments + */ + void processMemObjects(VirtualGPU& gpu, //!< Virtual GPU objects - queue + const amd::Kernel& kernel, //!< AMD kernel object for execution + const_address params, //!< pointer to the param's store + bool nativeMem //!< Native memory objects + ) const; - /*! \brief Loads all kernel arguments, so we could run the kernel in HW. - * This includes CB update and resource binding - * - * \return True if we succefully loaded the arguments - */ - bool loadParameters( - VirtualGPU& gpu, //!< virtual GPU device object - const amd::Kernel& kernel, //!< AMD kernel object for execution - const_address params, //!< pointer to the param's store - bool nativeMem //!< Native memory objects - ) const; + /*! \brief Loads all kernel arguments, so we could run the kernel in HW. + * This includes CB update and resource binding + * + * \return True if we succefully loaded the arguments + */ + bool loadParameters(VirtualGPU& gpu, //!< virtual GPU device object + const amd::Kernel& kernel, //!< AMD kernel object for execution + const_address params, //!< pointer to the param's store + bool nativeMem //!< Native memory objects + ) const; - //! Binds the constant buffers associated with the kernel - bool bindConstantBuffers(VirtualGPU& gpu) const; + //! Binds the constant buffers associated with the kernel + bool bindConstantBuffers(VirtualGPU& gpu) const; - /*! \brief Runs the kernel on HW - * - * \return True if we succefully executed the kernel - */ - bool run( - VirtualGPU& gpu, //!< virtual GPU device object - GpuEvent* gpuEvent, //!< Pointer to the GPU event - bool lastRun, //!< Last run in the split execution - bool lastDoppCmd, //!< for last dopp submission kernel dispatch - bool pfpaDoppCmd //!< for PFPA dopp submission kernel dispatch - ) const; + /*! 
\brief Runs the kernel on HW + * + * \return True if we succefully executed the kernel + */ + bool run(VirtualGPU& gpu, //!< virtual GPU device object + GpuEvent* gpuEvent, //!< Pointer to the GPU event + bool lastRun, //!< Last run in the split execution + bool lastDoppCmd, //!< for last dopp submission kernel dispatch + bool pfpaDoppCmd //!< for PFPA dopp submission kernel dispatch + ) const; - //! Help function to debug the kernel output - void debug( - VirtualGPU& gpu //!< virtual GPU device object - ) const; + //! Help function to debug the kernel output + void debug(VirtualGPU& gpu //!< virtual GPU device object + ) const; - //! Programs internal samplers defined inside the kernel - bool setInternalSamplers( - VirtualGPU& gpu //!< Virtual GPU device object - ) const; + //! Programs internal samplers defined inside the kernel + bool setInternalSamplers(VirtualGPU& gpu //!< Virtual GPU device object + ) const; - //! Returns TRUE if we successfully retrieved the binary from CAL - bool getCalBinary( - void* binary, //!< ISA binary code - size_t size //!< ISA binary size - ) const; + //! Returns TRUE if we successfully retrieved the binary from CAL + bool getCalBinary(void* binary, //!< ISA binary code + size_t size //!< ISA binary size + ) const; - //! Returns CAL image size - size_t getCalBinarySize() const; + //! Returns CAL image size + size_t getCalBinarySize() const; - //! Returns GPU device object, associated with this kernel - const Device& dev() const; + //! Returns GPU device object, associated with this kernel + const Device& dev() const; - //! Returns GPU device object, associated with this kernel - const Program& prog() const; + //! Returns GPU device object, associated with this kernel + const Program& prog() const; - //! Binds global HW constant buffers - bool bindGlobalHwCb( - VirtualGPU& gpu, //!< Virtual GPU device object - VirtualGPU::GslKernelDesc* desc //!< Kernel descriptor - ) const; + //! 
Binds global HW constant buffers + bool bindGlobalHwCb(VirtualGPU& gpu, //!< Virtual GPU device object + VirtualGPU::GslKernelDesc* desc //!< Kernel descriptor + ) const; - //! Get profiling callback object - virtual amd::ProfilingCallback* getProfilingCallback( - const device::VirtualDevice *vdev){ - return waveLimiter_.getProfilingCallback(vdev); - } + //! Get profiling callback object + virtual amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdev) { + return waveLimiter_.getProfilingCallback(vdev); + } -protected: - //! Initializes the kernel parameters for the abstraction layer - bool initParameters(); + protected: + //! Initializes the kernel parameters for the abstraction layer + bool initParameters(); - /*! \brief Creates constant buffer resources, associated with the kernel - * - * \return TRUE if we succefully created constant buffers - */ - bool initConstBuffers(); + /*! \brief Creates constant buffer resources, associated with the kernel + * + * \return TRUE if we succefully created constant buffers + */ + bool initConstBuffers(); -private: - //! Disable copy constructor - Kernel(const Kernel&); + private: + //! Disable copy constructor + Kernel(const Kernel&); - //! Disable operator= - Kernel& operator=(const Kernel&); + //! Disable operator= + Kernel& operator=(const Kernel&); - //! \enum Fixed Metadata offsets - enum MetadataOffsets - { - GlobalWorkitemOffset = 0, - LocalWorkitemOffset = 1, - GroupsOffset = 2, - PrivateRingOffset = 3, - LocalRingOffset = 4, - MathLibOffset = 5, - GlobalWorkOffsetOffset = 6, - GroupWorkOffsetOffset = 7, - GlobalDataStoreOffset = 8, - DebugOffset = 8, - NDRangeGlobalWorkOffsetOffset = 9, + //! 
\enum Fixed Metadata offsets + enum MetadataOffsets { + GlobalWorkitemOffset = 0, + LocalWorkitemOffset = 1, + GroupsOffset = 2, + PrivateRingOffset = 3, + LocalRingOffset = 4, + MathLibOffset = 5, + GlobalWorkOffsetOffset = 6, + GroupWorkOffsetOffset = 7, + GlobalDataStoreOffset = 8, + DebugOffset = 8, + NDRangeGlobalWorkOffsetOffset = 9, - // The total number of constants reserved for ABI - TotalABIVectors - }; + // The total number of constants reserved for ABI + TotalABIVectors + }; - /*! \brief Sets the kernel argument - * - * \return True if we succefully updated the arguments - */ - bool setArgument( - VirtualGPU& gpu, //!< Virtual GPU device object - uint idx, //!< the argument index - const void* param, //!< the arguments data - size_t size, //!< size of the provided data - bool nativeMem //!< Native memory objects - ) const; + /*! \brief Sets the kernel argument + * + * \return True if we succefully updated the arguments + */ + bool setArgument(VirtualGPU& gpu, //!< Virtual GPU device object + uint idx, //!< the argument index + const void* param, //!< the arguments data + size_t size, //!< size of the provided data + bool nativeMem //!< Native memory objects + ) const; - /*! \brief Initializes local and private buffer ranges - * - * \return True if we succefully initialized the ranges - */ - bool initLocalPrivateRanges( - VirtualGPU& gpu //!< Virtual GPU device object - ) const; + /*! \brief Initializes local and private buffer ranges + * + * \return True if we succefully initialized the ranges + */ + bool initLocalPrivateRanges(VirtualGPU& gpu //!< Virtual GPU device object + ) const; - //! Sets local and private buffer ranges - void setLocalPrivateRanges( - VirtualGPU& gpu //!< Virtual GPU device object - ) const; + //! Sets local and private buffer ranges + void setLocalPrivateRanges(VirtualGPU& gpu //!< Virtual GPU device object + ) const; - //! 
Sets the sampler's parameters for the image look-up - void setSampler( - VirtualGPU& gpu, //!< virtual GPU device object - uint32_t state, //!< sampler state - uint physUnit //!< sampler's number - ) const; + //! Sets the sampler's parameters for the image look-up + void setSampler(VirtualGPU& gpu, //!< virtual GPU device object + uint32_t state, //!< sampler state + uint physUnit //!< sampler's number + ) const; - /*! \brief Binds resource - * - * \return True if we succefully created constant buffers - */ - bool bindResource( - VirtualGPU& gpu, //!< virtual GPU device object - const Memory& memory, //!< memory for binding - uint paramIdx, //!< index of the parameter - ResourceType type, //!< resource type - uint physUnit, //!< PhysUnit - size_t offset = 0 - ) const; + /*! \brief Binds resource + * + * \return True if we succefully created constant buffers + */ + bool bindResource(VirtualGPU& gpu, //!< virtual GPU device object + const Memory& memory, //!< memory for binding + uint paramIdx, //!< index of the parameter + ResourceType type, //!< resource type + uint physUnit, //!< PhysUnit + size_t offset = 0) const; - //! Unbinds all resources for the kernel - void unbindResources( - VirtualGPU& gpu, //!< virtual GPU device object - GpuEvent gpuEvent, //!< GPU event that will be associated with the resources - bool lastRun //!< last run in the split execution - ) const; + //! Unbinds all resources for the kernel + void unbindResources(VirtualGPU& gpu, //!< virtual GPU device object + GpuEvent gpuEvent, //!< GPU event that will be associated with the resources + bool lastRun //!< last run in the split execution + ) const; - //! Copies image constants to the constant buffer - void copyImageConstants( - const amd::Image* amdImage, //!< Abstraction layer image object - ImageConstants* imageData //!< Pointer in CB to the image constants - ) const; + //! 
Copies image constants to the constant buffer + void copyImageConstants(const amd::Image* amdImage, //!< Abstraction layer image object + ImageConstants* imageData //!< Pointer in CB to the image constants + ) const; - //! Finds local workgroup size - void findLocalWorkSize( - size_t workDim, //!< Work dimension - const amd::NDRange& gblWorkSize,//!< Global work size - amd::NDRange& lclWorkSize //!< Local work size - ) const; + //! Finds local workgroup size + void findLocalWorkSize(size_t workDim, //!< Work dimension + const amd::NDRange& gblWorkSize, //!< Global work size + amd::NDRange& lclWorkSize //!< Local work size + ) const; - uint hwPrivateSize_; //!< initial HW private size - uint hwLocalSize_; //!< initial HW local size + uint hwPrivateSize_; //!< initial HW private size + uint hwLocalSize_; //!< initial HW local size - WaveLimiterManager waveLimiter_; //!< adaptively control number of waves + WaveLimiterManager waveLimiter_; //!< adaptively control number of waves }; -enum HSAIL_ADDRESS_QUALIFIER{ - HSAIL_ADDRESS_ERROR = 0, - HSAIL_ADDRESS_GLOBAL, - HSAIL_ADDRESS_LOCAL, - HSAIL_MAX_ADDRESS_QUALIFIERS -} ; - -enum HSAIL_ARG_TYPE{ - HSAIL_ARGTYPE_ERROR = 0, - HSAIL_ARGTYPE_POINTER, - HSAIL_ARGTYPE_VALUE, - HSAIL_ARGTYPE_IMAGE, - HSAIL_ARGTYPE_SAMPLER, - HSAIL_ARGTYPE_QUEUE, - HSAIL_ARGMAX_ARG_TYPES +enum HSAIL_ADDRESS_QUALIFIER { + HSAIL_ADDRESS_ERROR = 0, + HSAIL_ADDRESS_GLOBAL, + HSAIL_ADDRESS_LOCAL, + HSAIL_MAX_ADDRESS_QUALIFIERS }; -enum HSAIL_DATA_TYPE{ - HSAIL_DATATYPE_ERROR = 0, - HSAIL_DATATYPE_B1, - HSAIL_DATATYPE_B8, - HSAIL_DATATYPE_B16, - HSAIL_DATATYPE_B32, - HSAIL_DATATYPE_B64, - HSAIL_DATATYPE_S8, - HSAIL_DATATYPE_S16, - HSAIL_DATATYPE_S32, - HSAIL_DATATYPE_S64, - HSAIL_DATATYPE_U8, - HSAIL_DATATYPE_U16, - HSAIL_DATATYPE_U32, - HSAIL_DATATYPE_U64, - HSAIL_DATATYPE_F16, - HSAIL_DATATYPE_F32, - HSAIL_DATATYPE_F64, - HSAIL_DATATYPE_STRUCT, - HSAIL_DATATYPE_OPAQUE, - HSAIL_DATATYPE_MAX_TYPES +enum HSAIL_ARG_TYPE { + HSAIL_ARGTYPE_ERROR = 0, + 
HSAIL_ARGTYPE_POINTER, + HSAIL_ARGTYPE_VALUE, + HSAIL_ARGTYPE_IMAGE, + HSAIL_ARGTYPE_SAMPLER, + HSAIL_ARGTYPE_QUEUE, + HSAIL_ARGMAX_ARG_TYPES +}; + +enum HSAIL_DATA_TYPE { + HSAIL_DATATYPE_ERROR = 0, + HSAIL_DATATYPE_B1, + HSAIL_DATATYPE_B8, + HSAIL_DATATYPE_B16, + HSAIL_DATATYPE_B32, + HSAIL_DATATYPE_B64, + HSAIL_DATATYPE_S8, + HSAIL_DATATYPE_S16, + HSAIL_DATATYPE_S32, + HSAIL_DATATYPE_S64, + HSAIL_DATATYPE_U8, + HSAIL_DATATYPE_U16, + HSAIL_DATATYPE_U32, + HSAIL_DATATYPE_U64, + HSAIL_DATATYPE_F16, + HSAIL_DATATYPE_F32, + HSAIL_DATATYPE_F64, + HSAIL_DATATYPE_STRUCT, + HSAIL_DATATYPE_OPAQUE, + HSAIL_DATATYPE_MAX_TYPES }; enum HSAIL_ACCESS_TYPE { - HSAIL_ACCESS_TYPE_NONE = 0, - HSAIL_ACCESS_TYPE_RO, - HSAIL_ACCESS_TYPE_WO, - HSAIL_ACCESS_TYPE_RW + HSAIL_ACCESS_TYPE_NONE = 0, + HSAIL_ACCESS_TYPE_RO, + HSAIL_ACCESS_TYPE_WO, + HSAIL_ACCESS_TYPE_RW }; -class HSAILKernel : public device::Kernel -{ -public: - struct Argument - { - std::string name_; //!< Argument's name - std::string typeName_; //!< Argument's type name - uint size_; //!< Size in bytes - uint offset_; //!< Argument's offset - uint alignment_; //!< Argument's alignment - HSAIL_ARG_TYPE type_; //!< Type of the argument - HSAIL_ADDRESS_QUALIFIER addrQual_; //!< Address qualifier of the argument - HSAIL_DATA_TYPE dataType_; //!< The type of data - uint numElem_; //!< Number of elements - HSAIL_ACCESS_TYPE access_; //!< Access type for the argument +class HSAILKernel : public device::Kernel { + public: + struct Argument { + std::string name_; //!< Argument's name + std::string typeName_; //!< Argument's type name + uint size_; //!< Size in bytes + uint offset_; //!< Argument's offset + uint alignment_; //!< Argument's alignment + HSAIL_ARG_TYPE type_; //!< Type of the argument + HSAIL_ADDRESS_QUALIFIER addrQual_; //!< Address qualifier of the argument + HSAIL_DATA_TYPE dataType_; //!< The type of data + uint numElem_; //!< Number of elements + HSAIL_ACCESS_TYPE access_; //!< Access type for the argument + }; + 
+ // Max number of possible extra (hidden) kernel arguments + static const uint MaxExtraArgumentsNum = 6; + + HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions, uint extraArgsNum); + + virtual ~HSAILKernel(); + + //! Initializes the metadata required for this kernel, + //! finalizes the kernel if needed + bool init(amd::hsa::loader::Symbol* sym, bool finalize = false); + + //! Returns true if memory is valid for execution + virtual bool validateMemory(uint idx, amd::Memory* amdMem) const; + + //! Returns a pointer to the hsail argument + const Argument* argument(size_t i) const { return arguments_[i]; } + + //! Returns the number of hsail arguments + size_t numArguments() const { return arguments_.size(); } + + //! Returns GPU device object, associated with this kernel + const Device& dev() const; + + //! Returns HSA program associated with this kernel + const HSAILProgram& prog() const; + + //! Returns LDS size used in this kernel + uint32_t ldsSize() const { return cpuAqlCode_->workgroup_group_segment_byte_size; } + + //! Returns pointer on CPU to AQL code info + const void* cpuAqlCode() const { return cpuAqlCode_; } + + //! Returns memory object with AQL code + gpu::Memory* gpuAqlCode() const { return code_; } + + //! Returns size of AQL code + size_t aqlCodeSize() const { return codeSize_; } + + //! Returns the size of argument buffer + size_t argsBufferSize() const { return cpuAqlCode_->kernarg_segment_byte_size; } + + //! Returns spill reg size per workitem + int spillSegSize() const { return cpuAqlCode_->workitem_private_segment_byte_size; } + + //! Returns TRUE if kernel uses dynamic parallelism + bool dynamicParallelism() const { return (flags_.dynamicParallelism_) ? true : false; } + + //! Returns TRUE if kernel is internal kernel + bool isInternalKernel() const { return (flags_.internalKernel_) ? true : false; } + + //! 
Finds local workgroup size + void findLocalWorkSize(size_t workDim, //!< Work dimension + const amd::NDRange& gblWorkSize, //!< Global work size + amd::NDRange& lclWorkSize //!< Local work size + ) const; + + //! Returns AQL packet in CPU memory + //! if the kerenl arguments were successfully loaded, otherwise NULL + hsa_kernel_dispatch_packet_t* loadArguments( + VirtualGPU& gpu, //!< Running GPU context + const amd::Kernel& kernel, //!< AMD kernel object + const amd::NDRangeContainer& sizes, //!< NDrange container + const_address parameters, //!< Application arguments for the kernel + bool nativeMem, //!< Native memory objectes are passed + uint64_t vmDefQueue, //!< GPU VM default queue pointer + uint64_t* vmParentWrap, //!< GPU VM parent aql wrap object + std::vector& memList //!< Memory list for GSL/VidMM handles + ) const; + + //! Returns pritnf info array + const std::vector& printfInfo() const { return printf_; } + + //! Returns the kernel index in the program + uint index() const { return index_; } + + //! Returns kernel's extra argument count + uint extraArgumentsNum() const { return extraArgumentsNum_; } + + //! Get profiling callback object + virtual amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdev) { + return waveLimiter_.getProfilingCallback(vdev); + } + + //! Get waves per shader array to be used for kernel execution. + uint getWavesPerSH(const device::VirtualDevice* vdev) const { + return waveLimiter_.getWavesPerSH(vdev); + } + + private: + //! Disable copy constructor + HSAILKernel(const HSAILKernel&); + + //! Disable operator= + HSAILKernel& operator=(const HSAILKernel&); + + //! Creates AQL kernel HW info + bool aqlCreateHWInfo(amd::hsa::loader::Symbol* sym); + + //! Initializes arguments_ and the abstraction layer kernel parameters + void initArgList(const aclArgData* aclArg //!< List of ACL arguments + ); + + //! 
Initializes Hsail Argument metadata and info + void initHsailArgs(const aclArgData* aclArg //!< List of ACL arguments + ); + + //! Initializes Hsail Printf metadata and info + void initPrintf(const aclPrintfFmt* aclPrintf //!< List of ACL printfs + ); + + std::vector arguments_; //!< Vector list of HSAIL Arguments + std::string compileOptions_; //!< compile used for finalizing this kernel + amd_kernel_code_t* cpuAqlCode_; //!< AQL kernel code on CPU + const NullDevice& dev_; //!< GPU device object + const HSAILProgram& prog_; //!< Reference to the parent program + std::vector printf_; //!< Format strings for GPU printf support + uint index_; //!< Kernel index in the program + + gpu::Memory* code_; //!< Memory object with ISA code + size_t codeSize_; //!< Size of ISA code + + char* hwMetaData_; //!< SI metadata + + uint extraArgumentsNum_; //! Number of extra (hidden) kernel arguments + + union Flags { + struct { + uint imageEna_ : 1; //!< Kernel uses images + uint imageWriteEna_ : 1; //!< Kernel uses image writes + uint dynamicParallelism_ : 1; //!< Dynamic parallelism enabled + uint internalKernel_ : 1; //!< True: internal kernel }; + uint value_; + Flags() : value_(0) {} + } flags_; - // Max number of possible extra (hidden) kernel arguments - static const uint MaxExtraArgumentsNum = 6; - - HSAILKernel(std::string name, - HSAILProgram* prog, - std::string compileOptions, - uint extraArgsNum); - - virtual ~HSAILKernel(); - - //! Initializes the metadata required for this kernel, - //! finalizes the kernel if needed - bool init(amd::hsa::loader::Symbol *sym, bool finalize = false); - - //! Returns true if memory is valid for execution - virtual bool validateMemory(uint idx, amd::Memory* amdMem) const; - - //! Returns a pointer to the hsail argument - const Argument* argument(size_t i) const { return arguments_[i]; } - - //! Returns the number of hsail arguments - size_t numArguments() const { return arguments_.size(); } - - //! 
Returns GPU device object, associated with this kernel - const Device& dev() const; - - //! Returns HSA program associated with this kernel - const HSAILProgram& prog() const; - - //! Returns LDS size used in this kernel - uint32_t ldsSize() const - { return cpuAqlCode_->workgroup_group_segment_byte_size; } - - //! Returns pointer on CPU to AQL code info - const void* cpuAqlCode() const { return cpuAqlCode_; } - - //! Returns memory object with AQL code - gpu::Memory* gpuAqlCode() const { return code_; } - - //! Returns size of AQL code - size_t aqlCodeSize() const { return codeSize_; } - - //! Returns the size of argument buffer - size_t argsBufferSize() const - { return cpuAqlCode_->kernarg_segment_byte_size; } - - //! Returns spill reg size per workitem - int spillSegSize() const - { return cpuAqlCode_->workitem_private_segment_byte_size; } - - //! Returns TRUE if kernel uses dynamic parallelism - bool dynamicParallelism() const - { return (flags_.dynamicParallelism_) ? true : false; } - - //! Returns TRUE if kernel is internal kernel - bool isInternalKernel() const - { return (flags_.internalKernel_) ? true : false; } - - //! Finds local workgroup size - void findLocalWorkSize( - size_t workDim, //!< Work dimension - const amd::NDRange& gblWorkSize,//!< Global work size - amd::NDRange& lclWorkSize //!< Local work size - ) const; - - //! Returns AQL packet in CPU memory - //! if the kerenl arguments were successfully loaded, otherwise NULL - hsa_kernel_dispatch_packet_t* loadArguments( - VirtualGPU& gpu, //!< Running GPU context - const amd::Kernel& kernel, //!< AMD kernel object - const amd::NDRangeContainer& sizes, //!< NDrange container - const_address parameters, //!< Application arguments for the kernel - bool nativeMem, //!< Native memory objectes are passed - uint64_t vmDefQueue, //!< GPU VM default queue pointer - uint64_t* vmParentWrap, //!< GPU VM parent aql wrap object - std::vector& memList //!< Memory list for GSL/VidMM handles - ) const; - - //! 
Returns pritnf info array - const std::vector& printfInfo() const { return printf_; } - - //! Returns the kernel index in the program - uint index() const { return index_; } - - //! Returns kernel's extra argument count - uint extraArgumentsNum() const { return extraArgumentsNum_; } - - //! Get profiling callback object - virtual amd::ProfilingCallback* getProfilingCallback( - const device::VirtualDevice *vdev){ - return waveLimiter_.getProfilingCallback(vdev); - } - - //! Get waves per shader array to be used for kernel execution. - uint getWavesPerSH(const device::VirtualDevice *vdev) const { - return waveLimiter_.getWavesPerSH(vdev); - } - -private: - //! Disable copy constructor - HSAILKernel(const HSAILKernel&); - - //! Disable operator= - HSAILKernel& operator=(const HSAILKernel&); - - //! Creates AQL kernel HW info - bool aqlCreateHWInfo(amd::hsa::loader::Symbol *sym); - - //! Initializes arguments_ and the abstraction layer kernel parameters - void initArgList( - const aclArgData* aclArg //!< List of ACL arguments - ); - - //! Initializes Hsail Argument metadata and info - void initHsailArgs( - const aclArgData* aclArg //!< List of ACL arguments - ); - - //! Initializes Hsail Printf metadata and info - void initPrintf( - const aclPrintfFmt* aclPrintf //!< List of ACL printfs - ); - - std::vector arguments_; //!< Vector list of HSAIL Arguments - std::string compileOptions_; //!< compile used for finalizing this kernel - amd_kernel_code_t* cpuAqlCode_; //!< AQL kernel code on CPU - const NullDevice& dev_; //!< GPU device object - const HSAILProgram& prog_; //!< Reference to the parent program - std::vector printf_; //!< Format strings for GPU printf support - uint index_; //!< Kernel index in the program - - gpu::Memory* code_; //!< Memory object with ISA code - size_t codeSize_; //!< Size of ISA code - - char* hwMetaData_; //!< SI metadata - - uint extraArgumentsNum_; //! 
Number of extra (hidden) kernel arguments - - union Flags { - struct { - uint imageEna_: 1; //!< Kernel uses images - uint imageWriteEna_: 1; //!< Kernel uses image writes - uint dynamicParallelism_: 1; //!< Dynamic parallelism enabled - uint internalKernel_: 1; //!< True: internal kernel - }; - uint value_; - Flags(): value_(0) {} - } flags_; - - WaveLimiterManager waveLimiter_; //!< adaptively control number of waves + WaveLimiterManager waveLimiter_; //!< adaptively control number of waves }; /*@}*/} // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpumemory.cpp b/rocclr/runtime/device/gpu/gpumemory.cpp index 0141603ff9..ba6e76b1de 100644 --- a/rocclr/runtime/device/gpu/gpumemory.cpp +++ b/rocclr/runtime/device/gpu/gpumemory.cpp @@ -12,7 +12,7 @@ #include "amdocl/cl_d3d9_amd.hpp" #include "amdocl/cl_d3d10_amd.hpp" #include "amdocl/cl_d3d11_amd.hpp" -#endif //_WIN32 +#endif //_WIN32 #include "amdocl/cl_gl_amd.hpp" #include @@ -22,1297 +22,1144 @@ //! Turn this on to enable sanity checks before and after every heap operation. 
#if DEBUG -#define EXTRA_HEAP_CHECKS 1 -#endif // DEBUG +#define EXTRA_HEAP_CHECKS 1 +#endif // DEBUG namespace gpu { -Memory::Memory( - const Device& gpuDev, - amd::Memory& owner, - size_t size) - : device::Memory(owner) - , Resource(gpuDev, size / Device::Heap::ElementSize, Device::Heap::ElementType) -{ - init(); +Memory::Memory(const Device& gpuDev, amd::Memory& owner, size_t size) + : device::Memory(owner), + Resource(gpuDev, size / Device::Heap::ElementSize, Device::Heap::ElementType) { + init(); - if (owner.parent() != NULL) { - flags_ |= SubMemoryObject; - } + if (owner.parent() != NULL) { + flags_ |= SubMemoryObject; + } } -Memory::Memory( - const Device& gpuDev, - size_t size) - : device::Memory(size) - , Resource(gpuDev, - amd::alignUp(size, Device::Heap::ElementSize) / - Device::Heap::ElementSize, Device::Heap::ElementType) -{ - init(); +Memory::Memory(const Device& gpuDev, size_t size) + : device::Memory(size), + Resource(gpuDev, amd::alignUp(size, Device::Heap::ElementSize) / Device::Heap::ElementSize, + Device::Heap::ElementType) { + init(); } -Memory::Memory( - const Device& gpuDev, - amd::Memory& owner, - size_t width, - cmSurfFmt format - ) - : device::Memory(owner) - , Resource(gpuDev, width, format) -{ - init(); +Memory::Memory(const Device& gpuDev, amd::Memory& owner, size_t width, cmSurfFmt format) + : device::Memory(owner), Resource(gpuDev, width, format) { + init(); - if (owner.parent() != NULL) { - flags_ |= SubMemoryObject; - } + if (owner.parent() != NULL) { + flags_ |= SubMemoryObject; + } } -Memory::Memory( - const Device& gpuDev, - size_t size, - size_t width, - cmSurfFmt format - ) - : device::Memory(size) - , Resource(gpuDev, width, format) -{ - init(); +Memory::Memory(const Device& gpuDev, size_t size, size_t width, cmSurfFmt format) + : device::Memory(size), Resource(gpuDev, width, format) { + init(); } -Memory::Memory( - const Device& gpuDev, - amd::Memory& owner, - size_t width, - size_t height, - size_t depth, - cmSurfFmt format, 
- gslChannelOrder chOrder, - cl_mem_object_type imageType, - uint mipLevels - ) - : device::Memory(owner) - , Resource(gpuDev, width, height, depth, format, chOrder, imageType, mipLevels) -{ - init(); +Memory::Memory(const Device& gpuDev, amd::Memory& owner, size_t width, size_t height, size_t depth, + cmSurfFmt format, gslChannelOrder chOrder, cl_mem_object_type imageType, + uint mipLevels) + : device::Memory(owner), + Resource(gpuDev, width, height, depth, format, chOrder, imageType, mipLevels) { + init(); - if (owner.parent() != NULL) { - flags_ |= SubMemoryObject; - } + if (owner.parent() != NULL) { + flags_ |= SubMemoryObject; + } } -Memory::Memory( - const Device& gpuDev, - size_t size, - size_t width, - size_t height, - size_t depth, - cmSurfFmt format, - gslChannelOrder chOrder, - cl_mem_object_type imageType, - uint mipLevels - ) - : device::Memory(size) - , Resource(gpuDev, width, height, depth, format, chOrder, imageType, mipLevels) -{ - init(); +Memory::Memory(const Device& gpuDev, size_t size, size_t width, size_t height, size_t depth, + cmSurfFmt format, gslChannelOrder chOrder, cl_mem_object_type imageType, + uint mipLevels) + : device::Memory(size), + Resource(gpuDev, width, height, depth, format, chOrder, imageType, mipLevels) { + init(); } -void -Memory::init() -{ - indirectMapCount_ = 0; - interopType_ = InteropNone; - interopMemory_ = NULL; - pinnedMemory_ = NULL; - parent_ = NULL; +void Memory::init() { + indirectMapCount_ = 0; + interopType_ = InteropNone; + interopMemory_ = NULL; + pinnedMemory_ = NULL; + parent_ = NULL; } #ifdef _WIN32 -static HANDLE -getSharedHandle(IUnknown* pIface) -{ - // Sanity checks - assert(pIface != NULL); +static HANDLE getSharedHandle(IUnknown* pIface) { + // Sanity checks + assert(pIface != NULL); - HRESULT hRes; - HANDLE hShared; - IDXGIResource* pDxgiRes = NULL; - if((hRes = (const_cast(pIface))->QueryInterface( - __uuidof(IDXGIResource), - (void**) &pDxgiRes)) != S_OK) { - return (HANDLE) 0; - } - 
if(!pDxgiRes) { - return (HANDLE) 0; - } - hRes = pDxgiRes->GetSharedHandle(&hShared); - pDxgiRes->Release(); - if(hRes != S_OK) { - return (HANDLE) 0; - } - return hShared; + HRESULT hRes; + HANDLE hShared; + IDXGIResource* pDxgiRes = NULL; + if ((hRes = (const_cast(pIface)) + ->QueryInterface(__uuidof(IDXGIResource), (void**)&pDxgiRes)) != S_OK) { + return (HANDLE)0; + } + if (!pDxgiRes) { + return (HANDLE)0; + } + hRes = pDxgiRes->GetSharedHandle(&hShared); + pDxgiRes->Release(); + if (hRes != S_OK) { + return (HANDLE)0; + } + return hShared; } -#endif //_WIN32 +#endif //_WIN32 -bool -Memory::create( - Resource::MemoryType memType, - Resource::CreateParams* params) -{ - bool result; +bool Memory::create(Resource::MemoryType memType, Resource::CreateParams* params) { + bool result; - // Reset the flag in case we reallocate the heap in local/remote - flags_ &= ~HostMemoryDirectAccess; + // Reset the flag in case we reallocate the heap in local/remote + flags_ &= ~HostMemoryDirectAccess; - // Create a resource in CAL - result = Resource::create(memType, params); + // Create a resource in CAL + result = Resource::create(memType, params); - // Check if CAL created a resource - if (result) { - switch (memoryType()) { - case Resource::Pinned: - case Resource::ExternalPhysical: - // Marks memory object for direct GPU access to the host memory - flags_ |= HostMemoryDirectAccess; - break; - case Resource::Remote: - case Resource::RemoteUSWC: - if (!cal()->tiled_) { - // Marks memory object for direct GPU access to the host memory - flags_ |= HostMemoryDirectAccess; - } - break; - case Resource::View: { - Resource::ViewParams* view = - reinterpret_cast(params); - if (view->resource_->memoryType() == Resource::Persistent) { - flags_ |= HostMemoryDirectAccess; - } - // Check if parent was allocated in system memory - if ((view->resource_->memoryType() == Resource::Pinned) || - (((view->resource_->memoryType() == Resource::Remote) || - (view->resource_->memoryType() == 
Resource::RemoteUSWC)) && - // @todo Enable unconditional optimization for remote memory - // Check for external allocation, to avoid the optimization - // for non-VM (double copy) mode - (owner() != NULL) && - ((owner()->getMemFlags() & CL_MEM_ALLOC_HOST_PTR) || - dev().settings().remoteAlloc_))) { - // Marks memory object for direct GPU access to the host memory - flags_ |= HostMemoryDirectAccess; - } - if ((view->owner_ != NULL) && (view->owner_->parent() != NULL)) { - parent_ = reinterpret_cast(view->memory_); - flags_ |= SubMemoryObject; - } - break; - } - case Resource::ImageView: { - Resource::ImageViewParams* view = - reinterpret_cast(params); - parent_ = reinterpret_cast(view->memory_); - flags_ |= SubMemoryObject | (parent_->flags_ & HostMemoryDirectAccess); - break; - } - case Resource::ImageBuffer: { - Resource::ImageBufferParams* view = - reinterpret_cast(params); - parent_ = reinterpret_cast(view->memory_); - flags_ |= SubMemoryObject | (parent_->flags_ & HostMemoryDirectAccess); - break; - } - default: - break; - } - } - - return result; -} - -bool Memory::processGLResource(GLResourceOP operation) -{ - bool retVal = false; - switch (operation) - { - case GLDecompressResource: - retVal = gslGLAcquire(); + // Check if CAL created a resource + if (result) { + switch (memoryType()) { + case Resource::Pinned: + case Resource::ExternalPhysical: + // Marks memory object for direct GPU access to the host memory + flags_ |= HostMemoryDirectAccess; break; - case GLInvalidateFBO: - retVal = gslGLRelease(); + case Resource::Remote: + case Resource::RemoteUSWC: + if (!cal()->tiled_) { + // Marks memory object for direct GPU access to the host memory + flags_ |= HostMemoryDirectAccess; + } + break; + case Resource::View: { + Resource::ViewParams* view = reinterpret_cast(params); + if (view->resource_->memoryType() == Resource::Persistent) { + flags_ |= HostMemoryDirectAccess; + } + // Check if parent was allocated in system memory + if 
((view->resource_->memoryType() == Resource::Pinned) || + (((view->resource_->memoryType() == Resource::Remote) || + (view->resource_->memoryType() == Resource::RemoteUSWC)) && + // @todo Enable unconditional optimization for remote memory + // Check for external allocation, to avoid the optimization + // for non-VM (double copy) mode + (owner() != NULL) && + ((owner()->getMemFlags() & CL_MEM_ALLOC_HOST_PTR) || dev().settings().remoteAlloc_))) { + // Marks memory object for direct GPU access to the host memory + flags_ |= HostMemoryDirectAccess; + } + if ((view->owner_ != NULL) && (view->owner_->parent() != NULL)) { + parent_ = reinterpret_cast(view->memory_); + flags_ |= SubMemoryObject; + } + break; + } + case Resource::ImageView: { + Resource::ImageViewParams* view = reinterpret_cast(params); + parent_ = reinterpret_cast(view->memory_); + flags_ |= SubMemoryObject | (parent_->flags_ & HostMemoryDirectAccess); + break; + } + case Resource::ImageBuffer: { + Resource::ImageBufferParams* view = reinterpret_cast(params); + parent_ = reinterpret_cast(view->memory_); + flags_ |= SubMemoryObject | (parent_->flags_ & HostMemoryDirectAccess); + break; + } + default: break; - default: - assert(false && "unknown GLResourceOP"); } - return retVal; + } + + return result; +} + +bool Memory::processGLResource(GLResourceOP operation) { + bool retVal = false; + switch (operation) { + case GLDecompressResource: + retVal = gslGLAcquire(); + break; + case GLInvalidateFBO: + retVal = gslGLRelease(); + break; + default: + assert(false && "unknown GLResourceOP"); + } + return retVal; } - -bool -Memory::createInterop(InteropType type) -{ - Resource::MemoryType memType = Resource::Empty; - Resource::OGLInteropParams oglRes; +bool Memory::createInterop(InteropType type) { + Resource::MemoryType memType = Resource::Empty; + Resource::OGLInteropParams oglRes; #ifdef _WIN32 - Resource::D3DInteropParams d3dRes; -#endif //_WIN32 + Resource::D3DInteropParams d3dRes; +#endif //_WIN32 - // Only 
external objects support interop - assert(owner() != NULL); + // Only external objects support interop + assert(owner() != NULL); - Resource::CreateParams* createParams = NULL; + Resource::CreateParams* createParams = NULL; - amd::InteropObject* interop = owner()->getInteropObj(); - assert((interop != NULL) && "An invalid interop object is impossible!"); + amd::InteropObject* interop = owner()->getInteropObj(); + assert((interop != NULL) && "An invalid interop object is impossible!"); - amd::GLObject* glObject = interop->asGLObject(); + amd::GLObject* glObject = interop->asGLObject(); #ifdef _WIN32 - amd::D3D10Object* d3d10Object = interop->asD3D10Object(); - amd::D3D11Object* d3d11Object = interop->asD3D11Object(); - amd::D3D9Object* d3d9Object = interop->asD3D9Object(); + amd::D3D10Object* d3d10Object = interop->asD3D10Object(); + amd::D3D11Object* d3d11Object = interop->asD3D11Object(); + amd::D3D9Object* d3d9Object = interop->asD3D9Object(); - if (d3d10Object != NULL) { - createParams = &d3dRes; + if (d3d10Object != NULL) { + createParams = &d3dRes; - d3dRes.owner_ = owner(); + d3dRes.owner_ = owner(); - const amd::D3D10ObjDesc_t* objDesc = d3d10Object->getObjDesc(); + const amd::D3D10ObjDesc_t* objDesc = d3d10Object->getObjDesc(); - memType = Resource::D3D10Interop; + memType = Resource::D3D10Interop; - // Get shared handle - if ((d3dRes.handle_ = - getSharedHandle(d3d10Object->getD3D10Resource()))) { - d3dRes.iDirect3D_ = static_cast - (d3d10Object->getD3D10Resource()); - d3dRes.type_ = Resource::InteropTypeless; - } - - d3dRes.misc = 0; - // Find D3D10 object type - switch (objDesc->objDim_) { - case D3D10_RESOURCE_DIMENSION_BUFFER: - d3dRes.type_ = Resource::InteropVertexBuffer; - break; - case D3D10_RESOURCE_DIMENSION_TEXTURE1D: - case D3D10_RESOURCE_DIMENSION_TEXTURE2D: - case D3D10_RESOURCE_DIMENSION_TEXTURE3D: - d3dRes.type_ = Resource::InteropTexture; - if (objDesc->mipLevels_ > 1) { - d3dRes.type_ = Resource::InteropTextureViewLevel; - - if 
(objDesc->arraySize_ > 1) { - d3dRes.layer_ = d3d10Object->getSubresource() / - objDesc->mipLevels_; - d3dRes.mipLevel_ = d3d10Object->getSubresource() % - objDesc->mipLevels_; - } - else { - d3dRes.layer_ = 0; - d3dRes.mipLevel_ = d3d10Object->getSubresource(); - } - } - break; - default: - return false; - break; - } + // Get shared handle + if ((d3dRes.handle_ = getSharedHandle(d3d10Object->getD3D10Resource()))) { + d3dRes.iDirect3D_ = static_cast(d3d10Object->getD3D10Resource()); + d3dRes.type_ = Resource::InteropTypeless; } - else if (d3d11Object != NULL) { - createParams = &d3dRes; - d3dRes.owner_ = owner(); + d3dRes.misc = 0; + // Find D3D10 object type + switch (objDesc->objDim_) { + case D3D10_RESOURCE_DIMENSION_BUFFER: + d3dRes.type_ = Resource::InteropVertexBuffer; + break; + case D3D10_RESOURCE_DIMENSION_TEXTURE1D: + case D3D10_RESOURCE_DIMENSION_TEXTURE2D: + case D3D10_RESOURCE_DIMENSION_TEXTURE3D: + d3dRes.type_ = Resource::InteropTexture; + if (objDesc->mipLevels_ > 1) { + d3dRes.type_ = Resource::InteropTextureViewLevel; - const amd::D3D11ObjDesc_t* objDesc = d3d11Object->getObjDesc(); - - memType = Resource::D3D11Interop; - - // Get shared handle - if ((d3dRes.handle_ = - getSharedHandle(d3d11Object->getD3D11Resource()))) { - d3dRes.iDirect3D_ = static_cast - (d3d11Object->getD3D11Resource()); - d3dRes.type_ = Resource::InteropTypeless; + if (objDesc->arraySize_ > 1) { + d3dRes.layer_ = d3d10Object->getSubresource() / objDesc->mipLevels_; + d3dRes.mipLevel_ = d3d10Object->getSubresource() % objDesc->mipLevels_; + } else { + d3dRes.layer_ = 0; + d3dRes.mipLevel_ = d3d10Object->getSubresource(); + } } - - d3dRes.misc = 0; - // Find D3D11 object type - switch (objDesc->objDim_) { - case D3D11_RESOURCE_DIMENSION_BUFFER: - d3dRes.type_ = Resource::InteropVertexBuffer; - break; - case D3D11_RESOURCE_DIMENSION_TEXTURE1D: - case D3D11_RESOURCE_DIMENSION_TEXTURE2D: - case D3D11_RESOURCE_DIMENSION_TEXTURE3D: - d3dRes.type_ = Resource::InteropTexture; - 
d3dRes.layer_= d3d11Object->getPlane(); - d3dRes.misc = d3d11Object->getMiscFlag(); - if (objDesc->mipLevels_ > 1) { - d3dRes.type_ = Resource::InteropTextureViewLevel; - - if (objDesc->arraySize_ > 1) { - d3dRes.layer_ = d3d11Object->getSubresource() / - objDesc->mipLevels_; - d3dRes.mipLevel_ = d3d11Object->getSubresource() % - objDesc->mipLevels_; - } - else { - d3dRes.layer_ = 0; - d3dRes.mipLevel_ = d3d11Object->getSubresource(); - } - } - break; - default: - return false; - break; - } - } - else if (d3d9Object != NULL) { - createParams = &d3dRes; - - d3dRes.owner_ = owner(); - - const amd::D3D9ObjDesc_t* objDesc = d3d9Object->getObjDesc(); - - memType = Resource::D3D9Interop; - - // Get shared handle - if ((d3dRes.handle_ = d3d9Object->getD3D9SharedHandle())) { - d3dRes.iDirect3D_ = static_cast - (d3d9Object->getD3D9Resource()); - d3dRes.type_ = Resource::InteropSurface; - d3dRes.mipLevel_ = 0; - d3dRes.layer_ = d3d9Object->getPlane(); - d3dRes.misc = d3d9Object->getMiscFlag(); - } - } - else -#endif //_WIN32 - if (glObject != NULL) { - createParams = &oglRes; - - oglRes.owner_ = owner(); - - memType = Resource::OGLInterop; - - // Fill the interop creation parameters - oglRes.handle_ = static_cast(glObject->getGLName()); - - // Find OGL object type - switch (glObject->getCLGLObjectType()) { - case CL_GL_OBJECT_BUFFER: - oglRes.type_ = Resource::InteropVertexBuffer; - break; - case CL_GL_OBJECT_TEXTURE_BUFFER: - case CL_GL_OBJECT_TEXTURE1D: - case CL_GL_OBJECT_TEXTURE1D_ARRAY: - case CL_GL_OBJECT_TEXTURE2D: - case CL_GL_OBJECT_TEXTURE2D_ARRAY: - case CL_GL_OBJECT_TEXTURE3D: - oglRes.type_ = Resource::InteropTexture; - if (GL_TEXTURE_CUBE_MAP == glObject->getGLTarget()) { - switch (glObject->getCubemapFace()) { - case GL_TEXTURE_CUBE_MAP_POSITIVE_X: - case GL_TEXTURE_CUBE_MAP_NEGATIVE_X: - case GL_TEXTURE_CUBE_MAP_POSITIVE_Y: - case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y: - case GL_TEXTURE_CUBE_MAP_POSITIVE_Z: - case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z: - oglRes.type_ = 
Resource::InteropTextureViewCube; - oglRes.layer_ = - glObject->getCubemapFace() - GL_TEXTURE_CUBE_MAP_POSITIVE_X; - oglRes.mipLevel_ = glObject->getGLMipLevel(); - break; - default: - break; - } - } - else if (glObject->getGLMipLevel() != 0) { - oglRes.type_ = Resource::InteropTextureViewLevel; - oglRes.layer_ = 0; - oglRes.mipLevel_ = glObject->getGLMipLevel(); - } - break; - case CL_GL_OBJECT_RENDERBUFFER: - oglRes.type_ = Resource::InteropRenderBuffer; - break; - default: - return false; - break; - } - - oglRes.glPlatformContext_ = owner()->getContext().info().hCtx_; - oglRes.glDeviceContext_ = owner()->getContext().info().hDev_[amd::Context::DeviceFlagIdx::GLDeviceKhrIdx]; - // We dont pass any flags here for the GL Resource. - oglRes.flags_ = 0; - } - else { + break; + default: return false; + break; + } + } else if (d3d11Object != NULL) { + createParams = &d3dRes; + + d3dRes.owner_ = owner(); + + const amd::D3D11ObjDesc_t* objDesc = d3d11Object->getObjDesc(); + + memType = Resource::D3D11Interop; + + // Get shared handle + if ((d3dRes.handle_ = getSharedHandle(d3d11Object->getD3D11Resource()))) { + d3dRes.iDirect3D_ = static_cast(d3d11Object->getD3D11Resource()); + d3dRes.type_ = Resource::InteropTypeless; } - // Get the interop settings - if (type == InteropDirectAccess) { - // Create memory object - if (!create(memType, createParams)) { - return false; + d3dRes.misc = 0; + // Find D3D11 object type + switch (objDesc->objDim_) { + case D3D11_RESOURCE_DIMENSION_BUFFER: + d3dRes.type_ = Resource::InteropVertexBuffer; + break; + case D3D11_RESOURCE_DIMENSION_TEXTURE1D: + case D3D11_RESOURCE_DIMENSION_TEXTURE2D: + case D3D11_RESOURCE_DIMENSION_TEXTURE3D: + d3dRes.type_ = Resource::InteropTexture; + d3dRes.layer_ = d3d11Object->getPlane(); + d3dRes.misc = d3d11Object->getMiscFlag(); + if (objDesc->mipLevels_ > 1) { + d3dRes.type_ = Resource::InteropTextureViewLevel; + + if (objDesc->arraySize_ > 1) { + d3dRes.layer_ = d3d11Object->getSubresource() / 
objDesc->mipLevels_; + d3dRes.mipLevel_ = d3d11Object->getSubresource() % objDesc->mipLevels_; + } else { + d3dRes.layer_ = 0; + d3dRes.mipLevel_ = d3d11Object->getSubresource(); + } } + break; + default: + return false; + break; } - else { - // Allocate Resource object for interop as buffer - interopMemory_ = new Memory(dev(), size(), - amd::alignUp(size(), Device::Heap::ElementSize) / Device::Heap::ElementSize, - Device::Heap::ElementType); + } else if (d3d9Object != NULL) { + createParams = &d3dRes; - // Create the interop object in CAL - if (NULL == interopMemory_ || !interopMemory_->create(memType, createParams)) { - delete interopMemory_; - interopMemory_ = NULL; - return false; + d3dRes.owner_ = owner(); + + const amd::D3D9ObjDesc_t* objDesc = d3d9Object->getObjDesc(); + + memType = Resource::D3D9Interop; + + // Get shared handle + if ((d3dRes.handle_ = d3d9Object->getD3D9SharedHandle())) { + d3dRes.iDirect3D_ = static_cast(d3d9Object->getD3D9Resource()); + d3dRes.type_ = Resource::InteropSurface; + d3dRes.mipLevel_ = 0; + d3dRes.layer_ = d3d9Object->getPlane(); + d3dRes.misc = d3d9Object->getMiscFlag(); + } + } else +#endif //_WIN32 + if (glObject != NULL) { + createParams = &oglRes; + + oglRes.owner_ = owner(); + + memType = Resource::OGLInterop; + + // Fill the interop creation parameters + oglRes.handle_ = static_cast(glObject->getGLName()); + + // Find OGL object type + switch (glObject->getCLGLObjectType()) { + case CL_GL_OBJECT_BUFFER: + oglRes.type_ = Resource::InteropVertexBuffer; + break; + case CL_GL_OBJECT_TEXTURE_BUFFER: + case CL_GL_OBJECT_TEXTURE1D: + case CL_GL_OBJECT_TEXTURE1D_ARRAY: + case CL_GL_OBJECT_TEXTURE2D: + case CL_GL_OBJECT_TEXTURE2D_ARRAY: + case CL_GL_OBJECT_TEXTURE3D: + oglRes.type_ = Resource::InteropTexture; + if (GL_TEXTURE_CUBE_MAP == glObject->getGLTarget()) { + switch (glObject->getCubemapFace()) { + case GL_TEXTURE_CUBE_MAP_POSITIVE_X: + case GL_TEXTURE_CUBE_MAP_NEGATIVE_X: + case GL_TEXTURE_CUBE_MAP_POSITIVE_Y: + case 
GL_TEXTURE_CUBE_MAP_NEGATIVE_Y: + case GL_TEXTURE_CUBE_MAP_POSITIVE_Z: + case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z: + oglRes.type_ = Resource::InteropTextureViewCube; + oglRes.layer_ = glObject->getCubemapFace() - GL_TEXTURE_CUBE_MAP_POSITIVE_X; + oglRes.mipLevel_ = glObject->getGLMipLevel(); + break; + default: + break; + } + } else if (glObject->getGLMipLevel() != 0) { + oglRes.type_ = Resource::InteropTextureViewLevel; + oglRes.layer_ = 0; + oglRes.mipLevel_ = glObject->getGLMipLevel(); } + break; + case CL_GL_OBJECT_RENDERBUFFER: + oglRes.type_ = Resource::InteropRenderBuffer; + break; + default: + return false; + break; } - setInteropType(type); + oglRes.glPlatformContext_ = owner()->getContext().info().hCtx_; + oglRes.glDeviceContext_ = + owner()->getContext().info().hDev_[amd::Context::DeviceFlagIdx::GLDeviceKhrIdx]; + // We dont pass any flags here for the GL Resource. + oglRes.flags_ = 0; + } else { + return false; + } - return true; + // Get the interop settings + if (type == InteropDirectAccess) { + // Create memory object + if (!create(memType, createParams)) { + return false; + } + } else { + // Allocate Resource object for interop as buffer + interopMemory_ = new Memory( + dev(), size(), amd::alignUp(size(), Device::Heap::ElementSize) / Device::Heap::ElementSize, + Device::Heap::ElementType); + + // Create the interop object in CAL + if (NULL == interopMemory_ || !interopMemory_->create(memType, createParams)) { + delete interopMemory_; + interopMemory_ = NULL; + return false; + } + } + + setInteropType(type); + + return true; } -Memory::~Memory() -{ - // Clean VA cache - dev().removeVACache(this); +Memory::~Memory() { + // Clean VA cache + dev().removeVACache(this); - delete interopMemory_; + delete interopMemory_; - // Release associated map target, if any - if (NULL != mapMemory_) { - mapMemory()->unmap(NULL); - mapMemory_->release(); + // Release associated map target, if any + if (NULL != mapMemory_) { + mapMemory()->unmap(NULL); + 
mapMemory_->release(); + } + + // Destory pinned memory + if (flags_ & PinnedMemoryAlloced) { + delete pinnedMemory_; + } + + if ((owner() != NULL) && isHostMemDirectAccess() && !(flags_ & SubMemoryObject) && + (memoryType() != Resource::ExternalPhysical)) { + // Unmap memory if direct access was requested + unmap(NULL); + } +} + +void Memory::syncCacheFromHost(VirtualGPU& gpu, device::Memory::SyncFlags syncFlags) { + // If the last writer was another GPU, then make a writeback + if (!isHostMemDirectAccess() && (owner()->getLastWriter() != NULL) && + (&dev() != owner()->getLastWriter())) { + mgpuCacheWriteBack(); + } + + // If host memory doesn't have direct access, then we have to synchronize + if (!isHostMemDirectAccess() && (NULL != owner()->getHostMem())) { + bool hasUpdates = true; + + // Make sure the parent of subbuffer is up to date + if (!syncFlags.skipParent_ && (flags_ & SubMemoryObject)) { + gpu::Memory* gpuMemory = dev().getGpuMemory(owner()->parent()); + + //! \note: Skipping the sync for a view doesn't reflect the parent settings, + //! since a view is a small portion of parent + device::Memory::SyncFlags syncFlagsTmp; + + // Sync parent from a view, so views have to be skipped + syncFlagsTmp.skipViews_ = true; + + // Make sure the parent sync is an unique operation. + // If the app uses multiple subbuffers from multiple queues, + // then the parent sync can be called from multiple threads + amd::ScopedLock lock(owner()->parent()->lockMemoryOps()); + gpuMemory->syncCacheFromHost(gpu, syncFlagsTmp); + //! \note Don't do early exit here, since we still have to sync + //! this view, if the parent sync operation was a NOP. + //! If parent was synchronized, then this view sync will be a NOP } - // Destory pinned memory + // Is this a NOP? 
+ if ((version_ == owner()->getVersion()) || (&dev() == owner()->getLastWriter())) { + hasUpdates = false; + } + + // Update all available views, since we sync the parent + if ((owner()->subBuffers().size() != 0) && (hasUpdates || !syncFlags.skipViews_)) { + device::Memory::SyncFlags syncFlagsTmp; + + // Sync views from parent, so parent has to be skipped + syncFlagsTmp.skipParent_ = true; + + if (hasUpdates) { + // Parent will be synced so update all views with a skip + syncFlagsTmp.skipEntire_ = true; + } else { + // Passthrough the skip entire flag to the views, since + // any view is a submemory of the parent + syncFlagsTmp.skipEntire_ = syncFlags.skipEntire_; + } + + amd::ScopedLock lock(owner()->lockMemoryOps()); + for (auto& sub : owner()->subBuffers()) { + //! \note Don't allow subbuffer's allocation in the worker thread. + //! It may cause a system lock, because possible resource + //! destruction, heap reallocation or subbuffer allocation + static const bool AllocSubBuffer = false; + device::Memory* devSub = sub->getDeviceMemory(dev(), AllocSubBuffer); + if (NULL != devSub) { + gpu::Memory* gpuSub = reinterpret_cast(devSub); + gpuSub->syncCacheFromHost(gpu, syncFlagsTmp); + } + } + } + + // Make sure we didn't have a NOP, + // because this GPU device was the last writer + if (&dev() != owner()->getLastWriter()) { + // Update the latest version + version_ = owner()->getVersion(); + } + + // Exit if sync is a NOP or sync can be skipped + if (!hasUpdates || syncFlags.skipEntire_) { + return; + } + + bool result = false; + static const bool Entire = true; + amd::Coord3D origin(0, 0, 0); + + // If host memory was pinned then make a transfer if (flags_ & PinnedMemoryAlloced) { - delete pinnedMemory_; + if (cal()->buffer_) { + amd::Coord3D region(owner()->getSize()); + result = gpu.blitMgr().copyBuffer(*pinnedMemory_, *this, origin, origin, region, Entire); + } else { + amd::Image& image = *static_cast(owner()); + result = 
gpu.blitMgr().copyBufferToImage(*pinnedMemory_, *this, origin, origin, + image.getRegion(), Entire, image.getRowPitch(), + image.getSlicePitch()); + } } - if ((owner() != NULL) && isHostMemDirectAccess() && - !(flags_ & SubMemoryObject) && - (memoryType() != Resource::ExternalPhysical)) { - // Unmap memory if direct access was requested - unmap(NULL); + if (!result) { + if (cal()->buffer_) { + amd::Coord3D region(owner()->getSize()); + result = gpu.blitMgr().writeBuffer(owner()->getHostMem(), *this, origin, region, Entire); + } else { + amd::Image& image = *static_cast(owner()); + result = gpu.blitMgr().writeImage(owner()->getHostMem(), *this, origin, image.getRegion(), + image.getRowPitch(), image.getSlicePitch(), Entire); + } } + + //!@todo A wait isn't really necessary. However + //! Linux no-VM may have extra random failures. + wait(gpu); + + // Should never fail + assert(result && "Memory synchronization failed!"); + } } -void -Memory::syncCacheFromHost(VirtualGPU& gpu, device::Memory::SyncFlags syncFlags) -{ - // If the last writer was another GPU, then make a writeback - if (!isHostMemDirectAccess() && - (owner()->getLastWriter() != NULL) && - (&dev() != owner()->getLastWriter())) { - mgpuCacheWriteBack(); +void Memory::syncHostFromCache(device::Memory::SyncFlags syncFlags) { + // Sanity checks + assert(owner() != NULL); + + // If host memory doesn't have direct access, then we have to synchronize + if (!isHostMemDirectAccess()) { + bool hasUpdates = true; + + // Make sure the parent of subbuffer is up to date + if (!syncFlags.skipParent_ && (flags_ & SubMemoryObject)) { + device::Memory* m = owner()->parent()->getDeviceMemory(dev()); + + //! \note: Skipping the sync for a view doesn't reflect the parent settings, + //! since a view is a small portion of parent + device::Memory::SyncFlags syncFlagsTmp; + + // Sync parent from a view, so views have to be skipped + syncFlagsTmp.skipViews_ = true; + + // Make sure the parent sync is an unique operation. 
+ // If the app uses multiple subbuffers from multiple queues, + // then the parent sync can be called from multiple threads + amd::ScopedLock lock(owner()->parent()->lockMemoryOps()); + m->syncHostFromCache(syncFlagsTmp); + //! \note Don't do early exit here, since we still have to sync + //! this view, if the parent sync operation was a NOP. + //! If parent was synchronized, then this view sync will be a NOP } - // If host memory doesn't have direct access, then we have to synchronize - if (!isHostMemDirectAccess() && (NULL != owner()->getHostMem())) { - bool hasUpdates = true; - - // Make sure the parent of subbuffer is up to date - if (!syncFlags.skipParent_ && (flags_ & SubMemoryObject)) { - gpu::Memory* gpuMemory = dev().getGpuMemory(owner()->parent()); - - //! \note: Skipping the sync for a view doesn't reflect the parent settings, - //! since a view is a small portion of parent - device::Memory::SyncFlags syncFlagsTmp; - - // Sync parent from a view, so views have to be skipped - syncFlagsTmp.skipViews_ = true; - - // Make sure the parent sync is an unique operation. - // If the app uses multiple subbuffers from multiple queues, - // then the parent sync can be called from multiple threads - amd::ScopedLock lock(owner()->parent()->lockMemoryOps()); - gpuMemory->syncCacheFromHost(gpu, syncFlagsTmp); - //! \note Don't do early exit here, since we still have to sync - //! this view, if the parent sync operation was a NOP. - //! If parent was synchronized, then this view sync will be a NOP - } - - // Is this a NOP? 
- if ((version_ == owner()->getVersion()) || - (&dev() == owner()->getLastWriter())) { - hasUpdates = false; - } - - // Update all available views, since we sync the parent - if ((owner()->subBuffers().size() != 0) && - (hasUpdates || !syncFlags.skipViews_)) { - device::Memory::SyncFlags syncFlagsTmp; - - // Sync views from parent, so parent has to be skipped - syncFlagsTmp.skipParent_ = true; - - if (hasUpdates) { - // Parent will be synced so update all views with a skip - syncFlagsTmp.skipEntire_ = true; - } - else { - // Passthrough the skip entire flag to the views, since - // any view is a submemory of the parent - syncFlagsTmp.skipEntire_ = syncFlags.skipEntire_; - } - - amd::ScopedLock lock(owner()->lockMemoryOps()); - for (auto& sub : owner()->subBuffers()) { - //! \note Don't allow subbuffer's allocation in the worker thread. - //! It may cause a system lock, because possible resource - //! destruction, heap reallocation or subbuffer allocation - static const bool AllocSubBuffer = false; - device::Memory* devSub = - sub->getDeviceMemory(dev(), AllocSubBuffer); - if (NULL != devSub) { - gpu::Memory* gpuSub = reinterpret_cast(devSub); - gpuSub->syncCacheFromHost(gpu, syncFlagsTmp); - } - } - } - - // Make sure we didn't have a NOP, - // because this GPU device was the last writer - if (&dev() != owner()->getLastWriter()) { - // Update the latest version - version_ = owner()->getVersion(); - } - - // Exit if sync is a NOP or sync can be skipped - if (!hasUpdates || syncFlags.skipEntire_) { - return; - } - - bool result = false; - static const bool Entire = true; - amd::Coord3D origin(0, 0, 0); - - // If host memory was pinned then make a transfer - if (flags_ & PinnedMemoryAlloced) { - if (cal()->buffer_) { - amd::Coord3D region(owner()->getSize()); - result = gpu.blitMgr().copyBuffer(*pinnedMemory_, - *this, origin, origin, region, Entire); - } - else { - amd::Image& image = *static_cast(owner()); - result = gpu.blitMgr().copyBufferToImage(*pinnedMemory_, - 
*this, origin, origin, image.getRegion(), Entire, - image.getRowPitch(), image.getSlicePitch()); - } - } - - if (!result) { - if (cal()->buffer_) { - amd::Coord3D region(owner()->getSize()); - result = gpu.blitMgr().writeBuffer(owner()->getHostMem(), - *this, origin, region, Entire); - } - else { - amd::Image& image = *static_cast(owner()); - result = gpu.blitMgr().writeImage(owner()->getHostMem(), - *this, origin, image.getRegion(), - image.getRowPitch(), image.getSlicePitch(), Entire); - } - } - - //!@todo A wait isn't really necessary. However - //! Linux no-VM may have extra random failures. - wait(gpu); - - // Should never fail - assert(result && "Memory synchronization failed!"); + // Is this a NOP? + if ((NULL == owner()->getLastWriter()) || (version_ == owner()->getVersion())) { + hasUpdates = false; } + + // Update all available views, since we sync the parent + if ((owner()->subBuffers().size() != 0) && (hasUpdates || !syncFlags.skipViews_)) { + device::Memory::SyncFlags syncFlagsTmp; + + // Sync views from parent, so parent has to be skipped + syncFlagsTmp.skipParent_ = true; + + if (hasUpdates) { + // Parent will be synced so update all views with a skip + syncFlagsTmp.skipEntire_ = true; + } else { + // Passthrough the skip entire flag to the views, since + // any view is a submemory of the parent + syncFlagsTmp.skipEntire_ = syncFlags.skipEntire_; + } + + amd::ScopedLock lock(owner()->lockMemoryOps()); + for (auto& sub : owner()->subBuffers()) { + //! \note Don't allow subbuffer's allocation in the worker thread. + //! It may cause a system lock, because possible resource + //! 
destruction, heap reallocation or subbuffer allocation + static const bool AllocSubBuffer = false; + device::Memory* devSub = sub->getDeviceMemory(dev(), AllocSubBuffer); + if (NULL != devSub) { + gpu::Memory* gpuSub = reinterpret_cast(devSub); + gpuSub->syncHostFromCache(syncFlagsTmp); + } + } + } + + // Make sure we didn't have a NOP, + // because CPU was the last writer + if (NULL != owner()->getLastWriter()) { + // Mark parent as up to date, set our version accordingly + version_ = owner()->getVersion(); + } + + // Exit if sync is a NOP or sync can be skipped + if (!hasUpdates || syncFlags.skipEntire_) { + return; + } + + bool result = false; + static const bool Entire = true; + amd::Coord3D origin(0, 0, 0); + + // If backing store was pinned then make a transfer + if (flags_ & PinnedMemoryAlloced) { + if (cal()->buffer_) { + amd::Coord3D region(owner()->getSize()); + result = dev().xferMgr().copyBuffer(*this, *pinnedMemory_, origin, origin, region, Entire); + } else { + amd::Image& image = *static_cast(owner()); + result = dev().xferMgr().copyImageToBuffer(*this, *pinnedMemory_, origin, origin, + image.getRegion(), Entire, image.getRowPitch(), + image.getSlicePitch()); + } + } + + // Just do a basic host read + if (!result) { + if (cal()->buffer_) { + amd::Coord3D region(owner()->getSize()); + result = dev().xferMgr().readBuffer(*this, owner()->getHostMem(), origin, region, Entire); + } else { + amd::Image& image = *static_cast(owner()); + result = dev().xferMgr().readImage(*this, owner()->getHostMem(), origin, image.getRegion(), + image.getRowPitch(), image.getSlicePitch(), Entire); + } + } + + // Should never fail + assert(result && "Memory synchronization failed!"); + } } -void -Memory::syncHostFromCache(device::Memory::SyncFlags syncFlags) -{ - // Sanity checks - assert(owner() != NULL); +gpu::Memory* Memory::createBufferView(amd::Memory& subBufferOwner) { + gpu::Memory* viewMemory; + Resource::ViewParams params; - // If host memory doesn't have direct 
access, then we have to synchronize - if (!isHostMemDirectAccess()) { - bool hasUpdates = true; + size_t offset = subBufferOwner.getOrigin(); + size_t size = subBufferOwner.getSize(); - // Make sure the parent of subbuffer is up to date - if (!syncFlags.skipParent_ && (flags_ & SubMemoryObject)) { - device::Memory* m = owner()->parent()->getDeviceMemory(dev()); + // Create a memory object + viewMemory = new gpu::Memory(dev(), subBufferOwner, size); + if (NULL == viewMemory) { + return NULL; + } - //! \note: Skipping the sync for a view doesn't reflect the parent settings, - //! since a view is a small portion of parent - device::Memory::SyncFlags syncFlagsTmp; + params.owner_ = &subBufferOwner; + params.gpu_ = static_cast(subBufferOwner.getVirtualDevice()); + params.offset_ = offset; + params.size_ = size; + params.resource_ = this; + params.memory_ = this; + if (!viewMemory->create(Resource::View, ¶ms)) { + delete viewMemory; + return NULL; + } - // Sync parent from a view, so views have to be skipped - syncFlagsTmp.skipViews_ = true; + // Explicitly set the host memory location, + // because the parent location could change after reallocation + if (NULL != owner()->getHostMem()) { + subBufferOwner.setHostMem(reinterpret_cast(owner()->getHostMem()) + offset); + } else { + subBufferOwner.setHostMem(NULL); + } - // Make sure the parent sync is an unique operation. - // If the app uses multiple subbuffers from multiple queues, - // then the parent sync can be called from multiple threads - amd::ScopedLock lock(owner()->parent()->lockMemoryOps()); - m->syncHostFromCache(syncFlagsTmp); - //! \note Don't do early exit here, since we still have to sync - //! this view, if the parent sync operation was a NOP. - //! If parent was synchronized, then this view sync will be a NOP - } - - // Is this a NOP? 
- if ((NULL == owner()->getLastWriter()) || - (version_ == owner()->getVersion())) { - hasUpdates = false; - } - - // Update all available views, since we sync the parent - if ((owner()->subBuffers().size() != 0) && - (hasUpdates || !syncFlags.skipViews_)) { - device::Memory::SyncFlags syncFlagsTmp; - - // Sync views from parent, so parent has to be skipped - syncFlagsTmp.skipParent_ = true; - - if (hasUpdates) { - // Parent will be synced so update all views with a skip - syncFlagsTmp.skipEntire_ = true; - } - else { - // Passthrough the skip entire flag to the views, since - // any view is a submemory of the parent - syncFlagsTmp.skipEntire_ = syncFlags.skipEntire_; - } - - amd::ScopedLock lock(owner()->lockMemoryOps()); - for (auto& sub : owner()->subBuffers()) { - //! \note Don't allow subbuffer's allocation in the worker thread. - //! It may cause a system lock, because possible resource - //! destruction, heap reallocation or subbuffer allocation - static const bool AllocSubBuffer = false; - device::Memory* devSub = - sub->getDeviceMemory(dev(), AllocSubBuffer); - if (NULL != devSub) { - gpu::Memory* gpuSub = reinterpret_cast(devSub); - gpuSub->syncHostFromCache(syncFlagsTmp); - } - } - } - - // Make sure we didn't have a NOP, - // because CPU was the last writer - if (NULL != owner()->getLastWriter()) { - // Mark parent as up to date, set our version accordingly - version_ = owner()->getVersion(); - } - - // Exit if sync is a NOP or sync can be skipped - if (!hasUpdates || syncFlags.skipEntire_) { - return; - } - - bool result = false; - static const bool Entire = true; - amd::Coord3D origin(0, 0, 0); - - // If backing store was pinned then make a transfer - if (flags_ & PinnedMemoryAlloced) { - if (cal()->buffer_) { - amd::Coord3D region(owner()->getSize()); - result = dev().xferMgr().copyBuffer(*this, - *pinnedMemory_, origin, origin, region, Entire); - } - else { - amd::Image& image = *static_cast(owner()); - result = 
dev().xferMgr().copyImageToBuffer(*this, - *pinnedMemory_, origin, origin, image.getRegion(), Entire, - image.getRowPitch(), image.getSlicePitch()); - } - } - - // Just do a basic host read - if (!result) { - if (cal()->buffer_) { - amd::Coord3D region(owner()->getSize()); - result = dev().xferMgr().readBuffer(*this, - owner()->getHostMem(), origin, region, Entire); - } - else { - amd::Image& image = *static_cast(owner()); - result = dev().xferMgr().readImage(*this, - owner()->getHostMem(), origin, image.getRegion(), - image.getRowPitch(), image.getSlicePitch(), Entire); - } - } - - // Should never fail - assert(result && "Memory synchronization failed!"); - } + return viewMemory; } -gpu::Memory* -Memory::createBufferView(amd::Memory& subBufferOwner) -{ - gpu::Memory* viewMemory; - Resource::ViewParams params; +void Memory::decIndMapCount() { + // Map/unmap must be serialized + amd::ScopedLock lock(owner()->lockMemoryOps()); - size_t offset = subBufferOwner.getOrigin(); - size_t size = subBufferOwner.getSize(); - - // Create a memory object - viewMemory = new gpu::Memory(dev(), subBufferOwner, size); - if (NULL == viewMemory) { - return NULL; + if (indirectMapCount_ == 0) { + if (!mipMapped()) { + LogError("decIndMapCount() called when indirectMapCount_ already zero"); } + return; + } - params.owner_ = &subBufferOwner; - params.gpu_ = static_cast(subBufferOwner.getVirtualDevice()); - params.offset_ = offset; - params.size_ = size; - params.resource_ = this; - params.memory_ = this; - if (!viewMemory->create(Resource::View, ¶ms)) { - delete viewMemory; - return NULL; - } - - // Explicitly set the host memory location, - // because the parent location could change after reallocation - if (NULL != owner()->getHostMem()) { - subBufferOwner.setHostMem( - reinterpret_cast(owner()->getHostMem()) + offset); - } - else { - subBufferOwner.setHostMem(NULL); - } - - return viewMemory; -} - -void -Memory::decIndMapCount() -{ - // Map/unmap must be serialized - amd::ScopedLock 
lock(owner()->lockMemoryOps()); - - if (indirectMapCount_ == 0) { - if (!mipMapped()) { - LogError("decIndMapCount() called when indirectMapCount_ already zero"); - } - return; - } - - // Decrement the counter and release indirect map if it's the last op - if (--indirectMapCount_ == 0) { - if (NULL != mapMemory_) { - amd::Memory* memory = mapMemory_; - amd::Memory* empty = NULL; - - // Get GPU memory - Memory* gpuMemory = mapMemory(); - gpuMemory->unmap(NULL); - - if (!dev().addMapTarget(memory)) { - memory->release(); - } - - // Map/unamp is serialized for the same memory object, - // so it's safe to clear the pointer - assert((mapMemory_ != NULL) && "Mapped buffer should be valid"); - mapMemory_ = NULL; - } + // Decrement the counter and release indirect map if it's the last op + if (--indirectMapCount_ == 0) { + if (NULL != mapMemory_) { + amd::Memory* memory = mapMemory_; + amd::Memory* empty = NULL; + + // Get GPU memory + Memory* gpuMemory = mapMemory(); + gpuMemory->unmap(NULL); + + if (!dev().addMapTarget(memory)) { + memory->release(); + } + + // Map/unamp is serialized for the same memory object, + // so it's safe to clear the pointer + assert((mapMemory_ != NULL) && "Mapped buffer should be valid"); + mapMemory_ = NULL; } + } } // Note - must be called by the device under the async lock, so no spinning // or long pauses allowed in this function. 
-void* -Memory::allocMapTarget( - const amd::Coord3D& origin, - const amd::Coord3D& region, - uint mapFlags, - size_t* rowPitch, - size_t* slicePitch) -{ - // Sanity checks - assert(owner() != NULL); +void* Memory::allocMapTarget(const amd::Coord3D& origin, const amd::Coord3D& region, uint mapFlags, + size_t* rowPitch, size_t* slicePitch) { + // Sanity checks + assert(owner() != NULL); - // Map/unmap must be serialized - amd::ScopedLock lock(owner()->lockMemoryOps()); + // Map/unmap must be serialized + amd::ScopedLock lock(owner()->lockMemoryOps()); - address mapAddress = NULL; - size_t offset = origin[0]; + address mapAddress = NULL; + size_t offset = origin[0]; - //For SVM implementation, we cannot use cached map. if svm space, use the svm host pointer - void *initHostPtr = owner()->getSvmPtr(); - if (NULL != initHostPtr) { - owner()->commitSvmMemory(); + // For SVM implementation, we cannot use cached map. if svm space, use the svm host pointer + void* initHostPtr = owner()->getSvmPtr(); + if (NULL != initHostPtr) { + owner()->commitSvmMemory(); + } + + if (owner()->numDevices() > 1) { + if ((NULL == initHostPtr) && (owner()->getHostMem() == NULL)) { + static const bool forceAllocHostMem = true; + if (!owner()->allocHostMemory(NULL, forceAllocHostMem)) { + return NULL; + } + //! \note Ignore pinning result + // bool ok = pinSystemMemory(owner()->getHostMem(), owner()->getSize()); + } + } + + incIndMapCount(); + // If host memory exists, use it + if ((owner()->getHostMem() != NULL) && isDirectMap()) { + mapAddress = reinterpret_cast
(owner()->getHostMem()); + } + // If resource is a persistent allocation, we can use it directly + else if (isPersistentDirectMap()) { + if (NULL == map(NULL)) { + LogError("Could not map target persistent resource"); + decIndMapCount(); + return NULL; + } + mapAddress = data(); + } + // Otherwise we can use a remote resource: + else { + // Are we in range? + size_t elementCount = cal()->width_; + size_t rSize = elementCount * elementSize(); + if (offset >= rSize || offset + region[0] > rSize) { + LogWarning("Memory::allocMapTarget() - offset/size out of bounds"); + return NULL; } - if (owner()->numDevices() > 1) { - if ((NULL == initHostPtr) && (owner()->getHostMem() == NULL)) { - static const bool forceAllocHostMem = true; - if (!owner()->allocHostMemory(NULL, forceAllocHostMem)) { - return NULL; - } - //! \note Ignore pinning result - //bool ok = pinSystemMemory(owner()->getHostMem(), owner()->getSize()); + // Allocate a map resource if there isn't any yet + if (indirectMapCount_ == 1) { + const static bool SysMem = true; + bool failed = false; + amd::Memory* memory = NULL; + // Search for a possible indirect resource + cl_mem_flags flag = 0; + bool canBeCached = true; + if (NULL != initHostPtr) { + // make sure the host memory is committed already, or we have a big problem. 
+ assert(owner()->isSvmPtrCommited() && "The host svm memory not committed yet!"); + flag = CL_MEM_USE_HOST_PTR; + canBeCached = false; + } else { + memory = dev().findMapTarget(owner()->getSize()); + } + + if (memory == NULL) { + // for map target of svm buffer , we need use svm host ptr + memory = new (dev().context()) amd::Buffer(dev().context(), flag, owner()->getSize()); + Memory* gpuMemory; + + do { + if ((memory == NULL) || !memory->create(initHostPtr, SysMem)) { + failed = true; + break; + } + memory->setCacheStatus(canBeCached); + + gpuMemory = reinterpret_cast(memory->getDeviceMemory(dev())); + + // Create, Map and get the base pointer for the resource + if ((gpuMemory == NULL) || (NULL == gpuMemory->map(NULL))) { + failed = true; + break; + } + } while (false); + } + + if (failed) { + if (memory != NULL) { + memory->release(); } + decIndMapCount(); + LogError("Could not map target resource"); + return NULL; + } + + // Map/unamp is serialized for the same memory object, + // so it's safe to assign the new pointer + assert((mapMemory_ == NULL) && "Mapped buffer can't be valid"); + mapMemory_ = memory; + } else { + // Did the map resource allocation fail? + if (mapMemory_ == NULL) { + LogError("Could not map target resource"); + return NULL; + } } + mapAddress = mapMemory()->data(); + } - incIndMapCount(); - // If host memory exists, use it - if ((owner()->getHostMem() != NULL) && isDirectMap()) { - mapAddress = reinterpret_cast
(owner()->getHostMem()); - } - // If resource is a persistent allocation, we can use it directly - else if (isPersistentDirectMap()) { - if (NULL == map(NULL)) { - LogError("Could not map target persistent resource"); - decIndMapCount(); - return NULL; - } - mapAddress = data(); - } - // Otherwise we can use a remote resource: - else { - // Are we in range? - size_t elementCount = cal()->width_; - size_t rSize = elementCount * elementSize(); - if (offset >= rSize || offset + region[0] > rSize) { - LogWarning("Memory::allocMapTarget() - offset/size out of bounds"); - return NULL; - } - - // Allocate a map resource if there isn't any yet - if (indirectMapCount_ == 1) { - const static bool SysMem = true; - bool failed = false; - amd::Memory* memory = NULL; - // Search for a possible indirect resource - cl_mem_flags flag = 0; - bool canBeCached = true; - if (NULL != initHostPtr) { - //make sure the host memory is committed already, or we have a big problem. - assert(owner()->isSvmPtrCommited() && "The host svm memory not committed yet!"); - flag = CL_MEM_USE_HOST_PTR; - canBeCached = false; - } - else { - memory = dev().findMapTarget(owner()->getSize()); - } - - if (memory == NULL) { - // for map target of svm buffer , we need use svm host ptr - memory = new(dev().context()) - amd::Buffer(dev().context(), flag, owner()->getSize()); - Memory* gpuMemory; - - do { - if ((memory == NULL) || !memory->create(initHostPtr, SysMem)) { - failed = true; - break; - } - memory->setCacheStatus(canBeCached); - - gpuMemory = reinterpret_cast - (memory->getDeviceMemory(dev())); - - // Create, Map and get the base pointer for the resource - if ((gpuMemory == NULL) || (NULL == gpuMemory->map(NULL))) { - failed = true; - break; - } - } - while (false); - } - - if (failed) { - if (memory != NULL) { - memory->release(); - } - decIndMapCount(); - LogError("Could not map target resource"); - return NULL; - } - - // Map/unamp is serialized for the same memory object, - // so it's safe to 
assign the new pointer - assert((mapMemory_ == NULL) && "Mapped buffer can't be valid"); - mapMemory_ = memory; - } - else { - // Did the map resource allocation fail? - if (mapMemory_ == NULL) { - LogError("Could not map target resource"); - return NULL; - } - } - mapAddress = mapMemory()->data(); - } - - return mapAddress + offset; + return mapAddress + offset; } -bool -Memory::pinSystemMemory(void* hostPtr, size_t size) -{ - bool result = false; +bool Memory::pinSystemMemory(void* hostPtr, size_t size) { + bool result = false; - // If memory has a direct access already, then skip the host memory pinning - if (isHostMemDirectAccess()) { - return true; - } - - // Check if memory is pinned already - if (flags_ & PinnedMemoryAlloced) { - return true; - } - - // Allocate memory for the pinned object - pinnedMemory_ = new Memory(dev(), size); - - if (pinnedMemory_ == NULL) { - return false; - } - - // Check if it's a view - if (flags_ & SubMemoryObject) { - const gpu::Memory* gpuMemory; - if (owner() != NULL) { - gpuMemory = dev().getGpuMemory(owner()->parent()); - } - else { - gpuMemory = parent(); - } - - if (gpuMemory->flags_ & PinnedMemoryAlloced) { - Resource::ViewParams params; - params.owner_ = owner(); - params.offset_ = owner()->getOrigin(); - params.size_ = owner()->getSize(); - params.resource_ = gpuMemory->pinnedMemory_; - params.memory_ = NULL; - result = pinnedMemory_->create(Resource::View, ¶ms); - } - } - else { - Resource::PinnedParams params; - // Fill resource creation parameters - params.owner_ = owner(); - params.hostMemRef_ = owner()->getHostMemRef(); - params.size_ = size; - - // Create resource - result = pinnedMemory_->create(Resource::Pinned, ¶ms); - } - - if (!result) { - delete pinnedMemory_; - pinnedMemory_ = NULL; - return false; - } - - flags_ |= PinnedMemoryAlloced; + // If memory has a direct access already, then skip the host memory pinning + if (isHostMemDirectAccess()) { return true; -} + } -void* -Memory::cpuMap( - 
device::VirtualDevice& vDev, uint flags, - uint startLayer, uint numLayers, - size_t* rowPitch, - size_t* slicePitch) -{ - uint resFlags = 0; - if (flags == Memory::CpuReadOnly) { - resFlags = Resource::ReadOnly; - } - else if (flags == Memory::CpuWriteOnly) { - resFlags = Resource::WriteOnly; + // Check if memory is pinned already + if (flags_ & PinnedMemoryAlloced) { + return true; + } + + // Allocate memory for the pinned object + pinnedMemory_ = new Memory(dev(), size); + + if (pinnedMemory_ == NULL) { + return false; + } + + // Check if it's a view + if (flags_ & SubMemoryObject) { + const gpu::Memory* gpuMemory; + if (owner() != NULL) { + gpuMemory = dev().getGpuMemory(owner()->parent()); + } else { + gpuMemory = parent(); } - void* ptr = map(&static_cast(vDev), resFlags, startLayer, numLayers); - if (!cal()->buffer_) { - *rowPitch = cal()->pitch_ * elementSize(); - *slicePitch = cal()->slice_ * elementSize(); + if (gpuMemory->flags_ & PinnedMemoryAlloced) { + Resource::ViewParams params; + params.owner_ = owner(); + params.offset_ = owner()->getOrigin(); + params.size_ = owner()->getSize(); + params.resource_ = gpuMemory->pinnedMemory_; + params.memory_ = NULL; + result = pinnedMemory_->create(Resource::View, ¶ms); } - return ptr; + } else { + Resource::PinnedParams params; + // Fill resource creation parameters + params.owner_ = owner(); + params.hostMemRef_ = owner()->getHostMemRef(); + params.size_ = size; + + // Create resource + result = pinnedMemory_->create(Resource::Pinned, ¶ms); + } + + if (!result) { + delete pinnedMemory_; + pinnedMemory_ = NULL; + return false; + } + + flags_ |= PinnedMemoryAlloced; + return true; } -void -Memory::cpuUnmap(device::VirtualDevice& vDev) -{ - unmap(&static_cast(vDev)); +void* Memory::cpuMap(device::VirtualDevice& vDev, uint flags, uint startLayer, uint numLayers, + size_t* rowPitch, size_t* slicePitch) { + uint resFlags = 0; + if (flags == Memory::CpuReadOnly) { + resFlags = Resource::ReadOnly; + } else if (flags == 
Memory::CpuWriteOnly) { + resFlags = Resource::WriteOnly; + } + + void* ptr = map(&static_cast(vDev), resFlags, startLayer, numLayers); + if (!cal()->buffer_) { + *rowPitch = cal()->pitch_ * elementSize(); + *slicePitch = cal()->slice_ * elementSize(); + } + return ptr; } +void Memory::cpuUnmap(device::VirtualDevice& vDev) { unmap(&static_cast(vDev)); } + //! \note moveTo() must be called only from outside of //! VirtualGPU submit command methods. //! Otherwise a deadlock in lockVgpus() is possible. //! Also the logic in this function is very specific to //! the zero-copy functionality. -bool -Memory::moveTo(Memory& dst) -{ - bool result = false; +bool Memory::moveTo(Memory& dst) { + bool result = false; - // Make sure that all virtual devices don't process any commands - Device::ScopedLockVgpus lock(dev()); + // Make sure that all virtual devices don't process any commands + Device::ScopedLockVgpus lock(dev()); - // Wait for idle on all virtual GPUs - //!@note It's enough to wait on the active queue only - for (uint idx = 0; idx < dev().vgpus().size(); ++idx) { - wait(*(dev().vgpus()[idx])); - } + // Wait for idle on all virtual GPUs + //!@note It's enough to wait on the active queue only + for (uint idx = 0; idx < dev().vgpus().size(); ++idx) { + wait(*(dev().vgpus()[idx])); + } - static const bool Entire = true; - amd::Coord3D origin(0, 0, 0); - amd::Coord3D region(size()); + static const bool Entire = true; + amd::Coord3D origin(0, 0, 0); + amd::Coord3D region(size()); - // Transfer the data from old location to a new one - if (dev().xferMgr().copyBuffer( - *this, dst, origin, origin, region, Entire)) { - // Move all properties to the new object - dst.mapMemory_ = mapMemory_; - mapMemory_ = NULL; + // Transfer the data from old location to a new one + if (dev().xferMgr().copyBuffer(*this, dst, origin, origin, region, Entire)) { + // Move all properties to the new object + dst.mapMemory_ = mapMemory_; + mapMemory_ = NULL; - dst.flags_ |= flags_ & 
~HostMemoryDirectAccess; - flags_ &= HostMemoryDirectAccess; + dst.flags_ |= flags_ & ~HostMemoryDirectAccess; + flags_ &= HostMemoryDirectAccess; - dst.indirectMapCount_ = indirectMapCount_; - indirectMapCount_ = 0; + dst.indirectMapCount_ = indirectMapCount_; + indirectMapCount_ = 0; - dst.pinnedMemory_ = pinnedMemory_; - pinnedMemory_ = NULL; + dst.pinnedMemory_ = pinnedMemory_; + pinnedMemory_ = NULL; - // Replace the device memory object - //! @note: current object will be destroyed - owner()->replaceDeviceMemory(&dev(), &dst); - result = true; - } + // Replace the device memory object + //! @note: current object will be destroyed + owner()->replaceDeviceMemory(&dev(), &dst); + result = true; + } - return result; + return result; } -Memory* -Memory::mapMemory() const -{ - Memory* map = NULL; - if (NULL != mapMemory_) { - map = reinterpret_cast(mapMemory_->getDeviceMemory(dev())); - } - return map; +Memory* Memory::mapMemory() const { + Memory* map = NULL; + if (NULL != mapMemory_) { + map = reinterpret_cast(mapMemory_->getDeviceMemory(dev())); + } + return map; } -void -Memory::mgpuCacheWriteBack() -{ - // Lock memory object, so only one write back can occur - amd::ScopedLock lock(owner()->lockMemoryOps()); +void Memory::mgpuCacheWriteBack() { + // Lock memory object, so only one write back can occur + amd::ScopedLock lock(owner()->lockMemoryOps()); - // Attempt to allocate a staging buffer if don't have any - if (owner()->getHostMem() == NULL) { - if (nullptr != owner()->getSvmPtr()) { - owner()->commitSvmMemory(); - owner()->setHostMem(owner()->getSvmPtr()); + // Attempt to allocate a staging buffer if don't have any + if (owner()->getHostMem() == NULL) { + if (nullptr != owner()->getSvmPtr()) { + owner()->commitSvmMemory(); + owner()->setHostMem(owner()->getSvmPtr()); + } else { + static const bool forceAllocHostMem = true; + owner()->allocHostMemory(nullptr, forceAllocHostMem); + } + } + // Make synchronization + if (owner()->getHostMem() != NULL) { + //! 
\note Ignore pinning result + bool ok = pinSystemMemory(owner()->getHostMem(), owner()->getSize()); + owner()->cacheWriteBack(); + } +} + +Memory* Buffer::createBufferView(amd::Memory& subBufferOwner) const { + gpu::Memory* subBuffer; + Resource::ViewParams params; + + size_t offset = subBufferOwner.getOrigin(); + size_t size = subBufferOwner.getSize(); + + // Create a memory object + subBuffer = new gpu::Buffer(dev(), subBufferOwner, size); + if (NULL == subBuffer) { + return NULL; + } + + // Allocate a view for this buffer object + params.owner_ = &subBufferOwner; + params.offset_ = offset; + params.size_ = size; + params.resource_ = this; + params.memory_ = this; + + if (!subBuffer->create(Resource::View, ¶ms)) { + delete subBuffer; + return NULL; + } + + return subBuffer; +} + +void* Image::allocMapTarget(const amd::Coord3D& origin, const amd::Coord3D& region, uint mapFlags, + size_t* rowPitch, size_t* slicePitch) { + // Sanity checks + assert(owner() != NULL); + bool useRemoteResource = true; + size_t slicePitchTmp = 0; + size_t height = cal()->height_; + size_t depth = cal()->depth_; + + // Map/unmap must be serialized + amd::ScopedLock lock(owner()->lockMemoryOps()); + + address mapAddress = NULL; + size_t offset = origin[0]; + + incIndMapCount(); + + // If host memory exists, use it + if ((owner()->getHostMem() != NULL) && isDirectMap()) { + useRemoteResource = false; + mapAddress = reinterpret_cast
(owner()->getHostMem()); + amd::Image* amdImage = owner()->asImage(); + + // Calculate the offset in bytes + offset *= elementSize(); + + // Update the row and slice pitches value + *rowPitch = + (amdImage->getRowPitch() == 0) ? (cal()->width_ * elementSize()) : amdImage->getRowPitch(); + slicePitchTmp = + (amdImage->getSlicePitch() == 0) ? (height * (*rowPitch)) : amdImage->getSlicePitch(); + + // Adjust the offset in Y and Z dimensions + offset += origin[1] * (*rowPitch); + offset += origin[2] * slicePitchTmp; + } + // If resource is a persistent allocation, we can use it directly + //! @note Even if resource is a persistent allocation, + //! runtime can't use it directly, + //! because CAL volume map doesn't work properly. + //! @todo arrays can be added for persistent lock with some CAL changes + else if (isPersistentDirectMap()) { + if (NULL == map(NULL)) { + useRemoteResource = true; + LogError("Could not map target persistent resource, try remote resource"); + } else { + useRemoteResource = false; + mapAddress = data(); + + // Calculate the offset in bytes + offset *= elementSize(); + + // Update the row pitch value + *rowPitch = cal()->pitch_ * elementSize(); + + // Adjust the offset in Y dimension + offset += origin[1] * (*rowPitch); + } + } + + // Otherwise we can use a remote resource: + if (useRemoteResource) { + // Calculate X offset in bytes + offset *= elementSize(); + + // Allocate a map resource if there isn't any yet + if (indirectMapCount_ == 1) { + const static bool SysMem = true; + bool failed = false; + amd::Memory* memory; + + // Search for a possible indirect resource + memory = dev().findMapTarget(owner()->getSize()); + + if (memory == NULL) { + // Allocate a new buffer to use as the map target + //! @note Allocate a 1D buffer, since CAL issues with 3D + //! 
Also HW doesn't support untiled images + memory = new (dev().context()) + amd::Buffer(dev().context(), 0, cal()->width_ * height * depth * elementSize()); + memory->setVirtualDevice(owner()->getVirtualDevice()); + + Memory* gpuMemory; + do { + if ((memory == NULL) || !memory->create(NULL, SysMem)) { + failed = true; + break; + } + + gpuMemory = reinterpret_cast(memory->getDeviceMemory(dev())); + + // Create, Map and get the base pointer for the resource + if ((gpuMemory == NULL) || (NULL == gpuMemory->map(NULL))) { + failed = true; + break; + } + } while (false); + } + + if (failed) { + if (memory != NULL) { + memory->release(); } - else { - static const bool forceAllocHostMem = true; - owner()->allocHostMemory(nullptr, forceAllocHostMem); - } - } - // Make synchronization - if (owner()->getHostMem() != NULL) { - //! \note Ignore pinning result - bool ok = pinSystemMemory(owner()->getHostMem(), owner()->getSize()); - owner()->cacheWriteBack(); - } -} - -Memory* -Buffer::createBufferView(amd::Memory& subBufferOwner) const -{ - gpu::Memory* subBuffer; - Resource::ViewParams params; - - size_t offset = subBufferOwner.getOrigin(); - size_t size = subBufferOwner.getSize(); - - // Create a memory object - subBuffer = new gpu::Buffer(dev(), subBufferOwner, size); - if (NULL == subBuffer) { + decIndMapCount(); + LogError("Could not map target resource"); return NULL; - } + } - // Allocate a view for this buffer object - params.owner_ = &subBufferOwner; - params.offset_ = offset; - params.size_ = size; - params.resource_ = this; - params.memory_ = this; - - if (!subBuffer->create(Resource::View, ¶ms)) { - delete subBuffer; + // Map/unamp is serialized for the same memory object, + // so it's safe to assign the new pointer + assert((mapMemory_ == NULL) && "Mapped buffer can't be valid"); + mapMemory_ = memory; + } else { + // Did the map resource allocation fail? 
+ if (mapMemory_ == NULL) { + LogError("Could not map target resource"); return NULL; + } } - return subBuffer; + mapAddress = mapMemory()->data(); + + // Update the row and slice pitches value + *rowPitch = region[0] * elementSize(); + if (cal()->dimension_ == GSL_MOA_TEXTURE_1D_ARRAY) { + slicePitchTmp = *rowPitch; + } else { + slicePitchTmp = *rowPitch * region[1]; + } + // Use start of the indirect buffer + offset = 0; + } + + if (slicePitch != NULL) { + *slicePitch = slicePitchTmp; + } + + return mapAddress + offset; } -void* -Image::allocMapTarget( - const amd::Coord3D& origin, - const amd::Coord3D& region, - uint mapFlags, - size_t* rowPitch, - size_t* slicePitch) -{ - // Sanity checks - assert(owner() != NULL); - bool useRemoteResource = true; - size_t slicePitchTmp = 0; - size_t height = cal()->height_; - size_t depth = cal()->depth_; - - // Map/unmap must be serialized - amd::ScopedLock lock(owner()->lockMemoryOps()); - - address mapAddress = NULL; - size_t offset = origin[0]; - - incIndMapCount(); - - // If host memory exists, use it - if ((owner()->getHostMem() != NULL) && isDirectMap()) { - useRemoteResource = false; - mapAddress = reinterpret_cast
(owner()->getHostMem()); - amd::Image* amdImage = owner()->asImage(); - - // Calculate the offset in bytes - offset *= elementSize(); - - // Update the row and slice pitches value - *rowPitch = (amdImage->getRowPitch() == 0) ? - (cal()->width_ * elementSize()) : amdImage->getRowPitch(); - slicePitchTmp = (amdImage->getSlicePitch() == 0) ? - (height * (*rowPitch)) : amdImage->getSlicePitch(); - - // Adjust the offset in Y and Z dimensions - offset += origin[1] * (*rowPitch); - offset += origin[2] * slicePitchTmp; - } - // If resource is a persistent allocation, we can use it directly - //! @note Even if resource is a persistent allocation, - //! runtime can't use it directly, - //! because CAL volume map doesn't work properly. - //! @todo arrays can be added for persistent lock with some CAL changes - else if (isPersistentDirectMap()) { - if (NULL == map(NULL)) { - useRemoteResource = true; - LogError("Could not map target persistent resource, try remote resource"); - } - else { - useRemoteResource = false; - mapAddress = data(); - - // Calculate the offset in bytes - offset *= elementSize(); - - // Update the row pitch value - *rowPitch = cal()->pitch_ * elementSize(); - - // Adjust the offset in Y dimension - offset += origin[1] * (*rowPitch); - } - } - - // Otherwise we can use a remote resource: - if (useRemoteResource) { - // Calculate X offset in bytes - offset *= elementSize(); - - // Allocate a map resource if there isn't any yet - if (indirectMapCount_ == 1) { - const static bool SysMem = true; - bool failed = false; - amd::Memory* memory; - - // Search for a possible indirect resource - memory = dev().findMapTarget(owner()->getSize()); - - if (memory == NULL) { - // Allocate a new buffer to use as the map target - //! @note Allocate a 1D buffer, since CAL issues with 3D - //! 
Also HW doesn't support untiled images - memory = new (dev().context()) - amd::Buffer(dev().context(), 0, - cal()->width_ * height * depth * elementSize()); - memory->setVirtualDevice(owner()->getVirtualDevice()); - - Memory* gpuMemory; - do { - if ((memory == NULL) || !memory->create(NULL, SysMem)) { - failed = true; - break; - } - - gpuMemory = reinterpret_cast - (memory->getDeviceMemory(dev())); - - // Create, Map and get the base pointer for the resource - if ((gpuMemory == NULL) || (NULL == gpuMemory->map(NULL))) { - failed = true; - break; - } - } - while (false); - } - - if (failed) { - if (memory != NULL) { - memory->release(); - } - decIndMapCount(); - LogError("Could not map target resource"); - return NULL; - } - - // Map/unamp is serialized for the same memory object, - // so it's safe to assign the new pointer - assert((mapMemory_ == NULL) && "Mapped buffer can't be valid"); - mapMemory_ = memory; - } - else { - // Did the map resource allocation fail? - if (mapMemory_ == NULL) { - LogError("Could not map target resource"); - return NULL; - } - } - - mapAddress = mapMemory()->data(); - - // Update the row and slice pitches value - *rowPitch = region[0] * elementSize(); - if (cal()->dimension_ == GSL_MOA_TEXTURE_1D_ARRAY) { - slicePitchTmp = *rowPitch ; - } - else { - slicePitchTmp = *rowPitch * region[1]; - } - // Use start of the indirect buffer - offset = 0; - } - - if (slicePitch != NULL) { - *slicePitch = slicePitchTmp; - } - - return mapAddress + offset; -} - -} // namespace gpu +} // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpumemory.hpp b/rocclr/runtime/device/gpu/gpumemory.hpp index 49f584c693..63510812b7 100644 --- a/rocclr/runtime/device/gpu/gpumemory.hpp +++ b/rocclr/runtime/device/gpu/gpumemory.hpp @@ -28,275 +28,245 @@ class VirtualGPU; //! GPU memory object. // Wrapper that can contain a heap block or an interop buffer/image. 
-class Memory: public device::Memory, public Resource -{ -public: - enum InteropType { - InteropNone = 0, //!< None interop memory - InteropHwEmulation = 1, //!< Uses HW emulaiton with calMemCopy - InteropDirectAccess = 2 //!< Uses direct access to the interop surface - }; +class Memory : public device::Memory, public Resource { + public: + enum InteropType { + InteropNone = 0, //!< None interop memory + InteropHwEmulation = 1, //!< Uses HW emulaiton with calMemCopy + InteropDirectAccess = 2 //!< Uses direct access to the interop surface + }; - //! Constructor (with owner) - Memory( - const Device& gpuDev, - amd::Memory& owner, - size_t size = 0); + //! Constructor (with owner) + Memory(const Device& gpuDev, amd::Memory& owner, size_t size = 0); - //! Constructor (nonfat version for local scratch mem use without heap block) - Memory( - const Device& gpuDev, - size_t size); + //! Constructor (nonfat version for local scratch mem use without heap block) + Memory(const Device& gpuDev, size_t size); - //! Constructor memory for buffer (without global heap allocaton) - Memory( - const Device& gpuDev, //!< GPU device object - amd::Memory& owner, //!< Abstraction layer memory object - size_t width, //!< Memory width - cmSurfFmt format //!< CAL format - ); + //! Constructor memory for buffer (without global heap allocaton) + Memory(const Device& gpuDev, //!< GPU device object + amd::Memory& owner, //!< Abstraction layer memory object + size_t width, //!< Memory width + cmSurfFmt format //!< CAL format + ); - //! Constructor memory for buffer (without global heap allocaton) - Memory( - const Device& gpuDev, //!< GPU device object - size_t size, //!< Memory object size - size_t width, //!< Memory width - cmSurfFmt format //!< CAL format - ); + //! 
Constructor memory for buffer (without global heap allocaton) + Memory(const Device& gpuDev, //!< GPU device object + size_t size, //!< Memory object size + size_t width, //!< Memory width + cmSurfFmt format //!< CAL format + ); - //! Constructor memory for images (without global heap allocaton) - Memory( - const Device& gpuDev, //!< GPU device object - amd::Memory& owner, //!< Abstraction layer memory object - size_t width, //!< Allocated memory width - size_t height, //!< Allocated memory height - size_t depth, //!< Allocated memory depth - cmSurfFmt format, //!< Memory format - gslChannelOrder chOrder, //!< Channel order - cl_mem_object_type imageType, //!< CL image type - uint mipLevels //!< The number of mip levels - ); + //! Constructor memory for images (without global heap allocaton) + Memory(const Device& gpuDev, //!< GPU device object + amd::Memory& owner, //!< Abstraction layer memory object + size_t width, //!< Allocated memory width + size_t height, //!< Allocated memory height + size_t depth, //!< Allocated memory depth + cmSurfFmt format, //!< Memory format + gslChannelOrder chOrder, //!< Channel order + cl_mem_object_type imageType, //!< CL image type + uint mipLevels //!< The number of mip levels + ); - //! Constructor memory for images (without global heap allocaton) - Memory( - const Device& gpuDev, //!< GPU device object - size_t size, //!< Memory object size - size_t width, //!< Allocated memory width - size_t height, //!< Allocated memory height - size_t depth, //!< Allocated memory depth - cmSurfFmt format, //!< Memory format - gslChannelOrder chOrder, //!< Channel order - cl_mem_object_type imageType, //!< CL image type - uint mipLevels //!< The number of mip levels - ); + //! 
Constructor memory for images (without global heap allocaton) + Memory(const Device& gpuDev, //!< GPU device object + size_t size, //!< Memory object size + size_t width, //!< Allocated memory width + size_t height, //!< Allocated memory height + size_t depth, //!< Allocated memory depth + cmSurfFmt format, //!< Memory format + gslChannelOrder chOrder, //!< Channel order + cl_mem_object_type imageType, //!< CL image type + uint mipLevels //!< The number of mip levels + ); - //! Default destructor - ~Memory(); + //! Default destructor + ~Memory(); - //! Creates the interop memory - bool createInterop( - InteropType type //!< The interop type - ); + //! Creates the interop memory + bool createInterop(InteropType type //!< The interop type + ); - //! Overloads the resource create method - virtual bool create( - Resource::MemoryType memType, //!< Memory type - Resource::CreateParams* params = NULL //!< Prameters for create - ); + //! Overloads the resource create method + virtual bool create(Resource::MemoryType memType, //!< Memory type + Resource::CreateParams* params = NULL //!< Prameters for create + ); - //! Allocate memory for API-level maps - virtual void* allocMapTarget( - const amd::Coord3D& origin, //!< The map location in memory - const amd::Coord3D& region, //!< The map region in memory - uint mapFlags, //!< Map flags - size_t* rowPitch = NULL, //!< Row pitch for the mapped memory - size_t* slicePitch = NULL //!< Slice for the mapped memory - ); + //! Allocate memory for API-level maps + virtual void* allocMapTarget(const amd::Coord3D& origin, //!< The map location in memory + const amd::Coord3D& region, //!< The map region in memory + uint mapFlags, //!< Map flags + size_t* rowPitch = NULL, //!< Row pitch for the mapped memory + size_t* slicePitch = NULL //!< Slice for the mapped memory + ); - //! 
Pins system memory associated with this memory object - virtual bool pinSystemMemory( - void* hostPtr, //!< System memory address - size_t size //!< Size of allocated system memory - ); + //! Pins system memory associated with this memory object + virtual bool pinSystemMemory(void* hostPtr, //!< System memory address + size_t size //!< Size of allocated system memory + ); - //! Releases indirect map surface - virtual void releaseIndirectMap() { decIndMapCount(); } + //! Releases indirect map surface + virtual void releaseIndirectMap() { decIndMapCount(); } - //! Map the device memory to CPU visible - virtual void* cpuMap( - device::VirtualDevice& vDev,//!< Virtual device for map operaiton - uint flags = 0, //!< flags for the map operation - // Optimization for multilayer map/unmap - uint startLayer = 0, //!< Start layer for multilayer map - uint numLayers = 0, //!< End layer for multilayer map - size_t* rowPitch = NULL, //!< Row pitch for the device memory - size_t* slicePitch = NULL //!< Slice pitch for the device memory - ); + //! Map the device memory to CPU visible + virtual void* cpuMap(device::VirtualDevice& vDev, //!< Virtual device for map operaiton + uint flags = 0, //!< flags for the map operation + // Optimization for multilayer map/unmap + uint startLayer = 0, //!< Start layer for multilayer map + uint numLayers = 0, //!< End layer for multilayer map + size_t* rowPitch = NULL, //!< Row pitch for the device memory + size_t* slicePitch = NULL //!< Slice pitch for the device memory + ); - //! Unmap the device memory - virtual void cpuUnmap( - device::VirtualDevice& vDev //!< Virtual device for unmap operaiton - ); + //! Unmap the device memory + virtual void cpuUnmap(device::VirtualDevice& vDev //!< Virtual device for unmap operaiton + ); - //! Updates device memory from the owner's host allocation - void syncCacheFromHost( - VirtualGPU& gpu, //!< Virtual GPU device object - //! 
Synchronization flags - device::Memory::SyncFlags syncFlags = device::Memory::SyncFlags() - ); + //! Updates device memory from the owner's host allocation + void syncCacheFromHost(VirtualGPU& gpu, //!< Virtual GPU device object + //! Synchronization flags + device::Memory::SyncFlags syncFlags = device::Memory::SyncFlags()); - //! Updates the owner's host allocation from device memory - virtual void syncHostFromCache( - //! Synchronization flags - device::Memory::SyncFlags syncFlags = device::Memory::SyncFlags() - ); + //! Updates the owner's host allocation from device memory + virtual void syncHostFromCache( + //! Synchronization flags + device::Memory::SyncFlags syncFlags = device::Memory::SyncFlags()); - //! Creates a view from current resource - virtual Memory* createBufferView( - amd::Memory& subBufferOwner //!< The abstraction layer subbuf owner - ); + //! Creates a view from current resource + virtual Memory* createBufferView( + amd::Memory& subBufferOwner //!< The abstraction layer subbuf owner + ); - //! Allocates host memory for synchronization with MGPU context - void mgpuCacheWriteBack(); + //! Allocates host memory for synchronization with MGPU context + void mgpuCacheWriteBack(); - //! Transfers objects data to the destination object - bool moveTo(Memory& dst); + //! Transfers objects data to the destination object + bool moveTo(Memory& dst); - //! Accessors for indirect map memory object - Memory* mapMemory() const; + //! Accessors for indirect map memory object + Memory* mapMemory() const; - //! Returns the interop memory for this memory object - Memory* interop() const { return interopMemory_; } + //! Returns the interop memory for this memory object + Memory* interop() const { return interopMemory_; } - //! Gets interop type for this memory object - InteropType interopType() const { return interopType_; } + //! Gets interop type for this memory object + InteropType interopType() const { return interopType_; } - //! 
Sets interop type for this memory object - void setInteropType(InteropType type) { interopType_ = type; } + //! Sets interop type for this memory object + void setInteropType(InteropType type) { interopType_ = type; } - //! Set the owner - void setOwner(amd::Memory* owner) { owner_ = owner; } + //! Set the owner + void setOwner(amd::Memory* owner) { owner_ = owner; } - // Decompress GL depth-stencil/MSAA resources for CL access - // Invalidates any FBOs the resource may be bound to, otherwise the GL driver may crash. - virtual bool processGLResource(GLResourceOP operation); + // Decompress GL depth-stencil/MSAA resources for CL access + // Invalidates any FBOs the resource may be bound to, otherwise the GL driver may crash. + virtual bool processGLResource(GLResourceOP operation); - //! Returns the interop resource for this memory object - const Memory* parent() const { return parent_; } + //! Returns the interop resource for this memory object + const Memory* parent() const { return parent_; } - //! Returns TRUE if direct map is acceaptable. The method detects - //! forced USWC memory on APU and will cause a switch to - //! indirect map for allocations with a possibility of host read - bool isDirectMap() - { - return (isCacheable() || !isHostMemDirectAccess() || + //! Returns TRUE if direct map is acceaptable. The method detects + //! forced USWC memory on APU and will cause a switch to + //! indirect map for allocations with a possibility of host read + bool isDirectMap() { + return (isCacheable() || !isHostMemDirectAccess() || (owner()->getMemFlags() & (CL_MEM_ALLOC_HOST_PTR | CL_MEM_HOST_WRITE_ONLY | CL_MEM_READ_ONLY))); - } + } -protected: - //! Decrement map count - void decIndMapCount(); + protected: + //! Decrement map count + void decIndMapCount(); - //! Initialize the object members - void init(); + //! Initialize the object members + void init(); -private: - //! Disable copy constructor - Memory(const Memory&); + private: + //! 
Disable copy constructor + Memory(const Memory&); - //! Disable operator= - Memory& operator=(const Memory&); + //! Disable operator= + Memory& operator=(const Memory&); - InteropType interopType_; //!< Interop type - Memory* interopMemory_; //!< interop memory + InteropType interopType_; //!< Interop type + Memory* interopMemory_; //!< interop memory - Memory* pinnedMemory_; //!< Memory used as pinned system memory - const Memory* parent_; //!< Parent memory object + Memory* pinnedMemory_; //!< Memory used as pinned system memory + const Memory* parent_; //!< Parent memory object }; -class Buffer: public gpu::Memory -{ -public: - //! Buffer constructor - Buffer( - const Device& gpuDev, //!< GPU device object - amd::Memory& owner, //!< Abstraction layer memory object - size_t size //!< Buffer size - ) - : gpu::Memory(gpuDev, owner, - amd::alignUp(size, ElementSize) / ElementSize, ElementType) - {} +class Buffer : public gpu::Memory { + public: + //! Buffer constructor + Buffer(const Device& gpuDev, //!< GPU device object + amd::Memory& owner, //!< Abstraction layer memory object + size_t size //!< Buffer size + ) + : gpu::Memory(gpuDev, owner, amd::alignUp(size, ElementSize) / ElementSize, ElementType) {} - //! Creates a view from current resource - virtual Memory* createBufferView( - amd::Memory& subBufferOwner //!< The abstraction layer subbuf owner - ) const; + //! Creates a view from current resource + virtual Memory* createBufferView( + amd::Memory& subBufferOwner //!< The abstraction layer subbuf owner + ) const; -private: - //! Disable copy constructor - Buffer(const Buffer&); + private: + //! Disable copy constructor + Buffer(const Buffer&); - //! Disable operator= - Buffer& operator=(const Buffer&); + //! Disable operator= + Buffer& operator=(const Buffer&); - //! The size of buffer element in bytes - static const size_t ElementSize = 4; + //! The size of buffer element in bytes + static const size_t ElementSize = 4; - //! 
The type of buffer element - static const cmSurfFmt ElementType = CM_SURF_FMT_R32I; + //! The type of buffer element + static const cmSurfFmt ElementType = CM_SURF_FMT_R32I; }; -class Image: public gpu::Memory -{ -public: - //! Image constructor - Image( - const Device& gpuDev, //!< GPU device object - amd::Memory& owner, //!< Abstraction layer memory object - size_t width, //!< Allocated memory width - size_t height, //!< Allocated memory height - size_t depth, //!< Allocated memory depth - cmSurfFmt format, //!< Memory format - gslChannelOrder chOrder, //!< Channel order - cl_mem_object_type imageType, //!< CL image type - uint mipLevels //!< The number of mip levels +class Image : public gpu::Memory { + public: + //! Image constructor + Image(const Device& gpuDev, //!< GPU device object + amd::Memory& owner, //!< Abstraction layer memory object + size_t width, //!< Allocated memory width + size_t height, //!< Allocated memory height + size_t depth, //!< Allocated memory depth + cmSurfFmt format, //!< Memory format + gslChannelOrder chOrder, //!< Channel order + cl_mem_object_type imageType, //!< CL image type + uint mipLevels //!< The number of mip levels ) - : gpu::Memory(gpuDev, owner, width, height, depth, format, chOrder, imageType, mipLevels) - {} + : gpu::Memory(gpuDev, owner, width, height, depth, format, chOrder, imageType, mipLevels) {} - //! Image constructor - Image( - const Device& gpuDev, //!< GPU device object - size_t size, //!< Memory size - size_t width, //!< Allocated memory width - size_t height, //!< Allocated memory height - size_t depth, //!< Allocated memory depth - cmSurfFmt format, //!< Memory format - gslChannelOrder chOrder, //!< Channel order - cl_mem_object_type imageType, //!< CL image type - uint mipLevels //!< The number of mip levels + //! 
Image constructor + Image(const Device& gpuDev, //!< GPU device object + size_t size, //!< Memory size + size_t width, //!< Allocated memory width + size_t height, //!< Allocated memory height + size_t depth, //!< Allocated memory depth + cmSurfFmt format, //!< Memory format + gslChannelOrder chOrder, //!< Channel order + cl_mem_object_type imageType, //!< CL image type + uint mipLevels //!< The number of mip levels ) - : gpu::Memory(gpuDev, size, width, height, depth, format, chOrder, imageType, mipLevels) - {} + : gpu::Memory(gpuDev, size, width, height, depth, format, chOrder, imageType, mipLevels) {} - //! Allocate memory for API-level maps - virtual void* allocMapTarget( - const amd::Coord3D& origin, //!< The map location in memory - const amd::Coord3D& region, //!< The map region in memory - uint mapFlags, //!< Map flags - size_t* rowPitch = NULL, //!< Row pitch for the mapped memory - size_t* slicePitch = NULL //!< Slice for the mapped memory - ); + //! Allocate memory for API-level maps + virtual void* allocMapTarget(const amd::Coord3D& origin, //!< The map location in memory + const amd::Coord3D& region, //!< The map region in memory + uint mapFlags, //!< Map flags + size_t* rowPitch = NULL, //!< Row pitch for the mapped memory + size_t* slicePitch = NULL //!< Slice for the mapped memory + ); -private: - //! Disable copy constructor - Image(const Image&); + private: + //! Disable copy constructor + Image(const Image&); - //! Disable operator= - Image& operator=(const Image&); + //! 
Disable operator= + Image& operator=(const Image&); }; -} // namespace gpu +} // namespace gpu -#endif // GPUMEMORY_HPP_ +#endif // GPUMEMORY_HPP_ diff --git a/rocclr/runtime/device/gpu/gpuprintf.cpp b/rocclr/runtime/device/gpu/gpuprintf.cpp index 0baf68f76f..58120a4002 100644 --- a/rocclr/runtime/device/gpu/gpuprintf.cpp +++ b/rocclr/runtime/device/gpu/gpuprintf.cpp @@ -17,707 +17,625 @@ namespace gpu { PrintfDbg::PrintfDbg(Device& device, FILE* file) - : dbgBuffer_(NULL) - , dbgFile_(file) - , gpuDevice_(device) - , wiDbgSize_(0) - , initCntValue_(device, 4) -{ + : dbgBuffer_(NULL), + dbgFile_(file), + gpuDevice_(device), + wiDbgSize_(0), + initCntValue_(device, 4) {} + +PrintfDbg::~PrintfDbg() { delete dbgBuffer_; } + +bool PrintfDbg::create() { + // Create a resource for the init count value + if (initCntValue_.create(Resource::Remote)) { + uint32_t* value = reinterpret_cast(initCntValue_.map(NULL)); + // The counter starts from 1 + if (NULL != value) { + *value = 1; + } else { + return false; + } + initCntValue_.unmap(NULL); + return true; + } + return false; } -PrintfDbg::~PrintfDbg() -{ +bool PrintfDbg::init(VirtualGPU& gpu, bool printfEnabled, const amd::NDRange& size) { + // Set up debug output buffer (if printf active) + if (printfEnabled) { + if (!allocate()) { + return false; + } + + // Make sure that the size isn't bigger than the reported max + if (size.product() <= dev().settings().maxWorkGroupSize_) { + size_t wiDbgSizeTmp; + + // Calculate the debug buffer size per workitem + wiDbgSizeTmp = std::min(dbgBuffer_->size() / size.product(), dev().xferRead().bufSize()); + + // Make sure the size is DWORD aligned + wiDbgSizeTmp = amd::alignDown(wiDbgSizeTmp, sizeof(uint32_t)); + + // If the new size is different, then clear the initial values + if (wiDbgSize_ != wiDbgSizeTmp) { + wiDbgSize_ = wiDbgSizeTmp; + if (!clearWorkitems(gpu, 0, size.product())) { + wiDbgSize_ = 0; + return false; + } + } + } + } + + return true; +} + +bool 
PrintfDbg::output(VirtualGPU& gpu, bool printfEnabled, const amd::NDRange& size, + const std::vector& printfInfo) { + // Are we expected to generate debug output? + if (printfEnabled && !printfInfo.empty()) { + uint32_t* workitemData; + size_t i, j, k, z; + bool realloc = false; + + // Wait for kernel execution + gpu.waitAllEngines(); + + size_t zdim = 1; + size_t ydim = 1; + size_t xdim = 1; + + switch (size.dimensions()) { + case 3: + zdim = size[2]; + // Fall through ... + case 2: + ydim = size[1]; + // Fall through ... + case 1: + xdim = size[0]; + // Fall through ... + default: + break; + } + + for (k = 0; k < zdim; ++k) { + for (j = 0; j < ydim; ++j) { + for (i = 0; i < xdim; ++i) { + size_t idx = (xdim * (ydim * k + j) + i); + workitemData = mapWorkitem(gpu, idx, &realloc); + + if (NULL != workitemData) { + uint32_t wp = workitemData[0]; // write pointer (i.e. first unwritten element) + // Walk through each PrintfDbg entry + for (z = 1; (z < (wiDbgSize() / sizeof(uint32_t))) && (z < wp);) { + if (printfInfo.size() < workitemData[z]) { + LogError("The format string wasn't reported"); + return false; + } + // Get the PrintfDbg info + const PrintfInfo& info = printfInfo[workitemData[z++]]; + // There's something in this buffer + outputDbgBuffer(info, workitemData, z); + } + } + unmapWorkitem(gpu, workitemData); + } + } + } + + // Reallocate debug buffer if necessary + if (!allocate(realloc)) { + return false; + } + } + return true; +} + +uint64_t PrintfDbg::bufOffset() const { return dbgBuffer_->hbOffset(); } + +bool PrintfDbg::allocate(bool realloc) { + if (NULL == dbgBuffer_) { + dbgBuffer_ = dev().createScratchBuffer(dev().info().printfBufferSize_); + } else if (realloc) { + LogWarning("Debug buffer reallocation!"); + // Double the buffer size if it's not big enough + size_t size = dbgBuffer_->size(); delete dbgBuffer_; + dbgBuffer_ = dev().createScratchBuffer(size << 1); + } + + return (NULL != dbgBuffer_) ? 
true : false; } -bool -PrintfDbg::create() -{ - // Create a resource for the init count value - if (initCntValue_.create(Resource::Remote)) { - uint32_t* value = reinterpret_cast(initCntValue_.map(NULL)); - // The counter starts from 1 - if (NULL != value) { - *value = 1; - } - else { - return false; - } - initCntValue_.unmap(NULL); - return true; - } - return false; -} - -bool -PrintfDbg::init( - VirtualGPU& gpu, - bool printfEnabled, - const amd::NDRange& size) -{ - // Set up debug output buffer (if printf active) - if (printfEnabled) { - if (!allocate()) { - return false; - } - - // Make sure that the size isn't bigger than the reported max - if (size.product() <= dev().settings().maxWorkGroupSize_) { - size_t wiDbgSizeTmp; - - // Calculate the debug buffer size per workitem - wiDbgSizeTmp = std::min(dbgBuffer_->size() / size.product(), - dev().xferRead().bufSize()); - - // Make sure the size is DWORD aligned - wiDbgSizeTmp = amd::alignDown(wiDbgSizeTmp, sizeof(uint32_t)); - - // If the new size is different, then clear the initial values - if (wiDbgSize_ != wiDbgSizeTmp) { - wiDbgSize_ = wiDbgSizeTmp; - if (!clearWorkitems(gpu, 0, size.product())) { - wiDbgSize_ = 0; - return false; - } - } - } - } - - return true; -} - -bool -PrintfDbg::output( - VirtualGPU& gpu, - bool printfEnabled, - const amd::NDRange& size, - const std::vector& printfInfo) -{ - // Are we expected to generate debug output? - if (printfEnabled && !printfInfo.empty()) { - uint32_t* workitemData; - size_t i, j, k, z; - bool realloc = false; - - // Wait for kernel execution - gpu.waitAllEngines(); - - size_t zdim = 1; - size_t ydim = 1; - size_t xdim = 1; - - switch (size.dimensions()) { - case 3: - zdim = size[2]; - // Fall through ... - case 2: - ydim = size[1]; - // Fall through ... - case 1: - xdim = size[0]; - // Fall through ... 
- default: - break; - } - - for (k = 0; k < zdim; ++k) { - for (j = 0; j < ydim; ++j) { - for (i = 0; i < xdim; ++i) { - size_t idx = (xdim * (ydim * k + j) + i); - workitemData = mapWorkitem(gpu, idx, &realloc); - - if (NULL != workitemData) { - uint32_t wp = workitemData[0]; // write pointer (i.e. first unwritten element) - // Walk through each PrintfDbg entry - for (z = 1; (z < (wiDbgSize() / sizeof(uint32_t))) && (z < wp); ) { - if (printfInfo.size() < workitemData[z]) { - LogError("The format string wasn't reported"); - return false; - } - // Get the PrintfDbg info - const PrintfInfo& info = printfInfo[workitemData[z++]]; - // There's something in this buffer - outputDbgBuffer(info, workitemData, z); - } - } - unmapWorkitem(gpu, workitemData); - } - } - } - - // Reallocate debug buffer if necessary - if (!allocate(realloc)) { - return false; - } - } - return true; -} - -uint64_t -PrintfDbg::bufOffset() const -{ - return dbgBuffer_->hbOffset(); -} - -bool -PrintfDbg::allocate(bool realloc) -{ - if (NULL == dbgBuffer_) { - dbgBuffer_ = dev().createScratchBuffer(dev().info().printfBufferSize_); - } - else if (realloc) { - LogWarning("Debug buffer reallocation!"); - // Double the buffer size if it's not big enough - size_t size = dbgBuffer_->size(); - delete dbgBuffer_; - dbgBuffer_ = dev().createScratchBuffer(size << 1); - } - - return (NULL != dbgBuffer_) ? 
true : false; -} - -bool -PrintfDbg::checkFloat(const std::string& fmt) const -{ - switch (fmt[fmt.size() - 1]) { +bool PrintfDbg::checkFloat(const std::string& fmt) const { + switch (fmt[fmt.size() - 1]) { case 'e': case 'E': case 'f': case 'g': case 'G': case 'a': - return true; - break; + return true; + break; default: - break; - } - return false; + break; + } + return false; } -bool -PrintfDbg::checkString(const std::string& fmt) const -{ - if (fmt[fmt.size() - 1] == 's') - return true; - return false; +bool PrintfDbg::checkString(const std::string& fmt) const { + if (fmt[fmt.size() - 1] == 's') return true; + return false; } -int -PrintfDbg::checkVectorSpecifier( - const std::string& fmt, - size_t startPos, - size_t& curPos) const -{ - int vectorSize = 0; - size_t pos = curPos; - size_t size = curPos - startPos; +int PrintfDbg::checkVectorSpecifier(const std::string& fmt, size_t startPos, size_t& curPos) const { + int vectorSize = 0; + size_t pos = curPos; + size_t size = curPos - startPos; - if (size >= 3) { - size = 0; - //no modifiers - if (fmt[curPos - 3] == 'v') { - size = 2; - } - //the modifiers are "h" or "l" - else if (fmt[curPos - 4] == 'v') { - size = 3; - } - //the modifier is "hh" - else if ((curPos >= 5) && (fmt[curPos - 5] == 'v')) { - size = 4; - } - if (size > 0) { - curPos = size; - pos -= curPos; - - // Get vector size - vectorSize = fmt[pos++] - '0'; - // PrintfDbg supports only 2, 3, 4, 8 and 16 wide vectors - switch (vectorSize) { - case 1: - if ((fmt[pos++] - '0') == 6) { - vectorSize = 16; - } - else { - vectorSize = 0; - } - break; - case 2: - case 3: - case 4: - case 8: - break; - default: - vectorSize = 0; - break; - } - } + if (size >= 3) { + size = 0; + // no modifiers + if (fmt[curPos - 3] == 'v') { + size = 2; } + // the modifiers are "h" or "l" + else if (fmt[curPos - 4] == 'v') { + size = 3; + } + // the modifier is "hh" + else if ((curPos >= 5) && (fmt[curPos - 5] == 'v')) { + size = 4; + } + if (size > 0) { + curPos = size; + 
pos -= curPos; - return vectorSize; + // Get vector size + vectorSize = fmt[pos++] - '0'; + // PrintfDbg supports only 2, 3, 4, 8 and 16 wide vectors + switch (vectorSize) { + case 1: + if ((fmt[pos++] - '0') == 6) { + vectorSize = 16; + } else { + vectorSize = 0; + } + break; + case 2: + case 3: + case 4: + case 8: + break; + default: + vectorSize = 0; + break; + } + } + } + + return vectorSize; } static const size_t ConstStr = 0xffffffff; static const char Separator[] = ",\0"; -size_t -PrintfDbg::outputArgument( - const std::string& fmt, - bool printFloat, - size_t size, - const uint32_t* argument) const -{ - // Serialize the output to the screen - amd::ScopedLock k(dev().lockAsyncOps()); +size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, size_t size, + const uint32_t* argument) const { + // Serialize the output to the screen + amd::ScopedLock k(dev().lockAsyncOps()); - size_t copiedBytes = size; - // Print the string argument, using standard PrintfDbg() - if (checkString(fmt.c_str())) { - //copiedBytes should be as number of printed chars - copiedBytes = 0; - //(null) should be printed - if (*argument == 0) { - amd::Os::printf(fmt.data(),0); - //copiedBytes = strlen("(null)") - copiedBytes = 6; - } - else { - const unsigned char* argumentStr = reinterpret_cast(argument); - amd::Os::printf(fmt.data(),argumentStr); - //copiedBytes = strlen(argumentStr) - while (argumentStr[copiedBytes++] != 0); - } + size_t copiedBytes = size; + // Print the string argument, using standard PrintfDbg() + if (checkString(fmt.c_str())) { + // copiedBytes should be as number of printed chars + copiedBytes = 0; + //(null) should be printed + if (*argument == 0) { + amd::Os::printf(fmt.data(), 0); + // copiedBytes = strlen("(null)") + copiedBytes = 6; + } else { + const unsigned char* argumentStr = reinterpret_cast(argument); + amd::Os::printf(fmt.data(), argumentStr); + // copiedBytes = strlen(argumentStr) + while (argumentStr[copiedBytes++] != 0) + ; } + } - // 
Print the argument(except for string ), using standard PrintfDbg() - else { - bool hlModifier = (strstr(fmt.c_str(),"hl") != NULL); - std::string hlFmt; - if (hlModifier) { - hlFmt = fmt; - hlFmt.erase(hlFmt.find_first_of("hl"),2); - } - switch (size) { - case 0: { - const char* str = reinterpret_cast(argument); - amd::Os::printf(fmt.data(), str); - // Find the string length - while (str[copiedBytes++] != 0); - } - break; - case 1: - amd::Os::printf(fmt.data(), *(reinterpret_cast(argument))); - break; - case 2: - case 4: - if (printFloat) { - static const char* fSpecifiers = "eEfgGa"; - std::string fmtF = fmt; - size_t posS = fmtF.find_first_of("%"); - size_t posE = fmtF.find_first_of(fSpecifiers); - if (posS != std::string::npos &&posE != std::string::npos) { - fmtF.replace(posS+1,posE-posS,"s"); - } - float fArg = *(reinterpret_cast(argument)); - float fSign = copysign(1.0,fArg); - if (isinf(fArg)&&!isnan(fArg)) { - if(fSign < 0) { - amd::Os::printf(fmtF.data(),"-infinity"); - } - else { - amd::Os::printf(fmtF.data(),"infinity"); - } - } - else if (isnan(fArg)) { - if(fSign < 0) { - amd::Os::printf(fmtF.data(),"-nan"); - } - else { - amd::Os::printf(fmtF.data(),"nan"); - } - } - else if (hlModifier) { - amd::Os::printf(hlFmt.data(),fArg); - } - else { - amd::Os::printf(fmt.data(),fArg); - } - } - else { - bool hhModifier = (strstr(fmt.c_str(),"hh") != NULL); - if (hhModifier) { - //current implementation of printf in gcc 4.5.2 runtime libraries, doesn`t recognize "hh" modifier ==> - //argument should be explicitly converted to unsigned char (uchar) before printing and - //fmt should be updated not to contain "hh" modifier - std::string hhFmt = fmt; - hhFmt.erase(hhFmt.find_first_of("h"),2); - amd::Os::printf(hhFmt.data(), *(reinterpret_cast(argument))); - } - else if (hlModifier) { - amd::Os::printf(hlFmt.data(), *argument); - } - else { - amd::Os::printf(fmt.data(), *argument); - } - } - break; - case 8: - if (printFloat) { - if (hlModifier) { - 
amd::Os::printf(hlFmt.data(), *(reinterpret_cast(argument))); - } - else { - amd::Os::printf(fmt.data(), *(reinterpret_cast(argument))); - } - } - else { - std::string out = fmt; - // Use 'll' for 64 bit printf - out.insert((out.size() - 1), 1, 'l'); - amd::Os::printf(out.data(), *(reinterpret_cast(argument))); - } - break; - case ConstStr: { - const char* str = reinterpret_cast(argument); - amd::Os::printf(fmt.data(), str); - } - break; - default: - amd::Os::printf("Error: Unsupported data size for PrintfDbg. %d bytes", - static_cast(size)); - return 0; - } + // Print the argument(except for string ), using standard PrintfDbg() + else { + bool hlModifier = (strstr(fmt.c_str(), "hl") != NULL); + std::string hlFmt; + if (hlModifier) { + hlFmt = fmt; + hlFmt.erase(hlFmt.find_first_of("hl"), 2); } - fflush(stdout); - return copiedBytes; + switch (size) { + case 0: { + const char* str = reinterpret_cast(argument); + amd::Os::printf(fmt.data(), str); + // Find the string length + while (str[copiedBytes++] != 0) + ; + } break; + case 1: + amd::Os::printf(fmt.data(), *(reinterpret_cast(argument))); + break; + case 2: + case 4: + if (printFloat) { + static const char* fSpecifiers = "eEfgGa"; + std::string fmtF = fmt; + size_t posS = fmtF.find_first_of("%"); + size_t posE = fmtF.find_first_of(fSpecifiers); + if (posS != std::string::npos && posE != std::string::npos) { + fmtF.replace(posS + 1, posE - posS, "s"); + } + float fArg = *(reinterpret_cast(argument)); + float fSign = copysign(1.0, fArg); + if (isinf(fArg) && !isnan(fArg)) { + if (fSign < 0) { + amd::Os::printf(fmtF.data(), "-infinity"); + } else { + amd::Os::printf(fmtF.data(), "infinity"); + } + } else if (isnan(fArg)) { + if (fSign < 0) { + amd::Os::printf(fmtF.data(), "-nan"); + } else { + amd::Os::printf(fmtF.data(), "nan"); + } + } else if (hlModifier) { + amd::Os::printf(hlFmt.data(), fArg); + } else { + amd::Os::printf(fmt.data(), fArg); + } + } else { + bool hhModifier = (strstr(fmt.c_str(), "hh") != 
NULL); + if (hhModifier) { + // current implementation of printf in gcc 4.5.2 runtime libraries, doesn`t recognize + // "hh" modifier ==> + // argument should be explicitly converted to unsigned char (uchar) before printing and + // fmt should be updated not to contain "hh" modifier + std::string hhFmt = fmt; + hhFmt.erase(hhFmt.find_first_of("h"), 2); + amd::Os::printf(hhFmt.data(), *(reinterpret_cast(argument))); + } else if (hlModifier) { + amd::Os::printf(hlFmt.data(), *argument); + } else { + amd::Os::printf(fmt.data(), *argument); + } + } + break; + case 8: + if (printFloat) { + if (hlModifier) { + amd::Os::printf(hlFmt.data(), *(reinterpret_cast(argument))); + } else { + amd::Os::printf(fmt.data(), *(reinterpret_cast(argument))); + } + } else { + std::string out = fmt; + // Use 'll' for 64 bit printf + out.insert((out.size() - 1), 1, 'l'); + amd::Os::printf(out.data(), *(reinterpret_cast(argument))); + } + break; + case ConstStr: { + const char* str = reinterpret_cast(argument); + amd::Os::printf(fmt.data(), str); + } break; + default: + amd::Os::printf("Error: Unsupported data size for PrintfDbg. 
%d bytes", + static_cast(size)); + return 0; + } + } + fflush(stdout); + return copiedBytes; } -void -PrintfDbg::outputDbgBuffer(const PrintfInfo& info, const uint32_t* workitemData, size_t& i) const -{ - static const char* specifiers = "cdieEfgGaosuxXp"; - static const char* modifiers = "hl"; - static const char* special = "%n"; - static const std::string sepStr = "%s"; - const uint32_t* s = workitemData; - size_t pos = 0; +void PrintfDbg::outputDbgBuffer(const PrintfInfo& info, const uint32_t* workitemData, + size_t& i) const { + static const char* specifiers = "cdieEfgGaosuxXp"; + static const char* modifiers = "hl"; + static const char* special = "%n"; + static const std::string sepStr = "%s"; + const uint32_t* s = workitemData; + size_t pos = 0; - // Find the format string - std::string str = info.fmtString_; - std::string fmt; - size_t posStart, posEnd; + // Find the format string + std::string str = info.fmtString_; + std::string fmt; + size_t posStart, posEnd; - // Print all arguments - // Note: the following code walks through all arguments, provided by the kernel and - // finds the corresponding specifier in the format string. - // Then it splits the original string into substrings with a single specifier and - // uses standard PrintfDbg() to print each argument - for (uint j = 0; j < info.arguments_.size(); ++j) { - do { - posStart = str.find_first_of("%", pos); - if (posStart != std::string::npos) { - posStart++; - // Erase all spaces after % - while (str[posStart] == ' ') { - str.erase(posStart, 1); - } - size_t tmp = str.find_first_of(special, posStart); - size_t tmp2 = str.find_first_of(specifiers, posStart); - // Special cases. 
Special symbol is located before any specifier - if (tmp < tmp2) { - posEnd = posStart + 1; - fmt = str.substr(pos, posEnd - pos); - fmt.erase(posStart - pos - 1, 1); - pos = posStart = posEnd; - outputArgument(sepStr, false, ConstStr, - reinterpret_cast(fmt.data())); - continue; - } - break; - } - else if (pos < str.length()) { - outputArgument(sepStr, false, ConstStr,reinterpret_cast((str.substr(pos)).data())); - } + // Print all arguments + // Note: the following code walks through all arguments, provided by the kernel and + // finds the corresponding specifier in the format string. + // Then it splits the original string into substrings with a single specifier and + // uses standard PrintfDbg() to print each argument + for (uint j = 0; j < info.arguments_.size(); ++j) { + do { + posStart = str.find_first_of("%", pos); + if (posStart != std::string::npos) { + posStart++; + // Erase all spaces after % + while (str[posStart] == ' ') { + str.erase(posStart, 1); } - while (posStart != std::string::npos); - - if (posStart != std::string::npos) { - bool printFloat = false; - int vectorSize = 0; - size_t length; - size_t idPos = 0; - - // Search for PrintfDbg specifier in the format string. 
- // It will be a split point for the output - posEnd = str.find_first_of(specifiers, posStart); - if (posEnd == std::string::npos) { - pos = posStart = posEnd; - break; - } - posEnd++; - - size_t curPos = posEnd; - vectorSize = checkVectorSpecifier(str, posStart, curPos); - - // Get substring from the last position to the current specifier - fmt = str.substr(pos, posEnd - pos); - - // Readjust the string pointer if PrintfDbg outputs a vector - if (vectorSize != 0) { - size_t posVecSpec = fmt.length()-(curPos + 1); - size_t posVecMod = fmt.find_first_of(modifiers,posVecSpec + 1); - size_t posMod = str.find_first_of(modifiers,posStart); - if(posMod < posEnd){ - fmt = fmt.erase(posVecSpec, posVecMod - posVecSpec); - } - else{ - fmt = fmt.erase(posVecSpec, curPos); - } - idPos = posStart - pos - 1; - } - pos = posStart = posEnd; - - // Find out if the argument is a float - printFloat = checkFloat(fmt); - - // Is it a scalar value? - if (vectorSize == 0) { - length = outputArgument(fmt, printFloat, info.arguments_[j], &s[i]); - if (0 == length) { - return; - } - i += amd::alignUp(length, sizeof(uint32_t)) / sizeof(uint32_t); - } - else { - // 3-component vector's size is defined as 4 * size of each scalar component - size_t elemSize = info.arguments_[j] / (vectorSize == 3 ? 
4 : vectorSize); - size_t k = i * sizeof(uint32_t); - std::string elementStr = fmt.substr(idPos, fmt.size()); - - // Print first element with full string - if (0 == outputArgument(fmt, printFloat, elemSize, &s[i])) { - return; - } - - // Print other elemnts with separator if available - for (int e = 1; e < vectorSize; ++e) { - const char* t = reinterpret_cast(s); - // Output the vector separator - outputArgument(sepStr, false, ConstStr, - reinterpret_cast(Separator)); - - // Output the next element - outputArgument(elementStr, printFloat, elemSize, - reinterpret_cast(&t[k + e * elemSize])); - } - i += (amd::alignUp(info.arguments_[j], sizeof(uint32_t))) - / sizeof(uint32_t); - } + size_t tmp = str.find_first_of(special, posStart); + size_t tmp2 = str.find_first_of(specifiers, posStart); + // Special cases. Special symbol is located before any specifier + if (tmp < tmp2) { + posEnd = posStart + 1; + fmt = str.substr(pos, posEnd - pos); + fmt.erase(posStart - pos - 1, 1); + pos = posStart = posEnd; + outputArgument(sepStr, false, ConstStr, reinterpret_cast(fmt.data())); + continue; } - } - - if (pos != std::string::npos) { - fmt = str.substr(pos, str.size() - pos); + break; + } else if (pos < str.length()) { outputArgument(sepStr, false, ConstStr, - reinterpret_cast(fmt.data())); - } -} + reinterpret_cast((str.substr(pos)).data())); + } + } while (posStart != std::string::npos); -bool -PrintfDbg::clearWorkitems(VirtualGPU& gpu, size_t idxStart, size_t number) const -{ - // Go through all locations for every thread and copy 1 - for (uint i = idxStart; i < idxStart + number; ++i) { - amd::Coord3D dst(i * wiDbgSize(), 0, 0); - amd::Coord3D size(sizeof(uint32_t), 0, 0); + if (posStart != std::string::npos) { + bool printFloat = false; + int vectorSize = 0; + size_t length; + size_t idPos = 0; - // Copy 1 into the corresponding location in the debug buffer - if (!initCntValue_.partialMemCopyTo( - gpu, amd::Coord3D(0, 0, 0), dst, size, *dbgBuffer_)) { - return false; + // 
Search for PrintfDbg specifier in the format string. + // It will be a split point for the output + posEnd = str.find_first_of(specifiers, posStart); + if (posEnd == std::string::npos) { + pos = posStart = posEnd; + break; + } + posEnd++; + + size_t curPos = posEnd; + vectorSize = checkVectorSpecifier(str, posStart, curPos); + + // Get substring from the last position to the current specifier + fmt = str.substr(pos, posEnd - pos); + + // Readjust the string pointer if PrintfDbg outputs a vector + if (vectorSize != 0) { + size_t posVecSpec = fmt.length() - (curPos + 1); + size_t posVecMod = fmt.find_first_of(modifiers, posVecSpec + 1); + size_t posMod = str.find_first_of(modifiers, posStart); + if (posMod < posEnd) { + fmt = fmt.erase(posVecSpec, posVecMod - posVecSpec); + } else { + fmt = fmt.erase(posVecSpec, curPos); } + idPos = posStart - pos - 1; + } + pos = posStart = posEnd; + + // Find out if the argument is a float + printFloat = checkFloat(fmt); + + // Is it a scalar value? + if (vectorSize == 0) { + length = outputArgument(fmt, printFloat, info.arguments_[j], &s[i]); + if (0 == length) { + return; + } + i += amd::alignUp(length, sizeof(uint32_t)) / sizeof(uint32_t); + } else { + // 3-component vector's size is defined as 4 * size of each scalar component + size_t elemSize = info.arguments_[j] / (vectorSize == 3 ? 
4 : vectorSize); + size_t k = i * sizeof(uint32_t); + std::string elementStr = fmt.substr(idPos, fmt.size()); + + // Print first element with full string + if (0 == outputArgument(fmt, printFloat, elemSize, &s[i])) { + return; + } + + // Print other elemnts with separator if available + for (int e = 1; e < vectorSize; ++e) { + const char* t = reinterpret_cast(s); + // Output the vector separator + outputArgument(sepStr, false, ConstStr, reinterpret_cast(Separator)); + + // Output the next element + outputArgument(elementStr, printFloat, elemSize, + reinterpret_cast(&t[k + e * elemSize])); + } + i += (amd::alignUp(info.arguments_[j], sizeof(uint32_t))) / sizeof(uint32_t); + } } - return true; + } + + if (pos != std::string::npos) { + fmt = str.substr(pos, str.size() - pos); + outputArgument(sepStr, false, ConstStr, reinterpret_cast(fmt.data())); + } } -uint32_t* -PrintfDbg::mapWorkitem(VirtualGPU& gpu, size_t idx, bool* realloc) -{ - uint32_t wiSize = 0; - amd::Coord3D src(idx * wiDbgSize(), 0, 0); +bool PrintfDbg::clearWorkitems(VirtualGPU& gpu, size_t idxStart, size_t number) const { + // Go through all locations for every thread and copy 1 + for (uint i = idxStart; i < idxStart + number; ++i) { + amd::Coord3D dst(i * wiDbgSize(), 0, 0); + amd::Coord3D size(sizeof(uint32_t), 0, 0); + + // Copy 1 into the corresponding location in the debug buffer + if (!initCntValue_.partialMemCopyTo(gpu, amd::Coord3D(0, 0, 0), dst, size, *dbgBuffer_)) { + return false; + } + } + return true; +} + +uint32_t* PrintfDbg::mapWorkitem(VirtualGPU& gpu, size_t idx, bool* realloc) { + uint32_t wiSize = 0; + amd::Coord3D src(idx * wiDbgSize(), 0, 0); + xferBufRead_ = &(dev().xferRead().acquire()); + + // Copy workitem size from the corresponding location in the debug buffer + if (!dbgBuffer_->partialMemCopyTo(gpu, src, amd::Coord3D(0, 0, 0), + amd::Coord3D(sizeof(uint32_t), 0, 0), *xferBufRead_)) { + return NULL; + } + + // Get memory pointer to the satged buffer + uint32_t* workitem = 
reinterpret_cast(xferBufRead_->map(&gpu)); + if (NULL == workitem) { + return NULL; + } + + // Copy size value + wiSize = *workitem; + xferBufRead_->unmap(&gpu); + + // Check if the cuurent workitem almost reached the size limit + if ((wiDbgSize() - static_cast(wiSize)) < 3) { + *realloc = true; + } + + // If the current workitem had any output then get the data + if ((wiSize > 1) && (wiSize <= wiDbgSize())) { + amd::Coord3D size(wiSize * sizeof(uint32_t), 0, 0); + + // Copy the current workitem output data to the staged buffer + if (!dbgBuffer_->partialMemCopyTo(gpu, src, amd::Coord3D(0, 0, 0), size, *xferBufRead_) || + // Clear the write pointer back to index 1 for the current workitem + !clearWorkitems(gpu, idx, 1)) { + LogError("Reading the workitem data failed!"); + return NULL; + } + + // Get a pointer to the workitem data + uint32_t* workitem = reinterpret_cast(xferBufRead_->map(&gpu)); + + return workitem; + } + + return NULL; +} + +void PrintfDbg::unmapWorkitem(VirtualGPU& gpu, const uint32_t* workitemData) const { + if (NULL != workitemData) { + xferBufRead_->unmap(&gpu); + } + + dev().xferRead().release(gpu, *xferBufRead_); +} + +bool PrintfDbgHSA::init(VirtualGPU& gpu, bool printfEnabled) { + // Set up debug output buffer (if printf active) + if (printfEnabled) { + if (!allocate()) { + return false; + } + + // The first two DWORDs in the printf buffer are as follows: + // First DWORD = Offset to where next information is to + // be written, initialized to 0 + // Second DWORD = Number of bytes available for printf data + // = buffer size – 2*sizeof(uint32_t) + const uint8_t initSize = 2 * sizeof(uint32_t); + uint8_t sysMem[initSize]; + memset(sysMem, 0, initSize); + uint32_t dbgBufferSize = dbgBuffer_->size() - initSize; + memcpy(&sysMem[4], &dbgBufferSize, sizeof(dbgBufferSize)); + + // Copy offset and number of bytes available for printf data + // into the corresponding location in the debug buffer + dbgBuffer_->writeRawData(gpu, initSize, sysMem, 
true); + } + return true; +} + +bool PrintfDbgHSA::output(VirtualGPU& gpu, bool printfEnabled, + const std::vector& printfInfo) { + if (printfEnabled) { + uint32_t offsetSize = 0; xferBufRead_ = &(dev().xferRead().acquire()); - // Copy workitem size from the corresponding location in the debug buffer - if (!dbgBuffer_->partialMemCopyTo(gpu, - src, amd::Coord3D(0, 0, 0), amd::Coord3D(sizeof(uint32_t), 0, 0), - *xferBufRead_)) { - return NULL; + // Copy offset from the first DWORD in the debug buffer + if (!dbgBuffer_->partialMemCopyTo(gpu, amd::Coord3D(0, 0, 0), amd::Coord3D(0, 0, 0), + amd::Coord3D(sizeof(uint32_t), 0, 0), *xferBufRead_)) { + return false; } // Get memory pointer to the satged buffer - uint32_t* workitem = reinterpret_cast(xferBufRead_->map(&gpu)); - if (NULL == workitem) { - return NULL; + uint32_t* dbgBufferPtr = reinterpret_cast(xferBufRead_->map(&gpu)); + if (NULL == dbgBufferPtr) { + return false; } - // Copy size value - wiSize = *workitem; + offsetSize = *dbgBufferPtr; xferBufRead_->unmap(&gpu); - // Check if the cuurent workitem almost reached the size limit - if ((wiDbgSize() - static_cast(wiSize)) < 3) { - *realloc = true; + if (offsetSize == 0) { + LogInfo("The printf buffer is empty!"); + dev().xferRead().release(gpu, *xferBufRead_); + return true; } - // If the current workitem had any output then get the data - if ((wiSize > 1) && (wiSize <= wiDbgSize())) { - amd::Coord3D size(wiSize * sizeof(uint32_t), 0, 0); + size_t bufSize = dev().xferRead().bufSize(); + size_t copySize = offsetSize; + while (copySize != 0) { + // Copy the buffer data (i.e., the printfID followed by the + // argument data for each printf call in th kernel) to the staged buffer + if (!dbgBuffer_->partialMemCopyTo( + gpu, amd::Coord3D(2 * sizeof(uint32_t) + offsetSize - copySize, 0, 0), + amd::Coord3D(0, 0, 0), std::min(copySize, bufSize), *xferBufRead_)) { + return false; + } - // Copy the current workitem output data to the staged buffer - if 
(!dbgBuffer_->partialMemCopyTo( - gpu, src, amd::Coord3D(0, 0, 0), size, *xferBufRead_) || - // Clear the write pointer back to index 1 for the current workitem - !clearWorkitems(gpu, idx, 1)) { - LogError("Reading the workitem data failed!"); - return NULL; + // Get a pointer to the buffer data + dbgBufferPtr = reinterpret_cast(xferBufRead_->map(&gpu)); + if (NULL == dbgBufferPtr) { + return false; + } + + + std::vector::const_iterator ita; + uint sb = 0; + uint sbt = 0; + + // parse the debug buffer + while (sbt < copySize) { + assert(((*dbgBufferPtr) < printfInfo.size()) && "Cound't find the reported PrintfID!"); + const PrintfInfo& info = printfInfo[(*dbgBufferPtr)]; + sb += sizeof(uint32_t); + for (ita = info.arguments_.begin(); ita != info.arguments_.end(); ++ita) { + sb += *ita; } - // Get a pointer to the workitem data - uint32_t* workitem = reinterpret_cast - (xferBufRead_->map(&gpu)); + if (sbt + sb > bufSize) { + break; // Need new portion of data in staging buffer + } - return workitem; - } + size_t idx = 1; + // There's something in the debug buffer + outputDbgBuffer(info, dbgBufferPtr, idx); - return NULL; -} + sbt += sb; + dbgBufferPtr += sb / sizeof(uint32_t); + sb = 0; + } -void -PrintfDbg::unmapWorkitem(VirtualGPU& gpu , const uint32_t* workitemData) const -{ - if (NULL != workitemData) { - xferBufRead_->unmap(&gpu); + copySize -= sbt; + xferBufRead_->unmap(&gpu); } dev().xferRead().release(gpu, *xferBufRead_); + } + + return true; } -bool -PrintfDbgHSA::init( - VirtualGPU& gpu, - bool printfEnabled) -{ - // Set up debug output buffer (if printf active) - if (printfEnabled) { - if (!allocate()) { - return false; - } - - // The first two DWORDs in the printf buffer are as follows: - // First DWORD = Offset to where next information is to - // be written, initialized to 0 - // Second DWORD = Number of bytes available for printf data - // = buffer size – 2*sizeof(uint32_t) - const uint8_t initSize = 2*sizeof(uint32_t); - uint8_t sysMem[initSize]; - 
memset(sysMem, 0, initSize); - uint32_t dbgBufferSize = dbgBuffer_->size() - initSize; - memcpy(&sysMem[4], &dbgBufferSize, sizeof(dbgBufferSize)); - - // Copy offset and number of bytes available for printf data - // into the corresponding location in the debug buffer - dbgBuffer_->writeRawData(gpu, initSize, sysMem, true); - } - return true; -} - -bool -PrintfDbgHSA::output( - VirtualGPU& gpu, - bool printfEnabled, - const std::vector& printfInfo) -{ - if (printfEnabled) { - uint32_t offsetSize = 0; - xferBufRead_ = &(dev().xferRead().acquire()); - - // Copy offset from the first DWORD in the debug buffer - if (!dbgBuffer_->partialMemCopyTo(gpu, - amd::Coord3D(0, 0, 0), amd::Coord3D(0, 0, 0), - amd::Coord3D(sizeof(uint32_t), 0, 0),*xferBufRead_)) { - return false; - } - - // Get memory pointer to the satged buffer - uint32_t* dbgBufferPtr = reinterpret_cast(xferBufRead_->map(&gpu)); - if (NULL == dbgBufferPtr) { - return false; - } - - offsetSize = *dbgBufferPtr; - xferBufRead_->unmap(&gpu); - - if (offsetSize == 0) { - LogInfo("The printf buffer is empty!"); - dev().xferRead().release(gpu, *xferBufRead_); - return true; - } - - size_t bufSize = dev().xferRead().bufSize(); - size_t copySize = offsetSize; - while (copySize != 0) { - // Copy the buffer data (i.e., the printfID followed by the - //argument data for each printf call in th kernel) to the staged buffer - if (!dbgBuffer_->partialMemCopyTo(gpu, - amd::Coord3D(2*sizeof(uint32_t) + offsetSize - copySize, 0, 0), - amd::Coord3D(0, 0, 0), - std::min(copySize, bufSize), *xferBufRead_)) { - return false; - } - - // Get a pointer to the buffer data - dbgBufferPtr = reinterpret_cast(xferBufRead_->map(&gpu)); - if (NULL == dbgBufferPtr) { - return false; - } - - - std::vector::const_iterator ita; - uint sb = 0; - uint sbt = 0; - - // parse the debug buffer - while (sbt < copySize) { - assert(((*dbgBufferPtr) < printfInfo.size()) && - "Cound't find the reported PrintfID!"); - const PrintfInfo& info = 
printfInfo[(*dbgBufferPtr)]; - sb += sizeof(uint32_t); - for (ita = info.arguments_.begin(); - ita != info.arguments_.end(); ++ita){ - sb += *ita; - } - - if (sbt + sb > bufSize) { - break; // Need new portion of data in staging buffer - } - - size_t idx = 1; - // There's something in the debug buffer - outputDbgBuffer(info, dbgBufferPtr, idx); - - sbt += sb; - dbgBufferPtr += sb/sizeof(uint32_t); - sb = 0; - } - - copySize -= sbt; - xferBufRead_->unmap(&gpu); - } - - dev().xferRead().release(gpu, *xferBufRead_); - } - - return true; -} - -} // namespace gpu +} // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpuprintf.hpp b/rocclr/runtime/device/gpu/gpuprintf.hpp index 9c6316d8c6..76df64abd6 100644 --- a/rocclr/runtime/device/gpu/gpuprintf.hpp +++ b/rocclr/runtime/device/gpu/gpuprintf.hpp @@ -12,182 +12,159 @@ #ifndef isinf #ifdef _MSC_VER #define isinf(X) (!_finite(X) && !_isnan(X)) -#endif //_MSC_VER -#endif //isinf +#endif //_MSC_VER +#endif // isinf #ifndef isnan #ifdef _MSC_VER #define isnan(X) (_isnan(X)) -#endif //_MSC_VER -#endif //isnan +#endif //_MSC_VER +#endif // isnan #ifndef copysign #ifdef _MSC_VER -#define copysign(X,Y) (_copysign(X,Y)) -#endif //_MSC_VER -#endif //copysign +#define copysign(X, Y) (_copysign(X, Y)) +#endif //_MSC_VER +#endif // copysign //! GPU Device Implementation namespace gpu { //! Printf info structure -struct PrintfInfo -{ - std::string fmtString_; //!< formated string for printf - std::vector arguments_; //!< passed arguments to the printf() call +struct PrintfInfo { + std::string fmtString_; //!< formated string for printf + std::vector arguments_; //!< passed arguments to the printf() call }; class Kernel; class VirtualGPU; class Memory; -class PrintfDbg : public amd::HeapObject -{ -public: - //! Debug buffer size per workitem - static const uint WorkitemDebugSize = 4096; +class PrintfDbg : public amd::HeapObject { + public: + //! Debug buffer size per workitem + static const uint WorkitemDebugSize = 4096; - //! 
Default constructor - PrintfDbg( - Device& device, - FILE* file = NULL - ); + //! Default constructor + PrintfDbg(Device& device, FILE* file = NULL); - //! Destructor - ~PrintfDbg(); + //! Destructor + ~PrintfDbg(); - //! Creates the PrintfDbg object - bool create(); + //! Creates the PrintfDbg object + bool create(); - //! Initializes the debug buffer before kernel's execution - bool init( - VirtualGPU& gpu, //!< Virtual GPU object - bool printfEnabled, //!< checks for printf - const amd::NDRange& size //!< Kernel's workload - ); + //! Initializes the debug buffer before kernel's execution + bool init(VirtualGPU& gpu, //!< Virtual GPU object + bool printfEnabled, //!< checks for printf + const amd::NDRange& size //!< Kernel's workload + ); - //! Prints the kernel's debug informaiton from the buffer - bool output( - VirtualGPU& gpu, //!< Virtual GPU object - bool printfEnabled, //!< checks for printf - const amd::NDRange& size, //!< Kernel's workload - const std::vector& printfInfo //!< printf info - ); + //! Prints the kernel's debug informaiton from the buffer + bool output(VirtualGPU& gpu, //!< Virtual GPU object + bool printfEnabled, //!< checks for printf + const amd::NDRange& size, //!< Kernel's workload + const std::vector& printfInfo //!< printf info + ); - //! Returns the debug buffer offset - uint64_t bufOffset() const; + //! Returns the debug buffer offset + uint64_t bufOffset() const; - //! Debug buffer size per workitem - size_t wiDbgSize() const { return wiDbgSize_; } + //! Debug buffer size per workitem + size_t wiDbgSize() const { return wiDbgSize_; } - //! Returns debug buffer object - Memory* dbgBuffer() const { return dbgBuffer_; } + //! 
Returns debug buffer object + Memory* dbgBuffer() const { return dbgBuffer_; } -protected: - Memory* dbgBuffer_; //!< Buffer to hold debug output - FILE* dbgFile_; //!< Debug file - Device& gpuDevice_; //!< GPU device object - Memory* xferBufRead_; //!< Transfer buffer for the dump read + protected: + Memory* dbgBuffer_; //!< Buffer to hold debug output + FILE* dbgFile_; //!< Debug file + Device& gpuDevice_; //!< GPU device object + Memory* xferBufRead_; //!< Transfer buffer for the dump read - //! Gets GPU device object - Device& dev() const { return gpuDevice_; } + //! Gets GPU device object + Device& dev() const { return gpuDevice_; } - //! Allocates the debug buffer - bool allocate( - bool realloc = false //!< If TRUE then reallocate the debug memory - ); + //! Allocates the debug buffer + bool allocate(bool realloc = false //!< If TRUE then reallocate the debug memory + ); - //! Returns TRUE if a float value has to be printed - bool checkFloat( - const std::string& fmt //!< Format string - ) const; + //! Returns TRUE if a float value has to be printed + bool checkFloat(const std::string& fmt //!< Format string + ) const; - //! Returns TRUE if a string value has to be printed - bool checkString( - const std::string& fmt //!< Format string - ) const; + //! Returns TRUE if a string value has to be printed + bool checkString(const std::string& fmt //!< Format string + ) const; - //! Finds the specifier in the format string - int checkVectorSpecifier( - const std::string& fmt, //!< Format string - size_t startPos, //!< Start position for processing - size_t& curPos //!< End position for processing - ) const; + //! Finds the specifier in the format string + int checkVectorSpecifier(const std::string& fmt, //!< Format string + size_t startPos, //!< Start position for processing + size_t& curPos //!< End position for processing + ) const; - //! 
Outputs an argument - size_t outputArgument( - const std::string& fmt, //!< Format strint - bool printFloat, //!< Argument is a float value - size_t size, //!< Argument's size - const uint32_t* argument //!< Argument's location - ) const; + //! Outputs an argument + size_t outputArgument(const std::string& fmt, //!< Format strint + bool printFloat, //!< Argument is a float value + size_t size, //!< Argument's size + const uint32_t* argument //!< Argument's location + ) const; - //! Displays the PrintfDbg - void outputDbgBuffer( - const PrintfInfo& info, //!< printf info - const uint32_t* workitemData, //!< The PrintfDbg dump buffer - size_t& i //!< index to the data in the buffer - ) const; + //! Displays the PrintfDbg + void outputDbgBuffer(const PrintfInfo& info, //!< printf info + const uint32_t* workitemData, //!< The PrintfDbg dump buffer + size_t& i //!< index to the data in the buffer + ) const; -private: - //! Disable copy constructor - PrintfDbg(const PrintfDbg&); + private: + //! Disable copy constructor + PrintfDbg(const PrintfDbg&); - //! Disable assignment - PrintfDbg& operator=(const PrintfDbg&); + //! Disable assignment + PrintfDbg& operator=(const PrintfDbg&); - //! Returns the pointer to the workitem data block - bool clearWorkitems( - VirtualGPU& gpu, //!< Virtual GPU object - size_t idxStart, //!< Workitem global index start - size_t number //!< Number of workitems to clear - ) const; + //! Returns the pointer to the workitem data block + bool clearWorkitems(VirtualGPU& gpu, //!< Virtual GPU object + size_t idxStart, //!< Workitem global index start + size_t number //!< Number of workitems to clear + ) const; - //! Returns the pointer to the workitem data block - uint32_t* mapWorkitem( - VirtualGPU& gpu, //!< Virtual GPU object - size_t idx, //!< Workitem global index - bool* realloc //!< Returns TRUE if workitem reached the buffer limit - ); + //! 
Returns the pointer to the workitem data block + uint32_t* mapWorkitem(VirtualGPU& gpu, //!< Virtual GPU object + size_t idx, //!< Workitem global index + bool* realloc //!< Returns TRUE if workitem reached the buffer limit + ); - //! Unamp the staged buffer - void unmapWorkitem( - VirtualGPU& gpu, //!< Virtual GPU object - const uint32_t* workitemData //!< The PrintfDbg dump buffer - ) const; + //! Unamp the staged buffer + void unmapWorkitem(VirtualGPU& gpu, //!< Virtual GPU object + const uint32_t* workitemData //!< The PrintfDbg dump buffer + ) const; - size_t wiDbgSize_; //!< Workitem debug size - Memory initCntValue_; //!< Initialized count value + size_t wiDbgSize_; //!< Workitem debug size + Memory initCntValue_; //!< Initialized count value }; -class PrintfDbgHSA : public PrintfDbg -{ -public: +class PrintfDbgHSA : public PrintfDbg { + public: + //! Default constructor + PrintfDbgHSA(Device& device, FILE* file = NULL) : PrintfDbg(device, file) {} - //! Default constructor - PrintfDbgHSA( - Device& device, - FILE* file = NULL - ): PrintfDbg(device, file) { } + //! Initializes the debug buffer before kernel's execution + bool init(VirtualGPU& gpu, //!< Virtual GPU object + bool printfEnabled //!< checks for printf + ); - //! Initializes the debug buffer before kernel's execution - bool init( - VirtualGPU& gpu, //!< Virtual GPU object - bool printfEnabled //!< checks for printf - ); + //! Prints the kernel's debug informaiton from the buffer + bool output(VirtualGPU& gpu, //!< Virtual GPU object + bool printfEnabled, //!< checks for printf + const std::vector& printfInfo //!< printf info + ); - //! Prints the kernel's debug informaiton from the buffer - bool output( - VirtualGPU& gpu, //!< Virtual GPU object - bool printfEnabled, //!< checks for printf - const std::vector& printfInfo //!< printf info - ); + private: + //! Disable copy constructor + PrintfDbgHSA(const PrintfDbgHSA&); -private: - //! 
Disable copy constructor - PrintfDbgHSA(const PrintfDbgHSA&); - - //! Disable assignment - PrintfDbgHSA& operator=(const PrintfDbgHSA&); + //! Disable assignment + PrintfDbgHSA& operator=(const PrintfDbgHSA&); }; /*@}*/} // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpuprogram.cpp b/rocclr/runtime/device/gpu/gpuprogram.cpp index 0e54de9e0d..d56c1cc219 100644 --- a/rocclr/runtime/device/gpu/gpuprogram.cpp +++ b/rocclr/runtime/device/gpu/gpuprogram.cpp @@ -23,531 +23,480 @@ namespace gpu { -bool -NullProgram::initBuild(amd::option::Options* options) -{ - if (!device::Program::initBuild(options)) { - return false; - } - - const char* devname = dev().hwInfo()->machineTarget_; - options->setPerBuildInfo( - (devname && (devname[0] != '\0')) ? devname : "gpu", - clBinary()->getEncryptCode(), - true // FIXME: the dev ptr is used to query the wavefront size. - ); - - // Elf Binary setup - std::string outFileName; - - // Recompile from IL may happen (invoking Kernel::recompil()) to generate correct - // isa code for 7xx. Because of this, force saving AMDIL into the binary. - clBinary()->init(options, (dev().calTarget() <= CAL_TARGET_730)); - if (options->isDumpFlagSet(amd::option::DUMP_BIF)) { - outFileName = options->getDumpFileName(".bin"); - } - - bool useELF64 = dev().settings().use64BitPtr_; - if (!clBinary()->setElfOut(useELF64 ? ELFCLASS64 : ELFCLASS32, - (outFileName.size() > 0) ? 
outFileName.c_str() : NULL)) { - LogError("Setup elf out for gpu failed"); - return false; - } - return true; -} - -bool -NullProgram::finiBuild(bool isBuildGood) -{ - clBinary()->resetElfOut(); - clBinary()->resetElfIn(); - - if (!isBuildGood) { - // Prevent the encrypted binary form leaking out - clBinary()->setBinary(NULL, 0); - } - - return device::Program::finiBuild(isBuildGood); -} - -const aclTargetInfo & -NullProgram::info(const char * str) { - acl_error err; - std::string arch = GPU_TARGET_INFO_ARCH; - if (dev().settings().use64BitPtr_) { - arch += "64"; - } - info_ = aclGetTargetInfo(arch.c_str(), ( str && str[0] == '\0' ? dev().hwInfo()->targetName_ : str ), &err); - if (err != ACL_SUCCESS) { - LogWarning("aclGetTargetInfo failed"); - } - return info_; -} - -NullProgram::~NullProgram() -{ - // Destroy all ILFunc objects - freeAllILFuncs(); - releaseClBinary(); -} - -bool -NullProgram::isCalled(const ILFunc* base, const ILFunc* func) -{ - // Loop through all functions, which will be called from the base one - for (size_t i = 0; i < base->calls_.size(); ++i) { - assert(base->calls_[i] != base && "recursion"); - // Check if the current function is the one - if (base->calls_[i] == func) { - return true; - } - // We have to use a recursive method to make sure it's not called inside - else if (isCalled(base->calls_[i], func)) { - return true; - } - } +bool NullProgram::initBuild(amd::option::Options* options) { + if (!device::Program::initBuild(options)) { return false; + } + + const char* devname = dev().hwInfo()->machineTarget_; + options->setPerBuildInfo((devname && (devname[0] != '\0')) ? devname : "gpu", + clBinary()->getEncryptCode(), + true // FIXME: the dev ptr is used to query the wavefront size. + ); + + // Elf Binary setup + std::string outFileName; + + // Recompile from IL may happen (invoking Kernel::recompil()) to generate correct + // isa code for 7xx. Because of this, force saving AMDIL into the binary. 
+ clBinary()->init(options, (dev().calTarget() <= CAL_TARGET_730)); + if (options->isDumpFlagSet(amd::option::DUMP_BIF)) { + outFileName = options->getDumpFileName(".bin"); + } + + bool useELF64 = dev().settings().use64BitPtr_; + if (!clBinary()->setElfOut(useELF64 ? ELFCLASS64 : ELFCLASS32, + (outFileName.size() > 0) ? outFileName.c_str() : NULL)) { + LogError("Setup elf out for gpu failed"); + return false; + } + return true; } -uint -ILFunc::totalHwPrivateUsage() { - if (totalHwPrivateSize_ >= 0) - return totalHwPrivateSize_; +bool NullProgram::finiBuild(bool isBuildGood) { + clBinary()->resetElfOut(); + clBinary()->resetElfIn(); + + if (!isBuildGood) { + // Prevent the encrypted binary form leaking out + clBinary()->setBinary(NULL, 0); + } + + return device::Program::finiBuild(isBuildGood); +} + +const aclTargetInfo& NullProgram::info(const char* str) { + acl_error err; + std::string arch = GPU_TARGET_INFO_ARCH; + if (dev().settings().use64BitPtr_) { + arch += "64"; + } + info_ = aclGetTargetInfo(arch.c_str(), + (str && str[0] == '\0' ? 
dev().hwInfo()->targetName_ : str), &err); + if (err != ACL_SUCCESS) { + LogWarning("aclGetTargetInfo failed"); + } + return info_; +} + +NullProgram::~NullProgram() { + // Destroy all ILFunc objects + freeAllILFuncs(); + releaseClBinary(); +} + +bool NullProgram::isCalled(const ILFunc* base, const ILFunc* func) { + // Loop through all functions, which will be called from the base one + for (size_t i = 0; i < base->calls_.size(); ++i) { + assert(base->calls_[i] != base && "recursion"); + // Check if the current function is the one + if (base->calls_[i] == func) { + return true; + } + // We have to use a recursive method to make sure it's not called inside + else if (isCalled(base->calls_[i], func)) { + return true; + } + } + return false; +} + +uint ILFunc::totalHwPrivateUsage() { + if (totalHwPrivateSize_ >= 0) return totalHwPrivateSize_; uint maxChildUsage = 0; for (size_t i = 0; i < calls_.size(); ++i) { uint childUsage = calls_[i]->totalHwPrivateUsage(); - if (childUsage > maxChildUsage) - maxChildUsage = childUsage; + if (childUsage > maxChildUsage) maxChildUsage = childUsage; } totalHwPrivateSize_ = hwPrivateSize_ + maxChildUsage; return totalHwPrivateSize_; } -void -NullProgram::patchMain(std::string& kernel, uint index) -{ - std::string callPatch = "call "; - char sym; +void NullProgram::patchMain(std::string& kernel, uint index) { + std::string callPatch = "call "; + char sym; - // Create the patch string - while (index) { - sym = (index % 10) + 0x30; - callPatch.insert(5, &sym, 1); - index /= 10; - } - callPatch += ";"; + // Create the patch string + while (index) { + sym = (index % 10) + 0x30; + callPatch.insert(5, &sym, 1); + index /= 10; + } + callPatch += ";"; - // Patch the program - kernel.replace(patch_, callPatch.size(), callPatch); + // Patch the program + kernel.replace(patch_, callPatch.size(), callPatch); } -NullKernel* -Program::createKernel( - const std::string& name, const Kernel::InitData* initData, - const std::string& code, const 
std::string& metadata, bool* created, - const void* binaryCode, size_t binarySize) -{ - amd::option::Options *options = getCompilerOptions(); - uint64_t start_time = 0; - if (options->oVariables->EnableBuildTiming) { - start_time = amd::Os::timeNanos(); - } +NullKernel* Program::createKernel(const std::string& name, const Kernel::InitData* initData, + const std::string& code, const std::string& metadata, + bool* created, const void* binaryCode, size_t binarySize) { + amd::option::Options* options = getCompilerOptions(); + uint64_t start_time = 0; + if (options->oVariables->EnableBuildTiming) { + start_time = amd::Os::timeNanos(); + } - *created = false; - // Create a GPU kernel - Kernel* gpuKernel = new Kernel(name, - static_cast(device()), *this, initData); + *created = false; + // Create a GPU kernel + Kernel* gpuKernel = new Kernel(name, static_cast(device()), *this, initData); - if (gpuKernel == NULL) { - buildLog_ += "new Kernel() failed"; - LogPrintfError("new Kernel() failed for kernel %s!", name.c_str()); - return NULL; - } - else if (gpuKernel->create(code, metadata, binaryCode, binarySize)) { - // Add kernel to the program - kernels()[gpuKernel->name()] = gpuKernel; - buildLog_ += gpuKernel->buildLog(); - } - else { - buildError_ = gpuKernel->buildError(); - buildLog_ += gpuKernel->buildLog(); - delete gpuKernel; - LogPrintfError("Kernel creation failed for kernel %s!", name.c_str()); - return NULL; - } + if (gpuKernel == NULL) { + buildLog_ += "new Kernel() failed"; + LogPrintfError("new Kernel() failed for kernel %s!", name.c_str()); + return NULL; + } else if (gpuKernel->create(code, metadata, binaryCode, binarySize)) { + // Add kernel to the program + kernels()[gpuKernel->name()] = gpuKernel; + buildLog_ += gpuKernel->buildLog(); + } else { + buildError_ = gpuKernel->buildError(); + buildLog_ += gpuKernel->buildLog(); + delete gpuKernel; + LogPrintfError("Kernel creation failed for kernel %s!", name.c_str()); + return NULL; + } - if 
(options->oVariables->EnableBuildTiming) { - std::stringstream tmp_ss; - tmp_ss << " Time for creating kernel (" - << name << ") : " - << (amd::Os::timeNanos() - start_time)/1000ULL - << " us\n"; - buildLog_ += tmp_ss.str(); - } + if (options->oVariables->EnableBuildTiming) { + std::stringstream tmp_ss; + tmp_ss << " Time for creating kernel (" << name + << ") : " << (amd::Os::timeNanos() - start_time) / 1000ULL << " us\n"; + buildLog_ += tmp_ss.str(); + } - *created = true; - return static_cast(gpuKernel); + *created = true; + return static_cast(gpuKernel); } -bool -NullProgram::linkImpl(amd::option::Options* options) -{ - if (llvmBinary_.empty()) { - // We are using either CL binary or IL directly. - bool hasRecompiled; - if (ilProgram_.empty()) { - // Setup elfIn() and try to load ISA from binary - // This elfIn() will be released at the end of build by finiBuild(). - if (!clBinary()->setElfIn(ELFCLASS32)) { - buildLog_ += "Internal error: Setting input OCL binary failed!\n"; - LogError("Setting input OCL binary failed"); - return false; +bool NullProgram::linkImpl(amd::option::Options* options) { + if (llvmBinary_.empty()) { + // We are using either CL binary or IL directly. + bool hasRecompiled; + if (ilProgram_.empty()) { + // Setup elfIn() and try to load ISA from binary + // This elfIn() will be released at the end of build by finiBuild(). 
+ if (!clBinary()->setElfIn(ELFCLASS32)) { + buildLog_ += "Internal error: Setting input OCL binary failed!\n"; + LogError("Setting input OCL binary failed"); + return false; + } + bool loadSuccess = false; + if (!options->oVariables->ForceLLVM) { + loadSuccess = loadBinary(&hasRecompiled); + } + if (!loadSuccess && (options->oVariables->UseDebugIL && !options->oVariables->ForceLLVM)) { + buildLog_ += "Internal error: Loading OpenCL binary under -use-debugil failed!\n"; + LogError("Loading OCL binary failed under -use-debugil"); + return false; + } + if (loadSuccess) { + if (hasRecompiled) { + char* section; + size_t sz; + if (clBinary()->saveSOURCE() && + clBinary()->elfIn()->getSection(amd::OclElf::SOURCE, §ion, &sz)) { + clBinary()->elfOut()->addSection(amd::OclElf::SOURCE, section, sz); + } + if (clBinary()->saveLLVMIR()) { + if (clBinary()->loadLlvmBinary(llvmBinary_, elfSectionType_) && + (!llvmBinary_.empty())) { + clBinary()->elfOut()->addSection(elfSectionType_, llvmBinary_.data(), + llvmBinary_.size(), false); } - bool loadSuccess = false; - if (!options->oVariables->ForceLLVM) { - loadSuccess = loadBinary(&hasRecompiled); - } - if (!loadSuccess && - (options->oVariables->UseDebugIL && - !options->oVariables->ForceLLVM)) { - buildLog_ += "Internal error: Loading OpenCL binary under -use-debugil failed!\n"; - LogError("Loading OCL binary failed under -use-debugil"); - return false; - } - if (loadSuccess) { - if (hasRecompiled) { - char *section; - size_t sz; - if (clBinary()->saveSOURCE() && - clBinary()->elfIn()->getSection(amd::OclElf::SOURCE, §ion, &sz)) { - clBinary()->elfOut()->addSection(amd::OclElf::SOURCE, section, sz); - } - if (clBinary()->saveLLVMIR()) { - if (clBinary()->loadLlvmBinary(llvmBinary_, elfSectionType_) && (!llvmBinary_.empty())) { - clBinary()->elfOut()->addSection(elfSectionType_, - llvmBinary_.data(), llvmBinary_.size(), false); - } - } + } - setType(TYPE_EXECUTABLE); - if 
(!clBinary()->createElfBinary(options->oVariables->BinEncrypt, type())) { - buildLog_ += "Internal error: Failed to create OpenCL binary!\n"; - LogError("Failed to create OpenCL binary"); - return false; - } - } - else { - // The original binary is good and reuse it. - // Release the new binary if there is. - clBinary()->restoreOrigBinary(); - } - return true; - } - else if (clBinary()->loadLlvmBinary(llvmBinary_, elfSectionType_) && - clBinary()->isRecompilable(llvmBinary_, amd::OclElf::CAL_PLATFORM)) { - char *section; - size_t sz; - - // Clean up and remove all the content generated before - if (!clBinary()->clearElfOut()) { - buildLog_ += "Internal error: Resetting OpenCL Binary failed!\n"; - LogError("Resetting output OCL binary failed"); - return false; - } - - if (clBinary()->saveSOURCE() && - clBinary()->elfIn()->getSection(amd::OclElf::SOURCE, §ion, &sz)) { - clBinary()->elfOut()->addSection(amd::OclElf::SOURCE, section, sz); - } - if (clBinary()->saveLLVMIR()) { - clBinary()->elfOut()->addSection(elfSectionType_, - llvmBinary_.data(), llvmBinary_.size(), false); - } - } - else { - buildLog_ += "Internal error: Input OpenCL binary is not for the target!\n"; - LogError("OCL Binary isn't good for the target"); - return false; - } - } - } - - if (!llvmBinary_.empty()) { - // Compile llvm binary to the IL source code - // This is link/OPT/Codegen part of compiler. - cl_int iErr = compileBinaryToIL(options); - if (iErr != CL_SUCCESS) { - buildLog_ += "Error: Compilation from LLVMIR binary to IL text failed!"; - LogError(buildLog_.c_str()); + setType(TYPE_EXECUTABLE); + if (!clBinary()->createElfBinary(options->oVariables->BinEncrypt, type())) { + buildLog_ += "Internal error: Failed to create OpenCL binary!\n"; + LogError("Failed to create OpenCL binary"); return false; + } + } else { + // The original binary is good and reuse it. + // Release the new binary if there is. 
+ clBinary()->restoreOrigBinary(); } - } - - if (!ilProgram_.empty() && options->oVariables->EnableDebug) { - // Lets parse out the dwarf debug information and store it in the elf - llvm::CompUnit compilation(ilProgram_); - std::string debugILStr = compilation.getILStr(); - const char* dbgSec = debugILStr.c_str(); - size_t dbgSize = debugILStr.size(); - // Add an IL section that contains debug information and is the - // output of LLVM codegen. - clBinary()->elfOut()->addSection(amd::OclElf::ILDEBUG, dbgSec, dbgSize); - - if ((dbgSize > 0) && options->isDumpFlagSet(amd::option::DUMP_DEBUGIL)) { - std::string debugilWithLine; - size_t b = 1; - size_t e; - int linenum=0; - char cstr[9]; - cstr[8] = 0; - while (b != std::string::npos) { - e = debugILStr.find_first_of("\n", b); - if (e != std::string::npos) { - ++e; - } - sprintf(&cstr[0], "%5x: ", linenum); - debugilWithLine.append(cstr); - debugilWithLine.append(debugILStr.substr(b,e-b)); - b = e; - ++linenum; - } - std::string debugilFileName = options->getDumpFileName(".debugil"); - std::fstream f; - f.open(debugilFileName.c_str(), (std::fstream::out | std::fstream::binary)); - f.write(debugilWithLine.c_str(), debugilWithLine.size()); - f.close(); - } - - for (unsigned x = 0; x < llvm::AMDILDwarf::DEBUG_LAST; ++x) { - dbgSec = compilation.getDebugData()->getDwarfBitstream( - static_cast(x), dbgSize); - // Do not create an elf section if the size of the section is - // 0. 
- if (!dbgSize) { - continue; - } - clBinary()->elfOut()->addSection( - static_cast(x - + amd::OclElf::DEBUG_INFO), dbgSec, dbgSize); - } - - } - - // Create kernel objects - if (!ilProgram_.empty() && parseKernels(ilProgram_)) { - // Loop through all possible kernels - for (size_t i = 0; i < funcs_.size(); ++i) { - ILFunc* baseFunc = funcs_[i]; - // Make sure we have a Kernel function, but not Intrinsic or Simple - if (baseFunc->state_ == ILFunc::Kernel) { - size_t metadataSize = - baseFunc->metadata_.end_ - baseFunc->metadata_.begin_; - std::string kernel = ilProgram_; - std::string metadataStr; - std::vector notCalled; - std::vector called; - std::map macros; - size_t j; - Kernel::InitData initData = {0}; - - // Fill the list of not used functions, relativly to the current - for (j = 0; j < funcs_.size(); ++j) { - if ((i != j) && - ((funcs_[j]->state_ == ILFunc::Regular) || - (funcs_[j]->state_ == ILFunc::Kernel))) { - if (!isCalled(baseFunc, funcs_[j])) { - notCalled.push_back(funcs_[j]); - } - else { - called.push_back(funcs_[j]); - } - } - } - - // Get the metadata string for the current kernel - metadataStr.insert(0, kernel, - baseFunc->metadata_.begin_, metadataSize); - - std::vector rangeList; - // Remove unused kernels, starting from the end - for (j = notCalled.size(); j > 0; --j) { - ILFunc* func = notCalled[j-1]; - std::vector::iterator it; - for (it = rangeList.begin(); it != rangeList.end(); ++it) { - if ((*it)->begin_ < func->metadata_.begin_) { - assert((*it)->begin_ < func->code_.begin_ - && "code and metadata not next to each other"); - break; - } - assert((*it)->begin_ >= func->code_.begin_ - && "code and metadata not next to each other"); - } - assert(func->metadata_.begin_ > func->code_.begin_ - && "code after metadata"); - if (it == rangeList.end()) { - rangeList.push_back(&func->metadata_); - rangeList.push_back(&func->code_); - } - else { - it = rangeList.insert(it, &func->code_); - rangeList.insert(it, &func->metadata_); - } - } - for (j = 
0; j < rangeList.size(); ++j) { - const ILFunc::SourceRange* range = rangeList[j]; - kernel.erase(range->begin_, range->end_ - range->begin_); - } - - // Patch the main program with a call to the current kernel - patchMain(kernel, baseFunc->index_); - - // Add macros at the top, loop through all available functions - // for this kernel - for (j = 0; j <= called.size(); ++j) { - ILFunc* func = (j < called.size()) ? called[j] : baseFunc; - for (size_t l = func->macros_.size(); l > 0 ; --l) { - int lines; - int idx = static_cast(func->macros_[l - 1]); - const char** macro = amd::MacroDBGetMacro(&lines, idx); - - // Make sure we didn't place this macro already - if (macros[idx] == NULL) { - macros[idx] = macro; - // Do we have a valid macro? - if ((lines == 0) || (macro == NULL)) { - buildLog_ += "Error: undefined macro!\n"; - LogPrintfError( - "Metadata reports undefined macro %d!", idx); - return false; - } - else { - // Add the macro to the IL source - for (int k = 0; k < lines; ++k) { - kernel.insert(0, macro[k], strlen(macro[k])); - } - } - } - } - // Accumulate all emulated local and private sizes, - // necessary for the kernel execution - initData.localSize_ += func->localSize_; - - // Accumulate all HW local and private sizes, - // necessary for the kernel execution - initData.hwLocalSize_ += func->hwLocalSize_; - initData.hwPrivateSize_ += func->hwPrivateSize_; - initData.flags_ |= func->flags_; - } - initData.privateSize_ = baseFunc->totalHwPrivateUsage(); - amdilUtils::changePrivateUAVLength(kernel, - initData.privateSize_); - - // Create a GPU kernel - bool created; - NullKernel* gpuKernel = createKernel(baseFunc->name_, - &initData, kernel.data(), metadataStr, &created); - if (!created) { - buildLog_ += "Error: Creating kernel " + - baseFunc->name_ + " failed!\n"; - LogError(buildLog_.c_str()); - return false; - } - - // Add the current kernel to the binary - if (!clBinary()->storeKernel(baseFunc->name_, gpuKernel, - &initData, metadataStr, kernel)) { - 
buildLog_ += "Internal error: adding a kernel into OpenCL binary failed!\n"; - return false; - } - } - else { - // Non-kernel function, save metadata symbols for recompilation - if (clBinary()->saveAMDIL()) { - size_t metadataSize = - baseFunc->metadata_.end_ - baseFunc->metadata_.begin_; - if (metadataSize <= 0) { - continue; - } - std::string metadataStr; - // Get the metadata string - metadataStr.insert(0, ilProgram_, baseFunc->metadata_.begin_, - metadataSize); - - std::stringstream aStream; - aStream << "__OpenCL_" << baseFunc->name_ << "_fmetadata"; - std::string metaName = aStream.str(); - // Save metadata symbols in .rodata - if (!clBinary()->elfOut()->addSymbol(amd::OclElf::RODATA, - metaName.c_str(), - metadataStr.data(), - metadataStr.size())) { - buildLog_ += "Internal error: addSymbol failed!\n"; - LogError ("AddSymbol failed"); - return false; - } - } - } - } - - setType(TYPE_EXECUTABLE); - if (!createBinary(options)) { - buildLog_ += "Intenral error: creating OpenCL binary failed\n"; - return false; - } - - // Destroy all ILFunc objects - freeAllILFuncs(); - ilProgram_.clear(); return true; + } else if (clBinary()->loadLlvmBinary(llvmBinary_, elfSectionType_) && + clBinary()->isRecompilable(llvmBinary_, amd::OclElf::CAL_PLATFORM)) { + char* section; + size_t sz; + + // Clean up and remove all the content generated before + if (!clBinary()->clearElfOut()) { + buildLog_ += "Internal error: Resetting OpenCL Binary failed!\n"; + LogError("Resetting output OCL binary failed"); + return false; + } + + if (clBinary()->saveSOURCE() && + clBinary()->elfIn()->getSection(amd::OclElf::SOURCE, §ion, &sz)) { + clBinary()->elfOut()->addSection(amd::OclElf::SOURCE, section, sz); + } + if (clBinary()->saveLLVMIR()) { + clBinary()->elfOut()->addSection(elfSectionType_, llvmBinary_.data(), llvmBinary_.size(), + false); + } + } else { + buildLog_ += "Internal error: Input OpenCL binary is not for the target!\n"; + LogError("OCL Binary isn't good for the target"); + 
return false; + } } - return false; + } + + if (!llvmBinary_.empty()) { + // Compile llvm binary to the IL source code + // This is link/OPT/Codegen part of compiler. + cl_int iErr = compileBinaryToIL(options); + if (iErr != CL_SUCCESS) { + buildLog_ += "Error: Compilation from LLVMIR binary to IL text failed!"; + LogError(buildLog_.c_str()); + return false; + } + } + + if (!ilProgram_.empty() && options->oVariables->EnableDebug) { + // Lets parse out the dwarf debug information and store it in the elf + llvm::CompUnit compilation(ilProgram_); + std::string debugILStr = compilation.getILStr(); + const char* dbgSec = debugILStr.c_str(); + size_t dbgSize = debugILStr.size(); + // Add an IL section that contains debug information and is the + // output of LLVM codegen. + clBinary()->elfOut()->addSection(amd::OclElf::ILDEBUG, dbgSec, dbgSize); + + if ((dbgSize > 0) && options->isDumpFlagSet(amd::option::DUMP_DEBUGIL)) { + std::string debugilWithLine; + size_t b = 1; + size_t e; + int linenum = 0; + char cstr[9]; + cstr[8] = 0; + while (b != std::string::npos) { + e = debugILStr.find_first_of("\n", b); + if (e != std::string::npos) { + ++e; + } + sprintf(&cstr[0], "%5x: ", linenum); + debugilWithLine.append(cstr); + debugilWithLine.append(debugILStr.substr(b, e - b)); + b = e; + ++linenum; + } + std::string debugilFileName = options->getDumpFileName(".debugil"); + std::fstream f; + f.open(debugilFileName.c_str(), (std::fstream::out | std::fstream::binary)); + f.write(debugilWithLine.c_str(), debugilWithLine.size()); + f.close(); + } + + for (unsigned x = 0; x < llvm::AMDILDwarf::DEBUG_LAST; ++x) { + dbgSec = compilation.getDebugData()->getDwarfBitstream( + static_cast(x), dbgSize); + // Do not create an elf section if the size of the section is + // 0. 
+ if (!dbgSize) { + continue; + } + clBinary()->elfOut()->addSection( + static_cast(x + amd::OclElf::DEBUG_INFO), dbgSec, dbgSize); + } + + } + + // Create kernel objects + if (!ilProgram_.empty() && parseKernels(ilProgram_)) { + // Loop through all possible kernels + for (size_t i = 0; i < funcs_.size(); ++i) { + ILFunc* baseFunc = funcs_[i]; + // Make sure we have a Kernel function, but not Intrinsic or Simple + if (baseFunc->state_ == ILFunc::Kernel) { + size_t metadataSize = baseFunc->metadata_.end_ - baseFunc->metadata_.begin_; + std::string kernel = ilProgram_; + std::string metadataStr; + std::vector notCalled; + std::vector called; + std::map macros; + size_t j; + Kernel::InitData initData = {0}; + + // Fill the list of not used functions, relativly to the current + for (j = 0; j < funcs_.size(); ++j) { + if ((i != j) && + ((funcs_[j]->state_ == ILFunc::Regular) || (funcs_[j]->state_ == ILFunc::Kernel))) { + if (!isCalled(baseFunc, funcs_[j])) { + notCalled.push_back(funcs_[j]); + } else { + called.push_back(funcs_[j]); + } + } + } + + // Get the metadata string for the current kernel + metadataStr.insert(0, kernel, baseFunc->metadata_.begin_, metadataSize); + + std::vector rangeList; + // Remove unused kernels, starting from the end + for (j = notCalled.size(); j > 0; --j) { + ILFunc* func = notCalled[j - 1]; + std::vector::iterator it; + for (it = rangeList.begin(); it != rangeList.end(); ++it) { + if ((*it)->begin_ < func->metadata_.begin_) { + assert((*it)->begin_ < func->code_.begin_ && + "code and metadata not next to each other"); + break; + } + assert((*it)->begin_ >= func->code_.begin_ && + "code and metadata not next to each other"); + } + assert(func->metadata_.begin_ > func->code_.begin_ && "code after metadata"); + if (it == rangeList.end()) { + rangeList.push_back(&func->metadata_); + rangeList.push_back(&func->code_); + } else { + it = rangeList.insert(it, &func->code_); + rangeList.insert(it, &func->metadata_); + } + } + for (j = 0; j < 
rangeList.size(); ++j) { + const ILFunc::SourceRange* range = rangeList[j]; + kernel.erase(range->begin_, range->end_ - range->begin_); + } + + // Patch the main program with a call to the current kernel + patchMain(kernel, baseFunc->index_); + + // Add macros at the top, loop through all available functions + // for this kernel + for (j = 0; j <= called.size(); ++j) { + ILFunc* func = (j < called.size()) ? called[j] : baseFunc; + for (size_t l = func->macros_.size(); l > 0; --l) { + int lines; + int idx = static_cast(func->macros_[l - 1]); + const char** macro = amd::MacroDBGetMacro(&lines, idx); + + // Make sure we didn't place this macro already + if (macros[idx] == NULL) { + macros[idx] = macro; + // Do we have a valid macro? + if ((lines == 0) || (macro == NULL)) { + buildLog_ += "Error: undefined macro!\n"; + LogPrintfError("Metadata reports undefined macro %d!", idx); + return false; + } else { + // Add the macro to the IL source + for (int k = 0; k < lines; ++k) { + kernel.insert(0, macro[k], strlen(macro[k])); + } + } + } + } + // Accumulate all emulated local and private sizes, + // necessary for the kernel execution + initData.localSize_ += func->localSize_; + + // Accumulate all HW local and private sizes, + // necessary for the kernel execution + initData.hwLocalSize_ += func->hwLocalSize_; + initData.hwPrivateSize_ += func->hwPrivateSize_; + initData.flags_ |= func->flags_; + } + initData.privateSize_ = baseFunc->totalHwPrivateUsage(); + amdilUtils::changePrivateUAVLength(kernel, initData.privateSize_); + + // Create a GPU kernel + bool created; + NullKernel* gpuKernel = + createKernel(baseFunc->name_, &initData, kernel.data(), metadataStr, &created); + if (!created) { + buildLog_ += "Error: Creating kernel " + baseFunc->name_ + " failed!\n"; + LogError(buildLog_.c_str()); + return false; + } + + // Add the current kernel to the binary + if (!clBinary()->storeKernel(baseFunc->name_, gpuKernel, &initData, metadataStr, kernel)) { + buildLog_ += 
"Internal error: adding a kernel into OpenCL binary failed!\n"; + return false; + } + } else { + // Non-kernel function, save metadata symbols for recompilation + if (clBinary()->saveAMDIL()) { + size_t metadataSize = baseFunc->metadata_.end_ - baseFunc->metadata_.begin_; + if (metadataSize <= 0) { + continue; + } + std::string metadataStr; + // Get the metadata string + metadataStr.insert(0, ilProgram_, baseFunc->metadata_.begin_, metadataSize); + + std::stringstream aStream; + aStream << "__OpenCL_" << baseFunc->name_ << "_fmetadata"; + std::string metaName = aStream.str(); + // Save metadata symbols in .rodata + if (!clBinary()->elfOut()->addSymbol(amd::OclElf::RODATA, metaName.c_str(), + metadataStr.data(), metadataStr.size())) { + buildLog_ += "Internal error: addSymbol failed!\n"; + LogError("AddSymbol failed"); + return false; + } + } + } + } + + setType(TYPE_EXECUTABLE); + if (!createBinary(options)) { + buildLog_ += "Intenral error: creating OpenCL binary failed\n"; + return false; + } + + // Destroy all ILFunc objects + freeAllILFuncs(); + ilProgram_.clear(); + return true; + } + return false; } -bool -NullProgram::linkImpl(const std::vector& inputPrograms, - amd::option::Options* options, - bool createLibrary) -{ - std::vector llvmBinaries(inputPrograms.size()); - std::vector elfSectionType(inputPrograms.size()); - std::vector::const_iterator it - = inputPrograms.begin(); - std::vector::const_iterator itEnd - = inputPrograms.end(); - for (size_t i = 0; it != itEnd; ++it, ++i) { - NullProgram* program = (NullProgram*)*it; +bool NullProgram::linkImpl(const std::vector& inputPrograms, + amd::option::Options* options, bool createLibrary) { + std::vector llvmBinaries(inputPrograms.size()); + std::vector elfSectionType(inputPrograms.size()); + std::vector::const_iterator it = inputPrograms.begin(); + std::vector::const_iterator itEnd = inputPrograms.end(); + for (size_t i = 0; it != itEnd; ++it, ++i) { + NullProgram* program = (NullProgram*)*it; - if 
(program->llvmBinary_.empty()) { - if (program->clBinary() == NULL) { - buildLog_ += "Internal error: Input program not compiled!\n"; - LogError("Loading compiled input object failed"); - return false; - } + if (program->llvmBinary_.empty()) { + if (program->clBinary() == NULL) { + buildLog_ += "Internal error: Input program not compiled!\n"; + LogError("Loading compiled input object failed"); + return false; + } - // We are using CL binary directly. - // Setup elfIn() and try to load llvmIR from binary - // This elfIn() will be released at the end of build by finiBuild(). - if (!program->clBinary()->setElfIn(ELFCLASS32)) { - buildLog_ += "Internal error: Setting input OCL binary failed!\n"; - LogError("Setting input OCL binary failed"); - return false; - } - if (!program->clBinary()->loadLlvmBinary(program->llvmBinary_, - program->elfSectionType_)) { - buildLog_ - += "Internal error: Failed loading compiled binary!\n"; - LogError("Bad OCL Binary"); - return false; - } + // We are using CL binary directly. + // Setup elfIn() and try to load llvmIR from binary + // This elfIn() will be released at the end of build by finiBuild(). 
+ if (!program->clBinary()->setElfIn(ELFCLASS32)) { + buildLog_ += "Internal error: Setting input OCL binary failed!\n"; + LogError("Setting input OCL binary failed"); + return false; + } + if (!program->clBinary()->loadLlvmBinary(program->llvmBinary_, program->elfSectionType_)) { + buildLog_ += "Internal error: Failed loading compiled binary!\n"; + LogError("Bad OCL Binary"); + return false; + } - if (!program->clBinary()->isRecompilable(program->llvmBinary_, - amd::OclElf::CAL_PLATFORM)) { - buildLog_ += "Internal error: Input OpenCL binary is not" - " for the target!\n"; - LogError("OCL Binary isn't good for the target"); - return false; - } + if (!program->clBinary()->isRecompilable(program->llvmBinary_, amd::OclElf::CAL_PLATFORM)) { + buildLog_ += + "Internal error: Input OpenCL binary is not" + " for the target!\n"; + LogError("OCL Binary isn't good for the target"); + return false; + } #if 0 // TODO: copy .source over to output program char *section; @@ -558,1526 +507,1428 @@ NullProgram::linkImpl(const std::vector& inputPrograms, clBinary()->elfOut()->addSection(amd::OclElf::SOURCE, section, sz); } #endif - } - - llvmBinaries[i] = &program->llvmBinary_; - elfSectionType[i] = program->elfSectionType_; } - acl_error err; - aclTargetInfo aclinfo = info(); - aclBinaryOptions binOpts = {0}; - binOpts.struct_size = sizeof(binOpts); - binOpts.elfclass = aclinfo.arch_id == aclAMDIL64 ? 
ELFCLASS64 : ELFCLASS32; - binOpts.bitness = ELFDATA2LSB; - binOpts.alloc = &::malloc; - binOpts.dealloc = &::free; + llvmBinaries[i] = &program->llvmBinary_; + elfSectionType[i] = program->elfSectionType_; + } - std::vector libs(llvmBinaries.size(), NULL); - for (size_t i = 0; i < libs.size(); ++i) { - libs[i] = aclBinaryInit(sizeof(aclBinary), &aclinfo, &binOpts, &err); - if (err != ACL_SUCCESS) { - LogWarning("aclBinaryInit failed"); - break; - } - - _bif_sections_enum_0_8 aclTypeUsed; - if (elfSectionType[i] == amd::OclElf::SPIRV) { - aclTypeUsed = aclSPIRV; - } else if (elfSectionType[i] == amd::OclElf::SPIR) { - aclTypeUsed = aclSPIR; - } else { - aclTypeUsed = aclLLVMIR; - } - err = aclInsertSection(dev().compiler(), libs[i], - llvmBinaries[i]->data(), llvmBinaries[i]->size(), aclTypeUsed); - if (err != ACL_SUCCESS) { - LogWarning("aclInsertSection failed"); - break; - } - - // temporary solution to synchronize buildNo between runtime and complib - // until we move runtime inside complib - ((amd::option::Options*)libs[i]->options)->setBuildNo( - options->getBuildNo()); - } - - - if (libs.size() > 0 && err == ACL_SUCCESS) do { - unsigned int numLibs = libs.size() - 1; - - if (numLibs > 0) { - err = aclLink(dev().compiler(), libs[0], numLibs, &libs[1], - ACL_TYPE_LLVMIR_BINARY, "-create-library", NULL); - - buildLog_ += aclGetCompilerLog(dev().compiler()); - - if (err != ACL_SUCCESS) { - LogWarning("aclLink failed"); - break; - } - } - - size_t size = 0; - _bif_sections_enum_0_8 aclTypeUsed; - if (elfSectionType[0] == amd::OclElf::SPIRV && numLibs == 0) { - aclTypeUsed = aclSPIRV; - } else if (elfSectionType[0] == amd::OclElf::SPIR && numLibs == 0) { - aclTypeUsed = aclSPIR; - } else { - aclTypeUsed = aclLLVMIR; - } - const void* llvmir = aclExtractSection(dev().compiler(), libs[0], - &size, aclTypeUsed, &err); - if (err != ACL_SUCCESS) { - LogWarning("aclExtractSection failed"); - break; - } - - llvmBinary_.assign(reinterpret_cast(llvmir), size); - 
elfSectionType_ = amd::OclElf::LLVMIR; - } while(0); - - std::for_each(libs.begin(), libs.end(), std::ptr_fun(aclBinaryFini)); + acl_error err; + aclTargetInfo aclinfo = info(); + aclBinaryOptions binOpts = {0}; + binOpts.struct_size = sizeof(binOpts); + binOpts.elfclass = aclinfo.arch_id == aclAMDIL64 ? ELFCLASS64 : ELFCLASS32; + binOpts.bitness = ELFDATA2LSB; + binOpts.alloc = &::malloc; + binOpts.dealloc = &::free; + std::vector libs(llvmBinaries.size(), NULL); + for (size_t i = 0; i < libs.size(); ++i) { + libs[i] = aclBinaryInit(sizeof(aclBinary), &aclinfo, &binOpts, &err); if (err != ACL_SUCCESS) { - buildLog_ += "Error: linking llvm modules failed!"; - return false; + LogWarning("aclBinaryInit failed"); + break; } - if (clBinary()->saveLLVMIR()) { - clBinary()->elfOut()->addSection(amd::OclElf::LLVMIR, - llvmBinary_.data(), llvmBinary_.size(), - false); - // store the original link options - clBinary()->storeLinkOptions(linkOptions_); - - clBinary()->storeCompileOptions(compileOptions_); + _bif_sections_enum_0_8 aclTypeUsed; + if (elfSectionType[i] == amd::OclElf::SPIRV) { + aclTypeUsed = aclSPIRV; + } else if (elfSectionType[i] == amd::OclElf::SPIR) { + aclTypeUsed = aclSPIR; + } else { + aclTypeUsed = aclLLVMIR; + } + err = aclInsertSection(dev().compiler(), libs[i], llvmBinaries[i]->data(), + llvmBinaries[i]->size(), aclTypeUsed); + if (err != ACL_SUCCESS) { + LogWarning("aclInsertSection failed"); + break; } - // skip the rest if we are building an opencl library - if (createLibrary) { - setType(TYPE_LIBRARY); - if (!createBinary(options)) { - buildLog_ += "Intenral error: creating OpenCL binary failed\n"; - return false; + // temporary solution to synchronize buildNo between runtime and complib + // until we move runtime inside complib + ((amd::option::Options*)libs[i]->options)->setBuildNo(options->getBuildNo()); + } + + + if (libs.size() > 0 && err == ACL_SUCCESS) do { + unsigned int numLibs = libs.size() - 1; + + if (numLibs > 0) { + err = 
aclLink(dev().compiler(), libs[0], numLibs, &libs[1], ACL_TYPE_LLVMIR_BINARY, + "-create-library", NULL); + + buildLog_ += aclGetCompilerLog(dev().compiler()); + + if (err != ACL_SUCCESS) { + LogWarning("aclLink failed"); + break; } + } - return true; - } + size_t size = 0; + _bif_sections_enum_0_8 aclTypeUsed; + if (elfSectionType[0] == amd::OclElf::SPIRV && numLibs == 0) { + aclTypeUsed = aclSPIRV; + } else if (elfSectionType[0] == amd::OclElf::SPIR && numLibs == 0) { + aclTypeUsed = aclSPIR; + } else { + aclTypeUsed = aclLLVMIR; + } + const void* llvmir = aclExtractSection(dev().compiler(), libs[0], &size, aclTypeUsed, &err); + if (err != ACL_SUCCESS) { + LogWarning("aclExtractSection failed"); + break; + } - // Compile llvm binary to the IL source code - // This is link/OPT/Codegen part of compiler. - cl_int iErr = compileBinaryToIL(options); - if (iErr != CL_SUCCESS) { - buildLog_ += "Error: Compilation from LLVMIR binary to IL text failed!"; - LogError(buildLog_.c_str()); - return false; - } + llvmBinary_.assign(reinterpret_cast(llvmir), size); + elfSectionType_ = amd::OclElf::LLVMIR; + } while (0); - if (!ilProgram_.empty() && options->oVariables->EnableDebug) { - // Lets parse out the dwarf debug information and store it in the elf - llvm::CompUnit compilation(ilProgram_); - std::string debugILStr = compilation.getILStr(); - const char* dbgSec = debugILStr.c_str(); - size_t dbgSize = debugILStr.size(); - // Add an IL section that contains debug information and is the - // output of LLVM codegen. 
- clBinary()->elfOut()->addSection(amd::OclElf::ILDEBUG, dbgSec, dbgSize); + std::for_each(libs.begin(), libs.end(), std::ptr_fun(aclBinaryFini)); - if ((dbgSize > 0) && options->isDumpFlagSet(amd::option::DUMP_DEBUGIL)) { - std::string debugilWithLine; - size_t b = 1; - size_t e; - int linenum=0; - char cstr[9]; - cstr[8] = 0; - while (b != std::string::npos) { - e = debugILStr.find_first_of("\n", b); - if (e != std::string::npos) { - ++e; - } - sprintf(&cstr[0], "%5x: ", linenum); - debugilWithLine.append(cstr); - debugilWithLine.append(debugILStr.substr(b,e-b)); - b = e; - ++linenum; - } - std::string debugilFileName = options->getDumpFileName(".debugil"); - std::fstream f; - f.open(debugilFileName.c_str(), (std::fstream::out | std::fstream::binary)); - f.write(debugilWithLine.c_str(), debugilWithLine.size()); - f.close(); - } - - for (unsigned x = 0; x < llvm::AMDILDwarf::DEBUG_LAST; ++x) { - dbgSec = compilation.getDebugData()->getDwarfBitstream( - static_cast(x), dbgSize); - // Do not create an elf section if the size of the section is - // 0. 
- if (!dbgSize) { - continue; - } - clBinary()->elfOut()->addSection( - static_cast(x - + amd::OclElf::DEBUG_INFO), dbgSec, dbgSize); - } - - } - - // Create kernel objects - if (!ilProgram_.empty() && parseKernels(ilProgram_)) { - // Loop through all possible kernels - for (size_t i = 0; i < funcs_.size(); ++i) { - ILFunc* baseFunc = funcs_[i]; - // Make sure we have a Kernel function, but not Intrinsic or Simple - if (baseFunc->state_ == ILFunc::Kernel) { - size_t metadataSize = - baseFunc->metadata_.end_ - baseFunc->metadata_.begin_; - std::string kernel = ilProgram_; - std::string metadataStr; - std::vector notCalled; - std::vector called; - std::map macros; - size_t j; - Kernel::InitData initData = {0}; - - // Fill the list of not used functions, relativly to the current - for (j = 0; j < funcs_.size(); ++j) { - if ((i != j) && - ((funcs_[j]->state_ == ILFunc::Regular) || - (funcs_[j]->state_ == ILFunc::Kernel))) { - if (!isCalled(baseFunc, funcs_[j])) { - notCalled.push_back(funcs_[j]); - } - else { - called.push_back(funcs_[j]); - } - } - } - - // Get the metadata string for the current kernel - metadataStr.insert(0, kernel, - baseFunc->metadata_.begin_, metadataSize); - - std::vector rangeList; - // Remove unused kernels, starting from the end - for (j = notCalled.size(); j > 0; --j) { - ILFunc* func = notCalled[j-1]; - std::vector::iterator it; - for (it = rangeList.begin(); it != rangeList.end(); ++it) { - if ((*it)->begin_ < func->metadata_.begin_) { - assert((*it)->begin_ < func->code_.begin_ - && "code and metadata not next to each other"); - break; - } - assert((*it)->begin_ >= func->code_.begin_ - && "code and metadata not next to each other"); - } - assert(func->metadata_.begin_ > func->code_.begin_ - && "code after metadata"); - if (it == rangeList.end()) { - rangeList.push_back(&func->metadata_); - rangeList.push_back(&func->code_); - } - else { - it = rangeList.insert(it, &func->code_); - rangeList.insert(it, &func->metadata_); - } - } - for (j = 
0; j < rangeList.size(); ++j) { - const ILFunc::SourceRange* range = rangeList[j]; - kernel.erase(range->begin_, range->end_ - range->begin_); - } - - // Patch the main program with a call to the current kernel - patchMain(kernel, baseFunc->index_); - - // Add macros at the top, loop through all available functions - // for this kernel - for (j = 0; j <= called.size(); ++j) { - ILFunc* func = (j < called.size()) ? called[j] : baseFunc; - for (size_t l = func->macros_.size(); l > 0 ; --l) { - int lines; - int idx = static_cast(func->macros_[l - 1]); - const char** macro = amd::MacroDBGetMacro(&lines, idx); - - // Make sure we didn't place this macro already - if (macros[idx] == NULL) { - macros[idx] = macro; - // Do we have a valid macro? - if ((lines == 0) || (macro == NULL)) { - buildLog_ += "Error: undefined macro!\n"; - LogPrintfError( - "Metadata reports undefined macro %d!", idx); - return false; - } - else { - // Add the macro to the IL source - for (int k = 0; k < lines; ++k) { - kernel.insert(0, macro[k], strlen(macro[k])); - } - } - } - } - // Accumulate all emulated local and private sizes, - // necessary for the kernel execution - initData.localSize_ += func->localSize_; - - // Accumulate all HW local and private sizes, - // necessary for the kernel execution - initData.hwLocalSize_ += func->hwLocalSize_; - initData.hwPrivateSize_ += func->hwPrivateSize_; - initData.flags_ |= func->flags_; - } - initData.privateSize_ = baseFunc->totalHwPrivateUsage(); - amdilUtils::changePrivateUAVLength(kernel, - initData.privateSize_); - - // Create a GPU kernel - bool created; - NullKernel* gpuKernel = createKernel(baseFunc->name_, - &initData, kernel.data(), metadataStr, &created); - if (!created) { - buildLog_ += "Error: Creating kernel " + - baseFunc->name_ + " failed!\n"; - LogError(buildLog_.c_str()); - return false; - } - - // Add the current kernel to the binary - if (!clBinary()->storeKernel(baseFunc->name_, gpuKernel, - &initData, metadataStr, kernel)) { - 
buildLog_ += "Internal error: adding a kernel into OpenCL binary failed!\n"; - return false; - } - } - else { - // Non-kernel function, save metadata symbols for recompilation - if (clBinary()->saveAMDIL()) { - size_t metadataSize = - baseFunc->metadata_.end_ - baseFunc->metadata_.begin_; - if (metadataSize <= 0) { - continue; - } - std::string metadataStr; - // Get the metadata string - metadataStr.insert(0, ilProgram_, baseFunc->metadata_.begin_, - metadataSize); - - std::stringstream aStream; - aStream << "__OpenCL_" << baseFunc->name_ << "_fmetadata"; - std::string metaName = aStream.str(); - // Save metadata symbols in .rodata - if (!clBinary()->elfOut()->addSymbol(amd::OclElf::RODATA, - metaName.c_str(), - metadataStr.data(), - metadataStr.size())) { - buildLog_ += "Internal error: addSymbol failed!\n"; - LogError ("AddSymbol failed"); - return false; - } - } - } - } - - setType(TYPE_EXECUTABLE); - if (!createBinary(options)) { - buildLog_ += "Intenral error: creating OpenCL binary failed\n"; - return false; - } - - // Destroy all ILFunc objects - freeAllILFuncs(); - ilProgram_.clear(); - return true; - } + if (err != ACL_SUCCESS) { + buildLog_ += "Error: linking llvm modules failed!"; return false; + } + + if (clBinary()->saveLLVMIR()) { + clBinary()->elfOut()->addSection(amd::OclElf::LLVMIR, llvmBinary_.data(), llvmBinary_.size(), + false); + // store the original link options + clBinary()->storeLinkOptions(linkOptions_); + + clBinary()->storeCompileOptions(compileOptions_); + } + + // skip the rest if we are building an opencl library + if (createLibrary) { + setType(TYPE_LIBRARY); + if (!createBinary(options)) { + buildLog_ += "Intenral error: creating OpenCL binary failed\n"; + return false; + } + + return true; + } + + // Compile llvm binary to the IL source code + // This is link/OPT/Codegen part of compiler. 
+ cl_int iErr = compileBinaryToIL(options); + if (iErr != CL_SUCCESS) { + buildLog_ += "Error: Compilation from LLVMIR binary to IL text failed!"; + LogError(buildLog_.c_str()); + return false; + } + + if (!ilProgram_.empty() && options->oVariables->EnableDebug) { + // Lets parse out the dwarf debug information and store it in the elf + llvm::CompUnit compilation(ilProgram_); + std::string debugILStr = compilation.getILStr(); + const char* dbgSec = debugILStr.c_str(); + size_t dbgSize = debugILStr.size(); + // Add an IL section that contains debug information and is the + // output of LLVM codegen. + clBinary()->elfOut()->addSection(amd::OclElf::ILDEBUG, dbgSec, dbgSize); + + if ((dbgSize > 0) && options->isDumpFlagSet(amd::option::DUMP_DEBUGIL)) { + std::string debugilWithLine; + size_t b = 1; + size_t e; + int linenum = 0; + char cstr[9]; + cstr[8] = 0; + while (b != std::string::npos) { + e = debugILStr.find_first_of("\n", b); + if (e != std::string::npos) { + ++e; + } + sprintf(&cstr[0], "%5x: ", linenum); + debugilWithLine.append(cstr); + debugilWithLine.append(debugILStr.substr(b, e - b)); + b = e; + ++linenum; + } + std::string debugilFileName = options->getDumpFileName(".debugil"); + std::fstream f; + f.open(debugilFileName.c_str(), (std::fstream::out | std::fstream::binary)); + f.write(debugilWithLine.c_str(), debugilWithLine.size()); + f.close(); + } + + for (unsigned x = 0; x < llvm::AMDILDwarf::DEBUG_LAST; ++x) { + dbgSec = compilation.getDebugData()->getDwarfBitstream( + static_cast(x), dbgSize); + // Do not create an elf section if the size of the section is + // 0. 
+ if (!dbgSize) { + continue; + } + clBinary()->elfOut()->addSection( + static_cast(x + amd::OclElf::DEBUG_INFO), dbgSec, dbgSize); + } + + } + + // Create kernel objects + if (!ilProgram_.empty() && parseKernels(ilProgram_)) { + // Loop through all possible kernels + for (size_t i = 0; i < funcs_.size(); ++i) { + ILFunc* baseFunc = funcs_[i]; + // Make sure we have a Kernel function, but not Intrinsic or Simple + if (baseFunc->state_ == ILFunc::Kernel) { + size_t metadataSize = baseFunc->metadata_.end_ - baseFunc->metadata_.begin_; + std::string kernel = ilProgram_; + std::string metadataStr; + std::vector notCalled; + std::vector called; + std::map macros; + size_t j; + Kernel::InitData initData = {0}; + + // Fill the list of not used functions, relativly to the current + for (j = 0; j < funcs_.size(); ++j) { + if ((i != j) && + ((funcs_[j]->state_ == ILFunc::Regular) || (funcs_[j]->state_ == ILFunc::Kernel))) { + if (!isCalled(baseFunc, funcs_[j])) { + notCalled.push_back(funcs_[j]); + } else { + called.push_back(funcs_[j]); + } + } + } + + // Get the metadata string for the current kernel + metadataStr.insert(0, kernel, baseFunc->metadata_.begin_, metadataSize); + + std::vector rangeList; + // Remove unused kernels, starting from the end + for (j = notCalled.size(); j > 0; --j) { + ILFunc* func = notCalled[j - 1]; + std::vector::iterator it; + for (it = rangeList.begin(); it != rangeList.end(); ++it) { + if ((*it)->begin_ < func->metadata_.begin_) { + assert((*it)->begin_ < func->code_.begin_ && + "code and metadata not next to each other"); + break; + } + assert((*it)->begin_ >= func->code_.begin_ && + "code and metadata not next to each other"); + } + assert(func->metadata_.begin_ > func->code_.begin_ && "code after metadata"); + if (it == rangeList.end()) { + rangeList.push_back(&func->metadata_); + rangeList.push_back(&func->code_); + } else { + it = rangeList.insert(it, &func->code_); + rangeList.insert(it, &func->metadata_); + } + } + for (j = 0; j < 
rangeList.size(); ++j) { + const ILFunc::SourceRange* range = rangeList[j]; + kernel.erase(range->begin_, range->end_ - range->begin_); + } + + // Patch the main program with a call to the current kernel + patchMain(kernel, baseFunc->index_); + + // Add macros at the top, loop through all available functions + // for this kernel + for (j = 0; j <= called.size(); ++j) { + ILFunc* func = (j < called.size()) ? called[j] : baseFunc; + for (size_t l = func->macros_.size(); l > 0; --l) { + int lines; + int idx = static_cast(func->macros_[l - 1]); + const char** macro = amd::MacroDBGetMacro(&lines, idx); + + // Make sure we didn't place this macro already + if (macros[idx] == NULL) { + macros[idx] = macro; + // Do we have a valid macro? + if ((lines == 0) || (macro == NULL)) { + buildLog_ += "Error: undefined macro!\n"; + LogPrintfError("Metadata reports undefined macro %d!", idx); + return false; + } else { + // Add the macro to the IL source + for (int k = 0; k < lines; ++k) { + kernel.insert(0, macro[k], strlen(macro[k])); + } + } + } + } + // Accumulate all emulated local and private sizes, + // necessary for the kernel execution + initData.localSize_ += func->localSize_; + + // Accumulate all HW local and private sizes, + // necessary for the kernel execution + initData.hwLocalSize_ += func->hwLocalSize_; + initData.hwPrivateSize_ += func->hwPrivateSize_; + initData.flags_ |= func->flags_; + } + initData.privateSize_ = baseFunc->totalHwPrivateUsage(); + amdilUtils::changePrivateUAVLength(kernel, initData.privateSize_); + + // Create a GPU kernel + bool created; + NullKernel* gpuKernel = + createKernel(baseFunc->name_, &initData, kernel.data(), metadataStr, &created); + if (!created) { + buildLog_ += "Error: Creating kernel " + baseFunc->name_ + " failed!\n"; + LogError(buildLog_.c_str()); + return false; + } + + // Add the current kernel to the binary + if (!clBinary()->storeKernel(baseFunc->name_, gpuKernel, &initData, metadataStr, kernel)) { + buildLog_ += 
"Internal error: adding a kernel into OpenCL binary failed!\n"; + return false; + } + } else { + // Non-kernel function, save metadata symbols for recompilation + if (clBinary()->saveAMDIL()) { + size_t metadataSize = baseFunc->metadata_.end_ - baseFunc->metadata_.begin_; + if (metadataSize <= 0) { + continue; + } + std::string metadataStr; + // Get the metadata string + metadataStr.insert(0, ilProgram_, baseFunc->metadata_.begin_, metadataSize); + + std::stringstream aStream; + aStream << "__OpenCL_" << baseFunc->name_ << "_fmetadata"; + std::string metaName = aStream.str(); + // Save metadata symbols in .rodata + if (!clBinary()->elfOut()->addSymbol(amd::OclElf::RODATA, metaName.c_str(), + metadataStr.data(), metadataStr.size())) { + buildLog_ += "Internal error: addSymbol failed!\n"; + LogError("AddSymbol failed"); + return false; + } + } + } + } + + setType(TYPE_EXECUTABLE); + if (!createBinary(options)) { + buildLog_ += "Intenral error: creating OpenCL binary failed\n"; + return false; + } + + // Destroy all ILFunc objects + freeAllILFuncs(); + ilProgram_.clear(); + return true; + } + return false; } -bool -NullProgram::initClBinary() -{ +bool NullProgram::initClBinary() { + if (clBinary_ == NULL) { + clBinary_ = new ClBinary(static_cast(device())); if (clBinary_ == NULL) { - clBinary_ = new ClBinary(static_cast(device())); - if (clBinary_ == NULL) { - return false; - } + return false; } - return true; + } + return true; } -void -NullProgram::releaseClBinary() -{ - if (clBinary_ != NULL) { - delete clBinary_; - clBinary_ = NULL; - } +void NullProgram::releaseClBinary() { + if (clBinary_ != NULL) { + delete clBinary_; + clBinary_ = NULL; + } } -bool -NullProgram::loadBinary(bool* hasRecompiled) -{ - if (!clBinary()->loadKernels(*this, hasRecompiled)) { - clear(); +bool NullProgram::loadBinary(bool* hasRecompiled) { + if (!clBinary()->loadKernels(*this, hasRecompiled)) { + clear(); + return false; + } + return true; +} + +bool NullProgram::initGlobalData(const 
std::string& source, size_t start) { + size_t pos, dataStart; + + // Find the global data store + dataStart = source.find(";#DATASTART", start); + if (dataStart != std::string::npos) { + uint index = 0; + pos = dataStart + 2; + while (expect(source, &pos, "DATASTART:")) { + uint dataSize = 0; + uint offset; + uint numElements; + size_t posStart; + bool failed = false; + + // Kernel has the global constants + if (!getuint(source, &pos, &index)) { return false; - } - return true; -} - -bool -NullProgram::initGlobalData(const std::string& source, size_t start) -{ - size_t pos, dataStart; - - // Find the global data store - dataStart= source.find(";#DATASTART", start); - if (dataStart!= std::string::npos) { - uint index = 0; - pos = dataStart + 2; - while (expect(source, &pos, "DATASTART:")) { - uint dataSize = 0; - uint offset; - uint numElements; - size_t posStart; - bool failed = false; - - // Kernel has the global constants - if (!getuint(source, &pos, &index)) { - return false; - } - pos--; - if (expect(source, &pos, ":")) { - // Read the size - if (!getuint(source, &pos, &dataSize)) { - return false; - } - } - else { - // Emulated global data store - pos++; - dataSize = index; - index = 0; - } - - if (dataSize == 0) { - return false; - } - - posStart = pos = source.find_first_not_of(";# \n\r", pos); - - char* globalData = new char[dataSize]; - if (globalData == NULL) { - return false; - } - - // Find the global data size - while (!expect(source, &pos, "DATAEND")) { - for (uint i = 0; i < DataTypeTotal; ++i) { - if (expect(source, &pos, DataType[i].tagName_)) { - // Read the offset - if (!getuint(source, &pos, &offset)) { - return false; - } - if (!getuint(source, &pos, &numElements)) { - return false; - } - for (uint j = 0; j < numElements; ++j) { - switch (DataType[i].type_) { - case KernelArg::Float: { - uint32_t* tmp = reinterpret_cast(globalData + offset); - if (!getuintHex(source, &pos, &tmp[j])) { - failed = true; - } - } - break; - case KernelArg::Double: 
{ - uint64_t* tmp = reinterpret_cast(globalData + offset); - if (!getuint64Hex(source, &pos, &tmp[j])) { - failed = true; - } - } - break; - case KernelArg::Struct: - case KernelArg::Union: - // Struct and Union should be presented as bytes - // Fall through... - case KernelArg::Char: { - uint8_t* tmp = reinterpret_cast(globalData + offset); - uint value; - if (!getuintHex(source, &pos, &value)) { - failed = true; - } - tmp[j] = static_cast(value); - } - break; - case KernelArg::Short: { - uint16_t* tmp = reinterpret_cast(globalData + offset); - uint value; - if (!getuintHex(source, &pos, &value)) { - failed = true; - } - tmp[j] = static_cast(value); - } - break; - case KernelArg::Int: - case KernelArg::UInt: { - uint32_t* tmp = reinterpret_cast(globalData + offset); - if (!getuintHex(source, &pos, &tmp[j])) { - failed = true; - } - } - break; - case KernelArg::Long: - case KernelArg::ULong: { - uint64_t* tmp = reinterpret_cast(globalData + offset); - if (!getuint64Hex(source, &pos, &tmp[j])) { - failed = true; - } - } - break; - case KernelArg::None: - default: - break; - } - if (failed) { - delete [] globalData; - return false; - } - } - break; - } - } - if (posStart == pos) { - delete [] globalData; - return false; - } - posStart = pos = source.find_first_not_of(";# \n\r", pos); - } - - if (!allocGlobalData(globalData, dataSize, index)) { - failed = true; - } - - if (!clBinary()->storeGlobalData(globalData, dataSize, index)) { - failed = true; - } - - delete [] globalData; - - // Erase the global store information - if (index != 0) { - if (expect(source, &pos, ":")) { - // Read the size - if (!getuint(source, &pos, &index)) { - return false; - } - } - } - pos = source.find_first_not_of(";# \n\r", pos); - (const_cast(source)).erase(dataStart, pos - dataStart); - pos = dataStart; - if (failed) { - return false; - } + } + pos--; + if (expect(source, &pos, ":")) { + // Read the size + if (!getuint(source, &pos, &dataSize)) { + return false; } - } + } else { + // 
Emulated global data store + pos++; + dataSize = index; + index = 0; + } - return true; -} - -bool -NullProgram::findILFuncs(const std::string& source, - const std::string &func_start, - const std::string &func_end, - size_t& lastFuncPos) -{ - lastFuncPos = 0; - - // Find first tag - size_t pos = source.find(func_start); - - // Loop through all provided program arguments - while (pos != std::string::npos) { - std::string funcName; - ILFunc func; - - func.code_.begin_ = pos; - if (!expect(source, &pos, func_start)) { - break; - } - - pos = source.find_first_not_of(" \n\r", pos); - // Read the function index - if (!getuint(source, &pos, &func.index_)) { - LogError("Error reading function index"); - return false; - } - - pos = source.find_first_of(";\n\r", pos); - if (source[pos] == '\r' || source[pos] == '\n') { - // this is the dummy macro - func.name_ = std::string(""); - } - else { - pos = source.find_first_not_of("; \n\r", pos); - // Read the function's name - if (!getword(source, &pos, funcName)) { - LogError("Error reading function name"); - return false; - } - func.name_ = funcName; - } - - // Find the function end - pos = source.find(func_end, pos); - if (!expect(source, &pos, func_end)) { - break; - } - if (source[pos] == '\r' || source[pos] == '\n') { - if (!func.name_.empty()) { - LogError("Missing function name"); - return false; - } - } - else { - // this is the dummy macro - pos = source.find_first_not_of("; \n\r", pos); - if (!expect(source, &pos, funcName)) { - LogError("Error reading function name"); - return false; - } - } - // Save the function end - func.code_.end_ = pos; - - if (!func.name_.empty()) { - // Create a new function - ILFunc* clFunc = new ILFunc(func); - if (clFunc != NULL) { - addFunc(clFunc); - } - else { - return false; - } - } - lastFuncPos = pos; - // Next function - pos = source.find(func_start, pos); - } - - return true; -} - -bool -NullProgram::findAllILFuncs(const std::string& source, size_t& lastFuncPos) -{ - // find all 
functions defined using "func" - size_t lastPos1; - bool ret = findILFuncs(source, "func ", "endfunc ", lastPos1); - if (!ret) return false; - - // find all functions defined using outlined macro - size_t lastPos2; - ret = findILFuncs(source, "mdef(", "mend", lastPos2); - if (!ret) return false; - - lastFuncPos = std::max(lastPos1, lastPos2); - return true; -} - -bool -NullProgram::parseAllILFuncs(const std::string& source) -{ - bool doPatch = true; - amd::option::Options *opts = getCompilerOptions(); - if (opts->isCStrOptionsEqual(opts->oVariables->XLang, "il")) { - doPatch = false; - } - // Find the patch position - if (doPatch) { - patch_ = source.find(";$$$$$$$$$$"); - if (patch_ == std::string::npos) { - return false; - } - } - - size_t lastFuncPos = 0; - if (!findAllILFuncs(source, lastFuncPos)) { + if (dataSize == 0) { return false; - } + } - // Initialize the global data if available - if (!initGlobalData(source, lastFuncPos)) { - LogError("We failed the global constants detection/initialization!"); + posStart = pos = source.find_first_not_of(";# \n\r", pos); + + char* globalData = new char[dataSize]; + if (globalData == NULL) { return false; - } + } - return true; -} - -bool -NullProgram::parseFuncMetadata(const std::string& source, size_t posBegin, size_t posEnd) -{ - ILFunc* baseFunc = NULL; - uint index; - size_t pos = posBegin; - while (pos < posEnd) { - if (!expect(source, &pos, ";")) { - break; - } - for (uint k = 0; k < DescTotal; ++k) { - uint funcIndex; - uint j; - - if (expect(source, &pos, ArgState[k].typeName_)) { - if (ArgState[k].type_ == KernelArg::ErrorMessage) { - // Next argument - size_t posNext = source.find(";", pos); - buildLog_.append("Error:"); - buildLog_.append(source.substr(pos, posNext - pos)); - return false; - } - else if (ArgState[k].type_ == KernelArg::WarningMessage) { - // Next argument - size_t posNext = source.find(";", pos); - buildLog_.append("Warning:"); - buildLog_.append(source.substr(pos, posNext - pos)); - 
continue; - } - else if (ArgState[k].type_ == KernelArg::PrivateFixed) { - baseFunc->flags_ |= Kernel::PrivateFixed; - continue; - } - else if (ArgState[k].type_ == KernelArg::ABI64Bit) { - baseFunc->flags_ |= Kernel::ABI64bit; - continue; - } - else if (ArgState[k].type_ == KernelArg::Wavefront) { - baseFunc->flags_ |= Kernel::LimitWorkgroup; - continue; - } - else if (ArgState[k].type_ == KernelArg::PrintfFormatStr) { - uint tmp; - uint arguments; - PrintfInfo info; - - // Read index - if (!getuint(source, &pos, &index)) { - return false; - } - if (printf_.size() <= index) { - printf_.resize(index + 1); - } - // Read the number of arguments - if (!getuint(source, &pos, &arguments)) { - return false; - } - for (uint j = 0; j < arguments; ++j) { - // Read the argument's size in bytes - if (!getuint(source, &pos, &tmp)) { - return false; - } - info.arguments_.push_back(tmp); - } - - // Read length - if (!getuint(source, &pos, &tmp)) { - return false; - } - // Read string (uses length so all possible chars are valid) - for (size_t i = 0; i < tmp; ++i) { - char symbol = source[pos++]; - if (symbol == '\\') { - // Rest of the C escape sequences (e.g. \') are handled correctly - // by the MDParser, we are not sure exactly how! - switch (source[pos]) { - case 'n': - pos++; - symbol = '\n'; - break; - case 'r': - pos++; - symbol = '\r'; - break; - case 'a': - pos++; - symbol = '\a'; - break; - case 'b': - pos++; - symbol = '\b'; - break; - case 'f': - pos++; - symbol = '\f'; - break; - case 'v': - pos++; - symbol = '\v'; - break; - default: - break; - } - } - info.fmtString_.push_back(symbol); - } - if (!expect(source, &pos, ";")) { - return false; - } - printf_[index] = info; - baseFunc->flags_ |= Kernel::PrintfOutput; - // Process next token ... 
- continue; - } - else if (ArgState[k].type_ == KernelArg::MetadataVersion) { - continue; - } - - // Read the index - if (!getuint(source, &pos, &index)) { - return false; - } - - switch (ArgState[k].type_) { - case KernelArg::PrivateSize: - baseFunc->privateSize_ = index; - continue; - case KernelArg::LocalSize: - baseFunc->localSize_ = index; - continue; - case KernelArg::HwPrivateSize: - baseFunc->hwPrivateSize_ = index; - continue; - case KernelArg::HwLocalSize: - baseFunc->hwLocalSize_ = index; - continue; + // Find the global data size + while (!expect(source, &pos, "DATAEND")) { + for (uint i = 0; i < DataTypeTotal; ++i) { + if (expect(source, &pos, DataType[i].tagName_)) { + // Read the offset + if (!getuint(source, &pos, &offset)) { + return false; + } + if (!getuint(source, &pos, &numElements)) { + return false; + } + for (uint j = 0; j < numElements; ++j) { + switch (DataType[i].type_) { + case KernelArg::Float: { + uint32_t* tmp = reinterpret_cast(globalData + offset); + if (!getuintHex(source, &pos, &tmp[j])) { + failed = true; + } + } break; + case KernelArg::Double: { + uint64_t* tmp = reinterpret_cast(globalData + offset); + if (!getuint64Hex(source, &pos, &tmp[j])) { + failed = true; + } + } break; + case KernelArg::Struct: + case KernelArg::Union: + // Struct and Union should be presented as bytes + // Fall through... 
+ case KernelArg::Char: { + uint8_t* tmp = reinterpret_cast(globalData + offset); + uint value; + if (!getuintHex(source, &pos, &value)) { + failed = true; + } + tmp[j] = static_cast(value); + } break; + case KernelArg::Short: { + uint16_t* tmp = reinterpret_cast(globalData + offset); + uint value; + if (!getuintHex(source, &pos, &value)) { + failed = true; + } + tmp[j] = static_cast(value); + } break; + case KernelArg::Int: + case KernelArg::UInt: { + uint32_t* tmp = reinterpret_cast(globalData + offset); + if (!getuintHex(source, &pos, &tmp[j])) { + failed = true; + } + } break; + case KernelArg::Long: + case KernelArg::ULong: { + uint64_t* tmp = reinterpret_cast(globalData + offset); + if (!getuint64Hex(source, &pos, &tmp[j])) { + failed = true; + } + } break; + case KernelArg::None: default: - break; - } - - if (!ArgState[k].size_) { - // Find the base function - baseFunc = findILFunc(index); - if (baseFunc == NULL) { - return false; - } - // Sanity check - if (baseFunc->state_ != ILFunc::Unknown) { - buildLog_ = "Error: Creating kernel "; - buildLog_ += baseFunc->name_; - buildLog_ += " failed!\n"; - LogError(buildLog_.c_str()); - continue; - } - // If we have __OpenCL_ prefix in the name - // and _kernel suffix, then this is a kernel function - const std::string prefix = "__OpenCL_"; - const std::string postfix = "_kernel"; - const std::string &fname = baseFunc->name_; - size_t namelen = fname.size(); - size_t postfixPos = namelen - postfix.size(); - if (fname.compare(0, prefix.size(), prefix) == 0 && - fname.compare(postfixPos, namelen, postfix) == 0) { - baseFunc->state_ = ILFunc::Kernel; - baseFunc->name_.erase(postfixPos, postfix.size()); - baseFunc->name_.erase(0, prefix.size()); - } - else { - baseFunc->state_ = ILFunc::Regular; - } - baseFunc->metadata_.begin_ = posBegin; - baseFunc->metadata_.end_ = posEnd; - continue; - } - - // Process metadata - for (j = 0; j < index; ++j) { - // Read the index - if (getuint(source, &pos, &funcIndex)) { - bool 
error = false; - if (ArgState[k].name_) { - ILFunc* func = findILFunc(funcIndex); - if (NULL != func) { - baseFunc->calls_.push_back(func); - } - else { - buildLog_ += "Error: Undeclared function index "; - error = true; - } - } - else { - if (funcIndex != 0xffffffff) { - baseFunc->macros_.push_back(funcIndex); - } - else { - buildLog_ += "Error: Undeclared macro index "; - error = true; - } - } - if (error) { - char str[8]; - intToStr(funcIndex, str, 8); - buildLog_ += str; - buildLog_ += "\n"; - LogError("Undeclared index!"); - return false; - } - } - else { - return false; - } - } + break; + } + if (failed) { + delete[] globalData; + return false; + } } - } - // Next argument - pos = source.find(";", pos); - } - return true; -} - -bool -NullProgram::parseKernels(const std::string& source) -{ - size_t pos = 0; - - // Strip out all the debug tokens as these are - // not needed yet, but will be used later. - while(1) { - pos = source.find(";DEBUGSTART", pos); - if (pos == std::string::npos) { break; + } } - size_t last = source.find(";DEBUGEND", pos); - const_cast(source).erase(pos, last - pos + 10); - pos = last; - } - // Create a list of all functions in the program - if (!parseAllILFuncs(source)) { + if (posStart == pos) { + delete[] globalData; + return false; + } + posStart = pos = source.find_first_not_of(";# \n\r", pos); + } + + if (!allocGlobalData(globalData, dataSize, index)) { + failed = true; + } + + if (!clBinary()->storeGlobalData(globalData, dataSize, index)) { + failed = true; + } + + delete[] globalData; + + // Erase the global store information + if (index != 0) { + if (expect(source, &pos, ":")) { + // Read the size + if (!getuint(source, &pos, &index)) { + return false; + } + } + } + pos = source.find_first_not_of(";# \n\r", pos); + (const_cast(source)).erase(dataStart, pos - dataStart); + pos = dataStart; + if (failed) { return false; + } } - pos = 0; - // Find all available metadata structures - for (size_t i = 0; i < funcs_.size(); ++i) { - 
std::string funcName; - ILFunc::SourceRange range; + } - // Find function metadata start - range.begin_ = pos = source.find(";ARGSTART:", pos); - if (pos == std::string::npos) { + return true; +} + +bool NullProgram::findILFuncs(const std::string& source, const std::string& func_start, + const std::string& func_end, size_t& lastFuncPos) { + lastFuncPos = 0; + + // Find first tag + size_t pos = source.find(func_start); + + // Loop through all provided program arguments + while (pos != std::string::npos) { + std::string funcName; + ILFunc func; + + func.code_.begin_ = pos; + if (!expect(source, &pos, func_start)) { + break; + } + + pos = source.find_first_not_of(" \n\r", pos); + // Read the function index + if (!getuint(source, &pos, &func.index_)) { + LogError("Error reading function index"); + return false; + } + + pos = source.find_first_of(";\n\r", pos); + if (source[pos] == '\r' || source[pos] == '\n') { + // this is the dummy macro + func.name_ = std::string(""); + } else { + pos = source.find_first_not_of("; \n\r", pos); + // Read the function's name + if (!getword(source, &pos, funcName)) { + LogError("Error reading function name"); + return false; + } + func.name_ = funcName; + } + + // Find the function end + pos = source.find(func_end, pos); + if (!expect(source, &pos, func_end)) { + break; + } + if (source[pos] == '\r' || source[pos] == '\n') { + if (!func.name_.empty()) { + LogError("Missing function name"); + return false; + } + } else { + // this is the dummy macro + pos = source.find_first_not_of("; \n\r", pos); + if (!expect(source, &pos, funcName)) { + LogError("Error reading function name"); + return false; + } + } + // Save the function end + func.code_.end_ = pos; + + if (!func.name_.empty()) { + // Create a new function + ILFunc* clFunc = new ILFunc(func); + if (clFunc != NULL) { + addFunc(clFunc); + } else { + return false; + } + } + lastFuncPos = pos; + // Next function + pos = source.find(func_start, pos); + } + + return true; +} + +bool 
NullProgram::findAllILFuncs(const std::string& source, size_t& lastFuncPos) { + // find all functions defined using "func" + size_t lastPos1; + bool ret = findILFuncs(source, "func ", "endfunc ", lastPos1); + if (!ret) return false; + + // find all functions defined using outlined macro + size_t lastPos2; + ret = findILFuncs(source, "mdef(", "mend", lastPos2); + if (!ret) return false; + + lastFuncPos = std::max(lastPos1, lastPos2); + return true; +} + +bool NullProgram::parseAllILFuncs(const std::string& source) { + bool doPatch = true; + amd::option::Options* opts = getCompilerOptions(); + if (opts->isCStrOptionsEqual(opts->oVariables->XLang, "il")) { + doPatch = false; + } + // Find the patch position + if (doPatch) { + patch_ = source.find(";$$$$$$$$$$"); + if (patch_ == std::string::npos) { + return false; + } + } + + size_t lastFuncPos = 0; + if (!findAllILFuncs(source, lastFuncPos)) { + return false; + } + + // Initialize the global data if available + if (!initGlobalData(source, lastFuncPos)) { + LogError("We failed the global constants detection/initialization!"); + return false; + } + + return true; +} + +bool NullProgram::parseFuncMetadata(const std::string& source, size_t posBegin, size_t posEnd) { + ILFunc* baseFunc = NULL; + uint index; + size_t pos = posBegin; + while (pos < posEnd) { + if (!expect(source, &pos, ";")) { + break; + } + for (uint k = 0; k < DescTotal; ++k) { + uint funcIndex; + uint j; + + if (expect(source, &pos, ArgState[k].typeName_)) { + if (ArgState[k].type_ == KernelArg::ErrorMessage) { + // Next argument + size_t posNext = source.find(";", pos); + buildLog_.append("Error:"); + buildLog_.append(source.substr(pos, posNext - pos)); + return false; + } else if (ArgState[k].type_ == KernelArg::WarningMessage) { + // Next argument + size_t posNext = source.find(";", pos); + buildLog_.append("Warning:"); + buildLog_.append(source.substr(pos, posNext - pos)); + continue; + } else if (ArgState[k].type_ == KernelArg::PrivateFixed) { + 
baseFunc->flags_ |= Kernel::PrivateFixed; + continue; + } else if (ArgState[k].type_ == KernelArg::ABI64Bit) { + baseFunc->flags_ |= Kernel::ABI64bit; + continue; + } else if (ArgState[k].type_ == KernelArg::Wavefront) { + baseFunc->flags_ |= Kernel::LimitWorkgroup; + continue; + } else if (ArgState[k].type_ == KernelArg::PrintfFormatStr) { + uint tmp; + uint arguments; + PrintfInfo info; + + // Read index + if (!getuint(source, &pos, &index)) { + return false; + } + if (printf_.size() <= index) { + printf_.resize(index + 1); + } + // Read the number of arguments + if (!getuint(source, &pos, &arguments)) { + return false; + } + for (uint j = 0; j < arguments; ++j) { + // Read the argument's size in bytes + if (!getuint(source, &pos, &tmp)) { + return false; + } + info.arguments_.push_back(tmp); + } + + // Read length + if (!getuint(source, &pos, &tmp)) { + return false; + } + // Read string (uses length so all possible chars are valid) + for (size_t i = 0; i < tmp; ++i) { + char symbol = source[pos++]; + if (symbol == '\\') { + // Rest of the C escape sequences (e.g. \') are handled correctly + // by the MDParser, we are not sure exactly how! + switch (source[pos]) { + case 'n': + pos++; + symbol = '\n'; + break; + case 'r': + pos++; + symbol = '\r'; + break; + case 'a': + pos++; + symbol = '\a'; + break; + case 'b': + pos++; + symbol = '\b'; + break; + case 'f': + pos++; + symbol = '\f'; + break; + case 'v': + pos++; + symbol = '\v'; + break; + default: + break; + } + } + info.fmtString_.push_back(symbol); + } + if (!expect(source, &pos, ";")) { + return false; + } + printf_[index] = info; + baseFunc->flags_ |= Kernel::PrintfOutput; + // Process next token ... 
+ continue; + } else if (ArgState[k].type_ == KernelArg::MetadataVersion) { + continue; + } + + // Read the index + if (!getuint(source, &pos, &index)) { + return false; + } + + switch (ArgState[k].type_) { + case KernelArg::PrivateSize: + baseFunc->privateSize_ = index; + continue; + case KernelArg::LocalSize: + baseFunc->localSize_ = index; + continue; + case KernelArg::HwPrivateSize: + baseFunc->hwPrivateSize_ = index; + continue; + case KernelArg::HwLocalSize: + baseFunc->hwLocalSize_ = index; + continue; + default: break; } - // Find function metadata end - pos = source.find(";ARGEND:", pos); - if (!expect(source, &pos, ";ARGEND:")) { - break; - } - // Read the function's name - if (!getword(source, &pos, funcName)) { + if (!ArgState[k].size_) { + // Find the base function + baseFunc = findILFunc(index); + if (baseFunc == NULL) { return false; + } + // Sanity check + if (baseFunc->state_ != ILFunc::Unknown) { + buildLog_ = "Error: Creating kernel "; + buildLog_ += baseFunc->name_; + buildLog_ += " failed!\n"; + LogError(buildLog_.c_str()); + continue; + } + // If we have __OpenCL_ prefix in the name + // and _kernel suffix, then this is a kernel function + const std::string prefix = "__OpenCL_"; + const std::string postfix = "_kernel"; + const std::string& fname = baseFunc->name_; + size_t namelen = fname.size(); + size_t postfixPos = namelen - postfix.size(); + if (fname.compare(0, prefix.size(), prefix) == 0 && + fname.compare(postfixPos, namelen, postfix) == 0) { + baseFunc->state_ = ILFunc::Kernel; + baseFunc->name_.erase(postfixPos, postfix.size()); + baseFunc->name_.erase(0, prefix.size()); + } else { + baseFunc->state_ = ILFunc::Regular; + } + baseFunc->metadata_.begin_ = posBegin; + baseFunc->metadata_.end_ = posEnd; + continue; } - pos = source.find_first_not_of(" \n\r", pos); - range.end_ = pos; - if (!parseFuncMetadata(source, range.begin_, range.end_)) { + + // Process metadata + for (j = 0; j < index; ++j) { + // Read the index + if 
(getuint(source, &pos, &funcIndex)) { + bool error = false; + if (ArgState[k].name_) { + ILFunc* func = findILFunc(funcIndex); + if (NULL != func) { + baseFunc->calls_.push_back(func); + } else { + buildLog_ += "Error: Undeclared function index "; + error = true; + } + } else { + if (funcIndex != 0xffffffff) { + baseFunc->macros_.push_back(funcIndex); + } else { + buildLog_ += "Error: Undeclared macro index "; + error = true; + } + } + if (error) { + char str[8]; + intToStr(funcIndex, str, 8); + buildLog_ += str; + buildLog_ += "\n"; + LogError("Undeclared index!"); + return false; + } + } else { return false; + } } + } } - return true; + // Next argument + pos = source.find(";", pos); + } + return true; } -void NullProgram::freeAllILFuncs() -{ - for (size_t i = 0; i < funcs_.size(); ++i) { - delete funcs_[i]; +bool NullProgram::parseKernels(const std::string& source) { + size_t pos = 0; + + // Strip out all the debug tokens as these are + // not needed yet, but will be used later. + while (1) { + pos = source.find(";DEBUGSTART", pos); + if (pos == std::string::npos) { + break; } - funcs_.clear(); + size_t last = source.find(";DEBUGEND", pos); + const_cast(source).erase(pos, last - pos + 10); + pos = last; + } + // Create a list of all functions in the program + if (!parseAllILFuncs(source)) { + return false; + } + pos = 0; + // Find all available metadata structures + for (size_t i = 0; i < funcs_.size(); ++i) { + std::string funcName; + ILFunc::SourceRange range; + + // Find function metadata start + range.begin_ = pos = source.find(";ARGSTART:", pos); + if (pos == std::string::npos) { + break; + } + + // Find function metadata end + pos = source.find(";ARGEND:", pos); + if (!expect(source, &pos, ";ARGEND:")) { + break; + } + // Read the function's name + if (!getword(source, &pos, funcName)) { + return false; + } + pos = source.find_first_not_of(" \n\r", pos); + range.end_ = pos; + if (!parseFuncMetadata(source, range.begin_, range.end_)) { + return false; + } + 
} + return true; } -ILFunc* -NullProgram::findILFunc(uint index) -{ - for (size_t i = 0; i < funcs_.size(); ++i) { - if (funcs_[i]->index_ == index) { - return funcs_[i]; - } +void NullProgram::freeAllILFuncs() { + for (size_t i = 0; i < funcs_.size(); ++i) { + delete funcs_[i]; + } + funcs_.clear(); +} + +ILFunc* NullProgram::findILFunc(uint index) { + for (size_t i = 0; i < funcs_.size(); ++i) { + if (funcs_[i]->index_ == index) { + return funcs_[i]; } + } + return NULL; +} + +NullKernel* NullProgram::createKernel(const std::string& name, const Kernel::InitData* initData, + const std::string& code, const std::string& metadata, + bool* created, const void* binaryCode, size_t binarySize) { + amd::option::Options* options = getCompilerOptions(); + uint64_t start_time = 0; + if (options->oVariables->EnableBuildTiming) { + start_time = amd::Os::timeNanos(); + } + + *created = false; + // Create a GPU kernel + NullKernel* gpuKernel = + new NullKernel(name, static_cast(device()), *this); + + if (gpuKernel == NULL) { + buildLog_ += "new Kernel() failed"; + LogPrintfError("new Kernel() failed for kernel %s!", name.c_str()); return NULL; -} + } else if (gpuKernel->create(code, metadata, binaryCode, binarySize)) { + // Add kernel to the program + kernels()[gpuKernel->name()] = gpuKernel; + buildLog_ += gpuKernel->buildLog(); + } else { + buildError_ = gpuKernel->buildError(); + buildLog_ += gpuKernel->buildLog(); + delete gpuKernel; + LogPrintfError("Kernel creation failed for kernel %s!", name.c_str()); + return NULL; + } -NullKernel* -NullProgram::createKernel( - const std::string& name, const Kernel::InitData* initData, - const std::string& code, const std::string& metadata, bool* created, - const void* binaryCode, size_t binarySize) -{ - amd::option::Options *options = getCompilerOptions(); - uint64_t start_time = 0; - if (options->oVariables->EnableBuildTiming) { - start_time = amd::Os::timeNanos(); - } + if (options->oVariables->EnableBuildTiming) { + 
std::stringstream tmp_ss; + tmp_ss << " Time for creating kernel (" << name + << ") : " << (amd::Os::timeNanos() - start_time) / 1000ULL << " us\n"; + buildLog_ += tmp_ss.str(); + } - *created = false; - // Create a GPU kernel - NullKernel* gpuKernel = new NullKernel(name, - static_cast(device()), *this); - - if (gpuKernel == NULL) { - buildLog_ += "new Kernel() failed"; - LogPrintfError("new Kernel() failed for kernel %s!", - name.c_str()); - return NULL; - } - else if (gpuKernel->create(code, metadata, binaryCode, binarySize)) { - // Add kernel to the program - kernels()[gpuKernel->name()] = gpuKernel; - buildLog_ += gpuKernel->buildLog(); - } - else { - buildError_ = gpuKernel->buildError(); - buildLog_ += gpuKernel->buildLog(); - delete gpuKernel; - LogPrintfError("Kernel creation failed for kernel %s!", name.c_str()); - return NULL; - } - - if (options->oVariables->EnableBuildTiming) { - std::stringstream tmp_ss; - tmp_ss << " Time for creating kernel (" - << name << ") : " - << (amd::Os::timeNanos() - start_time)/1000ULL - << " us\n"; - buildLog_ += tmp_ss.str(); - } - - *created = true; - return gpuKernel; + *created = true; + return gpuKernel; } // Invoked from ClBinary -bool -NullProgram::getAllKernelILs(std::map& allKernelILs, - std::string& programIL, const char* ilKernelName) -{ - llvm::CompUnit compunit (programIL); - if (ilKernelName != NULL) { - std::string MangeledName("__OpenCL_"); - MangeledName.append(ilKernelName); - MangeledName.append("_kernel"); - for (int i=0; i < static_cast(compunit.getNumKernels()); ++i) { - std::string kernelname = compunit.getKernelName(i); - if (kernelname.compare(MangeledName) == 0) { - allKernelILs[kernelname] = compunit.getKernelStr(i); - break; - } - } +bool NullProgram::getAllKernelILs(std::map& allKernelILs, + std::string& programIL, const char* ilKernelName) { + llvm::CompUnit compunit(programIL); + if (ilKernelName != NULL) { + std::string MangeledName("__OpenCL_"); + MangeledName.append(ilKernelName); + 
MangeledName.append("_kernel"); + for (int i = 0; i < static_cast(compunit.getNumKernels()); ++i) { + std::string kernelname = compunit.getKernelName(i); + if (kernelname.compare(MangeledName) == 0) { + allKernelILs[kernelname] = compunit.getKernelStr(i); + break; + } } - else { - for (int i=0; i < static_cast(compunit.getNumKernels()); ++i) { - std::string kernelname = compunit.getKernelName(i); - allKernelILs[kernelname] = compunit.getKernelStr(i); - } + } else { + for (int i = 0; i < static_cast(compunit.getNumKernels()); ++i) { + std::string kernelname = compunit.getKernelName(i); + allKernelILs[kernelname] = compunit.getKernelStr(i); } + } + return true; +} + +bool NullProgram::createBinary(amd::option::Options* options) { + if (options->oVariables->BinBIF30) { return true; -} - -bool -NullProgram::createBinary(amd::option::Options* options) -{ - if (options->oVariables->BinBIF30) { - return true; - } - - if (!clBinary()->createElfBinary(options->oVariables->BinEncrypt, - type())) { - LogError("Failed to create ELF binary image!"); - return false; - } - return true; -} - -Program::~Program() -{ - // Destroy the global HW constant buffers - const Program::HwConstBuffers& gds = glbHwCb(); - for (Program::HwConstBuffers::const_iterator it = gds.begin(); it != gds.end(); ++it) { - delete it->second; - } - - // Destroy the global data store - if (glbData_ != NULL) { - delete glbData_; - } -} - -bool -Program::allocGlobalData(const void* globalData, size_t dataSize, uint index) -{ - bool result = false; - gpu::Memory* dataStore = NULL; - - if (index == 0) { - // We have to lock the heap block allocation, - // so possible reallocation won't occur twice or - // another thread could destroy a heap block, - // while we didn't finish allocation - amd::ScopedLock k(dev().lockAsyncOps()); - - // Allocate memory for the global data store - glbData_ = dev().createScratchBuffer(amd::alignUp(dataSize, 0x1000)); - dataStore = glbData_; - } - else { - dataStore = new 
Memory(dev(), amd::alignUp(dataSize, ConstBuffer::VectorSize)); - - // Initialize constant buffer - if ((dataStore == NULL) || !dataStore->create(Resource::RemoteUSWC)) { - delete dataStore; - } - else { - constBufs_[index] = dataStore; - glbCb_.push_back(index); - } - } - - if (dataStore != NULL) { - // Upload data to GPU memory - static const bool Entire = true; - amd::Coord3D origin(0, 0, 0); - amd::Coord3D region(dataSize); - result = dev().xferMgr().writeBuffer(globalData, - *dataStore, origin, region, Entire); - } - - return result; -} - -bool -Program::loadBinary(bool* hasRecompile) -{ - if (clBinary()->loadKernels(*this, hasRecompile)) { - // Load the global data - if (clBinary()->loadGlobalData(*this)) { - return true; - } - } - - // Make sure that kernels that have been generated so far shall be deleted. - clear(); + } + if (!clBinary()->createElfBinary(options->oVariables->BinEncrypt, type())) { + LogError("Failed to create ELF binary image!"); return false; + } + return true; +} + +Program::~Program() { + // Destroy the global HW constant buffers + const Program::HwConstBuffers& gds = glbHwCb(); + for (Program::HwConstBuffers::const_iterator it = gds.begin(); it != gds.end(); ++it) { + delete it->second; + } + + // Destroy the global data store + if (glbData_ != NULL) { + delete glbData_; + } +} + +bool Program::allocGlobalData(const void* globalData, size_t dataSize, uint index) { + bool result = false; + gpu::Memory* dataStore = NULL; + + if (index == 0) { + // We have to lock the heap block allocation, + // so possible reallocation won't occur twice or + // another thread could destroy a heap block, + // while we didn't finish allocation + amd::ScopedLock k(dev().lockAsyncOps()); + + // Allocate memory for the global data store + glbData_ = dev().createScratchBuffer(amd::alignUp(dataSize, 0x1000)); + dataStore = glbData_; + } else { + dataStore = new Memory(dev(), amd::alignUp(dataSize, ConstBuffer::VectorSize)); + + // Initialize constant buffer + 
if ((dataStore == NULL) || !dataStore->create(Resource::RemoteUSWC)) { + delete dataStore; + } else { + constBufs_[index] = dataStore; + glbCb_.push_back(index); + } + } + + if (dataStore != NULL) { + // Upload data to GPU memory + static const bool Entire = true; + amd::Coord3D origin(0, 0, 0); + amd::Coord3D region(dataSize); + result = dev().xferMgr().writeBuffer(globalData, *dataStore, origin, region, Entire); + } + + return result; +} + +bool Program::loadBinary(bool* hasRecompile) { + if (clBinary()->loadKernels(*this, hasRecompile)) { + // Load the global data + if (clBinary()->loadGlobalData(*this)) { + return true; + } + } + + // Make sure that kernels that have been generated so far shall be deleted. + clear(); + + return false; } HSAILProgram::HSAILProgram(Device& device) - : Program(device) - , llvmBinary_() - , binaryElf_(NULL) - , rawBinary_(NULL) - , kernels_(NULL) - , maxScratchRegs_(0) - , isNull_(false) - , executable_(NULL) - , loaderContext_(this) -{ - memset(&binOpts_, 0, sizeof(binOpts_)); - binOpts_.struct_size = sizeof(binOpts_); - binOpts_.elfclass = LP64_SWITCH(ELFCLASS32, ELFCLASS64); - binOpts_.bitness = ELFDATA2LSB; - binOpts_.alloc = &::malloc; - binOpts_.dealloc = &::free; - loader_ = amd::hsa::loader::Loader::Create(&loaderContext_); + : Program(device), + llvmBinary_(), + binaryElf_(NULL), + rawBinary_(NULL), + kernels_(NULL), + maxScratchRegs_(0), + isNull_(false), + executable_(NULL), + loaderContext_(this) { + memset(&binOpts_, 0, sizeof(binOpts_)); + binOpts_.struct_size = sizeof(binOpts_); + binOpts_.elfclass = LP64_SWITCH(ELFCLASS32, ELFCLASS64); + binOpts_.bitness = ELFDATA2LSB; + binOpts_.alloc = &::malloc; + binOpts_.dealloc = &::free; + loader_ = amd::hsa::loader::Loader::Create(&loaderContext_); } HSAILProgram::HSAILProgram(NullDevice& device) - : Program(device) - , llvmBinary_() - , binaryElf_(NULL) - , rawBinary_(NULL) - , kernels_(NULL) - , maxScratchRegs_(0) - , isNull_(true) - , executable_(NULL) - , 
loaderContext_(this) -{ - memset(&binOpts_, 0, sizeof(binOpts_)); - binOpts_.struct_size = sizeof(binOpts_); - binOpts_.elfclass = LP64_SWITCH(ELFCLASS32, ELFCLASS64); - binOpts_.bitness = ELFDATA2LSB; - binOpts_.alloc = &::malloc; - binOpts_.dealloc = &::free; - loader_ = amd::hsa::loader::Loader::Create(&loaderContext_); + : Program(device), + llvmBinary_(), + binaryElf_(NULL), + rawBinary_(NULL), + kernels_(NULL), + maxScratchRegs_(0), + isNull_(true), + executable_(NULL), + loaderContext_(this) { + memset(&binOpts_, 0, sizeof(binOpts_)); + binOpts_.struct_size = sizeof(binOpts_); + binOpts_.elfclass = LP64_SWITCH(ELFCLASS32, ELFCLASS64); + binOpts_.bitness = ELFDATA2LSB; + binOpts_.alloc = &::malloc; + binOpts_.dealloc = &::free; + loader_ = amd::hsa::loader::Loader::Create(&loaderContext_); } -HSAILProgram::~HSAILProgram() -{ - // Destroy internal static samplers - for (auto& it : staticSamplers_) { - delete it; +HSAILProgram::~HSAILProgram() { + // Destroy internal static samplers + for (auto& it : staticSamplers_) { + delete it; + } + if (rawBinary_ != NULL) { + aclFreeMem(binaryElf_, rawBinary_); + } + acl_error error; + // Free the elf binary + if (binaryElf_ != NULL) { + error = aclBinaryFini(binaryElf_); + if (error != ACL_SUCCESS) { + LogWarning("Error while destroying the acl binary \n"); } - if (rawBinary_ != NULL) { - aclFreeMem(binaryElf_, rawBinary_); - } - acl_error error; - // Free the elf binary - if (binaryElf_ != NULL) { - error = aclBinaryFini(binaryElf_); - if (error != ACL_SUCCESS) { - LogWarning( "Error while destroying the acl binary \n" ); - } - } - releaseClBinary(); - if (executable_ != NULL) { - loader_->DestroyExecutable(executable_); - } - delete kernels_; - amd::hsa::loader::Loader::Destroy(loader_); + } + releaseClBinary(); + if (executable_ != NULL) { + loader_->DestroyExecutable(executable_); + } + delete kernels_; + amd::hsa::loader::Loader::Destroy(loader_); } -bool -HSAILProgram::initBuild(amd::option::Options *options) -{ - 
if (!device::Program::initBuild(options)) { +bool HSAILProgram::initBuild(amd::option::Options* options) { + if (!device::Program::initBuild(options)) { + return false; + } + + const char* devName = dev().hwInfo()->machineTarget_; + options->setPerBuildInfo((devName && (devName[0] != '\0')) ? devName : "gpu", + clBinary()->getEncryptCode(), true); + + // Elf Binary setup + std::string outFileName; + + // true means fsail required + clBinary()->init(options, true); + if (options->isDumpFlagSet(amd::option::DUMP_BIF)) { + outFileName = options->getDumpFileName(".bin"); + } + + if (!clBinary()->setElfOut(LP64_SWITCH(ELFCLASS32, ELFCLASS64), + (outFileName.size() > 0) ? outFileName.c_str() : NULL)) { + LogError("Setup elf out for gpu failed"); + return false; + } + return true; +} + +bool HSAILProgram::finiBuild(bool isBuildGood) { + clBinary()->resetElfOut(); + clBinary()->resetElfIn(); + + if (!isBuildGood) { + // Prevent the encrypted binary form leaking out + clBinary()->setBinary(NULL, 0); + } + + return device::Program::finiBuild(isBuildGood); +} + +bool HSAILProgram::linkImpl(const std::vector& inputPrograms, + amd::option::Options* options, bool createLibrary) { + std::vector::const_iterator it = inputPrograms.begin(); + std::vector::const_iterator itEnd = inputPrograms.end(); + acl_error errorCode; + + // For each program we need to extract the LLVMIR and create + // aclBinary for each + std::vector binaries_to_link; + + for (size_t i = 0; it != itEnd; ++it, ++i) { + HSAILProgram* program = (HSAILProgram*)*it; + // Check if the program was created with clCreateProgramWIthBinary + binary_t binary = program->binary(); + if ((binary.first != NULL) && (binary.second > 0)) { + // Binary already exists -- we can also check if there is no + // opencl source code + // Need to check if LLVMIR exists in the binary + // If LLVMIR does not exist then is it valid + // We need to pull out all the compiled kernels + // We cannot do this at present because we need at least + 
// Hsail text to pull the kernels oout + void* mem = const_cast(binary.first); + binaryElf_ = aclReadFromMem(mem, binary.second, &errorCode); + if (errorCode != ACL_SUCCESS) { + LogWarning("Error while linking : Could not read from raw binary"); return false; + } } - - const char* devName = dev().hwInfo()->machineTarget_; - options->setPerBuildInfo( - (devName && (devName[0] != '\0')) ? devName : "gpu", - clBinary()->getEncryptCode(), true); - - // Elf Binary setup - std::string outFileName; - - // true means fsail required - clBinary()->init(options, true); - if (options->isDumpFlagSet(amd::option::DUMP_BIF)) { - outFileName = options->getDumpFileName(".bin"); - } - - if (!clBinary()->setElfOut(LP64_SWITCH(ELFCLASS32, ELFCLASS64), - (outFileName.size() > 0) ? outFileName.c_str() : NULL)) { - LogError("Setup elf out for gpu failed"); - return false; - } - return true; -} - -bool -HSAILProgram::finiBuild(bool isBuildGood) -{ - clBinary()->resetElfOut(); - clBinary()->resetElfIn(); - - if (!isBuildGood) { - // Prevent the encrypted binary form leaking out - clBinary()->setBinary(NULL, 0); - } - - return device::Program::finiBuild(isBuildGood); -} - -bool -HSAILProgram::linkImpl( - const std::vector &inputPrograms, - amd::option::Options *options, - bool createLibrary) -{ - std::vector::const_iterator it - = inputPrograms.begin(); - std::vector::const_iterator itEnd - = inputPrograms.end(); - acl_error errorCode; - - // For each program we need to extract the LLVMIR and create - // aclBinary for each - std::vector binaries_to_link; - - for (size_t i = 0; it != itEnd; ++it, ++i) { - HSAILProgram *program = (HSAILProgram *)*it; - // Check if the program was created with clCreateProgramWIthBinary - binary_t binary = program->binary(); - if ((binary.first != NULL) && (binary.second > 0)) { - // Binary already exists -- we can also check if there is no - // opencl source code - // Need to check if LLVMIR exists in the binary - // If LLVMIR does not exist then is it valid - 
// We need to pull out all the compiled kernels - // We cannot do this at present because we need at least - // Hsail text to pull the kernels oout - void *mem = const_cast(binary.first); - binaryElf_ = aclReadFromMem(mem, binary.second, &errorCode); - if (errorCode != ACL_SUCCESS) { - LogWarning("Error while linking : Could not read from raw binary"); - return false; - } - } - // At this stage each HSAILProgram contains a valid binary_elf - // Check if LLVMIR is in the binary - // @TODO - Memory leak , cannot free this buffer - // need to fix this.. File EPR on compiler library - size_t llvmirSize = 0; - const void *llvmirText = aclExtractSection(dev().hsaCompiler(), - binaryElf_, &llvmirSize, aclLLVMIR, &errorCode); + // At this stage each HSAILProgram contains a valid binary_elf + // Check if LLVMIR is in the binary + // @TODO - Memory leak , cannot free this buffer + // need to fix this.. File EPR on compiler library + size_t llvmirSize = 0; + const void* llvmirText = + aclExtractSection(dev().hsaCompiler(), binaryElf_, &llvmirSize, aclLLVMIR, &errorCode); + if (errorCode != ACL_SUCCESS) { + bool spirv = false; + size_t boolSize = sizeof(bool); + errorCode = + aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_CONTAINS_SPIRV, NULL, &spirv, &boolSize); + if (errorCode != ACL_SUCCESS) { + spirv = false; + } + if (spirv) { + errorCode = aclCompile(dev().hsaCompiler(), binaryElf_, options->origOptionStr.c_str(), + ACL_TYPE_SPIRV_BINARY, ACL_TYPE_LLVMIR_BINARY, NULL); + buildLog_ += aclGetCompilerLog(dev().hsaCompiler()); if (errorCode != ACL_SUCCESS) { - bool spirv = false; - size_t boolSize = sizeof(bool); - errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, - RT_CONTAINS_SPIRV, NULL, &spirv, &boolSize); - if (errorCode != ACL_SUCCESS) { - spirv = false; - } - if (spirv) { - errorCode = aclCompile(dev().hsaCompiler(), binaryElf_, - options->origOptionStr.c_str(), ACL_TYPE_SPIRV_BINARY, - ACL_TYPE_LLVMIR_BINARY, NULL); - buildLog_ += 
aclGetCompilerLog(dev().hsaCompiler()); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error while linking: Could not load SPIR-V" ; - return false; - } - } else { - buildLog_ +="Error while linking : \ - Invalid binary (Missing LLVMIR section)" ; - return false; - } + buildLog_ += "Error while linking: Could not load SPIR-V"; + return false; } - // Create a new aclBinary for each LLVMIR and save it in a list - aclBIFVersion ver = aclBinaryVersion(binaryElf_); - aclBinary *bin = aclCreateFromBinary(binaryElf_, ver); - binaries_to_link.push_back(bin); - } - - errorCode = aclLink(dev().hsaCompiler(), - binaries_to_link[0], binaries_to_link.size() - 1, - binaries_to_link.size() > 1 ? &binaries_to_link[1] : NULL, - ACL_TYPE_LLVMIR_BINARY, "-create-library", NULL); - if (errorCode != ACL_SUCCESS) { - buildLog_ += aclGetCompilerLog(dev().hsaCompiler()); - buildLog_ +="Error while linking : aclLink failed" ; + } else { + buildLog_ += + "Error while linking : \ + Invalid binary (Missing LLVMIR section)"; return false; + } } - // Store the newly linked aclBinary for this program. - binaryElf_ = binaries_to_link[0]; - // Free all the other aclBinaries - for (size_t i = 1; i < binaries_to_link.size(); i++) { - aclBinaryFini(binaries_to_link[i]); - } - if (createLibrary) { - saveBinaryAndSetType(TYPE_LIBRARY); - buildLog_ += aclGetCompilerLog(dev().hsaCompiler()); - return true; - } - // Now call linkImpl with the new options - return linkImpl(options); + // Create a new aclBinary for each LLVMIR and save it in a list + aclBIFVersion ver = aclBinaryVersion(binaryElf_); + aclBinary* bin = aclCreateFromBinary(binaryElf_, ver); + binaries_to_link.push_back(bin); + } + + errorCode = aclLink(dev().hsaCompiler(), binaries_to_link[0], binaries_to_link.size() - 1, + binaries_to_link.size() > 1 ? 
&binaries_to_link[1] : NULL, + ACL_TYPE_LLVMIR_BINARY, "-create-library", NULL); + if (errorCode != ACL_SUCCESS) { + buildLog_ += aclGetCompilerLog(dev().hsaCompiler()); + buildLog_ += "Error while linking : aclLink failed"; + return false; + } + // Store the newly linked aclBinary for this program. + binaryElf_ = binaries_to_link[0]; + // Free all the other aclBinaries + for (size_t i = 1; i < binaries_to_link.size(); i++) { + aclBinaryFini(binaries_to_link[i]); + } + if (createLibrary) { + saveBinaryAndSetType(TYPE_LIBRARY); + buildLog_ += aclGetCompilerLog(dev().hsaCompiler()); + return true; + } + // Now call linkImpl with the new options + return linkImpl(options); } -aclType -HSAILProgram::getCompilationStagesFromBinary(std::vector& completeStages, bool& needOptionsCheck) -{ - acl_error errorCode; - size_t secSize = 0; - completeStages.clear(); - aclType from = ACL_TYPE_DEFAULT; - needOptionsCheck = true; - size_t boolSize = sizeof(bool); - //! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT? 
- // Checking llvmir in .llvmir section - bool containsSpirv = true; - errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, - RT_CONTAINS_SPIRV, NULL, &containsSpirv, &boolSize); - if (errorCode != ACL_SUCCESS) { - containsSpirv = false; - } - if (containsSpirv) { - completeStages.push_back(from); - from = ACL_TYPE_SPIRV_BINARY; - } - bool containsSpirText = true; - errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_CONTAINS_SPIR, NULL, &containsSpirText, &boolSize); - if (errorCode != ACL_SUCCESS) { - containsSpirText = false; - } - if (containsSpirText) { - completeStages.push_back(from); - from = ACL_TYPE_SPIR_BINARY; - } - bool containsLlvmirText = true; - errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_CONTAINS_LLVMIR, NULL, &containsLlvmirText, &boolSize); - if (errorCode != ACL_SUCCESS) { - containsLlvmirText = false; - } - // Checking compile & link options in .comment section - bool containsOpts = true; - errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_CONTAINS_OPTIONS, NULL, &containsOpts, &boolSize); - if (errorCode != ACL_SUCCESS) { - containsOpts = false; - } - if (containsLlvmirText && containsOpts) { - completeStages.push_back(from); - from = ACL_TYPE_LLVMIR_BINARY; - } - // Checking HSAIL in .cg section - bool containsHsailText = true; - errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_CONTAINS_HSAIL, NULL, &containsHsailText, &boolSize); - if (errorCode != ACL_SUCCESS) { - containsHsailText = false; - } - // Checking BRIG sections - bool containsBrig = true; - errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_CONTAINS_BRIG, NULL, &containsBrig, &boolSize); - if (errorCode != ACL_SUCCESS) { - containsBrig = false; - } - if (containsBrig) { - completeStages.push_back(from); - from = ACL_TYPE_HSAIL_BINARY; - } else if (containsHsailText) { - completeStages.push_back(from); - from = ACL_TYPE_HSAIL_TEXT; - } - // Checking Loader Map symbol from CG section - bool containsLoaderMap = true; - 
errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_CONTAINS_LOADER_MAP, NULL, &containsLoaderMap, &boolSize); - if (errorCode != ACL_SUCCESS) { - containsLoaderMap = false; - } - if (containsLoaderMap) { - completeStages.push_back(from); - from = ACL_TYPE_CG; - } - // Checking ISA in .text section - bool containsShaderIsa = true; - errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_CONTAINS_ISA, NULL, &containsShaderIsa, &boolSize); - if (errorCode != ACL_SUCCESS) { - containsShaderIsa = false; - } - if (containsShaderIsa) { - completeStages.push_back(from); - from = ACL_TYPE_ISA; - } - std::string sCurOptions = compileOptions_ + linkOptions_; - amd::option::Options curOptions; - if (!amd::option::parseAllOptions(sCurOptions, curOptions)) { - buildLog_ += curOptions.optionsLog(); - LogError("Parsing compile options failed."); - return ACL_TYPE_DEFAULT; - } - switch (from) { +aclType HSAILProgram::getCompilationStagesFromBinary(std::vector& completeStages, + bool& needOptionsCheck) { + acl_error errorCode; + size_t secSize = 0; + completeStages.clear(); + aclType from = ACL_TYPE_DEFAULT; + needOptionsCheck = true; + size_t boolSize = sizeof(bool); + //! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT? 
+ // Checking llvmir in .llvmir section + bool containsSpirv = true; + errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_CONTAINS_SPIRV, NULL, &containsSpirv, + &boolSize); + if (errorCode != ACL_SUCCESS) { + containsSpirv = false; + } + if (containsSpirv) { + completeStages.push_back(from); + from = ACL_TYPE_SPIRV_BINARY; + } + bool containsSpirText = true; + errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_CONTAINS_SPIR, NULL, + &containsSpirText, &boolSize); + if (errorCode != ACL_SUCCESS) { + containsSpirText = false; + } + if (containsSpirText) { + completeStages.push_back(from); + from = ACL_TYPE_SPIR_BINARY; + } + bool containsLlvmirText = true; + errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_CONTAINS_LLVMIR, NULL, + &containsLlvmirText, &boolSize); + if (errorCode != ACL_SUCCESS) { + containsLlvmirText = false; + } + // Checking compile & link options in .comment section + bool containsOpts = true; + errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_CONTAINS_OPTIONS, NULL, + &containsOpts, &boolSize); + if (errorCode != ACL_SUCCESS) { + containsOpts = false; + } + if (containsLlvmirText && containsOpts) { + completeStages.push_back(from); + from = ACL_TYPE_LLVMIR_BINARY; + } + // Checking HSAIL in .cg section + bool containsHsailText = true; + errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_CONTAINS_HSAIL, NULL, + &containsHsailText, &boolSize); + if (errorCode != ACL_SUCCESS) { + containsHsailText = false; + } + // Checking BRIG sections + bool containsBrig = true; + errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_CONTAINS_BRIG, NULL, &containsBrig, + &boolSize); + if (errorCode != ACL_SUCCESS) { + containsBrig = false; + } + if (containsBrig) { + completeStages.push_back(from); + from = ACL_TYPE_HSAIL_BINARY; + } else if (containsHsailText) { + completeStages.push_back(from); + from = ACL_TYPE_HSAIL_TEXT; + } + // Checking Loader Map symbol from CG section + bool containsLoaderMap 
= true; + errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_CONTAINS_LOADER_MAP, NULL, + &containsLoaderMap, &boolSize); + if (errorCode != ACL_SUCCESS) { + containsLoaderMap = false; + } + if (containsLoaderMap) { + completeStages.push_back(from); + from = ACL_TYPE_CG; + } + // Checking ISA in .text section + bool containsShaderIsa = true; + errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_CONTAINS_ISA, NULL, + &containsShaderIsa, &boolSize); + if (errorCode != ACL_SUCCESS) { + containsShaderIsa = false; + } + if (containsShaderIsa) { + completeStages.push_back(from); + from = ACL_TYPE_ISA; + } + std::string sCurOptions = compileOptions_ + linkOptions_; + amd::option::Options curOptions; + if (!amd::option::parseAllOptions(sCurOptions, curOptions)) { + buildLog_ += curOptions.optionsLog(); + LogError("Parsing compile options failed."); + return ACL_TYPE_DEFAULT; + } + switch (from) { // compile from HSAIL text, no matter prev. stages and options case ACL_TYPE_HSAIL_TEXT: - needOptionsCheck = false; - break; + needOptionsCheck = false; + break; case ACL_TYPE_HSAIL_BINARY: - // do not check options, if LLVMIR is absent or might be absent or options are absent - if (!curOptions.oVariables->BinLLVMIR || !containsLlvmirText || !containsOpts) { - needOptionsCheck = false; - } - break; + // do not check options, if LLVMIR is absent or might be absent or options are absent + if (!curOptions.oVariables->BinLLVMIR || !containsLlvmirText || !containsOpts) { + needOptionsCheck = false; + } + break; case ACL_TYPE_CG: case ACL_TYPE_ISA: - // do not check options, if LLVMIR is absent or might be absent or options are absent - if (!curOptions.oVariables->BinLLVMIR || !containsLlvmirText || !containsOpts) { - needOptionsCheck = false; - } - // do not check options, if BRIG is absent or might be absent or LoaderMap is absent - if (!curOptions.oVariables->BinCG || !containsBrig || !containsLoaderMap) { - needOptionsCheck = false; - } - break; + // do not check 
options, if LLVMIR is absent or might be absent or options are absent + if (!curOptions.oVariables->BinLLVMIR || !containsLlvmirText || !containsOpts) { + needOptionsCheck = false; + } + // do not check options, if BRIG is absent or might be absent or LoaderMap is absent + if (!curOptions.oVariables->BinCG || !containsBrig || !containsLoaderMap) { + needOptionsCheck = false; + } + break; // recompilation might be needed case ACL_TYPE_LLVMIR_BINARY: case ACL_TYPE_DEFAULT: default: - break; - } - return from; + break; + } + return from; } -aclType -HSAILProgram::getNextCompilationStageFromBinary(amd::option::Options* options) { - aclType continueCompileFrom = ACL_TYPE_DEFAULT; - binary_t binary = this->binary(); - // If the binary already exists - if ((binary.first != NULL) && (binary.second > 0)) { - void *mem = const_cast(binary.first); - acl_error errorCode; - binaryElf_ = aclReadFromMem(mem, binary.second, &errorCode); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error: Reading the binary from memory failed.\n"; - return continueCompileFrom; - } - // Calculate the next stage to compile from, based on sections in binaryElf_; - // No any validity checks here - std::vector completeStages; - bool needOptionsCheck = true; - continueCompileFrom = getCompilationStagesFromBinary(completeStages, needOptionsCheck); - // Saving binary in the interface class, - // which also load compile & link options from binary - setBinary(static_cast(mem), binary.second); - if (!options || !needOptionsCheck) { - return continueCompileFrom; - } - bool recompile = false; - //! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT? 
- switch (continueCompileFrom) { +aclType HSAILProgram::getNextCompilationStageFromBinary(amd::option::Options* options) { + aclType continueCompileFrom = ACL_TYPE_DEFAULT; + binary_t binary = this->binary(); + // If the binary already exists + if ((binary.first != NULL) && (binary.second > 0)) { + void* mem = const_cast(binary.first); + acl_error errorCode; + binaryElf_ = aclReadFromMem(mem, binary.second, &errorCode); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error: Reading the binary from memory failed.\n"; + return continueCompileFrom; + } + // Calculate the next stage to compile from, based on sections in binaryElf_; + // No any validity checks here + std::vector completeStages; + bool needOptionsCheck = true; + continueCompileFrom = getCompilationStagesFromBinary(completeStages, needOptionsCheck); + // Saving binary in the interface class, + // which also load compile & link options from binary + setBinary(static_cast(mem), binary.second); + if (!options || !needOptionsCheck) { + return continueCompileFrom; + } + bool recompile = false; + //! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT? 
+ switch (continueCompileFrom) { case ACL_TYPE_HSAIL_BINARY: case ACL_TYPE_CG: case ACL_TYPE_ISA: { - // Compare options loaded from binary with current ones, recompile if differ; - // If compile options are absent in binary, do not compare and recompile - if (compileOptions_.empty()) - break; - const oclBIFSymbolStruct* symbol = findBIF30SymStruct(symOpenclCompilerOptions); - assert(symbol && "symbol not found"); - std::string symName = std::string(symbol->str[bif::PRE]) + std::string(symbol->str[bif::POST]); - size_t symSize = 0; - const void *opts = aclExtractSymbol(dev().hsaCompiler(), - binaryElf_, &symSize, aclCOMMENT, symName.c_str(), &errorCode); - if (errorCode != ACL_SUCCESS) { - recompile = true; - break; - } - std::string sBinOptions = std::string((char*)opts, symSize); - std::string sCurOptions = compileOptions_ + linkOptions_; - amd::option::Options curOptions, binOptions; - if (!amd::option::parseAllOptions(sBinOptions, binOptions)) { - buildLog_ += binOptions.optionsLog(); - LogError("Parsing compile options from binary failed."); - return ACL_TYPE_DEFAULT; - } - if (!amd::option::parseAllOptions(sCurOptions, curOptions)) { - buildLog_ += curOptions.optionsLog(); - LogError("Parsing compile options failed."); - return ACL_TYPE_DEFAULT; - } - if (!curOptions.equals(binOptions)) { - recompile = true; - } + // Compare options loaded from binary with current ones, recompile if differ; + // If compile options are absent in binary, do not compare and recompile + if (compileOptions_.empty()) break; + const oclBIFSymbolStruct* symbol = findBIF30SymStruct(symOpenclCompilerOptions); + assert(symbol && "symbol not found"); + std::string symName = + std::string(symbol->str[bif::PRE]) + std::string(symbol->str[bif::POST]); + size_t symSize = 0; + const void* opts = aclExtractSymbol(dev().hsaCompiler(), binaryElf_, &symSize, aclCOMMENT, + symName.c_str(), &errorCode); + if (errorCode != ACL_SUCCESS) { + recompile = true; break; + } + std::string sBinOptions = 
std::string((char*)opts, symSize); + std::string sCurOptions = compileOptions_ + linkOptions_; + amd::option::Options curOptions, binOptions; + if (!amd::option::parseAllOptions(sBinOptions, binOptions)) { + buildLog_ += binOptions.optionsLog(); + LogError("Parsing compile options from binary failed."); + return ACL_TYPE_DEFAULT; + } + if (!amd::option::parseAllOptions(sCurOptions, curOptions)) { + buildLog_ += curOptions.optionsLog(); + LogError("Parsing compile options failed."); + return ACL_TYPE_DEFAULT; + } + if (!curOptions.equals(binOptions)) { + recompile = true; + } + break; } default: + break; + } + if (recompile) { + while (!completeStages.empty()) { + continueCompileFrom = completeStages.back(); + if (continueCompileFrom == ACL_TYPE_SPIRV_BINARY || + continueCompileFrom == ACL_TYPE_LLVMIR_BINARY || + continueCompileFrom == ACL_TYPE_SPIR_BINARY || + continueCompileFrom == ACL_TYPE_DEFAULT) { break; - } - if (recompile) { - while (!completeStages.empty()) { - continueCompileFrom = completeStages.back(); - if (continueCompileFrom == ACL_TYPE_SPIRV_BINARY || - continueCompileFrom == ACL_TYPE_LLVMIR_BINARY || - continueCompileFrom == ACL_TYPE_SPIR_BINARY || - continueCompileFrom == ACL_TYPE_DEFAULT) { - break; - } - completeStages.pop_back(); - } + } + completeStages.pop_back(); } } - return continueCompileFrom; + } + return continueCompileFrom; } -inline static std::vector -splitSpaceSeparatedString(char *str) -{ +inline static std::vector splitSpaceSeparatedString(char* str) { std::string s(str); std::stringstream ss(s); std::istream_iterator beg(ss), end; @@ -2085,18 +1936,16 @@ splitSpaceSeparatedString(char *str) return vec; } -bool -HSAILProgram::linkImpl(amd::option::Options* options) -{ - acl_error errorCode; - aclType continueCompileFrom = ACL_TYPE_LLVMIR_BINARY; - bool finalize = true; - bool hsaLoad = true; - // If !binaryElf_ then program must have been created using clCreateProgramWithBinary - if (!binaryElf_) { - continueCompileFrom = 
getNextCompilationStageFromBinary(options); - } - switch (continueCompileFrom) { +bool HSAILProgram::linkImpl(amd::option::Options* options) { + acl_error errorCode; + aclType continueCompileFrom = ACL_TYPE_LLVMIR_BINARY; + bool finalize = true; + bool hsaLoad = true; + // If !binaryElf_ then program must have been created using clCreateProgramWithBinary + if (!binaryElf_) { + continueCompileFrom = getNextCompilationStageFromBinary(options); + } + switch (continueCompileFrom) { case ACL_TYPE_SPIRV_BINARY: case ACL_TYPE_SPIR_BINARY: // Compilation from ACL_TYPE_LLVMIR_BINARY to ACL_TYPE_CG in cases: @@ -2111,327 +1960,338 @@ HSAILProgram::linkImpl(amd::option::Options* options) // Compilation from ACL_TYPE_HSAIL_TEXT to ACL_TYPE_CG in cases: // 1. if the program is created with binary and contains only hsail text case ACL_TYPE_HSAIL_TEXT: { - std::string curOptions = options->origOptionStr + hsailOptions(); - errorCode = aclCompile(dev().hsaCompiler(), binaryElf_, - curOptions.c_str(), continueCompileFrom, ACL_TYPE_CG, NULL); - buildLog_ += aclGetCompilerLog(dev().hsaCompiler()); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error: BRIG code generation failed.\n"; - return false; - } - break; + std::string curOptions = options->origOptionStr + hsailOptions(); + errorCode = aclCompile(dev().hsaCompiler(), binaryElf_, curOptions.c_str(), + continueCompileFrom, ACL_TYPE_CG, NULL); + buildLog_ += aclGetCompilerLog(dev().hsaCompiler()); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error: BRIG code generation failed.\n"; + return false; + } + break; } case ACL_TYPE_CG: - break; + break; case ACL_TYPE_ISA: - finalize = false; - break; + finalize = false; + break; default: - buildLog_ += "Error: The binary is incorrect or incomplete. Finalization to ISA couldn't be performed.\n"; - return false; + buildLog_ += + "Error: The binary is incorrect or incomplete. 
Finalization to ISA couldn't be " + "performed.\n"; + return false; + } + if (finalize) { + std::string fin_options(options->origOptionStr + hsailOptions()); + // Append an option so that we can selectively enable a SCOption on CZ + // whenever IOMMUv2 is enabled. + if (dev().settings().svmFineGrainSystem_) { + fin_options.append(" -sc-xnack-iommu"); } - if (finalize) { - std::string fin_options(options->origOptionStr + hsailOptions()); - // Append an option so that we can selectively enable a SCOption on CZ - // whenever IOMMUv2 is enabled. - if (dev().settings().svmFineGrainSystem_) { - fin_options.append(" -sc-xnack-iommu"); - } - errorCode = aclCompile(dev().hsaCompiler(), binaryElf_, - fin_options.c_str(), ACL_TYPE_CG, ACL_TYPE_ISA, NULL); - buildLog_ += aclGetCompilerLog(dev().hsaCompiler()); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error: BRIG finalization to ISA failed.\n"; - return false; - } - } - // ACL_TYPE_CG stage is not performed for offline compilation - hsa_agent_t agent; - agent.handle = 1; - if (hsaLoad) { - executable_ = loader_->CreateExecutable(HSA_PROFILE_FULL, NULL); - if (executable_ == NULL) { - buildLog_ += "Error: Executable for AMD HSA Code Object isn't created.\n"; - return false; - } - size_t size = 0; - hsa_code_object_t code_object; - code_object.handle = reinterpret_cast(aclExtractSection(dev().hsaCompiler(), binaryElf_, &size, aclTEXT, &errorCode)); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error: Extracting AMD HSA Code Object from binary failed.\n"; - return false; - } - hsa_status_t status = executable_->LoadCodeObject(agent, code_object, NULL); - if (status != HSA_STATUS_SUCCESS) { - buildLog_ += "Error: AMD HSA Code Object loading failed.\n"; - return false; - } - } - size_t kernelNamesSize = 0; - errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_KERNEL_NAMES, NULL, NULL, &kernelNamesSize); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error: Querying of kernel names size from the binary 
failed.\n"; - return false; - } - if (kernelNamesSize > 0) { - char* kernelNames = new char[kernelNamesSize]; - errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_KERNEL_NAMES, NULL, kernelNames, &kernelNamesSize); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error: Querying of kernel names from the binary failed.\n"; - delete kernelNames; - return false; - } - std::vector vKernels = splitSpaceSeparatedString(kernelNames); - delete kernelNames; - std::vector::iterator it = vKernels.begin(); - bool dynamicParallelism = false; - aclMetadata md; - md.numHiddenKernelArgs = 0; - size_t sizeOfnumHiddenKernelArgs = sizeof(md.numHiddenKernelArgs); - for (it; it != vKernels.end(); ++it) { - std::string kernelName(*it); - std::string openclKernelName = Kernel::openclMangledName(kernelName); - errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_NUM_KERNEL_HIDDEN_ARGS, - openclKernelName.c_str(), &md.numHiddenKernelArgs, &sizeOfnumHiddenKernelArgs); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error: Querying of kernel '" + openclKernelName + - "' extra arguments count from AMD HSA Code Object failed. Kernel initialization failed.\n"; - return false; - } - HSAILKernel *aKernel = new HSAILKernel(kernelName, this, options->origOptionStr + hsailOptions(), - md.numHiddenKernelArgs); - kernels()[kernelName] = aKernel; - amd::hsa::loader::Symbol *sym = executable_->GetSymbol(openclKernelName.c_str(), &agent); - if (!sym) { - buildLog_ += "Error: Getting kernel ISA code symbol '" + openclKernelName + - "' from AMD HSA Code Object failed. Kernel initialization failed.\n"; - return false; - } - if (!aKernel->init(sym, false)) { - buildLog_ += "Error: Kernel '" + openclKernelName + "' initialization failed.\n"; - return false; - } - buildLog_ += aKernel->buildLog(); - aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize); - dynamicParallelism |= aKernel->dynamicParallelism(); - // Find max scratch regs used in the program. 
It's used for scratch buffer preallocation - // with dynamic parallelism, since runtime doesn't know which child kernel will be called - maxScratchRegs_ = std::max(static_cast(aKernel->workGroupInfo()->scratchRegs_), maxScratchRegs_); - } - // Allocate kernel table for device enqueuing - if (!isNull() && dynamicParallelism && !allocKernelTable()) { - return false; - } - } - // Save the binary in the interface class - saveBinaryAndSetType(TYPE_EXECUTABLE); + errorCode = aclCompile(dev().hsaCompiler(), binaryElf_, fin_options.c_str(), ACL_TYPE_CG, + ACL_TYPE_ISA, NULL); buildLog_ += aclGetCompilerLog(dev().hsaCompiler()); - return true; + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error: BRIG finalization to ISA failed.\n"; + return false; + } + } + // ACL_TYPE_CG stage is not performed for offline compilation + hsa_agent_t agent; + agent.handle = 1; + if (hsaLoad) { + executable_ = loader_->CreateExecutable(HSA_PROFILE_FULL, NULL); + if (executable_ == NULL) { + buildLog_ += "Error: Executable for AMD HSA Code Object isn't created.\n"; + return false; + } + size_t size = 0; + hsa_code_object_t code_object; + code_object.handle = reinterpret_cast( + aclExtractSection(dev().hsaCompiler(), binaryElf_, &size, aclTEXT, &errorCode)); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error: Extracting AMD HSA Code Object from binary failed.\n"; + return false; + } + hsa_status_t status = executable_->LoadCodeObject(agent, code_object, NULL); + if (status != HSA_STATUS_SUCCESS) { + buildLog_ += "Error: AMD HSA Code Object loading failed.\n"; + return false; + } + } + size_t kernelNamesSize = 0; + errorCode = + aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_KERNEL_NAMES, NULL, NULL, &kernelNamesSize); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error: Querying of kernel names size from the binary failed.\n"; + return false; + } + if (kernelNamesSize > 0) { + char* kernelNames = new char[kernelNamesSize]; + errorCode = aclQueryInfo(dev().hsaCompiler(), 
binaryElf_, RT_KERNEL_NAMES, NULL, kernelNames, + &kernelNamesSize); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error: Querying of kernel names from the binary failed.\n"; + delete kernelNames; + return false; + } + std::vector vKernels = splitSpaceSeparatedString(kernelNames); + delete kernelNames; + std::vector::iterator it = vKernels.begin(); + bool dynamicParallelism = false; + aclMetadata md; + md.numHiddenKernelArgs = 0; + size_t sizeOfnumHiddenKernelArgs = sizeof(md.numHiddenKernelArgs); + for (it; it != vKernels.end(); ++it) { + std::string kernelName(*it); + std::string openclKernelName = Kernel::openclMangledName(kernelName); + errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_NUM_KERNEL_HIDDEN_ARGS, + openclKernelName.c_str(), &md.numHiddenKernelArgs, + &sizeOfnumHiddenKernelArgs); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error: Querying of kernel '" + openclKernelName + + "' extra arguments count from AMD HSA Code Object failed. Kernel initialization " + "failed.\n"; + return false; + } + HSAILKernel* aKernel = new HSAILKernel( + kernelName, this, options->origOptionStr + hsailOptions(), md.numHiddenKernelArgs); + kernels()[kernelName] = aKernel; + amd::hsa::loader::Symbol* sym = executable_->GetSymbol(openclKernelName.c_str(), &agent); + if (!sym) { + buildLog_ += "Error: Getting kernel ISA code symbol '" + openclKernelName + + "' from AMD HSA Code Object failed. Kernel initialization failed.\n"; + return false; + } + if (!aKernel->init(sym, false)) { + buildLog_ += "Error: Kernel '" + openclKernelName + "' initialization failed.\n"; + return false; + } + buildLog_ += aKernel->buildLog(); + aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize); + dynamicParallelism |= aKernel->dynamicParallelism(); + // Find max scratch regs used in the program. 
It's used for scratch buffer preallocation + // with dynamic parallelism, since runtime doesn't know which child kernel will be called + maxScratchRegs_ = + std::max(static_cast(aKernel->workGroupInfo()->scratchRegs_), maxScratchRegs_); + } + // Allocate kernel table for device enqueuing + if (!isNull() && dynamicParallelism && !allocKernelTable()) { + return false; + } + } + // Save the binary in the interface class + saveBinaryAndSetType(TYPE_EXECUTABLE); + buildLog_ += aclGetCompilerLog(dev().hsaCompiler()); + return true; } -bool -HSAILProgram::createBinary(amd::option::Options *options) -{ - return true; -} +bool HSAILProgram::createBinary(amd::option::Options* options) { return true; } -bool -HSAILProgram::initClBinary() -{ +bool HSAILProgram::initClBinary() { + if (clBinary_ == NULL) { + clBinary_ = new ClBinaryHsa(static_cast(device())); if (clBinary_ == NULL) { - clBinary_ = new ClBinaryHsa(static_cast(device())); - if (clBinary_ == NULL) { - return false; - } + return false; } - return true; + } + return true; } -void -HSAILProgram::releaseClBinary() -{ - if (clBinary_ != NULL) { - delete clBinary_; - clBinary_ = NULL; - } +void HSAILProgram::releaseClBinary() { + if (clBinary_ != NULL) { + delete clBinary_; + clBinary_ = NULL; + } } -std::string -HSAILProgram::hsailOptions() -{ - std::string hsailOptions; - // Set options for the standard device specific options - // All our devices support these options now - if (dev().settings().reportFMAF_) { - hsailOptions.append(" -DFP_FAST_FMAF=1"); - } - if (dev().settings().reportFMA_) { - hsailOptions.append(" -DFP_FAST_FMA=1"); - } - if (!dev().settings().singleFpDenorm_) { - hsailOptions.append(" -cl-denorms-are-zero"); - } +std::string HSAILProgram::hsailOptions() { + std::string hsailOptions; + // Set options for the standard device specific options + // All our devices support these options now + if (dev().settings().reportFMAF_) { + hsailOptions.append(" -DFP_FAST_FMAF=1"); + } + if 
(dev().settings().reportFMA_) { + hsailOptions.append(" -DFP_FAST_FMA=1"); + } + if (!dev().settings().singleFpDenorm_) { + hsailOptions.append(" -cl-denorms-are-zero"); + } - // Check if the host is 64 bit or 32 bit - LP64_ONLY(hsailOptions.append(" -m64")); + // Check if the host is 64 bit or 32 bit + LP64_ONLY(hsailOptions.append(" -m64")); - // Append each extension supported by the device - std::string token; - std::istringstream iss(""); - iss.str(device().info().extensions_); - while (getline(iss, token, ' ')) { - if (!token.empty()) { - hsailOptions.append(" -D"); - hsailOptions.append(token); - hsailOptions.append("=1"); - } + // Append each extension supported by the device + std::string token; + std::istringstream iss(""); + iss.str(device().info().extensions_); + while (getline(iss, token, ' ')) { + if (!token.empty()) { + hsailOptions.append(" -D"); + hsailOptions.append(token); + hsailOptions.append("=1"); } - return hsailOptions; + } + return hsailOptions; } -bool -HSAILProgram::allocKernelTable() -{ - uint size = kernels().size() * sizeof(size_t); +bool HSAILProgram::allocKernelTable() { + uint size = kernels().size() * sizeof(size_t); - kernels_ = new gpu::Memory(dev(), size); - // Initialize kernel table - if ((kernels_ == NULL) || !kernels_->create(Resource::RemoteUSWC)) { - delete kernels_; - return false; - } - else { - size_t* table = reinterpret_cast( - kernels_->map(NULL, gpu::Resource::WriteOnly)); - for (auto& it : kernels()) { - HSAILKernel* kernel = static_cast(it.second); - table[kernel->index()] = static_cast( - kernel->gpuAqlCode()->vmAddress()); - } - kernels_->unmap(NULL); - } - return true; -} - -void -HSAILProgram::fillResListWithKernels( - std::vector& memList) const -{ + kernels_ = new gpu::Memory(dev(), size); + // Initialize kernel table + if ((kernels_ == NULL) || !kernels_->create(Resource::RemoteUSWC)) { + delete kernels_; + return false; + } else { + size_t* table = reinterpret_cast(kernels_->map(NULL, 
gpu::Resource::WriteOnly)); for (auto& it : kernels()) { - memList.push_back( - static_cast(it.second)->gpuAqlCode()); + HSAILKernel* kernel = static_cast(it.second); + table[kernel->index()] = static_cast(kernel->gpuAqlCode()->vmAddress()); } + kernels_->unmap(NULL); + } + return true; } -const aclTargetInfo & -HSAILProgram::info(const char * str) { - acl_error err; - std::string arch = "hsail"; - if (dev().settings().use64BitPtr_) { - arch = "hsail64"; - } - info_ = aclGetTargetInfo(arch.c_str(), ( str && str[0] == '\0' ? - dev().hwInfo()->targetName_ : str ), &err); - if (err != ACL_SUCCESS) { - LogWarning("aclGetTargetInfo failed"); - } - return info_; +void HSAILProgram::fillResListWithKernels(std::vector& memList) const { + for (auto& it : kernels()) { + memList.push_back(static_cast(it.second)->gpuAqlCode()); + } } -bool -HSAILProgram::saveBinaryAndSetType(type_t type) -{ - //Write binary to memory - if (rawBinary_ != NULL) { - //Free memory containing rawBinary - aclFreeMem(binaryElf_, rawBinary_); - rawBinary_ = NULL; - } - size_t size = 0; - if (aclWriteToMem(binaryElf_, &rawBinary_, &size) != ACL_SUCCESS) { - buildLog_ += "Failed to write binary to memory \n"; - return false; - } - setBinary(static_cast(rawBinary_), size); - //Set the type of binary - setType(type); - return true; +const aclTargetInfo& HSAILProgram::info(const char* str) { + acl_error err; + std::string arch = "hsail"; + if (dev().settings().use64BitPtr_) { + arch = "hsail64"; + } + info_ = aclGetTargetInfo(arch.c_str(), + (str && str[0] == '\0' ? 
dev().hwInfo()->targetName_ : str), &err); + if (err != ACL_SUCCESS) { + LogWarning("aclGetTargetInfo failed"); + } + return info_; } -hsa_isa_t ORCAHSALoaderContext::IsaFromName(const char *name) { - hsa_isa_t isa = {0}; - if (!strcmp(Gfx700, name)) { isa.handle = gfx700; return isa; } - if (!strcmp(Gfx701, name)) { isa.handle = gfx701; return isa; } - if (!strcmp(Gfx800, name)) { isa.handle = gfx800; return isa; } - if (!strcmp(Gfx801, name)) { isa.handle = gfx801; return isa; } - if (!strcmp(Gfx804, name)) { isa.handle = gfx804; return isa; } - if (!strcmp(Gfx810, name)) { isa.handle = gfx810; return isa; } - if (!strcmp(Gfx900, name)) { isa.handle = gfx900; return isa; } - if (!strcmp(Gfx901, name)) { isa.handle = gfx901; return isa; } +bool HSAILProgram::saveBinaryAndSetType(type_t type) { + // Write binary to memory + if (rawBinary_ != NULL) { + // Free memory containing rawBinary + aclFreeMem(binaryElf_, rawBinary_); + rawBinary_ = NULL; + } + size_t size = 0; + if (aclWriteToMem(binaryElf_, &rawBinary_, &size) != ACL_SUCCESS) { + buildLog_ += "Failed to write binary to memory \n"; + return false; + } + setBinary(static_cast(rawBinary_), size); + // Set the type of binary + setType(type); + return true; +} + +hsa_isa_t ORCAHSALoaderContext::IsaFromName(const char* name) { + hsa_isa_t isa = {0}; + if (!strcmp(Gfx700, name)) { + isa.handle = gfx700; return isa; + } + if (!strcmp(Gfx701, name)) { + isa.handle = gfx701; + return isa; + } + if (!strcmp(Gfx800, name)) { + isa.handle = gfx800; + return isa; + } + if (!strcmp(Gfx801, name)) { + isa.handle = gfx801; + return isa; + } + if (!strcmp(Gfx804, name)) { + isa.handle = gfx804; + return isa; + } + if (!strcmp(Gfx810, name)) { + isa.handle = gfx810; + return isa; + } + if (!strcmp(Gfx900, name)) { + isa.handle = gfx900; + return isa; + } + if (!strcmp(Gfx901, name)) { + isa.handle = gfx901; + return isa; + } + return isa; } bool ORCAHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) { - 
switch (program_->dev().hwInfo()->gfxipVersion_) { + switch (program_->dev().hwInfo()->gfxipVersion_) { default: - LogError("Unsupported gfxip version"); - return false; + LogError("Unsupported gfxip version"); + return false; case gfx700: case gfx701: case gfx702: - // gfx701 only differs from gfx700 by faster fp operations and can be loaded on either device. - return isa.handle == gfx700 || isa.handle == gfx701; + // gfx701 only differs from gfx700 by faster fp operations and can be loaded on either device. + return isa.handle == gfx700 || isa.handle == gfx701; case gfx800: - switch (program_->dev().hwInfo()->machine_) { + switch (program_->dev().hwInfo()->machine_) { case ED_ATI_CAL_MACHINE_ICELAND_ISA: case ED_ATI_CAL_MACHINE_TONGA_ISA: - return isa.handle == gfx800; + return isa.handle == gfx800; case ED_ATI_CAL_MACHINE_CARRIZO_ISA: - return isa.handle == gfx801; + return isa.handle == gfx801; case ED_ATI_CAL_MACHINE_FIJI_ISA: case ED_ATI_CAL_MACHINE_ELLESMERE_ISA: case ED_ATI_CAL_MACHINE_BAFFIN_ISA: case ED_ATI_CAL_MACHINE_LEXA_ISA: case ED_ATI_CAL_MACHINE_POLARIS22_ISA: - // gfx800 ISA has only sgrps limited and can be loaded. - // gfx801 ISA has XNACK limitations and can be loaded. - return isa.handle == gfx800 || isa.handle == gfx801 || isa.handle == gfx804; + // gfx800 ISA has only sgrps limited and can be loaded. + // gfx801 ISA has XNACK limitations and can be loaded. 
+ return isa.handle == gfx800 || isa.handle == gfx801 || isa.handle == gfx804; case ED_ATI_CAL_MACHINE_STONEY_ISA: - return isa.handle == gfx810; + return isa.handle == gfx810; default: - assert(0); - return false; - } + assert(0); + return false; + } case gfx900: - switch (program_->dev().hwInfo()->machine_) { + switch (program_->dev().hwInfo()->machine_) { case ED_ATI_CAL_MACHINE_GREENLAND_ISA: - return isa.handle == gfx900 || isa.handle == gfx901; + return isa.handle == gfx900 || isa.handle == gfx901; default: - assert(0); - return false; - } - } + assert(0); + return false; + } + } } -void* ORCAHSALoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment, - hsa_agent_t agent, size_t size, size_t align, bool zero) { - assert(size); - assert(align); - switch (segment) { +void* ORCAHSALoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, + size_t size, size_t align, bool zero) { + assert(size); + assert(align); + switch (segment) { case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM: case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT: case AMDGPU_HSA_SEGMENT_READONLY_AGENT: - return AgentGlobalAlloc(agent, size, align, zero); + return AgentGlobalAlloc(agent, size, align, zero); case AMDGPU_HSA_SEGMENT_CODE_AGENT: - return KernelCodeAlloc(agent, size, align, zero); + return KernelCodeAlloc(agent, size, align, zero); default: - assert(false); return 0; - } + assert(false); + return 0; + } } -bool ORCAHSALoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment, - hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size) { - switch (segment) { +bool ORCAHSALoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, + void* dst, size_t offset, const void* src, size_t size) { + switch (segment) { case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM: case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT: case AMDGPU_HSA_SEGMENT_READONLY_AGENT: @@ -2439,175 +2299,198 @@ bool ORCAHSALoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment, case 
AMDGPU_HSA_SEGMENT_CODE_AGENT: return KernelCodeCopy(dst, offset, src, size); default: - assert(false); return false; - } + assert(false); + return false; + } } -void ORCAHSALoaderContext::SegmentFree(amdgpu_hsa_elf_segment_t segment, - hsa_agent_t agent, void* seg, size_t size) { - switch (segment) { +void ORCAHSALoaderContext::SegmentFree(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, + void* seg, size_t size) { + switch (segment) { case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM: case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT: - case AMDGPU_HSA_SEGMENT_READONLY_AGENT: AgentGlobalFree(seg, size); break; - case AMDGPU_HSA_SEGMENT_CODE_AGENT: KernelCodeFree(seg, size); break; + case AMDGPU_HSA_SEGMENT_READONLY_AGENT: + AgentGlobalFree(seg, size); + break; + case AMDGPU_HSA_SEGMENT_CODE_AGENT: + KernelCodeFree(seg, size); + break; default: - assert(false); return; - } + assert(false); + return; + } } -void* ORCAHSALoaderContext::SegmentAddress(amdgpu_hsa_elf_segment_t segment, - hsa_agent_t agent, void* seg, size_t offset) { - assert(seg); - switch (segment) { +void* ORCAHSALoaderContext::SegmentAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, + void* seg, size_t offset) { + assert(seg); + switch (segment) { case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM: case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT: case AMDGPU_HSA_SEGMENT_READONLY_AGENT: { - if (!program_->isNull()) { - gpu::Memory *gpuMem = reinterpret_cast(seg); - return reinterpret_cast(gpuMem->vmAddress() + offset); - } + if (!program_->isNull()) { + gpu::Memory* gpuMem = reinterpret_cast(seg); + return reinterpret_cast(gpuMem->vmAddress() + offset); + } } - case AMDGPU_HSA_SEGMENT_CODE_AGENT: return (char*) seg + offset; + case AMDGPU_HSA_SEGMENT_CODE_AGENT: + return (char*)seg + offset; default: - assert(false); return NULL; - } + assert(false); + return NULL; + } } hsa_status_t ORCAHSALoaderContext::SamplerCreate( - hsa_agent_t agent, - const hsa_ext_sampler_descriptor_t *sampler_descriptor, - hsa_ext_sampler_t 
*sampler_handle) -{ - if (!agent.handle) { - return HSA_STATUS_ERROR_INVALID_AGENT; - } - if (!sampler_descriptor || !sampler_handle) { - return HSA_STATUS_ERROR_INVALID_ARGUMENT; - } + hsa_agent_t agent, const hsa_ext_sampler_descriptor_t* sampler_descriptor, + hsa_ext_sampler_t* sampler_handle) { + if (!agent.handle) { + return HSA_STATUS_ERROR_INVALID_AGENT; + } + if (!sampler_descriptor || !sampler_handle) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } - if (program_->isNull()) { - // Offline compilation. Provide a fake handle to avoid an assert - sampler_handle->handle = 1; - return HSA_STATUS_SUCCESS; - } - - uint32_t state = 0; - switch (sampler_descriptor->coordinate_mode) { - case HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED: state = amd::Sampler::StateNormalizedCoordsFalse; break; - case HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED: state = amd::Sampler::StateNormalizedCoordsTrue; break; - default: - assert(false); - return HSA_STATUS_ERROR_INVALID_ARGUMENT; - } - switch (sampler_descriptor->filter_mode) { - case HSA_EXT_SAMPLER_FILTER_MODE_NEAREST: state |= amd::Sampler::StateFilterNearest; break; - case HSA_EXT_SAMPLER_FILTER_MODE_LINEAR: state |= amd::Sampler::StateFilterLinear; break; - default: - assert(false); - return HSA_STATUS_ERROR_INVALID_ARGUMENT; - - } - switch (sampler_descriptor->address_mode) { - case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE: state |= amd::Sampler::StateAddressClampToEdge; break; - case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER: state |= amd::Sampler::StateAddressClamp; break; - case HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT: state |= amd::Sampler::StateAddressRepeat; break; - case HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT: state |= amd::Sampler::StateAddressMirroredRepeat; break; - case HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED: state |= amd::Sampler::StateAddressNone; break; - default: - assert(false); - return HSA_STATUS_ERROR_INVALID_ARGUMENT; - } - gpu::Sampler* sampler = new 
gpu::Sampler(program_->dev()); - if (!sampler || !sampler->create(state)) { - delete sampler; - return HSA_STATUS_ERROR; - } - program_->addSampler(sampler); - sampler_handle->handle = sampler->hwSrd(); + if (program_->isNull()) { + // Offline compilation. Provide a fake handle to avoid an assert + sampler_handle->handle = 1; return HSA_STATUS_SUCCESS; + } + + uint32_t state = 0; + switch (sampler_descriptor->coordinate_mode) { + case HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED: + state = amd::Sampler::StateNormalizedCoordsFalse; + break; + case HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED: + state = amd::Sampler::StateNormalizedCoordsTrue; + break; + default: + assert(false); + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + switch (sampler_descriptor->filter_mode) { + case HSA_EXT_SAMPLER_FILTER_MODE_NEAREST: + state |= amd::Sampler::StateFilterNearest; + break; + case HSA_EXT_SAMPLER_FILTER_MODE_LINEAR: + state |= amd::Sampler::StateFilterLinear; + break; + default: + assert(false); + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + switch (sampler_descriptor->address_mode) { + case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE: + state |= amd::Sampler::StateAddressClampToEdge; + break; + case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER: + state |= amd::Sampler::StateAddressClamp; + break; + case HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT: + state |= amd::Sampler::StateAddressRepeat; + break; + case HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT: + state |= amd::Sampler::StateAddressMirroredRepeat; + break; + case HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED: + state |= amd::Sampler::StateAddressNone; + break; + default: + assert(false); + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + gpu::Sampler* sampler = new gpu::Sampler(program_->dev()); + if (!sampler || !sampler->create(state)) { + delete sampler; + return HSA_STATUS_ERROR; + } + program_->addSampler(sampler); + sampler_handle->handle = sampler->hwSrd(); + return HSA_STATUS_SUCCESS; } -hsa_status_t 
ORCAHSALoaderContext::SamplerDestroy( - hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) { - if (!agent.handle) { - return HSA_STATUS_ERROR_INVALID_AGENT; - } - if (!sampler_handle.handle) { - return HSA_STATUS_ERROR_INVALID_ARGUMENT; - } - return HSA_STATUS_SUCCESS; +hsa_status_t ORCAHSALoaderContext::SamplerDestroy(hsa_agent_t agent, + hsa_ext_sampler_t sampler_handle) { + if (!agent.handle) { + return HSA_STATUS_ERROR_INVALID_AGENT; + } + if (!sampler_handle.handle) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + return HSA_STATUS_SUCCESS; } void* ORCAHSALoaderContext::CpuMemAlloc(size_t size, size_t align, bool zero) { - assert(size); - assert(align); - assert(sizeof(void*) == 8 || sizeof(void*) == 4); - void* ptr = amd::Os::alignedMalloc(size, align); - if (zero) { - memset(ptr, 0, size); - } - return ptr; + assert(size); + assert(align); + assert(sizeof(void*) == 8 || sizeof(void*) == 4); + void* ptr = amd::Os::alignedMalloc(size, align); + if (zero) { + memset(ptr, 0, size); + } + return ptr; } -bool ORCAHSALoaderContext::CpuMemCopy(void *dst, size_t offset, const void* src, size_t size) { +bool ORCAHSALoaderContext::CpuMemCopy(void* dst, size_t offset, const void* src, size_t size) { if (!dst || !src || dst == src) { - return false; + return false; } if (0 == size) { - return true; + return true; } amd::Os::fastMemcpy((char*)dst + offset, src, size); return true; } void* ORCAHSALoaderContext::GpuMemAlloc(size_t size, size_t align, bool zero) { - assert(size); - assert(align); - assert(sizeof(void*) == 8 || sizeof(void*) == 4); - if (program_->isNull()) { - return new char [size]; - } + assert(size); + assert(align); + assert(sizeof(void*) == 8 || sizeof(void*) == 4); + if (program_->isNull()) { + return new char[size]; + } - gpu::Memory* mem = new gpu::Memory(program_->dev(), amd::alignUp(size, align)); - if (!mem || !mem->create(gpu::Resource::Local)) { - delete mem; - return NULL; - } - assert(program_->dev().xferQueue()); - if (zero) { - char 
pattern = 0; - program_->dev().xferMgr().fillBuffer(*mem, &pattern, sizeof(pattern), amd::Coord3D(0), amd::Coord3D(size)); - } - program_->addGlobalStore(mem); - program_->setGlobalVariableTotalSize(program_->globalVariableTotalSize() + size); - return mem; + gpu::Memory* mem = new gpu::Memory(program_->dev(), amd::alignUp(size, align)); + if (!mem || !mem->create(gpu::Resource::Local)) { + delete mem; + return NULL; + } + assert(program_->dev().xferQueue()); + if (zero) { + char pattern = 0; + program_->dev().xferMgr().fillBuffer(*mem, &pattern, sizeof(pattern), amd::Coord3D(0), + amd::Coord3D(size)); + } + program_->addGlobalStore(mem); + program_->setGlobalVariableTotalSize(program_->globalVariableTotalSize() + size); + return mem; } -bool ORCAHSALoaderContext::GpuMemCopy(void *dst, size_t offset, const void *src, size_t size) { - if (!dst || !src || dst == src) { - return false; - } - if (0 == size) { - return true; - } - if (program_->isNull()) { - memcpy(reinterpret_cast
(dst) + offset, src, size); - return true; - } - assert(program_->dev().xferQueue()); - gpu::Memory* mem = reinterpret_cast(dst); - return program_->dev().xferMgr().writeBuffer(src, *mem, amd::Coord3D(offset), amd::Coord3D(size), true); +bool ORCAHSALoaderContext::GpuMemCopy(void* dst, size_t offset, const void* src, size_t size) { + if (!dst || !src || dst == src) { + return false; + } + if (0 == size) { return true; + } + if (program_->isNull()) { + memcpy(reinterpret_cast
(dst) + offset, src, size); + return true; + } + assert(program_->dev().xferQueue()); + gpu::Memory* mem = reinterpret_cast(dst); + return program_->dev().xferMgr().writeBuffer(src, *mem, amd::Coord3D(offset), amd::Coord3D(size), + true); + return true; } -void ORCAHSALoaderContext::GpuMemFree(void *ptr, size_t size) -{ - if (program_->isNull()) { - delete [] reinterpret_cast(ptr); - } - else { - delete reinterpret_cast(ptr); - } +void ORCAHSALoaderContext::GpuMemFree(void* ptr, size_t size) { + if (program_->isNull()) { + delete[] reinterpret_cast(ptr); + } else { + delete reinterpret_cast(ptr); + } } -} // namespace gpu +} // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpuprogram.hpp b/rocclr/runtime/device/gpu/gpuprogram.hpp index a1e2a185a4..1e4f10f5f2 100644 --- a/rocclr/runtime/device/gpu/gpuprogram.hpp +++ b/rocclr/runtime/device/gpu/gpuprogram.hpp @@ -12,15 +12,15 @@ namespace amd { namespace option { class Options; -} // option +} // option namespace hsa { namespace loader { class Loader; class Executable; class Context; -} // loader -} // hsa -} // amd +} // loader +} // hsa +} // amd //! \namespace gpu GPU Device Implementation namespace gpu { @@ -30,597 +30,546 @@ namespace gpu { */ //! \struct ILFunc for the opencl program processing -struct ILFunc : public amd::HeapObject -{ -public: - //! \struct CodeRange for the code ranges - struct SourceRange : public amd::EmbeddedObject - { - size_t begin_; //!< start code position - size_t end_; //!< end code position - }; +struct ILFunc : public amd::HeapObject { + public: + //! \struct CodeRange for the code ranges + struct SourceRange : public amd::EmbeddedObject { + size_t begin_; //!< start code position + size_t end_; //!< end code position + }; - //! \enum IL function state - enum State - { - Unknown = 0x00000000, //! unknown function - Regular = 0x00000001, //! regular function from the program - Kernel = 0x00000002 //! kernel function from the program - }; + //! 
\enum IL function state + enum State { + Unknown = 0x00000000, //! unknown function + Regular = 0x00000001, //! regular function from the program + Kernel = 0x00000002 //! kernel function from the program + }; - //! Default constructor - ILFunc() - : name_("") - , index_(0) - , state_(Unknown) - , privateSize_(0) - , localSize_(0) - , hwPrivateSize_(0) - , hwLocalSize_(0) - , flags_(0) - , totalHwPrivateSize_(-1) - { - code_.begin_ = code_.end_ = 0; - metadata_.begin_ = metadata_.end_ = 0; - } + //! Default constructor + ILFunc() + : name_(""), + index_(0), + state_(Unknown), + privateSize_(0), + localSize_(0), + hwPrivateSize_(0), + hwLocalSize_(0), + flags_(0), + totalHwPrivateSize_(-1) { + code_.begin_ = code_.end_ = 0; + metadata_.begin_ = metadata_.end_ = 0; + } - //! Copy constructor - ILFunc(const ILFunc& func) { *this = func; } + //! Copy constructor + ILFunc(const ILFunc& func) { *this = func; } - //! Destructor - ~ILFunc() {} + //! Destructor + ~ILFunc() {} - //! Overloads operator= - ILFunc& operator=(const ILFunc& func) - { - name_ = func.name_; - index_ = func.index_; - code_ = func.code_; - metadata_ = func.metadata_; - state_ = func.state_; - privateSize_ = func.privateSize_; - localSize_ = func.localSize_; - hwPrivateSize_ = func.hwPrivateSize_; - hwLocalSize_ = func.hwLocalSize_; - flags_ = func.flags_; - totalHwPrivateSize_ = func.totalHwPrivateSize_; + //! 
Overloads operator= + ILFunc& operator=(const ILFunc& func) { + name_ = func.name_; + index_ = func.index_; + code_ = func.code_; + metadata_ = func.metadata_; + state_ = func.state_; + privateSize_ = func.privateSize_; + localSize_ = func.localSize_; + hwPrivateSize_ = func.hwPrivateSize_; + hwLocalSize_ = func.hwLocalSize_; + flags_ = func.flags_; + totalHwPrivateSize_ = func.totalHwPrivateSize_; - // Note: we don't copy calls_ and macros_ - return *this; - } + // Note: we don't copy calls_ and macros_ + return *this; + } - std::string name_; //!< kernel's name - uint index_; //!< kernel's index - SourceRange code_; //!< the entire function range in the source - SourceRange metadata_; //!< the metadata range - State state_; //!< the function is real, and not intrinsic - uint privateSize_; //!< private ring allocation by the function - uint localSize_; //!< local ring allocation by the function - uint hwPrivateSize_; //!< HW private ring allocation by the function - uint hwLocalSize_; //!< HW local ring allocation by the function - uint flags_; //!< The IL func flags/properties - long long totalHwPrivateSize_; //!< total HW private usage including called functions - std::vector calls_; //! Functions called from the current - std::vector macros_; //! Macros, used in the IL function + std::string name_; //!< kernel's name + uint index_; //!< kernel's index + SourceRange code_; //!< the entire function range in the source + SourceRange metadata_; //!< the metadata range + State state_; //!< the function is real, and not intrinsic + uint privateSize_; //!< private ring allocation by the function + uint localSize_; //!< local ring allocation by the function + uint hwPrivateSize_; //!< HW private ring allocation by the function + uint hwLocalSize_; //!< HW local ring allocation by the function + uint flags_; //!< The IL func flags/properties + long long totalHwPrivateSize_; //!< total HW private usage including called functions + std::vector calls_; //! 
Functions called from the current + std::vector macros_; //! Macros, used in the IL function - uint totalHwPrivateUsage(); //!< total HW private usage including called functions + uint totalHwPrivateUsage(); //!< total HW private usage including called functions }; //! \class empty program -class NullProgram : public device::Program -{ -friend class ClBinary; -public: - //! Default constructor - NullProgram(NullDevice& nullDev) : device::Program(nullDev) , patch_(0) {} +class NullProgram : public device::Program { + friend class ClBinary; - //! Default destructor - ~NullProgram(); + public: + //! Default constructor + NullProgram(NullDevice& nullDev) : device::Program(nullDev), patch_(0) {} - // Initialize Binary for GPU - virtual bool initClBinary(); - // Release Binary for GPU - virtual void releaseClBinary(); + //! Default destructor + ~NullProgram(); - //! Returns global constant buffers - const std::vector& glbCb() const { return glbCb_; } + // Initialize Binary for GPU + virtual bool initClBinary(); + // Release Binary for GPU + virtual void releaseClBinary(); -protected: - //! pre-compile setup for GPU - virtual bool initBuild(amd::option::Options* options); + //! Returns global constant buffers + const std::vector& glbCb() const { return glbCb_; } - //! post-compile setup for GPU - virtual bool finiBuild(bool isBuildGood); + protected: + //! pre-compile setup for GPU + virtual bool initBuild(amd::option::Options* options); - /*! \brief Compiles GPU CL program to LLVM binary (compiler frontend) - * - * \return True if we successefully compiled a GPU program - */ - virtual bool compileImpl( - const std::string& sourceCode, //!< the program's source code - const std::vector& headers, //!< header souce codes - const char** headerIncludeNames,//!< include names of headers - amd::option::Options* options //!< compile options's object - ); + //! post-compile setup for GPU + virtual bool finiBuild(bool isBuildGood); - /*! 
\brief Compiles LLVM binary to IL code (compiler backend: link+opt+codegen) - * - * \return The build error code - */ - int compileBinaryToIL( - amd::option::Options* options //!< options for compilation - ); + /*! \brief Compiles GPU CL program to LLVM binary (compiler frontend) + * + * \return True if we successefully compiled a GPU program + */ + virtual bool compileImpl(const std::string& sourceCode, //!< the program's source code + const std::vector& headers, //!< header souce codes + const char** headerIncludeNames, //!< include names of headers + amd::option::Options* options //!< compile options's object + ); - /*! \brief Links the compiled IL program with HW - * - * \return True if we successefully linked a GPU program - */ - virtual bool linkImpl( - amd::option::Options* options = NULL //!< options object - ); - virtual bool linkImpl( - const std::vector& inputPrograms, - amd::option::Options* options = NULL, //!< options object - bool createLibrary = false - ); + /*! \brief Compiles LLVM binary to IL code (compiler backend: link+opt+codegen) + * + * \return The build error code + */ + int compileBinaryToIL(amd::option::Options* options //!< options for compilation + ); - virtual bool createBinary(amd::option::Options* options); + /*! \brief Links the compiled IL program with HW + * + * \return True if we successefully linked a GPU program + */ + virtual bool linkImpl(amd::option::Options* options = NULL //!< options object + ); + virtual bool linkImpl(const std::vector& inputPrograms, + amd::option::Options* options = NULL, //!< options object + bool createLibrary = false); + + virtual bool createBinary(amd::option::Options* options); - /*! \brief Parses the GPU program and finds all available kernels - * - * \return True if we successefully parsed the GPU program - */ - bool parseKernels( - const std::string& source //! the program's source code - ); + /*! 
\brief Parses the GPU program and finds all available kernels + * + * \return True if we successefully parsed the GPU program + */ + bool parseKernels(const std::string& source //! the program's source code + ); - /*! \brief Parse all functions in the program - * - * \return True if we successefully parsed all functions - */ - bool parseAllILFuncs( - const std::string& source //! the program's source code - ); + /*! \brief Parse all functions in the program + * + * \return True if we successefully parsed all functions + */ + bool parseAllILFuncs(const std::string& source //! the program's source code + ); - /*! \brief Parse a function's metadata given as source[posBegin:posEnd-1] - * - * \return True if we successefully parsed the given metadata - */ - bool parseFuncMetadata( - const std::string& source, //! string that contains metadata - size_t posBegin, //! begin of metadata in 'source' - size_t posEnd //! end of metadata in 'source' - ); + /*! \brief Parse a function's metadata given as source[posBegin:posEnd-1] + * + * \return True if we successefully parsed the given metadata + */ + bool parseFuncMetadata(const std::string& source, //! string that contains metadata + size_t posBegin, //! begin of metadata in 'source' + size_t posEnd //! end of metadata in 'source' + ); - /*! \brief Finds functions with the given start and end string in the - * program - * - * \return True if we successefully found all functions - */ - bool findILFuncs( - const std::string& source, //! the program's source code - const std::string& func_start, //! the start string of a function - const std::string& func_end, //! the end string of a function - size_t& lastFuncPos //! pos to the end of the last func in 'source' - ); + /*! \brief Finds functions with the given start and end string in the + * program + * + * \return True if we successefully found all functions + */ + bool findILFuncs(const std::string& source, //! the program's source code + const std::string& func_start, //! 
the start string of a function + const std::string& func_end, //! the end string of a function + size_t& lastFuncPos //! pos to the end of the last func in 'source' + ); - /*! \brief Finds all functions in the program - * - * \return True if we successefully found all functions - */ - bool findAllILFuncs( - const std::string& source, //! the program's source code - size_t& lastFuncPos //! pos to the end of the last func in 'source' - ); + /*! \brief Finds all functions in the program + * + * \return True if we successefully found all functions + */ + bool findAllILFuncs(const std::string& source, //! the program's source code + size_t& lastFuncPos //! pos to the end of the last func in 'source' + ); - /*! \brief Finds function, corresponded to the provided unique index - * - * \return Pointer to the ILFunc structure - */ - ILFunc* findILFunc( - uint index //! the function unique index - ); + /*! \brief Finds function, corresponded to the provided unique index + * + * \return Pointer to the ILFunc structure + */ + ILFunc* findILFunc(uint index //! the function unique index + ); - //! Destroys all objects, associated with the IL functions - void freeAllILFuncs(); + //! Destroys all objects, associated with the IL functions + void freeAllILFuncs(); - /*! \brief Finds if a provided function is called from the base function - * - * \return True if a function is used from the base one - */ - bool isCalled( - const ILFunc* base, //!< The base function - const ILFunc* func //!< Function to check for usage - ); + /*! \brief Finds if a provided function is called from the base function + * + * \return True if a function is used from the base one + */ + bool isCalled(const ILFunc* base, //!< The base function + const ILFunc* func //!< Function to check for usage + ); - //! Patches the "main" function with the call to the current kernel - void patchMain( - std::string& kernel, //! The current kernel's code for compilation - uint index //! 
Index of the current kernel in the program - ); + //! Patches the "main" function with the call to the current kernel + void patchMain(std::string& kernel, //! The current kernel's code for compilation + uint index //! Index of the current kernel in the program + ); - //! Adds the IL function object into the list of functions - void addFunc(ILFunc* func) { funcs_.push_back(func); } + //! Adds the IL function object into the list of functions + void addFunc(ILFunc* func) { funcs_.push_back(func); } - //! Empty implementation, since we don't have real HW - virtual bool allocGlobalData( - const void* globalData, //!< Pointer to the global data - size_t dataSize, //!< The global data size - uint index //!< Index for the global data store (0 - global heap) - ) { glbCb_.push_back(index); return true; } + //! Empty implementation, since we don't have real HW + virtual bool allocGlobalData(const void* globalData, //!< Pointer to the global data + size_t dataSize, //!< The global data size + uint index //!< Index for the global data store (0 - global heap) + ) { + glbCb_.push_back(index); + return true; + } - //! Load binary for offline device. - virtual bool loadBinary(bool *hasRecompiled); + //! Load binary for offline device. + virtual bool loadBinary(bool* hasRecompiled); - //! Create NullKernel for compiling to isa. - virtual NullKernel* createKernel( - const std::string& name, //!< The kernel's name - const Kernel::InitData* initData, //!< Initialization data - const std::string& code, //!< IL source code - const std::string& metadata, //!< the kernel metadata structure - bool* created, //!< True if the object was created - const void* binaryCode = NULL, //!< binary machine code for CAL - size_t binarySize = 0 //!< the machine code size - ); + //! Create NullKernel for compiling to isa. 
+ virtual NullKernel* createKernel(const std::string& name, //!< The kernel's name + const Kernel::InitData* initData, //!< Initialization data + const std::string& code, //!< IL source code + const std::string& metadata, //!< the kernel metadata structure + bool* created, //!< True if the object was created + const void* binaryCode = NULL, //!< binary machine code for CAL + size_t binarySize = 0 //!< the machine code size + ); - ClBinary* clBinary() { - return static_cast(device::Program::clBinary()); - } - const ClBinary* clBinary() const { - return static_cast(device::Program::clBinary()); - } + ClBinary* clBinary() { return static_cast(device::Program::clBinary()); } + const ClBinary* clBinary() const { + return static_cast(device::Program::clBinary()); + } - /*! Get all per-kernel IL from programIL, where programIL is the IL for the - * whole compilation unit. - */ - bool getAllKernelILs(std::map& allKernelILs, - std::string& programIL, const char* ilKernelName); + /*! Get all per-kernel IL from programIL, where programIL is the IL for the + * whole compilation unit. + */ + bool getAllKernelILs(std::map& allKernelILs, std::string& programIL, + const char* ilKernelName); -protected: - std::vector printf_; //!< Format strings for GPU printf support - std::vector glbCb_; //!< Global constant buffers + protected: + std::vector printf_; //!< Format strings for GPU printf support + std::vector glbCb_; //!< Global constant buffers - virtual bool isElf(const char* bin) const { - return amd::isElfMagic(bin); - } + virtual bool isElf(const char* bin) const { return amd::isElfMagic(bin); } - virtual const aclTargetInfo & info(const char * str = ""); + virtual const aclTargetInfo& info(const char* str = ""); -private: - //! Disable default copy constructor - NullProgram(const NullProgram&); + private: + //! Disable default copy constructor + NullProgram(const NullProgram&); - //! Disable operator= - NullProgram& operator=(const NullProgram&); + //! 
Disable operator= + NullProgram& operator=(const NullProgram&); - //! Initializes the global data store - bool initGlobalData( - const std::string& source, //!< the program's source code - size_t start //!< start position for the global data search - ); + //! Initializes the global data store + bool initGlobalData(const std::string& source, //!< the program's source code + size_t start //!< start position for the global data search + ); - //! Return a typecasted GPU device - gpu::NullDevice& dev() - { return const_cast( - static_cast(device())); } + //! Return a typecasted GPU device + gpu::NullDevice& dev() { + return const_cast(static_cast(device())); + } - size_t patch_; //!< Patch call position in the source code. - std::vector funcs_; //!< list of all functions. + size_t patch_; //!< Patch call position in the source code. + std::vector funcs_; //!< list of all functions. - std::string ilProgram_; //!< IL program after compilation + std::string ilProgram_; //!< IL program after compilation }; //! \class GPU program -class Program : public NullProgram -{ -public: - //! GPU program constructor - Program(Device& gpuDev) - : NullProgram(gpuDev) - , glbData_(NULL) - {} +class Program : public NullProgram { + public: + //! GPU program constructor + Program(Device& gpuDev) : NullProgram(gpuDev), glbData_(NULL) {} - //! GPU program destructor - ~Program(); + //! GPU program destructor + ~Program(); - //! Get the global data store for this program - gpu::Memory* glbData() const { return glbData_; } + //! Get the global data store for this program + gpu::Memory* glbData() const { return glbData_; } - //! Returns TRUE if we successfully allocated the global data store - //! in video memory - bool allocGlobalData( - const void* globalData, //!< Pointer to the global data - size_t dataSize, //!< The global data size - uint index //!< Index for the global data store (0 - global heap) - ); + //! Returns TRUE if we successfully allocated the global data store + //! 
in video memory + bool allocGlobalData(const void* globalData, //!< Pointer to the global data + size_t dataSize, //!< The global data size + uint index //!< Index for the global data store (0 - global heap) + ); - //! Returns TRUE if we could - virtual bool loadBinary(bool* hasRecompiled); + //! Returns TRUE if we could + virtual bool loadBinary(bool* hasRecompiled); - //! Creates the GPU kernel (return base type) - virtual NullKernel* createKernel( - const std::string& name, //!< The kernel's name - const Kernel::InitData* initData, //!< Initialization data - const std::string& code, //!< IL source code - const std::string& metadata, //!< the kernel metadata structure - bool* created, //!< True if the object was created - const void* binaryCode = NULL, //!< binary machine code for CAL - size_t binarySize = 0 //!< the machine code size - ); + //! Creates the GPU kernel (return base type) + virtual NullKernel* createKernel(const std::string& name, //!< The kernel's name + const Kernel::InitData* initData, //!< Initialization data + const std::string& code, //!< IL source code + const std::string& metadata, //!< the kernel metadata structure + bool* created, //!< True if the object was created + const void* binaryCode = NULL, //!< binary machine code for CAL + size_t binarySize = 0 //!< the machine code size + ); - typedef std::map HwConstBuffers; + typedef std::map HwConstBuffers; - //! Global HW constant buffers - const HwConstBuffers& glbHwCb() const { return constBufs_; } + //! Global HW constant buffers + const HwConstBuffers& glbHwCb() const { return constBufs_; } - //! Returns pritnf info array - const std::vector& printfInfo() const { return printf_; } + //! Returns pritnf info array + const std::vector& printfInfo() const { return printf_; } - //! Return a typecasted GPU device - gpu::Device& dev() - { return const_cast( - static_cast(device())); } + //! 
Return a typecasted GPU device + gpu::Device& dev() { return const_cast(static_cast(device())); } -protected: + protected: + private: + //! Disable copy constructor + Program(const Program&); + //! Disable operator= + Program& operator=(const Program&); -private: - //! Disable copy constructor - Program(const Program&); - - //! Disable operator= - Program& operator=(const Program&); - - HwConstBuffers constBufs_; //!< Constant buffers for the global store - gpu::Memory* glbData_; //!< Global data store + HwConstBuffers constBufs_; //!< Constant buffers for the global store + gpu::Memory* glbData_; //!< Global data store }; using namespace amd::hsa::loader; class HSAILProgram; -class ORCAHSALoaderContext final: public Context { -public: - ORCAHSALoaderContext(HSAILProgram* program): program_(program) {} +class ORCAHSALoaderContext final : public Context { + public: + ORCAHSALoaderContext(HSAILProgram* program) : program_(program) {} - virtual ~ORCAHSALoaderContext() {} + virtual ~ORCAHSALoaderContext() {} - hsa_isa_t IsaFromName(const char *name) override; + hsa_isa_t IsaFromName(const char* name) override; - bool IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) override; + bool IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) override; - void* SegmentAlloc(amdgpu_hsa_elf_segment_t segment, - hsa_agent_t agent, size_t size, size_t align, bool zero) override; + void* SegmentAlloc(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, size_t size, size_t align, + bool zero) override; - bool SegmentCopy(amdgpu_hsa_elf_segment_t segment, - hsa_agent_t agent, void* dst, size_t offset, - const void* src, size_t size) override; + bool SegmentCopy(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* dst, size_t offset, + const void* src, size_t size) override; - void SegmentFree(amdgpu_hsa_elf_segment_t segment, - hsa_agent_t agent, void* seg, size_t size = 0) override; + void SegmentFree(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, + size_t 
size = 0) override; - void* SegmentAddress(amdgpu_hsa_elf_segment_t segment, - hsa_agent_t agent, void* seg, size_t offset) override; + void* SegmentAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, + size_t offset) override; - void* SegmentHostAddress(amdgpu_hsa_elf_segment_t segment, - hsa_agent_t agent, void* seg, size_t offset) override { - return nullptr; - } + void* SegmentHostAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, + size_t offset) override { + return nullptr; + } - bool SegmentFreeze(amdgpu_hsa_elf_segment_t segment, - hsa_agent_t agent, void* seg, size_t size) override { return false; } + bool SegmentFreeze(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, + size_t size) override { + return false; + } - bool ImageExtensionSupported() override { return false; } + bool ImageExtensionSupported() override { return false; } - hsa_status_t ImageCreate( - hsa_agent_t agent, - hsa_access_permission_t image_permission, - const hsa_ext_image_descriptor_t *image_descriptor, - const void *image_data, - hsa_ext_image_t *image_handle) override { - // not supported - assert(false); - return HSA_STATUS_ERROR; - } + hsa_status_t ImageCreate(hsa_agent_t agent, hsa_access_permission_t image_permission, + const hsa_ext_image_descriptor_t* image_descriptor, + const void* image_data, hsa_ext_image_t* image_handle) override { + // not supported + assert(false); + return HSA_STATUS_ERROR; + } - hsa_status_t ImageDestroy( - hsa_agent_t agent, hsa_ext_image_t image_handle) override { - // not supported - assert(false); - return HSA_STATUS_ERROR; - } + hsa_status_t ImageDestroy(hsa_agent_t agent, hsa_ext_image_t image_handle) override { + // not supported + assert(false); + return HSA_STATUS_ERROR; + } - hsa_status_t SamplerCreate( - hsa_agent_t agent, - const hsa_ext_sampler_descriptor_t *sampler_descriptor, - hsa_ext_sampler_t *sampler_handle) override; + hsa_status_t SamplerCreate(hsa_agent_t agent, + const 
hsa_ext_sampler_descriptor_t* sampler_descriptor, + hsa_ext_sampler_t* sampler_handle) override; - //! All samplers are owned by HSAILProgram and are deleted in its destructor. - hsa_status_t SamplerDestroy( - hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) override; + //! All samplers are owned by HSAILProgram and are deleted in its destructor. + hsa_status_t SamplerDestroy(hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) override; -private: + private: + void* AgentGlobalAlloc(hsa_agent_t agent, size_t size, size_t align, bool zero) { + return GpuMemAlloc(size, align, zero); + } - void* AgentGlobalAlloc( - hsa_agent_t agent, size_t size, size_t align, bool zero) { - return GpuMemAlloc(size, align, zero); - } + bool AgentGlobalCopy(void* dst, size_t offset, const void* src, size_t size) { + return GpuMemCopy(dst, offset, src, size); + } - bool AgentGlobalCopy(void *dst, size_t offset, const void *src, size_t size) { - return GpuMemCopy(dst, offset, src, size); - } + void AgentGlobalFree(void* ptr, size_t size) { GpuMemFree(ptr, size); } - void AgentGlobalFree(void *ptr, size_t size) { - GpuMemFree(ptr, size); - } + void* KernelCodeAlloc(hsa_agent_t agent, size_t size, size_t align, bool zero) { + return CpuMemAlloc(size, align, zero); + } - void* KernelCodeAlloc( - hsa_agent_t agent, size_t size, size_t align, bool zero) { - return CpuMemAlloc(size, align, zero); - } + bool KernelCodeCopy(void* dst, size_t offset, const void* src, size_t size) { + return CpuMemCopy(dst, offset, src, size); + } - bool KernelCodeCopy(void *dst, size_t offset, const void *src, size_t size) { - return CpuMemCopy(dst, offset, src, size); - } + void KernelCodeFree(void* ptr, size_t size) { CpuMemFree(ptr, size); } - void KernelCodeFree(void *ptr, size_t size) { - CpuMemFree(ptr, size); - } + void* CpuMemAlloc(size_t size, size_t align, bool zero); - void* CpuMemAlloc(size_t size, size_t align, bool zero); + bool CpuMemCopy(void* dst, size_t offset, const void* src, size_t size); 
- bool CpuMemCopy(void *dst, size_t offset, const void* src, size_t size); + void CpuMemFree(void* ptr, size_t size) { amd::Os::alignedFree(ptr); } - void CpuMemFree(void *ptr, size_t size) { - amd::Os::alignedFree(ptr); - } + void* GpuMemAlloc(size_t size, size_t align, bool zero); - void* GpuMemAlloc(size_t size, size_t align, bool zero); + bool GpuMemCopy(void* dst, size_t offset, const void* src, size_t size); - bool GpuMemCopy(void *dst, size_t offset, const void *src, size_t size); + void GpuMemFree(void* ptr, size_t size = 0); - void GpuMemFree(void *ptr, size_t size = 0); + ORCAHSALoaderContext(const ORCAHSALoaderContext& c); - ORCAHSALoaderContext(const ORCAHSALoaderContext &c); + ORCAHSALoaderContext& operator=(const ORCAHSALoaderContext& c); - ORCAHSALoaderContext& operator=(const ORCAHSALoaderContext &c); - - gpu::HSAILProgram* program_; + gpu::HSAILProgram* program_; }; //! \class HSAIL program -class HSAILProgram : public device::Program -{ - friend class ClBinary; -public: - //! Default constructor - HSAILProgram(Device& device); - HSAILProgram(NullDevice& device); - //! Default destructor - ~HSAILProgram(); +class HSAILProgram : public device::Program { + friend class ClBinary; - //! Returns the aclBinary associated with the progrm - aclBinary* binaryElf() const { - return static_cast(binaryElf_); } + public: + //! Default constructor + HSAILProgram(Device& device); + HSAILProgram(NullDevice& device); + //! Default destructor + ~HSAILProgram(); - void addGlobalStore(Memory* mem) { globalStores_.push_back(mem); } + //! Returns the aclBinary associated with the progrm + aclBinary* binaryElf() const { return static_cast(binaryElf_); } - const std::vector& globalStores() const { return globalStores_; } + void addGlobalStore(Memory* mem) { globalStores_.push_back(mem); } - //! Return a typecasted GPU device - gpu::Device& dev() - { return const_cast( - static_cast(device())); } + const std::vector& globalStores() const { return globalStores_; } - //! 
Returns GPU kernel table - const Memory* kernelTable() const { return kernels_; } + //! Return a typecasted GPU device + gpu::Device& dev() { return const_cast(static_cast(device())); } - //! Adds all kernels to the mem handle lists - void fillResListWithKernels(std::vector& memList) const; + //! Returns GPU kernel table + const Memory* kernelTable() const { return kernels_; } - //! Returns the maximum number of scratch regs used in the program - uint maxScratchRegs() const { return maxScratchRegs_; } + //! Adds all kernels to the mem handle lists + void fillResListWithKernels(std::vector& memList) const; - //! Add internal static sampler - void addSampler(Sampler* sampler) { staticSamplers_.push_back(sampler); } + //! Returns the maximum number of scratch regs used in the program + uint maxScratchRegs() const { return maxScratchRegs_; } - //! Returns TRUE if the program just compiled - bool isNull() const { return isNull_; } + //! Add internal static sampler + void addSampler(Sampler* sampler) { staticSamplers_.push_back(sampler); } - //! Returns TRUE if the program contains static samplers - bool isStaticSampler() const { return (staticSamplers_.size() != 0); } + //! Returns TRUE if the program just compiled + bool isNull() const { return isNull_; } -protected: - //! pre-compile setup for GPU - virtual bool initBuild(amd::option::Options* options); + //! Returns TRUE if the program contains static samplers + bool isStaticSampler() const { return (staticSamplers_.size() != 0); } - //! post-compile setup for GPU - virtual bool finiBuild(bool isBuildGood); + protected: + //! pre-compile setup for GPU + virtual bool initBuild(amd::option::Options* options); - /*! 
\brief Compiles GPU CL program to LLVM binary (compiler frontend) - * - * \return True if we successefully compiled a GPU program - */ - virtual bool compileImpl( - const std::string& sourceCode, //!< the program's source code - const std::vector& headers, - const char** headerIncludeNames, - amd::option::Options* options //!< compile options's object - ); + //! post-compile setup for GPU + virtual bool finiBuild(bool isBuildGood); - /* \brief Returns the next stage to compile from, based on sections in binary, - * also returns completeStages in a vector, which contains at least ACL_TYPE_DEFAULT, - * sets needOptionsCheck to true if options check is needed to decide whether or not to recompile - */ - aclType getCompilationStagesFromBinary(std::vector& completeStages, bool& needOptionsCheck); + /*! \brief Compiles GPU CL program to LLVM binary (compiler frontend) + * + * \return True if we successefully compiled a GPU program + */ + virtual bool compileImpl(const std::string& sourceCode, //!< the program's source code + const std::vector& headers, + const char** headerIncludeNames, + amd::option::Options* options //!< compile options's object + ); - /* \brief Returns the next stage to compile from, based on sections and options in binary - */ - aclType getNextCompilationStageFromBinary(amd::option::Options* options); + /* \brief Returns the next stage to compile from, based on sections in binary, + * also returns completeStages in a vector, which contains at least ACL_TYPE_DEFAULT, + * sets needOptionsCheck to true if options check is needed to decide whether or not to recompile + */ + aclType getCompilationStagesFromBinary(std::vector& completeStages, + bool& needOptionsCheck); - bool saveBinaryAndSetType(type_t type); + /* \brief Returns the next stage to compile from, based on sections and options in binary + */ + aclType getNextCompilationStageFromBinary(amd::option::Options* options); - virtual bool linkImpl(amd::option::Options* options); + bool 
saveBinaryAndSetType(type_t type); - //! Link the device programs. - virtual bool linkImpl (const std::vector& inputPrograms, - amd::option::Options* options, - bool createLibrary); + virtual bool linkImpl(amd::option::Options* options); - virtual bool createBinary(amd::option::Options* options); + //! Link the device programs. + virtual bool linkImpl(const std::vector& inputPrograms, + amd::option::Options* options, bool createLibrary); - //! Initialize Binary - virtual bool initClBinary(); + virtual bool createBinary(amd::option::Options* options); - //! Release the Binary - virtual void releaseClBinary(); + //! Initialize Binary + virtual bool initClBinary(); - virtual const aclTargetInfo & info(const char * str = ""); + //! Release the Binary + virtual void releaseClBinary(); - virtual bool isElf(const char* bin) const { - return amd::isElfMagic(bin); - //return false; - } + virtual const aclTargetInfo& info(const char* str = ""); - //! Returns the binary - // This should ensure that the binary is updated with all the kernels - // ClBinary& clBinary() { return binary_; } - ClBinary* clBinary() { - return static_cast(device::Program::clBinary()); - } - const ClBinary* clBinary() const { - return static_cast(device::Program::clBinary()); - } + virtual bool isElf(const char* bin) const { + return amd::isElfMagic(bin); + // return false; + } -private: - //! Disable default copy constructor - HSAILProgram(const HSAILProgram&); + //! Returns the binary + // This should ensure that the binary is updated with all the kernels + // ClBinary& clBinary() { return binary_; } + ClBinary* clBinary() { return static_cast(device::Program::clBinary()); } + const ClBinary* clBinary() const { + return static_cast(device::Program::clBinary()); + } - //! Disable operator= - HSAILProgram& operator=(const HSAILProgram&); + private: + //! Disable default copy constructor + HSAILProgram(const HSAILProgram&); - //! 
Returns all the options to be appended while passing to the - //compiler library - std::string hsailOptions(); + //! Disable operator= + HSAILProgram& operator=(const HSAILProgram&); - //! Allocate kernel table - bool allocKernelTable(); + //! Returns all the options to be appended while passing to the + // compiler library + std::string hsailOptions(); - std::string openCLSource_; //!< Original OpenCL source - std::string HSAILProgram_; //!< FSAIL program after compilation - std::string llvmBinary_; //!< LLVM IR binary code - aclBinary* binaryElf_; //!< Binary for the new compiler library - void* rawBinary_; //!< Pointer to the raw binary - aclBinaryOptions binOpts_; //!< Binary options to create aclBinary - std::vector globalStores_; //!< Global memory for the program - Memory* kernels_; //!< Table with kernel object pointers - uint maxScratchRegs_; //!< Maximum number of scratch regs used in the program by individual kernel - std::list staticSamplers_; //!< List od internal static samplers - bool isNull_; //!< Null program no memory allocations - amd::hsa::loader::Loader* loader_; //!< Loader object - amd::hsa::loader::Executable* executable_; //!< Executable for HSA Loader - ORCAHSALoaderContext loaderContext_; //!< Context for HSA Loader + //! 
Allocate kernel table + bool allocKernelTable(); + + std::string openCLSource_; //!< Original OpenCL source + std::string HSAILProgram_; //!< FSAIL program after compilation + std::string llvmBinary_; //!< LLVM IR binary code + aclBinary* binaryElf_; //!< Binary for the new compiler library + void* rawBinary_; //!< Pointer to the raw binary + aclBinaryOptions binOpts_; //!< Binary options to create aclBinary + std::vector globalStores_; //!< Global memory for the program + Memory* kernels_; //!< Table with kernel object pointers + uint + maxScratchRegs_; //!< Maximum number of scratch regs used in the program by individual kernel + std::list staticSamplers_; //!< List od internal static samplers + bool isNull_; //!< Null program no memory allocations + amd::hsa::loader::Loader* loader_; //!< Loader object + amd::hsa::loader::Executable* executable_; //!< Executable for HSA Loader + ORCAHSALoaderContext loaderContext_; //!< Context for HSA Loader }; /*@}*/} // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpuresource.cpp b/rocclr/runtime/device/gpu/gpuresource.cpp index 6cf59df440..e4dc2982ae 100644 --- a/rocclr/runtime/device/gpu/gpuresource.cpp +++ b/rocclr/runtime/device/gpu/gpuresource.cpp @@ -24,1998 +24,1802 @@ namespace gpu { -GslResourceReference::GslResourceReference( - const Device& gpuDev, - gslMemObject gslResource, - gslMemObject gslResOriginal - ) - : device_(gpuDev) - , resource_(gslResource) - , resOriginal_(gslResOriginal) - , cpuAddress_(NULL) -{ +GslResourceReference::GslResourceReference(const Device& gpuDev, gslMemObject gslResource, + gslMemObject gslResOriginal) + : device_(gpuDev), resource_(gslResource), resOriginal_(gslResOriginal), cpuAddress_(NULL) {} + +GslResourceReference::~GslResourceReference() { + if (cpuAddress_ != NULL) { + device_.resUnmapRemote(gslResource()); + } + if (0 != gslResource()) { + device_.resFree(gslResource()); + resource_ = NULL; + } + + if (0 != gslOriginal()) { + device_.resFree(gslOriginal()); + 
resOriginal_ = NULL; + } } -GslResourceReference::~GslResourceReference() -{ - if (cpuAddress_ != NULL) { - device_.resUnmapRemote(gslResource()); - } - if (0 != gslResource()) { - device_.resFree(gslResource()); - resource_ = NULL; - } - - if (0 != gslOriginal()) { - device_.resFree(gslOriginal()); - resOriginal_ = NULL; - } +Resource::Resource(const Device& gpuDev, size_t width, cmSurfFmt format) + : elementSize_(0), + gpuDevice_(gpuDev), + mapCount_(0), + address_(NULL), + offset_(0), + curRename_(0), + gslRef_(NULL), + viewOwner_(NULL), + hbOffset_(0), + hbSize_(0), + pinOffset_(0), + glInterop_(0), + gpu_(NULL) { + // Fill GSL descriptor fields + cal_.type_ = Empty; + cal_.width_ = width; + cal_.height_ = 1; + cal_.depth_ = 1; + cal_.mipLevels_ = 1; + cal_.format_ = format; + cal_.flags_ = 0; + cal_.pitch_ = 0; + cal_.slice_ = 0; + cal_.channelOrder_ = GSL_CHANNEL_ORDER_REPLICATE_R; + cal_.dimension_ = GSL_MOA_BUFFER; + cal_.cardMemory_ = true; + cal_.dimSize_ = 1; + cal_.buffer_ = true; + cal_.imageArray_ = false; + cal_.imageType_ = 0; + cal_.skipRsrcCache_ = false; + cal_.scratch_ = false; + cal_.isAllocSVM_ = false; + cal_.isAllocExecute_ = false; } -Resource::Resource( - const Device& gpuDev, - size_t width, - cmSurfFmt format) - : elementSize_(0) - , gpuDevice_(gpuDev) - , mapCount_(0) - , address_(NULL) - , offset_(0) - , curRename_(0) - , gslRef_(NULL) - , viewOwner_(NULL) - , hbOffset_(0) - , hbSize_(0) - , pinOffset_(0) - , glInterop_(0) - , gpu_(NULL) -{ - // Fill GSL descriptor fields - cal_.type_ = Empty; - cal_.width_ = width; - cal_.height_ = 1; - cal_.depth_ = 1; - cal_.mipLevels_ = 1; - cal_.format_ = format; - cal_.flags_ = 0; - cal_.pitch_ = 0; - cal_.slice_ = 0; - cal_.channelOrder_ = GSL_CHANNEL_ORDER_REPLICATE_R; - cal_.dimension_ = GSL_MOA_BUFFER; - cal_.cardMemory_ = true; - cal_.dimSize_ = 1; - cal_.buffer_ = true; - cal_.imageArray_ = false; - cal_.imageType_ = 0; - cal_.skipRsrcCache_ = false; - cal_.scratch_ = false; - 
cal_.isAllocSVM_ = false; - cal_.isAllocExecute_ = false; -} +Resource::Resource(const Device& gpuDev, size_t width, size_t height, size_t depth, + cmSurfFmt format, gslChannelOrder chOrder, cl_mem_object_type imageType, + uint mipLevels) + : elementSize_(0), + gpuDevice_(gpuDev), + mapCount_(0), + address_(NULL), + offset_(0), + curRename_(0), + gslRef_(NULL), + viewOwner_(NULL), + hbOffset_(0), + hbSize_(0), + pinOffset_(0), + glInterop_(0), + gpu_(NULL) { + // Fill GSL descriptor fields + cal_.type_ = Empty; + cal_.width_ = width; + cal_.height_ = height; + cal_.depth_ = depth; + cal_.mipLevels_ = mipLevels; + cal_.format_ = format; + cal_.flags_ = 0; + cal_.pitch_ = 0; + cal_.slice_ = 0; + cal_.channelOrder_ = chOrder; + cal_.cardMemory_ = true; + cal_.buffer_ = false; + cal_.imageArray_ = false; + cal_.imageType_ = imageType; + cal_.skipRsrcCache_ = false; + cal_.scratch_ = false; + cal_.isAllocSVM_ = false; + cal_.isAllocExecute_ = false; -Resource::Resource( - const Device& gpuDev, - size_t width, - size_t height, - size_t depth, - cmSurfFmt format, - gslChannelOrder chOrder, - cl_mem_object_type imageType, - uint mipLevels) - : elementSize_(0) - , gpuDevice_(gpuDev) - , mapCount_(0) - , address_(NULL) - , offset_(0) - , curRename_(0) - , gslRef_(NULL) - , viewOwner_(NULL) - , hbOffset_(0) - , hbSize_(0) - , pinOffset_(0) - , glInterop_(0) - , gpu_(NULL) -{ - // Fill GSL descriptor fields - cal_.type_ = Empty; - cal_.width_ = width; - cal_.height_ = height; - cal_.depth_ = depth; - cal_.mipLevels_ = mipLevels; - cal_.format_ = format; - cal_.flags_ = 0; - cal_.pitch_ = 0; - cal_.slice_ = 0; - cal_.channelOrder_ = chOrder; - cal_.cardMemory_ = true; - cal_.buffer_ = false; - cal_.imageArray_ = false; - cal_.imageType_ = imageType; - cal_.skipRsrcCache_ = false; - cal_.scratch_ = false; - cal_.isAllocSVM_ = false; - cal_.isAllocExecute_ = false; - - switch (imageType) { + switch (imageType) { case CL_MEM_OBJECT_IMAGE2D: - cal_.dimension_ = GSL_MOA_TEXTURE_2D; 
- cal_.dimSize_ = 2; - break; + cal_.dimension_ = GSL_MOA_TEXTURE_2D; + cal_.dimSize_ = 2; + break; case CL_MEM_OBJECT_IMAGE3D: - cal_.dimension_ = GSL_MOA_TEXTURE_3D; - cal_.dimSize_ = 3; - break; + cal_.dimension_ = GSL_MOA_TEXTURE_3D; + cal_.dimSize_ = 3; + break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: - cal_.dimension_ = GSL_MOA_TEXTURE_2D_ARRAY; - cal_.dimSize_ = 3; - cal_.imageArray_ = true; - break; + cal_.dimension_ = GSL_MOA_TEXTURE_2D_ARRAY; + cal_.dimSize_ = 3; + cal_.imageArray_ = true; + break; case CL_MEM_OBJECT_IMAGE1D: - cal_.dimension_ = GSL_MOA_TEXTURE_1D; - cal_.dimSize_ = 1; - break; + cal_.dimension_ = GSL_MOA_TEXTURE_1D; + cal_.dimSize_ = 1; + break; case CL_MEM_OBJECT_IMAGE1D_ARRAY: - cal_.dimension_ = GSL_MOA_TEXTURE_1D_ARRAY; - cal_.dimSize_ = 2; - cal_.imageArray_ = true; - break; + cal_.dimension_ = GSL_MOA_TEXTURE_1D_ARRAY; + cal_.dimSize_ = 2; + cal_.imageArray_ = true; + break; case CL_MEM_OBJECT_IMAGE1D_BUFFER: - cal_.dimension_ = GSL_MOA_TEXTURE_BUFFER; - cal_.dimSize_ = 1; - break; + cal_.dimension_ = GSL_MOA_TEXTURE_BUFFER; + cal_.dimSize_ = 1; + break; default: - cal_.dimSize_ = 1; - LogError("Unknown image type!"); - break; - } + cal_.dimSize_ = 1; + LogError("Unknown image type!"); + break; + } } -Resource::~Resource() -{ - free(); -} +Resource::~Resource() { free(); } -static uint32_t GetHSAILImageFormatType(cmSurfFmt format) -{ - uint32_t formatType = HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8; +static uint32_t GetHSAILImageFormatType(cmSurfFmt format) { + uint32_t formatType = HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8; - switch (format) - { + switch (format) { case CM_SURF_FMT_sR8: case CM_SURF_FMT_sRG8: case CM_SURF_FMT_sRGBA8: - formatType = HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8; - break; + formatType = HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8; + break; case CM_SURF_FMT_sU16: case CM_SURF_FMT_sUV16: case CM_SURF_FMT_sUVWQ16: - formatType = HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16; - break; + formatType = 
HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16; + break; case CM_SURF_FMT_INTENSITY8: case CM_SURF_FMT_RG8: case CM_SURF_FMT_RGBA8: case CM_SURF_FMT_RGBX8UI: case CM_SURF_FMT_RGBA8_SRGB: - formatType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8; - break; + formatType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8; + break; case CM_SURF_FMT_R16: case CM_SURF_FMT_RG16: case CM_SURF_FMT_RGBA16: case CM_SURF_FMT_DEPTH16: - formatType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16; - break; + formatType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16; + break; case CM_SURF_FMT_BGR10_X2: - formatType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010; - break; + formatType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010; + break; case CM_SURF_FMT_sR8I: case CM_SURF_FMT_sRG8I: case CM_SURF_FMT_sRGBA8I: - formatType = HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8; - break; + formatType = HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8; + break; case CM_SURF_FMT_sR16I: case CM_SURF_FMT_sRG16I: case CM_SURF_FMT_sRGBA16I: - formatType = HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16; - break; + formatType = HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16; + break; case CM_SURF_FMT_sR32I: case CM_SURF_FMT_sRG32I: case CM_SURF_FMT_sRGBA32I: - formatType = HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32; - break; + formatType = HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32; + break; case CM_SURF_FMT_R8I: case CM_SURF_FMT_RG8I: case CM_SURF_FMT_RGBA8UI: - formatType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8; - break; + formatType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8; + break; case CM_SURF_FMT_R16I: case CM_SURF_FMT_RG16I: case CM_SURF_FMT_RGBA16UI: - formatType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16; - break; + formatType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16; + break; case CM_SURF_FMT_R32I: case CM_SURF_FMT_RG32I: case CM_SURF_FMT_RGBA32UI: - formatType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32; - break; + formatType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32; + break; case CM_SURF_FMT_R16F: case CM_SURF_FMT_RG16F: case 
CM_SURF_FMT_RGBA16F: - formatType = HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT; - break; + formatType = HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT; + break; case CM_SURF_FMT_R32F: case CM_SURF_FMT_RG32F: case CM_SURF_FMT_RGBA32F: case CM_SURF_FMT_DEPTH32F: case CM_SURF_FMT_DEPTH32F_X24_STEN8: - formatType = HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT; - break; + formatType = HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT; + break; case CM_SURF_FMT_DEPTH24_STEN8: - formatType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT24; - break; + formatType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT24; + break; default: - assert(false); - } + assert(false); + } - return formatType; + return formatType; } -static uint32_t GetHSAILImageOrderType(gslChannelOrder chOrder, cmSurfFmt format) -{ - uint32_t orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_A; +static uint32_t GetHSAILImageOrderType(gslChannelOrder chOrder, cmSurfFmt format) { + uint32_t orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_A; - switch (chOrder) - { + switch (chOrder) { case GSL_CHANNEL_ORDER_R: - orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_R; - break; + orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_R; + break; case GSL_CHANNEL_ORDER_A: - orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_A; - break; + orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_A; + break; case GSL_CHANNEL_ORDER_RG: - orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_RG; - break; + orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_RG; + break; case GSL_CHANNEL_ORDER_RA: - orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_RA; - break; + orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_RA; + break; case GSL_CHANNEL_ORDER_RGB: - orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_RGB; - break; + orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_RGB; + break; case GSL_CHANNEL_ORDER_RGBA: - orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA; - break; + orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA; + break; case GSL_CHANNEL_ORDER_BGRA: - orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA; - break; + orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA; + break; case GSL_CHANNEL_ORDER_ARGB: - orderType = 
HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB; - break; + orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB; + break; case GSL_CHANNEL_ORDER_SRGB: - orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB; - break; + orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB; + break; case GSL_CHANNEL_ORDER_SRGBX: - orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX; - break; + orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX; + break; case GSL_CHANNEL_ORDER_SRGBA: - orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA; - break; + orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA; + break; case GSL_CHANNEL_ORDER_SBGRA: - orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA; - break; + orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA; + break; case GSL_CHANNEL_ORDER_INTENSITY: - orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY; - break; + orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY; + break; case GSL_CHANNEL_ORDER_LUMINANCE: - orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE; - break; + orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE; + break; case GSL_CHANNEL_ORDER_REPLICATE_R: - if ((format == CM_SURF_FMT_DEPTH32F_X24_STEN8) || - (format == CM_SURF_FMT_DEPTH24_STEN8)) { - orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL; - } - else { - orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH; - } - break; + if ((format == CM_SURF_FMT_DEPTH32F_X24_STEN8) || (format == CM_SURF_FMT_DEPTH24_STEN8)) { + orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL; + } else { + orderType = HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH; + } + break; default: - assert(false); - } + assert(false); + } - return orderType; + return orderType; } -bool -Resource::create(MemoryType memType, CreateParams* params) -{ - bool calRes = false; - gslMemObject gslResource = 0; - gslMemObject gslResOriginal = 0; - const amd::HostMemoryReference* hostMemRef = NULL; - bool imageCreateView = false; - CALuint hostMemOffset = 0; - bool foundCalRef = false; - bool viewDefined = false; - uint viewLayer = 0; - uint viewLevel = 0; - uint viewFlags = 0; - gslResource3D viewSize = 
{0}; - size_t viewOffset = 0; - cmSurfFmt viewSurfFmt; - gslChannelOrder viewChannelOrder = GSL_CHANNEL_ORDER_UNSPECIFIED; - gslMemObjectAttribType viewResType; - CALresourceDesc desc; - uint64 bytePitch = (uint64)-1; - bool useRowPitch = false; - bool mipLevelPitchPad = false; +bool Resource::create(MemoryType memType, CreateParams* params) { + bool calRes = false; + gslMemObject gslResource = 0; + gslMemObject gslResOriginal = 0; + const amd::HostMemoryReference* hostMemRef = NULL; + bool imageCreateView = false; + CALuint hostMemOffset = 0; + bool foundCalRef = false; + bool viewDefined = false; + uint viewLayer = 0; + uint viewLevel = 0; + uint viewFlags = 0; + gslResource3D viewSize = {0}; + size_t viewOffset = 0; + cmSurfFmt viewSurfFmt; + gslChannelOrder viewChannelOrder = GSL_CHANNEL_ORDER_UNSPECIFIED; + gslMemObjectAttribType viewResType; + CALresourceDesc desc; + uint64 bytePitch = (uint64)-1; + bool useRowPitch = false; + bool mipLevelPitchPad = false; - desc.vaBase = 0; - desc.minAlignment = 0; - desc.isAllocExecute = false; - desc.isAllocSVM = false; - desc.section = GSL_SECTION_REGULAR; - if (NULL != params && NULL != params->owner_) { //make sure params not NULL - mcaddr svmPtr = reinterpret_cast(params->owner_->getSvmPtr()); - desc.vaBase = (svmPtr == 1)? 0:svmPtr; - // Dont cache coarse\fine grain svm resource as these may not be released - // and allocations may fail since there is limited space for coarse\fine grainbuffers - cal_.skipRsrcCache_ = (svmPtr != 0); - desc.section = (svmPtr != 0) ? GSL_SECTION_SVM : GSL_SECTION_REGULAR; + desc.vaBase = 0; + desc.minAlignment = 0; + desc.isAllocExecute = false; + desc.isAllocSVM = false; + desc.section = GSL_SECTION_REGULAR; + if (NULL != params && NULL != params->owner_) { // make sure params not NULL + mcaddr svmPtr = reinterpret_cast(params->owner_->getSvmPtr()); + desc.vaBase = (svmPtr == 1) ? 
0 : svmPtr; + // Dont cache coarse\fine grain svm resource as these may not be released + // and allocations may fail since there is limited space for coarse\fine grainbuffers + cal_.skipRsrcCache_ = (svmPtr != 0); + desc.section = (svmPtr != 0) ? GSL_SECTION_SVM : GSL_SECTION_REGULAR; - if (params->owner_->getMemFlags() & CL_MEM_SVM_ATOMICS) { - desc.section = GSL_SECTION_SVM_ATOMICS; - } - - if (dev().settings().svmFineGrainSystem_ && - (desc.section == GSL_SECTION_SVM || - desc.section == GSL_SECTION_SVM_ATOMICS)) { - cal_.isAllocSVM_ = desc.isAllocSVM = true; - } + if (params->owner_->getMemFlags() & CL_MEM_SVM_ATOMICS) { + desc.section = GSL_SECTION_SVM_ATOMICS; } - if (memType == Shader){ - if(dev().settings().svmFineGrainSystem_) { - cal_.isAllocExecute_ = desc.isAllocExecute = true; - cal_.isAllocSVM_ = desc.isAllocSVM = true; - } - // force to use remote memory for HW DEBUG or use - // local memory once we determine if FGS is supported - memType = (!dev().settings().enableHwDebug_) ? Local : RemoteUSWC; + if (dev().settings().svmFineGrainSystem_ && + (desc.section == GSL_SECTION_SVM || desc.section == GSL_SECTION_SVM_ATOMICS)) { + cal_.isAllocSVM_ = desc.isAllocSVM = true; } + } - // This is a thread safe operation - const_cast(dev()).initializeHeapResources(); - - // Get the element size - elementSize_ = static_cast(memoryFormatSize(cal()->format_).size_); - cal_.type_ = memType; - if (memType == Scratch) { - // use local memory for scratch buffer unless it is using HW DEBUG - cal_.type_ = (!dev().settings().enableHwDebug_) ? Local : RemoteUSWC; - cal_.scratch_ = true; + if (memType == Shader) { + if (dev().settings().svmFineGrainSystem_) { + cal_.isAllocExecute_ = desc.isAllocExecute = true; + cal_.isAllocSVM_ = desc.isAllocSVM = true; } + // force to use remote memory for HW DEBUG or use + // local memory once we determine if FGS is supported + memType = (!dev().settings().enableHwDebug_) ? 
Local : RemoteUSWC; + } - // Force remote allocation if it was requested in the settings - if (dev().settings().remoteAlloc_ && - ((memoryType() == Local) || - (memoryType() == Persistent))) { - if (dev().settings().apuSystem_ && dev().settings().viPlus_) { - cal_.type_ = Remote; - } - else { - cal_.type_ = RemoteUSWC; - } + // This is a thread safe operation + const_cast(dev()).initializeHeapResources(); + + // Get the element size + elementSize_ = static_cast(memoryFormatSize(cal()->format_).size_); + cal_.type_ = memType; + if (memType == Scratch) { + // use local memory for scratch buffer unless it is using HW DEBUG + cal_.type_ = (!dev().settings().enableHwDebug_) ? Local : RemoteUSWC; + cal_.scratch_ = true; + } + + // Force remote allocation if it was requested in the settings + if (dev().settings().remoteAlloc_ && ((memoryType() == Local) || (memoryType() == Persistent))) { + if (dev().settings().apuSystem_ && dev().settings().viPlus_) { + cal_.type_ = Remote; + } else { + cal_.type_ = RemoteUSWC; } + } - if (dev().settings().disablePersistent_ && (memoryType() == Persistent)) { - cal_.type_ = RemoteUSWC; - } + if (dev().settings().disablePersistent_ && (memoryType() == Persistent)) { + cal_.type_ = RemoteUSWC; + } - if (cal()->buffer_) { - // Force linear tiling for buffer alloctions - cal_.flags_ |= CAL_RESALLOC_GLOBAL_BUFFER; - } + if (cal()->buffer_) { + // Force linear tiling for buffer alloctions + cal_.flags_ |= CAL_RESALLOC_GLOBAL_BUFFER; + } - if (params != NULL) { - gpu_ = params->gpu_; - } + if (params != NULL) { + gpu_ = params->gpu_; + } - switch (memoryType()) { + switch (memoryType()) { case Heap: - gslResource = dev().resGetHeap(0); - if (gslResource == 0) { - return false; - } - calRes = true; - cal_.width_ = static_cast(gslResource->getPitch()); - cal_.pitch_ = static_cast(gslResource->getPitch()); - break; + gslResource = dev().resGetHeap(0); + if (gslResource == 0) { + return false; + } + calRes = true; + cal_.width_ = 
static_cast(gslResource->getPitch()); + cal_.pitch_ = static_cast(gslResource->getPitch()); + break; case Persistent: - if (dev().settings().linearPersistentImage_) { - // Force linear tiling for image allocations in persistent - cal_.flags_ |= CAL_RESALLOC_GLOBAL_BUFFER; - } - // Fall through ... + if (dev().settings().linearPersistentImage_) { + // Force linear tiling for image allocations in persistent + cal_.flags_ |= CAL_RESALLOC_GLOBAL_BUFFER; + } + // Fall through ... case RemoteUSWC: case Remote: case Shader: case BusAddressable: case ExternalPhysical: - // Fall through to process the memory allocation ... + // Fall through to process the memory allocation ... case Local: { - if (cal()->buffer_) { - //! @todo Remove alignment. - //! GSL asserts in mem copy with an unaligned size - cal_.width_ = amd::alignUp(cal_.width_, 64); + if (cal()->buffer_) { + //! @todo Remove alignment. + //! GSL asserts in mem copy with an unaligned size + cal_.width_ = amd::alignUp(cal_.width_, 64); + } + + desc.dimension = cal()->dimension_; + desc.size.width = cal()->width_; + desc.size.height = cal()->height_; + desc.size.depth = cal()->depth_; + desc.format = cal()->format_; + desc.channelOrder = cal()->channelOrder_; + desc.flags = cal()->flags_; + desc.mipLevels = cal()->mipLevels_; + desc.systemMemory = NULL; + + uint allocAttempt = 0; + do { + // Find a type for allocation + if (memoryType() == Persistent) { + desc.type = GSL_MOA_MEMORY_CARD_LOCKABLE; + } else if (memoryType() == Remote) { + desc.type = GSL_MOA_MEMORY_REMOTE_CACHEABLE; + } else if (memoryType() == RemoteUSWC) { + desc.type = GSL_MOA_MEMORY_AGP; + } else if (memoryType() == BusAddressable) { + desc.type = GSL_MOA_MEMORY_CARD_BUS_ADDRESSABLE; + } else if (memoryType() == ExternalPhysical) { + desc.type = GSL_MOA_MEMORY_CARD_EXTERNAL_PHYSICAL; + cl_bus_address_amd bus_address = + (reinterpret_cast(params->owner_))->busAddress(); + desc.busAddress[0] = bus_address.surface_bus_address; + desc.busAddress[1] = 
bus_address.marker_bus_address; + } else { + desc.type = GSL_MOA_MEMORY_CARD_EXT_NONEXT; } - desc.dimension = cal()->dimension_; - desc.size.width = cal()->width_; - desc.size.height = cal()->height_; - desc.size.depth = cal()->depth_; - desc.format = cal()->format_; - desc.channelOrder = cal()->channelOrder_; - desc.flags = cal()->flags_; - desc.mipLevels = cal()->mipLevels_; - desc.systemMemory = NULL; - - uint allocAttempt = 0; - do { - // Find a type for allocation - if (memoryType() == Persistent) { - desc.type = GSL_MOA_MEMORY_CARD_LOCKABLE; - } - else if (memoryType() == Remote) { - desc.type = GSL_MOA_MEMORY_REMOTE_CACHEABLE; - } - else if (memoryType() == RemoteUSWC) { - desc.type = GSL_MOA_MEMORY_AGP; - } - else if (memoryType() == BusAddressable){ - desc.type = GSL_MOA_MEMORY_CARD_BUS_ADDRESSABLE; - } - else if (memoryType() == ExternalPhysical){ - desc.type = GSL_MOA_MEMORY_CARD_EXTERNAL_PHYSICAL; - cl_bus_address_amd bus_address = - (reinterpret_cast(params->owner_))->busAddress(); - desc.busAddress[0] = bus_address.surface_bus_address; - desc.busAddress[1] = bus_address.marker_bus_address; - } - else { - desc.type = GSL_MOA_MEMORY_CARD_EXT_NONEXT; - } - - // Check resource cache first for an appropriate resource - gslRef_ = dev().resourceCache().findCalResource(&cal_); - if (memType == Scratch) { - if ((dev().settings().hsail_) || (dev().settings().oclVersion_ >= OpenCL20)) { - desc.minAlignment = 64 * Ki; - } - else { - desc.vaBase = static_cast(0x100000000ULL); - } - } - else if ((gslRef_ != NULL) && (!dev().settings().use64BitPtr_)) { - // Make sure runtime didn't pick a resource with > 4GB address - if ((cal()->dimension_ == GSL_MOA_BUFFER) && - (static_cast(gslRef_->gslResource()->getSurfaceAddress() + - gslRef_->gslResource()->getSurfaceSize()) > (uint64_t(4) * Gi))) { - gslRef_->release(); - gslRef_ = NULL; - } - } - // Try to allocate memory if we couldn't find a cached resource - if (gslRef_ == NULL) { - // Allocate memory - gslResource = 
dev().resAlloc(&desc); - if (gslResource != 0) { - calRes = true; - } - } - else { - calRes = true; - gslResource = gslRef_->gslOriginal(); - foundCalRef = true; - } - - // If GSL fails allocation then try other heaps - if (!calRes) { - // Free cache if we failed allocation - if (dev().resourceCache().free()) { - // We freed something - attempt to allocate memory again - continue; - } - - // Local to Persistent - if (memoryType() == Local) { - cal_.type_ = Persistent; - } - // Don't switch to USWC if persistent memory was explicitly asked - else if ((allocAttempt > 0) && (memoryType() == Persistent)) { - cal_.type_ = RemoteUSWC; - } - // Remote cacheable to uncacheable - else if (memoryType() == Remote) { - cal_.type_ = RemoteUSWC; - } - else { - break; - } - allocAttempt++; - } + // Check resource cache first for an appropriate resource + gslRef_ = dev().resourceCache().findCalResource(&cal_); + if (memType == Scratch) { + if ((dev().settings().hsail_) || (dev().settings().oclVersion_ >= OpenCL20)) { + desc.minAlignment = 64 * Ki; + } else { + desc.vaBase = static_cast(0x100000000ULL); + } + } else if ((gslRef_ != NULL) && (!dev().settings().use64BitPtr_)) { + // Make sure runtime didn't pick a resource with > 4GB address + if ((cal()->dimension_ == GSL_MOA_BUFFER) && + (static_cast(gslRef_->gslResource()->getSurfaceAddress() + + gslRef_->gslResource()->getSurfaceSize()) > + (uint64_t(4) * Gi))) { + gslRef_->release(); + gslRef_ = NULL; + } } - while (!calRes); - } - break; - case Pinned: { - PinnedParams* pinned = reinterpret_cast(params); - CALuint allocSize = static_cast(pinned->size_); - void* pinAddress; - hostMemRef = pinned->hostMemRef_; - pinAddress = address_ = hostMemRef->hostMem(); - - // Use untiled allocation - cal_.flags_ |= CAL_RESALLOC_GLOBAL_BUFFER; - - desc.size.width = cal()->width_; - - if (cal()->dimension_ == GSL_MOA_BUFFER) { - // Allign offset to 4K boundary (Vista/Win7 limitation) - char* tmpHost = const_cast( - 
amd::alignDown(reinterpret_cast(address_), - PinnedMemoryAlignment)); - - // Find the partial size for unaligned copy - hostMemOffset = static_cast( - reinterpret_cast(address_) - tmpHost); - - pinOffset_ = hostMemOffset & 0xff; - - pinAddress = tmpHost; - // Align width to avoid GSL useless assert with a view - if (hostMemOffset != 0) { - desc.size.width += hostMemOffset / elementSize(); - desc.size.width = amd::alignUp(desc.size.width, 64); - } - hostMemOffset &= ~(0xff); - } - else if (cal()->dimension_ == GSL_MOA_TEXTURE_2D) { - //! @todo: Width has to be aligned for 3D. - //! Need to be replaced with a compute copy - // Width aligned by 8 texels - if (((cal()->width_ % 0x8) != 0) || - // Pitch aligned by 64 bytes - (((cal()->width_ * elementSize()) % 0x40) != 0)) { - return false; - } - } - else { - //! @todo GSL doesn't support pinning with resAlloc_ - return false; - } - - // Fill the GSL desc info structure - desc.dimension = cal()->dimension_; - desc.type = GSL_MOA_MEMORY_SYSTEM; - desc.size.height = cal()->height_; - desc.size.depth = cal()->depth_; - desc.format = cal()->format_; - desc.channelOrder = cal()->channelOrder_; - desc.mipLevels = 0; - desc.systemMemory = reinterpret_cast(pinAddress); - desc.flags = 0; - - // Ensure page alignment - if ((CALuint64)desc.systemMemory & (amd::Os::pageSize() - 1)) { - return false; - } - - gslResource = dev().resAlloc(&desc); - if (gslResource != 0) { + // Try to allocate memory if we couldn't find a cached resource + if (gslRef_ == NULL) { + // Allocate memory + gslResource = dev().resAlloc(&desc); + if (gslResource != 0) { calRes = true; + } + } else { + calRes = true; + gslResource = gslRef_->gslOriginal(); + foundCalRef = true; } - else { - pinOffset_ = 0; + + // If GSL fails allocation then try other heaps + if (!calRes) { + // Free cache if we failed allocation + if (dev().resourceCache().free()) { + // We freed something - attempt to allocate memory again + continue; + } + + // Local to Persistent + if 
(memoryType() == Local) { + cal_.type_ = Persistent; + } + // Don't switch to USWC if persistent memory was explicitly asked + else if ((allocAttempt > 0) && (memoryType() == Persistent)) { + cal_.type_ = RemoteUSWC; + } + // Remote cacheable to uncacheable + else if (memoryType() == Remote) { + cal_.type_ = RemoteUSWC; + } else { + break; + } + allocAttempt++; } - } - break; + } while (!calRes); + } break; + case Pinned: { + PinnedParams* pinned = reinterpret_cast(params); + CALuint allocSize = static_cast(pinned->size_); + void* pinAddress; + hostMemRef = pinned->hostMemRef_; + pinAddress = address_ = hostMemRef->hostMem(); + + // Use untiled allocation + cal_.flags_ |= CAL_RESALLOC_GLOBAL_BUFFER; + + desc.size.width = cal()->width_; + + if (cal()->dimension_ == GSL_MOA_BUFFER) { + // Allign offset to 4K boundary (Vista/Win7 limitation) + char* tmpHost = const_cast( + amd::alignDown(reinterpret_cast(address_), PinnedMemoryAlignment)); + + // Find the partial size for unaligned copy + hostMemOffset = static_cast(reinterpret_cast(address_) - tmpHost); + + pinOffset_ = hostMemOffset & 0xff; + + pinAddress = tmpHost; + // Align width to avoid GSL useless assert with a view + if (hostMemOffset != 0) { + desc.size.width += hostMemOffset / elementSize(); + desc.size.width = amd::alignUp(desc.size.width, 64); + } + hostMemOffset &= ~(0xff); + } else if (cal()->dimension_ == GSL_MOA_TEXTURE_2D) { + //! @todo: Width has to be aligned for 3D. + //! Need to be replaced with a compute copy + // Width aligned by 8 texels + if (((cal()->width_ % 0x8) != 0) || + // Pitch aligned by 64 bytes + (((cal()->width_ * elementSize()) % 0x40) != 0)) { + return false; + } + } else { + //! 
@todo GSL doesn't support pinning with resAlloc_ + return false; + } + + // Fill the GSL desc info structure + desc.dimension = cal()->dimension_; + desc.type = GSL_MOA_MEMORY_SYSTEM; + desc.size.height = cal()->height_; + desc.size.depth = cal()->depth_; + desc.format = cal()->format_; + desc.channelOrder = cal()->channelOrder_; + desc.mipLevels = 0; + desc.systemMemory = reinterpret_cast(pinAddress); + desc.flags = 0; + + // Ensure page alignment + if ((CALuint64)desc.systemMemory & (amd::Os::pageSize() - 1)) { + return false; + } + + gslResource = dev().resAlloc(&desc); + if (gslResource != 0) { + calRes = true; + } else { + pinOffset_ = 0; + } + } break; case View: { - // Save the offset in the global heap - ViewParams* view = reinterpret_cast(params); - offset_ = view->offset_; + // Save the offset in the global heap + ViewParams* view = reinterpret_cast(params); + offset_ = view->offset_; - // Make sure parent was provided - if (NULL != view->resource_) { - viewOwner_ = view->resource_; - uint64 bytePitch = (view->size_ + viewOwner_->pinOffset()); - viewSize.width = bytePitch / elementSize(); - viewSize.height = 1; - viewSize.depth = 1; - viewOffset = static_cast(offset() / elementSize()); + // Make sure parent was provided + if (NULL != view->resource_) { + viewOwner_ = view->resource_; + uint64 bytePitch = (view->size_ + viewOwner_->pinOffset()); + viewSize.width = bytePitch / elementSize(); + viewSize.height = 1; + viewSize.depth = 1; + viewOffset = static_cast(offset() / elementSize()); - gslResource = dev().resAllocView( - view->resource_->gslResource(), viewSize, viewOffset, - cal()->format_, GSL_CHANNEL_ORDER_REPLICATE_R, - cal()->dimension_, 0, 0, cal()->flags_, bytePitch); - if (gslResource != 0) { - calRes = true; - } - - if (viewOwner_->isMemoryType(Pinned)) { - address_ = viewOwner_->data() + offset(); - } - pinOffset_ = viewOwner_->pinOffset(); + gslResource = dev().resAllocView(view->resource_->gslResource(), viewSize, viewOffset, + 
cal()->format_, GSL_CHANNEL_ORDER_REPLICATE_R, + cal()->dimension_, 0, 0, cal()->flags_, bytePitch); + if (gslResource != 0) { + calRes = true; } - else { - cal_.type_ = Empty; + + if (viewOwner_->isMemoryType(Pinned)) { + address_ = viewOwner_->data() + offset(); } - } - break; + pinOffset_ = viewOwner_->pinOffset(); + } else { + cal_.type_ = Empty; + } + } break; case ImageView: { - ImageViewParams* imageView = reinterpret_cast(params); - imageCreateView = true; - viewLayer = imageView->layer_; - viewLevel = imageView->level_; - gslResource = imageView->resource_->gslResource(); - viewOwner_ = imageView->resource_; - if ((viewLevel != 0) || viewOwner_->mipMapped()) { - viewFlags |= CAL_RESALLOCSLICEVIEW_LEVEL; - } - if ((viewOwner_->viewOwner_ != NULL) && - viewOwner_->viewOwner_->mipMapped()) { - mipLevelPitchPad = true; - } + ImageViewParams* imageView = reinterpret_cast(params); + imageCreateView = true; + viewLayer = imageView->layer_; + viewLevel = imageView->level_; + gslResource = imageView->resource_->gslResource(); + viewOwner_ = imageView->resource_; + if ((viewLevel != 0) || viewOwner_->mipMapped()) { + viewFlags |= CAL_RESALLOCSLICEVIEW_LEVEL; + } + if ((viewOwner_->viewOwner_ != NULL) && viewOwner_->viewOwner_->mipMapped()) { + mipLevelPitchPad = true; + } - if (viewLayer != 0) { - viewFlags |= CAL_RESALLOCSLICEVIEW_LEVEL_AND_LAYER; - } - calRes = true; - } - break; + if (viewLayer != 0) { + viewFlags |= CAL_RESALLOCSLICEVIEW_LEVEL_AND_LAYER; + } + calRes = true; + } break; case ImageBuffer: { - ImageBufferParams* imageBuffer = reinterpret_cast(params); - imageCreateView = true; - gslResource = imageBuffer->resource_->gslResource(); - viewOwner_ = imageBuffer->resource_; - calRes = true; - useRowPitch = true; - } - break; + ImageBufferParams* imageBuffer = reinterpret_cast(params); + imageCreateView = true; + gslResource = imageBuffer->resource_->gslResource(); + viewOwner_ = imageBuffer->resource_; + calRes = true; + useRowPitch = true; + } break; 
case OGLInterop: { - OGLInteropParams* oglRes = reinterpret_cast(params); - assert(oglRes->glPlatformContext_ && "We don't have OGL context!"); - switch (oglRes->type_) { + OGLInteropParams* oglRes = reinterpret_cast(params); + assert(oglRes->glPlatformContext_ && "We don't have OGL context!"); + switch (oglRes->type_) { case InteropVertexBuffer: - glType_ = GL_RESOURCE_ATTACH_VERTEXBUFFER_AMD; - break; + glType_ = GL_RESOURCE_ATTACH_VERTEXBUFFER_AMD; + break; case InteropRenderBuffer: - glType_ = GL_RESOURCE_ATTACH_RENDERBUFFER_AMD; - break; + glType_ = GL_RESOURCE_ATTACH_RENDERBUFFER_AMD; + break; case InteropTexture: case InteropTextureViewLevel: case InteropTextureViewCube: - glType_ = GL_RESOURCE_ATTACH_TEXTURE_AMD; - break; + glType_ = GL_RESOURCE_ATTACH_TEXTURE_AMD; + break; default: - LogError("Unknown OGL interop type!"); - return false; - break; - } - glPlatformContext_ = oglRes->glPlatformContext_; - glDeviceContext_ = oglRes->glDeviceContext_; - CALGSLDevice::GLResAssociate resData = {0}; - resData.GLContext = oglRes->glPlatformContext_; - resData.GLdeviceContext = oglRes->glDeviceContext_; - resData.name = oglRes->handle_; - resData.type = glType_; - // We need not pass any flags down to OGL for interop and there is no need to - // pass down resData.flags field + LogError("Unknown OGL interop type!"); + return false; + break; + } + glPlatformContext_ = oglRes->glPlatformContext_; + glDeviceContext_ = oglRes->glDeviceContext_; + CALGSLDevice::GLResAssociate resData = {0}; + resData.GLContext = oglRes->glPlatformContext_; + resData.GLdeviceContext = oglRes->glDeviceContext_; + resData.name = oglRes->handle_; + resData.type = glType_; + // We need not pass any flags down to OGL for interop and there is no need to + // pass down resData.flags field - if (dev().resGLAssociate(resData)) { - gslResource = resData.memObject; - glInteropMbRes_ = resData.mbResHandle; - glInterop_ = resData.mem_base; - calRes = true; - } + if (dev().resGLAssociate(resData)) { + 
gslResource = resData.memObject; + glInteropMbRes_ = resData.mbResHandle; + glInterop_ = resData.mem_base; + calRes = true; + } - // Check if we have to create a view - if (calRes && - ((oglRes->type_ == InteropTextureViewLevel) || - (oglRes->type_ == InteropTextureViewCube))) { - imageCreateView = true; - viewLayer = oglRes->layer_; - viewLevel = oglRes->mipLevel_; + // Check if we have to create a view + if (calRes && ((oglRes->type_ == InteropTextureViewLevel) || + (oglRes->type_ == InteropTextureViewCube))) { + imageCreateView = true; + viewLayer = oglRes->layer_; + viewLevel = oglRes->mipLevel_; - // Find the view parameters - if (InteropTextureViewLevel == oglRes->type_) { - viewFlags |= CAL_RESALLOCSLICEVIEW_LEVEL; - } - else if (InteropTextureViewCube == oglRes->type_) { - viewFlags |= CAL_RESALLOCSLICEVIEW_LEVEL_AND_LAYER; - } - else { - LogError("Unknown Interop View Type"); - } + // Find the view parameters + if (InteropTextureViewLevel == oglRes->type_) { + viewFlags |= CAL_RESALLOCSLICEVIEW_LEVEL; + } else if (InteropTextureViewCube == oglRes->type_) { + viewFlags |= CAL_RESALLOCSLICEVIEW_LEVEL_AND_LAYER; + } else { + LogError("Unknown Interop View Type"); } - } - break; + } + } break; #ifdef _WIN32 case D3D9Interop: case D3D10Interop: case D3D11Interop: { - D3DInteropParams* d3dRes = reinterpret_cast(params); - desc.dimension = cal()->dimension_; - desc.size.width = cal()->width_; - desc.size.height = cal()->height_; - desc.size.depth = cal()->depth_; - desc.format = cal()->format_; - desc.channelOrder = cal()->channelOrder_; - desc.flags = cal()->flags_; - desc.mipLevels = 0; - desc.systemMemory = NULL; - switch (d3dRes->misc) { - case 1: // NV12 format - case 2: // YV12 format - // Readjust the size to the original NV12/YV12 size, since runtime - // creates an interop for all planes - switch (d3dRes->layer_) { + D3DInteropParams* d3dRes = reinterpret_cast(params); + desc.dimension = cal()->dimension_; + desc.size.width = cal()->width_; + 
desc.size.height = cal()->height_; + desc.size.depth = cal()->depth_; + desc.format = cal()->format_; + desc.channelOrder = cal()->channelOrder_; + desc.flags = cal()->flags_; + desc.mipLevels = 0; + desc.systemMemory = NULL; + switch (d3dRes->misc) { + case 1: // NV12 format + case 2: // YV12 format + // Readjust the size to the original NV12/YV12 size, since runtime + // creates an interop for all planes + switch (d3dRes->layer_) { case 0: - desc.size.height = 3 * desc.size.height / 2; - break; + desc.size.height = 3 * desc.size.height / 2; + break; case 1: case 2: - // Force R8 format for the interop allocation by default - if (1 == d3dRes->misc) { - desc.format = CM_SURF_FMT_R8; - desc.channelOrder = GSL_CHANNEL_ORDER_R; - } - desc.size.width = 2 * desc.size.width; - desc.size.height = 3 * desc.size.height; - break; + // Force R8 format for the interop allocation by default + if (1 == d3dRes->misc) { + desc.format = CM_SURF_FMT_R8; + desc.channelOrder = GSL_CHANNEL_ORDER_R; + } + desc.size.width = 2 * desc.size.width; + desc.size.height = 3 * desc.size.height; + break; default: - break; - } - break; + break; + } + break; default: - break; + break; + } + + // Create an interop GSL object + gslResource = + dev().resMapD3DResource(&desc, (CALuint64)d3dRes->handle_, (memoryType() != D3D9Interop)); + if (gslResource != 0) { + calRes = true; + } else { + return false; + } + + + // Check if we have to create a view + if (calRes && ((d3dRes->type_ == InteropTextureViewLevel) || + (d3dRes->type_ == InteropTextureViewCube))) { + imageCreateView = true; + viewLayer = d3dRes->layer_; + viewLevel = d3dRes->mipLevel_; + + // Find the view parameters + if (InteropTextureViewLevel == d3dRes->type_) { + viewFlags |= CAL_RESALLOCSLICEVIEW_LEVEL; + } else if (InteropTextureViewCube == d3dRes->type_) { + viewFlags |= CAL_RESALLOCSLICEVIEW_LEVEL_AND_LAYER; + } else { + LogError("Unknown Interop View Type"); } + } - // Create an interop GSL object - gslResource = 
dev().resMapD3DResource( - &desc, (CALuint64)d3dRes->handle_, (memoryType() != D3D9Interop)); - if (gslResource != 0) { - calRes = true; - } - else { - return false; - } - - - // Check if we have to create a view - if (calRes && - ((d3dRes->type_ == InteropTextureViewLevel) || - (d3dRes->type_ == InteropTextureViewCube))) { - imageCreateView = true; - viewLayer = d3dRes->layer_; - viewLevel = d3dRes->mipLevel_; - - // Find the view parameters - if (InteropTextureViewLevel == d3dRes->type_) { - viewFlags |= CAL_RESALLOCSLICEVIEW_LEVEL; - } - else if (InteropTextureViewCube == d3dRes->type_) { - viewFlags |= CAL_RESALLOCSLICEVIEW_LEVEL_AND_LAYER; - } - else { - LogError("Unknown Interop View Type"); - } - } - - switch (d3dRes->misc) { + switch (d3dRes->misc) { case 0: - break; - case 1: // NV12 format - case 2: // YV12 format - // Create a view for the specified plane - viewDefined = true; - viewSize.width = cal()->width_; - viewSize.height = cal()->height_; - viewSize.depth = 1; - bytePitch = static_cast(gslResource->getPitch()); - viewOffset = 0; - viewSurfFmt = cal()->format_; - viewChannelOrder = cal()->channelOrder_; - switch (d3dRes->layer_) { + break; + case 1: // NV12 format + case 2: // YV12 format + // Create a view for the specified plane + viewDefined = true; + viewSize.width = cal()->width_; + viewSize.height = cal()->height_; + viewSize.depth = 1; + bytePitch = static_cast(gslResource->getPitch()); + viewOffset = 0; + viewSurfFmt = cal()->format_; + viewChannelOrder = cal()->channelOrder_; + switch (d3dRes->layer_) { case -1: - break; + break; case 0: - break; + break; case 1: - // Y - plane size to the offset - viewOffset = bytePitch * viewSize.height * 2; - if (d3dRes->misc == 2) { - // YV12 format U is 2 times smaller plane - bytePitch /= 2; - } - break; - case 2: - // Y + U plane sizes to the offest. 
- // U plane is 4 times smaller than Y => 5/2 - viewOffset = bytePitch * viewSize.height * 5 / 2; - // V is 2 times smaller plane + // Y - plane size to the offset + viewOffset = bytePitch * viewSize.height * 2; + if (d3dRes->misc == 2) { + // YV12 format U is 2 times smaller plane bytePitch /= 2; - break; + } + break; + case 2: + // Y + U plane sizes to the offest. + // U plane is 4 times smaller than Y => 5/2 + viewOffset = bytePitch * viewSize.height * 5 / 2; + // V is 2 times smaller plane + bytePitch /= 2; + break; default: - LogError("Unknown Interop View Type"); - calRes = false; - break; - } - break; + LogError("Unknown Interop View Type"); + calRes = false; + break; + } + break; default: - LogError("Unknown Interop View Type"); - calRes = false; - } - } - break; -#endif // _WIN32 + LogError("Unknown Interop View Type"); + calRes = false; + } + } break; +#endif // _WIN32 default: - LogWarning("Resource::create() called with unknown memory type"); - return false; - break; + LogWarning("Resource::create() called with unknown memory type"); + return false; + break; + } + + // Create a view for interop, since the original buffer may have different format + // than the global buffer and GSL mem copy will fail + bool interopBufView = + cal()->buffer_ && ((memoryType() == D3D10Interop) || (memoryType() == OGLInterop) || + (memoryType() == D3D11Interop)); + + bool ignoreParentHandle = ((memoryType() == ImageView) || (memoryType() == ImageBuffer)); + + // Create imageview if it was requested + if (calRes && (imageCreateView || interopBufView || hostMemOffset || viewDefined)) { + gslResOriginal = gslResource; + + // Disable tiling if it's a buffer view + if (interopBufView || hostMemOffset) { + viewFlags = CAL_RESALLOCVIEW_GLOBAL_BUFFER; } - // Create a view for interop, since the original buffer may have different format - // than the global buffer and GSL mem copy will fail - bool interopBufView = cal()->buffer_ && - ((memoryType() == D3D10Interop) || (memoryType() 
== OGLInterop) || - (memoryType() == D3D11Interop)); - - bool ignoreParentHandle = - ((memoryType() == ImageView) || (memoryType() == ImageBuffer)); - - // Create imageview if it was requested - if (calRes && - (imageCreateView || interopBufView || hostMemOffset || viewDefined)) { - - gslResOriginal = gslResource; - - // Disable tiling if it's a buffer view - if (interopBufView || hostMemOffset) { - viewFlags = CAL_RESALLOCVIEW_GLOBAL_BUFFER; - } - - viewResType = cal()->dimension_; - if (!viewDefined) { - viewSize.width = cal()->width_ + (pinOffset() / elementSize()); - viewSize.height = cal()->height_; - viewSize.depth = cal()->depth_; - viewOffset = hostMemOffset / static_cast(elementSize()); - viewSurfFmt = cal()->format_; - viewChannelOrder = cal()->channelOrder_; - } - - if (useRowPitch && (params->owner_ != NULL) && params->owner_->asImage() && - (params->owner_->asImage()->getRowPitch() != 0)) { - bytePitch = params->owner_->asImage()->getRowPitch(); - } - - // Allocate a view resource object - gslResource = dev().resAllocView( - gslResOriginal, viewSize, viewOffset, viewSurfFmt, - viewChannelOrder, viewResType, viewLevel, viewLayer, viewFlags, bytePitch); - - if (gslResource == 0) { - // If we don't have to keep the parent handle, - // then destroy the original resource - if (!ignoreParentHandle) { - dev().resFree(gslResOriginal); - gslResOriginal = 0; - } - LogError("ResAlloc failed!"); - return false; - } - - if (ignoreParentHandle) { - gslResOriginal = 0; - } + viewResType = cal()->dimension_; + if (!viewDefined) { + viewSize.width = cal()->width_ + (pinOffset() / elementSize()); + viewSize.height = cal()->height_; + viewSize.depth = cal()->depth_; + viewOffset = hostMemOffset / static_cast(elementSize()); + viewSurfFmt = cal()->format_; + viewChannelOrder = cal()->channelOrder_; } - if (!calRes) { - if (gslResource != 0) { - dev().resFree(gslResource); - } - if (memoryType() != Pinned) { - LogError("calResAlloc failed!"); - } - return false; + if 
(useRowPitch && (params->owner_ != NULL) && params->owner_->asImage() && + (params->owner_->asImage()->getRowPitch() != 0)) { + bytePitch = params->owner_->asImage()->getRowPitch(); } - // Find memory location - switch (gslResource->getAttribs().location) { + // Allocate a view resource object + gslResource = + dev().resAllocView(gslResOriginal, viewSize, viewOffset, viewSurfFmt, viewChannelOrder, + viewResType, viewLevel, viewLayer, viewFlags, bytePitch); + + if (gslResource == 0) { + // If we don't have to keep the parent handle, + // then destroy the original resource + if (!ignoreParentHandle) { + dev().resFree(gslResOriginal); + gslResOriginal = 0; + } + LogError("ResAlloc failed!"); + return false; + } + + if (ignoreParentHandle) { + gslResOriginal = 0; + } + } + + if (!calRes) { + if (gslResource != 0) { + dev().resFree(gslResource); + } + if (memoryType() != Pinned) { + LogError("calResAlloc failed!"); + } + return false; + } + + // Find memory location + switch (gslResource->getAttribs().location) { case GSL_MOA_MEMORY_CARD: case GSL_MOA_MEMORY_CARD_EXT: case GSL_MOA_MEMORY_CARD_LOCKABLE: case GSL_MOA_MEMORY_CARD_EXT_NONEXT: case GSL_MOA_MEMORY_CARD_BUS_ADDRESSABLE: - cal_.cardMemory_ = true; - break; + cal_.cardMemory_ = true; + break; default: - cal_.cardMemory_ = false; - break; - } + cal_.cardMemory_ = false; + break; + } - gslMemObjectAttribTiling tiling = gslResource->getAttribs().tiling; - cal_.tiled_ = (GSL_MOA_TILING_LINEAR != tiling) && - (GSL_MOA_TILING_LINEAR_GENERAL != tiling); + gslMemObjectAttribTiling tiling = gslResource->getAttribs().tiling; + cal_.tiled_ = (GSL_MOA_TILING_LINEAR != tiling) && (GSL_MOA_TILING_LINEAR_GENERAL != tiling); - // Get the heap block offset - hbOffset_ = gslResource->getSurfaceAddress() - - dev().heap().baseAddress(); - hbSize_ = static_cast(gslResource->getSurfaceSize()); + // Get the heap block offset + hbOffset_ = gslResource->getSurfaceAddress() - dev().heap().baseAddress(); + hbSize_ = 
static_cast(gslResource->getSurfaceSize()); - if (!dev().settings().use64BitPtr_ && - !((memType == Scratch) || ((memType == View) && viewOwner_->cal()->scratch_))) { - // Make sure runtime doesn't go over the address space limit for buffers - if ((memoryType() != Heap) && - (cal()->dimension_ == GSL_MOA_BUFFER) && - ((hbOffset_ + hbSize_) > (uint64_t(4) * Gi))) { - if (cal_.cardMemory_) { - LogPrintfError( - "Out of 4GB address space. Base: 0x%016llX, size: 0x%016llX!", - hbOffset_, hbSize_); + if (!dev().settings().use64BitPtr_ && + !((memType == Scratch) || ((memType == View) && viewOwner_->cal()->scratch_))) { + // Make sure runtime doesn't go over the address space limit for buffers + if ((memoryType() != Heap) && (cal()->dimension_ == GSL_MOA_BUFFER) && + ((hbOffset_ + hbSize_) > (uint64_t(4) * Gi))) { + if (cal_.cardMemory_) { + LogPrintfError("Out of 4GB address space. Base: 0x%016llX, size: 0x%016llX!", hbOffset_, + hbSize_); - dev().resFree(gslResource); - //! @note: A workaround for a Windows delay on memory destruction - //! Runtime submits a fake memory fill to force KMD to return - //! the freed memory ranges - if (IS_WINDOWS) { - uint32_t pattern = 0; - Memory* dummy = reinterpret_cast( - dev().dummyPage()->getDeviceMemory(dev())); - dev().xferMgr().fillBuffer(*dummy, &pattern, sizeof(uint32_t), - amd::Coord3D(0), amd::Coord3D(sizeof(uint32_t))); - } - if ((gslResOriginal != 0) && !ignoreParentHandle) { - dev().resFree(gslResOriginal); - gslResOriginal = 0; - } - return false; - } - else { - LogWarning("Out of 4GB address space for AHP/UHP!"); - } + dev().resFree(gslResource); + //! @note: A workaround for a Windows delay on memory destruction + //! Runtime submits a fake memory fill to force KMD to return + //! 
the freed memory ranges + if (IS_WINDOWS) { + uint32_t pattern = 0; + Memory* dummy = reinterpret_cast(dev().dummyPage()->getDeviceMemory(dev())); + dev().xferMgr().fillBuffer(*dummy, &pattern, sizeof(uint32_t), amd::Coord3D(0), + amd::Coord3D(sizeof(uint32_t))); } - } - - if (!foundCalRef) { - gslRef_ = new GslResourceReference(dev(), gslResource, gslResOriginal); - if (gslRef_ == NULL) { - LogError("Memory allocation failure!"); - dev().resFree(gslResource); - return false; + if ((gslResOriginal != 0) && !ignoreParentHandle) { + dev().resFree(gslResOriginal); + gslResOriginal = 0; } + return false; + } else { + LogWarning("Out of 4GB address space for AHP/UHP!"); + } } + } - if ((dev().settings().hsail_ || (dev().settings().oclVersion_ == OpenCL20)) && - !cal()->buffer_) { - hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast(&hwState_)); - if (0 == hwSrd_) { - return false; - } - dev().fillImageHwState(gslResource, hwState_, 8 * sizeof(uint32_t)); - hwState_[8] = GetHSAILImageFormatType(cal()->format_); - hwState_[9] = GetHSAILImageOrderType(cal()->channelOrder_, cal()->format_); - hwState_[10] = static_cast(cal()->width_); - if (memoryType() == ImageView) { - // Workaround for depth view, change tileIndex to the parent for depth view - if (viewChannelOrder == GSL_CHANNEL_ORDER_REPLICATE_R) { - if ((hwState_[3] & 0x1f00000) == 0xe00000) { - hwState_[3] = (hwState_[3] & 0xfe0fffff) | - (viewOwner_->hwState_[3] & 0x1f00000); - } - } - // Update the POW2_PAD flag, otherwise HW uses a wrong pitch value - if ((viewFlags & CAL_RESALLOCSLICEVIEW_LEVEL) || mipLevelPitchPad) { - hwState_[3] |= (viewOwner_->hwState_[3] & 0x2000000); - } - } - hwState_[11] = 0; // one extra reserved field in the argument - } - - if (desc.section == GSL_SECTION_SVM || desc.section == GSL_SECTION_SVM_ATOMICS) - { - params->owner_->setSvmPtr(reinterpret_cast(gslResource->getSurfaceAddress())); - } - - return true; -} - -void -Resource::free() -{ + if (!foundCalRef) { + gslRef_ = new 
GslResourceReference(dev(), gslResource, gslResOriginal); if (gslRef_ == NULL) { - return; + LogError("Memory allocation failure!"); + dev().resFree(gslResource); + return false; } + } - // Sanity check for the map calls - if (mapCount_ != 0) { - LogWarning("Resource wasn't unlocked, but destroyed!"); + if ((dev().settings().hsail_ || (dev().settings().oclVersion_ == OpenCL20)) && !cal()->buffer_) { + hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast(&hwState_)); + if (0 == hwSrd_) { + return false; } - const bool wait = (memoryType() != ImageView) && - (memoryType() != ImageBuffer); - - // Check if resource could be used in any queue(thread) - if (gpu_ == NULL) { - Device::ScopedLockVgpus lock(dev()); - - if (renames_.size() == 0) { - // Destroy GSL resource - if (gslResource() != 0) { - // Release all virtual memory objects on all virtual GPUs - for (uint idx = 0; idx < dev().vgpus().size(); ++idx) { - // Ignore the transfer queue, - // since it releases resources after every operation - if (dev().vgpus()[idx] != dev().xferQueue()) { - dev().vgpus()[idx]->releaseMemory(gslResource(), wait); - } - } - - //! @note: This is a workaround for bad applications that - //! 
don't unmap memory - if (mapCount_ != 0) { - unmap(NULL); - } - - // Add resource to the cache - if (!dev().resourceCache().addCalResource(&cal_, gslRef_)) { - gslFree(); - } - } - } - else { - renames_[curRename_]->cpuAddress_ = 0; - for (size_t i = 0; i < renames_.size(); ++i) { - gslRef_ = renames_[i]; - // Destroy GSL resource - if (gslResource() != 0) { - // Release all virtual memory objects on all virtual GPUs - for (uint idx = 0; idx < dev().vgpus().size(); ++idx) { - // Ignore the transfer queue, - // since it releases resources after every operation - if (dev().vgpus()[idx] != dev().xferQueue()) { - dev().vgpus()[idx]->releaseMemory(gslResource()); - } - } - gslFree(); - } - } - } - } - else { - if (renames_.size() == 0) { - // Destroy GSL resource - if (gslResource() != 0) { - // Release virtual memory object on the specified virtual GPU - gpu_->releaseMemory(gslResource(), wait); - gslFree(); - } - } - else for (size_t i = 0; i < renames_.size(); ++i) { - gslRef_ = renames_[i]; - // Destroy GSL resource - if (gslResource() != 0) { - // Release virtual memory object on the specified virtual GPUs - gpu_->releaseMemory(gslResource()); - gslFree(); - } + dev().fillImageHwState(gslResource, hwState_, 8 * sizeof(uint32_t)); + hwState_[8] = GetHSAILImageFormatType(cal()->format_); + hwState_[9] = GetHSAILImageOrderType(cal()->channelOrder_, cal()->format_); + hwState_[10] = static_cast(cal()->width_); + if (memoryType() == ImageView) { + // Workaround for depth view, change tileIndex to the parent for depth view + if (viewChannelOrder == GSL_CHANNEL_ORDER_REPLICATE_R) { + if ((hwState_[3] & 0x1f00000) == 0xe00000) { + hwState_[3] = (hwState_[3] & 0xfe0fffff) | (viewOwner_->hwState_[3] & 0x1f00000); } + } + // Update the POW2_PAD flag, otherwise HW uses a wrong pitch value + if ((viewFlags & CAL_RESALLOCSLICEVIEW_LEVEL) || mipLevelPitchPad) { + hwState_[3] |= (viewOwner_->hwState_[3] & 0x2000000); + } } + hwState_[11] = 0; // one extra reserved field in the 
argument + } - // Free SRD for images - if ((dev().settings().hsail_ || (dev().settings().oclVersion_ == OpenCL20)) && - !cal()->buffer_) { - dev().srds().freeSrdSlot(hwSrd_); - } + if (desc.section == GSL_SECTION_SVM || desc.section == GSL_SECTION_SVM_ATOMICS) { + params->owner_->setSvmPtr(reinterpret_cast(gslResource->getSurfaceAddress())); + } + + return true; } -void -Resource::writeRawData( - VirtualGPU& gpu, - size_t size, - const void* data, - bool waitForEvent) const -{ - GpuEvent event; +void Resource::free() { + if (gslRef_ == NULL) { + return; + } - // Write data size bytes to surface - // size needs to be DWORD aligned - assert((size & 3) == 0); - gpu.eventBegin(MainEngine); - gslResource()->writeDataRaw(gpu.cs(), size, data, true); - gpu.eventEnd(MainEngine, event); + // Sanity check for the map calls + if (mapCount_ != 0) { + LogWarning("Resource wasn't unlocked, but destroyed!"); + } + const bool wait = (memoryType() != ImageView) && (memoryType() != ImageBuffer); + // Check if resource could be used in any queue(thread) + if (gpu_ == NULL) { + Device::ScopedLockVgpus lock(dev()); + + if (renames_.size() == 0) { + // Destroy GSL resource + if (gslResource() != 0) { + // Release all virtual memory objects on all virtual GPUs + for (uint idx = 0; idx < dev().vgpus().size(); ++idx) { + // Ignore the transfer queue, + // since it releases resources after every operation + if (dev().vgpus()[idx] != dev().xferQueue()) { + dev().vgpus()[idx]->releaseMemory(gslResource(), wait); + } + } + + //! @note: This is a workaround for bad applications that + //! 
don't unmap memory + if (mapCount_ != 0) { + unmap(NULL); + } + + // Add resource to the cache + if (!dev().resourceCache().addCalResource(&cal_, gslRef_)) { + gslFree(); + } + } + } else { + renames_[curRename_]->cpuAddress_ = 0; + for (size_t i = 0; i < renames_.size(); ++i) { + gslRef_ = renames_[i]; + // Destroy GSL resource + if (gslResource() != 0) { + // Release all virtual memory objects on all virtual GPUs + for (uint idx = 0; idx < dev().vgpus().size(); ++idx) { + // Ignore the transfer queue, + // since it releases resources after every operation + if (dev().vgpus()[idx] != dev().xferQueue()) { + dev().vgpus()[idx]->releaseMemory(gslResource()); + } + } + gslFree(); + } + } + } + } else { + if (renames_.size() == 0) { + // Destroy GSL resource + if (gslResource() != 0) { + // Release virtual memory object on the specified virtual GPU + gpu_->releaseMemory(gslResource(), wait); + gslFree(); + } + } else + for (size_t i = 0; i < renames_.size(); ++i) { + gslRef_ = renames_[i]; + // Destroy GSL resource + if (gslResource() != 0) { + // Release virtual memory object on the specified virtual GPUs + gpu_->releaseMemory(gslResource()); + gslFree(); + } + } + } + + // Free SRD for images + if ((dev().settings().hsail_ || (dev().settings().oclVersion_ == OpenCL20)) && !cal()->buffer_) { + dev().srds().freeSrdSlot(hwSrd_); + } +} + +void Resource::writeRawData(VirtualGPU& gpu, size_t size, const void* data, + bool waitForEvent) const { + GpuEvent event; + + // Write data size bytes to surface + // size needs to be DWORD aligned + assert((size & 3) == 0); + gpu.eventBegin(MainEngine); + gslResource()->writeDataRaw(gpu.cs(), size, data, true); + gpu.eventEnd(MainEngine, event); + + setBusy(gpu, event); + // Update the global GPU event + gpu.setGpuEvent(event, false); + + if (waitForEvent) { + // Wait for event to complete + gpu.waitForEvent(&event); + } +} + +bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin, + const amd::Coord3D& 
dstOrigin, const amd::Coord3D& size, + Resource& dstResource, bool enableCopyRect, bool flushDMA, + uint bytesPerElement) const { + GpuEvent event; + bool result; + CALuint syncFlags = CAL_MEMCOPY_SYNC; + EngineType activeEngineID = gpu.engineID_; + static const bool waitOnBusyEngine = true; + // \note timing issues in Linux with sync mode + bool flush = true; + + // Check if runtime can use async memory copy, + // even if a caller didn't request async + if (dev().settings().asyncMemCopy_ && + // Keep ASYNC if profiling is disabled or sdma profiling is possible + (!gpu.profiling() || dev().settings().sdmaProfiling_) && + (!cal()->cardMemory_ || !dstResource.cal()->cardMemory_)) { + // Switch to SDMA engine + gpu.engineID_ = SdmaEngine; + syncFlags = CAL_MEMCOPY_ASYNC; + flush = false; + } + + // Wait for the resources, since runtime may use async transfers + wait(gpu, waitOnBusyEngine); + dstResource.wait(gpu, waitOnBusyEngine); + + size_t calSrcOrigin[3], calDstOrigin[3], calSize[3]; + calSrcOrigin[0] = srcOrigin[0] + pinOffset(); + calSrcOrigin[1] = srcOrigin[1]; + calSrcOrigin[2] = srcOrigin[2]; + calDstOrigin[0] = dstOrigin[0] + dstResource.pinOffset(); + calDstOrigin[1] = dstOrigin[1]; + calDstOrigin[2] = dstOrigin[2]; + calSize[0] = size[0]; + calSize[1] = size[1]; + calSize[2] = size[2]; + + result = gpu.copyPartial(event, gslResource(), calSrcOrigin, dstResource.gslResource(), + calDstOrigin, calSize, static_cast(syncFlags), + enableCopyRect, bytesPerElement); + + if (result) { + // Mark source and destination as busy setBusy(gpu, event); + dstResource.setBusy(gpu, event); + // Update the global GPU event - gpu.setGpuEvent(event, false); + gpu.setGpuEvent(event, (flush | flushDMA)); + } - if (waitForEvent) { - // Wait for event to complete - gpu.waitForEvent(&event); - } + // Restore the original engine + gpu.engineID_ = activeEngineID; + + return result; } -bool -Resource::partialMemCopyTo( - VirtualGPU& gpu, - const amd::Coord3D& srcOrigin, - const 
amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - Resource& dstResource, - bool enableCopyRect, - bool flushDMA, - uint bytesPerElement) const -{ - GpuEvent event; - bool result; - CALuint syncFlags = CAL_MEMCOPY_SYNC; - EngineType activeEngineID = gpu.engineID_; - static const bool waitOnBusyEngine = true; - // \note timing issues in Linux with sync mode - bool flush = true; +void Resource::setBusy(VirtualGPU& gpu, GpuEvent gpuEvent) const { + gpu.assignGpuEvent(gslResource(), gpuEvent); - // Check if runtime can use async memory copy, - // even if a caller didn't request async - if (dev().settings().asyncMemCopy_ && - // Keep ASYNC if profiling is disabled or sdma profiling is possible - (!gpu.profiling() || dev().settings().sdmaProfiling_) && - (!cal()->cardMemory_ || !dstResource.cal()->cardMemory_)) { - // Switch to SDMA engine - gpu.engineID_ = SdmaEngine; - syncFlags = CAL_MEMCOPY_ASYNC; - flush = false; - } - - // Wait for the resources, since runtime may use async transfers - wait(gpu, waitOnBusyEngine); - dstResource.wait(gpu, waitOnBusyEngine); - - size_t calSrcOrigin[3], calDstOrigin[3], calSize[3]; - calSrcOrigin[0] = srcOrigin[0] + pinOffset(); - calSrcOrigin[1] = srcOrigin[1]; - calSrcOrigin[2] = srcOrigin[2]; - calDstOrigin[0] = dstOrigin[0] + dstResource.pinOffset(); - calDstOrigin[1] = dstOrigin[1]; - calDstOrigin[2] = dstOrigin[2]; - calSize[0] = size[0]; - calSize[1] = size[1]; - calSize[2] = size[2]; - - result = gpu.copyPartial(event, - gslResource(), calSrcOrigin, - dstResource.gslResource(), calDstOrigin, - calSize, static_cast(syncFlags), enableCopyRect, bytesPerElement); - - if (result) { - // Mark source and destination as busy - setBusy(gpu, event); - dstResource.setBusy(gpu, event); - - // Update the global GPU event - gpu.setGpuEvent(event, (flush | flushDMA)); - } - - // Restore the original engine - gpu.engineID_ = activeEngineID; - - return result; + // If current resource is a view, then update the parent event as well + if 
(viewOwner_ != NULL) { + viewOwner_->setBusy(gpu, gpuEvent); + } } -void -Resource::setBusy( - VirtualGPU& gpu, - GpuEvent gpuEvent - ) const -{ - gpu.assignGpuEvent(gslResource(), gpuEvent); +void Resource::wait(VirtualGPU& gpu, bool waitOnBusyEngine) const { + GpuEvent* gpuEvent = gpu.getGpuEvent(gslResource()); - // If current resource is a view, then update the parent event as well - if (viewOwner_ != NULL) { - viewOwner_->setBusy(gpu, gpuEvent); - } + // Check if we have to wait unconditionally + if (!waitOnBusyEngine || + // or we have to wait only if another engine was used on this resource + (waitOnBusyEngine && (gpuEvent->engineId_ != gpu.engineID_))) { + gpu.waitForEvent(gpuEvent); + } + + // If current resource is a view and not in the global heap, + // then wait for the parent event as well + if ((viewOwner_ != NULL) && (viewOwner_ != &dev().globalMem())) { + viewOwner_->wait(gpu, waitOnBusyEngine); + } } -void -Resource::wait(VirtualGPU& gpu, bool waitOnBusyEngine) const -{ - GpuEvent* gpuEvent = gpu.getGpuEvent(gslResource()); +bool Resource::hostWrite(VirtualGPU* gpu, const void* hostPtr, const amd::Coord3D& origin, + const amd::Coord3D& size, uint flags, size_t rowPitch, size_t slicePitch) { + void* dst; - // Check if we have to wait unconditionally - if (!waitOnBusyEngine || - // or we have to wait only if another engine was used on this resource - (waitOnBusyEngine && (gpuEvent->engineId_ != gpu.engineID_))) { - gpu.waitForEvent(gpuEvent); - } - - // If current resource is a view and not in the global heap, - // then wait for the parent event as well - if ((viewOwner_ != NULL) && (viewOwner_ != &dev().globalMem())) { - viewOwner_->wait(gpu, waitOnBusyEngine); - } -} - -bool -Resource::hostWrite( - VirtualGPU* gpu, - const void* hostPtr, - const amd::Coord3D& origin, - const amd::Coord3D& size, - uint flags, - size_t rowPitch, - size_t slicePitch) -{ - void* dst; - - size_t startLayer = origin[2]; - size_t numLayers = size[2]; - if 
(cal()->dimension_ == GSL_MOA_TEXTURE_1D_ARRAY) { - startLayer = origin[1]; - numLayers = size[1]; - } - - // Get physical GPU memmory - dst = map(gpu, flags, startLayer, numLayers); - if (NULL == dst) { - LogError("Couldn't map GPU memory for host write"); - return false; - } - - if (1 == cal()->dimSize_) { - size_t copySize = (cal()->buffer_) ? size[0] : size[0] * elementSize_; - - // Update the pointer - dst = static_cast(static_cast(dst) + origin[0]); - - // Copy memory - amd::Os::fastMemcpy(dst, hostPtr, copySize); - } - else { - size_t srcOffs = 0; - size_t dstOffsBase = origin[0] * elementSize_; - size_t dstOffs; - - // Make sure we use the right pitch if it's not specified - if (rowPitch == 0) { - rowPitch = size[0] * elementSize_; - } - - // Make sure we use the right slice if it's not specified - if (slicePitch == 0) { - slicePitch = size[0] * size[1] * elementSize_; - } - - // Adjust the destination offset with Y dimension - dstOffsBase += cal()->pitch_ * origin[1] * elementSize_; - - // Adjust the destination offset with Z dimension - dstOffsBase += cal()->slice_ * origin[2] * elementSize_; - - // Copy memory slice by slice - for (size_t slice = 0; slice < size[2]; ++slice) { - dstOffs = dstOffsBase + slice * cal()->slice_ * elementSize_; - srcOffs = slice * slicePitch; - - // Copy memory line by line - for (size_t row = 0; row < size[1]; ++row) { - // Copy memory - amd::Os::fastMemcpy( - (reinterpret_cast
(dst) + dstOffs), - (reinterpret_cast(hostPtr) + srcOffs), - size[0] * elementSize_); - - dstOffs += cal()->pitch_ * elementSize_; - srcOffs += rowPitch; - } - } - } - - // Unmap GPU memory - unmap(gpu); - - return true; -} - -bool -Resource::hostRead( - VirtualGPU* gpu, - void* hostPtr, - const amd::Coord3D& origin, - const amd::Coord3D& size, - size_t rowPitch, - size_t slicePitch) -{ - void* src; - - size_t startLayer = origin[2]; - size_t numLayers = size[2]; - if (cal()->dimension_ == GSL_MOA_TEXTURE_1D_ARRAY) { - startLayer = origin[1]; - numLayers = size[1]; - } - - // Get physical GPU memmory - src = map(gpu, ReadOnly, startLayer, numLayers); - if (NULL == src) { - LogError("Couldn't map GPU memory for host read"); - return false; - } - - if (1 == cal()->dimSize_) { - size_t copySize = (cal()->buffer_) ? size[0] : size[0] * elementSize_; - - // Update the pointer - src = static_cast(static_cast(src) + origin[0]); - - // Copy memory - amd::Os::fastMemcpy(hostPtr, src, copySize); - } - else { - size_t srcOffsBase = origin[0] * elementSize_; - size_t srcOffs; - size_t dstOffs = 0; - - // Make sure we use the right pitch if it's not specified - if (rowPitch == 0) { - rowPitch = size[0] * elementSize_; - } - - // Make sure we use the right slice if it's not specified - if (slicePitch == 0) { - slicePitch = size[0] * size[1] * elementSize_; - } - - // Adjust destination offset with Y dimension - srcOffsBase += cal()->pitch_ * origin[1] * elementSize_; - - // Adjust the destination offset with Z dimension - srcOffsBase += cal()->slice_ * origin[2] * elementSize_; - - // Copy memory line by line - for (size_t slice = 0; slice < size[2]; ++slice) { - srcOffs = srcOffsBase + slice * cal()->slice_ * elementSize_; - dstOffs = slice * slicePitch; - - // Copy memory line by line - for (size_t row = 0; row < size[1]; ++row) { - // Copy memory - amd::Os::fastMemcpy( - (reinterpret_cast
(hostPtr) + dstOffs), - (reinterpret_cast(src) + srcOffs), - size[0] * elementSize_); - - srcOffs += cal()->pitch_ * elementSize_; - dstOffs += rowPitch; - } - } - } - - // Unmap GPU memory - unmap(gpu); - - return true; -} - -void* -Resource::gslMap(size_t* pitch, gslMapAccessType flags, gslMemObject resource) const -{ - if (cal_.cardMemory_ || cal_.tiled_) { - // @todo remove const cast - return const_cast(dev()).resMapLocal(*pitch, resource, flags); - } - else { - return dev().resMapRemote(*pitch, resource, flags); - } -} - -void -Resource::gslUnmap(gslMemObject resource) const -{ - if (cal_.cardMemory_) { - // @todo remove const cast - const_cast(dev()).resUnmapLocal(resource); - } - else { - dev().resUnmapRemote(resource); - } -} - -bool -Resource::gslGLAcquire() -{ - bool retVal = true; - if (cal()->type_ == OGLInterop) { - retVal = dev().resGLAcquire(glPlatformContext_,glInteropMbRes_, glType_); - } - return retVal; -} - -bool -Resource::gslGLRelease() -{ - bool retVal = true; - if (cal()->type_ == OGLInterop) { - retVal = dev().resGLRelease(glPlatformContext_,glInteropMbRes_, glType_); - } - return retVal; -} -void -Resource::gslFree() const -{ - if (cal()->type_ == OGLInterop) { - if (0 == gslRef_->resOriginal_) { - dev().resGLFree(glPlatformContext_, glDeviceContext_, - gslRef_->resource_, glInterop_, glInteropMbRes_, glType_); - gslRef_->resource_ = 0; - } - else { - dev().resFree(gslRef_->resource_); - gslRef_->resource_ = 0; - dev().resGLFree(glPlatformContext_, glDeviceContext_, - gslRef_->resOriginal_, glInterop_, glInteropMbRes_, glType_); - gslRef_->resOriginal_ = 0; - } - } - gslRef_->release(); -} - -bool -Resource::isMemoryType(MemoryType memType) const -{ - if (memoryType() == memType) { - return true; - } - else if (memoryType() == View) { - return viewOwner_->isMemoryType(memType); - } + size_t startLayer = origin[2]; + size_t numLayers = size[2]; + if (cal()->dimension_ == GSL_MOA_TEXTURE_1D_ARRAY) { + startLayer = origin[1]; + numLayers = 
size[1]; + } + // Get physical GPU memmory + dst = map(gpu, flags, startLayer, numLayers); + if (NULL == dst) { + LogError("Couldn't map GPU memory for host write"); return false; + } + + if (1 == cal()->dimSize_) { + size_t copySize = (cal()->buffer_) ? size[0] : size[0] * elementSize_; + + // Update the pointer + dst = static_cast(static_cast(dst) + origin[0]); + + // Copy memory + amd::Os::fastMemcpy(dst, hostPtr, copySize); + } else { + size_t srcOffs = 0; + size_t dstOffsBase = origin[0] * elementSize_; + size_t dstOffs; + + // Make sure we use the right pitch if it's not specified + if (rowPitch == 0) { + rowPitch = size[0] * elementSize_; + } + + // Make sure we use the right slice if it's not specified + if (slicePitch == 0) { + slicePitch = size[0] * size[1] * elementSize_; + } + + // Adjust the destination offset with Y dimension + dstOffsBase += cal()->pitch_ * origin[1] * elementSize_; + + // Adjust the destination offset with Z dimension + dstOffsBase += cal()->slice_ * origin[2] * elementSize_; + + // Copy memory slice by slice + for (size_t slice = 0; slice < size[2]; ++slice) { + dstOffs = dstOffsBase + slice * cal()->slice_ * elementSize_; + srcOffs = slice * slicePitch; + + // Copy memory line by line + for (size_t row = 0; row < size[1]; ++row) { + // Copy memory + amd::Os::fastMemcpy((reinterpret_cast
(dst) + dstOffs), + (reinterpret_cast(hostPtr) + srcOffs), + size[0] * elementSize_); + + dstOffs += cal()->pitch_ * elementSize_; + srcOffs += rowPitch; + } + } + } + + // Unmap GPU memory + unmap(gpu); + + return true; } -bool -Resource::isPersistentDirectMap() const -{ - bool directMap = ((memoryType() == Resource::Persistent) && - (cal()->dimSize_ < 3) && !cal()->imageArray_); +bool Resource::hostRead(VirtualGPU* gpu, void* hostPtr, const amd::Coord3D& origin, + const amd::Coord3D& size, size_t rowPitch, size_t slicePitch) { + void* src; - // If direct map is possible, then validate it with the current tiling - if (directMap && cal()->tiled_) { - //!@note IOL for Linux doesn't support tiling aperture - // and runtime doesn't force linear images in persistent - directMap = IS_WINDOWS && !dev().settings().linearPersistentImage_; + size_t startLayer = origin[2]; + size_t numLayers = size[2]; + if (cal()->dimension_ == GSL_MOA_TEXTURE_1D_ARRAY) { + startLayer = origin[1]; + numLayers = size[1]; + } + + // Get physical GPU memmory + src = map(gpu, ReadOnly, startLayer, numLayers); + if (NULL == src) { + LogError("Couldn't map GPU memory for host read"); + return false; + } + + if (1 == cal()->dimSize_) { + size_t copySize = (cal()->buffer_) ? 
size[0] : size[0] * elementSize_; + + // Update the pointer + src = static_cast(static_cast(src) + origin[0]); + + // Copy memory + amd::Os::fastMemcpy(hostPtr, src, copySize); + } else { + size_t srcOffsBase = origin[0] * elementSize_; + size_t srcOffs; + size_t dstOffs = 0; + + // Make sure we use the right pitch if it's not specified + if (rowPitch == 0) { + rowPitch = size[0] * elementSize_; } - return directMap; + // Make sure we use the right slice if it's not specified + if (slicePitch == 0) { + slicePitch = size[0] * size[1] * elementSize_; + } + + // Adjust destination offset with Y dimension + srcOffsBase += cal()->pitch_ * origin[1] * elementSize_; + + // Adjust the destination offset with Z dimension + srcOffsBase += cal()->slice_ * origin[2] * elementSize_; + + // Copy memory line by line + for (size_t slice = 0; slice < size[2]; ++slice) { + srcOffs = srcOffsBase + slice * cal()->slice_ * elementSize_; + dstOffs = slice * slicePitch; + + // Copy memory line by line + for (size_t row = 0; row < size[1]; ++row) { + // Copy memory + amd::Os::fastMemcpy((reinterpret_cast
(hostPtr) + dstOffs), + (reinterpret_cast(src) + srcOffs), + size[0] * elementSize_); + + srcOffs += cal()->pitch_ * elementSize_; + dstOffs += rowPitch; + } + } + } + + // Unmap GPU memory + unmap(gpu); + + return true; } -void* -Resource::map(VirtualGPU* gpu, uint flags, uint startLayer, uint numLayers) -{ - if (isMemoryType(Pinned)) { - // Check if we have to wait - if (!(flags & NoWait)) { - if (gpu != NULL) { - wait(*gpu); - } - } - return address_; +void* Resource::gslMap(size_t* pitch, gslMapAccessType flags, gslMemObject resource) const { + if (cal_.cardMemory_ || cal_.tiled_) { + // @todo remove const cast + return const_cast(dev()).resMapLocal(*pitch, resource, flags); + } else { + return dev().resMapRemote(*pitch, resource, flags); + } +} + +void Resource::gslUnmap(gslMemObject resource) const { + if (cal_.cardMemory_) { + // @todo remove const cast + const_cast(dev()).resUnmapLocal(resource); + } else { + dev().resUnmapRemote(resource); + } +} + +bool Resource::gslGLAcquire() { + bool retVal = true; + if (cal()->type_ == OGLInterop) { + retVal = dev().resGLAcquire(glPlatformContext_, glInteropMbRes_, glType_); + } + return retVal; +} + +bool Resource::gslGLRelease() { + bool retVal = true; + if (cal()->type_ == OGLInterop) { + retVal = dev().resGLRelease(glPlatformContext_, glInteropMbRes_, glType_); + } + return retVal; +} +void Resource::gslFree() const { + if (cal()->type_ == OGLInterop) { + if (0 == gslRef_->resOriginal_) { + dev().resGLFree(glPlatformContext_, glDeviceContext_, gslRef_->resource_, glInterop_, + glInteropMbRes_, glType_); + gslRef_->resource_ = 0; + } else { + dev().resFree(gslRef_->resource_); + gslRef_->resource_ = 0; + dev().resGLFree(glPlatformContext_, glDeviceContext_, gslRef_->resOriginal_, glInterop_, + glInteropMbRes_, glType_); + gslRef_->resOriginal_ = 0; } + } + gslRef_->release(); +} - gslMapAccessType mapFlags = GSL_MAP_READ_WRITE; +bool Resource::isMemoryType(MemoryType memType) const { + if (memoryType() == memType) 
{ + return true; + } else if (memoryType() == View) { + return viewOwner_->isMemoryType(memType); + } - if (flags & ReadOnly) { - assert(!(flags & Discard) && "We can't use lock discard with read only!"); - mapFlags = GSL_MAP_READ_ONLY; - } + return false; +} - if (flags & WriteOnly) { - mapFlags = GSL_MAP_WRITE_ONLY; - } +bool Resource::isPersistentDirectMap() const { + bool directMap = + ((memoryType() == Resource::Persistent) && (cal()->dimSize_ < 3) && !cal()->imageArray_); - // Check if use map discard - if (flags & Discard) { - mapFlags = GSL_MAP_WRITE_ONLY; - if (gpu != NULL) { - // If we use a new renamed allocation, then skip the wait - if (rename(*gpu)) { - flags |= NoWait; - } - } - } + // If direct map is possible, then validate it with the current tiling + if (directMap && cal()->tiled_) { + //!@note IOL for Linux doesn't support tiling aperture + // and runtime doesn't force linear images in persistent + directMap = IS_WINDOWS && !dev().settings().linearPersistentImage_; + } + return directMap; +} + +void* Resource::map(VirtualGPU* gpu, uint flags, uint startLayer, uint numLayers) { + if (isMemoryType(Pinned)) { // Check if we have to wait if (!(flags & NoWait)) { - if (gpu != NULL) { - wait(*gpu); - } + if (gpu != NULL) { + wait(*gpu); + } } - - // Check if memory wasn't mapped yet - if (++mapCount_ == 1) { - if ((cal()->dimSize_ == 3) || cal()->imageArray_ || - ((cal()->type_ == ImageView) && viewOwner_->mipMapped())) { - // Save map info for multilayer map/unmap - startLayer_ = startLayer; - numLayers_ = numLayers; - mapFlags_ = mapFlags; - // Map with layers - address_ = mapLayers(gpu, mapFlags); - } - else { - // Map current resource - address_ = gslMap(&cal_.pitch_, mapFlags, gslResource()); - if (address_ == NULL) { - LogError("cal::ResMap failed!"); - --mapCount_; - return NULL; - } - } - } - - //! 
\note the atomic operation with counter doesn't - // guarantee that the address will be valid, - // since GSL could still process the first map - if (address_ == NULL) { - for (uint i = 0; address_ == NULL && i < 10; ++i) { - amd::Os::sleep(1); - } - assert((address_ != NULL) && "Multiple maps failed!"); - } - return address_; + } + + gslMapAccessType mapFlags = GSL_MAP_READ_WRITE; + + if (flags & ReadOnly) { + assert(!(flags & Discard) && "We can't use lock discard with read only!"); + mapFlags = GSL_MAP_READ_ONLY; + } + + if (flags & WriteOnly) { + mapFlags = GSL_MAP_WRITE_ONLY; + } + + // Check if use map discard + if (flags & Discard) { + mapFlags = GSL_MAP_WRITE_ONLY; + if (gpu != NULL) { + // If we use a new renamed allocation, then skip the wait + if (rename(*gpu)) { + flags |= NoWait; + } + } + } + + // Check if we have to wait + if (!(flags & NoWait)) { + if (gpu != NULL) { + wait(*gpu); + } + } + + // Check if memory wasn't mapped yet + if (++mapCount_ == 1) { + if ((cal()->dimSize_ == 3) || cal()->imageArray_ || + ((cal()->type_ == ImageView) && viewOwner_->mipMapped())) { + // Save map info for multilayer map/unmap + startLayer_ = startLayer; + numLayers_ = numLayers; + mapFlags_ = mapFlags; + // Map with layers + address_ = mapLayers(gpu, mapFlags); + } else { + // Map current resource + address_ = gslMap(&cal_.pitch_, mapFlags, gslResource()); + if (address_ == NULL) { + LogError("cal::ResMap failed!"); + --mapCount_; + return NULL; + } + } + } + + //! 
\note the atomic operation with counter doesn't + // guarantee that the address will be valid, + // since GSL could still process the first map + if (address_ == NULL) { + for (uint i = 0; address_ == NULL && i < 10; ++i) { + amd::Os::sleep(1); + } + assert((address_ != NULL) && "Multiple maps failed!"); + } + + return address_; } -void* -Resource::mapLayers(VirtualGPU* gpu, CALuint flags) -{ - size_t srcOffs = 0; - size_t dstOffs = 0; - gslMemObject sliceResource = 0; - gslMemObjectAttribType gslDim = GSL_MOA_TEXTURE_2D; - size_t layers = cal()->depth_; - size_t height = cal()->height_; +void* Resource::mapLayers(VirtualGPU* gpu, CALuint flags) { + size_t srcOffs = 0; + size_t dstOffs = 0; + gslMemObject sliceResource = 0; + gslMemObjectAttribType gslDim = GSL_MOA_TEXTURE_2D; + size_t layers = cal()->depth_; + size_t height = cal()->height_; - // Use 1D layers - if (GSL_MOA_TEXTURE_1D_ARRAY == cal()->dimension_) { - gslDim = GSL_MOA_TEXTURE_1D; - height = 1; - layers = cal()->height_; + // Use 1D layers + if (GSL_MOA_TEXTURE_1D_ARRAY == cal()->dimension_) { + gslDim = GSL_MOA_TEXTURE_1D; + height = 1; + layers = cal()->height_; + } + + cal_.pitch_ = cal()->width_; + cal_.slice_ = cal()->pitch_ * height; + address_ = new char[cal()->slice_ * layers * elementSize()]; + if (NULL == address_) { + return NULL; + } + + // Check if map is write only + if (flags == GSL_MAP_WRITE_ONLY) { + return address_; + } + + if (numLayers_ != 0) { + layers = startLayer_ + numLayers_; + } + + dstOffs = startLayer_ * cal()->slice_ * elementSize(); + + // Loop through all layers + for (uint i = startLayer_; i < layers; ++i) { + gslResource3D gslSize; + size_t calOffset; + void* sliceAddr; + size_t pitch; + + // Allocate a layer from the image + gslSize.width = cal()->width_; + gslSize.height = height; + gslSize.depth = 1; + calOffset = 0; + sliceResource = + dev().resAllocView(gslResource(), gslSize, calOffset, cal()->format_, cal()->channelOrder_, + gslDim, 0, i, 
CAL_RESALLOCSLICEVIEW_LEVEL_AND_LAYER); + if (0 == sliceResource) { + LogError("Map layer. resAllocSliceView failed!"); + return NULL; } - cal_.pitch_ = cal()->width_; - cal_.slice_ = cal()->pitch_ * height; - address_ = new char [cal()->slice_ * layers * elementSize()]; - if (NULL == address_) { - return NULL; + // Map 2D layer + sliceAddr = gslMap(&pitch, GSL_MAP_READ_ONLY, sliceResource); + if (sliceAddr == NULL) { + LogError("Map layer. CalResMap failed!"); + return NULL; } - // Check if map is write only - if (flags == GSL_MAP_WRITE_ONLY) { - return address_; + srcOffs = 0; + // Copy memory line by line + for (size_t rows = 0; rows < height; ++rows) { + // Copy memory + amd::Os::fastMemcpy((reinterpret_cast
(address_) + dstOffs), + (reinterpret_cast(sliceAddr) + srcOffs), + cal()->width_ * elementSize_); + + dstOffs += cal()->pitch_ * elementSize(); + srcOffs += pitch * elementSize(); } - if (numLayers_ != 0) { - layers = startLayer_ + numLayers_; + // Unmap a layer + gslUnmap(sliceResource); + dev().resFree(sliceResource); + } + + return address_; +} + +void Resource::unmap(VirtualGPU* gpu) { + if (isMemoryType(Pinned)) { + return; + } + + // Decrement map counter + int count = --mapCount_; + + // Check if it's the last unmap + if (count == 0) { + if ((cal()->dimSize_ == 3) || cal()->imageArray_ || + ((cal()->type_ == ImageView) && viewOwner_->mipMapped())) { + // Unmap layers + unmapLayers(gpu); + } else { + // Unmap current resource + gslUnmap(gslResource()); } + address_ = NULL; + } else if (count < 0) { + LogError("dev().serialCalResUnmap failed!"); + ++mapCount_; + return; + } +} - dstOffs = startLayer_ * cal()->slice_ * elementSize(); +void Resource::unmapLayers(VirtualGPU* gpu) { + size_t srcOffs = 0; + size_t dstOffs = 0; + gslMemObjectAttribType gslDim = GSL_MOA_TEXTURE_2D; + gslMemObject sliceResource = NULL; + CALuint layers = cal()->depth_; + CALuint height = cal()->height_; + // Use 1D layers + if (GSL_MOA_TEXTURE_1D_ARRAY == cal()->dimension_) { + gslDim = GSL_MOA_TEXTURE_1D; + height = 1; + layers = cal()->height_; + } + + if (numLayers_ != 0) { + layers = startLayer_ + numLayers_; + } + + srcOffs = startLayer_ * cal()->slice_ * elementSize(); + + // Check if map is write only + if (!(mapFlags_ == GSL_MAP_READ_ONLY)) { // Loop through all layers for (uint i = startLayer_; i < layers; ++i) { - gslResource3D gslSize; - size_t calOffset; - void* sliceAddr; - size_t pitch; + gslResource3D gslSize; + size_t calOffset; + void* sliceAddr; + size_t pitch; - // Allocate a layer from the image - gslSize.width = cal()->width_; - gslSize.height = height; - gslSize.depth = 1; - calOffset = 0; - sliceResource = dev().resAllocView( - gslResource(), gslSize, - 
calOffset, cal()->format_, cal()->channelOrder_, gslDim, - 0, i, CAL_RESALLOCSLICEVIEW_LEVEL_AND_LAYER); - if (0 == sliceResource) { - LogError("Map layer. resAllocSliceView failed!"); - return NULL; - } - - // Map 2D layer - sliceAddr = gslMap(&pitch, GSL_MAP_READ_ONLY, sliceResource); - if (sliceAddr == NULL) { - LogError("Map layer. CalResMap failed!"); - return NULL; - } - - srcOffs = 0; - // Copy memory line by line - for (size_t rows = 0; rows < height; ++rows) { - // Copy memory - amd::Os::fastMemcpy( - (reinterpret_cast
(address_) + dstOffs), - (reinterpret_cast(sliceAddr) + srcOffs), - cal()->width_ * elementSize_); - - dstOffs += cal()->pitch_ * elementSize(); - srcOffs += pitch * elementSize(); - } - - // Unmap a layer - gslUnmap(sliceResource); - dev().resFree(sliceResource); - } - - return address_; -} - -void -Resource::unmap(VirtualGPU* gpu) -{ - if (isMemoryType(Pinned)) { + // Allocate a layer from the image + gslSize.width = cal()->width_; + gslSize.height = height; + gslSize.depth = 1; + calOffset = 0; + sliceResource = dev().resAllocView(gslResource(), gslSize, calOffset, cal()->format_, + cal()->channelOrder_, gslDim, 0, i, + CAL_RESALLOCSLICEVIEW_LEVEL_AND_LAYER); + if (0 == sliceResource) { + LogError("Unmap layer. resAllocSliceView failed!"); return; - } + } - // Decrement map counter - int count = --mapCount_; - - // Check if it's the last unmap - if (count == 0) { - if ((cal()->dimSize_ == 3) || cal()->imageArray_ || - ((cal()->type_ == ImageView) && viewOwner_->mipMapped())) { - // Unmap layers - unmapLayers(gpu); - } - else { - // Unmap current resource - gslUnmap(gslResource()); - } - address_ = NULL; - } - else if (count < 0) { - LogError("dev().serialCalResUnmap failed!"); - ++mapCount_; + // Map a layer + sliceAddr = gslMap(&pitch, GSL_MAP_WRITE_ONLY, sliceResource); + if (sliceAddr == NULL) { + LogError("Unmap layer. CalResMap failed!"); return; + } + + dstOffs = 0; + // Copy memory line by line + for (size_t rows = 0; rows < height; ++rows) { + // Copy memory + amd::Os::fastMemcpy((reinterpret_cast
(sliceAddr) + dstOffs), + (reinterpret_cast(address_) + srcOffs), + cal()->width_ * elementSize_); + + dstOffs += pitch * elementSize(); + srcOffs += cal()->pitch_ * elementSize(); + } + + // Unmap a layer + gslUnmap(sliceResource); + dev().resFree(sliceResource); } + } + + // Destroy the mapped memory + delete[] reinterpret_cast(address_); } -void -Resource::unmapLayers(VirtualGPU* gpu) -{ - size_t srcOffs = 0; - size_t dstOffs = 0; - gslMemObjectAttribType gslDim = GSL_MOA_TEXTURE_2D; - gslMemObject sliceResource = NULL; - CALuint layers = cal()->depth_; - CALuint height = cal()->height_; +void Resource::setActiveRename(VirtualGPU& gpu, GslResourceReference* rename) { + // Copy the unique GSL data + gslRef_ = rename; + address_ = rename->cpuAddress_; - // Use 1D layers - if (GSL_MOA_TEXTURE_1D_ARRAY == cal()->dimension_) { - gslDim = GSL_MOA_TEXTURE_1D; - height = 1; - layers = cal()->height_; - } - - if (numLayers_ != 0) { - layers = startLayer_ + numLayers_; - } - - srcOffs = startLayer_ * cal()->slice_ * elementSize(); - - // Check if map is write only - if (!(mapFlags_ == GSL_MAP_READ_ONLY)) { - // Loop through all layers - for (uint i = startLayer_; i < layers; ++i) { - gslResource3D gslSize; - size_t calOffset; - void* sliceAddr; - size_t pitch; - - // Allocate a layer from the image - gslSize.width = cal()->width_; - gslSize.height = height; - gslSize.depth = 1; - calOffset = 0; - sliceResource = dev().resAllocView( - gslResource(), gslSize, - calOffset, cal()->format_, cal()->channelOrder_, gslDim, - 0, i, CAL_RESALLOCSLICEVIEW_LEVEL_AND_LAYER); - if (0 == sliceResource) { - LogError("Unmap layer. resAllocSliceView failed!"); - return; - } - - // Map a layer - sliceAddr = gslMap(&pitch, GSL_MAP_WRITE_ONLY, sliceResource); - if (sliceAddr == NULL) { - LogError("Unmap layer. 
CalResMap failed!"); - return; - } - - dstOffs = 0; - // Copy memory line by line - for (size_t rows = 0; rows < height; ++rows) { - // Copy memory - amd::Os::fastMemcpy( - (reinterpret_cast
(sliceAddr) + dstOffs), - (reinterpret_cast(address_) + srcOffs), - cal()->width_ * elementSize_); - - dstOffs += pitch * elementSize(); - srcOffs += cal()->pitch_ * elementSize(); - } - - // Unmap a layer - gslUnmap(sliceResource); - dev().resFree(sliceResource); - } - } - - // Destroy the mapped memory - delete [] reinterpret_cast(address_); + hbOffset_ = rename->gslResource()->getSurfaceAddress() - dev().heap().baseAddress(); } -void -Resource::setActiveRename(VirtualGPU& gpu, GslResourceReference* rename) -{ - // Copy the unique GSL data - gslRef_ = rename; - address_ = rename->cpuAddress_; - - hbOffset_ = rename->gslResource()->getSurfaceAddress() - - dev().heap().baseAddress(); +bool Resource::getActiveRename(VirtualGPU& gpu, GslResourceReference** rename) { + // Copy the old data to the rename descriptor + *rename = gslRef_; + return true; } -bool -Resource::getActiveRename(VirtualGPU& gpu, GslResourceReference** rename) -{ - // Copy the old data to the rename descriptor - *rename = gslRef_; +bool Resource::rename(VirtualGPU& gpu, bool force) { + GpuEvent* gpuEvent = gpu.getGpuEvent(gslResource()); + if (!gpuEvent->isValid() && !force) { return true; -} + } -bool -Resource::rename(VirtualGPU& gpu, bool force) -{ - GpuEvent* gpuEvent = gpu.getGpuEvent(gslResource()); - if (!gpuEvent->isValid() && !force) { - return true; + bool useNext = false; + CALuint resSize = cal()->width_ * ((cal()->height_) ? 
cal()->height_ : 1) * elementSize_; + + // Rename will work with real GSL resources + if (((memoryType() != Local) && (memoryType() != Persistent) && (memoryType() != Remote) && + (memoryType() != RemoteUSWC)) || + (dev().settings().maxRenames_ == 0)) { + return false; + } + + // If the resource for renaming is too big, then lets check the current status first + // at the cost of an extra flush + if (resSize >= (dev().settings().maxRenameSize_ / dev().settings().maxRenames_)) { + if (gpu.isDone(gpuEvent)) { + return true; + } + } + + // Save the first + if (renames_.size() == 0) { + GslResourceReference* rename; + if (mapCount_ > 0) { + gslRef_->cpuAddress_ = address_; + } + if (!getActiveRename(gpu, &rename)) { + return false; } - bool useNext = false; - CALuint resSize = cal()->width_ * ((cal()->height_) ? cal()->height_ : 1) * - elementSize_; + curRename_ = renames_.size(); + renames_.push_back(rename); + } - // Rename will work with real GSL resources - if (((memoryType() != Local) && - (memoryType() != Persistent) && - (memoryType() != Remote) && - (memoryType() != RemoteUSWC)) || - (dev().settings().maxRenames_ == 0)) { - return false; - } + // Can we use a new rename? 
+ if ((renames_.size() <= dev().settings().maxRenames_) && + ((renames_.size() * resSize) <= dev().settings().maxRenameSize_)) { + GslResourceReference* rename; - // If the resource for renaming is too big, then lets check the current status first - // at the cost of an extra flush - if (resSize >= (dev().settings().maxRenameSize_ / dev().settings().maxRenames_)) { - if (gpu.isDone(gpuEvent)) { - return true; + // Create a new GSL allocation + if (create(memoryType())) { + if (mapCount_ > 0) { + assert(!cal()->cardMemory_ && "Unsupported memory type!"); + gslRef_->cpuAddress_ = dev().resMapRemote(cal_.pitch_, gslResource(), GSL_MAP_READ_WRITE); + if (gslRef_->cpuAddress_ == NULL) { + LogError("gslMap fails on rename!"); } - } - - // Save the first - if (renames_.size() == 0) { - GslResourceReference* rename; - if (mapCount_ > 0) { - gslRef_->cpuAddress_ = address_; - } - if (!getActiveRename(gpu, &rename)) { - return false; - } - + address_ = gslRef_->cpuAddress_; + } + if (getActiveRename(gpu, &rename)) { curRename_ = renames_.size(); renames_.push_back(rename); + } else { + gslRef_->release(); + useNext = true; + } + } else { + useNext = true; } + } else { + useNext = true; + } - // Can we use a new rename? 
- if ((renames_.size() <= dev().settings().maxRenames_) && - ((renames_.size() * resSize) <= dev().settings().maxRenameSize_)) { - GslResourceReference* rename; - - // Create a new GSL allocation - if (create(memoryType())) { - if (mapCount_ > 0) { - assert(!cal()->cardMemory_ && "Unsupported memory type!"); - gslRef_->cpuAddress_ = dev().resMapRemote(cal_.pitch_, - gslResource(), GSL_MAP_READ_WRITE); - if (gslRef_->cpuAddress_ == NULL) { - LogError("gslMap fails on rename!"); - } - address_ = gslRef_->cpuAddress_; - } - if (getActiveRename(gpu, &rename)) { - curRename_ = renames_.size(); - renames_.push_back(rename); - } - else { - gslRef_->release(); - useNext = true; - } - } - else { - useNext = true; - } - } - else { - useNext = true; + if (useNext) { + // Get the last submitted + curRename_++; + if (curRename_ >= renames_.size()) { + curRename_ = 0; } + setActiveRename(gpu, renames_[curRename_]); + return false; + } - if (useNext) { - // Get the last submitted - curRename_++; - if (curRename_ >= renames_.size()) { - curRename_ = 0; - } - setActiveRename(gpu, renames_[curRename_]); - return false; - } - - return true; + return true; } -void -Resource::warmUpRenames(VirtualGPU& gpu) -{ - for (uint i = 0; i < dev().settings().maxRenames_; ++i) { - // EPR #411675 - On Kaveri, benchmark "photo editing" of PCMarks takes longer time - // if writing 0 for the buffer paging by VidMM is excuted. Not sure how PCMarks measures it. - // Disable this code for apu - if (!dev().settings().apuSystem_) { - uint dummy = 0; - const bool NoWait = false; - // Write 0 for the buffer paging by VidMM - writeRawData(gpu, sizeof(dummy), &dummy, NoWait); - } - const bool Force = true; - rename(gpu, Force); +void Resource::warmUpRenames(VirtualGPU& gpu) { + for (uint i = 0; i < dev().settings().maxRenames_; ++i) { + // EPR #411675 - On Kaveri, benchmark "photo editing" of PCMarks takes longer time + // if writing 0 for the buffer paging by VidMM is excuted. 
Not sure how PCMarks measures it. + // Disable this code for apu + if (!dev().settings().apuSystem_) { + uint dummy = 0; + const bool NoWait = false; + // Write 0 for the buffer paging by VidMM + writeRawData(gpu, sizeof(dummy), &dummy, NoWait); } + const bool Force = true; + rename(gpu, Force); + } } -ResourceCache::~ResourceCache() -{ - free(); -} +ResourceCache::~ResourceCache() { free(); } //! \note the cache works in FILO mode -bool -ResourceCache::addCalResource( - Resource::CalResourceDesc* desc, GslResourceReference* ref) -{ - amd::ScopedLock l(&lockCacheOps_); - bool result = false; - size_t size = getResourceSize(desc); +bool ResourceCache::addCalResource(Resource::CalResourceDesc* desc, GslResourceReference* ref) { + amd::ScopedLock l(&lockCacheOps_); + bool result = false; + size_t size = getResourceSize(desc); - // Make sure current allocation isn't bigger than cache - if (((desc->type_ == Resource::Local) || - (desc->type_ == Resource::Persistent) || - (desc->type_ == Resource::Remote) || - (desc->type_ == Resource::RemoteUSWC)) && - (size < cacheSizeLimit_) && - !desc->skipRsrcCache_) { - // Validate the cache size limit. Loop until we have enough space - while ((cacheSize_ + size) > cacheSizeLimit_) { - removeLast(); - } - Resource::CalResourceDesc* descCached = new Resource::CalResourceDesc; - if (descCached != NULL) { - // Copy the original desc to the cached version - memcpy(descCached, desc, sizeof(Resource::CalResourceDesc)); - - // Add the current resource to the cache - resCache_.push_front(std::make_pair(descCached, ref)); - cacheSize_ += size; - result = true; - } + // Make sure current allocation isn't bigger than cache + if (((desc->type_ == Resource::Local) || (desc->type_ == Resource::Persistent) || + (desc->type_ == Resource::Remote) || (desc->type_ == Resource::RemoteUSWC)) && + (size < cacheSizeLimit_) && !desc->skipRsrcCache_) { + // Validate the cache size limit. 
Loop until we have enough space + while ((cacheSize_ + size) > cacheSizeLimit_) { + removeLast(); } + Resource::CalResourceDesc* descCached = new Resource::CalResourceDesc; + if (descCached != NULL) { + // Copy the original desc to the cached version + memcpy(descCached, desc, sizeof(Resource::CalResourceDesc)); - return result; + // Add the current resource to the cache + resCache_.push_front(std::make_pair(descCached, ref)); + cacheSize_ += size; + result = true; + } + } + + return result; } -GslResourceReference* -ResourceCache::findCalResource(Resource::CalResourceDesc* desc) -{ - amd::ScopedLock l(&lockCacheOps_); - GslResourceReference* ref = NULL; - size_t size = getResourceSize(desc); - - // Early exit if resource is too big or it is for scratch buffer - if (size >= cacheSizeLimit_ || desc->skipRsrcCache_ || desc->scratch_) { - //! \note we may need to free the cache here to reduce memory pressure - return ref; - } - - // Serach the right resource through the cache list - for (const auto& it: resCache_) { - Resource::CalResourceDesc* entry = it.first; - // Find if we can reuse this entry - if ((entry->dimension_ == desc->dimension_) && - (entry->type_ == desc->type_) && - (entry->width_ == desc->width_) && - (entry->height_ == desc->height_) && - (entry->depth_ == desc->depth_) && - (entry->channelOrder_ == desc->channelOrder_) && - (entry->format_ == desc->format_) && - (entry->flags_ == desc->flags_) && - (entry->mipLevels_ == desc->mipLevels_) && - (entry->isAllocSVM_ == desc->isAllocSVM_) && - (entry->isAllocExecute_ == desc->isAllocExecute_)) { - ref = it.second; - delete it.first; - // Remove the found etry from the cache - resCache_.remove(it); - cacheSize_ -= size; - break; - } - } +GslResourceReference* ResourceCache::findCalResource(Resource::CalResourceDesc* desc) { + amd::ScopedLock l(&lockCacheOps_); + GslResourceReference* ref = NULL; + size_t size = getResourceSize(desc); + // Early exit if resource is too big or it is for scratch buffer + if 
(size >= cacheSizeLimit_ || desc->skipRsrcCache_ || desc->scratch_) { + //! \note we may need to free the cache here to reduce memory pressure return ref; -} + } -bool -ResourceCache::free(size_t minCacheEntries) -{ - amd::ScopedLock l(&lockCacheOps_); - bool result = false; - - if (minCacheEntries < resCache_.size()) { - if (static_cast(cacheSize_) > 0) { - result = true; - } - // Clear the cache - while (static_cast(cacheSize_) > 0) { - removeLast(); - } - CondLog((cacheSize_ != 0), "Incorrect size for cache release!"); + // Serach the right resource through the cache list + for (const auto& it : resCache_) { + Resource::CalResourceDesc* entry = it.first; + // Find if we can reuse this entry + if ((entry->dimension_ == desc->dimension_) && (entry->type_ == desc->type_) && + (entry->width_ == desc->width_) && (entry->height_ == desc->height_) && + (entry->depth_ == desc->depth_) && (entry->channelOrder_ == desc->channelOrder_) && + (entry->format_ == desc->format_) && (entry->flags_ == desc->flags_) && + (entry->mipLevels_ == desc->mipLevels_) && (entry->isAllocSVM_ == desc->isAllocSVM_) && + (entry->isAllocExecute_ == desc->isAllocExecute_)) { + ref = it.second; + delete it.first; + // Remove the found etry from the cache + resCache_.remove(it); + cacheSize_ -= size; + break; } - return result; + } + + return ref; } -size_t -ResourceCache::getResourceSize(Resource::CalResourceDesc* desc) -{ - // Find the total amount of elements - size_t size = - desc->width_ * - ((desc->height_) ? desc->height_ : 1) * - ((desc->depth_) ? 
desc->depth_: 1); +bool ResourceCache::free(size_t minCacheEntries) { + amd::ScopedLock l(&lockCacheOps_); + bool result = false; - // Find total size in bytes - size *= static_cast(memoryFormatSize(desc->format_).size_); - - return size; + if (minCacheEntries < resCache_.size()) { + if (static_cast(cacheSize_) > 0) { + result = true; + } + // Clear the cache + while (static_cast(cacheSize_) > 0) { + removeLast(); + } + CondLog((cacheSize_ != 0), "Incorrect size for cache release!"); + } + return result; } -void -ResourceCache::removeLast() -{ - std::pair entry; - entry = resCache_.back(); - resCache_.pop_back(); +size_t ResourceCache::getResourceSize(Resource::CalResourceDesc* desc) { + // Find the total amount of elements + size_t size = + desc->width_ * ((desc->height_) ? desc->height_ : 1) * ((desc->depth_) ? desc->depth_ : 1); - size_t size = getResourceSize(entry.first); + // Find total size in bytes + size *= static_cast(memoryFormatSize(desc->format_).size_); - // Delete CalResourceDesc - delete entry.first; - - // Destroy GSL resource - entry.second->release(); - cacheSize_ -= size; + return size; } -} // namespace gpu +void ResourceCache::removeLast() { + std::pair entry; + entry = resCache_.back(); + resCache_.pop_back(); + + size_t size = getResourceSize(entry.first); + + // Delete CalResourceDesc + delete entry.first; + + // Destroy GSL resource + entry.second->release(); + cacheSize_ -= size; +} + +} // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpuresource.hpp b/rocclr/runtime/device/gpu/gpuresource.hpp index 338fa9050a..251ec2b8ee 100644 --- a/rocclr/runtime/device/gpu/gpuresource.hpp +++ b/rocclr/runtime/device/gpu/gpuresource.hpp @@ -19,483 +19,453 @@ class VirtualGPU; * @{ */ -class GslResourceReference : public amd::ReferenceCountedObject -{ -public: - //! 
Default constructor - GslResourceReference( - const Device& gpuDev, //!< GPU device object - gslMemObject gslResource, //!< CAL resource - gslMemObject gslResOriginal = NULL //!< Original CAL resource - ); +class GslResourceReference : public amd::ReferenceCountedObject { + public: + //! Default constructor + GslResourceReference(const Device& gpuDev, //!< GPU device object + gslMemObject gslResource, //!< CAL resource + gslMemObject gslResOriginal = NULL //!< Original CAL resource + ); - //! Get CAL resource - gslMemObject gslResource() const { return resource_; } + //! Get CAL resource + gslMemObject gslResource() const { return resource_; } - //! Original CAL resource - gslMemObject gslOriginal() const { return (resOriginal_ == 0) ? resource_ : resOriginal_; } + //! Original CAL resource + gslMemObject gslOriginal() const { return (resOriginal_ == 0) ? resource_ : resOriginal_; } - const Device& device_; //!< GPU device - gslMemObject resource_; //!< GSL resource object - gslMemObject resOriginal_; //!< Original resource object, NULL if no channel order - void* cpuAddress_; //!< CPU address of this memory + const Device& device_; //!< GPU device + gslMemObject resource_; //!< GSL resource object + gslMemObject resOriginal_; //!< Original resource object, NULL if no channel order + void* cpuAddress_; //!< CPU address of this memory -protected: - //! Default destructor - ~GslResourceReference(); + protected: + //! Default destructor + ~GslResourceReference(); -private: - //! Disable copy constructor - GslResourceReference(const GslResourceReference&); + private: + //! Disable copy constructor + GslResourceReference(const GslResourceReference&); - //! Disable operator= - GslResourceReference& operator=(const GslResourceReference&); + //! Disable operator= + GslResourceReference& operator=(const GslResourceReference&); }; //! 
GPU resource -class Resource : public amd::HeapObject -{ -public: - enum InteropType { - InteropTypeless = 0, - InteropVertexBuffer, - InteropIndexBuffer, - InteropRenderBuffer, - InteropTexture, - InteropTextureViewLevel, - InteropTextureViewCube, - InteropSurface - }; +class Resource : public amd::HeapObject { + public: + enum InteropType { + InteropTypeless = 0, + InteropVertexBuffer, + InteropIndexBuffer, + InteropRenderBuffer, + InteropTexture, + InteropTextureViewLevel, + InteropTextureViewCube, + InteropSurface + }; - struct CreateParams : public amd::StackObject { - amd::Memory* owner_; //!< Resource's owner - VirtualGPU* gpu_; //!< Resource won't be shared between multiple queues - CreateParams(): owner_(NULL), gpu_(NULL) {} - }; + struct CreateParams : public amd::StackObject { + amd::Memory* owner_; //!< Resource's owner + VirtualGPU* gpu_; //!< Resource won't be shared between multiple queues + CreateParams() : owner_(NULL), gpu_(NULL) {} + }; - struct PinnedParams : public CreateParams { - const amd::HostMemoryReference* hostMemRef_;//!< System memory pointer for pinning - size_t size_; //!< System memory size - }; + struct PinnedParams : public CreateParams { + const amd::HostMemoryReference* hostMemRef_; //!< System memory pointer for pinning + size_t size_; //!< System memory size + }; - struct ViewParams : public CreateParams { - size_t offset_; //!< Alias resource offset - size_t size_; //!< Alias resource size - const Resource* resource_; //!< Parent resource for the view creation - const void* memory_; - }; + struct ViewParams : public CreateParams { + size_t offset_; //!< Alias resource offset + size_t size_; //!< Alias resource size + const Resource* resource_; //!< Parent resource for the view creation + const void* memory_; + }; - struct ImageViewParams : public CreateParams { - size_t level_; //!< Image mip level for a new view - size_t layer_; //!< Image layer for a new view - const Resource* resource_; //!< Parent resource for the view 
creation - const void* memory_; - }; + struct ImageViewParams : public CreateParams { + size_t level_; //!< Image mip level for a new view + size_t layer_; //!< Image layer for a new view + const Resource* resource_; //!< Parent resource for the view creation + const void* memory_; + }; - struct ImageBufferParams : public CreateParams { - const Resource* resource_; //!< Parent resource for the image creation - const void* memory_; - }; + struct ImageBufferParams : public CreateParams { + const Resource* resource_; //!< Parent resource for the image creation + const void* memory_; + }; - struct OGLInteropParams : public CreateParams { - InteropType type_; //!< OGL resource type - CALuint handle_; //!< OGL resource handle - uint mipLevel_; //!< Texture mip level - uint layer_; //!< Texture layer - void* glPlatformContext_; - void* glDeviceContext_; - uint flags_; - }; + struct OGLInteropParams : public CreateParams { + InteropType type_; //!< OGL resource type + CALuint handle_; //!< OGL resource handle + uint mipLevel_; //!< Texture mip level + uint layer_; //!< Texture layer + void* glPlatformContext_; + void* glDeviceContext_; + uint flags_; + }; #ifdef _WIN32 - struct D3DInteropParams : public CreateParams { - InteropType type_; //!< D3D resource type - void* iDirect3D_; //!< D3D resource interface object - HANDLE handle_; //!< D3D resource handle - uint mipLevel_; //!< Texture mip level - int layer_; //!< Texture layer - uint misc; //!< miscellaneous cases + struct D3DInteropParams : public CreateParams { + InteropType type_; //!< D3D resource type + void* iDirect3D_; //!< D3D resource interface object + HANDLE handle_; //!< D3D resource handle + uint mipLevel_; //!< Texture mip level + int layer_; //!< Texture layer + uint misc; //!< miscellaneous cases + }; +#endif // _WIN32 + + //! 
Resource memory + enum MemoryType { + Empty = 0x0, //!< resource is empty + Local, //!< resource in local memory + Persistent, //!< resource in persistent memory + Remote, //!< resource in nonlocal memory + RemoteUSWC, //!< resource in nonlocal memory + Pinned, //!< resource in pinned system memory + View, //!< resource is an alias + OGLInterop, //!< resource is an OGL memory object + D3D10Interop, //!< resource is a D3D10 memory object + D3D11Interop, //!< resource is a D3D11 memory object + Heap, //!< resource is a heap + ImageView, //!< resource is a view to some image + ImageBuffer, //!< resource is an image view of a buffer + BusAddressable, //!< resource is a bus addressable memory + ExternalPhysical, //!< resource is an external physical memory + D3D9Interop, //!< resource is a D3D9 memory object + Scratch, //!< resource is scratch memory + Shader, //!< resource is a shader + }; + + //! Resource map flags + enum MapFlags { + Discard = 0x00000001, //!< discard lock + NoOverwrite = 0x00000002, //!< lock with no overwrite + ReadOnly = 0x00000004, //!< lock for read only operation + WriteOnly = 0x00000008, //!< lock for write only operation + NoWait = 0x00000010, //!< lock with no wait + }; + + //! 
CAL resource descriptor + struct CalResourceDesc : public amd::HeapObject { + MemoryType type_; //!< Memory type + size_t width_; //!< CAL resource width + size_t height_; //!< CAL resource height + size_t depth_; //!< CAL resource depth + uint mipLevels_; //!< Number of mip levels + cmSurfFmt format_; //!< GSL resource format + CALuint flags_; //!< CAL resource flags, used in creation + size_t pitch_; //!< CAL resource pitch, valid if locked + CALuint slice_; //!< CAL resource slice, valid if locked + gslChannelOrder channelOrder_; //!< GSL resource channel order + gslMemObjectAttribType dimension_; //!< GSL resource dimension + cl_mem_object_type imageType_; //!< CL image type + union { + struct { + uint dimSize_ : 2; //!< Dimension size + uint cardMemory_ : 1; //!< GSL resource is in video memory + uint imageArray_ : 1; //!< GSL resource is an array of images + uint buffer_ : 1; //!< GSL resource is a buffer + uint tiled_ : 1; //!< GSL resource is tiled + uint scratch_ : 1; //!< Scratch buffer + uint skipRsrcCache_ : 1; //!< Skip caching of a cal resource + uint isAllocSVM_ : 1; //!< SVM resource attribute + uint isAllocExecute_ : 1; //!< SVM resource allocation attribute for shader\cmdbuf + }; + uint state_; }; -#endif // _WIN32 + }; - //! 
Resource memory - enum MemoryType - { - Empty = 0x0, //!< resource is empty - Local, //!< resource in local memory - Persistent, //!< resource in persistent memory - Remote, //!< resource in nonlocal memory - RemoteUSWC, //!< resource in nonlocal memory - Pinned, //!< resource in pinned system memory - View, //!< resource is an alias - OGLInterop, //!< resource is an OGL memory object - D3D10Interop, //!< resource is a D3D10 memory object - D3D11Interop, //!< resource is a D3D11 memory object - Heap, //!< resource is a heap - ImageView, //!< resource is a view to some image - ImageBuffer, //!< resource is an image view of a buffer - BusAddressable, //!< resource is a bus addressable memory - ExternalPhysical, //!< resource is an external physical memory - D3D9Interop, //!< resource is a D3D9 memory object - Scratch, //!< resource is scratch memory - Shader, //!< resource is a shader - }; + //! Constructor of 1D Resource object + Resource(const Device& gpuDev, //!< GPU device object + size_t width, //!< resource width + cmSurfFmt format //!< resource format + ); - //! Resource map flags - enum MapFlags - { - Discard = 0x00000001, //!< discard lock - NoOverwrite = 0x00000002, //!< lock with no overwrite - ReadOnly = 0x00000004, //!< lock for read only operation - WriteOnly = 0x00000008, //!< lock for write only operation - NoWait = 0x00000010, //!< lock with no wait - }; + //! Constructor of Image Resource object + Resource(const Device& gpuDev, //!< GPU device object + size_t width, //!< resource width + size_t height, //!< resource height + size_t depth, //!< resource depth + cmSurfFmt format, //!< resource format + gslChannelOrder chOrder, //!< resource channel order + cl_mem_object_type imageType, //!< CL image type + uint mipLevels = 1 //!< Number of mip levels + ); - //! 
CAL resource descriptor - struct CalResourceDesc : public amd::HeapObject - { - MemoryType type_; //!< Memory type - size_t width_; //!< CAL resource width - size_t height_; //!< CAL resource height - size_t depth_; //!< CAL resource depth - uint mipLevels_; //!< Number of mip levels - cmSurfFmt format_; //!< GSL resource format - CALuint flags_; //!< CAL resource flags, used in creation - size_t pitch_; //!< CAL resource pitch, valid if locked - CALuint slice_; //!< CAL resource slice, valid if locked - gslChannelOrder channelOrder_; //!< GSL resource channel order - gslMemObjectAttribType dimension_; //!< GSL resource dimension - cl_mem_object_type imageType_; //!< CL image type - union { - struct { - uint dimSize_ : 2; //!< Dimension size - uint cardMemory_ : 1; //!< GSL resource is in video memory - uint imageArray_ : 1; //!< GSL resource is an array of images - uint buffer_ : 1; //!< GSL resource is a buffer - uint tiled_ : 1; //!< GSL resource is tiled - uint scratch_ : 1; //!< Scratch buffer - uint skipRsrcCache_ : 1; //!< Skip caching of a cal resource - uint isAllocSVM_ : 1; //!< SVM resource attribute - uint isAllocExecute_ : 1; //!< SVM resource allocation attribute for shader\cmdbuf - }; - uint state_; - }; - }; + //! Destructor of the resource + virtual ~Resource(); - //! Constructor of 1D Resource object - Resource( - const Device& gpuDev, //!< GPU device object - size_t width, //!< resource width - cmSurfFmt format //!< resource format - ); + /*! \brief Creates a CAL object, associated with the resource + * + * \return True if we succesfully created a CAL resource + */ + virtual bool create(MemoryType memType, //!< memory type + CreateParams* params = 0 //!< special parameters for resource allocation + ); - //! 
Constructor of Image Resource object - Resource( - const Device& gpuDev, //!< GPU device object - size_t width, //!< resource width - size_t height, //!< resource height - size_t depth, //!< resource depth - cmSurfFmt format, //!< resource format - gslChannelOrder chOrder, //!< resource channel order - cl_mem_object_type imageType, //!< CL image type - uint mipLevels = 1 //!< Number of mip levels - ); + /*! \brief Copies a subregion of memory from one resource to another + * + * This is a general copy from anything to anything (as long as it fits). + * All positions and sizes are given in bytes. Note, however, that only + * a subset of this general interface is currently implemented. + * + * \return true if successful + */ + bool partialMemCopyTo(VirtualGPU& gpu, //!< Virtual GPU device object + const amd::Coord3D& srcOrigin, //!< Origin of the source region + const amd::Coord3D& dstOrigin, //!< Origin of the destination region + const amd::Coord3D& size, //!< Size of the region to copy + Resource& dstResource, //!< Destination resource + bool enableRectCopy = false, //!< Rectangular DMA support + bool flushDMA = false, //!< Flush DMA if requested + uint bytesPerElement = 1 //!< Bytes Per Element + ) const; - //! Destructor of the resource - virtual ~Resource(); + /*! \brief Copies size/4 DWORD of memory to a surface + * + * This is a raw copy to any surface using a CP packet. + * Size needs to be atleast a DWORD or multiple + * + */ + void writeRawData(VirtualGPU& gpu, //!< Virtual GPU device object + size_t size, //!< Size in bytes of data to be copied(multiple of DWORDS) + const void* data, //!< Data to be copied + bool waitForEvent //!< Wait for event complete + ) const; - /*! \brief Creates a CAL object, associated with the resource - * - * \return True if we succesfully created a CAL resource - */ - virtual bool create( - MemoryType memType, //!< memory type - CreateParams* params = 0 //!< special parameters for resource allocation - ); + //! 
Returns the offset in GPU memory for aliases + size_t offset() const { return offset_; } - /*! \brief Copies a subregion of memory from one resource to another - * - * This is a general copy from anything to anything (as long as it fits). - * All positions and sizes are given in bytes. Note, however, that only - * a subset of this general interface is currently implemented. - * - * \return true if successful - */ - bool partialMemCopyTo( - VirtualGPU& gpu, //!< Virtual GPU device object - const amd::Coord3D& srcOrigin, //!< Origin of the source region - const amd::Coord3D& dstOrigin, //!< Origin of the destination region - const amd::Coord3D& size, //!< Size of the region to copy - Resource& dstResource, //!< Destination resource - bool enableRectCopy = false, //!< Rectangular DMA support - bool flushDMA = false, //!< Flush DMA if requested - uint bytesPerElement = 1 //!< Bytes Per Element - ) const; + //! Returns the offset in GPU heap + uint64_t hbOffset() const { return hbOffset_; } - /*! \brief Copies size/4 DWORD of memory to a surface - * - * This is a raw copy to any surface using a CP packet. - * Size needs to be atleast a DWORD or multiple - * - */ - void writeRawData( - VirtualGPU& gpu, //!< Virtual GPU device object - size_t size, //!< Size in bytes of data to be copied(multiple of DWORDS) - const void* data, //!< Data to be copied - bool waitForEvent //!< Wait for event complete - ) const; + //! Returns the pinned memory offset + uint64_t pinOffset() const { return pinOffset_; } - //! Returns the offset in GPU memory for aliases - size_t offset() const { return offset_; } + //! Returns the size in GPU heap + uint64_t hbSize() const { return hbSize_; } - //! Returns the offset in GPU heap - uint64_t hbOffset() const { return hbOffset_; } + //! Returns the GPU device that owns this resource + const Device& dev() const { return gpuDevice_; } - //! Returns the pinned memory offset - uint64_t pinOffset() const { return pinOffset_; } + //! 
Returns the CAL descriptor for resource + const CalResourceDesc* cal() const { return &cal_; } - //! Returns the size in GPU heap - uint64_t hbSize() const { return hbSize_; } + //! Returns the CAL resource handle + gslMemObject gslResource() const { return gslRef_->gslResource(); } - //! Returns the GPU device that owns this resource - const Device& dev() const { return gpuDevice_; } + //! Returns global memory offset + uint64_t vmAddress() const { return gslResource()->getSurfaceAddress(); } - //! Returns the CAL descriptor for resource - const CalResourceDesc* cal() const { return &cal_; } + //! Returns global memory offset + bool mipMapped() const { return (cal()->mipLevels_ > 1) ? true : false; } - //! Returns the CAL resource handle - gslMemObject gslResource() const { return gslRef_->gslResource(); } + //! Checks if persistent memory can have a direct map + bool isPersistentDirectMap() const; - //! Returns global memory offset - uint64_t vmAddress() const { return gslResource()->getSurfaceAddress(); } + /*! \brief Locks the resource and returns a physical pointer + * + * \note This operation stalls HW pipeline! + * + * \return Pointer to the physical memory + */ + void* map(VirtualGPU* gpu, //!< Virtual GPU device object + uint flags = 0, //!< flags for the map operation + // Optimization for multilayer map/unmap + uint startLayer = 0, //!< Start layer for multilayer map + uint numLayers = 0 //!< End layer for multilayer map + ); - //! Returns global memory offset - bool mipMapped() const { return (cal()->mipLevels_ > 1) ? true : false; } + //! Unlocks the resource if it was locked + void unmap(VirtualGPU* gpu //!< Virtual GPU device object + ); - //! Checks if persistent memory can have a direct map - bool isPersistentDirectMap() const; + //! Marks the resource as busy + void setBusy(VirtualGPU& gpu, //!< Virtual GPU device object + GpuEvent calEvent //!< CAL event + ) const; - /*! 
\brief Locks the resource and returns a physical pointer - * - * \note This operation stalls HW pipeline! - * - * \return Pointer to the physical memory - */ - void* map( - VirtualGPU* gpu, //!< Virtual GPU device object - uint flags = 0, //!< flags for the map operation - // Optimization for multilayer map/unmap - uint startLayer = 0, //!< Start layer for multilayer map - uint numLayers = 0 //!< End layer for multilayer map - ); + //! Wait for the resource + void wait(VirtualGPU& gpu, //!< Virtual GPU device object + bool waitOnBusyEngine = false //!< Wait only if engine has changed + ) const; - //! Unlocks the resource if it was locked - void unmap( - VirtualGPU* gpu //!< Virtual GPU device object - ); + //! Performs host write to the resource GPU memory + bool hostWrite(VirtualGPU* gpu, //!< Virtual GPU device object + const void* hostPtr, //!< Host pointer to the SRC data + const amd::Coord3D& origin, //!< Offsets for the update + const amd::Coord3D& size, //!< The number of bytes to write + uint flags = 0, //!< Map flags + size_t rowPitch = 0, //!< Raw data row pitch + size_t slicePitch = 0 //!< Raw data slice pitch + ); - //! Marks the resource as busy - void setBusy( - VirtualGPU& gpu, //!< Virtual GPU device object - GpuEvent calEvent //!< CAL event - ) const; + //! Performs host read from the resource GPU memory + bool hostRead(VirtualGPU* gpu, //!< Virtual GPU device object + void* hostPtr, //!< Host pointer to the DST data + const amd::Coord3D& origin, //!< Offsets for the update + const amd::Coord3D& size, //!< The number of bytes to write + size_t rowPitch = 0, //!< Raw data row pitch + size_t slicePitch = 0 //!< Raw data slice pitch + ); - //! Wait for the resource - void wait( - VirtualGPU& gpu, //!< Virtual GPU device object - bool waitOnBusyEngine = false//!< Wait only if engine has changed - ) const; + //! Warms up the rename list for this resource + void warmUpRenames(VirtualGPU& gpu); - //! 
Performs host write to the resource GPU memory - bool hostWrite( - VirtualGPU* gpu, //!< Virtual GPU device object - const void* hostPtr, //!< Host pointer to the SRC data - const amd::Coord3D& origin, //!< Offsets for the update - const amd::Coord3D& size, //!< The number of bytes to write - uint flags = 0, //!< Map flags - size_t rowPitch = 0, //!< Raw data row pitch - size_t slicePitch = 0 //!< Raw data slice pitch - ); + //! Gets the resource element size + size_t elementSize() const { return elementSize_; } - //! Performs host read from the resource GPU memory - bool hostRead( - VirtualGPU* gpu, //!< Virtual GPU device object - void* hostPtr, //!< Host pointer to the DST data - const amd::Coord3D& origin, //!< Offsets for the update - const amd::Coord3D& size, //!< The number of bytes to write - size_t rowPitch = 0, //!< Raw data row pitch - size_t slicePitch = 0 //!< Raw data slice pitch - ); + //! Get the mapped address of this resource + address data() const { return reinterpret_cast
(address_); } - //! Warms up the rename list for this resource - void warmUpRenames(VirtualGPU& gpu); + //! Frees all allocated CAL memories and resources, + //! associated with this objects. And also destroys all rename structures + //! Note: doesn't destroy the object itself + void free(); - //! Gets the resource element size - size_t elementSize() const { return elementSize_; } + //! Return memory type + MemoryType memoryType() const { return cal_.type_; } - //! Get the mapped address of this resource - address data() const { return reinterpret_cast
(address_); } + //! Retunrs true if memory type matches specified + bool isMemoryType(MemoryType memType) const; - //! Frees all allocated CAL memories and resources, - //! associated with this objects. And also destroys all rename structures - //! Note: doesn't destroy the object itself - void free(); + //! Returns TRUE if resource was allocated as cacheable + bool isCacheable() const { return (isMemoryType(Remote) || isMemoryType(Pinned)) ? true : false; } - //! Return memory type - MemoryType memoryType() const { return cal_.type_; } + bool gslGLAcquire(); + bool gslGLRelease(); - //! Retunrs true if memory type matches specified - bool isMemoryType(MemoryType memType) const; + //! Returns HW state for the resource (used for images only) + const void* hwState() const { return hwState_; } - //! Returns TRUE if resource was allocated as cacheable - bool isCacheable() const - { return (isMemoryType(Remote) || isMemoryType(Pinned)) ? true : false; } + //! Returns CPU HW SRD for the resource (used for images only) + uint64_t hwSrd() const { return hwSrd_; } - bool gslGLAcquire() ; - bool gslGLRelease() ; + protected: + size_t elementSize_; //!< Size of a single element in bytes - //! Returns HW state for the resource (used for images only) - const void* hwState() const { return hwState_; } + private: + //! Disable copy constructor + Resource(const Resource&); - //! Returns CPU HW SRD for the resource (used for images only) - uint64_t hwSrd() const { return hwSrd_; } + //! Disable operator= + Resource& operator=(const Resource&); -protected: - size_t elementSize_; //!< Size of a single element in bytes + typedef std::vector RenameList; -private: - //! Disable copy constructor - Resource(const Resource&); + //! Rename current resource + bool rename(VirtualGPU& gpu, //!< Virtual GPU device object + bool force = false //!< Force renaming + ); - //! Disable operator= - Resource& operator=(const Resource&); + //! 
Sets the rename as active + void setActiveRename(VirtualGPU& gpu, //!< Virtual GPU device object + GslResourceReference* rename //!< new active rename + ); - typedef std::vector RenameList; + //! Gets the active rename + bool getActiveRename(VirtualGPU& gpu, //!< Virtual GPU device object + GslResourceReference** rename //!< Saved active rename + ); - //! Rename current resource - bool rename( - VirtualGPU& gpu, //!< Virtual GPU device object - bool force = false //!< Force renaming - ); + /*! \brief Locks the resource with layers and returns a physical pointer + * + * \return Pointer to the physical memory + */ + void* mapLayers(VirtualGPU* gpu, //!< Virtual GPU device object + CALuint flags = 0 //!< flags for the map operation + ); - //! Sets the rename as active - void setActiveRename( - VirtualGPU& gpu, //!< Virtual GPU device object - GslResourceReference* rename //!< new active rename - ); + //! Unlocks the resource with layers if it was locked + void unmapLayers(VirtualGPU* gpu //!< Virtual GPU device object + ); - //! Gets the active rename - bool getActiveRename( - VirtualGPU& gpu, //!< Virtual GPU device object - GslResourceReference** rename //!< Saved active rename - ); + //! Calls GSL to map a resource + void* gslMap(size_t* pitch, //!< Pitch value for the image + gslMapAccessType flags, //!< Map flags + gslMemObject resource //!< GSL memory object + ) const; - /*! \brief Locks the resource with layers and returns a physical pointer - * - * \return Pointer to the physical memory - */ - void* mapLayers( - VirtualGPU* gpu, //!< Virtual GPU device object - CALuint flags = 0 //!< flags for the map operation - ); + //! Uses GSL to unmap a resource + void gslUnmap(gslMemObject resource //!< GSL memory object + ) const; - //! Unlocks the resource with layers if it was locked - void unmapLayers( - VirtualGPU* gpu //!< Virtual GPU device object - ); + //! Fress all GSL resources associated with OCL resource + void gslFree() const; - //! 
Calls GSL to map a resource - void* gslMap( - size_t* pitch, //!< Pitch value for the image - gslMapAccessType flags, //!< Map flags - gslMemObject resource //!< GSL memory object - ) const; + const Device& gpuDevice_; //!< GPU device + CalResourceDesc cal_; //!< CAL descriptor for this resource + amd::Atomic mapCount_; //!< Total number of maps + void* address_; //!< Physical address of this resource + size_t offset_; //!< Resource offset + size_t curRename_; //!< Current active rename in the list + RenameList renames_; //!< Rename resource list + GslResourceReference* gslRef_; //!< GSL resource reference + const Resource* viewOwner_; //!< GPU resource, which owns this view + uint64_t hbOffset_; //!< Offset in the heap (virtual or real) + uint64_t hbSize_; //!< Memory size + uint64_t pinOffset_; //!< Pinned memory offset + gslMemObject glInterop_; //!< Original GL interop object + void* glInteropMbRes_; //!< Mb Res handle + uint32_t glType_; //!< GL interop type + void* glPlatformContext_; + void* glDeviceContext_; - //! Uses GSL to unmap a resource - void gslUnmap( - gslMemObject resource //!< GSL memory object - ) const; + // Optimization for multilayer map/unmap + uint startLayer_; //!< Start layer for map/unmapLayer + uint numLayers_; //!< Number of layers for map/unmapLayer + CALuint mapFlags_; //!< Map flags for map/umapLayer - //! Fress all GSL resources associated with OCL resource - void gslFree() const; + //! 
@note: This field is necessary for the thread safe release only + VirtualGPU* gpu_; //!< Resource will be used only on this queue - const Device& gpuDevice_; //!< GPU device - CalResourceDesc cal_; //!< CAL descriptor for this resource - amd::Atomic mapCount_; //!< Total number of maps - void* address_; //!< Physical address of this resource - size_t offset_; //!< Resource offset - size_t curRename_; //!< Current active rename in the list - RenameList renames_; //!< Rename resource list - GslResourceReference* gslRef_; //!< GSL resource reference - const Resource* viewOwner_; //!< GPU resource, which owns this view - uint64_t hbOffset_; //!< Offset in the heap (virtual or real) - uint64_t hbSize_; //!< Memory size - uint64_t pinOffset_; //!< Pinned memory offset - gslMemObject glInterop_; //!< Original GL interop object - void* glInteropMbRes_;//!< Mb Res handle - uint32_t glType_; //!< GL interop type - void* glPlatformContext_; - void* glDeviceContext_; - - // Optimization for multilayer map/unmap - uint startLayer_; //!< Start layer for map/unmapLayer - uint numLayers_; //!< Number of layers for map/unmapLayer - CALuint mapFlags_; //!< Map flags for map/umapLayer - - //! @note: This field is necessary for the thread safe release only - VirtualGPU* gpu_; //!< Resource will be used only on this queue - - uint32_t* hwState_; //!< HW state for image object - uint64_t hwSrd_; //!< GPU pointer to HW SRD + uint32_t* hwState_; //!< HW state for image object + uint64_t hwSrd_; //!< GPU pointer to HW SRD }; -class ResourceCache : public amd::HeapObject -{ -public: - //! Default constructor - ResourceCache(size_t cacheSizeLimit) - : lockCacheOps_("CAL resource cache", true) - , cacheSize_(0) - , cacheSizeLimit_(cacheSizeLimit) - {} +class ResourceCache : public amd::HeapObject { + public: + //! Default constructor + ResourceCache(size_t cacheSizeLimit) + : lockCacheOps_("CAL resource cache", true), cacheSize_(0), cacheSizeLimit_(cacheSizeLimit) {} - //! 
Default destructor - ~ResourceCache(); + //! Default destructor + ~ResourceCache(); - //! Adds a CAL resource to the cache - bool addCalResource( - Resource::CalResourceDesc* desc, //!< CAL resource descriptor - cache key - GslResourceReference* ref //!< CAL resource reference - ); + //! Adds a CAL resource to the cache + bool addCalResource(Resource::CalResourceDesc* desc, //!< CAL resource descriptor - cache key + GslResourceReference* ref //!< CAL resource reference + ); - //! Finds a CAL resource from the cache - GslResourceReference* findCalResource( - Resource::CalResourceDesc* desc //!< CAL resource descriptor - cache key - ); + //! Finds a CAL resource from the cache + GslResourceReference* findCalResource( + Resource::CalResourceDesc* desc //!< CAL resource descriptor - cache key + ); - //! Destroys cache - bool free(size_t minCacheEntries = 0); + //! Destroys cache + bool free(size_t minCacheEntries = 0); -private: - //! Disable copy constructor - ResourceCache(const ResourceCache&); + private: + //! Disable copy constructor + ResourceCache(const ResourceCache&); - //! Disable operator= - ResourceCache& operator=(const ResourceCache&); + //! Disable operator= + ResourceCache& operator=(const ResourceCache&); - //! Gets resource size in bytes - size_t getResourceSize(Resource::CalResourceDesc* desc); + //! Gets resource size in bytes + size_t getResourceSize(Resource::CalResourceDesc* desc); - //! Removes one last entry from the cache - void removeLast(); + //! Removes one last entry from the cache + void removeLast(); - amd::Monitor lockCacheOps_; //!< Lock to serialise cache access + amd::Monitor lockCacheOps_; //!< Lock to serialise cache access - size_t cacheSize_; //!< Current cache size in bytes - size_t cacheSizeLimit_; //!< Cache size limit in bytes + size_t cacheSize_; //!< Current cache size in bytes + size_t cacheSizeLimit_; //!< Cache size limit in bytes - //! CAL resource cache - std::list > resCache_; + //! 
CAL resource cache + std::list > resCache_; }; /*@}*/} // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpusched.hpp b/rocclr/runtime/device/gpu/gpusched.hpp index 3d5bace02b..14503fd767 100644 --- a/rocclr/runtime/device/gpu/gpusched.hpp +++ b/rocclr/runtime/device/gpu/gpusched.hpp @@ -10,69 +10,70 @@ namespace gpu { //! AmdAqlWrap slot state enum AqlWrapState { - AQL_WRAP_FREE = 0, - AQL_WRAP_RESERVED, - AQL_WRAP_READY, - AQL_WRAP_MARKER, - AQL_WRAP_BUSY, - AQL_WRAP_DONE + AQL_WRAP_FREE = 0, + AQL_WRAP_RESERVED, + AQL_WRAP_READY, + AQL_WRAP_MARKER, + AQL_WRAP_BUSY, + AQL_WRAP_DONE }; struct AmdVQueueHeader { - uint32_t aql_slot_num; //!< [LRO/SRO] The total number of the AQL slots (multiple of 64). - uint32_t event_slot_num; //!< [LRO] The number of kernel events in the events buffer - uint64_t event_slot_mask; //!< [LRO] A pointer to the allocation bitmask array for the events - uint64_t event_slots; //!< [LRO] Pointer to a buffer for the events. - // Array of event_slot_num entries of AmdEvent - uint64_t aql_slot_mask; //!< [LRO/SRO]A pointer to the allocation bitmask for aql_warp slots - uint32_t command_counter; //!< [LRW] The global counter for the submitted commands into the queue - uint32_t wait_size; //!< [LRO] The wait list size (in clk_event_t) - uint32_t arg_size; //!< [LRO] The size of argument buffer (in bytes) - uint32_t mask_groups; //!< Processed mask groups by one thread - uint64_t kernel_table; //!< [LRO] Pointer to an array with all kernel objects (ulong for each entry) - uint32_t reserved[2]; //!< For the future usage + uint32_t aql_slot_num; //!< [LRO/SRO] The total number of the AQL slots (multiple of 64). + uint32_t event_slot_num; //!< [LRO] The number of kernel events in the events buffer + uint64_t event_slot_mask; //!< [LRO] A pointer to the allocation bitmask array for the events + uint64_t event_slots; //!< [LRO] Pointer to a buffer for the events. 
+ // Array of event_slot_num entries of AmdEvent + uint64_t aql_slot_mask; //!< [LRO/SRO]A pointer to the allocation bitmask for aql_warp slots + uint32_t command_counter; //!< [LRW] The global counter for the submitted commands into the queue + uint32_t wait_size; //!< [LRO] The wait list size (in clk_event_t) + uint32_t arg_size; //!< [LRO] The size of argument buffer (in bytes) + uint32_t mask_groups; //!< Processed mask groups by one thread + uint64_t + kernel_table; //!< [LRO] Pointer to an array with all kernel objects (ulong for each entry) + uint32_t reserved[2]; //!< For the future usage }; struct AmdAqlWrap { - uint32_t state; //!< [LRW/SRW] The current state of the AQL wrapper: FREE, RESERVED, READY, - // MARKER, BUSY and DONE. The block could be returned back to a free state. - uint32_t enqueue_flags; //!< [LWO/SRO] Contains the flags for the kernel execution start - uint32_t command_id; //!< [LWO/SRO] The unique command ID - uint32_t child_counter; //!< [LRW/SRW] Counter that determine the launches of child kernels. - // It’s incremented on the - // start and decremented on the finish. The parent kernel can be considered as - // done when the value is 0 and the state is DONE - uint64_t completion; //!< [LWO/SRO] CL event for the current execution (clk_event_t) - uint64_t parent_wrap; //!< [LWO/SRO] Pointer to the parent AQL wrapper (AmdAqlWrap*) - uint64_t wait_list; //!< [LRO/SRO] Pointer to an array of clk_event_t objects (64 bytes default) - uint32_t wait_num; //!< [LWO/SRO] The number of cl_event_wait objects - uint32_t reserved[5]; //!< For the future usage - hsa_kernel_dispatch_packet_t aql; //!< [LWO/SRO] AQL packet – 64 bytes AQL packet + uint32_t state; //!< [LRW/SRW] The current state of the AQL wrapper: FREE, RESERVED, READY, + // MARKER, BUSY and DONE. The block could be returned back to a free state. 
+ uint32_t enqueue_flags; //!< [LWO/SRO] Contains the flags for the kernel execution start + uint32_t command_id; //!< [LWO/SRO] The unique command ID + uint32_t child_counter; //!< [LRW/SRW] Counter that determine the launches of child kernels. + // It’s incremented on the + // start and decremented on the finish. The parent kernel can be considered as + // done when the value is 0 and the state is DONE + uint64_t completion; //!< [LWO/SRO] CL event for the current execution (clk_event_t) + uint64_t parent_wrap; //!< [LWO/SRO] Pointer to the parent AQL wrapper (AmdAqlWrap*) + uint64_t wait_list; //!< [LRO/SRO] Pointer to an array of clk_event_t objects (64 bytes default) + uint32_t wait_num; //!< [LWO/SRO] The number of cl_event_wait objects + uint32_t reserved[5]; //!< For the future usage + hsa_kernel_dispatch_packet_t aql; //!< [LWO/SRO] AQL packet – 64 bytes AQL packet }; struct AmdEvent { - uint32_t state; //!< [LRO/SRW] Event state: START, END, COMPLETE - uint32_t counter; //!< [LRW] Event retain/release counter. 0 means the event is free - uint64_t timer[3]; //!< [LRO/SWO] Timer values for profiling for each state - uint64_t captureInfo; //!< [LRW/SRO] Profiling capture info for CLK_PROFILING_COMMAND_EXEC_TIME + uint32_t state; //!< [LRO/SRW] Event state: START, END, COMPLETE + uint32_t counter; //!< [LRW] Event retain/release counter. 0 means the event is free + uint64_t timer[3]; //!< [LRO/SWO] Timer values for profiling for each state + uint64_t captureInfo; //!< [LRW/SRO] Profiling capture info for CLK_PROFILING_COMMAND_EXEC_TIME }; struct SchedulerParam { - uint32_t signal; //!< Signal to stop the child queue(address must be 16 bytes aligned) - uint32_t eng_clk; //!< Engine clock in Mhz - uint64_t hw_queue; //!< Address to HW queue - uint64_t hsa_queue; //!< Address to HSA dummy queue - uint32_t useATC; //!< GPU access to shader program by ATC. 
- uint32_t scratchSize; //!< Scratch buffer size - uint64_t scratch; //!< GPU address to the scratch buffer - uint32_t numMaxWaves; //!< The max number of possible waves - uint32_t releaseHostCP; //!< Releases CP on the host queue - uint64_t parentAQL; //!< Host parent AmdAqlWrap packet - uint32_t dedicatedQueue; //!< Scheduler uses a dedicated queue - uint32_t scratchOffset; //!< Scratch buffer offset - uint32_t reserved[2]; //!< Reserved + uint32_t signal; //!< Signal to stop the child queue(address must be 16 bytes aligned) + uint32_t eng_clk; //!< Engine clock in Mhz + uint64_t hw_queue; //!< Address to HW queue + uint64_t hsa_queue; //!< Address to HSA dummy queue + uint32_t useATC; //!< GPU access to shader program by ATC. + uint32_t scratchSize; //!< Scratch buffer size + uint64_t scratch; //!< GPU address to the scratch buffer + uint32_t numMaxWaves; //!< The max number of possible waves + uint32_t releaseHostCP; //!< Releases CP on the host queue + uint64_t parentAQL; //!< Host parent AmdAqlWrap packet + uint32_t dedicatedQueue; //!< Scheduler uses a dedicated queue + uint32_t scratchOffset; //!< Scratch buffer offset + uint32_t reserved[2]; //!< Reserved }; -} // namespace gpu +} // namespace gpu #endif diff --git a/rocclr/runtime/device/gpu/gpuschedcl.cpp b/rocclr/runtime/device/gpu/gpuschedcl.cpp index d55b533e58..1c0fc38269 100644 --- a/rocclr/runtime/device/gpu/gpuschedcl.cpp +++ b/rocclr/runtime/device/gpu/gpuschedcl.cpp @@ -292,4 +292,4 @@ scheduler( \n ); -} // namespace gpu +} // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpuscsi.cpp b/rocclr/runtime/device/gpu/gpuscsi.cpp index 1c7859f5cd..af742bb260 100644 --- a/rocclr/runtime/device/gpu/gpuscsi.cpp +++ b/rocclr/runtime/device/gpu/gpuscsi.cpp @@ -18,182 +18,177 @@ namespace gpu { -bool -NullKernel::siCreateHwInfo(const void* shader, AMUabiAddEncoding& encoding) -{ - static const uint NumSiCsInfos = (70 + 5 + 1 + 32 + 6); - CALProgramInfoEntry* newInfos; - uint i = 0; - uint infoCount = 
NumSiCsInfos; - const SC_SI_HWSHADER_CS* cShader = reinterpret_cast(shader); - newInfos = new CALProgramInfoEntry[infoCount]; - encoding.progInfos = newInfos; - if (encoding.progInfos == 0) { - infoCount = 0; - return false; - } - newInfos[i].address = AMU_ABI_USER_ELEMENT_COUNT; - newInfos[i].value = cShader->common.userElementCount; +bool NullKernel::siCreateHwInfo(const void* shader, AMUabiAddEncoding& encoding) { + static const uint NumSiCsInfos = (70 + 5 + 1 + 32 + 6); + CALProgramInfoEntry* newInfos; + uint i = 0; + uint infoCount = NumSiCsInfos; + const SC_SI_HWSHADER_CS* cShader = reinterpret_cast(shader); + newInfos = new CALProgramInfoEntry[infoCount]; + encoding.progInfos = newInfos; + if (encoding.progInfos == 0) { + infoCount = 0; + return false; + } + newInfos[i].address = AMU_ABI_USER_ELEMENT_COUNT; + newInfos[i].value = cShader->common.userElementCount; + i++; + for (unsigned int j = 0; j < cShader->common.userElementCount; j++) { + newInfos[i].address = AMU_ABI_USER_ELEMENTS_0_DWORD0 + 4 * j; + newInfos[i].value = HWSHADER_Get(cShader, common.pUserElements)[j].dataClass; i++; - for (unsigned int j = 0; j < cShader->common.userElementCount; j++) { - newInfos[i].address = AMU_ABI_USER_ELEMENTS_0_DWORD0 + 4*j; - newInfos[i].value = HWSHADER_Get(cShader, common.pUserElements)[j].dataClass; - i++; - newInfos[i].address = AMU_ABI_USER_ELEMENTS_0_DWORD1 + 4*j; - newInfos[i].value = HWSHADER_Get(cShader, common.pUserElements)[j].apiSlot; - i++; - newInfos[i].address = AMU_ABI_USER_ELEMENTS_0_DWORD2 + 4*j; - newInfos[i].value = HWSHADER_Get(cShader, common.pUserElements)[j].startUserReg; - i++; - newInfos[i].address = AMU_ABI_USER_ELEMENTS_0_DWORD3 + 4*j; - newInfos[i].value = HWSHADER_Get(cShader, common.pUserElements)[j].userRegCount; - i++; - } + newInfos[i].address = AMU_ABI_USER_ELEMENTS_0_DWORD1 + 4 * j; + newInfos[i].value = HWSHADER_Get(cShader, common.pUserElements)[j].apiSlot; + i++; + newInfos[i].address = AMU_ABI_USER_ELEMENTS_0_DWORD2 + 4 * j; 
+ newInfos[i].value = HWSHADER_Get(cShader, common.pUserElements)[j].startUserReg; + i++; + newInfos[i].address = AMU_ABI_USER_ELEMENTS_0_DWORD3 + 4 * j; + newInfos[i].value = HWSHADER_Get(cShader, common.pUserElements)[j].userRegCount; + i++; + } - newInfos[i].address = AMU_ABI_SI_NUM_VGPRS; - newInfos[i].value = cShader->common.numVgprs; - i++; - newInfos[i].address = AMU_ABI_SI_NUM_SGPRS; - newInfos[i].value = cShader->common.numSgprs; - i++; - newInfos[i].address = AMU_ABI_SI_NUM_SGPRS_AVAIL; - newInfos[i].value = SI_sgprs_avail; //512;//options.NumSGPRsAvailable; - i++; - newInfos[i].address = AMU_ABI_SI_NUM_VGPRS_AVAIL; - newInfos[i].value = SI_vgprs_avail;//options.NumVGPRsAvailable; - i++; + newInfos[i].address = AMU_ABI_SI_NUM_VGPRS; + newInfos[i].value = cShader->common.numVgprs; + i++; + newInfos[i].address = AMU_ABI_SI_NUM_SGPRS; + newInfos[i].value = cShader->common.numSgprs; + i++; + newInfos[i].address = AMU_ABI_SI_NUM_SGPRS_AVAIL; + newInfos[i].value = SI_sgprs_avail; // 512;//options.NumSGPRsAvailable; + i++; + newInfos[i].address = AMU_ABI_SI_NUM_VGPRS_AVAIL; + newInfos[i].value = SI_vgprs_avail; // options.NumVGPRsAvailable; + i++; - newInfos[i].address = AMU_ABI_SI_FLOAT_MODE; - newInfos[i].value = cShader->common.floatMode; - i++; - newInfos[i].address = AMU_ABI_SI_IEEE_MODE; - newInfos[i].value = cShader->common.bIeeeMode; + newInfos[i].address = AMU_ABI_SI_FLOAT_MODE; + newInfos[i].value = cShader->common.floatMode; + i++; + newInfos[i].address = AMU_ABI_SI_IEEE_MODE; + newInfos[i].value = cShader->common.bIeeeMode; + i++; + + newInfos[i].address = AMU_ABI_SI_SCRATCH_SIZE; + newInfos[i].value = cShader->common.scratchSize; + ; + i++; + + newInfos[i].address = mmCOMPUTE_PGM_RSRC2; + newInfos[i].value = cShader->computePgmRsrc2.u32All; + i++; + + newInfos[i].address = AMU_ABI_NUM_THREAD_PER_GROUP_X; + newInfos[i].value = cShader->numThreadX; + i++; + newInfos[i].address = AMU_ABI_NUM_THREAD_PER_GROUP_Y; + newInfos[i].value = 
cShader->numThreadY; + i++; + newInfos[i].address = AMU_ABI_NUM_THREAD_PER_GROUP_Z; + newInfos[i].value = cShader->numThreadZ; + i++; + + newInfos[i].address = AMU_ABI_ORDERED_APPEND_ENABLE; + newInfos[i].value = cShader->bOrderedAppendEnable; + i++; + + newInfos[i].address = AMU_ABI_RAT_OP_IS_USED; + newInfos[i].value = cShader->common.uavResourceUsage[0]; + i++; + + for (unsigned int j = 0; j < ((SC_MAX_UAV + 31) / 32); j++) { + newInfos[i].address = AMU_ABI_UAV_RESOURCE_MASK_0 + j; + newInfos[i].value = cShader->common.uavResourceUsage[j]; i++; + } - newInfos[i].address = AMU_ABI_SI_SCRATCH_SIZE; - newInfos[i].value = cShader->common.scratchSize;; - i++; + newInfos[i].address = AMU_ABI_NUM_WAVEFRONT_PER_SIMD; // Setting the same as for scWrapR800Info + newInfos[i].value = 1; + i++; - newInfos[i].address = mmCOMPUTE_PGM_RSRC2; - newInfos[i].value = cShader->computePgmRsrc2.u32All; - i++; + newInfos[i].address = AMU_ABI_WAVEFRONT_SIZE; + newInfos[i].value = nullDev().hwInfo()->simdWidth_ * 4; // options.WavefrontSize; + i++; - newInfos[i].address = AMU_ABI_NUM_THREAD_PER_GROUP_X; - newInfos[i].value = cShader->numThreadX; - i++; - newInfos[i].address = AMU_ABI_NUM_THREAD_PER_GROUP_Y; - newInfos[i].value = cShader->numThreadY; - i++; - newInfos[i].address = AMU_ABI_NUM_THREAD_PER_GROUP_Z; - newInfos[i].value = cShader->numThreadZ; - i++; + newInfos[i].address = AMU_ABI_LDS_SIZE_AVAIL; + newInfos[i].value = SI_ldssize_avail; // options.LDSSize; + i++; - newInfos[i].address = AMU_ABI_ORDERED_APPEND_ENABLE; - newInfos[i].value = cShader->bOrderedAppendEnable; - i++; + COMPUTE_PGM_RSRC2 computePgmRsrc2; + computePgmRsrc2.u32All = cShader->computePgmRsrc2.u32All; - newInfos[i].address = AMU_ABI_RAT_OP_IS_USED; - newInfos[i].value = cShader->common.uavResourceUsage[0]; - i++; + newInfos[i].address = AMU_ABI_LDS_SIZE_USED; + newInfos[i].value = 64 * 4 * computePgmRsrc2.bits.LDS_SIZE; + i++; - for (unsigned int j = 0; j < ((SC_MAX_UAV + 31) / 32); j++) { - 
newInfos[i].address = AMU_ABI_UAV_RESOURCE_MASK_0 + j; - newInfos[i].value = cShader->common.uavResourceUsage[j]; - i++; - } + infoCount = i; + assert((i + 4 * (16 - cShader->common.userElementCount)) == NumSiCsInfos); + encoding.progInfosCount = infoCount; - newInfos[i].address = AMU_ABI_NUM_WAVEFRONT_PER_SIMD; // Setting the same as for scWrapR800Info - newInfos[i].value = 1; - i++; + encoding.textData = HWSHADER_Get(cShader, common.hShaderMemHandle); + encoding.textSize = cShader->common.codeLenInByte; + instructionCnt_ = encoding.textSize / sizeof(uint32_t); + encoding.scratchRegisterCount = cShader->common.scratchSize; + encoding.UAVReturnBufferTotalSize = 0; - newInfos[i].address = AMU_ABI_WAVEFRONT_SIZE; - newInfos[i].value = nullDev().hwInfo()->simdWidth_ * 4; //options.WavefrontSize; - i++; - - newInfos[i].address = AMU_ABI_LDS_SIZE_AVAIL; - newInfos[i].value = SI_ldssize_avail; //options.LDSSize; - i++; - - COMPUTE_PGM_RSRC2 computePgmRsrc2; - computePgmRsrc2.u32All = cShader->computePgmRsrc2.u32All; - - newInfos[i].address = AMU_ABI_LDS_SIZE_USED; - newInfos[i].value = 64 * 4 * computePgmRsrc2.bits.LDS_SIZE; - i++; - - infoCount = i; - assert((i + 4 * (16 - cShader->common.userElementCount)) == NumSiCsInfos); - encoding.progInfosCount = infoCount; - - encoding.textData = HWSHADER_Get(cShader, common.hShaderMemHandle); - encoding.textSize = cShader->common.codeLenInByte; - instructionCnt_ = encoding.textSize / sizeof(uint32_t); - encoding.scratchRegisterCount = cShader->common.scratchSize; - encoding.UAVReturnBufferTotalSize = 0; - - return true; + return true; } -bool -HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol *sym) -{ - if (!sym) { - return false; - } - uint64_t akc_addr = 0; - if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast(&akc_addr))) { - return false; - } - amd_kernel_code_t *akc = reinterpret_cast(akc_addr); - cpuAqlCode_ = akc; - if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE, 
reinterpret_cast(&codeSize_))) { - return false; - } - size_t akc_align = 0; - if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN, reinterpret_cast(&akc_align))) { - return false; - } +bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) { + if (!sym) { + return false; + } + uint64_t akc_addr = 0; + if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast(&akc_addr))) { + return false; + } + amd_kernel_code_t* akc = reinterpret_cast(akc_addr); + cpuAqlCode_ = akc; + if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE, + reinterpret_cast(&codeSize_))) { + return false; + } + size_t akc_align = 0; + if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN, + reinterpret_cast(&akc_align))) { + return false; + } - // Allocate HW resources for the real program only - if (!prog().isNull()) { - code_ = new gpu::Memory(dev(), amd::alignUp(codeSize_, akc_align)); - // Initialize kernel ISA code - if (code_ && code_->create(Resource::Shader)) { - address cpuCodePtr = static_cast
(code_->map(NULL, Resource::WriteOnly)); - // Copy only amd_kernel_code_t - memcpy(cpuCodePtr, reinterpret_cast
(akc), codeSize_); - code_->unmap(NULL); - } - else { - LogError("Failed to allocate ISA code!"); - return false; - } + // Allocate HW resources for the real program only + if (!prog().isNull()) { + code_ = new gpu::Memory(dev(), amd::alignUp(codeSize_, akc_align)); + // Initialize kernel ISA code + if (code_ && code_->create(Resource::Shader)) { + address cpuCodePtr = static_cast
(code_->map(NULL, Resource::WriteOnly)); + // Copy only amd_kernel_code_t + memcpy(cpuCodePtr, reinterpret_cast
(akc), codeSize_); + code_->unmap(NULL); + } else { + LogError("Failed to allocate ISA code!"); + return false; } + } - assert((akc->workitem_private_segment_byte_size & 3) == 0 && - "Scratch must be DWORD aligned"); - workGroupInfo_.scratchRegs_ = - amd::alignUp(akc->workitem_private_segment_byte_size, 16) / sizeof(uint); - workGroupInfo_.privateMemSize_ = akc->workitem_private_segment_byte_size; - workGroupInfo_.availableLDSSize_ = dev().info().localMemSize_; - workGroupInfo_.localMemSize_ = - workGroupInfo_.usedLDSSize_ = akc->workgroup_group_segment_byte_size; - workGroupInfo_.usedSGPRs_ = akc->wavefront_sgpr_count; - workGroupInfo_.usedStackSize_ = 0; - workGroupInfo_.usedVGPRs_ = akc->workitem_vgpr_count; + assert((akc->workitem_private_segment_byte_size & 3) == 0 && "Scratch must be DWORD aligned"); + workGroupInfo_.scratchRegs_ = + amd::alignUp(akc->workitem_private_segment_byte_size, 16) / sizeof(uint); + workGroupInfo_.privateMemSize_ = akc->workitem_private_segment_byte_size; + workGroupInfo_.availableLDSSize_ = dev().info().localMemSize_; + workGroupInfo_.localMemSize_ = workGroupInfo_.usedLDSSize_ = + akc->workgroup_group_segment_byte_size; + workGroupInfo_.usedSGPRs_ = akc->wavefront_sgpr_count; + workGroupInfo_.usedStackSize_ = 0; + workGroupInfo_.usedVGPRs_ = akc->workitem_vgpr_count; - if (!prog().isNull()) { - workGroupInfo_.availableSGPRs_ = dev().gslCtx()->getNumSGPRsAvailable(); - workGroupInfo_.availableVGPRs_ = dev().gslCtx()->getNumVGPRsAvailable(); - workGroupInfo_.preferredSizeMultiple_ = dev().getAttribs().wavefrontSize; - workGroupInfo_.wavefrontPerSIMD_ = dev().getAttribs().wavefrontSize; - } - else { - workGroupInfo_.availableSGPRs_ = 104; - workGroupInfo_.availableVGPRs_ = 256; - workGroupInfo_.preferredSizeMultiple_ = - workGroupInfo_.wavefrontPerSIMD_ = 64; - } - return true; + if (!prog().isNull()) { + workGroupInfo_.availableSGPRs_ = dev().gslCtx()->getNumSGPRsAvailable(); + workGroupInfo_.availableVGPRs_ = 
dev().gslCtx()->getNumVGPRsAvailable(); + workGroupInfo_.preferredSizeMultiple_ = dev().getAttribs().wavefrontSize; + workGroupInfo_.wavefrontPerSIMD_ = dev().getAttribs().wavefrontSize; + } else { + workGroupInfo_.availableSGPRs_ = 104; + workGroupInfo_.availableVGPRs_ = 256; + workGroupInfo_.preferredSizeMultiple_ = workGroupInfo_.wavefrontPerSIMD_ = 64; + } + return true; } -} // namespace gpu +} // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpusettings.cpp b/rocclr/runtime/device/gpu/gpusettings.cpp index b303f45092..3c57d08165 100644 --- a/rocclr/runtime/device/gpu/gpusettings.cpp +++ b/rocclr/runtime/device/gpu/gpusettings.cpp @@ -21,174 +21,167 @@ namespace gpu { * This structure contains the time and OS minor version for max workload time * adjustment for Windows 7 or 8. */ -struct ModifyMaxWorkload -{ - uint32_t time; //!< max work load time (10x ms) - uint32_t minorVersion; //!< OS minor version +struct ModifyMaxWorkload { + uint32_t time; //!< max work load time (10x ms) + uint32_t minorVersion; //!< OS minor version #if defined(_WIN32) - BYTE comparisonOps; //!< Comparison option + BYTE comparisonOps; //!< Comparison option #endif }; -Settings::Settings() -{ - // Initialize the GPU device default settings - oclVersion_ = OpenCL12; - debugFlags_ = 0; - syncObject_ = GPU_USE_SYNC_OBJECTS; - remoteAlloc_ = REMOTE_ALLOC; +Settings::Settings() { + // Initialize the GPU device default settings + oclVersion_ = OpenCL12; + debugFlags_ = 0; + syncObject_ = GPU_USE_SYNC_OBJECTS; + remoteAlloc_ = REMOTE_ALLOC; - stagedXferRead_ = true; - stagedXferWrite_ = true; - stagedXferSize_ = GPU_STAGING_BUFFER_SIZE * Ki; + stagedXferRead_ = true; + stagedXferWrite_ = true; + stagedXferSize_ = GPU_STAGING_BUFFER_SIZE * Ki; - // We will enable staged read/write if we use local memory - disablePersistent_ = false; + // We will enable staged read/write if we use local memory + disablePersistent_ = false; - // By Default persistent writes will be disabled. 
- stagingWritePersistent_ = GPU_STAGING_WRITE_PERSISTENT; + // By Default persistent writes will be disabled. + stagingWritePersistent_ = GPU_STAGING_WRITE_PERSISTENT; - maxRenames_ = 16; - maxRenameSize_ = 4 * Mi; + maxRenames_ = 16; + maxRenameSize_ = 4 * Mi; - imageSupport_ = false; - hwLDSSize_ = 0; + imageSupport_ = false; + hwLDSSize_ = 0; - // Set this to true when we drop the flag - doublePrecision_ = ::CL_KHR_FP64; + // Set this to true when we drop the flag + doublePrecision_ = ::CL_KHR_FP64; - // Fill workgroup info size - // @todo: revisit the 256 limitation on workgroup size - maxWorkGroupSize_ = 256; + // Fill workgroup info size + // @todo: revisit the 256 limitation on workgroup size + maxWorkGroupSize_ = 256; - hostMemDirectAccess_ = HostMemDisable; + hostMemDirectAccess_ = HostMemDisable; - libSelector_ = amd::LibraryUndefined; + libSelector_ = amd::LibraryUndefined; - // Enable workload split by default (for 24 bit arithmetic or timeout) - workloadSplitSize_ = 1 << GPU_WORKLOAD_SPLIT; + // Enable workload split by default (for 24 bit arithmetic or timeout) + workloadSplitSize_ = 1 << GPU_WORKLOAD_SPLIT; - // By default use host blit - blitEngine_ = BlitEngineHost; - const static size_t MaxPinnedXferSize = 32; - pinnedXferSize_ = std::min(GPU_PINNED_XFER_SIZE, MaxPinnedXferSize) * Mi; - pinnedMinXferSize_ = std::min(GPU_PINNED_MIN_XFER_SIZE * Ki, pinnedXferSize_); + // By default use host blit + blitEngine_ = BlitEngineHost; + const static size_t MaxPinnedXferSize = 32; + pinnedXferSize_ = std::min(GPU_PINNED_XFER_SIZE, MaxPinnedXferSize) * Mi; + pinnedMinXferSize_ = std::min(GPU_PINNED_MIN_XFER_SIZE * Ki, pinnedXferSize_); - // Disable FP_FAST_FMA defines by default - reportFMAF_ = false; - reportFMA_ = false; + // Disable FP_FAST_FMA defines by default + reportFMAF_ = false; + reportFMA_ = false; - // Disable async memory transfers by default - asyncMemCopy_ = false; + // Disable async memory transfers by default + asyncMemCopy_ = false; - // 
GPU device by default - apuSystem_ = false; + // GPU device by default + apuSystem_ = false; - // Disable 64 bit pointers support by default - use64BitPtr_ = false; + // Disable 64 bit pointers support by default + use64BitPtr_ = false; - // Max alloc size is 16GB - maxAllocSize_ = 16 * static_cast(Gi); + // Max alloc size is 16GB + maxAllocSize_ = 16 * static_cast(Gi); - // Disable memory dependency tracking by default - numMemDependencies_ = 0; + // Disable memory dependency tracking by default + numMemDependencies_ = 0; - // By default cache isn't present - cacheLineSize_ = 0; - cacheSize_ = 0; + // By default cache isn't present + cacheLineSize_ = 0; + cacheSize_ = 0; - // Initialize transfer buffer size to 1MB by default - xferBufSize_ = 1024 * Ki; + // Initialize transfer buffer size to 1MB by default + xferBufSize_ = 1024 * Ki; - // Use image DMA if requested - imageDMA_ = GPU_IMAGE_DMA; + // Use image DMA if requested + imageDMA_ = GPU_IMAGE_DMA; - // Disable ASIC specific features by default - ciPlus_ = false; - viPlus_ = false; - aiPlus_ = false; + // Disable ASIC specific features by default + ciPlus_ = false; + viPlus_ = false; + aiPlus_ = false; - // Number of compute rings. - numComputeRings_ = 0; + // Number of compute rings. 
+ numComputeRings_ = 0; - minWorkloadTime_ = 100; // 0.1 ms - maxWorkloadTime_ = 500000; // 500 ms + minWorkloadTime_ = 100; // 0.1 ms + maxWorkloadTime_ = 500000; // 500 ms - // Controls tiled images in persistent - //!@note IOL for Linux doesn't setup tiling aperture in CMM/QS - linearPersistentImage_ = false; + // Controls tiled images in persistent + //!@note IOL for Linux doesn't setup tiling aperture in CMM/QS + linearPersistentImage_ = false; - useSingleScratch_ = GPU_USE_SINGLE_SCRATCH; + useSingleScratch_ = GPU_USE_SINGLE_SCRATCH; - // SDMA profiling is disabled by default - sdmaProfiling_ = false; + // SDMA profiling is disabled by default + sdmaProfiling_ = false; - // Device enqueuing settings - numDeviceEvents_ = 1024; - numWaitEvents_ = 8; + // Device enqueuing settings + numDeviceEvents_ = 1024; + numWaitEvents_ = 8; - // Disable HSAIL by default - hsail_ = false; + // Disable HSAIL by default + hsail_ = false; - // Don't support platform atomics by default. - svmAtomics_ = false; + // Don't support platform atomics by default. 
+ svmAtomics_ = false; - // Use host queue for device enqueuing by default - useDeviceQueue_ = GPU_USE_DEVICE_QUEUE; + // Use host queue for device enqueuing by default + useDeviceQueue_ = GPU_USE_DEVICE_QUEUE; - // Don't support Denormals for single precision by default - singleFpDenorm_ = false; + // Don't support Denormals for single precision by default + singleFpDenorm_ = false; } -bool -Settings::create( - const CALdeviceattribs& calAttr - , bool reportAsOCL12Device - , bool smallMemSystem -) -{ - CALuint target = calAttr.target; - uint32_t osVer = 0x0; +bool Settings::create(const CALdeviceattribs& calAttr, bool reportAsOCL12Device, + bool smallMemSystem) { + CALuint target = calAttr.target; + uint32_t osVer = 0x0; - // Disable thread trace by default for all devices - threadTraceEnable_ = false; + // Disable thread trace by default for all devices + threadTraceEnable_ = false; - if (calAttr.doublePrecision) { - // Report FP_FAST_FMA define if double precision HW - reportFMA_ = true; - // FMA is 1/4 speed on Pitcairn, Cape Verde, Devastator and Scrapper - // Bonaire, Kalindi, Spectre and Spooky so disable - // FP_FMA_FMAF for those parts in switch below - reportFMAF_ = true; - } + if (calAttr.doublePrecision) { + // Report FP_FAST_FMA define if double precision HW + reportFMA_ = true; + // FMA is 1/4 speed on Pitcairn, Cape Verde, Devastator and Scrapper + // Bonaire, Kalindi, Spectre and Spooky so disable + // FP_FMA_FMAF for those parts in switch below + reportFMAF_ = true; + } - // Update GPU specific settings and info structure if we have any - ModifyMaxWorkload modifyMaxWorkload = {0}; + // Update GPU specific settings and info structure if we have any + ModifyMaxWorkload modifyMaxWorkload = {0}; - switch (target) { + switch (target) { case CAL_TARGET_RAVEN: - // APU systems for AI - apuSystem_ = true; + // APU systems for AI + apuSystem_ = true; case CAL_TARGET_GREENLAND: - //TODO: specific codes for AI - aiPlus_ = true; - // Fall through to VI ... 
+ // TODO: specific codes for AI + aiPlus_ = true; + // Fall through to VI ... case CAL_TARGET_STONEY: - if (!aiPlus_) { - // Fix BSOD/TDR issues observed on Stoney Win7/8.1/10 - minWorkloadTime_ = 1000; - modifyMaxWorkload.time = 1000; // Decided by experiment - modifyMaxWorkload.minorVersion = 1; // Win 7 + if (!aiPlus_) { + // Fix BSOD/TDR issues observed on Stoney Win7/8.1/10 + minWorkloadTime_ = 1000; + modifyMaxWorkload.time = 1000; // Decided by experiment + modifyMaxWorkload.minorVersion = 1; // Win 7 #if defined(_WIN32) - modifyMaxWorkload.comparisonOps = VER_EQUAL; // Limit to Win 7 only + modifyMaxWorkload.comparisonOps = VER_EQUAL; // Limit to Win 7 only #endif - } + } case CAL_TARGET_CARRIZO: - if (!aiPlus_) { - // APU systems for VI - apuSystem_ = true; - } + if (!aiPlus_) { + // APU systems for VI + apuSystem_ = true; + } case CAL_TARGET_ICELAND: case CAL_TARGET_TONGA: case CAL_TARGET_FIJI: @@ -196,300 +189,291 @@ Settings::create( case CAL_TARGET_BAFFIN: case CAL_TARGET_LEXA: case CAL_TARGET_POLARIS22: - // Disable tiling aperture on VI+ - linearPersistentImage_ = true; - // Keep this false even though we have support - // singleFpDenorm_ = true; - viPlus_ = true; - enableExtension(ClKhrFp16); - // Fall through to CI ... + // Disable tiling aperture on VI+ + linearPersistentImage_ = true; + // Keep this false even though we have support + // singleFpDenorm_ = true; + viPlus_ = true; + enableExtension(ClKhrFp16); + // Fall through to CI ... 
case CAL_TARGET_KALINDI: case CAL_TARGET_SPECTRE: case CAL_TARGET_SPOOKY: case CAL_TARGET_GODAVARI: - if (!viPlus_) { - // APU systems for CI - apuSystem_ = true; - // Fix BSOD/TDR issues observed on Kaveri Win7 (EPR#416903) - modifyMaxWorkload.time = 250000; // 250ms - modifyMaxWorkload.minorVersion = 1; // Win 7 + if (!viPlus_) { + // APU systems for CI + apuSystem_ = true; + // Fix BSOD/TDR issues observed on Kaveri Win7 (EPR#416903) + modifyMaxWorkload.time = 250000; // 250ms + modifyMaxWorkload.minorVersion = 1; // Win 7 #if defined(_WIN32) - modifyMaxWorkload.comparisonOps = VER_EQUAL; // limit to Win 7 + modifyMaxWorkload.comparisonOps = VER_EQUAL; // limit to Win 7 #endif - } - // Fall through ... + } + // Fall through ... case CAL_TARGET_BONAIRE: case CAL_TARGET_HAWAII: - ciPlus_ = true; - sdmaProfiling_ = true; - hsail_ = GPU_HSAIL_ENABLE; - threadTraceEnable_ = AMD_THREAD_TRACE_ENABLE; - // Fall through to SI ... + ciPlus_ = true; + sdmaProfiling_ = true; + hsail_ = GPU_HSAIL_ENABLE; + threadTraceEnable_ = AMD_THREAD_TRACE_ENABLE; + // Fall through to SI ... case CAL_TARGET_PITCAIRN: case CAL_TARGET_CAPEVERDE: case CAL_TARGET_OLAND: case CAL_TARGET_HAINAN: - reportFMAF_ = false; - if (target == CAL_TARGET_HAWAII) { - reportFMAF_ = true; - } - // Fall through ... + reportFMAF_ = false; + if (target == CAL_TARGET_HAWAII) { + reportFMAF_ = true; + } + // Fall through ... case CAL_TARGET_TAHITI: - // Cache line size is 64 bytes - cacheLineSize_ = 64; - // L1 cache size is 16KB - cacheSize_ = 16 * Ki; + // Cache line size is 64 bytes + cacheLineSize_ = 64; + // L1 cache size is 16KB + cacheSize_ = 16 * Ki; - if (ciPlus_) { - libSelector_ = amd::GPU_Library_CI; - if (LP64_SWITCH(false, true)) { - oclVersion_ = !reportAsOCL12Device && calAttr.isOpenCL200Device ? - XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) : OpenCL12; - } - if (smallMemSystem) { //force the dGPU to be 1.2 device for small memory system. 
- if (apuSystem_) { - return false; - } - else { - oclVersion_ = OpenCL12; - } - } - if (GPU_FORCE_OCL20_32BIT) { - force32BitOcl20_ = true; - oclVersion_ = !reportAsOCL12Device && calAttr.isOpenCL200Device ? - XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) : OpenCL12; - } - if (OPENCL_VERSION < 200) { - oclVersion_ = OpenCL12; - } - numComputeRings_ = 8; + if (ciPlus_) { + libSelector_ = amd::GPU_Library_CI; + if (LP64_SWITCH(false, true)) { + oclVersion_ = !reportAsOCL12Device && calAttr.isOpenCL200Device + ? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) + : OpenCL12; } - else { - numComputeRings_ = 2; - libSelector_ = amd::GPU_Library_SI; + if (smallMemSystem) { // force the dGPU to be 1.2 device for small memory system. + if (apuSystem_) { + return false; + } else { + oclVersion_ = OpenCL12; + } } + if (GPU_FORCE_OCL20_32BIT) { + force32BitOcl20_ = true; + oclVersion_ = !reportAsOCL12Device && calAttr.isOpenCL200Device + ? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) + : OpenCL12; + } + if (OPENCL_VERSION < 200) { + oclVersion_ = OpenCL12; + } + numComputeRings_ = 8; + } else { + numComputeRings_ = 2; + libSelector_ = amd::GPU_Library_SI; + } - // This needs to be cleaned once 64bit addressing is stable - if (oclVersion_ < OpenCL20) { - use64BitPtr_ = flagIsDefault(GPU_FORCE_64BIT_PTR) ? LP64_SWITCH(false, - calAttr.isWorkstation || hsail_) : GPU_FORCE_64BIT_PTR; - } - else { - if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, (hsail_ - || (oclVersion_ >= OpenCL20)))) { - use64BitPtr_ = true; - } + // This needs to be cleaned once 64bit addressing is stable + if (oclVersion_ < OpenCL20) { + use64BitPtr_ = flagIsDefault(GPU_FORCE_64BIT_PTR) + ? 
LP64_SWITCH(false, calAttr.isWorkstation || hsail_) + : GPU_FORCE_64BIT_PTR; + } else { + if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, (hsail_ || (oclVersion_ >= OpenCL20)))) { + use64BitPtr_ = true; } + } - if (oclVersion_ >= OpenCL20) { - supportDepthsRGB_ = true; - } - if (use64BitPtr_) { - if ((GPU_ENABLE_LARGE_ALLOCATION) && (calAttr.isWorkstation) - && (oclVersion_ == OpenCL20)) { - maxAllocSize_ = 64ULL * Gi; - } - else { - maxAllocSize_ = 4048 * Mi; - } - } - else { - maxAllocSize_ = 3ULL * Gi; + if (oclVersion_ >= OpenCL20) { + supportDepthsRGB_ = true; + } + if (use64BitPtr_) { + if ((GPU_ENABLE_LARGE_ALLOCATION) && (calAttr.isWorkstation) && (oclVersion_ == OpenCL20)) { + maxAllocSize_ = 64ULL * Gi; + } else { + maxAllocSize_ = 4048 * Mi; } + } else { + maxAllocSize_ = 3ULL * Gi; + } - supportRA_ = false; - partialDispatch_ = GPU_PARTIAL_DISPATCH; - numMemDependencies_ = GPU_NUM_MEM_DEPENDENCY; + supportRA_ = false; + partialDispatch_ = GPU_PARTIAL_DISPATCH; + numMemDependencies_ = GPU_NUM_MEM_DEPENDENCY; - enableExtension(ClKhrInt64BaseAtomics); - enableExtension(ClKhrInt64ExtendedAtomics); - enableExtension(ClKhrImage2dFromBuffer); - break; + enableExtension(ClKhrInt64BaseAtomics); + enableExtension(ClKhrInt64ExtendedAtomics); + enableExtension(ClKhrImage2dFromBuffer); + break; default: - assert(0 && "Unknown ASIC type!"); - return false; - } + assert(0 && "Unknown ASIC type!"); + return false; + } #if defined(_WIN32) - if (modifyMaxWorkload.time > 0) { - OSVERSIONINFOEX versionInfo = { 0 }; - versionInfo.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX); - versionInfo.dwMajorVersion = 6; - versionInfo.dwMinorVersion = modifyMaxWorkload.minorVersion; + if (modifyMaxWorkload.time > 0) { + OSVERSIONINFOEX versionInfo = {0}; + versionInfo.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX); + versionInfo.dwMajorVersion = 6; + versionInfo.dwMinorVersion = modifyMaxWorkload.minorVersion; - DWORDLONG conditionMask = 0; - VER_SET_CONDITION(conditionMask, 
VER_MAJORVERSION, modifyMaxWorkload.comparisonOps); - VER_SET_CONDITION(conditionMask, VER_MINORVERSION, modifyMaxWorkload.comparisonOps); - if (VerifyVersionInfo(&versionInfo, VER_MAJORVERSION | VER_MINORVERSION, conditionMask)) { - maxWorkloadTime_ = modifyMaxWorkload.time; - } + DWORDLONG conditionMask = 0; + VER_SET_CONDITION(conditionMask, VER_MAJORVERSION, modifyMaxWorkload.comparisonOps); + VER_SET_CONDITION(conditionMask, VER_MINORVERSION, modifyMaxWorkload.comparisonOps); + if (VerifyVersionInfo(&versionInfo, VER_MAJORVERSION | VER_MINORVERSION, conditionMask)) { + maxWorkloadTime_ = modifyMaxWorkload.time; } -#endif // defined(_WIN32) + } +#endif // defined(_WIN32) - // Enable atomics support - enableExtension(ClKhrGlobalInt32BaseAtomics); - enableExtension(ClKhrGlobalInt32ExtendedAtomics); - enableExtension(ClKhrLocalInt32BaseAtomics); - enableExtension(ClKhrLocalInt32ExtendedAtomics); - enableExtension(ClKhrByteAddressableStore); - enableExtension(ClKhrGlSharing); - enableExtension(ClKhrGlEvent); - enableExtension(ClAmdMediaOps); - enableExtension(ClAmdMediaOps2); - enableExtension(ClAmdPopcnt); - enableExtension(ClKhr3DImageWrites); - enableExtension(ClAmdVec3); - enableExtension(ClAmdPrintf); - // Enable some platform extensions - enableExtension(ClAmdDeviceAttributeQuery); - enableExtension(ClKhrSpir); - enableExtension(ClAMDLiquidFlash); + // Enable atomics support + enableExtension(ClKhrGlobalInt32BaseAtomics); + enableExtension(ClKhrGlobalInt32ExtendedAtomics); + enableExtension(ClKhrLocalInt32BaseAtomics); + enableExtension(ClKhrLocalInt32ExtendedAtomics); + enableExtension(ClKhrByteAddressableStore); + enableExtension(ClKhrGlSharing); + enableExtension(ClKhrGlEvent); + enableExtension(ClAmdMediaOps); + enableExtension(ClAmdMediaOps2); + enableExtension(ClAmdPopcnt); + enableExtension(ClKhr3DImageWrites); + enableExtension(ClAmdVec3); + enableExtension(ClAmdPrintf); + // Enable some platform extensions + 
enableExtension(ClAmdDeviceAttributeQuery); + enableExtension(ClKhrSpir); + enableExtension(ClAMDLiquidFlash); - hwLDSSize_ = 32 * Ki; + hwLDSSize_ = 32 * Ki; - imageSupport_ = true; + imageSupport_ = true; - // Use kernels for blit if appropriate - blitEngine_ = BlitEngineKernel; + // Use kernels for blit if appropriate + blitEngine_ = BlitEngineKernel; - hostMemDirectAccess_ |= HostMemBuffer; - // HW doesn't support untiled image writes - // hostMemDirectAccess_ |= HostMemImage; + hostMemDirectAccess_ |= HostMemBuffer; + // HW doesn't support untiled image writes + // hostMemDirectAccess_ |= HostMemImage; - asyncMemCopy_ = true; + asyncMemCopy_ = true; - // Make sure device actually supports double precision - doublePrecision_ = (calAttr.doublePrecision) ? doublePrecision_ : false; - if (doublePrecision_) { - // Enable KHR double precision extension - enableExtension(ClKhrFp64); - } + // Make sure device actually supports double precision + doublePrecision_ = (calAttr.doublePrecision) ? doublePrecision_ : false; + if (doublePrecision_) { + // Enable KHR double precision extension + enableExtension(ClKhrFp64); + } - if (calAttr.doublePrecision) { - // Enable AMD double precision extension - doublePrecision_ = true; - enableExtension(ClAmdFp64); - } + if (calAttr.doublePrecision) { + // Enable AMD double precision extension + doublePrecision_ = true; + enableExtension(ClAmdFp64); + } - if (calAttr.totalSDIHeap > 0) { - //Enable bus addressable memory extension - enableExtension(ClAMDBusAddressableMemory); - } + if (calAttr.totalSDIHeap > 0) { + // Enable bus addressable memory extension + enableExtension(ClAMDBusAddressableMemory); + } - if (calAttr.longIdleDetect) { - // KMD is unable to detect if we map the visible memory for CPU access, so - // accessing persistent staged buffer may fail if LongIdleDetct is enabled. 
- disablePersistent_ = true; - } + if (calAttr.longIdleDetect) { + // KMD is unable to detect if we map the visible memory for CPU access, so + // accessing persistent staged buffer may fail if LongIdleDetct is enabled. + disablePersistent_ = true; + } - svmFineGrainSystem_ = calAttr.isSVMFineGrainSystem; + svmFineGrainSystem_ = calAttr.isSVMFineGrainSystem; - svmAtomics_ = (calAttr.svmAtomics || calAttr.isSVMFineGrainSystem) ? true : false; + svmAtomics_ = (calAttr.svmAtomics || calAttr.isSVMFineGrainSystem) ? true : false; #if defined(_WIN32) - enableExtension(ClKhrD3d9Sharing); - enableExtension(ClKhrD3d10Sharing); - enableExtension(ClKhrD3d11Sharing); -#endif // _WIN32 + enableExtension(ClKhrD3d9Sharing); + enableExtension(ClKhrD3d10Sharing); + enableExtension(ClKhrD3d11Sharing); +#endif // _WIN32 - // Enable some OpenCL 2.0 extensions - if (oclVersion_ >= OpenCL20) { - enableExtension(ClKhrGLDepthImages); - enableExtension(ClKhrSubGroups); - enableExtension(ClKhrDepthImages); + // Enable some OpenCL 2.0 extensions + if (oclVersion_ >= OpenCL20) { + enableExtension(ClKhrGLDepthImages); + enableExtension(ClKhrSubGroups); + enableExtension(ClKhrDepthImages); - if (GPU_MIPMAP) { - enableExtension(ClKhrMipMapImage); - enableExtension(ClKhrMipMapImageWrites); - } - - // Enable HW debug - if (GPU_ENABLE_HW_DEBUG) { - enableHwDebug_ = true; - } + if (GPU_MIPMAP) { + enableExtension(ClKhrMipMapImage); + enableExtension(ClKhrMipMapImageWrites); } - if (apuSystem_ && - ((calAttr.totalVisibleHeap + calAttr.totalInvisibleHeap) < 150)) { - remoteAlloc_ = true; + // Enable HW debug + if (GPU_ENABLE_HW_DEBUG) { + enableHwDebug_ = true; } + } - // Save resource cache size + if (apuSystem_ && ((calAttr.totalVisibleHeap + calAttr.totalInvisibleHeap) < 150)) { + remoteAlloc_ = true; + } + +// Save resource cache size #ifdef ATI_OS_LINUX - // Due to EPR#406216, set the default value for Linux for now - resourceCacheSize_ = GPU_RESOURCE_CACHE_SIZE * Mi; + // Due to EPR#406216, set 
the default value for Linux for now + resourceCacheSize_ = GPU_RESOURCE_CACHE_SIZE * Mi; #else - if (remoteAlloc_) { - resourceCacheSize_ = std::max((calAttr.uncachedRemoteRAM / 8) * Mi, - GPU_RESOURCE_CACHE_SIZE * Mi); - } - else { - resourceCacheSize_ = std::max((calAttr.localRAM / 8) * Mi, - GPU_RESOURCE_CACHE_SIZE * Mi); - } - resourceCacheSize_ = std::min(resourceCacheSize_, 512 * Mi); + if (remoteAlloc_) { + resourceCacheSize_ = + std::max((calAttr.uncachedRemoteRAM / 8) * Mi, GPU_RESOURCE_CACHE_SIZE * Mi); + } else { + resourceCacheSize_ = std::max((calAttr.localRAM / 8) * Mi, GPU_RESOURCE_CACHE_SIZE * Mi); + } + resourceCacheSize_ = std::min(resourceCacheSize_, 512 * Mi); #endif - // Override current device settings - override(); + // Override current device settings + override(); - return true; + return true; } -void -Settings::override() -{ - // Limit reported workgroup size - if (GPU_MAX_WORKGROUP_SIZE != 0) { - maxWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE; - } +void Settings::override() { + // Limit reported workgroup size + if (GPU_MAX_WORKGROUP_SIZE != 0) { + maxWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE; + } - // Override blit engine type - if (GPU_BLIT_ENGINE_TYPE != BlitEngineDefault) { - blitEngine_ = GPU_BLIT_ENGINE_TYPE; - } + // Override blit engine type + if (GPU_BLIT_ENGINE_TYPE != BlitEngineDefault) { + blitEngine_ = GPU_BLIT_ENGINE_TYPE; + } - if (!flagIsDefault(DEBUG_GPU_FLAGS)) { - debugFlags_ = DEBUG_GPU_FLAGS; - } + if (!flagIsDefault(DEBUG_GPU_FLAGS)) { + debugFlags_ = DEBUG_GPU_FLAGS; + } - // Check async memory transfer - if (!flagIsDefault(GPU_ASYNC_MEM_COPY)) { - asyncMemCopy_ = GPU_ASYNC_MEM_COPY; - } + // Check async memory transfer + if (!flagIsDefault(GPU_ASYNC_MEM_COPY)) { + asyncMemCopy_ = GPU_ASYNC_MEM_COPY; + } - if (!flagIsDefault(DEBUG_GPU_FLAGS)) { - debugFlags_ = DEBUG_GPU_FLAGS; - } + if (!flagIsDefault(DEBUG_GPU_FLAGS)) { + debugFlags_ = DEBUG_GPU_FLAGS; + } - if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) { - xferBufSize_ = 
GPU_XFER_BUFFER_SIZE * Ki; - } + if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) { + xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki; + } - if (!flagIsDefault(GPU_USE_SYNC_OBJECTS)) { - syncObject_ = GPU_USE_SYNC_OBJECTS; - } + if (!flagIsDefault(GPU_USE_SYNC_OBJECTS)) { + syncObject_ = GPU_USE_SYNC_OBJECTS; + } - if (!flagIsDefault(GPU_NUM_COMPUTE_RINGS)) { - numComputeRings_ = GPU_NUM_COMPUTE_RINGS; - } + if (!flagIsDefault(GPU_NUM_COMPUTE_RINGS)) { + numComputeRings_ = GPU_NUM_COMPUTE_RINGS; + } - if (!flagIsDefault(GPU_RESOURCE_CACHE_SIZE)) { - resourceCacheSize_ = GPU_RESOURCE_CACHE_SIZE * Mi; - } + if (!flagIsDefault(GPU_RESOURCE_CACHE_SIZE)) { + resourceCacheSize_ = GPU_RESOURCE_CACHE_SIZE * Mi; + } - if (!flagIsDefault(AMD_GPU_FORCE_SINGLE_FP_DENORM)) { - switch (AMD_GPU_FORCE_SINGLE_FP_DENORM) { - case 0: - singleFpDenorm_ = false; - break; - case 1: - singleFpDenorm_ = true; - break; - default: - break; - } + if (!flagIsDefault(AMD_GPU_FORCE_SINGLE_FP_DENORM)) { + switch (AMD_GPU_FORCE_SINGLE_FP_DENORM) { + case 0: + singleFpDenorm_ = false; + break; + case 1: + singleFpDenorm_ = true; + break; + default: + break; } + } } -} // namespace gpu +} // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpusettings.hpp b/rocclr/runtime/device/gpu/gpusettings.hpp index 632c121ff6..8bb3364890 100644 --- a/rocclr/runtime/device/gpu/gpusettings.hpp +++ b/rocclr/runtime/device/gpu/gpusettings.hpp @@ -15,111 +15,108 @@ namespace gpu { //! Device settings -class Settings : public device::Settings -{ -public: - //! Debug GPU flags - enum DebugGpuFlags - { - CheckForILSource = 0x00000001, - StubCLPrograms = 0x00000002, //!< Enables OpenCL programs stubbing - LockGlobalMemory = 0x00000004, +class Settings : public device::Settings { + public: + //! 
Debug GPU flags + enum DebugGpuFlags { + CheckForILSource = 0x00000001, + StubCLPrograms = 0x00000002, //!< Enables OpenCL programs stubbing + LockGlobalMemory = 0x00000004, + }; + + enum BlitEngineType { + BlitEngineDefault = 0x00000000, + BlitEngineHost = 0x00000001, + BlitEngineCAL = 0x00000002, + BlitEngineKernel = 0x00000003, + }; + + enum HostMemFlags { + HostMemDisable = 0x00000000, + HostMemBuffer = 0x00000001, + HostMemImage = 0x00000002, + }; + + union { + struct { + uint remoteAlloc_ : 1; //!< Allocate remote memory for the heap + uint stagedXferRead_ : 1; //!< Uses a staged buffer read + uint stagedXferWrite_ : 1; //!< Uses a staged buffer write + uint disablePersistent_ : 1; //!< Disables using persistent memory for staging + uint imageSupport_ : 1; //!< Report images support + uint doublePrecision_ : 1; //!< Enables double precision support + uint reportFMAF_ : 1; //!< Report FP_FAST_FMAF define in CL program + uint reportFMA_ : 1; //!< Report FP_FAST_FMA define in CL program + uint use64BitPtr_ : 1; //!< Use 64bit pointers on GPU + uint force32BitOcl20_ : 1; //!< Force 32bit apps to take CLANG/HSAIL path on GPU + uint imageDMA_ : 1; //!< Enable direct image DMA transfers + uint syncObject_ : 1; //!< Enable syncobject + uint ciPlus_ : 1; //!< CI and post CI features + uint viPlus_ : 1; //!< VI and post VI features + uint aiPlus_ : 1; //!< AI and post AI features + uint threadTraceEnable_ : 1; //!< Thread trace enable + uint linearPersistentImage_ : 1; //!< Allocates linear images in persistent + uint useSingleScratch_ : 1; //!< Allocates single scratch per device + uint sdmaProfiling_ : 1; //!< Enables SDMA profiling + uint hsail_ : 1; //!< Enables HSAIL compilation + uint stagingWritePersistent_ : 1; //!< Enables persistent writes + uint svmAtomics_ : 1; //!< SVM device atomics + uint svmFineGrainSystem_ : 1; //!< SVM fine grain system support + uint apuSystem_ : 1; //!< Device is APU system with shared memory + uint asyncMemCopy_ : 1; //!< Use async 
memory transfers + uint useDeviceQueue_ : 1; //!< Submit to separate device queue + uint singleFpDenorm_ : 1; //!< Support Single FP Denorm + uint reserved_ : 5; }; + uint value_; + }; - enum BlitEngineType - { - BlitEngineDefault = 0x00000000, - BlitEngineHost = 0x00000001, - BlitEngineCAL = 0x00000002, - BlitEngineKernel = 0x00000003, - }; - - enum HostMemFlags - { - HostMemDisable = 0x00000000, - HostMemBuffer = 0x00000001, - HostMemImage = 0x00000002, - }; - - union { - struct { - uint remoteAlloc_: 1; //!< Allocate remote memory for the heap - uint stagedXferRead_: 1; //!< Uses a staged buffer read - uint stagedXferWrite_: 1; //!< Uses a staged buffer write - uint disablePersistent_: 1; //!< Disables using persistent memory for staging - uint imageSupport_: 1; //!< Report images support - uint doublePrecision_: 1; //!< Enables double precision support - uint reportFMAF_: 1; //!< Report FP_FAST_FMAF define in CL program - uint reportFMA_: 1; //!< Report FP_FAST_FMA define in CL program - uint use64BitPtr_: 1; //!< Use 64bit pointers on GPU - uint force32BitOcl20_: 1; //!< Force 32bit apps to take CLANG/HSAIL path on GPU - uint imageDMA_: 1; //!< Enable direct image DMA transfers - uint syncObject_: 1; //!< Enable syncobject - uint ciPlus_: 1; //!< CI and post CI features - uint viPlus_: 1; //!< VI and post VI features - uint aiPlus_: 1; //!< AI and post AI features - uint threadTraceEnable_: 1; //!< Thread trace enable - uint linearPersistentImage_: 1; //!< Allocates linear images in persistent - uint useSingleScratch_: 1; //!< Allocates single scratch per device - uint sdmaProfiling_: 1; //!< Enables SDMA profiling - uint hsail_: 1; //!< Enables HSAIL compilation - uint stagingWritePersistent_: 1; //!< Enables persistent writes - uint svmAtomics_: 1; //!< SVM device atomics - uint svmFineGrainSystem_: 1; //!< SVM fine grain system support - uint apuSystem_: 1; //!< Device is APU system with shared memory - uint asyncMemCopy_: 1; //!< Use async memory transfers 
- uint useDeviceQueue_: 1; //!< Submit to separate device queue - uint singleFpDenorm_: 1; //!< Support Single FP Denorm - uint reserved_: 5; - }; - uint value_; - }; - - uint oclVersion_; //!< Reported OpenCL version support - uint debugFlags_; //!< Debug GPU flags - size_t stagedXferSize_; //!< Staged buffer size - uint maxRenames_; //!< Maximum number of possible renames - uint maxRenameSize_; //!< Maximum size for all renames - uint hwLDSSize_; //!< HW local data store size - uint maxWorkGroupSize_; //!< Requested workgroup size for this device - uint hostMemDirectAccess_; //!< Enables direct access to the host memory - amd::LibrarySelector libSelector_; //!< Select linking libraries for compiler - uint workloadSplitSize_; //!< Workload split size - uint minWorkloadTime_; //!< Minimal workload time in 0.1 ms - uint maxWorkloadTime_; //!< Maximum workload time in 0.1 ms - uint blitEngine_; //!< Blit engine type - size_t pinnedXferSize_; //!< Pinned buffer size for transfer - size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer - size_t resourceCacheSize_; //!< Resource cache size in MB - uint64_t maxAllocSize_; //!< Maximum single allocation size - size_t numMemDependencies_;//!< The array size for memory dependencies tracking - uint cacheLineSize_; //!< Cache line size in bytes - uint cacheSize_; //!< L1 cache size in bytes - size_t xferBufSize_; //!< Transfer buffer size for image copy optimization - uint numComputeRings_; //!< 0 - disabled, 1 , 2,.. 
- the number of compute rings - uint numDeviceEvents_; //!< The number of device events - uint numWaitEvents_; //!< The number of wait events for device enqueue + uint oclVersion_; //!< Reported OpenCL version support + uint debugFlags_; //!< Debug GPU flags + size_t stagedXferSize_; //!< Staged buffer size + uint maxRenames_; //!< Maximum number of possible renames + uint maxRenameSize_; //!< Maximum size for all renames + uint hwLDSSize_; //!< HW local data store size + uint maxWorkGroupSize_; //!< Requested workgroup size for this device + uint hostMemDirectAccess_; //!< Enables direct access to the host memory + amd::LibrarySelector libSelector_; //!< Select linking libraries for compiler + uint workloadSplitSize_; //!< Workload split size + uint minWorkloadTime_; //!< Minimal workload time in 0.1 ms + uint maxWorkloadTime_; //!< Maximum workload time in 0.1 ms + uint blitEngine_; //!< Blit engine type + size_t pinnedXferSize_; //!< Pinned buffer size for transfer + size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer + size_t resourceCacheSize_; //!< Resource cache size in MB + uint64_t maxAllocSize_; //!< Maximum single allocation size + size_t numMemDependencies_; //!< The array size for memory dependencies tracking + uint cacheLineSize_; //!< Cache line size in bytes + uint cacheSize_; //!< L1 cache size in bytes + size_t xferBufSize_; //!< Transfer buffer size for image copy optimization + uint numComputeRings_; //!< 0 - disabled, 1 , 2,.. - the number of compute rings + uint numDeviceEvents_; //!< The number of device events + uint numWaitEvents_; //!< The number of wait events for device enqueue - //! Default constructor - Settings(); + //! Default constructor + Settings(); - //! Creates settings - bool create( - const CALdeviceattribs& calAttr //!< CAL attributes structure - , bool reportAsOCL12Device = false //!< Report As OpenCL1.2 Device - , bool smallMemSystem = false //!< report the sys memory is small - ); + //! 
Creates settings + bool create(const CALdeviceattribs& calAttr //!< CAL attributes structure + , + bool reportAsOCL12Device = false //!< Report As OpenCL1.2 Device + , + bool smallMemSystem = false //!< report the sys memory is small + ); -private: - //! Disable copy constructor - Settings(const Settings&); + private: + //! Disable copy constructor + Settings(const Settings&); - //! Disable assignment - Settings& operator=(const Settings&); + //! Disable assignment + Settings& operator=(const Settings&); - //! Overrides current settings based on registry/environment - void override(); + //! Overrides current settings based on registry/environment + void override(); }; /*@}*/} // namespace gpu diff --git a/rocclr/runtime/device/gpu/gputhreadtrace.cpp b/rocclr/runtime/device/gpu/gputhreadtrace.cpp index ea006902ef..48122d6d71 100644 --- a/rocclr/runtime/device/gpu/gputhreadtrace.cpp +++ b/rocclr/runtime/device/gpu/gputhreadtrace.cpp @@ -8,60 +8,54 @@ namespace gpu { CalThreadTraceReference::~CalThreadTraceReference() { - // The thread trace object is always associated with a particular queue, - // so we have to lock just this queue - amd::ScopedLock lock(gpu_.execution()); + // The thread trace object is always associated with a particular queue, + // so we have to lock just this queue + amd::ScopedLock lock(gpu_.execution()); - if (0 != threadTrace_) { - //gpu().cs()->destroyQuery(gslThreadTrace()); - } + if (0 != threadTrace_) { + // gpu().cs()->destroyQuery(gslThreadTrace()); + } } -ThreadTrace::~ThreadTrace() -{ - if (calRef_ == NULL) { - return; - } - for(uint i = 0; i < amdThreadTraceMemObjsNum_;++i) { - threadTraceBufferObjs_[i]->attachMemObject(gpu().cs(), NULL, 0, 0, 0, i); - gpu().cs()->destroyShaderTraceBuffer(threadTraceBufferObjs_[i]); - } +ThreadTrace::~ThreadTrace() { + if (calRef_ == NULL) { + return; + } + for (uint i = 0; i < amdThreadTraceMemObjsNum_; ++i) { + threadTraceBufferObjs_[i]->attachMemObject(gpu().cs(), NULL, 0, 0, 0, i); + 
gpu().cs()->destroyShaderTraceBuffer(threadTraceBufferObjs_[i]); + } - // Release the thread trace reference object - //calRef_->release(); + // Release the thread trace reference object + // calRef_->release(); } -bool -ThreadTrace::create(CalThreadTraceReference* calRef) -{ - assert(&gpu() == &calRef->gpu()); +bool ThreadTrace::create(CalThreadTraceReference* calRef) { + assert(&gpu() == &calRef->gpu()); - calRef_ = calRef; - threadTrace_ = calRef->gslThreadTrace(); + calRef_ = calRef; + threadTrace_ = calRef->gslThreadTrace(); - return true; + return true; } -bool -ThreadTrace::info(uint infoType, uint* info, uint infoSize) const -{ - switch (infoType) { +bool ThreadTrace::info(uint infoType, uint* info, uint infoSize) const { + switch (infoType) { case CL_THREAD_TRACE_BUFFERS_SIZE: { - if (infoSize < amdThreadTraceMemObjsNum_) { - LogError("The amount of buffers should be equal to the amount of Shader Engines"); - return false; - } - else { - gslThreadTrace()->GetResultAll(gpu().cs(), info); - } - break; + if (infoSize < amdThreadTraceMemObjsNum_) { + LogError("The amount of buffers should be equal to the amount of Shader Engines"); + return false; + } else { + gslThreadTrace()->GetResultAll(gpu().cs(), info); + } + break; } default: - LogError("Wrong ThreadTrace::getInfo parameter"); - return false; - } - return true; + LogError("Wrong ThreadTrace::getInfo parameter"); + return false; + } + return true; } -} // namespace gpu +} // namespace gpu diff --git a/rocclr/runtime/device/gpu/gputhreadtrace.hpp b/rocclr/runtime/device/gpu/gputhreadtrace.hpp index 2554bbc848..5feb875e0d 100644 --- a/rocclr/runtime/device/gpu/gputhreadtrace.hpp +++ b/rocclr/runtime/device/gpu/gputhreadtrace.hpp @@ -14,125 +14,118 @@ namespace gpu { class VirtualGPU; -class CalThreadTraceReference : public amd::ReferenceCountedObject -{ -public: - //! 
Default constructor - CalThreadTraceReference( - VirtualGPU& gpu, //!< Virtual GPU device object - gslQueryObject gslThreadTrace) //!< GSL query thread trace object - : gpu_(gpu) - , threadTrace_(gslThreadTrace){} +class CalThreadTraceReference : public amd::ReferenceCountedObject { + public: + //! Default constructor + CalThreadTraceReference(VirtualGPU& gpu, //!< Virtual GPU device object + gslQueryObject gslThreadTrace) //!< GSL query thread trace object + : gpu_(gpu), + threadTrace_(gslThreadTrace) {} - //! Get GSL thread race object - gslQueryObject gslThreadTrace() const { return threadTrace_; } + //! Get GSL thread race object + gslQueryObject gslThreadTrace() const { return threadTrace_; } - //! Returns the virtual GPU device - const VirtualGPU& gpu() const { return gpu_; } + //! Returns the virtual GPU device + const VirtualGPU& gpu() const { return gpu_; } -protected: - //! Default destructor - ~CalThreadTraceReference(); + protected: + //! Default destructor + ~CalThreadTraceReference(); -private: - //! Disable copy constructor - CalThreadTraceReference(const CalThreadTraceReference&); + private: + //! Disable copy constructor + CalThreadTraceReference(const CalThreadTraceReference&); - //! Disable operator= - CalThreadTraceReference& operator=(const CalThreadTraceReference&); + //! Disable operator= + CalThreadTraceReference& operator=(const CalThreadTraceReference&); - VirtualGPU& gpu_; //!< The virtual GPU device object - gslQueryObject threadTrace_; //!< GSL thread trace query object + VirtualGPU& gpu_; //!< The virtual GPU device object + gslQueryObject threadTrace_; //!< GSL thread trace query object }; //! ThreadTrace implementation on GPU -class ThreadTrace : public device::ThreadTrace -{ -public: +class ThreadTrace : public device::ThreadTrace { + public: + //! Destructor for the GPU ThreadTrace object + virtual ~ThreadTrace(); - //! Destructor for the GPU ThreadTrace object - virtual ~ThreadTrace(); + //! 
Creates the current object + bool create(CalThreadTraceReference* calRef //!< Reference ThreadTrace + ); - //! Creates the current object - bool create( - CalThreadTraceReference* calRef //!< Reference ThreadTrace - ); + //! Returns the GPU device, associated with the current object + const Device& dev() const { return gpuDevice_; } - //! Returns the GPU device, associated with the current object - const Device& dev() const { return gpuDevice_; } + //! Returns the virtual GPU device + const VirtualGPU& gpu() const { return gpu_; } - //! Returns the virtual GPU device - const VirtualGPU& gpu() const { return gpu_; } - - //! Constructor for the GPU ThreadTrace object - ThreadTrace( - Device& device, //!< A GPU device object - VirtualGPU& gpu, //!< Virtual GPU device object - uint amdThreadTraceMemObjsNum) - : gpuDevice_(device) - , gpu_(gpu) - , calRef_(NULL) - , index_(0) - , amdThreadTraceMemObjsNum_(amdThreadTraceMemObjsNum) - { - threadTraceBufferObjs_ = new gslShaderTraceBufferObject[amdThreadTraceMemObjsNum]; - for (uint i = 0; i < amdThreadTraceMemObjsNum;++i) { - threadTraceBufferObjs_[i] = gpu.cs()->createShaderTraceBuffer(); - } + //! Constructor for the GPU ThreadTrace object + ThreadTrace(Device& device, //!< A GPU device object + VirtualGPU& gpu, //!< Virtual GPU device object + uint amdThreadTraceMemObjsNum) + : gpuDevice_(device), + gpu_(gpu), + calRef_(NULL), + index_(0), + amdThreadTraceMemObjsNum_(amdThreadTraceMemObjsNum) { + threadTraceBufferObjs_ = new gslShaderTraceBufferObject[amdThreadTraceMemObjsNum]; + for (uint i = 0; i < amdThreadTraceMemObjsNum; ++i) { + threadTraceBufferObjs_[i] = gpu.cs()->createShaderTraceBuffer(); } + } - //! Returns the specific information about the thread trace object - bool info( - uint infoType, //!< The type of returned information - uint* info, //!< The returned information - uint infoSize //!< The size of returned information - ) const; + //! 
Returns the specific information about the thread trace object + bool info(uint infoType, //!< The type of returned information + uint* info, //!< The returned information + uint infoSize //!< The size of returned information + ) const; - //! Set the ThreadTrace memory buffer size - void setMemBufferSizeTT(uint memBufferSizeTT) { memBufferSizeTT_ = memBufferSizeTT;} + //! Set the ThreadTrace memory buffer size + void setMemBufferSizeTT(uint memBufferSizeTT) { memBufferSizeTT_ = memBufferSizeTT; } - //! Set isNewBufferBinded_ to true/false if new buffer was binded/unbinded respectively - void setNewBufferBinded(bool isNewBufferBinded) { isNewBufferBinded_ = isNewBufferBinded; } + //! Set isNewBufferBinded_ to true/false if new buffer was binded/unbinded respectively + void setNewBufferBinded(bool isNewBufferBinded) { isNewBufferBinded_ = isNewBufferBinded; } - //! Attach gslMemObject to the TreadTrace buffer - void attachMemToThreadTraceBuffer(); + //! Attach gslMemObject to the TreadTrace buffer + void attachMemToThreadTraceBuffer(); - void setMemObj(size_t memObjSize,std::vector memObj) - { - memObj_ = memObj; - memBufferSizeTT_ = memObjSize; - } - //! Get GSL thread trace object - gslQueryObject gslThreadTrace() const { return threadTrace_; } + void setMemObj(size_t memObjSize, std::vector memObj) { + memObj_ = memObj; + memBufferSizeTT_ = memObjSize; + } + //! Get GSL thread trace object + gslQueryObject gslThreadTrace() const { return threadTrace_; } - //! Get GSL Thread Trace Buffer objects - gslShaderTraceBufferObject* getThreadTraceBufferObjects() {return threadTraceBufferObjs_;} -private: - //! Disable default copy constructor - ThreadTrace(const ThreadTrace&); + //! Get GSL Thread Trace Buffer objects + gslShaderTraceBufferObject* getThreadTraceBufferObjects() { return threadTraceBufferObjs_; } - //! Disable default operator= - ThreadTrace& operator=(const ThreadTrace&); + private: + //! 
Disable default copy constructor + ThreadTrace(const ThreadTrace&); - //! Retrieve gslMemoryObject - gslMemObject getCurrentGslMemObject(amd::Memory* ); + //! Disable default operator= + ThreadTrace& operator=(const ThreadTrace&); - const Device& gpuDevice_; //!< The backend device + //! Retrieve gslMemoryObject + gslMemObject getCurrentGslMemObject(amd::Memory*); - VirtualGPU& gpu_; //!< The virtual GPU device object + const Device& gpuDevice_; //!< The backend device - CalThreadTraceReference* calRef_; //!< Reference ThreadTrace - gslShaderTraceBufferObject* threadTraceBufferObjs_; //!< The buffer object for Thread Trace recording - uint index_; //!< ThreadTrace index in the CAL container - uint memBufferSizeTT_; //!< ThreadTrace memory buffer size - std::vector memObj_; //!< ThreadTrace memory object - gslQueryObject threadTrace_; //!< GSL thread trace query object - uint amdThreadTraceMemObjsNum_; //!< ThreadTrace memory object`s number (should be equal to the SE number) - bool isNewBufferBinded_; //!< The indicator if new buffer was binded to the ThreadTrace object - bool isBufferOnSubmit_; //!< The indicator if "new buffer on submit" mode is used + VirtualGPU& gpu_; //!< The virtual GPU device object + + CalThreadTraceReference* calRef_; //!< Reference ThreadTrace + gslShaderTraceBufferObject* + threadTraceBufferObjs_; //!< The buffer object for Thread Trace recording + uint index_; //!< ThreadTrace index in the CAL container + uint memBufferSizeTT_; //!< ThreadTrace memory buffer size + std::vector memObj_; //!< ThreadTrace memory object + gslQueryObject threadTrace_; //!< GSL thread trace query object + uint amdThreadTraceMemObjsNum_; //!< ThreadTrace memory object`s number (should be equal to the + //!SE number) + bool isNewBufferBinded_; //!< The indicator if new buffer was binded to the ThreadTrace object + bool isBufferOnSubmit_; //!< The indicator if "new buffer on submit" mode is used }; -} // namespace gpu - -#endif // GPU_THREAD_TRACE_HPP_ +} // 
namespace gpu +#endif // GPU_THREAD_TRACE_HPP_ diff --git a/rocclr/runtime/device/gpu/gputimestamp.cpp b/rocclr/runtime/device/gpu/gputimestamp.cpp index 9ab4981e7d..aa45c4aea9 100644 --- a/rocclr/runtime/device/gpu/gputimestamp.cpp +++ b/rocclr/runtime/device/gpu/gputimestamp.cpp @@ -11,109 +11,86 @@ namespace gpu { -TimeStamp::TimeStamp( - const VirtualGPU& gpu, - gslMemObject gslMem, - uint memOffset, - address cpuAddr) - : gpu_(gpu) - , gslMem_(gslMem) - , memOffset_(memOffset) -{ - values_ = reinterpret_cast(cpuAddr + memOffset); +TimeStamp::TimeStamp(const VirtualGPU& gpu, gslMemObject gslMem, uint memOffset, address cpuAddr) + : gpu_(gpu), gslMem_(gslMem), memOffset_(memOffset) { + values_ = reinterpret_cast(cpuAddr + memOffset); } -TimeStamp::~TimeStamp() -{ -} +TimeStamp::~TimeStamp() {} -void -TimeStamp::begin(bool sdma) -{ - if (!flags_.beginIssued_) { - gpu().rs()->writeTimer(gpu().cs(), sdma, gslMem_, - memOffset_ + CommandStartTime * sizeof(uint64_t)); - flags_.beginIssued_ = true; - } -} - -void -TimeStamp::end(bool sdma) -{ - CondLog(!flags_.beginIssued_, "We didn't issue a begin operation!"); +void TimeStamp::begin(bool sdma) { + if (!flags_.beginIssued_) { gpu().rs()->writeTimer(gpu().cs(), sdma, gslMem_, - memOffset_ + CommandEndTime * sizeof(uint64_t)); - flags_.endIssued_ = true; - flags_.sdma_ = sdma; + memOffset_ + CommandStartTime * sizeof(uint64_t)); + flags_.beginIssued_ = true; + } } -inline void -SetValue(uint64_t* time, uint64_t val, double nanos) -{ - *time = static_cast(static_cast(val) * nanos); +void TimeStamp::end(bool sdma) { + CondLog(!flags_.beginIssued_, "We didn't issue a begin operation!"); + gpu().rs()->writeTimer(gpu().cs(), sdma, gslMem_, memOffset_ + CommandEndTime * sizeof(uint64_t)); + flags_.endIssued_ = true; + flags_.sdma_ = sdma; } -void -TimeStamp::value(uint64_t* startTime, uint64_t* endTime) -{ - CondLog(!flags_.endIssued_, "We didn't send the counter end operation!"); - const double NanoSecondsPerTick = 
gpu_.dev().getAttribs().nanoSecondsPerTick; - - SetValue(startTime, values_[CommandStartTime], NanoSecondsPerTick); - SetValue(endTime, values_[CommandEndTime], NanoSecondsPerTick); +inline void SetValue(uint64_t* time, uint64_t val, double nanos) { + *time = static_cast(static_cast(val) * nanos); } -TimeStampCache::~TimeStampCache() -{ - // Release all time stamp objects from the cache - for (uint i = 0; i < freedTS_.size(); ++i) { - delete freedTS_[i]; +void TimeStamp::value(uint64_t* startTime, uint64_t* endTime) { + CondLog(!flags_.endIssued_, "We didn't send the counter end operation!"); + const double NanoSecondsPerTick = gpu_.dev().getAttribs().nanoSecondsPerTick; + + SetValue(startTime, values_[CommandStartTime], NanoSecondsPerTick); + SetValue(endTime, values_[CommandEndTime], NanoSecondsPerTick); +} + +TimeStampCache::~TimeStampCache() { + // Release all time stamp objects from the cache + for (uint i = 0; i < freedTS_.size(); ++i) { + delete freedTS_[i]; + } + freedTS_.clear(); + + // Release all memory objects + for (uint i = 0; i < tsBuf_.size(); ++i) { + tsBuf_[i]->unmap(&gpu_); + delete tsBuf_[i]; + } + tsBuf_.clear(); +} + +TimeStamp* TimeStampCache::allocTimeStamp() { + TimeStamp* ts = NULL; + if (0 != freedTS_.size()) { + ts = freedTS_.back(); + freedTS_.pop_back(); + } + + if (NULL == ts) { + if ((tsBufCpu_ == NULL) || ((tsOffset_ + TimerSlotSize) > TimerBufSize)) { + Memory* buf = new Memory(gpu_.dev(), TimerBufSize); + if (buf == NULL || !buf->create(Resource::Remote)) { + return NULL; + } + tsBufCpu_ = reinterpret_cast
(buf->map(&gpu_)); + memset(tsBufCpu_, 0, TimerBufSize); + tsOffset_ = 0; + tsBuf_.push_back(buf); } - freedTS_.clear(); - - // Release all memory objects - for (uint i = 0; i < tsBuf_.size(); ++i) { - tsBuf_[i]->unmap(&gpu_); - delete tsBuf_[i]; + // Allocate a TimeStamp object + ts = new TimeStamp(gpu_, tsBuf_[(tsBuf_.size() - 1)]->gslResource(), tsOffset_, tsBufCpu_); + // Create a timestamp + if (ts == NULL) { + return NULL; } - tsBuf_.clear(); + tsOffset_ += TimerSlotSize; + } + // Set this timestamp into DRM profile mode if it was requested + ts->clearStates(); + + return ts; } -TimeStamp* -TimeStampCache::allocTimeStamp() -{ - TimeStamp* ts = NULL; - if (0 != freedTS_.size()) { - ts = freedTS_.back(); - freedTS_.pop_back(); - } - - if (NULL == ts) { - if ((tsBufCpu_ == NULL) || ((tsOffset_ + TimerSlotSize) > TimerBufSize)) { - Memory* buf = new Memory(gpu_.dev(), TimerBufSize); - if (buf == NULL || !buf->create(Resource::Remote)) { - return NULL; - } - tsBufCpu_ = reinterpret_cast
(buf->map(&gpu_)); - memset(tsBufCpu_, 0, TimerBufSize); - tsOffset_ = 0; - tsBuf_.push_back(buf); - } - // Allocate a TimeStamp object - ts = new TimeStamp(gpu_, tsBuf_[(tsBuf_.size() - 1)]->gslResource(), - tsOffset_, tsBufCpu_); - // Create a timestamp - if (ts == NULL) { - return NULL; - } - tsOffset_ += TimerSlotSize; - } - - // Set this timestamp into DRM profile mode if it was requested - ts->clearStates(); - - return ts; -} - -} // namespace gpu +} // namespace gpu diff --git a/rocclr/runtime/device/gpu/gputimestamp.hpp b/rocclr/runtime/device/gpu/gputimestamp.hpp index a33e3545a0..1b1a394c21 100644 --- a/rocclr/runtime/device/gpu/gputimestamp.hpp +++ b/rocclr/runtime/device/gpu/gputimestamp.hpp @@ -18,113 +18,101 @@ class Device; class VirtualGPU; class Memory; -class TimeStamp : public amd::HeapObject -{ -public: - //! Enums for the timestamp information - //! \note *4 is the limitaiton of SDMA HW - //! (address has to be aligned by 256 bit) - enum TimeStampValue { - CommandStartTime = 0, - CommandEndTime = 4, - CommandTotal = 8 +class TimeStamp : public amd::HeapObject { + public: + //! Enums for the timestamp information + //! \note *4 is the limitaiton of SDMA HW + //! (address has to be aligned by 256 bit) + enum TimeStampValue { CommandStartTime = 0, CommandEndTime = 4, CommandTotal = 8 }; + + //! The TimeStamp object flags + union Flags { + struct { + uint32_t beginIssued_ : 1; + uint32_t endIssued_ : 1; + uint32_t sdma_ : 1; }; + uint32_t value_; + Flags() : value_(0) {} + }; - //! The TimeStamp object flags - union Flags - { - struct - { - uint32_t beginIssued_ : 1; - uint32_t endIssued_ : 1; - uint32_t sdma_ : 1; - }; - uint32_t value_; - Flags(): value_(0) {} - }; + //! Default constructor + TimeStamp(const VirtualGPU& gpu, //!< Virtual GPU + gslMemObject gslMem, //!< Buffer with the timer values + uint memOffset, //!< Offset in the buffer for the current TS + address cpuAddr //!< CPU pointer for the values in memory + ); - //! 
Default constructor - TimeStamp( - const VirtualGPU& gpu, //!< Virtual GPU - gslMemObject gslMem, //!< Buffer with the timer values - uint memOffset, //!< Offset in the buffer for the current TS - address cpuAddr //!< CPU pointer for the values in memory - ); + //! Default destructor + ~TimeStamp(); - //! Default destructor - ~TimeStamp(); + //! Starts the timestamp + void begin(bool sdma = false); - //! Starts the timestamp - void begin(bool sdma = false); + //! Ends the timestamp + void end(bool sdma = false); - //! Ends the timestamp - void end(bool sdma = false); + //! Returns the timestamp result in nano seconds + void value(uint64_t* startTime, uint64_t* endTime); - //! Returns the timestamp result in nano seconds - void value(uint64_t* startTime, uint64_t* endTime); + //! Clear all TimeStamp states + void clearStates() { + flags_.value_ = 0; + values_[CommandStartTime] = 0; + values_[CommandEndTime] = 0; + } - //! Clear all TimeStamp states - void clearStates() - { flags_.value_ = 0; - values_[CommandStartTime] = 0; - values_[CommandEndTime] = 0; - } + //! Timer commands were submitted to HW + bool isValid() const { return (flags_.endIssued_) ? true : false; } - //! Timer commands were submitted to HW - bool isValid() const { return (flags_.endIssued_) ? true : false; } + private: + //! Disable copy constructor + TimeStamp(const TimeStamp&); -private: - //! Disable copy constructor - TimeStamp(const TimeStamp&); + //! Disable operator= + TimeStamp& operator=(const TimeStamp&); - //! Disable operator= - TimeStamp& operator=(const TimeStamp&); + //! Returns the GPU device object + const VirtualGPU& gpu() const { return gpu_; } - //! 
Returns the GPU device object - const VirtualGPU& gpu() const { return gpu_; } - - const VirtualGPU& gpu_; //!< Virtual GPU - Flags flags_; //!< The time stamp state - gslMemObject gslMem_; //!< Buffer with the timer values - uint memOffset_; //!< Offset in the buffer for the current timer - volatile uint64_t* values_; //!< CPU pointer to the timer values + const VirtualGPU& gpu_; //!< Virtual GPU + Flags flags_; //!< The time stamp state + gslMemObject gslMem_; //!< Buffer with the timer values + uint memOffset_; //!< Offset in the buffer for the current timer + volatile uint64_t* values_; //!< CPU pointer to the timer values }; -class TimeStampCache : public amd::HeapObject -{ -public: - //! Default constructor - TimeStampCache( - VirtualGPU& gpu //!< Virtual GPU object - ) - : gpu_(gpu) - , tsBufCpu_(NULL) - , tsOffset_(0) {} +class TimeStampCache : public amd::HeapObject { + public: + //! Default constructor + TimeStampCache(VirtualGPU& gpu //!< Virtual GPU object + ) + : gpu_(gpu), tsBufCpu_(NULL), tsOffset_(0) {} - //! Default destructor - ~TimeStampCache(); + //! Default destructor + ~TimeStampCache(); - //! Gets a time stamp object. It will find a freed object or allocate a new one - TimeStamp* allocTimeStamp(); + //! Gets a time stamp object. It will find a freed object or allocate a new one + TimeStamp* allocTimeStamp(); - //! Frees a time stamp object - void freeTimeStamp(TimeStamp* ts) { freedTS_.push_back(ts); } + //! Frees a time stamp object + void freeTimeStamp(TimeStamp* ts) { freedTS_.push_back(ts); } -private: - static const uint TimerSlotSize = TimeStamp::CommandTotal * sizeof(uint64_t); - static const uint TimerBufSize = TimerSlotSize * 4096; + private: + static const uint TimerSlotSize = TimeStamp::CommandTotal * sizeof(uint64_t); + static const uint TimerBufSize = TimerSlotSize * 4096; - //! Disable copy constructor - TimeStampCache(const TimeStampCache&); + //! Disable copy constructor + TimeStampCache(const TimeStampCache&); - //! 
Disable operator= - TimeStampCache& operator=(const TimeStampCache&); + //! Disable operator= + TimeStampCache& operator=(const TimeStampCache&); - std::vector freedTS_; //!< Array of freed time stamp objects - VirtualGPU& gpu_; //!< Virtual GPU - std::vector tsBuf_; //!< Array of memory objects with the timer value - address tsBufCpu_; //!< CPU pointer for current TS memory - uint tsOffset_; //!< Active offset in the current mem object + std::vector freedTS_; //!< Array of freed time stamp objects + VirtualGPU& gpu_; //!< Virtual GPU + std::vector tsBuf_; //!< Array of memory objects with the timer value + address tsBufCpu_; //!< CPU pointer for current TS memory + uint tsOffset_; //!< Active offset in the current mem object }; /*@}*/} // namespace gpu diff --git a/rocclr/runtime/device/gpu/gputrap.hpp b/rocclr/runtime/device/gpu/gputrap.hpp index e1eed63243..1494710819 100644 --- a/rocclr/runtime/device/gpu/gputrap.hpp +++ b/rocclr/runtime/device/gpu/gputrap.hpp @@ -116,72 +116,24 @@ end *******************************************************************************/ /// shader codes with "asic(TAHITI)" instruction -static const uint32_t RuntimeTrapCode [] = { - 0x7e008200, 0xbf8c0000, - 0xbef8036c, 0x8779ff6d, - 0x0000ffff, 0x8879ff79, - 0x01000000, 0xbefa03ff, - 0x00002000, 0xbefb03ff, - 0x00024fac, 0x80f8ff78, - 0x00000100, 0xbef70300, - 0xc2007900, 0xbf8c0000, - 0xbeee0300, 0xc2007901, - 0xbf8c0000, 0xbeef0300, - 0xbe800377, 0xbef60398, - 0x8078766e, 0x8779ff6f, - 0x0000ffff, 0x8879ff79, - 0x00680000, 0xbefa03ff, - 0x00002000, 0xbefb03ff, - 0x00024fac, 0xbef6036e, - 0xbef7036f, 0xbef30300, - 0xc2007902, 0xbf8c0000, - 0xbeee0300, 0xc2007903, - 0xbf8c0000, 0xbeef0300, - 0xc2007900, 0xbf8c0000, - 0xbef20300, 0xc2007901, - 0xbf8c0000, 0x89737300, - 0x89007300, 0x89737300, - 0xbef80372, 0xbef90373, - 0xbef21f00, 0x80728872, - 0xbe802078, 0xbeef0377, - 0xbeee0376, 0x8771ff71, - 0x0000ffff, 0xbe802270 -}; +static const uint32_t RuntimeTrapCode[] = { + 0x7e008200, 
0xbf8c0000, 0xbef8036c, 0x8779ff6d, 0x0000ffff, 0x8879ff79, 0x01000000, 0xbefa03ff, + 0x00002000, 0xbefb03ff, 0x00024fac, 0x80f8ff78, 0x00000100, 0xbef70300, 0xc2007900, 0xbf8c0000, + 0xbeee0300, 0xc2007901, 0xbf8c0000, 0xbeef0300, 0xbe800377, 0xbef60398, 0x8078766e, 0x8779ff6f, + 0x0000ffff, 0x8879ff79, 0x00680000, 0xbefa03ff, 0x00002000, 0xbefb03ff, 0x00024fac, 0xbef6036e, + 0xbef7036f, 0xbef30300, 0xc2007902, 0xbf8c0000, 0xbeee0300, 0xc2007903, 0xbf8c0000, 0xbeef0300, + 0xc2007900, 0xbf8c0000, 0xbef20300, 0xc2007901, 0xbf8c0000, 0x89737300, 0x89007300, 0x89737300, + 0xbef80372, 0xbef90373, 0xbef21f00, 0x80728872, 0xbe802078, 0xbeef0377, 0xbeee0376, 0x8771ff71, + 0x0000ffff, 0xbe802270}; /// shader codes with "asic(VI)" instruction -static const uint32_t RuntimeTrapCodeVi [] = { - 0x7e006a00, 0xbf8c0000, - 0xbef8006c, 0x8679ff6d, - 0x0000ffff, 0x8779ff79, - 0x01000000, 0xbefa00ff, - 0x00002000, 0xbefb00ff, - 0x00024fac, 0x80f8ff78, - 0x00000100, 0xbef70000, - 0xc022003c, 0x00000000, - 0xbf8c0000, 0xbeee0000, - 0xc022003c, 0x00000004, - 0xbf8c0000, 0xbeef0000, - 0xbe800077, 0xbef60098, - 0x8078766e, 0x8679ff6f, - 0x0000ffff, 0x8779ff79, - 0x00680000, 0xbefa00ff, - 0x00002000, 0xbefb00ff, - 0x00024fac, 0xbef6006e, - 0xbef7006f, 0xbef30000, - 0xc022003c, 0x00000008, - 0xbf8c0000, 0xbeee0000, - 0xc022003c, 0x0000000c, - 0xbf8c0000, 0xbeef0000, - 0xc022003c, 0x00000000, - 0xbf8c0000, 0xbef20000, - 0xc022003c, 0x00000004, - 0xbf8c0000, 0x88737300, - 0x88007300, 0x88737300, - 0xbef80072, 0xbef90073, - 0xbef21c00, 0x80728872, - 0xbe801d78, 0xbeef0077, - 0xbeee0076, 0x8671ff71, - 0x0000ffff, 0xbe801f70 -}; - +static const uint32_t RuntimeTrapCodeVi[] = { + 0x7e006a00, 0xbf8c0000, 0xbef8006c, 0x8679ff6d, 0x0000ffff, 0x8779ff79, 0x01000000, 0xbefa00ff, + 0x00002000, 0xbefb00ff, 0x00024fac, 0x80f8ff78, 0x00000100, 0xbef70000, 0xc022003c, 0x00000000, + 0xbf8c0000, 0xbeee0000, 0xc022003c, 0x00000004, 0xbf8c0000, 0xbeef0000, 0xbe800077, 0xbef60098, + 0x8078766e, 0x8679ff6f, 
0x0000ffff, 0x8779ff79, 0x00680000, 0xbefa00ff, 0x00002000, 0xbefb00ff, + 0x00024fac, 0xbef6006e, 0xbef7006f, 0xbef30000, 0xc022003c, 0x00000008, 0xbf8c0000, 0xbeee0000, + 0xc022003c, 0x0000000c, 0xbf8c0000, 0xbeef0000, 0xc022003c, 0x00000000, 0xbf8c0000, 0xbef20000, + 0xc022003c, 0x00000004, 0xbf8c0000, 0x88737300, 0x88007300, 0x88737300, 0xbef80072, 0xbef90073, + 0xbef21c00, 0x80728872, 0xbe801d78, 0xbeef0077, 0xbeee0076, 0x8671ff71, 0x0000ffff, 0xbe801f70}; diff --git a/rocclr/runtime/device/gpu/gpuvirtual.cpp b/rocclr/runtime/device/gpu/gpuvirtual.cpp index a251daf829..90c7d8c145 100644 --- a/rocclr/runtime/device/gpu/gpuvirtual.cpp +++ b/rocclr/runtime/device/gpu/gpuvirtual.cpp @@ -28,3625 +28,3247 @@ #include "amdocl/cl_d3d9_amd.hpp" #include "amdocl/cl_d3d10_amd.hpp" #include "amdocl/cl_d3d11_amd.hpp" -#endif // _WIN32 +#endif // _WIN32 namespace gpu { -bool -VirtualGPU::MemoryDependency::create(size_t numMemObj) -{ - if (numMemObj > 0) { - // Allocate the array of memory objects for dependency tracking - memObjectsInQueue_ = new MemoryState[numMemObj]; - if (NULL == memObjectsInQueue_) { - return false; - } - memset(memObjectsInQueue_, 0, sizeof(MemoryState) * numMemObj); - maxMemObjectsInQueue_ = numMemObj; +bool VirtualGPU::MemoryDependency::create(size_t numMemObj) { + if (numMemObj > 0) { + // Allocate the array of memory objects for dependency tracking + memObjectsInQueue_ = new MemoryState[numMemObj]; + if (NULL == memObjectsInQueue_) { + return false; } + memset(memObjectsInQueue_, 0, sizeof(MemoryState) * numMemObj); + maxMemObjectsInQueue_ = numMemObj; + } - return true; + return true; } -void -VirtualGPU::MemoryDependency::validate( - VirtualGPU& gpu, - const Memory* memory, - bool readOnly) -{ - bool flushL1Cache = false; +void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memory, bool readOnly) { + bool flushL1Cache = false; - if (maxMemObjectsInQueue_ == 0) { - // Flush cache - gpu.flushCUCaches(); - return; + if 
(maxMemObjectsInQueue_ == 0) { + // Flush cache + gpu.flushCUCaches(); + return; + } + + uint64_t curStart = memory->hbOffset(); + uint64_t curEnd = curStart + memory->hbSize(); + + // Loop through all memory objects in the queue and find dependency + // @note don't include objects from the current kernel + for (size_t j = 0; j < endMemObjectsInQueue_; ++j) { + // Check if the queue already contains this mem object and + // GPU operations aren't readonly + uint64_t busyStart = memObjectsInQueue_[j].start_; + uint64_t busyEnd = memObjectsInQueue_[j].end_; + + // Check if the start inside the busy region + if ((((curStart >= busyStart) && (curStart < busyEnd)) || + // Check if the end inside the busy region + ((curEnd > busyStart) && (curEnd <= busyEnd)) || + // Check if the start/end cover the busy region + ((curStart <= busyStart) && (curEnd >= busyEnd))) && + // If the buys region was written or the current one is for write + (!memObjectsInQueue_[j].readOnly_ || !readOnly)) { + flushL1Cache = true; + break; } + } - uint64_t curStart = memory->hbOffset(); - uint64_t curEnd = curStart + memory->hbSize(); + // Did we reach the limit? 
+ if (maxMemObjectsInQueue_ <= (numMemObjectsInQueue_ + 1)) { + flushL1Cache = true; + } - // Loop through all memory objects in the queue and find dependency - // @note don't include objects from the current kernel - for (size_t j = 0; j < endMemObjectsInQueue_; ++j) { - // Check if the queue already contains this mem object and - // GPU operations aren't readonly - uint64_t busyStart = memObjectsInQueue_[j].start_; - uint64_t busyEnd = memObjectsInQueue_[j].end_; + if (flushL1Cache) { + // Flush cache + gpu.flushCUCaches(); - // Check if the start inside the busy region - if ((((curStart >= busyStart) && (curStart < busyEnd)) || - // Check if the end inside the busy region - ((curEnd > busyStart) && (curEnd <= busyEnd)) || - // Check if the start/end cover the busy region - ((curStart <= busyStart) && (curEnd >= busyEnd))) && - // If the buys region was written or the current one is for write - (!memObjectsInQueue_[j].readOnly_ || !readOnly)) { - flushL1Cache = true; - break; - } - } + // Clear memory dependency state + const static bool All = true; + clear(!All); + } - // Did we reach the limit? 
- if (maxMemObjectsInQueue_ <= (numMemObjectsInQueue_ + 1)) { - flushL1Cache = true; - } - - if (flushL1Cache) { - // Flush cache - gpu.flushCUCaches(); - - // Clear memory dependency state - const static bool All = true; - clear(!All); - } - - // Insert current memory object into the queue always, - // since runtime calls flush before kernel execution and it has to keep - // current kernel in tracking - memObjectsInQueue_ - [numMemObjectsInQueue_].start_ = curStart; - memObjectsInQueue_ - [numMemObjectsInQueue_].end_ = curEnd; - memObjectsInQueue_ - [numMemObjectsInQueue_].readOnly_ = readOnly; - numMemObjectsInQueue_++; + // Insert current memory object into the queue always, + // since runtime calls flush before kernel execution and it has to keep + // current kernel in tracking + memObjectsInQueue_[numMemObjectsInQueue_].start_ = curStart; + memObjectsInQueue_[numMemObjectsInQueue_].end_ = curEnd; + memObjectsInQueue_[numMemObjectsInQueue_].readOnly_ = readOnly; + numMemObjectsInQueue_++; } -void -VirtualGPU::MemoryDependency::clear(bool all) -{ - if (numMemObjectsInQueue_ > 0) { - size_t i, j; - if (all) { - endMemObjectsInQueue_ = numMemObjectsInQueue_; - } - - // Preserve all objects from the current kernel - for (i = 0, j = endMemObjectsInQueue_; j < numMemObjectsInQueue_; i++, j++) { - memObjectsInQueue_[i].start_ = memObjectsInQueue_[j].start_; - memObjectsInQueue_[i].end_ = memObjectsInQueue_[j].end_; - memObjectsInQueue_[i].readOnly_ = memObjectsInQueue_[j].readOnly_; - } - // Clear all objects except current kernel - memset(&memObjectsInQueue_[i], 0, sizeof(amd::Memory*) * numMemObjectsInQueue_); - numMemObjectsInQueue_ -= endMemObjectsInQueue_; - endMemObjectsInQueue_ = 0; +void VirtualGPU::MemoryDependency::clear(bool all) { + if (numMemObjectsInQueue_ > 0) { + size_t i, j; + if (all) { + endMemObjectsInQueue_ = numMemObjectsInQueue_; } + + // Preserve all objects from the current kernel + for (i = 0, j = endMemObjectsInQueue_; j < 
numMemObjectsInQueue_; i++, j++) { + memObjectsInQueue_[i].start_ = memObjectsInQueue_[j].start_; + memObjectsInQueue_[i].end_ = memObjectsInQueue_[j].end_; + memObjectsInQueue_[i].readOnly_ = memObjectsInQueue_[j].readOnly_; + } + // Clear all objects except current kernel + memset(&memObjectsInQueue_[i], 0, sizeof(amd::Memory*) * numMemObjectsInQueue_); + numMemObjectsInQueue_ -= endMemObjectsInQueue_; + endMemObjectsInQueue_ = 0; + } } -VirtualGPU::DmaFlushMgmt::DmaFlushMgmt(const Device& dev) - : cbWorkload_(0) - , dispatchSplitSize_(0) -{ - aluCnt_ = dev.info().simdPerCU_ * dev.info().simdWidth_ * dev.info().maxComputeUnits_; - maxDispatchWorkload_ = static_cast(dev.info().maxClockFrequency_) * - // find time in us - dev.settings().maxWorkloadTime_ * - aluCnt_; - resetCbWorkload(dev); +VirtualGPU::DmaFlushMgmt::DmaFlushMgmt(const Device& dev) : cbWorkload_(0), dispatchSplitSize_(0) { + aluCnt_ = dev.info().simdPerCU_ * dev.info().simdWidth_ * dev.info().maxComputeUnits_; + maxDispatchWorkload_ = static_cast(dev.info().maxClockFrequency_) * + // find time in us + dev.settings().maxWorkloadTime_ * aluCnt_; + resetCbWorkload(dev); } -void -VirtualGPU::DmaFlushMgmt::resetCbWorkload(const Device& dev) -{ +void VirtualGPU::DmaFlushMgmt::resetCbWorkload(const Device& dev) { + cbWorkload_ = 0; + maxCbWorkload_ = static_cast(dev.info().maxClockFrequency_) * + // find time in us + dev.settings().minWorkloadTime_ * aluCnt_; +} + +void VirtualGPU::DmaFlushMgmt::findSplitSize(const Device& dev, uint64_t threads, + uint instructions) { + uint64_t workload = threads * instructions; + if (maxDispatchWorkload_ < workload) { + dispatchSplitSize_ = static_cast(maxDispatchWorkload_ / instructions); + uint fullLoad = dev.info().maxComputeUnits_ * dev.info().maxWorkGroupSize_; + if ((dispatchSplitSize_ % fullLoad) != 0) { + dispatchSplitSize_ = (dispatchSplitSize_ / fullLoad + 1) * fullLoad; + } + } else { + dispatchSplitSize_ = + (threads > dev.settings().workloadSplitSize_) ? 
dev.settings().workloadSplitSize_ : 0; + } +} + +bool VirtualGPU::DmaFlushMgmt::isCbReady(VirtualGPU& gpu, uint64_t threads, uint instructions) { + bool cbReady = false; + uint64_t workload = amd::alignUp(threads, 4 * aluCnt_) * instructions; + // Add current workload to the overall workload in the current DMA + cbWorkload_ += workload; + // Did it exceed maximum? + if (cbWorkload_ > maxCbWorkload_) { + // Reset DMA workload cbWorkload_ = 0; - maxCbWorkload_ = static_cast(dev.info().maxClockFrequency_) * - // find time in us - dev.settings().minWorkloadTime_ * aluCnt_; + // Increase workload of the next DMA buffer by 50% + maxCbWorkload_ = maxCbWorkload_ * 3 / 2; + if (maxCbWorkload_ > maxDispatchWorkload_) { + maxCbWorkload_ = maxDispatchWorkload_; + } + cbReady = true; + } + return cbReady; } -void -VirtualGPU::DmaFlushMgmt::findSplitSize( - const Device& dev, uint64_t threads, uint instructions) -{ - uint64_t workload = threads * instructions; - if (maxDispatchWorkload_ < workload) { - dispatchSplitSize_ = static_cast(maxDispatchWorkload_ / instructions); - uint fullLoad = dev.info().maxComputeUnits_ * dev.info().maxWorkGroupSize_; - if ((dispatchSplitSize_ % fullLoad) != 0) { - dispatchSplitSize_ = (dispatchSplitSize_ / fullLoad + 1) * fullLoad; - } - } - else { - dispatchSplitSize_ = (threads > dev.settings().workloadSplitSize_) ? - dev.settings().workloadSplitSize_ : 0; - } +bool VirtualGPU::gslOpen(uint nEngines, gslEngineDescriptor* engines, uint32_t rtCUs) { + // GSL device initialization + dev().PerformFullInitialization(); + + // Wait the event + m_waitType = dev().settings().syncObject_ ? 
CAL_WAIT_LOW_CPU_UTILIZATION : CAL_WAIT_POLLING; + + if (!open(&dev(), nEngines, engines, rtCUs)) { + return false; + } + + return true; } -bool -VirtualGPU::DmaFlushMgmt::isCbReady( - VirtualGPU& gpu, uint64_t threads, uint instructions) -{ - bool cbReady = false; - uint64_t workload = amd::alignUp(threads, 4 * aluCnt_) * instructions; - // Add current workload to the overall workload in the current DMA - cbWorkload_ += workload; - // Did it exceed maximum? - if (cbWorkload_ > maxCbWorkload_) { - // Reset DMA workload - cbWorkload_ = 0; - // Increase workload of the next DMA buffer by 50% - maxCbWorkload_ = maxCbWorkload_ * 3 / 2; - if (maxCbWorkload_ > maxDispatchWorkload_) { - maxCbWorkload_ = maxDispatchWorkload_; - } - cbReady = true; - } - return cbReady; +void VirtualGPU::gslDestroy() { close(dev().getNative()); } + +void VirtualGPU::addXferWrite(Memory& memory) { + if (xferWriteBuffers_.size() > 7) { + dev().xferWrite().release(*this, *xferWriteBuffers_.front()); + xferWriteBuffers_.pop_front(); + } + + // Delay destruction + xferWriteBuffers_.push_back(&memory); } -bool -VirtualGPU::gslOpen(uint nEngines, gslEngineDescriptor *engines, uint32_t rtCUs) -{ - // GSL device initialization - dev().PerformFullInitialization(); - - // Wait the event - m_waitType = dev().settings().syncObject_ - ? 
CAL_WAIT_LOW_CPU_UTILIZATION - : CAL_WAIT_POLLING; - - if (!open(&dev(), nEngines, engines, rtCUs)) { - return false; - } - - return true; +void VirtualGPU::releaseXferWrite() { + for (auto& memory : xferWriteBuffers_) { + dev().xferWrite().release(*this, *memory); + } + xferWriteBuffers_.clear(); } -void -VirtualGPU::gslDestroy() -{ - close(dev().getNative()); -} - -void -VirtualGPU::addXferWrite(Memory& memory) -{ - if (xferWriteBuffers_.size() > 7) { - dev().xferWrite().release(*this, *xferWriteBuffers_.front()); - xferWriteBuffers_.pop_front(); +void VirtualGPU::addPinnedMem(amd::Memory* mem) { + if (NULL == findPinnedMem(mem->getHostMem(), mem->getSize())) { + if (pinnedMems_.size() > 7) { + pinnedMems_.front()->release(); + pinnedMems_.pop_front(); } + // Start operation, since we should release mem object + flushDMA(getGpuEvent(dev().getGpuMemory(mem)->gslResource())->engineId_); + // Delay destruction - xferWriteBuffers_.push_back(&memory); + pinnedMems_.push_back(mem); + } } -void -VirtualGPU::releaseXferWrite() -{ - for (auto& memory : xferWriteBuffers_) { - dev().xferWrite().release(*this, *memory); - } - xferWriteBuffers_.clear(); +void VirtualGPU::releasePinnedMem() { + for (auto& amdMemory : pinnedMems_) { + amdMemory->release(); + } + pinnedMems_.clear(); } -void -VirtualGPU::addPinnedMem(amd::Memory* mem) -{ - if (NULL == findPinnedMem(mem->getHostMem(), mem->getSize())) { - if (pinnedMems_.size() > 7) { - pinnedMems_.front()->release(); - pinnedMems_.pop_front(); - } - - // Start operation, since we should release mem object - flushDMA(getGpuEvent(dev().getGpuMemory(mem)->gslResource())->engineId_); - - // Delay destruction - pinnedMems_.push_back(mem); +amd::Memory* VirtualGPU::findPinnedMem(void* addr, size_t size) { + for (auto& amdMemory : pinnedMems_) { + if ((amdMemory->getHostMem() == addr) && (size <= amdMemory->getSize())) { + return amdMemory; } + } + return NULL; } -void -VirtualGPU::releasePinnedMem() -{ - for (auto& amdMemory : 
pinnedMems_) { - amdMemory->release(); - } - pinnedMems_.clear(); -} +bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) { + uint MinDeviceQueueSize = 16 * 1024; + deviceQueueSize = std::max(deviceQueueSize, MinDeviceQueueSize); -amd::Memory* -VirtualGPU::findPinnedMem(void* addr, size_t size) -{ - for (auto& amdMemory : pinnedMems_) { - if ((amdMemory->getHostMem() == addr) && (size <= amdMemory->getSize())) { - return amdMemory; - } - } - return NULL; -} + maskGroups_ = deviceQueueSize / (512 * Ki); + maskGroups_ = (maskGroups_ == 0) ? 1 : maskGroups_; -bool -VirtualGPU::createVirtualQueue(uint deviceQueueSize) -{ - uint MinDeviceQueueSize = 16 * 1024; - deviceQueueSize = std::max(deviceQueueSize, MinDeviceQueueSize); - - maskGroups_ = deviceQueueSize / (512 * Ki); - maskGroups_ = (maskGroups_== 0) ? 1 : maskGroups_; - - // Align the queue size for the multiple dispatch scheduler. - // Each thread works with 32 entries * maskGroups - uint extra = deviceQueueSize % (sizeof(AmdAqlWrap) * - DeviceQueueMaskSize * maskGroups_); - if (extra != 0) { - deviceQueueSize += (sizeof(AmdAqlWrap) * - DeviceQueueMaskSize * maskGroups_) - extra; - } - - if (deviceQueueSize_ == deviceQueueSize) { - return true; - } - else { - //! 
@todo Temporarily keep the buffer mapped for debug purpose - if (NULL != schedParams_) { - schedParams_->unmap(this); - } - delete vqHeader_; - delete virtualQueue_; - delete schedParams_; - vqHeader_ = NULL; - virtualQueue_ = NULL; - schedParams_ = NULL; - schedParamIdx_ = 0; - deviceQueueSize_ = 0; - } - uint numSlots = deviceQueueSize / sizeof(AmdAqlWrap); - uint allocSize = deviceQueueSize; - - // Add the virtual queue header - allocSize += sizeof(AmdVQueueHeader); - allocSize = amd::alignUp(allocSize, sizeof(AmdAqlWrap)); - - uint argOffs = allocSize; - - // Add the kernel arguments and wait events - uint singleArgSize = amd::alignUp(dev().info().maxParameterSize_ + 64 + - dev().settings().numWaitEvents_ * sizeof(uint64_t), sizeof(AmdAqlWrap)); - allocSize += singleArgSize * numSlots; - - uint eventsOffs = allocSize; - // Add the device events - allocSize += dev().settings().numDeviceEvents_ * sizeof(AmdEvent); - - uint eventMaskOffs = allocSize; - // Add mask array for events - allocSize += amd::alignUp(dev().settings().numDeviceEvents_, DeviceQueueMaskSize) / 8; - - uint slotMaskOffs = allocSize; - // Add mask array for AmdAqlWrap slots - allocSize += amd::alignUp(numSlots, DeviceQueueMaskSize) / 8; - - virtualQueue_ = new Memory(dev(), allocSize); - Resource::MemoryType type = (GPU_PRINT_CHILD_KERNEL == 0) ? - Resource::Local : Resource::Remote; - if ((virtualQueue_ == NULL) || !virtualQueue_->create(type)) { - return false; - } - address ptr = reinterpret_cast
( - virtualQueue_->map(this, Resource::WriteOnly)); - if (NULL == ptr) { - return false; - } - // Clear memory - memset(ptr, 0, allocSize); - uint64_t vaBase = virtualQueue_->vmAddress(); - AmdVQueueHeader* header = reinterpret_cast(ptr); - - // Initialize the virtual queue header - header->aql_slot_num = numSlots; - header->event_slot_num = dev().settings().numDeviceEvents_; - header->event_slot_mask = vaBase + eventMaskOffs; - header->event_slots = vaBase + eventsOffs; - header->aql_slot_mask = vaBase + slotMaskOffs; - header->wait_size = dev().settings().numWaitEvents_; - header->arg_size = dev().info().maxParameterSize_ + 64; - header->mask_groups = maskGroups_; - vqHeader_ = new AmdVQueueHeader; - if (NULL == vqHeader_) { - return false; - } - *vqHeader_ = *header; - - // Go over all slots and perform initialization - AmdAqlWrap* slots = reinterpret_cast(&header[1]); - for (uint i = 0; i < numSlots; ++i) { - uint64_t argStart = vaBase + argOffs + i * singleArgSize; - slots[i].aql.kernarg_address = reinterpret_cast(argStart); - slots[i].wait_list = argStart + dev().info().maxParameterSize_ + 64; - } - // Upload data back to local memory - if (GPU_PRINT_CHILD_KERNEL == 0) { - virtualQueue_->unmap(this); - } - - schedParams_ = new Memory(dev(), 64 * Ki); - if ((schedParams_ == NULL) || !schedParams_->create(Resource::RemoteUSWC)) { - return false; - } - - ptr = reinterpret_cast
(schedParams_->map(this)); - - deviceQueueSize_ = deviceQueueSize; + // Align the queue size for the multiple dispatch scheduler. + // Each thread works with 32 entries * maskGroups + uint extra = deviceQueueSize % (sizeof(AmdAqlWrap) * DeviceQueueMaskSize * maskGroups_); + if (extra != 0) { + deviceQueueSize += (sizeof(AmdAqlWrap) * DeviceQueueMaskSize * maskGroups_) - extra; + } + if (deviceQueueSize_ == deviceQueueSize) { return true; -} - -VirtualGPU::VirtualGPU( - Device& device) - : device::VirtualDevice(device) - , CALGSLContext() - , engineID_(MainEngine) - , activeKernelDesc_(NULL) - , gpuDevice_(static_cast(device)) - , execution_("Virtual GPU execution lock", true) - , printfDbg_(NULL) - , printfDbgHSA_(NULL) - , tsCache_(NULL) - , vmMems_(NULL) - , numVmMems_(0) - , dmaFlushMgmt_(device) - , hwRing_(0) - , readjustTimeGPU_(0) - , currTs_(NULL) - , vqHeader_(NULL) - , virtualQueue_(NULL) - , schedParams_(NULL) - , schedParamIdx_(0) - , deviceQueueSize_(0) - , maskGroups_(1) - , hsaQueueMem_(NULL) - , profileEnabled_(false) -{ - memset(&cal_, 0, sizeof(CalVirtualDesc)); - for (uint i = 0; i < AllEngines; ++i) { - cal_.events_[i].invalidate(); - } - memset(&cal_.samplersState_, 0xff, sizeof(cal_.samplersState_)); - - // Note: Virtual GPU device creation must be a thread safe operation - index_ = gpuDevice_.numOfVgpus_++; - gpuDevice_.vgpus_.resize(gpuDevice_.numOfVgpus()); - gpuDevice_.vgpus_[index()] = this; -} - -bool -VirtualGPU::create(bool profiling, uint rtCUs, uint deviceQueueSize, - amd::CommandQueue::Priority priority) -{ - device::BlitManager::Setup blitSetup; - gslEngineDescriptor engines[2]; - uint engineMask = 0; - uint32_t num = 0; - - if (index() >= GPU_MAX_COMMAND_QUEUES) { - // Cap the maximum number of concurrent Virtual GPUs. 
- return false; - } - - // Virtual GPU will have profiling enabled - state_.profiling_ = profiling; - - { - if (dev().engines().numComputeRings()) { - uint idx; - - if ((amd::CommandQueue::RealTimeDisabled == rtCUs) && - (priority == amd::CommandQueue::Priority::Normal)) { - idx = index() % dev().engines().numComputeRings(); - engineMask = dev().engines().getMask( - (gslEngineID)(dev().isComputeRingIDForced() ? - dev().getforcedComputeEngineID() : - (dev().getFirstAvailableComputeEngineID() + idx))); - - } - else { - if (priority == amd::CommandQueue::Priority::Medium) { - engineMask = dev().engines().getMask((gslEngineID) - (GSL_ENGINEID_COMPUTE_MEDIUM_PRIORITY)); - } - else { - engineMask = dev().engines().getMask((gslEngineID) - (GSL_ENGINEID_COMPUTE_RT)); - } - //!@todo This is not a generic solution and - // may have issues with > 8 queues - idx = index() % (dev().engines().numComputeRings() + - dev().engines().numComputeRingsRT()); - } - // hwRing_ should be set 0 if forced to have single scratch buffer - hwRing_ = (dev().settings().useSingleScratch_) ? 
0 : idx; - - if (dev().canDMA()) { - // If only 1 DMA engine is available then use that one - if (dev().engines().numDMAEngines() < 2) { - engineMask |= dev().engines().getMask(GSL_ENGINEID_DRMDMA0); - } - else if (index() & 0x1) { - engineMask |= dev().engines().getMask(GSL_ENGINEID_DRMDMA0); - } - else { - engineMask |= dev().engines().getMask(GSL_ENGINEID_DRMDMA1); - } - } - } - else { - engineMask = dev().engines().getMask(GSL_ENGINEID_3DCOMPUTE0); - if (dev().canDMA()) { - engineMask |= dev().engines().getMask(GSL_ENGINEID_DRMDMA0); - } - } - } - num = dev().engines().getRequested(engineMask, engines); - - // Open GSL context - if ((num == 0) || !gslOpen(num, engines, rtCUs)) { - return false; - } - - // Diable double copy optimization, - // since UAV read from nonlocal is fast enough - blitSetup.disableCopyBufferToImageOpt_ = true; - if (!allocConstantBuffers()) { - return false; - } - - // Create Printf class - printfDbg_ = new PrintfDbg(gpuDevice_); - if ((NULL == printfDbg_) || !printfDbg_->create()) { - delete printfDbg_; - LogError("Could not allocate debug buffer for printf()!"); - return false; - } - - // Create HSAILPrintf class - printfDbgHSA_ = new PrintfDbgHSA(gpuDevice_); - if (NULL == printfDbgHSA_) { - delete printfDbgHSA_; - LogError("Could not create PrintfDbgHSA class!"); - return false; - } - - // Choose the appropriate class for blit engine - switch (dev().settings().blitEngine_) { - default: - // Fall through ... - case Settings::BlitEngineHost: - blitSetup.disableAll(); - // Fall through ... 
- case Settings::BlitEngineCAL: - case Settings::BlitEngineKernel: - // use host blit for HW debug - if (dev().settings().enableHwDebug_) { - blitSetup.disableCopyImageToBuffer_ = true; - blitSetup.disableCopyBufferToImage_ = true; - } - blitMgr_ = new KernelBlitManager(*this, blitSetup); - break; - } - if ((NULL == blitMgr_) || !blitMgr_->create(gpuDevice_)) { - LogError("Could not create BlitManager!"); - return false; - } - - tsCache_ = new TimeStampCache(*this); - if (NULL == tsCache_) { - LogError("Could not create TimeStamp cache!"); - return false; - } - - if (!memoryDependency().create(dev().settings().numMemDependencies_)) { - LogError("Could not create the array of memory objects!"); - return false; - } - - if(!allocHsaQueueMem()) { - LogError("Could not create hsaQueueMem object!"); - return false; - } - - // Check if the app requested a device queue creation - if (dev().settings().useDeviceQueue_ && - (0 != deviceQueueSize) && !createVirtualQueue(deviceQueueSize)) { - LogError("Could not create a virtual queue!"); - return false; - } - - return true; -} - -bool -VirtualGPU::allocHsaQueueMem() -{ - // Allocate a dummy HSA queue - hsaQueueMem_ = new Memory(dev(), sizeof(amd_queue_t)); - if ((hsaQueueMem_ == NULL) || - (!hsaQueueMem_->create(Resource::Local))) { - delete hsaQueueMem_; - return false; - } - amd_queue_t* queue = reinterpret_cast - (hsaQueueMem_->map(NULL, Resource::WriteOnly)); - if (NULL == queue) { - delete hsaQueueMem_; - return false; - } - memset(queue, 0, sizeof(amd_queue_t)); - // Provide private and local heap addresses - const static uint addressShift = LP64_SWITCH(0, 32); - queue->private_segment_aperture_base_hi = - static_cast(dev().gslCtx()->getPrivateApertureBase()>>addressShift); - queue->group_segment_aperture_base_hi = - static_cast(dev().gslCtx()->getSharedApertureBase()>>addressShift); - hsaQueueMem_->unmap(NULL); - return true; -} - -VirtualGPU::~VirtualGPU() -{ - // Not safe to remove a queue. 
So lock the device - amd::ScopedLock k(dev().lockAsyncOps()); - amd::ScopedLock lock(dev().vgpusAccess()); - - uint i; - // Destroy all kernels - for (GslKernels::const_iterator it = gslKernels_.begin(); - it != gslKernels_.end(); ++it) { - if (it->first != 0) { - freeKernelDesc(it->second); - } - } - gslKernels_.clear(); - - // Destroy all memories - static const bool SkipScratch = false; - releaseMemObjects(SkipScratch); - - // Destroy printf object - delete printfDbg_; - - // Destroy printfHSA object - delete printfDbgHSA_; - - // Destroy BlitManager object - delete blitMgr_; - - // Destroy TimeStamp cache - delete tsCache_; - - // Destroy resource list with the constant buffers - for (i = 0; i < constBufs_.size(); ++i) { - delete constBufs_[i]; - } - - gslDestroy(); - - gpuDevice_.numOfVgpus_--; - gpuDevice_.vgpus_.erase(gpuDevice_.vgpus_.begin() + index()); - for (uint idx = index(); idx < dev().vgpus().size(); ++idx) { - dev().vgpus()[idx]->index_--; - } - - // Release scratch buffer memory to reduce memory pressure - //!@note OCLtst uses single device with multiple tests - //! Release memory only if it's the last command queue. - //! The first queue is reserved for the transfers on device - if (gpuDevice_.numOfVgpus_ <= 1) { - gpuDevice_.destroyScratchBuffers(); - } - - delete [] vmMems_; + } else { //! 
@todo Temporarily keep the buffer mapped for debug purpose if (NULL != schedParams_) { - schedParams_->unmap(this); + schedParams_->unmap(this); } delete vqHeader_; delete virtualQueue_; delete schedParams_; + vqHeader_ = NULL; + virtualQueue_ = NULL; + schedParams_ = NULL; + schedParamIdx_ = 0; + deviceQueueSize_ = 0; + } + uint numSlots = deviceQueueSize / sizeof(AmdAqlWrap); + uint allocSize = deviceQueueSize; + + // Add the virtual queue header + allocSize += sizeof(AmdVQueueHeader); + allocSize = amd::alignUp(allocSize, sizeof(AmdAqlWrap)); + + uint argOffs = allocSize; + + // Add the kernel arguments and wait events + uint singleArgSize = amd::alignUp( + dev().info().maxParameterSize_ + 64 + dev().settings().numWaitEvents_ * sizeof(uint64_t), + sizeof(AmdAqlWrap)); + allocSize += singleArgSize * numSlots; + + uint eventsOffs = allocSize; + // Add the device events + allocSize += dev().settings().numDeviceEvents_ * sizeof(AmdEvent); + + uint eventMaskOffs = allocSize; + // Add mask array for events + allocSize += amd::alignUp(dev().settings().numDeviceEvents_, DeviceQueueMaskSize) / 8; + + uint slotMaskOffs = allocSize; + // Add mask array for AmdAqlWrap slots + allocSize += amd::alignUp(numSlots, DeviceQueueMaskSize) / 8; + + virtualQueue_ = new Memory(dev(), allocSize); + Resource::MemoryType type = (GPU_PRINT_CHILD_KERNEL == 0) ? Resource::Local : Resource::Remote; + if ((virtualQueue_ == NULL) || !virtualQueue_->create(type)) { + return false; + } + address ptr = reinterpret_cast
(virtualQueue_->map(this, Resource::WriteOnly)); + if (NULL == ptr) { + return false; + } + // Clear memory + memset(ptr, 0, allocSize); + uint64_t vaBase = virtualQueue_->vmAddress(); + AmdVQueueHeader* header = reinterpret_cast(ptr); + + // Initialize the virtual queue header + header->aql_slot_num = numSlots; + header->event_slot_num = dev().settings().numDeviceEvents_; + header->event_slot_mask = vaBase + eventMaskOffs; + header->event_slots = vaBase + eventsOffs; + header->aql_slot_mask = vaBase + slotMaskOffs; + header->wait_size = dev().settings().numWaitEvents_; + header->arg_size = dev().info().maxParameterSize_ + 64; + header->mask_groups = maskGroups_; + vqHeader_ = new AmdVQueueHeader; + if (NULL == vqHeader_) { + return false; + } + *vqHeader_ = *header; + + // Go over all slots and perform initialization + AmdAqlWrap* slots = reinterpret_cast(&header[1]); + for (uint i = 0; i < numSlots; ++i) { + uint64_t argStart = vaBase + argOffs + i * singleArgSize; + slots[i].aql.kernarg_address = reinterpret_cast(argStart); + slots[i].wait_list = argStart + dev().info().maxParameterSize_ + 64; + } + // Upload data back to local memory + if (GPU_PRINT_CHILD_KERNEL == 0) { + virtualQueue_->unmap(this); + } + + schedParams_ = new Memory(dev(), 64 * Ki); + if ((schedParams_ == NULL) || !schedParams_->create(Resource::RemoteUSWC)) { + return false; + } + + ptr = reinterpret_cast
(schedParams_->map(this)); + + deviceQueueSize_ = deviceQueueSize; + + return true; +} + +VirtualGPU::VirtualGPU(Device& device) + : device::VirtualDevice(device), + CALGSLContext(), + engineID_(MainEngine), + activeKernelDesc_(NULL), + gpuDevice_(static_cast(device)), + execution_("Virtual GPU execution lock", true), + printfDbg_(NULL), + printfDbgHSA_(NULL), + tsCache_(NULL), + vmMems_(NULL), + numVmMems_(0), + dmaFlushMgmt_(device), + hwRing_(0), + readjustTimeGPU_(0), + currTs_(NULL), + vqHeader_(NULL), + virtualQueue_(NULL), + schedParams_(NULL), + schedParamIdx_(0), + deviceQueueSize_(0), + maskGroups_(1), + hsaQueueMem_(NULL), + profileEnabled_(false) { + memset(&cal_, 0, sizeof(CalVirtualDesc)); + for (uint i = 0; i < AllEngines; ++i) { + cal_.events_[i].invalidate(); + } + memset(&cal_.samplersState_, 0xff, sizeof(cal_.samplersState_)); + + // Note: Virtual GPU device creation must be a thread safe operation + index_ = gpuDevice_.numOfVgpus_++; + gpuDevice_.vgpus_.resize(gpuDevice_.numOfVgpus()); + gpuDevice_.vgpus_[index()] = this; +} + +bool VirtualGPU::create(bool profiling, uint rtCUs, uint deviceQueueSize, + amd::CommandQueue::Priority priority) { + device::BlitManager::Setup blitSetup; + gslEngineDescriptor engines[2]; + uint engineMask = 0; + uint32_t num = 0; + + if (index() >= GPU_MAX_COMMAND_QUEUES) { + // Cap the maximum number of concurrent Virtual GPUs. + return false; + } + + // Virtual GPU will have profiling enabled + state_.profiling_ = profiling; + + { + if (dev().engines().numComputeRings()) { + uint idx; + + if ((amd::CommandQueue::RealTimeDisabled == rtCUs) && + (priority == amd::CommandQueue::Priority::Normal)) { + idx = index() % dev().engines().numComputeRings(); + engineMask = dev().engines().getMask((gslEngineID)( + dev().isComputeRingIDForced() ? 
dev().getforcedComputeEngineID() + : (dev().getFirstAvailableComputeEngineID() + idx))); + + } else { + if (priority == amd::CommandQueue::Priority::Medium) { + engineMask = dev().engines().getMask((gslEngineID)(GSL_ENGINEID_COMPUTE_MEDIUM_PRIORITY)); + } else { + engineMask = dev().engines().getMask((gslEngineID)(GSL_ENGINEID_COMPUTE_RT)); + } + //!@todo This is not a generic solution and + // may have issues with > 8 queues + idx = index() % (dev().engines().numComputeRings() + dev().engines().numComputeRingsRT()); + } + // hwRing_ should be set 0 if forced to have single scratch buffer + hwRing_ = (dev().settings().useSingleScratch_) ? 0 : idx; + + if (dev().canDMA()) { + // If only 1 DMA engine is available then use that one + if (dev().engines().numDMAEngines() < 2) { + engineMask |= dev().engines().getMask(GSL_ENGINEID_DRMDMA0); + } else if (index() & 0x1) { + engineMask |= dev().engines().getMask(GSL_ENGINEID_DRMDMA0); + } else { + engineMask |= dev().engines().getMask(GSL_ENGINEID_DRMDMA1); + } + } + } else { + engineMask = dev().engines().getMask(GSL_ENGINEID_3DCOMPUTE0); + if (dev().canDMA()) { + engineMask |= dev().engines().getMask(GSL_ENGINEID_DRMDMA0); + } + } + } + num = dev().engines().getRequested(engineMask, engines); + + // Open GSL context + if ((num == 0) || !gslOpen(num, engines, rtCUs)) { + return false; + } + + // Diable double copy optimization, + // since UAV read from nonlocal is fast enough + blitSetup.disableCopyBufferToImageOpt_ = true; + if (!allocConstantBuffers()) { + return false; + } + + // Create Printf class + printfDbg_ = new PrintfDbg(gpuDevice_); + if ((NULL == printfDbg_) || !printfDbg_->create()) { + delete printfDbg_; + LogError("Could not allocate debug buffer for printf()!"); + return false; + } + + // Create HSAILPrintf class + printfDbgHSA_ = new PrintfDbgHSA(gpuDevice_); + if (NULL == printfDbgHSA_) { + delete printfDbgHSA_; + LogError("Could not create PrintfDbgHSA class!"); + return false; + } + + // Choose the 
appropriate class for blit engine + switch (dev().settings().blitEngine_) { + default: + // Fall through ... + case Settings::BlitEngineHost: + blitSetup.disableAll(); + // Fall through ... + case Settings::BlitEngineCAL: + case Settings::BlitEngineKernel: + // use host blit for HW debug + if (dev().settings().enableHwDebug_) { + blitSetup.disableCopyImageToBuffer_ = true; + blitSetup.disableCopyBufferToImage_ = true; + } + blitMgr_ = new KernelBlitManager(*this, blitSetup); + break; + } + if ((NULL == blitMgr_) || !blitMgr_->create(gpuDevice_)) { + LogError("Could not create BlitManager!"); + return false; + } + + tsCache_ = new TimeStampCache(*this); + if (NULL == tsCache_) { + LogError("Could not create TimeStamp cache!"); + return false; + } + + if (!memoryDependency().create(dev().settings().numMemDependencies_)) { + LogError("Could not create the array of memory objects!"); + return false; + } + + if (!allocHsaQueueMem()) { + LogError("Could not create hsaQueueMem object!"); + return false; + } + + // Check if the app requested a device queue creation + if (dev().settings().useDeviceQueue_ && (0 != deviceQueueSize) && + !createVirtualQueue(deviceQueueSize)) { + LogError("Could not create a virtual queue!"); + return false; + } + + return true; +} + +bool VirtualGPU::allocHsaQueueMem() { + // Allocate a dummy HSA queue + hsaQueueMem_ = new Memory(dev(), sizeof(amd_queue_t)); + if ((hsaQueueMem_ == NULL) || (!hsaQueueMem_->create(Resource::Local))) { delete hsaQueueMem_; + return false; + } + amd_queue_t* queue = reinterpret_cast(hsaQueueMem_->map(NULL, Resource::WriteOnly)); + if (NULL == queue) { + delete hsaQueueMem_; + return false; + } + memset(queue, 0, sizeof(amd_queue_t)); + // Provide private and local heap addresses + const static uint addressShift = LP64_SWITCH(0, 32); + queue->private_segment_aperture_base_hi = + static_cast(dev().gslCtx()->getPrivateApertureBase() >> addressShift); + queue->group_segment_aperture_base_hi = + 
static_cast(dev().gslCtx()->getSharedApertureBase() >> addressShift); + hsaQueueMem_->unmap(NULL); + return true; } -void -VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); +VirtualGPU::~VirtualGPU() { + // Not safe to remove a queue. So lock the device + amd::ScopedLock k(dev().lockAsyncOps()); + amd::ScopedLock lock(dev().vgpusAccess()); - // Translate memory references and ensure cache up-to-date - gpu::Memory* memory = dev().getGpuMemory(&vcmd.source()); - - size_t offset = 0; - // Find if virtual address is a CL allocation - device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.destination(), &offset); - - profilingBegin(vcmd, true); - - memory->syncCacheFromHost(*this); - cl_command_type type = vcmd.type(); - bool result = false; - amd::Memory* bufferFromImage = NULL; - - // Force buffer read for IMAGE1D_BUFFER - if ((type == CL_COMMAND_READ_IMAGE) && - (vcmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { - bufferFromImage = createBufferFromImage(vcmd.source()); - if (NULL == bufferFromImage) { - LogError("We should not fail buffer creation from image_buffer!"); - } - else { - type = CL_COMMAND_READ_BUFFER; - bufferFromImage->setVirtualDevice(this); - memory = dev().getGpuMemory(bufferFromImage); - } + uint i; + // Destroy all kernels + for (GslKernels::const_iterator it = gslKernels_.begin(); it != gslKernels_.end(); ++it) { + if (it->first != 0) { + freeKernelDesc(it->second); } + } + gslKernels_.clear(); - // Process different write commands - switch (type) { + // Destroy all memories + static const bool SkipScratch = false; + releaseMemObjects(SkipScratch); + + // Destroy printf object + delete printfDbg_; + + // Destroy printfHSA object + delete printfDbgHSA_; + + // Destroy BlitManager object + delete blitMgr_; + + // Destroy TimeStamp cache + delete tsCache_; + + // Destroy resource list with the constant buffers + for (i = 0; 
i < constBufs_.size(); ++i) { + delete constBufs_[i]; + } + + gslDestroy(); + + gpuDevice_.numOfVgpus_--; + gpuDevice_.vgpus_.erase(gpuDevice_.vgpus_.begin() + index()); + for (uint idx = index(); idx < dev().vgpus().size(); ++idx) { + dev().vgpus()[idx]->index_--; + } + + // Release scratch buffer memory to reduce memory pressure + //!@note OCLtst uses single device with multiple tests + //! Release memory only if it's the last command queue. + //! The first queue is reserved for the transfers on device + if (gpuDevice_.numOfVgpus_ <= 1) { + gpuDevice_.destroyScratchBuffers(); + } + + delete[] vmMems_; + //! @todo Temporarily keep the buffer mapped for debug purpose + if (NULL != schedParams_) { + schedParams_->unmap(this); + } + delete vqHeader_; + delete virtualQueue_; + delete schedParams_; + delete hsaQueueMem_; +} + +void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + // Translate memory references and ensure cache up-to-date + gpu::Memory* memory = dev().getGpuMemory(&vcmd.source()); + + size_t offset = 0; + // Find if virtual address is a CL allocation + device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.destination(), &offset); + + profilingBegin(vcmd, true); + + memory->syncCacheFromHost(*this); + cl_command_type type = vcmd.type(); + bool result = false; + amd::Memory* bufferFromImage = NULL; + + // Force buffer read for IMAGE1D_BUFFER + if ((type == CL_COMMAND_READ_IMAGE) && + (vcmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { + bufferFromImage = createBufferFromImage(vcmd.source()); + if (NULL == bufferFromImage) { + LogError("We should not fail buffer creation from image_buffer!"); + } else { + type = CL_COMMAND_READ_BUFFER; + bufferFromImage->setVirtualDevice(this); + memory = dev().getGpuMemory(bufferFromImage); + } + } + + // Process different write commands + switch (type) { case CL_COMMAND_READ_BUFFER: { - 
amd::Coord3D origin(vcmd.origin()[0]); - amd::Coord3D size(vcmd.size()[0]); - if (NULL != bufferFromImage) { - size_t elemSize = - vcmd.source().asImage()->getImageFormat().getElementSize(); - origin.c[0] *= elemSize; - size.c[0] *= elemSize; - } - if (hostMemory != NULL) { - // Accelerated transfer without pinning - amd::Coord3D dstOrigin(offset); - result = blitMgr().copyBuffer(*memory, *hostMemory, - origin, dstOrigin, size, vcmd.isEntireMemory()); - } - else { - result = blitMgr().readBuffer( - *memory, vcmd.destination(), - origin, size, vcmd.isEntireMemory()); - } - if (NULL != bufferFromImage) { - bufferFromImage->release(); - } - } - break; + amd::Coord3D origin(vcmd.origin()[0]); + amd::Coord3D size(vcmd.size()[0]); + if (NULL != bufferFromImage) { + size_t elemSize = vcmd.source().asImage()->getImageFormat().getElementSize(); + origin.c[0] *= elemSize; + size.c[0] *= elemSize; + } + if (hostMemory != NULL) { + // Accelerated transfer without pinning + amd::Coord3D dstOrigin(offset); + result = blitMgr().copyBuffer(*memory, *hostMemory, origin, dstOrigin, size, + vcmd.isEntireMemory()); + } else { + result = + blitMgr().readBuffer(*memory, vcmd.destination(), origin, size, vcmd.isEntireMemory()); + } + if (NULL != bufferFromImage) { + bufferFromImage->release(); + } + } break; case CL_COMMAND_READ_BUFFER_RECT: { - amd::BufferRect hostbufferRect; - amd::Coord3D region(0); - amd::Coord3D hostOrigin(vcmd.hostRect().start_+ offset); - hostbufferRect.create(hostOrigin.c, vcmd.size().c , vcmd.hostRect().rowPitch_, vcmd.hostRect().slicePitch_); - if (hostMemory != NULL) { - result = blitMgr().copyBufferRect(*memory, *hostMemory, - vcmd.bufRect(), hostbufferRect, vcmd.size(), - vcmd.isEntireMemory()); - } - else { - result = blitMgr().readBufferRect(*memory, - vcmd.destination(), vcmd.bufRect(), vcmd.hostRect(), vcmd.size(), - vcmd.isEntireMemory()); - } - } - break; + amd::BufferRect hostbufferRect; + amd::Coord3D region(0); + amd::Coord3D 
hostOrigin(vcmd.hostRect().start_ + offset); + hostbufferRect.create(hostOrigin.c, vcmd.size().c, vcmd.hostRect().rowPitch_, + vcmd.hostRect().slicePitch_); + if (hostMemory != NULL) { + result = blitMgr().copyBufferRect(*memory, *hostMemory, vcmd.bufRect(), hostbufferRect, + vcmd.size(), vcmd.isEntireMemory()); + } else { + result = blitMgr().readBufferRect(*memory, vcmd.destination(), vcmd.bufRect(), + vcmd.hostRect(), vcmd.size(), vcmd.isEntireMemory()); + } + } break; case CL_COMMAND_READ_IMAGE: - if (hostMemory != NULL) { - // Accelerated image to buffer transfer without pinning - amd::Coord3D dstOrigin(offset); - result = blitMgr().copyImageToBuffer(*memory, *hostMemory, - vcmd.origin(), dstOrigin, vcmd.size(), - vcmd.isEntireMemory(), - vcmd.rowPitch(), vcmd.slicePitch()); - } - else { - result = blitMgr().readImage(*memory, vcmd.destination(), - vcmd.origin(), vcmd.size(), vcmd.rowPitch(), vcmd.slicePitch(), - vcmd.isEntireMemory()); - } - break; + if (hostMemory != NULL) { + // Accelerated image to buffer transfer without pinning + amd::Coord3D dstOrigin(offset); + result = + blitMgr().copyImageToBuffer(*memory, *hostMemory, vcmd.origin(), dstOrigin, vcmd.size(), + vcmd.isEntireMemory(), vcmd.rowPitch(), vcmd.slicePitch()); + } else { + result = blitMgr().readImage(*memory, vcmd.destination(), vcmd.origin(), vcmd.size(), + vcmd.rowPitch(), vcmd.slicePitch(), vcmd.isEntireMemory()); + } + break; default: - LogError("Unsupported type for the read command"); - break; - } + LogError("Unsupported type for the read command"); + break; + } - if (!result) { - LogError("submitReadMemory failed!"); - vcmd.setStatus(CL_INVALID_OPERATION); - } + if (!result) { + LogError("submitReadMemory failed!"); + vcmd.setStatus(CL_INVALID_OPERATION); + } - profilingEnd(vcmd); + profilingEnd(vcmd); } -void -VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); +void 
VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); - // Translate memory references and ensure cache up to date - gpu::Memory* memory = dev().getGpuMemory(&vcmd.destination()); - size_t offset = 0; - // Find if virtual address is a CL allocation - device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.source(), &offset); + // Translate memory references and ensure cache up to date + gpu::Memory* memory = dev().getGpuMemory(&vcmd.destination()); + size_t offset = 0; + // Find if virtual address is a CL allocation + device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.source(), &offset); - profilingBegin(vcmd, true); + profilingBegin(vcmd, true); - bool entire = vcmd.isEntireMemory(); + bool entire = vcmd.isEntireMemory(); - // Synchronize memory from host if necessary - device::Memory::SyncFlags syncFlags; - syncFlags.skipEntire_ = entire; - memory->syncCacheFromHost(*this, syncFlags); + // Synchronize memory from host if necessary + device::Memory::SyncFlags syncFlags; + syncFlags.skipEntire_ = entire; + memory->syncCacheFromHost(*this, syncFlags); - cl_command_type type = vcmd.type(); - bool result = false; - amd::Memory* bufferFromImage = NULL; + cl_command_type type = vcmd.type(); + bool result = false; + amd::Memory* bufferFromImage = NULL; - // Force buffer write for IMAGE1D_BUFFER - if ((type == CL_COMMAND_WRITE_IMAGE) && - (vcmd.destination().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { - bufferFromImage = createBufferFromImage(vcmd.destination()); - if (NULL == bufferFromImage) { - LogError("We should not fail buffer creation from image_buffer!"); - } - else { - type = CL_COMMAND_WRITE_BUFFER; - bufferFromImage->setVirtualDevice(this); - memory = dev().getGpuMemory(bufferFromImage); - } + // Force buffer write for IMAGE1D_BUFFER + if ((type == CL_COMMAND_WRITE_IMAGE) && + (vcmd.destination().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { 
+ bufferFromImage = createBufferFromImage(vcmd.destination()); + if (NULL == bufferFromImage) { + LogError("We should not fail buffer creation from image_buffer!"); + } else { + type = CL_COMMAND_WRITE_BUFFER; + bufferFromImage->setVirtualDevice(this); + memory = dev().getGpuMemory(bufferFromImage); } + } - // Process different write commands - switch (type) { + // Process different write commands + switch (type) { case CL_COMMAND_WRITE_BUFFER: { - amd::Coord3D origin(vcmd.origin()[0]); - amd::Coord3D size(vcmd.size()[0]); - if (NULL != bufferFromImage) { - size_t elemSize = - vcmd.destination().asImage()->getImageFormat().getElementSize(); - origin.c[0] *= elemSize; - size.c[0] *= elemSize; - } - if (hostMemory != NULL) { - // Accelerated transfer without pinning - amd::Coord3D srcOrigin(offset); - result = blitMgr().copyBuffer(*hostMemory, *memory, - srcOrigin, origin, size, vcmd.isEntireMemory()); - } - else { - result = blitMgr().writeBuffer(vcmd.source(), *memory, - origin, size, vcmd.isEntireMemory()); - } - if (NULL != bufferFromImage) { - bufferFromImage->release(); - } - } - break; + amd::Coord3D origin(vcmd.origin()[0]); + amd::Coord3D size(vcmd.size()[0]); + if (NULL != bufferFromImage) { + size_t elemSize = vcmd.destination().asImage()->getImageFormat().getElementSize(); + origin.c[0] *= elemSize; + size.c[0] *= elemSize; + } + if (hostMemory != NULL) { + // Accelerated transfer without pinning + amd::Coord3D srcOrigin(offset); + result = blitMgr().copyBuffer(*hostMemory, *memory, srcOrigin, origin, size, + vcmd.isEntireMemory()); + } else { + result = blitMgr().writeBuffer(vcmd.source(), *memory, origin, size, vcmd.isEntireMemory()); + } + if (NULL != bufferFromImage) { + bufferFromImage->release(); + } + } break; case CL_COMMAND_WRITE_BUFFER_RECT: { - amd::BufferRect hostbufferRect; - amd::Coord3D region(0); - amd::Coord3D hostOrigin(vcmd.hostRect().start_+ offset); - hostbufferRect.create(hostOrigin.c, vcmd.size().c , vcmd.hostRect().rowPitch_, 
vcmd.hostRect().slicePitch_); - if (hostMemory != NULL) { - result = blitMgr().copyBufferRect(*hostMemory, *memory, - hostbufferRect, vcmd.bufRect(), vcmd.size(), - vcmd.isEntireMemory()); - } - else { - result = blitMgr().writeBufferRect(vcmd.source(), *memory, - vcmd.hostRect(), vcmd.bufRect(), vcmd.size(), - vcmd.isEntireMemory()); - } - } - break; + amd::BufferRect hostbufferRect; + amd::Coord3D region(0); + amd::Coord3D hostOrigin(vcmd.hostRect().start_ + offset); + hostbufferRect.create(hostOrigin.c, vcmd.size().c, vcmd.hostRect().rowPitch_, + vcmd.hostRect().slicePitch_); + if (hostMemory != NULL) { + result = blitMgr().copyBufferRect(*hostMemory, *memory, hostbufferRect, vcmd.bufRect(), + vcmd.size(), vcmd.isEntireMemory()); + } else { + result = blitMgr().writeBufferRect(vcmd.source(), *memory, vcmd.hostRect(), vcmd.bufRect(), + vcmd.size(), vcmd.isEntireMemory()); + } + } break; case CL_COMMAND_WRITE_IMAGE: - if (hostMemory != NULL) { - // Accelerated buffer to image transfer without pinning - amd::Coord3D srcOrigin(offset); - result = blitMgr().copyBufferToImage(*hostMemory, *memory, - srcOrigin, vcmd.origin(), vcmd.size(), - vcmd.isEntireMemory(), - vcmd.rowPitch(), vcmd.slicePitch()); - } - else { - result = blitMgr().writeImage(vcmd.source(), *memory, - vcmd.origin(), vcmd.size(), vcmd.rowPitch(), vcmd.slicePitch(), - vcmd.isEntireMemory()); - } - break; + if (hostMemory != NULL) { + // Accelerated buffer to image transfer without pinning + amd::Coord3D srcOrigin(offset); + result = + blitMgr().copyBufferToImage(*hostMemory, *memory, srcOrigin, vcmd.origin(), vcmd.size(), + vcmd.isEntireMemory(), vcmd.rowPitch(), vcmd.slicePitch()); + } else { + result = blitMgr().writeImage(vcmd.source(), *memory, vcmd.origin(), vcmd.size(), + vcmd.rowPitch(), vcmd.slicePitch(), vcmd.isEntireMemory()); + } + break; default: - LogError("Unsupported type for the write command"); - break; - } + LogError("Unsupported type for the write command"); + break; + } - if 
(!result) { - LogError("submitWriteMemory failed!"); - vcmd.setStatus(CL_INVALID_OPERATION); - } - else { - // Mark this as the most-recently written cache of the destination - vcmd.destination().signalWrite(&gpuDevice_); - } - profilingEnd(vcmd); + if (!result) { + LogError("submitWriteMemory failed!"); + vcmd.setStatus(CL_INVALID_OPERATION); + } else { + // Mark this as the most-recently written cache of the destination + vcmd.destination().signalWrite(&gpuDevice_); + } + profilingEnd(vcmd); } -bool -VirtualGPU::copyMemory(cl_command_type type - , amd::Memory& srcMem - , amd::Memory& dstMem - , bool entire - , const amd::Coord3D& srcOrigin - , const amd::Coord3D& dstOrigin - , const amd::Coord3D& size - , const amd::BufferRect& srcRect - , const amd::BufferRect& dstRect - ) -{ - // Translate memory references and ensure cache up-to-date - gpu::Memory* dstMemory = dev().getGpuMemory(&dstMem); - gpu::Memory* srcMemory = dev().getGpuMemory(&srcMem); +bool VirtualGPU::copyMemory(cl_command_type type, amd::Memory& srcMem, amd::Memory& dstMem, + bool entire, const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, const amd::Coord3D& size, + const amd::BufferRect& srcRect, const amd::BufferRect& dstRect) { + // Translate memory references and ensure cache up-to-date + gpu::Memory* dstMemory = dev().getGpuMemory(&dstMem); + gpu::Memory* srcMemory = dev().getGpuMemory(&srcMem); - // Synchronize source and destination memory - device::Memory::SyncFlags syncFlags; - syncFlags.skipEntire_ = entire; - dstMemory->syncCacheFromHost(*this, syncFlags); - srcMemory->syncCacheFromHost(*this); + // Synchronize source and destination memory + device::Memory::SyncFlags syncFlags; + syncFlags.skipEntire_ = entire; + dstMemory->syncCacheFromHost(*this, syncFlags); + srcMemory->syncCacheFromHost(*this); - amd::Memory* bufferFromImageSrc = NULL; - amd::Memory* bufferFromImageDst = NULL; + amd::Memory* bufferFromImageSrc = NULL; + amd::Memory* bufferFromImageDst = NULL; - // Force 
buffer read for IMAGE1D_BUFFER - if ((srcMem.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { - bufferFromImageSrc = createBufferFromImage(srcMem); - if (NULL == bufferFromImageSrc) { - LogError("We should not fail buffer creation from image_buffer!"); - } - else { - type = CL_COMMAND_COPY_BUFFER; - bufferFromImageSrc->setVirtualDevice(this); - srcMemory = dev().getGpuMemory(bufferFromImageSrc); - } + // Force buffer read for IMAGE1D_BUFFER + if ((srcMem.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { + bufferFromImageSrc = createBufferFromImage(srcMem); + if (NULL == bufferFromImageSrc) { + LogError("We should not fail buffer creation from image_buffer!"); + } else { + type = CL_COMMAND_COPY_BUFFER; + bufferFromImageSrc->setVirtualDevice(this); + srcMemory = dev().getGpuMemory(bufferFromImageSrc); } - // Force buffer write for IMAGE1D_BUFFER - if ((dstMem.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { - bufferFromImageDst = createBufferFromImage(dstMem); - if (NULL == bufferFromImageDst) { - LogError("We should not fail buffer creation from image_buffer!"); - } - else { - type = CL_COMMAND_COPY_BUFFER; - bufferFromImageDst->setVirtualDevice(this); - dstMemory = dev().getGpuMemory(bufferFromImageDst); - } + } + // Force buffer write for IMAGE1D_BUFFER + if ((dstMem.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { + bufferFromImageDst = createBufferFromImage(dstMem); + if (NULL == bufferFromImageDst) { + LogError("We should not fail buffer creation from image_buffer!"); + } else { + type = CL_COMMAND_COPY_BUFFER; + bufferFromImageDst->setVirtualDevice(this); + dstMemory = dev().getGpuMemory(bufferFromImageDst); } + } - bool result = false; + bool result = false; - // Check if HW can be used for memory copy - switch (type) { + // Check if HW can be used for memory copy + switch (type) { case CL_COMMAND_SVM_MEMCPY: case CL_COMMAND_COPY_BUFFER: { - amd::Coord3D realSrcOrigin(srcOrigin[0]); - amd::Coord3D realDstOrigin(dstOrigin[0]); - amd::Coord3D 
realSize(size.c[0],size.c[1],size.c[2]); + amd::Coord3D realSrcOrigin(srcOrigin[0]); + amd::Coord3D realDstOrigin(dstOrigin[0]); + amd::Coord3D realSize(size.c[0], size.c[1], size.c[2]); - if (NULL != bufferFromImageSrc) { - size_t elemSize = - srcMem.asImage()->getImageFormat().getElementSize(); - realSrcOrigin.c[0] *= elemSize; - if (NULL != bufferFromImageDst) { - realDstOrigin.c[0] *= elemSize; - } - realSize.c[0] *= elemSize; - } - else if (NULL != bufferFromImageDst) { - size_t elemSize = - dstMem.asImage()->getImageFormat().getElementSize(); - realDstOrigin.c[0] *= elemSize; - realSize.c[0] *= elemSize; - } - - result = blitMgr().copyBuffer(*srcMemory, *dstMemory, - realSrcOrigin, realDstOrigin, realSize, entire); - - if (NULL != bufferFromImageSrc) { - bufferFromImageSrc->release(); - } + if (NULL != bufferFromImageSrc) { + size_t elemSize = srcMem.asImage()->getImageFormat().getElementSize(); + realSrcOrigin.c[0] *= elemSize; if (NULL != bufferFromImageDst) { - bufferFromImageDst->release(); + realDstOrigin.c[0] *= elemSize; } - } - break; + realSize.c[0] *= elemSize; + } else if (NULL != bufferFromImageDst) { + size_t elemSize = dstMem.asImage()->getImageFormat().getElementSize(); + realDstOrigin.c[0] *= elemSize; + realSize.c[0] *= elemSize; + } + + result = blitMgr().copyBuffer(*srcMemory, *dstMemory, realSrcOrigin, realDstOrigin, realSize, + entire); + + if (NULL != bufferFromImageSrc) { + bufferFromImageSrc->release(); + } + if (NULL != bufferFromImageDst) { + bufferFromImageDst->release(); + } + } break; case CL_COMMAND_COPY_BUFFER_RECT: - result = blitMgr().copyBufferRect(*srcMemory, *dstMemory, - srcRect, dstRect, size, entire); - break; + result = blitMgr().copyBufferRect(*srcMemory, *dstMemory, srcRect, dstRect, size, entire); + break; case CL_COMMAND_COPY_IMAGE_TO_BUFFER: - result = blitMgr().copyImageToBuffer(*srcMemory, *dstMemory, - srcOrigin, dstOrigin, size, entire); - break; + result = + blitMgr().copyImageToBuffer(*srcMemory, *dstMemory, 
srcOrigin, dstOrigin, size, entire); + break; case CL_COMMAND_COPY_BUFFER_TO_IMAGE: - result = blitMgr().copyBufferToImage(*srcMemory, *dstMemory, - srcOrigin, dstOrigin, size, entire); - break; + result = + blitMgr().copyBufferToImage(*srcMemory, *dstMemory, srcOrigin, dstOrigin, size, entire); + break; case CL_COMMAND_COPY_IMAGE: - result = blitMgr().copyImage(*srcMemory, *dstMemory, - srcOrigin, dstOrigin, size, entire); - break; + result = blitMgr().copyImage(*srcMemory, *dstMemory, srcOrigin, dstOrigin, size, entire); + break; default: - LogError("Unsupported command type for memory copy!"); - break; - } + LogError("Unsupported command type for memory copy!"); + break; + } - if (!result) { - LogError("submitCopyMemory failed!"); - return false; - } - else { - // Mark this as the most-recently written cache of the destination - dstMem.signalWrite(&gpuDevice_); - } - return true; + if (!result) { + LogError("submitCopyMemory failed!"); + return false; + } else { + // Mark this as the most-recently written cache of the destination + dstMem.signalWrite(&gpuDevice_); + } + return true; } -void -VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); +void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); - profilingBegin(vcmd); + profilingBegin(vcmd); - cl_command_type type = vcmd.type(); - bool entire = vcmd.isEntireMemory(); + cl_command_type type = vcmd.type(); + bool entire = vcmd.isEntireMemory(); - if (!copyMemory(type, vcmd.source(), vcmd.destination(), entire, - vcmd.srcOrigin(), vcmd.dstOrigin(), vcmd.size(), vcmd.srcRect(), - vcmd.dstRect())) { - vcmd.setStatus(CL_INVALID_OPERATION); - } + if (!copyMemory(type, vcmd.source(), vcmd.destination(), entire, vcmd.srcOrigin(), + vcmd.dstOrigin(), vcmd.size(), vcmd.srcRect(), 
vcmd.dstRect())) { + vcmd.setStatus(CL_INVALID_OPERATION); + } - profilingEnd(vcmd); + profilingEnd(vcmd); } -void -VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - profilingBegin(vcmd); +void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + profilingBegin(vcmd); - cl_command_type type = vcmd.type(); - //no op for FGS supported device - if (!dev().isFineGrainedSystem()) { - amd::Coord3D srcOrigin(0, 0, 0); - amd::Coord3D dstOrigin(0, 0, 0); - amd::Coord3D size(vcmd.srcSize(), 1, 1); - amd::BufferRect srcRect; - amd::BufferRect dstRect; - - bool result = false; - amd::Memory* srcMem = amd::SvmManager::FindSvmBuffer(vcmd.src()); - amd::Memory* dstMem = amd::SvmManager::FindSvmBuffer(vcmd.dst()); - - device::Memory::SyncFlags syncFlags; - if (nullptr != srcMem) { - srcMem->commitSvmMemory(); - srcOrigin.c[0] = static_cast(vcmd.src()) - static_cast
(srcMem->getSvmPtr()); - if (!(srcMem->validateRegion(srcOrigin, size))) { - vcmd.setStatus(CL_INVALID_OPERATION); - return; - } - } - if (nullptr != dstMem) { - dstMem->commitSvmMemory(); - dstOrigin.c[0] = static_cast(vcmd.dst()) - static_cast
(dstMem->getSvmPtr()); - if (!(dstMem->validateRegion(dstOrigin, size))) { - vcmd.setStatus(CL_INVALID_OPERATION); - return; - } - } - - if (nullptr == srcMem && nullptr != dstMem) { //src not in svm space - Memory* memory = dev().getGpuMemory(dstMem); - // Synchronize source and destination memory - syncFlags.skipEntire_ = dstMem->isEntirelyCovered(dstOrigin, size); - memory->syncCacheFromHost(*this, syncFlags); - - result = blitMgr().writeBuffer(vcmd.src(), *memory, - dstOrigin, size, dstMem->isEntirelyCovered(dstOrigin, size)); - // Mark this as the most-recently written cache of the destination - dstMem->signalWrite(&gpuDevice_); - } - else if (nullptr != srcMem && nullptr == dstMem) { //dst not in svm space - Memory* memory = dev().getGpuMemory(srcMem); - // Synchronize source and destination memory - memory->syncCacheFromHost(*this); - - result = blitMgr().readBuffer(*memory, vcmd.dst(), - srcOrigin, size, srcMem->isEntirelyCovered(srcOrigin, size)); - } - else if (nullptr != srcMem && nullptr != dstMem) { //both not in svm space - bool entire = srcMem->isEntirelyCovered(srcOrigin, size) && - dstMem->isEntirelyCovered(dstOrigin, size); - result = copyMemory(type, *srcMem, *dstMem, entire, srcOrigin, dstOrigin, - size, srcRect, dstRect); - } - - if (!result) { - vcmd.setStatus(CL_INVALID_OPERATION); - } - } - else { - //direct memcpy for FGS enabled system - amd::SvmBuffer::memFill(vcmd.dst(), vcmd.src(), vcmd.srcSize(), 1); - } - profilingEnd(vcmd); -} - -void -VirtualGPU::submitMapMemory(amd::MapMemoryCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - - profilingBegin(vcmd, true); - - gpu::Memory* memory = dev().getGpuMemory(&vcmd.memory()); - - // Save map info for unmap operation - memory->saveMapInfo(vcmd.mapPtr(), vcmd.origin(), vcmd.size(), - vcmd.mapFlags(), vcmd.isEntireMemory()); - - // If we have host memory, use it - if ((memory->owner()->getHostMem() != NULL) && 
memory->isDirectMap()) { - if (!memory->isHostMemDirectAccess()) { - // Make sure GPU finished operation before - // synchronization with the backing store - memory->wait(*this); - } - - // Target is the backing store, so just ensure that owner is up-to-date - memory->owner()->cacheWriteBack(); - - // Add memory to VA cache, so rutnime can detect direct access to VA - dev().addVACache(memory); - } - else if (memory->isPersistentDirectMap()) { - // Nothing to do here - } - else if (memory->mapMemory() != NULL) { - // Target is a remote resource, so copy - assert(memory->mapMemory() != NULL); - if (vcmd.mapFlags() & (CL_MAP_READ | CL_MAP_WRITE)) { - amd::Coord3D dstOrigin(0, 0, 0); - if (memory->cal()->buffer_) { - if (!blitMgr().copyBuffer(*memory, - *memory->mapMemory(), vcmd.origin(), vcmd.origin(), - vcmd.size(), vcmd.isEntireMemory())) { - LogError("submitMapMemory() - copy failed"); - vcmd.setStatus(CL_MAP_FAILURE); - } - } - else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { - amd::Memory* bufferFromImage = NULL; - Memory* memoryBuf = memory; - amd::Coord3D origin(vcmd.origin()[0]); - amd::Coord3D size(vcmd.size()[0]); - size_t elemSize = - vcmd.memory().asImage()->getImageFormat().getElementSize(); - origin.c[0] *= elemSize; - size.c[0] *= elemSize; - - bufferFromImage = createBufferFromImage(vcmd.memory()); - if (NULL == bufferFromImage) { - LogError("We should not fail buffer creation from image_buffer!"); - } - else { - bufferFromImage->setVirtualDevice(this); - memoryBuf = dev().getGpuMemory(bufferFromImage); - } - if (!blitMgr().copyBuffer(*memoryBuf, - *memory->mapMemory(), origin, dstOrigin, - size, vcmd.isEntireMemory())) { - LogError("submitMapMemory() - copy failed"); - vcmd.setStatus(CL_MAP_FAILURE); - } - if (NULL != bufferFromImage) { - bufferFromImage->release(); - } - } - else { - // Validate if it's a view for a map of mip level - if (vcmd.memory().parent() != NULL) { - amd::Image* amdImage = 
vcmd.memory().parent()->asImage(); - if ((amdImage != NULL) && (amdImage->getMipLevels() > 1)) { - // Save map write info in the parent object - dev().getGpuMemory(amdImage)->saveMapInfo(vcmd.mapPtr(), - vcmd.origin(), vcmd.size(), - vcmd.mapFlags(), vcmd.isEntireMemory(), - vcmd.memory().asImage()); - } - } - if (!blitMgr().copyImageToBuffer(*memory, - *memory->mapMemory(), vcmd.origin(), dstOrigin, - vcmd.size(), vcmd.isEntireMemory())) { - LogError("submitMapMemory() - copy failed"); - vcmd.setStatus(CL_MAP_FAILURE); - } - } - } - } - else { - LogError("Unhandled map!"); - } - - profilingEnd(vcmd); -} - -void -VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - gpu::Memory* memory = dev().getGpuMemory(&vcmd.memory()); - amd::Memory* owner = memory->owner(); - bool unmapMip = false; - const device::Memory::WriteMapInfo* writeMapInfo = - memory->writeMapInfo(vcmd.mapPtr()); - if (nullptr == writeMapInfo) { - LogError("Unmap without map call"); - return; - } - profilingBegin(vcmd, true); - - // Check if image is a mipmap and assign a saved view - amd::Image* amdImage = owner->asImage(); - if ((amdImage != NULL) && (amdImage->getMipLevels() > 1) && - (writeMapInfo->baseMip_ != NULL)) { - // Assign mip level view - amdImage = writeMapInfo->baseMip_; - // Clear unmap flags from the parent image - memory->clearUnmapInfo(vcmd.mapPtr()); - memory = dev().getGpuMemory(amdImage); - unmapMip = true; - writeMapInfo = memory->writeMapInfo(vcmd.mapPtr()); - } - - // We used host memory - if ((owner->getHostMem() != NULL) && memory->isDirectMap()) { - if (writeMapInfo->isUnmapWrite()) { - // Target is the backing store, so sync - owner->signalWrite(NULL); - memory->syncCacheFromHost(*this); - } - // Remove memory from VA cache - dev().removeVACache(memory); - } - // data check was added for persistent memory that failed to get aperture - // and therefore are 
treated like a remote resource - else if (memory->isPersistentDirectMap() && (memory->data() != NULL)) { - memory->unmap(this); - } - else if (memory->mapMemory() != NULL) { - if (writeMapInfo->isUnmapWrite()) { - amd::Coord3D srcOrigin(0, 0, 0); - // Target is a remote resource, so copy - assert(memory->mapMemory() != NULL); - if (memory->cal()->buffer_) { - if (!blitMgr().copyBuffer( - *memory->mapMemory(), *memory, - writeMapInfo->origin_, - writeMapInfo->origin_, - writeMapInfo->region_, - writeMapInfo->isEntire())) { - LogError("submitUnmapMemory() - copy failed"); - vcmd.setStatus(CL_OUT_OF_RESOURCES); - } - } - else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { - amd::Memory* bufferFromImage = NULL; - Memory* memoryBuf = memory; - amd::Coord3D origin(writeMapInfo->origin_[0]); - amd::Coord3D size(writeMapInfo->region_[0]); - size_t elemSize = - vcmd.memory().asImage()->getImageFormat().getElementSize(); - origin.c[0] *= elemSize; - size.c[0] *= elemSize; - - bufferFromImage = createBufferFromImage(vcmd.memory()); - if (NULL == bufferFromImage) { - LogError("We should not fail buffer creation from image_buffer!"); - } - else { - bufferFromImage->setVirtualDevice(this); - memoryBuf = dev().getGpuMemory(bufferFromImage); - } - if (!blitMgr().copyBuffer( - *memory->mapMemory(), *memoryBuf, - srcOrigin, origin, size, - writeMapInfo->isEntire())) { - LogError("submitUnmapMemory() - copy failed"); - vcmd.setStatus(CL_OUT_OF_RESOURCES); - } - if (NULL != bufferFromImage) { - bufferFromImage->release(); - } - } - else { - if (!blitMgr().copyBufferToImage( - *memory->mapMemory(), *memory, - srcOrigin, - writeMapInfo->origin_, - writeMapInfo->region_, - writeMapInfo->isEntire())) { - LogError("submitUnmapMemory() - copy failed"); - vcmd.setStatus(CL_OUT_OF_RESOURCES); - } - } - } - } - else { - LogError("Unhandled unmap!"); - vcmd.setStatus(CL_INVALID_VALUE); - } - - // Clear unmap flags - memory->clearUnmapInfo(vcmd.mapPtr()); - - // Release a view 
for a mipmap map - if (unmapMip) { - amdImage->release(); - } - profilingEnd(vcmd); -} - -bool -VirtualGPU::fillMemory(cl_command_type type, amd::Memory* amdMemory, const void* pattern, - size_t patternSize, const amd::Coord3D& origin, const amd::Coord3D& size) -{ - gpu::Memory* memory = dev().getGpuMemory(amdMemory); - bool entire = amdMemory->isEntirelyCovered(origin, size); - - // Synchronize memory from host if necessary - device::Memory::SyncFlags syncFlags; - syncFlags.skipEntire_ = entire; - memory->syncCacheFromHost(*this, syncFlags); + cl_command_type type = vcmd.type(); + // no op for FGS supported device + if (!dev().isFineGrainedSystem()) { + amd::Coord3D srcOrigin(0, 0, 0); + amd::Coord3D dstOrigin(0, 0, 0); + amd::Coord3D size(vcmd.srcSize(), 1, 1); + amd::BufferRect srcRect; + amd::BufferRect dstRect; bool result = false; - amd::Memory* bufferFromImage = NULL; - float fillValue[4]; + amd::Memory* srcMem = amd::SvmManager::FindSvmBuffer(vcmd.src()); + amd::Memory* dstMem = amd::SvmManager::FindSvmBuffer(vcmd.dst()); - // Force fill buffer for IMAGE1D_BUFFER - if ((type == CL_COMMAND_FILL_IMAGE) && - (amdMemory->getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { - bufferFromImage = createBufferFromImage(*amdMemory); + device::Memory::SyncFlags syncFlags; + if (nullptr != srcMem) { + srcMem->commitSvmMemory(); + srcOrigin.c[0] = + static_cast(vcmd.src()) - static_cast
(srcMem->getSvmPtr()); + if (!(srcMem->validateRegion(srcOrigin, size))) { + vcmd.setStatus(CL_INVALID_OPERATION); + return; + } + } + if (nullptr != dstMem) { + dstMem->commitSvmMemory(); + dstOrigin.c[0] = + static_cast(vcmd.dst()) - static_cast
(dstMem->getSvmPtr()); + if (!(dstMem->validateRegion(dstOrigin, size))) { + vcmd.setStatus(CL_INVALID_OPERATION); + return; + } + } + + if (nullptr == srcMem && nullptr != dstMem) { // src not in svm space + Memory* memory = dev().getGpuMemory(dstMem); + // Synchronize source and destination memory + syncFlags.skipEntire_ = dstMem->isEntirelyCovered(dstOrigin, size); + memory->syncCacheFromHost(*this, syncFlags); + + result = blitMgr().writeBuffer(vcmd.src(), *memory, dstOrigin, size, + dstMem->isEntirelyCovered(dstOrigin, size)); + // Mark this as the most-recently written cache of the destination + dstMem->signalWrite(&gpuDevice_); + } else if (nullptr != srcMem && nullptr == dstMem) { // dst not in svm space + Memory* memory = dev().getGpuMemory(srcMem); + // Synchronize source and destination memory + memory->syncCacheFromHost(*this); + + result = blitMgr().readBuffer(*memory, vcmd.dst(), srcOrigin, size, + srcMem->isEntirelyCovered(srcOrigin, size)); + } else if (nullptr != srcMem && nullptr != dstMem) { // both not in svm space + bool entire = + srcMem->isEntirelyCovered(srcOrigin, size) && dstMem->isEntirelyCovered(dstOrigin, size); + result = + copyMemory(type, *srcMem, *dstMem, entire, srcOrigin, dstOrigin, size, srcRect, dstRect); + } + + if (!result) { + vcmd.setStatus(CL_INVALID_OPERATION); + } + } else { + // direct memcpy for FGS enabled system + amd::SvmBuffer::memFill(vcmd.dst(), vcmd.src(), vcmd.srcSize(), 1); + } + profilingEnd(vcmd); +} + +void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + profilingBegin(vcmd, true); + + gpu::Memory* memory = dev().getGpuMemory(&vcmd.memory()); + + // Save map info for unmap operation + memory->saveMapInfo(vcmd.mapPtr(), vcmd.origin(), vcmd.size(), vcmd.mapFlags(), + vcmd.isEntireMemory()); + + // If we have host memory, use it + if ((memory->owner()->getHostMem() != NULL) && 
memory->isDirectMap()) { + if (!memory->isHostMemDirectAccess()) { + // Make sure GPU finished operation before + // synchronization with the backing store + memory->wait(*this); + } + + // Target is the backing store, so just ensure that owner is up-to-date + memory->owner()->cacheWriteBack(); + + // Add memory to VA cache, so rutnime can detect direct access to VA + dev().addVACache(memory); + } else if (memory->isPersistentDirectMap()) { + // Nothing to do here + } else if (memory->mapMemory() != NULL) { + // Target is a remote resource, so copy + assert(memory->mapMemory() != NULL); + if (vcmd.mapFlags() & (CL_MAP_READ | CL_MAP_WRITE)) { + amd::Coord3D dstOrigin(0, 0, 0); + if (memory->cal()->buffer_) { + if (!blitMgr().copyBuffer(*memory, *memory->mapMemory(), vcmd.origin(), vcmd.origin(), + vcmd.size(), vcmd.isEntireMemory())) { + LogError("submitMapMemory() - copy failed"); + vcmd.setStatus(CL_MAP_FAILURE); + } + } else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { + amd::Memory* bufferFromImage = NULL; + Memory* memoryBuf = memory; + amd::Coord3D origin(vcmd.origin()[0]); + amd::Coord3D size(vcmd.size()[0]); + size_t elemSize = vcmd.memory().asImage()->getImageFormat().getElementSize(); + origin.c[0] *= elemSize; + size.c[0] *= elemSize; + + bufferFromImage = createBufferFromImage(vcmd.memory()); if (NULL == bufferFromImage) { - LogError("We should not fail buffer creation from image_buffer!"); + LogError("We should not fail buffer creation from image_buffer!"); + } else { + bufferFromImage->setVirtualDevice(this); + memoryBuf = dev().getGpuMemory(bufferFromImage); } - else { - type = CL_COMMAND_FILL_BUFFER; - bufferFromImage->setVirtualDevice(this); - memory = dev().getGpuMemory(bufferFromImage); + if (!blitMgr().copyBuffer(*memoryBuf, *memory->mapMemory(), origin, dstOrigin, size, + vcmd.isEntireMemory())) { + LogError("submitMapMemory() - copy failed"); + vcmd.setStatus(CL_MAP_FAILURE); } - } - - // Find the the right fill operation - 
switch (type) { - case CL_COMMAND_FILL_BUFFER : - case CL_COMMAND_SVM_MEMFILL : { - amd::Coord3D realOrigin(origin[0]); - amd::Coord3D realSize(size[0]); - // Reprogram fill parameters if it's an IMAGE1D_BUFFER object if (NULL != bufferFromImage) { - size_t elemSize = - amdMemory->asImage()->getImageFormat().getElementSize(); - realOrigin.c[0] *= elemSize; - realSize.c[0] *= elemSize; - memset(fillValue, 0, sizeof(fillValue)); - amdMemory->asImage()->getImageFormat().formatColor(pattern, fillValue); - pattern = fillValue; - patternSize = elemSize; + bufferFromImage->release(); } - result = blitMgr().fillBuffer(*memory, pattern, - patternSize, realOrigin, realSize, amdMemory->isEntirelyCovered(origin, size)); - if (NULL != bufferFromImage) { - bufferFromImage->release(); + } else { + // Validate if it's a view for a map of mip level + if (vcmd.memory().parent() != NULL) { + amd::Image* amdImage = vcmd.memory().parent()->asImage(); + if ((amdImage != NULL) && (amdImage->getMipLevels() > 1)) { + // Save map write info in the parent object + dev().getGpuMemory(amdImage)->saveMapInfo(vcmd.mapPtr(), vcmd.origin(), vcmd.size(), + vcmd.mapFlags(), vcmd.isEntireMemory(), + vcmd.memory().asImage()); + } } + if (!blitMgr().copyImageToBuffer(*memory, *memory->mapMemory(), vcmd.origin(), dstOrigin, + vcmd.size(), vcmd.isEntireMemory())) { + LogError("submitMapMemory() - copy failed"); + vcmd.setStatus(CL_MAP_FAILURE); + } + } } - break; - case CL_COMMAND_FILL_IMAGE: - result = blitMgr().fillImage(*memory, pattern, - origin, size, amdMemory->isEntirelyCovered(origin, size)); - break; - default: - LogError("Unsupported command type for FillMemory!"); - break; - } + } else { + LogError("Unhandled map!"); + } - if (!result) { - LogError("fillMemory failed!"); - return false; - } - - // Mark this as the most-recently written cache of the destination - amdMemory->signalWrite(&gpuDevice_); - return true; + profilingEnd(vcmd); } -void 
-VirtualGPU::submitFillMemory(amd::FillMemoryCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - - profilingBegin(vcmd, true); - - if (!fillMemory(vcmd.type(), &vcmd.memory(),vcmd.pattern(), - vcmd.patternSize(), vcmd.origin(), vcmd.size())) { - vcmd.setStatus(CL_INVALID_OPERATION); - } - - profilingEnd(vcmd); -} - -void -VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - - profilingBegin(vcmd, true); - - //no op for FGS supported device - if (!dev().isFineGrainedSystem()) { - // Make sure we have memory for the command execution - gpu::Memory* memory = dev().getGpuMemory(vcmd.getSvmMem()); - memory->saveMapInfo(vcmd.svmPtr(), vcmd.origin(), vcmd.size(), - vcmd.mapFlags(), vcmd.isEntireMemory()); - - if (memory->mapMemory() != NULL) { - if (vcmd.mapFlags() & (CL_MAP_READ | CL_MAP_WRITE)) { - assert(memory->cal()->buffer_ && "SVM memory can't be an image"); - if (!blitMgr().copyBuffer(*memory, *memory->mapMemory(), - vcmd.origin(), vcmd.origin(), vcmd.size(), - vcmd.isEntireMemory())) { - LogError("submitSVMMapMemory() - copy failed"); - vcmd.setStatus(CL_MAP_FAILURE); - } - } - } - else if ((memory->owner()->getHostMem() != nullptr) && memory->isDirectMap()) { - if (!memory->isHostMemDirectAccess()) { - // Make sure GPU finished operation before - // synchronization with the backing store - memory->wait(*this); - } - - // Target is the backing store, so just ensure that owner is up-to-date - memory->owner()->cacheWriteBack(); - } - else { - LogError("Unhandled svm map!"); - } - } - - profilingEnd(vcmd); -} - -void -VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - profilingBegin(vcmd, true); - - //no op for FGS supported device - if 
(!dev().isFineGrainedSystem()) { - gpu::Memory* memory = dev().getGpuMemory(vcmd.getSvmMem()); - const device::Memory::WriteMapInfo* writeMapInfo = - memory->writeMapInfo(vcmd.svmPtr()); - - if (memory->mapMemory() != NULL) { - if (writeMapInfo->isUnmapWrite()) { - // Target is a remote resource, so copy - assert(memory->cal()->buffer_ && "SVM memory can't be an image"); - if (!blitMgr().copyBuffer(*memory->mapMemory(), *memory, - writeMapInfo->origin_, writeMapInfo->origin_, - writeMapInfo->region_, writeMapInfo->isEntire())) { - LogError("submitSvmUnmapMemory() - copy failed"); - vcmd.setStatus(CL_OUT_OF_RESOURCES); - } - } - } - else if ((memory->owner()->getHostMem() != nullptr) && memory->isDirectMap()) { - if (writeMapInfo->isUnmapWrite()) { - // Target is the backing store, so sync - memory->owner()->signalWrite(nullptr); - memory->syncCacheFromHost(*this); - } - } - memory->clearUnmapInfo(vcmd.svmPtr()); - } - - profilingEnd(vcmd); -} - -void -VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - - profilingBegin(vcmd, true); - - if (!dev().isFineGrainedSystem()) { - size_t patternSize = vcmd.patternSize(); - size_t fillSize = patternSize * vcmd.times(); - size_t offset = 0; - amd::Memory* dstMemory = amd::SvmManager::FindSvmBuffer(vcmd.dst()); - assert(dstMemory&&"No svm Buffer to fill with!"); - offset = reinterpret_cast(vcmd.dst()) - - reinterpret_cast(dstMemory->getSvmPtr()); - assert((offset >= 0) && "wrong svm ptr to fill with!"); - - gpu::Memory* memory = dev().getGpuMemory(dstMemory); - - amd::Coord3D origin(offset, 0, 0); - amd::Coord3D size(fillSize, 1, 1); - assert((dstMemory->validateRegion(origin, size)) && "The incorrect fill size!"); - - if (!fillMemory(vcmd.type(), dstMemory, vcmd.pattern(), - vcmd.patternSize(), origin, size)) { - vcmd.setStatus(CL_INVALID_OPERATION); - } - // Mark this as the most-recently written cache 
of the destination - dstMemory->signalWrite(&gpuDevice_); - } - else { - // for FGS capable device, fill CPU memory directly - amd::SvmBuffer::memFill(vcmd.dst(), vcmd.pattern(), vcmd.patternSize(), vcmd.times()); - } - - profilingEnd(vcmd); -} - -void -VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - - profilingBegin(vcmd, true); - - std::vector::const_iterator itr; - for (itr = vcmd.memObjects().begin(); itr != vcmd.memObjects().end(); itr++) { - // Find device memory - gpu::Memory* memory = dev().getGpuMemory(*itr); - - if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_HOST) { - memory->mgpuCacheWriteBack(); - } - else if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) { - // Synchronize memory from host if necessary. - // The sync function will perform memory migration from - // another device if necessary - device::Memory::SyncFlags syncFlags; - memory->syncCacheFromHost(*this, syncFlags); - } - else { - LogWarning("Unknown operation for memory migration!"); - } - } - - profilingEnd(vcmd); -} - -void -VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& vcmd) -{ - // in-order semantics: previous commands need to be done before we start - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - - profilingBegin(vcmd); - std::vector& svmPointers = vcmd.svmPointers(); - if (vcmd.pfnFreeFunc() == NULL) { - // pointers allocated using clSVMAlloc - for (cl_uint i = 0; i < svmPointers.size(); i++) { - dev().svmFree(svmPointers[i]); - } - } - else { - vcmd.pfnFreeFunc()(as_cl(vcmd.queue()->asCommandQueue()), svmPointers.size(), - static_cast(&(svmPointers[0])), vcmd.userData()); - } - profilingEnd(vcmd); -} - -void -VirtualGPU::findIterations( - const amd::NDRangeContainer& sizes, - const amd::NDRange& local, - amd::NDRange& groups, - amd::NDRange& remainder, - 
size_t& extra) -{ - size_t dimensions = sizes.dimensions(); - - if (cal()->iterations_ > 1) { - size_t iterations = cal()->iterations_; - cal_.iterations_ = 1; - - // Find the total amount of all groups - groups = sizes.global() / local; - if (dev().settings().partialDispatch_) { - for (uint j = 0; j < dimensions; ++j) { - if ((sizes.global()[j] % local[j]) != 0) { - groups[j]++; - } - } - } - - // Calculate the real number of required iterations and - // the workgroup size of each iteration - for (int j = (dimensions - 1); j >= 0; --j) { - // Find possible size of each iteration - size_t tmp = (groups[j] / iterations); - // Make sure the group size is more than 1 - if (tmp > 0) { - remainder = groups; - remainder[j] = (groups[j] % tmp); - - extra = ((groups[j] / tmp) + - // Check for the remainder - ((remainder[j] != 0) ? 1 : 0)); - // Recalculate the number of iterations - cal_.iterations_ *= extra; - if (remainder[j] == 0) { - extra = 0; - } - groups[j] = tmp; - break; - } - else { - iterations = ((iterations / groups[j]) + - (((iterations % groups[j]) != 0) ? 
1 : 0)); - cal_.iterations_ *= groups[j]; - groups[j] = 1; - } - } - } -} - -void -VirtualGPU::setupIteration( - uint iteration, - const amd::NDRangeContainer& sizes, - Kernel& gpuKernel, - amd::NDRange& global, - amd::NDRange& offsets, - amd::NDRange& local, - amd::NDRange& groups, - amd::NDRange& groupOffset, - amd::NDRange& divider, - amd::NDRange& remainder, - size_t extra) -{ - size_t dimensions = sizes.dimensions(); - - // Calculate the workload size for the remainder - if ((extra != 0) && ((iteration % extra) == 0)) { - groups = remainder; - } - else { - groups = divider; - } - global = groups * local; - - if (dev().settings().partialDispatch_) { - for (uint j = 0; j < dimensions; ++j) { - size_t offset = groupOffset[j] * local[j]; - if ((offset + global[j]) > sizes.global()[j]) { - global[j] = sizes.global()[j] - offset; - } - } - } - - // Reprogram the kernel parameters for the GPU execution - gpuKernel.setupProgramGrid(*this, dimensions, - offsets, global, local, groupOffset, - sizes.offset(), sizes.global()); - - // Update the constant buffers - gpuKernel.bindConstantBuffers(*this); - - uint sub = 0; - // Find the offsets for the next execution - for (uint j = 0; j < dimensions; ++j) { - groupOffset[j] += groups[j]; - // Make sure the offset doesn't go over the size limit - if (sizes.global()[j] <= groupOffset[j] * local[j]) { - // Check if we counted a group in one dimension already - if (sub) { - groupOffset[j] -= groups[j]; - } - else { - groupOffset[j] = 0; - } - } - else { - groupOffset[j] -= sub; - // We already counted elements in one dimension - sub = 1; - } - - offsets[j] = groupOffset[j] * local[j] + - sizes.offset()[j]; - } -} - -void -VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - - profilingBegin(vcmd); - - // Submit kernel to HW - if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, - 
&vcmd.event())) { - vcmd.setStatus(CL_INVALID_OPERATION); - } - - profilingEnd(vcmd); -} - -bool -VirtualGPU::submitKernelInternalHSA( - const amd::NDRangeContainer& sizes, - const amd::Kernel& kernel, - const_address parameters, - bool nativeMem, - amd::Event* enqueueEvent) -{ - uint64_t vmParentWrap = 0; - uint64_t vmDefQueue = 0; - amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev()); - VirtualGPU* gpuDefQueue = NULL; - amd::HwDebugManager * dbgManager = dev().hwDebugMgr(); - - // Get the HSA kernel object - const HSAILKernel& hsaKernel = - static_cast(*(kernel.getDeviceKernel(dev()))); - std::vector memList; - - bool printfEnabled = (hsaKernel.printfInfo().size() > 0) ? true:false; - if (!printfDbgHSA().init(*this, printfEnabled )) { - LogError( "Printf debug buffer initialization failed!"); - return false; - } - - // Check memory dependency and SVM objects - if (!processMemObjectsHSA(kernel, parameters, nativeMem, &memList)) { - LogError("Wrong memory objects!"); - return false; - } - - cal_.memCount_ = 0; - - if (hsaKernel.dynamicParallelism()) { - if (NULL == defQueue) { - LogError("Default device queue wasn't allocated"); - return false; - } - else { - if (dev().settings().useDeviceQueue_) { - gpuDefQueue = static_cast(defQueue->vDev()); - if (gpuDefQueue->hwRing() == hwRing()) { - LogError("Can't submit the child kernels to the same HW ring as the host queue!"); - return false; - } - } - else { - createVirtualQueue(defQueue->size()); - gpuDefQueue = this; - } - } - vmDefQueue = gpuDefQueue->virtualQueue_->vmAddress(); - - // Add memory handles before the actual dispatch - memList.push_back(gpuDefQueue->virtualQueue_); - memList.push_back(gpuDefQueue->schedParams_); - memList.push_back(hsaKernel.prog().kernelTable()); - gpuDefQueue->writeVQueueHeader(*this, - hsaKernel.prog().kernelTable()->vmAddress()); - } - - // setup the storage for the memory pointers of the kernel parameters - uint numParams = 
kernel.signature().numParameters(); - if (dbgManager) { - dbgManager->allocParamMemList(numParams); - } - - bool needFlush = false; - dmaFlushMgmt_.findSplitSize(dev(), sizes.global().product(), hsaKernel.aqlCodeSize()); - if (dmaFlushMgmt().dispatchSplitSize() != 0) { - needFlush = true; - } - - size_t newOffset[3] = {0, 0, 0}; - size_t newGlobalSize[3] = {0, 0, 0}; - - int dim = -1; - int iteration = 1; - size_t globalStep = 0; - for (uint i = 0; i < sizes.dimensions(); i++) { - newGlobalSize[i] = sizes.global()[i]; - newOffset[i] = sizes.offset()[i]; - } - // Check if it is blit kernel. If it is, then check if split is needed. - if (hsaKernel.isInternalKernel()) { - // Calculate new group size for each submission - for (uint i = 0; i < sizes.dimensions(); i++) { - if (sizes.global()[i] > static_cast(0xffffffff)) { - dim = i; - iteration = sizes.global()[i] / 0xC0000000 - + ((sizes.global()[i] % 0xC0000000) ? 1: 0); - globalStep = (sizes.global()[i] / sizes.local()[i]) / iteration - * sizes.local()[dim]; - break; - } - } - } - - for (int j = 0; j < iteration; j++) { - // Reset global size for dimension dim if split is needed - if (dim != -1) { - newOffset[dim] = sizes.offset()[dim] + globalStep * j; - if (((newOffset[dim] + globalStep) < sizes.global()[dim]) && - (j != (iteration - 1))) { - newGlobalSize[dim] = globalStep; - } - else { - newGlobalSize[dim] = sizes.global()[dim] - newOffset[dim]; - } - } - - amd::NDRangeContainer tmpSizes(sizes.dimensions(), - &newOffset[0], &newGlobalSize[0], - &(const_cast(sizes).local()[0])); - - // Program the kernel arguments for the GPU execution - hsa_kernel_dispatch_packet_t* aqlPkt = - hsaKernel.loadArguments(*this, kernel, tmpSizes, parameters, nativeMem, - vmDefQueue, &vmParentWrap, memList); - if (NULL == aqlPkt) { - LogError("Couldn't load kernel arguments"); - return false; - } - - gslMemObject scratch = NULL; - uint scratchOffset = 0; - // Check if the device allocated more registers than the old setup - if 
(hsaKernel.workGroupInfo()->scratchRegs_ > 0) { - const Device::ScratchBuffer* scratchObj = dev().scratch(hwRing()); - scratch = scratchObj->memObj_->gslResource(); - memList.push_back(scratchObj->memObj_); - scratchOffset = scratchObj->offset_; - } - - // Add GSL handle to the memory list for VidMM - for (uint i = 0; i < memList.size(); ++i) { - addVmMemory(memList[i]); - } - - // HW Debug for the kernel? - HwDbgKernelInfo kernelInfo; - HwDbgKernelInfo *pKernelInfo = NULL; - - if (dbgManager) { - buildKernelInfo(hsaKernel, aqlPkt, kernelInfo, enqueueEvent); - pKernelInfo = &kernelInfo; - } - - // Set up the dispatch information - KernelDispatchInfo dispatchInfo; - dispatchInfo.aqlPacket = aqlPkt; - dispatchInfo.mems = vmMems(); - dispatchInfo.numMems = cal_.memCount_; - dispatchInfo.scratch = scratch; - dispatchInfo.scratchOffset = scratchOffset; - dispatchInfo.cpuAqlCode = hsaKernel.cpuAqlCode(); - dispatchInfo.hsaQueueVA = hsaQueueMem_->vmAddress(); - dispatchInfo.kernelInfo = pKernelInfo; - dispatchInfo.wavesPerSH = hsaKernel.getWavesPerSH(this); - dispatchInfo.lastDoppSubmission = kernel.parameters().getExecNewVcop(); - dispatchInfo.pfpaDoppSubmission = kernel.parameters().getExecPfpaVcop(); - - GpuEvent gpuEvent; - // Run AQL dispatch in HW - eventBegin(MainEngine); - cs()->AqlDispatch(&dispatchInfo); - eventEnd(MainEngine, gpuEvent); - - if (dbgManager && (NULL != dbgManager->postDispatchCallBackFunc())) { - dbgManager->executePostDispatchCallBack(); - } - - if (hsaKernel.dynamicParallelism()) { - // Make sure exculsive access to the device queue - amd::ScopedLock(defQueue->lock()); - - if (GPU_PRINT_CHILD_KERNEL != 0) { - waitForEvent(&gpuEvent); - - AmdAqlWrap* wraps = (AmdAqlWrap*)(&((AmdVQueueHeader*)gpuDefQueue->virtualQueue_->data())[1]); - uint p = 0; - for (uint i = 0; i < gpuDefQueue->vqHeader_->aql_slot_num; ++i) { - if (wraps[i].state != 0) { - uint j; - if (p == GPU_PRINT_CHILD_KERNEL) { - break; - } - p++; - std::stringstream print; - 
print.flags(std::ios::right | std::ios_base::hex | std::ios_base::uppercase); - print << "Slot#: " << i << "\n"; - print << "\tenqueue_flags: " << wraps[i].enqueue_flags << "\n"; - print << "\tcommand_id: " << wraps[i].command_id << "\n"; - print << "\tchild_counter: " << wraps[i].child_counter << "\n"; - print << "\tcompletion: " << wraps[i].completion << "\n"; - print << "\tparent_wrap: " << wraps[i].parent_wrap << "\n"; - print << "\twait_list: " << wraps[i].wait_list << "\n"; - print << "\twait_num: " << wraps[i].wait_num << "\n"; - uint offsEvents = wraps[i].wait_list - - gpuDefQueue->virtualQueue_->vmAddress(); - size_t* events = reinterpret_cast( - gpuDefQueue->virtualQueue_->data() + offsEvents); - for (j = 0; j < wraps[i].wait_num; ++j) { - uint offs = static_cast(events[j]) - - gpuDefQueue->virtualQueue_->vmAddress(); - AmdEvent* eventD = (AmdEvent*)(gpuDefQueue->virtualQueue_->data() + offs); - print << "Wait Event#: " << j << "\n"; - print << "\tState: " << eventD->state << - "; Counter: " << eventD->counter << "\n"; - } - print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size_x << ", "; - print << wraps[i].aql.workgroup_size_y << ", "; - print << wraps[i].aql.workgroup_size_z << "]\n"; - print << "GridSize[ " << wraps[i].aql.grid_size_x << ", "; - print << wraps[i].aql.grid_size_y << ", "; - print << wraps[i].aql.grid_size_z << "]\n"; - - uint64_t* kernels = (uint64_t*)( - const_cast(hsaKernel.prog().kernelTable())->map(this)); - for (j = 0; j < hsaKernel.prog().kernels().size(); ++j) { - if (kernels[j] == wraps[i].aql.kernel_object) { - break; - } - } - const_cast(hsaKernel.prog().kernelTable())->unmap(this); - HSAILKernel* child = NULL; - for (auto it = hsaKernel.prog().kernels().begin(); - it != hsaKernel.prog().kernels().end(); ++it) { - if (j == static_cast(it->second)->index()) { - child = static_cast(it->second); - } - } - if (child == NULL) { - printf("Error: couldn't find child kernel!\n"); - continue; - } - const uint64_t kernarg_address = 
- static_cast(reinterpret_cast(wraps[i].aql.kernarg_address)); - uint offsArg = kernarg_address - - gpuDefQueue->virtualQueue_->vmAddress(); - address argum = gpuDefQueue->virtualQueue_->data() + offsArg; - print << "Kernel: " << child->name() << "\n"; - static const char* Names[HSAILKernel::MaxExtraArgumentsNum] = { - "Offset0: ", "Offset1: ","Offset2: ","PrintfBuf: ", "VqueuePtr: ", "AqlWrap: "}; - for (j = 0; j < child->extraArgumentsNum(); ++j) { - print << "\t" << Names[j] << *(size_t*)argum; - print << "\n"; - argum += sizeof(size_t); - } - for (j = 0; j < child->numArguments(); ++j) { - print << "\t" << child->argument(j)->name_ << ": "; - for (int s = child->argument(j)->size_ - 1; s >= 0; --s) { - print.width(2); - print.fill('0'); - print << (uint32_t)(argum[s]); - } - argum += child->argument(j)->size_; - print << "\n"; - } - printf("%s", print.str().c_str()); - } - } - } - - if (!dev().settings().useDeviceQueue_) { - // Add the termination handshake to the host queue - eventBegin(MainEngine); - cs()->VirtualQueueHandshake(gpuDefQueue->schedParams_->gslResource(), - vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, - vmParentWrap + offsetof(AmdAqlWrap, child_counter), - 0, dev().settings().useDeviceQueue_); - eventEnd(MainEngine, gpuEvent); - } - - // Get the global loop start before the scheduler - mcaddr loopStart = gpuDefQueue->cs()->VirtualQueueDispatcherStart(); - static_cast(gpuDefQueue->blitMgr()).runScheduler( - *gpuDefQueue->virtualQueue_, - *gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_, - gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_)); - const static bool FlushL2 = true; - gpuDefQueue->flushCUCaches(FlushL2); - - // Get the address of PM4 template and add write it to params - //! 
@note DMA flush must not occur between patch and the scheduler - mcaddr patchStart = gpuDefQueue->cs()->VirtualQueueDispatcherStart(); - - // Program parameters for the scheduler - SchedulerParam* param = &reinterpret_cast - (gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_]; - param->signal = 1; - // Scale clock to 1024 to avoid 64 bit div in the scheduler - param->eng_clk = (1000 * 1024) / dev().info().maxClockFrequency_; - param->hw_queue = patchStart + sizeof(uint32_t)/* Rewind packet*/; - param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress(); - param->releaseHostCP = 0; - param->parentAQL = vmParentWrap; - param->dedicatedQueue = dev().settings().useDeviceQueue_; - param->useATC = dev().settings().svmFineGrainSystem_; - - // Fill the scratch buffer information - if (hsaKernel.prog().maxScratchRegs() > 0) { - gpu::Memory* scratchBuf = dev().scratch(gpuDefQueue->hwRing())->memObj_; - param->scratchSize = scratchBuf->size(); - param->scratch = scratchBuf->vmAddress(); - param->numMaxWaves = 32 * dev().info().maxComputeUnits_; - param->scratchOffset = dev().scratch(gpuDefQueue->hwRing())->offset_; - memList.push_back(scratchBuf); - } - else { - param->numMaxWaves = 0; - param->scratchSize = 0; - param->scratch = 0; - param->scratchOffset = 0; - } - - // Add all kernels in the program to the mem list. - //! 
\note Runtime doesn't know which one will be called - hsaKernel.prog().fillResListWithKernels(memList); - - // Add GSL handle to the memory list for VidMM - for (uint i = 0; i < memList.size(); ++i) { - gpuDefQueue->addVmMemory(memList[i]); - } - - mcaddr signalAddr = gpuDefQueue->schedParams_->vmAddress() + - gpuDefQueue->schedParamIdx_ * sizeof(SchedulerParam); - gpuDefQueue->eventBegin(MainEngine); - gpuDefQueue->cs()->VirtualQueueDispatcherEnd( - gpuDefQueue->vmMems(), gpuDefQueue->cal_.memCount_, - signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num / - (DeviceQueueMaskSize * maskGroups_)); - gpuDefQueue->eventEnd(MainEngine, gpuEvent); - - // Set GPU event for the used resources - for (uint i = 0; i < memList.size(); ++i) { - memList[i]->setBusy(*gpuDefQueue, gpuEvent); - } - - if (dev().settings().useDeviceQueue_) { - // Add the termination handshake to the host queue - eventBegin(MainEngine); - cs()->VirtualQueueHandshake(gpuDefQueue->schedParams_->gslResource(), - vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, - vmParentWrap + offsetof(AmdAqlWrap, child_counter), - signalAddr, dev().settings().useDeviceQueue_); - eventEnd(MainEngine, gpuEvent); - } - - ++gpuDefQueue->schedParamIdx_ %= - gpuDefQueue->schedParams_->size() / sizeof(SchedulerParam); - //! 
\todo optimize the wrap around - if (gpuDefQueue->schedParamIdx_ == 0) { - gpuDefQueue->schedParams_->wait(*gpuDefQueue); - } - } - - // Set GPU event for the used resources - for (uint i = 0; i < memList.size(); ++i) { - memList[i]->setBusy(*this, gpuEvent); - } - - // Update the global GPU event - setGpuEvent(gpuEvent, needFlush); - - if (!printfDbgHSA().output(*this, printfEnabled, hsaKernel.printfInfo())) { - LogError("Couldn't read printf data from the buffer!\n"); - return false; - } - } - - // Runtime submitted a HSAIL kernel - state_.hsailKernel_ = true; - - return true; -} - -bool -VirtualGPU::submitKernelInternal( - const amd::NDRangeContainer& sizes, - const amd::Kernel& kernel, - const_address parameters, - bool nativeMem, - amd::Event* enqueueEvent) -{ - bool result = true; - uint i; - size_t dimensions = sizes.dimensions(); - amd::NDRange local(sizes.local()); - amd::NDRange groupOffset(dimensions); - GpuEvent gpuEvent; - groupOffset = 0; - - // Get the GPU kernel object with optimization enabled - bool noAlias = true; - device::Kernel* devKernel = const_cast - (kernel.getDeviceKernel(dev(), noAlias)); - Kernel& gpuKernelOpt = static_cast(*devKernel); - - if (gpuKernelOpt.hsa()) { - return submitKernelInternalHSA(sizes, kernel, parameters, nativeMem, enqueueEvent); - } - else if (state_.hsailKernel_) { - // Reload GSL state to HW, so runtime could run AMDIL kernel - flushDMA(MainEngine); - // Reset HSAIL state - state_.hsailKernel_ = false; - } - - // Find if arguments contain memory aliases or a dependency in the queue - gpuKernelOpt.processMemObjects(*this, kernel, parameters, nativeMem); - - Kernel& gpuKernel = static_cast(*devKernel); - bool printfEnabled = (gpuKernel.flags() & - gpu::NullKernel::PrintfOutput) ? 
true:false; - // Set current kernel CAL descriptor as active - if (!setActiveKernelDesc(sizes, &gpuKernel) || - // Initialize printf support - !printfDbg().init(*this, printfEnabled , sizes.global())) { - LogPrintfError("We couldn't set \"%s\" kernel as active!", - gpuKernel.name().data()); - return false; - } - - // Find if we have to split workload - dmaFlushMgmt_.findSplitSize(dev(), sizes.global().product(), gpuKernel.instructionCnt()); - - // Program the kernel parameters for the GPU execution - cal_.memCount_ = 0; - gpuKernel.setupProgramGrid(*this, dimensions, - sizes.offset(), sizes.global(), - local, groupOffset, sizes.offset(), sizes.global()); - - // Load kernel arguments - if (gpuKernel.loadParameters(*this, kernel, parameters, nativeMem)) { - amd::NDRange global(sizes.global()); - amd::NDRange groups(dimensions); - amd::NDRange offsets(sizes.offset()); - amd::NDRange divider(dimensions); - amd::NDRange remainder(dimensions); - size_t extra = 0; - - // Split the workload if necessary for local/private emulation or printf - findIterations(sizes, local, groups, remainder, extra); - - divider = groups; - i = 0; - do { - bool lastRun = (i == (cal()->iterations_ - 1)) ? true : false; - // Reprogram the CAL grid and constant buffers if - // the workload split is on - if (cal()->iterations_ > 1) { - // Initialize printf support - if (!printfDbg().init(*this, printfEnabled, local)) { - result = false; - break; - } - - // Reprogram the CAL grid and constant buffers - setupIteration(i, sizes, - gpuKernel, global, offsets, local, - groups, groupOffset, divider, remainder, extra); - } - - // Execute the kernel - if (gpuKernel.run(*this, &gpuEvent, lastRun, kernel.parameters().getExecNewVcop(), - kernel.parameters().getExecPfpaVcop())) { - //! @todo A flush is necessary to make sure - // that 2 consecutive runs won't access to the same - // private/local memory. 
CAL has to generate cache flush - // and wait for idle commands - bool flush = ((cal()->iterations_ > 1) || - dmaFlushMgmt_.isCbReady(*this, global.product(), - gpuKernel.instructionCnt())) ? true : false; - - // Update the global GPU event - setGpuEvent(gpuEvent, flush); - - // This code for the kernel execution debugging - if (dev().settings().debugFlags_ & Settings::LockGlobalMemory) { - gpuKernel.debug(*this); - } - } - else { - result = false; - break; - } - - // Print the debug buffer output result - if (printfDbg().output(*this, printfEnabled, - (cal()->iterations_ > 1) ? local : sizes.global(), - gpuKernel.prog().printfInfo())) { - // Go to the next iteration - ++i; - } - else { - result = false; - break; - } - } - // Check if we have to make multiple iterations - while (i < cal()->iterations_); - } - else { - result = false; - } - - if (!result) { - LogPrintfError("submitKernel failed to execute the \"%s\" kernel on HW!", - gpuKernel.name().data()); - } - - return result; -} - -void -VirtualGPU::submitNativeFn(amd::NativeFnCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - - Unimplemented(); //!< @todo: Unimplemented -} - -void -VirtualGPU::submitMarker(amd::Marker& vcmd) -{ - //!@note runtime doesn't need to lock this command on execution - - if (vcmd.waitingEvent() != NULL) { - bool foundEvent = false; - - // Loop through all outstanding command batches - while (!cbList_.empty()) { - CommandBatchList::const_iterator it = cbList_.begin(); - // Wait for completion - foundEvent = awaitCompletion(*it, vcmd.waitingEvent()); - // Release a command batch - delete *it; - // Remove command batch from the list - cbList_.pop_front(); - // Early exit if we found a command - if (foundEvent) break; - } - - // Event should be in the current command batch - if (!foundEvent) { - state_.forceWait_ = true; - } - // If we don't have any more batches, then assume GPU is idle - else if (cbList_.empty()) 
{ - dmaFlushMgmt_.resetCbWorkload(dev()); - } - } -} - -void -VirtualGPU::releaseMemory(gslMemObject gslResource, bool wait) -{ - bool result = true; - if (wait) { - waitForEvent(&gpuEvents_[gslResource]); - } - - // Unbind resource if it's active kernel desc - for (uint i = 0; i < MaxUavArguments; ++i) { - if (gslResource == cal_.uavs_[i]) { - result = setUAVBuffer(i, 0, GSL_UAV_TYPE_UNKNOWN); - cal_.uavs_[i] = 0; - } - } - for (uint i = 0; i < MaxReadImage; ++i) { - if (gslResource == cal_.readImages_[i]) { - result = setInput(i, 0); - cal_.readImages_[i] = 0; - } - } - for (uint i = 0; i < MaxConstBuffers; ++i) { - if (gslResource == cal_.constBuffers_[i]) { - result = setConstantBuffer(i, 0, 0, 0); - cal_.constBuffers_[i] = 0; - } - } - - if ((dev().scratch(hwRing()) != NULL) && - (dev().scratch(hwRing())->regNum_ > 0)) { - // Unbind scratch memory - const Device::ScratchBuffer* scratch = dev().scratch(hwRing()); - if ((scratch->memObj_ != NULL) && (scratch->memObj_->gslResource() == gslResource)) { - setScratchBuffer(NULL, 0); - } - } - - gpuEvents_.erase(gslResource); -} - -void -VirtualGPU::releaseKernel(CALimage calImage) -{ - GslKernelDesc* desc = gslKernels_[calImage]; - if (desc != NULL) { - freeKernelDesc(desc); - } - gslKernels_.erase(calImage); -} - -void -VirtualGPU::submitPerfCounter(amd::PerfCounterCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - - gslQueryObject gslCounter; - - const amd::PerfCounterCommand::PerfCounterList counters = vcmd.getCounters(); - - // Create a HW counter - gslCounter = cs()->createQuery(GSL_PERFORMANCE_COUNTERS_ATI); - if (0 == gslCounter) { - LogError("We failed to allocate memory for the GPU perfcounter"); - vcmd.setStatus(CL_INVALID_OPERATION); - return; - } - CalCounterReference* calRef = new CalCounterReference(*this, gslCounter); - if (calRef == NULL) { - LogError("We failed to allocate memory for the GPU perfcounter"); - 
vcmd.setStatus(CL_INVALID_OPERATION); - return; - } - gslCounter = 0; - - for (uint i = 0; i < vcmd.getNumCounters(); ++i) { - amd::PerfCounter* amdCounter = - static_cast(counters[i]); - const PerfCounter* counter = - static_cast(amdCounter->getDeviceCounter()); - - // Make sure we have a valid gpu performance counter - if (NULL == counter) { - amd::PerfCounter::Properties prop = amdCounter->properties(); - PerfCounter* gpuCounter = new PerfCounter( - gpuDevice_, - *this, - prop[CL_PERFCOUNTER_GPU_BLOCK_INDEX], - prop[CL_PERFCOUNTER_GPU_COUNTER_INDEX], - prop[CL_PERFCOUNTER_GPU_EVENT_INDEX]); - if (NULL == gpuCounter) { - LogError("We failed to allocate memory for the GPU perfcounter"); - vcmd.setStatus(CL_INVALID_OPERATION); - return; - } - else if (gpuCounter->create(calRef)) { - amdCounter->setDeviceCounter(gpuCounter); - } - else { - LogPrintfError("We failed to allocate a perfcounter in CAL.\ - Block: %d, counter: #d, event: %d", - gpuCounter->info()->blockIndex_, - gpuCounter->info()->counterIndex_, - gpuCounter->info()->eventIndex_); - delete gpuCounter; - vcmd.setStatus(CL_INVALID_OPERATION); - return; - } - counter = gpuCounter; - } - } - - calRef->release(); - - for (uint i = 0; i < vcmd.getNumCounters(); ++i) { - amd::PerfCounter* amdCounter = - static_cast(counters[i]); - const PerfCounter* counter = - static_cast(amdCounter->getDeviceCounter()); - - if (gslCounter != counter->gslCounter()) { - gslCounter = counter->gslCounter(); - // Find the state and sends the command to CAL - if (vcmd.getState() == amd::PerfCounterCommand::Begin) { - gslCounter->BeginQuery(cs(), GSL_PERFORMANCE_COUNTERS_ATI, 0); - } - else if (vcmd.getState() == amd::PerfCounterCommand::End) { - GpuEvent event; - eventBegin(MainEngine); - gslCounter->EndQuery(cs(), 0); - eventEnd(MainEngine, event); - setGpuEvent(event); - } - else { - LogError("Unsupported performance counter state"); - vcmd.setStatus(CL_INVALID_OPERATION); - return; - } - } - } -} -void 
-VirtualGPU::submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - - profilingBegin(cmd); - - switch(cmd.type()) { - case CL_COMMAND_THREAD_TRACE_MEM: - { - amd::ThreadTrace* amdThreadTrace = &cmd.getThreadTrace(); - ThreadTrace* threadTrace = - static_cast(amdThreadTrace->getDeviceThreadTrace()); - - if (threadTrace == NULL) { - gslQueryObject gslThreadTrace; - // Create a HW thread trace query object - gslThreadTrace = cs()->createQuery(GSL_SHADER_TRACE_BYTES_WRITTEN); - if (0 == gslThreadTrace) { - LogError("Failure in memory allocation for the GPU threadtrace"); - cmd.setStatus(CL_INVALID_OPERATION); - return; - } - CalThreadTraceReference* calRef = new CalThreadTraceReference(*this,gslThreadTrace); - if (calRef == NULL) { - LogError("Failure in memory allocation for the GPU threadtrace"); - cmd.setStatus(CL_INVALID_OPERATION); - return; - } - size_t seNum = amdThreadTrace->deviceSeNumThreadTrace(); - ThreadTrace* gpuThreadTrace = new ThreadTrace( - gpuDevice_, - *this, - seNum); - if (NULL == gpuThreadTrace) { - LogError("Failure in memory allocation for the GPU threadtrace"); - cmd.setStatus(CL_INVALID_OPERATION); - return; - } - if (gpuThreadTrace->create(calRef)) { - amdThreadTrace->setDeviceThreadTrace(gpuThreadTrace); - } - else { - LogError("Failure in memory allocation for the GPU threadtrace"); - delete gpuThreadTrace; - cmd.setStatus(CL_INVALID_OPERATION); - return; - } - threadTrace = gpuThreadTrace; - calRef->release(); - } - gslShaderTraceBufferObject* threadTraceBufferObjects = threadTrace->getThreadTraceBufferObjects(); - const size_t memObjSize = cmd.getMemoryObjectSize(); - const std::vector& memObj = cmd.getMemList(); - size_t se = 0; - for (std::vector::const_iterator itMemObj = memObj.begin();itMemObj != memObj.end();++itMemObj,++se) { - // Find GSL Mem Object - gslMemObject gslMemObj = 
dev().getGpuMemory(*itMemObj)->gslResource(); - - // Bind GSL MemObject to the appropriate SE Thread Trace Buffer Object - threadTraceBufferObjects[se]->attachMemObject(cs(), gslMemObj, 0, 0, memObjSize, se); - } - break; - } - default: - LogError("Unsupported command type for ThreadTraceMemObjects!"); - break; - } -} - -void -VirtualGPU::submitThreadTrace(amd::ThreadTraceCommand& cmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - - profilingBegin(cmd); - - switch(cmd.type()) { - case CL_COMMAND_THREAD_TRACE: - { - amd::ThreadTrace* amdThreadTrace = - static_cast(&cmd.getThreadTrace()); - ThreadTrace* threadTrace = - static_cast(amdThreadTrace->getDeviceThreadTrace()); - - // gpu thread trace object had to be generated prior to begin/end/pause/resume due - // to ThreadTraceMemObjectsCommand execution - if (threadTrace == NULL) { - return; - } - else { - gslQueryObject gslThreadTrace; - gslThreadTrace = threadTrace->gslThreadTrace(); - uint32_t seNum = amdThreadTrace->deviceSeNumThreadTrace(); - - // Find the state and sends the commands to GSL - if (cmd.getState() == amd::ThreadTraceCommand::Begin) { - amd::ThreadTrace::ThreadTraceConfig* traceCfg = - static_cast(cmd.threadTraceConfig()); - const gslErrorCode ec = gslThreadTrace->BeginQuery(cs(), - GSL_SHADER_TRACE_BYTES_WRITTEN, 0); - assert(ec == GSL_NO_ERROR); - - for (uint32_t idx = 0; idx < seNum; ++idx) { - rs()->enableShaderTrace(cs(), idx, true); - rs()->setShaderTraceComputeUnit (idx, traceCfg->cu_); - rs()->setShaderTraceShaderArray (idx, traceCfg->sh_); - rs()->setShaderTraceSIMDMask (idx, traceCfg->simdMask_); - rs()->setShaderTraceVmIdMask (idx, traceCfg->vmIdMask_); - rs()->setShaderTraceTokenMask (idx, traceCfg->tokenMask_); - rs()->setShaderTraceRegisterMask(idx, traceCfg->regMask_); - rs()->setShaderTraceIssueMask (idx, traceCfg->instMask_); - rs()->setShaderTraceRandomSeed (idx, traceCfg->randomSeed_); - rs()->setShaderTraceCaptureMode 
(idx, traceCfg->captureMode_); - rs()->setShaderTraceWrap (idx, traceCfg->isWrapped_); - rs()->setShaderTraceUserData (idx, - (traceCfg->isUserData_) ? traceCfg->userData_ : 0); - } - } - else if (cmd.getState() == amd::ThreadTraceCommand::End) { - for (uint32_t idx = 0; idx < seNum; ++idx) { - rs()->enableShaderTrace(cs(), idx, false); - } - gslThreadTrace->EndQuery(cs(), 0); - } - else if (cmd.getState() == amd::ThreadTraceCommand::Pause) { - for (uint32_t idx = 0; idx < seNum; ++idx) { - rs()->setShaderTraceIsPaused(cs(), idx, true); - } - } - else if (cmd.getState() == amd::ThreadTraceCommand::Resume) { - for (uint32_t idx = 0; idx < seNum; ++idx) { - rs()->setShaderTraceIsPaused(cs(), idx, false); - } - } - } - break; - } - default: - LogError("Unsupported command type for ThreadTrace!"); - break; - } -} - -void -VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - - profilingBegin(vcmd); - - for (std::vector::const_iterator it = vcmd.getMemList().begin(); - it != vcmd.getMemList().end(); it++) { - // amd::Memory object should never be NULL - assert(*it && "Memory object for interop is NULL"); - gpu::Memory* memory = dev().getGpuMemory(*it); - - // If resource is a shared copy of original resource, then - // runtime needs to copy data from original resource - (*it)->getInteropObj()->copyOrigToShared(); - - // Check if OpenCL has direct access to the interop memory - if (memory->interopType() == Memory::InteropDirectAccess) { - continue; - } - - // Does interop use HW emulation? 
- if (memory->interopType() == Memory::InteropHwEmulation) { - static const bool Entire = true; - amd::Coord3D origin(0, 0, 0); - amd::Coord3D region(memory->size()); - - // Synchronize the object - if (!blitMgr().copyBuffer(*memory->interop(), - *memory, origin, origin, region, Entire)) { - LogError("submitAcquireExtObjects - Interop synchronization failed!"); - vcmd.setStatus(CL_INVALID_OPERATION); - return; - } - } - } - - profilingEnd(vcmd); -} - -void -VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - - profilingBegin(vcmd); - - for (std::vector::const_iterator it = vcmd.getMemList().begin(); - it != vcmd.getMemList().end(); it++) { - // amd::Memory object should never be NULL - assert(*it && "Memory object for interop is NULL"); - gpu::Memory* memory = dev().getGpuMemory(*it); - - // Check if we can use HW interop - if (memory->interopType() == Memory::InteropHwEmulation) { - static const bool Entire = true; - amd::Coord3D origin(0, 0, 0); - amd::Coord3D region(memory->size()); - - // Synchronize the object - if (!blitMgr().copyBuffer(*memory, *memory->interop(), - origin, origin, region, Entire)) { - LogError("submitReleaseExtObjects interop synchronization failed!"); - vcmd.setStatus(CL_INVALID_OPERATION); - return; - } - } - else { - if (memory->interopType() != Memory::InteropDirectAccess) { - LogError("None interop release!"); - } - } - - // If resource is a shared copy of original resource, then - // runtime needs to copy data back to original resource - (*it)->getInteropObj()->copySharedToOrig(); - } - - profilingEnd(vcmd); -} - -void -VirtualGPU::submitSignal(amd::SignalCommand & vcmd) -{ - amd::ScopedLock lock(execution()); - profilingBegin(vcmd); - gpu::Memory* gpuMemory = dev().getGpuMemory(&vcmd.memory()); - GpuEvent gpuEvent; - eventBegin(MainEngine); - if (vcmd.type() == CL_COMMAND_WAIT_SIGNAL_AMD) { - uint64_t 
surfAddr = gpuMemory->gslResource()->getPhysicalAddress(cs()); - uint64_t markerAddr = gpuMemory->gslResource()->getMarkerAddress(cs()); - uint64_t markerOffset = markerAddr - surfAddr; - cs()->p2pMarkerOp(gpuMemory->gslResource(), vcmd.markerValue(), - markerOffset, false); - } - else if (vcmd.type() == CL_COMMAND_WRITE_SIGNAL_AMD) { - cs()->p2pMarkerOp(gpuMemory->gslResource(), vcmd.markerValue(), vcmd.markerOffset(), true); - } - eventEnd(MainEngine, gpuEvent); - gpuMemory->setBusy(*this, gpuEvent); - // Update the global GPU event - setGpuEvent(gpuEvent); - - profilingEnd(vcmd); -} - -void -VirtualGPU::submitMakeBuffersResident(amd::MakeBuffersResidentCommand & vcmd) -{ - amd::ScopedLock lock(execution()); - profilingBegin(vcmd); - std::vector memObjects = vcmd.memObjects(); - cl_uint numObjects = memObjects.size(); - gslMemObject* pGSLMemObjects = new gslMemObject[numObjects]; - - for(cl_uint i = 0; i < numObjects; ++i) - { - gpu::Memory* gpuMemory = dev().getGpuMemory(memObjects[i]); - pGSLMemObjects[i] = gpuMemory->gslResource(); - gpuMemory->syncCacheFromHost(*this); - } - - uint64* surfBusAddr = new uint64[numObjects]; - uint64* markerBusAddr = new uint64[numObjects]; - gslErrorCode res = cs()->makeBuffersResident(numObjects, pGSLMemObjects, - surfBusAddr, markerBusAddr); - if(res != GSL_NO_ERROR) { - LogError("MakeBuffersResident failed"); - vcmd.setStatus(CL_INVALID_OPERATION); - } - else { - cl_bus_address_amd* busAddr = vcmd.busAddress(); - for(cl_uint i = 0; i < numObjects; ++i) - { - busAddr[i].surface_bus_address = surfBusAddr[i]; - busAddr[i].marker_bus_address = markerBusAddr[i]; - } - } - delete[] pGSLMemObjects; - delete[] surfBusAddr; - delete[] markerBusAddr; - profilingEnd(vcmd); -} - - -bool -VirtualGPU::awaitCompletion(CommandBatch* cb, const amd::Event* waitingEvent) -{ - bool found = false; - amd::Command* current; - amd::Command* head = cb->head_; - - // Make sure that profiling is enabled - if (profileEnabled_) { - return 
profilingCollectResults(cb, waitingEvent); - } - // Mark the first command in the batch as running - if (head != NULL) { - head->setStatus(CL_RUNNING); - } - else { - return found; - } - - // Wait for the last known GPU event - waitEventLock(cb); - - while (NULL != head) { - current = head->getNext(); - if (head->status() == CL_SUBMITTED) { - head->setStatus(CL_RUNNING); - head->setStatus(CL_COMPLETE); - } - else if (head->status() == CL_RUNNING) { - head->setStatus(CL_COMPLETE); - } - else if ((head->status() != CL_COMPLETE) && (current != NULL)) { - LogPrintfError("Unexpected command status - %d!", head->status()); - } - - // Check if it's a waiting command - if (head == waitingEvent) { - found = true; - } - - head->release(); - head = current; - } - - return found; -} - -void -VirtualGPU::flush(amd::Command* list, bool wait) -{ - CommandBatch* cb = NULL; - bool gpuCommand = false; - - for (uint i = 0; i < AllEngines; ++i) { - if (cal_.events_[i].isValid()) { - gpuCommand = true; - } - } - - // If the batch doesn't have any GPU command and the list is empty - if (!gpuCommand && cbList_.empty()) { - state_.forceWait_ = true; - } - - // Insert the current batch into a list - if (NULL != list) { - cb = new CommandBatch(list, cal()->events_, cal()->lastTS_); - } - - { - //! @note: flushDMA() requires a lock, because GSL can - //! defer destruction of internal memory objects and releases them - //! on GSL flush. If runtime calls another GSL flush at the same time, - //! then double release can occur. - amd::ScopedLock lock(execution()); - for (uint i = 0; i < AllEngines; ++i) { - flushDMA(i); - // Reset event so we won't try to wait again, - // if runtime didn't submit any commands - //! @note: it's safe to invalidate events, since - //! 
we already saved them with the batch creation step above - cal_.events_[i].invalidate(); - } - } - - // Mark last TS as NULL, so runtime won't process empty batches with the old TS - cal_.lastTS_ = NULL; - if (NULL != cb) { - cbList_.push_back(cb); - } - - wait |= state_.forceWait_; - // Loop through all outstanding command batches - while (!cbList_.empty()) { - CommandBatchList::const_iterator it = cbList_.begin(); - // Check if command batch finished without a wait - bool finished = true; - for (uint i = 0; i < AllEngines; ++i) { - finished &= isDone(&(*it)->events_[i]); - } - if (finished || wait) { - // Wait for completion - awaitCompletion(*it); - // Release a command batch - delete *it; - // Remove command batch from the list - cbList_.pop_front(); - } - else { - // Early exit if no finished - break; - } - } - state_.forceWait_ = false; -} - -void -VirtualGPU::enableSyncedBlit() const -{ - return blitMgr_->enableSynchronization(); -} - -void -VirtualGPU::releaseMemObjects(bool scratch) -{ - for (GpuEvents::const_iterator it = gpuEvents_.begin(); - it != gpuEvents_.end(); ++it) { - GpuEvent event = it->second; - waitForEvent(&event); - } - // Unbind all resources.So the queue won't have any bound mem objects - for (uint i = 0; i < MaxUavArguments; ++i) { - if (NULL != cal_.uavs_[i]) { - setUAVBuffer(i, 0, GSL_UAV_TYPE_UNKNOWN); - cal_.uavs_[i] = 0; - } - } - for (uint i = 0; i < MaxReadImage; ++i) { - if (NULL != cal_.readImages_[i]) { - setInput(i, 0); - cal_.readImages_[i] = 0; - } - } - for (uint i = 0; i < MaxConstBuffers; ++i) { - if (NULL != cal_.constBuffers_[i]) { - setConstantBuffer(i, 0, 0, 0); - cal_.constBuffers_[i] = 0; - } - } - - if (scratch) { - setScratchBuffer(NULL, 0); - } - - gpuEvents_.clear(); -} - -void -VirtualGPU::setGpuEvent( - GpuEvent gpuEvent, - bool flush) -{ - cal_.events_[engineID_] = gpuEvent; - - // Flush current DMA buffer if requested - if (flush || GPU_FLUSH_ON_EXECUTION) { - flushDMA(engineID_); - } -} - -void 
-VirtualGPU::flushDMA(uint engineID) -{ - if (engineID == MainEngine) { - // Clear memory dependency state, since runtime flushes compute - // memoryDependency().clear(); - //!@todo Keep memory dependency alive even if we flush DMA, - //! since only L2 cache is flushed in KMD frame, - //! but L1 still has to be invalidated. - } - //! \note Use CtxIsEventDone, so we won't flush compute for DRM engine - isDone(&cal_.events_[engineID]); -} - -bool -VirtualGPU::waitAllEngines(CommandBatch* cb) -{ - uint i; - GpuEvent* events; //!< GPU events for the batch - // If command batch is NULL then wait for the current - if (NULL == cb) { - events = cal_.events_; - } - else { - events = cb->events_; - } - - bool earlyDone = true; - // The first loop is to flush all engines and/or check if - // engines are idle already - for (i = 0; i < AllEngines; ++i) { - earlyDone &= isDone(&events[i]); - } - - // Release all transfer buffers on this command queue - releaseXferWrite(); - - // Rlease all pinned memory - releasePinnedMem(); - - // The second loop is to wait all engines - for (i = 0; i < AllEngines; ++i) { - waitForEvent(&events[i]); - } - - return earlyDone; -} - -void -VirtualGPU::waitEventLock(CommandBatch* cb) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - - bool earlyDone = waitAllEngines(cb); - - // Free resource cache if we have too many entries - //! \note we do it here, when all engines are idle, - // because Vista/Win7 idles GPU on a resource destruction - static const size_t MinCacheEntries = 4096; - dev().resourceCache().free(MinCacheEntries); - - // Find the timestamp object of the last command in the batch - if (cb->lastTS_ != NULL) { - // If earlyDone is TRUE, then CPU didn't wait for GPU. - // Thus the sync point between CPU and GPU is unclear and runtime - // will use an older adjustment value to maintain the same timeline - if (!earlyDone || - //! \note Workaround for APU(s). - //! 
GPU-CPU timelines may go off too much, thus always - //! force calibration with the last batch in the list - (cbList_.size() <= 1) || - (readjustTimeGPU_ == 0)) { - uint64_t startTimeStampGPU = 0; - uint64_t endTimeStampGPU = 0; - - // Get the timestamp value of the last command in the batch - cb->lastTS_->value(&startTimeStampGPU, &endTimeStampGPU); - - uint64_t endTimeStampCPU = amd::Os::timeNanos(); - // Make sure the command batch has a valid GPU TS - if (!GPU_RAW_TIMESTAMP) { - // Adjust the base time by the execution time - readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU; - } - } - } -} - -void -VirtualGPU::validateScratchBuffer(const Kernel* kernel) -{ - // Check if a scratch buffer is required - if (dev().scratch(hwRing())->regNum_ > 0) { - // Setup scratch buffer - setScratchBuffer(dev().scratch(hwRing())->memObj_->gslResource(), 0); - } -} - -bool -VirtualGPU::setActiveKernelDesc( - const amd::NDRangeContainer& sizes, - const Kernel* kernel) -{ - bool result = true; - CALimage calImage = kernel->calImage(); - - GslKernelDesc* desc = gslKernels_[calImage]; - - validateScratchBuffer(kernel); - - // Early exit - if ((activeKernelDesc_ == desc) && (desc != NULL)) { - return result; - } - - // Does the kernel descriptor for this virtual device exist? 
- if (desc == NULL) { - desc = allocKernelDesc(kernel, calImage); - if (desc == NULL) { - return false; - } - gslKernels_[calImage] = desc; - } - - // Set the descriptor as active - activeKernelDesc_ = desc; - - // Program the samplers defined in the kernel - if (!kernel->setInternalSamplers(*this)) { - result = false; - } - - // Bind global HW constant buffers - if (!kernel->bindGlobalHwCb(*this, desc)) { - result = false; - } - - if (result) { - // Set program in GSL - rs()->setCurrentProgramObject(GSL_COMPUTE_PROGRAM, desc->func_); - - // Update internal constant buffer - if (desc->intCb_ != 0) { - cs()->setIntConstants(GSL_COMPUTE_PROGRAM, desc->intCb_); - } - } - - return result; -} - -bool -VirtualGPU::allocConstantBuffers() -{ - // Allocate/reallocate constant buffers - size_t minCbSize; - // GCN doesn't really have a limit - minCbSize = 128 * Ki; - uint i; - - // Create/reallocate constant buffer resources - for (i = 0; i < MaxConstBuffersArguments; ++i) { - ConstBuffer* constBuf = new ConstBuffer(*this, ((minCbSize + - ConstBuffer::VectorSize - 1) / ConstBuffer::VectorSize)); - - if ((constBuf != NULL) && constBuf->create()) { - addConstBuffer(constBuf); - } - else { - // We failed to create a constant buffer - delete constBuf; - return false; - } - } - - return true; -} - -VirtualGPU::GslKernelDesc* -VirtualGPU::allocKernelDesc(const Kernel* kernel, CALimage calImage) -{ - // Sanity checks - assert(kernel != NULL); - GslKernelDesc* desc = new GslKernelDesc; - - if (desc != NULL) { - memset(desc, 0, sizeof(GslKernelDesc)); - - if (kernel->calImage() != calImage) { - desc->image_ = calImage; - } - - if (!moduleLoad(calImage, &desc->func_, &desc->intCb_)) { - LogPrintfError("calModuleLoad failed for \"%s\" kernel!", - kernel->name().c_str()); - delete desc; - return NULL; - } - } - - if (kernel->argSize() > slots_.size()) { - slots_.resize(kernel->argSize()); - } - - return desc; -} - -void -VirtualGPU::freeKernelDesc(VirtualGPU::GslKernelDesc* desc) -{ - if 
(desc) { - if (gslKernelDesc() == desc) { - // Clear active kernel desc - activeKernelDesc_ = NULL; - rs()->setCurrentProgramObject(GSL_COMPUTE_PROGRAM, 0); - } - - if (desc->image_ != 0) { - // Free CAL image - free(desc->image_); - } - - if (desc->func_ != 0) { - if (desc->intCb_ != 0) { - cs()->setIntConstants(GSL_COMPUTE_PROGRAM, 0); - cs()->destroyMemObject(desc->intCb_); - } - cs()->destroyProgramObject(desc->func_); - } - - delete desc; - } -} - -void -VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) -{ - // Is profiling enabled? - if (command.profilingInfo().enabled_) { - // Allocate a timestamp object from the cache - TimeStamp* ts = tsCache_->allocTimeStamp(); - if (NULL == ts) { - return; - } - // Save the TimeStamp object in the current OCL event - command.setData(ts); - currTs_ = ts; - profileEnabled_ = true; - } -} - -void -VirtualGPU::profilingEnd(amd::Command& command) -{ - // Get the TimeStamp object associated witht the current command - TimeStamp* ts = reinterpret_cast(command.data()); - if (ts != NULL) { - // Check if the command actually did any GPU submission - if (ts->isValid()) { - cal_.lastTS_ = ts; - } - else { - // Destroy the TimeStamp object - tsCache_->freeTimeStamp(ts); - command.setData(NULL); - } - } -} - -bool -VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* waitingEvent) -{ - bool found = false; - amd::Command* current; - amd::Command* first = cb->head_; - - // If the command list is, empty then exit - if (NULL == first) { - return found; - } - - // Wait for the last known GPU events on all engines - waitEventLock(cb); - - // Find the CPU base time of the entire command batch execution - uint64_t endTimeStamp = amd::Os::timeNanos(); - uint64_t startTimeStamp = endTimeStamp; - - // First step, walk the command list to find the first valid command - //! \note The batch may have empty markers at the beginning. - //! So the start/end of the empty commands is equal to - //! 
the start of the first valid command in the batch. - first = cb->head_; - while (NULL != first) { - // Get the TimeStamp object associated witht the current command - TimeStamp* ts = reinterpret_cast(first->data()); - - if (ts != NULL) { - ts->value(&startTimeStamp, &endTimeStamp); - endTimeStamp -= readjustTimeGPU_; - startTimeStamp -= readjustTimeGPU_; - // Assign to endTimeStamp the start of the first valid command - endTimeStamp = startTimeStamp; - break; - } - first = first->getNext(); - } - - // Second step, walk the command list to construct the time line - first = cb->head_; - while (NULL != first) { - // Get the TimeStamp object associated witht the current command - TimeStamp* ts = reinterpret_cast(first->data()); - - current = first->getNext(); - - if (ts != NULL) { - ts->value(&startTimeStamp, &endTimeStamp); - endTimeStamp -= readjustTimeGPU_; - startTimeStamp -= readjustTimeGPU_; - // Destroy the TimeStamp object - tsCache_->freeTimeStamp(ts); - first->setData(NULL); - } - else { - // For empty commands start/end is equal to - // the end of the last valid command - startTimeStamp = endTimeStamp; - } - - // Update the command status with the proper timestamps - if (first->status() == CL_SUBMITTED) { - first->setStatus(CL_RUNNING, startTimeStamp); - first->setStatus(CL_COMPLETE, endTimeStamp); - } - else if (first->status() == CL_RUNNING) { - first->setStatus(CL_COMPLETE, endTimeStamp); - } - else if ((first->status() != CL_COMPLETE) && (current != NULL)) { - LogPrintfError("Unexpected command status - %d!", first->status()); - } - - // Do we wait this event? 
- if (first == waitingEvent) { - found = true; - } - - first->release(); - first = current; - } - - return found; -} - -bool -VirtualGPU::addVmMemory(const Memory* memory) -{ - uint* cnt = &cal_.memCount_; - (*cnt)++; - // Reallocate array if kernel uses more memory objects - if (numVmMems_ < *cnt) { - gslMemObject* tmp; - tmp = new gslMemObject [*cnt]; - if (tmp == NULL) { - return false; - } - memcpy(tmp, vmMems_, sizeof(gslMemObject) * numVmMems_); - delete [] vmMems_; - vmMems_ = tmp; - numVmMems_ = *cnt; - } - vmMems_[*cnt - 1] = memory->gslResource(); - - return true; -} - -void -VirtualGPU::profileEvent(EngineType engine, bool type) const -{ - if (NULL == currTs_) { - return; - } - if (type) { - currTs_->begin((engine == SdmaEngine) ? true : false); - } - else { - currTs_->end((engine == SdmaEngine) ? true : false); - } -} - -bool -VirtualGPU::processMemObjectsHSA( - const amd::Kernel& kernel, - const_address params, - bool nativeMem, - std::vector* memList) -{ - static const bool NoAlias = true; - const HSAILKernel& hsaKernel = static_cast - (*(kernel.getDeviceKernel(dev(), NoAlias))); - const amd::KernelSignature& signature = kernel.signature(); - const amd::KernelParameters& kernelParams = kernel.parameters(); - - // Mark the tracker with a new kernel, - // so we can avoid checks of the aliased objects - memoryDependency().newKernel(); - - bool deviceSupportFGS = 0 != dev().isFineGrainedSystem(true); - bool supportFineGrainedSystem = deviceSupportFGS; - FGSStatus status = kernelParams.getSvmSystemPointersSupport(); - switch (status) { - case FGS_YES: - if (!deviceSupportFGS) { - return false; - } - supportFineGrainedSystem = true; - break; - case FGS_NO: - supportFineGrainedSystem = false; - break; - case FGS_DEFAULT: - default: - break; - } - - size_t count = kernelParams.getNumberOfSvmPtr(); - size_t execInfoOffset = kernelParams.getExecInfoOffset(); - bool sync = true; - - amd::Memory* memory = NULL; - //get svm non arugment information - void* const* 
svmPtrArray = - reinterpret_cast(params + execInfoOffset); - for (size_t i = 0; i < count; i++) { - memory = amd::SvmManager::FindSvmBuffer(svmPtrArray[i]); - if (NULL == memory) { - if (!supportFineGrainedSystem) { - return false; - } - else if (sync) { - flushCUCaches(); - // Clear memory dependency state - const static bool All = true; - memoryDependency().clear(!All); - continue; - } - } - else { - Memory* gpuMemory = dev().getGpuMemory(memory); - if (NULL != gpuMemory) { - // Synchronize data with other memory instances if necessary - gpuMemory->syncCacheFromHost(*this); - - const static bool IsReadOnly = false; - // Validate SVM passed in the non argument list - memoryDependency().validate(*this, gpuMemory, IsReadOnly); - - // Mark signal write for cache coherency, - // since this object isn't a part of kernel arg setup - if ((memory->getMemFlags() & CL_MEM_READ_ONLY) == 0) { - memory->signalWrite(&dev()); - } - - memList->push_back(gpuMemory); - } - else { - return false; - } - } - } - - // Check all parameters for the current kernel - for (size_t i = 0; i < signature.numParameters(); ++i) { - const amd::KernelParameterDescriptor& desc = signature.at(i); - const HSAILKernel::Argument* arg = hsaKernel.argument(i); - Memory* memory = NULL; - bool readOnly = false; - amd::Memory* svmMem = NULL; - - // Find if current argument is a buffer - if ((desc.type_ == T_POINTER) && (arg->addrQual_ != HSAIL_ADDRESS_LOCAL)) { - if (kernelParams.boundToSvmPointer(dev(), params, i)) { - svmMem = amd::SvmManager::FindSvmBuffer( - *reinterpret_cast(params + desc.offset_)); - if (!svmMem) { - flushCUCaches(); - // Clear memory dependency state - const static bool All = true; - memoryDependency().clear(!All); - continue; - } - } - - if (nativeMem) { - memory = *reinterpret_cast(params + desc.offset_); - } - else if (*reinterpret_cast - (params + desc.offset_) != NULL) { - if (NULL == svmMem) { - memory = dev().getGpuMemory(*reinterpret_cast - (params + desc.offset_)); - } - else 
{ - memory = dev().getGpuMemory(svmMem); - } - // Synchronize data with other memory instances if necessary - memory->syncCacheFromHost(*this); - } - - if (memory != NULL) { - // Check image - readOnly = (desc.accessQualifier_ == - CL_KERNEL_ARG_ACCESS_READ_ONLY) ? true : false; - // Check buffer - readOnly |= (arg->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false; - // Validate memory for a dependency in the queue - memoryDependency().validate(*this, memory, readOnly); - } - } - } - - for (gpu::Memory* mem : hsaKernel.prog().globalStores()) { - const static bool IsReadOnly = false; - // Validate global store for a dependency in the queue - memoryDependency().validate(*this, mem, IsReadOnly); - } - - return true; -} - -amd::Memory* -VirtualGPU::createBufferFromImage(amd::Memory& amdImage) const -{ - amd::Memory* mem = new(amdImage.getContext()) - amd::Buffer(amdImage, 0, 0, amdImage.getSize()); - - if ((mem != NULL) && !mem->create()) { - mem->release(); - } - - return mem; -} - -void -VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable) -{ - const static bool Wait = true; - vqHeader_->kernel_table = kernelTable; - virtualQueue_->writeRawData(hostQ, sizeof(AmdVQueueHeader), vqHeader_, !Wait); -} - -void -VirtualGPU::flushCuCaches(HwDbgGpuCacheMask cache_mask) -{ - //! 
@todo: fix issue of no event available for the flush/invalidate cache command - InvalidateSqCaches(cache_mask.sqICache_, - cache_mask.sqKCache_, - cache_mask.tcL1_, - cache_mask.tcL2_); - - flushDMA(engineID_); - +void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + gpu::Memory* memory = dev().getGpuMemory(&vcmd.memory()); + amd::Memory* owner = memory->owner(); + bool unmapMip = false; + const device::Memory::WriteMapInfo* writeMapInfo = memory->writeMapInfo(vcmd.mapPtr()); + if (nullptr == writeMapInfo) { + LogError("Unmap without map call"); return; + } + profilingBegin(vcmd, true); + + // Check if image is a mipmap and assign a saved view + amd::Image* amdImage = owner->asImage(); + if ((amdImage != NULL) && (amdImage->getMipLevels() > 1) && (writeMapInfo->baseMip_ != NULL)) { + // Assign mip level view + amdImage = writeMapInfo->baseMip_; + // Clear unmap flags from the parent image + memory->clearUnmapInfo(vcmd.mapPtr()); + memory = dev().getGpuMemory(amdImage); + unmapMip = true; + writeMapInfo = memory->writeMapInfo(vcmd.mapPtr()); + } + + // We used host memory + if ((owner->getHostMem() != NULL) && memory->isDirectMap()) { + if (writeMapInfo->isUnmapWrite()) { + // Target is the backing store, so sync + owner->signalWrite(NULL); + memory->syncCacheFromHost(*this); + } + // Remove memory from VA cache + dev().removeVACache(memory); + } + // data check was added for persistent memory that failed to get aperture + // and therefore are treated like a remote resource + else if (memory->isPersistentDirectMap() && (memory->data() != NULL)) { + memory->unmap(this); + } else if (memory->mapMemory() != NULL) { + if (writeMapInfo->isUnmapWrite()) { + amd::Coord3D srcOrigin(0, 0, 0); + // Target is a remote resource, so copy + assert(memory->mapMemory() != NULL); + if (memory->cal()->buffer_) { + if (!blitMgr().copyBuffer(*memory->mapMemory(), 
*memory, writeMapInfo->origin_, + writeMapInfo->origin_, writeMapInfo->region_, + writeMapInfo->isEntire())) { + LogError("submitUnmapMemory() - copy failed"); + vcmd.setStatus(CL_OUT_OF_RESOURCES); + } + } else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { + amd::Memory* bufferFromImage = NULL; + Memory* memoryBuf = memory; + amd::Coord3D origin(writeMapInfo->origin_[0]); + amd::Coord3D size(writeMapInfo->region_[0]); + size_t elemSize = vcmd.memory().asImage()->getImageFormat().getElementSize(); + origin.c[0] *= elemSize; + size.c[0] *= elemSize; + + bufferFromImage = createBufferFromImage(vcmd.memory()); + if (NULL == bufferFromImage) { + LogError("We should not fail buffer creation from image_buffer!"); + } else { + bufferFromImage->setVirtualDevice(this); + memoryBuf = dev().getGpuMemory(bufferFromImage); + } + if (!blitMgr().copyBuffer(*memory->mapMemory(), *memoryBuf, srcOrigin, origin, size, + writeMapInfo->isEntire())) { + LogError("submitUnmapMemory() - copy failed"); + vcmd.setStatus(CL_OUT_OF_RESOURCES); + } + if (NULL != bufferFromImage) { + bufferFromImage->release(); + } + } else { + if (!blitMgr().copyBufferToImage(*memory->mapMemory(), *memory, srcOrigin, + writeMapInfo->origin_, writeMapInfo->region_, + writeMapInfo->isEntire())) { + LogError("submitUnmapMemory() - copy failed"); + vcmd.setStatus(CL_OUT_OF_RESOURCES); + } + } + } + } else { + LogError("Unhandled unmap!"); + vcmd.setStatus(CL_INVALID_VALUE); + } + + // Clear unmap flags + memory->clearUnmapInfo(vcmd.mapPtr()); + + // Release a view for a mipmap map + if (unmapMip) { + amdImage->release(); + } + profilingEnd(vcmd); } -void -VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel, - hsa_kernel_dispatch_packet_t* aqlPkt, - HwDbgKernelInfo& kernelInfo, - amd::Event* enqueueEvent) -{ - amd::HwDebugManager * dbgManager = dev().hwDebugMgr(); - assert (dbgManager && "No HW Debug Manager!"); +bool VirtualGPU::fillMemory(cl_command_type type, amd::Memory* amdMemory, const 
void* pattern, + size_t patternSize, const amd::Coord3D& origin, + const amd::Coord3D& size) { + gpu::Memory* memory = dev().getGpuMemory(amdMemory); + bool entire = amdMemory->isEntirelyCovered(origin, size); - // Initialize structure with default values + // Synchronize memory from host if necessary + device::Memory::SyncFlags syncFlags; + syncFlags.skipEntire_ = entire; + memory->syncCacheFromHost(*this, syncFlags); - if (hsaKernel.prog().maxScratchRegs() > 0) { - gpu::Memory* scratchBuf = dev().scratch(hwRing())->memObj_; - kernelInfo.scratchBufAddr = scratchBuf->vmAddress(); - kernelInfo.scratchBufferSizeInBytes = scratchBuf->size(); + bool result = false; + amd::Memory* bufferFromImage = NULL; + float fillValue[4]; - // Get the address of the scratch buffer and its size for CPU access - address scratchRingAddr = NULL; - scratchRingAddr = static_cast
(scratchBuf->map(NULL, 0)); - dbgManager->setScratchRing(scratchRingAddr,scratchBuf->size()); - scratchBuf->unmap(NULL); - } - else { - kernelInfo.scratchBufAddr = 0; - kernelInfo.scratchBufferSizeInBytes = 0; - dbgManager->setScratchRing(NULL, 0); + // Force fill buffer for IMAGE1D_BUFFER + if ((type == CL_COMMAND_FILL_IMAGE) && (amdMemory->getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { + bufferFromImage = createBufferFromImage(*amdMemory); + if (NULL == bufferFromImage) { + LogError("We should not fail buffer creation from image_buffer!"); + } else { + type = CL_COMMAND_FILL_BUFFER; + bufferFromImage->setVirtualDevice(this); + memory = dev().getGpuMemory(bufferFromImage); } + } + // Find the the right fill operation + switch (type) { + case CL_COMMAND_FILL_BUFFER: + case CL_COMMAND_SVM_MEMFILL: { + amd::Coord3D realOrigin(origin[0]); + amd::Coord3D realSize(size[0]); + // Reprogram fill parameters if it's an IMAGE1D_BUFFER object + if (NULL != bufferFromImage) { + size_t elemSize = amdMemory->asImage()->getImageFormat().getElementSize(); + realOrigin.c[0] *= elemSize; + realSize.c[0] *= elemSize; + memset(fillValue, 0, sizeof(fillValue)); + amdMemory->asImage()->getImageFormat().formatColor(pattern, fillValue); + pattern = fillValue; + patternSize = elemSize; + } + result = blitMgr().fillBuffer(*memory, pattern, patternSize, realOrigin, realSize, + amdMemory->isEntirelyCovered(origin, size)); + if (NULL != bufferFromImage) { + bufferFromImage->release(); + } + } break; + case CL_COMMAND_FILL_IMAGE: + result = blitMgr().fillImage(*memory, pattern, origin, size, + amdMemory->isEntirelyCovered(origin, size)); + break; + default: + LogError("Unsupported command type for FillMemory!"); + break; + } - //! 
@todo: need to verify what is wanted for the global memory - kernelInfo.heapBufAddr = (dev().globalMem()).vmAddress(); + if (!result) { + LogError("fillMemory failed!"); + return false; + } - kernelInfo.pAqlDispatchPacket = aqlPkt; - kernelInfo.pAqlQueuePtr = reinterpret_cast(hsaQueueMem_->vmAddress()); - - // Get the address of the kernel code and its size for CPU access - gpu::Memory* aqlCode = hsaKernel.gpuAqlCode(); - if (NULL != aqlCode) { - address aqlCodeAddr = static_cast
(aqlCode->map(NULL, 0)); - dbgManager->setKernelCodeInfo(aqlCodeAddr, hsaKernel.aqlCodeSize()); - aqlCode->unmap(NULL); - } - else { - dbgManager->setKernelCodeInfo(NULL, 0); - } - - kernelInfo.trapPresent = false; - kernelInfo.trapHandler = NULL; - kernelInfo.trapHandlerBuffer = NULL; - - kernelInfo.excpEn = 0; - kernelInfo.cacheDisableMask = 0; - kernelInfo.sqDebugMode = 0; - - kernelInfo.mgmtSe0Mask = 0xFFFFFFFF; - kernelInfo.mgmtSe1Mask = 0xFFFFFFFF; - - // set kernel info for HW debug and call the callback function - if (NULL != dbgManager->preDispatchCallBackFunc()) { - DebugToolInfo dbgSetting = {0}; - dbgSetting.scratchAddress_ = kernelInfo.scratchBufAddr; - dbgSetting.scratchSize_ = kernelInfo.scratchBufferSizeInBytes; - dbgSetting.globalAddress_ = kernelInfo.heapBufAddr; - dbgSetting.aclBinary_ = hsaKernel.prog().binaryElf(); - dbgSetting.event_ = enqueueEvent; - - // Call the predispatch callback function & set the trap info - AqlCodeInfo aqlCodeInfo; - aqlCodeInfo.aqlCode_ = (amd_kernel_code_t *) hsaKernel.cpuAqlCode(); - aqlCodeInfo.aqlCodeSize_ = hsaKernel.aqlCodeSize(); - - // Execute the pre-dispatch call back function - dbgManager->executePreDispatchCallBack(reinterpret_cast(aqlPkt), &dbgSetting); - - // assign the debug TMA and TBA for kernel dispatch - if (NULL != dbgSetting.trapHandler_ && NULL != dbgSetting.trapBuffer_) { - assignDebugTrapHandler(dbgSetting, kernelInfo); - } - - kernelInfo.trapPresent = (kernelInfo.trapHandler) ? true : false; - - // Execption policy - kernelInfo.excpEn = dbgSetting.exceptionMask_; - kernelInfo.cacheDisableMask = dbgSetting.cacheDisableMask_; - kernelInfo.sqDebugMode = dbgSetting.gpuSingleStepMode_; - - // Compute the mask for reserved CUs. These two dwords correspond to - // two registers used for reserving CUs for display. In the current - // implementation, the number of CUs reserved can be 0 to 7, and it - // is set by debugger users. 
- if (dbgSetting.monitorMode_) { - uint32_t i = dbgSetting.reservedCuNum_ / 2; - kernelInfo.mgmtSe0Mask <<= i; - i = dbgSetting.reservedCuNum_ - i; - kernelInfo.mgmtSe1Mask <<= i; - } - - // flush/invalidate the instruction, data, L1 and L2 caches - InvalidateSqCaches(); - } + // Mark this as the most-recently written cache of the destination + amdMemory->signalWrite(&gpuDevice_); + return true; } -void -VirtualGPU::assignDebugTrapHandler(const DebugToolInfo& dbgSetting, - HwDbgKernelInfo& kernelInfo) -{ - // setup the runtime trap handler code and trap buffer to be assigned before kernel dispatching - // - Memory * rtTrapHandlerMem = static_cast(dev().hwDebugMgr()->runtimeTBA()); - Memory * rtTrapBufferMem = static_cast(dev().hwDebugMgr()->runtimeTMA()); +void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); - kernelInfo.trapHandler = reinterpret_cast(rtTrapHandlerMem->vmAddress() + TbaStartOffset); - // With the TMA corruption hw bug workaround, the trap handler buffer can be set to zero. - // However, by setting the runtime trap buffer (TMA) correct, the runtime trap hander - // without the workaround can still function correctly. - kernelInfo.trapHandlerBuffer = reinterpret_cast(rtTrapBufferMem->vmAddress()); + profilingBegin(vcmd, true); - address rtTrapBufferAddress = static_cast
(rtTrapBufferMem->map(this)); + if (!fillMemory(vcmd.type(), &vcmd.memory(), vcmd.pattern(), vcmd.patternSize(), vcmd.origin(), + vcmd.size())) { + vcmd.setStatus(CL_INVALID_OPERATION); + } - Memory * trapHandlerMem = dev().getGpuMemory(dbgSetting.trapHandler_); - Memory * trapBufferMem = dev().getGpuMemory(dbgSetting.trapBuffer_); + profilingEnd(vcmd); +} - // Address of the trap handler code/buffer should be 256-byte aligned - uint64_t tbaAddress = trapHandlerMem->vmAddress(); - uint64_t tmaAddress = trapBufferMem->vmAddress(); - if ((tbaAddress & 0xFF) != 0 || (tmaAddress & 0xFF) != 0) { - assert(false && "Trap handler/buffer is not 256-byte aligned"); +void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + profilingBegin(vcmd, true); + + // no op for FGS supported device + if (!dev().isFineGrainedSystem()) { + // Make sure we have memory for the command execution + gpu::Memory* memory = dev().getGpuMemory(vcmd.getSvmMem()); + memory->saveMapInfo(vcmd.svmPtr(), vcmd.origin(), vcmd.size(), vcmd.mapFlags(), + vcmd.isEntireMemory()); + + if (memory->mapMemory() != NULL) { + if (vcmd.mapFlags() & (CL_MAP_READ | CL_MAP_WRITE)) { + assert(memory->cal()->buffer_ && "SVM memory can't be an image"); + if (!blitMgr().copyBuffer(*memory, *memory->mapMemory(), vcmd.origin(), vcmd.origin(), + vcmd.size(), vcmd.isEntireMemory())) { + LogError("submitSVMMapMemory() - copy failed"); + vcmd.setStatus(CL_MAP_FAILURE); + } + } + } else if ((memory->owner()->getHostMem() != nullptr) && memory->isDirectMap()) { + if (!memory->isHostMemDirectAccess()) { + // Make sure GPU finished operation before + // synchronization with the backing store + memory->wait(*this); + } + + // Target is the backing store, so just ensure that owner is up-to-date + memory->owner()->cacheWriteBack(); + } else { + LogError("Unhandled svm map!"); + } + } + + profilingEnd(vcmd); +} + 
+void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + profilingBegin(vcmd, true); + + // no op for FGS supported device + if (!dev().isFineGrainedSystem()) { + gpu::Memory* memory = dev().getGpuMemory(vcmd.getSvmMem()); + const device::Memory::WriteMapInfo* writeMapInfo = memory->writeMapInfo(vcmd.svmPtr()); + + if (memory->mapMemory() != NULL) { + if (writeMapInfo->isUnmapWrite()) { + // Target is a remote resource, so copy + assert(memory->cal()->buffer_ && "SVM memory can't be an image"); + if (!blitMgr().copyBuffer(*memory->mapMemory(), *memory, writeMapInfo->origin_, + writeMapInfo->origin_, writeMapInfo->region_, + writeMapInfo->isEntire())) { + LogError("submitSvmUnmapMemory() - copy failed"); + vcmd.setStatus(CL_OUT_OF_RESOURCES); + } + } + } else if ((memory->owner()->getHostMem() != nullptr) && memory->isDirectMap()) { + if (writeMapInfo->isUnmapWrite()) { + // Target is the backing store, so sync + memory->owner()->signalWrite(nullptr); + memory->syncCacheFromHost(*this); + } + } + memory->clearUnmapInfo(vcmd.svmPtr()); + } + + profilingEnd(vcmd); +} + +void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + profilingBegin(vcmd, true); + + if (!dev().isFineGrainedSystem()) { + size_t patternSize = vcmd.patternSize(); + size_t fillSize = patternSize * vcmd.times(); + size_t offset = 0; + amd::Memory* dstMemory = amd::SvmManager::FindSvmBuffer(vcmd.dst()); + assert(dstMemory && "No svm Buffer to fill with!"); + offset = reinterpret_cast(vcmd.dst()) - + reinterpret_cast(dstMemory->getSvmPtr()); + assert((offset >= 0) && "wrong svm ptr to fill with!"); + + gpu::Memory* memory = dev().getGpuMemory(dstMemory); + + amd::Coord3D origin(offset, 0, 0); + amd::Coord3D size(fillSize, 1, 1); + 
assert((dstMemory->validateRegion(origin, size)) && "The incorrect fill size!"); + + if (!fillMemory(vcmd.type(), dstMemory, vcmd.pattern(), vcmd.patternSize(), origin, size)) { + vcmd.setStatus(CL_INVALID_OPERATION); + } + // Mark this as the most-recently written cache of the destination + dstMemory->signalWrite(&gpuDevice_); + } else { + // for FGS capable device, fill CPU memory directly + amd::SvmBuffer::memFill(vcmd.dst(), vcmd.pattern(), vcmd.patternSize(), vcmd.times()); + } + + profilingEnd(vcmd); +} + +void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + profilingBegin(vcmd, true); + + std::vector::const_iterator itr; + for (itr = vcmd.memObjects().begin(); itr != vcmd.memObjects().end(); itr++) { + // Find device memory + gpu::Memory* memory = dev().getGpuMemory(*itr); + + if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_HOST) { + memory->mgpuCacheWriteBack(); + } else if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) { + // Synchronize memory from host if necessary. 
+ // The sync function will perform memory migration from + // another device if necessary + device::Memory::SyncFlags syncFlags; + memory->syncCacheFromHost(*this, syncFlags); + } else { + LogWarning("Unknown operation for memory migration!"); + } + } + + profilingEnd(vcmd); +} + +void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& vcmd) { + // in-order semantics: previous commands need to be done before we start + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + profilingBegin(vcmd); + std::vector& svmPointers = vcmd.svmPointers(); + if (vcmd.pfnFreeFunc() == NULL) { + // pointers allocated using clSVMAlloc + for (cl_uint i = 0; i < svmPointers.size(); i++) { + dev().svmFree(svmPointers[i]); + } + } else { + vcmd.pfnFreeFunc()(as_cl(vcmd.queue()->asCommandQueue()), svmPointers.size(), + static_cast(&(svmPointers[0])), vcmd.userData()); + } + profilingEnd(vcmd); +} + +void VirtualGPU::findIterations(const amd::NDRangeContainer& sizes, const amd::NDRange& local, + amd::NDRange& groups, amd::NDRange& remainder, size_t& extra) { + size_t dimensions = sizes.dimensions(); + + if (cal()->iterations_ > 1) { + size_t iterations = cal()->iterations_; + cal_.iterations_ = 1; + + // Find the total amount of all groups + groups = sizes.global() / local; + if (dev().settings().partialDispatch_) { + for (uint j = 0; j < dimensions; ++j) { + if ((sizes.global()[j] % local[j]) != 0) { + groups[j]++; + } + } } - // The addresses of the debug trap handler code (TBA) and buffer (TMA) are - // stored in the runtime trap handler buffer with offset location of 0x18-19 - // and 0x20-21, respectively. 
- uint64_t * rtTmaPtr = reinterpret_cast(rtTrapBufferAddress + 0x18); - rtTmaPtr[0] = tbaAddress; - rtTmaPtr[1] = tmaAddress; + // Calculate the real number of required iterations and + // the workgroup size of each iteration + for (int j = (dimensions - 1); j >= 0; --j) { + // Find possible size of each iteration + size_t tmp = (groups[j] / iterations); + // Make sure the group size is more than 1 + if (tmp > 0) { + remainder = groups; + remainder[j] = (groups[j] % tmp); - rtTrapBufferMem->unmap(NULL); + extra = ((groups[j] / tmp) + + // Check for the remainder + ((remainder[j] != 0) ? 1 : 0)); + // Recalculate the number of iterations + cal_.iterations_ *= extra; + if (remainder[j] == 0) { + extra = 0; + } + groups[j] = tmp; + break; + } else { + iterations = ((iterations / groups[j]) + (((iterations % groups[j]) != 0) ? 1 : 0)); + cal_.iterations_ *= groups[j]; + groups[j] = 1; + } + } + } +} + +void VirtualGPU::setupIteration(uint iteration, const amd::NDRangeContainer& sizes, + Kernel& gpuKernel, amd::NDRange& global, amd::NDRange& offsets, + amd::NDRange& local, amd::NDRange& groups, + amd::NDRange& groupOffset, amd::NDRange& divider, + amd::NDRange& remainder, size_t extra) { + size_t dimensions = sizes.dimensions(); + + // Calculate the workload size for the remainder + if ((extra != 0) && ((iteration % extra) == 0)) { + groups = remainder; + } else { + groups = divider; + } + global = groups * local; + + if (dev().settings().partialDispatch_) { + for (uint j = 0; j < dimensions; ++j) { + size_t offset = groupOffset[j] * local[j]; + if ((offset + global[j]) > sizes.global()[j]) { + global[j] = sizes.global()[j] - offset; + } + } + } + + // Reprogram the kernel parameters for the GPU execution + gpuKernel.setupProgramGrid(*this, dimensions, offsets, global, local, groupOffset, sizes.offset(), + sizes.global()); + + // Update the constant buffers + gpuKernel.bindConstantBuffers(*this); + + uint sub = 0; + // Find the offsets for the next execution + for (uint 
j = 0; j < dimensions; ++j) { + groupOffset[j] += groups[j]; + // Make sure the offset doesn't go over the size limit + if (sizes.global()[j] <= groupOffset[j] * local[j]) { + // Check if we counted a group in one dimension already + if (sub) { + groupOffset[j] -= groups[j]; + } else { + groupOffset[j] = 0; + } + } else { + groupOffset[j] -= sub; + // We already counted elements in one dimension + sub = 1; + } + + offsets[j] = groupOffset[j] * local[j] + sizes.offset()[j]; + } +} + +void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + profilingBegin(vcmd); + + // Submit kernel to HW + if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, &vcmd.event())) { + vcmd.setStatus(CL_INVALID_OPERATION); + } + + profilingEnd(vcmd); +} + +bool VirtualGPU::submitKernelInternalHSA(const amd::NDRangeContainer& sizes, + const amd::Kernel& kernel, const_address parameters, + bool nativeMem, amd::Event* enqueueEvent) { + uint64_t vmParentWrap = 0; + uint64_t vmDefQueue = 0; + amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev()); + VirtualGPU* gpuDefQueue = NULL; + amd::HwDebugManager* dbgManager = dev().hwDebugMgr(); + + // Get the HSA kernel object + const HSAILKernel& hsaKernel = static_cast(*(kernel.getDeviceKernel(dev()))); + std::vector memList; + + bool printfEnabled = (hsaKernel.printfInfo().size() > 0) ? 
true : false; + if (!printfDbgHSA().init(*this, printfEnabled)) { + LogError("Printf debug buffer initialization failed!"); + return false; + } + + // Check memory dependency and SVM objects + if (!processMemObjectsHSA(kernel, parameters, nativeMem, &memList)) { + LogError("Wrong memory objects!"); + return false; + } + + cal_.memCount_ = 0; + + if (hsaKernel.dynamicParallelism()) { + if (NULL == defQueue) { + LogError("Default device queue wasn't allocated"); + return false; + } else { + if (dev().settings().useDeviceQueue_) { + gpuDefQueue = static_cast(defQueue->vDev()); + if (gpuDefQueue->hwRing() == hwRing()) { + LogError("Can't submit the child kernels to the same HW ring as the host queue!"); + return false; + } + } else { + createVirtualQueue(defQueue->size()); + gpuDefQueue = this; + } + } + vmDefQueue = gpuDefQueue->virtualQueue_->vmAddress(); + + // Add memory handles before the actual dispatch + memList.push_back(gpuDefQueue->virtualQueue_); + memList.push_back(gpuDefQueue->schedParams_); + memList.push_back(hsaKernel.prog().kernelTable()); + gpuDefQueue->writeVQueueHeader(*this, hsaKernel.prog().kernelTable()->vmAddress()); + } + + // setup the storage for the memory pointers of the kernel parameters + uint numParams = kernel.signature().numParameters(); + if (dbgManager) { + dbgManager->allocParamMemList(numParams); + } + + bool needFlush = false; + dmaFlushMgmt_.findSplitSize(dev(), sizes.global().product(), hsaKernel.aqlCodeSize()); + if (dmaFlushMgmt().dispatchSplitSize() != 0) { + needFlush = true; + } + + size_t newOffset[3] = {0, 0, 0}; + size_t newGlobalSize[3] = {0, 0, 0}; + + int dim = -1; + int iteration = 1; + size_t globalStep = 0; + for (uint i = 0; i < sizes.dimensions(); i++) { + newGlobalSize[i] = sizes.global()[i]; + newOffset[i] = sizes.offset()[i]; + } + // Check if it is blit kernel. If it is, then check if split is needed. 
+ if (hsaKernel.isInternalKernel()) { + // Calculate new group size for each submission + for (uint i = 0; i < sizes.dimensions(); i++) { + if (sizes.global()[i] > static_cast(0xffffffff)) { + dim = i; + iteration = sizes.global()[i] / 0xC0000000 + ((sizes.global()[i] % 0xC0000000) ? 1 : 0); + globalStep = (sizes.global()[i] / sizes.local()[i]) / iteration * sizes.local()[dim]; + break; + } + } + } + + for (int j = 0; j < iteration; j++) { + // Reset global size for dimension dim if split is needed + if (dim != -1) { + newOffset[dim] = sizes.offset()[dim] + globalStep * j; + if (((newOffset[dim] + globalStep) < sizes.global()[dim]) && (j != (iteration - 1))) { + newGlobalSize[dim] = globalStep; + } else { + newGlobalSize[dim] = sizes.global()[dim] - newOffset[dim]; + } + } + + amd::NDRangeContainer tmpSizes(sizes.dimensions(), &newOffset[0], &newGlobalSize[0], + &(const_cast(sizes).local()[0])); + + // Program the kernel arguments for the GPU execution + hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments( + *this, kernel, tmpSizes, parameters, nativeMem, vmDefQueue, &vmParentWrap, memList); + if (NULL == aqlPkt) { + LogError("Couldn't load kernel arguments"); + return false; + } + + gslMemObject scratch = NULL; + uint scratchOffset = 0; + // Check if the device allocated more registers than the old setup + if (hsaKernel.workGroupInfo()->scratchRegs_ > 0) { + const Device::ScratchBuffer* scratchObj = dev().scratch(hwRing()); + scratch = scratchObj->memObj_->gslResource(); + memList.push_back(scratchObj->memObj_); + scratchOffset = scratchObj->offset_; + } // Add GSL handle to the memory list for VidMM - addVmMemory(trapHandlerMem); - addVmMemory(trapBufferMem); - addVmMemory(rtTrapHandlerMem); - addVmMemory(rtTrapBufferMem); + for (uint i = 0; i < memList.size(); ++i) { + addVmMemory(memList[i]); + } + + // HW Debug for the kernel? 
+ HwDbgKernelInfo kernelInfo; + HwDbgKernelInfo* pKernelInfo = NULL; + + if (dbgManager) { + buildKernelInfo(hsaKernel, aqlPkt, kernelInfo, enqueueEvent); + pKernelInfo = &kernelInfo; + } + + // Set up the dispatch information + KernelDispatchInfo dispatchInfo; + dispatchInfo.aqlPacket = aqlPkt; + dispatchInfo.mems = vmMems(); + dispatchInfo.numMems = cal_.memCount_; + dispatchInfo.scratch = scratch; + dispatchInfo.scratchOffset = scratchOffset; + dispatchInfo.cpuAqlCode = hsaKernel.cpuAqlCode(); + dispatchInfo.hsaQueueVA = hsaQueueMem_->vmAddress(); + dispatchInfo.kernelInfo = pKernelInfo; + dispatchInfo.wavesPerSH = hsaKernel.getWavesPerSH(this); + dispatchInfo.lastDoppSubmission = kernel.parameters().getExecNewVcop(); + dispatchInfo.pfpaDoppSubmission = kernel.parameters().getExecPfpaVcop(); + + GpuEvent gpuEvent; + // Run AQL dispatch in HW + eventBegin(MainEngine); + cs()->AqlDispatch(&dispatchInfo); + eventEnd(MainEngine, gpuEvent); + + if (dbgManager && (NULL != dbgManager->postDispatchCallBackFunc())) { + dbgManager->executePostDispatchCallBack(); + } + + if (hsaKernel.dynamicParallelism()) { + // Make sure exculsive access to the device queue + amd::ScopedLock(defQueue->lock()); + + if (GPU_PRINT_CHILD_KERNEL != 0) { + waitForEvent(&gpuEvent); + + AmdAqlWrap* wraps = + (AmdAqlWrap*)(&((AmdVQueueHeader*)gpuDefQueue->virtualQueue_->data())[1]); + uint p = 0; + for (uint i = 0; i < gpuDefQueue->vqHeader_->aql_slot_num; ++i) { + if (wraps[i].state != 0) { + uint j; + if (p == GPU_PRINT_CHILD_KERNEL) { + break; + } + p++; + std::stringstream print; + print.flags(std::ios::right | std::ios_base::hex | std::ios_base::uppercase); + print << "Slot#: " << i << "\n"; + print << "\tenqueue_flags: " << wraps[i].enqueue_flags << "\n"; + print << "\tcommand_id: " << wraps[i].command_id << "\n"; + print << "\tchild_counter: " << wraps[i].child_counter << "\n"; + print << "\tcompletion: " << wraps[i].completion << "\n"; + print << "\tparent_wrap: " << wraps[i].parent_wrap 
<< "\n"; + print << "\twait_list: " << wraps[i].wait_list << "\n"; + print << "\twait_num: " << wraps[i].wait_num << "\n"; + uint offsEvents = wraps[i].wait_list - gpuDefQueue->virtualQueue_->vmAddress(); + size_t* events = + reinterpret_cast(gpuDefQueue->virtualQueue_->data() + offsEvents); + for (j = 0; j < wraps[i].wait_num; ++j) { + uint offs = + static_cast(events[j]) - gpuDefQueue->virtualQueue_->vmAddress(); + AmdEvent* eventD = (AmdEvent*)(gpuDefQueue->virtualQueue_->data() + offs); + print << "Wait Event#: " << j << "\n"; + print << "\tState: " << eventD->state << "; Counter: " << eventD->counter << "\n"; + } + print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size_x << ", "; + print << wraps[i].aql.workgroup_size_y << ", "; + print << wraps[i].aql.workgroup_size_z << "]\n"; + print << "GridSize[ " << wraps[i].aql.grid_size_x << ", "; + print << wraps[i].aql.grid_size_y << ", "; + print << wraps[i].aql.grid_size_z << "]\n"; + + uint64_t* kernels = + (uint64_t*)(const_cast(hsaKernel.prog().kernelTable())->map(this)); + for (j = 0; j < hsaKernel.prog().kernels().size(); ++j) { + if (kernels[j] == wraps[i].aql.kernel_object) { + break; + } + } + const_cast(hsaKernel.prog().kernelTable())->unmap(this); + HSAILKernel* child = NULL; + for (auto it = hsaKernel.prog().kernels().begin(); + it != hsaKernel.prog().kernels().end(); ++it) { + if (j == static_cast(it->second)->index()) { + child = static_cast(it->second); + } + } + if (child == NULL) { + printf("Error: couldn't find child kernel!\n"); + continue; + } + const uint64_t kernarg_address = + static_cast(reinterpret_cast(wraps[i].aql.kernarg_address)); + uint offsArg = kernarg_address - gpuDefQueue->virtualQueue_->vmAddress(); + address argum = gpuDefQueue->virtualQueue_->data() + offsArg; + print << "Kernel: " << child->name() << "\n"; + static const char* Names[HSAILKernel::MaxExtraArgumentsNum] = { + "Offset0: ", "Offset1: ", "Offset2: ", "PrintfBuf: ", "VqueuePtr: ", "AqlWrap: "}; + for (j = 0; j < 
child->extraArgumentsNum(); ++j) { + print << "\t" << Names[j] << *(size_t*)argum; + print << "\n"; + argum += sizeof(size_t); + } + for (j = 0; j < child->numArguments(); ++j) { + print << "\t" << child->argument(j)->name_ << ": "; + for (int s = child->argument(j)->size_ - 1; s >= 0; --s) { + print.width(2); + print.fill('0'); + print << (uint32_t)(argum[s]); + } + argum += child->argument(j)->size_; + print << "\n"; + } + printf("%s", print.str().c_str()); + } + } + } + + if (!dev().settings().useDeviceQueue_) { + // Add the termination handshake to the host queue + eventBegin(MainEngine); + cs()->VirtualQueueHandshake(gpuDefQueue->schedParams_->gslResource(), + vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, + vmParentWrap + offsetof(AmdAqlWrap, child_counter), 0, + dev().settings().useDeviceQueue_); + eventEnd(MainEngine, gpuEvent); + } + + // Get the global loop start before the scheduler + mcaddr loopStart = gpuDefQueue->cs()->VirtualQueueDispatcherStart(); + static_cast(gpuDefQueue->blitMgr()) + .runScheduler(*gpuDefQueue->virtualQueue_, *gpuDefQueue->schedParams_, + gpuDefQueue->schedParamIdx_, + gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_)); + const static bool FlushL2 = true; + gpuDefQueue->flushCUCaches(FlushL2); + + // Get the address of PM4 template and add write it to params + //! 
@note DMA flush must not occur between patch and the scheduler + mcaddr patchStart = gpuDefQueue->cs()->VirtualQueueDispatcherStart(); + + // Program parameters for the scheduler + SchedulerParam* param = &reinterpret_cast( + gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_]; + param->signal = 1; + // Scale clock to 1024 to avoid 64 bit div in the scheduler + param->eng_clk = (1000 * 1024) / dev().info().maxClockFrequency_; + param->hw_queue = patchStart + sizeof(uint32_t) /* Rewind packet*/; + param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress(); + param->releaseHostCP = 0; + param->parentAQL = vmParentWrap; + param->dedicatedQueue = dev().settings().useDeviceQueue_; + param->useATC = dev().settings().svmFineGrainSystem_; + + // Fill the scratch buffer information + if (hsaKernel.prog().maxScratchRegs() > 0) { + gpu::Memory* scratchBuf = dev().scratch(gpuDefQueue->hwRing())->memObj_; + param->scratchSize = scratchBuf->size(); + param->scratch = scratchBuf->vmAddress(); + param->numMaxWaves = 32 * dev().info().maxComputeUnits_; + param->scratchOffset = dev().scratch(gpuDefQueue->hwRing())->offset_; + memList.push_back(scratchBuf); + } else { + param->numMaxWaves = 0; + param->scratchSize = 0; + param->scratch = 0; + param->scratchOffset = 0; + } + + // Add all kernels in the program to the mem list. + //! 
\note Runtime doesn't know which one will be called + hsaKernel.prog().fillResListWithKernels(memList); + + // Add GSL handle to the memory list for VidMM + for (uint i = 0; i < memList.size(); ++i) { + gpuDefQueue->addVmMemory(memList[i]); + } + + mcaddr signalAddr = gpuDefQueue->schedParams_->vmAddress() + + gpuDefQueue->schedParamIdx_ * sizeof(SchedulerParam); + gpuDefQueue->eventBegin(MainEngine); + gpuDefQueue->cs()->VirtualQueueDispatcherEnd( + gpuDefQueue->vmMems(), gpuDefQueue->cal_.memCount_, signalAddr, loopStart, + gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_)); + gpuDefQueue->eventEnd(MainEngine, gpuEvent); + + // Set GPU event for the used resources + for (uint i = 0; i < memList.size(); ++i) { + memList[i]->setBusy(*gpuDefQueue, gpuEvent); + } + + if (dev().settings().useDeviceQueue_) { + // Add the termination handshake to the host queue + eventBegin(MainEngine); + cs()->VirtualQueueHandshake(gpuDefQueue->schedParams_->gslResource(), + vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, + vmParentWrap + offsetof(AmdAqlWrap, child_counter), signalAddr, + dev().settings().useDeviceQueue_); + eventEnd(MainEngine, gpuEvent); + } + + ++gpuDefQueue->schedParamIdx_ %= gpuDefQueue->schedParams_->size() / sizeof(SchedulerParam); + //! 
\todo optimize the wrap around + if (gpuDefQueue->schedParamIdx_ == 0) { + gpuDefQueue->schedParams_->wait(*gpuDefQueue); + } + } + + // Set GPU event for the used resources + for (uint i = 0; i < memList.size(); ++i) { + memList[i]->setBusy(*this, gpuEvent); + } + + // Update the global GPU event + setGpuEvent(gpuEvent, needFlush); + + if (!printfDbgHSA().output(*this, printfEnabled, hsaKernel.printfInfo())) { + LogError("Couldn't read printf data from the buffer!\n"); + return false; + } + } + + // Runtime submitted a HSAIL kernel + state_.hsailKernel_ = true; + + return true; } -void -VirtualGPU::submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd) -{ - size_t copySize = cmd.size()[0]; - size_t fileOffset = cmd.fileOffset(); - Memory* mem = dev().getGpuMemory(&cmd.memory()); - uint idx = 0; +bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel, + const_address parameters, bool nativeMem, + amd::Event* enqueueEvent) { + bool result = true; + uint i; + size_t dimensions = sizes.dimensions(); + amd::NDRange local(sizes.local()); + amd::NDRange groupOffset(dimensions); + GpuEvent gpuEvent; + groupOffset = 0; - assert((cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD) || - (cmd.type() == CL_COMMAND_WRITE_SSG_FILE_AMD)); - const bool writeBuffer(cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD); + // Get the GPU kernel object with optimization enabled + bool noAlias = true; + device::Kernel* devKernel = const_cast(kernel.getDeviceKernel(dev(), noAlias)); + Kernel& gpuKernelOpt = static_cast(*devKernel); - if (writeBuffer) { - size_t dstOffset = cmd.origin()[0]; - while (copySize > 0) { - Memory* staging = dev().getGpuMemory(&cmd.staging(idx)); - size_t dstSize = amd::TransferBufferFileCommand::StagingBufferSize; - dstSize = std::min(dstSize, copySize); - void* dstBuffer = staging->cpuMap(*this); - if (!cmd.file()->transferBlock(writeBuffer, - dstBuffer, staging->size(), fileOffset, 0, dstSize)) { - 
cmd.setStatus(CL_INVALID_OPERATION); - return; - } - staging->cpuUnmap(*this); + if (gpuKernelOpt.hsa()) { + return submitKernelInternalHSA(sizes, kernel, parameters, nativeMem, enqueueEvent); + } else if (state_.hsailKernel_) { + // Reload GSL state to HW, so runtime could run AMDIL kernel + flushDMA(MainEngine); + // Reset HSAIL state + state_.hsailKernel_ = false; + } - bool result = blitMgr().copyBuffer(*staging, *mem, - 0, dstOffset, dstSize, false); - flushDMA(getGpuEvent(staging->gslResource())->engineId_); - fileOffset += dstSize; - dstOffset += dstSize; - copySize -= dstSize; + // Find if arguments contain memory aliases or a dependency in the queue + gpuKernelOpt.processMemObjects(*this, kernel, parameters, nativeMem); + + Kernel& gpuKernel = static_cast(*devKernel); + bool printfEnabled = (gpuKernel.flags() & gpu::NullKernel::PrintfOutput) ? true : false; + // Set current kernel CAL descriptor as active + if (!setActiveKernelDesc(sizes, &gpuKernel) || + // Initialize printf support + !printfDbg().init(*this, printfEnabled, sizes.global())) { + LogPrintfError("We couldn't set \"%s\" kernel as active!", gpuKernel.name().data()); + return false; + } + + // Find if we have to split workload + dmaFlushMgmt_.findSplitSize(dev(), sizes.global().product(), gpuKernel.instructionCnt()); + + // Program the kernel parameters for the GPU execution + cal_.memCount_ = 0; + gpuKernel.setupProgramGrid(*this, dimensions, sizes.offset(), sizes.global(), local, groupOffset, + sizes.offset(), sizes.global()); + + // Load kernel arguments + if (gpuKernel.loadParameters(*this, kernel, parameters, nativeMem)) { + amd::NDRange global(sizes.global()); + amd::NDRange groups(dimensions); + amd::NDRange offsets(sizes.offset()); + amd::NDRange divider(dimensions); + amd::NDRange remainder(dimensions); + size_t extra = 0; + + // Split the workload if necessary for local/private emulation or printf + findIterations(sizes, local, groups, remainder, extra); + + divider = groups; + i = 0; 
+ do { + bool lastRun = (i == (cal()->iterations_ - 1)) ? true : false; + // Reprogram the CAL grid and constant buffers if + // the workload split is on + if (cal()->iterations_ > 1) { + // Initialize printf support + if (!printfDbg().init(*this, printfEnabled, local)) { + result = false; + break; } - } - else { - size_t srcOffset = cmd.origin()[0]; - while (copySize > 0) { - Memory* staging = dev().getGpuMemory(&cmd.staging(idx)); - size_t srcSize = amd::TransferBufferFileCommand::StagingBufferSize; - srcSize = std::min(srcSize, copySize); - bool result = blitMgr().copyBuffer(*mem, *staging, - srcOffset, 0, srcSize, false); - void* srcBuffer = staging->cpuMap(*this); - if (!cmd.file()->transferBlock(writeBuffer, - srcBuffer, staging->size(), fileOffset, 0, srcSize)) { - cmd.setStatus(CL_INVALID_OPERATION); - return; - } - staging->cpuUnmap(*this); + // Reprogram the CAL grid and constant buffers + setupIteration(i, sizes, gpuKernel, global, offsets, local, groups, groupOffset, divider, + remainder, extra); + } - fileOffset += srcSize; - srcOffset += srcSize; - copySize -= srcSize; + // Execute the kernel + if (gpuKernel.run(*this, &gpuEvent, lastRun, kernel.parameters().getExecNewVcop(), + kernel.parameters().getExecPfpaVcop())) { + //! @todo A flush is necessary to make sure + // that 2 consecutive runs won't access to the same + // private/local memory. CAL has to generate cache flush + // and wait for idle commands + bool flush = ((cal()->iterations_ > 1) || + dmaFlushMgmt_.isCbReady(*this, global.product(), gpuKernel.instructionCnt())) + ? true + : false; + + // Update the global GPU event + setGpuEvent(gpuEvent, flush); + + // This code for the kernel execution debugging + if (dev().settings().debugFlags_ & Settings::LockGlobalMemory) { + gpuKernel.debug(*this); } + } else { + result = false; + break; + } + + // Print the debug buffer output result + if (printfDbg().output(*this, printfEnabled, + (cal()->iterations_ > 1) ? 
local : sizes.global(), + gpuKernel.prog().printfInfo())) { + // Go to the next iteration + ++i; + } else { + result = false; + break; + } } + // Check if we have to make multiple iterations + while (i < cal()->iterations_); + } else { + result = false; + } + + if (!result) { + LogPrintfError("submitKernel failed to execute the \"%s\" kernel on HW!", + gpuKernel.name().data()); + } + + return result; } -} // namespace gpu +void VirtualGPU::submitNativeFn(amd::NativeFnCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + Unimplemented(); //!< @todo: Unimplemented +} + +void VirtualGPU::submitMarker(amd::Marker& vcmd) { + //!@note runtime doesn't need to lock this command on execution + + if (vcmd.waitingEvent() != NULL) { + bool foundEvent = false; + + // Loop through all outstanding command batches + while (!cbList_.empty()) { + CommandBatchList::const_iterator it = cbList_.begin(); + // Wait for completion + foundEvent = awaitCompletion(*it, vcmd.waitingEvent()); + // Release a command batch + delete *it; + // Remove command batch from the list + cbList_.pop_front(); + // Early exit if we found a command + if (foundEvent) break; + } + + // Event should be in the current command batch + if (!foundEvent) { + state_.forceWait_ = true; + } + // If we don't have any more batches, then assume GPU is idle + else if (cbList_.empty()) { + dmaFlushMgmt_.resetCbWorkload(dev()); + } + } +} + +void VirtualGPU::releaseMemory(gslMemObject gslResource, bool wait) { + bool result = true; + if (wait) { + waitForEvent(&gpuEvents_[gslResource]); + } + + // Unbind resource if it's active kernel desc + for (uint i = 0; i < MaxUavArguments; ++i) { + if (gslResource == cal_.uavs_[i]) { + result = setUAVBuffer(i, 0, GSL_UAV_TYPE_UNKNOWN); + cal_.uavs_[i] = 0; + } + } + for (uint i = 0; i < MaxReadImage; ++i) { + if (gslResource == cal_.readImages_[i]) { + result = setInput(i, 0); + cal_.readImages_[i] = 0; + } + } + 
for (uint i = 0; i < MaxConstBuffers; ++i) { + if (gslResource == cal_.constBuffers_[i]) { + result = setConstantBuffer(i, 0, 0, 0); + cal_.constBuffers_[i] = 0; + } + } + + if ((dev().scratch(hwRing()) != NULL) && (dev().scratch(hwRing())->regNum_ > 0)) { + // Unbind scratch memory + const Device::ScratchBuffer* scratch = dev().scratch(hwRing()); + if ((scratch->memObj_ != NULL) && (scratch->memObj_->gslResource() == gslResource)) { + setScratchBuffer(NULL, 0); + } + } + + gpuEvents_.erase(gslResource); +} + +void VirtualGPU::releaseKernel(CALimage calImage) { + GslKernelDesc* desc = gslKernels_[calImage]; + if (desc != NULL) { + freeKernelDesc(desc); + } + gslKernels_.erase(calImage); +} + +void VirtualGPU::submitPerfCounter(amd::PerfCounterCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + gslQueryObject gslCounter; + + const amd::PerfCounterCommand::PerfCounterList counters = vcmd.getCounters(); + + // Create a HW counter + gslCounter = cs()->createQuery(GSL_PERFORMANCE_COUNTERS_ATI); + if (0 == gslCounter) { + LogError("We failed to allocate memory for the GPU perfcounter"); + vcmd.setStatus(CL_INVALID_OPERATION); + return; + } + CalCounterReference* calRef = new CalCounterReference(*this, gslCounter); + if (calRef == NULL) { + LogError("We failed to allocate memory for the GPU perfcounter"); + vcmd.setStatus(CL_INVALID_OPERATION); + return; + } + gslCounter = 0; + + for (uint i = 0; i < vcmd.getNumCounters(); ++i) { + amd::PerfCounter* amdCounter = static_cast(counters[i]); + const PerfCounter* counter = static_cast(amdCounter->getDeviceCounter()); + + // Make sure we have a valid gpu performance counter + if (NULL == counter) { + amd::PerfCounter::Properties prop = amdCounter->properties(); + PerfCounter* gpuCounter = new PerfCounter( + gpuDevice_, *this, prop[CL_PERFCOUNTER_GPU_BLOCK_INDEX], + prop[CL_PERFCOUNTER_GPU_COUNTER_INDEX], prop[CL_PERFCOUNTER_GPU_EVENT_INDEX]); + if (NULL == 
gpuCounter) { + LogError("We failed to allocate memory for the GPU perfcounter"); + vcmd.setStatus(CL_INVALID_OPERATION); + return; + } else if (gpuCounter->create(calRef)) { + amdCounter->setDeviceCounter(gpuCounter); + } else { + LogPrintfError( + "We failed to allocate a perfcounter in CAL.\ + Block: %d, counter: #d, event: %d", + gpuCounter->info()->blockIndex_, gpuCounter->info()->counterIndex_, + gpuCounter->info()->eventIndex_); + delete gpuCounter; + vcmd.setStatus(CL_INVALID_OPERATION); + return; + } + counter = gpuCounter; + } + } + + calRef->release(); + + for (uint i = 0; i < vcmd.getNumCounters(); ++i) { + amd::PerfCounter* amdCounter = static_cast(counters[i]); + const PerfCounter* counter = static_cast(amdCounter->getDeviceCounter()); + + if (gslCounter != counter->gslCounter()) { + gslCounter = counter->gslCounter(); + // Find the state and sends the command to CAL + if (vcmd.getState() == amd::PerfCounterCommand::Begin) { + gslCounter->BeginQuery(cs(), GSL_PERFORMANCE_COUNTERS_ATI, 0); + } else if (vcmd.getState() == amd::PerfCounterCommand::End) { + GpuEvent event; + eventBegin(MainEngine); + gslCounter->EndQuery(cs(), 0); + eventEnd(MainEngine, event); + setGpuEvent(event); + } else { + LogError("Unsupported performance counter state"); + vcmd.setStatus(CL_INVALID_OPERATION); + return; + } + } + } +} +void VirtualGPU::submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + profilingBegin(cmd); + + switch (cmd.type()) { + case CL_COMMAND_THREAD_TRACE_MEM: { + amd::ThreadTrace* amdThreadTrace = &cmd.getThreadTrace(); + ThreadTrace* threadTrace = static_cast(amdThreadTrace->getDeviceThreadTrace()); + + if (threadTrace == NULL) { + gslQueryObject gslThreadTrace; + // Create a HW thread trace query object + gslThreadTrace = cs()->createQuery(GSL_SHADER_TRACE_BYTES_WRITTEN); + if (0 == gslThreadTrace) { + LogError("Failure in memory 
allocation for the GPU threadtrace"); + cmd.setStatus(CL_INVALID_OPERATION); + return; + } + CalThreadTraceReference* calRef = new CalThreadTraceReference(*this, gslThreadTrace); + if (calRef == NULL) { + LogError("Failure in memory allocation for the GPU threadtrace"); + cmd.setStatus(CL_INVALID_OPERATION); + return; + } + size_t seNum = amdThreadTrace->deviceSeNumThreadTrace(); + ThreadTrace* gpuThreadTrace = new ThreadTrace(gpuDevice_, *this, seNum); + if (NULL == gpuThreadTrace) { + LogError("Failure in memory allocation for the GPU threadtrace"); + cmd.setStatus(CL_INVALID_OPERATION); + return; + } + if (gpuThreadTrace->create(calRef)) { + amdThreadTrace->setDeviceThreadTrace(gpuThreadTrace); + } else { + LogError("Failure in memory allocation for the GPU threadtrace"); + delete gpuThreadTrace; + cmd.setStatus(CL_INVALID_OPERATION); + return; + } + threadTrace = gpuThreadTrace; + calRef->release(); + } + gslShaderTraceBufferObject* threadTraceBufferObjects = + threadTrace->getThreadTraceBufferObjects(); + const size_t memObjSize = cmd.getMemoryObjectSize(); + const std::vector& memObj = cmd.getMemList(); + size_t se = 0; + for (std::vector::const_iterator itMemObj = memObj.begin(); + itMemObj != memObj.end(); ++itMemObj, ++se) { + // Find GSL Mem Object + gslMemObject gslMemObj = dev().getGpuMemory(*itMemObj)->gslResource(); + + // Bind GSL MemObject to the appropriate SE Thread Trace Buffer Object + threadTraceBufferObjects[se]->attachMemObject(cs(), gslMemObj, 0, 0, memObjSize, se); + } + break; + } + default: + LogError("Unsupported command type for ThreadTraceMemObjects!"); + break; + } +} + +void VirtualGPU::submitThreadTrace(amd::ThreadTraceCommand& cmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + profilingBegin(cmd); + + switch (cmd.type()) { + case CL_COMMAND_THREAD_TRACE: { + amd::ThreadTrace* amdThreadTrace = static_cast(&cmd.getThreadTrace()); + ThreadTrace* threadTrace = 
static_cast(amdThreadTrace->getDeviceThreadTrace()); + + // gpu thread trace object had to be generated prior to begin/end/pause/resume due + // to ThreadTraceMemObjectsCommand execution + if (threadTrace == NULL) { + return; + } else { + gslQueryObject gslThreadTrace; + gslThreadTrace = threadTrace->gslThreadTrace(); + uint32_t seNum = amdThreadTrace->deviceSeNumThreadTrace(); + + // Find the state and sends the commands to GSL + if (cmd.getState() == amd::ThreadTraceCommand::Begin) { + amd::ThreadTrace::ThreadTraceConfig* traceCfg = + static_cast(cmd.threadTraceConfig()); + const gslErrorCode ec = + gslThreadTrace->BeginQuery(cs(), GSL_SHADER_TRACE_BYTES_WRITTEN, 0); + assert(ec == GSL_NO_ERROR); + + for (uint32_t idx = 0; idx < seNum; ++idx) { + rs()->enableShaderTrace(cs(), idx, true); + rs()->setShaderTraceComputeUnit(idx, traceCfg->cu_); + rs()->setShaderTraceShaderArray(idx, traceCfg->sh_); + rs()->setShaderTraceSIMDMask(idx, traceCfg->simdMask_); + rs()->setShaderTraceVmIdMask(idx, traceCfg->vmIdMask_); + rs()->setShaderTraceTokenMask(idx, traceCfg->tokenMask_); + rs()->setShaderTraceRegisterMask(idx, traceCfg->regMask_); + rs()->setShaderTraceIssueMask(idx, traceCfg->instMask_); + rs()->setShaderTraceRandomSeed(idx, traceCfg->randomSeed_); + rs()->setShaderTraceCaptureMode(idx, traceCfg->captureMode_); + rs()->setShaderTraceWrap(idx, traceCfg->isWrapped_); + rs()->setShaderTraceUserData(idx, (traceCfg->isUserData_) ? 
traceCfg->userData_ : 0); + } + } else if (cmd.getState() == amd::ThreadTraceCommand::End) { + for (uint32_t idx = 0; idx < seNum; ++idx) { + rs()->enableShaderTrace(cs(), idx, false); + } + gslThreadTrace->EndQuery(cs(), 0); + } else if (cmd.getState() == amd::ThreadTraceCommand::Pause) { + for (uint32_t idx = 0; idx < seNum; ++idx) { + rs()->setShaderTraceIsPaused(cs(), idx, true); + } + } else if (cmd.getState() == amd::ThreadTraceCommand::Resume) { + for (uint32_t idx = 0; idx < seNum; ++idx) { + rs()->setShaderTraceIsPaused(cs(), idx, false); + } + } + } + break; + } + default: + LogError("Unsupported command type for ThreadTrace!"); + break; + } +} + +void VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + profilingBegin(vcmd); + + for (std::vector::const_iterator it = vcmd.getMemList().begin(); + it != vcmd.getMemList().end(); it++) { + // amd::Memory object should never be NULL + assert(*it && "Memory object for interop is NULL"); + gpu::Memory* memory = dev().getGpuMemory(*it); + + // If resource is a shared copy of original resource, then + // runtime needs to copy data from original resource + (*it)->getInteropObj()->copyOrigToShared(); + + // Check if OpenCL has direct access to the interop memory + if (memory->interopType() == Memory::InteropDirectAccess) { + continue; + } + + // Does interop use HW emulation? 
+ if (memory->interopType() == Memory::InteropHwEmulation) { + static const bool Entire = true; + amd::Coord3D origin(0, 0, 0); + amd::Coord3D region(memory->size()); + + // Synchronize the object + if (!blitMgr().copyBuffer(*memory->interop(), *memory, origin, origin, region, Entire)) { + LogError("submitAcquireExtObjects - Interop synchronization failed!"); + vcmd.setStatus(CL_INVALID_OPERATION); + return; + } + } + } + + profilingEnd(vcmd); +} + +void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + profilingBegin(vcmd); + + for (std::vector::const_iterator it = vcmd.getMemList().begin(); + it != vcmd.getMemList().end(); it++) { + // amd::Memory object should never be NULL + assert(*it && "Memory object for interop is NULL"); + gpu::Memory* memory = dev().getGpuMemory(*it); + + // Check if we can use HW interop + if (memory->interopType() == Memory::InteropHwEmulation) { + static const bool Entire = true; + amd::Coord3D origin(0, 0, 0); + amd::Coord3D region(memory->size()); + + // Synchronize the object + if (!blitMgr().copyBuffer(*memory, *memory->interop(), origin, origin, region, Entire)) { + LogError("submitReleaseExtObjects interop synchronization failed!"); + vcmd.setStatus(CL_INVALID_OPERATION); + return; + } + } else { + if (memory->interopType() != Memory::InteropDirectAccess) { + LogError("None interop release!"); + } + } + + // If resource is a shared copy of original resource, then + // runtime needs to copy data back to original resource + (*it)->getInteropObj()->copySharedToOrig(); + } + + profilingEnd(vcmd); +} + +void VirtualGPU::submitSignal(amd::SignalCommand& vcmd) { + amd::ScopedLock lock(execution()); + profilingBegin(vcmd); + gpu::Memory* gpuMemory = dev().getGpuMemory(&vcmd.memory()); + GpuEvent gpuEvent; + eventBegin(MainEngine); + if (vcmd.type() == CL_COMMAND_WAIT_SIGNAL_AMD) { + uint64_t surfAddr = 
gpuMemory->gslResource()->getPhysicalAddress(cs()); + uint64_t markerAddr = gpuMemory->gslResource()->getMarkerAddress(cs()); + uint64_t markerOffset = markerAddr - surfAddr; + cs()->p2pMarkerOp(gpuMemory->gslResource(), vcmd.markerValue(), markerOffset, false); + } else if (vcmd.type() == CL_COMMAND_WRITE_SIGNAL_AMD) { + cs()->p2pMarkerOp(gpuMemory->gslResource(), vcmd.markerValue(), vcmd.markerOffset(), true); + } + eventEnd(MainEngine, gpuEvent); + gpuMemory->setBusy(*this, gpuEvent); + // Update the global GPU event + setGpuEvent(gpuEvent); + + profilingEnd(vcmd); +} + +void VirtualGPU::submitMakeBuffersResident(amd::MakeBuffersResidentCommand& vcmd) { + amd::ScopedLock lock(execution()); + profilingBegin(vcmd); + std::vector memObjects = vcmd.memObjects(); + cl_uint numObjects = memObjects.size(); + gslMemObject* pGSLMemObjects = new gslMemObject[numObjects]; + + for (cl_uint i = 0; i < numObjects; ++i) { + gpu::Memory* gpuMemory = dev().getGpuMemory(memObjects[i]); + pGSLMemObjects[i] = gpuMemory->gslResource(); + gpuMemory->syncCacheFromHost(*this); + } + + uint64* surfBusAddr = new uint64[numObjects]; + uint64* markerBusAddr = new uint64[numObjects]; + gslErrorCode res = + cs()->makeBuffersResident(numObjects, pGSLMemObjects, surfBusAddr, markerBusAddr); + if (res != GSL_NO_ERROR) { + LogError("MakeBuffersResident failed"); + vcmd.setStatus(CL_INVALID_OPERATION); + } else { + cl_bus_address_amd* busAddr = vcmd.busAddress(); + for (cl_uint i = 0; i < numObjects; ++i) { + busAddr[i].surface_bus_address = surfBusAddr[i]; + busAddr[i].marker_bus_address = markerBusAddr[i]; + } + } + delete[] pGSLMemObjects; + delete[] surfBusAddr; + delete[] markerBusAddr; + profilingEnd(vcmd); +} + + +bool VirtualGPU::awaitCompletion(CommandBatch* cb, const amd::Event* waitingEvent) { + bool found = false; + amd::Command* current; + amd::Command* head = cb->head_; + + // Make sure that profiling is enabled + if (profileEnabled_) { + return profilingCollectResults(cb, 
waitingEvent); + } + // Mark the first command in the batch as running + if (head != NULL) { + head->setStatus(CL_RUNNING); + } else { + return found; + } + + // Wait for the last known GPU event + waitEventLock(cb); + + while (NULL != head) { + current = head->getNext(); + if (head->status() == CL_SUBMITTED) { + head->setStatus(CL_RUNNING); + head->setStatus(CL_COMPLETE); + } else if (head->status() == CL_RUNNING) { + head->setStatus(CL_COMPLETE); + } else if ((head->status() != CL_COMPLETE) && (current != NULL)) { + LogPrintfError("Unexpected command status - %d!", head->status()); + } + + // Check if it's a waiting command + if (head == waitingEvent) { + found = true; + } + + head->release(); + head = current; + } + + return found; +} + +void VirtualGPU::flush(amd::Command* list, bool wait) { + CommandBatch* cb = NULL; + bool gpuCommand = false; + + for (uint i = 0; i < AllEngines; ++i) { + if (cal_.events_[i].isValid()) { + gpuCommand = true; + } + } + + // If the batch doesn't have any GPU command and the list is empty + if (!gpuCommand && cbList_.empty()) { + state_.forceWait_ = true; + } + + // Insert the current batch into a list + if (NULL != list) { + cb = new CommandBatch(list, cal()->events_, cal()->lastTS_); + } + + { + //! @note: flushDMA() requires a lock, because GSL can + //! defer destruction of internal memory objects and releases them + //! on GSL flush. If runtime calls another GSL flush at the same time, + //! then double release can occur. + amd::ScopedLock lock(execution()); + for (uint i = 0; i < AllEngines; ++i) { + flushDMA(i); + // Reset event so we won't try to wait again, + // if runtime didn't submit any commands + //! @note: it's safe to invalidate events, since + //! 
we already saved them with the batch creation step above + cal_.events_[i].invalidate(); + } + } + + // Mark last TS as NULL, so runtime won't process empty batches with the old TS + cal_.lastTS_ = NULL; + if (NULL != cb) { + cbList_.push_back(cb); + } + + wait |= state_.forceWait_; + // Loop through all outstanding command batches + while (!cbList_.empty()) { + CommandBatchList::const_iterator it = cbList_.begin(); + // Check if command batch finished without a wait + bool finished = true; + for (uint i = 0; i < AllEngines; ++i) { + finished &= isDone(&(*it)->events_[i]); + } + if (finished || wait) { + // Wait for completion + awaitCompletion(*it); + // Release a command batch + delete *it; + // Remove command batch from the list + cbList_.pop_front(); + } else { + // Early exit if no finished + break; + } + } + state_.forceWait_ = false; +} + +void VirtualGPU::enableSyncedBlit() const { return blitMgr_->enableSynchronization(); } + +void VirtualGPU::releaseMemObjects(bool scratch) { + for (GpuEvents::const_iterator it = gpuEvents_.begin(); it != gpuEvents_.end(); ++it) { + GpuEvent event = it->second; + waitForEvent(&event); + } + // Unbind all resources.So the queue won't have any bound mem objects + for (uint i = 0; i < MaxUavArguments; ++i) { + if (NULL != cal_.uavs_[i]) { + setUAVBuffer(i, 0, GSL_UAV_TYPE_UNKNOWN); + cal_.uavs_[i] = 0; + } + } + for (uint i = 0; i < MaxReadImage; ++i) { + if (NULL != cal_.readImages_[i]) { + setInput(i, 0); + cal_.readImages_[i] = 0; + } + } + for (uint i = 0; i < MaxConstBuffers; ++i) { + if (NULL != cal_.constBuffers_[i]) { + setConstantBuffer(i, 0, 0, 0); + cal_.constBuffers_[i] = 0; + } + } + + if (scratch) { + setScratchBuffer(NULL, 0); + } + + gpuEvents_.clear(); +} + +void VirtualGPU::setGpuEvent(GpuEvent gpuEvent, bool flush) { + cal_.events_[engineID_] = gpuEvent; + + // Flush current DMA buffer if requested + if (flush || GPU_FLUSH_ON_EXECUTION) { + flushDMA(engineID_); + } +} + +void VirtualGPU::flushDMA(uint 
engineID) { + if (engineID == MainEngine) { + // Clear memory dependency state, since runtime flushes compute + // memoryDependency().clear(); + //!@todo Keep memory dependency alive even if we flush DMA, + //! since only L2 cache is flushed in KMD frame, + //! but L1 still has to be invalidated. + } + //! \note Use CtxIsEventDone, so we won't flush compute for DRM engine + isDone(&cal_.events_[engineID]); +} + +bool VirtualGPU::waitAllEngines(CommandBatch* cb) { + uint i; + GpuEvent* events; //!< GPU events for the batch + // If command batch is NULL then wait for the current + if (NULL == cb) { + events = cal_.events_; + } else { + events = cb->events_; + } + + bool earlyDone = true; + // The first loop is to flush all engines and/or check if + // engines are idle already + for (i = 0; i < AllEngines; ++i) { + earlyDone &= isDone(&events[i]); + } + + // Release all transfer buffers on this command queue + releaseXferWrite(); + + // Rlease all pinned memory + releasePinnedMem(); + + // The second loop is to wait all engines + for (i = 0; i < AllEngines; ++i) { + waitForEvent(&events[i]); + } + + return earlyDone; +} + +void VirtualGPU::waitEventLock(CommandBatch* cb) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + bool earlyDone = waitAllEngines(cb); + + // Free resource cache if we have too many entries + //! \note we do it here, when all engines are idle, + // because Vista/Win7 idles GPU on a resource destruction + static const size_t MinCacheEntries = 4096; + dev().resourceCache().free(MinCacheEntries); + + // Find the timestamp object of the last command in the batch + if (cb->lastTS_ != NULL) { + // If earlyDone is TRUE, then CPU didn't wait for GPU. + // Thus the sync point between CPU and GPU is unclear and runtime + // will use an older adjustment value to maintain the same timeline + if (!earlyDone || + //! \note Workaround for APU(s). + //! 
GPU-CPU timelines may go off too much, thus always + //! force calibration with the last batch in the list + (cbList_.size() <= 1) || (readjustTimeGPU_ == 0)) { + uint64_t startTimeStampGPU = 0; + uint64_t endTimeStampGPU = 0; + + // Get the timestamp value of the last command in the batch + cb->lastTS_->value(&startTimeStampGPU, &endTimeStampGPU); + + uint64_t endTimeStampCPU = amd::Os::timeNanos(); + // Make sure the command batch has a valid GPU TS + if (!GPU_RAW_TIMESTAMP) { + // Adjust the base time by the execution time + readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU; + } + } + } +} + +void VirtualGPU::validateScratchBuffer(const Kernel* kernel) { + // Check if a scratch buffer is required + if (dev().scratch(hwRing())->regNum_ > 0) { + // Setup scratch buffer + setScratchBuffer(dev().scratch(hwRing())->memObj_->gslResource(), 0); + } +} + +bool VirtualGPU::setActiveKernelDesc(const amd::NDRangeContainer& sizes, const Kernel* kernel) { + bool result = true; + CALimage calImage = kernel->calImage(); + + GslKernelDesc* desc = gslKernels_[calImage]; + + validateScratchBuffer(kernel); + + // Early exit + if ((activeKernelDesc_ == desc) && (desc != NULL)) { + return result; + } + + // Does the kernel descriptor for this virtual device exist? 
+ if (desc == NULL) { + desc = allocKernelDesc(kernel, calImage); + if (desc == NULL) { + return false; + } + gslKernels_[calImage] = desc; + } + + // Set the descriptor as active + activeKernelDesc_ = desc; + + // Program the samplers defined in the kernel + if (!kernel->setInternalSamplers(*this)) { + result = false; + } + + // Bind global HW constant buffers + if (!kernel->bindGlobalHwCb(*this, desc)) { + result = false; + } + + if (result) { + // Set program in GSL + rs()->setCurrentProgramObject(GSL_COMPUTE_PROGRAM, desc->func_); + + // Update internal constant buffer + if (desc->intCb_ != 0) { + cs()->setIntConstants(GSL_COMPUTE_PROGRAM, desc->intCb_); + } + } + + return result; +} + +bool VirtualGPU::allocConstantBuffers() { + // Allocate/reallocate constant buffers + size_t minCbSize; + // GCN doesn't really have a limit + minCbSize = 128 * Ki; + uint i; + + // Create/reallocate constant buffer resources + for (i = 0; i < MaxConstBuffersArguments; ++i) { + ConstBuffer* constBuf = new ConstBuffer( + *this, ((minCbSize + ConstBuffer::VectorSize - 1) / ConstBuffer::VectorSize)); + + if ((constBuf != NULL) && constBuf->create()) { + addConstBuffer(constBuf); + } else { + // We failed to create a constant buffer + delete constBuf; + return false; + } + } + + return true; +} + +VirtualGPU::GslKernelDesc* VirtualGPU::allocKernelDesc(const Kernel* kernel, CALimage calImage) { + // Sanity checks + assert(kernel != NULL); + GslKernelDesc* desc = new GslKernelDesc; + + if (desc != NULL) { + memset(desc, 0, sizeof(GslKernelDesc)); + + if (kernel->calImage() != calImage) { + desc->image_ = calImage; + } + + if (!moduleLoad(calImage, &desc->func_, &desc->intCb_)) { + LogPrintfError("calModuleLoad failed for \"%s\" kernel!", kernel->name().c_str()); + delete desc; + return NULL; + } + } + + if (kernel->argSize() > slots_.size()) { + slots_.resize(kernel->argSize()); + } + + return desc; +} + +void VirtualGPU::freeKernelDesc(VirtualGPU::GslKernelDesc* desc) { + if (desc) { 
+ if (gslKernelDesc() == desc) { + // Clear active kernel desc + activeKernelDesc_ = NULL; + rs()->setCurrentProgramObject(GSL_COMPUTE_PROGRAM, 0); + } + + if (desc->image_ != 0) { + // Free CAL image + free(desc->image_); + } + + if (desc->func_ != 0) { + if (desc->intCb_ != 0) { + cs()->setIntConstants(GSL_COMPUTE_PROGRAM, 0); + cs()->destroyMemObject(desc->intCb_); + } + cs()->destroyProgramObject(desc->func_); + } + + delete desc; + } +} + +void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) { + // Is profiling enabled? + if (command.profilingInfo().enabled_) { + // Allocate a timestamp object from the cache + TimeStamp* ts = tsCache_->allocTimeStamp(); + if (NULL == ts) { + return; + } + // Save the TimeStamp object in the current OCL event + command.setData(ts); + currTs_ = ts; + profileEnabled_ = true; + } +} + +void VirtualGPU::profilingEnd(amd::Command& command) { + // Get the TimeStamp object associated witht the current command + TimeStamp* ts = reinterpret_cast(command.data()); + if (ts != NULL) { + // Check if the command actually did any GPU submission + if (ts->isValid()) { + cal_.lastTS_ = ts; + } else { + // Destroy the TimeStamp object + tsCache_->freeTimeStamp(ts); + command.setData(NULL); + } + } +} + +bool VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* waitingEvent) { + bool found = false; + amd::Command* current; + amd::Command* first = cb->head_; + + // If the command list is, empty then exit + if (NULL == first) { + return found; + } + + // Wait for the last known GPU events on all engines + waitEventLock(cb); + + // Find the CPU base time of the entire command batch execution + uint64_t endTimeStamp = amd::Os::timeNanos(); + uint64_t startTimeStamp = endTimeStamp; + + // First step, walk the command list to find the first valid command + //! \note The batch may have empty markers at the beginning. + //! So the start/end of the empty commands is equal to + //! 
the start of the first valid command in the batch. + first = cb->head_; + while (NULL != first) { + // Get the TimeStamp object associated witht the current command + TimeStamp* ts = reinterpret_cast(first->data()); + + if (ts != NULL) { + ts->value(&startTimeStamp, &endTimeStamp); + endTimeStamp -= readjustTimeGPU_; + startTimeStamp -= readjustTimeGPU_; + // Assign to endTimeStamp the start of the first valid command + endTimeStamp = startTimeStamp; + break; + } + first = first->getNext(); + } + + // Second step, walk the command list to construct the time line + first = cb->head_; + while (NULL != first) { + // Get the TimeStamp object associated witht the current command + TimeStamp* ts = reinterpret_cast(first->data()); + + current = first->getNext(); + + if (ts != NULL) { + ts->value(&startTimeStamp, &endTimeStamp); + endTimeStamp -= readjustTimeGPU_; + startTimeStamp -= readjustTimeGPU_; + // Destroy the TimeStamp object + tsCache_->freeTimeStamp(ts); + first->setData(NULL); + } else { + // For empty commands start/end is equal to + // the end of the last valid command + startTimeStamp = endTimeStamp; + } + + // Update the command status with the proper timestamps + if (first->status() == CL_SUBMITTED) { + first->setStatus(CL_RUNNING, startTimeStamp); + first->setStatus(CL_COMPLETE, endTimeStamp); + } else if (first->status() == CL_RUNNING) { + first->setStatus(CL_COMPLETE, endTimeStamp); + } else if ((first->status() != CL_COMPLETE) && (current != NULL)) { + LogPrintfError("Unexpected command status - %d!", first->status()); + } + + // Do we wait this event? 
+ if (first == waitingEvent) { + found = true; + } + + first->release(); + first = current; + } + + return found; +} + +bool VirtualGPU::addVmMemory(const Memory* memory) { + uint* cnt = &cal_.memCount_; + (*cnt)++; + // Reallocate array if kernel uses more memory objects + if (numVmMems_ < *cnt) { + gslMemObject* tmp; + tmp = new gslMemObject[*cnt]; + if (tmp == NULL) { + return false; + } + memcpy(tmp, vmMems_, sizeof(gslMemObject) * numVmMems_); + delete[] vmMems_; + vmMems_ = tmp; + numVmMems_ = *cnt; + } + vmMems_[*cnt - 1] = memory->gslResource(); + + return true; +} + +void VirtualGPU::profileEvent(EngineType engine, bool type) const { + if (NULL == currTs_) { + return; + } + if (type) { + currTs_->begin((engine == SdmaEngine) ? true : false); + } else { + currTs_->end((engine == SdmaEngine) ? true : false); + } +} + +bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address params, + bool nativeMem, std::vector* memList) { + static const bool NoAlias = true; + const HSAILKernel& hsaKernel = + static_cast(*(kernel.getDeviceKernel(dev(), NoAlias))); + const amd::KernelSignature& signature = kernel.signature(); + const amd::KernelParameters& kernelParams = kernel.parameters(); + + // Mark the tracker with a new kernel, + // so we can avoid checks of the aliased objects + memoryDependency().newKernel(); + + bool deviceSupportFGS = 0 != dev().isFineGrainedSystem(true); + bool supportFineGrainedSystem = deviceSupportFGS; + FGSStatus status = kernelParams.getSvmSystemPointersSupport(); + switch (status) { + case FGS_YES: + if (!deviceSupportFGS) { + return false; + } + supportFineGrainedSystem = true; + break; + case FGS_NO: + supportFineGrainedSystem = false; + break; + case FGS_DEFAULT: + default: + break; + } + + size_t count = kernelParams.getNumberOfSvmPtr(); + size_t execInfoOffset = kernelParams.getExecInfoOffset(); + bool sync = true; + + amd::Memory* memory = NULL; + // get svm non arugment information + void* const* svmPtrArray = 
reinterpret_cast(params + execInfoOffset); + for (size_t i = 0; i < count; i++) { + memory = amd::SvmManager::FindSvmBuffer(svmPtrArray[i]); + if (NULL == memory) { + if (!supportFineGrainedSystem) { + return false; + } else if (sync) { + flushCUCaches(); + // Clear memory dependency state + const static bool All = true; + memoryDependency().clear(!All); + continue; + } + } else { + Memory* gpuMemory = dev().getGpuMemory(memory); + if (NULL != gpuMemory) { + // Synchronize data with other memory instances if necessary + gpuMemory->syncCacheFromHost(*this); + + const static bool IsReadOnly = false; + // Validate SVM passed in the non argument list + memoryDependency().validate(*this, gpuMemory, IsReadOnly); + + // Mark signal write for cache coherency, + // since this object isn't a part of kernel arg setup + if ((memory->getMemFlags() & CL_MEM_READ_ONLY) == 0) { + memory->signalWrite(&dev()); + } + + memList->push_back(gpuMemory); + } else { + return false; + } + } + } + + // Check all parameters for the current kernel + for (size_t i = 0; i < signature.numParameters(); ++i) { + const amd::KernelParameterDescriptor& desc = signature.at(i); + const HSAILKernel::Argument* arg = hsaKernel.argument(i); + Memory* memory = NULL; + bool readOnly = false; + amd::Memory* svmMem = NULL; + + // Find if current argument is a buffer + if ((desc.type_ == T_POINTER) && (arg->addrQual_ != HSAIL_ADDRESS_LOCAL)) { + if (kernelParams.boundToSvmPointer(dev(), params, i)) { + svmMem = + amd::SvmManager::FindSvmBuffer(*reinterpret_cast(params + desc.offset_)); + if (!svmMem) { + flushCUCaches(); + // Clear memory dependency state + const static bool All = true; + memoryDependency().clear(!All); + continue; + } + } + + if (nativeMem) { + memory = *reinterpret_cast(params + desc.offset_); + } else if (*reinterpret_cast(params + desc.offset_) != NULL) { + if (NULL == svmMem) { + memory = + dev().getGpuMemory(*reinterpret_cast(params + desc.offset_)); + } else { + memory = 
dev().getGpuMemory(svmMem); + } + // Synchronize data with other memory instances if necessary + memory->syncCacheFromHost(*this); + } + + if (memory != NULL) { + // Check image + readOnly = (desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_ONLY) ? true : false; + // Check buffer + readOnly |= (arg->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false; + // Validate memory for a dependency in the queue + memoryDependency().validate(*this, memory, readOnly); + } + } + } + + for (gpu::Memory* mem : hsaKernel.prog().globalStores()) { + const static bool IsReadOnly = false; + // Validate global store for a dependency in the queue + memoryDependency().validate(*this, mem, IsReadOnly); + } + + return true; +} + +amd::Memory* VirtualGPU::createBufferFromImage(amd::Memory& amdImage) const { + amd::Memory* mem = new (amdImage.getContext()) amd::Buffer(amdImage, 0, 0, amdImage.getSize()); + + if ((mem != NULL) && !mem->create()) { + mem->release(); + } + + return mem; +} + +void VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable) { + const static bool Wait = true; + vqHeader_->kernel_table = kernelTable; + virtualQueue_->writeRawData(hostQ, sizeof(AmdVQueueHeader), vqHeader_, !Wait); +} + +void VirtualGPU::flushCuCaches(HwDbgGpuCacheMask cache_mask) { + //! 
@todo: fix issue of no event available for the flush/invalidate cache command + InvalidateSqCaches(cache_mask.sqICache_, cache_mask.sqKCache_, cache_mask.tcL1_, + cache_mask.tcL2_); + + flushDMA(engineID_); + + return; +} + +void VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel, hsa_kernel_dispatch_packet_t* aqlPkt, + HwDbgKernelInfo& kernelInfo, amd::Event* enqueueEvent) { + amd::HwDebugManager* dbgManager = dev().hwDebugMgr(); + assert(dbgManager && "No HW Debug Manager!"); + + // Initialize structure with default values + + if (hsaKernel.prog().maxScratchRegs() > 0) { + gpu::Memory* scratchBuf = dev().scratch(hwRing())->memObj_; + kernelInfo.scratchBufAddr = scratchBuf->vmAddress(); + kernelInfo.scratchBufferSizeInBytes = scratchBuf->size(); + + // Get the address of the scratch buffer and its size for CPU access + address scratchRingAddr = NULL; + scratchRingAddr = static_cast
(scratchBuf->map(NULL, 0)); + dbgManager->setScratchRing(scratchRingAddr, scratchBuf->size()); + scratchBuf->unmap(NULL); + } else { + kernelInfo.scratchBufAddr = 0; + kernelInfo.scratchBufferSizeInBytes = 0; + dbgManager->setScratchRing(NULL, 0); + } + + + //! @todo: need to verify what is wanted for the global memory + kernelInfo.heapBufAddr = (dev().globalMem()).vmAddress(); + + kernelInfo.pAqlDispatchPacket = aqlPkt; + kernelInfo.pAqlQueuePtr = reinterpret_cast(hsaQueueMem_->vmAddress()); + + // Get the address of the kernel code and its size for CPU access + gpu::Memory* aqlCode = hsaKernel.gpuAqlCode(); + if (NULL != aqlCode) { + address aqlCodeAddr = static_cast
(aqlCode->map(NULL, 0)); + dbgManager->setKernelCodeInfo(aqlCodeAddr, hsaKernel.aqlCodeSize()); + aqlCode->unmap(NULL); + } else { + dbgManager->setKernelCodeInfo(NULL, 0); + } + + kernelInfo.trapPresent = false; + kernelInfo.trapHandler = NULL; + kernelInfo.trapHandlerBuffer = NULL; + + kernelInfo.excpEn = 0; + kernelInfo.cacheDisableMask = 0; + kernelInfo.sqDebugMode = 0; + + kernelInfo.mgmtSe0Mask = 0xFFFFFFFF; + kernelInfo.mgmtSe1Mask = 0xFFFFFFFF; + + // set kernel info for HW debug and call the callback function + if (NULL != dbgManager->preDispatchCallBackFunc()) { + DebugToolInfo dbgSetting = {0}; + dbgSetting.scratchAddress_ = kernelInfo.scratchBufAddr; + dbgSetting.scratchSize_ = kernelInfo.scratchBufferSizeInBytes; + dbgSetting.globalAddress_ = kernelInfo.heapBufAddr; + dbgSetting.aclBinary_ = hsaKernel.prog().binaryElf(); + dbgSetting.event_ = enqueueEvent; + + // Call the predispatch callback function & set the trap info + AqlCodeInfo aqlCodeInfo; + aqlCodeInfo.aqlCode_ = (amd_kernel_code_t*)hsaKernel.cpuAqlCode(); + aqlCodeInfo.aqlCodeSize_ = hsaKernel.aqlCodeSize(); + + // Execute the pre-dispatch call back function + dbgManager->executePreDispatchCallBack(reinterpret_cast(aqlPkt), &dbgSetting); + + // assign the debug TMA and TBA for kernel dispatch + if (NULL != dbgSetting.trapHandler_ && NULL != dbgSetting.trapBuffer_) { + assignDebugTrapHandler(dbgSetting, kernelInfo); + } + + kernelInfo.trapPresent = (kernelInfo.trapHandler) ? true : false; + + // Execption policy + kernelInfo.excpEn = dbgSetting.exceptionMask_; + kernelInfo.cacheDisableMask = dbgSetting.cacheDisableMask_; + kernelInfo.sqDebugMode = dbgSetting.gpuSingleStepMode_; + + // Compute the mask for reserved CUs. These two dwords correspond to + // two registers used for reserving CUs for display. In the current + // implementation, the number of CUs reserved can be 0 to 7, and it + // is set by debugger users. 
+ if (dbgSetting.monitorMode_) { + uint32_t i = dbgSetting.reservedCuNum_ / 2; + kernelInfo.mgmtSe0Mask <<= i; + i = dbgSetting.reservedCuNum_ - i; + kernelInfo.mgmtSe1Mask <<= i; + } + + // flush/invalidate the instruction, data, L1 and L2 caches + InvalidateSqCaches(); + } +} + +void VirtualGPU::assignDebugTrapHandler(const DebugToolInfo& dbgSetting, + HwDbgKernelInfo& kernelInfo) { + // setup the runtime trap handler code and trap buffer to be assigned before kernel dispatching + // + Memory* rtTrapHandlerMem = static_cast(dev().hwDebugMgr()->runtimeTBA()); + Memory* rtTrapBufferMem = static_cast(dev().hwDebugMgr()->runtimeTMA()); + + kernelInfo.trapHandler = reinterpret_cast(rtTrapHandlerMem->vmAddress() + TbaStartOffset); + // With the TMA corruption hw bug workaround, the trap handler buffer can be set to zero. + // However, by setting the runtime trap buffer (TMA) correct, the runtime trap hander + // without the workaround can still function correctly. + kernelInfo.trapHandlerBuffer = reinterpret_cast(rtTrapBufferMem->vmAddress()); + + address rtTrapBufferAddress = static_cast
(rtTrapBufferMem->map(this)); + + Memory* trapHandlerMem = dev().getGpuMemory(dbgSetting.trapHandler_); + Memory* trapBufferMem = dev().getGpuMemory(dbgSetting.trapBuffer_); + + // Address of the trap handler code/buffer should be 256-byte aligned + uint64_t tbaAddress = trapHandlerMem->vmAddress(); + uint64_t tmaAddress = trapBufferMem->vmAddress(); + if ((tbaAddress & 0xFF) != 0 || (tmaAddress & 0xFF) != 0) { + assert(false && "Trap handler/buffer is not 256-byte aligned"); + } + + // The addresses of the debug trap handler code (TBA) and buffer (TMA) are + // stored in the runtime trap handler buffer with offset location of 0x18-19 + // and 0x20-21, respectively. + uint64_t* rtTmaPtr = reinterpret_cast(rtTrapBufferAddress + 0x18); + rtTmaPtr[0] = tbaAddress; + rtTmaPtr[1] = tmaAddress; + + rtTrapBufferMem->unmap(NULL); + + // Add GSL handle to the memory list for VidMM + addVmMemory(trapHandlerMem); + addVmMemory(trapBufferMem); + addVmMemory(rtTrapHandlerMem); + addVmMemory(rtTrapBufferMem); +} + +void VirtualGPU::submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd) { + size_t copySize = cmd.size()[0]; + size_t fileOffset = cmd.fileOffset(); + Memory* mem = dev().getGpuMemory(&cmd.memory()); + uint idx = 0; + + assert((cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD) || + (cmd.type() == CL_COMMAND_WRITE_SSG_FILE_AMD)); + const bool writeBuffer(cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD); + + if (writeBuffer) { + size_t dstOffset = cmd.origin()[0]; + while (copySize > 0) { + Memory* staging = dev().getGpuMemory(&cmd.staging(idx)); + size_t dstSize = amd::TransferBufferFileCommand::StagingBufferSize; + dstSize = std::min(dstSize, copySize); + void* dstBuffer = staging->cpuMap(*this); + if (!cmd.file()->transferBlock(writeBuffer, dstBuffer, staging->size(), fileOffset, 0, + dstSize)) { + cmd.setStatus(CL_INVALID_OPERATION); + return; + } + staging->cpuUnmap(*this); + + bool result = blitMgr().copyBuffer(*staging, *mem, 0, dstOffset, dstSize, false); + 
flushDMA(getGpuEvent(staging->gslResource())->engineId_); + fileOffset += dstSize; + dstOffset += dstSize; + copySize -= dstSize; + } + } else { + size_t srcOffset = cmd.origin()[0]; + while (copySize > 0) { + Memory* staging = dev().getGpuMemory(&cmd.staging(idx)); + size_t srcSize = amd::TransferBufferFileCommand::StagingBufferSize; + srcSize = std::min(srcSize, copySize); + bool result = blitMgr().copyBuffer(*mem, *staging, srcOffset, 0, srcSize, false); + + void* srcBuffer = staging->cpuMap(*this); + if (!cmd.file()->transferBlock(writeBuffer, srcBuffer, staging->size(), fileOffset, 0, + srcSize)) { + cmd.setStatus(CL_INVALID_OPERATION); + return; + } + staging->cpuUnmap(*this); + + fileOffset += srcSize; + srcOffset += srcSize; + copySize -= srcSize; + } + } +} + +} // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpuvirtual.hpp b/rocclr/runtime/device/gpu/gpuvirtual.hpp index e3cce21d2f..b5d51dd6b0 100644 --- a/rocclr/runtime/device/gpu/gpuvirtual.hpp +++ b/rocclr/runtime/device/gpu/gpuvirtual.hpp @@ -34,515 +34,481 @@ class ThreadTrace; class HSAILKernel; //! Virtual GPU -class VirtualGPU : public device::VirtualDevice, public CALGSLContext -{ -public: - struct CommandBatch : public amd::HeapObject - { - amd::Command* head_; //!< Command batch head - GpuEvent events_[AllEngines]; //!< Last known GPU events - TimeStamp* lastTS_; //!< TS associated with command batch +class VirtualGPU : public device::VirtualDevice, public CALGSLContext { + public: + struct CommandBatch : public amd::HeapObject { + amd::Command* head_; //!< Command batch head + GpuEvent events_[AllEngines]; //!< Last known GPU events + TimeStamp* lastTS_; //!< TS associated with command batch - //! Constructor - CommandBatch( - amd::Command* head, //!< Command batch head - const GpuEvent* events, //!< HW events on all engines - TimeStamp* lastTS //!< Last TS in command batch - ): head_(head), lastTS_(lastTS) - { - memcpy(&events_, events, AllEngines * sizeof(GpuEvent)); - } + //! 
Constructor + CommandBatch(amd::Command* head, //!< Command batch head + const GpuEvent* events, //!< HW events on all engines + TimeStamp* lastTS //!< Last TS in command batch + ) + : head_(head), lastTS_(lastTS) { + memcpy(&events_, events, AllEngines * sizeof(GpuEvent)); + } + }; + + //! The virtual GPU states + union State { + struct { + uint boundGlobal_ : 1; //!< Global buffer was bound + uint profiling_ : 1; //!< Profiling is enabled + uint forceWait_ : 1; //!< Forces wait in flush() + uint boundCb_ : 1; //!< Constant buffer was bound + uint boundPrintf_ : 1; //!< Printf buffer was bound + uint hsailKernel_ : 1; //!< True if HSAIL kernel was used + }; + uint value_; + State() : value_(0) {} + }; + + //! CAL descriptor for the GPU virtual device + struct CalVirtualDesc : public amd::EmbeddedObject { + gslDomain3D gridBlock; //!< size of a block of data + gslDomain3D gridSize; //!< size of 'blocks' to execute + gslDomain3D partialGridBlock; //!< Partial grid block + CALuint localSize; //!< size of OpenCL Local Memory in bytes + uint memCount_; //!< Memory objects count + GpuEvent events_[AllEngines]; //!< Last known GPU events + uint iterations_; //!< Number of iterations for the execution + TimeStamp* lastTS_; //!< Last timestamp executed on Virtual GPU + gslMemObject constBuffers_[MaxConstBuffers]; //!< Constant buffer names + gslMemObject uavs_[MaxUavArguments]; //!< UAV bindings + gslMemObject readImages_[MaxReadImage]; //!< Read images + uint32_t samplersState_[MaxSamplers]; //!< State of all samplers + }; + + typedef std::vector constbufs_t; + + //! 
GSL descriptor for the GPU kernel, specific to the virtual device + struct GslKernelDesc : public amd::HeapObject { + CALimage image_; //!< CAL image for the program + gslProgramObject func_; //!< GSL program object + gslMemObject intCb_; //!< Internal constant buffer + }; + + struct ResourceSlot { + union State { + struct { + uint bound_ : 1; //!< Resource is bound + uint constant_ : 1; //!< Resource is a constant + }; + uint value_; + State() : value_(0) {} }; - //! The virtual GPU states - union State - { - struct - { - uint boundGlobal_ : 1; //!< Global buffer was bound - uint profiling_ : 1; //!< Profiling is enabled - uint forceWait_ : 1; //!< Forces wait in flush() - uint boundCb_ : 1; //!< Constant buffer was bound - uint boundPrintf_ : 1; //!< Printf buffer was bound - uint hsailKernel_ : 1; //!< True if HSAIL kernel was used - }; - uint value_; - State(): value_(0) {} + State state_; //!< slot's state + const Memory* memory_; //!< GPU memory object + + ResourceSlot() : memory_(NULL) {} + + //! Copy constructor for the kernel argument + ResourceSlot(const ResourceSlot& data) { *this = data; } + + //! Overloads operator= + ResourceSlot& operator=(const ResourceSlot& data) { + state_.value_ = data.state_.value_; + memory_ = data.memory_; + return *this; + } + }; + + class MemoryDependency : public amd::EmbeddedObject { + public: + //! Default constructor + MemoryDependency() + : memObjectsInQueue_(NULL), numMemObjectsInQueue_(0), maxMemObjectsInQueue_(0) {} + + ~MemoryDependency() { delete[] memObjectsInQueue_; } + + //! Creates memory dependecy structure + bool create(size_t numMemObj); + + //! Notify the tracker about new kernel + void newKernel() { endMemObjectsInQueue_ = numMemObjectsInQueue_; } + + //! Validates memory object on dependency + void validate(VirtualGPU& gpu, const Memory* memory, bool readOnly); + + //! Clear memory dependency + void clear(bool all = true); + + private: + struct MemoryState { + uint64_t start_; //! 
Busy memory start address + uint64_t end_; //! Busy memory end address + bool readOnly_; //! Current GPU state in the queue }; - //! CAL descriptor for the GPU virtual device - struct CalVirtualDesc : public amd::EmbeddedObject - { - gslDomain3D gridBlock; //!< size of a block of data - gslDomain3D gridSize; //!< size of 'blocks' to execute - gslDomain3D partialGridBlock;//!< Partial grid block - CALuint localSize; //!< size of OpenCL Local Memory in bytes - uint memCount_; //!< Memory objects count - GpuEvent events_[AllEngines]; //!< Last known GPU events - uint iterations_; //!< Number of iterations for the execution - TimeStamp* lastTS_; //!< Last timestamp executed on Virtual GPU - gslMemObject constBuffers_[MaxConstBuffers];//!< Constant buffer names - gslMemObject uavs_[MaxUavArguments]; //!< UAV bindings - gslMemObject readImages_[MaxReadImage]; //!< Read images - uint32_t samplersState_[MaxSamplers]; //!< State of all samplers - }; - - typedef std::vector constbufs_t; - - //! GSL descriptor for the GPU kernel, specific to the virtual device - struct GslKernelDesc : public amd::HeapObject - { - CALimage image_; //!< CAL image for the program - gslProgramObject func_; //!< GSL program object - gslMemObject intCb_; //!< Internal constant buffer - }; - - struct ResourceSlot - { - union State - { - struct - { - uint bound_ : 1; //!< Resource is bound - uint constant_ : 1; //!< Resource is a constant - }; - uint value_; - State(): value_(0) {} - }; - - State state_; //!< slot's state - const Memory* memory_; //!< GPU memory object - - ResourceSlot(): memory_(NULL) {} - - //! Copy constructor for the kernel argument - ResourceSlot(const ResourceSlot& data) { *this = data; } - - //! Overloads operator= - ResourceSlot& operator=(const ResourceSlot& data) - { - state_.value_ = data.state_.value_; - memory_ = data.memory_; - return *this; - } - }; - - class MemoryDependency : public amd::EmbeddedObject - { - public: - //! 
Default constructor - MemoryDependency() - : memObjectsInQueue_(NULL) - , numMemObjectsInQueue_(0) - , maxMemObjectsInQueue_(0) {} - - ~MemoryDependency() { delete [] memObjectsInQueue_; } - - //! Creates memory dependecy structure - bool create(size_t numMemObj); - - //! Notify the tracker about new kernel - void newKernel() { endMemObjectsInQueue_ = numMemObjectsInQueue_; } - - //! Validates memory object on dependency - void validate(VirtualGPU& gpu, const Memory* memory, bool readOnly); - - //! Clear memory dependency - void clear(bool all = true); - - private: - struct MemoryState { - uint64_t start_; //! Busy memory start address - uint64_t end_; //! Busy memory end address - bool readOnly_; //! Current GPU state in the queue - }; - - MemoryState* memObjectsInQueue_; //!< Memory object state in the queue - size_t endMemObjectsInQueue_; //!< End of mem objects in the queue - size_t numMemObjectsInQueue_; //!< Number of mem objects in the queue - size_t maxMemObjectsInQueue_; //!< Maximum number of mem objects in the queue - }; - - - class DmaFlushMgmt : public amd::EmbeddedObject - { - public: - DmaFlushMgmt(const Device& dev); - - // Resets DMA command buffer workload - void resetCbWorkload(const Device& dev); - - // Finds split size for the current dispatch - void findSplitSize( - const Device& dev, //!< GPU device object - uint64_t threads, //!< Total number of execution threads - uint instructions //!< Number of ALU instructions - ); - - // Returns TRUE if DMA command buffer is ready for a flush - bool isCbReady( - VirtualGPU& gpu, //!< Virtual GPU object - uint64_t threads, //!< Total number of execution threads - uint instructions //!< Number of ALU instructions - ); - - // Returns dispatch split size - uint dispatchSplitSize() const { return dispatchSplitSize_; } - - private: - uint64_t maxDispatchWorkload_; //!< Maximum number of operations for a single dispatch - uint64_t maxCbWorkload_; //!< Maximum number of operations for DMA command buffer - 
uint64_t cbWorkload_; //!< Current number of operations in DMA command buffer - uint aluCnt_; //!< All ALUs on the chip - uint dispatchSplitSize_; //!< Dispath split size in elements - }; - - typedef std::vector ResourceSlots; - -public: - VirtualGPU(Device& device); - bool create(bool profiling, uint rtCUs = amd::CommandQueue::RealTimeDisabled, - uint deviceQueueSize = 0, - amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal); - ~VirtualGPU(); - - void submitReadMemory(amd::ReadMemoryCommand& vcmd); - void submitWriteMemory(amd::WriteMemoryCommand& vcmd); - void submitCopyMemory(amd::CopyMemoryCommand& vcmd); - void submitMapMemory(amd::MapMemoryCommand& vcmd); - void submitUnmapMemory(amd::UnmapMemoryCommand& vcmd); - void submitKernel(amd::NDRangeKernelCommand& vcmd); - bool submitKernelInternal( - const amd::NDRangeContainer& sizes, //!< Workload sizes - const amd::Kernel& kernel, //!< Kernel for execution - const_address parameters, //!< Parameters for the kernel - bool nativeMem = true, //!< Native memory objects - amd::Event* enqueueEvent = NULL //!< Event provided in the enqueue kernel command - ); - bool submitKernelInternalHSA( - const amd::NDRangeContainer& sizes, //!< Workload sizes - const amd::Kernel& kernel, //!< Kernel for execution - const_address parameters, //!< Parameters for the kernel - bool nativeMem = true, //!< Native memory objects - amd::Event* enqueueEvent = NULL //!< Event provided in the enqueue kernel command - ); - void submitNativeFn(amd::NativeFnCommand& vcmd); - void submitFillMemory(amd::FillMemoryCommand& vcmd); - void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd); - void submitMarker(amd::Marker& vcmd); - void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd); - void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd); - void submitPerfCounter(amd::PerfCounterCommand& vcmd); - void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd); - void 
submitThreadTrace(amd::ThreadTraceCommand& vcmd); - void submitSignal(amd::SignalCommand & vcmd); - void submitMakeBuffersResident(amd::MakeBuffersResidentCommand & vcmd); - virtual void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd); - virtual void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd); - virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd); - virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd); - virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd); - virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd); - - void releaseMemory(gslMemObject gslResource, bool wait = true); - void releaseKernel(CALimage calImage); - - void flush(amd::Command* list = NULL, bool wait = false); - bool terminate() { return true; } - - //! Returns GPU device object associated with this kernel - const Device& dev() const { return gpuDevice_; } - - //! Returns CAL descriptor of the virtual device - const CalVirtualDesc* cal() const { return &cal_; } - - //! Returns active kernel descriptor for this virtual device - const GslKernelDesc* gslKernelDesc() const { return activeKernelDesc_; } - - //! Returns a GPU event, associated with GPU memory - GpuEvent* getGpuEvent( - const gslMemObject gslMem //!< GSL mem object - ) { return &gpuEvents_[gslMem]; } - - //! Assigns a GPU event, associated with GPU memory - void assignGpuEvent( - const gslMemObject gslMem, //!< GSL mem object - GpuEvent gpuEvent - ) { gpuEvents_[gslMem] = gpuEvent; } - - //! Set the kernel as active - bool setActiveKernelDesc( - const amd::NDRangeContainer& sizes, //!< kernel execution work sizes - const Kernel* kernel //!< GPU kernel object - ); - - //! Set the last known GPU event - void setGpuEvent( - GpuEvent gpuEvent, //!< GPU event for tracking - bool flush = false //!< TRUE if flush is required - ); - - //! Flush DMA buffer on the specified engine - void flushDMA( - uint engineID //!< Engine ID for DMA flush - ); - - //! 
Wait for all engines on this Virtual GPU - //! Returns TRUE if CPU didn't wait for GPU - bool waitAllEngines( - CommandBatch* cb = NULL //!< Command batch - ); - - //! Waits for the latest GPU event with a lock to prevent multiple entries - void waitEventLock( - CommandBatch* cb //!< Command batch - ); - - //! Returns a resource associated with the constant buffer - const ConstBuffer* cb(uint idx) const { return constBufs_[idx]; } - - //! Adds CAL objects into the constant buffer vector - void addConstBuffer(ConstBuffer* cb) { constBufs_.push_back(cb); } - - constbufs_t constBufs_; //!< constant buffers - - //! Start the command profiling - void profilingBegin( - amd::Command& command, //!< Command queue object - bool drmProfiling = false //!< Measure DRM time - ); - - //! End the command profiling - void profilingEnd(amd::Command& command); - - //! Collect the profiling results - bool profilingCollectResults( - CommandBatch* cb, //!< Command batch - const amd::Event* waitingEvent //!< Waiting event - ); - - //! Adds a memory handle into the GSL memory array for Virtual Heap - bool addVmMemory( - const Memory* memory //!< GPU memory object - ); - - //! Adds a stage write buffer into a list - void addXferWrite(Memory& memory); - - //! Adds a pinned memory object into a map - void addPinnedMem(amd::Memory* mem); - - //! Release pinned memory objects - void releasePinnedMem(); - - //! Finds if pinned memory is cached - amd::Memory* findPinnedMem(void* addr, size_t size); - - //! Returns gsl memory object for VM - const gslMemObject* vmMems() const { return vmMems_; } - - //! 
Returns the monitor object for execution access by VirtualGPU - amd::Monitor& execution() { return execution_; } + MemoryState* memObjectsInQueue_; //!< Memory object state in the queue + size_t endMemObjectsInQueue_; //!< End of mem objects in the queue + size_t numMemObjectsInQueue_; //!< Number of mem objects in the queue + size_t maxMemObjectsInQueue_; //!< Maximum number of mem objects in the queue + }; + + + class DmaFlushMgmt : public amd::EmbeddedObject { + public: + DmaFlushMgmt(const Device& dev); + + // Resets DMA command buffer workload + void resetCbWorkload(const Device& dev); + + // Finds split size for the current dispatch + void findSplitSize(const Device& dev, //!< GPU device object + uint64_t threads, //!< Total number of execution threads + uint instructions //!< Number of ALU instructions + ); + + // Returns TRUE if DMA command buffer is ready for a flush + bool isCbReady(VirtualGPU& gpu, //!< Virtual GPU object + uint64_t threads, //!< Total number of execution threads + uint instructions //!< Number of ALU instructions + ); + + // Returns dispatch split size + uint dispatchSplitSize() const { return dispatchSplitSize_; } + + private: + uint64_t maxDispatchWorkload_; //!< Maximum number of operations for a single dispatch + uint64_t maxCbWorkload_; //!< Maximum number of operations for DMA command buffer + uint64_t cbWorkload_; //!< Current number of operations in DMA command buffer + uint aluCnt_; //!< All ALUs on the chip + uint dispatchSplitSize_; //!< Dispath split size in elements + }; + + typedef std::vector ResourceSlots; + + public: + VirtualGPU(Device& device); + bool create(bool profiling, uint rtCUs = amd::CommandQueue::RealTimeDisabled, + uint deviceQueueSize = 0, + amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal); + ~VirtualGPU(); + + void submitReadMemory(amd::ReadMemoryCommand& vcmd); + void submitWriteMemory(amd::WriteMemoryCommand& vcmd); + void submitCopyMemory(amd::CopyMemoryCommand& vcmd); + void 
submitMapMemory(amd::MapMemoryCommand& vcmd); + void submitUnmapMemory(amd::UnmapMemoryCommand& vcmd); + void submitKernel(amd::NDRangeKernelCommand& vcmd); + bool submitKernelInternal( + const amd::NDRangeContainer& sizes, //!< Workload sizes + const amd::Kernel& kernel, //!< Kernel for execution + const_address parameters, //!< Parameters for the kernel + bool nativeMem = true, //!< Native memory objects + amd::Event* enqueueEvent = NULL //!< Event provided in the enqueue kernel command + ); + bool submitKernelInternalHSA( + const amd::NDRangeContainer& sizes, //!< Workload sizes + const amd::Kernel& kernel, //!< Kernel for execution + const_address parameters, //!< Parameters for the kernel + bool nativeMem = true, //!< Native memory objects + amd::Event* enqueueEvent = NULL //!< Event provided in the enqueue kernel command + ); + void submitNativeFn(amd::NativeFnCommand& vcmd); + void submitFillMemory(amd::FillMemoryCommand& vcmd); + void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd); + void submitMarker(amd::Marker& vcmd); + void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd); + void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd); + void submitPerfCounter(amd::PerfCounterCommand& vcmd); + void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd); + void submitThreadTrace(amd::ThreadTraceCommand& vcmd); + void submitSignal(amd::SignalCommand& vcmd); + void submitMakeBuffersResident(amd::MakeBuffersResidentCommand& vcmd); + virtual void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd); + virtual void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd); + virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd); + virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd); + virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd); + virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd); + + void releaseMemory(gslMemObject gslResource, bool wait = true); + 
void releaseKernel(CALimage calImage); + + void flush(amd::Command* list = NULL, bool wait = false); + bool terminate() { return true; } + + //! Returns GPU device object associated with this kernel + const Device& dev() const { return gpuDevice_; } + + //! Returns CAL descriptor of the virtual device + const CalVirtualDesc* cal() const { return &cal_; } + + //! Returns active kernel descriptor for this virtual device + const GslKernelDesc* gslKernelDesc() const { return activeKernelDesc_; } + + //! Returns a GPU event, associated with GPU memory + GpuEvent* getGpuEvent(const gslMemObject gslMem //!< GSL mem object + ) { + return &gpuEvents_[gslMem]; + } + + //! Assigns a GPU event, associated with GPU memory + void assignGpuEvent(const gslMemObject gslMem, //!< GSL mem object + GpuEvent gpuEvent) { + gpuEvents_[gslMem] = gpuEvent; + } + + //! Set the kernel as active + bool setActiveKernelDesc(const amd::NDRangeContainer& sizes, //!< kernel execution work sizes + const Kernel* kernel //!< GPU kernel object + ); + + //! Set the last known GPU event + void setGpuEvent(GpuEvent gpuEvent, //!< GPU event for tracking + bool flush = false //!< TRUE if flush is required + ); + + //! Flush DMA buffer on the specified engine + void flushDMA(uint engineID //!< Engine ID for DMA flush + ); + + //! Wait for all engines on this Virtual GPU + //! Returns TRUE if CPU didn't wait for GPU + bool waitAllEngines(CommandBatch* cb = NULL //!< Command batch + ); + + //! Waits for the latest GPU event with a lock to prevent multiple entries + void waitEventLock(CommandBatch* cb //!< Command batch + ); + + //! Returns a resource associated with the constant buffer + const ConstBuffer* cb(uint idx) const { return constBufs_[idx]; } + + //! Adds CAL objects into the constant buffer vector + void addConstBuffer(ConstBuffer* cb) { constBufs_.push_back(cb); } + + constbufs_t constBufs_; //!< constant buffers + + //! 
Start the command profiling + void profilingBegin(amd::Command& command, //!< Command queue object + bool drmProfiling = false //!< Measure DRM time + ); + + //! End the command profiling + void profilingEnd(amd::Command& command); + + //! Collect the profiling results + bool profilingCollectResults(CommandBatch* cb, //!< Command batch + const amd::Event* waitingEvent //!< Waiting event + ); + + //! Adds a memory handle into the GSL memory array for Virtual Heap + bool addVmMemory(const Memory* memory //!< GPU memory object + ); + + //! Adds a stage write buffer into a list + void addXferWrite(Memory& memory); + + //! Adds a pinned memory object into a map + void addPinnedMem(amd::Memory* mem); + + //! Release pinned memory objects + void releasePinnedMem(); + + //! Finds if pinned memory is cached + amd::Memory* findPinnedMem(void* addr, size_t size); + + //! Returns gsl memory object for VM + const gslMemObject* vmMems() const { return vmMems_; } - //! Returns the virtual gpu unique index - uint index() const { return index_; } + //! Returns the monitor object for execution access by VirtualGPU + amd::Monitor& execution() { return execution_; } - //! Get the PrintfDbg object - PrintfDbg& printfDbg() const { return *printfDbg_; } + //! Returns the virtual gpu unique index + uint index() const { return index_; } - //! Get the PrintfDbgHSA object - PrintfDbgHSA& printfDbgHSA() const { return *printfDbgHSA_; } + //! Get the PrintfDbg object + PrintfDbg& printfDbg() const { return *printfDbg_; } - //! Enables synchronized transfers - void enableSyncedBlit() const; - - //! Checks if profiling is enabled - bool profiling() const { return state_.profiling_; } - - //! Returns memory dependency class - MemoryDependency& memoryDependency() { return memoryDependency_; } - - //! Returns hsaQueueMem_ - const Memory* hsaQueueMem() const { return hsaQueueMem_;} - - //! 
Returns DMA flush management structure - const DmaFlushMgmt& dmaFlushMgmt() const { return dmaFlushMgmt_; } - - //! Releases GSL memory objects allocated on this queue - void releaseMemObjects(bool scratch = true); - - //! Returns the HW ring used on this virtual device - uint hwRing() const { return hwRing_; } - - //! Returns current timestamp object for profiling - TimeStamp* currTs() const { return cal_.lastTS_; } - - //! Returns virtual queue object for device enqueuing - Memory* vQueue() const { return virtualQueue_; } - - //! Update virtual queue header - void writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable); - - //! Returns TRUE if virtual queue was successfully allocatted - bool createVirtualQueue( - uint deviceQueueSize //!< Device queue size - ); - - EngineType engineID_; //!< Engine ID for this VirtualGPU - ResourceSlots slots_; //!< Resource slots for kernel arguments - State state_; //!< virtual GPU current state - CalVirtualDesc cal_; //!< CAL virtual device descriptor - - void flushCuCaches(HwDbgGpuCacheMask cache_mask); //!< flush/invalidate SQ cache - -protected: - virtual void profileEvent(EngineType engine, bool type) const; - - //! Creates buffer object from image - amd::Memory* createBufferFromImage( - amd::Memory& amdImage //! The parent image object(untiled images only) - ) const; - -private: - typedef std::map GslKernels; - typedef std::map GpuEvents; - - //! Finds total amount of necessary iterations - inline void findIterations( - const amd::NDRangeContainer& sizes, //!< Original workload sizes - const amd::NDRange& local, //!< Local workgroup size - amd::NDRange& groups, //!< Calculated workgroup sizes - amd::NDRange& remainder, //!< Calculated remainder sizes - size_t& extra //!< Amount of extra executions for remainder - ); - - //! 
Setups workloads for the current iteration - inline void setupIteration( - uint iteration, //!< Current iteration - const amd::NDRangeContainer& sizes, //!< Original workload sizes - Kernel& gpuKernel, //!< GPU kernel - amd::NDRange& global, //!< Global size for the current iteration - amd::NDRange& offsets, //!< Offsets for the current iteration - amd::NDRange& local, //!< Local sizes for the current iteration - amd::NDRange& groups, //!< Group sizes for the current iteration - amd::NDRange& groupOffset, //!< Group offsets for the current iteration - amd::NDRange& divider, //!< Group divider - amd::NDRange& remainder, //!< Remain workload - size_t extra //!< Extra groups - ); - - //! Allocates constant buffers - bool allocConstantBuffers(); - - //! Allocates CAL kernel descriptor of the virtual device - GslKernelDesc* allocKernelDesc( - const Kernel* kernel, //!< Kernel object - CALimage calImage); //!< CAL image - - //! Frees CAL kernel descriptor of the virtual device - void freeKernelDesc(GslKernelDesc* desc); - - bool gslOpen(uint nEngines, gslEngineDescriptor *engines, uint32_t rtCUs); - void gslDestroy(); - - //! Releases stage write buffers - void releaseXferWrite(); - - //! Allocate hsaQueueMem_ - bool allocHsaQueueMem(); - - //! Awaits a command batch with a waiting event - bool awaitCompletion( - CommandBatch* cb, //!< Command batch for to wait - const amd::Event* waitingEvent = NULL //!< A waiting event - ); - - //! Validates the scratch buffer memory for a specified kernel - void validateScratchBuffer( - const Kernel* kernel //!< Kernel for validaiton - ); - - //! Detects memory dependency for HSAIL kernels and flushes caches - bool processMemObjectsHSA( - const amd::Kernel& kernel, //!< AMD kernel object for execution - const_address params, //!< Pointer to the param's store - bool nativeMem, //!< Native memory objects - std::vector* memList //!< Memory list for KMD tracking - ); - - //! 
Common function for fill memory used by both svm Fill and non-svm fill - bool fillMemory( - cl_command_type type, //!< the command type - amd::Memory* amdMemory, //!< memory object to fill - const void* pattern, //!< pattern to fill the memory - size_t patternSize, //!< pattern size - const amd::Coord3D& origin, //!< memory origin - const amd::Coord3D& size //!< memory size for filling - ); - - bool copyMemory( - cl_command_type type, //!< the command type - amd::Memory& srcMem, //!< source memory object - amd::Memory& dstMem, //!< destination memory object - bool entire, //!< flag of entire memory copy - const amd::Coord3D& srcOrigin, //!< source memory origin - const amd::Coord3D& dstOrigin, //!< destination memory object - const amd::Coord3D& size, //!< copy size - const amd::BufferRect& srcRect, //!< region of source for copy - const amd::BufferRect& dstRect //!< region of destination for copy - ); - - void buildKernelInfo( - const HSAILKernel& hsaKernel, //!< hsa kernel - hsa_kernel_dispatch_packet_t* aqlPkt, //!< aql packet for dispatch - HwDbgKernelInfo& kernelInfo, //!< kernel info for the dispatch - amd::Event* enqueueEvent //!< Event provided in the enqueue kernel command - ); - - void assignDebugTrapHandler( - const DebugToolInfo& dbgSetting, //!< debug settings - HwDbgKernelInfo& kernelInfo //!< kernel info for the dispatch - ); - - GslKernels gslKernels_; //!< GSL kernel descriptors - GslKernelDesc* activeKernelDesc_; //!< active GSL kernel descriptors - GpuEvents gpuEvents_; //!< GPU events - - Device& gpuDevice_; //!< physical GPU device - amd::Monitor execution_; //!< Lock to serialise access to all device objects - uint index_; //!< The virtual device unique index - - PrintfDbg* printfDbg_; //!< GPU printf implemenation - PrintfDbgHSA* printfDbgHSA_; //!< HSAIL printf implemenation - - TimeStampCache* tsCache_; //!< TimeStamp cache - MemoryDependency memoryDependency_; //!< Memory dependency class - - gslMemObject* vmMems_; //!< Array of GSL 
memories for VM mode - uint numVmMems_; //!< Number of entries in VM mem array - - DmaFlushMgmt dmaFlushMgmt_; //!< DMA flush management - - std::list xferWriteBuffers_; //!< Stage write buffers - std::list pinnedMems_;//!< Pinned memory list - - typedef std::list CommandBatchList; - CommandBatchList cbList_; //!< List of command batches - - uint hwRing_; //!< HW ring used on this virtual device - - uint64_t readjustTimeGPU_; //!< Readjust time between GPU and CPU timestamps - TimeStamp* currTs_; //!< current timestamp for command - - AmdVQueueHeader* vqHeader_; //!< Sysmem copy for virtual queue header - Memory* virtualQueue_; //!< Virtual device queue - Memory* schedParams_; //!< The scheduler parameters - uint schedParamIdx_; //!< Index in the scheduler parameters buffer - uint deviceQueueSize_; //!< Device queue size - uint maskGroups_; //!< The number of mask groups processed in the scheduler by one thread - - Memory* hsaQueueMem_; //!< Memory for the amd_queue_t object - bool profileEnabled_;//!< Profiling is enabled + //! Get the PrintfDbgHSA object + PrintfDbgHSA& printfDbgHSA() const { return *printfDbgHSA_; } + + //! Enables synchronized transfers + void enableSyncedBlit() const; + + //! Checks if profiling is enabled + bool profiling() const { return state_.profiling_; } + + //! Returns memory dependency class + MemoryDependency& memoryDependency() { return memoryDependency_; } + + //! Returns hsaQueueMem_ + const Memory* hsaQueueMem() const { return hsaQueueMem_; } + + //! Returns DMA flush management structure + const DmaFlushMgmt& dmaFlushMgmt() const { return dmaFlushMgmt_; } + + //! Releases GSL memory objects allocated on this queue + void releaseMemObjects(bool scratch = true); + + //! Returns the HW ring used on this virtual device + uint hwRing() const { return hwRing_; } + + //! Returns current timestamp object for profiling + TimeStamp* currTs() const { return cal_.lastTS_; } + + //! 
Returns virtual queue object for device enqueuing + Memory* vQueue() const { return virtualQueue_; } + + //! Update virtual queue header + void writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable); + + //! Returns TRUE if virtual queue was successfully allocatted + bool createVirtualQueue(uint deviceQueueSize //!< Device queue size + ); + + EngineType engineID_; //!< Engine ID for this VirtualGPU + ResourceSlots slots_; //!< Resource slots for kernel arguments + State state_; //!< virtual GPU current state + CalVirtualDesc cal_; //!< CAL virtual device descriptor + + void flushCuCaches(HwDbgGpuCacheMask cache_mask); //!< flush/invalidate SQ cache + + protected: + virtual void profileEvent(EngineType engine, bool type) const; + + //! Creates buffer object from image + amd::Memory* createBufferFromImage( + amd::Memory& amdImage //! The parent image object(untiled images only) + ) const; + + private: + typedef std::map GslKernels; + typedef std::map GpuEvents; + + //! Finds total amount of necessary iterations + inline void findIterations(const amd::NDRangeContainer& sizes, //!< Original workload sizes + const amd::NDRange& local, //!< Local workgroup size + amd::NDRange& groups, //!< Calculated workgroup sizes + amd::NDRange& remainder, //!< Calculated remainder sizes + size_t& extra //!< Amount of extra executions for remainder + ); + + //! 
Setups workloads for the current iteration + inline void setupIteration( + uint iteration, //!< Current iteration + const amd::NDRangeContainer& sizes, //!< Original workload sizes + Kernel& gpuKernel, //!< GPU kernel + amd::NDRange& global, //!< Global size for the current iteration + amd::NDRange& offsets, //!< Offsets for the current iteration + amd::NDRange& local, //!< Local sizes for the current iteration + amd::NDRange& groups, //!< Group sizes for the current iteration + amd::NDRange& groupOffset, //!< Group offsets for the current iteration + amd::NDRange& divider, //!< Group divider + amd::NDRange& remainder, //!< Remain workload + size_t extra //!< Extra groups + ); + + //! Allocates constant buffers + bool allocConstantBuffers(); + + //! Allocates CAL kernel descriptor of the virtual device + GslKernelDesc* allocKernelDesc(const Kernel* kernel, //!< Kernel object + CALimage calImage); //!< CAL image + + //! Frees CAL kernel descriptor of the virtual device + void freeKernelDesc(GslKernelDesc* desc); + + bool gslOpen(uint nEngines, gslEngineDescriptor* engines, uint32_t rtCUs); + void gslDestroy(); + + //! Releases stage write buffers + void releaseXferWrite(); + + //! Allocate hsaQueueMem_ + bool allocHsaQueueMem(); + + //! Awaits a command batch with a waiting event + bool awaitCompletion(CommandBatch* cb, //!< Command batch for to wait + const amd::Event* waitingEvent = NULL //!< A waiting event + ); + + //! Validates the scratch buffer memory for a specified kernel + void validateScratchBuffer(const Kernel* kernel //!< Kernel for validaiton + ); + + //! Detects memory dependency for HSAIL kernels and flushes caches + bool processMemObjectsHSA(const amd::Kernel& kernel, //!< AMD kernel object for execution + const_address params, //!< Pointer to the param's store + bool nativeMem, //!< Native memory objects + std::vector* memList //!< Memory list for KMD tracking + ); + + //! 
Common function for fill memory used by both svm Fill and non-svm fill + bool fillMemory(cl_command_type type, //!< the command type + amd::Memory* amdMemory, //!< memory object to fill + const void* pattern, //!< pattern to fill the memory + size_t patternSize, //!< pattern size + const amd::Coord3D& origin, //!< memory origin + const amd::Coord3D& size //!< memory size for filling + ); + + bool copyMemory(cl_command_type type, //!< the command type + amd::Memory& srcMem, //!< source memory object + amd::Memory& dstMem, //!< destination memory object + bool entire, //!< flag of entire memory copy + const amd::Coord3D& srcOrigin, //!< source memory origin + const amd::Coord3D& dstOrigin, //!< destination memory object + const amd::Coord3D& size, //!< copy size + const amd::BufferRect& srcRect, //!< region of source for copy + const amd::BufferRect& dstRect //!< region of destination for copy + ); + + void buildKernelInfo(const HSAILKernel& hsaKernel, //!< hsa kernel + hsa_kernel_dispatch_packet_t* aqlPkt, //!< aql packet for dispatch + HwDbgKernelInfo& kernelInfo, //!< kernel info for the dispatch + amd::Event* enqueueEvent //!< Event provided in the enqueue kernel command + ); + + void assignDebugTrapHandler(const DebugToolInfo& dbgSetting, //!< debug settings + HwDbgKernelInfo& kernelInfo //!< kernel info for the dispatch + ); + + GslKernels gslKernels_; //!< GSL kernel descriptors + GslKernelDesc* activeKernelDesc_; //!< active GSL kernel descriptors + GpuEvents gpuEvents_; //!< GPU events + + Device& gpuDevice_; //!< physical GPU device + amd::Monitor execution_; //!< Lock to serialise access to all device objects + uint index_; //!< The virtual device unique index + + PrintfDbg* printfDbg_; //!< GPU printf implemenation + PrintfDbgHSA* printfDbgHSA_; //!< HSAIL printf implemenation + + TimeStampCache* tsCache_; //!< TimeStamp cache + MemoryDependency memoryDependency_; //!< Memory dependency class + + gslMemObject* vmMems_; //!< Array of GSL memories for VM 
mode + uint numVmMems_; //!< Number of entries in VM mem array + + DmaFlushMgmt dmaFlushMgmt_; //!< DMA flush management + + std::list xferWriteBuffers_; //!< Stage write buffers + std::list pinnedMems_; //!< Pinned memory list + + typedef std::list CommandBatchList; + CommandBatchList cbList_; //!< List of command batches + + uint hwRing_; //!< HW ring used on this virtual device + + uint64_t readjustTimeGPU_; //!< Readjust time between GPU and CPU timestamps + TimeStamp* currTs_; //!< current timestamp for command + + AmdVQueueHeader* vqHeader_; //!< Sysmem copy for virtual queue header + Memory* virtualQueue_; //!< Virtual device queue + Memory* schedParams_; //!< The scheduler parameters + uint schedParamIdx_; //!< Index in the scheduler parameters buffer + uint deviceQueueSize_; //!< Device queue size + uint maskGroups_; //!< The number of mask groups processed in the scheduler by one thread + + Memory* hsaQueueMem_; //!< Memory for the amd_queue_t object + bool profileEnabled_; //!< Profiling is enabled }; /*@}*/} // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpuwavelimiter.cpp b/rocclr/runtime/device/gpu/gpuwavelimiter.cpp index e3e57a7573..8da90b2404 100644 --- a/rocclr/runtime/device/gpu/gpuwavelimiter.cpp +++ b/rocclr/runtime/device/gpu/gpuwavelimiter.cpp @@ -19,336 +19,317 @@ uint WLAlgorithmSmooth::AdaptCount; uint WLAlgorithmSmooth::AbandonThresh; uint WLAlgorithmSmooth::DscThresh; -WaveLimiter::WaveLimiter( - WaveLimiterManager* manager, - uint seqNum, - bool enable, - bool enableDump): - manager_(manager), - dumper_(manager_->name() + "_" + std::to_string(seqNum), enableDump) { +WaveLimiter::WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump) + : manager_(manager), dumper_(manager_->name() + "_" + std::to_string(seqNum), enableDump) { + setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, manager->getSimdPerSH()); + MaxWave = GPU_WAVE_LIMIT_MAX_WAVE; + WarmUpCount = GPU_WAVE_LIMIT_WARMUP; + RunCount = 
GPU_WAVE_LIMIT_RUN * MaxWave; - setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, manager->getSimdPerSH()); - MaxWave = GPU_WAVE_LIMIT_MAX_WAVE; - WarmUpCount = GPU_WAVE_LIMIT_WARMUP; - RunCount = GPU_WAVE_LIMIT_RUN * MaxWave; + state_ = WARMUP; + if (!flagIsDefault(GPU_WAVE_LIMIT_TRACE)) { + traceStream_.open(std::string(GPU_WAVE_LIMIT_TRACE) + manager_->name() + ".txt"); + } - state_ = WARMUP; - if (!flagIsDefault(GPU_WAVE_LIMIT_TRACE)) { - traceStream_.open(std::string(GPU_WAVE_LIMIT_TRACE) + manager_->name() + - ".txt"); - } - - waves_ = MaxWave; - currWaves_ = MaxWave; - bestWave_ = MaxWave; - enable_ = enable; + waves_ = MaxWave; + currWaves_ = MaxWave; + bestWave_ = MaxWave; + enable_ = enable; } WaveLimiter::~WaveLimiter() { - if (traceStream_.is_open()) { - traceStream_.close(); - } + if (traceStream_.is_open()) { + traceStream_.close(); + } } -uint WaveLimiter::getWavesPerSH(){ - currWaves_ = waves_; - return waves_ * SIMDPerSH_; +uint WaveLimiter::getWavesPerSH() { + currWaves_ = waves_; + return waves_ * SIMDPerSH_; } -WLAlgorithmSmooth::WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump): - WaveLimiter(manager, seqNum, enable, enableDump) { - AdaptCount = 2 * MaxWave + 1; - AbandonThresh = GPU_WAVE_LIMIT_ABANDON; - DscThresh = GPU_WAVE_LIMIT_DSC_THRESH; +WLAlgorithmSmooth::WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable, + bool enableDump) + : WaveLimiter(manager, seqNum, enable, enableDump) { + AdaptCount = 2 * MaxWave + 1; + AbandonThresh = GPU_WAVE_LIMIT_ABANDON; + DscThresh = GPU_WAVE_LIMIT_DSC_THRESH; - dynRunCount_ = RunCount; - measure_.resize(MaxWave + 1); - reference_.resize(MaxWave + 1); - trial_.resize(MaxWave + 1); - ratio_.resize(MaxWave + 1); + dynRunCount_ = RunCount; + measure_.resize(MaxWave + 1); + reference_.resize(MaxWave + 1); + trial_.resize(MaxWave + 1); + ratio_.resize(MaxWave + 1); - clearData(); + clearData(); } -WLAlgorithmSmooth::~WLAlgorithmSmooth() { - -} 
+WLAlgorithmSmooth::~WLAlgorithmSmooth() {} void WLAlgorithmSmooth::clearData() { - waves_ = MaxWave; - countAll_ = 0; - clear(measure_); - clear(reference_); - clear(trial_); - clear(ratio_); - discontinuous_ = false; - dataCount_ = 0; + waves_ = MaxWave; + countAll_ = 0; + clear(measure_); + clear(reference_); + clear(trial_); + clear(ratio_); + discontinuous_ = false; + dataCount_ = 0; } void WLAlgorithmSmooth::updateData(ulong time) { - auto count = dataCount_ - 1; - assert(count < 2 * MaxWave + 1); - assert(time > 0); - assert(currWaves_ == waves_); - if (count % 2 == 0) { - assert(waves_ == MaxWave); - auto pos = count / 2; - measure_[pos] = time; - if (pos > 0) { - auto wave = MaxWave + 1 - pos; - if (abs(static_cast(measure_[pos - 1]) - - static_cast(measure_[pos])) * 100 / measure_[pos] > - DscThresh) { - discontinuous_ = true; - } - reference_[wave] = (time + measure_[pos - 1]) / 2; - ratio_[wave] = trial_[wave] * 100 / reference_[wave]; - if (ratio_[bestWave_] > ratio_[wave] && !discontinuous_) { - bestWave_ = wave; - } - } - } else { - assert(waves_ == MaxWave - count / 2); - trial_[waves_] = time; + auto count = dataCount_ - 1; + assert(count < 2 * MaxWave + 1); + assert(time > 0); + assert(currWaves_ == waves_); + if (count % 2 == 0) { + assert(waves_ == MaxWave); + auto pos = count / 2; + measure_[pos] = time; + if (pos > 0) { + auto wave = MaxWave + 1 - pos; + if (abs(static_cast(measure_[pos - 1]) - static_cast(measure_[pos])) * 100 / + measure_[pos] > + DscThresh) { + discontinuous_ = true; + } + reference_[wave] = (time + measure_[pos - 1]) / 2; + ratio_[wave] = trial_[wave] * 100 / reference_[wave]; + if (ratio_[bestWave_] > ratio_[wave] && !discontinuous_) { + bestWave_ = wave; + } } - outputTrace(); + } else { + assert(waves_ == MaxWave - count / 2); + trial_[waves_] = time; + } + outputTrace(); } void WLAlgorithmSmooth::outputTrace() { - if (!traceStream_.is_open()) { - return; - } + if (!traceStream_.is_open()) { + return; + } - traceStream_ 
<< "[WaveLimiter] " << manager_->name() << " state=" << state_ - << " currWaves=" << currWaves_ << " waves=" << waves_ - << " bestWave=" << bestWave_ << '\n'; - output(traceStream_, "\n measure = ", measure_); - output(traceStream_, "\n reference = ", reference_); - output(traceStream_, "\n ratio = ", ratio_); - traceStream_ << "\n\n"; + traceStream_ << "[WaveLimiter] " << manager_->name() << " state=" << state_ + << " currWaves=" << currWaves_ << " waves=" << waves_ << " bestWave=" << bestWave_ + << '\n'; + output(traceStream_, "\n measure = ", measure_); + output(traceStream_, "\n reference = ", reference_); + output(traceStream_, "\n ratio = ", ratio_); + traceStream_ << "\n\n"; } void WLAlgorithmSmooth::callback(ulong duration) { - dumper_.addData(duration, currWaves_, static_cast(state_)); + dumper_.addData(duration, currWaves_, static_cast(state_)); - if (!enable_ || (duration == 0)) { - return; - } + if (!enable_ || (duration == 0)) { + return; + } - countAll_++; + countAll_++; - switch (state_) { + switch (state_) { case WARMUP: - if (countAll_ < WarmUpCount) { - return; - } - state_ = ADAPT; - bestWave_ = MaxWave; - clearData(); + if (countAll_ < WarmUpCount) { return; + } + state_ = ADAPT; + bestWave_ = MaxWave; + clearData(); + return; case ADAPT: - assert(duration > 0); - if (waves_ == currWaves_) { - dataCount_++; - updateData(duration); - waves_ = MaxWave + 1 - dataCount_ / 2; - if (dataCount_ == 1 || (dataCount_ < AdaptCount && - !discontinuous_ && (dataCount_ % 2 == 0 || - ratio_[waves_] < AbandonThresh))) { - if (dataCount_ % 2 == 1) { - --waves_; - } else { - waves_ = MaxWave; - } - return; - } - waves_ = bestWave_; - if (dataCount_ >= AdaptCount) { - dynRunCount_ = RunCount; - } else { - dynRunCount_ = AdaptCount; - } - countAll_ = rand() % MaxWave; - state_ = RUN; + assert(duration > 0); + if (waves_ == currWaves_) { + dataCount_++; + updateData(duration); + waves_ = MaxWave + 1 - dataCount_ / 2; + if (dataCount_ == 1 || (dataCount_ < AdaptCount 
&& !discontinuous_ && + (dataCount_ % 2 == 0 || ratio_[waves_] < AbandonThresh))) { + if (dataCount_ % 2 == 1) { + --waves_; + } else { + waves_ = MaxWave; + } + return; } - return; + waves_ = bestWave_; + if (dataCount_ >= AdaptCount) { + dynRunCount_ = RunCount; + } else { + dynRunCount_ = AdaptCount; + } + countAll_ = rand() % MaxWave; + state_ = RUN; + } + return; case RUN: - if (countAll_ < dynRunCount_) { - return; - } - state_ = ADAPT; - bestWave_ = MaxWave; - clearData(); + if (countAll_ < dynRunCount_) { return; - } + } + state_ = ADAPT; + bestWave_ = MaxWave; + clearData(); + return; + } } -WaveLimiter::DataDumper::DataDumper(const std::string &kernelName, bool enable) { - enable_ = enable; - if (enable_) { - fileName_ = std::string(GPU_WAVE_LIMIT_DUMP) + kernelName + ".csv"; - } +WaveLimiter::DataDumper::DataDumper(const std::string& kernelName, bool enable) { + enable_ = enable; + if (enable_) { + fileName_ = std::string(GPU_WAVE_LIMIT_DUMP) + kernelName + ".csv"; + } } WaveLimiter::DataDumper::~DataDumper() { - if (!enable_) { - return; - } + if (!enable_) { + return; + } - std::ofstream OFS(fileName_); - for (size_t i = 0, e = time_.size(); i != e; ++i) { - OFS << i << ',' << time_[i] << ',' << wavePerSIMD_[i] << ',' - << static_cast(state_[i]) << '\n'; - } - OFS.close(); + std::ofstream OFS(fileName_); + for (size_t i = 0, e = time_.size(); i != e; ++i) { + OFS << i << ',' << time_[i] << ',' << wavePerSIMD_[i] << ',' << static_cast(state_[i]) + << '\n'; + } + OFS.close(); } void WaveLimiter::DataDumper::addData(ulong time, uint wave, char state) { - if (!enable_) { - return; - } + if (!enable_) { + return; + } - time_.push_back(time); - wavePerSIMD_.push_back(wave); - state_.push_back(state); + time_.push_back(time); + wavePerSIMD_.push_back(wave); + state_.push_back(state); } -WLAlgorithmAvrg::WLAlgorithmAvrg(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump): - WaveLimiter(manager, seqNum, enable, enableDump) { - - 
measure_.resize(MaxWave + 1); - clear(measure_); - countAll_ = 0; +WLAlgorithmAvrg::WLAlgorithmAvrg(WaveLimiterManager* manager, uint seqNum, bool enable, + bool enableDump) + : WaveLimiter(manager, seqNum, enable, enableDump) { + measure_.resize(MaxWave + 1); + clear(measure_); + countAll_ = 0; } -WLAlgorithmAvrg::~WLAlgorithmAvrg() { - -} +WLAlgorithmAvrg::~WLAlgorithmAvrg() {} void WLAlgorithmAvrg::outputTrace() { - if (!traceStream_.is_open()) { - return; - } + if (!traceStream_.is_open()) { + return; + } - traceStream_ << "[WaveLimiter] " << manager_->name() << " state=" << state_ - << " currWaves=" << currWaves_ << " waves=" << waves_ - << " bestWave=" << bestWave_ << '\n'; - output(traceStream_, "\n measure = ", measure_); - traceStream_ << "\n\n"; + traceStream_ << "[WaveLimiter] " << manager_->name() << " state=" << state_ + << " currWaves=" << currWaves_ << " waves=" << waves_ << " bestWave=" << bestWave_ + << '\n'; + output(traceStream_, "\n measure = ", measure_); + traceStream_ << "\n\n"; } void WLAlgorithmAvrg::callback(ulong duration) { - dumper_.addData(duration, currWaves_, static_cast(state_)); + dumper_.addData(duration, currWaves_, static_cast(state_)); - if (!enable_) { - return; - } + if (!enable_) { + return; + } - countAll_++; + countAll_++; - switch (state_) { + switch (state_) { case WARMUP: - state_ = ADAPT; + state_ = ADAPT; case ADAPT: - measure_[waves_] += duration; - if (countAll_ <= MaxWave * 5) { - waves_--; - if (waves_ == 0) { - waves_ = MaxWave; - } + measure_[waves_] += duration; + if (countAll_ <= MaxWave * 5) { + waves_--; + if (waves_ == 0) { + waves_ = MaxWave; } - else { - bestWave_ = MaxWave; - for (uint i=1; i 0) { - return fixed_; - } - if (!enable_) { - return 0; - } - auto loc = limiters_.find(vdev); - if (loc == limiters_.end()) { - return 0; - } - assert(loc->second != NULL); - return loc->second->getWavesPerSH(); +uint WaveLimiterManager::getWavesPerSH(const device::VirtualDevice* vdev) const { + if (fixed_ > 0) { + 
return fixed_; + } + if (!enable_) { + return 0; + } + auto loc = limiters_.find(vdev); + if (loc == limiters_.end()) { + return 0; + } + assert(loc->second != NULL); + return loc->second->getWavesPerSH(); } amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback( - const device::VirtualDevice *vdev) { - assert(vdev != NULL); - if (!enable_ && !enableDump_) { - return NULL; - } + const device::VirtualDevice* vdev) { + assert(vdev != NULL); + if (!enable_ && !enableDump_) { + return NULL; + } - amd::ScopedLock SL(monitor_); - auto loc = limiters_.find(vdev); - if (loc != limiters_.end()) { - return loc->second; - } + amd::ScopedLock SL(monitor_); + auto loc = limiters_.find(vdev); + if (loc != limiters_.end()) { + return loc->second; + } - auto limiter = new WLAlgorithmSmooth(this, limiters_.size(), enable_, - enableDump_); - if (limiter == NULL) { - enable_ = false; - return NULL; - } - limiters_[vdev] = limiter; - return limiter; + auto limiter = new WLAlgorithmSmooth(this, limiters_.size(), enable_, enableDump_); + if (limiter == NULL) { + enable_ = false; + return NULL; + } + limiters_[vdev] = limiter; + return limiter; } void WaveLimiterManager::enable(const bool isCiPlus) { - if (fixed_ > 0) { - return; - } + if (fixed_ > 0) { + return; + } - // Enable it only for CI+, unless GPU_WAVE_LIMIT_ENABLE is set to 1 - // Disabled for SI due to bug #10817 - if (!flagIsDefault(GPU_WAVE_LIMIT_ENABLE)) { - enable_ = GPU_WAVE_LIMIT_ENABLE; - } - else { - if (isCiPlus) { - if (owner_->workGroupInfo()->wavesPerSimdHint_ == 0) { - enable_ = true; - } - else if (owner_->workGroupInfo()->wavesPerSimdHint_ <= GPU_WAVE_LIMIT_MAX_WAVE) { - fixed_ = owner_->workGroupInfo()->wavesPerSimdHint_ * getSimdPerSH(); - } - } + // Enable it only for CI+, unless GPU_WAVE_LIMIT_ENABLE is set to 1 + // Disabled for SI due to bug #10817 + if (!flagIsDefault(GPU_WAVE_LIMIT_ENABLE)) { + enable_ = GPU_WAVE_LIMIT_ENABLE; + } else { + if (isCiPlus) { + if 
(owner_->workGroupInfo()->wavesPerSimdHint_ == 0) { + enable_ = true; + } else if (owner_->workGroupInfo()->wavesPerSimdHint_ <= GPU_WAVE_LIMIT_MAX_WAVE) { + fixed_ = owner_->workGroupInfo()->wavesPerSimdHint_ * getSimdPerSH(); + } } + } } - } - diff --git a/rocclr/runtime/device/gpu/gpuwavelimiter.hpp b/rocclr/runtime/device/gpu/gpuwavelimiter.hpp index 07612fa451..a58ec1e7e8 100644 --- a/rocclr/runtime/device/gpu/gpuwavelimiter.hpp +++ b/rocclr/runtime/device/gpu/gpuwavelimiter.hpp @@ -19,144 +19,146 @@ namespace gpu { class WaveLimiterManager; // Adaptively limit the number of waves per SIMD based on kernel execution time -class WaveLimiter: public amd::ProfilingCallback { -public: - explicit WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump); - virtual ~WaveLimiter(); +class WaveLimiter : public amd::ProfilingCallback { + public: + explicit WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump); + virtual ~WaveLimiter(); - //! Get waves per shader array to be used for kernel execution. - uint getWavesPerSH(); + //! Get waves per shader array to be used for kernel execution. + uint getWavesPerSH(); -protected: - enum StateKind { - WARMUP, ADAPT, RUN - }; + protected: + enum StateKind { WARMUP, ADAPT, RUN }; - class DataDumper { - public: - explicit DataDumper(const std::string &kernelName, bool enable); - ~DataDumper(); + class DataDumper { + public: + explicit DataDumper(const std::string& kernelName, bool enable); + ~DataDumper(); - //! Record execution time, waves/simd and state of wave limiter. - void addData(ulong time, uint wave, char state); + //! Record execution time, waves/simd and state of wave limiter. + void addData(ulong time, uint wave, char state); - //! Whether this data dumper is enabled. - bool enabled() const { return enable_;} - private: - bool enable_; - std::string fileName_; - std::vector time_; - std::vector wavePerSIMD_; - std::vector state_; - }; + //! 
Whether this data dumper is enabled. + bool enabled() const { return enable_; } - std::vector measure_; + private: bool enable_; - uint SIMDPerSH_; // Number of SIMDs per SH - uint waves_; // Waves per SIMD to be set - uint bestWave_; // Optimal waves per SIMD - uint countAll_; // Number of kernel executions - StateKind state_; - WaveLimiterManager* manager_; - DataDumper dumper_; - std::ofstream traceStream_; - uint currWaves_; // Current waves per SIMD + std::string fileName_; + std::vector time_; + std::vector wavePerSIMD_; + std::vector state_; + }; - static uint MaxWave; // Maximum number of waves per SIMD - static uint WarmUpCount; // Number of kernel executions for warm up - static uint RunCount; // Number of kernel executions for normal run + std::vector measure_; + bool enable_; + uint SIMDPerSH_; // Number of SIMDs per SH + uint waves_; // Waves per SIMD to be set + uint bestWave_; // Optimal waves per SIMD + uint countAll_; // Number of kernel executions + StateKind state_; + WaveLimiterManager* manager_; + DataDumper dumper_; + std::ofstream traceStream_; + uint currWaves_; // Current waves per SIMD - //! Call back from Event::recordProfilingInfo to get execution time. - virtual void callback(ulong duration)=0; + static uint MaxWave; // Maximum number of waves per SIMD + static uint WarmUpCount; // Number of kernel executions for warm up + static uint RunCount; // Number of kernel executions for normal run - //! Output trace of measurement/adaptation. - virtual void outputTrace()=0; + //! Call back from Event::recordProfilingInfo to get execution time. + virtual void callback(ulong duration) = 0; - template void clear(T& A) { - for (auto &I : A) { - I = 0; - } + //! Output trace of measurement/adaptation. 
+ virtual void outputTrace() = 0; + + template void clear(T& A) { + for (auto& I : A) { + I = 0; } - template void output(std::ofstream &ofs, const std::string &prompt, - T& A) { - ofs << prompt; - for (auto &I : A) { - ofs << ' ' << static_cast(I); - } + } + template void output(std::ofstream& ofs, const std::string& prompt, T& A) { + ofs << prompt; + for (auto& I : A) { + ofs << ' ' << static_cast(I); } + } }; -class WLAlgorithmSmooth: public WaveLimiter { -public: - explicit WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump); - virtual ~WLAlgorithmSmooth(); -private: - std::vector reference_; - std::vector trial_; - std::vector ratio_; - bool discontinuous_; // Measured data is discontinuous - uint dynRunCount_; - uint dataCount_; +class WLAlgorithmSmooth : public WaveLimiter { + public: + explicit WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable, + bool enableDump); + virtual ~WLAlgorithmSmooth(); - static uint AdaptCount; // Number of kernel executions for adapting - static uint AbandonThresh; // Threshold to abandon adaptation - static uint DscThresh; // Threshold for identifying discontinuities + private: + std::vector reference_; + std::vector trial_; + std::vector ratio_; + bool discontinuous_; // Measured data is discontinuous + uint dynRunCount_; + uint dataCount_; - //! Update measurement data and optimal waves/simd with execution time. - void updateData(ulong time); + static uint AdaptCount; // Number of kernel executions for adapting + static uint AbandonThresh; // Threshold to abandon adaptation + static uint DscThresh; // Threshold for identifying discontinuities - //! Clear measurement data for the next adaptation. - void clearData(); + //! Update measurement data and optimal waves/simd with execution time. + void updateData(ulong time); - //! Call back from Event::recordProfilingInfo to get execution time. - void callback(ulong duration); + //! 
Clear measurement data for the next adaptation. + void clearData(); - //! Output trace of measurement/adaptation. - void outputTrace(); + //! Call back from Event::recordProfilingInfo to get execution time. + void callback(ulong duration); + + //! Output trace of measurement/adaptation. + void outputTrace(); }; -class WLAlgorithmAvrg: public WaveLimiter { -public: - explicit WLAlgorithmAvrg(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump); - virtual ~WLAlgorithmAvrg(); -private: - //! Call back from Event::recordProfilingInfo to get execution time. - void callback(ulong duration); +class WLAlgorithmAvrg : public WaveLimiter { + public: + explicit WLAlgorithmAvrg(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump); + virtual ~WLAlgorithmAvrg(); - //! Output trace of measurement/adaptation. - void outputTrace(); + private: + //! Call back from Event::recordProfilingInfo to get execution time. + void callback(ulong duration); + + //! Output trace of measurement/adaptation. + void outputTrace(); }; // Create wave limiter for each virtual device for a kernel and manages the wave limiters. class WaveLimiterManager { -public: - explicit WaveLimiterManager(device::Kernel* owner, const uint simdPerSH); - virtual ~WaveLimiterManager(); + public: + explicit WaveLimiterManager(device::Kernel* owner, const uint simdPerSH); + virtual ~WaveLimiterManager(); - //! Get waves per shader array for a specific virtual device. - uint getWavesPerSH(const device::VirtualDevice *) const; + //! Get waves per shader array for a specific virtual device. + uint getWavesPerSH(const device::VirtualDevice*) const; - //! Provide call back function for a specific virtual device. - amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice *); + //! Provide call back function for a specific virtual device. + amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice*); - //! 
Enable wave limiter manager by kernel metadata and flags. - void enable(const bool isCiPlus); + //! Enable wave limiter manager by kernel metadata and flags. + void enable(const bool isCiPlus); - //! Returns the kernel name - const std::string& name() const { return owner_->name(); } + //! Returns the kernel name + const std::string& name() const { return owner_->name(); } - //! Get SimdPerSH. - uint getSimdPerSH() const {return simdPerSH_;} + //! Get SimdPerSH. + uint getSimdPerSH() const { return simdPerSH_; } -private: - device::Kernel *owner_; // The kernel which owns this object - uint simdPerSH_; // Simd Per SH - std::unordered_map limiters_; // Maps virtual device to wave limiter - bool enable_; // Whether the adaptation is enabled - bool enableDump_; // Whether the data dumper is enabled - uint fixed_; // The fixed waves/simd value if not zero - amd::Monitor monitor_; // The mutex for updating the wave limiter map + private: + device::Kernel* owner_; // The kernel which owns this object + uint simdPerSH_; // Simd Per SH + std::unordered_map + limiters_; // Maps virtual device to wave limiter + bool enable_; // Whether the adaptation is enabled + bool enableDump_; // Whether the data dumper is enabled + uint fixed_; // The fixed waves/simd value if not zero + amd::Monitor monitor_; // The mutex for updating the wave limiter map }; } #endif diff --git a/rocclr/runtime/device/hwdebug.cpp b/rocclr/runtime/device/hwdebug.cpp index 23fb0d4ca3..ef5f177212 100644 --- a/rocclr/runtime/device/hwdebug.cpp +++ b/rocclr/runtime/device/hwdebug.cpp @@ -27,154 +27,119 @@ class Device; //! 
Constructor of the debug manager class HwDebugManager::HwDebugManager(amd::Device* device) - : context_(NULL) - , device_(device) - , preDispatchCallBackFunc_(NULL) - , postDispatchCallBackFunc_(NULL) - , preDispatchCallBackArgs_(NULL) - , postDispatchCallBackArgs_(NULL) - , paramMemory_(NULL) - , numParams_(0) - , aclBinary_(NULL) - , aqlCodeAddr_(NULL) - , aqlCodeSize_(0) - , scratchRingAddr_(NULL) - , scratchRingSize_(0) - , isRegistered_(false) - , runtimeTBA_(NULL) - , runtimeTMA_(NULL) -{ - memset(&debugInfo_, 0, sizeof(debugInfo_)); + : context_(NULL), + device_(device), + preDispatchCallBackFunc_(NULL), + postDispatchCallBackFunc_(NULL), + preDispatchCallBackArgs_(NULL), + postDispatchCallBackArgs_(NULL), + paramMemory_(NULL), + numParams_(0), + aclBinary_(NULL), + aqlCodeAddr_(NULL), + aqlCodeSize_(0), + scratchRingAddr_(NULL), + scratchRingSize_(0), + isRegistered_(false), + runtimeTBA_(NULL), + runtimeTMA_(NULL) { + memset(&debugInfo_, 0, sizeof(debugInfo_)); - for (int i = 0; i < kDebugTrapLocationMax; i++) { - rtTrapInfo_[i] = NULL; - } + for (int i = 0; i < kDebugTrapLocationMax; i++) { + rtTrapInfo_[i] = NULL; + } } -HwDebugManager::~HwDebugManager() -{ - delete[] paramMemory_; +HwDebugManager::~HwDebugManager() { + delete[] paramMemory_; - delete runtimeTMA_; - delete runtimeTBA_; + delete runtimeTMA_; + delete runtimeTBA_; } //! Setup the call back function pointer -void -HwDebugManager::setCallBackFunctions(cl_PreDispatchCallBackFunctionAMD preDispatchFunction, - cl_PostDispatchCallBackFunctionAMD postDispatchFunction) -{ - preDispatchCallBackFunc_ = preDispatchFunction; - postDispatchCallBackFunc_ = postDispatchFunction; +void HwDebugManager::setCallBackFunctions(cl_PreDispatchCallBackFunctionAMD preDispatchFunction, + cl_PostDispatchCallBackFunctionAMD postDispatchFunction) { + preDispatchCallBackFunc_ = preDispatchFunction; + postDispatchCallBackFunc_ = postDispatchFunction; } //! 
Setup the call back argument pointers -void -HwDebugManager::setCallBackArguments(void* preDispatchArgs, void* postDispatchArgs) -{ - preDispatchCallBackArgs_ = preDispatchArgs; - postDispatchCallBackArgs_ = postDispatchArgs; +void HwDebugManager::setCallBackArguments(void* preDispatchArgs, void* postDispatchArgs) { + preDispatchCallBackArgs_ = preDispatchArgs; + postDispatchCallBackArgs_ = postDispatchArgs; } //! Get dispatch debug info -void -HwDebugManager::getDispatchDebugInfo(void* debugInfo) const -{ - memcpy(debugInfo, (void*) &debugInfo_, sizeof(DispatchDebugInfo)); +void HwDebugManager::getDispatchDebugInfo(void* debugInfo) const { + memcpy(debugInfo, (void*)&debugInfo_, sizeof(DispatchDebugInfo)); } //! Set the kernel code address and its size -void -HwDebugManager::setKernelCodeInfo(address aqlCodeAddr, uint32_t aqlCodeSize) -{ - aqlCodeAddr_ = aqlCodeAddr; - aqlCodeSize_ = aqlCodeSize; +void HwDebugManager::setKernelCodeInfo(address aqlCodeAddr, uint32_t aqlCodeSize) { + aqlCodeAddr_ = aqlCodeAddr; + aqlCodeSize_ = aqlCodeSize; } //! Get the scratch ring -void -HwDebugManager::setScratchRing(address scratchRingAddr, uint32_t scratchRingSize) -{ - scratchRingAddr_ = scratchRingAddr; - scratchRingSize_ = scratchRingSize; +void HwDebugManager::setScratchRing(address scratchRingAddr, uint32_t scratchRingSize) { + scratchRingAddr_ = scratchRingAddr; + scratchRingSize_ = scratchRingSize; } //! 
Map the scratch ring for host access -void -HwDebugManager::mapScratchRing(uint64_t* scratchRingAddr, uint32_t* scratchRingSize) const -{ - *scratchRingAddr = reinterpret_cast(scratchRingAddr_); - *scratchRingSize = scratchRingSize_; +void HwDebugManager::mapScratchRing(uint64_t* scratchRingAddr, uint32_t* scratchRingSize) const { + *scratchRingAddr = reinterpret_cast(scratchRingAddr_); + *scratchRingSize = scratchRingSize_; } -void -HwDebugManager::setExceptionPolicy(void* exceptionPolicy) -{ - memcpy(&excpPolicy_, exceptionPolicy, sizeof(cl_dbg_exception_policy_amd)); +void HwDebugManager::setExceptionPolicy(void* exceptionPolicy) { + memcpy(&excpPolicy_, exceptionPolicy, sizeof(cl_dbg_exception_policy_amd)); } -void -HwDebugManager::getExceptionPolicy(void* exceptionPolicy) const -{ - memcpy(exceptionPolicy, &excpPolicy_, sizeof(cl_dbg_exception_policy_amd)); +void HwDebugManager::getExceptionPolicy(void* exceptionPolicy) const { + memcpy(exceptionPolicy, &excpPolicy_, sizeof(cl_dbg_exception_policy_amd)); } -void -HwDebugManager::setKernelExecutionMode(void* mode) -{ - cl_dbg_kernel_exec_mode_amd* execMode = reinterpret_cast(mode); - execMode_.ui32All = execMode->ui32All; +void HwDebugManager::setKernelExecutionMode(void* mode) { + cl_dbg_kernel_exec_mode_amd* execMode = reinterpret_cast(mode); + execMode_.ui32All = execMode->ui32All; } -void -HwDebugManager::getKernelExecutionMode(void* mode) const -{ - cl_dbg_kernel_exec_mode_amd* execMode = reinterpret_cast(mode); - execMode->ui32All = execMode_.ui32All; +void HwDebugManager::getKernelExecutionMode(void* mode) const { + cl_dbg_kernel_exec_mode_amd* execMode = reinterpret_cast(mode); + execMode->ui32All = execMode_.ui32All; } -void -HwDebugManager::setAclBinary(void* aclBinary) -{ - aclBinary_ = aclBinary; +void HwDebugManager::setAclBinary(void* aclBinary) { aclBinary_ = aclBinary; } + +void HwDebugManager::allocParamMemList(uint32_t numParams) { + if (NULL != paramMemory_) { + delete[] paramMemory_; + } + + 
numParams_ = numParams; + paramMemory_ = new amd::Memory*[numParams]; } -void -HwDebugManager::allocParamMemList(uint32_t numParams) -{ - if (NULL != paramMemory_) { - delete [] paramMemory_; - } +cl_mem HwDebugManager::getKernelParamMem(uint32_t paramIdx) const { + assert((paramIdx < numParams_) && "Invalid kernel parameter index too big"); - numParams_ = numParams; - paramMemory_ = new amd::Memory*[numParams]; + return as_cl(paramMemory_[paramIdx]); } -cl_mem -HwDebugManager::getKernelParamMem(uint32_t paramIdx) const -{ - assert((paramIdx < numParams_) && "Invalid kernel parameter index too big"); +void HwDebugManager::assignKernelParamMem(uint32_t paramIdx, amd::Memory* mem) { + assert((paramIdx < numParams_) && "Invalid kernel parameter index too big"); - return as_cl(paramMemory_[paramIdx]); + paramMemory_[paramIdx] = mem; } -void -HwDebugManager::assignKernelParamMem(uint32_t paramIdx, amd::Memory* mem) -{ - assert((paramIdx < numParams_) && "Invalid kernel parameter index too big"); - - paramMemory_[paramIdx] = mem; -} - -void -HwDebugManager::installTrap(cl_dbg_trap_type_amd trapType, - amd::Memory* trapHandler, - amd::Memory* trapBuffer) -{ - rtTrapInfo_[trapType<<2] = trapHandler; - rtTrapInfo_[(trapType<<2)+1] = trapBuffer; +void HwDebugManager::installTrap(cl_dbg_trap_type_amd trapType, amd::Memory* trapHandler, + amd::Memory* trapBuffer) { + rtTrapInfo_[trapType << 2] = trapHandler; + rtTrapInfo_[(trapType << 2) + 1] = trapBuffer; } -} // namespace amd +} // namespace amd diff --git a/rocclr/runtime/device/hwdebug.hpp b/rocclr/runtime/device/hwdebug.hpp index ef1032b471..9344ffc797 100644 --- a/rocclr/runtime/device/hwdebug.hpp +++ b/rocclr/runtime/device/hwdebug.hpp @@ -11,17 +11,14 @@ static const int TbaStartOffset = 256; static const int RtTrapBufferWaveSize = 64; -static const int RtTrapBufferSeNum = 4; -static const int RtTrapBufferShNum = 2; -static const int RtTrapBufferCuNum = 16; -static const int RtTrapBufferSimdNum = 4; -static const int 
RtTrapBufferWaveNum = 16; +static const int RtTrapBufferSeNum = 4; +static const int RtTrapBufferShNum = 2; +static const int RtTrapBufferCuNum = 16; +static const int RtTrapBufferSimdNum = 4; +static const int RtTrapBufferWaveNum = 16; static const int RtTrapBufferTotalWaveNum = - ((RtTrapBufferSeNum) * \ - (RtTrapBufferShNum) * \ - (RtTrapBufferCuNum) * \ - (RtTrapBufferSimdNum) * \ - (RtTrapBufferWaveNum)); + ((RtTrapBufferSeNum) * (RtTrapBufferShNum) * (RtTrapBufferCuNum) * (RtTrapBufferSimdNum) * + (RtTrapBufferWaveNum)); /*! \brief Debug trap handler location in the runtime trap buffer @@ -29,11 +26,10 @@ static const int RtTrapBufferTotalWaveNum = * This enumeration is used to indicate the location where the debug * trap handler and debug trap buffer are set in the device trap buffer. */ -enum DebugTrapLocation -{ - kDebugTrapHandlerLocation = 0, //! Debug Trap handler location, this location must be 0 - kDebugTrapBufferLocation = 1, //! Debug Trap buffer location, this location must be 1 - kDebugTrapLocationMax = 2 +enum DebugTrapLocation { + kDebugTrapHandlerLocation = 0, //! Debug Trap handler location, this location must be 0 + kDebugTrapBufferLocation = 1, //! Debug Trap buffer location, this location must be 1 + kDebugTrapLocationMax = 2 }; @@ -42,10 +38,9 @@ enum DebugTrapLocation * Contains the memory descriptor information of the scratch memory and the global * memory */ -struct DispatchDebugInfo -{ - uint32_t scratchMemoryDescriptor_[4]; //! Scratch memory descriptor - uint32_t globalMemoryDescriptor_[4]; //! Global memory descriptor +struct DispatchDebugInfo { + uint32_t scratchMemoryDescriptor_[4]; //! Scratch memory descriptor + uint32_t globalMemoryDescriptor_[4]; //! Global memory descriptor }; /*! \brief Trap handler descriptor @@ -53,8 +48,8 @@ struct DispatchDebugInfo * The trap handler descriptor contains the details of a given trap handler. 
*/ struct TrapHandlerInfo { - amd::Memory* trapHandler_; //!< Device memory for the trap handler - amd::Memory* trapBuffer_; //!< Device memory for the trap buffer + amd::Memory* trapHandler_; //!< Device memory for the trap handler + amd::Memory* trapBuffer_; //!< Device memory for the trap buffer }; /*! \brief Structure of the runtime trap handler buffer, which includes the following @@ -62,10 +57,10 @@ struct TrapHandlerInfo { * the level-2 trap handlers and buffers. */ struct RuntimeTrapInfo { - TrapHandlerInfo trap_; //!< Structure of the address of all trap handlers - uint32_t dispatchId_; //!< Dispatch ID that signals the shader event - uint32_t vgpr_backup_[RtTrapBufferTotalWaveNum][RtTrapBufferWaveSize]; - //!< Buffer to backup the VGPR used by the runtime trap handler + TrapHandlerInfo trap_; //!< Structure of the address of all trap handlers + uint32_t dispatchId_; //!< Dispatch ID that signals the shader event + uint32_t vgpr_backup_[RtTrapBufferTotalWaveNum][RtTrapBufferWaveSize]; + //!< Buffer to backup the VGPR used by the runtime trap handler }; /** @@ -85,188 +80,177 @@ class HostQueue; * * \brief The device interface class for the hardware debug manager */ -class HwDebugManager -{ -public: +class HwDebugManager { + public: + //! Constructor for the Hardware Debug Manager + HwDebugManager(amd::Device* device); - //! Constructor for the Hardware Debug Manager - HwDebugManager(amd::Device* device); + //! Destructor for Hardware Debug Manager + virtual ~HwDebugManager(); - //! Destructor for Hardware Debug Manager - virtual ~HwDebugManager(); + //! Setup the call back function pointer + void setCallBackFunctions(cl_PreDispatchCallBackFunctionAMD preDispatchFn, + cl_PostDispatchCallBackFunctionAMD postDispatchFn); - //! Setup the call back function pointer - void setCallBackFunctions(cl_PreDispatchCallBackFunctionAMD preDispatchFn, - cl_PostDispatchCallBackFunctionAMD postDispatchFn); + //! 
Setup the call back argument pointers + void setCallBackArguments(void* preDispatchArgs, void* postDispatchArgs); - //! Setup the call back argument pointers - void setCallBackArguments(void* preDispatchArgs, void* postDispatchArgs); + //! Get dispatch debug info + void getDispatchDebugInfo(void* debugInfo) const; - //! Get dispatch debug info - void getDispatchDebugInfo(void* debugInfo) const; + //! Set the kernel code address and its size + void setKernelCodeInfo(address aqlCodeAddr, uint32_t aqlCodeSize); - //! Set the kernel code address and its size - void setKernelCodeInfo(address aqlCodeAddr, uint32_t aqlCodeSize); + //! Get the scratch ring + void setScratchRing(address scratchRingAddr, uint32_t scratchRingSize); - //! Get the scratch ring - void setScratchRing(address scratchRingAddr, uint32_t scratchRingSize); + //! Map the scratch ring for host access + void mapScratchRing(uint64_t* scratchRingAddr, uint32_t* scratchRingSize) const; - //! Map the scratch ring for host access - void mapScratchRing(uint64_t* scratchRingAddr, uint32_t* scratchRingSize) const; + //! Retrieve the pre-dispatch callback function + cl_PreDispatchCallBackFunctionAMD preDispatchCallBackFunc() const { + return preDispatchCallBackFunc_; + } - //! Retrieve the pre-dispatch callback function - cl_PreDispatchCallBackFunctionAMD preDispatchCallBackFunc() const - { return preDispatchCallBackFunc_; } + //! Retrieve the post-dispatch callback function + cl_PostDispatchCallBackFunctionAMD postDispatchCallBackFunc() const { + return postDispatchCallBackFunc_; + } - //! Retrieve the post-dispatch callback function - cl_PostDispatchCallBackFunctionAMD postDispatchCallBackFunc() const - { return postDispatchCallBackFunc_; } + //! Retrieve the pre-dispatch callback function arguments + void* preDispatchCallBackArgs() const { return preDispatchCallBackArgs_; } - //! 
Retrieve the pre-dispatch callback function arguments - void* preDispatchCallBackArgs() const { return preDispatchCallBackArgs_; } + //! Retrieve the post-dispatch callback function arguments + void* postDispatchCallBackArgs() const { return postDispatchCallBackArgs_; } - //! Retrieve the post-dispatch callback function arguments - void* postDispatchCallBackArgs() const { return postDispatchCallBackArgs_; } + //! Retrieve the memory pointer of the runtime trap handler code + device::Memory* runtimeTBA() const { return runtimeTBA_; } - //! Retrieve the memory pointer of the runtime trap handler code - device::Memory* runtimeTBA() const { return runtimeTBA_; } + //! Retrieve the memory pointer of the runtime trap handler buffer + device::Memory* runtimeTMA() const { return runtimeTMA_; } - //! Retrieve the memory pointer of the runtime trap handler buffer - device::Memory* runtimeTMA() const { return runtimeTMA_; } + //! Set exception policy + void setExceptionPolicy(void* policy); - //! Set exception policy - void setExceptionPolicy(void* policy); + //! Get exception policy + void getExceptionPolicy(void* policy) const; - //! Get exception policy - void getExceptionPolicy(void* policy) const; + //! Set the kernel execution mode + void setKernelExecutionMode(void* mode); - //! Set the kernel execution mode - void setKernelExecutionMode(void* mode); + //! Get the kernel execution mode + void getKernelExecutionMode(void* mode) const; - //! Get the kernel execution mode - void getKernelExecutionMode(void* mode) const; + //! Setup the pointer to the aclBinary within the debug manager + void setAclBinary(void* aclBinary); - //! Setup the pointer to the aclBinary within the debug manager - void setAclBinary(void* aclBinary); + //! Allocate storage to keep the memory pointers of the kernel parameters + void allocParamMemList(uint32_t numParams); - //! Allocate storage to keep the memory pointers of the kernel parameters - void allocParamMemList(uint32_t numParams); + //! 
Assign the kernel parameter memory + void assignKernelParamMem(uint32_t paramIdx, amd::Memory* mem); - //! Assign the kernel parameter memory - void assignKernelParamMem(uint32_t paramIdx, amd::Memory* mem); + //! Get kernel parameter memory object + cl_mem getKernelParamMem(uint32_t paramIdx) const; - //! Get kernel parameter memory object - cl_mem getKernelParamMem(uint32_t paramIdx) const; + //! Install trap handler + void installTrap(cl_dbg_trap_type_amd trapType, amd::Memory* pTrapHandler, + amd::Memory* pTrapBuffer); - //! Install trap handler - void installTrap(cl_dbg_trap_type_amd trapType, - amd::Memory* pTrapHandler, - amd::Memory* pTrapBuffer); + //! Flush cache + virtual void flushCache(uint32_t mask) = 0; - //! Flush cache - virtual void flushCache(uint32_t mask) = 0; + //! Create the debug event + virtual DebugEvent createDebugEvent(const bool autoReset) = 0; - //! Create the debug event - virtual DebugEvent createDebugEvent(const bool autoReset) = 0; + //! Wait for the debug event + virtual cl_int waitDebugEvent(DebugEvent pEvent, uint32_t timeOut) const = 0; - //! Wait for the debug event - virtual cl_int waitDebugEvent(DebugEvent pEvent, uint32_t timeOut) const = 0; + //! Destroy the debug event + virtual void destroyDebugEvent(DebugEvent* pEvent) = 0; - //! Destroy the debug event - virtual void destroyDebugEvent(DebugEvent* pEvent) = 0; + //! Register the debugger + virtual cl_int registerDebugger(amd::Context* context, uintptr_t pMessageStorage) = 0; - //! Register the debugger - virtual cl_int registerDebugger(amd::Context* context, uintptr_t pMessageStorage) = 0; + //! Unregister the debugger + virtual void unregisterDebugger() = 0; - //! Unregister the debugger - virtual void unregisterDebugger() = 0; + //! Send the wavefront control cmmand + virtual void wavefrontControl(uint32_t waveAction, uint32_t waveMode, uint32_t trapId, + void* waveAddr) const = 0; - //! 
Send the wavefront control cmmand - virtual void wavefrontControl(uint32_t waveAction, - uint32_t waveMode, - uint32_t trapId, - void* waveAddr) const = 0; + //! Set address watching point + virtual void setAddressWatch(uint32_t numWatchPoints, void** watchAddress, uint64_t* watchMask, + uint64_t* watchMode, DebugEvent* event) = 0; - //! Set address watching point - virtual void setAddressWatch(uint32_t numWatchPoints, - void** watchAddress, - uint64_t* watchMask, - uint64_t* watchMode, - DebugEvent* event) = 0; + //! Map the shader (AQL code) for host access + virtual void mapKernelCode(void* aqlCodeInfo) const = 0; - //! Map the shader (AQL code) for host access - virtual void mapKernelCode(void* aqlCodeInfo) const = 0; + //! Get the packet information for dispatch + virtual void getPacketAmdInfo(const void* aqlCodeInfo, void* packetInfo) const = 0; - //! Get the packet information for dispatch - virtual void getPacketAmdInfo(const void* aqlCodeInfo, - void* packetInfo) const = 0; + //! Set global memory values + virtual void setGlobalMemory(amd::Memory* memObj, uint32_t offset, void* srcPtr, + uint32_t size) = 0; - //! Set global memory values - virtual void setGlobalMemory(amd::Memory* memObj, - uint32_t offset, - void* srcPtr, - uint32_t size) = 0; + //! Execute the post-dispatch callback function + virtual void executePostDispatchCallBack() = 0; - //! Execute the post-dispatch callback function - virtual void executePostDispatchCallBack() = 0; + //! Execute the pre-dispatch callback function + virtual void executePreDispatchCallBack(void* aqlPacket, void* toolInfo) = 0; - //! Execute the pre-dispatch callback function - virtual void executePreDispatchCallBack(void* aqlPacket, - void* toolInfo) = 0; + protected: + //! Return the context + const amd::Context* context() const { return context_; } -protected: - //! Return the context - const amd::Context* context() const { return context_; } + //! 
Get the debug device + const amd::Device* device() const { return device_; } - //! Get the debug device - const amd::Device* device() const { return device_; } + //! Return the register flag + bool isRegistered() const { return isRegistered_; } - //! Return the register flag - bool isRegistered() const { return isRegistered_; } + protected: + const amd::Context* context_; ///< context that used to create host queue for the debugger + amd::Device* device_; ///< Device to run the debugger -protected: + cl_PreDispatchCallBackFunctionAMD preDispatchCallBackFunc_; //!< pre-dispatch callback function + cl_PostDispatchCallBackFunctionAMD + postDispatchCallBackFunc_; //!< post-dispatch callback function + void* preDispatchCallBackArgs_; //!< pre-dispatch callback function arguments + void* postDispatchCallBackArgs_; //!< post-dispatch callback function arguments - const amd::Context* context_; ///< context that used to create host queue for the debugger - amd::Device* device_; ///< Device to run the debugger + DispatchDebugInfo debugInfo_; //!< Debug setting/information for kernel dispatch + amd::Memory* rtTrapInfo_[kDebugTrapLocationMax]; //!< Device trap buffer, to store various trap + //!handlers on the device - cl_PreDispatchCallBackFunctionAMD preDispatchCallBackFunc_; //!< pre-dispatch callback function - cl_PostDispatchCallBackFunctionAMD postDispatchCallBackFunc_; //!< post-dispatch callback function - void* preDispatchCallBackArgs_; //!< pre-dispatch callback function arguments - void* postDispatchCallBackArgs_; //!< post-dispatch callback function arguments + amd::Memory** paramMemory_; //!< list of memory pointers for kernel parameters + uint32_t numParams_; //!< number of kernel parameters - DispatchDebugInfo debugInfo_; //!< Debug setting/information for kernel dispatch - amd::Memory* rtTrapInfo_[kDebugTrapLocationMax]; //!< Device trap buffer, to store various trap handlers on the device + void* aclBinary_; //!< ACL binary - amd::Memory** paramMemory_; //!< 
list of memory pointers for kernel parameters - uint32_t numParams_; //!< number of kernel parameters + address aqlCodeAddr_; //!< The mapped AQL code to allow host access + uint32_t aqlCodeSize_; //!< The size of the AQL code info - void* aclBinary_; //!< ACL binary + address scratchRingAddr_; //!< The mapped address of the scratch buffer + uint32_t scratchRingSize_; //!< The size of the scratch ring - address aqlCodeAddr_; //!< The mapped AQL code to allow host access - uint32_t aqlCodeSize_; //!< The size of the AQL code info + bool isRegistered_; //! flag to indicate the debugger has been registered - address scratchRingAddr_; //!< The mapped address of the scratch buffer - uint32_t scratchRingSize_; //!< The size of the scratch ring - - bool isRegistered_; //! flag to indicate the debugger has been registered - - cl_dbg_exception_policy_amd excpPolicy_; //!< exception policy - cl_dbg_kernel_exec_mode_amd execMode_; //!< kernel execution mode - RuntimeTrapInfo rtTrapHandlerInfo_; //!< Runtime trap information - - //! Runtime Trap handler pointer (TBA) & its buffer (TMA) - device::Memory* runtimeTBA_; //! runtime trap handler pointer - device::Memory* runtimeTMA_; //! runtime trap handler buffer + cl_dbg_exception_policy_amd excpPolicy_; //!< exception policy + cl_dbg_kernel_exec_mode_amd execMode_; //!< kernel execution mode + RuntimeTrapInfo rtTrapHandlerInfo_; //!< Runtime trap information + //! Runtime Trap handler pointer (TBA) & its buffer (TMA) + device::Memory* runtimeTBA_; //! runtime trap handler pointer + device::Memory* runtimeTMA_; //! 
runtime trap handler buffer }; - /**@}*/ /** * @} */ -} // namespace amd +} // namespace amd #endif // HWDEBUG_H_ diff --git a/rocclr/runtime/device/pal/palappprofile.cpp b/rocclr/runtime/device/pal/palappprofile.cpp index b5a7e40a6c..84bc9e4722 100644 --- a/rocclr/runtime/device/pal/palappprofile.cpp +++ b/rocclr/runtime/device/pal/palappprofile.cpp @@ -10,16 +10,11 @@ namespace pal { AppProfile::AppProfile() - : amd::AppProfile() - , enableHighPerformanceState_(true) - , reportAsOCL12Device_(false) -{ - propertyDataMap_.insert(DataMap::value_type("HighPerfState", - PropertyData(DataType_Boolean, &enableHighPerformanceState_))); + : amd::AppProfile(), enableHighPerformanceState_(true), reportAsOCL12Device_(false) { + propertyDataMap_.insert(DataMap::value_type( + "HighPerfState", PropertyData(DataType_Boolean, &enableHighPerformanceState_))); - propertyDataMap_.insert(DataMap::value_type("OCL12Device", - PropertyData(DataType_Boolean, &reportAsOCL12Device_))); + propertyDataMap_.insert( + DataMap::value_type("OCL12Device", PropertyData(DataType_Boolean, &reportAsOCL12Device_))); } - } - diff --git a/rocclr/runtime/device/pal/palappprofile.hpp b/rocclr/runtime/device/pal/palappprofile.hpp index 6f079b34c8..a337517cd6 100644 --- a/rocclr/runtime/device/pal/palappprofile.hpp +++ b/rocclr/runtime/device/pal/palappprofile.hpp @@ -8,19 +8,16 @@ namespace pal { -class AppProfile : public amd::AppProfile -{ -public: - AppProfile(); +class AppProfile : public amd::AppProfile { + public: + AppProfile(); - //! return the value of enableHighPerformanceState_ - bool enableHighPerformanceState() const { return enableHighPerformanceState_; } - bool reportAsOCL12Device() const { return reportAsOCL12Device_; } + //! 
return the value of enableHighPerformanceState_ + bool enableHighPerformanceState() const { return enableHighPerformanceState_; } + bool reportAsOCL12Device() const { return reportAsOCL12Device_; } -private: - - bool enableHighPerformanceState_; - bool reportAsOCL12Device_; + private: + bool enableHighPerformanceState_; + bool reportAsOCL12Device_; }; - } diff --git a/rocclr/runtime/device/pal/palbinary.cpp b/rocclr/runtime/device/pal/palbinary.cpp index 0ceca32b3c..a158d4f08f 100644 --- a/rocclr/runtime/device/pal/palbinary.cpp +++ b/rocclr/runtime/device/pal/palbinary.cpp @@ -1,7 +1,4 @@ // // Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. // -namespace pal { - - -} // namespace pal +namespace pal {} // namespace pal diff --git a/rocclr/runtime/device/pal/palbinary.hpp b/rocclr/runtime/device/pal/palbinary.hpp index 39da3da2b0..fe96637745 100644 --- a/rocclr/runtime/device/pal/palbinary.hpp +++ b/rocclr/runtime/device/pal/palbinary.hpp @@ -9,37 +9,33 @@ namespace pal { -class ClBinaryHsa : public device::ClBinary -{ -public: - ClBinaryHsa(const Device& dev, BinaryImageFormat bifVer = BIF_VERSION3) - : device::ClBinary(dev, bifVer) - {} +class ClBinaryHsa : public device::ClBinary { + public: + ClBinaryHsa(const Device& dev, BinaryImageFormat bifVer = BIF_VERSION3) + : device::ClBinary(dev, bifVer) {} - //! Destructor - ~ClBinaryHsa() {} + //! 
Destructor + ~ClBinaryHsa() {} -protected: - bool setElfTarget() { - uint32_t target = static_cast(21);//dev().calTarget()); - assert (((0xFFFF8000 & target) == 0) && "ASIC target ID >= 2^15"); - uint16_t elf_target = (uint16_t)(0x7FFF & target); - return elfOut()->setTarget(elf_target, amd::OclElf::CAL_PLATFORM); - return true; - } + protected: + bool setElfTarget() { + uint32_t target = static_cast(21); // dev().calTarget()); + assert(((0xFFFF8000 & target) == 0) && "ASIC target ID >= 2^15"); + uint16_t elf_target = (uint16_t)(0x7FFF & target); + return elfOut()->setTarget(elf_target, amd::OclElf::CAL_PLATFORM); + return true; + } -private: - //! Disable default copy constructor - ClBinaryHsa(const ClBinaryHsa&); + private: + //! Disable default copy constructor + ClBinaryHsa(const ClBinaryHsa&); - //! Disable default operator= - ClBinaryHsa& operator=(const ClBinaryHsa&); - - //! Returns the HSA device for this object - const Device& dev() const { return static_cast(dev_); } + //! Disable default operator= + ClBinaryHsa& operator=(const ClBinaryHsa&); + //! 
Returns the HSA device for this object + const Device& dev() const { return static_cast(dev_); } }; -} // namespace pal - +} // namespace pal diff --git a/rocclr/runtime/device/pal/palblit.cpp b/rocclr/runtime/device/pal/palblit.cpp index 33a39eef04..ecf3390fb0 100644 --- a/rocclr/runtime/device/pal/palblit.cpp +++ b/rocclr/runtime/device/pal/palblit.cpp @@ -12,2779 +12,2387 @@ namespace pal { DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup) - : HostBlitManager(gpu, setup) - , MinSizeForPinnedTransfer(dev().settings().pinnedMinXferSize_) - , completeOperation_(false) - , context_(NULL) -{ + : HostBlitManager(gpu, setup), + MinSizeForPinnedTransfer(dev().settings().pinnedMinXferSize_), + completeOperation_(false), + context_(NULL) {} + +inline void DmaBlitManager::synchronize() const { + if (syncOperation_) { + gpu().releaseMemObjects(); + gpu().waitAllEngines(); + } } -inline void -DmaBlitManager::synchronize() const -{ - if (syncOperation_) { - gpu().releaseMemObjects(); - gpu().waitAllEngines(); - } +inline Memory& DmaBlitManager::gpuMem(device::Memory& mem) const { + return static_cast(mem); } -inline Memory& -DmaBlitManager::gpuMem(device::Memory& mem) const -{ - return static_cast(mem); -} +bool DmaBlitManager::readMemoryStaged(Memory& srcMemory, void* dstHost, Memory** xferBuf, + size_t origin, size_t& offset, size_t& totalSize, + size_t xferSize) const { + amd::Coord3D dst(0, 0, 0); + size_t tmpSize; + uint idxWrite = 0; + uint idxRead = 0; + size_t chunkSize; + static const bool CopyRect = false; + // Flush DMA for ASYNC copy + static const bool FlushDMA = true; -bool -DmaBlitManager::readMemoryStaged( - Memory& srcMemory, - void* dstHost, - Memory** xferBuf, - size_t origin, - size_t& offset, - size_t& totalSize, - size_t xferSize) const -{ - amd::Coord3D dst(0, 0, 0); - size_t tmpSize; - uint idxWrite = 0; - uint idxRead = 0; - size_t chunkSize; - static const bool CopyRect = false; - // Flush DMA for ASYNC copy - static const bool FlushDMA = 
true; + if (dev().xferRead().bufSize() < 128 * Ki) { + chunkSize = dev().xferRead().bufSize(); + } else { + chunkSize = std::min(amd::alignUp(xferSize / 4, 256), dev().xferRead().bufSize()); + chunkSize = std::max(chunkSize, 128 * Ki); + } - if (dev().xferRead().bufSize() < 128 * Ki) { - chunkSize = dev().xferRead().bufSize(); - } - else { - chunkSize = std::min(amd::alignUp(xferSize / 4, 256), - dev().xferRead().bufSize()); - chunkSize = std::max(chunkSize, 128 * Ki); - } + // Find the partial transfer size + tmpSize = std::min(chunkSize, xferSize); + amd::Coord3D srcLast(origin + offset, 0, 0); + amd::Coord3D copySizeLast(tmpSize, 0, 0); + + // Copy data into the temporary surface + if (!srcMemory.partialMemCopyTo(gpu(), srcLast, dst, copySizeLast, *xferBuf[idxWrite], CopyRect, + FlushDMA)) { + return false; + } + + totalSize -= tmpSize; + xferSize -= tmpSize; + offset += tmpSize; + + while (xferSize != 0) { // Find the partial transfer size tmpSize = std::min(chunkSize, xferSize); - amd::Coord3D srcLast(origin + offset, 0, 0); - amd::Coord3D copySizeLast(tmpSize, 0, 0); + amd::Coord3D src(origin + offset, 0, 0); + amd::Coord3D copySize(tmpSize, 0, 0); + idxWrite = (idxWrite + 1) % 2; // Copy data into the temporary surface - if (!srcMemory.partialMemCopyTo(gpu(), srcLast, dst, copySizeLast, - *xferBuf[idxWrite], CopyRect, FlushDMA)) { - return false; + if (!srcMemory.partialMemCopyTo(gpu(), src, dst, copySize, *xferBuf[idxWrite], CopyRect, + FlushDMA)) { + return false; } + // Read previous buffer + if (!xferBuf[idxRead]->hostRead(&gpu(), + reinterpret_cast(dstHost) + offset - copySizeLast[0], + dst, copySizeLast)) { + return false; + } + idxRead = (idxRead + 1) % 2; + copySizeLast = copySize; + totalSize -= tmpSize; xferSize -= tmpSize; offset += tmpSize; + } - while (xferSize != 0) { - // Find the partial transfer size - tmpSize = std::min(chunkSize, xferSize); + // Last read + if (!xferBuf[idxRead]->hostRead( + &gpu(), reinterpret_cast(dstHost) + offset - 
copySizeLast[0], dst, copySizeLast)) { + return false; + } + return true; +} - amd::Coord3D src(origin + offset, 0, 0); - amd::Coord3D copySize(tmpSize, 0, 0); +bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, + const amd::Coord3D& origin, const amd::Coord3D& size, + bool entire) const { + // Use host copy if memory has direct access + if (setup_.disableReadBuffer_ || + (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { + return HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire); + } else { + size_t srcSize = size[0]; + size_t offset = 0; + size_t pinSize = dev().settings().pinnedXferSize_; + pinSize = std::min(pinSize, srcSize); - idxWrite = (idxWrite + 1) % 2; - // Copy data into the temporary surface - if (!srcMemory.partialMemCopyTo(gpu(), src, dst, copySize, - *xferBuf[idxWrite], CopyRect, FlushDMA)) { - return false; + // Check if a pinned transfer can be executed + if (pinSize && (srcSize > MinSizeForPinnedTransfer)) { + // Allign offset to 4K boundary (Vista/Win7 limitation) + char* tmpHost = const_cast( + amd::alignDown(reinterpret_cast(dstHost), PinnedMemoryAlignment)); + + // Find the partial size for unaligned copy + size_t partial = reinterpret_cast(dstHost) - tmpHost; + + amd::Memory* pinned = NULL; + bool first = true; + size_t tmpSize; + size_t pinAllocSize; + + // Copy memory, using pinning + while (srcSize > 0) { + // If it's the first iterarion, then readjust the copy size + // to include alignment + if (first) { + pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment); + tmpSize = std::min(pinAllocSize - partial, srcSize); + first = false; + } else { + tmpSize = std::min(pinSize, srcSize); + pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment); + partial = 0; } + amd::Coord3D dst(partial, 0, 0); + amd::Coord3D srcPin(origin[0] + offset, 0, 0); + amd::Coord3D copySizePin(tmpSize, 0, 0); + size_t partial2; - // Read previous buffer - if 
(!xferBuf[idxRead]->hostRead(&gpu(), - reinterpret_cast(dstHost) + offset - copySizeLast[0], - dst, copySizeLast)) { - return false; + // Allocate a GPU resource for pinning + pinned = pinHostMemory(tmpHost, pinAllocSize, partial2); + + if (pinned != NULL) { + // Get device memory for this virtual device + Memory* dstMemory = dev().getGpuMemory(pinned); + + if (!gpuMem(srcMemory).partialMemCopyTo(gpu(), srcPin, dst, copySizePin, *dstMemory)) { + LogWarning("DmaBlitManager::readBuffer failed a pinned copy!"); + gpu().addPinnedMem(pinned); + break; + } + gpu().addPinnedMem(pinned); + } else { + LogWarning("DmaBlitManager::readBuffer failed to pin a resource!"); + break; } - idxRead = (idxRead + 1) % 2; - copySizeLast = copySize; - - totalSize -= tmpSize; - xferSize -= tmpSize; + srcSize -= tmpSize; offset += tmpSize; + tmpHost = reinterpret_cast(tmpHost) + tmpSize + partial; + } } - // Last read - if (!xferBuf[idxRead]->hostRead(&gpu(), - reinterpret_cast(dstHost) + offset - copySizeLast[0], dst, copySizeLast)) { + if (0 != srcSize) { + Memory& xferBuf0 = dev().xferRead().acquire(); + Memory& xferBuf1 = dev().xferRead().acquire(); + Memory* xferBuf[2] = {&xferBuf0, &xferBuf1}; + + // Read memory using a staged resource + if (!readMemoryStaged(gpuMem(srcMemory), dstHost, xferBuf, origin[0], offset, srcSize, + srcSize)) { + LogError("DmaBlitManager::readBuffer failed!"); return false; + } + + dev().xferRead().release(gpu(), xferBuf1); + dev().xferRead().release(gpu(), xferBuf0); } - return true; + } + + return true; } -bool -DmaBlitManager::readBuffer( - device::Memory& srcMemory, - void* dstHost, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire) const -{ - // Use host copy if memory has direct access - if (setup_.disableReadBuffer_ || - (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { - return HostBlitManager::readBuffer( - srcMemory, dstHost, origin, size, entire); - } - else { - size_t srcSize = size[0]; - 
size_t offset = 0; - size_t pinSize = dev().settings().pinnedXferSize_; - pinSize = std::min(pinSize, srcSize); +bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost, + const amd::BufferRect& bufRect, const amd::BufferRect& hostRect, + const amd::Coord3D& size, bool entire) const { + // Use host copy if memory has direct access + if (setup_.disableReadBufferRect_ || + (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { + return HostBlitManager::readBufferRect(srcMemory, dstHost, bufRect, hostRect, size, entire); + } else { + Memory& xferBuf = dev().xferRead().acquire(); - // Check if a pinned transfer can be executed - if (pinSize && (srcSize > MinSizeForPinnedTransfer)) { - // Allign offset to 4K boundary (Vista/Win7 limitation) - char* tmpHost = const_cast( - amd::alignDown(reinterpret_cast(dstHost), - PinnedMemoryAlignment)); + amd::Coord3D dst(0, 0, 0); + size_t tmpSize = 0; + size_t bufOffset; + size_t hostOffset; + size_t srcSize; - // Find the partial size for unaligned copy - size_t partial = reinterpret_cast(dstHost) - tmpHost; + for (size_t z = 0; z < size[2]; ++z) { + for (size_t y = 0; y < size[1]; ++y) { + srcSize = size[0]; + bufOffset = bufRect.offset(0, y, z); + hostOffset = hostRect.offset(0, y, z); - amd::Memory* pinned = NULL; - bool first = true; - size_t tmpSize; - size_t pinAllocSize; + while (srcSize != 0) { + // Find the partial transfer size + tmpSize = std::min(dev().xferRead().bufSize(), srcSize); - // Copy memory, using pinning - while (srcSize > 0) { - // If it's the first iterarion, then readjust the copy size - // to include alignment - if (first) { - pinAllocSize = amd::alignUp(pinSize + partial, - PinnedMemoryAlignment); - tmpSize = std::min(pinAllocSize - partial, srcSize); - first = false; - } - else { - tmpSize = std::min(pinSize, srcSize); - pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment); - partial = 0; - } - amd::Coord3D dst(partial, 0, 0); - amd::Coord3D 
srcPin(origin[0] + offset, 0, 0); - amd::Coord3D copySizePin(tmpSize, 0, 0); - size_t partial2; + amd::Coord3D src(bufOffset, 0, 0); + amd::Coord3D copySize(tmpSize, 0, 0); - // Allocate a GPU resource for pinning - pinned = pinHostMemory(tmpHost, pinAllocSize, partial2); + // Copy data into the temporary surface + if (!gpuMem(srcMemory).partialMemCopyTo(gpu(), src, dst, copySize, xferBuf, true)) { + LogError("DmaBlitManager::readBufferRect failed!"); + return false; + } - if (pinned != NULL) { - // Get device memory for this virtual device - Memory* dstMemory = dev().getGpuMemory(pinned); + if (!xferBuf.hostRead(&gpu(), reinterpret_cast(dstHost) + hostOffset, dst, + copySize)) { + LogError("DmaBlitManager::readBufferRect failed!"); + return false; + } - if (!gpuMem(srcMemory).partialMemCopyTo( - gpu(), srcPin, dst, copySizePin, *dstMemory)) { - LogWarning("DmaBlitManager::readBuffer failed a pinned copy!"); - gpu().addPinnedMem(pinned); - break; - } - gpu().addPinnedMem(pinned); - } - else { - LogWarning("DmaBlitManager::readBuffer failed to pin a resource!"); - break; - } - srcSize -= tmpSize; - offset += tmpSize; - tmpHost = reinterpret_cast(tmpHost) + tmpSize + partial; - } + srcSize -= tmpSize; + bufOffset += tmpSize; + hostOffset += tmpSize; } + } + } + dev().xferRead().release(gpu(), xferBuf); + } - if (0 != srcSize) { - Memory& xferBuf0 = dev().xferRead().acquire(); - Memory& xferBuf1 = dev().xferRead().acquire(); - Memory* xferBuf[2] = { &xferBuf0, &xferBuf1 }; + return true; +} - // Read memory using a staged resource - if (!readMemoryStaged(gpuMem(srcMemory), dstHost, xferBuf, origin[0], - offset, srcSize, srcSize)) { - LogError("DmaBlitManager::readBuffer failed!"); - return false; - } +bool DmaBlitManager::readImage(device::Memory& srcMemory, void* dstHost, const amd::Coord3D& origin, + const amd::Coord3D& size, size_t rowPitch, size_t slicePitch, + bool entire) const { + if (setup_.disableReadImage_) { + return HostBlitManager::readImage(srcMemory, 
dstHost, origin, size, rowPitch, slicePitch, + entire); + } else { + //! @todo Add HW accelerated path + return HostBlitManager::readImage(srcMemory, dstHost, origin, size, rowPitch, slicePitch, + entire); + } - dev().xferRead().release(gpu(), xferBuf1); - dev().xferRead().release(gpu(), xferBuf0); + return true; +} + +bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, Memory& xferBuf, + size_t origin, size_t& offset, size_t& totalSize, + size_t xferSize) const { + amd::Coord3D src(0, 0, 0); + size_t tmpSize; + size_t chunkSize; + static const bool CopyRect = false; + // Flush DMA for ASYNC copy + // @todo Blocking write requires a flush to start earlier, + // but currently VDI doesn't provide that info + static const bool FlushDMA = false; + + if (dev().xferRead().bufSize() < 128 * Ki) { + chunkSize = dev().xferWrite().bufSize(); + } else { + chunkSize = std::min(amd::alignUp(xferSize / 4, 256), dev().xferWrite().bufSize()); + chunkSize = std::max(chunkSize, 128 * Ki); + } + + while (xferSize != 0) { + // Find the partial transfer size + tmpSize = std::min(chunkSize, xferSize); + amd::Coord3D dst(origin + offset, 0, 0); + amd::Coord3D copySize(tmpSize, 0, 0); + + // Copy data into the temporary buffer, using CPU + if (!xferBuf.hostWrite(&gpu(), reinterpret_cast(srcHost) + offset, src, copySize, + Resource::Discard)) { + return false; + } + + // Copy data into the original destination memory + if (!xferBuf.partialMemCopyTo(gpu(), src, dst, copySize, dstMemory, CopyRect, FlushDMA)) { + return false; + } + + totalSize -= tmpSize; + offset += tmpSize; + xferSize -= tmpSize; + } + return true; +} + +bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory, + const amd::Coord3D& origin, const amd::Coord3D& size, + bool entire) const { + // Use host copy if memory has direct access or it's persistent + if (setup_.disableWriteBuffer_ || gpuMem(dstMemory).isHostMemDirectAccess() || + 
gpuMem(dstMemory).isPersistentDirectMap()) { + return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire); + } else { + size_t dstSize = size[0]; + size_t tmpSize = 0; + size_t offset = 0; + size_t pinSize = dev().settings().pinnedXferSize_; + pinSize = std::min(pinSize, dstSize); + + // Check if a pinned transfer can be executed + if (pinSize && (dstSize > MinSizeForPinnedTransfer)) { + // Allign offset to 4K boundary (Vista/Win7 limitation) + char* tmpHost = const_cast( + amd::alignDown(reinterpret_cast(srcHost), PinnedMemoryAlignment)); + + // Find the partial size for unaligned copy + size_t partial = reinterpret_cast(srcHost) - tmpHost; + + amd::Memory* pinned = NULL; + bool first = true; + size_t tmpSize; + size_t pinAllocSize; + + // Copy memory, using pinning + while (dstSize > 0) { + // If it's the first iterarion, then readjust the copy size + // to include alignment + if (first) { + pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment); + tmpSize = std::min(pinAllocSize - partial, dstSize); + first = false; + } else { + tmpSize = std::min(pinSize, dstSize); + pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment); + partial = 0; } - } + amd::Coord3D src(partial, 0, 0); + amd::Coord3D dstPin(origin[0] + offset, 0, 0); + amd::Coord3D copySizePin(tmpSize, 0, 0); + size_t partial2; - return true; -} + // Allocate a GPU resource for pinning + pinned = pinHostMemory(tmpHost, pinAllocSize, partial2); -bool -DmaBlitManager::readBufferRect( - device::Memory& srcMemory, - void* dstHost, - const amd::BufferRect& bufRect, - const amd::BufferRect& hostRect, - const amd::Coord3D& size, - bool entire) const -{ - // Use host copy if memory has direct access - if (setup_.disableReadBufferRect_ || - (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { - return HostBlitManager::readBufferRect( - srcMemory, dstHost, bufRect, hostRect, size, entire); - } - else { - Memory& xferBuf = 
dev().xferRead().acquire(); + if (pinned != NULL) { + // Get device memory for this virtual device + Memory* srcMemory = dev().getGpuMemory(pinned); - amd::Coord3D dst(0, 0, 0); - size_t tmpSize = 0; - size_t bufOffset; - size_t hostOffset; - size_t srcSize; - - for (size_t z = 0; z < size[2]; ++z) { - for (size_t y = 0; y < size[1]; ++y) { - srcSize = size[0]; - bufOffset = bufRect.offset(0, y, z); - hostOffset = hostRect.offset(0, y, z); - - while (srcSize != 0) { - // Find the partial transfer size - tmpSize = std::min(dev().xferRead().bufSize(), srcSize); - - amd::Coord3D src(bufOffset, 0, 0); - amd::Coord3D copySize(tmpSize, 0, 0); - - // Copy data into the temporary surface - if (!gpuMem(srcMemory).partialMemCopyTo( - gpu(), src, dst, copySize, xferBuf, true)) { - LogError("DmaBlitManager::readBufferRect failed!"); - return false; - } - - if (!xferBuf.hostRead(&gpu(), - reinterpret_cast(dstHost) + hostOffset, - dst, copySize)) { - LogError("DmaBlitManager::readBufferRect failed!"); - return false; - } - - srcSize -= tmpSize; - bufOffset += tmpSize; - hostOffset += tmpSize; - } - } + if (!srcMemory->partialMemCopyTo(gpu(), src, dstPin, copySizePin, gpuMem(dstMemory))) { + LogWarning("DmaBlitManager::writeBuffer failed a pinned copy!"); + gpu().addPinnedMem(pinned); + break; + } + gpu().addPinnedMem(pinned); + } else { + LogWarning("DmaBlitManager::writeBuffer failed to pin a resource!"); + break; } - dev().xferRead().release(gpu(), xferBuf); + dstSize -= tmpSize; + offset += tmpSize; + tmpHost = reinterpret_cast(tmpHost) + tmpSize + partial; + } } - return true; + if (dstSize != 0) { + Memory& xferBuf = dev().xferWrite().acquire(); + + // Write memory using a staged resource + if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), xferBuf, origin[0], offset, dstSize, + dstSize)) { + LogError("DmaBlitManager::writeBuffer failed!"); + return false; + } + + gpu().addXferWrite(xferBuf); + } + } + + return true; } -bool -DmaBlitManager::readImage( - device::Memory& 
srcMemory, - void* dstHost, - const amd::Coord3D& origin, - const amd::Coord3D& size, - size_t rowPitch, - size_t slicePitch, - bool entire) const -{ - if (setup_.disableReadImage_) { - return HostBlitManager::readImage(srcMemory, dstHost, - origin, size, rowPitch, slicePitch, entire); - } - else { - //! @todo Add HW accelerated path - return HostBlitManager::readImage(srcMemory, dstHost, - origin, size, rowPitch, slicePitch, entire); - } +bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMemory, + const amd::BufferRect& hostRect, + const amd::BufferRect& bufRect, const amd::Coord3D& size, + bool entire) const { + // Use host copy if memory has direct access or it's persistent + if (setup_.disableWriteBufferRect_ || dstMemory.isHostMemDirectAccess() || + gpuMem(dstMemory).isPersistentDirectMap()) { + return HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire); + } else { + Memory& xferBuf = dev().xferWrite().acquire(); - return true; -} - -bool -DmaBlitManager::writeMemoryStaged( - const void* srcHost, - Memory& dstMemory, - Memory& xferBuf, - size_t origin, - size_t& offset, - size_t& totalSize, - size_t xferSize) const -{ amd::Coord3D src(0, 0, 0); - size_t tmpSize; - size_t chunkSize; - static const bool CopyRect = false; - // Flush DMA for ASYNC copy - // @todo Blocking write requires a flush to start earlier, - // but currently VDI doesn't provide that info - static const bool FlushDMA = false; + size_t tmpSize = 0; + size_t bufOffset; + size_t hostOffset; + size_t dstSize; - if (dev().xferRead().bufSize() < 128 * Ki) { - chunkSize = dev().xferWrite().bufSize(); - } - else { - chunkSize = std::min(amd::alignUp(xferSize / 4, 256), - dev().xferWrite().bufSize()); - chunkSize = std::max(chunkSize, 128 * Ki); - } + for (size_t z = 0; z < size[2]; ++z) { + for (size_t y = 0; y < size[1]; ++y) { + dstSize = size[0]; + bufOffset = bufRect.offset(0, y, z); + hostOffset = hostRect.offset(0, y, z); - while 
(xferSize != 0) { - // Find the partial transfer size - tmpSize = std::min(chunkSize, xferSize); - amd::Coord3D dst(origin + offset, 0, 0); - amd::Coord3D copySize(tmpSize, 0, 0); + while (dstSize != 0) { + // Find the partial transfer size + tmpSize = std::min(dev().xferWrite().bufSize(), dstSize); - // Copy data into the temporary buffer, using CPU - if (!xferBuf.hostWrite(&gpu(), - reinterpret_cast(srcHost) + offset, - src, copySize, Resource::Discard)) { + amd::Coord3D dst(bufOffset, 0, 0); + amd::Coord3D copySize(tmpSize, 0, 0); + + // Copy data into the temporary buffer, using CPU + if (!xferBuf.hostWrite(&gpu(), reinterpret_cast(srcHost) + hostOffset, src, + copySize, Resource::Discard)) { + LogError("DmaBlitManager::writeBufferRect failed!"); return false; - } + } - // Copy data into the original destination memory - if (!xferBuf.partialMemCopyTo( - gpu(), src, dst, copySize, dstMemory, CopyRect, FlushDMA)) { + // Copy data into the original destination memory + if (!xferBuf.partialMemCopyTo(gpu(), src, dst, copySize, gpuMem(dstMemory))) { + LogError("DmaBlitManager::writeBufferRect failed!"); return false; - } + } - totalSize -= tmpSize; - offset += tmpSize; - xferSize -= tmpSize; + dstSize -= tmpSize; + bufOffset += tmpSize; + hostOffset += tmpSize; + } + } } - return true; + gpu().addXferWrite(xferBuf); + } + + return true; } -bool -DmaBlitManager::writeBuffer( - const void* srcHost, - device::Memory& dstMemory, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire) const -{ - // Use host copy if memory has direct access or it's persistent - if (setup_.disableWriteBuffer_ || - gpuMem(dstMemory).isHostMemDirectAccess() || - gpuMem(dstMemory).isPersistentDirectMap()) { - return HostBlitManager::writeBuffer( - srcHost, dstMemory, origin, size, entire); - } - else { - size_t dstSize = size[0]; - size_t tmpSize = 0; - size_t offset = 0; - size_t pinSize = dev().settings().pinnedXferSize_; - pinSize = std::min(pinSize, dstSize); +bool 
DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory, + const amd::Coord3D& origin, const amd::Coord3D& size, + size_t rowPitch, size_t slicePitch, bool entire) const { + if (setup_.disableWriteImage_) { + return HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch, + entire); + } else { + //! @todo Add HW accelerated path + return HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch, + entire); + } - // Check if a pinned transfer can be executed - if (pinSize && (dstSize > MinSizeForPinnedTransfer)) { - // Allign offset to 4K boundary (Vista/Win7 limitation) - char* tmpHost = const_cast( - amd::alignDown(reinterpret_cast(srcHost), - PinnedMemoryAlignment)); - - // Find the partial size for unaligned copy - size_t partial = reinterpret_cast(srcHost) - tmpHost; - - amd::Memory* pinned = NULL; - bool first = true; - size_t tmpSize; - size_t pinAllocSize; - - // Copy memory, using pinning - while (dstSize > 0) { - // If it's the first iterarion, then readjust the copy size - // to include alignment - if (first) { - pinAllocSize = amd::alignUp(pinSize + partial, - PinnedMemoryAlignment); - tmpSize = std::min(pinAllocSize - partial, dstSize); - first = false; - } - else { - tmpSize = std::min(pinSize, dstSize); - pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment); - partial = 0; - } - amd::Coord3D src(partial, 0, 0); - amd::Coord3D dstPin(origin[0] + offset, 0, 0); - amd::Coord3D copySizePin(tmpSize, 0, 0); - size_t partial2; - - // Allocate a GPU resource for pinning - pinned = pinHostMemory(tmpHost, pinAllocSize, partial2); - - if (pinned != NULL) { - // Get device memory for this virtual device - Memory* srcMemory = dev().getGpuMemory(pinned); - - if (!srcMemory->partialMemCopyTo( - gpu(), src, dstPin, copySizePin, gpuMem(dstMemory))) { - LogWarning("DmaBlitManager::writeBuffer failed a pinned copy!"); - gpu().addPinnedMem(pinned); - break; - } - gpu().addPinnedMem(pinned); - } 
- else { - LogWarning("DmaBlitManager::writeBuffer failed to pin a resource!"); - break; - } - dstSize -= tmpSize; - offset += tmpSize; - tmpHost = reinterpret_cast(tmpHost) + tmpSize + partial; - } - } - - if (dstSize != 0) { - Memory& xferBuf = dev().xferWrite().acquire(); - - // Write memory using a staged resource - if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), xferBuf, origin[0], - offset, dstSize, dstSize)) { - LogError("DmaBlitManager::writeBuffer failed!"); - return false; - } - - gpu().addXferWrite(xferBuf); - } - } - - return true; + return true; } -bool -DmaBlitManager::writeBufferRect( - const void* srcHost, - device::Memory& dstMemory, - const amd::BufferRect& hostRect, - const amd::BufferRect& bufRect, - const amd::Coord3D& size, - bool entire) const -{ - // Use host copy if memory has direct access or it's persistent - if (setup_.disableWriteBufferRect_ || - dstMemory.isHostMemDirectAccess() || - gpuMem(dstMemory).isPersistentDirectMap()) { - return HostBlitManager::writeBufferRect( - srcHost, dstMemory, hostRect, bufRect, size, entire); - } - else { - Memory& xferBuf = dev().xferWrite().acquire(); +bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, bool entire) const { + if (setup_.disableCopyBuffer_ || + (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable() && + !dev().settings().apuSystem_ && gpuMem(dstMemory).isHostMemDirectAccess())) { + return HostBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size); + } else { + return gpuMem(srcMemory).partialMemCopyTo(gpu(), srcOrigin, dstOrigin, size, gpuMem(dstMemory)); + } - amd::Coord3D src(0, 0, 0); - size_t tmpSize = 0; - size_t bufOffset; - size_t hostOffset; - size_t dstSize; - - for (size_t z = 0; z < size[2]; ++z) { - for (size_t y = 0; y < size[1]; ++y) { - dstSize = size[0]; - bufOffset = bufRect.offset(0, y, z); - 
hostOffset = hostRect.offset(0, y, z); - - while (dstSize != 0) { - // Find the partial transfer size - tmpSize = std::min(dev().xferWrite().bufSize(), dstSize); - - amd::Coord3D dst(bufOffset, 0, 0); - amd::Coord3D copySize(tmpSize, 0, 0); - - // Copy data into the temporary buffer, using CPU - if (!xferBuf.hostWrite(&gpu(), - reinterpret_cast(srcHost) + hostOffset, - src, copySize, Resource::Discard)) { - LogError("DmaBlitManager::writeBufferRect failed!"); - return false; - } - - // Copy data into the original destination memory - if (!xferBuf.partialMemCopyTo( - gpu(), src, dst, copySize, gpuMem(dstMemory))) { - LogError("DmaBlitManager::writeBufferRect failed!"); - return false; - } - - dstSize -= tmpSize; - bufOffset += tmpSize; - hostOffset += tmpSize; - } - } - } - gpu().addXferWrite(xferBuf); - } - - return true; + return true; } -bool -DmaBlitManager::writeImage( - const void* srcHost, - device::Memory& dstMemory, - const amd::Coord3D& origin, - const amd::Coord3D& size, - size_t rowPitch, - size_t slicePitch, - bool entire) const -{ - if (setup_.disableWriteImage_) { - return HostBlitManager::writeImage( - srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); - } - else { - //! 
@todo Add HW accelerated path - return HostBlitManager::writeImage( - srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); +bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::BufferRect& srcRect, const amd::BufferRect& dstRect, + const amd::Coord3D& size, bool entire) const { + if (setup_.disableCopyBufferRect_ || + (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable() && + gpuMem(dstMemory).isHostMemDirectAccess())) { + return HostBlitManager::copyBufferRect(srcMemory, dstMemory, srcRect, dstRect, size, entire); + } else { + size_t srcOffset; + size_t dstOffset; + + uint bytesPerElement = 16; + bool optimalElementSize = false; + bool subWindowRectCopy = true; + + srcOffset = srcRect.offset(0, 0, 0); + dstOffset = dstRect.offset(0, 0, 0); + + while (bytesPerElement >= 1) { + if (((srcOffset % 4) == 0) && ((dstOffset % 4) == 0) && ((size[0] % bytesPerElement) == 0) && + ((srcRect.rowPitch_ % bytesPerElement) == 0) && + ((srcRect.slicePitch_ % bytesPerElement) == 0) && + ((dstRect.rowPitch_ % bytesPerElement) == 0) && + ((dstRect.slicePitch_ % bytesPerElement) == 0)) { + optimalElementSize = true; + break; + } + bytesPerElement = bytesPerElement >> 1; } - return true; + // 19 bit limit in HW in SI and 16 bit limit in CI+ + // (we adjust the ElementSize to 4bytes but the packet still has 14bits) + size_t pitchLimit = (0x3FFF * bytesPerElement) | 0xF; + size_t sizeLimit = (0x3FFF * bytesPerElement) | 0xF; + + if (!optimalElementSize || (srcRect.rowPitch_ > pitchLimit) || + (dstRect.rowPitch_ > pitchLimit) || (size[0] > sizeLimit) || // See above + (size[1] > 0x3fff) || // 14 bits limit in HW + (size[2] > 0x7ff)) { // 11 bits limit in HW + // Restriction with rectLinearDRMDMA packet + subWindowRectCopy = false; + } + + if (subWindowRectCopy) { + // Copy data with subwindow copy packet + if (!gpuMem(srcMemory).partialMemCopyTo( + gpu(), amd::Coord3D(srcOffset, srcRect.rowPitch_, 
srcRect.slicePitch_), + amd::Coord3D(dstOffset, dstRect.rowPitch_, dstRect.slicePitch_), size, + gpuMem(dstMemory), true, false, bytesPerElement)) { + LogError("copyBufferRect failed!"); + return false; + } + } else { + for (size_t z = 0; z < size[2]; ++z) { + for (size_t y = 0; y < size[1]; ++y) { + srcOffset = srcRect.offset(0, y, z); + dstOffset = dstRect.offset(0, y, z); + + amd::Coord3D src(srcOffset, 0, 0); + amd::Coord3D dst(dstOffset, 0, 0); + amd::Coord3D copySize(size[0], 0, 0); + + // Copy data + if (!gpuMem(srcMemory).partialMemCopyTo(gpu(), src, dst, copySize, gpuMem(dstMemory))) { + LogError("copyBufferRect failed!"); + return false; + } + } + } + } + } + return true; } -bool -DmaBlitManager::copyBuffer( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire) const -{ - if (setup_.disableCopyBuffer_ || - (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable() && - !dev().settings().apuSystem_ && - gpuMem(dstMemory).isHostMemDirectAccess())) { - return HostBlitManager::copyBuffer( - srcMemory, dstMemory, srcOrigin, dstOrigin, size); - } - else { - return gpuMem(srcMemory).partialMemCopyTo(gpu(), - srcOrigin, dstOrigin, size, gpuMem(dstMemory)); - } +bool DmaBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, bool entire, size_t rowPitch, + size_t slicePitch) const { + bool result = false; - return true; + if (setup_.disableCopyImageToBuffer_) { + result = HostBlitManager::copyImageToBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire, rowPitch, slicePitch); + } else { + // Use PAL path for a transfer + result = + gpuMem(srcMemory).partialMemCopyTo(gpu(), srcOrigin, dstOrigin, size, gpuMem(dstMemory)); + + // Check if a HostBlit transfer is required + if (completeOperation_ && !result) 
{ + result = HostBlitManager::copyImageToBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire, rowPitch, slicePitch); + } + } + + return result; } -bool -DmaBlitManager::copyBufferRect( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::BufferRect& srcRect, - const amd::BufferRect& dstRect, - const amd::Coord3D& size, - bool entire) const -{ - if (setup_.disableCopyBufferRect_ || - (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable() && - gpuMem(dstMemory).isHostMemDirectAccess())) { - return HostBlitManager::copyBufferRect( - srcMemory, dstMemory, srcRect, dstRect, size, entire); +bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, bool entire, size_t rowPitch, + size_t slicePitch) const { + bool result = false; + + if (setup_.disableCopyBufferToImage_) { + result = HostBlitManager::copyBufferToImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire, rowPitch, slicePitch); + } else { + // Use PAL path for a transfer + result = + gpuMem(srcMemory).partialMemCopyTo(gpu(), srcOrigin, dstOrigin, size, gpuMem(dstMemory)); + + // Check if a HostBlit transfer is required + if (completeOperation_ && !result) { + result = HostBlitManager::copyBufferToImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire, rowPitch, slicePitch); } - else { - size_t srcOffset; - size_t dstOffset; + } - uint bytesPerElement = 16; - bool optimalElementSize = false; - bool subWindowRectCopy = true; - - srcOffset = srcRect.offset(0, 0, 0); - dstOffset = dstRect.offset(0, 0, 0); - - while (bytesPerElement >= 1) { - if (((srcOffset % 4) == 0) && - ((dstOffset % 4) == 0) && - ((size[0] % bytesPerElement) == 0) && - ((srcRect.rowPitch_ % bytesPerElement) == 0) && - ((srcRect.slicePitch_ % bytesPerElement) == 0) && - ((dstRect.rowPitch_ % bytesPerElement) == 0) && - ((dstRect.slicePitch_ % 
bytesPerElement) == 0)) { - optimalElementSize = true; - break; - } - bytesPerElement = bytesPerElement >> 1; - } - - // 19 bit limit in HW in SI and 16 bit limit in CI+ - // (we adjust the ElementSize to 4bytes but the packet still has 14bits) - size_t pitchLimit = (0x3FFF * bytesPerElement) | 0xF; - size_t sizeLimit = (0x3FFF * bytesPerElement) | 0xF; - - if (!optimalElementSize || - (srcRect.rowPitch_ > pitchLimit) || - (dstRect.rowPitch_ > pitchLimit) || - (size[0] > sizeLimit) || // See above - (size[1] > 0x3fff) || // 14 bits limit in HW - (size[2] > 0x7ff)) { // 11 bits limit in HW - // Restriction with rectLinearDRMDMA packet - subWindowRectCopy = false; - } - - if (subWindowRectCopy) { - // Copy data with subwindow copy packet - if (!gpuMem(srcMemory).partialMemCopyTo(gpu(), - amd::Coord3D(srcOffset, srcRect.rowPitch_, srcRect.slicePitch_), - amd::Coord3D(dstOffset, dstRect.rowPitch_, dstRect.slicePitch_), - size, gpuMem(dstMemory), true, false, bytesPerElement)) { - LogError("copyBufferRect failed!"); - return false; - } - } - else { - for (size_t z = 0; z < size[2]; ++z) { - for (size_t y = 0; y < size[1]; ++y) { - srcOffset = srcRect.offset(0, y, z); - dstOffset = dstRect.offset(0, y, z); - - amd::Coord3D src(srcOffset, 0, 0); - amd::Coord3D dst(dstOffset, 0, 0); - amd::Coord3D copySize(size[0], 0, 0); - - // Copy data - if (!gpuMem(srcMemory).partialMemCopyTo( - gpu(), src, dst, copySize, gpuMem(dstMemory))) { - LogError("copyBufferRect failed!"); - return false; - } - } - } - } - } - return true; + return result; } -bool -DmaBlitManager::copyImageToBuffer( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire, - size_t rowPitch, - size_t slicePitch) const -{ - bool result = false; +bool DmaBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, + const 
amd::Coord3D& size, bool entire) const { + bool result = false; - if (setup_.disableCopyImageToBuffer_) { - result = HostBlitManager::copyImageToBuffer(srcMemory, dstMemory, - srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); - } - else { - // Use PAL path for a transfer - result = gpuMem(srcMemory).partialMemCopyTo( - gpu(), srcOrigin, dstOrigin, size, gpuMem(dstMemory)); + if (setup_.disableCopyImage_) { + return HostBlitManager::copyImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire); + } else { + //! @todo Add HW accelerated path + return HostBlitManager::copyImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire); + } - // Check if a HostBlit transfer is required - if (completeOperation_ && !result) { - result = HostBlitManager::copyImageToBuffer(srcMemory, - dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); - } - } - - return result; + return result; } -bool -DmaBlitManager::copyBufferToImage( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire, - size_t rowPitch, - size_t slicePitch) const -{ - bool result = false; +KernelBlitManager::KernelBlitManager(VirtualGPU& gpu, Setup setup) + : DmaBlitManager(gpu, setup), + program_(NULL), + constantBuffer_(NULL), + xferBufferSize_(0), + lockXferOps_(NULL) { + for (uint i = 0; i < BlitTotal; ++i) { + kernels_[i] = NULL; + } - if (setup_.disableCopyBufferToImage_) { - result = HostBlitManager::copyBufferToImage(srcMemory, - dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); - } - else { - // Use PAL path for a transfer - result = gpuMem(srcMemory).partialMemCopyTo( - gpu(), srcOrigin, dstOrigin, size, gpuMem(dstMemory)); + for (uint i = 0; i < MaxXferBuffers; ++i) { + xferBuffers_[i] = NULL; + } - // Check if a HostBlit transfer is required - if (completeOperation_ && !result) { - result = HostBlitManager::copyBufferToImage(srcMemory, - 
dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); - } - } - - return result; + completeOperation_ = false; } -bool -DmaBlitManager::copyImage( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire) const -{ - bool result = false; - - if (setup_.disableCopyImage_) { - return HostBlitManager::copyImage(srcMemory, dstMemory, - srcOrigin, dstOrigin, size, entire); - } - else { - //! @todo Add HW accelerated path - return HostBlitManager::copyImage(srcMemory, dstMemory, - srcOrigin, dstOrigin, size, entire); +KernelBlitManager::~KernelBlitManager() { + for (uint i = 0; i < BlitTotal; ++i) { + if (NULL != kernels_[i]) { + kernels_[i]->release(); } + } + if (NULL != program_) { + program_->release(); + } - return result; + if (NULL != context_) { + // Release a dummy context + context_->release(); + } + + if (NULL != constantBuffer_) { + constantBuffer_->release(); + } + + for (uint i = 0; i < MaxXferBuffers; ++i) { + if (NULL != xferBuffers_[i]) { + xferBuffers_[i]->release(); + } + } + + delete lockXferOps_; } -KernelBlitManager::KernelBlitManager( - VirtualGPU& gpu, Setup setup) - : DmaBlitManager(gpu, setup) - , program_(NULL) - , constantBuffer_(NULL) - , xferBufferSize_(0) - , lockXferOps_(NULL) -{ +bool KernelBlitManager::create(amd::Device& device) { + if (!createProgram(static_cast(device))) { + return false; + } + return true; +} + +bool KernelBlitManager::createProgram(Device& device) { + if (device.blitProgram() == nullptr) { + if (!device.createBlitProgram()) { + return false; + } + } + + std::vector devices; + devices.push_back(&device); + + // Save context and program for this device + context_ = device.blitProgram()->context_; + context_->retain(); + program_ = device.blitProgram()->program_; + program_->retain(); + + bool result = false; + do { + // Create kernel objects for all blits for (uint i = 0; i < BlitTotal; ++i) { - 
kernels_[i] = NULL; + const amd::Symbol* symbol = program_->findSymbol(BlitName[i]); + if (symbol == NULL) { + break; + } + kernels_[i] = new amd::Kernel(*program_, *symbol, BlitName[i]); + if (kernels_[i] == NULL) { + break; + } + // Validate blit kernels for the scratch memory usage (pre SI) + if (!device.validateKernel(*kernels_[i], &gpu())) { + break; + } } + result = true; + } while (!result); + + // Create an internal constant buffer + constantBuffer_ = new (*context_) amd::Buffer(*context_, CL_MEM_ALLOC_HOST_PTR, 4 * Ki); + + if ((constantBuffer_ != NULL) && !constantBuffer_->create(NULL)) { + constantBuffer_->release(); + constantBuffer_ = NULL; + return false; + } else if (constantBuffer_ == NULL) { + return false; + } + + // Assign the constant buffer to the current virtual GPU + constantBuffer_->setVirtualDevice(&gpu()); + + if (dev().settings().xferBufSize_ > 0) { + xferBufferSize_ = dev().settings().xferBufSize_; for (uint i = 0; i < MaxXferBuffers; ++i) { + // Create internal xfer buffers for image copy optimization + xferBuffers_[i] = new (*context_) amd::Buffer(*context_, 0, xferBufferSize_); + + if ((xferBuffers_[i] != NULL) && !xferBuffers_[i]->create(NULL)) { + xferBuffers_[i]->release(); xferBuffers_[i] = NULL; - } - - completeOperation_ = false; -} - -KernelBlitManager::~KernelBlitManager() -{ - for (uint i = 0; i < BlitTotal; ++i) { - if (NULL != kernels_[i]) { - kernels_[i]->release(); - } - } - if (NULL != program_) { - program_->release(); - } - - if (NULL != context_) { - // Release a dummy context - context_->release(); - } - - if (NULL != constantBuffer_) { - constantBuffer_->release(); - } - - for (uint i = 0; i < MaxXferBuffers; ++i) { - if (NULL != xferBuffers_[i]) { - xferBuffers_[i]->release(); - } - } - - delete lockXferOps_; -} - -bool -KernelBlitManager::create(amd::Device& device) -{ - if (!createProgram(static_cast(device))) { return false; - } - return true; -} - -bool -KernelBlitManager::createProgram(Device& device) -{ - if 
(device.blitProgram() == nullptr) { - if (!device.createBlitProgram()) { - return false; - } - } - - std::vector devices; - devices.push_back(&device); - - // Save context and program for this device - context_ = device.blitProgram()->context_; - context_->retain(); - program_ = device.blitProgram()->program_; - program_->retain(); - - bool result = false; - do { - // Create kernel objects for all blits - for (uint i = 0; i < BlitTotal; ++i) { - const amd::Symbol* symbol = program_->findSymbol(BlitName[i]); - if (symbol == NULL) { - break; - } - kernels_[i] = new amd::Kernel(*program_, *symbol, BlitName[i]); - if (kernels_[i] == NULL) { - break; - } - // Validate blit kernels for the scratch memory usage (pre SI) - if (!device.validateKernel(*kernels_[i], &gpu())) { - break; - } - } - - result = true; - } while(!result); - - // Create an internal constant buffer - constantBuffer_ = new (*context_) - amd::Buffer(*context_, CL_MEM_ALLOC_HOST_PTR, 4 * Ki); - - if ((constantBuffer_ != NULL) && !constantBuffer_->create(NULL)) { - constantBuffer_->release(); - constantBuffer_ = NULL; - return false; - } - else if (constantBuffer_ == NULL) { + } else if (xferBuffers_[i] == NULL) { return false; + } + + // Assign the xfer buffer to the current virtual GPU + xferBuffers_[i]->setVirtualDevice(&gpu()); + //! @note Workaround for conformance allocation test. + //! Force GPU mem alloc. + //! Unaligned images require xfer optimization, + //! but deferred memory allocation can cause + //! virtual heap fragmentation for big allocations and + //! then fail the following test with 32 bit ISA, because + //! runtime runs out of 4GB space. 
+ dev().getGpuMemory(xferBuffers_[i]); } + } - // Assign the constant buffer to the current virtual GPU - constantBuffer_->setVirtualDevice(&gpu()); + lockXferOps_ = new amd::Monitor("Transfer Ops Lock", true); + if (NULL == lockXferOps_) { + return false; + } - if (dev().settings().xferBufSize_ > 0) { - xferBufferSize_ = dev().settings().xferBufSize_; - for (uint i = 0; i < MaxXferBuffers; ++i) { - // Create internal xfer buffers for image copy optimization - xferBuffers_[i] = new (*context_) - amd::Buffer(*context_, 0, xferBufferSize_); - - if ((xferBuffers_[i] != NULL) && !xferBuffers_[i]->create(NULL)) { - xferBuffers_[i]->release(); - xferBuffers_[i] = NULL; - return false; - } - else if (xferBuffers_[i] == NULL) { - return false; - } - - // Assign the xfer buffer to the current virtual GPU - xferBuffers_[i]->setVirtualDevice(&gpu()); - //! @note Workaround for conformance allocation test. - //! Force GPU mem alloc. - //! Unaligned images require xfer optimization, - //! but deferred memory allocation can cause - //! virtual heap fragmentation for big allocations and - //! then fail the following test with 32 bit ISA, because - //! runtime runs out of 4GB space. - dev().getGpuMemory(xferBuffers_[i]); - } - } - - lockXferOps_ = new amd::Monitor("Transfer Ops Lock", true); - if (NULL == lockXferOps_) { - return false; - } - - return result; + return result; } // The following data structures will be used for the view creations. 
// Some formats has to be converted before a kernel blit operation struct FormatConvertion { - cl_uint clOldType_; - cl_uint clNewType_; + cl_uint clOldType_; + cl_uint clNewType_; }; // The list of rejected data formats and corresponding conversion -static const FormatConvertion RejectedData[] = -{ - { CL_UNORM_INT8, CL_UNSIGNED_INT8 }, - { CL_UNORM_INT16, CL_UNSIGNED_INT16 }, - { CL_SNORM_INT8, CL_UNSIGNED_INT8 }, - { CL_SNORM_INT16, CL_UNSIGNED_INT16 }, - { CL_HALF_FLOAT, CL_UNSIGNED_INT16 }, - { CL_FLOAT, CL_UNSIGNED_INT32 }, - { CL_SIGNED_INT8, CL_UNSIGNED_INT8 }, - { CL_SIGNED_INT16, CL_UNSIGNED_INT16 }, - { CL_UNORM_INT_101010, CL_UNSIGNED_INT8 }, - { CL_SIGNED_INT32, CL_UNSIGNED_INT32 } -}; +static const FormatConvertion RejectedData[] = { + {CL_UNORM_INT8, CL_UNSIGNED_INT8}, {CL_UNORM_INT16, CL_UNSIGNED_INT16}, + {CL_SNORM_INT8, CL_UNSIGNED_INT8}, {CL_SNORM_INT16, CL_UNSIGNED_INT16}, + {CL_HALF_FLOAT, CL_UNSIGNED_INT16}, {CL_FLOAT, CL_UNSIGNED_INT32}, + {CL_SIGNED_INT8, CL_UNSIGNED_INT8}, {CL_SIGNED_INT16, CL_UNSIGNED_INT16}, + {CL_UNORM_INT_101010, CL_UNSIGNED_INT8}, {CL_SIGNED_INT32, CL_UNSIGNED_INT32}}; // The list of rejected channel's order and corresponding conversion -static const FormatConvertion RejectedOrder[] = -{ - { CL_A, CL_R }, - { CL_RA, CL_RG }, - { CL_LUMINANCE, CL_R }, - { CL_INTENSITY, CL_R }, - { CL_RGB, CL_RGBA }, - { CL_BGRA, CL_RGBA }, - { CL_ARGB, CL_RGBA }, - { CL_sRGB, CL_RGBA }, - { CL_sRGBx, CL_RGBA }, - { CL_sRGBA, CL_RGBA }, - { CL_sBGRA, CL_RGBA } -}; +static const FormatConvertion RejectedOrder[] = { + {CL_A, CL_R}, {CL_RA, CL_RG}, {CL_LUMINANCE, CL_R}, {CL_INTENSITY, CL_R}, + {CL_RGB, CL_RGBA}, {CL_BGRA, CL_RGBA}, {CL_ARGB, CL_RGBA}, {CL_sRGB, CL_RGBA}, + {CL_sRGBx, CL_RGBA}, {CL_sRGBA, CL_RGBA}, {CL_sBGRA, CL_RGBA}}; -const uint RejectedFormatDataTotal = - sizeof(RejectedData) / sizeof(FormatConvertion); -const uint RejectedFormatChannelTotal = - sizeof(RejectedOrder) / sizeof(FormatConvertion); +const uint 
RejectedFormatDataTotal = sizeof(RejectedData) / sizeof(FormatConvertion); +const uint RejectedFormatChannelTotal = sizeof(RejectedOrder) / sizeof(FormatConvertion); -bool -KernelBlitManager::copyBufferToImage( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire, - size_t rowPitch, - size_t slicePitch) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - static const bool CopyRect = false; - // Flush DMA for ASYNC copy - static const bool FlushDMA = true; - size_t imgRowPitch = size[0] * gpuMem(dstMemory).elementSize(); - size_t imgSlicePitch = imgRowPitch * size[1]; - - if (setup_.disableCopyBufferToImage_) { - result = DmaBlitManager::copyBufferToImage( - srcMemory, dstMemory, srcOrigin, dstOrigin, size, - entire, rowPitch, slicePitch); - synchronize(); - return result; - } - // Check if buffer is in system memory with direct access - else if (gpuMem(srcMemory).isHostMemDirectAccess() && - (((rowPitch == 0) && (slicePitch == 0)) || - ((rowPitch == imgRowPitch) && - ((slicePitch == 0) || (slicePitch == imgSlicePitch))))) { - // First attempt to do this all with DMA, - // but there are restriciton with older hardware - if (dev().settings().imageDMA_) { - result = DmaBlitManager::copyBufferToImage( - srcMemory, dstMemory, srcOrigin, dstOrigin, size, - entire, rowPitch, slicePitch); - if (result) { - synchronize(); - return result; - } - } - - if (!setup_.disableCopyBufferToImageOpt_) { - // Find the overall copy size - size_t copySize = size[0] * size[1] * size[2] * gpuMem(dstMemory).elementSize(); - - // Check if double copy was requested - if (xferBufferSize_ != 0) { - amd::Coord3D src(srcOrigin); - amd::Coord3D xferSrc(0, 0, 0); - amd::Coord3D dst(dstOrigin); - amd::Coord3D xferRect(size); - // Find transfer size in pixels - size_t xferSizePix = xferBufferSize_ / gpuMem(dstMemory).elementSize(); - bool transfer = true; - - // Find 
transfer rectangle - if (xferRect[0] > xferSizePix) { - // The algorithm can't break a line. - // It requires multiple rectangles tracking - transfer = false; - } - else { - xferRect.c[1] = xferSizePix / xferRect[0]; - } - // Check if we exceeded the original size boundary in Y - if (xferRect[1] > size[1]) { - xferRect.c[1] = size[1]; - xferRect.c[2] = xferSizePix / (xferRect[0] * xferRect[1]); - } - else { - xferRect.c[2] = 1; - } - // Check if we exceeded the original size boundary in Z - if (xferRect[2] > size[2]) { - xferRect.c[2] = size[2]; - } - // Make sure size in Y dimension is divided by the rectangle size - if (size[2] > 1) { - while ((size[1] % xferRect[1]) != 0) { - xferRect.c[1]--; - } - } - - // Find one step copy size, based on the copy rectange - amd::Coord3D oneStepSize( - xferRect[0] * xferRect[1] * xferRect[2] * gpuMem(dstMemory).elementSize()); - - // Initialize transfer buffer array - Memory* xferBuf[MaxXferBuffers]; - for (uint i = 0; i < MaxXferBuffers; ++i) { - xferBuf[i] = dev().getGpuMemory(xferBuffers_[i]); - if (xferBuf[i] == NULL) { - transfer = false; - break; - } - } - - // Loop until we transfer all data - while (transfer && (copySize > 0)) { - size_t copySizeTmp = copySize; - amd::Coord3D srcTmp(src); - amd::Coord3D oneStepSizeTmp(oneStepSize); - // Step 1. Initiate DRM transfer with all staging buffers - for (uint i = 0; i < MaxXferBuffers; ++i) { - // Make sure we don't transfer more than copy size - if (copySizeTmp > 0) { - if (!gpuMem(srcMemory).partialMemCopyTo(gpu(), srcTmp, - xferSrc, oneStepSizeTmp, *xferBuf[i], CopyRect, FlushDMA)) { - transfer = false; - break; - } - - copySizeTmp -= oneStepSizeTmp[0]; - // Change buffer offset - srcTmp.c[0] += oneStepSizeTmp[0]; - - if (copySizeTmp < oneStepSizeTmp[0]) { - oneStepSizeTmp.c[0] = copySizeTmp; - } - } - else { - break; - } - } - - // Step 2. 
Initiate compute transfer with all staging buffers - for (uint i = 0; i < MaxXferBuffers; ++i) { - if (copySize > 0) { - if (!copyBufferToImageKernel( - *xferBuf[i], dstMemory, - xferSrc, dst, xferRect, false)) { - transfer = false; - break; - } - gpu().flushDMA(MainEngine); - - copySize -= oneStepSize[0]; - // Change buffer offset - src.c[0] += oneStepSize[0]; - // Change image offset, ignore X offset - for (uint j = 1; j < 3; ++j) { - dst.c[j] += xferRect[j]; - if ((dst[j] - dstOrigin[j]) >= size[j]) { - dst.c[j] = dstOrigin[j]; - } - else { - break; - } - } - // Recalculate rectangle size if the remain data is smaller - if (copySize < oneStepSize[0]) { - for (uint j = 0; j < 3; ++j) { - xferRect.c[j] = size[j] - (dst[j] - dstOrigin[j]); - } - oneStepSize.c[0] = copySize; - } - } - else { - break; - } - } - } - - if (copySize == 0) { - result = true; - } - else { - LogWarning("2 step transfer in copyBufferToImage failed"); - } - } - } - } - - if (!result) { - result = copyBufferToImageKernel(srcMemory, - dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); - } +bool KernelBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, const amd::Coord3D& size, + bool entire, size_t rowPitch, size_t slicePitch) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + static const bool CopyRect = false; + // Flush DMA for ASYNC copy + static const bool FlushDMA = true; + size_t imgRowPitch = size[0] * gpuMem(dstMemory).elementSize(); + size_t imgSlicePitch = imgRowPitch * size[1]; + if (setup_.disableCopyBufferToImage_) { + result = DmaBlitManager::copyBufferToImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire, rowPitch, slicePitch); synchronize(); - return result; -} - -void -CalcRowSlicePitches( - cl_ulong* pitch, const cl_int* copySize, - size_t rowPitch, size_t slicePitch, const Memory& mem) -{ - uint32_t memFmtSize = 
mem.elementSize(); - bool img1Darray = (mem.desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) ? true : false; - - if (rowPitch == 0) { - pitch[0] = copySize[0]; - } - else { - pitch[0] = rowPitch / memFmtSize; - } - if (slicePitch == 0) { - pitch[1] = pitch[0] * (img1Darray ? 1 : copySize[1]); - } - else { - pitch[1] = slicePitch / memFmtSize; - } - assert((pitch[0] <= pitch[1]) && "rowPitch must be <= slicePitch"); - - if (img1Darray) { - // For 1D array rowRitch = slicePitch - pitch[0] = pitch[1]; - } -} - -static void -setArgument(amd::Kernel* kernel, size_t index, size_t size, const void* value) -{ - const amd::KernelParameterDescriptor& desc = kernel->signature().at(index); - - void* param = kernel->parameters().values() + desc.offset_; - assert((desc.type_ == T_POINTER || value != NULL || desc.size_ == 0) && - "not a valid local mem arg"); - - uint32_t uint32_value = 0; - uint64_t uint64_value = 0; - - if (desc.type_ == T_POINTER && desc.size_ != 0) { - if ((value == NULL) || (static_cast(value) == NULL)) { - LP64_SWITCH(uint32_value, uint64_value) = 0; - } - else { - // convert cl_mem to amd::Memory*, return false if invalid. - LP64_SWITCH(uint32_value, uint64_value) = - (uintptr_t)(*static_cast(value)); - } - } - else if (desc.type_ == T_SAMPLER) { - assert(false && "No sampler support in blit manager! 
Use internal samplers!"); - } - else switch (desc.size_) { - case 1: uint32_value = *static_cast(value); break; - case 2: uint32_value = *static_cast(value); break; - case 4: uint32_value = *static_cast(value); break; - case 8: uint64_value = *static_cast(value); break; - default: break; - } - - switch (desc.size_) { - case 0 /*local mem*/ : *static_cast(param) = size; break; - case sizeof(uint32_t): *static_cast(param) = uint32_value; break; - case sizeof(uint64_t): *static_cast(param) = uint64_value; break; - default: ::memcpy(param, value, size); break; - } -} - -bool -KernelBlitManager::copyBufferToImageKernel( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire, - size_t rowPitch, - size_t slicePitch) const -{ - bool rejected = false; - Memory* dstView = &gpuMem(dstMemory); - bool releaseView = false; - bool result = false; - amd::Image::Format newFormat(gpuMem(dstMemory).desc().format_); - - // Find unsupported formats - for (uint i = 0; i < RejectedFormatDataTotal; ++i) { - if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { - newFormat.image_channel_data_type = RejectedData[i].clNewType_; - rejected = true; - break; - } - } - - // Find unsupported channel's order - for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { - if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { - newFormat.image_channel_order = RejectedOrder[i].clNewType_; - rejected = true; - break; - } - } - - // If the image format was rejected, then attempt to create a view - if (rejected) { - dstView = createView(gpuMem(dstMemory), newFormat); - if (dstView != NULL) { - rejected = false; - releaseView = true; - } - } - - // Fall into the host path if the image format was rejected - if (rejected) { - return HostBlitManager::copyBufferToImage( - srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire); - } - - // Use a common blit type with 
three dimensions by default - uint blitType = BlitCopyBufferToImage; - size_t dim = 0; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - size_t globalWorkSize[3]; - size_t localWorkSize[3]; - - // Program the kernels workload depending on the blit dimensions - dim = 3; - if (gpuMem(dstMemory).desc().dimSize_ == 1) { - globalWorkSize[0] = amd::alignUp(size[0], 256); - globalWorkSize[1] = amd::alignUp(size[1], 1); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = 256; - localWorkSize[1] = localWorkSize[2] = 1; - } - else if (gpuMem(dstMemory).desc().dimSize_ == 2) { - globalWorkSize[0] = amd::alignUp(size[0], 16); - globalWorkSize[1] = amd::alignUp(size[1], 16); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = localWorkSize[1] = 16; - localWorkSize[2] = 1; - } - else { - globalWorkSize[0] = amd::alignUp(size[0], 8); - globalWorkSize[1] = amd::alignUp(size[1], 8); - globalWorkSize[2] = amd::alignUp(size[2], 4); - localWorkSize[0] = localWorkSize[1] = 8; - localWorkSize[2] = 4; - } - - // Program kernels arguments for the blit operation - Memory* mem = &gpuMem(srcMemory); - setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); - mem = dstView; - setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); - uint32_t memFmtSize = gpuMem(dstMemory).elementSize(); - uint32_t components = gpuMem(dstMemory).numComponents(); - - // 1 element granularity for writes by default - cl_int granularity = 1; - if (memFmtSize == 2) { - granularity = 2; - } - else if (memFmtSize >= 4) { - granularity = 4; - } - CondLog(((srcOrigin[0] % granularity) != 0), "Unaligned offset in blit!"); - cl_ulong srcOrg[4] = { srcOrigin[0] / granularity, - srcOrigin[1], - srcOrigin[2], 0 }; - setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg); - - cl_int dstOrg[4] = { (cl_int)dstOrigin[0], - (cl_int)dstOrigin[1], - (cl_int)dstOrigin[2], 0 }; - cl_int copySize[4] = { (cl_int)size[0], - (cl_int)size[1], - (cl_int)size[2], 0 }; - - 
setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg); - setArgument(kernels_[blitType], 4, sizeof(copySize), copySize); - - // Program memory format - uint multiplier = memFmtSize / sizeof(uint32_t); - multiplier = (multiplier == 0) ? 1 : multiplier; - cl_uint format[4] = { components, - memFmtSize / components, - multiplier, 0 }; - setArgument(kernels_[blitType], 5, sizeof(format), format); - - // Program row and slice pitches - cl_ulong pitch[4] = { 0 }; - CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, gpuMem(dstMemory)); - setArgument(kernels_[blitType], 6, sizeof(pitch), pitch); - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(dim, - globalWorkOffset, globalWorkSize, localWorkSize); - - // Execute the blit - address parameters = kernels_[blitType]->parameters().values(); - result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters); - if (releaseView) { - delete dstView; - } - - return result; -} - -bool -KernelBlitManager::copyImageToBuffer( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire, - size_t rowPitch, - size_t slicePitch) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - static const bool CopyRect = false; - // Flush DMA for ASYNC copy - static const bool FlushDMA = true; - size_t imgRowPitch = size[0] * gpuMem(srcMemory).elementSize(); - size_t imgSlicePitch = imgRowPitch * size[1]; - - if (setup_.disableCopyImageToBuffer_) { - result = HostBlitManager::copyImageToBuffer( - srcMemory, dstMemory, srcOrigin, dstOrigin, - size, entire, rowPitch, slicePitch); + } + // Check if buffer is in system memory with direct access + else if (gpuMem(srcMemory).isHostMemDirectAccess() && + (((rowPitch == 0) && (slicePitch == 0)) || + ((rowPitch == imgRowPitch) && ((slicePitch == 0) || (slicePitch == imgSlicePitch))))) { + // First attempt to do this all with 
DMA, + // but there are restriciton with older hardware + if (dev().settings().imageDMA_) { + result = DmaBlitManager::copyBufferToImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire, rowPitch, slicePitch); + if (result) { synchronize(); return result; + } } - // Check if buffer is in system memory with direct access - else if (gpuMem(dstMemory).isHostMemDirectAccess() && - (((rowPitch == 0) && (slicePitch == 0)) || - ((rowPitch == imgRowPitch) && - ((slicePitch == 0) || (slicePitch == imgSlicePitch))))) { - // First attempt to do this all with DMA, - // but there are restriciton with older hardware - // If the dest buffer is external physical(SDI), copy two step as - // single step SDMA is causing corruption and the cause is under investigation - if (dev().settings().imageDMA_ && - gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical) { - result = DmaBlitManager::copyImageToBuffer( - srcMemory, dstMemory, srcOrigin, dstOrigin, - size, entire, rowPitch, slicePitch); - if (result) { - synchronize(); - return result; - } + + if (!setup_.disableCopyBufferToImageOpt_) { + // Find the overall copy size + size_t copySize = size[0] * size[1] * size[2] * gpuMem(dstMemory).elementSize(); + + // Check if double copy was requested + if (xferBufferSize_ != 0) { + amd::Coord3D src(srcOrigin); + amd::Coord3D xferSrc(0, 0, 0); + amd::Coord3D dst(dstOrigin); + amd::Coord3D xferRect(size); + // Find transfer size in pixels + size_t xferSizePix = xferBufferSize_ / gpuMem(dstMemory).elementSize(); + bool transfer = true; + + // Find transfer rectangle + if (xferRect[0] > xferSizePix) { + // The algorithm can't break a line. 
+ // It requires multiple rectangles tracking + transfer = false; + } else { + xferRect.c[1] = xferSizePix / xferRect[0]; + } + // Check if we exceeded the original size boundary in Y + if (xferRect[1] > size[1]) { + xferRect.c[1] = size[1]; + xferRect.c[2] = xferSizePix / (xferRect[0] * xferRect[1]); + } else { + xferRect.c[2] = 1; + } + // Check if we exceeded the original size boundary in Z + if (xferRect[2] > size[2]) { + xferRect.c[2] = size[2]; + } + // Make sure size in Y dimension is divided by the rectangle size + if (size[2] > 1) { + while ((size[1] % xferRect[1]) != 0) { + xferRect.c[1]--; + } } - // Find the overall copy size - size_t copySize = size[0] * size[1] * size[2] * gpuMem(srcMemory).elementSize(); + // Find one step copy size, based on the copy rectange + amd::Coord3D oneStepSize(xferRect[0] * xferRect[1] * xferRect[2] * + gpuMem(dstMemory).elementSize()); - // Check if double copy was requested - if (xferBufferSize_ != 0) { - amd::Coord3D src(srcOrigin); - amd::Coord3D dst(dstOrigin); - amd::Coord3D xferDst(0, 0, 0); - amd::Coord3D xferRect(size); - // Find transfer size in pixels - size_t xferSizePix = xferBufferSize_ / gpuMem(srcMemory).elementSize(); - bool transfer = true; + // Initialize transfer buffer array + Memory* xferBuf[MaxXferBuffers]; + for (uint i = 0; i < MaxXferBuffers; ++i) { + xferBuf[i] = dev().getGpuMemory(xferBuffers_[i]); + if (xferBuf[i] == NULL) { + transfer = false; + break; + } + } - // Find transfer rectangle - if (xferRect[0] > xferSizePix) { - // The algorithm can't break a line. - // It requires multiple rectangles tracking + // Loop until we transfer all data + while (transfer && (copySize > 0)) { + size_t copySizeTmp = copySize; + amd::Coord3D srcTmp(src); + amd::Coord3D oneStepSizeTmp(oneStepSize); + // Step 1. 
Initiate DRM transfer with all staging buffers + for (uint i = 0; i < MaxXferBuffers; ++i) { + // Make sure we don't transfer more than copy size + if (copySizeTmp > 0) { + if (!gpuMem(srcMemory).partialMemCopyTo(gpu(), srcTmp, xferSrc, oneStepSizeTmp, + *xferBuf[i], CopyRect, FlushDMA)) { transfer = false; + break; + } + + copySizeTmp -= oneStepSizeTmp[0]; + // Change buffer offset + srcTmp.c[0] += oneStepSizeTmp[0]; + + if (copySizeTmp < oneStepSizeTmp[0]) { + oneStepSizeTmp.c[0] = copySizeTmp; + } + } else { + break; } - else { - xferRect.c[1] = xferSizePix / xferRect[0]; - } - // Check if we exceeded the original size boundary in Y - if (xferRect[1] > size[1]) { - xferRect.c[1] = size[1]; - xferRect.c[2] = xferSizePix / (xferRect[0] * xferRect[1]); - } - else { - xferRect.c[2] = 1; - } - // Check if we exceeded the original size boundary in Z - if (xferRect[2] > size[2]) { - xferRect.c[2] = size[2]; - } - // Make sure size in Y dimension is divided by the rectangle size - if (size[2] > 1) { - while ((size[1] % xferRect[1]) != 0) { - xferRect.c[1]--; + } + + // Step 2. 
Initiate compute transfer with all staging buffers + for (uint i = 0; i < MaxXferBuffers; ++i) { + if (copySize > 0) { + if (!copyBufferToImageKernel(*xferBuf[i], dstMemory, xferSrc, dst, xferRect, false)) { + transfer = false; + break; + } + gpu().flushDMA(MainEngine); + + copySize -= oneStepSize[0]; + // Change buffer offset + src.c[0] += oneStepSize[0]; + // Change image offset, ignore X offset + for (uint j = 1; j < 3; ++j) { + dst.c[j] += xferRect[j]; + if ((dst[j] - dstOrigin[j]) >= size[j]) { + dst.c[j] = dstOrigin[j]; + } else { + break; } - } - - // Find one step copy size, based on the copy rectange - amd::Coord3D oneStepSize( - xferRect[0] * xferRect[1] * xferRect[2] * gpuMem(srcMemory).elementSize()); - - // Initialize transfer buffer array - Memory* xferBuf[MaxXferBuffers]; - for (uint i = 0; i < MaxXferBuffers; ++i) { - xferBuf[i] = dev().getGpuMemory(xferBuffers_[i]); - if (xferBuf[i] == NULL) { - transfer = false; - break; + } + // Recalculate rectangle size if the remain data is smaller + if (copySize < oneStepSize[0]) { + for (uint j = 0; j < 3; ++j) { + xferRect.c[j] = size[j] - (dst[j] - dstOrigin[j]); } + oneStepSize.c[0] = copySize; + } + } else { + break; } - - // Loop until we transfer all data - while (transfer && (copySize > 0)) { - size_t copySizeTmp = copySize; - amd::Coord3D srcTmp(src); - amd::Coord3D oneStepSizeTmp(oneStepSize); - amd::Coord3D xferRectTmp(xferRect); - - // Step 1. 
Initiate compute transfer with all staging buffers - for (uint i = 0; i < MaxXferBuffers; ++i) { - if (copySizeTmp > 0) { - if (!copyImageToBufferKernel( - srcMemory, *xferBuf[i], - srcTmp, xferDst, xferRectTmp, false)) { - transfer = false; - break; - } - gpu().flushDMA(MainEngine); - - copySizeTmp -= oneStepSizeTmp[0]; - // Change image offset, ignore X offset - for (uint j = 1; j < 3; ++j) { - srcTmp.c[j] += xferRectTmp[j]; - if ((srcTmp[j] - srcOrigin[j]) >= size[j]) { - srcTmp.c[j] = srcOrigin[j]; - } - else { - break; - } - } - // Recalculate rectangle size if the remain data is smaller - if (copySizeTmp < oneStepSizeTmp[0]) { - for (uint j = 0; j < 3; ++j) { - xferRectTmp.c[j] = size[j] - (srcTmp[j] - srcOrigin[j]); - } - } - } - else { - break; - } - } - - // Step 2. Initiate DRM transfer with all staging buffers - for (uint i = 0; i < MaxXferBuffers; ++i) { - // Make sure we don't transfer more than copy size - if (copySize > 0) { - if (!xferBuf[i]->partialMemCopyTo(gpu(), xferDst, dst, - oneStepSize, gpuMem(dstMemory), CopyRect, FlushDMA)) { - transfer = false; - break; - } - - copySize -= oneStepSize[0]; - // Change buffer offset - dst.c[0] += oneStepSize[0]; - // Change image offset, ignore X offset - for (uint j = 1; j < 3; ++j) { - src.c[j] += xferRect[j]; - if ((src[j] - srcOrigin[j]) >= size[j]) { - src.c[j] = srcOrigin[j]; - } - else { - break; - } - } - // Recalculate rectangle size if the remain data is smaller - if (copySize < oneStepSize[0]) { - for (uint j = 0; j < 3; ++j) { - xferRect.c[j] = size[j] - (src[j] - srcOrigin[j]); - } - oneStepSize.c[0] = copySize; - } - } - else { - break; - } - } - } - - if (copySize == 0) { - result = true; - } - else { - LogWarning("2 step transfer in copyBufferToImage failed"); - } + } } + + if (copySize == 0) { + result = true; + } else { + LogWarning("2 step transfer in copyBufferToImage failed"); + } + } } + } - if (!result) { - result = copyImageToBufferKernel(srcMemory, - dstMemory, srcOrigin, dstOrigin, 
size, entire, rowPitch, slicePitch); - } + if (!result) { + result = copyBufferToImageKernel(srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire, + rowPitch, slicePitch); + } - synchronize(); + synchronize(); - return result; + return result; } -bool -KernelBlitManager::copyImageToBufferKernel( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire, - size_t rowPitch, - size_t slicePitch) const -{ - bool rejected = false; - Memory* srcView = &gpuMem(srcMemory); - bool releaseView = false; - bool result = false; - amd::Image::Format newFormat(gpuMem(srcMemory).desc().format_); +void CalcRowSlicePitches(cl_ulong* pitch, const cl_int* copySize, size_t rowPitch, + size_t slicePitch, const Memory& mem) { + uint32_t memFmtSize = mem.elementSize(); + bool img1Darray = (mem.desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) ? true : false; - // Find unsupported formats - for (uint i = 0; i < RejectedFormatDataTotal; ++i) { - if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { - newFormat.image_channel_data_type = RejectedData[i].clNewType_; - rejected = true; - break; - } + if (rowPitch == 0) { + pitch[0] = copySize[0]; + } else { + pitch[0] = rowPitch / memFmtSize; + } + if (slicePitch == 0) { + pitch[1] = pitch[0] * (img1Darray ? 
1 : copySize[1]); + } else { + pitch[1] = slicePitch / memFmtSize; + } + assert((pitch[0] <= pitch[1]) && "rowPitch must be <= slicePitch"); + + if (img1Darray) { + // For 1D array rowRitch = slicePitch + pitch[0] = pitch[1]; + } +} + +static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const void* value) { + const amd::KernelParameterDescriptor& desc = kernel->signature().at(index); + + void* param = kernel->parameters().values() + desc.offset_; + assert((desc.type_ == T_POINTER || value != NULL || desc.size_ == 0) && + "not a valid local mem arg"); + + uint32_t uint32_value = 0; + uint64_t uint64_value = 0; + + if (desc.type_ == T_POINTER && desc.size_ != 0) { + if ((value == NULL) || (static_cast(value) == NULL)) { + LP64_SWITCH(uint32_value, uint64_value) = 0; + } else { + // convert cl_mem to amd::Memory*, return false if invalid. + LP64_SWITCH(uint32_value, uint64_value) = (uintptr_t)(*static_cast(value)); + } + } else if (desc.type_ == T_SAMPLER) { + assert(false && "No sampler support in blit manager! 
Use internal samplers!"); + } else + switch (desc.size_) { + case 1: + uint32_value = *static_cast(value); + break; + case 2: + uint32_value = *static_cast(value); + break; + case 4: + uint32_value = *static_cast(value); + break; + case 8: + uint64_value = *static_cast(value); + break; + default: + break; } - // Find unsupported channel's order + switch (desc.size_) { + case 0 /*local mem*/: + *static_cast(param) = size; + break; + case sizeof(uint32_t): + *static_cast(param) = uint32_value; + break; + case sizeof(uint64_t): + *static_cast(param) = uint64_value; + break; + default: + ::memcpy(param, value, size); + break; + } +} + +bool KernelBlitManager::copyBufferToImageKernel(device::Memory& srcMemory, + device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, bool entire, + size_t rowPitch, size_t slicePitch) const { + bool rejected = false; + Memory* dstView = &gpuMem(dstMemory); + bool releaseView = false; + bool result = false; + amd::Image::Format newFormat(gpuMem(dstMemory).desc().format_); + + // Find unsupported formats + for (uint i = 0; i < RejectedFormatDataTotal; ++i) { + if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { + newFormat.image_channel_data_type = RejectedData[i].clNewType_; + rejected = true; + break; + } + } + + // Find unsupported channel's order + for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { + if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { + newFormat.image_channel_order = RejectedOrder[i].clNewType_; + rejected = true; + break; + } + } + + // If the image format was rejected, then attempt to create a view + if (rejected) { + dstView = createView(gpuMem(dstMemory), newFormat); + if (dstView != NULL) { + rejected = false; + releaseView = true; + } + } + + // Fall into the host path if the image format was rejected + if (rejected) { + return HostBlitManager::copyBufferToImage(srcMemory, dstMemory, srcOrigin, dstOrigin, 
size, + entire); + } + + // Use a common blit type with three dimensions by default + uint blitType = BlitCopyBufferToImage; + size_t dim = 0; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3]; + size_t localWorkSize[3]; + + // Program the kernels workload depending on the blit dimensions + dim = 3; + if (gpuMem(dstMemory).desc().dimSize_ == 1) { + globalWorkSize[0] = amd::alignUp(size[0], 256); + globalWorkSize[1] = amd::alignUp(size[1], 1); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = 256; + localWorkSize[1] = localWorkSize[2] = 1; + } else if (gpuMem(dstMemory).desc().dimSize_ == 2) { + globalWorkSize[0] = amd::alignUp(size[0], 16); + globalWorkSize[1] = amd::alignUp(size[1], 16); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = localWorkSize[1] = 16; + localWorkSize[2] = 1; + } else { + globalWorkSize[0] = amd::alignUp(size[0], 8); + globalWorkSize[1] = amd::alignUp(size[1], 8); + globalWorkSize[2] = amd::alignUp(size[2], 4); + localWorkSize[0] = localWorkSize[1] = 8; + localWorkSize[2] = 4; + } + + // Program kernels arguments for the blit operation + Memory* mem = &gpuMem(srcMemory); + setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); + mem = dstView; + setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); + uint32_t memFmtSize = gpuMem(dstMemory).elementSize(); + uint32_t components = gpuMem(dstMemory).numComponents(); + + // 1 element granularity for writes by default + cl_int granularity = 1; + if (memFmtSize == 2) { + granularity = 2; + } else if (memFmtSize >= 4) { + granularity = 4; + } + CondLog(((srcOrigin[0] % granularity) != 0), "Unaligned offset in blit!"); + cl_ulong srcOrg[4] = {srcOrigin[0] / granularity, srcOrigin[1], srcOrigin[2], 0}; + setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg); + + cl_int dstOrg[4] = {(cl_int)dstOrigin[0], (cl_int)dstOrigin[1], (cl_int)dstOrigin[2], 0}; + cl_int copySize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0}; + + 
setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg); + setArgument(kernels_[blitType], 4, sizeof(copySize), copySize); + + // Program memory format + uint multiplier = memFmtSize / sizeof(uint32_t); + multiplier = (multiplier == 0) ? 1 : multiplier; + cl_uint format[4] = {components, memFmtSize / components, multiplier, 0}; + setArgument(kernels_[blitType], 5, sizeof(format), format); + + // Program row and slice pitches + cl_ulong pitch[4] = {0}; + CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, gpuMem(dstMemory)); + setArgument(kernels_[blitType], 6, sizeof(pitch), pitch); + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize); + + // Execute the blit + address parameters = kernels_[blitType]->parameters().values(); + result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters); + if (releaseView) { + delete dstView; + } + + return result; +} + +bool KernelBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, const amd::Coord3D& size, + bool entire, size_t rowPitch, size_t slicePitch) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + static const bool CopyRect = false; + // Flush DMA for ASYNC copy + static const bool FlushDMA = true; + size_t imgRowPitch = size[0] * gpuMem(srcMemory).elementSize(); + size_t imgSlicePitch = imgRowPitch * size[1]; + + if (setup_.disableCopyImageToBuffer_) { + result = HostBlitManager::copyImageToBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire, rowPitch, slicePitch); + synchronize(); + return result; + } + // Check if buffer is in system memory with direct access + else if (gpuMem(dstMemory).isHostMemDirectAccess() && + (((rowPitch == 0) && (slicePitch == 0)) || + ((rowPitch == imgRowPitch) && ((slicePitch == 0) || (slicePitch == imgSlicePitch))))) { + // First attempt to do this 
all with DMA, + // but there are restriciton with older hardware + // If the dest buffer is external physical(SDI), copy two step as + // single step SDMA is causing corruption and the cause is under investigation + if (dev().settings().imageDMA_ && + gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical) { + result = DmaBlitManager::copyImageToBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire, rowPitch, slicePitch); + if (result) { + synchronize(); + return result; + } + } + + // Find the overall copy size + size_t copySize = size[0] * size[1] * size[2] * gpuMem(srcMemory).elementSize(); + + // Check if double copy was requested + if (xferBufferSize_ != 0) { + amd::Coord3D src(srcOrigin); + amd::Coord3D dst(dstOrigin); + amd::Coord3D xferDst(0, 0, 0); + amd::Coord3D xferRect(size); + // Find transfer size in pixels + size_t xferSizePix = xferBufferSize_ / gpuMem(srcMemory).elementSize(); + bool transfer = true; + + // Find transfer rectangle + if (xferRect[0] > xferSizePix) { + // The algorithm can't break a line. 
+ // It requires multiple rectangles tracking + transfer = false; + } else { + xferRect.c[1] = xferSizePix / xferRect[0]; + } + // Check if we exceeded the original size boundary in Y + if (xferRect[1] > size[1]) { + xferRect.c[1] = size[1]; + xferRect.c[2] = xferSizePix / (xferRect[0] * xferRect[1]); + } else { + xferRect.c[2] = 1; + } + // Check if we exceeded the original size boundary in Z + if (xferRect[2] > size[2]) { + xferRect.c[2] = size[2]; + } + // Make sure size in Y dimension is divided by the rectangle size + if (size[2] > 1) { + while ((size[1] % xferRect[1]) != 0) { + xferRect.c[1]--; + } + } + + // Find one step copy size, based on the copy rectange + amd::Coord3D oneStepSize(xferRect[0] * xferRect[1] * xferRect[2] * + gpuMem(srcMemory).elementSize()); + + // Initialize transfer buffer array + Memory* xferBuf[MaxXferBuffers]; + for (uint i = 0; i < MaxXferBuffers; ++i) { + xferBuf[i] = dev().getGpuMemory(xferBuffers_[i]); + if (xferBuf[i] == NULL) { + transfer = false; + break; + } + } + + // Loop until we transfer all data + while (transfer && (copySize > 0)) { + size_t copySizeTmp = copySize; + amd::Coord3D srcTmp(src); + amd::Coord3D oneStepSizeTmp(oneStepSize); + amd::Coord3D xferRectTmp(xferRect); + + // Step 1. 
Initiate compute transfer with all staging buffers + for (uint i = 0; i < MaxXferBuffers; ++i) { + if (copySizeTmp > 0) { + if (!copyImageToBufferKernel(srcMemory, *xferBuf[i], srcTmp, xferDst, xferRectTmp, + false)) { + transfer = false; + break; + } + gpu().flushDMA(MainEngine); + + copySizeTmp -= oneStepSizeTmp[0]; + // Change image offset, ignore X offset + for (uint j = 1; j < 3; ++j) { + srcTmp.c[j] += xferRectTmp[j]; + if ((srcTmp[j] - srcOrigin[j]) >= size[j]) { + srcTmp.c[j] = srcOrigin[j]; + } else { + break; + } + } + // Recalculate rectangle size if the remain data is smaller + if (copySizeTmp < oneStepSizeTmp[0]) { + for (uint j = 0; j < 3; ++j) { + xferRectTmp.c[j] = size[j] - (srcTmp[j] - srcOrigin[j]); + } + } + } else { + break; + } + } + + // Step 2. Initiate DRM transfer with all staging buffers + for (uint i = 0; i < MaxXferBuffers; ++i) { + // Make sure we don't transfer more than copy size + if (copySize > 0) { + if (!xferBuf[i]->partialMemCopyTo(gpu(), xferDst, dst, oneStepSize, gpuMem(dstMemory), + CopyRect, FlushDMA)) { + transfer = false; + break; + } + + copySize -= oneStepSize[0]; + // Change buffer offset + dst.c[0] += oneStepSize[0]; + // Change image offset, ignore X offset + for (uint j = 1; j < 3; ++j) { + src.c[j] += xferRect[j]; + if ((src[j] - srcOrigin[j]) >= size[j]) { + src.c[j] = srcOrigin[j]; + } else { + break; + } + } + // Recalculate rectangle size if the remain data is smaller + if (copySize < oneStepSize[0]) { + for (uint j = 0; j < 3; ++j) { + xferRect.c[j] = size[j] - (src[j] - srcOrigin[j]); + } + oneStepSize.c[0] = copySize; + } + } else { + break; + } + } + } + + if (copySize == 0) { + result = true; + } else { + LogWarning("2 step transfer in copyBufferToImage failed"); + } + } + } + + if (!result) { + result = copyImageToBufferKernel(srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire, + rowPitch, slicePitch); + } + + synchronize(); + + return result; +} + +bool 
KernelBlitManager::copyImageToBufferKernel(device::Memory& srcMemory, + device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, bool entire, + size_t rowPitch, size_t slicePitch) const { + bool rejected = false; + Memory* srcView = &gpuMem(srcMemory); + bool releaseView = false; + bool result = false; + amd::Image::Format newFormat(gpuMem(srcMemory).desc().format_); + + // Find unsupported formats + for (uint i = 0; i < RejectedFormatDataTotal; ++i) { + if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { + newFormat.image_channel_data_type = RejectedData[i].clNewType_; + rejected = true; + break; + } + } + + // Find unsupported channel's order + for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { + if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { + newFormat.image_channel_order = RejectedOrder[i].clNewType_; + rejected = true; + break; + } + } + + // If the image format was rejected, then attempt to create a view + if (rejected) { + srcView = createView(gpuMem(srcMemory), newFormat); + if (srcView != NULL) { + rejected = false; + releaseView = true; + } + } + + // Fall into the host path if the image format was rejected + if (rejected) { + return HostBlitManager::copyImageToBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire); + } + + uint blitType = BlitCopyImageToBuffer; + size_t dim = 0; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3]; + size_t localWorkSize[3]; + + // Program the kernels workload depending on the blit dimensions + dim = 3; + // Find the current blit type + if (gpuMem(srcMemory).desc().dimSize_ == 1) { + globalWorkSize[0] = amd::alignUp(size[0], 256); + globalWorkSize[1] = amd::alignUp(size[1], 1); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = 256; + localWorkSize[1] = localWorkSize[2] = 1; + } else if (gpuMem(srcMemory).desc().dimSize_ == 2) { + globalWorkSize[0] = 
amd::alignUp(size[0], 16); + globalWorkSize[1] = amd::alignUp(size[1], 16); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = localWorkSize[1] = 16; + localWorkSize[2] = 1; + } else { + globalWorkSize[0] = amd::alignUp(size[0], 8); + globalWorkSize[1] = amd::alignUp(size[1], 8); + globalWorkSize[2] = amd::alignUp(size[2], 4); + localWorkSize[0] = localWorkSize[1] = 8; + localWorkSize[2] = 4; + } + + // Program kernels arguments for the blit operation + Memory* mem = srcView; + setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); + mem = &gpuMem(dstMemory); + setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); + + // Update extra paramters for USHORT and UBYTE pointers. + // Only then compiler can optimize the kernel to use + // UAV Raw for other writes + setArgument(kernels_[blitType], 2, sizeof(cl_mem), &mem); + setArgument(kernels_[blitType], 3, sizeof(cl_mem), &mem); + + cl_int srcOrg[4] = {(cl_int)srcOrigin[0], (cl_int)srcOrigin[1], (cl_int)srcOrigin[2], 0}; + cl_int copySize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0}; + setArgument(kernels_[blitType], 4, sizeof(srcOrg), srcOrg); + uint32_t memFmtSize = gpuMem(srcMemory).elementSize(); + uint32_t components = gpuMem(srcMemory).numComponents(); + + // 1 element granularity for writes by default + cl_int granularity = 1; + if (memFmtSize == 2) { + granularity = 2; + } else if (memFmtSize >= 4) { + granularity = 4; + } + CondLog(((dstOrigin[0] % granularity) != 0), "Unaligned offset in blit!"); + cl_ulong dstOrg[4] = {dstOrigin[0] / granularity, dstOrigin[1], dstOrigin[2], 0}; + setArgument(kernels_[blitType], 5, sizeof(dstOrg), dstOrg); + setArgument(kernels_[blitType], 6, sizeof(copySize), copySize); + + // Program memory format + uint multiplier = memFmtSize / sizeof(uint32_t); + multiplier = (multiplier == 0) ? 
1 : multiplier; + cl_uint format[4] = {components, memFmtSize / components, multiplier, 0}; + setArgument(kernels_[blitType], 7, sizeof(format), format); + + // Program row and slice pitches + cl_ulong pitch[4] = {0}; + CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, gpuMem(srcMemory)); + setArgument(kernels_[blitType], 8, sizeof(pitch), pitch); + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize); + + // Execute the blit + address parameters = kernels_[blitType]->parameters().values(); + result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters); + if (releaseView) { + delete srcView; + } + + return result; +} + +bool KernelBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool rejected = false; + Memory* srcView = &gpuMem(srcMemory); + Memory* dstView = &gpuMem(dstMemory); + bool releaseView = false; + bool result = false; + amd::Image::Format newFormat(gpuMem(srcMemory).desc().format_); + + // Find unsupported formats + for (uint i = 0; i < RejectedFormatDataTotal; ++i) { + if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { + newFormat.image_channel_data_type = RejectedData[i].clNewType_; + rejected = true; + break; + } + } + + // Search for the rejected channel's order only if the format was rejected + // Note: Image blit is independent from the channel order + if (rejected) { for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { - if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { - newFormat.image_channel_order = RejectedOrder[i].clNewType_; - rejected = true; - break; - } + if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { + newFormat.image_channel_order = RejectedOrder[i].clNewType_; + rejected = true; + 
break; + } } + } - // If the image format was rejected, then attempt to create a view - if (rejected) { - srcView = createView(gpuMem(srcMemory), newFormat); - if (srcView != NULL) { - rejected = false; - releaseView = true; - } - } - - // Fall into the host path if the image format was rejected - if (rejected) { - return HostBlitManager::copyImageToBuffer( - srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire); - } - - uint blitType = BlitCopyImageToBuffer; - size_t dim = 0; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - size_t globalWorkSize[3]; - size_t localWorkSize[3]; - - // Program the kernels workload depending on the blit dimensions - dim = 3; - // Find the current blit type - if (gpuMem(srcMemory).desc().dimSize_ == 1) { - globalWorkSize[0] = amd::alignUp(size[0], 256); - globalWorkSize[1] = amd::alignUp(size[1], 1); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = 256; - localWorkSize[1] = localWorkSize[2] = 1; - } - else if (gpuMem(srcMemory).desc().dimSize_ == 2) { - globalWorkSize[0] = amd::alignUp(size[0], 16); - globalWorkSize[1] = amd::alignUp(size[1], 16); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = localWorkSize[1] = 16; - localWorkSize[2] = 1; - } - else { - globalWorkSize[0] = amd::alignUp(size[0], 8); - globalWorkSize[1] = amd::alignUp(size[1], 8); - globalWorkSize[2] = amd::alignUp(size[2], 4); - localWorkSize[0] = localWorkSize[1] = 8; - localWorkSize[2] = 4; - } - - // Program kernels arguments for the blit operation - Memory* mem = srcView; - setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); - mem = &gpuMem(dstMemory); - setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); - - // Update extra paramters for USHORT and UBYTE pointers. 
- // Only then compiler can optimize the kernel to use - // UAV Raw for other writes - setArgument(kernels_[blitType], 2, sizeof(cl_mem), &mem); - setArgument(kernels_[blitType], 3, sizeof(cl_mem), &mem); - - cl_int srcOrg[4] = { (cl_int)srcOrigin[0], - (cl_int)srcOrigin[1], - (cl_int)srcOrigin[2], 0 }; - cl_int copySize[4] = { (cl_int)size[0], - (cl_int)size[1], - (cl_int)size[2], 0 }; - setArgument(kernels_[blitType], 4, sizeof(srcOrg), srcOrg); - uint32_t memFmtSize = gpuMem(srcMemory).elementSize(); - uint32_t components = gpuMem(srcMemory).numComponents(); - - // 1 element granularity for writes by default - cl_int granularity = 1; - if (memFmtSize == 2) { - granularity = 2; - } - else if (memFmtSize >= 4) { - granularity = 4; - } - CondLog(((dstOrigin[0] % granularity) != 0), "Unaligned offset in blit!"); - cl_ulong dstOrg[4] = { dstOrigin[0] / granularity, - dstOrigin[1], - dstOrigin[2], 0 }; - setArgument(kernels_[blitType], 5, sizeof(dstOrg), dstOrg); - setArgument(kernels_[blitType], 6, sizeof(copySize), copySize); - - // Program memory format - uint multiplier = memFmtSize / sizeof(uint32_t); - multiplier = (multiplier == 0) ? 
1 : multiplier; - cl_uint format[4] = { components, - memFmtSize / components, - multiplier, 0 }; - setArgument(kernels_[blitType], 7, sizeof(format), format); - - // Program row and slice pitches - cl_ulong pitch[4] = { 0 }; - CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, gpuMem(srcMemory)); - setArgument(kernels_[blitType], 8, sizeof(pitch), pitch); - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(dim, - globalWorkOffset, globalWorkSize, localWorkSize); - - // Execute the blit - address parameters = kernels_[blitType]->parameters().values(); - result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters); - if (releaseView) { + // Attempt to create a view if the format was rejected + if (rejected) { + srcView = createView(gpuMem(srcMemory), newFormat); + if (srcView != NULL) { + dstView = createView(gpuMem(dstMemory), newFormat); + if (dstView != NULL) { + rejected = false; + releaseView = true; + } else { delete srcView; + } } + } + // Fall into the host path for the entire 2D copy or + // if the image format was rejected + if (rejected) { + result = HostBlitManager::copyImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire); + synchronize(); return result; + } + + uint blitType = BlitCopyImage; + size_t dim = 0; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3]; + size_t localWorkSize[3]; + + // Program the kernels workload depending on the blit dimensions + dim = 3; + // Find the current blit type + if ((gpuMem(srcMemory).desc().dimSize_ == 1) || (gpuMem(dstMemory).desc().dimSize_ == 1)) { + globalWorkSize[0] = amd::alignUp(size[0], 256); + globalWorkSize[1] = amd::alignUp(size[1], 1); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = 256; + localWorkSize[1] = localWorkSize[2] = 1; + } else if ((gpuMem(srcMemory).desc().dimSize_ == 2) || (gpuMem(dstMemory).desc().dimSize_ == 2)) { + globalWorkSize[0] = amd::alignUp(size[0], 16); + 
globalWorkSize[1] = amd::alignUp(size[1], 16); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = localWorkSize[1] = 16; + localWorkSize[2] = 1; + } else { + globalWorkSize[0] = amd::alignUp(size[0], 8); + globalWorkSize[1] = amd::alignUp(size[1], 8); + globalWorkSize[2] = amd::alignUp(size[2], 4); + localWorkSize[0] = localWorkSize[1] = 8; + localWorkSize[2] = 4; + } + + // The current OpenCL spec allows "copy images from a 1D image + // array object to a 1D image array object" only. + if ((gpuMem(srcMemory).desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) || + (gpuMem(dstMemory).desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY)) { + blitType = BlitCopyImage1DA; + } + + // Program kernels arguments for the blit operation + Memory* mem = srcView; + setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); + mem = dstView; + setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); + + // Program source origin + cl_int srcOrg[4] = {(cl_int)srcOrigin[0], (cl_int)srcOrigin[1], (cl_int)srcOrigin[2], 0}; + setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg); + + // Program destinaiton origin + cl_int dstOrg[4] = {(cl_int)dstOrigin[0], (cl_int)dstOrigin[1], (cl_int)dstOrigin[2], 0}; + setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg); + + cl_int copySize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0}; + setArgument(kernels_[blitType], 4, sizeof(copySize), copySize); + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize); + + // Execute the blit + address parameters = kernels_[blitType]->parameters().values(); + result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters); + if (releaseView) { + delete srcView; + delete dstView; + } + + synchronize(); + + return result; } -bool -KernelBlitManager::copyImage( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& 
dstOrigin, - const amd::Coord3D& size, - bool entire) const -{ - amd::ScopedLock k(lockXferOps_); - bool rejected = false; - Memory* srcView = &gpuMem(srcMemory); - Memory* dstView = &gpuMem(dstMemory); - bool releaseView = false; - bool result = false; - amd::Image::Format newFormat(gpuMem(srcMemory).desc().format_); +void FindPinSize(size_t& pinSize, const amd::Coord3D& size, size_t& rowPitch, size_t& slicePitch, + const Memory& mem) { + pinSize = size[0] * mem.elementSize(); + if ((rowPitch == 0) || (rowPitch == pinSize)) { + rowPitch = 0; + } else { + pinSize = rowPitch; + } - // Find unsupported formats - for (uint i = 0; i < RejectedFormatDataTotal; ++i) { - if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { - newFormat.image_channel_data_type = RejectedData[i].clNewType_; - rejected = true; - break; + // Calculate the pin size, which should be equal to the copy size + for (uint i = 1; i < mem.desc().dimSize_; ++i) { + pinSize *= size[i]; + if (i == 1) { + if ((slicePitch == 0) || (slicePitch == pinSize)) { + slicePitch = 0; + } else { + if (mem.desc().topology_ != CL_MEM_OBJECT_IMAGE1D_ARRAY) { + pinSize = slicePitch; + } else { + pinSize = slicePitch * size[i]; } + } + } + } +} + +bool KernelBlitManager::readImage(device::Memory& srcMemory, void* dstHost, + const amd::Coord3D& origin, const amd::Coord3D& size, + size_t rowPitch, size_t slicePitch, bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host copy if memory has direct access or it's persistent + if (setup_.disableReadImage_ || + (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { + result = + HostBlitManager::readImage(srcMemory, dstHost, origin, size, rowPitch, slicePitch, entire); + synchronize(); + return result; + } else { + size_t pinSize; + FindPinSize(pinSize, size, rowPitch, slicePitch, gpuMem(srcMemory)); + + size_t partial; + amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial); + + if 
(amdMemory == NULL) { + // Force SW copy + result = HostBlitManager::readImage(srcMemory, dstHost, origin, size, rowPitch, slicePitch, + entire); + synchronize(); + return result; } - // Search for the rejected channel's order only if the format was rejected - // Note: Image blit is independent from the channel order - if (rejected) { - for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { - if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { - newFormat.image_channel_order = RejectedOrder[i].clNewType_; - rejected = true; - break; - } - } + // Readjust destination offset + const amd::Coord3D dstOrigin(partial); + + // Get device memory for this virtual device + Memory* dstMemory = dev().getGpuMemory(amdMemory); + + // Copy image to buffer + result = copyImageToBuffer(srcMemory, *dstMemory, origin, dstOrigin, size, entire, rowPitch, + slicePitch); + + // Add pinned memory for a later release + gpu().addPinnedMem(amdMemory); + } + + synchronize(); + + return result; +} + +bool KernelBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory, + const amd::Coord3D& origin, const amd::Coord3D& size, + size_t rowPitch, size_t slicePitch, bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host copy if memory has direct access or it's persistent + if (setup_.disableWriteImage_ || gpuMem(dstMemory).isHostMemDirectAccess() || + gpuMem(dstMemory).isPersistentDirectMap()) { + result = + HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); + synchronize(); + return result; + } else { + size_t pinSize; + FindPinSize(pinSize, size, rowPitch, slicePitch, gpuMem(dstMemory)); + + size_t partial; + amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial); + + if (amdMemory == NULL) { + // Force SW copy + result = HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch, + entire); + synchronize(); + return result; } - // Attempt to create a 
view if the format was rejected - if (rejected) { - srcView = createView(gpuMem(srcMemory), newFormat); - if (srcView != NULL) { - dstView = createView(gpuMem(dstMemory), newFormat); - if (dstView != NULL) { - rejected = false; - releaseView = true; - } - else { - delete srcView; - } - } - } + // Readjust destination offset + const amd::Coord3D srcOrigin(partial); - // Fall into the host path for the entire 2D copy or - // if the image format was rejected - if (rejected) { - result = HostBlitManager::copyImage(srcMemory, dstMemory, - srcOrigin, dstOrigin, size, entire); + // Get device memory for this virtual device + Memory* srcMemory = dev().getGpuMemory(amdMemory); + + // Copy image to buffer + result = copyBufferToImage(*srcMemory, dstMemory, srcOrigin, origin, size, entire, rowPitch, + slicePitch); + + // Add pinned memory for a later release + gpu().addPinnedMem(amdMemory); + } + + synchronize(); + + return result; +} + +bool KernelBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::BufferRect& srcRectIn, + const amd::BufferRect& dstRectIn, const amd::Coord3D& sizeIn, + bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + bool rejected = false; + + // Fall into the PAL path for rejected transfers + if (setup_.disableCopyBufferRect_ || gpuMem(srcMemory).isHostMemDirectAccess() || + gpuMem(dstMemory).isHostMemDirectAccess()) { + result = + DmaBlitManager::copyBufferRect(srcMemory, dstMemory, srcRectIn, dstRectIn, sizeIn, entire); + + if (result) { + synchronize(); + return result; + } + } + + uint blitType = BlitCopyBufferRect; + size_t dim = 3; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3]; + size_t localWorkSize[3]; + + const static uint CopyRectAlignment[3] = {16, 4, 1}; + + bool aligned; + uint i; + for (i = 0; i < sizeof(CopyRectAlignment) / sizeof(uint); i++) { + // Check source alignments + aligned = ((srcRectIn.rowPitch_ % CopyRectAlignment[i]) == 0); + aligned 
&= ((srcRectIn.slicePitch_ % CopyRectAlignment[i]) == 0); + aligned &= ((srcRectIn.start_ % CopyRectAlignment[i]) == 0); + + // Check destination alignments + aligned &= ((dstRectIn.rowPitch_ % CopyRectAlignment[i]) == 0); + aligned &= ((dstRectIn.slicePitch_ % CopyRectAlignment[i]) == 0); + aligned &= ((dstRectIn.start_ % CopyRectAlignment[i]) == 0); + + // Check copy size alignment in the first dimension + aligned &= ((sizeIn[0] % CopyRectAlignment[i]) == 0); + + if (aligned) { + if (CopyRectAlignment[i] != 1) { + blitType = BlitCopyBufferRectAligned; + } + break; + } + } + + amd::BufferRect srcRect; + amd::BufferRect dstRect; + amd::Coord3D size(sizeIn[0], sizeIn[1], sizeIn[2]); + + srcRect.rowPitch_ = srcRectIn.rowPitch_ / CopyRectAlignment[i]; + srcRect.slicePitch_ = srcRectIn.slicePitch_ / CopyRectAlignment[i]; + srcRect.start_ = srcRectIn.start_ / CopyRectAlignment[i]; + srcRect.end_ = srcRectIn.end_ / CopyRectAlignment[i]; + + dstRect.rowPitch_ = dstRectIn.rowPitch_ / CopyRectAlignment[i]; + dstRect.slicePitch_ = dstRectIn.slicePitch_ / CopyRectAlignment[i]; + dstRect.start_ = dstRectIn.start_ / CopyRectAlignment[i]; + dstRect.end_ = dstRectIn.end_ / CopyRectAlignment[i]; + + size.c[0] /= CopyRectAlignment[i]; + + // Program the kernel's workload depending on the transfer dimensions + if ((size[1] == 1) && (size[2] == 1)) { + globalWorkSize[0] = amd::alignUp(size[0], 256); + globalWorkSize[1] = 1; + globalWorkSize[2] = 1; + localWorkSize[0] = 256; + localWorkSize[1] = 1; + localWorkSize[2] = 1; + } else if (size[2] == 1) { + globalWorkSize[0] = amd::alignUp(size[0], 16); + globalWorkSize[1] = amd::alignUp(size[1], 16); + globalWorkSize[2] = 1; + localWorkSize[0] = localWorkSize[1] = 16; + localWorkSize[2] = 1; + } else { + globalWorkSize[0] = amd::alignUp(size[0], 8); + globalWorkSize[1] = amd::alignUp(size[1], 8); + globalWorkSize[2] = amd::alignUp(size[2], 4); + localWorkSize[0] = localWorkSize[1] = 8; + localWorkSize[2] = 4; + } + + + // Program kernels 
arguments for the blit operation + Memory* mem = &gpuMem(srcMemory); + setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); + mem = &gpuMem(dstMemory); + setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); + cl_ulong src[4] = {srcRect.rowPitch_, srcRect.slicePitch_, srcRect.start_, 0}; + setArgument(kernels_[blitType], 2, sizeof(src), src); + cl_ulong dst[4] = {dstRect.rowPitch_, dstRect.slicePitch_, dstRect.start_, 0}; + setArgument(kernels_[blitType], 3, sizeof(dst), dst); + cl_ulong copySize[4] = {size[0], size[1], size[2], CopyRectAlignment[i]}; + setArgument(kernels_[blitType], 4, sizeof(copySize), copySize); + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize); + + // Execute the blit + address parameters = kernels_[blitType]->parameters().values(); + result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters); + + synchronize(); + + return result; +} + +bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, + const amd::Coord3D& origin, const amd::Coord3D& size, + bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + // Use host copy if memory has direct access + if (setup_.disableReadBuffer_ || + (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { + result = HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire); + synchronize(); + return result; + } else { + size_t pinSize = size[0]; + // Check if a pinned transfer can be executed with a single pin + if ((pinSize <= dev().settings().pinnedXferSize_) && (pinSize > MinSizeForPinnedTransfer)) { + size_t partial; + amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial); + + if (amdMemory == NULL) { + // Force SW copy + result = HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire); synchronize(); return result; + } + + // Readjust host mem offset + amd::Coord3D 
dstOrigin(partial); + + // Get device memory for this virtual device + Memory* dstMemory = dev().getGpuMemory(amdMemory); + + // Copy image to buffer + result = copyBuffer(srcMemory, *dstMemory, origin, dstOrigin, size, entire); + + // Add pinned memory for a later release + gpu().addPinnedMem(amdMemory); + } else { + result = DmaBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire); + } + } + + synchronize(); + + return result; +} + +bool KernelBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost, + const amd::BufferRect& bufRect, + const amd::BufferRect& hostRect, const amd::Coord3D& size, + bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host copy if memory has direct access + if (setup_.disableReadBufferRect_ || + (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { + result = HostBlitManager::readBufferRect(srcMemory, dstHost, bufRect, hostRect, size, entire); + synchronize(); + return result; + } else { + size_t pinSize = hostRect.start_ + hostRect.end_; + size_t partial; + amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial); + + if (amdMemory == NULL) { + // Force SW copy + result = HostBlitManager::readBufferRect(srcMemory, dstHost, bufRect, hostRect, size, entire); + synchronize(); + return result; } - uint blitType = BlitCopyImage; - size_t dim = 0; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - size_t globalWorkSize[3]; - size_t localWorkSize[3]; + // Readjust host mem offset + amd::BufferRect rect; + rect.rowPitch_ = hostRect.rowPitch_; + rect.slicePitch_ = hostRect.slicePitch_; + rect.start_ = hostRect.start_ + partial; + rect.end_ = hostRect.end_; - // Program the kernels workload depending on the blit dimensions - dim = 3; - // Find the current blit type - if ((gpuMem(srcMemory).desc().dimSize_ == 1) || - (gpuMem(dstMemory).desc().dimSize_ == 1)) { - globalWorkSize[0] = amd::alignUp(size[0], 256); - globalWorkSize[1] = amd::alignUp(size[1], 
1); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = 256; - localWorkSize[1] = localWorkSize[2] = 1; + // Get device memory for this virtual device + Memory* dstMemory = dev().getGpuMemory(amdMemory); + + // Copy image to buffer + result = copyBufferRect(srcMemory, *dstMemory, bufRect, rect, size, entire); + + // Add pinned memory for a later release + gpu().addPinnedMem(amdMemory); + } + + synchronize(); + + return result; +} + +bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory, + const amd::Coord3D& origin, const amd::Coord3D& size, + bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host copy if memory has direct access or it's persistent + if (setup_.disableWriteBuffer_ || gpuMem(dstMemory).isHostMemDirectAccess() || + (gpuMem(dstMemory).memoryType() == Resource::Persistent)) { + result = HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire); + synchronize(); + return result; + } else { + size_t pinSize = size[0]; + + // Check if a pinned transfer can be executed with a single pin + if ((pinSize <= dev().settings().pinnedXferSize_) && (pinSize > MinSizeForPinnedTransfer)) { + size_t partial; + amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial); + + if (amdMemory == NULL) { + // Force SW copy + result = HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire); + synchronize(); + return result; + } + + // Readjust destination offset + const amd::Coord3D srcOrigin(partial); + + // Get device memory for this virtual device + Memory* srcMemory = dev().getGpuMemory(amdMemory); + + // Copy buffer rect + result = copyBuffer(*srcMemory, dstMemory, srcOrigin, origin, size, entire); + + // Add pinned memory for a later release + gpu().addPinnedMem(amdMemory); + } else { + result = DmaBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire); } - else if ((gpuMem(srcMemory).desc().dimSize_ == 2) || - 
(gpuMem(dstMemory).desc().dimSize_ == 2)) { - globalWorkSize[0] = amd::alignUp(size[0], 16); - globalWorkSize[1] = amd::alignUp(size[1], 16); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = localWorkSize[1] = 16; - localWorkSize[2] = 1; - } - else { - globalWorkSize[0] = amd::alignUp(size[0], 8); - globalWorkSize[1] = amd::alignUp(size[1], 8); - globalWorkSize[2] = amd::alignUp(size[2], 4); - localWorkSize[0] = localWorkSize[1] = 8; - localWorkSize[2] = 4; + } + + synchronize(); + + return result; +} + +bool KernelBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMemory, + const amd::BufferRect& hostRect, + const amd::BufferRect& bufRect, const amd::Coord3D& size, + bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host copy if memory has direct access or it's persistent + if (setup_.disableWriteBufferRect_ || gpuMem(dstMemory).isHostMemDirectAccess() || + gpuMem(dstMemory).isPersistentDirectMap()) { + result = HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire); + synchronize(); + return result; + } else { + size_t pinSize = hostRect.start_ + hostRect.end_; + size_t partial; + amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial); + + if (amdMemory == NULL) { + // Force SW copy + result = + HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire); + synchronize(); + return result; } - // The current OpenCL spec allows "copy images from a 1D image - // array object to a 1D image array object" only. 
- if ((gpuMem(srcMemory).desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) || - (gpuMem(dstMemory).desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY)) { - blitType = BlitCopyImage1DA; + // Readjust destination offset + const amd::Coord3D srcOrigin(partial); + + // Get device memory for this virtual device + Memory* srcMemory = dev().getGpuMemory(amdMemory); + + // Readjust host mem offset + amd::BufferRect rect; + rect.rowPitch_ = hostRect.rowPitch_; + rect.slicePitch_ = hostRect.slicePitch_; + rect.start_ = hostRect.start_ + partial; + rect.end_ = hostRect.end_; + + // Copy buffer rect + result = copyBufferRect(*srcMemory, dstMemory, rect, bufRect, size, entire); + + // Add pinned memory for a later release + gpu().addPinnedMem(amdMemory); + } + + synchronize(); + + return result; +} + +bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern, size_t patternSize, + const amd::Coord3D& origin, const amd::Coord3D& size, + bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host fill if memory has direct access + if (setup_.disableFillBuffer_ || gpuMem(memory).isHostMemDirectAccess()) { + result = HostBlitManager::fillBuffer(memory, pattern, patternSize, origin, size, entire); + synchronize(); + return result; + } else { + uint fillType = FillBuffer; + size_t globalWorkOffset[3] = {0, 0, 0}; + cl_ulong fillSize = size[0] / patternSize; + size_t globalWorkSize = amd::alignUp(fillSize, 256); + size_t localWorkSize = 256; + bool dwordAligned = ((patternSize % sizeof(uint32_t)) == 0) ? 
true : false; + + // Program kernels arguments for the fill operation + Memory* mem = &gpuMem(memory); + if (dwordAligned) { + setArgument(kernels_[fillType], 0, sizeof(cl_mem), NULL); + setArgument(kernels_[fillType], 1, sizeof(cl_mem), &mem); + } else { + setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem); + setArgument(kernels_[fillType], 1, sizeof(cl_mem), NULL); } - - // Program kernels arguments for the blit operation - Memory* mem = srcView; - setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); - mem = dstView; - setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); - - // Program source origin - cl_int srcOrg[4] = { (cl_int)srcOrigin[0], - (cl_int)srcOrigin[1], - (cl_int)srcOrigin[2], 0 }; - setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg); - - // Program destinaiton origin - cl_int dstOrg[4] = { (cl_int)dstOrigin[0], - (cl_int)dstOrigin[1], - (cl_int)dstOrigin[2], 0 }; - setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg); - - cl_int copySize[4] = { (cl_int)size[0], - (cl_int)size[1], - (cl_int)size[2], 0 }; - setArgument(kernels_[blitType], 4, sizeof(copySize), copySize); + Memory* gpuCB = dev().getGpuMemory(constantBuffer_); + if (gpuCB == NULL) { + return false; + } + void* constBuf = gpuCB->map(&gpu(), Resource::WriteOnly); + memcpy(constBuf, pattern, patternSize); + gpuCB->unmap(&gpu()); + setArgument(kernels_[fillType], 2, sizeof(cl_mem), &gpuCB); + cl_ulong offset = origin[0]; + if (dwordAligned) { + patternSize /= sizeof(uint32_t); + offset /= sizeof(uint32_t); + } + setArgument(kernels_[fillType], 3, sizeof(cl_uint), &patternSize); + setArgument(kernels_[fillType], 4, sizeof(offset), &offset); + setArgument(kernels_[fillType], 5, sizeof(fillSize), &fillSize); // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(dim, - globalWorkOffset, globalWorkSize, localWorkSize); - - // Execute the blit - address parameters = kernels_[blitType]->parameters().values(); - result = 
gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters); - if (releaseView) { - delete srcView; - delete dstView; - } - - synchronize(); - - return result; -} - -void -FindPinSize( - size_t& pinSize, const amd::Coord3D& size, - size_t& rowPitch, size_t& slicePitch, const Memory& mem) -{ - pinSize = size[0] * mem.elementSize(); - if ((rowPitch == 0) || (rowPitch == pinSize)) { - rowPitch = 0; - } - else { - pinSize = rowPitch; - } - - // Calculate the pin size, which should be equal to the copy size - for (uint i = 1; i < mem.desc().dimSize_; ++i) { - pinSize *= size[i]; - if (i == 1) { - if ((slicePitch == 0) || (slicePitch == pinSize)) { - slicePitch = 0; - } - else { - if (mem.desc().topology_ != CL_MEM_OBJECT_IMAGE1D_ARRAY) { - pinSize = slicePitch; - } - else { - pinSize = slicePitch * size[i]; - } - } - } - } -} - -bool -KernelBlitManager::readImage( - device::Memory& srcMemory, - void* dstHost, - const amd::Coord3D& origin, - const amd::Coord3D& size, - size_t rowPitch, - size_t slicePitch, - bool entire) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - - // Use host copy if memory has direct access or it's persistent - if (setup_.disableReadImage_ || - (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { - result = HostBlitManager::readImage(srcMemory, dstHost, - origin, size, rowPitch, slicePitch, entire); - synchronize(); - return result; - } - else { - size_t pinSize; - FindPinSize(pinSize, size, rowPitch, slicePitch, gpuMem(srcMemory)); - - size_t partial; - amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial); - - if (amdMemory == NULL) { - // Force SW copy - result = HostBlitManager::readImage(srcMemory, dstHost, - origin, size, rowPitch, slicePitch, entire); - synchronize(); - return result; - } - - // Readjust destination offset - const amd::Coord3D dstOrigin(partial); - - // Get device memory for this virtual device - Memory* dstMemory = dev().getGpuMemory(amdMemory); - - // 
Copy image to buffer - result = copyImageToBuffer(srcMemory, *dstMemory, - origin, dstOrigin, size, entire, rowPitch, slicePitch); - - // Add pinned memory for a later release - gpu().addPinnedMem(amdMemory); - } - - synchronize(); - - return result; -} - -bool -KernelBlitManager::writeImage( - const void* srcHost, - device::Memory& dstMemory, - const amd::Coord3D& origin, - const amd::Coord3D& size, - size_t rowPitch, - size_t slicePitch, - bool entire) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - - // Use host copy if memory has direct access or it's persistent - if (setup_.disableWriteImage_|| - gpuMem(dstMemory).isHostMemDirectAccess() || - gpuMem(dstMemory).isPersistentDirectMap()) { - result = HostBlitManager::writeImage( - srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); - synchronize(); - return result; - } - else { - size_t pinSize; - FindPinSize(pinSize, size, rowPitch, slicePitch, gpuMem(dstMemory)); - - size_t partial; - amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial); - - if (amdMemory == NULL) { - // Force SW copy - result = HostBlitManager::writeImage( - srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); - synchronize(); - return result; - } - - // Readjust destination offset - const amd::Coord3D srcOrigin(partial); - - // Get device memory for this virtual device - Memory* srcMemory = dev().getGpuMemory(amdMemory); - - // Copy image to buffer - result = copyBufferToImage(*srcMemory, dstMemory, - srcOrigin, origin, size, entire, rowPitch, slicePitch); - - // Add pinned memory for a later release - gpu().addPinnedMem(amdMemory); - } - - synchronize(); - - return result; -} - -bool -KernelBlitManager::copyBufferRect( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::BufferRect& srcRectIn, - const amd::BufferRect& dstRectIn, - const amd::Coord3D& sizeIn, - bool entire) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - bool rejected = false; - 
- // Fall into the PAL path for rejected transfers - if (setup_.disableCopyBufferRect_ || - gpuMem(srcMemory).isHostMemDirectAccess() || gpuMem(dstMemory).isHostMemDirectAccess()) { - result = DmaBlitManager::copyBufferRect(srcMemory, dstMemory, - srcRectIn, dstRectIn, sizeIn, entire); - - if (result) { - synchronize(); - return result; - } - } - - uint blitType = BlitCopyBufferRect; - size_t dim = 3; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - size_t globalWorkSize[3]; - size_t localWorkSize[3]; - - const static uint CopyRectAlignment[3] = { 16, 4, 1 }; - - bool aligned; - uint i; - for (i = 0; i < sizeof(CopyRectAlignment) / sizeof(uint); i++) { - // Check source alignments - aligned = ((srcRectIn.rowPitch_ % CopyRectAlignment[i]) == 0); - aligned &= ((srcRectIn.slicePitch_ % CopyRectAlignment[i]) == 0); - aligned &= ((srcRectIn.start_ % CopyRectAlignment[i]) == 0); - - // Check destination alignments - aligned &= ((dstRectIn.rowPitch_ % CopyRectAlignment[i]) == 0); - aligned &= ((dstRectIn.slicePitch_ % CopyRectAlignment[i]) == 0); - aligned &= ((dstRectIn.start_ % CopyRectAlignment[i]) == 0); - - // Check copy size alignment in the first dimension - aligned &= ((sizeIn[0] % CopyRectAlignment[i]) == 0); - - if (aligned) { - if (CopyRectAlignment[i] != 1) { - blitType = BlitCopyBufferRectAligned; - } - break; - } - } - - amd::BufferRect srcRect; - amd::BufferRect dstRect; - amd::Coord3D size(sizeIn[0], sizeIn[1], sizeIn[2]); - - srcRect.rowPitch_ = srcRectIn.rowPitch_ / CopyRectAlignment[i]; - srcRect.slicePitch_ = srcRectIn.slicePitch_ / CopyRectAlignment[i]; - srcRect.start_ = srcRectIn.start_ / CopyRectAlignment[i]; - srcRect.end_ = srcRectIn.end_ / CopyRectAlignment[i]; - - dstRect.rowPitch_ = dstRectIn.rowPitch_ / CopyRectAlignment[i]; - dstRect.slicePitch_ = dstRectIn.slicePitch_ / CopyRectAlignment[i]; - dstRect.start_ = dstRectIn.start_ / CopyRectAlignment[i]; - dstRect.end_ = dstRectIn.end_ / CopyRectAlignment[i]; - - size.c[0] /= CopyRectAlignment[i]; 
- - // Program the kernel's workload depending on the transfer dimensions - if ((size[1] == 1) && (size[2] == 1)) { - globalWorkSize[0] = amd::alignUp(size[0], 256); - globalWorkSize[1] = 1; - globalWorkSize[2] = 1; - localWorkSize[0] = 256; - localWorkSize[1] = 1; - localWorkSize[2] = 1; - } - else if (size[2] == 1) { - globalWorkSize[0] = amd::alignUp(size[0], 16); - globalWorkSize[1] = amd::alignUp(size[1], 16); - globalWorkSize[2] = 1; - localWorkSize[0] = localWorkSize[1] = 16; - localWorkSize[2] = 1; - } - else { - globalWorkSize[0] = amd::alignUp(size[0], 8); - globalWorkSize[1] = amd::alignUp(size[1], 8); - globalWorkSize[2] = amd::alignUp(size[2], 4); - localWorkSize[0] = localWorkSize[1] = 8; - localWorkSize[2] = 4; - } - - - // Program kernels arguments for the blit operation - Memory* mem = &gpuMem(srcMemory); - setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); - mem = &gpuMem(dstMemory); - setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); - cl_ulong src[4] = { srcRect.rowPitch_, - srcRect.slicePitch_, - srcRect.start_, 0 }; - setArgument(kernels_[blitType], 2, sizeof(src), src); - cl_ulong dst[4] = { dstRect.rowPitch_, - dstRect.slicePitch_, - dstRect.start_, 0 }; - setArgument(kernels_[blitType], 3, sizeof(dst), dst); - cl_ulong copySize[4] = { size[0], size[1], size[2], CopyRectAlignment[i] }; - setArgument(kernels_[blitType], 4, sizeof(copySize), copySize); - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(dim, - globalWorkOffset, globalWorkSize, localWorkSize); - - // Execute the blit - address parameters = kernels_[blitType]->parameters().values(); - result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters); - - synchronize(); - - return result; -} - -bool -KernelBlitManager::readBuffer( - device::Memory& srcMemory, - void* dstHost, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - 
// Use host copy if memory has direct access - if (setup_.disableReadBuffer_ || - (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { - result = HostBlitManager::readBuffer( - srcMemory, dstHost, origin, size, entire); - synchronize(); - return result; - } - else { - size_t pinSize = size[0]; - // Check if a pinned transfer can be executed with a single pin - if ((pinSize <= dev().settings().pinnedXferSize_) && - (pinSize > MinSizeForPinnedTransfer)) { - size_t partial; - amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial); - - if (amdMemory == NULL) { - // Force SW copy - result = HostBlitManager::readBuffer( - srcMemory, dstHost, origin, size, entire); - synchronize(); - return result; - } - - // Readjust host mem offset - amd::Coord3D dstOrigin(partial); - - // Get device memory for this virtual device - Memory* dstMemory = dev().getGpuMemory(amdMemory); - - // Copy image to buffer - result = copyBuffer(srcMemory, *dstMemory, - origin, dstOrigin, size, entire); - - // Add pinned memory for a later release - gpu().addPinnedMem(amdMemory); - } - else { - result = DmaBlitManager::readBuffer( - srcMemory, dstHost, origin, size, entire); - } - } - - synchronize(); - - return result; -} - -bool -KernelBlitManager::readBufferRect( - device::Memory& srcMemory, - void* dstHost, - const amd::BufferRect& bufRect, - const amd::BufferRect& hostRect, - const amd::Coord3D& size, - bool entire) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - - // Use host copy if memory has direct access - if (setup_.disableReadBufferRect_ || - (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { - result = HostBlitManager::readBufferRect( - srcMemory, dstHost, bufRect, hostRect, size, entire); - synchronize(); - return result; - } - else { - size_t pinSize = hostRect.start_ + hostRect.end_; - size_t partial; - amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial); - - if (amdMemory == 
NULL) { - // Force SW copy - result = HostBlitManager::readBufferRect( - srcMemory, dstHost, bufRect, hostRect, size, entire); - synchronize(); - return result; - } - - // Readjust host mem offset - amd::BufferRect rect; - rect.rowPitch_ = hostRect.rowPitch_; - rect.slicePitch_ = hostRect.slicePitch_; - rect.start_ = hostRect.start_ + partial; - rect.end_ = hostRect.end_; - - // Get device memory for this virtual device - Memory* dstMemory = dev().getGpuMemory(amdMemory); - - // Copy image to buffer - result = copyBufferRect(srcMemory, *dstMemory, - bufRect, rect, size, entire); - - // Add pinned memory for a later release - gpu().addPinnedMem(amdMemory); - } - - synchronize(); - - return result; -} - -bool -KernelBlitManager::writeBuffer( - const void* srcHost, - device::Memory& dstMemory, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - - // Use host copy if memory has direct access or it's persistent - if (setup_.disableWriteBuffer_ || - gpuMem(dstMemory).isHostMemDirectAccess() || - (gpuMem(dstMemory).memoryType() == Resource::Persistent)) { - result = HostBlitManager::writeBuffer( - srcHost, dstMemory, origin, size, entire); - synchronize(); - return result; - } - else { - size_t pinSize = size[0]; - - // Check if a pinned transfer can be executed with a single pin - if ((pinSize <= dev().settings().pinnedXferSize_) && - (pinSize > MinSizeForPinnedTransfer)) { - size_t partial; - amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial); - - if (amdMemory == NULL) { - // Force SW copy - result = HostBlitManager::writeBuffer( - srcHost, dstMemory, origin, size, entire); - synchronize(); - return result; - } - - // Readjust destination offset - const amd::Coord3D srcOrigin(partial); - - // Get device memory for this virtual device - Memory* srcMemory = dev().getGpuMemory(amdMemory); - - // Copy buffer rect - result = copyBuffer(*srcMemory, dstMemory, - srcOrigin, 
origin, size, entire); - - // Add pinned memory for a later release - gpu().addPinnedMem(amdMemory); - } - else { - result = DmaBlitManager::writeBuffer( - srcHost, dstMemory, origin, size, entire); - } - } - - synchronize(); - - return result; -} - -bool -KernelBlitManager::writeBufferRect( - const void* srcHost, - device::Memory& dstMemory, - const amd::BufferRect& hostRect, - const amd::BufferRect& bufRect, - const amd::Coord3D& size, - bool entire) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - - // Use host copy if memory has direct access or it's persistent - if (setup_.disableWriteBufferRect_ || - gpuMem(dstMemory).isHostMemDirectAccess() || - gpuMem(dstMemory).isPersistentDirectMap()) { - result = HostBlitManager::writeBufferRect( - srcHost, dstMemory, hostRect, bufRect, size, entire); - synchronize(); - return result; - } - else { - size_t pinSize = hostRect.start_ + hostRect.end_; - size_t partial; - amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial); - - if (amdMemory == NULL) { - // Force SW copy - result = HostBlitManager::writeBufferRect( - srcHost, dstMemory, hostRect, bufRect, size, entire); - synchronize(); - return result; - } - - // Readjust destination offset - const amd::Coord3D srcOrigin(partial); - - // Get device memory for this virtual device - Memory* srcMemory = dev().getGpuMemory(amdMemory); - - // Readjust host mem offset - amd::BufferRect rect; - rect.rowPitch_ = hostRect.rowPitch_; - rect.slicePitch_ = hostRect.slicePitch_; - rect.start_ = hostRect.start_ + partial; - rect.end_ = hostRect.end_; - - // Copy buffer rect - result = copyBufferRect(*srcMemory, dstMemory, - rect, bufRect, size, entire); - - // Add pinned memory for a later release - gpu().addPinnedMem(amdMemory); - } - - synchronize(); - - return result; -} - -bool -KernelBlitManager::fillBuffer( - device::Memory& memory, - const void* pattern, - size_t patternSize, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire 
- ) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - - // Use host fill if memory has direct access - if (setup_.disableFillBuffer_ || - gpuMem(memory).isHostMemDirectAccess()) { - result = HostBlitManager::fillBuffer( - memory, pattern, patternSize, origin, size, entire); - synchronize(); - return result; - } - else { - uint fillType = FillBuffer; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - cl_ulong fillSize = size[0] / patternSize; - size_t globalWorkSize = amd::alignUp(fillSize, 256); - size_t localWorkSize = 256; - bool dwordAligned = - ((patternSize % sizeof(uint32_t)) == 0) ? true : false; - - // Program kernels arguments for the fill operation - Memory* mem = &gpuMem(memory); - if (dwordAligned) { - setArgument(kernels_[fillType], 0, sizeof(cl_mem), NULL); - setArgument(kernels_[fillType], 1, sizeof(cl_mem), &mem); - } - else { - setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem); - setArgument(kernels_[fillType], 1, sizeof(cl_mem), NULL); - } - Memory* gpuCB = dev().getGpuMemory(constantBuffer_); - if (gpuCB == NULL) { - return false; - } - void* constBuf = gpuCB->map(&gpu(), Resource::WriteOnly); - memcpy(constBuf, pattern, patternSize); - gpuCB->unmap(&gpu()); - setArgument(kernels_[fillType], 2, sizeof(cl_mem), &gpuCB); - cl_ulong offset = origin[0]; - if (dwordAligned) { - patternSize /= sizeof(uint32_t); - offset /= sizeof(uint32_t); - } - setArgument(kernels_[fillType], 3, sizeof(cl_uint), &patternSize); - setArgument(kernels_[fillType], 4, sizeof(offset), &offset); - setArgument(kernels_[fillType], 5, sizeof(fillSize), &fillSize); - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(1, - globalWorkOffset, &globalWorkSize, &localWorkSize); - - // Execute the blit - address parameters = kernels_[fillType]->parameters().values(); - result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters); - } - - synchronize(); - - return result; -} - -bool 
-KernelBlitManager::copyBuffer( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& sizeIn, - bool entire) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - - if (!gpuMem(srcMemory).isHostMemDirectAccess() && - !gpuMem(dstMemory).isHostMemDirectAccess()) { - uint blitType = BlitCopyBuffer; - size_t dim = 1; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - size_t globalWorkSize = 0; - size_t localWorkSize = 0; - - const static uint CopyBuffAlignment[3] = { 16, 4, 1 }; - amd::Coord3D size(sizeIn[0], sizeIn[1], sizeIn[2]); - - bool aligned; - uint i; - for (i = 0; i < sizeof(CopyBuffAlignment) / sizeof(uint); i++) { - // Check source alignments - aligned = ((srcOrigin[0] % CopyBuffAlignment[i]) == 0); - // Check destination alignments - aligned &= ((dstOrigin[0] % CopyBuffAlignment[i]) == 0); - // Check copy size alignment in the first dimension - aligned &= ((sizeIn[0] % CopyBuffAlignment[i]) == 0); - - if (aligned) { - if (CopyBuffAlignment[i] != 1) { - blitType = BlitCopyBufferAligned; - } - break; - } - } - - cl_uint remain; - if (blitType == BlitCopyBufferAligned) { - size.c[0] /= CopyBuffAlignment[i]; - } - else { - remain = size[0] % 4; - size.c[0] /= 4; - size.c[0] += 1; - } - - // Program the dispatch dimensions - localWorkSize = 256; - globalWorkSize = amd::alignUp(size[0] , 256); - - // Program kernels arguments for the blit operation - Memory* mem = &gpuMem(srcMemory); - setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); - mem = &gpuMem(dstMemory); - setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); - // Program source origin - cl_ulong srcOffset = srcOrigin[0] / CopyBuffAlignment[i];; - setArgument(kernels_[blitType], 2, sizeof(srcOffset), &srcOffset); - - // Program destinaiton origin - cl_ulong dstOffset = dstOrigin[0] / CopyBuffAlignment[i];; - setArgument(kernels_[blitType], 3, sizeof(dstOffset), &dstOffset); - - cl_ulong 
copySize = size[0]; - setArgument(kernels_[blitType], 4, sizeof(copySize), ©Size); - - if (blitType == BlitCopyBufferAligned) { - cl_int alignment = CopyBuffAlignment[i]; - setArgument(kernels_[blitType], 5, sizeof(alignment), &alignment); - } - else { - setArgument(kernels_[blitType], 5, sizeof(remain), &remain); - } - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(1, - globalWorkOffset, &globalWorkSize, &localWorkSize); - - // Execute the blit - address parameters = kernels_[blitType]->parameters().values(); - result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters); - } - else { - result = DmaBlitManager::copyBuffer( - srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire); - } - - synchronize(); - - return result; -} - -bool -KernelBlitManager::fillImage( - device::Memory& memory, - const void* pattern, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire - ) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - - // Use host fill if memory has direct access - if (setup_.disableFillImage_ || - gpuMem(memory).isHostMemDirectAccess()) { - result = HostBlitManager::fillImage( - memory, pattern, origin, size, entire); - synchronize(); - return result; - } - - uint fillType; - size_t dim = 0; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - size_t globalWorkSize[3]; - size_t localWorkSize[3]; - Memory* memView = &gpuMem(memory); - amd::Image::Format newFormat(gpuMem(memory).owner()->asImage()->getImageFormat()); - - // Program the kernels workload depending on the fill dimensions - fillType = FillImage; - dim = 3; - - void *newpattern = const_cast(pattern); - cl_uint4 iFillColor; - - bool rejected = false; - bool releaseView = false; - // For depth, we need to create a view - if (memView->desc().format_.image_channel_order == CL_sRGBA) { - // Find unsupported data type - for (uint i = 0; i < RejectedFormatDataTotal; ++i) { - if (RejectedData[i].clOldType_ == 
newFormat.image_channel_data_type) { - newFormat.image_channel_data_type = RejectedData[i].clNewType_; - rejected = true; - break; - } - } - - if (gpuMem(memory).desc().format_.image_channel_order == CL_sRGBA) { - // Converting a linear RGB floating-point color value to a 8-bit unsigned integer sRGB value because hw is not support write_imagef for sRGB. - float *fColor = static_cast(newpattern); - iFillColor.s[0] = sRGBmap(fColor[0]); - iFillColor.s[1] = sRGBmap(fColor[1]); - iFillColor.s[2] = sRGBmap(fColor[2]); - iFillColor.s[3] = (cl_uint)(fColor[3]*255.0f); - newpattern = static_cast(&iFillColor); - for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { - if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { - newFormat.image_channel_order = RejectedOrder[i].clNewType_; - rejected = true; - break; - } - } - } - } - // If the image format was rejected, then attempt to create a view - if (rejected) { - memView = createView(gpuMem(memory), newFormat); - if (memView != NULL) { - rejected = false; - releaseView = true; - } - } - - // Perform workload split to allow multiple operations in a single thread - globalWorkSize[0] = (size[0] + TransferSplitSize - 1) / TransferSplitSize; - // Find the current blit type - if (memView->desc().dimSize_ == 1) { - globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 256); - globalWorkSize[1] = amd::alignUp(size[1], 1); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = 256; - localWorkSize[1] = localWorkSize[2] = 1; - } - else if (memView->desc().dimSize_ == 2) { - globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 16); - globalWorkSize[1] = amd::alignUp(size[1], 16); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = localWorkSize[1] = 16; - localWorkSize[2] = 1; - } - else { - globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 8); - globalWorkSize[1] = amd::alignUp(size[1], 8); - globalWorkSize[2] = amd::alignUp(size[2], 4); - localWorkSize[0] = localWorkSize[1] = 8; - 
localWorkSize[2] = 4; - } - - // Program kernels arguments for the blit operation - Memory* mem = memView; - setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem); - setArgument(kernels_[fillType], 1, sizeof(cl_float4), newpattern); - setArgument(kernels_[fillType], 2, sizeof(cl_int4), newpattern); - setArgument(kernels_[fillType], 3, sizeof(cl_uint4), newpattern); - - cl_int fillOrigin[4] = { (cl_int)origin[0], - (cl_int)origin[1], - (cl_int)origin[2], 0 }; - cl_int fillSize[4] = { (cl_int)size[0], - (cl_int)size[1], - (cl_int)size[2], 0 }; - setArgument(kernels_[fillType], 4, sizeof(fillOrigin), fillOrigin); - setArgument(kernels_[fillType], 5, sizeof(fillSize), fillSize); - - // Find the type of image - uint32_t type = 0; - switch (newFormat.image_channel_data_type) { - case CL_SNORM_INT8: - case CL_SNORM_INT16: - case CL_UNORM_INT8: - case CL_UNORM_INT16: - case CL_UNORM_SHORT_565: - case CL_UNORM_SHORT_555: - case CL_UNORM_INT_101010: - case CL_HALF_FLOAT: - case CL_FLOAT: - type = 0; - break; - case CL_SIGNED_INT8: - case CL_SIGNED_INT16: - case CL_SIGNED_INT32: - type = 1; - break; - case CL_UNSIGNED_INT8: - case CL_UNSIGNED_INT16: - case CL_UNSIGNED_INT32: - type = 2; - break; - } - setArgument(kernels_[fillType], 6, sizeof(type), &type); - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(dim, - globalWorkOffset, globalWorkSize, localWorkSize); + amd::NDRangeContainer ndrange(1, globalWorkOffset, &globalWorkSize, &localWorkSize); // Execute the blit address parameters = kernels_[fillType]->parameters().values(); result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters); - if (releaseView) { - delete memView; - } + } - synchronize(); + synchronize(); - return result; + return result; } -bool -KernelBlitManager::runScheduler( - device::Memory& vqueue, - device::Memory& params, - uint paramIdx, - uint threads - ) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; +bool 
KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, + const amd::Coord3D& sizeIn, bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; - size_t dim = 1; - size_t globalWorkOffset[1] = { 0 }; - size_t globalWorkSize[1] = { threads }; - size_t localWorkSize[1] = { 1 }; + if (!gpuMem(srcMemory).isHostMemDirectAccess() && !gpuMem(dstMemory).isHostMemDirectAccess()) { + uint blitType = BlitCopyBuffer; + size_t dim = 1; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize = 0; + size_t localWorkSize = 0; - // Program kernels arguments - Memory* q = &gpuMem(vqueue); - Memory* p = &gpuMem(params); - setArgument(kernels_[Scheduler], 0, sizeof(cl_mem), &q); - setArgument(kernels_[Scheduler], 1, sizeof(cl_mem), &p); - setArgument(kernels_[Scheduler], 2, sizeof(uint), ¶mIdx); + const static uint CopyBuffAlignment[3] = {16, 4, 1}; + amd::Coord3D size(sizeIn[0], sizeIn[1], sizeIn[2]); + + bool aligned; + uint i; + for (i = 0; i < sizeof(CopyBuffAlignment) / sizeof(uint); i++) { + // Check source alignments + aligned = ((srcOrigin[0] % CopyBuffAlignment[i]) == 0); + // Check destination alignments + aligned &= ((dstOrigin[0] % CopyBuffAlignment[i]) == 0); + // Check copy size alignment in the first dimension + aligned &= ((sizeIn[0] % CopyBuffAlignment[i]) == 0); + + if (aligned) { + if (CopyBuffAlignment[i] != 1) { + blitType = BlitCopyBufferAligned; + } + break; + } + } + + cl_uint remain; + if (blitType == BlitCopyBufferAligned) { + size.c[0] /= CopyBuffAlignment[i]; + } else { + remain = size[0] % 4; + size.c[0] /= 4; + size.c[0] += 1; + } + + // Program the dispatch dimensions + localWorkSize = 256; + globalWorkSize = amd::alignUp(size[0], 256); + + // Program kernels arguments for the blit operation + Memory* mem = &gpuMem(srcMemory); + setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); + mem = &gpuMem(dstMemory); + 
setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); + // Program source origin + cl_ulong srcOffset = srcOrigin[0] / CopyBuffAlignment[i]; + ; + setArgument(kernels_[blitType], 2, sizeof(srcOffset), &srcOffset); + + // Program destinaiton origin + cl_ulong dstOffset = dstOrigin[0] / CopyBuffAlignment[i]; + ; + setArgument(kernels_[blitType], 3, sizeof(dstOffset), &dstOffset); + + cl_ulong copySize = size[0]; + setArgument(kernels_[blitType], 4, sizeof(copySize), ©Size); + + if (blitType == BlitCopyBufferAligned) { + cl_int alignment = CopyBuffAlignment[i]; + setArgument(kernels_[blitType], 5, sizeof(alignment), &alignment); + } else { + setArgument(kernels_[blitType], 5, sizeof(remain), &remain); + } // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(1, - globalWorkOffset, globalWorkSize, localWorkSize); + amd::NDRangeContainer ndrange(1, globalWorkOffset, &globalWorkSize, &localWorkSize); // Execute the blit - address parameters = kernels_[Scheduler]->parameters().values(); - result = gpu().submitKernelInternal(ndrange, *kernels_[Scheduler], parameters); + address parameters = kernels_[blitType]->parameters().values(); + result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters); + } else { + result = DmaBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire); + } + synchronize(); + + return result; +} + +bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern, + const amd::Coord3D& origin, const amd::Coord3D& size, + bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host fill if memory has direct access + if (setup_.disableFillImage_ || gpuMem(memory).isHostMemDirectAccess()) { + result = HostBlitManager::fillImage(memory, pattern, origin, size, entire); synchronize(); - return result; -} + } -void -KernelBlitManager::writeRawData( - device::Memory& memory, - size_t size, - const void* data - ) const -{ - 
static_cast(memory).writeRawData(gpu(), 0, size, data, false); + uint fillType; + size_t dim = 0; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3]; + size_t localWorkSize[3]; + Memory* memView = &gpuMem(memory); + amd::Image::Format newFormat(gpuMem(memory).owner()->asImage()->getImageFormat()); - synchronize(); -} + // Program the kernels workload depending on the fill dimensions + fillType = FillImage; + dim = 3; -amd::Memory* -DmaBlitManager::pinHostMemory( - const void* hostMem, - size_t pinSize, - size_t& partial) const -{ - size_t pinAllocSize; - const static bool SysMem = true; - amd::Memory* amdMemory; + void* newpattern = const_cast(pattern); + cl_uint4 iFillColor; - // Allign offset to 4K boundary (Vista/Win7 limitation) - char* tmpHost = const_cast( - amd::alignDown(reinterpret_cast(hostMem), - PinnedMemoryAlignment)); - - // Find the partial size for unaligned copy - partial = reinterpret_cast(hostMem) - tmpHost; - - // Recalculate pin memory size - pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment); - - amdMemory = gpu().findPinnedMem(tmpHost, pinAllocSize); - - if (NULL != amdMemory) { - return amdMemory; + bool rejected = false; + bool releaseView = false; + // For depth, we need to create a view + if (memView->desc().format_.image_channel_order == CL_sRGBA) { + // Find unsupported data type + for (uint i = 0; i < RejectedFormatDataTotal; ++i) { + if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { + newFormat.image_channel_data_type = RejectedData[i].clNewType_; + rejected = true; + break; + } } - amdMemory = new(*context_) - amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, pinAllocSize); - - if ((amdMemory != NULL) && !amdMemory->create(tmpHost, SysMem)) { - amdMemory->release(); - return NULL; - } - - // Get device memory for this virtual device - // @note: This will force real memory pinning - amdMemory->setVirtualDevice(&gpu()); - Memory* srcMemory = dev().getGpuMemory(amdMemory); - - if 
(srcMemory == NULL) { - // Release all pinned memory and attempt pinning again - gpu().releasePinnedMem(); - srcMemory = dev().getGpuMemory(amdMemory); - if (srcMemory == NULL) { - // Release memory - amdMemory->release(); - amdMemory = NULL; + if (gpuMem(memory).desc().format_.image_channel_order == CL_sRGBA) { + // Converting a linear RGB floating-point color value to a 8-bit unsigned integer sRGB value + // because hw is not support write_imagef for sRGB. + float* fColor = static_cast(newpattern); + iFillColor.s[0] = sRGBmap(fColor[0]); + iFillColor.s[1] = sRGBmap(fColor[1]); + iFillColor.s[2] = sRGBmap(fColor[2]); + iFillColor.s[3] = (cl_uint)(fColor[3] * 255.0f); + newpattern = static_cast(&iFillColor); + for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { + if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { + newFormat.image_channel_order = RejectedOrder[i].clNewType_; + rejected = true; + break; } + } } + } + // If the image format was rejected, then attempt to create a view + if (rejected) { + memView = createView(gpuMem(memory), newFormat); + if (memView != NULL) { + rejected = false; + releaseView = true; + } + } + // Perform workload split to allow multiple operations in a single thread + globalWorkSize[0] = (size[0] + TransferSplitSize - 1) / TransferSplitSize; + // Find the current blit type + if (memView->desc().dimSize_ == 1) { + globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 256); + globalWorkSize[1] = amd::alignUp(size[1], 1); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = 256; + localWorkSize[1] = localWorkSize[2] = 1; + } else if (memView->desc().dimSize_ == 2) { + globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 16); + globalWorkSize[1] = amd::alignUp(size[1], 16); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = localWorkSize[1] = 16; + localWorkSize[2] = 1; + } else { + globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 8); + globalWorkSize[1] = amd::alignUp(size[1], 8); + 
globalWorkSize[2] = amd::alignUp(size[2], 4); + localWorkSize[0] = localWorkSize[1] = 8; + localWorkSize[2] = 4; + } + + // Program kernels arguments for the blit operation + Memory* mem = memView; + setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem); + setArgument(kernels_[fillType], 1, sizeof(cl_float4), newpattern); + setArgument(kernels_[fillType], 2, sizeof(cl_int4), newpattern); + setArgument(kernels_[fillType], 3, sizeof(cl_uint4), newpattern); + + cl_int fillOrigin[4] = {(cl_int)origin[0], (cl_int)origin[1], (cl_int)origin[2], 0}; + cl_int fillSize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0}; + setArgument(kernels_[fillType], 4, sizeof(fillOrigin), fillOrigin); + setArgument(kernels_[fillType], 5, sizeof(fillSize), fillSize); + + // Find the type of image + uint32_t type = 0; + switch (newFormat.image_channel_data_type) { + case CL_SNORM_INT8: + case CL_SNORM_INT16: + case CL_UNORM_INT8: + case CL_UNORM_INT16: + case CL_UNORM_SHORT_565: + case CL_UNORM_SHORT_555: + case CL_UNORM_INT_101010: + case CL_HALF_FLOAT: + case CL_FLOAT: + type = 0; + break; + case CL_SIGNED_INT8: + case CL_SIGNED_INT16: + case CL_SIGNED_INT32: + type = 1; + break; + case CL_UNSIGNED_INT8: + case CL_UNSIGNED_INT16: + case CL_UNSIGNED_INT32: + type = 2; + break; + } + setArgument(kernels_[fillType], 6, sizeof(type), &type); + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize); + + // Execute the blit + address parameters = kernels_[fillType]->parameters().values(); + result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters); + if (releaseView) { + delete memView; + } + + synchronize(); + + return result; +} + +bool KernelBlitManager::runScheduler(device::Memory& vqueue, device::Memory& params, uint paramIdx, + uint threads) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + + size_t dim = 1; + size_t globalWorkOffset[1] = {0}; + size_t 
globalWorkSize[1] = {threads}; + size_t localWorkSize[1] = {1}; + + // Program kernels arguments + Memory* q = &gpuMem(vqueue); + Memory* p = &gpuMem(params); + setArgument(kernels_[Scheduler], 0, sizeof(cl_mem), &q); + setArgument(kernels_[Scheduler], 1, sizeof(cl_mem), &p); + setArgument(kernels_[Scheduler], 2, sizeof(uint), ¶mIdx); + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(1, globalWorkOffset, globalWorkSize, localWorkSize); + + // Execute the blit + address parameters = kernels_[Scheduler]->parameters().values(); + result = gpu().submitKernelInternal(ndrange, *kernels_[Scheduler], parameters); + + synchronize(); + + return result; +} + +void KernelBlitManager::writeRawData(device::Memory& memory, size_t size, const void* data) const { + static_cast(memory).writeRawData(gpu(), 0, size, data, false); + + synchronize(); +} + +amd::Memory* DmaBlitManager::pinHostMemory(const void* hostMem, size_t pinSize, + size_t& partial) const { + size_t pinAllocSize; + const static bool SysMem = true; + amd::Memory* amdMemory; + + // Allign offset to 4K boundary (Vista/Win7 limitation) + char* tmpHost = const_cast( + amd::alignDown(reinterpret_cast(hostMem), PinnedMemoryAlignment)); + + // Find the partial size for unaligned copy + partial = reinterpret_cast(hostMem) - tmpHost; + + // Recalculate pin memory size + pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment); + + amdMemory = gpu().findPinnedMem(tmpHost, pinAllocSize); + + if (NULL != amdMemory) { return amdMemory; -} + } -Memory* -KernelBlitManager::createView( - const Memory& parent, - const cl_image_format format -) const -{ - assert(!parent.desc().buffer_ && "View supports images only"); - Memory* gpuImage = NULL; + amdMemory = new (*context_) amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, pinAllocSize); - gpuImage = new Image(dev(), parent.size(), - parent.desc().width_, - parent.desc().height_, - parent.desc().depth_, - format, - parent.desc().topology_, 
- 1); + if ((amdMemory != NULL) && !amdMemory->create(tmpHost, SysMem)) { + amdMemory->release(); + return NULL; + } - // Create resource - if (NULL != gpuImage) { - bool result = false; - Resource::ImageViewParams params; - const Memory& gpuMem = static_cast(parent); + // Get device memory for this virtual device + // @note: This will force real memory pinning + amdMemory->setVirtualDevice(&gpu()); + Memory* srcMemory = dev().getGpuMemory(amdMemory); - params.owner_ = parent.owner(); - params.level_ = parent.desc().baseLevel_; - params.layer_ = 0; - params.resource_ = &gpuMem; - params.memory_ = &gpuMem; - params.gpu_ = &gpu(); - - // Create memory object - result = gpuImage->create(Resource::ImageView, ¶ms); - if (!result) { - delete gpuImage; - return NULL; - } + if (srcMemory == NULL) { + // Release all pinned memory and attempt pinning again + gpu().releasePinnedMem(); + srcMemory = dev().getGpuMemory(amdMemory); + if (srcMemory == NULL) { + // Release memory + amdMemory->release(); + amdMemory = NULL; } + } - return gpuImage; + return amdMemory; } -} // namespace pal +Memory* KernelBlitManager::createView(const Memory& parent, const cl_image_format format) const { + assert(!parent.desc().buffer_ && "View supports images only"); + Memory* gpuImage = NULL; + + gpuImage = new Image(dev(), parent.size(), parent.desc().width_, parent.desc().height_, + parent.desc().depth_, format, parent.desc().topology_, 1); + + // Create resource + if (NULL != gpuImage) { + bool result = false; + Resource::ImageViewParams params; + const Memory& gpuMem = static_cast(parent); + + params.owner_ = parent.owner(); + params.level_ = parent.desc().baseLevel_; + params.layer_ = 0; + params.resource_ = &gpuMem; + params.memory_ = &gpuMem; + params.gpu_ = &gpu(); + + // Create memory object + result = gpuImage->create(Resource::ImageView, ¶ms); + if (!result) { + delete gpuImage; + return NULL; + } + } + + return gpuImage; +} + +} // namespace pal diff --git 
a/rocclr/runtime/device/pal/palblit.hpp b/rocclr/runtime/device/pal/palblit.hpp index b19dc114d3..4e7e0187a3 100644 --- a/rocclr/runtime/device/pal/palblit.hpp +++ b/rocclr/runtime/device/pal/palblit.hpp @@ -22,436 +22,390 @@ class Memory; class VirtualGPU; //! DMA Blit Manager -class DmaBlitManager : public device::HostBlitManager -{ -public: - //! Constructor - DmaBlitManager( - VirtualGPU& gpu, //!< Virtual GPU to be used for blits - Setup setup = Setup() //!< Specifies HW accelerated blits - ); +class DmaBlitManager : public device::HostBlitManager { + public: + //! Constructor + DmaBlitManager(VirtualGPU& gpu, //!< Virtual GPU to be used for blits + Setup setup = Setup() //!< Specifies HW accelerated blits + ); - //! Destructor - virtual ~DmaBlitManager() {} + //! Destructor + virtual ~DmaBlitManager() {} - //! Creates DmaBlitManager object - virtual bool create(amd::Device& device) { return true; } + //! Creates DmaBlitManager object + virtual bool create(amd::Device& device) { return true; } - //! Copies a buffer object to system memory - virtual bool readBuffer( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destination host memory - const amd::Coord3D& origin, //!< Source origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies a buffer object to system memory + virtual bool readBuffer(device::Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destination host memory + const amd::Coord3D& origin, //!< Source origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! 
Copies a buffer object to system memory - virtual bool readBufferRect( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destinaiton host memory - const amd::BufferRect& bufRect, //!< Source rectangle - const amd::BufferRect& hostRect, //!< Destination rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies a buffer object to system memory + virtual bool readBufferRect(device::Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destinaiton host memory + const amd::BufferRect& bufRect, //!< Source rectangle + const amd::BufferRect& hostRect, //!< Destination rectangle + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies an image object to system memory - virtual bool readImage( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destination host memory - const amd::Coord3D& origin, //!< Source origin - const amd::Coord3D& size, //!< Size of the copy region - size_t rowPitch, //!< Row pitch for host memory - size_t slicePitch, //!< Slice pitch for host memory - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies an image object to system memory + virtual bool readImage(device::Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destination host memory + const amd::Coord3D& origin, //!< Source origin + const amd::Coord3D& size, //!< Size of the copy region + size_t rowPitch, //!< Row pitch for host memory + size_t slicePitch, //!< Slice pitch for host memory + bool entire = false //!< Entire buffer will be updated + ) const; - //! 
Copies system memory to a buffer object - virtual bool writeBuffer( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies system memory to a buffer object + virtual bool writeBuffer(const void* srcHost, //!< Source host memory + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies system memory to a buffer object - virtual bool writeBufferRect( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::BufferRect& hostRect, //!< Destination rectangle - const amd::BufferRect& bufRect, //!< Source rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies system memory to a buffer object + virtual bool writeBufferRect(const void* srcHost, //!< Source host memory + device::Memory& dstMemory, //!< Destination memory object + const amd::BufferRect& hostRect, //!< Destination rectangle + const amd::BufferRect& bufRect, //!< Source rectangle + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! 
Copies system memory to an image object - virtual bool writeImage( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - size_t rowPitch, //!< Row pitch for host memory - size_t slicePitch, //!< Slice pitch for host memory - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies system memory to an image object + virtual bool writeImage(const void* srcHost, //!< Source host memory + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + size_t rowPitch, //!< Row pitch for host memory + size_t slicePitch, //!< Slice pitch for host memory + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies a buffer object to another buffer object - virtual bool copyBuffer( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies a buffer object to another buffer object + virtual bool copyBuffer(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! 
Copies a buffer object to another buffer object - virtual bool copyBufferRect( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::BufferRect& srcRect, //!< Source rectangle - const amd::BufferRect& dstRect, //!< Destination rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies a buffer object to another buffer object + virtual bool copyBufferRect(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::BufferRect& srcRect, //!< Source rectangle + const amd::BufferRect& dstRect, //!< Destination rectangle + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies an image object to a buffer object - virtual bool copyImageToBuffer( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const; + //! Copies an image object to a buffer object + virtual bool copyImageToBuffer(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const; - //! 
Copies a buffer object to an image object - virtual bool copyBufferToImage( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const; + //! Copies a buffer object to an image object + virtual bool copyBufferToImage(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const; - //! Copies an image object to another image object - virtual bool copyImage( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! 
Copies an image object to another image object + virtual bool copyImage(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; -protected: - const static uint MaxPinnedBuffers = 4; + protected: + const static uint MaxPinnedBuffers = 4; - //! Synchronizes the blit operations if necessary - inline void synchronize() const; + //! Synchronizes the blit operations if necessary + inline void synchronize() const; - //! Returns the virtual GPU object - VirtualGPU& gpu() const { return static_cast(vDev_); } + //! Returns the virtual GPU object + VirtualGPU& gpu() const { return static_cast(vDev_); } - //! Returns the GPU device object - const Device& dev() const { return static_cast(dev_); }; + //! Returns the GPU device object + const Device& dev() const { return static_cast(dev_); }; - inline Memory& gpuMem(device::Memory& mem) const; + inline Memory& gpuMem(device::Memory& mem) const; - //! Pins host memory for GPU access - amd::Memory* pinHostMemory( - const void* hostMem, //!< Host memory pointer - size_t pinSize, //!< Host memory size - size_t& partial //!< Extra offset for memory alignment - ) const; + //! Pins host memory for GPU access + amd::Memory* pinHostMemory(const void* hostMem, //!< Host memory pointer + size_t pinSize, //!< Host memory size + size_t& partial //!< Extra offset for memory alignment + ) const; - const size_t MinSizeForPinnedTransfer; - bool completeOperation_; //!< DMA blit manager must complete operation - amd::Context* context_; //!< A dummy context + const size_t MinSizeForPinnedTransfer; + bool completeOperation_; //!< DMA blit manager must complete operation + amd::Context* context_; //!< A dummy context -private: + private: + //! 
Disable copy constructor + DmaBlitManager(const DmaBlitManager&); - //! Disable copy constructor - DmaBlitManager(const DmaBlitManager&); + //! Disable operator= + DmaBlitManager& operator=(const DmaBlitManager&); - //! Disable operator= - DmaBlitManager& operator=(const DmaBlitManager&); + //! Reads video memory, using a staged buffer + bool readMemoryStaged(Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destination host memory + Memory** xferBuf, //!< Staged buffer for read + size_t origin, //!< Original offset in the source memory + size_t& offset, //!< Offset for the current copy pointer + size_t& totalSize, //!< Total size for copy region + size_t xferSize //!< Transfer size + ) const; - //! Reads video memory, using a staged buffer - bool readMemoryStaged( - Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destination host memory - Memory** xferBuf, //!< Staged buffer for read - size_t origin, //!< Original offset in the source memory - size_t& offset, //!< Offset for the current copy pointer - size_t& totalSize, //!< Total size for copy region - size_t xferSize //!< Transfer size - ) const; - - //! Write into video memory, using a staged buffer - bool writeMemoryStaged( - const void* srcHost, //!< Source host memory - Memory& dstMemory, //!< Destination memory object - Memory& xferBuf, //!< Staged buffer for write - size_t origin, //!< Original offset in the destination memory - size_t& offset, //!< Offset for the current copy pointer - size_t& totalSize, //!< Total size for the copy region - size_t xferSize //!< Transfer size - ) const; + //! 
Write into video memory, using a staged buffer + bool writeMemoryStaged(const void* srcHost, //!< Source host memory + Memory& dstMemory, //!< Destination memory object + Memory& xferBuf, //!< Staged buffer for write + size_t origin, //!< Original offset in the destination memory + size_t& offset, //!< Offset for the current copy pointer + size_t& totalSize, //!< Total size for the copy region + size_t xferSize //!< Transfer size + ) const; }; //! Kernel Blit Manager -class KernelBlitManager : public DmaBlitManager -{ -public: - enum { - BlitCopyImage = 0, - BlitCopyImage1DA, - BlitCopyImageToBuffer, - BlitCopyBufferToImage, - BlitCopyBufferRect, - BlitCopyBufferRectAligned, - BlitCopyBuffer, - BlitCopyBufferAligned, - FillBuffer, - FillImage, - Scheduler, - BlitTotal - }; +class KernelBlitManager : public DmaBlitManager { + public: + enum { + BlitCopyImage = 0, + BlitCopyImage1DA, + BlitCopyImageToBuffer, + BlitCopyBufferToImage, + BlitCopyBufferRect, + BlitCopyBufferRectAligned, + BlitCopyBuffer, + BlitCopyBufferAligned, + FillBuffer, + FillImage, + Scheduler, + BlitTotal + }; - //! Constructor - KernelBlitManager( - VirtualGPU& gpu, //!< Virtual GPU to be used for blits - Setup setup = Setup() //!< Specifies HW accelerated blits - ); + //! Constructor + KernelBlitManager(VirtualGPU& gpu, //!< Virtual GPU to be used for blits + Setup setup = Setup() //!< Specifies HW accelerated blits + ); - //! Destructor - virtual ~KernelBlitManager(); + //! Destructor + virtual ~KernelBlitManager(); - //! Creates DmaBlitManager object - virtual bool create(amd::Device& device); + //! Creates DmaBlitManager object + virtual bool create(amd::Device& device); - //! 
Copies a buffer object to another buffer object - virtual bool copyBufferRect( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::BufferRect& srcRectIn, //!< Source rectangle - const amd::BufferRect& dstRectIn, //!< Destination rectangle - const amd::Coord3D& sizeIn, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies a buffer object to another buffer object + virtual bool copyBufferRect(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::BufferRect& srcRectIn, //!< Source rectangle + const amd::BufferRect& dstRectIn, //!< Destination rectangle + const amd::Coord3D& sizeIn, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies a buffer object to system memory - virtual bool readBuffer( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destination host memory - const amd::Coord3D& origin, //!< Source origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies a buffer object to system memory + virtual bool readBuffer(device::Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destination host memory + const amd::Coord3D& origin, //!< Source origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! 
Copies a buffer object to system memory - virtual bool readBufferRect( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destinaiton host memory - const amd::BufferRect& bufRect, //!< Source rectangle - const amd::BufferRect& hostRect, //!< Destination rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies a buffer object to system memory + virtual bool readBufferRect(device::Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destinaiton host memory + const amd::BufferRect& bufRect, //!< Source rectangle + const amd::BufferRect& hostRect, //!< Destination rectangle + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies system memory to a buffer object - virtual bool writeBuffer( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies system memory to a buffer object + virtual bool writeBuffer(const void* srcHost, //!< Source host memory + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies system memory to a buffer object - virtual bool writeBufferRect( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::BufferRect& hostRect, //!< Destination rectangle - const amd::BufferRect& bufRect, //!< Source rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! 
Copies system memory to a buffer object + virtual bool writeBufferRect(const void* srcHost, //!< Source host memory + device::Memory& dstMemory, //!< Destination memory object + const amd::BufferRect& hostRect, //!< Destination rectangle + const amd::BufferRect& bufRect, //!< Source rectangle + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies a buffer object to an image object - virtual bool copyBuffer( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies a buffer object to an image object + virtual bool copyBuffer(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies a buffer object to an image object - virtual bool copyBufferToImage( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const; + //! 
Copies a buffer object to an image object + virtual bool copyBufferToImage(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const; - //! Copies an image object to a buffer object - virtual bool copyImageToBuffer( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const; + //! Copies an image object to a buffer object + virtual bool copyImageToBuffer(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const; - //! Copies an image object to another image object - virtual bool copyImage( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! 
Copies an image object to another image object + virtual bool copyImage(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies an image object to system memory - virtual bool readImage( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destination host memory - const amd::Coord3D& origin, //!< Source origin - const amd::Coord3D& size, //!< Size of the copy region - size_t rowPitch, //!< Row pitch for host memory - size_t slicePitch, //!< Slice pitch for host memory - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies an image object to system memory + virtual bool readImage(device::Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destination host memory + const amd::Coord3D& origin, //!< Source origin + const amd::Coord3D& size, //!< Size of the copy region + size_t rowPitch, //!< Row pitch for host memory + size_t slicePitch, //!< Slice pitch for host memory + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies system memory to an image object - virtual bool writeImage( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - size_t rowPitch, //!< Row pitch for host memory - size_t slicePitch, //!< Slice pitch for host memory - bool entire = false //!< Entire buffer will be updated - ) const; + //! 
Copies system memory to an image object + virtual bool writeImage(const void* srcHost, //!< Source host memory + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + size_t rowPitch, //!< Row pitch for host memory + size_t slicePitch, //!< Slice pitch for host memory + bool entire = false //!< Entire buffer will be updated + ) const; - //! Fills a buffer memory with a pattern data - virtual bool fillBuffer( - device::Memory& memory, //!< Memory object to fill with pattern - const void* pattern, //!< Pattern data - size_t patternSize, //!< Pattern size - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Fills a buffer memory with a pattern data + virtual bool fillBuffer(device::Memory& memory, //!< Memory object to fill with pattern + const void* pattern, //!< Pattern data + size_t patternSize, //!< Pattern size + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Fills an image memory with a pattern data - virtual bool fillImage( - device::Memory& dstMemory, //!< Memory object to fill with pattern - const void* pattern, //!< Pattern data - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Fills an image memory with a pattern data + virtual bool fillImage(device::Memory& dstMemory, //!< Memory object to fill with pattern + const void* pattern, //!< Pattern data + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! 
Runs a GPU scheduler for device enqueue - bool runScheduler( - device::Memory& vqueue, //!< Memory object for virtual queue - device::Memory& params, //!< Extra arguments for the scheduler - uint paramIdx, //!< Parameter index - uint threads //!< Number of scheduling threads - ) const; + //! Runs a GPU scheduler for device enqueue + bool runScheduler(device::Memory& vqueue, //!< Memory object for virtual queue + device::Memory& params, //!< Extra arguments for the scheduler + uint paramIdx, //!< Parameter index + uint threads //!< Number of scheduling threads + ) const; - //! Writes CPU raw data into GPU memory - void writeRawData( - device::Memory& memory, //!< Memory object for data udpate - size_t size, //!< Size of raw data - const void* data //!< Raw data pointer - ) const; + //! Writes CPU raw data into GPU memory + void writeRawData(device::Memory& memory, //!< Memory object for data udpate + size_t size, //!< Size of raw data + const void* data //!< Raw data pointer + ) const; -private: - static const size_t MaxXferBuffers = 2; - static const uint TransferSplitSize = 3; + private: + static const size_t MaxXferBuffers = 2; + static const uint TransferSplitSize = 3; - //! Copies a buffer object to an image object - bool copyBufferToImageKernel( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const; + //! 
Copies a buffer object to an image object + bool copyBufferToImageKernel(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const; - //! Copies an image object to a buffer object - bool copyImageToBufferKernel( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const; + //! Copies an image object to a buffer object + bool copyImageToBufferKernel(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const; - //! Creates a program for all blit operations - bool createProgram( - Device& device //!< Device object - ); + //! Creates a program for all blit operations + bool createProgram(Device& device //!< Device object + ); - //! Creates a view memory object - Memory* createView( - const Memory& parent, //!< Parent memory object - const cl_image_format format //!< The new format for a view - ) const; + //! 
Creates a view memory object + Memory* createView(const Memory& parent, //!< Parent memory object + const cl_image_format format //!< The new format for a view + ) const; - //! Disable copy constructor - KernelBlitManager(const KernelBlitManager&); + //! Disable copy constructor + KernelBlitManager(const KernelBlitManager&); - //! Disable operator= - KernelBlitManager& operator=(const KernelBlitManager&); + //! Disable operator= + KernelBlitManager& operator=(const KernelBlitManager&); - amd::Program* program_; //!< GPU program obejct - amd::Kernel* kernels_[BlitTotal]; //!< GPU kernels for blit - amd::Memory* constantBuffer_; //!< An internal CB for blits - amd::Memory* xferBuffers_[MaxXferBuffers]; //!< Transfer buffers for images - size_t xferBufferSize_; //!< Transfer buffer size - amd::Monitor* lockXferOps_; //!< Lock transfer operation + amd::Program* program_; //!< GPU program obejct + amd::Kernel* kernels_[BlitTotal]; //!< GPU kernels for blit + amd::Memory* constantBuffer_; //!< An internal CB for blits + amd::Memory* xferBuffers_[MaxXferBuffers]; //!< Transfer buffers for images + size_t xferBufferSize_; //!< Transfer buffer size + amd::Monitor* lockXferOps_; //!< Lock transfer operation }; static const char* BlitName[KernelBlitManager::BlitTotal] = { - "copyImage", - "copyImage1DA", - "copyImageToBuffer", - "copyBufferToImage", - "copyBufferRect", - "copyBufferRectAligned", - "copyBuffer", - "copyBufferAligned", - "fillBuffer", - "fillImage", - "scheduler", - }; + "copyImage", "copyImage1DA", "copyImageToBuffer", + "copyBufferToImage", "copyBufferRect", "copyBufferRectAligned", + "copyBuffer", "copyBufferAligned", "fillBuffer", + "fillImage", "scheduler", +}; /*@}*/} // namespace pal - diff --git a/rocclr/runtime/device/pal/palcompiler.cpp b/rocclr/runtime/device/pal/palcompiler.cpp index dc96144654..ef19c2dfea 100644 --- a/rocclr/runtime/device/pal/palcompiler.cpp +++ b/rocclr/runtime/device/pal/palcompiler.cpp @@ -17,402 +17,384 @@ #include 
"driver/AmdCompiler.h" #include "opencl1.2-c.amdgcn.inc" #include "opencl2.0-c.amdgcn.inc" -#endif // !defined(WITH_LIGHTNING_COMPILER) +#endif // !defined(WITH_LIGHTNING_COMPILER) #include #if defined(ATI_OS_LINUX) #include #include -#endif // defined(ATI_OS_LINUX) +#endif // defined(ATI_OS_LINUX) #if defined(ATI_OS_WIN) #include -#endif // defined(ATI_OS_WIN) +#endif // defined(ATI_OS_WIN) -//CLC_IN_PROCESS_CHANGE +// CLC_IN_PROCESS_CHANGE extern int openclFrontEnd(const char* cmdline, std::string*, std::string* typeInfo = nullptr); namespace pal { -bool -HSAILProgram::compileImpl( - const std::string& sourceCode, - const std::vector& headers, - const char** headerIncludeNames, - amd::option::Options* options) -{ +bool HSAILProgram::compileImpl(const std::string& sourceCode, + const std::vector& headers, + const char** headerIncludeNames, amd::option::Options* options) { #if defined(WITH_LIGHTNING_COMPILER) - assert(!"Should not reach here"); -#else // !defined(WITH_LIGHTNING_COMPILER) - acl_error errorCode; - aclTargetInfo target; + assert(!"Should not reach here"); +#else // !defined(WITH_LIGHTNING_COMPILER) + acl_error errorCode; + aclTargetInfo target; - std::string arch = "hsail"; - if (dev().settings().use64BitPtr_) { - arch += "64"; + std::string arch = "hsail"; + if (dev().settings().use64BitPtr_) { + arch += "64"; + } + target = aclGetTargetInfo(arch.c_str(), dev().hwInfo()->targetName_, &errorCode); + + // end if asic info is ready + // We dump the source code for each program (param: headers) + // into their filenames (headerIncludeNames) into the TEMP + // folder specific to the OS and add the include path while + // compiling + + // Find the temp folder for the OS + std::string tempFolder = amd::Os::getTempPath(); + std::string tempFileName = amd::Os::getTempFileName(); + + // Iterate through each source code and dump it into tmp + std::fstream f; + std::vector headerFileNames(headers.size()); + std::vector newDirs; + for (size_t i = 0; i < 
headers.size(); ++i) { + std::string headerPath = tempFolder; + std::string headerIncludeName(headerIncludeNames[i]); + // replace / in path with current os's file separator + if (amd::Os::fileSeparator() != '/') { + for (std::string::iterator it = headerIncludeName.begin(), end = headerIncludeName.end(); + it != end; ++it) { + if (*it == '/') *it = amd::Os::fileSeparator(); + } } - target = aclGetTargetInfo(arch.c_str(), - dev().hwInfo()->targetName_, &errorCode); - - // end if asic info is ready - // We dump the source code for each program (param: headers) - // into their filenames (headerIncludeNames) into the TEMP - // folder specific to the OS and add the include path while - // compiling - - // Find the temp folder for the OS - std::string tempFolder = amd::Os::getTempPath(); - std::string tempFileName = amd::Os::getTempFileName(); - - // Iterate through each source code and dump it into tmp - std::fstream f; - std::vector headerFileNames(headers.size()); - std::vector newDirs; - for (size_t i = 0; i < headers.size(); ++i) { - std::string headerPath = tempFolder; - std::string headerIncludeName(headerIncludeNames[i]); - // replace / in path with current os's file separator - if (amd::Os::fileSeparator() != '/') { - for (std::string::iterator it = headerIncludeName.begin(), - end = headerIncludeName.end(); it != end; ++it) { - if (*it == '/') *it = amd::Os::fileSeparator(); - } - } - size_t pos = headerIncludeName.rfind(amd::Os::fileSeparator()); - if (pos != std::string::npos) { - headerPath += amd::Os::fileSeparator(); - headerPath += headerIncludeName.substr(0, pos); - headerIncludeName = headerIncludeName.substr(pos+1); - } - if (!amd::Os::pathExists(headerPath)) { - bool ret = amd::Os::createPath(headerPath); - assert(ret && "failed creating path!"); - newDirs.push_back(headerPath); - } - std::string headerFullName = - headerPath + amd::Os::fileSeparator() + headerIncludeName; - headerFileNames[i] = headerFullName; - f.open(headerFullName.c_str(), 
std::fstream::out); - // Should we allow asserts - assert(!f.fail() && "failed creating header file!"); - f.write(headers[i]->c_str(), headers[i]->length()); - f.close(); + size_t pos = headerIncludeName.rfind(amd::Os::fileSeparator()); + if (pos != std::string::npos) { + headerPath += amd::Os::fileSeparator(); + headerPath += headerIncludeName.substr(0, pos); + headerIncludeName = headerIncludeName.substr(pos + 1); } - - // Create Binary - binaryElf_ = aclBinaryInit(sizeof(aclBinary), - &target, &binOpts_, &errorCode); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error: aclBinary init failure\n"; - LogWarning("aclBinaryInit failed"); - return false; + if (!amd::Os::pathExists(headerPath)) { + bool ret = amd::Os::createPath(headerPath); + assert(ret && "failed creating path!"); + newDirs.push_back(headerPath); } + std::string headerFullName = headerPath + amd::Os::fileSeparator() + headerIncludeName; + headerFileNames[i] = headerFullName; + f.open(headerFullName.c_str(), std::fstream::out); + // Should we allow asserts + assert(!f.fail() && "failed creating header file!"); + f.write(headers[i]->c_str(), headers[i]->length()); + f.close(); + } - // Insert opencl into binary - errorCode = aclInsertSection(dev().compiler(), binaryElf_, - sourceCode.c_str(), strlen(sourceCode.c_str()), aclSOURCE); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error: Inserting openCl Source to binary\n"; - } + // Create Binary + binaryElf_ = aclBinaryInit(sizeof(aclBinary), &target, &binOpts_, &errorCode); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error: aclBinary init failure\n"; + LogWarning("aclBinaryInit failed"); + return false; + } - // Set the options for the compiler - // Set the include path for the temp folder that contains the includes - if (!headers.empty()) { - compileOptions_.append(" -I"); - compileOptions_.append(tempFolder); - } + // Insert opencl into binary + errorCode = aclInsertSection(dev().compiler(), binaryElf_, sourceCode.c_str(), + 
strlen(sourceCode.c_str()), aclSOURCE); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error: Inserting openCl Source to binary\n"; + } + + // Set the options for the compiler + // Set the include path for the temp folder that contains the includes + if (!headers.empty()) { + compileOptions_.append(" -I"); + compileOptions_.append(tempFolder); + } #if !defined(_LP64) && defined(ATI_OS_LINUX) - if (options->origOptionStr.find("-cl-std=CL2.0") != std::string::npos && !dev().settings().force32BitOcl20_) { - errorCode = ACL_UNSUPPORTED; - LogWarning("aclCompile failed"); - return false; - } + if (options->origOptionStr.find("-cl-std=CL2.0") != std::string::npos && + !dev().settings().force32BitOcl20_) { + errorCode = ACL_UNSUPPORTED; + LogWarning("aclCompile failed"); + return false; + } #endif - // Compile source to IR - compileOptions_.append(hsailOptions(options)); - errorCode = aclCompile(dev().compiler(), binaryElf_, compileOptions_.c_str(), - ACL_TYPE_OPENCL, ACL_TYPE_LLVMIR_BINARY, nullptr); - buildLog_ += aclGetCompilerLog(dev().compiler()); - if (errorCode != ACL_SUCCESS) { - LogWarning("aclCompile failed"); - buildLog_ += "Error: Compiling CL to IR\n"; - return false; - } + // Compile source to IR + compileOptions_.append(hsailOptions(options)); + errorCode = aclCompile(dev().compiler(), binaryElf_, compileOptions_.c_str(), ACL_TYPE_OPENCL, + ACL_TYPE_LLVMIR_BINARY, nullptr); + buildLog_ += aclGetCompilerLog(dev().compiler()); + if (errorCode != ACL_SUCCESS) { + LogWarning("aclCompile failed"); + buildLog_ += "Error: Compiling CL to IR\n"; + return false; + } - clBinary()->storeCompileOptions(compileOptions_); + clBinary()->storeCompileOptions(compileOptions_); - // Save the binary in the interface class - saveBinaryAndSetType(TYPE_COMPILED); -#endif // !defined(WITH_LIGHTNING_COMPILER) - return true; + // Save the binary in the interface class + saveBinaryAndSetType(TYPE_COMPILED); +#endif // !defined(WITH_LIGHTNING_COMPILER) + return true; } #if 
defined(WITH_LIGHTNING_COMPILER) static std::string llvmBin_(amd::Os::getEnvironment("LLVM_BIN")); #if defined(ATI_OS_WIN) -static BOOL CALLBACK -checkLLVM_BIN(PINIT_ONCE InitOnce, PVOID Parameter, PVOID *lpContex) -{ - if (llvmBin_.empty()) { - HMODULE hm = NULL; - if (GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS - | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, - (LPCSTR) &amd::Device::init, &hm)) { - char path[1024]; - GetModuleFileNameA(hm, path, sizeof(path)); - llvmBin_ = path; - size_t pos = llvmBin_.rfind('\\'); - if (pos != std::string::npos) { - llvmBin_.resize(pos); - } - } +static BOOL CALLBACK checkLLVM_BIN(PINIT_ONCE InitOnce, PVOID Parameter, PVOID* lpContex) { + if (llvmBin_.empty()) { + HMODULE hm = NULL; + if (GetModuleHandleExA( + GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, + (LPCSTR)&amd::Device::init, &hm)) { + char path[1024]; + GetModuleFileNameA(hm, path, sizeof(path)); + llvmBin_ = path; + size_t pos = llvmBin_.rfind('\\'); + if (pos != std::string::npos) { + llvmBin_.resize(pos); + } } - return TRUE; + } + return TRUE; } -#endif // defined (ATI_OS_WINDOWS) +#endif // defined (ATI_OS_WINDOWS) #if defined(ATI_OS_LINUX) static pthread_once_t once = PTHREAD_ONCE_INIT; -static void -checkLLVM_BIN() -{ - if (llvmBin_.empty()) { - Dl_info info; - if (dladdr((const void*)&amd::Device::init, &info)) { - llvmBin_ = dirname(strdup(info.dli_fname)); - size_t pos = llvmBin_.rfind("lib"); - if (pos != std::string::npos) { - llvmBin_.replace(pos, 3, "bin"); - } - } +static void checkLLVM_BIN() { + if (llvmBin_.empty()) { + Dl_info info; + if (dladdr((const void*)&amd::Device::init, &info)) { + llvmBin_ = dirname(strdup(info.dli_fname)); + size_t pos = llvmBin_.rfind("lib"); + if (pos != std::string::npos) { + llvmBin_.replace(pos, 3, "bin"); + } } + } } -#endif // defined(ATI_OS_LINUX) +#endif // defined(ATI_OS_LINUX) -std::auto_ptr -LightningProgram::newCompilerInstance() -{ +std::auto_ptr 
LightningProgram::newCompilerInstance() { #if defined(ATI_OS_WIN) - static INIT_ONCE initOnce; - InitOnceExecuteOnce(&initOnce, checkLLVM_BIN, NULL, NULL); -#endif // defined(ATI_OS_WIN) + static INIT_ONCE initOnce; + InitOnceExecuteOnce(&initOnce, checkLLVM_BIN, NULL, NULL); +#endif // defined(ATI_OS_WIN) #if defined(ATI_OS_LINUX) - pthread_once(&once, checkLLVM_BIN); -#endif // defined(ATI_OS_LINUX) + pthread_once(&once, checkLLVM_BIN); +#endif // defined(ATI_OS_LINUX) #if defined(DEBUG) - std::string clangExe(llvmBin_ + LINUX_SWITCH("/clang", "\\clang.exe")); - struct stat buf; - if (stat(clangExe.c_str(), &buf)) { - std::string msg("Could not find the Clang binary in " + llvmBin_); - LogWarning(msg.c_str()); - } -#endif // defined(DEBUG) - - return std::auto_ptr( - amd::opencl_driver::CompilerFactory().CreateAMDGPUCompiler(llvmBin_)); + std::string clangExe(llvmBin_ + LINUX_SWITCH("/clang", "\\clang.exe")); + struct stat buf; + if (stat(clangExe.c_str(), &buf)) { + std::string msg("Could not find the Clang binary in " + llvmBin_); + LogWarning(msg.c_str()); + } +#endif // defined(DEBUG) + + return std::auto_ptr( + amd::opencl_driver::CompilerFactory().CreateAMDGPUCompiler(llvmBin_)); } -bool -LightningProgram::compileImpl( - const std::string& sourceCode, - const std::vector& headers, - const char** headerIncludeNames, - amd::option::Options* options) -{ - using namespace amd::opencl_driver; - std::auto_ptr C(newCompilerInstance()); - std::vector inputs; +bool LightningProgram::compileImpl(const std::string& sourceCode, + const std::vector& headers, + const char** headerIncludeNames, amd::option::Options* options) { + using namespace amd::opencl_driver; + std::auto_ptr C(newCompilerInstance()); + std::vector inputs; - Data* input = C->NewBufferReference(DT_CL, - sourceCode.c_str(), sourceCode.length()); - if (input == NULL) { - buildLog_ += "Error while creating data from source code"; - return false; - } + Data* input = C->NewBufferReference(DT_CL, 
sourceCode.c_str(), sourceCode.length()); + if (input == NULL) { + buildLog_ += "Error while creating data from source code"; + return false; + } - inputs.push_back(input); + inputs.push_back(input); - amd::opencl_driver::Buffer* output = C->NewBuffer(DT_LLVM_BC); - if (output == NULL) { - buildLog_ += "Error while creating buffer for the LLVM bitcode"; - return false; - } + amd::opencl_driver::Buffer* output = C->NewBuffer(DT_LLVM_BC); + if (output == NULL) { + buildLog_ += "Error while creating buffer for the LLVM bitcode"; + return false; + } - //Set the options for the compiler - std::ostringstream ostrstr; - std::copy(options->clangOptions.begin(), options->clangOptions.end(), - std::ostream_iterator(ostrstr, " ")); + // Set the options for the compiler + std::ostringstream ostrstr; + std::copy(options->clangOptions.begin(), options->clangOptions.end(), + std::ostream_iterator(ostrstr, " ")); - ostrstr << " -m" << sizeof(void*) * 8; - std::string driverOptions(ostrstr.str()); + ostrstr << " -m" << sizeof(void*) * 8; + std::string driverOptions(ostrstr.str()); - const char* xLang = options->oVariables->XLang; - if (xLang != NULL && strcmp(xLang, "cl")) { - buildLog_ += "Unsupported OpenCL language.\n"; - } + const char* xLang = options->oVariables->XLang; + if (xLang != NULL && strcmp(xLang, "cl")) { + buildLog_ += "Unsupported OpenCL language.\n"; + } - //FIXME_Nikolay: the program manager should be setting the language - //driverOptions.append(" -x cl"); + // FIXME_Nikolay: the program manager should be setting the language + // driverOptions.append(" -x cl"); - driverOptions.append(" -cl-std=").append(options->oVariables->CLStd); + driverOptions.append(" -cl-std=").append(options->oVariables->CLStd); - // Set the -O# - std::ostringstream optLevel; - optLevel << " -O" << options->oVariables->OptLevel; - driverOptions.append(optLevel.str()); + // Set the -O# + std::ostringstream optLevel; + optLevel << " -O" << options->oVariables->OptLevel; + 
driverOptions.append(optLevel.str()); - // Set the machine target - std::ostringstream mCPU; - mCPU << " -mcpu=gfx" << dev().hwInfo()->gfxipVersion_; - driverOptions.append(mCPU.str()); + // Set the machine target + std::ostringstream mCPU; + mCPU << " -mcpu=gfx" << dev().hwInfo()->gfxipVersion_; + driverOptions.append(mCPU.str()); - driverOptions.append(options->llvmOptions); - driverOptions.append(hsailOptions(options)); + driverOptions.append(options->llvmOptions); + driverOptions.append(hsailOptions(options)); - // Set whole program mode - driverOptions.append(" -mllvm -amdgpu-early-inline-all"); + // Set whole program mode + driverOptions.append(" -mllvm -amdgpu-early-inline-all"); - //Find the temp folder for the OS - std::string tempFolder = amd::Os::getEnvironment("TEMP"); + // Find the temp folder for the OS + std::string tempFolder = amd::Os::getEnvironment("TEMP"); + if (tempFolder.empty()) { + tempFolder = amd::Os::getEnvironment("TMP"); if (tempFolder.empty()) { - tempFolder = amd::Os::getEnvironment("TMP"); - if (tempFolder.empty()) { - tempFolder = WINDOWS_SWITCH(".","/tmp");; - } + tempFolder = WINDOWS_SWITCH(".", "/tmp"); + ; } - //Iterate through each source code and dump it into tmp - std::fstream f; - std::vector headerFileNames(headers.size()); - std::vector newDirs; - for (size_t i = 0; i < headers.size(); ++i) { - std::string headerPath = tempFolder; - std::string headerIncludeName(headerIncludeNames[i]); - // replace / in path with current os's file separator - if ( amd::Os::fileSeparator() != '/') { - for (std::string::iterator it = headerIncludeName.begin(), - end = headerIncludeName.end(); - it != end; - ++it) { - if (*it == '/') *it = amd::Os::fileSeparator(); - } - } - size_t pos = headerIncludeName.rfind(amd::Os::fileSeparator()); - if (pos != std::string::npos) { - headerPath += amd::Os::fileSeparator(); - headerPath += headerIncludeName.substr(0, pos); - headerIncludeName = headerIncludeName.substr(pos+1); - } - if 
(!amd::Os::pathExists(headerPath)) { - bool ret = amd::Os::createPath(headerPath); - assert(ret && "failed creating path!"); - newDirs.push_back(headerPath); - } - std::string headerFullName - = headerPath + amd::Os::fileSeparator() + headerIncludeName; - headerFileNames[i] = headerFullName; - f.open(headerFullName.c_str(), std::fstream::out); - //Should we allow asserts - assert(!f.fail() && "failed creating header file!"); - f.write(headers[i]->c_str(), headers[i]->length()); - f.close(); - - Data* inc = C->NewFileReference(DT_CL_HEADER, headerFileNames[i]); - if (inc == NULL) { - buildLog_ += "Error while creating data from headers"; - return false; - } - inputs.push_back(inc); + } + // Iterate through each source code and dump it into tmp + std::fstream f; + std::vector headerFileNames(headers.size()); + std::vector newDirs; + for (size_t i = 0; i < headers.size(); ++i) { + std::string headerPath = tempFolder; + std::string headerIncludeName(headerIncludeNames[i]); + // replace / in path with current os's file separator + if (amd::Os::fileSeparator() != '/') { + for (std::string::iterator it = headerIncludeName.begin(), end = headerIncludeName.end(); + it != end; ++it) { + if (*it == '/') *it = amd::Os::fileSeparator(); + } } - - //Set the include path for the temp folder that contains the includes - if(!headers.empty()) { - driverOptions.append(" -I"); - driverOptions.append(tempFolder); + size_t pos = headerIncludeName.rfind(amd::Os::fileSeparator()); + if (pos != std::string::npos) { + headerPath += amd::Os::fileSeparator(); + headerPath += headerIncludeName.substr(0, pos); + headerIncludeName = headerIncludeName.substr(pos + 1); } - - if (options->isDumpFlagSet(amd::option::DUMP_CL)) { - std::ofstream f(options->getDumpFileName(".cl").c_str(), std::ios::trunc); - if(f.is_open()) { - f << "/* Compiler options:\n" \ - "-c -emit-llvm -target amdgcn-amd-amdhsa-opencl -x cl " - << driverOptions << " -include opencl-c.h " - << "\n*/\n\n" << sourceCode; - 
f.close(); - } else { - buildLog_ += - "Warning: opening the file to dump the OpenCL source failed.\n"; - } + if (!amd::Os::pathExists(headerPath)) { + bool ret = amd::Os::createPath(headerPath); + assert(ret && "failed creating path!"); + newDirs.push_back(headerPath); } + std::string headerFullName = headerPath + amd::Os::fileSeparator() + headerIncludeName; + headerFileNames[i] = headerFullName; + f.open(headerFullName.c_str(), std::fstream::out); + // Should we allow asserts + assert(!f.fail() && "failed creating header file!"); + f.write(headers[i]->c_str(), headers[i]->length()); + f.close(); - //FIXME_lmoriche: has the CL option been validated? - uint clcStd = (options->oVariables->CLStd[2] - '0') * 100 - + (options->oVariables->CLStd[4] - '0') * 10; + Data* inc = C->NewFileReference(DT_CL_HEADER, headerFileNames[i]); + if (inc == NULL) { + buildLog_ += "Error while creating data from headers"; + return false; + } + inputs.push_back(inc); + } - std::pair hdr; - switch(clcStd) { - case 100: case 110: case 120: - hdr = std::make_pair(opencl1_2_c_amdgcn, opencl1_2_c_amdgcn_size); - break; + // Set the include path for the temp folder that contains the includes + if (!headers.empty()) { + driverOptions.append(" -I"); + driverOptions.append(tempFolder); + } + + if (options->isDumpFlagSet(amd::option::DUMP_CL)) { + std::ofstream f(options->getDumpFileName(".cl").c_str(), std::ios::trunc); + if (f.is_open()) { + f << "/* Compiler options:\n" + "-c -emit-llvm -target amdgcn-amd-amdhsa-opencl -x cl " + << driverOptions << " -include opencl-c.h " + << "\n*/\n\n" + << sourceCode; + f.close(); + } else { + buildLog_ += "Warning: opening the file to dump the OpenCL source failed.\n"; + } + } + + // FIXME_lmoriche: has the CL option been validated? 
+ uint clcStd = + (options->oVariables->CLStd[2] - '0') * 100 + (options->oVariables->CLStd[4] - '0') * 10; + + std::pair hdr; + switch (clcStd) { + case 100: + case 110: + case 120: + hdr = std::make_pair(opencl1_2_c_amdgcn, opencl1_2_c_amdgcn_size); + break; case 200: - hdr = std::make_pair(opencl2_0_c_amdgcn, opencl2_0_c_amdgcn_size); - break; + hdr = std::make_pair(opencl2_0_c_amdgcn, opencl2_0_c_amdgcn_size); + break; default: - buildLog_ += "Unsupported requested OpenCL C version (-cl-std).\n"; - return false; + buildLog_ += "Unsupported requested OpenCL C version (-cl-std).\n"; + return false; + } + + File* pch = C->NewTempFile(DT_CL_HEADER); + if (pch == NULL || !pch->WriteData((const char*)hdr.first, hdr.second)) { + buildLog_ += "Error while opening the opencl-c header "; + return false; + } + + driverOptions.append(" -include-pch " + pch->Name()); + driverOptions.append(" -Xclang -fno-validate-pch"); + + // Tokenize the options string into a vector of strings + std::istringstream istrstr(driverOptions); + std::istream_iterator sit(istrstr), end; + std::vector params(sit, end); + + // Compile source to IR + bool ret = + dev().cacheCompilation()->compileToLLVMBitcode(C.get(), inputs, output, params, buildLog_); + buildLog_ += C->Output(); + if (!ret) { + buildLog_ += "Error: Failed to compile opencl source (from CL to LLVM IR).\n"; + return false; + } + + llvmBinary_.assign(output->Buf().data(), output->Size()); + elfSectionType_ = amd::OclElf::LLVMIR; + + if (options->isDumpFlagSet(amd::option::DUMP_BC_ORIGINAL)) { + std::ofstream f(options->getDumpFileName("_original.bc").c_str(), + std::ios::binary | std::ios::trunc); + if (f.is_open()) { + f.write(llvmBinary_.data(), llvmBinary_.size()); + f.close(); + } else { + buildLog_ += "Warning: opening the file to dump the compiled IR failed.\n"; } + } - File* pch = C->NewTempFile(DT_CL_HEADER); - if (pch == NULL || !pch->WriteData((const char*) hdr.first, hdr.second)) { - buildLog_ += "Error while opening the 
opencl-c header "; - return false; - } - - driverOptions.append(" -include-pch " + pch->Name()); - driverOptions.append(" -Xclang -fno-validate-pch"); - - // Tokenize the options string into a vector of strings - std::istringstream istrstr(driverOptions); - std::istream_iterator sit(istrstr), end; - std::vector params(sit, end); - - // Compile source to IR - bool ret = dev().cacheCompilation()->compileToLLVMBitcode(C.get(), inputs, output, params, buildLog_); - buildLog_ += C->Output(); - if (!ret) { - buildLog_ += "Error: Failed to compile opencl source (from CL to LLVM IR).\n"; - return false; - } - - llvmBinary_.assign(output->Buf().data(), output->Size()); - elfSectionType_ = amd::OclElf::LLVMIR; - - if (options->isDumpFlagSet(amd::option::DUMP_BC_ORIGINAL)) { - std::ofstream f(options->getDumpFileName("_original.bc").c_str(), - std::ios::binary | std::ios::trunc); - if(f.is_open()) { - f.write(llvmBinary_.data(), llvmBinary_.size()); - f.close(); - } else { - buildLog_ += - "Warning: opening the file to dump the compiled IR failed.\n"; - } - } - - if (clBinary()->saveSOURCE()) { - clBinary()->elfOut()->addSection( - amd::OclElf::SOURCE, sourceCode.data(), sourceCode.size()); - } - if (clBinary()->saveLLVMIR()) { - clBinary()->elfOut()->addSection( - amd::OclElf::LLVMIR, llvmBinary_.data(), llvmBinary_.size(), false); - // store the original compile options - clBinary()->storeCompileOptions(compileOptions_); - } - return true; + if (clBinary()->saveSOURCE()) { + clBinary()->elfOut()->addSection(amd::OclElf::SOURCE, sourceCode.data(), sourceCode.size()); + } + if (clBinary()->saveLLVMIR()) { + clBinary()->elfOut()->addSection(amd::OclElf::LLVMIR, llvmBinary_.data(), llvmBinary_.size(), + false); + // store the original compile options + clBinary()->storeCompileOptions(compileOptions_); + } + return true; } -#endif // defined(WITH_LIGHTNING_COMPILER) +#endif // defined(WITH_LIGHTNING_COMPILER) -} // namespace pal +} // namespace pal diff --git 
a/rocclr/runtime/device/pal/palconstbuf.cpp b/rocclr/runtime/device/pal/palconstbuf.cpp index 6e3ed49c10..f23a860096 100644 --- a/rocclr/runtime/device/pal/palconstbuf.cpp +++ b/rocclr/runtime/device/pal/palconstbuf.cpp @@ -9,81 +9,74 @@ namespace pal { -ConstBuffer::ConstBuffer( - VirtualGPU& gpu, - size_t size) - : Memory(const_cast(gpu.dev()), size * VectorSize) - , gpu_(gpu) - , size_(size * VectorSize) - , wrtOffset_(0) - , lastWrtSize_(0) - , wrtAddress_(nullptr) -{ +ConstBuffer::ConstBuffer(VirtualGPU& gpu, size_t size) + : Memory(const_cast(gpu.dev()), size * VectorSize), + gpu_(gpu), + size_(size * VectorSize), + wrtOffset_(0), + lastWrtSize_(0), + wrtAddress_(nullptr) {} + +ConstBuffer::~ConstBuffer() { + if (wrtAddress_ != nullptr) { + unmap(&gpu_); + } + + amd::AlignedMemory::deallocate(sysMemCopy_); } -ConstBuffer::~ConstBuffer() -{ +bool ConstBuffer::create() { + // Create sysmem copy for the constant buffer + sysMemCopy_ = reinterpret_cast
(amd::AlignedMemory::allocate(size_, 256)); + if (sysMemCopy_ == nullptr) { + LogPrintfError( + "We couldn't allocate sysmem copy for constant buffer,\ + size(%d)!", + size_); + return false; + } + memset(sysMemCopy_, 0, size_); + + if (!Memory::create(Resource::RemoteUSWC)) { + LogPrintfError("We couldn't create HW constant buffer, size(%d)!", size_); + return false; + } + + // Constant buffer warm-up + warmUpRenames(gpu_); + + wrtAddress_ = map(&gpu_, Resource::Discard); + if (wrtAddress_ == nullptr) { + LogPrintfError("We couldn't map HW constant buffer, size(%d)!", size_); + return false; + } + + return true; +} + +bool ConstBuffer::uploadDataToHw(size_t size) { + static const size_t HwCbAlignment = 256; + + // Align copy size on the vector's boundary + size_t count = amd::alignUp(size, VectorSize); + wrtOffset_ += lastWrtSize_; + + // Check if CB has enough space for copy + if ((wrtOffset_ + count) > size_) { if (wrtAddress_ != nullptr) { - unmap(&gpu_); + unmap(&gpu_); } - - amd::AlignedMemory::deallocate(sysMemCopy_); -} - -bool -ConstBuffer::create() -{ - // Create sysmem copy for the constant buffer - sysMemCopy_ = reinterpret_cast
(amd::AlignedMemory::allocate(size_, 256)); - if (sysMemCopy_ == nullptr) { - LogPrintfError("We couldn't allocate sysmem copy for constant buffer,\ - size(%d)!", size_); - return false; - } - memset(sysMemCopy_, 0, size_); - - if (!Memory::create(Resource::RemoteUSWC)) { - LogPrintfError("We couldn't create HW constant buffer, size(%d)!", size_); - return false; - } - - // Constant buffer warm-up - warmUpRenames(gpu_); - wrtAddress_ = map(&gpu_, Resource::Discard); - if (wrtAddress_ == nullptr) { - LogPrintfError("We couldn't map HW constant buffer, size(%d)!", size_); - return false; - } + wrtOffset_ = 0; + lastWrtSize_ = 0; + } - return true; + // Update memory with new CB data + memcpy((reinterpret_cast(wrtAddress_) + wrtOffset_), sysMemCopy_, count); + + // Adjust the size by the HW CB buffer alignment + lastWrtSize_ = amd::alignUp(size, HwCbAlignment); + return true; } -bool -ConstBuffer::uploadDataToHw(size_t size) -{ - static const size_t HwCbAlignment = 256; - - // Align copy size on the vector's boundary - size_t count = amd::alignUp(size, VectorSize); - wrtOffset_ += lastWrtSize_; - - // Check if CB has enough space for copy - if ((wrtOffset_ + count) > size_) { - if (wrtAddress_ != nullptr) { - unmap(&gpu_); - } - wrtAddress_ = map(&gpu_, Resource::Discard); - wrtOffset_ = 0; - lastWrtSize_ = 0; - } - - // Update memory with new CB data - memcpy((reinterpret_cast(wrtAddress_) + wrtOffset_), sysMemCopy_, count); - - // Adjust the size by the HW CB buffer alignment - lastWrtSize_ = amd::alignUp(size, HwCbAlignment); - return true; -} - -} // namespace pal +} // namespace pal diff --git a/rocclr/runtime/device/pal/palconstbuf.hpp b/rocclr/runtime/device/pal/palconstbuf.hpp index ff66c1e176..9847e719d4 100644 --- a/rocclr/runtime/device/pal/palconstbuf.hpp +++ b/rocclr/runtime/device/pal/palconstbuf.hpp @@ -9,58 +9,54 @@ namespace pal { //! Cconstant buffer -class ConstBuffer : public Memory -{ -public: - //! 
Vector size of the constant buffer - static const size_t VectorSize = 16; +class ConstBuffer : public Memory { + public: + //! Vector size of the constant buffer + static const size_t VectorSize = 16; - //! Constructor for the ConstBuffer class - ConstBuffer( - VirtualGPU& gpu, //!< Virtual GPU device object - size_t size //!< size of the constant buffer in vectors - ); + //! Constructor for the ConstBuffer class + ConstBuffer(VirtualGPU& gpu, //!< Virtual GPU device object + size_t size //!< size of the constant buffer in vectors + ); - //! Destructor for the ConstBuffer class - ~ConstBuffer(); + //! Destructor for the ConstBuffer class + ~ConstBuffer(); - //! Creates the real HW constant buffer - bool create(); + //! Creates the real HW constant buffer + bool create(); - /*! \brief Uploads current constant buffer data from sysMemCopy_ to HW - * - * \return True if the data upload was succesful - */ - bool uploadDataToHw( - size_t size //!< real data size for upload - ); + /*! \brief Uploads current constant buffer data from sysMemCopy_ to HW + * + * \return True if the data upload was succesful + */ + bool uploadDataToHw(size_t size //!< real data size for upload + ); - //! Returns a pointer to the system memory copy for CB - address sysMemCopy() const { return sysMemCopy_; } + //! Returns a pointer to the system memory copy for CB + address sysMemCopy() const { return sysMemCopy_; } - //! Returns CB size - size_t size() const { return size_; } + //! Returns CB size + size_t size() const { return size_; } - //! Returns current write offset for the constant buffer - size_t wrtOffset() const { return wrtOffset_; } + //! Returns current write offset for the constant buffer + size_t wrtOffset() const { return wrtOffset_; } - //! Returns last write size for the constant buffer - size_t lastWrtSize() const { return lastWrtSize_; } + //! Returns last write size for the constant buffer + size_t lastWrtSize() const { return lastWrtSize_; } -private: - //! 
Disable copy constructor - ConstBuffer(const ConstBuffer&); + private: + //! Disable copy constructor + ConstBuffer(const ConstBuffer&); - //! Disable operator= - ConstBuffer& operator=(const ConstBuffer&); + //! Disable operator= + ConstBuffer& operator=(const ConstBuffer&); - VirtualGPU& gpu_; //!< Virtual GPU object - address sysMemCopy_; //!< System memory copy - size_t size_; //!< Constant buffer size - size_t wrtOffset_; //!< Current write offset - size_t lastWrtSize_; //!< Last write size - void* wrtAddress_; //!< Write address in CB + VirtualGPU& gpu_; //!< Virtual GPU object + address sysMemCopy_; //!< System memory copy + size_t size_; //!< Constant buffer size + size_t wrtOffset_; //!< Current write offset + size_t lastWrtSize_; //!< Last write size + void* wrtAddress_; //!< Write address in CB }; /*@}*/} // namespace pal - diff --git a/rocclr/runtime/device/pal/palcounters.cpp b/rocclr/runtime/device/pal/palcounters.cpp index aeef06aa01..3acacc3411 100644 --- a/rocclr/runtime/device/pal/palcounters.cpp +++ b/rocclr/runtime/device/pal/palcounters.cpp @@ -8,602 +8,583 @@ namespace pal { -PalCounterReference* -PalCounterReference::Create(VirtualGPU& gpu) -{ - Pal::Result result; +PalCounterReference* PalCounterReference::Create(VirtualGPU& gpu) { + Pal::Result result; - // Create performance experiment - Pal::PerfExperimentCreateInfo createInfo = {}; + // Create performance experiment + Pal::PerfExperimentCreateInfo createInfo = {}; - createInfo.optionFlags.sampleInternalOperations = 1; - createInfo.optionFlags.cacheFlushOnCounterCollection = 1; - createInfo.optionFlags.sqShaderMask = 1; - createInfo.optionValues.sampleInternalOperations = true; - createInfo.optionValues.cacheFlushOnCounterCollection = true; - createInfo.optionValues.sqShaderMask = Pal::PerfShaderMaskCs; + createInfo.optionFlags.sampleInternalOperations = 1; + createInfo.optionFlags.cacheFlushOnCounterCollection = 1; + createInfo.optionFlags.sqShaderMask = 1; + 
createInfo.optionValues.sampleInternalOperations = true; + createInfo.optionValues.cacheFlushOnCounterCollection = true; + createInfo.optionValues.sqShaderMask = Pal::PerfShaderMaskCs; - size_t palExperSize = gpu.dev().iDev()->GetPerfExperimentSize( - createInfo, &result); + size_t palExperSize = gpu.dev().iDev()->GetPerfExperimentSize(createInfo, &result); + if (result != Pal::Result::Success) { + return nullptr; + } + + PalCounterReference* memRef = new (palExperSize) PalCounterReference(gpu); + if (memRef != nullptr) { + result = gpu.dev().iDev()->CreatePerfExperiment(createInfo, &memRef[1], &memRef->perfExp_); if (result != Pal::Result::Success) { - return nullptr; + memRef->release(); + return nullptr; } + } - PalCounterReference* memRef = new (palExperSize) PalCounterReference(gpu); - if (memRef != nullptr) { - result = gpu.dev().iDev()->CreatePerfExperiment(createInfo, - &memRef[1], &memRef->perfExp_); - if (result != Pal::Result::Success) { - memRef->release(); - return nullptr; - } - } - - return memRef; + return memRef; } -PalCounterReference::~PalCounterReference() -{ - // The counter object is always associated with a particular queue, - // so we have to lock just this queue - amd::ScopedLock lock(gpu_.execution()); +PalCounterReference::~PalCounterReference() { + // The counter object is always associated with a particular queue, + // so we have to lock just this queue + amd::ScopedLock lock(gpu_.execution()); + if (layout_ != nullptr) { + delete layout_; + } + + if (memory_ != nullptr) { + delete memory_; + } + + if (nullptr != iPerf()) { + iPerf()->Destroy(); + } +} + +uint64_t PalCounterReference::result(const std::vector& index) { + if (index.size() == 0) { + // These are counters that have no corresponding PalSample created + return 0; + } + + if (layout_ == nullptr) { + return 0; + } + + uint64_t result = 0; + for (auto const& i : index) { + assert(i <= static_cast(layout_->sampleCount) && "index not in range"); + const Pal::GlobalSampleLayout& 
sample = layout_->samples[i]; + if (sample.dataType == Pal::PerfCounterDataType::Uint32) { + uint32_t beginVal = + *reinterpret_cast(reinterpret_cast(cpuAddr_) + sample.beginValueOffset); + uint32_t endVal = + *reinterpret_cast(reinterpret_cast(cpuAddr_) + sample.endValueOffset); + result += (endVal - beginVal); + } else if (sample.dataType == Pal::PerfCounterDataType::Uint64) { + uint64_t beginVal = + *reinterpret_cast(reinterpret_cast(cpuAddr_) + sample.beginValueOffset); + uint64_t endVal = + *reinterpret_cast(reinterpret_cast(cpuAddr_) + sample.endValueOffset); + result += (endVal - beginVal); + } else { + assert(0 && "dataType should be either Uint32 or Uint64"); + return 0; + } + } + + return result; +} + +bool PalCounterReference::finalize() { + Pal::Result result; + + iPerf()->Finalize(); + + // Acquire GPU memory for the query from the pool and bind it. + Pal::GpuMemoryRequirements gpuMemReqs = {}; + iPerf()->GetGpuMemoryRequirements(&gpuMemReqs); + + memory_ = new Memory(gpu().dev(), amd::alignUp(gpuMemReqs.size, gpuMemReqs.alignment)); + + if (nullptr == memory_) { + return false; + } + + if (!memory_->create(Resource::Remote)) { + return false; + } + + cpuAddr_ = memory_->cpuMap(gpu_); + + if (nullptr == cpuAddr_) { + return false; + } + + gpu_.queue(gpu_.engineID_).addMemRef(memory_->iMem()); + + result = iPerf()->BindGpuMemory(memory_->iMem(), 0); + + if (result == Pal::Result::Success) { + Pal::GlobalCounterLayout layout = {}; + iPerf()->GetGlobalCounterLayout(&layout); + + assert(layout.sampleCount == numExpCounters_); + size_t size = sizeof(Pal::GlobalCounterLayout) + + (sizeof(Pal::GlobalSampleLayout) * (layout.sampleCount - 1)); + layout_ = reinterpret_cast(new char[size]); if (layout_ != nullptr) { - delete layout_; - } - - if (memory_ != nullptr) { - delete memory_; - } - - if (nullptr != iPerf()) { - iPerf()->Destroy(); + layout_->sampleCount = layout.sampleCount; + iPerf()->GetGlobalCounterLayout(layout_); } + return true; + } else { + return 
false; + } } -uint64_t PalCounterReference::result(const std::vector& index) -{ - if (index.size() == 0) { - // These are counters that have no corresponding PalSample created - return 0; - } - - if (layout_ == nullptr) { - return 0; - } - - uint64_t result = 0; - for (auto const& i: index) { - assert(i <= static_cast(layout_->sampleCount) && "index not in range"); - const Pal::GlobalSampleLayout& sample = layout_->samples[i]; - if (sample.dataType == Pal::PerfCounterDataType::Uint32) { - uint32_t beginVal = *reinterpret_cast(reinterpret_cast(cpuAddr_) + sample.beginValueOffset); - uint32_t endVal = *reinterpret_cast(reinterpret_cast(cpuAddr_) + sample.endValueOffset); - result += (endVal - beginVal); - } - else if (sample.dataType == Pal::PerfCounterDataType::Uint64) { - uint64_t beginVal = *reinterpret_cast(reinterpret_cast(cpuAddr_) + sample.beginValueOffset); - uint64_t endVal = *reinterpret_cast(reinterpret_cast(cpuAddr_) + sample.endValueOffset); - result += (endVal - beginVal); - } - else { - assert(0 && "dataType should be either Uint32 or Uint64"); - return 0; - } - } - - return result; -} - -bool PalCounterReference::finalize() -{ - Pal::Result result; - - iPerf()->Finalize(); - - // Acquire GPU memory for the query from the pool and bind it. 
- Pal::GpuMemoryRequirements gpuMemReqs = {}; - iPerf()->GetGpuMemoryRequirements(&gpuMemReqs); - - memory_ = new Memory(gpu().dev(), amd::alignUp(gpuMemReqs.size, gpuMemReqs.alignment)); - - if (nullptr == memory_) { - return false; - } - - if (!memory_->create(Resource::Remote)) { - return false; - } - - cpuAddr_ = memory_->cpuMap(gpu_); - - if (nullptr == cpuAddr_) { - return false; - } - - gpu_.queue(gpu_.engineID_).addMemRef(memory_->iMem()); - - result = iPerf()->BindGpuMemory(memory_->iMem(), 0); - - if (result == Pal::Result::Success) { - Pal::GlobalCounterLayout layout = {}; - iPerf()->GetGlobalCounterLayout(&layout); - - assert(layout.sampleCount == numExpCounters_); - size_t size = sizeof(Pal::GlobalCounterLayout) + (sizeof(Pal::GlobalSampleLayout) * (layout.sampleCount - 1)); - layout_ = reinterpret_cast(new char[size]); - if (layout_ != nullptr) { - layout_->sampleCount = layout.sampleCount; - iPerf()->GetGlobalCounterLayout(layout_); - } - return true; - } - else { - return false; - } -} - -static const -std::array blockIdToIndexSelect = -{{ - PCIndexSelect::None, // CPF - PCIndexSelect::ShaderEngine, // IA - PCIndexSelect::ShaderEngine, // VGT - PCIndexSelect::ShaderEngine, // PA - PCIndexSelect::ShaderEngine, // SC - PCIndexSelect::ShaderEngine, // SPI - PCIndexSelect::ShaderEngine, // SQ - PCIndexSelect::ShaderEngine, // SX - PCIndexSelect::ShaderEngineAndInstance, // TA - PCIndexSelect::ShaderEngineAndInstance, // TD - PCIndexSelect::ShaderEngineAndInstance, // TCP - PCIndexSelect::Instance, // TCC - PCIndexSelect::Instance, // TCA - PCIndexSelect::ShaderEngineAndInstance, // DB - PCIndexSelect::ShaderEngineAndInstance, // CB - PCIndexSelect::None, // GDS - PCIndexSelect::None, // SRBM - PCIndexSelect::None, // GRBM - PCIndexSelect::None, // GRBMSE - PCIndexSelect::None, // RLC - PCIndexSelect::None, // DMA - PCIndexSelect::None, // MC - PCIndexSelect::None, // CPG - PCIndexSelect::None, // CPC - PCIndexSelect::None, // WD - PCIndexSelect::None, 
// TCS - PCIndexSelect::None, // UTC12 +static const std::array blockIdToIndexSelect = {{ + PCIndexSelect::None, // CPF + PCIndexSelect::ShaderEngine, // IA + PCIndexSelect::ShaderEngine, // VGT + PCIndexSelect::ShaderEngine, // PA + PCIndexSelect::ShaderEngine, // SC + PCIndexSelect::ShaderEngine, // SPI + PCIndexSelect::ShaderEngine, // SQ + PCIndexSelect::ShaderEngine, // SX + PCIndexSelect::ShaderEngineAndInstance, // TA + PCIndexSelect::ShaderEngineAndInstance, // TD + PCIndexSelect::ShaderEngineAndInstance, // TCP + PCIndexSelect::Instance, // TCC + PCIndexSelect::Instance, // TCA + PCIndexSelect::ShaderEngineAndInstance, // DB + PCIndexSelect::ShaderEngineAndInstance, // CB + PCIndexSelect::None, // GDS + PCIndexSelect::None, // SRBM + PCIndexSelect::None, // GRBM + PCIndexSelect::None, // GRBMSE + PCIndexSelect::None, // RLC + PCIndexSelect::None, // DMA + PCIndexSelect::None, // MC + PCIndexSelect::None, // CPG + PCIndexSelect::None, // CPC + PCIndexSelect::None, // WD + PCIndexSelect::None, // TCS + PCIndexSelect::None, // UTC12 }}; // Converting from ORCA cmndefs.h to PAL palPerfExperiment.h -static const -std::array, 83> ciBlockIdOrcaToPal = -{{ - {0x0E, 0}, // CB0 - {0x0E, 1}, // CB1 - {0x0E, 2}, // CB2 - {0x0E, 3}, // CB3 - {0x00, 0}, // CPF - {0x0D, 0}, // DB0 - {0x0D, 1}, // DB1 - {0x0D, 2}, // DB2 - {0x0D, 3}, // DB3 - {0x11, 0}, // GRBM - {0x12, 0}, // GRBMSE - {0x03, 0}, // PA_SU - {0x03, 0}, // PA_SC - {0x05, 0}, // SPI - {0x06, 0}, // SQ - {0x06, 0}, // SQ_ES - {0x06, 0}, // SQ_GS - {0x06, 0}, // SQ_VS - {0x06, 0}, // SQ_PS - {0x06, 0}, // SQ_LS - {0x06, 0}, // SQ_HS - {0x06, 0}, // SQ_CS - {0x07, 0}, // SX - {0x08, 0}, // TA0 - {0x08, 1}, // TA1 - {0x08, 2}, // TA2 - {0x08, 3}, // TA3 - {0x08, 4}, // TA4 - {0x08, 5}, // TA5 - {0x08, 6}, // TA6 - {0x08, 7}, // TA7 - {0x08, 8}, // TA8 - {0x08, 9}, // TA9 - {0x08, 0x0a}, // TA10 - {0x0C, 0}, // TCA0 - {0x0C, 1}, // TCA1 - {0x0B, 0}, // TCC0 - {0x0B, 1}, // TCC1 - {0x0B, 2}, // TCC2 - {0x0B, 3}, 
// TCC3 - {0x0B, 4}, // TCC4 - {0x0B, 5}, // TCC5 - {0x0B, 6}, // TCC6 - {0x0B, 7}, // TCC7 - {0x0B, 8}, // TCC8 - {0x0B, 9}, // TCC9 - {0x0B, 0x0a}, // TCC10 - {0x0B, 0x0b}, // TCC11 - {0x0B, 0x0c}, // TCC12 - {0x0B, 0x0d}, // TCC13 - {0x0B, 0x0e}, // TCC14 - {0x0B, 0x0f}, // TCC15 - {0x09, 0}, // TD0 - {0x09, 1}, // TD1 - {0x09, 2}, // TD2 - {0x09, 3}, // TD3 - {0x09, 4}, // TD4 - {0x09, 5}, // TD5 - {0x09, 6}, // TD6 - {0x09, 7}, // TD7 - {0x09, 8}, // TD8 - {0x09, 9}, // TD9 - {0x09, 0x0a}, // TD10 - {0x0A, 0}, // TCP0 - {0x0A, 1}, // TCP1 - {0x0A, 2}, // TCP2 - {0x0A, 3}, // TCP3 - {0x0A, 4}, // TCP4 - {0x0A, 5}, // TCP5 - {0x0A, 6}, // TCP6 - {0x0A, 7}, // TCP7 - {0x0A, 8}, // TCP8 - {0x0A, 9}, // TCP9 - {0x0A, 0x0a}, // TCP10 - {0x0F, 0}, // GDS - {0x02, 0}, // VGT - {0x01, 0}, // IA - {0x15, 0}, // MC - {0x10, 0}, // SRBM - {0x19, 0}, // TCS - {0x18, 0}, // WD - {0x16, 0}, // CPG - {0x17, 0}, // CPC +static const std::array, 83> ciBlockIdOrcaToPal = {{ + {0x0E, 0}, // CB0 + {0x0E, 1}, // CB1 + {0x0E, 2}, // CB2 + {0x0E, 3}, // CB3 + {0x00, 0}, // CPF + {0x0D, 0}, // DB0 + {0x0D, 1}, // DB1 + {0x0D, 2}, // DB2 + {0x0D, 3}, // DB3 + {0x11, 0}, // GRBM + {0x12, 0}, // GRBMSE + {0x03, 0}, // PA_SU + {0x03, 0}, // PA_SC + {0x05, 0}, // SPI + {0x06, 0}, // SQ + {0x06, 0}, // SQ_ES + {0x06, 0}, // SQ_GS + {0x06, 0}, // SQ_VS + {0x06, 0}, // SQ_PS + {0x06, 0}, // SQ_LS + {0x06, 0}, // SQ_HS + {0x06, 0}, // SQ_CS + {0x07, 0}, // SX + {0x08, 0}, // TA0 + {0x08, 1}, // TA1 + {0x08, 2}, // TA2 + {0x08, 3}, // TA3 + {0x08, 4}, // TA4 + {0x08, 5}, // TA5 + {0x08, 6}, // TA6 + {0x08, 7}, // TA7 + {0x08, 8}, // TA8 + {0x08, 9}, // TA9 + {0x08, 0x0a}, // TA10 + {0x0C, 0}, // TCA0 + {0x0C, 1}, // TCA1 + {0x0B, 0}, // TCC0 + {0x0B, 1}, // TCC1 + {0x0B, 2}, // TCC2 + {0x0B, 3}, // TCC3 + {0x0B, 4}, // TCC4 + {0x0B, 5}, // TCC5 + {0x0B, 6}, // TCC6 + {0x0B, 7}, // TCC7 + {0x0B, 8}, // TCC8 + {0x0B, 9}, // TCC9 + {0x0B, 0x0a}, // TCC10 + {0x0B, 0x0b}, // TCC11 + {0x0B, 0x0c}, // 
TCC12 + {0x0B, 0x0d}, // TCC13 + {0x0B, 0x0e}, // TCC14 + {0x0B, 0x0f}, // TCC15 + {0x09, 0}, // TD0 + {0x09, 1}, // TD1 + {0x09, 2}, // TD2 + {0x09, 3}, // TD3 + {0x09, 4}, // TD4 + {0x09, 5}, // TD5 + {0x09, 6}, // TD6 + {0x09, 7}, // TD7 + {0x09, 8}, // TD8 + {0x09, 9}, // TD9 + {0x09, 0x0a}, // TD10 + {0x0A, 0}, // TCP0 + {0x0A, 1}, // TCP1 + {0x0A, 2}, // TCP2 + {0x0A, 3}, // TCP3 + {0x0A, 4}, // TCP4 + {0x0A, 5}, // TCP5 + {0x0A, 6}, // TCP6 + {0x0A, 7}, // TCP7 + {0x0A, 8}, // TCP8 + {0x0A, 9}, // TCP9 + {0x0A, 0x0a}, // TCP10 + {0x0F, 0}, // GDS + {0x02, 0}, // VGT + {0x01, 0}, // IA + {0x15, 0}, // MC + {0x10, 0}, // SRBM + {0x19, 0}, // TCS + {0x18, 0}, // WD + {0x16, 0}, // CPG + {0x17, 0}, // CPC }}; -static const -std::array, 97> viBlockIdOrcaToPal = -{{ - {0x0E, 0}, // CB0 - {0x0E, 1}, // CB1 - {0x0E, 2}, // CB2 - {0x0E, 3}, // CB3 - {0x00, 0}, // CPF - {0x0D, 0}, // DB0 - {0x0D, 1}, // DB1 - {0x0D, 2}, // DB2 - {0x0D, 3}, // DB3 - {0x11, 0}, // GRBM - {0x12, 0}, // GRBMSE - {0x03, 0}, // PA_SU - {0x03, 0}, // PA_SC - {0x05, 0}, // SPI - {0x06, 0}, // SQ - {0x06, 0}, // SQ_ES - {0x06, 0}, // SQ_GS - {0x06, 0}, // SQ_VS - {0x06, 0}, // SQ_PS - {0x06, 0}, // SQ_LS - {0x06, 0}, // SQ_HS - {0x06, 0}, // SQ_CS - {0x07, 0}, // SX - {0x08, 0}, // TA0 - {0x08, 1}, // TA1 - {0x08, 2}, // TA2 - {0x08, 3}, // TA3 - {0x08, 4}, // TA4 - {0x08, 5}, // TA5 - {0x08, 6}, // TA6 - {0x08, 7}, // TA7 - {0x08, 8}, // TA8 - {0x08, 9}, // TA9 - {0x08, 0x0a}, // TA10 - {0x08, 0x0b}, // TA11 - {0x08, 0x0c}, // TA12 - {0x08, 0x0d}, // TA13 - {0x08, 0x0e}, // TA14 - {0x08, 0x0f}, // TA15 - {0x0C, 0}, // TCA0 - {0x0C, 1}, // TCA1 - {0x0B, 0}, // TCC0 - {0x0B, 1}, // TCC1 - {0x0B, 2}, // TCC2 - {0x0B, 3}, // TCC3 - {0x0B, 4}, // TCC4 - {0x0B, 5}, // TCC5 - {0x0B, 6}, // TCC6 - {0x0B, 7}, // TCC7 - {0x0B, 8}, // TCC8 - {0x0B, 9}, // TCC9 - {0x0B, 0x0a}, // TCC10 - {0x0B, 0x0b}, // TCC11 - {0x0B, 0x0c}, // TCC12 - {0x0B, 0x0d}, // TCC13 - {0x0B, 0x0e}, // TCC14 - {0x0B, 0x0f}, // 
TCC15 - {0x09, 0}, // TD0 - {0x09, 1}, // TD1 - {0x09, 2}, // TD2 - {0x09, 3}, // TD3 - {0x09, 4}, // TD4 - {0x09, 5}, // TD5 - {0x09, 6}, // TD6 - {0x09, 7}, // TD7 - {0x09, 8}, // TD8 - {0x09, 9}, // TD9 - {0x09, 0x0a}, // TD10 - {0x09, 0x0b}, // TD11 - {0x09, 0x0c}, // TD12 - {0x09, 0x0d}, // TD13 - {0x09, 0x0e}, // TD14 - {0x09, 0x0f}, // TD15 - {0x0A, 0}, // TCP0 - {0x0A, 1}, // TCP1 - {0x0A, 2}, // TCP2 - {0x0A, 3}, // TCP3 - {0x0A, 4}, // TCP4 - {0x0A, 5}, // TCP5 - {0x0A, 6}, // TCP6 - {0x0A, 7}, // TCP7 - {0x0A, 8}, // TCP8 - {0x0A, 9}, // TCP9 - {0x0A, 0x0a}, // TCP10 - {0x0A, 0x0b}, // TCP11 - {0x0A, 0x0c}, // TCP12 - {0x0A, 0x0d}, // TCP13 - {0x0A, 0x0e}, // TCP14 - {0x0A, 0x0f}, // TCP15 - {0x0F, 0}, // GDS - {0x02, 0}, // VGT - {0x01, 0}, // IA - {0x15, 0}, // MC - {0x10, 0}, // SRBM - {0x18, 0}, // WD - {0x16, 0}, // CPG - {0x17, 0}, // CPC +static const std::array, 97> viBlockIdOrcaToPal = {{ + {0x0E, 0}, // CB0 + {0x0E, 1}, // CB1 + {0x0E, 2}, // CB2 + {0x0E, 3}, // CB3 + {0x00, 0}, // CPF + {0x0D, 0}, // DB0 + {0x0D, 1}, // DB1 + {0x0D, 2}, // DB2 + {0x0D, 3}, // DB3 + {0x11, 0}, // GRBM + {0x12, 0}, // GRBMSE + {0x03, 0}, // PA_SU + {0x03, 0}, // PA_SC + {0x05, 0}, // SPI + {0x06, 0}, // SQ + {0x06, 0}, // SQ_ES + {0x06, 0}, // SQ_GS + {0x06, 0}, // SQ_VS + {0x06, 0}, // SQ_PS + {0x06, 0}, // SQ_LS + {0x06, 0}, // SQ_HS + {0x06, 0}, // SQ_CS + {0x07, 0}, // SX + {0x08, 0}, // TA0 + {0x08, 1}, // TA1 + {0x08, 2}, // TA2 + {0x08, 3}, // TA3 + {0x08, 4}, // TA4 + {0x08, 5}, // TA5 + {0x08, 6}, // TA6 + {0x08, 7}, // TA7 + {0x08, 8}, // TA8 + {0x08, 9}, // TA9 + {0x08, 0x0a}, // TA10 + {0x08, 0x0b}, // TA11 + {0x08, 0x0c}, // TA12 + {0x08, 0x0d}, // TA13 + {0x08, 0x0e}, // TA14 + {0x08, 0x0f}, // TA15 + {0x0C, 0}, // TCA0 + {0x0C, 1}, // TCA1 + {0x0B, 0}, // TCC0 + {0x0B, 1}, // TCC1 + {0x0B, 2}, // TCC2 + {0x0B, 3}, // TCC3 + {0x0B, 4}, // TCC4 + {0x0B, 5}, // TCC5 + {0x0B, 6}, // TCC6 + {0x0B, 7}, // TCC7 + {0x0B, 8}, // TCC8 + {0x0B, 9}, // TCC9 + 
{0x0B, 0x0a}, // TCC10 + {0x0B, 0x0b}, // TCC11 + {0x0B, 0x0c}, // TCC12 + {0x0B, 0x0d}, // TCC13 + {0x0B, 0x0e}, // TCC14 + {0x0B, 0x0f}, // TCC15 + {0x09, 0}, // TD0 + {0x09, 1}, // TD1 + {0x09, 2}, // TD2 + {0x09, 3}, // TD3 + {0x09, 4}, // TD4 + {0x09, 5}, // TD5 + {0x09, 6}, // TD6 + {0x09, 7}, // TD7 + {0x09, 8}, // TD8 + {0x09, 9}, // TD9 + {0x09, 0x0a}, // TD10 + {0x09, 0x0b}, // TD11 + {0x09, 0x0c}, // TD12 + {0x09, 0x0d}, // TD13 + {0x09, 0x0e}, // TD14 + {0x09, 0x0f}, // TD15 + {0x0A, 0}, // TCP0 + {0x0A, 1}, // TCP1 + {0x0A, 2}, // TCP2 + {0x0A, 3}, // TCP3 + {0x0A, 4}, // TCP4 + {0x0A, 5}, // TCP5 + {0x0A, 6}, // TCP6 + {0x0A, 7}, // TCP7 + {0x0A, 8}, // TCP8 + {0x0A, 9}, // TCP9 + {0x0A, 0x0a}, // TCP10 + {0x0A, 0x0b}, // TCP11 + {0x0A, 0x0c}, // TCP12 + {0x0A, 0x0d}, // TCP13 + {0x0A, 0x0e}, // TCP14 + {0x0A, 0x0f}, // TCP15 + {0x0F, 0}, // GDS + {0x02, 0}, // VGT + {0x01, 0}, // IA + {0x15, 0}, // MC + {0x10, 0}, // SRBM + {0x18, 0}, // WD + {0x16, 0}, // CPG + {0x17, 0}, // CPC }}; -// The number of counters per block has been increased for gfx9 but this table may not reflect all of them +// The number of counters per block has been increased for gfx9 but this table may not reflect all +// of them // as compute may not use all of them. 
-static const -std::array, 104> gfx9BlockIdPal = -{{ - {0x0E, 0}, // CB0 - {0x0E, 1}, // CB1 - {0x0E, 2}, // CB2 - {0x0E, 3}, // CB3 - {0x00, 0}, // CPF0 - {0x00, 1}, // CPF1 - {0x0D, 0}, // DB0 - {0x0D, 1}, // DB1 - {0x0D, 2}, // DB2 - {0x0D, 3}, // DB3 - {0x11, 0}, // GRBM0 - {0x11, 1}, // GRBM1 - {0x12, 0}, // GRBMSE0 - {0x03, 0}, // PA_SU - {0x03, 0}, // PA_SC - {0x05, 0}, // SPI - {0x06, 0}, // SQ0 - {0x06, 1}, // SQ1 - {0x06, 0}, // SQ_ES - {0x06, 0}, // SQ_GS - {0x06, 0}, // SQ_VS - {0x06, 0}, // SQ_PS - {0x06, 0}, // SQ_LS - {0x06, 0}, // SQ_HS - {0x06, 0}, // SQ_CS0 - {0x06, 1}, // SQ_CS1 - {0x07, 0}, // SX - {0x08, 0}, // TA0 - {0x08, 1}, // TA1 - {0x08, 2}, // TA2 - {0x08, 3}, // TA3 - {0x08, 4}, // TA4 - {0x08, 5}, // TA5 - {0x08, 6}, // TA6 - {0x08, 7}, // TA7 - {0x08, 8}, // TA8 - {0x08, 9}, // TA9 - {0x08, 0x0a}, // TA10 - {0x08, 0x0b}, // TA11 - {0x08, 0x0c}, // TA12 - {0x08, 0x0d}, // TA13 - {0x08, 0x0e}, // TA14 - {0x08, 0x0f}, // TA15 - {0x0C, 0}, // TCA0 - {0x0C, 1}, // TCA1 - {0x0B, 0}, // TCC0 - {0x0B, 1}, // TCC1 - {0x0B, 2}, // TCC2 - {0x0B, 3}, // TCC3 - {0x0B, 4}, // TCC4 - {0x0B, 5}, // TCC5 - {0x0B, 6}, // TCC6 - {0x0B, 7}, // TCC7 - {0x0B, 8}, // TCC8 - {0x0B, 9}, // TCC9 - {0x0B, 0x0a}, // TCC10 - {0x0B, 0x0b}, // TCC11 - {0x0B, 0x0c}, // TCC12 - {0x0B, 0x0d}, // TCC13 - {0x0B, 0x0e}, // TCC14 - {0x0B, 0x0f}, // TCC15 - {0x09, 0}, // TD0 - {0x09, 1}, // TD1 - {0x09, 2}, // TD2 - {0x09, 3}, // TD3 - {0x09, 4}, // TD4 - {0x09, 5}, // TD5 - {0x09, 6}, // TD6 - {0x09, 7}, // TD7 - {0x09, 8}, // TD8 - {0x09, 9}, // TD9 - {0x09, 0x0a}, // TD10 - {0x09, 0x0b}, // TD11 - {0x09, 0x0c}, // TD12 - {0x09, 0x0d}, // TD13 - {0x09, 0x0e}, // TD14 - {0x09, 0x0f}, // TD15 - {0x0A, 0}, // TCP0 - {0x0A, 1}, // TCP1 - {0x0A, 2}, // TCP2 - {0x0A, 3}, // TCP3 - {0x0A, 4}, // TCP4 - {0x0A, 5}, // TCP5 - {0x0A, 6}, // TCP6 - {0x0A, 7}, // TCP7 - {0x0A, 8}, // TCP8 - {0x0A, 9}, // TCP9 - {0x0A, 0x0a}, // TCP10 - {0x0A, 0x0b}, // TCP11 - {0x0A, 0x0c}, // TCP12 
- {0x0A, 0x0d}, // TCP13 - {0x0A, 0x0e}, // TCP14 - {0x0A, 0x0f}, // TCP15 - {0x0F, 0}, // GDS0 - {0x0F, 1}, // GDS1 - {0x02, 0}, // VGT - {0x01, 0}, // IA - {0x15, 0}, // MC - {0x10, 0}, // SRBM - {0x18, 0}, // WD - {0x16, 0}, // CPG0 - {0x16, 1}, // CPG1 - {0x17, 0}, // CPC0 - {0x17, 1}, // CPC1 +static const std::array, 104> gfx9BlockIdPal = {{ + {0x0E, 0}, // CB0 + {0x0E, 1}, // CB1 + {0x0E, 2}, // CB2 + {0x0E, 3}, // CB3 + {0x00, 0}, // CPF0 + {0x00, 1}, // CPF1 + {0x0D, 0}, // DB0 + {0x0D, 1}, // DB1 + {0x0D, 2}, // DB2 + {0x0D, 3}, // DB3 + {0x11, 0}, // GRBM0 + {0x11, 1}, // GRBM1 + {0x12, 0}, // GRBMSE0 + {0x03, 0}, // PA_SU + {0x03, 0}, // PA_SC + {0x05, 0}, // SPI + {0x06, 0}, // SQ0 + {0x06, 1}, // SQ1 + {0x06, 0}, // SQ_ES + {0x06, 0}, // SQ_GS + {0x06, 0}, // SQ_VS + {0x06, 0}, // SQ_PS + {0x06, 0}, // SQ_LS + {0x06, 0}, // SQ_HS + {0x06, 0}, // SQ_CS0 + {0x06, 1}, // SQ_CS1 + {0x07, 0}, // SX + {0x08, 0}, // TA0 + {0x08, 1}, // TA1 + {0x08, 2}, // TA2 + {0x08, 3}, // TA3 + {0x08, 4}, // TA4 + {0x08, 5}, // TA5 + {0x08, 6}, // TA6 + {0x08, 7}, // TA7 + {0x08, 8}, // TA8 + {0x08, 9}, // TA9 + {0x08, 0x0a}, // TA10 + {0x08, 0x0b}, // TA11 + {0x08, 0x0c}, // TA12 + {0x08, 0x0d}, // TA13 + {0x08, 0x0e}, // TA14 + {0x08, 0x0f}, // TA15 + {0x0C, 0}, // TCA0 + {0x0C, 1}, // TCA1 + {0x0B, 0}, // TCC0 + {0x0B, 1}, // TCC1 + {0x0B, 2}, // TCC2 + {0x0B, 3}, // TCC3 + {0x0B, 4}, // TCC4 + {0x0B, 5}, // TCC5 + {0x0B, 6}, // TCC6 + {0x0B, 7}, // TCC7 + {0x0B, 8}, // TCC8 + {0x0B, 9}, // TCC9 + {0x0B, 0x0a}, // TCC10 + {0x0B, 0x0b}, // TCC11 + {0x0B, 0x0c}, // TCC12 + {0x0B, 0x0d}, // TCC13 + {0x0B, 0x0e}, // TCC14 + {0x0B, 0x0f}, // TCC15 + {0x09, 0}, // TD0 + {0x09, 1}, // TD1 + {0x09, 2}, // TD2 + {0x09, 3}, // TD3 + {0x09, 4}, // TD4 + {0x09, 5}, // TD5 + {0x09, 6}, // TD6 + {0x09, 7}, // TD7 + {0x09, 8}, // TD8 + {0x09, 9}, // TD9 + {0x09, 0x0a}, // TD10 + {0x09, 0x0b}, // TD11 + {0x09, 0x0c}, // TD12 + {0x09, 0x0d}, // TD13 + {0x09, 0x0e}, // TD14 + {0x09, 
0x0f}, // TD15 + {0x0A, 0}, // TCP0 + {0x0A, 1}, // TCP1 + {0x0A, 2}, // TCP2 + {0x0A, 3}, // TCP3 + {0x0A, 4}, // TCP4 + {0x0A, 5}, // TCP5 + {0x0A, 6}, // TCP6 + {0x0A, 7}, // TCP7 + {0x0A, 8}, // TCP8 + {0x0A, 9}, // TCP9 + {0x0A, 0x0a}, // TCP10 + {0x0A, 0x0b}, // TCP11 + {0x0A, 0x0c}, // TCP12 + {0x0A, 0x0d}, // TCP13 + {0x0A, 0x0e}, // TCP14 + {0x0A, 0x0f}, // TCP15 + {0x0F, 0}, // GDS0 + {0x0F, 1}, // GDS1 + {0x02, 0}, // VGT + {0x01, 0}, // IA + {0x15, 0}, // MC + {0x10, 0}, // SRBM + {0x18, 0}, // WD + {0x16, 0}, // CPG0 + {0x16, 1}, // CPG1 + {0x17, 0}, // CPC0 + {0x17, 1}, // CPC1 }}; -void PerfCounter::convertInfo() -{ - switch (dev().ipLevel()) { +void PerfCounter::convertInfo() { + switch (dev().ipLevel()) { case Pal::GfxIpLevel::GfxIp7: - if (info_.blockIndex_ < ciBlockIdOrcaToPal.size()) { - auto p = ciBlockIdOrcaToPal[info_.blockIndex_]; - info_.blockIndex_ = std::get<0>(p); - info_.counterIndex_ = std::get<1>(p); - } - break; + if (info_.blockIndex_ < ciBlockIdOrcaToPal.size()) { + auto p = ciBlockIdOrcaToPal[info_.blockIndex_]; + info_.blockIndex_ = std::get<0>(p); + info_.counterIndex_ = std::get<1>(p); + } + break; case Pal::GfxIpLevel::GfxIp8: - if (info_.blockIndex_ < viBlockIdOrcaToPal.size()) { - auto p = viBlockIdOrcaToPal[info_.blockIndex_]; - info_.blockIndex_ = std::get<0>(p); - info_.counterIndex_ = std::get<1>(p); - } - break; + if (info_.blockIndex_ < viBlockIdOrcaToPal.size()) { + auto p = viBlockIdOrcaToPal[info_.blockIndex_]; + info_.blockIndex_ = std::get<0>(p); + info_.counterIndex_ = std::get<1>(p); + } + break; case Pal::GfxIpLevel::GfxIp9: - if (info_.blockIndex_ < gfx9BlockIdPal.size()) { - auto p = gfx9BlockIdPal[info_.blockIndex_]; - info_.blockIndex_ = std::get<0>(p); - info_.counterIndex_ = std::get<1>(p); - } - break; + if (info_.blockIndex_ < gfx9BlockIdPal.size()) { + auto p = gfx9BlockIdPal[info_.blockIndex_]; + info_.blockIndex_ = std::get<0>(p); + info_.counterIndex_ = std::get<1>(p); + } + break; default: - 
Unimplemented(); - break; - } + Unimplemented(); + break; + } - assert(info_.blockIndex_ < blockIdToIndexSelect.size()); - info_.indexSelect_ = blockIdToIndexSelect.at(info_.blockIndex_); + assert(info_.blockIndex_ < blockIdToIndexSelect.size()); + info_.indexSelect_ = blockIdToIndexSelect.at(info_.blockIndex_); } -PerfCounter::~PerfCounter() -{ - if (palRef_ == nullptr) { - return; - } +PerfCounter::~PerfCounter() { + if (palRef_ == nullptr) { + return; + } - // Release the counter reference object - palRef_->release(); + // Release the counter reference object + palRef_->release(); } -bool -PerfCounter::create() -{ - palRef_->retain(); +bool PerfCounter::create() { + palRef_->retain(); - // Initialize the counter - Pal::PerfCounterInfo counterInfo = {}; - counterInfo.counterType = Pal::PerfCounterType::Global; - counterInfo.block = static_cast(info_.blockIndex_); - counterInfo.eventId = info_.eventIndex_; + // Initialize the counter + Pal::PerfCounterInfo counterInfo = {}; + counterInfo.counterType = Pal::PerfCounterType::Global; + counterInfo.block = static_cast(info_.blockIndex_); + counterInfo.eventId = info_.eventIndex_; - Pal::PerfExperimentProperties perfExpProps; - Pal::Result result; - result = dev().iDev()->GetPerfExperimentProperties(&perfExpProps); - if (result != Pal::Result::Success) { - return false; - } + Pal::PerfExperimentProperties perfExpProps; + Pal::Result result; + result = dev().iDev()->GetPerfExperimentProperties(&perfExpProps); + if (result != Pal::Result::Success) { + return false; + } - const auto& blockProps = perfExpProps.blocks[static_cast(counterInfo.block)]; - uint32_t counter_start, counter_step; + const auto& blockProps = perfExpProps.blocks[static_cast(counterInfo.block)]; + uint32_t counter_start, counter_step; - switch (info_.indexSelect_) { + switch (info_.indexSelect_) { case PCIndexSelect::ShaderEngine: case PCIndexSelect::None: - counter_start = 0; - counter_step = 1; - break; + counter_start = 0; + counter_step = 1; + 
break; case PCIndexSelect::ShaderEngineAndInstance: - if (info_.counterIndex_ >= - dev().properties().gfxipProperties.shaderCore.maxCusPerShaderArray) { - return true; - } - counter_start = info_.counterIndex_; - counter_step = dev().properties().gfxipProperties.shaderCore.maxCusPerShaderArray; - break; + if (info_.counterIndex_ >= + dev().properties().gfxipProperties.shaderCore.maxCusPerShaderArray) { + return true; + } + counter_start = info_.counterIndex_; + counter_step = dev().properties().gfxipProperties.shaderCore.maxCusPerShaderArray; + break; case PCIndexSelect::Instance: - counter_start = info_.counterIndex_; - counter_step = blockProps.instanceCount; - break; + counter_start = info_.counterIndex_; + counter_step = blockProps.instanceCount; + break; default: - assert(0 && "Unknown indexSelect_"); - return true; - } + assert(0 && "Unknown indexSelect_"); + return true; + } - for (uint32_t i = counter_start; i < blockProps.instanceCount; i += counter_step) { - counterInfo.instance = i; - result = iPerf()->AddCounter(counterInfo); - if (result == Pal::Result::Success) { - index_.push_back(palRef_->getPalCounterIndex()); - } - else { - // Get here when there's no HW PerfCounter matching the counterInfo - assert(0 && "AddCounter() failed"); - } + for (uint32_t i = counter_start; i < blockProps.instanceCount; i += counter_step) { + counterInfo.instance = i; + result = iPerf()->AddCounter(counterInfo); + if (result == Pal::Result::Success) { + index_.push_back(palRef_->getPalCounterIndex()); + } else { + // Get here when there's no HW PerfCounter matching the counterInfo + assert(0 && "AddCounter() failed"); } - return true; + } + return true; } -uint64_t -PerfCounter::getInfo(uint64_t infoType) const -{ - switch (infoType) { +uint64_t PerfCounter::getInfo(uint64_t infoType) const { + switch (infoType) { case CL_PERFCOUNTER_GPU_BLOCK_INDEX: { - // Return the GPU block index - return info()->blockIndex_; + // Return the GPU block index + return 
info()->blockIndex_; } case CL_PERFCOUNTER_GPU_COUNTER_INDEX: { - // Return the GPU counter index - return info()->counterIndex_; + // Return the GPU counter index + return info()->counterIndex_; } case CL_PERFCOUNTER_GPU_EVENT_INDEX: { - // Return the GPU event index - return info()->eventIndex_; + // Return the GPU event index + return info()->eventIndex_; } case CL_PERFCOUNTER_DATA: { - return palRef_->result(index_); + return palRef_->result(index_); } default: - LogError("Wrong PerfCounter::getInfo parameter"); - } - return 0; + LogError("Wrong PerfCounter::getInfo parameter"); + } + return 0; } -} // namespace pal +} // namespace pal diff --git a/rocclr/runtime/device/pal/palcounters.hpp b/rocclr/runtime/device/pal/palcounters.hpp index 10762532e6..ea55cc1600 100644 --- a/rocclr/runtime/device/pal/palcounters.hpp +++ b/rocclr/runtime/device/pal/palcounters.hpp @@ -10,132 +10,124 @@ namespace pal { -enum class PCIndexSelect : uint -{ - None = 0, ///< no index - Instance, ///< index by block instance - ShaderEngine, ///< index by shader engine - ShaderEngineAndInstance, ///< index by shader and instance +enum class PCIndexSelect : uint { + None = 0, ///< no index + Instance, ///< index by block instance + ShaderEngine, ///< index by shader engine + ShaderEngineAndInstance, ///< index by shader and instance }; class VirtualGPU; -class PalCounterReference : public amd::ReferenceCountedObject -{ -public: - static PalCounterReference* Create(VirtualGPU& gpu); +class PalCounterReference : public amd::ReferenceCountedObject { + public: + static PalCounterReference* Create(VirtualGPU& gpu); - //! Default constructor - PalCounterReference( - VirtualGPU& gpu //!< Virtual GPU device object - ) - : gpu_(gpu) - , perfExp_(nullptr) - , layout_(nullptr) - , memory_(nullptr) - , cpuAddr_(nullptr) - , numExpCounters_(0) {} + //! 
Default constructor + PalCounterReference(VirtualGPU& gpu //!< Virtual GPU device object + ) + : gpu_(gpu), + perfExp_(nullptr), + layout_(nullptr), + memory_(nullptr), + cpuAddr_(nullptr), + numExpCounters_(0) {} - //! Get PAL counter - Pal::IPerfExperiment* iPerf() const { return perfExp_; } + //! Get PAL counter + Pal::IPerfExperiment* iPerf() const { return perfExp_; } - //! Returns the virtual GPU device - const VirtualGPU& gpu() const { return gpu_; } + //! Returns the virtual GPU device + const VirtualGPU& gpu() const { return gpu_; } - //! Prepare for execution - bool finalize(); + //! Prepare for execution + bool finalize(); - //! Returns the PAL counter results - uint64_t result(const std::vector& index); + //! Returns the PAL counter results + uint64_t result(const std::vector& index); - //! Get the latest Experiment Counter index - uint getPalCounterIndex() { return numExpCounters_++; }; + //! Get the latest Experiment Counter index + uint getPalCounterIndex() { return numExpCounters_++; }; -protected: - //! Default destructor - ~PalCounterReference(); + protected: + //! Default destructor + ~PalCounterReference(); -private: - //! Disable copy constructor - PalCounterReference(const PalCounterReference&); + private: + //! Disable copy constructor + PalCounterReference(const PalCounterReference&); - //! Disable operator= - PalCounterReference& operator=(const PalCounterReference&); + //! 
Disable operator= + PalCounterReference& operator=(const PalCounterReference&); - VirtualGPU& gpu_; //!< The virtual GPU device object - Pal::IPerfExperiment* perfExp_; //!< PAL performance experiment object - Pal::GlobalCounterLayout* layout_; //!< Layout of the result - Memory* memory_; //!< Memory used by PAL performance experiment - void* cpuAddr_; //!< CPU address of memory_ - uint numExpCounters_; //!< Number of Experiment Counter created + VirtualGPU& gpu_; //!< The virtual GPU device object + Pal::IPerfExperiment* perfExp_; //!< PAL performance experiment object + Pal::GlobalCounterLayout* layout_; //!< Layout of the result + Memory* memory_; //!< Memory used by PAL performance experiment + void* cpuAddr_; //!< CPU address of memory_ + uint numExpCounters_; //!< Number of Experiment Counter created }; //! Performance counter implementation on GPU -class PerfCounter : public device::PerfCounter -{ -public: - //! The performance counter info - struct Info : public amd::EmbeddedObject - { - uint blockIndex_; //!< Index of the block to configure - uint counterIndex_; //!< Index of the hardware counter - uint eventIndex_; //!< Event you wish to count with the counter - PCIndexSelect indexSelect_; //!< IndexSelect type of the counter - }; +class PerfCounter : public device::PerfCounter { + public: + //! The performance counter info + struct Info : public amd::EmbeddedObject { + uint blockIndex_; //!< Index of the block to configure + uint counterIndex_; //!< Index of the hardware counter + uint eventIndex_; //!< Event you wish to count with the counter + PCIndexSelect indexSelect_; //!< IndexSelect type of the counter + }; - //! 
Constructor for the GPU PerfCounter object - PerfCounter( - const Device& device, //!< A GPU device object - PalCounterReference* palRef, //!< Counter Reference - cl_uint blockIndex, //!< HW block index - cl_uint counterIndex, //!< Counter index within the block - cl_uint eventIndex) //!< Event index for profiling - : gpuDevice_(device) - , palRef_(palRef) - { - info_.blockIndex_ = blockIndex; - info_.counterIndex_ = counterIndex; - info_.eventIndex_ = eventIndex; - convertInfo(); - } + //! Constructor for the GPU PerfCounter object + PerfCounter(const Device& device, //!< A GPU device object + PalCounterReference* palRef, //!< Counter Reference + cl_uint blockIndex, //!< HW block index + cl_uint counterIndex, //!< Counter index within the block + cl_uint eventIndex) //!< Event index for profiling + : gpuDevice_(device), + palRef_(palRef) { + info_.blockIndex_ = blockIndex; + info_.counterIndex_ = counterIndex; + info_.eventIndex_ = eventIndex; + convertInfo(); + } - //! Destructor for the GPU PerfCounter object - virtual ~PerfCounter(); + //! Destructor for the GPU PerfCounter object + virtual ~PerfCounter(); - //! Creates the current object - bool create(); + //! Creates the current object + bool create(); - //! Returns the specific information about the counter - uint64_t getInfo( - uint64_t infoType //!< The type of returned information - ) const; + //! Returns the specific information about the counter + uint64_t getInfo(uint64_t infoType //!< The type of returned information + ) const; - //! Returns the GPU device, associated with the current object - const Device& dev() const { return gpuDevice_; } + //! Returns the GPU device, associated with the current object + const Device& dev() const { return gpuDevice_; } - //! Returns the virtual GPU device - const VirtualGPU& gpu() const { return palRef_->gpu(); } + //! Returns the virtual GPU device + const VirtualGPU& gpu() const { return palRef_->gpu(); } - //! 
Returns the PAL performance counter descriptor - const Info* info() const { return &info_; } + //! Returns the PAL performance counter descriptor + const Info* info() const { return &info_; } - //! Returns the Info structure for performance counter - Pal::IPerfExperiment* iPerf() const { return palRef_->iPerf(); } + //! Returns the Info structure for performance counter + Pal::IPerfExperiment* iPerf() const { return palRef_->iPerf(); } -private: - //! Disable default copy constructor - PerfCounter(const PerfCounter&); + private: + //! Disable default copy constructor + PerfCounter(const PerfCounter&); - //! Disable default operator= - PerfCounter& operator=(const PerfCounter&); + //! Disable default operator= + PerfCounter& operator=(const PerfCounter&); - //! Convert info from ORCA to PAL - void convertInfo(); + //! Convert info from ORCA to PAL + void convertInfo(); - const Device& gpuDevice_; //!< The backend device - PalCounterReference* palRef_; //!< Reference counter - Info info_; //!< The info structure for perfcounter - std::vector index_; //!< Counter index in the PAL container + const Device& gpuDevice_; //!< The backend device + PalCounterReference* palRef_; //!< Reference counter + Info info_; //!< The info structure for perfcounter + std::vector index_; //!< Counter index in the PAL container }; -} // namespace pal +} // namespace pal diff --git a/rocclr/runtime/device/pal/paldebugger.hpp b/rocclr/runtime/device/pal/paldebugger.hpp index 162a5c4a6f..cb1d4dd981 100644 --- a/rocclr/runtime/device/pal/paldebugger.hpp +++ b/rocclr/runtime/device/pal/paldebugger.hpp @@ -24,82 +24,77 @@ namespace pal { * * This structure contains the packet information for kernel dispatch */ -struct PacketAmdInfo -{ - uint32_t trapReservedVgprIndex_; //!< reserved VGPR index, -1 when they are not valid - uint32_t scratchBufferWaveOffset_; //!< scratch buffer wave offset, -1 when no scratch buffer - void* pointerToIsaBuffer_; //!< pointer to the buffer containing ISA - size_t 
sizeOfIsaBuffer_; //!< size of the ISA buffer - uint32_t numberOfVgprs_; //!< number of VGPRs used by the kernel - uint32_t numberOfSgprs_; //!< number of SGPRs used by the kernel - size_t sizeOfStaticGroupMemory_; //!< Static local memory used by the kernel +struct PacketAmdInfo { + uint32_t trapReservedVgprIndex_; //!< reserved VGPR index, -1 when they are not valid + uint32_t scratchBufferWaveOffset_; //!< scratch buffer wave offset, -1 when no scratch buffer + void* pointerToIsaBuffer_; //!< pointer to the buffer containing ISA + size_t sizeOfIsaBuffer_; //!< size of the ISA buffer + uint32_t numberOfVgprs_; //!< number of VGPRs used by the kernel + uint32_t numberOfSgprs_; //!< number of SGPRs used by the kernel + size_t sizeOfStaticGroupMemory_; //!< Static local memory used by the kernel }; /*! \brief Cache mask for invalidation */ -struct HwDbgGpuCacheMask -{ - HwDbgGpuCacheMask() :ui32All_(0) {} +struct HwDbgGpuCacheMask { + HwDbgGpuCacheMask() : ui32All_(0) {} - HwDbgGpuCacheMask(uint32_t mask) :ui32All_(mask) {} + HwDbgGpuCacheMask(uint32_t mask) : ui32All_(mask) {} - union { - struct { - uint32_t sqICache_ : 1; //!< Instruction cache - uint32_t sqKCache_ : 1; //!< Data cache - uint32_t tcL1_ : 1; //!< tcL1 cache - uint32_t tcL2_ : 1; //!< tcL2 cache - uint32_t reserved_ : 28; - }; - uint32_t ui32All_; + union { + struct { + uint32_t sqICache_ : 1; //!< Instruction cache + uint32_t sqKCache_ : 1; //!< Data cache + uint32_t tcL1_ : 1; //!< tcL1 cache + uint32_t tcL2_ : 1; //!< tcL2 cache + uint32_t reserved_ : 28; }; + uint32_t ui32All_; + }; }; /*! \brief Address watch information * * Information about each watch point - address, mask, mode and event */ -struct HwDbgAddressWatch -{ - void* watchAddress_; //! The address of watch point - uint64_t watchMask_; //! The mask for watch point (lower 24 bits) - cl_dbg_address_watch_mode_amd watchMode_; //! The watch mode for this watch - DebugEvent event_; //! 
Event of the watch point (not used for now) +struct HwDbgAddressWatch { + void* watchAddress_; //! The address of watch point + uint64_t watchMask_; //! The mask for watch point (lower 24 bits) + cl_dbg_address_watch_mode_amd watchMode_; //! The watch mode for this watch + DebugEvent event_; //! Event of the watch point (not used for now) }; /*! \brief Runtime structure used to communicate debug information * between Ocl services and core for a kernel dispatch. */ -struct DebugToolInfo -{ - uint64_t scratchAddress_; //! Scratch memory address - size_t scratchSize_; //! Scratch memory size - uint64_t globalAddress_; //! Global memory address - uint32_t cacheDisableMask_; //! Cache mask, indicating caches disabled - uint32_t exceptionMask_; //! Exception mask - uint32_t reservedCuNum_; //! Number of reserved CUs for display, - //! which ranges from 0 to 7 in the current implementation. - bool monitorMode_; //! Debug or profiler mode - bool gpuSingleStepMode_; //! SQ debug mode - amd::Memory* trapHandler_; //! Trap handler address - amd::Memory* trapBuffer_; //! Trap buffer address - bool sqPerfcounterEnable_; //! whether SQ perf counters are enabled - aclBinary* aclBinary_; //! pointer of the kernel ACL binary - amd::Event* event_; //! pointer of the kernel event in the enqueue command +struct DebugToolInfo { + uint64_t scratchAddress_; //! Scratch memory address + size_t scratchSize_; //! Scratch memory size + uint64_t globalAddress_; //! Global memory address + uint32_t cacheDisableMask_; //! Cache mask, indicating caches disabled + uint32_t exceptionMask_; //! Exception mask + uint32_t reservedCuNum_; //! Number of reserved CUs for display, + //! which ranges from 0 to 7 in the current implementation. + bool monitorMode_; //! Debug or profiler mode + bool gpuSingleStepMode_; //! SQ debug mode + amd::Memory* trapHandler_; //! Trap handler address + amd::Memory* trapBuffer_; //! Trap buffer address + bool sqPerfcounterEnable_; //! 
whether SQ perf counters are enabled + aclBinary* aclBinary_; //! pointer of the kernel ACL binary + amd::Event* event_; //! pointer of the kernel event in the enqueue command }; /*! \brief Message used by the KFD wave control for CI * * Structure indicates the various information used by the wave control function. */ -struct HwDebugWaveAddr -{ - uint32_t VMID_ : 4; //! Virtual memory id - uint32_t wave_ : 4; //! Wave id - uint32_t SIMD_ : 2; //! SIMD id - uint32_t CU_ : 4; //! Compute unit - uint32_t SH_ : 1; //! Shader array - uint32_t SE_ : 1; //! Shader engine +struct HwDebugWaveAddr { + uint32_t VMID_ : 4; //! Virtual memory id + uint32_t wave_ : 4; //! Wave id + uint32_t SIMD_ : 2; //! SIMD id + uint32_t CU_ : 4; //! Compute unit + uint32_t SH_ : 1; //! Shader array + uint32_t SE_ : 1; //! Shader engine }; /*! \brief Kernel code information @@ -107,13 +102,11 @@ struct HwDebugWaveAddr * This structure contains the pointer of mapped kernel code for host access * and its size (in bytes) */ -struct AqlCodeInfo -{ - amd_kernel_code_t * aqlCode_; //! pointer of AQL code to allow host access - uint32_t aqlCodeSize_; //! size of AQL code +struct AqlCodeInfo { + amd_kernel_code_t* aqlCode_; //! pointer of AQL code to allow host access + uint32_t aqlCodeSize_; //! 
size of AQL code }; /**@}*/ } // namespace pal - diff --git a/rocclr/runtime/device/pal/paldebugmanager.cpp b/rocclr/runtime/device/pal/paldebugmanager.cpp index 55438b881f..7ed056645f 100644 --- a/rocclr/runtime/device/pal/paldebugmanager.cpp +++ b/rocclr/runtime/device/pal/paldebugmanager.cpp @@ -24,389 +24,330 @@ class Memory; */ GpuDebugManager::GpuDebugManager(amd::Device* device) - : HwDebugManager(device) - , vGpu_(nullptr) - , debugMessages_(0) - , addressWatch_(nullptr) - , addressWatchSize_(0) - , oclEventHandle_(nullptr) -{ - // Initialize the exception info and the kernel execution mode - excpPolicy_.exceptionMask = 0x0; - excpPolicy_.waveAction = CL_DBG_WAVES_RESUME; - excpPolicy_.hostAction = CL_DBG_HOST_IGNORE; - excpPolicy_.waveMode = CL_DBG_WAVEMODE_BROADCAST; + : HwDebugManager(device), + vGpu_(nullptr), + debugMessages_(0), + addressWatch_(nullptr), + addressWatchSize_(0), + oclEventHandle_(nullptr) { + // Initialize the exception info and the kernel execution mode + excpPolicy_.exceptionMask = 0x0; + excpPolicy_.waveAction = CL_DBG_WAVES_RESUME; + excpPolicy_.hostAction = CL_DBG_HOST_IGNORE; + excpPolicy_.waveMode = CL_DBG_WAVEMODE_BROADCAST; - execMode_.ui32All = 0; + execMode_.ui32All = 0; - rtTrapHandlerInfo_.trap_.trapHandler_ = nullptr; - rtTrapHandlerInfo_.trap_.trapBuffer_ = nullptr; + rtTrapHandlerInfo_.trap_.trapHandler_ = nullptr; + rtTrapHandlerInfo_.trap_.trapBuffer_ = nullptr; - aqlPacket_ = (hsa_kernel_dispatch_packet_t *) nullptr; + aqlPacket_ = (hsa_kernel_dispatch_packet_t*)nullptr; - return; + return; } -GpuDebugManager::~GpuDebugManager() -{ - if (nullptr != addressWatch_) { - delete [] addressWatch_; - } +GpuDebugManager::~GpuDebugManager() { + if (nullptr != addressWatch_) { + delete[] addressWatch_; + } } -void -GpuDebugManager::executePreDispatchCallBack(void* aqlPacket, - void* toolInfo) -{ - DebugToolInfo* info = reinterpret_cast(toolInfo); +void GpuDebugManager::executePreDispatchCallBack(void* aqlPacket, void* 
toolInfo) { + DebugToolInfo* info = reinterpret_cast(toolInfo); - aqlPacket_ = reinterpret_cast(aqlPacket); - Unimplemented(); - // Only if the pre-dispatch callback is set, will we update cache - // flush configuration and build the memory descriptor. - if (nullptr != preDispatchCallBackFunc_) { -/* - // Build the scratch memory descriptor - device()->gslCtx()->BuildScratchBufferResource(debugInfo_.scratchMemoryDescriptor_, - info->scratchAddress_, - info->scratchSize_); + aqlPacket_ = reinterpret_cast(aqlPacket); + Unimplemented(); + // Only if the pre-dispatch callback is set, will we update cache + // flush configuration and build the memory descriptor. + if (nullptr != preDispatchCallBackFunc_) { + /* + // Build the scratch memory descriptor + device()->gslCtx()->BuildScratchBufferResource(debugInfo_.scratchMemoryDescriptor_, + info->scratchAddress_, + info->scratchSize_); - // Build the global memory descriptor - device()->gslCtx()->BuildHeapBufferResource(debugInfo_.globalMemoryDescriptor_, - info->globalAddress_); -*/ -// // for invalidate cache (BuildEndOfKernelNotifyCommands) -// aqlPacket->release_fence_scope = 2; + // Build the global memory descriptor + device()->gslCtx()->BuildHeapBufferResource(debugInfo_.globalMemoryDescriptor_, + info->globalAddress_); + */ + // // for invalidate cache (BuildEndOfKernelNotifyCommands) + // aqlPacket->release_fence_scope = 2; - aclBinary_ = reinterpret_cast(info->aclBinary_); - oclEventHandle_ = reinterpret_cast(as_cl(info->event_)); + aclBinary_ = reinterpret_cast(info->aclBinary_); + oclEventHandle_ = reinterpret_cast(as_cl(info->event_)); - cl_device_id clDeviceId = as_cl(device_); - preDispatchCallBackFunc_(clDeviceId, - oclEventHandle_, - aqlPacket_, - aclBinary_, - preDispatchCallBackArgs_); - } + cl_device_id clDeviceId = as_cl(device_); + preDispatchCallBackFunc_(clDeviceId, oclEventHandle_, aqlPacket_, aclBinary_, + preDispatchCallBackArgs_); + } - // setup the trap handler information only if the debugger 
has been registered - if (isRegistered()) { - // Copy the various info set by the debugger/profiler to the tool info structure - setupTrapInformation(info); - } + // setup the trap handler information only if the debugger has been registered + if (isRegistered()) { + // Copy the various info set by the debugger/profiler to the tool info structure + setupTrapInformation(info); + } } -void -GpuDebugManager::executePostDispatchCallBack() -{ - if (nullptr != postDispatchCallBackFunc_) { - cl_device_id clDeviceId = as_cl(device_); - postDispatchCallBackFunc_(clDeviceId, - aqlPacket_->completion_signal.handle, - postDispatchCallBackArgs_); - } +void GpuDebugManager::executePostDispatchCallBack() { + if (nullptr != postDispatchCallBackFunc_) { + cl_device_id clDeviceId = as_cl(device_); + postDispatchCallBackFunc_(clDeviceId, aqlPacket_->completion_signal.handle, + postDispatchCallBackArgs_); + } } //! Map the kernel code for host access -void -GpuDebugManager::mapKernelCode(void* aqlCodeInfo) const -{ - AqlCodeInfo* codeInfo = reinterpret_cast(aqlCodeInfo); +void GpuDebugManager::mapKernelCode(void* aqlCodeInfo) const { + AqlCodeInfo* codeInfo = reinterpret_cast(aqlCodeInfo); - codeInfo->aqlCode_ = reinterpret_cast(aqlCodeAddr_); - codeInfo->aqlCodeSize_ = aqlCodeSize_; + codeInfo->aqlCode_ = reinterpret_cast(aqlCodeAddr_); + codeInfo->aqlCodeSize_ = aqlCodeSize_; } -cl_int -GpuDebugManager::registerDebugger(amd::Context* context, uintptr_t messageStorage) -{ - if (!device()->settings().enableHwDebug_) { - LogError("debugmanager: Register debugger error - HW DEBUG is not enable"); - return CL_DEBUGGER_REGISTER_FAILURE_AMD; - } +cl_int GpuDebugManager::registerDebugger(amd::Context* context, uintptr_t messageStorage) { + if (!device()->settings().enableHwDebug_) { + LogError("debugmanager: Register debugger error - HW DEBUG is not enable"); + return CL_DEBUGGER_REGISTER_FAILURE_AMD; + } - // first time register - set the message storage, flush queue and enable hw debug - 
if (!isRegistered()) { - debugMessages_ = messageStorage; - Unimplemented(); -/* - if (!device()->gslCtx()->registerHwDebugger(debugMessages_)) { - LogError("debugmanager: Register debugger failed"); - return CL_OUT_OF_RESOURCES; - } -*/ - isRegistered_ = true; - - if (CL_SUCCESS != createRuntimeTrapHandler()) { - LogError("debugmanager: Create runtime trap handler failed"); - return CL_OUT_OF_RESOURCES; - } - } - - context_ = context; - - return CL_SUCCESS; -} - -void -GpuDebugManager::unregisterDebugger() -{ - if (isRegistered()) { - // reset the debugger registration flag - isRegistered_ = false; - context_ = nullptr; - } -} - -void -GpuDebugManager::flushCache(uint32_t mask) -{ - HwDbgGpuCacheMask cacheMask(mask); - device()->xferQueue()->flushCuCaches(cacheMask); -} - - -void -GpuDebugManager::setupTrapInformation(DebugToolInfo* toolInfo) -{ - toolInfo->scratchAddress_ = 0; - toolInfo->scratchSize_ = 0; - toolInfo->globalAddress_ = 0; - toolInfo->sqPerfcounterEnable_ = false; - - // Set up trap related info in the kernel info structure to be - // used in the kernel dispatch. 
- toolInfo->exceptionMask_ = excpPolicy_.exceptionMask; - toolInfo->gpuSingleStepMode_ = execMode_.gpuSingleStepMode; - toolInfo->monitorMode_ = execMode_.monitorMode; - - // The order of these three bits is determined by the definition - // of the register COMPUTE_DISPATCH_INITIATOR - toolInfo->cacheDisableMask_ = ((execMode_.disableL1Scalar << 2) - | (execMode_.disableL2Cache << 1) - | (execMode_.disableL1Vector)); - - toolInfo->reservedCuNum_ = execMode_.reservedCuNum; - - toolInfo->trapHandler_ = rtTrapInfo_[kDebugTrapHandlerLocation]; - toolInfo->trapBuffer_ = rtTrapInfo_[kDebugTrapBufferLocation]; -} - -void -GpuDebugManager::getPacketAmdInfo( - const void* aqlCodeInfo, - void* packetInfo) const - -{ - const AqlCodeInfo* codeInfo = - reinterpret_cast(aqlCodeInfo); - - const amd_kernel_code_t* hostAqlCode = codeInfo->aqlCode_; - - PacketAmdInfo* packet = - reinterpret_cast(packetInfo); - - const amd_kernel_code_t* akc = hostAqlCode; - - packet->numberOfSgprs_ = akc->wavefront_sgpr_count; - packet->numberOfVgprs_ = akc->workitem_vgpr_count; - - // use mapped kernel_object_address for host accessing of ISA buffer - packet->pointerToIsaBuffer_ = (char*) (hostAqlCode) + - akc->kernel_code_entry_byte_offset; - - packet->scratchBufferWaveOffset_ = - akc->debug_wavefront_private_segment_offset_sgpr; - - packet->sizeOfIsaBuffer_ = codeInfo->aqlCodeSize_; - - packet->sizeOfStaticGroupMemory_ = akc->workgroup_group_segment_byte_size; - - // The trap_reserved_vgpr_index will be 4 less the original - // This value must be used only by the debugger - packet->trapReservedVgprIndex_ = akc->workitem_vgpr_count - NumberReserveVgprs; -} - -DebugEvent -GpuDebugManager::createDebugEvent( - const bool autoReset) -{ + // first time register - set the message storage, flush queue and enable hw debug + if (!isRegistered()) { + debugMessages_ = messageStorage; Unimplemented(); -/* - // create the event object - osEventHandle shaderEvent = osEventCreate(!autoReset); + /* + if 
(!device()->gslCtx()->registerHwDebugger(debugMessages_)) { + LogError("debugmanager: Register debugger failed"); + return CL_OUT_OF_RESOURCES; + } + */ + isRegistered_ = true; - // event object has been created, set the initial state - if (shaderEvent != 0) { - - osEventReset(shaderEvent); // initial state is non-signaled - - if (device()->gslCtx()->exceptionNotification(shaderEvent)) { - return shaderEvent; - } + if (CL_SUCCESS != createRuntimeTrapHandler()) { + LogError("debugmanager: Create runtime trap handler failed"); + return CL_OUT_OF_RESOURCES; } -*/ - return 0; + } + + context_ = context; + + return CL_SUCCESS; } -cl_int -GpuDebugManager::waitDebugEvent( - DebugEvent pEvent, - uint32_t timeOut) const -{ - Unimplemented(); -/* - if (osEventTimedWait(pEvent, timeOut)) { - return CL_SUCCESS; - } - else { - return CL_EVENT_TIMEOUT_AMD; - } -*/ - return CL_SUCCESS; +void GpuDebugManager::unregisterDebugger() { + if (isRegistered()) { + // reset the debugger registration flag + isRegistered_ = false; + context_ = nullptr; + } } -void -GpuDebugManager::destroyDebugEvent(DebugEvent* pEvent) -{ - Unimplemented(); -/* - osEventDestroy(*pEvent); - *pEvent = 0; - - device()->gslCtx()->exceptionNotification(0); -*/ +void GpuDebugManager::flushCache(uint32_t mask) { + HwDbgGpuCacheMask cacheMask(mask); + device()->xferQueue()->flushCuCaches(cacheMask); } -void -GpuDebugManager::wavefrontControl( - uint32_t waveAction, - uint32_t waveMode, - uint32_t trapId, - void* waveAddr) const -{ - Unimplemented(); - //device()->gslCtx()->executeSqCommand(waveAction, waveMode, trapId, waveAddr); + +void GpuDebugManager::setupTrapInformation(DebugToolInfo* toolInfo) { + toolInfo->scratchAddress_ = 0; + toolInfo->scratchSize_ = 0; + toolInfo->globalAddress_ = 0; + toolInfo->sqPerfcounterEnable_ = false; + + // Set up trap related info in the kernel info structure to be + // used in the kernel dispatch. 
+ toolInfo->exceptionMask_ = excpPolicy_.exceptionMask; + toolInfo->gpuSingleStepMode_ = execMode_.gpuSingleStepMode; + toolInfo->monitorMode_ = execMode_.monitorMode; + + // The order of these three bits is determined by the definition + // of the register COMPUTE_DISPATCH_INITIATOR + toolInfo->cacheDisableMask_ = ((execMode_.disableL1Scalar << 2) | + (execMode_.disableL2Cache << 1) | (execMode_.disableL1Vector)); + + toolInfo->reservedCuNum_ = execMode_.reservedCuNum; + + toolInfo->trapHandler_ = rtTrapInfo_[kDebugTrapHandlerLocation]; + toolInfo->trapBuffer_ = rtTrapInfo_[kDebugTrapBufferLocation]; } -void -GpuDebugManager::setAddressWatch( - uint32_t numWatchPoints, - void** watchAddress, - uint64_t* watchMask, - uint64_t* watchMode, - DebugEvent* event) +void GpuDebugManager::getPacketAmdInfo(const void* aqlCodeInfo, void* packetInfo) const + { - size_t requiredSize = numWatchPoints * sizeof(HwDbgAddressWatch); + const AqlCodeInfo* codeInfo = reinterpret_cast(aqlCodeInfo); - // previously allocated size is not big enough, allocate new memory - if (addressWatchSize_ < requiredSize) { - if (nullptr != addressWatch_) { // free the smaller address watch storage - delete [] addressWatch_; - } - addressWatch_ = new HwDbgAddressWatch[numWatchPoints]; - addressWatchSize_ = requiredSize; - } + const amd_kernel_code_t* hostAqlCode = codeInfo->aqlCode_; - // fill in the address watch structure - memset(addressWatch_, 0, addressWatchSize_); + PacketAmdInfo* packet = reinterpret_cast(packetInfo); - for (uint32_t i = 0; i < numWatchPoints; i++) - { - amd::Memory* watchMem = as_amd(reinterpret_cast(watchAddress[i])); - Memory* watchMemAddress = device()->getGpuMemory(watchMem); + const amd_kernel_code_t* akc = hostAqlCode; - addressWatch_[i].watchAddress_ = reinterpret_cast(watchMemAddress->vmAddress()); - addressWatch_[i].watchMask_ = watchMask[i]; - addressWatch_[i].watchMode_ = (cl_dbg_address_watch_mode_amd) watchMode[i]; - addressWatch_[i].event_ = (0 != event) ? 
event[i] : 0; - } + packet->numberOfSgprs_ = akc->wavefront_sgpr_count; + packet->numberOfVgprs_ = akc->workitem_vgpr_count; - Unimplemented(); - // setup the watch addresses - //device()->gslCtx()->setAddressWatch(numWatchPoints, (void*) addressWatch_); + // use mapped kernel_object_address for host accessing of ISA buffer + packet->pointerToIsaBuffer_ = (char*)(hostAqlCode) + akc->kernel_code_entry_byte_offset; + packet->scratchBufferWaveOffset_ = akc->debug_wavefront_private_segment_offset_sgpr; + + packet->sizeOfIsaBuffer_ = codeInfo->aqlCodeSize_; + + packet->sizeOfStaticGroupMemory_ = akc->workgroup_group_segment_byte_size; + + // The trap_reserved_vgpr_index will be 4 less the original + // This value must be used only by the debugger + packet->trapReservedVgprIndex_ = akc->workitem_vgpr_count - NumberReserveVgprs; } -void -GpuDebugManager::setGlobalMemory( - amd::Memory* memObj, - uint32_t offset, - void* srcPtr, - uint32_t size) -{ - Memory* globalMem = device()->getGpuMemory(memObj); +DebugEvent GpuDebugManager::createDebugEvent(const bool autoReset) { + Unimplemented(); + /* + // create the event object + osEventHandle shaderEvent = osEventCreate(!autoReset); - address mappedMem = static_cast
(globalMem->map(nullptr,0)); - assert(mappedMem != 0); + // event object has been created, set the initial state + if (shaderEvent != 0) { - void* dest_ptr = reinterpret_cast(mappedMem + offset); - memcpy(dest_ptr, srcPtr, size); + osEventReset(shaderEvent); // initial state is non-signaled - globalMem->unmap(nullptr); + if (device()->gslCtx()->exceptionNotification(shaderEvent)) { + return shaderEvent; + } + } + */ + return 0; } -cl_int -GpuDebugManager::createRuntimeTrapHandler() -{ - size_t codeSize = 0; - const uint32_t* rtTrapCode = nullptr; +cl_int GpuDebugManager::waitDebugEvent(DebugEvent pEvent, uint32_t timeOut) const { + Unimplemented(); + /* + if (osEventTimedWait(pEvent, timeOut)) { + return CL_SUCCESS; + } + else { + return CL_EVENT_TIMEOUT_AMD; + } + */ + return CL_SUCCESS; +} - if (device()->settings().viPlus_) { - codeSize = sizeof(RuntimeTrapCodeVi); - rtTrapCode = RuntimeTrapCodeVi; - } - else { - codeSize = sizeof(RuntimeTrapCode); - rtTrapCode = RuntimeTrapCode; +void GpuDebugManager::destroyDebugEvent(DebugEvent* pEvent) { + Unimplemented(); + /* + osEventDestroy(*pEvent); + *pEvent = 0; + + device()->gslCtx()->exceptionNotification(0); + */ +} + +void GpuDebugManager::wavefrontControl(uint32_t waveAction, uint32_t waveMode, uint32_t trapId, + void* waveAddr) const { + Unimplemented(); + // device()->gslCtx()->executeSqCommand(waveAction, waveMode, trapId, waveAddr); +} + +void GpuDebugManager::setAddressWatch(uint32_t numWatchPoints, void** watchAddress, + uint64_t* watchMask, uint64_t* watchMode, DebugEvent* event) { + size_t requiredSize = numWatchPoints * sizeof(HwDbgAddressWatch); + + // previously allocated size is not big enough, allocate new memory + if (addressWatchSize_ < requiredSize) { + if (nullptr != addressWatch_) { // free the smaller address watch storage + delete[] addressWatch_; } + addressWatch_ = new HwDbgAddressWatch[numWatchPoints]; + addressWatchSize_ = requiredSize; + } - uint32_t numCodes = codeSize / 
sizeof(uint32_t); + // fill in the address watch structure + memset(addressWatch_, 0, addressWatchSize_); - // Handle TMA corruption hw bug workaround - - // The trap handler buffer has extra 256 bytes allocated, the TMA address - // is stored in the first two DWORDs and the actual trap handler code - // is stored starting at the location of 256 bytes (TbaStartOffset). - // - // allocate memory for the runtime trap handler (TBA) + TMA address - uint32_t allocSize = codeSize + TbaStartOffset; + for (uint32_t i = 0; i < numWatchPoints; i++) { + amd::Memory* watchMem = as_amd(reinterpret_cast(watchAddress[i])); + Memory* watchMemAddress = device()->getGpuMemory(watchMem); - Memory* rtTBA = new Memory(*device(), allocSize); - runtimeTBA_ = rtTBA; + addressWatch_[i].watchAddress_ = reinterpret_cast(watchMemAddress->vmAddress()); + addressWatch_[i].watchMask_ = watchMask[i]; + addressWatch_[i].watchMode_ = (cl_dbg_address_watch_mode_amd)watchMode[i]; + addressWatch_[i].event_ = (0 != event) ? event[i] : 0; + } - if ((rtTBA == nullptr) || !rtTBA->create(Resource::RemoteUSWC)) { - return CL_OUT_OF_RESOURCES; - } - address tbaAddress = reinterpret_cast
(rtTBA->map(nullptr)); + Unimplemented(); + // setup the watch addresses + // device()->gslCtx()->setAddressWatch(numWatchPoints, (void*) addressWatch_); +} - // allocate buffer for the runtime trap handler buffer (TMA) - uint32_t tmaSize = 0x100; - Memory* rtTMA = new Memory(*device(), tmaSize); - runtimeTMA_ = rtTMA; +void GpuDebugManager::setGlobalMemory(amd::Memory* memObj, uint32_t offset, void* srcPtr, + uint32_t size) { + Memory* globalMem = device()->getGpuMemory(memObj); - if ((rtTMA == nullptr) || !rtTMA->create(Resource::RemoteUSWC)) { - return CL_OUT_OF_RESOURCES; - } + address mappedMem = static_cast
(globalMem->map(nullptr, 0)); + assert(mappedMem != 0); - uint64_t rtTmaAddress = rtTMA->vmAddress(); - if ((rtTBA->vmAddress() & 0xFF) != 0 || (rtTmaAddress & 0xFF) != 0) { - LogError("debugmanager: Trap handler/buffer is not 256-byte aligned"); - return CL_INVALID_VALUE; - } + void* dest_ptr = reinterpret_cast(mappedMem + offset); + memcpy(dest_ptr, srcPtr, size); - // store the TMA address at the beginning of trap handler buffer - uint64_t* tbaStorage = reinterpret_cast(tbaAddress); - tbaStorage[0] = rtTmaAddress; + globalMem->unmap(nullptr); +} - // save the trap handler code - uint32_t* trapHandlerPtr = (uint32_t*)(tbaAddress + TbaStartOffset); - for (uint32_t i = 0; i < numCodes; i++) { - trapHandlerPtr[i] = rtTrapCode[i]; - } +cl_int GpuDebugManager::createRuntimeTrapHandler() { + size_t codeSize = 0; + const uint32_t* rtTrapCode = nullptr; - rtTBA->unmap(nullptr); + if (device()->settings().viPlus_) { + codeSize = sizeof(RuntimeTrapCodeVi); + rtTrapCode = RuntimeTrapCodeVi; + } else { + codeSize = sizeof(RuntimeTrapCode); + rtTrapCode = RuntimeTrapCode; + } - return CL_SUCCESS; + uint32_t numCodes = codeSize / sizeof(uint32_t); + + // Handle TMA corruption hw bug workaround - + // The trap handler buffer has extra 256 bytes allocated, the TMA address + // is stored in the first two DWORDs and the actual trap handler code + // is stored starting at the location of 256 bytes (TbaStartOffset). + // + // allocate memory for the runtime trap handler (TBA) + TMA address + uint32_t allocSize = codeSize + TbaStartOffset; + + Memory* rtTBA = new Memory(*device(), allocSize); + runtimeTBA_ = rtTBA; + + if ((rtTBA == nullptr) || !rtTBA->create(Resource::RemoteUSWC)) { + return CL_OUT_OF_RESOURCES; + } + address tbaAddress = reinterpret_cast
(rtTBA->map(nullptr)); + + // allocate buffer for the runtime trap handler buffer (TMA) + uint32_t tmaSize = 0x100; + Memory* rtTMA = new Memory(*device(), tmaSize); + runtimeTMA_ = rtTMA; + + if ((rtTMA == nullptr) || !rtTMA->create(Resource::RemoteUSWC)) { + return CL_OUT_OF_RESOURCES; + } + + uint64_t rtTmaAddress = rtTMA->vmAddress(); + if ((rtTBA->vmAddress() & 0xFF) != 0 || (rtTmaAddress & 0xFF) != 0) { + LogError("debugmanager: Trap handler/buffer is not 256-byte aligned"); + return CL_INVALID_VALUE; + } + + // store the TMA address at the beginning of trap handler buffer + uint64_t* tbaStorage = reinterpret_cast(tbaAddress); + tbaStorage[0] = rtTmaAddress; + + // save the trap handler code + uint32_t* trapHandlerPtr = (uint32_t*)(tbaAddress + TbaStartOffset); + for (uint32_t i = 0; i < numCodes; i++) { + trapHandlerPtr[i] = rtTrapCode[i]; + } + + rtTBA->unmap(nullptr); + + return CL_SUCCESS; } } // namespace pal diff --git a/rocclr/runtime/device/pal/paldebugmanager.hpp b/rocclr/runtime/device/pal/paldebugmanager.hpp index 52a131f2b3..1b148a99da 100644 --- a/rocclr/runtime/device/pal/paldebugmanager.hpp +++ b/rocclr/runtime/device/pal/paldebugmanager.hpp @@ -28,87 +28,79 @@ class Memory; * */ class GpuDebugManager : public amd::HwDebugManager { -public: + public: + //! Constructor of the debug manager class + GpuDebugManager(amd::Device* device); - //! Constructor of the debug manager class - GpuDebugManager(amd::Device* device); + //! Destructor of the debug manager class + ~GpuDebugManager(); - //! Destructor of the debug manager class - ~GpuDebugManager(); + //! Get the single instance of the GpuDebugManager class + static GpuDebugManager* getDefaultInstance(); - //! Get the single instance of the GpuDebugManager class - static GpuDebugManager* getDefaultInstance(); + //! Destroy the GpuDebugManager class object + static void destroyInstances(); - //! Destroy the GpuDebugManager class object - static void destroyInstances(); + //! 
Flush cache + void flushCache(uint32_t mask); - //! Flush cache - void flushCache(uint32_t mask); + //! Create the debug event + DebugEvent createDebugEvent(const bool autoReset); - //! Create the debug event - DebugEvent createDebugEvent(const bool autoReset); + //! Wait for the debug event + cl_int waitDebugEvent(DebugEvent pEvent, uint32_t timeOut) const; - //! Wait for the debug event - cl_int waitDebugEvent(DebugEvent pEvent, uint32_t timeOut) const; + //! Destroy the debug event + void destroyDebugEvent(DebugEvent* pEvent); - //! Destroy the debug event - void destroyDebugEvent(DebugEvent* pEvent); + //! Register the debugger + cl_int registerDebugger(amd::Context* context, uintptr_t messageStorage); - //! Register the debugger - cl_int registerDebugger(amd::Context*context, uintptr_t messageStorage); + //! Unregister the debugger + void unregisterDebugger(); - //! Unregister the debugger - void unregisterDebugger(); + //! Send the wavefront control cmmand + void wavefrontControl(uint32_t waveAction, uint32_t waveMode, uint32_t trapId, + void* waveAddr) const; - //! Send the wavefront control cmmand - void wavefrontControl(uint32_t waveAction, - uint32_t waveMode, - uint32_t trapId, - void* waveAddr) const; + //! Set address watching point + void setAddressWatch(uint32_t numWatchPoints, void** watchAddress, uint64_t* watchMask, + uint64_t* watchMode, DebugEvent* pEvent); - //! Set address watching point - void setAddressWatch(uint32_t numWatchPoints, - void** watchAddress, - uint64_t* watchMask, - uint64_t* watchMode, - DebugEvent* pEvent); + //! Map the kernel code for host access + void mapKernelCode(void* aqlCodeInfo) const; - //! Map the kernel code for host access - void mapKernelCode(void* aqlCodeInfo) const; + //! Get the packet information for dispatch + void getPacketAmdInfo(const void* aqlCodeInfo, void* packetInfo) const; - //! Get the packet information for dispatch - void getPacketAmdInfo(const void* aqlCodeInfo, void* packetInfo) const; + //! 
Set global memory values + void setGlobalMemory(amd::Memory* memObj, uint32_t offset, void* srcPtr, uint32_t size); - //! Set global memory values - void setGlobalMemory(amd::Memory* memObj, uint32_t offset, void* srcPtr, uint32_t size); + //! Execute the post-dispatch callback function + void executePostDispatchCallBack(); - //! Execute the post-dispatch callback function - void executePostDispatchCallBack(); + //! Execute the pre-dispatch callback function + void executePreDispatchCallBack(void* aqlPacket, void* toolInfo); - //! Execute the pre-dispatch callback function - void executePreDispatchCallBack(void* aqlPacket, - void* toolInfo); + protected: + const VirtualGPU* vGpu() const { return vGpu_; } -protected: - const VirtualGPU* vGpu() const { return vGpu_; } + private: + //! Setup trap handler info for kernel execution + void setupTrapInformation(DebugToolInfo* toolInfo); -private: - //! Setup trap handler info for kernel execution - void setupTrapInformation(DebugToolInfo* toolInfo); + //! Create runtime trap handler + cl_int createRuntimeTrapHandler(); - //! Create runtime trap handler - cl_int createRuntimeTrapHandler(); + const pal::Device* device() const { return reinterpret_cast(device_); } - const pal::Device* device() const { - return reinterpret_cast(device_); } - - VirtualGPU* vGpu_; //!< the virtual GPU - uintptr_t debugMessages_; //!< Pointer to a SHARED_DEBUG_MESSAGES pass to the KMD - HwDbgAddressWatch* addressWatch_; //!< Address watch data - size_t addressWatchSize_; //!< Size of address watch data - //! Arguments used by the callback function - void* oclEventHandle_; //!< event handler - const hsa_kernel_dispatch_packet_t* aqlPacket_; //!< AQL packet + VirtualGPU* vGpu_; //!< the virtual GPU + uintptr_t debugMessages_; //!< Pointer to a SHARED_DEBUG_MESSAGES pass to the KMD + HwDbgAddressWatch* addressWatch_; //!< Address watch data + size_t addressWatchSize_; //!< Size of address watch data + //! 
Arguments used by the callback function + void* oclEventHandle_; //!< event handler + const hsa_kernel_dispatch_packet_t* aqlPacket_; //!< AQL packet }; } // namespace pal diff --git a/rocclr/runtime/device/pal/paldefs.hpp b/rocclr/runtime/device/pal/paldefs.hpp index ab74566de9..3a79785a67 100644 --- a/rocclr/runtime/device/pal/paldefs.hpp +++ b/rocclr/runtime/device/pal/paldefs.hpp @@ -13,56 +13,49 @@ /// Memory Object Type // enum PalGpuMemoryType { - PAL_DEPTH_BUFFER = 0, ///< Depth Buffer - PAL_BUFFER, ///< Pure buffer - PAL_TEXTURE_3D, ///< 3D texture - PAL_TEXTURE_2D, ///< 2D texture - PAL_TEXTURE_1D, ///< 1D texture - PAL_TEXTURE_1D_ARRAY, ///< 1D Array texture - PAL_TEXTURE_2D_ARRAY, ///< 2D Array texture - PAL_TEXTURE_BUFFER, ///< "buffer" texture inside VBO + PAL_DEPTH_BUFFER = 0, ///< Depth Buffer + PAL_BUFFER, ///< Pure buffer + PAL_TEXTURE_3D, ///< 3D texture + PAL_TEXTURE_2D, ///< 2D texture + PAL_TEXTURE_1D, ///< 1D texture + PAL_TEXTURE_1D_ARRAY, ///< 1D Array texture + PAL_TEXTURE_2D_ARRAY, ///< 2D Array texture + PAL_TEXTURE_BUFFER, ///< "buffer" texture inside VBO }; -struct HwDbgKernelInfo -{ - uint64_t scratchBufAddr; ///< Handle of GPU local memory for kernel private scratch space - size_t scratchBufferSizeInBytes; ///< size of memory pointed to by pScratchBuffer, - uint64_t heapBufAddr; ///< Address of the global heap base - const void* pAqlDispatchPacket; ///< Pointer to the dipatch packet - const void* pAqlQueuePtr; ///< pointer to the AQL Queue - void* trapHandler; ///< address of the trap handler (TBA) - void* trapHandlerBuffer; ///< address of the trap handler buffer (TMA) - uint32_t excpEn; ///< excecption mask - bool trapPresent; ///< trap present flag - bool sqDebugMode; ///< debug mode flag (GPU single step mode) - uint32_t mgmtSe0Mask; ///< mask for SE0 (reserving CU for display) - uint32_t mgmtSe1Mask; ///< mask for SE1 (reserving CU for display) - uint32_t cacheDisableMask; ///< cache disable mask +struct HwDbgKernelInfo { + 
uint64_t scratchBufAddr; ///< Handle of GPU local memory for kernel private scratch space + size_t scratchBufferSizeInBytes; ///< size of memory pointed to by pScratchBuffer, + uint64_t heapBufAddr; ///< Address of the global heap base + const void* pAqlDispatchPacket; ///< Pointer to the dipatch packet + const void* pAqlQueuePtr; ///< pointer to the AQL Queue + void* trapHandler; ///< address of the trap handler (TBA) + void* trapHandlerBuffer; ///< address of the trap handler buffer (TMA) + uint32_t excpEn; ///< excecption mask + bool trapPresent; ///< trap present flag + bool sqDebugMode; ///< debug mode flag (GPU single step mode) + uint32_t mgmtSe0Mask; ///< mask for SE0 (reserving CU for display) + uint32_t mgmtSe1Mask; ///< mask for SE1 (reserving CU for display) + uint32_t cacheDisableMask; ///< cache disable mask }; //! Engine types -enum EngineType -{ - MainEngine = 0, - SdmaEngine, - AllEngines -}; +enum EngineType { MainEngine = 0, SdmaEngine, AllEngines }; -struct GpuEvent -{ - static const unsigned int InvalidID = ((1<<30) - 1); +struct GpuEvent { + static const unsigned int InvalidID = ((1 << 30) - 1); - EngineType engineId_; ///< type of the id - unsigned int id; ///< actual event id + EngineType engineId_; ///< type of the id + unsigned int id; ///< actual event id - //! GPU event default constructor - GpuEvent(): engineId_(MainEngine), id(InvalidID) {} + //! GPU event default constructor + GpuEvent() : engineId_(MainEngine), id(InvalidID) {} - //! Returns true if the current event is valid - bool isValid() const { return (id != InvalidID) ? true : false; } + //! Returns true if the current event is valid + bool isValid() const { return (id != InvalidID) ? true : false; } - //! Set invalid event id - void invalidate() { id = InvalidID; } + //! Set invalid event id + void invalidate() { id = InvalidID; } }; /*! \addtogroup PAL @@ -76,9 +69,9 @@ namespace pal { //! 
Maximum number of the supported global atomic counters const static uint MaxAtomicCounters = 8; //! Maximum number of the supported samplers -const static uint MaxSamplers = 16; +const static uint MaxSamplers = 16; //! Maximum number of supported read images -const static uint MaxReadImage = 128; +const static uint MaxReadImage = 128; //! Maximum number of supported write images const static uint MaxWriteImage = 8; //! Maximum number of supported read/write images for OCL20 @@ -111,70 +104,70 @@ const static uint HsaSamplerObjectAlignment = 16; const static uint DeviceQueueMaskSize = 32; struct AMDDeviceInfo { - const char* targetName_; //!< Target name - const char* machineTarget_; //!< Machine target - uint simdPerCU_; //!< Number of SIMDs per CU - uint simdWidth_; //!< Number of workitems processed per SIMD - uint simdInstructionWidth_; //!< Number of instructions processed per SIMD - uint memChannelBankWidth_; //!< Memory channel bank width - uint localMemSizePerCU_; //!< Local memory size per CU - uint localMemBanks_; //!< Number of banks of local memory - uint gfxipVersion_; //!< The core engine GFXIP version + const char* targetName_; //!< Target name + const char* machineTarget_; //!< Machine target + uint simdPerCU_; //!< Number of SIMDs per CU + uint simdWidth_; //!< Number of workitems processed per SIMD + uint simdInstructionWidth_; //!< Number of instructions processed per SIMD + uint memChannelBankWidth_; //!< Memory channel bank width + uint localMemSizePerCU_; //!< Local memory size per CU + uint localMemBanks_; //!< Number of banks of local memory + uint gfxipVersion_; //!< The core engine GFXIP version }; static const AMDDeviceInfo DeviceInfo[] = { -/* Unknown */ { "", "unknown", 4, 16, 1, 256, 64 * Ki, 32, 0 }, -/* Tahiti */ { "", "tahiti", 4, 16, 1, 256, 64 * Ki, 32, 600 }, -/* Pitcairn */ { "", "pitcairn", 4, 16, 1, 256, 64 * Ki, 32, 600 }, -/* Capeverde */ { "", "bonaire", 4, 16, 1, 256, 64 * Ki, 32, 700 }, -/* Oland */ { "", "oland", 4, 16, 
1, 256, 64 * Ki, 32, 600 }, -/* Hainan */ { "", "hainan", 4, 16, 1, 256, 64 * Ki, 32, 600 }, + /* Unknown */ {"", "unknown", 4, 16, 1, 256, 64 * Ki, 32, 0}, + /* Tahiti */ {"", "tahiti", 4, 16, 1, 256, 64 * Ki, 32, 600}, + /* Pitcairn */ {"", "pitcairn", 4, 16, 1, 256, 64 * Ki, 32, 600}, + /* Capeverde */ {"", "bonaire", 4, 16, 1, 256, 64 * Ki, 32, 700}, + /* Oland */ {"", "oland", 4, 16, 1, 256, 64 * Ki, 32, 600}, + /* Hainan */ {"", "hainan", 4, 16, 1, 256, 64 * Ki, 32, 600}, -/* Bonaire */ { "Bonaire", "bonaire", 4, 16, 1, 256, 64 * Ki, 32, 700 }, -/* Hawaii */ { "Hawaii", "hawaii", 4, 16, 1, 256, 64 * Ki, 32, 701 }, -/* Hawaii */ { "", "grenada", 4, 16, 1, 256, 64 * Ki, 32, 701 }, -/* Hawaii */ { "", "maui", 4, 16, 1, 256, 64 * Ki, 32, 701 }, + /* Bonaire */ {"Bonaire", "bonaire", 4, 16, 1, 256, 64 * Ki, 32, 700}, + /* Hawaii */ {"Hawaii", "hawaii", 4, 16, 1, 256, 64 * Ki, 32, 701}, + /* Hawaii */ {"", "grenada", 4, 16, 1, 256, 64 * Ki, 32, 701}, + /* Hawaii */ {"", "maui", 4, 16, 1, 256, 64 * Ki, 32, 701}, -/* Kalindi */ { "Kalindi", "kalindi", 4, 16, 1, 256, 64 * Ki, 32, 702 }, -/* Godavari */ { "Mullins", "mullins", 4, 16, 1, 256, 64 * Ki, 32, 702 }, -/* Spectre */ { "Spectre", "spectre", 4, 16, 1, 256, 64 * Ki, 32, 701 }, -/* Spooky */ { "Spooky", "spooky", 4, 16, 1, 256, 64 * Ki, 32, 701 }, + /* Kalindi */ {"Kalindi", "kalindi", 4, 16, 1, 256, 64 * Ki, 32, 702}, + /* Godavari */ {"Mullins", "mullins", 4, 16, 1, 256, 64 * Ki, 32, 702}, + /* Spectre */ {"Spectre", "spectre", 4, 16, 1, 256, 64 * Ki, 32, 701}, + /* Spooky */ {"Spooky", "spooky", 4, 16, 1, 256, 64 * Ki, 32, 701}, -/* Carrizo */ { "Carrizo" , "carrizo", 4, 16, 1, 256, 64 * Ki, 32, 801 }, -/* Bristol */ { "Bristol Ridge" , "carrizo", 4, 16, 1, 256, 64 * Ki, 32, 801 }, -/* Stoney */ { "Stoney", "stoney", 4, 16, 1, 256, 64 * Ki, 32, 810 }, + /* Carrizo */ {"Carrizo", "carrizo", 4, 16, 1, 256, 64 * Ki, 32, 801}, + /* Bristol */ {"Bristol Ridge", "carrizo", 4, 16, 1, 256, 64 * Ki, 32, 801}, + /* 
Stoney */ {"Stoney", "stoney", 4, 16, 1, 256, 64 * Ki, 32, 810}, -/* Iceland */ { "Iceland", "iceland", 4, 16, 1, 256, 64 * Ki, 32, 800 }, -/* Tonga */ { "Tonga", "tonga", 4, 16, 1, 256, 64 * Ki, 32, 800 }, -/* Fiji */ { "Fiji", "fiji", 4, 16, 1, 256, 64 * Ki, 32, 804 }, -/* Ellesmere */ { "Ellesmere", "ellesmere",4, 16, 1, 256, 64 * Ki, 32, 804 }, -/* Baffin */ { "Baffin", "baffin", 4, 16, 1, 256, 64 * Ki, 32, 804 }, + /* Iceland */ {"Iceland", "iceland", 4, 16, 1, 256, 64 * Ki, 32, 800}, + /* Tonga */ {"Tonga", "tonga", 4, 16, 1, 256, 64 * Ki, 32, 800}, + /* Fiji */ {"Fiji", "fiji", 4, 16, 1, 256, 64 * Ki, 32, 804}, + /* Ellesmere */ {"Ellesmere", "ellesmere", 4, 16, 1, 256, 64 * Ki, 32, 804}, + /* Baffin */ {"Baffin", "baffin", 4, 16, 1, 256, 64 * Ki, 32, 804}, }; // The GfxIpDeviceInfo table must match with GfxIpLevel enum // (located in //depot/stg/pal/inc/core/palDevice.h). static const AMDDeviceInfo GfxIpDeviceInfo[] = { -/* Unknown */ { "unknown", "unknown", 4, 16, 1, 256, 64 * Ki, 32, 000 }, -/* GFX600 */ { "gfx600", "gfx600", 4, 16, 1, 256, 64 * Ki, 32, 600 }, -/* GFX700 */ { "gfx700", "gfx700", 4, 16, 1, 256, 64 * Ki, 32, 700 }, -/* GFX800 */ { "gfx800", "gfx800", 4, 16, 1, 256, 64 * Ki, 32, 800 }, -/* GFX801 */ { "gfx801", "gfx801", 4, 16, 1, 256, 64 * Ki, 32, 801 }, -/* GFX900 */ { "gfx900", "gfx900", 4, 16, 1, 256, 64 * Ki, 32, 900 }, + /* Unknown */ {"unknown", "unknown", 4, 16, 1, 256, 64 * Ki, 32, 000}, + /* GFX600 */ {"gfx600", "gfx600", 4, 16, 1, 256, 64 * Ki, 32, 600}, + /* GFX700 */ {"gfx700", "gfx700", 4, 16, 1, 256, 64 * Ki, 32, 700}, + /* GFX800 */ {"gfx800", "gfx800", 4, 16, 1, 256, 64 * Ki, 32, 800}, + /* GFX801 */ {"gfx801", "gfx801", 4, 16, 1, 256, 64 * Ki, 32, 801}, + /* GFX900 */ {"gfx900", "gfx900", 4, 16, 1, 256, 64 * Ki, 32, 900}, }; static const AMDDeviceInfo Gfx901DeviceInfo = -/* GFX901 */ { "gfx901", "gfx901", 4, 16, 1, 256, 64 * Ki, 32, 901 }; + /* GFX901 */ {"gfx901", "gfx901", 4, 16, 1, 256, 64 * Ki, 32, 901}; enum gfx_handle 
{ - gfx700 = 700, - gfx701 = 701, - gfx702 = 702, - gfx800 = 800, - gfx801 = 801, - gfx804 = 804, - gfx810 = 810, - gfx900 = 900, - gfx901 = 901 + gfx700 = 700, + gfx701 = 701, + gfx702 = 702, + gfx800 = 800, + gfx801 = 801, + gfx804 = 804, + gfx810 = 810, + gfx900 = 900, + gfx901 = 901 }; static const char* Gfx700 = "AMD:AMDGPU:7:0:0"; @@ -187,428 +180,420 @@ static const char* Gfx900 = "AMD:AMDGPU:9:0:0"; static const char* Gfx901 = "AMD:AMDGPU:9:0:1"; // Supported OpenCL versions -enum OclVersion { - OpenCL10, - OpenCL11, - OpenCL12, - OpenCL20 -}; +enum OclVersion { OpenCL10, OpenCL11, OpenCL12, OpenCL20 }; struct MemoryFormat { - cl_image_format clFormat_; //!< CL image format - Pal::ChNumFormat palFormat_; //!< PAL image format - Pal::ChannelMapping palChannel_; //!< PAL channel mapping + cl_image_format clFormat_; //!< CL image format + Pal::ChNumFormat palFormat_; //!< PAL image format + Pal::ChannelMapping palChannel_; //!< PAL channel mapping }; -static const MemoryFormat -MemoryFormatMap[] = { +static const MemoryFormat MemoryFormatMap[] = { // R - { { CL_R, CL_UNORM_INT8 }, - Pal::ChNumFormat::X8_Unorm, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Zero, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, - { { CL_R, CL_UNORM_INT16 }, - Pal::ChNumFormat::X16_Unorm, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Zero, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, + {{CL_R, CL_UNORM_INT8}, + Pal::ChNumFormat::X8_Unorm, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::One}}, + {{CL_R, CL_UNORM_INT16}, + Pal::ChNumFormat::X16_Unorm, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::One}}, - { { CL_R, CL_SNORM_INT8 }, - Pal::ChNumFormat::X8_Snorm, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Zero, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, - { { CL_R, CL_SNORM_INT16 }, - Pal::ChNumFormat::X16_Snorm, - { 
Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Zero, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, + {{CL_R, CL_SNORM_INT8}, + Pal::ChNumFormat::X8_Snorm, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::One}}, + {{CL_R, CL_SNORM_INT16}, + Pal::ChNumFormat::X16_Snorm, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::One}}, - { { CL_R, CL_SIGNED_INT8 }, - Pal::ChNumFormat::X8_Sint, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Zero, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, - { { CL_R, CL_SIGNED_INT16 }, - Pal::ChNumFormat::X16_Sint, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Zero, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, - { { CL_R, CL_SIGNED_INT32 }, - Pal::ChNumFormat::X32_Sint, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Zero, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, - { { CL_R, CL_UNSIGNED_INT8 }, - Pal::ChNumFormat::X8_Uint, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Zero, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, - { { CL_R, CL_UNSIGNED_INT16 }, - Pal::ChNumFormat::X16_Uint, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Zero, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, - { { CL_R, CL_UNSIGNED_INT32 }, - Pal::ChNumFormat::X32_Uint, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Zero, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, + {{CL_R, CL_SIGNED_INT8}, + Pal::ChNumFormat::X8_Sint, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::One}}, + {{CL_R, CL_SIGNED_INT16}, + Pal::ChNumFormat::X16_Sint, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::One}}, + {{CL_R, CL_SIGNED_INT32}, + Pal::ChNumFormat::X32_Sint, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + 
Pal::ChannelSwizzle::One}}, + {{CL_R, CL_UNSIGNED_INT8}, + Pal::ChNumFormat::X8_Uint, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::One}}, + {{CL_R, CL_UNSIGNED_INT16}, + Pal::ChNumFormat::X16_Uint, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::One}}, + {{CL_R, CL_UNSIGNED_INT32}, + Pal::ChNumFormat::X32_Uint, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::One}}, - { { CL_R, CL_HALF_FLOAT }, - Pal::ChNumFormat::X16_Float, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Zero, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, - { { CL_R, CL_FLOAT }, - Pal::ChNumFormat::X32_Float, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Zero, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, + {{CL_R, CL_HALF_FLOAT}, + Pal::ChNumFormat::X16_Float, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::One}}, + {{CL_R, CL_FLOAT}, + Pal::ChNumFormat::X32_Float, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::One}}, // A - { { CL_A, CL_UNORM_INT8 }, - Pal::ChNumFormat::X8_Unorm, - { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::X } }, - { { CL_A, CL_UNORM_INT16 }, - Pal::ChNumFormat::X16_Unorm, - { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::X } }, + {{CL_A, CL_UNORM_INT8}, + Pal::ChNumFormat::X8_Unorm, + {Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::X}}, + {{CL_A, CL_UNORM_INT16}, + Pal::ChNumFormat::X16_Unorm, + {Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::X}}, - { { CL_A, CL_SNORM_INT8 }, - Pal::ChNumFormat::X8_Snorm, - { Pal::ChannelSwizzle::Zero, 
Pal::ChannelSwizzle::Zero, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::X } }, - { { CL_A, CL_SNORM_INT16 }, - Pal::ChNumFormat::X16_Snorm, - { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::X } }, + {{CL_A, CL_SNORM_INT8}, + Pal::ChNumFormat::X8_Snorm, + {Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::X}}, + {{CL_A, CL_SNORM_INT16}, + Pal::ChNumFormat::X16_Snorm, + {Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::X}}, - { { CL_A, CL_SIGNED_INT8 }, - Pal::ChNumFormat::X8_Sint, - { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::X } }, - { { CL_A, CL_SIGNED_INT16 }, - Pal::ChNumFormat::X16_Sint, - { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::X } }, - { { CL_A, CL_SIGNED_INT32}, - Pal::ChNumFormat::X32_Sint, - { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::X } }, - { { CL_A, CL_UNSIGNED_INT8 }, - Pal::ChNumFormat::X8_Uint, - { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::X } }, - { { CL_A, CL_UNSIGNED_INT16 }, - Pal::ChNumFormat::X16_Uint, - { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::X } }, - { { CL_A, CL_UNSIGNED_INT32}, - Pal::ChNumFormat::X32_Uint, - { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::X } }, + {{CL_A, CL_SIGNED_INT8}, + Pal::ChNumFormat::X8_Sint, + {Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::X}}, + {{CL_A, CL_SIGNED_INT16}, + Pal::ChNumFormat::X16_Sint, + {Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::X}}, + 
{{CL_A, CL_SIGNED_INT32}, + Pal::ChNumFormat::X32_Sint, + {Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::X}}, + {{CL_A, CL_UNSIGNED_INT8}, + Pal::ChNumFormat::X8_Uint, + {Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::X}}, + {{CL_A, CL_UNSIGNED_INT16}, + Pal::ChNumFormat::X16_Uint, + {Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::X}}, + {{CL_A, CL_UNSIGNED_INT32}, + Pal::ChNumFormat::X32_Uint, + {Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::X}}, - { { CL_A, CL_HALF_FLOAT }, - Pal::ChNumFormat::X16_Float, - { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::X } }, - { { CL_A, CL_FLOAT }, - Pal::ChNumFormat::X32_Float, - { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::X } }, + {{CL_A, CL_HALF_FLOAT}, + Pal::ChNumFormat::X16_Float, + {Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::X}}, + {{CL_A, CL_FLOAT}, + Pal::ChNumFormat::X32_Float, + {Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::X}}, // RG - { { CL_RG, CL_UNORM_INT8 }, - Pal::ChNumFormat::X8Y8_Unorm, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, - { { CL_RG, CL_UNORM_INT16 }, - Pal::ChNumFormat::X16Y16_Unorm, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, + {{CL_RG, CL_UNORM_INT8}, + Pal::ChNumFormat::X8Y8_Unorm, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::One}}, + {{CL_RG, CL_UNORM_INT16}, + Pal::ChNumFormat::X16Y16_Unorm, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, 
Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::One}}, - { { CL_RG, CL_SNORM_INT8 }, - Pal::ChNumFormat::X8Y8_Snorm, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, - { { CL_RG, CL_SNORM_INT16 }, - Pal::ChNumFormat::X16Y16_Snorm, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, + {{CL_RG, CL_SNORM_INT8}, + Pal::ChNumFormat::X8Y8_Snorm, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::One}}, + {{CL_RG, CL_SNORM_INT16}, + Pal::ChNumFormat::X16Y16_Snorm, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::One}}, - { { CL_RG, CL_SIGNED_INT8 }, - Pal::ChNumFormat::X8Y8_Sint, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, - { { CL_RG, CL_SIGNED_INT16 }, - Pal::ChNumFormat::X16Y16_Sint, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, - { { CL_RG, CL_SIGNED_INT32}, - Pal::ChNumFormat::X32Y32_Sint, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, - { { CL_RG, CL_UNSIGNED_INT8 }, - Pal::ChNumFormat::X8Y8_Uint, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, - { { CL_RG, CL_UNSIGNED_INT16 }, - Pal::ChNumFormat::X16Y16_Uint, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, - { { CL_RG, CL_UNSIGNED_INT32}, - Pal::ChNumFormat::X32Y32_Uint, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, + {{CL_RG, CL_SIGNED_INT8}, + Pal::ChNumFormat::X8Y8_Sint, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::One}}, + {{CL_RG, CL_SIGNED_INT16}, + 
Pal::ChNumFormat::X16Y16_Sint, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::One}}, + {{CL_RG, CL_SIGNED_INT32}, + Pal::ChNumFormat::X32Y32_Sint, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::One}}, + {{CL_RG, CL_UNSIGNED_INT8}, + Pal::ChNumFormat::X8Y8_Uint, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::One}}, + {{CL_RG, CL_UNSIGNED_INT16}, + Pal::ChNumFormat::X16Y16_Uint, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::One}}, + {{CL_RG, CL_UNSIGNED_INT32}, + Pal::ChNumFormat::X32Y32_Uint, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::One}}, - { { CL_RG, CL_HALF_FLOAT }, - Pal::ChNumFormat::X16Y16_Float, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, - { { CL_RG, CL_FLOAT }, - Pal::ChNumFormat::X32Y32_Float, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, -/* - // RA - { { CL_RA, CL_UNORM_INT8 }, - { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG8 } }, - { { CL_RA, CL_UNORM_INT16 }, - { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG16 } }, + {{CL_RG, CL_HALF_FLOAT}, + Pal::ChNumFormat::X16Y16_Float, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::One}}, + {{CL_RG, CL_FLOAT}, + Pal::ChNumFormat::X32Y32_Float, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::One}}, + /* + // RA + { { CL_RA, CL_UNORM_INT8 }, + { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG8 } }, + { { CL_RA, CL_UNORM_INT16 }, + { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG16 } }, - { { CL_RA, CL_SNORM_INT8 }, - { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sRG8 } }, - { { CL_RA, CL_SNORM_INT16 }, - { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sUV16 } }, + { 
{ CL_RA, CL_SNORM_INT8 }, + { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sRG8 } }, + { { CL_RA, CL_SNORM_INT16 }, + { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sUV16 } }, - { { CL_RA, CL_SIGNED_INT8 }, - { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sRG8I } }, - { { CL_RA, CL_SIGNED_INT16 }, - { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sRG16I } }, - { { CL_RA, CL_SIGNED_INT32}, - { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sRG32I } }, - { { CL_RA, CL_UNSIGNED_INT8 }, - { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG8I } }, - { { CL_RA, CL_UNSIGNED_INT16 }, - { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG16I } }, - { { CL_RA, CL_UNSIGNED_INT32}, - { GSL_CHANNEL_ORDER_RA , CM_SURF_FMT_RG32I } }, + { { CL_RA, CL_SIGNED_INT8 }, + { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sRG8I } }, + { { CL_RA, CL_SIGNED_INT16 }, + { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sRG16I } }, + { { CL_RA, CL_SIGNED_INT32}, + { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sRG32I } }, + { { CL_RA, CL_UNSIGNED_INT8 }, + { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG8I } }, + { { CL_RA, CL_UNSIGNED_INT16 }, + { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG16I } }, + { { CL_RA, CL_UNSIGNED_INT32}, + { GSL_CHANNEL_ORDER_RA , CM_SURF_FMT_RG32I } }, - { { CL_RA, CL_HALF_FLOAT }, - { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG16F } }, - { { CL_RA, CL_FLOAT }, - { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG32F } }, -*/ + { { CL_RA, CL_HALF_FLOAT }, + { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG16F } }, + { { CL_RA, CL_FLOAT }, + { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG32F } }, + */ // RGB - { { CL_RGB, CL_UNORM_INT_101010 }, - Pal::ChNumFormat::X10Y10Z10W2_Unorm, - { Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::One } }, - { { CL_RGB, CL_UNSIGNED_INT8 }, // This is used only by blit kernel - Pal::ChNumFormat::X8Y8Z8W8_Uint, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::One } }, + {{CL_RGB, CL_UNORM_INT_101010}, + Pal::ChNumFormat::X10Y10Z10W2_Unorm, + {Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::Y, 
Pal::ChannelSwizzle::X, + Pal::ChannelSwizzle::One}}, + {{CL_RGB, CL_UNSIGNED_INT8}, // This is used only by blit kernel + Pal::ChNumFormat::X8Y8Z8W8_Uint, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Z, + Pal::ChannelSwizzle::One}}, // RGBA - { { CL_RGBA, CL_UNORM_INT8 }, - Pal::ChNumFormat::X8Y8Z8W8_Unorm, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::W } }, - { { CL_RGBA, CL_UNORM_INT16 }, - Pal::ChNumFormat::X16Y16Z16W16_Unorm, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::W } }, + {{CL_RGBA, CL_UNORM_INT8}, + Pal::ChNumFormat::X8Y8Z8W8_Unorm, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Z, + Pal::ChannelSwizzle::W}}, + {{CL_RGBA, CL_UNORM_INT16}, + Pal::ChNumFormat::X16Y16Z16W16_Unorm, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Z, + Pal::ChannelSwizzle::W}}, - { { CL_RGBA, CL_SNORM_INT8 }, - Pal::ChNumFormat::X8Y8Z8W8_Snorm, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::W } }, - { { CL_RGBA, CL_SNORM_INT16 }, - Pal::ChNumFormat::X16Y16Z16W16_Snorm, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::W } }, + {{CL_RGBA, CL_SNORM_INT8}, + Pal::ChNumFormat::X8Y8Z8W8_Snorm, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Z, + Pal::ChannelSwizzle::W}}, + {{CL_RGBA, CL_SNORM_INT16}, + Pal::ChNumFormat::X16Y16Z16W16_Snorm, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Z, + Pal::ChannelSwizzle::W}}, - { { CL_RGBA, CL_SIGNED_INT8 }, - Pal::ChNumFormat::X8Y8Z8W8_Sint, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::W } }, - { { CL_RGBA, CL_SIGNED_INT16 }, - Pal::ChNumFormat::X16Y16Z16W16_Sint, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Z, 
Pal::ChannelSwizzle::W } }, - { { CL_RGBA, CL_SIGNED_INT32 }, - Pal::ChNumFormat::X32Y32Z32W32_Sint, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::W } }, - { { CL_RGBA, CL_UNSIGNED_INT8 }, - Pal::ChNumFormat::X8Y8Z8W8_Uint, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::W } }, - { { CL_RGBA, CL_UNSIGNED_INT16 }, - Pal::ChNumFormat::X16Y16Z16W16_Uint, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::W } }, - { { CL_RGBA, CL_UNSIGNED_INT32}, - Pal::ChNumFormat::X32Y32Z32W32_Uint, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::W } }, + {{CL_RGBA, CL_SIGNED_INT8}, + Pal::ChNumFormat::X8Y8Z8W8_Sint, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Z, + Pal::ChannelSwizzle::W}}, + {{CL_RGBA, CL_SIGNED_INT16}, + Pal::ChNumFormat::X16Y16Z16W16_Sint, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Z, + Pal::ChannelSwizzle::W}}, + {{CL_RGBA, CL_SIGNED_INT32}, + Pal::ChNumFormat::X32Y32Z32W32_Sint, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Z, + Pal::ChannelSwizzle::W}}, + {{CL_RGBA, CL_UNSIGNED_INT8}, + Pal::ChNumFormat::X8Y8Z8W8_Uint, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Z, + Pal::ChannelSwizzle::W}}, + {{CL_RGBA, CL_UNSIGNED_INT16}, + Pal::ChNumFormat::X16Y16Z16W16_Uint, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Z, + Pal::ChannelSwizzle::W}}, + {{CL_RGBA, CL_UNSIGNED_INT32}, + Pal::ChNumFormat::X32Y32Z32W32_Uint, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Z, + Pal::ChannelSwizzle::W}}, - { { CL_RGBA, CL_HALF_FLOAT }, - Pal::ChNumFormat::X16Y16Z16W16_Float, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::W } }, - { { CL_RGBA, CL_FLOAT }, - 
Pal::ChNumFormat::X32Y32Z32W32_Float, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::W } }, + {{CL_RGBA, CL_HALF_FLOAT}, + Pal::ChNumFormat::X16Y16Z16W16_Float, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Z, + Pal::ChannelSwizzle::W}}, + {{CL_RGBA, CL_FLOAT}, + Pal::ChNumFormat::X32Y32Z32W32_Float, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Z, + Pal::ChannelSwizzle::W}}, // ARGB - { { CL_ARGB, CL_UNORM_INT8 }, - Pal::ChNumFormat::X8Y8Z8W8_Unorm, - { Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Z, - Pal::ChannelSwizzle::W, Pal::ChannelSwizzle::X } }, - { { CL_ARGB, CL_SNORM_INT8 }, - Pal::ChNumFormat::X8Y8Z8W8_Snorm, - { Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Z, - Pal::ChannelSwizzle::W, Pal::ChannelSwizzle::X } }, - { { CL_ARGB, CL_SIGNED_INT8 }, - Pal::ChNumFormat::X8Y8Z8W8_Sint, - { Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Z, - Pal::ChannelSwizzle::W, Pal::ChannelSwizzle::X } }, - { { CL_ARGB, CL_UNSIGNED_INT8 }, - Pal::ChNumFormat::X8Y8Z8W8_Uint, - { Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Z, - Pal::ChannelSwizzle::W, Pal::ChannelSwizzle::X } }, + {{CL_ARGB, CL_UNORM_INT8}, + Pal::ChNumFormat::X8Y8Z8W8_Unorm, + {Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::W, + Pal::ChannelSwizzle::X}}, + {{CL_ARGB, CL_SNORM_INT8}, + Pal::ChNumFormat::X8Y8Z8W8_Snorm, + {Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::W, + Pal::ChannelSwizzle::X}}, + {{CL_ARGB, CL_SIGNED_INT8}, + Pal::ChNumFormat::X8Y8Z8W8_Sint, + {Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::W, + Pal::ChannelSwizzle::X}}, + {{CL_ARGB, CL_UNSIGNED_INT8}, + Pal::ChNumFormat::X8Y8Z8W8_Uint, + {Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::W, + Pal::ChannelSwizzle::X}}, // BGRA - { { CL_BGRA, CL_UNORM_INT8 }, - Pal::ChNumFormat::X8Y8Z8W8_Unorm, - { Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::Y, - 
Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::W } }, - { { CL_BGRA, CL_SNORM_INT8 }, - Pal::ChNumFormat::X8Y8Z8W8_Snorm, - { Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::W } }, - { { CL_BGRA, CL_SIGNED_INT8 }, - Pal::ChNumFormat::X8Y8Z8W8_Sint, - { Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::W } }, - { { CL_BGRA, CL_UNSIGNED_INT8 }, - Pal::ChNumFormat::X8Y8Z8W8_Uint, - { Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::W } }, + {{CL_BGRA, CL_UNORM_INT8}, + Pal::ChNumFormat::X8Y8Z8W8_Unorm, + {Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::X, + Pal::ChannelSwizzle::W}}, + {{CL_BGRA, CL_SNORM_INT8}, + Pal::ChNumFormat::X8Y8Z8W8_Snorm, + {Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::X, + Pal::ChannelSwizzle::W}}, + {{CL_BGRA, CL_SIGNED_INT8}, + Pal::ChNumFormat::X8Y8Z8W8_Sint, + {Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::X, + Pal::ChannelSwizzle::W}}, + {{CL_BGRA, CL_UNSIGNED_INT8}, + Pal::ChNumFormat::X8Y8Z8W8_Uint, + {Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::X, + Pal::ChannelSwizzle::W}}, // LUMINANCE - { { CL_LUMINANCE, CL_SNORM_INT8 }, - Pal::ChNumFormat::X8_Snorm, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, - Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::One } }, - { { CL_LUMINANCE, CL_SNORM_INT16 }, - Pal::ChNumFormat::X16_Snorm, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, - Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::One } }, - { { CL_LUMINANCE, CL_UNORM_INT8 }, - Pal::ChNumFormat::X8_Unorm, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, - Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::One } }, - { { CL_LUMINANCE, CL_UNORM_INT16 }, - Pal::ChNumFormat::X16_Unorm, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, - Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::One } }, - { { CL_LUMINANCE, 
CL_HALF_FLOAT }, - Pal::ChNumFormat::X16_Float, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, - Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::One } }, - { { CL_LUMINANCE, CL_FLOAT }, - Pal::ChNumFormat::X32_Float, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, - Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::One } }, + {{CL_LUMINANCE, CL_SNORM_INT8}, + Pal::ChNumFormat::X8_Snorm, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, + Pal::ChannelSwizzle::One}}, + {{CL_LUMINANCE, CL_SNORM_INT16}, + Pal::ChNumFormat::X16_Snorm, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, + Pal::ChannelSwizzle::One}}, + {{CL_LUMINANCE, CL_UNORM_INT8}, + Pal::ChNumFormat::X8_Unorm, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, + Pal::ChannelSwizzle::One}}, + {{CL_LUMINANCE, CL_UNORM_INT16}, + Pal::ChNumFormat::X16_Unorm, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, + Pal::ChannelSwizzle::One}}, + {{CL_LUMINANCE, CL_HALF_FLOAT}, + Pal::ChNumFormat::X16_Float, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, + Pal::ChannelSwizzle::One}}, + {{CL_LUMINANCE, CL_FLOAT}, + Pal::ChNumFormat::X32_Float, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, + Pal::ChannelSwizzle::One}}, // INTENSITY - { { CL_INTENSITY, CL_SNORM_INT8 }, - Pal::ChNumFormat::X8_Snorm, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, - Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X } }, - { { CL_INTENSITY, CL_SNORM_INT16 }, - Pal::ChNumFormat::X16_Snorm, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, - Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X } }, - { { CL_INTENSITY, CL_UNORM_INT8 }, - Pal::ChNumFormat::X8_Unorm, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, - Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X } }, - { { CL_INTENSITY, CL_UNORM_INT16 }, - Pal::ChNumFormat::X16_Unorm, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, - 
Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X } }, - { { CL_INTENSITY, CL_HALF_FLOAT }, - Pal::ChNumFormat::X16_Float, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, - Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X } }, - { { CL_INTENSITY, CL_FLOAT }, - Pal::ChNumFormat::X32_Float, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, - Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X } }, + {{CL_INTENSITY, CL_SNORM_INT8}, + Pal::ChNumFormat::X8_Snorm, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, + Pal::ChannelSwizzle::X}}, + {{CL_INTENSITY, CL_SNORM_INT16}, + Pal::ChNumFormat::X16_Snorm, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, + Pal::ChannelSwizzle::X}}, + {{CL_INTENSITY, CL_UNORM_INT8}, + Pal::ChNumFormat::X8_Unorm, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, + Pal::ChannelSwizzle::X}}, + {{CL_INTENSITY, CL_UNORM_INT16}, + Pal::ChNumFormat::X16_Unorm, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, + Pal::ChannelSwizzle::X}}, + {{CL_INTENSITY, CL_HALF_FLOAT}, + Pal::ChNumFormat::X16_Float, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, + Pal::ChannelSwizzle::X}}, + {{CL_INTENSITY, CL_FLOAT}, + Pal::ChNumFormat::X32_Float, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, + Pal::ChannelSwizzle::X}}, // sRBGA - { { CL_sRGBA, CL_UNORM_INT8 }, - Pal::ChNumFormat::X8Y8Z8W8_Srgb, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::W } }, - { { CL_sRGBA, CL_UNSIGNED_INT8 }, // This is used only by blit kernel - Pal::ChNumFormat::X8Y8Z8W8_Uint, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::W } }, + {{CL_sRGBA, CL_UNORM_INT8}, + Pal::ChNumFormat::X8Y8Z8W8_Srgb, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Z, + Pal::ChannelSwizzle::W}}, + {{CL_sRGBA, CL_UNSIGNED_INT8}, // 
This is used only by blit kernel + Pal::ChNumFormat::X8Y8Z8W8_Uint, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Z, + Pal::ChannelSwizzle::W}}, // sRBG - { { CL_sRGB, CL_UNORM_INT8 }, - Pal::ChNumFormat::X8Y8Z8W8_Srgb, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::One } }, - { { CL_sRGB, CL_UNSIGNED_INT8 }, // This is used only by blit kernel - Pal::ChNumFormat::X8Y8Z8W8_Uint, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::One } }, + {{CL_sRGB, CL_UNORM_INT8}, + Pal::ChNumFormat::X8Y8Z8W8_Srgb, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Z, + Pal::ChannelSwizzle::One}}, + {{CL_sRGB, CL_UNSIGNED_INT8}, // This is used only by blit kernel + Pal::ChNumFormat::X8Y8Z8W8_Uint, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Z, + Pal::ChannelSwizzle::One}}, // sRBGx - { { CL_sRGBx, CL_UNORM_INT8 }, - Pal::ChNumFormat::X8Y8Z8W8_Srgb, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::One } }, - { { CL_sRGBx, CL_UNSIGNED_INT8 }, // This is used only by blit kernel - Pal::ChNumFormat::X8Y8Z8W8_Uint, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::One } }, + {{CL_sRGBx, CL_UNORM_INT8}, + Pal::ChNumFormat::X8Y8Z8W8_Srgb, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Z, + Pal::ChannelSwizzle::One}}, + {{CL_sRGBx, CL_UNSIGNED_INT8}, // This is used only by blit kernel + Pal::ChNumFormat::X8Y8Z8W8_Uint, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Z, + Pal::ChannelSwizzle::One}}, // sBGRA - { { CL_sBGRA, CL_UNORM_INT8 }, - Pal::ChNumFormat::X8Y8Z8W8_Srgb, - { Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::W } }, - { { CL_sBGRA, CL_UNSIGNED_INT8 }, // This is used only by blit kernel - 
Pal::ChNumFormat::X8Y8Z8W8_Uint, - { Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::W } }, + {{CL_sBGRA, CL_UNORM_INT8}, + Pal::ChNumFormat::X8Y8Z8W8_Srgb, + {Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::X, + Pal::ChannelSwizzle::W}}, + {{CL_sBGRA, CL_UNSIGNED_INT8}, // This is used only by blit kernel + Pal::ChNumFormat::X8Y8Z8W8_Uint, + {Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::X, + Pal::ChannelSwizzle::W}}, // DEPTH - { { CL_DEPTH, CL_FLOAT }, - Pal::ChNumFormat::X32_Float, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, - Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X } }, - { { CL_DEPTH, CL_UNSIGNED_INT32 }, // This is used only by blit kernel - Pal::ChNumFormat::X32_Uint, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, - Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X } }, + {{CL_DEPTH, CL_FLOAT}, + Pal::ChNumFormat::X32_Float, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, + Pal::ChannelSwizzle::X}}, + {{CL_DEPTH, CL_UNSIGNED_INT32}, // This is used only by blit kernel + Pal::ChNumFormat::X32_Uint, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, + Pal::ChannelSwizzle::X}}, - { { CL_DEPTH, CL_UNORM_INT16 }, - Pal::ChNumFormat::X16_Unorm, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, - Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X } }, - { { CL_DEPTH, CL_UNSIGNED_INT16 }, // This is used only by blit kernel - Pal::ChNumFormat::X16_Uint, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, - Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X } }, + {{CL_DEPTH, CL_UNORM_INT16}, + Pal::ChNumFormat::X16_Unorm, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, + Pal::ChannelSwizzle::X}}, + {{CL_DEPTH, CL_UNSIGNED_INT16}, // This is used only by blit kernel + Pal::ChNumFormat::X16_Uint, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, + 
Pal::ChannelSwizzle::X}}, - { { CL_DEPTH_STENCIL, CL_UNORM_INT24 }, - Pal::ChNumFormat::X32_Uint, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, - Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X } }, - { { CL_DEPTH_STENCIL, CL_FLOAT }, - Pal::ChNumFormat::X32_Float, - { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, - Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X } } -}; - -} // namespace pal + {{CL_DEPTH_STENCIL, CL_UNORM_INT24}, + Pal::ChNumFormat::X32_Uint, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, + Pal::ChannelSwizzle::X}}, + {{CL_DEPTH_STENCIL, CL_FLOAT}, + Pal::ChNumFormat::X32_Float, + {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::X, + Pal::ChannelSwizzle::X}}}; +} // namespace pal diff --git a/rocclr/runtime/device/pal/paldevice.cpp b/rocclr/runtime/device/pal/paldevice.cpp index d4d65a5480..cda913ac90 100644 --- a/rocclr/runtime/device/pal/paldevice.cpp +++ b/rocclr/runtime/device/pal/paldevice.cpp @@ -30,7 +30,7 @@ #include "CL/cl_d3d10.h" #include "CL/cl_d3d11.h" #include "CL/cl_dx9_media_sharing.h" -#endif // _WIN32 +#endif // _WIN32 #include #include @@ -39,24 +39,18 @@ #include #include -bool -PalDeviceLoad() -{ - bool ret = false; +bool PalDeviceLoad() { + bool ret = false; - // Create online devices - ret |= pal::Device::init(); - // Create offline GPU devices - ret |= pal::NullDevice::init(); + // Create online devices + ret |= pal::Device::init(); + // Create offline GPU devices + ret |= pal::NullDevice::init(); - return ret; + return ret; } -void -PalDeviceUnload() -{ - pal::Device::tearDown(); -} +void PalDeviceUnload() { pal::Device::tearDown(); } namespace pal { @@ -64,2227 +58,2032 @@ NullDevice::Compiler* NullDevice::compiler_; AppProfile Device::appProfile_; NullDevice::NullDevice() - : amd::Device(nullptr) - , ipLevel_(Pal::GfxIpLevel::None) - , hwInfo_(nullptr) -{ + : amd::Device(nullptr), ipLevel_(Pal::GfxIpLevel::None), hwInfo_(nullptr) {} + +bool NullDevice::init() { + 
std::vector devices; + + devices = getDevices(CL_DEVICE_TYPE_GPU, false); + + // Loop through all supported devices and create each of them + for (uint id = 0; id < sizeof(DeviceInfo) / sizeof(AMDDeviceInfo); ++id) { + bool foundActive = false; + Pal::AsicRevision revision = static_cast(id); + + if (pal::DeviceInfo[id].targetName_[0] == '\0') { + continue; + } + + // Loop through all active devices and see if we match one + for (uint i = 0; i < devices.size(); ++i) { + if (static_cast(devices[i])->asicRevision() == revision) { + foundActive = true; + break; + } + } + + // Don't report an offline device if it's active + if (foundActive) { + continue; + } + + NullDevice* dev = new NullDevice(); + if (nullptr != dev) { + if (!dev->create(revision, Pal::GfxIpLevel::_None)) { + delete dev; + } else { + dev->registerDevice(); + } + } + } + + // Loop through all supported devices and create each of them + for (uint id = static_cast(Pal::GfxIpLevel::GfxIp7); + id <= static_cast(Pal::GfxIpLevel::GfxIp9); ++id) { + bool foundActive = false; + Pal::GfxIpLevel ipLevel = static_cast(id); + + if (pal::GfxIpDeviceInfo[id].targetName_[0] == '\0') { + continue; + } + + // Loop through all active devices and see if we match one + for (uint i = 0; i < devices.size(); ++i) { + if (static_cast(devices[i])->ipLevel() == ipLevel) { + foundActive = true; + break; + } + } + + // Don't report an offline device if it's active + if (foundActive) { + continue; + } + + NullDevice* dev = new NullDevice(); + if (nullptr != dev) { + if (!dev->create(Pal::AsicRevision::Unknown, ipLevel)) { + delete dev; + } else { + dev->registerDevice(); + } + } + } + + return true; } -bool -NullDevice::init() -{ - std::vector devices; +bool NullDevice::create(Pal::AsicRevision asicRevision, Pal::GfxIpLevel ipLevel) { + online_ = false; + Pal::DeviceProperties properties = {}; - devices = getDevices(CL_DEVICE_TYPE_GPU, false); + // Use fake GFX IP for the device init + asicRevision_ = asicRevision; + ipLevel_ = 
ipLevel; + properties.revision = asicRevision; + properties.gfxLevel = ipLevel; - // Loop through all supported devices and create each of them - for (uint id = 0; id < sizeof(DeviceInfo) / sizeof(AMDDeviceInfo); ++id) { - bool foundActive = false; - Pal::AsicRevision revision = static_cast(id); + // Update HW info for the device + if ((GPU_ENABLE_PAL == 1) && (ipLevel == Pal::GfxIpLevel::_None)) { + hwInfo_ = &DeviceInfo[static_cast(asicRevision)]; + } else if (ipLevel >= Pal::GfxIpLevel::GfxIp9) { + hwInfo_ = &GfxIpDeviceInfo[static_cast(ipLevel)]; + } else { + return false; + } - if (pal::DeviceInfo[id].targetName_[0] == '\0') { - continue; - } + settings_ = new pal::Settings(); + pal::Settings* palSettings = reinterpret_cast(settings_); - // Loop through all active devices and see if we match one - for (uint i = 0; i < devices.size(); ++i) { - if (static_cast(devices[i])->asicRevision() == revision) { - foundActive = true; - break; - } - } + // Report 512MB for all offline devices + Pal::GpuMemoryHeapProperties heaps[Pal::GpuHeapCount]; + heaps[Pal::GpuHeapLocal].heapSize = 512 * Mi; - // Don't report an offline device if it's active - if (foundActive) { - continue; - } + Pal::WorkStationCaps wscaps = {}; - NullDevice* dev = new NullDevice(); - if (nullptr != dev) { - if (!dev->create(revision, Pal::GfxIpLevel::_None)) { - delete dev; - } - else { - dev->registerDevice(); - } - } - } + // Create setting for the offline target + if ((palSettings == nullptr) || !palSettings->create(properties, heaps, wscaps)) { + return false; + } - // Loop through all supported devices and create each of them - for (uint id = static_cast(Pal::GfxIpLevel::GfxIp7); - id <= static_cast(Pal::GfxIpLevel::GfxIp9); ++id) { - bool foundActive = false; - Pal::GfxIpLevel ipLevel = static_cast(id); + // Fill the device info structure + fillDeviceInfo(properties, heaps, 4096, 1, 0); - if (pal::GfxIpDeviceInfo[id].targetName_[0] == '\0') { - continue; - } + // Runtime doesn't know what local 
size could be on the real board + info_.maxGlobalVariableSize_ = static_cast(512 * Mi); - // Loop through all active devices and see if we match one - for (uint i = 0; i < devices.size(); ++i) { - if (static_cast(devices[i])->ipLevel() == ipLevel) { - foundActive = true; - break; - } - } - - // Don't report an offline device if it's active - if (foundActive) { - continue; - } - - NullDevice* dev = new NullDevice(); - if (nullptr != dev) { - if (!dev->create(Pal::AsicRevision::Unknown, ipLevel)) { - delete dev; - } - else { - dev->registerDevice(); - } - } - } - - return true; + return true; } -bool -NullDevice::create(Pal::AsicRevision asicRevision, Pal::GfxIpLevel ipLevel) -{ - online_ = false; - Pal::DeviceProperties properties = {}; - - // Use fake GFX IP for the device init - asicRevision_ = asicRevision; - ipLevel_ = ipLevel; - properties.revision = asicRevision; - properties.gfxLevel = ipLevel; - - // Update HW info for the device - if ((GPU_ENABLE_PAL == 1) && (ipLevel == Pal::GfxIpLevel::_None)) { - hwInfo_ = &DeviceInfo[static_cast(asicRevision)]; - } - else if (ipLevel >= Pal::GfxIpLevel::GfxIp9) { - hwInfo_ = &GfxIpDeviceInfo[static_cast(ipLevel)]; - } - else { - return false; - } - - settings_ = new pal::Settings(); - pal::Settings* palSettings = reinterpret_cast(settings_); - - // Report 512MB for all offline devices - Pal::GpuMemoryHeapProperties heaps[Pal::GpuHeapCount]; - heaps[Pal::GpuHeapLocal].heapSize = 512 * Mi; - - Pal::WorkStationCaps wscaps = {}; - - // Create setting for the offline target - if ((palSettings == nullptr) || !palSettings->create(properties, heaps, wscaps)) { - return false; - } - - // Fill the device info structure - fillDeviceInfo(properties, heaps, 4096, 1, 0); - - // Runtime doesn't know what local size could be on the real board - info_.maxGlobalVariableSize_ = static_cast(512 * Mi); - - return true; -} - -device::Program* -NullDevice::createProgram(amd::option::Options* options) -{ - device::Program* program; 
+device::Program* NullDevice::createProgram(amd::option::Options* options) { + device::Program* program; #if defined(WITH_LIGHTNING_COMPILER) - program = new LightningProgram(*this); -#else // !defined(WITH_LIGHTNING_COMPILER) - program = new HSAILProgram(*this); -#endif // defined(WITH_LIGHTNING_COMPILER) + program = new LightningProgram(*this); +#else // !defined(WITH_LIGHTNING_COMPILER) + program = new HSAILProgram(*this); +#endif // defined(WITH_LIGHTNING_COMPILER) - if (program == nullptr) { - LogError("Memory allocation has failed!"); - } + if (program == nullptr) { + LogError("Memory allocation has failed!"); + } - return program; + return program; } -void NullDevice::fillDeviceInfo( - const Pal::DeviceProperties& palProp, - const Pal::GpuMemoryHeapProperties heaps[Pal::GpuHeapCount], - size_t maxTextureSize, - uint numComputeRings, - uint numExclusiveComputeRings) -{ - info_.type_ = CL_DEVICE_TYPE_GPU; - info_.vendorId_ = palProp.vendorId; +void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp, + const Pal::GpuMemoryHeapProperties heaps[Pal::GpuHeapCount], + size_t maxTextureSize, uint numComputeRings, + uint numExclusiveComputeRings) { + info_.type_ = CL_DEVICE_TYPE_GPU; + info_.vendorId_ = palProp.vendorId; - info_.maxWorkItemDimensions_ = 3; - info_.maxComputeUnits_ = - palProp.gfxipProperties.shaderCore.numShaderEngines * - palProp.gfxipProperties.shaderCore.numShaderArrays * - palProp.gfxipProperties.shaderCore.numCusPerShaderArray; - info_.numberOfShaderEngines = palProp.gfxipProperties.shaderCore.numShaderEngines; + info_.maxWorkItemDimensions_ = 3; + info_.maxComputeUnits_ = palProp.gfxipProperties.shaderCore.numShaderEngines * + palProp.gfxipProperties.shaderCore.numShaderArrays * + palProp.gfxipProperties.shaderCore.numCusPerShaderArray; + info_.numberOfShaderEngines = palProp.gfxipProperties.shaderCore.numShaderEngines; - // SI parts are scalar. Also, reads don't need to be 128-bits to get peak rates. 
- // For example, float4 is not faster than float as long as all threads fetch the same - // amount of data and the reads are coalesced. This is from the H/W team and confirmed - // through experimentation. May also be true on EG/NI, but no point in confusing - // developers now. - info_.nativeVectorWidthChar_ = info_.preferredVectorWidthChar_ = 4; - info_.nativeVectorWidthShort_ = info_.preferredVectorWidthShort_ = 2; - info_.nativeVectorWidthInt_ = info_.preferredVectorWidthInt_ = 1; - info_.nativeVectorWidthLong_ = info_.preferredVectorWidthLong_ = 1; - info_.nativeVectorWidthFloat_ = info_.preferredVectorWidthFloat_ = 1; - info_.nativeVectorWidthDouble_ = info_.preferredVectorWidthDouble_ = - (settings().checkExtension(ClKhrFp64)) ? 1 : 0; - info_.nativeVectorWidthHalf_ = info_.preferredVectorWidthHalf_ = 0; // no half support + // SI parts are scalar. Also, reads don't need to be 128-bits to get peak rates. + // For example, float4 is not faster than float as long as all threads fetch the same + // amount of data and the reads are coalesced. This is from the H/W team and confirmed + // through experimentation. May also be true on EG/NI, but no point in confusing + // developers now. + info_.nativeVectorWidthChar_ = info_.preferredVectorWidthChar_ = 4; + info_.nativeVectorWidthShort_ = info_.preferredVectorWidthShort_ = 2; + info_.nativeVectorWidthInt_ = info_.preferredVectorWidthInt_ = 1; + info_.nativeVectorWidthLong_ = info_.preferredVectorWidthLong_ = 1; + info_.nativeVectorWidthFloat_ = info_.preferredVectorWidthFloat_ = 1; + info_.nativeVectorWidthDouble_ = info_.preferredVectorWidthDouble_ = + (settings().checkExtension(ClKhrFp64)) ? 1 : 0; + info_.nativeVectorWidthHalf_ = info_.preferredVectorWidthHalf_ = 0; // no half support - info_.maxClockFrequency_ = (palProp.gfxipProperties.performance.maxGpuClock != 0) ? 
- palProp.gfxipProperties.performance.maxGpuClock : 555; - info_.maxParameterSize_ = 1024; - info_.minDataTypeAlignSize_ = sizeof(cl_long16); - info_.singleFPConfig_ = CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO - | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_FMA; + info_.maxClockFrequency_ = (palProp.gfxipProperties.performance.maxGpuClock != 0) + ? palProp.gfxipProperties.performance.maxGpuClock + : 555; + info_.maxParameterSize_ = 1024; + info_.minDataTypeAlignSize_ = sizeof(cl_long16); + info_.singleFPConfig_ = + CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_FMA; - if (settings().singleFpDenorm_) { - info_.singleFPConfig_ |= CL_FP_DENORM; - } + if (settings().singleFpDenorm_) { + info_.singleFPConfig_ |= CL_FP_DENORM; + } - if (settings().checkExtension(ClKhrFp64)) { - info_.doubleFPConfig_ = info_.singleFPConfig_ | CL_FP_DENORM; - } + if (settings().checkExtension(ClKhrFp64)) { + info_.doubleFPConfig_ = info_.singleFPConfig_ | CL_FP_DENORM; + } - if (settings().reportFMA_) { - info_.singleFPConfig_ |= CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT; - } + if (settings().reportFMA_) { + info_.singleFPConfig_ |= CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT; + } - info_.globalMemCacheLineSize_ = settings().cacheLineSize_; - info_.globalMemCacheSize_ = settings().cacheSize_; - if ((settings().cacheLineSize_ != 0) || (settings().cacheSize_ != 0)) { - info_.globalMemCacheType_ = CL_READ_WRITE_CACHE; - } - else { - info_.globalMemCacheType_ = CL_NONE; - } + info_.globalMemCacheLineSize_ = settings().cacheLineSize_; + info_.globalMemCacheSize_ = settings().cacheSize_; + if ((settings().cacheLineSize_ != 0) || (settings().cacheSize_ != 0)) { + info_.globalMemCacheType_ = CL_READ_WRITE_CACHE; + } else { + info_.globalMemCacheType_ = CL_NONE; + } - uint64_t localRAM = heaps[Pal::GpuHeapLocal].heapSize + - heaps[Pal::GpuHeapInvisible].heapSize; + uint64_t localRAM = heaps[Pal::GpuHeapLocal].heapSize + heaps[Pal::GpuHeapInvisible].heapSize; - 
info_.globalMemSize_ = - (static_cast(std::min(GPU_MAX_HEAP_SIZE, 100u)) * - static_cast(localRAM) / 100u); + info_.globalMemSize_ = (static_cast(std::min(GPU_MAX_HEAP_SIZE, 100u)) * + static_cast(localRAM) / 100u); - uint uswcPercentAvailable = ((static_cast(heaps[Pal::GpuHeapGartUswc].heapSize) / Mi) > 1536 && IS_WINDOWS) - ? 75 : 50; - if (settings().apuSystem_) { - info_.globalMemSize_ += - (static_cast(heaps[Pal::GpuHeapGartUswc].heapSize) * uswcPercentAvailable) / 100; - } + uint uswcPercentAvailable = + ((static_cast(heaps[Pal::GpuHeapGartUswc].heapSize) / Mi) > 1536 && IS_WINDOWS) + ? 75 + : 50; + if (settings().apuSystem_) { + info_.globalMemSize_ += + (static_cast(heaps[Pal::GpuHeapGartUswc].heapSize) * uswcPercentAvailable) / 100; + } - // Find the largest heap form FB memory - info_.maxMemAllocSize_ = std::max( - cl_ulong(heaps[Pal::GpuHeapLocal].heapSize), - cl_ulong(heaps[Pal::GpuHeapInvisible].heapSize)); + // Find the largest heap form FB memory + info_.maxMemAllocSize_ = std::max(cl_ulong(heaps[Pal::GpuHeapLocal].heapSize), + cl_ulong(heaps[Pal::GpuHeapInvisible].heapSize)); #if defined(ATI_OS_WIN) - if (settings().apuSystem_) { - info_.maxMemAllocSize_ = std::max( - (static_cast(heaps[Pal::GpuHeapGartUswc].heapSize) * uswcPercentAvailable)/100, - info_.maxMemAllocSize_); - } + if (settings().apuSystem_) { + info_.maxMemAllocSize_ = std::max( + (static_cast(heaps[Pal::GpuHeapGartUswc].heapSize) * uswcPercentAvailable) / 100, + info_.maxMemAllocSize_); + } #endif - info_.maxMemAllocSize_ = cl_ulong(info_.maxMemAllocSize_ * - std::min(GPU_SINGLE_ALLOC_PERCENT, 100u) / 100u); + info_.maxMemAllocSize_ = + cl_ulong(info_.maxMemAllocSize_ * std::min(GPU_SINGLE_ALLOC_PERCENT, 100u) / 100u); - //! \note Force max single allocation size. - //! 4GB limit for the blit kernels and 64 bit optimizations. - info_.maxMemAllocSize_ = std::min(info_.maxMemAllocSize_, - static_cast(settings().maxAllocSize_)); + //! \note Force max single allocation size. + //! 
4GB limit for the blit kernels and 64 bit optimizations. + info_.maxMemAllocSize_ = + std::min(info_.maxMemAllocSize_, static_cast(settings().maxAllocSize_)); - if (info_.maxMemAllocSize_ < cl_ulong(128 * Mi)) { - LogError("We are unable to get a heap large enough to support the OpenCL minimum "\ - "requirement for FULL_PROFILE"); - } + if (info_.maxMemAllocSize_ < cl_ulong(128 * Mi)) { + LogError( + "We are unable to get a heap large enough to support the OpenCL minimum " + "requirement for FULL_PROFILE"); + } - info_.maxMemAllocSize_ = std::max(cl_ulong(128 * Mi), info_.maxMemAllocSize_); + info_.maxMemAllocSize_ = std::max(cl_ulong(128 * Mi), info_.maxMemAllocSize_); - // Clamp max single alloc size to the globalMemSize since it's - // reduced by default - info_.maxMemAllocSize_ = std::min(info_.maxMemAllocSize_, info_.globalMemSize_); + // Clamp max single alloc size to the globalMemSize since it's + // reduced by default + info_.maxMemAllocSize_ = std::min(info_.maxMemAllocSize_, info_.globalMemSize_); - // We need to verify that we are not reporting more global memory - // that 4x single alloc - info_.globalMemSize_ = std::min( 4 * info_.maxMemAllocSize_, info_.globalMemSize_); + // We need to verify that we are not reporting more global memory + // that 4x single alloc + info_.globalMemSize_ = std::min(4 * info_.maxMemAllocSize_, info_.globalMemSize_); - // Use 64 bit pointers - if (settings().use64BitPtr_) { - info_.addressBits_ = 64; - } - else { - info_.addressBits_ = 32; - // Limit total size with 3GB for 32 bit - info_.globalMemSize_ = std::min(info_.globalMemSize_, cl_ulong(3 * Gi)); - } + // Use 64 bit pointers + if (settings().use64BitPtr_) { + info_.addressBits_ = 64; + } else { + info_.addressBits_ = 32; + // Limit total size with 3GB for 32 bit + info_.globalMemSize_ = std::min(info_.globalMemSize_, cl_ulong(3 * Gi)); + } - // Alignment in BITS of the base address of any allocated memory object - static const size_t MemBaseAlignment = 256; - //! 
@note Force 256 bytes alignment, since currently - //! calAttr.surface_alignment returns 4KB. For pinned memory runtime - //! should be able to create a view with 256 bytes alignement - info_.memBaseAddrAlign_ = 8 * MemBaseAlignment; + // Alignment in BITS of the base address of any allocated memory object + static const size_t MemBaseAlignment = 256; + //! @note Force 256 bytes alignment, since currently + //! calAttr.surface_alignment returns 4KB. For pinned memory runtime + //! should be able to create a view with 256 bytes alignement + info_.memBaseAddrAlign_ = 8 * MemBaseAlignment; - info_.maxConstantBufferSize_ = info_.maxMemAllocSize_; - info_.maxConstantArgs_ = MaxConstArguments; + info_.maxConstantBufferSize_ = info_.maxMemAllocSize_; + info_.maxConstantArgs_ = MaxConstArguments; - // Image support fields - if (settings().imageSupport_) { - info_.imageSupport_ = CL_TRUE; - info_.maxSamplers_ = MaxSamplers; - info_.maxReadImageArgs_ = MaxReadImage; - info_.maxWriteImageArgs_ = MaxWriteImage; - info_.image2DMaxWidth_ = maxTextureSize; - info_.image2DMaxHeight_ = maxTextureSize; - info_.image3DMaxWidth_ = std::min(2 * Ki, maxTextureSize); - info_.image3DMaxHeight_ = std::min(2 * Ki, maxTextureSize); - info_.image3DMaxDepth_ = std::min(2 * Ki, maxTextureSize); + // Image support fields + if (settings().imageSupport_) { + info_.imageSupport_ = CL_TRUE; + info_.maxSamplers_ = MaxSamplers; + info_.maxReadImageArgs_ = MaxReadImage; + info_.maxWriteImageArgs_ = MaxWriteImage; + info_.image2DMaxWidth_ = maxTextureSize; + info_.image2DMaxHeight_ = maxTextureSize; + info_.image3DMaxWidth_ = std::min(2 * Ki, maxTextureSize); + info_.image3DMaxHeight_ = std::min(2 * Ki, maxTextureSize); + info_.image3DMaxDepth_ = std::min(2 * Ki, maxTextureSize); - info_.imagePitchAlignment_ = 256; // PAL uses LINEAR_ALIGNED - info_.imageBaseAddressAlignment_ = 256; // XXX: 256 byte base address alignment for now + info_.imagePitchAlignment_ = 256; // PAL uses LINEAR_ALIGNED + 
info_.imageBaseAddressAlignment_ = 256; // XXX: 256 byte base address alignment for now - info_.bufferFromImageSupport_ = CL_TRUE; - } + info_.bufferFromImageSupport_ = CL_TRUE; + } - info_.errorCorrectionSupport_ = CL_FALSE; + info_.errorCorrectionSupport_ = CL_FALSE; - if (settings().apuSystem_) { - info_.hostUnifiedMemory_ = CL_TRUE; - } + if (settings().apuSystem_) { + info_.hostUnifiedMemory_ = CL_TRUE; + } - info_.profilingTimerResolution_ = 1; - info_.profilingTimerOffset_ = amd::Os::offsetToEpochNanos(); - info_.littleEndian_ = CL_TRUE; - info_.available_ = CL_TRUE; - info_.compilerAvailable_ = CL_TRUE; - info_.linkerAvailable_ = CL_TRUE; + info_.profilingTimerResolution_ = 1; + info_.profilingTimerOffset_ = amd::Os::offsetToEpochNanos(); + info_.littleEndian_ = CL_TRUE; + info_.available_ = CL_TRUE; + info_.compilerAvailable_ = CL_TRUE; + info_.linkerAvailable_ = CL_TRUE; - info_.executionCapabilities_ = CL_EXEC_KERNEL; - info_.preferredPlatformAtomicAlignment_ = 0; - info_.preferredGlobalAtomicAlignment_ = 0; - info_.preferredLocalAtomicAlignment_ = 0; - info_.queueProperties_ = CL_QUEUE_PROFILING_ENABLE; + info_.executionCapabilities_ = CL_EXEC_KERNEL; + info_.preferredPlatformAtomicAlignment_ = 0; + info_.preferredGlobalAtomicAlignment_ = 0; + info_.preferredLocalAtomicAlignment_ = 0; + info_.queueProperties_ = CL_QUEUE_PROFILING_ENABLE; - info_.platform_ = AMD_PLATFORM; + info_.platform_ = AMD_PLATFORM; - if (false && (asicRevision() == Pal::AsicRevision::Carrizo) && - ASICREV_IS_CARRIZO_BRISTOL(palProp.revisionId)) { - const static char* bristol = "Bristol Ridge"; - ::strcpy(info_.name_, bristol); - } - else { - ::strcpy(info_.name_, hwInfo()->targetName_); - } - ::strcpy(info_.vendor_, "Advanced Micro Devices, Inc."); - ::snprintf(info_.driverVersion_, sizeof(info_.driverVersion_) - 1, - AMD_BUILD_STRING " (PAL%s)", + if (false && (asicRevision() == Pal::AsicRevision::Carrizo) && + ASICREV_IS_CARRIZO_BRISTOL(palProp.revisionId)) { + const static 
char* bristol = "Bristol Ridge"; + ::strcpy(info_.name_, bristol); + } else { + ::strcpy(info_.name_, hwInfo()->targetName_); + } + ::strcpy(info_.vendor_, "Advanced Micro Devices, Inc."); + ::snprintf(info_.driverVersion_, sizeof(info_.driverVersion_) - 1, AMD_BUILD_STRING " (PAL%s)", #if defined(WITH_LIGHTNING_COMPILER) - ",LC" -#else // ! defined(WITH_LIGHTNING_COMPILER) - ",HSAIL" -#endif // ! defined(WITH_LIGHTNING_COMPILER) - ); + ",LC" +#else // ! defined(WITH_LIGHTNING_COMPILER) + ",HSAIL" +#endif // ! defined(WITH_LIGHTNING_COMPILER) + ); - info_.profile_ = "FULL_PROFILE"; - if (settings().oclVersion_ == OpenCL20) { - info_.version_ = "OpenCL 2.0 " AMD_PLATFORM_INFO; - info_.oclcVersion_ = "OpenCL C 2.0 "; - info_.spirVersions_ = "1.2"; + info_.profile_ = "FULL_PROFILE"; + if (settings().oclVersion_ == OpenCL20) { + info_.version_ = "OpenCL 2.0 " AMD_PLATFORM_INFO; + info_.oclcVersion_ = "OpenCL C 2.0 "; + info_.spirVersions_ = "1.2"; + } else if (settings().oclVersion_ == OpenCL12) { + info_.version_ = "OpenCL 1.2 " AMD_PLATFORM_INFO; + info_.oclcVersion_ = "OpenCL C 1.2 "; + info_.spirVersions_ = "1.2"; + } else { + info_.version_ = "OpenCL 1.0 " AMD_PLATFORM_INFO; + info_.oclcVersion_ = "OpenCL C 1.0 "; + info_.spirVersions_ = ""; + LogError("Unknown version for support"); + } + + // Fill workgroup info size + info_.maxWorkGroupSize_ = settings().maxWorkGroupSize_; + info_.maxWorkItemSizes_[0] = info_.maxWorkGroupSize_; + info_.maxWorkItemSizes_[1] = info_.maxWorkGroupSize_; + info_.maxWorkItemSizes_[2] = info_.maxWorkGroupSize_; + + info_.localMemType_ = CL_LOCAL; + info_.localMemSize_ = settings().hwLDSSize_; + info_.extensions_ = getExtensionString(); + + info_.deviceTopology_.pcie.type = CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD; + info_.deviceTopology_.pcie.bus = palProp.pciProperties.busNumber; + info_.deviceTopology_.pcie.device = palProp.pciProperties.deviceNumber; + info_.deviceTopology_.pcie.function = palProp.pciProperties.functionNumber; + + 
::strncpy(info_.boardName_, palProp.gpuName, + ::strnlen(palProp.gpuName, sizeof(info_.boardName_))); + + // OpenCL1.2 device info fields + info_.builtInKernels_ = ""; + info_.imageMaxBufferSize_ = MaxImageBufferSize; + info_.imageMaxArraySize_ = MaxImageArraySize; + info_.preferredInteropUserSync_ = true; + info_.printfBufferSize_ = PrintfDbg::WorkitemDebugSize * info().maxWorkGroupSize_; + + if (settings().oclVersion_ >= OpenCL20) { + info_.svmCapabilities_ = (CL_DEVICE_SVM_COARSE_GRAIN_BUFFER | CL_DEVICE_SVM_FINE_GRAIN_BUFFER); + if (settings().svmAtomics_) { + info_.svmCapabilities_ |= CL_DEVICE_SVM_ATOMICS; } - else if (settings().oclVersion_ == OpenCL12) { - info_.version_ = "OpenCL 1.2 " AMD_PLATFORM_INFO; - info_.oclcVersion_ = "OpenCL C 1.2 "; - info_.spirVersions_ = "1.2"; - } - else { - info_.version_ = "OpenCL 1.0 " AMD_PLATFORM_INFO; - info_.oclcVersion_ = "OpenCL C 1.0 "; - info_.spirVersions_ = ""; - LogError("Unknown version for support"); + if (settings().svmFineGrainSystem_) { + info_.svmCapabilities_ |= CL_DEVICE_SVM_FINE_GRAIN_SYSTEM; } + // OpenCL2.0 device info fields + info_.maxWriteImageArgs_ = MaxReadWriteImage; //!< For compatibility + info_.maxReadWriteImageArgs_ = MaxReadWriteImage; - // Fill workgroup info size - info_.maxWorkGroupSize_ = settings().maxWorkGroupSize_; - info_.maxWorkItemSizes_[0] = info_.maxWorkGroupSize_; - info_.maxWorkItemSizes_[1] = info_.maxWorkGroupSize_; - info_.maxWorkItemSizes_[2] = info_.maxWorkGroupSize_; + info_.maxPipePacketSize_ = info_.maxMemAllocSize_; + info_.maxPipeActiveReservations_ = 16; + info_.maxPipeArgs_ = 16; - info_.localMemType_ = CL_LOCAL; - info_.localMemSize_ = settings().hwLDSSize_; - info_.extensions_ = getExtensionString(); + info_.queueOnDeviceProperties_ = + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE; + info_.queueOnDevicePreferredSize_ = 256 * Ki; + info_.queueOnDeviceMaxSize_ = 8 * Mi; + info_.maxOnDeviceQueues_ = 1; + info_.maxOnDeviceEvents_ = 
settings().numDeviceEvents_; + info_.globalVariablePreferredTotalSize_ = static_cast(info_.globalMemSize_); + //! \todo Remove % calculation. + //! Use 90% of max single alloc size. + //! Boards with max single alloc size around 4GB will fail allocations + info_.maxGlobalVariableSize_ = + static_cast(amd::alignDown(info_.maxMemAllocSize_ * 9 / 10, 256)); + } - info_.deviceTopology_.pcie.type = CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD; - info_.deviceTopology_.pcie.bus = palProp.pciProperties.busNumber; - info_.deviceTopology_.pcie.device = palProp.pciProperties.deviceNumber; - info_.deviceTopology_.pcie.function = palProp.pciProperties.functionNumber; - - ::strncpy(info_.boardName_, palProp.gpuName, - ::strnlen(palProp.gpuName, sizeof(info_.boardName_))); - - // OpenCL1.2 device info fields - info_.builtInKernels_ = ""; - info_.imageMaxBufferSize_ = MaxImageBufferSize; - info_.imageMaxArraySize_ = MaxImageArraySize; - info_.preferredInteropUserSync_ = true; - info_.printfBufferSize_ = PrintfDbg::WorkitemDebugSize * info().maxWorkGroupSize_; - - if (settings().oclVersion_ >= OpenCL20) { - info_.svmCapabilities_ = - (CL_DEVICE_SVM_COARSE_GRAIN_BUFFER | CL_DEVICE_SVM_FINE_GRAIN_BUFFER); - if (settings().svmAtomics_) { - info_.svmCapabilities_ |= CL_DEVICE_SVM_ATOMICS; - } - if (settings().svmFineGrainSystem_) { - info_.svmCapabilities_ |= CL_DEVICE_SVM_FINE_GRAIN_SYSTEM; - } - // OpenCL2.0 device info fields - info_.maxWriteImageArgs_ = MaxReadWriteImage; //!< For compatibility - info_.maxReadWriteImageArgs_ = MaxReadWriteImage; - - info_.maxPipePacketSize_ = info_.maxMemAllocSize_; - info_.maxPipeActiveReservations_ = 16; - info_.maxPipeArgs_ = 16; - - info_.queueOnDeviceProperties_ = - CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE; - info_.queueOnDevicePreferredSize_ = 256 * Ki; - info_.queueOnDeviceMaxSize_ = 8 * Mi; - info_.maxOnDeviceQueues_ = 1; - info_.maxOnDeviceEvents_ = settings().numDeviceEvents_; - info_.globalVariablePreferredTotalSize_ = 
static_cast(info_.globalMemSize_); - //! \todo Remove % calculation. - //! Use 90% of max single alloc size. - //! Boards with max single alloc size around 4GB will fail allocations - info_.maxGlobalVariableSize_ = static_cast( - amd::alignDown(info_.maxMemAllocSize_ * 9 / 10, 256)); - } - - if (settings().checkExtension(ClAmdDeviceAttributeQuery)) { - info_.simdPerCU_ = hwInfo()->simdPerCU_; - info_.simdWidth_ = hwInfo()->simdWidth_; - info_.simdInstructionWidth_ = hwInfo()->simdInstructionWidth_; - info_.wavefrontWidth_ = palProp.gfxipProperties.shaderCore.wavefrontSize; - info_.globalMemChannels_ = palProp.gpuMemoryProperties.performance.vramBusBitWidth / 32; - info_.globalMemChannelBanks_ = 4; - info_.globalMemChannelBankWidth_ = hwInfo()->memChannelBankWidth_; - info_.localMemSizePerCU_ = hwInfo()->localMemSizePerCU_; - info_.localMemBanks_ = hwInfo()->localMemBanks_; - info_.gfxipVersion_ = hwInfo()->gfxipVersion_; - info_.numAsyncQueues_ = numComputeRings; - info_.numRTQueues_ = numExclusiveComputeRings; - info_.numRTCUs_ = - palProp.engineProperties[Pal::EngineTypeExclusiveCompute].maxNumDedicatedCu; - info_.threadTraceEnable_ = settings().threadTraceEnable_; - } + if (settings().checkExtension(ClAmdDeviceAttributeQuery)) { + info_.simdPerCU_ = hwInfo()->simdPerCU_; + info_.simdWidth_ = hwInfo()->simdWidth_; + info_.simdInstructionWidth_ = hwInfo()->simdInstructionWidth_; + info_.wavefrontWidth_ = palProp.gfxipProperties.shaderCore.wavefrontSize; + info_.globalMemChannels_ = palProp.gpuMemoryProperties.performance.vramBusBitWidth / 32; + info_.globalMemChannelBanks_ = 4; + info_.globalMemChannelBankWidth_ = hwInfo()->memChannelBankWidth_; + info_.localMemSizePerCU_ = hwInfo()->localMemSizePerCU_; + info_.localMemBanks_ = hwInfo()->localMemBanks_; + info_.gfxipVersion_ = hwInfo()->gfxipVersion_; + info_.numAsyncQueues_ = numComputeRings; + info_.numRTQueues_ = numExclusiveComputeRings; + info_.numRTCUs_ = 
palProp.engineProperties[Pal::EngineTypeExclusiveCompute].maxNumDedicatedCu; + info_.threadTraceEnable_ = settings().threadTraceEnable_; + } } -Device::XferBuffers::~XferBuffers() -{ - // Destroy temporary buffer for reads - for (const auto& buf : freeBuffers_) { - // CPU optimization: unmap staging buffer just once - if (!buf->desc().cardMemory_) { - buf->unmap(nullptr); - } - delete buf; +Device::XferBuffers::~XferBuffers() { + // Destroy temporary buffer for reads + for (const auto& buf : freeBuffers_) { + // CPU optimization: unmap staging buffer just once + if (!buf->desc().cardMemory_) { + buf->unmap(nullptr); } - freeBuffers_.clear(); + delete buf; + } + freeBuffers_.clear(); } -bool -Device::XferBuffers::create() -{ - Memory* xferBuf = nullptr; - bool result = false; - // Create a buffer object +bool Device::XferBuffers::create() { + Memory* xferBuf = nullptr; + bool result = false; + // Create a buffer object + xferBuf = new Memory(dev(), bufSize_); + + // Try to allocate memory for the transfer buffer + if ((nullptr == xferBuf) || !xferBuf->create(type_)) { + delete xferBuf; + xferBuf = nullptr; + LogError("Couldn't allocate a transfer buffer!"); + } else { + result = true; + freeBuffers_.push_back(xferBuf); + // CPU optimization: map staging buffer just once + if (!xferBuf->desc().cardMemory_) { + xferBuf->map(nullptr); + } + } + + return result; +} + +Memory& Device::XferBuffers::acquire() { + Memory* xferBuf = nullptr; + size_t listSize; + + // Lock the operations with the staged buffer list + amd::ScopedLock l(lock_); + listSize = freeBuffers_.size(); + + // If the list is empty, then attempt to allocate a staged buffer + if (listSize == 0) { + // Allocate memory xferBuf = new Memory(dev(), bufSize_); - // Try to allocate memory for the transfer buffer + // Allocate memory for the transfer buffer if ((nullptr == xferBuf) || !xferBuf->create(type_)) { - delete xferBuf; - xferBuf = nullptr; - LogError("Couldn't allocate a transfer buffer!"); - } - else 
{ - result = true; - freeBuffers_.push_back(xferBuf); - // CPU optimization: map staging buffer just once - if (!xferBuf->desc().cardMemory_) { - xferBuf->map(nullptr); - } + delete xferBuf; + xferBuf = nullptr; + LogError("Couldn't allocate a transfer buffer!"); + } else { + ++acquiredCnt_; + // CPU optimization: map staging buffer just once + if (!xferBuf->desc().cardMemory_) { + xferBuf->map(nullptr); + } } + } - return result; + if (xferBuf == nullptr) { + xferBuf = *(freeBuffers_.begin()); + freeBuffers_.erase(freeBuffers_.begin()); + ++acquiredCnt_; + } + + return *xferBuf; } -Memory& -Device::XferBuffers::acquire() -{ - Memory* xferBuf = nullptr; - size_t listSize; - - // Lock the operations with the staged buffer list - amd::ScopedLock l(lock_); - listSize = freeBuffers_.size(); - - // If the list is empty, then attempt to allocate a staged buffer - if (listSize == 0) { - // Allocate memory - xferBuf = new Memory(dev(), bufSize_); - - // Allocate memory for the transfer buffer - if ((nullptr == xferBuf) || !xferBuf->create(type_)) { - delete xferBuf; - xferBuf = nullptr; - LogError("Couldn't allocate a transfer buffer!"); - } - else { - ++acquiredCnt_; - // CPU optimization: map staging buffer just once - if (!xferBuf->desc().cardMemory_) { - xferBuf->map(nullptr); - } - } - } - - if (xferBuf == nullptr) { - xferBuf = *(freeBuffers_.begin()); - freeBuffers_.erase(freeBuffers_.begin()); - ++acquiredCnt_; - } - - return *xferBuf; -} - -void -Device::XferBuffers::release(VirtualGPU& gpu, Memory& buffer) -{ - // Make sure buffer isn't busy on the current VirtualGPU, because - // the next aquire can come from different queue - buffer.wait(gpu); - // Lock the operations with the staged buffer list - amd::ScopedLock l(lock_); - freeBuffers_.push_back(&buffer); - --acquiredCnt_; +void Device::XferBuffers::release(VirtualGPU& gpu, Memory& buffer) { + // Make sure buffer isn't busy on the current VirtualGPU, because + // the next aquire can come from different queue 
+ buffer.wait(gpu); + // Lock the operations with the staged buffer list + amd::ScopedLock l(lock_); + freeBuffers_.push_back(&buffer); + --acquiredCnt_; } -Device::ScopedLockVgpus::ScopedLockVgpus(const Device& dev) - : dev_(dev) -{ - // Lock the virtual GPU list - dev_.vgpusAccess()->lock(); +Device::ScopedLockVgpus::ScopedLockVgpus(const Device& dev) : dev_(dev) { + // Lock the virtual GPU list + dev_.vgpusAccess()->lock(); - // Find all available virtual GPUs and lock them - // from the execution of commands - for (uint idx = 0; idx < dev_.vgpus().size(); ++idx) { - dev_.vgpus()[idx]->execution().lock(); - } + // Find all available virtual GPUs and lock them + // from the execution of commands + for (uint idx = 0; idx < dev_.vgpus().size(); ++idx) { + dev_.vgpus()[idx]->execution().lock(); + } } -Device::ScopedLockVgpus::~ScopedLockVgpus() -{ - // Find all available virtual GPUs and unlock them - // for the execution of commands - for (uint idx = 0; idx < dev_.vgpus().size(); ++idx) { - dev_.vgpus()[idx]->execution().unlock(); - } +Device::ScopedLockVgpus::~ScopedLockVgpus() { + // Find all available virtual GPUs and unlock them + // for the execution of commands + for (uint idx = 0; idx < dev_.vgpus().size(); ++idx) { + dev_.vgpus()[idx]->execution().unlock(); + } - // Unock the virtual GPU list - dev_.vgpusAccess()->unlock(); + // Unock the virtual GPU list + dev_.vgpusAccess()->unlock(); } Device::Device() - : NullDevice() - , numOfVgpus_(0) - , context_(nullptr) - , lockAsyncOps_(nullptr) - , lockForInitHeap_(nullptr) - , lockPAL_(nullptr) - , vgpusAccess_(nullptr) - , scratchAlloc_(nullptr) - , mapCacheOps_(nullptr) - , xferRead_(nullptr) - , xferWrite_(nullptr) - , mapCache_(nullptr) - , resourceCache_(nullptr) - , numComputeEngines_(0) - , numExclusiveComputeEngines_(0) - , numDmaEngines_(0) - , heapInitComplete_(false) - , xferQueue_(nullptr) - , globalScratchBuf_(nullptr) - , srdManager_(nullptr) -{ -} + : NullDevice(), + numOfVgpus_(0), + 
context_(nullptr), + lockAsyncOps_(nullptr), + lockForInitHeap_(nullptr), + lockPAL_(nullptr), + vgpusAccess_(nullptr), + scratchAlloc_(nullptr), + mapCacheOps_(nullptr), + xferRead_(nullptr), + xferWrite_(nullptr), + mapCache_(nullptr), + resourceCache_(nullptr), + numComputeEngines_(0), + numExclusiveComputeEngines_(0), + numDmaEngines_(0), + heapInitComplete_(false), + xferQueue_(nullptr), + globalScratchBuf_(nullptr), + srdManager_(nullptr) {} -Device::~Device() -{ - // remove the HW debug manager - delete hwDebugMgr_; - hwDebugMgr_ = nullptr; +Device::~Device() { + // remove the HW debug manager + delete hwDebugMgr_; + hwDebugMgr_ = nullptr; - delete srdManager_; + delete srdManager_; - for (uint s = 0; s < scratch_.size(); ++s) { - delete scratch_[s]; - scratch_[s] = nullptr; + for (uint s = 0; s < scratch_.size(); ++s) { + delete scratch_[s]; + scratch_[s] = nullptr; + } + + delete globalScratchBuf_; + globalScratchBuf_ = nullptr; + + // Destroy transfer queue + delete xferQueue_; + + // Destroy blit program + delete blitProgram_; + + // Release cached map targets + for (uint i = 0; mapCache_ != nullptr && i < mapCache_->size(); ++i) { + if ((*mapCache_)[i] != nullptr) { + (*mapCache_)[i]->release(); } + } + delete mapCache_; - delete globalScratchBuf_; - globalScratchBuf_ = nullptr; + // Destroy temporary buffers for read/write + delete xferRead_; + delete xferWrite_; - // Destroy transfer queue - delete xferQueue_; + // Destroy resource cache + delete resourceCache_; - // Destroy blit program - delete blitProgram_; + delete lockAsyncOps_; + delete lockForInitHeap_; + delete lockPAL_; + delete vgpusAccess_; + delete scratchAlloc_; + delete mapCacheOps_; - // Release cached map targets - for (uint i = 0; mapCache_ != nullptr && i < mapCache_->size(); ++i) { - if ((*mapCache_)[i] != nullptr) { - (*mapCache_)[i]->release(); - } - } - delete mapCache_; + if (context_ != nullptr) { + context_->release(); + } - // Destroy temporary buffers for read/write - delete 
xferRead_; - delete xferWrite_; - - // Destroy resource cache - delete resourceCache_; - - delete lockAsyncOps_; - delete lockForInitHeap_; - delete lockPAL_; - delete vgpusAccess_; - delete scratchAlloc_; - delete mapCacheOps_; - - if (context_ != nullptr) { - context_->release(); - } - - device_ = nullptr; + device_ = nullptr; } extern const char* SchedulerSourceCode; -bool -Device::create(Pal::IDevice* device) -{ - if (!amd::Device::create()) { - return false; +bool Device::create(Pal::IDevice* device) { + if (!amd::Device::create()) { + return false; + } + + appProfile_.init(); + device_ = device; + Pal::Result result; + + // Retrive device properties + result = iDev()->GetProperties(&properties_); + + // Save the IP level for the offline detection + ipLevel_ = properties().gfxLevel; + asicRevision_ = properties().revision; + + // Update HW info for the device + if ((GPU_ENABLE_PAL == 1) && (properties().revision <= Pal::AsicRevision::Baffin)) { + hwInfo_ = &DeviceInfo[static_cast(properties().revision)]; + } else if (ipLevel_ >= Pal::GfxIpLevel::GfxIp9) { + if (properties().gpuType == Pal::GpuType::Integrated) { + hwInfo_ = &Gfx901DeviceInfo; + } else { + hwInfo_ = &GfxIpDeviceInfo[static_cast(ipLevel_)]; } + } else { + return false; + } - appProfile_.init(); - device_ = device; - Pal::Result result; - - // Retrive device properties - result = iDev()->GetProperties(&properties_); - - // Save the IP level for the offline detection - ipLevel_ = properties().gfxLevel; - asicRevision_ = properties().revision; - - // Update HW info for the device - if ((GPU_ENABLE_PAL == 1) && (properties().revision <= Pal::AsicRevision::Baffin)) { - hwInfo_ = &DeviceInfo[static_cast(properties().revision)]; + // Find the number of available engines + numComputeEngines_ = properties().engineProperties[Pal::EngineTypeCompute].engineCount; + if (properties().engineProperties[Pal::EngineTypeExclusiveCompute].maxNumDedicatedCu > 0) { + for (uint i = 0; i < 
properties().engineProperties[Pal::EngineTypeExclusiveCompute].engineCount; + ++i) { + if ((properties().engineProperties[Pal::EngineTypeExclusiveCompute].engineSubType[i] == + Pal::EngineSubType::RtCuHighCompute) || + (properties().engineProperties[Pal::EngineTypeExclusiveCompute].engineSubType[i] == + Pal::EngineSubType::RtCuMedCompute)) { + numExclusiveComputeEngines_++; + } } - else if (ipLevel_ >= Pal::GfxIpLevel::GfxIp9) { - if (properties().gpuType == Pal::GpuType::Integrated) { - hwInfo_ = &Gfx901DeviceInfo; - } - else { - hwInfo_ = &GfxIpDeviceInfo[static_cast(ipLevel_)]; - } - } - else { - return false; + } + numDmaEngines_ = properties().engineProperties[Pal::EngineTypeDma].engineCount; + + // Creates device settings + settings_ = new pal::Settings(); + Pal::PalPublicSettings* const palSettings = iDev()->GetPublicSettings(); + // Modify settings here + // palSettings ... + palSettings->textureOptLevel = Pal::TextureFilterOptimizationsDisabled; + palSettings->forceHighClocks = appProfile_.enableHighPerformanceState(); + palSettings->longRunningSubmissions = true; + palSettings->cmdBufBatchedSubmitChainLimit = 0; + palSettings->disableResourceProcessingManager = true; + palSettings->disableScManager = true; + palSettings->numScratchWavesPerCu = settings().numScratchWavesPerCu_; + + // Commit the new settings for the device + result = iDev()->CommitSettingsAndInit(); + if (result == Pal::Result::Success) { + Pal::DeviceFinalizeInfo finalizeInfo = {}; + + // Request all compute engines + finalizeInfo.requestedEngineCounts[Pal::EngineTypeCompute].engines = + ((1 << numComputeEngines_) - 1); + // Request real time compute engines + finalizeInfo.requestedEngineCounts[Pal::EngineTypeExclusiveCompute].engines = + ((1 << numExclusiveComputeEngines_) - 1); + // Request all SDMA engines + finalizeInfo.requestedEngineCounts[Pal::EngineTypeDma].engines = (1 << numDmaEngines_) - 1; + + result = iDev()->Finalize(finalizeInfo); + if (result != Pal::Result::Success) { + 
return false; } + } - // Find the number of available engines - numComputeEngines_ = - properties().engineProperties[Pal::EngineTypeCompute].engineCount; - if (properties().engineProperties[Pal::EngineTypeExclusiveCompute]. - maxNumDedicatedCu > 0) { - for (uint i = 0; i < properties().engineProperties[ - Pal::EngineTypeExclusiveCompute].engineCount; ++i) { - if ((properties().engineProperties[ - Pal::EngineTypeExclusiveCompute].engineSubType[i] == - Pal::EngineSubType::RtCuHighCompute) || - (properties().engineProperties[ - Pal::EngineTypeExclusiveCompute].engineSubType[i] == - Pal::EngineSubType::RtCuMedCompute)) { - numExclusiveComputeEngines_++; - } - } - } - numDmaEngines_ = - properties().engineProperties[Pal::EngineTypeDma].engineCount; + Pal::GpuMemoryHeapProperties heaps[Pal::GpuHeapCount]; + iDev()->GetGpuMemoryHeapProperties(heaps); - // Creates device settings - settings_ = new pal::Settings(); - Pal::PalPublicSettings*const palSettings = iDev()->GetPublicSettings(); - // Modify settings here - // palSettings ... 
- palSettings->textureOptLevel = Pal::TextureFilterOptimizationsDisabled; - palSettings->forceHighClocks = appProfile_.enableHighPerformanceState(); - palSettings->longRunningSubmissions = true; - palSettings->cmdBufBatchedSubmitChainLimit = 0; - palSettings->disableResourceProcessingManager = true; - palSettings->disableScManager = true; - palSettings->numScratchWavesPerCu = settings().numScratchWavesPerCu_; + Pal::WorkStationCaps wscaps = {}; + iDev()->QueryWorkStationCaps(&wscaps); - // Commit the new settings for the device - result = iDev()->CommitSettingsAndInit(); - if (result == Pal::Result::Success) { - Pal::DeviceFinalizeInfo finalizeInfo = {}; + pal::Settings* gpuSettings = reinterpret_cast(settings_); + if ((gpuSettings == nullptr) || + !gpuSettings->create(properties(), heaps, wscaps, appProfile_.reportAsOCL12Device())) { + return false; + } + numComputeEngines_ = std::min(numComputeEngines_, settings().numComputeRings_); - // Request all compute engines - finalizeInfo.requestedEngineCounts[Pal::EngineTypeCompute].engines = - ((1 << numComputeEngines_) - 1); - // Request real time compute engines - finalizeInfo.requestedEngineCounts[Pal::EngineTypeExclusiveCompute].engines = - ((1 << numExclusiveComputeEngines_) - 1); - // Request all SDMA engines - finalizeInfo.requestedEngineCounts[Pal::EngineTypeDma].engines = - (1 << numDmaEngines_) - 1; + amd::Context::Info info = {0}; + std::vector devices; + devices.push_back(this); - result = iDev()->Finalize(finalizeInfo); - if (result != Pal::Result::Success) { - return false; - } - } + // Create a dummy context + context_ = new amd::Context(devices, info); + if (context_ == nullptr) { + return false; + } - Pal::GpuMemoryHeapProperties heaps[Pal::GpuHeapCount]; - iDev()->GetGpuMemoryHeapProperties(heaps); + // Create the locks + lockAsyncOps_ = new amd::Monitor("Device Async Ops Lock", true); + if (nullptr == lockAsyncOps_) { + return false; + } + lockPAL_ = new amd::Monitor("PAL Ops Lock", true); + if 
(nullptr == lockPAL_) { + return false; + } - Pal::WorkStationCaps wscaps = {}; - iDev()->QueryWorkStationCaps(&wscaps); + lockForInitHeap_ = new amd::Monitor("Async Ops Lock For Initialization of Heap Resource", true); + if (nullptr == lockForInitHeap_) { + return false; + } - pal::Settings* gpuSettings = reinterpret_cast(settings_); - if ((gpuSettings == nullptr) || !gpuSettings->create(properties(), heaps, - wscaps, appProfile_.reportAsOCL12Device())) { - return false; - } - numComputeEngines_ = std::min(numComputeEngines_, settings().numComputeRings_); + vgpusAccess_ = new amd::Monitor("Virtual GPU List Ops Lock", true); + if (nullptr == vgpusAccess_) { + return false; + } - amd::Context::Info info = {0}; - std::vector devices; - devices.push_back(this); + scratchAlloc_ = new amd::Monitor("Scratch Allocation Lock", true); + if (nullptr == scratchAlloc_) { + return false; + } - // Create a dummy context - context_ = new amd::Context(devices, info); - if (context_ == nullptr) { - return false; - } + mapCacheOps_ = new amd::Monitor("Map Cache Lock", true); + if (nullptr == mapCacheOps_) { + return false; + } - // Create the locks - lockAsyncOps_ = new amd::Monitor("Device Async Ops Lock", true); - if (nullptr == lockAsyncOps_) { - return false; - } - lockPAL_ = new amd::Monitor("PAL Ops Lock", true); - if (nullptr == lockPAL_) { - return false; - } + mapCache_ = new std::vector(); + if (mapCache_ == nullptr) { + return false; + } + // Use just 1 entry by default for the map cache + mapCache_->push_back(nullptr); - lockForInitHeap_ = new amd::Monitor("Async Ops Lock For Initialization of Heap Resource", true); - if (nullptr == lockForInitHeap_) { - return false; - } + size_t resourceCacheSize = settings().resourceCacheSize_; + // Create resource cache. 
+ // \note Cache must be created before any resource creation to avoid nullptr check + resourceCache_ = new ResourceCache(resourceCacheSize); + if (nullptr == resourceCache_) { + return false; + } - vgpusAccess_ = new amd::Monitor("Virtual GPU List Ops Lock", true); - if (nullptr == vgpusAccess_) { - return false; - } - - scratchAlloc_ = new amd::Monitor("Scratch Allocation Lock", true); - if (nullptr == scratchAlloc_) { - return false; - } - - mapCacheOps_ = new amd::Monitor("Map Cache Lock", true); - if (nullptr == mapCacheOps_) { - return false; - } - - mapCache_ = new std::vector(); - if (mapCache_ == nullptr) { - return false; - } - // Use just 1 entry by default for the map cache - mapCache_->push_back(nullptr); - - size_t resourceCacheSize = settings().resourceCacheSize_; - // Create resource cache. - // \note Cache must be created before any resource creation to avoid nullptr check - resourceCache_ = new ResourceCache(resourceCacheSize); - if (nullptr == resourceCache_) { - return false; - } - - // Fill the device info structure - fillDeviceInfo(properties(), heaps, 16*Ki, numComputeEngines(), numExclusiveComputeEngines()); + // Fill the device info structure + fillDeviceInfo(properties(), heaps, 16 * Ki, numComputeEngines(), numExclusiveComputeEngines()); #ifdef DEBUG - std::stringstream message; - message << info_.name_; - if (settings().remoteAlloc_) { - message << ": Using *Remote* memory"; - } - else { - message << ": Using *Local* memory"; - } + std::stringstream message; + message << info_.name_; + if (settings().remoteAlloc_) { + message << ": Using *Remote* memory"; + } else { + message << ": Using *Local* memory"; + } - message << std::endl; - LogInfo(message.str().c_str()); -#endif // DEBUG + message << std::endl; + LogInfo(message.str().c_str()); +#endif // DEBUG - for (uint i = 0; i < Pal::GpuHeap::GpuHeapCount; ++i) { - freeMem[i] = heaps[i].heapSize; - } + for (uint i = 0; i < Pal::GpuHeap::GpuHeapCount; ++i) { + freeMem[i] = 
heaps[i].heapSize; + } - // Allocate SRD manager - srdManager_ = new SrdManager(*this, - std::max(HsaImageObjectSize, HsaSamplerObjectSize), 64 * Ki); - if (srdManager_ == nullptr) { - return false; - } + // Allocate SRD manager + srdManager_ = new SrdManager(*this, std::max(HsaImageObjectSize, HsaSamplerObjectSize), 64 * Ki); + if (srdManager_ == nullptr) { + return false; + } - // create the HW debug manager if needed - if (settings().enableHwDebug_) { - hwDebugMgr_ = new GpuDebugManager(this); - } + // create the HW debug manager if needed + if (settings().enableHwDebug_) { + hwDebugMgr_ = new GpuDebugManager(this); + } #if defined(WITH_LIGHTNING_COMPILER) - // create compilation object with cache support - int gfxipMajor = hwInfo()->gfxipVersion_ / 100; - int gfxipMinor = hwInfo()->gfxipVersion_ / 10 % 10; - int gfxipStepping = hwInfo()->gfxipVersion_ % 10; + // create compilation object with cache support + int gfxipMajor = hwInfo()->gfxipVersion_ / 100; + int gfxipMinor = hwInfo()->gfxipVersion_ / 10 % 10; + int gfxipStepping = hwInfo()->gfxipVersion_ % 10; - // Use compute capability as target (AMD:AMDGPU:major:minor:stepping) - // with dash as delimiter to be compatible with Windows directory name - std::ostringstream cacheTarget; - cacheTarget << "AMD-AMDGPU-" << gfxipMajor << "-" << gfxipMinor << "-" << gfxipStepping; + // Use compute capability as target (AMD:AMDGPU:major:minor:stepping) + // with dash as delimiter to be compatible with Windows directory name + std::ostringstream cacheTarget; + cacheTarget << "AMD-AMDGPU-" << gfxipMajor << "-" << gfxipMinor << "-" << gfxipStepping; - amd::CacheCompilation* compObj = new amd::CacheCompilation(cacheTarget.str(), - "_pal", - OCL_CODE_CACHE_ENABLE, - OCL_CODE_CACHE_RESET); - if (!compObj) { - LogError("Unable to create cache compilation object!"); - return false; - } + amd::CacheCompilation* compObj = new amd::CacheCompilation( + cacheTarget.str(), "_pal", OCL_CODE_CACHE_ENABLE, OCL_CODE_CACHE_RESET); + if 
(!compObj) { + LogError("Unable to create cache compilation object!"); + return false; + } - cacheCompilation_.reset(compObj); + cacheCompilation_.reset(compObj); #endif - return true; + return true; } -bool -Device::initializeHeapResources() -{ - amd::ScopedLock k(lockForInitHeap_); - if (!heapInitComplete_) { - heapInitComplete_ = true; +bool Device::initializeHeapResources() { + amd::ScopedLock k(lockForInitHeap_); + if (!heapInitComplete_) { + heapInitComplete_ = true; - scratch_.resize((settings().useSingleScratch_) ? - 1 : (numComputeEngines() ? numComputeEngines() : 1)); + scratch_.resize( + (settings().useSingleScratch_) ? 1 : (numComputeEngines() ? numComputeEngines() : 1)); - // Initialize the number of mem object for the scratch buffer - for (uint s = 0; s < scratch_.size(); ++s) { - scratch_[s] = new ScratchBuffer(); - if (nullptr == scratch_[s]) { - return false; - } - } - - if (settings().stagedXferSize_ != 0) { - // Initialize staged write buffers - if (settings().stagedXferWrite_) { - Resource::MemoryType type; - if (settings().stagingWritePersistent_ && !settings().disablePersistent_) { - type = Resource::Persistent; - } else { - type = Resource::RemoteUSWC; - } - xferWrite_ = new XferBuffers(*this, type, - amd::alignUp(settings().stagedXferSize_, 4 * Ki)); - if ((xferWrite_ == nullptr) || !xferWrite_->create()) { - LogError("Couldn't allocate transfer buffer objects for read"); - return false; - } - } - - // Initialize staged read buffers - if (settings().stagedXferRead_) { - xferRead_ = new XferBuffers(*this, Resource::Remote, - amd::alignUp(settings().stagedXferSize_, 4 * Ki)); - if ((xferRead_ == nullptr) || !xferRead_->create()) { - LogError("Couldn't allocate transfer buffer objects for write"); - return false; - } - } - } - - // Create a synchronized transfer queue - xferQueue_ = new VirtualGPU(*this); - if (!(xferQueue_ && xferQueue_->create( - false - ))) { - delete xferQueue_; - xferQueue_ = nullptr; - } - if (nullptr == xferQueue_) { - 
LogError("Couldn't create the device transfer manager!"); - return false; - } - xferQueue_->enableSyncedBlit(); + // Initialize the number of mem object for the scratch buffer + for (uint s = 0; s < scratch_.size(); ++s) { + scratch_[s] = new ScratchBuffer(); + if (nullptr == scratch_[s]) { + return false; + } } - return true; + + if (settings().stagedXferSize_ != 0) { + // Initialize staged write buffers + if (settings().stagedXferWrite_) { + Resource::MemoryType type; + if (settings().stagingWritePersistent_ && !settings().disablePersistent_) { + type = Resource::Persistent; + } else { + type = Resource::RemoteUSWC; + } + xferWrite_ = new XferBuffers(*this, type, amd::alignUp(settings().stagedXferSize_, 4 * Ki)); + if ((xferWrite_ == nullptr) || !xferWrite_->create()) { + LogError("Couldn't allocate transfer buffer objects for read"); + return false; + } + } + + // Initialize staged read buffers + if (settings().stagedXferRead_) { + xferRead_ = new XferBuffers(*this, Resource::Remote, + amd::alignUp(settings().stagedXferSize_, 4 * Ki)); + if ((xferRead_ == nullptr) || !xferRead_->create()) { + LogError("Couldn't allocate transfer buffer objects for write"); + return false; + } + } + } + + // Create a synchronized transfer queue + xferQueue_ = new VirtualGPU(*this); + if (!(xferQueue_ && xferQueue_->create(false))) { + delete xferQueue_; + xferQueue_ = nullptr; + } + if (nullptr == xferQueue_) { + LogError("Couldn't create the device transfer manager!"); + return false; + } + xferQueue_->enableSyncedBlit(); + } + return true; } -device::VirtualDevice* -Device::createVirtualDevice( - amd::CommandQueue* queue - ) -{ - bool profiling = false; - bool interopQueue = false; - uint rtCUs = amd::CommandQueue::RealTimeDisabled; - uint deviceQueueSize = 0; +device::VirtualDevice* Device::createVirtualDevice(amd::CommandQueue* queue) { + bool profiling = false; + bool interopQueue = false; + uint rtCUs = amd::CommandQueue::RealTimeDisabled; + uint deviceQueueSize = 0; - if 
(queue != nullptr) { - profiling = queue->properties().test(CL_QUEUE_PROFILING_ENABLE); - if (queue->asHostQueue() != nullptr) { - interopQueue = (0 != (queue->context().info().flags_ & - (amd::Context::GLDeviceKhr | - amd::Context::D3D10DeviceKhr | - amd::Context::D3D11DeviceKhr))); - rtCUs = queue->rtCUs(); - } - else if (queue->asDeviceQueue() != nullptr) { - deviceQueueSize = queue->asDeviceQueue()->size(); - } + if (queue != nullptr) { + profiling = queue->properties().test(CL_QUEUE_PROFILING_ENABLE); + if (queue->asHostQueue() != nullptr) { + interopQueue = (0 != (queue->context().info().flags_ & + (amd::Context::GLDeviceKhr | amd::Context::D3D10DeviceKhr | + amd::Context::D3D11DeviceKhr))); + rtCUs = queue->rtCUs(); + } else if (queue->asDeviceQueue() != nullptr) { + deviceQueueSize = queue->asDeviceQueue()->size(); } + } - // Not safe to add a queue. So lock the device - amd::ScopedLock k(lockAsyncOps()); - amd::ScopedLock lock(vgpusAccess()); + // Not safe to add a queue. So lock the device + amd::ScopedLock k(lockAsyncOps()); + amd::ScopedLock lock(vgpusAccess()); - // Initialization of heap and other resources occur during the command queue creation time. - if (!initializeHeapResources()) { - LogError("Heap initializaiton fails!"); - return nullptr; - } + // Initialization of heap and other resources occur during the command queue creation time. 
+ if (!initializeHeapResources()) { + LogError("Heap initializaiton fails!"); + return nullptr; + } - VirtualGPU* vgpu = new VirtualGPU(*this); - if (vgpu && vgpu->create(profiling, deviceQueueSize, rtCUs, queue->priority())) { - return vgpu; - } else { - delete vgpu; - return nullptr; - } + VirtualGPU* vgpu = new VirtualGPU(*this); + if (vgpu && vgpu->create(profiling, deviceQueueSize, rtCUs, queue->priority())) { + return vgpu; + } else { + delete vgpu; + return nullptr; + } } -device::Program* -Device::createProgram(amd::option::Options* options) -{ - device::Program* program; +device::Program* Device::createProgram(amd::option::Options* options) { + device::Program* program; #if defined(WITH_LIGHTNING_COMPILER) - program = new LightningProgram(*this); -#else // !defined(WITH_LIGHTNING_COMPILER) - program = new HSAILProgram(*this); -#endif // defined(WITH_LIGHTNING_COMPILER) - if (program == nullptr) { - LogError("We failed memory allocation for program!"); - } + program = new LightningProgram(*this); +#else // !defined(WITH_LIGHTNING_COMPILER) + program = new HSAILProgram(*this); +#endif // defined(WITH_LIGHTNING_COMPILER) + if (program == nullptr) { + LogError("We failed memory allocation for program!"); + } - return program; + return program; } //! Requested devices list as configured by the GPU_DEVICE_ORDINAL typedef std::map requestedDevices_t; //! Parses the requested list of devices to be exposed to the user. -static void -parseRequestedDeviceList(requestedDevices_t &requestedDevices) { - char *pch = nullptr; - int requestedDeviceCount = 0; - const char* requestedDeviceList = GPU_DEVICE_ORDINAL; +static void parseRequestedDeviceList(requestedDevices_t& requestedDevices) { + char* pch = nullptr; + int requestedDeviceCount = 0; + const char* requestedDeviceList = GPU_DEVICE_ORDINAL; - pch = strtok(const_cast(requestedDeviceList), ","); - while (pch != nullptr) { - bool deviceIdValid = true; - int currentDeviceIndex = atoi(pch); - // Validate device index. 
- for (size_t i = 0; i < strlen(pch); i++) { - if (!isdigit(pch[i])) { - deviceIdValid = false; - break; - } - } - if (currentDeviceIndex < 0) { - deviceIdValid = false; - } - // Get next token. - pch = strtok(nullptr, ","); - if (!deviceIdValid) { - continue; - } - - // Requested device is valid. - requestedDevices[currentDeviceIndex] = true; + pch = strtok(const_cast(requestedDeviceList), ","); + while (pch != nullptr) { + bool deviceIdValid = true; + int currentDeviceIndex = atoi(pch); + // Validate device index. + for (size_t i = 0; i < strlen(pch); i++) { + if (!isdigit(pch[i])) { + deviceIdValid = false; + break; + } } + if (currentDeviceIndex < 0) { + deviceIdValid = false; + } + // Get next token. + pch = strtok(nullptr, ","); + if (!deviceIdValid) { + continue; + } + + // Requested device is valid. + requestedDevices[currentDeviceIndex] = true; + } } -#if defined(_WIN32) && defined (DEBUG) +#if defined(_WIN32) && defined(DEBUG) #include #include -static int reportHook(int reportType, char *message, int *returnValue) -{ - fprintf(stderr, "%s", message); - ::exit(3); - return 1; +static int reportHook(int reportType, char* message, int* returnValue) { + fprintf(stderr, "%s", message); + ::exit(3); + return 1; } -#endif // _WIN32 & DEBUG +#endif // _WIN32 & DEBUG static char* platformObj; static Pal::IPlatform* platform; -bool -Device::init() -{ - uint32_t numDevices = 0; - bool useDeviceList = false; - requestedDevices_t requestedDevices; +bool Device::init() { + uint32_t numDevices = 0; + bool useDeviceList = false; + requestedDevices_t requestedDevices; #if !defined(WITH_LIGHTNING_COMPILER) - const char* library = getenv("HSA_COMPILER_LIBRARY"); - aclCompilerOptions opts = { - sizeof(aclCompilerOptions_0_8), - library, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - AMD_OCL_SC_LIB - }; - // Initialize the compiler handle - acl_error error; - compiler_ = aclCompilerInit(&opts, &error); - if (error != ACL_SUCCESS) { - LogError("Error initializing 
the compiler"); - return false; - } -#endif // !defined(WITH_LIGHTNING_COMPILER) + const char* library = getenv("HSA_COMPILER_LIBRARY"); + aclCompilerOptions opts = {sizeof(aclCompilerOptions_0_8), + library, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + AMD_OCL_SC_LIB}; + // Initialize the compiler handle + acl_error error; + compiler_ = aclCompilerInit(&opts, &error); + if (error != ACL_SUCCESS) { + LogError("Error initializing the compiler"); + return false; + } +#endif // !defined(WITH_LIGHTNING_COMPILER) - size_t size = Pal::GetPlatformSize(); - platformObj = new char[size]; - Pal::PlatformCreateInfo info = {}; - info.flags.disableGpuTimeout = true; + size_t size = Pal::GetPlatformSize(); + platformObj = new char[size]; + Pal::PlatformCreateInfo info = {}; + info.flags.disableGpuTimeout = true; #ifdef ATI_BITS_32 - info.flags.force32BitVaSpace = true; - info.flags.enableSvmMode = false; + info.flags.force32BitVaSpace = true; + info.flags.enableSvmMode = false; #else - info.flags.enableSvmMode = true; + info.flags.enableSvmMode = true; #endif - info.pSettingsPath = "OCL"; - info.maxSvmSize = static_cast(OCL_SET_SVM_SIZE * Mi); + info.pSettingsPath = "OCL"; + info.maxSvmSize = static_cast(OCL_SET_SVM_SIZE * Mi); - // PAL init - if (Pal::Result::Success != - Pal::CreatePlatform(info, platformObj, &platform)) { - return false; - } + // PAL init + if (Pal::Result::Success != Pal::CreatePlatform(info, platformObj, &platform)) { + return false; + } - // Get the total number of active devices - // Count up all the devices in the system. - Pal::IDevice* deviceList[Pal::MaxDevices] = {}; - platform->EnumerateDevices(&numDevices, &deviceList[0]); + // Get the total number of active devices + // Count up all the devices in the system. 
+ Pal::IDevice* deviceList[Pal::MaxDevices] = {}; + platform->EnumerateDevices(&numDevices, &deviceList[0]); - uint ordinal = 0; - const char* selectDeviceByName = nullptr; - if (!flagIsDefault(GPU_DEVICE_ORDINAL)) { - useDeviceList = true; - parseRequestedDeviceList(requestedDevices); - } - else if (!flagIsDefault(GPU_DEVICE_NAME)) { - selectDeviceByName = GPU_DEVICE_NAME; - } + uint ordinal = 0; + const char* selectDeviceByName = nullptr; + if (!flagIsDefault(GPU_DEVICE_ORDINAL)) { + useDeviceList = true; + parseRequestedDeviceList(requestedDevices); + } else if (!flagIsDefault(GPU_DEVICE_NAME)) { + selectDeviceByName = GPU_DEVICE_NAME; + } - // Loop through all active devices and initialize the device info structure - for (; ordinal < numDevices; ++ordinal) { - // Create the GPU device object - Device *d = new Device(); - bool result = (nullptr != d) && d->create(deviceList[ordinal]); - if (useDeviceList) { - result &= (requestedDevices.find(ordinal) != requestedDevices.end()); - } - if (result && - ((nullptr == selectDeviceByName) || ('\0' == selectDeviceByName[0]) || - (strstr(selectDeviceByName, d->info().name_) != nullptr))) { - d->registerDevice(); - } - else { - delete d; - } + // Loop through all active devices and initialize the device info structure + for (; ordinal < numDevices; ++ordinal) { + // Create the GPU device object + Device* d = new Device(); + bool result = (nullptr != d) && d->create(deviceList[ordinal]); + if (useDeviceList) { + result &= (requestedDevices.find(ordinal) != requestedDevices.end()); } - return true; + if (result && ((nullptr == selectDeviceByName) || ('\0' == selectDeviceByName[0]) || + (strstr(selectDeviceByName, d->info().name_) != nullptr))) { + d->registerDevice(); + } else { + delete d; + } + } + return true; } -void -Device::tearDown() -{ - platform->Destroy(); - delete platformObj; +void Device::tearDown() { + platform->Destroy(); + delete platformObj; #if !defined(WITH_LIGHTNING_COMPILER) - if (compiler_ != nullptr) 
{ - aclCompilerFini(compiler_); + if (compiler_ != nullptr) { + aclCompilerFini(compiler_); + } +#endif // !defined(WITH_LIGHTNING_COMPILER) +} + +Memory* Device::getGpuMemory(amd::Memory* mem) const { + return static_cast(mem->getDeviceMemory(*this)); +} + +const device::BlitManager& Device::xferMgr() const { return xferQueue_->blitMgr(); } + +Pal::ChNumFormat Device::getPalFormat(const amd::Image::Format& format, + Pal::ChannelMapping* channel) const { + // Find PAL format + for (uint i = 0; i < sizeof(MemoryFormatMap) / sizeof(MemoryFormat); ++i) { + if ((format.image_channel_data_type == MemoryFormatMap[i].clFormat_.image_channel_data_type) && + (format.image_channel_order == MemoryFormatMap[i].clFormat_.image_channel_order)) { + *channel = MemoryFormatMap[i].palChannel_; + return MemoryFormatMap[i].palFormat_; } -#endif // !defined(WITH_LIGHTNING_COMPILER) -} - -Memory* -Device::getGpuMemory(amd::Memory* mem) const -{ - return static_cast(mem->getDeviceMemory(*this)); -} - -const device::BlitManager& -Device::xferMgr() const -{ - return xferQueue_->blitMgr(); -} - -Pal::ChNumFormat -Device::getPalFormat(const amd::Image::Format& format, Pal::ChannelMapping* channel) const -{ - // Find PAL format - for (uint i = 0; i < sizeof(MemoryFormatMap) / sizeof(MemoryFormat); ++i) { - if ((format.image_channel_data_type == - MemoryFormatMap[i].clFormat_.image_channel_data_type) && - (format.image_channel_order == - MemoryFormatMap[i].clFormat_.image_channel_order)) { - *channel = MemoryFormatMap[i].palChannel_; - return MemoryFormatMap[i].palFormat_; - } - } - assert(!"We didn't find PAL resource format!"); - *channel = MemoryFormatMap[0].palChannel_; - return MemoryFormatMap[0].palFormat_; + } + assert(!"We didn't find PAL resource format!"); + *channel = MemoryFormatMap[0].palChannel_; + return MemoryFormatMap[0].palFormat_; } // Create buffer without an owner (merge common code with createBuffer() ?) 
-pal::Memory* -Device::createScratchBuffer(size_t size) const -{ - Memory* gpuMemory = nullptr; +pal::Memory* Device::createScratchBuffer(size_t size) const { + Memory* gpuMemory = nullptr; - // Create a memory object - gpuMemory = new pal::Memory(*this, size); - if (nullptr == gpuMemory || !gpuMemory->create(Resource::Local)) { - delete gpuMemory; - gpuMemory = nullptr; - } + // Create a memory object + gpuMemory = new pal::Memory(*this, size); + if (nullptr == gpuMemory || !gpuMemory->create(Resource::Local)) { + delete gpuMemory; + gpuMemory = nullptr; + } - return gpuMemory; + return gpuMemory; } -pal::Memory* -Device::createBuffer( - amd::Memory& owner, - bool directAccess) const -{ - size_t size = owner.getSize(); - pal::Memory* gpuMemory; +pal::Memory* Device::createBuffer(amd::Memory& owner, bool directAccess) const { + size_t size = owner.getSize(); + pal::Memory* gpuMemory; - // Create resource - bool result = false; + // Create resource + bool result = false; - if (owner.getType() == CL_MEM_OBJECT_PIPE) { - // directAccess isnt needed as Pipes shouldnt be host accessible for GPU - directAccess = false; + if (owner.getType() == CL_MEM_OBJECT_PIPE) { + // directAccess isnt needed as Pipes shouldnt be host accessible for GPU + directAccess = false; + } + + if (nullptr != owner.parent()) { + pal::Memory* gpuParent = getGpuMemory(owner.parent()); + if (nullptr == gpuParent) { + LogError("Can't get the owner object for subbuffer allocation"); + return nullptr; } - if (nullptr != owner.parent()) { - pal::Memory* gpuParent = getGpuMemory(owner.parent()); - if (nullptr == gpuParent) { - LogError("Can't get the owner object for subbuffer allocation"); + if (nullptr != owner.parent()->getSvmPtr()) { + amd::Memory* amdParent = owner.parent(); + { + // Lock memory object, so only one commitment will occur + amd::ScopedLock lock(amdParent->lockMemoryOps()); + amdParent->commitSvmMemory(); + amdParent->setHostMem(amdParent->getSvmPtr()); + } + // Ignore a possible 
pinning error. Runtime will fallback to SW emulation + bool ok = gpuParent->pinSystemMemory(amdParent->getHostMem(), amdParent->getSize()); + } + return gpuParent->createBufferView(owner); + } + + Resource::MemoryType type = + (owner.forceSysMemAlloc() || (owner.getMemFlags() & CL_MEM_SVM_FINE_GRAIN_BUFFER)) + ? Resource::Remote + : Resource::Local; + + if (owner.getMemFlags() & CL_MEM_BUS_ADDRESSABLE_AMD) { + type = Resource::BusAddressable; + } else if (owner.getMemFlags() & CL_MEM_EXTERNAL_PHYSICAL_AMD) { + type = Resource::ExternalPhysical; + } + + // Use direct access if it's possible + bool remoteAlloc = false; + // Internal means VirtualDevice!=nullptr + bool internalAlloc = + ((owner.getMemFlags() & CL_MEM_USE_HOST_PTR) && (owner.getVirtualDevice() != nullptr)) + ? true + : false; + + // Create a memory object + gpuMemory = new pal::Buffer(*this, owner, owner.getSize()); + if (nullptr == gpuMemory) { + return nullptr; + } + + // Check if owner is interop memory + if (owner.isInterop()) { + result = gpuMemory->createInterop(Memory::InteropDirectAccess); + } else if (owner.getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) { + // Attempt to allocate from persistent heap + result = gpuMemory->create(Resource::Persistent); + } else if (directAccess || (type == Resource::Remote)) { + // Check for system memory allocations + if ((owner.getMemFlags() & (CL_MEM_ALLOC_HOST_PTR | CL_MEM_USE_HOST_PTR)) || + (settings().remoteAlloc_)) { + // Allocate remote memory if AHP allocation and context has just 1 device + if ((owner.getMemFlags() & CL_MEM_ALLOC_HOST_PTR) && + (owner.getContext().devices().size() == 1)) { + if (owner.getMemFlags() & + (CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) { + // GPU will be reading from this host memory buffer, + // so assume Host write into it + type = Resource::RemoteUSWC; + remoteAlloc = true; + } + } + // Make sure owner has a valid hostmem pointer and it's not COPY + if (!remoteAlloc && (owner.getHostMem() != 
nullptr)) { + Resource::PinnedParams params; + params.owner_ = &owner; + params.gpu_ = reinterpret_cast(owner.getVirtualDevice()); + + params.hostMemRef_ = owner.getHostMemRef(); + params.size_ = owner.getHostMemRef()->size(); + if (0 == params.size_) { + params.size_ = owner.getSize(); + } + // Create memory object + result = gpuMemory->create(Resource::Pinned, ¶ms); + + // If direct access failed + if (!result) { + // Don't use cached allocation + // if size is biger than max single alloc + if (owner.getSize() > info().maxMemAllocSize_) { + delete gpuMemory; return nullptr; + } } + } + } + } - if (nullptr != owner.parent()->getSvmPtr()) { - amd::Memory* amdParent = owner.parent(); - { - // Lock memory object, so only one commitment will occur - amd::ScopedLock lock(amdParent->lockMemoryOps()); - amdParent->commitSvmMemory(); - amdParent->setHostMem(amdParent->getSvmPtr()); - } - // Ignore a possible pinning error. Runtime will fallback to SW emulation - bool ok = gpuParent->pinSystemMemory(amdParent->getHostMem(), amdParent->getSize()); + if (!result && + // Make sure it's not internal alloc + !internalAlloc) { + Resource::CreateParams params; + params.owner_ = &owner; + params.gpu_ = static_cast(owner.getVirtualDevice()); + params.svmBase_ = static_cast(owner.svmBase()); + + // Create memory object + result = gpuMemory->create(type, ¶ms); + + // If allocation was successful + if (result) { + // Initialize if the memory is a pipe object + if (owner.getType() == CL_MEM_OBJECT_PIPE) { + // Pipe initialize in order read_idx, write_idx, end_idx. Refer clk_pipe_t structure. 
+ // Init with 3 DWORDS for 32bit addressing and 6 DWORDS for 64bit + size_t pipeInit[3] = {0, 0, owner.asPipe()->getMaxNumPackets()}; + static_cast(xferMgr()).writeRawData(*gpuMemory, sizeof(pipeInit), + pipeInit); + } + // If memory has direct access from host, then get CPU address + if (gpuMemory->isHostMemDirectAccess() && (type != Resource::ExternalPhysical)) { + void* address = gpuMemory->map(nullptr); + if (address != nullptr) { + // Copy saved memory + if (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) { + memcpy(address, owner.getHostMem(), owner.getSize()); + } + // It should be safe to change the host memory pointer, + // because it's lock protected from the upper caller + owner.setHostMem(address); + } else { + result = false; } - return gpuParent->createBufferView(owner); + } + // An optimization for CHP. Copy memory and destroy sysmem allocation + else if ((gpuMemory->memoryType() != Resource::Pinned) && + (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) && + (owner.getContext().devices().size() == 1)) { + amd::Coord3D origin(0, 0, 0); + amd::Coord3D region(owner.getSize()); + static const bool Entire = true; + if (xferMgr().writeBuffer(owner.getHostMem(), *gpuMemory, origin, region, Entire)) { + // Clear CHP memory + owner.setHostMem(nullptr); + } + } } + } - Resource::MemoryType type = (owner.forceSysMemAlloc() || - (owner.getMemFlags() & CL_MEM_SVM_FINE_GRAIN_BUFFER)) ? 
- Resource::Remote : Resource::Local; + if (!result) { + delete gpuMemory; + return nullptr; + } - if (owner.getMemFlags() & CL_MEM_BUS_ADDRESSABLE_AMD) { - type = Resource::BusAddressable; + return gpuMemory; +} + +pal::Memory* Device::createImage(amd::Memory& owner, bool directAccess) const { + size_t size = owner.getSize(); + amd::Image& image = *owner.asImage(); + pal::Memory* gpuImage = nullptr; + + if ((nullptr != owner.parent()) && (owner.parent()->asImage() != nullptr)) { + device::Memory* devParent = owner.parent()->getDeviceMemory(*this); + if (nullptr == devParent) { + LogError("Can't get the owner object for image view allocation"); + return nullptr; } - else if (owner.getMemFlags() & CL_MEM_EXTERNAL_PHYSICAL_AMD) { - type = Resource::ExternalPhysical; + // Create a view on the specified device + gpuImage = (pal::Memory*)createView(owner, *devParent); + if ((nullptr != gpuImage) && (gpuImage->owner() != nullptr)) { + gpuImage->owner()->setHostMem((address)(owner.parent()->getHostMem()) + + gpuImage->owner()->getOrigin()); } + return gpuImage; + } - // Use direct access if it's possible - bool remoteAlloc = false; - // Internal means VirtualDevice!=nullptr - bool internalAlloc = ((owner.getMemFlags() & CL_MEM_USE_HOST_PTR) && - (owner.getVirtualDevice() != nullptr)) ? 
true : false; + gpuImage = new pal::Image(*this, owner, image.getWidth(), image.getHeight(), image.getDepth(), + image.getImageFormat(), image.getType(), image.getMipLevels()); - // Create a memory object - gpuMemory = new pal::Buffer(*this, owner, owner.getSize()); - if (nullptr == gpuMemory) { - return nullptr; - } + // Create resource + if (nullptr != gpuImage) { + const bool imageBuffer = + ((owner.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) || + ((owner.getType() == CL_MEM_OBJECT_IMAGE2D) && (owner.parent() != nullptr) && + (owner.parent()->asBuffer() != nullptr))); + bool result = false; // Check if owner is interop memory if (owner.isInterop()) { - result = gpuMemory->createInterop(Memory::InteropDirectAccess); + result = gpuImage->createInterop(Memory::InteropDirectAccess); + } else if (imageBuffer) { + Resource::ImageBufferParams params; + pal::Memory* buffer = reinterpret_cast(image.parent()->getDeviceMemory(*this)); + if (buffer == nullptr) { + LogError("Buffer creation for ImageBuffer failed!"); + delete gpuImage; + return nullptr; + } + params.owner_ = &owner; + params.resource_ = buffer; + params.memory_ = buffer; + + // Create memory object + result = gpuImage->create(Resource::ImageBuffer, ¶ms); + } else if (directAccess && (owner.getMemFlags() & CL_MEM_ALLOC_HOST_PTR)) { + Resource::PinnedParams params; + params.owner_ = &owner; + params.hostMemRef_ = owner.getHostMemRef(); + params.size_ = owner.getHostMemRef()->size(); + + // Create memory object + result = gpuImage->create(Resource::Pinned, ¶ms); } - else if (owner.getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) { + + if (!result && !owner.isInterop()) { + if (owner.getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) { // Attempt to allocate from persistent heap - result = gpuMemory->create(Resource::Persistent); - } - else if (directAccess || (type == Resource::Remote)) { - // Check for system memory allocations - if ((owner.getMemFlags() & (CL_MEM_ALLOC_HOST_PTR | CL_MEM_USE_HOST_PTR)) - || 
(settings().remoteAlloc_)) { - // Allocate remote memory if AHP allocation and context has just 1 device - if ((owner.getMemFlags() & CL_MEM_ALLOC_HOST_PTR) && - (owner.getContext().devices().size() == 1)) { - if (owner.getMemFlags() & (CL_MEM_READ_ONLY | - CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) { - // GPU will be reading from this host memory buffer, - // so assume Host write into it - type = Resource::RemoteUSWC; - remoteAlloc = true; - } - } - // Make sure owner has a valid hostmem pointer and it's not COPY - if (!remoteAlloc && (owner.getHostMem() != nullptr)) { - Resource::PinnedParams params; - params.owner_ = &owner; - params.gpu_ = - reinterpret_cast(owner.getVirtualDevice()); - - params.hostMemRef_ = owner.getHostMemRef(); - params.size_ = owner.getHostMemRef()->size(); - if (0 == params.size_) { - params.size_ = owner.getSize(); - } - // Create memory object - result = gpuMemory->create(Resource::Pinned, ¶ms); - - // If direct access failed - if (!result) { - // Don't use cached allocation - // if size is biger than max single alloc - if (owner.getSize() > info().maxMemAllocSize_) { - delete gpuMemory; - return nullptr; - } - } - } - } - } - - if (!result && - // Make sure it's not internal alloc - !internalAlloc) { - Resource::CreateParams params; - params.owner_ = &owner; - params.gpu_ = static_cast(owner.getVirtualDevice()); - params.svmBase_ = static_cast(owner.svmBase()); - + result = gpuImage->create(Resource::Persistent); + } else { + Resource::MemoryType type = + (owner.forceSysMemAlloc()) ? Resource::RemoteUSWC : Resource::Local; // Create memory object - result = gpuMemory->create(type, ¶ms); - - // If allocation was successful - if (result) { - // Initialize if the memory is a pipe object - if (owner.getType() == CL_MEM_OBJECT_PIPE) { - // Pipe initialize in order read_idx, write_idx, end_idx. Refer clk_pipe_t structure. 
- // Init with 3 DWORDS for 32bit addressing and 6 DWORDS for 64bit - size_t pipeInit[3] = {0 , 0, owner.asPipe()->getMaxNumPackets()}; - static_cast(xferMgr()).writeRawData( - *gpuMemory, sizeof(pipeInit), pipeInit); - } - // If memory has direct access from host, then get CPU address - if (gpuMemory->isHostMemDirectAccess() && - (type != Resource::ExternalPhysical)) { - void* address = gpuMemory->map(nullptr); - if (address != nullptr) { - // Copy saved memory - if (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) { - memcpy(address, owner.getHostMem(), owner.getSize()); - } - // It should be safe to change the host memory pointer, - // because it's lock protected from the upper caller - owner.setHostMem(address); - } - else { - result = false; - } - } - // An optimization for CHP. Copy memory and destroy sysmem allocation - else if ((gpuMemory->memoryType() != Resource::Pinned) && - (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) && - (owner.getContext().devices().size() == 1)) { - amd::Coord3D origin(0, 0, 0); - amd::Coord3D region(owner.getSize()); - static const bool Entire = true; - if (xferMgr().writeBuffer(owner.getHostMem(), - *gpuMemory, origin, region, Entire)) { - // Clear CHP memory - owner.setHostMem(nullptr); - } - } - } + result = gpuImage->create(type); + } } if (!result) { - delete gpuMemory; - return nullptr; + delete gpuImage; + return nullptr; + } else if ((gpuImage->memoryType() != Resource::Pinned) && + (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) && + (owner.getContext().devices().size() == 1)) { + // Ignore copy for image1D_buffer, since it was already done for buffer + if (imageBuffer) { + // Clear CHP memory + owner.setHostMem(nullptr); + } else { + amd::Coord3D origin(0, 0, 0); + static const bool Entire = true; + if (xferMgr().writeImage(owner.getHostMem(), *gpuImage, origin, image.getRegion(), 0, 0, + Entire)) { + // Clear CHP memory + owner.setHostMem(nullptr); + } + } } - return gpuMemory; -} - -pal::Memory* 
-Device::createImage(amd::Memory& owner, bool directAccess) const -{ - size_t size = owner.getSize(); - amd::Image& image = *owner.asImage(); - pal::Memory* gpuImage = nullptr; - - if ((nullptr != owner.parent()) && (owner.parent()->asImage() != nullptr)) { - device::Memory* devParent = owner.parent()->getDeviceMemory(*this); - if (nullptr == devParent) { - LogError("Can't get the owner object for image view allocation"); - return nullptr; - } - // Create a view on the specified device - gpuImage = (pal::Memory*)createView(owner, *devParent); - if ((nullptr != gpuImage) && (gpuImage->owner() != nullptr)) { - gpuImage->owner()->setHostMem((address)(owner.parent()->getHostMem()) + gpuImage->owner()->getOrigin()); - } - return gpuImage; + if (result) { + size_t bytePitch = gpuImage->elementSize() * gpuImage->desc().width_; + image.setBytePitch(bytePitch); } + } - gpuImage = new pal::Image(*this, owner, - image.getWidth(), - image.getHeight(), - image.getDepth(), - image.getImageFormat(), - image.getType(), - image.getMipLevels()); - - // Create resource - if (nullptr != gpuImage) { - const bool imageBuffer = - ((owner.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) || - ((owner.getType() == CL_MEM_OBJECT_IMAGE2D) && - (owner.parent() != nullptr) && - (owner.parent()->asBuffer() != nullptr))); - bool result = false; - - // Check if owner is interop memory - if (owner.isInterop()) { - result = gpuImage->createInterop(Memory::InteropDirectAccess); - } - else if (imageBuffer) { - Resource::ImageBufferParams params; - pal::Memory* buffer = reinterpret_cast - (image.parent()->getDeviceMemory(*this)); - if (buffer == nullptr) { - LogError("Buffer creation for ImageBuffer failed!"); - delete gpuImage; - return nullptr; - } - params.owner_ = &owner; - params.resource_ = buffer; - params.memory_ = buffer; - - // Create memory object - result = gpuImage->create(Resource::ImageBuffer, ¶ms); - } - else if (directAccess && (owner.getMemFlags() & CL_MEM_ALLOC_HOST_PTR)) { - 
Resource::PinnedParams params; - params.owner_ = &owner; - params.hostMemRef_ = owner.getHostMemRef(); - params.size_ = owner.getHostMemRef()->size(); - - // Create memory object - result = gpuImage->create(Resource::Pinned, ¶ms); - } - - if (!result && !owner.isInterop()) { - if (owner.getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) { - // Attempt to allocate from persistent heap - result = gpuImage->create(Resource::Persistent); - } - else { - Resource::MemoryType type = (owner.forceSysMemAlloc()) ? - Resource::RemoteUSWC : Resource::Local; - // Create memory object - result = gpuImage->create(type); - } - } - - if (!result) { - delete gpuImage; - return nullptr; - } - else if ((gpuImage->memoryType() != Resource::Pinned) && - (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) && - (owner.getContext().devices().size() == 1)) { - // Ignore copy for image1D_buffer, since it was already done for buffer - if (imageBuffer) { - // Clear CHP memory - owner.setHostMem(nullptr); - } - else { - amd::Coord3D origin(0, 0, 0); - static const bool Entire = true; - if (xferMgr().writeImage(owner.getHostMem(), - *gpuImage, origin, image.getRegion(), 0, 0, Entire)) { - // Clear CHP memory - owner.setHostMem(nullptr); - } - } - } - - if (result) { - size_t bytePitch = gpuImage->elementSize() * gpuImage->desc().width_; - image.setBytePitch(bytePitch); - } - } - - return gpuImage; + return gpuImage; } //! Allocates cache memory on the card -device::Memory* -Device::createMemory( - amd::Memory& owner) const -{ - bool directAccess = false; - pal::Memory* memory = nullptr; +device::Memory* Device::createMemory(amd::Memory& owner) const { + bool directAccess = false; + pal::Memory* memory = nullptr; - if (owner.asBuffer()) { - directAccess = (settings().hostMemDirectAccess_ & Settings::HostMemBuffer) - ? true : false; - memory = createBuffer(owner, directAccess); - } - else if (owner.asImage()) { - directAccess = (settings().hostMemDirectAccess_ & Settings::HostMemImage) - ? 
true : false; - memory = createImage(owner, directAccess); - } - else { - LogError("Unknown memory type!"); - } + if (owner.asBuffer()) { + directAccess = (settings().hostMemDirectAccess_ & Settings::HostMemBuffer) ? true : false; + memory = createBuffer(owner, directAccess); + } else if (owner.asImage()) { + directAccess = (settings().hostMemDirectAccess_ & Settings::HostMemImage) ? true : false; + memory = createImage(owner, directAccess); + } else { + LogError("Unknown memory type!"); + } - // Attempt to pin system memory if runtime didn't use direct access - if ((memory != nullptr) && - (memory->memoryType() != Resource::Pinned) && - (memory->memoryType() != Resource::Remote) && - (memory->memoryType() != Resource::RemoteUSWC) && - (memory->memoryType() != Resource::ExternalPhysical) && - ((owner.getHostMem() != nullptr) || - ((nullptr != owner.parent()) && (owner.getHostMem() != nullptr)))) { - bool ok = memory->pinSystemMemory( - owner.getHostMem(), (owner.getHostMemRef()->size()) ? - owner.getHostMemRef()->size() : owner.getSize()); - //! \note: Ignore the pinning result for now - } + // Attempt to pin system memory if runtime didn't use direct access + if ((memory != nullptr) && (memory->memoryType() != Resource::Pinned) && + (memory->memoryType() != Resource::Remote) && + (memory->memoryType() != Resource::RemoteUSWC) && + (memory->memoryType() != Resource::ExternalPhysical) && + ((owner.getHostMem() != nullptr) || + ((nullptr != owner.parent()) && (owner.getHostMem() != nullptr)))) { + bool ok = memory->pinSystemMemory(owner.getHostMem(), (owner.getHostMemRef()->size()) + ? owner.getHostMemRef()->size() + : owner.getSize()); + //! 
\note: Ignore the pinning result for now + } - return memory; + return memory; } -bool -Device::createSampler(const amd::Sampler& owner, device::Sampler** sampler) const -{ - *sampler = nullptr; - Sampler* gpuSampler = new Sampler(*this); - if ((nullptr == gpuSampler) || !gpuSampler->create(owner)) { - delete gpuSampler; - return false; - } - *sampler = gpuSampler; - return true; +bool Device::createSampler(const amd::Sampler& owner, device::Sampler** sampler) const { + *sampler = nullptr; + Sampler* gpuSampler = new Sampler(*this); + if ((nullptr == gpuSampler) || !gpuSampler->create(owner)) { + delete gpuSampler; + return false; + } + *sampler = gpuSampler; + return true; } //! \note reallocMemory() must be called only from outside of //! VirtualGPU submit commands methods. //! Otherwise a deadlock in lockVgpus() is possible -bool -Device::reallocMemory(amd::Memory& owner) const -{ - bool directAccess = false; +bool Device::reallocMemory(amd::Memory& owner) const { + bool directAccess = false; - // For now we have to serialize reallocation code - amd::ScopedLock lk(*lockAsyncOps_); - - // Read device memory after the lock, - // since realloc from another thread can replace the pointer - pal::Memory* gpuMemory = getGpuMemory(&owner); - if (gpuMemory == nullptr) { - return false; - } - - if (gpuMemory->pinOffset() == 0) { - return true; - } - else if (nullptr != owner.parent()) { - if (!reallocMemory(*owner.parent())) { - return false; - } - } - - if (owner.asBuffer()) { - gpuMemory = createBuffer(owner, directAccess); - } - else if (owner.asImage()) { - return true; - } - else { - LogError("Unknown memory type!"); - } - - if (gpuMemory != nullptr) { - pal::Memory* newMemory = gpuMemory; - pal::Memory* oldMemory = getGpuMemory(&owner); - - // Transfer the object - if (oldMemory != nullptr) { - if (!oldMemory->moveTo(*newMemory)) { - delete newMemory; - return false; - } - } - - // Attempt to pin system memory - if ((newMemory->memoryType() != Resource::Pinned) && - 
((owner.getHostMem() != nullptr) || - ((nullptr != owner.parent()) && (owner.getHostMem() != nullptr)))) { - bool ok = newMemory->pinSystemMemory( - owner.getHostMem(), (owner.getHostMemRef()->size()) ? - owner.getHostMemRef()->size() : owner.getSize()); - //! \note: Ignore the pinning result for now - } - - return true; - } + // For now we have to serialize reallocation code + amd::ScopedLock lk(*lockAsyncOps_); + // Read device memory after the lock, + // since realloc from another thread can replace the pointer + pal::Memory* gpuMemory = getGpuMemory(&owner); + if (gpuMemory == nullptr) { return false; + } + + if (gpuMemory->pinOffset() == 0) { + return true; + } else if (nullptr != owner.parent()) { + if (!reallocMemory(*owner.parent())) { + return false; + } + } + + if (owner.asBuffer()) { + gpuMemory = createBuffer(owner, directAccess); + } else if (owner.asImage()) { + return true; + } else { + LogError("Unknown memory type!"); + } + + if (gpuMemory != nullptr) { + pal::Memory* newMemory = gpuMemory; + pal::Memory* oldMemory = getGpuMemory(&owner); + + // Transfer the object + if (oldMemory != nullptr) { + if (!oldMemory->moveTo(*newMemory)) { + delete newMemory; + return false; + } + } + + // Attempt to pin system memory + if ((newMemory->memoryType() != Resource::Pinned) && + ((owner.getHostMem() != nullptr) || + ((nullptr != owner.parent()) && (owner.getHostMem() != nullptr)))) { + bool ok = newMemory->pinSystemMemory(owner.getHostMem(), (owner.getHostMemRef()->size()) + ? owner.getHostMemRef()->size() + : owner.getSize()); + //! 
\note: Ignore the pinning result for now + } + + return true; + } + + return false; } -device::Memory* -Device::createView(amd::Memory& owner, const device::Memory& parent) const -{ - size_t size = owner.getSize(); - assert((owner.asImage() != nullptr) && "View supports images only"); - const amd::Image& image = *owner.asImage(); - pal::Memory* gpuImage = nullptr; +device::Memory* Device::createView(amd::Memory& owner, const device::Memory& parent) const { + size_t size = owner.getSize(); + assert((owner.asImage() != nullptr) && "View supports images only"); + const amd::Image& image = *owner.asImage(); + pal::Memory* gpuImage = nullptr; - gpuImage = new pal::Image(*this, owner, - image.getWidth(), - image.getHeight(), - image.getDepth(), - image.getImageFormat(), - image.getType(), - image.getMipLevels()); + gpuImage = new pal::Image(*this, owner, image.getWidth(), image.getHeight(), image.getDepth(), + image.getImageFormat(), image.getType(), image.getMipLevels()); - // Create resource - if (nullptr != gpuImage) { - bool result = false; - Resource::ImageViewParams params; - const pal::Memory& gpuMem = static_cast(parent); + // Create resource + if (nullptr != gpuImage) { + bool result = false; + Resource::ImageViewParams params; + const pal::Memory& gpuMem = static_cast(parent); - params.owner_ = &owner; - params.level_ = image.getBaseMipLevel(); - params.layer_ = 0; - params.resource_ = &gpuMem; - params.gpu_ = reinterpret_cast(owner.getVirtualDevice()); - params.memory_ = &gpuMem; + params.owner_ = &owner; + params.level_ = image.getBaseMipLevel(); + params.layer_ = 0; + params.resource_ = &gpuMem; + params.gpu_ = reinterpret_cast(owner.getVirtualDevice()); + params.memory_ = &gpuMem; - // Create memory object - result = gpuImage->create(Resource::ImageView, ¶ms); - if (!result) { - delete gpuImage; - return nullptr; - } + // Create memory object + result = gpuImage->create(Resource::ImageView, ¶ms); + if (!result) { + delete gpuImage; + return nullptr; } + } - 
return gpuImage; + return gpuImage; } //! Attempt to bind with external graphics API's device/context -bool -Device::bindExternalDevice(uint flags, void* const pDevice[], void* pContext, bool validateOnly) -{ - assert(pDevice); +bool Device::bindExternalDevice(uint flags, void* const pDevice[], void* pContext, + bool validateOnly) { + assert(pDevice); #ifdef _WIN32 - if (flags & amd::Context::Flags::D3D10DeviceKhr) { - if (!associateD3D10Device(pDevice[amd::Context::DeviceFlagIdx::D3D10DeviceKhrIdx])) { - LogError("Failed gslD3D10Associate()"); - return false; - } + if (flags & amd::Context::Flags::D3D10DeviceKhr) { + if (!associateD3D10Device(pDevice[amd::Context::DeviceFlagIdx::D3D10DeviceKhrIdx])) { + LogError("Failed gslD3D10Associate()"); + return false; } + } - if (flags & amd::Context::Flags::D3D11DeviceKhr) { - if (!associateD3D11Device(pDevice[amd::Context::DeviceFlagIdx::D3D11DeviceKhrIdx])) { - LogError("Failed gslD3D11Associate()"); - return false; - } + if (flags & amd::Context::Flags::D3D11DeviceKhr) { + if (!associateD3D11Device(pDevice[amd::Context::DeviceFlagIdx::D3D11DeviceKhrIdx])) { + LogError("Failed gslD3D11Associate()"); + return false; } + } - if (flags & amd::Context::Flags::D3D9DeviceKhr) { - if (!associateD3D9Device(pDevice[amd::Context::DeviceFlagIdx::D3D9DeviceKhrIdx])) { - LogWarning("D3D9<->OpenCL adapter mismatch or D3D9Associate() failure"); - return false; - } + if (flags & amd::Context::Flags::D3D9DeviceKhr) { + if (!associateD3D9Device(pDevice[amd::Context::DeviceFlagIdx::D3D9DeviceKhrIdx])) { + LogWarning("D3D9<->OpenCL adapter mismatch or D3D9Associate() failure"); + return false; } + } - if (flags & amd::Context::Flags::D3D9DeviceEXKhr) { - if (!associateD3D9Device(pDevice[amd::Context::DeviceFlagIdx::D3D9DeviceEXKhrIdx])) { - LogWarning("D3D9<->OpenCL adapter mismatch or D3D9Associate() failure"); - return false; - } + if (flags & amd::Context::Flags::D3D9DeviceEXKhr) { + if 
(!associateD3D9Device(pDevice[amd::Context::DeviceFlagIdx::D3D9DeviceEXKhrIdx])) { + LogWarning("D3D9<->OpenCL adapter mismatch or D3D9Associate() failure"); + return false; } -#endif //_WIN32 + } +#endif //_WIN32 - if (flags & amd::Context::Flags::GLDeviceKhr) { - // Attempt to associate GSL-OGL - if (!glAssociate(pContext, pDevice[amd::Context::DeviceFlagIdx::GLDeviceKhrIdx])) { - if (!validateOnly) { - LogError("Failed gslGLAssociate()"); - } - return false; - } + if (flags & amd::Context::Flags::GLDeviceKhr) { + // Attempt to associate GSL-OGL + if (!glAssociate(pContext, pDevice[amd::Context::DeviceFlagIdx::GLDeviceKhrIdx])) { + if (!validateOnly) { + LogError("Failed gslGLAssociate()"); + } + return false; } + } + return true; +} + +bool Device::unbindExternalDevice(uint flags, void* const pDevice[], void* pContext, + bool validateOnly) { + if ((flags & amd::Context::Flags::GLDeviceKhr) == 0) { return true; + } + + void* glDevice = pDevice[amd::Context::DeviceFlagIdx::GLDeviceKhrIdx]; + if (glDevice != nullptr) { + // Dissociate GSL-OGL + if (!glDissociate(pContext, glDevice)) { + if (validateOnly) { + LogWarning("Failed gslGLDiassociate()"); + } + return false; + } + } + return true; } -bool -Device::unbindExternalDevice(uint flags, void* const pDevice[], void* pContext, bool validateOnly) -{ - if ((flags & amd::Context::Flags::GLDeviceKhr) == 0) { - return true; - } +bool Device::globalFreeMemory(size_t* freeMemory) const { + const uint TotalFreeMemory = 0; + const uint LargestFreeBlock = 1; - void * glDevice = pDevice[amd::Context::DeviceFlagIdx::GLDeviceKhrIdx]; - if (glDevice != nullptr) { - // Dissociate GSL-OGL - if (!glDissociate(pContext, glDevice)) { - if (validateOnly) { - LogWarning("Failed gslGLDiassociate()"); - } - return false; + // Initialization of heap and other resources because getMemInfo needs it. 
+ if (!(const_cast(this)->initializeHeapResources())) { + return false; + } + + Pal::gpusize local = freeMem[Pal::GpuHeapLocal]; + Pal::gpusize invisible = freeMem[Pal::GpuHeapInvisible]; + + // Fill free memory info + freeMemory[TotalFreeMemory] = static_cast((local + invisible) / Ki); + freeMemory[LargestFreeBlock] = static_cast(std::max(local, invisible) / Ki); + + if (settings().apuSystem_) { + Pal::gpusize uswc = freeMem[Pal::GpuHeapGartUswc]; + uswc /= Ki; + freeMemory[TotalFreeMemory] += static_cast(uswc); + if (freeMemory[LargestFreeBlock] < uswc) { + freeMemory[LargestFreeBlock] = static_cast(uswc); + } + } + + return true; +} + +amd::Memory* Device::findMapTarget(size_t size) const { + // Must be serialised for access + amd::ScopedLock lk(*mapCacheOps_); + + amd::Memory* map = nullptr; + size_t minSize = 0; + size_t maxSize = 0; + uint mapId = mapCache_->size(); + uint releaseId = mapCache_->size(); + + // Find if the list has a map target of appropriate size + for (uint i = 0; i < mapCache_->size(); i++) { + if ((*mapCache_)[i] != nullptr) { + // Requested size is smaller than the entry size + if (size < (*mapCache_)[i]->getSize()) { + if ((minSize == 0) || (minSize > (*mapCache_)[i]->getSize())) { + minSize = (*mapCache_)[i]->getSize(); + mapId = i; } - } - return true; -} - -bool -Device::globalFreeMemory(size_t* freeMemory) const -{ - const uint TotalFreeMemory = 0; - const uint LargestFreeBlock = 1; - - // Initialization of heap and other resources because getMemInfo needs it. 
- if (!(const_cast(this)->initializeHeapResources())) { - return false; - } - - Pal::gpusize local = freeMem[Pal::GpuHeapLocal]; - Pal::gpusize invisible = freeMem[Pal::GpuHeapInvisible]; - - // Fill free memory info - freeMemory[TotalFreeMemory] = static_cast((local + invisible) / Ki); - freeMemory[LargestFreeBlock] = static_cast(std::max(local, invisible) / Ki); - - if (settings().apuSystem_) { - Pal::gpusize uswc = freeMem[Pal::GpuHeapGartUswc]; - uswc /= Ki; - freeMemory[TotalFreeMemory] += static_cast(uswc); - if (freeMemory[LargestFreeBlock] < uswc) { - freeMemory[LargestFreeBlock] = static_cast(uswc); + } + // Requeted size matches the entry size + else if (size == (*mapCache_)[i]->getSize()) { + mapId = i; + break; + } else { + // Find the biggest map target in the list + if (maxSize < (*mapCache_)[i]->getSize()) { + maxSize = (*mapCache_)[i]->getSize(); + releaseId = i; } + } } + } - return true; + // Check if we found any map target + if (mapId < mapCache_->size()) { + map = (*mapCache_)[mapId]; + (*mapCache_)[mapId] = nullptr; + Memory* gpuMemory = reinterpret_cast(map->getDeviceMemory(*this)); + + // Get the base pointer for the map resource + if ((gpuMemory == nullptr) || (nullptr == gpuMemory->map(nullptr))) { + (*mapCache_)[mapId]->release(); + map = nullptr; + } + } + // If cache is full, then release the biggest map target + else if (releaseId < mapCache_->size()) { + (*mapCache_)[releaseId]->release(); + (*mapCache_)[releaseId] = nullptr; + } + + return map; } -amd::Memory* -Device::findMapTarget(size_t size) const -{ - // Must be serialised for access - amd::ScopedLock lk(*mapCacheOps_); +bool Device::addMapTarget(amd::Memory* memory) const { + // Must be serialised for access + amd::ScopedLock lk(*mapCacheOps_); - amd::Memory* map = nullptr; - size_t minSize = 0; - size_t maxSize = 0; - uint mapId = mapCache_->size(); - uint releaseId = mapCache_->size(); + // the svm memory shouldn't be cached + if (!memory->canBeCached()) { + return false; + } 
+ // Find if the list has a map target of appropriate size + for (uint i = 0; i < mapCache_->size(); ++i) { + if ((*mapCache_)[i] == nullptr) { + (*mapCache_)[i] = memory; + return true; + } + } - // Find if the list has a map target of appropriate size - for (uint i = 0; i < mapCache_->size(); i++) { - if ((*mapCache_)[i] != nullptr) { - // Requested size is smaller than the entry size - if (size < (*mapCache_)[i]->getSize()) { - if ((minSize == 0) || - (minSize > (*mapCache_)[i]->getSize())) { - minSize = (*mapCache_)[i]->getSize(); - mapId = i; - } - } - // Requeted size matches the entry size - else if (size == (*mapCache_)[i]->getSize()) { - mapId = i; - break; - } - else { - // Find the biggest map target in the list - if (maxSize < (*mapCache_)[i]->getSize()) { - maxSize = (*mapCache_)[i]->getSize(); - releaseId = i; - } - } + // Add a new entry + mapCache_->push_back(memory); + + return true; +} + +Device::ScratchBuffer::~ScratchBuffer() { destroyMemory(); } + +void Device::ScratchBuffer::destroyMemory() { + // Release memory object + delete memObj_; + memObj_ = nullptr; +} + +bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu) { + if (regNum > 0) { + // Serialize the scratch buffer allocation code + amd::ScopedLock lk(*scratchAlloc_); + uint sb = vgpu->hwRing(); + static const uint WaveSizeLimit = ((1 << 21) - 256); + const uint threadSizeLimit = + WaveSizeLimit / properties().gfxipProperties.shaderCore.wavefrontSize; + if (regNum > threadSizeLimit) { + LogError("Requested private memory is bigger than HW supports!"); + regNum = threadSizeLimit; + } + // Check if the current buffer isn't big enough + if (regNum > scratch_[sb]->regNum_) { + // Stall all command queues, since runtime will reallocate memory + ScopedLockVgpus lock(*this); + + scratch_[sb]->regNum_ = regNum; + uint64_t size = 0; + uint64_t offset = 0; + + // Destroy all views + for (uint s = 0; s < scratch_.size(); ++s) { + ScratchBuffer* scratchBuf = scratch_[s]; + if 
(scratchBuf->regNum_ > 0) { + scratchBuf->destroyMemory(); + // Calculate the size of the scratch buffer for a queue + uint32_t numTotalCUs = info().maxComputeUnits_; + uint32_t numMaxWaves = settings().numScratchWavesPerCu_ * numTotalCUs; + scratchBuf->size_ = + static_cast(properties().gfxipProperties.shaderCore.wavefrontSize) * + scratchBuf->regNum_ * numMaxWaves * sizeof(uint32_t); + scratchBuf->size_ = std::min(scratchBuf->size_, info().maxMemAllocSize_); + scratchBuf->size_ = std::min(scratchBuf->size_, uint64_t(3 * Gi)); + scratchBuf->size_ = amd::alignUp(scratchBuf->size_, 0xFFFF); + scratchBuf->offset_ = offset; + size += scratchBuf->size_; + offset += scratchBuf->size_; } - } + } - // Check if we found any map target - if (mapId < mapCache_->size()) { - map = (*mapCache_)[mapId]; - (*mapCache_)[mapId] = nullptr; - Memory* gpuMemory = reinterpret_cast - (map->getDeviceMemory(*this)); + delete globalScratchBuf_; - // Get the base pointer for the map resource - if ((gpuMemory == nullptr) || (nullptr == gpuMemory->map(nullptr))) { - (*mapCache_)[mapId]->release(); - map = nullptr; - } - } - // If cache is full, then release the biggest map target - else if (releaseId < mapCache_->size()) { - (*mapCache_)[releaseId]->release(); - (*mapCache_)[releaseId] = nullptr; - } - - return map; -} - -bool -Device::addMapTarget(amd::Memory* memory) const -{ - // Must be serialised for access - amd::ScopedLock lk(*mapCacheOps_); - - //the svm memory shouldn't be cached - if (!memory->canBeCached()) { - return false; - } - // Find if the list has a map target of appropriate size - for (uint i = 0; i < mapCache_->size(); ++i) { - if ((*mapCache_)[i] == nullptr) { - (*mapCache_)[i] = memory; - return true; - } - } - - // Add a new entry - mapCache_->push_back(memory); - - return true; -} - -Device::ScratchBuffer::~ScratchBuffer() -{ - destroyMemory(); -} - -void -Device::ScratchBuffer::destroyMemory() -{ - // Release memory object - delete memObj_; - memObj_ = nullptr; -} - 
-bool -Device::allocScratch(uint regNum, const VirtualGPU* vgpu) -{ - if (regNum > 0) { - // Serialize the scratch buffer allocation code - amd::ScopedLock lk(*scratchAlloc_); - uint sb = vgpu->hwRing(); - static const uint WaveSizeLimit = ((1 << 21) - 256); - const uint threadSizeLimit = WaveSizeLimit / - properties().gfxipProperties.shaderCore.wavefrontSize; - if (regNum > threadSizeLimit) { - LogError("Requested private memory is bigger than HW supports!"); - regNum = threadSizeLimit; - } - // Check if the current buffer isn't big enough - if (regNum > scratch_[sb]->regNum_) { - // Stall all command queues, since runtime will reallocate memory - ScopedLockVgpus lock(*this); - - scratch_[sb]->regNum_ = regNum; - uint64_t size = 0; - uint64_t offset = 0; - - // Destroy all views - for (uint s = 0; s < scratch_.size(); ++s) { - ScratchBuffer* scratchBuf = scratch_[s]; - if (scratchBuf->regNum_ > 0) { - scratchBuf->destroyMemory(); - // Calculate the size of the scratch buffer for a queue - uint32_t numTotalCUs = info().maxComputeUnits_; - uint32_t numMaxWaves = - settings().numScratchWavesPerCu_* numTotalCUs; - scratchBuf->size_ = static_cast(properties(). - gfxipProperties.shaderCore.wavefrontSize) * - scratchBuf->regNum_ * numMaxWaves * sizeof(uint32_t); - scratchBuf->size_ = std::min(scratchBuf->size_, info().maxMemAllocSize_); - scratchBuf->size_ = std::min(scratchBuf->size_, uint64_t(3 * Gi)); - scratchBuf->size_ = amd::alignUp(scratchBuf->size_, 0xFFFF); - scratchBuf->offset_ = offset; - size += scratchBuf->size_; - offset += scratchBuf->size_; - } - } - - delete globalScratchBuf_; - - // Allocate new buffer. 
- globalScratchBuf_ = new pal::Memory(*this, static_cast(size)); - if ((globalScratchBuf_ == nullptr) || - !globalScratchBuf_->create(Resource::Scratch)) { - LogError("Couldn't allocate scratch memory"); - for (uint s = 0; s < scratch_.size(); ++s) { - scratch_[s]->regNum_ = 0; - } - return false; - } - - for (uint s = 0; s < scratch_.size(); ++s) { - // Loop through all memory objects and reallocate them - if (scratch_[s]->regNum_ > 0) { - // Allocate new buffer - scratch_[s]->memObj_ = new pal::Memory(*this, scratch_[s]->size_); - Resource::ViewParams view; - view.resource_ = globalScratchBuf_; - view.offset_ = scratch_[s]->offset_; - view.size_ = scratch_[s]->size_; - if ((scratch_[s]->memObj_ == nullptr) || - !scratch_[s]->memObj_->create(Resource::View, &view)) { - LogError("Couldn't allocate a scratch view"); - delete scratch_[s]->memObj_; - scratch_[s]->regNum_ = 0; - return false; - } - } - } - } - } - return true; -} - -bool -Device::validateKernel(const amd::Kernel& kernel, const device::VirtualDevice* vdev) -{ - // Find the number of scratch registers used in the kernel - const device::Kernel* devKernel = kernel.getDeviceKernel(*this); - uint regNum = static_cast(devKernel->workGroupInfo()->scratchRegs_); - const VirtualGPU* vgpu = static_cast(vdev); - - if (!allocScratch(regNum, vgpu)) { - return false; - } - - if (devKernel->hsa()) { - const HSAILKernel* hsaKernel = static_cast(devKernel); - if (hsaKernel->dynamicParallelism()) { - amd::DeviceQueue* defQueue = - kernel.program().context().defDeviceQueue(*this); - if (defQueue != nullptr) { - vgpu = static_cast(defQueue->vDev()); - if (!allocScratch(hsaKernel->prog().maxScratchRegs(), vgpu)) { - return false; - } - } - else { - return false; - } - } - } - - return true; -} - -void -Device::destroyScratchBuffers() -{ - if (globalScratchBuf_ != nullptr) { + // Allocate new buffer. 
+ globalScratchBuf_ = new pal::Memory(*this, static_cast(size)); + if ((globalScratchBuf_ == nullptr) || !globalScratchBuf_->create(Resource::Scratch)) { + LogError("Couldn't allocate scratch memory"); for (uint s = 0; s < scratch_.size(); ++s) { - scratch_[s]->destroyMemory(); + scratch_[s]->regNum_ = 0; + } + return false; + } + + for (uint s = 0; s < scratch_.size(); ++s) { + // Loop through all memory objects and reallocate them + if (scratch_[s]->regNum_ > 0) { + // Allocate new buffer + scratch_[s]->memObj_ = new pal::Memory(*this, scratch_[s]->size_); + Resource::ViewParams view; + view.resource_ = globalScratchBuf_; + view.offset_ = scratch_[s]->offset_; + view.size_ = scratch_[s]->size_; + if ((scratch_[s]->memObj_ == nullptr) || + !scratch_[s]->memObj_->create(Resource::View, &view)) { + LogError("Couldn't allocate a scratch view"); + delete scratch_[s]->memObj_; scratch_[s]->regNum_ = 0; + return false; + } } - delete globalScratchBuf_; - globalScratchBuf_ = nullptr; + } } + } + return true; } -void -Device::fillHwSampler( - uint32_t state, void* hwState, uint32_t hwStateSize, - uint32_t mipFilter, float minLod, float maxLod) const -{ - Pal::SamplerInfo samplerInfo = {}; +bool Device::validateKernel(const amd::Kernel& kernel, const device::VirtualDevice* vdev) { + // Find the number of scratch registers used in the kernel + const device::Kernel* devKernel = kernel.getDeviceKernel(*this); + uint regNum = static_cast(devKernel->workGroupInfo()->scratchRegs_); + const VirtualGPU* vgpu = static_cast(vdev); - samplerInfo.borderColorType = Pal::BorderColorType::TransparentBlack; + if (!allocScratch(regNum, vgpu)) { + return false; + } - samplerInfo.filter.zFilter = Pal::XyFilterPoint; - - samplerInfo.flags.unnormalizedCoords = !(state & amd::Sampler::StateNormalizedCoordsMask); - samplerInfo.maxLod = 4096.0f; - - state &= ~amd::Sampler::StateNormalizedCoordsMask; - - // Program the sampler address mode - switch (state & amd::Sampler::StateAddressMask) { - case 
amd::Sampler::StateAddressRepeat: - samplerInfo.addressU = Pal::TexAddressMode::Wrap; - samplerInfo.addressV = Pal::TexAddressMode::Wrap; - samplerInfo.addressW = Pal::TexAddressMode::Wrap; - break; - case amd::Sampler::StateAddressClampToEdge: - samplerInfo.addressU = Pal::TexAddressMode::Clamp; - samplerInfo.addressV = Pal::TexAddressMode::Clamp; - samplerInfo.addressW = Pal::TexAddressMode::Clamp; - break; - case amd::Sampler::StateAddressMirroredRepeat: - samplerInfo.addressU = Pal::TexAddressMode::Mirror; - samplerInfo.addressV = Pal::TexAddressMode::Mirror; - samplerInfo.addressW = Pal::TexAddressMode::Mirror; - break; - case amd::Sampler::StateAddressClamp: - case amd::Sampler::StateAddressNone: - samplerInfo.addressU = Pal::TexAddressMode::ClampBorder; - samplerInfo.addressV = Pal::TexAddressMode::ClampBorder; - samplerInfo.addressW = Pal::TexAddressMode::ClampBorder; - default: - break; - } - state &= ~amd::Sampler::StateAddressMask; - - // Program texture filter mode - if (state == amd::Sampler::StateFilterLinear) { - samplerInfo.filter.magnification = Pal::XyFilterLinear; - samplerInfo.filter.minification = Pal::XyFilterLinear; - samplerInfo.filter.zFilter = Pal::ZFilterLinear; - } - - if (mipFilter == CL_FILTER_NEAREST) { - samplerInfo.filter.mipFilter = Pal::MipFilterPoint; - } - else if (mipFilter == CL_FILTER_LINEAR) { - samplerInfo.filter.mipFilter = Pal::MipFilterLinear; - } - - iDev()->CreateSamplerSrds(1, &samplerInfo, hwState); -} - -void* -Device::hostAlloc(size_t size, size_t alignment, bool atomics) const -{ - //for discrete gpu, we only reserve,no commit yet. 
- return amd::Os::reserveMemory(nullptr, size, alignment, amd::Os::MEM_PROT_NONE); -} - -void -Device::hostFree(void* ptr, size_t size) const -{ - //If we allocate the host memory, we need free, or we have to release - amd::Os::releaseMemory(ptr, size); -} - -void* -Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags, void* svmPtr) const -{ - alignment = std::max(alignment, static_cast(info_.memBaseAddrAlign_)); - - //VAM for GPU needs 64K alignment for Tahiti and CI+, will pull idnfo from gsl later - size_t vmBigK = 64 * Ki; - alignment = (alignment < vmBigK) ? vmBigK : alignment; - - size = amd::alignUp(size, alignment); - amd::Memory* mem = nullptr; - freeCPUMem_ = false; - if (nullptr == svmPtr) { - if (isFineGrainedSystem()) { - freeCPUMem_ = true; - return amd::Os::alignedMalloc(size, alignment); + if (devKernel->hsa()) { + const HSAILKernel* hsaKernel = static_cast(devKernel); + if (hsaKernel->dynamicParallelism()) { + amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(*this); + if (defQueue != nullptr) { + vgpu = static_cast(defQueue->vDev()); + if (!allocScratch(hsaKernel->prog().maxScratchRegs(), vgpu)) { + return false; } - - //create a hidden buffer, which will allocated on the device later - mem = new (context)amd::Buffer(context, flags, size, reinterpret_cast(1)); - if (mem == nullptr) { - LogError("failed to create a svm mem object!"); - return nullptr; - } - - if (!mem->create(nullptr, false)) { - LogError("failed to create a svm hidden buffer!"); - mem->release(); - return nullptr; - } - //if the device supports SVM FGS, return the committed CPU address directly. - pal::Memory* gpuMem = getGpuMemory(mem); - - //add the information to context so that we can use it later. 
- amd::SvmManager::AddSvmBuffer(mem->getSvmPtr(), mem); - svmPtr = mem->getSvmPtr(); - } - else { - //find the existing amd::mem object - mem = amd::SvmManager::FindSvmBuffer(svmPtr); - if (nullptr == mem) { - return nullptr; - } - //commit the CPU memory for FGS device. - if (isFineGrainedSystem()) { - mem->commitSvmMemory(); - } - else { - pal::Memory* gpuMem = getGpuMemory(mem); - } - svmPtr = mem->getSvmPtr(); - } - return svmPtr; -} - -void -Device::svmFree(void *ptr) const -{ - if (freeCPUMem_) { - amd::Os::alignedFree(ptr); - } - else { - amd::Memory * svmMem = nullptr; - svmMem = amd::SvmManager::FindSvmBuffer(ptr); - if (nullptr != svmMem) { - svmMem->release(); - amd::SvmManager::RemoveSvmBuffer(ptr); - } - } -} - - -Device::SrdManager::~SrdManager() -{ - for (uint i = 0; i < pool_.size(); ++i) { - pool_[i].buf_->unmap(nullptr); - delete pool_[i].buf_; - delete pool_[i].flags_; - } -} - -bool -Sampler::create(uint32_t oclSamplerState) -{ - hwSrd_ = dev_.srds().allocSrdSlot(&hwState_); - if (0 == hwSrd_) { + } else { return false; + } } - dev_.fillHwSampler(oclSamplerState, hwState_, HsaSamplerObjectSize); - return true; + } + + return true; } -bool -Sampler::create(const amd::Sampler& owner) -{ - hwSrd_ = dev_.srds().allocSrdSlot(&hwState_); - if (0 == hwSrd_) { - return false; +void Device::destroyScratchBuffers() { + if (globalScratchBuf_ != nullptr) { + for (uint s = 0; s < scratch_.size(); ++s) { + scratch_[s]->destroyMemory(); + scratch_[s]->regNum_ = 0; } - dev_.fillHwSampler(owner.state(), hwState_, HsaSamplerObjectSize, - owner.mipFilter(), owner.minLod(), owner.maxLod()); - return true; + delete globalScratchBuf_; + globalScratchBuf_ = nullptr; + } } -Sampler::~Sampler() -{ - dev_.srds().freeSrdSlot(hwSrd_); +void Device::fillHwSampler(uint32_t state, void* hwState, uint32_t hwStateSize, uint32_t mipFilter, + float minLod, float maxLod) const { + Pal::SamplerInfo samplerInfo = {}; + + samplerInfo.borderColorType = 
Pal::BorderColorType::TransparentBlack; + + samplerInfo.filter.zFilter = Pal::XyFilterPoint; + + samplerInfo.flags.unnormalizedCoords = !(state & amd::Sampler::StateNormalizedCoordsMask); + samplerInfo.maxLod = 4096.0f; + + state &= ~amd::Sampler::StateNormalizedCoordsMask; + + // Program the sampler address mode + switch (state & amd::Sampler::StateAddressMask) { + case amd::Sampler::StateAddressRepeat: + samplerInfo.addressU = Pal::TexAddressMode::Wrap; + samplerInfo.addressV = Pal::TexAddressMode::Wrap; + samplerInfo.addressW = Pal::TexAddressMode::Wrap; + break; + case amd::Sampler::StateAddressClampToEdge: + samplerInfo.addressU = Pal::TexAddressMode::Clamp; + samplerInfo.addressV = Pal::TexAddressMode::Clamp; + samplerInfo.addressW = Pal::TexAddressMode::Clamp; + break; + case amd::Sampler::StateAddressMirroredRepeat: + samplerInfo.addressU = Pal::TexAddressMode::Mirror; + samplerInfo.addressV = Pal::TexAddressMode::Mirror; + samplerInfo.addressW = Pal::TexAddressMode::Mirror; + break; + case amd::Sampler::StateAddressClamp: + case amd::Sampler::StateAddressNone: + samplerInfo.addressU = Pal::TexAddressMode::ClampBorder; + samplerInfo.addressV = Pal::TexAddressMode::ClampBorder; + samplerInfo.addressW = Pal::TexAddressMode::ClampBorder; + default: + break; + } + state &= ~amd::Sampler::StateAddressMask; + + // Program texture filter mode + if (state == amd::Sampler::StateFilterLinear) { + samplerInfo.filter.magnification = Pal::XyFilterLinear; + samplerInfo.filter.minification = Pal::XyFilterLinear; + samplerInfo.filter.zFilter = Pal::ZFilterLinear; + } + + if (mipFilter == CL_FILTER_NEAREST) { + samplerInfo.filter.mipFilter = Pal::MipFilterPoint; + } else if (mipFilter == CL_FILTER_LINEAR) { + samplerInfo.filter.mipFilter = Pal::MipFilterLinear; + } + + iDev()->CreateSamplerSrds(1, &samplerInfo, hwState); } -uint64_t -Device::SrdManager::allocSrdSlot(address* cpuAddr) -{ - amd::ScopedLock lock(ml_); - // Check all buffers in the pool of chunks - for (uint i 
= 0; i < pool_.size(); ++i) { - const Chunk& ch = pool_[i]; - // Search for an empty slot - for (uint s = 0; s < numFlags_; ++s) { - uint mask = ch.flags_[s]; - // Check if there is an empty slot in this group - if (mask != 0) { - uint idx; - // Find the first empty index - for (idx = 0; (mask & 0x1) == 0; mask >>= 1, ++idx); - // Mark the slot as busy - ch.flags_[s] &= ~(1 << idx); - // Calculate SRD offset in the buffer - uint offset = (s * MaskBits + idx) * srdSize_; - *cpuAddr = ch.buf_->data() + offset; - return ch.buf_->vmAddress() + offset; - } - } - } - // At this point the manager doesn't have empty slots - // and has to allocate a new chunk - Chunk chunk; - chunk.flags_ = new uint[numFlags_]; - if (chunk.flags_ == nullptr) { - return 0; - } - chunk.buf_ = new Memory(dev_, bufSize_); - if (chunk.buf_ == nullptr || !chunk.buf_->create(Resource::Remote) || - (nullptr == chunk.buf_->map(nullptr))) { - delete [] chunk.flags_; - delete chunk.buf_; - return 0; - } - // All slots in the chunk are in "free" state - memset(chunk.flags_, 0xff, numFlags_ * sizeof(uint)); - // Take the first one... - chunk.flags_[0] &= ~0x1; - pool_.push_back(chunk); - *cpuAddr = chunk.buf_->data(); - return chunk.buf_->vmAddress(); +void* Device::hostAlloc(size_t size, size_t alignment, bool atomics) const { + // for discrete gpu, we only reserve,no commit yet. 
+ return amd::Os::reserveMemory(nullptr, size, alignment, amd::Os::MEM_PROT_NONE); } -void -Device::SrdManager::freeSrdSlot(uint64_t addr) { - amd::ScopedLock lock(ml_); - if (addr == 0) return; - // Check all buffers in the pool of chunks - for (uint i = 0; i < pool_.size(); ++i) { - Chunk* ch = &pool_[i]; - // Find the offset - int64_t offs = static_cast(addr) - - static_cast(ch->buf_->vmAddress()); - // Check if the offset inside the chunk buffer - if ((offs >= 0) && (offs < bufSize_)) { - // Find the index in the chunk - uint idx = offs / srdSize_; - uint s = idx / MaskBits; - // Free the slot - ch->flags_[s] |= 1 << (idx % MaskBits); - return; - } - } - assert(false && "Wrong slot address!"); +void Device::hostFree(void* ptr, size_t size) const { + // If we allocate the host memory, we need free, or we have to release + amd::Os::releaseMemory(ptr, size); } -void -Device::updateFreeMemory(Pal::GpuHeap heap, Pal::gpusize size, bool free) -{ - if (free) { - freeMem[heap] += size; +void* Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags, + void* svmPtr) const { + alignment = std::max(alignment, static_cast(info_.memBaseAddrAlign_)); + + // VAM for GPU needs 64K alignment for Tahiti and CI+, will pull idnfo from gsl later + size_t vmBigK = 64 * Ki; + alignment = (alignment < vmBigK) ? 
vmBigK : alignment; + + size = amd::alignUp(size, alignment); + amd::Memory* mem = nullptr; + freeCPUMem_ = false; + if (nullptr == svmPtr) { + if (isFineGrainedSystem()) { + freeCPUMem_ = true; + return amd::Os::alignedMalloc(size, alignment); } - else { - freeMem[heap] -= size; + + // create a hidden buffer, which will allocated on the device later + mem = new (context) amd::Buffer(context, flags, size, reinterpret_cast(1)); + if (mem == nullptr) { + LogError("failed to create a svm mem object!"); + return nullptr; } + + if (!mem->create(nullptr, false)) { + LogError("failed to create a svm hidden buffer!"); + mem->release(); + return nullptr; + } + // if the device supports SVM FGS, return the committed CPU address directly. + pal::Memory* gpuMem = getGpuMemory(mem); + + // add the information to context so that we can use it later. + amd::SvmManager::AddSvmBuffer(mem->getSvmPtr(), mem); + svmPtr = mem->getSvmPtr(); + } else { + // find the existing amd::mem object + mem = amd::SvmManager::FindSvmBuffer(svmPtr); + if (nullptr == mem) { + return nullptr; + } + // commit the CPU memory for FGS device. 
+ if (isFineGrainedSystem()) { + mem->commitSvmMemory(); + } else { + pal::Memory* gpuMem = getGpuMemory(mem); + } + svmPtr = mem->getSvmPtr(); + } + return svmPtr; } -bool -Device::createBlitProgram() -{ - bool result = true; +void Device::svmFree(void* ptr) const { + if (freeCPUMem_) { + amd::Os::alignedFree(ptr); + } else { + amd::Memory* svmMem = nullptr; + svmMem = amd::SvmManager::FindSvmBuffer(ptr); + if (nullptr != svmMem) { + svmMem->release(); + amd::SvmManager::RemoveSvmBuffer(ptr); + } + } +} - // Delayed compilation due to brig_loader memory allocation - const char* scheduler = nullptr; - const char* ocl20 = nullptr; + +Device::SrdManager::~SrdManager() { + for (uint i = 0; i < pool_.size(); ++i) { + pool_[i].buf_->unmap(nullptr); + delete pool_[i].buf_; + delete pool_[i].flags_; + } +} + +bool Sampler::create(uint32_t oclSamplerState) { + hwSrd_ = dev_.srds().allocSrdSlot(&hwState_); + if (0 == hwSrd_) { + return false; + } + dev_.fillHwSampler(oclSamplerState, hwState_, HsaSamplerObjectSize); + return true; +} + +bool Sampler::create(const amd::Sampler& owner) { + hwSrd_ = dev_.srds().allocSrdSlot(&hwState_); + if (0 == hwSrd_) { + return false; + } + dev_.fillHwSampler(owner.state(), hwState_, HsaSamplerObjectSize, owner.mipFilter(), + owner.minLod(), owner.maxLod()); + return true; +} + +Sampler::~Sampler() { dev_.srds().freeSrdSlot(hwSrd_); } + +uint64_t Device::SrdManager::allocSrdSlot(address* cpuAddr) { + amd::ScopedLock lock(ml_); + // Check all buffers in the pool of chunks + for (uint i = 0; i < pool_.size(); ++i) { + const Chunk& ch = pool_[i]; + // Search for an empty slot + for (uint s = 0; s < numFlags_; ++s) { + uint mask = ch.flags_[s]; + // Check if there is an empty slot in this group + if (mask != 0) { + uint idx; + // Find the first empty index + for (idx = 0; (mask & 0x1) == 0; mask >>= 1, ++idx) + ; + // Mark the slot as busy + ch.flags_[s] &= ~(1 << idx); + // Calculate SRD offset in the buffer + uint offset = (s * MaskBits + 
idx) * srdSize_; + *cpuAddr = ch.buf_->data() + offset; + return ch.buf_->vmAddress() + offset; + } + } + } + // At this point the manager doesn't have empty slots + // and has to allocate a new chunk + Chunk chunk; + chunk.flags_ = new uint[numFlags_]; + if (chunk.flags_ == nullptr) { + return 0; + } + chunk.buf_ = new Memory(dev_, bufSize_); + if (chunk.buf_ == nullptr || !chunk.buf_->create(Resource::Remote) || + (nullptr == chunk.buf_->map(nullptr))) { + delete[] chunk.flags_; + delete chunk.buf_; + return 0; + } + // All slots in the chunk are in "free" state + memset(chunk.flags_, 0xff, numFlags_ * sizeof(uint)); + // Take the first one... + chunk.flags_[0] &= ~0x1; + pool_.push_back(chunk); + *cpuAddr = chunk.buf_->data(); + return chunk.buf_->vmAddress(); +} + +void Device::SrdManager::freeSrdSlot(uint64_t addr) { + amd::ScopedLock lock(ml_); + if (addr == 0) return; + // Check all buffers in the pool of chunks + for (uint i = 0; i < pool_.size(); ++i) { + Chunk* ch = &pool_[i]; + // Find the offset + int64_t offs = static_cast(addr) - static_cast(ch->buf_->vmAddress()); + // Check if the offset inside the chunk buffer + if ((offs >= 0) && (offs < bufSize_)) { + // Find the index in the chunk + uint idx = offs / srdSize_; + uint s = idx / MaskBits; + // Free the slot + ch->flags_[s] |= 1 << (idx % MaskBits); + return; + } + } + assert(false && "Wrong slot address!"); +} + +void Device::updateFreeMemory(Pal::GpuHeap heap, Pal::gpusize size, bool free) { + if (free) { + freeMem[heap] += size; + } else { + freeMem[heap] -= size; + } +} + +bool Device::createBlitProgram() { + bool result = true; + + // Delayed compilation due to brig_loader memory allocation + const char* scheduler = nullptr; + const char* ocl20 = nullptr; #if !defined(WITH_LIGHTNING_COMPILER) - std::string sch = SchedulerSourceCode; - if (settings().oclVersion_ == OpenCL20) { - size_t loc = sch.find("%s"); - sch.replace(loc, 2, iDev()->GetDispatchKernelSource()); - scheduler = sch.c_str(); - 
ocl20 = "-cl-std=CL2.0"; - } -#endif // !defined(WITH_LIGHTNING_COMPILER) - blitProgram_ = new BlitProgram(context_); - // Create blit programs - if (blitProgram_ == nullptr || - !blitProgram_->create(this, scheduler, ocl20)) { - delete blitProgram_; - blitProgram_ = nullptr; - LogError("Couldn't create blit kernels!"); - result = false; - } - return result; + std::string sch = SchedulerSourceCode; + if (settings().oclVersion_ == OpenCL20) { + size_t loc = sch.find("%s"); + sch.replace(loc, 2, iDev()->GetDispatchKernelSource()); + scheduler = sch.c_str(); + ocl20 = "-cl-std=CL2.0"; + } +#endif // !defined(WITH_LIGHTNING_COMPILER) + blitProgram_ = new BlitProgram(context_); + // Create blit programs + if (blitProgram_ == nullptr || !blitProgram_->create(this, scheduler, ocl20)) { + delete blitProgram_; + blitProgram_ = nullptr; + LogError("Couldn't create blit kernels!"); + result = false; + } + return result; } -void -Device::SrdManager::fillResourceList(std::vector& memList) -{ - for (uint i = 0; i < pool_.size(); ++i) { - memList.push_back(pool_[i].buf_); - } +void Device::SrdManager::fillResourceList(std::vector& memList) { + for (uint i = 0; i < pool_.size(); ++i) { + memList.push_back(pool_[i].buf_); + } } -cl_int -Device::hwDebugManagerInit(amd::Context *context, uintptr_t messageStorage) -{ - cl_int status = hwDebugMgr_->registerDebugger(context, messageStorage); +cl_int Device::hwDebugManagerInit(amd::Context* context, uintptr_t messageStorage) { + cl_int status = hwDebugMgr_->registerDebugger(context, messageStorage); - if (CL_SUCCESS != status) { - delete hwDebugMgr_; - hwDebugMgr_ = nullptr; - } + if (CL_SUCCESS != status) { + delete hwDebugMgr_; + hwDebugMgr_ = nullptr; + } - return status; + return status; } -} // namespace pal +} // namespace pal diff --git a/rocclr/runtime/device/pal/paldevice.hpp b/rocclr/runtime/device/pal/paldevice.hpp index a5f4fd1543..3c5b2930e8 100644 --- a/rocclr/runtime/device/pal/paldevice.hpp +++ 
b/rocclr/runtime/device/pal/paldevice.hpp @@ -29,110 +29,111 @@ namespace pal { //! A nil device object -class NullDevice : public amd::Device -{ -protected: - static Compiler* compiler_; -public: - Compiler* compiler() const { return compiler_; } +class NullDevice : public amd::Device { + protected: + static Compiler* compiler_; -public: - static bool init(void); + public: + Compiler* compiler() const { return compiler_; } - //! Construct a new identifier - NullDevice(); + public: + static bool init(void); - //! Creates an offline device with the specified target - bool create( - Pal::AsicRevision asicRevision, //!< GPU ASIC revision - Pal::GfxIpLevel ipLevel //!< GPU ip level - ); + //! Construct a new identifier + NullDevice(); - virtual cl_int createSubDevices( - device::CreateSubDevicesInfo& create_info, - cl_uint num_entries, - cl_device_id* devices, - cl_uint* num_devices) { - return CL_INVALID_VALUE; - } + //! Creates an offline device with the specified target + bool create(Pal::AsicRevision asicRevision, //!< GPU ASIC revision + Pal::GfxIpLevel ipLevel //!< GPU ip level + ); - //! Instantiate a new virtual device - virtual device::VirtualDevice* createVirtualDevice( - amd::CommandQueue* queue = NULL - ) { return NULL; } + virtual cl_int createSubDevices(device::CreateSubDevicesInfo& create_info, cl_uint num_entries, + cl_device_id* devices, cl_uint* num_devices) { + return CL_INVALID_VALUE; + } - //! Compile the given source code. - virtual device::Program* createProgram(amd::option::Options* options = NULL); + //! Instantiate a new virtual device + virtual device::VirtualDevice* createVirtualDevice(amd::CommandQueue* queue = NULL) { + return NULL; + } - //! Just returns NULL for the dummy device - virtual device::Memory* createMemory(amd::Memory& owner) const { return NULL; } + //! Compile the given source code. + virtual device::Program* createProgram(amd::option::Options* options = NULL); - //! 
Sampler object allocation - virtual bool createSampler( - const amd::Sampler& owner, //!< abstraction layer sampler object - device::Sampler** sampler //!< device sampler object - ) const - { - ShouldNotReachHere(); - return true; - } + //! Just returns NULL for the dummy device + virtual device::Memory* createMemory(amd::Memory& owner) const { return NULL; } - //! Just returns NULL for the dummy device - virtual device::Memory* createView( - amd::Memory& owner, //!< Owner memory object - const device::Memory& parent //!< Parent device memory object for the view - ) const { return NULL; } + //! Sampler object allocation + virtual bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object + device::Sampler** sampler //!< device sampler object + ) const { + ShouldNotReachHere(); + return true; + } - //! Reallocates the provided buffer object - virtual bool reallocMemory(amd::Memory& owner) const { return true; } + //! Just returns NULL for the dummy device + virtual device::Memory* createView( + amd::Memory& owner, //!< Owner memory object + const device::Memory& parent //!< Parent device memory object for the view + ) const { + return NULL; + } - //! Acquire external graphics API object in the host thread - //! Needed for OpenGL objects on CPU device + //! Reallocates the provided buffer object + virtual bool reallocMemory(amd::Memory& owner) const { return true; } - virtual bool bindExternalDevice( - uint flags, void* const pDevice[], void* pContext, bool validateOnly) { return true; } + //! Acquire external graphics API object in the host thread + //! Needed for OpenGL objects on CPU device - virtual bool unbindExternalDevice( - uint flags, void* const pDevice[], void* pContext, bool validateOnly) { return true; } + virtual bool bindExternalDevice(uint flags, void* const pDevice[], void* pContext, + bool validateOnly) { + return true; + } - //! 
Releases non-blocking map target memory - virtual void freeMapTarget(amd::Memory& mem, void* target) {} + virtual bool unbindExternalDevice(uint flags, void* const pDevice[], void* pContext, + bool validateOnly) { + return true; + } - Pal::GfxIpLevel ipLevel() const { return ipLevel_; } - Pal::AsicRevision asicRevision() const { return asicRevision_; } + //! Releases non-blocking map target memory + virtual void freeMapTarget(amd::Memory& mem, void* target) {} - const AMDDeviceInfo* hwInfo() const { return hwInfo_; } + Pal::GfxIpLevel ipLevel() const { return ipLevel_; } + Pal::AsicRevision asicRevision() const { return asicRevision_; } - //! Empty implementation on Null device - virtual bool globalFreeMemory(size_t* freeMemory) const { return false; } + const AMDDeviceInfo* hwInfo() const { return hwInfo_; } - //! Get GPU device settings - const pal::Settings& settings() const - { return reinterpret_cast(*settings_); } - virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags, void* svmPtr) const { return NULL; } - virtual void svmFree(void* ptr) const {return;} + //! Empty implementation on Null device + virtual bool globalFreeMemory(size_t* freeMemory) const { return false; } + + //! 
Get GPU device settings + const pal::Settings& settings() const { return reinterpret_cast(*settings_); } + virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment, + cl_svm_mem_flags flags, void* svmPtr) const { + return NULL; + } + virtual void svmFree(void* ptr) const { return; } #if defined(WITH_LIGHTNING_COMPILER) - amd::CacheCompilation* cacheCompilation() const { return cacheCompilation_.get(); } + amd::CacheCompilation* cacheCompilation() const { return cacheCompilation_.get(); } #endif -protected: - Pal::AsicRevision asicRevision_; //!< ASIC revision - Pal::GfxIpLevel ipLevel_; //!< Device IP level - const AMDDeviceInfo* hwInfo_; //!< Device HW info structure + protected: + Pal::AsicRevision asicRevision_; //!< ASIC revision + Pal::GfxIpLevel ipLevel_; //!< Device IP level + const AMDDeviceInfo* hwInfo_; //!< Device HW info structure #if defined(WITH_LIGHTNING_COMPILER) - std::unique_ptr cacheCompilation_; //! Compilation with cache support + std::unique_ptr cacheCompilation_; //! Compilation with cache support #endif - //! Fills OpenCL device info structure - void fillDeviceInfo( - const Pal::DeviceProperties& palProp,//!< PAL device properties - const Pal::GpuMemoryHeapProperties heaps[Pal::GpuHeapCount], - size_t maxTextureSize, //!< Maximum texture size supported in HW - uint numComputeRings, //!< Number of compute rings - uint numExclusiveComputeRings //!< Number of exclusive compute rings - ); + //! Fills OpenCL device info structure + void fillDeviceInfo(const Pal::DeviceProperties& palProp, //!< PAL device properties + const Pal::GpuMemoryHeapProperties heaps[Pal::GpuHeapCount], + size_t maxTextureSize, //!< Maximum texture size supported in HW + uint numComputeRings, //!< Number of compute rings + uint numExclusiveComputeRings //!< Number of exclusive compute rings + ); }; //! 
Forward declarations @@ -153,429 +154,400 @@ class ThreadTrace; #define CL_FILTER_NONE 0x1142 #endif -class Sampler : public device::Sampler -{ -public: - //! Constructor - Sampler(const Device& dev): dev_(dev) {} +class Sampler : public device::Sampler { + public: + //! Constructor + Sampler(const Device& dev) : dev_(dev) {} - //! Default destructor for the device memory object - virtual ~Sampler(); + //! Default destructor for the device memory object + virtual ~Sampler(); - //! Creates a device sampler from the OCL sampler state - bool create( - uint32_t oclSamplerState //!< OCL sampler state - ); + //! Creates a device sampler from the OCL sampler state + bool create(uint32_t oclSamplerState //!< OCL sampler state + ); - //! Creates a device sampler from the OCL sampler state - bool create( - const amd::Sampler& owner //!< AMD sampler object - ); + //! Creates a device sampler from the OCL sampler state + bool create(const amd::Sampler& owner //!< AMD sampler object + ); - const void* hwState() const { return hwState_; } + const void* hwState() const { return hwState_; } -private: - //! Disable default copy constructor - Sampler& operator=(const Sampler&); + private: + //! Disable default copy constructor + Sampler& operator=(const Sampler&); - //! Disable operator= - Sampler(const Sampler&); + //! Disable operator= + Sampler(const Sampler&); - const Device& dev_; //!< Device object associated with the sampler - address hwState_; //!< GPU HW state (\todo legacy path) + const Device& dev_; //!< Device object associated with the sampler + address hwState_; //!< GPU HW state (\todo legacy path) }; //! A GPU device ordinal (physical GPU device) -class Device : public NullDevice -{ -public: - //! Locks any access to the virtual GPUs - class ScopedLockVgpus : public amd::StackObject { - public: - //! Default constructor - ScopedLockVgpus(const Device& dev); +class Device : public NullDevice { + public: + //! 
Locks any access to the virtual GPUs + class ScopedLockVgpus : public amd::StackObject { + public: + //! Default constructor + ScopedLockVgpus(const Device& dev); - //! Destructor - ~ScopedLockVgpus(); + //! Destructor + ~ScopedLockVgpus(); - private: - const Device& dev_; //! Device object - }; + private: + const Device& dev_; //! Device object + }; - //! Transfer buffers - class XferBuffers : public amd::HeapObject - { - public: - static const size_t MaxXferBufListSize = 8; + //! Transfer buffers + class XferBuffers : public amd::HeapObject { + public: + static const size_t MaxXferBufListSize = 8; - //! Default constructor - XferBuffers(const Device& device, Resource::MemoryType type, size_t bufSize) - : type_(type) - , bufSize_(bufSize) - , acquiredCnt_(0) - , gpuDevice_(device) - {} + //! Default constructor + XferBuffers(const Device& device, Resource::MemoryType type, size_t bufSize) + : type_(type), bufSize_(bufSize), acquiredCnt_(0), gpuDevice_(device) {} - //! Default destructor - ~XferBuffers(); + //! Default destructor + ~XferBuffers(); - //! Creates the xfer buffers object - bool create(); + //! Creates the xfer buffers object + bool create(); - //! Acquires an instance of the transfer buffers - Memory& acquire(); + //! Acquires an instance of the transfer buffers + Memory& acquire(); - //! Releases transfer buffer - void release( - VirtualGPU& gpu, //!< Virual GPU object used with the buffer - Memory& buffer //!< Transfer buffer for release - ); + //! Releases transfer buffer + void release(VirtualGPU& gpu, //!< Virual GPU object used with the buffer + Memory& buffer //!< Transfer buffer for release + ); - //! Returns the buffer's size for transfer - size_t bufSize() const { return bufSize_; } + //! Returns the buffer's size for transfer + size_t bufSize() const { return bufSize_; } - private: - //! Disable copy constructor - XferBuffers(const XferBuffers&); - - //! Disable assignment operator - XferBuffers& operator=(const XferBuffers&); - - //! 
Get device object - const Device& dev() const { return gpuDevice_; } - - Resource::MemoryType type_; //!< The buffer's type - size_t bufSize_; //!< Staged buffer size - std::list freeBuffers_; //!< The list of free buffers - amd::Atomic acquiredCnt_; //!< The total number of acquired buffers - amd::Monitor lock_; //!< Stgaed buffer acquire/release lock - const Device& gpuDevice_; //!< GPU device object - }; - - struct ScratchBuffer : public amd::HeapObject - { - uint regNum_; //!< The number of used scratch registers - Memory* memObj_; //!< Memory objects for scratch buffers - uint64_t offset_; //!< Offset from the global scratch store - uint64_t size_; //!< Scratch buffer size on this queue - - //! Default constructor - ScratchBuffer(): regNum_(0), memObj_(NULL), offset_(0) {} - - //! Default constructor - ~ScratchBuffer(); - - //! Destroys memory objects - void destroyMemory(); - }; - - - class SrdManager : public amd::HeapObject { - public: - SrdManager(const Device& dev, uint srdSize, uint bufSize) - : dev_(dev) - , numFlags_(bufSize / (srdSize * MaskBits)) - , srdSize_(srdSize) - , bufSize_(bufSize) {} - ~SrdManager(); - - //! Allocates a new SRD slot for a resource - uint64_t allocSrdSlot(address* cpuAddr); - - //! Frees a SRD slot - void freeSrdSlot(uint64_t addr); - - // Fills the memory list for VidMM KMD - void fillResourceList(std::vector& memList); - - private: - //! Disable copy constructor - SrdManager(const SrdManager&); - - //! Disable assignment operator - SrdManager& operator=(const SrdManager&); - - struct Chunk { - Memory* buf_; - uint* flags_; - Chunk(): buf_(NULL), flags_(NULL) {} - }; - - static const uint MaskBits = 32; - const Device& dev_; //!< GPU device for the chunk manager - amd::Monitor ml_; //!< Global lock for the SRD manager - std::vector pool_; //!< Pool of SRD buffers - uint numFlags_; //!< Total number of flags in array - uint srdSize_; //!< SRD size - uint bufSize_; //!< Buffer size that holds SRDs - }; - - //! 
Initialise the whole GPU device subsystem - static bool init(); - - //! Shutdown the whole GPU device subsystem - static void tearDown(); - - //! Construct a new physical GPU device - Device(); - - //! Initialise a device (i.e. all parts of the constructor that could - //! potentially fail) - bool create( - Pal::IDevice* device //!< PAL device interface object - ); - - //! Destructor for the physical GPU device - virtual ~Device(); - - //! Instantiate a new virtual device - device::VirtualDevice* createVirtualDevice( - amd::CommandQueue* queue = NULL - ); - - //! Memory allocation - virtual device::Memory* createMemory( - amd::Memory& owner //!< abstraction layer memory object - ) const; - - //! Sampler object allocation - virtual bool createSampler( - const amd::Sampler& owner, //!< abstraction layer sampler object - device::Sampler** sampler //!< device sampler object - ) const; - - //! Reallocates the provided buffer object - virtual bool reallocMemory( - amd::Memory& owner //!< Buffer for reallocation - ) const; - - //! Allocates a view object from the device memory - virtual device::Memory* createView( - amd::Memory& owner, //!< Owner memory object - const device::Memory& parent //!< Parent device memory object for the view - ) const; - - //! Create the device program. - virtual device::Program* createProgram(amd::option::Options* options = NULL); - - //! Attempt to bind with external graphics API's device/context - virtual bool bindExternalDevice( - uint flags, void* const pDevice[], void* pContext, bool validateOnly); - - //! Attempt to unbind with external graphics API's device/context - virtual bool unbindExternalDevice( - uint flags, void* const pDevice[], void* pContext, bool validateOnly); - - //! Validates kernel before execution - virtual bool validateKernel( - const amd::Kernel& kernel, //!< AMD kernel object - const device::VirtualDevice* vdev - ); - - //! 
Retrieves information about free memory on a GPU device - virtual bool globalFreeMemory(size_t* freeMemory) const; - - //! Returns a GPU memory object from AMD memory object - pal::Memory* getGpuMemory( - amd::Memory* mem //!< Pointer to AMD memory object - ) const; - - amd::Monitor& lockAsyncOps() const { return *lockAsyncOps_; } - - //! Returns the lock object for the virtual gpus list - amd::Monitor* vgpusAccess() const { return vgpusAccess_; } - - //! Returns the monitor object for PAL - amd::Monitor& lockPAL() const { return *lockPAL_; } - - //! Returns the number of virtual GPUs allocated on this device - uint numOfVgpus() const { return numOfVgpus_; } - uint numOfVgpus_; //!< The number of virtual GPUs (lock protected) - - typedef std::vector VirtualGPUs; - - //! Returns the list of all virtual GPUs running on this device - const VirtualGPUs& vgpus() const { return vgpus_; } - VirtualGPUs vgpus_; //!< The list of all running virtual gpus (lock protected) - - //! Scratch buffer allocation - pal::Memory* createScratchBuffer( - size_t size //!< Size of buffer - ) const; - - //! Returns transfer buffer object - XferBuffers& xferWrite() const { return *xferWrite_; } - - //! Returns transfer buffer object - XferBuffers& xferRead() const { return *xferRead_; } - - //! Finds an appropriate map target - amd::Memory* findMapTarget(size_t size) const; - - //! Adds a map target to the cache - bool addMapTarget(amd::Memory* memory) const; - - //! Returns resource cache object - ResourceCache& resourceCache() const { return *resourceCache_; } - - //! Returns the number of available compute rings - uint numComputeEngines() const { return numComputeEngines_; } - - //! Returns the number of available compute rings - uint numExclusiveComputeEngines() const { return numExclusiveComputeEngines_; } - - //! Returns the number of available DMA engines - uint numDMAEngines() const { return numDmaEngines_; } - - //! 
Returns engines object - const device::BlitManager& xferMgr() const; - - VirtualGPU* xferQueue() const { return xferQueue_; } - - //! Retrieves the internal format from the OCL format - Pal::ChNumFormat getPalFormat( - const amd::Image::Format& format, //! OCL image format - Pal::ChannelMapping* channel - ) const; - - const ScratchBuffer* scratch(uint idx) const { return scratch_[idx]; } - - //! Returns the global scratch buffer - Memory* globalScratchBuf() const { return globalScratchBuf_; }; - - //! Destroys scratch buffer memory - void destroyScratchBuffers(); - - //! Initialize heap resources if uninitialized - bool initializeHeapResources(); - - //! Set GSL sampler to the specified state - void fillHwSampler( - uint32_t state, //!< Sampler's OpenCL state - void* hwState, //!< Sampler's HW state - uint32_t hwStateSize, //!< Size of sampler's HW state - uint32_t mipFilter = CL_FILTER_NONE, //!< Mip filter - float minLod = 0.f, //!< Min level of detail - float maxLod = CL_MAXFLOAT //!< Max level of detail - ) const; - - //! host memory alloc - virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const; - - //! SVM allocation - virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment, - cl_svm_mem_flags flags, void* svmPtr) const; - - //! Free host SVM memory - void hostFree(void* ptr, size_t size) const; - - //! SVM free - virtual void svmFree(void* ptr) const; - - //! Returns SRD manger object - SrdManager& srds() const { return *srdManager_; } - - //! Initial the Hardware Debug Manager - cl_int hwDebugManagerInit(amd::Context *context, uintptr_t messageStorage); - - //! Returns PAL device properties - const Pal::DeviceProperties& properties() const { return properties_; } - - //! Returns PAL device interface - Pal::IDevice* iDev() const { return device_; } - - //! Return private device context for internal allocations - amd::Context& context() const { return *context_; } - - //! 
Update free memory for OCL extension - void updateFreeMemory( - Pal::GpuHeap heap, //!< PAL GPU heap for update - Pal::gpusize size, //!< Size of alocated/destroyed memory - bool free //!< TRUE if runtime frees memory - ); - - //! Create internal blit program - bool createBlitProgram(); - - //! Interop for GL device - bool initGLInteropPrivateExt(void* GLplatformContext, void* GLdeviceContext) const; - bool glCanInterop(void* GLplatformContext, void* GLdeviceContext) const; - bool resGLAssociate(void* GLContext, uint name, uint type, void** handle, void** mbResHandle, size_t* offset -#ifdef ATI_OS_WIN - , Pal::DoppDesktopInfo& doppDesktopInfo -#endif - ) const; - bool resGLAcquire(void* GLplatformContext, void* mbResHandle, uint type) const; - bool resGLRelease(void* GLplatformContext, void* mbResHandle, uint type) const; - bool resGLFree(void* GLplatformContext, void* mbResHandle, uint type) const; - -private: + private: //! Disable copy constructor - Device(const Device&); + XferBuffers(const XferBuffers&); - //! Disable assignment - Device& operator=(const Device&); + //! Disable assignment operator + XferBuffers& operator=(const XferBuffers&); - //! Sends the stall command to all queues - bool stallQueues(); + //! Get device object + const Device& dev() const { return gpuDevice_; } - //! Buffer allocation - pal::Memory* createBuffer( - amd::Memory& owner, //!< Abstraction layer memory object - bool directAccess //!< Use direct host memory access - ) const; + Resource::MemoryType type_; //!< The buffer's type + size_t bufSize_; //!< Staged buffer size + std::list freeBuffers_; //!< The list of free buffers + amd::Atomic acquiredCnt_; //!< The total number of acquired buffers + amd::Monitor lock_; //!< Stgaed buffer acquire/release lock + const Device& gpuDevice_; //!< GPU device object + }; - //! 
Image allocation - pal::Memory* createImage( - amd::Memory& owner, //!< Abstraction layer memory object - bool directAccess //!< Use direct host memory access - ) const; + struct ScratchBuffer : public amd::HeapObject { + uint regNum_; //!< The number of used scratch registers + Memory* memObj_; //!< Memory objects for scratch buffers + uint64_t offset_; //!< Offset from the global scratch store + uint64_t size_; //!< Scratch buffer size on this queue - //! Allocates/reallocates the scratch buffer, according to the usage - bool allocScratch( - uint regNum, //!< Number of the scratch registers - const VirtualGPU* vgpu //!< Virtual GPU for the allocation - ); + //! Default constructor + ScratchBuffer() : regNum_(0), memObj_(NULL), offset_(0) {} - //! Interop for D3D devices - bool associateD3D11Device( - void* d3d11Device //!< void* is of type ID3D11Device* - ); - bool associateD3D10Device( - void* d3d10Device //!< void* is of type ID3D10Device* - ); - bool associateD3D9Device( - void* d3d9Device //!< void* is of type IDirect3DDevice9* - ); - //! Interop for GL device - bool glAssociate(void* GLplatformContext, void* GLdeviceContext) const; - bool glDissociate(void* GLplatformContext, void* GLdeviceContext) const; + //! 
Default constructor + ~ScratchBuffer(); - amd::Context* context_; //!< A dummy context for internal allocations - amd::Monitor* lockAsyncOps_; //!< Lock to serialise all async ops on this device - amd::Monitor* lockForInitHeap_; //!< Lock to serialise all async ops on initialization heap operation - amd::Monitor* lockPAL_; //!< Lock to serialise PAL access - amd::Monitor* vgpusAccess_; //!< Lock to serialise virtual gpu list access - amd::Monitor* scratchAlloc_; //!< Lock to serialise scratch allocation - amd::Monitor* mapCacheOps_; //!< Lock to serialise cache for the map resources - XferBuffers* xferRead_; //!< Transfer buffers read - XferBuffers* xferWrite_; //!< Transfer buffers write - std::vector* mapCache_; //!< Map cache info structure - ResourceCache* resourceCache_; //!< Resource cache - uint numComputeEngines_; //!< The number of available compute engines - uint numExclusiveComputeEngines_; //!< The number of available compute engines - uint numDmaEngines_; //!< The number of available compute engines - bool heapInitComplete_; //!< Keep track of initialization status of heap resources - VirtualGPU* xferQueue_; //!< Transfer queue - std::vector scratch_; //!< Scratch buffers for kernels - Memory* globalScratchBuf_; //!< Global scratch buffer - SrdManager* srdManager_; //!< SRD manager object - static AppProfile appProfile_; //!< application profile - mutable bool freeCPUMem_; //!< flag to mark GPU free SVM CPU mem - Pal::DeviceProperties properties_; //!< PAL device properties - Pal::IDevice* device_; //!< PAL device object - std::atomic freeMem[Pal::GpuHeap::GpuHeapCount]; //!< Free memory counter + //! Destroys memory objects + void destroyMemory(); + }; + + + class SrdManager : public amd::HeapObject { + public: + SrdManager(const Device& dev, uint srdSize, uint bufSize) + : dev_(dev), + numFlags_(bufSize / (srdSize * MaskBits)), + srdSize_(srdSize), + bufSize_(bufSize) {} + ~SrdManager(); + + //! 
Allocates a new SRD slot for a resource + uint64_t allocSrdSlot(address* cpuAddr); + + //! Frees a SRD slot + void freeSrdSlot(uint64_t addr); + + // Fills the memory list for VidMM KMD + void fillResourceList(std::vector& memList); + + private: + //! Disable copy constructor + SrdManager(const SrdManager&); + + //! Disable assignment operator + SrdManager& operator=(const SrdManager&); + + struct Chunk { + Memory* buf_; + uint* flags_; + Chunk() : buf_(NULL), flags_(NULL) {} + }; + + static const uint MaskBits = 32; + const Device& dev_; //!< GPU device for the chunk manager + amd::Monitor ml_; //!< Global lock for the SRD manager + std::vector pool_; //!< Pool of SRD buffers + uint numFlags_; //!< Total number of flags in array + uint srdSize_; //!< SRD size + uint bufSize_; //!< Buffer size that holds SRDs + }; + + //! Initialise the whole GPU device subsystem + static bool init(); + + //! Shutdown the whole GPU device subsystem + static void tearDown(); + + //! Construct a new physical GPU device + Device(); + + //! Initialise a device (i.e. all parts of the constructor that could + //! potentially fail) + bool create(Pal::IDevice* device //!< PAL device interface object + ); + + //! Destructor for the physical GPU device + virtual ~Device(); + + //! Instantiate a new virtual device + device::VirtualDevice* createVirtualDevice(amd::CommandQueue* queue = NULL); + + //! Memory allocation + virtual device::Memory* createMemory(amd::Memory& owner //!< abstraction layer memory object + ) const; + + //! Sampler object allocation + virtual bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object + device::Sampler** sampler //!< device sampler object + ) const; + + //! Reallocates the provided buffer object + virtual bool reallocMemory(amd::Memory& owner //!< Buffer for reallocation + ) const; + + //! 
Allocates a view object from the device memory + virtual device::Memory* createView( + amd::Memory& owner, //!< Owner memory object + const device::Memory& parent //!< Parent device memory object for the view + ) const; + + //! Create the device program. + virtual device::Program* createProgram(amd::option::Options* options = NULL); + + //! Attempt to bind with external graphics API's device/context + virtual bool bindExternalDevice(uint flags, void* const pDevice[], void* pContext, + bool validateOnly); + + //! Attempt to unbind with external graphics API's device/context + virtual bool unbindExternalDevice(uint flags, void* const pDevice[], void* pContext, + bool validateOnly); + + //! Validates kernel before execution + virtual bool validateKernel(const amd::Kernel& kernel, //!< AMD kernel object + const device::VirtualDevice* vdev); + + //! Retrieves information about free memory on a GPU device + virtual bool globalFreeMemory(size_t* freeMemory) const; + + //! Returns a GPU memory object from AMD memory object + pal::Memory* getGpuMemory(amd::Memory* mem //!< Pointer to AMD memory object + ) const; + + amd::Monitor& lockAsyncOps() const { return *lockAsyncOps_; } + + //! Returns the lock object for the virtual gpus list + amd::Monitor* vgpusAccess() const { return vgpusAccess_; } + + //! Returns the monitor object for PAL + amd::Monitor& lockPAL() const { return *lockPAL_; } + + //! Returns the number of virtual GPUs allocated on this device + uint numOfVgpus() const { return numOfVgpus_; } + uint numOfVgpus_; //!< The number of virtual GPUs (lock protected) + + typedef std::vector VirtualGPUs; + + //! Returns the list of all virtual GPUs running on this device + const VirtualGPUs& vgpus() const { return vgpus_; } + VirtualGPUs vgpus_; //!< The list of all running virtual gpus (lock protected) + + //! Scratch buffer allocation + pal::Memory* createScratchBuffer(size_t size //!< Size of buffer + ) const; + + //! 
Returns transfer buffer object + XferBuffers& xferWrite() const { return *xferWrite_; } + + //! Returns transfer buffer object + XferBuffers& xferRead() const { return *xferRead_; } + + //! Finds an appropriate map target + amd::Memory* findMapTarget(size_t size) const; + + //! Adds a map target to the cache + bool addMapTarget(amd::Memory* memory) const; + + //! Returns resource cache object + ResourceCache& resourceCache() const { return *resourceCache_; } + + //! Returns the number of available compute rings + uint numComputeEngines() const { return numComputeEngines_; } + + //! Returns the number of available compute rings + uint numExclusiveComputeEngines() const { return numExclusiveComputeEngines_; } + + //! Returns the number of available DMA engines + uint numDMAEngines() const { return numDmaEngines_; } + + //! Returns engines object + const device::BlitManager& xferMgr() const; + + VirtualGPU* xferQueue() const { return xferQueue_; } + + //! Retrieves the internal format from the OCL format + Pal::ChNumFormat getPalFormat(const amd::Image::Format& format, //! OCL image format + Pal::ChannelMapping* channel) const; + + const ScratchBuffer* scratch(uint idx) const { return scratch_[idx]; } + + //! Returns the global scratch buffer + Memory* globalScratchBuf() const { return globalScratchBuf_; }; + + //! Destroys scratch buffer memory + void destroyScratchBuffers(); + + //! Initialize heap resources if uninitialized + bool initializeHeapResources(); + + //! Set GSL sampler to the specified state + void fillHwSampler(uint32_t state, //!< Sampler's OpenCL state + void* hwState, //!< Sampler's HW state + uint32_t hwStateSize, //!< Size of sampler's HW state + uint32_t mipFilter = CL_FILTER_NONE, //!< Mip filter + float minLod = 0.f, //!< Min level of detail + float maxLod = CL_MAXFLOAT //!< Max level of detail + ) const; + + //! host memory alloc + virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const; + + //! 
SVM allocation + virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment, + cl_svm_mem_flags flags, void* svmPtr) const; + + //! Free host SVM memory + void hostFree(void* ptr, size_t size) const; + + //! SVM free + virtual void svmFree(void* ptr) const; + + //! Returns SRD manger object + SrdManager& srds() const { return *srdManager_; } + + //! Initial the Hardware Debug Manager + cl_int hwDebugManagerInit(amd::Context* context, uintptr_t messageStorage); + + //! Returns PAL device properties + const Pal::DeviceProperties& properties() const { return properties_; } + + //! Returns PAL device interface + Pal::IDevice* iDev() const { return device_; } + + //! Return private device context for internal allocations + amd::Context& context() const { return *context_; } + + //! Update free memory for OCL extension + void updateFreeMemory(Pal::GpuHeap heap, //!< PAL GPU heap for update + Pal::gpusize size, //!< Size of alocated/destroyed memory + bool free //!< TRUE if runtime frees memory + ); + + //! Create internal blit program + bool createBlitProgram(); + + //! Interop for GL device + bool initGLInteropPrivateExt(void* GLplatformContext, void* GLdeviceContext) const; + bool glCanInterop(void* GLplatformContext, void* GLdeviceContext) const; + bool resGLAssociate(void* GLContext, uint name, uint type, void** handle, void** mbResHandle, + size_t* offset +#ifdef ATI_OS_WIN + , + Pal::DoppDesktopInfo& doppDesktopInfo +#endif + ) const; + bool resGLAcquire(void* GLplatformContext, void* mbResHandle, uint type) const; + bool resGLRelease(void* GLplatformContext, void* mbResHandle, uint type) const; + bool resGLFree(void* GLplatformContext, void* mbResHandle, uint type) const; + + private: + //! Disable copy constructor + Device(const Device&); + + //! Disable assignment + Device& operator=(const Device&); + + //! Sends the stall command to all queues + bool stallQueues(); + + //! 
Buffer allocation + pal::Memory* createBuffer(amd::Memory& owner, //!< Abstraction layer memory object + bool directAccess //!< Use direct host memory access + ) const; + + //! Image allocation + pal::Memory* createImage(amd::Memory& owner, //!< Abstraction layer memory object + bool directAccess //!< Use direct host memory access + ) const; + + //! Allocates/reallocates the scratch buffer, according to the usage + bool allocScratch(uint regNum, //!< Number of the scratch registers + const VirtualGPU* vgpu //!< Virtual GPU for the allocation + ); + + //! Interop for D3D devices + bool associateD3D11Device(void* d3d11Device //!< void* is of type ID3D11Device* + ); + bool associateD3D10Device(void* d3d10Device //!< void* is of type ID3D10Device* + ); + bool associateD3D9Device(void* d3d9Device //!< void* is of type IDirect3DDevice9* + ); + //! Interop for GL device + bool glAssociate(void* GLplatformContext, void* GLdeviceContext) const; + bool glDissociate(void* GLplatformContext, void* GLdeviceContext) const; + + amd::Context* context_; //!< A dummy context for internal allocations + amd::Monitor* lockAsyncOps_; //!< Lock to serialise all async ops on this device + amd::Monitor* + lockForInitHeap_; //!< Lock to serialise all async ops on initialization heap operation + amd::Monitor* lockPAL_; //!< Lock to serialise PAL access + amd::Monitor* vgpusAccess_; //!< Lock to serialise virtual gpu list access + amd::Monitor* scratchAlloc_; //!< Lock to serialise scratch allocation + amd::Monitor* mapCacheOps_; //!< Lock to serialise cache for the map resources + XferBuffers* xferRead_; //!< Transfer buffers read + XferBuffers* xferWrite_; //!< Transfer buffers write + std::vector* mapCache_; //!< Map cache info structure + ResourceCache* resourceCache_; //!< Resource cache + uint numComputeEngines_; //!< The number of available compute engines + uint numExclusiveComputeEngines_; //!< The number of available compute engines + uint numDmaEngines_; //!< The number of 
available compute engines + bool heapInitComplete_; //!< Keep track of initialization status of heap resources + VirtualGPU* xferQueue_; //!< Transfer queue + std::vector scratch_; //!< Scratch buffers for kernels + Memory* globalScratchBuf_; //!< Global scratch buffer + SrdManager* srdManager_; //!< SRD manager object + static AppProfile appProfile_; //!< application profile + mutable bool freeCPUMem_; //!< flag to mark GPU free SVM CPU mem + Pal::DeviceProperties properties_; //!< PAL device properties + Pal::IDevice* device_; //!< PAL device object + std::atomic freeMem[Pal::GpuHeap::GpuHeapCount]; //!< Free memory counter }; /*@}*/} // namespace pal - diff --git a/rocclr/runtime/device/pal/paldeviced3d10.cpp b/rocclr/runtime/device/pal/paldeviced3d10.cpp index d03ac6c18c..e7d31a9d86 100644 --- a/rocclr/runtime/device/pal/paldeviced3d10.cpp +++ b/rocclr/runtime/device/pal/paldeviced3d10.cpp @@ -2,142 +2,131 @@ #if defined(ATI_OS_LINUX) namespace pal { -bool -Device::associateD3D10Device(void* d3d10Device) -{ - return false; -} -} // pal -#else // !ATI_OS_WIN +bool Device::associateD3D10Device(void* d3d10Device) { return false; } +} // pal +#else // !ATI_OS_WIN #include /************************************************************************************************************** * Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch. -* This means OCL client spec will need to change to include headers directly from the DXX perforce tree. +* This means OCL client spec will need to change to include headers directly from the DXX perforce +*tree. * However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change -* without notification. So it is safe to use a local copy of the relevant DXX extension interface classes. +* without notification. So it is safe to use a local copy of the relevant DXX extension interface +*classes. 
**************************************************************************************************************/ #include "DxxOpenCLInteropExt.h" namespace pal { -static bool -queryD3D10DeviceGPUMask(ID3D10Device* pd3d10Device, UINT* pd3d10DeviceGPUMask) -{ - HMODULE hDLL = nullptr; - IAmdDxExt* pExt = nullptr; - IAmdDxExtCLInterop* pCLExt = nullptr; - PFNAmdDxExtCreate AmdDxExtCreate; - HRESULT hr = S_OK; +static bool queryD3D10DeviceGPUMask(ID3D10Device* pd3d10Device, UINT* pd3d10DeviceGPUMask) { + HMODULE hDLL = nullptr; + IAmdDxExt* pExt = nullptr; + IAmdDxExtCLInterop* pCLExt = nullptr; + PFNAmdDxExtCreate AmdDxExtCreate; + HRESULT hr = S_OK; - // Get a handle to the DXX DLL with extension API support +// Get a handle to the DXX DLL with extension API support #if defined _WIN64 - static const CHAR dxxModuleName[13] = "atidxx64.dll"; + static const CHAR dxxModuleName[13] = "atidxx64.dll"; #else - static const CHAR dxxModuleName[13] = "atidxx32.dll"; + static const CHAR dxxModuleName[13] = "atidxx32.dll"; #endif - hDLL = GetModuleHandle(dxxModuleName); + hDLL = GetModuleHandle(dxxModuleName); - if (hDLL == nullptr) { - hr = E_FAIL; + if (hDLL == nullptr) { + hr = E_FAIL; + } + + // Get the exported AmdDxExtCreate() function pointer + if (SUCCEEDED(hr)) { + AmdDxExtCreate = reinterpret_cast(GetProcAddress(hDLL, "AmdDxExtCreate")); + if (AmdDxExtCreate == nullptr) { + hr = E_FAIL; } + } - // Get the exported AmdDxExtCreate() function pointer - if (SUCCEEDED(hr)) { - AmdDxExtCreate = reinterpret_cast( - GetProcAddress(hDLL, "AmdDxExtCreate")); - if (AmdDxExtCreate == nullptr) { - hr = E_FAIL; - } - } - - // Create the extension object - if (SUCCEEDED(hr)) { - hr = AmdDxExtCreate(pd3d10Device, &pExt); - } - - // Get the extension version information - if (SUCCEEDED(hr)) { - AmdDxExtVersion extVersion; - hr = pExt->GetVersion(&extVersion); - - if (extVersion.majorVersion == 0) - { - hr = E_FAIL; - } - } - - // Get the OpenCL Interop interface - if (SUCCEEDED(hr)) { - 
pCLExt = static_cast( - pExt->GetExtInterface(AmdDxExtCLInteropID)); - if (pCLExt != nullptr) { - // Get the GPU mask using the CL Interop extension. - pCLExt->QueryInteropGpuMask(pd3d10DeviceGPUMask); - } - else { - hr = E_FAIL; - } + // Create the extension object + if (SUCCEEDED(hr)) { + hr = AmdDxExtCreate(pd3d10Device, &pExt); + } + + // Get the extension version information + if (SUCCEEDED(hr)) { + AmdDxExtVersion extVersion; + hr = pExt->GetVersion(&extVersion); + + if (extVersion.majorVersion == 0) { + hr = E_FAIL; } + } + // Get the OpenCL Interop interface + if (SUCCEEDED(hr)) { + pCLExt = static_cast(pExt->GetExtInterface(AmdDxExtCLInteropID)); if (pCLExt != nullptr) { - pCLExt->Release(); + // Get the GPU mask using the CL Interop extension. + pCLExt->QueryInteropGpuMask(pd3d10DeviceGPUMask); + } else { + hr = E_FAIL; } + } - if (pExt != nullptr) { - pExt->Release(); - } + if (pCLExt != nullptr) { + pCLExt->Release(); + } - return (SUCCEEDED(hr)); + if (pExt != nullptr) { + pExt->Release(); + } + + return (SUCCEEDED(hr)); } -bool -Device::associateD3D10Device(void* d3d10Device) -{ - ID3D10Device* pd3d10Device = static_cast(d3d10Device); +bool Device::associateD3D10Device(void* d3d10Device) { + ID3D10Device* pd3d10Device = static_cast(d3d10Device); - IDXGIDevice* pDXGIDevice; - pd3d10Device->QueryInterface(__uuidof(IDXGIDevice), (void **)&pDXGIDevice); + IDXGIDevice* pDXGIDevice; + pd3d10Device->QueryInterface(__uuidof(IDXGIDevice), (void**)&pDXGIDevice); - IDXGIAdapter* pDXGIAdapter; - pDXGIDevice->GetAdapter(&pDXGIAdapter); + IDXGIAdapter* pDXGIAdapter; + pDXGIDevice->GetAdapter(&pDXGIAdapter); - DXGI_ADAPTER_DESC adapterDesc; - pDXGIAdapter->GetDesc(&adapterDesc); + DXGI_ADAPTER_DESC adapterDesc; + pDXGIAdapter->GetDesc(&adapterDesc); - // match the adapter - bool canInteroperate = - (properties().osProperties.luidHighPart == adapterDesc.AdapterLuid.HighPart) && - (properties().osProperties.luidLowPart == adapterDesc.AdapterLuid.LowPart); + // match 
the adapter + bool canInteroperate = + (properties().osProperties.luidHighPart == adapterDesc.AdapterLuid.HighPart) && + (properties().osProperties.luidLowPart == adapterDesc.AdapterLuid.LowPart); - UINT chainBitMask = 1 << properties().gpuIndex; + UINT chainBitMask = 1 << properties().gpuIndex; - // match the chain ID - if (canInteroperate) { - UINT d3d10DeviceGPUMask = 0; + // match the chain ID + if (canInteroperate) { + UINT d3d10DeviceGPUMask = 0; - if (queryD3D10DeviceGPUMask(pd3d10Device, &d3d10DeviceGPUMask)) { - canInteroperate = (chainBitMask & d3d10DeviceGPUMask) != 0; - } - else { - // special handling for Intel iGPU + AMD dGPU in LDA mode - // (only occurs on a PX platform) where - // the D3D10Device object is created on the Intel iGPU and - // passed to AMD dGPU (secondary) to interoperate. - if (chainBitMask > 1) { - canInteroperate = false; - } - } + if (queryD3D10DeviceGPUMask(pd3d10Device, &d3d10DeviceGPUMask)) { + canInteroperate = (chainBitMask & d3d10DeviceGPUMask) != 0; + } else { + // special handling for Intel iGPU + AMD dGPU in LDA mode + // (only occurs on a PX platform) where + // the D3D10Device object is created on the Intel iGPU and + // passed to AMD dGPU (secondary) to interoperate. 
+ if (chainBitMask > 1) { + canInteroperate = false; + } } + } - pDXGIDevice->Release(); - pDXGIAdapter->Release(); + pDXGIDevice->Release(); + pDXGIAdapter->Release(); - return canInteroperate; + return canInteroperate; } -} // pal +} // pal -#endif // !ATI_OS_WIN +#endif // !ATI_OS_WIN diff --git a/rocclr/runtime/device/pal/paldeviced3d11.cpp b/rocclr/runtime/device/pal/paldeviced3d11.cpp index e12cc14d5d..025b8ed9a5 100644 --- a/rocclr/runtime/device/pal/paldeviced3d11.cpp +++ b/rocclr/runtime/device/pal/paldeviced3d11.cpp @@ -2,141 +2,132 @@ #if defined(ATI_OS_LINUX) namespace pal { -bool -Device::associateD3D11Device(void* d3d11Device) -{ - return false; +bool Device::associateD3D11Device(void* d3d11Device) { return false; } } -} -#else // !ATI_OS_LINUX +#else // !ATI_OS_LINUX #include /************************************************************************************************************** -* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch. -* This means OCL client spec will need to change to include headers directly from the DXX perforce tree. +* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch. +* This means OCL client spec will need to change to include headers directly from the DXX perforce +*tree. * However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change -* without notification. So it is safe to use a local copy of the relevant DXX extension interface classes. +* without notification. So it is safe to use a local copy of the relevant DXX extension interface +*classes. 
**************************************************************************************************************/ #include "DxxOpenCLInteropExt.h" namespace pal { -static bool -queryD3D11DeviceGPUMask(ID3D11Device* pd3d11Device, UINT* pd3d11DeviceGPUMask) -{ - HMODULE hDLL = nullptr; - IAmdDxExt* pExt = nullptr; - IAmdDxExtCLInterop* pCLExt = nullptr; - PFNAmdDxExtCreate11 AmdDxExtCreate11; - HRESULT hr = S_OK; +static bool queryD3D11DeviceGPUMask(ID3D11Device* pd3d11Device, UINT* pd3d11DeviceGPUMask) { + HMODULE hDLL = nullptr; + IAmdDxExt* pExt = nullptr; + IAmdDxExtCLInterop* pCLExt = nullptr; + PFNAmdDxExtCreate11 AmdDxExtCreate11; + HRESULT hr = S_OK; - // Get a handle to the DXX DLL with extension API support +// Get a handle to the DXX DLL with extension API support #if defined _WIN64 - static const CHAR dxxModuleName[13] = "atidxx64.dll"; + static const CHAR dxxModuleName[13] = "atidxx64.dll"; #else - static const CHAR dxxModuleName[13] = "atidxx32.dll"; + static const CHAR dxxModuleName[13] = "atidxx32.dll"; #endif - hDLL = GetModuleHandle(dxxModuleName); + hDLL = GetModuleHandle(dxxModuleName); - if (hDLL == nullptr) { - hr = E_FAIL; + if (hDLL == nullptr) { + hr = E_FAIL; + } + + // Get the exported AmdDxExtCreate() function pointer + if (SUCCEEDED(hr)) { + AmdDxExtCreate11 = + reinterpret_cast(GetProcAddress(hDLL, "AmdDxExtCreate11")); + if (AmdDxExtCreate11 == nullptr) { + hr = E_FAIL; } + } - // Get the exported AmdDxExtCreate() function pointer - if (SUCCEEDED(hr)) { - AmdDxExtCreate11 = reinterpret_cast( - GetProcAddress(hDLL, "AmdDxExtCreate11")); - if (AmdDxExtCreate11 == nullptr) { - hr = E_FAIL; - } - } - - // Create the extension object - if (SUCCEEDED(hr)) { - hr = AmdDxExtCreate11(pd3d11Device, &pExt); - } - - // Get the extension version information - if (SUCCEEDED(hr)) { - AmdDxExtVersion extVersion; - hr = pExt->GetVersion(&extVersion); - - if (extVersion.majorVersion == 0) { - hr = E_FAIL; - } - } - - // Get the OpenCL Interop interface - 
if (SUCCEEDED(hr)) { - pCLExt = static_cast( - pExt->GetExtInterface(AmdDxExtCLInteropID)); - if (pCLExt != nullptr) { - // Get the GPU mask using the CL Interop extension. - pCLExt->QueryInteropGpuMask(pd3d11DeviceGPUMask); - } - else { - hr = E_FAIL; - } + // Create the extension object + if (SUCCEEDED(hr)) { + hr = AmdDxExtCreate11(pd3d11Device, &pExt); + } + + // Get the extension version information + if (SUCCEEDED(hr)) { + AmdDxExtVersion extVersion; + hr = pExt->GetVersion(&extVersion); + + if (extVersion.majorVersion == 0) { + hr = E_FAIL; } + } + // Get the OpenCL Interop interface + if (SUCCEEDED(hr)) { + pCLExt = static_cast(pExt->GetExtInterface(AmdDxExtCLInteropID)); if (pCLExt != nullptr) { - pCLExt->Release(); + // Get the GPU mask using the CL Interop extension. + pCLExt->QueryInteropGpuMask(pd3d11DeviceGPUMask); + } else { + hr = E_FAIL; } + } - if (pExt != nullptr) { - pExt->Release(); - } + if (pCLExt != nullptr) { + pCLExt->Release(); + } - return (SUCCEEDED(hr)); + if (pExt != nullptr) { + pExt->Release(); + } + + return (SUCCEEDED(hr)); } -bool -Device::associateD3D11Device(void* d3d11Device) -{ - ID3D11Device* pd3d11Device = static_cast(d3d11Device); +bool Device::associateD3D11Device(void* d3d11Device) { + ID3D11Device* pd3d11Device = static_cast(d3d11Device); - IDXGIDevice* pDXGIDevice; - pd3d11Device->QueryInterface(__uuidof(IDXGIDevice), (void **)&pDXGIDevice); + IDXGIDevice* pDXGIDevice; + pd3d11Device->QueryInterface(__uuidof(IDXGIDevice), (void**)&pDXGIDevice); - IDXGIAdapter* pDXGIAdapter; - pDXGIDevice->GetAdapter(&pDXGIAdapter); + IDXGIAdapter* pDXGIAdapter; + pDXGIDevice->GetAdapter(&pDXGIAdapter); - DXGI_ADAPTER_DESC adapterDesc; - pDXGIAdapter->GetDesc(&adapterDesc); + DXGI_ADAPTER_DESC adapterDesc; + pDXGIAdapter->GetDesc(&adapterDesc); - // match the adapter - bool canInteroperate = - (properties().osProperties.luidHighPart == adapterDesc.AdapterLuid.HighPart) && - (properties().osProperties.luidLowPart == 
adapterDesc.AdapterLuid.LowPart); + // match the adapter + bool canInteroperate = + (properties().osProperties.luidHighPart == adapterDesc.AdapterLuid.HighPart) && + (properties().osProperties.luidLowPart == adapterDesc.AdapterLuid.LowPart); - UINT chainBitMask = 1 << properties().gpuIndex; + UINT chainBitMask = 1 << properties().gpuIndex; - // match the chain ID - if (canInteroperate) { - UINT d3d11DeviceGPUMask = 0; + // match the chain ID + if (canInteroperate) { + UINT d3d11DeviceGPUMask = 0; - if (queryD3D11DeviceGPUMask(pd3d11Device, &d3d11DeviceGPUMask)) { - canInteroperate = (chainBitMask & d3d11DeviceGPUMask) != 0; - } - else { - // special handling for Intel iGPU + AMD dGPU in LDA mode - // (only occurs on a PX platform) where - // the D3D11Device object is created on the Intel iGPU and - // passed to AMD dGPU (secondary) to interoperate. - if (chainBitMask > 1) { - canInteroperate = false; - } - } + if (queryD3D11DeviceGPUMask(pd3d11Device, &d3d11DeviceGPUMask)) { + canInteroperate = (chainBitMask & d3d11DeviceGPUMask) != 0; + } else { + // special handling for Intel iGPU + AMD dGPU in LDA mode + // (only occurs on a PX platform) where + // the D3D11Device object is created on the Intel iGPU and + // passed to AMD dGPU (secondary) to interoperate. 
+ if (chainBitMask > 1) { + canInteroperate = false; + } } + } - pDXGIDevice->Release(); - pDXGIAdapter->Release(); + pDXGIDevice->Release(); + pDXGIAdapter->Release(); - return canInteroperate; + return canInteroperate; } -} // pal +} // pal -#endif // !ATI_OS_LINUX +#endif // !ATI_OS_LINUX diff --git a/rocclr/runtime/device/pal/paldeviced3d9.cpp b/rocclr/runtime/device/pal/paldeviced3d9.cpp index 98bc526a23..a589d2abcf 100644 --- a/rocclr/runtime/device/pal/paldeviced3d9.cpp +++ b/rocclr/runtime/device/pal/paldeviced3d9.cpp @@ -2,52 +2,47 @@ #if defined(ATI_OS_LINUX) namespace pal { -bool -Device::associateD3D9Device(void* d3dDevice) -{ - return false; +bool Device::associateD3D9Device(void* d3dDevice) { return false; } } -} -#else // !ATI_OS_LINUX +#else // !ATI_OS_LINUX #include #include /************************************************************************************************************** -* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch. -* This means OCL client spec will need to change to include headers directly from the DXX perforce tree. +* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch. +* This means OCL client spec will need to change to include headers directly from the DXX perforce +*tree. * However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change -* without notification. So it is safe to use a local copy of the relevant DXX extension interface classes. +* without notification. So it is safe to use a local copy of the relevant DXX extension interface +*classes. 
**************************************************************************************************************/ #include "DxxOpenCLInteropExt.h" namespace pal { -bool -Device::associateD3D9Device(void* d3d9Device) -{ - D3DCAPS9 pCaps; - IDirect3D9* p3d9dev; - LUID d3d9deviceLuid = {0, 0}; +bool Device::associateD3D9Device(void* d3d9Device) { + D3DCAPS9 pCaps; + IDirect3D9* p3d9dev; + LUID d3d9deviceLuid = {0, 0}; - IDirect3DDevice9* pd3d9Device = static_cast(d3d9Device); + IDirect3DDevice9* pd3d9Device = static_cast(d3d9Device); - // Get D3D9 Device caps - pd3d9Device->GetDeviceCaps(&pCaps); - // Get 3D9 Device - pd3d9Device->GetDirect3D(&p3d9dev); + // Get D3D9 Device caps + pd3d9Device->GetDeviceCaps(&pCaps); + // Get 3D9 Device + pd3d9Device->GetDirect3D(&p3d9dev); - IDirect3D9Ex* p3d9devEx = static_cast(p3d9dev); - p3d9devEx->GetAdapterLUID(pCaps.AdapterOrdinal, &d3d9deviceLuid); - p3d9dev->Release(); + IDirect3D9Ex* p3d9devEx = static_cast(p3d9dev); + p3d9devEx->GetAdapterLUID(pCaps.AdapterOrdinal, &d3d9deviceLuid); + p3d9dev->Release(); - // match the adapter - bool canInteroperate = - (properties().osProperties.luidHighPart == d3d9deviceLuid.HighPart) && - (properties().osProperties.luidLowPart == d3d9deviceLuid.LowPart); + // match the adapter + bool canInteroperate = (properties().osProperties.luidHighPart == d3d9deviceLuid.HighPart) && + (properties().osProperties.luidLowPart == d3d9deviceLuid.LowPart); - return canInteroperate; + return canInteroperate; } -} // pal -#endif // !ATI_OS_WIN +} // pal +#endif // !ATI_OS_WIN diff --git a/rocclr/runtime/device/pal/paldevicegl.cpp b/rocclr/runtime/device/pal/paldevicegl.cpp index 40378706b3..e9d61ed826 100644 --- a/rocclr/runtime/device/pal/paldevicegl.cpp +++ b/rocclr/runtime/device/pal/paldevicegl.cpp @@ -6,7 +6,7 @@ #include #include "CL/cl_d3d10.h" #include "CL/cl_d3d11.h" -#endif // _WIN32 +#endif // _WIN32 #include #include @@ -27,7 +27,7 @@ #ifdef ATI_OS_LINUX typedef void* (*PFNGlxGetProcAddress)(const 
GLubyte* procName); -static PFNGlxGetProcAddress pfnGlxGetProcAddress=NULL; +static PFNGlxGetProcAddress pfnGlxGetProcAddress = NULL; static PFNGLXBEGINCLINTEROPAMD glXBeginCLInteropAMD = NULL; static PFNGLXENDCLINTEROPAMD glXEndCLInteropAMD = NULL; static PFNGLXRESOURCEATTACHAMD glXResourceAttachAMD = NULL; @@ -47,256 +47,242 @@ static PFNWGLGETCONTEXTGPUINFOAMD wglGetContextGPUInfoAMD = NULL; namespace pal { -bool -Device::initGLInteropPrivateExt(void* GLplatformContext, void* GLdeviceContext) const -{ +bool Device::initGLInteropPrivateExt(void* GLplatformContext, void* GLdeviceContext) const { #ifdef ATI_OS_LINUX - GLXContext ctx = (GLXContext)GLplatformContext; - void * pModule = dlopen("libGL.so.1",RTLD_NOW); + GLXContext ctx = (GLXContext)GLplatformContext; + void* pModule = dlopen("libGL.so.1", RTLD_NOW); - if(NULL == pModule) { - return false; - } - pfnGlxGetProcAddress = (PFNGlxGetProcAddress) dlsym(pModule,"glXGetProcAddress"); + if (NULL == pModule) { + return false; + } + pfnGlxGetProcAddress = (PFNGlxGetProcAddress)dlsym(pModule, "glXGetProcAddress"); - if (NULL == pfnGlxGetProcAddress) { - return false; - } + if (NULL == pfnGlxGetProcAddress) { + return false; + } - if (!glXBeginCLInteropAMD || !glXEndCLInteropAMD || !glXResourceAttachAMD || - !glXResourceDetachAMD || !glXGetContextMVPUInfoAMD) { - glXBeginCLInteropAMD = (PFNGLXBEGINCLINTEROPAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXBeginCLInteroperabilityAMD"); - glXEndCLInteropAMD = (PFNGLXENDCLINTEROPAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXEndCLInteroperabilityAMD"); - glXResourceAttachAMD = (PFNGLXRESOURCEATTACHAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXResourceAttachAMD"); - glxResourceAcquireAMD = (PFNGLXRESOURCEDETACHAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXResourceAcquireAMD"); - glxResourceReleaseAMD = (PFNGLXRESOURCEDETACHAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXResourceReleaseAMD"); - glXResourceDetachAMD = (PFNGLXRESOURCEDETACHAMD) pfnGlxGetProcAddress 
((const GLubyte *)"glXResourceDetachAMD"); - glXGetContextMVPUInfoAMD = (PFNGLXGETCONTEXTMVPUINFOAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXGetContextMVPUInfoAMD"); - } + if (!glXBeginCLInteropAMD || !glXEndCLInteropAMD || !glXResourceAttachAMD || + !glXResourceDetachAMD || !glXGetContextMVPUInfoAMD) { + glXBeginCLInteropAMD = (PFNGLXBEGINCLINTEROPAMD)pfnGlxGetProcAddress( + (const GLubyte*)"glXBeginCLInteroperabilityAMD"); + glXEndCLInteropAMD = + (PFNGLXENDCLINTEROPAMD)pfnGlxGetProcAddress((const GLubyte*)"glXEndCLInteroperabilityAMD"); + glXResourceAttachAMD = + (PFNGLXRESOURCEATTACHAMD)pfnGlxGetProcAddress((const GLubyte*)"glXResourceAttachAMD"); + glxResourceAcquireAMD = + (PFNGLXRESOURCEDETACHAMD)pfnGlxGetProcAddress((const GLubyte*)"glXResourceAcquireAMD"); + glxResourceReleaseAMD = + (PFNGLXRESOURCEDETACHAMD)pfnGlxGetProcAddress((const GLubyte*)"glXResourceReleaseAMD"); + glXResourceDetachAMD = + (PFNGLXRESOURCEDETACHAMD)pfnGlxGetProcAddress((const GLubyte*)"glXResourceDetachAMD"); + glXGetContextMVPUInfoAMD = (PFNGLXGETCONTEXTMVPUINFOAMD)pfnGlxGetProcAddress( + (const GLubyte*)"glXGetContextMVPUInfoAMD"); + } - if (!glXBeginCLInteropAMD || !glXEndCLInteropAMD || !glXResourceAttachAMD || - !glXResourceDetachAMD - ) { - return false; - } + if (!glXBeginCLInteropAMD || !glXEndCLInteropAMD || !glXResourceAttachAMD || + !glXResourceDetachAMD) { + return false; + } #else - if (!wglBeginCLInteropAMD || !wglEndCLInteropAMD || !wglResourceAttachAMD || - !wglResourceDetachAMD || !wglGetContextGPUInfoAMD) { - HGLRC fakeRC = NULL; + if (!wglBeginCLInteropAMD || !wglEndCLInteropAMD || !wglResourceAttachAMD || + !wglResourceDetachAMD || !wglGetContextGPUInfoAMD) { + HGLRC fakeRC = NULL; - if (!wglGetCurrentContext()) { - fakeRC = wglCreateContext((HDC)GLdeviceContext); - wglMakeCurrent((HDC)GLdeviceContext, fakeRC); - } - - wglBeginCLInteropAMD = (PFNWGLBEGINCLINTEROPAMD) wglGetProcAddress ("wglBeginCLInteroperabilityAMD"); - wglEndCLInteropAMD = 
(PFNWGLENDCLINTEROPAMD) wglGetProcAddress ("wglEndCLInteroperabilityAMD"); - wglResourceAttachAMD = (PFNWGLRESOURCEATTACHAMD) wglGetProcAddress ("wglResourceAttachAMD"); - wglResourceAcquireAMD = (PFNWGLRESOURCEDETACHAMD) wglGetProcAddress ("wglResourceAcquireAMD"); - wglResourceReleaseAMD = (PFNWGLRESOURCEDETACHAMD) wglGetProcAddress ("wglResourceReleaseAMD"); - wglResourceDetachAMD = (PFNWGLRESOURCEDETACHAMD) wglGetProcAddress ("wglResourceDetachAMD"); - wglGetContextGPUInfoAMD = (PFNWGLGETCONTEXTGPUINFOAMD) wglGetProcAddress ("wglGetContextGPUInfoAMD"); - - if (fakeRC) { - wglMakeCurrent(NULL, NULL); - wglDeleteContext(fakeRC); - } + if (!wglGetCurrentContext()) { + fakeRC = wglCreateContext((HDC)GLdeviceContext); + wglMakeCurrent((HDC)GLdeviceContext, fakeRC); } - if (!wglBeginCLInteropAMD || !wglEndCLInteropAMD || !wglResourceAttachAMD || - !wglResourceDetachAMD || !wglGetContextGPUInfoAMD) { - return false; + + wglBeginCLInteropAMD = + (PFNWGLBEGINCLINTEROPAMD)wglGetProcAddress("wglBeginCLInteroperabilityAMD"); + wglEndCLInteropAMD = (PFNWGLENDCLINTEROPAMD)wglGetProcAddress("wglEndCLInteroperabilityAMD"); + wglResourceAttachAMD = (PFNWGLRESOURCEATTACHAMD)wglGetProcAddress("wglResourceAttachAMD"); + wglResourceAcquireAMD = (PFNWGLRESOURCEDETACHAMD)wglGetProcAddress("wglResourceAcquireAMD"); + wglResourceReleaseAMD = (PFNWGLRESOURCEDETACHAMD)wglGetProcAddress("wglResourceReleaseAMD"); + wglResourceDetachAMD = (PFNWGLRESOURCEDETACHAMD)wglGetProcAddress("wglResourceDetachAMD"); + wglGetContextGPUInfoAMD = + (PFNWGLGETCONTEXTGPUINFOAMD)wglGetProcAddress("wglGetContextGPUInfoAMD"); + + if (fakeRC) { + wglMakeCurrent(NULL, NULL); + wglDeleteContext(fakeRC); } + } + if (!wglBeginCLInteropAMD || !wglEndCLInteropAMD || !wglResourceAttachAMD || + !wglResourceDetachAMD || !wglGetContextGPUInfoAMD) { + return false; + } #endif + return true; +} + +bool Device::glCanInterop(void* GLplatformContext, void* GLdeviceContext) const { + bool canInteroperate = false; + +#ifdef 
ATI_OS_WIN + LUID glAdapterLuid = {0, 0}; + UINT glChainBitMask = 0; + HGLRC hRC = (HGLRC)GLplatformContext; + + // get GL context's LUID and chainBitMask from UGL + if (wglGetContextGPUInfoAMD(hRC, &glAdapterLuid, &glChainBitMask)) { + // match the adapter + canInteroperate = (properties().osProperties.luidHighPart == glAdapterLuid.HighPart) && + (properties().osProperties.luidLowPart == glAdapterLuid.LowPart) && + ((1 << properties().gpuIndex) == glChainBitMask); + } +#else + canInteroperate = true; +#endif + return canInteroperate; +} + +bool Device::glAssociate(void* GLplatformContext, void* GLdeviceContext) const { + // initialize pointers to the gl extension that supports interoperability + if (!initGLInteropPrivateExt(GLplatformContext, GLdeviceContext) || + !glCanInterop(GLplatformContext, GLdeviceContext)) { + return false; + } + + int flags = 0; +/* + if (m_adp->pAsicInfo->svmFineGrainSystem) + { + flags = GL_INTEROP_SVM; + } +*/ +#ifdef ATI_OS_LINUX + GLXContext ctx = (GLXContext)GLplatformContext; + return (glXBeginCLInteropAMD(ctx, 0)) ? true : false; +#else + HGLRC hRC = (HGLRC)GLplatformContext; + return (wglBeginCLInteropAMD(hRC, flags)) ? true : false; +#endif +} + +bool Device::glDissociate(void* GLplatformContext, void* GLdeviceContext) const { + int flags = 0; +/* + if (m_adp->pAsicInfo->svmFineGrainSystem) + { + flags = GL_INTEROP_SVM; + } +*/ +#ifdef ATI_OS_LINUX + GLXContext ctx = (GLXContext)GLplatformContext; + return (glXEndCLInteropAMD(ctx, 0)) ? true : false; +#else + HGLRC hRC = (HGLRC)GLplatformContext; + return (wglEndCLInteropAMD(hRC, flags)) ? 
true : false; +#endif +} + +bool Device::resGLAssociate(void* GLContext, uint name, uint type, void** handle, + void** mbResHandle, size_t* offset +#ifdef ATI_OS_WIN + , + Pal::DoppDesktopInfo& doppDesktopInfo +#endif + ) const { + amd::ScopedLock lk(lockPAL()); + + GLResource hRes = {}; + GLResourceData hData = {}; + + bool status = false; + + hRes.type = type; + hRes.name = name; + + hData.version = GL_RESOURCE_DATA_VERSION; +#ifdef ATI_OS_LINUX + GLXContext ctx = (GLXContext)GLContext; + if (glXResourceAttachAMD(ctx, &hRes, &hData)) { + // attribs.dynamicSharedBufferID = hData->sharedBufferID; + status = true; + } +#else + HGLRC hRC = (HGLRC)GLContext; + if (wglResourceAttachAMD(hRC, &hRes, &hData)) { + status = true; + } +#endif + + if (!status) { + return false; + } + + *handle = reinterpret_cast(hData.handle); + *mbResHandle = reinterpret_cast(hData.mbResHandle); + *offset = static_cast(hData.offset); +#ifdef ATI_OS_WIN + if (hData.isDoppDesktopTexture) { + doppDesktopInfo.gpuVirtAddr = hData.cardAddr; + doppDesktopInfo.vidPnSourceId = hData.vidpnSourceId; + } else { + doppDesktopInfo.gpuVirtAddr = 0; + doppDesktopInfo.vidPnSourceId = 0; + } +#endif + + return status; +} + +bool Device::resGLAcquire(void* GLplatformContext, void* mbResHandle, uint type) const { + amd::ScopedLock lk(lockPAL()); + + GLResource hRes = {}; + hRes.mbResHandle = (GLuintp)mbResHandle; + hRes.type = type; + +#ifdef ATI_OS_LINUX + GLXContext ctx = (GLXContext)GLplatformContext; + return (glxResourceAcquireAMD(ctx, &hRes)) ? true : false; +#else + HGLRC hRC = wglGetCurrentContext(); + //! 
@todo A temporary workaround for MT issue in conformance fence_sync + if (0 == hRC) { return true; -} - -bool -Device::glCanInterop(void* GLplatformContext, void* GLdeviceContext) const -{ - bool canInteroperate = false; - -#ifdef ATI_OS_WIN - LUID glAdapterLuid = {0, 0}; - UINT glChainBitMask = 0; - HGLRC hRC = (HGLRC)GLplatformContext; - - //get GL context's LUID and chainBitMask from UGL - if (wglGetContextGPUInfoAMD(hRC, &glAdapterLuid, &glChainBitMask)) { - // match the adapter - canInteroperate = - (properties().osProperties.luidHighPart == glAdapterLuid.HighPart) && - (properties().osProperties.luidLowPart == glAdapterLuid.LowPart) && - ((1 << properties().gpuIndex) == glChainBitMask); - } -#else - canInteroperate = true; + } + return (wglResourceAcquireAMD(hRC, &hRes)) ? true : false; #endif - return canInteroperate; } -bool -Device::glAssociate(void* GLplatformContext, void* GLdeviceContext) const -{ - //initialize pointers to the gl extension that supports interoperability - if (!initGLInteropPrivateExt(GLplatformContext, GLdeviceContext) || - !glCanInterop(GLplatformContext, GLdeviceContext)) { - return false; - } +bool Device::resGLRelease(void* GLplatformContext, void* mbResHandle, uint type) const { + amd::ScopedLock lk(lockPAL()); - int flags = 0; -/* - if (m_adp->pAsicInfo->svmFineGrainSystem) - { - flags = GL_INTEROP_SVM; - } -*/ + GLResource hRes = {}; + hRes.mbResHandle = (GLuintp)mbResHandle; + hRes.type = type; #ifdef ATI_OS_LINUX - GLXContext ctx = (GLXContext)GLplatformContext; - return (glXBeginCLInteropAMD(ctx, 0)) ? true : false; + // TODO : make sure the application GL context is current. if not no + // point calling into the GL RT. + GLXContext ctx = (GLXContext)GLplatformContext; + return (glxResourceReleaseAMD(ctx, &hRes)) ? true : false; #else - HGLRC hRC = (HGLRC)GLplatformContext; - return (wglBeginCLInteropAMD(hRC, flags)) ? 
true : false; + // Make the call into the GL driver only if the application GL context is current + HGLRC hRC = wglGetCurrentContext(); + //! @todo A temporary workaround for MT issue in conformance fence_sync + if (0 == hRC) { + return true; + } + return (wglResourceReleaseAMD(hRC, &hRes)) ? true : false; #endif } -bool -Device::glDissociate(void* GLplatformContext, void* GLdeviceContext) const -{ - int flags = 0; -/* - if (m_adp->pAsicInfo->svmFineGrainSystem) - { - flags = GL_INTEROP_SVM; - } -*/ +bool Device::resGLFree(void* GLplatformContext, void* mbResHandle, uint type) const { + amd::ScopedLock lk(lockPAL()); + + GLResource hRes = {}; + hRes.mbResHandle = (GLuintp)mbResHandle; + hRes.type = type; #ifdef ATI_OS_LINUX - GLXContext ctx = (GLXContext)GLplatformContext; - return (glXEndCLInteropAMD(ctx, 0)) ? true : false; + GLXContext ctx = (GLXContext)GLplatformContext; + return (glXResourceDetachAMD(ctx, &hRes)) ? true : false; #else - HGLRC hRC = (HGLRC)GLplatformContext; - return (wglEndCLInteropAMD(hRC, flags)) ? true : false; + HGLRC hRC = (HGLRC)GLplatformContext; + return (wglResourceDetachAMD(hRC, &hRes)) ? 
true : false; #endif } -bool -Device::resGLAssociate( - void* GLContext, - uint name, - uint type, - void** handle, - void** mbResHandle, - size_t* offset -#ifdef ATI_OS_WIN - , Pal::DoppDesktopInfo& doppDesktopInfo -#endif - ) const -{ - amd::ScopedLock lk(lockPAL()); - - GLResource hRes = {}; - GLResourceData hData = {}; - - bool status = false; - - hRes.type = type; - hRes.name = name; - - hData.version = GL_RESOURCE_DATA_VERSION; -#ifdef ATI_OS_LINUX - GLXContext ctx = (GLXContext)GLContext; - if (glXResourceAttachAMD(ctx, &hRes, &hData)) { - //attribs.dynamicSharedBufferID = hData->sharedBufferID; - status = true; - } -#else - HGLRC hRC = (HGLRC)GLContext; - if (wglResourceAttachAMD(hRC, &hRes, &hData)) { - status = true; - } -#endif - - if (!status) { - return false; - } - - *handle = reinterpret_cast(hData.handle); - *mbResHandle = reinterpret_cast(hData.mbResHandle); - *offset = static_cast(hData.offset); -#ifdef ATI_OS_WIN - if (hData.isDoppDesktopTexture) { - doppDesktopInfo.gpuVirtAddr = hData.cardAddr; - doppDesktopInfo.vidPnSourceId = hData.vidpnSourceId; - } - else { - doppDesktopInfo.gpuVirtAddr = 0; - doppDesktopInfo.vidPnSourceId = 0; - } -#endif - - return status; -} - -bool -Device::resGLAcquire(void* GLplatformContext, void* mbResHandle, uint type) const -{ - amd::ScopedLock lk(lockPAL()); - - GLResource hRes = {}; - hRes.mbResHandle = (GLuintp)mbResHandle; - hRes.type = type; - -#ifdef ATI_OS_LINUX - GLXContext ctx = (GLXContext) GLplatformContext; - return (glxResourceAcquireAMD(ctx, &hRes)) ? true : false; -#else - HGLRC hRC = wglGetCurrentContext(); - //! @todo A temporary workaround for MT issue in conformance fence_sync - if (0 == hRC) { - return true; - } - return (wglResourceAcquireAMD(hRC, &hRes)) ? 
true : false; -#endif -} - -bool -Device::resGLRelease(void* GLplatformContext, void* mbResHandle, uint type) const -{ - amd::ScopedLock lk(lockPAL()); - - GLResource hRes = {}; - hRes.mbResHandle = (GLuintp)mbResHandle; - hRes.type = type; -#ifdef ATI_OS_LINUX - //TODO : make sure the application GL context is current. if not no - // point calling into the GL RT. - GLXContext ctx = (GLXContext) GLplatformContext; - return (glxResourceReleaseAMD(ctx, &hRes)) ? true : false; -#else - // Make the call into the GL driver only if the application GL context is current - HGLRC hRC = wglGetCurrentContext(); - //! @todo A temporary workaround for MT issue in conformance fence_sync - if (0 == hRC) { - return true; - } - return (wglResourceReleaseAMD(hRC, &hRes)) ? true : false; -#endif -} - -bool -Device::resGLFree(void* GLplatformContext, void* mbResHandle, uint type) const -{ - amd::ScopedLock lk(lockPAL()); - - GLResource hRes = {}; - hRes.mbResHandle = (GLuintp)mbResHandle; - hRes.type = type; -#ifdef ATI_OS_LINUX - GLXContext ctx = (GLXContext)GLplatformContext; - return (glXResourceDetachAMD(ctx, &hRes)) ? true : false; -#else - HGLRC hRC = (HGLRC)GLplatformContext; - return (wglResourceDetachAMD(hRC, &hRes)) ? 
true : false; -#endif -} - -} // pal +} // pal diff --git a/rocclr/runtime/device/pal/palkernel.cpp b/rocclr/runtime/device/pal/palkernel.cpp index 0fa8a6e636..a7a201e5c1 100644 --- a/rocclr/runtime/device/pal/palkernel.cpp +++ b/rocclr/runtime/device/pal/palkernel.cpp @@ -21,1258 +21,1167 @@ namespace pal { -inline static HSAIL_ARG_TYPE -GetHSAILArgType(const aclArgData* argInfo) -{ - if (argInfo->argStr[0] == '_' && argInfo->argStr[1] == '.') { - if (strcmp(&argInfo->argStr[2], "global_offset_0") == 0) { - return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X; - } - else if (strcmp(&argInfo->argStr[2], "global_offset_1") == 0) { - return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y; - } - else if (strcmp(&argInfo->argStr[2], "global_offset_2") == 0) { - return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z; - } - else if (strcmp(&argInfo->argStr[2], "printf_buffer") == 0) { - return HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER; - } - else if (strcmp(&argInfo->argStr[2], "vqueue_pointer") == 0) { - return HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE; - } - else if (strcmp(&argInfo->argStr[2], "aqlwrap_pointer") == 0) { - return HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION; - } - return HSAIL_ARGTYPE_HIDDEN_NONE; +inline static HSAIL_ARG_TYPE GetHSAILArgType(const aclArgData* argInfo) { + if (argInfo->argStr[0] == '_' && argInfo->argStr[1] == '.') { + if (strcmp(&argInfo->argStr[2], "global_offset_0") == 0) { + return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X; + } else if (strcmp(&argInfo->argStr[2], "global_offset_1") == 0) { + return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y; + } else if (strcmp(&argInfo->argStr[2], "global_offset_2") == 0) { + return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z; + } else if (strcmp(&argInfo->argStr[2], "printf_buffer") == 0) { + return HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER; + } else if (strcmp(&argInfo->argStr[2], "vqueue_pointer") == 0) { + return HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE; + } else if (strcmp(&argInfo->argStr[2], "aqlwrap_pointer") == 0) { + return HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION; } + 
return HSAIL_ARGTYPE_HIDDEN_NONE; + } - switch (argInfo->type) { - case ARG_TYPE_POINTER: - return HSAIL_ARGTYPE_POINTER; - case ARG_TYPE_QUEUE: - return HSAIL_ARGTYPE_QUEUE; - case ARG_TYPE_VALUE: - return (argInfo->arg.value.data == DATATYPE_struct) - ? HSAIL_ARGTYPE_REFERENCE : HSAIL_ARGTYPE_VALUE; - case ARG_TYPE_IMAGE: - return HSAIL_ARGTYPE_IMAGE; - case ARG_TYPE_SAMPLER: - return HSAIL_ARGTYPE_SAMPLER; - case ARG_TYPE_ERROR: - default: - return HSAIL_ARGTYPE_ERROR; - } + switch (argInfo->type) { + case ARG_TYPE_POINTER: + return HSAIL_ARGTYPE_POINTER; + case ARG_TYPE_QUEUE: + return HSAIL_ARGTYPE_QUEUE; + case ARG_TYPE_VALUE: + return (argInfo->arg.value.data == DATATYPE_struct) ? HSAIL_ARGTYPE_REFERENCE + : HSAIL_ARGTYPE_VALUE; + case ARG_TYPE_IMAGE: + return HSAIL_ARGTYPE_IMAGE; + case ARG_TYPE_SAMPLER: + return HSAIL_ARGTYPE_SAMPLER; + case ARG_TYPE_ERROR: + default: + return HSAIL_ARGTYPE_ERROR; + } } -inline static size_t -GetHSAILArgAlignment(const aclArgData* argInfo) -{ - switch (argInfo->type) { +inline static size_t GetHSAILArgAlignment(const aclArgData* argInfo) { + switch (argInfo->type) { case ARG_TYPE_POINTER: - return sizeof(void*); + return sizeof(void*); case ARG_TYPE_VALUE: - switch (argInfo->arg.value.data) { + switch (argInfo->arg.value.data) { case DATATYPE_i8: case DATATYPE_u8: - return 1; + return 1; case DATATYPE_u16: case DATATYPE_i16: case DATATYPE_f16: - return 2; + return 2; case DATATYPE_u32: case DATATYPE_i32: case DATATYPE_f32: - return 4; + return 4; case DATATYPE_i64: case DATATYPE_u64: case DATATYPE_f64: - return 8; + return 8; case DATATYPE_struct: - return 128; + return 128; case DATATYPE_ERROR: default: - return -1; - } - case ARG_TYPE_IMAGE: return sizeof(cl_mem); - case ARG_TYPE_SAMPLER: return sizeof(cl_sampler); - default: return -1; - } + return -1; + } + case ARG_TYPE_IMAGE: + return sizeof(cl_mem); + case ARG_TYPE_SAMPLER: + return sizeof(cl_sampler); + default: + return -1; + } } -inline static size_t 
-GetHSAILArgPointeeAlignment(const aclArgData* argInfo) -{ - if (argInfo->type == ARG_TYPE_POINTER) { - return argInfo->arg.pointer.align; - } - return 1; +inline static size_t GetHSAILArgPointeeAlignment(const aclArgData* argInfo) { + if (argInfo->type == ARG_TYPE_POINTER) { + return argInfo->arg.pointer.align; + } + return 1; } -inline static HSAIL_ACCESS_TYPE -GetHSAILArgAccessType(const aclArgData* argInfo) -{ - aclAccessType accessType; +inline static HSAIL_ACCESS_TYPE GetHSAILArgAccessType(const aclArgData* argInfo) { + aclAccessType accessType; - if (argInfo->type == ARG_TYPE_POINTER) { - accessType = argInfo->arg.pointer.type; - } - else if (argInfo->type == ARG_TYPE_IMAGE) { - accessType = argInfo->arg.image.type; - } - else { - return HSAIL_ACCESS_TYPE_NONE; - } - if (accessType == ACCESS_TYPE_RO) { - return HSAIL_ACCESS_TYPE_RO; - } - else if (accessType == ACCESS_TYPE_WO) { - return HSAIL_ACCESS_TYPE_WO; - } + if (argInfo->type == ARG_TYPE_POINTER) { + accessType = argInfo->arg.pointer.type; + } else if (argInfo->type == ARG_TYPE_IMAGE) { + accessType = argInfo->arg.image.type; + } else { + return HSAIL_ACCESS_TYPE_NONE; + } + if (accessType == ACCESS_TYPE_RO) { + return HSAIL_ACCESS_TYPE_RO; + } else if (accessType == ACCESS_TYPE_WO) { + return HSAIL_ACCESS_TYPE_WO; + } - return HSAIL_ACCESS_TYPE_RW; + return HSAIL_ACCESS_TYPE_RW; } -inline static HSAIL_ADDRESS_QUALIFIER -GetHSAILAddrQual(const aclArgData* argInfo) -{ - if (argInfo->type == ARG_TYPE_POINTER) { - switch (argInfo->arg.pointer.memory) { - case PTR_MT_UAV_CONSTANT: - case PTR_MT_CONSTANT_EMU: - case PTR_MT_CONSTANT: - return HSAIL_ADDRESS_CONSTANT; - case PTR_MT_UAV: - case PTR_MT_GLOBAL: - return HSAIL_ADDRESS_GLOBAL; - case PTR_MT_LDS_EMU: - case PTR_MT_LDS: - return HSAIL_ADDRESS_LOCAL; - case PTR_MT_SCRATCH_EMU: - return HSAIL_ADDRESS_GLOBAL; - case PTR_MT_ERROR: - default: - LogError("Unsupported address type"); - return HSAIL_ADDRESS_ERROR; - } - } - else if ((argInfo->type == 
ARG_TYPE_IMAGE) || - (argInfo->type == ARG_TYPE_SAMPLER)) { +inline static HSAIL_ADDRESS_QUALIFIER GetHSAILAddrQual(const aclArgData* argInfo) { + if (argInfo->type == ARG_TYPE_POINTER) { + switch (argInfo->arg.pointer.memory) { + case PTR_MT_UAV_CONSTANT: + case PTR_MT_CONSTANT_EMU: + case PTR_MT_CONSTANT: + return HSAIL_ADDRESS_CONSTANT; + case PTR_MT_UAV: + case PTR_MT_GLOBAL: return HSAIL_ADDRESS_GLOBAL; - } - else if (argInfo->type == ARG_TYPE_QUEUE) { + case PTR_MT_LDS_EMU: + case PTR_MT_LDS: + return HSAIL_ADDRESS_LOCAL; + case PTR_MT_SCRATCH_EMU: return HSAIL_ADDRESS_GLOBAL; + case PTR_MT_ERROR: + default: + LogError("Unsupported address type"); + return HSAIL_ADDRESS_ERROR; } - return HSAIL_ADDRESS_ERROR; + } else if ((argInfo->type == ARG_TYPE_IMAGE) || (argInfo->type == ARG_TYPE_SAMPLER)) { + return HSAIL_ADDRESS_GLOBAL; + } else if (argInfo->type == ARG_TYPE_QUEUE) { + return HSAIL_ADDRESS_GLOBAL; + } + return HSAIL_ADDRESS_ERROR; } /* f16 returns f32 - workaround due to comp lib */ -inline static HSAIL_DATA_TYPE -GetHSAILDataType(const aclArgData* argInfo) -{ - aclArgDataType dataType; +inline static HSAIL_DATA_TYPE GetHSAILDataType(const aclArgData* argInfo) { + aclArgDataType dataType; - if (argInfo->type == ARG_TYPE_POINTER) { - dataType = argInfo->arg.pointer.data; - } - else if (argInfo->type == ARG_TYPE_VALUE) { - dataType = argInfo->arg.value.data; - } - else { - return HSAIL_DATATYPE_ERROR; - } - switch (dataType) { - case DATATYPE_i1: - return HSAIL_DATATYPE_B1; + if (argInfo->type == ARG_TYPE_POINTER) { + dataType = argInfo->arg.pointer.data; + } else if (argInfo->type == ARG_TYPE_VALUE) { + dataType = argInfo->arg.value.data; + } else { + return HSAIL_DATATYPE_ERROR; + } + switch (dataType) { + case DATATYPE_i1: + return HSAIL_DATATYPE_B1; + case DATATYPE_i8: + return HSAIL_DATATYPE_S8; + case DATATYPE_i16: + return HSAIL_DATATYPE_S16; + case DATATYPE_i32: + return HSAIL_DATATYPE_S32; + case DATATYPE_i64: + return HSAIL_DATATYPE_S64; + case 
DATATYPE_u8: + return HSAIL_DATATYPE_U8; + case DATATYPE_u16: + return HSAIL_DATATYPE_U16; + case DATATYPE_u32: + return HSAIL_DATATYPE_U32; + case DATATYPE_u64: + return HSAIL_DATATYPE_U64; + case DATATYPE_f16: + return HSAIL_DATATYPE_F32; + case DATATYPE_f32: + return HSAIL_DATATYPE_F32; + case DATATYPE_f64: + return HSAIL_DATATYPE_F64; + case DATATYPE_struct: + return HSAIL_DATATYPE_STRUCT; + case DATATYPE_opaque: + return HSAIL_DATATYPE_OPAQUE; + case DATATYPE_ERROR: + default: + return HSAIL_DATATYPE_ERROR; + } +} + +inline static int GetHSAILArgSize(const aclArgData* argInfo) { + switch (argInfo->type) { + case ARG_TYPE_POINTER: + return sizeof(void*); + case ARG_TYPE_VALUE: + switch (argInfo->arg.value.data) { case DATATYPE_i8: - return HSAIL_DATATYPE_S8; - case DATATYPE_i16: - return HSAIL_DATATYPE_S16; - case DATATYPE_i32: - return HSAIL_DATATYPE_S32; - case DATATYPE_i64: - return HSAIL_DATATYPE_S64; case DATATYPE_u8: - return HSAIL_DATATYPE_U8; - case DATATYPE_u16: - return HSAIL_DATATYPE_U16; - case DATATYPE_u32: - return HSAIL_DATATYPE_U32; - case DATATYPE_u64: - return HSAIL_DATATYPE_U64; - case DATATYPE_f16: - return HSAIL_DATATYPE_F32; - case DATATYPE_f32: - return HSAIL_DATATYPE_F32; - case DATATYPE_f64: - return HSAIL_DATATYPE_F64; case DATATYPE_struct: - return HSAIL_DATATYPE_STRUCT; - case DATATYPE_opaque: - return HSAIL_DATATYPE_OPAQUE; + return 1 * argInfo->arg.value.numElements; + case DATATYPE_u16: + case DATATYPE_i16: + case DATATYPE_f16: + return 2 * argInfo->arg.value.numElements; + case DATATYPE_u32: + case DATATYPE_i32: + case DATATYPE_f32: + return 4 * argInfo->arg.value.numElements; + case DATATYPE_i64: + case DATATYPE_u64: + case DATATYPE_f64: + return 8 * argInfo->arg.value.numElements; case DATATYPE_ERROR: default: - return HSAIL_DATATYPE_ERROR; - } + return -1; + } + case ARG_TYPE_IMAGE: + case ARG_TYPE_SAMPLER: + case ARG_TYPE_QUEUE: + return sizeof(void*); + default: + return -1; + } } -inline static int -GetHSAILArgSize(const 
aclArgData *argInfo) -{ - switch (argInfo->type) { - case ARG_TYPE_POINTER: return sizeof(void *); - case ARG_TYPE_VALUE: - switch (argInfo->arg.value.data) { - case DATATYPE_i8: - case DATATYPE_u8: - case DATATYPE_struct: - return 1 * argInfo->arg.value.numElements; - case DATATYPE_u16: - case DATATYPE_i16: - case DATATYPE_f16: - return 2 * argInfo->arg.value.numElements; - case DATATYPE_u32: - case DATATYPE_i32: - case DATATYPE_f32: - return 4 * argInfo->arg.value.numElements; - case DATATYPE_i64: - case DATATYPE_u64: - case DATATYPE_f64: - return 8 * argInfo->arg.value.numElements; - case DATATYPE_ERROR: - default: return -1; - } - case ARG_TYPE_IMAGE: - case ARG_TYPE_SAMPLER: - case ARG_TYPE_QUEUE: - return sizeof(void*); - default: - return -1; - } -} +inline static clk_value_type_t GetOclType(const HSAILKernel::Argument* arg) { + static const clk_value_type_t ClkValueMapType[6][6] = { + {T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16}, + {T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16}, + {T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16}, + {T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16}, + {T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16}, + {T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16}, + }; -inline static clk_value_type_t -GetOclType(const HSAILKernel::Argument* arg) -{ - static const clk_value_type_t ClkValueMapType[6][6] = { - { T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16 }, - { T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16 }, - { T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16 }, - { T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16 }, - { T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16 }, - { T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16 }, - }; - - uint sizeType; - uint numElements; - if (arg->type_ == HSAIL_ARGTYPE_QUEUE) { - return T_QUEUE; - } - else if (arg->type_ == HSAIL_ARGTYPE_POINTER || arg->type_ == HSAIL_ARGTYPE_IMAGE) { - return 
T_POINTER; - } - else if (arg->type_ == HSAIL_ARGTYPE_VALUE - || arg->type_ == HSAIL_ARGTYPE_REFERENCE) { - switch (arg->dataType_) { - case HSAIL_DATATYPE_S8: - case HSAIL_DATATYPE_U8: - sizeType = 0; - numElements = arg->size_; - break; - case HSAIL_DATATYPE_S16: - case HSAIL_DATATYPE_U16: - sizeType = 1; - numElements = arg->size_ / 2; - break; - case HSAIL_DATATYPE_S32: - case HSAIL_DATATYPE_U32: - sizeType = 2; - numElements = arg->size_ / 4; - break; - case HSAIL_DATATYPE_S64: - case HSAIL_DATATYPE_U64: - sizeType = 3; - numElements = arg->size_ / 8; - break; - case HSAIL_DATATYPE_F16: - sizeType = 4; - numElements = arg->size_ / 2; - break; - case HSAIL_DATATYPE_F32: - sizeType = 4; - numElements = arg->size_ / 4; - break; - case HSAIL_DATATYPE_F64: - sizeType = 5; - numElements = arg->size_ / 8; - break; - default: - return T_VOID; - } - - switch (numElements) { - case 1: return ClkValueMapType[sizeType][0]; - case 2: return ClkValueMapType[sizeType][1]; - case 3: return ClkValueMapType[sizeType][2]; - case 4: return ClkValueMapType[sizeType][3]; - case 8: return ClkValueMapType[sizeType][4]; - case 16: return ClkValueMapType[sizeType][5]; - default: return T_VOID; - } - } - else if (arg->type_ == HSAIL_ARGTYPE_SAMPLER) { - return T_SAMPLER; - } - else { + uint sizeType; + uint numElements; + if (arg->type_ == HSAIL_ARGTYPE_QUEUE) { + return T_QUEUE; + } else if (arg->type_ == HSAIL_ARGTYPE_POINTER || arg->type_ == HSAIL_ARGTYPE_IMAGE) { + return T_POINTER; + } else if (arg->type_ == HSAIL_ARGTYPE_VALUE || arg->type_ == HSAIL_ARGTYPE_REFERENCE) { + switch (arg->dataType_) { + case HSAIL_DATATYPE_S8: + case HSAIL_DATATYPE_U8: + sizeType = 0; + numElements = arg->size_; + break; + case HSAIL_DATATYPE_S16: + case HSAIL_DATATYPE_U16: + sizeType = 1; + numElements = arg->size_ / 2; + break; + case HSAIL_DATATYPE_S32: + case HSAIL_DATATYPE_U32: + sizeType = 2; + numElements = arg->size_ / 4; + break; + case HSAIL_DATATYPE_S64: + case HSAIL_DATATYPE_U64: + 
sizeType = 3; + numElements = arg->size_ / 8; + break; + case HSAIL_DATATYPE_F16: + sizeType = 4; + numElements = arg->size_ / 2; + break; + case HSAIL_DATATYPE_F32: + sizeType = 4; + numElements = arg->size_ / 4; + break; + case HSAIL_DATATYPE_F64: + sizeType = 5; + numElements = arg->size_ / 8; + break; + default: return T_VOID; } + + switch (numElements) { + case 1: + return ClkValueMapType[sizeType][0]; + case 2: + return ClkValueMapType[sizeType][1]; + case 3: + return ClkValueMapType[sizeType][2]; + case 4: + return ClkValueMapType[sizeType][3]; + case 8: + return ClkValueMapType[sizeType][4]; + case 16: + return ClkValueMapType[sizeType][5]; + default: + return T_VOID; + } + } else if (arg->type_ == HSAIL_ARGTYPE_SAMPLER) { + return T_SAMPLER; + } else { + return T_VOID; + } } -inline static cl_kernel_arg_address_qualifier -GetOclAddrQual(const HSAILKernel::Argument* arg) -{ - if (arg->type_ == HSAIL_ARGTYPE_POINTER) { - switch (arg->addrQual_) { - case HSAIL_ADDRESS_GLOBAL: - return CL_KERNEL_ARG_ADDRESS_GLOBAL; - case HSAIL_ADDRESS_CONSTANT: - return CL_KERNEL_ARG_ADDRESS_CONSTANT; - case HSAIL_ADDRESS_LOCAL: - return CL_KERNEL_ARG_ADDRESS_LOCAL; - default: - return CL_KERNEL_ARG_ADDRESS_PRIVATE; - } - } - else if (arg->type_ == HSAIL_ARGTYPE_IMAGE) { +inline static cl_kernel_arg_address_qualifier GetOclAddrQual(const HSAILKernel::Argument* arg) { + if (arg->type_ == HSAIL_ARGTYPE_POINTER) { + switch (arg->addrQual_) { + case HSAIL_ADDRESS_GLOBAL: return CL_KERNEL_ARG_ADDRESS_GLOBAL; + case HSAIL_ADDRESS_CONSTANT: + return CL_KERNEL_ARG_ADDRESS_CONSTANT; + case HSAIL_ADDRESS_LOCAL: + return CL_KERNEL_ARG_ADDRESS_LOCAL; + default: + return CL_KERNEL_ARG_ADDRESS_PRIVATE; } - //default for all other cases - return CL_KERNEL_ARG_ADDRESS_PRIVATE; + } else if (arg->type_ == HSAIL_ARGTYPE_IMAGE) { + return CL_KERNEL_ARG_ADDRESS_GLOBAL; + } + // default for all other cases + return CL_KERNEL_ARG_ADDRESS_PRIVATE; } -inline static cl_kernel_arg_access_qualifier 
-GetOclAccessQual(const HSAILKernel::Argument* arg) -{ - if (arg->type_ == HSAIL_ARGTYPE_IMAGE) { - switch (arg->access_) { - case HSAIL_ACCESS_TYPE_RO: - return CL_KERNEL_ARG_ACCESS_READ_ONLY; - case HSAIL_ACCESS_TYPE_WO: - return CL_KERNEL_ARG_ACCESS_WRITE_ONLY; - case HSAIL_ACCESS_TYPE_RW: - return CL_KERNEL_ARG_ACCESS_READ_WRITE; - default: - return CL_KERNEL_ARG_ACCESS_NONE; - } +inline static cl_kernel_arg_access_qualifier GetOclAccessQual(const HSAILKernel::Argument* arg) { + if (arg->type_ == HSAIL_ARGTYPE_IMAGE) { + switch (arg->access_) { + case HSAIL_ACCESS_TYPE_RO: + return CL_KERNEL_ARG_ACCESS_READ_ONLY; + case HSAIL_ACCESS_TYPE_WO: + return CL_KERNEL_ARG_ACCESS_WRITE_ONLY; + case HSAIL_ACCESS_TYPE_RW: + return CL_KERNEL_ARG_ACCESS_READ_WRITE; + default: + return CL_KERNEL_ARG_ACCESS_NONE; } - return CL_KERNEL_ARG_ACCESS_NONE; + } + return CL_KERNEL_ARG_ACCESS_NONE; } -inline static cl_kernel_arg_type_qualifier -GetOclTypeQual(const aclArgData* argInfo) -{ - cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE; - if (argInfo->type == ARG_TYPE_POINTER) { - if (argInfo->arg.pointer.isVolatile) { - rv |= CL_KERNEL_ARG_TYPE_VOLATILE; - } - if (argInfo->arg.pointer.isRestrict) { - rv |= CL_KERNEL_ARG_TYPE_RESTRICT; - } - if (argInfo->arg.pointer.isPipe) { - rv |= CL_KERNEL_ARG_TYPE_PIPE; - } - if (argInfo->isConst) { - rv |= CL_KERNEL_ARG_TYPE_CONST; - } - switch (argInfo->arg.pointer.memory) { - case PTR_MT_CONSTANT: - case PTR_MT_UAV_CONSTANT: - case PTR_MT_CONSTANT_EMU: - rv |= CL_KERNEL_ARG_TYPE_CONST; - break; - default: - break; - } +inline static cl_kernel_arg_type_qualifier GetOclTypeQual(const aclArgData* argInfo) { + cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE; + if (argInfo->type == ARG_TYPE_POINTER) { + if (argInfo->arg.pointer.isVolatile) { + rv |= CL_KERNEL_ARG_TYPE_VOLATILE; } - return rv; + if (argInfo->arg.pointer.isRestrict) { + rv |= CL_KERNEL_ARG_TYPE_RESTRICT; + } + if (argInfo->arg.pointer.isPipe) { + rv |= 
CL_KERNEL_ARG_TYPE_PIPE; + } + if (argInfo->isConst) { + rv |= CL_KERNEL_ARG_TYPE_CONST; + } + switch (argInfo->arg.pointer.memory) { + case PTR_MT_CONSTANT: + case PTR_MT_UAV_CONSTANT: + case PTR_MT_CONSTANT_EMU: + rv |= CL_KERNEL_ARG_TYPE_CONST; + break; + default: + break; + } + } + return rv; } -bool -HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol *sym) -{ - if (!sym) { - return false; - } - if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast(&code_))) { - return false; - } +bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) { + if (!sym) { + return false; + } + if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast(&code_))) { + return false; + } - amd_kernel_code_t *akc = reinterpret_cast(prog().findHostKernelAddress(code_)); - cpuAqlCode_ = akc; - if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE, reinterpret_cast(&codeSize_))) { - return false; - } - size_t akc_align = 0; - if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN, reinterpret_cast(&akc_align))) { - return false; - } + amd_kernel_code_t* akc = + reinterpret_cast(prog().findHostKernelAddress(code_)); + cpuAqlCode_ = akc; + if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE, + reinterpret_cast(&codeSize_))) { + return false; + } + size_t akc_align = 0; + if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN, + reinterpret_cast(&akc_align))) { + return false; + } - assert((akc->workitem_private_segment_byte_size & 3) == 0 && - "Scratch must be DWORD aligned"); - workGroupInfo_.scratchRegs_ = - amd::alignUp(akc->workitem_private_segment_byte_size, 16) / sizeof(uint); - workGroupInfo_.privateMemSize_ = akc->workitem_private_segment_byte_size; - workGroupInfo_.localMemSize_ = - workGroupInfo_.usedLDSSize_ = akc->workgroup_group_segment_byte_size; - workGroupInfo_.usedSGPRs_ = akc->wavefront_sgpr_count; - workGroupInfo_.usedStackSize_ = 0; - workGroupInfo_.usedVGPRs_ = 
akc->workitem_vgpr_count; + assert((akc->workitem_private_segment_byte_size & 3) == 0 && "Scratch must be DWORD aligned"); + workGroupInfo_.scratchRegs_ = + amd::alignUp(akc->workitem_private_segment_byte_size, 16) / sizeof(uint); + workGroupInfo_.privateMemSize_ = akc->workitem_private_segment_byte_size; + workGroupInfo_.localMemSize_ = workGroupInfo_.usedLDSSize_ = + akc->workgroup_group_segment_byte_size; + workGroupInfo_.usedSGPRs_ = akc->wavefront_sgpr_count; + workGroupInfo_.usedStackSize_ = 0; + workGroupInfo_.usedVGPRs_ = akc->workitem_vgpr_count; - if (!prog().isNull()) { - workGroupInfo_.availableLDSSize_ = dev().properties().gfxipProperties.shaderCore.ldsSizePerCu; - workGroupInfo_.availableSGPRs_ = dev().properties().gfxipProperties.shaderCore.numAvailableSgprs; - workGroupInfo_.availableVGPRs_ = dev().properties().gfxipProperties.shaderCore.numAvailableVgprs; - workGroupInfo_.preferredSizeMultiple_ = - workGroupInfo_.wavefrontPerSIMD_ = dev().properties().gfxipProperties.shaderCore.wavefrontSize; - } - else { - workGroupInfo_.availableLDSSize_ = 64 * Ki; - workGroupInfo_.availableSGPRs_ = 104; - workGroupInfo_.availableVGPRs_ = 256; - workGroupInfo_.preferredSizeMultiple_ = - workGroupInfo_.wavefrontPerSIMD_ = 64; - } - return true; -} - -void -HSAILKernel::initArgList(const aclArgData* aclArg) -{ - // Initialize the hsail argument list too - initHsailArgs(aclArg); - - // Iterate through the arguments and insert into parameterList - device::Kernel::parameters_t params; - amd::KernelParameterDescriptor desc; - size_t offset = 0; - - for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) { - // skip the hidden arguments - if (arguments_[i]->index_ == uint(-1)) continue; - - desc.name_ = arguments_[i]->name_.c_str(); - desc.type_ = GetOclType(arguments_[i]); - desc.addressQualifier_ = GetOclAddrQual(arguments_[i]); - desc.accessQualifier_ = GetOclAccessQual(arguments_[i]); - desc.typeQualifier_ = GetOclTypeQual(aclArg); - desc.typeName_ = 
arguments_[i]->typeName_.c_str(); - - // Make a check if it is local or global - if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) { - desc.size_ = 0; - } - else { - desc.size_ = arguments_[i]->size_; - } - - // Make offset alignment to match CPU metadata, since - // in multidevice config abstraction layer has a single signature - // and CPU sends the paramaters as they are allocated in memory - size_t size = desc.size_; - if (size == 0) { - // Local memory for CPU - size = sizeof(cl_mem); - } - offset = amd::alignUp(offset, std::min(size, size_t(16))); - desc.offset_ = offset; - offset += amd::alignUp(size, sizeof(uint32_t)); - params.push_back(desc); - - if (arguments_[i]->type_ == HSAIL_ARGTYPE_IMAGE) { - flags_.imageEna_ = true; - if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) { - flags_.imageWriteEna_ = true; - } - } - } - - createSignature(params); -} - -void -HSAILKernel::initHsailArgs(const aclArgData* aclArg) -{ - // Iterate through the each kernel argument - for (uint index = 0; aclArg->struct_size != 0; aclArg++) { - Argument* arg = new Argument; - - // Initialize HSAIL kernel argument - arg->name_ = aclArg->argStr; - arg->typeName_ = aclArg->typeStr; - arg->size_ = GetHSAILArgSize(aclArg); - arg->type_ = GetHSAILArgType(aclArg); - arg->addrQual_ = GetHSAILAddrQual(aclArg); - arg->dataType_ = GetHSAILDataType(aclArg); - arg->alignment_ = GetHSAILArgAlignment(aclArg); - arg->access_ = GetHSAILArgAccessType(aclArg); - arg->pointeeAlignment_ = GetHSAILArgPointeeAlignment(aclArg); - - bool isHidden = arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X - || arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y - || arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z - || arg->type_ == HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER - || arg->type_ == HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE - || arg->type_ == HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION - || arg->type_ == HSAIL_ARGTYPE_HIDDEN_NONE; - - arg->index_ = isHidden ? 
uint(-1) : index++; - - arguments_.push_back(arg); - } -} - -void -HSAILKernel::initPrintf(const aclPrintfFmt* aclPrintf) -{ - PrintfInfo info; - uint index = 0; - for (; aclPrintf->struct_size != 0; aclPrintf++) { - index = aclPrintf->ID; - if (printf_.size() <= index) { - printf_.resize(index + 1); - } - std::string pfmt = aclPrintf->fmtStr; - info.fmtString_.clear(); - size_t pos = 0; - bool need_nl = true; - for (size_t pos = 0; pos < pfmt.size(); ++pos) { - char symbol = pfmt[pos]; - need_nl = true; - if (symbol == '\\') { - // Rest of the C escape sequences (e.g. \') are handled correctly - // by the MDParser, we are not sure exactly how! - switch (pfmt[pos + 1]) { - case 'a': - pos++; - symbol = '\a'; - break; - case 'b': - pos++; - symbol = '\b'; - break; - case 'f': - pos++; - symbol = '\f'; - break; - case 'n': - pos++; - symbol = '\n'; - need_nl = false; - break; - case 'r': - pos++; - symbol = '\r'; - break; - case 'v': - pos++; - symbol = '\v'; - break; - case '7': - if (pfmt[pos + 2] == '2') { - pos += 2; - symbol = '\72'; - } - break; - default: - break; - } - } - info.fmtString_.push_back(symbol); - } - if (need_nl) { - info.fmtString_ += "\n"; - } - uint32_t *tmp_ptr = const_cast(aclPrintf->argSizes); - for (uint i = 0; i < aclPrintf->numSizes; i++, tmp_ptr++) { - info.arguments_.push_back(*tmp_ptr); - } - printf_[index] = info; - info.arguments_.clear(); - } -} - -HSAILKernel::HSAILKernel(std::string name, - HSAILProgram* prog, - std::string compileOptions) - : device::Kernel(name) - , compileOptions_(compileOptions) - , dev_(prog->dev()) - , prog_(*prog) - , index_(0) - , code_(0) - , codeSize_(0) - , waveLimiter_(this, (prog->isNull() ? 
1 : - dev().properties().gfxipProperties.shaderCore.numCusPerShaderArray) * dev().hwInfo()->simdPerCU_) -{ - hsa_ = true; -} - -HSAILKernel::~HSAILKernel() -{ - while (!arguments_.empty()) { - Argument* arg = arguments_.back(); - delete arg; - arguments_.pop_back(); - } -} - -bool -HSAILKernel::init(amd::hsa::loader::Symbol *sym, bool finalize) -{ -#if defined(WITH_LIGHTNING_COMPILER) - assert(!"Should not reach here"); -#else // !defined(WITH_LIGHTNING_COMPILER) - acl_error error = ACL_SUCCESS; - std::string openClKernelName = openclMangledName(name()); - flags_.internalKernel_ = (compileOptions_.find("-cl-internal-kernel") != - std::string::npos) ? true: false; - //compile kernel down to ISA - if (finalize) { - std::string options(compileOptions_.c_str()); - options.append(" -just-kernel="); - options.append(openClKernelName.c_str()); - // Append an option so that we can selectively enable a SCOption on CZ - // whenever IOMMUv2 is enabled. - if (dev().settings().svmFineGrainSystem_) { - options.append(" -sc-xnack-iommu"); - } - error = aclCompile(dev().compiler(), prog().binaryElf(), - options.c_str(), ACL_TYPE_CG, ACL_TYPE_ISA, nullptr); - buildLog_ += aclGetCompilerLog(dev().compiler()); - if (error != ACL_SUCCESS) { - LogError("Failed to finalize kernel"); - return false; - } - } - - aqlCreateHWInfo(sym); - - // Pull out metadata from the ELF - size_t sizeOfArgList; - error = aclQueryInfo(dev().compiler(), prog().binaryElf(), - RT_ARGUMENT_ARRAY, openClKernelName.c_str(), nullptr, &sizeOfArgList); - if (error != ACL_SUCCESS) { - return false; - } - - char* aclArgList = new char[sizeOfArgList]; - if (nullptr == aclArgList) { - return false; - } - error = aclQueryInfo(dev().compiler(), prog().binaryElf(), - RT_ARGUMENT_ARRAY, openClKernelName.c_str(), aclArgList, &sizeOfArgList); - if (error != ACL_SUCCESS) { - return false; - } - // Set the argList - initArgList(reinterpret_cast(aclArgList)); - delete [] aclArgList; - - size_t sizeOfWorkGroupSize; - error = 
aclQueryInfo(dev().compiler(), prog().binaryElf(), - RT_WORK_GROUP_SIZE, openClKernelName.c_str(), nullptr, &sizeOfWorkGroupSize); - if (error != ACL_SUCCESS) { - return false; - } - error = aclQueryInfo(dev().compiler(), prog().binaryElf(), - RT_WORK_GROUP_SIZE, openClKernelName.c_str(), - workGroupInfo_.compileSize_, &sizeOfWorkGroupSize); - if (error != ACL_SUCCESS) { - return false; - } - - // Copy wavefront size - workGroupInfo_.wavefrontSize_ = prog().isNull() ? 64 : + if (!prog().isNull()) { + workGroupInfo_.availableLDSSize_ = dev().properties().gfxipProperties.shaderCore.ldsSizePerCu; + workGroupInfo_.availableSGPRs_ = + dev().properties().gfxipProperties.shaderCore.numAvailableSgprs; + workGroupInfo_.availableVGPRs_ = + dev().properties().gfxipProperties.shaderCore.numAvailableVgprs; + workGroupInfo_.preferredSizeMultiple_ = workGroupInfo_.wavefrontPerSIMD_ = dev().properties().gfxipProperties.shaderCore.wavefrontSize; - // Find total workgroup size - if (workGroupInfo_.compileSize_[0] != 0) { - workGroupInfo_.size_ = - workGroupInfo_.compileSize_[0] * - workGroupInfo_.compileSize_[1] * - workGroupInfo_.compileSize_[2]; - } - else { - workGroupInfo_.size_ = dev().info().maxWorkGroupSize_; - } - - // Pull out printf metadata from the ELF - size_t sizeOfPrintfList; - error = aclQueryInfo(dev().compiler(), prog().binaryElf(), - RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(), nullptr, &sizeOfPrintfList); - if (error != ACL_SUCCESS) { - return false; - } - - // Make sure kernel has any printf info - if (0 != sizeOfPrintfList) { - char* aclPrintfList = new char[sizeOfPrintfList]; - if (nullptr == aclPrintfList) { - return false; - } - error = aclQueryInfo(dev().compiler(), prog().binaryElf(), - RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(), aclPrintfList, - &sizeOfPrintfList); - if (error != ACL_SUCCESS) { - return false; - } - - // Set the PrintfList - initPrintf(reinterpret_cast(aclPrintfList)); - delete [] aclPrintfList; - } - - aclMetadata md; - 
md.enqueue_kernel = false; - size_t sizeOfDeviceEnqueue = sizeof(md.enqueue_kernel); - error = aclQueryInfo(dev().compiler(), prog().binaryElf(), - RT_DEVICE_ENQUEUE, openClKernelName.c_str(), - &md.enqueue_kernel, &sizeOfDeviceEnqueue); - if (error != ACL_SUCCESS) { - return false; - } - flags_.dynamicParallelism_ = md.enqueue_kernel; - - md.kernel_index = -1; - size_t sizeOfIndex = sizeof(md.kernel_index); - error = aclQueryInfo(dev().compiler(), prog().binaryElf(), - RT_KERNEL_INDEX, openClKernelName.c_str(), - &md.kernel_index, &sizeOfIndex); - if (error != ACL_SUCCESS) { - return false; - } - index_ = md.kernel_index; - - size_t sizeOfWavesPerSimdHint = sizeof(workGroupInfo_.wavesPerSimdHint_); - error = aclQueryInfo(dev().compiler(), prog().binaryElf(), - RT_WAVES_PER_SIMD_HINT, openClKernelName.c_str(), - &workGroupInfo_.wavesPerSimdHint_, &sizeOfWavesPerSimdHint); - if (error != ACL_SUCCESS) { - return false; - } - - waveLimiter_.enable(); - - size_t sizeOfWorkGroupSizeHint = sizeof(workGroupInfo_.compileSizeHint_); - error = aclQueryInfo(dev().compiler(), prog().binaryElf(), - RT_WORK_GROUP_SIZE_HINT, openClKernelName.c_str(), - workGroupInfo_.compileSizeHint_, &sizeOfWorkGroupSizeHint); - if (error != ACL_SUCCESS) { - return false; - } - - size_t sizeOfVecTypeHint; - error = aclQueryInfo(dev().compiler(), prog().binaryElf(), - RT_VEC_TYPE_HINT, openClKernelName.c_str(), - NULL, &sizeOfVecTypeHint); - if (error != ACL_SUCCESS) { - return false; - } - - if (0 != sizeOfVecTypeHint) { - char* VecTypeHint = new char[sizeOfVecTypeHint + 1]; - if (NULL == VecTypeHint) { - return false; - } - error = aclQueryInfo(dev().compiler(), prog().binaryElf(), - RT_VEC_TYPE_HINT, openClKernelName.c_str(), - VecTypeHint, &sizeOfVecTypeHint); - if (error != ACL_SUCCESS) { - return false; - } - VecTypeHint[sizeOfVecTypeHint] = '\0'; - workGroupInfo_.compileVecTypeHint_ = std::string(VecTypeHint); - delete[] VecTypeHint; - } - -#endif // !defined(WITH_LIGHTNING_COMPILER) - 
return true; + } else { + workGroupInfo_.availableLDSSize_ = 64 * Ki; + workGroupInfo_.availableSGPRs_ = 104; + workGroupInfo_.availableVGPRs_ = 256; + workGroupInfo_.preferredSizeMultiple_ = workGroupInfo_.wavefrontPerSIMD_ = 64; + } + return true; } -bool -HSAILKernel::validateMemory(uint idx, amd::Memory* amdMem) const -{ - // Check if memory doesn't require reallocation - bool noRealloc = true; - //amdMem->reallocedDeviceMemory(&dev())); +void HSAILKernel::initArgList(const aclArgData* aclArg) { + // Initialize the hsail argument list too + initHsailArgs(aclArg); - return noRealloc; + // Iterate through the arguments and insert into parameterList + device::Kernel::parameters_t params; + amd::KernelParameterDescriptor desc; + size_t offset = 0; + + for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) { + // skip the hidden arguments + if (arguments_[i]->index_ == uint(-1)) continue; + + desc.name_ = arguments_[i]->name_.c_str(); + desc.type_ = GetOclType(arguments_[i]); + desc.addressQualifier_ = GetOclAddrQual(arguments_[i]); + desc.accessQualifier_ = GetOclAccessQual(arguments_[i]); + desc.typeQualifier_ = GetOclTypeQual(aclArg); + desc.typeName_ = arguments_[i]->typeName_.c_str(); + + // Make a check if it is local or global + if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) { + desc.size_ = 0; + } else { + desc.size_ = arguments_[i]->size_; + } + + // Make offset alignment to match CPU metadata, since + // in multidevice config abstraction layer has a single signature + // and CPU sends the paramaters as they are allocated in memory + size_t size = desc.size_; + if (size == 0) { + // Local memory for CPU + size = sizeof(cl_mem); + } + offset = amd::alignUp(offset, std::min(size, size_t(16))); + desc.offset_ = offset; + offset += amd::alignUp(size, sizeof(uint32_t)); + params.push_back(desc); + + if (arguments_[i]->type_ == HSAIL_ARGTYPE_IMAGE) { + flags_.imageEna_ = true; + if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) { + 
flags_.imageWriteEna_ = true; + } + } + } + + createSignature(params); } -const Device& -HSAILKernel::dev() const -{ - return reinterpret_cast(dev_); +void HSAILKernel::initHsailArgs(const aclArgData* aclArg) { + // Iterate through the each kernel argument + for (uint index = 0; aclArg->struct_size != 0; aclArg++) { + Argument* arg = new Argument; + + // Initialize HSAIL kernel argument + arg->name_ = aclArg->argStr; + arg->typeName_ = aclArg->typeStr; + arg->size_ = GetHSAILArgSize(aclArg); + arg->type_ = GetHSAILArgType(aclArg); + arg->addrQual_ = GetHSAILAddrQual(aclArg); + arg->dataType_ = GetHSAILDataType(aclArg); + arg->alignment_ = GetHSAILArgAlignment(aclArg); + arg->access_ = GetHSAILArgAccessType(aclArg); + arg->pointeeAlignment_ = GetHSAILArgPointeeAlignment(aclArg); + + bool isHidden = arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X || + arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y || + arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z || + arg->type_ == HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER || + arg->type_ == HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE || + arg->type_ == HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION || + arg->type_ == HSAIL_ARGTYPE_HIDDEN_NONE; + + arg->index_ = isHidden ? 
uint(-1) : index++; + + arguments_.push_back(arg); + } } -const HSAILProgram& -HSAILKernel::prog() const -{ - return reinterpret_cast(prog_); -} - -void -HSAILKernel::findLocalWorkSize( - size_t workDim, - const amd::NDRange& gblWorkSize, - amd::NDRange& lclWorkSize) const -{ - // Initialize the default workgoup info - // Check if the kernel has the compiled sizes - if (workGroupInfo()->compileSize_[0] == 0) { - // Find the default local workgroup size, if it wasn't specified - if (lclWorkSize[0] == 0) { - size_t thrPerGrp; - bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE); - bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) || - !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y); - bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) || - !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) || - !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z); - - bool overrideSet = ((workDim == 1) && b1DOverrideSet) || - ((workDim == 2) && b2DOverrideSet) || - ((workDim == 3) && b3DOverrideSet); - if (!overrideSet) { - // Find threads per group - thrPerGrp = workGroupInfo()->size_; - - // Check if kernel uses images - if (flags_.imageEna_ && - // and thread group is a multiple value of wavefronts - ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) && - // and it's 2 or 3-dimensional workload - (workDim > 1) && - ((dev().settings().partialDispatch_) || - (((gblWorkSize[0] % 16) == 0) && - ((gblWorkSize[1] % 16) == 0)))) { - // Use 8x8 workgroup size if kernel has image writes - if (flags_.imageWriteEna_ || - (thrPerGrp != dev().info().maxWorkGroupSize_)) { - lclWorkSize[0] = 8; - lclWorkSize[1] = 8; - } - else { - lclWorkSize[0] = 16; - lclWorkSize[1] = 16; - } - if (workDim == 3) { - lclWorkSize[2] = 1; - } - } - else { - size_t tmp = thrPerGrp; - // Split the local workgroup into the most efficient way - for (uint d = 0; d < workDim; ++d) { - size_t div = tmp; - for (; (gblWorkSize[d] % div) != 0; div--); - lclWorkSize[d] = div; - tmp /= div; - } - - // 
Check if partial dispatch is enabled and - if (dev().settings().partialDispatch_ && - // we couldn't find optimal workload - (lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) { - size_t maxSize = 0; - size_t maxDim = 0; - for (uint d = 0; d < workDim; ++d) { - if (maxSize < gblWorkSize[d]) { - maxSize = gblWorkSize[d]; - maxDim = d; - } - } - // Check if a local workgroup has the most optimal size - if (thrPerGrp > maxSize) { - thrPerGrp = maxSize; - } - lclWorkSize[maxDim] = thrPerGrp; - for (uint d = 0; d < workDim; ++d) { - if (d != maxDim) { - lclWorkSize[d] = 1; - } - } - } - } +void HSAILKernel::initPrintf(const aclPrintfFmt* aclPrintf) { + PrintfInfo info; + uint index = 0; + for (; aclPrintf->struct_size != 0; aclPrintf++) { + index = aclPrintf->ID; + if (printf_.size() <= index) { + printf_.resize(index + 1); + } + std::string pfmt = aclPrintf->fmtStr; + info.fmtString_.clear(); + size_t pos = 0; + bool need_nl = true; + for (size_t pos = 0; pos < pfmt.size(); ++pos) { + char symbol = pfmt[pos]; + need_nl = true; + if (symbol == '\\') { + // Rest of the C escape sequences (e.g. \') are handled correctly + // by the MDParser, we are not sure exactly how! 
+ switch (pfmt[pos + 1]) { + case 'a': + pos++; + symbol = '\a'; + break; + case 'b': + pos++; + symbol = '\b'; + break; + case 'f': + pos++; + symbol = '\f'; + break; + case 'n': + pos++; + symbol = '\n'; + need_nl = false; + break; + case 'r': + pos++; + symbol = '\r'; + break; + case 'v': + pos++; + symbol = '\v'; + break; + case '7': + if (pfmt[pos + 2] == '2') { + pos += 2; + symbol = '\72'; } - else { - // Use overrides when app doesn't provide workgroup dimensions - if (workDim == 1) { - lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE; - } - else if (workDim == 2) { - lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X; - lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y; - } - else if (workDim == 3) { - lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X; - lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y; - lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z; - } - else - { - assert(0 && "Invalid workDim!"); - } + break; + default: + break; + } + } + info.fmtString_.push_back(symbol); + } + if (need_nl) { + info.fmtString_ += "\n"; + } + uint32_t* tmp_ptr = const_cast(aclPrintf->argSizes); + for (uint i = 0; i < aclPrintf->numSizes; i++, tmp_ptr++) { + info.arguments_.push_back(*tmp_ptr); + } + printf_[index] = info; + info.arguments_.clear(); + } +} + +HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions) + : device::Kernel(name), + compileOptions_(compileOptions), + dev_(prog->dev()), + prog_(*prog), + index_(0), + code_(0), + codeSize_(0), + waveLimiter_( + this, + (prog->isNull() ? 
1 + : dev().properties().gfxipProperties.shaderCore.numCusPerShaderArray) * + dev().hwInfo()->simdPerCU_) { + hsa_ = true; +} + +HSAILKernel::~HSAILKernel() { + while (!arguments_.empty()) { + Argument* arg = arguments_.back(); + delete arg; + arguments_.pop_back(); + } +} + +bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) { +#if defined(WITH_LIGHTNING_COMPILER) + assert(!"Should not reach here"); +#else // !defined(WITH_LIGHTNING_COMPILER) + acl_error error = ACL_SUCCESS; + std::string openClKernelName = openclMangledName(name()); + flags_.internalKernel_ = + (compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false; + // compile kernel down to ISA + if (finalize) { + std::string options(compileOptions_.c_str()); + options.append(" -just-kernel="); + options.append(openClKernelName.c_str()); + // Append an option so that we can selectively enable a SCOption on CZ + // whenever IOMMUv2 is enabled. + if (dev().settings().svmFineGrainSystem_) { + options.append(" -sc-xnack-iommu"); + } + error = aclCompile(dev().compiler(), prog().binaryElf(), options.c_str(), ACL_TYPE_CG, + ACL_TYPE_ISA, nullptr); + buildLog_ += aclGetCompilerLog(dev().compiler()); + if (error != ACL_SUCCESS) { + LogError("Failed to finalize kernel"); + return false; + } + } + + aqlCreateHWInfo(sym); + + // Pull out metadata from the ELF + size_t sizeOfArgList; + error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY, + openClKernelName.c_str(), nullptr, &sizeOfArgList); + if (error != ACL_SUCCESS) { + return false; + } + + char* aclArgList = new char[sizeOfArgList]; + if (nullptr == aclArgList) { + return false; + } + error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY, + openClKernelName.c_str(), aclArgList, &sizeOfArgList); + if (error != ACL_SUCCESS) { + return false; + } + // Set the argList + initArgList(reinterpret_cast(aclArgList)); + delete[] aclArgList; + + size_t sizeOfWorkGroupSize; + error = 
aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE, + openClKernelName.c_str(), nullptr, &sizeOfWorkGroupSize); + if (error != ACL_SUCCESS) { + return false; + } + error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE, + openClKernelName.c_str(), workGroupInfo_.compileSize_, &sizeOfWorkGroupSize); + if (error != ACL_SUCCESS) { + return false; + } + + // Copy wavefront size + workGroupInfo_.wavefrontSize_ = + prog().isNull() ? 64 : dev().properties().gfxipProperties.shaderCore.wavefrontSize; + // Find total workgroup size + if (workGroupInfo_.compileSize_[0] != 0) { + workGroupInfo_.size_ = workGroupInfo_.compileSize_[0] * workGroupInfo_.compileSize_[1] * + workGroupInfo_.compileSize_[2]; + } else { + workGroupInfo_.size_ = dev().info().maxWorkGroupSize_; + } + + // Pull out printf metadata from the ELF + size_t sizeOfPrintfList; + error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY, + openClKernelName.c_str(), nullptr, &sizeOfPrintfList); + if (error != ACL_SUCCESS) { + return false; + } + + // Make sure kernel has any printf info + if (0 != sizeOfPrintfList) { + char* aclPrintfList = new char[sizeOfPrintfList]; + if (nullptr == aclPrintfList) { + return false; + } + error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY, + openClKernelName.c_str(), aclPrintfList, &sizeOfPrintfList); + if (error != ACL_SUCCESS) { + return false; + } + + // Set the PrintfList + initPrintf(reinterpret_cast(aclPrintfList)); + delete[] aclPrintfList; + } + + aclMetadata md; + md.enqueue_kernel = false; + size_t sizeOfDeviceEnqueue = sizeof(md.enqueue_kernel); + error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_DEVICE_ENQUEUE, + openClKernelName.c_str(), &md.enqueue_kernel, &sizeOfDeviceEnqueue); + if (error != ACL_SUCCESS) { + return false; + } + flags_.dynamicParallelism_ = md.enqueue_kernel; + + md.kernel_index = -1; + size_t sizeOfIndex = sizeof(md.kernel_index); + error = 
aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_KERNEL_INDEX, + openClKernelName.c_str(), &md.kernel_index, &sizeOfIndex); + if (error != ACL_SUCCESS) { + return false; + } + index_ = md.kernel_index; + + size_t sizeOfWavesPerSimdHint = sizeof(workGroupInfo_.wavesPerSimdHint_); + error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WAVES_PER_SIMD_HINT, + openClKernelName.c_str(), &workGroupInfo_.wavesPerSimdHint_, + &sizeOfWavesPerSimdHint); + if (error != ACL_SUCCESS) { + return false; + } + + waveLimiter_.enable(); + + size_t sizeOfWorkGroupSizeHint = sizeof(workGroupInfo_.compileSizeHint_); + error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE_HINT, + openClKernelName.c_str(), workGroupInfo_.compileSizeHint_, + &sizeOfWorkGroupSizeHint); + if (error != ACL_SUCCESS) { + return false; + } + + size_t sizeOfVecTypeHint; + error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_VEC_TYPE_HINT, + openClKernelName.c_str(), NULL, &sizeOfVecTypeHint); + if (error != ACL_SUCCESS) { + return false; + } + + if (0 != sizeOfVecTypeHint) { + char* VecTypeHint = new char[sizeOfVecTypeHint + 1]; + if (NULL == VecTypeHint) { + return false; + } + error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_VEC_TYPE_HINT, + openClKernelName.c_str(), VecTypeHint, &sizeOfVecTypeHint); + if (error != ACL_SUCCESS) { + return false; + } + VecTypeHint[sizeOfVecTypeHint] = '\0'; + workGroupInfo_.compileVecTypeHint_ = std::string(VecTypeHint); + delete[] VecTypeHint; + } + +#endif // !defined(WITH_LIGHTNING_COMPILER) + return true; +} + +bool HSAILKernel::validateMemory(uint idx, amd::Memory* amdMem) const { + // Check if memory doesn't require reallocation + bool noRealloc = true; + // amdMem->reallocedDeviceMemory(&dev())); + + return noRealloc; +} + +const Device& HSAILKernel::dev() const { return reinterpret_cast(dev_); } + +const HSAILProgram& HSAILKernel::prog() const { + return reinterpret_cast(prog_); +} + +void 
HSAILKernel::findLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize, + amd::NDRange& lclWorkSize) const { + // Initialize the default workgoup info + // Check if the kernel has the compiled sizes + if (workGroupInfo()->compileSize_[0] == 0) { + // Find the default local workgroup size, if it wasn't specified + if (lclWorkSize[0] == 0) { + size_t thrPerGrp; + bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE); + bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) || + !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y); + bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) || + !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) || + !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z); + + bool overrideSet = ((workDim == 1) && b1DOverrideSet) || ((workDim == 2) && b2DOverrideSet) || + ((workDim == 3) && b3DOverrideSet); + if (!overrideSet) { + // Find threads per group + thrPerGrp = workGroupInfo()->size_; + + // Check if kernel uses images + if (flags_.imageEna_ && + // and thread group is a multiple value of wavefronts + ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) && + // and it's 2 or 3-dimensional workload + (workDim > 1) && ((dev().settings().partialDispatch_) || + (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0)))) { + // Use 8x8 workgroup size if kernel has image writes + if (flags_.imageWriteEna_ || (thrPerGrp != dev().info().maxWorkGroupSize_)) { + lclWorkSize[0] = 8; + lclWorkSize[1] = 8; + } else { + lclWorkSize[0] = 16; + lclWorkSize[1] = 16; + } + if (workDim == 3) { + lclWorkSize[2] = 1; + } + } else { + size_t tmp = thrPerGrp; + // Split the local workgroup into the most efficient way + for (uint d = 0; d < workDim; ++d) { + size_t div = tmp; + for (; (gblWorkSize[d] % div) != 0; div--) + ; + lclWorkSize[d] = div; + tmp /= div; + } + + // Check if partial dispatch is enabled and + if (dev().settings().partialDispatch_ && + // we couldn't find optimal workload + (lclWorkSize.product() % 
workGroupInfo()->wavefrontSize_) != 0) { + size_t maxSize = 0; + size_t maxDim = 0; + for (uint d = 0; d < workDim; ++d) { + if (maxSize < gblWorkSize[d]) { + maxSize = gblWorkSize[d]; + maxDim = d; + } } + // Check if a local workgroup has the most optimal size + if (thrPerGrp > maxSize) { + thrPerGrp = maxSize; + } + lclWorkSize[maxDim] = thrPerGrp; + for (uint d = 0; d < workDim; ++d) { + if (d != maxDim) { + lclWorkSize[d] = 1; + } + } + } } - } - else { - for (uint d = 0; d < workDim; ++d) { - lclWorkSize[d] = workGroupInfo()->compileSize_[d]; + } else { + // Use overrides when app doesn't provide workgroup dimensions + if (workDim == 1) { + lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE; + } else if (workDim == 2) { + lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X; + lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y; + } else if (workDim == 3) { + lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X; + lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y; + lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z; + } else { + assert(0 && "Invalid workDim!"); } + } } + } else { + for (uint d = 0; d < workDim; ++d) { + lclWorkSize[d] = workGroupInfo()->compileSize_[d]; + } + } } -inline static void -WriteAqlArg( - unsigned char** dst,//!< The write pointer to the buffer - const void* src, //!< The source pointer - uint size, //!< The size in bytes to copy - uint alignment = 0 //!< The alignment to follow while writing to the buffer - ) -{ - if (alignment == 0) { - *dst = amd::alignUp(*dst, size); - } - else { - *dst = amd::alignUp(*dst, alignment); - } - memcpy(*dst, src, size); - *dst += size; +inline static void WriteAqlArg( + unsigned char** dst, //!< The write pointer to the buffer + const void* src, //!< The source pointer + uint size, //!< The size in bytes to copy + uint alignment = 0 //!< The alignment to follow while writing to the buffer + ) { + if (alignment == 0) { + *dst = amd::alignUp(*dst, size); + } else { + *dst = amd::alignUp(*dst, alignment); + } + memcpy(*dst, src, size); + *dst += 
size; } -const uint16_t kDispatchPacketHeader = - (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | +const uint16_t kDispatchPacketHeader = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); -hsa_kernel_dispatch_packet_t* -HSAILKernel::loadArguments( - VirtualGPU& gpu, - const amd::Kernel& kernel, - const amd::NDRangeContainer& sizes, - const_address parameters, - bool nativeMem, - uint64_t vmDefQueue, - uint64_t* vmParentWrap, - std::vector& memList) const -{ - static const bool WaitOnBusyEngine = true; - uint64_t ldsAddress = ldsSize(); - address aqlArgBuf = gpu.cb(0)->sysMemCopy(); - address aqlStruct = gpu.cb(1)->sysMemCopy(); - bool srdResource = false; +hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments( + VirtualGPU& gpu, const amd::Kernel& kernel, const amd::NDRangeContainer& sizes, + const_address parameters, bool nativeMem, uint64_t vmDefQueue, uint64_t* vmParentWrap, + std::vector& memList) const { + static const bool WaitOnBusyEngine = true; + uint64_t ldsAddress = ldsSize(); + address aqlArgBuf = gpu.cb(0)->sysMemCopy(); + address aqlStruct = gpu.cb(1)->sysMemCopy(); + bool srdResource = false; - if (dynamicParallelism()) { - // Provide the host parent AQL wrap object to the kernel - AmdAqlWrap* wrap = reinterpret_cast(aqlStruct); - memset(wrap, 0, sizeof(AmdAqlWrap)); - wrap->state = AQL_WRAP_BUSY; - ConstBuffer* cb = gpu.constBufs_[1]; - cb->uploadDataToHw(sizeof(AmdAqlWrap)); - *vmParentWrap = cb->vmAddress() + cb->wrtOffset(); - memList.push_back(cb); + if (dynamicParallelism()) { + // Provide the host parent AQL wrap object to the kernel + AmdAqlWrap* wrap = reinterpret_cast(aqlStruct); + memset(wrap, 0, sizeof(AmdAqlWrap)); + wrap->state = AQL_WRAP_BUSY; + ConstBuffer* cb = gpu.constBufs_[1]; + cb->uploadDataToHw(sizeof(AmdAqlWrap)); + *vmParentWrap = 
cb->vmAddress() + cb->wrtOffset(); + memList.push_back(cb); + } + + const amd::KernelSignature& signature = kernel.signature(); + const amd::KernelParameters& kernelParams = kernel.parameters(); + + // Find all parameters for the current kernel + for (auto arg : arguments_) { + const_address paramaddr = nullptr; + if (arg->index_ != uint(-1)) { + paramaddr = parameters + signature.at(arg->index_).offset_; } - const amd::KernelSignature& signature = kernel.signature(); - const amd::KernelParameters& kernelParams = kernel.parameters(); - - // Find all parameters for the current kernel - for (auto arg : arguments_) { - const_address paramaddr = nullptr; - if (arg->index_ != uint(-1)) { - paramaddr = parameters + signature.at(arg->index_).offset_; + // Handle the hidden arguments first, as they do not have a + // matching parameter in the OCL signature (not a valid arg->index_) + switch (arg->type_) { + case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: { + size_t offset_x = sizes.dimensions() >= 1 ? sizes.offset()[0] : 0; + assert(arg->size_ == sizeof(offset_x) && "check the sizes"); + WriteAqlArg(&aqlArgBuf, &offset_x, arg->size_, arg->alignment_); + break; + } + case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: { + size_t offset_y = sizes.dimensions() >= 2 ? sizes.offset()[1] : 0; + assert(arg->size_ == sizeof(offset_y) && "check the sizes"); + WriteAqlArg(&aqlArgBuf, &offset_y, arg->size_, arg->alignment_); + break; + } + case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: { + size_t offset_z = sizes.dimensions() == 3 ? 
sizes.offset()[2] : 0; + assert(arg->size_ == sizeof(offset_z) && "check the sizes"); + WriteAqlArg(&aqlArgBuf, &offset_z, arg->size_, arg->alignment_); + break; + } + case HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER: { + size_t bufferPtr = 0; + if ((printfInfo().size() > 0) && + // and printf buffer was allocated + (gpu.printfDbgHSA().dbgBuffer() != nullptr)) { + // and set the fourth argument as the printf_buffer pointer + bufferPtr = static_cast(gpu.printfDbgHSA().dbgBuffer()->vmAddress()); + memList.push_back(gpu.printfDbgHSA().dbgBuffer()); } - - // Handle the hidden arguments first, as they do not have a - // matching parameter in the OCL signature (not a valid arg->index_) - switch (arg->type_) { - case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: { - size_t offset_x = sizes.dimensions() >= 1 ? sizes.offset()[0] : 0; - assert(arg->size_ == sizeof(offset_x) && "check the sizes"); - WriteAqlArg(&aqlArgBuf, &offset_x, arg->size_, arg->alignment_); - break; + assert(arg->size_ == sizeof(bufferPtr) && "check the sizes"); + WriteAqlArg(&aqlArgBuf, &bufferPtr, arg->size_, arg->alignment_); + break; + } + case HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE: + assert(arg->size_ == sizeof(static_cast(vmDefQueue)) && "check the sizes"); + WriteAqlArg(&aqlArgBuf, &vmDefQueue, arg->size_, arg->alignment_); + break; + case HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION: + assert(arg->size_ == sizeof(static_cast(*vmParentWrap)) && "check the sizes"); + WriteAqlArg(&aqlArgBuf, vmParentWrap, arg->size_, arg->alignment_); + break; + case HSAIL_ARGTYPE_HIDDEN_NONE: { + void* zero = 0; + assert(arg->size_ <= sizeof(zero) && "check the sizes"); + WriteAqlArg(&aqlArgBuf, &zero, arg->size_, arg->alignment_); + break; + } + case HSAIL_ARGTYPE_POINTER: { + // If it is a local pointer + if (arg->addrQual_ == HSAIL_ADDRESS_LOCAL) { + ldsAddress = amd::alignUp(ldsAddress, arg->pointeeAlignment_); + WriteAqlArg(&aqlArgBuf, &ldsAddress, arg->size_, arg->alignment_); + ldsAddress += *reinterpret_cast(paramaddr); + break; } 
- case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: { - size_t offset_y = sizes.dimensions() >= 2 ? sizes.offset()[1] : 0; - assert(arg->size_ == sizeof(offset_y) && "check the sizes"); - WriteAqlArg(&aqlArgBuf, &offset_y, arg->size_, arg->alignment_); - break; - } - case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: { - size_t offset_z = sizes.dimensions() == 3 ? sizes.offset()[2] : 0; - assert(arg->size_ == sizeof(offset_z) && "check the sizes"); - WriteAqlArg(&aqlArgBuf, &offset_z, arg->size_, arg->alignment_); - break; - } - case HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER: { - size_t bufferPtr = 0; - if ((printfInfo().size() > 0) && - // and printf buffer was allocated - (gpu.printfDbgHSA().dbgBuffer() != nullptr)) { - // and set the fourth argument as the printf_buffer pointer - bufferPtr = static_cast(gpu.printfDbgHSA().dbgBuffer()->vmAddress()); - memList.push_back(gpu.printfDbgHSA().dbgBuffer()); - } - assert(arg->size_ == sizeof(bufferPtr) && "check the sizes"); - WriteAqlArg(&aqlArgBuf, &bufferPtr, arg->size_, arg->alignment_); - break; - } - case HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE: - assert(arg->size_ == sizeof(static_cast(vmDefQueue)) && "check the sizes"); - WriteAqlArg(&aqlArgBuf, &vmDefQueue, arg->size_, arg->alignment_); - break; - case HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION: - assert(arg->size_ == sizeof(static_cast(*vmParentWrap)) && "check the sizes"); - WriteAqlArg(&aqlArgBuf, vmParentWrap, arg->size_, arg->alignment_); - break; - case HSAIL_ARGTYPE_HIDDEN_NONE: { - void* zero = 0; - assert(arg->size_ <= sizeof(zero) && "check the sizes"); - WriteAqlArg(&aqlArgBuf, &zero, arg->size_, arg->alignment_); - break; - } - case HSAIL_ARGTYPE_POINTER: { - // If it is a local pointer - if (arg->addrQual_ == HSAIL_ADDRESS_LOCAL) { - ldsAddress = amd::alignUp(ldsAddress, arg->pointeeAlignment_); - WriteAqlArg(&aqlArgBuf, &ldsAddress, arg->size_, arg->alignment_); - ldsAddress += *reinterpret_cast(paramaddr); - break; - } - assert((arg->addrQual_ == HSAIL_ADDRESS_GLOBAL || - 
arg->addrQual_ == HSAIL_ADDRESS_CONSTANT) && "Unsupported address qualifier"); + assert( + (arg->addrQual_ == HSAIL_ADDRESS_GLOBAL || arg->addrQual_ == HSAIL_ADDRESS_CONSTANT) && + "Unsupported address qualifier"); - // If it is a global pointer - Memory* gpuMem = nullptr; - amd::Memory* mem = nullptr; + // If it is a global pointer + Memory* gpuMem = nullptr; + amd::Memory* mem = nullptr; - if (kernelParams.boundToSvmPointer(dev(), parameters, arg->index_)) { - WriteAqlArg(&aqlArgBuf, paramaddr, sizeof(paramaddr)); - mem = amd::SvmManager::FindSvmBuffer(*reinterpret_cast(paramaddr)); - if (mem != nullptr) { - gpuMem = dev().getGpuMemory(mem); - gpuMem->wait(gpu, WaitOnBusyEngine); - if ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0) { - mem->signalWrite(&dev()); - } - memList.push_back(gpuMem); - } - // If finegrainsystem is present then the pointer can be malloced by the app and - // passed to kernel directly. If so copy the pointer location to aqlArgBuf - else if (!dev().isFineGrainedSystem(true)) { - return nullptr; - } - break; - } - if (nativeMem) { - gpuMem = *reinterpret_cast(paramaddr); - if (nullptr != gpuMem) { - mem = gpuMem->owner(); - } - } - else { - mem = *reinterpret_cast(paramaddr); - if (mem != nullptr) { - gpuMem = dev().getGpuMemory(mem); - } - } - if (gpuMem == nullptr) { - WriteAqlArg(&aqlArgBuf, &gpuMem, arg->size_, arg->alignment_); - break; - } - - //! 64 bit isn't supported with 32 bit binary - uint64_t globalAddress = gpuMem->vmAddress() + gpuMem->pinOffset(); - WriteAqlArg(&aqlArgBuf, &globalAddress, arg->size_, arg->alignment_); - - // Wait for resource if it was used on an inactive engine - //! \note syncCache may call DRM transfer + if (kernelParams.boundToSvmPointer(dev(), parameters, arg->index_)) { + WriteAqlArg(&aqlArgBuf, paramaddr, sizeof(paramaddr)); + mem = amd::SvmManager::FindSvmBuffer(*reinterpret_cast(paramaddr)); + if (mem != nullptr) { + gpuMem = dev().getGpuMemory(mem); gpuMem->wait(gpu, WaitOnBusyEngine); - - //! 
@todo Compiler has to return read/write attributes - if ((nullptr != mem) && - ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) { - mem->signalWrite(&dev()); + if ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0) { + mem->signalWrite(&dev()); } memList.push_back(gpuMem); - - // save the memory object pointer to allow global memory access - if (nullptr != dev().hwDebugMgr()) { - dev().hwDebugMgr()->assignKernelParamMem(arg->index_, gpuMem->owner()); - } - break; - } - case HSAIL_ARGTYPE_REFERENCE: { - // Copy the current structure into CB1 - memcpy(aqlStruct, paramaddr, arg->size_); - ConstBuffer* cb = gpu.constBufs_[1]; - cb->uploadDataToHw(arg->size_); - // Then use a pointer in aqlArgBuffer to CB1 - size_t gpuPtr = static_cast(cb->vmAddress() + cb->wrtOffset()); - WriteAqlArg(&aqlArgBuf, &gpuPtr, sizeof(size_t)); - memList.push_back(cb); - break; - } - case HSAIL_ARGTYPE_VALUE: - WriteAqlArg(&aqlArgBuf, paramaddr, arg->size_, arg->alignment_); - break; - case HSAIL_ARGTYPE_IMAGE: { - Image* image = nullptr; - amd::Memory* mem = nullptr; - if (nativeMem) { - image = static_cast(*reinterpret_cast(paramaddr)); - } - else { - mem = *reinterpret_cast(paramaddr); - if (mem == nullptr) { - LogError( "The kernel image argument isn't an image object!"); - return nullptr; - } - image = static_cast(dev().getGpuMemory(mem)); - } - - // Wait for resource if it was used on an inactive engine - //! \note syncCache may call DRM transfer - image->wait(gpu, WaitOnBusyEngine); - - //! \note Special case for the image views. - //! Copy SRD to CB1, so blit manager will be able to release - //! this view without a wait for SRD resource. 
- if (image->memoryType() == Resource::ImageView) { - // Copy the current structre into CB1 - memcpy(aqlStruct, image->hwState(), HsaImageObjectSize); - ConstBuffer* cb = gpu.constBufs_[1]; - cb->uploadDataToHw(HsaImageObjectSize); - // Then use a pointer in aqlArgBuffer to CB1 - uint64_t srd = cb->vmAddress() + cb->wrtOffset(); - WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); - memList.push_back(cb); - } - else { - uint64_t srd = image->hwSrd(); - WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); - srdResource = true; - } - - //! @todo Compiler has to return read/write attributes - if ((nullptr != mem) && - ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) { - mem->signalWrite(&dev()); - } - - memList.push_back(image); - break; - } - case HSAIL_ARGTYPE_SAMPLER: { - const amd::Sampler* sampler = - *reinterpret_cast(paramaddr); - const Sampler* gpuSampler = static_cast - (sampler->getDeviceSampler(dev())); - uint64_t srd = gpuSampler->hwSrd(); - WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); - srdResource = true; - break; - } - case HSAIL_ARGTYPE_QUEUE: { - const amd::DeviceQueue* queue = - *reinterpret_cast(paramaddr); - VirtualGPU* gpuQueue = static_cast(queue->vDev()); - uint64_t vmQueue; - if (dev().settings().useDeviceQueue_) { - vmQueue = gpuQueue->vQueue()->vmAddress(); - } - else { - if (!gpu.createVirtualQueue(queue->size())) { - LogError("Virtual queue creation failed!"); - return nullptr; - } - vmQueue = gpu.vQueue()->vmAddress(); - } - WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(vmQueue)); - break; - } - default: - LogError(" Unsupported argument type "); + } + // If finegrainsystem is present then the pointer can be malloced by the app and + // passed to kernel directly. 
If so copy the pointer location to aqlArgBuf + else if (!dev().isFineGrainedSystem(true)) { return nullptr; + } + break; + } + if (nativeMem) { + gpuMem = *reinterpret_cast(paramaddr); + if (nullptr != gpuMem) { + mem = gpuMem->owner(); + } + } else { + mem = *reinterpret_cast(paramaddr); + if (mem != nullptr) { + gpuMem = dev().getGpuMemory(mem); + } + } + if (gpuMem == nullptr) { + WriteAqlArg(&aqlArgBuf, &gpuMem, arg->size_, arg->alignment_); + break; } - } - if (ldsAddress > dev().info().localMemSize_) { - LogError("No local memory available\n"); + //! 64 bit isn't supported with 32 bit binary + uint64_t globalAddress = gpuMem->vmAddress() + gpuMem->pinOffset(); + WriteAqlArg(&aqlArgBuf, &globalAddress, arg->size_, arg->alignment_); + + // Wait for resource if it was used on an inactive engine + //! \note syncCache may call DRM transfer + gpuMem->wait(gpu, WaitOnBusyEngine); + + //! @todo Compiler has to return read/write attributes + if ((nullptr != mem) && ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) { + mem->signalWrite(&dev()); + } + memList.push_back(gpuMem); + + // save the memory object pointer to allow global memory access + if (nullptr != dev().hwDebugMgr()) { + dev().hwDebugMgr()->assignKernelParamMem(arg->index_, gpuMem->owner()); + } + break; + } + case HSAIL_ARGTYPE_REFERENCE: { + // Copy the current structure into CB1 + memcpy(aqlStruct, paramaddr, arg->size_); + ConstBuffer* cb = gpu.constBufs_[1]; + cb->uploadDataToHw(arg->size_); + // Then use a pointer in aqlArgBuffer to CB1 + size_t gpuPtr = static_cast(cb->vmAddress() + cb->wrtOffset()); + WriteAqlArg(&aqlArgBuf, &gpuPtr, sizeof(size_t)); + memList.push_back(cb); + break; + } + case HSAIL_ARGTYPE_VALUE: + WriteAqlArg(&aqlArgBuf, paramaddr, arg->size_, arg->alignment_); + break; + case HSAIL_ARGTYPE_IMAGE: { + Image* image = nullptr; + amd::Memory* mem = nullptr; + if (nativeMem) { + image = static_cast(*reinterpret_cast(paramaddr)); + } else { + mem = *reinterpret_cast(paramaddr); + if 
(mem == nullptr) { + LogError("The kernel image argument isn't an image object!"); + return nullptr; + } + image = static_cast(dev().getGpuMemory(mem)); + } + + // Wait for resource if it was used on an inactive engine + //! \note syncCache may call DRM transfer + image->wait(gpu, WaitOnBusyEngine); + + //! \note Special case for the image views. + //! Copy SRD to CB1, so blit manager will be able to release + //! this view without a wait for SRD resource. + if (image->memoryType() == Resource::ImageView) { + // Copy the current structre into CB1 + memcpy(aqlStruct, image->hwState(), HsaImageObjectSize); + ConstBuffer* cb = gpu.constBufs_[1]; + cb->uploadDataToHw(HsaImageObjectSize); + // Then use a pointer in aqlArgBuffer to CB1 + uint64_t srd = cb->vmAddress() + cb->wrtOffset(); + WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); + memList.push_back(cb); + } else { + uint64_t srd = image->hwSrd(); + WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); + srdResource = true; + } + + //! @todo Compiler has to return read/write attributes + if ((nullptr != mem) && ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) { + mem->signalWrite(&dev()); + } + + memList.push_back(image); + break; + } + case HSAIL_ARGTYPE_SAMPLER: { + const amd::Sampler* sampler = *reinterpret_cast(paramaddr); + const Sampler* gpuSampler = static_cast(sampler->getDeviceSampler(dev())); + uint64_t srd = gpuSampler->hwSrd(); + WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); + srdResource = true; + break; + } + case HSAIL_ARGTYPE_QUEUE: { + const amd::DeviceQueue* queue = *reinterpret_cast(paramaddr); + VirtualGPU* gpuQueue = static_cast(queue->vDev()); + uint64_t vmQueue; + if (dev().settings().useDeviceQueue_) { + vmQueue = gpuQueue->vQueue()->vmAddress(); + } else { + if (!gpu.createVirtualQueue(queue->size())) { + LogError("Virtual queue creation failed!"); + return nullptr; + } + vmQueue = gpu.vQueue()->vmAddress(); + } + WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(vmQueue)); + break; + } + default: + LogError(" 
Unsupported argument type "); return nullptr; } + } + + if (ldsAddress > dev().info().localMemSize_) { + LogError("No local memory available\n"); + return nullptr; + } #if defined(WITH_LIGHTNING_COMPILER) - // Check there is no arguments' buffer overflow. We may not use all the - // hidden argument slots. - assert(aqlArgBuf <= (gpu.cb(0)->sysMemCopy() + argsBufferSize())); -#else // !defined(WITH_LIGHTNING_COMPILER) - // HSAIL kernarg segment size is rounded up to multiple of 16. - aqlArgBuf = amd::alignUp(aqlArgBuf, 16); - assert((aqlArgBuf == (gpu.cb(0)->sysMemCopy() + argsBufferSize())) && - "Size and the number of arguments don't match!"); -#endif // !defined(WITH_LIGHTNING_COMPILER) - hsa_kernel_dispatch_packet_t* hsaDisp = - reinterpret_cast( - gpu.cb(0)->sysMemCopy() + argsBufferSize()); + // Check there is no arguments' buffer overflow. We may not use all the + // hidden argument slots. + assert(aqlArgBuf <= (gpu.cb(0)->sysMemCopy() + argsBufferSize())); +#else // !defined(WITH_LIGHTNING_COMPILER) + // HSAIL kernarg segment size is rounded up to multiple of 16. 
+ aqlArgBuf = amd::alignUp(aqlArgBuf, 16); + assert((aqlArgBuf == (gpu.cb(0)->sysMemCopy() + argsBufferSize())) && + "Size and the number of arguments don't match!"); +#endif // !defined(WITH_LIGHTNING_COMPILER) + hsa_kernel_dispatch_packet_t* hsaDisp = + reinterpret_cast(gpu.cb(0)->sysMemCopy() + argsBufferSize()); - amd::NDRange local(sizes.local()); - const amd::NDRange& global = sizes.global(); + amd::NDRange local(sizes.local()); + const amd::NDRange& global = sizes.global(); - // Check if runtime has to find local workgroup size - findLocalWorkSize(sizes.dimensions(), sizes.global(), local); + // Check if runtime has to find local workgroup size + findLocalWorkSize(sizes.dimensions(), sizes.global(), local); - hsaDisp->header = kDispatchPacketHeader; - hsaDisp->setup = sizes.dimensions(); + hsaDisp->header = kDispatchPacketHeader; + hsaDisp->setup = sizes.dimensions(); - hsaDisp->workgroup_size_x = local[0]; - hsaDisp->workgroup_size_y = (sizes.dimensions() > 1) ? local[1] : 1; - hsaDisp->workgroup_size_z = (sizes.dimensions() > 2) ? local[2] : 1; + hsaDisp->workgroup_size_x = local[0]; + hsaDisp->workgroup_size_y = (sizes.dimensions() > 1) ? local[1] : 1; + hsaDisp->workgroup_size_z = (sizes.dimensions() > 2) ? local[2] : 1; - hsaDisp->grid_size_x = global[0]; - hsaDisp->grid_size_y = (sizes.dimensions() > 1) ? global[1] : 1; - hsaDisp->grid_size_z = (sizes.dimensions() > 2) ? global[2] : 1; - hsaDisp->reserved2 = 0; + hsaDisp->grid_size_x = global[0]; + hsaDisp->grid_size_y = (sizes.dimensions() > 1) ? global[1] : 1; + hsaDisp->grid_size_z = (sizes.dimensions() > 2) ? 
global[2] : 1; + hsaDisp->reserved2 = 0; - // Initialize kernel ISA and execution buffer requirements - hsaDisp->private_segment_size = spillSegSize(); - hsaDisp->group_segment_size = ldsAddress - ldsSize(); - hsaDisp->kernel_object = gpuAqlCode(); + // Initialize kernel ISA and execution buffer requirements + hsaDisp->private_segment_size = spillSegSize(); + hsaDisp->group_segment_size = ldsAddress - ldsSize(); + hsaDisp->kernel_object = gpuAqlCode(); - ConstBuffer* cb = gpu.constBufs_[0]; - cb->uploadDataToHw(argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t)); - uint64_t argList = cb->vmAddress() + cb->wrtOffset(); + ConstBuffer* cb = gpu.constBufs_[0]; + cb->uploadDataToHw(argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t)); + uint64_t argList = cb->vmAddress() + cb->wrtOffset(); - hsaDisp->kernarg_address = reinterpret_cast(argList); - hsaDisp->reserved2 = 0; - hsaDisp->completion_signal.handle = 0; + hsaDisp->kernarg_address = reinterpret_cast(argList); + hsaDisp->reserved2 = 0; + hsaDisp->completion_signal.handle = 0; - memList.push_back(cb); - memList.push_back(&prog().codeSegGpu()); - for (pal::Memory * mem : prog().globalStores()) { - memList.push_back(mem); - } - if (AMD_HSA_BITS_GET(cpuAqlCode_->kernel_code_properties, - AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) { - memList.push_back(gpu.hsaQueueMem()); - } + memList.push_back(cb); + memList.push_back(&prog().codeSegGpu()); + for (pal::Memory* mem : prog().globalStores()) { + memList.push_back(mem); + } + if (AMD_HSA_BITS_GET(cpuAqlCode_->kernel_code_properties, + AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) { + memList.push_back(gpu.hsaQueueMem()); + } - if (srdResource || prog().isStaticSampler()) { - dev().srds().fillResourceList(memList); - } + if (srdResource || prog().isStaticSampler()) { + dev().srds().fillResourceList(memList); + } - return hsaDisp; + return hsaDisp; } #if defined(WITH_LIGHTNING_COMPILER) @@ -1282,404 +1191,372 @@ using 
llvm::AMDGPU::CodeObject::AddressSpaceQualifier; using llvm::AMDGPU::CodeObject::ValueKind; using llvm::AMDGPU::CodeObject::ValueType; -const LightningProgram& -LightningKernel::prog() const -{ - return reinterpret_cast(prog_); +const LightningProgram& LightningKernel::prog() const { + return reinterpret_cast(prog_); } -void -LightningKernel::initPrintf(const std::vector& printfInfoStrings) -{ - for (auto str : printfInfoStrings) { - std::vector tokens; +void LightningKernel::initPrintf(const std::vector& printfInfoStrings) { + for (auto str : printfInfoStrings) { + std::vector tokens; - size_t end, pos = 0; - do { - end = str.find_first_of(':', pos); - tokens.push_back(str.substr(pos, end-pos)); - pos = end + 1; - } while (end != std::string::npos); + size_t end, pos = 0; + do { + end = str.find_first_of(':', pos); + tokens.push_back(str.substr(pos, end - pos)); + pos = end + 1; + } while (end != std::string::npos); - if (tokens.size() < 2) { - LogPrintfWarning("Invalid PrintInfo string: \"%s\"", str.c_str()); - continue; - } - - pos = 0; - size_t printfInfoID = std::stoi(tokens[pos++]); - if (printf_.size() <= printfInfoID) { - printf_.resize(printfInfoID + 1); - } - PrintfInfo& info = printf_[printfInfoID]; - - size_t numSizes = std::stoi(tokens[pos++]); - end = pos + numSizes; - - // ensure that we have the correct number of tokens - if (tokens.size() < end + 1/*last token is the fmtString*/) { - LogPrintfWarning("Invalid PrintInfo string: \"%s\"", str.c_str()); - continue; - } - - // push the argument sizes - while (pos < end) { - info.arguments_.push_back(std::stoi(tokens[pos++])); - } - - // FIXME: We should not need this! 
[ - std::string& fmt = tokens[pos]; - bool need_nl = true; - - for (pos = 0; pos < fmt.size(); ++pos) { - char symbol = fmt[pos]; - need_nl = true; - if (symbol == '\\') { - switch (fmt[pos+1]) { - case 'a': - pos++; - symbol = '\a'; - break; - case 'b': - pos++; - symbol = '\b'; - break; - case 'f': - pos++; - symbol = '\f'; - break; - case 'n': - pos++; - symbol = '\n'; - need_nl = false; - break; - case 'r': - pos++; - symbol = '\r'; - break; - case 'v': - pos++; - symbol = '\v'; - break; - case '7': - if (fmt[pos+2] == '2') { - pos += 2; - symbol = '\72'; - } - break; - default: - break; - } - } - info.fmtString_.push_back(symbol); - } - if (need_nl) { - info.fmtString_ += "\n"; - } - // ] + if (tokens.size() < 2) { + LogPrintfWarning("Invalid PrintInfo string: \"%s\"", str.c_str()); + continue; } + + pos = 0; + size_t printfInfoID = std::stoi(tokens[pos++]); + if (printf_.size() <= printfInfoID) { + printf_.resize(printfInfoID + 1); + } + PrintfInfo& info = printf_[printfInfoID]; + + size_t numSizes = std::stoi(tokens[pos++]); + end = pos + numSizes; + + // ensure that we have the correct number of tokens + if (tokens.size() < end + 1 /*last token is the fmtString*/) { + LogPrintfWarning("Invalid PrintInfo string: \"%s\"", str.c_str()); + continue; + } + + // push the argument sizes + while (pos < end) { + info.arguments_.push_back(std::stoi(tokens[pos++])); + } + + // FIXME: We should not need this! 
[ + std::string& fmt = tokens[pos]; + bool need_nl = true; + + for (pos = 0; pos < fmt.size(); ++pos) { + char symbol = fmt[pos]; + need_nl = true; + if (symbol == '\\') { + switch (fmt[pos + 1]) { + case 'a': + pos++; + symbol = '\a'; + break; + case 'b': + pos++; + symbol = '\b'; + break; + case 'f': + pos++; + symbol = '\f'; + break; + case 'n': + pos++; + symbol = '\n'; + need_nl = false; + break; + case 'r': + pos++; + symbol = '\r'; + break; + case 'v': + pos++; + symbol = '\v'; + break; + case '7': + if (fmt[pos + 2] == '2') { + pos += 2; + symbol = '\72'; + } + break; + default: + break; + } + } + info.fmtString_.push_back(symbol); + } + if (need_nl) { + info.fmtString_ += "\n"; + } + // ] + } } -static inline HSAIL_ARG_TYPE -GetKernelArgType(const KernelArgMD& lcArg) -{ - switch (lcArg.mValueKind) { +static inline HSAIL_ARG_TYPE GetKernelArgType(const KernelArgMD& lcArg) { + switch (lcArg.mValueKind) { case ValueKind::GlobalBuffer: case ValueKind::DynamicSharedPointer: - return HSAIL_ARGTYPE_POINTER; + return HSAIL_ARGTYPE_POINTER; case ValueKind::ByValue: - return HSAIL_ARGTYPE_VALUE; + return HSAIL_ARGTYPE_VALUE; case ValueKind::Image: - return HSAIL_ARGTYPE_IMAGE; + return HSAIL_ARGTYPE_IMAGE; case ValueKind::Sampler: - return HSAIL_ARGTYPE_SAMPLER; + return HSAIL_ARGTYPE_SAMPLER; case ValueKind::HiddenGlobalOffsetX: - return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X; + return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X; case ValueKind::HiddenGlobalOffsetY: - return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y; + return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y; case ValueKind::HiddenGlobalOffsetZ: - return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z; + return HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z; case ValueKind::HiddenPrintfBuffer: - return HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER; + return HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER; case ValueKind::HiddenDefaultQueue: - return HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE; + return HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE; case ValueKind::HiddenCompletionAction: 
- return HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION; + return HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION; case ValueKind::HiddenNone: - return HSAIL_ARGTYPE_HIDDEN_NONE; + return HSAIL_ARGTYPE_HIDDEN_NONE; default: - return HSAIL_ARGTYPE_ERROR; - } + return HSAIL_ARGTYPE_ERROR; + } } -static inline size_t -GetKernelArgAlignment(const KernelArgMD& lcArg) -{ - return lcArg.mAlign; +static inline size_t GetKernelArgAlignment(const KernelArgMD& lcArg) { return lcArg.mAlign; } + +static inline size_t GetKernelArgPointeeAlignment(const KernelArgMD& lcArg) { + if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) { + uint32_t align = lcArg.mPointeeAlign; + if (align == 0) { + LogWarning("Missing DynamicSharedPointer alignment"); + align = 128; /* worst case alignment */ + ; + } + return align; + } + return 1; } -static inline size_t -GetKernelArgPointeeAlignment(const KernelArgMD& lcArg) -{ - if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) { - uint32_t align = lcArg.mPointeeAlign; - if (align == 0) { - LogWarning("Missing DynamicSharedPointer alignment"); - align = 128; /* worst case alignment */; - } - return align; +static inline HSAIL_ACCESS_TYPE GetKernelArgAccessType(const KernelArgMD& lcArg) { + if (lcArg.mValueKind == ValueKind::GlobalBuffer || lcArg.mValueKind == ValueKind::Image) { + switch (lcArg.mAccQual) { + case AccessQualifier::ReadOnly: + return HSAIL_ACCESS_TYPE_RO; + case AccessQualifier::WriteOnly: + return HSAIL_ACCESS_TYPE_WO; + case AccessQualifier::ReadWrite: + default: + return HSAIL_ACCESS_TYPE_RW; } - return 1; + } + return HSAIL_ACCESS_TYPE_NONE; } -static inline HSAIL_ACCESS_TYPE -GetKernelArgAccessType(const KernelArgMD& lcArg) -{ - if (lcArg.mValueKind == ValueKind::GlobalBuffer - || lcArg.mValueKind == ValueKind::Image) { - switch (lcArg.mAccQual) { - case AccessQualifier::ReadOnly: - return HSAIL_ACCESS_TYPE_RO; - case AccessQualifier::WriteOnly: - return HSAIL_ACCESS_TYPE_WO; - case AccessQualifier::ReadWrite: - default: - return 
HSAIL_ACCESS_TYPE_RW; - } - } - return HSAIL_ACCESS_TYPE_NONE; -} - -static inline HSAIL_ADDRESS_QUALIFIER -GetKernelAddrQual(const KernelArgMD& lcArg) -{ - if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) { - return HSAIL_ADDRESS_LOCAL; - } - else if (lcArg.mValueKind == ValueKind::GlobalBuffer) { - if (lcArg.mAddrSpaceQual == AddressSpaceQualifier::Global) { - return HSAIL_ADDRESS_GLOBAL; - } - else if (lcArg.mAddrSpaceQual == AddressSpaceQualifier::Constant) { - return HSAIL_ADDRESS_CONSTANT; - } - LogError("Unsupported address type"); - return HSAIL_ADDRESS_ERROR; - } - else if (lcArg.mValueKind == ValueKind::Image - || lcArg.mValueKind == ValueKind::Sampler) { - return HSAIL_ADDRESS_GLOBAL; +static inline HSAIL_ADDRESS_QUALIFIER GetKernelAddrQual(const KernelArgMD& lcArg) { + if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) { + return HSAIL_ADDRESS_LOCAL; + } else if (lcArg.mValueKind == ValueKind::GlobalBuffer) { + if (lcArg.mAddrSpaceQual == AddressSpaceQualifier::Global) { + return HSAIL_ADDRESS_GLOBAL; + } else if (lcArg.mAddrSpaceQual == AddressSpaceQualifier::Constant) { + return HSAIL_ADDRESS_CONSTANT; } + LogError("Unsupported address type"); return HSAIL_ADDRESS_ERROR; + } else if (lcArg.mValueKind == ValueKind::Image || lcArg.mValueKind == ValueKind::Sampler) { + return HSAIL_ADDRESS_GLOBAL; + } + return HSAIL_ADDRESS_ERROR; } -static inline HSAIL_DATA_TYPE -GetKernelDataType(const KernelArgMD& lcArg) -{ - if (lcArg.mValueKind != ValueKind::ByValue) { - return HSAIL_DATATYPE_ERROR; - } +static inline HSAIL_DATA_TYPE GetKernelDataType(const KernelArgMD& lcArg) { + if (lcArg.mValueKind != ValueKind::ByValue) { + return HSAIL_DATATYPE_ERROR; + } - switch (lcArg.mValueType) { + switch (lcArg.mValueType) { case ValueType::I8: - return HSAIL_DATATYPE_S8; + return HSAIL_DATATYPE_S8; case ValueType::I16: - return HSAIL_DATATYPE_S16; + return HSAIL_DATATYPE_S16; case ValueType::I32: - return HSAIL_DATATYPE_S32; + return HSAIL_DATATYPE_S32; 
case ValueType::I64: - return HSAIL_DATATYPE_S64; + return HSAIL_DATATYPE_S64; case ValueType::U8: - return HSAIL_DATATYPE_U8; + return HSAIL_DATATYPE_U8; case ValueType::U16: - return HSAIL_DATATYPE_U16; + return HSAIL_DATATYPE_U16; case ValueType::U32: - return HSAIL_DATATYPE_U32; + return HSAIL_DATATYPE_U32; case ValueType::U64: - return HSAIL_DATATYPE_U64; + return HSAIL_DATATYPE_U64; case ValueType::F16: - return HSAIL_DATATYPE_F16; + return HSAIL_DATATYPE_F16; case ValueType::F32: - return HSAIL_DATATYPE_F32; + return HSAIL_DATATYPE_F32; case ValueType::F64: - return HSAIL_DATATYPE_F64; + return HSAIL_DATATYPE_F64; case ValueType::Struct: - return HSAIL_DATATYPE_STRUCT; + return HSAIL_DATATYPE_STRUCT; default: - return HSAIL_DATATYPE_ERROR; - } + return HSAIL_DATATYPE_ERROR; + } } -static inline cl_kernel_arg_type_qualifier -GetOclTypeQual(const KernelArgMD& lcArg) -{ - cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE; - if (lcArg.mValueKind == ValueKind::GlobalBuffer || - lcArg.mValueKind == ValueKind::DynamicSharedPointer) { - if (lcArg.mIsVolatile) { - rv |= CL_KERNEL_ARG_TYPE_VOLATILE; - } - if (lcArg.mIsRestrict) { - rv |= CL_KERNEL_ARG_TYPE_RESTRICT; - } - if (lcArg.mIsConst) { - rv |= CL_KERNEL_ARG_TYPE_CONST; - } +static inline cl_kernel_arg_type_qualifier GetOclTypeQual(const KernelArgMD& lcArg) { + cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE; + if (lcArg.mValueKind == ValueKind::GlobalBuffer || + lcArg.mValueKind == ValueKind::DynamicSharedPointer) { + if (lcArg.mIsVolatile) { + rv |= CL_KERNEL_ARG_TYPE_VOLATILE; } - return rv; + if (lcArg.mIsRestrict) { + rv |= CL_KERNEL_ARG_TYPE_RESTRICT; + } + if (lcArg.mIsConst) { + rv |= CL_KERNEL_ARG_TYPE_CONST; + } + } + return rv; } -void -LightningKernel::initArgList(const KernelMD& kernelMD) -{ - device::Kernel::parameters_t params; +void LightningKernel::initArgList(const KernelMD& kernelMD) { + device::Kernel::parameters_t params; - size_t offset = 0; + size_t offset = 0; - for 
(size_t i = 0; i < kernelMD.mArgs.size(); ++i) { - const KernelArgMD& lcArg = kernelMD.mArgs[i]; + for (size_t i = 0; i < kernelMD.mArgs.size(); ++i) { + const KernelArgMD& lcArg = kernelMD.mArgs[i]; - // Initialize HSAIL kernel argument - auto arg = new HSAILKernel::Argument; - arg->name_ = lcArg.mName; - arg->typeName_ = lcArg.mTypeName; - arg->size_ = lcArg.mSize; - arg->type_ = GetKernelArgType(lcArg); - arg->addrQual_ = GetKernelAddrQual(lcArg); - arg->dataType_ = GetKernelDataType(lcArg); - arg->alignment_ = GetKernelArgAlignment(lcArg); - arg->access_ = GetKernelArgAccessType(lcArg); - arg->pointeeAlignment_ = GetKernelArgPointeeAlignment(lcArg); + // Initialize HSAIL kernel argument + auto arg = new HSAILKernel::Argument; + arg->name_ = lcArg.mName; + arg->typeName_ = lcArg.mTypeName; + arg->size_ = lcArg.mSize; + arg->type_ = GetKernelArgType(lcArg); + arg->addrQual_ = GetKernelAddrQual(lcArg); + arg->dataType_ = GetKernelDataType(lcArg); + arg->alignment_ = GetKernelArgAlignment(lcArg); + arg->access_ = GetKernelArgAccessType(lcArg); + arg->pointeeAlignment_ = GetKernelArgPointeeAlignment(lcArg); - bool isHidden = arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X - || arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y - || arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z - || arg->type_ == HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER - || arg->type_ == HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE - || arg->type_ == HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION - || arg->type_ == HSAIL_ARGTYPE_HIDDEN_NONE; + bool isHidden = arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X || + arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y || + arg->type_ == HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z || + arg->type_ == HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER || + arg->type_ == HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE || + arg->type_ == HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION || + arg->type_ == HSAIL_ARGTYPE_HIDDEN_NONE; - arg->index_ = isHidden ? 
uint(-1) : params.size(); - arguments_.push_back(arg); + arg->index_ = isHidden ? uint(-1) : params.size(); + arguments_.push_back(arg); - if (isHidden) { - continue; - } - - // Initialize Device kernel parameters - amd::KernelParameterDescriptor desc; - - desc.name_ = lcArg.mName.c_str(); - desc.type_ = GetOclType(arg); - desc.addressQualifier_ = GetOclAddrQual(arg); - desc.accessQualifier_ = GetOclAccessQual(arg); - desc.typeQualifier_ = GetOclTypeQual(lcArg); - desc.typeName_ = lcArg.mTypeName.c_str(); - - // Make a check if it is local or global - if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) { - desc.size_ = 0; - } - else { - desc.size_ = arg->size_; - } - - // Make offset alignment to match CPU metadata, since - // in multidevice config abstraction layer has a single signature - // and CPU sends the parameters as they are allocated in memory - size_t size = desc.size_; - if (size == 0) { - // Local memory for CPU - size = sizeof(cl_mem); - } - offset = (size_t) amd::alignUp(offset, std::min(size, size_t(16))); - desc.offset_ = offset; - offset += amd::alignUp(size, sizeof(uint32_t)); - - params.push_back(desc); + if (isHidden) { + continue; } - createSignature(params); + // Initialize Device kernel parameters + amd::KernelParameterDescriptor desc; + + desc.name_ = lcArg.mName.c_str(); + desc.type_ = GetOclType(arg); + desc.addressQualifier_ = GetOclAddrQual(arg); + desc.accessQualifier_ = GetOclAccessQual(arg); + desc.typeQualifier_ = GetOclTypeQual(lcArg); + desc.typeName_ = lcArg.mTypeName.c_str(); + + // Make a check if it is local or global + if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) { + desc.size_ = 0; + } else { + desc.size_ = arg->size_; + } + + // Make offset alignment to match CPU metadata, since + // in multidevice config abstraction layer has a single signature + // and CPU sends the parameters as they are allocated in memory + size_t size = desc.size_; + if (size == 0) { + // Local memory for CPU + size = 
sizeof(cl_mem); + } + offset = (size_t)amd::alignUp(offset, std::min(size, size_t(16))); + desc.offset_ = offset; + offset += amd::alignUp(size, sizeof(uint32_t)); + + params.push_back(desc); + } + + createSignature(params); } -static const KernelMD* FindKernelMetadata(const CodeObjectMD* programMD, - const std::string& name) { - for (const KernelMD& kernelMD : programMD->mKernels) { - if (kernelMD.mName == name) { return &kernelMD; } +static const KernelMD* FindKernelMetadata(const CodeObjectMD* programMD, const std::string& name) { + for (const KernelMD& kernelMD : programMD->mKernels) { + if (kernelMD.mName == name) { + return &kernelMD; } - return nullptr; + } + return nullptr; } -bool -LightningKernel::init(amd::hsa::loader::Symbol* symbol) -{ - flags_.internalKernel_ = (compileOptions_.find("-cl-internal-kernel") != - std::string::npos) ? true: false; +bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) { + flags_.internalKernel_ = + (compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? 
true : false; - aqlCreateHWInfo(symbol); + aqlCreateHWInfo(symbol); - const CodeObjectMD* programMD = prog().metadata(); - assert(programMD != nullptr); + const CodeObjectMD* programMD = prog().metadata(); + assert(programMD != nullptr); - const KernelMD* kernelMD = FindKernelMetadata(programMD, name()); + const KernelMD* kernelMD = FindKernelMetadata(programMD, name()); - if (kernelMD == nullptr) { - return false; - } + if (kernelMD == nullptr) { + return false; + } - // Set the argList - initArgList(*kernelMD); + // Set the argList + initArgList(*kernelMD); - if (!kernelMD->mAttrs.mReqdWorkGroupSize.empty()) { - const auto& requiredWorkgroupSize = kernelMD->mAttrs.mReqdWorkGroupSize; - workGroupInfo_.compileSize_[0] = requiredWorkgroupSize[0]; - workGroupInfo_.compileSize_[1] = requiredWorkgroupSize[1]; - workGroupInfo_.compileSize_[2] = requiredWorkgroupSize[2]; - } + if (!kernelMD->mAttrs.mReqdWorkGroupSize.empty()) { + const auto& requiredWorkgroupSize = kernelMD->mAttrs.mReqdWorkGroupSize; + workGroupInfo_.compileSize_[0] = requiredWorkgroupSize[0]; + workGroupInfo_.compileSize_[1] = requiredWorkgroupSize[1]; + workGroupInfo_.compileSize_[2] = requiredWorkgroupSize[2]; + } - if (!kernelMD->mAttrs.mWorkGroupSizeHint.empty()) { - const auto& workgroupSizeHint = kernelMD->mAttrs.mWorkGroupSizeHint; - workGroupInfo_.compileSizeHint_[0] = workgroupSizeHint[0]; - workGroupInfo_.compileSizeHint_[1] = workgroupSizeHint[1]; - workGroupInfo_.compileSizeHint_[2] = workgroupSizeHint[2]; - } + if (!kernelMD->mAttrs.mWorkGroupSizeHint.empty()) { + const auto& workgroupSizeHint = kernelMD->mAttrs.mWorkGroupSizeHint; + workGroupInfo_.compileSizeHint_[0] = workgroupSizeHint[0]; + workGroupInfo_.compileSizeHint_[1] = workgroupSizeHint[1]; + workGroupInfo_.compileSizeHint_[2] = workgroupSizeHint[2]; + } - if (!kernelMD->mAttrs.mVecTypeHint.empty()) { - workGroupInfo_.compileVecTypeHint_ = - kernelMD->mAttrs.mVecTypeHint.c_str(); - } + if (!kernelMD->mAttrs.mVecTypeHint.empty()) 
{ + workGroupInfo_.compileVecTypeHint_ = kernelMD->mAttrs.mVecTypeHint.c_str(); + } - // Copy wavefront size - workGroupInfo_.wavefrontSize_ = prog().isNull() ? 64 : - dev().properties().gfxipProperties.shaderCore.wavefrontSize; - // Find total workgroup size - if (workGroupInfo_.compileSize_[0] != 0) { - workGroupInfo_.size_ = - workGroupInfo_.compileSize_[0] * - workGroupInfo_.compileSize_[1] * - workGroupInfo_.compileSize_[2]; - } - else { - workGroupInfo_.size_ = dev().info().maxWorkGroupSize_; - } + // Copy wavefront size + workGroupInfo_.wavefrontSize_ = + prog().isNull() ? 64 : dev().properties().gfxipProperties.shaderCore.wavefrontSize; + // Find total workgroup size + if (workGroupInfo_.compileSize_[0] != 0) { + workGroupInfo_.size_ = workGroupInfo_.compileSize_[0] * workGroupInfo_.compileSize_[1] * + workGroupInfo_.compileSize_[2]; + } else { + workGroupInfo_.size_ = dev().info().maxWorkGroupSize_; + } - initPrintf(programMD->mPrintf); + initPrintf(programMD->mPrintf); - /*FIXME_lmoriche: - size_t sizeOfWavesPerSimdHint = sizeof(workGroupInfo_.wavesPerSimdHint_); - error = aclQueryInfo(dev().compiler(), prog().binaryElf(), - RT_WAVES_PER_SIMD_HINT, openClKernelName.c_str(), - &workGroupInfo_.wavesPerSimdHint_, &sizeOfWavesPerSimdHint); - if (error != ACL_SUCCESS) { - return false; - } + /*FIXME_lmoriche: + size_t sizeOfWavesPerSimdHint = sizeof(workGroupInfo_.wavesPerSimdHint_); + error = aclQueryInfo(dev().compiler(), prog().binaryElf(), + RT_WAVES_PER_SIMD_HINT, openClKernelName.c_str(), + &workGroupInfo_.wavesPerSimdHint_, &sizeOfWavesPerSimdHint); + if (error != ACL_SUCCESS) { + return false; + } - waveLimiter_.enable(); - */ + waveLimiter_.enable(); + */ - return true; + return true; } -#endif // defined(WITH_LIGHTNING_COMPILER) +#endif // defined(WITH_LIGHTNING_COMPILER) -} // namespace pal +} // namespace pal diff --git a/rocclr/runtime/device/pal/palkernel.hpp b/rocclr/runtime/device/pal/palkernel.hpp index 1f8ed92254..e1fc12d868 100644 --- 
a/rocclr/runtime/device/pal/palkernel.hpp +++ b/rocclr/runtime/device/pal/palkernel.hpp @@ -21,20 +21,20 @@ typedef llvm::AMDGPU::CodeObject::Kernel::Metadata KernelMD; typedef llvm::AMDGPU::CodeObject::Kernel::Arg::Metadata KernelArgMD; -#endif // defined(WITH_LIGHTNING_COMPILER) +#endif // defined(WITH_LIGHTNING_COMPILER) namespace amd { namespace hsa { namespace loader { class Symbol; -} // loader +} // loader namespace code { namespace Kernel { class Metadata; -} // Kernel -} // code -} // hsa -} // amd +} // Kernel +} // code +} // hsa +} // amd //! \namespace pal PAL Device Implementation namespace pal { @@ -49,249 +49,231 @@ class LightningProgram; * @{ */ -enum HSAIL_ADDRESS_QUALIFIER{ - HSAIL_ADDRESS_ERROR = 0, - HSAIL_ADDRESS_GLOBAL, - HSAIL_ADDRESS_LOCAL, - HSAIL_ADDRESS_CONSTANT, - HSAIL_MAX_ADDRESS_QUALIFIERS -} ; - -enum HSAIL_ARG_TYPE{ - HSAIL_ARGTYPE_ERROR = 0, - HSAIL_ARGTYPE_POINTER, - HSAIL_ARGTYPE_VALUE, - HSAIL_ARGTYPE_REFERENCE, - HSAIL_ARGTYPE_IMAGE, - HSAIL_ARGTYPE_SAMPLER, - HSAIL_ARGTYPE_QUEUE, - HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X, - HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y, - HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z, - HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER, - HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE, - HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION, - HSAIL_ARGTYPE_HIDDEN_NONE, - HSAIL_ARGMAX_ARG_TYPES +enum HSAIL_ADDRESS_QUALIFIER { + HSAIL_ADDRESS_ERROR = 0, + HSAIL_ADDRESS_GLOBAL, + HSAIL_ADDRESS_LOCAL, + HSAIL_ADDRESS_CONSTANT, + HSAIL_MAX_ADDRESS_QUALIFIERS }; -enum HSAIL_DATA_TYPE{ - HSAIL_DATATYPE_ERROR = 0, - HSAIL_DATATYPE_B1, - HSAIL_DATATYPE_B8, - HSAIL_DATATYPE_B16, - HSAIL_DATATYPE_B32, - HSAIL_DATATYPE_B64, - HSAIL_DATATYPE_S8, - HSAIL_DATATYPE_S16, - HSAIL_DATATYPE_S32, - HSAIL_DATATYPE_S64, - HSAIL_DATATYPE_U8, - HSAIL_DATATYPE_U16, - HSAIL_DATATYPE_U32, - HSAIL_DATATYPE_U64, - HSAIL_DATATYPE_F16, - HSAIL_DATATYPE_F32, - HSAIL_DATATYPE_F64, - HSAIL_DATATYPE_STRUCT, - HSAIL_DATATYPE_OPAQUE, - HSAIL_DATATYPE_MAX_TYPES +enum HSAIL_ARG_TYPE { + 
HSAIL_ARGTYPE_ERROR = 0, + HSAIL_ARGTYPE_POINTER, + HSAIL_ARGTYPE_VALUE, + HSAIL_ARGTYPE_REFERENCE, + HSAIL_ARGTYPE_IMAGE, + HSAIL_ARGTYPE_SAMPLER, + HSAIL_ARGTYPE_QUEUE, + HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X, + HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y, + HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z, + HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER, + HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE, + HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION, + HSAIL_ARGTYPE_HIDDEN_NONE, + HSAIL_ARGMAX_ARG_TYPES +}; + +enum HSAIL_DATA_TYPE { + HSAIL_DATATYPE_ERROR = 0, + HSAIL_DATATYPE_B1, + HSAIL_DATATYPE_B8, + HSAIL_DATATYPE_B16, + HSAIL_DATATYPE_B32, + HSAIL_DATATYPE_B64, + HSAIL_DATATYPE_S8, + HSAIL_DATATYPE_S16, + HSAIL_DATATYPE_S32, + HSAIL_DATATYPE_S64, + HSAIL_DATATYPE_U8, + HSAIL_DATATYPE_U16, + HSAIL_DATATYPE_U32, + HSAIL_DATATYPE_U64, + HSAIL_DATATYPE_F16, + HSAIL_DATATYPE_F32, + HSAIL_DATATYPE_F64, + HSAIL_DATATYPE_STRUCT, + HSAIL_DATATYPE_OPAQUE, + HSAIL_DATATYPE_MAX_TYPES }; enum HSAIL_ACCESS_TYPE { - HSAIL_ACCESS_TYPE_NONE = 0, - HSAIL_ACCESS_TYPE_RO, - HSAIL_ACCESS_TYPE_WO, - HSAIL_ACCESS_TYPE_RW + HSAIL_ACCESS_TYPE_NONE = 0, + HSAIL_ACCESS_TYPE_RO, + HSAIL_ACCESS_TYPE_WO, + HSAIL_ACCESS_TYPE_RW }; -class HSAILKernel : public device::Kernel -{ -public: - struct Argument - { - uint index_; //!< Argument's index in the OCL signature - std::string name_; //!< Argument's name - std::string typeName_; //!< Argument's type name - uint size_; //!< Size in bytes - uint alignment_; //!< Argument's alignment - uint pointeeAlignment_; //!< Alignment of the data pointed to - HSAIL_ARG_TYPE type_; //!< Type of the argument - HSAIL_ADDRESS_QUALIFIER addrQual_; //!< Address qualifier of the argument - HSAIL_DATA_TYPE dataType_; //!< The type of data - HSAIL_ACCESS_TYPE access_; //!< Access type for the argument +class HSAILKernel : public device::Kernel { + public: + struct Argument { + uint index_; //!< Argument's index in the OCL signature + std::string name_; //!< Argument's name + std::string typeName_; //!< 
Argument's type name + uint size_; //!< Size in bytes + uint alignment_; //!< Argument's alignment + uint pointeeAlignment_; //!< Alignment of the data pointed to + HSAIL_ARG_TYPE type_; //!< Type of the argument + HSAIL_ADDRESS_QUALIFIER addrQual_; //!< Address qualifier of the argument + HSAIL_DATA_TYPE dataType_; //!< The type of data + HSAIL_ACCESS_TYPE access_; //!< Access type for the argument + }; + + HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions); + + virtual ~HSAILKernel(); + + //! Initializes the metadata required for this kernel, + //! finalizes the kernel if needed + bool init(amd::hsa::loader::Symbol* sym, bool finalize = false); + + //! Returns true if memory is valid for execution + virtual bool validateMemory(uint idx, amd::Memory* amdMem) const; + + //! Returns the kernel argument list + const std::vector& arguments() const { return arguments_; } + + //! Returns a pointer to the hsail argument at the specified index + Argument* argumentAt(size_t index) const { + for (auto arg : arguments_) + if (arg->index_ == index) return arg; + assert(!"Should not reach here"); + return NULL; + } + + //! Returns GPU device object, associated with this kernel + const Device& dev() const; + + //! Returns HSA program associated with this kernel + const HSAILProgram& prog() const; + + //! Returns LDS size used in this kernel + uint32_t ldsSize() const { return cpuAqlCode_->workgroup_group_segment_byte_size; } + + //! Returns pointer on CPU to AQL code info + const amd_kernel_code_t* cpuAqlCode() const { return cpuAqlCode_; } + + //! Returns memory object with AQL code + uint64_t gpuAqlCode() const { return code_; } + + //! Returns size of AQL code + size_t aqlCodeSize() const { return codeSize_; } + + //! Returns the size of argument buffer + size_t argsBufferSize() const { return cpuAqlCode_->kernarg_segment_byte_size; } + + //! 
Returns spill reg size per workitem + int spillSegSize() const { return cpuAqlCode_->workitem_private_segment_byte_size; } + + //! Returns TRUE if kernel uses dynamic parallelism + bool dynamicParallelism() const { return (flags_.dynamicParallelism_) ? true : false; } + + //! Returns TRUE if kernel is internal kernel + bool isInternalKernel() const { return (flags_.internalKernel_) ? true : false; } + + //! Finds local workgroup size + void findLocalWorkSize(size_t workDim, //!< Work dimension + const amd::NDRange& gblWorkSize, //!< Global work size + amd::NDRange& lclWorkSize //!< Local work size + ) const; + + //! Returns AQL packet in CPU memory + //! if the kernel arguments were successfully loaded, otherwise NULL + hsa_kernel_dispatch_packet_t* loadArguments( + VirtualGPU& gpu, //!< Running GPU context + const amd::Kernel& kernel, //!< AMD kernel object + const amd::NDRangeContainer& sizes, //!< NDrange container + const_address parameters, //!< Application arguments for the kernel + bool nativeMem, //!< Native memory objects are passed + uint64_t vmDefQueue, //!< GPU VM default queue pointer + uint64_t* vmParentWrap, //!< GPU VM parent aql wrap object + std::vector& memList //!< Memory list for GSL/VidMM handles + ) const; + + + //! Returns pritnf info array + const std::vector& printfInfo() const { return printf_; } + + //! Returns the kernel index in the program + uint index() const { return index_; } + + //! Get profiling callback object + virtual amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice* vdev) { + return waveLimiter_.getProfilingCallback(vdev); + } + + //! Get waves per shader array to be used for kernel execution. + uint getWavesPerSH(const device::VirtualDevice* vdev) const { + return waveLimiter_.getWavesPerSH(vdev); + } + + private: + //! Disable copy constructor + HSAILKernel(const HSAILKernel&); + + //! Disable operator= + HSAILKernel& operator=(const HSAILKernel&); + + protected: + //! 
Creates AQL kernel HW info + bool aqlCreateHWInfo(amd::hsa::loader::Symbol* sym); + + //! Initializes arguments_ and the abstraction layer kernel parameters + void initArgList(const aclArgData* aclArg //!< List of ACL arguments + ); + + //! Initializes Hsail Argument metadata and info + void initHsailArgs(const aclArgData* aclArg //!< List of ACL arguments + ); + + //! Initializes Hsail Printf metadata and info + void initPrintf(const aclPrintfFmt* aclPrintf //!< List of ACL printfs + ); + + std::vector arguments_; //!< Vector list of HSAIL Arguments + std::string compileOptions_; //!< compile used for finalizing this kernel + amd_kernel_code_t* cpuAqlCode_; //!< AQL kernel code on CPU + const NullDevice& dev_; //!< GPU device object + const HSAILProgram& prog_; //!< Reference to the parent program + std::vector printf_; //!< Format strings for GPU printf support + uint index_; //!< Kernel index in the program + + uint64_t code_; //!< GPU memory pointer to the kernel + size_t codeSize_; //!< Size of ISA code + + union Flags { + struct { + uint imageEna_ : 1; //!< Kernel uses images + uint imageWriteEna_ : 1; //!< Kernel uses image writes + uint dynamicParallelism_ : 1; //!< Dynamic parallelism enabled + uint internalKernel_ : 1; //!< True: internal kernel }; + uint value_; + Flags() : value_(0) {} + } flags_; - HSAILKernel(std::string name, - HSAILProgram* prog, - std::string compileOptions); - - virtual ~HSAILKernel(); - - //! Initializes the metadata required for this kernel, - //! finalizes the kernel if needed - bool init(amd::hsa::loader::Symbol *sym, bool finalize = false); - - //! Returns true if memory is valid for execution - virtual bool validateMemory(uint idx, amd::Memory* amdMem) const; - - //! Returns the kernel argument list - const std::vector& arguments() const { return arguments_; } - - //! 
Returns a pointer to the hsail argument at the specified index - Argument* argumentAt(size_t index) const { - for (auto arg : arguments_) if (arg->index_ == index) return arg; - assert(!"Should not reach here"); - return NULL; - } - - //! Returns GPU device object, associated with this kernel - const Device& dev() const; - - //! Returns HSA program associated with this kernel - const HSAILProgram& prog() const; - - //! Returns LDS size used in this kernel - uint32_t ldsSize() const - { return cpuAqlCode_->workgroup_group_segment_byte_size; } - - //! Returns pointer on CPU to AQL code info - const amd_kernel_code_t* cpuAqlCode() const { return cpuAqlCode_; } - - //! Returns memory object with AQL code - uint64_t gpuAqlCode() const { return code_; } - - //! Returns size of AQL code - size_t aqlCodeSize() const { return codeSize_; } - - //! Returns the size of argument buffer - size_t argsBufferSize() const - { return cpuAqlCode_->kernarg_segment_byte_size; } - - //! Returns spill reg size per workitem - int spillSegSize() const - { return cpuAqlCode_->workitem_private_segment_byte_size; } - - //! Returns TRUE if kernel uses dynamic parallelism - bool dynamicParallelism() const - { return (flags_.dynamicParallelism_) ? true : false; } - - //! Returns TRUE if kernel is internal kernel - bool isInternalKernel() const - { return (flags_.internalKernel_) ? true : false; } - - //! Finds local workgroup size - void findLocalWorkSize( - size_t workDim, //!< Work dimension - const amd::NDRange& gblWorkSize,//!< Global work size - amd::NDRange& lclWorkSize //!< Local work size - ) const; - - //! Returns AQL packet in CPU memory - //! 
if the kernel arguments were successfully loaded, otherwise NULL - hsa_kernel_dispatch_packet_t* loadArguments( - VirtualGPU& gpu, //!< Running GPU context - const amd::Kernel& kernel, //!< AMD kernel object - const amd::NDRangeContainer& sizes, //!< NDrange container - const_address parameters, //!< Application arguments for the kernel - bool nativeMem, //!< Native memory objects are passed - uint64_t vmDefQueue, //!< GPU VM default queue pointer - uint64_t* vmParentWrap, //!< GPU VM parent aql wrap object - std::vector& memList //!< Memory list for GSL/VidMM handles - ) const; - - - //! Returns pritnf info array - const std::vector& printfInfo() const { return printf_; } - - //! Returns the kernel index in the program - uint index() const { return index_; } - - //! Get profiling callback object - virtual amd::ProfilingCallback* getProfilingCallback( - const device::VirtualDevice *vdev) { - return waveLimiter_.getProfilingCallback(vdev); - } - - //! Get waves per shader array to be used for kernel execution. - uint getWavesPerSH(const device::VirtualDevice *vdev) const { - return waveLimiter_.getWavesPerSH(vdev); - } - -private: - //! Disable copy constructor - HSAILKernel(const HSAILKernel&); - - //! Disable operator= - HSAILKernel& operator=(const HSAILKernel&); - -protected: - //! Creates AQL kernel HW info - bool aqlCreateHWInfo(amd::hsa::loader::Symbol *sym); - - //! Initializes arguments_ and the abstraction layer kernel parameters - void initArgList( - const aclArgData* aclArg //!< List of ACL arguments - ); - - //! Initializes Hsail Argument metadata and info - void initHsailArgs( - const aclArgData* aclArg //!< List of ACL arguments - ); - - //! 
Initializes Hsail Printf metadata and info - void initPrintf( - const aclPrintfFmt* aclPrintf //!< List of ACL printfs - ); - - std::vector arguments_; //!< Vector list of HSAIL Arguments - std::string compileOptions_; //!< compile used for finalizing this kernel - amd_kernel_code_t* cpuAqlCode_; //!< AQL kernel code on CPU - const NullDevice& dev_; //!< GPU device object - const HSAILProgram& prog_; //!< Reference to the parent program - std::vector printf_; //!< Format strings for GPU printf support - uint index_; //!< Kernel index in the program - - uint64_t code_; //!< GPU memory pointer to the kernel - size_t codeSize_; //!< Size of ISA code - - union Flags { - struct { - uint imageEna_: 1; //!< Kernel uses images - uint imageWriteEna_: 1; //!< Kernel uses image writes - uint dynamicParallelism_: 1; //!< Dynamic parallelism enabled - uint internalKernel_: 1; //!< True: internal kernel - }; - uint value_; - Flags(): value_(0) {} - } flags_; - - WaveLimiterManager waveLimiter_; //!< adaptively control number of waves + WaveLimiterManager waveLimiter_; //!< adaptively control number of waves }; #if defined(WITH_LIGHTNING_COMPILER) -class LightningKernel : public HSAILKernel -{ -public: - LightningKernel(const std::string& name, - HSAILProgram* prog, - const std::string& compileOptions - ): HSAILKernel(name, prog, compileOptions) - {} +class LightningKernel : public HSAILKernel { + public: + LightningKernel(const std::string& name, HSAILProgram* prog, const std::string& compileOptions) + : HSAILKernel(name, prog, compileOptions) {} - //! Returns Lightning program associated with this kernel - const LightningProgram& prog() const; + //! Returns Lightning program associated with this kernel + const LightningProgram& prog() const; - //! Initializes the metadata required for this kernel, - bool init(amd::hsa::loader::Symbol* symbol); + //! Initializes the metadata required for this kernel, + bool init(amd::hsa::loader::Symbol* symbol); - //! 
Initializes Hsail Argument metadata and info for LC - void initArgList(const KernelMD& kernelMD); + //! Initializes Hsail Argument metadata and info for LC + void initArgList(const KernelMD& kernelMD); - //! Initializes HSAIL Printf metadata and info for LC - void initPrintf(const std::vector& printfInfoStrings); + //! Initializes HSAIL Printf metadata and info for LC + void initPrintf(const std::vector& printfInfoStrings); }; -#endif // defined(WITH_LIGHTNING_COMPILER) +#endif // defined(WITH_LIGHTNING_COMPILER) /*@}*/} // namespace pal - diff --git a/rocclr/runtime/device/pal/palmemory.cpp b/rocclr/runtime/device/pal/palmemory.cpp index 039017b1fc..b8e017c51d 100644 --- a/rocclr/runtime/device/pal/palmemory.cpp +++ b/rocclr/runtime/device/pal/palmemory.cpp @@ -12,7 +12,7 @@ #include "amdocl/cl_d3d9_amd.hpp" #include "amdocl/cl_d3d10_amd.hpp" #include "amdocl/cl_d3d11_amd.hpp" -#endif //_WIN32 +#endif //_WIN32 #include "amdocl/cl_gl_amd.hpp" #include @@ -22,1279 +22,1136 @@ namespace pal { -Memory::Memory( - const Device& gpuDev, - amd::Memory& owner, - size_t size) - : device::Memory(owner) - , Resource(gpuDev, size) -{ - init(); +Memory::Memory(const Device& gpuDev, amd::Memory& owner, size_t size) + : device::Memory(owner), Resource(gpuDev, size) { + init(); - if (owner.parent() != nullptr) { - flags_ |= SubMemoryObject; - } + if (owner.parent() != nullptr) { + flags_ |= SubMemoryObject; + } } -Memory::Memory( - const Device& gpuDev, - size_t size) - : device::Memory(size) - , Resource(gpuDev, size) -{ - init(); +Memory::Memory(const Device& gpuDev, size_t size) : device::Memory(size), Resource(gpuDev, size) { + init(); } -Memory::Memory( - const Device& gpuDev, - amd::Memory& owner, - size_t width, - size_t height, - size_t depth, - cl_image_format format, - cl_mem_object_type imageType, - uint mipLevels - ) - : device::Memory(owner) - , Resource(gpuDev, width, height, depth, format, imageType, mipLevels) -{ - init(); +Memory::Memory(const Device& gpuDev, 
amd::Memory& owner, size_t width, size_t height, size_t depth, + cl_image_format format, cl_mem_object_type imageType, uint mipLevels) + : device::Memory(owner), Resource(gpuDev, width, height, depth, format, imageType, mipLevels) { + init(); - if (owner.parent() != nullptr) { - flags_ |= SubMemoryObject; - } + if (owner.parent() != nullptr) { + flags_ |= SubMemoryObject; + } } -Memory::Memory( - const Device& gpuDev, - size_t size, - size_t width, - size_t height, - size_t depth, - cl_image_format format, - cl_mem_object_type imageType, - uint mipLevels - ) - : device::Memory(size) - , Resource(gpuDev, width, height, depth, format, imageType, mipLevels) -{ - init(); +Memory::Memory(const Device& gpuDev, size_t size, size_t width, size_t height, size_t depth, + cl_image_format format, cl_mem_object_type imageType, uint mipLevels) + : device::Memory(size), Resource(gpuDev, width, height, depth, format, imageType, mipLevels) { + init(); } -void -Memory::init() -{ - indirectMapCount_ = 0; - interopType_ = InteropNone; - interopMemory_ = nullptr; - pinnedMemory_ = nullptr; - parent_ = nullptr; +void Memory::init() { + indirectMapCount_ = 0; + interopType_ = InteropNone; + interopMemory_ = nullptr; + pinnedMemory_ = nullptr; + parent_ = nullptr; } #ifdef _WIN32 -static HANDLE -getSharedHandle(IUnknown* pIface) -{ - // Sanity checks - assert(pIface != nullptr); +static HANDLE getSharedHandle(IUnknown* pIface) { + // Sanity checks + assert(pIface != nullptr); - HRESULT hRes; - HANDLE hShared; - IDXGIResource* pDxgiRes = nullptr; - if((hRes = (const_cast(pIface))->QueryInterface( - __uuidof(IDXGIResource), - (void**) &pDxgiRes)) != S_OK) { - return (HANDLE) 0; - } - if(!pDxgiRes) { - return (HANDLE) 0; - } - hRes = pDxgiRes->GetSharedHandle(&hShared); - pDxgiRes->Release(); - if(hRes != S_OK) { - return (HANDLE) 0; - } - return hShared; + HRESULT hRes; + HANDLE hShared; + IDXGIResource* pDxgiRes = nullptr; + if ((hRes = (const_cast(pIface)) + 
->QueryInterface(__uuidof(IDXGIResource), (void**)&pDxgiRes)) != S_OK) { + return (HANDLE)0; + } + if (!pDxgiRes) { + return (HANDLE)0; + } + hRes = pDxgiRes->GetSharedHandle(&hShared); + pDxgiRes->Release(); + if (hRes != S_OK) { + return (HANDLE)0; + } + return hShared; } -#endif //_WIN32 +#endif //_WIN32 -bool -Memory::create( - Resource::MemoryType memType, - Resource::CreateParams* params) -{ - bool result; - uint allocAttempt = 0; - // Reset the flag in case we reallocate the heap in local/remote - flags_ &= ~HostMemoryDirectAccess; +bool Memory::create(Resource::MemoryType memType, Resource::CreateParams* params) { + bool result; + uint allocAttempt = 0; + // Reset the flag in case we reallocate the heap in local/remote + flags_ &= ~HostMemoryDirectAccess; - do { - // Create a resource in CAL - result = Resource::create(memType, params); - if (!result) { - size_t freeMemory[2]; - // if requested memory is greater than available then exit the loop - dev().globalFreeMemory(freeMemory); + do { + // Create a resource in CAL + result = Resource::create(memType, params); + if (!result) { + size_t freeMemory[2]; + // if requested memory is greater than available then exit the loop + dev().globalFreeMemory(freeMemory); - // Local to Persistent - if (memoryType() == Local) { - // For dgpu freeMemory[0] reports a sum of visible+invisible fb - if (size() > (freeMemory[0] * Ki)) { - break; - } - memType = Persistent; - } - // Don't switch to USWC if persistent memory was explicitly asked - else if ((allocAttempt > 0) && (memoryType() == Persistent)) { - memType = RemoteUSWC; - } - // Remote cacheable to uncacheable - else if (memoryType() == Remote) { - memType = RemoteUSWC; - } - else if (dev().settings().apuSystem_ && memoryType() == RemoteUSWC) { - if (size() > (freeMemory[0] * Ki) || allocAttempt >= 2) { - break; - } - } - else { - break; - } - allocAttempt++; + // Local to Persistent + if (memoryType() == Local) { + // For dgpu freeMemory[0] reports a sum of 
visible+invisible fb + if (size() > (freeMemory[0] * Ki)) { + break; } - } while (!result); - - // Check if CAL created a resource - if (result) { - switch (memoryType()) { - case Resource::Pinned: - // Marks memory object for direct GPU access to the host memory - flags_ |= HostMemoryDirectAccess; - break; - case Resource::Remote: - case Resource::RemoteUSWC: - if (!desc().tiled_) { - // Marks memory object for direct GPU access to the host memory - flags_ |= HostMemoryDirectAccess; - } - break; - case Resource::View: { - Resource::ViewParams* view = - reinterpret_cast(params); - // Check if parent was allocated in system memory - if ((view->resource_->memoryType() == Resource::Pinned) || - (view->resource_->memoryType() == Resource::Remote) || - (view->resource_->memoryType() == Resource::RemoteUSWC)) { - // Marks memory object for direct GPU access to the host memory - flags_ |= HostMemoryDirectAccess; - } - if ((view->owner_ != nullptr) && (view->owner_->parent() != nullptr)) { - parent_ = reinterpret_cast(view->memory_); - flags_ |= SubMemoryObject; - } - break; + memType = Persistent; + } + // Don't switch to USWC if persistent memory was explicitly asked + else if ((allocAttempt > 0) && (memoryType() == Persistent)) { + memType = RemoteUSWC; + } + // Remote cacheable to uncacheable + else if (memoryType() == Remote) { + memType = RemoteUSWC; + } else if (dev().settings().apuSystem_ && memoryType() == RemoteUSWC) { + if (size() > (freeMemory[0] * Ki) || allocAttempt >= 2) { + break; } - case Resource::ImageView: { - Resource::ImageViewParams* view = - reinterpret_cast(params); - parent_ = reinterpret_cast(view->memory_); - flags_ |= SubMemoryObject | (parent_->flags_ & HostMemoryDirectAccess); - break; - } - case Resource::ImageBuffer: { - Resource::ImageBufferParams* view = - reinterpret_cast(params); - parent_ = reinterpret_cast(view->memory_); - flags_ |= SubMemoryObject | (parent_->flags_ & HostMemoryDirectAccess); - break; - } - default: - break; - } - } 
- - return result; -} - -bool Memory::processGLResource(GLResourceOP operation) -{ - bool retVal = false; - switch (operation) - { - case GLDecompressResource: - retVal = glAcquire(); + } else { break; - case GLInvalidateFBO: - retVal = glRelease(); - break; - default: - assert(false && "unknown GLResourceOP"); + } + allocAttempt++; } - return retVal; + } while (!result); + + // Check if CAL created a resource + if (result) { + switch (memoryType()) { + case Resource::Pinned: + // Marks memory object for direct GPU access to the host memory + flags_ |= HostMemoryDirectAccess; + break; + case Resource::Remote: + case Resource::RemoteUSWC: + if (!desc().tiled_) { + // Marks memory object for direct GPU access to the host memory + flags_ |= HostMemoryDirectAccess; + } + break; + case Resource::View: { + Resource::ViewParams* view = reinterpret_cast(params); + // Check if parent was allocated in system memory + if ((view->resource_->memoryType() == Resource::Pinned) || + (view->resource_->memoryType() == Resource::Remote) || + (view->resource_->memoryType() == Resource::RemoteUSWC)) { + // Marks memory object for direct GPU access to the host memory + flags_ |= HostMemoryDirectAccess; + } + if ((view->owner_ != nullptr) && (view->owner_->parent() != nullptr)) { + parent_ = reinterpret_cast(view->memory_); + flags_ |= SubMemoryObject; + } + break; + } + case Resource::ImageView: { + Resource::ImageViewParams* view = reinterpret_cast(params); + parent_ = reinterpret_cast(view->memory_); + flags_ |= SubMemoryObject | (parent_->flags_ & HostMemoryDirectAccess); + break; + } + case Resource::ImageBuffer: { + Resource::ImageBufferParams* view = reinterpret_cast(params); + parent_ = reinterpret_cast(view->memory_); + flags_ |= SubMemoryObject | (parent_->flags_ & HostMemoryDirectAccess); + break; + } + default: + break; + } + } + + return result; } -bool -Memory::createInterop(InteropType type) -{ - Resource::MemoryType memType = Resource::Empty; - Resource::OGLInteropParams 
oglRes; +bool Memory::processGLResource(GLResourceOP operation) { + bool retVal = false; + switch (operation) { + case GLDecompressResource: + retVal = glAcquire(); + break; + case GLInvalidateFBO: + retVal = glRelease(); + break; + default: + assert(false && "unknown GLResourceOP"); + } + return retVal; +} + +bool Memory::createInterop(InteropType type) { + Resource::MemoryType memType = Resource::Empty; + Resource::OGLInteropParams oglRes; #ifdef _WIN32 - Resource::D3DInteropParams d3dRes; -#endif //_WIN32 + Resource::D3DInteropParams d3dRes; +#endif //_WIN32 - // Only external objects support interop - assert(owner() != nullptr); + // Only external objects support interop + assert(owner() != nullptr); - Resource::CreateParams* createParams = nullptr; + Resource::CreateParams* createParams = nullptr; - amd::InteropObject* interop = owner()->getInteropObj(); - assert((interop != nullptr) && "An invalid interop object is impossible!"); + amd::InteropObject* interop = owner()->getInteropObj(); + assert((interop != nullptr) && "An invalid interop object is impossible!"); - amd::GLObject* glObject = interop->asGLObject(); + amd::GLObject* glObject = interop->asGLObject(); #ifdef _WIN32 - amd::D3D10Object* d3d10Object = interop->asD3D10Object(); - amd::D3D11Object* d3d11Object = interop->asD3D11Object(); - amd::D3D9Object* d3d9Object = interop->asD3D9Object(); + amd::D3D10Object* d3d10Object = interop->asD3D10Object(); + amd::D3D11Object* d3d11Object = interop->asD3D11Object(); + amd::D3D9Object* d3d9Object = interop->asD3D9Object(); - if (d3d10Object != nullptr) { - createParams = &d3dRes; + if (d3d10Object != nullptr) { + createParams = &d3dRes; - d3dRes.owner_ = owner(); + d3dRes.owner_ = owner(); - const amd::D3D10ObjDesc_t* objDesc = d3d10Object->getObjDesc(); + const amd::D3D10ObjDesc_t* objDesc = d3d10Object->getObjDesc(); - memType = Resource::D3D10Interop; + memType = Resource::D3D10Interop; - // Get shared handle - if ((d3dRes.handle_ = - 
getSharedHandle(d3d10Object->getD3D10Resource()))) { - d3dRes.iDirect3D_ = static_cast - (d3d10Object->getD3D10Resource()); - d3dRes.type_ = Resource::InteropTypeless; - } - - d3dRes.misc = 0; - // Find D3D10 object type - switch (objDesc->objDim_) { - case D3D10_RESOURCE_DIMENSION_BUFFER: - d3dRes.type_ = Resource::InteropVertexBuffer; - break; - case D3D10_RESOURCE_DIMENSION_TEXTURE1D: - case D3D10_RESOURCE_DIMENSION_TEXTURE2D: - case D3D10_RESOURCE_DIMENSION_TEXTURE3D: - d3dRes.type_ = Resource::InteropTexture; - if (objDesc->mipLevels_ > 1) { - d3dRes.type_ = Resource::InteropTextureViewLevel; - - if (objDesc->arraySize_ > 1) { - d3dRes.layer_ = d3d10Object->getSubresource() / - objDesc->mipLevels_; - d3dRes.mipLevel_ = d3d10Object->getSubresource() % - objDesc->mipLevels_; - } - else { - d3dRes.layer_ = 0; - d3dRes.mipLevel_ = d3d10Object->getSubresource(); - } - } - break; - default: - return false; - break; - } + // Get shared handle + if ((d3dRes.handle_ = getSharedHandle(d3d10Object->getD3D10Resource()))) { + d3dRes.iDirect3D_ = static_cast(d3d10Object->getD3D10Resource()); + d3dRes.type_ = Resource::InteropTypeless; } - else if (d3d11Object != nullptr) { - createParams = &d3dRes; - d3dRes.owner_ = owner(); + d3dRes.misc = 0; + // Find D3D10 object type + switch (objDesc->objDim_) { + case D3D10_RESOURCE_DIMENSION_BUFFER: + d3dRes.type_ = Resource::InteropVertexBuffer; + break; + case D3D10_RESOURCE_DIMENSION_TEXTURE1D: + case D3D10_RESOURCE_DIMENSION_TEXTURE2D: + case D3D10_RESOURCE_DIMENSION_TEXTURE3D: + d3dRes.type_ = Resource::InteropTexture; + if (objDesc->mipLevels_ > 1) { + d3dRes.type_ = Resource::InteropTextureViewLevel; - const amd::D3D11ObjDesc_t* objDesc = d3d11Object->getObjDesc(); - - memType = Resource::D3D11Interop; - - // Get shared handle - if ((d3dRes.handle_ = - getSharedHandle(d3d11Object->getD3D11Resource()))) { - d3dRes.iDirect3D_ = static_cast - (d3d11Object->getD3D11Resource()); - d3dRes.type_ = Resource::InteropTypeless; + if 
(objDesc->arraySize_ > 1) { + d3dRes.layer_ = d3d10Object->getSubresource() / objDesc->mipLevels_; + d3dRes.mipLevel_ = d3d10Object->getSubresource() % objDesc->mipLevels_; + } else { + d3dRes.layer_ = 0; + d3dRes.mipLevel_ = d3d10Object->getSubresource(); + } } - - d3dRes.misc = 0; - // Find D3D11 object type - switch (objDesc->objDim_) { - case D3D11_RESOURCE_DIMENSION_BUFFER: - d3dRes.type_ = Resource::InteropVertexBuffer; - break; - case D3D11_RESOURCE_DIMENSION_TEXTURE1D: - case D3D11_RESOURCE_DIMENSION_TEXTURE2D: - case D3D11_RESOURCE_DIMENSION_TEXTURE3D: - d3dRes.type_ = Resource::InteropTexture; - d3dRes.layer_= d3d11Object->getPlane(); - d3dRes.misc = d3d11Object->getMiscFlag(); - if (objDesc->mipLevels_ > 1) { - d3dRes.type_ = Resource::InteropTextureViewLevel; - - if (objDesc->arraySize_ > 1) { - d3dRes.layer_ = d3d11Object->getSubresource() / - objDesc->mipLevels_; - d3dRes.mipLevel_ = d3d11Object->getSubresource() % - objDesc->mipLevels_; - } - else { - d3dRes.layer_ = 0; - d3dRes.mipLevel_ = d3d11Object->getSubresource(); - } - } - break; - default: - return false; - break; - } - } - else if (d3d9Object != nullptr) { - createParams = &d3dRes; - - d3dRes.owner_ = owner(); - - const amd::D3D9ObjDesc_t* objDesc = d3d9Object->getObjDesc(); - - memType = Resource::D3D9Interop; - - // Get shared handle - if ((d3dRes.handle_ = d3d9Object->getD3D9SharedHandle())) { - d3dRes.iDirect3D_ = static_cast - (d3d9Object->getD3D9Resource()); - d3dRes.type_ = Resource::InteropSurface; - d3dRes.mipLevel_ = 0; - d3dRes.layer_ = d3d9Object->getPlane(); - d3dRes.misc = d3d9Object->getMiscFlag(); - } - } - else -#endif //_WIN32 - if (glObject != nullptr) { - createParams = &oglRes; - - oglRes.owner_ = owner(); - - memType = Resource::OGLInterop; - - // Fill the interop creation parameters - oglRes.handle_ = static_cast(glObject->getGLName()); - - // Find OGL object type - switch (glObject->getCLGLObjectType()) { - case CL_GL_OBJECT_BUFFER: - oglRes.type_ = 
Resource::InteropVertexBuffer; - break; - case CL_GL_OBJECT_TEXTURE_BUFFER: - case CL_GL_OBJECT_TEXTURE1D: - case CL_GL_OBJECT_TEXTURE1D_ARRAY: - case CL_GL_OBJECT_TEXTURE2D: - case CL_GL_OBJECT_TEXTURE2D_ARRAY: - case CL_GL_OBJECT_TEXTURE3D: - oglRes.type_ = Resource::InteropTexture; - if (GL_TEXTURE_CUBE_MAP == glObject->getGLTarget()) { - switch (glObject->getCubemapFace()) { - case GL_TEXTURE_CUBE_MAP_POSITIVE_X: - case GL_TEXTURE_CUBE_MAP_NEGATIVE_X: - case GL_TEXTURE_CUBE_MAP_POSITIVE_Y: - case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y: - case GL_TEXTURE_CUBE_MAP_POSITIVE_Z: - case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z: - oglRes.type_ = Resource::InteropTextureViewCube; - oglRes.layer_ = - glObject->getCubemapFace() - GL_TEXTURE_CUBE_MAP_POSITIVE_X; - oglRes.mipLevel_ = glObject->getGLMipLevel(); - break; - default: - break; - } - } - else if (glObject->getGLMipLevel() != 0) { - oglRes.type_ = Resource::InteropTextureViewLevel; - oglRes.layer_ = 0; - oglRes.mipLevel_ = glObject->getGLMipLevel(); - } - break; - case CL_GL_OBJECT_RENDERBUFFER: - oglRes.type_ = Resource::InteropRenderBuffer; - break; - default: - return false; - break; - } - - oglRes.glPlatformContext_ = owner()->getContext().info().hCtx_; - oglRes.glDeviceContext_ = owner()->getContext().info().hDev_[amd::Context::DeviceFlagIdx::GLDeviceKhrIdx]; - // We dont pass any flags here for the GL Resource. 
- oglRes.flags_ = 0; - } - else { + break; + default: return false; + break; + } + } else if (d3d11Object != nullptr) { + createParams = &d3dRes; + + d3dRes.owner_ = owner(); + + const amd::D3D11ObjDesc_t* objDesc = d3d11Object->getObjDesc(); + + memType = Resource::D3D11Interop; + + // Get shared handle + if ((d3dRes.handle_ = getSharedHandle(d3d11Object->getD3D11Resource()))) { + d3dRes.iDirect3D_ = static_cast(d3d11Object->getD3D11Resource()); + d3dRes.type_ = Resource::InteropTypeless; } - // Get the interop settings - if (type == InteropDirectAccess) { - // Create memory object - if (!create(memType, createParams)) { - return false; + d3dRes.misc = 0; + // Find D3D11 object type + switch (objDesc->objDim_) { + case D3D11_RESOURCE_DIMENSION_BUFFER: + d3dRes.type_ = Resource::InteropVertexBuffer; + break; + case D3D11_RESOURCE_DIMENSION_TEXTURE1D: + case D3D11_RESOURCE_DIMENSION_TEXTURE2D: + case D3D11_RESOURCE_DIMENSION_TEXTURE3D: + d3dRes.type_ = Resource::InteropTexture; + d3dRes.layer_ = d3d11Object->getPlane(); + d3dRes.misc = d3d11Object->getMiscFlag(); + if (objDesc->mipLevels_ > 1) { + d3dRes.type_ = Resource::InteropTextureViewLevel; + + if (objDesc->arraySize_ > 1) { + d3dRes.layer_ = d3d11Object->getSubresource() / objDesc->mipLevels_; + d3dRes.mipLevel_ = d3d11Object->getSubresource() % objDesc->mipLevels_; + } else { + d3dRes.layer_ = 0; + d3dRes.mipLevel_ = d3d11Object->getSubresource(); + } } + break; + default: + return false; + break; } - else { - // Allocate Resource object for interop as buffer - interopMemory_ = new Memory(dev(), size()); + } else if (d3d9Object != nullptr) { + createParams = &d3dRes; - // Create the interop object in CAL - if (nullptr == interopMemory_ || !interopMemory_->create(memType, createParams)) { - delete interopMemory_; - interopMemory_ = nullptr; - return false; + d3dRes.owner_ = owner(); + + const amd::D3D9ObjDesc_t* objDesc = d3d9Object->getObjDesc(); + + memType = Resource::D3D9Interop; + + // Get shared handle 
+ if ((d3dRes.handle_ = d3d9Object->getD3D9SharedHandle())) { + d3dRes.iDirect3D_ = static_cast(d3d9Object->getD3D9Resource()); + d3dRes.type_ = Resource::InteropSurface; + d3dRes.mipLevel_ = 0; + d3dRes.layer_ = d3d9Object->getPlane(); + d3dRes.misc = d3d9Object->getMiscFlag(); + } + } else +#endif //_WIN32 + if (glObject != nullptr) { + createParams = &oglRes; + + oglRes.owner_ = owner(); + + memType = Resource::OGLInterop; + + // Fill the interop creation parameters + oglRes.handle_ = static_cast(glObject->getGLName()); + + // Find OGL object type + switch (glObject->getCLGLObjectType()) { + case CL_GL_OBJECT_BUFFER: + oglRes.type_ = Resource::InteropVertexBuffer; + break; + case CL_GL_OBJECT_TEXTURE_BUFFER: + case CL_GL_OBJECT_TEXTURE1D: + case CL_GL_OBJECT_TEXTURE1D_ARRAY: + case CL_GL_OBJECT_TEXTURE2D: + case CL_GL_OBJECT_TEXTURE2D_ARRAY: + case CL_GL_OBJECT_TEXTURE3D: + oglRes.type_ = Resource::InteropTexture; + if (GL_TEXTURE_CUBE_MAP == glObject->getGLTarget()) { + switch (glObject->getCubemapFace()) { + case GL_TEXTURE_CUBE_MAP_POSITIVE_X: + case GL_TEXTURE_CUBE_MAP_NEGATIVE_X: + case GL_TEXTURE_CUBE_MAP_POSITIVE_Y: + case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y: + case GL_TEXTURE_CUBE_MAP_POSITIVE_Z: + case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z: + oglRes.type_ = Resource::InteropTextureViewCube; + oglRes.layer_ = glObject->getCubemapFace() - GL_TEXTURE_CUBE_MAP_POSITIVE_X; + oglRes.mipLevel_ = glObject->getGLMipLevel(); + break; + default: + break; + } + } else if (glObject->getGLMipLevel() != 0) { + oglRes.type_ = Resource::InteropTextureViewLevel; + oglRes.layer_ = 0; + oglRes.mipLevel_ = glObject->getGLMipLevel(); } + break; + case CL_GL_OBJECT_RENDERBUFFER: + oglRes.type_ = Resource::InteropRenderBuffer; + break; + default: + return false; + break; } - setInteropType(type); + oglRes.glPlatformContext_ = owner()->getContext().info().hCtx_; + oglRes.glDeviceContext_ = + owner()->getContext().info().hDev_[amd::Context::DeviceFlagIdx::GLDeviceKhrIdx]; + // We dont pass 
any flags here for the GL Resource. + oglRes.flags_ = 0; + } else { + return false; + } - return true; + // Get the interop settings + if (type == InteropDirectAccess) { + // Create memory object + if (!create(memType, createParams)) { + return false; + } + } else { + // Allocate Resource object for interop as buffer + interopMemory_ = new Memory(dev(), size()); + + // Create the interop object in CAL + if (nullptr == interopMemory_ || !interopMemory_->create(memType, createParams)) { + delete interopMemory_; + interopMemory_ = nullptr; + return false; + } + } + + setInteropType(type); + + return true; } -Memory::~Memory() -{ - // Clean VA cache - dev().removeVACache(this); +Memory::~Memory() { + // Clean VA cache + dev().removeVACache(this); - delete interopMemory_; + delete interopMemory_; - // Release associated map target, if any - if (nullptr != mapMemory_) { - mapMemory()->unmap(nullptr); - mapMemory_->release(); + // Release associated map target, if any + if (nullptr != mapMemory_) { + mapMemory()->unmap(nullptr); + mapMemory_->release(); + } + + // Destory pinned memory + if (flags_ & PinnedMemoryAlloced) { + delete pinnedMemory_; + } + + if ((owner() != nullptr) && isHostMemDirectAccess() && !(flags_ & SubMemoryObject) && + (memoryType() != Resource::ExternalPhysical)) { + // Unmap memory if direct access was requested + unmap(nullptr); + } +} + +void Memory::syncCacheFromHost(VirtualGPU& gpu, device::Memory::SyncFlags syncFlags) { + // If the last writer was another GPU, then make a writeback + if (!isHostMemDirectAccess() && (owner()->getLastWriter() != nullptr) && + (&dev() != owner()->getLastWriter())) { + mgpuCacheWriteBack(); + } + + // If host memory doesn't have direct access, then we have to synchronize + if (!isHostMemDirectAccess() && (nullptr != owner()->getHostMem())) { + bool hasUpdates = true; + + // Make sure the parent of subbuffer is up to date + if (!syncFlags.skipParent_ && (flags_ & SubMemoryObject)) { + pal::Memory* gpuMemory = 
dev().getGpuMemory(owner()->parent()); + + //! \note: Skipping the sync for a view doesn't reflect the parent settings, + //! since a view is a small portion of parent + device::Memory::SyncFlags syncFlagsTmp; + + // Sync parent from a view, so views have to be skipped + syncFlagsTmp.skipViews_ = true; + + // Make sure the parent sync is an unique operation. + // If the app uses multiple subbuffers from multiple queues, + // then the parent sync can be called from multiple threads + amd::ScopedLock lock(owner()->parent()->lockMemoryOps()); + gpuMemory->syncCacheFromHost(gpu, syncFlagsTmp); + //! \note Don't do early exit here, since we still have to sync + //! this view, if the parent sync operation was a NOP. + //! If parent was synchronized, then this view sync will be a NOP } - // Destory pinned memory + // Is this a NOP? + if ((version_ == owner()->getVersion()) || (&dev() == owner()->getLastWriter())) { + hasUpdates = false; + } + + // Update all available views, since we sync the parent + if ((owner()->subBuffers().size() != 0) && (hasUpdates || !syncFlags.skipViews_)) { + device::Memory::SyncFlags syncFlagsTmp; + + // Sync views from parent, so parent has to be skipped + syncFlagsTmp.skipParent_ = true; + + if (hasUpdates) { + // Parent will be synced so update all views with a skip + syncFlagsTmp.skipEntire_ = true; + } else { + // Passthrough the skip entire flag to the views, since + // any view is a submemory of the parent + syncFlagsTmp.skipEntire_ = syncFlags.skipEntire_; + } + + amd::ScopedLock lock(owner()->lockMemoryOps()); + for (auto& sub : owner()->subBuffers()) { + //! \note Don't allow subbuffer's allocation in the worker thread. + //! It may cause a system lock, because possible resource + //! 
destruction, heap reallocation or subbuffer allocation + static const bool AllocSubBuffer = false; + device::Memory* devSub = sub->getDeviceMemory(dev(), AllocSubBuffer); + if (nullptr != devSub) { + pal::Memory* gpuSub = reinterpret_cast(devSub); + gpuSub->syncCacheFromHost(gpu, syncFlagsTmp); + } + } + } + + // Make sure we didn't have a NOP, + // because this GPU device was the last writer + if (&dev() != owner()->getLastWriter()) { + // Update the latest version + version_ = owner()->getVersion(); + } + + // Exit if sync is a NOP or sync can be skipped + if (!hasUpdates || syncFlags.skipEntire_) { + return; + } + + bool result = false; + static const bool Entire = true; + amd::Coord3D origin(0, 0, 0); + + // If host memory was pinned then make a transfer if (flags_ & PinnedMemoryAlloced) { - delete pinnedMemory_; + if (desc().buffer_) { + amd::Coord3D region(owner()->getSize()); + result = gpu.blitMgr().copyBuffer(*pinnedMemory_, *this, origin, origin, region, Entire); + } else { + amd::Image& image = static_cast(*owner()); + result = gpu.blitMgr().copyBufferToImage(*pinnedMemory_, *this, origin, origin, + image.getRegion(), Entire, image.getRowPitch(), + image.getSlicePitch()); + } } - if ((owner() != nullptr) && isHostMemDirectAccess() && - !(flags_ & SubMemoryObject) && - (memoryType() != Resource::ExternalPhysical)) { - // Unmap memory if direct access was requested - unmap(nullptr); + if (!result) { + if (desc().buffer_) { + amd::Coord3D region(owner()->getSize()); + result = gpu.blitMgr().writeBuffer(owner()->getHostMem(), *this, origin, region, Entire); + } else { + amd::Image& image = static_cast(*owner()); + result = gpu.blitMgr().writeImage(owner()->getHostMem(), *this, origin, image.getRegion(), + image.getRowPitch(), image.getSlicePitch(), Entire); + } } + + //!@todo A wait isn't really necessary. However processMemObjects() + // may lose the track of dependencies with a compute transfer(if sdma failed). 
+ wait(gpu); + + // Should never fail + assert(result && "Memory synchronization failed!"); + } } -void -Memory::syncCacheFromHost(VirtualGPU& gpu, device::Memory::SyncFlags syncFlags) -{ - // If the last writer was another GPU, then make a writeback - if (!isHostMemDirectAccess() && - (owner()->getLastWriter() != nullptr) && - (&dev() != owner()->getLastWriter())) { - mgpuCacheWriteBack(); +void Memory::syncHostFromCache(device::Memory::SyncFlags syncFlags) { + // Sanity checks + assert(owner() != nullptr); + + // If host memory doesn't have direct access, then we have to synchronize + if (!isHostMemDirectAccess()) { + bool hasUpdates = true; + + // Make sure the parent of subbuffer is up to date + if (!syncFlags.skipParent_ && (flags_ & SubMemoryObject)) { + device::Memory* m = owner()->parent()->getDeviceMemory(dev()); + + //! \note: Skipping the sync for a view doesn't reflect the parent settings, + //! since a view is a small portion of parent + device::Memory::SyncFlags syncFlagsTmp; + + // Sync parent from a view, so views have to be skipped + syncFlagsTmp.skipViews_ = true; + + // Make sure the parent sync is an unique operation. + // If the app uses multiple subbuffers from multiple queues, + // then the parent sync can be called from multiple threads + amd::ScopedLock lock(owner()->parent()->lockMemoryOps()); + m->syncHostFromCache(syncFlagsTmp); + //! \note Don't do early exit here, since we still have to sync + //! this view, if the parent sync operation was a NOP. + //! If parent was synchronized, then this view sync will be a NOP } - // If host memory doesn't have direct access, then we have to synchronize - if (!isHostMemDirectAccess() && (nullptr != owner()->getHostMem())) { - bool hasUpdates = true; - - // Make sure the parent of subbuffer is up to date - if (!syncFlags.skipParent_ && (flags_ & SubMemoryObject)) { - pal::Memory* gpuMemory = dev().getGpuMemory(owner()->parent()); - - //! 
\note: Skipping the sync for a view doesn't reflect the parent settings, - //! since a view is a small portion of parent - device::Memory::SyncFlags syncFlagsTmp; - - // Sync parent from a view, so views have to be skipped - syncFlagsTmp.skipViews_ = true; - - // Make sure the parent sync is an unique operation. - // If the app uses multiple subbuffers from multiple queues, - // then the parent sync can be called from multiple threads - amd::ScopedLock lock(owner()->parent()->lockMemoryOps()); - gpuMemory->syncCacheFromHost(gpu, syncFlagsTmp); - //! \note Don't do early exit here, since we still have to sync - //! this view, if the parent sync operation was a NOP. - //! If parent was synchronized, then this view sync will be a NOP - } - - // Is this a NOP? - if ((version_ == owner()->getVersion()) || - (&dev() == owner()->getLastWriter())) { - hasUpdates = false; - } - - // Update all available views, since we sync the parent - if ((owner()->subBuffers().size() != 0) && - (hasUpdates || !syncFlags.skipViews_)) { - device::Memory::SyncFlags syncFlagsTmp; - - // Sync views from parent, so parent has to be skipped - syncFlagsTmp.skipParent_ = true; - - if (hasUpdates) { - // Parent will be synced so update all views with a skip - syncFlagsTmp.skipEntire_ = true; - } - else { - // Passthrough the skip entire flag to the views, since - // any view is a submemory of the parent - syncFlagsTmp.skipEntire_ = syncFlags.skipEntire_; - } - - amd::ScopedLock lock(owner()->lockMemoryOps()); - for (auto& sub : owner()->subBuffers()) { - //! \note Don't allow subbuffer's allocation in the worker thread. - //! It may cause a system lock, because possible resource - //! 
destruction, heap reallocation or subbuffer allocation - static const bool AllocSubBuffer = false; - device::Memory* devSub = - sub->getDeviceMemory(dev(), AllocSubBuffer); - if (nullptr != devSub) { - pal::Memory* gpuSub = reinterpret_cast(devSub); - gpuSub->syncCacheFromHost(gpu, syncFlagsTmp); - } - } - } - - // Make sure we didn't have a NOP, - // because this GPU device was the last writer - if (&dev() != owner()->getLastWriter()) { - // Update the latest version - version_ = owner()->getVersion(); - } - - // Exit if sync is a NOP or sync can be skipped - if (!hasUpdates || syncFlags.skipEntire_) { - return; - } - - bool result = false; - static const bool Entire = true; - amd::Coord3D origin(0, 0, 0); - - // If host memory was pinned then make a transfer - if (flags_ & PinnedMemoryAlloced) { - if (desc().buffer_) { - amd::Coord3D region(owner()->getSize()); - result = gpu.blitMgr().copyBuffer(*pinnedMemory_, - *this, origin, origin, region, Entire); - } - else { - amd::Image& image = static_cast(*owner()); - result = gpu.blitMgr().copyBufferToImage(*pinnedMemory_, - *this, origin, origin, image.getRegion(), Entire, - image.getRowPitch(), image.getSlicePitch()); - } - } - - if (!result) { - if (desc().buffer_) { - amd::Coord3D region(owner()->getSize()); - result = gpu.blitMgr().writeBuffer(owner()->getHostMem(), - *this, origin, region, Entire); - } - else { - amd::Image& image = static_cast(*owner()); - result = gpu.blitMgr().writeImage(owner()->getHostMem(), - *this, origin, image.getRegion(), - image.getRowPitch(), image.getSlicePitch(), Entire); - } - } - - //!@todo A wait isn't really necessary. However processMemObjects() - // may lose the track of dependencies with a compute transfer(if sdma failed). - wait(gpu); - - // Should never fail - assert(result && "Memory synchronization failed!"); + // Is this a NOP? 
+ if ((nullptr == owner()->getLastWriter()) || (version_ == owner()->getVersion())) { + hasUpdates = false; } + + // Update all available views, since we sync the parent + if ((owner()->subBuffers().size() != 0) && (hasUpdates || !syncFlags.skipViews_)) { + device::Memory::SyncFlags syncFlagsTmp; + + // Sync views from parent, so parent has to be skipped + syncFlagsTmp.skipParent_ = true; + + if (hasUpdates) { + // Parent will be synced so update all views with a skip + syncFlagsTmp.skipEntire_ = true; + } else { + // Passthrough the skip entire flag to the views, since + // any view is a submemory of the parent + syncFlagsTmp.skipEntire_ = syncFlags.skipEntire_; + } + + amd::ScopedLock lock(owner()->lockMemoryOps()); + for (auto& sub : owner()->subBuffers()) { + //! \note Don't allow subbuffer's allocation in the worker thread. + //! It may cause a system lock, because possible resource + //! destruction, heap reallocation or subbuffer allocation + static const bool AllocSubBuffer = false; + device::Memory* devSub = sub->getDeviceMemory(dev(), AllocSubBuffer); + if (nullptr != devSub) { + pal::Memory* gpuSub = reinterpret_cast(devSub); + gpuSub->syncHostFromCache(syncFlagsTmp); + } + } + } + + // Make sure we didn't have a NOP, + // because CPU was the last writer + if (nullptr != owner()->getLastWriter()) { + // Mark parent as up to date, set our version accordingly + version_ = owner()->getVersion(); + } + + // Exit if sync is a NOP or sync can be skipped + if (!hasUpdates || syncFlags.skipEntire_) { + return; + } + + bool result = false; + static const bool Entire = true; + amd::Coord3D origin(0, 0, 0); + + // If backing store was pinned then make a transfer + if (flags_ & PinnedMemoryAlloced) { + if (desc().buffer_) { + amd::Coord3D region(owner()->getSize()); + result = dev().xferMgr().copyBuffer(*this, *pinnedMemory_, origin, origin, region, Entire); + } else { + amd::Image& image = static_cast(*owner()); + result = dev().xferMgr().copyImageToBuffer(*this, 
*pinnedMemory_, origin, origin, + image.getRegion(), Entire, image.getRowPitch(), + image.getSlicePitch()); + } + } + + // Just do a basic host read + if (!result) { + if (desc().buffer_) { + amd::Coord3D region(owner()->getSize()); + result = dev().xferMgr().readBuffer(*this, owner()->getHostMem(), origin, region, Entire); + } else { + amd::Image& image = static_cast(*owner()); + result = dev().xferMgr().readImage(*this, owner()->getHostMem(), origin, image.getRegion(), + image.getRowPitch(), image.getSlicePitch(), Entire); + } + } + + // Should never fail + assert(result && "Memory synchronization failed!"); + } } -void -Memory::syncHostFromCache(device::Memory::SyncFlags syncFlags) -{ - // Sanity checks - assert(owner() != nullptr); +pal::Memory* Memory::createBufferView(amd::Memory& subBufferOwner) { + pal::Memory* viewMemory; + Resource::ViewParams params; - // If host memory doesn't have direct access, then we have to synchronize - if (!isHostMemDirectAccess()) { - bool hasUpdates = true; + size_t offset = subBufferOwner.getOrigin(); + size_t size = subBufferOwner.getSize(); - // Make sure the parent of subbuffer is up to date - if (!syncFlags.skipParent_ && (flags_ & SubMemoryObject)) { - device::Memory* m = owner()->parent()->getDeviceMemory(dev()); + // Create a memory object + viewMemory = new pal::Memory(dev(), subBufferOwner, size); + if (nullptr == viewMemory) { + return nullptr; + } - //! \note: Skipping the sync for a view doesn't reflect the parent settings, - //! 
since a view is a small portion of parent - device::Memory::SyncFlags syncFlagsTmp; + params.owner_ = &subBufferOwner; + params.gpu_ = static_cast(subBufferOwner.getVirtualDevice()); + params.offset_ = offset; + params.size_ = size; + params.resource_ = this; + params.memory_ = this; + if (!viewMemory->create(Resource::View, ¶ms)) { + delete viewMemory; + return nullptr; + } - // Sync parent from a view, so views have to be skipped - syncFlagsTmp.skipViews_ = true; + // Explicitly set the host memory location, + // because the parent location could change after reallocation + if (nullptr != owner()->getHostMem()) { + subBufferOwner.setHostMem(reinterpret_cast(owner()->getHostMem()) + offset); + } else { + subBufferOwner.setHostMem(nullptr); + } - // Make sure the parent sync is an unique operation. - // If the app uses multiple subbuffers from multiple queues, - // then the parent sync can be called from multiple threads - amd::ScopedLock lock(owner()->parent()->lockMemoryOps()); - m->syncHostFromCache(syncFlagsTmp); - //! \note Don't do early exit here, since we still have to sync - //! this view, if the parent sync operation was a NOP. - //! If parent was synchronized, then this view sync will be a NOP - } - - // Is this a NOP? 
- if ((nullptr == owner()->getLastWriter()) || - (version_ == owner()->getVersion())) { - hasUpdates = false; - } - - // Update all available views, since we sync the parent - if ((owner()->subBuffers().size() != 0) && - (hasUpdates || !syncFlags.skipViews_)) { - device::Memory::SyncFlags syncFlagsTmp; - - // Sync views from parent, so parent has to be skipped - syncFlagsTmp.skipParent_ = true; - - if (hasUpdates) { - // Parent will be synced so update all views with a skip - syncFlagsTmp.skipEntire_ = true; - } - else { - // Passthrough the skip entire flag to the views, since - // any view is a submemory of the parent - syncFlagsTmp.skipEntire_ = syncFlags.skipEntire_; - } - - amd::ScopedLock lock(owner()->lockMemoryOps()); - for (auto& sub : owner()->subBuffers()) { - //! \note Don't allow subbuffer's allocation in the worker thread. - //! It may cause a system lock, because possible resource - //! destruction, heap reallocation or subbuffer allocation - static const bool AllocSubBuffer = false; - device::Memory* devSub = - sub->getDeviceMemory(dev(), AllocSubBuffer); - if (nullptr != devSub) { - pal::Memory* gpuSub = reinterpret_cast(devSub); - gpuSub->syncHostFromCache(syncFlagsTmp); - } - } - } - - // Make sure we didn't have a NOP, - // because CPU was the last writer - if (nullptr != owner()->getLastWriter()) { - // Mark parent as up to date, set our version accordingly - version_ = owner()->getVersion(); - } - - // Exit if sync is a NOP or sync can be skipped - if (!hasUpdates || syncFlags.skipEntire_) { - return; - } - - bool result = false; - static const bool Entire = true; - amd::Coord3D origin(0, 0, 0); - - // If backing store was pinned then make a transfer - if (flags_ & PinnedMemoryAlloced) { - if (desc().buffer_) { - amd::Coord3D region(owner()->getSize()); - result = dev().xferMgr().copyBuffer(*this, - *pinnedMemory_, origin, origin, region, Entire); - } - else { - amd::Image& image = static_cast(*owner()); - result = 
dev().xferMgr().copyImageToBuffer(*this, - *pinnedMemory_, origin, origin, image.getRegion(), Entire, - image.getRowPitch(), image.getSlicePitch()); - } - } - - // Just do a basic host read - if (!result) { - if (desc().buffer_) { - amd::Coord3D region(owner()->getSize()); - result = dev().xferMgr().readBuffer(*this, - owner()->getHostMem(), origin, region, Entire); - } - else { - amd::Image& image = static_cast(*owner()); - result = dev().xferMgr().readImage(*this, - owner()->getHostMem(), origin, image.getRegion(), - image.getRowPitch(), image.getSlicePitch(), Entire); - } - } - - // Should never fail - assert(result && "Memory synchronization failed!"); - } + return viewMemory; } -pal::Memory* -Memory::createBufferView(amd::Memory& subBufferOwner) -{ - pal::Memory* viewMemory; - Resource::ViewParams params; +void Memory::decIndMapCount() { + // Map/unmap must be serialized + amd::ScopedLock lock(owner()->lockMemoryOps()); - size_t offset = subBufferOwner.getOrigin(); - size_t size = subBufferOwner.getSize(); - - // Create a memory object - viewMemory = new pal::Memory(dev(), subBufferOwner, size); - if (nullptr == viewMemory) { - return nullptr; + if (indirectMapCount_ == 0) { + if (!mipMapped()) { + LogError("decIndMapCount() called when indirectMapCount_ already zero"); } + return; + } - params.owner_ = &subBufferOwner; - params.gpu_ = static_cast(subBufferOwner.getVirtualDevice()); - params.offset_ = offset; - params.size_ = size; - params.resource_ = this; - params.memory_ = this; - if (!viewMemory->create(Resource::View, ¶ms)) { - delete viewMemory; - return nullptr; - } - - // Explicitly set the host memory location, - // because the parent location could change after reallocation - if (nullptr != owner()->getHostMem()) { - subBufferOwner.setHostMem( - reinterpret_cast(owner()->getHostMem()) + offset); - } - else { - subBufferOwner.setHostMem(nullptr); - } - - return viewMemory; -} - -void -Memory::decIndMapCount() -{ - // Map/unmap must be serialized - 
amd::ScopedLock lock(owner()->lockMemoryOps()); - - if (indirectMapCount_ == 0) { - if (!mipMapped()) { - LogError("decIndMapCount() called when indirectMapCount_ already zero"); - } - return; - } - - // Decrement the counter and release indirect map if it's the last op - if (--indirectMapCount_ == 0) { - if (nullptr != mapMemory_) { - amd::Memory* memory = mapMemory_; - amd::Memory* empty = nullptr; - - // Get GPU memory - Memory* gpuMemory = mapMemory(); - gpuMemory->unmap(nullptr); - - if (!dev().addMapTarget(memory)) { - memory->release(); - } - - // Map/unamp is serialized for the same memory object, - // so it's safe to clear the pointer - assert((mapMemory_ != nullptr) && "Mapped buffer should be valid"); - mapMemory_ = nullptr; - } + // Decrement the counter and release indirect map if it's the last op + if (--indirectMapCount_ == 0) { + if (nullptr != mapMemory_) { + amd::Memory* memory = mapMemory_; + amd::Memory* empty = nullptr; + + // Get GPU memory + Memory* gpuMemory = mapMemory(); + gpuMemory->unmap(nullptr); + + if (!dev().addMapTarget(memory)) { + memory->release(); + } + + // Map/unamp is serialized for the same memory object, + // so it's safe to clear the pointer + assert((mapMemory_ != nullptr) && "Mapped buffer should be valid"); + mapMemory_ = nullptr; } + } } // Note - must be called by the device under the async lock, so no spinning // or long pauses allowed in this function. 
-void* -Memory::allocMapTarget( - const amd::Coord3D& origin, - const amd::Coord3D& region, - uint mapFlags, - size_t* rowPitch, - size_t* slicePitch) -{ - // Sanity checks - assert(owner() != nullptr); +void* Memory::allocMapTarget(const amd::Coord3D& origin, const amd::Coord3D& region, uint mapFlags, + size_t* rowPitch, size_t* slicePitch) { + // Sanity checks + assert(owner() != nullptr); - // Map/unmap must be serialized - amd::ScopedLock lock(owner()->lockMemoryOps()); + // Map/unmap must be serialized + amd::ScopedLock lock(owner()->lockMemoryOps()); - address mapAddress = nullptr; - size_t offset = origin[0]; + address mapAddress = nullptr; + size_t offset = origin[0]; - //For SVM implementation, we cannot use cached map. if svm space, use the svm host pointer - void *initHostPtr = owner()->getSvmPtr(); - if (nullptr != initHostPtr) { - owner()->commitSvmMemory(); + // For SVM implementation, we cannot use cached map. if svm space, use the svm host pointer + void* initHostPtr = owner()->getSvmPtr(); + if (nullptr != initHostPtr) { + owner()->commitSvmMemory(); + } + + if (owner()->numDevices() > 1) { + if ((nullptr == initHostPtr) && (owner()->getHostMem() == nullptr)) { + static const bool forceAllocHostMem = true; + if (!owner()->allocHostMemory(nullptr, forceAllocHostMem)) { + return nullptr; + } + //! \note Ignore pinning result + bool ok = pinSystemMemory(owner()->getHostMem(), owner()->getSize()); + } + } + + incIndMapCount(); + // If host memory exists, use it + if ((owner()->getHostMem() != nullptr) && isDirectMap()) { + mapAddress = reinterpret_cast
(owner()->getHostMem()); + } + // If resource is a persistent allocation, we can use it directly + else if (isPersistentDirectMap()) { + if (nullptr == map(nullptr)) { + LogError("Could not map target persistent resource"); + decIndMapCount(); + return nullptr; + } + mapAddress = data(); + } + // Otherwise we can use a remote resource: + else { + // Are we in range? + size_t elementCount = desc().width_; + size_t rSize = elementCount * elementSize(); + if (offset >= rSize || offset + region[0] > rSize) { + LogWarning("Memory::allocMapTarget() - offset/size out of bounds"); + return nullptr; } - if (owner()->numDevices() > 1) { - if ((nullptr == initHostPtr) && (owner()->getHostMem() == nullptr)) { - static const bool forceAllocHostMem = true; - if (!owner()->allocHostMemory(nullptr, forceAllocHostMem)) { - return nullptr; - } - //! \note Ignore pinning result - bool ok = pinSystemMemory(owner()->getHostMem(), owner()->getSize()); + // Allocate a map resource if there isn't any yet + if (indirectMapCount_ == 1) { + const static bool SysMem = true; + bool failed = false; + amd::Memory* memory = nullptr; + // Search for a possible indirect resource + cl_mem_flags flag = 0; + bool canBeCached = true; + if (nullptr != initHostPtr) { + // make sure the host memory is committed already, or we have a big problem. 
+ assert(owner()->isSvmPtrCommited() && "The host svm memory not committed yet!"); + flag = CL_MEM_USE_HOST_PTR; + canBeCached = false; + } else { + memory = dev().findMapTarget(owner()->getSize()); + } + + if (memory == nullptr) { + // for map target of svm buffer , we need use svm host ptr + memory = new (dev().context()) amd::Buffer(dev().context(), flag, owner()->getSize()); + Memory* gpuMemory; + + do { + if ((memory == nullptr) || !memory->create(initHostPtr, SysMem)) { + failed = true; + break; + } + memory->setCacheStatus(canBeCached); + + gpuMemory = reinterpret_cast(memory->getDeviceMemory(dev())); + + // Create, Map and get the base pointer for the resource + if ((gpuMemory == nullptr) || (nullptr == gpuMemory->map(nullptr))) { + failed = true; + break; + } + } while (false); + } + + if (failed) { + if (memory != nullptr) { + memory->release(); } + decIndMapCount(); + LogError("Could not map target resource"); + return nullptr; + } + + // Map/unamp is serialized for the same memory object, + // so it's safe to assign the new pointer + assert((mapMemory_ == nullptr) && "Mapped buffer can't be valid"); + mapMemory_ = memory; + } else { + // Did the map resource allocation fail? + if (mapMemory_ == nullptr) { + LogError("Could not map target resource"); + return nullptr; + } } + mapAddress = mapMemory()->data(); + } - incIndMapCount(); - // If host memory exists, use it - if ((owner()->getHostMem() != nullptr) && isDirectMap()) { - mapAddress = reinterpret_cast
(owner()->getHostMem()); - } - // If resource is a persistent allocation, we can use it directly - else if (isPersistentDirectMap()) { - if (nullptr == map(nullptr)) { - LogError("Could not map target persistent resource"); - decIndMapCount(); - return nullptr; - } - mapAddress = data(); - } - // Otherwise we can use a remote resource: - else { - // Are we in range? - size_t elementCount = desc().width_; - size_t rSize = elementCount * elementSize(); - if (offset >= rSize || offset + region[0] > rSize) { - LogWarning("Memory::allocMapTarget() - offset/size out of bounds"); - return nullptr; - } - - // Allocate a map resource if there isn't any yet - if (indirectMapCount_ == 1) { - const static bool SysMem = true; - bool failed = false; - amd::Memory* memory = nullptr; - // Search for a possible indirect resource - cl_mem_flags flag = 0; - bool canBeCached = true; - if (nullptr != initHostPtr) { - //make sure the host memory is committed already, or we have a big problem. - assert(owner()->isSvmPtrCommited() && "The host svm memory not committed yet!"); - flag = CL_MEM_USE_HOST_PTR; - canBeCached = false; - } - else { - memory = dev().findMapTarget(owner()->getSize()); - } - - if (memory == nullptr) { - // for map target of svm buffer , we need use svm host ptr - memory = new(dev().context()) - amd::Buffer(dev().context(), flag, owner()->getSize()); - Memory* gpuMemory; - - do { - if ((memory == nullptr) || !memory->create(initHostPtr, SysMem)) { - failed = true; - break; - } - memory->setCacheStatus(canBeCached); - - gpuMemory = reinterpret_cast - (memory->getDeviceMemory(dev())); - - // Create, Map and get the base pointer for the resource - if ((gpuMemory == nullptr) || (nullptr == gpuMemory->map(nullptr))) { - failed = true; - break; - } - } - while (false); - } - - if (failed) { - if (memory != nullptr) { - memory->release(); - } - decIndMapCount(); - LogError("Could not map target resource"); - return nullptr; - } - - // Map/unamp is serialized for the same 
memory object, - // so it's safe to assign the new pointer - assert((mapMemory_ == nullptr) && "Mapped buffer can't be valid"); - mapMemory_ = memory; - } - else { - // Did the map resource allocation fail? - if (mapMemory_ == nullptr) { - LogError("Could not map target resource"); - return nullptr; - } - } - mapAddress = mapMemory()->data(); - } - - return mapAddress + offset; + return mapAddress + offset; } -bool -Memory::pinSystemMemory(void* hostPtr, size_t size) -{ - bool result = false; +bool Memory::pinSystemMemory(void* hostPtr, size_t size) { + bool result = false; - // If memory has a direct access already, then skip the host memory pinning - if (isHostMemDirectAccess()) { - return true; - } - - // Memory was pinned already - if (flags_ & PinnedMemoryAlloced) { - return true; - } - - // Allocate memory for the pinned object - pinnedMemory_ = new Memory(dev(), size); - - if (pinnedMemory_ == nullptr) { - return false; - } - - // Check if it's a view - if (flags_ & SubMemoryObject) { - const pal::Memory* gpuMemory; - if (owner() != nullptr) { - gpuMemory = dev().getGpuMemory(owner()->parent()); - } - else { - gpuMemory = parent(); - } - - if (gpuMemory->flags_ & PinnedMemoryAlloced) { - Resource::ViewParams params; - params.owner_ = owner(); - params.offset_ = owner()->getOrigin(); - params.size_ = owner()->getSize(); - params.resource_ = gpuMemory->pinnedMemory_; - params.memory_ = nullptr; - result = pinnedMemory_->create(Resource::View, ¶ms); - } - } - else { - Resource::PinnedParams params; - // Fill resource creation parameters - params.owner_ = owner(); - params.hostMemRef_ = owner()->getHostMemRef(); - params.size_ = size; - - // Create resource - result = pinnedMemory_->create(Resource::Pinned, ¶ms); - } - - if (!result) { - delete pinnedMemory_; - pinnedMemory_ = nullptr; - return false; - } - - flags_ |= PinnedMemoryAlloced; + // If memory has a direct access already, then skip the host memory pinning + if (isHostMemDirectAccess()) { return true; 
-} + } -void* -Memory::cpuMap( - device::VirtualDevice& vDev, uint flags, - uint startLayer, uint numLayers, - size_t* rowPitch, - size_t* slicePitch) -{ - uint resFlags = 0; - if (flags == Memory::CpuReadOnly) { - resFlags = Resource::ReadOnly; - } - else if (flags == Memory::CpuWriteOnly) { - resFlags = Resource::WriteOnly; + // Memory was pinned already + if (flags_ & PinnedMemoryAlloced) { + return true; + } + + // Allocate memory for the pinned object + pinnedMemory_ = new Memory(dev(), size); + + if (pinnedMemory_ == nullptr) { + return false; + } + + // Check if it's a view + if (flags_ & SubMemoryObject) { + const pal::Memory* gpuMemory; + if (owner() != nullptr) { + gpuMemory = dev().getGpuMemory(owner()->parent()); + } else { + gpuMemory = parent(); } - void* ptr = map(&static_cast(vDev), resFlags, startLayer, numLayers); - if (!desc().buffer_) { - *rowPitch = desc().pitch_ * elementSize(); - *slicePitch = desc().slice_ * elementSize(); + if (gpuMemory->flags_ & PinnedMemoryAlloced) { + Resource::ViewParams params; + params.owner_ = owner(); + params.offset_ = owner()->getOrigin(); + params.size_ = owner()->getSize(); + params.resource_ = gpuMemory->pinnedMemory_; + params.memory_ = nullptr; + result = pinnedMemory_->create(Resource::View, ¶ms); } - return ptr; + } else { + Resource::PinnedParams params; + // Fill resource creation parameters + params.owner_ = owner(); + params.hostMemRef_ = owner()->getHostMemRef(); + params.size_ = size; + + // Create resource + result = pinnedMemory_->create(Resource::Pinned, ¶ms); + } + + if (!result) { + delete pinnedMemory_; + pinnedMemory_ = nullptr; + return false; + } + + flags_ |= PinnedMemoryAlloced; + return true; } -void -Memory::cpuUnmap(device::VirtualDevice& vDev) -{ - unmap(&static_cast(vDev)); +void* Memory::cpuMap(device::VirtualDevice& vDev, uint flags, uint startLayer, uint numLayers, + size_t* rowPitch, size_t* slicePitch) { + uint resFlags = 0; + if (flags == Memory::CpuReadOnly) { + resFlags = 
Resource::ReadOnly; + } else if (flags == Memory::CpuWriteOnly) { + resFlags = Resource::WriteOnly; + } + + void* ptr = map(&static_cast(vDev), resFlags, startLayer, numLayers); + if (!desc().buffer_) { + *rowPitch = desc().pitch_ * elementSize(); + *slicePitch = desc().slice_ * elementSize(); + } + return ptr; } +void Memory::cpuUnmap(device::VirtualDevice& vDev) { unmap(&static_cast(vDev)); } + //! \note moveTo() must be called only from outside of //! VirtualGPU submit command methods. //! Otherwise a deadlock in lockVgpus() is possible. //! Also the logic in this function is very specific to //! the zero-copy functionality. -bool -Memory::moveTo(Memory& dst) -{ - bool result = false; +bool Memory::moveTo(Memory& dst) { + bool result = false; - // Make sure that all virtual devices don't process any commands - Device::ScopedLockVgpus lock(dev()); + // Make sure that all virtual devices don't process any commands + Device::ScopedLockVgpus lock(dev()); - // Wait for idle on all virtual GPUs - //!@note It's enough to wait on the active queue only - for (uint idx = 0; idx < dev().vgpus().size(); ++idx) { - wait(*(dev().vgpus()[idx])); - } + // Wait for idle on all virtual GPUs + //!@note It's enough to wait on the active queue only + for (uint idx = 0; idx < dev().vgpus().size(); ++idx) { + wait(*(dev().vgpus()[idx])); + } - static const bool Entire = true; - amd::Coord3D origin(0, 0, 0); - amd::Coord3D region(size()); + static const bool Entire = true; + amd::Coord3D origin(0, 0, 0); + amd::Coord3D region(size()); - // Transfer the data from old location to a new one - if (dev().xferMgr().copyBuffer( - *this, dst, origin, origin, region, Entire)) { - // Move all properties to the new object - dst.mapMemory_ = mapMemory_; - mapMemory_ = nullptr; + // Transfer the data from old location to a new one + if (dev().xferMgr().copyBuffer(*this, dst, origin, origin, region, Entire)) { + // Move all properties to the new object + dst.mapMemory_ = mapMemory_; + mapMemory_ = 
nullptr; - dst.flags_ |= flags_ & ~HostMemoryDirectAccess; - flags_ &= HostMemoryDirectAccess; + dst.flags_ |= flags_ & ~HostMemoryDirectAccess; + flags_ &= HostMemoryDirectAccess; - dst.indirectMapCount_ = indirectMapCount_; - indirectMapCount_ = 0; + dst.indirectMapCount_ = indirectMapCount_; + indirectMapCount_ = 0; - dst.pinnedMemory_ = pinnedMemory_; - pinnedMemory_ = nullptr; + dst.pinnedMemory_ = pinnedMemory_; + pinnedMemory_ = nullptr; - // Replace the device memory object - //! @note: current object will be destroyed - owner()->replaceDeviceMemory(&dev(), &dst); - result = true; - } + // Replace the device memory object + //! @note: current object will be destroyed + owner()->replaceDeviceMemory(&dev(), &dst); + result = true; + } - return result; + return result; } -Memory* -Memory::mapMemory() const -{ - Memory* map = nullptr; - if (nullptr != mapMemory_) { - map = reinterpret_cast(mapMemory_->getDeviceMemory(dev())); - } - return map; +Memory* Memory::mapMemory() const { + Memory* map = nullptr; + if (nullptr != mapMemory_) { + map = reinterpret_cast(mapMemory_->getDeviceMemory(dev())); + } + return map; } -void -Memory::mgpuCacheWriteBack() -{ - // Lock memory object, so only one write back can occur - amd::ScopedLock lock(owner()->lockMemoryOps()); +void Memory::mgpuCacheWriteBack() { + // Lock memory object, so only one write back can occur + amd::ScopedLock lock(owner()->lockMemoryOps()); - // Attempt to allocate a staging buffer if don't have any - if (owner()->getHostMem() == nullptr) { - if (nullptr != owner()->getSvmPtr()) { - owner()->commitSvmMemory(); - owner()->setHostMem(owner()->getSvmPtr()); + // Attempt to allocate a staging buffer if don't have any + if (owner()->getHostMem() == nullptr) { + if (nullptr != owner()->getSvmPtr()) { + owner()->commitSvmMemory(); + owner()->setHostMem(owner()->getSvmPtr()); + } else { + static const bool forceAllocHostMem = true; + owner()->allocHostMemory(nullptr, forceAllocHostMem); + } + } + + // Make 
synchronization + if (owner()->getHostMem() != nullptr) { + //! \note Ignore pinning result + bool ok = pinSystemMemory(owner()->getHostMem(), owner()->getSize()); + owner()->cacheWriteBack(); + } +} + +Memory* Buffer::createBufferView(amd::Memory& subBufferOwner) const { + pal::Memory* subBuffer; + Resource::ViewParams params; + + size_t offset = subBufferOwner.getOrigin(); + size_t size = subBufferOwner.getSize(); + + // Create a memory object + subBuffer = new pal::Buffer(dev(), subBufferOwner, size); + if (nullptr == subBuffer) { + return nullptr; + } + + // Allocate a view for this buffer object + params.owner_ = &subBufferOwner; + params.offset_ = offset; + params.size_ = size; + params.resource_ = this; + params.memory_ = this; + + if (!subBuffer->create(Resource::View, ¶ms)) { + delete subBuffer; + return nullptr; + } + + return subBuffer; +} + +void* Image::allocMapTarget(const amd::Coord3D& origin, const amd::Coord3D& region, uint mapFlags, + size_t* rowPitch, size_t* slicePitch) { + // Sanity checks + assert(owner() != nullptr); + bool useRemoteResource = true; + size_t slicePitchTmp = 0; + size_t height = desc().height_; + size_t depth = desc().depth_; + + // Map/unmap must be serialized + amd::ScopedLock lock(owner()->lockMemoryOps()); + + address mapAddress = nullptr; + size_t offset = origin[0]; + + incIndMapCount(); + + // If host memory exists, use it + if ((owner()->getHostMem() != nullptr) && isDirectMap()) { + useRemoteResource = false; + mapAddress = reinterpret_cast
(owner()->getHostMem()); + amd::Image* amdImage = owner()->asImage(); + + // Calculate the offset in bytes + offset *= elementSize(); + + // Update the row and slice pitches value + *rowPitch = + (amdImage->getRowPitch() == 0) ? (desc().width_ * elementSize()) : amdImage->getRowPitch(); + slicePitchTmp = + (amdImage->getSlicePitch() == 0) ? (height * (*rowPitch)) : amdImage->getSlicePitch(); + + // Adjust the offset in Y and Z dimensions + offset += origin[1] * (*rowPitch); + offset += origin[2] * slicePitchTmp; + } + // If resource is a persistent allocation, we can use it directly + //! @note Even if resource is a persistent allocation, + //! runtime can't use it directly, + //! because CAL volume map doesn't work properly. + //! @todo arrays can be added for persistent lock with some CAL changes + else if (isPersistentDirectMap()) { + if (nullptr == map(nullptr)) { + useRemoteResource = true; + LogError("Could not map target persistent resource, try remote resource"); + } else { + useRemoteResource = false; + mapAddress = data(); + + // Calculate the offset in bytes + offset *= elementSize(); + + // Update the row pitch value + *rowPitch = desc().pitch_ * elementSize(); + + // Adjust the offset in Y dimension + offset += origin[1] * (*rowPitch); + } + } + + // Otherwise we can use a remote resource: + if (useRemoteResource) { + // Calculate X offset in bytes + offset *= elementSize(); + + // Allocate a map resource if there isn't any yet + if (indirectMapCount_ == 1) { + const static bool SysMem = true; + bool failed = false; + amd::Memory* memory; + + // Search for a possible indirect resource + memory = dev().findMapTarget(owner()->getSize()); + + if (memory == nullptr) { + // Allocate a new buffer to use as the map target + //! @note Allocate a 1D buffer, since CAL issues with 3D + //! 
Also HW doesn't support untiled images + memory = new (dev().context()) + amd::Buffer(dev().context(), 0, desc().width_ * height * depth * elementSize()); + memory->setVirtualDevice(owner()->getVirtualDevice()); + + Memory* gpuMemory; + do { + if ((memory == nullptr) || !memory->create(nullptr, SysMem)) { + failed = true; + break; + } + + gpuMemory = reinterpret_cast(memory->getDeviceMemory(dev())); + + // Create, Map and get the base pointer for the resource + if ((gpuMemory == nullptr) || (nullptr == gpuMemory->map(nullptr))) { + failed = true; + break; + } + } while (false); + } + + if (failed) { + if (memory != nullptr) { + memory->release(); } - else { - static const bool forceAllocHostMem = true; - owner()->allocHostMemory(nullptr, forceAllocHostMem); - } - } - - // Make synchronization - if (owner()->getHostMem() != nullptr) { - //! \note Ignore pinning result - bool ok = pinSystemMemory(owner()->getHostMem(), owner()->getSize()); - owner()->cacheWriteBack(); - } -} - -Memory* -Buffer::createBufferView(amd::Memory& subBufferOwner) const -{ - pal::Memory* subBuffer; - Resource::ViewParams params; - - size_t offset = subBufferOwner.getOrigin(); - size_t size = subBufferOwner.getSize(); - - // Create a memory object - subBuffer = new pal::Buffer(dev(), subBufferOwner, size); - if (nullptr == subBuffer) { + decIndMapCount(); + LogError("Could not map target resource"); return nullptr; - } + } - // Allocate a view for this buffer object - params.owner_ = &subBufferOwner; - params.offset_ = offset; - params.size_ = size; - params.resource_ = this; - params.memory_ = this; - - if (!subBuffer->create(Resource::View, ¶ms)) { - delete subBuffer; + // Map/unamp is serialized for the same memory object, + // so it's safe to assign the new pointer + assert((mapMemory_ == nullptr) && "Mapped buffer can't be valid"); + mapMemory_ = memory; + } else { + // Did the map resource allocation fail? 
+ if (mapMemory_ == nullptr) { + LogError("Could not map target resource"); return nullptr; + } } - return subBuffer; + mapAddress = mapMemory()->data(); + + // Update the row and slice pitches value + *rowPitch = region[0] * elementSize(); + if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) { + slicePitchTmp = *rowPitch; + } else { + slicePitchTmp = *rowPitch * region[1]; + } + // Use start of the indirect buffer + offset = 0; + } + + if (slicePitch != nullptr) { + *slicePitch = slicePitchTmp; + } + + return mapAddress + offset; } -void* -Image::allocMapTarget( - const amd::Coord3D& origin, - const amd::Coord3D& region, - uint mapFlags, - size_t* rowPitch, - size_t* slicePitch) -{ - // Sanity checks - assert(owner() != nullptr); - bool useRemoteResource = true; - size_t slicePitchTmp = 0; - size_t height = desc().height_; - size_t depth = desc().depth_; - - // Map/unmap must be serialized - amd::ScopedLock lock(owner()->lockMemoryOps()); - - address mapAddress = nullptr; - size_t offset = origin[0]; - - incIndMapCount(); - - // If host memory exists, use it - if ((owner()->getHostMem() != nullptr) && isDirectMap()) { - useRemoteResource = false; - mapAddress = reinterpret_cast
(owner()->getHostMem()); - amd::Image* amdImage = owner()->asImage(); - - // Calculate the offset in bytes - offset *= elementSize(); - - // Update the row and slice pitches value - *rowPitch = (amdImage->getRowPitch() == 0) ? - (desc().width_ * elementSize()) : amdImage->getRowPitch(); - slicePitchTmp = (amdImage->getSlicePitch() == 0) ? - (height * (*rowPitch)) : amdImage->getSlicePitch(); - - // Adjust the offset in Y and Z dimensions - offset += origin[1] * (*rowPitch); - offset += origin[2] * slicePitchTmp; - } - // If resource is a persistent allocation, we can use it directly - //! @note Even if resource is a persistent allocation, - //! runtime can't use it directly, - //! because CAL volume map doesn't work properly. - //! @todo arrays can be added for persistent lock with some CAL changes - else if (isPersistentDirectMap()) { - if (nullptr == map(nullptr)) { - useRemoteResource = true; - LogError("Could not map target persistent resource, try remote resource"); - } - else { - useRemoteResource = false; - mapAddress = data(); - - // Calculate the offset in bytes - offset *= elementSize(); - - // Update the row pitch value - *rowPitch = desc().pitch_ * elementSize(); - - // Adjust the offset in Y dimension - offset += origin[1] * (*rowPitch); - } - } - - // Otherwise we can use a remote resource: - if (useRemoteResource) { - // Calculate X offset in bytes - offset *= elementSize(); - - // Allocate a map resource if there isn't any yet - if (indirectMapCount_ == 1) { - const static bool SysMem = true; - bool failed = false; - amd::Memory* memory; - - // Search for a possible indirect resource - memory = dev().findMapTarget(owner()->getSize()); - - if (memory == nullptr) { - // Allocate a new buffer to use as the map target - //! @note Allocate a 1D buffer, since CAL issues with 3D - //! 
Also HW doesn't support untiled images - memory = new (dev().context()) - amd::Buffer(dev().context(), 0, - desc().width_ * height * depth * elementSize()); - memory->setVirtualDevice(owner()->getVirtualDevice()); - - Memory* gpuMemory; - do { - if ((memory == nullptr) || !memory->create(nullptr, SysMem)) { - failed = true; - break; - } - - gpuMemory = reinterpret_cast - (memory->getDeviceMemory(dev())); - - // Create, Map and get the base pointer for the resource - if ((gpuMemory == nullptr) || (nullptr == gpuMemory->map(nullptr))) { - failed = true; - break; - } - } - while (false); - } - - if (failed) { - if (memory != nullptr) { - memory->release(); - } - decIndMapCount(); - LogError("Could not map target resource"); - return nullptr; - } - - // Map/unamp is serialized for the same memory object, - // so it's safe to assign the new pointer - assert((mapMemory_ == nullptr) && "Mapped buffer can't be valid"); - mapMemory_ = memory; - } - else { - // Did the map resource allocation fail? - if (mapMemory_ == nullptr) { - LogError("Could not map target resource"); - return nullptr; - } - } - - mapAddress = mapMemory()->data(); - - // Update the row and slice pitches value - *rowPitch = region[0] * elementSize(); - if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) { - slicePitchTmp = *rowPitch ; - } - else { - slicePitchTmp = *rowPitch * region[1]; - } - // Use start of the indirect buffer - offset = 0; - } - - if (slicePitch != nullptr) { - *slicePitch = slicePitchTmp; - } - - return mapAddress + offset; -} - -} // namespace pal +} // namespace pal diff --git a/rocclr/runtime/device/pal/palmemory.hpp b/rocclr/runtime/device/pal/palmemory.hpp index c889472c5c..bbe713caf8 100644 --- a/rocclr/runtime/device/pal/palmemory.hpp +++ b/rocclr/runtime/device/pal/palmemory.hpp @@ -26,247 +26,223 @@ class VirtualGPU; //! GPU memory object. // Wrapper that can contain a heap block or an interop buffer/image. 
-class Memory: public device::Memory, public Resource -{ -public: - enum InteropType { - InteropNone = 0, //!< None interop memory - InteropHwEmulation = 1, //!< Uses HW emulaiton with calMemCopy - InteropDirectAccess = 2 //!< Uses direct access to the interop surface - }; +class Memory : public device::Memory, public Resource { + public: + enum InteropType { + InteropNone = 0, //!< None interop memory + InteropHwEmulation = 1, //!< Uses HW emulaiton with calMemCopy + InteropDirectAccess = 2 //!< Uses direct access to the interop surface + }; - //! Constructor (with owner) - Memory( - const Device& gpuDev, //!< GPU device object - amd::Memory& owner, //!< Abstraction layer memory object - size_t size //!< Memory size for allocation - ); + //! Constructor (with owner) + Memory(const Device& gpuDev, //!< GPU device object + amd::Memory& owner, //!< Abstraction layer memory object + size_t size //!< Memory size for allocation + ); - //! Constructor (nonfat version for local scratch mem use without heap block) - Memory( - const Device& gpuDev, //!< GPU device object - size_t size //!< Memory size for allocation - ); + //! Constructor (nonfat version for local scratch mem use without heap block) + Memory(const Device& gpuDev, //!< GPU device object + size_t size //!< Memory size for allocation + ); - //! Constructor memory for images (without global heap allocation) - Memory( - const Device& gpuDev, //!< GPU device object - amd::Memory& owner, //!< Abstraction layer memory object - size_t width, //!< Allocated memory width - size_t height, //!< Allocated memory height - size_t depth, //!< Allocated memory depth - cl_image_format format, //!< Memory format - cl_mem_object_type imageType, //!< CL image type - uint mipLevels //!< The number of mip levels - ); + //! 
Constructor memory for images (without global heap allocation) + Memory(const Device& gpuDev, //!< GPU device object + amd::Memory& owner, //!< Abstraction layer memory object + size_t width, //!< Allocated memory width + size_t height, //!< Allocated memory height + size_t depth, //!< Allocated memory depth + cl_image_format format, //!< Memory format + cl_mem_object_type imageType, //!< CL image type + uint mipLevels //!< The number of mip levels + ); - //! Constructor memory for images (without global heap allocation) - Memory( - const Device& gpuDev, //!< GPU device object - size_t size, //!< Memory object size - size_t width, //!< Allocated memory width - size_t height, //!< Allocated memory height - size_t depth, //!< Allocated memory depth - cl_image_format format, //!< Memory format - cl_mem_object_type imageType, //!< CL image type - uint mipLevels //!< The number of mip levels - ); + //! Constructor memory for images (without global heap allocation) + Memory(const Device& gpuDev, //!< GPU device object + size_t size, //!< Memory object size + size_t width, //!< Allocated memory width + size_t height, //!< Allocated memory height + size_t depth, //!< Allocated memory depth + cl_image_format format, //!< Memory format + cl_mem_object_type imageType, //!< CL image type + uint mipLevels //!< The number of mip levels + ); - //! Default destructor - ~Memory(); + //! Default destructor + ~Memory(); - //! Creates the interop memory - bool createInterop( - InteropType type //!< The interop type - ); + //! Creates the interop memory + bool createInterop(InteropType type //!< The interop type + ); - //! Overloads the resource create method - virtual bool create( - Resource::MemoryType memType, //!< Memory type - Resource::CreateParams* params = NULL //!< Prameters for create - ); + //! Overloads the resource create method + virtual bool create(Resource::MemoryType memType, //!< Memory type + Resource::CreateParams* params = NULL //!< Prameters for create + ); - //! 
Allocate memory for API-level maps - virtual void* allocMapTarget( - const amd::Coord3D& origin, //!< The map location in memory - const amd::Coord3D& region, //!< The map region in memory - uint mapFlags, //!< Map flags - size_t* rowPitch = NULL, //!< Row pitch for the mapped memory - size_t* slicePitch = NULL //!< Slice for the mapped memory - ); + //! Allocate memory for API-level maps + virtual void* allocMapTarget(const amd::Coord3D& origin, //!< The map location in memory + const amd::Coord3D& region, //!< The map region in memory + uint mapFlags, //!< Map flags + size_t* rowPitch = NULL, //!< Row pitch for the mapped memory + size_t* slicePitch = NULL //!< Slice for the mapped memory + ); - //! Pins system memory associated with this memory object - virtual bool pinSystemMemory( - void* hostPtr, //!< System memory address - size_t size //!< Size of allocated system memory - ); + //! Pins system memory associated with this memory object + virtual bool pinSystemMemory(void* hostPtr, //!< System memory address + size_t size //!< Size of allocated system memory + ); - //! Releases indirect map surface - virtual void releaseIndirectMap() { decIndMapCount(); } + //! Releases indirect map surface + virtual void releaseIndirectMap() { decIndMapCount(); } - //! Map the device memory to CPU visible - virtual void* cpuMap( - device::VirtualDevice& vDev,//!< Virtual device for map operaiton - uint flags = 0, //!< flags for the map operation - // Optimization for multilayer map/unmap - uint startLayer = 0, //!< Start layer for multilayer map - uint numLayers = 0, //!< End layer for multilayer map - size_t* rowPitch = NULL, //!< Row pitch for the device memory - size_t* slicePitch = NULL //!< Slice pitch for the device memory - ); + //! 
Map the device memory to CPU visible + virtual void* cpuMap(device::VirtualDevice& vDev, //!< Virtual device for map operaiton + uint flags = 0, //!< flags for the map operation + // Optimization for multilayer map/unmap + uint startLayer = 0, //!< Start layer for multilayer map + uint numLayers = 0, //!< End layer for multilayer map + size_t* rowPitch = NULL, //!< Row pitch for the device memory + size_t* slicePitch = NULL //!< Slice pitch for the device memory + ); - //! Unmap the device memory - virtual void cpuUnmap( - device::VirtualDevice& vDev //!< Virtual device for unmap operaiton - ); + //! Unmap the device memory + virtual void cpuUnmap(device::VirtualDevice& vDev //!< Virtual device for unmap operaiton + ); - //! Updates device memory from the owner's host allocation - void syncCacheFromHost( - VirtualGPU& gpu, //!< Virtual GPU device object - //! Synchronization flags - device::Memory::SyncFlags syncFlags = device::Memory::SyncFlags() - ); + //! Updates device memory from the owner's host allocation + void syncCacheFromHost(VirtualGPU& gpu, //!< Virtual GPU device object + //! Synchronization flags + device::Memory::SyncFlags syncFlags = device::Memory::SyncFlags()); - //! Updates the owner's host allocation from device memory - virtual void syncHostFromCache( - //! Synchronization flags - device::Memory::SyncFlags syncFlags = device::Memory::SyncFlags() - ); + //! Updates the owner's host allocation from device memory + virtual void syncHostFromCache( + //! Synchronization flags + device::Memory::SyncFlags syncFlags = device::Memory::SyncFlags()); - //! Creates a view from current resource - virtual Memory* createBufferView( - amd::Memory& subBufferOwner //!< The abstraction layer subbuf owner - ); + //! Creates a view from current resource + virtual Memory* createBufferView( + amd::Memory& subBufferOwner //!< The abstraction layer subbuf owner + ); - //! Allocates host memory for synchronization with MGPU context - void mgpuCacheWriteBack(); + //! 
Allocates host memory for synchronization with MGPU context + void mgpuCacheWriteBack(); - //! Transfers objects data to the destination object - bool moveTo(Memory& dst); + //! Transfers objects data to the destination object + bool moveTo(Memory& dst); - //! Accessors for indirect map memory object - Memory* mapMemory() const; + //! Accessors for indirect map memory object + Memory* mapMemory() const; - //! Returns the interop memory for this memory object - Memory* interop() const { return interopMemory_; } + //! Returns the interop memory for this memory object + Memory* interop() const { return interopMemory_; } - //! Gets interop type for this memory object - InteropType interopType() const { return interopType_; } + //! Gets interop type for this memory object + InteropType interopType() const { return interopType_; } - //! Sets interop type for this memory object - void setInteropType(InteropType type) { interopType_ = type; } + //! Sets interop type for this memory object + void setInteropType(InteropType type) { interopType_ = type; } - //! Set the owner - void setOwner(amd::Memory* owner) { owner_ = owner; } + //! Set the owner + void setOwner(amd::Memory* owner) { owner_ = owner; } - // Decompress GL depth-stencil/MSAA resources for CL access - // Invalidates any FBOs the resource may be bound to, otherwise the GL driver may crash. - virtual bool processGLResource(GLResourceOP operation); + // Decompress GL depth-stencil/MSAA resources for CL access + // Invalidates any FBOs the resource may be bound to, otherwise the GL driver may crash. + virtual bool processGLResource(GLResourceOP operation); - //! Returns the interop resource for this memory object - const Memory* parent() const { return parent_; } + //! Returns the interop resource for this memory object + const Memory* parent() const { return parent_; } - //! Returns TRUE if direct map is acceaptable. The method detects - //! forced USWC memory on APU and will cause a switch to - //! 
indirect map for allocations with a possibility of host read - bool isDirectMap() - { - return (isCacheable() || !isHostMemDirectAccess() || + //! Returns TRUE if direct map is acceaptable. The method detects + //! forced USWC memory on APU and will cause a switch to + //! indirect map for allocations with a possibility of host read + bool isDirectMap() { + return (isCacheable() || !isHostMemDirectAccess() || (owner()->getMemFlags() & (CL_MEM_ALLOC_HOST_PTR | CL_MEM_HOST_WRITE_ONLY | CL_MEM_READ_ONLY))); - } + } -protected: - //! Decrement map count - void decIndMapCount(); + protected: + //! Decrement map count + void decIndMapCount(); - //! Initialize the object members - void init(); + //! Initialize the object members + void init(); -private: - //! Disable copy constructor - Memory(const Memory&); + private: + //! Disable copy constructor + Memory(const Memory&); - //! Disable operator= - Memory& operator=(const Memory&); + //! Disable operator= + Memory& operator=(const Memory&); - InteropType interopType_; //!< Interop type - Memory* interopMemory_; //!< interop memory - Memory* pinnedMemory_; //!< Memory used as pinned system memory - const Memory* parent_; //!< Parent memory object + InteropType interopType_; //!< Interop type + Memory* interopMemory_; //!< interop memory + Memory* pinnedMemory_; //!< Memory used as pinned system memory + const Memory* parent_; //!< Parent memory object }; -class Buffer: public pal::Memory -{ -public: - //! Buffer constructor - Buffer( - const Device& gpuDev, //!< GPU device object - amd::Memory& owner, //!< Abstraction layer memory object - size_t size //!< Buffer size - ) - : pal::Memory(gpuDev, owner, size) - {} +class Buffer : public pal::Memory { + public: + //! Buffer constructor + Buffer(const Device& gpuDev, //!< GPU device object + amd::Memory& owner, //!< Abstraction layer memory object + size_t size //!< Buffer size + ) + : pal::Memory(gpuDev, owner, size) {} - //! 
Creates a view from current resource - virtual Memory* createBufferView( - amd::Memory& subBufferOwner //!< The abstraction layer subbuf owner - ) const; + //! Creates a view from current resource + virtual Memory* createBufferView( + amd::Memory& subBufferOwner //!< The abstraction layer subbuf owner + ) const; -private: - //! Disable copy constructor - Buffer(const Buffer&); + private: + //! Disable copy constructor + Buffer(const Buffer&); - //! Disable operator= - Buffer& operator=(const Buffer&); + //! Disable operator= + Buffer& operator=(const Buffer&); }; -class Image: public pal::Memory -{ -public: - //! Image constructor - Image( - const Device& gpuDev, //!< GPU device object - amd::Memory& owner, //!< Abstraction layer memory object - size_t width, //!< Allocated memory width - size_t height, //!< Allocated memory height - size_t depth, //!< Allocated memory depth - cl_image_format format, //!< Memory format - cl_mem_object_type imageType, //!< CL image type - uint mipLevels //!< The number of mip levels +class Image : public pal::Memory { + public: + //! Image constructor + Image(const Device& gpuDev, //!< GPU device object + amd::Memory& owner, //!< Abstraction layer memory object + size_t width, //!< Allocated memory width + size_t height, //!< Allocated memory height + size_t depth, //!< Allocated memory depth + cl_image_format format, //!< Memory format + cl_mem_object_type imageType, //!< CL image type + uint mipLevels //!< The number of mip levels ) - : pal::Memory(gpuDev, owner, width, height, depth, format, imageType, mipLevels) - {} + : pal::Memory(gpuDev, owner, width, height, depth, format, imageType, mipLevels) {} - //! 
Image constructor - Image( - const Device& gpuDev, //!< GPU device object - size_t size, //!< Memory size - size_t width, //!< Allocated memory width - size_t height, //!< Allocated memory height - size_t depth, //!< Allocated memory depth - cl_image_format format, //!< Memory format - cl_mem_object_type imageType, //!< CL image type - uint mipLevels //!< The number of mip levels + //! Image constructor + Image(const Device& gpuDev, //!< GPU device object + size_t size, //!< Memory size + size_t width, //!< Allocated memory width + size_t height, //!< Allocated memory height + size_t depth, //!< Allocated memory depth + cl_image_format format, //!< Memory format + cl_mem_object_type imageType, //!< CL image type + uint mipLevels //!< The number of mip levels ) - : pal::Memory(gpuDev, size, width, height, depth, format, imageType, mipLevels) - {} + : pal::Memory(gpuDev, size, width, height, depth, format, imageType, mipLevels) {} - //! Allocate memory for API-level maps - virtual void* allocMapTarget( - const amd::Coord3D& origin, //!< The map location in memory - const amd::Coord3D& region, //!< The map region in memory - uint mapFlags, //!< Map flags - size_t* rowPitch = NULL, //!< Row pitch for the mapped memory - size_t* slicePitch = NULL //!< Slice for the mapped memory - ); + //! Allocate memory for API-level maps + virtual void* allocMapTarget(const amd::Coord3D& origin, //!< The map location in memory + const amd::Coord3D& region, //!< The map region in memory + uint mapFlags, //!< Map flags + size_t* rowPitch = NULL, //!< Row pitch for the mapped memory + size_t* slicePitch = NULL //!< Slice for the mapped memory + ); -private: - //! Disable copy constructor - Image(const Image&); + private: + //! Disable copy constructor + Image(const Image&); - //! Disable operator= - Image& operator=(const Image&); + //! 
Disable operator= + Image& operator=(const Image&); }; -} // namespace pal +} // namespace pal diff --git a/rocclr/runtime/device/pal/palprintf.cpp b/rocclr/runtime/device/pal/palprintf.cpp index c1dbfca483..b8933e769f 100644 --- a/rocclr/runtime/device/pal/palprintf.cpp +++ b/rocclr/runtime/device/pal/palprintf.cpp @@ -16,700 +16,622 @@ namespace pal { PrintfDbg::PrintfDbg(Device& device, FILE* file) - : dbgBuffer_(nullptr) - , dbgFile_(file) - , gpuDevice_(device) - , wiDbgSize_(0) - , initCntValue_(device, 4) -{ + : dbgBuffer_(nullptr), + dbgFile_(file), + gpuDevice_(device), + wiDbgSize_(0), + initCntValue_(device, 4) {} + +PrintfDbg::~PrintfDbg() { delete dbgBuffer_; } + +bool PrintfDbg::create() { + // Create a resource for the init count value + if (initCntValue_.create(Resource::Remote)) { + uint32_t* value = reinterpret_cast(initCntValue_.map(nullptr)); + // The counter starts from 1 + if (nullptr != value) { + *value = 1; + } else { + return false; + } + initCntValue_.unmap(nullptr); + return true; + } + return false; } -PrintfDbg::~PrintfDbg() -{ +bool PrintfDbg::init(VirtualGPU& gpu, bool printfEnabled, const amd::NDRange& size) { + // Set up debug output buffer (if printf active) + if (printfEnabled) { + if (!allocate()) { + return false; + } + + // Make sure that the size isn't bigger than the reported max + if (size.product() <= dev().settings().maxWorkGroupSize_) { + size_t wiDbgSizeTmp; + + // Calculate the debug buffer size per workitem + wiDbgSizeTmp = std::min(dbgBuffer_->size() / size.product(), dev().xferRead().bufSize()); + + // Make sure the size is DWORD aligned + wiDbgSizeTmp = amd::alignDown(wiDbgSizeTmp, sizeof(uint32_t)); + + // If the new size is different, then clear the initial values + if (wiDbgSize_ != wiDbgSizeTmp) { + wiDbgSize_ = wiDbgSizeTmp; + if (!clearWorkitems(gpu, 0, size.product())) { + wiDbgSize_ = 0; + return false; + } + } + } + } + + return true; +} + +bool PrintfDbg::output(VirtualGPU& gpu, bool printfEnabled, const 
amd::NDRange& size, + const std::vector& printfInfo) { + // Are we expected to generate debug output? + if (printfEnabled && !printfInfo.empty()) { + uint32_t* workitemData; + size_t i, j, k, z; + bool realloc = false; + + // Wait for kernel execution + gpu.waitAllEngines(); + + size_t zdim = 1; + size_t ydim = 1; + size_t xdim = 1; + + switch (size.dimensions()) { + case 3: + zdim = size[2]; + // Fall through ... + case 2: + ydim = size[1]; + // Fall through ... + case 1: + xdim = size[0]; + // Fall through ... + default: + break; + } + + for (k = 0; k < zdim; ++k) { + for (j = 0; j < ydim; ++j) { + for (i = 0; i < xdim; ++i) { + size_t idx = (xdim * (ydim * k + j) + i); + workitemData = mapWorkitem(gpu, idx, &realloc); + + if (nullptr != workitemData) { + uint32_t wp = workitemData[0]; // write pointer (i.e. first unwritten element) + // Walk through each PrintfDbg entry + for (z = 1; (z < (wiDbgSize() / sizeof(uint32_t))) && (z < wp);) { + if (printfInfo.size() < workitemData[z]) { + LogError("The format string wasn't reported"); + return false; + } + // Get the PrintfDbg info + const PrintfInfo& info = printfInfo[workitemData[z++]]; + // There's something in this buffer + outputDbgBuffer(info, workitemData, z); + } + } + unmapWorkitem(gpu, workitemData); + } + } + } + + // Reallocate debug buffer if necessary + if (!allocate(realloc)) { + return false; + } + } + return true; +} + +bool PrintfDbg::allocate(bool realloc) { + if (nullptr == dbgBuffer_) { + dbgBuffer_ = dev().createScratchBuffer(dev().info().printfBufferSize_); + } else if (realloc) { + LogWarning("Debug buffer reallocation!"); + // Double the buffer size if it's not big enough + size_t size = dbgBuffer_->size(); delete dbgBuffer_; + dbgBuffer_ = dev().createScratchBuffer(size << 1); + } + + return (nullptr != dbgBuffer_) ? 
true : false; } -bool -PrintfDbg::create() -{ - // Create a resource for the init count value - if (initCntValue_.create(Resource::Remote)) { - uint32_t* value = reinterpret_cast(initCntValue_.map(nullptr)); - // The counter starts from 1 - if (nullptr != value) { - *value = 1; - } - else { - return false; - } - initCntValue_.unmap(nullptr); - return true; - } - return false; -} - -bool -PrintfDbg::init( - VirtualGPU& gpu, - bool printfEnabled, - const amd::NDRange& size) -{ - // Set up debug output buffer (if printf active) - if (printfEnabled) { - if (!allocate()) { - return false; - } - - // Make sure that the size isn't bigger than the reported max - if (size.product() <= dev().settings().maxWorkGroupSize_) { - size_t wiDbgSizeTmp; - - // Calculate the debug buffer size per workitem - wiDbgSizeTmp = std::min(dbgBuffer_->size() / size.product(), - dev().xferRead().bufSize()); - - // Make sure the size is DWORD aligned - wiDbgSizeTmp = amd::alignDown(wiDbgSizeTmp, sizeof(uint32_t)); - - // If the new size is different, then clear the initial values - if (wiDbgSize_ != wiDbgSizeTmp) { - wiDbgSize_ = wiDbgSizeTmp; - if (!clearWorkitems(gpu, 0, size.product())) { - wiDbgSize_ = 0; - return false; - } - } - } - } - - return true; -} - -bool -PrintfDbg::output( - VirtualGPU& gpu, - bool printfEnabled, - const amd::NDRange& size, - const std::vector& printfInfo) -{ - // Are we expected to generate debug output? - if (printfEnabled && !printfInfo.empty()) { - uint32_t* workitemData; - size_t i, j, k, z; - bool realloc = false; - - // Wait for kernel execution - gpu.waitAllEngines(); - - size_t zdim = 1; - size_t ydim = 1; - size_t xdim = 1; - - switch (size.dimensions()) { - case 3: - zdim = size[2]; - // Fall through ... - case 2: - ydim = size[1]; - // Fall through ... - case 1: - xdim = size[0]; - // Fall through ... 
- default: - break; - } - - for (k = 0; k < zdim; ++k) { - for (j = 0; j < ydim; ++j) { - for (i = 0; i < xdim; ++i) { - size_t idx = (xdim * (ydim * k + j) + i); - workitemData = mapWorkitem(gpu, idx, &realloc); - - if (nullptr != workitemData) { - uint32_t wp = workitemData[0]; // write pointer (i.e. first unwritten element) - // Walk through each PrintfDbg entry - for (z = 1; (z < (wiDbgSize() / sizeof(uint32_t))) && (z < wp); ) { - if (printfInfo.size() < workitemData[z]) { - LogError("The format string wasn't reported"); - return false; - } - // Get the PrintfDbg info - const PrintfInfo& info = printfInfo[workitemData[z++]]; - // There's something in this buffer - outputDbgBuffer(info, workitemData, z); - } - } - unmapWorkitem(gpu, workitemData); - } - } - } - - // Reallocate debug buffer if necessary - if (!allocate(realloc)) { - return false; - } - } - return true; -} - -bool -PrintfDbg::allocate(bool realloc) -{ - if (nullptr == dbgBuffer_) { - dbgBuffer_ = dev().createScratchBuffer(dev().info().printfBufferSize_); - } - else if (realloc) { - LogWarning("Debug buffer reallocation!"); - // Double the buffer size if it's not big enough - size_t size = dbgBuffer_->size(); - delete dbgBuffer_; - dbgBuffer_ = dev().createScratchBuffer(size << 1); - } - - return (nullptr != dbgBuffer_) ? 
true : false; -} - -bool -PrintfDbg::checkFloat(const std::string& fmt) const -{ - switch (fmt[fmt.size() - 1]) { +bool PrintfDbg::checkFloat(const std::string& fmt) const { + switch (fmt[fmt.size() - 1]) { case 'e': case 'E': case 'f': case 'g': case 'G': case 'a': - return true; - break; + return true; + break; default: - break; - } - return false; + break; + } + return false; } -bool -PrintfDbg::checkString(const std::string& fmt) const -{ - if (fmt[fmt.size() - 1] == 's') - return true; - return false; +bool PrintfDbg::checkString(const std::string& fmt) const { + if (fmt[fmt.size() - 1] == 's') return true; + return false; } -int -PrintfDbg::checkVectorSpecifier( - const std::string& fmt, - size_t startPos, - size_t& curPos) const -{ - int vectorSize = 0; - size_t pos = curPos; - size_t size = curPos - startPos; +int PrintfDbg::checkVectorSpecifier(const std::string& fmt, size_t startPos, size_t& curPos) const { + int vectorSize = 0; + size_t pos = curPos; + size_t size = curPos - startPos; - if (size >= 3) { - size = 0; - //no modifiers - if (fmt[curPos - 3] == 'v') { - size = 2; - } - //the modifiers are "h" or "l" - else if (fmt[curPos - 4] == 'v') { - size = 3; - } - //the modifier is "hh" - else if ((curPos >= 5) && (fmt[curPos - 5] == 'v')) { - size = 4; - } - if (size > 0) { - curPos = size; - pos -= curPos; - - // Get vector size - vectorSize = fmt[pos++] - '0'; - // PrintfDbg supports only 2, 3, 4, 8 and 16 wide vectors - switch (vectorSize) { - case 1: - if ((fmt[pos++] - '0') == 6) { - vectorSize = 16; - } - else { - vectorSize = 0; - } - break; - case 2: - case 3: - case 4: - case 8: - break; - default: - vectorSize = 0; - break; - } - } + if (size >= 3) { + size = 0; + // no modifiers + if (fmt[curPos - 3] == 'v') { + size = 2; } + // the modifiers are "h" or "l" + else if (fmt[curPos - 4] == 'v') { + size = 3; + } + // the modifier is "hh" + else if ((curPos >= 5) && (fmt[curPos - 5] == 'v')) { + size = 4; + } + if (size > 0) { + curPos = size; + 
pos -= curPos; - return vectorSize; + // Get vector size + vectorSize = fmt[pos++] - '0'; + // PrintfDbg supports only 2, 3, 4, 8 and 16 wide vectors + switch (vectorSize) { + case 1: + if ((fmt[pos++] - '0') == 6) { + vectorSize = 16; + } else { + vectorSize = 0; + } + break; + case 2: + case 3: + case 4: + case 8: + break; + default: + vectorSize = 0; + break; + } + } + } + + return vectorSize; } static const size_t ConstStr = 0xffffffff; static const char Separator[] = ",\0"; -size_t -PrintfDbg::outputArgument( - const std::string& fmt, - bool printFloat, - size_t size, - const uint32_t* argument) const -{ - // Serialize the output to the screen - amd::ScopedLock k(dev().lockAsyncOps()); +size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, size_t size, + const uint32_t* argument) const { + // Serialize the output to the screen + amd::ScopedLock k(dev().lockAsyncOps()); - size_t copiedBytes = size; - // Print the string argument, using standard PrintfDbg() - if (checkString(fmt.c_str())) { - //copiedBytes should be as number of printed chars - copiedBytes = 0; - //(null) should be printed - if (*argument == 0) { - amd::Os::printf(fmt.data(),0); - //copiedBytes = strlen("(null)") - copiedBytes = 6; - } - else { - const unsigned char* argumentStr = reinterpret_cast(argument); - amd::Os::printf(fmt.data(),argumentStr); - //copiedBytes = strlen(argumentStr) - while (argumentStr[copiedBytes++] != 0); - } + size_t copiedBytes = size; + // Print the string argument, using standard PrintfDbg() + if (checkString(fmt.c_str())) { + // copiedBytes should be as number of printed chars + copiedBytes = 0; + //(null) should be printed + if (*argument == 0) { + amd::Os::printf(fmt.data(), 0); + // copiedBytes = strlen("(null)") + copiedBytes = 6; + } else { + const unsigned char* argumentStr = reinterpret_cast(argument); + amd::Os::printf(fmt.data(), argumentStr); + // copiedBytes = strlen(argumentStr) + while (argumentStr[copiedBytes++] != 0) + ; } + } - // 
Print the argument(except for string ), using standard PrintfDbg() - else { - bool hlModifier = (strstr(fmt.c_str(),"hl") != nullptr); - std::string hlFmt; - if (hlModifier) { - hlFmt = fmt; - hlFmt.erase(hlFmt.find_first_of("hl"),2); - } - switch (size) { - case 0: { - const char* str = reinterpret_cast(argument); - amd::Os::printf(fmt.data(), str); - // Find the string length - while (str[copiedBytes++] != 0); - } - break; - case 1: - amd::Os::printf(fmt.data(), *(reinterpret_cast(argument))); - break; - case 2: - case 4: - if (printFloat) { - static const char* fSpecifiers = "eEfgGa"; - std::string fmtF = fmt; - size_t posS = fmtF.find_first_of("%"); - size_t posE = fmtF.find_first_of(fSpecifiers); - if (posS != std::string::npos &&posE != std::string::npos) { - fmtF.replace(posS+1,posE-posS,"s"); - } - float fArg = *(reinterpret_cast(argument)); - float fSign = copysign(1.0,fArg); - if (isinf(fArg)&&!isnan(fArg)) { - if(fSign < 0) { - amd::Os::printf(fmtF.data(),"-infinity"); - } - else { - amd::Os::printf(fmtF.data(),"infinity"); - } - } - else if (isnan(fArg)) { - if(fSign < 0) { - amd::Os::printf(fmtF.data(),"-nan"); - } - else { - amd::Os::printf(fmtF.data(),"nan"); - } - } - else if (hlModifier) { - amd::Os::printf(hlFmt.data(),fArg); - } - else { - amd::Os::printf(fmt.data(),fArg); - } - } - else { - bool hhModifier = (strstr(fmt.c_str(),"hh") != nullptr); - if (hhModifier) { - //current implementation of printf in gcc 4.5.2 runtime libraries, doesn`t recognize "hh" modifier ==> - //argument should be explicitly converted to unsigned char (uchar) before printing and - //fmt should be updated not to contain "hh" modifier - std::string hhFmt = fmt; - hhFmt.erase(hhFmt.find_first_of("h"),2); - amd::Os::printf(hhFmt.data(), *(reinterpret_cast(argument))); - } - else if (hlModifier) { - amd::Os::printf(hlFmt.data(), *argument); - } - else { - amd::Os::printf(fmt.data(), *argument); - } - } - break; - case 8: - if (printFloat) { - if (hlModifier) { - 
amd::Os::printf(hlFmt.data(), *(reinterpret_cast(argument))); - } - else { - amd::Os::printf(fmt.data(), *(reinterpret_cast(argument))); - } - } - else { - std::string out = fmt; - // Use 'll' for 64 bit printf - out.insert((out.size() - 1), 1, 'l'); - amd::Os::printf(out.data(), *(reinterpret_cast(argument))); - } - break; - case ConstStr: { - const char* str = reinterpret_cast(argument); - amd::Os::printf(fmt.data(), str); - } - break; - default: - amd::Os::printf("Error: Unsupported data size for PrintfDbg. %d bytes", - static_cast(size)); - return 0; - } + // Print the argument(except for string ), using standard PrintfDbg() + else { + bool hlModifier = (strstr(fmt.c_str(), "hl") != nullptr); + std::string hlFmt; + if (hlModifier) { + hlFmt = fmt; + hlFmt.erase(hlFmt.find_first_of("hl"), 2); } - fflush(stdout); - return copiedBytes; + switch (size) { + case 0: { + const char* str = reinterpret_cast(argument); + amd::Os::printf(fmt.data(), str); + // Find the string length + while (str[copiedBytes++] != 0) + ; + } break; + case 1: + amd::Os::printf(fmt.data(), *(reinterpret_cast(argument))); + break; + case 2: + case 4: + if (printFloat) { + static const char* fSpecifiers = "eEfgGa"; + std::string fmtF = fmt; + size_t posS = fmtF.find_first_of("%"); + size_t posE = fmtF.find_first_of(fSpecifiers); + if (posS != std::string::npos && posE != std::string::npos) { + fmtF.replace(posS + 1, posE - posS, "s"); + } + float fArg = *(reinterpret_cast(argument)); + float fSign = copysign(1.0, fArg); + if (isinf(fArg) && !isnan(fArg)) { + if (fSign < 0) { + amd::Os::printf(fmtF.data(), "-infinity"); + } else { + amd::Os::printf(fmtF.data(), "infinity"); + } + } else if (isnan(fArg)) { + if (fSign < 0) { + amd::Os::printf(fmtF.data(), "-nan"); + } else { + amd::Os::printf(fmtF.data(), "nan"); + } + } else if (hlModifier) { + amd::Os::printf(hlFmt.data(), fArg); + } else { + amd::Os::printf(fmt.data(), fArg); + } + } else { + bool hhModifier = (strstr(fmt.c_str(), "hh") != 
nullptr); + if (hhModifier) { + // current implementation of printf in gcc 4.5.2 runtime libraries, doesn`t recognize + // "hh" modifier ==> + // argument should be explicitly converted to unsigned char (uchar) before printing and + // fmt should be updated not to contain "hh" modifier + std::string hhFmt = fmt; + hhFmt.erase(hhFmt.find_first_of("h"), 2); + amd::Os::printf(hhFmt.data(), *(reinterpret_cast(argument))); + } else if (hlModifier) { + amd::Os::printf(hlFmt.data(), *argument); + } else { + amd::Os::printf(fmt.data(), *argument); + } + } + break; + case 8: + if (printFloat) { + if (hlModifier) { + amd::Os::printf(hlFmt.data(), *(reinterpret_cast(argument))); + } else { + amd::Os::printf(fmt.data(), *(reinterpret_cast(argument))); + } + } else { + std::string out = fmt; + // Use 'll' for 64 bit printf + out.insert((out.size() - 1), 1, 'l'); + amd::Os::printf(out.data(), *(reinterpret_cast(argument))); + } + break; + case ConstStr: { + const char* str = reinterpret_cast(argument); + amd::Os::printf(fmt.data(), str); + } break; + default: + amd::Os::printf("Error: Unsupported data size for PrintfDbg. 
%d bytes", + static_cast(size)); + return 0; + } + } + fflush(stdout); + return copiedBytes; } -void -PrintfDbg::outputDbgBuffer(const PrintfInfo& info, const uint32_t* workitemData, size_t& i) const -{ - static const char* specifiers = "cdieEfgGaosuxXp"; - static const char* modifiers = "hl"; - static const char* special = "%n"; - static const std::string sepStr = "%s"; - const uint32_t* s = workitemData; - size_t pos = 0; +void PrintfDbg::outputDbgBuffer(const PrintfInfo& info, const uint32_t* workitemData, + size_t& i) const { + static const char* specifiers = "cdieEfgGaosuxXp"; + static const char* modifiers = "hl"; + static const char* special = "%n"; + static const std::string sepStr = "%s"; + const uint32_t* s = workitemData; + size_t pos = 0; - // Find the format string - std::string str = info.fmtString_; - std::string fmt; - size_t posStart, posEnd; + // Find the format string + std::string str = info.fmtString_; + std::string fmt; + size_t posStart, posEnd; - // Print all arguments - // Note: the following code walks through all arguments, provided by the kernel and - // finds the corresponding specifier in the format string. - // Then it splits the original string into substrings with a single specifier and - // uses standard PrintfDbg() to print each argument - for (uint j = 0; j < info.arguments_.size(); ++j) { - do { - posStart = str.find_first_of("%", pos); - if (posStart != std::string::npos) { - posStart++; - // Erase all spaces after % - while (str[posStart] == ' ') { - str.erase(posStart, 1); - } - size_t tmp = str.find_first_of(special, posStart); - size_t tmp2 = str.find_first_of(specifiers, posStart); - // Special cases. 
Special symbol is located before any specifier - if (tmp < tmp2) { - posEnd = posStart + 1; - fmt = str.substr(pos, posEnd - pos); - fmt.erase(posStart - pos - 1, 1); - pos = posStart = posEnd; - outputArgument(sepStr, false, ConstStr, - reinterpret_cast(fmt.data())); - continue; - } - break; - } - else if (pos < str.length()) { - outputArgument(sepStr, false, ConstStr,reinterpret_cast((str.substr(pos)).data())); - } + // Print all arguments + // Note: the following code walks through all arguments, provided by the kernel and + // finds the corresponding specifier in the format string. + // Then it splits the original string into substrings with a single specifier and + // uses standard PrintfDbg() to print each argument + for (uint j = 0; j < info.arguments_.size(); ++j) { + do { + posStart = str.find_first_of("%", pos); + if (posStart != std::string::npos) { + posStart++; + // Erase all spaces after % + while (str[posStart] == ' ') { + str.erase(posStart, 1); } - while (posStart != std::string::npos); - - if (posStart != std::string::npos) { - bool printFloat = false; - int vectorSize = 0; - size_t length; - size_t idPos = 0; - - // Search for PrintfDbg specifier in the format string. 
- // It will be a split point for the output - posEnd = str.find_first_of(specifiers, posStart); - if (posEnd == std::string::npos) { - pos = posStart = posEnd; - break; - } - posEnd++; - - size_t curPos = posEnd; - vectorSize = checkVectorSpecifier(str, posStart, curPos); - - // Get substring from the last position to the current specifier - fmt = str.substr(pos, posEnd - pos); - - // Readjust the string pointer if PrintfDbg outputs a vector - if (vectorSize != 0) { - size_t posVecSpec = fmt.length()-(curPos + 1); - size_t posVecMod = fmt.find_first_of(modifiers,posVecSpec + 1); - size_t posMod = str.find_first_of(modifiers,posStart); - if(posMod < posEnd){ - fmt = fmt.erase(posVecSpec, posVecMod - posVecSpec); - } - else{ - fmt = fmt.erase(posVecSpec, curPos); - } - idPos = posStart - pos - 1; - } - pos = posStart = posEnd; - - // Find out if the argument is a float - printFloat = checkFloat(fmt); - - // Is it a scalar value? - if (vectorSize == 0) { - length = outputArgument(fmt, printFloat, info.arguments_[j], &s[i]); - if (0 == length) { - return; - } - i += amd::alignUp(length, sizeof(uint32_t)) / sizeof(uint32_t); - } - else { - // 3-component vector's size is defined as 4 * size of each scalar component - size_t elemSize = info.arguments_[j] / (vectorSize == 3 ? 
4 : vectorSize); - size_t k = i * sizeof(uint32_t); - std::string elementStr = fmt.substr(idPos, fmt.size()); - - // Print first element with full string - if (0 == outputArgument(fmt, printFloat, elemSize, &s[i])) { - return; - } - - // Print other elemnts with separator if available - for (int e = 1; e < vectorSize; ++e) { - const char* t = reinterpret_cast(s); - // Output the vector separator - outputArgument(sepStr, false, ConstStr, - reinterpret_cast(Separator)); - - // Output the next element - outputArgument(elementStr, printFloat, elemSize, - reinterpret_cast(&t[k + e * elemSize])); - } - i += (amd::alignUp(info.arguments_[j], sizeof(uint32_t))) - / sizeof(uint32_t); - } + size_t tmp = str.find_first_of(special, posStart); + size_t tmp2 = str.find_first_of(specifiers, posStart); + // Special cases. Special symbol is located before any specifier + if (tmp < tmp2) { + posEnd = posStart + 1; + fmt = str.substr(pos, posEnd - pos); + fmt.erase(posStart - pos - 1, 1); + pos = posStart = posEnd; + outputArgument(sepStr, false, ConstStr, reinterpret_cast(fmt.data())); + continue; } - } - - if (pos != std::string::npos) { - fmt = str.substr(pos, str.size() - pos); + break; + } else if (pos < str.length()) { outputArgument(sepStr, false, ConstStr, - reinterpret_cast(fmt.data())); - } -} + reinterpret_cast((str.substr(pos)).data())); + } + } while (posStart != std::string::npos); -bool -PrintfDbg::clearWorkitems(VirtualGPU& gpu, size_t idxStart, size_t number) const -{ - // Go through all locations for every thread and copy 1 - for (uint i = idxStart; i < idxStart + number; ++i) { - amd::Coord3D dst(i * wiDbgSize(), 0, 0); - amd::Coord3D size(sizeof(uint32_t), 0, 0); + if (posStart != std::string::npos) { + bool printFloat = false; + int vectorSize = 0; + size_t length; + size_t idPos = 0; - // Copy 1 into the corresponding location in the debug buffer - if (!initCntValue_.partialMemCopyTo( - gpu, amd::Coord3D(0, 0, 0), dst, size, *dbgBuffer_)) { - return false; + // 
Search for PrintfDbg specifier in the format string. + // It will be a split point for the output + posEnd = str.find_first_of(specifiers, posStart); + if (posEnd == std::string::npos) { + pos = posStart = posEnd; + break; + } + posEnd++; + + size_t curPos = posEnd; + vectorSize = checkVectorSpecifier(str, posStart, curPos); + + // Get substring from the last position to the current specifier + fmt = str.substr(pos, posEnd - pos); + + // Readjust the string pointer if PrintfDbg outputs a vector + if (vectorSize != 0) { + size_t posVecSpec = fmt.length() - (curPos + 1); + size_t posVecMod = fmt.find_first_of(modifiers, posVecSpec + 1); + size_t posMod = str.find_first_of(modifiers, posStart); + if (posMod < posEnd) { + fmt = fmt.erase(posVecSpec, posVecMod - posVecSpec); + } else { + fmt = fmt.erase(posVecSpec, curPos); } + idPos = posStart - pos - 1; + } + pos = posStart = posEnd; + + // Find out if the argument is a float + printFloat = checkFloat(fmt); + + // Is it a scalar value? + if (vectorSize == 0) { + length = outputArgument(fmt, printFloat, info.arguments_[j], &s[i]); + if (0 == length) { + return; + } + i += amd::alignUp(length, sizeof(uint32_t)) / sizeof(uint32_t); + } else { + // 3-component vector's size is defined as 4 * size of each scalar component + size_t elemSize = info.arguments_[j] / (vectorSize == 3 ? 
4 : vectorSize); + size_t k = i * sizeof(uint32_t); + std::string elementStr = fmt.substr(idPos, fmt.size()); + + // Print first element with full string + if (0 == outputArgument(fmt, printFloat, elemSize, &s[i])) { + return; + } + + // Print other elemnts with separator if available + for (int e = 1; e < vectorSize; ++e) { + const char* t = reinterpret_cast(s); + // Output the vector separator + outputArgument(sepStr, false, ConstStr, reinterpret_cast(Separator)); + + // Output the next element + outputArgument(elementStr, printFloat, elemSize, + reinterpret_cast(&t[k + e * elemSize])); + } + i += (amd::alignUp(info.arguments_[j], sizeof(uint32_t))) / sizeof(uint32_t); + } } - return true; + } + + if (pos != std::string::npos) { + fmt = str.substr(pos, str.size() - pos); + outputArgument(sepStr, false, ConstStr, reinterpret_cast(fmt.data())); + } } -uint32_t* -PrintfDbg::mapWorkitem(VirtualGPU& gpu, size_t idx, bool* realloc) -{ - uint32_t wiSize = 0; - amd::Coord3D src(idx * wiDbgSize(), 0, 0); +bool PrintfDbg::clearWorkitems(VirtualGPU& gpu, size_t idxStart, size_t number) const { + // Go through all locations for every thread and copy 1 + for (uint i = idxStart; i < idxStart + number; ++i) { + amd::Coord3D dst(i * wiDbgSize(), 0, 0); + amd::Coord3D size(sizeof(uint32_t), 0, 0); + + // Copy 1 into the corresponding location in the debug buffer + if (!initCntValue_.partialMemCopyTo(gpu, amd::Coord3D(0, 0, 0), dst, size, *dbgBuffer_)) { + return false; + } + } + return true; +} + +uint32_t* PrintfDbg::mapWorkitem(VirtualGPU& gpu, size_t idx, bool* realloc) { + uint32_t wiSize = 0; + amd::Coord3D src(idx * wiDbgSize(), 0, 0); + xferBufRead_ = &(dev().xferRead().acquire()); + + // Copy workitem size from the corresponding location in the debug buffer + if (!dbgBuffer_->partialMemCopyTo(gpu, src, amd::Coord3D(0, 0, 0), + amd::Coord3D(sizeof(uint32_t), 0, 0), *xferBufRead_)) { + return nullptr; + } + + // Get memory pointer to the satged buffer + uint32_t* workitem = 
reinterpret_cast(xferBufRead_->map(&gpu)); + if (nullptr == workitem) { + return nullptr; + } + + // Copy size value + wiSize = *workitem; + xferBufRead_->unmap(&gpu); + + // Check if the cuurent workitem almost reached the size limit + if ((wiDbgSize() - static_cast(wiSize)) < 3) { + *realloc = true; + } + + // If the current workitem had any output then get the data + if ((wiSize > 1) && (wiSize <= wiDbgSize())) { + amd::Coord3D size(wiSize * sizeof(uint32_t), 0, 0); + + // Copy the current workitem output data to the staged buffer + if (!dbgBuffer_->partialMemCopyTo(gpu, src, amd::Coord3D(0, 0, 0), size, *xferBufRead_) || + // Clear the write pointer back to index 1 for the current workitem + !clearWorkitems(gpu, idx, 1)) { + LogError("Reading the workitem data failed!"); + return nullptr; + } + + // Get a pointer to the workitem data + uint32_t* workitem = reinterpret_cast(xferBufRead_->map(&gpu)); + + return workitem; + } + + return nullptr; +} + +void PrintfDbg::unmapWorkitem(VirtualGPU& gpu, const uint32_t* workitemData) const { + if (nullptr != workitemData) { + xferBufRead_->unmap(&gpu); + } + + dev().xferRead().release(gpu, *xferBufRead_); +} + +bool PrintfDbgHSA::init(VirtualGPU& gpu, bool printfEnabled) { + // Set up debug output buffer (if printf active) + if (printfEnabled) { + if (!allocate()) { + return false; + } + + // The first two DWORDs in the printf buffer are as follows: + // First DWORD = Offset to where next information is to + // be written, initialized to 0 + // Second DWORD = Number of bytes available for printf data + // = buffer size – 2*sizeof(uint32_t) + const uint8_t initSize = 2 * sizeof(uint32_t); + uint8_t sysMem[initSize]; + memset(sysMem, 0, initSize); + uint32_t dbgBufferSize = dbgBuffer_->size() - initSize; + memcpy(&sysMem[4], &dbgBufferSize, sizeof(dbgBufferSize)); + + // Copy offset and number of bytes available for printf data + // into the corresponding location in the debug buffer + dbgBuffer_->writeRawData(gpu, 0, 
initSize, sysMem, true); + } + return true; +} + +bool PrintfDbgHSA::output(VirtualGPU& gpu, bool printfEnabled, + const std::vector& printfInfo) { + if (printfEnabled) { + uint32_t offsetSize = 0; xferBufRead_ = &(dev().xferRead().acquire()); - // Copy workitem size from the corresponding location in the debug buffer - if (!dbgBuffer_->partialMemCopyTo(gpu, - src, amd::Coord3D(0, 0, 0), amd::Coord3D(sizeof(uint32_t), 0, 0), - *xferBufRead_)) { - return nullptr; + // Copy offset from the first DWORD in the debug buffer + if (!dbgBuffer_->partialMemCopyTo(gpu, amd::Coord3D(0, 0, 0), amd::Coord3D(0, 0, 0), + amd::Coord3D(sizeof(uint32_t), 0, 0), *xferBufRead_)) { + return false; } // Get memory pointer to the satged buffer - uint32_t* workitem = reinterpret_cast(xferBufRead_->map(&gpu)); - if (nullptr == workitem) { - return nullptr; + uint32_t* dbgBufferPtr = reinterpret_cast(xferBufRead_->map(&gpu)); + if (nullptr == dbgBufferPtr) { + return false; } - // Copy size value - wiSize = *workitem; + offsetSize = *dbgBufferPtr; xferBufRead_->unmap(&gpu); - // Check if the cuurent workitem almost reached the size limit - if ((wiDbgSize() - static_cast(wiSize)) < 3) { - *realloc = true; + if (offsetSize == 0) { + LogInfo("The printf buffer is empty!"); + dev().xferRead().release(gpu, *xferBufRead_); + return true; } - // If the current workitem had any output then get the data - if ((wiSize > 1) && (wiSize <= wiDbgSize())) { - amd::Coord3D size(wiSize * sizeof(uint32_t), 0, 0); + size_t bufSize = dev().xferRead().bufSize(); + size_t copySize = offsetSize; + while (copySize != 0) { + // Copy the buffer data (i.e., the printfID followed by the + // argument data for each printf call in th kernel) to the staged buffer + if (!dbgBuffer_->partialMemCopyTo( + gpu, amd::Coord3D(2 * sizeof(uint32_t) + offsetSize - copySize, 0, 0), + amd::Coord3D(0, 0, 0), std::min(copySize, bufSize), *xferBufRead_)) { + return false; + } - // Copy the current workitem output data to the staged 
buffer - if (!dbgBuffer_->partialMemCopyTo( - gpu, src, amd::Coord3D(0, 0, 0), size, *xferBufRead_) || - // Clear the write pointer back to index 1 for the current workitem - !clearWorkitems(gpu, idx, 1)) { - LogError("Reading the workitem data failed!"); - return nullptr; + // Get a pointer to the buffer data + dbgBufferPtr = reinterpret_cast(xferBufRead_->map(&gpu)); + if (nullptr == dbgBufferPtr) { + return false; + } + + std::vector::const_iterator ita; + uint sb = 0; + uint sbt = 0; + + // parse the debug buffer + while (sbt < copySize) { + assert(((*dbgBufferPtr) < printfInfo.size()) && "Cound't find the reported PrintfID!"); + const PrintfInfo& info = printfInfo[(*dbgBufferPtr)]; + sb += sizeof(uint32_t); + for (ita = info.arguments_.begin(); ita != info.arguments_.end(); ++ita) { + sb += *ita; } - // Get a pointer to the workitem data - uint32_t* workitem = reinterpret_cast - (xferBufRead_->map(&gpu)); + if (sbt + sb > bufSize) { + break; // Need new portion of data in staging buffer + } - return workitem; - } + size_t idx = 1; + // There's something in the debug buffer + outputDbgBuffer(info, dbgBufferPtr, idx); - return nullptr; -} + sbt += sb; + dbgBufferPtr += sb / sizeof(uint32_t); + sb = 0; + } -void -PrintfDbg::unmapWorkitem(VirtualGPU& gpu , const uint32_t* workitemData) const -{ - if (nullptr != workitemData) { - xferBufRead_->unmap(&gpu); + copySize -= sbt; + xferBufRead_->unmap(&gpu); } dev().xferRead().release(gpu, *xferBufRead_); + } + + return true; } -bool -PrintfDbgHSA::init( - VirtualGPU& gpu, - bool printfEnabled) -{ - // Set up debug output buffer (if printf active) - if (printfEnabled) { - if (!allocate()) { - return false; - } - - // The first two DWORDs in the printf buffer are as follows: - // First DWORD = Offset to where next information is to - // be written, initialized to 0 - // Second DWORD = Number of bytes available for printf data - // = buffer size – 2*sizeof(uint32_t) - const uint8_t initSize = 2*sizeof(uint32_t); - uint8_t 
sysMem[initSize]; - memset(sysMem, 0, initSize); - uint32_t dbgBufferSize = dbgBuffer_->size() - initSize; - memcpy(&sysMem[4], &dbgBufferSize, sizeof(dbgBufferSize)); - - // Copy offset and number of bytes available for printf data - // into the corresponding location in the debug buffer - dbgBuffer_->writeRawData(gpu, 0, initSize, sysMem, true); - } - return true; -} - -bool -PrintfDbgHSA::output( - VirtualGPU& gpu, - bool printfEnabled, - const std::vector& printfInfo) -{ - if (printfEnabled) { - uint32_t offsetSize = 0; - xferBufRead_ = &(dev().xferRead().acquire()); - - // Copy offset from the first DWORD in the debug buffer - if (!dbgBuffer_->partialMemCopyTo(gpu, - amd::Coord3D(0, 0, 0), amd::Coord3D(0, 0, 0), - amd::Coord3D(sizeof(uint32_t), 0, 0),*xferBufRead_)) { - return false; - } - - // Get memory pointer to the satged buffer - uint32_t* dbgBufferPtr = reinterpret_cast(xferBufRead_->map(&gpu)); - if (nullptr == dbgBufferPtr) { - return false; - } - - offsetSize = *dbgBufferPtr; - xferBufRead_->unmap(&gpu); - - if (offsetSize == 0) { - LogInfo("The printf buffer is empty!"); - dev().xferRead().release(gpu, *xferBufRead_); - return true; - } - - size_t bufSize = dev().xferRead().bufSize(); - size_t copySize = offsetSize; - while (copySize != 0) { - // Copy the buffer data (i.e., the printfID followed by the - //argument data for each printf call in th kernel) to the staged buffer - if (!dbgBuffer_->partialMemCopyTo(gpu, - amd::Coord3D(2*sizeof(uint32_t) + offsetSize - copySize, 0, 0), - amd::Coord3D(0, 0, 0), - std::min(copySize, bufSize), *xferBufRead_)) { - return false; - } - - // Get a pointer to the buffer data - dbgBufferPtr = reinterpret_cast(xferBufRead_->map(&gpu)); - if (nullptr == dbgBufferPtr) { - return false; - } - - std::vector::const_iterator ita; - uint sb = 0; - uint sbt = 0; - - // parse the debug buffer - while (sbt < copySize) { - assert(((*dbgBufferPtr) < printfInfo.size()) && - "Cound't find the reported PrintfID!"); - const 
PrintfInfo& info = printfInfo[(*dbgBufferPtr)]; - sb += sizeof(uint32_t); - for (ita = info.arguments_.begin(); - ita != info.arguments_.end(); ++ita){ - sb += *ita; - } - - if (sbt + sb > bufSize) { - break; // Need new portion of data in staging buffer - } - - size_t idx = 1; - // There's something in the debug buffer - outputDbgBuffer(info, dbgBufferPtr, idx); - - sbt += sb; - dbgBufferPtr += sb/sizeof(uint32_t); - sb = 0; - } - - copySize -= sbt; - xferBufRead_->unmap(&gpu); - } - - dev().xferRead().release(gpu, *xferBufRead_); - } - - return true; -} - -} // namespace pal +} // namespace pal diff --git a/rocclr/runtime/device/pal/palprintf.hpp b/rocclr/runtime/device/pal/palprintf.hpp index ca84bda555..94fa94a426 100644 --- a/rocclr/runtime/device/pal/palprintf.hpp +++ b/rocclr/runtime/device/pal/palprintf.hpp @@ -11,179 +11,156 @@ #ifndef isinf #ifdef _MSC_VER #define isinf(X) (!_finite(X) && !_isnan(X)) -#endif //_MSC_VER -#endif //isinf +#endif //_MSC_VER +#endif // isinf #ifndef isnan #ifdef _MSC_VER #define isnan(X) (_isnan(X)) -#endif //_MSC_VER -#endif //isnan +#endif //_MSC_VER +#endif // isnan #ifndef copysign #ifdef _MSC_VER -#define copysign(X,Y) (_copysign(X,Y)) -#endif //_MSC_VER -#endif //copysign +#define copysign(X, Y) (_copysign(X, Y)) +#endif //_MSC_VER +#endif // copysign //! GPU Device Implementation namespace pal { //! Printf info structure -struct PrintfInfo -{ - std::string fmtString_; //!< formated string for printf - std::vector arguments_; //!< passed arguments to the printf() call +struct PrintfInfo { + std::string fmtString_; //!< formated string for printf + std::vector arguments_; //!< passed arguments to the printf() call }; class Kernel; class VirtualGPU; class Memory; -class PrintfDbg : public amd::HeapObject -{ -public: - //! Debug buffer size per workitem - static const uint WorkitemDebugSize = 4096; +class PrintfDbg : public amd::HeapObject { + public: + //! 
Debug buffer size per workitem + static const uint WorkitemDebugSize = 4096; - //! Default constructor - PrintfDbg( - Device& device, - FILE* file = NULL - ); + //! Default constructor + PrintfDbg(Device& device, FILE* file = NULL); - //! Destructor - ~PrintfDbg(); + //! Destructor + ~PrintfDbg(); - //! Creates the PrintfDbg object - bool create(); + //! Creates the PrintfDbg object + bool create(); - //! Initializes the debug buffer before kernel's execution - bool init( - VirtualGPU& gpu, //!< Virtual GPU object - bool printfEnabled, //!< checks for printf - const amd::NDRange& size //!< Kernel's workload - ); + //! Initializes the debug buffer before kernel's execution + bool init(VirtualGPU& gpu, //!< Virtual GPU object + bool printfEnabled, //!< checks for printf + const amd::NDRange& size //!< Kernel's workload + ); - //! Prints the kernel's debug informaiton from the buffer - bool output( - VirtualGPU& gpu, //!< Virtual GPU object - bool printfEnabled, //!< checks for printf - const amd::NDRange& size, //!< Kernel's workload - const std::vector& printfInfo //!< printf info - ); + //! Prints the kernel's debug informaiton from the buffer + bool output(VirtualGPU& gpu, //!< Virtual GPU object + bool printfEnabled, //!< checks for printf + const amd::NDRange& size, //!< Kernel's workload + const std::vector& printfInfo //!< printf info + ); - //! Debug buffer size per workitem - size_t wiDbgSize() const { return wiDbgSize_; } + //! Debug buffer size per workitem + size_t wiDbgSize() const { return wiDbgSize_; } - //! Returns debug buffer object - Memory* dbgBuffer() const { return dbgBuffer_; } + //! 
Returns debug buffer object + Memory* dbgBuffer() const { return dbgBuffer_; } -protected: - Memory* dbgBuffer_; //!< Buffer to hold debug output - FILE* dbgFile_; //!< Debug file - Device& gpuDevice_; //!< GPU device object - Memory* xferBufRead_; //!< Transfer buffer for the dump read + protected: + Memory* dbgBuffer_; //!< Buffer to hold debug output + FILE* dbgFile_; //!< Debug file + Device& gpuDevice_; //!< GPU device object + Memory* xferBufRead_; //!< Transfer buffer for the dump read - //! Gets GPU device object - Device& dev() const { return gpuDevice_; } + //! Gets GPU device object + Device& dev() const { return gpuDevice_; } - //! Allocates the debug buffer - bool allocate( - bool realloc = false //!< If TRUE then reallocate the debug memory - ); + //! Allocates the debug buffer + bool allocate(bool realloc = false //!< If TRUE then reallocate the debug memory + ); - //! Returns TRUE if a float value has to be printed - bool checkFloat( - const std::string& fmt //!< Format string - ) const; + //! Returns TRUE if a float value has to be printed + bool checkFloat(const std::string& fmt //!< Format string + ) const; - //! Returns TRUE if a string value has to be printed - bool checkString( - const std::string& fmt //!< Format string - ) const; + //! Returns TRUE if a string value has to be printed + bool checkString(const std::string& fmt //!< Format string + ) const; - //! Finds the specifier in the format string - int checkVectorSpecifier( - const std::string& fmt, //!< Format string - size_t startPos, //!< Start position for processing - size_t& curPos //!< End position for processing - ) const; + //! Finds the specifier in the format string + int checkVectorSpecifier(const std::string& fmt, //!< Format string + size_t startPos, //!< Start position for processing + size_t& curPos //!< End position for processing + ) const; - //! 
Outputs an argument - size_t outputArgument( - const std::string& fmt, //!< Format strint - bool printFloat, //!< Argument is a float value - size_t size, //!< Argument's size - const uint32_t* argument //!< Argument's location - ) const; + //! Outputs an argument + size_t outputArgument(const std::string& fmt, //!< Format strint + bool printFloat, //!< Argument is a float value + size_t size, //!< Argument's size + const uint32_t* argument //!< Argument's location + ) const; - //! Displays the PrintfDbg - void outputDbgBuffer( - const PrintfInfo& info, //!< printf info - const uint32_t* workitemData, //!< The PrintfDbg dump buffer - size_t& i //!< index to the data in the buffer - ) const; + //! Displays the PrintfDbg + void outputDbgBuffer(const PrintfInfo& info, //!< printf info + const uint32_t* workitemData, //!< The PrintfDbg dump buffer + size_t& i //!< index to the data in the buffer + ) const; -private: - //! Disable copy constructor - PrintfDbg(const PrintfDbg&); + private: + //! Disable copy constructor + PrintfDbg(const PrintfDbg&); - //! Disable assignment - PrintfDbg& operator=(const PrintfDbg&); + //! Disable assignment + PrintfDbg& operator=(const PrintfDbg&); - //! Returns the pointer to the workitem data block - bool clearWorkitems( - VirtualGPU& gpu, //!< Virtual GPU object - size_t idxStart, //!< Workitem global index start - size_t number //!< Number of workitems to clear - ) const; + //! Returns the pointer to the workitem data block + bool clearWorkitems(VirtualGPU& gpu, //!< Virtual GPU object + size_t idxStart, //!< Workitem global index start + size_t number //!< Number of workitems to clear + ) const; - //! Returns the pointer to the workitem data block - uint32_t* mapWorkitem( - VirtualGPU& gpu, //!< Virtual GPU object - size_t idx, //!< Workitem global index - bool* realloc //!< Returns TRUE if workitem reached the buffer limit - ); + //! 
Returns the pointer to the workitem data block + uint32_t* mapWorkitem(VirtualGPU& gpu, //!< Virtual GPU object + size_t idx, //!< Workitem global index + bool* realloc //!< Returns TRUE if workitem reached the buffer limit + ); - //! Unamp the staged buffer - void unmapWorkitem( - VirtualGPU& gpu, //!< Virtual GPU object - const uint32_t* workitemData //!< The PrintfDbg dump buffer - ) const; + //! Unamp the staged buffer + void unmapWorkitem(VirtualGPU& gpu, //!< Virtual GPU object + const uint32_t* workitemData //!< The PrintfDbg dump buffer + ) const; - size_t wiDbgSize_; //!< Workitem debug size - Memory initCntValue_; //!< Initialized count value + size_t wiDbgSize_; //!< Workitem debug size + Memory initCntValue_; //!< Initialized count value }; -class PrintfDbgHSA : public PrintfDbg -{ -public: +class PrintfDbgHSA : public PrintfDbg { + public: + //! Default constructor + PrintfDbgHSA(Device& device, FILE* file = NULL) : PrintfDbg(device, file) {} - //! Default constructor - PrintfDbgHSA( - Device& device, - FILE* file = NULL - ): PrintfDbg(device, file) { } + //! Initializes the debug buffer before kernel's execution + bool init(VirtualGPU& gpu, //!< Virtual GPU object + bool printfEnabled //!< checks for printf + ); - //! Initializes the debug buffer before kernel's execution - bool init( - VirtualGPU& gpu, //!< Virtual GPU object - bool printfEnabled //!< checks for printf - ); + //! Prints the kernel's debug informaiton from the buffer + bool output(VirtualGPU& gpu, //!< Virtual GPU object + bool printfEnabled, //!< checks for printf + const std::vector& printfInfo //!< printf info + ); - //! Prints the kernel's debug informaiton from the buffer - bool output( - VirtualGPU& gpu, //!< Virtual GPU object - bool printfEnabled, //!< checks for printf - const std::vector& printfInfo //!< printf info - ); + private: + //! Disable copy constructor + PrintfDbgHSA(const PrintfDbgHSA&); -private: - //! 
Disable copy constructor - PrintfDbgHSA(const PrintfDbgHSA&); - - //! Disable assignment - PrintfDbgHSA& operator=(const PrintfDbgHSA&); + //! Disable assignment + PrintfDbgHSA& operator=(const PrintfDbgHSA&); }; /*@}*/} // namespace pal diff --git a/rocclr/runtime/device/pal/palprogram.cpp b/rocclr/runtime/device/pal/palprogram.cpp index 2ab9b7ba88..d991536b96 100644 --- a/rocclr/runtime/device/pal/palprogram.cpp +++ b/rocclr/runtime/device/pal/palprogram.cpp @@ -24,541 +24,515 @@ #include "driver/AmdCompiler.h" #include "libraries.amdgcn.inc" #include "gelf.h" -#endif // !defined(WITH_LIGHTNING_COMPILER) +#endif // !defined(WITH_LIGHTNING_COMPILER) namespace pal { -Segment::Segment() - : gpuAccess_(nullptr) - , cpuAccess_(nullptr) -{} +Segment::Segment() : gpuAccess_(nullptr), cpuAccess_(nullptr) {} -Segment::~Segment() -{ - delete gpuAccess_; - if (cpuAccess_ != nullptr) { - cpuAccess_->unmap(nullptr); - delete cpuAccess_; - } +Segment::~Segment() { + delete gpuAccess_; + if (cpuAccess_ != nullptr) { + cpuAccess_->unmap(nullptr); + delete cpuAccess_; + } } -bool -Segment::alloc( - HSAILProgram& prog, amdgpu_hsa_elf_segment_t segment, - size_t size, size_t align, bool zero) -{ - align = amd::alignUp(align, sizeof(uint32_t)); - gpuAccess_ = new pal::Memory(prog.dev(), amd::alignUp(size, align)); - if ((gpuAccess_ == nullptr) || !gpuAccess_->create(pal::Resource::Shader)) { - delete gpuAccess_; - gpuAccess_ = nullptr; - return false; +bool Segment::alloc(HSAILProgram& prog, amdgpu_hsa_elf_segment_t segment, size_t size, size_t align, + bool zero) { + align = amd::alignUp(align, sizeof(uint32_t)); + gpuAccess_ = new pal::Memory(prog.dev(), amd::alignUp(size, align)); + if ((gpuAccess_ == nullptr) || !gpuAccess_->create(pal::Resource::Shader)) { + delete gpuAccess_; + gpuAccess_ = nullptr; + return false; + } + if (segment == AMDGPU_HSA_SEGMENT_CODE_AGENT) { + cpuAccess_ = new pal::Memory(prog.dev(), amd::alignUp(size, align)); + if ((cpuAccess_ == nullptr) || 
!cpuAccess_->create(pal::Resource::Remote)) { + delete cpuAccess_; + cpuAccess_ = nullptr; + return false; } - if (segment == AMDGPU_HSA_SEGMENT_CODE_AGENT) { - cpuAccess_ = new pal::Memory(prog.dev(), amd::alignUp(size, align)); - if ((cpuAccess_ == nullptr) || !cpuAccess_->create(pal::Resource::Remote)) { - delete cpuAccess_; - cpuAccess_ = nullptr; - return false; - } - void* ptr = cpuAccess_->map(nullptr, 0); - if (zero) { - memset(ptr, 0, size); - } + void* ptr = cpuAccess_->map(nullptr, 0); + if (zero) { + memset(ptr, 0, size); } + } - if (zero && !prog.isInternal()) { - char pattern = 0; - prog.dev().xferMgr().fillBuffer(*gpuAccess_, &pattern, sizeof(pattern), - amd::Coord3D(0), amd::Coord3D(size)); - } + if (zero && !prog.isInternal()) { + char pattern = 0; + prog.dev().xferMgr().fillBuffer(*gpuAccess_, &pattern, sizeof(pattern), amd::Coord3D(0), + amd::Coord3D(size)); + } - switch (segment) { + switch (segment) { case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM: case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT: case AMDGPU_HSA_SEGMENT_READONLY_AGENT: - prog.addGlobalStore(gpuAccess_); - prog.setGlobalVariableTotalSize(prog.globalVariableTotalSize() + size); - break; + prog.addGlobalStore(gpuAccess_); + prog.setGlobalVariableTotalSize(prog.globalVariableTotalSize() + size); + break; case AMDGPU_HSA_SEGMENT_CODE_AGENT: - prog.setCodeObjects(gpuAccess_, cpuAccess_->data()); - break; + prog.setCodeObjects(gpuAccess_, cpuAccess_->data()); + break; default: - break; - } - return true; + break; + } + return true; } -void -Segment::copy(size_t offset, const void* src, size_t size) -{ - if (cpuAccess_ != nullptr) { - amd::Os::fastMemcpy(cpuAddress(offset), src, size); - } - else { - VirtualGPU& gpu = *gpuAccess_->dev().xferQueue(); - Memory& xferBuf = gpuAccess_->dev().xferWrite().acquire(); - size_t tmpSize = std::min(static_cast(xferBuf.vmSize()), size); - size_t srcOffs = 0; - while (size != 0) { - xferBuf.hostWrite(&gpu, - reinterpret_cast(src) + srcOffs, 0, tmpSize); - bool result 
= xferBuf.partialMemCopyTo(gpu, - 0, (offset + srcOffs), tmpSize, *gpuAccess_, false, true); - size -= tmpSize; - srcOffs += tmpSize; - tmpSize = std::min(static_cast(xferBuf.vmSize()), size); - } - gpu.releaseMemObjects(); - gpu.waitAllEngines(); - } -} - -bool -Segment::freeze(bool destroySysmem) -{ +void Segment::copy(size_t offset, const void* src, size_t size) { + if (cpuAccess_ != nullptr) { + amd::Os::fastMemcpy(cpuAddress(offset), src, size); + } else { VirtualGPU& gpu = *gpuAccess_->dev().xferQueue(); - bool result = true; - if (cpuAccess_ != nullptr) { - assert(gpuAccess_->size() == cpuAccess_->size() && "Backing store size mismatch!"); - result = cpuAccess_->partialMemCopyTo(gpu, - 0, 0, gpuAccess_->size(), *gpuAccess_, false, true); - gpu.releaseMemObjects(); - gpu.waitAllEngines(); + Memory& xferBuf = gpuAccess_->dev().xferWrite().acquire(); + size_t tmpSize = std::min(static_cast(xferBuf.vmSize()), size); + size_t srcOffs = 0; + while (size != 0) { + xferBuf.hostWrite(&gpu, reinterpret_cast(src) + srcOffs, 0, tmpSize); + bool result = + xferBuf.partialMemCopyTo(gpu, 0, (offset + srcOffs), tmpSize, *gpuAccess_, false, true); + size -= tmpSize; + srcOffs += tmpSize; + tmpSize = std::min(static_cast(xferBuf.vmSize()), size); } - assert(!destroySysmem || (cpuAccess_ == nullptr)); - return result; + gpu.releaseMemObjects(); + gpu.waitAllEngines(); + } +} + +bool Segment::freeze(bool destroySysmem) { + VirtualGPU& gpu = *gpuAccess_->dev().xferQueue(); + bool result = true; + if (cpuAccess_ != nullptr) { + assert(gpuAccess_->size() == cpuAccess_->size() && "Backing store size mismatch!"); + result = cpuAccess_->partialMemCopyTo(gpu, 0, 0, gpuAccess_->size(), *gpuAccess_, false, true); + gpu.releaseMemObjects(); + gpu.waitAllEngines(); + } + assert(!destroySysmem || (cpuAccess_ == nullptr)); + return result; } HSAILProgram::HSAILProgram(Device& device) - : Program(device) - , llvmBinary_() - , binaryElf_(nullptr) - , rawBinary_(nullptr) - , kernels_(nullptr) 
- , codeSegGpu_(nullptr) - , codeSegCpu_(nullptr) - , maxScratchRegs_(0) - , flags_(0) - , executable_(nullptr) - , loaderContext_(this) -{ - memset(&binOpts_, 0, sizeof(binOpts_)); - binOpts_.struct_size = sizeof(binOpts_); - binOpts_.elfclass = LP64_SWITCH(ELFCLASS32, ELFCLASS64); - binOpts_.bitness = ELFDATA2LSB; - binOpts_.alloc = &::malloc; - binOpts_.dealloc = &::free; - loader_ = amd::hsa::loader::Loader::Create(&loaderContext_); + : Program(device), + llvmBinary_(), + binaryElf_(nullptr), + rawBinary_(nullptr), + kernels_(nullptr), + codeSegGpu_(nullptr), + codeSegCpu_(nullptr), + maxScratchRegs_(0), + flags_(0), + executable_(nullptr), + loaderContext_(this) { + memset(&binOpts_, 0, sizeof(binOpts_)); + binOpts_.struct_size = sizeof(binOpts_); + binOpts_.elfclass = LP64_SWITCH(ELFCLASS32, ELFCLASS64); + binOpts_.bitness = ELFDATA2LSB; + binOpts_.alloc = &::malloc; + binOpts_.dealloc = &::free; + loader_ = amd::hsa::loader::Loader::Create(&loaderContext_); } HSAILProgram::HSAILProgram(NullDevice& device) - : Program(device) - , llvmBinary_() - , binaryElf_(nullptr) - , rawBinary_(nullptr) - , kernels_(nullptr) - , codeSegGpu_(nullptr) - , codeSegCpu_(nullptr) - , maxScratchRegs_(0) - , flags_(0) - , executable_(nullptr) - , loaderContext_(this) -{ - memset(&binOpts_, 0, sizeof(binOpts_)); - isNull_ = true; - binOpts_.struct_size = sizeof(binOpts_); - binOpts_.elfclass = LP64_SWITCH(ELFCLASS32, ELFCLASS64); - binOpts_.bitness = ELFDATA2LSB; - binOpts_.alloc = &::malloc; - binOpts_.dealloc = &::free; - loader_ = amd::hsa::loader::Loader::Create(&loaderContext_); + : Program(device), + llvmBinary_(), + binaryElf_(nullptr), + rawBinary_(nullptr), + kernels_(nullptr), + codeSegGpu_(nullptr), + codeSegCpu_(nullptr), + maxScratchRegs_(0), + flags_(0), + executable_(nullptr), + loaderContext_(this) { + memset(&binOpts_, 0, sizeof(binOpts_)); + isNull_ = true; + binOpts_.struct_size = sizeof(binOpts_); + binOpts_.elfclass = LP64_SWITCH(ELFCLASS32, ELFCLASS64); + 
binOpts_.bitness = ELFDATA2LSB; + binOpts_.alloc = &::malloc; + binOpts_.dealloc = &::free; + loader_ = amd::hsa::loader::Loader::Create(&loaderContext_); } -HSAILProgram::~HSAILProgram() -{ - // Destroy internal static samplers - for (auto& it : staticSamplers_) { - delete it; - } +HSAILProgram::~HSAILProgram() { + // Destroy internal static samplers + for (auto& it : staticSamplers_) { + delete it; + } #if !defined(WITH_LIGHTNING_COMPILER) - if (rawBinary_ != nullptr) { - aclFreeMem(binaryElf_, rawBinary_); + if (rawBinary_ != nullptr) { + aclFreeMem(binaryElf_, rawBinary_); + } + acl_error error; + // Free the elf binary + if (binaryElf_ != nullptr) { + error = aclBinaryFini(binaryElf_); + if (error != ACL_SUCCESS) { + LogWarning("Error while destroying the acl binary \n"); } - acl_error error; - // Free the elf binary - if (binaryElf_ != nullptr) { - error = aclBinaryFini(binaryElf_); - if (error != ACL_SUCCESS) { - LogWarning( "Error while destroying the acl binary \n" ); - } - } -#endif // !defined(WITH_LIGHTNING_COMPILER) - releaseClBinary(); - if (executable_ != nullptr) { - loader_->DestroyExecutable(executable_); - } - delete kernels_; - amd::hsa::loader::Loader::Destroy(loader_); + } +#endif // !defined(WITH_LIGHTNING_COMPILER) + releaseClBinary(); + if (executable_ != nullptr) { + loader_->DestroyExecutable(executable_); + } + delete kernels_; + amd::hsa::loader::Loader::Destroy(loader_); } -bool -HSAILProgram::initBuild(amd::option::Options *options) -{ - if (!device::Program::initBuild(options)) { - return false; - } - - const char* devName = dev().hwInfo()->machineTarget_; - options->setPerBuildInfo( - (devName && (devName[0] != '\0')) ? 
devName : "gpu", - clBinary()->getEncryptCode(), true); - - // Elf Binary setup - std::string outFileName; - - // true means fsail required - clBinary()->init(options, true); - if (options->isDumpFlagSet(amd::option::DUMP_BIF)) { - outFileName = options->getDumpFileName(".bin"); - } - - if (!clBinary()->setElfOut(LP64_SWITCH(ELFCLASS32, ELFCLASS64), - (outFileName.size() > 0) ? outFileName.c_str() : nullptr)) { - LogError("Setup elf out for gpu failed"); - return false; - } - return true; -} - -bool -HSAILProgram::finiBuild(bool isBuildGood) -{ - clBinary()->resetElfOut(); - clBinary()->resetElfIn(); - - if (!isBuildGood) { - // Prevent the encrypted binary form leaking out - clBinary()->setBinary(nullptr, 0); - } - - return device::Program::finiBuild(isBuildGood); -} - -bool -HSAILProgram::linkImpl( - const std::vector &inputPrograms, - amd::option::Options *options, - bool createLibrary) -{ -#if defined(WITH_LIGHTNING_COMPILER) - assert(!"Should not reach here"); +bool HSAILProgram::initBuild(amd::option::Options* options) { + if (!device::Program::initBuild(options)) { return false; -#else // !defined(WITH_LIGHTNING_COMPILER) - std::vector::const_iterator it - = inputPrograms.begin(); - std::vector::const_iterator itEnd - = inputPrograms.end(); - acl_error errorCode; + } - // For each program we need to extract the LLVMIR and create - // aclBinary for each - std::vector binaries_to_link; + const char* devName = dev().hwInfo()->machineTarget_; + options->setPerBuildInfo((devName && (devName[0] != '\0')) ? 
devName : "gpu", + clBinary()->getEncryptCode(), true); - for (size_t i = 0; it != itEnd; ++it, ++i) { - HSAILProgram *program = (HSAILProgram *)*it; - // Check if the program was created with clCreateProgramWIthBinary - binary_t binary = program->binary(); - if ((binary.first != nullptr) && (binary.second > 0)) { - // Binary already exists -- we can also check if there is no - // opencl source code - // Need to check if LLVMIR exists in the binary - // If LLVMIR does not exist then is it valid - // We need to pull out all the compiled kernels - // We cannot do this at present because we need at least - // Hsail text to pull the kernels oout - void *mem = const_cast(binary.first); - binaryElf_ = aclReadFromMem(mem, binary.second, &errorCode); - if (errorCode != ACL_SUCCESS) { - LogWarning("Error while linking : Could not read from raw binary"); - return false; - } - } - // At this stage each HSAILProgram contains a valid binary_elf - // Check if LLVMIR is in the binary - // @TODO - Memory leak , cannot free this buffer - // need to fix this.. 
File EPR on compiler library - size_t llvmirSize = 0; - const void *llvmirText = aclExtractSection(dev().compiler(), - binaryElf_, &llvmirSize, aclLLVMIR, &errorCode); - if (errorCode != ACL_SUCCESS) { - bool spirv = false; - size_t boolSize = sizeof(bool); - errorCode = aclQueryInfo(dev().compiler(), binaryElf_, - RT_CONTAINS_SPIRV, nullptr, &spirv, &boolSize); - if (errorCode != ACL_SUCCESS) { - spirv = false; - } - if (spirv) { - errorCode = aclCompile(dev().compiler(), binaryElf_, - options->origOptionStr.c_str(), ACL_TYPE_SPIRV_BINARY, - ACL_TYPE_LLVMIR_BINARY, nullptr); - buildLog_ += aclGetCompilerLog(dev().compiler()); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error while linking: Could not load SPIR-V" ; - return false; - } - } else { - buildLog_ +="Error while linking : \ - Invalid binary (Missing LLVMIR section)" ; - return false; - } - } - // Create a new aclBinary for each LLVMIR and save it in a list - aclBIFVersion ver = aclBinaryVersion(binaryElf_); - aclBinary *bin = aclCreateFromBinary(binaryElf_, ver); - binaries_to_link.push_back(bin); - } + // Elf Binary setup + std::string outFileName; - errorCode = aclLink(dev().compiler(), - binaries_to_link[0], binaries_to_link.size() - 1, - binaries_to_link.size() > 1 ? &binaries_to_link[1] : NULL, - ACL_TYPE_LLVMIR_BINARY, "-create-library", NULL); - if (errorCode != ACL_SUCCESS) { - buildLog_ += aclGetCompilerLog(dev().compiler()); - buildLog_ +="Error while linking : aclLink failed" ; - return false; - } - // Store the newly linked aclBinary for this program. 
- binaryElf_ = binaries_to_link[0]; - // Free all the other aclBinaries - for (size_t i = 1; i < binaries_to_link.size(); i++) { - aclBinaryFini(binaries_to_link[i]); - } - if (createLibrary) { - saveBinaryAndSetType(TYPE_LIBRARY); - buildLog_ += aclGetCompilerLog(dev().compiler()); - return true; - } - // Now call linkImpl with the new options - return linkImpl(options); -#endif // !defined(WITH_LIGHTNING_COMPILER) + // true means fsail required + clBinary()->init(options, true); + if (options->isDumpFlagSet(amd::option::DUMP_BIF)) { + outFileName = options->getDumpFileName(".bin"); + } + + if (!clBinary()->setElfOut(LP64_SWITCH(ELFCLASS32, ELFCLASS64), + (outFileName.size() > 0) ? outFileName.c_str() : nullptr)) { + LogError("Setup elf out for gpu failed"); + return false; + } + return true; } -aclType -HSAILProgram::getCompilationStagesFromBinary(std::vector& completeStages, bool& needOptionsCheck) -{ +bool HSAILProgram::finiBuild(bool isBuildGood) { + clBinary()->resetElfOut(); + clBinary()->resetElfIn(); + + if (!isBuildGood) { + // Prevent the encrypted binary form leaking out + clBinary()->setBinary(nullptr, 0); + } + + return device::Program::finiBuild(isBuildGood); +} + +bool HSAILProgram::linkImpl(const std::vector& inputPrograms, + amd::option::Options* options, bool createLibrary) { #if defined(WITH_LIGHTNING_COMPILER) - assert(!"Should not reach here"); + assert(!"Should not reach here"); + return false; +#else // !defined(WITH_LIGHTNING_COMPILER) + std::vector::const_iterator it = inputPrograms.begin(); + std::vector::const_iterator itEnd = inputPrograms.end(); + acl_error errorCode; + + // For each program we need to extract the LLVMIR and create + // aclBinary for each + std::vector binaries_to_link; + + for (size_t i = 0; it != itEnd; ++it, ++i) { + HSAILProgram* program = (HSAILProgram*)*it; + // Check if the program was created with clCreateProgramWIthBinary + binary_t binary = program->binary(); + if ((binary.first != nullptr) && (binary.second 
> 0)) { + // Binary already exists -- we can also check if there is no + // opencl source code + // Need to check if LLVMIR exists in the binary + // If LLVMIR does not exist then is it valid + // We need to pull out all the compiled kernels + // We cannot do this at present because we need at least + // Hsail text to pull the kernels oout + void* mem = const_cast(binary.first); + binaryElf_ = aclReadFromMem(mem, binary.second, &errorCode); + if (errorCode != ACL_SUCCESS) { + LogWarning("Error while linking : Could not read from raw binary"); + return false; + } + } + // At this stage each HSAILProgram contains a valid binary_elf + // Check if LLVMIR is in the binary + // @TODO - Memory leak , cannot free this buffer + // need to fix this.. File EPR on compiler library + size_t llvmirSize = 0; + const void* llvmirText = + aclExtractSection(dev().compiler(), binaryElf_, &llvmirSize, aclLLVMIR, &errorCode); + if (errorCode != ACL_SUCCESS) { + bool spirv = false; + size_t boolSize = sizeof(bool); + errorCode = + aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_SPIRV, nullptr, &spirv, &boolSize); + if (errorCode != ACL_SUCCESS) { + spirv = false; + } + if (spirv) { + errorCode = aclCompile(dev().compiler(), binaryElf_, options->origOptionStr.c_str(), + ACL_TYPE_SPIRV_BINARY, ACL_TYPE_LLVMIR_BINARY, nullptr); + buildLog_ += aclGetCompilerLog(dev().compiler()); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error while linking: Could not load SPIR-V"; + return false; + } + } else { + buildLog_ += + "Error while linking : \ + Invalid binary (Missing LLVMIR section)"; + return false; + } + } + // Create a new aclBinary for each LLVMIR and save it in a list + aclBIFVersion ver = aclBinaryVersion(binaryElf_); + aclBinary* bin = aclCreateFromBinary(binaryElf_, ver); + binaries_to_link.push_back(bin); + } + + errorCode = aclLink(dev().compiler(), binaries_to_link[0], binaries_to_link.size() - 1, + binaries_to_link.size() > 1 ? 
&binaries_to_link[1] : NULL, + ACL_TYPE_LLVMIR_BINARY, "-create-library", NULL); + if (errorCode != ACL_SUCCESS) { + buildLog_ += aclGetCompilerLog(dev().compiler()); + buildLog_ += "Error while linking : aclLink failed"; + return false; + } + // Store the newly linked aclBinary for this program. + binaryElf_ = binaries_to_link[0]; + // Free all the other aclBinaries + for (size_t i = 1; i < binaries_to_link.size(); i++) { + aclBinaryFini(binaries_to_link[i]); + } + if (createLibrary) { + saveBinaryAndSetType(TYPE_LIBRARY); + buildLog_ += aclGetCompilerLog(dev().compiler()); + return true; + } + // Now call linkImpl with the new options + return linkImpl(options); +#endif // !defined(WITH_LIGHTNING_COMPILER) +} + +aclType HSAILProgram::getCompilationStagesFromBinary(std::vector& completeStages, + bool& needOptionsCheck) { +#if defined(WITH_LIGHTNING_COMPILER) + assert(!"Should not reach here"); + return ACL_TYPE_DEFAULT; +#else // !defined(WITH_LIGHTNING_COMPILER) + acl_error errorCode; + size_t secSize = 0; + completeStages.clear(); + aclType from = ACL_TYPE_DEFAULT; + needOptionsCheck = true; + size_t boolSize = sizeof(bool); + // Checking llvmir in .llvmir section + bool containsSpirv = true; + errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_SPIRV, nullptr, &containsSpirv, + &boolSize); + if (errorCode != ACL_SUCCESS) { + containsSpirv = false; + } + if (containsSpirv) { + completeStages.push_back(from); + from = ACL_TYPE_SPIRV_BINARY; + } + bool containsSpirText = true; + errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_SPIR, nullptr, + &containsSpirText, &boolSize); + if (errorCode != ACL_SUCCESS) { + containsSpirText = false; + } + if (containsSpirText) { + completeStages.push_back(from); + from = ACL_TYPE_SPIR_BINARY; + } + bool containsLlvmirText = true; + errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_LLVMIR, nullptr, + &containsLlvmirText, &boolSize); + if (errorCode != ACL_SUCCESS) { + 
containsLlvmirText = false; + } + // Checking compile & link options in .comment section + bool containsOpts = true; + errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_OPTIONS, nullptr, + &containsOpts, &boolSize); + if (errorCode != ACL_SUCCESS) { + containsOpts = false; + } + if (containsLlvmirText && containsOpts) { + completeStages.push_back(from); + from = ACL_TYPE_LLVMIR_BINARY; + } + // Checking HSAIL in .cg section + bool containsHsailText = true; + errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_HSAIL, nullptr, + &containsHsailText, &boolSize); + if (errorCode != ACL_SUCCESS) { + containsHsailText = false; + } + // Checking BRIG sections + bool containsBrig = true; + errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_BRIG, nullptr, &containsBrig, + &boolSize); + if (errorCode != ACL_SUCCESS) { + containsBrig = false; + } + if (containsBrig) { + completeStages.push_back(from); + from = ACL_TYPE_HSAIL_BINARY; + } else if (containsHsailText) { + completeStages.push_back(from); + from = ACL_TYPE_HSAIL_TEXT; + } + // Checking Loader Map symbol from CG section + bool containsLoaderMap = true; + errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_LOADER_MAP, nullptr, + &containsLoaderMap, &boolSize); + if (errorCode != ACL_SUCCESS) { + containsLoaderMap = false; + } + if (containsLoaderMap) { + completeStages.push_back(from); + from = ACL_TYPE_CG; + } + // Checking ISA in .text section + bool containsShaderIsa = true; + errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_ISA, nullptr, + &containsShaderIsa, &boolSize); + if (errorCode != ACL_SUCCESS) { + containsShaderIsa = false; + } + if (containsShaderIsa) { + completeStages.push_back(from); + from = ACL_TYPE_ISA; + } + std::string sCurOptions = compileOptions_ + linkOptions_; + amd::option::Options curOptions; + if (!amd::option::parseAllOptions(sCurOptions, curOptions)) { + buildLog_ += curOptions.optionsLog(); + 
LogError("Parsing compile options failed."); return ACL_TYPE_DEFAULT; -#else // !defined(WITH_LIGHTNING_COMPILER) - acl_error errorCode; - size_t secSize = 0; - completeStages.clear(); - aclType from = ACL_TYPE_DEFAULT; - needOptionsCheck = true; - size_t boolSize = sizeof(bool); - // Checking llvmir in .llvmir section - bool containsSpirv = true; - errorCode = aclQueryInfo(dev().compiler(), binaryElf_, - RT_CONTAINS_SPIRV, nullptr, &containsSpirv, &boolSize); - if (errorCode != ACL_SUCCESS) { - containsSpirv = false; - } - if (containsSpirv) { - completeStages.push_back(from); - from = ACL_TYPE_SPIRV_BINARY; - } - bool containsSpirText = true; - errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_SPIR, nullptr, &containsSpirText, &boolSize); - if (errorCode != ACL_SUCCESS) { - containsSpirText = false; - } - if (containsSpirText) { - completeStages.push_back(from); - from = ACL_TYPE_SPIR_BINARY; - } - bool containsLlvmirText = true; - errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_LLVMIR, nullptr, &containsLlvmirText, &boolSize); - if (errorCode != ACL_SUCCESS) { - containsLlvmirText = false; - } - // Checking compile & link options in .comment section - bool containsOpts = true; - errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_OPTIONS, nullptr, &containsOpts, &boolSize); - if (errorCode != ACL_SUCCESS) { - containsOpts = false; - } - if (containsLlvmirText && containsOpts) { - completeStages.push_back(from); - from = ACL_TYPE_LLVMIR_BINARY; - } - // Checking HSAIL in .cg section - bool containsHsailText = true; - errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_HSAIL, nullptr, &containsHsailText, &boolSize); - if (errorCode != ACL_SUCCESS) { - containsHsailText = false; - } - // Checking BRIG sections - bool containsBrig = true; - errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_BRIG, nullptr, &containsBrig, &boolSize); - if (errorCode != ACL_SUCCESS) { - containsBrig = 
false; - } - if (containsBrig) { - completeStages.push_back(from); - from = ACL_TYPE_HSAIL_BINARY; - } else if (containsHsailText) { - completeStages.push_back(from); - from = ACL_TYPE_HSAIL_TEXT; - } - // Checking Loader Map symbol from CG section - bool containsLoaderMap = true; - errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_LOADER_MAP, nullptr, &containsLoaderMap, &boolSize); - if (errorCode != ACL_SUCCESS) { - containsLoaderMap = false; - } - if (containsLoaderMap) { - completeStages.push_back(from); - from = ACL_TYPE_CG; - } - // Checking ISA in .text section - bool containsShaderIsa = true; - errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_ISA, nullptr, &containsShaderIsa, &boolSize); - if (errorCode != ACL_SUCCESS) { - containsShaderIsa = false; - } - if (containsShaderIsa) { - completeStages.push_back(from); - from = ACL_TYPE_ISA; - } - std::string sCurOptions = compileOptions_ + linkOptions_; - amd::option::Options curOptions; - if (!amd::option::parseAllOptions(sCurOptions, curOptions)) { - buildLog_ += curOptions.optionsLog(); - LogError("Parsing compile options failed."); - return ACL_TYPE_DEFAULT; - } - switch (from) { + } + switch (from) { // compile from HSAIL text, no matter prev. 
stages and options case ACL_TYPE_HSAIL_TEXT: - needOptionsCheck = false; - break; + needOptionsCheck = false; + break; case ACL_TYPE_HSAIL_BINARY: - // do not check options, if LLVMIR is absent or might be absent or options are absent - if (!curOptions.oVariables->BinLLVMIR || !containsLlvmirText || !containsOpts) { - needOptionsCheck = false; - } - break; + // do not check options, if LLVMIR is absent or might be absent or options are absent + if (!curOptions.oVariables->BinLLVMIR || !containsLlvmirText || !containsOpts) { + needOptionsCheck = false; + } + break; case ACL_TYPE_CG: case ACL_TYPE_ISA: - // do not check options, if LLVMIR is absent or might be absent or options are absent - if (!curOptions.oVariables->BinLLVMIR || !containsLlvmirText || !containsOpts) { - needOptionsCheck = false; - } - // do not check options, if BRIG is absent or might be absent or LoaderMap is absent - if (!curOptions.oVariables->BinCG || !containsBrig || !containsLoaderMap) { - needOptionsCheck = false; - } - break; + // do not check options, if LLVMIR is absent or might be absent or options are absent + if (!curOptions.oVariables->BinLLVMIR || !containsLlvmirText || !containsOpts) { + needOptionsCheck = false; + } + // do not check options, if BRIG is absent or might be absent or LoaderMap is absent + if (!curOptions.oVariables->BinCG || !containsBrig || !containsLoaderMap) { + needOptionsCheck = false; + } + break; // recompilation might be needed case ACL_TYPE_LLVMIR_BINARY: case ACL_TYPE_DEFAULT: default: - break; - } - return from; -#endif // !defined(WITH_LIGHTNING_COMPILER) + break; + } + return from; +#endif // !defined(WITH_LIGHTNING_COMPILER) } -aclType -HSAILProgram::getNextCompilationStageFromBinary(amd::option::Options* options) { +aclType HSAILProgram::getNextCompilationStageFromBinary(amd::option::Options* options) { #if defined(WITH_LIGHTNING_COMPILER) - assert(!"Should not reach here"); - return ACL_TYPE_DEFAULT; -#else // !defined(WITH_LIGHTNING_COMPILER) - 
aclType continueCompileFrom = ACL_TYPE_DEFAULT; - binary_t binary = this->binary(); - // If the binary already exists - if ((binary.first != nullptr) && (binary.second > 0)) { - void *mem = const_cast(binary.first); - acl_error errorCode; - binaryElf_ = aclReadFromMem(mem, binary.second, &errorCode); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error: Reading the binary from memory failed.\n"; - return continueCompileFrom; - } - // Calculate the next stage to compile from, based on sections in binaryElf_; - // No any validity checks here - std::vector completeStages; - bool needOptionsCheck = true; - continueCompileFrom = getCompilationStagesFromBinary(completeStages, needOptionsCheck); - // Saving binary in the interface class, - // which also load compile & link options from binary - setBinary(static_cast(mem), binary.second); - if (!options || !needOptionsCheck) { - return continueCompileFrom; - } - bool recompile = false; - switch (continueCompileFrom) { + assert(!"Should not reach here"); + return ACL_TYPE_DEFAULT; +#else // !defined(WITH_LIGHTNING_COMPILER) + aclType continueCompileFrom = ACL_TYPE_DEFAULT; + binary_t binary = this->binary(); + // If the binary already exists + if ((binary.first != nullptr) && (binary.second > 0)) { + void* mem = const_cast(binary.first); + acl_error errorCode; + binaryElf_ = aclReadFromMem(mem, binary.second, &errorCode); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error: Reading the binary from memory failed.\n"; + return continueCompileFrom; + } + // Calculate the next stage to compile from, based on sections in binaryElf_; + // No any validity checks here + std::vector completeStages; + bool needOptionsCheck = true; + continueCompileFrom = getCompilationStagesFromBinary(completeStages, needOptionsCheck); + // Saving binary in the interface class, + // which also load compile & link options from binary + setBinary(static_cast(mem), binary.second); + if (!options || !needOptionsCheck) { + return 
continueCompileFrom; + } + bool recompile = false; + switch (continueCompileFrom) { case ACL_TYPE_HSAIL_BINARY: case ACL_TYPE_CG: case ACL_TYPE_ISA: { - // Compare options loaded from binary with current ones, recompile if differ; - // If compile options are absent in binary, do not compare and recompile - if (compileOptions_.empty()) - break; - const oclBIFSymbolStruct* symbol = findBIF30SymStruct(symOpenclCompilerOptions); - assert(symbol && "symbol not found"); - std::string symName = std::string(symbol->str[bif::PRE]) + std::string(symbol->str[bif::POST]); - size_t symSize = 0; - const void *opts = aclExtractSymbol(dev().compiler(), - binaryElf_, &symSize, aclCOMMENT, symName.c_str(), &errorCode); - if (errorCode != ACL_SUCCESS) { - recompile = true; - break; - } - std::string sBinOptions = std::string((char*)opts, symSize); - std::string sCurOptions = compileOptions_ + linkOptions_; - amd::option::Options curOptions, binOptions; - if (!amd::option::parseAllOptions(sBinOptions, binOptions)) { - buildLog_ += binOptions.optionsLog(); - LogError("Parsing compile options from binary failed."); - return ACL_TYPE_DEFAULT; - } - if (!amd::option::parseAllOptions(sCurOptions, curOptions)) { - buildLog_ += curOptions.optionsLog(); - LogError("Parsing compile options failed."); - return ACL_TYPE_DEFAULT; - } - if (!curOptions.equals(binOptions)) { - recompile = true; - } + // Compare options loaded from binary with current ones, recompile if differ; + // If compile options are absent in binary, do not compare and recompile + if (compileOptions_.empty()) break; + const oclBIFSymbolStruct* symbol = findBIF30SymStruct(symOpenclCompilerOptions); + assert(symbol && "symbol not found"); + std::string symName = + std::string(symbol->str[bif::PRE]) + std::string(symbol->str[bif::POST]); + size_t symSize = 0; + const void* opts = aclExtractSymbol(dev().compiler(), binaryElf_, &symSize, aclCOMMENT, + symName.c_str(), &errorCode); + if (errorCode != ACL_SUCCESS) { + recompile = 
true; break; + } + std::string sBinOptions = std::string((char*)opts, symSize); + std::string sCurOptions = compileOptions_ + linkOptions_; + amd::option::Options curOptions, binOptions; + if (!amd::option::parseAllOptions(sBinOptions, binOptions)) { + buildLog_ += binOptions.optionsLog(); + LogError("Parsing compile options from binary failed."); + return ACL_TYPE_DEFAULT; + } + if (!amd::option::parseAllOptions(sCurOptions, curOptions)) { + buildLog_ += curOptions.optionsLog(); + LogError("Parsing compile options failed."); + return ACL_TYPE_DEFAULT; + } + if (!curOptions.equals(binOptions)) { + recompile = true; + } + break; } default: + break; + } + if (recompile) { + while (!completeStages.empty()) { + continueCompileFrom = completeStages.back(); + if (continueCompileFrom == ACL_TYPE_SPIRV_BINARY || + continueCompileFrom == ACL_TYPE_LLVMIR_BINARY || + continueCompileFrom == ACL_TYPE_SPIR_BINARY || + continueCompileFrom == ACL_TYPE_DEFAULT) { break; - } - if (recompile) { - while (!completeStages.empty()) { - continueCompileFrom = completeStages.back(); - if (continueCompileFrom == ACL_TYPE_SPIRV_BINARY || - continueCompileFrom == ACL_TYPE_LLVMIR_BINARY || - continueCompileFrom == ACL_TYPE_SPIR_BINARY || - continueCompileFrom == ACL_TYPE_DEFAULT) { - break; - } - completeStages.pop_back(); - } + } + completeStages.pop_back(); } } - return continueCompileFrom; -#endif // !defined(WITH_LIGHTNING_COMPILER) + } + return continueCompileFrom; +#endif // !defined(WITH_LIGHTNING_COMPILER) } -inline static std::vector -splitSpaceSeparatedString(char *str) -{ +inline static std::vector splitSpaceSeparatedString(char* str) { std::string s(str); std::stringstream ss(s); std::istream_iterator beg(ss), end; @@ -566,26 +540,23 @@ splitSpaceSeparatedString(char *str) return vec; } -bool -HSAILProgram::linkImpl(amd::option::Options* options) -{ +bool HSAILProgram::linkImpl(amd::option::Options* options) { #if defined(WITH_LIGHTNING_COMPILER) - assert(!"Should not reach here"); 
- return false; -#else // !defined(WITH_LIGHTNING_COMPILER) - acl_error errorCode; - aclType continueCompileFrom = ACL_TYPE_LLVMIR_BINARY; - bool finalize = true; - bool hsaLoad = true; - internal_ = (compileOptions_.find("-cl-internal-kernel") != - std::string::npos) ? true : false; + assert(!"Should not reach here"); + return false; +#else // !defined(WITH_LIGHTNING_COMPILER) + acl_error errorCode; + aclType continueCompileFrom = ACL_TYPE_LLVMIR_BINARY; + bool finalize = true; + bool hsaLoad = true; + internal_ = (compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false; - // If !binaryElf_ then program must have been created using clCreateProgramWithBinary - if (!binaryElf_) { - continueCompileFrom = getNextCompilationStageFromBinary(options); - } - switch (continueCompileFrom) { + // If !binaryElf_ then program must have been created using clCreateProgramWithBinary + if (!binaryElf_) { + continueCompileFrom = getNextCompilationStageFromBinary(options); + } + switch (continueCompileFrom) { case ACL_TYPE_SPIRV_BINARY: case ACL_TYPE_SPIR_BINARY: // Compilation from ACL_TYPE_LLVMIR_BINARY to ACL_TYPE_CG in cases: @@ -600,1115 +571,1088 @@ HSAILProgram::linkImpl(amd::option::Options* options) // Compilation from ACL_TYPE_HSAIL_TEXT to ACL_TYPE_CG in cases: // 1. 
if the program is created with binary and contains only hsail text case ACL_TYPE_HSAIL_TEXT: { - std::string curOptions = options->origOptionStr + hsailOptions(options); - errorCode = aclCompile(dev().compiler(), binaryElf_, - curOptions.c_str(), continueCompileFrom, ACL_TYPE_CG, nullptr); - buildLog_ += aclGetCompilerLog(dev().compiler()); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error: BRIG code generation failed.\n"; - return false; - } - break; + std::string curOptions = options->origOptionStr + hsailOptions(options); + errorCode = aclCompile(dev().compiler(), binaryElf_, curOptions.c_str(), continueCompileFrom, + ACL_TYPE_CG, nullptr); + buildLog_ += aclGetCompilerLog(dev().compiler()); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error: BRIG code generation failed.\n"; + return false; + } + break; } case ACL_TYPE_CG: - break; + break; case ACL_TYPE_ISA: - finalize = false; - break; + finalize = false; + break; default: - buildLog_ += "Error: The binary is incorrect or incomplete. Finalization to ISA couldn't be performed.\n"; - return false; + buildLog_ += + "Error: The binary is incorrect or incomplete. Finalization to ISA couldn't be " + "performed.\n"; + return false; + } + if (finalize) { + std::string fin_options(options->origOptionStr + hsailOptions(options)); + // Append an option so that we can selectively enable a SCOption on CZ + // whenever IOMMUv2 is enabled. + if (dev().settings().svmFineGrainSystem_) { + fin_options.append(" -sc-xnack-iommu"); } - if (finalize) { - std::string fin_options(options->origOptionStr + hsailOptions(options)); - // Append an option so that we can selectively enable a SCOption on CZ - // whenever IOMMUv2 is enabled. 
- if (dev().settings().svmFineGrainSystem_) { - fin_options.append(" -sc-xnack-iommu"); - } - errorCode = aclCompile(dev().compiler(), binaryElf_, - fin_options.c_str(), ACL_TYPE_CG, ACL_TYPE_ISA, nullptr); - buildLog_ += aclGetCompilerLog(dev().compiler()); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error: BRIG finalization to ISA failed.\n"; - return false; - } - } - // ACL_TYPE_CG stage is not performed for offline compilation - hsa_agent_t agent; - agent.handle = 1; - if (hsaLoad) { - executable_ = loader_->CreateExecutable(HSA_PROFILE_FULL, NULL); - if (executable_ == nullptr) { - buildLog_ += "Error: Executable for AMD HSA Code Object isn't created.\n"; - return false; - } - size_t size = 0; - hsa_code_object_t code_object; - code_object.handle = reinterpret_cast(aclExtractSection(dev().compiler(), binaryElf_, &size, aclTEXT, &errorCode)); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error: Extracting AMD HSA Code Object from binary failed.\n"; - return false; - } - hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr); - if (status != HSA_STATUS_SUCCESS) { - buildLog_ += "Error: AMD HSA Code Object loading failed.\n"; - return false; - } - status = executable_->Freeze(nullptr); - if (status != HSA_STATUS_SUCCESS) { - buildLog_ += "Error: AMD HSA Code Object freeze failed.\n"; - return false; - } - } - size_t kernelNamesSize = 0; - errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr, nullptr, &kernelNamesSize); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error: Querying of kernel names size from the binary failed.\n"; - return false; - } - if (kernelNamesSize > 0) { - char* kernelNames = new char[kernelNamesSize]; - errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr, kernelNames, &kernelNamesSize); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error: Querying of kernel names from the binary failed.\n"; - delete kernelNames; - return false; - } - std::vector 
vKernels = splitSpaceSeparatedString(kernelNames); - delete kernelNames; - std::vector::iterator it = vKernels.begin(); - bool dynamicParallelism = false; - for (it; it != vKernels.end(); ++it) { - std::string kernelName(*it); - std::string openclKernelName = device::Kernel::openclMangledName(kernelName); - - HSAILKernel *aKernel = new HSAILKernel(kernelName, this, options->origOptionStr + hsailOptions(options)); - kernels()[kernelName] = aKernel; - - amd::hsa::loader::Symbol *sym = executable_->GetSymbol(openclKernelName.c_str(), &agent); - if (!sym) { - buildLog_ += "Error: Getting kernel ISA code symbol '" + openclKernelName + - "' from AMD HSA Code Object failed. Kernel initialization failed.\n"; - return false; - } - if (!aKernel->init(sym, false)) { - buildLog_ += "Error: Kernel '" + openclKernelName + "' initialization failed.\n"; - return false; - } - buildLog_ += aKernel->buildLog(); - aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize); - dynamicParallelism |= aKernel->dynamicParallelism(); - // Find max scratch regs used in the program. 
It's used for scratch buffer preallocation - // with dynamic parallelism, since runtime doesn't know which child kernel will be called - maxScratchRegs_ = std::max(static_cast(aKernel->workGroupInfo()->scratchRegs_), maxScratchRegs_); - } - // Allocate kernel table for device enqueuing - if (!isNull() && dynamicParallelism && !allocKernelTable()) { - return false; - } - } - // Save the binary in the interface class - saveBinaryAndSetType(TYPE_EXECUTABLE); + errorCode = aclCompile(dev().compiler(), binaryElf_, fin_options.c_str(), ACL_TYPE_CG, + ACL_TYPE_ISA, nullptr); buildLog_ += aclGetCompilerLog(dev().compiler()); - return true; -#endif // !defined(WITH_LIGHTNING_COMPILER) + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error: BRIG finalization to ISA failed.\n"; + return false; + } + } + // ACL_TYPE_CG stage is not performed for offline compilation + hsa_agent_t agent; + agent.handle = 1; + if (hsaLoad) { + executable_ = loader_->CreateExecutable(HSA_PROFILE_FULL, NULL); + if (executable_ == nullptr) { + buildLog_ += "Error: Executable for AMD HSA Code Object isn't created.\n"; + return false; + } + size_t size = 0; + hsa_code_object_t code_object; + code_object.handle = reinterpret_cast( + aclExtractSection(dev().compiler(), binaryElf_, &size, aclTEXT, &errorCode)); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error: Extracting AMD HSA Code Object from binary failed.\n"; + return false; + } + hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr); + if (status != HSA_STATUS_SUCCESS) { + buildLog_ += "Error: AMD HSA Code Object loading failed.\n"; + return false; + } + status = executable_->Freeze(nullptr); + if (status != HSA_STATUS_SUCCESS) { + buildLog_ += "Error: AMD HSA Code Object freeze failed.\n"; + return false; + } + } + size_t kernelNamesSize = 0; + errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr, nullptr, + &kernelNamesSize); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error: 
Querying of kernel names size from the binary failed.\n"; + return false; + } + if (kernelNamesSize > 0) { + char* kernelNames = new char[kernelNamesSize]; + errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr, kernelNames, + &kernelNamesSize); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error: Querying of kernel names from the binary failed.\n"; + delete kernelNames; + return false; + } + std::vector vKernels = splitSpaceSeparatedString(kernelNames); + delete kernelNames; + std::vector::iterator it = vKernels.begin(); + bool dynamicParallelism = false; + for (it; it != vKernels.end(); ++it) { + std::string kernelName(*it); + std::string openclKernelName = device::Kernel::openclMangledName(kernelName); + + HSAILKernel* aKernel = + new HSAILKernel(kernelName, this, options->origOptionStr + hsailOptions(options)); + kernels()[kernelName] = aKernel; + + amd::hsa::loader::Symbol* sym = executable_->GetSymbol(openclKernelName.c_str(), &agent); + if (!sym) { + buildLog_ += "Error: Getting kernel ISA code symbol '" + openclKernelName + + "' from AMD HSA Code Object failed. Kernel initialization failed.\n"; + return false; + } + if (!aKernel->init(sym, false)) { + buildLog_ += "Error: Kernel '" + openclKernelName + "' initialization failed.\n"; + return false; + } + buildLog_ += aKernel->buildLog(); + aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize); + dynamicParallelism |= aKernel->dynamicParallelism(); + // Find max scratch regs used in the program. 
It's used for scratch buffer preallocation + // with dynamic parallelism, since runtime doesn't know which child kernel will be called + maxScratchRegs_ = + std::max(static_cast(aKernel->workGroupInfo()->scratchRegs_), maxScratchRegs_); + } + // Allocate kernel table for device enqueuing + if (!isNull() && dynamicParallelism && !allocKernelTable()) { + return false; + } + } + // Save the binary in the interface class + saveBinaryAndSetType(TYPE_EXECUTABLE); + buildLog_ += aclGetCompilerLog(dev().compiler()); + return true; +#endif // !defined(WITH_LIGHTNING_COMPILER) } -bool -HSAILProgram::createBinary(amd::option::Options *options) -{ - return true; -} +bool HSAILProgram::createBinary(amd::option::Options* options) { return true; } -bool -HSAILProgram::initClBinary() -{ +bool HSAILProgram::initClBinary() { + if (clBinary_ == nullptr) { + clBinary_ = new ClBinaryHsa(static_cast(device())); if (clBinary_ == nullptr) { - clBinary_ = new ClBinaryHsa(static_cast(device())); - if (clBinary_ == nullptr) { - return false; - } + return false; } - return true; + } + return true; } -void -HSAILProgram::releaseClBinary() -{ - if (clBinary_ != nullptr) { - delete clBinary_; - clBinary_ = nullptr; - } +void HSAILProgram::releaseClBinary() { + if (clBinary_ != nullptr) { + delete clBinary_; + clBinary_ = nullptr; + } } -std::string -HSAILProgram::hsailOptions(amd::option::Options* options) -{ - std::string hsailOptions; +std::string HSAILProgram::hsailOptions(amd::option::Options* options) { + std::string hsailOptions; - hsailOptions.append(" -D__AMD__=1"); + hsailOptions.append(" -D__AMD__=1"); - hsailOptions.append(" -D__").append(device().info().name_).append("__=1"); - hsailOptions.append(" -D__").append(device().info().name_).append("=1"); + hsailOptions.append(" -D__").append(device().info().name_).append("__=1"); + hsailOptions.append(" -D__").append(device().info().name_).append("=1"); - int major, minor; - ::sscanf(device().info().version_, "OpenCL %d.%d ", &major, 
&minor); + int major, minor; + ::sscanf(device().info().version_, "OpenCL %d.%d ", &major, &minor); #ifdef WITH_LIGHTNING_COMPILER - std::stringstream ss; - ss << " -D__OPENCL_VERSION__=" << (major * 100 + minor * 10); - hsailOptions.append(ss.str()); + std::stringstream ss; + ss << " -D__OPENCL_VERSION__=" << (major * 100 + minor * 10); + hsailOptions.append(ss.str()); #endif - if (device().info().imageSupport_ && options->oVariables->ImageSupport) { - hsailOptions.append(" -D__IMAGE_SUPPORT__=1"); - } + if (device().info().imageSupport_ && options->oVariables->ImageSupport) { + hsailOptions.append(" -D__IMAGE_SUPPORT__=1"); + } - // Set options for the standard device specific options - // All our devices support these options now - if (dev().settings().reportFMAF_) { - hsailOptions.append(" -DFP_FAST_FMAF=1"); - } - if (dev().settings().reportFMA_) { - hsailOptions.append(" -DFP_FAST_FMA=1"); - } + // Set options for the standard device specific options + // All our devices support these options now + if (dev().settings().reportFMAF_) { + hsailOptions.append(" -DFP_FAST_FMAF=1"); + } + if (dev().settings().reportFMA_) { + hsailOptions.append(" -DFP_FAST_FMA=1"); + } - uint clcStd = (options->oVariables->CLStd[2] - '0') * 100 - + (options->oVariables->CLStd[4] - '0') * 10; + uint clcStd = + (options->oVariables->CLStd[2] - '0') * 100 + (options->oVariables->CLStd[4] - '0') * 10; - if (clcStd >= 200) { - std::stringstream opts; - //Add only for CL2.0 and later - opts << " -D" << "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE=" - << device().info().maxGlobalVariableSize_; - hsailOptions.append(opts.str()); - } + if (clcStd >= 200) { + std::stringstream opts; + // Add only for CL2.0 and later + opts << " -D" + << "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE=" << device().info().maxGlobalVariableSize_; + hsailOptions.append(opts.str()); + } #if !defined(WITH_LIGHTNING_COMPILER) - if (!dev().settings().singleFpDenorm_) { - hsailOptions.append(" -cl-denorms-are-zero"); - } -#endif // 
!defined(WITH_LIGHTNING_COMPILER) + if (!dev().settings().singleFpDenorm_) { + hsailOptions.append(" -cl-denorms-are-zero"); + } +#endif // !defined(WITH_LIGHTNING_COMPILER) - // Check if the host is 64 bit or 32 bit - LP64_ONLY(hsailOptions.append(" -m64")); + // Check if the host is 64 bit or 32 bit + LP64_ONLY(hsailOptions.append(" -m64")); - // Tokenize the extensions string into a vector of strings - std::istringstream istrstr(device().info().extensions_); - std::istream_iterator sit(istrstr), end; - std::vector extensions(sit, end); + // Tokenize the extensions string into a vector of strings + std::istringstream istrstr(device().info().extensions_); + std::istream_iterator sit(istrstr), end; + std::vector extensions(sit, end); #if defined(WITH_LIGHTNING_COMPILER) - // FIXME_lmoriche: opencl-c.h defines 'cl_khr_depth_images', so - // remove it from the command line. Should we fix opencl-c.h? - auto found = std::find(extensions.begin(), extensions.end(), - "cl_khr_depth_images"); - if (found != extensions.end()) { - extensions.erase(found); - } + // FIXME_lmoriche: opencl-c.h defines 'cl_khr_depth_images', so + // remove it from the command line. Should we fix opencl-c.h? 
+ auto found = std::find(extensions.begin(), extensions.end(), "cl_khr_depth_images"); + if (found != extensions.end()) { + extensions.erase(found); + } - if (!extensions.empty()) { - std::ostringstream clext; + if (!extensions.empty()) { + std::ostringstream clext; - clext << " -Xclang -cl-ext=+"; - std::copy(extensions.begin(), extensions.end() - 1, - std::ostream_iterator(clext, ",+")); - clext << extensions.back(); + clext << " -Xclang -cl-ext=+"; + std::copy(extensions.begin(), extensions.end() - 1, + std::ostream_iterator(clext, ",+")); + clext << extensions.back(); - hsailOptions.append(clext.str()); - } -#else // !defined(WITH_LIGHTNING_COMPILER) - for (auto e : extensions) { - hsailOptions.append(" -D").append(e).append("=1"); - } -#endif // !defined(WITH_LIGHTNING_COMPILER) + hsailOptions.append(clext.str()); + } +#else // !defined(WITH_LIGHTNING_COMPILER) + for (auto e : extensions) { + hsailOptions.append(" -D").append(e).append("=1"); + } +#endif // !defined(WITH_LIGHTNING_COMPILER) - return hsailOptions; + return hsailOptions; } -bool -HSAILProgram::allocKernelTable() -{ - uint size = kernels().size() * sizeof(size_t); +bool HSAILProgram::allocKernelTable() { + uint size = kernels().size() * sizeof(size_t); - kernels_ = new pal::Memory(dev(), size); - // Initialize kernel table - if ((kernels_ == nullptr) || !kernels_->create(Resource::RemoteUSWC)) { - delete kernels_; - return false; + kernels_ = new pal::Memory(dev(), size); + // Initialize kernel table + if ((kernels_ == nullptr) || !kernels_->create(Resource::RemoteUSWC)) { + delete kernels_; + return false; + } else { + size_t* table = reinterpret_cast(kernels_->map(nullptr, pal::Resource::WriteOnly)); + for (auto& it : kernels()) { + HSAILKernel* kernel = static_cast(it.second); + table[kernel->index()] = static_cast(kernel->gpuAqlCode()); } - else { - size_t* table = reinterpret_cast( - kernels_->map(nullptr, pal::Resource::WriteOnly)); - for (auto& it : kernels()) { - HSAILKernel* kernel = 
static_cast(it.second); - table[kernel->index()] = static_cast(kernel->gpuAqlCode()); - } - kernels_->unmap(nullptr); - } - return true; + kernels_->unmap(nullptr); + } + return true; } -void -HSAILProgram::fillResListWithKernels( - std::vector& memList) const -{ - memList.push_back(&codeSegGpu()); +void HSAILProgram::fillResListWithKernels(std::vector& memList) const { + memList.push_back(&codeSegGpu()); } -const aclTargetInfo & -HSAILProgram::info(const char * str) -{ +const aclTargetInfo& HSAILProgram::info(const char* str) { #if defined(WITH_LIGHTNING_COMPILER) - assert(!"Should not reach here"); -#else // !defined(WITH_LIGHTNING_COMPILER) - acl_error err; - std::string arch = "hsail"; - if (dev().settings().use64BitPtr_) { - arch = "hsail64"; - } - info_ = aclGetTargetInfo(arch.c_str(), ( str && str[0] == '\0' ? - dev().hwInfo()->targetName_ : str ), &err); - if (err != ACL_SUCCESS) { - LogWarning("aclGetTargetInfo failed"); - } -#endif // !defined(WITH_LIGHTNING_COMPILER) - return info_; + assert(!"Should not reach here"); +#else // !defined(WITH_LIGHTNING_COMPILER) + acl_error err; + std::string arch = "hsail"; + if (dev().settings().use64BitPtr_) { + arch = "hsail64"; + } + info_ = aclGetTargetInfo(arch.c_str(), + (str && str[0] == '\0' ? 
dev().hwInfo()->targetName_ : str), &err); + if (err != ACL_SUCCESS) { + LogWarning("aclGetTargetInfo failed"); + } +#endif // !defined(WITH_LIGHTNING_COMPILER) + return info_; } -bool -HSAILProgram::saveBinaryAndSetType(type_t type) -{ +bool HSAILProgram::saveBinaryAndSetType(type_t type) { #if defined(WITH_LIGHTNING_COMPILER) - assert(!"Should not reach here"); -#else // !defined(WITH_LIGHTNING_COMPILER) - //Write binary to memory - if (rawBinary_ != nullptr) { - //Free memory containing rawBinary - aclFreeMem(binaryElf_, rawBinary_); - rawBinary_ = nullptr; - } - size_t size = 0; - if (aclWriteToMem(binaryElf_, &rawBinary_, &size) != ACL_SUCCESS) { - buildLog_ += "Failed to write binary to memory \n"; - return false; - } - setBinary(static_cast(rawBinary_), size); - //Set the type of binary - setType(type); -#endif // !defined(WITH_LIGHTNING_COMPILER) - return true; + assert(!"Should not reach here"); +#else // !defined(WITH_LIGHTNING_COMPILER) + // Write binary to memory + if (rawBinary_ != nullptr) { + // Free memory containing rawBinary + aclFreeMem(binaryElf_, rawBinary_); + rawBinary_ = nullptr; + } + size_t size = 0; + if (aclWriteToMem(binaryElf_, &rawBinary_, &size) != ACL_SUCCESS) { + buildLog_ += "Failed to write binary to memory \n"; + return false; + } + setBinary(static_cast(rawBinary_), size); + // Set the type of binary + setType(type); +#endif // !defined(WITH_LIGHTNING_COMPILER) + return true; } -hsa_isa_t PALHSALoaderContext::IsaFromName(const char *name) { - hsa_isa_t isa = {0}; - if (!strcmp(Gfx700, name)) { isa.handle = gfx700; return isa; } - if (!strcmp(Gfx701, name)) { isa.handle = gfx701; return isa; } - if (!strcmp(Gfx800, name)) { isa.handle = gfx800; return isa; } - if (!strcmp(Gfx801, name)) { isa.handle = gfx801; return isa; } - if (!strcmp(Gfx804, name)) { isa.handle = gfx804; return isa; } - if (!strcmp(Gfx810, name)) { isa.handle = gfx810; return isa; } - if (!strcmp(Gfx900, name)) { isa.handle = gfx900; return isa; } - if 
(!strcmp(Gfx901, name)) { isa.handle = gfx901; return isa; } +hsa_isa_t PALHSALoaderContext::IsaFromName(const char* name) { + hsa_isa_t isa = {0}; + if (!strcmp(Gfx700, name)) { + isa.handle = gfx700; return isa; + } + if (!strcmp(Gfx701, name)) { + isa.handle = gfx701; + return isa; + } + if (!strcmp(Gfx800, name)) { + isa.handle = gfx800; + return isa; + } + if (!strcmp(Gfx801, name)) { + isa.handle = gfx801; + return isa; + } + if (!strcmp(Gfx804, name)) { + isa.handle = gfx804; + return isa; + } + if (!strcmp(Gfx810, name)) { + isa.handle = gfx810; + return isa; + } + if (!strcmp(Gfx900, name)) { + isa.handle = gfx900; + return isa; + } + if (!strcmp(Gfx901, name)) { + isa.handle = gfx901; + return isa; + } + return isa; } bool PALHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) { - switch (program_->dev().hwInfo()->gfxipVersion_) { + switch (program_->dev().hwInfo()->gfxipVersion_) { default: - LogError("Unsupported gfxip version"); - return false; - case gfx700: case gfx701: case gfx702: - // gfx701 only differs from gfx700 by faster fp operations and can be loaded on either device. - return isa.handle == gfx700 || isa.handle == gfx701; + LogError("Unsupported gfxip version"); + return false; + case gfx700: + case gfx701: + case gfx702: + // gfx701 only differs from gfx700 by faster fp operations and can be loaded on either device. + return isa.handle == gfx700 || isa.handle == gfx701; case gfx800: - return isa.handle == gfx800; + return isa.handle == gfx800; case gfx801: - return isa.handle == gfx801; + return isa.handle == gfx801; case gfx804: - // gfx800 ISA has only sgrps limited and can be loaded. - // gfx801 ISA has XNACK limitations and can be loaded. - return isa.handle == gfx800 || isa.handle == gfx801 || isa.handle == gfx804; + // gfx800 ISA has only sgrps limited and can be loaded. + // gfx801 ISA has XNACK limitations and can be loaded. 
+ return isa.handle == gfx800 || isa.handle == gfx801 || isa.handle == gfx804; case gfx810: - return isa.handle == gfx810; - case gfx900: case gfx901: - return isa.handle == gfx900 || isa.handle == gfx901; - } + return isa.handle == gfx810; + case gfx900: + case gfx901: + return isa.handle == gfx900 || isa.handle == gfx901; + } } -void* PALHSALoaderContext::SegmentAlloc( - amdgpu_hsa_elf_segment_t segment, - hsa_agent_t agent, size_t size, size_t align, bool zero) -{ - assert(size); - assert(align); - if (program_->isNull()) { - void* ptr = amd::Os::alignedMalloc(size, align); - if (zero) { - memset(ptr, 0, size); - } - return ptr; +void* PALHSALoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, + size_t size, size_t align, bool zero) { + assert(size); + assert(align); + if (program_->isNull()) { + void* ptr = amd::Os::alignedMalloc(size, align); + if (zero) { + memset(ptr, 0, size); } - Segment* seg = new Segment(); - if (seg != nullptr && !seg->alloc(*program_, segment, size, align, zero)) { - return nullptr; - } - return seg; + return ptr; + } + Segment* seg = new Segment(); + if (seg != nullptr && !seg->alloc(*program_, segment, size, align, zero)) { + return nullptr; + } + return seg; } -bool PALHSALoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment, - hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size) -{ - if (program_->isNull()) { - amd::Os::fastMemcpy(reinterpret_cast
(dst) + offset, src, size); - return true; - } - Segment* s = reinterpret_cast(dst); - s->copy(offset, src, size); +bool PALHSALoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, + void* dst, size_t offset, const void* src, size_t size) { + if (program_->isNull()) { + amd::Os::fastMemcpy(reinterpret_cast
(dst) + offset, src, size); return true; + } + Segment* s = reinterpret_cast(dst); + s->copy(offset, src, size); + return true; } -void PALHSALoaderContext::SegmentFree( - amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size) -{ - if (program_->isNull()) { - amd::Os::alignedFree(seg); - } - else { - Segment* s = reinterpret_cast(seg); - delete s ; - } -} - -void* PALHSALoaderContext::SegmentAddress( - amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset) -{ - assert(seg); - if (program_->isNull()) { - return (reinterpret_cast
(seg) + offset); - } +void PALHSALoaderContext::SegmentFree(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, + void* seg, size_t size) { + if (program_->isNull()) { + amd::Os::alignedFree(seg); + } else { Segment* s = reinterpret_cast(seg); - return reinterpret_cast(s->gpuAddress(offset)); + delete s; + } } -void* PALHSALoaderContext::SegmentHostAddress( - amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset) -{ - assert(seg); - if (program_->isNull()) { - return (reinterpret_cast
(seg) + offset); - } - Segment* s = reinterpret_cast(seg); - return s ->cpuAddress(offset); +void* PALHSALoaderContext::SegmentAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, + void* seg, size_t offset) { + assert(seg); + if (program_->isNull()) { + return (reinterpret_cast
(seg) + offset); + } + Segment* s = reinterpret_cast(seg); + return reinterpret_cast(s->gpuAddress(offset)); } -bool PALHSALoaderContext::SegmentFreeze( - amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size) -{ - if (program_->isNull()) { - return true; - } - Segment* s = reinterpret_cast(seg); - return s->freeze((segment == AMDGPU_HSA_SEGMENT_CODE_AGENT) ? false : true); +void* PALHSALoaderContext::SegmentHostAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, + void* seg, size_t offset) { + assert(seg); + if (program_->isNull()) { + return (reinterpret_cast
(seg) + offset); + } + Segment* s = reinterpret_cast(seg); + return s->cpuAddress(offset); +} + +bool PALHSALoaderContext::SegmentFreeze(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, + void* seg, size_t size) { + if (program_->isNull()) { + return true; + } + Segment* s = reinterpret_cast(seg); + return s->freeze((segment == AMDGPU_HSA_SEGMENT_CODE_AGENT) ? false : true); } hsa_status_t PALHSALoaderContext::SamplerCreate( - hsa_agent_t agent, - const hsa_ext_sampler_descriptor_t *sampler_descriptor, - hsa_ext_sampler_t *sampler_handle) -{ - if (!agent.handle) { - return HSA_STATUS_ERROR_INVALID_AGENT; - } - if (!sampler_descriptor || !sampler_handle) { - return HSA_STATUS_ERROR_INVALID_ARGUMENT; - } - if (program_->isNull()) { - // Offline compilation. Provide a fake handle to avoid an assert - sampler_handle->handle = 1; - return HSA_STATUS_SUCCESS; - } - uint32_t state = 0; - switch (sampler_descriptor->coordinate_mode) { - case HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED: state = amd::Sampler::StateNormalizedCoordsFalse; break; - case HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED: state = amd::Sampler::StateNormalizedCoordsTrue; break; - default: - assert(false); - return HSA_STATUS_ERROR_INVALID_ARGUMENT; - } - switch (sampler_descriptor->filter_mode) { - case HSA_EXT_SAMPLER_FILTER_MODE_NEAREST: state |= amd::Sampler::StateFilterNearest; break; - case HSA_EXT_SAMPLER_FILTER_MODE_LINEAR: state |= amd::Sampler::StateFilterLinear; break; - default: - assert(false); - return HSA_STATUS_ERROR_INVALID_ARGUMENT; - - } - switch (sampler_descriptor->address_mode) { - case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE: state |= amd::Sampler::StateAddressClampToEdge; break; - case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER: state |= amd::Sampler::StateAddressClamp; break; - case HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT: state |= amd::Sampler::StateAddressRepeat; break; - case HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT: state |= 
amd::Sampler::StateAddressMirroredRepeat; break; - case HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED: state |= amd::Sampler::StateAddressNone; break; - default: - assert(false); - return HSA_STATUS_ERROR_INVALID_ARGUMENT; - } - pal::Sampler* sampler = new pal::Sampler(program_->dev()); - if (!sampler || !sampler->create(state)) { - delete sampler; - return HSA_STATUS_ERROR; - } - program_->addSampler(sampler); - sampler_handle->handle = sampler->hwSrd(); + hsa_agent_t agent, const hsa_ext_sampler_descriptor_t* sampler_descriptor, + hsa_ext_sampler_t* sampler_handle) { + if (!agent.handle) { + return HSA_STATUS_ERROR_INVALID_AGENT; + } + if (!sampler_descriptor || !sampler_handle) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + if (program_->isNull()) { + // Offline compilation. Provide a fake handle to avoid an assert + sampler_handle->handle = 1; return HSA_STATUS_SUCCESS; + } + uint32_t state = 0; + switch (sampler_descriptor->coordinate_mode) { + case HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED: + state = amd::Sampler::StateNormalizedCoordsFalse; + break; + case HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED: + state = amd::Sampler::StateNormalizedCoordsTrue; + break; + default: + assert(false); + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + switch (sampler_descriptor->filter_mode) { + case HSA_EXT_SAMPLER_FILTER_MODE_NEAREST: + state |= amd::Sampler::StateFilterNearest; + break; + case HSA_EXT_SAMPLER_FILTER_MODE_LINEAR: + state |= amd::Sampler::StateFilterLinear; + break; + default: + assert(false); + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + switch (sampler_descriptor->address_mode) { + case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE: + state |= amd::Sampler::StateAddressClampToEdge; + break; + case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER: + state |= amd::Sampler::StateAddressClamp; + break; + case HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT: + state |= amd::Sampler::StateAddressRepeat; + break; + case 
HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT: + state |= amd::Sampler::StateAddressMirroredRepeat; + break; + case HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED: + state |= amd::Sampler::StateAddressNone; + break; + default: + assert(false); + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + pal::Sampler* sampler = new pal::Sampler(program_->dev()); + if (!sampler || !sampler->create(state)) { + delete sampler; + return HSA_STATUS_ERROR; + } + program_->addSampler(sampler); + sampler_handle->handle = sampler->hwSrd(); + return HSA_STATUS_SUCCESS; } -hsa_status_t PALHSALoaderContext::SamplerDestroy( - hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) -{ - if (!agent.handle) { - return HSA_STATUS_ERROR_INVALID_AGENT; - } - if (!sampler_handle.handle) { - return HSA_STATUS_ERROR_INVALID_ARGUMENT; - } - return HSA_STATUS_SUCCESS; +hsa_status_t PALHSALoaderContext::SamplerDestroy(hsa_agent_t agent, + hsa_ext_sampler_t sampler_handle) { + if (!agent.handle) { + return HSA_STATUS_ERROR_INVALID_AGENT; + } + if (!sampler_handle.handle) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + return HSA_STATUS_SUCCESS; } #if defined(WITH_LIGHTNING_COMPILER) -static hsa_status_t -GetKernelNamesCallback( - hsa_executable_t hExec, - hsa_executable_symbol_t hSymbol, - void *data) -{ - auto symbol = Symbol::Object(hSymbol); - auto symbolNameList = reinterpret_cast*>(data); +static hsa_status_t GetKernelNamesCallback(hsa_executable_t hExec, hsa_executable_symbol_t hSymbol, + void* data) { + auto symbol = Symbol::Object(hSymbol); + auto symbolNameList = reinterpret_cast*>(data); - hsa_symbol_kind_t type; - if (!symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &type)) { - return HSA_STATUS_ERROR; + hsa_symbol_kind_t type; + if (!symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &type)) { + return HSA_STATUS_ERROR; + } + + if (type == HSA_SYMBOL_KIND_KERNEL) { + uint32_t length; + if (!symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &length)) { + return HSA_STATUS_ERROR; } - if 
(type == HSA_SYMBOL_KIND_KERNEL) { - uint32_t length; - if (!symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &length)) { - return HSA_STATUS_ERROR; - } - - char* name = (char*) alloca(length+1); - if (!symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_NAME, name)) { - return HSA_STATUS_ERROR; - } - name[length] = '\0'; - - symbolNameList->push_back(std::string(name)); + char* name = (char*)alloca(length + 1); + if (!symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_NAME, name)) { + return HSA_STATUS_ERROR; } - return HSA_STATUS_SUCCESS; + name[length] = '\0'; + + symbolNameList->push_back(std::string(name)); + } + return HSA_STATUS_SUCCESS; } -aclType -LightningProgram::getCompilationStagesFromBinary( - std::vector& completeStages, - bool& needOptionsCheck - ) -{ - completeStages.clear(); - aclType from = ACL_TYPE_DEFAULT; - needOptionsCheck = true; +aclType LightningProgram::getCompilationStagesFromBinary(std::vector& completeStages, + bool& needOptionsCheck) { + completeStages.clear(); + aclType from = ACL_TYPE_DEFAULT; + needOptionsCheck = true; - bool containsLlvmirText = (type() == TYPE_COMPILED); - bool containsShaderIsa = (type() == TYPE_EXECUTABLE); - bool containsOpts = !(compileOptions_.empty() && linkOptions_.empty()); + bool containsLlvmirText = (type() == TYPE_COMPILED); + bool containsShaderIsa = (type() == TYPE_EXECUTABLE); + bool containsOpts = !(compileOptions_.empty() && linkOptions_.empty()); - if (containsLlvmirText && containsOpts) { - completeStages.push_back(from); - from = ACL_TYPE_LLVMIR_BINARY; - } - if (containsShaderIsa) { - completeStages.push_back(from); - from = ACL_TYPE_ISA; - } - std::string sCurOptions = compileOptions_ + linkOptions_; - amd::option::Options curOptions; - if (!amd::option::parseAllOptions(sCurOptions, curOptions)) { - buildLog_ += curOptions.optionsLog(); - LogError("Parsing compile options failed."); - return ACL_TYPE_DEFAULT; - } - switch (from) { + if (containsLlvmirText && containsOpts) { + 
completeStages.push_back(from); + from = ACL_TYPE_LLVMIR_BINARY; + } + if (containsShaderIsa) { + completeStages.push_back(from); + from = ACL_TYPE_ISA; + } + std::string sCurOptions = compileOptions_ + linkOptions_; + amd::option::Options curOptions; + if (!amd::option::parseAllOptions(sCurOptions, curOptions)) { + buildLog_ += curOptions.optionsLog(); + LogError("Parsing compile options failed."); + return ACL_TYPE_DEFAULT; + } + switch (from) { case ACL_TYPE_ISA: - // do not check options, if LLVMIR is absent or might be absent or options are absent - if (curOptions.oVariables->BinLLVMIR || !containsLlvmirText || !containsOpts) { - needOptionsCheck = false; - } - break; - // recompilation might be needed + // do not check options, if LLVMIR is absent or might be absent or options are absent + if (curOptions.oVariables->BinLLVMIR || !containsLlvmirText || !containsOpts) { + needOptionsCheck = false; + } + break; + // recompilation might be needed case ACL_TYPE_LLVMIR_BINARY: case ACL_TYPE_DEFAULT: default: + break; + } + return from; +} + + +aclType LightningProgram::getNextCompilationStageFromBinary(amd::option::Options* options) { + aclType continueCompileFrom = ACL_TYPE_DEFAULT; + binary_t binary = this->binary(); + + // If the binary already exists + if ((binary.first != NULL) && (binary.second > 0)) { + void* mem = const_cast(binary.first); + + // save the current options + std::string sCurCompileOptions = compileOptions_; + std::string sCurLinkOptions = linkOptions_; + std::string sCurOptions = compileOptions_ + linkOptions_; + + // Saving binary in the interface class, + // which also load compile & link options from binary + setBinary(static_cast(mem), binary.second); + + // Calculate the next stage to compile from, based on sections in binaryElf_; + // No any validity checks here + std::vector completeStages; + bool needOptionsCheck = true; + continueCompileFrom = getCompilationStagesFromBinary(completeStages, needOptionsCheck); + if (!options || 
!needOptionsCheck) { + return continueCompileFrom; + } + bool recompile = false; + //! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT? + switch (continueCompileFrom) { + case ACL_TYPE_ISA: { + // Compare options loaded from binary with current ones, recompile if differ; + // If compile options are absent in binary, do not compare and recompile + if (compileOptions_.empty()) break; + + std::string sBinOptions = compileOptions_ + linkOptions_; + + compileOptions_ = sCurCompileOptions; + linkOptions_ = sCurLinkOptions; + + amd::option::Options curOptions, binOptions; + if (!amd::option::parseAllOptions(sBinOptions, binOptions)) { + buildLog_ += binOptions.optionsLog(); + LogError("Parsing compile options from binary failed."); + return ACL_TYPE_DEFAULT; + } + if (!amd::option::parseAllOptions(sCurOptions, curOptions)) { + buildLog_ += curOptions.optionsLog(); + LogError("Parsing compile options failed."); + return ACL_TYPE_DEFAULT; + } + if (!curOptions.equals(binOptions)) { + recompile = true; + } + break; + } + default: break; } - return from; + if (recompile) { + while (!completeStages.empty()) { + continueCompileFrom = completeStages.back(); + if (continueCompileFrom == ACL_TYPE_LLVMIR_BINARY || + continueCompileFrom == ACL_TYPE_DEFAULT) { + break; + } + completeStages.pop_back(); + } + } + } + return continueCompileFrom; } - -aclType -LightningProgram::getNextCompilationStageFromBinary(amd::option::Options* options) -{ - aclType continueCompileFrom = ACL_TYPE_DEFAULT; - binary_t binary = this->binary(); - - // If the binary already exists - if ((binary.first != NULL) && (binary.second > 0)) { - void *mem = const_cast(binary.first); - - // save the current options - std::string sCurCompileOptions = compileOptions_; - std::string sCurLinkOptions = linkOptions_; - std::string sCurOptions = compileOptions_ + linkOptions_; - - // Saving binary in the interface class, - // which also load compile & link options from binary - 
setBinary(static_cast(mem), binary.second); - - // Calculate the next stage to compile from, based on sections in binaryElf_; - // No any validity checks here - std::vector completeStages; - bool needOptionsCheck = true; - continueCompileFrom = getCompilationStagesFromBinary(completeStages, needOptionsCheck); - if (!options || !needOptionsCheck) { - return continueCompileFrom; - } - bool recompile = false; - //! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT? - switch (continueCompileFrom) { - case ACL_TYPE_ISA: { - // Compare options loaded from binary with current ones, recompile if differ; - // If compile options are absent in binary, do not compare and recompile - if (compileOptions_.empty()) - break; - - std::string sBinOptions = compileOptions_ + linkOptions_; - - compileOptions_ = sCurCompileOptions; - linkOptions_ = sCurLinkOptions; - - amd::option::Options curOptions, binOptions; - if (!amd::option::parseAllOptions(sBinOptions, binOptions)) { - buildLog_ += binOptions.optionsLog(); - LogError("Parsing compile options from binary failed."); - return ACL_TYPE_DEFAULT; - } - if (!amd::option::parseAllOptions(sCurOptions, curOptions)) { - buildLog_ += curOptions.optionsLog(); - LogError("Parsing compile options failed."); - return ACL_TYPE_DEFAULT; - } - if (!curOptions.equals(binOptions)) { - recompile = true; - } - break; - } - default: - break; - } - if (recompile) { - while (!completeStages.empty()) { - continueCompileFrom = completeStages.back(); - if (continueCompileFrom == ACL_TYPE_LLVMIR_BINARY || - continueCompileFrom == ACL_TYPE_DEFAULT) { - break; - } - completeStages.pop_back(); - } - } - } - return continueCompileFrom; +bool LightningProgram::createBinary(amd::option::Options* options) { + if (!clBinary()->createElfBinary(options->oVariables->BinEncrypt, type())) { + LogError("Failed to create ELF binary image!"); + return false; + } + return true; } -bool -LightningProgram::createBinary(amd::option::Options *options) -{ - 
if (!clBinary()->createElfBinary(options->oVariables->BinEncrypt, type())) { - LogError("Failed to create ELF binary image!"); +bool LightningProgram::linkImpl(const std::vector& inputPrograms, + amd::option::Options* options, bool createLibrary) { + using namespace amd::opencl_driver; + std::auto_ptr C(newCompilerInstance()); + + std::vector inputs; + for (auto program : (const std::vector&)inputPrograms) { + if (program->llvmBinary_.empty()) { + if (program->clBinary() == NULL) { + buildLog_ += "Internal error: Input program not compiled!\n"; return false; - } - return true; -} + } -bool -LightningProgram::linkImpl( - const std::vector &inputPrograms, - amd::option::Options *options, - bool createLibrary) -{ - using namespace amd::opencl_driver; - std::auto_ptr C(newCompilerInstance()); - - std::vector inputs; - for (auto program : (const std::vector&)inputPrograms) { - if (program->llvmBinary_.empty()) { - if (program->clBinary() == NULL) { - buildLog_ += "Internal error: Input program not compiled!\n"; - return false; - } - - // We are using CL binary directly. - // Setup elfIn() and try to load llvmIR from binary - // This elfIn() will be released at the end of build by finiBuild(). 
- if (!program->clBinary()->setElfIn(ELFCLASS64)) { - buildLog_ += "Internal error: Setting input OCL binary failed!\n"; - return false; - } - if (!program->clBinary()->loadLlvmBinary(program->llvmBinary_, - program->elfSectionType_)) { - buildLog_ += "Internal error: Failed loading compiled binary!\n"; - return false; - } - } - - if (program->elfSectionType_ != amd::OclElf::LLVMIR) { - buildLog_ += "Error: Input binary format is not supported\n."; - return false; - } - - Data* input = C->NewBufferReference(DT_LLVM_BC, - (const char*) program->llvmBinary_.data(), - program->llvmBinary_.size()); - - if (!input) { - buildLog_ += "Internal error: Failed to open the compiled programs.\n"; - return false; - } - - // release elfIn() for the program - program->clBinary()->resetElfIn(); - - inputs.push_back(input); - } - - // open the linked output - amd::opencl_driver::Buffer* output = C->NewBuffer(DT_LLVM_BC); - - if (!output) { - buildLog_ += "Error: Failed to open the linked program.\n"; + // We are using CL binary directly. + // Setup elfIn() and try to load llvmIR from binary + // This elfIn() will be released at the end of build by finiBuild(). + if (!program->clBinary()->setElfIn(ELFCLASS64)) { + buildLog_ += "Internal error: Setting input OCL binary failed!\n"; return false; - } - - std::vector linkOptions; - - // NOTE: The params is also used to identy cached code object. This parameter - // should not contain any dyanamically generated filename. 
- bool ret = dev().cacheCompilation()->linkLLVMBitcode(C.get(), inputs, output, linkOptions, buildLog_); - buildLog_ += C->Output(); - if (!ret) { - buildLog_ += "Error: Linking bitcode failed: linking source & IR libraries.\n"; + } + if (!program->clBinary()->loadLlvmBinary(program->llvmBinary_, program->elfSectionType_)) { + buildLog_ += "Internal error: Failed loading compiled binary!\n"; return false; + } } - llvmBinary_.assign(output->Buf().data(), output->Size()); - elfSectionType_ = amd::OclElf::LLVMIR; - - - if (clBinary()->saveLLVMIR()) { - clBinary()->elfOut()->addSection( - amd::OclElf::LLVMIR, llvmBinary_.data(), llvmBinary_.size(), false); - // store the original link options - clBinary()->storeLinkOptions(linkOptions_); - // store the original compile options - clBinary()->storeCompileOptions(compileOptions_); + if (program->elfSectionType_ != amd::OclElf::LLVMIR) { + buildLog_ += "Error: Input binary format is not supported\n."; + return false; } - // skip the rest if we are building an opencl library - if (createLibrary) { - setType(TYPE_LIBRARY); - if (!createBinary(options)) { - buildLog_ += "Internal error: creating OpenCL binary failed\n"; - return false; - } - return true; - } - - return linkImpl(options); -} - -bool -LightningProgram::linkImpl(amd::option::Options *options) -{ - using namespace amd::opencl_driver; - internal_ = (compileOptions_.find("-cl-internal-kernel") != - std::string::npos) ? true : false; - - aclType continueCompileFrom = llvmBinary_.empty() - ? 
getNextCompilationStageFromBinary(options) - : ACL_TYPE_LLVMIR_BINARY; - - if (continueCompileFrom == ACL_TYPE_ISA) { - binary_t isa = binary(); - if ((isa.first != NULL) && (isa.second > 0)) { - return setKernels(options, (void*) isa.first, isa.second ); - } - else { - buildLog_ += "Error: code object is empty \n" ; - return false; - } - return true; - } - if (continueCompileFrom != ACL_TYPE_LLVMIR_BINARY) { - buildLog_ += "Error while Codegen phase: the binary is incomplete \n" ; - return false; - } - - std::auto_ptr C(newCompilerInstance()); - // call LinkLLVMBitcode - std::vector inputs; - - // open the input IR source - Data* input = C->NewBufferReference( - DT_LLVM_BC, llvmBinary_.data(), llvmBinary_.size()); + Data* input = C->NewBufferReference(DT_LLVM_BC, (const char*)program->llvmBinary_.data(), + program->llvmBinary_.size()); if (!input) { - buildLog_ += "Error: Failed to open the compiled program.\n"; - return false; + buildLog_ += "Internal error: Failed to open the compiled programs.\n"; + return false; } - inputs.push_back(input); //< must be the first input + // release elfIn() for the program + program->clBinary()->resetElfIn(); - // open the bitcode libraries - Data* opencl_bc = C->NewBufferReference(DT_LLVM_BC, - (const char*) opencl_amdgcn, opencl_amdgcn_size); - Data* ocml_bc = C->NewBufferReference(DT_LLVM_BC, - (const char*) ocml_amdgcn, ocml_amdgcn_size); - Data* ockl_bc = C->NewBufferReference(DT_LLVM_BC, - (const char*) ockl_amdgcn, ockl_amdgcn_size); - Data* irif_bc = C->NewBufferReference(DT_LLVM_BC, - (const char*) irif_amdgcn, irif_amdgcn_size); + inputs.push_back(input); + } - if (!opencl_bc || !ocml_bc || !ockl_bc || !irif_bc) { - buildLog_ += "Error: Failed to open the bitcode library.\n"; - return false; + // open the linked output + amd::opencl_driver::Buffer* output = C->NewBuffer(DT_LLVM_BC); + + if (!output) { + buildLog_ += "Error: Failed to open the linked program.\n"; + return false; + } + + std::vector linkOptions; + + // 
NOTE: The params is also used to identy cached code object. This parameter + // should not contain any dyanamically generated filename. + bool ret = + dev().cacheCompilation()->linkLLVMBitcode(C.get(), inputs, output, linkOptions, buildLog_); + buildLog_ += C->Output(); + if (!ret) { + buildLog_ += "Error: Linking bitcode failed: linking source & IR libraries.\n"; + return false; + } + + llvmBinary_.assign(output->Buf().data(), output->Size()); + elfSectionType_ = amd::OclElf::LLVMIR; + + + if (clBinary()->saveLLVMIR()) { + clBinary()->elfOut()->addSection(amd::OclElf::LLVMIR, llvmBinary_.data(), llvmBinary_.size(), + false); + // store the original link options + clBinary()->storeLinkOptions(linkOptions_); + // store the original compile options + clBinary()->storeCompileOptions(compileOptions_); + } + + // skip the rest if we are building an opencl library + if (createLibrary) { + setType(TYPE_LIBRARY); + if (!createBinary(options)) { + buildLog_ += "Internal error: creating OpenCL binary failed\n"; + return false; } - - inputs.push_back(opencl_bc); // depends on oclm & ockl - inputs.push_back(ockl_bc); // depends on irif - inputs.push_back(ocml_bc); // depends on irif - inputs.push_back(irif_bc); - - // open the control functions - auto isa_version = get_oclc_isa_version(dev().hwInfo()->gfxipVersion_); - if (!isa_version.first) { - buildLog_ += "Error: Linking for this device is not supported\n"; - return false; - } - - Data* isa_version_bc = C->NewBufferReference(DT_LLVM_BC, - (const char*) isa_version.first, isa_version.second); - - if (!isa_version_bc) { - buildLog_ += "Error: Failed to open the control functions.\n"; - return false; - } - - inputs.push_back(isa_version_bc); - - auto correctly_rounded_sqrt = get_oclc_correctly_rounded_sqrt( - options->oVariables->FP32RoundDivideSqrt); - Data* correctly_rounded_sqrt_bc = C->NewBufferReference(DT_LLVM_BC, - correctly_rounded_sqrt.first, correctly_rounded_sqrt.second); - - auto daz_opt = 
get_oclc_daz_opt(dev().hwInfo()->gfxipVersion_ < 900 - || options->oVariables->DenormsAreZero); - Data* daz_opt_bc = C->NewBufferReference(DT_LLVM_BC, - daz_opt.first, daz_opt.second); - - auto finite_only = get_oclc_finite_only(options->oVariables->FiniteMathOnly - || options->oVariables->FastRelaxedMath); - Data* finite_only_bc = C->NewBufferReference(DT_LLVM_BC, - finite_only.first, finite_only.second); - - auto unsafe_math = get_oclc_unsafe_math(options->oVariables->UnsafeMathOpt - || options->oVariables->FastRelaxedMath); - Data* unsafe_math_bc = C->NewBufferReference(DT_LLVM_BC, - unsafe_math.first, unsafe_math.second); - - if (!correctly_rounded_sqrt_bc || !daz_opt_bc - || !finite_only_bc || !unsafe_math_bc) { - buildLog_ += "Error: Failed to open the control functions.\n"; - return false; - } - - - if (!correctly_rounded_sqrt_bc || !daz_opt_bc - || !finite_only_bc || !unsafe_math_bc) { - buildLog_ += "Error: Failed to open the control functions.\n"; - return false; - } - - inputs.push_back(correctly_rounded_sqrt_bc); - inputs.push_back(daz_opt_bc); - inputs.push_back(finite_only_bc); - inputs.push_back(unsafe_math_bc); - - // open the linked output - std::vector linkOptions; - amd::opencl_driver::Buffer* linked_bc = C->NewBuffer(DT_LLVM_BC); - - if (!linked_bc) { - buildLog_ += "Error: Failed to open the linked program.\n"; - return false; - } - - // NOTE: The linkOptions parameter is also used to identy cached code object. This parameter - // should not contain any dyanamically generated filename. 
- bool ret = dev().cacheCompilation()->linkLLVMBitcode(C.get(), inputs, linked_bc, linkOptions, buildLog_); - buildLog_ += C->Output(); - if (!ret) { - buildLog_ += "Error: Linking bitcode failed: linking source & IR libraries.\n"; - return false; - } - - if (options->isDumpFlagSet(amd::option::DUMP_BC_LINKED)) { - std::ofstream f(options->getDumpFileName("_linked.bc").c_str(), - std::ios::binary | std::ios::trunc); - if(f.is_open()) { - f.write(linked_bc->Buf().data(), linked_bc->Size()); - f.close(); - } else { - buildLog_ += - "Warning: opening the file to dump the linked IR failed.\n"; - } - } - - inputs.clear(); - inputs.push_back(linked_bc); - - amd::opencl_driver::Buffer* out_exec = C->NewBuffer(DT_EXECUTABLE); - if (!out_exec) { - buildLog_ += "Error: Failed to create the linked executable.\n"; - return false; - } - - std::string codegenOptions(options->llvmOptions); - - // Set the machine target - std::ostringstream mCPU; - mCPU << " -mcpu=gfx" << dev().hwInfo()->gfxipVersion_; - codegenOptions.append(mCPU.str()); - - // Set the -O# - std::ostringstream optLevel; - optLevel << "-O" << options->oVariables->OptLevel; - codegenOptions.append(" ").append(optLevel.str()); - - // Pass clang options - std::ostringstream ostrstr; - std::copy(options->clangOptions.begin(), options->clangOptions.end(), - std::ostream_iterator(ostrstr, " ")); - codegenOptions.append(" ").append(ostrstr.str()); - - // Set whole program mode - codegenOptions.append(" -mllvm -amdgpu-internalize-symbols -mllvm -amdgpu-early-inline-all"); - - // Tokenize the options string into a vector of strings - std::istringstream strstr(codegenOptions); - std::istream_iterator sit(strstr), end; - std::vector params(sit, end); - - // NOTE: The params is also used to identy cached code object. This parameter - // should not contain any dyanamically generated filename. 
- ret = dev().cacheCompilation()->compileAndLinkExecutable(C.get(), inputs, out_exec, params, buildLog_); - buildLog_ += C->Output(); - if (!ret) { - buildLog_ += "Error: Creating the executable failed: Compiling LLVM IRs to exeutable\n"; - return false; - } - - if (options->isDumpFlagSet(amd::option::DUMP_O)) { - std::ofstream f(options->getDumpFileName(".so").c_str(), - std::ios::binary | std::ios::trunc); - if(f.is_open()) { - f.write(out_exec->Buf().data(), out_exec->Size()); - f.close(); - } else { - buildLog_ += - "Warning: opening the file to dump the code object failed.\n"; - } - } - - if (options->isDumpFlagSet(amd::option::DUMP_ISA)) { - std::string name = options->getDumpFileName(".s"); - File *dump = C->NewFile(DT_INTERNAL, name); - if (!C->DumpExecutableAsText(out_exec, dump)) { - buildLog_ += "Warning: failed to dump code object.\n"; - } - } - - return setKernels(options, out_exec->Buf().data(), out_exec->Size()); -} - -bool -LightningProgram::setKernels( - amd::option::Options *options, - void* binary, - size_t size - ) -{ - hsa_agent_t agent; - agent.handle = 1; - - executable_ = loader_->CreateExecutable(HSA_PROFILE_FULL, NULL); - if (executable_ == nullptr) { - buildLog_ += "Error: Executable for AMD HSA Code Object isn't created.\n"; - return false; - } - - hsa_code_object_t code_object; - code_object.handle = reinterpret_cast(binary); - - hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr); - if (status != HSA_STATUS_SUCCESS) { - buildLog_ += "Error: AMD HSA Code Object loading failed.\n"; - return false; - } - - status = executable_->Freeze(nullptr); - if (status != HSA_STATUS_SUCCESS) { - buildLog_ += "Error: Freezing the executable failed: "; - return false; - } - - size_t progvarsTotalSize = 0; - - // Begin the Elf image from memory - Elf* e = elf_memory((char*) binary, size, NULL); - if (elf_kind(e) != ELF_K_ELF) { - buildLog_ += "Error while reading the ELF program binary\n"; - return false; - } - - size_t 
numpHdrs; - if (elf_getphdrnum(e, &numpHdrs) != 0) { - buildLog_ += "Error while reading the ELF program binary\n"; - return false; - } - - for (size_t i = 0; i < numpHdrs; ++i) { - GElf_Phdr pHdr; - if (gelf_getphdr(e, i, &pHdr) != &pHdr) { - continue; - } - // Look for the runtime metadata note - if (pHdr.p_type == PT_NOTE && pHdr.p_align >= sizeof(int)) { - // Iterate over the notes in this segment - address ptr = (address) binary + pHdr.p_offset; - address segmentEnd = ptr + pHdr.p_filesz; - - while (ptr < segmentEnd) { - Elf_Note* note = (Elf_Note*) ptr; - address name = (address) ¬e[1]; - address desc = name + amd::alignUp(note->n_namesz, sizeof(int)); - - //! @todo: Use constants and enums defined in AMDGPUPTNote.h. - //! In order to switch to using constants and enums defined in - //! AMDGPUPTNote.h, we need to clean up internal header files. - if (note->n_type == 7 || note->n_type == 8) { - buildLog_ += "Error: object code with old metadata is not " \ - "supported\n"; - return false; - } - else if (note->n_type == 10 /*AMDGPU::ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_METADATA*/ - && note->n_namesz == sizeof "AMD" - && !memcmp(name, "AMD", note->n_namesz)) { - std::string metadataStr((const char *) desc, (size_t) note->n_descsz); - metadata_ = new CodeObjectMD(); - if (CodeObjectMD::fromYamlString(metadataStr, *metadata_)) { - buildLog_ += "Error: failed to process metadata\n"; - return false; - } - // We've found and loaded the runtime metadata, exit the - // note record loop now. 
- break; - } - ptr += sizeof(*note) - + amd::alignUp(note->n_namesz, sizeof(int)) - + amd::alignUp(note->n_descsz, sizeof(int)); - } - } - // Accumulate the size of R & !X loadable segments - else if (pHdr.p_type == PT_LOAD - && (pHdr.p_flags & PF_R) && !(pHdr.p_flags & PF_X)) { - progvarsTotalSize += pHdr.p_memsz; - } - } - - elf_end(e); - - if (!metadata_) { - buildLog_ += "Error: runtime metadata section not present in " \ - "ELF program binary\n"; - return false; - } - - // note: The global variable size is updated in the context loader - //setGlobalVariableTotalSize(progvarsTotalSize); - - // Get the list of kernels - std::vector kernelNameList; - status = executable_->IterateSymbols(GetKernelNamesCallback, - (void *) &kernelNameList ); - if (status != HSA_STATUS_SUCCESS) { - buildLog_ += "Error: Failed to get kernel names\n"; - return false; - } - - for (auto &kernelName : kernelNameList) { - auto kernel = new LightningKernel( - kernelName, this, options->origOptionStr + hsailOptions(options)); - - kernels()[kernelName] = kernel; - - auto symbol = executable_->GetSymbol(kernelName.c_str(), &agent); - if (!symbol) { - buildLog_ += "Error: Getting kernel symbol '" + kernelName - + "' from AMD HSA Code Object failed. " \ - "Kernel initialization failed.\n"; - return false; - } - if (!kernel->init(symbol)) { - buildLog_ += "Error: Kernel '" + kernelName - + "' initialization failed.\n"; - return false; - } - buildLog_ += kernel->buildLog(); - - kernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize); - - // Find max scratch regs used in the program. 
It's used for scratch buffer preallocation - // with dynamic parallelism, since runtime doesn't know which child kernel will be called - maxScratchRegs_ = std::max(static_cast(kernel->workGroupInfo()->scratchRegs_), maxScratchRegs_); - } - - // Allocate kernel table for device enqueuing - if (!isNull() && false/*dynamicParallelism*/ && !allocKernelTable()) { - return false; - } - - // Save the binary and type - clBinary()->saveBIFBinary((char*)binary, size); - setType(TYPE_EXECUTABLE); - return true; + } + + return linkImpl(options); } -LightningProgram::~LightningProgram() -{ - delete metadata_; +bool LightningProgram::linkImpl(amd::option::Options* options) { + using namespace amd::opencl_driver; + internal_ = (compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false; + + aclType continueCompileFrom = + llvmBinary_.empty() ? getNextCompilationStageFromBinary(options) : ACL_TYPE_LLVMIR_BINARY; + + if (continueCompileFrom == ACL_TYPE_ISA) { + binary_t isa = binary(); + if ((isa.first != NULL) && (isa.second > 0)) { + return setKernels(options, (void*)isa.first, isa.second); + } else { + buildLog_ += "Error: code object is empty \n"; + return false; + } + return true; + } + if (continueCompileFrom != ACL_TYPE_LLVMIR_BINARY) { + buildLog_ += "Error while Codegen phase: the binary is incomplete \n"; + return false; + } + + std::auto_ptr C(newCompilerInstance()); + // call LinkLLVMBitcode + std::vector inputs; + + // open the input IR source + Data* input = C->NewBufferReference(DT_LLVM_BC, llvmBinary_.data(), llvmBinary_.size()); + + if (!input) { + buildLog_ += "Error: Failed to open the compiled program.\n"; + return false; + } + + inputs.push_back(input); //< must be the first input + + // open the bitcode libraries + Data* opencl_bc = + C->NewBufferReference(DT_LLVM_BC, (const char*)opencl_amdgcn, opencl_amdgcn_size); + Data* ocml_bc = C->NewBufferReference(DT_LLVM_BC, (const char*)ocml_amdgcn, ocml_amdgcn_size); + Data* ockl_bc = 
C->NewBufferReference(DT_LLVM_BC, (const char*)ockl_amdgcn, ockl_amdgcn_size); + Data* irif_bc = C->NewBufferReference(DT_LLVM_BC, (const char*)irif_amdgcn, irif_amdgcn_size); + + if (!opencl_bc || !ocml_bc || !ockl_bc || !irif_bc) { + buildLog_ += "Error: Failed to open the bitcode library.\n"; + return false; + } + + inputs.push_back(opencl_bc); // depends on oclm & ockl + inputs.push_back(ockl_bc); // depends on irif + inputs.push_back(ocml_bc); // depends on irif + inputs.push_back(irif_bc); + + // open the control functions + auto isa_version = get_oclc_isa_version(dev().hwInfo()->gfxipVersion_); + if (!isa_version.first) { + buildLog_ += "Error: Linking for this device is not supported\n"; + return false; + } + + Data* isa_version_bc = + C->NewBufferReference(DT_LLVM_BC, (const char*)isa_version.first, isa_version.second); + + if (!isa_version_bc) { + buildLog_ += "Error: Failed to open the control functions.\n"; + return false; + } + + inputs.push_back(isa_version_bc); + + auto correctly_rounded_sqrt = + get_oclc_correctly_rounded_sqrt(options->oVariables->FP32RoundDivideSqrt); + Data* correctly_rounded_sqrt_bc = C->NewBufferReference(DT_LLVM_BC, correctly_rounded_sqrt.first, + correctly_rounded_sqrt.second); + + auto daz_opt = + get_oclc_daz_opt(dev().hwInfo()->gfxipVersion_ < 900 || options->oVariables->DenormsAreZero); + Data* daz_opt_bc = C->NewBufferReference(DT_LLVM_BC, daz_opt.first, daz_opt.second); + + auto finite_only = get_oclc_finite_only(options->oVariables->FiniteMathOnly || + options->oVariables->FastRelaxedMath); + Data* finite_only_bc = C->NewBufferReference(DT_LLVM_BC, finite_only.first, finite_only.second); + + auto unsafe_math = get_oclc_unsafe_math(options->oVariables->UnsafeMathOpt || + options->oVariables->FastRelaxedMath); + Data* unsafe_math_bc = C->NewBufferReference(DT_LLVM_BC, unsafe_math.first, unsafe_math.second); + + if (!correctly_rounded_sqrt_bc || !daz_opt_bc || !finite_only_bc || !unsafe_math_bc) { + buildLog_ += "Error: 
Failed to open the control functions.\n"; + return false; + } + + + if (!correctly_rounded_sqrt_bc || !daz_opt_bc || !finite_only_bc || !unsafe_math_bc) { + buildLog_ += "Error: Failed to open the control functions.\n"; + return false; + } + + inputs.push_back(correctly_rounded_sqrt_bc); + inputs.push_back(daz_opt_bc); + inputs.push_back(finite_only_bc); + inputs.push_back(unsafe_math_bc); + + // open the linked output + std::vector linkOptions; + amd::opencl_driver::Buffer* linked_bc = C->NewBuffer(DT_LLVM_BC); + + if (!linked_bc) { + buildLog_ += "Error: Failed to open the linked program.\n"; + return false; + } + + // NOTE: The linkOptions parameter is also used to identy cached code object. This parameter + // should not contain any dyanamically generated filename. + bool ret = + dev().cacheCompilation()->linkLLVMBitcode(C.get(), inputs, linked_bc, linkOptions, buildLog_); + buildLog_ += C->Output(); + if (!ret) { + buildLog_ += "Error: Linking bitcode failed: linking source & IR libraries.\n"; + return false; + } + + if (options->isDumpFlagSet(amd::option::DUMP_BC_LINKED)) { + std::ofstream f(options->getDumpFileName("_linked.bc").c_str(), + std::ios::binary | std::ios::trunc); + if (f.is_open()) { + f.write(linked_bc->Buf().data(), linked_bc->Size()); + f.close(); + } else { + buildLog_ += "Warning: opening the file to dump the linked IR failed.\n"; + } + } + + inputs.clear(); + inputs.push_back(linked_bc); + + amd::opencl_driver::Buffer* out_exec = C->NewBuffer(DT_EXECUTABLE); + if (!out_exec) { + buildLog_ += "Error: Failed to create the linked executable.\n"; + return false; + } + + std::string codegenOptions(options->llvmOptions); + + // Set the machine target + std::ostringstream mCPU; + mCPU << " -mcpu=gfx" << dev().hwInfo()->gfxipVersion_; + codegenOptions.append(mCPU.str()); + + // Set the -O# + std::ostringstream optLevel; + optLevel << "-O" << options->oVariables->OptLevel; + codegenOptions.append(" ").append(optLevel.str()); + + // Pass clang 
options + std::ostringstream ostrstr; + std::copy(options->clangOptions.begin(), options->clangOptions.end(), + std::ostream_iterator(ostrstr, " ")); + codegenOptions.append(" ").append(ostrstr.str()); + + // Set whole program mode + codegenOptions.append(" -mllvm -amdgpu-internalize-symbols -mllvm -amdgpu-early-inline-all"); + + // Tokenize the options string into a vector of strings + std::istringstream strstr(codegenOptions); + std::istream_iterator sit(strstr), end; + std::vector params(sit, end); + + // NOTE: The params is also used to identy cached code object. This parameter + // should not contain any dyanamically generated filename. + ret = dev().cacheCompilation()->compileAndLinkExecutable(C.get(), inputs, out_exec, params, + buildLog_); + buildLog_ += C->Output(); + if (!ret) { + buildLog_ += "Error: Creating the executable failed: Compiling LLVM IRs to exeutable\n"; + return false; + } + + if (options->isDumpFlagSet(amd::option::DUMP_O)) { + std::ofstream f(options->getDumpFileName(".so").c_str(), std::ios::binary | std::ios::trunc); + if (f.is_open()) { + f.write(out_exec->Buf().data(), out_exec->Size()); + f.close(); + } else { + buildLog_ += "Warning: opening the file to dump the code object failed.\n"; + } + } + + if (options->isDumpFlagSet(amd::option::DUMP_ISA)) { + std::string name = options->getDumpFileName(".s"); + File* dump = C->NewFile(DT_INTERNAL, name); + if (!C->DumpExecutableAsText(out_exec, dump)) { + buildLog_ += "Warning: failed to dump code object.\n"; + } + } + + return setKernels(options, out_exec->Buf().data(), out_exec->Size()); } -#endif // defined(WITH_LIGHTNING_COMPILER) +bool LightningProgram::setKernels(amd::option::Options* options, void* binary, size_t size) { + hsa_agent_t agent; + agent.handle = 1; -} // namespace pal + executable_ = loader_->CreateExecutable(HSA_PROFILE_FULL, NULL); + if (executable_ == nullptr) { + buildLog_ += "Error: Executable for AMD HSA Code Object isn't created.\n"; + return false; + } + + 
hsa_code_object_t code_object; + code_object.handle = reinterpret_cast(binary); + + hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr); + if (status != HSA_STATUS_SUCCESS) { + buildLog_ += "Error: AMD HSA Code Object loading failed.\n"; + return false; + } + + status = executable_->Freeze(nullptr); + if (status != HSA_STATUS_SUCCESS) { + buildLog_ += "Error: Freezing the executable failed: "; + return false; + } + + size_t progvarsTotalSize = 0; + + // Begin the Elf image from memory + Elf* e = elf_memory((char*)binary, size, NULL); + if (elf_kind(e) != ELF_K_ELF) { + buildLog_ += "Error while reading the ELF program binary\n"; + return false; + } + + size_t numpHdrs; + if (elf_getphdrnum(e, &numpHdrs) != 0) { + buildLog_ += "Error while reading the ELF program binary\n"; + return false; + } + + for (size_t i = 0; i < numpHdrs; ++i) { + GElf_Phdr pHdr; + if (gelf_getphdr(e, i, &pHdr) != &pHdr) { + continue; + } + // Look for the runtime metadata note + if (pHdr.p_type == PT_NOTE && pHdr.p_align >= sizeof(int)) { + // Iterate over the notes in this segment + address ptr = (address)binary + pHdr.p_offset; + address segmentEnd = ptr + pHdr.p_filesz; + + while (ptr < segmentEnd) { + Elf_Note* note = (Elf_Note*)ptr; + address name = (address)¬e[1]; + address desc = name + amd::alignUp(note->n_namesz, sizeof(int)); + + //! @todo: Use constants and enums defined in AMDGPUPTNote.h. + //! In order to switch to using constants and enums defined in + //! AMDGPUPTNote.h, we need to clean up internal header files. 
+ if (note->n_type == 7 || note->n_type == 8) { + buildLog_ += + "Error: object code with old metadata is not " + "supported\n"; + return false; + } else if (note->n_type == 10 /*AMDGPU::ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_METADATA*/ + && note->n_namesz == sizeof "AMD" && !memcmp(name, "AMD", note->n_namesz)) { + std::string metadataStr((const char*)desc, (size_t)note->n_descsz); + metadata_ = new CodeObjectMD(); + if (CodeObjectMD::fromYamlString(metadataStr, *metadata_)) { + buildLog_ += "Error: failed to process metadata\n"; + return false; + } + // We've found and loaded the runtime metadata, exit the + // note record loop now. + break; + } + ptr += sizeof(*note) + amd::alignUp(note->n_namesz, sizeof(int)) + + amd::alignUp(note->n_descsz, sizeof(int)); + } + } + // Accumulate the size of R & !X loadable segments + else if (pHdr.p_type == PT_LOAD && (pHdr.p_flags & PF_R) && !(pHdr.p_flags & PF_X)) { + progvarsTotalSize += pHdr.p_memsz; + } + } + + elf_end(e); + + if (!metadata_) { + buildLog_ += + "Error: runtime metadata section not present in " + "ELF program binary\n"; + return false; + } + + // note: The global variable size is updated in the context loader + // setGlobalVariableTotalSize(progvarsTotalSize); + + // Get the list of kernels + std::vector kernelNameList; + status = executable_->IterateSymbols(GetKernelNamesCallback, (void*)&kernelNameList); + if (status != HSA_STATUS_SUCCESS) { + buildLog_ += "Error: Failed to get kernel names\n"; + return false; + } + + for (auto& kernelName : kernelNameList) { + auto kernel = + new LightningKernel(kernelName, this, options->origOptionStr + hsailOptions(options)); + + kernels()[kernelName] = kernel; + + auto symbol = executable_->GetSymbol(kernelName.c_str(), &agent); + if (!symbol) { + buildLog_ += "Error: Getting kernel symbol '" + kernelName + + "' from AMD HSA Code Object failed. 
" + "Kernel initialization failed.\n"; + return false; + } + if (!kernel->init(symbol)) { + buildLog_ += "Error: Kernel '" + kernelName + "' initialization failed.\n"; + return false; + } + buildLog_ += kernel->buildLog(); + + kernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize); + + // Find max scratch regs used in the program. It's used for scratch buffer preallocation + // with dynamic parallelism, since runtime doesn't know which child kernel will be called + maxScratchRegs_ = + std::max(static_cast(kernel->workGroupInfo()->scratchRegs_), maxScratchRegs_); + } + + // Allocate kernel table for device enqueuing + if (!isNull() && false /*dynamicParallelism*/ && !allocKernelTable()) { + return false; + } + + // Save the binary and type + clBinary()->saveBIFBinary((char*)binary, size); + setType(TYPE_EXECUTABLE); + + return true; +} + +LightningProgram::~LightningProgram() { delete metadata_; } + +#endif // defined(WITH_LIGHTNING_COMPILER) + +} // namespace pal diff --git a/rocclr/runtime/device/pal/palprogram.hpp b/rocclr/runtime/device/pal/palprogram.hpp index 8d6046de0c..017124bd2a 100644 --- a/rocclr/runtime/device/pal/palprogram.hpp +++ b/rocclr/runtime/device/pal/palprogram.hpp @@ -11,20 +11,20 @@ #include "AMDGPUCodeObjectMetadata.h" typedef llvm::AMDGPU::CodeObject::Metadata CodeObjectMD; -#endif // defined(WITH_LIGHTNING_COMPILER) +#endif // defined(WITH_LIGHTNING_COMPILER) namespace amd { namespace option { class Options; -} // option +} // option namespace hsa { namespace loader { class Loader; class Executable; class Context; -} // loader -} // hsa -} // amd +} // loader +} // hsa +} // amd //! \namespace pal PAL Device Implementation namespace pal { @@ -37,313 +37,293 @@ using namespace amd::hsa::loader; class HSAILProgram; class Segment : public amd::HeapObject { -public: - Segment(); - ~Segment(); + public: + Segment(); + ~Segment(); - //! 
Allocates a segment - bool alloc(HSAILProgram& prog, amdgpu_hsa_elf_segment_t segment, - size_t size, size_t align, bool zero); + //! Allocates a segment + bool alloc(HSAILProgram& prog, amdgpu_hsa_elf_segment_t segment, size_t size, size_t align, + bool zero); - //! Copies data from host to the segment - void copy(size_t offset, const void* src, size_t size); + //! Copies data from host to the segment + void copy(size_t offset, const void* src, size_t size); - //! Segment freeze - bool freeze(bool destroySysmem); + //! Segment freeze + bool freeze(bool destroySysmem); - //! Returns address for GPU access in the segment - uint64_t gpuAddress(size_t offset) const { return gpuAccess_->vmAddress() + offset; } + //! Returns address for GPU access in the segment + uint64_t gpuAddress(size_t offset) const { return gpuAccess_->vmAddress() + offset; } - //! Returns address for CPU access in the segment - void* cpuAddress(size_t offset) const { return cpuAccess_->data() + offset; } + //! Returns address for CPU access in the segment + void* cpuAddress(size_t offset) const { return cpuAccess_->data() + offset; } -private: - Memory* gpuAccess_; //!< GPU memory for segment access - Memory* cpuAccess_; //!< CPU memory for segment (backing store) + private: + Memory* gpuAccess_; //!< GPU memory for segment access + Memory* cpuAccess_; //!< CPU memory for segment (backing store) }; -class PALHSALoaderContext final: public Context { -public: - PALHSALoaderContext(HSAILProgram* program): program_(program) {} +class PALHSALoaderContext final : public Context { + public: + PALHSALoaderContext(HSAILProgram* program) : program_(program) {} - virtual ~PALHSALoaderContext() {} + virtual ~PALHSALoaderContext() {} - hsa_isa_t IsaFromName(const char *name) override; + hsa_isa_t IsaFromName(const char* name) override; - bool IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) override; + bool IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) override; - void* 
SegmentAlloc(amdgpu_hsa_elf_segment_t segment, - hsa_agent_t agent, size_t size, size_t align, bool zero) override; + void* SegmentAlloc(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, size_t size, size_t align, + bool zero) override; - bool SegmentCopy(amdgpu_hsa_elf_segment_t segment, - hsa_agent_t agent, void* dst, size_t offset, - const void* src, size_t size) override; + bool SegmentCopy(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* dst, size_t offset, + const void* src, size_t size) override; - void SegmentFree(amdgpu_hsa_elf_segment_t segment, - hsa_agent_t agent, void* seg, size_t size = 0) override; + void SegmentFree(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, + size_t size = 0) override; - void* SegmentAddress(amdgpu_hsa_elf_segment_t segment, - hsa_agent_t agent, void* seg, size_t offset) override; + void* SegmentAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, + size_t offset) override; - void* SegmentHostAddress(amdgpu_hsa_elf_segment_t segment, - hsa_agent_t agent, void* seg, size_t offset) override; + void* SegmentHostAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, + size_t offset) override; - bool SegmentFreeze(amdgpu_hsa_elf_segment_t segment, - hsa_agent_t agent, void* seg, size_t size) override; + bool SegmentFreeze(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, + size_t size) override; - bool ImageExtensionSupported() override { return false; } + bool ImageExtensionSupported() override { return false; } - hsa_status_t ImageCreate( - hsa_agent_t agent, - hsa_access_permission_t image_permission, - const hsa_ext_image_descriptor_t *image_descriptor, - const void *image_data, - hsa_ext_image_t *image_handle) override { - // not supported - assert(false); - return HSA_STATUS_ERROR; - } + hsa_status_t ImageCreate(hsa_agent_t agent, hsa_access_permission_t image_permission, + const hsa_ext_image_descriptor_t* image_descriptor, + const void* 
image_data, hsa_ext_image_t* image_handle) override { + // not supported + assert(false); + return HSA_STATUS_ERROR; + } - hsa_status_t ImageDestroy( - hsa_agent_t agent, hsa_ext_image_t image_handle) override { - // not supported - assert(false); - return HSA_STATUS_ERROR; - } + hsa_status_t ImageDestroy(hsa_agent_t agent, hsa_ext_image_t image_handle) override { + // not supported + assert(false); + return HSA_STATUS_ERROR; + } - hsa_status_t SamplerCreate( - hsa_agent_t agent, - const hsa_ext_sampler_descriptor_t *sampler_descriptor, - hsa_ext_sampler_t *sampler_handle) override; + hsa_status_t SamplerCreate(hsa_agent_t agent, + const hsa_ext_sampler_descriptor_t* sampler_descriptor, + hsa_ext_sampler_t* sampler_handle) override; - //! All samplers are owned by HSAILProgram and are deleted in its destructor. - hsa_status_t SamplerDestroy( - hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) override; + //! All samplers are owned by HSAILProgram and are deleted in its destructor. + hsa_status_t SamplerDestroy(hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) override; -private: - PALHSALoaderContext(const PALHSALoaderContext &c); - PALHSALoaderContext& operator=(const PALHSALoaderContext &c); + private: + PALHSALoaderContext(const PALHSALoaderContext& c); + PALHSALoaderContext& operator=(const PALHSALoaderContext& c); - pal::HSAILProgram* program_; + pal::HSAILProgram* program_; }; //! \class HSAIL program -class HSAILProgram : public device::Program -{ - friend class ClBinary; -public: - //! Default constructor - HSAILProgram(Device& device); - HSAILProgram(NullDevice& device); - //! Default destructor - virtual ~HSAILProgram(); +class HSAILProgram : public device::Program { + friend class ClBinary; - //! Returns the aclBinary associated with the progrm - aclBinary* binaryElf() const { - return static_cast(binaryElf_); } + public: + //! Default constructor + HSAILProgram(Device& device); + HSAILProgram(NullDevice& device); + //! 
Default destructor + virtual ~HSAILProgram(); - void addGlobalStore(Memory* mem) { globalStores_.push_back(mem); } + //! Returns the aclBinary associated with the progrm + aclBinary* binaryElf() const { return static_cast(binaryElf_); } - void setCodeObjects(Memory* codeGpu, address codeCpu) - { codeSegGpu_ = codeGpu; codeSegCpu_ = codeCpu; } + void addGlobalStore(Memory* mem) { globalStores_.push_back(mem); } - const std::vector& globalStores() const { return globalStores_; } + void setCodeObjects(Memory* codeGpu, address codeCpu) { + codeSegGpu_ = codeGpu; + codeSegCpu_ = codeCpu; + } - //! Return a typecasted GPU device - pal::Device& dev() - { return const_cast( - static_cast(device())); } + const std::vector& globalStores() const { return globalStores_; } - //! Returns GPU kernel table - const Memory* kernelTable() const { return kernels_; } + //! Return a typecasted GPU device + pal::Device& dev() { return const_cast(static_cast(device())); } - //! Adds all kernels to the mem handle lists - void fillResListWithKernels(std::vector& memList) const; + //! Returns GPU kernel table + const Memory* kernelTable() const { return kernels_; } - //! Returns the maximum number of scratch regs used in the program - uint maxScratchRegs() const { return maxScratchRegs_; } + //! Adds all kernels to the mem handle lists + void fillResListWithKernels(std::vector& memList) const; - //! Add internal static sampler - void addSampler(Sampler* sampler) { staticSamplers_.push_back(sampler); } + //! Returns the maximum number of scratch regs used in the program + uint maxScratchRegs() const { return maxScratchRegs_; } - //! Returns TRUE if the program just compiled - bool isNull() const { return isNull_; } + //! Add internal static sampler + void addSampler(Sampler* sampler) { staticSamplers_.push_back(sampler); } - //! Returns TRUE if the program used internally by runtime - bool isInternal() const { return internal_; } + //! 
Returns TRUE if the program just compiled + bool isNull() const { return isNull_; } - //! Returns TRUE if the program contains static samplers - bool isStaticSampler() const { return (staticSamplers_.size() != 0); } + //! Returns TRUE if the program used internally by runtime + bool isInternal() const { return internal_; } - //! Returns code segement on GPU - const Memory& codeSegGpu() const { return *codeSegGpu_; } + //! Returns TRUE if the program contains static samplers + bool isStaticSampler() const { return (staticSamplers_.size() != 0); } - //! Returns code segement on CPU - address codeSegCpu() const { return codeSegCpu_; } + //! Returns code segement on GPU + const Memory& codeSegGpu() const { return *codeSegGpu_; } - //! Returns CPU address for a kernel - uint64_t findHostKernelAddress(uint64_t devAddr) const - { - return loader_->FindHostAddress(devAddr); - } + //! Returns code segement on CPU + address codeSegCpu() const { return codeSegCpu_; } -protected: - //! pre-compile setup for GPU - virtual bool initBuild(amd::option::Options* options); + //! Returns CPU address for a kernel + uint64_t findHostKernelAddress(uint64_t devAddr) const { + return loader_->FindHostAddress(devAddr); + } - //! post-compile setup for GPU - virtual bool finiBuild(bool isBuildGood); + protected: + //! pre-compile setup for GPU + virtual bool initBuild(amd::option::Options* options); - /*! \brief Compiles GPU CL program to LLVM binary (compiler frontend) - * - * \return True if we successefully compiled a GPU program - */ - virtual bool compileImpl( - const std::string& sourceCode, //!< the program's source code - const std::vector& headers, - const char** headerIncludeNames, - amd::option::Options* options //!< compile options's object - ); + //! 
post-compile setup for GPU + virtual bool finiBuild(bool isBuildGood); - /* \brief Returns the next stage to compile from, based on sections in binary, - * also returns completeStages in a vector, which contains at least ACL_TYPE_DEFAULT, - * sets needOptionsCheck to true if options check is needed to decide whether or not to recompile - */ - aclType getCompilationStagesFromBinary(std::vector& completeStages, bool& needOptionsCheck); + /*! \brief Compiles GPU CL program to LLVM binary (compiler frontend) + * + * \return True if we successefully compiled a GPU program + */ + virtual bool compileImpl(const std::string& sourceCode, //!< the program's source code + const std::vector& headers, + const char** headerIncludeNames, + amd::option::Options* options //!< compile options's object + ); - /* \brief Returns the next stage to compile from, based on sections and options in binary - */ - aclType getNextCompilationStageFromBinary(amd::option::Options* options); - - bool saveBinaryAndSetType(type_t type); + /* \brief Returns the next stage to compile from, based on sections in binary, + * also returns completeStages in a vector, which contains at least ACL_TYPE_DEFAULT, + * sets needOptionsCheck to true if options check is needed to decide whether or not to recompile + */ + aclType getCompilationStagesFromBinary(std::vector& completeStages, + bool& needOptionsCheck); - virtual bool linkImpl(amd::option::Options* options); + /* \brief Returns the next stage to compile from, based on sections and options in binary + */ + aclType getNextCompilationStageFromBinary(amd::option::Options* options); - //! Link the device programs. - virtual bool linkImpl (const std::vector& inputPrograms, - amd::option::Options* options, - bool createLibrary); + bool saveBinaryAndSetType(type_t type); - virtual bool createBinary(amd::option::Options* options); + virtual bool linkImpl(amd::option::Options* options); - //! Initialize Binary - virtual bool initClBinary(); + //! 
Link the device programs. + virtual bool linkImpl(const std::vector& inputPrograms, + amd::option::Options* options, bool createLibrary); - //! Release the Binary - virtual void releaseClBinary(); + virtual bool createBinary(amd::option::Options* options); - virtual const aclTargetInfo & info(const char * str = ""); + //! Initialize Binary + virtual bool initClBinary(); - virtual bool isElf(const char* bin) const { - return amd::isElfMagic(bin); - //return false; - } + //! Release the Binary + virtual void releaseClBinary(); - //! Returns the binary - // This should ensure that the binary is updated with all the kernels - // ClBinary& clBinary() { return binary_; } - ClBinaryHsa* clBinary() { - return static_cast(device::Program::clBinary()); - } - const ClBinaryHsa* clBinary() const { - return static_cast(device::Program::clBinary()); - } + virtual const aclTargetInfo& info(const char* str = ""); -private: - //! Disable default copy constructor - HSAILProgram(const HSAILProgram&); + virtual bool isElf(const char* bin) const { + return amd::isElfMagic(bin); + // return false; + } - //! Disable operator= - HSAILProgram& operator=(const HSAILProgram&); + //! Returns the binary + // This should ensure that the binary is updated with all the kernels + // ClBinary& clBinary() { return binary_; } + ClBinaryHsa* clBinary() { return static_cast(device::Program::clBinary()); } + const ClBinaryHsa* clBinary() const { + return static_cast(device::Program::clBinary()); + } -protected: - //! Returns all the options to be appended while passing to the - //compiler library - std::string hsailOptions(amd::option::Options* options); + private: + //! Disable default copy constructor + HSAILProgram(const HSAILProgram&); - //! Allocate kernel table - bool allocKernelTable(); + //! 
Disable operator= + HSAILProgram& operator=(const HSAILProgram&); - std::string openCLSource_; //!< Original OpenCL source - std::string HSAILProgram_; //!< FSAIL program after compilation - std::string llvmBinary_; //!< LLVM IR binary code - aclBinary* binaryElf_; //!< Binary for the new compiler library - void* rawBinary_; //!< Pointer to the raw binary - aclBinaryOptions binOpts_; //!< Binary options to create aclBinary - std::vector globalStores_; //!< Global memory for the program - Memory* kernels_; //!< Table with kernel object pointers - Memory* codeSegGpu_; //!< GPU memory with code objects - address codeSegCpu_; //!< CPU memory with code objects - uint maxScratchRegs_; //!< Maximum number of scratch regs used in the program by individual kernel - std::list staticSamplers_; //!< List od internal static samplers - union { - struct { - uint32_t isNull_ : 1; //!< Null program no memory allocations - uint32_t internal_ : 1; //!< Internal blit program - }; - uint32_t flags_; //!< Program flags + protected: + //! Returns all the options to be appended while passing to the + // compiler library + std::string hsailOptions(amd::option::Options* options); + + //! 
Allocate kernel table + bool allocKernelTable(); + + std::string openCLSource_; //!< Original OpenCL source + std::string HSAILProgram_; //!< FSAIL program after compilation + std::string llvmBinary_; //!< LLVM IR binary code + aclBinary* binaryElf_; //!< Binary for the new compiler library + void* rawBinary_; //!< Pointer to the raw binary + aclBinaryOptions binOpts_; //!< Binary options to create aclBinary + std::vector globalStores_; //!< Global memory for the program + Memory* kernels_; //!< Table with kernel object pointers + Memory* codeSegGpu_; //!< GPU memory with code objects + address codeSegCpu_; //!< CPU memory with code objects + uint + maxScratchRegs_; //!< Maximum number of scratch regs used in the program by individual kernel + std::list staticSamplers_; //!< List od internal static samplers + union { + struct { + uint32_t isNull_ : 1; //!< Null program no memory allocations + uint32_t internal_ : 1; //!< Internal blit program }; - amd::hsa::loader::Loader* loader_; //!< Loader object - amd::hsa::loader::Executable* executable_; //!< Executable for HSA Loader - PALHSALoaderContext loaderContext_; //!< Context for HSA Loader + uint32_t flags_; //!< Program flags + }; + amd::hsa::loader::Loader* loader_; //!< Loader object + amd::hsa::loader::Executable* executable_; //!< Executable for HSA Loader + PALHSALoaderContext loaderContext_; //!< Context for HSA Loader }; #if defined(WITH_LIGHTNING_COMPILER) //! 
\class Lightning Compiler Program -class LightningProgram : public HSAILProgram -{ -public: - LightningProgram(NullDevice& device) - : HSAILProgram(device), - metadata_(nullptr) - {} +class LightningProgram : public HSAILProgram { + public: + LightningProgram(NullDevice& device) : HSAILProgram(device), metadata_(nullptr) {} - LightningProgram(Device& device) - : HSAILProgram(device), - metadata_(nullptr) - {} + LightningProgram(Device& device) : HSAILProgram(device), metadata_(nullptr) {} - const CodeObjectMD* metadata() const { - return metadata_; - } -private: - virtual ~LightningProgram(); + const CodeObjectMD* metadata() const { return metadata_; } - /* \brief Returns the next stage to compile from, based on sections in binary, - * also returns completeStages in a vector, which contains at least ACL_TYPE_DEFAULT, - * sets needOptionsCheck to true if options check is needed to decide whether or not to recompile - */ - aclType getCompilationStagesFromBinary(std::vector& completeStages, bool& needOptionsCheck); + private: + virtual ~LightningProgram(); - /* \brief Returns the next stage to compile from, based on sections and options in binary - */ - aclType getNextCompilationStageFromBinary(amd::option::Options* options); + /* \brief Returns the next stage to compile from, based on sections in binary, + * also returns completeStages in a vector, which contains at least ACL_TYPE_DEFAULT, + * sets needOptionsCheck to true if options check is needed to decide whether or not to recompile + */ + aclType getCompilationStagesFromBinary(std::vector& completeStages, + bool& needOptionsCheck); -protected: - virtual bool compileImpl( - const std::string& sourceCode, //!< the program's source code - const std::vector& headers, - const char** headerIncludeNames, - amd::option::Options* options //!< compile options's object - ) override; + /* \brief Returns the next stage to compile from, based on sections and options in binary + */ + aclType 
getNextCompilationStageFromBinary(amd::option::Options* options); - virtual bool linkImpl(amd::option::Options* options) override; + protected: + virtual bool compileImpl(const std::string& sourceCode, //!< the program's source code + const std::vector& headers, + const char** headerIncludeNames, + amd::option::Options* options //!< compile options's object + ) override; - //! Link the device programs. - virtual bool linkImpl (const std::vector& inputPrograms, - amd::option::Options* options, - bool createLibrary) override; + virtual bool linkImpl(amd::option::Options* options) override; - bool setKernels(amd::option::Options *options, void* binary, size_t size); + //! Link the device programs. + virtual bool linkImpl(const std::vector& inputPrograms, + amd::option::Options* options, bool createLibrary) override; - virtual bool createBinary(amd::option::Options* options) override; + bool setKernels(amd::option::Options* options, void* binary, size_t size); - //! Return a new transient compiler instance. - static std::auto_ptr newCompilerInstance(); + virtual bool createBinary(amd::option::Options* options) override; -private: - CodeObjectMD* metadata_; //!< Runtime metadata + //! Return a new transient compiler instance. 
+ static std::auto_ptr newCompilerInstance(); + + private: + CodeObjectMD* metadata_; //!< Runtime metadata }; -#endif // defined(WITH_LIGHTNING_COMPILER) +#endif // defined(WITH_LIGHTNING_COMPILER) /*@}*/} // namespace pal diff --git a/rocclr/runtime/device/pal/palresource.cpp b/rocclr/runtime/device/pal/palresource.cpp index 00f89d1a08..8e67c4b70c 100644 --- a/rocclr/runtime/device/pal/palresource.cpp +++ b/rocclr/runtime/device/pal/palresource.cpp @@ -16,7 +16,7 @@ #include #include "CL/cl_d3d10.h" #include "CL/cl_d3d11.h" -#endif // _WIN32 +#endif // _WIN32 #include #include "GL/glATIInternal.h" @@ -28,922 +28,583 @@ namespace pal { -GpuMemoryReference* -GpuMemoryReference::Create( - const Device& dev, - const Pal::GpuMemoryCreateInfo& createInfo) -{ - Pal::Result result; - size_t gpuMemSize = dev.iDev()->GetGpuMemorySize(createInfo, &result); +GpuMemoryReference* GpuMemoryReference::Create(const Device& dev, + const Pal::GpuMemoryCreateInfo& createInfo) { + Pal::Result result; + size_t gpuMemSize = dev.iDev()->GetGpuMemorySize(createInfo, &result); + if (result != Pal::Result::Success) { + return nullptr; + } + + GpuMemoryReference* memRef = new (gpuMemSize) GpuMemoryReference(); + if (memRef != nullptr) { + result = dev.iDev()->CreateGpuMemory(createInfo, &memRef[1], &memRef->gpuMem_); if (result != Pal::Result::Success) { - return nullptr; + memRef->release(); + return nullptr; } - - GpuMemoryReference* memRef = new (gpuMemSize) GpuMemoryReference(); - if (memRef != nullptr) { - result = dev.iDev()->CreateGpuMemory(createInfo, &memRef[1], &memRef->gpuMem_); - if (result != Pal::Result::Success) { - memRef->release(); - return nullptr; - } - } - // Update free memory size counters - const_cast(dev).updateFreeMemory( - createInfo.heaps[0], createInfo.size, false); - return memRef; + } + // Update free memory size counters + const_cast(dev).updateFreeMemory(createInfo.heaps[0], createInfo.size, false); + return memRef; } -GpuMemoryReference* 
-GpuMemoryReference::Create( - const Device& dev, - const Pal::PinnedGpuMemoryCreateInfo& createInfo) -{ - Pal::Result result; - size_t gpuMemSize = dev.iDev()->GetPinnedGpuMemorySize(createInfo, &result); +GpuMemoryReference* GpuMemoryReference::Create(const Device& dev, + const Pal::PinnedGpuMemoryCreateInfo& createInfo) { + Pal::Result result; + size_t gpuMemSize = dev.iDev()->GetPinnedGpuMemorySize(createInfo, &result); + if (result != Pal::Result::Success) { + return nullptr; + } + + GpuMemoryReference* memRef = new (gpuMemSize) GpuMemoryReference(); + Pal::VaRange vaRange = Pal::VaRange::Default; + if (memRef != nullptr) { + result = dev.iDev()->CreatePinnedGpuMemory(createInfo, &memRef[1], &memRef->gpuMem_); if (result != Pal::Result::Success) { - return nullptr; + memRef->release(); + return nullptr; } - - GpuMemoryReference* memRef = new (gpuMemSize) GpuMemoryReference(); - Pal::VaRange vaRange = Pal::VaRange::Default; - if (memRef != nullptr) { - result = dev.iDev()->CreatePinnedGpuMemory(createInfo, - &memRef[1], &memRef->gpuMem_); - if (result != Pal::Result::Success) { - memRef->release(); - return nullptr; - } - } - // Update free memory size counters - const_cast(dev).updateFreeMemory( - Pal::GpuHeap::GpuHeapGartCacheable, createInfo.size, false); - return memRef; + } + // Update free memory size counters + const_cast(dev).updateFreeMemory(Pal::GpuHeap::GpuHeapGartCacheable, createInfo.size, + false); + return memRef; } -GpuMemoryReference* -GpuMemoryReference::Create( - const Device& dev, - const Pal::SvmGpuMemoryCreateInfo& createInfo) -{ - Pal::Result result; - size_t gpuMemSize = dev.iDev()->GetSvmGpuMemorySize(createInfo, &result); +GpuMemoryReference* GpuMemoryReference::Create(const Device& dev, + const Pal::SvmGpuMemoryCreateInfo& createInfo) { + Pal::Result result; + size_t gpuMemSize = dev.iDev()->GetSvmGpuMemorySize(createInfo, &result); + if (result != Pal::Result::Success) { + return nullptr; + } + + GpuMemoryReference* memRef = new 
(gpuMemSize) GpuMemoryReference(); + if (memRef != nullptr) { + result = dev.iDev()->CreateSvmGpuMemory(createInfo, &memRef[1], &memRef->gpuMem_); if (result != Pal::Result::Success) { - return nullptr; + memRef->release(); + return nullptr; } - - GpuMemoryReference* memRef = new (gpuMemSize) GpuMemoryReference(); - if (memRef != nullptr) { - result = dev.iDev()->CreateSvmGpuMemory(createInfo, - &memRef[1], &memRef->gpuMem_); - if (result != Pal::Result::Success) { - memRef->release(); - return nullptr; - } - } - // Update free memory size counters - const_cast(dev).updateFreeMemory( - Pal::GpuHeap::GpuHeapGartCacheable, createInfo.size, false); - return memRef; + } + // Update free memory size counters + const_cast(dev).updateFreeMemory(Pal::GpuHeap::GpuHeapGartCacheable, createInfo.size, + false); + return memRef; } -GpuMemoryReference* -GpuMemoryReference::Create( - const Device& dev, - const Pal::ExternalGpuMemoryOpenInfo& openInfo) -{ - Pal::Result result; - size_t gpuMemSize = dev.iDev()->GetExternalSharedGpuMemorySize(&result); +GpuMemoryReference* GpuMemoryReference::Create(const Device& dev, + const Pal::ExternalGpuMemoryOpenInfo& openInfo) { + Pal::Result result; + size_t gpuMemSize = dev.iDev()->GetExternalSharedGpuMemorySize(&result); + if (result != Pal::Result::Success) { + return nullptr; + } + + Pal::GpuMemoryCreateInfo createInfo = {}; + GpuMemoryReference* memRef = new (gpuMemSize) GpuMemoryReference(); + if (memRef != nullptr) { + result = dev.iDev()->OpenExternalSharedGpuMemory(openInfo, &memRef[1], &createInfo, + &memRef->gpuMem_); if (result != Pal::Result::Success) { - return nullptr; + memRef->release(); + return nullptr; } + } - Pal::GpuMemoryCreateInfo createInfo = {}; - GpuMemoryReference* memRef = new (gpuMemSize) GpuMemoryReference(); - if (memRef != nullptr) { - result = dev.iDev()->OpenExternalSharedGpuMemory( - openInfo, &memRef[1], &createInfo, &memRef->gpuMem_); - if (result != Pal::Result::Success) { - memRef->release(); - return 
nullptr; - } - } - - return memRef; + return memRef; } -GpuMemoryReference* -GpuMemoryReference::Create( - const Device& dev, - const Pal::ExternalImageOpenInfo& openInfo, - Pal::ImageCreateInfo* imgCreateInfo, - Pal::IImage** image) -{ - Pal::Result result; - size_t gpuMemSize = 0; - size_t imageSize = 0; - if (Pal::Result::Success != dev.iDev()->GetExternalSharedImageSizes( - openInfo, &imageSize, &gpuMemSize, imgCreateInfo)) { - return nullptr; - } +GpuMemoryReference* GpuMemoryReference::Create(const Device& dev, + const Pal::ExternalImageOpenInfo& openInfo, + Pal::ImageCreateInfo* imgCreateInfo, + Pal::IImage** image) { + Pal::Result result; + size_t gpuMemSize = 0; + size_t imageSize = 0; + if (Pal::Result::Success != + dev.iDev()->GetExternalSharedImageSizes(openInfo, &imageSize, &gpuMemSize, imgCreateInfo)) { + return nullptr; + } - Pal::GpuMemoryCreateInfo createInfo = {}; - GpuMemoryReference* memRef = new (gpuMemSize) GpuMemoryReference(); - char* imgMem = new char [imageSize]; - if (memRef != nullptr) { - result = dev.iDev()->OpenExternalSharedImage( - openInfo, imgMem, &memRef[1], &createInfo, image, &memRef->gpuMem_); - if (result != Pal::Result::Success) { - memRef->release(); - return nullptr; - } + Pal::GpuMemoryCreateInfo createInfo = {}; + GpuMemoryReference* memRef = new (gpuMemSize) GpuMemoryReference(); + char* imgMem = new char[imageSize]; + if (memRef != nullptr) { + result = dev.iDev()->OpenExternalSharedImage(openInfo, imgMem, &memRef[1], &createInfo, image, + &memRef->gpuMem_); + if (result != Pal::Result::Success) { + memRef->release(); + return nullptr; } + } - return memRef; + return memRef; } -GpuMemoryReference::GpuMemoryReference() - : gpuMem_(nullptr) - , cpuAddress_(nullptr) -{ +GpuMemoryReference::GpuMemoryReference() : gpuMem_(nullptr), cpuAddress_(nullptr) {} + +GpuMemoryReference::~GpuMemoryReference() { + if (cpuAddress_ != nullptr) { + iMem()->Unmap(); + } + if (0 != iMem()) { + iMem()->Destroy(); + gpuMem_ = nullptr; + } } 
-GpuMemoryReference::~GpuMemoryReference() -{ - if (cpuAddress_ != nullptr) { - iMem()->Unmap(); - } - if (0 != iMem()) { - iMem()->Destroy(); - gpuMem_ = nullptr; - } +Resource::Resource(const Device& gpuDev, size_t size) + : elementSize_(0), + gpuDevice_(gpuDev), + mapCount_(0), + address_(nullptr), + offset_(0), + curRename_(0), + memRef_(nullptr), + viewOwner_(nullptr), + pinOffset_(0), + gpu_(nullptr), + image_(nullptr), + hwSrd_(0) { + // Fill resource descriptor fields + desc_.state_ = 0; + desc_.type_ = Empty; + desc_.width_ = amd::alignUp(size, Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint)) / + Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint); + desc_.height_ = 1; + desc_.depth_ = 1; + desc_.mipLevels_ = 1; + desc_.format_.image_channel_order = CL_R; + desc_.format_.image_channel_data_type = CL_FLOAT; + desc_.flags_ = 0; + desc_.pitch_ = 0; + desc_.slice_ = 0; + desc_.cardMemory_ = true; + desc_.dimSize_ = 1; + desc_.buffer_ = true; + desc_.imageArray_ = false; + desc_.topology_ = CL_MEM_OBJECT_BUFFER; + desc_.SVMRes_ = false; + desc_.scratch_ = false; + desc_.isAllocExecute_ = false; + desc_.baseLevel_ = 0; } -Resource::Resource( - const Device& gpuDev, - size_t size) - : elementSize_(0) - , gpuDevice_(gpuDev) - , mapCount_(0) - , address_(nullptr) - , offset_(0) - , curRename_(0) - , memRef_(nullptr) - , viewOwner_(nullptr) - , pinOffset_(0) - , gpu_(nullptr) - , image_(nullptr) - , hwSrd_(0) -{ - // Fill resource descriptor fields - desc_.state_ = 0; - desc_.type_ = Empty; - desc_.width_ = amd::alignUp(size, - Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint)) / - Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint); - desc_.height_ = 1; - desc_.depth_ = 1; - desc_.mipLevels_ = 1; - desc_.format_.image_channel_order = CL_R; - desc_.format_.image_channel_data_type = CL_FLOAT; - desc_.flags_ = 0; - desc_.pitch_ = 0; - desc_.slice_ = 0; - desc_.cardMemory_ = true; - desc_.dimSize_ = 1; - desc_.buffer_ = true; - desc_.imageArray_ = 
false; - desc_.topology_ = CL_MEM_OBJECT_BUFFER; - desc_.SVMRes_ = false; - desc_.scratch_ = false; - desc_.isAllocExecute_ = false; - desc_.baseLevel_ = 0; -} +Resource::Resource(const Device& gpuDev, size_t width, size_t height, size_t depth, + cl_image_format format, cl_mem_object_type imageType, uint mipLevels) + : elementSize_(0), + gpuDevice_(gpuDev), + mapCount_(0), + address_(nullptr), + offset_(0), + curRename_(0), + memRef_(nullptr), + viewOwner_(nullptr), + pinOffset_(0), + gpu_(nullptr), + image_(nullptr), + hwSrd_(0) { + // Fill resource descriptor fields + desc_.state_ = 0; + desc_.type_ = Empty; + desc_.width_ = width; + desc_.height_ = height; + desc_.depth_ = depth; + desc_.mipLevels_ = mipLevels; + desc_.format_ = format; + desc_.flags_ = 0; + desc_.pitch_ = 0; + desc_.slice_ = 0; + desc_.cardMemory_ = true; + desc_.buffer_ = false; + desc_.imageArray_ = false; + desc_.topology_ = imageType; + desc_.SVMRes_ = false; + desc_.scratch_ = false; + desc_.isAllocExecute_ = false; + desc_.baseLevel_ = 0; -Resource::Resource( - const Device& gpuDev, - size_t width, - size_t height, - size_t depth, - cl_image_format format, - cl_mem_object_type imageType, - uint mipLevels) - : elementSize_(0) - , gpuDevice_(gpuDev) - , mapCount_(0) - , address_(nullptr) - , offset_(0) - , curRename_(0) - , memRef_(nullptr) - , viewOwner_(nullptr) - , pinOffset_(0) - , gpu_(nullptr) - , image_(nullptr) - , hwSrd_(0) -{ - // Fill resource descriptor fields - desc_.state_ = 0; - desc_.type_ = Empty; - desc_.width_ = width; - desc_.height_ = height; - desc_.depth_ = depth; - desc_.mipLevels_ = mipLevels; - desc_.format_ = format; - desc_.flags_ = 0; - desc_.pitch_ = 0; - desc_.slice_ = 0; - desc_.cardMemory_ = true; - desc_.buffer_ = false; - desc_.imageArray_ = false; - desc_.topology_ = imageType; - desc_.SVMRes_ = false; - desc_.scratch_ = false; - desc_.isAllocExecute_ = false; - desc_.baseLevel_ = 0; - - switch (imageType) { + switch (imageType) { case 
CL_MEM_OBJECT_IMAGE2D: - desc_.dimSize_ = 2; - break; + desc_.dimSize_ = 2; + break; case CL_MEM_OBJECT_IMAGE3D: - desc_.dimSize_ = 3; - break; + desc_.dimSize_ = 3; + break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: - desc_.dimSize_ = 3; - desc_.imageArray_ = true; - break; + desc_.dimSize_ = 3; + desc_.imageArray_ = true; + break; case CL_MEM_OBJECT_IMAGE1D: - desc_.dimSize_ = 1; - break; + desc_.dimSize_ = 1; + break; case CL_MEM_OBJECT_IMAGE1D_ARRAY: - desc_.dimSize_ = 2; - desc_.imageArray_ = true; - break; + desc_.dimSize_ = 2; + desc_.imageArray_ = true; + break; case CL_MEM_OBJECT_IMAGE1D_BUFFER: - desc_.dimSize_ = 1; - break; + desc_.dimSize_ = 1; + break; default: - desc_.dimSize_ = 1; - LogError("Unknown image type!"); - break; - } + desc_.dimSize_ = 1; + LogError("Unknown image type!"); + break; + } } -Resource::~Resource() -{ - Pal::GpuHeap heap = Pal::GpuHeapCount; - switch (memoryType()) { +Resource::~Resource() { + Pal::GpuHeap heap = Pal::GpuHeapCount; + switch (memoryType()) { case Persistent: - heap = Pal::GpuHeapLocal; - break; + heap = Pal::GpuHeapLocal; + break; case RemoteUSWC: - heap = Pal::GpuHeapGartUswc; - break; + heap = Pal::GpuHeapGartUswc; + break; case Pinned: case Remote: - heap = Pal::GpuHeapGartCacheable; - break; + heap = Pal::GpuHeapGartCacheable; + break; case Shader: case BusAddressable: case ExternalPhysical: - // Fall through to process the memory allocation ... + // Fall through to process the memory allocation ... 
case Local: - heap = Pal::GpuHeapInvisible; - break; + heap = Pal::GpuHeapInvisible; + break; default: - heap = Pal::GpuHeapLocal; - break; - } - if ((memRef_ != nullptr) && (heap != Pal::GpuHeapCount)) { - // Update free memory size counters - const_cast(dev()).updateFreeMemory( - heap, iMem()->Desc().size, true); - } + heap = Pal::GpuHeapLocal; + break; + } + if ((memRef_ != nullptr) && (heap != Pal::GpuHeapCount)) { + // Update free memory size counters + const_cast(dev()).updateFreeMemory(heap, iMem()->Desc().size, true); + } - free(); + free(); - if ((nullptr != image_) && ((memoryType() != ImageView) || - //! @todo PAL doesn't allow an SRD view creation with different pixel size - (elementSize() != viewOwner_->elementSize()))) { - image_->Destroy(); - delete [] reinterpret_cast(image_); - } + if ((nullptr != image_) && + ((memoryType() != ImageView) || + //! @todo PAL doesn't allow an SRD view creation with different pixel size + (elementSize() != viewOwner_->elementSize()))) { + image_->Destroy(); + delete[] reinterpret_cast(image_); + } } -static uint32_t GetHSAILImageFormatType(const cl_image_format& format) -{ - static const uint32_t FormatType[] = { - HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8, - HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16, - HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8, - HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16, - HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565, - HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555, - HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010, - HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8, - HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16, - HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32, - HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8, - HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16, - HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32, - HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT, - HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT, - HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT24 }; +static uint32_t GetHSAILImageFormatType(const cl_image_format& format) { + static const uint32_t FormatType[] = 
{HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8, + HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010, + HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8, + HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16, + HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32, + HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT, + HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT24}; - uint idx = format.image_channel_data_type - CL_SNORM_INT8; - assert((idx <= (CL_UNORM_INT24 - CL_SNORM_INT8)) && "Out of range format channel!"); - return FormatType[idx]; + uint idx = format.image_channel_data_type - CL_SNORM_INT8; + assert((idx <= (CL_UNORM_INT24 - CL_SNORM_INT8)) && "Out of range format channel!"); + return FormatType[idx]; } -static uint32_t GetHSAILImageOrderType(const cl_image_format& format) -{ - static const uint32_t OrderType[] = { - HSA_EXT_IMAGE_CHANNEL_ORDER_R, - HSA_EXT_IMAGE_CHANNEL_ORDER_A, - HSA_EXT_IMAGE_CHANNEL_ORDER_RG, - HSA_EXT_IMAGE_CHANNEL_ORDER_RA, - HSA_EXT_IMAGE_CHANNEL_ORDER_RGB, - HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA, - HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA, - HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB, - HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY, - HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE, - HSA_EXT_IMAGE_CHANNEL_ORDER_RX, - HSA_EXT_IMAGE_CHANNEL_ORDER_RGX, - HSA_EXT_IMAGE_CHANNEL_ORDER_RGBX, - HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH, - HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL, - HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB, - HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX, - HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA, - HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA, - HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR }; +static uint32_t GetHSAILImageOrderType(const cl_image_format& format) { + static const uint32_t OrderType[] = 
{HSA_EXT_IMAGE_CHANNEL_ORDER_R, + HSA_EXT_IMAGE_CHANNEL_ORDER_A, + HSA_EXT_IMAGE_CHANNEL_ORDER_RG, + HSA_EXT_IMAGE_CHANNEL_ORDER_RA, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGB, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA, + HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA, + HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB, + HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY, + HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE, + HSA_EXT_IMAGE_CHANNEL_ORDER_RX, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGX, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGBX, + HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH, + HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL, + HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB, + HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX, + HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA, + HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA, + HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR}; - uint idx = format.image_channel_order - CL_R; - assert((idx <= (CL_ABGR - CL_R)) && "Out of range format order!"); - return OrderType[idx]; + uint idx = format.image_channel_order - CL_R; + assert((idx <= (CL_ABGR - CL_R)) && "Out of range format order!"); + return OrderType[idx]; } -void -Resource::memTypeToHeap(Pal::GpuMemoryCreateInfo* createInfo) -{ - createInfo->heapCount = 1; - switch (memoryType()) { +void Resource::memTypeToHeap(Pal::GpuMemoryCreateInfo* createInfo) { + createInfo->heapCount = 1; + switch (memoryType()) { case Persistent: - createInfo->heaps[0] = Pal::GpuHeapLocal; - break; + createInfo->heaps[0] = Pal::GpuHeapLocal; + break; case RemoteUSWC: - createInfo->heaps[0] = Pal::GpuHeapGartUswc; - desc_.cardMemory_ = false; - break; + createInfo->heaps[0] = Pal::GpuHeapGartUswc; + desc_.cardMemory_ = false; + break; case Remote: - createInfo->heaps[0] = Pal::GpuHeapGartCacheable; - desc_.cardMemory_ = false; - break; + createInfo->heaps[0] = Pal::GpuHeapGartCacheable; + desc_.cardMemory_ = false; + break; case ExternalPhysical: - desc_.cardMemory_ = false; + desc_.cardMemory_ = false; case Shader: - // Fall through to process the memory allocation ... + // Fall through to process the memory allocation ... 
case Local: - createInfo->heapCount = 2; - createInfo->heaps[0] = Pal::GpuHeapInvisible; - createInfo->heaps[1] = Pal::GpuHeapLocal; - break; + createInfo->heapCount = 2; + createInfo->heaps[0] = Pal::GpuHeapInvisible; + createInfo->heaps[1] = Pal::GpuHeapLocal; + break; default: - createInfo->heaps[0] = Pal::GpuHeapLocal; - break; - } + createInfo->heaps[0] = Pal::GpuHeapLocal; + break; + } } -bool -Resource::create(MemoryType memType, CreateParams* params) -{ - static const Pal::gpusize MaxGpuAlignment = 64 * Ki; - const amd::HostMemoryReference* hostMemRef = nullptr; - bool imageCreateView = false; - uint hostMemOffset = 0; - bool foundCalRef = false; - bool viewDefined = false; - uint viewLayer = 0; - uint viewLevel = 0; - uint viewFlags = 0; - Pal::SubresId ImgSubresId = { Pal::ImageAspect::Color, 0, 0 }; - Pal::SubresRange ImgSubresRange = { ImgSubresId, 1, 1 }; - Pal::ChannelMapping channels; - Pal::ChNumFormat format = dev().getPalFormat(desc().format_, &channels); +bool Resource::create(MemoryType memType, CreateParams* params) { + static const Pal::gpusize MaxGpuAlignment = 64 * Ki; + const amd::HostMemoryReference* hostMemRef = nullptr; + bool imageCreateView = false; + uint hostMemOffset = 0; + bool foundCalRef = false; + bool viewDefined = false; + uint viewLayer = 0; + uint viewLevel = 0; + uint viewFlags = 0; + Pal::SubresId ImgSubresId = {Pal::ImageAspect::Color, 0, 0}; + Pal::SubresRange ImgSubresRange = {ImgSubresId, 1, 1}; + Pal::ChannelMapping channels; + Pal::ChNumFormat format = dev().getPalFormat(desc().format_, &channels); - // This is a thread safe operation - const_cast(dev()).initializeHeapResources(); + // This is a thread safe operation + const_cast(dev()).initializeHeapResources(); - amd::ScopedLock lk(dev().lockPAL()); + amd::ScopedLock lk(dev().lockPAL()); - if (memType == Shader) { - if (dev().settings().svmFineGrainSystem_) { - desc_.isAllocExecute_ = true; - desc_.SVMRes_ = true; - memType = RemoteUSWC; - } - else { - memType = 
Local; - } - // force to use remote memory for HW DEBUG or use - // local memory once we determine if FGS is supported - // memType = (!dev().settings().enableHwDebug_) ? Local : RemoteUSWC; + if (memType == Shader) { + if (dev().settings().svmFineGrainSystem_) { + desc_.isAllocExecute_ = true; + desc_.SVMRes_ = true; + memType = RemoteUSWC; + } else { + memType = Local; } + // force to use remote memory for HW DEBUG or use + // local memory once we determine if FGS is supported + // memType = (!dev().settings().enableHwDebug_) ? Local : RemoteUSWC; + } - // Get the element size - elementSize_ = Pal::Formats::BytesPerPixel(format); - desc_.type_ = memType; - if (memType == Scratch) { - // use local memory for scratch buffer unless it is using HW DEBUG - desc_.type_ = (!dev().settings().enableHwDebug_) ? Local : RemoteUSWC; - desc_.scratch_ = true; + // Get the element size + elementSize_ = Pal::Formats::BytesPerPixel(format); + desc_.type_ = memType; + if (memType == Scratch) { + // use local memory for scratch buffer unless it is using HW DEBUG + desc_.type_ = (!dev().settings().enableHwDebug_) ? 
Local : RemoteUSWC; + desc_.scratch_ = true; + } + + // Force remote allocation if it was requested in the settings + if (dev().settings().remoteAlloc_ && ((memoryType() == Local) || (memoryType() == Persistent))) { + if (dev().settings().apuSystem_ && dev().settings().viPlus_) { + desc_.type_ = Remote; + } else { + desc_.type_ = RemoteUSWC; } + } - // Force remote allocation if it was requested in the settings - if (dev().settings().remoteAlloc_ && - ((memoryType() == Local) || - (memoryType() == Persistent))) { - if (dev().settings().apuSystem_ && dev().settings().viPlus_) { - desc_.type_ = Remote; - } - else { - desc_.type_ = RemoteUSWC; - } - } + if (dev().settings().disablePersistent_ && (memoryType() == Persistent)) { + desc_.type_ = RemoteUSWC; + } - if (dev().settings().disablePersistent_ && (memoryType() == Persistent)) { - desc_.type_ = RemoteUSWC; - } + if (params != nullptr) { + gpu_ = params->gpu_; + } - if (params != nullptr) { - gpu_ = params->gpu_; - } - - Pal::Result result; + Pal::Result result; #ifdef _WIN32 - if ((memoryType() == OGLInterop) || - (memoryType() == D3D9Interop) || - (memoryType() == D3D10Interop) || - (memoryType() == D3D11Interop)) { - Pal::ExternalGpuMemoryOpenInfo gpuMemOpenInfo = {}; - Pal::ExternalResourceOpenInfo& openInfo = gpuMemOpenInfo.resourceInfo; - uint misc = 0; - uint layer = 0; - uint mipLevel = 0; - InteropType type = InteropTypeless; + if ((memoryType() == OGLInterop) || (memoryType() == D3D9Interop) || + (memoryType() == D3D10Interop) || (memoryType() == D3D11Interop)) { + Pal::ExternalGpuMemoryOpenInfo gpuMemOpenInfo = {}; + Pal::ExternalResourceOpenInfo& openInfo = gpuMemOpenInfo.resourceInfo; + uint misc = 0; + uint layer = 0; + uint mipLevel = 0; + InteropType type = InteropTypeless; - if (memoryType() == OGLInterop) { - OGLInteropParams* oglRes = reinterpret_cast(params); - assert(oglRes->glPlatformContext_ && "We don't have OGL context!"); - switch (oglRes->type_) { - case InteropVertexBuffer: - glType_ = 
GL_RESOURCE_ATTACH_VERTEXBUFFER_AMD; - break; - case InteropRenderBuffer: - glType_ = GL_RESOURCE_ATTACH_RENDERBUFFER_AMD; - break; - case InteropTexture: - case InteropTextureViewLevel: - case InteropTextureViewCube: - glType_ = GL_RESOURCE_ATTACH_TEXTURE_AMD; - break; - default: - LogError("Unknown OGL interop type!"); - return false; - break; - } - glPlatformContext_ = oglRes->glPlatformContext_; - glDeviceContext_ = oglRes->glDeviceContext_; - layer = oglRes->layer_; - type = oglRes->type_; - mipLevel = oglRes->mipLevel_; + if (memoryType() == OGLInterop) { + OGLInteropParams* oglRes = reinterpret_cast(params); + assert(oglRes->glPlatformContext_ && "We don't have OGL context!"); + switch (oglRes->type_) { + case InteropVertexBuffer: + glType_ = GL_RESOURCE_ATTACH_VERTEXBUFFER_AMD; + break; + case InteropRenderBuffer: + glType_ = GL_RESOURCE_ATTACH_RENDERBUFFER_AMD; + break; + case InteropTexture: + case InteropTextureViewLevel: + case InteropTextureViewCube: + glType_ = GL_RESOURCE_ATTACH_TEXTURE_AMD; + break; + default: + LogError("Unknown OGL interop type!"); + return false; + break; + } + glPlatformContext_ = oglRes->glPlatformContext_; + glDeviceContext_ = oglRes->glDeviceContext_; + layer = oglRes->layer_; + type = oglRes->type_; + mipLevel = oglRes->mipLevel_; - if (!dev().resGLAssociate(oglRes->glPlatformContext_, oglRes->handle_, - glType_, &openInfo.hExternalResource, &glInteropMbRes_, &offset_, - openInfo.doppDesktopInfo)) { - return false; - } - desc_.isDoppTexture_ = (openInfo.doppDesktopInfo.gpuVirtAddr != 0); - } - else { - D3DInteropParams* d3dRes = reinterpret_cast(params); - openInfo.hExternalResource = d3dRes->handle_; - misc = d3dRes->misc; - layer = d3dRes->layer_; - type = d3dRes->type_; - mipLevel = d3dRes->mipLevel_; - } - //! @todo PAL query for image/buffer object doesn't work properly! 
+ if (!dev().resGLAssociate(oglRes->glPlatformContext_, oglRes->handle_, glType_, + &openInfo.hExternalResource, &glInteropMbRes_, &offset_, + openInfo.doppDesktopInfo)) { + return false; + } + desc_.isDoppTexture_ = (openInfo.doppDesktopInfo.gpuVirtAddr != 0); + } else { + D3DInteropParams* d3dRes = reinterpret_cast(params); + openInfo.hExternalResource = d3dRes->handle_; + misc = d3dRes->misc; + layer = d3dRes->layer_; + type = d3dRes->type_; + mipLevel = d3dRes->mipLevel_; + } +//! @todo PAL query for image/buffer object doesn't work properly! #if 0 bool isImage = false; if (Pal::Result::Success != dev().iDev()->DetermineExternalSharedResourceType(openInfo, &isImage)) { return false; } -#endif // 0 - if (desc().buffer_ || misc) { - memRef_ = GpuMemoryReference::Create(dev(), gpuMemOpenInfo); - if (nullptr == memRef_) { - return false; - } +#endif // 0 + if (desc().buffer_ || misc) { + memRef_ = GpuMemoryReference::Create(dev(), gpuMemOpenInfo); + if (nullptr == memRef_) { + return false; + } - if (misc) { - Pal::ImageCreateInfo imgCreateInfo = {}; - Pal::ExternalImageOpenInfo imgOpenInfo = {}; - imgOpenInfo.resourceInfo = openInfo; - imgOpenInfo.swizzledFormat.format = format; - imgOpenInfo.swizzledFormat.swizzle = channels; - imgOpenInfo.flags.formatChangeSrd = true; - imgOpenInfo.usage.shaderRead = true; - imgOpenInfo.usage.shaderWrite = true; - size_t imageSize; - size_t gpuMemSize; + if (misc) { + Pal::ImageCreateInfo imgCreateInfo = {}; + Pal::ExternalImageOpenInfo imgOpenInfo = {}; + imgOpenInfo.resourceInfo = openInfo; + imgOpenInfo.swizzledFormat.format = format; + imgOpenInfo.swizzledFormat.swizzle = channels; + imgOpenInfo.flags.formatChangeSrd = true; + imgOpenInfo.usage.shaderRead = true; + imgOpenInfo.usage.shaderWrite = true; + size_t imageSize; + size_t gpuMemSize; - if (Pal::Result::Success != dev().iDev()->GetExternalSharedImageSizes( - imgOpenInfo, &imageSize, &gpuMemSize, &imgCreateInfo)) { - return false; - } - - Pal::gpusize viewOffset = 0; 
- imgCreateInfo.flags.shareable = false; - imgCreateInfo.imageType = Pal::ImageType::Tex2d; - imgCreateInfo.extent.width = desc().width_; - imgCreateInfo.extent.height = desc().height_; - imgCreateInfo.extent.depth = desc().depth_; - imgCreateInfo.arraySize = 1; - imgCreateInfo.flags.formatChangeSrd = true; - imgCreateInfo.usageFlags.shaderRead = true; - imgCreateInfo.usageFlags.shaderWrite = true; - imgCreateInfo.swizzledFormat.format = format; - imgCreateInfo.swizzledFormat.swizzle = channels; - imgCreateInfo.mipLevels = 1; - imgCreateInfo.samples = 1; - imgCreateInfo.fragments = 1; - imgCreateInfo.tiling = Pal::ImageTiling::Linear; - imgCreateInfo.depthPitch = desc().height_ * imgCreateInfo.rowPitch; - - switch (misc) { - case 1: // NV12 format - switch (layer) { - case -1: - break; - case 0: - break; - case 1: - // Y - plane size to the offset - // NV12 format. UV is 2 times smaller plane Y - viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_; - imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_; - break; - default: - LogError("Unknown Interop View Type"); - return false; - } - break; - case 2: // YV12 format - switch (layer) { - case -1: - break; - case 0: - break; - case 1: - // Y - plane size to the offset - // YV12 format. U is 4 times smaller plane than Y - viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_; - imgCreateInfo.rowPitch >>= 1; - break; - case 2: - // Y + U plane sizes to the offest. 
- // U plane is 4 times smaller than Y and U == V - viewOffset = 5 * imgCreateInfo.rowPitch * desc().height_ / 2; - imgCreateInfo.rowPitch >>= 1; - break; - default: - LogError("Unknown Interop View Type"); - return false; - } - imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_; - break; - default: - LogError("Unknown Interop View Type"); - return false; - } - - imageSize = dev().iDev()->GetImageSize(imgCreateInfo, &result); - if (result != Pal::Result::Success) { - return false; - } - - char* memImg = new char[imageSize]; - if (memImg != nullptr) { - result = dev().iDev()->CreateImage(imgCreateInfo, memImg, &image_); - if (result != Pal::Result::Success) { - delete memImg; - return false; - } - } - result = image_->BindGpuMemory(iMem(), viewOffset); - if (result != Pal::Result::Success) { - return false; - } - offset_ = static_cast(viewOffset); - hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast(&hwState_)); - if ((0 == hwSrd_) && (memoryType() != ImageView)) { - return false; - } - Pal::ImageViewInfo viewInfo = {}; - viewInfo.viewType = Pal::ImageViewType::Tex2d; - viewInfo.pImage = image_; - viewInfo.swizzledFormat.format = format; - viewInfo.swizzledFormat.swizzle = channels; - viewInfo.subresRange = ImgSubresRange; - dev().iDev()->CreateImageViewSrds(1, &viewInfo, hwState_); - - hwState_[8] = GetHSAILImageFormatType(desc().format_); - hwState_[9] = GetHSAILImageOrderType(desc().format_); - hwState_[10] = static_cast(desc().width_); - hwState_[11] = 0; // one extra reserved field in the argument - } - } - else if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_BUFFER) { - memRef_ = GpuMemoryReference::Create(dev(), gpuMemOpenInfo); - if (nullptr == memRef_) { - return false; - } - Pal::BufferViewInfo viewInfo = {}; - viewInfo.gpuAddr = memRef_->iMem()->Desc().gpuVirtAddr + offset(); - viewInfo.range = memRef_->iMem()->Desc().size; - viewInfo.stride = elementSize(); - viewInfo.swizzledFormat.format = format; - viewInfo.swizzledFormat.swizzle = 
channels; - hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast(&hwState_)); - if ((0 == hwSrd_) && (memoryType() != ImageView)) { - return false; - } - - dev().iDev()->CreateTypedBufferViewSrds(1, &viewInfo, hwState_); - hwState_[8] = GetHSAILImageFormatType(desc().format_); - hwState_[9] = GetHSAILImageOrderType(desc().format_); - hwState_[10] = static_cast(desc().width_); - hwState_[11] = 0; // one extra reserved field in the argument - } - else { - Pal::ExternalImageOpenInfo imgOpenInfo = {}; - Pal::ImageCreateInfo imgCreateInfo = {}; - imgOpenInfo.resourceInfo = openInfo; - imgOpenInfo.swizzledFormat.format = format; - imgOpenInfo.swizzledFormat.swizzle = channels; - imgOpenInfo.flags.formatChangeSrd = true; - imgOpenInfo.usage.shaderRead = true; - imgOpenInfo.usage.shaderWrite = true; - memRef_ = GpuMemoryReference::Create( - dev(), imgOpenInfo, &imgCreateInfo, &image_); - if (nullptr == memRef_) { - return false; - } - - hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast(&hwState_)); - if ((0 == hwSrd_) && (memoryType() != ImageView)) { - return false; - } - Pal::ImageViewInfo viewInfo = {}; - viewInfo.viewType = Pal::ImageViewType::Tex2d; - switch (imgCreateInfo.imageType) { - case Pal::ImageType::Tex3d: - viewInfo.viewType = Pal::ImageViewType::Tex3d; - break; - case Pal::ImageType::Tex1d: - viewInfo.viewType = Pal::ImageViewType::Tex1d; - break; - } - viewInfo.pImage = image_; - viewInfo.swizzledFormat.format = format; - viewInfo.swizzledFormat.swizzle = channels; - if ((type == InteropTextureViewLevel) || - (type == InteropTextureViewCube)) { - ImgSubresRange.startSubres.mipLevel = mipLevel; - if (type == InteropTextureViewCube) { - ImgSubresRange.startSubres.arraySlice = layer; - viewInfo.viewType = Pal::ImageViewType::Tex2d; - } - } - if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) { - ImgSubresRange.numSlices = desc_.height_; - } - if (desc().topology_ == CL_MEM_OBJECT_IMAGE2D_ARRAY) { - ImgSubresRange.numSlices = desc_.depth_; - } - 
ImgSubresRange.numMips = desc().mipLevels_; - viewInfo.subresRange = ImgSubresRange; - - dev().iDev()->CreateImageViewSrds(1, &viewInfo, hwState_); - //! It's a workaround for D24S8 format, since PAL doesn't support this format - //! and GSL decompresses 24bit DEPTH into D24S8 for OGL compatibility - if ((desc().format_.image_channel_order == CL_DEPTH_STENCIL) && - (desc().format_.image_channel_data_type == CL_UNORM_INT24)) { - hwState_[1] &= ~0x3c000000; - hwState_[1] = (hwState_[1] & ~0x3f00000) | 0x1400000; - } - hwState_[8] = GetHSAILImageFormatType(desc().format_); - hwState_[9] = GetHSAILImageOrderType(desc().format_); - hwState_[10] = static_cast(desc().width_); - hwState_[11] = 0; // one extra reserved field in the argument - } - return true; - } -#endif // _WIN32 - - if (!desc_.buffer_) { - if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_BUFFER) { - if (memoryType() == ImageBuffer) { - ImageBufferParams* imageBuffer = reinterpret_cast(params); - viewOwner_ = imageBuffer->resource_; - memRef_ = viewOwner_->memRef_; - memRef_->retain(); - desc_.cardMemory_ = viewOwner_->desc().cardMemory_; - } - else { - Pal::GpuMemoryCreateInfo createInfo = {}; - createInfo.size = desc().width_ * elementSize(); - // @todo 64K alignment is too big - createInfo.size = amd::alignUp(createInfo.size, MaxGpuAlignment); - createInfo.alignment = MaxGpuAlignment; - createInfo.vaRange = Pal::VaRange::Default; - createInfo.priority = Pal::GpuMemPriority::Normal; - memTypeToHeap(&createInfo); - // createInfo.priority; - memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment); - if (nullptr == memRef_) { - memRef_ = GpuMemoryReference::Create(dev(), createInfo); - if (nullptr == memRef_) { - LogError("Failed PAL memory allocation!"); - return false; - } - } - } - Pal::BufferViewInfo viewInfo = {}; - viewInfo.gpuAddr = memRef_->iMem()->Desc().gpuVirtAddr + offset(); - viewInfo.range = memRef_->iMem()->Desc().size; - viewInfo.stride = elementSize(); - 
viewInfo.swizzledFormat.format = format; - viewInfo.swizzledFormat.swizzle = channels; - //viewInfo.channels = channels; - hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast(&hwState_)); - if ((0 == hwSrd_) && (memoryType() != ImageView)) { - return false; - } - - dev().iDev()->CreateTypedBufferViewSrds(1, &viewInfo, hwState_); - hwState_[8] = GetHSAILImageFormatType(desc().format_); - hwState_[9] = GetHSAILImageOrderType(desc().format_); - hwState_[10] = static_cast(desc().width_); - hwState_[11] = 0; // one extra reserved field in the argument - return true; + if (Pal::Result::Success != + dev().iDev()->GetExternalSharedImageSizes(imgOpenInfo, &imageSize, &gpuMemSize, + &imgCreateInfo)) { + return false; } - Pal::ImageViewInfo viewInfo = {}; - Pal::ImageCreateInfo imgCreateInfo = {}; - Pal::GpuMemoryRequirements req = {}; - char* memImg; + Pal::gpusize viewOffset = 0; + imgCreateInfo.flags.shareable = false; imgCreateInfo.imageType = Pal::ImageType::Tex2d; - viewInfo.viewType = Pal::ImageViewType::Tex2d; - imgCreateInfo.extent.width = desc_.width_; - imgCreateInfo.extent.height = desc_.height_; - imgCreateInfo.extent.depth = desc_.depth_; - imgCreateInfo.arraySize = 1; + imgCreateInfo.extent.width = desc().width_; + imgCreateInfo.extent.height = desc().height_; + imgCreateInfo.extent.depth = desc().depth_; + imgCreateInfo.arraySize = 1; + imgCreateInfo.flags.formatChangeSrd = true; + imgCreateInfo.usageFlags.shaderRead = true; + imgCreateInfo.usageFlags.shaderWrite = true; + imgCreateInfo.swizzledFormat.format = format; + imgCreateInfo.swizzledFormat.swizzle = channels; + imgCreateInfo.mipLevels = 1; + imgCreateInfo.samples = 1; + imgCreateInfo.fragments = 1; + imgCreateInfo.tiling = Pal::ImageTiling::Linear; + imgCreateInfo.depthPitch = desc().height_ * imgCreateInfo.rowPitch; - switch (desc_.topology_) { - case CL_MEM_OBJECT_IMAGE3D: - imgCreateInfo.imageType = Pal::ImageType::Tex3d; - viewInfo.viewType = Pal::ImageViewType::Tex3d; - break; - case 
CL_MEM_OBJECT_IMAGE1D: - case CL_MEM_OBJECT_IMAGE1D_ARRAY: - case CL_MEM_OBJECT_IMAGE1D_BUFFER: - imgCreateInfo.imageType = Pal::ImageType::Tex1d; - viewInfo.viewType = Pal::ImageViewType::Tex1d; - break; - } - if (desc_.topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) { - ImgSubresRange.numSlices = imgCreateInfo.arraySize = desc_.height_; - imgCreateInfo.extent.depth = desc_.height_; - imgCreateInfo.extent.height = 1; - } - if (desc_.topology_ == CL_MEM_OBJECT_IMAGE2D_ARRAY) { - ImgSubresRange.numSlices = imgCreateInfo.arraySize = desc_.depth_; - } - - if (memoryType() == ImageView) { - ImageViewParams* imageView = reinterpret_cast(params); - ImgSubresRange.startSubres.mipLevel = imageView->level_; - desc_.baseLevel_ = imageView->level_; - ImgSubresRange.startSubres.arraySlice = imageView->layer_; - viewOwner_ = imageView->resource_; - image_ = viewOwner_->image_; - offset_ = viewOwner_->offset_; - } - else if (memoryType() == ImageBuffer) { - ImageBufferParams* imageBuffer = reinterpret_cast(params); - viewOwner_ = imageBuffer->resource_; - } - ImgSubresRange.numMips = desc().mipLevels_; - - if ((memoryType() != ImageView) || - //! @todo PAL doesn't allow an SRD view creation with different pixel size - (elementSize() != viewOwner_->elementSize())) { - imgCreateInfo.flags.formatChangeSrd = true; - imgCreateInfo.usageFlags.shaderRead = true; - imgCreateInfo.usageFlags.shaderWrite = - (format == Pal::ChNumFormat::X8Y8Z8W8_Srgb) ? false : true; - imgCreateInfo.swizzledFormat.format = format; - imgCreateInfo.swizzledFormat.swizzle = channels; - imgCreateInfo.mipLevels = (desc_.mipLevels_) ? 
desc_.mipLevels_ : 1; - imgCreateInfo.samples = 1; - imgCreateInfo.fragments = 1; - Pal::ImageTiling tiling = Pal::ImageTiling::Optimal; - uint32_t rowPitch = 0; - - if (((memoryType() == Persistent) && - dev().settings().linearPersistentImage_) || - (memoryType() == ImageBuffer)) { - tiling = Pal::ImageTiling::Linear; - } - else if (memoryType() == ImageView) { - tiling = viewOwner_->image_->GetImageCreateInfo().tiling; - // Find the new pitch in pixels for the new format - rowPitch = viewOwner_->desc().pitch_ * - viewOwner_->elementSize() / elementSize(); - } - - if (memoryType() == ImageBuffer) { - if ((params->owner_ != NULL) && params->owner_->asImage() && - (params->owner_->asImage()->getRowPitch() != 0)) { - rowPitch = params->owner_->asImage()->getRowPitch() / elementSize(); - } - else { - rowPitch = desc().width_; - } - } - desc_.pitch_ = rowPitch; - // Make sure the row pitch is aligned to pixels - imgCreateInfo.rowPitch = elementSize() * - amd::alignUp(rowPitch, dev().info().imagePitchAlignment_); - imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_; - imgCreateInfo.tiling = tiling; - - size_t imageSize = dev().iDev()->GetImageSize(imgCreateInfo, &result); - if (result != Pal::Result::Success) { + switch (misc) { + case 1: // NV12 format + switch (layer) { + case -1: + break; + case 0: + break; + case 1: + // Y - plane size to the offset + // NV12 format. 
UV is 2 times smaller plane Y + viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_; + imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_; + break; + default: + LogError("Unknown Interop View Type"); return false; } - - memImg = new char[imageSize]; - if (memImg != nullptr) { - result = dev().iDev()->CreateImage(imgCreateInfo, memImg, &image_); - if (result != Pal::Result::Success) { - delete memImg; - return false; - } + break; + case 2: // YV12 format + switch (layer) { + case -1: + break; + case 0: + break; + case 1: + // Y - plane size to the offset + // YV12 format. U is 4 times smaller plane than Y + viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_; + imgCreateInfo.rowPitch >>= 1; + break; + case 2: + // Y + U plane sizes to the offest. + // U plane is 4 times smaller than Y and U == V + viewOffset = 5 * imgCreateInfo.rowPitch * desc().height_ / 2; + imgCreateInfo.rowPitch >>= 1; + break; + default: + LogError("Unknown Interop View Type"); + return false; } - image_->GetGpuMemoryRequirements(&req); - // createInfo.priority; - } - - if ((memoryType() != ImageView) && (memoryType() != ImageBuffer)) { - Pal::GpuMemoryCreateInfo createInfo = {}; - createInfo.size = amd::alignUp(req.size, MaxGpuAlignment); - createInfo.alignment = std::max(req.alignment, MaxGpuAlignment); - createInfo.vaRange = Pal::VaRange::Default; - createInfo.priority = Pal::GpuMemPriority::Normal; - memTypeToHeap(&createInfo); - - memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment); - if (nullptr == memRef_) { - memRef_ = GpuMemoryReference::Create(dev(), createInfo); - if (nullptr == memRef_) { - LogError("Failed PAL memory allocation!"); - return false; - } - } - } - else { - memRef_ = viewOwner_->memRef_; - memRef_->retain(); - desc_.cardMemory_ = viewOwner_->desc().cardMemory_; - if (req.size > viewOwner_->iMem()->Desc().size) { - LogWarning("Image is bigger than the original mem object!"); - } - } - - result = 
image_->BindGpuMemory(memRef_->gpuMem_, offset_); - if (result != Pal::Result::Success) { + imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_; + break; + default: + LogError("Unknown Interop View Type"); return false; } + imageSize = dev().iDev()->GetImageSize(imgCreateInfo, &result); + if (result != Pal::Result::Success) { + return false; + } + + char* memImg = new char[imageSize]; + if (memImg != nullptr) { + result = dev().iDev()->CreateImage(imgCreateInfo, memImg, &image_); + if (result != Pal::Result::Success) { + delete memImg; + return false; + } + } + result = image_->BindGpuMemory(iMem(), viewOffset); + if (result != Pal::Result::Success) { + return false; + } + offset_ = static_cast(viewOffset); hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast(&hwState_)); if ((0 == hwSrd_) && (memoryType() != ImageView)) { - return false; + return false; } + Pal::ImageViewInfo viewInfo = {}; + viewInfo.viewType = Pal::ImageViewType::Tex2d; viewInfo.pImage = image_; viewInfo.swizzledFormat.format = format; viewInfo.swizzledFormat.swizzle = channels; @@ -953,1294 +614,1434 @@ Resource::create(MemoryType memType, CreateParams* params) hwState_[8] = GetHSAILImageFormatType(desc().format_); hwState_[9] = GetHSAILImageOrderType(desc().format_); hwState_[10] = static_cast(desc().width_); - hwState_[11] = 0; // one extra reserved field in the argument - return true; + hwState_[11] = 0; // one extra reserved field in the argument + } + } else if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_BUFFER) { + memRef_ = GpuMemoryReference::Create(dev(), gpuMemOpenInfo); + if (nullptr == memRef_) { + return false; + } + Pal::BufferViewInfo viewInfo = {}; + viewInfo.gpuAddr = memRef_->iMem()->Desc().gpuVirtAddr + offset(); + viewInfo.range = memRef_->iMem()->Desc().size; + viewInfo.stride = elementSize(); + viewInfo.swizzledFormat.format = format; + viewInfo.swizzledFormat.swizzle = channels; + hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast(&hwState_)); + if ((0 == 
hwSrd_) && (memoryType() != ImageView)) { + return false; + } + + dev().iDev()->CreateTypedBufferViewSrds(1, &viewInfo, hwState_); + hwState_[8] = GetHSAILImageFormatType(desc().format_); + hwState_[9] = GetHSAILImageOrderType(desc().format_); + hwState_[10] = static_cast(desc().width_); + hwState_[11] = 0; // one extra reserved field in the argument + } else { + Pal::ExternalImageOpenInfo imgOpenInfo = {}; + Pal::ImageCreateInfo imgCreateInfo = {}; + imgOpenInfo.resourceInfo = openInfo; + imgOpenInfo.swizzledFormat.format = format; + imgOpenInfo.swizzledFormat.swizzle = channels; + imgOpenInfo.flags.formatChangeSrd = true; + imgOpenInfo.usage.shaderRead = true; + imgOpenInfo.usage.shaderWrite = true; + memRef_ = GpuMemoryReference::Create(dev(), imgOpenInfo, &imgCreateInfo, &image_); + if (nullptr == memRef_) { + return false; + } + + hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast(&hwState_)); + if ((0 == hwSrd_) && (memoryType() != ImageView)) { + return false; + } + Pal::ImageViewInfo viewInfo = {}; + viewInfo.viewType = Pal::ImageViewType::Tex2d; + switch (imgCreateInfo.imageType) { + case Pal::ImageType::Tex3d: + viewInfo.viewType = Pal::ImageViewType::Tex3d; + break; + case Pal::ImageType::Tex1d: + viewInfo.viewType = Pal::ImageViewType::Tex1d; + break; + } + viewInfo.pImage = image_; + viewInfo.swizzledFormat.format = format; + viewInfo.swizzledFormat.swizzle = channels; + if ((type == InteropTextureViewLevel) || (type == InteropTextureViewCube)) { + ImgSubresRange.startSubres.mipLevel = mipLevel; + if (type == InteropTextureViewCube) { + ImgSubresRange.startSubres.arraySlice = layer; + viewInfo.viewType = Pal::ImageViewType::Tex2d; + } + } + if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) { + ImgSubresRange.numSlices = desc_.height_; + } + if (desc().topology_ == CL_MEM_OBJECT_IMAGE2D_ARRAY) { + ImgSubresRange.numSlices = desc_.depth_; + } + ImgSubresRange.numMips = desc().mipLevels_; + viewInfo.subresRange = ImgSubresRange; + + 
dev().iDev()->CreateImageViewSrds(1, &viewInfo, hwState_); + //! It's a workaround for D24S8 format, since PAL doesn't support this format + //! and GSL decompresses 24bit DEPTH into D24S8 for OGL compatibility + if ((desc().format_.image_channel_order == CL_DEPTH_STENCIL) && + (desc().format_.image_channel_data_type == CL_UNORM_INT24)) { + hwState_[1] &= ~0x3c000000; + hwState_[1] = (hwState_[1] & ~0x3f00000) | 0x1400000; + } + hwState_[8] = GetHSAILImageFormatType(desc().format_); + hwState_[9] = GetHSAILImageOrderType(desc().format_); + hwState_[10] = static_cast(desc().width_); + hwState_[11] = 0; // one extra reserved field in the argument } + return true; + } +#endif // _WIN32 - if (memoryType() == View) { - // Save the offset in the global heap - ViewParams* view = reinterpret_cast(params); - offset_ = view->offset_; - - // Make sure parent was provided - if (nullptr != view->resource_) { - viewOwner_ = view->resource_; - offset_ += viewOwner_->offset(); - if (viewOwner_->data() != nullptr) { - address_ = viewOwner_->data() + view->offset_; - } - pinOffset_ = viewOwner_->pinOffset(); - memRef_ = viewOwner_->memRef_; - memRef_->retain(); - desc_.cardMemory_ = viewOwner_->desc().cardMemory_; - } - else { - desc_.type_ = Empty; - } - return true; - } - - if (memoryType() == Pinned) { - PinnedParams* pinned = reinterpret_cast(params); - uint allocSize = static_cast(pinned->size_); - void* pinAddress; - hostMemRef = pinned->hostMemRef_; - pinAddress = address_ = hostMemRef->hostMem(); - // assert((allocSize == (desc().width_ * elementSize())) && "Sizes don't match"); - if (desc().topology_ == CL_MEM_OBJECT_BUFFER) { - // Allign offset to 4K boundary (Vista/Win7 limitation) - char* tmpHost = const_cast( - amd::alignDown(reinterpret_cast(address_), - PinnedMemoryAlignment)); - - // Find the partial size for unaligned copy - hostMemOffset = static_cast( - reinterpret_cast(address_) - tmpHost); - - pinOffset_ = hostMemOffset; - - pinAddress = tmpHost; - - if 
(hostMemOffset != 0) { - allocSize += hostMemOffset; - } - allocSize = amd::alignUp(allocSize, PinnedMemoryAlignment); -// hostMemOffset &= ~(0xff); - } - else if (desc().topology_ == CL_MEM_OBJECT_IMAGE2D) { - //! @todo: Width has to be aligned for 3D. - //! Need to be replaced with a compute copy - // Width aligned by 8 texels - if (((desc().width_ % 0x8) != 0) || - // Pitch aligned by 64 bytes - (((desc().width_ * elementSize()) % 0x40) != 0)) { - return false; - } - } - else { - //! @todo GSL doesn't support pinning with resAlloc_ - return false; - } - - if(dev().settings().svmFineGrainSystem_) { - desc_.SVMRes_ = true; - } - - // Ensure page alignment - if ((uint64_t)(pinAddress) & (amd::Os::pageSize() - 1)) { - return false; - } - Pal::PinnedGpuMemoryCreateInfo createInfo = {}; - createInfo.pSysMem = pinAddress; - createInfo.size = allocSize; - createInfo.vaRange = Pal::VaRange::Default; - memRef_ = GpuMemoryReference::Create(dev(), createInfo); - if (nullptr == memRef_) { - LogError("Failed PAL memory allocation!"); - pinOffset_ = 0; - return false; - } - desc_.cardMemory_ = false; - return true; - } - - Pal::gpusize svmPtr = 0; - if ((nullptr != params) && - (nullptr != params->owner_) && - (nullptr != params->owner_->getSvmPtr())) { - svmPtr = reinterpret_cast(params->owner_->getSvmPtr()); - desc_.SVMRes_ = true; - svmPtr = (svmPtr == 1) ? 
0 : svmPtr; - } - if (desc_.SVMRes_) { + if (!desc_.buffer_) { + if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_BUFFER) { + if (memoryType() == ImageBuffer) { + ImageBufferParams* imageBuffer = reinterpret_cast(params); + viewOwner_ = imageBuffer->resource_; + memRef_ = viewOwner_->memRef_; + memRef_->retain(); + desc_.cardMemory_ = viewOwner_->desc().cardMemory_; + } else { + Pal::GpuMemoryCreateInfo createInfo = {}; + createInfo.size = desc().width_ * elementSize(); // @todo 64K alignment is too big - uint allocSize = amd::alignUp(desc().width_ * elementSize_, MaxGpuAlignment); - if ((memoryType() == RemoteUSWC) || - (memoryType() == Remote)) { - Pal::SvmGpuMemoryCreateInfo createInfo = {}; - createInfo.isUsedForKernel = desc_.isAllocExecute_; - createInfo.size = allocSize; - createInfo.alignment = MaxGpuAlignment; - if (svmPtr != 0) { - createInfo.flags.useReservedGpuVa = true; - createInfo.pReservedGpuVaOwner = params->svmBase_->iMem(); - } - else { - createInfo.flags.useReservedGpuVa = false; - createInfo.pReservedGpuVaOwner = nullptr; - } - memRef_ = GpuMemoryReference::Create(dev(), createInfo); - } - else { - Pal::GpuMemoryCreateInfo createInfo = {}; - createInfo.size = allocSize; - createInfo.alignment = MaxGpuAlignment; - createInfo.vaRange = Pal::VaRange::Svm; - createInfo.priority = Pal::GpuMemPriority::Normal; - if (svmPtr != 0) { - createInfo.flags.useReservedGpuVa = true; - createInfo.pReservedGpuVaOwner = params->svmBase_->iMem(); - } - memTypeToHeap(&createInfo); - memRef_ = GpuMemoryReference::Create(dev(), createInfo); - } + createInfo.size = amd::alignUp(createInfo.size, MaxGpuAlignment); + createInfo.alignment = MaxGpuAlignment; + createInfo.vaRange = Pal::VaRange::Default; + createInfo.priority = Pal::GpuMemPriority::Normal; + memTypeToHeap(&createInfo); + // createInfo.priority; + memRef_ = + dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment); if (nullptr == memRef_) { + memRef_ = 
GpuMemoryReference::Create(dev(), createInfo); + if (nullptr == memRef_) { LogError("Failed PAL memory allocation!"); return false; + } } - desc_.cardMemory_ = false; - if ((nullptr != params) && - (nullptr != params->owner_) && - (nullptr != params->owner_->getSvmPtr())) { - params->owner_->setSvmPtr(reinterpret_cast(memRef_->iMem()->Desc().gpuVirtAddr)); + } + Pal::BufferViewInfo viewInfo = {}; + viewInfo.gpuAddr = memRef_->iMem()->Desc().gpuVirtAddr + offset(); + viewInfo.range = memRef_->iMem()->Desc().size; + viewInfo.stride = elementSize(); + viewInfo.swizzledFormat.format = format; + viewInfo.swizzledFormat.swizzle = channels; + // viewInfo.channels = channels; + hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast(&hwState_)); + if ((0 == hwSrd_) && (memoryType() != ImageView)) { + return false; + } + + dev().iDev()->CreateTypedBufferViewSrds(1, &viewInfo, hwState_); + hwState_[8] = GetHSAILImageFormatType(desc().format_); + hwState_[9] = GetHSAILImageOrderType(desc().format_); + hwState_[10] = static_cast(desc().width_); + hwState_[11] = 0; // one extra reserved field in the argument + return true; + } + + Pal::ImageViewInfo viewInfo = {}; + Pal::ImageCreateInfo imgCreateInfo = {}; + Pal::GpuMemoryRequirements req = {}; + char* memImg; + imgCreateInfo.imageType = Pal::ImageType::Tex2d; + viewInfo.viewType = Pal::ImageViewType::Tex2d; + imgCreateInfo.extent.width = desc_.width_; + imgCreateInfo.extent.height = desc_.height_; + imgCreateInfo.extent.depth = desc_.depth_; + imgCreateInfo.arraySize = 1; + + switch (desc_.topology_) { + case CL_MEM_OBJECT_IMAGE3D: + imgCreateInfo.imageType = Pal::ImageType::Tex3d; + viewInfo.viewType = Pal::ImageViewType::Tex3d; + break; + case CL_MEM_OBJECT_IMAGE1D: + case CL_MEM_OBJECT_IMAGE1D_ARRAY: + case CL_MEM_OBJECT_IMAGE1D_BUFFER: + imgCreateInfo.imageType = Pal::ImageType::Tex1d; + viewInfo.viewType = Pal::ImageViewType::Tex1d; + break; + } + if (desc_.topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) { + 
ImgSubresRange.numSlices = imgCreateInfo.arraySize = desc_.height_; + imgCreateInfo.extent.depth = desc_.height_; + imgCreateInfo.extent.height = 1; + } + if (desc_.topology_ == CL_MEM_OBJECT_IMAGE2D_ARRAY) { + ImgSubresRange.numSlices = imgCreateInfo.arraySize = desc_.depth_; + } + + if (memoryType() == ImageView) { + ImageViewParams* imageView = reinterpret_cast(params); + ImgSubresRange.startSubres.mipLevel = imageView->level_; + desc_.baseLevel_ = imageView->level_; + ImgSubresRange.startSubres.arraySlice = imageView->layer_; + viewOwner_ = imageView->resource_; + image_ = viewOwner_->image_; + offset_ = viewOwner_->offset_; + } else if (memoryType() == ImageBuffer) { + ImageBufferParams* imageBuffer = reinterpret_cast(params); + viewOwner_ = imageBuffer->resource_; + } + ImgSubresRange.numMips = desc().mipLevels_; + + if ((memoryType() != ImageView) || + //! @todo PAL doesn't allow an SRD view creation with different pixel size + (elementSize() != viewOwner_->elementSize())) { + imgCreateInfo.flags.formatChangeSrd = true; + imgCreateInfo.usageFlags.shaderRead = true; + imgCreateInfo.usageFlags.shaderWrite = + (format == Pal::ChNumFormat::X8Y8Z8W8_Srgb) ? false : true; + imgCreateInfo.swizzledFormat.format = format; + imgCreateInfo.swizzledFormat.swizzle = channels; + imgCreateInfo.mipLevels = (desc_.mipLevels_) ? 
desc_.mipLevels_ : 1; + imgCreateInfo.samples = 1; + imgCreateInfo.fragments = 1; + Pal::ImageTiling tiling = Pal::ImageTiling::Optimal; + uint32_t rowPitch = 0; + + if (((memoryType() == Persistent) && dev().settings().linearPersistentImage_) || + (memoryType() == ImageBuffer)) { + tiling = Pal::ImageTiling::Linear; + } else if (memoryType() == ImageView) { + tiling = viewOwner_->image_->GetImageCreateInfo().tiling; + // Find the new pitch in pixels for the new format + rowPitch = viewOwner_->desc().pitch_ * viewOwner_->elementSize() / elementSize(); + } + + if (memoryType() == ImageBuffer) { + if ((params->owner_ != NULL) && params->owner_->asImage() && + (params->owner_->asImage()->getRowPitch() != 0)) { + rowPitch = params->owner_->asImage()->getRowPitch() / elementSize(); + } else { + rowPitch = desc().width_; } - return true; + } + desc_.pitch_ = rowPitch; + // Make sure the row pitch is aligned to pixels + imgCreateInfo.rowPitch = + elementSize() * amd::alignUp(rowPitch, dev().info().imagePitchAlignment_); + imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_; + imgCreateInfo.tiling = tiling; + + size_t imageSize = dev().iDev()->GetImageSize(imgCreateInfo, &result); + if (result != Pal::Result::Success) { + return false; + } + + memImg = new char[imageSize]; + if (memImg != nullptr) { + result = dev().iDev()->CreateImage(imgCreateInfo, memImg, &image_); + if (result != Pal::Result::Success) { + delete memImg; + return false; + } + } + image_->GetGpuMemoryRequirements(&req); + // createInfo.priority; } - Pal::GpuMemoryCreateInfo createInfo = {}; - createInfo.size = desc().width_ * elementSize_; - // @todo 64K alignment is too big - createInfo.size = amd::alignUp(createInfo.size, MaxGpuAlignment); - createInfo.alignment = MaxGpuAlignment; - createInfo.vaRange = Pal::VaRange::Default; - createInfo.priority = Pal::GpuMemPriority::Normal; + if ((memoryType() != ImageView) && (memoryType() != ImageBuffer)) { + Pal::GpuMemoryCreateInfo createInfo = 
{}; + createInfo.size = amd::alignUp(req.size, MaxGpuAlignment); + createInfo.alignment = std::max(req.alignment, MaxGpuAlignment); + createInfo.vaRange = Pal::VaRange::Default; + createInfo.priority = Pal::GpuMemPriority::Normal; + memTypeToHeap(&createInfo); - if (memoryType() == ExternalPhysical){ - cl_bus_address_amd bus_address = - (reinterpret_cast(params->owner_))->busAddress(); - createInfo.surfaceBusAddr = bus_address.surface_bus_address; - createInfo.markerBusAddr = bus_address.marker_bus_address; - createInfo.flags.sdiExternal = true; - } - else if (memoryType() == BusAddressable){ - createInfo.flags.busAddressable = true; - } - - memTypeToHeap(&createInfo); - // createInfo.priority; - memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment); - if (nullptr == memRef_) { + memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment); + if (nullptr == memRef_) { memRef_ = GpuMemoryReference::Create(dev(), createInfo); if (nullptr == memRef_) { - LogError("Failed PAL memory allocation!"); - return false; + LogError("Failed PAL memory allocation!"); + return false; } + } + } else { + memRef_ = viewOwner_->memRef_; + memRef_->retain(); + desc_.cardMemory_ = viewOwner_->desc().cardMemory_; + if (req.size > viewOwner_->iMem()->Desc().size) { + LogWarning("Image is bigger than the original mem object!"); + } } + result = image_->BindGpuMemory(memRef_->gpuMem_, offset_); + if (result != Pal::Result::Success) { + return false; + } + + hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast(&hwState_)); + if ((0 == hwSrd_) && (memoryType() != ImageView)) { + return false; + } + viewInfo.pImage = image_; + viewInfo.swizzledFormat.format = format; + viewInfo.swizzledFormat.swizzle = channels; + viewInfo.subresRange = ImgSubresRange; + dev().iDev()->CreateImageViewSrds(1, &viewInfo, hwState_); + + hwState_[8] = GetHSAILImageFormatType(desc().format_); + hwState_[9] = 
GetHSAILImageOrderType(desc().format_); + hwState_[10] = static_cast(desc().width_); + hwState_[11] = 0; // one extra reserved field in the argument return true; -} + } -void -Resource::free() -{ - if (memRef_ == nullptr) { - return; + if (memoryType() == View) { + // Save the offset in the global heap + ViewParams* view = reinterpret_cast(params); + offset_ = view->offset_; + + // Make sure parent was provided + if (nullptr != view->resource_) { + viewOwner_ = view->resource_; + offset_ += viewOwner_->offset(); + if (viewOwner_->data() != nullptr) { + address_ = viewOwner_->data() + view->offset_; + } + pinOffset_ = viewOwner_->pinOffset(); + memRef_ = viewOwner_->memRef_; + memRef_->retain(); + desc_.cardMemory_ = viewOwner_->desc().cardMemory_; + } else { + desc_.type_ = Empty; } + return true; + } - // Sanity check for the map calls - if (mapCount_ != 0) { - LogWarning("Resource wasn't unlocked, but destroyed!"); - } - const bool wait = (memoryType() != ImageView) && - (memoryType() != ImageBuffer) && - (memoryType() != View); + if (memoryType() == Pinned) { + PinnedParams* pinned = reinterpret_cast(params); + uint allocSize = static_cast(pinned->size_); + void* pinAddress; + hostMemRef = pinned->hostMemRef_; + pinAddress = address_ = hostMemRef->hostMem(); + // assert((allocSize == (desc().width_ * elementSize())) && "Sizes don't match"); + if (desc().topology_ == CL_MEM_OBJECT_BUFFER) { + // Allign offset to 4K boundary (Vista/Win7 limitation) + char* tmpHost = const_cast( + amd::alignDown(reinterpret_cast(address_), PinnedMemoryAlignment)); - // Check if resource could be used in any queue(thread) - if (gpu_ == nullptr) { - Device::ScopedLockVgpus lock(dev()); + // Find the partial size for unaligned copy + hostMemOffset = static_cast(reinterpret_cast(address_) - tmpHost); - if (renames_.size() == 0) { - // Destroy GSL resource - if (iMem() != 0) { - // Release all virtual memory objects on all virtual GPUs - for (uint idx = 0; idx < dev().vgpus().size(); 
++idx) { - // Ignore the transfer queue, - // since it releases resources after every operation - if (dev().vgpus()[idx] != dev().xferQueue()) { - dev().vgpus()[idx]->releaseMemory(iMem(), wait); - } - } + pinOffset_ = hostMemOffset; - //! @note: This is a workaround for bad applications that - //! don't unmap memory - if (mapCount_ != 0) { - unmap(nullptr); - } + pinAddress = tmpHost; - // Add resource to the cache - if (!dev().resourceCache().addGpuMemory(&desc_, memRef_)) { - palFree(); - } - } - } - else { - renames_[curRename_]->cpuAddress_ = 0; - for (size_t i = 0; i < renames_.size(); ++i) { - memRef_ = renames_[i]; - // Destroy GSL resource - if (iMem() != 0) { - // Release all virtual memory objects on all virtual GPUs - for (uint idx = 0; idx < dev().vgpus().size(); ++idx) { - // Ignore the transfer queue, - // since it releases resources after every operation - if (dev().vgpus()[idx] != dev().xferQueue()) { - dev().vgpus()[idx]->releaseMemory(iMem()); - } - } - palFree(); - } - } - } - } - else { - if (renames_.size() == 0) { - // Destroy GSL resource - if (iMem() != 0) { - // Release virtual memory object on the specified virtual GPU - gpu_->releaseMemory(iMem(), wait); - palFree(); - } - } - else for (size_t i = 0; i < renames_.size(); ++i) { - memRef_ = renames_[i]; - // Destroy GSL resource - if (iMem() != 0) { - // Release virtual memory object on the specified virtual GPUs - gpu_->releaseMemory(iMem()); - palFree(); - } - } - } - - // Free SRD for images - if (!desc().buffer_) { - dev().srds().freeSrdSlot(hwSrd_); - } -} - -void -Resource::writeRawData( - VirtualGPU& gpu, - size_t offset, - size_t size, - const void* data, - bool waitForEvent) const -{ - GpuEvent event; - - // Write data size bytes to surface - // size needs to be DWORD aligned - assert((size & 3) == 0); - gpu.eventBegin(MainEngine); - gpu.queue(MainEngine).addCmdMemRef(iMem()); - gpu.iCmd()->CmdUpdateMemory(*iMem(), offset, size, reinterpret_cast(data)); - gpu.eventEnd(MainEngine, 
event); - - if (waitForEvent) { - //! @note: We don't really have to mark the allocations as busy - //! if we are waiting for a transfer - - // Wait for event to complete - gpu.waitForEvent(&event); - } - else { - setBusy(gpu, event); - // Update the global GPU event - gpu.setGpuEvent(event, false); - } -} -static const Pal::ChNumFormat ChannelFmt(uint bytesPerElement) -{ - if (bytesPerElement == 16) { - return Pal::ChNumFormat::X32Y32Z32W32_Uint; - } - else if (bytesPerElement == 8) { - return Pal::ChNumFormat::X32Y32_Uint; - } - else if (bytesPerElement == 4) { - return Pal::ChNumFormat::X32_Uint; - } - else if (bytesPerElement == 2) { - return Pal::ChNumFormat::X16_Uint; - } - else { - return Pal::ChNumFormat::X8_Uint; - } -} - -bool -Resource::partialMemCopyTo( - VirtualGPU& gpu, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - Resource& dstResource, - bool enableCopyRect, - bool flushDMA, - uint bytesPerElement) const -{ - GpuEvent event; - bool result = true; - EngineType activeEngineID = gpu.engineID_; - static const bool waitOnBusyEngine = true; - - assert(!(desc().cardMemory_ && dstResource.desc().cardMemory_) && - "Unsupported configuraiton!"); - - size_t calSrcOrigin[3], calDstOrigin[3], calSize[3]; - calSrcOrigin[0] = srcOrigin[0] + pinOffset(); - calSrcOrigin[1] = srcOrigin[1]; - calSrcOrigin[2] = srcOrigin[2]; - calDstOrigin[0] = dstOrigin[0] + dstResource.pinOffset(); - calDstOrigin[1] = dstOrigin[1]; - calDstOrigin[2] = dstOrigin[2]; - calSize[0] = size[0]; - calSize[1] = size[1]; - calSize[2] = size[2]; - - uint64_t gpuMemoryOffset, gpuMemoryRowPitch, imageOffsetx; - bool img1Darray = false; - bool img2Darray = false; - - if (desc().buffer_ && !dstResource.desc().buffer_) { - imageOffsetx = calDstOrigin[0] % dstResource.elementSize(); - gpuMemoryOffset = calSrcOrigin[0] + offset(); - gpuMemoryRowPitch = (calSrcOrigin[1]) ? 
calSrcOrigin[1] : - calSize[0] * dstResource.elementSize(); - img1Darray = (dstResource.desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY); - img2Darray = (dstResource.desc().topology_ == CL_MEM_OBJECT_IMAGE2D_ARRAY); - } - else if (!desc().buffer_ && dstResource.desc().buffer_) { - imageOffsetx = calSrcOrigin[0] % elementSize(); - gpuMemoryOffset = calDstOrigin[0] + dstResource.offset(); - gpuMemoryRowPitch = (calDstOrigin[1]) ? calDstOrigin[1] : - calSize[0] * elementSize(); - img1Darray = (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY); - img2Darray = (desc().topology_ == CL_MEM_OBJECT_IMAGE2D_ARRAY); - - } - - if ((desc().buffer_ && !dstResource.desc().buffer_) || - (!desc().buffer_ && dstResource.desc().buffer_)) { - - //sDMA cannot be used for the below conditions - // Make sure linear pitch in bytes is 4 bytes aligned - if (((gpuMemoryRowPitch % 4) != 0) || - // another DRM restriciton... SI has 4 pixels - (gpuMemoryOffset % 4 != 0) || - (dev().settings().sdamPageFaultWar_ && - (imageOffsetx != 0))) { - return false; - } - - } - - gpu.engineID_ = SdmaEngine; - - // Wait for the resources, since runtime may use async transfers - wait(gpu, waitOnBusyEngine); - dstResource.wait(gpu, waitOnBusyEngine); - - if (gpu.validateSdmaOverlap(*this, dstResource)) { - gpu.flushDMA(SdmaEngine); - } - - Pal::ImageLayout imgLayout = {}; - gpu.eventBegin(gpu.engineID_); - gpu.queue(gpu.engineID_).addCmdMemRef(iMem()); - gpu.queue(gpu.engineID_).addCmdMemRef(dstResource.iMem()); - if (desc().buffer_ && !dstResource.desc().buffer_) { - Pal::SubresId ImgSubresId = { Pal::ImageAspect::Color, dstResource.desc().baseLevel_, 0 }; - Pal::MemoryImageCopyRegion copyRegion = {}; - copyRegion.imageSubres = ImgSubresId; - copyRegion.imageOffset.x = calDstOrigin[0]; - copyRegion.imageOffset.y = calDstOrigin[1]; - copyRegion.imageOffset.z = calDstOrigin[2]; - copyRegion.imageExtent.width = calSize[0]; - copyRegion.imageExtent.height = calSize[1]; - copyRegion.imageExtent.depth = calSize[2]; 
- copyRegion.numSlices = 1; - if (img1Darray) { - copyRegion.numSlices = copyRegion.imageExtent.height; - copyRegion.imageExtent.height = 1; - } - else if (img2Darray) { - copyRegion.numSlices = copyRegion.imageExtent.depth; - copyRegion.imageExtent.depth = 1; - } - copyRegion.gpuMemoryOffset = gpuMemoryOffset; - copyRegion.gpuMemoryRowPitch = gpuMemoryRowPitch; - copyRegion.gpuMemoryDepthPitch = (calSrcOrigin[2]) ? calSrcOrigin[2] : - copyRegion.gpuMemoryRowPitch * copyRegion.imageExtent.height; - gpu.iCmd()->CmdCopyMemoryToImage(*iMem(), *dstResource.image_, - imgLayout, 1, ©Region); - } - else if (!desc().buffer_ && dstResource.desc().buffer_) { - Pal::MemoryImageCopyRegion copyRegion = {}; - Pal::SubresId ImgSubresId = { Pal::ImageAspect::Color, desc().baseLevel_, 0 }; - copyRegion.imageSubres = ImgSubresId; - copyRegion.imageOffset.x = calSrcOrigin[0]; - copyRegion.imageOffset.y = calSrcOrigin[1]; - copyRegion.imageOffset.z = calSrcOrigin[2]; - copyRegion.imageExtent.width = calSize[0]; - copyRegion.imageExtent.height = calSize[1]; - copyRegion.imageExtent.depth = calSize[2]; - copyRegion.numSlices = 1; - if (img1Darray) { - copyRegion.numSlices = copyRegion.imageExtent.height; - copyRegion.imageExtent.height = 1; - } - else if (img2Darray) { - copyRegion.numSlices = copyRegion.imageExtent.depth; - copyRegion.imageExtent.depth = 1; - } - copyRegion.gpuMemoryOffset = gpuMemoryOffset; - copyRegion.gpuMemoryRowPitch = gpuMemoryRowPitch; - copyRegion.gpuMemoryDepthPitch = (calDstOrigin[2]) ? 
calDstOrigin[2] : - copyRegion.gpuMemoryRowPitch * copyRegion.imageExtent.height; - gpu.iCmd()->CmdCopyImageToMemory(*image_, imgLayout, - *dstResource.iMem(), 1, ©Region); - } - else { - if (enableCopyRect) { - Pal::TypedBufferCopyRegion copyRegion = {}; - Pal::ChannelMapping channels = { Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, - Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::W }; - copyRegion.srcBuffer.swizzledFormat.format = ChannelFmt(bytesPerElement); - copyRegion.srcBuffer.swizzledFormat.swizzle = channels; - copyRegion.srcBuffer.offset = calSrcOrigin[0] + offset(); - copyRegion.srcBuffer.rowPitch = calSrcOrigin[1]; - copyRegion.srcBuffer.depthPitch = calSrcOrigin[2]; - copyRegion.extent.width = calSize[0] / bytesPerElement; - copyRegion.extent.height = calSize[1]; - copyRegion.extent.depth = calSize[2]; - copyRegion.dstBuffer.swizzledFormat.format = ChannelFmt(bytesPerElement); - copyRegion.dstBuffer.swizzledFormat.swizzle = channels; - copyRegion.dstBuffer.offset = calDstOrigin[0] + dstResource.offset(); - copyRegion.dstBuffer.rowPitch = calDstOrigin[1]; - copyRegion.dstBuffer.depthPitch = calDstOrigin[2]; - gpu.iCmd()->CmdCopyTypedBuffer(*iMem(), *dstResource.iMem(), - 1, ©Region); - } - else { - Pal::MemoryCopyRegion copyRegion = {}; - copyRegion.srcOffset = calSrcOrigin[0] + offset(); - copyRegion.dstOffset = calDstOrigin[0] + dstResource.offset(); - copyRegion.copySize = calSize[0]; - gpu.iCmd()->CmdCopyMemory(*iMem(), *dstResource.iMem(), - 1, ©Region); - } - } - - gpu.eventEnd(gpu.engineID_, event); - - if (result) { - // Mark source and destination as busy - setBusy(gpu, event); - dstResource.setBusy(gpu, event); - - // Update the global GPU event - gpu.setGpuEvent(event, flushDMA); - } - - // Restore the original engine - gpu.engineID_ = activeEngineID; - - return result; -} - -void -Resource::setBusy( - VirtualGPU& gpu, - GpuEvent gpuEvent - ) const -{ - gpu.assignGpuEvent(iMem(), gpuEvent); - - // If current resource is a view, then update 
the parent event as well - if (viewOwner_ != nullptr) { - viewOwner_->setBusy(gpu, gpuEvent); - } -} - -void -Resource::wait(VirtualGPU& gpu, bool waitOnBusyEngine) const -{ - GpuEvent* gpuEvent = gpu.getGpuEvent(iMem()); - - // Check if we have to wait unconditionally - if (!waitOnBusyEngine || - // or we have to wait only if another engine was used on this resource - (waitOnBusyEngine && (gpuEvent->engineId_ != gpu.engineID_))) { - gpu.waitForEvent(gpuEvent); - } - - // If current resource is a view and not in the global heap, - // then wait for the parent event as well - if (viewOwner_ != nullptr) { - viewOwner_->wait(gpu, waitOnBusyEngine); - } -} - -bool -Resource::hostWrite( - VirtualGPU* gpu, - const void* hostPtr, - const amd::Coord3D& origin, - const amd::Coord3D& size, - uint flags, - size_t rowPitch, - size_t slicePitch) -{ - void* dst; - - size_t startLayer = origin[2]; - size_t numLayers = size[2]; - if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) { - startLayer = origin[1]; - numLayers = size[1]; - } - - // Get physical GPU memmory - dst = map(gpu, flags, startLayer, numLayers); - if (nullptr == dst) { - LogError("Couldn't map GPU memory for host write"); + if (hostMemOffset != 0) { + allocSize += hostMemOffset; + } + allocSize = amd::alignUp(allocSize, PinnedMemoryAlignment); + // hostMemOffset &= ~(0xff); + } else if (desc().topology_ == CL_MEM_OBJECT_IMAGE2D) { + //! @todo: Width has to be aligned for 3D. + //! Need to be replaced with a compute copy + // Width aligned by 8 texels + if (((desc().width_ % 0x8) != 0) || + // Pitch aligned by 64 bytes + (((desc().width_ * elementSize()) % 0x40) != 0)) { return false; + } + } else { + //! @todo GSL doesn't support pinning with resAlloc_ + return false; } - if (1 == desc().dimSize_) { - size_t copySize = (desc().buffer_) ? 
size[0] : size[0] * elementSize_; - - // Update the pointer - dst = static_cast(static_cast(dst) + origin[0]); - - // Copy memory - amd::Os::fastMemcpy(dst, hostPtr, copySize); - } - else { - size_t srcOffs = 0; - size_t dstOffsBase = origin[0] * elementSize_; - size_t dstOffs; - - // Make sure we use the right pitch if it's not specified - if (rowPitch == 0) { - rowPitch = size[0] * elementSize_; - } - - // Make sure we use the right slice if it's not specified - if (slicePitch == 0) { - slicePitch = size[0] * size[1] * elementSize_; - } - - // Adjust the destination offset with Y dimension - dstOffsBase += desc().pitch_ * origin[1] * elementSize_; - - // Adjust the destination offset with Z dimension - dstOffsBase += desc().slice_ * origin[2] * elementSize_; - - // Copy memory slice by slice - for (size_t slice = 0; slice < size[2]; ++slice) { - dstOffs = dstOffsBase + slice * desc().slice_ * elementSize_; - srcOffs = slice * slicePitch; - - // Copy memory line by line - for (size_t row = 0; row < size[1]; ++row) { - // Copy memory - amd::Os::fastMemcpy( - (reinterpret_cast
(dst) + dstOffs), - (reinterpret_cast(hostPtr) + srcOffs), - size[0] * elementSize_); - - dstOffs += desc().pitch_ * elementSize_; - srcOffs += rowPitch; - } - } + if (dev().settings().svmFineGrainSystem_) { + desc_.SVMRes_ = true; } - // Unmap GPU memory - unmap(gpu); - + // Ensure page alignment + if ((uint64_t)(pinAddress) & (amd::Os::pageSize() - 1)) { + return false; + } + Pal::PinnedGpuMemoryCreateInfo createInfo = {}; + createInfo.pSysMem = pinAddress; + createInfo.size = allocSize; + createInfo.vaRange = Pal::VaRange::Default; + memRef_ = GpuMemoryReference::Create(dev(), createInfo); + if (nullptr == memRef_) { + LogError("Failed PAL memory allocation!"); + pinOffset_ = 0; + return false; + } + desc_.cardMemory_ = false; return true; -} + } -bool -Resource::hostRead( - VirtualGPU* gpu, - void* hostPtr, - const amd::Coord3D& origin, - const amd::Coord3D& size, - size_t rowPitch, - size_t slicePitch) -{ - void* src; - - size_t startLayer = origin[2]; - size_t numLayers = size[2]; - if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) { - startLayer = origin[1]; - numLayers = size[1]; + Pal::gpusize svmPtr = 0; + if ((nullptr != params) && (nullptr != params->owner_) && + (nullptr != params->owner_->getSvmPtr())) { + svmPtr = reinterpret_cast(params->owner_->getSvmPtr()); + desc_.SVMRes_ = true; + svmPtr = (svmPtr == 1) ? 
0 : svmPtr; + } + if (desc_.SVMRes_) { + // @todo 64K alignment is too big + uint allocSize = amd::alignUp(desc().width_ * elementSize_, MaxGpuAlignment); + if ((memoryType() == RemoteUSWC) || (memoryType() == Remote)) { + Pal::SvmGpuMemoryCreateInfo createInfo = {}; + createInfo.isUsedForKernel = desc_.isAllocExecute_; + createInfo.size = allocSize; + createInfo.alignment = MaxGpuAlignment; + if (svmPtr != 0) { + createInfo.flags.useReservedGpuVa = true; + createInfo.pReservedGpuVaOwner = params->svmBase_->iMem(); + } else { + createInfo.flags.useReservedGpuVa = false; + createInfo.pReservedGpuVaOwner = nullptr; + } + memRef_ = GpuMemoryReference::Create(dev(), createInfo); + } else { + Pal::GpuMemoryCreateInfo createInfo = {}; + createInfo.size = allocSize; + createInfo.alignment = MaxGpuAlignment; + createInfo.vaRange = Pal::VaRange::Svm; + createInfo.priority = Pal::GpuMemPriority::Normal; + if (svmPtr != 0) { + createInfo.flags.useReservedGpuVa = true; + createInfo.pReservedGpuVaOwner = params->svmBase_->iMem(); + } + memTypeToHeap(&createInfo); + memRef_ = GpuMemoryReference::Create(dev(), createInfo); } - - // Get physical GPU memmory - src = map(gpu, ReadOnly, startLayer, numLayers); - if (nullptr == src) { - LogError("Couldn't map GPU memory for host read"); - return false; + if (nullptr == memRef_) { + LogError("Failed PAL memory allocation!"); + return false; } - - if (1 == desc().dimSize_) { - size_t copySize = (desc().buffer_) ? 
size[0] : size[0] * elementSize_; - - // Update the pointer - src = static_cast(static_cast(src) + origin[0]); - - // Copy memory - amd::Os::fastMemcpy(hostPtr, src, copySize); + desc_.cardMemory_ = false; + if ((nullptr != params) && (nullptr != params->owner_) && + (nullptr != params->owner_->getSvmPtr())) { + params->owner_->setSvmPtr(reinterpret_cast(memRef_->iMem()->Desc().gpuVirtAddr)); } - else { - size_t srcOffsBase = origin[0] * elementSize_; - size_t srcOffs; - size_t dstOffs = 0; - - // Make sure we use the right pitch if it's not specified - if (rowPitch == 0) { - rowPitch = size[0] * elementSize_; - } - - // Make sure we use the right slice if it's not specified - if (slicePitch == 0) { - slicePitch = size[0] * size[1] * elementSize_; - } - - // Adjust destination offset with Y dimension - srcOffsBase += desc().pitch_ * origin[1] * elementSize_; - - // Adjust the destination offset with Z dimension - srcOffsBase += desc().slice_ * origin[2] * elementSize_; - - // Copy memory line by line - for (size_t slice = 0; slice < size[2]; ++slice) { - srcOffs = srcOffsBase + slice * desc().slice_ * elementSize_; - dstOffs = slice * slicePitch; - - // Copy memory line by line - for (size_t row = 0; row < size[1]; ++row) { - // Copy memory - amd::Os::fastMemcpy( - (reinterpret_cast
(hostPtr) + dstOffs), - (reinterpret_cast(src) + srcOffs), - size[0] * elementSize_); - - srcOffs += desc().pitch_ * elementSize_; - dstOffs += rowPitch; - } - } - } - - // Unmap GPU memory - unmap(gpu); - return true; + } + + Pal::GpuMemoryCreateInfo createInfo = {}; + createInfo.size = desc().width_ * elementSize_; + // @todo 64K alignment is too big + createInfo.size = amd::alignUp(createInfo.size, MaxGpuAlignment); + createInfo.alignment = MaxGpuAlignment; + createInfo.vaRange = Pal::VaRange::Default; + createInfo.priority = Pal::GpuMemPriority::Normal; + + if (memoryType() == ExternalPhysical) { + cl_bus_address_amd bus_address = (reinterpret_cast(params->owner_))->busAddress(); + createInfo.surfaceBusAddr = bus_address.surface_bus_address; + createInfo.markerBusAddr = bus_address.marker_bus_address; + createInfo.flags.sdiExternal = true; + } else if (memoryType() == BusAddressable) { + createInfo.flags.busAddressable = true; + } + + memTypeToHeap(&createInfo); + // createInfo.priority; + memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment); + if (nullptr == memRef_) { + memRef_ = GpuMemoryReference::Create(dev(), createInfo); + if (nullptr == memRef_) { + LogError("Failed PAL memory allocation!"); + return false; + } + } + + return true; } -void* -Resource::gpuMemoryMap(size_t* pitch, uint flags, Pal::IGpuMemory* resource) const -{ - if (desc_.cardMemory_ && !isPersistentDirectMap()) { - // @todo remove const cast - Unimplemented(); - return nullptr; -// return const_cast(dev()).resMapLocal(*pitch, resource, flags); - } - else { - amd::ScopedLock lk(dev().lockPAL()); - void* address; - if (image_ != nullptr) { - constexpr Pal::SubresId ImgSubresId = { Pal::ImageAspect::Color, 0, 0 }; - Pal::SubresLayout layout; - image_->GetSubresourceLayout(ImgSubresId, &layout); - *pitch = layout.rowPitch / elementSize(); +void Resource::free() { + if (memRef_ == nullptr) { + return; + } + + // Sanity check for the map calls + if 
(mapCount_ != 0) { + LogWarning("Resource wasn't unlocked, but destroyed!"); + } + const bool wait = + (memoryType() != ImageView) && (memoryType() != ImageBuffer) && (memoryType() != View); + + // Check if resource could be used in any queue(thread) + if (gpu_ == nullptr) { + Device::ScopedLockVgpus lock(dev()); + + if (renames_.size() == 0) { + // Destroy GSL resource + if (iMem() != 0) { + // Release all virtual memory objects on all virtual GPUs + for (uint idx = 0; idx < dev().vgpus().size(); ++idx) { + // Ignore the transfer queue, + // since it releases resources after every operation + if (dev().vgpus()[idx] != dev().xferQueue()) { + dev().vgpus()[idx]->releaseMemory(iMem(), wait); + } } - *pitch = desc().width_; - if (Pal::Result::Success == resource->Map(&address)) { - return address; + + //! @note: This is a workaround for bad applications that + //! don't unmap memory + if (mapCount_ != 0) { + unmap(nullptr); } - else { - LogError("PAL GpuMemory->Map() failed!"); - return nullptr; + + // Add resource to the cache + if (!dev().resourceCache().addGpuMemory(&desc_, memRef_)) { + palFree(); } - } -} - -void -Resource::gpuMemoryUnmap(Pal::IGpuMemory* resource) const -{ - if (desc_.cardMemory_ && !isPersistentDirectMap()) { - // @todo remove const cast - Unimplemented(); -// const_cast(dev()).resUnmapLocal(resource); - } - else { - Pal::Result result = resource->Unmap(); - if (Pal::Result::Success != result) { - LogError("PAL GpuMemory->Unmap() failed!"); + } + } else { + renames_[curRename_]->cpuAddress_ = 0; + for (size_t i = 0; i < renames_.size(); ++i) { + memRef_ = renames_[i]; + // Destroy GSL resource + if (iMem() != 0) { + // Release all virtual memory objects on all virtual GPUs + for (uint idx = 0; idx < dev().vgpus().size(); ++idx) { + // Ignore the transfer queue, + // since it releases resources after every operation + if (dev().vgpus()[idx] != dev().xferQueue()) { + dev().vgpus()[idx]->releaseMemory(iMem()); + } + } + palFree(); } + } } + } else 
{ + if (renames_.size() == 0) { + // Destroy GSL resource + if (iMem() != 0) { + // Release virtual memory object on the specified virtual GPU + gpu_->releaseMemory(iMem(), wait); + palFree(); + } + } else + for (size_t i = 0; i < renames_.size(); ++i) { + memRef_ = renames_[i]; + // Destroy GSL resource + if (iMem() != 0) { + // Release virtual memory object on the specified virtual GPUs + gpu_->releaseMemory(iMem()); + palFree(); + } + } + } + + // Free SRD for images + if (!desc().buffer_) { + dev().srds().freeSrdSlot(hwSrd_); + } } -bool -Resource::glAcquire() -{ - bool retVal = true; - if (desc().type_ == OGLInterop) { - retVal = dev().resGLAcquire(glPlatformContext_, glInteropMbRes_, glType_); - } - return retVal; +void Resource::writeRawData(VirtualGPU& gpu, size_t offset, size_t size, const void* data, + bool waitForEvent) const { + GpuEvent event; + + // Write data size bytes to surface + // size needs to be DWORD aligned + assert((size & 3) == 0); + gpu.eventBegin(MainEngine); + gpu.queue(MainEngine).addCmdMemRef(iMem()); + gpu.iCmd()->CmdUpdateMemory(*iMem(), offset, size, reinterpret_cast(data)); + gpu.eventEnd(MainEngine, event); + + if (waitForEvent) { + //! @note: We don't really have to mark the allocations as busy + //! 
if we are waiting for a transfer + + // Wait for event to complete + gpu.waitForEvent(&event); + } else { + setBusy(gpu, event); + // Update the global GPU event + gpu.setGpuEvent(event, false); + } +} +static const Pal::ChNumFormat ChannelFmt(uint bytesPerElement) { + if (bytesPerElement == 16) { + return Pal::ChNumFormat::X32Y32Z32W32_Uint; + } else if (bytesPerElement == 8) { + return Pal::ChNumFormat::X32Y32_Uint; + } else if (bytesPerElement == 4) { + return Pal::ChNumFormat::X32_Uint; + } else if (bytesPerElement == 2) { + return Pal::ChNumFormat::X16_Uint; + } else { + return Pal::ChNumFormat::X8_Uint; + } } -bool -Resource::glRelease() -{ - bool retVal = true; - if (desc().type_ == OGLInterop) { - retVal = dev().resGLRelease(glPlatformContext_,glInteropMbRes_, glType_); - } - return retVal; -} -void -Resource::palFree() const -{ - amd::ScopedLock lk(dev().lockPAL()); +bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, const amd::Coord3D& size, + Resource& dstResource, bool enableCopyRect, bool flushDMA, + uint bytesPerElement) const { + GpuEvent event; + bool result = true; + EngineType activeEngineID = gpu.engineID_; + static const bool waitOnBusyEngine = true; - if (desc().type_ == OGLInterop) { - dev().resGLFree(glPlatformContext_, glInteropMbRes_, glType_); + assert(!(desc().cardMemory_ && dstResource.desc().cardMemory_) && "Unsupported configuraiton!"); + + size_t calSrcOrigin[3], calDstOrigin[3], calSize[3]; + calSrcOrigin[0] = srcOrigin[0] + pinOffset(); + calSrcOrigin[1] = srcOrigin[1]; + calSrcOrigin[2] = srcOrigin[2]; + calDstOrigin[0] = dstOrigin[0] + dstResource.pinOffset(); + calDstOrigin[1] = dstOrigin[1]; + calDstOrigin[2] = dstOrigin[2]; + calSize[0] = size[0]; + calSize[1] = size[1]; + calSize[2] = size[2]; + + uint64_t gpuMemoryOffset, gpuMemoryRowPitch, imageOffsetx; + bool img1Darray = false; + bool img2Darray = false; + + if (desc().buffer_ && !dstResource.desc().buffer_) { + 
imageOffsetx = calDstOrigin[0] % dstResource.elementSize(); + gpuMemoryOffset = calSrcOrigin[0] + offset(); + gpuMemoryRowPitch = + (calSrcOrigin[1]) ? calSrcOrigin[1] : calSize[0] * dstResource.elementSize(); + img1Darray = (dstResource.desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY); + img2Darray = (dstResource.desc().topology_ == CL_MEM_OBJECT_IMAGE2D_ARRAY); + } else if (!desc().buffer_ && dstResource.desc().buffer_) { + imageOffsetx = calSrcOrigin[0] % elementSize(); + gpuMemoryOffset = calDstOrigin[0] + dstResource.offset(); + gpuMemoryRowPitch = (calDstOrigin[1]) ? calDstOrigin[1] : calSize[0] * elementSize(); + img1Darray = (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY); + img2Darray = (desc().topology_ == CL_MEM_OBJECT_IMAGE2D_ARRAY); + } + + if ((desc().buffer_ && !dstResource.desc().buffer_) || + (!desc().buffer_ && dstResource.desc().buffer_)) { + // sDMA cannot be used for the below conditions + // Make sure linear pitch in bytes is 4 bytes aligned + if (((gpuMemoryRowPitch % 4) != 0) || + // another DRM restriciton... 
SI has 4 pixels + (gpuMemoryOffset % 4 != 0) || (dev().settings().sdamPageFaultWar_ && (imageOffsetx != 0))) { + return false; } - memRef_->release(); + } + + gpu.engineID_ = SdmaEngine; + + // Wait for the resources, since runtime may use async transfers + wait(gpu, waitOnBusyEngine); + dstResource.wait(gpu, waitOnBusyEngine); + + if (gpu.validateSdmaOverlap(*this, dstResource)) { + gpu.flushDMA(SdmaEngine); + } + + Pal::ImageLayout imgLayout = {}; + gpu.eventBegin(gpu.engineID_); + gpu.queue(gpu.engineID_).addCmdMemRef(iMem()); + gpu.queue(gpu.engineID_).addCmdMemRef(dstResource.iMem()); + if (desc().buffer_ && !dstResource.desc().buffer_) { + Pal::SubresId ImgSubresId = {Pal::ImageAspect::Color, dstResource.desc().baseLevel_, 0}; + Pal::MemoryImageCopyRegion copyRegion = {}; + copyRegion.imageSubres = ImgSubresId; + copyRegion.imageOffset.x = calDstOrigin[0]; + copyRegion.imageOffset.y = calDstOrigin[1]; + copyRegion.imageOffset.z = calDstOrigin[2]; + copyRegion.imageExtent.width = calSize[0]; + copyRegion.imageExtent.height = calSize[1]; + copyRegion.imageExtent.depth = calSize[2]; + copyRegion.numSlices = 1; + if (img1Darray) { + copyRegion.numSlices = copyRegion.imageExtent.height; + copyRegion.imageExtent.height = 1; + } else if (img2Darray) { + copyRegion.numSlices = copyRegion.imageExtent.depth; + copyRegion.imageExtent.depth = 1; + } + copyRegion.gpuMemoryOffset = gpuMemoryOffset; + copyRegion.gpuMemoryRowPitch = gpuMemoryRowPitch; + copyRegion.gpuMemoryDepthPitch = (calSrcOrigin[2]) + ? 
calSrcOrigin[2] + : copyRegion.gpuMemoryRowPitch * copyRegion.imageExtent.height; + gpu.iCmd()->CmdCopyMemoryToImage(*iMem(), *dstResource.image_, imgLayout, 1, ©Region); + } else if (!desc().buffer_ && dstResource.desc().buffer_) { + Pal::MemoryImageCopyRegion copyRegion = {}; + Pal::SubresId ImgSubresId = {Pal::ImageAspect::Color, desc().baseLevel_, 0}; + copyRegion.imageSubres = ImgSubresId; + copyRegion.imageOffset.x = calSrcOrigin[0]; + copyRegion.imageOffset.y = calSrcOrigin[1]; + copyRegion.imageOffset.z = calSrcOrigin[2]; + copyRegion.imageExtent.width = calSize[0]; + copyRegion.imageExtent.height = calSize[1]; + copyRegion.imageExtent.depth = calSize[2]; + copyRegion.numSlices = 1; + if (img1Darray) { + copyRegion.numSlices = copyRegion.imageExtent.height; + copyRegion.imageExtent.height = 1; + } else if (img2Darray) { + copyRegion.numSlices = copyRegion.imageExtent.depth; + copyRegion.imageExtent.depth = 1; + } + copyRegion.gpuMemoryOffset = gpuMemoryOffset; + copyRegion.gpuMemoryRowPitch = gpuMemoryRowPitch; + copyRegion.gpuMemoryDepthPitch = (calDstOrigin[2]) + ? 
calDstOrigin[2] + : copyRegion.gpuMemoryRowPitch * copyRegion.imageExtent.height; + gpu.iCmd()->CmdCopyImageToMemory(*image_, imgLayout, *dstResource.iMem(), 1, ©Region); + } else { + if (enableCopyRect) { + Pal::TypedBufferCopyRegion copyRegion = {}; + Pal::ChannelMapping channels = {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, + Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::W}; + copyRegion.srcBuffer.swizzledFormat.format = ChannelFmt(bytesPerElement); + copyRegion.srcBuffer.swizzledFormat.swizzle = channels; + copyRegion.srcBuffer.offset = calSrcOrigin[0] + offset(); + copyRegion.srcBuffer.rowPitch = calSrcOrigin[1]; + copyRegion.srcBuffer.depthPitch = calSrcOrigin[2]; + copyRegion.extent.width = calSize[0] / bytesPerElement; + copyRegion.extent.height = calSize[1]; + copyRegion.extent.depth = calSize[2]; + copyRegion.dstBuffer.swizzledFormat.format = ChannelFmt(bytesPerElement); + copyRegion.dstBuffer.swizzledFormat.swizzle = channels; + copyRegion.dstBuffer.offset = calDstOrigin[0] + dstResource.offset(); + copyRegion.dstBuffer.rowPitch = calDstOrigin[1]; + copyRegion.dstBuffer.depthPitch = calDstOrigin[2]; + gpu.iCmd()->CmdCopyTypedBuffer(*iMem(), *dstResource.iMem(), 1, ©Region); + } else { + Pal::MemoryCopyRegion copyRegion = {}; + copyRegion.srcOffset = calSrcOrigin[0] + offset(); + copyRegion.dstOffset = calDstOrigin[0] + dstResource.offset(); + copyRegion.copySize = calSize[0]; + gpu.iCmd()->CmdCopyMemory(*iMem(), *dstResource.iMem(), 1, ©Region); + } + } + + gpu.eventEnd(gpu.engineID_, event); + + if (result) { + // Mark source and destination as busy + setBusy(gpu, event); + dstResource.setBusy(gpu, event); + + // Update the global GPU event + gpu.setGpuEvent(event, flushDMA); + } + + // Restore the original engine + gpu.engineID_ = activeEngineID; + + return result; } -bool -Resource::isMemoryType(MemoryType memType) const -{ - if (memoryType() == memType) { - return true; - } - else if (memoryType() == View) { - return 
viewOwner_->isMemoryType(memType); - } +void Resource::setBusy(VirtualGPU& gpu, GpuEvent gpuEvent) const { + gpu.assignGpuEvent(iMem(), gpuEvent); + // If current resource is a view, then update the parent event as well + if (viewOwner_ != nullptr) { + viewOwner_->setBusy(gpu, gpuEvent); + } +} + +void Resource::wait(VirtualGPU& gpu, bool waitOnBusyEngine) const { + GpuEvent* gpuEvent = gpu.getGpuEvent(iMem()); + + // Check if we have to wait unconditionally + if (!waitOnBusyEngine || + // or we have to wait only if another engine was used on this resource + (waitOnBusyEngine && (gpuEvent->engineId_ != gpu.engineID_))) { + gpu.waitForEvent(gpuEvent); + } + + // If current resource is a view and not in the global heap, + // then wait for the parent event as well + if (viewOwner_ != nullptr) { + viewOwner_->wait(gpu, waitOnBusyEngine); + } +} + +bool Resource::hostWrite(VirtualGPU* gpu, const void* hostPtr, const amd::Coord3D& origin, + const amd::Coord3D& size, uint flags, size_t rowPitch, size_t slicePitch) { + void* dst; + + size_t startLayer = origin[2]; + size_t numLayers = size[2]; + if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) { + startLayer = origin[1]; + numLayers = size[1]; + } + + // Get physical GPU memmory + dst = map(gpu, flags, startLayer, numLayers); + if (nullptr == dst) { + LogError("Couldn't map GPU memory for host write"); return false; + } + + if (1 == desc().dimSize_) { + size_t copySize = (desc().buffer_) ? 
size[0] : size[0] * elementSize_; + + // Update the pointer + dst = static_cast(static_cast(dst) + origin[0]); + + // Copy memory + amd::Os::fastMemcpy(dst, hostPtr, copySize); + } else { + size_t srcOffs = 0; + size_t dstOffsBase = origin[0] * elementSize_; + size_t dstOffs; + + // Make sure we use the right pitch if it's not specified + if (rowPitch == 0) { + rowPitch = size[0] * elementSize_; + } + + // Make sure we use the right slice if it's not specified + if (slicePitch == 0) { + slicePitch = size[0] * size[1] * elementSize_; + } + + // Adjust the destination offset with Y dimension + dstOffsBase += desc().pitch_ * origin[1] * elementSize_; + + // Adjust the destination offset with Z dimension + dstOffsBase += desc().slice_ * origin[2] * elementSize_; + + // Copy memory slice by slice + for (size_t slice = 0; slice < size[2]; ++slice) { + dstOffs = dstOffsBase + slice * desc().slice_ * elementSize_; + srcOffs = slice * slicePitch; + + // Copy memory line by line + for (size_t row = 0; row < size[1]; ++row) { + // Copy memory + amd::Os::fastMemcpy((reinterpret_cast
(dst) + dstOffs), + (reinterpret_cast(hostPtr) + srcOffs), + size[0] * elementSize_); + + dstOffs += desc().pitch_ * elementSize_; + srcOffs += rowPitch; + } + } + } + + // Unmap GPU memory + unmap(gpu); + + return true; } -bool -Resource::isPersistentDirectMap() const -{ - bool directMap = ((memoryType() == Resource::Persistent) && - (desc().dimSize_ < 3) && !desc().imageArray_); +bool Resource::hostRead(VirtualGPU* gpu, void* hostPtr, const amd::Coord3D& origin, + const amd::Coord3D& size, size_t rowPitch, size_t slicePitch) { + void* src; - // If direct map is possible, then validate it with the current tiling - if (directMap && desc().tiled_) { - //!@note IOL for Linux doesn't support tiling aperture - // and runtime doesn't force linear images in persistent - directMap = IS_WINDOWS && !dev().settings().linearPersistentImage_; + size_t startLayer = origin[2]; + size_t numLayers = size[2]; + if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) { + startLayer = origin[1]; + numLayers = size[1]; + } + + // Get physical GPU memmory + src = map(gpu, ReadOnly, startLayer, numLayers); + if (nullptr == src) { + LogError("Couldn't map GPU memory for host read"); + return false; + } + + if (1 == desc().dimSize_) { + size_t copySize = (desc().buffer_) ? 
size[0] : size[0] * elementSize_; + + // Update the pointer + src = static_cast(static_cast(src) + origin[0]); + + // Copy memory + amd::Os::fastMemcpy(hostPtr, src, copySize); + } else { + size_t srcOffsBase = origin[0] * elementSize_; + size_t srcOffs; + size_t dstOffs = 0; + + // Make sure we use the right pitch if it's not specified + if (rowPitch == 0) { + rowPitch = size[0] * elementSize_; } - return directMap; + // Make sure we use the right slice if it's not specified + if (slicePitch == 0) { + slicePitch = size[0] * size[1] * elementSize_; + } + + // Adjust destination offset with Y dimension + srcOffsBase += desc().pitch_ * origin[1] * elementSize_; + + // Adjust the destination offset with Z dimension + srcOffsBase += desc().slice_ * origin[2] * elementSize_; + + // Copy memory line by line + for (size_t slice = 0; slice < size[2]; ++slice) { + srcOffs = srcOffsBase + slice * desc().slice_ * elementSize_; + dstOffs = slice * slicePitch; + + // Copy memory line by line + for (size_t row = 0; row < size[1]; ++row) { + // Copy memory + amd::Os::fastMemcpy((reinterpret_cast
(hostPtr) + dstOffs), + (reinterpret_cast(src) + srcOffs), + size[0] * elementSize_); + + srcOffs += desc().pitch_ * elementSize_; + dstOffs += rowPitch; + } + } + } + + // Unmap GPU memory + unmap(gpu); + + return true; } -void* -Resource::map(VirtualGPU* gpu, uint flags, uint startLayer, uint numLayers) -{ - if (isMemoryType(Pinned)) { - // Check if we have to wait - if (!(flags & NoWait)) { - if (gpu != nullptr) { - wait(*gpu); - } - } - return address_; +void* Resource::gpuMemoryMap(size_t* pitch, uint flags, Pal::IGpuMemory* resource) const { + if (desc_.cardMemory_ && !isPersistentDirectMap()) { + // @todo remove const cast + Unimplemented(); + return nullptr; + // return const_cast(dev()).resMapLocal(*pitch, resource, flags); + } else { + amd::ScopedLock lk(dev().lockPAL()); + void* address; + if (image_ != nullptr) { + constexpr Pal::SubresId ImgSubresId = {Pal::ImageAspect::Color, 0, 0}; + Pal::SubresLayout layout; + image_->GetSubresourceLayout(ImgSubresId, &layout); + *pitch = layout.rowPitch / elementSize(); } - - if (flags & ReadOnly) { - assert(!(flags & Discard) && "We can't use lock discard with read only!"); + *pitch = desc().width_; + if (Pal::Result::Success == resource->Map(&address)) { + return address; + } else { + LogError("PAL GpuMemory->Map() failed!"); + return nullptr; } + } +} - if (flags & WriteOnly) { +void Resource::gpuMemoryUnmap(Pal::IGpuMemory* resource) const { + if (desc_.cardMemory_ && !isPersistentDirectMap()) { + // @todo remove const cast + Unimplemented(); + // const_cast(dev()).resUnmapLocal(resource); + } else { + Pal::Result result = resource->Unmap(); + if (Pal::Result::Success != result) { + LogError("PAL GpuMemory->Unmap() failed!"); } + } +} - // Check if use map discard - if (flags & Discard) { - if (gpu != nullptr) { - // If we use a new renamed allocation, then skip the wait - if (rename(*gpu)) { - flags |= NoWait; - } - } - } +bool Resource::glAcquire() { + bool retVal = true; + if (desc().type_ == OGLInterop) { + 
retVal = dev().resGLAcquire(glPlatformContext_, glInteropMbRes_, glType_); + } + return retVal; +} +bool Resource::glRelease() { + bool retVal = true; + if (desc().type_ == OGLInterop) { + retVal = dev().resGLRelease(glPlatformContext_, glInteropMbRes_, glType_); + } + return retVal; +} +void Resource::palFree() const { + amd::ScopedLock lk(dev().lockPAL()); + + if (desc().type_ == OGLInterop) { + dev().resGLFree(glPlatformContext_, glInteropMbRes_, glType_); + } + memRef_->release(); +} + +bool Resource::isMemoryType(MemoryType memType) const { + if (memoryType() == memType) { + return true; + } else if (memoryType() == View) { + return viewOwner_->isMemoryType(memType); + } + + return false; +} + +bool Resource::isPersistentDirectMap() const { + bool directMap = + ((memoryType() == Resource::Persistent) && (desc().dimSize_ < 3) && !desc().imageArray_); + + // If direct map is possible, then validate it with the current tiling + if (directMap && desc().tiled_) { + //!@note IOL for Linux doesn't support tiling aperture + // and runtime doesn't force linear images in persistent + directMap = IS_WINDOWS && !dev().settings().linearPersistentImage_; + } + + return directMap; +} + +void* Resource::map(VirtualGPU* gpu, uint flags, uint startLayer, uint numLayers) { + if (isMemoryType(Pinned)) { // Check if we have to wait if (!(flags & NoWait)) { - if (gpu != nullptr) { - wait(*gpu); - } + if (gpu != nullptr) { + wait(*gpu); + } } - - // Check if memory wasn't mapped yet - if (++mapCount_ == 1) { - if ((desc().dimSize_ == 3) || desc().imageArray_ || - ((desc().type_ == ImageView) && viewOwner_->mipMapped())) { - // Save map info for multilayer map/unmap - startLayer_ = startLayer; - numLayers_ = numLayers; - mapFlags_ = flags; - // Map with layers - address_ = mapLayers(gpu, flags); - } - else { - // Map current resource - address_ = gpuMemoryMap(&desc_.pitch_, flags, iMem()); - if (address_ == nullptr) { - LogError("cal::ResMap failed!"); - --mapCount_; - return 
nullptr; - } - } - } - - //! \note the atomic operation with counter doesn't - // guarantee that the address will be valid, - // since PAL could still process the first map - if (address_ == nullptr) { - for (uint i = 0; address_ == NULL && i < 10; ++i) { - amd::Os::sleep(1); - } - assert((address_ != nullptr) && "Multiple maps failed!"); - } - return address_; -} + } -void* -Resource::mapLayers(VirtualGPU* gpu, uint flags) -{ - size_t srcOffs = 0; - size_t dstOffs = 0; - Pal::IGpuMemory* sliceResource = 0; - PalGpuMemoryType palDim = PAL_TEXTURE_2D; - size_t layers = desc().depth_; - size_t height = desc().height_; + if (flags & ReadOnly) { + assert(!(flags & Discard) && "We can't use lock discard with read only!"); + } - // Use 1D layers - if (CL_MEM_OBJECT_IMAGE1D_ARRAY == desc().topology_) { - palDim = PAL_TEXTURE_1D; - height = 1; - layers = desc().height_; + if (flags & WriteOnly) { + } + + // Check if use map discard + if (flags & Discard) { + if (gpu != nullptr) { + // If we use a new renamed allocation, then skip the wait + if (rename(*gpu)) { + flags |= NoWait; + } } + } - desc_.pitch_ = desc().width_; - desc_.slice_ = desc().pitch_ * height; - address_ = new char [desc().slice_ * layers * elementSize()]; - if (nullptr == address_) { + // Check if we have to wait + if (!(flags & NoWait)) { + if (gpu != nullptr) { + wait(*gpu); + } + } + + // Check if memory wasn't mapped yet + if (++mapCount_ == 1) { + if ((desc().dimSize_ == 3) || desc().imageArray_ || + ((desc().type_ == ImageView) && viewOwner_->mipMapped())) { + // Save map info for multilayer map/unmap + startLayer_ = startLayer; + numLayers_ = numLayers; + mapFlags_ = flags; + // Map with layers + address_ = mapLayers(gpu, flags); + } else { + // Map current resource + address_ = gpuMemoryMap(&desc_.pitch_, flags, iMem()); + if (address_ == nullptr) { + LogError("cal::ResMap failed!"); + --mapCount_; return nullptr; + } } + } - // Check if map is write only - if (flags & WriteOnly) { - return 
address_; + //! \note the atomic operation with counter doesn't + // guarantee that the address will be valid, + // since PAL could still process the first map + if (address_ == nullptr) { + for (uint i = 0; address_ == NULL && i < 10; ++i) { + amd::Os::sleep(1); } + assert((address_ != nullptr) && "Multiple maps failed!"); + } - if (numLayers_ != 0) { - layers = startLayer_ + numLayers_; - } + return address_; +} - dstOffs = startLayer_ * desc().slice_ * elementSize(); +void* Resource::mapLayers(VirtualGPU* gpu, uint flags) { + size_t srcOffs = 0; + size_t dstOffs = 0; + Pal::IGpuMemory* sliceResource = 0; + PalGpuMemoryType palDim = PAL_TEXTURE_2D; + size_t layers = desc().depth_; + size_t height = desc().height_; - // Loop through all layers - for (uint i = startLayer_; i < layers; ++i) { - // gslResource3D gslSize; - size_t calOffset; - void* sliceAddr; - size_t pitch; - Unimplemented(); - // Allocate a layer from the image - // gslSize.width = desc().width_; - //gslSize.height = height; - //gslSize.depth = 1; - calOffset = 0; -/* - sliceResource = dev().resAllocView( - iMem(), gslSize, - calOffset, desc().format_, desc().channelOrder_, palDim, - 0, i, CAL_RESALLOCSLICEVIEW_LEVEL_AND_LAYER); - if (0 == sliceResource) { - LogError("Map layer. resAllocSliceView failed!"); - return nullptr; - } -*/ - // Map 2D layer - sliceAddr = gpuMemoryMap(&pitch, ReadOnly, sliceResource); - if (sliceAddr == nullptr) { - LogError("Map layer. CalResMap failed!"); - return nullptr; - } + // Use 1D layers + if (CL_MEM_OBJECT_IMAGE1D_ARRAY == desc().topology_) { + palDim = PAL_TEXTURE_1D; + height = 1; + layers = desc().height_; + } - srcOffs = 0; - // Copy memory line by line - for (size_t rows = 0; rows < height; ++rows) { - // Copy memory - amd::Os::fastMemcpy( - (reinterpret_cast
(address_) + dstOffs), - (reinterpret_cast(sliceAddr) + srcOffs), - desc().width_ * elementSize_); - - dstOffs += desc().pitch_ * elementSize(); - srcOffs += pitch * elementSize(); - } - - // Unmap a layer - gpuMemoryUnmap(sliceResource); - //dev().resFree(sliceResource); - } + desc_.pitch_ = desc().width_; + desc_.slice_ = desc().pitch_ * height; + address_ = new char[desc().slice_ * layers * elementSize()]; + if (nullptr == address_) { + return nullptr; + } + // Check if map is write only + if (flags & WriteOnly) { return address_; -} + } -void -Resource::unmap(VirtualGPU* gpu) -{ - if (isMemoryType(Pinned)) { - return; - } + if (numLayers_ != 0) { + layers = startLayer_ + numLayers_; + } - // Decrement map counter - int count = --mapCount_; + dstOffs = startLayer_ * desc().slice_ * elementSize(); - // Check if it's the last unmap - if (count == 0) { - if ((desc().dimSize_ == 3) || desc().imageArray_ || - ((desc().type_ == ImageView) && viewOwner_->mipMapped())) { - // Unmap layers - unmapLayers(gpu); - } - else { - // Unmap current resource - gpuMemoryUnmap(iMem()); - } - address_ = nullptr; - } - else if (count < 0) { - LogError("dev().serialCalResUnmap failed!"); - ++mapCount_; - return; - } -} - -void -Resource::unmapLayers(VirtualGPU* gpu) -{ - size_t srcOffs = 0; - size_t dstOffs = 0; - PalGpuMemoryType palDim = PAL_TEXTURE_2D; - Pal::IGpuMemory* sliceResource = nullptr; - uint layers = desc().depth_; - uint height = desc().height_; - - // Use 1D layers - if (CL_MEM_OBJECT_IMAGE1D_ARRAY == desc().topology_) { - palDim = PAL_TEXTURE_1D; - height = 1; - layers = desc().height_; - } - - if (numLayers_ != 0) { - layers = startLayer_ + numLayers_; - } - - srcOffs = startLayer_ * desc().slice_ * elementSize(); - - // Check if map is write only - if (!(mapFlags_ & ReadOnly)) { - // Loop through all layers - for (uint i = startLayer_; i < layers; ++i) { - Unimplemented(); -// gslResource3D gslSize; - size_t calOffset; - void* sliceAddr; - size_t pitch; - - // 
Allocate a layer from the image - //gslSize.width = desc().width_; - //gslSize.height = height; - //gslSize.depth = 1; - calOffset = 0; - /*sliceResource = dev().resAllocView( + // Loop through all layers + for (uint i = startLayer_; i < layers; ++i) { + // gslResource3D gslSize; + size_t calOffset; + void* sliceAddr; + size_t pitch; + Unimplemented(); + // Allocate a layer from the image + // gslSize.width = desc().width_; + // gslSize.height = height; + // gslSize.depth = 1; + calOffset = 0; + /* + sliceResource = dev().resAllocView( iMem(), gslSize, calOffset, desc().format_, desc().channelOrder_, palDim, 0, i, CAL_RESALLOCSLICEVIEW_LEVEL_AND_LAYER); if (0 == sliceResource) { - LogError("Unmap layer. resAllocSliceView failed!"); - return; + LogError("Map layer. resAllocSliceView failed!"); + return nullptr; } + */ + // Map 2D layer + sliceAddr = gpuMemoryMap(&pitch, ReadOnly, sliceResource); + if (sliceAddr == nullptr) { + LogError("Map layer. CalResMap failed!"); + return nullptr; + } + + srcOffs = 0; + // Copy memory line by line + for (size_t rows = 0; rows < height; ++rows) { + // Copy memory + amd::Os::fastMemcpy((reinterpret_cast
(address_) + dstOffs), + (reinterpret_cast(sliceAddr) + srcOffs), + desc().width_ * elementSize_); + + dstOffs += desc().pitch_ * elementSize(); + srcOffs += pitch * elementSize(); + } + + // Unmap a layer + gpuMemoryUnmap(sliceResource); + // dev().resFree(sliceResource); + } + + return address_; +} + +void Resource::unmap(VirtualGPU* gpu) { + if (isMemoryType(Pinned)) { + return; + } + + // Decrement map counter + int count = --mapCount_; + + // Check if it's the last unmap + if (count == 0) { + if ((desc().dimSize_ == 3) || desc().imageArray_ || + ((desc().type_ == ImageView) && viewOwner_->mipMapped())) { + // Unmap layers + unmapLayers(gpu); + } else { + // Unmap current resource + gpuMemoryUnmap(iMem()); + } + address_ = nullptr; + } else if (count < 0) { + LogError("dev().serialCalResUnmap failed!"); + ++mapCount_; + return; + } +} + +void Resource::unmapLayers(VirtualGPU* gpu) { + size_t srcOffs = 0; + size_t dstOffs = 0; + PalGpuMemoryType palDim = PAL_TEXTURE_2D; + Pal::IGpuMemory* sliceResource = nullptr; + uint layers = desc().depth_; + uint height = desc().height_; + + // Use 1D layers + if (CL_MEM_OBJECT_IMAGE1D_ARRAY == desc().topology_) { + palDim = PAL_TEXTURE_1D; + height = 1; + layers = desc().height_; + } + + if (numLayers_ != 0) { + layers = startLayer_ + numLayers_; + } + + srcOffs = startLayer_ * desc().slice_ * elementSize(); + + // Check if map is write only + if (!(mapFlags_ & ReadOnly)) { + // Loop through all layers + for (uint i = startLayer_; i < layers; ++i) { + Unimplemented(); + // gslResource3D gslSize; + size_t calOffset; + void* sliceAddr; + size_t pitch; + + // Allocate a layer from the image + // gslSize.width = desc().width_; + // gslSize.height = height; + // gslSize.depth = 1; + calOffset = 0; + /*sliceResource = dev().resAllocView( + iMem(), gslSize, + calOffset, desc().format_, desc().channelOrder_, palDim, + 0, i, CAL_RESALLOCSLICEVIEW_LEVEL_AND_LAYER); + if (0 == sliceResource) { + LogError("Unmap layer. 
resAllocSliceView failed!"); + return; + } */ - // Map a layer - sliceAddr = gpuMemoryMap(&pitch, WriteOnly, sliceResource); - if (sliceAddr == nullptr) { - LogError("Unmap layer. CalResMap failed!"); - return; - } + // Map a layer + sliceAddr = gpuMemoryMap(&pitch, WriteOnly, sliceResource); + if (sliceAddr == nullptr) { + LogError("Unmap layer. CalResMap failed!"); + return; + } - dstOffs = 0; - // Copy memory line by line - for (size_t rows = 0; rows < height; ++rows) { - // Copy memory - amd::Os::fastMemcpy( - (reinterpret_cast
(sliceAddr) + dstOffs), - (reinterpret_cast(address_) + srcOffs), - desc().width_ * elementSize_); + dstOffs = 0; + // Copy memory line by line + for (size_t rows = 0; rows < height; ++rows) { + // Copy memory + amd::Os::fastMemcpy((reinterpret_cast
(sliceAddr) + dstOffs), + (reinterpret_cast(address_) + srcOffs), + desc().width_ * elementSize_); - dstOffs += pitch * elementSize(); - srcOffs += desc().pitch_ * elementSize(); - } + dstOffs += pitch * elementSize(); + srcOffs += desc().pitch_ * elementSize(); + } - // Unmap a layer - gpuMemoryUnmap(sliceResource); - //dev().resFree(sliceResource); - } + // Unmap a layer + gpuMemoryUnmap(sliceResource); + // dev().resFree(sliceResource); } + } - // Destroy the mapped memory - delete [] reinterpret_cast(address_); + // Destroy the mapped memory + delete[] reinterpret_cast(address_); } -void -Resource::setActiveRename(VirtualGPU& gpu, GpuMemoryReference* rename) -{ - // Copy the unique GSL data - memRef_ = rename; - address_ = rename->cpuAddress_; +void Resource::setActiveRename(VirtualGPU& gpu, GpuMemoryReference* rename) { + // Copy the unique GSL data + memRef_ = rename; + address_ = rename->cpuAddress_; } -bool -Resource::getActiveRename(VirtualGPU& gpu, GpuMemoryReference** rename) -{ - // Copy the old data to the rename descriptor - *rename = memRef_; +bool Resource::getActiveRename(VirtualGPU& gpu, GpuMemoryReference** rename) { + // Copy the old data to the rename descriptor + *rename = memRef_; + return true; +} + +bool Resource::rename(VirtualGPU& gpu, bool force) { + GpuEvent* gpuEvent = gpu.getGpuEvent(iMem()); + if (!gpuEvent->isValid() && !force) { return true; -} + } -bool -Resource::rename(VirtualGPU& gpu, bool force) -{ - GpuEvent* gpuEvent = gpu.getGpuEvent(iMem()); - if (!gpuEvent->isValid() && !force) { - return true; + bool useNext = false; + uint resSize = desc().width_ * ((desc().height_) ? 
desc().height_ : 1) * elementSize_; + + // Rename will work with real GSL resources + if (((memoryType() != Local) && (memoryType() != Persistent) && (memoryType() != Remote) && + (memoryType() != RemoteUSWC)) || + (dev().settings().maxRenames_ == 0)) { + return false; + } + + // If the resource for renaming is too big, then lets check the current status first + // at the cost of an extra flush + if (resSize >= (dev().settings().maxRenameSize_ / dev().settings().maxRenames_)) { + if (gpu.isDone(gpuEvent)) { + return true; + } + } + + // Save the first + if (renames_.size() == 0) { + GpuMemoryReference* rename; + if (mapCount_ > 0) { + memRef_->cpuAddress_ = address_; + } + if (!getActiveRename(gpu, &rename)) { + return false; } - bool useNext = false; - uint resSize = desc().width_ * ((desc().height_) ? desc().height_ : 1) * - elementSize_; + curRename_ = renames_.size(); + renames_.push_back(rename); + } - // Rename will work with real GSL resources - if (((memoryType() != Local) && - (memoryType() != Persistent) && - (memoryType() != Remote) && - (memoryType() != RemoteUSWC)) || - (dev().settings().maxRenames_ == 0)) { - return false; - } + // Can we use a new rename? 
+ if ((renames_.size() <= dev().settings().maxRenames_) && + ((renames_.size() * resSize) <= dev().settings().maxRenameSize_)) { + GpuMemoryReference* rename; - // If the resource for renaming is too big, then lets check the current status first - // at the cost of an extra flush - if (resSize >= (dev().settings().maxRenameSize_ / dev().settings().maxRenames_)) { - if (gpu.isDone(gpuEvent)) { - return true; + // Create a new GSL allocation + if (create(memoryType())) { + if (mapCount_ > 0) { + assert(!desc().cardMemory_ && "Unsupported memory type!"); + memRef_->cpuAddress_ = gpuMemoryMap(&desc_.pitch_, 0, iMem()); + if (memRef_->cpuAddress_ == nullptr) { + LogError("gslMap fails on rename!"); } - } - - // Save the first - if (renames_.size() == 0) { - GpuMemoryReference* rename; - if (mapCount_ > 0) { - memRef_->cpuAddress_ = address_; - } - if (!getActiveRename(gpu, &rename)) { - return false; - } - + address_ = memRef_->cpuAddress_; + } + if (getActiveRename(gpu, &rename)) { curRename_ = renames_.size(); renames_.push_back(rename); + } else { + memRef_->release(); + useNext = true; + } + } else { + useNext = true; } + } else { + useNext = true; + } - // Can we use a new rename? 
- if ((renames_.size() <= dev().settings().maxRenames_) && - ((renames_.size() * resSize) <= dev().settings().maxRenameSize_)) { - GpuMemoryReference* rename; - - // Create a new GSL allocation - if (create(memoryType())) { - if (mapCount_ > 0) { - assert(!desc().cardMemory_ && "Unsupported memory type!"); - memRef_->cpuAddress_ = gpuMemoryMap(&desc_.pitch_, 0, iMem()); - if (memRef_->cpuAddress_ == nullptr) { - LogError("gslMap fails on rename!"); - } - address_ = memRef_->cpuAddress_; - } - if (getActiveRename(gpu, &rename)) { - curRename_ = renames_.size(); - renames_.push_back(rename); - } - else { - memRef_->release(); - useNext = true; - } - } - else { - useNext = true; - } - } - else { - useNext = true; + if (useNext) { + // Get the last submitted + curRename_++; + if (curRename_ >= renames_.size()) { + curRename_ = 0; } + setActiveRename(gpu, renames_[curRename_]); + return false; + } - if (useNext) { - // Get the last submitted - curRename_++; - if (curRename_ >= renames_.size()) { - curRename_ = 0; - } - setActiveRename(gpu, renames_[curRename_]); - return false; - } - - return true; + return true; } -void -Resource::warmUpRenames(VirtualGPU& gpu) -{ - // Make sure OCL touches every command buffer in the queue to avoid delays on the first submit - uint flush = dev().settings().maxRenames_ / VirtualGPU::Queue::MaxCmdBuffers; - flush = (flush == 0) ? 1 : flush; - for (uint i = 1; i <= dev().settings().maxRenames_; ++i) { - uint dummy = 0; - const bool Wait = (i % flush == 0) ? true : false; - // Write 0 for the buffer paging by VidMM - writeRawData(gpu, 0, sizeof(dummy), &dummy, Wait); - const bool Force = true; - rename(gpu, Force); - } +void Resource::warmUpRenames(VirtualGPU& gpu) { + // Make sure OCL touches every command buffer in the queue to avoid delays on the first submit + uint flush = dev().settings().maxRenames_ / VirtualGPU::Queue::MaxCmdBuffers; + flush = (flush == 0) ? 
1 : flush; + for (uint i = 1; i <= dev().settings().maxRenames_; ++i) { + uint dummy = 0; + const bool Wait = (i % flush == 0) ? true : false; + // Write 0 for the buffer paging by VidMM + writeRawData(gpu, 0, sizeof(dummy), &dummy, Wait); + const bool Force = true; + rename(gpu, Force); + } } -ResourceCache::~ResourceCache() -{ - free(); -} +ResourceCache::~ResourceCache() { free(); } //! \note the cache works in FILO mode -bool -ResourceCache::addGpuMemory( - Resource::Descriptor* desc, GpuMemoryReference* ref) -{ - amd::ScopedLock l(&lockCacheOps_); - bool result = false; - size_t size = ref->iMem()->Desc().size; +bool ResourceCache::addGpuMemory(Resource::Descriptor* desc, GpuMemoryReference* ref) { + amd::ScopedLock l(&lockCacheOps_); + bool result = false; + size_t size = ref->iMem()->Desc().size; - // Make sure current allocation isn't bigger than cache - if (((desc->type_ == Resource::Local) || - (desc->type_ == Resource::Persistent) || - (desc->type_ == Resource::Remote) || - (desc->type_ == Resource::RemoteUSWC)) && - (size < cacheSizeLimit_) && - !desc->SVMRes_) { - // Validate the cache size limit. Loop until we have enough space - while ((cacheSize_ + size) > cacheSizeLimit_) { - removeLast(); - } - Resource::Descriptor* descCached = new Resource::Descriptor; - if (descCached != nullptr) { - // Copy the original desc to the cached version - memcpy(descCached, desc, sizeof(Resource::Descriptor)); - - // Add the current resource to the cache - resCache_.push_front(std::make_pair(descCached, ref)); - cacheSize_ += size; - result = true; - } + // Make sure current allocation isn't bigger than cache + if (((desc->type_ == Resource::Local) || (desc->type_ == Resource::Persistent) || + (desc->type_ == Resource::Remote) || (desc->type_ == Resource::RemoteUSWC)) && + (size < cacheSizeLimit_) && !desc->SVMRes_) { + // Validate the cache size limit. 
Loop until we have enough space + while ((cacheSize_ + size) > cacheSizeLimit_) { + removeLast(); } + Resource::Descriptor* descCached = new Resource::Descriptor; + if (descCached != nullptr) { + // Copy the original desc to the cached version + memcpy(descCached, desc, sizeof(Resource::Descriptor)); - return result; + // Add the current resource to the cache + resCache_.push_front(std::make_pair(descCached, ref)); + cacheSize_ += size; + result = true; + } + } + + return result; } -GpuMemoryReference* -ResourceCache::findGpuMemory( - Resource::Descriptor* desc, Pal::gpusize size, Pal::gpusize alignment) -{ - amd::ScopedLock l(&lockCacheOps_); - GpuMemoryReference* ref = nullptr; - - // Early exit if resource is too big - if (size >= cacheSizeLimit_ || desc->SVMRes_) { - //! \note we may need to free the cache here to reduce memory pressure - return ref; - } - - // Serach the right resource through the cache list - for (const auto& it: resCache_) { - Resource::Descriptor* entry = it.first; - size_t sizeRes = it.second->iMem()->Desc().size; - // Find if we can reuse this entry - if ((entry->type_ == desc->type_) && - (entry->flags_ == desc->flags_) && - (size <= sizeRes) && - (size > (sizeRes >> 2)) && - ((it.second->iMem()->Desc().gpuVirtAddr % alignment) == 0) && - (entry->isAllocExecute_ == desc->isAllocExecute_)) { - ref = it.second; - delete it.first; - // Remove the found etry from the cache - resCache_.remove(it); - cacheSize_ -= sizeRes; - break; - } - } +GpuMemoryReference* ResourceCache::findGpuMemory(Resource::Descriptor* desc, Pal::gpusize size, + Pal::gpusize alignment) { + amd::ScopedLock l(&lockCacheOps_); + GpuMemoryReference* ref = nullptr; + // Early exit if resource is too big + if (size >= cacheSizeLimit_ || desc->SVMRes_) { + //! 
\note we may need to free the cache here to reduce memory pressure return ref; -} + } -bool -ResourceCache::free(size_t minCacheEntries) -{ - amd::ScopedLock l(&lockCacheOps_); - bool result = false; - - if (minCacheEntries < resCache_.size()) { - if (static_cast(cacheSize_) > 0) { - result = true; - } - // Clear the cache - while (static_cast(cacheSize_) > 0) { - removeLast(); - } - CondLog((cacheSize_ != 0), "Incorrect size for cache release!"); + // Serach the right resource through the cache list + for (const auto& it : resCache_) { + Resource::Descriptor* entry = it.first; + size_t sizeRes = it.second->iMem()->Desc().size; + // Find if we can reuse this entry + if ((entry->type_ == desc->type_) && (entry->flags_ == desc->flags_) && (size <= sizeRes) && + (size > (sizeRes >> 2)) && ((it.second->iMem()->Desc().gpuVirtAddr % alignment) == 0) && + (entry->isAllocExecute_ == desc->isAllocExecute_)) { + ref = it.second; + delete it.first; + // Remove the found etry from the cache + resCache_.remove(it); + cacheSize_ -= sizeRes; + break; } - return result; + } + + return ref; } -void -ResourceCache::removeLast() -{ - std::pair entry; - entry = resCache_.back(); - resCache_.pop_back(); +bool ResourceCache::free(size_t minCacheEntries) { + amd::ScopedLock l(&lockCacheOps_); + bool result = false; - size_t size = entry.second->iMem()->Desc().size; - - // Delete Descriptor - delete entry.first; - - // Destroy GSL resource - entry.second->release(); - cacheSize_ -= size; + if (minCacheEntries < resCache_.size()) { + if (static_cast(cacheSize_) > 0) { + result = true; + } + // Clear the cache + while (static_cast(cacheSize_) > 0) { + removeLast(); + } + CondLog((cacheSize_ != 0), "Incorrect size for cache release!"); + } + return result; } -} // namespace pal +void ResourceCache::removeLast() { + std::pair entry; + entry = resCache_.back(); + resCache_.pop_back(); + + size_t size = entry.second->iMem()->Desc().size; + + // Delete Descriptor + delete entry.first; + + // 
Destroy GSL resource + entry.second->release(); + cacheSize_ -= size; +} + +} // namespace pal diff --git a/rocclr/runtime/device/pal/palresource.hpp b/rocclr/runtime/device/pal/palresource.hpp index 2bdf858b51..dc8d55d915 100644 --- a/rocclr/runtime/device/pal/palresource.hpp +++ b/rocclr/runtime/device/pal/palresource.hpp @@ -17,496 +17,457 @@ class VirtualGPU; * @{ */ -class GpuMemoryReference : public amd::ReferenceCountedObject -{ -public: - static GpuMemoryReference* Create( - const Device& dev, - const Pal::GpuMemoryCreateInfo& createInfo); +class GpuMemoryReference : public amd::ReferenceCountedObject { + public: + static GpuMemoryReference* Create(const Device& dev, const Pal::GpuMemoryCreateInfo& createInfo); - static GpuMemoryReference* Create( - const Device& dev, - const Pal::PinnedGpuMemoryCreateInfo& createInfo); + static GpuMemoryReference* Create(const Device& dev, + const Pal::PinnedGpuMemoryCreateInfo& createInfo); - static GpuMemoryReference* Create( - const Device& dev, - const Pal::SvmGpuMemoryCreateInfo& createInfo); + static GpuMemoryReference* Create(const Device& dev, + const Pal::SvmGpuMemoryCreateInfo& createInfo); - static GpuMemoryReference* Create( - const Device& dev, - const Pal::ExternalGpuMemoryOpenInfo& openInfo); + static GpuMemoryReference* Create(const Device& dev, + const Pal::ExternalGpuMemoryOpenInfo& openInfo); - static GpuMemoryReference* Create( - const Device& dev, - const Pal::ExternalImageOpenInfo& openInfo, - Pal::ImageCreateInfo* imgCreateInfo, - Pal::IImage** image); + static GpuMemoryReference* Create(const Device& dev, const Pal::ExternalImageOpenInfo& openInfo, + Pal::ImageCreateInfo* imgCreateInfo, Pal::IImage** image); - //! Default constructor - GpuMemoryReference(); + //! Default constructor + GpuMemoryReference(); - //! Get PAL memory object - Pal::IGpuMemory* iMem() const { return gpuMem_; } + //! 
Get PAL memory object + Pal::IGpuMemory* iMem() const { return gpuMem_; } - Pal::IGpuMemory* gpuMem_; //!< PAL GPU memory object - void* cpuAddress_; //!< CPU address of this memory + Pal::IGpuMemory* gpuMem_; //!< PAL GPU memory object + void* cpuAddress_; //!< CPU address of this memory -protected: - //! Default destructor - ~GpuMemoryReference(); + protected: + //! Default destructor + ~GpuMemoryReference(); -private: - //! Disable copy constructor - GpuMemoryReference(const GpuMemoryReference&); + private: + //! Disable copy constructor + GpuMemoryReference(const GpuMemoryReference&); - //! Disable operator= - GpuMemoryReference& operator=(const GpuMemoryReference&); + //! Disable operator= + GpuMemoryReference& operator=(const GpuMemoryReference&); }; //! GPU resource -class Resource : public amd::HeapObject -{ -public: - enum InteropType { - InteropTypeless = 0, - InteropVertexBuffer, - InteropIndexBuffer, - InteropRenderBuffer, - InteropTexture, - InteropTextureViewLevel, - InteropTextureViewCube, - InteropSurface - }; +class Resource : public amd::HeapObject { + public: + enum InteropType { + InteropTypeless = 0, + InteropVertexBuffer, + InteropIndexBuffer, + InteropRenderBuffer, + InteropTexture, + InteropTextureViewLevel, + InteropTextureViewCube, + InteropSurface + }; - struct CreateParams : public amd::StackObject { - amd::Memory* owner_; //!< Resource's owner - VirtualGPU* gpu_; //!< Resource won't be shared between multiple queues - const Resource* svmBase_; //!< SVM base for MGPU allocations - CreateParams(): owner_(nullptr), gpu_(nullptr), svmBase_(nullptr) {} - }; + struct CreateParams : public amd::StackObject { + amd::Memory* owner_; //!< Resource's owner + VirtualGPU* gpu_; //!< Resource won't be shared between multiple queues + const Resource* svmBase_; //!< SVM base for MGPU allocations + CreateParams() : owner_(nullptr), gpu_(nullptr), svmBase_(nullptr) {} + }; - struct PinnedParams : public CreateParams { - const amd::HostMemoryReference* 
hostMemRef_;//!< System memory pointer for pinning - size_t size_; //!< System memory size - }; + struct PinnedParams : public CreateParams { + const amd::HostMemoryReference* hostMemRef_; //!< System memory pointer for pinning + size_t size_; //!< System memory size + }; - struct ViewParams : public CreateParams { - size_t offset_; //!< Alias resource offset - size_t size_; //!< Alias resource size - const Resource* resource_; //!< Parent resource for the view creation - const void* memory_; - }; + struct ViewParams : public CreateParams { + size_t offset_; //!< Alias resource offset + size_t size_; //!< Alias resource size + const Resource* resource_; //!< Parent resource for the view creation + const void* memory_; + }; - struct ImageViewParams : public CreateParams { - size_t level_; //!< Image mip level for a new view - size_t layer_; //!< Image layer for a new view - const Resource* resource_; //!< Parent resource for the view creation - const void* memory_; - }; + struct ImageViewParams : public CreateParams { + size_t level_; //!< Image mip level for a new view + size_t layer_; //!< Image layer for a new view + const Resource* resource_; //!< Parent resource for the view creation + const void* memory_; + }; - struct ImageBufferParams : public CreateParams { - const Resource* resource_; //!< Parent resource for the image creation - const void* memory_; - }; + struct ImageBufferParams : public CreateParams { + const Resource* resource_; //!< Parent resource for the image creation + const void* memory_; + }; - struct OGLInteropParams : public CreateParams { - InteropType type_; //!< OGL resource type - uint handle_; //!< OGL resource handle - uint mipLevel_; //!< Texture mip level - uint layer_; //!< Texture layer - void* glPlatformContext_; - void* glDeviceContext_; - uint flags_; - }; + struct OGLInteropParams : public CreateParams { + InteropType type_; //!< OGL resource type + uint handle_; //!< OGL resource handle + uint mipLevel_; //!< Texture mip level 
+ uint layer_; //!< Texture layer + void* glPlatformContext_; + void* glDeviceContext_; + uint flags_; + }; #ifdef _WIN32 - struct D3DInteropParams : public CreateParams { - InteropType type_; //!< D3D resource type - void* iDirect3D_; //!< D3D resource interface object - void* handle_; //!< D3D resource handle - uint mipLevel_; //!< Texture mip level - int layer_; //!< Texture layer - uint misc; //!< miscellaneous cases + struct D3DInteropParams : public CreateParams { + InteropType type_; //!< D3D resource type + void* iDirect3D_; //!< D3D resource interface object + void* handle_; //!< D3D resource handle + uint mipLevel_; //!< Texture mip level + int layer_; //!< Texture layer + uint misc; //!< miscellaneous cases + }; +#endif // _WIN32 + + //! Resource memory + enum MemoryType { + Empty = 0x0, //!< resource is empty + Local, //!< resource in local memory + Persistent, //!< resource in persistent memory + Remote, //!< resource in nonlocal memory + RemoteUSWC, //!< resource in nonlocal memory + Pinned, //!< resource in pinned system memory + View, //!< resource is an alias + OGLInterop, //!< resource is an OGL memory object + D3D10Interop, //!< resource is a D3D10 memory object + D3D11Interop, //!< resource is a D3D11 memory object + ImageView, //!< resource is a view to some image + ImageBuffer, //!< resource is an image view of a buffer + BusAddressable, //!< resource is a bus addressable memory + ExternalPhysical, //!< resource is an external physical memory + D3D9Interop, //!< resource is a D3D9 memory object + Scratch, //!< resource is scratch memory + Shader, //!< resource is a shader + }; + + //! Resource map flags + enum MapFlags { + Discard = 0x00000001, //!< discard lock + NoOverwrite = 0x00000002, //!< lock with no overwrite + ReadOnly = 0x00000004, //!< lock for read only operation + WriteOnly = 0x00000008, //!< lock for write only operation + NoWait = 0x00000010, //!< lock with no wait + }; + + //! 
Resource descriptor + struct Descriptor : public amd::HeapObject { + MemoryType type_; //!< Memory type + size_t width_; //!< Resource width + size_t height_; //!< Resource height + size_t depth_; //!< Resource depth + uint baseLevel_; //!< The base level for the view + uint mipLevels_; //!< Number of mip levels + uint flags_; //!< Resource flags, used in creation + size_t pitch_; //!< Resource pitch, valid if locked + size_t slice_; //!< Resource slice, valid if locked + cl_image_format format_; //!< CL image format + cl_mem_object_type topology_; //!< CL mem object type + union { + struct { + uint dimSize_ : 2; //!< Dimension size + uint cardMemory_ : 1; //!< PAL resource is in video memory + uint imageArray_ : 1; //!< PAL resource is an array of images + uint buffer_ : 1; //!< PAL resource is a buffer + uint tiled_ : 1; //!< PAL resource is tiled + uint SVMRes_ : 1; //!< SVM flag to the cal resource + uint scratch_ : 1; //!< Scratch buffer + uint isAllocExecute_ : 1; //!< SVM resource allocation attribute for shader\cmdbuf + uint isDoppTexture_ : 1; //!< PAL resource is for a DOPP desktop texture + }; + uint state_; }; -#endif // _WIN32 + }; - //! 
Resource memory - enum MemoryType - { - Empty = 0x0, //!< resource is empty - Local, //!< resource in local memory - Persistent, //!< resource in persistent memory - Remote, //!< resource in nonlocal memory - RemoteUSWC, //!< resource in nonlocal memory - Pinned, //!< resource in pinned system memory - View, //!< resource is an alias - OGLInterop, //!< resource is an OGL memory object - D3D10Interop, //!< resource is a D3D10 memory object - D3D11Interop, //!< resource is a D3D11 memory object - ImageView, //!< resource is a view to some image - ImageBuffer, //!< resource is an image view of a buffer - BusAddressable, //!< resource is a bus addressable memory - ExternalPhysical, //!< resource is an external physical memory - D3D9Interop, //!< resource is a D3D9 memory object - Scratch, //!< resource is scratch memory - Shader, //!< resource is a shader - }; + //! Constructor of 1D Resource object + Resource(const Device& gpuDev, //!< GPU device object + size_t size //!< Resource size + ); - //! Resource map flags - enum MapFlags - { - Discard = 0x00000001, //!< discard lock - NoOverwrite = 0x00000002, //!< lock with no overwrite - ReadOnly = 0x00000004, //!< lock for read only operation - WriteOnly = 0x00000008, //!< lock for write only operation - NoWait = 0x00000010, //!< lock with no wait - }; + //! Constructor of Image Resource object + Resource(const Device& gpuDev, //!< GPU device object + size_t width, //!< resource width + size_t height, //!< resource height + size_t depth, //!< resource depth + cl_image_format format, //!< resource format + cl_mem_object_type imageType, //!< CL image type + uint mipLevels = 1 //!< Number of mip levels + ); - //! 
Resource descriptor - struct Descriptor : public amd::HeapObject - { - MemoryType type_; //!< Memory type - size_t width_; //!< Resource width - size_t height_; //!< Resource height - size_t depth_; //!< Resource depth - uint baseLevel_; //!< The base level for the view - uint mipLevels_; //!< Number of mip levels - uint flags_; //!< Resource flags, used in creation - size_t pitch_; //!< Resource pitch, valid if locked - size_t slice_; //!< Resource slice, valid if locked - cl_image_format format_; //!< CL image format - cl_mem_object_type topology_;//!< CL mem object type - union { - struct { - uint dimSize_ : 2; //!< Dimension size - uint cardMemory_ : 1; //!< PAL resource is in video memory - uint imageArray_ : 1; //!< PAL resource is an array of images - uint buffer_ : 1; //!< PAL resource is a buffer - uint tiled_ : 1; //!< PAL resource is tiled - uint SVMRes_ : 1; //!< SVM flag to the cal resource - uint scratch_ : 1; //!< Scratch buffer - uint isAllocExecute_ : 1; //!< SVM resource allocation attribute for shader\cmdbuf - uint isDoppTexture_ : 1; //!< PAL resource is for a DOPP desktop texture - }; - uint state_; - }; - }; + //! Destructor of the resource + virtual ~Resource(); - //! Constructor of 1D Resource object - Resource( - const Device& gpuDev, //!< GPU device object - size_t size //!< Resource size - ); + /*! \brief Creates a CAL object, associated with the resource + * + * \return True if we succesfully created a CAL resource + */ + virtual bool create(MemoryType memType, //!< memory type + CreateParams* params = 0 //!< special parameters for resource allocation + ); - //! Constructor of Image Resource object - Resource( - const Device& gpuDev, //!< GPU device object - size_t width, //!< resource width - size_t height, //!< resource height - size_t depth, //!< resource depth - cl_image_format format, //!< resource format - cl_mem_object_type imageType, //!< CL image type - uint mipLevels = 1 //!< Number of mip levels - ); + /*! 
\brief Copies a subregion of memory from one resource to another + * + * This is a general copy from anything to anything (as long as it fits). + * All positions and sizes are given in bytes. Note, however, that only + * a subset of this general interface is currently implemented. + * + * \return true if successful + */ + bool partialMemCopyTo(VirtualGPU& gpu, //!< Virtual GPU device object + const amd::Coord3D& srcOrigin, //!< Origin of the source region + const amd::Coord3D& dstOrigin, //!< Origin of the destination region + const amd::Coord3D& size, //!< Size of the region to copy + Resource& dstResource, //!< Destination resource + bool enableRectCopy = false, //!< Rectangular DMA support + bool flushDMA = false, //!< Flush DMA if requested + uint bytesPerElement = 1 //!< Bytes Per Element + ) const; - //! Destructor of the resource - virtual ~Resource(); + /*! \brief Copies size/4 DWORD of memory to a surface + * + * This is a raw copy to any surface using a CP packet. + * Size needs to be atleast a DWORD or multiple + * + */ + void writeRawData(VirtualGPU& gpu, //!< Virtual GPU device object + size_t offset, //!< Offset for in the buffer for data + size_t size, //!< Size in bytes of data to be copied(multiple of DWORDS) + const void* data, //!< Data to be copied + bool waitForEvent //!< Wait for event complete + ) const; - /*! \brief Creates a CAL object, associated with the resource - * - * \return True if we succesfully created a CAL resource - */ - virtual bool create( - MemoryType memType, //!< memory type - CreateParams* params = 0 //!< special parameters for resource allocation - ); + //! Returns the offset in GPU memory for aliases + size_t offset() const { return offset_; } - /*! \brief Copies a subregion of memory from one resource to another - * - * This is a general copy from anything to anything (as long as it fits). - * All positions and sizes are given in bytes. 
Note, however, that only - * a subset of this general interface is currently implemented. - * - * \return true if successful - */ - bool partialMemCopyTo( - VirtualGPU& gpu, //!< Virtual GPU device object - const amd::Coord3D& srcOrigin, //!< Origin of the source region - const amd::Coord3D& dstOrigin, //!< Origin of the destination region - const amd::Coord3D& size, //!< Size of the region to copy - Resource& dstResource, //!< Destination resource - bool enableRectCopy = false, //!< Rectangular DMA support - bool flushDMA = false, //!< Flush DMA if requested - uint bytesPerElement = 1 //!< Bytes Per Element - ) const; + //! Returns the pinned memory offset + uint64_t pinOffset() const { return pinOffset_; } - /*! \brief Copies size/4 DWORD of memory to a surface - * - * This is a raw copy to any surface using a CP packet. - * Size needs to be atleast a DWORD or multiple - * - */ - void writeRawData( - VirtualGPU& gpu, //!< Virtual GPU device object - size_t offset, //!< Offset for in the buffer for data - size_t size, //!< Size in bytes of data to be copied(multiple of DWORDS) - const void* data, //!< Data to be copied - bool waitForEvent //!< Wait for event complete - ) const; + //! Returns the GPU device that owns this resource + const Device& dev() const { return gpuDevice_; } - //! Returns the offset in GPU memory for aliases - size_t offset() const { return offset_; } + //! Returns the descriptor for resource + const Descriptor& desc() const { return desc_; } - //! Returns the pinned memory offset - uint64_t pinOffset() const { return pinOffset_; } + //! Returns the PAL memory object + Pal::IGpuMemory* iMem() const { return memRef_->iMem(); } - //! Returns the GPU device that owns this resource - const Device& dev() const { return gpuDevice_; } + //! Returns global memory offset + uint64_t vmAddress() const { return iMem()->Desc().gpuVirtAddr + offset_; } - //! Returns the descriptor for resource - const Descriptor& desc() const { return desc_; } + //! 
Returns global memory offset + uint64_t vmSize() const { return iMem()->Desc().size - offset_; } - //! Returns the PAL memory object - Pal::IGpuMemory* iMem() const { return memRef_->iMem(); } + //! Returns global memory offset + bool mipMapped() const { return (desc().mipLevels_ > 1) ? true : false; } - //! Returns global memory offset - uint64_t vmAddress() const { return iMem()->Desc().gpuVirtAddr + offset_; } + //! Checks if persistent memory can have a direct map + bool isPersistentDirectMap() const; - //! Returns global memory offset - uint64_t vmSize() const { return iMem()->Desc().size - offset_; } + /*! \brief Locks the resource and returns a physical pointer + * + * \note This operation stalls HW pipeline! + * + * \return Pointer to the physical memory + */ + void* map(VirtualGPU* gpu, //!< Virtual GPU device object + uint flags = 0, //!< flags for the map operation + // Optimization for multilayer map/unmap + uint startLayer = 0, //!< Start layer for multilayer map + uint numLayers = 0 //!< End layer for multilayer map + ); - //! Returns global memory offset - bool mipMapped() const { return (desc().mipLevels_ > 1) ? true : false; } + //! Unlocks the resource if it was locked + void unmap(VirtualGPU* gpu //!< Virtual GPU device object + ); - //! Checks if persistent memory can have a direct map - bool isPersistentDirectMap() const; + //! Marks the resource as busy + void setBusy(VirtualGPU& gpu, //!< Virtual GPU device object + GpuEvent calEvent //!< CAL event + ) const; - /*! \brief Locks the resource and returns a physical pointer - * - * \note This operation stalls HW pipeline! - * - * \return Pointer to the physical memory - */ - void* map( - VirtualGPU* gpu, //!< Virtual GPU device object - uint flags = 0, //!< flags for the map operation - // Optimization for multilayer map/unmap - uint startLayer = 0, //!< Start layer for multilayer map - uint numLayers = 0 //!< End layer for multilayer map - ); + //! 
Wait for the resource + void wait(VirtualGPU& gpu, //!< Virtual GPU device object + bool waitOnBusyEngine = false //!< Wait only if engine has changed + ) const; - //! Unlocks the resource if it was locked - void unmap( - VirtualGPU* gpu //!< Virtual GPU device object - ); + //! Performs host write to the resource GPU memory + bool hostWrite(VirtualGPU* gpu, //!< Virtual GPU device object + const void* hostPtr, //!< Host pointer to the SRC data + const amd::Coord3D& origin, //!< Offsets for the update + const amd::Coord3D& size, //!< The number of bytes to write + uint flags = 0, //!< Map flags + size_t rowPitch = 0, //!< Raw data row pitch + size_t slicePitch = 0 //!< Raw data slice pitch + ); - //! Marks the resource as busy - void setBusy( - VirtualGPU& gpu, //!< Virtual GPU device object - GpuEvent calEvent //!< CAL event - ) const; + //! Performs host read from the resource GPU memory + bool hostRead(VirtualGPU* gpu, //!< Virtual GPU device object + void* hostPtr, //!< Host pointer to the DST data + const amd::Coord3D& origin, //!< Offsets for the update + const amd::Coord3D& size, //!< The number of bytes to write + size_t rowPitch = 0, //!< Raw data row pitch + size_t slicePitch = 0 //!< Raw data slice pitch + ); - //! Wait for the resource - void wait( - VirtualGPU& gpu, //!< Virtual GPU device object - bool waitOnBusyEngine = false//!< Wait only if engine has changed - ) const; + //! Warms up the rename list for this resource + void warmUpRenames(VirtualGPU& gpu); - //! Performs host write to the resource GPU memory - bool hostWrite( - VirtualGPU* gpu, //!< Virtual GPU device object - const void* hostPtr, //!< Host pointer to the SRC data - const amd::Coord3D& origin, //!< Offsets for the update - const amd::Coord3D& size, //!< The number of bytes to write - uint flags = 0, //!< Map flags - size_t rowPitch = 0, //!< Raw data row pitch - size_t slicePitch = 0 //!< Raw data slice pitch - ); + //! 
Gets the resource element size + uint elementSize() const { return elementSize_; } - //! Performs host read from the resource GPU memory - bool hostRead( - VirtualGPU* gpu, //!< Virtual GPU device object - void* hostPtr, //!< Host pointer to the DST data - const amd::Coord3D& origin, //!< Offsets for the update - const amd::Coord3D& size, //!< The number of bytes to write - size_t rowPitch = 0, //!< Raw data row pitch - size_t slicePitch = 0 //!< Raw data slice pitch - ); + //! Get the mapped address of this resource + address data() const { return reinterpret_cast
(address_); } - //! Warms up the rename list for this resource - void warmUpRenames(VirtualGPU& gpu); + //! Frees all allocated CAL memories and resources, + //! associated with this objects. And also destroys all rename structures + //! Note: doesn't destroy the object itself + void free(); - //! Gets the resource element size - uint elementSize() const { return elementSize_; } + //! Return memory type + MemoryType memoryType() const { return desc().type_; } - //! Get the mapped address of this resource - address data() const { return reinterpret_cast
(address_); } + //! Retunrs true if memory type matches specified + bool isMemoryType(MemoryType memType) const; - //! Frees all allocated CAL memories and resources, - //! associated with this objects. And also destroys all rename structures - //! Note: doesn't destroy the object itself - void free(); + //! Returns TRUE if resource was allocated as cacheable + bool isCacheable() const { return (isMemoryType(Remote) || isMemoryType(Pinned)) ? true : false; } - //! Return memory type - MemoryType memoryType() const { return desc().type_; } + bool glAcquire(); + bool glRelease(); - //! Retunrs true if memory type matches specified - bool isMemoryType(MemoryType memType) const; + //! Returns HW state for the resource (used for images only) + const void* hwState() const { return hwState_; } - //! Returns TRUE if resource was allocated as cacheable - bool isCacheable() const - { return (isMemoryType(Remote) || isMemoryType(Pinned)) ? true : false; } + //! Returns CPU HW SRD for the resource (used for images only) + uint64_t hwSrd() const { return hwSrd_; } - bool glAcquire() ; - bool glRelease() ; + uint numComponents() const { + return Pal::Formats::NumComponents(image_->GetImageCreateInfo().swizzledFormat.format); + } - //! Returns HW state for the resource (used for images only) - const void* hwState() const { return hwState_; } + protected: + uint elementSize_; //!< Size of a single element in bytes - //! Returns CPU HW SRD for the resource (used for images only) - uint64_t hwSrd() const { return hwSrd_; } + private: + //! Disable copy constructor + Resource(const Resource&); - uint numComponents() const { - return Pal::Formats::NumComponents(image_->GetImageCreateInfo().swizzledFormat.format); } + //! Disable operator= + Resource& operator=(const Resource&); -protected: - uint elementSize_; //!< Size of a single element in bytes + typedef std::vector RenameList; -private: - //! Disable copy constructor - Resource(const Resource&); + //! 
Rename current resource + bool rename(VirtualGPU& gpu, //!< Virtual GPU device object + bool force = false //!< Force renaming + ); - //! Disable operator= - Resource& operator=(const Resource&); + //! Sets the rename as active + void setActiveRename(VirtualGPU& gpu, //!< Virtual GPU device object + GpuMemoryReference* rename //!< new active rename + ); - typedef std::vector RenameList; + //! Gets the active rename + bool getActiveRename(VirtualGPU& gpu, //!< Virtual GPU device object + GpuMemoryReference** rename //!< Saved active rename + ); - //! Rename current resource - bool rename( - VirtualGPU& gpu, //!< Virtual GPU device object - bool force = false //!< Force renaming - ); + /*! \brief Locks the resource with layers and returns a physical pointer + * + * \return Pointer to the physical memory + */ + void* mapLayers(VirtualGPU* gpu, //!< Virtual GPU device object + uint flags = 0 //!< flags for the map operation + ); - //! Sets the rename as active - void setActiveRename( - VirtualGPU& gpu, //!< Virtual GPU device object - GpuMemoryReference* rename //!< new active rename - ); + //! Unlocks the resource with layers if it was locked + void unmapLayers(VirtualGPU* gpu //!< Virtual GPU device object + ); - //! Gets the active rename - bool getActiveRename( - VirtualGPU& gpu, //!< Virtual GPU device object - GpuMemoryReference** rename //!< Saved active rename - ); + //! Calls PAL to map a resource + void* gpuMemoryMap(size_t* pitch, //!< Pitch value for the image + uint flags, //!< Map flags + Pal::IGpuMemory* resource //!< PAL memory object + ) const; - /*! \brief Locks the resource with layers and returns a physical pointer - * - * \return Pointer to the physical memory - */ - void* mapLayers( - VirtualGPU* gpu, //!< Virtual GPU device object - uint flags = 0 //!< flags for the map operation - ); + //! Uses PAL to unmap a resource + void gpuMemoryUnmap(Pal::IGpuMemory* resource //!< PAL memory object + ) const; - //! 
Unlocks the resource with layers if it was locked - void unmapLayers( - VirtualGPU* gpu //!< Virtual GPU device object - ); + //! Fress all PAL resources associated with OCL resource + void palFree() const; - //! Calls PAL to map a resource - void* gpuMemoryMap( - size_t* pitch, //!< Pitch value for the image - uint flags, //!< Map flags - Pal::IGpuMemory* resource //!< PAL memory object - ) const; + //! Converts Resource memory type to the PAL heaps + void memTypeToHeap(Pal::GpuMemoryCreateInfo* createInfo //!< Memory create info + ); - //! Uses PAL to unmap a resource - void gpuMemoryUnmap( - Pal::IGpuMemory* resource //!< PAL memory object - ) const; + const Device& gpuDevice_; //!< GPU device + Descriptor desc_; //!< Descriptor for this resource + amd::Atomic mapCount_; //!< Total number of maps + void* address_; //!< Physical address of this resource + size_t offset_; //!< Resource offset + size_t curRename_; //!< Current active rename in the list + RenameList renames_; //!< Rename resource list + GpuMemoryReference* memRef_; //!< PAL resource reference + const Resource* viewOwner_; //!< GPU resource, which owns this view + uint64_t pinOffset_; //!< Pinned memory offset + void* glInteropMbRes_; //!< Mb Res handle + uint32_t glType_; //!< GL interop type + void* glPlatformContext_; + void* glDeviceContext_; - //! Fress all PAL resources associated with OCL resource - void palFree() const; + // Optimization for multilayer map/unmap + uint startLayer_; //!< Start layer for map/unmapLayer + uint numLayers_; //!< Number of layers for map/unmapLayer + uint mapFlags_; //!< Map flags for map/umapLayer - //! Converts Resource memory type to the PAL heaps - void memTypeToHeap( - Pal::GpuMemoryCreateInfo* createInfo //!< Memory create info - ); + //! 
@note: This field is necessary for the thread safe release only + VirtualGPU* gpu_; //!< Resource will be used only on this queue + Pal::IImage* image_; //!< PAL image object - const Device& gpuDevice_; //!< GPU device - Descriptor desc_; //!< Descriptor for this resource - amd::Atomic mapCount_; //!< Total number of maps - void* address_; //!< Physical address of this resource - size_t offset_; //!< Resource offset - size_t curRename_; //!< Current active rename in the list - RenameList renames_; //!< Rename resource list - GpuMemoryReference* memRef_; //!< PAL resource reference - const Resource* viewOwner_; //!< GPU resource, which owns this view - uint64_t pinOffset_; //!< Pinned memory offset - void* glInteropMbRes_;//!< Mb Res handle - uint32_t glType_; //!< GL interop type - void* glPlatformContext_; - void* glDeviceContext_; - - // Optimization for multilayer map/unmap - uint startLayer_; //!< Start layer for map/unmapLayer - uint numLayers_; //!< Number of layers for map/unmapLayer - uint mapFlags_; //!< Map flags for map/umapLayer - - //! @note: This field is necessary for the thread safe release only - VirtualGPU* gpu_; //!< Resource will be used only on this queue - Pal::IImage* image_; //!< PAL image object - - uint32_t* hwState_; //!< HW state for image object - uint64_t hwSrd_; //!< GPU pointer to HW SRD + uint32_t* hwState_; //!< HW state for image object + uint64_t hwSrd_; //!< GPU pointer to HW SRD }; -class ResourceCache : public amd::HeapObject -{ -public: - //! Default constructor - ResourceCache(size_t cacheSizeLimit) - : lockCacheOps_("PAL resource cache", true) - , cacheSize_(0) - , cacheSizeLimit_(cacheSizeLimit) - {} +class ResourceCache : public amd::HeapObject { + public: + //! Default constructor + ResourceCache(size_t cacheSizeLimit) + : lockCacheOps_("PAL resource cache", true), cacheSize_(0), cacheSizeLimit_(cacheSizeLimit) {} - //! Default destructor - ~ResourceCache(); + //! Default destructor + ~ResourceCache(); - //! 
Adds a CAL resource to the cache - bool addGpuMemory( - Resource::Descriptor* desc, //!< Resource descriptor - cache key - GpuMemoryReference* ref //!< Resource reference - ); + //! Adds a CAL resource to the cache + bool addGpuMemory(Resource::Descriptor* desc, //!< Resource descriptor - cache key + GpuMemoryReference* ref //!< Resource reference + ); - //! Finds a CAL resource from the cache - GpuMemoryReference* findGpuMemory( - Resource::Descriptor* desc, //!< Resource descriptor - cache key - Pal::gpusize size, - Pal::gpusize alignment - ); + //! Finds a CAL resource from the cache + GpuMemoryReference* findGpuMemory( + Resource::Descriptor* desc, //!< Resource descriptor - cache key + Pal::gpusize size, Pal::gpusize alignment); - //! Destroys cache - bool free(size_t minCacheEntries = 0); + //! Destroys cache + bool free(size_t minCacheEntries = 0); -private: - //! Disable copy constructor - ResourceCache(const ResourceCache&); + private: + //! Disable copy constructor + ResourceCache(const ResourceCache&); - //! Disable operator= - ResourceCache& operator=(const ResourceCache&); + //! Disable operator= + ResourceCache& operator=(const ResourceCache&); - //! Removes one last entry from the cache - void removeLast(); + //! Removes one last entry from the cache + void removeLast(); - amd::Monitor lockCacheOps_; //!< Lock to serialise cache access + amd::Monitor lockCacheOps_; //!< Lock to serialise cache access - size_t cacheSize_; //!< Current cache size in bytes - size_t cacheSizeLimit_; //!< Cache size limit in bytes + size_t cacheSize_; //!< Current cache size in bytes + size_t cacheSizeLimit_; //!< Cache size limit in bytes - //! CAL resource cache - std::list > resCache_; + //! 
CAL resource cache + std::list > resCache_; }; /*@}*/} // namespace pal diff --git a/rocclr/runtime/device/pal/palsched.hpp b/rocclr/runtime/device/pal/palsched.hpp index 1d45cb002b..8e3702e54a 100644 --- a/rocclr/runtime/device/pal/palsched.hpp +++ b/rocclr/runtime/device/pal/palsched.hpp @@ -9,67 +9,68 @@ namespace pal { //! AmdAqlWrap slot state enum AqlWrapState { - AQL_WRAP_FREE = 0, - AQL_WRAP_RESERVED, - AQL_WRAP_READY, - AQL_WRAP_MARKER, - AQL_WRAP_BUSY, - AQL_WRAP_DONE + AQL_WRAP_FREE = 0, + AQL_WRAP_RESERVED, + AQL_WRAP_READY, + AQL_WRAP_MARKER, + AQL_WRAP_BUSY, + AQL_WRAP_DONE }; struct AmdVQueueHeader { - uint32_t aql_slot_num; //!< [LRO/SRO] The total number of the AQL slots (multiple of 64). - uint32_t event_slot_num; //!< [LRO] The number of kernel events in the events buffer - uint64_t event_slot_mask; //!< [LRO] A pointer to the allocation bitmask array for the events - uint64_t event_slots; //!< [LRO] Pointer to a buffer for the events. - // Array of event_slot_num entries of AmdEvent - uint64_t aql_slot_mask; //!< [LRO/SRO]A pointer to the allocation bitmask for aql_warp slots - uint32_t command_counter; //!< [LRW] The global counter for the submitted commands into the queue - uint32_t wait_size; //!< [LRO] The wait list size (in clk_event_t) - uint32_t arg_size; //!< [LRO] The size of argument buffer (in bytes) - uint32_t mask_groups; //!< Processed mask groups by one thread - uint64_t kernel_table; //!< [LRO] Pointer to an array with all kernel objects (ulong for each entry) - uint32_t reserved[2]; //!< For the future usage + uint32_t aql_slot_num; //!< [LRO/SRO] The total number of the AQL slots (multiple of 64). + uint32_t event_slot_num; //!< [LRO] The number of kernel events in the events buffer + uint64_t event_slot_mask; //!< [LRO] A pointer to the allocation bitmask array for the events + uint64_t event_slots; //!< [LRO] Pointer to a buffer for the events. 
+ // Array of event_slot_num entries of AmdEvent + uint64_t aql_slot_mask; //!< [LRO/SRO]A pointer to the allocation bitmask for aql_warp slots + uint32_t command_counter; //!< [LRW] The global counter for the submitted commands into the queue + uint32_t wait_size; //!< [LRO] The wait list size (in clk_event_t) + uint32_t arg_size; //!< [LRO] The size of argument buffer (in bytes) + uint32_t mask_groups; //!< Processed mask groups by one thread + uint64_t + kernel_table; //!< [LRO] Pointer to an array with all kernel objects (ulong for each entry) + uint32_t reserved[2]; //!< For the future usage }; struct AmdAqlWrap { - uint32_t state; //!< [LRW/SRW] The current state of the AQL wrapper: FREE, RESERVED, READY, - // MARKER, BUSY and DONE. The block could be returned back to a free state. - uint32_t enqueue_flags; //!< [LWO/SRO] Contains the flags for the kernel execution start - uint32_t command_id; //!< [LWO/SRO] The unique command ID - uint32_t child_counter; //!< [LRW/SRW] Counter that determine the launches of child kernels. - // It’s incremented on the - // start and decremented on the finish. The parent kernel can be considered as - // done when the value is 0 and the state is DONE - uint64_t completion; //!< [LWO/SRO] CL event for the current execution (clk_event_t) - uint64_t parent_wrap; //!< [LWO/SRO] Pointer to the parent AQL wrapper (AmdAqlWrap*) - uint64_t wait_list; //!< [LRO/SRO] Pointer to an array of clk_event_t objects (64 bytes default) - uint32_t wait_num; //!< [LWO/SRO] The number of cl_event_wait objects - uint32_t reserved[5]; //!< For the future usage - hsa_kernel_dispatch_packet_t aql; //!< [LWO/SRO] AQL packet – 64 bytes AQL packet + uint32_t state; //!< [LRW/SRW] The current state of the AQL wrapper: FREE, RESERVED, READY, + // MARKER, BUSY and DONE. The block could be returned back to a free state. 
+ uint32_t enqueue_flags; //!< [LWO/SRO] Contains the flags for the kernel execution start + uint32_t command_id; //!< [LWO/SRO] The unique command ID + uint32_t child_counter; //!< [LRW/SRW] Counter that determine the launches of child kernels. + // It’s incremented on the + // start and decremented on the finish. The parent kernel can be considered as + // done when the value is 0 and the state is DONE + uint64_t completion; //!< [LWO/SRO] CL event for the current execution (clk_event_t) + uint64_t parent_wrap; //!< [LWO/SRO] Pointer to the parent AQL wrapper (AmdAqlWrap*) + uint64_t wait_list; //!< [LRO/SRO] Pointer to an array of clk_event_t objects (64 bytes default) + uint32_t wait_num; //!< [LWO/SRO] The number of cl_event_wait objects + uint32_t reserved[5]; //!< For the future usage + hsa_kernel_dispatch_packet_t aql; //!< [LWO/SRO] AQL packet – 64 bytes AQL packet }; struct AmdEvent { - uint32_t state; //!< [LRO/SRW] Event state: START, END, COMPLETE - uint32_t counter; //!< [LRW] Event retain/release counter. 0 means the event is free - uint64_t timer[3]; //!< [LRO/SWO] Timer values for profiling for each state - uint64_t captureInfo; //!< [LRW/SRO] Profiling capture info for CLK_PROFILING_COMMAND_EXEC_TIME + uint32_t state; //!< [LRO/SRW] Event state: START, END, COMPLETE + uint32_t counter; //!< [LRW] Event retain/release counter. 0 means the event is free + uint64_t timer[3]; //!< [LRO/SWO] Timer values for profiling for each state + uint64_t captureInfo; //!< [LRW/SRO] Profiling capture info for CLK_PROFILING_COMMAND_EXEC_TIME }; struct SchedulerParam { - uint32_t signal; //!< Signal to stop the child queue(address must be 16 bytes aligned) - uint32_t eng_clk; //!< Engine clock in Mhz - uint64_t hw_queue; //!< Address to HW queue - uint64_t hsa_queue; //!< Address to HSA dummy queue - uint32_t useATC; //!< GPU access to shader program by ATC. 
- uint32_t scratchSize; //!< Scratch buffer size - uint64_t scratch; //!< GPU address to the scratch buffer - uint32_t numMaxWaves; //!< The max number of possible waves - uint32_t releaseHostCP; //!< Releases CP on the host queue - uint64_t parentAQL; //!< Host parent AmdAqlWrap packet - uint32_t dedicatedQueue; //!< Scheduler uses a dedicated queue - uint32_t scratchOffset; //!< Scratch buffer offset - uint32_t reserved[2]; //!< Reserved + uint32_t signal; //!< Signal to stop the child queue(address must be 16 bytes aligned) + uint32_t eng_clk; //!< Engine clock in Mhz + uint64_t hw_queue; //!< Address to HW queue + uint64_t hsa_queue; //!< Address to HSA dummy queue + uint32_t useATC; //!< GPU access to shader program by ATC. + uint32_t scratchSize; //!< Scratch buffer size + uint64_t scratch; //!< GPU address to the scratch buffer + uint32_t numMaxWaves; //!< The max number of possible waves + uint32_t releaseHostCP; //!< Releases CP on the host queue + uint64_t parentAQL; //!< Host parent AmdAqlWrap packet + uint32_t dedicatedQueue; //!< Scheduler uses a dedicated queue + uint32_t scratchOffset; //!< Scratch buffer offset + uint32_t reserved[2]; //!< Reserved }; -} // namespace pal +} // namespace pal diff --git a/rocclr/runtime/device/pal/palschedcl.cpp b/rocclr/runtime/device/pal/palschedcl.cpp index 047d5b610b..aa0e8e9a80 100644 --- a/rocclr/runtime/device/pal/palschedcl.cpp +++ b/rocclr/runtime/device/pal/palschedcl.cpp @@ -5,19 +5,11 @@ namespace pal { #define SCHEDULER_KERNEL(...) 
#__VA_ARGS__ -const char* SchedulerSourceCode = SCHEDULER_KERNEL( -%s -\n -extern void __amd_scheduler(__global void *, __global void *, uint); -\n -__kernel void -scheduler( - __global void * queue, - __global void * params, - uint paramIdx) -{ - __amd_scheduler(queue, params, paramIdx); +const char* SchedulerSourceCode = SCHEDULER_KERNEL(% s +\n extern void __amd_scheduler(__global void*, __global void*, uint); +\n __kernel void scheduler(__global void* queue, __global void* params, uint paramIdx) { + __amd_scheduler(queue, params, paramIdx); } \n); -} // namespace pal +} // namespace pal diff --git a/rocclr/runtime/device/pal/palsettings.cpp b/rocclr/runtime/device/pal/palsettings.cpp index bc6cbd06ad..5eb054878e 100644 --- a/rocclr/runtime/device/pal/palsettings.cpp +++ b/rocclr/runtime/device/pal/palsettings.cpp @@ -21,462 +21,452 @@ namespace pal { * This structure contains the time and OS minor version for max workload time * adjustment for Windows 7 or 8. */ -struct ModifyMaxWorkload -{ - uint32_t time; //!< max work load time (10x ms) - uint32_t minorVersion; //!< OS minor version +struct ModifyMaxWorkload { + uint32_t time; //!< max work load time (10x ms) + uint32_t minorVersion; //!< OS minor version #if defined(_WIN32) - BYTE comparisonOps; //!< Comparison option + BYTE comparisonOps; //!< Comparison option #endif }; -Settings::Settings() -{ - // Initialize the GPU device default settings - oclVersion_ = OpenCL12; - debugFlags_ = 0; - syncObject_ = GPU_USE_SYNC_OBJECTS; - remoteAlloc_ = REMOTE_ALLOC; +Settings::Settings() { + // Initialize the GPU device default settings + oclVersion_ = OpenCL12; + debugFlags_ = 0; + syncObject_ = GPU_USE_SYNC_OBJECTS; + remoteAlloc_ = REMOTE_ALLOC; - stagedXferRead_ = true; - stagedXferWrite_ = true; - stagedXferSize_ = GPU_STAGING_BUFFER_SIZE * Ki; + stagedXferRead_ = true; + stagedXferWrite_ = true; + stagedXferSize_ = GPU_STAGING_BUFFER_SIZE * Ki; - // We will enable staged read/write if we use local memory - 
disablePersistent_ = false; + // We will enable staged read/write if we use local memory + disablePersistent_ = false; - // By Default persistent writes will be disabled. - stagingWritePersistent_ = GPU_STAGING_WRITE_PERSISTENT; + // By Default persistent writes will be disabled. + stagingWritePersistent_ = GPU_STAGING_WRITE_PERSISTENT; - maxRenames_ = 4; - maxRenameSize_ = 4 * Mi; + maxRenames_ = 4; + maxRenameSize_ = 4 * Mi; - imageSupport_ = false; - hwLDSSize_ = 0; + imageSupport_ = false; + hwLDSSize_ = 0; - // Set this to true when we drop the flag - doublePrecision_ = ::CL_KHR_FP64; + // Set this to true when we drop the flag + doublePrecision_ = ::CL_KHR_FP64; - // Fill workgroup info size - // @todo: revisit the 256 limitation on workgroup size - maxWorkGroupSize_ = 256; + // Fill workgroup info size + // @todo: revisit the 256 limitation on workgroup size + maxWorkGroupSize_ = 256; - hostMemDirectAccess_ = HostMemDisable; + hostMemDirectAccess_ = HostMemDisable; - libSelector_ = amd::LibraryUndefined; + libSelector_ = amd::LibraryUndefined; - // Enable workload split by default (for 24 bit arithmetic or timeout) - workloadSplitSize_ = 1 << GPU_WORKLOAD_SPLIT; + // Enable workload split by default (for 24 bit arithmetic or timeout) + workloadSplitSize_ = 1 << GPU_WORKLOAD_SPLIT; - // By default use host blit - blitEngine_ = BlitEngineHost; - const static size_t MaxPinnedXferSize = 32; - pinnedXferSize_ = std::min(GPU_PINNED_XFER_SIZE, MaxPinnedXferSize) * Mi; - pinnedMinXferSize_ = std::min(GPU_PINNED_MIN_XFER_SIZE * Ki, pinnedXferSize_); + // By default use host blit + blitEngine_ = BlitEngineHost; + const static size_t MaxPinnedXferSize = 32; + pinnedXferSize_ = std::min(GPU_PINNED_XFER_SIZE, MaxPinnedXferSize) * Mi; + pinnedMinXferSize_ = std::min(GPU_PINNED_MIN_XFER_SIZE * Ki, pinnedXferSize_); - // Disable FP_FAST_FMA defines by default - reportFMAF_ = false; - reportFMA_ = false; + // Disable FP_FAST_FMA defines by default + reportFMAF_ = false; + 
reportFMA_ = false; - // GPU device by default - apuSystem_ = false; + // GPU device by default + apuSystem_ = false; - // Disable 64 bit pointers support by default - use64BitPtr_ = false; + // Disable 64 bit pointers support by default + use64BitPtr_ = false; - // Max alloc size is 16GB - maxAllocSize_ = 16 * static_cast(Gi); + // Max alloc size is 16GB + maxAllocSize_ = 16 * static_cast(Gi); - // Disable memory dependency tracking by default - numMemDependencies_ = 0; + // Disable memory dependency tracking by default + numMemDependencies_ = 0; - // By default cache isn't present - cacheLineSize_ = 0; - cacheSize_ = 0; + // By default cache isn't present + cacheLineSize_ = 0; + cacheSize_ = 0; - // Initialize transfer buffer size to 1MB by default - xferBufSize_ = 1024 * Ki; + // Initialize transfer buffer size to 1MB by default + xferBufSize_ = 1024 * Ki; - // Use image DMA if requested - imageDMA_ = GPU_IMAGE_DMA; + // Use image DMA if requested + imageDMA_ = GPU_IMAGE_DMA; - // Disable ASIC specific features by default - viPlus_ = false; - aiPlus_ = false; + // Disable ASIC specific features by default + viPlus_ = false; + aiPlus_ = false; - // Number of compute rings. - numComputeRings_ = 0; + // Number of compute rings. 
+ numComputeRings_ = 0; - minWorkloadTime_ = 1; // 0.1 ms - maxWorkloadTime_ = 500000; // 500 ms + minWorkloadTime_ = 1; // 0.1 ms + maxWorkloadTime_ = 500000; // 500 ms - // Controls tiled images in persistent - //!@note IOL for Linux doesn't setup tiling aperture in CMM/QS - linearPersistentImage_ = false; + // Controls tiled images in persistent + //!@note IOL for Linux doesn't setup tiling aperture in CMM/QS + linearPersistentImage_ = false; - useSingleScratch_ = GPU_USE_SINGLE_SCRATCH; + useSingleScratch_ = GPU_USE_SINGLE_SCRATCH; - // Device enqueuing settings - numDeviceEvents_ = 1024; - numWaitEvents_ = 8; + // Device enqueuing settings + numDeviceEvents_ = 1024; + numWaitEvents_ = 8; - numScratchWavesPerCu_ = 16; + numScratchWavesPerCu_ = 16; - // Don't support platform atomics by default. - svmAtomics_ = false; + // Don't support platform atomics by default. + svmAtomics_ = false; - // Use host queue for device enqueuing by default - useDeviceQueue_ = GPU_USE_DEVICE_QUEUE; + // Use host queue for device enqueuing by default + useDeviceQueue_ = GPU_USE_DEVICE_QUEUE; - // Don't support Denormals for single precision by default - singleFpDenorm_ = false; - - // Disable SDMA workaround by default - sdamPageFaultWar_ = false; + // Don't support Denormals for single precision by default + singleFpDenorm_ = false; + // Disable SDMA workaround by default + sdamPageFaultWar_ = false; } -bool -Settings::create( - const Pal::DeviceProperties& palProp, - const Pal::GpuMemoryHeapProperties* heaps, - const Pal::WorkStationCaps& wscaps, - bool reportAsOCL12Device -) -{ -// uint target = calAttr.target; - uint32_t osVer = 0x0; +bool Settings::create(const Pal::DeviceProperties& palProp, + const Pal::GpuMemoryHeapProperties* heaps, const Pal::WorkStationCaps& wscaps, + bool reportAsOCL12Device) { + // uint target = calAttr.target; + uint32_t osVer = 0x0; - // Disable thread trace by default for all devices - threadTraceEnable_ = false; - bool doublePrecision = true; + // 
Disable thread trace by default for all devices + threadTraceEnable_ = false; + bool doublePrecision = true; - if (doublePrecision) { - // Report FP_FAST_FMA define if double precision HW - reportFMA_ = true; - // FMA is 1/4 speed on Pitcairn, Cape Verde, Devastator and Scrapper - // Bonaire, Kalindi, Spectre and Spooky so disable - // FP_FMA_FMAF for those parts in switch below - reportFMAF_ = true; - } + if (doublePrecision) { + // Report FP_FAST_FMA define if double precision HW + reportFMA_ = true; + // FMA is 1/4 speed on Pitcairn, Cape Verde, Devastator and Scrapper + // Bonaire, Kalindi, Spectre and Spooky so disable + // FP_FMA_FMAF for those parts in switch below + reportFMAF_ = true; + } - // Update GPU specific settings and info structure if we have any - ModifyMaxWorkload modifyMaxWorkload = {0}; + // Update GPU specific settings and info structure if we have any + ModifyMaxWorkload modifyMaxWorkload = {0}; - // APU systems - if (palProp.gpuType == Pal::GpuType::Integrated) { - apuSystem_ = true; - } + // APU systems + if (palProp.gpuType == Pal::GpuType::Integrated) { + apuSystem_ = true; + } - switch (palProp.revision) { + switch (palProp.revision) { case Pal::AsicRevision::Unknown: - switch (palProp.gfxLevel) { + switch (palProp.gfxLevel) { case Pal::GfxIpLevel::GfxIp9: - aiPlus_ = true; - break; + aiPlus_ = true; + break; default: - assert(0 && "Unknown GfxIP type!"); - return false; - } + assert(0 && "Unknown GfxIP type!"); + return false; + } case Pal::AsicRevision::Vega10: case Pal::AsicRevision::Raven: - aiPlus_ = true; - // Fall through to VI ... + aiPlus_ = true; + // Fall through to VI ... 
case Pal::AsicRevision::Carrizo: case Pal::AsicRevision::Bristol: case Pal::AsicRevision::Stoney: - if (!aiPlus_) { - // Fix BSOD/TDR issues observed on Stoney Win7/8.1/10 - minWorkloadTime_ = 1000; - modifyMaxWorkload.time = 1000; // Decided by experiment - modifyMaxWorkload.minorVersion = 1; // Win 7 + if (!aiPlus_) { + // Fix BSOD/TDR issues observed on Stoney Win7/8.1/10 + minWorkloadTime_ = 1000; + modifyMaxWorkload.time = 1000; // Decided by experiment + modifyMaxWorkload.minorVersion = 1; // Win 7 #if defined(_WIN32) - modifyMaxWorkload.comparisonOps = VER_EQUAL; // Limit to Win 7 only + modifyMaxWorkload.comparisonOps = VER_EQUAL; // Limit to Win 7 only #endif - } + } case Pal::AsicRevision::Iceland: case Pal::AsicRevision::Tonga: case Pal::AsicRevision::Fiji: case Pal::AsicRevision::Ellesmere: case Pal::AsicRevision::Baffin: - // Disable tiling aperture on VI+ - linearPersistentImage_ = true; - // Keep this false even though we have support - // singleFpDenorm_ = true; - viPlus_ = true; - // SDMA may have memory access outside of - // the valid buffer range and cause a page fault - sdamPageFaultWar_ = true; - enableExtension(ClKhrFp16); - // Fall through to CI ... + // Disable tiling aperture on VI+ + linearPersistentImage_ = true; + // Keep this false even though we have support + // singleFpDenorm_ = true; + viPlus_ = true; + // SDMA may have memory access outside of + // the valid buffer range and cause a page fault + sdamPageFaultWar_ = true; + enableExtension(ClKhrFp16); + // Fall through to CI ... 
case Pal::AsicRevision::Kalindi: case Pal::AsicRevision::Godavari: case Pal::AsicRevision::Spectre: case Pal::AsicRevision::Spooky: - if (!viPlus_) { - // Fix BSOD/TDR issues observed on Kaveri Win7 (EPR#416903) - modifyMaxWorkload.time = 250000; // 250ms - modifyMaxWorkload.minorVersion = 1; // Win 7 + if (!viPlus_) { + // Fix BSOD/TDR issues observed on Kaveri Win7 (EPR#416903) + modifyMaxWorkload.time = 250000; // 250ms + modifyMaxWorkload.minorVersion = 1; // Win 7 #if defined(_WIN32) - modifyMaxWorkload.comparisonOps = VER_EQUAL; // limit to Win 7 + modifyMaxWorkload.comparisonOps = VER_EQUAL; // limit to Win 7 #endif - } - // Fall through ... + } + // Fall through ... case Pal::AsicRevision::Bonaire: case Pal::AsicRevision::Hawaii: - threadTraceEnable_ = AMD_THREAD_TRACE_ENABLE; - reportFMAF_ = false; - if (palProp.revision == Pal::AsicRevision::Hawaii) { - reportFMAF_ = true; - } - // Cache line size is 64 bytes - cacheLineSize_ = 64; - // L1 cache size is 16KB - cacheSize_ = 16 * Ki; + threadTraceEnable_ = AMD_THREAD_TRACE_ENABLE; + reportFMAF_ = false; + if (palProp.revision == Pal::AsicRevision::Hawaii) { + reportFMAF_ = true; + } + // Cache line size is 64 bytes + cacheLineSize_ = 64; + // L1 cache size is 16KB + cacheSize_ = 16 * Ki; - libSelector_ = amd::GPU_Library_CI; - if (LP64_SWITCH(false, true)) { - oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/ ? - XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) : OpenCL12; - } - if (GPU_FORCE_OCL20_32BIT) { - force32BitOcl20_ = true; - oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/ ? - XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) : OpenCL12; - } - if (OPENCL_VERSION < 200) { - oclVersion_ = OpenCL12; - } - numComputeRings_ = 8; + libSelector_ = amd::GPU_Library_CI; + if (LP64_SWITCH(false, true)) { + oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/ + ? 
XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) + : OpenCL12; + } + if (GPU_FORCE_OCL20_32BIT) { + force32BitOcl20_ = true; + oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/ + ? XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) + : OpenCL12; + } + if (OPENCL_VERSION < 200) { + oclVersion_ = OpenCL12; + } + numComputeRings_ = 8; - // This needs to be cleaned once 64bit addressing is stable - if (oclVersion_ < OpenCL20) { - use64BitPtr_ = flagIsDefault(GPU_FORCE_64BIT_PTR) ? LP64_SWITCH(false, - /*calAttr.isWorkstation ||*/ true) : GPU_FORCE_64BIT_PTR; - } - else { - if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, true)) { - use64BitPtr_ = true; - } + // This needs to be cleaned once 64bit addressing is stable + if (oclVersion_ < OpenCL20) { + use64BitPtr_ = flagIsDefault(GPU_FORCE_64BIT_PTR) + ? LP64_SWITCH(false, + /*calAttr.isWorkstation ||*/ true) + : GPU_FORCE_64BIT_PTR; + } else { + if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, true)) { + use64BitPtr_ = true; } + } - if (oclVersion_ >= OpenCL20) { - supportDepthsRGB_ = true; - } - if (use64BitPtr_) { - if (GPU_ENABLE_LARGE_ALLOCATION && false/*wscaps.workStationBoard*/) { - maxAllocSize_ = 64ULL * Gi; - } - else { - maxAllocSize_ = 4048 * Mi; - } - } - else { - maxAllocSize_ = 3ULL * Gi; + if (oclVersion_ >= OpenCL20) { + supportDepthsRGB_ = true; + } + if (use64BitPtr_) { + if (GPU_ENABLE_LARGE_ALLOCATION && false /*wscaps.workStationBoard*/) { + maxAllocSize_ = 64ULL * Gi; + } else { + maxAllocSize_ = 4048 * Mi; } + } else { + maxAllocSize_ = 3ULL * Gi; + } - supportRA_ = false; - partialDispatch_ = GPU_PARTIAL_DISPATCH; - numMemDependencies_ = GPU_NUM_MEM_DEPENDENCY; - break; + supportRA_ = false; + partialDispatch_ = GPU_PARTIAL_DISPATCH; + numMemDependencies_ = GPU_NUM_MEM_DEPENDENCY; + break; default: - assert(0 && "Unknown ASIC type!"); - return false; - } + assert(0 && "Unknown ASIC type!"); + return false; + } #if defined(_WIN32) - if (modifyMaxWorkload.time > 0) { - 
OSVERSIONINFOEX versionInfo = { 0 }; - versionInfo.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX); - versionInfo.dwMajorVersion = 6; - versionInfo.dwMinorVersion = modifyMaxWorkload.minorVersion; + if (modifyMaxWorkload.time > 0) { + OSVERSIONINFOEX versionInfo = {0}; + versionInfo.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX); + versionInfo.dwMajorVersion = 6; + versionInfo.dwMinorVersion = modifyMaxWorkload.minorVersion; - DWORDLONG conditionMask = 0; - VER_SET_CONDITION(conditionMask, VER_MAJORVERSION, modifyMaxWorkload.comparisonOps); - VER_SET_CONDITION(conditionMask, VER_MINORVERSION, modifyMaxWorkload.comparisonOps); - if (VerifyVersionInfo(&versionInfo, VER_MAJORVERSION | VER_MINORVERSION, conditionMask)) { - maxWorkloadTime_ = modifyMaxWorkload.time; - } + DWORDLONG conditionMask = 0; + VER_SET_CONDITION(conditionMask, VER_MAJORVERSION, modifyMaxWorkload.comparisonOps); + VER_SET_CONDITION(conditionMask, VER_MINORVERSION, modifyMaxWorkload.comparisonOps); + if (VerifyVersionInfo(&versionInfo, VER_MAJORVERSION | VER_MINORVERSION, conditionMask)) { + maxWorkloadTime_ = modifyMaxWorkload.time; } -#endif // defined(_WIN32) + } +#endif // defined(_WIN32) - // Enable atomics support - enableExtension(ClKhrInt64BaseAtomics); - enableExtension(ClKhrInt64ExtendedAtomics); - enableExtension(ClKhrGlobalInt32BaseAtomics); - enableExtension(ClKhrGlobalInt32ExtendedAtomics); - enableExtension(ClKhrLocalInt32BaseAtomics); - enableExtension(ClKhrLocalInt32ExtendedAtomics); - enableExtension(ClKhrByteAddressableStore); - enableExtension(ClKhrGlSharing); - enableExtension(ClKhrGlEvent); - enableExtension(ClKhr3DImageWrites); - enableExtension(ClKhrImage2dFromBuffer); - enableExtension(ClAmdMediaOps); - enableExtension(ClAmdMediaOps2); + // Enable atomics support + enableExtension(ClKhrInt64BaseAtomics); + enableExtension(ClKhrInt64ExtendedAtomics); + enableExtension(ClKhrGlobalInt32BaseAtomics); + enableExtension(ClKhrGlobalInt32ExtendedAtomics); + 
enableExtension(ClKhrLocalInt32BaseAtomics); + enableExtension(ClKhrLocalInt32ExtendedAtomics); + enableExtension(ClKhrByteAddressableStore); + enableExtension(ClKhrGlSharing); + enableExtension(ClKhrGlEvent); + enableExtension(ClKhr3DImageWrites); + enableExtension(ClKhrImage2dFromBuffer); + enableExtension(ClAmdMediaOps); + enableExtension(ClAmdMediaOps2); #if !defined(WITH_LIGHTNING_COMPILER) - enableExtension(ClAmdPopcnt); - enableExtension(ClAmdVec3); - enableExtension(ClAmdPrintf); -#endif // !defined(WITH_LIGHTNING_COMPILER) - // Enable some platform extensions - enableExtension(ClAmdDeviceAttributeQuery); - enableExtension(ClKhrSpir); - enableExtension(ClAMDLiquidFlash); + enableExtension(ClAmdPopcnt); + enableExtension(ClAmdVec3); + enableExtension(ClAmdPrintf); +#endif // !defined(WITH_LIGHTNING_COMPILER) + // Enable some platform extensions + enableExtension(ClAmdDeviceAttributeQuery); + enableExtension(ClKhrSpir); + enableExtension(ClAMDLiquidFlash); - hwLDSSize_ = 32 * Ki; + hwLDSSize_ = 32 * Ki; - imageSupport_ = true; + imageSupport_ = true; - // Use kernels for blit if appropriate - blitEngine_ = BlitEngineKernel; + // Use kernels for blit if appropriate + blitEngine_ = BlitEngineKernel; - hostMemDirectAccess_ |= HostMemBuffer; - // HW doesn't support untiled image writes - // hostMemDirectAccess_ |= HostMemImage; + hostMemDirectAccess_ |= HostMemBuffer; + // HW doesn't support untiled image writes + // hostMemDirectAccess_ |= HostMemImage; - // Make sure device actually supports double precision - doublePrecision_ = (doublePrecision) ? doublePrecision_ : false; - if (doublePrecision_) { - // Enable KHR double precision extension - enableExtension(ClKhrFp64); - } + // Make sure device actually supports double precision + doublePrecision_ = (doublePrecision) ? 
doublePrecision_ : false; + if (doublePrecision_) { + // Enable KHR double precision extension + enableExtension(ClKhrFp64); + } #if !defined(WITH_LIGHTNING_COMPILER) - if (doublePrecision) { - // Enable AMD double precision extension - doublePrecision_ = true; - enableExtension(ClAmdFp64); - } -#endif // !defined(WITH_LIGHTNING_COMPILER) + if (doublePrecision) { + // Enable AMD double precision extension + doublePrecision_ = true; + enableExtension(ClAmdFp64); + } +#endif // !defined(WITH_LIGHTNING_COMPILER) - if (palProp.gpuMemoryProperties.busAddressableMemSize > 0) { - //Enable bus addressable memory extension - enableExtension(ClAMDBusAddressableMemory); - } -//! @todo -/* - if (calAttr.longIdleDetect) { - // KMD is unable to detect if we map the visible memory for CPU access, so - // accessing persistent staged buffer may fail if LongIdleDetct is enabled. - disablePersistent_ = true; - } -*/ + if (palProp.gpuMemoryProperties.busAddressableMemSize > 0) { + // Enable bus addressable memory extension + enableExtension(ClAMDBusAddressableMemory); + } + //! @todo + /* + if (calAttr.longIdleDetect) { + // KMD is unable to detect if we map the visible memory for CPU access, so + // accessing persistent staged buffer may fail if LongIdleDetct is enabled. 
+ disablePersistent_ = true; + } + */ - svmFineGrainSystem_ = palProp.gpuMemoryProperties.flags.iommuv2Support; - svmAtomics_ = svmFineGrainSystem_; + svmFineGrainSystem_ = palProp.gpuMemoryProperties.flags.iommuv2Support; + svmAtomics_ = svmFineGrainSystem_; - // SVM is not currently supported for DX Interop +// SVM is not currently supported for DX Interop #if defined(_WIN32) - enableExtension(ClKhrD3d9Sharing); - enableExtension(ClKhrD3d10Sharing); - enableExtension(ClKhrD3d11Sharing); -#endif // _WIN32 + enableExtension(ClKhrD3d9Sharing); + enableExtension(ClKhrD3d10Sharing); + enableExtension(ClKhrD3d11Sharing); +#endif // _WIN32 - // Enable some OpenCL 2.0 extensions - if (oclVersion_ >= OpenCL20) { - enableExtension(ClKhrGLDepthImages); - enableExtension(ClKhrSubGroups); - enableExtension(ClKhrDepthImages); + // Enable some OpenCL 2.0 extensions + if (oclVersion_ >= OpenCL20) { + enableExtension(ClKhrGLDepthImages); + enableExtension(ClKhrSubGroups); + enableExtension(ClKhrDepthImages); - if (GPU_MIPMAP) { - enableExtension(ClKhrMipMapImage); - enableExtension(ClKhrMipMapImageWrites); - } - - // Enable HW debug - if (GPU_ENABLE_HW_DEBUG) { - enableHwDebug_ = true; - } + if (GPU_MIPMAP) { + enableExtension(ClKhrMipMapImage); + enableExtension(ClKhrMipMapImageWrites); } - - if (apuSystem_ && - ((heaps[Pal::GpuHeapLocal].heapSize + heaps[Pal::GpuHeapInvisible].heapSize) < (150*Mi))) { - remoteAlloc_ = true; + // Enable HW debug + if (GPU_ENABLE_HW_DEBUG) { + enableHwDebug_ = true; } + } - // Save resource cache size + + if (apuSystem_ && + ((heaps[Pal::GpuHeapLocal].heapSize + heaps[Pal::GpuHeapInvisible].heapSize) < (150 * Mi))) { + remoteAlloc_ = true; + } + +// Save resource cache size #ifdef ATI_OS_LINUX - // Due to EPR#406216, set the default value for Linux for now - resourceCacheSize_ = GPU_RESOURCE_CACHE_SIZE * Mi; + // Due to EPR#406216, set the default value for Linux for now + resourceCacheSize_ = GPU_RESOURCE_CACHE_SIZE * Mi; #else - if 
(remoteAlloc_) { - resourceCacheSize_ = std::max((heaps[Pal::GpuHeapGartUswc].heapSize / 8), - (uint64_t)GPU_RESOURCE_CACHE_SIZE * Mi); - } - else { - resourceCacheSize_ = std::max(((heaps[Pal::GpuHeapLocal].heapSize + - heaps[Pal::GpuHeapInvisible].heapSize) / 8), - (uint64_t)GPU_RESOURCE_CACHE_SIZE * Mi); - } - resourceCacheSize_ = std::min(resourceCacheSize_, 512 * Mi); + if (remoteAlloc_) { + resourceCacheSize_ = std::max((heaps[Pal::GpuHeapGartUswc].heapSize / 8), + (uint64_t)GPU_RESOURCE_CACHE_SIZE * Mi); + } else { + resourceCacheSize_ = + std::max(((heaps[Pal::GpuHeapLocal].heapSize + heaps[Pal::GpuHeapInvisible].heapSize) / 8), + (uint64_t)GPU_RESOURCE_CACHE_SIZE * Mi); + } + resourceCacheSize_ = std::min(resourceCacheSize_, 512 * Mi); #endif #if defined(WITH_LIGHTNING_COMPILER) - switch (palProp.gfxLevel) { + switch (palProp.gfxLevel) { case Pal::GfxIpLevel::GfxIp9: + singleFpDenorm_ = true; + break; + } +#endif // WITH_LIGHTNING_COMPILER + + // Override current device settings + override(); + + return true; +} + +void Settings::override() { + // Limit reported workgroup size + if (GPU_MAX_WORKGROUP_SIZE != 0) { + maxWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE; + } + + // Override blit engine type + if (GPU_BLIT_ENGINE_TYPE != BlitEngineDefault) { + blitEngine_ = GPU_BLIT_ENGINE_TYPE; + } + + if (!flagIsDefault(DEBUG_GPU_FLAGS)) { + debugFlags_ = DEBUG_GPU_FLAGS; + } + + if (!flagIsDefault(DEBUG_GPU_FLAGS)) { + debugFlags_ = DEBUG_GPU_FLAGS; + } + + if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) { + xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki; + } + + if (!flagIsDefault(GPU_USE_SYNC_OBJECTS)) { + syncObject_ = GPU_USE_SYNC_OBJECTS; + } + + if (!flagIsDefault(GPU_NUM_COMPUTE_RINGS)) { + numComputeRings_ = GPU_NUM_COMPUTE_RINGS; + } + + if (!flagIsDefault(GPU_RESOURCE_CACHE_SIZE)) { + resourceCacheSize_ = GPU_RESOURCE_CACHE_SIZE * Mi; + } + + if (!flagIsDefault(AMD_GPU_FORCE_SINGLE_FP_DENORM)) { + switch (AMD_GPU_FORCE_SINGLE_FP_DENORM) { + case 0: + singleFpDenorm_ = 
false; + break; + case 1: singleFpDenorm_ = true; break; + default: + break; } -#endif // WITH_LIGHTNING_COMPILER - - // Override current device settings - override(); - - return true; + } } -void -Settings::override() -{ - // Limit reported workgroup size - if (GPU_MAX_WORKGROUP_SIZE != 0) { - maxWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE; - } - - // Override blit engine type - if (GPU_BLIT_ENGINE_TYPE != BlitEngineDefault) { - blitEngine_ = GPU_BLIT_ENGINE_TYPE; - } - - if (!flagIsDefault(DEBUG_GPU_FLAGS)) { - debugFlags_ = DEBUG_GPU_FLAGS; - } - - if (!flagIsDefault(DEBUG_GPU_FLAGS)) { - debugFlags_ = DEBUG_GPU_FLAGS; - } - - if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) { - xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki; - } - - if (!flagIsDefault(GPU_USE_SYNC_OBJECTS)) { - syncObject_ = GPU_USE_SYNC_OBJECTS; - } - - if (!flagIsDefault(GPU_NUM_COMPUTE_RINGS)) { - numComputeRings_ = GPU_NUM_COMPUTE_RINGS; - } - - if (!flagIsDefault(GPU_RESOURCE_CACHE_SIZE)) { - resourceCacheSize_ = GPU_RESOURCE_CACHE_SIZE * Mi; - } - - if (!flagIsDefault(AMD_GPU_FORCE_SINGLE_FP_DENORM)) { - switch (AMD_GPU_FORCE_SINGLE_FP_DENORM) { - case 0: - singleFpDenorm_ = false; - break; - case 1: - singleFpDenorm_ = true; - break; - default: - break; - } - } -} - -} // namespace pal +} // namespace pal diff --git a/rocclr/runtime/device/pal/palsettings.hpp b/rocclr/runtime/device/pal/palsettings.hpp index 9c63b2900d..638a13eaef 100644 --- a/rocclr/runtime/device/pal/palsettings.hpp +++ b/rocclr/runtime/device/pal/palsettings.hpp @@ -15,110 +15,105 @@ namespace pal { //! Device settings -class Settings : public device::Settings -{ -public: - //! Debug GPU flags - enum DebugGpuFlags - { - CheckForILSource = 0x00000001, - StubCLPrograms = 0x00000002, //!< Enables OpenCL programs stubbing - LockGlobalMemory = 0x00000004, +class Settings : public device::Settings { + public: + //! 
Debug GPU flags + enum DebugGpuFlags { + CheckForILSource = 0x00000001, + StubCLPrograms = 0x00000002, //!< Enables OpenCL programs stubbing + LockGlobalMemory = 0x00000004, + }; + + enum BlitEngineType { + BlitEngineDefault = 0x00000000, + BlitEngineHost = 0x00000001, + BlitEngineCAL = 0x00000002, + BlitEngineKernel = 0x00000003, + }; + + enum HostMemFlags { + HostMemDisable = 0x00000000, + HostMemBuffer = 0x00000001, + HostMemImage = 0x00000002, + }; + + union { + struct { + uint remoteAlloc_ : 1; //!< Allocate remote memory for the heap + uint stagedXferRead_ : 1; //!< Uses a staged buffer read + uint stagedXferWrite_ : 1; //!< Uses a staged buffer write + uint disablePersistent_ : 1; //!< Disables using persistent memory for staging + uint imageSupport_ : 1; //!< Report images support + uint doublePrecision_ : 1; //!< Enables double precision support + uint reportFMAF_ : 1; //!< Report FP_FAST_FMAF define in CL program + uint reportFMA_ : 1; //!< Report FP_FAST_FMA define in CL program + uint use64BitPtr_ : 1; //!< Use 64bit pointers on GPU + uint force32BitOcl20_ : 1; //!< Force 32bit apps to take CLANG/HSAIL path on GPU + uint imageDMA_ : 1; //!< Enable direct image DMA transfers + uint syncObject_ : 1; //!< Enable syncobject + uint viPlus_ : 1; //!< VI and post VI features + uint aiPlus_ : 1; //!< AI and post AI features + uint threadTraceEnable_ : 1; //!< Thread trace enable + uint linearPersistentImage_ : 1; //!< Allocates linear images in persistent + uint useSingleScratch_ : 1; //!< Allocates single scratch per device + uint stagingWritePersistent_ : 1; //!< Enables persistent writes + uint svmAtomics_ : 1; //!< SVM device atomics + uint svmFineGrainSystem_ : 1; //!< SVM fine grain system support + uint apuSystem_ : 1; //!< Device is APU system with shared memory + uint useDeviceQueue_ : 1; //!< Submit to separate device queue + uint singleFpDenorm_ : 1; //!< Support Single FP Denorm + uint sdamPageFaultWar_ : 1; //!< SDMA page fault workaround + uint 
reserved_ : 9; }; + uint value_; + }; - enum BlitEngineType - { - BlitEngineDefault = 0x00000000, - BlitEngineHost = 0x00000001, - BlitEngineCAL = 0x00000002, - BlitEngineKernel = 0x00000003, - }; + uint oclVersion_; //!< Reported OpenCL version support + uint debugFlags_; //!< Debug GPU flags + uint maxRenames_; //!< Maximum number of possible renames + uint maxRenameSize_; //!< Maximum size for all renames + uint hwLDSSize_; //!< HW local data store size + uint maxWorkGroupSize_; //!< Requested workgroup size for this device + uint workloadSplitSize_; //!< Workload split size + uint minWorkloadTime_; //!< Minimal workload time in 0.1 ms + uint maxWorkloadTime_; //!< Maximum workload time in 0.1 ms + uint blitEngine_; //!< Blit engine type + uint cacheLineSize_; //!< Cache line size in bytes + uint cacheSize_; //!< L1 cache size in bytes + uint numComputeRings_; //!< 0 - disabled, 1 , 2,.. - the number of compute rings + uint numDeviceEvents_; //!< The number of device events + uint numWaitEvents_; //!< The number of wait events for device enqueue + uint hostMemDirectAccess_; //!< Enables direct access to the host memory + uint numScratchWavesPerCu_; //!< Maximum number of waves when scratch is enabled + size_t xferBufSize_; //!< Transfer buffer size for image copy optimization + size_t stagedXferSize_; //!< Staged buffer size + size_t pinnedXferSize_; //!< Pinned buffer size for transfer + size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer + size_t resourceCacheSize_; //!< Resource cache size in MB + size_t numMemDependencies_; //!< The array size for memory dependencies tracking + uint64_t maxAllocSize_; //!< Maximum single allocation size - enum HostMemFlags - { - HostMemDisable = 0x00000000, - HostMemBuffer = 0x00000001, - HostMemImage = 0x00000002, - }; + amd::LibrarySelector libSelector_; //!< Select linking libraries for compiler - union { - struct { - uint remoteAlloc_: 1; //!< Allocate remote memory for the heap - uint 
stagedXferRead_: 1; //!< Uses a staged buffer read - uint stagedXferWrite_: 1; //!< Uses a staged buffer write - uint disablePersistent_: 1; //!< Disables using persistent memory for staging - uint imageSupport_: 1; //!< Report images support - uint doublePrecision_: 1; //!< Enables double precision support - uint reportFMAF_: 1; //!< Report FP_FAST_FMAF define in CL program - uint reportFMA_: 1; //!< Report FP_FAST_FMA define in CL program - uint use64BitPtr_: 1; //!< Use 64bit pointers on GPU - uint force32BitOcl20_: 1; //!< Force 32bit apps to take CLANG/HSAIL path on GPU - uint imageDMA_: 1; //!< Enable direct image DMA transfers - uint syncObject_: 1; //!< Enable syncobject - uint viPlus_: 1; //!< VI and post VI features - uint aiPlus_: 1; //!< AI and post AI features - uint threadTraceEnable_: 1; //!< Thread trace enable - uint linearPersistentImage_: 1; //!< Allocates linear images in persistent - uint useSingleScratch_: 1; //!< Allocates single scratch per device - uint stagingWritePersistent_: 1; //!< Enables persistent writes - uint svmAtomics_: 1; //!< SVM device atomics - uint svmFineGrainSystem_: 1; //!< SVM fine grain system support - uint apuSystem_: 1; //!< Device is APU system with shared memory - uint useDeviceQueue_: 1; //!< Submit to separate device queue - uint singleFpDenorm_: 1; //!< Support Single FP Denorm - uint sdamPageFaultWar_: 1; //!< SDMA page fault workaround - uint reserved_: 9; - }; - uint value_; - }; + //! 
Default constructor + Settings(); - uint oclVersion_; //!< Reported OpenCL version support - uint debugFlags_; //!< Debug GPU flags - uint maxRenames_; //!< Maximum number of possible renames - uint maxRenameSize_; //!< Maximum size for all renames - uint hwLDSSize_; //!< HW local data store size - uint maxWorkGroupSize_; //!< Requested workgroup size for this device - uint workloadSplitSize_; //!< Workload split size - uint minWorkloadTime_; //!< Minimal workload time in 0.1 ms - uint maxWorkloadTime_; //!< Maximum workload time in 0.1 ms - uint blitEngine_; //!< Blit engine type - uint cacheLineSize_; //!< Cache line size in bytes - uint cacheSize_; //!< L1 cache size in bytes - uint numComputeRings_; //!< 0 - disabled, 1 , 2,.. - the number of compute rings - uint numDeviceEvents_; //!< The number of device events - uint numWaitEvents_; //!< The number of wait events for device enqueue - uint hostMemDirectAccess_; //!< Enables direct access to the host memory - uint numScratchWavesPerCu_; //!< Maximum number of waves when scratch is enabled - size_t xferBufSize_; //!< Transfer buffer size for image copy optimization - size_t stagedXferSize_; //!< Staged buffer size - size_t pinnedXferSize_; //!< Pinned buffer size for transfer - size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer - size_t resourceCacheSize_; //!< Resource cache size in MB - size_t numMemDependencies_;//!< The array size for memory dependencies tracking - uint64_t maxAllocSize_; //!< Maximum single allocation size + //! Creates settings + bool create(const Pal::DeviceProperties& palProp, //!< PAL device properties + const Pal::GpuMemoryHeapProperties* heaps, //!< PAL heap settings + const Pal::WorkStationCaps& wscaps, //!< PAL workstation settings + bool reportAsOCL12Device = false //!< Report As OpenCL1.2 Device + ); - amd::LibrarySelector libSelector_; //!< Select linking libraries for compiler + private: + //! Disable copy constructor + Settings(const Settings&); - //! 
Default constructor - Settings(); + //! Disable assignment + Settings& operator=(const Settings&); - //! Creates settings - bool create( - const Pal::DeviceProperties& palProp, //!< PAL device properties - const Pal::GpuMemoryHeapProperties* heaps, //!< PAL heap settings - const Pal::WorkStationCaps& wscaps, //!< PAL workstation settings - bool reportAsOCL12Device = false //!< Report As OpenCL1.2 Device - ); - -private: - //! Disable copy constructor - Settings(const Settings&); - - //! Disable assignment - Settings& operator=(const Settings&); - - //! Overrides current settings based on registry/environment - void override(); + //! Overrides current settings based on registry/environment + void override(); }; /*@}*/} // namespace pal diff --git a/rocclr/runtime/device/pal/palthreadtrace.cpp b/rocclr/runtime/device/pal/palthreadtrace.cpp index 7370f24e6b..b9853aebe1 100644 --- a/rocclr/runtime/device/pal/palthreadtrace.cpp +++ b/rocclr/runtime/device/pal/palthreadtrace.cpp @@ -7,176 +7,161 @@ namespace pal { -PalThreadTraceReference* -PalThreadTraceReference::Create(VirtualGPU& gpu) -{ - Pal::Result result; +PalThreadTraceReference* PalThreadTraceReference::Create(VirtualGPU& gpu) { + Pal::Result result; - // Create performance experiment - Pal::PerfExperimentCreateInfo createInfo = {}; + // Create performance experiment + Pal::PerfExperimentCreateInfo createInfo = {}; - createInfo.optionFlags.sampleInternalOperations = 1; - createInfo.optionFlags.cacheFlushOnCounterCollection = 1; - createInfo.optionFlags.sqShaderMask = 1; - createInfo.optionValues.sampleInternalOperations = true; - createInfo.optionValues.cacheFlushOnCounterCollection = true; - createInfo.optionValues.sqShaderMask = Pal::PerfShaderMaskCs; + createInfo.optionFlags.sampleInternalOperations = 1; + createInfo.optionFlags.cacheFlushOnCounterCollection = 1; + createInfo.optionFlags.sqShaderMask = 1; + createInfo.optionValues.sampleInternalOperations = true; + 
createInfo.optionValues.cacheFlushOnCounterCollection = true; + createInfo.optionValues.sqShaderMask = Pal::PerfShaderMaskCs; - size_t palExperSize = gpu.dev().iDev()->GetPerfExperimentSize( - createInfo, &result); + size_t palExperSize = gpu.dev().iDev()->GetPerfExperimentSize(createInfo, &result); + if (result != Pal::Result::Success) { + return nullptr; + } + + PalThreadTraceReference* memRef = new (palExperSize) PalThreadTraceReference(gpu); + if (memRef != nullptr) { + result = gpu.dev().iDev()->CreatePerfExperiment(createInfo, &memRef[1], &memRef->perfExp_); if (result != Pal::Result::Success) { - return nullptr; + memRef->release(); + return nullptr; } + } - PalThreadTraceReference* memRef = new (palExperSize) PalThreadTraceReference(gpu); - if (memRef != nullptr) { - result = gpu.dev().iDev()->CreatePerfExperiment(createInfo, - &memRef[1], &memRef->perfExp_); - if (result != Pal::Result::Success) { - memRef->release(); - return nullptr; - } - } - - return memRef; + return memRef; } -PalThreadTraceReference::~PalThreadTraceReference() -{ - // The thread trace object is always associated with a particular queue, - // so we have to lock just this queue - amd::ScopedLock lock(gpu_.execution()); +PalThreadTraceReference::~PalThreadTraceReference() { + // The thread trace object is always associated with a particular queue, + // so we have to lock just this queue + amd::ScopedLock lock(gpu_.execution()); - delete layout_; - delete memory_; + delete layout_; + delete memory_; - if (nullptr != iPerf()) { - iPerf()->Destroy(); - } + if (nullptr != iPerf()) { + iPerf()->Destroy(); + } } -bool -PalThreadTraceReference::finalize() -{ - Pal::Result result; +bool PalThreadTraceReference::finalize() { + Pal::Result result; - iPerf()->Finalize(); + iPerf()->Finalize(); - // Acquire GPU memory for the query from the pool and bind it. 
- Pal::GpuMemoryRequirements gpuMemReqs = {}; - iPerf()->GetGpuMemoryRequirements(&gpuMemReqs); + // Acquire GPU memory for the query from the pool and bind it. + Pal::GpuMemoryRequirements gpuMemReqs = {}; + iPerf()->GetGpuMemoryRequirements(&gpuMemReqs); - memory_ = new Memory(gpu().dev(), amd::alignUp(gpuMemReqs.size, gpuMemReqs.alignment)); + memory_ = new Memory(gpu().dev(), amd::alignUp(gpuMemReqs.size, gpuMemReqs.alignment)); - if (nullptr == memory_) { - return false; - } + if (nullptr == memory_) { + return false; + } - if (!memory_->create(Resource::Local)) { - return false; - } + if (!memory_->create(Resource::Local)) { + return false; + } - gpu_.queue(gpu_.engineID_).addMemRef(memory_->iMem()); + gpu_.queue(gpu_.engineID_).addMemRef(memory_->iMem()); - result = iPerf()->BindGpuMemory(memory_->iMem(), 0); + result = iPerf()->BindGpuMemory(memory_->iMem(), 0); + if (result != Pal::Result::Success) { + return false; + } + + Pal::ThreadTraceLayout layout = {}; + iPerf()->GetThreadTraceLayout(&layout); + + size_t size = + sizeof(Pal::ThreadTraceLayout) + (sizeof(Pal::ThreadTraceSeLayout) * (layout.traceCount - 1)); + layout_ = reinterpret_cast(new char[size]); + if (layout_ == nullptr) { + return false; + } + + layout_->traceCount = layout.traceCount; + iPerf()->GetThreadTraceLayout(layout_); + + return true; +} + +void PalThreadTraceReference::copyToUserBuffer(Memory* dstMem, uint seIndex) { + amd::Coord3D srcOrigin(layout_->traces[seIndex].dataOffset, 0, 0); + amd::Coord3D dstOrigin(0, 0, 0); + amd::Coord3D size(dstMem->size(), 0, 0); + + gpu_.blitMgr().copyBuffer(*memory_, *dstMem, srcOrigin, dstOrigin, size, true); +} + +ThreadTrace::~ThreadTrace() { + if (palRef_ == nullptr) { + return; + } + + // Release the thread trace reference object + palRef_->release(); +} + +bool ThreadTrace::create() { + palRef_->retain(); + + size_t se = 0; + for (auto itMemObj = memObj_.begin(); itMemObj != memObj_.end(); ++itMemObj, ++se) { + // Initialize the thread trace + 
Pal::PerfTraceInfo sqttInfo = {}; + sqttInfo.traceType = Pal::PerfTraceType::ThreadTrace; + sqttInfo.instance = se; + + sqttInfo.optionFlags.bufferSize = 1; + // PAL requires ThreadTrace buffer aligned to 4KB + sqttInfo.optionValues.bufferSize = + amd::alignUp(dev().getGpuMemory(*itMemObj)->size(), (0x1 << 12)); + sqttInfo.optionFlags.threadTraceTokenMask = 1; + sqttInfo.optionValues.threadTraceTokenMask = 0x0000ffff; + + Pal::Result result = iPerf()->AddTrace(sqttInfo); if (result != Pal::Result::Success) { - return false; + return false; } + } - Pal::ThreadTraceLayout layout = {}; - iPerf()->GetThreadTraceLayout(&layout); - - size_t size = sizeof(Pal::ThreadTraceLayout) + (sizeof(Pal::ThreadTraceSeLayout) * (layout.traceCount - 1)); - layout_ = reinterpret_cast(new char[size]); - if (layout_ == nullptr) { - return false; - } - - layout_->traceCount = layout.traceCount; - iPerf()->GetThreadTraceLayout(layout_); - - return true; + return true; } -void -PalThreadTraceReference::copyToUserBuffer(Memory* dstMem, uint seIndex) -{ - amd::Coord3D srcOrigin(layout_->traces[seIndex].dataOffset, 0, 0); - amd::Coord3D dstOrigin(0, 0, 0); - amd::Coord3D size(dstMem->size(), 0, 0); - - gpu_.blitMgr().copyBuffer(*memory_, *dstMem, srcOrigin, dstOrigin, size, true); +void ThreadTrace::populateUserMemory() { + uint se = 0; + for (auto itMemObj = memObj_.begin(); itMemObj != memObj_.end(); ++itMemObj, ++se) { + palRef_->copyToUserBuffer(dev().getGpuMemory(*itMemObj), se); + } } -ThreadTrace::~ThreadTrace() -{ - if (palRef_ == nullptr) { - return; - } - - // Release the thread trace reference object - palRef_->release(); -} - -bool -ThreadTrace::create() -{ - palRef_->retain(); - - size_t se = 0; - for (auto itMemObj = memObj_.begin(); itMemObj != memObj_.end(); ++itMemObj, ++se) { - // Initialize the thread trace - Pal::PerfTraceInfo sqttInfo = {}; - sqttInfo.traceType = Pal::PerfTraceType::ThreadTrace; - sqttInfo.instance = se; - - sqttInfo.optionFlags.bufferSize = 1; - // PAL 
requires ThreadTrace buffer aligned to 4KB - sqttInfo.optionValues.bufferSize = amd::alignUp(dev().getGpuMemory(*itMemObj)->size(), (0x1 << 12)); - sqttInfo.optionFlags.threadTraceTokenMask = 1; - sqttInfo.optionValues.threadTraceTokenMask = 0x0000ffff; - - Pal::Result result = iPerf()->AddTrace(sqttInfo); - if (result != Pal::Result::Success) { - return false; - } - } - - return true; -} - -void -ThreadTrace::populateUserMemory() -{ - uint se = 0; - for (auto itMemObj = memObj_.begin(); itMemObj != memObj_.end(); ++itMemObj, ++se) { - palRef_->copyToUserBuffer(dev().getGpuMemory(*itMemObj), se); - } -} - -bool -ThreadTrace::info(uint infoType, uint* info, uint infoSize) const -{ - switch (infoType) { +bool ThreadTrace::info(uint infoType, uint* info, uint infoSize) const { + switch (infoType) { case CL_THREAD_TRACE_BUFFERS_SIZE: { - if (infoSize < numSe_) { - LogError("The amount of buffers should be equal to the amount of Shader Engines"); - return false; + if (infoSize < numSe_) { + LogError("The amount of buffers should be equal to the amount of Shader Engines"); + return false; + } else { + uint se = 0; + for (auto itMemObj = memObj_.begin(); itMemObj != memObj_.end(); ++itMemObj, ++se) { + info[se] = dev().getGpuMemory(*itMemObj)->size(); } - else { - uint se = 0; - for (auto itMemObj = memObj_.begin(); itMemObj != memObj_.end(); ++itMemObj, ++se) { - info[se] = dev().getGpuMemory(*itMemObj)->size(); - } - } - break; + } + break; } default: - LogError("Wrong ThreadTrace::getInfo parameter"); - return false; - } - return true; + LogError("Wrong ThreadTrace::getInfo parameter"); + return false; + } + return true; } -} // namespace pal +} // namespace pal diff --git a/rocclr/runtime/device/pal/palthreadtrace.hpp b/rocclr/runtime/device/pal/palthreadtrace.hpp index 2c7fbc4423..d3822f0a83 100644 --- a/rocclr/runtime/device/pal/palthreadtrace.hpp +++ b/rocclr/runtime/device/pal/palthreadtrace.hpp @@ -13,108 +13,93 @@ namespace pal { class VirtualGPU; -class 
PalThreadTraceReference : public amd::ReferenceCountedObject -{ -public: - static PalThreadTraceReference* Create(VirtualGPU& gpu); +class PalThreadTraceReference : public amd::ReferenceCountedObject { + public: + static PalThreadTraceReference* Create(VirtualGPU& gpu); - //! Default constructor - PalThreadTraceReference( - VirtualGPU& gpu //!< Virtual GPU device object - ) - : gpu_(gpu) - , perfExp_(nullptr) - , layout_(nullptr) - , memory_(nullptr) - {} + //! Default constructor + PalThreadTraceReference(VirtualGPU& gpu //!< Virtual GPU device object + ) + : gpu_(gpu), perfExp_(nullptr), layout_(nullptr), memory_(nullptr) {} - //! Get PAL thread race object - Pal::IPerfExperiment* iPerf() const { return perfExp_; } + //! Get PAL thread race object + Pal::IPerfExperiment* iPerf() const { return perfExp_; } - //! Returns the virtual GPU device - const VirtualGPU& gpu() const { return gpu_; } + //! Returns the virtual GPU device + const VirtualGPU& gpu() const { return gpu_; } - //! Prepare for execution - bool finalize(); + //! Prepare for execution + bool finalize(); - //! Copy ThreadTrace capture to User Buffer - void copyToUserBuffer(Memory* dstMem, uint seIndex); + //! Copy ThreadTrace capture to User Buffer + void copyToUserBuffer(Memory* dstMem, uint seIndex); -protected: - //! Default destructor - ~PalThreadTraceReference(); + protected: + //! Default destructor + ~PalThreadTraceReference(); -private: - //! Disable copy constructor - PalThreadTraceReference(const PalThreadTraceReference&); + private: + //! Disable copy constructor + PalThreadTraceReference(const PalThreadTraceReference&); - //! Disable operator= - PalThreadTraceReference& operator=(const PalThreadTraceReference&); + //! 
Disable operator= + PalThreadTraceReference& operator=(const PalThreadTraceReference&); - VirtualGPU& gpu_; //!< The virtual GPU device object - Pal::IPerfExperiment* perfExp_; //!< PAL performance experiment object - Pal::ThreadTraceLayout* layout_; //!< Layout of the result - Memory* memory_; //!< Memory bound to PerfExperiment + VirtualGPU& gpu_; //!< The virtual GPU device object + Pal::IPerfExperiment* perfExp_; //!< PAL performance experiment object + Pal::ThreadTraceLayout* layout_; //!< Layout of the result + Memory* memory_; //!< Memory bound to PerfExperiment }; //! ThreadTrace implementation on GPU -class ThreadTrace : public device::ThreadTrace -{ -public: +class ThreadTrace : public device::ThreadTrace { + public: + //! Constructor for the GPU ThreadTrace object + ThreadTrace(Device& device, //!< A GPU device object + PalThreadTraceReference* palRef, //!< Reference ThreadTrace + const std::vector& memObjs, //!< ThreadTrace memory objects + uint numSe //!< Number of Shader Engines + ) + : gpuDevice_(device), palRef_(palRef), numSe_(numSe), memObj_(memObjs) {} - //! Constructor for the GPU ThreadTrace object - ThreadTrace( - Device& device, //!< A GPU device object - PalThreadTraceReference* palRef, //!< Reference ThreadTrace - const std::vector& memObjs, //!< ThreadTrace memory objects - uint numSe //!< Number of Shader Engines - ) - : gpuDevice_(device) - , palRef_(palRef) - , numSe_(numSe) - , memObj_(memObjs) - {} + //! Destructor for the GPU ThreadTrace object + virtual ~ThreadTrace(); - //! Destructor for the GPU ThreadTrace object - virtual ~ThreadTrace(); + //! Creates the current object4 + bool create(); - //! Creates the current object4 - bool create(); + // Populate ThreadTrace memory with PerfExperiment memory + void populateUserMemory(); - // Populate ThreadTrace memory with PerfExperiment memory - void populateUserMemory(); + //! 
Returns the specific information about the thread trace object + bool info(uint infoType, //!< The type of returned information + uint* info, //!< The returned information + uint infoSize //!< The size of returned information + ) const; - //! Returns the specific information about the thread trace object - bool info( - uint infoType, //!< The type of returned information - uint* info, //!< The returned information - uint infoSize //!< The size of returned information - ) const; + //! Set isNewBufferBinded_ to true/false if new buffer was binded/unbinded respectively + void setNewBufferBinded(bool isNewBufferBinded) {} - //! Set isNewBufferBinded_ to true/false if new buffer was binded/unbinded respectively - void setNewBufferBinded(bool isNewBufferBinded) {} + //! Returns the GPU device, associated with the current object + const Device& dev() const { return gpuDevice_; } - //! Returns the GPU device, associated with the current object - const Device& dev() const { return gpuDevice_; } + //! Returns the virtual GPU device + const VirtualGPU& gpu() const { return palRef_->gpu(); } - //! Returns the virtual GPU device - const VirtualGPU& gpu() const { return palRef_->gpu(); } + //! Get PAL thread trace object + Pal::IPerfExperiment* iPerf() const { return palRef_->iPerf(); } - //! Get PAL thread trace object - Pal::IPerfExperiment* iPerf() const { return palRef_->iPerf(); } + private: + //! Disable default copy constructor + ThreadTrace(const ThreadTrace&); -private: - //! Disable default copy constructor - ThreadTrace(const ThreadTrace&); + //! Disable default operator= + ThreadTrace& operator=(const ThreadTrace&); - //! 
Disable default operator= - ThreadTrace& operator=(const ThreadTrace&); - - const Device& gpuDevice_; //!< The backend device - PalThreadTraceReference* palRef_; //!< Reference ThreadTrace - uint numSe_; //!< Number of Shader Engines - std::vector memObj_; //!< ThreadTrace memory objects + const Device& gpuDevice_; //!< The backend device + PalThreadTraceReference* palRef_; //!< Reference ThreadTrace + uint numSe_; //!< Number of Shader Engines + std::vector memObj_; //!< ThreadTrace memory objects }; -} // namespace pal - +} // namespace pal diff --git a/rocclr/runtime/device/pal/paltimestamp.cpp b/rocclr/runtime/device/pal/paltimestamp.cpp index 15876345ac..c46fe3ec97 100644 --- a/rocclr/runtime/device/pal/paltimestamp.cpp +++ b/rocclr/runtime/device/pal/paltimestamp.cpp @@ -10,114 +10,92 @@ namespace pal { -TimeStamp::TimeStamp( - const VirtualGPU& gpu, - Pal::IGpuMemory* iMem, - uint memOffset, - address cpuAddr) - : gpu_(gpu) - , iMem_(iMem) - , memOffset_(memOffset) -{ - values_ = reinterpret_cast(cpuAddr + memOffset); +TimeStamp::TimeStamp(const VirtualGPU& gpu, Pal::IGpuMemory* iMem, uint memOffset, address cpuAddr) + : gpu_(gpu), iMem_(iMem), memOffset_(memOffset) { + values_ = reinterpret_cast(cpuAddr + memOffset); } -TimeStamp::~TimeStamp() -{ +TimeStamp::~TimeStamp() {} + +void TimeStamp::begin(bool sdma) { + if (!flags_.beginIssued_) { + gpu().iCmd()->CmdWriteTimestamp(Pal::HwPipePoint::HwPipeTop, *iMem_, + memOffset_ + CommandStartTime * sizeof(uint64_t)); + flags_.beginIssued_ = true; + } } -void -TimeStamp::begin(bool sdma) -{ - if (!flags_.beginIssued_) { - gpu().iCmd()->CmdWriteTimestamp(Pal::HwPipePoint::HwPipeTop, *iMem_, - memOffset_ + CommandStartTime * sizeof(uint64_t)); - flags_.beginIssued_ = true; +void TimeStamp::end(bool sdma) { + CondLog(!flags_.beginIssued_, "We didn't issue a begin operation!"); + gpu().iCmd()->CmdWriteTimestamp(Pal::HwPipePoint::HwPipeBottom, *iMem_, + memOffset_ + CommandEndTime * sizeof(uint64_t)); + 
flags_.endIssued_ = true; + flags_.sdma_ = sdma; +} + +inline void SetValue(uint64_t* time, uint64_t val, double nanos) { + *time = static_cast(static_cast(val) * nanos); +} + +void TimeStamp::value(uint64_t* startTime, uint64_t* endTime) { + CondLog(!flags_.endIssued_, "We didn't send the counter end operation!"); + //! @todo optimize! + const double NanoSecondsPerTick = 1000000000.0 / (gpu_.dev().properties().timestampFrequency); + + SetValue(startTime, values_[CommandStartTime], NanoSecondsPerTick); + SetValue(endTime, values_[CommandEndTime], NanoSecondsPerTick); +} + +TimeStampCache::~TimeStampCache() { + // Release all time stamp objects from the cache + for (uint i = 0; i < freedTS_.size(); ++i) { + delete freedTS_[i]; + } + freedTS_.clear(); + + // Release all memory objects + for (uint i = 0; i < tsBuf_.size(); ++i) { + tsBuf_[i]->unmap(&gpu_); + gpu_.queue(MainEngine).removeMemRef(tsBuf_[i]->iMem()); + gpu_.queue(SdmaEngine).removeMemRef(tsBuf_[i]->iMem()); + delete tsBuf_[i]; + } + tsBuf_.clear(); +} + +TimeStamp* TimeStampCache::allocTimeStamp() { + TimeStamp* ts = nullptr; + if (0 != freedTS_.size()) { + ts = freedTS_.back(); + freedTS_.pop_back(); + } + + if (nullptr == ts) { + if ((tsBufCpu_ == nullptr) || ((tsOffset_ + TimerSlotSize) > TimerBufSize)) { + Memory* buf = new Memory(gpu_.dev(), TimerBufSize); + if (buf == nullptr || !buf->create(Resource::Remote)) { + return nullptr; + } + gpu_.queue(MainEngine).addMemRef(buf->iMem()); + gpu_.queue(SdmaEngine).addMemRef(buf->iMem()); + tsBufCpu_ = reinterpret_cast
(buf->map(&gpu_)); + memset(tsBufCpu_, 0, TimerBufSize); + tsOffset_ = 0; + tsBuf_.push_back(buf); } -} - -void -TimeStamp::end(bool sdma) -{ - CondLog(!flags_.beginIssued_, "We didn't issue a begin operation!"); - gpu().iCmd()->CmdWriteTimestamp(Pal::HwPipePoint::HwPipeBottom, *iMem_, - memOffset_ + CommandEndTime * sizeof(uint64_t)); - flags_.endIssued_ = true; - flags_.sdma_ = sdma; -} - -inline void -SetValue(uint64_t* time, uint64_t val, double nanos) -{ - *time = static_cast(static_cast(val) * nanos); -} - -void -TimeStamp::value(uint64_t* startTime, uint64_t* endTime) -{ - CondLog(!flags_.endIssued_, "We didn't send the counter end operation!"); - //! @todo optimize! - const double NanoSecondsPerTick = 1000000000.0 / (gpu_.dev().properties().timestampFrequency); - - SetValue(startTime, values_[CommandStartTime], NanoSecondsPerTick); - SetValue(endTime, values_[CommandEndTime], NanoSecondsPerTick); -} - -TimeStampCache::~TimeStampCache() -{ - // Release all time stamp objects from the cache - for (uint i = 0; i < freedTS_.size(); ++i) { - delete freedTS_[i]; + // Allocate a TimeStamp object + ts = new TimeStamp(gpu_, tsBuf_[(tsBuf_.size() - 1)]->iMem(), tsOffset_, tsBufCpu_); + // Create a timestamp + if (ts == nullptr) { + return nullptr; } - freedTS_.clear(); + tsOffset_ += TimerSlotSize; + } - // Release all memory objects - for (uint i = 0; i < tsBuf_.size(); ++i) { - tsBuf_[i]->unmap(&gpu_); - gpu_.queue(MainEngine).removeMemRef(tsBuf_[i]->iMem()); - gpu_.queue(SdmaEngine).removeMemRef(tsBuf_[i]->iMem()); - delete tsBuf_[i]; - } - tsBuf_.clear(); + // Set this timestamp into DRM profile mode if it was requested + ts->clearStates(); + return ts; } -TimeStamp* -TimeStampCache::allocTimeStamp() -{ - TimeStamp* ts = nullptr; - if (0 != freedTS_.size()) { - ts = freedTS_.back(); - freedTS_.pop_back(); - } - - if (nullptr == ts) { - if ((tsBufCpu_ == nullptr) || ((tsOffset_ + TimerSlotSize) > TimerBufSize)) { - Memory* buf = new Memory(gpu_.dev(), 
TimerBufSize); - if (buf == nullptr || !buf->create(Resource::Remote)) { - return nullptr; - } - gpu_.queue(MainEngine).addMemRef(buf->iMem()); - gpu_.queue(SdmaEngine).addMemRef(buf->iMem()); - tsBufCpu_ = reinterpret_cast
(buf->map(&gpu_)); - memset(tsBufCpu_, 0, TimerBufSize); - tsOffset_ = 0; - tsBuf_.push_back(buf); - } - // Allocate a TimeStamp object - ts = new TimeStamp(gpu_, tsBuf_[(tsBuf_.size() - 1)]->iMem(), - tsOffset_, tsBufCpu_); - // Create a timestamp - if (ts == nullptr) { - return nullptr; - } - tsOffset_ += TimerSlotSize; - } - - // Set this timestamp into DRM profile mode if it was requested - ts->clearStates(); - - return ts; -} - -} // namespace pal +} // namespace pal diff --git a/rocclr/runtime/device/pal/paltimestamp.hpp b/rocclr/runtime/device/pal/paltimestamp.hpp index 6ec10e52e7..97b2d5e5ca 100644 --- a/rocclr/runtime/device/pal/paltimestamp.hpp +++ b/rocclr/runtime/device/pal/paltimestamp.hpp @@ -17,113 +17,101 @@ class Device; class VirtualGPU; class Memory; -class TimeStamp : public amd::HeapObject -{ -public: - //! Enums for the timestamp information - //! \note *4 is the limitaiton of SDMA HW - //! (address has to be aligned by 256 bit) - enum TimeStampValue { - CommandStartTime = 0, - CommandEndTime = 4, - CommandTotal = 8 +class TimeStamp : public amd::HeapObject { + public: + //! Enums for the timestamp information + //! \note *4 is the limitaiton of SDMA HW + //! (address has to be aligned by 256 bit) + enum TimeStampValue { CommandStartTime = 0, CommandEndTime = 4, CommandTotal = 8 }; + + //! The TimeStamp object flags + union Flags { + struct { + uint32_t beginIssued_ : 1; + uint32_t endIssued_ : 1; + uint32_t sdma_ : 1; }; + uint32_t value_; + Flags() : value_(0) {} + }; - //! The TimeStamp object flags - union Flags - { - struct - { - uint32_t beginIssued_ : 1; - uint32_t endIssued_ : 1; - uint32_t sdma_ : 1; - }; - uint32_t value_; - Flags(): value_(0) {} - }; + //! Default constructor + TimeStamp(const VirtualGPU& gpu, //!< Virtual GPU + Pal::IGpuMemory* iMem, //!< Buffer with the timer values + uint memOffset, //!< Offset in the buffer for the current TS + address cpuAddr //!< CPU pointer for the values in memory + ); - //! 
Default constructor - TimeStamp( - const VirtualGPU& gpu, //!< Virtual GPU - Pal::IGpuMemory* iMem, //!< Buffer with the timer values - uint memOffset, //!< Offset in the buffer for the current TS - address cpuAddr //!< CPU pointer for the values in memory - ); + //! Default destructor + ~TimeStamp(); - //! Default destructor - ~TimeStamp(); + //! Starts the timestamp + void begin(bool sdma = false); - //! Starts the timestamp - void begin(bool sdma = false); + //! Ends the timestamp + void end(bool sdma = false); - //! Ends the timestamp - void end(bool sdma = false); + //! Returns the timestamp result in nano seconds + void value(uint64_t* startTime, uint64_t* endTime); - //! Returns the timestamp result in nano seconds - void value(uint64_t* startTime, uint64_t* endTime); + //! Clear all TimeStamp states + void clearStates() { + flags_.value_ = 0; + values_[CommandStartTime] = 0; + values_[CommandEndTime] = 0; + } - //! Clear all TimeStamp states - void clearStates() - { flags_.value_ = 0; - values_[CommandStartTime] = 0; - values_[CommandEndTime] = 0; - } + //! Timer commands were submitted to HW + bool isValid() const { return (flags_.endIssued_) ? true : false; } - //! Timer commands were submitted to HW - bool isValid() const { return (flags_.endIssued_) ? true : false; } + private: + //! Disable copy constructor + TimeStamp(const TimeStamp&); -private: - //! Disable copy constructor - TimeStamp(const TimeStamp&); + //! Disable operator= + TimeStamp& operator=(const TimeStamp&); - //! Disable operator= - TimeStamp& operator=(const TimeStamp&); + //! Returns the GPU device object + const VirtualGPU& gpu() const { return gpu_; } - //! 
Returns the GPU device object - const VirtualGPU& gpu() const { return gpu_; } - - const VirtualGPU& gpu_; //!< Virtual GPU - Flags flags_; //!< The time stamp state - Pal::IGpuMemory* iMem_; //!< Buffer with the timer values - uint memOffset_; //!< Offset in the buffer for the current timer - volatile uint64_t* values_; //!< CPU pointer to the timer values + const VirtualGPU& gpu_; //!< Virtual GPU + Flags flags_; //!< The time stamp state + Pal::IGpuMemory* iMem_; //!< Buffer with the timer values + uint memOffset_; //!< Offset in the buffer for the current timer + volatile uint64_t* values_; //!< CPU pointer to the timer values }; -class TimeStampCache : public amd::HeapObject -{ -public: - //! Default constructor - TimeStampCache( - VirtualGPU& gpu //!< Virtual GPU object - ) - : gpu_(gpu) - , tsBufCpu_(NULL) - , tsOffset_(0) {} +class TimeStampCache : public amd::HeapObject { + public: + //! Default constructor + TimeStampCache(VirtualGPU& gpu //!< Virtual GPU object + ) + : gpu_(gpu), tsBufCpu_(NULL), tsOffset_(0) {} - //! Default destructor - ~TimeStampCache(); + //! Default destructor + ~TimeStampCache(); - //! Gets a time stamp object. It will find a freed object or allocate a new one - TimeStamp* allocTimeStamp(); + //! Gets a time stamp object. It will find a freed object or allocate a new one + TimeStamp* allocTimeStamp(); - //! Frees a time stamp object - void freeTimeStamp(TimeStamp* ts) { freedTS_.push_back(ts); } + //! Frees a time stamp object + void freeTimeStamp(TimeStamp* ts) { freedTS_.push_back(ts); } -private: - static const uint TimerSlotSize = TimeStamp::CommandTotal * sizeof(uint64_t); - static const uint TimerBufSize = TimerSlotSize * 4096; + private: + static const uint TimerSlotSize = TimeStamp::CommandTotal * sizeof(uint64_t); + static const uint TimerBufSize = TimerSlotSize * 4096; - //! Disable copy constructor - TimeStampCache(const TimeStampCache&); + //! Disable copy constructor + TimeStampCache(const TimeStampCache&); - //! 
Disable operator= - TimeStampCache& operator=(const TimeStampCache&); + //! Disable operator= + TimeStampCache& operator=(const TimeStampCache&); - std::vector freedTS_; //!< Array of freed time stamp objects - VirtualGPU& gpu_; //!< Virtual GPU - std::vector tsBuf_; //!< Array of memory objects with the timer value - address tsBufCpu_; //!< CPU pointer for current TS memory - uint tsOffset_; //!< Active offset in the current mem object + std::vector freedTS_; //!< Array of freed time stamp objects + VirtualGPU& gpu_; //!< Virtual GPU + std::vector tsBuf_; //!< Array of memory objects with the timer value + address tsBufCpu_; //!< CPU pointer for current TS memory + uint tsOffset_; //!< Active offset in the current mem object }; /*@}*/} // namespace pal diff --git a/rocclr/runtime/device/pal/paltrap.hpp b/rocclr/runtime/device/pal/paltrap.hpp index e1eed63243..1494710819 100644 --- a/rocclr/runtime/device/pal/paltrap.hpp +++ b/rocclr/runtime/device/pal/paltrap.hpp @@ -116,72 +116,24 @@ end *******************************************************************************/ /// shader codes with "asic(TAHITI)" instruction -static const uint32_t RuntimeTrapCode [] = { - 0x7e008200, 0xbf8c0000, - 0xbef8036c, 0x8779ff6d, - 0x0000ffff, 0x8879ff79, - 0x01000000, 0xbefa03ff, - 0x00002000, 0xbefb03ff, - 0x00024fac, 0x80f8ff78, - 0x00000100, 0xbef70300, - 0xc2007900, 0xbf8c0000, - 0xbeee0300, 0xc2007901, - 0xbf8c0000, 0xbeef0300, - 0xbe800377, 0xbef60398, - 0x8078766e, 0x8779ff6f, - 0x0000ffff, 0x8879ff79, - 0x00680000, 0xbefa03ff, - 0x00002000, 0xbefb03ff, - 0x00024fac, 0xbef6036e, - 0xbef7036f, 0xbef30300, - 0xc2007902, 0xbf8c0000, - 0xbeee0300, 0xc2007903, - 0xbf8c0000, 0xbeef0300, - 0xc2007900, 0xbf8c0000, - 0xbef20300, 0xc2007901, - 0xbf8c0000, 0x89737300, - 0x89007300, 0x89737300, - 0xbef80372, 0xbef90373, - 0xbef21f00, 0x80728872, - 0xbe802078, 0xbeef0377, - 0xbeee0376, 0x8771ff71, - 0x0000ffff, 0xbe802270 -}; +static const uint32_t RuntimeTrapCode[] = { + 0x7e008200, 
0xbf8c0000, 0xbef8036c, 0x8779ff6d, 0x0000ffff, 0x8879ff79, 0x01000000, 0xbefa03ff, + 0x00002000, 0xbefb03ff, 0x00024fac, 0x80f8ff78, 0x00000100, 0xbef70300, 0xc2007900, 0xbf8c0000, + 0xbeee0300, 0xc2007901, 0xbf8c0000, 0xbeef0300, 0xbe800377, 0xbef60398, 0x8078766e, 0x8779ff6f, + 0x0000ffff, 0x8879ff79, 0x00680000, 0xbefa03ff, 0x00002000, 0xbefb03ff, 0x00024fac, 0xbef6036e, + 0xbef7036f, 0xbef30300, 0xc2007902, 0xbf8c0000, 0xbeee0300, 0xc2007903, 0xbf8c0000, 0xbeef0300, + 0xc2007900, 0xbf8c0000, 0xbef20300, 0xc2007901, 0xbf8c0000, 0x89737300, 0x89007300, 0x89737300, + 0xbef80372, 0xbef90373, 0xbef21f00, 0x80728872, 0xbe802078, 0xbeef0377, 0xbeee0376, 0x8771ff71, + 0x0000ffff, 0xbe802270}; /// shader codes with "asic(VI)" instruction -static const uint32_t RuntimeTrapCodeVi [] = { - 0x7e006a00, 0xbf8c0000, - 0xbef8006c, 0x8679ff6d, - 0x0000ffff, 0x8779ff79, - 0x01000000, 0xbefa00ff, - 0x00002000, 0xbefb00ff, - 0x00024fac, 0x80f8ff78, - 0x00000100, 0xbef70000, - 0xc022003c, 0x00000000, - 0xbf8c0000, 0xbeee0000, - 0xc022003c, 0x00000004, - 0xbf8c0000, 0xbeef0000, - 0xbe800077, 0xbef60098, - 0x8078766e, 0x8679ff6f, - 0x0000ffff, 0x8779ff79, - 0x00680000, 0xbefa00ff, - 0x00002000, 0xbefb00ff, - 0x00024fac, 0xbef6006e, - 0xbef7006f, 0xbef30000, - 0xc022003c, 0x00000008, - 0xbf8c0000, 0xbeee0000, - 0xc022003c, 0x0000000c, - 0xbf8c0000, 0xbeef0000, - 0xc022003c, 0x00000000, - 0xbf8c0000, 0xbef20000, - 0xc022003c, 0x00000004, - 0xbf8c0000, 0x88737300, - 0x88007300, 0x88737300, - 0xbef80072, 0xbef90073, - 0xbef21c00, 0x80728872, - 0xbe801d78, 0xbeef0077, - 0xbeee0076, 0x8671ff71, - 0x0000ffff, 0xbe801f70 -}; - +static const uint32_t RuntimeTrapCodeVi[] = { + 0x7e006a00, 0xbf8c0000, 0xbef8006c, 0x8679ff6d, 0x0000ffff, 0x8779ff79, 0x01000000, 0xbefa00ff, + 0x00002000, 0xbefb00ff, 0x00024fac, 0x80f8ff78, 0x00000100, 0xbef70000, 0xc022003c, 0x00000000, + 0xbf8c0000, 0xbeee0000, 0xc022003c, 0x00000004, 0xbf8c0000, 0xbeef0000, 0xbe800077, 0xbef60098, + 0x8078766e, 0x8679ff6f, 
0x0000ffff, 0x8779ff79, 0x00680000, 0xbefa00ff, 0x00002000, 0xbefb00ff, + 0x00024fac, 0xbef6006e, 0xbef7006f, 0xbef30000, 0xc022003c, 0x00000008, 0xbf8c0000, 0xbeee0000, + 0xc022003c, 0x0000000c, 0xbf8c0000, 0xbeef0000, 0xc022003c, 0x00000000, 0xbf8c0000, 0xbef20000, + 0xc022003c, 0x00000004, 0xbf8c0000, 0x88737300, 0x88007300, 0x88737300, 0xbef80072, 0xbef90073, + 0xbef21c00, 0x80728872, 0xbe801d78, 0xbeef0077, 0xbeee0076, 0x8671ff71, 0x0000ffff, 0xbe801f70}; diff --git a/rocclr/runtime/device/pal/palvirtual.cpp b/rocclr/runtime/device/pal/palvirtual.cpp index 0775016867..0c13824e63 100644 --- a/rocclr/runtime/device/pal/palvirtual.cpp +++ b/rocclr/runtime/device/pal/palvirtual.cpp @@ -28,3641 +28,3291 @@ #include "amdocl/cl_d3d9_amd.hpp" #include "amdocl/cl_d3d10_amd.hpp" #include "amdocl/cl_d3d11_amd.hpp" -#endif // _WIN32 +#endif // _WIN32 namespace pal { -VirtualGPU::Queue* -VirtualGPU::Queue::Create( - Pal::IDevice* palDev, - Pal::QueueType queueType, - uint engineIdx, - Pal::ICmdAllocator* cmdAllocator, - uint rtCU, - amd::CommandQueue::Priority priority) -{ - Pal::Result result; - Pal::CmdBufferCreateInfo cmdCreateInfo = {}; - Pal::QueueCreateInfo qCreateInfo = {}; - qCreateInfo.engineIndex = engineIdx; - qCreateInfo.aqlQueue = true; - qCreateInfo.queueType = queueType; +VirtualGPU::Queue* VirtualGPU::Queue::Create(Pal::IDevice* palDev, Pal::QueueType queueType, + uint engineIdx, Pal::ICmdAllocator* cmdAllocator, + uint rtCU, amd::CommandQueue::Priority priority) { + Pal::Result result; + Pal::CmdBufferCreateInfo cmdCreateInfo = {}; + Pal::QueueCreateInfo qCreateInfo = {}; + qCreateInfo.engineIndex = engineIdx; + qCreateInfo.aqlQueue = true; + qCreateInfo.queueType = queueType; - if (queueType == Pal::QueueTypeDma) { - cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeDma; - } - else { - cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeCompute; - } + if (queueType == Pal::QueueTypeDma) { + cmdCreateInfo.engineType = 
qCreateInfo.engineType = Pal::EngineTypeDma; + } else { + cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeCompute; + } - if (priority == amd::CommandQueue::Priority::Medium) { - qCreateInfo.engineIndex = 0x1; - cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeExclusiveCompute; - } - else if (amd::CommandQueue::RealTimeDisabled != rtCU) { - qCreateInfo.numReservedCu = rtCU; - qCreateInfo.engineIndex = 0x0; - cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeExclusiveCompute; - cmdCreateInfo.flags.realtimeComputeUnits = true; - } + if (priority == amd::CommandQueue::Priority::Medium) { + qCreateInfo.engineIndex = 0x1; + cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeExclusiveCompute; + } else if (amd::CommandQueue::RealTimeDisabled != rtCU) { + qCreateInfo.numReservedCu = rtCU; + qCreateInfo.engineIndex = 0x0; + cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeExclusiveCompute; + cmdCreateInfo.flags.realtimeComputeUnits = true; + } - // Find queue object size - size_t qSize = palDev->GetQueueSize(qCreateInfo, &result); + // Find queue object size + size_t qSize = palDev->GetQueueSize(qCreateInfo, &result); + if (result != Pal::Result::Success) { + return nullptr; + } + + cmdCreateInfo.pCmdAllocator = cmdAllocator; + cmdCreateInfo.queueType = queueType; + + // Find command buffer object size + size_t cmdSize = palDev->GetCmdBufferSize(cmdCreateInfo, &result); + if (result != Pal::Result::Success) { + return nullptr; + } + + // Find fence object size + size_t fSize = palDev->GetFenceSize(&result); + if (result != Pal::Result::Success) { + return nullptr; + } + + size_t allocSize = qSize + MaxCmdBuffers * (cmdSize + fSize); + VirtualGPU::Queue* queue = new (allocSize) VirtualGPU::Queue(palDev); + if (queue != nullptr) { + address addrQ = reinterpret_cast
(&queue[1]); + // Create PAL queue object + result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_); if (result != Pal::Result::Success) { - return nullptr; + delete queue; + return nullptr; } - cmdCreateInfo.pCmdAllocator = cmdAllocator; - cmdCreateInfo.queueType = queueType; - - // Find command buffer object size - size_t cmdSize = palDev->GetCmdBufferSize(cmdCreateInfo, &result); - if (result != Pal::Result::Success) { - return nullptr; - } - - // Find fence object size - size_t fSize = palDev->GetFenceSize(&result); - if (result != Pal::Result::Success) { - return nullptr; - } - - size_t allocSize = qSize + MaxCmdBuffers * (cmdSize + fSize); - VirtualGPU::Queue* queue = new (allocSize) VirtualGPU::Queue(palDev); - if (queue != nullptr) { - address addrQ = reinterpret_cast
(&queue[1]); - // Create PAL queue object - result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_); - if (result != Pal::Result::Success) { - delete queue; - return nullptr; - } - - address addrCmd = addrQ + qSize; - address addrF = addrCmd + MaxCmdBuffers * cmdSize; - Pal::CmdBufferBuildInfo cmdBuildInfo = {}; - - for (uint i = 0; i < MaxCmdBuffers; ++i) { - result = palDev->CreateCmdBuffer(cmdCreateInfo, - &addrCmd[i*cmdSize], &queue->iCmdBuffs_[i]); - if (result != Pal::Result::Success) { - delete queue; - return nullptr; - } - static const bool InitiallySignaled = false; - result = palDev->CreateFence(InitiallySignaled, &addrF[i*fSize], - &queue->iCmdFences_[i]); - if (result != Pal::Result::Success) { - delete queue; - return nullptr; - } - if (i == StartCmdBufIdx) { - result = queue->iCmdBuffs_[i]->Begin(cmdBuildInfo); - if (result != Pal::Result::Success) { - delete queue; - return nullptr; - } - } - } - } - return queue; -} - -VirtualGPU::Queue::~Queue() -{ - std::vector memRef; - // Remove all memory references - for (auto it: memReferences_) { - memRef.push_back(it.first); - } - if (memRef.size() != 0) { - iDev_->RemoveGpuMemoryReferences(memRef.size(), &memRef[0], NULL); - } - memReferences_.clear(); + address addrCmd = addrQ + qSize; + address addrF = addrCmd + MaxCmdBuffers * cmdSize; + Pal::CmdBufferBuildInfo cmdBuildInfo = {}; for (uint i = 0; i < MaxCmdBuffers; ++i) { - if (nullptr != iCmdBuffs_[i]) { - iCmdBuffs_[i]->Destroy(); - } - if (nullptr != iCmdFences_[i]) { - iCmdFences_[i]->Destroy(); + result = palDev->CreateCmdBuffer(cmdCreateInfo, &addrCmd[i * cmdSize], &queue->iCmdBuffs_[i]); + if (result != Pal::Result::Success) { + delete queue; + return nullptr; + } + static const bool InitiallySignaled = false; + result = palDev->CreateFence(InitiallySignaled, &addrF[i * fSize], &queue->iCmdFences_[i]); + if (result != Pal::Result::Success) { + delete queue; + return nullptr; + } + if (i == StartCmdBufIdx) { + result = 
queue->iCmdBuffs_[i]->Begin(cmdBuildInfo); + if (result != Pal::Result::Success) { + delete queue; + return nullptr; } + } } - - if (nullptr != iQueue_) { - iQueue_->Destroy(); - } + } + return queue; } -void -VirtualGPU::Queue::addCmdMemRef(Pal::IGpuMemory* iMem) -{ - auto it = memReferences_.find(iMem); - if (it != memReferences_.end()) { - it->second = (it->second & FirstMemoryReference) | cmdBufIdSlot_; +VirtualGPU::Queue::~Queue() { + std::vector memRef; + // Remove all memory references + for (auto it : memReferences_) { + memRef.push_back(it.first); + } + if (memRef.size() != 0) { + iDev_->RemoveGpuMemoryReferences(memRef.size(), &memRef[0], NULL); + } + memReferences_.clear(); + + for (uint i = 0; i < MaxCmdBuffers; ++i) { + if (nullptr != iCmdBuffs_[i]) { + iCmdBuffs_[i]->Destroy(); } - else { - memReferences_[iMem] = FirstMemoryReference | cmdBufIdSlot_; + if (nullptr != iCmdFences_[i]) { + iCmdFences_[i]->Destroy(); } + } + + if (nullptr != iQueue_) { + iQueue_->Destroy(); + } } -void -VirtualGPU::Queue::removeCmdMemRef(Pal::IGpuMemory* iMem) -{ - if (0 != memReferences_.erase(iMem)) { - iDev_->RemoveGpuMemoryReferences(1, &iMem, iQueue_); - } +void VirtualGPU::Queue::addCmdMemRef(Pal::IGpuMemory* iMem) { + auto it = memReferences_.find(iMem); + if (it != memReferences_.end()) { + it->second = (it->second & FirstMemoryReference) | cmdBufIdSlot_; + } else { + memReferences_[iMem] = FirstMemoryReference | cmdBufIdSlot_; + } } -void -VirtualGPU::Queue::addCmdDoppRef(Pal::IGpuMemory* iMem, bool lastDoppCmd, bool pfpaDoppCmd) -{ - for (int i = 0; i < palDoppRefs_.size(); i++) { - if (palDoppRefs_[i].pGpuMemory == iMem) { - // If both LAST_DOPP_SUBMISSION and PFPA_DOPP_SUBMISSION VCOPs are requested, - // the LAST_DOPP_SUBMISSION is send as requsted by KMD - // - if (palDoppRefs_[i].flags.lastPfpaCmd == 1) { - return; // no need to override the last submission command - } - - if (lastDoppCmd) { - palDoppRefs_[i].flags.lastPfpaCmd = 1; - 
palDoppRefs_[i].flags.pfpa = 0; - } - else if (pfpaDoppCmd) { - palDoppRefs_[i].flags.pfpa = 1; - } - return; - } - } - - // this is the first reference of the DOPP desktop texture, add it in the vector - Pal::DoppRef doppRef = {}; - doppRef.flags.pfpa = pfpaDoppCmd ? 1 : 0; - doppRef.flags.lastPfpaCmd = lastDoppCmd ? 1 : 0; - doppRef.pGpuMemory = iMem; - palDoppRefs_.push_back(doppRef); +void VirtualGPU::Queue::removeCmdMemRef(Pal::IGpuMemory* iMem) { + if (0 != memReferences_.erase(iMem)) { + iDev_->RemoveGpuMemoryReferences(1, &iMem, iQueue_); + } } -uint -VirtualGPU::Queue::submit(bool forceFlush) -{ - cmdCnt_++; - uint id = cmdBufIdCurrent_; - if ((cmdCnt_ > MaxCommands) || forceFlush) { - if (!flush()) { - return GpuEvent::InvalidID; - } +void VirtualGPU::Queue::addCmdDoppRef(Pal::IGpuMemory* iMem, bool lastDoppCmd, bool pfpaDoppCmd) { + for (int i = 0; i < palDoppRefs_.size(); i++) { + if (palDoppRefs_[i].pGpuMemory == iMem) { + // If both LAST_DOPP_SUBMISSION and PFPA_DOPP_SUBMISSION VCOPs are requested, + // the LAST_DOPP_SUBMISSION is send as requsted by KMD + // + if (palDoppRefs_[i].flags.lastPfpaCmd == 1) { + return; // no need to override the last submission command + } + + if (lastDoppCmd) { + palDoppRefs_[i].flags.lastPfpaCmd = 1; + palDoppRefs_[i].flags.pfpa = 0; + } else if (pfpaDoppCmd) { + palDoppRefs_[i].flags.pfpa = 1; + } + return; } - return id; + } + + // this is the first reference of the DOPP desktop texture, add it in the vector + Pal::DoppRef doppRef = {}; + doppRef.flags.pfpa = pfpaDoppCmd ? 1 : 0; + doppRef.flags.lastPfpaCmd = lastDoppCmd ? 
1 : 0; + doppRef.pGpuMemory = iMem; + palDoppRefs_.push_back(doppRef); } -bool -VirtualGPU::Queue::flush() -{ - palMemRefs_.resize(0); - // Stop commands building - if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->End()) { - LogError("PAL failed to finalize a command buffer!"); - return false; +uint VirtualGPU::Queue::submit(bool forceFlush) { + cmdCnt_++; + uint id = cmdBufIdCurrent_; + if ((cmdCnt_ > MaxCommands) || forceFlush) { + if (!flush()) { + return GpuEvent::InvalidID; } - - // Add memory references - for (auto it = memReferences_.begin(); it != memReferences_.end(); ++it) { - if (it->second & FirstMemoryReference) { - it->second &= ~FirstMemoryReference; - Pal::GpuMemoryRef memRef = {}; - memRef.pGpuMemory = it->first; - palMemRefs_.push_back(memRef); - } - } - - if (palMemRefs_.size() != 0) { - if (Pal::Result::Success != iDev_->AddGpuMemoryReferences( - palMemRefs_.size(), &palMemRefs_[0], iQueue_, Pal::GpuMemoryRefCantTrim)) { - LogError("PAL failed to make resident resources!"); - return false; - } - } - - // Reset the fence. 
PAL will reset OS event - if (Pal::Result::Success != iDev_->ResetFences(1, &iCmdFences_[cmdBufIdSlot_])) { - LogError("PAL failed to reset a fence!"); - return false; - } - - Pal::SubmitInfo submitInfo = {}; - submitInfo.cmdBufferCount = 1; - submitInfo.ppCmdBuffers = &iCmdBuffs_[cmdBufIdSlot_]; - submitInfo.pFence = iCmdFences_[cmdBufIdSlot_]; - submitInfo.doppRefCount = palDoppRefs_.size(); - submitInfo.pDoppRefs = palDoppRefs_.data(); - - // Submit command buffer to OS - if (Pal::Result::Success != iQueue_->Submit(submitInfo)) { - LogError("PAL failed to submit CMD!"); - return false; - } - if (GPU_FLUSH_ON_EXECUTION) { - waifForFence(cmdBufIdSlot_); - } - - // Reset the counter of commands - cmdCnt_ = 0; - - // Find the next command buffer - cmdBufIdCurrent_++; - - if (cmdBufIdCurrent_ == GpuEvent::InvalidID) { - // Wait for the last one - waifForFence(cmdBufIdSlot_); - cmdBufIdCurrent_ = 1; - cmbBufIdRetired_ = 0; - } - - // Wrap current slot - cmdBufIdSlot_ = cmdBufIdCurrent_ % MaxCmdBuffers; - - // Make sure the slot isn't busy - constexpr bool IbReuse = true; - waifForFence(cmdBufIdSlot_, IbReuse); - - // Progress retired TS - if ((cmdBufIdCurrent_ > MaxCmdBuffers) && - (cmbBufIdRetired_ < (cmdBufIdCurrent_ - MaxCmdBuffers))) { - cmbBufIdRetired_ = cmdBufIdCurrent_ - MaxCmdBuffers; - } - - // Reset command buffer, so CB chunks could be reused - if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->Reset(nullptr, false)) { - LogError("PAL failed CB reset!"); - return false; - } - // Start command buffer building - Pal::CmdBufferBuildInfo cmdBuildInfo = {}; - cmdBuildInfo.pMemAllocator = &vlAlloc_; - if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->Begin(cmdBuildInfo)) { - LogError("PAL failed CB building initialization!"); - return false; - } - - // Clear dopp references - palDoppRefs_.resize(0); - - palMems_.resize(0); - // Remove old memory references - for (auto it = memReferences_.begin(); it != memReferences_.end();) { - if (it->second == 
cmdBufIdSlot_) { - palMems_.push_back(it->first); - it = memReferences_.erase(it); - } - else { - ++it; - } - } - if (palMems_.size() != 0) { - iDev_->RemoveGpuMemoryReferences(palMems_.size(), &palMems_[0], iQueue_); - } - - return true; + } + return id; } -bool -VirtualGPU::Queue::waitForEvent(uint id) -{ - if (isDone(id)) { - return true; +bool VirtualGPU::Queue::flush() { + palMemRefs_.resize(0); + // Stop commands building + if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->End()) { + LogError("PAL failed to finalize a command buffer!"); + return false; + } + + // Add memory references + for (auto it = memReferences_.begin(); it != memReferences_.end(); ++it) { + if (it->second & FirstMemoryReference) { + it->second &= ~FirstMemoryReference; + Pal::GpuMemoryRef memRef = {}; + memRef.pGpuMemory = it->first; + palMemRefs_.push_back(memRef); } + } - uint slotId = id % MaxCmdBuffers; - bool result = waifForFence(slotId); - cmbBufIdRetired_ = id; - return result; -} - -bool -VirtualGPU::Queue::isDone(uint id) -{ - if ((id <= cmbBufIdRetired_) || (id > cmdBufIdCurrent_)) { - return true; - } - - if (id == cmdBufIdCurrent_) { - // Flush the current command buffer - flush(); - } - - if (Pal::Result::Success != iCmdFences_[id % MaxCmdBuffers]->GetStatus()) { - return false; - } - cmbBufIdRetired_ = id; - return true; -} - -void -VirtualGPU::Queue::DumpMemoryReferences() const -{ - std::fstream dump; - std::stringstream file_name("ocl_hang_dump.txt"); - uint64_t start = amd::Os::timeNanos() / 1e9; - - dump.open(file_name.str().c_str(), (std::fstream::out | std::fstream::app)); - // Check if we have OpenCL program - if (dump.is_open()) { - dump << start << " Queue: "; - switch (iQueue_->Type()) { - case Pal::QueueTypeCompute: - dump << "Compute"; - break; - case Pal::QueueTypeDma: - dump << "SDMA"; - break; - default: - dump << "unknown"; - break; - } - dump << "\n" << "Resident memory resources:\n"; - uint idx = 0; - for (auto it : memReferences_) { - dump << " " 
<< idx << "\t["; - dump.setf(std::ios::hex, std::ios::basefield); - dump.setf(std::ios::showbase); - dump << (it.first)->Desc().gpuVirtAddr << ", " << - (it.first)->Desc().gpuVirtAddr + (it.first)->Desc().size; - dump.setf(std::ios::dec); - dump << "] CbId:" << it.second << "\n"; - idx++; - } - } - if (last_kernel_ != nullptr) { - const amd::KernelSignature& signature = last_kernel_->signature(); - const amd::KernelParameters& params = last_kernel_->parameters(); - dump << last_kernel_->name() << std::endl; - for (size_t i = 0; i < signature.numParameters(); ++i) { - const amd::KernelParameterDescriptor& desc = signature.at(i); - // Find if the current argument is a memory object - if ((desc.type_ == T_POINTER) && - (desc.addressQualifier_ != CL_KERNEL_ARG_ADDRESS_LOCAL)) { - dump << " " << desc.name_ << ": " << std::endl; - } - } - } - dump.close(); -} - -bool -VirtualGPU::MemoryDependency::create(size_t numMemObj) -{ - if (numMemObj > 0) { - // Allocate the array of memory objects for dependency tracking - memObjectsInQueue_ = new MemoryState[numMemObj]; - if (nullptr == memObjectsInQueue_) { - return false; - } - memset(memObjectsInQueue_, 0, sizeof(MemoryState) * numMemObj); - maxMemObjectsInQueue_ = numMemObj; - } - - return true; -} - -void -VirtualGPU::MemoryDependency::validate( - VirtualGPU& gpu, - const Memory* memory, - bool readOnly) -{ - bool flushL1Cache = false; - - if (maxMemObjectsInQueue_ == 0) { - // Flush cache - gpu.flushCUCaches(); - return; - } - - uint64_t curStart = memory->vmAddress(); - uint64_t curEnd = curStart + memory->vmSize(); - - // Loop through all memory objects in the queue and find dependency - // @note don't include objects from the current kernel - for (size_t j = 0; j < endMemObjectsInQueue_; ++j) { - // Check if the queue already contains this mem object and - // GPU operations aren't readonly - uint64_t busyStart = memObjectsInQueue_[j].start_; - uint64_t busyEnd = memObjectsInQueue_[j].end_; - - // Check if the start 
inside the busy region - if ((((curStart >= busyStart) && (curStart < busyEnd)) || - // Check if the end inside the busy region - ((curEnd > busyStart) && (curEnd <= busyEnd)) || - // Check if the start/end cover the busy region - ((curStart <= busyStart) && (curEnd >= busyEnd))) && - // If the buys region was written or the current one is for write - (!memObjectsInQueue_[j].readOnly_ || !readOnly)) { - flushL1Cache = true; - break; - } - } - - // Did we reach the limit? - if (maxMemObjectsInQueue_ <= (numMemObjectsInQueue_ + 1)) { - flushL1Cache = true; - } - - if (flushL1Cache) { - // Flush cache - gpu.flushCUCaches(); - - // Clear memory dependency state - const static bool All = true; - clear(!All); - } - - // Insert current memory object into the queue always, - // since runtime calls flush before kernel execution and it has to keep - // current kernel in tracking - memObjectsInQueue_ - [numMemObjectsInQueue_].start_ = curStart; - memObjectsInQueue_ - [numMemObjectsInQueue_].end_ = curEnd; - memObjectsInQueue_ - [numMemObjectsInQueue_].readOnly_ = readOnly; - numMemObjectsInQueue_++; -} - -void -VirtualGPU::MemoryDependency::clear(bool all) -{ - if (numMemObjectsInQueue_ > 0) { - size_t i, j; - if (all) { - endMemObjectsInQueue_ = numMemObjectsInQueue_; - } - - // Preserve all objects from the current kernel - for (i = 0, j = endMemObjectsInQueue_; j < numMemObjectsInQueue_; i++, j++) { - memObjectsInQueue_[i].start_ = memObjectsInQueue_[j].start_; - memObjectsInQueue_[i].end_ = memObjectsInQueue_[j].end_; - memObjectsInQueue_[i].readOnly_ = memObjectsInQueue_[j].readOnly_; - } - // Clear all objects except current kernel - memset(&memObjectsInQueue_[i], 0, sizeof(amd::Memory*) * numMemObjectsInQueue_); - numMemObjectsInQueue_ -= endMemObjectsInQueue_; - endMemObjectsInQueue_ = 0; - } -} - -VirtualGPU::DmaFlushMgmt::DmaFlushMgmt(const Device& dev) - : cbWorkload_(0) - , dispatchSplitSize_(0) -{ - aluCnt_ = dev.info().simdPerCU_ * dev.info().simdWidth_ * 
dev.info().maxComputeUnits_; - maxDispatchWorkload_ = static_cast(dev.info().maxClockFrequency_) * - // find time in us - dev.settings().maxWorkloadTime_ * - aluCnt_; - resetCbWorkload(dev); -} - -void -VirtualGPU::DmaFlushMgmt::resetCbWorkload(const Device& dev) -{ - cbWorkload_ = 0; - maxCbWorkload_ = static_cast(dev.info().maxClockFrequency_) * - // find time in us - dev.settings().minWorkloadTime_ * aluCnt_; -} - -void -VirtualGPU::DmaFlushMgmt::findSplitSize( - const Device& dev, uint64_t threads, uint instructions) -{ - uint64_t workload = threads * instructions; - if (maxDispatchWorkload_ < workload) { - dispatchSplitSize_ = static_cast(maxDispatchWorkload_ / instructions); - uint fullLoad = dev.info().maxComputeUnits_ * dev.info().maxWorkGroupSize_; - if ((dispatchSplitSize_ % fullLoad) != 0) { - dispatchSplitSize_ = (dispatchSplitSize_ / fullLoad + 1) * fullLoad; - } - } - else { - dispatchSplitSize_ = (threads > dev.settings().workloadSplitSize_) ? - dev.settings().workloadSplitSize_ : 0; - } -} - -bool -VirtualGPU::DmaFlushMgmt::isCbReady( - VirtualGPU& gpu, uint64_t threads, uint instructions) -{ - bool cbReady = false; - uint64_t workload = amd::alignUp(threads, 4 * aluCnt_) * instructions; - // Add current workload to the overall workload in the current DMA - cbWorkload_ += workload; - // Did it exceed maximum? 
- if (cbWorkload_ > maxCbWorkload_) { - // Reset DMA workload - cbWorkload_ = 0; - // Increase workload of the next DMA buffer by 50% - maxCbWorkload_ = maxCbWorkload_ * 3 / 2; - if (maxCbWorkload_ > maxDispatchWorkload_) { - maxCbWorkload_ = maxDispatchWorkload_; - } - cbReady = true; - } - return cbReady; -} - -void -VirtualGPU::addXferWrite(Memory& memory) -{ - if (xferWriteBuffers_.size() > 7) { - dev().xferWrite().release(*this, *xferWriteBuffers_.front()); - xferWriteBuffers_.erase(xferWriteBuffers_.begin()); - } - - // Delay destruction - xferWriteBuffers_.push_back(&memory); -} - -void -VirtualGPU::releaseXferWrite() -{ - for (auto& memory : xferWriteBuffers_) { - dev().xferWrite().release(*this, *memory); - } - xferWriteBuffers_.resize(0); -} - -void -VirtualGPU::addPinnedMem(amd::Memory* mem) -{ - if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) { - if (pinnedMems_.size() > 7) { - pinnedMems_.front()->release(); - pinnedMems_.erase(pinnedMems_.begin()); - } - - // Start operation, since we should release mem object - flushDMA(getGpuEvent(dev().getGpuMemory(mem)->iMem())->engineId_); - - // Delay destruction - pinnedMems_.push_back(mem); - } -} - -void -VirtualGPU::releasePinnedMem() -{ - for (auto& amdMemory : pinnedMems_) { - amdMemory->release(); - } - pinnedMems_.resize(0); -} - -amd::Memory* -VirtualGPU::findPinnedMem(void* addr, size_t size) -{ - for (auto& amdMemory : pinnedMems_) { - if ((amdMemory->getHostMem() == addr) && (size <= amdMemory->getSize())) { - return amdMemory; - } - } - return nullptr; -} - -bool -VirtualGPU::createVirtualQueue(uint deviceQueueSize) -{ - uint MinDeviceQueueSize = 16 * 1024; - deviceQueueSize = std::max(deviceQueueSize, MinDeviceQueueSize); - - maskGroups_ = deviceQueueSize / (512 * Ki); - maskGroups_ = (maskGroups_== 0) ? 1 : maskGroups_; - - // Align the queue size for the multiple dispatch scheduler. 
- // Each thread works with 32 entries * maskGroups - uint extra = deviceQueueSize % (sizeof(AmdAqlWrap) * - DeviceQueueMaskSize * maskGroups_); - if (extra != 0) { - deviceQueueSize += (sizeof(AmdAqlWrap) * - DeviceQueueMaskSize * maskGroups_) - extra; - } - - if (deviceQueueSize_ == deviceQueueSize) { - return true; - } - else { - //! @todo Temporarily keep the buffer mapped for debug purpose - if (nullptr != schedParams_) { - schedParams_->unmap(this); - } - delete vqHeader_; - delete virtualQueue_; - delete schedParams_; - vqHeader_ = nullptr; - virtualQueue_ = nullptr; - schedParams_ = nullptr; - schedParamIdx_ = 0; - deviceQueueSize_ = 0; - } - uint numSlots = deviceQueueSize / sizeof(AmdAqlWrap); - uint allocSize = deviceQueueSize; - - // Add the virtual queue header - allocSize += sizeof(AmdVQueueHeader); - allocSize = amd::alignUp(allocSize, sizeof(AmdAqlWrap)); - - uint argOffs = allocSize; - - // Add the kernel arguments and wait events - uint singleArgSize = amd::alignUp(dev().info().maxParameterSize_ + 64 + - dev().settings().numWaitEvents_ * sizeof(uint64_t), sizeof(AmdAqlWrap)); - allocSize += singleArgSize * numSlots; - - uint eventsOffs = allocSize; - // Add the device events - allocSize += dev().settings().numDeviceEvents_ * sizeof(AmdEvent); - - uint eventMaskOffs = allocSize; - // Add mask array for events - allocSize += amd::alignUp(dev().settings().numDeviceEvents_, DeviceQueueMaskSize) / 8; - - uint slotMaskOffs = allocSize; - // Add mask array for AmdAqlWrap slots - allocSize += amd::alignUp(numSlots, DeviceQueueMaskSize) / 8; - - virtualQueue_ = new Memory(dev(), allocSize); - Resource::MemoryType type = (GPU_PRINT_CHILD_KERNEL == 0) ? - Resource::Local : Resource::Remote; - if ((virtualQueue_ == nullptr) || !virtualQueue_->create(type)) { - return false; - } - - if (GPU_PRINT_CHILD_KERNEL != 0) { - address ptr = reinterpret_cast
( - virtualQueue_->map(this, Resource::WriteOnly)); - if (nullptr == ptr) { - return false; - } - } - - uint64_t vaBase = virtualQueue_->vmAddress(); - AmdVQueueHeader header = {}; - // Initialize the virtual queue header - header.aql_slot_num = numSlots; - header.event_slot_num = dev().settings().numDeviceEvents_; - header.event_slot_mask = vaBase + eventMaskOffs; - header.event_slots = vaBase + eventsOffs; - header.aql_slot_mask = vaBase + slotMaskOffs; - header.wait_size = dev().settings().numWaitEvents_; - header.arg_size = dev().info().maxParameterSize_ + 64; - header.mask_groups = maskGroups_; - - vqHeader_ = new AmdVQueueHeader; - if (nullptr == vqHeader_) { - return false; - } - *vqHeader_ = header; - - virtualQueue_->writeRawData(*this, 0, sizeof(AmdVQueueHeader), &header, false); - - // Go over all slots and perform initialization - AmdAqlWrap slot = {}; - size_t offset = sizeof(AmdVQueueHeader); - for (uint i = 0; i < numSlots; ++i) { - uint64_t argStart = vaBase + argOffs + i * singleArgSize; - slot.aql.kernarg_address = reinterpret_cast(argStart); - slot.wait_list = argStart + dev().info().maxParameterSize_ + 64; - virtualQueue_->writeRawData(*this, offset, sizeof(AmdAqlWrap), &slot, false); - offset += sizeof(AmdAqlWrap); - } - - schedParams_ = new Memory(dev(), 64 * Ki); - if ((schedParams_ == nullptr) || !schedParams_->create(Resource::RemoteUSWC)) { - return false; - } - - address ptr = reinterpret_cast
(schedParams_->map(this)); - - deviceQueueSize_ = deviceQueueSize; - - return true; -} - -VirtualGPU::VirtualGPU( - Device& device) - : device::VirtualDevice(device) - , engineID_(MainEngine) - , gpuDevice_(static_cast(device)) - , execution_("Virtual GPU execution lock", true) - , printfDbg_(nullptr) - , printfDbgHSA_(nullptr) - , tsCache_(nullptr) - , dmaFlushMgmt_(device) - , hwRing_(0) - , readjustTimeGPU_(0) - , currTs_(nullptr) - , vqHeader_(nullptr) - , virtualQueue_(nullptr) - , schedParams_(nullptr) - , schedParamIdx_(0) - , deviceQueueSize_(0) - , maskGroups_(1) - , hsaQueueMem_(nullptr) - , cmdAllocator_(nullptr) -{ - memset(&cal_, 0, sizeof(CalVirtualDesc)); - for (uint i = 0; i < AllEngines; ++i) { - cal_.events_[i].invalidate(); - } - - // Note: Virtual GPU device creation must be a thread safe operation - index_ = gpuDevice_.numOfVgpus_++; - gpuDevice_.vgpus_.resize(gpuDevice_.numOfVgpus()); - gpuDevice_.vgpus_[index()] = this; - queues_[MainEngine] = nullptr; - queues_[SdmaEngine] = nullptr; -} - -bool -VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, - amd::CommandQueue::Priority priority) -{ - device::BlitManager::Setup blitSetup; - - if (index() >= GPU_MAX_COMMAND_QUEUES) { - // Cap the maximum number of concurrent Virtual GPUs - return false; - } - - // Virtual GPU will have profiling enabled - state_.profiling_ = profiling; - - Pal::CmdAllocatorCreateInfo createInfo = {}; - createInfo.flags.threadSafe = true; - // \todo forces PAL to reuse CBs, but requires postamble - createInfo.flags.autoMemoryReuse = false; - createInfo.allocInfo[Pal::CommandDataAlloc].allocHeap = - Pal::GpuHeapGartCacheable; - createInfo.allocInfo[Pal::CommandDataAlloc].allocSize = 128 * Ki; - createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize = 128 * Ki; - - createInfo.allocInfo[Pal::EmbeddedDataAlloc].allocHeap = - Pal::GpuHeapGartCacheable; - createInfo.allocInfo[Pal::EmbeddedDataAlloc].allocSize = 64 * Ki; - 
createInfo.allocInfo[Pal::EmbeddedDataAlloc].suballocSize = 64 * Ki; - - Pal::Result result; - size_t cmdAllocSize = dev().iDev()->GetCmdAllocatorSize(createInfo, &result); - if (Pal::Result::Success != result) { - return false; - } - char* addr = new char [cmdAllocSize]; + if (palMemRefs_.size() != 0) { if (Pal::Result::Success != - dev().iDev()->CreateCmdAllocator(createInfo, addr, &cmdAllocator_)) { - return false; + iDev_->AddGpuMemoryReferences(palMemRefs_.size(), &palMemRefs_[0], iQueue_, + Pal::GpuMemoryRefCantTrim)) { + LogError("PAL failed to make resident resources!"); + return false; } + } - if (dev().numComputeEngines()) { - //! @todo There is a hang with a mix of user and non user queues. - //! Currently there is no simple way to detect which queue is what. - //! Disable first for now. - const uint firstQueue = (dev().numComputeEngines() > 2) ? 1 : 0; - uint idx = index() % (dev().numComputeEngines() - firstQueue); + // Reset the fence. PAL will reset OS event + if (Pal::Result::Success != iDev_->ResetFences(1, &iCmdFences_[cmdBufIdSlot_])) { + LogError("PAL failed to reset a fence!"); + return false; + } - // hwRing_ should be set 0 if forced to have single scratch buffer - hwRing_ = (dev().settings().useSingleScratch_) ? 
0 : idx; + Pal::SubmitInfo submitInfo = {}; + submitInfo.cmdBufferCount = 1; + submitInfo.ppCmdBuffers = &iCmdBuffs_[cmdBufIdSlot_]; + submitInfo.pFence = iCmdFences_[cmdBufIdSlot_]; + submitInfo.doppRefCount = palDoppRefs_.size(); + submitInfo.pDoppRefs = palDoppRefs_.data(); - queues_[MainEngine] = Queue::Create( - dev().iDev(), Pal::QueueTypeCompute, idx + firstQueue, cmdAllocator_, rtCUs, priority); - if (nullptr == queues_[MainEngine]) { - return false; - } + // Submit command buffer to OS + if (Pal::Result::Success != iQueue_->Submit(submitInfo)) { + LogError("PAL failed to submit CMD!"); + return false; + } + if (GPU_FLUSH_ON_EXECUTION) { + waifForFence(cmdBufIdSlot_); + } - // Check if device has SDMA engines - if (dev().numDMAEngines() != 0) { - uint sdma; - // If only 1 DMA engine is available then use that one - if ((dev().numDMAEngines() < 2) || ((idx & 0x1) && !dev().settings().svmFineGrainSystem_)) { - sdma = 0; - } - else { - sdma = 1; - } + // Reset the counter of commands + cmdCnt_ = 0; - queues_[SdmaEngine] = Queue::Create( - dev().iDev(), Pal::QueueTypeDma, sdma, cmdAllocator_, - amd::CommandQueue::RealTimeDisabled, amd::CommandQueue::Priority::Normal); - if (nullptr == queues_[SdmaEngine]) { - return false; - } - } - else { - Unimplemented(); - } - } - else { - Unimplemented(); + // Find the next command buffer + cmdBufIdCurrent_++; + + if (cmdBufIdCurrent_ == GpuEvent::InvalidID) { + // Wait for the last one + waifForFence(cmdBufIdSlot_); + cmdBufIdCurrent_ = 1; + cmbBufIdRetired_ = 0; + } + + // Wrap current slot + cmdBufIdSlot_ = cmdBufIdCurrent_ % MaxCmdBuffers; + + // Make sure the slot isn't busy + constexpr bool IbReuse = true; + waifForFence(cmdBufIdSlot_, IbReuse); + + // Progress retired TS + if ((cmdBufIdCurrent_ > MaxCmdBuffers) && + (cmbBufIdRetired_ < (cmdBufIdCurrent_ - MaxCmdBuffers))) { + cmbBufIdRetired_ = cmdBufIdCurrent_ - MaxCmdBuffers; + } + + // Reset command buffer, so CB chunks could be reused + if (Pal::Result::Success 
!= iCmdBuffs_[cmdBufIdSlot_]->Reset(nullptr, false)) { + LogError("PAL failed CB reset!"); + return false; + } + // Start command buffer building + Pal::CmdBufferBuildInfo cmdBuildInfo = {}; + cmdBuildInfo.pMemAllocator = &vlAlloc_; + if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->Begin(cmdBuildInfo)) { + LogError("PAL failed CB building initialization!"); + return false; + } + + // Clear dopp references + palDoppRefs_.resize(0); + + palMems_.resize(0); + // Remove old memory references + for (auto it = memReferences_.begin(); it != memReferences_.end();) { + if (it->second == cmdBufIdSlot_) { + palMems_.push_back(it->first); + it = memReferences_.erase(it); + } else { + ++it; } + } + if (palMems_.size() != 0) { + iDev_->RemoveGpuMemoryReferences(palMems_.size(), &palMems_[0], iQueue_); + } - // Diable double copy optimization, - // since UAV read from nonlocal is fast enough - blitSetup.disableCopyBufferToImageOpt_ = true; - if (!allocConstantBuffers()) { - return false; - } + return true; +} - // Create Printf class - printfDbg_ = new PrintfDbg(gpuDevice_); - if ((nullptr == printfDbg_) || !printfDbg_->create()) { - delete printfDbg_; - LogError("Could not allocate debug buffer for printf()!"); - return false; - } +bool VirtualGPU::Queue::waitForEvent(uint id) { + if (isDone(id)) { + return true; + } - // Create HSAILPrintf class - printfDbgHSA_ = new PrintfDbgHSA(gpuDevice_); - if (nullptr == printfDbgHSA_) { - delete printfDbgHSA_; - LogError("Could not create PrintfDbgHSA class!"); - return false; - } + uint slotId = id % MaxCmdBuffers; + bool result = waifForFence(slotId); + cmbBufIdRetired_ = id; + return result; +} - tsCache_ = new TimeStampCache(*this); - if (nullptr == tsCache_) { - LogError("Could not create TimeStamp cache!"); - return false; - } +bool VirtualGPU::Queue::isDone(uint id) { + if ((id <= cmbBufIdRetired_) || (id > cmdBufIdCurrent_)) { + return true; + } - if (!memoryDependency().create(dev().settings().numMemDependencies_)) { - 
LogError("Could not create the array of memory objects!"); - return false; - } + if (id == cmdBufIdCurrent_) { + // Flush the current command buffer + flush(); + } - if(!allocHsaQueueMem()) { - LogError("Could not create hsaQueueMem object!"); - return false; - } + if (Pal::Result::Success != iCmdFences_[id % MaxCmdBuffers]->GetStatus()) { + return false; + } + cmbBufIdRetired_ = id; + return true; +} - // Check if the app requested a device queue creation - if (dev().settings().useDeviceQueue_ && - (0 != deviceQueueSize) && !createVirtualQueue(deviceQueueSize)) { - LogError("Could not create a virtual queue!"); - return false; - } +void VirtualGPU::Queue::DumpMemoryReferences() const { + std::fstream dump; + std::stringstream file_name("ocl_hang_dump.txt"); + uint64_t start = amd::Os::timeNanos() / 1e9; - // Choose the appropriate class for blit engine - switch (dev().settings().blitEngine_) { - default: - // Fall through ... - case Settings::BlitEngineHost: - blitSetup.disableAll(); - // Fall through ... 
- case Settings::BlitEngineCAL: - case Settings::BlitEngineKernel: - // use host blit for HW debug - if (dev().settings().enableHwDebug_) { - blitSetup.disableCopyImageToBuffer_ = true; - blitSetup.disableCopyBufferToImage_ = true; - } - blitMgr_ = new KernelBlitManager(*this, blitSetup); + dump.open(file_name.str().c_str(), (std::fstream::out | std::fstream::app)); + // Check if we have OpenCL program + if (dump.is_open()) { + dump << start << " Queue: "; + switch (iQueue_->Type()) { + case Pal::QueueTypeCompute: + dump << "Compute"; + break; + case Pal::QueueTypeDma: + dump << "SDMA"; + break; + default: + dump << "unknown"; break; } - if ((nullptr == blitMgr_) || !blitMgr_->create(gpuDevice_)) { - LogError("Could not create BlitManager!"); - return false; + dump << "\n" + << "Resident memory resources:\n"; + uint idx = 0; + for (auto it : memReferences_) { + dump << " " << idx << "\t["; + dump.setf(std::ios::hex, std::ios::basefield); + dump.setf(std::ios::showbase); + dump << (it.first)->Desc().gpuVirtAddr << ", " + << (it.first)->Desc().gpuVirtAddr + (it.first)->Desc().size; + dump.setf(std::ios::dec); + dump << "] CbId:" << it.second << "\n"; + idx++; } - - return true; + } + if (last_kernel_ != nullptr) { + const amd::KernelSignature& signature = last_kernel_->signature(); + const amd::KernelParameters& params = last_kernel_->parameters(); + dump << last_kernel_->name() << std::endl; + for (size_t i = 0; i < signature.numParameters(); ++i) { + const amd::KernelParameterDescriptor& desc = signature.at(i); + // Find if the current argument is a memory object + if ((desc.type_ == T_POINTER) && (desc.addressQualifier_ != CL_KERNEL_ARG_ADDRESS_LOCAL)) { + dump << " " << desc.name_ << ": " << std::endl; + } + } + } + dump.close(); } -bool -VirtualGPU::allocHsaQueueMem() -{ - // Allocate a dummy HSA queue - hsaQueueMem_ = new Memory(dev(), sizeof(amd_queue_t)); - if ((hsaQueueMem_ == nullptr) || - (!hsaQueueMem_->create(Resource::RemoteUSWC))) { - delete 
hsaQueueMem_; - return false; +bool VirtualGPU::MemoryDependency::create(size_t numMemObj) { + if (numMemObj > 0) { + // Allocate the array of memory objects for dependency tracking + memObjectsInQueue_ = new MemoryState[numMemObj]; + if (nullptr == memObjectsInQueue_) { + return false; } - amd_queue_t* queue = reinterpret_cast - (hsaQueueMem_->map(nullptr, Resource::WriteOnly)); - if (nullptr == queue) { - delete hsaQueueMem_; - return false; - } - memset(queue, 0, sizeof(amd_queue_t)); + memset(memObjectsInQueue_, 0, sizeof(MemoryState) * numMemObj); + maxMemObjectsInQueue_ = numMemObj; + } - // Provide private and local heap addresses - const static uint addressShift = LP64_SWITCH(0, 32); - queue->private_segment_aperture_base_hi = static_cast( - dev().properties().gpuMemoryProperties.privateApertureBase >> addressShift); - queue->group_segment_aperture_base_hi = static_cast( - dev().properties().gpuMemoryProperties.sharedApertureBase >> addressShift); - - hsaQueueMem_->unmap(nullptr); - return true; + return true; } -VirtualGPU::~VirtualGPU() -{ - // Not safe to remove a queue. 
So lock the device - amd::ScopedLock k(dev().lockAsyncOps()); - amd::ScopedLock lock(dev().vgpusAccess()); +void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memory, bool readOnly) { + bool flushL1Cache = false; - // Destroy all memories - static const bool SkipScratch = false; - releaseMemObjects(SkipScratch); + if (maxMemObjectsInQueue_ == 0) { + // Flush cache + gpu.flushCUCaches(); + return; + } - while (!freeCbQueue_.empty()) { - auto cb = freeCbQueue_.front(); - delete cb; - freeCbQueue_.pop(); + uint64_t curStart = memory->vmAddress(); + uint64_t curEnd = curStart + memory->vmSize(); + + // Loop through all memory objects in the queue and find dependency + // @note don't include objects from the current kernel + for (size_t j = 0; j < endMemObjectsInQueue_; ++j) { + // Check if the queue already contains this mem object and + // GPU operations aren't readonly + uint64_t busyStart = memObjectsInQueue_[j].start_; + uint64_t busyEnd = memObjectsInQueue_[j].end_; + + // Check if the start inside the busy region + if ((((curStart >= busyStart) && (curStart < busyEnd)) || + // Check if the end inside the busy region + ((curEnd > busyStart) && (curEnd <= busyEnd)) || + // Check if the start/end cover the busy region + ((curStart <= busyStart) && (curEnd >= busyEnd))) && + // If the buys region was written or the current one is for write + (!memObjectsInQueue_[j].readOnly_ || !readOnly)) { + flushL1Cache = true; + break; + } + } + + // Did we reach the limit? 
+ if (maxMemObjectsInQueue_ <= (numMemObjectsInQueue_ + 1)) { + flushL1Cache = true; + } + + if (flushL1Cache) { + // Flush cache + gpu.flushCUCaches(); + + // Clear memory dependency state + const static bool All = true; + clear(!All); + } + + // Insert current memory object into the queue always, + // since runtime calls flush before kernel execution and it has to keep + // current kernel in tracking + memObjectsInQueue_[numMemObjectsInQueue_].start_ = curStart; + memObjectsInQueue_[numMemObjectsInQueue_].end_ = curEnd; + memObjectsInQueue_[numMemObjectsInQueue_].readOnly_ = readOnly; + numMemObjectsInQueue_++; +} + +void VirtualGPU::MemoryDependency::clear(bool all) { + if (numMemObjectsInQueue_ > 0) { + size_t i, j; + if (all) { + endMemObjectsInQueue_ = numMemObjectsInQueue_; } - // Destroy printf object - delete printfDbg_; + // Preserve all objects from the current kernel + for (i = 0, j = endMemObjectsInQueue_; j < numMemObjectsInQueue_; i++, j++) { + memObjectsInQueue_[i].start_ = memObjectsInQueue_[j].start_; + memObjectsInQueue_[i].end_ = memObjectsInQueue_[j].end_; + memObjectsInQueue_[i].readOnly_ = memObjectsInQueue_[j].readOnly_; + } + // Clear all objects except current kernel + memset(&memObjectsInQueue_[i], 0, sizeof(amd::Memory*) * numMemObjectsInQueue_); + numMemObjectsInQueue_ -= endMemObjectsInQueue_; + endMemObjectsInQueue_ = 0; + } +} - // Destroy printfHSA object - delete printfDbgHSA_; +VirtualGPU::DmaFlushMgmt::DmaFlushMgmt(const Device& dev) : cbWorkload_(0), dispatchSplitSize_(0) { + aluCnt_ = dev.info().simdPerCU_ * dev.info().simdWidth_ * dev.info().maxComputeUnits_; + maxDispatchWorkload_ = static_cast(dev.info().maxClockFrequency_) * + // find time in us + dev.settings().maxWorkloadTime_ * aluCnt_; + resetCbWorkload(dev); +} - // Destroy BlitManager object - delete blitMgr_; +void VirtualGPU::DmaFlushMgmt::resetCbWorkload(const Device& dev) { + cbWorkload_ = 0; + maxCbWorkload_ = static_cast(dev.info().maxClockFrequency_) * + // 
find time in us + dev.settings().minWorkloadTime_ * aluCnt_; +} - // Destroy TimeStamp cache - delete tsCache_; +void VirtualGPU::DmaFlushMgmt::findSplitSize(const Device& dev, uint64_t threads, + uint instructions) { + uint64_t workload = threads * instructions; + if (maxDispatchWorkload_ < workload) { + dispatchSplitSize_ = static_cast(maxDispatchWorkload_ / instructions); + uint fullLoad = dev.info().maxComputeUnits_ * dev.info().maxWorkGroupSize_; + if ((dispatchSplitSize_ % fullLoad) != 0) { + dispatchSplitSize_ = (dispatchSplitSize_ / fullLoad + 1) * fullLoad; + } + } else { + dispatchSplitSize_ = + (threads > dev.settings().workloadSplitSize_) ? dev.settings().workloadSplitSize_ : 0; + } +} - // Destroy resource list with the constant buffers - for (uint i = 0; i < constBufs_.size(); ++i) { - delete constBufs_[i]; +bool VirtualGPU::DmaFlushMgmt::isCbReady(VirtualGPU& gpu, uint64_t threads, uint instructions) { + bool cbReady = false; + uint64_t workload = amd::alignUp(threads, 4 * aluCnt_) * instructions; + // Add current workload to the overall workload in the current DMA + cbWorkload_ += workload; + // Did it exceed maximum? 
+ if (cbWorkload_ > maxCbWorkload_) { + // Reset DMA workload + cbWorkload_ = 0; + // Increase workload of the next DMA buffer by 50% + maxCbWorkload_ = maxCbWorkload_ * 3 / 2; + if (maxCbWorkload_ > maxDispatchWorkload_) { + maxCbWorkload_ = maxDispatchWorkload_; + } + cbReady = true; + } + return cbReady; +} + +void VirtualGPU::addXferWrite(Memory& memory) { + if (xferWriteBuffers_.size() > 7) { + dev().xferWrite().release(*this, *xferWriteBuffers_.front()); + xferWriteBuffers_.erase(xferWriteBuffers_.begin()); + } + + // Delay destruction + xferWriteBuffers_.push_back(&memory); +} + +void VirtualGPU::releaseXferWrite() { + for (auto& memory : xferWriteBuffers_) { + dev().xferWrite().release(*this, *memory); + } + xferWriteBuffers_.resize(0); +} + +void VirtualGPU::addPinnedMem(amd::Memory* mem) { + if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) { + if (pinnedMems_.size() > 7) { + pinnedMems_.front()->release(); + pinnedMems_.erase(pinnedMems_.begin()); } - // Destroy queues - if (nullptr != queues_[MainEngine]) { - // Make sure the queues are idle - // It's unclear why PAL could still have a busy queue - queues_[MainEngine]->iQueue_->WaitIdle(); - delete queues_[MainEngine]; - } + // Start operation, since we should release mem object + flushDMA(getGpuEvent(dev().getGpuMemory(mem)->iMem())->engineId_); - if (nullptr != queues_[SdmaEngine]) { - queues_[SdmaEngine]->iQueue_->WaitIdle(); - delete queues_[SdmaEngine]; - } + // Delay destruction + pinnedMems_.push_back(mem); + } +} - if (nullptr != cmdAllocator_) { - cmdAllocator_->Destroy(); - delete [] reinterpret_cast(cmdAllocator_); - } +void VirtualGPU::releasePinnedMem() { + for (auto& amdMemory : pinnedMems_) { + amdMemory->release(); + } + pinnedMems_.resize(0); +} - gpuDevice_.numOfVgpus_--; - gpuDevice_.vgpus_.erase(gpuDevice_.vgpus_.begin() + index()); - for (uint idx = index(); idx < dev().vgpus().size(); ++idx) { - dev().vgpus()[idx]->index_--; +amd::Memory* 
VirtualGPU::findPinnedMem(void* addr, size_t size) { + for (auto& amdMemory : pinnedMems_) { + if ((amdMemory->getHostMem() == addr) && (size <= amdMemory->getSize())) { + return amdMemory; } + } + return nullptr; +} - // Release scratch buffer memory to reduce memory pressure - //!@note OCLtst uses single device with multiple tests - //! Release memory only if it's the last command queue. - //! The first queue is reserved for the transfers on device - if (gpuDevice_.numOfVgpus_ <= 1) { - gpuDevice_.destroyScratchBuffers(); - } +bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) { + uint MinDeviceQueueSize = 16 * 1024; + deviceQueueSize = std::max(deviceQueueSize, MinDeviceQueueSize); + maskGroups_ = deviceQueueSize / (512 * Ki); + maskGroups_ = (maskGroups_ == 0) ? 1 : maskGroups_; + + // Align the queue size for the multiple dispatch scheduler. + // Each thread works with 32 entries * maskGroups + uint extra = deviceQueueSize % (sizeof(AmdAqlWrap) * DeviceQueueMaskSize * maskGroups_); + if (extra != 0) { + deviceQueueSize += (sizeof(AmdAqlWrap) * DeviceQueueMaskSize * maskGroups_) - extra; + } + + if (deviceQueueSize_ == deviceQueueSize) { + return true; + } else { //! 
@todo Temporarily keep the buffer mapped for debug purpose if (nullptr != schedParams_) { - schedParams_->unmap(this); + schedParams_->unmap(this); } delete vqHeader_; delete virtualQueue_; delete schedParams_; + vqHeader_ = nullptr; + virtualQueue_ = nullptr; + schedParams_ = nullptr; + schedParamIdx_ = 0; + deviceQueueSize_ = 0; + } + uint numSlots = deviceQueueSize / sizeof(AmdAqlWrap); + uint allocSize = deviceQueueSize; + + // Add the virtual queue header + allocSize += sizeof(AmdVQueueHeader); + allocSize = amd::alignUp(allocSize, sizeof(AmdAqlWrap)); + + uint argOffs = allocSize; + + // Add the kernel arguments and wait events + uint singleArgSize = amd::alignUp( + dev().info().maxParameterSize_ + 64 + dev().settings().numWaitEvents_ * sizeof(uint64_t), + sizeof(AmdAqlWrap)); + allocSize += singleArgSize * numSlots; + + uint eventsOffs = allocSize; + // Add the device events + allocSize += dev().settings().numDeviceEvents_ * sizeof(AmdEvent); + + uint eventMaskOffs = allocSize; + // Add mask array for events + allocSize += amd::alignUp(dev().settings().numDeviceEvents_, DeviceQueueMaskSize) / 8; + + uint slotMaskOffs = allocSize; + // Add mask array for AmdAqlWrap slots + allocSize += amd::alignUp(numSlots, DeviceQueueMaskSize) / 8; + + virtualQueue_ = new Memory(dev(), allocSize); + Resource::MemoryType type = (GPU_PRINT_CHILD_KERNEL == 0) ? Resource::Local : Resource::Remote; + if ((virtualQueue_ == nullptr) || !virtualQueue_->create(type)) { + return false; + } + + if (GPU_PRINT_CHILD_KERNEL != 0) { + address ptr = reinterpret_cast
(virtualQueue_->map(this, Resource::WriteOnly)); + if (nullptr == ptr) { + return false; + } + } + + uint64_t vaBase = virtualQueue_->vmAddress(); + AmdVQueueHeader header = {}; + // Initialize the virtual queue header + header.aql_slot_num = numSlots; + header.event_slot_num = dev().settings().numDeviceEvents_; + header.event_slot_mask = vaBase + eventMaskOffs; + header.event_slots = vaBase + eventsOffs; + header.aql_slot_mask = vaBase + slotMaskOffs; + header.wait_size = dev().settings().numWaitEvents_; + header.arg_size = dev().info().maxParameterSize_ + 64; + header.mask_groups = maskGroups_; + + vqHeader_ = new AmdVQueueHeader; + if (nullptr == vqHeader_) { + return false; + } + *vqHeader_ = header; + + virtualQueue_->writeRawData(*this, 0, sizeof(AmdVQueueHeader), &header, false); + + // Go over all slots and perform initialization + AmdAqlWrap slot = {}; + size_t offset = sizeof(AmdVQueueHeader); + for (uint i = 0; i < numSlots; ++i) { + uint64_t argStart = vaBase + argOffs + i * singleArgSize; + slot.aql.kernarg_address = reinterpret_cast(argStart); + slot.wait_list = argStart + dev().info().maxParameterSize_ + 64; + virtualQueue_->writeRawData(*this, offset, sizeof(AmdAqlWrap), &slot, false); + offset += sizeof(AmdAqlWrap); + } + + schedParams_ = new Memory(dev(), 64 * Ki); + if ((schedParams_ == nullptr) || !schedParams_->create(Resource::RemoteUSWC)) { + return false; + } + + address ptr = reinterpret_cast
(schedParams_->map(this)); + + deviceQueueSize_ = deviceQueueSize; + + return true; +} + +VirtualGPU::VirtualGPU(Device& device) + : device::VirtualDevice(device), + engineID_(MainEngine), + gpuDevice_(static_cast(device)), + execution_("Virtual GPU execution lock", true), + printfDbg_(nullptr), + printfDbgHSA_(nullptr), + tsCache_(nullptr), + dmaFlushMgmt_(device), + hwRing_(0), + readjustTimeGPU_(0), + currTs_(nullptr), + vqHeader_(nullptr), + virtualQueue_(nullptr), + schedParams_(nullptr), + schedParamIdx_(0), + deviceQueueSize_(0), + maskGroups_(1), + hsaQueueMem_(nullptr), + cmdAllocator_(nullptr) { + memset(&cal_, 0, sizeof(CalVirtualDesc)); + for (uint i = 0; i < AllEngines; ++i) { + cal_.events_[i].invalidate(); + } + + // Note: Virtual GPU device creation must be a thread safe operation + index_ = gpuDevice_.numOfVgpus_++; + gpuDevice_.vgpus_.resize(gpuDevice_.numOfVgpus()); + gpuDevice_.vgpus_[index()] = this; + queues_[MainEngine] = nullptr; + queues_[SdmaEngine] = nullptr; +} + +bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, + amd::CommandQueue::Priority priority) { + device::BlitManager::Setup blitSetup; + + if (index() >= GPU_MAX_COMMAND_QUEUES) { + // Cap the maximum number of concurrent Virtual GPUs + return false; + } + + // Virtual GPU will have profiling enabled + state_.profiling_ = profiling; + + Pal::CmdAllocatorCreateInfo createInfo = {}; + createInfo.flags.threadSafe = true; + // \todo forces PAL to reuse CBs, but requires postamble + createInfo.flags.autoMemoryReuse = false; + createInfo.allocInfo[Pal::CommandDataAlloc].allocHeap = Pal::GpuHeapGartCacheable; + createInfo.allocInfo[Pal::CommandDataAlloc].allocSize = 128 * Ki; + createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize = 128 * Ki; + + createInfo.allocInfo[Pal::EmbeddedDataAlloc].allocHeap = Pal::GpuHeapGartCacheable; + createInfo.allocInfo[Pal::EmbeddedDataAlloc].allocSize = 64 * Ki; + createInfo.allocInfo[Pal::EmbeddedDataAlloc].suballocSize = 64 * 
Ki; + + Pal::Result result; + size_t cmdAllocSize = dev().iDev()->GetCmdAllocatorSize(createInfo, &result); + if (Pal::Result::Success != result) { + return false; + } + char* addr = new char[cmdAllocSize]; + if (Pal::Result::Success != dev().iDev()->CreateCmdAllocator(createInfo, addr, &cmdAllocator_)) { + return false; + } + + if (dev().numComputeEngines()) { + //! @todo There is a hang with a mix of user and non user queues. + //! Currently there is no simple way to detect which queue is what. + //! Disable first for now. + const uint firstQueue = (dev().numComputeEngines() > 2) ? 1 : 0; + uint idx = index() % (dev().numComputeEngines() - firstQueue); + + // hwRing_ should be set 0 if forced to have single scratch buffer + hwRing_ = (dev().settings().useSingleScratch_) ? 0 : idx; + + queues_[MainEngine] = Queue::Create(dev().iDev(), Pal::QueueTypeCompute, idx + firstQueue, + cmdAllocator_, rtCUs, priority); + if (nullptr == queues_[MainEngine]) { + return false; + } + + // Check if device has SDMA engines + if (dev().numDMAEngines() != 0) { + uint sdma; + // If only 1 DMA engine is available then use that one + if ((dev().numDMAEngines() < 2) || ((idx & 0x1) && !dev().settings().svmFineGrainSystem_)) { + sdma = 0; + } else { + sdma = 1; + } + + queues_[SdmaEngine] = + Queue::Create(dev().iDev(), Pal::QueueTypeDma, sdma, cmdAllocator_, + amd::CommandQueue::RealTimeDisabled, amd::CommandQueue::Priority::Normal); + if (nullptr == queues_[SdmaEngine]) { + return false; + } + } else { + Unimplemented(); + } + } else { + Unimplemented(); + } + + // Diable double copy optimization, + // since UAV read from nonlocal is fast enough + blitSetup.disableCopyBufferToImageOpt_ = true; + if (!allocConstantBuffers()) { + return false; + } + + // Create Printf class + printfDbg_ = new PrintfDbg(gpuDevice_); + if ((nullptr == printfDbg_) || !printfDbg_->create()) { + delete printfDbg_; + LogError("Could not allocate debug buffer for printf()!"); + return false; + } + + // Create 
HSAILPrintf class + printfDbgHSA_ = new PrintfDbgHSA(gpuDevice_); + if (nullptr == printfDbgHSA_) { + delete printfDbgHSA_; + LogError("Could not create PrintfDbgHSA class!"); + return false; + } + + tsCache_ = new TimeStampCache(*this); + if (nullptr == tsCache_) { + LogError("Could not create TimeStamp cache!"); + return false; + } + + if (!memoryDependency().create(dev().settings().numMemDependencies_)) { + LogError("Could not create the array of memory objects!"); + return false; + } + + if (!allocHsaQueueMem()) { + LogError("Could not create hsaQueueMem object!"); + return false; + } + + // Check if the app requested a device queue creation + if (dev().settings().useDeviceQueue_ && (0 != deviceQueueSize) && + !createVirtualQueue(deviceQueueSize)) { + LogError("Could not create a virtual queue!"); + return false; + } + + // Choose the appropriate class for blit engine + switch (dev().settings().blitEngine_) { + default: + // Fall through ... + case Settings::BlitEngineHost: + blitSetup.disableAll(); + // Fall through ... 
+ case Settings::BlitEngineCAL: + case Settings::BlitEngineKernel: + // use host blit for HW debug + if (dev().settings().enableHwDebug_) { + blitSetup.disableCopyImageToBuffer_ = true; + blitSetup.disableCopyBufferToImage_ = true; + } + blitMgr_ = new KernelBlitManager(*this, blitSetup); + break; + } + if ((nullptr == blitMgr_) || !blitMgr_->create(gpuDevice_)) { + LogError("Could not create BlitManager!"); + return false; + } + + return true; +} + +bool VirtualGPU::allocHsaQueueMem() { + // Allocate a dummy HSA queue + hsaQueueMem_ = new Memory(dev(), sizeof(amd_queue_t)); + if ((hsaQueueMem_ == nullptr) || (!hsaQueueMem_->create(Resource::RemoteUSWC))) { delete hsaQueueMem_; + return false; + } + amd_queue_t* queue = + reinterpret_cast(hsaQueueMem_->map(nullptr, Resource::WriteOnly)); + if (nullptr == queue) { + delete hsaQueueMem_; + return false; + } + memset(queue, 0, sizeof(amd_queue_t)); + + // Provide private and local heap addresses + const static uint addressShift = LP64_SWITCH(0, 32); + queue->private_segment_aperture_base_hi = static_cast( + dev().properties().gpuMemoryProperties.privateApertureBase >> addressShift); + queue->group_segment_aperture_base_hi = static_cast( + dev().properties().gpuMemoryProperties.sharedApertureBase >> addressShift); + + hsaQueueMem_->unmap(nullptr); + return true; } -void -VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); +VirtualGPU::~VirtualGPU() { + // Not safe to remove a queue. 
So lock the device + amd::ScopedLock k(dev().lockAsyncOps()); + amd::ScopedLock lock(dev().vgpusAccess()); - // Translate memory references and ensure cache up-to-date - pal::Memory* memory = dev().getGpuMemory(&vcmd.source()); + // Destroy all memories + static const bool SkipScratch = false; + releaseMemObjects(SkipScratch); - size_t offset = 0; - // Find if virtual address is a CL allocation - device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.destination(), &offset); + while (!freeCbQueue_.empty()) { + auto cb = freeCbQueue_.front(); + delete cb; + freeCbQueue_.pop(); + } - profilingBegin(vcmd, true); + // Destroy printf object + delete printfDbg_; - memory->syncCacheFromHost(*this); - cl_command_type type = vcmd.type(); - bool result = false; - amd::Memory* bufferFromImage = nullptr; + // Destroy printfHSA object + delete printfDbgHSA_; - // Force buffer read for IMAGE1D_BUFFER - if ((type == CL_COMMAND_READ_IMAGE) && - (vcmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { - bufferFromImage = createBufferFromImage(vcmd.source()); - if (nullptr == bufferFromImage) { - LogError("We should not fail buffer creation from image_buffer!"); - } - else { - type = CL_COMMAND_READ_BUFFER; - bufferFromImage->setVirtualDevice(this); - memory = dev().getGpuMemory(bufferFromImage); - } + // Destroy BlitManager object + delete blitMgr_; + + // Destroy TimeStamp cache + delete tsCache_; + + // Destroy resource list with the constant buffers + for (uint i = 0; i < constBufs_.size(); ++i) { + delete constBufs_[i]; + } + + // Destroy queues + if (nullptr != queues_[MainEngine]) { + // Make sure the queues are idle + // It's unclear why PAL could still have a busy queue + queues_[MainEngine]->iQueue_->WaitIdle(); + delete queues_[MainEngine]; + } + + if (nullptr != queues_[SdmaEngine]) { + queues_[SdmaEngine]->iQueue_->WaitIdle(); + delete queues_[SdmaEngine]; + } + + if (nullptr != cmdAllocator_) { + cmdAllocator_->Destroy(); + delete[] 
reinterpret_cast(cmdAllocator_); + } + + gpuDevice_.numOfVgpus_--; + gpuDevice_.vgpus_.erase(gpuDevice_.vgpus_.begin() + index()); + for (uint idx = index(); idx < dev().vgpus().size(); ++idx) { + dev().vgpus()[idx]->index_--; + } + + // Release scratch buffer memory to reduce memory pressure + //!@note OCLtst uses single device with multiple tests + //! Release memory only if it's the last command queue. + //! The first queue is reserved for the transfers on device + if (gpuDevice_.numOfVgpus_ <= 1) { + gpuDevice_.destroyScratchBuffers(); + } + + //! @todo Temporarily keep the buffer mapped for debug purpose + if (nullptr != schedParams_) { + schedParams_->unmap(this); + } + delete vqHeader_; + delete virtualQueue_; + delete schedParams_; + delete hsaQueueMem_; +} + +void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + // Translate memory references and ensure cache up-to-date + pal::Memory* memory = dev().getGpuMemory(&vcmd.source()); + + size_t offset = 0; + // Find if virtual address is a CL allocation + device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.destination(), &offset); + + profilingBegin(vcmd, true); + + memory->syncCacheFromHost(*this); + cl_command_type type = vcmd.type(); + bool result = false; + amd::Memory* bufferFromImage = nullptr; + + // Force buffer read for IMAGE1D_BUFFER + if ((type == CL_COMMAND_READ_IMAGE) && + (vcmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { + bufferFromImage = createBufferFromImage(vcmd.source()); + if (nullptr == bufferFromImage) { + LogError("We should not fail buffer creation from image_buffer!"); + } else { + type = CL_COMMAND_READ_BUFFER; + bufferFromImage->setVirtualDevice(this); + memory = dev().getGpuMemory(bufferFromImage); } + } - // Process different write commands - switch (type) { + // Process different write commands + switch (type) { case CL_COMMAND_READ_BUFFER: { - 
amd::Coord3D origin(vcmd.origin()[0]); - amd::Coord3D size(vcmd.size()[0]); - if (nullptr != bufferFromImage) { - size_t elemSize = - vcmd.source().asImage()->getImageFormat().getElementSize(); - origin.c[0] *= elemSize; - size.c[0] *= elemSize; - } - if (hostMemory != nullptr) { - // Accelerated transfer without pinning - amd::Coord3D dstOrigin(offset); - result = blitMgr().copyBuffer(*memory, *hostMemory, - origin, dstOrigin, size, vcmd.isEntireMemory()); - } - else { - result = blitMgr().readBuffer( - *memory, vcmd.destination(), - origin, size, vcmd.isEntireMemory()); - } - if (nullptr != bufferFromImage) { - bufferFromImage->release(); - } - } - break; + amd::Coord3D origin(vcmd.origin()[0]); + amd::Coord3D size(vcmd.size()[0]); + if (nullptr != bufferFromImage) { + size_t elemSize = vcmd.source().asImage()->getImageFormat().getElementSize(); + origin.c[0] *= elemSize; + size.c[0] *= elemSize; + } + if (hostMemory != nullptr) { + // Accelerated transfer without pinning + amd::Coord3D dstOrigin(offset); + result = blitMgr().copyBuffer(*memory, *hostMemory, origin, dstOrigin, size, + vcmd.isEntireMemory()); + } else { + result = + blitMgr().readBuffer(*memory, vcmd.destination(), origin, size, vcmd.isEntireMemory()); + } + if (nullptr != bufferFromImage) { + bufferFromImage->release(); + } + } break; case CL_COMMAND_READ_BUFFER_RECT: { - amd::BufferRect hostbufferRect; - amd::Coord3D region(0); - amd::Coord3D hostOrigin(vcmd.hostRect().start_+ offset); - hostbufferRect.create(hostOrigin.c, vcmd.size().c , vcmd.hostRect().rowPitch_, vcmd.hostRect().slicePitch_); - if (hostMemory != nullptr) { - result = blitMgr().copyBufferRect(*memory, *hostMemory, - vcmd.bufRect(), hostbufferRect, vcmd.size(), - vcmd.isEntireMemory()); - } - else { - result = blitMgr().readBufferRect(*memory, - vcmd.destination(), vcmd.bufRect(), vcmd.hostRect(), vcmd.size(), - vcmd.isEntireMemory()); - } - } - break; + amd::BufferRect hostbufferRect; + amd::Coord3D region(0); + amd::Coord3D 
hostOrigin(vcmd.hostRect().start_ + offset); + hostbufferRect.create(hostOrigin.c, vcmd.size().c, vcmd.hostRect().rowPitch_, + vcmd.hostRect().slicePitch_); + if (hostMemory != nullptr) { + result = blitMgr().copyBufferRect(*memory, *hostMemory, vcmd.bufRect(), hostbufferRect, + vcmd.size(), vcmd.isEntireMemory()); + } else { + result = blitMgr().readBufferRect(*memory, vcmd.destination(), vcmd.bufRect(), + vcmd.hostRect(), vcmd.size(), vcmd.isEntireMemory()); + } + } break; case CL_COMMAND_READ_IMAGE: - if (hostMemory != nullptr) { - // Accelerated image to buffer transfer without pinning - amd::Coord3D dstOrigin(offset); - result = blitMgr().copyImageToBuffer(*memory, *hostMemory, - vcmd.origin(), dstOrigin, vcmd.size(), - vcmd.isEntireMemory(), - vcmd.rowPitch(), vcmd.slicePitch()); - } - else { - result = blitMgr().readImage(*memory, vcmd.destination(), - vcmd.origin(), vcmd.size(), vcmd.rowPitch(), vcmd.slicePitch(), - vcmd.isEntireMemory()); - } - break; + if (hostMemory != nullptr) { + // Accelerated image to buffer transfer without pinning + amd::Coord3D dstOrigin(offset); + result = + blitMgr().copyImageToBuffer(*memory, *hostMemory, vcmd.origin(), dstOrigin, vcmd.size(), + vcmd.isEntireMemory(), vcmd.rowPitch(), vcmd.slicePitch()); + } else { + result = blitMgr().readImage(*memory, vcmd.destination(), vcmd.origin(), vcmd.size(), + vcmd.rowPitch(), vcmd.slicePitch(), vcmd.isEntireMemory()); + } + break; default: - LogError("Unsupported type for the read command"); - break; - } + LogError("Unsupported type for the read command"); + break; + } - if (!result) { - LogError("submitReadMemory failed!"); - vcmd.setStatus(CL_INVALID_OPERATION); - } + if (!result) { + LogError("submitReadMemory failed!"); + vcmd.setStatus(CL_INVALID_OPERATION); + } - profilingEnd(vcmd); + profilingEnd(vcmd); } -void -VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); 
+void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); - // Translate memory references and ensure cache up to date - pal::Memory* memory = dev().getGpuMemory(&vcmd.destination()); - size_t offset = 0; - // Find if virtual address is a CL allocation - device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.source(), &offset); + // Translate memory references and ensure cache up to date + pal::Memory* memory = dev().getGpuMemory(&vcmd.destination()); + size_t offset = 0; + // Find if virtual address is a CL allocation + device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.source(), &offset); - profilingBegin(vcmd, true); + profilingBegin(vcmd, true); - bool entire = vcmd.isEntireMemory(); + bool entire = vcmd.isEntireMemory(); - // Synchronize memory from host if necessary - device::Memory::SyncFlags syncFlags; - syncFlags.skipEntire_ = entire; - memory->syncCacheFromHost(*this, syncFlags); + // Synchronize memory from host if necessary + device::Memory::SyncFlags syncFlags; + syncFlags.skipEntire_ = entire; + memory->syncCacheFromHost(*this, syncFlags); - cl_command_type type = vcmd.type(); - bool result = false; - amd::Memory* bufferFromImage = nullptr; + cl_command_type type = vcmd.type(); + bool result = false; + amd::Memory* bufferFromImage = nullptr; - // Force buffer write for IMAGE1D_BUFFER - if ((type == CL_COMMAND_WRITE_IMAGE) && - (vcmd.destination().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { - bufferFromImage = createBufferFromImage(vcmd.destination()); - if (nullptr == bufferFromImage) { - LogError("We should not fail buffer creation from image_buffer!"); - } - else { - type = CL_COMMAND_WRITE_BUFFER; - bufferFromImage->setVirtualDevice(this); - memory = dev().getGpuMemory(bufferFromImage); - } + // Force buffer write for IMAGE1D_BUFFER + if ((type == CL_COMMAND_WRITE_IMAGE) && + (vcmd.destination().getType() == 
CL_MEM_OBJECT_IMAGE1D_BUFFER)) { + bufferFromImage = createBufferFromImage(vcmd.destination()); + if (nullptr == bufferFromImage) { + LogError("We should not fail buffer creation from image_buffer!"); + } else { + type = CL_COMMAND_WRITE_BUFFER; + bufferFromImage->setVirtualDevice(this); + memory = dev().getGpuMemory(bufferFromImage); } + } - // Process different write commands - switch (type) { + // Process different write commands + switch (type) { case CL_COMMAND_WRITE_BUFFER: { - amd::Coord3D origin(vcmd.origin()[0]); - amd::Coord3D size(vcmd.size()[0]); - if (nullptr != bufferFromImage) { - size_t elemSize = - vcmd.destination().asImage()->getImageFormat().getElementSize(); - origin.c[0] *= elemSize; - size.c[0] *= elemSize; - } - if (hostMemory != nullptr) { - // Accelerated transfer without pinning - amd::Coord3D srcOrigin(offset); - result = blitMgr().copyBuffer(*hostMemory, *memory, - srcOrigin, origin, size, vcmd.isEntireMemory()); - } - else { - result = blitMgr().writeBuffer(vcmd.source(), *memory, - origin, size, vcmd.isEntireMemory()); - } - if (nullptr != bufferFromImage) { - bufferFromImage->release(); - } - } - break; + amd::Coord3D origin(vcmd.origin()[0]); + amd::Coord3D size(vcmd.size()[0]); + if (nullptr != bufferFromImage) { + size_t elemSize = vcmd.destination().asImage()->getImageFormat().getElementSize(); + origin.c[0] *= elemSize; + size.c[0] *= elemSize; + } + if (hostMemory != nullptr) { + // Accelerated transfer without pinning + amd::Coord3D srcOrigin(offset); + result = blitMgr().copyBuffer(*hostMemory, *memory, srcOrigin, origin, size, + vcmd.isEntireMemory()); + } else { + result = blitMgr().writeBuffer(vcmd.source(), *memory, origin, size, vcmd.isEntireMemory()); + } + if (nullptr != bufferFromImage) { + bufferFromImage->release(); + } + } break; case CL_COMMAND_WRITE_BUFFER_RECT: { - amd::BufferRect hostbufferRect; - amd::Coord3D region(0); - amd::Coord3D hostOrigin(vcmd.hostRect().start_+ offset); - 
hostbufferRect.create(hostOrigin.c, vcmd.size().c , vcmd.hostRect().rowPitch_, vcmd.hostRect().slicePitch_); - if (hostMemory != nullptr) { - result = blitMgr().copyBufferRect(*hostMemory, *memory, - hostbufferRect, vcmd.bufRect(), vcmd.size(), - vcmd.isEntireMemory()); - } - else { - result = blitMgr().writeBufferRect(vcmd.source(), *memory, - vcmd.hostRect(), vcmd.bufRect(), vcmd.size(), - vcmd.isEntireMemory()); - } - } - break; + amd::BufferRect hostbufferRect; + amd::Coord3D region(0); + amd::Coord3D hostOrigin(vcmd.hostRect().start_ + offset); + hostbufferRect.create(hostOrigin.c, vcmd.size().c, vcmd.hostRect().rowPitch_, + vcmd.hostRect().slicePitch_); + if (hostMemory != nullptr) { + result = blitMgr().copyBufferRect(*hostMemory, *memory, hostbufferRect, vcmd.bufRect(), + vcmd.size(), vcmd.isEntireMemory()); + } else { + result = blitMgr().writeBufferRect(vcmd.source(), *memory, vcmd.hostRect(), vcmd.bufRect(), + vcmd.size(), vcmd.isEntireMemory()); + } + } break; case CL_COMMAND_WRITE_IMAGE: - if (hostMemory != nullptr) { - // Accelerated buffer to image transfer without pinning - amd::Coord3D srcOrigin(offset); - result = blitMgr().copyBufferToImage(*hostMemory, *memory, - srcOrigin, vcmd.origin(), vcmd.size(), - vcmd.isEntireMemory(), - vcmd.rowPitch(), vcmd.slicePitch()); - } - else { - result = blitMgr().writeImage(vcmd.source(), *memory, - vcmd.origin(), vcmd.size(), vcmd.rowPitch(), vcmd.slicePitch(), - vcmd.isEntireMemory()); - } - break; + if (hostMemory != nullptr) { + // Accelerated buffer to image transfer without pinning + amd::Coord3D srcOrigin(offset); + result = + blitMgr().copyBufferToImage(*hostMemory, *memory, srcOrigin, vcmd.origin(), vcmd.size(), + vcmd.isEntireMemory(), vcmd.rowPitch(), vcmd.slicePitch()); + } else { + result = blitMgr().writeImage(vcmd.source(), *memory, vcmd.origin(), vcmd.size(), + vcmd.rowPitch(), vcmd.slicePitch(), vcmd.isEntireMemory()); + } + break; default: - LogError("Unsupported type for the write command"); 
- break; - } + LogError("Unsupported type for the write command"); + break; + } - if (!result) { - LogError("submitWriteMemory failed!"); - vcmd.setStatus(CL_INVALID_OPERATION); - } - else { - // Mark this as the most-recently written cache of the destination - vcmd.destination().signalWrite(&gpuDevice_); - } - profilingEnd(vcmd); + if (!result) { + LogError("submitWriteMemory failed!"); + vcmd.setStatus(CL_INVALID_OPERATION); + } else { + // Mark this as the most-recently written cache of the destination + vcmd.destination().signalWrite(&gpuDevice_); + } + profilingEnd(vcmd); } -bool -VirtualGPU::copyMemory(cl_command_type type - , amd::Memory& srcMem - , amd::Memory& dstMem - , bool entire - , const amd::Coord3D& srcOrigin - , const amd::Coord3D& dstOrigin - , const amd::Coord3D& size - , const amd::BufferRect& srcRect - , const amd::BufferRect& dstRect - ) -{ - // Translate memory references and ensure cache up-to-date - pal::Memory* dstMemory = dev().getGpuMemory(&dstMem); - pal::Memory* srcMemory = dev().getGpuMemory(&srcMem); +bool VirtualGPU::copyMemory(cl_command_type type, amd::Memory& srcMem, amd::Memory& dstMem, + bool entire, const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, const amd::Coord3D& size, + const amd::BufferRect& srcRect, const amd::BufferRect& dstRect) { + // Translate memory references and ensure cache up-to-date + pal::Memory* dstMemory = dev().getGpuMemory(&dstMem); + pal::Memory* srcMemory = dev().getGpuMemory(&srcMem); - // Synchronize source and destination memory - device::Memory::SyncFlags syncFlags; - syncFlags.skipEntire_ = entire; - dstMemory->syncCacheFromHost(*this, syncFlags); - srcMemory->syncCacheFromHost(*this); + // Synchronize source and destination memory + device::Memory::SyncFlags syncFlags; + syncFlags.skipEntire_ = entire; + dstMemory->syncCacheFromHost(*this, syncFlags); + srcMemory->syncCacheFromHost(*this); - amd::Memory* bufferFromImageSrc = nullptr; - amd::Memory* bufferFromImageDst = nullptr; + 
amd::Memory* bufferFromImageSrc = nullptr; + amd::Memory* bufferFromImageDst = nullptr; - // Force buffer read for IMAGE1D_BUFFER - if ((srcMem.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { - bufferFromImageSrc = createBufferFromImage(srcMem); - if (nullptr == bufferFromImageSrc) { - LogError("We should not fail buffer creation from image_buffer!"); - } - else { - type = CL_COMMAND_COPY_BUFFER; - bufferFromImageSrc->setVirtualDevice(this); - srcMemory = dev().getGpuMemory(bufferFromImageSrc); - } + // Force buffer read for IMAGE1D_BUFFER + if ((srcMem.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { + bufferFromImageSrc = createBufferFromImage(srcMem); + if (nullptr == bufferFromImageSrc) { + LogError("We should not fail buffer creation from image_buffer!"); + } else { + type = CL_COMMAND_COPY_BUFFER; + bufferFromImageSrc->setVirtualDevice(this); + srcMemory = dev().getGpuMemory(bufferFromImageSrc); } - // Force buffer write for IMAGE1D_BUFFER - if ((dstMem.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { - bufferFromImageDst = createBufferFromImage(dstMem); - if (nullptr == bufferFromImageDst) { - LogError("We should not fail buffer creation from image_buffer!"); - } - else { - type = CL_COMMAND_COPY_BUFFER; - bufferFromImageDst->setVirtualDevice(this); - dstMemory = dev().getGpuMemory(bufferFromImageDst); - } + } + // Force buffer write for IMAGE1D_BUFFER + if ((dstMem.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { + bufferFromImageDst = createBufferFromImage(dstMem); + if (nullptr == bufferFromImageDst) { + LogError("We should not fail buffer creation from image_buffer!"); + } else { + type = CL_COMMAND_COPY_BUFFER; + bufferFromImageDst->setVirtualDevice(this); + dstMemory = dev().getGpuMemory(bufferFromImageDst); } + } - bool result = false; + bool result = false; - // Check if HW can be used for memory copy - switch (type) { + // Check if HW can be used for memory copy + switch (type) { case CL_COMMAND_MAKE_BUFFERS_RESIDENT_AMD: case CL_COMMAND_SVM_MEMCPY: case 
CL_COMMAND_COPY_BUFFER: { - amd::Coord3D realSrcOrigin(srcOrigin[0]); - amd::Coord3D realDstOrigin(dstOrigin[0]); - amd::Coord3D realSize(size.c[0],size.c[1],size.c[2]); + amd::Coord3D realSrcOrigin(srcOrigin[0]); + amd::Coord3D realDstOrigin(dstOrigin[0]); + amd::Coord3D realSize(size.c[0], size.c[1], size.c[2]); - if (nullptr != bufferFromImageSrc) { - size_t elemSize = - srcMem.asImage()->getImageFormat().getElementSize(); - realSrcOrigin.c[0] *= elemSize; - if (nullptr != bufferFromImageDst) { - realDstOrigin.c[0] *= elemSize; - } - realSize.c[0] *= elemSize; - } - else if (nullptr != bufferFromImageDst) { - size_t elemSize = - dstMem.asImage()->getImageFormat().getElementSize(); - realDstOrigin.c[0] *= elemSize; - realSize.c[0] *= elemSize; - } - - result = blitMgr().copyBuffer(*srcMemory, *dstMemory, - realSrcOrigin, realDstOrigin, realSize, entire); - - if (nullptr != bufferFromImageSrc) { - bufferFromImageSrc->release(); - } + if (nullptr != bufferFromImageSrc) { + size_t elemSize = srcMem.asImage()->getImageFormat().getElementSize(); + realSrcOrigin.c[0] *= elemSize; if (nullptr != bufferFromImageDst) { - bufferFromImageDst->release(); + realDstOrigin.c[0] *= elemSize; } - } - break; + realSize.c[0] *= elemSize; + } else if (nullptr != bufferFromImageDst) { + size_t elemSize = dstMem.asImage()->getImageFormat().getElementSize(); + realDstOrigin.c[0] *= elemSize; + realSize.c[0] *= elemSize; + } + + result = blitMgr().copyBuffer(*srcMemory, *dstMemory, realSrcOrigin, realDstOrigin, realSize, + entire); + + if (nullptr != bufferFromImageSrc) { + bufferFromImageSrc->release(); + } + if (nullptr != bufferFromImageDst) { + bufferFromImageDst->release(); + } + } break; case CL_COMMAND_COPY_BUFFER_RECT: - result = blitMgr().copyBufferRect(*srcMemory, *dstMemory, - srcRect, dstRect, size, entire); - break; + result = blitMgr().copyBufferRect(*srcMemory, *dstMemory, srcRect, dstRect, size, entire); + break; case CL_COMMAND_COPY_IMAGE_TO_BUFFER: - result = 
blitMgr().copyImageToBuffer(*srcMemory, *dstMemory, - srcOrigin, dstOrigin, size, entire); - break; + result = + blitMgr().copyImageToBuffer(*srcMemory, *dstMemory, srcOrigin, dstOrigin, size, entire); + break; case CL_COMMAND_COPY_BUFFER_TO_IMAGE: - result = blitMgr().copyBufferToImage(*srcMemory, *dstMemory, - srcOrigin, dstOrigin, size, entire); - break; + result = + blitMgr().copyBufferToImage(*srcMemory, *dstMemory, srcOrigin, dstOrigin, size, entire); + break; case CL_COMMAND_COPY_IMAGE: - result = blitMgr().copyImage(*srcMemory, *dstMemory, - srcOrigin, dstOrigin, size, entire); - break; + result = blitMgr().copyImage(*srcMemory, *dstMemory, srcOrigin, dstOrigin, size, entire); + break; default: - LogError("Unsupported command type for memory copy!"); - break; - } + LogError("Unsupported command type for memory copy!"); + break; + } - if (!result) { - LogError("submitCopyMemory failed!"); - return false; - } - else { - // Mark this as the most-recently written cache of the destination - dstMem.signalWrite(&gpuDevice_); - } - return true; + if (!result) { + LogError("submitCopyMemory failed!"); + return false; + } else { + // Mark this as the most-recently written cache of the destination + dstMem.signalWrite(&gpuDevice_); + } + return true; } -void -VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); +void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); - profilingBegin(vcmd); + profilingBegin(vcmd); - cl_command_type type = vcmd.type(); - bool entire = vcmd.isEntireMemory(); + cl_command_type type = vcmd.type(); + bool entire = vcmd.isEntireMemory(); - if (!copyMemory(type, vcmd.source(), vcmd.destination(), entire, - vcmd.srcOrigin(), vcmd.dstOrigin(), vcmd.size(), vcmd.srcRect(), - vcmd.dstRect())) { - 
vcmd.setStatus(CL_INVALID_OPERATION); - } + if (!copyMemory(type, vcmd.source(), vcmd.destination(), entire, vcmd.srcOrigin(), + vcmd.dstOrigin(), vcmd.size(), vcmd.srcRect(), vcmd.dstRect())) { + vcmd.setStatus(CL_INVALID_OPERATION); + } - profilingEnd(vcmd); + profilingEnd(vcmd); } -void -VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - profilingBegin(vcmd); +void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + profilingBegin(vcmd); - cl_command_type type = vcmd.type(); - //no op for FGS supported device - if (!dev().isFineGrainedSystem()) { - amd::Coord3D srcOrigin(0, 0, 0); - amd::Coord3D dstOrigin(0, 0, 0); - amd::Coord3D size(vcmd.srcSize(), 1, 1); - amd::BufferRect srcRect; - amd::BufferRect dstRect; - - bool result = false; - amd::Memory* srcMem = amd::SvmManager::FindSvmBuffer(vcmd.src()); - amd::Memory* dstMem = amd::SvmManager::FindSvmBuffer(vcmd.dst()); - - device::Memory::SyncFlags syncFlags; - if (nullptr != srcMem) { - srcMem->commitSvmMemory(); - srcOrigin.c[0] = static_cast(vcmd.src()) - static_cast
(srcMem->getSvmPtr()); - if (!(srcMem->validateRegion(srcOrigin, size))) { - vcmd.setStatus(CL_INVALID_OPERATION); - return; - } - } - if (nullptr != dstMem) { - dstMem->commitSvmMemory(); - dstOrigin.c[0] = static_cast(vcmd.dst()) - static_cast
(dstMem->getSvmPtr()); - if (!(dstMem->validateRegion(dstOrigin, size))) { - vcmd.setStatus(CL_INVALID_OPERATION); - return; - } - } - - if (nullptr == srcMem && nullptr != dstMem) { //src not in svm space - Memory* memory = dev().getGpuMemory(dstMem); - // Synchronize source and destination memory - syncFlags.skipEntire_ = dstMem->isEntirelyCovered(dstOrigin, size); - memory->syncCacheFromHost(*this, syncFlags); - - result = blitMgr().writeBuffer(vcmd.src(), *memory, - dstOrigin, size, dstMem->isEntirelyCovered(dstOrigin, size)); - // Mark this as the most-recently written cache of the destination - dstMem->signalWrite(&gpuDevice_); - } - else if (nullptr != srcMem && nullptr == dstMem) { //dst not in svm space - Memory* memory = dev().getGpuMemory(srcMem); - // Synchronize source and destination memory - memory->syncCacheFromHost(*this); - - result = blitMgr().readBuffer(*memory, vcmd.dst(), - srcOrigin, size, srcMem->isEntirelyCovered(srcOrigin, size)); - } - else if (nullptr != srcMem && nullptr != dstMem) { //both not in svm space - bool entire = srcMem->isEntirelyCovered(srcOrigin, size) && - dstMem->isEntirelyCovered(dstOrigin, size); - result = copyMemory(type, *srcMem, *dstMem, entire, srcOrigin, dstOrigin, - size, srcRect, dstRect); - } - - if (!result) { - vcmd.setStatus(CL_INVALID_OPERATION); - } - } - else { - //direct memcpy for FGS enabled system - amd::SvmBuffer::memFill(vcmd.dst(), vcmd.src(), vcmd.srcSize(), 1); - } - profilingEnd(vcmd); -} - -void -VirtualGPU::submitMapMemory(amd::MapMemoryCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - - profilingBegin(vcmd, true); - - pal::Memory* memory = dev().getGpuMemory(&vcmd.memory()); - - // Save map info for unmap operation - memory->saveMapInfo(vcmd.mapPtr(), vcmd.origin(), vcmd.size(), - vcmd.mapFlags(), vcmd.isEntireMemory()); - - // If we have host memory, use it - if ((memory->owner()->getHostMem() != nullptr) && 
memory->isDirectMap()) { - if (!memory->isHostMemDirectAccess()) { - // Make sure GPU finished operation before - // synchronization with the backing store - memory->wait(*this); - } - - // Target is the backing store, so just ensure that owner is up-to-date - memory->owner()->cacheWriteBack(); - - // Add memory to VA cache, so rutnime can detect direct access to VA - dev().addVACache(memory); - } - else if (memory->isPersistentDirectMap()) { - // Nothing to do here - } - else if (memory->mapMemory() != nullptr) { - // Target is a remote resource, so copy - assert(memory->mapMemory() != nullptr); - if (vcmd.mapFlags() & (CL_MAP_READ | CL_MAP_WRITE)) { - amd::Coord3D dstOrigin(0, 0, 0); - if (memory->desc().buffer_) { - if (!blitMgr().copyBuffer(*memory, - *memory->mapMemory(), vcmd.origin(), vcmd.origin(), - vcmd.size(), vcmd.isEntireMemory())) { - LogError("submitMapMemory() - copy failed"); - vcmd.setStatus(CL_MAP_FAILURE); - } - } - else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { - amd::Memory* bufferFromImage = nullptr; - Memory* memoryBuf = memory; - amd::Coord3D origin(vcmd.origin()[0]); - amd::Coord3D size(vcmd.size()[0]); - size_t elemSize = - vcmd.memory().asImage()->getImageFormat().getElementSize(); - origin.c[0] *= elemSize; - size.c[0] *= elemSize; - - bufferFromImage = createBufferFromImage(vcmd.memory()); - if (nullptr == bufferFromImage) { - LogError("We should not fail buffer creation from image_buffer!"); - } - else { - bufferFromImage->setVirtualDevice(this); - memoryBuf = dev().getGpuMemory(bufferFromImage); - } - if (!blitMgr().copyBuffer(*memoryBuf, - *memory->mapMemory(), origin, dstOrigin, - size, vcmd.isEntireMemory())) { - LogError("submitMapMemory() - copy failed"); - vcmd.setStatus(CL_MAP_FAILURE); - } - if (nullptr != bufferFromImage) { - bufferFromImage->release(); - } - } - else { - // Validate if it's a view for a map of mip level - if (vcmd.memory().parent() != nullptr) { - amd::Image* amdImage = 
vcmd.memory().parent()->asImage(); - if ((amdImage != nullptr) && (amdImage->getMipLevels() > 1)) { - // Save map write info in the parent object - dev().getGpuMemory(amdImage)->saveMapInfo(vcmd.mapPtr(), - vcmd.origin(), vcmd.size(), - vcmd.mapFlags(), vcmd.isEntireMemory(), - vcmd.memory().asImage()); - } - } - if (!blitMgr().copyImageToBuffer(*memory, - *memory->mapMemory(), vcmd.origin(), dstOrigin, - vcmd.size(), vcmd.isEntireMemory())) { - LogError("submitMapMemory() - copy failed"); - vcmd.setStatus(CL_MAP_FAILURE); - } - } - } - } - else { - LogError("Unhandled map!"); - } - - profilingEnd(vcmd); -} - -void -VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - - pal::Memory* memory = dev().getGpuMemory(&vcmd.memory()); - amd::Memory* owner = memory->owner(); - bool unmapMip = false; - const device::Memory::WriteMapInfo* writeMapInfo = - memory->writeMapInfo(vcmd.mapPtr()); - if (nullptr == writeMapInfo) { - LogError("Unmap without map call"); - return; - } - profilingBegin(vcmd, true); - - // Check if image is a mipmap and assign a saved view - amd::Image* amdImage = owner->asImage(); - if ((amdImage != nullptr) && (amdImage->getMipLevels() > 1) && - (writeMapInfo->baseMip_ != nullptr)) { - // Assign mip level view - amdImage = writeMapInfo->baseMip_; - // Clear unmap flags from the parent image - memory->clearUnmapInfo(vcmd.mapPtr()); - memory = dev().getGpuMemory(amdImage); - unmapMip = true; - writeMapInfo = memory->writeMapInfo(vcmd.mapPtr()); - } - - // We used host memory - if ((owner->getHostMem() != nullptr) && memory->isDirectMap()) { - if (writeMapInfo->isUnmapWrite()) { - // Target is the backing store, so sync - owner->signalWrite(nullptr); - memory->syncCacheFromHost(*this); - } - // Remove memory from VA cache - dev().removeVACache(memory); - } - // data check was added for persistent memory that failed to get aperture - // and 
therefore are treated like a remote resource - else if (memory->isPersistentDirectMap() && (memory->data() != nullptr)) { - memory->unmap(this); - } - else if (memory->mapMemory() != nullptr) { - if (writeMapInfo->isUnmapWrite()) { - amd::Coord3D srcOrigin(0, 0, 0); - // Target is a remote resource, so copy - assert(memory->mapMemory() != nullptr); - if (memory->desc().buffer_) { - if (!blitMgr().copyBuffer( - *memory->mapMemory(), *memory, - writeMapInfo->origin_, - writeMapInfo->origin_, - writeMapInfo->region_, - writeMapInfo->isEntire())) { - LogError("submitUnmapMemory() - copy failed"); - vcmd.setStatus(CL_OUT_OF_RESOURCES); - } - } - else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { - amd::Memory* bufferFromImage = nullptr; - Memory* memoryBuf = memory; - amd::Coord3D origin(writeMapInfo->origin_[0]); - amd::Coord3D size(writeMapInfo->region_[0]); - size_t elemSize = - vcmd.memory().asImage()->getImageFormat().getElementSize(); - origin.c[0] *= elemSize; - size.c[0] *= elemSize; - - bufferFromImage = createBufferFromImage(vcmd.memory()); - if (nullptr == bufferFromImage) { - LogError("We should not fail buffer creation from image_buffer!"); - } - else { - bufferFromImage->setVirtualDevice(this); - memoryBuf = dev().getGpuMemory(bufferFromImage); - } - if (!blitMgr().copyBuffer( - *memory->mapMemory(), *memoryBuf, - srcOrigin, origin, size, - writeMapInfo->isEntire())) { - LogError("submitUnmapMemory() - copy failed"); - vcmd.setStatus(CL_OUT_OF_RESOURCES); - } - if (nullptr != bufferFromImage) { - bufferFromImage->release(); - } - } - else { - if (!blitMgr().copyBufferToImage( - *memory->mapMemory(), *memory, - srcOrigin, - writeMapInfo->origin_, - writeMapInfo->region_, - writeMapInfo->isEntire())) { - LogError("submitUnmapMemory() - copy failed"); - vcmd.setStatus(CL_OUT_OF_RESOURCES); - } - } - } - } - else { - LogError("Unhandled unmap!"); - vcmd.setStatus(CL_INVALID_VALUE); - } - - // Clear unmap flags - 
memory->clearUnmapInfo(vcmd.mapPtr()); - - // Release a view for a mipmap map - if (unmapMip) { - amdImage->release(); - } - profilingEnd(vcmd); -} - -bool -VirtualGPU::fillMemory(cl_command_type type, amd::Memory* amdMemory, const void* pattern, - size_t patternSize, const amd::Coord3D& origin, const amd::Coord3D& size) -{ - pal::Memory* memory = dev().getGpuMemory(amdMemory); - bool entire = amdMemory->isEntirelyCovered(origin, size); - - // Synchronize memory from host if necessary - device::Memory::SyncFlags syncFlags; - syncFlags.skipEntire_ = entire; - memory->syncCacheFromHost(*this, syncFlags); + cl_command_type type = vcmd.type(); + // no op for FGS supported device + if (!dev().isFineGrainedSystem()) { + amd::Coord3D srcOrigin(0, 0, 0); + amd::Coord3D dstOrigin(0, 0, 0); + amd::Coord3D size(vcmd.srcSize(), 1, 1); + amd::BufferRect srcRect; + amd::BufferRect dstRect; bool result = false; - amd::Memory* bufferFromImage = nullptr; - float fillValue[4]; + amd::Memory* srcMem = amd::SvmManager::FindSvmBuffer(vcmd.src()); + amd::Memory* dstMem = amd::SvmManager::FindSvmBuffer(vcmd.dst()); - // Force fill buffer for IMAGE1D_BUFFER - if ((type == CL_COMMAND_FILL_IMAGE) && - (amdMemory->getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { - bufferFromImage = createBufferFromImage(*amdMemory); - if (nullptr == bufferFromImage) { - LogError("We should not fail buffer creation from image_buffer!"); - } - else { - type = CL_COMMAND_FILL_BUFFER; - bufferFromImage->setVirtualDevice(this); - memory = dev().getGpuMemory(bufferFromImage); - } + device::Memory::SyncFlags syncFlags; + if (nullptr != srcMem) { + srcMem->commitSvmMemory(); + srcOrigin.c[0] = + static_cast(vcmd.src()) - static_cast
(srcMem->getSvmPtr()); + if (!(srcMem->validateRegion(srcOrigin, size))) { + vcmd.setStatus(CL_INVALID_OPERATION); + return; + } + } + if (nullptr != dstMem) { + dstMem->commitSvmMemory(); + dstOrigin.c[0] = + static_cast(vcmd.dst()) - static_cast
(dstMem->getSvmPtr()); + if (!(dstMem->validateRegion(dstOrigin, size))) { + vcmd.setStatus(CL_INVALID_OPERATION); + return; + } } - // Find the the right fill operation - switch (type) { - case CL_COMMAND_FILL_BUFFER : - case CL_COMMAND_SVM_MEMFILL : { - amd::Coord3D realOrigin(origin[0]); - amd::Coord3D realSize(size[0]); - // Reprogram fill parameters if it's an IMAGE1D_BUFFER object - if (nullptr != bufferFromImage) { - size_t elemSize = - amdMemory->asImage()->getImageFormat().getElementSize(); - realOrigin.c[0] *= elemSize; - realSize.c[0] *= elemSize; - memset(fillValue, 0, sizeof(fillValue)); - amdMemory->asImage()->getImageFormat().formatColor(pattern, fillValue); - pattern = fillValue; - patternSize = elemSize; - } - result = blitMgr().fillBuffer(*memory, pattern, - patternSize, realOrigin, realSize, amdMemory->isEntirelyCovered(origin, size)); - if (nullptr != bufferFromImage) { - bufferFromImage->release(); - } - } - break; - case CL_COMMAND_FILL_IMAGE: - result = blitMgr().fillImage(*memory, pattern, - origin, size, amdMemory->isEntirelyCovered(origin, size)); - break; - default: - LogError("Unsupported command type for FillMemory!"); - break; + if (nullptr == srcMem && nullptr != dstMem) { // src not in svm space + Memory* memory = dev().getGpuMemory(dstMem); + // Synchronize source and destination memory + syncFlags.skipEntire_ = dstMem->isEntirelyCovered(dstOrigin, size); + memory->syncCacheFromHost(*this, syncFlags); + + result = blitMgr().writeBuffer(vcmd.src(), *memory, dstOrigin, size, + dstMem->isEntirelyCovered(dstOrigin, size)); + // Mark this as the most-recently written cache of the destination + dstMem->signalWrite(&gpuDevice_); + } else if (nullptr != srcMem && nullptr == dstMem) { // dst not in svm space + Memory* memory = dev().getGpuMemory(srcMem); + // Synchronize source and destination memory + memory->syncCacheFromHost(*this); + + result = blitMgr().readBuffer(*memory, vcmd.dst(), srcOrigin, size, + 
srcMem->isEntirelyCovered(srcOrigin, size)); + } else if (nullptr != srcMem && nullptr != dstMem) { // both not in svm space + bool entire = + srcMem->isEntirelyCovered(srcOrigin, size) && dstMem->isEntirelyCovered(dstOrigin, size); + result = + copyMemory(type, *srcMem, *dstMem, entire, srcOrigin, dstOrigin, size, srcRect, dstRect); } if (!result) { - LogError("fillMemory failed!"); - return false; + vcmd.setStatus(CL_INVALID_OPERATION); + } + } else { + // direct memcpy for FGS enabled system + amd::SvmBuffer::memFill(vcmd.dst(), vcmd.src(), vcmd.srcSize(), 1); + } + profilingEnd(vcmd); +} + +void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + profilingBegin(vcmd, true); + + pal::Memory* memory = dev().getGpuMemory(&vcmd.memory()); + + // Save map info for unmap operation + memory->saveMapInfo(vcmd.mapPtr(), vcmd.origin(), vcmd.size(), vcmd.mapFlags(), + vcmd.isEntireMemory()); + + // If we have host memory, use it + if ((memory->owner()->getHostMem() != nullptr) && memory->isDirectMap()) { + if (!memory->isHostMemDirectAccess()) { + // Make sure GPU finished operation before + // synchronization with the backing store + memory->wait(*this); } + // Target is the backing store, so just ensure that owner is up-to-date + memory->owner()->cacheWriteBack(); + + // Add memory to VA cache, so rutnime can detect direct access to VA + dev().addVACache(memory); + } else if (memory->isPersistentDirectMap()) { + // Nothing to do here + } else if (memory->mapMemory() != nullptr) { + // Target is a remote resource, so copy + assert(memory->mapMemory() != nullptr); + if (vcmd.mapFlags() & (CL_MAP_READ | CL_MAP_WRITE)) { + amd::Coord3D dstOrigin(0, 0, 0); + if (memory->desc().buffer_) { + if (!blitMgr().copyBuffer(*memory, *memory->mapMemory(), vcmd.origin(), vcmd.origin(), + vcmd.size(), vcmd.isEntireMemory())) { + LogError("submitMapMemory() - copy failed"); 
+ vcmd.setStatus(CL_MAP_FAILURE); + } + } else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { + amd::Memory* bufferFromImage = nullptr; + Memory* memoryBuf = memory; + amd::Coord3D origin(vcmd.origin()[0]); + amd::Coord3D size(vcmd.size()[0]); + size_t elemSize = vcmd.memory().asImage()->getImageFormat().getElementSize(); + origin.c[0] *= elemSize; + size.c[0] *= elemSize; + + bufferFromImage = createBufferFromImage(vcmd.memory()); + if (nullptr == bufferFromImage) { + LogError("We should not fail buffer creation from image_buffer!"); + } else { + bufferFromImage->setVirtualDevice(this); + memoryBuf = dev().getGpuMemory(bufferFromImage); + } + if (!blitMgr().copyBuffer(*memoryBuf, *memory->mapMemory(), origin, dstOrigin, size, + vcmd.isEntireMemory())) { + LogError("submitMapMemory() - copy failed"); + vcmd.setStatus(CL_MAP_FAILURE); + } + if (nullptr != bufferFromImage) { + bufferFromImage->release(); + } + } else { + // Validate if it's a view for a map of mip level + if (vcmd.memory().parent() != nullptr) { + amd::Image* amdImage = vcmd.memory().parent()->asImage(); + if ((amdImage != nullptr) && (amdImage->getMipLevels() > 1)) { + // Save map write info in the parent object + dev().getGpuMemory(amdImage)->saveMapInfo(vcmd.mapPtr(), vcmd.origin(), vcmd.size(), + vcmd.mapFlags(), vcmd.isEntireMemory(), + vcmd.memory().asImage()); + } + } + if (!blitMgr().copyImageToBuffer(*memory, *memory->mapMemory(), vcmd.origin(), dstOrigin, + vcmd.size(), vcmd.isEntireMemory())) { + LogError("submitMapMemory() - copy failed"); + vcmd.setStatus(CL_MAP_FAILURE); + } + } + } + } else { + LogError("Unhandled map!"); + } + + profilingEnd(vcmd); +} + +void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + pal::Memory* memory = dev().getGpuMemory(&vcmd.memory()); + amd::Memory* owner = memory->owner(); + bool unmapMip = false; + const 
device::Memory::WriteMapInfo* writeMapInfo = memory->writeMapInfo(vcmd.mapPtr()); + if (nullptr == writeMapInfo) { + LogError("Unmap without map call"); + return; + } + profilingBegin(vcmd, true); + + // Check if image is a mipmap and assign a saved view + amd::Image* amdImage = owner->asImage(); + if ((amdImage != nullptr) && (amdImage->getMipLevels() > 1) && + (writeMapInfo->baseMip_ != nullptr)) { + // Assign mip level view + amdImage = writeMapInfo->baseMip_; + // Clear unmap flags from the parent image + memory->clearUnmapInfo(vcmd.mapPtr()); + memory = dev().getGpuMemory(amdImage); + unmapMip = true; + writeMapInfo = memory->writeMapInfo(vcmd.mapPtr()); + } + + // We used host memory + if ((owner->getHostMem() != nullptr) && memory->isDirectMap()) { + if (writeMapInfo->isUnmapWrite()) { + // Target is the backing store, so sync + owner->signalWrite(nullptr); + memory->syncCacheFromHost(*this); + } + // Remove memory from VA cache + dev().removeVACache(memory); + } + // data check was added for persistent memory that failed to get aperture + // and therefore are treated like a remote resource + else if (memory->isPersistentDirectMap() && (memory->data() != nullptr)) { + memory->unmap(this); + } else if (memory->mapMemory() != nullptr) { + if (writeMapInfo->isUnmapWrite()) { + amd::Coord3D srcOrigin(0, 0, 0); + // Target is a remote resource, so copy + assert(memory->mapMemory() != nullptr); + if (memory->desc().buffer_) { + if (!blitMgr().copyBuffer(*memory->mapMemory(), *memory, writeMapInfo->origin_, + writeMapInfo->origin_, writeMapInfo->region_, + writeMapInfo->isEntire())) { + LogError("submitUnmapMemory() - copy failed"); + vcmd.setStatus(CL_OUT_OF_RESOURCES); + } + } else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { + amd::Memory* bufferFromImage = nullptr; + Memory* memoryBuf = memory; + amd::Coord3D origin(writeMapInfo->origin_[0]); + amd::Coord3D size(writeMapInfo->region_[0]); + size_t elemSize = 
vcmd.memory().asImage()->getImageFormat().getElementSize(); + origin.c[0] *= elemSize; + size.c[0] *= elemSize; + + bufferFromImage = createBufferFromImage(vcmd.memory()); + if (nullptr == bufferFromImage) { + LogError("We should not fail buffer creation from image_buffer!"); + } else { + bufferFromImage->setVirtualDevice(this); + memoryBuf = dev().getGpuMemory(bufferFromImage); + } + if (!blitMgr().copyBuffer(*memory->mapMemory(), *memoryBuf, srcOrigin, origin, size, + writeMapInfo->isEntire())) { + LogError("submitUnmapMemory() - copy failed"); + vcmd.setStatus(CL_OUT_OF_RESOURCES); + } + if (nullptr != bufferFromImage) { + bufferFromImage->release(); + } + } else { + if (!blitMgr().copyBufferToImage(*memory->mapMemory(), *memory, srcOrigin, + writeMapInfo->origin_, writeMapInfo->region_, + writeMapInfo->isEntire())) { + LogError("submitUnmapMemory() - copy failed"); + vcmd.setStatus(CL_OUT_OF_RESOURCES); + } + } + } + } else { + LogError("Unhandled unmap!"); + vcmd.setStatus(CL_INVALID_VALUE); + } + + // Clear unmap flags + memory->clearUnmapInfo(vcmd.mapPtr()); + + // Release a view for a mipmap map + if (unmapMip) { + amdImage->release(); + } + profilingEnd(vcmd); +} + +bool VirtualGPU::fillMemory(cl_command_type type, amd::Memory* amdMemory, const void* pattern, + size_t patternSize, const amd::Coord3D& origin, + const amd::Coord3D& size) { + pal::Memory* memory = dev().getGpuMemory(amdMemory); + bool entire = amdMemory->isEntirelyCovered(origin, size); + + // Synchronize memory from host if necessary + device::Memory::SyncFlags syncFlags; + syncFlags.skipEntire_ = entire; + memory->syncCacheFromHost(*this, syncFlags); + + bool result = false; + amd::Memory* bufferFromImage = nullptr; + float fillValue[4]; + + // Force fill buffer for IMAGE1D_BUFFER + if ((type == CL_COMMAND_FILL_IMAGE) && (amdMemory->getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { + bufferFromImage = createBufferFromImage(*amdMemory); + if (nullptr == bufferFromImage) { + LogError("We should 
not fail buffer creation from image_buffer!"); + } else { + type = CL_COMMAND_FILL_BUFFER; + bufferFromImage->setVirtualDevice(this); + memory = dev().getGpuMemory(bufferFromImage); + } + } + + // Find the the right fill operation + switch (type) { + case CL_COMMAND_FILL_BUFFER: + case CL_COMMAND_SVM_MEMFILL: { + amd::Coord3D realOrigin(origin[0]); + amd::Coord3D realSize(size[0]); + // Reprogram fill parameters if it's an IMAGE1D_BUFFER object + if (nullptr != bufferFromImage) { + size_t elemSize = amdMemory->asImage()->getImageFormat().getElementSize(); + realOrigin.c[0] *= elemSize; + realSize.c[0] *= elemSize; + memset(fillValue, 0, sizeof(fillValue)); + amdMemory->asImage()->getImageFormat().formatColor(pattern, fillValue); + pattern = fillValue; + patternSize = elemSize; + } + result = blitMgr().fillBuffer(*memory, pattern, patternSize, realOrigin, realSize, + amdMemory->isEntirelyCovered(origin, size)); + if (nullptr != bufferFromImage) { + bufferFromImage->release(); + } + } break; + case CL_COMMAND_FILL_IMAGE: + result = blitMgr().fillImage(*memory, pattern, origin, size, + amdMemory->isEntirelyCovered(origin, size)); + break; + default: + LogError("Unsupported command type for FillMemory!"); + break; + } + + if (!result) { + LogError("fillMemory failed!"); + return false; + } + + // Mark this as the most-recently written cache of the destination + amdMemory->signalWrite(&gpuDevice_); + return true; +} + +void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + profilingBegin(vcmd, true); + + if (!fillMemory(vcmd.type(), &vcmd.memory(), vcmd.pattern(), vcmd.patternSize(), vcmd.origin(), + vcmd.size())) { + vcmd.setStatus(CL_INVALID_OPERATION); + } + + profilingEnd(vcmd); +} + +void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock 
lock(execution()); + + profilingBegin(vcmd, true); + + // no op for FGS supported device + if (!dev().isFineGrainedSystem()) { + // Make sure we have memory for the command execution + pal::Memory* memory = dev().getGpuMemory(vcmd.getSvmMem()); + + memory->saveMapInfo(vcmd.svmPtr(), vcmd.origin(), vcmd.size(), vcmd.mapFlags(), + vcmd.isEntireMemory()); + + if (memory->mapMemory() != nullptr) { + if (vcmd.mapFlags() & (CL_MAP_READ | CL_MAP_WRITE)) { + assert(memory->desc().buffer_ && "SVM memory can't be an image"); + if (!blitMgr().copyBuffer(*memory, *memory->mapMemory(), vcmd.origin(), vcmd.origin(), + vcmd.size(), vcmd.isEntireMemory())) { + LogError("submitSVMMapMemory() - copy failed"); + vcmd.setStatus(CL_MAP_FAILURE); + } + } + } else if ((memory->owner()->getHostMem() != nullptr) && memory->isDirectMap()) { + if (!memory->isHostMemDirectAccess()) { + // Make sure GPU finished operation before + // synchronization with the backing store + memory->wait(*this); + } + + // Target is the backing store, so just ensure that owner is up-to-date + memory->owner()->cacheWriteBack(); + } else { + LogError("Unhandled svm map!"); + } + } + + profilingEnd(vcmd); +} + +void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + profilingBegin(vcmd, true); + + // no op for FGS supported device + if (!dev().isFineGrainedSystem()) { + pal::Memory* memory = dev().getGpuMemory(vcmd.getSvmMem()); + const device::Memory::WriteMapInfo* writeMapInfo = memory->writeMapInfo(vcmd.svmPtr()); + + if (memory->mapMemory() != nullptr) { + if (writeMapInfo->isUnmapWrite()) { + amd::Coord3D srcOrigin(0, 0, 0); + // Target is a remote resource, so copy + assert(memory->desc().buffer_ && "SVM memory can't be an image"); + if (!blitMgr().copyBuffer(*memory->mapMemory(), *memory, writeMapInfo->origin_, + writeMapInfo->origin_, writeMapInfo->region_, + 
writeMapInfo->isEntire())) { + LogError("submitSvmUnmapMemory() - copy failed"); + vcmd.setStatus(CL_OUT_OF_RESOURCES); + } + } + } else if ((memory->owner()->getHostMem() != nullptr) && memory->isDirectMap()) { + if (writeMapInfo->isUnmapWrite()) { + // Target is the backing store, so sync + memory->owner()->signalWrite(nullptr); + memory->syncCacheFromHost(*this); + } + } + memory->clearUnmapInfo(vcmd.svmPtr()); + } + + profilingEnd(vcmd); +} + +void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + profilingBegin(vcmd, true); + + if (!dev().isFineGrainedSystem()) { + size_t patternSize = vcmd.patternSize(); + size_t fillSize = patternSize * vcmd.times(); + size_t offset = 0; + amd::Memory* dstMemory = amd::SvmManager::FindSvmBuffer(vcmd.dst()); + assert(dstMemory && "No svm Buffer to fill with!"); + offset = reinterpret_cast(vcmd.dst()) - + reinterpret_cast(dstMemory->getSvmPtr()); + assert((offset >= 0) && "wrong svm ptr to fill with!"); + + pal::Memory* memory = dev().getGpuMemory(dstMemory); + + amd::Coord3D origin(offset, 0, 0); + amd::Coord3D size(fillSize, 1, 1); + + assert((dstMemory->validateRegion(origin, size)) && "The incorrect fill size!"); + // Synchronize memory from host if necessary + device::Memory::SyncFlags syncFlags; + syncFlags.skipEntire_ = dstMemory->isEntirelyCovered(origin, size); + memory->syncCacheFromHost(*this, syncFlags); + + if (!fillMemory(vcmd.type(), dstMemory, vcmd.pattern(), vcmd.patternSize(), origin, size)) { + vcmd.setStatus(CL_INVALID_OPERATION); + } // Mark this as the most-recently written cache of the destination - amdMemory->signalWrite(&gpuDevice_); - return true; + dstMemory->signalWrite(&gpuDevice_); + } else { + // for FGS capable device, fill CPU memory directly + amd::SvmBuffer::memFill(vcmd.dst(), vcmd.pattern(), vcmd.patternSize(), vcmd.times()); + } + + profilingEnd(vcmd); } -void 
-VirtualGPU::submitFillMemory(amd::FillMemoryCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); +void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); - profilingBegin(vcmd, true); + profilingBegin(vcmd, true); - if (!fillMemory(vcmd.type(), &vcmd.memory(),vcmd.pattern(), - vcmd.patternSize(), vcmd.origin(), vcmd.size())) { - vcmd.setStatus(CL_INVALID_OPERATION); + std::vector::const_iterator itr; + for (itr = vcmd.memObjects().begin(); itr != vcmd.memObjects().end(); itr++) { + // Find device memory + pal::Memory* memory = dev().getGpuMemory(*itr); + + if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_HOST) { + memory->mgpuCacheWriteBack(); + } else if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) { + // Synchronize memory from host if necessary. + // The sync function will perform memory migration from + // another device if necessary + device::Memory::SyncFlags syncFlags; + memory->syncCacheFromHost(*this, syncFlags); + } else { + LogWarning("Unknown operation for memory migration!"); } + } - profilingEnd(vcmd); + profilingEnd(vcmd); } -void -VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); +void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& vcmd) { + // in-order semantics: previous commands need to be done before we start + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); - profilingBegin(vcmd, true); - - //no op for FGS supported device - if (!dev().isFineGrainedSystem()) { - // Make sure we have memory for the command execution - pal::Memory* memory = dev().getGpuMemory(vcmd.getSvmMem()); - - memory->saveMapInfo(vcmd.svmPtr(), vcmd.origin(), vcmd.size(), - 
vcmd.mapFlags(), vcmd.isEntireMemory()); - - if (memory->mapMemory() != nullptr) { - if (vcmd.mapFlags() & (CL_MAP_READ | CL_MAP_WRITE)) { - assert(memory->desc().buffer_ && "SVM memory can't be an image"); - if (!blitMgr().copyBuffer(*memory, *memory->mapMemory(), - vcmd.origin(), vcmd.origin(), vcmd.size(), vcmd.isEntireMemory())) { - LogError("submitSVMMapMemory() - copy failed"); - vcmd.setStatus(CL_MAP_FAILURE); - } - } - } - else if ((memory->owner()->getHostMem() != nullptr) && memory->isDirectMap()) { - if (!memory->isHostMemDirectAccess()) { - // Make sure GPU finished operation before - // synchronization with the backing store - memory->wait(*this); - } - - // Target is the backing store, so just ensure that owner is up-to-date - memory->owner()->cacheWriteBack(); - } - else { - LogError("Unhandled svm map!"); - } + profilingBegin(vcmd); + std::vector& svmPointers = vcmd.svmPointers(); + if (vcmd.pfnFreeFunc() == nullptr) { + // pointers allocated using clSVMAlloc + for (cl_uint i = 0; i < svmPointers.size(); i++) { + dev().svmFree(svmPointers[i]); } - - profilingEnd(vcmd); + } else { + vcmd.pfnFreeFunc()(as_cl(vcmd.queue()->asCommandQueue()), svmPointers.size(), + static_cast(&(svmPointers[0])), vcmd.userData()); + } + profilingEnd(vcmd); } -void -VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - profilingBegin(vcmd, true); +void VirtualGPU::findIterations(const amd::NDRangeContainer& sizes, const amd::NDRange& local, + amd::NDRange& groups, amd::NDRange& remainder, size_t& extra) { + size_t dimensions = sizes.dimensions(); - //no op for FGS supported device - if (!dev().isFineGrainedSystem()) { - pal::Memory* memory = dev().getGpuMemory(vcmd.getSvmMem()); - const device::Memory::WriteMapInfo* writeMapInfo = - memory->writeMapInfo(vcmd.svmPtr()); + if (cal()->iterations_ > 1) { + size_t iterations = cal()->iterations_; + 
cal_.iterations_ = 1; - if (memory->mapMemory() != nullptr) { - if (writeMapInfo->isUnmapWrite()) { - amd::Coord3D srcOrigin(0, 0, 0); - // Target is a remote resource, so copy - assert(memory->desc().buffer_ && "SVM memory can't be an image"); - if (!blitMgr().copyBuffer(*memory->mapMemory(), *memory, - writeMapInfo->origin_, writeMapInfo->origin_, - writeMapInfo->region_, writeMapInfo->isEntire())) { - LogError("submitSvmUnmapMemory() - copy failed"); - vcmd.setStatus(CL_OUT_OF_RESOURCES); - } - } + // Find the total amount of all groups + groups = sizes.global() / local; + if (dev().settings().partialDispatch_) { + for (uint j = 0; j < dimensions; ++j) { + if ((sizes.global()[j] % local[j]) != 0) { + groups[j]++; } - else if ((memory->owner()->getHostMem() != nullptr) && memory->isDirectMap()) { - if (writeMapInfo->isUnmapWrite()) { - // Target is the backing store, so sync - memory->owner()->signalWrite(nullptr); - memory->syncCacheFromHost(*this); - } - } - memory->clearUnmapInfo(vcmd.svmPtr()); + } } - profilingEnd(vcmd); + // Calculate the real number of required iterations and + // the workgroup size of each iteration + for (int j = (dimensions - 1); j >= 0; --j) { + // Find possible size of each iteration + size_t tmp = (groups[j] / iterations); + // Make sure the group size is more than 1 + if (tmp > 0) { + remainder = groups; + remainder[j] = (groups[j] % tmp); + + extra = ((groups[j] / tmp) + + // Check for the remainder + ((remainder[j] != 0) ? 1 : 0)); + // Recalculate the number of iterations + cal_.iterations_ *= extra; + if (remainder[j] == 0) { + extra = 0; + } + groups[j] = tmp; + break; + } else { + iterations = ((iterations / groups[j]) + (((iterations % groups[j]) != 0) ? 
1 : 0)); + cal_.iterations_ *= groups[j]; + groups[j] = 1; + } + } + } } -void -VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); +void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); - profilingBegin(vcmd, true); + profilingBegin(vcmd); - if (!dev().isFineGrainedSystem()) { - size_t patternSize = vcmd.patternSize(); - size_t fillSize = patternSize * vcmd.times(); - size_t offset = 0; - amd::Memory* dstMemory = amd::SvmManager::FindSvmBuffer(vcmd.dst()); - assert(dstMemory&&"No svm Buffer to fill with!"); - offset = reinterpret_cast(vcmd.dst()) - - reinterpret_cast(dstMemory->getSvmPtr()); - assert((offset >= 0) && "wrong svm ptr to fill with!"); + // Submit kernel to HW + if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, &vcmd.event())) { + vcmd.setStatus(CL_INVALID_OPERATION); + } - pal::Memory* memory = dev().getGpuMemory(dstMemory); - - amd::Coord3D origin(offset, 0, 0); - amd::Coord3D size(fillSize, 1, 1); - - assert((dstMemory->validateRegion(origin, size)) && "The incorrect fill size!"); - // Synchronize memory from host if necessary - device::Memory::SyncFlags syncFlags; - syncFlags.skipEntire_ = dstMemory->isEntirelyCovered(origin, size); - memory->syncCacheFromHost(*this, syncFlags); - - if (!fillMemory(vcmd.type(), dstMemory, vcmd.pattern(), - vcmd.patternSize(), origin, size)) { - vcmd.setStatus(CL_INVALID_OPERATION); - } - // Mark this as the most-recently written cache of the destination - dstMemory->signalWrite(&gpuDevice_); - } - else { - // for FGS capable device, fill CPU memory directly - amd::SvmBuffer::memFill(vcmd.dst(), vcmd.pattern(), vcmd.patternSize(), vcmd.times()); - } - - profilingEnd(vcmd); + profilingEnd(vcmd); } -void 
-VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); +bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel, + const_address parameters, bool nativeMem, + amd::Event* enqueueEvent) { + uint64_t vmParentWrap = 0; + uint64_t vmDefQueue = 0; + amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev()); + VirtualGPU* gpuDefQueue = nullptr; + amd::HwDebugManager* dbgManager = dev().hwDebugMgr(); - profilingBegin(vcmd, true); + AddKernel(kernel); - std::vector::const_iterator itr; - for (itr = vcmd.memObjects().begin(); itr != vcmd.memObjects().end(); itr++) { - // Find device memory - pal::Memory* memory = dev().getGpuMemory(*itr); + // Get the HSA kernel object + const HSAILKernel& hsaKernel = static_cast(*(kernel.getDeviceKernel(dev()))); + std::vector dispMemList; //!< Memory list of all mem objects used in the disaptch - if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_HOST) { - memory->mgpuCacheWriteBack(); - } - else if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) { - // Synchronize memory from host if necessary. - // The sync function will perform memory migration from - // another device if necessary - device::Memory::SyncFlags syncFlags; - memory->syncCacheFromHost(*this, syncFlags); - } - else { - LogWarning("Unknown operation for memory migration!"); + bool printfEnabled = (hsaKernel.printfInfo().size() > 0) ? 
true : false; + if (!printfDbgHSA().init(*this, printfEnabled)) { + LogError("Printf debug buffer initialization failed!"); + return false; + } + + // Check memory dependency and SVM objects + if (!processMemObjectsHSA(kernel, parameters, nativeMem, &dispMemList)) { + LogError("Wrong memory objects!"); + return false; + } + + if (hsaKernel.dynamicParallelism()) { + if (nullptr == defQueue) { + LogError("Default device queue wasn't allocated"); + return false; + } else { + if (dev().settings().useDeviceQueue_) { + gpuDefQueue = static_cast(defQueue->vDev()); + if (gpuDefQueue->hwRing() == hwRing()) { + LogError("Can't submit the child kernels to the same HW ring as the host queue!"); + return false; } + } else { + createVirtualQueue(defQueue->size()); + gpuDefQueue = this; + } + } + vmDefQueue = gpuDefQueue->virtualQueue_->vmAddress(); + + // Add memory handles before the actual dispatch + dispMemList.push_back(gpuDefQueue->virtualQueue_); + dispMemList.push_back(gpuDefQueue->schedParams_); + dispMemList.push_back(hsaKernel.prog().kernelTable()); + gpuDefQueue->writeVQueueHeader(*this, hsaKernel.prog().kernelTable()->vmAddress()); + } + + // setup the storage for the memory pointers of the kernel parameters + uint numParams = kernel.signature().numParameters(); + if (dbgManager) { + dbgManager->allocParamMemList(numParams); + } + + bool needFlush = false; + dmaFlushMgmt_.findSplitSize(dev(), sizes.global().product(), hsaKernel.aqlCodeSize()); + if (dmaFlushMgmt().dispatchSplitSize() != 0) { + needFlush = true; + } + + size_t newOffset[3] = {0, 0, 0}; + size_t newGlobalSize[3] = {0, 0, 0}; + + int dim = -1; + int iteration = 1; + size_t globalStep = 0; + for (uint i = 0; i < sizes.dimensions(); i++) { + newGlobalSize[i] = sizes.global()[i]; + newOffset[i] = sizes.offset()[i]; + } + // Check if it is blit kernel. If it is, then check if split is needed. 
+ if (hsaKernel.isInternalKernel()) { + // Calculate new group size for each submission + for (uint i = 0; i < sizes.dimensions(); i++) { + if (sizes.global()[i] > static_cast(0xffffffff)) { + dim = i; + iteration = sizes.global()[i] / 0xC0000000 + ((sizes.global()[i] % 0xC0000000) ? 1 : 0); + globalStep = (sizes.global()[i] / sizes.local()[i]) / iteration * sizes.local()[dim]; + break; + } + } + } + + for (int j = 0; j < iteration; j++) { + // Reset global size for dimension dim if split is needed + if (dim != -1) { + newOffset[dim] = sizes.offset()[dim] + globalStep * j; + if (((newOffset[dim] + globalStep) < sizes.global()[dim]) && (j != (iteration - 1))) { + newGlobalSize[dim] = globalStep; + } else { + newGlobalSize[dim] = sizes.global()[dim] - newOffset[dim]; + } } - profilingEnd(vcmd); -} + amd::NDRangeContainer tmpSizes(sizes.dimensions(), &newOffset[0], &newGlobalSize[0], + &(const_cast(sizes).local()[0])); -void -VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& vcmd) -{ - // in-order semantics: previous commands need to be done before we start - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - - profilingBegin(vcmd); - std::vector& svmPointers = vcmd.svmPointers(); - if (vcmd.pfnFreeFunc() == nullptr) { - // pointers allocated using clSVMAlloc - for (cl_uint i = 0; i < svmPointers.size(); i++) { - dev().svmFree(svmPointers[i]); - } - } - else { - vcmd.pfnFreeFunc()(as_cl(vcmd.queue()->asCommandQueue()), svmPointers.size(), - static_cast(&(svmPointers[0])), vcmd.userData()); - } - profilingEnd(vcmd); -} - -void -VirtualGPU::findIterations( - const amd::NDRangeContainer& sizes, - const amd::NDRange& local, - amd::NDRange& groups, - amd::NDRange& remainder, - size_t& extra) -{ - size_t dimensions = sizes.dimensions(); - - if (cal()->iterations_ > 1) { - size_t iterations = cal()->iterations_; - cal_.iterations_ = 1; - - // Find the total amount of all groups - groups = sizes.global() / local; - 
if (dev().settings().partialDispatch_) { - for (uint j = 0; j < dimensions; ++j) { - if ((sizes.global()[j] % local[j]) != 0) { - groups[j]++; - } - } - } - - // Calculate the real number of required iterations and - // the workgroup size of each iteration - for (int j = (dimensions - 1); j >= 0; --j) { - // Find possible size of each iteration - size_t tmp = (groups[j] / iterations); - // Make sure the group size is more than 1 - if (tmp > 0) { - remainder = groups; - remainder[j] = (groups[j] % tmp); - - extra = ((groups[j] / tmp) + - // Check for the remainder - ((remainder[j] != 0) ? 1 : 0)); - // Recalculate the number of iterations - cal_.iterations_ *= extra; - if (remainder[j] == 0) { - extra = 0; - } - groups[j] = tmp; - break; - } - else { - iterations = ((iterations / groups[j]) + - (((iterations % groups[j]) != 0) ? 1 : 0)); - cal_.iterations_ *= groups[j]; - groups[j] = 1; - } - } - } -} - -void -VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - - profilingBegin(vcmd); - - // Submit kernel to HW - if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, - &vcmd.event())) { - vcmd.setStatus(CL_INVALID_OPERATION); + // Program the kernel arguments for the GPU execution + hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments( + *this, kernel, tmpSizes, parameters, nativeMem, vmDefQueue, &vmParentWrap, dispMemList); + if (nullptr == aqlPkt) { + LogError("Couldn't load kernel arguments"); + return false; } - profilingEnd(vcmd); -} - -bool -VirtualGPU::submitKernelInternal( - const amd::NDRangeContainer& sizes, - const amd::Kernel& kernel, - const_address parameters, - bool nativeMem, - amd::Event* enqueueEvent) -{ - uint64_t vmParentWrap = 0; - uint64_t vmDefQueue = 0; - amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev()); - VirtualGPU* gpuDefQueue = nullptr; - amd::HwDebugManager * 
dbgManager = dev().hwDebugMgr(); - - AddKernel(kernel); - - // Get the HSA kernel object - const HSAILKernel& hsaKernel = - static_cast(*(kernel.getDeviceKernel(dev()))); - std::vector dispMemList; //!< Memory list of all mem objects used in the disaptch - - bool printfEnabled = (hsaKernel.printfInfo().size() > 0) ? true:false; - if (!printfDbgHSA().init(*this, printfEnabled )) { - LogError( "Printf debug buffer initialization failed!"); - return false; + const Device::ScratchBuffer* scratch = nullptr; + // Check if the device allocated more registers than the old setup + if (hsaKernel.workGroupInfo()->scratchRegs_ > 0) { + scratch = dev().scratch(hwRing()); + dispMemList.push_back(scratch->memObj_); } - // Check memory dependency and SVM objects - if (!processMemObjectsHSA(kernel, parameters, nativeMem, &dispMemList)) { - LogError("Wrong memory objects!"); - return false; + // Add GSL handle to the memory list for VidMM + for (uint i = 0; i < dispMemList.size(); ++i) { + addVmMemory(dispMemList[i]); + if (dispMemList[i]->desc().isDoppTexture_) { + addDoppRef(dispMemList[i], kernel.parameters().getExecNewVcop(), + kernel.parameters().getExecPfpaVcop()); + } + } + + // HW Debug for the kernel? + HwDbgKernelInfo kernelInfo; + HwDbgKernelInfo* pKernelInfo = nullptr; + + if (dbgManager) { + buildKernelInfo(hsaKernel, aqlPkt, kernelInfo, enqueueEvent); + pKernelInfo = &kernelInfo; + } + + GpuEvent gpuEvent; + // Set up the dispatch information + Pal::DispatchAqlParams dispatchParam = {}; + dispatchParam.pAqlPacket = aqlPkt; + if (nullptr != scratch) { + dispatchParam.scratchAddr = scratch->memObj_->vmAddress(); + dispatchParam.scratchSize = scratch->size_; + dispatchParam.scratchOffset = scratch->offset_; + } + dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlCode(); + dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress(); + dispatchParam.wavesPerSh = hsaKernel.getWavesPerSH(this); + dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? 
true : false; + // Run AQL dispatch in HW + eventBegin(MainEngine); + iCmd()->CmdDispatchAql(dispatchParam); + eventEnd(MainEngine, gpuEvent); + + if (dbgManager && (nullptr != dbgManager->postDispatchCallBackFunc())) { + dbgManager->executePostDispatchCallBack(); } if (hsaKernel.dynamicParallelism()) { - if (nullptr == defQueue) { - LogError("Default device queue wasn't allocated"); - return false; - } - else { - if (dev().settings().useDeviceQueue_) { - gpuDefQueue = static_cast(defQueue->vDev()); - if (gpuDefQueue->hwRing() == hwRing()) { - LogError("Can't submit the child kernels to the same HW ring as the host queue!"); - return false; - } + // Make sure exculsive access to the device queue + amd::ScopedLock(defQueue->lock()); + + if (GPU_PRINT_CHILD_KERNEL != 0) { + waitForEvent(&gpuEvent); + + AmdAqlWrap* wraps = + (AmdAqlWrap*)(&((AmdVQueueHeader*)gpuDefQueue->virtualQueue_->data())[1]); + uint p = 0; + for (uint i = 0; i < gpuDefQueue->vqHeader_->aql_slot_num; ++i) { + if (wraps[i].state != 0) { + uint j; + if (p == GPU_PRINT_CHILD_KERNEL) { + break; } - else { - createVirtualQueue(defQueue->size()); - gpuDefQueue = this; + p++; + std::stringstream print; + print.flags(std::ios::right | std::ios_base::hex | std::ios_base::uppercase); + print << "Slot#: " << i << "\n"; + print << "\tenqueue_flags: " << wraps[i].enqueue_flags << "\n"; + print << "\tcommand_id: " << wraps[i].command_id << "\n"; + print << "\tchild_counter: " << wraps[i].child_counter << "\n"; + print << "\tcompletion: " << wraps[i].completion << "\n"; + print << "\tparent_wrap: " << wraps[i].parent_wrap << "\n"; + print << "\twait_list: " << wraps[i].wait_list << "\n"; + print << "\twait_num: " << wraps[i].wait_num << "\n"; + uint offsEvents = wraps[i].wait_list - gpuDefQueue->virtualQueue_->vmAddress(); + size_t* events = + reinterpret_cast(gpuDefQueue->virtualQueue_->data() + offsEvents); + for (j = 0; j < wraps[i].wait_num; ++j) { + uint offs = + static_cast(events[j]) - 
gpuDefQueue->virtualQueue_->vmAddress(); + AmdEvent* eventD = (AmdEvent*)(gpuDefQueue->virtualQueue_->data() + offs); + print << "Wait Event#: " << j << "\n"; + print << "\tState: " << eventD->state << "; Counter: " << eventD->counter << "\n"; } - } - vmDefQueue = gpuDefQueue->virtualQueue_->vmAddress(); + print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size_x << ", "; + print << wraps[i].aql.workgroup_size_y << ", "; + print << wraps[i].aql.workgroup_size_z << "]\n"; + print << "GridSize[ " << wraps[i].aql.grid_size_x << ", "; + print << wraps[i].aql.grid_size_y << ", "; + print << wraps[i].aql.grid_size_z << "]\n"; - // Add memory handles before the actual dispatch - dispMemList.push_back(gpuDefQueue->virtualQueue_); - dispMemList.push_back(gpuDefQueue->schedParams_); - dispMemList.push_back(hsaKernel.prog().kernelTable()); - gpuDefQueue->writeVQueueHeader(*this, - hsaKernel.prog().kernelTable()->vmAddress()); - } - - // setup the storage for the memory pointers of the kernel parameters - uint numParams = kernel.signature().numParameters(); - if (dbgManager) { - dbgManager->allocParamMemList(numParams); - } - - bool needFlush = false; - dmaFlushMgmt_.findSplitSize(dev(), sizes.global().product(), hsaKernel.aqlCodeSize()); - if (dmaFlushMgmt().dispatchSplitSize() != 0) { - needFlush = true; - } - - size_t newOffset[3] = {0, 0, 0}; - size_t newGlobalSize[3] = {0, 0, 0}; - - int dim = -1; - int iteration = 1; - size_t globalStep = 0; - for (uint i = 0; i < sizes.dimensions(); i++) { - newGlobalSize[i] = sizes.global()[i]; - newOffset[i] = sizes.offset()[i]; - } - // Check if it is blit kernel. If it is, then check if split is needed. - if (hsaKernel.isInternalKernel()) { - // Calculate new group size for each submission - for (uint i = 0; i < sizes.dimensions(); i++) { - if (sizes.global()[i] > static_cast(0xffffffff)) { - dim = i; - iteration = sizes.global()[i] / 0xC0000000 - + ((sizes.global()[i] % 0xC0000000) ? 
1: 0); - globalStep = (sizes.global()[i] / sizes.local()[i]) / iteration - * sizes.local()[dim]; + uint64_t* kernels = + (uint64_t*)(const_cast(hsaKernel.prog().kernelTable())->map(this)); + for (j = 0; j < hsaKernel.prog().kernels().size(); ++j) { + if (kernels[j] == wraps[i].aql.kernel_object) { break; + } } - } - } - - for (int j = 0; j < iteration; j++) { - // Reset global size for dimension dim if split is needed - if (dim != -1) { - newOffset[dim] = sizes.offset()[dim] + globalStep * j; - if (((newOffset[dim] + globalStep) < sizes.global()[dim]) && - (j != (iteration - 1))) { - newGlobalSize[dim] = globalStep; + const_cast(hsaKernel.prog().kernelTable())->unmap(this); + HSAILKernel* child = nullptr; + for (auto it = hsaKernel.prog().kernels().begin(); + it != hsaKernel.prog().kernels().end(); ++it) { + if (j == static_cast(it->second)->index()) { + child = static_cast(it->second); + } } - else { - newGlobalSize[dim] = sizes.global()[dim] - newOffset[dim]; + if (child == nullptr) { + printf("Error: couldn't find child kernel!\n"); + continue; } - } - - amd::NDRangeContainer tmpSizes(sizes.dimensions(), - &newOffset[0], &newGlobalSize[0], - &(const_cast(sizes).local()[0])); - - // Program the kernel arguments for the GPU execution - hsa_kernel_dispatch_packet_t* aqlPkt = - hsaKernel.loadArguments(*this, kernel, tmpSizes, parameters, nativeMem, - vmDefQueue, &vmParentWrap, dispMemList); - if (nullptr == aqlPkt) { - LogError("Couldn't load kernel arguments"); - return false; - } - - const Device::ScratchBuffer* scratch = nullptr; - // Check if the device allocated more registers than the old setup - if (hsaKernel.workGroupInfo()->scratchRegs_ > 0) { - scratch = dev().scratch(hwRing()); - dispMemList.push_back(scratch->memObj_); - } - - // Add GSL handle to the memory list for VidMM - for (uint i = 0; i < dispMemList.size(); ++i) { - addVmMemory(dispMemList[i]); - if (dispMemList[i]->desc().isDoppTexture_) { - addDoppRef(dispMemList[i], - 
kernel.parameters().getExecNewVcop(), - kernel.parameters().getExecPfpaVcop()); + const uint64_t kernarg_address = + static_cast(reinterpret_cast(wraps[i].aql.kernarg_address)); + uint offsArg = kernarg_address - gpuDefQueue->virtualQueue_->vmAddress(); + address argum = gpuDefQueue->virtualQueue_->data() + offsArg; + print << "Kernel: " << child->name() << "\n"; + for (auto arg : child->arguments()) { + const char* extraArgName = nullptr; + switch (arg->type_) { + case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: + extraArgName = "Offset0: "; + break; + case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: + extraArgName = "Offset1: "; + break; + case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: + extraArgName = "Offset2: "; + break; + case HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER: + extraArgName = "PrintfBuf: "; + break; + case HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE: + extraArgName = "VqueuePtr: "; + break; + case HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION: + extraArgName = "AqlWrap: "; + break; + case HSAIL_ARGTYPE_HIDDEN_NONE: + extraArgName = "Unknown: "; + break; + default: + break; + } + if (extraArgName) { + print << "\t" << extraArgName << *(size_t*)argum; + print << "\n"; + argum += sizeof(size_t); + continue; + } + print << "\t" << arg->name_ << ": "; + for (int s = arg->size_ - 1; s >= 0; --s) { + print.width(2); + print.fill('0'); + print << (uint32_t)(argum[s]); + } + argum += arg->size_; + print << "\n"; } + printf("%s", print.str().c_str()); + } } + } - // HW Debug for the kernel? 
- HwDbgKernelInfo kernelInfo; - HwDbgKernelInfo *pKernelInfo = nullptr; - - if (dbgManager) { - buildKernelInfo(hsaKernel, aqlPkt, kernelInfo, enqueueEvent); - pKernelInfo = &kernelInfo; - } - - GpuEvent gpuEvent; - // Set up the dispatch information - Pal::DispatchAqlParams dispatchParam = {}; - dispatchParam.pAqlPacket = aqlPkt; - if (nullptr != scratch) { - dispatchParam.scratchAddr = scratch->memObj_->vmAddress(); - dispatchParam.scratchSize = scratch->size_; - dispatchParam.scratchOffset = scratch->offset_; - } - dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlCode(); - dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress(); - dispatchParam.wavesPerSh = hsaKernel.getWavesPerSH(this); - dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false; - // Run AQL dispatch in HW + if (!dev().settings().useDeviceQueue_) { + // Add the termination handshake to the host queue eventBegin(MainEngine); - iCmd()->CmdDispatchAql(dispatchParam); + iCmd()->CmdVirtualQueueHandshake(vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, + vmParentWrap + offsetof(AmdAqlWrap, child_counter), 0, + dev().settings().useDeviceQueue_); eventEnd(MainEngine, gpuEvent); + } - if (dbgManager && (nullptr != dbgManager->postDispatchCallBackFunc())) { - dbgManager->executePostDispatchCallBack(); - } + // Get the global loop start before the scheduler + Pal::gpusize loopStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart(); + static_cast(gpuDefQueue->blitMgr()) + .runScheduler(*gpuDefQueue->virtualQueue_, *gpuDefQueue->schedParams_, + gpuDefQueue->schedParamIdx_, + gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_)); + const static bool FlushL2 = true; + gpuDefQueue->flushCUCaches(FlushL2); - if (hsaKernel.dynamicParallelism()) { - // Make sure exculsive access to the device queue - amd::ScopedLock(defQueue->lock()); + // Get the address of PM4 template and add write it to params + //! 
@note DMA flush must not occur between patch and the scheduler + Pal::gpusize patchStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart(); + // Program parameters for the scheduler + SchedulerParam* param = &reinterpret_cast( + gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_]; + param->signal = 1; + // Scale clock to 1024 to avoid 64 bit div in the scheduler + param->eng_clk = (1000 * 1024) / dev().info().maxClockFrequency_; + param->hw_queue = patchStart + sizeof(uint32_t) /* Rewind packet*/; + param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress(); + param->releaseHostCP = 0; + param->parentAQL = vmParentWrap; + param->dedicatedQueue = dev().settings().useDeviceQueue_; + param->useATC = dev().settings().svmFineGrainSystem_; - if (GPU_PRINT_CHILD_KERNEL != 0) { - waitForEvent(&gpuEvent); + // Fill the scratch buffer information + if (hsaKernel.prog().maxScratchRegs() > 0) { + pal::Memory* scratchBuf = dev().scratch(gpuDefQueue->hwRing())->memObj_; + param->scratchSize = scratchBuf->size(); + param->scratch = scratchBuf->vmAddress(); + param->numMaxWaves = 32 * dev().info().maxComputeUnits_; + param->scratchOffset = dev().scratch(gpuDefQueue->hwRing())->offset_; + dispMemList.push_back(scratchBuf); + } else { + param->numMaxWaves = 0; + param->scratchSize = 0; + param->scratch = 0; + param->scratchOffset = 0; + } - AmdAqlWrap* wraps = (AmdAqlWrap*)(&((AmdVQueueHeader*)gpuDefQueue->virtualQueue_->data())[1]); - uint p = 0; - for (uint i = 0; i < gpuDefQueue->vqHeader_->aql_slot_num; ++i) { - if (wraps[i].state != 0) { - uint j; - if (p == GPU_PRINT_CHILD_KERNEL) { - break; - } - p++; - std::stringstream print; - print.flags(std::ios::right | std::ios_base::hex | std::ios_base::uppercase); - print << "Slot#: " << i << "\n"; - print << "\tenqueue_flags: " << wraps[i].enqueue_flags << "\n"; - print << "\tcommand_id: " << wraps[i].command_id << "\n"; - print << "\tchild_counter: " << wraps[i].child_counter << "\n"; - print << "\tcompletion: " 
<< wraps[i].completion << "\n"; - print << "\tparent_wrap: " << wraps[i].parent_wrap << "\n"; - print << "\twait_list: " << wraps[i].wait_list << "\n"; - print << "\twait_num: " << wraps[i].wait_num << "\n"; - uint offsEvents = wraps[i].wait_list - - gpuDefQueue->virtualQueue_->vmAddress(); - size_t* events = reinterpret_cast( - gpuDefQueue->virtualQueue_->data() + offsEvents); - for (j = 0; j < wraps[i].wait_num; ++j) { - uint offs = static_cast(events[j]) - - gpuDefQueue->virtualQueue_->vmAddress(); - AmdEvent* eventD = (AmdEvent*)(gpuDefQueue->virtualQueue_->data() + offs); - print << "Wait Event#: " << j << "\n"; - print << "\tState: " << eventD->state << - "; Counter: " << eventD->counter << "\n"; - } - print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size_x << ", "; - print << wraps[i].aql.workgroup_size_y << ", "; - print << wraps[i].aql.workgroup_size_z << "]\n"; - print << "GridSize[ " << wraps[i].aql.grid_size_x << ", "; - print << wraps[i].aql.grid_size_y << ", "; - print << wraps[i].aql.grid_size_z << "]\n"; + // Add all kernels in the program to the mem list. + //! 
\note Runtime doesn't know which one will be called + hsaKernel.prog().fillResListWithKernels(dispMemList); - uint64_t* kernels = (uint64_t*)( - const_cast(hsaKernel.prog().kernelTable())->map(this)); - for (j = 0; j < hsaKernel.prog().kernels().size(); ++j) { - if (kernels[j] == wraps[i].aql.kernel_object) { - break; - } - } - const_cast(hsaKernel.prog().kernelTable())->unmap(this); - HSAILKernel* child = nullptr; - for (auto it = hsaKernel.prog().kernels().begin(); - it != hsaKernel.prog().kernels().end(); ++it) { - if (j == static_cast(it->second)->index()) { - child = static_cast(it->second); - } - } - if (child == nullptr) { - printf("Error: couldn't find child kernel!\n"); - continue; - } - const uint64_t kernarg_address = - static_cast(reinterpret_cast(wraps[i].aql.kernarg_address)); - uint offsArg = kernarg_address - - gpuDefQueue->virtualQueue_->vmAddress(); - address argum = gpuDefQueue->virtualQueue_->data() + offsArg; - print << "Kernel: " << child->name() << "\n"; - for (auto arg : child->arguments()) { - const char* extraArgName = nullptr; - switch (arg->type_) { - case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: extraArgName = "Offset0: "; break; - case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: extraArgName = "Offset1: "; break; - case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: extraArgName = "Offset2: "; break; - case HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER: extraArgName = "PrintfBuf: "; break; - case HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE: extraArgName = "VqueuePtr: "; break; - case HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION: extraArgName = "AqlWrap: "; break; - case HSAIL_ARGTYPE_HIDDEN_NONE: extraArgName = "Unknown: "; break; - default: break; - } - if (extraArgName) { - print << "\t" << extraArgName << *(size_t*)argum; - print << "\n"; - argum += sizeof(size_t); - continue; - } - print << "\t" << arg->name_ << ": "; - for (int s = arg->size_ - 1; s >= 0; --s) { - print.width(2); - print.fill('0'); - print << (uint32_t)(argum[s]); - } - argum += arg->size_; - print << "\n"; 
- } - printf("%s", print.str().c_str()); - } - } - } + // Add GPU memory handle to the memory list for VidMM + for (uint i = 0; i < dispMemList.size(); ++i) { + gpuDefQueue->addVmMemory(dispMemList[i]); + } - if (!dev().settings().useDeviceQueue_) { - // Add the termination handshake to the host queue - eventBegin(MainEngine); - iCmd()->CmdVirtualQueueHandshake( - vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, - vmParentWrap + offsetof(AmdAqlWrap, child_counter), - 0, dev().settings().useDeviceQueue_); - eventEnd(MainEngine, gpuEvent); - } + Pal::gpusize signalAddr = gpuDefQueue->schedParams_->vmAddress() + + gpuDefQueue->schedParamIdx_ * sizeof(SchedulerParam); + gpuDefQueue->eventBegin(MainEngine); + gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherEnd( + signalAddr, loopStart, + gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_)); + // Note: Device enqueue can't have extra commands after INDIRECT_BUFFER call. + // Thus TS command for profiling has to follow in the next CB. + constexpr bool ForceSubmitFirst = true; + gpuDefQueue->eventEnd(MainEngine, gpuEvent, ForceSubmitFirst); - // Get the global loop start before the scheduler - Pal::gpusize loopStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart(); - static_cast(gpuDefQueue->blitMgr()).runScheduler( - *gpuDefQueue->virtualQueue_, - *gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_, - gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_)); - const static bool FlushL2 = true; - gpuDefQueue->flushCUCaches(FlushL2); + // Set GPU event for the used resources + for (uint i = 0; i < dispMemList.size(); ++i) { + dispMemList[i]->setBusy(*gpuDefQueue, gpuEvent); + } - // Get the address of PM4 template and add write it to params - //! 
@note DMA flush must not occur between patch and the scheduler - Pal::gpusize patchStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart(); - // Program parameters for the scheduler - SchedulerParam* param = &reinterpret_cast - (gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_]; - param->signal = 1; - // Scale clock to 1024 to avoid 64 bit div in the scheduler - param->eng_clk = (1000 * 1024) / dev().info().maxClockFrequency_; - param->hw_queue = patchStart + sizeof(uint32_t)/* Rewind packet*/; - param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress(); - param->releaseHostCP = 0; - param->parentAQL = vmParentWrap; - param->dedicatedQueue = dev().settings().useDeviceQueue_; - param->useATC = dev().settings().svmFineGrainSystem_; + if (dev().settings().useDeviceQueue_) { + // Add the termination handshake to the host queue + eventBegin(MainEngine); + iCmd()->CmdVirtualQueueHandshake(vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, + vmParentWrap + offsetof(AmdAqlWrap, child_counter), + signalAddr, dev().settings().useDeviceQueue_); + eventEnd(MainEngine, gpuEvent); + } - // Fill the scratch buffer information - if (hsaKernel.prog().maxScratchRegs() > 0) { - pal::Memory* scratchBuf = dev().scratch(gpuDefQueue->hwRing())->memObj_; - param->scratchSize = scratchBuf->size(); - param->scratch = scratchBuf->vmAddress(); - param->numMaxWaves = 32 * dev().info().maxComputeUnits_; - param->scratchOffset = dev().scratch(gpuDefQueue->hwRing())->offset_; - dispMemList.push_back(scratchBuf); - } - else { - param->numMaxWaves = 0; - param->scratchSize = 0; - param->scratch = 0; - param->scratchOffset = 0; - } - - // Add all kernels in the program to the mem list. - //! 
\note Runtime doesn't know which one will be called - hsaKernel.prog().fillResListWithKernels(dispMemList); - - // Add GPU memory handle to the memory list for VidMM - for (uint i = 0; i < dispMemList.size(); ++i) { - gpuDefQueue->addVmMemory(dispMemList[i]); - } - - Pal::gpusize signalAddr = gpuDefQueue->schedParams_->vmAddress() + - gpuDefQueue->schedParamIdx_ * sizeof(SchedulerParam); - gpuDefQueue->eventBegin(MainEngine); - gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherEnd( - signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num / - (DeviceQueueMaskSize * maskGroups_)); - // Note: Device enqueue can't have extra commands after INDIRECT_BUFFER call. - // Thus TS command for profiling has to follow in the next CB. - constexpr bool ForceSubmitFirst = true; - gpuDefQueue->eventEnd(MainEngine, gpuEvent, ForceSubmitFirst); - - // Set GPU event for the used resources - for (uint i = 0; i < dispMemList.size(); ++i) { - dispMemList[i]->setBusy(*gpuDefQueue, gpuEvent); - } - - if (dev().settings().useDeviceQueue_) { - // Add the termination handshake to the host queue - eventBegin(MainEngine); - iCmd()->CmdVirtualQueueHandshake( - vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, - vmParentWrap + offsetof(AmdAqlWrap, child_counter), - signalAddr, dev().settings().useDeviceQueue_); - eventEnd(MainEngine, gpuEvent); - } - - ++gpuDefQueue->schedParamIdx_ %= - gpuDefQueue->schedParams_->size() / sizeof(SchedulerParam); - //! 
\todo optimize the wrap around - if (gpuDefQueue->schedParamIdx_ == 0) { - gpuDefQueue->schedParams_->wait(*gpuDefQueue); - } - } - - // Set GPU event for the used resources - for (uint i = 0; i < dispMemList.size(); ++i) { - dispMemList[i]->setBusy(*this, gpuEvent); - } - - // Update the global GPU event - setGpuEvent(gpuEvent, needFlush); - - if (!printfDbgHSA().output(*this, printfEnabled, hsaKernel.printfInfo())) { - LogError("Couldn't read printf data from the buffer!\n"); - return false; - } + ++gpuDefQueue->schedParamIdx_ %= gpuDefQueue->schedParams_->size() / sizeof(SchedulerParam); + //! \todo optimize the wrap around + if (gpuDefQueue->schedParamIdx_ == 0) { + gpuDefQueue->schedParams_->wait(*gpuDefQueue); + } } - return true; -} - -void -VirtualGPU::submitNativeFn(amd::NativeFnCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - - Unimplemented(); //!< @todo: Unimplemented -} - -void -VirtualGPU::submitMarker(amd::Marker& vcmd) -{ - //!@note runtime doesn't need to lock this command on execution - - if (vcmd.waitingEvent() != nullptr) { - bool foundEvent = false; - - // Loop through all outstanding command batches - while (!cbQueue_.empty()) { - auto cb = cbQueue_.front(); - // Wait for completion - foundEvent = awaitCompletion(cb, vcmd.waitingEvent()); - // Release a command batch - freeCbQueue_.push(cb); - // Remove command batch from the list - cbQueue_.pop(); - // Early exit if we found a command - if (foundEvent) break; - } - - // Event should be in the current command batch - if (!foundEvent) { - state_.forceWait_ = true; - } - // If we don't have any more batches, then assume GPU is idle - else if (cbQueue_.empty()) { - dmaFlushMgmt_.resetCbWorkload(dev()); - } + // Set GPU event for the used resources + for (uint i = 0; i < dispMemList.size(); ++i) { + dispMemList[i]->setBusy(*this, gpuEvent); } -} -GpuEvent* -VirtualGPU::getGpuEvent(Pal::IGpuMemory* iMem) -{ - return 
&gpuEvents_[iMem]; -} + // Update the global GPU event + setGpuEvent(gpuEvent, needFlush); -void -VirtualGPU::assignGpuEvent(Pal::IGpuMemory* iMem, GpuEvent gpuEvent) -{ - auto it = gpuEvents_.find(iMem); - - if (it != gpuEvents_.end()) { - it->second = gpuEvent; - } - else { - gpuEvents_[iMem] = gpuEvent; + if (!printfDbgHSA().output(*this, printfEnabled, hsaKernel.printfInfo())) { + LogError("Couldn't read printf data from the buffer!\n"); + return false; } + } + + return true; } -void -VirtualGPU::releaseMemory(Pal::IGpuMemory* iMem, bool wait) -{ - auto it = gpuEvents_.find(iMem); - //! @note if there is no wait, then it's a view release - if (wait && (it != gpuEvents_.end())) { - waitForEvent(&it->second); - queues_[MainEngine]->removeCmdMemRef(iMem); - queues_[SdmaEngine]->removeCmdMemRef(iMem); - gpuEvents_.erase(it); - } +void VirtualGPU::submitNativeFn(amd::NativeFnCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + Unimplemented(); //!< @todo: Unimplemented } -void -VirtualGPU::submitPerfCounter(amd::PerfCounterCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); +void VirtualGPU::submitMarker(amd::Marker& vcmd) { + //!@note runtime doesn't need to lock this command on execution - const amd::PerfCounterCommand::PerfCounterList counters = vcmd.getCounters(); + if (vcmd.waitingEvent() != nullptr) { + bool foundEvent = false; - PalCounterReference* palRef = PalCounterReference::Create(*this); - if (palRef == nullptr) { + // Loop through all outstanding command batches + while (!cbQueue_.empty()) { + auto cb = cbQueue_.front(); + // Wait for completion + foundEvent = awaitCompletion(cb, vcmd.waitingEvent()); + // Release a command batch + freeCbQueue_.push(cb); + // Remove command batch from the list + cbQueue_.pop(); + // Early exit if we found a command + if (foundEvent) break; + } + + // Event should be in the 
current command batch + if (!foundEvent) { + state_.forceWait_ = true; + } + // If we don't have any more batches, then assume GPU is idle + else if (cbQueue_.empty()) { + dmaFlushMgmt_.resetCbWorkload(dev()); + } + } +} + +GpuEvent* VirtualGPU::getGpuEvent(Pal::IGpuMemory* iMem) { return &gpuEvents_[iMem]; } + +void VirtualGPU::assignGpuEvent(Pal::IGpuMemory* iMem, GpuEvent gpuEvent) { + auto it = gpuEvents_.find(iMem); + + if (it != gpuEvents_.end()) { + it->second = gpuEvent; + } else { + gpuEvents_[iMem] = gpuEvent; + } +} + +void VirtualGPU::releaseMemory(Pal::IGpuMemory* iMem, bool wait) { + auto it = gpuEvents_.find(iMem); + //! @note if there is no wait, then it's a view release + if (wait && (it != gpuEvents_.end())) { + waitForEvent(&it->second); + queues_[MainEngine]->removeCmdMemRef(iMem); + queues_[SdmaEngine]->removeCmdMemRef(iMem); + gpuEvents_.erase(it); + } +} + +void VirtualGPU::submitPerfCounter(amd::PerfCounterCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + const amd::PerfCounterCommand::PerfCounterList counters = vcmd.getCounters(); + + PalCounterReference* palRef = PalCounterReference::Create(*this); + if (palRef == nullptr) { + LogError("We failed to allocate memory for the GPU perfcounter"); + vcmd.setStatus(CL_INVALID_OPERATION); + return; + } + + bool newExperiment = false; + + for (uint i = 0; i < vcmd.getNumCounters(); ++i) { + amd::PerfCounter* amdCounter = static_cast(counters[i]); + const PerfCounter* counter = static_cast(amdCounter->getDeviceCounter()); + + // Make sure we have a valid gpu performance counter + if (nullptr == counter) { + amd::PerfCounter::Properties prop = amdCounter->properties(); + PerfCounter* gpuCounter = new PerfCounter( + gpuDevice_, palRef, prop[CL_PERFCOUNTER_GPU_BLOCK_INDEX], + prop[CL_PERFCOUNTER_GPU_COUNTER_INDEX], prop[CL_PERFCOUNTER_GPU_EVENT_INDEX]); + if (nullptr == gpuCounter) { LogError("We failed to allocate memory for 
the GPU perfcounter"); vcmd.setStatus(CL_INVALID_OPERATION); return; - } - - bool newExperiment = false; - - for (uint i = 0; i < vcmd.getNumCounters(); ++i) { - amd::PerfCounter* amdCounter = - static_cast(counters[i]); - const PerfCounter* counter = - static_cast(amdCounter->getDeviceCounter()); - - // Make sure we have a valid gpu performance counter - if (nullptr == counter) { - amd::PerfCounter::Properties prop = amdCounter->properties(); - PerfCounter* gpuCounter = new PerfCounter( - gpuDevice_, - palRef, - prop[CL_PERFCOUNTER_GPU_BLOCK_INDEX], - prop[CL_PERFCOUNTER_GPU_COUNTER_INDEX], - prop[CL_PERFCOUNTER_GPU_EVENT_INDEX]); - if (nullptr == gpuCounter) { - LogError("We failed to allocate memory for the GPU perfcounter"); - vcmd.setStatus(CL_INVALID_OPERATION); - return; - } - else if (gpuCounter->create()) { - newExperiment = true; - } - else { - LogPrintfError("We failed to allocate a perfcounter in PAL.\ + } else if (gpuCounter->create()) { + newExperiment = true; + } else { + LogPrintfError( + "We failed to allocate a perfcounter in PAL.\ Block: %d, counter: #d, event: %d", - gpuCounter->info()->blockIndex_, - gpuCounter->info()->counterIndex_, - gpuCounter->info()->eventIndex_); - } - amdCounter->setDeviceCounter(gpuCounter); - } + gpuCounter->info()->blockIndex_, gpuCounter->info()->counterIndex_, + gpuCounter->info()->eventIndex_); + } + amdCounter->setDeviceCounter(gpuCounter); } - - if (newExperiment) { - palRef->finalize(); - } - - palRef->release(); - - Pal::IPerfExperiment* palPerf = nullptr; - for (uint i = 0; i < vcmd.getNumCounters(); ++i) { - amd::PerfCounter* amdCounter = - static_cast(counters[i]); - const PerfCounter* counter = - static_cast(amdCounter->getDeviceCounter()); - - if (palPerf != counter->iPerf()) { - palPerf = counter->iPerf(); - // Find the state and sends the command to PAL - if (vcmd.getState() == amd::PerfCounterCommand::Begin) { - Pal::SetClockModeInput input; - Pal::SetClockModeOutput output = {}; - input.clockMode = 
Pal::DeviceClockMode::Profiling; - dev().iDev()->SetClockMode(input, &output); - GpuEvent event; - eventBegin(MainEngine); - iCmd()->CmdBeginPerfExperiment(palPerf); - eventEnd(MainEngine, event); - setGpuEvent(event); - } - else if (vcmd.getState() == amd::PerfCounterCommand::End) { - GpuEvent event; - eventBegin(MainEngine); - iCmd()->CmdEndPerfExperiment(palPerf); - eventEnd(MainEngine, event); - setGpuEvent(event); - } - else { - LogError("Unsupported performance counter state"); - vcmd.setStatus(CL_INVALID_OPERATION); - return; - } - } - } -} - -void -VirtualGPU::submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - - profilingBegin(cmd); - - switch(cmd.type()) { - case CL_COMMAND_THREAD_TRACE_MEM: - { - amd::ThreadTrace* amdThreadTrace = &cmd.getThreadTrace(); - ThreadTrace* threadTrace = - static_cast(amdThreadTrace->getDeviceThreadTrace()); - - if (threadTrace == nullptr) { - PalThreadTraceReference* palRef = PalThreadTraceReference::Create(*this); - if (palRef == nullptr) { - LogError("Failure in memory allocation for the GPU threadtrace"); - cmd.setStatus(CL_INVALID_OPERATION); - return; - } - - size_t numSe = amdThreadTrace->deviceSeNumThreadTrace(); - - ThreadTrace* gpuThreadTrace = new ThreadTrace( - gpuDevice_, - palRef, - cmd.getMemList(), - numSe); - if (nullptr == gpuThreadTrace) { - LogError("Failure in memory allocation for the GPU threadtrace"); - cmd.setStatus(CL_INVALID_OPERATION); - return; - } - - if (gpuThreadTrace->create()) { - amdThreadTrace->setDeviceThreadTrace(gpuThreadTrace); - } - else { - LogError("Failure in memory allocation for the GPU threadtrace"); - delete gpuThreadTrace; - cmd.setStatus(CL_INVALID_OPERATION); - return; - } - - palRef->finalize(); - palRef->release(); - } - - break; - } - default: - LogError("Unsupported command type for ThreadTraceMemObjects!"); - break; - } -} - -void 
-VirtualGPU::submitThreadTrace(amd::ThreadTraceCommand& cmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - - profilingBegin(cmd); - - switch(cmd.type()) { - case CL_COMMAND_THREAD_TRACE: - { - amd::ThreadTrace* amdThreadTrace = - static_cast(&cmd.getThreadTrace()); - ThreadTrace* threadTrace = - static_cast(amdThreadTrace->getDeviceThreadTrace()); - - // gpu thread trace object had to be generated prior to begin/end/pause/resume due - // to ThreadTraceMemObjectsCommand execution - if (threadTrace == nullptr) { - return; - } - else { - Pal::IPerfExperiment* palPerf = threadTrace->iPerf(); - if (cmd.getState() == amd::ThreadTraceCommand::Begin) { - amd::ThreadTrace::ThreadTraceConfig* traceCfg = - static_cast(cmd.threadTraceConfig()); - iCmd()->CmdBeginPerfExperiment(palPerf); - } - else if (cmd.getState() == amd::ThreadTraceCommand::End) { - GpuEvent event; - eventBegin(MainEngine); - iCmd()->CmdEndPerfExperiment(palPerf); - threadTrace->populateUserMemory(); - eventEnd(MainEngine, event); - setGpuEvent(event); - } - else if (cmd.getState() == amd::ThreadTraceCommand::Pause) { - // There's no Pause from the PerfExperiment interface - } - else if (cmd.getState() == amd::ThreadTraceCommand::Resume) { - // There's no Resume from the PerfExperiment interface - } - } - break; - } - default: - LogError("Unsupported command type for ThreadTrace!"); - break; - } -} - -void -VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - - profilingBegin(vcmd); - - for (std::vector::const_iterator it = vcmd.getMemList().begin(); - it != vcmd.getMemList().end(); it++) { - // amd::Memory object should never be nullptr - assert(*it && "Memory object for interop is nullptr"); - pal::Memory* memory = dev().getGpuMemory(*it); - - // If resource is a shared copy of original resource, then - // runtime 
needs to copy data from original resource - (*it)->getInteropObj()->copyOrigToShared(); - - // Check if OpenCL has direct access to the interop memory - if (memory->interopType() == Memory::InteropDirectAccess) { - continue; - } - - // Does interop use HW emulation? - if (memory->interopType() == Memory::InteropHwEmulation) { - static const bool Entire = true; - amd::Coord3D origin(0, 0, 0); - amd::Coord3D region(memory->size()); - - // Synchronize the object - if (!blitMgr().copyBuffer(*memory->interop(), - *memory, origin, origin, region, Entire)) { - LogError("submitAcquireExtObjects - Interop synchronization failed!"); - vcmd.setStatus(CL_INVALID_OPERATION); - return; - } - } - } - - profilingEnd(vcmd); -} - -void -VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - - profilingBegin(vcmd); - - for (std::vector::const_iterator it = vcmd.getMemList().begin(); - it != vcmd.getMemList().end(); it++) { - // amd::Memory object should never be nullptr - assert(*it && "Memory object for interop is nullptr"); - pal::Memory* memory = dev().getGpuMemory(*it); - - // Check if we can use HW interop - if (memory->interopType() == Memory::InteropHwEmulation) { - static const bool Entire = true; - amd::Coord3D origin(0, 0, 0); - amd::Coord3D region(memory->size()); - - // Synchronize the object - if (!blitMgr().copyBuffer(*memory, *memory->interop(), - origin, origin, region, Entire)) { - LogError("submitReleaseExtObjects interop synchronization failed!"); - vcmd.setStatus(CL_INVALID_OPERATION); - return; - } - } - else { - if (memory->interopType() != Memory::InteropDirectAccess) { - LogError("None interop release!"); - } - } - - // If resource is a shared copy of original resource, then - // runtime needs to copy data back to original resource - (*it)->getInteropObj()->copySharedToOrig(); - } - - profilingEnd(vcmd); -} - -void 
-VirtualGPU::submitSignal(amd::SignalCommand & vcmd) -{ - amd::ScopedLock lock(execution()); - profilingBegin(vcmd); - pal::Memory* pGpuMemory = dev().getGpuMemory(&vcmd.memory()); - - GpuEvent gpuEvent; - eventBegin(MainEngine); - - uint32_t value = vcmd.markerValue(); - uint32_t size = vcmd.memory().getSize(); - - addVmMemory(pGpuMemory); - - uint32_t offset = pGpuMemory->iMem()->Desc().markerBusAddr - pGpuMemory->iMem()->Desc().surfaceBusAddr; - - if (vcmd.type() == CL_COMMAND_WAIT_SIGNAL_AMD) { - iCmd()->CmdWaitMemoryValue(*(pGpuMemory->iMem()), offset, value, 0xFFFFFFFF, Pal::CompareFunc::GreaterEqual); - } - else if (vcmd.type() == CL_COMMAND_WRITE_SIGNAL_AMD) { - iCmd()->CmdUpdateMemory(*(pGpuMemory->iMem()), size, 4, &value); - } - - eventEnd(MainEngine, gpuEvent); - pGpuMemory->setBusy(*this, gpuEvent); - // Update the global GPU event - setGpuEvent(gpuEvent); - - profilingEnd(vcmd); -} - -void -VirtualGPU::submitMakeBuffersResident(amd::MakeBuffersResidentCommand & vcmd) -{ - amd::ScopedLock lock(execution()); - profilingBegin(vcmd); - - std::vector memObjects = vcmd.memObjects(); - uint32_t numObjects = memObjects.size(); - - for (uint i = 0; i < numObjects; i++) - { - // dummy render into the SDI surfaces so that KMD will be able to provide the bus addresses - uint dummy = 0; - static_cast(dev().xferMgr()) - .writeRawData(*(dev().getGpuMemory(memObjects[i])), sizeof(dummy), &dummy); - - pal::Memory* pGpuMemory = dev().getGpuMemory(memObjects[i]); - - pGpuMemory->syncCacheFromHost(*this); - - vcmd.busAddress()[i].surface_bus_address = pGpuMemory->iMem()->Desc().surfaceBusAddr; - vcmd.busAddress()[i].marker_bus_address = pGpuMemory->iMem()->Desc().markerBusAddr; - } - profilingEnd(vcmd); -} - - -bool -VirtualGPU::awaitCompletion(CommandBatch* cb, const amd::Event* waitingEvent) -{ - bool found = false; - amd::Command* current; - amd::Command* head = cb->head_; - - // Make sure that profiling is enabled - if (state_.profileEnabled_) { - return 
profilingCollectResults(cb, waitingEvent); - } - // Mark the first command in the batch as running - if (head != nullptr) { - head->setStatus(CL_RUNNING); - } - else { - return found; - } - - // Wait for the last known GPU event - waitEventLock(cb); - - while (nullptr != head) { - current = head->getNext(); - if (head->status() == CL_SUBMITTED) { - head->setStatus(CL_RUNNING); - head->setStatus(CL_COMPLETE); - } - else if (head->status() == CL_RUNNING) { - head->setStatus(CL_COMPLETE); - } - else if ((head->status() != CL_COMPLETE) && (current != nullptr)) { - LogPrintfError("Unexpected command status - %d!", head->status()); - } - - // Check if it's a waiting command - if (head == waitingEvent) { - found = true; - } - - head->release(); - head = current; - } - - return found; -} - -void -VirtualGPU::flush(amd::Command* list, bool wait) -{ - CommandBatch* cb = nullptr; - bool gpuCommand = false; - - for (uint i = 0; i < AllEngines; ++i) { - if (cal_.events_[i].isValid()) { - gpuCommand = true; - } - } - - // If the batch doesn't have any GPU command and the list is empty - if (!gpuCommand && cbQueue_.empty()) { - state_.forceWait_ = true; - } - - // Insert the current batch into a list - if (nullptr != list) { - if (!freeCbQueue_.empty()) { - cb = freeCbQueue_.front(); - } - - if (nullptr == cb) { - cb = new CommandBatch(list, cal()->events_, cal()->lastTS_); - } - else { - freeCbQueue_.pop(); - cb->init(list, cal()->events_, cal()->lastTS_); - } - } - - { - //! @todo: Check if really need a lock - amd::ScopedLock lock(execution()); - for (uint i = 0; i < AllEngines; ++i) { - flushDMA(i); - // Reset event so we won't try to wait again, - // if runtime didn't submit any commands - //! @note: it's safe to invalidate events, since - //! 
we already saved them with the batch creation step above - cal_.events_[i].invalidate(); - } - } - - // Mark last TS as nullptr, so runtime won't process empty batches with the old TS - cal_.lastTS_ = nullptr; - if (nullptr != cb) { - cbQueue_.push(cb); - } - - wait |= state_.forceWait_; - // Loop through all outstanding command batches - while (!cbQueue_.empty()) { - auto cb = cbQueue_.front(); - // Check if command batch finished without a wait - bool finished = true; - for (uint i = 0; i < AllEngines; ++i) { - finished &= isDone(&cb->events_[i]); - } - if (finished || wait) { - // Wait for completion - awaitCompletion(cb); - // Release a command batch - freeCbQueue_.push(cb); - // Remove command batch from the list - cbQueue_.pop(); - } - else { - // Early exit if no finished - break; - } - } - state_.forceWait_ = false; -} - -void -VirtualGPU::enableSyncedBlit() const -{ - return blitMgr_->enableSynchronization(); -} - -void -VirtualGPU::releaseMemObjects(bool scratch) -{ - for (GpuEvents::const_iterator it = gpuEvents_.begin(); - it != gpuEvents_.end(); ++it) { - GpuEvent event = it->second; - waitForEvent(&event); - queues_[MainEngine]->removeCmdMemRef(const_cast(it->first)); - queues_[SdmaEngine]->removeCmdMemRef(const_cast(it->first)); - } - - gpuEvents_.clear(); -} - -void -VirtualGPU::setGpuEvent( - GpuEvent gpuEvent, - bool flush) -{ - cal_.events_[engineID_] = gpuEvent; - - // Flush current DMA buffer if requested - if (flush) { - flushDMA(engineID_); - } -} - -void -VirtualGPU::flushDMA(uint engineID) -{ - if (engineID == MainEngine) { - // Clear memory dependency state, since runtime flushes compute - // memoryDependency().clear(); - //!@todo Keep memory dependency alive even if we flush DMA, - //! since only L2 cache is flushed in KMD frame, - //! but L1 still has to be invalidated. 
- } - - isDone(&cal_.events_[engineID]); -} - -bool -VirtualGPU::waitAllEngines(CommandBatch* cb) -{ - uint i; - GpuEvent* events; //!< GPU events for the batch - - // If command batch is nullptr then wait for the current - if (nullptr == cb) { - events = cal_.events_; - } - else { - events = cb->events_; - } - - bool earlyDone = true; - // The first loop is to flush all engines and/or check if - // engines are idle already - for (i = 0; i < AllEngines; ++i) { - earlyDone &= isDone(&events[i]); - } - - // Release all transfer buffers on this command queue - releaseXferWrite(); - - // Rlease all pinned memory - releasePinnedMem(); - - // The second loop is to wait all engines - for (i = 0; i < AllEngines; ++i) { - waitForEvent(&events[i]); - } - - return earlyDone; -} - -void -VirtualGPU::waitEventLock(CommandBatch* cb) -{ - // Make sure VirtualGPU has an exclusive access to the resources - amd::ScopedLock lock(execution()); - - bool earlyDone = waitAllEngines(cb); - - // Free resource cache if we have too many entries - //! \note we do it here, when all engines are idle, - // because Vista/Win7 idles GPU on a resource destruction - static const size_t MinCacheEntries = 4096; - dev().resourceCache().free(MinCacheEntries); - - // Find the timestamp object of the last command in the batch - if (cb->lastTS_ != nullptr) { - // If earlyDone is TRUE, then CPU didn't wait for GPU. - // Thus the sync point between CPU and GPU is unclear and runtime - // will use an older adjustment value to maintain the same timeline - if (!earlyDone || - //! \note Workaround for APU(s). - //! GPU-CPU timelines may go off too much, thus always - //! 
force calibration with the last batch in the list - (cbQueue_.size() <= 1) || - (readjustTimeGPU_ == 0)) { - uint64_t startTimeStampGPU = 0; - uint64_t endTimeStampGPU = 0; - - // Get the timestamp value of the last command in the batch - cb->lastTS_->value(&startTimeStampGPU, &endTimeStampGPU); - - uint64_t endTimeStampCPU = amd::Os::timeNanos(); - // Make sure the command batch has a valid GPU TS - if (!GPU_RAW_TIMESTAMP) { - // Adjust the base time by the execution time - readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU; - } - } - } -} - -bool -VirtualGPU::allocConstantBuffers() -{ - // Allocate/reallocate constant buffers - size_t minCbSize; - // GCN doesn't really have a limit - minCbSize = 256 * Ki; - uint i; - - // Create/reallocate constant buffer resources - for (i = 0; i < MaxConstBuffersArguments; ++i) { - ConstBuffer* constBuf = new ConstBuffer(*this, ((minCbSize + - ConstBuffer::VectorSize - 1) / ConstBuffer::VectorSize)); - - if ((constBuf != nullptr) && constBuf->create()) { - addConstBuffer(constBuf); - } - else { - // We failed to create a constant buffer - delete constBuf; - return false; - } - } - - return true; -} - -void -VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) -{ - // Is profiling enabled? 
- if (command.profilingInfo().enabled_) { - // Allocate a timestamp object from the cache - TimeStamp* ts = tsCache_->allocTimeStamp(); - if (nullptr == ts) { - return; - } - // Save the TimeStamp object in the current OCL event - command.setData(ts); - currTs_ = ts; - state_.profileEnabled_ = true; - } -} - -void -VirtualGPU::profilingEnd(amd::Command& command) -{ - // Get the TimeStamp object associated witht the current command - TimeStamp* ts = reinterpret_cast(command.data()); - if (ts != nullptr) { - // Check if the command actually did any GPU submission - if (ts->isValid()) { - cal_.lastTS_ = ts; - } - else { - // Destroy the TimeStamp object - tsCache_->freeTimeStamp(ts); - command.setData(nullptr); - } - } -} - -bool -VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* waitingEvent) -{ - bool found = false; - amd::Command* current; - amd::Command* first = cb->head_; - - // If the command list is, empty then exit - if (nullptr == first) { - return found; - } - - // Wait for the last known GPU events on all engines - waitEventLock(cb); - - // Find the CPU base time of the entire command batch execution - uint64_t endTimeStamp = amd::Os::timeNanos(); - uint64_t startTimeStamp = endTimeStamp; - - // First step, walk the command list to find the first valid command - //! \note The batch may have empty markers at the beginning. - //! So the start/end of the empty commands is equal to - //! the start of the first valid command in the batch. 
- first = cb->head_; - while (nullptr != first) { - // Get the TimeStamp object associated witht the current command - TimeStamp* ts = reinterpret_cast(first->data()); - - if (ts != nullptr) { - ts->value(&startTimeStamp, &endTimeStamp); - endTimeStamp -= readjustTimeGPU_; - startTimeStamp -= readjustTimeGPU_; - // Assign to endTimeStamp the start of the first valid command - endTimeStamp = startTimeStamp; - break; - } - first = first->getNext(); - } - - // Second step, walk the command list to construct the time line - first = cb->head_; - while (nullptr != first) { - // Get the TimeStamp object associated witht the current command - TimeStamp* ts = reinterpret_cast(first->data()); - - current = first->getNext(); - - if (ts != nullptr) { - ts->value(&startTimeStamp, &endTimeStamp); - endTimeStamp -= readjustTimeGPU_; - startTimeStamp -= readjustTimeGPU_; - // Destroy the TimeStamp object - tsCache_->freeTimeStamp(ts); - first->setData(nullptr); - } - else { - // For empty commands start/end is equal to - // the end of the last valid command - startTimeStamp = endTimeStamp; - } - - // Update the command status with the proper timestamps - if (first->status() == CL_SUBMITTED) { - first->setStatus(CL_RUNNING, startTimeStamp); - first->setStatus(CL_COMPLETE, endTimeStamp); - } - else if (first->status() == CL_RUNNING) { - first->setStatus(CL_COMPLETE, endTimeStamp); - } - else if ((first->status() != CL_COMPLETE) && (current != nullptr)) { - LogPrintfError("Unexpected command status - %d!", first->status()); - } - - // Do we wait this event? 
- if (first == waitingEvent) { - found = true; - } - - first->release(); - first = current; - } - - return found; -} - -void -VirtualGPU::addVmMemory(const Memory* memory) -{ - queues_[MainEngine]->addCmdMemRef(memory->iMem()); -} -void -VirtualGPU::AddKernel(const amd::Kernel& kernel) const -{ - queues_[MainEngine]->last_kernel_ = &kernel; -} - -void -VirtualGPU::addDoppRef(const Memory* memory, bool lastDoppCmd, bool pfpaDoppCmd) -{ - queues_[MainEngine]->addCmdDoppRef(memory->iMem(), lastDoppCmd, pfpaDoppCmd); -} - -void -VirtualGPU::profileEvent(EngineType engine, bool type) const -{ - if (nullptr == currTs_) { + } + + if (newExperiment) { + palRef->finalize(); + } + + palRef->release(); + + Pal::IPerfExperiment* palPerf = nullptr; + for (uint i = 0; i < vcmd.getNumCounters(); ++i) { + amd::PerfCounter* amdCounter = static_cast(counters[i]); + const PerfCounter* counter = static_cast(amdCounter->getDeviceCounter()); + + if (palPerf != counter->iPerf()) { + palPerf = counter->iPerf(); + // Find the state and sends the command to PAL + if (vcmd.getState() == amd::PerfCounterCommand::Begin) { + Pal::SetClockModeInput input; + Pal::SetClockModeOutput output = {}; + input.clockMode = Pal::DeviceClockMode::Profiling; + dev().iDev()->SetClockMode(input, &output); + GpuEvent event; + eventBegin(MainEngine); + iCmd()->CmdBeginPerfExperiment(palPerf); + eventEnd(MainEngine, event); + setGpuEvent(event); + } else if (vcmd.getState() == amd::PerfCounterCommand::End) { + GpuEvent event; + eventBegin(MainEngine); + iCmd()->CmdEndPerfExperiment(palPerf); + eventEnd(MainEngine, event); + setGpuEvent(event); + } else { + LogError("Unsupported performance counter state"); + vcmd.setStatus(CL_INVALID_OPERATION); return; + } } - if (type) { - currTs_->begin((engine == SdmaEngine) ? true : false); - } - else { - currTs_->end((engine == SdmaEngine) ? 
true : false); - } + } } -bool -VirtualGPU::processMemObjectsHSA( - const amd::Kernel& kernel, - const_address params, - bool nativeMem, - std::vector* memList) -{ - static const bool NoAlias = true; - const HSAILKernel& hsaKernel = static_cast - (*(kernel.getDeviceKernel(dev(), NoAlias))); - const amd::KernelSignature& signature = kernel.signature(); - const amd::KernelParameters& kernelParams = kernel.parameters(); +void VirtualGPU::submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); - // Mark the tracker with a new kernel, - // so we can avoid checks of the aliased objects - memoryDependency().newKernel(); + profilingBegin(cmd); - bool deviceSupportFGS = 0 != dev().isFineGrainedSystem(true); - bool supportFineGrainedSystem = deviceSupportFGS; - FGSStatus status = kernelParams.getSvmSystemPointersSupport(); - switch (status) { - case FGS_YES: - if (!deviceSupportFGS) { - return false; - } - supportFineGrainedSystem = true; - break; - case FGS_NO: - supportFineGrainedSystem = false; - break; - case FGS_DEFAULT: - default: - break; - } + switch (cmd.type()) { + case CL_COMMAND_THREAD_TRACE_MEM: { + amd::ThreadTrace* amdThreadTrace = &cmd.getThreadTrace(); + ThreadTrace* threadTrace = static_cast(amdThreadTrace->getDeviceThreadTrace()); - size_t count = kernelParams.getNumberOfSvmPtr(); - size_t execInfoOffset = kernelParams.getExecInfoOffset(); - bool sync = true; - - amd::Memory* memory = nullptr; - //get svm non arugment information - void* const* svmPtrArray = - reinterpret_cast(params + execInfoOffset); - for (size_t i = 0; i < count; i++) { - memory = amd::SvmManager::FindSvmBuffer(svmPtrArray[i]); - if (nullptr == memory) { - if (!supportFineGrainedSystem) { - return false; - } - else if (sync) { - flushCUCaches(); - // Clear memory dependency state - const static bool All = true; - memoryDependency().clear(!All); - continue; - } + if 
(threadTrace == nullptr) { + PalThreadTraceReference* palRef = PalThreadTraceReference::Create(*this); + if (palRef == nullptr) { + LogError("Failure in memory allocation for the GPU threadtrace"); + cmd.setStatus(CL_INVALID_OPERATION); + return; } - else { - Memory* gpuMemory = dev().getGpuMemory(memory); - if (nullptr != gpuMemory) { - // Synchronize data with other memory instances if necessary - gpuMemory->syncCacheFromHost(*this); - const static bool IsReadOnly = false; - // Validate SVM passed in the non argument list - memoryDependency().validate(*this, gpuMemory, IsReadOnly); + size_t numSe = amdThreadTrace->deviceSeNumThreadTrace(); - // Mark signal write for cache coherency, - // since this object isn't a part of kernel arg setup - if ((memory->getMemFlags() & CL_MEM_READ_ONLY) == 0) { - memory->signalWrite(&dev()); - } - - memList->push_back(gpuMemory); - } - else { - return false; - } + ThreadTrace* gpuThreadTrace = new ThreadTrace(gpuDevice_, palRef, cmd.getMemList(), numSe); + if (nullptr == gpuThreadTrace) { + LogError("Failure in memory allocation for the GPU threadtrace"); + cmd.setStatus(CL_INVALID_OPERATION); + return; } - } - // Check all parameters for the current kernel - for (size_t i = 0; i < signature.numParameters(); ++i) { - const amd::KernelParameterDescriptor& desc = signature.at(i); - const HSAILKernel::Argument* arg = hsaKernel.argumentAt(i); - Memory* memory = nullptr; - bool readOnly = false; - amd::Memory* svmMem = nullptr; - - // Find if current argument is a buffer - if ((desc.type_ == T_POINTER) && (arg->addrQual_ != HSAIL_ADDRESS_LOCAL)) { - if (kernelParams.boundToSvmPointer(dev(), params, i)) { - svmMem = amd::SvmManager::FindSvmBuffer( - *reinterpret_cast(params + desc.offset_)); - if (!svmMem) { - flushCUCaches(); - // Clear memory dependency state - const static bool All = true; - memoryDependency().clear(!All); - continue; - } - } - - if (nativeMem) { - memory = *reinterpret_cast(params + desc.offset_); - } - else if 
(*reinterpret_cast - (params + desc.offset_) != nullptr) { - if (nullptr == svmMem) { - memory = dev().getGpuMemory(*reinterpret_cast - (params + desc.offset_)); - } - else { - memory = dev().getGpuMemory(svmMem); - } - // Synchronize data with other memory instances if necessary - memory->syncCacheFromHost(*this); - } - - if (memory != nullptr) { - // Check image - readOnly = (desc.accessQualifier_ == - CL_KERNEL_ARG_ACCESS_READ_ONLY) ? true : false; - // Check buffer - readOnly |= (arg->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false; - // Validate memory for a dependency in the queue - memoryDependency().validate(*this, memory, readOnly); - } + if (gpuThreadTrace->create()) { + amdThreadTrace->setDeviceThreadTrace(gpuThreadTrace); + } else { + LogError("Failure in memory allocation for the GPU threadtrace"); + delete gpuThreadTrace; + cmd.setStatus(CL_INVALID_OPERATION); + return; } - } - for (pal::Memory* mem : hsaKernel.prog().globalStores()) { - const static bool IsReadOnly = false; - // Validate global store for a dependency in the queue - memoryDependency().validate(*this, mem, IsReadOnly); - } + palRef->finalize(); + palRef->release(); + } - return true; + break; + } + default: + LogError("Unsupported command type for ThreadTraceMemObjects!"); + break; + } } -amd::Memory* -VirtualGPU::createBufferFromImage(amd::Memory& amdImage) const -{ - amd::Memory* mem = new(amdImage.getContext()) - amd::Buffer(amdImage, 0, 0, amdImage.getSize()); +void VirtualGPU::submitThreadTrace(amd::ThreadTraceCommand& cmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); - if ((mem != nullptr) && !mem->create()) { - mem->release(); + profilingBegin(cmd); + + switch (cmd.type()) { + case CL_COMMAND_THREAD_TRACE: { + amd::ThreadTrace* amdThreadTrace = static_cast(&cmd.getThreadTrace()); + ThreadTrace* threadTrace = static_cast(amdThreadTrace->getDeviceThreadTrace()); + + // gpu thread trace object had to be generated prior to 
begin/end/pause/resume due + // to ThreadTraceMemObjectsCommand execution + if (threadTrace == nullptr) { + return; + } else { + Pal::IPerfExperiment* palPerf = threadTrace->iPerf(); + if (cmd.getState() == amd::ThreadTraceCommand::Begin) { + amd::ThreadTrace::ThreadTraceConfig* traceCfg = + static_cast(cmd.threadTraceConfig()); + iCmd()->CmdBeginPerfExperiment(palPerf); + } else if (cmd.getState() == amd::ThreadTraceCommand::End) { + GpuEvent event; + eventBegin(MainEngine); + iCmd()->CmdEndPerfExperiment(palPerf); + threadTrace->populateUserMemory(); + eventEnd(MainEngine, event); + setGpuEvent(event); + } else if (cmd.getState() == amd::ThreadTraceCommand::Pause) { + // There's no Pause from the PerfExperiment interface + } else if (cmd.getState() == amd::ThreadTraceCommand::Resume) { + // There's no Resume from the PerfExperiment interface + } + } + break; + } + default: + LogError("Unsupported command type for ThreadTrace!"); + break; + } +} + +void VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + profilingBegin(vcmd); + + for (std::vector::const_iterator it = vcmd.getMemList().begin(); + it != vcmd.getMemList().end(); it++) { + // amd::Memory object should never be nullptr + assert(*it && "Memory object for interop is nullptr"); + pal::Memory* memory = dev().getGpuMemory(*it); + + // If resource is a shared copy of original resource, then + // runtime needs to copy data from original resource + (*it)->getInteropObj()->copyOrigToShared(); + + // Check if OpenCL has direct access to the interop memory + if (memory->interopType() == Memory::InteropDirectAccess) { + continue; } - return mem; + // Does interop use HW emulation? 
+ if (memory->interopType() == Memory::InteropHwEmulation) { + static const bool Entire = true; + amd::Coord3D origin(0, 0, 0); + amd::Coord3D region(memory->size()); + + // Synchronize the object + if (!blitMgr().copyBuffer(*memory->interop(), *memory, origin, origin, region, Entire)) { + LogError("submitAcquireExtObjects - Interop synchronization failed!"); + vcmd.setStatus(CL_INVALID_OPERATION); + return; + } + } + } + + profilingEnd(vcmd); } -void -VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable) -{ - const static bool Wait = true; - vqHeader_->kernel_table = kernelTable; - virtualQueue_->writeRawData(hostQ, 0, sizeof(AmdVQueueHeader), vqHeader_, Wait); +void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + profilingBegin(vcmd); + + for (std::vector::const_iterator it = vcmd.getMemList().begin(); + it != vcmd.getMemList().end(); it++) { + // amd::Memory object should never be nullptr + assert(*it && "Memory object for interop is nullptr"); + pal::Memory* memory = dev().getGpuMemory(*it); + + // Check if we can use HW interop + if (memory->interopType() == Memory::InteropHwEmulation) { + static const bool Entire = true; + amd::Coord3D origin(0, 0, 0); + amd::Coord3D region(memory->size()); + + // Synchronize the object + if (!blitMgr().copyBuffer(*memory, *memory->interop(), origin, origin, region, Entire)) { + LogError("submitReleaseExtObjects interop synchronization failed!"); + vcmd.setStatus(CL_INVALID_OPERATION); + return; + } + } else { + if (memory->interopType() != Memory::InteropDirectAccess) { + LogError("None interop release!"); + } + } + + // If resource is a shared copy of original resource, then + // runtime needs to copy data back to original resource + (*it)->getInteropObj()->copySharedToOrig(); + } + + profilingEnd(vcmd); } -void -VirtualGPU::flushCuCaches(HwDbgGpuCacheMask cache_mask) -{ - 
Unimplemented(); -/* - //! @todo: fix issue of no event available for the flush/invalidate cache command - InvalidateSqCaches(cache_mask.sqICache_, - cache_mask.sqKCache_, - cache_mask.tcL1_, - cache_mask.tcL2_); -*/ +void VirtualGPU::submitSignal(amd::SignalCommand& vcmd) { + amd::ScopedLock lock(execution()); + profilingBegin(vcmd); + pal::Memory* pGpuMemory = dev().getGpuMemory(&vcmd.memory()); + + GpuEvent gpuEvent; + eventBegin(MainEngine); + + uint32_t value = vcmd.markerValue(); + uint32_t size = vcmd.memory().getSize(); + + addVmMemory(pGpuMemory); + + uint32_t offset = + pGpuMemory->iMem()->Desc().markerBusAddr - pGpuMemory->iMem()->Desc().surfaceBusAddr; + + if (vcmd.type() == CL_COMMAND_WAIT_SIGNAL_AMD) { + iCmd()->CmdWaitMemoryValue(*(pGpuMemory->iMem()), offset, value, 0xFFFFFFFF, + Pal::CompareFunc::GreaterEqual); + } else if (vcmd.type() == CL_COMMAND_WRITE_SIGNAL_AMD) { + iCmd()->CmdUpdateMemory(*(pGpuMemory->iMem()), size, 4, &value); + } + + eventEnd(MainEngine, gpuEvent); + pGpuMemory->setBusy(*this, gpuEvent); + // Update the global GPU event + setGpuEvent(gpuEvent); + + profilingEnd(vcmd); +} + +void VirtualGPU::submitMakeBuffersResident(amd::MakeBuffersResidentCommand& vcmd) { + amd::ScopedLock lock(execution()); + profilingBegin(vcmd); + + std::vector memObjects = vcmd.memObjects(); + uint32_t numObjects = memObjects.size(); + + for (uint i = 0; i < numObjects; i++) { + // dummy render into the SDI surfaces so that KMD will be able to provide the bus addresses + uint dummy = 0; + static_cast(dev().xferMgr()) + .writeRawData(*(dev().getGpuMemory(memObjects[i])), sizeof(dummy), &dummy); + + pal::Memory* pGpuMemory = dev().getGpuMemory(memObjects[i]); + + pGpuMemory->syncCacheFromHost(*this); + + vcmd.busAddress()[i].surface_bus_address = pGpuMemory->iMem()->Desc().surfaceBusAddr; + vcmd.busAddress()[i].marker_bus_address = pGpuMemory->iMem()->Desc().markerBusAddr; + } + profilingEnd(vcmd); +} + + +bool VirtualGPU::awaitCompletion(CommandBatch* 
cb, const amd::Event* waitingEvent) { + bool found = false; + amd::Command* current; + amd::Command* head = cb->head_; + + // Make sure that profiling is enabled + if (state_.profileEnabled_) { + return profilingCollectResults(cb, waitingEvent); + } + // Mark the first command in the batch as running + if (head != nullptr) { + head->setStatus(CL_RUNNING); + } else { + return found; + } + + // Wait for the last known GPU event + waitEventLock(cb); + + while (nullptr != head) { + current = head->getNext(); + if (head->status() == CL_SUBMITTED) { + head->setStatus(CL_RUNNING); + head->setStatus(CL_COMPLETE); + } else if (head->status() == CL_RUNNING) { + head->setStatus(CL_COMPLETE); + } else if ((head->status() != CL_COMPLETE) && (current != nullptr)) { + LogPrintfError("Unexpected command status - %d!", head->status()); + } + + // Check if it's a waiting command + if (head == waitingEvent) { + found = true; + } + + head->release(); + head = current; + } + + return found; +} + +void VirtualGPU::flush(amd::Command* list, bool wait) { + CommandBatch* cb = nullptr; + bool gpuCommand = false; + + for (uint i = 0; i < AllEngines; ++i) { + if (cal_.events_[i].isValid()) { + gpuCommand = true; + } + } + + // If the batch doesn't have any GPU command and the list is empty + if (!gpuCommand && cbQueue_.empty()) { + state_.forceWait_ = true; + } + + // Insert the current batch into a list + if (nullptr != list) { + if (!freeCbQueue_.empty()) { + cb = freeCbQueue_.front(); + } + + if (nullptr == cb) { + cb = new CommandBatch(list, cal()->events_, cal()->lastTS_); + } else { + freeCbQueue_.pop(); + cb->init(list, cal()->events_, cal()->lastTS_); + } + } + + { + //! @todo: Check if really need a lock + amd::ScopedLock lock(execution()); + for (uint i = 0; i < AllEngines; ++i) { + flushDMA(i); + // Reset event so we won't try to wait again, + // if runtime didn't submit any commands + //! @note: it's safe to invalidate events, since + //! 
we already saved them with the batch creation step above + cal_.events_[i].invalidate(); + } + } + + // Mark last TS as nullptr, so runtime won't process empty batches with the old TS + cal_.lastTS_ = nullptr; + if (nullptr != cb) { + cbQueue_.push(cb); + } + + wait |= state_.forceWait_; + // Loop through all outstanding command batches + while (!cbQueue_.empty()) { + auto cb = cbQueue_.front(); + // Check if command batch finished without a wait + bool finished = true; + for (uint i = 0; i < AllEngines; ++i) { + finished &= isDone(&cb->events_[i]); + } + if (finished || wait) { + // Wait for completion + awaitCompletion(cb); + // Release a command batch + freeCbQueue_.push(cb); + // Remove command batch from the list + cbQueue_.pop(); + } else { + // Early exit if no finished + break; + } + } + state_.forceWait_ = false; +} + +void VirtualGPU::enableSyncedBlit() const { return blitMgr_->enableSynchronization(); } + +void VirtualGPU::releaseMemObjects(bool scratch) { + for (GpuEvents::const_iterator it = gpuEvents_.begin(); it != gpuEvents_.end(); ++it) { + GpuEvent event = it->second; + waitForEvent(&event); + queues_[MainEngine]->removeCmdMemRef(const_cast(it->first)); + queues_[SdmaEngine]->removeCmdMemRef(const_cast(it->first)); + } + + gpuEvents_.clear(); +} + +void VirtualGPU::setGpuEvent(GpuEvent gpuEvent, bool flush) { + cal_.events_[engineID_] = gpuEvent; + + // Flush current DMA buffer if requested + if (flush) { flushDMA(engineID_); + } +} +void VirtualGPU::flushDMA(uint engineID) { + if (engineID == MainEngine) { + // Clear memory dependency state, since runtime flushes compute + // memoryDependency().clear(); + //!@todo Keep memory dependency alive even if we flush DMA, + //! since only L2 cache is flushed in KMD frame, + //! but L1 still has to be invalidated. 
+ } + + isDone(&cal_.events_[engineID]); +} + +bool VirtualGPU::waitAllEngines(CommandBatch* cb) { + uint i; + GpuEvent* events; //!< GPU events for the batch + + // If command batch is nullptr then wait for the current + if (nullptr == cb) { + events = cal_.events_; + } else { + events = cb->events_; + } + + bool earlyDone = true; + // The first loop is to flush all engines and/or check if + // engines are idle already + for (i = 0; i < AllEngines; ++i) { + earlyDone &= isDone(&events[i]); + } + + // Release all transfer buffers on this command queue + releaseXferWrite(); + + // Rlease all pinned memory + releasePinnedMem(); + + // The second loop is to wait all engines + for (i = 0; i < AllEngines; ++i) { + waitForEvent(&events[i]); + } + + return earlyDone; +} + +void VirtualGPU::waitEventLock(CommandBatch* cb) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + bool earlyDone = waitAllEngines(cb); + + // Free resource cache if we have too many entries + //! \note we do it here, when all engines are idle, + // because Vista/Win7 idles GPU on a resource destruction + static const size_t MinCacheEntries = 4096; + dev().resourceCache().free(MinCacheEntries); + + // Find the timestamp object of the last command in the batch + if (cb->lastTS_ != nullptr) { + // If earlyDone is TRUE, then CPU didn't wait for GPU. + // Thus the sync point between CPU and GPU is unclear and runtime + // will use an older adjustment value to maintain the same timeline + if (!earlyDone || + //! \note Workaround for APU(s). + //! GPU-CPU timelines may go off too much, thus always + //! 
force calibration with the last batch in the list + (cbQueue_.size() <= 1) || (readjustTimeGPU_ == 0)) { + uint64_t startTimeStampGPU = 0; + uint64_t endTimeStampGPU = 0; + + // Get the timestamp value of the last command in the batch + cb->lastTS_->value(&startTimeStampGPU, &endTimeStampGPU); + + uint64_t endTimeStampCPU = amd::Os::timeNanos(); + // Make sure the command batch has a valid GPU TS + if (!GPU_RAW_TIMESTAMP) { + // Adjust the base time by the execution time + readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU; + } + } + } +} + +bool VirtualGPU::allocConstantBuffers() { + // Allocate/reallocate constant buffers + size_t minCbSize; + // GCN doesn't really have a limit + minCbSize = 256 * Ki; + uint i; + + // Create/reallocate constant buffer resources + for (i = 0; i < MaxConstBuffersArguments; ++i) { + ConstBuffer* constBuf = new ConstBuffer( + *this, ((minCbSize + ConstBuffer::VectorSize - 1) / ConstBuffer::VectorSize)); + + if ((constBuf != nullptr) && constBuf->create()) { + addConstBuffer(constBuf); + } else { + // We failed to create a constant buffer + delete constBuf; + return false; + } + } + + return true; +} + +void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) { + // Is profiling enabled? 
+ if (command.profilingInfo().enabled_) { + // Allocate a timestamp object from the cache + TimeStamp* ts = tsCache_->allocTimeStamp(); + if (nullptr == ts) { + return; + } + // Save the TimeStamp object in the current OCL event + command.setData(ts); + currTs_ = ts; + state_.profileEnabled_ = true; + } +} + +void VirtualGPU::profilingEnd(amd::Command& command) { + // Get the TimeStamp object associated witht the current command + TimeStamp* ts = reinterpret_cast(command.data()); + if (ts != nullptr) { + // Check if the command actually did any GPU submission + if (ts->isValid()) { + cal_.lastTS_ = ts; + } else { + // Destroy the TimeStamp object + tsCache_->freeTimeStamp(ts); + command.setData(nullptr); + } + } +} + +bool VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* waitingEvent) { + bool found = false; + amd::Command* current; + amd::Command* first = cb->head_; + + // If the command list is, empty then exit + if (nullptr == first) { + return found; + } + + // Wait for the last known GPU events on all engines + waitEventLock(cb); + + // Find the CPU base time of the entire command batch execution + uint64_t endTimeStamp = amd::Os::timeNanos(); + uint64_t startTimeStamp = endTimeStamp; + + // First step, walk the command list to find the first valid command + //! \note The batch may have empty markers at the beginning. + //! So the start/end of the empty commands is equal to + //! the start of the first valid command in the batch. 
+ first = cb->head_; + while (nullptr != first) { + // Get the TimeStamp object associated witht the current command + TimeStamp* ts = reinterpret_cast(first->data()); + + if (ts != nullptr) { + ts->value(&startTimeStamp, &endTimeStamp); + endTimeStamp -= readjustTimeGPU_; + startTimeStamp -= readjustTimeGPU_; + // Assign to endTimeStamp the start of the first valid command + endTimeStamp = startTimeStamp; + break; + } + first = first->getNext(); + } + + // Second step, walk the command list to construct the time line + first = cb->head_; + while (nullptr != first) { + // Get the TimeStamp object associated witht the current command + TimeStamp* ts = reinterpret_cast(first->data()); + + current = first->getNext(); + + if (ts != nullptr) { + ts->value(&startTimeStamp, &endTimeStamp); + endTimeStamp -= readjustTimeGPU_; + startTimeStamp -= readjustTimeGPU_; + // Destroy the TimeStamp object + tsCache_->freeTimeStamp(ts); + first->setData(nullptr); + } else { + // For empty commands start/end is equal to + // the end of the last valid command + startTimeStamp = endTimeStamp; + } + + // Update the command status with the proper timestamps + if (first->status() == CL_SUBMITTED) { + first->setStatus(CL_RUNNING, startTimeStamp); + first->setStatus(CL_COMPLETE, endTimeStamp); + } else if (first->status() == CL_RUNNING) { + first->setStatus(CL_COMPLETE, endTimeStamp); + } else if ((first->status() != CL_COMPLETE) && (current != nullptr)) { + LogPrintfError("Unexpected command status - %d!", first->status()); + } + + // Do we wait this event? 
+ if (first == waitingEvent) { + found = true; + } + + first->release(); + first = current; + } + + return found; +} + +void VirtualGPU::addVmMemory(const Memory* memory) { + queues_[MainEngine]->addCmdMemRef(memory->iMem()); +} +void VirtualGPU::AddKernel(const amd::Kernel& kernel) const { + queues_[MainEngine]->last_kernel_ = &kernel; +} + +void VirtualGPU::addDoppRef(const Memory* memory, bool lastDoppCmd, bool pfpaDoppCmd) { + queues_[MainEngine]->addCmdDoppRef(memory->iMem(), lastDoppCmd, pfpaDoppCmd); +} + +void VirtualGPU::profileEvent(EngineType engine, bool type) const { + if (nullptr == currTs_) { return; + } + if (type) { + currTs_->begin((engine == SdmaEngine) ? true : false); + } else { + currTs_->end((engine == SdmaEngine) ? true : false); + } } -void -VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel, - hsa_kernel_dispatch_packet_t* aqlPkt, - HwDbgKernelInfo& kernelInfo, - amd::Event* enqueueEvent) -{ - amd::HwDebugManager * dbgManager = dev().hwDebugMgr(); - assert (dbgManager && "No HW Debug Manager!"); +bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address params, + bool nativeMem, std::vector* memList) { + static const bool NoAlias = true; + const HSAILKernel& hsaKernel = + static_cast(*(kernel.getDeviceKernel(dev(), NoAlias))); + const amd::KernelSignature& signature = kernel.signature(); + const amd::KernelParameters& kernelParams = kernel.parameters(); - // Initialize structure with default values + // Mark the tracker with a new kernel, + // so we can avoid checks of the aliased objects + memoryDependency().newKernel(); - if (hsaKernel.prog().maxScratchRegs() > 0) { - pal::Memory* scratchBuf = dev().scratch(hwRing())->memObj_; - kernelInfo.scratchBufAddr = scratchBuf->vmAddress(); - kernelInfo.scratchBufferSizeInBytes = scratchBuf->size(); + bool deviceSupportFGS = 0 != dev().isFineGrainedSystem(true); + bool supportFineGrainedSystem = deviceSupportFGS; + FGSStatus status = 
kernelParams.getSvmSystemPointersSupport(); + switch (status) { + case FGS_YES: + if (!deviceSupportFGS) { + return false; + } + supportFineGrainedSystem = true; + break; + case FGS_NO: + supportFineGrainedSystem = false; + break; + case FGS_DEFAULT: + default: + break; + } - // Get the address of the scratch buffer and its size for CPU access - address scratchRingAddr = nullptr; - scratchRingAddr = static_cast
(scratchBuf->map(nullptr, 0)); - dbgManager->setScratchRing(scratchRingAddr,scratchBuf->size()); - scratchBuf->unmap(nullptr); + size_t count = kernelParams.getNumberOfSvmPtr(); + size_t execInfoOffset = kernelParams.getExecInfoOffset(); + bool sync = true; + + amd::Memory* memory = nullptr; + // get svm non arugment information + void* const* svmPtrArray = reinterpret_cast(params + execInfoOffset); + for (size_t i = 0; i < count; i++) { + memory = amd::SvmManager::FindSvmBuffer(svmPtrArray[i]); + if (nullptr == memory) { + if (!supportFineGrainedSystem) { + return false; + } else if (sync) { + flushCUCaches(); + // Clear memory dependency state + const static bool All = true; + memoryDependency().clear(!All); + continue; + } + } else { + Memory* gpuMemory = dev().getGpuMemory(memory); + if (nullptr != gpuMemory) { + // Synchronize data with other memory instances if necessary + gpuMemory->syncCacheFromHost(*this); + + const static bool IsReadOnly = false; + // Validate SVM passed in the non argument list + memoryDependency().validate(*this, gpuMemory, IsReadOnly); + + // Mark signal write for cache coherency, + // since this object isn't a part of kernel arg setup + if ((memory->getMemFlags() & CL_MEM_READ_ONLY) == 0) { + memory->signalWrite(&dev()); + } + + memList->push_back(gpuMemory); + } else { + return false; + } } - else { - kernelInfo.scratchBufAddr = 0; - kernelInfo.scratchBufferSizeInBytes = 0; - dbgManager->setScratchRing(nullptr, 0); + } + + // Check all parameters for the current kernel + for (size_t i = 0; i < signature.numParameters(); ++i) { + const amd::KernelParameterDescriptor& desc = signature.at(i); + const HSAILKernel::Argument* arg = hsaKernel.argumentAt(i); + Memory* memory = nullptr; + bool readOnly = false; + amd::Memory* svmMem = nullptr; + + // Find if current argument is a buffer + if ((desc.type_ == T_POINTER) && (arg->addrQual_ != HSAIL_ADDRESS_LOCAL)) { + if (kernelParams.boundToSvmPointer(dev(), params, i)) { + svmMem = + 
amd::SvmManager::FindSvmBuffer(*reinterpret_cast(params + desc.offset_)); + if (!svmMem) { + flushCUCaches(); + // Clear memory dependency state + const static bool All = true; + memoryDependency().clear(!All); + continue; + } + } + + if (nativeMem) { + memory = *reinterpret_cast(params + desc.offset_); + } else if (*reinterpret_cast(params + desc.offset_) != nullptr) { + if (nullptr == svmMem) { + memory = + dev().getGpuMemory(*reinterpret_cast(params + desc.offset_)); + } else { + memory = dev().getGpuMemory(svmMem); + } + // Synchronize data with other memory instances if necessary + memory->syncCacheFromHost(*this); + } + + if (memory != nullptr) { + // Check image + readOnly = (desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_ONLY) ? true : false; + // Check buffer + readOnly |= (arg->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false; + // Validate memory for a dependency in the queue + memoryDependency().validate(*this, memory, readOnly); + } + } + } + + for (pal::Memory* mem : hsaKernel.prog().globalStores()) { + const static bool IsReadOnly = false; + // Validate global store for a dependency in the queue + memoryDependency().validate(*this, mem, IsReadOnly); + } + + return true; +} + +amd::Memory* VirtualGPU::createBufferFromImage(amd::Memory& amdImage) const { + amd::Memory* mem = new (amdImage.getContext()) amd::Buffer(amdImage, 0, 0, amdImage.getSize()); + + if ((mem != nullptr) && !mem->create()) { + mem->release(); + } + + return mem; +} + +void VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable) { + const static bool Wait = true; + vqHeader_->kernel_table = kernelTable; + virtualQueue_->writeRawData(hostQ, 0, sizeof(AmdVQueueHeader), vqHeader_, Wait); +} + +void VirtualGPU::flushCuCaches(HwDbgGpuCacheMask cache_mask) { + Unimplemented(); + /* + //! 
@todo: fix issue of no event available for the flush/invalidate cache command + InvalidateSqCaches(cache_mask.sqICache_, + cache_mask.sqKCache_, + cache_mask.tcL1_, + cache_mask.tcL2_); + */ + flushDMA(engineID_); + + return; +} + +void VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel, hsa_kernel_dispatch_packet_t* aqlPkt, + HwDbgKernelInfo& kernelInfo, amd::Event* enqueueEvent) { + amd::HwDebugManager* dbgManager = dev().hwDebugMgr(); + assert(dbgManager && "No HW Debug Manager!"); + + // Initialize structure with default values + + if (hsaKernel.prog().maxScratchRegs() > 0) { + pal::Memory* scratchBuf = dev().scratch(hwRing())->memObj_; + kernelInfo.scratchBufAddr = scratchBuf->vmAddress(); + kernelInfo.scratchBufferSizeInBytes = scratchBuf->size(); + + // Get the address of the scratch buffer and its size for CPU access + address scratchRingAddr = nullptr; + scratchRingAddr = static_cast
(scratchBuf->map(nullptr, 0)); + dbgManager->setScratchRing(scratchRingAddr, scratchBuf->size()); + scratchBuf->unmap(nullptr); + } else { + kernelInfo.scratchBufAddr = 0; + kernelInfo.scratchBufferSizeInBytes = 0; + dbgManager->setScratchRing(nullptr, 0); + } + + //! @todo: need to verify what is wanted for the global memory + Unimplemented(); + kernelInfo.heapBufAddr = 0; + + kernelInfo.pAqlDispatchPacket = aqlPkt; + kernelInfo.pAqlQueuePtr = reinterpret_cast(hsaQueueMem_->vmAddress()); + + // Get the address of the kernel code and its size for CPU access + /* pal::Memory* aqlCode = hsaKernel.gpuAqlCode(); + if (nullptr != aqlCode) { + address aqlCodeAddr = static_cast
(aqlCode->map(nullptr, 0)); + dbgManager->setKernelCodeInfo(aqlCodeAddr, hsaKernel.aqlCodeSize()); + aqlCode->unmap(nullptr); + } + else { + dbgManager->setKernelCodeInfo(nullptr, 0); + } + */ + kernelInfo.trapPresent = false; + kernelInfo.trapHandler = nullptr; + kernelInfo.trapHandlerBuffer = nullptr; + + kernelInfo.excpEn = 0; + kernelInfo.cacheDisableMask = 0; + kernelInfo.sqDebugMode = 0; + + kernelInfo.mgmtSe0Mask = 0xFFFFFFFF; + kernelInfo.mgmtSe1Mask = 0xFFFFFFFF; + + // set kernel info for HW debug and call the callback function + if (nullptr != dbgManager->preDispatchCallBackFunc()) { + DebugToolInfo dbgSetting = {0}; + dbgSetting.scratchAddress_ = kernelInfo.scratchBufAddr; + dbgSetting.scratchSize_ = kernelInfo.scratchBufferSizeInBytes; + dbgSetting.globalAddress_ = kernelInfo.heapBufAddr; + dbgSetting.aclBinary_ = hsaKernel.prog().binaryElf(); + dbgSetting.event_ = enqueueEvent; + + // Call the predispatch callback function & set the trap info + AqlCodeInfo aqlCodeInfo; + aqlCodeInfo.aqlCode_ = (amd_kernel_code_t*)hsaKernel.cpuAqlCode(); + aqlCodeInfo.aqlCodeSize_ = hsaKernel.aqlCodeSize(); + + // Execute the pre-dispatch call back function + dbgManager->executePreDispatchCallBack(reinterpret_cast(aqlPkt), &dbgSetting); + + // assign the debug TMA and TBA for kernel dispatch + if (nullptr != dbgSetting.trapHandler_ && nullptr != dbgSetting.trapBuffer_) { + assignDebugTrapHandler(dbgSetting, kernelInfo); } - //! @todo: need to verify what is wanted for the global memory + kernelInfo.trapPresent = (kernelInfo.trapHandler) ? true : false; + + // Execption policy + kernelInfo.excpEn = dbgSetting.exceptionMask_; + kernelInfo.cacheDisableMask = dbgSetting.cacheDisableMask_; + kernelInfo.sqDebugMode = dbgSetting.gpuSingleStepMode_; + + // Compute the mask for reserved CUs. These two dwords correspond to + // two registers used for reserving CUs for display. 
In the current + // implementation, the number of CUs reserved can be 0 to 7, and it + // is set by debugger users. + if (dbgSetting.monitorMode_) { + uint32_t i = dbgSetting.reservedCuNum_ / 2; + kernelInfo.mgmtSe0Mask <<= i; + i = dbgSetting.reservedCuNum_ - i; + kernelInfo.mgmtSe1Mask <<= i; + } Unimplemented(); - kernelInfo.heapBufAddr = 0; - - kernelInfo.pAqlDispatchPacket = aqlPkt; - kernelInfo.pAqlQueuePtr = reinterpret_cast(hsaQueueMem_->vmAddress()); - - // Get the address of the kernel code and its size for CPU access -/* pal::Memory* aqlCode = hsaKernel.gpuAqlCode(); - if (nullptr != aqlCode) { - address aqlCodeAddr = static_cast
(aqlCode->map(nullptr, 0)); - dbgManager->setKernelCodeInfo(aqlCodeAddr, hsaKernel.aqlCodeSize()); - aqlCode->unmap(nullptr); - } - else { - dbgManager->setKernelCodeInfo(nullptr, 0); - } -*/ - kernelInfo.trapPresent = false; - kernelInfo.trapHandler = nullptr; - kernelInfo.trapHandlerBuffer = nullptr; - - kernelInfo.excpEn = 0; - kernelInfo.cacheDisableMask = 0; - kernelInfo.sqDebugMode = 0; - - kernelInfo.mgmtSe0Mask = 0xFFFFFFFF; - kernelInfo.mgmtSe1Mask = 0xFFFFFFFF; - - // set kernel info for HW debug and call the callback function - if (nullptr != dbgManager->preDispatchCallBackFunc()) { - DebugToolInfo dbgSetting = {0}; - dbgSetting.scratchAddress_ = kernelInfo.scratchBufAddr; - dbgSetting.scratchSize_ = kernelInfo.scratchBufferSizeInBytes; - dbgSetting.globalAddress_ = kernelInfo.heapBufAddr; - dbgSetting.aclBinary_ = hsaKernel.prog().binaryElf(); - dbgSetting.event_ = enqueueEvent; - - // Call the predispatch callback function & set the trap info - AqlCodeInfo aqlCodeInfo; - aqlCodeInfo.aqlCode_ = (amd_kernel_code_t *) hsaKernel.cpuAqlCode(); - aqlCodeInfo.aqlCodeSize_ = hsaKernel.aqlCodeSize(); - - // Execute the pre-dispatch call back function - dbgManager->executePreDispatchCallBack(reinterpret_cast(aqlPkt), &dbgSetting); - - // assign the debug TMA and TBA for kernel dispatch - if (nullptr != dbgSetting.trapHandler_ && nullptr != dbgSetting.trapBuffer_) { - assignDebugTrapHandler(dbgSetting, kernelInfo); - } - - kernelInfo.trapPresent = (kernelInfo.trapHandler) ? true : false; - - // Execption policy - kernelInfo.excpEn = dbgSetting.exceptionMask_; - kernelInfo.cacheDisableMask = dbgSetting.cacheDisableMask_; - kernelInfo.sqDebugMode = dbgSetting.gpuSingleStepMode_; - - // Compute the mask for reserved CUs. These two dwords correspond to - // two registers used for reserving CUs for display. In the current - // implementation, the number of CUs reserved can be 0 to 7, and it - // is set by debugger users. 
- if (dbgSetting.monitorMode_) { - uint32_t i = dbgSetting.reservedCuNum_ / 2; - kernelInfo.mgmtSe0Mask <<= i; - i = dbgSetting.reservedCuNum_ - i; - kernelInfo.mgmtSe1Mask <<= i; - } - Unimplemented(); -/* - // flush/invalidate the instruction, data, L1 and L2 caches - InvalidateSqCaches(); -*/ - } + /* + // flush/invalidate the instruction, data, L1 and L2 caches + InvalidateSqCaches(); + */ + } } -void -VirtualGPU::assignDebugTrapHandler(const DebugToolInfo& dbgSetting, - HwDbgKernelInfo& kernelInfo) -{ - // setup the runtime trap handler code and trap buffer to be assigned before kernel dispatching - // - Memory* rtTrapHandlerMem = static_cast(dev().hwDebugMgr()->runtimeTBA()); - Memory* rtTrapBufferMem = static_cast(dev().hwDebugMgr()->runtimeTMA()); +void VirtualGPU::assignDebugTrapHandler(const DebugToolInfo& dbgSetting, + HwDbgKernelInfo& kernelInfo) { + // setup the runtime trap handler code and trap buffer to be assigned before kernel dispatching + // + Memory* rtTrapHandlerMem = static_cast(dev().hwDebugMgr()->runtimeTBA()); + Memory* rtTrapBufferMem = static_cast(dev().hwDebugMgr()->runtimeTMA()); - kernelInfo.trapHandler = reinterpret_cast(rtTrapHandlerMem->vmAddress() + TbaStartOffset); - // With the TMA corruption hw bug workaround, the trap handler buffer can be set to zero. - // However, by setting the runtime trap buffer (TMA) correct, the runtime trap hander - // without the workaround can still function correctly. - kernelInfo.trapHandlerBuffer = reinterpret_cast(rtTrapBufferMem->vmAddress()); + kernelInfo.trapHandler = reinterpret_cast(rtTrapHandlerMem->vmAddress() + TbaStartOffset); + // With the TMA corruption hw bug workaround, the trap handler buffer can be set to zero. + // However, by setting the runtime trap buffer (TMA) correct, the runtime trap hander + // without the workaround can still function correctly. + kernelInfo.trapHandlerBuffer = reinterpret_cast(rtTrapBufferMem->vmAddress()); - address rtTrapBufferAddress = static_cast
(rtTrapBufferMem->map(this)); + address rtTrapBufferAddress = static_cast
(rtTrapBufferMem->map(this)); - Memory* trapHandlerMem = dev().getGpuMemory(dbgSetting.trapHandler_); - Memory* trapBufferMem = dev().getGpuMemory(dbgSetting.trapBuffer_); + Memory* trapHandlerMem = dev().getGpuMemory(dbgSetting.trapHandler_); + Memory* trapBufferMem = dev().getGpuMemory(dbgSetting.trapBuffer_); - // Address of the trap handler code/buffer should be 256-byte aligned - uint64_t tbaAddress = trapHandlerMem->vmAddress(); - uint64_t tmaAddress = trapBufferMem->vmAddress(); - if ((tbaAddress & 0xFF) != 0 || (tmaAddress & 0xFF) != 0) { - assert(false && "Trap handler/buffer is not 256-byte aligned"); - } + // Address of the trap handler code/buffer should be 256-byte aligned + uint64_t tbaAddress = trapHandlerMem->vmAddress(); + uint64_t tmaAddress = trapBufferMem->vmAddress(); + if ((tbaAddress & 0xFF) != 0 || (tmaAddress & 0xFF) != 0) { + assert(false && "Trap handler/buffer is not 256-byte aligned"); + } - // The addresses of the debug trap handler code (TBA) and buffer (TMA) are - // stored in the runtime trap handler buffer with offset location of 0x18-19 - // and 0x20-21, respectively. - uint64_t * rtTmaPtr = reinterpret_cast(rtTrapBufferAddress + 0x18); - rtTmaPtr[0] = tbaAddress; - rtTmaPtr[1] = tmaAddress; + // The addresses of the debug trap handler code (TBA) and buffer (TMA) are + // stored in the runtime trap handler buffer with offset location of 0x18-19 + // and 0x20-21, respectively. 
+ uint64_t* rtTmaPtr = reinterpret_cast(rtTrapBufferAddress + 0x18); + rtTmaPtr[0] = tbaAddress; + rtTmaPtr[1] = tmaAddress; - rtTrapBufferMem->unmap(nullptr); - - // Add GPU mem handles to the memory list for VidMM - addVmMemory(trapHandlerMem); - addVmMemory(trapBufferMem); - addVmMemory(rtTrapHandlerMem); - addVmMemory(rtTrapBufferMem); + rtTrapBufferMem->unmap(nullptr); + // Add GPU mem handles to the memory list for VidMM + addVmMemory(trapHandlerMem); + addVmMemory(trapBufferMem); + addVmMemory(rtTrapHandlerMem); + addVmMemory(rtTrapBufferMem); } -bool -VirtualGPU::validateSdmaOverlap(const Resource& src, const Resource& dst) -{ - uint64_t srcVmEnd = src.vmAddress() + src.vmSize(); - if (((src.vmAddress() >= sdmaRange_.start_) && - (src.vmAddress() <= sdmaRange_.end_)) || - ((srcVmEnd >= sdmaRange_.start_) && - (srcVmEnd <= sdmaRange_.end_)) || - ((src.vmAddress() <= sdmaRange_.start_) && - (srcVmEnd >= sdmaRange_.end_))) { - sdmaRange_.start_ = dst.vmAddress(); - sdmaRange_.end_ = dst.vmAddress() + dst.vmSize(); - return true; - } +bool VirtualGPU::validateSdmaOverlap(const Resource& src, const Resource& dst) { + uint64_t srcVmEnd = src.vmAddress() + src.vmSize(); + if (((src.vmAddress() >= sdmaRange_.start_) && (src.vmAddress() <= sdmaRange_.end_)) || + ((srcVmEnd >= sdmaRange_.start_) && (srcVmEnd <= sdmaRange_.end_)) || + ((src.vmAddress() <= sdmaRange_.start_) && (srcVmEnd >= sdmaRange_.end_))) { + sdmaRange_.start_ = dst.vmAddress(); + sdmaRange_.end_ = dst.vmAddress() + dst.vmSize(); + return true; + } - sdmaRange_.start_ = std::min(sdmaRange_.start_, dst.vmAddress()); - sdmaRange_.end_ = std::max(sdmaRange_.end_, dst.vmAddress() + dst.vmSize()); - return false; + sdmaRange_.start_ = std::min(sdmaRange_.start_, dst.vmAddress()); + sdmaRange_.end_ = std::max(sdmaRange_.end_, dst.vmAddress() + dst.vmSize()); + return false; } -void -VirtualGPU::submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd) -{ - size_t copySize = cmd.size()[0]; - 
size_t fileOffset = cmd.fileOffset(); - Memory* mem = dev().getGpuMemory(&cmd.memory()); - uint idx = 0; +void VirtualGPU::submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd) { + size_t copySize = cmd.size()[0]; + size_t fileOffset = cmd.fileOffset(); + Memory* mem = dev().getGpuMemory(&cmd.memory()); + uint idx = 0; - assert((cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD) || - (cmd.type() == CL_COMMAND_WRITE_SSG_FILE_AMD)); - const bool writeBuffer(cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD); + assert((cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD) || + (cmd.type() == CL_COMMAND_WRITE_SSG_FILE_AMD)); + const bool writeBuffer(cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD); - if (writeBuffer) { - size_t dstOffset = cmd.origin()[0]; - while (copySize > 0) { - Memory* staging = dev().getGpuMemory(&cmd.staging(idx)); - size_t dstSize = amd::TransferBufferFileCommand::StagingBufferSize; - dstSize = std::min(dstSize, copySize); - void* dstBuffer = staging->cpuMap(*this); - if (!cmd.file()->transferBlock(writeBuffer, - dstBuffer, staging->size(), fileOffset, 0, dstSize)) { - cmd.setStatus(CL_INVALID_OPERATION); - return; - } - staging->cpuUnmap(*this); + if (writeBuffer) { + size_t dstOffset = cmd.origin()[0]; + while (copySize > 0) { + Memory* staging = dev().getGpuMemory(&cmd.staging(idx)); + size_t dstSize = amd::TransferBufferFileCommand::StagingBufferSize; + dstSize = std::min(dstSize, copySize); + void* dstBuffer = staging->cpuMap(*this); + if (!cmd.file()->transferBlock(writeBuffer, dstBuffer, staging->size(), fileOffset, 0, + dstSize)) { + cmd.setStatus(CL_INVALID_OPERATION); + return; + } + staging->cpuUnmap(*this); - bool result = blitMgr().copyBuffer(*staging, *mem, - 0, dstOffset, dstSize, false); - flushDMA(getGpuEvent(staging->iMem())->engineId_); - fileOffset += dstSize; - dstOffset += dstSize; - copySize -= dstSize; - } + bool result = blitMgr().copyBuffer(*staging, *mem, 0, dstOffset, dstSize, false); + 
flushDMA(getGpuEvent(staging->iMem())->engineId_); + fileOffset += dstSize; + dstOffset += dstSize; + copySize -= dstSize; } - else { - size_t srcOffset = cmd.origin()[0]; - while (copySize > 0) { - Memory* staging = dev().getGpuMemory(&cmd.staging(idx)); - size_t srcSize = amd::TransferBufferFileCommand::StagingBufferSize; - srcSize = std::min(srcSize, copySize); - bool result = blitMgr().copyBuffer(*mem, *staging, - srcOffset, 0, srcSize, false); + } else { + size_t srcOffset = cmd.origin()[0]; + while (copySize > 0) { + Memory* staging = dev().getGpuMemory(&cmd.staging(idx)); + size_t srcSize = amd::TransferBufferFileCommand::StagingBufferSize; + srcSize = std::min(srcSize, copySize); + bool result = blitMgr().copyBuffer(*mem, *staging, srcOffset, 0, srcSize, false); - void* srcBuffer = staging->cpuMap(*this); - if (!cmd.file()->transferBlock(writeBuffer, - srcBuffer, staging->size(), fileOffset, 0, srcSize)) { - cmd.setStatus(CL_INVALID_OPERATION); - return; - } - staging->cpuUnmap(*this); + void* srcBuffer = staging->cpuMap(*this); + if (!cmd.file()->transferBlock(writeBuffer, srcBuffer, staging->size(), fileOffset, 0, + srcSize)) { + cmd.setStatus(CL_INVALID_OPERATION); + return; + } + staging->cpuUnmap(*this); - fileOffset += srcSize; - srcOffset += srcSize; - copySize -= srcSize; - } + fileOffset += srcSize; + srcOffset += srcSize; + copySize -= srcSize; } + } } -} // namespace pal +} // namespace pal diff --git a/rocclr/runtime/device/pal/palvirtual.hpp b/rocclr/runtime/device/pal/palvirtual.hpp index 66295988f1..3c59de3e1a 100644 --- a/rocclr/runtime/device/pal/palvirtual.hpp +++ b/rocclr/runtime/device/pal/palvirtual.hpp @@ -37,629 +37,587 @@ class ThreadTrace; class HSAILKernel; //! 
Virtual GPU -class VirtualGPU : public device::VirtualDevice -{ -public: - class Queue : public amd::HeapObject - { - public: - static const uint MaxCmdBuffers = 8; - static const uint MaxCommands = 512; - static const uint StartCmdBufIdx = 1; - static const uint FirstMemoryReference = 0x80000000; - static const uint64_t WaitTimeoutInNsec = 6000000000; - static const uint64_t PollIntervalInNsec = 200000; - - Queue(const Queue&) = delete; - Queue& operator=(const Queue&) = delete; - - static Queue* Create( - Pal::IDevice* palDev, //!< PAL device object - Pal::QueueType queueType, //!< PAL queue type - uint engineIdx, //!< Select particular engine index - Pal::ICmdAllocator* cmdAlloc, //!< PAL CMD buffer allocator - uint rtCU, //!< The number of reserved CUs - amd::CommandQueue::Priority priority //!< Queue priority - ); - - Queue(Pal::IDevice* palDev) - : iQueue_(nullptr), last_kernel_(nullptr), iDev_(palDev), - cmdBufIdSlot_(StartCmdBufIdx), cmdBufIdCurrent_(StartCmdBufIdx), - cmbBufIdRetired_(0), cmdCnt_(0), vlAlloc_(64 * Ki) - { - for (uint i = 0; i < MaxCmdBuffers; ++i) { - iCmdBuffs_[i] = nullptr; - iCmdFences_[i] = nullptr; - } - vlAlloc_.Init(); - } - - ~Queue(); - - void addCmdMemRef(Pal::IGpuMemory* iMem); - void removeCmdMemRef(Pal::IGpuMemory* iMem); - - void addCmdDoppRef(Pal::IGpuMemory* iMem, bool lastDoppCmd, bool pfpaDoppCmd); - - void addMemRef(Pal::IGpuMemory* iMem) const - { - Pal::GpuMemoryRef memRef = {}; - memRef.pGpuMemory = iMem; - iDev_->AddGpuMemoryReferences(1, &memRef, nullptr, - Pal::GpuMemoryRefCantTrim); - } - void removeMemRef(Pal::IGpuMemory* iMem) const - { - iDev_->RemoveGpuMemoryReferences(1, &iMem, nullptr); - } - - // ibReuse forces event wait without polling, to make sure event occured - bool waifForFence(uint cbId, bool ibReuse = false) const - { - Pal::Result result = Pal::Result::Success; - uint64_t start = amd::Os::timeNanos(); - while (Pal::Result::Success != (result = iCmdFences_[cbId]->GetStatus())) { - if (result == 
Pal::Result::ErrorFenceNeverSubmitted) { - result = Pal::Result::Success; - break; - } - uint64_t end = amd::Os::timeNanos(); - if (!ibReuse && ((end - start) < PollIntervalInNsec)) { - amd::Os::yield(); - continue; - } - result = iDev_->WaitForFences(1, &iCmdFences_[cbId], true, WaitTimeoutInNsec); - if (Pal::Result::Success == result) { - break; - } - else if ((Pal::Result::NotReady == result) || - (Pal::Result::Timeout == result)) { - LogWarning("PAL fence isn't ready!"); - if (GPU_ANALYZE_HANG) { - DumpMemoryReferences(); - } - } - else { - LogError("PAL wait for a fence failed!"); - break; - } - } - return (result == Pal::Result::Success) ? true : false; - } - - //! Flushes the current command buffer to HW - //! Returns ID associated with the submission - uint submit(bool forceFlush); - - bool flush(); - - bool waitForEvent(uint id); - - bool isDone(uint id); - - Pal::ICmdBuffer* iCmd() const { return iCmdBuffs_[cmdBufIdSlot_]; } - - Pal::IQueue* iQueue_; //!< PAL queue object - Pal::ICmdBuffer* iCmdBuffs_[MaxCmdBuffers]; //!< PAL command buffers - Pal::IFence* iCmdFences_[MaxCmdBuffers];//!< PAL fences, associated with CMD - const amd::Kernel* last_kernel_; //!< Last submitted kernel - - private: - void DumpMemoryReferences() const; - Pal::IDevice* iDev_; //!< PAL device - uint cmdBufIdSlot_; //!< Command buffer ID slot for submissions - uint cmdBufIdCurrent_; //!< Current global command buffer ID - uint cmbBufIdRetired_; //!< The last retired command buffer ID - uint cmdCnt_; //!< Counter of commands - std::map memReferences_; - Util::VirtualLinearAllocator vlAlloc_; - std::vector palMemRefs_; - std::vector palMems_; - std::vector palDoppRefs_; - }; - - struct CommandBatch : public amd::HeapObject - { - amd::Command* head_; //!< Command batch head - GpuEvent events_[AllEngines]; //!< Last known GPU events - TimeStamp* lastTS_; //!< TS associated with command batch - - //! 
Constructor - CommandBatch( - amd::Command* head, //!< Command batch head - const GpuEvent* events, //!< HW events on all engines - TimeStamp* lastTS //!< Last TS in command batch - ) - { - init(head, events, lastTS); - } - - void init( - amd::Command* head, //!< Command batch head - const GpuEvent* events, //!< HW events on all engines - TimeStamp* lastTS //!< Last TS in command batch - ) - { - head_ = head; - lastTS_ = lastTS; - memcpy(&events_, events, AllEngines * sizeof(GpuEvent)); - } - - }; - - //! The virtual GPU states - union State - { - struct - { - uint profiling_ : 1; //!< Profiling is enabled - uint forceWait_ : 1; //!< Forces wait in flush() - uint profileEnabled_: 1; //!< Profiling is enabled for WaveLimiter - }; - uint value_; - State(): value_(0) {} - }; - - //! CAL descriptor for the GPU virtual device - struct CalVirtualDesc : public amd::EmbeddedObject - { - GpuEvent events_[AllEngines]; //!< Last known GPU events - uint iterations_; //!< Number of iterations for the execution - TimeStamp* lastTS_; //!< Last timestamp executed on Virtual GPU - }; - - typedef std::vector constbufs_t; - - class MemoryDependency : public amd::EmbeddedObject - { - public: - //! Default constructor - MemoryDependency() - : memObjectsInQueue_(nullptr) - , numMemObjectsInQueue_(0) - , maxMemObjectsInQueue_(0) {} - - ~MemoryDependency() { delete [] memObjectsInQueue_; } - - //! Creates memory dependecy structure - bool create(size_t numMemObj); - - //! Notify the tracker about new kernel - void newKernel() { endMemObjectsInQueue_ = numMemObjectsInQueue_; } - - //! Validates memory object on dependency - void validate(VirtualGPU& gpu, const Memory* memory, bool readOnly); - - //! Clear memory dependency - void clear(bool all = true); - - private: - struct MemoryState { - uint64_t start_; //! Busy memory start address - uint64_t end_; //! Busy memory end address - bool readOnly_; //! 
Current GPU state in the queue - }; - - MemoryState* memObjectsInQueue_; //!< Memory object state in the queue - size_t endMemObjectsInQueue_; //!< End of mem objects in the queue - size_t numMemObjectsInQueue_; //!< Number of mem objects in the queue - size_t maxMemObjectsInQueue_; //!< Maximum number of mem objects in the queue - }; - - - class DmaFlushMgmt : public amd::EmbeddedObject - { - public: - DmaFlushMgmt(const Device& dev); - - // Resets DMA command buffer workload - void resetCbWorkload(const Device& dev); - - // Finds split size for the current dispatch - void findSplitSize( - const Device& dev, //!< GPU device object - uint64_t threads, //!< Total number of execution threads - uint instructions //!< Number of ALU instructions - ); - - // Returns TRUE if DMA command buffer is ready for a flush - bool isCbReady( - VirtualGPU& gpu, //!< Virtual GPU object - uint64_t threads, //!< Total number of execution threads - uint instructions //!< Number of ALU instructions - ); - - // Returns dispatch split size - uint dispatchSplitSize() const { return dispatchSplitSize_; } - - private: - uint64_t maxDispatchWorkload_; //!< Maximum number of operations for a single dispatch - uint64_t maxCbWorkload_; //!< Maximum number of operations for DMA command buffer - uint64_t cbWorkload_; //!< Current number of operations in DMA command buffer - uint aluCnt_; //!< All ALUs on the chip - uint dispatchSplitSize_; //!< Dispath split size in elements - }; - -public: - VirtualGPU(Device& device); - //! 
Creates virtual gpu object - bool create( - bool profiling, //!< Enables profilng on the queue - uint deviceQueueSize = 0, //!< Device queue size, 0 if host queue - uint rtCUs = amd::CommandQueue::RealTimeDisabled, - amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal - ); - ~VirtualGPU(); - - void submitReadMemory(amd::ReadMemoryCommand& vcmd); - void submitWriteMemory(amd::WriteMemoryCommand& vcmd); - void submitCopyMemory(amd::CopyMemoryCommand& vcmd); - void submitMapMemory(amd::MapMemoryCommand& vcmd); - void submitUnmapMemory(amd::UnmapMemoryCommand& vcmd); - void submitKernel(amd::NDRangeKernelCommand& vcmd); - bool submitKernelInternal( - const amd::NDRangeContainer& sizes, //!< Workload sizes - const amd::Kernel& kernel, //!< Kernel for execution - const_address parameters, //!< Parameters for the kernel - bool nativeMem = true, //!< Native memory objects - amd::Event* enqueueEvent = nullptr //!< Event provided in the enqueue kernel command - ); - void submitNativeFn(amd::NativeFnCommand& vcmd); - void submitFillMemory(amd::FillMemoryCommand& vcmd); - void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd); - void submitMarker(amd::Marker& vcmd); - void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd); - void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd); - void submitPerfCounter(amd::PerfCounterCommand& vcmd); - void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd); - void submitThreadTrace(amd::ThreadTraceCommand& vcmd); - void submitSignal(amd::SignalCommand & vcmd); - void submitMakeBuffersResident(amd::MakeBuffersResidentCommand & vcmd); - virtual void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd); - virtual void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd); - virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd); - virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd); - virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd); - 
virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd); - - void releaseMemory(Pal::IGpuMemory* iMem, bool wait = true); - - void flush(amd::Command* list = nullptr, bool wait = false); - bool terminate() { return true; } - - //! Returns GPU device object associated with this kernel - const Device& dev() const { return gpuDevice_; } - - //! Returns CAL descriptor of the virtual device - const CalVirtualDesc* cal() const { return &cal_; } - - //! Returns a GPU event, associated with GPU memory - GpuEvent* getGpuEvent( - Pal::IGpuMemory* iMem //!< PAL mem object - ); - - //! Assigns a GPU event, associated with GPU memory - void assignGpuEvent( - Pal::IGpuMemory* iMem, //!< PAL mem object - GpuEvent gpuEvent - ); - - //! Set the last known GPU event - void setGpuEvent( - GpuEvent gpuEvent, //!< GPU event for tracking - bool flush = false //!< TRUE if flush is required - ); - - //! Flush DMA buffer on the specified engine - void flushDMA( - uint engineID //!< Engine ID for DMA flush - ); - - //! Wait for all engines on this Virtual GPU - //! Returns TRUE if CPU didn't wait for GPU - bool waitAllEngines( - CommandBatch* cb = nullptr //!< Command batch - ); - - //! Waits for the latest GPU event with a lock to prevent multiple entries - void waitEventLock( - CommandBatch* cb //!< Command batch - ); - - //! Returns a resource associated with the constant buffer - const ConstBuffer* cb(uint idx) const { return constBufs_[idx]; } - - //! Adds CAL objects into the constant buffer vector - void addConstBuffer(ConstBuffer* cb) { constBufs_.push_back(cb); } - - constbufs_t constBufs_; //!< constant buffers - - //! Start the command profiling - void profilingBegin( - amd::Command& command, //!< Command queue object - bool drmProfiling = false //!< Measure DRM time - ); - - //! End the command profiling - void profilingEnd(amd::Command& command); - - //! 
Collect the profiling results - bool profilingCollectResults( - CommandBatch* cb, //!< Command batch - const amd::Event* waitingEvent //!< Waiting event - ); - - //! Adds a memory handle into the GSL memory array for Virtual Heap - void addVmMemory( - const Memory* memory //!< GPU memory object - ); - - //! Adds the last submitted kernel to the queue for tracking a possible hang - void AddKernel( - const amd::Kernel& kernel //!< AMD kernel object - ) const; - - //! Adds a dopp desktop texture reference - void addDoppRef( - const Memory* memory, //!< GPU memory object - bool lastDoopCmd, //!< is the last submission for the pre-present primary - bool pfpaDoppCmd //!< is a submission for the pre-present primary - ); - - //! Adds a stage write buffer into a list - void addXferWrite(Memory& memory); - - //! Adds a pinned memory object into a map - void addPinnedMem(amd::Memory* mem); - - //! Release pinned memory objects - void releasePinnedMem(); - - //! Finds if pinned memory is cached - amd::Memory* findPinnedMem(void* addr, size_t size); - - //! Returns the monitor object for execution access by VirtualGPU - amd::Monitor& execution() { return execution_; } - - //! Returns the virtual gpu unique index - uint index() const { return index_; } - - //! Get the PrintfDbg object - PrintfDbg& printfDbg() const { return *printfDbg_; } - - //! Get the PrintfDbgHSA object - PrintfDbgHSA& printfDbgHSA() const { return *printfDbgHSA_; } - - //! Enables synchronized transfers - void enableSyncedBlit() const; - - //! Checks if profiling is enabled - bool profiling() const { return state_.profiling_; } - - //! Returns memory dependency class - MemoryDependency& memoryDependency() { return memoryDependency_; } - - //! Returns hsaQueueMem_ - const Memory* hsaQueueMem() const { return hsaQueueMem_;} - - //! Returns DMA flush management structure - const DmaFlushMgmt& dmaFlushMgmt() const { return dmaFlushMgmt_; } - - //! 
Releases GSL memory objects allocated on this queue - void releaseMemObjects(bool scratch = true); - - //! Returns the HW ring used on this virtual device - uint hwRing() const { return hwRing_; } - - //! Returns current timestamp object for profiling - TimeStamp* currTs() const { return cal_.lastTS_; } - - //! Returns virtual queue object for device enqueuing - Memory* vQueue() const { return virtualQueue_; } - - //! Update virtual queue header - void writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable); - - //! Returns TRUE if virtual queue was successfully allocatted - bool createVirtualQueue( - uint deviceQueueSize //!< Device queue size - ); - - EngineType engineID_; //!< Engine ID for this VirtualGPU - State state_; //!< virtual GPU current state - CalVirtualDesc cal_; //!< CAL virtual device descriptor - - void flushCuCaches(HwDbgGpuCacheMask cache_mask); //!< flush/invalidate SQ cache - - //! Returns PAL command buffer interface - Pal::ICmdBuffer* iCmd() const { - Queue* queue = queues_[engineID_]; - return queue->iCmd(); +class VirtualGPU : public device::VirtualDevice { + public: + class Queue : public amd::HeapObject { + public: + static const uint MaxCmdBuffers = 8; + static const uint MaxCommands = 512; + static const uint StartCmdBufIdx = 1; + static const uint FirstMemoryReference = 0x80000000; + static const uint64_t WaitTimeoutInNsec = 6000000000; + static const uint64_t PollIntervalInNsec = 200000; + + Queue(const Queue&) = delete; + Queue& operator=(const Queue&) = delete; + + static Queue* Create(Pal::IDevice* palDev, //!< PAL device object + Pal::QueueType queueType, //!< PAL queue type + uint engineIdx, //!< Select particular engine index + Pal::ICmdAllocator* cmdAlloc, //!< PAL CMD buffer allocator + uint rtCU, //!< The number of reserved CUs + amd::CommandQueue::Priority priority //!< Queue priority + ); + + Queue(Pal::IDevice* palDev) + : iQueue_(nullptr), + last_kernel_(nullptr), + iDev_(palDev), + cmdBufIdSlot_(StartCmdBufIdx), + 
cmdBufIdCurrent_(StartCmdBufIdx), + cmbBufIdRetired_(0), + cmdCnt_(0), + vlAlloc_(64 * Ki) { + for (uint i = 0; i < MaxCmdBuffers; ++i) { + iCmdBuffs_[i] = nullptr; + iCmdFences_[i] = nullptr; + } + vlAlloc_.Init(); } - //! Returns queue, associated with VirtualGPU - Queue& queue(EngineType id) const { return *queues_[id]; } + ~Queue(); - void flushCUCaches(bool flushL2 = false) const - { - Pal::BarrierInfo barrier = {}; - barrier.pipePointWaitCount = 1; - Pal::HwPipePoint point = Pal::HwPipePostCs; - barrier.pPipePoints = &point; - barrier.transitionCount = 1; - uint32_t cacheMask = (flushL2) ? Pal::CoherCopy : Pal::CoherShader; - Pal::BarrierTransition trans = { cacheMask, cacheMask, - { nullptr, { { Pal::ImageAspect::Color, 0, 0 }, 0, 0 }, - Pal::LayoutShaderRead, Pal::LayoutShaderRead}}; - barrier.pTransitions = &trans; - barrier.waitPoint = Pal::HwPipePreCs; - iCmd()->CmdBarrier(barrier); + void addCmdMemRef(Pal::IGpuMemory* iMem); + void removeCmdMemRef(Pal::IGpuMemory* iMem); + + void addCmdDoppRef(Pal::IGpuMemory* iMem, bool lastDoppCmd, bool pfpaDoppCmd); + + void addMemRef(Pal::IGpuMemory* iMem) const { + Pal::GpuMemoryRef memRef = {}; + memRef.pGpuMemory = iMem; + iDev_->AddGpuMemoryReferences(1, &memRef, nullptr, Pal::GpuMemoryRefCantTrim); + } + void removeMemRef(Pal::IGpuMemory* iMem) const { + iDev_->RemoveGpuMemoryReferences(1, &iMem, nullptr); } - void eventBegin(EngineType engId) const { - const static bool Begin = true; - profileEvent(engId, Begin); + // ibReuse forces event wait without polling, to make sure event occured + bool waifForFence(uint cbId, bool ibReuse = false) const { + Pal::Result result = Pal::Result::Success; + uint64_t start = amd::Os::timeNanos(); + while (Pal::Result::Success != (result = iCmdFences_[cbId]->GetStatus())) { + if (result == Pal::Result::ErrorFenceNeverSubmitted) { + result = Pal::Result::Success; + break; + } + uint64_t end = amd::Os::timeNanos(); + if (!ibReuse && ((end - start) < PollIntervalInNsec)) { + 
amd::Os::yield(); + continue; + } + result = iDev_->WaitForFences(1, &iCmdFences_[cbId], true, WaitTimeoutInNsec); + if (Pal::Result::Success == result) { + break; + } else if ((Pal::Result::NotReady == result) || (Pal::Result::Timeout == result)) { + LogWarning("PAL fence isn't ready!"); + if (GPU_ANALYZE_HANG) { + DumpMemoryReferences(); + } + } else { + LogError("PAL wait for a fence failed!"); + break; + } + } + return (result == Pal::Result::Success) ? true : false; } - void eventEnd(EngineType engId, GpuEvent& event, bool forceExec = false) const { - constexpr bool End = false; - if (forceExec) { - constexpr bool ForceFlush = true; - event.id = queues_[engId]->submit(ForceFlush); - profileEvent(engId, End); - } - else { - profileEvent(engId, End); - event.id = queues_[engId]->submit(GPU_FLUSH_ON_EXECUTION); - } - event.engineId_ = engId; + //! Flushes the current command buffer to HW + //! Returns ID associated with the submission + uint submit(bool forceFlush); + + bool flush(); + + bool waitForEvent(uint id); + + bool isDone(uint id); + + Pal::ICmdBuffer* iCmd() const { return iCmdBuffs_[cmdBufIdSlot_]; } + + Pal::IQueue* iQueue_; //!< PAL queue object + Pal::ICmdBuffer* iCmdBuffs_[MaxCmdBuffers]; //!< PAL command buffers + Pal::IFence* iCmdFences_[MaxCmdBuffers]; //!< PAL fences, associated with CMD + const amd::Kernel* last_kernel_; //!< Last submitted kernel + + private: + void DumpMemoryReferences() const; + Pal::IDevice* iDev_; //!< PAL device + uint cmdBufIdSlot_; //!< Command buffer ID slot for submissions + uint cmdBufIdCurrent_; //!< Current global command buffer ID + uint cmbBufIdRetired_; //!< The last retired command buffer ID + uint cmdCnt_; //!< Counter of commands + std::map memReferences_; + Util::VirtualLinearAllocator vlAlloc_; + std::vector palMemRefs_; + std::vector palMems_; + std::vector palDoppRefs_; + }; + + struct CommandBatch : public amd::HeapObject { + amd::Command* head_; //!< Command batch head + GpuEvent events_[AllEngines]; 
//!< Last known GPU events + TimeStamp* lastTS_; //!< TS associated with command batch + + //! Constructor + CommandBatch(amd::Command* head, //!< Command batch head + const GpuEvent* events, //!< HW events on all engines + TimeStamp* lastTS //!< Last TS in command batch + ) { + init(head, events, lastTS); } - void waitForEvent(GpuEvent* event) const { - if (event->isValid()) { - assert(event->engineId_ < AllEngines); - queues_[event->engineId_]->waitForEvent(event->id); - event->invalidate(); - } + void init(amd::Command* head, //!< Command batch head + const GpuEvent* events, //!< HW events on all engines + TimeStamp* lastTS //!< Last TS in command batch + ) { + head_ = head; + lastTS_ = lastTS; + memcpy(&events_, events, AllEngines * sizeof(GpuEvent)); } + }; - bool isDone(GpuEvent* event) { - if (event->isValid()) { - assert(event->engineId_ < AllEngines); - if (queues_[event->engineId_]->isDone(event->id)) { - event->invalidate(); - return true; - } - return false; - } + //! The virtual GPU states + union State { + struct { + uint profiling_ : 1; //!< Profiling is enabled + uint forceWait_ : 1; //!< Forces wait in flush() + uint profileEnabled_ : 1; //!< Profiling is enabled for WaveLimiter + }; + uint value_; + State() : value_(0) {} + }; + + //! CAL descriptor for the GPU virtual device + struct CalVirtualDesc : public amd::EmbeddedObject { + GpuEvent events_[AllEngines]; //!< Last known GPU events + uint iterations_; //!< Number of iterations for the execution + TimeStamp* lastTS_; //!< Last timestamp executed on Virtual GPU + }; + + typedef std::vector constbufs_t; + + class MemoryDependency : public amd::EmbeddedObject { + public: + //! Default constructor + MemoryDependency() + : memObjectsInQueue_(nullptr), numMemObjectsInQueue_(0), maxMemObjectsInQueue_(0) {} + + ~MemoryDependency() { delete[] memObjectsInQueue_; } + + //! Creates memory dependecy structure + bool create(size_t numMemObj); + + //! 
Notify the tracker about new kernel + void newKernel() { endMemObjectsInQueue_ = numMemObjectsInQueue_; } + + //! Validates memory object on dependency + void validate(VirtualGPU& gpu, const Memory* memory, bool readOnly); + + //! Clear memory dependency + void clear(bool all = true); + + private: + struct MemoryState { + uint64_t start_; //! Busy memory start address + uint64_t end_; //! Busy memory end address + bool readOnly_; //! Current GPU state in the queue + }; + + MemoryState* memObjectsInQueue_; //!< Memory object state in the queue + size_t endMemObjectsInQueue_; //!< End of mem objects in the queue + size_t numMemObjectsInQueue_; //!< Number of mem objects in the queue + size_t maxMemObjectsInQueue_; //!< Maximum number of mem objects in the queue + }; + + + class DmaFlushMgmt : public amd::EmbeddedObject { + public: + DmaFlushMgmt(const Device& dev); + + // Resets DMA command buffer workload + void resetCbWorkload(const Device& dev); + + // Finds split size for the current dispatch + void findSplitSize(const Device& dev, //!< GPU device object + uint64_t threads, //!< Total number of execution threads + uint instructions //!< Number of ALU instructions + ); + + // Returns TRUE if DMA command buffer is ready for a flush + bool isCbReady(VirtualGPU& gpu, //!< Virtual GPU object + uint64_t threads, //!< Total number of execution threads + uint instructions //!< Number of ALU instructions + ); + + // Returns dispatch split size + uint dispatchSplitSize() const { return dispatchSplitSize_; } + + private: + uint64_t maxDispatchWorkload_; //!< Maximum number of operations for a single dispatch + uint64_t maxCbWorkload_; //!< Maximum number of operations for DMA command buffer + uint64_t cbWorkload_; //!< Current number of operations in DMA command buffer + uint aluCnt_; //!< All ALUs on the chip + uint dispatchSplitSize_; //!< Dispath split size in elements + }; + + public: + VirtualGPU(Device& device); + //! 
Creates virtual gpu object + bool create(bool profiling, //!< Enables profilng on the queue + uint deviceQueueSize = 0, //!< Device queue size, 0 if host queue + uint rtCUs = amd::CommandQueue::RealTimeDisabled, + amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal); + ~VirtualGPU(); + + void submitReadMemory(amd::ReadMemoryCommand& vcmd); + void submitWriteMemory(amd::WriteMemoryCommand& vcmd); + void submitCopyMemory(amd::CopyMemoryCommand& vcmd); + void submitMapMemory(amd::MapMemoryCommand& vcmd); + void submitUnmapMemory(amd::UnmapMemoryCommand& vcmd); + void submitKernel(amd::NDRangeKernelCommand& vcmd); + bool submitKernelInternal( + const amd::NDRangeContainer& sizes, //!< Workload sizes + const amd::Kernel& kernel, //!< Kernel for execution + const_address parameters, //!< Parameters for the kernel + bool nativeMem = true, //!< Native memory objects + amd::Event* enqueueEvent = nullptr //!< Event provided in the enqueue kernel command + ); + void submitNativeFn(amd::NativeFnCommand& vcmd); + void submitFillMemory(amd::FillMemoryCommand& vcmd); + void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd); + void submitMarker(amd::Marker& vcmd); + void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd); + void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd); + void submitPerfCounter(amd::PerfCounterCommand& vcmd); + void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd); + void submitThreadTrace(amd::ThreadTraceCommand& vcmd); + void submitSignal(amd::SignalCommand& vcmd); + void submitMakeBuffersResident(amd::MakeBuffersResidentCommand& vcmd); + virtual void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd); + virtual void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd); + virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd); + virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd); + virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd); + virtual 
void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd); + + void releaseMemory(Pal::IGpuMemory* iMem, bool wait = true); + + void flush(amd::Command* list = nullptr, bool wait = false); + bool terminate() { return true; } + + //! Returns GPU device object associated with this kernel + const Device& dev() const { return gpuDevice_; } + + //! Returns CAL descriptor of the virtual device + const CalVirtualDesc* cal() const { return &cal_; } + + //! Returns a GPU event, associated with GPU memory + GpuEvent* getGpuEvent(Pal::IGpuMemory* iMem //!< PAL mem object + ); + + //! Assigns a GPU event, associated with GPU memory + void assignGpuEvent(Pal::IGpuMemory* iMem, //!< PAL mem object + GpuEvent gpuEvent); + + //! Set the last known GPU event + void setGpuEvent(GpuEvent gpuEvent, //!< GPU event for tracking + bool flush = false //!< TRUE if flush is required + ); + + //! Flush DMA buffer on the specified engine + void flushDMA(uint engineID //!< Engine ID for DMA flush + ); + + //! Wait for all engines on this Virtual GPU + //! Returns TRUE if CPU didn't wait for GPU + bool waitAllEngines(CommandBatch* cb = nullptr //!< Command batch + ); + + //! Waits for the latest GPU event with a lock to prevent multiple entries + void waitEventLock(CommandBatch* cb //!< Command batch + ); + + //! Returns a resource associated with the constant buffer + const ConstBuffer* cb(uint idx) const { return constBufs_[idx]; } + + //! Adds CAL objects into the constant buffer vector + void addConstBuffer(ConstBuffer* cb) { constBufs_.push_back(cb); } + + constbufs_t constBufs_; //!< constant buffers + + //! Start the command profiling + void profilingBegin(amd::Command& command, //!< Command queue object + bool drmProfiling = false //!< Measure DRM time + ); + + //! End the command profiling + void profilingEnd(amd::Command& command); + + //! 
Collect the profiling results + bool profilingCollectResults(CommandBatch* cb, //!< Command batch + const amd::Event* waitingEvent //!< Waiting event + ); + + //! Adds a memory handle into the GSL memory array for Virtual Heap + void addVmMemory(const Memory* memory //!< GPU memory object + ); + + //! Adds the last submitted kernel to the queue for tracking a possible hang + void AddKernel(const amd::Kernel& kernel //!< AMD kernel object + ) const; + + //! Adds a dopp desktop texture reference + void addDoppRef(const Memory* memory, //!< GPU memory object + bool lastDoopCmd, //!< is the last submission for the pre-present primary + bool pfpaDoppCmd //!< is a submission for the pre-present primary + ); + + //! Adds a stage write buffer into a list + void addXferWrite(Memory& memory); + + //! Adds a pinned memory object into a map + void addPinnedMem(amd::Memory* mem); + + //! Release pinned memory objects + void releasePinnedMem(); + + //! Finds if pinned memory is cached + amd::Memory* findPinnedMem(void* addr, size_t size); + + //! Returns the monitor object for execution access by VirtualGPU + amd::Monitor& execution() { return execution_; } + + //! Returns the virtual gpu unique index + uint index() const { return index_; } + + //! Get the PrintfDbg object + PrintfDbg& printfDbg() const { return *printfDbg_; } + + //! Get the PrintfDbgHSA object + PrintfDbgHSA& printfDbgHSA() const { return *printfDbgHSA_; } + + //! Enables synchronized transfers + void enableSyncedBlit() const; + + //! Checks if profiling is enabled + bool profiling() const { return state_.profiling_; } + + //! Returns memory dependency class + MemoryDependency& memoryDependency() { return memoryDependency_; } + + //! Returns hsaQueueMem_ + const Memory* hsaQueueMem() const { return hsaQueueMem_; } + + //! Returns DMA flush management structure + const DmaFlushMgmt& dmaFlushMgmt() const { return dmaFlushMgmt_; } + + //! 
Releases GSL memory objects allocated on this queue + void releaseMemObjects(bool scratch = true); + + //! Returns the HW ring used on this virtual device + uint hwRing() const { return hwRing_; } + + //! Returns current timestamp object for profiling + TimeStamp* currTs() const { return cal_.lastTS_; } + + //! Returns virtual queue object for device enqueuing + Memory* vQueue() const { return virtualQueue_; } + + //! Update virtual queue header + void writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable); + + //! Returns TRUE if virtual queue was successfully allocatted + bool createVirtualQueue(uint deviceQueueSize //!< Device queue size + ); + + EngineType engineID_; //!< Engine ID for this VirtualGPU + State state_; //!< virtual GPU current state + CalVirtualDesc cal_; //!< CAL virtual device descriptor + + void flushCuCaches(HwDbgGpuCacheMask cache_mask); //!< flush/invalidate SQ cache + + //! Returns PAL command buffer interface + Pal::ICmdBuffer* iCmd() const { + Queue* queue = queues_[engineID_]; + return queue->iCmd(); + } + + //! Returns queue, associated with VirtualGPU + Queue& queue(EngineType id) const { return *queues_[id]; } + + void flushCUCaches(bool flushL2 = false) const { + Pal::BarrierInfo barrier = {}; + barrier.pipePointWaitCount = 1; + Pal::HwPipePoint point = Pal::HwPipePostCs; + barrier.pPipePoints = &point; + barrier.transitionCount = 1; + uint32_t cacheMask = (flushL2) ? 
Pal::CoherCopy : Pal::CoherShader; + Pal::BarrierTransition trans = {cacheMask, + cacheMask, + {nullptr, + {{Pal::ImageAspect::Color, 0, 0}, 0, 0}, + Pal::LayoutShaderRead, + Pal::LayoutShaderRead}}; + barrier.pTransitions = &trans; + barrier.waitPoint = Pal::HwPipePreCs; + iCmd()->CmdBarrier(barrier); + } + + void eventBegin(EngineType engId) const { + const static bool Begin = true; + profileEvent(engId, Begin); + } + + void eventEnd(EngineType engId, GpuEvent& event, bool forceExec = false) const { + constexpr bool End = false; + if (forceExec) { + constexpr bool ForceFlush = true; + event.id = queues_[engId]->submit(ForceFlush); + profileEvent(engId, End); + } else { + profileEvent(engId, End); + event.id = queues_[engId]->submit(GPU_FLUSH_ON_EXECUTION); + } + event.engineId_ = engId; + } + + void waitForEvent(GpuEvent* event) const { + if (event->isValid()) { + assert(event->engineId_ < AllEngines); + queues_[event->engineId_]->waitForEvent(event->id); + event->invalidate(); + } + } + + bool isDone(GpuEvent* event) { + if (event->isValid()) { + assert(event->engineId_ < AllEngines); + if (queues_[event->engineId_]->isDone(event->id)) { + event->invalidate(); return true; + } + return false; } + return true; + } - //! Returns TRUE if SDMA requires overlap synchronizaiton - bool validateSdmaOverlap( - const Resource& src, //!< Source resource for SDMA transfer - const Resource& dst //!< Destination resource for SDMA transfer - ); -protected: - void profileEvent(EngineType engine, bool type) const; + //! Returns TRUE if SDMA requires overlap synchronizaiton + bool validateSdmaOverlap(const Resource& src, //!< Source resource for SDMA transfer + const Resource& dst //!< Destination resource for SDMA transfer + ); - //! Creates buffer object from image - amd::Memory* createBufferFromImage( - amd::Memory& amdImage //! 
The parent image object(untiled images only) - ) const; + protected: + void profileEvent(EngineType engine, bool type) const; -private: - struct MemoryRange { - uint64_t start_; //!< Memory range start address - uint64_t end_; //!< Memory range end address - MemoryRange(): start_(0), end_(0) {} - }; + //! Creates buffer object from image + amd::Memory* createBufferFromImage( + amd::Memory& amdImage //! The parent image object(untiled images only) + ) const; - typedef std::map GpuEvents; + private: + struct MemoryRange { + uint64_t start_; //!< Memory range start address + uint64_t end_; //!< Memory range end address + MemoryRange() : start_(0), end_(0) {} + }; - //! Finds total amount of necessary iterations - inline void findIterations( - const amd::NDRangeContainer& sizes, //!< Original workload sizes - const amd::NDRange& local, //!< Local workgroup size - amd::NDRange& groups, //!< Calculated workgroup sizes - amd::NDRange& remainder, //!< Calculated remainder sizes - size_t& extra //!< Amount of extra executions for remainder - ); + typedef std::map GpuEvents; - //! Allocates constant buffers - bool allocConstantBuffers(); + //! Finds total amount of necessary iterations + inline void findIterations(const amd::NDRangeContainer& sizes, //!< Original workload sizes + const amd::NDRange& local, //!< Local workgroup size + amd::NDRange& groups, //!< Calculated workgroup sizes + amd::NDRange& remainder, //!< Calculated remainder sizes + size_t& extra //!< Amount of extra executions for remainder + ); - //! Releases stage write buffers - void releaseXferWrite(); + //! Allocates constant buffers + bool allocConstantBuffers(); - //! Allocate hsaQueueMem_ - bool allocHsaQueueMem(); + //! Releases stage write buffers + void releaseXferWrite(); - //! Awaits a command batch with a waiting event - bool awaitCompletion( - CommandBatch* cb, //!< Command batch for to wait - const amd::Event* waitingEvent = nullptr //!< A waiting event - ); + //! 
Allocate hsaQueueMem_ + bool allocHsaQueueMem(); - //! Detects memory dependency for HSAIL kernels and flushes caches - bool processMemObjectsHSA( - const amd::Kernel& kernel, //!< AMD kernel object for execution - const_address params, //!< Pointer to the param's store - bool nativeMem, //!< Native memory objects - std::vector* memList //!< Memory list for KMD tracking - ); + //! Awaits a command batch with a waiting event + bool awaitCompletion(CommandBatch* cb, //!< Command batch for to wait + const amd::Event* waitingEvent = nullptr //!< A waiting event + ); - //! Common function for fill memory used by both svm Fill and non-svm fill - bool fillMemory( - cl_command_type type, //!< the command type - amd::Memory* amdMemory, //!< memory object to fill - const void* pattern, //!< pattern to fill the memory - size_t patternSize, //!< pattern size - const amd::Coord3D& origin, //!< memory origin - const amd::Coord3D& size //!< memory size for filling - ); + //! Detects memory dependency for HSAIL kernels and flushes caches + bool processMemObjectsHSA(const amd::Kernel& kernel, //!< AMD kernel object for execution + const_address params, //!< Pointer to the param's store + bool nativeMem, //!< Native memory objects + std::vector* memList //!< Memory list for KMD tracking + ); - bool copyMemory( - cl_command_type type, //!< the command type - amd::Memory& srcMem, //!< source memory object - amd::Memory& dstMem, //!< destination memory object - bool entire, //!< flag of entire memory copy - const amd::Coord3D& srcOrigin, //!< source memory origin - const amd::Coord3D& dstOrigin, //!< destination memory object - const amd::Coord3D& size, //!< copy size - const amd::BufferRect& srcRect, //!< region of source for copy - const amd::BufferRect& dstRect //!< region of destination for copy - ); + //! 
Common function for fill memory used by both svm Fill and non-svm fill + bool fillMemory(cl_command_type type, //!< the command type + amd::Memory* amdMemory, //!< memory object to fill + const void* pattern, //!< pattern to fill the memory + size_t patternSize, //!< pattern size + const amd::Coord3D& origin, //!< memory origin + const amd::Coord3D& size //!< memory size for filling + ); - void buildKernelInfo( - const HSAILKernel& hsaKernel, //!< hsa kernel - hsa_kernel_dispatch_packet_t* aqlPkt, //!< aql packet for dispatch - HwDbgKernelInfo& kernelInfo, //!< kernel info for the dispatch - amd::Event* enqueueEvent //!< Event provided in the enqueue kernel command - ); + bool copyMemory(cl_command_type type, //!< the command type + amd::Memory& srcMem, //!< source memory object + amd::Memory& dstMem, //!< destination memory object + bool entire, //!< flag of entire memory copy + const amd::Coord3D& srcOrigin, //!< source memory origin + const amd::Coord3D& dstOrigin, //!< destination memory object + const amd::Coord3D& size, //!< copy size + const amd::BufferRect& srcRect, //!< region of source for copy + const amd::BufferRect& dstRect //!< region of destination for copy + ); - void assignDebugTrapHandler( - const DebugToolInfo& dbgSetting, //!< debug settings - HwDbgKernelInfo& kernelInfo //!< kernel info for the dispatch - ); + void buildKernelInfo(const HSAILKernel& hsaKernel, //!< hsa kernel + hsa_kernel_dispatch_packet_t* aqlPkt, //!< aql packet for dispatch + HwDbgKernelInfo& kernelInfo, //!< kernel info for the dispatch + amd::Event* enqueueEvent //!< Event provided in the enqueue kernel command + ); - GpuEvents gpuEvents_; //!< GPU events + void assignDebugTrapHandler(const DebugToolInfo& dbgSetting, //!< debug settings + HwDbgKernelInfo& kernelInfo //!< kernel info for the dispatch + ); - Device& gpuDevice_; //!< physical GPU device - amd::Monitor execution_; //!< Lock to serialise access to all device objects - uint index_; //!< The virtual device unique 
index + GpuEvents gpuEvents_; //!< GPU events - PrintfDbg* printfDbg_; //!< GPU printf implemenation - PrintfDbgHSA* printfDbgHSA_; //!< HSAIL printf implemenation + Device& gpuDevice_; //!< physical GPU device + amd::Monitor execution_; //!< Lock to serialise access to all device objects + uint index_; //!< The virtual device unique index - TimeStampCache* tsCache_; //!< TimeStamp cache - MemoryDependency memoryDependency_; //!< Memory dependency class + PrintfDbg* printfDbg_; //!< GPU printf implemenation + PrintfDbgHSA* printfDbgHSA_; //!< HSAIL printf implemenation - DmaFlushMgmt dmaFlushMgmt_; //!< DMA flush management + TimeStampCache* tsCache_; //!< TimeStamp cache + MemoryDependency memoryDependency_; //!< Memory dependency class - std::vector xferWriteBuffers_; //!< Stage write buffers - std::vector pinnedMems_;//!< Pinned memory list + DmaFlushMgmt dmaFlushMgmt_; //!< DMA flush management - typedef std::queue CommandBatchQueue; - CommandBatchQueue cbQueue_; //!< Queue of command batches - CommandBatchQueue freeCbQueue_; //!< Queue of free command batches + std::vector xferWriteBuffers_; //!< Stage write buffers + std::vector pinnedMems_; //!< Pinned memory list - uint hwRing_; //!< HW ring used on this virtual device + typedef std::queue CommandBatchQueue; + CommandBatchQueue cbQueue_; //!< Queue of command batches + CommandBatchQueue freeCbQueue_; //!< Queue of free command batches - uint64_t readjustTimeGPU_; //!< Readjust time between GPU and CPU timestamps - TimeStamp* currTs_; //!< current timestamp for command + uint hwRing_; //!< HW ring used on this virtual device - AmdVQueueHeader* vqHeader_; //!< Sysmem copy for virtual queue header - Memory* virtualQueue_; //!< Virtual device queue - Memory* schedParams_; //!< The scheduler parameters - uint schedParamIdx_; //!< Index in the scheduler parameters buffer - uint deviceQueueSize_; //!< Device queue size - uint maskGroups_; //!< The number of mask groups processed in the scheduler by one thread + 
uint64_t readjustTimeGPU_; //!< Readjust time between GPU and CPU timestamps + TimeStamp* currTs_; //!< current timestamp for command - Memory* hsaQueueMem_; //!< Memory for the amd_queue_t object - Pal::ICmdAllocator* cmdAllocator_; //!< Command buffer allocator - Queue* queues_[AllEngines]; //!< HW queues for all engines - MemoryRange sdmaRange_; //!< SDMA memory range for write access + AmdVQueueHeader* vqHeader_; //!< Sysmem copy for virtual queue header + Memory* virtualQueue_; //!< Virtual device queue + Memory* schedParams_; //!< The scheduler parameters + uint schedParamIdx_; //!< Index in the scheduler parameters buffer + uint deviceQueueSize_; //!< Device queue size + uint maskGroups_; //!< The number of mask groups processed in the scheduler by one thread + + Memory* hsaQueueMem_; //!< Memory for the amd_queue_t object + Pal::ICmdAllocator* cmdAllocator_; //!< Command buffer allocator + Queue* queues_[AllEngines]; //!< HW queues for all engines + MemoryRange sdmaRange_; //!< SDMA memory range for write access }; /*@}*/} // namespace pal diff --git a/rocclr/runtime/device/pal/palwavelimiter.cpp b/rocclr/runtime/device/pal/palwavelimiter.cpp index 300e7b0541..d464b91348 100644 --- a/rocclr/runtime/device/pal/palwavelimiter.cpp +++ b/rocclr/runtime/device/pal/palwavelimiter.cpp @@ -18,351 +18,315 @@ uint WLAlgorithmSmooth::AdaptCount; uint WLAlgorithmSmooth::AbandonThresh; uint WLAlgorithmSmooth::DscThresh; -WaveLimiter::WaveLimiter( - WaveLimiterManager* manager, - uint seqNum, - bool enable, - bool enableDump) - : manager_(manager) - , dumper_(manager_->name() + "_" + std::to_string(seqNum), enableDump) -{ +WaveLimiter::WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump) + : manager_(manager), dumper_(manager_->name() + "_" + std::to_string(seqNum), enableDump) { + setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, manager->getSimdPerSH()); + MaxWave = GPU_WAVE_LIMIT_MAX_WAVE; + WarmUpCount = GPU_WAVE_LIMIT_WARMUP; + 
RunCount = GPU_WAVE_LIMIT_RUN * MaxWave; - setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, manager->getSimdPerSH()); - MaxWave = GPU_WAVE_LIMIT_MAX_WAVE; - WarmUpCount = GPU_WAVE_LIMIT_WARMUP; - RunCount = GPU_WAVE_LIMIT_RUN * MaxWave; + state_ = WARMUP; + if (!flagIsDefault(GPU_WAVE_LIMIT_TRACE)) { + traceStream_.open(std::string(GPU_WAVE_LIMIT_TRACE) + manager_->name() + ".txt"); + } - state_ = WARMUP; - if (!flagIsDefault(GPU_WAVE_LIMIT_TRACE)) { - traceStream_.open(std::string(GPU_WAVE_LIMIT_TRACE) + manager_->name() + - ".txt"); + waves_ = MaxWave; + currWaves_ = MaxWave; + bestWave_ = MaxWave; + enable_ = enable; +} + +WaveLimiter::~WaveLimiter() { + if (traceStream_.is_open()) { + traceStream_.close(); + } +} + +uint WaveLimiter::getWavesPerSH() { + currWaves_ = waves_; + return waves_ * SIMDPerSH_; +} + +WLAlgorithmSmooth::WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable, + bool enableDump) + : WaveLimiter(manager, seqNum, enable, enableDump) { + AdaptCount = 2 * MaxWave + 1; + AbandonThresh = GPU_WAVE_LIMIT_ABANDON; + DscThresh = GPU_WAVE_LIMIT_DSC_THRESH; + + dynRunCount_ = RunCount; + measure_.resize(MaxWave + 1); + reference_.resize(MaxWave + 1); + trial_.resize(MaxWave + 1); + ratio_.resize(MaxWave + 1); + + clearData(); +} + +WLAlgorithmSmooth::~WLAlgorithmSmooth() {} + +void WLAlgorithmSmooth::clearData() { + waves_ = MaxWave; + countAll_ = 0; + clear(measure_); + clear(reference_); + clear(trial_); + clear(ratio_); + discontinuous_ = false; + dataCount_ = 0; +} + +void WLAlgorithmSmooth::updateData(ulong time) { + auto count = dataCount_ - 1; + assert(count < 2 * MaxWave + 1); + assert(time > 0); + assert(currWaves_ == waves_); + if (count % 2 == 0) { + assert(waves_ == MaxWave); + auto pos = count / 2; + measure_[pos] = time; + if (pos > 0) { + auto wave = MaxWave + 1 - pos; + if (abs(static_cast(measure_[pos - 1]) - static_cast(measure_[pos])) * 100 / + measure_[pos] > + DscThresh) { + discontinuous_ = true; + } + 
reference_[wave] = (time + measure_[pos - 1]) / 2; + ratio_[wave] = trial_[wave] * 100 / reference_[wave]; + if (ratio_[bestWave_] > ratio_[wave] && !discontinuous_) { + bestWave_ = wave; + } } - - waves_ = MaxWave; - currWaves_ = MaxWave; - bestWave_ = MaxWave; - enable_ = enable; + } else { + assert(waves_ == MaxWave - count / 2); + trial_[waves_] = time; + } + outputTrace(); } -WaveLimiter::~WaveLimiter() -{ - if (traceStream_.is_open()) { - traceStream_.close(); - } -} +void WLAlgorithmSmooth::outputTrace() { + if (!traceStream_.is_open()) { + return; + } -uint WaveLimiter::getWavesPerSH() -{ - currWaves_ = waves_; - return waves_ * SIMDPerSH_; -} - -WLAlgorithmSmooth::WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump) - : WaveLimiter(manager, seqNum, enable, enableDump) -{ - AdaptCount = 2 * MaxWave + 1; - AbandonThresh = GPU_WAVE_LIMIT_ABANDON; - DscThresh = GPU_WAVE_LIMIT_DSC_THRESH; - - dynRunCount_ = RunCount; - measure_.resize(MaxWave + 1); - reference_.resize(MaxWave + 1); - trial_.resize(MaxWave + 1); - ratio_.resize(MaxWave + 1); - - clearData(); -} - -WLAlgorithmSmooth::~WLAlgorithmSmooth() -{ - -} - -void WLAlgorithmSmooth::clearData() -{ - waves_ = MaxWave; - countAll_ = 0; - clear(measure_); - clear(reference_); - clear(trial_); - clear(ratio_); - discontinuous_ = false; - dataCount_ = 0; -} - -void WLAlgorithmSmooth::updateData(ulong time) -{ - auto count = dataCount_ - 1; - assert(count < 2 * MaxWave + 1); - assert(time > 0); - assert(currWaves_ == waves_); - if (count % 2 == 0) { - assert(waves_ == MaxWave); - auto pos = count / 2; - measure_[pos] = time; - if (pos > 0) { - auto wave = MaxWave + 1 - pos; - if (abs(static_cast(measure_[pos - 1]) - - static_cast(measure_[pos])) * 100 / measure_[pos] > - DscThresh) { - discontinuous_ = true; - } - reference_[wave] = (time + measure_[pos - 1]) / 2; - ratio_[wave] = trial_[wave] * 100 / reference_[wave]; - if (ratio_[bestWave_] > ratio_[wave] && !discontinuous_) 
{ - bestWave_ = wave; - } - } - } else { - assert(waves_ == MaxWave - count / 2); - trial_[waves_] = time; - } - outputTrace(); -} - -void WLAlgorithmSmooth::outputTrace() -{ - if (!traceStream_.is_open()) { - return; - } - - traceStream_ << "[WaveLimiter] " << manager_->name() << " state=" << state_ - << " currWaves=" << currWaves_ << " waves=" << waves_ - << " bestWave=" << bestWave_ << '\n'; - output(traceStream_, "\n measure = ", measure_); - output(traceStream_, "\n reference = ", reference_); - output(traceStream_, "\n ratio = ", ratio_); - traceStream_ << "\n\n"; + traceStream_ << "[WaveLimiter] " << manager_->name() << " state=" << state_ + << " currWaves=" << currWaves_ << " waves=" << waves_ << " bestWave=" << bestWave_ + << '\n'; + output(traceStream_, "\n measure = ", measure_); + output(traceStream_, "\n reference = ", reference_); + output(traceStream_, "\n ratio = ", ratio_); + traceStream_ << "\n\n"; } -void WLAlgorithmSmooth::callback(ulong duration) -{ - dumper_.addData(duration, currWaves_, static_cast(state_)); +void WLAlgorithmSmooth::callback(ulong duration) { + dumper_.addData(duration, currWaves_, static_cast(state_)); - if (!enable_ || (duration == 0)) { - return; - } + if (!enable_ || (duration == 0)) { + return; + } - countAll_++; + countAll_++; - switch (state_) { + switch (state_) { case WARMUP: - if (countAll_ < WarmUpCount) { - return; - } - state_ = ADAPT; - bestWave_ = MaxWave; - clearData(); + if (countAll_ < WarmUpCount) { return; + } + state_ = ADAPT; + bestWave_ = MaxWave; + clearData(); + return; case ADAPT: - assert(duration > 0); - if (waves_ == currWaves_) { - dataCount_++; - updateData(duration); - waves_ = MaxWave + 1 - dataCount_ / 2; - if (dataCount_ == 1 || (dataCount_ < AdaptCount && - !discontinuous_ && (dataCount_ % 2 == 0 || - ratio_[waves_] < AbandonThresh))) { - if (dataCount_ % 2 == 1) { - --waves_; - } else { - waves_ = MaxWave; - } - return; - } - waves_ = bestWave_; - if (dataCount_ >= AdaptCount) { - 
dynRunCount_ = RunCount; - } else { - dynRunCount_ = AdaptCount; - } - countAll_ = rand() % MaxWave; - state_ = RUN; + assert(duration > 0); + if (waves_ == currWaves_) { + dataCount_++; + updateData(duration); + waves_ = MaxWave + 1 - dataCount_ / 2; + if (dataCount_ == 1 || (dataCount_ < AdaptCount && !discontinuous_ && + (dataCount_ % 2 == 0 || ratio_[waves_] < AbandonThresh))) { + if (dataCount_ % 2 == 1) { + --waves_; + } else { + waves_ = MaxWave; + } + return; } - return; + waves_ = bestWave_; + if (dataCount_ >= AdaptCount) { + dynRunCount_ = RunCount; + } else { + dynRunCount_ = AdaptCount; + } + countAll_ = rand() % MaxWave; + state_ = RUN; + } + return; case RUN: - if (countAll_ < dynRunCount_) { - return; - } - state_ = ADAPT; - bestWave_ = MaxWave; - clearData(); + if (countAll_ < dynRunCount_) { return; - } + } + state_ = ADAPT; + bestWave_ = MaxWave; + clearData(); + return; + } } -WaveLimiter::DataDumper::DataDumper(const std::string &kernelName, bool enable) -{ - enable_ = enable; - if (enable_) { - fileName_ = std::string(GPU_WAVE_LIMIT_DUMP) + kernelName + ".csv"; - } +WaveLimiter::DataDumper::DataDumper(const std::string& kernelName, bool enable) { + enable_ = enable; + if (enable_) { + fileName_ = std::string(GPU_WAVE_LIMIT_DUMP) + kernelName + ".csv"; + } } -WaveLimiter::DataDumper::~DataDumper() -{ - if (!enable_) { - return; - } +WaveLimiter::DataDumper::~DataDumper() { + if (!enable_) { + return; + } - std::ofstream OFS(fileName_); - for (size_t i = 0, e = time_.size(); i != e; ++i) { - OFS << i << ',' << time_[i] << ',' << wavePerSIMD_[i] << ',' - << static_cast(state_[i]) << '\n'; - } - OFS.close(); + std::ofstream OFS(fileName_); + for (size_t i = 0, e = time_.size(); i != e; ++i) { + OFS << i << ',' << time_[i] << ',' << wavePerSIMD_[i] << ',' << static_cast(state_[i]) + << '\n'; + } + OFS.close(); } -void WaveLimiter::DataDumper::addData(ulong time, uint wave, char state) -{ - if (!enable_) { - return; - } +void 
WaveLimiter::DataDumper::addData(ulong time, uint wave, char state) { + if (!enable_) { + return; + } - time_.push_back(time); - wavePerSIMD_.push_back(wave); - state_.push_back(state); + time_.push_back(time); + wavePerSIMD_.push_back(wave); + state_.push_back(state); } -WLAlgorithmAvrg::WLAlgorithmAvrg(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump) - : WaveLimiter(manager, seqNum, enable, enableDump) -{ - measure_.resize(MaxWave + 1); - clear(measure_); - countAll_ = 0; +WLAlgorithmAvrg::WLAlgorithmAvrg(WaveLimiterManager* manager, uint seqNum, bool enable, + bool enableDump) + : WaveLimiter(manager, seqNum, enable, enableDump) { + measure_.resize(MaxWave + 1); + clear(measure_); + countAll_ = 0; } -WLAlgorithmAvrg::~WLAlgorithmAvrg() -{ +WLAlgorithmAvrg::~WLAlgorithmAvrg() {} +void WLAlgorithmAvrg::outputTrace() { + if (!traceStream_.is_open()) { + return; + } + + traceStream_ << "[WaveLimiter] " << manager_->name() << " state=" << state_ + << " currWaves=" << currWaves_ << " waves=" << waves_ << " bestWave=" << bestWave_ + << '\n'; + output(traceStream_, "\n measure = ", measure_); + traceStream_ << "\n\n"; } -void WLAlgorithmAvrg::outputTrace() -{ - if (!traceStream_.is_open()) { - return; - } +void WLAlgorithmAvrg::callback(ulong duration) { + dumper_.addData(duration, currWaves_, static_cast(state_)); - traceStream_ << "[WaveLimiter] " << manager_->name() << " state=" << state_ - << " currWaves=" << currWaves_ << " waves=" << waves_ - << " bestWave=" << bestWave_ << '\n'; - output(traceStream_, "\n measure = ", measure_); - traceStream_ << "\n\n"; -} + if (!enable_) { + return; + } -void WLAlgorithmAvrg::callback(ulong duration) -{ - dumper_.addData(duration, currWaves_, static_cast(state_)); + countAll_++; - if (!enable_) { - return; - } - - countAll_++; - - switch (state_) { + switch (state_) { case WARMUP: - state_ = ADAPT; + state_ = ADAPT; case ADAPT: - measure_[waves_] += duration; - if (countAll_ <= MaxWave * 5) { - waves_--; 
- if (waves_ == 0) { - waves_ = MaxWave; - } + measure_[waves_] += duration; + if (countAll_ <= MaxWave * 5) { + waves_--; + if (waves_ == 0) { + waves_ = MaxWave; } - else { - bestWave_ = MaxWave; - for (uint i=1; i 0) { - return fixed_; - } - if (!enable_) { - return 0; - } - auto loc = limiters_.find(vdev); - if (loc == limiters_.end()) { - return 0; - } - assert(loc->second != nullptr); - return loc->second->getWavesPerSH(); +uint WaveLimiterManager::getWavesPerSH(const device::VirtualDevice* vdev) const { + if (fixed_ > 0) { + return fixed_; + } + if (!enable_) { + return 0; + } + auto loc = limiters_.find(vdev); + if (loc == limiters_.end()) { + return 0; + } + assert(loc->second != nullptr); + return loc->second->getWavesPerSH(); } amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback( - const device::VirtualDevice *vdev) -{ - assert(vdev != nullptr); - if (!enable_ && !enableDump_) { - return nullptr; - } + const device::VirtualDevice* vdev) { + assert(vdev != nullptr); + if (!enable_ && !enableDump_) { + return nullptr; + } - amd::ScopedLock SL(monitor_); - auto loc = limiters_.find(vdev); - if (loc != limiters_.end()) { - return loc->second; - } + amd::ScopedLock SL(monitor_); + auto loc = limiters_.find(vdev); + if (loc != limiters_.end()) { + return loc->second; + } - auto limiter = new WLAlgorithmSmooth(this, limiters_.size(), enable_, enableDump_); - if (limiter == nullptr) { - enable_ = false; - return nullptr; - } - limiters_[vdev] = limiter; - return limiter; + auto limiter = new WLAlgorithmSmooth(this, limiters_.size(), enable_, enableDump_); + if (limiter == nullptr) { + enable_ = false; + return nullptr; + } + limiters_[vdev] = limiter; + return limiter; } -void WaveLimiterManager::enable() - { - if (fixed_ > 0) { - return; - } +void WaveLimiterManager::enable() { + if (fixed_ > 0) { + return; + } - // Enable it only for CI+, unless GPU_WAVE_LIMIT_ENABLE is set to 1 - // Disabled for SI due to bug #10817 - if 
(!flagIsDefault(GPU_WAVE_LIMIT_ENABLE)) { - enable_ = GPU_WAVE_LIMIT_ENABLE; - } - else { - if (owner_->workGroupInfo()->wavesPerSimdHint_ == 0) { - enable_ = true; - } - else if (owner_->workGroupInfo()->wavesPerSimdHint_ <= GPU_WAVE_LIMIT_MAX_WAVE) { - fixed_ = owner_->workGroupInfo()->wavesPerSimdHint_ * getSimdPerSH(); - } + // Enable it only for CI+, unless GPU_WAVE_LIMIT_ENABLE is set to 1 + // Disabled for SI due to bug #10817 + if (!flagIsDefault(GPU_WAVE_LIMIT_ENABLE)) { + enable_ = GPU_WAVE_LIMIT_ENABLE; + } else { + if (owner_->workGroupInfo()->wavesPerSimdHint_ == 0) { + enable_ = true; + } else if (owner_->workGroupInfo()->wavesPerSimdHint_ <= GPU_WAVE_LIMIT_MAX_WAVE) { + fixed_ = owner_->workGroupInfo()->wavesPerSimdHint_ * getSimdPerSH(); } + } } } // namespace pal diff --git a/rocclr/runtime/device/pal/palwavelimiter.hpp b/rocclr/runtime/device/pal/palwavelimiter.hpp index cd2639a73f..0ef121b859 100644 --- a/rocclr/runtime/device/pal/palwavelimiter.hpp +++ b/rocclr/runtime/device/pal/palwavelimiter.hpp @@ -18,143 +18,145 @@ class WaveLimiterManager; class HSAILKernel; // Adaptively limit the number of waves per SIMD based on kernel execution time -class WaveLimiter: public amd::ProfilingCallback { -public: - explicit WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump); - virtual ~WaveLimiter(); +class WaveLimiter : public amd::ProfilingCallback { + public: + explicit WaveLimiter(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump); + virtual ~WaveLimiter(); - //! Get waves per shader array to be used for kernel execution. - uint getWavesPerSH(); + //! Get waves per shader array to be used for kernel execution. 
+ uint getWavesPerSH(); -protected: - enum StateKind { - WARMUP, ADAPT, RUN - }; + protected: + enum StateKind { WARMUP, ADAPT, RUN }; - class DataDumper { - public: - explicit DataDumper(const std::string &kernelName, bool enable); - ~DataDumper(); + class DataDumper { + public: + explicit DataDumper(const std::string& kernelName, bool enable); + ~DataDumper(); - //! Record execution time, waves/simd and state of wave limiter. - void addData(ulong time, uint wave, char state); + //! Record execution time, waves/simd and state of wave limiter. + void addData(ulong time, uint wave, char state); - //! Whether this data dumper is enabled. - bool enabled() const { return enable_;} - private: - bool enable_; - std::string fileName_; - std::vector time_; - std::vector wavePerSIMD_; - std::vector state_; - }; + //! Whether this data dumper is enabled. + bool enabled() const { return enable_; } - std::vector measure_; + private: bool enable_; - uint SIMDPerSH_; // Number of SIMDs per SH - uint waves_; // Waves per SIMD to be set - uint bestWave_; // Optimal waves per SIMD - uint countAll_; // Number of kernel executions - StateKind state_; - WaveLimiterManager* manager_; - DataDumper dumper_; - std::ofstream traceStream_; - uint currWaves_; // Current waves per SIMD + std::string fileName_; + std::vector time_; + std::vector wavePerSIMD_; + std::vector state_; + }; - static uint MaxWave; // Maximum number of waves per SIMD - static uint WarmUpCount; // Number of kernel executions for warm up - static uint RunCount; // Number of kernel executions for normal run + std::vector measure_; + bool enable_; + uint SIMDPerSH_; // Number of SIMDs per SH + uint waves_; // Waves per SIMD to be set + uint bestWave_; // Optimal waves per SIMD + uint countAll_; // Number of kernel executions + StateKind state_; + WaveLimiterManager* manager_; + DataDumper dumper_; + std::ofstream traceStream_; + uint currWaves_; // Current waves per SIMD - //! 
Call back from Event::recordProfilingInfo to get execution time. - virtual void callback(ulong duration)=0; + static uint MaxWave; // Maximum number of waves per SIMD + static uint WarmUpCount; // Number of kernel executions for warm up + static uint RunCount; // Number of kernel executions for normal run - //! Output trace of measurement/adaptation. - virtual void outputTrace()=0; + //! Call back from Event::recordProfilingInfo to get execution time. + virtual void callback(ulong duration) = 0; - template void clear(T& A) { - for (auto &I : A) { - I = 0; - } + //! Output trace of measurement/adaptation. + virtual void outputTrace() = 0; + + template void clear(T& A) { + for (auto& I : A) { + I = 0; } - template void output(std::ofstream &ofs, const std::string &prompt, - T& A) { - ofs << prompt; - for (auto &I : A) { - ofs << ' ' << static_cast(I); - } + } + template void output(std::ofstream& ofs, const std::string& prompt, T& A) { + ofs << prompt; + for (auto& I : A) { + ofs << ' ' << static_cast(I); } + } }; -class WLAlgorithmSmooth: public WaveLimiter { -public: - explicit WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump); - virtual ~WLAlgorithmSmooth(); -private: - std::vector reference_; - std::vector trial_; - std::vector ratio_; - bool discontinuous_; // Measured data is discontinuous - uint dynRunCount_; - uint dataCount_; +class WLAlgorithmSmooth : public WaveLimiter { + public: + explicit WLAlgorithmSmooth(WaveLimiterManager* manager, uint seqNum, bool enable, + bool enableDump); + virtual ~WLAlgorithmSmooth(); - static uint AdaptCount; // Number of kernel executions for adapting - static uint AbandonThresh; // Threshold to abandon adaptation - static uint DscThresh; // Threshold for identifying discontinuities + private: + std::vector reference_; + std::vector trial_; + std::vector ratio_; + bool discontinuous_; // Measured data is discontinuous + uint dynRunCount_; + uint dataCount_; - //! 
Update measurement data and optimal waves/simd with execution time. - void updateData(ulong time); + static uint AdaptCount; // Number of kernel executions for adapting + static uint AbandonThresh; // Threshold to abandon adaptation + static uint DscThresh; // Threshold for identifying discontinuities - //! Clear measurement data for the next adaptation. - void clearData(); + //! Update measurement data and optimal waves/simd with execution time. + void updateData(ulong time); - //! Call back from Event::recordProfilingInfo to get execution time. - void callback(ulong duration); + //! Clear measurement data for the next adaptation. + void clearData(); - //! Output trace of measurement/adaptation. - void outputTrace(); + //! Call back from Event::recordProfilingInfo to get execution time. + void callback(ulong duration); + + //! Output trace of measurement/adaptation. + void outputTrace(); }; -class WLAlgorithmAvrg: public WaveLimiter { -public: - explicit WLAlgorithmAvrg(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump); - virtual ~WLAlgorithmAvrg(); -private: - //! Call back from Event::recordProfilingInfo to get execution time. - void callback(ulong duration); +class WLAlgorithmAvrg : public WaveLimiter { + public: + explicit WLAlgorithmAvrg(WaveLimiterManager* manager, uint seqNum, bool enable, bool enableDump); + virtual ~WLAlgorithmAvrg(); - //! Output trace of measurement/adaptation. - void outputTrace(); + private: + //! Call back from Event::recordProfilingInfo to get execution time. + void callback(ulong duration); + + //! Output trace of measurement/adaptation. + void outputTrace(); }; // Create wave limiter for each virtual device for a kernel and manages the wave limiters. 
class WaveLimiterManager { -public: - explicit WaveLimiterManager(device::Kernel* owner, const uint simdPerSH); - virtual ~WaveLimiterManager(); + public: + explicit WaveLimiterManager(device::Kernel* owner, const uint simdPerSH); + virtual ~WaveLimiterManager(); - //! Get waves per shader array for a specific virtual device. - uint getWavesPerSH(const device::VirtualDevice *) const; + //! Get waves per shader array for a specific virtual device. + uint getWavesPerSH(const device::VirtualDevice*) const; - //! Provide call back function for a specific virtual device. - amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice *); + //! Provide call back function for a specific virtual device. + amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice*); - //! Enable wave limiter manager by kernel metadata and flags. - void enable(); + //! Enable wave limiter manager by kernel metadata and flags. + void enable(); - //! Returns the kernel name - const std::string& name() const { return owner_->name(); } + //! Returns the kernel name + const std::string& name() const { return owner_->name(); } - //! Get SimdPerSH. - uint getSimdPerSH() const {return simdPerSH_;} + //! Get SimdPerSH. 
+ uint getSimdPerSH() const { return simdPerSH_; } -private: - device::Kernel *owner_; // The kernel which owns this object - uint simdPerSH_; // Simd Per SH - std::unordered_map limiters_; // Maps virtual device to wave limiter - bool enable_; // Whether the adaptation is enabled - bool enableDump_; // Whether the data dumper is enabled - uint fixed_; // The fixed waves/simd value if not zero - amd::Monitor monitor_; // The mutex for updating the wave limiter map + private: + device::Kernel* owner_; // The kernel which owns this object + uint simdPerSH_; // Simd Per SH + std::unordered_map + limiters_; // Maps virtual device to wave limiter + bool enable_; // Whether the adaptation is enabled + bool enableDump_; // Whether the data dumper is enabled + uint fixed_; // The fixed waves/simd value if not zero + amd::Monitor monitor_; // The mutex for updating the wave limiter map }; } diff --git a/rocclr/runtime/device/rocm/mesa_glinterop.h b/rocclr/runtime/device/rocm/mesa_glinterop.h index 0b9cb4e9f7..aee7d60f96 100644 --- a/rocclr/runtime/device/rocm/mesa_glinterop.h +++ b/rocclr/runtime/device/rocm/mesa_glinterop.h @@ -68,24 +68,24 @@ extern "C" { /** Returned error codes. */ enum { - MESA_GLINTEROP_SUCCESS = 0, - MESA_GLINTEROP_OUT_OF_RESOURCES, - MESA_GLINTEROP_OUT_OF_HOST_MEMORY, - MESA_GLINTEROP_INVALID_OPERATION, - MESA_GLINTEROP_INVALID_VALUE, - MESA_GLINTEROP_INVALID_DISPLAY, - MESA_GLINTEROP_INVALID_CONTEXT, - MESA_GLINTEROP_INVALID_TARGET, - MESA_GLINTEROP_INVALID_OBJECT, - MESA_GLINTEROP_INVALID_MIP_LEVEL, - MESA_GLINTEROP_UNSUPPORTED + MESA_GLINTEROP_SUCCESS = 0, + MESA_GLINTEROP_OUT_OF_RESOURCES, + MESA_GLINTEROP_OUT_OF_HOST_MEMORY, + MESA_GLINTEROP_INVALID_OPERATION, + MESA_GLINTEROP_INVALID_VALUE, + MESA_GLINTEROP_INVALID_DISPLAY, + MESA_GLINTEROP_INVALID_CONTEXT, + MESA_GLINTEROP_INVALID_TARGET, + MESA_GLINTEROP_INVALID_OBJECT, + MESA_GLINTEROP_INVALID_MIP_LEVEL, + MESA_GLINTEROP_UNSUPPORTED }; /** Access flags. 
*/ enum { - MESA_GLINTEROP_ACCESS_READ_WRITE = 0, - MESA_GLINTEROP_ACCESS_READ_ONLY, - MESA_GLINTEROP_ACCESS_WRITE_ONLY + MESA_GLINTEROP_ACCESS_READ_WRITE = 0, + MESA_GLINTEROP_ACCESS_READ_ONLY, + MESA_GLINTEROP_ACCESS_WRITE_ONLY }; @@ -93,17 +93,17 @@ enum { * Device information returned by Mesa. */ typedef struct _mesa_glinterop_device_info { - uint32_t size; /* size of this structure */ + uint32_t size; /* size of this structure */ - /* PCI location */ - uint32_t pci_segment_group; - uint32_t pci_bus; - uint32_t pci_device; - uint32_t pci_function; + /* PCI location */ + uint32_t pci_segment_group; + uint32_t pci_bus; + uint32_t pci_device; + uint32_t pci_function; - /* Device identification */ - uint32_t vendor_id; - uint32_t device_id; + /* Device identification */ + uint32_t vendor_id; + uint32_t device_id; } mesa_glinterop_device_info; @@ -111,53 +111,53 @@ typedef struct _mesa_glinterop_device_info { * Input parameters to Mesa interop export functions. */ typedef struct _mesa_glinterop_export_in { - uint32_t size; /* size of this structure */ + uint32_t size; /* size of this structure */ - /* One of the following: - * - GL_TEXTURE_BUFFER - * - GL_TEXTURE_1D - * - GL_TEXTURE_2D - * - GL_TEXTURE_3D - * - GL_TEXTURE_RECTANGLE - * - GL_TEXTURE_1D_ARRAY - * - GL_TEXTURE_2D_ARRAY - * - GL_TEXTURE_CUBE_MAP_ARRAY - * - GL_TEXTURE_CUBE_MAP - * - GL_TEXTURE_CUBE_MAP_POSITIVE_X - * - GL_TEXTURE_CUBE_MAP_NEGATIVE_X - * - GL_TEXTURE_CUBE_MAP_POSITIVE_Y - * - GL_TEXTURE_CUBE_MAP_NEGATIVE_Y - * - GL_TEXTURE_CUBE_MAP_POSITIVE_Z - * - GL_TEXTURE_CUBE_MAP_NEGATIVE_Z - * - GL_TEXTURE_2D_MULTISAMPLE - * - GL_TEXTURE_2D_MULTISAMPLE_ARRAY - * - GL_TEXTURE_EXTERNAL_OES - * - GL_RENDERBUFFER - * - GL_ARRAY_BUFFER - */ - GLenum target; + /* One of the following: + * - GL_TEXTURE_BUFFER + * - GL_TEXTURE_1D + * - GL_TEXTURE_2D + * - GL_TEXTURE_3D + * - GL_TEXTURE_RECTANGLE + * - GL_TEXTURE_1D_ARRAY + * - GL_TEXTURE_2D_ARRAY + * - GL_TEXTURE_CUBE_MAP_ARRAY + * - GL_TEXTURE_CUBE_MAP + 
* - GL_TEXTURE_CUBE_MAP_POSITIVE_X + * - GL_TEXTURE_CUBE_MAP_NEGATIVE_X + * - GL_TEXTURE_CUBE_MAP_POSITIVE_Y + * - GL_TEXTURE_CUBE_MAP_NEGATIVE_Y + * - GL_TEXTURE_CUBE_MAP_POSITIVE_Z + * - GL_TEXTURE_CUBE_MAP_NEGATIVE_Z + * - GL_TEXTURE_2D_MULTISAMPLE + * - GL_TEXTURE_2D_MULTISAMPLE_ARRAY + * - GL_TEXTURE_EXTERNAL_OES + * - GL_RENDERBUFFER + * - GL_ARRAY_BUFFER + */ + GLenum target; - /* If target is GL_ARRAY_BUFFER, it's a buffer object. - * If target is GL_RENDERBUFFER, it's a renderbuffer object. - * If target is GL_TEXTURE_*, it's a texture object. - */ - GLuint obj; + /* If target is GL_ARRAY_BUFFER, it's a buffer object. + * If target is GL_RENDERBUFFER, it's a renderbuffer object. + * If target is GL_TEXTURE_*, it's a texture object. + */ + GLuint obj; - /* Mipmap level. Ignored for non-texture objects. */ - GLuint miplevel; + /* Mipmap level. Ignored for non-texture objects. */ + GLuint miplevel; - /* One of MESA_GLINTEROP_ACCESS_* flags. This describes how the exported - * object is going to be used. - */ - uint32_t access; + /* One of MESA_GLINTEROP_ACCESS_* flags. This describes how the exported + * object is going to be used. + */ + uint32_t access; - /* Size of memory pointed to by out_driver_data. */ - uint32_t out_driver_data_size; + /* Size of memory pointed to by out_driver_data. */ + uint32_t out_driver_data_size; - /* If the caller wants to query driver-specific data about the OpenGL - * object, this should point to the memory where that data will be stored. - */ - void *out_driver_data; + /* If the caller wants to query driver-specific data about the OpenGL + * object, this should point to the memory where that data will be stored. + */ + void* out_driver_data; } mesa_glinterop_export_in; @@ -165,36 +165,36 @@ typedef struct _mesa_glinterop_export_in { * Outputs of Mesa interop export functions. 
*/ typedef struct _mesa_glinterop_export_out { - uint32_t size; /* size of this structure */ + uint32_t size; /* size of this structure */ - /* The DMABUF handle. It must be closed by the caller using the POSIX - * close() function when it's not needed anymore. Mesa is not responsible - * for closing the handle. - * - * Not closing the handle by the caller will lead to a resource leak, - * prevents releasing the GPU buffer, and may prevent creating new DMABUF - * handles until the process termination. - */ - int dmabuf_fd; + /* The DMABUF handle. It must be closed by the caller using the POSIX + * close() function when it's not needed anymore. Mesa is not responsible + * for closing the handle. + * + * Not closing the handle by the caller will lead to a resource leak, + * prevents releasing the GPU buffer, and may prevent creating new DMABUF + * handles until the process termination. + */ + int dmabuf_fd; - /* The mutable OpenGL internal format specified by glTextureView or - * glTexBuffer. If the object is not one of those, the original internal - * format specified by glTexStorage, glTexImage, or glRenderbufferStorage - * will be returned. - */ - GLenum internalformat; + /* The mutable OpenGL internal format specified by glTextureView or + * glTexBuffer. If the object is not one of those, the original internal + * format specified by glTexStorage, glTexImage, or glRenderbufferStorage + * will be returned. + */ + GLenum internalformat; - /* Parameters specified by glTexBufferRange for GL_TEXTURE_BUFFER. */ - GLintptr buf_offset; - GLsizeiptr buf_size; + /* Parameters specified by glTexBufferRange for GL_TEXTURE_BUFFER. */ + GLintptr buf_offset; + GLsizeiptr buf_size; - /* Parameters specified by glTextureView. If the object is not a texture - * view, default parameters covering the whole texture will be returned. - */ - GLuint view_minlevel; - GLuint view_numlevels; - GLuint view_minlayer; - GLuint view_numlayers; + /* Parameters specified by glTextureView. 
If the object is not a texture + * view, default parameters covering the whole texture will be returned. + */ + GLuint view_minlevel; + GLuint view_numlevels; + GLuint view_minlayer; + GLuint view_numlayers; } mesa_glinterop_export_out; #if !defined(MESA_GLINTEROP_NO_GLX) @@ -207,18 +207,16 @@ typedef struct _mesa_glinterop_export_out { * * \return MESA_GLINTEROP_SUCCESS or MESA_GLINTEROP_* != 0 on error */ -GLAPI int GLAPIENTRY -MesaGLInteropGLXQueryDeviceInfo(Display *dpy, GLXContext context, - mesa_glinterop_device_info *out); +GLAPI int GLAPIENTRY MesaGLInteropGLXQueryDeviceInfo(Display* dpy, GLXContext context, + mesa_glinterop_device_info* out); #endif /** * Same as MesaGLInteropGLXQueryDeviceInfo except that it accepts EGLDisplay * and EGLContext. */ -GLAPI int GLAPIENTRY -MesaGLInteropEGLQueryDeviceInfo(EGLDisplay dpy, EGLContext context, - mesa_glinterop_device_info *out); +GLAPI int GLAPIENTRY MesaGLInteropEGLQueryDeviceInfo(EGLDisplay dpy, EGLContext context, + mesa_glinterop_device_info* out); #if !defined(MESA_GLINTEROP_NO_GLX) @@ -233,36 +231,34 @@ MesaGLInteropEGLQueryDeviceInfo(EGLDisplay dpy, EGLContext context, * * \return MESA_GLINTEROP_SUCCESS or MESA_GLINTEROP_* != 0 on error */ -GLAPI int GLAPIENTRY -MesaGLInteropGLXExportObject(Display *dpy, GLXContext context, - mesa_glinterop_export_in *in, - mesa_glinterop_export_out *out); +GLAPI int GLAPIENTRY MesaGLInteropGLXExportObject(Display* dpy, GLXContext context, + mesa_glinterop_export_in* in, + mesa_glinterop_export_out* out); #endif /** * Same as MesaGLInteropGLXExportObject except that it accepts * EGLDisplay and EGLContext. 
*/ -GLAPI int GLAPIENTRY -MesaGLInteropEGLExportObject(EGLDisplay dpy, EGLContext context, - mesa_glinterop_export_in *in, - mesa_glinterop_export_out *out); +GLAPI int GLAPIENTRY MesaGLInteropEGLExportObject(EGLDisplay dpy, EGLContext context, + mesa_glinterop_export_in* in, + mesa_glinterop_export_out* out); #if !defined(MESA_GLINTEROP_NO_GLX) -typedef int (APIENTRYP PFNMESAGLINTEROPGLXQUERYDEVICEINFOPROC)(Display *dpy, GLXContext context, - mesa_glinterop_device_info *out); +typedef int(APIENTRYP PFNMESAGLINTEROPGLXQUERYDEVICEINFOPROC)(Display* dpy, GLXContext context, + mesa_glinterop_device_info* out); #endif -typedef int (APIENTRYP PFNMESAGLINTEROPEGLQUERYDEVICEINFOPROC)(EGLDisplay dpy, EGLContext context, - mesa_glinterop_device_info *out); +typedef int(APIENTRYP PFNMESAGLINTEROPEGLQUERYDEVICEINFOPROC)(EGLDisplay dpy, EGLContext context, + mesa_glinterop_device_info* out); #if !defined(MESA_GLINTEROP_NO_GLX) -typedef int (APIENTRYP PFNMESAGLINTEROPGLXEXPORTOBJECTPROC)(Display *dpy, GLXContext context, - mesa_glinterop_export_in *in, - mesa_glinterop_export_out *out); +typedef int(APIENTRYP PFNMESAGLINTEROPGLXEXPORTOBJECTPROC)(Display* dpy, GLXContext context, + mesa_glinterop_export_in* in, + mesa_glinterop_export_out* out); #endif -typedef int (APIENTRYP PFNMESAGLINTEROPEGLEXPORTOBJECTPROC)(EGLDisplay dpy, EGLContext context, - mesa_glinterop_export_in *in, - mesa_glinterop_export_out *out); +typedef int(APIENTRYP PFNMESAGLINTEROPEGLEXPORTOBJECTPROC)(EGLDisplay dpy, EGLContext context, + mesa_glinterop_export_in* in, + mesa_glinterop_export_out* out); #ifdef __cplusplus } diff --git a/rocclr/runtime/device/rocm/rocappprofile.cpp b/rocclr/runtime/device/rocm/rocappprofile.cpp index be864b932c..cb996534b9 100644 --- a/rocclr/runtime/device/rocm/rocappprofile.cpp +++ b/rocclr/runtime/device/rocm/rocappprofile.cpp @@ -12,34 +12,31 @@ #include -amd::AppProfile* rocCreateAppProfile() -{ - amd::AppProfile* appProfile = new roc::AppProfile; +amd::AppProfile* 
rocCreateAppProfile() { + amd::AppProfile* appProfile = new roc::AppProfile; - if ((appProfile == nullptr) || !appProfile->init()) { - return nullptr; - } + if ((appProfile == nullptr) || !appProfile->init()) { + return nullptr; + } - return appProfile; + return appProfile; } namespace roc { -bool AppProfile::ParseApplicationProfile() -{ - std::string appName("Explorer"); +bool AppProfile::ParseApplicationProfile() { + std::string appName("Explorer"); - std::transform(appName.begin(), appName.end(), appName.begin(), ::tolower); - std::transform(appFileName_.begin(), appFileName_.end(), appFileName_.begin(), ::tolower); + std::transform(appName.begin(), appName.end(), appName.begin(), ::tolower); + std::transform(appFileName_.begin(), appFileName_.end(), appFileName_.begin(), ::tolower); - if (appFileName_.compare(appName) == 0 ) { - gpuvmHighAddr_ = false; - profileOverridesAllSettings_ = true; - } + if (appFileName_.compare(appName) == 0) { + gpuvmHighAddr_ = false; + profileOverridesAllSettings_ = true; + } - return true; + return true; } - } #endif diff --git a/rocclr/runtime/device/rocm/rocappprofile.hpp b/rocclr/runtime/device/rocm/rocappprofile.hpp index 00221dda2c..8f1bded66f 100644 --- a/rocclr/runtime/device/rocm/rocappprofile.hpp +++ b/rocclr/runtime/device/rocm/rocappprofile.hpp @@ -7,17 +7,14 @@ namespace roc { -class AppProfile : public amd::AppProfile -{ -public: - AppProfile(): amd::AppProfile() {} +class AppProfile : public amd::AppProfile { + public: + AppProfile() : amd::AppProfile() {} -protected: - //! parse application profile based on application file name - virtual bool ParseApplicationProfile(); + protected: + //! 
parse application profile based on application file name + virtual bool ParseApplicationProfile(); }; - } #endif - diff --git a/rocclr/runtime/device/rocm/rocbinary.hpp b/rocclr/runtime/device/rocm/rocbinary.hpp index cdb910f622..056a038787 100644 --- a/rocclr/runtime/device/rocm/rocbinary.hpp +++ b/rocclr/runtime/device/rocm/rocbinary.hpp @@ -12,40 +12,35 @@ namespace roc { typedef std::map NameKernelMap; -class ClBinary : public device::ClBinary -{ -public: - ClBinary(const Device& dev, BinaryImageFormat bifVer = BIF_VERSION3) - : device::ClBinary(dev, bifVer) - {} +class ClBinary : public device::ClBinary { + public: + ClBinary(const Device& dev, BinaryImageFormat bifVer = BIF_VERSION3) + : device::ClBinary(dev, bifVer) {} - //! Destructor - ~ClBinary() {} + //! Destructor + ~ClBinary() {} -protected: - bool setElfTarget() { - uint32_t target = static_cast(21);//dev().calTarget()); - assert (((0xFFFF8000 & target) == 0) && "ASIC target ID >= 2^15"); - uint16_t elf_target = (uint16_t)(0x7FFF & target); - return elfOut()->setTarget(elf_target, amd::OclElf::CAL_PLATFORM); - return true; - } - -private: - //! Disable default copy constructor - ClBinary(const ClBinary&); + protected: + bool setElfTarget() { + uint32_t target = static_cast(21); // dev().calTarget()); + assert(((0xFFFF8000 & target) == 0) && "ASIC target ID >= 2^15"); + uint16_t elf_target = (uint16_t)(0x7FFF & target); + return elfOut()->setTarget(elf_target, amd::OclElf::CAL_PLATFORM); + return true; + } - //! Disable default operator= - ClBinary& operator=(const ClBinary&); + private: + //! Disable default copy constructor + ClBinary(const ClBinary&); - //! Returns the HSA device for this object - const Device& dev() const { return static_cast(dev_); } + //! Disable default operator= + ClBinary& operator=(const ClBinary&); + //! 
Returns the HSA device for this object + const Device& dev() const { return static_cast(dev_); } }; -} // namespace roc - -#endif // WITHOUT_HSA_BACKEND - +} // namespace roc +#endif // WITHOUT_HSA_BACKEND diff --git a/rocclr/runtime/device/rocm/rocblit.cpp b/rocclr/runtime/device/rocm/rocblit.cpp index 7cd64d1c6e..cebf2e5d5c 100644 --- a/rocclr/runtime/device/rocm/rocblit.cpp +++ b/rocclr/runtime/device/rocm/rocblit.cpp @@ -12,2471 +12,2089 @@ namespace roc { DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup) - : HostBlitManager(gpu, setup) - , MinSizeForPinnedTransfer(dev().settings().pinnedMinXferSize_) - , completeOperation_(false) - , context_(nullptr) -{ + : HostBlitManager(gpu, setup), + MinSizeForPinnedTransfer(dev().settings().pinnedMinXferSize_), + completeOperation_(false), + context_(nullptr) {} + +inline void DmaBlitManager::synchronize() const { + // todo TS tracking isn't implemented + gpu().releaseGpuMemoryFence(); + + if (syncOperation_) { + // gpu().waitAllEngines(); + gpu().releasePinnedMem(); + } } -inline void -DmaBlitManager::synchronize() const -{ - // todo TS tracking isn't implemented - gpu().releaseGpuMemoryFence(); - - if (syncOperation_) { -// gpu().waitAllEngines(); - gpu().releasePinnedMem(); - } +inline Memory& DmaBlitManager::gpuMem(device::Memory& mem) const { + return static_cast(mem); } -inline Memory& -DmaBlitManager::gpuMem(device::Memory& mem) const -{ - return static_cast(mem); +bool DmaBlitManager::readMemoryStaged(Memory& srcMemory, void* dstHost, Memory& xferBuf, + size_t origin, size_t& offset, size_t& totalSize, + size_t xferSize) const { + const_address src = srcMemory.getDeviceMemory(); + address staging = xferBuf.getDeviceMemory(); + + // Copy data from device to host + src += origin + offset; + address dst = reinterpret_cast
(dstHost) + offset; + bool ret = hsaCopyStaged(src, dst, totalSize, staging, false); + + return ret; } -bool -DmaBlitManager::readMemoryStaged( - Memory& srcMemory, - void* dstHost, - Memory& xferBuf, - size_t origin, - size_t& offset, - size_t& totalSize, - size_t xferSize) const -{ - const_address src = srcMemory.getDeviceMemory(); - address staging = xferBuf.getDeviceMemory(); +bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, + const amd::Coord3D& origin, const amd::Coord3D& size, + bool entire) const { + // Use host copy if memory has direct access + if (setup_.disableReadBuffer_ || gpuMem(srcMemory).isHostMemDirectAccess()) { + return HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire); + } else { + size_t srcSize = size[0]; + size_t offset = 0; + size_t pinSize = dev().settings().pinnedXferSize_; + pinSize = std::min(pinSize, srcSize); - // Copy data from device to host - src += origin + offset; - address dst = reinterpret_cast
(dstHost) + offset; - bool ret = hsaCopyStaged(src, dst, totalSize, staging, false); + // Check if a pinned transfer can be executed + if (pinSize && (srcSize > MinSizeForPinnedTransfer)) { + // Align offset to 4K boundary + char* tmpHost = const_cast( + amd::alignDown(reinterpret_cast(dstHost), PinnedMemoryAlignment)); - return ret; -} + // Find the partial size for unaligned copy + size_t partial = reinterpret_cast(dstHost) - tmpHost; -bool -DmaBlitManager::readBuffer( - device::Memory& srcMemory, - void* dstHost, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire) const -{ - // Use host copy if memory has direct access - if (setup_.disableReadBuffer_ || gpuMem(srcMemory).isHostMemDirectAccess()) { - return HostBlitManager::readBuffer( - srcMemory, dstHost, origin, size, entire); - } - else { - size_t srcSize = size[0]; - size_t offset = 0; - size_t pinSize = dev().settings().pinnedXferSize_; - pinSize = std::min(pinSize, srcSize); + amd::Memory* pinned = nullptr; + bool first = true; + size_t tmpSize; + size_t pinAllocSize; - // Check if a pinned transfer can be executed - if (pinSize && (srcSize > MinSizeForPinnedTransfer)) { - // Align offset to 4K boundary - char* tmpHost = const_cast( - amd::alignDown(reinterpret_cast(dstHost), - PinnedMemoryAlignment)); - - // Find the partial size for unaligned copy - size_t partial = reinterpret_cast(dstHost) - tmpHost; - - amd::Memory* pinned = nullptr; - bool first = true; - size_t tmpSize; - size_t pinAllocSize; - - // Copy memory, using pinning - while (srcSize > 0) { - // If it's the first iterarion, then readjust the copy size - // to include alignment - if (first) { - pinAllocSize = amd::alignUp(pinSize + partial, - PinnedMemoryAlignment); - tmpSize = std::min(pinAllocSize - partial, srcSize); - first = false; - } - else { - tmpSize = std::min(pinSize, srcSize); - pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment); - partial = 0; - } - amd::Coord3D dst(partial, 0, 0); - amd::Coord3D 
srcPin(origin[0] + offset, 0, 0); - amd::Coord3D copySizePin(tmpSize, 0, 0); - size_t partial2; - - // Allocate a GPU resource for pinning - pinned = pinHostMemory(tmpHost, pinAllocSize, partial2); - if (pinned != nullptr) { - // Get device memory for this virtual device - Memory* dstMemory = dev().getRocMemory(pinned); - - if (!hsaCopy(gpuMem(srcMemory), *dstMemory, - srcPin, dst, copySizePin)) { - LogWarning("DmaBlitManager::readBuffer failed a pinned copy!"); - gpu().addPinnedMem(pinned); - break; - } - gpu().addPinnedMem(pinned); - } - else { - LogWarning("DmaBlitManager::readBuffer failed to pin a resource!"); - break; - } - srcSize -= tmpSize; - offset += tmpSize; - tmpHost = reinterpret_cast(tmpHost) + tmpSize + partial; - } + // Copy memory, using pinning + while (srcSize > 0) { + // If it's the first iterarion, then readjust the copy size + // to include alignment + if (first) { + pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment); + tmpSize = std::min(pinAllocSize - partial, srcSize); + first = false; + } else { + tmpSize = std::min(pinSize, srcSize); + pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment); + partial = 0; } + amd::Coord3D dst(partial, 0, 0); + amd::Coord3D srcPin(origin[0] + offset, 0, 0); + amd::Coord3D copySizePin(tmpSize, 0, 0); + size_t partial2; - if (0 != srcSize) { - Memory& xferBuf = dev().xferRead().acquire(); + // Allocate a GPU resource for pinning + pinned = pinHostMemory(tmpHost, pinAllocSize, partial2); + if (pinned != nullptr) { + // Get device memory for this virtual device + Memory* dstMemory = dev().getRocMemory(pinned); - // Read memory using a staging resource - if (!readMemoryStaged(gpuMem(srcMemory), dstHost, xferBuf, origin[0], - offset, srcSize, srcSize)) { - LogError("DmaBlitManager::readBuffer failed!"); - return false; - } - - dev().xferRead().release(gpu(), xferBuf); + if (!hsaCopy(gpuMem(srcMemory), *dstMemory, srcPin, dst, copySizePin)) { + LogWarning("DmaBlitManager::readBuffer 
failed a pinned copy!"); + gpu().addPinnedMem(pinned); + break; + } + gpu().addPinnedMem(pinned); + } else { + LogWarning("DmaBlitManager::readBuffer failed to pin a resource!"); + break; } + srcSize -= tmpSize; + offset += tmpSize; + tmpHost = reinterpret_cast(tmpHost) + tmpSize + partial; + } } - return true; -} + if (0 != srcSize) { + Memory& xferBuf = dev().xferRead().acquire(); -bool -DmaBlitManager::readBufferRect( - device::Memory& srcMemory, - void* dstHost, - const amd::BufferRect& bufRect, - const amd::BufferRect& hostRect, - const amd::Coord3D& size, - bool entire) const -{ - // Use host copy if memory has direct access - if (setup_.disableReadBufferRect_ || gpuMem(srcMemory).isHostMemDirectAccess()) { - return HostBlitManager::readBufferRect( - srcMemory, dstHost, bufRect, hostRect, size, entire); - } - else { - Memory& xferBuf = dev().xferRead().acquire(); - address staging = xferBuf.getDeviceMemory(); - const_address src = gpuMem(srcMemory).getDeviceMemory(); - - size_t srcOffset; - size_t dstOffset; - - for (size_t z = 0; z < size[2]; ++z) { - for (size_t y = 0; y < size[1]; ++y) { - srcOffset = bufRect.offset(0, y, z); - dstOffset = hostRect.offset(0, y, z); - - // Copy data from device to host - line by line - address dst = reinterpret_cast
(dstHost) + dstOffset; - src += srcOffset; - bool retval = hsaCopyStaged(src, dst, size[0], staging, false); - if (!retval) { - return retval; - } - } - } - dev().xferRead().release(gpu(), xferBuf); - } - - return true; -} - -bool -DmaBlitManager::readImage( - device::Memory& srcMemory, - void* dstHost, - const amd::Coord3D& origin, - const amd::Coord3D& size, - size_t rowPitch, - size_t slicePitch, - bool entire) const -{ - if (setup_.disableReadImage_) { - return HostBlitManager::readImage(srcMemory, dstHost, - origin, size, rowPitch, slicePitch, entire); - } - else { - //! @todo Add HW accelerated path - return HostBlitManager::readImage(srcMemory, dstHost, - origin, size, rowPitch, slicePitch, entire); - } - - return true; -} - -bool -DmaBlitManager::writeMemoryStaged( - const void* srcHost, - Memory& dstMemory, - Memory& xferBuf, - size_t origin, - size_t& offset, - size_t& totalSize, - size_t xferSize) const -{ - address dst = dstMemory.getDeviceMemory(); - address staging = xferBuf.getDeviceMemory(); - - // Copy data from host to device - dst += origin + offset; - const_address src = reinterpret_cast(srcHost) + offset; - bool retval = hsaCopyStaged(src, dst, totalSize, staging, true); - - return retval; -} - -bool -DmaBlitManager::writeBuffer( - const void* srcHost, - device::Memory& dstMemory, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire) const -{ - // Use host copy if memory has direct access - if (setup_.disableWriteBuffer_ || - gpuMem(dstMemory).isHostMemDirectAccess()) { - return HostBlitManager::writeBuffer( - srcHost, dstMemory, origin, size, entire); - } - else { - size_t dstSize = size[0]; - size_t tmpSize = 0; - size_t offset = 0; - size_t pinSize = dev().settings().pinnedXferSize_; - pinSize = std::min(pinSize, dstSize); - - // Check if a pinned transfer can be executed - if (pinSize && (dstSize > MinSizeForPinnedTransfer)) { - // Align offset to 4K boundary - char* tmpHost = const_cast( - 
amd::alignDown(reinterpret_cast(srcHost), - PinnedMemoryAlignment)); - - // Find the partial size for unaligned copy - size_t partial = reinterpret_cast(srcHost) - tmpHost; - - amd::Memory* pinned = nullptr; - bool first = true; - size_t tmpSize; - size_t pinAllocSize; - - // Copy memory, using pinning - while (dstSize > 0) { - // If it's the first iterarion, then readjust the copy size - // to include alignment - if (first) { - pinAllocSize = amd::alignUp(pinSize + partial, - PinnedMemoryAlignment); - tmpSize = std::min(pinAllocSize - partial, dstSize); - first = false; - } - else { - tmpSize = std::min(pinSize, dstSize); - pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment); - partial = 0; - } - amd::Coord3D src(partial, 0, 0); - amd::Coord3D dstPin(origin[0] + offset, 0, 0); - amd::Coord3D copySizePin(tmpSize, 0, 0); - size_t partial2; - - // Allocate a GPU resource for pinning - pinned = pinHostMemory(tmpHost, pinAllocSize, partial2); - - if (pinned != nullptr) { - // Get device memory for this virtual device - Memory* srcMemory = dev().getRocMemory(pinned); - - if (!hsaCopy(*srcMemory, gpuMem(dstMemory), src, dstPin, - copySizePin)) { - LogWarning("DmaBlitManager::writeBuffer failed a pinned copy!"); - gpu().addPinnedMem(pinned); - break; - } - gpu().addPinnedMem(pinned); - } - else { - LogWarning("DmaBlitManager::writeBuffer failed to pin a resource!"); - break; - } - dstSize -= tmpSize; - offset += tmpSize; - tmpHost = reinterpret_cast(tmpHost) + tmpSize + partial; - } - } - - if (dstSize != 0) { - Memory& xferBuf = dev().xferWrite().acquire(); - - // Write memory using a staging resource - if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), xferBuf, origin[0], - offset, dstSize, dstSize)) { - LogError("DmaBlitManager::writeBuffer failed!"); - return false; - } - - gpu().addXferWrite(xferBuf); - } - } - - return true; -} - -bool -DmaBlitManager::writeBufferRect( - const void* srcHost, - device::Memory& dstMemory, - const amd::BufferRect& hostRect, - 
const amd::BufferRect& bufRect, - const amd::Coord3D& size, - bool entire) const -{ - // Use host copy if memory has direct access - if (setup_.disableWriteBufferRect_ || dstMemory.isHostMemDirectAccess()) { - return HostBlitManager::writeBufferRect( - srcHost, dstMemory, hostRect, bufRect, size, entire); - } - else { - Memory& xferBuf = dev().xferWrite().acquire(); - address staging = xferBuf.getDeviceMemory(); - address dst = static_cast(dstMemory).getDeviceMemory(); - - size_t srcOffset; - size_t dstOffset; - - for (size_t z = 0; z < size[2]; ++z) { - for (size_t y = 0; y < size[1]; ++y) { - srcOffset = hostRect.offset(0, y, z); - dstOffset = bufRect.offset(0, y, z); - - // Copy data from host to device - line by line - dst += dstOffset; - const_address src = reinterpret_cast(srcHost) + srcOffset; - bool retval = hsaCopyStaged(src, dst, size[0], staging, true); - if (!retval) { - return retval; - } - } - } - gpu().addXferWrite(xferBuf); - } - - return true; -} - -bool -DmaBlitManager::writeImage( - const void* srcHost, - device::Memory& dstMemory, - const amd::Coord3D& origin, - const amd::Coord3D& size, - size_t rowPitch, - size_t slicePitch, - bool entire) const -{ - if (setup_.disableWriteImage_) { - return HostBlitManager::writeImage( - srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); - } - else { - //! 
@todo Add HW accelerated path - return HostBlitManager::writeImage( - srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); - } - - return true; -} - -bool -DmaBlitManager::copyBuffer( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire) const -{ - if (setup_.disableCopyBuffer_ || - (gpuMem(srcMemory).isHostMemDirectAccess() && - (dev().agent_profile() != HSA_PROFILE_FULL) && - gpuMem(dstMemory).isHostMemDirectAccess())) { - return HostBlitManager::copyBuffer( - srcMemory, dstMemory, srcOrigin, dstOrigin, size); - } - else { - return hsaCopy(gpuMem(srcMemory), gpuMem(dstMemory), - srcOrigin, dstOrigin, size); - } - - return true; -} - -bool -DmaBlitManager::copyBufferRect( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::BufferRect& srcRect, - const amd::BufferRect& dstRect, - const amd::Coord3D& size, - bool entire) const -{ - if (setup_.disableCopyBufferRect_ || - (gpuMem(srcMemory).isHostMemDirectAccess() && - gpuMem(dstMemory).isHostMemDirectAccess())) { - return HostBlitManager::copyBufferRect( - srcMemory, dstMemory, srcRect, dstRect, size, entire); - } - else { + // Read memory using a staging resource + if (!readMemoryStaged(gpuMem(srcMemory), dstHost, xferBuf, origin[0], offset, srcSize, + srcSize)) { + LogError("DmaBlitManager::readBuffer failed!"); return false; - void* src = gpuMem(srcMemory).getDeviceMemory(); - void* dst = gpuMem(dstMemory).getDeviceMemory(); + } - // Detect the agents for memory allocations - const hsa_agent_t srcAgent = (srcMemory.isHostMemDirectAccess()) ? - dev().getCpuAgent() : dev().getBackendDevice(); - const hsa_agent_t dstAgent = (dstMemory.isHostMemDirectAccess()) ? 
- dev().getCpuAgent() : dev().getBackendDevice(); - - const hsa_signal_value_t kInitVal = size[2] * size[1]; - hsa_signal_store_relaxed(completion_signal_, kInitVal); - - for (size_t z = 0; z < size[2]; ++z) { - for (size_t y = 0; y < size[1]; ++y) { - size_t srcOffset = srcRect.offset(0, y, z); - size_t dstOffset = dstRect.offset(0, y, z); - - // Copy memory line by line - hsa_status_t status = hsa_amd_memory_async_copy( - (reinterpret_cast
(dst) + dstOffset), dstAgent, - (reinterpret_cast(src) + srcOffset), - srcAgent, size[0], 0, nullptr, completion_signal_); - if (status != HSA_STATUS_SUCCESS) { - LogPrintfError("DMA buffer failed with code %d", status); - return false; - } - } - } - - hsa_signal_value_t val = - hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ, - 0, uint64_t(-1), HSA_WAIT_STATE_BLOCKED); - - if (val != 0) { - LogError("Async copy failed"); - return false; - } + dev().xferRead().release(gpu(), xferBuf); } - return true; + } + + return true; } -bool -DmaBlitManager::copyImageToBuffer( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire, - size_t rowPitch, - size_t slicePitch) const -{ - bool result = false; +bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost, + const amd::BufferRect& bufRect, const amd::BufferRect& hostRect, + const amd::Coord3D& size, bool entire) const { + // Use host copy if memory has direct access + if (setup_.disableReadBufferRect_ || gpuMem(srcMemory).isHostMemDirectAccess()) { + return HostBlitManager::readBufferRect(srcMemory, dstHost, bufRect, hostRect, size, entire); + } else { + Memory& xferBuf = dev().xferRead().acquire(); + address staging = xferBuf.getDeviceMemory(); + const_address src = gpuMem(srcMemory).getDeviceMemory(); - if (setup_.disableCopyImageToBuffer_) { - result = HostBlitManager::copyImageToBuffer(srcMemory, dstMemory, - srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); - } - else { - Image& srcImage = static_cast(srcMemory); - Buffer& dstBuffer = static_cast(dstMemory); + size_t srcOffset; + size_t dstOffset; - // Use ROC path for a transfer - // Note: it doesn't support SDMA - address dstHost = reinterpret_cast
(dstBuffer.getDeviceMemory()) + - dstOrigin[0]; + for (size_t z = 0; z < size[2]; ++z) { + for (size_t y = 0; y < size[1]; ++y) { + srcOffset = bufRect.offset(0, y, z); + dstOffset = hostRect.offset(0, y, z); - // Use ROCm path for a transfer. - // Note: it doesn't support SDMA - hsa_ext_image_region_t image_region; - image_region.offset.x = srcOrigin[0]; - image_region.offset.y = srcOrigin[1]; - image_region.offset.z = srcOrigin[2]; - image_region.range.x = size[0]; - image_region.range.y = size[1]; - image_region.range.z = size[2]; - - hsa_status_t status = hsa_ext_image_export(gpu().gpu_device(), - srcImage.getHsaImageObject(), dstHost, rowPitch, - slicePitch, &image_region); - result = (status == HSA_STATUS_SUCCESS) ? true : false; - - // Check if a HostBlit transfer is required - if (completeOperation_ && !result) { - result = HostBlitManager::copyImageToBuffer(srcMemory, - dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); + // Copy data from device to host - line by line + address dst = reinterpret_cast
(dstHost) + dstOffset; + src += srcOffset; + bool retval = hsaCopyStaged(src, dst, size[0], staging, false); + if (!retval) { + return retval; } + } } + dev().xferRead().release(gpu(), xferBuf); + } - return result; + return true; } -bool -DmaBlitManager::copyBufferToImage( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire, - size_t rowPitch, - size_t slicePitch) const -{ - bool result = false; +bool DmaBlitManager::readImage(device::Memory& srcMemory, void* dstHost, const amd::Coord3D& origin, + const amd::Coord3D& size, size_t rowPitch, size_t slicePitch, + bool entire) const { + if (setup_.disableReadImage_) { + return HostBlitManager::readImage(srcMemory, dstHost, origin, size, rowPitch, slicePitch, + entire); + } else { + //! @todo Add HW accelerated path + return HostBlitManager::readImage(srcMemory, dstHost, origin, size, rowPitch, slicePitch, + entire); + } - if (setup_.disableCopyBufferToImage_) { - result = HostBlitManager::copyBufferToImage(srcMemory, - dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); - } - else { - Buffer& srcBuffer = static_cast(srcMemory); - Image& dstImage = static_cast(dstMemory); - - // Use ROC path for a transfer - // Note: it doesn't support SDMA - address srcHost = reinterpret_cast
(srcBuffer.getDeviceMemory()) + - srcOrigin[0]; - - hsa_ext_image_region_t image_region; - image_region.offset.x = dstOrigin[0]; - image_region.offset.y = dstOrigin[1]; - image_region.offset.z = dstOrigin[2]; - image_region.range.x = size[0]; - image_region.range.y = size[1]; - image_region.range.z = size[2]; - - hsa_status_t status = hsa_ext_image_import(gpu().gpu_device(), - srcHost, rowPitch, slicePitch, dstImage.getHsaImageObject(), &image_region); - result = (status == HSA_STATUS_SUCCESS) ? true : false; - - // Check if a HostBlit tran sfer is required - if (completeOperation_ && !result) { - result = HostBlitManager::copyBufferToImage(srcMemory, - dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); - } - } - - return result; + return true; } -bool -DmaBlitManager::copyImage( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire) const -{ - bool result = false; +bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, Memory& xferBuf, + size_t origin, size_t& offset, size_t& totalSize, + size_t xferSize) const { + address dst = dstMemory.getDeviceMemory(); + address staging = xferBuf.getDeviceMemory(); - if (setup_.disableCopyImage_) { - return HostBlitManager::copyImage(srcMemory, dstMemory, - srcOrigin, dstOrigin, size, entire); - } - else { - //! 
@todo Add HW accelerated path - return HostBlitManager::copyImage(srcMemory, dstMemory, - srcOrigin, dstOrigin, size, entire); - } + // Copy data from host to device + dst += origin + offset; + const_address src = reinterpret_cast(srcHost) + offset; + bool retval = hsaCopyStaged(src, dst, totalSize, staging, true); - return result; + return retval; } -bool DmaBlitManager::hsaCopy( - const Memory& srcMemory, - const Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool enableCopyRect, - bool flushDMA) const -{ - address src = reinterpret_cast
(srcMemory.getDeviceMemory()); - address dst = reinterpret_cast
(dstMemory.getDeviceMemory()); +bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory, + const amd::Coord3D& origin, const amd::Coord3D& size, + bool entire) const { + // Use host copy if memory has direct access + if (setup_.disableWriteBuffer_ || gpuMem(dstMemory).isHostMemDirectAccess()) { + return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire); + } else { + size_t dstSize = size[0]; + size_t tmpSize = 0; + size_t offset = 0; + size_t pinSize = dev().settings().pinnedXferSize_; + pinSize = std::min(pinSize, dstSize); - src += srcOrigin[0]; - dst += dstOrigin[0]; + // Check if a pinned transfer can be executed + if (pinSize && (dstSize > MinSizeForPinnedTransfer)) { + // Align offset to 4K boundary + char* tmpHost = const_cast( + amd::alignDown(reinterpret_cast(srcHost), PinnedMemoryAlignment)); - // Just call copy function for full profile - hsa_status_t status; - if (dev().agent_profile() == HSA_PROFILE_FULL) { - status = hsa_memory_copy(dst, src, size[0]); - if (status != HSA_STATUS_SUCCESS) { - LogPrintfError("Hsa copy of data failed with code %d", status); + // Find the partial size for unaligned copy + size_t partial = reinterpret_cast(srcHost) - tmpHost; + + amd::Memory* pinned = nullptr; + bool first = true; + size_t tmpSize; + size_t pinAllocSize; + + // Copy memory, using pinning + while (dstSize > 0) { + // If it's the first iterarion, then readjust the copy size + // to include alignment + if (first) { + pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment); + tmpSize = std::min(pinAllocSize - partial, dstSize); + first = false; + } else { + tmpSize = std::min(pinSize, dstSize); + pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment); + partial = 0; } - return (status == HSA_STATUS_SUCCESS); + amd::Coord3D src(partial, 0, 0); + amd::Coord3D dstPin(origin[0] + offset, 0, 0); + amd::Coord3D copySizePin(tmpSize, 0, 0); + size_t partial2; + + // Allocate a GPU resource for pinning + 
pinned = pinHostMemory(tmpHost, pinAllocSize, partial2); + + if (pinned != nullptr) { + // Get device memory for this virtual device + Memory* srcMemory = dev().getRocMemory(pinned); + + if (!hsaCopy(*srcMemory, gpuMem(dstMemory), src, dstPin, copySizePin)) { + LogWarning("DmaBlitManager::writeBuffer failed a pinned copy!"); + gpu().addPinnedMem(pinned); + break; + } + gpu().addPinnedMem(pinned); + } else { + LogWarning("DmaBlitManager::writeBuffer failed to pin a resource!"); + break; + } + dstSize -= tmpSize; + offset += tmpSize; + tmpHost = reinterpret_cast(tmpHost) + tmpSize + partial; + } } + if (dstSize != 0) { + Memory& xferBuf = dev().xferWrite().acquire(); + + // Write memory using a staging resource + if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), xferBuf, origin[0], offset, dstSize, + dstSize)) { + LogError("DmaBlitManager::writeBuffer failed!"); + return false; + } + + gpu().addXferWrite(xferBuf); + } + } + + return true; +} + +bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMemory, + const amd::BufferRect& hostRect, + const amd::BufferRect& bufRect, const amd::Coord3D& size, + bool entire) const { + // Use host copy if memory has direct access + if (setup_.disableWriteBufferRect_ || dstMemory.isHostMemDirectAccess()) { + return HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire); + } else { + Memory& xferBuf = dev().xferWrite().acquire(); + address staging = xferBuf.getDeviceMemory(); + address dst = static_cast(dstMemory).getDeviceMemory(); + + size_t srcOffset; + size_t dstOffset; + + for (size_t z = 0; z < size[2]; ++z) { + for (size_t y = 0; y < size[1]; ++y) { + srcOffset = hostRect.offset(0, y, z); + dstOffset = bufRect.offset(0, y, z); + + // Copy data from host to device - line by line + dst += dstOffset; + const_address src = reinterpret_cast(srcHost) + srcOffset; + bool retval = hsaCopyStaged(src, dst, size[0], staging, true); + if (!retval) { + return retval; + } + } + } + 
gpu().addXferWrite(xferBuf); + } + + return true; +} + +bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory, + const amd::Coord3D& origin, const amd::Coord3D& size, + size_t rowPitch, size_t slicePitch, bool entire) const { + if (setup_.disableWriteImage_) { + return HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch, + entire); + } else { + //! @todo Add HW accelerated path + return HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch, + entire); + } + + return true; +} + +bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, bool entire) const { + if (setup_.disableCopyBuffer_ || + (gpuMem(srcMemory).isHostMemDirectAccess() && (dev().agent_profile() != HSA_PROFILE_FULL) && + gpuMem(dstMemory).isHostMemDirectAccess())) { + return HostBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size); + } else { + return hsaCopy(gpuMem(srcMemory), gpuMem(dstMemory), srcOrigin, dstOrigin, size); + } + + return true; +} + +bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::BufferRect& srcRect, const amd::BufferRect& dstRect, + const amd::Coord3D& size, bool entire) const { + if (setup_.disableCopyBufferRect_ || + (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(dstMemory).isHostMemDirectAccess())) { + return HostBlitManager::copyBufferRect(srcMemory, dstMemory, srcRect, dstRect, size, entire); + } else { + return false; + void* src = gpuMem(srcMemory).getDeviceMemory(); + void* dst = gpuMem(dstMemory).getDeviceMemory(); + // Detect the agents for memory allocations - const hsa_agent_t srcAgent = (srcMemory.isHostMemDirectAccess()) ? - dev().getCpuAgent() : dev().getBackendDevice(); - const hsa_agent_t dstAgent = (dstMemory.isHostMemDirectAccess()) ? 
- dev().getCpuAgent() : dev().getBackendDevice(); + const hsa_agent_t srcAgent = + (srcMemory.isHostMemDirectAccess()) ? dev().getCpuAgent() : dev().getBackendDevice(); + const hsa_agent_t dstAgent = + (dstMemory.isHostMemDirectAccess()) ? dev().getCpuAgent() : dev().getBackendDevice(); - const hsa_signal_value_t kInitVal = 1; + const hsa_signal_value_t kInitVal = size[2] * size[1]; hsa_signal_store_relaxed(completion_signal_, kInitVal); - // Use SDMA to transfer the data - status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent, - size[0], 0, nullptr, completion_signal_); + for (size_t z = 0; z < size[2]; ++z) { + for (size_t y = 0; y < size[1]; ++y) { + size_t srcOffset = srcRect.offset(0, y, z); + size_t dstOffset = dstRect.offset(0, y, z); - if (status == HSA_STATUS_SUCCESS) { - hsa_signal_value_t val = hsa_signal_wait_acquire( - completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0, - uint64_t(-1), HSA_WAIT_STATE_BLOCKED); - if (val != (kInitVal - 1)) { - LogError("Async copy failed"); - status = HSA_STATUS_ERROR; - } - } - else { - LogPrintfError("Hsa copy from host to device failed with code %d", status); - } - - return (status == HSA_STATUS_SUCCESS); -} - -bool DmaBlitManager::hsaCopyStaged( - const_address hostSrc, address hostDst, size_t size, address staging, bool hostToDev) const -{ - // No allocation is necessary for Full Profile - hsa_status_t status; - if (dev().agent_profile() == HSA_PROFILE_FULL) { - status = hsa_memory_copy(hostDst, hostSrc, size); + // Copy memory line by line + hsa_status_t status = + hsa_amd_memory_async_copy((reinterpret_cast
(dst) + dstOffset), dstAgent, + (reinterpret_cast(src) + srcOffset), srcAgent, + size[0], 0, nullptr, completion_signal_); if (status != HSA_STATUS_SUCCESS) { - LogPrintfError("Hsa copy of data failed with code %d", status); + LogPrintfError("DMA buffer failed with code %d", status); + return false; } - return (status == HSA_STATUS_SUCCESS); + } } - size_t totalSize = size; - size_t offset = 0; + hsa_signal_value_t val = hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0, + uint64_t(-1), HSA_WAIT_STATE_BLOCKED); - address hsaBuffer = staging; - - const hsa_signal_value_t kInitVal = 1; - - // Allocate requested size of memory - while (totalSize > 0) { - size = std::min(totalSize, dev().settings().stagedXferSize_); - hsa_signal_store_relaxed(completion_signal_, kInitVal); - - // Copy data from Host to Device - if (hostToDev) { - memcpy(hsaBuffer, hostSrc + offset, size); - status = hsa_amd_memory_async_copy( - hostDst + offset, dev().getBackendDevice(), hsaBuffer, - dev().getCpuAgent(), size, 0, nullptr, completion_signal_); - if (status == HSA_STATUS_SUCCESS) { - hsa_signal_value_t val = - hsa_signal_wait_acquire(completion_signal_, - HSA_SIGNAL_CONDITION_EQ, 0, - uint64_t(-1), HSA_WAIT_STATE_BLOCKED); - - if (val != (kInitVal - 1)) { - LogError("Async copy failed"); - return false; - } - } - else { - LogPrintfError("Hsa copy from host to device failed with code %d", status); - return false; - } - totalSize -= size; - offset += size; - continue; - } - - // Copy data from Device to Host - status = hsa_amd_memory_async_copy(hsaBuffer, - dev().getCpuAgent(), hostSrc + offset, dev().getBackendDevice(), - size, 0, nullptr, completion_signal_); - if (status == HSA_STATUS_SUCCESS) { - hsa_signal_value_t val = hsa_signal_wait_acquire( - completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0, uint64_t(-1), - HSA_WAIT_STATE_BLOCKED); - - if (val != (kInitVal - 1)) { - LogError("Async copy failed"); - return false; - } - memcpy(hostDst + offset, hsaBuffer, size); - 
} - else { - LogPrintfError("Hsa copy from device to host failed with code %d", status); - return false; - } - totalSize -= size; - offset += size; + if (val != 0) { + LogError("Async copy failed"); + return false; } - - return true; + } + return true; } -KernelBlitManager::KernelBlitManager( - VirtualGPU& gpu, Setup setup) - : DmaBlitManager(gpu, setup) - , program_(nullptr) - , constantBuffer_(nullptr) - , xferBufferSize_(0) - , lockXferOps_(nullptr) -{ - for (uint i = 0; i < BlitTotal; ++i) { - kernels_[i] = nullptr; +bool DmaBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, bool entire, size_t rowPitch, + size_t slicePitch) const { + bool result = false; + + if (setup_.disableCopyImageToBuffer_) { + result = HostBlitManager::copyImageToBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire, rowPitch, slicePitch); + } else { + Image& srcImage = static_cast(srcMemory); + Buffer& dstBuffer = static_cast(dstMemory); + + // Use ROC path for a transfer + // Note: it doesn't support SDMA + address dstHost = reinterpret_cast
(dstBuffer.getDeviceMemory()) + dstOrigin[0]; + + // Use ROCm path for a transfer. + // Note: it doesn't support SDMA + hsa_ext_image_region_t image_region; + image_region.offset.x = srcOrigin[0]; + image_region.offset.y = srcOrigin[1]; + image_region.offset.z = srcOrigin[2]; + image_region.range.x = size[0]; + image_region.range.y = size[1]; + image_region.range.z = size[2]; + + hsa_status_t status = hsa_ext_image_export(gpu().gpu_device(), srcImage.getHsaImageObject(), + dstHost, rowPitch, slicePitch, &image_region); + result = (status == HSA_STATUS_SUCCESS) ? true : false; + + // Check if a HostBlit transfer is required + if (completeOperation_ && !result) { + result = HostBlitManager::copyImageToBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire, rowPitch, slicePitch); + } + } + + return result; +} + +bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, bool entire, size_t rowPitch, + size_t slicePitch) const { + bool result = false; + + if (setup_.disableCopyBufferToImage_) { + result = HostBlitManager::copyBufferToImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire, rowPitch, slicePitch); + } else { + Buffer& srcBuffer = static_cast(srcMemory); + Image& dstImage = static_cast(dstMemory); + + // Use ROC path for a transfer + // Note: it doesn't support SDMA + address srcHost = reinterpret_cast
(srcBuffer.getDeviceMemory()) + srcOrigin[0]; + + hsa_ext_image_region_t image_region; + image_region.offset.x = dstOrigin[0]; + image_region.offset.y = dstOrigin[1]; + image_region.offset.z = dstOrigin[2]; + image_region.range.x = size[0]; + image_region.range.y = size[1]; + image_region.range.z = size[2]; + + hsa_status_t status = hsa_ext_image_import(gpu().gpu_device(), srcHost, rowPitch, slicePitch, + dstImage.getHsaImageObject(), &image_region); + result = (status == HSA_STATUS_SUCCESS) ? true : false; + + // Check if a HostBlit tran sfer is required + if (completeOperation_ && !result) { + result = HostBlitManager::copyBufferToImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire, rowPitch, slicePitch); + } + } + + return result; +} + +bool DmaBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, bool entire) const { + bool result = false; + + if (setup_.disableCopyImage_) { + return HostBlitManager::copyImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire); + } else { + //! @todo Add HW accelerated path + return HostBlitManager::copyImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire); + } + + return result; +} + +bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory, + const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, bool enableCopyRect, bool flushDMA) const { + address src = reinterpret_cast
(srcMemory.getDeviceMemory()); + address dst = reinterpret_cast
(dstMemory.getDeviceMemory()); + + src += srcOrigin[0]; + dst += dstOrigin[0]; + + // Just call copy function for full profile + hsa_status_t status; + if (dev().agent_profile() == HSA_PROFILE_FULL) { + status = hsa_memory_copy(dst, src, size[0]); + if (status != HSA_STATUS_SUCCESS) { + LogPrintfError("Hsa copy of data failed with code %d", status); + } + return (status == HSA_STATUS_SUCCESS); + } + + // Detect the agents for memory allocations + const hsa_agent_t srcAgent = + (srcMemory.isHostMemDirectAccess()) ? dev().getCpuAgent() : dev().getBackendDevice(); + const hsa_agent_t dstAgent = + (dstMemory.isHostMemDirectAccess()) ? dev().getCpuAgent() : dev().getBackendDevice(); + + const hsa_signal_value_t kInitVal = 1; + hsa_signal_store_relaxed(completion_signal_, kInitVal); + + // Use SDMA to transfer the data + status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent, size[0], 0, nullptr, + completion_signal_); + + if (status == HSA_STATUS_SUCCESS) { + hsa_signal_value_t val = hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0, + uint64_t(-1), HSA_WAIT_STATE_BLOCKED); + if (val != (kInitVal - 1)) { + LogError("Async copy failed"); + status = HSA_STATUS_ERROR; + } + } else { + LogPrintfError("Hsa copy from host to device failed with code %d", status); + } + + return (status == HSA_STATUS_SUCCESS); +} + +bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_t size, + address staging, bool hostToDev) const { + // No allocation is necessary for Full Profile + hsa_status_t status; + if (dev().agent_profile() == HSA_PROFILE_FULL) { + status = hsa_memory_copy(hostDst, hostSrc, size); + if (status != HSA_STATUS_SUCCESS) { + LogPrintfError("Hsa copy of data failed with code %d", status); + } + return (status == HSA_STATUS_SUCCESS); + } + + size_t totalSize = size; + size_t offset = 0; + + address hsaBuffer = staging; + + const hsa_signal_value_t kInitVal = 1; + + // Allocate requested size of memory + while (totalSize 
> 0) { + size = std::min(totalSize, dev().settings().stagedXferSize_); + hsa_signal_store_relaxed(completion_signal_, kInitVal); + + // Copy data from Host to Device + if (hostToDev) { + memcpy(hsaBuffer, hostSrc + offset, size); + status = hsa_amd_memory_async_copy(hostDst + offset, dev().getBackendDevice(), hsaBuffer, + dev().getCpuAgent(), size, 0, nullptr, completion_signal_); + if (status == HSA_STATUS_SUCCESS) { + hsa_signal_value_t val = hsa_signal_wait_acquire( + completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0, uint64_t(-1), HSA_WAIT_STATE_BLOCKED); + + if (val != (kInitVal - 1)) { + LogError("Async copy failed"); + return false; + } + } else { + LogPrintfError("Hsa copy from host to device failed with code %d", status); + return false; + } + totalSize -= size; + offset += size; + continue; } + // Copy data from Device to Host + status = + hsa_amd_memory_async_copy(hsaBuffer, dev().getCpuAgent(), hostSrc + offset, + dev().getBackendDevice(), size, 0, nullptr, completion_signal_); + if (status == HSA_STATUS_SUCCESS) { + hsa_signal_value_t val = hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ, + 0, uint64_t(-1), HSA_WAIT_STATE_BLOCKED); + + if (val != (kInitVal - 1)) { + LogError("Async copy failed"); + return false; + } + memcpy(hostDst + offset, hsaBuffer, size); + } else { + LogPrintfError("Hsa copy from device to host failed with code %d", status); + return false; + } + totalSize -= size; + offset += size; + } + + return true; +} + +KernelBlitManager::KernelBlitManager(VirtualGPU& gpu, Setup setup) + : DmaBlitManager(gpu, setup), + program_(nullptr), + constantBuffer_(nullptr), + xferBufferSize_(0), + lockXferOps_(nullptr) { + for (uint i = 0; i < BlitTotal; ++i) { + kernels_[i] = nullptr; + } + + for (uint i = 0; i < MaxXferBuffers; ++i) { + xferBuffers_[i] = nullptr; + } + + completeOperation_ = false; +} + +KernelBlitManager::~KernelBlitManager() { + for (uint i = 0; i < BlitTotal; ++i) { + if (nullptr != kernels_[i]) { + 
kernels_[i]->release(); + } + } + if (nullptr != program_) { + program_->release(); + } + + if (nullptr != context_) { + // Release a dummy context + context_->release(); + } + + if (nullptr != constantBuffer_) { + constantBuffer_->release(); + } + + for (uint i = 0; i < MaxXferBuffers; ++i) { + if (nullptr != xferBuffers_[i]) { + xferBuffers_[i]->release(); + } + } + + delete lockXferOps_; +} + +bool KernelBlitManager::create(amd::Device& device) { + if (!DmaBlitManager::create(device)) { + return false; + } + + if (!createProgram(static_cast(device))) { + return false; + } + return true; +} + +bool KernelBlitManager::createProgram(Device& device) { + if (device.blitProgram() == nullptr) { + return false; + } + + std::vector devices; + devices.push_back(&device); + + // Save context and program for this device + context_ = device.blitProgram()->context_; + context_->retain(); + program_ = device.blitProgram()->program_; + program_->retain(); + + bool result = false; + do { + // Create kernel objects for all blits + for (uint i = 0; i < BlitTotal; ++i) { + const amd::Symbol* symbol = program_->findSymbol(BlitName[i]); + if (symbol == nullptr) { + break; + } + kernels_[i] = new amd::Kernel(*program_, *symbol, BlitName[i]); + if (kernels_[i] == nullptr) { + break; + } + // Validate blit kernels for the scratch memory usage (pre SI) + if (!device.validateKernel(*kernels_[i], &gpu())) { + break; + } + } + + result = true; + } while (!result); + + // Create an internal constant buffer + constantBuffer_ = new (*context_) amd::Buffer(*context_, CL_MEM_ALLOC_HOST_PTR, 4 * Ki); + + if ((constantBuffer_ != nullptr) && !constantBuffer_->create(nullptr)) { + constantBuffer_->release(); + constantBuffer_ = nullptr; + return false; + } else if (constantBuffer_ == nullptr) { + return false; + } + + // Assign the constant buffer to the current virtual GPU + constantBuffer_->setVirtualDevice(&gpu()); + + if (dev().settings().xferBufSize_ > 0) { + xferBufferSize_ = 
dev().settings().xferBufSize_; for (uint i = 0; i < MaxXferBuffers; ++i) { + // Create internal xfer buffers for image copy optimization + xferBuffers_[i] = new (*context_) amd::Buffer(*context_, 0, xferBufferSize_); + + if ((xferBuffers_[i] != nullptr) && !xferBuffers_[i]->create(nullptr)) { + xferBuffers_[i]->release(); xferBuffers_[i] = nullptr; - } - - completeOperation_ = false; -} - -KernelBlitManager::~KernelBlitManager() -{ - for (uint i = 0; i < BlitTotal; ++i) { - if (nullptr != kernels_[i]) { - kernels_[i]->release(); - } - } - if (nullptr != program_) { - program_->release(); - } - - if (nullptr != context_) { - // Release a dummy context - context_->release(); - } - - if (nullptr != constantBuffer_) { - constantBuffer_->release(); - } - - for (uint i = 0; i < MaxXferBuffers; ++i) { - if (nullptr != xferBuffers_[i]) { - xferBuffers_[i]->release(); - } - } - - delete lockXferOps_; -} - -bool -KernelBlitManager::create(amd::Device& device) -{ - if (!DmaBlitManager::create(device)) { return false; - } - - if (!createProgram(static_cast(device))) { + } else if (xferBuffers_[i] == nullptr) { return false; + } + + // Assign the xfer buffer to the current virtual GPU + xferBuffers_[i]->setVirtualDevice(&gpu()); + //! @note Workaround for conformance allocation test. + //! Force GPU mem alloc. + //! Unaligned images require xfer optimization, + //! but deferred memory allocation can cause + //! virtual heap fragmentation for big allocations and + //! then fail the following test with 32 bit ISA, because + //! runtime runs out of 4GB space. 
+ dev().getRocMemory(xferBuffers_[i]); } - return true; -} + } -bool -KernelBlitManager::createProgram(Device& device) -{ - if (device.blitProgram() == nullptr) { - return false; - } + lockXferOps_ = new amd::Monitor("Transfer Ops Lock", true); + if (nullptr == lockXferOps_) { + return false; + } - std::vector devices; - devices.push_back(&device); - - // Save context and program for this device - context_ = device.blitProgram()->context_; - context_->retain(); - program_ = device.blitProgram()->program_; - program_->retain(); - - bool result = false; - do { - // Create kernel objects for all blits - for (uint i = 0; i < BlitTotal; ++i) { - const amd::Symbol* symbol = program_->findSymbol(BlitName[i]); - if (symbol == nullptr) { - break; - } - kernels_[i] = new amd::Kernel(*program_, *symbol, BlitName[i]); - if (kernels_[i] == nullptr) { - break; - } - // Validate blit kernels for the scratch memory usage (pre SI) - if (!device.validateKernel(*kernels_[i], &gpu())) { - break; - } - } - - result = true; - } while(!result); - - // Create an internal constant buffer - constantBuffer_ = new (*context_) - amd::Buffer(*context_, CL_MEM_ALLOC_HOST_PTR, 4 * Ki); - - if ((constantBuffer_ != nullptr) && !constantBuffer_->create(nullptr)) { - constantBuffer_->release(); - constantBuffer_ = nullptr; - return false; - } - else if (constantBuffer_ == nullptr) { - return false; - } - - // Assign the constant buffer to the current virtual GPU - constantBuffer_->setVirtualDevice(&gpu()); - - if (dev().settings().xferBufSize_ > 0) { - xferBufferSize_ = dev().settings().xferBufSize_; - for (uint i = 0; i < MaxXferBuffers; ++i) { - // Create internal xfer buffers for image copy optimization - xferBuffers_[i] = new (*context_) - amd::Buffer(*context_, 0, xferBufferSize_); - - if ((xferBuffers_[i] != nullptr) && !xferBuffers_[i]->create(nullptr)) { - xferBuffers_[i]->release(); - xferBuffers_[i] = nullptr; - return false; - } - else if (xferBuffers_[i] == nullptr) { - return false; - } 
- - // Assign the xfer buffer to the current virtual GPU - xferBuffers_[i]->setVirtualDevice(&gpu()); - //! @note Workaround for conformance allocation test. - //! Force GPU mem alloc. - //! Unaligned images require xfer optimization, - //! but deferred memory allocation can cause - //! virtual heap fragmentation for big allocations and - //! then fail the following test with 32 bit ISA, because - //! runtime runs out of 4GB space. - dev().getRocMemory(xferBuffers_[i]); - } - } - - lockXferOps_ = new amd::Monitor("Transfer Ops Lock", true); - if (nullptr == lockXferOps_) { - return false; - } - - return result; + return result; } // The following data structures will be used for the view creations. // Some formats has to be converted before a kernel blit operation struct FormatConvertion { - cl_uint clOldType_; - cl_uint clNewType_; + cl_uint clOldType_; + cl_uint clNewType_; }; // The list of rejected data formats and corresponding conversion -static const FormatConvertion RejectedData[] = -{ - { CL_UNORM_INT8, CL_UNSIGNED_INT8 }, - { CL_UNORM_INT16, CL_UNSIGNED_INT16 }, - { CL_SNORM_INT8, CL_UNSIGNED_INT8 }, - { CL_SNORM_INT16, CL_UNSIGNED_INT16 }, - { CL_HALF_FLOAT, CL_UNSIGNED_INT16 }, - { CL_FLOAT, CL_UNSIGNED_INT32 }, - { CL_SIGNED_INT8, CL_UNSIGNED_INT8 }, - { CL_SIGNED_INT16, CL_UNSIGNED_INT16 }, - { CL_UNORM_INT_101010, CL_UNSIGNED_INT8 }, - { CL_SIGNED_INT32, CL_UNSIGNED_INT32 } -}; +static const FormatConvertion RejectedData[] = { + {CL_UNORM_INT8, CL_UNSIGNED_INT8}, {CL_UNORM_INT16, CL_UNSIGNED_INT16}, + {CL_SNORM_INT8, CL_UNSIGNED_INT8}, {CL_SNORM_INT16, CL_UNSIGNED_INT16}, + {CL_HALF_FLOAT, CL_UNSIGNED_INT16}, {CL_FLOAT, CL_UNSIGNED_INT32}, + {CL_SIGNED_INT8, CL_UNSIGNED_INT8}, {CL_SIGNED_INT16, CL_UNSIGNED_INT16}, + {CL_UNORM_INT_101010, CL_UNSIGNED_INT8}, {CL_SIGNED_INT32, CL_UNSIGNED_INT32}}; // The list of rejected channel's order and corresponding conversion -static const FormatConvertion RejectedOrder[] = -{ - { CL_A, CL_R }, - { CL_RA, CL_RG }, 
- { CL_LUMINANCE, CL_R }, - { CL_INTENSITY, CL_R }, - { CL_RGB, CL_RGBA }, - { CL_BGRA, CL_RGBA }, - { CL_ARGB, CL_RGBA }, - { CL_sRGB, CL_RGBA }, - { CL_sRGBx, CL_RGBA }, - { CL_sRGBA, CL_RGBA }, - { CL_sBGRA, CL_RGBA }, - { CL_DEPTH, CL_R } -}; +static const FormatConvertion RejectedOrder[] = { + {CL_A, CL_R}, {CL_RA, CL_RG}, {CL_LUMINANCE, CL_R}, {CL_INTENSITY, CL_R}, + {CL_RGB, CL_RGBA}, {CL_BGRA, CL_RGBA}, {CL_ARGB, CL_RGBA}, {CL_sRGB, CL_RGBA}, + {CL_sRGBx, CL_RGBA}, {CL_sRGBA, CL_RGBA}, {CL_sBGRA, CL_RGBA}, {CL_DEPTH, CL_R}}; -const uint RejectedFormatDataTotal = - sizeof(RejectedData) / sizeof(FormatConvertion); -const uint RejectedFormatChannelTotal = - sizeof(RejectedOrder) / sizeof(FormatConvertion); +const uint RejectedFormatDataTotal = sizeof(RejectedData) / sizeof(FormatConvertion); +const uint RejectedFormatChannelTotal = sizeof(RejectedOrder) / sizeof(FormatConvertion); -bool -KernelBlitManager::copyBufferToImage( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire, - size_t rowPitch, - size_t slicePitch) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - static const bool CopyRect = false; - // Flush DMA for ASYNC copy - static const bool FlushDMA = true; - amd::Image* dstImage = static_cast(dstMemory.owner()); - size_t imgRowPitch = size[0] * dstImage->getImageFormat().getElementSize(); - size_t imgSlicePitch = imgRowPitch * size[1]; +bool KernelBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, const amd::Coord3D& size, + bool entire, size_t rowPitch, size_t slicePitch) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + static const bool CopyRect = false; + // Flush DMA for ASYNC copy + static const bool FlushDMA = true; + amd::Image* dstImage = static_cast(dstMemory.owner()); + size_t imgRowPitch = 
size[0] * dstImage->getImageFormat().getElementSize(); + size_t imgSlicePitch = imgRowPitch * size[1]; - if (setup_.disableCopyBufferToImage_) { - result = HostBlitManager::copyBufferToImage( - srcMemory, dstMemory, srcOrigin, dstOrigin, size, - entire, rowPitch, slicePitch); + if (setup_.disableCopyBufferToImage_) { + result = HostBlitManager::copyBufferToImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire, rowPitch, slicePitch); + synchronize(); + return result; + } + // Check if buffer is in system memory with direct access + else if (gpuMem(srcMemory).isHostMemDirectAccess() && + (((rowPitch == 0) && (slicePitch == 0)) || + ((rowPitch == imgRowPitch) && ((slicePitch == 0) || (slicePitch == imgSlicePitch))))) { + // First attempt to do this all with DMA, + // but there are restriciton with older hardware + if (dev().settings().imageDMA_) { + result = DmaBlitManager::copyBufferToImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire, rowPitch, slicePitch); + if (result) { synchronize(); return result; + } } - // Check if buffer is in system memory with direct access - else if (gpuMem(srcMemory).isHostMemDirectAccess() && - (((rowPitch == 0) && (slicePitch == 0)) || - ((rowPitch == imgRowPitch) && - ((slicePitch == 0) || (slicePitch == imgSlicePitch))))) { - // First attempt to do this all with DMA, - // but there are restriciton with older hardware - if (dev().settings().imageDMA_) { - result = DmaBlitManager::copyBufferToImage( - srcMemory, dstMemory, srcOrigin, dstOrigin, size, - entire, rowPitch, slicePitch); - if (result) { - synchronize(); - return result; - } - } - } + } - if (!result) { - result = copyBufferToImageKernel(srcMemory, - dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); - } + if (!result) { + result = copyBufferToImageKernel(srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire, + rowPitch, slicePitch); + } + synchronize(); + + return result; +} + +void CalcRowSlicePitches(cl_ulong* pitch, const 
cl_int* copySize, size_t rowPitch, + size_t slicePitch, const Memory& mem) { + amd::Image* image = static_cast(mem.owner()); + uint32_t memFmtSize = image->getImageFormat().getElementSize(); + bool img1Darray = (mem.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) ? true : false; + + if (rowPitch == 0) { + pitch[0] = copySize[0]; + } else { + pitch[0] = rowPitch / memFmtSize; + } + if (slicePitch == 0) { + pitch[1] = pitch[0] * (img1Darray ? 1 : copySize[1]); + } else { + pitch[1] = slicePitch / memFmtSize; + } + assert((pitch[0] <= pitch[1]) && "rowPitch must be <= slicePitch"); + + if (img1Darray) { + // For 1D array rowRitch = slicePitch + pitch[0] = pitch[1]; + } +} + +static inline void setArgument(amd::Kernel* kernel, size_t index, size_t size, const void* value) { + kernel->parameters().set(index, size, value); +} + +bool KernelBlitManager::copyBufferToImageKernel(device::Memory& srcMemory, + device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, bool entire, + size_t rowPitch, size_t slicePitch) const { + bool rejected = false; + Memory* dstView = &gpuMem(dstMemory); + bool releaseView = false; + bool result = false; + amd::Image* dstImage = static_cast(dstMemory.owner()); + amd::Image* srcImage = static_cast(srcMemory.owner()); + amd::Image::Format newFormat(dstImage->getImageFormat()); + + // Find unsupported formats + for (uint i = 0; i < RejectedFormatDataTotal; ++i) { + if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { + newFormat.image_channel_data_type = RejectedData[i].clNewType_; + rejected = true; + break; + } + } + + // Find unsupported channel's order + for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { + if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { + newFormat.image_channel_order = RejectedOrder[i].clNewType_; + rejected = true; + break; + } + } + + // If the image format was rejected, then attempt to create a view + if (rejected 
&& + // todo ROC runtime has a problem with a view for this format + (dstImage->getImageFormat().image_channel_data_type != CL_UNORM_INT_101010)) { + dstView = createView(gpuMem(dstMemory), newFormat, CL_MEM_WRITE_ONLY); + if (dstView != nullptr) { + rejected = false; + releaseView = true; + } + } + + // Fall into the host path if the image format was rejected + if (rejected) { + return DmaBlitManager::copyBufferToImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire, rowPitch, slicePitch); + } + + // Use a common blit type with three dimensions by default + uint blitType = BlitCopyBufferToImage; + size_t dim = 0; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3]; + size_t localWorkSize[3]; + + // Program the kernels workload depending on the blit dimensions + dim = 3; + if (dstImage->getDims() == 1) { + globalWorkSize[0] = amd::alignUp(size[0], 256); + globalWorkSize[1] = amd::alignUp(size[1], 1); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = 256; + localWorkSize[1] = localWorkSize[2] = 1; + } else if (dstImage->getDims() == 2) { + globalWorkSize[0] = amd::alignUp(size[0], 16); + globalWorkSize[1] = amd::alignUp(size[1], 16); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = localWorkSize[1] = 16; + localWorkSize[2] = 1; + } else { + globalWorkSize[0] = amd::alignUp(size[0], 8); + globalWorkSize[1] = amd::alignUp(size[1], 8); + globalWorkSize[2] = amd::alignUp(size[2], 4); + localWorkSize[0] = localWorkSize[1] = 8; + localWorkSize[2] = 4; + } + + // Program kernels arguments for the blit operation + cl_mem mem = as_cl(srcMemory.owner()); + setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); + mem = as_cl(dstView->owner()); + setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); + uint32_t memFmtSize = dstImage->getImageFormat().getElementSize(); + uint32_t components = dstImage->getImageFormat().getNumChannels(); + + // 1 element granularity for writes by default + cl_int granularity 
= 1; + if (memFmtSize == 2) { + granularity = 2; + } else if (memFmtSize >= 4) { + granularity = 4; + } + CondLog(((srcOrigin[0] % granularity) != 0), "Unaligned offset in blit!"); + cl_ulong srcOrg[4] = {srcOrigin[0] / granularity, srcOrigin[1], srcOrigin[2], 0}; + setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg); + + cl_int dstOrg[4] = {(cl_int)dstOrigin[0], (cl_int)dstOrigin[1], (cl_int)dstOrigin[2], 0}; + cl_int copySize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0}; + + setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg); + setArgument(kernels_[blitType], 4, sizeof(copySize), copySize); + + // Program memory format + uint multiplier = memFmtSize / sizeof(uint32_t); + multiplier = (multiplier == 0) ? 1 : multiplier; + cl_uint format[4] = {components, memFmtSize / components, multiplier, 0}; + setArgument(kernels_[blitType], 5, sizeof(format), format); + + // Program row and slice pitches + cl_ulong pitch[4] = {0}; + CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, gpuMem(dstMemory)); + setArgument(kernels_[blitType], 6, sizeof(pitch), pitch); + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize); + + // Execute the blit + address parameters = captureArguments(kernels_[blitType]); + result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr); + releaseArguments(parameters); + if (releaseView) { + // todo SRD programming could be changed to avoid a stall + gpu().releaseGpuMemoryFence(); + dstView->owner()->release(); + } + + return result; +} + +bool KernelBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, const amd::Coord3D& size, + bool entire, size_t rowPitch, size_t slicePitch) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + static const bool CopyRect = false; + // Flush DMA for ASYNC 
copy + static const bool FlushDMA = true; + amd::Image* srcImage = static_cast(srcMemory.owner()); + size_t imgRowPitch = size[0] * srcImage->getImageFormat().getElementSize(); + size_t imgSlicePitch = imgRowPitch * size[1]; + + if (setup_.disableCopyImageToBuffer_) { + result = DmaBlitManager::copyImageToBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire, rowPitch, slicePitch); synchronize(); - return result; + } + // Check if buffer is in system memory with direct access + else if (gpuMem(dstMemory).isHostMemDirectAccess() && + (((rowPitch == 0) && (slicePitch == 0)) || + ((rowPitch == imgRowPitch) && ((slicePitch == 0) || (slicePitch == imgSlicePitch))))) { + // First attempt to do this all with DMA, + // but there are restriciton with older hardware + // If the dest buffer is external physical(SDI), copy two step as + // single step SDMA is causing corruption and the cause is under investigation + if (dev().settings().imageDMA_) { + result = DmaBlitManager::copyImageToBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire, rowPitch, slicePitch); + if (result) { + synchronize(); + return result; + } + } + } + + if (!result) { + result = copyImageToBufferKernel(srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire, + rowPitch, slicePitch); + } + + synchronize(); + + return result; } -void -CalcRowSlicePitches( - cl_ulong* pitch, const cl_int* copySize, - size_t rowPitch, size_t slicePitch, const Memory& mem) -{ - amd::Image* image = static_cast(mem.owner()); - uint32_t memFmtSize = image->getImageFormat().getElementSize(); - bool img1Darray = (mem.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) ? 
true : false; +bool KernelBlitManager::copyImageToBufferKernel(device::Memory& srcMemory, + device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, bool entire, + size_t rowPitch, size_t slicePitch) const { + bool rejected = false; + Memory* srcView = &gpuMem(srcMemory); + bool releaseView = false; + bool result = false; + amd::Image* srcImage = static_cast(srcMemory.owner()); + amd::Image::Format newFormat(srcImage->getImageFormat()); - if (rowPitch == 0) { - pitch[0] = copySize[0]; + // Find unsupported formats + for (uint i = 0; i < RejectedFormatDataTotal; ++i) { + if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { + newFormat.image_channel_data_type = RejectedData[i].clNewType_; + rejected = true; + break; } - else { - pitch[0] = rowPitch / memFmtSize; - } - if (slicePitch == 0) { - pitch[1] = pitch[0] * (img1Darray ? 1 : copySize[1]); - } - else { - pitch[1] = slicePitch / memFmtSize; - } - assert((pitch[0] <= pitch[1]) && "rowPitch must be <= slicePitch"); + } - if (img1Darray) { - // For 1D array rowRitch = slicePitch - pitch[0] = pitch[1]; + // Find unsupported channel's order + for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { + if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { + newFormat.image_channel_order = RejectedOrder[i].clNewType_; + rejected = true; + break; } + } + + // If the image format was rejected, then attempt to create a view + if (rejected && + // todo ROC runtime has a problem with a view for this format + (srcImage->getImageFormat().image_channel_data_type != CL_UNORM_INT_101010)) { + srcView = createView(gpuMem(srcMemory), newFormat, CL_MEM_READ_ONLY); + if (srcView != nullptr) { + rejected = false; + releaseView = true; + } + } + + // Fall into the host path if the image format was rejected + if (rejected) { + return DmaBlitManager::copyImageToBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire, rowPitch, 
slicePitch); + } + + uint blitType = BlitCopyImageToBuffer; + size_t dim = 0; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3]; + size_t localWorkSize[3]; + + // Program the kernels workload depending on the blit dimensions + dim = 3; + // Find the current blit type + if (srcImage->getDims() == 1) { + globalWorkSize[0] = amd::alignUp(size[0], 256); + globalWorkSize[1] = amd::alignUp(size[1], 1); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = 256; + localWorkSize[1] = localWorkSize[2] = 1; + } else if (srcImage->getDims() == 2) { + globalWorkSize[0] = amd::alignUp(size[0], 16); + globalWorkSize[1] = amd::alignUp(size[1], 16); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = localWorkSize[1] = 16; + localWorkSize[2] = 1; + } else { + globalWorkSize[0] = amd::alignUp(size[0], 8); + globalWorkSize[1] = amd::alignUp(size[1], 8); + globalWorkSize[2] = amd::alignUp(size[2], 4); + localWorkSize[0] = localWorkSize[1] = 8; + localWorkSize[2] = 4; + } + + // Program kernels arguments for the blit operation + cl_mem mem = as_cl(srcView->owner()); + setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); + mem = as_cl(dstMemory.owner()); + setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); + + // Update extra paramters for USHORT and UBYTE pointers. 
+ // Only then compiler can optimize the kernel to use + // UAV Raw for other writes + setArgument(kernels_[blitType], 2, sizeof(cl_mem), &mem); + setArgument(kernels_[blitType], 3, sizeof(cl_mem), &mem); + + cl_int srcOrg[4] = {(cl_int)srcOrigin[0], (cl_int)srcOrigin[1], (cl_int)srcOrigin[2], 0}; + cl_int copySize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0}; + setArgument(kernels_[blitType], 4, sizeof(srcOrg), srcOrg); + uint32_t memFmtSize = srcImage->getImageFormat().getElementSize(); + uint32_t components = srcImage->getImageFormat().getNumChannels(); + + // 1 element granularity for writes by default + cl_int granularity = 1; + if (memFmtSize == 2) { + granularity = 2; + } else if (memFmtSize >= 4) { + granularity = 4; + } + CondLog(((dstOrigin[0] % granularity) != 0), "Unaligned offset in blit!"); + cl_ulong dstOrg[4] = {dstOrigin[0] / granularity, dstOrigin[1], dstOrigin[2], 0}; + setArgument(kernels_[blitType], 5, sizeof(dstOrg), dstOrg); + setArgument(kernels_[blitType], 6, sizeof(copySize), copySize); + + // Program memory format + uint multiplier = memFmtSize / sizeof(uint32_t); + multiplier = (multiplier == 0) ? 
1 : multiplier; + cl_uint format[4] = {components, memFmtSize / components, multiplier, 0}; + setArgument(kernels_[blitType], 7, sizeof(format), format); + + // Program row and slice pitches + cl_ulong pitch[4] = {0}; + CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, gpuMem(srcMemory)); + setArgument(kernels_[blitType], 8, sizeof(pitch), pitch); + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize); + + // Execute the blit + address parameters = captureArguments(kernels_[blitType]); + result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr); + releaseArguments(parameters); + if (releaseView) { + // todo SRD programming could be changed to avoid a stall + gpu().releaseGpuMemoryFence(); + srcView->owner()->release(); + } + + return result; } -static inline void -setArgument(amd::Kernel* kernel, size_t index, size_t size, const void* value) -{ - kernel->parameters().set(index, size, value); -} +bool KernelBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool rejected = false; + Memory* srcView = &gpuMem(srcMemory); + Memory* dstView = &gpuMem(dstMemory); + bool releaseView = false; + bool result = false; + amd::Image* srcImage = static_cast(srcMemory.owner()); + amd::Image* dstImage = static_cast(dstMemory.owner()); + amd::Image::Format newFormat(srcImage->getImageFormat()); -bool -KernelBlitManager::copyBufferToImageKernel( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire, - size_t rowPitch, - size_t slicePitch) const -{ - bool rejected = false; - Memory* dstView = &gpuMem(dstMemory); - bool releaseView = false; - bool result = false; - amd::Image* 
dstImage = static_cast(dstMemory.owner()); - amd::Image* srcImage = static_cast(srcMemory.owner()); - amd::Image::Format newFormat(dstImage->getImageFormat()); - - // Find unsupported formats - for (uint i = 0; i < RejectedFormatDataTotal; ++i) { - if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { - newFormat.image_channel_data_type = RejectedData[i].clNewType_; - rejected = true; - break; - } + // Find unsupported formats + for (uint i = 0; i < RejectedFormatDataTotal; ++i) { + if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { + newFormat.image_channel_data_type = RejectedData[i].clNewType_; + rejected = true; + break; } + } - // Find unsupported channel's order + // Search for the rejected channel's order only if the format was rejected + // Note: Image blit is independent from the channel order + if (rejected) { for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { - if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { - newFormat.image_channel_order = RejectedOrder[i].clNewType_; - rejected = true; - break; + if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { + newFormat.image_channel_order = RejectedOrder[i].clNewType_; + rejected = true; + break; + } + } + } + + // Attempt to create a view if the format was rejected + if (rejected) { + srcView = createView(gpuMem(srcMemory), newFormat, CL_MEM_READ_ONLY); + if (srcView != nullptr) { + dstView = createView(gpuMem(dstMemory), newFormat, CL_MEM_WRITE_ONLY); + if (dstView != nullptr) { + rejected = false; + releaseView = true; + } else { + delete srcView; + } + } + } + + // Fall into the host path for the entire 2D copy or + // if the image format was rejected + if (rejected) { + result = DmaBlitManager::copyImage(srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire); + synchronize(); + return result; + } + + uint blitType = BlitCopyImage; + size_t dim = 0; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3]; + size_t 
localWorkSize[3]; + + // Program the kernels workload depending on the blit dimensions + dim = 3; + // Find the current blit type + if ((srcImage->getDims() == 1) || (dstImage->getDims() == 1)) { + globalWorkSize[0] = amd::alignUp(size[0], 256); + globalWorkSize[1] = amd::alignUp(size[1], 1); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = 256; + localWorkSize[1] = localWorkSize[2] = 1; + } else if ((srcImage->getDims() == 2) || (dstImage->getDims() == 2)) { + globalWorkSize[0] = amd::alignUp(size[0], 16); + globalWorkSize[1] = amd::alignUp(size[1], 16); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = localWorkSize[1] = 16; + localWorkSize[2] = 1; + } else { + globalWorkSize[0] = amd::alignUp(size[0], 8); + globalWorkSize[1] = amd::alignUp(size[1], 8); + globalWorkSize[2] = amd::alignUp(size[2], 4); + localWorkSize[0] = localWorkSize[1] = 8; + localWorkSize[2] = 4; + } + + // The current OpenCL spec allows "copy images from a 1D image + // array object to a 1D image array object" only. 
+ if ((gpuMem(srcMemory).owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) || + (gpuMem(dstMemory).owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY)) { + blitType = BlitCopyImage1DA; + } + + // Program kernels arguments for the blit operation + cl_mem mem = as_cl(srcView->owner()); + setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); + mem = as_cl(dstView->owner()); + setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); + + // Program source origin + cl_int srcOrg[4] = {(cl_int)srcOrigin[0], (cl_int)srcOrigin[1], (cl_int)srcOrigin[2], 0}; + setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg); + + // Program destinaiton origin + cl_int dstOrg[4] = {(cl_int)dstOrigin[0], (cl_int)dstOrigin[1], (cl_int)dstOrigin[2], 0}; + setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg); + + cl_int copySize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0}; + setArgument(kernels_[blitType], 4, sizeof(copySize), copySize); + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize); + + // Execute the blit + address parameters = captureArguments(kernels_[blitType]); + result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr); + releaseArguments(parameters); + if (releaseView) { + // todo SRD programming could be changed to avoid a stall + gpu().releaseGpuMemoryFence(); + srcView->owner()->release(); + dstView->owner()->release(); + } + + synchronize(); + + return result; +} + +void FindPinSize(size_t& pinSize, const amd::Coord3D& size, size_t& rowPitch, size_t& slicePitch, + const Memory& mem) { + amd::Image* image = static_cast(mem.owner()); + pinSize = size[0] * image->getImageFormat().getElementSize(); + if ((rowPitch == 0) || (rowPitch == pinSize)) { + rowPitch = 0; + } else { + pinSize = rowPitch; + } + + // Calculate the pin size, which should be equal to the copy size + for (uint i = 1; i < image->getDims(); ++i) { + pinSize *= 
size[i]; + if (i == 1) { + if ((slicePitch == 0) || (slicePitch == pinSize)) { + slicePitch = 0; + } else { + if (mem.owner()->getType() != CL_MEM_OBJECT_IMAGE1D_ARRAY) { + pinSize = slicePitch; + } else { + pinSize = slicePitch * size[i]; } + } + } + } +} + +bool KernelBlitManager::readImage(device::Memory& srcMemory, void* dstHost, + const amd::Coord3D& origin, const amd::Coord3D& size, + size_t rowPitch, size_t slicePitch, bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host copy if memory has direct access + if (setup_.disableReadImage_ || (gpuMem(srcMemory).isHostMemDirectAccess())) { + result = + HostBlitManager::readImage(srcMemory, dstHost, origin, size, rowPitch, slicePitch, entire); + synchronize(); + return result; + } else { + size_t pinSize; + FindPinSize(pinSize, size, rowPitch, slicePitch, gpuMem(srcMemory)); + + size_t partial; + amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial); + + if (amdMemory == nullptr) { + // Force SW copy + result = + DmaBlitManager::readImage(srcMemory, dstHost, origin, size, rowPitch, slicePitch, entire); + synchronize(); + return result; } - // If the image format was rejected, then attempt to create a view - if (rejected && - // todo ROC runtime has a problem with a view for this format - (dstImage->getImageFormat().image_channel_data_type != CL_UNORM_INT_101010)) { - dstView = createView(gpuMem(dstMemory), newFormat, CL_MEM_WRITE_ONLY); - if (dstView != nullptr) { - rejected = false; - releaseView = true; - } + // Readjust destination offset + const amd::Coord3D dstOrigin(partial); + + // Get device memory for this virtual device + Memory* dstMemory = dev().getRocMemory(amdMemory); + + // Copy image to buffer + result = copyImageToBuffer(srcMemory, *dstMemory, origin, dstOrigin, size, entire, rowPitch, + slicePitch); + + // Add pinned memory for a later release + gpu().addPinnedMem(amdMemory); + } + + synchronize(); + + return result; +} + +bool 
KernelBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory, + const amd::Coord3D& origin, const amd::Coord3D& size, + size_t rowPitch, size_t slicePitch, bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host copy if memory has direct access + if (setup_.disableWriteImage_ || gpuMem(dstMemory).isHostMemDirectAccess()) { + result = + HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); + synchronize(); + return result; + } else { + size_t pinSize; + FindPinSize(pinSize, size, rowPitch, slicePitch, gpuMem(dstMemory)); + + size_t partial; + amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial); + + if (amdMemory == nullptr) { + // Force SW copy + result = DmaBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch, + entire); + synchronize(); + return result; } - // Fall into the host path if the image format was rejected - if (rejected) { - return DmaBlitManager::copyBufferToImage( - srcMemory, dstMemory, srcOrigin, dstOrigin, - size, entire, rowPitch, slicePitch); + // Readjust destination offset + const amd::Coord3D srcOrigin(partial); + + // Get device memory for this virtual device + Memory* srcMemory = dev().getRocMemory(amdMemory); + + // Copy image to buffer + result = copyBufferToImage(*srcMemory, dstMemory, srcOrigin, origin, size, entire, rowPitch, + slicePitch); + + // Add pinned memory for a later release + gpu().addPinnedMem(amdMemory); + } + + synchronize(); + + return result; +} + +bool KernelBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::BufferRect& srcRectIn, + const amd::BufferRect& dstRectIn, const amd::Coord3D& sizeIn, + bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + bool rejected = false; + + // Fall into the ROC path for rejected transfers + if (setup_.disableCopyBufferRect_ || gpuMem(srcMemory).isHostMemDirectAccess() || + 
gpuMem(dstMemory).isHostMemDirectAccess()) { + result = + HostBlitManager::copyBufferRect(srcMemory, dstMemory, srcRectIn, dstRectIn, sizeIn, entire); + + if (result) { + synchronize(); + return result; + } + } + + uint blitType = BlitCopyBufferRect; + size_t dim = 3; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3]; + size_t localWorkSize[3]; + + const static uint CopyRectAlignment[3] = {16, 4, 1}; + + bool aligned; + uint i; + for (i = 0; i < sizeof(CopyRectAlignment) / sizeof(uint); i++) { + // Check source alignments + aligned = ((srcRectIn.rowPitch_ % CopyRectAlignment[i]) == 0); + aligned &= ((srcRectIn.slicePitch_ % CopyRectAlignment[i]) == 0); + aligned &= ((srcRectIn.start_ % CopyRectAlignment[i]) == 0); + + // Check destination alignments + aligned &= ((dstRectIn.rowPitch_ % CopyRectAlignment[i]) == 0); + aligned &= ((dstRectIn.slicePitch_ % CopyRectAlignment[i]) == 0); + aligned &= ((dstRectIn.start_ % CopyRectAlignment[i]) == 0); + + // Check copy size alignment in the first dimension + aligned &= ((sizeIn[0] % CopyRectAlignment[i]) == 0); + + if (aligned) { + if (CopyRectAlignment[i] != 1) { + blitType = BlitCopyBufferRectAligned; + } + break; + } + } + + amd::BufferRect srcRect; + amd::BufferRect dstRect; + amd::Coord3D size(sizeIn[0], sizeIn[1], sizeIn[2]); + + srcRect.rowPitch_ = srcRectIn.rowPitch_ / CopyRectAlignment[i]; + srcRect.slicePitch_ = srcRectIn.slicePitch_ / CopyRectAlignment[i]; + srcRect.start_ = srcRectIn.start_ / CopyRectAlignment[i]; + srcRect.end_ = srcRectIn.end_ / CopyRectAlignment[i]; + + dstRect.rowPitch_ = dstRectIn.rowPitch_ / CopyRectAlignment[i]; + dstRect.slicePitch_ = dstRectIn.slicePitch_ / CopyRectAlignment[i]; + dstRect.start_ = dstRectIn.start_ / CopyRectAlignment[i]; + dstRect.end_ = dstRectIn.end_ / CopyRectAlignment[i]; + + size.c[0] /= CopyRectAlignment[i]; + + // Program the kernel's workload depending on the transfer dimensions + if ((size[1] == 1) && (size[2] == 1)) { + globalWorkSize[0] = 
amd::alignUp(size[0], 256); + globalWorkSize[1] = 1; + globalWorkSize[2] = 1; + localWorkSize[0] = 256; + localWorkSize[1] = 1; + localWorkSize[2] = 1; + } else if (size[2] == 1) { + globalWorkSize[0] = amd::alignUp(size[0], 16); + globalWorkSize[1] = amd::alignUp(size[1], 16); + globalWorkSize[2] = 1; + localWorkSize[0] = localWorkSize[1] = 16; + localWorkSize[2] = 1; + } else { + globalWorkSize[0] = amd::alignUp(size[0], 8); + globalWorkSize[1] = amd::alignUp(size[1], 8); + globalWorkSize[2] = amd::alignUp(size[2], 4); + localWorkSize[0] = localWorkSize[1] = 8; + localWorkSize[2] = 4; + } + + + // Program kernels arguments for the blit operation + cl_mem mem = as_cl(srcMemory.owner()); + setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); + mem = as_cl(dstMemory.owner()); + setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); + cl_ulong src[4] = {srcRect.rowPitch_, srcRect.slicePitch_, srcRect.start_, 0}; + setArgument(kernels_[blitType], 2, sizeof(src), src); + cl_ulong dst[4] = {dstRect.rowPitch_, dstRect.slicePitch_, dstRect.start_, 0}; + setArgument(kernels_[blitType], 3, sizeof(dst), dst); + cl_ulong copySize[4] = {size[0], size[1], size[2], CopyRectAlignment[i]}; + setArgument(kernels_[blitType], 4, sizeof(copySize), copySize); + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize); + + // Execute the blit + address parameters = captureArguments(kernels_[blitType]); + result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr); + releaseArguments(parameters); + synchronize(); + + return result; +} + +bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, + const amd::Coord3D& origin, const amd::Coord3D& size, + bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + // Use host copy if memory has direct access + if (setup_.disableReadBuffer_ || 
(gpuMem(srcMemory).isHostMemDirectAccess())) { + result = HostBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire); + synchronize(); + return result; + } else { + size_t pinSize = size[0]; + // Check if a pinned transfer can be executed with a single pin + if ((pinSize <= dev().settings().pinnedXferSize_) && (pinSize > MinSizeForPinnedTransfer)) { + size_t partial; + amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial); + + if (amdMemory == nullptr) { + // Force SW copy + result = DmaBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire); + synchronize(); + return result; + } + + // Readjust host mem offset + amd::Coord3D dstOrigin(partial); + + // Get device memory for this virtual device + Memory* dstMemory = dev().getRocMemory(amdMemory); + + // Copy image to buffer + result = copyBuffer(srcMemory, *dstMemory, origin, dstOrigin, size, entire); + + // Add pinned memory for a later release + gpu().addPinnedMem(amdMemory); + } else { + result = DmaBlitManager::readBuffer(srcMemory, dstHost, origin, size, entire); + } + } + + synchronize(); + + return result; +} + +bool KernelBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost, + const amd::BufferRect& bufRect, + const amd::BufferRect& hostRect, const amd::Coord3D& size, + bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host copy if memory has direct access + if (setup_.disableReadBufferRect_ || gpuMem(srcMemory).isHostMemDirectAccess()) { + result = HostBlitManager::readBufferRect(srcMemory, dstHost, bufRect, hostRect, size, entire); + synchronize(); + return result; + } else { + size_t pinSize = hostRect.start_ + hostRect.end_; + size_t partial; + amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial); + + if (amdMemory == nullptr) { + // Force SW copy + result = DmaBlitManager::readBufferRect(srcMemory, dstHost, bufRect, hostRect, size, entire); + synchronize(); + return result; } - // Use a common blit type 
with three dimensions by default - uint blitType = BlitCopyBufferToImage; - size_t dim = 0; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - size_t globalWorkSize[3]; - size_t localWorkSize[3]; + // Readjust host mem offset + amd::BufferRect rect; + rect.rowPitch_ = hostRect.rowPitch_; + rect.slicePitch_ = hostRect.slicePitch_; + rect.start_ = hostRect.start_ + partial; + rect.end_ = hostRect.end_; - // Program the kernels workload depending on the blit dimensions - dim = 3; - if (dstImage->getDims() == 1) { - globalWorkSize[0] = amd::alignUp(size[0], 256); - globalWorkSize[1] = amd::alignUp(size[1], 1); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = 256; - localWorkSize[1] = localWorkSize[2] = 1; + // Get device memory for this virtual device + Memory* dstMemory = dev().getRocMemory(amdMemory); + + // Copy image to buffer + result = copyBufferRect(srcMemory, *dstMemory, bufRect, rect, size, entire); + + // Add pinned memory for a later release + gpu().addPinnedMem(amdMemory); + } + + synchronize(); + + return result; +} + +bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory, + const amd::Coord3D& origin, const amd::Coord3D& size, + bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host copy if memory has direct access + if (setup_.disableWriteBuffer_ || gpuMem(dstMemory).isHostMemDirectAccess()) { + result = HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire); + synchronize(); + return result; + } else { + size_t pinSize = size[0]; + + // Check if a pinned transfer can be executed with a single pin + if ((pinSize <= dev().settings().pinnedXferSize_) && (pinSize > MinSizeForPinnedTransfer)) { + size_t partial; + amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial); + + if (amdMemory == nullptr) { + // Force SW copy + result = DmaBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire); + synchronize(); + return result; + } + + // 
Readjust destination offset + const amd::Coord3D srcOrigin(partial); + + // Get device memory for this virtual device + Memory* srcMemory = dev().getRocMemory(amdMemory); + + // Copy buffer rect + result = copyBuffer(*srcMemory, dstMemory, srcOrigin, origin, size, entire); + + // Add pinned memory for a later release + gpu().addPinnedMem(amdMemory); + } else { + result = DmaBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire); } - else if (dstImage->getDims() == 2) { - globalWorkSize[0] = amd::alignUp(size[0], 16); - globalWorkSize[1] = amd::alignUp(size[1], 16); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = localWorkSize[1] = 16; - localWorkSize[2] = 1; - } - else { - globalWorkSize[0] = amd::alignUp(size[0], 8); - globalWorkSize[1] = amd::alignUp(size[1], 8); - globalWorkSize[2] = amd::alignUp(size[2], 4); - localWorkSize[0] = localWorkSize[1] = 8; - localWorkSize[2] = 4; + } + + synchronize(); + + return result; +} + +bool KernelBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMemory, + const amd::BufferRect& hostRect, + const amd::BufferRect& bufRect, const amd::Coord3D& size, + bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host copy if memory has direct access + if (setup_.disableWriteBufferRect_ || gpuMem(dstMemory).isHostMemDirectAccess()) { + result = HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire); + synchronize(); + return result; + } else { + size_t pinSize = hostRect.start_ + hostRect.end_; + size_t partial; + amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial); + + if (amdMemory == nullptr) { + // Force DMA copy with staging + result = DmaBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire); + synchronize(); + return result; } - // Program kernels arguments for the blit operation - cl_mem mem = as_cl(srcMemory.owner()); - setArgument(kernels_[blitType], 0, sizeof(cl_mem), 
&mem); - mem = as_cl(dstView->owner()); - setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); - uint32_t memFmtSize = dstImage->getImageFormat().getElementSize(); - uint32_t components = dstImage->getImageFormat().getNumChannels(); + // Readjust destination offset + const amd::Coord3D srcOrigin(partial); - // 1 element granularity for writes by default - cl_int granularity = 1; - if (memFmtSize == 2) { - granularity = 2; + // Get device memory for this virtual device + Memory* srcMemory = dev().getRocMemory(amdMemory); + + // Readjust host mem offset + amd::BufferRect rect; + rect.rowPitch_ = hostRect.rowPitch_; + rect.slicePitch_ = hostRect.slicePitch_; + rect.start_ = hostRect.start_ + partial; + rect.end_ = hostRect.end_; + + // Copy buffer rect + result = copyBufferRect(*srcMemory, dstMemory, rect, bufRect, size, entire); + + // Add pinned memory for a later release + gpu().addPinnedMem(amdMemory); + } + + synchronize(); + + return result; +} + +bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern, size_t patternSize, + const amd::Coord3D& origin, const amd::Coord3D& size, + bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host fill if memory has direct access + if (setup_.disableFillBuffer_ || gpuMem(memory).isHostMemDirectAccess()) { + result = HostBlitManager::fillBuffer(memory, pattern, patternSize, origin, size, entire); + synchronize(); + return result; + } else { + uint fillType = FillBuffer; + size_t globalWorkOffset[3] = {0, 0, 0}; + cl_ulong fillSize = size[0] / patternSize; + size_t globalWorkSize = amd::alignUp(fillSize, 256); + size_t localWorkSize = 256; + bool dwordAligned = ((patternSize % sizeof(uint32_t)) == 0) ? 
true : false; + + // Program kernels arguments for the fill operation + cl_mem mem = as_cl(memory.owner()); + if (dwordAligned) { + setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr); + setArgument(kernels_[fillType], 1, sizeof(cl_mem), &mem); + } else { + setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem); + setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr); } - else if (memFmtSize >= 4) { - granularity = 4; + Memory* gpuCB = dev().getRocMemory(constantBuffer_); + if (gpuCB == nullptr) { + return false; } - CondLog(((srcOrigin[0] % granularity) != 0), "Unaligned offset in blit!"); - cl_ulong srcOrg[4] = { srcOrigin[0] / granularity, - srcOrigin[1], - srcOrigin[2], 0 }; - setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg); + void* constBuf = constantBuffer_->getHostMem(); + memcpy(constBuf, pattern, patternSize); - cl_int dstOrg[4] = { (cl_int)dstOrigin[0], - (cl_int)dstOrigin[1], - (cl_int)dstOrigin[2], 0 }; - cl_int copySize[4] = { (cl_int)size[0], - (cl_int)size[1], - (cl_int)size[2], 0 }; - - setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg); - setArgument(kernels_[blitType], 4, sizeof(copySize), copySize); - - // Program memory format - uint multiplier = memFmtSize / sizeof(uint32_t); - multiplier = (multiplier == 0) ? 
1 : multiplier; - cl_uint format[4] = { components, - memFmtSize / components, - multiplier, 0 }; - setArgument(kernels_[blitType], 5, sizeof(format), format); - - // Program row and slice pitches - cl_ulong pitch[4] = { 0 }; - CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, gpuMem(dstMemory)); - setArgument(kernels_[blitType], 6, sizeof(pitch), pitch); + mem = as_cl(gpuCB->owner()); + setArgument(kernels_[fillType], 2, sizeof(cl_mem), &mem); + cl_ulong offset = origin[0]; + if (dwordAligned) { + patternSize /= sizeof(uint32_t); + offset /= sizeof(uint32_t); + } + setArgument(kernels_[fillType], 3, sizeof(cl_uint), &patternSize); + setArgument(kernels_[fillType], 4, sizeof(offset), &offset); + setArgument(kernels_[fillType], 5, sizeof(fillSize), &fillSize); // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(dim, - globalWorkOffset, globalWorkSize, localWorkSize); - - // Execute the blit - address parameters = captureArguments(kernels_[blitType]); - result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr); - releaseArguments(parameters); - if (releaseView) { - // todo SRD programming could be changed to avoid a stall - gpu().releaseGpuMemoryFence(); - dstView->owner()->release(); - } - - return result; -} - -bool -KernelBlitManager::copyImageToBuffer( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire, - size_t rowPitch, - size_t slicePitch) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - static const bool CopyRect = false; - // Flush DMA for ASYNC copy - static const bool FlushDMA = true; - amd::Image* srcImage = static_cast(srcMemory.owner()); - size_t imgRowPitch = size[0] * srcImage->getImageFormat().getElementSize(); - size_t imgSlicePitch = imgRowPitch * size[1]; - - if (setup_.disableCopyImageToBuffer_) { - result = DmaBlitManager::copyImageToBuffer( - 
srcMemory, dstMemory, srcOrigin, dstOrigin, - size, entire, rowPitch, slicePitch); - synchronize(); - return result; - } - // Check if buffer is in system memory with direct access - else if (gpuMem(dstMemory).isHostMemDirectAccess() && - (((rowPitch == 0) && (slicePitch == 0)) || - ((rowPitch == imgRowPitch) && - ((slicePitch == 0) || (slicePitch == imgSlicePitch))))) { - // First attempt to do this all with DMA, - // but there are restriciton with older hardware - // If the dest buffer is external physical(SDI), copy two step as - // single step SDMA is causing corruption and the cause is under investigation - if (dev().settings().imageDMA_) { - result = DmaBlitManager::copyImageToBuffer( - srcMemory, dstMemory, srcOrigin, dstOrigin, - size, entire, rowPitch, slicePitch); - if (result) { - synchronize(); - return result; - } - } - } - - if (!result) { - result = copyImageToBufferKernel(srcMemory, - dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); - } - - synchronize(); - - return result; -} - -bool -KernelBlitManager::copyImageToBufferKernel( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire, - size_t rowPitch, - size_t slicePitch) const -{ - bool rejected = false; - Memory* srcView = &gpuMem(srcMemory); - bool releaseView = false; - bool result = false; - amd::Image* srcImage = static_cast(srcMemory.owner()); - amd::Image::Format newFormat(srcImage->getImageFormat()); - - // Find unsupported formats - for (uint i = 0; i < RejectedFormatDataTotal; ++i) { - if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { - newFormat.image_channel_data_type = RejectedData[i].clNewType_; - rejected = true; - break; - } - } - - // Find unsupported channel's order - for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { - if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { - newFormat.image_channel_order = 
RejectedOrder[i].clNewType_; - rejected = true; - break; - } - } - - // If the image format was rejected, then attempt to create a view - if (rejected && - // todo ROC runtime has a problem with a view for this format - (srcImage->getImageFormat().image_channel_data_type != CL_UNORM_INT_101010)) { - srcView = createView(gpuMem(srcMemory), newFormat, CL_MEM_READ_ONLY); - if (srcView != nullptr) { - rejected = false; - releaseView = true; - } - } - - // Fall into the host path if the image format was rejected - if (rejected) { - return DmaBlitManager::copyImageToBuffer( - srcMemory, dstMemory, srcOrigin, dstOrigin, - size, entire, rowPitch, slicePitch); - } - - uint blitType = BlitCopyImageToBuffer; - size_t dim = 0; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - size_t globalWorkSize[3]; - size_t localWorkSize[3]; - - // Program the kernels workload depending on the blit dimensions - dim = 3; - // Find the current blit type - if (srcImage->getDims() == 1) { - globalWorkSize[0] = amd::alignUp(size[0], 256); - globalWorkSize[1] = amd::alignUp(size[1], 1); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = 256; - localWorkSize[1] = localWorkSize[2] = 1; - } - else if (srcImage->getDims() == 2) { - globalWorkSize[0] = amd::alignUp(size[0], 16); - globalWorkSize[1] = amd::alignUp(size[1], 16); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = localWorkSize[1] = 16; - localWorkSize[2] = 1; - } - else { - globalWorkSize[0] = amd::alignUp(size[0], 8); - globalWorkSize[1] = amd::alignUp(size[1], 8); - globalWorkSize[2] = amd::alignUp(size[2], 4); - localWorkSize[0] = localWorkSize[1] = 8; - localWorkSize[2] = 4; - } - - // Program kernels arguments for the blit operation - cl_mem mem = as_cl(srcView->owner()); - setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); - mem = as_cl(dstMemory.owner()); - setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); - - // Update extra paramters for USHORT and UBYTE pointers. 
- // Only then compiler can optimize the kernel to use - // UAV Raw for other writes - setArgument(kernels_[blitType], 2, sizeof(cl_mem), &mem); - setArgument(kernels_[blitType], 3, sizeof(cl_mem), &mem); - - cl_int srcOrg[4] = { (cl_int)srcOrigin[0], - (cl_int)srcOrigin[1], - (cl_int)srcOrigin[2], 0 }; - cl_int copySize[4] = { (cl_int)size[0], - (cl_int)size[1], - (cl_int)size[2], 0 }; - setArgument(kernels_[blitType], 4, sizeof(srcOrg), srcOrg); - uint32_t memFmtSize = srcImage->getImageFormat().getElementSize(); - uint32_t components = srcImage->getImageFormat().getNumChannels(); - - // 1 element granularity for writes by default - cl_int granularity = 1; - if (memFmtSize == 2) { - granularity = 2; - } - else if (memFmtSize >= 4) { - granularity = 4; - } - CondLog(((dstOrigin[0] % granularity) != 0), "Unaligned offset in blit!"); - cl_ulong dstOrg[4] = { dstOrigin[0] / granularity, - dstOrigin[1], - dstOrigin[2], 0 }; - setArgument(kernels_[blitType], 5, sizeof(dstOrg), dstOrg); - setArgument(kernels_[blitType], 6, sizeof(copySize), copySize); - - // Program memory format - uint multiplier = memFmtSize / sizeof(uint32_t); - multiplier = (multiplier == 0) ? 
1 : multiplier; - cl_uint format[4] = { components, - memFmtSize / components, - multiplier, 0 }; - setArgument(kernels_[blitType], 7, sizeof(format), format); - - // Program row and slice pitches - cl_ulong pitch[4] = { 0 }; - CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, gpuMem(srcMemory)); - setArgument(kernels_[blitType], 8, sizeof(pitch), pitch); - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(dim, - globalWorkOffset, globalWorkSize, localWorkSize); - - // Execute the blit - address parameters = captureArguments(kernels_[blitType]); - result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr); - releaseArguments(parameters); - if (releaseView) { - // todo SRD programming could be changed to avoid a stall - gpu().releaseGpuMemoryFence(); - srcView->owner()->release(); - } - - return result; -} - -bool -KernelBlitManager::copyImage( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire) const -{ - amd::ScopedLock k(lockXferOps_); - bool rejected = false; - Memory* srcView = &gpuMem(srcMemory); - Memory* dstView = &gpuMem(dstMemory); - bool releaseView = false; - bool result = false; - amd::Image* srcImage = static_cast(srcMemory.owner()); - amd::Image* dstImage = static_cast(dstMemory.owner()); - amd::Image::Format newFormat(srcImage->getImageFormat()); - - // Find unsupported formats - for (uint i = 0; i < RejectedFormatDataTotal; ++i) { - if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { - newFormat.image_channel_data_type = RejectedData[i].clNewType_; - rejected = true; - break; - } - } - - // Search for the rejected channel's order only if the format was rejected - // Note: Image blit is independent from the channel order - if (rejected) { - for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { - if (RejectedOrder[i].clOldType_ == 
newFormat.image_channel_order) { - newFormat.image_channel_order = RejectedOrder[i].clNewType_; - rejected = true; - break; - } - } - } - - // Attempt to create a view if the format was rejected - if (rejected) { - srcView = createView(gpuMem(srcMemory), newFormat, CL_MEM_READ_ONLY); - if (srcView != nullptr) { - dstView = createView(gpuMem(dstMemory), newFormat, CL_MEM_WRITE_ONLY); - if (dstView != nullptr) { - rejected = false; - releaseView = true; - } - else { - delete srcView; - } - } - } - - // Fall into the host path for the entire 2D copy or - // if the image format was rejected - if (rejected) { - result = DmaBlitManager::copyImage(srcMemory, dstMemory, - srcOrigin, dstOrigin, size, entire); - synchronize(); - return result; - } - - uint blitType = BlitCopyImage; - size_t dim = 0; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - size_t globalWorkSize[3]; - size_t localWorkSize[3]; - - // Program the kernels workload depending on the blit dimensions - dim = 3; - // Find the current blit type - if ((srcImage->getDims() == 1) || - (dstImage->getDims() == 1)) { - globalWorkSize[0] = amd::alignUp(size[0], 256); - globalWorkSize[1] = amd::alignUp(size[1], 1); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = 256; - localWorkSize[1] = localWorkSize[2] = 1; - } - else if ((srcImage->getDims() == 2) || - (dstImage->getDims() == 2)) { - globalWorkSize[0] = amd::alignUp(size[0], 16); - globalWorkSize[1] = amd::alignUp(size[1], 16); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = localWorkSize[1] = 16; - localWorkSize[2] = 1; - } - else { - globalWorkSize[0] = amd::alignUp(size[0], 8); - globalWorkSize[1] = amd::alignUp(size[1], 8); - globalWorkSize[2] = amd::alignUp(size[2], 4); - localWorkSize[0] = localWorkSize[1] = 8; - localWorkSize[2] = 4; - } - - // The current OpenCL spec allows "copy images from a 1D image - // array object to a 1D image array object" only. 
- if ((gpuMem(srcMemory).owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) || - (gpuMem(dstMemory).owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY)) { - blitType = BlitCopyImage1DA; - } - - // Program kernels arguments for the blit operation - cl_mem mem = as_cl(srcView->owner()); - setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); - mem = as_cl(dstView->owner()); - setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); - - // Program source origin - cl_int srcOrg[4] = { (cl_int)srcOrigin[0], - (cl_int)srcOrigin[1], - (cl_int)srcOrigin[2], 0 }; - setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg); - - // Program destinaiton origin - cl_int dstOrg[4] = { (cl_int)dstOrigin[0], - (cl_int)dstOrigin[1], - (cl_int)dstOrigin[2], 0 }; - setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg); - - cl_int copySize[4] = { (cl_int)size[0], - (cl_int)size[1], - (cl_int)size[2], 0 }; - setArgument(kernels_[blitType], 4, sizeof(copySize), copySize); - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(dim, - globalWorkOffset, globalWorkSize, localWorkSize); - - // Execute the blit - address parameters = captureArguments(kernels_[blitType]); - result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr); - releaseArguments(parameters); - if (releaseView) { - // todo SRD programming could be changed to avoid a stall - gpu().releaseGpuMemoryFence(); - srcView->owner()->release(); - dstView->owner()->release(); - } - - synchronize(); - - return result; -} - -void -FindPinSize( - size_t& pinSize, const amd::Coord3D& size, - size_t& rowPitch, size_t& slicePitch, const Memory& mem) -{ - amd::Image* image = static_cast(mem.owner()); - pinSize = size[0] * image->getImageFormat().getElementSize(); - if ((rowPitch == 0) || (rowPitch == pinSize)) { - rowPitch = 0; - } - else { - pinSize = rowPitch; - } - - // Calculate the pin size, which should be equal to the copy size - for (uint i = 1; i < 
image->getDims(); ++i) { - pinSize *= size[i]; - if (i == 1) { - if ((slicePitch == 0) || (slicePitch == pinSize)) { - slicePitch = 0; - } - else { - if (mem.owner()->getType() != CL_MEM_OBJECT_IMAGE1D_ARRAY) { - pinSize = slicePitch; - } - else { - pinSize = slicePitch * size[i]; - } - } - } - } -} - -bool -KernelBlitManager::readImage( - device::Memory& srcMemory, - void* dstHost, - const amd::Coord3D& origin, - const amd::Coord3D& size, - size_t rowPitch, - size_t slicePitch, - bool entire) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - - // Use host copy if memory has direct access - if (setup_.disableReadImage_ || - (gpuMem(srcMemory).isHostMemDirectAccess())) { - result = HostBlitManager::readImage(srcMemory, dstHost, - origin, size, rowPitch, slicePitch, entire); - synchronize(); - return result; - } - else { - size_t pinSize; - FindPinSize(pinSize, size, rowPitch, slicePitch, gpuMem(srcMemory)); - - size_t partial; - amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial); - - if (amdMemory == nullptr) { - // Force SW copy - result = DmaBlitManager::readImage(srcMemory, dstHost, - origin, size, rowPitch, slicePitch, entire); - synchronize(); - return result; - } - - // Readjust destination offset - const amd::Coord3D dstOrigin(partial); - - // Get device memory for this virtual device - Memory* dstMemory = dev().getRocMemory(amdMemory); - - // Copy image to buffer - result = copyImageToBuffer(srcMemory, *dstMemory, - origin, dstOrigin, size, entire, rowPitch, slicePitch); - - // Add pinned memory for a later release - gpu().addPinnedMem(amdMemory); - } - - synchronize(); - - return result; -} - -bool -KernelBlitManager::writeImage( - const void* srcHost, - device::Memory& dstMemory, - const amd::Coord3D& origin, - const amd::Coord3D& size, - size_t rowPitch, - size_t slicePitch, - bool entire) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - - // Use host copy if memory has direct access - if 
(setup_.disableWriteImage_|| gpuMem(dstMemory).isHostMemDirectAccess()) { - result = HostBlitManager::writeImage( - srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); - synchronize(); - return result; - } - else { - size_t pinSize; - FindPinSize(pinSize, size, rowPitch, slicePitch, gpuMem(dstMemory)); - - size_t partial; - amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial); - - if (amdMemory == nullptr) { - // Force SW copy - result = DmaBlitManager::writeImage( - srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); - synchronize(); - return result; - } - - // Readjust destination offset - const amd::Coord3D srcOrigin(partial); - - // Get device memory for this virtual device - Memory* srcMemory = dev().getRocMemory(amdMemory); - - // Copy image to buffer - result = copyBufferToImage(*srcMemory, dstMemory, - srcOrigin, origin, size, entire, rowPitch, slicePitch); - - // Add pinned memory for a later release - gpu().addPinnedMem(amdMemory); - } - - synchronize(); - - return result; -} - -bool -KernelBlitManager::copyBufferRect( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::BufferRect& srcRectIn, - const amd::BufferRect& dstRectIn, - const amd::Coord3D& sizeIn, - bool entire) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - bool rejected = false; - - // Fall into the ROC path for rejected transfers - if (setup_.disableCopyBufferRect_ || - gpuMem(srcMemory).isHostMemDirectAccess() || gpuMem(dstMemory).isHostMemDirectAccess()) { - result = HostBlitManager::copyBufferRect(srcMemory, dstMemory, - srcRectIn, dstRectIn, sizeIn, entire); - - if (result) { - synchronize(); - return result; - } - } - - uint blitType = BlitCopyBufferRect; - size_t dim = 3; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - size_t globalWorkSize[3]; - size_t localWorkSize[3]; - - const static uint CopyRectAlignment[3] = { 16, 4, 1 }; - - bool aligned; - uint i; - for (i = 0; i < sizeof(CopyRectAlignment) / 
sizeof(uint); i++) { - // Check source alignments - aligned = ((srcRectIn.rowPitch_ % CopyRectAlignment[i]) == 0); - aligned &= ((srcRectIn.slicePitch_ % CopyRectAlignment[i]) == 0); - aligned &= ((srcRectIn.start_ % CopyRectAlignment[i]) == 0); - - // Check destination alignments - aligned &= ((dstRectIn.rowPitch_ % CopyRectAlignment[i]) == 0); - aligned &= ((dstRectIn.slicePitch_ % CopyRectAlignment[i]) == 0); - aligned &= ((dstRectIn.start_ % CopyRectAlignment[i]) == 0); - - // Check copy size alignment in the first dimension - aligned &= ((sizeIn[0] % CopyRectAlignment[i]) == 0); - - if (aligned) { - if (CopyRectAlignment[i] != 1) { - blitType = BlitCopyBufferRectAligned; - } - break; - } - } - - amd::BufferRect srcRect; - amd::BufferRect dstRect; - amd::Coord3D size(sizeIn[0], sizeIn[1], sizeIn[2]); - - srcRect.rowPitch_ = srcRectIn.rowPitch_ / CopyRectAlignment[i]; - srcRect.slicePitch_ = srcRectIn.slicePitch_ / CopyRectAlignment[i]; - srcRect.start_ = srcRectIn.start_ / CopyRectAlignment[i]; - srcRect.end_ = srcRectIn.end_ / CopyRectAlignment[i]; - - dstRect.rowPitch_ = dstRectIn.rowPitch_ / CopyRectAlignment[i]; - dstRect.slicePitch_ = dstRectIn.slicePitch_ / CopyRectAlignment[i]; - dstRect.start_ = dstRectIn.start_ / CopyRectAlignment[i]; - dstRect.end_ = dstRectIn.end_ / CopyRectAlignment[i]; - - size.c[0] /= CopyRectAlignment[i]; - - // Program the kernel's workload depending on the transfer dimensions - if ((size[1] == 1) && (size[2] == 1)) { - globalWorkSize[0] = amd::alignUp(size[0], 256); - globalWorkSize[1] = 1; - globalWorkSize[2] = 1; - localWorkSize[0] = 256; - localWorkSize[1] = 1; - localWorkSize[2] = 1; - } - else if (size[2] == 1) { - globalWorkSize[0] = amd::alignUp(size[0], 16); - globalWorkSize[1] = amd::alignUp(size[1], 16); - globalWorkSize[2] = 1; - localWorkSize[0] = localWorkSize[1] = 16; - localWorkSize[2] = 1; - } - else { - globalWorkSize[0] = amd::alignUp(size[0], 8); - globalWorkSize[1] = amd::alignUp(size[1], 8); - 
globalWorkSize[2] = amd::alignUp(size[2], 4); - localWorkSize[0] = localWorkSize[1] = 8; - localWorkSize[2] = 4; - } - - - // Program kernels arguments for the blit operation - cl_mem mem = as_cl(srcMemory.owner()); - setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); - mem = as_cl(dstMemory.owner()); - setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); - cl_ulong src[4] = { srcRect.rowPitch_, - srcRect.slicePitch_, - srcRect.start_, 0 }; - setArgument(kernels_[blitType], 2, sizeof(src), src); - cl_ulong dst[4] = { dstRect.rowPitch_, - dstRect.slicePitch_, - dstRect.start_, 0 }; - setArgument(kernels_[blitType], 3, sizeof(dst), dst); - cl_ulong copySize[4] = { size[0], size[1], size[2], CopyRectAlignment[i] }; - setArgument(kernels_[blitType], 4, sizeof(copySize), copySize); - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(dim, - globalWorkOffset, globalWorkSize, localWorkSize); - - // Execute the blit - address parameters = captureArguments(kernels_[blitType]); - result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr); - releaseArguments(parameters); - synchronize(); - - return result; -} - -bool -KernelBlitManager::readBuffer( - device::Memory& srcMemory, - void* dstHost, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - // Use host copy if memory has direct access - if (setup_.disableReadBuffer_ || - (gpuMem(srcMemory).isHostMemDirectAccess())) { - result = HostBlitManager::readBuffer( - srcMemory, dstHost, origin, size, entire); - synchronize(); - return result; - } - else { - size_t pinSize = size[0]; - // Check if a pinned transfer can be executed with a single pin - if ((pinSize <= dev().settings().pinnedXferSize_) && - (pinSize > MinSizeForPinnedTransfer)) { - size_t partial; - amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial); - - if (amdMemory == nullptr) { - // 
Force SW copy - result = DmaBlitManager::readBuffer( - srcMemory, dstHost, origin, size, entire); - synchronize(); - return result; - } - - // Readjust host mem offset - amd::Coord3D dstOrigin(partial); - - // Get device memory for this virtual device - Memory* dstMemory = dev().getRocMemory(amdMemory); - - // Copy image to buffer - result = copyBuffer(srcMemory, *dstMemory, - origin, dstOrigin, size, entire); - - // Add pinned memory for a later release - gpu().addPinnedMem(amdMemory); - } - else { - result = DmaBlitManager::readBuffer( - srcMemory, dstHost, origin, size, entire); - } - } - - synchronize(); - - return result; -} - -bool -KernelBlitManager::readBufferRect( - device::Memory& srcMemory, - void* dstHost, - const amd::BufferRect& bufRect, - const amd::BufferRect& hostRect, - const amd::Coord3D& size, - bool entire) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - - // Use host copy if memory has direct access - if (setup_.disableReadBufferRect_ || gpuMem(srcMemory).isHostMemDirectAccess()) { - result = HostBlitManager::readBufferRect( - srcMemory, dstHost, bufRect, hostRect, size, entire); - synchronize(); - return result; - } - else { - size_t pinSize = hostRect.start_ + hostRect.end_; - size_t partial; - amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial); - - if (amdMemory == nullptr) { - // Force SW copy - result = DmaBlitManager::readBufferRect( - srcMemory, dstHost, bufRect, hostRect, size, entire); - synchronize(); - return result; - } - - // Readjust host mem offset - amd::BufferRect rect; - rect.rowPitch_ = hostRect.rowPitch_; - rect.slicePitch_ = hostRect.slicePitch_; - rect.start_ = hostRect.start_ + partial; - rect.end_ = hostRect.end_; - - // Get device memory for this virtual device - Memory* dstMemory = dev().getRocMemory(amdMemory); - - // Copy image to buffer - result = copyBufferRect(srcMemory, *dstMemory, - bufRect, rect, size, entire); - - // Add pinned memory for a later release - 
gpu().addPinnedMem(amdMemory); - } - - synchronize(); - - return result; -} - -bool -KernelBlitManager::writeBuffer( - const void* srcHost, - device::Memory& dstMemory, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - - // Use host copy if memory has direct access - if (setup_.disableWriteBuffer_ || gpuMem(dstMemory).isHostMemDirectAccess()) { - result = HostBlitManager::writeBuffer( - srcHost, dstMemory, origin, size, entire); - synchronize(); - return result; - } - else { - size_t pinSize = size[0]; - - // Check if a pinned transfer can be executed with a single pin - if ((pinSize <= dev().settings().pinnedXferSize_) && - (pinSize > MinSizeForPinnedTransfer)) { - size_t partial; - amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial); - - if (amdMemory == nullptr) { - // Force SW copy - result = DmaBlitManager::writeBuffer( - srcHost, dstMemory, origin, size, entire); - synchronize(); - return result; - } - - // Readjust destination offset - const amd::Coord3D srcOrigin(partial); - - // Get device memory for this virtual device - Memory* srcMemory = dev().getRocMemory(amdMemory); - - // Copy buffer rect - result = copyBuffer(*srcMemory, dstMemory, - srcOrigin, origin, size, entire); - - // Add pinned memory for a later release - gpu().addPinnedMem(amdMemory); - } - else { - result = DmaBlitManager::writeBuffer( - srcHost, dstMemory, origin, size, entire); - } - } - - synchronize(); - - return result; -} - -bool -KernelBlitManager::writeBufferRect( - const void* srcHost, - device::Memory& dstMemory, - const amd::BufferRect& hostRect, - const amd::BufferRect& bufRect, - const amd::Coord3D& size, - bool entire) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - - // Use host copy if memory has direct access - if (setup_.disableWriteBufferRect_ || - gpuMem(dstMemory).isHostMemDirectAccess()) { - result = HostBlitManager::writeBufferRect( - 
srcHost, dstMemory, hostRect, bufRect, size, entire); - synchronize(); - return result; - } - else { - size_t pinSize = hostRect.start_ + hostRect.end_; - size_t partial; - amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial); - - if (amdMemory == nullptr) { - // Force DMA copy with staging - result = DmaBlitManager::writeBufferRect( - srcHost, dstMemory, hostRect, bufRect, size, entire); - synchronize(); - return result; - } - - // Readjust destination offset - const amd::Coord3D srcOrigin(partial); - - // Get device memory for this virtual device - Memory* srcMemory = dev().getRocMemory(amdMemory); - - // Readjust host mem offset - amd::BufferRect rect; - rect.rowPitch_ = hostRect.rowPitch_; - rect.slicePitch_ = hostRect.slicePitch_; - rect.start_ = hostRect.start_ + partial; - rect.end_ = hostRect.end_; - - // Copy buffer rect - result = copyBufferRect(*srcMemory, dstMemory, - rect, bufRect, size, entire); - - // Add pinned memory for a later release - gpu().addPinnedMem(amdMemory); - } - - synchronize(); - - return result; -} - -bool -KernelBlitManager::fillBuffer( - device::Memory& memory, - const void* pattern, - size_t patternSize, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire - ) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - - // Use host fill if memory has direct access - if (setup_.disableFillBuffer_ || - gpuMem(memory).isHostMemDirectAccess()) { - result = HostBlitManager::fillBuffer( - memory, pattern, patternSize, origin, size, entire); - synchronize(); - return result; - } - else { - uint fillType = FillBuffer; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - cl_ulong fillSize = size[0] / patternSize; - size_t globalWorkSize = amd::alignUp(fillSize, 256); - size_t localWorkSize = 256; - bool dwordAligned = - ((patternSize % sizeof(uint32_t)) == 0) ? 
true : false; - - // Program kernels arguments for the fill operation - cl_mem mem = as_cl(memory.owner()); - if (dwordAligned) { - setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr); - setArgument(kernels_[fillType], 1, sizeof(cl_mem), &mem); - } - else { - setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem); - setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr); - } - Memory* gpuCB = dev().getRocMemory(constantBuffer_); - if (gpuCB == nullptr) { - return false; - } - void* constBuf = constantBuffer_->getHostMem(); - memcpy(constBuf, pattern, patternSize); - - mem = as_cl(gpuCB->owner()); - setArgument(kernels_[fillType], 2, sizeof(cl_mem), &mem); - cl_ulong offset = origin[0]; - if (dwordAligned) { - patternSize /= sizeof(uint32_t); - offset /= sizeof(uint32_t); - } - setArgument(kernels_[fillType], 3, sizeof(cl_uint), &patternSize); - setArgument(kernels_[fillType], 4, sizeof(offset), &offset); - setArgument(kernels_[fillType], 5, sizeof(fillSize), &fillSize); - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(1, - globalWorkOffset, &globalWorkSize, &localWorkSize); - - // Execute the blit - address parameters = captureArguments(kernels_[fillType]); - result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters, nullptr); - releaseArguments(parameters); - } - - synchronize(); - - return result; -} - -bool -KernelBlitManager::copyBuffer( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& sizeIn, - bool entire) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - - if (setup_.disableHwlCopyBuffer_ || - (!gpuMem(srcMemory).isHostMemDirectAccess() && - !gpuMem(dstMemory).isHostMemDirectAccess())) { - uint blitType = BlitCopyBuffer; - size_t dim = 1; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - size_t globalWorkSize = 0; - size_t localWorkSize = 0; - - // todo LC shows much better 
performance with the unaligned version - const static uint CopyBuffAlignment[3] = { 1/*16*/, 1/*4*/, 1 }; - amd::Coord3D size(sizeIn[0], sizeIn[1], sizeIn[2]); - - bool aligned = false; - uint i; - for (i = 0; i < sizeof(CopyBuffAlignment) / sizeof(uint); i++) { - // Check source alignments - aligned = ((srcOrigin[0] % CopyBuffAlignment[i]) == 0); - // Check destination alignments - aligned &= ((dstOrigin[0] % CopyBuffAlignment[i]) == 0); - // Check copy size alignment in the first dimension - aligned &= ((sizeIn[0] % CopyBuffAlignment[i]) == 0); - - if (aligned) { - if (CopyBuffAlignment[i] != 1) { - blitType = BlitCopyBufferAligned; - } - break; - } - } - - cl_uint remain; - if (blitType == BlitCopyBufferAligned) { - size.c[0] /= CopyBuffAlignment[i]; - } - else { - remain = size[0] % 4; - size.c[0] /= 4; - size.c[0] += 1; - } - - // Program the dispatch dimensions - localWorkSize = 256; - globalWorkSize = amd::alignUp(size[0] , 256); - - // Program kernels arguments for the blit operation - cl_mem mem = as_cl(srcMemory.owner()); - setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); - mem = as_cl(dstMemory.owner()); - setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); - // Program source origin - cl_ulong srcOffset = srcOrigin[0] / CopyBuffAlignment[i];; - setArgument(kernels_[blitType], 2, sizeof(srcOffset), &srcOffset); - - // Program destinaiton origin - cl_ulong dstOffset = dstOrigin[0] / CopyBuffAlignment[i];; - setArgument(kernels_[blitType], 3, sizeof(dstOffset), &dstOffset); - - cl_ulong copySize = size[0]; - setArgument(kernels_[blitType], 4, sizeof(copySize), ©Size); - - if (blitType == BlitCopyBufferAligned) { - cl_int alignment = CopyBuffAlignment[i]; - setArgument(kernels_[blitType], 5, sizeof(alignment), &alignment); - } - else { - setArgument(kernels_[blitType], 5, sizeof(remain), &remain); - } - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(1, - globalWorkOffset, &globalWorkSize, 
&localWorkSize); - - // Execute the blit - address parameters = captureArguments(kernels_[blitType]); - result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr); - releaseArguments(parameters); - } - else { - result = DmaBlitManager::copyBuffer( - srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire); - } - - synchronize(); - - return result; -} - -bool -KernelBlitManager::fillImage( - device::Memory& memory, - const void* pattern, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire - ) const -{ - amd::ScopedLock k(lockXferOps_); - bool result = false; - - // Use host fill if memory has direct access - if (setup_.disableFillImage_ || - gpuMem(memory).isHostMemDirectAccess()) { - result = HostBlitManager::fillImage( - memory, pattern, origin, size, entire); - synchronize(); - return result; - } - - uint fillType; - size_t dim = 0; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - size_t globalWorkSize[3]; - size_t localWorkSize[3]; - Memory* memView = &gpuMem(memory); - amd::Image* image = static_cast(memory.owner()); - amd::Image::Format newFormat(image->getImageFormat()); - - // Program the kernels workload depending on the fill dimensions - fillType = FillImage; - dim = 3; - - void *newpattern = const_cast(pattern); - cl_uint4 iFillColor; - - bool rejected = false; - bool releaseView = false; - - // For depth, we need to create a view - if (newFormat.image_channel_order == CL_sRGBA) { - // Find unsupported data type - for (uint i = 0; i < RejectedFormatDataTotal; ++i) { - if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { - newFormat.image_channel_data_type = RejectedData[i].clNewType_; - rejected = true; - break; - } - } - - if (newFormat.image_channel_order == CL_sRGBA) { - // Converting a linear RGB floating-point color value to a 8-bit unsigned integer sRGB value because hw is not support write_imagef for sRGB. 
- float *fColor = static_cast(newpattern); - iFillColor.s[0] = sRGBmap(fColor[0]); - iFillColor.s[1] = sRGBmap(fColor[1]); - iFillColor.s[2] = sRGBmap(fColor[2]); - iFillColor.s[3] = (cl_uint)(fColor[3]*255.0f); - newpattern = static_cast(&iFillColor); - for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { - if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { - newFormat.image_channel_order = RejectedOrder[i].clNewType_; - rejected = true; - break; - } - } - } - } - // If the image format was rejected, then attempt to create a view - if (rejected) { - memView = createView(gpuMem(memory), newFormat, CL_MEM_WRITE_ONLY); - if (memView != nullptr) { - rejected = false; - releaseView = true; - } - } - - if (rejected) { - return DmaBlitManager::fillImage(memory, pattern, origin, size, entire); - } - - // Perform workload split to allow multiple operations in a single thread - globalWorkSize[0] = (size[0] + TransferSplitSize - 1) / TransferSplitSize; - // Find the current blit type - if (image->getDims() == 1) { - globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 256); - globalWorkSize[1] = amd::alignUp(size[1], 1); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = 256; - localWorkSize[1] = localWorkSize[2] = 1; - } - else if (image->getDims()== 2) { - globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 16); - globalWorkSize[1] = amd::alignUp(size[1], 16); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = localWorkSize[1] = 16; - localWorkSize[2] = 1; - } - else { - globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 8); - globalWorkSize[1] = amd::alignUp(size[1], 8); - globalWorkSize[2] = amd::alignUp(size[2], 4); - localWorkSize[0] = localWorkSize[1] = 8; - localWorkSize[2] = 4; - } - - // Program kernels arguments for the blit operation - cl_mem mem = as_cl(memView->owner()); - setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem); - setArgument(kernels_[fillType], 1, sizeof(cl_float4), newpattern); - 
setArgument(kernels_[fillType], 2, sizeof(cl_int4), newpattern); - setArgument(kernels_[fillType], 3, sizeof(cl_uint4), newpattern); - - cl_int fillOrigin[4] = { (cl_int)origin[0], - (cl_int)origin[1], - (cl_int)origin[2], 0 }; - cl_int fillSize[4] = { (cl_int)size[0], - (cl_int)size[1], - (cl_int)size[2], 0 }; - setArgument(kernels_[fillType], 4, sizeof(fillOrigin), fillOrigin); - setArgument(kernels_[fillType], 5, sizeof(fillSize), fillSize); - - // Find the type of image - uint32_t type = 0; - switch (newFormat.image_channel_data_type) { - case CL_SNORM_INT8: - case CL_SNORM_INT16: - case CL_UNORM_INT8: - case CL_UNORM_INT16: - case CL_UNORM_SHORT_565: - case CL_UNORM_SHORT_555: - case CL_UNORM_INT_101010: - case CL_HALF_FLOAT: - case CL_FLOAT: - type = 0; - break; - case CL_SIGNED_INT8: - case CL_SIGNED_INT16: - case CL_SIGNED_INT32: - type = 1; - break; - case CL_UNSIGNED_INT8: - case CL_UNSIGNED_INT16: - case CL_UNSIGNED_INT32: - type = 2; - break; - } - setArgument(kernels_[fillType], 6, sizeof(type), &type); - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(dim, - globalWorkOffset, globalWorkSize, localWorkSize); + amd::NDRangeContainer ndrange(1, globalWorkOffset, &globalWorkSize, &localWorkSize); // Execute the blit address parameters = captureArguments(kernels_[fillType]); result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters, nullptr); releaseArguments(parameters); - if (releaseView) { - // todo SRD programming could be changed to avoid a stall - gpu().releaseGpuMemoryFence(); - memView->owner()->release(); - } + } - synchronize(); + synchronize(); - return result; + return result; } -amd::Memory* -DmaBlitManager::pinHostMemory( - const void* hostMem, - size_t pinSize, - size_t& partial) const -{ - size_t pinAllocSize; - const static bool SysMem = true; - amd::Memory* amdMemory; +bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory, + const amd::Coord3D& 
srcOrigin, const amd::Coord3D& dstOrigin, + const amd::Coord3D& sizeIn, bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; - // Align offset to 4K boundary - char* tmpHost = const_cast( - amd::alignDown(reinterpret_cast(hostMem), - PinnedMemoryAlignment)); + if (setup_.disableHwlCopyBuffer_ || + (!gpuMem(srcMemory).isHostMemDirectAccess() && !gpuMem(dstMemory).isHostMemDirectAccess())) { + uint blitType = BlitCopyBuffer; + size_t dim = 1; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize = 0; + size_t localWorkSize = 0; - // Find the partial size for unaligned copy - partial = reinterpret_cast(hostMem) - tmpHost; + // todo LC shows much better performance with the unaligned version + const static uint CopyBuffAlignment[3] = {1 /*16*/, 1 /*4*/, 1}; + amd::Coord3D size(sizeIn[0], sizeIn[1], sizeIn[2]); - // Recalculate pin memory size - pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment); + bool aligned = false; + uint i; + for (i = 0; i < sizeof(CopyBuffAlignment) / sizeof(uint); i++) { + // Check source alignments + aligned = ((srcOrigin[0] % CopyBuffAlignment[i]) == 0); + // Check destination alignments + aligned &= ((dstOrigin[0] % CopyBuffAlignment[i]) == 0); + // Check copy size alignment in the first dimension + aligned &= ((sizeIn[0] % CopyBuffAlignment[i]) == 0); - amdMemory = gpu().findPinnedMem(tmpHost, pinAllocSize); - - if (nullptr != amdMemory) { - return amdMemory; - } - - amdMemory = new(*context_) - amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, pinAllocSize); - - if ((amdMemory != nullptr) && !amdMemory->create(tmpHost, SysMem)) { - amdMemory->release(); - return nullptr; - } - - // Get device memory for this virtual device - // @note: This will force real memory pinning - amdMemory->setVirtualDevice(&gpu()); - Memory* srcMemory = dev().getRocMemory(amdMemory); - - if (srcMemory == nullptr) { - // Release all pinned memory and attempt pinning again - gpu().releasePinnedMem(); - srcMemory = 
dev().getRocMemory(amdMemory); - if (srcMemory == nullptr) { - // Release memory - amdMemory->release(); - amdMemory = nullptr; + if (aligned) { + if (CopyBuffAlignment[i] != 1) { + blitType = BlitCopyBufferAligned; } + break; + } } + cl_uint remain; + if (blitType == BlitCopyBufferAligned) { + size.c[0] /= CopyBuffAlignment[i]; + } else { + remain = size[0] % 4; + size.c[0] /= 4; + size.c[0] += 1; + } + + // Program the dispatch dimensions + localWorkSize = 256; + globalWorkSize = amd::alignUp(size[0], 256); + + // Program kernels arguments for the blit operation + cl_mem mem = as_cl(srcMemory.owner()); + setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); + mem = as_cl(dstMemory.owner()); + setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); + // Program source origin + cl_ulong srcOffset = srcOrigin[0] / CopyBuffAlignment[i]; + ; + setArgument(kernels_[blitType], 2, sizeof(srcOffset), &srcOffset); + + // Program destinaiton origin + cl_ulong dstOffset = dstOrigin[0] / CopyBuffAlignment[i]; + ; + setArgument(kernels_[blitType], 3, sizeof(dstOffset), &dstOffset); + + cl_ulong copySize = size[0]; + setArgument(kernels_[blitType], 4, sizeof(copySize), ©Size); + + if (blitType == BlitCopyBufferAligned) { + cl_int alignment = CopyBuffAlignment[i]; + setArgument(kernels_[blitType], 5, sizeof(alignment), &alignment); + } else { + setArgument(kernels_[blitType], 5, sizeof(remain), &remain); + } + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(1, globalWorkOffset, &globalWorkSize, &localWorkSize); + + // Execute the blit + address parameters = captureArguments(kernels_[blitType]); + result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr); + releaseArguments(parameters); + } else { + result = DmaBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire); + } + + synchronize(); + + return result; +} + +bool KernelBlitManager::fillImage(device::Memory& memory, const 
void* pattern, + const amd::Coord3D& origin, const amd::Coord3D& size, + bool entire) const { + amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host fill if memory has direct access + if (setup_.disableFillImage_ || gpuMem(memory).isHostMemDirectAccess()) { + result = HostBlitManager::fillImage(memory, pattern, origin, size, entire); + synchronize(); + return result; + } + + uint fillType; + size_t dim = 0; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3]; + size_t localWorkSize[3]; + Memory* memView = &gpuMem(memory); + amd::Image* image = static_cast(memory.owner()); + amd::Image::Format newFormat(image->getImageFormat()); + + // Program the kernels workload depending on the fill dimensions + fillType = FillImage; + dim = 3; + + void* newpattern = const_cast(pattern); + cl_uint4 iFillColor; + + bool rejected = false; + bool releaseView = false; + + // For depth, we need to create a view + if (newFormat.image_channel_order == CL_sRGBA) { + // Find unsupported data type + for (uint i = 0; i < RejectedFormatDataTotal; ++i) { + if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { + newFormat.image_channel_data_type = RejectedData[i].clNewType_; + rejected = true; + break; + } + } + + if (newFormat.image_channel_order == CL_sRGBA) { + // Converting a linear RGB floating-point color value to a 8-bit unsigned integer sRGB value + // because hw is not support write_imagef for sRGB. 
+ float* fColor = static_cast(newpattern); + iFillColor.s[0] = sRGBmap(fColor[0]); + iFillColor.s[1] = sRGBmap(fColor[1]); + iFillColor.s[2] = sRGBmap(fColor[2]); + iFillColor.s[3] = (cl_uint)(fColor[3] * 255.0f); + newpattern = static_cast(&iFillColor); + for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { + if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { + newFormat.image_channel_order = RejectedOrder[i].clNewType_; + rejected = true; + break; + } + } + } + } + // If the image format was rejected, then attempt to create a view + if (rejected) { + memView = createView(gpuMem(memory), newFormat, CL_MEM_WRITE_ONLY); + if (memView != nullptr) { + rejected = false; + releaseView = true; + } + } + + if (rejected) { + return DmaBlitManager::fillImage(memory, pattern, origin, size, entire); + } + + // Perform workload split to allow multiple operations in a single thread + globalWorkSize[0] = (size[0] + TransferSplitSize - 1) / TransferSplitSize; + // Find the current blit type + if (image->getDims() == 1) { + globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 256); + globalWorkSize[1] = amd::alignUp(size[1], 1); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = 256; + localWorkSize[1] = localWorkSize[2] = 1; + } else if (image->getDims() == 2) { + globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 16); + globalWorkSize[1] = amd::alignUp(size[1], 16); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = localWorkSize[1] = 16; + localWorkSize[2] = 1; + } else { + globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 8); + globalWorkSize[1] = amd::alignUp(size[1], 8); + globalWorkSize[2] = amd::alignUp(size[2], 4); + localWorkSize[0] = localWorkSize[1] = 8; + localWorkSize[2] = 4; + } + + // Program kernels arguments for the blit operation + cl_mem mem = as_cl(memView->owner()); + setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem); + setArgument(kernels_[fillType], 1, sizeof(cl_float4), newpattern); + 
setArgument(kernels_[fillType], 2, sizeof(cl_int4), newpattern); + setArgument(kernels_[fillType], 3, sizeof(cl_uint4), newpattern); + + cl_int fillOrigin[4] = {(cl_int)origin[0], (cl_int)origin[1], (cl_int)origin[2], 0}; + cl_int fillSize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0}; + setArgument(kernels_[fillType], 4, sizeof(fillOrigin), fillOrigin); + setArgument(kernels_[fillType], 5, sizeof(fillSize), fillSize); + + // Find the type of image + uint32_t type = 0; + switch (newFormat.image_channel_data_type) { + case CL_SNORM_INT8: + case CL_SNORM_INT16: + case CL_UNORM_INT8: + case CL_UNORM_INT16: + case CL_UNORM_SHORT_565: + case CL_UNORM_SHORT_555: + case CL_UNORM_INT_101010: + case CL_HALF_FLOAT: + case CL_FLOAT: + type = 0; + break; + case CL_SIGNED_INT8: + case CL_SIGNED_INT16: + case CL_SIGNED_INT32: + type = 1; + break; + case CL_UNSIGNED_INT8: + case CL_UNSIGNED_INT16: + case CL_UNSIGNED_INT32: + type = 2; + break; + } + setArgument(kernels_[fillType], 6, sizeof(type), &type); + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize); + + // Execute the blit + address parameters = captureArguments(kernels_[fillType]); + result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters, nullptr); + releaseArguments(parameters); + if (releaseView) { + // todo SRD programming could be changed to avoid a stall + gpu().releaseGpuMemoryFence(); + memView->owner()->release(); + } + + synchronize(); + + return result; +} + +amd::Memory* DmaBlitManager::pinHostMemory(const void* hostMem, size_t pinSize, + size_t& partial) const { + size_t pinAllocSize; + const static bool SysMem = true; + amd::Memory* amdMemory; + + // Align offset to 4K boundary + char* tmpHost = const_cast( + amd::alignDown(reinterpret_cast(hostMem), PinnedMemoryAlignment)); + + // Find the partial size for unaligned copy + partial = reinterpret_cast(hostMem) - tmpHost; + + // 
Recalculate pin memory size + pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment); + + amdMemory = gpu().findPinnedMem(tmpHost, pinAllocSize); + + if (nullptr != amdMemory) { return amdMemory; + } + + amdMemory = new (*context_) amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, pinAllocSize); + + if ((amdMemory != nullptr) && !amdMemory->create(tmpHost, SysMem)) { + amdMemory->release(); + return nullptr; + } + + // Get device memory for this virtual device + // @note: This will force real memory pinning + amdMemory->setVirtualDevice(&gpu()); + Memory* srcMemory = dev().getRocMemory(amdMemory); + + if (srcMemory == nullptr) { + // Release all pinned memory and attempt pinning again + gpu().releasePinnedMem(); + srcMemory = dev().getRocMemory(amdMemory); + if (srcMemory == nullptr) { + // Release memory + amdMemory->release(); + amdMemory = nullptr; + } + } + + return amdMemory; } -Memory* -KernelBlitManager::createView( - const Memory& parent, - cl_image_format format, - cl_mem_flags flags) const -{ - assert((parent.owner()->asBuffer() == nullptr) && "View supports images only"); - amd::Image* parentImage = static_cast(parent.owner()); - amd::Image* image = parentImage->createView( - parent.owner()->getContext(), format, &gpu(), 0, flags); +Memory* KernelBlitManager::createView(const Memory& parent, cl_image_format format, + cl_mem_flags flags) const { + assert((parent.owner()->asBuffer() == nullptr) && "View supports images only"); + amd::Image* parentImage = static_cast(parent.owner()); + amd::Image* image = + parentImage->createView(parent.owner()->getContext(), format, &gpu(), 0, flags); - if (image == nullptr) { - LogError("[OCL] Fail to allocate view of image object"); - return nullptr; - } + if (image == nullptr) { + LogError("[OCL] Fail to allocate view of image object"); + return nullptr; + } - Image* devImage = new roc::Image(dev(), *image); - if (devImage == nullptr) { - LogError("[OCL] Fail to allocate device mem object for the view"); - 
image->release(); - return nullptr; - } + Image* devImage = new roc::Image(dev(), *image); + if (devImage == nullptr) { + LogError("[OCL] Fail to allocate device mem object for the view"); + image->release(); + return nullptr; + } - if (!devImage->createView(parent)) { - LogError("[OCL] Fail to create device mem object for the view"); - delete devImage; - image->release(); - return nullptr; - } + if (!devImage->createView(parent)) { + LogError("[OCL] Fail to create device mem object for the view"); + delete devImage; + image->release(); + return nullptr; + } - image->replaceDeviceMemory(&dev_, devImage); + image->replaceDeviceMemory(&dev_, devImage); - return devImage; + return devImage; } -address -KernelBlitManager::captureArguments(const amd::Kernel* kernel) const -{ - const size_t stackSize = kernel->signature().paramsSize(); - const size_t svmInfoSize = kernel->signature().numParameters() * sizeof(bool); - address args = reinterpret_cast
(amd::AlignedMemory::allocate( - stackSize + svmInfoSize, PARAMETERS_MIN_ALIGNMENT)); - if (args == nullptr) { - LogWarning("Failed to allocate memory for arguments"); - return nullptr; - } - memcpy(args, kernel->parameters().values(), kernel->signature().paramsSize()); - memset(args + stackSize, 0, svmInfoSize); - return args; +address KernelBlitManager::captureArguments(const amd::Kernel* kernel) const { + const size_t stackSize = kernel->signature().paramsSize(); + const size_t svmInfoSize = kernel->signature().numParameters() * sizeof(bool); + address args = reinterpret_cast
( + amd::AlignedMemory::allocate(stackSize + svmInfoSize, PARAMETERS_MIN_ALIGNMENT)); + if (args == nullptr) { + LogWarning("Failed to allocate memory for arguments"); + return nullptr; + } + memcpy(args, kernel->parameters().values(), kernel->signature().paramsSize()); + memset(args + stackSize, 0, svmInfoSize); + return args; } -void -KernelBlitManager::releaseArguments(address args) const -{ - amd::AlignedMemory::deallocate(args); +void KernelBlitManager::releaseArguments(address args) const { + amd::AlignedMemory::deallocate(args); } -} // namespace pal +} // namespace pal diff --git a/rocclr/runtime/device/rocm/rocblit.hpp b/rocclr/runtime/device/rocm/rocblit.hpp index c2e6314406..3e5a25198a 100644 --- a/rocclr/runtime/device/rocm/rocblit.hpp +++ b/rocclr/runtime/device/rocm/rocblit.hpp @@ -23,457 +23,408 @@ class Memory; class VirtualGPU; //! DMA Blit Manager -class DmaBlitManager : public device::HostBlitManager -{ -public: - //! Constructor - DmaBlitManager( - VirtualGPU& gpu, //!< Virtual GPU to be used for blits - Setup setup = Setup() //!< Specifies HW accelerated blits - ); +class DmaBlitManager : public device::HostBlitManager { + public: + //! Constructor + DmaBlitManager(VirtualGPU& gpu, //!< Virtual GPU to be used for blits + Setup setup = Setup() //!< Specifies HW accelerated blits + ); - //! Destructor - virtual ~DmaBlitManager() { - if (completion_signal_.handle != 0) { - hsa_signal_destroy(completion_signal_); - } + //! Destructor + virtual ~DmaBlitManager() { + if (completion_signal_.handle != 0) { + hsa_signal_destroy(completion_signal_); } + } - //! Creates DmaBlitManager object - virtual bool create(amd::Device& device) { - if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &completion_signal_)) { - false; - } - return true; + //! Creates DmaBlitManager object + virtual bool create(amd::Device& device) { + if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &completion_signal_)) { + false; } + return true; + } - //! 
Copies a buffer object to system memory - virtual bool readBuffer( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destination host memory - const amd::Coord3D& origin, //!< Source origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies a buffer object to system memory + virtual bool readBuffer(device::Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destination host memory + const amd::Coord3D& origin, //!< Source origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies a buffer object to system memory - virtual bool readBufferRect( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destinaiton host memory - const amd::BufferRect& bufRect, //!< Source rectangle - const amd::BufferRect& hostRect, //!< Destination rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies a buffer object to system memory + virtual bool readBufferRect(device::Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destinaiton host memory + const amd::BufferRect& bufRect, //!< Source rectangle + const amd::BufferRect& hostRect, //!< Destination rectangle + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies an image object to system memory - virtual bool readImage( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destination host memory - const amd::Coord3D& origin, //!< Source origin - const amd::Coord3D& size, //!< Size of the copy region - size_t rowPitch, //!< Row pitch for host memory - size_t slicePitch, //!< Slice pitch for host memory - bool entire = false //!< Entire buffer will be updated - ) const; + //! 
Copies an image object to system memory + virtual bool readImage(device::Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destination host memory + const amd::Coord3D& origin, //!< Source origin + const amd::Coord3D& size, //!< Size of the copy region + size_t rowPitch, //!< Row pitch for host memory + size_t slicePitch, //!< Slice pitch for host memory + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies system memory to a buffer object - virtual bool writeBuffer( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies system memory to a buffer object + virtual bool writeBuffer(const void* srcHost, //!< Source host memory + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies system memory to a buffer object - virtual bool writeBufferRect( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::BufferRect& hostRect, //!< Destination rectangle - const amd::BufferRect& bufRect, //!< Source rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! 
Copies system memory to a buffer object + virtual bool writeBufferRect(const void* srcHost, //!< Source host memory + device::Memory& dstMemory, //!< Destination memory object + const amd::BufferRect& hostRect, //!< Destination rectangle + const amd::BufferRect& bufRect, //!< Source rectangle + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies system memory to an image object - virtual bool writeImage( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - size_t rowPitch, //!< Row pitch for host memory - size_t slicePitch, //!< Slice pitch for host memory - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies system memory to an image object + virtual bool writeImage(const void* srcHost, //!< Source host memory + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + size_t rowPitch, //!< Row pitch for host memory + size_t slicePitch, //!< Slice pitch for host memory + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies a buffer object to another buffer object - virtual bool copyBuffer( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! 
Copies a buffer object to another buffer object + virtual bool copyBuffer(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies a buffer object to another buffer object - virtual bool copyBufferRect( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::BufferRect& srcRect, //!< Source rectangle - const amd::BufferRect& dstRect, //!< Destination rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies a buffer object to another buffer object + virtual bool copyBufferRect(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::BufferRect& srcRect, //!< Source rectangle + const amd::BufferRect& dstRect, //!< Destination rectangle + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies an image object to a buffer object - virtual bool copyImageToBuffer( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const; + //! 
Copies an image object to a buffer object + virtual bool copyImageToBuffer(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const; - //! Copies a buffer object to an image object - virtual bool copyBufferToImage( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const; + //! Copies a buffer object to an image object + virtual bool copyBufferToImage(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const; - //! Copies an image object to another image object - virtual bool copyImage( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! 
Copies an image object to another image object + virtual bool copyImage(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; -protected: - const static uint MaxPinnedBuffers = 4; + protected: + const static uint MaxPinnedBuffers = 4; - //! Synchronizes the blit operations if necessary - inline void synchronize() const; + //! Synchronizes the blit operations if necessary + inline void synchronize() const; - //! Returns the virtual GPU object - VirtualGPU& gpu() const { return static_cast(vDev_); } + //! Returns the virtual GPU object + VirtualGPU& gpu() const { return static_cast(vDev_); } - //! Returns the ROC device object - const Device& dev() const { return static_cast(dev_); }; + //! Returns the ROC device object + const Device& dev() const { return static_cast(dev_); }; - inline Memory& gpuMem(device::Memory& mem) const; + inline Memory& gpuMem(device::Memory& mem) const; - //! Pins host memory for GPU access - amd::Memory* pinHostMemory( - const void* hostMem, //!< Host memory pointer - size_t pinSize, //!< Host memory size - size_t& partial //!< Extra offset for memory alignment - ) const; + //! Pins host memory for GPU access + amd::Memory* pinHostMemory(const void* hostMem, //!< Host memory pointer + size_t pinSize, //!< Host memory size + size_t& partial //!< Extra offset for memory alignment + ) const; - //! Assits in transferring data from Host to Local or vice versa - //! taking into account the Hsail profile supported by Hsa Agent - bool hsaCopy( - const Memory& srcMemory, - const Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool enableCopyRect = false, - bool flushDMA = true) const; + //! 
Assits in transferring data from Host to Local or vice versa + //! taking into account the Hsail profile supported by Hsa Agent + bool hsaCopy(const Memory& srcMemory, const Memory& dstMemory, const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, const amd::Coord3D& size, bool enableCopyRect = false, + bool flushDMA = true) const; - const size_t MinSizeForPinnedTransfer; - bool completeOperation_; //!< DMA blit manager must complete operation - amd::Context* context_; //!< A dummy context + const size_t MinSizeForPinnedTransfer; + bool completeOperation_; //!< DMA blit manager must complete operation + amd::Context* context_; //!< A dummy context -private: + private: + //! Disable copy constructor + DmaBlitManager(const DmaBlitManager&); - //! Disable copy constructor - DmaBlitManager(const DmaBlitManager&); + //! Disable operator= + DmaBlitManager& operator=(const DmaBlitManager&); - //! Disable operator= - DmaBlitManager& operator=(const DmaBlitManager&); + //! Reads video memory, using a staged buffer + bool readMemoryStaged(Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destination host memory + Memory& xferBuf, //!< Staged buffer for read + size_t origin, //!< Original offset in the source memory + size_t& offset, //!< Offset for the current copy pointer + size_t& totalSize, //!< Total size for copy region + size_t xferSize //!< Transfer size + ) const; - //! Reads video memory, using a staged buffer - bool readMemoryStaged( - Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destination host memory - Memory& xferBuf, //!< Staged buffer for read - size_t origin, //!< Original offset in the source memory - size_t& offset, //!< Offset for the current copy pointer - size_t& totalSize, //!< Total size for copy region - size_t xferSize //!< Transfer size - ) const; + //! 
Write into video memory, using a staged buffer + bool writeMemoryStaged(const void* srcHost, //!< Source host memory + Memory& dstMemory, //!< Destination memory object + Memory& xferBuf, //!< Staged buffer for write + size_t origin, //!< Original offset in the destination memory + size_t& offset, //!< Offset for the current copy pointer + size_t& totalSize, //!< Total size for the copy region + size_t xferSize //!< Transfer size + ) const; - //! Write into video memory, using a staged buffer - bool writeMemoryStaged( - const void* srcHost, //!< Source host memory - Memory& dstMemory, //!< Destination memory object - Memory& xferBuf, //!< Staged buffer for write - size_t origin, //!< Original offset in the destination memory - size_t& offset, //!< Offset for the current copy pointer - size_t& totalSize, //!< Total size for the copy region - size_t xferSize //!< Transfer size - ) const; + //! Handle of ROC Device object + hsa_signal_t completion_signal_; - //! Handle of ROC Device object - hsa_signal_t completion_signal_; - - //! Assits in transferring data from Host to Local or vice versa - //! taking into account the Hsail profile supported by Hsa Agent - bool hsaCopyStaged( - const_address hostSrc, //!< Contains source data to be copied - address hostDst, //!< Destination buffer address for copying - size_t size, //!< Size of data to copy in bytes - address staging, //!< Staging resource - bool hostToDev //!< True if data is copied from Host To Device - ) const; + //! Assits in transferring data from Host to Local or vice versa + //! taking into account the Hsail profile supported by Hsa Agent + bool hsaCopyStaged(const_address hostSrc, //!< Contains source data to be copied + address hostDst, //!< Destination buffer address for copying + size_t size, //!< Size of data to copy in bytes + address staging, //!< Staging resource + bool hostToDev //!< True if data is copied from Host To Device + ) const; }; //! 
Kernel Blit Manager -class KernelBlitManager : public DmaBlitManager -{ -public: - enum { - BlitCopyImage = 0, - BlitCopyImage1DA, - BlitCopyImageToBuffer, - BlitCopyBufferToImage, - BlitCopyBufferRect, - BlitCopyBufferRectAligned, - BlitCopyBuffer, - BlitCopyBufferAligned, - FillBuffer, - FillImage, - BlitTotal - }; +class KernelBlitManager : public DmaBlitManager { + public: + enum { + BlitCopyImage = 0, + BlitCopyImage1DA, + BlitCopyImageToBuffer, + BlitCopyBufferToImage, + BlitCopyBufferRect, + BlitCopyBufferRectAligned, + BlitCopyBuffer, + BlitCopyBufferAligned, + FillBuffer, + FillImage, + BlitTotal + }; - //! Constructor - KernelBlitManager( - VirtualGPU& gpu, //!< Virtual GPU to be used for blits - Setup setup = Setup() //!< Specifies HW accelerated blits - ); + //! Constructor + KernelBlitManager(VirtualGPU& gpu, //!< Virtual GPU to be used for blits + Setup setup = Setup() //!< Specifies HW accelerated blits + ); - //! Destructor - virtual ~KernelBlitManager(); + //! Destructor + virtual ~KernelBlitManager(); - //! Creates DmaBlitManager object - virtual bool create(amd::Device& device); + //! Creates DmaBlitManager object + virtual bool create(amd::Device& device); - //! Copies a buffer object to another buffer object - virtual bool copyBufferRect( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::BufferRect& srcRectIn, //!< Source rectangle - const amd::BufferRect& dstRectIn, //!< Destination rectangle - const amd::Coord3D& sizeIn, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! 
Copies a buffer object to another buffer object + virtual bool copyBufferRect(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::BufferRect& srcRectIn, //!< Source rectangle + const amd::BufferRect& dstRectIn, //!< Destination rectangle + const amd::Coord3D& sizeIn, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies a buffer object to system memory - virtual bool readBuffer( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destination host memory - const amd::Coord3D& origin, //!< Source origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies a buffer object to system memory + virtual bool readBuffer(device::Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destination host memory + const amd::Coord3D& origin, //!< Source origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies a buffer object to system memory - virtual bool readBufferRect( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destinaiton host memory - const amd::BufferRect& bufRect, //!< Source rectangle - const amd::BufferRect& hostRect, //!< Destination rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies a buffer object to system memory + virtual bool readBufferRect(device::Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destinaiton host memory + const amd::BufferRect& bufRect, //!< Source rectangle + const amd::BufferRect& hostRect, //!< Destination rectangle + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! 
Copies system memory to a buffer object - virtual bool writeBuffer( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies system memory to a buffer object + virtual bool writeBuffer(const void* srcHost, //!< Source host memory + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies system memory to a buffer object - virtual bool writeBufferRect( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::BufferRect& hostRect, //!< Destination rectangle - const amd::BufferRect& bufRect, //!< Source rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies system memory to a buffer object + virtual bool writeBufferRect(const void* srcHost, //!< Source host memory + device::Memory& dstMemory, //!< Destination memory object + const amd::BufferRect& hostRect, //!< Destination rectangle + const amd::BufferRect& bufRect, //!< Source rectangle + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies a buffer object to an image object - virtual bool copyBuffer( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! 
Copies a buffer object to an image object + virtual bool copyBuffer(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies a buffer object to an image object - virtual bool copyBufferToImage( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const; + //! Copies a buffer object to an image object + virtual bool copyBufferToImage(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const; - //! Copies an image object to a buffer object - virtual bool copyImageToBuffer( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const; + //! 
Copies an image object to a buffer object + virtual bool copyImageToBuffer(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const; - //! Copies an image object to another image object - virtual bool copyImage( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies an image object to another image object + virtual bool copyImage(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies an image object to system memory - virtual bool readImage( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destination host memory - const amd::Coord3D& origin, //!< Source origin - const amd::Coord3D& size, //!< Size of the copy region - size_t rowPitch, //!< Row pitch for host memory - size_t slicePitch, //!< Slice pitch for host memory - bool entire = false //!< Entire buffer will be updated - ) const; + //! 
Copies an image object to system memory + virtual bool readImage(device::Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destination host memory + const amd::Coord3D& origin, //!< Source origin + const amd::Coord3D& size, //!< Size of the copy region + size_t rowPitch, //!< Row pitch for host memory + size_t slicePitch, //!< Slice pitch for host memory + bool entire = false //!< Entire buffer will be updated + ) const; - //! Copies system memory to an image object - virtual bool writeImage( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - size_t rowPitch, //!< Row pitch for host memory - size_t slicePitch, //!< Slice pitch for host memory - bool entire = false //!< Entire buffer will be updated - ) const; + //! Copies system memory to an image object + virtual bool writeImage(const void* srcHost, //!< Source host memory + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + size_t rowPitch, //!< Row pitch for host memory + size_t slicePitch, //!< Slice pitch for host memory + bool entire = false //!< Entire buffer will be updated + ) const; - //! Fills a buffer memory with a pattern data - virtual bool fillBuffer( - device::Memory& memory, //!< Memory object to fill with pattern - const void* pattern, //!< Pattern data - size_t patternSize, //!< Pattern size - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! 
Fills a buffer memory with a pattern data + virtual bool fillBuffer(device::Memory& memory, //!< Memory object to fill with pattern + const void* pattern, //!< Pattern data + size_t patternSize, //!< Pattern size + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; - //! Fills an image memory with a pattern data - virtual bool fillImage( - device::Memory& dstMemory, //!< Memory object to fill with pattern - const void* pattern, //!< Pattern data - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; + //! Fills an image memory with a pattern data + virtual bool fillImage(device::Memory& dstMemory, //!< Memory object to fill with pattern + const void* pattern, //!< Pattern data + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; -private: - static const size_t MaxXferBuffers = 2; - static const uint TransferSplitSize = 1; - static const uint MaxNumIssuedTransfers = 3; + private: + static const size_t MaxXferBuffers = 2; + static const uint TransferSplitSize = 1; + static const uint MaxNumIssuedTransfers = 3; - //! Copies a buffer object to an image object - bool copyBufferToImageKernel( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const; + //! 
Copies a buffer object to an image object + bool copyBufferToImageKernel(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const; - //! Copies an image object to a buffer object - bool copyImageToBufferKernel( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const; + //! Copies an image object to a buffer object + bool copyImageToBufferKernel(device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const; - //! Creates a program for all blit operations - bool createProgram( - Device& device //!< Device object - ); + //! Creates a program for all blit operations + bool createProgram(Device& device //!< Device object + ); - //! Creates a view memory object - Memory* createView( - const Memory& parent, //!< Parent memory object - cl_image_format format, //!< The new format for a view - cl_mem_flags flags //!< Memory flags - ) const; + //! 
Creates a view memory object + Memory* createView(const Memory& parent, //!< Parent memory object + cl_image_format format, //!< The new format for a view + cl_mem_flags flags //!< Memory flags + ) const; - address captureArguments(const amd::Kernel* kernel) const; - void releaseArguments(address args) const; + address captureArguments(const amd::Kernel* kernel) const; + void releaseArguments(address args) const; - //! Disable copy constructor - KernelBlitManager(const KernelBlitManager&); + //! Disable copy constructor + KernelBlitManager(const KernelBlitManager&); - //! Disable operator= - KernelBlitManager& operator=(const KernelBlitManager&); + //! Disable operator= + KernelBlitManager& operator=(const KernelBlitManager&); - amd::Program* program_; //!< GPU program obejct - amd::Kernel* kernels_[BlitTotal]; //!< GPU kernels for blit - amd::Memory* constantBuffer_; //!< An internal CB for blits - amd::Memory* xferBuffers_[MaxXferBuffers]; //!< Transfer buffers for images - size_t xferBufferSize_; //!< Transfer buffer size - amd::Monitor* lockXferOps_; //!< Lock transfer operation + amd::Program* program_; //!< GPU program obejct + amd::Kernel* kernels_[BlitTotal]; //!< GPU kernels for blit + amd::Memory* constantBuffer_; //!< An internal CB for blits + amd::Memory* xferBuffers_[MaxXferBuffers]; //!< Transfer buffers for images + size_t xferBufferSize_; //!< Transfer buffer size + amd::Monitor* lockXferOps_; //!< Lock transfer operation }; static const char* BlitName[KernelBlitManager::BlitTotal] = { - "copyImage", - "copyImage1DA", - "copyImageToBuffer", - "copyBufferToImage", - "copyBufferRect", - "copyBufferRectAligned", - "copyBuffer", - "copyBufferAligned", - "fillBuffer", + "copyImage", "copyImage1DA", "copyImageToBuffer", + "copyBufferToImage", "copyBufferRect", "copyBufferRectAligned", + "copyBuffer", "copyBufferAligned", "fillBuffer", "fillImage", - }; +}; /*@}*/} // namespace roc - diff --git a/rocclr/runtime/device/rocm/roccompiler.cpp 
b/rocclr/runtime/device/rocm/roccompiler.cpp index d6d3cfa552..fad26f6af8 100644 --- a/rocclr/runtime/device/rocm/roccompiler.cpp +++ b/rocclr/runtime/device/rocm/roccompiler.cpp @@ -15,409 +15,384 @@ #if defined(WITH_LIGHTNING_COMPILER) #include "opencl1.2-c.amdgcn.inc" #include "opencl2.0-c.amdgcn.inc" -#else // !defined(WITH_LIGHTNING_COMPILER) +#else // !defined(WITH_LIGHTNING_COMPILER) #include "roccompilerlib.hpp" -#endif // !defined(WITH_LIGHTNING_COMPILER) +#endif // !defined(WITH_LIGHTNING_COMPILER) #include "utils/options.hpp" #include #if defined(ATI_OS_LINUX) #include #include -#endif // defined(ATI_OS_LINUX) +#endif // defined(ATI_OS_LINUX) #if defined(WITH_LIGHTNING_COMPILER) static std::string llvmBin_(amd::Os::getEnvironment("LLVM_BIN")); -#endif // defined(WITH_LIGHTNING_COMPILER) +#endif // defined(WITH_LIGHTNING_COMPILER) -//CLC_IN_PROCESS_CHANGE +// CLC_IN_PROCESS_CHANGE extern int openclFrontEnd(const char* cmdline, std::string*, std::string* typeInfo = nullptr); namespace roc { /* Temporary log function for the compiler library */ -static void -logFunction(const char* msg, size_t size) -{ - std::cout<< "Compiler Log: " << msg << std::endl; +static void logFunction(const char* msg, size_t size) { + std::cout << "Compiler Log: " << msg << std::endl; } static int programsCount = 0; #if defined(WITH_LIGHTNING_COMPILER) -bool -HSAILProgram::compileImpl_LC( - const std::string& sourceCode, - const std::vector& headers, - const char** headerIncludeNames, - amd::option::Options* options) -{ - using namespace amd::opencl_driver; - std::unique_ptr C(newCompilerInstance()); - std::vector inputs; +bool HSAILProgram::compileImpl_LC(const std::string& sourceCode, + const std::vector& headers, + const char** headerIncludeNames, amd::option::Options* options) { + using namespace amd::opencl_driver; + std::unique_ptr C(newCompilerInstance()); + std::vector inputs; - Data* input = C->NewBufferReference(DT_CL, - sourceCode.c_str(), sourceCode.length()); - if 
(input == nullptr) { - buildLog_ += "Error while creating data from source code"; - return false; - } + Data* input = C->NewBufferReference(DT_CL, sourceCode.c_str(), sourceCode.length()); + if (input == nullptr) { + buildLog_ += "Error while creating data from source code"; + return false; + } - inputs.push_back(input); + inputs.push_back(input); - Buffer* output = C->NewBuffer(DT_LLVM_BC); - if (output == nullptr) { - buildLog_ += "Error while creating buffer for the LLVM bitcode"; - return false; - } + Buffer* output = C->NewBuffer(DT_LLVM_BC); + if (output == nullptr) { + buildLog_ += "Error while creating buffer for the LLVM bitcode"; + return false; + } - //Set the options for the compiler - std::ostringstream ostrstr; - std::copy(options->clangOptions.begin(), options->clangOptions.end(), - std::ostream_iterator(ostrstr, " ")); + // Set the options for the compiler + std::ostringstream ostrstr; + std::copy(options->clangOptions.begin(), options->clangOptions.end(), + std::ostream_iterator(ostrstr, " ")); - ostrstr << " -m" << sizeof(void*) * 8; - std::string driverOptions(ostrstr.str()); + ostrstr << " -m" << sizeof(void*) * 8; + std::string driverOptions(ostrstr.str()); - const char* xLang = options->oVariables->XLang; - if (xLang != nullptr && strcmp(xLang, "cl")) { - buildLog_ += "Unsupported OpenCL language.\n"; - } + const char* xLang = options->oVariables->XLang; + if (xLang != nullptr && strcmp(xLang, "cl")) { + buildLog_ += "Unsupported OpenCL language.\n"; + } - //FIXME_Nikolay: the program manager should be setting the language - //driverOptions.append(" -x cl"); + // FIXME_Nikolay: the program manager should be setting the language + // driverOptions.append(" -x cl"); - driverOptions.append(" -cl-std=").append(options->oVariables->CLStd); + driverOptions.append(" -cl-std=").append(options->oVariables->CLStd); - // Set the -O# - std::ostringstream optLevel; - optLevel << " -O" << options->oVariables->OptLevel; - 
driverOptions.append(optLevel.str()); + // Set the -O# + std::ostringstream optLevel; + optLevel << " -O" << options->oVariables->OptLevel; + driverOptions.append(optLevel.str()); - // Set the machine target - driverOptions.append(" -mcpu="); - driverOptions.append(dev().deviceInfo().machineTarget_); + // Set the machine target + driverOptions.append(" -mcpu="); + driverOptions.append(dev().deviceInfo().machineTarget_); - driverOptions.append(options->llvmOptions); + driverOptions.append(options->llvmOptions); - // Set whole program mode - driverOptions.append(" -mllvm -amdgpu-early-inline-all"); + // Set whole program mode + driverOptions.append(" -mllvm -amdgpu-early-inline-all"); - driverOptions.append(preprocessorOptions(options)); + driverOptions.append(preprocessorOptions(options)); - //Find the temp folder for the OS - std::string tempFolder = amd::Os::getEnvironment("TEMP"); + // Find the temp folder for the OS + std::string tempFolder = amd::Os::getEnvironment("TEMP"); + if (tempFolder.empty()) { + tempFolder = amd::Os::getEnvironment("TMP"); if (tempFolder.empty()) { - tempFolder = amd::Os::getEnvironment("TMP"); - if (tempFolder.empty()) { - tempFolder = WINDOWS_SWITCH(".","/tmp");; - } + tempFolder = WINDOWS_SWITCH(".", "/tmp"); + ; } - //Iterate through each source code and dump it into tmp - std::fstream f; - std::vector headerFileNames(headers.size()); - std::vector newDirs; - for (size_t i = 0; i < headers.size(); ++i) { - std::string headerPath = tempFolder; - std::string headerIncludeName(headerIncludeNames[i]); - // replace / in path with current os's file separator - if ( amd::Os::fileSeparator() != '/') { - for (std::string::iterator it = headerIncludeName.begin(), - end = headerIncludeName.end(); - it != end; - ++it) { - if (*it == '/') *it = amd::Os::fileSeparator(); - } - } - size_t pos = headerIncludeName.rfind(amd::Os::fileSeparator()); - if (pos != std::string::npos) { - headerPath += amd::Os::fileSeparator(); - headerPath += 
headerIncludeName.substr(0, pos); - headerIncludeName = headerIncludeName.substr(pos+1); - } - if (!amd::Os::pathExists(headerPath)) { - bool ret = amd::Os::createPath(headerPath); - assert(ret && "failed creating path!"); - newDirs.push_back(headerPath); - } - std::string headerFullName - = headerPath + amd::Os::fileSeparator() + headerIncludeName; - headerFileNames[i] = headerFullName; - f.open(headerFullName.c_str(), std::fstream::out); - //Should we allow asserts - assert(!f.fail() && "failed creating header file!"); - f.write(headers[i]->c_str(), headers[i]->length()); - f.close(); - - Data* inc = C->NewFileReference(DT_CL_HEADER, headerFileNames[i]); - if (inc == nullptr) { - buildLog_ += "Error while creating data from headers"; - return false; - } - inputs.push_back(inc); + } + // Iterate through each source code and dump it into tmp + std::fstream f; + std::vector headerFileNames(headers.size()); + std::vector newDirs; + for (size_t i = 0; i < headers.size(); ++i) { + std::string headerPath = tempFolder; + std::string headerIncludeName(headerIncludeNames[i]); + // replace / in path with current os's file separator + if (amd::Os::fileSeparator() != '/') { + for (std::string::iterator it = headerIncludeName.begin(), end = headerIncludeName.end(); + it != end; ++it) { + if (*it == '/') *it = amd::Os::fileSeparator(); + } } - - //Set the include path for the temp folder that contains the includes - if(!headers.empty()) { - driverOptions.append(" -I"); - driverOptions.append(tempFolder); + size_t pos = headerIncludeName.rfind(amd::Os::fileSeparator()); + if (pos != std::string::npos) { + headerPath += amd::Os::fileSeparator(); + headerPath += headerIncludeName.substr(0, pos); + headerIncludeName = headerIncludeName.substr(pos + 1); } - - if (options->isDumpFlagSet(amd::option::DUMP_CL)) { - std::ofstream f(options->getDumpFileName(".cl").c_str(), std::ios::trunc); - if(f.is_open()) { - f << "/* Compiler options:\n" \ - "-c -emit-llvm -target 
amdgcn-amd-amdhsa-opencl -x cl " - << driverOptions << " -include opencl-c.h " - << "\n*/\n\n" << sourceCode; - } else { - buildLog_ += - "Warning: opening the file to dump the OpenCL source failed.\n"; - } + if (!amd::Os::pathExists(headerPath)) { + bool ret = amd::Os::createPath(headerPath); + assert(ret && "failed creating path!"); + newDirs.push_back(headerPath); } + std::string headerFullName = headerPath + amd::Os::fileSeparator() + headerIncludeName; + headerFileNames[i] = headerFullName; + f.open(headerFullName.c_str(), std::fstream::out); + // Should we allow asserts + assert(!f.fail() && "failed creating header file!"); + f.write(headers[i]->c_str(), headers[i]->length()); + f.close(); - //FIXME_lmoriche: has the CL option been validated? - uint clcStd = (options->oVariables->CLStd[2] - '0') * 100 - + (options->oVariables->CLStd[4] - '0') * 10; + Data* inc = C->NewFileReference(DT_CL_HEADER, headerFileNames[i]); + if (inc == nullptr) { + buildLog_ += "Error while creating data from headers"; + return false; + } + inputs.push_back(inc); + } - std::pair hdr; - switch(clcStd) { - case 100: case 110: case 120: - hdr = std::make_pair(opencl1_2_c_amdgcn, opencl1_2_c_amdgcn_size); - break; + // Set the include path for the temp folder that contains the includes + if (!headers.empty()) { + driverOptions.append(" -I"); + driverOptions.append(tempFolder); + } + + if (options->isDumpFlagSet(amd::option::DUMP_CL)) { + std::ofstream f(options->getDumpFileName(".cl").c_str(), std::ios::trunc); + if (f.is_open()) { + f << "/* Compiler options:\n" + "-c -emit-llvm -target amdgcn-amd-amdhsa-opencl -x cl " + << driverOptions << " -include opencl-c.h " + << "\n*/\n\n" + << sourceCode; + } else { + buildLog_ += "Warning: opening the file to dump the OpenCL source failed.\n"; + } + } + + // FIXME_lmoriche: has the CL option been validated? 
+ uint clcStd = + (options->oVariables->CLStd[2] - '0') * 100 + (options->oVariables->CLStd[4] - '0') * 10; + + std::pair hdr; + switch (clcStd) { + case 100: + case 110: + case 120: + hdr = std::make_pair(opencl1_2_c_amdgcn, opencl1_2_c_amdgcn_size); + break; case 200: - hdr = std::make_pair(opencl2_0_c_amdgcn, opencl2_0_c_amdgcn_size); - break; + hdr = std::make_pair(opencl2_0_c_amdgcn, opencl2_0_c_amdgcn_size); + break; default: - buildLog_ += "Unsupported requested OpenCL C version (-cl-std).\n"; - return false; + buildLog_ += "Unsupported requested OpenCL C version (-cl-std).\n"; + return false; + } + + File* pch = C->NewTempFile(DT_CL_HEADER); + if (pch == nullptr || !pch->WriteData((const char*)hdr.first, hdr.second)) { + buildLog_ += "Error while opening the opencl-c header "; + return false; + } + + driverOptions.append(" -include-pch " + pch->Name()); + driverOptions.append(" -Xclang -fno-validate-pch"); + + // Tokenize the options string into a vector of strings + std::istringstream istrstr(driverOptions); + std::istream_iterator sit(istrstr), end; + std::vector params(sit, end); + + // Compile source to IR + bool ret = + dev().cacheCompilation()->compileToLLVMBitcode(C.get(), inputs, output, params, buildLog_); + buildLog_ += C->Output(); + if (!ret) { + buildLog_ += "Error: Failed to compile opencl source (from CL to LLVM IR).\n"; + return false; + } + + llvmBinary_.assign(output->Buf().data(), output->Size()); + elfSectionType_ = amd::OclElf::LLVMIR; + + if (options->isDumpFlagSet(amd::option::DUMP_BC_ORIGINAL)) { + std::ofstream f(options->getDumpFileName("_original.bc").c_str(), std::ios::trunc); + if (f.is_open()) { + f.write(llvmBinary_.data(), llvmBinary_.size()); + } else { + buildLog_ += "Warning: opening the file to dump the compiled IR failed.\n"; } + } - File* pch = C->NewTempFile(DT_CL_HEADER); - if (pch == nullptr || !pch->WriteData((const char*) hdr.first, hdr.second)) { - buildLog_ += "Error while opening the opencl-c header "; - return 
false; - } - - driverOptions.append(" -include-pch " + pch->Name()); - driverOptions.append(" -Xclang -fno-validate-pch"); - - // Tokenize the options string into a vector of strings - std::istringstream istrstr(driverOptions); - std::istream_iterator sit(istrstr), end; - std::vector params(sit, end); - - // Compile source to IR - bool ret = dev().cacheCompilation()->compileToLLVMBitcode(C.get(), inputs, output, params, buildLog_); - buildLog_ += C->Output(); - if (!ret) { - buildLog_ += "Error: Failed to compile opencl source (from CL to LLVM IR).\n"; - return false; - } - - llvmBinary_.assign(output->Buf().data(), output->Size()); - elfSectionType_ = amd::OclElf::LLVMIR; - - if (options->isDumpFlagSet(amd::option::DUMP_BC_ORIGINAL)) { - std::ofstream f(options->getDumpFileName("_original.bc").c_str(), std::ios::trunc); - if(f.is_open()) { - f.write(llvmBinary_.data(), llvmBinary_.size()); - } else { - buildLog_ += - "Warning: opening the file to dump the compiled IR failed.\n"; - } - } - - if (clBinary()->saveSOURCE()) { - clBinary()->elfOut()->addSection( - amd::OclElf::SOURCE, sourceCode.data(), sourceCode.size()); - } - if (clBinary()->saveLLVMIR()) { - clBinary()->elfOut()->addSection( - amd::OclElf::LLVMIR, llvmBinary_.data(), llvmBinary_.size(), false); - // store the original compile options - clBinary()->storeCompileOptions(compileOptions_); - } - return true; + if (clBinary()->saveSOURCE()) { + clBinary()->elfOut()->addSection(amd::OclElf::SOURCE, sourceCode.data(), sourceCode.size()); + } + if (clBinary()->saveLLVMIR()) { + clBinary()->elfOut()->addSection(amd::OclElf::LLVMIR, llvmBinary_.data(), llvmBinary_.size(), + false); + // store the original compile options + clBinary()->storeCompileOptions(compileOptions_); + } + return true; } -#endif // defined(WITH_LIGHTNING_COMPILER) +#endif // defined(WITH_LIGHTNING_COMPILER) -bool -HSAILProgram::compileImpl( - const std::string& sourceCode, - const std::vector& headers, - const char** headerIncludeNames, 
- amd::option::Options* options) -{ +bool HSAILProgram::compileImpl(const std::string& sourceCode, + const std::vector& headers, + const char** headerIncludeNames, amd::option::Options* options) { #if defined(WITH_LIGHTNING_COMPILER) - return compileImpl_LC(sourceCode, headers, headerIncludeNames, options); -#else // !defined(WITH_LIGHTNING_COMPILER) - acl_error errorCode; - aclTargetInfo target; + return compileImpl_LC(sourceCode, headers, headerIncludeNames, options); +#else // !defined(WITH_LIGHTNING_COMPILER) + acl_error errorCode; + aclTargetInfo target; - target = g_complibApi._aclGetTargetInfo(LP64_SWITCH("hsail","hsail64"), - dev().deviceInfo().complibTarget_, &errorCode); + target = g_complibApi._aclGetTargetInfo(LP64_SWITCH("hsail", "hsail64"), + dev().deviceInfo().complibTarget_, &errorCode); - //end if asic info is ready - // We dump the source code for each program (param: headers) - // into their filenames (headerIncludeNames) into the TEMP - // folder specific to the OS and add the include path while - // compiling + // end if asic info is ready + // We dump the source code for each program (param: headers) + // into their filenames (headerIncludeNames) into the TEMP + // folder specific to the OS and add the include path while + // compiling - //Find the temp folder for the OS - std::string tempFolder = amd::Os::getEnvironment("TEMP"); + // Find the temp folder for the OS + std::string tempFolder = amd::Os::getEnvironment("TEMP"); + if (tempFolder.empty()) { + tempFolder = amd::Os::getEnvironment("TMP"); if (tempFolder.empty()) { - tempFolder = amd::Os::getEnvironment("TMP"); - if (tempFolder.empty()) { - tempFolder = WINDOWS_SWITCH(".","/tmp");; - } + tempFolder = WINDOWS_SWITCH(".", "/tmp"); + ; } - //Iterate through each source code and dump it into tmp - std::fstream f; - std::vector headerFileNames(headers.size()); - std::vector newDirs; - for (size_t i = 0; i < headers.size(); ++i) { - std::string headerPath = tempFolder; - std::string 
headerIncludeName(headerIncludeNames[i]); - // replace / in path with current os's file separator - if ( amd::Os::fileSeparator() != '/') { - for (std::string::iterator it = headerIncludeName.begin(), - end = headerIncludeName.end(); - it != end; - ++it) { - if (*it == '/') *it = amd::Os::fileSeparator(); - } - } - size_t pos = headerIncludeName.rfind(amd::Os::fileSeparator()); - if (pos != std::string::npos) { - headerPath += amd::Os::fileSeparator(); - headerPath += headerIncludeName.substr(0, pos); - headerIncludeName = headerIncludeName.substr(pos+1); - } - if (!amd::Os::pathExists(headerPath)) { - bool ret = amd::Os::createPath(headerPath); - assert(ret && "failed creating path!"); - newDirs.push_back(headerPath); - } - std::string headerFullName - = headerPath + amd::Os::fileSeparator() + headerIncludeName; - headerFileNames[i] = headerFullName; - f.open(headerFullName.c_str(), std::fstream::out); - //Should we allow asserts - assert(!f.fail() && "failed creating header file!"); - f.write(headers[i]->c_str(), headers[i]->length()); - f.close(); + } + // Iterate through each source code and dump it into tmp + std::fstream f; + std::vector headerFileNames(headers.size()); + std::vector newDirs; + for (size_t i = 0; i < headers.size(); ++i) { + std::string headerPath = tempFolder; + std::string headerIncludeName(headerIncludeNames[i]); + // replace / in path with current os's file separator + if (amd::Os::fileSeparator() != '/') { + for (std::string::iterator it = headerIncludeName.begin(), end = headerIncludeName.end(); + it != end; ++it) { + if (*it == '/') *it = amd::Os::fileSeparator(); + } } + size_t pos = headerIncludeName.rfind(amd::Os::fileSeparator()); + if (pos != std::string::npos) { + headerPath += amd::Os::fileSeparator(); + headerPath += headerIncludeName.substr(0, pos); + headerIncludeName = headerIncludeName.substr(pos + 1); + } + if (!amd::Os::pathExists(headerPath)) { + bool ret = amd::Os::createPath(headerPath); + assert(ret && "failed 
creating path!"); + newDirs.push_back(headerPath); + } + std::string headerFullName = headerPath + amd::Os::fileSeparator() + headerIncludeName; + headerFileNames[i] = headerFullName; + f.open(headerFullName.c_str(), std::fstream::out); + // Should we allow asserts + assert(!f.fail() && "failed creating header file!"); + f.write(headers[i]->c_str(), headers[i]->length()); + f.close(); + } - //Create Binary - binaryElf_ = g_complibApi._aclBinaryInit(sizeof(aclBinary), - &target, - &binOpts_, - &errorCode); + // Create Binary + binaryElf_ = g_complibApi._aclBinaryInit(sizeof(aclBinary), &target, &binOpts_, &errorCode); - if( errorCode!=ACL_SUCCESS ) { - buildLog_ += "Error while compiling opencl source:\ + if (errorCode != ACL_SUCCESS) { + buildLog_ += + "Error while compiling opencl source:\ aclBinary init failure \n"; - LogWarning("aclBinaryInit failed"); - return false; - } + LogWarning("aclBinaryInit failed"); + return false; + } - //Insert opencl into binary - errorCode = g_complibApi._aclInsertSection(device().compiler(), - binaryElf_, - sourceCode.c_str(), - strlen(sourceCode.c_str()), - aclSOURCE); + // Insert opencl into binary + errorCode = g_complibApi._aclInsertSection(device().compiler(), binaryElf_, sourceCode.c_str(), + strlen(sourceCode.c_str()), aclSOURCE); - if ( errorCode != ACL_SUCCESS ) { - buildLog_ += "Error while converting to BRIG: \ + if (errorCode != ACL_SUCCESS) { + buildLog_ += + "Error while converting to BRIG: \ Inserting openCl Source \n"; - } + } - //Set the options for the compiler - //Set the include path for the temp folder that contains the includes - if(!headers.empty()) { - this->compileOptions_.append(" -I"); - this->compileOptions_.append(tempFolder); - } + // Set the options for the compiler + // Set the include path for the temp folder that contains the includes + if (!headers.empty()) { + this->compileOptions_.append(" -I"); + this->compileOptions_.append(tempFolder); + } - //Add only for CL2.0 and later - if 
(options->oVariables->CLStd[2] >= '2') { - std::stringstream opts; - opts << " -D" << "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE=" - << device().info().maxGlobalVariableSize_; - compileOptions_.append(opts.str()); - } + // Add only for CL2.0 and later + if (options->oVariables->CLStd[2] >= '2') { + std::stringstream opts; + opts << " -D" + << "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE=" << device().info().maxGlobalVariableSize_; + compileOptions_.append(opts.str()); + } - //Compile source to IR - this->compileOptions_.append(preprocessorOptions(options)); - this->compileOptions_.append(codegenOptions(options)); + // Compile source to IR + this->compileOptions_.append(preprocessorOptions(options)); + this->compileOptions_.append(codegenOptions(options)); - errorCode = g_complibApi._aclCompile(device().compiler(), - binaryElf_, - //"-Wf,--support_all_extensions", - this->compileOptions_.c_str(), - ACL_TYPE_OPENCL, - ACL_TYPE_LLVMIR_BINARY, - logFunction); - buildLog_ += g_complibApi._aclGetCompilerLog(device().compiler()); - if( errorCode!=ACL_SUCCESS ) { - LogWarning("aclCompile failed"); - buildLog_ += "Error while compiling \ + errorCode = g_complibApi._aclCompile(device().compiler(), binaryElf_, + //"-Wf,--support_all_extensions", + this->compileOptions_.c_str(), ACL_TYPE_OPENCL, + ACL_TYPE_LLVMIR_BINARY, logFunction); + buildLog_ += g_complibApi._aclGetCompilerLog(device().compiler()); + if (errorCode != ACL_SUCCESS) { + LogWarning("aclCompile failed"); + buildLog_ += + "Error while compiling \ opencl source: Compiling CL to IR"; - return false; - } - // Save the binary in the interface class - saveBinaryAndSetType(TYPE_COMPILED); - return true; -#endif // !defined(WITH_LIGHTNING_COMPILER) + return false; + } + // Save the binary in the interface class + saveBinaryAndSetType(TYPE_COMPILED); + return true; +#endif // !defined(WITH_LIGHTNING_COMPILER) } #if defined(WITH_LIGHTNING_COMPILER) #if defined(ATI_OS_LINUX) static pthread_once_t once = PTHREAD_ONCE_INIT; -static void 
-checkLLVM_BIN() -{ - if (llvmBin_.empty()) { - Dl_info info; - if (dladdr((const void*)&amd::Device::init, &info)) { - llvmBin_ = dirname(strdup(info.dli_fname)); - size_t pos = llvmBin_.rfind("lib"); - if (pos != std::string::npos) { - llvmBin_.replace(pos, 3, "bin"); - } - } +static void checkLLVM_BIN() { + if (llvmBin_.empty()) { + Dl_info info; + if (dladdr((const void*)&amd::Device::init, &info)) { + llvmBin_ = dirname(strdup(info.dli_fname)); + size_t pos = llvmBin_.rfind("lib"); + if (pos != std::string::npos) { + llvmBin_.replace(pos, 3, "bin"); + } } + } #if defined(DEBUG) - static const std::string tools[] = { "clang", "llvm-link", "ld.lld" }; + static const std::string tools[] = {"clang", "llvm-link", "ld.lld"}; - for (const std::string tool : tools) { - std::string exePath(llvmBin_ + "/" + tool); - struct stat buf; - if (stat(exePath.c_str(), &buf)) { - std::string msg(exePath + " not found"); - LogWarning(msg.c_str()); - } - else if ((buf.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) { - std::string msg("Cannot execute " + exePath); - LogWarning(msg.c_str()); - } + for (const std::string tool : tools) { + std::string exePath(llvmBin_ + "/" + tool); + struct stat buf; + if (stat(exePath.c_str(), &buf)) { + std::string msg(exePath + " not found"); + LogWarning(msg.c_str()); + } else if ((buf.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) { + std::string msg("Cannot execute " + exePath); + LogWarning(msg.c_str()); } -#endif // defined(DEBUG) + } +#endif // defined(DEBUG) } -#endif // defined(ATI_OS_LINUX) +#endif // defined(ATI_OS_LINUX) -amd::opencl_driver::Compiler* -HSAILProgram::newCompilerInstance() -{ +amd::opencl_driver::Compiler* HSAILProgram::newCompilerInstance() { #if defined(ATI_OS_LINUX) - pthread_once(&once, checkLLVM_BIN); -#endif // defined(ATI_OS_LINUX) - return amd::opencl_driver::CompilerFactory().CreateAMDGPUCompiler(llvmBin_); + pthread_once(&once, checkLLVM_BIN); +#endif // defined(ATI_OS_LINUX) + return 
amd::opencl_driver::CompilerFactory().CreateAMDGPUCompiler(llvmBin_); } -#endif // defined(WITH_LIGHTNING_COMPILER) +#endif // defined(WITH_LIGHTNING_COMPILER) -} // namespace roc -#endif // WITHOUT_GPU_BACKEND +} // namespace roc +#endif // WITHOUT_GPU_BACKEND diff --git a/rocclr/runtime/device/rocm/roccompilerlib.cpp b/rocclr/runtime/device/rocm/roccompilerlib.cpp index d5c63a067a..9fccfe212e 100644 --- a/rocclr/runtime/device/rocm/roccompilerlib.cpp +++ b/rocclr/runtime/device/rocm/roccompilerlib.cpp @@ -9,51 +9,49 @@ void* g_complibModule = nullptr; struct CompLibApi g_complibApi; // -// g_complibModule is defined in LoadCompLib(). This macro must be used only in LoadCompLib() function. +// g_complibModule is defined in LoadCompLib(). This macro must be used only in LoadCompLib() +// function. // -#define LOADSYMBOL(api) \ - g_complibApi._##api = (pfn_##api) amd::Os::getSymbol(g_complibModule, #api); \ - if( g_complibApi._##api == nullptr ) { \ - LogError ("amd::Os::getSymbol() for exported func " #api " failed."); \ - amd::Os::unloadLibrary(g_complibModule); \ - return false; \ +#define LOADSYMBOL(api) \ + g_complibApi._##api = (pfn_##api)amd::Os::getSymbol(g_complibModule, #api); \ + if (g_complibApi._##api == nullptr) { \ + LogError("amd::Os::getSymbol() for exported func " #api " failed."); \ + amd::Os::unloadLibrary(g_complibModule); \ + return false; \ + } + +bool LoadCompLib(bool offline) { + g_complibModule = amd::Os::loadLibrary("amdhsacl" LP64_SWITCH(LINUX_SWITCH("32", ""), "64")); + if (g_complibModule == nullptr) { + if (!offline) { + LogError("amd::Os::loadLibrary() for loading of amdhsacl.dll failed."); } + return false; + } -bool LoadCompLib(bool offline) -{ - g_complibModule = amd::Os::loadLibrary("amdhsacl" LP64_SWITCH(LINUX_SWITCH("32",""), "64")); - if( g_complibModule == nullptr ) { - if (!offline) { - LogError( "amd::Os::loadLibrary() for loading of amdhsacl.dll failed."); - } - return false; - } + LOADSYMBOL(aclCompilerInit) + 
LOADSYMBOL(aclGetTargetInfo) + LOADSYMBOL(aclBinaryInit) + LOADSYMBOL(aclInsertSection) + LOADSYMBOL(aclCompile) + LOADSYMBOL(aclCompilerFini) + LOADSYMBOL(aclBinaryFini) + LOADSYMBOL(aclWriteToMem) + LOADSYMBOL(aclQueryInfo) + LOADSYMBOL(aclExtractSymbol) + LOADSYMBOL(aclGetCompilerLog) + LOADSYMBOL(aclCreateFromBinary) + LOADSYMBOL(aclReadFromMem) + LOADSYMBOL(aclBinaryVersion) + LOADSYMBOL(aclLink) - LOADSYMBOL(aclCompilerInit) - LOADSYMBOL(aclGetTargetInfo) - LOADSYMBOL(aclBinaryInit) - LOADSYMBOL(aclInsertSection) - LOADSYMBOL(aclCompile) - LOADSYMBOL(aclCompilerFini) - LOADSYMBOL(aclBinaryFini) - LOADSYMBOL(aclWriteToMem) - LOADSYMBOL(aclQueryInfo) - LOADSYMBOL(aclExtractSymbol) - LOADSYMBOL(aclGetCompilerLog) - LOADSYMBOL(aclCreateFromBinary) - LOADSYMBOL(aclReadFromMem) - LOADSYMBOL(aclBinaryVersion) - LOADSYMBOL(aclLink) - - return true; + return true; } -void UnloadCompLib() -{ - if( g_complibModule ) - { - amd::Os::unloadLibrary(g_complibModule); - } +void UnloadCompLib() { + if (g_complibModule) { + amd::Os::unloadLibrary(g_complibModule); + } } -} // namespace roc +} // namespace roc diff --git a/rocclr/runtime/device/rocm/roccompilerlib.hpp b/rocclr/runtime/device/rocm/roccompilerlib.hpp index bc90a6a666..422713002c 100644 --- a/rocclr/runtime/device/rocm/roccompilerlib.hpp +++ b/rocclr/runtime/device/rocm/roccompilerlib.hpp @@ -1,7 +1,7 @@ #pragma once // -// This file hsa the code for explicity loading amdoclcl.dll. +// This file hsa the code for explicity loading amdoclcl.dll. // Exported functions from amdoclcl.dll can be added for usage as need-basis. // With explicit/dynamic loading roc will not have any linkage to amdoclcl.lib. 
// @@ -12,57 +12,71 @@ #if defined(WITH_LIGHTNING_COMPILER) #error Should not include this file -#endif // defined(WITH_LIGHTNING_COMPILER) +#endif // defined(WITH_LIGHTNING_COMPILER) using namespace amd; namespace roc { // -// To use any new exported function from amdhsacl.dll please add/make that function specific changes +// To use any new exported function from amdhsacl.dll please add/make that function specific changes // in typedef below, struct CompLibApi and in hsacompilerLib.cpp::LoadCompLib() function. // // // Convention: The typedefed function name must be prefixed with pfn_ // -typedef aclCompiler* (ACL_API_ENTRY *pfn_aclCompilerInit) (aclCompilerOptions *opts, acl_error *error_code); -typedef aclTargetInfo (ACL_API_ENTRY *pfn_aclGetTargetInfo) (const char*, const char*, acl_error*); -typedef aclBinary* (ACL_API_ENTRY *pfn_aclBinaryInit) (size_t, const aclTargetInfo*, const aclBinaryOptions*, acl_error*); -typedef acl_error (ACL_API_ENTRY *pfn_aclInsertSection) (aclCompiler *cl, aclBinary *binary, const void *data, size_t data_size, aclSections id); -typedef acl_error (ACL_API_ENTRY *pfn_aclCompile) (aclCompiler *cl, aclBinary *bin, const char *options, aclType from, aclType to, aclLogFunction compile_callback); -typedef acl_error (ACL_API_ENTRY *pfn_aclCompilerFini) (aclCompiler *cl); -typedef acl_error (ACL_API_ENTRY *pfn_aclBinaryFini) (aclBinary *bin); -typedef acl_error (ACL_API_ENTRY *pfn_aclWriteToMem) (aclBinary *bin,void **mem, size_t *size); -typedef acl_error (ACL_API_ENTRY *pfn_aclQueryInfo) (aclCompiler *cl, const aclBinary *binary, aclQueryType query, const char *kernel, void *data_ptr, size_t *ptr_size); -typedef const void* (ACL_API_ENTRY *pfn_aclExtractSymbol) (aclCompiler *cl,const aclBinary *binary,size_t *size,aclSections id,const char *symbol,acl_error *error_code); -typedef aclBinary* (ACL_API_ENTRY *pfn_aclReadFromMem) (void *mem,size_t size, acl_error *error_code); -typedef char* (ACL_API_ENTRY *pfn_aclGetCompilerLog) 
(aclCompiler* cl); -typedef aclBinary* (ACL_API_ENTRY *pfn_aclCreateFromBinary) (const aclBinary *binary,aclBIFVersion version); -typedef aclBIFVersion (ACL_API_ENTRY *pfn_aclBinaryVersion) (const aclBinary *binary); -typedef acl_error (ACL_API_ENTRY *pfn_aclLink) (aclCompiler* cl, aclBinary *src_bin, unsigned int num_libs, aclBinary **libs, aclType link_mode,const char* options, aclLogFunction link_callback); +typedef aclCompiler*(ACL_API_ENTRY* pfn_aclCompilerInit)(aclCompilerOptions* opts, + acl_error* error_code); +typedef aclTargetInfo(ACL_API_ENTRY* pfn_aclGetTargetInfo)(const char*, const char*, acl_error*); +typedef aclBinary*(ACL_API_ENTRY* pfn_aclBinaryInit)(size_t, const aclTargetInfo*, + const aclBinaryOptions*, acl_error*); +typedef acl_error(ACL_API_ENTRY* pfn_aclInsertSection)(aclCompiler* cl, aclBinary* binary, + const void* data, size_t data_size, + aclSections id); +typedef acl_error(ACL_API_ENTRY* pfn_aclCompile)(aclCompiler* cl, aclBinary* bin, + const char* options, aclType from, aclType to, + aclLogFunction compile_callback); +typedef acl_error(ACL_API_ENTRY* pfn_aclCompilerFini)(aclCompiler* cl); +typedef acl_error(ACL_API_ENTRY* pfn_aclBinaryFini)(aclBinary* bin); +typedef acl_error(ACL_API_ENTRY* pfn_aclWriteToMem)(aclBinary* bin, void** mem, size_t* size); +typedef acl_error(ACL_API_ENTRY* pfn_aclQueryInfo)(aclCompiler* cl, const aclBinary* binary, + aclQueryType query, const char* kernel, + void* data_ptr, size_t* ptr_size); +typedef const void*(ACL_API_ENTRY* pfn_aclExtractSymbol)(aclCompiler* cl, const aclBinary* binary, + size_t* size, aclSections id, + const char* symbol, acl_error* error_code); +typedef aclBinary*(ACL_API_ENTRY* pfn_aclReadFromMem)(void* mem, size_t size, + acl_error* error_code); +typedef char*(ACL_API_ENTRY* pfn_aclGetCompilerLog)(aclCompiler* cl); +typedef aclBinary*(ACL_API_ENTRY* pfn_aclCreateFromBinary)(const aclBinary* binary, + aclBIFVersion version); +typedef aclBIFVersion(ACL_API_ENTRY* 
pfn_aclBinaryVersion)(const aclBinary* binary); +typedef acl_error(ACL_API_ENTRY* pfn_aclLink)(aclCompiler* cl, aclBinary* src_bin, + unsigned int num_libs, aclBinary** libs, + aclType link_mode, const char* options, + aclLogFunction link_callback); // // Convention: prefix struct member variable with with underscore '_' // would be nice if there was no underscore prfix, but on Linux the token // pasting in the macro is srtict and his is the workaround. // -struct CompLibApi -{ - pfn_aclCompilerInit _aclCompilerInit; - pfn_aclGetTargetInfo _aclGetTargetInfo; - pfn_aclBinaryInit _aclBinaryInit; - pfn_aclInsertSection _aclInsertSection; - pfn_aclCompile _aclCompile; - pfn_aclCompilerFini _aclCompilerFini; - pfn_aclBinaryFini _aclBinaryFini; - pfn_aclWriteToMem _aclWriteToMem; - pfn_aclQueryInfo _aclQueryInfo; - pfn_aclExtractSymbol _aclExtractSymbol; - pfn_aclReadFromMem _aclReadFromMem; - pfn_aclGetCompilerLog _aclGetCompilerLog; - pfn_aclCreateFromBinary _aclCreateFromBinary; - pfn_aclBinaryVersion _aclBinaryVersion; - pfn_aclLink _aclLink; +struct CompLibApi { + pfn_aclCompilerInit _aclCompilerInit; + pfn_aclGetTargetInfo _aclGetTargetInfo; + pfn_aclBinaryInit _aclBinaryInit; + pfn_aclInsertSection _aclInsertSection; + pfn_aclCompile _aclCompile; + pfn_aclCompilerFini _aclCompilerFini; + pfn_aclBinaryFini _aclBinaryFini; + pfn_aclWriteToMem _aclWriteToMem; + pfn_aclQueryInfo _aclQueryInfo; + pfn_aclExtractSymbol _aclExtractSymbol; + pfn_aclReadFromMem _aclReadFromMem; + pfn_aclGetCompilerLog _aclGetCompilerLog; + pfn_aclCreateFromBinary _aclCreateFromBinary; + pfn_aclBinaryVersion _aclBinaryVersion; + pfn_aclLink _aclLink; }; @@ -74,8 +88,7 @@ extern CompLibApi g_complibApi; // Note: initializes global variable g_complibApi. // Not sure what error values we have, for now returning false on failure. 
-bool LoadCompLib(bool isOfflineDevice=false); +bool LoadCompLib(bool isOfflineDevice = false); void UnloadCompLib(); -} // namespace roc - +} // namespace roc diff --git a/rocclr/runtime/device/rocm/rocdefs.hpp b/rocclr/runtime/device/rocm/rocdefs.hpp index 26fb001b20..cb17821153 100644 --- a/rocclr/runtime/device/rocm/rocdefs.hpp +++ b/rocclr/runtime/device/rocm/rocdefs.hpp @@ -10,21 +10,21 @@ const static size_t PinnedMemoryAlignment = 4 * Ki; typedef uint HsaDeviceId; struct AMDDeviceInfo { - HsaDeviceId hsaDeviceId_; //!< Machine id - const char* targetName_; //!< Target name for compilation - const char* machineTarget_; //!< Machine target - const char* complibTarget_; //!< Compiler library target name - uint simdPerCU_; //!< Number of SIMDs per CU - uint simdWidth_; //!< Number of workitems processed per SIMD - uint simdInstructionWidth_; //!< Number of instructions processed per SIMD - uint memChannelBankWidth_; //!< Memory channel bank width - uint localMemSizePerCU_; //!< Local memory size per CU - uint localMemBanks_; //!< Number of banks of local memory - uint gfxipVersion_; //!< The core engine GFXIP version - uint pciDeviceId_; //!< PCIe device id + HsaDeviceId hsaDeviceId_; //!< Machine id + const char* targetName_; //!< Target name for compilation + const char* machineTarget_; //!< Machine target + const char* complibTarget_; //!< Compiler library target name + uint simdPerCU_; //!< Number of SIMDs per CU + uint simdWidth_; //!< Number of workitems processed per SIMD + uint simdInstructionWidth_; //!< Number of instructions processed per SIMD + uint memChannelBankWidth_; //!< Memory channel bank width + uint localMemSizePerCU_; //!< Local memory size per CU + uint localMemBanks_; //!< Number of banks of local memory + uint gfxipVersion_; //!< The core engine GFXIP version + uint pciDeviceId_; //!< PCIe device id }; -//The device ID must match with the device's index into DeviceInfo +// The device ID must match with the device's index into DeviceInfo 
const HsaDeviceId HSA_SPECTRE_ID = 0; const HsaDeviceId HSA_SPOOKY_ID = 1; const HsaDeviceId HSA_TONGA_ID = 2; @@ -38,19 +38,22 @@ const HsaDeviceId HSA_VEGA10_ID = 9; const HsaDeviceId HSA_INVALID_DEVICE_ID = -1; static const AMDDeviceInfo DeviceInfo[] = { - // targetName machineTarget - /* TARGET_KAVERI_SPECTRE */ {HSA_SPECTRE_ID, "", "kaveri", "Spectre", 4, 16, 1, 256, 64 * Ki, 32, 0, 0 }, - /* TARGET_KAVERI_SPOOKY */ {HSA_SPOOKY_ID, "", "kaveri", "Spooky", 4, 16, 1, 256, 64 * Ki, 32, 0, 0 }, - /* TARGET_TONGA */ {HSA_TONGA_ID, "", "tonga", "Tonga", 4, 16, 1, 256, 64 * Ki, 32, 0, 0}, - /* TARGET_CARRIZO */ {HSA_CARRIZO_ID, "", "carrizo", "Carrizo", 4, 16, 1, 256, 64 * Ki, 32, 0, 0}, - /* TARGET_ICELAND */ {HSA_ICELAND_ID, "", "iceland", "Iceland", 4, 16, 1, 256, 64 * Ki, 32, 0, 0}, - /* TARGET_FIJI */ {HSA_FIJI_ID, "", "fiji", "Fiji", 4, 16, 1, 256, 64 * Ki, 32, 0, 0 }, - /* TARGET HAWAII */ {HSA_HAWAII_ID, "", "hawaii", "Hawaii", 4, 16, 1, 256, 64 * Ki, 32, 0, 0 }, - /* TARGET ELLESMERE */ {HSA_ELLESMERE_ID, "", "polaris10", "Ellesmere", 4, 16, 1, 256, 64 * Ki, 32, 0, 0 }, - /* TARGET BAFFIN */ {HSA_BAFFIN_ID, "", "polaris11", "Baffin", 4, 16, 1, 256, 64 * Ki, 32, 0, 0 }, - /* TARGET VEGA10 */ {HSA_VEGA10_ID, "", "gfx900", "gfx900", 4, 16, 1, 256, 64 * Ki, 32, 0, 0 } -}; - + // targetName machineTarget + /* TARGET_KAVERI_SPECTRE */ {HSA_SPECTRE_ID, "", "kaveri", "Spectre", 4, 16, 1, 256, 64 * Ki, + 32, 0, 0}, + /* TARGET_KAVERI_SPOOKY */ {HSA_SPOOKY_ID, "", "kaveri", "Spooky", 4, 16, 1, 256, 64 * Ki, 32, + 0, 0}, + /* TARGET_TONGA */ {HSA_TONGA_ID, "", "tonga", "Tonga", 4, 16, 1, 256, 64 * Ki, 32, 0, 0}, + /* TARGET_CARRIZO */ {HSA_CARRIZO_ID, "", "carrizo", "Carrizo", 4, 16, 1, 256, 64 * Ki, 32, 0, + 0}, + /* TARGET_ICELAND */ {HSA_ICELAND_ID, "", "iceland", "Iceland", 4, 16, 1, 256, 64 * Ki, 32, 0, + 0}, + /* TARGET_FIJI */ {HSA_FIJI_ID, "", "fiji", "Fiji", 4, 16, 1, 256, 64 * Ki, 32, 0, 0}, + /* TARGET HAWAII */ {HSA_HAWAII_ID, "", "hawaii", "Hawaii", 4, 16, 
1, 256, 64 * Ki, 32, 0, 0}, + /* TARGET ELLESMERE */ {HSA_ELLESMERE_ID, "", "polaris10", "Ellesmere", 4, 16, 1, 256, 64 * Ki, + 32, 0, 0}, + /* TARGET BAFFIN */ {HSA_BAFFIN_ID, "", "polaris11", "Baffin", 4, 16, 1, 256, 64 * Ki, 32, 0, + 0}, + /* TARGET VEGA10 */ {HSA_VEGA10_ID, "", "gfx900", "gfx900", 4, 16, 1, 256, 64 * Ki, 32, 0, 0}}; } #endif - diff --git a/rocclr/runtime/device/rocm/rocdevice.cpp b/rocclr/runtime/device/rocm/rocdevice.cpp index a1bad09ec9..9a123923da 100644 --- a/rocclr/runtime/device/rocm/rocdevice.cpp +++ b/rocclr/runtime/device/rocm/rocdevice.cpp @@ -20,9 +20,9 @@ #include "device/rocm/rocprogram.hpp" #if defined(WITH_LIGHTNING_COMPILER) #include "driver/AmdCompiler.h" -#else // !defined(WITH_LIGHTNING_COMPILER) +#else // !defined(WITH_LIGHTNING_COMPILER) #include "device/rocm/roccompilerlib.hpp" -#endif // !defined(WITH_LIGHTNING_COMPILER) +#endif // !defined(WITH_LIGHTNING_COMPILER) #include "device/rocm/rocmemory.hpp" #include "device/rocm/rocglinterop.hpp" #include "kv_id.h" @@ -48,1478 +48,1332 @@ extern const char* BlitSourceCode; namespace roc { amd::Device::Compiler* NullDevice::compilerHandle_; bool roc::Device::isHsaInitialized_ = false; -hsa_agent_t roc::Device::cpu_agent_ = { 0 }; +hsa_agent_t roc::Device::cpu_agent_ = {0}; std::vector roc::Device::gpu_agents_; const bool roc::Device::offlineDevice_ = false; -const bool roc::NullDevice::offlineDevice_= true; +const bool roc::NullDevice::offlineDevice_ = true; static HsaDeviceId getHsaDeviceId(hsa_agent_t device, uint32_t& pci_id) { + if (HSA_STATUS_SUCCESS != + hsa_agent_get_info(device, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CHIP_ID, &pci_id)) { + return HSA_INVALID_DEVICE_ID; + } - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info( - device, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CHIP_ID, - &pci_id)) { - return HSA_INVALID_DEVICE_ID; - } + char agent_name[64] = {0}; - char agent_name[64] = { 0 }; + if (HSA_STATUS_SUCCESS != hsa_agent_get_info(device, HSA_AGENT_INFO_NAME, agent_name)) 
{ + return HSA_INVALID_DEVICE_ID; + } - if (HSA_STATUS_SUCCESS != hsa_agent_get_info(device, HSA_AGENT_INFO_NAME, agent_name)) { - return HSA_INVALID_DEVICE_ID; - } + if (strncmp(agent_name, "gfx", 3) != 0) { + return HSA_INVALID_DEVICE_ID; + } - if (strncmp(agent_name, "gfx", 3) != 0) { - return HSA_INVALID_DEVICE_ID; - } - - uint gfxipVersion = atoi(&agent_name[3]); - switch (gfxipVersion) { - case 701: - return HSA_HAWAII_ID; - case 801: - return HSA_CARRIZO_ID; - case 802: - return HSA_TONGA_ID; - case 803: - return HSA_FIJI_ID; - case 900: - return HSA_VEGA10_ID; - default: - return HSA_INVALID_DEVICE_ID; - } + uint gfxipVersion = atoi(&agent_name[3]); + switch (gfxipVersion) { + case 701: + return HSA_HAWAII_ID; + case 801: + return HSA_CARRIZO_ID; + case 802: + return HSA_TONGA_ID; + case 803: + return HSA_FIJI_ID; + case 900: + return HSA_VEGA10_ID; + default: + return HSA_INVALID_DEVICE_ID; + } } bool NullDevice::create(const AMDDeviceInfo& deviceInfo) { - online_ = false; - deviceInfo_ = deviceInfo; - // Mark the device as GPU type - info_.type_ = CL_DEVICE_TYPE_GPU; - info_.vendorId_ = 0x1002; + online_ = false; + deviceInfo_ = deviceInfo; + // Mark the device as GPU type + info_.type_ = CL_DEVICE_TYPE_GPU; + info_.vendorId_ = 0x1002; - settings_ = new Settings(); - roc::Settings* hsaSettings = static_cast(settings_); - if ((hsaSettings == nullptr) || !hsaSettings->create(false, deviceInfo_.gfxipVersion_)) { - LogError("Error creating settings for nullptr HSA device"); - return false; - } - // Report the device name - ::strcpy(info_.name_, "AMD HSA Device"); - info_.extensions_ = getExtensionString(); - info_.maxWorkGroupSize_ = hsaSettings->maxWorkGroupSize_; - ::strcpy(info_.vendor_, "Advanced Micro Devices, Inc."); - info_.oclcVersion_ = "OpenCL C " IF(IS_LIGHTNING,OPENCL_VERSION_STR,"1.2") " "; - strcpy(info_.driverVersion_, "1.0 Provisional (hsa)"); - info_.version_ = "OpenCL " OPENCL_VERSION_STR " "; - return true; + settings_ = new Settings(); + 
roc::Settings* hsaSettings = static_cast(settings_); + if ((hsaSettings == nullptr) || !hsaSettings->create(false, deviceInfo_.gfxipVersion_)) { + LogError("Error creating settings for nullptr HSA device"); + return false; + } + // Report the device name + ::strcpy(info_.name_, "AMD HSA Device"); + info_.extensions_ = getExtensionString(); + info_.maxWorkGroupSize_ = hsaSettings->maxWorkGroupSize_; + ::strcpy(info_.vendor_, "Advanced Micro Devices, Inc."); + info_.oclcVersion_ = "OpenCL C " IF(IS_LIGHTNING, OPENCL_VERSION_STR, "1.2") " "; + strcpy(info_.driverVersion_, "1.0 Provisional (hsa)"); + info_.version_ = "OpenCL " OPENCL_VERSION_STR " "; + return true; } Device::Device(hsa_agent_t bkendDevice) - : mapCacheOps_(nullptr) - , mapCache_(nullptr) - , _bkendDevice(bkendDevice) - , gpuvm_segment_max_alloc_(0) - , alloc_granularity_(0) - , context_(nullptr) - , xferQueue_(nullptr) - , xferRead_(nullptr) - , xferWrite_(nullptr) - , numOfVgpus_(0) -{ - group_segment_.handle = 0; - system_segment_.handle = 0; - system_coarse_segment_.handle = 0; - gpuvm_segment_.handle = 0; + : mapCacheOps_(nullptr), + mapCache_(nullptr), + _bkendDevice(bkendDevice), + gpuvm_segment_max_alloc_(0), + alloc_granularity_(0), + context_(nullptr), + xferQueue_(nullptr), + xferRead_(nullptr), + xferWrite_(nullptr), + numOfVgpus_(0) { + group_segment_.handle = 0; + system_segment_.handle = 0; + system_coarse_segment_.handle = 0; + gpuvm_segment_.handle = 0; } -Device::~Device() -{ - // Release cached map targets - for (uint i = 0; mapCache_ != nullptr && i < mapCache_->size(); ++i) { - if ((*mapCache_)[i] != nullptr) { - (*mapCache_)[i]->release(); - } +Device::~Device() { + // Release cached map targets + for (uint i = 0; mapCache_ != nullptr && i < mapCache_->size(); ++i) { + if ((*mapCache_)[i] != nullptr) { + (*mapCache_)[i]->release(); } - delete mapCache_; - delete mapCacheOps_; + } + delete mapCache_; + delete mapCacheOps_; - // Destroy temporary buffers for read/write - delete 
xferRead_; - delete xferWrite_; + // Destroy temporary buffers for read/write + delete xferRead_; + delete xferWrite_; - // Destroy transfer queue - if (xferQueue_ && xferQueue_->terminate()) { - delete xferQueue_; - xferQueue_ = nullptr; - } + // Destroy transfer queue + if (xferQueue_ && xferQueue_->terminate()) { + delete xferQueue_; + xferQueue_ = nullptr; + } - if (blitProgram_) { - delete blitProgram_; - blitProgram_ = nullptr; - } + if (blitProgram_) { + delete blitProgram_; + blitProgram_ = nullptr; + } - if (context_ != nullptr) { - context_->release(); - } + if (context_ != nullptr) { + context_->release(); + } - if (info_.extensions_) { - delete[]info_.extensions_; - info_.extensions_ = nullptr; - } + if (info_.extensions_) { + delete[] info_.extensions_; + info_.extensions_ = nullptr; + } - if (settings_) { - delete settings_; - settings_ = nullptr; - } + if (settings_) { + delete settings_; + settings_ = nullptr; + } } bool NullDevice::initCompiler(bool isOffline) { #if !defined(WITH_LIGHTNING_COMPILER) - // Initializes g_complibModule and g_complibApi if they were not initialized - if( g_complibModule == nullptr ){ - if (!LoadCompLib(isOffline)) { - if (!isOffline) { - LogError("Error - could not find the compiler library"); - } - return false; - } + // Initializes g_complibModule and g_complibApi if they were not initialized + if (g_complibModule == nullptr) { + if (!LoadCompLib(isOffline)) { + if (!isOffline) { + LogError("Error - could not find the compiler library"); + } + return false; } - //Initialize the compiler handle if has already not been initialized - //This is destroyed in Device::teardown - acl_error error; - if (!compilerHandle_) { - compilerHandle_ = g_complibApi._aclCompilerInit(nullptr, &error); - if (error != ACL_SUCCESS) { - LogError("Error initializing the compiler handle"); - return false; - } + } + // Initialize the compiler handle if has already not been initialized + // This is destroyed in Device::teardown + acl_error error; 
+ if (!compilerHandle_) { + compilerHandle_ = g_complibApi._aclCompilerInit(nullptr, &error); + if (error != ACL_SUCCESS) { + LogError("Error initializing the compiler handle"); + return false; } -#endif // !defined(WITH_LIGHTNING_COMPILER) - return true; + } +#endif // !defined(WITH_LIGHTNING_COMPILER) + return true; } bool NullDevice::destroyCompiler() { #if defined(WITH_LIGHTNING_COMPILER) - delete compilerHandle_; - compilerHandle_ = nullptr; -#else // !defined(WITH_LIGHTNING_COMPILER) - if (compilerHandle_ != nullptr) { - acl_error error = g_complibApi._aclCompilerFini(compilerHandle_); - if (error != ACL_SUCCESS) { - LogError("Error closing the compiler"); - return false; - } - } - if( g_complibModule != nullptr ){ - UnloadCompLib(); - } -#endif // !defined(WITH_LIGHTNING_COMPILER) - return true; -} - -void NullDevice::tearDown() { - destroyCompiler(); -} -bool NullDevice::init() { - //Initialize the compiler - if (!initCompiler(offlineDevice_)){ - return false; - } - - // Return without initializing offline device list - return true; - -#if !defined(WITH_LIGHTNING_COMPILER) - //If there is an HSA enabled device online then skip any offline device - std::vector devices; - devices = getDevices(CL_DEVICE_TYPE_GPU, false); - - //Load the offline devices - //Iterate through the set of available offline devices - for (uint id = 0; id < sizeof(DeviceInfo)/sizeof(AMDDeviceInfo); id++) { - bool isOnline = false; - //Check if the particular device is online - for (unsigned int i=0; i< devices.size(); i++) { - if (static_cast(devices[i])->deviceInfo_.hsaDeviceId_ == - DeviceInfo[id].hsaDeviceId_){ - isOnline = true; - } - } - if (isOnline) { - continue; - } - NullDevice* nullDevice = new NullDevice(); - if (!nullDevice->create(DeviceInfo[id])) { - LogError("Error creating new instance of Device."); - delete nullDevice; - return false; - } - nullDevice->registerDevice(); - } -#endif // !defined(WITH_LIGHTNING_COMPILER) - return true; -} -NullDevice::~NullDevice() { - if 
(info_.extensions_) { - delete[]info_.extensions_; - info_.extensions_ = nullptr; - } - - if (settings_) { - delete settings_; - settings_ = nullptr; - } -} - -hsa_status_t Device::iterateAgentCallback(hsa_agent_t agent, void *data) { - hsa_device_type_t dev_type = HSA_DEVICE_TYPE_CPU; - - hsa_status_t stat = - hsa_agent_get_info( - agent, HSA_AGENT_INFO_DEVICE, &dev_type); - - if (stat != HSA_STATUS_SUCCESS) { - return stat; - } - - if (dev_type == HSA_DEVICE_TYPE_CPU) { - Device::cpu_agent_ = agent; - } - else if (dev_type == HSA_DEVICE_TYPE_GPU) { - gpu_agents_.push_back(agent); - } - - return HSA_STATUS_SUCCESS; -} - -hsa_ven_amd_loader_1_00_pfn_t -Device::amd_loader_ext_table = {nullptr}; - -hsa_status_t -Device::loaderQueryHostAddress(const void* device, const void** host) -{ - return amd_loader_ext_table.hsa_ven_amd_loader_query_host_address - ? amd_loader_ext_table.hsa_ven_amd_loader_query_host_address(device, host) - : HSA_STATUS_ERROR; -} - -Device::XferBuffers::~XferBuffers() -{ - // Destroy temporary buffer for reads - for (const auto& buf : freeBuffers_) { - delete buf; - } - freeBuffers_.clear(); -} - -bool -Device::XferBuffers::create() -{ - Memory* xferBuf = nullptr; - bool result = false; - - // Create a buffer object - xferBuf = new Buffer(dev(), bufSize_); - - // Try to allocate memory for the transfer buffer - if ((nullptr == xferBuf) || !xferBuf->create()) { - delete xferBuf; - xferBuf = nullptr; - LogError("Couldn't allocate a transfer buffer!"); - } - else { - result = true; - freeBuffers_.push_back(xferBuf); - } - - return result; -} - -Memory& -Device::XferBuffers::acquire() -{ - Memory* xferBuf = nullptr; - size_t listSize; - - // Lock the operations with the staged buffer list - amd::ScopedLock l(lock_); - listSize = freeBuffers_.size(); - - // If the list is empty, then attempt to allocate a staged buffer - if (listSize == 0) { - // Allocate memory - xferBuf = new Buffer(dev(), bufSize_); - - // Allocate memory for the transfer buffer - 
if ((nullptr == xferBuf) || !xferBuf->create()) { - delete xferBuf; - xferBuf = nullptr; - LogError("Couldn't allocate a transfer buffer!"); - } - else { - ++acquiredCnt_; - } - } - - if (xferBuf == nullptr) { - xferBuf = *(freeBuffers_.begin()); - freeBuffers_.erase(freeBuffers_.begin()); - ++acquiredCnt_; - } - - return *xferBuf; -} - -void -Device::XferBuffers::release(VirtualGPU& gpu, Memory& buffer) -{ - // Make sure buffer isn't busy on the current VirtualGPU, because - // the next aquire can come from different queue -// buffer.wait(gpu); - // Lock the operations with the staged buffer list - amd::ScopedLock l(lock_); - freeBuffers_.push_back(&buffer); - --acquiredCnt_; -} - -bool Device::init() -{ -#if defined(__linux__) - if (amd::Os::getEnvironment("HSA_ENABLE_SDMA").empty()) { - ::setenv("HSA_ENABLE_SDMA", "0", false); - } -#endif // defined (__linux__) - - LogInfo("Initializing HSA stack."); - - //Initialize the compiler - if (!initCompiler(offlineDevice_)){ - return false; - } - - if (HSA_STATUS_SUCCESS != hsa_init()) { - LogError("hsa_init failed."); - return false; - } - - hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1, - sizeof(amd_loader_ext_table), &amd_loader_ext_table); - - if (HSA_STATUS_SUCCESS != - hsa_iterate_agents(iterateAgentCallback, nullptr)) { - return false; - } - - std::vector selectedDevices; - selectedDevices.resize(gpu_agents_.size(), true); - - if (!flagIsDefault(GPU_DEVICE_ORDINAL)) { - std::fill(selectedDevices.begin(), selectedDevices.end(), false); - - std::string ordinals(GPU_DEVICE_ORDINAL); - size_t end, pos = 0; - do { - end = ordinals.find_first_of(',', pos); - size_t index = atoi(ordinals.substr(pos, end-pos).c_str()); - selectedDevices.resize(index+1); - selectedDevices[index] = true; - pos = end + 1; - } while (end != std::string::npos); - } - - size_t ordinal = 0; - for (auto agent : gpu_agents_ ) { - std::unique_ptr roc_device(new Device(agent)); - - if (!roc_device) { - LogError("Error creating 
new instance of Device on then heap."); - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - } - - uint32_t pci_id; - HsaDeviceId deviceId = getHsaDeviceId(agent, pci_id); - if (deviceId == HSA_INVALID_DEVICE_ID) { - LogPrintfError("Invalid HSA device %x", pci_id); - continue; - } - //Find device id in the table - uint id = HSA_INVALID_DEVICE_ID; - for (uint i = 0; i < sizeof(DeviceInfo) / sizeof(AMDDeviceInfo); ++i) { - if (DeviceInfo[i].hsaDeviceId_ == deviceId){ - id = i; - break; - } - } - //If the AmdDeviceInfo for the HsaDevice Id could not be found return false - if (id == HSA_INVALID_DEVICE_ID) { - LogPrintfWarning("Could not find a DeviceInfo entry for %d", deviceId); - continue; - } - roc_device->deviceInfo_ = DeviceInfo[id]; - roc_device->deviceInfo_.pciDeviceId_ = pci_id; - - // Query the agent's ISA name to fill deviceInfo.gfxipVersion_. We can't - // have a static mapping as some marketing names cover multiple gfxip. - hsa_isa_t isa = {0}; - if (hsa_agent_get_info(agent, HSA_AGENT_INFO_ISA, &isa) - != HSA_STATUS_SUCCESS) { - continue; - } - - uint32_t isaNameLength = 0; - if (hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME_LENGTH, &isaNameLength) - != HSA_STATUS_SUCCESS) { - continue; - } - - char *isaName = (char*)alloca((size_t)isaNameLength + 1); - if (hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME, isaName) - != HSA_STATUS_SUCCESS) { - continue; - } - isaName[isaNameLength] = '\0'; - - std::string str(isaName); - std::vector tokens; - size_t end, pos = 0; - do { - end = str.find_first_of(':', pos); - tokens.push_back(str.substr(pos, end-pos)); - pos = end + 1; - } while (end != std::string::npos); - - if (tokens.size() != 5 || tokens[0] != "AMD" || tokens[1] != "AMDGPU") { - LogError("Not an AMD:AMDGPU ISA name"); - continue; - } - - uint major = atoi(tokens[2].c_str()); - uint minor = atoi(tokens[3].c_str()); - uint stepping = atoi(tokens[4].c_str()); - if (minor >= 10 && stepping >= 10) { - LogError("Invalid ISA string"); - continue; - } - - 
roc_device->deviceInfo_.gfxipVersion_ = - major * 100 + minor * 10 + stepping; - - if (!roc_device->mapHSADeviceToOpenCLDevice(agent)) { - LogError("Failed mapping of HsaDevice to Device."); - continue; - } - - if (!roc_device->create()) { - LogError("Error creating new instance of Device."); - continue; - } - - if (selectedDevices[ordinal++] && (flagIsDefault(GPU_DEVICE_NAME) - || GPU_DEVICE_NAME == 0 || GPU_DEVICE_NAME[0] == '\0' - || !strcmp(GPU_DEVICE_NAME, roc_device->info_.name_))) { - roc_device.release()->registerDevice(); - } - } - - return true; -} - -void -Device::tearDown() -{ - NullDevice::tearDown(); - hsa_shut_down(); -} - -bool -Device::create() -{ - if (!amd::Device::create()) { - return false; - } - - amd::Context::Info info = {0}; - std::vector devices; - devices.push_back(this); - - // Create a dummy context - context_ = new amd::Context(devices, info); - if (context_ == nullptr) { - return false; - } - - blitProgram_ = new BlitProgram(context_); - // Create blit programs - if (blitProgram_ == nullptr || !blitProgram_->create(this)) { - delete blitProgram_; - blitProgram_ = nullptr; - LogError("Couldn't create blit kernels!"); - return false; - } - - mapCacheOps_ = new amd::Monitor("Map Cache Lock", true); - if (nullptr == mapCacheOps_) { - return false; - } - - mapCache_ = new std::vector(); - if (mapCache_ == nullptr) { - return false; - } - // Use just 1 entry by default for the map cache - mapCache_->push_back(nullptr); - - if (settings().stagedXferSize_ != 0) { - // Initialize staged write buffers - if (settings().stagedXferWrite_) { - xferWrite_ = new XferBuffers(*this, - amd::alignUp(settings().stagedXferSize_, 4 * Ki)); - if ((xferWrite_ == nullptr) || !xferWrite_->create()) { - LogError("Couldn't allocate transfer buffer objects for read"); - return false; - } - } - - // Initialize staged read buffers - if (settings().stagedXferRead_) { - xferRead_ = new XferBuffers(*this, - amd::alignUp(settings().stagedXferSize_, 4 * Ki)); - if 
((xferRead_ == nullptr) || !xferRead_->create()) { - LogError("Couldn't allocate transfer buffer objects for write"); - return false; - } - } - } - - xferQueue(); - - return true; -} - -device::Program* -NullDevice::createProgram(amd::option::Options* options) { - return new roc::HSAILProgram(*this); -} - -device::Program* -Device::createProgram(amd::option::Options* options) { - return new roc::HSAILProgram(*this); -} - -bool -Device::mapHSADeviceToOpenCLDevice(hsa_agent_t dev) -{ - if (HSA_STATUS_SUCCESS != hsa_agent_get_info(_bkendDevice, - HSA_AGENT_INFO_PROFILE, - &agent_profile_)) { - return false; - } - - // Create HSA settings - settings_ = new Settings(); - roc::Settings* hsaSettings = static_cast(settings_); - if ((hsaSettings == nullptr) || - !hsaSettings->create((agent_profile_ == HSA_PROFILE_FULL), deviceInfo_.gfxipVersion_)) { - return false; - } - - if (populateOCLDeviceConstants() == false) { - return false; - } - - // Setup System Memory to be Non-Coherent per user - // request via environment variable. 
By default the - // System Memory is setup to be Coherent - if (hsaSettings->enableNCMode_) { - hsa_status_t err = - hsa_amd_coherency_set_type(dev, HSA_AMD_COHERENCY_TYPE_NONCOHERENT); - if (err != HSA_STATUS_SUCCESS) { - LogError("Unable to set NC memory policy!"); - return false; - } - } - -#if defined(WITH_LIGHTNING_COMPILER) - // create compilation object with cache support - int gfxipMajor = deviceInfo_.gfxipVersion_ / 100; - int gfxipMinor = deviceInfo_.gfxipVersion_ / 10 % 10; - int gfxipStepping = deviceInfo_.gfxipVersion_ % 10; - - // Use compute capability as target (AMD:AMDGPU:major:minor:stepping) - // with dash as delimiter to be compatible with Windows directory name - std::ostringstream cacheTarget; - cacheTarget << "AMD-AMDGPU-" << gfxipMajor << "-" << gfxipMinor << "-" << gfxipStepping; - - amd::CacheCompilation* compObj = new amd::CacheCompilation(cacheTarget.str(), - "_rocm", - OCL_CODE_CACHE_ENABLE, - OCL_CODE_CACHE_RESET); - if (!compObj) { - LogError("Unable to create cache compilation object!"); - return false; - } - - cacheCompilation_.reset(compObj); -#endif - - return true; -} - -hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, - void* data) { - if (data == nullptr) { - return HSA_STATUS_ERROR_INVALID_ARGUMENT; - } - - hsa_region_segment_t segment_type = (hsa_region_segment_t)0; - hsa_status_t stat = - hsa_amd_memory_pool_get_info( - pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment_type); - if (stat != HSA_STATUS_SUCCESS) { - return stat; - } - - // TODO: system and device local segment - Device *dev = reinterpret_cast(data); - switch (segment_type) { - case HSA_REGION_SEGMENT_GLOBAL: { - if (dev->settings().enableLocalMemory_) { - dev->gpuvm_segment_ = pool; - } - break; - } - case HSA_REGION_SEGMENT_GROUP: - dev->group_segment_ = pool; - break; - default: - break; - } - - return HSA_STATUS_SUCCESS; -} - -hsa_status_t Device::iterateCpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, - void* data) { - if (data 
== nullptr) { - return HSA_STATUS_ERROR_INVALID_ARGUMENT; - } - - hsa_region_segment_t segment_type = (hsa_region_segment_t)0; - hsa_status_t stat = hsa_amd_memory_pool_get_info( - pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment_type); - if (stat != HSA_STATUS_SUCCESS) { - return stat; - } - - Device* dev = reinterpret_cast(data); - switch (segment_type) { - case HSA_REGION_SEGMENT_GLOBAL: { - uint32_t global_flag = 0; - hsa_status_t stat = hsa_amd_memory_pool_get_info( - pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag); - if (stat != HSA_STATUS_SUCCESS) { - return stat; - } - - if ((global_flag & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) != 0) { - dev->system_segment_ = pool; - } else { - dev->system_coarse_segment_ = pool; - } - break; - } - default: - break; - } - - return HSA_STATUS_SUCCESS; -} - -bool -Device::populateOCLDeviceConstants() -{ - info_.available_ = true; - - roc::Settings* hsa_settings = static_cast(settings_); - - int gfxipMajor = deviceInfo_.gfxipVersion_ / 100; - int gfxipMinor = deviceInfo_.gfxipVersion_ / 10 % 10; - int gfxipStepping = deviceInfo_.gfxipVersion_ % 10; - - std::ostringstream oss; - oss << "gfx" << gfxipMajor << gfxipMinor << gfxipStepping; - ::strcpy(info_.name_, oss.str().c_str()); - - char device_name[64] = { 0 }; - if (HSA_STATUS_SUCCESS == - hsa_agent_get_info( - _bkendDevice, - (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, - device_name)) { - ::strcpy(info_.boardName_, device_name); - } - - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info( - _bkendDevice, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, - &info_.maxComputeUnits_)) { - return false; - } - assert(info_.maxComputeUnits_ > 0); - - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info( - _bkendDevice, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE, - &info_.globalMemCacheLineSize_)) { - return false; - } - assert(info_.globalMemCacheLineSize_ > 0); - - uint32_t cachesize[4] = { 0 }; - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info( - 
_bkendDevice, HSA_AGENT_INFO_CACHE_SIZE, cachesize)) { - return false; - } - assert(cachesize[0] > 0); - info_.globalMemCacheSize_ = cachesize[0]; - - info_.globalMemCacheType_ = CL_READ_WRITE_CACHE; - - info_.type_ = CL_DEVICE_TYPE_GPU; - - uint32_t hsa_bdf_id = 0; - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info( - _bkendDevice, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_BDFID, &hsa_bdf_id)) { - return false; - } - - info_.deviceTopology_.pcie.type = CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD; - info_.deviceTopology_.pcie.bus = (hsa_bdf_id & (0xFF << 8)) >> 8; - info_.deviceTopology_.pcie.device = (hsa_bdf_id & (0x1F << 3)) >> 3; - info_.deviceTopology_.pcie.function = (hsa_bdf_id & 0x07); - info_.extensions_ = getExtensionString(); - info_.nativeVectorWidthDouble_ = - info_.preferredVectorWidthDouble_ = (settings().doublePrecision_) ? 1 : 0; - - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info( - _bkendDevice, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, - &info_.maxClockFrequency_)) { - return false; - } - assert(info_.maxClockFrequency_ > 0); - - if (HSA_STATUS_SUCCESS != - hsa_amd_agent_iterate_memory_pools( - cpu_agent_, Device::iterateCpuMemoryPoolCallback, this)) { + delete compilerHandle_; + compilerHandle_ = nullptr; +#else // !defined(WITH_LIGHTNING_COMPILER) + if (compilerHandle_ != nullptr) { + acl_error error = g_complibApi._aclCompilerFini(compilerHandle_); + if (error != ACL_SUCCESS) { + LogError("Error closing the compiler"); return false; } + } + if (g_complibModule != nullptr) { + UnloadCompLib(); + } +#endif // !defined(WITH_LIGHTNING_COMPILER) + return true; +} - assert(system_segment_.handle != 0); +void NullDevice::tearDown() { destroyCompiler(); } +bool NullDevice::init() { + // Initialize the compiler + if (!initCompiler(offlineDevice_)) { + return false; + } - if (HSA_STATUS_SUCCESS != - hsa_amd_agent_iterate_memory_pools( - _bkendDevice, Device::iterateGpuMemoryPoolCallback, this)) { + // Return without initializing offline device list + 
return true; + +#if !defined(WITH_LIGHTNING_COMPILER) + // If there is an HSA enabled device online then skip any offline device + std::vector devices; + devices = getDevices(CL_DEVICE_TYPE_GPU, false); + + // Load the offline devices + // Iterate through the set of available offline devices + for (uint id = 0; id < sizeof(DeviceInfo) / sizeof(AMDDeviceInfo); id++) { + bool isOnline = false; + // Check if the particular device is online + for (unsigned int i = 0; i < devices.size(); i++) { + if (static_cast(devices[i])->deviceInfo_.hsaDeviceId_ == + DeviceInfo[id].hsaDeviceId_) { + isOnline = true; + } + } + if (isOnline) { + continue; + } + NullDevice* nullDevice = new NullDevice(); + if (!nullDevice->create(DeviceInfo[id])) { + LogError("Error creating new instance of Device."); + delete nullDevice; + return false; + } + nullDevice->registerDevice(); + } +#endif // !defined(WITH_LIGHTNING_COMPILER) + return true; +} +NullDevice::~NullDevice() { + if (info_.extensions_) { + delete[] info_.extensions_; + info_.extensions_ = nullptr; + } + + if (settings_) { + delete settings_; + settings_ = nullptr; + } +} + +hsa_status_t Device::iterateAgentCallback(hsa_agent_t agent, void* data) { + hsa_device_type_t dev_type = HSA_DEVICE_TYPE_CPU; + + hsa_status_t stat = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &dev_type); + + if (stat != HSA_STATUS_SUCCESS) { + return stat; + } + + if (dev_type == HSA_DEVICE_TYPE_CPU) { + Device::cpu_agent_ = agent; + } else if (dev_type == HSA_DEVICE_TYPE_GPU) { + gpu_agents_.push_back(agent); + } + + return HSA_STATUS_SUCCESS; +} + +hsa_ven_amd_loader_1_00_pfn_t Device::amd_loader_ext_table = {nullptr}; + +hsa_status_t Device::loaderQueryHostAddress(const void* device, const void** host) { + return amd_loader_ext_table.hsa_ven_amd_loader_query_host_address + ? 
amd_loader_ext_table.hsa_ven_amd_loader_query_host_address(device, host) + : HSA_STATUS_ERROR; +} + +Device::XferBuffers::~XferBuffers() { + // Destroy temporary buffer for reads + for (const auto& buf : freeBuffers_) { + delete buf; + } + freeBuffers_.clear(); +} + +bool Device::XferBuffers::create() { + Memory* xferBuf = nullptr; + bool result = false; + + // Create a buffer object + xferBuf = new Buffer(dev(), bufSize_); + + // Try to allocate memory for the transfer buffer + if ((nullptr == xferBuf) || !xferBuf->create()) { + delete xferBuf; + xferBuf = nullptr; + LogError("Couldn't allocate a transfer buffer!"); + } else { + result = true; + freeBuffers_.push_back(xferBuf); + } + + return result; +} + +Memory& Device::XferBuffers::acquire() { + Memory* xferBuf = nullptr; + size_t listSize; + + // Lock the operations with the staged buffer list + amd::ScopedLock l(lock_); + listSize = freeBuffers_.size(); + + // If the list is empty, then attempt to allocate a staged buffer + if (listSize == 0) { + // Allocate memory + xferBuf = new Buffer(dev(), bufSize_); + + // Allocate memory for the transfer buffer + if ((nullptr == xferBuf) || !xferBuf->create()) { + delete xferBuf; + xferBuf = nullptr; + LogError("Couldn't allocate a transfer buffer!"); + } else { + ++acquiredCnt_; + } + } + + if (xferBuf == nullptr) { + xferBuf = *(freeBuffers_.begin()); + freeBuffers_.erase(freeBuffers_.begin()); + ++acquiredCnt_; + } + + return *xferBuf; +} + +void Device::XferBuffers::release(VirtualGPU& gpu, Memory& buffer) { + // Make sure buffer isn't busy on the current VirtualGPU, because + // the next aquire can come from different queue + // buffer.wait(gpu); + // Lock the operations with the staged buffer list + amd::ScopedLock l(lock_); + freeBuffers_.push_back(&buffer); + --acquiredCnt_; +} + +bool Device::init() { +#if defined(__linux__) + if (amd::Os::getEnvironment("HSA_ENABLE_SDMA").empty()) { + ::setenv("HSA_ENABLE_SDMA", "0", false); + } +#endif // defined (__linux__) 
+ + LogInfo("Initializing HSA stack."); + + // Initialize the compiler + if (!initCompiler(offlineDevice_)) { + return false; + } + + if (HSA_STATUS_SUCCESS != hsa_init()) { + LogError("hsa_init failed."); + return false; + } + + hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1, sizeof(amd_loader_ext_table), + &amd_loader_ext_table); + + if (HSA_STATUS_SUCCESS != hsa_iterate_agents(iterateAgentCallback, nullptr)) { + return false; + } + + std::vector selectedDevices; + selectedDevices.resize(gpu_agents_.size(), true); + + if (!flagIsDefault(GPU_DEVICE_ORDINAL)) { + std::fill(selectedDevices.begin(), selectedDevices.end(), false); + + std::string ordinals(GPU_DEVICE_ORDINAL); + size_t end, pos = 0; + do { + end = ordinals.find_first_of(',', pos); + size_t index = atoi(ordinals.substr(pos, end - pos).c_str()); + selectedDevices.resize(index + 1); + selectedDevices[index] = true; + pos = end + 1; + } while (end != std::string::npos); + } + + size_t ordinal = 0; + for (auto agent : gpu_agents_) { + std::unique_ptr roc_device(new Device(agent)); + + if (!roc_device) { + LogError("Error creating new instance of Device on then heap."); + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + uint32_t pci_id; + HsaDeviceId deviceId = getHsaDeviceId(agent, pci_id); + if (deviceId == HSA_INVALID_DEVICE_ID) { + LogPrintfError("Invalid HSA device %x", pci_id); + continue; + } + // Find device id in the table + uint id = HSA_INVALID_DEVICE_ID; + for (uint i = 0; i < sizeof(DeviceInfo) / sizeof(AMDDeviceInfo); ++i) { + if (DeviceInfo[i].hsaDeviceId_ == deviceId) { + id = i; + break; + } + } + // If the AmdDeviceInfo for the HsaDevice Id could not be found return false + if (id == HSA_INVALID_DEVICE_ID) { + LogPrintfWarning("Could not find a DeviceInfo entry for %d", deviceId); + continue; + } + roc_device->deviceInfo_ = DeviceInfo[id]; + roc_device->deviceInfo_.pciDeviceId_ = pci_id; + + // Query the agent's ISA name to fill deviceInfo.gfxipVersion_. 
We can't + // have a static mapping as some marketing names cover multiple gfxip. + hsa_isa_t isa = {0}; + if (hsa_agent_get_info(agent, HSA_AGENT_INFO_ISA, &isa) != HSA_STATUS_SUCCESS) { + continue; + } + + uint32_t isaNameLength = 0; + if (hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME_LENGTH, &isaNameLength) != HSA_STATUS_SUCCESS) { + continue; + } + + char* isaName = (char*)alloca((size_t)isaNameLength + 1); + if (hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME, isaName) != HSA_STATUS_SUCCESS) { + continue; + } + isaName[isaNameLength] = '\0'; + + std::string str(isaName); + std::vector tokens; + size_t end, pos = 0; + do { + end = str.find_first_of(':', pos); + tokens.push_back(str.substr(pos, end - pos)); + pos = end + 1; + } while (end != std::string::npos); + + if (tokens.size() != 5 || tokens[0] != "AMD" || tokens[1] != "AMDGPU") { + LogError("Not an AMD:AMDGPU ISA name"); + continue; + } + + uint major = atoi(tokens[2].c_str()); + uint minor = atoi(tokens[3].c_str()); + uint stepping = atoi(tokens[4].c_str()); + if (minor >= 10 && stepping >= 10) { + LogError("Invalid ISA string"); + continue; + } + + roc_device->deviceInfo_.gfxipVersion_ = major * 100 + minor * 10 + stepping; + + if (!roc_device->mapHSADeviceToOpenCLDevice(agent)) { + LogError("Failed mapping of HsaDevice to Device."); + continue; + } + + if (!roc_device->create()) { + LogError("Error creating new instance of Device."); + continue; + } + + if (selectedDevices[ordinal++] && + (flagIsDefault(GPU_DEVICE_NAME) || GPU_DEVICE_NAME == 0 || GPU_DEVICE_NAME[0] == '\0' || + !strcmp(GPU_DEVICE_NAME, roc_device->info_.name_))) { + roc_device.release()->registerDevice(); + } + } + + return true; +} + +void Device::tearDown() { + NullDevice::tearDown(); + hsa_shut_down(); +} + +bool Device::create() { + if (!amd::Device::create()) { + return false; + } + + amd::Context::Info info = {0}; + std::vector devices; + devices.push_back(this); + + // Create a dummy context + context_ = new amd::Context(devices, 
info); + if (context_ == nullptr) { + return false; + } + + blitProgram_ = new BlitProgram(context_); + // Create blit programs + if (blitProgram_ == nullptr || !blitProgram_->create(this)) { + delete blitProgram_; + blitProgram_ = nullptr; + LogError("Couldn't create blit kernels!"); + return false; + } + + mapCacheOps_ = new amd::Monitor("Map Cache Lock", true); + if (nullptr == mapCacheOps_) { + return false; + } + + mapCache_ = new std::vector(); + if (mapCache_ == nullptr) { + return false; + } + // Use just 1 entry by default for the map cache + mapCache_->push_back(nullptr); + + if (settings().stagedXferSize_ != 0) { + // Initialize staged write buffers + if (settings().stagedXferWrite_) { + xferWrite_ = new XferBuffers(*this, amd::alignUp(settings().stagedXferSize_, 4 * Ki)); + if ((xferWrite_ == nullptr) || !xferWrite_->create()) { + LogError("Couldn't allocate transfer buffer objects for read"); return false; - } - - assert(group_segment_.handle != 0); - - size_t group_segment_size = 0; - if (HSA_STATUS_SUCCESS != - hsa_amd_memory_pool_get_info( - group_segment_, HSA_AMD_MEMORY_POOL_INFO_SIZE, &group_segment_size)) { - return false; - } - assert(group_segment_size > 0); - - info_.localMemSizePerCU_ = group_segment_size; - info_.localMemSize_ = group_segment_size; - - info_.maxWorkItemDimensions_ = 3; - - if (settings().enableLocalMemory_ && gpuvm_segment_.handle != 0) { - size_t global_segment_size = 0; - if (HSA_STATUS_SUCCESS != - hsa_amd_memory_pool_get_info(gpuvm_segment_, - HSA_AMD_MEMORY_POOL_INFO_SIZE, - &global_segment_size)) { - return false; - } - - assert(global_segment_size > 0); - info_.globalMemSize_ = static_cast(global_segment_size); - - gpuvm_segment_max_alloc_ = - cl_ulong(info_.globalMemSize_ * - std::min(GPU_SINGLE_ALLOC_PERCENT, 100u) / 100u); - assert(gpuvm_segment_max_alloc_ > 0); - - info_.maxMemAllocSize_ = - static_cast(gpuvm_segment_max_alloc_); - - if (HSA_STATUS_SUCCESS != - hsa_amd_memory_pool_get_info(gpuvm_segment_, - 
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, - &alloc_granularity_)) { - return false; - } - - assert(alloc_granularity_ > 0); - } - else { - static const cl_ulong kDefaultGlobalMemSize = cl_ulong(1 * Gi); - info_.globalMemSize_ = kDefaultGlobalMemSize; - info_.maxMemAllocSize_ = info_.globalMemSize_ / 4; - - if (HSA_STATUS_SUCCESS != - hsa_amd_memory_pool_get_info(system_segment_, - HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, - &alloc_granularity_)) { - return false; } } - // Make sure the max allocation size is not larger than the available - // memory size. - info_.maxMemAllocSize_ = - std::min(info_.maxMemAllocSize_, info_.globalMemSize_); - - /*make sure we don't run anything over 8 params for now*/ - info_.maxParameterSize_ = 1024; // [TODO]: CAL stack values: 1024* - // constant - - uint32_t max_work_group_size = 0; - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info( - _bkendDevice, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &max_work_group_size)) { + // Initialize staged read buffers + if (settings().stagedXferRead_) { + xferRead_ = new XferBuffers(*this, amd::alignUp(settings().stagedXferSize_, 4 * Ki)); + if ((xferRead_ == nullptr) || !xferRead_->create()) { + LogError("Couldn't allocate transfer buffer objects for write"); return false; + } } - assert(max_work_group_size > 0); - max_work_group_size = std::min(max_work_group_size, - static_cast(settings().maxWorkGroupSize_)); - info_.maxWorkGroupSize_ = max_work_group_size; + } - uint16_t max_workgroup_size[3] = { 0, 0, 0 }; - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info( - _bkendDevice, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, &max_workgroup_size)) { - return false; + xferQueue(); + + return true; +} + +device::Program* NullDevice::createProgram(amd::option::Options* options) { + return new roc::HSAILProgram(*this); +} + +device::Program* Device::createProgram(amd::option::Options* options) { + return new roc::HSAILProgram(*this); +} + +bool Device::mapHSADeviceToOpenCLDevice(hsa_agent_t dev) { + if 
(HSA_STATUS_SUCCESS != + hsa_agent_get_info(_bkendDevice, HSA_AGENT_INFO_PROFILE, &agent_profile_)) { + return false; + } + + // Create HSA settings + settings_ = new Settings(); + roc::Settings* hsaSettings = static_cast(settings_); + if ((hsaSettings == nullptr) || + !hsaSettings->create((agent_profile_ == HSA_PROFILE_FULL), deviceInfo_.gfxipVersion_)) { + return false; + } + + if (populateOCLDeviceConstants() == false) { + return false; + } + + // Setup System Memory to be Non-Coherent per user + // request via environment variable. By default the + // System Memory is setup to be Coherent + if (hsaSettings->enableNCMode_) { + hsa_status_t err = hsa_amd_coherency_set_type(dev, HSA_AMD_COHERENCY_TYPE_NONCOHERENT); + if (err != HSA_STATUS_SUCCESS) { + LogError("Unable to set NC memory policy!"); + return false; } - assert(max_workgroup_size[0] != 0 && max_workgroup_size[1] != 0 && - max_workgroup_size[2] != 0); + } - uint16_t max_work_item_size = static_cast(max_work_group_size); - info_.maxWorkItemSizes_[0] = std::min(max_workgroup_size[0], max_work_item_size); - info_.maxWorkItemSizes_[1] = std::min(max_workgroup_size[1], max_work_item_size); - info_.maxWorkItemSizes_[2] = std::min(max_workgroup_size[2], max_work_item_size); +#if defined(WITH_LIGHTNING_COMPILER) + // create compilation object with cache support + int gfxipMajor = deviceInfo_.gfxipVersion_ / 100; + int gfxipMinor = deviceInfo_.gfxipVersion_ / 10 % 10; + int gfxipStepping = deviceInfo_.gfxipVersion_ % 10; - info_.nativeVectorWidthChar_ = info_.preferredVectorWidthChar_ = 4; - info_.nativeVectorWidthShort_ = info_.preferredVectorWidthShort_ = 2; - info_.nativeVectorWidthInt_ = info_.preferredVectorWidthInt_ = 1; - info_.nativeVectorWidthLong_ = info_.preferredVectorWidthLong_ = 1; - info_.nativeVectorWidthFloat_ = info_.preferredVectorWidthFloat_ = 1; + // Use compute capability as target (AMD:AMDGPU:major:minor:stepping) + // with dash as delimiter to be compatible with Windows directory name + 
std::ostringstream cacheTarget; + cacheTarget << "AMD-AMDGPU-" << gfxipMajor << "-" << gfxipMinor << "-" << gfxipStepping; - if (agent_profile_ == HSA_PROFILE_FULL) { // full-profile = participating in coherent memory, - // base-profile = NUMA based non-coherent memory - info_.hostUnifiedMemory_ = CL_TRUE; + amd::CacheCompilation* compObj = new amd::CacheCompilation( + cacheTarget.str(), "_rocm", OCL_CODE_CACHE_ENABLE, OCL_CODE_CACHE_RESET); + if (!compObj) { + LogError("Unable to create cache compilation object!"); + return false; + } + + cacheCompilation_.reset(compObj); +#endif + + return true; +} + +hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, void* data) { + if (data == nullptr) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + hsa_region_segment_t segment_type = (hsa_region_segment_t)0; + hsa_status_t stat = + hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment_type); + if (stat != HSA_STATUS_SUCCESS) { + return stat; + } + + // TODO: system and device local segment + Device* dev = reinterpret_cast(data); + switch (segment_type) { + case HSA_REGION_SEGMENT_GLOBAL: { + if (dev->settings().enableLocalMemory_) { + dev->gpuvm_segment_ = pool; + } + break; } - info_.memBaseAddrAlign_ = 8 * (flagIsDefault(MEMOBJ_BASE_ADDR_ALIGN) ? 
- sizeof(cl_long16) : MEMOBJ_BASE_ADDR_ALIGN); - info_.minDataTypeAlignSize_ = sizeof(cl_long16); + case HSA_REGION_SEGMENT_GROUP: + dev->group_segment_ = pool; + break; + default: + break; + } - info_.maxConstantArgs_ = 8; - info_.maxConstantBufferSize_ = info_.maxMemAllocSize_; - info_.localMemType_ = CL_LOCAL; - info_.errorCorrectionSupport_ = false; - info_.profilingTimerResolution_ = 1; - info_.littleEndian_ = true; - info_.compilerAvailable_ = true; - info_.executionCapabilities_ = CL_EXEC_KERNEL; - info_.queueProperties_ = CL_QUEUE_PROFILING_ENABLE; - info_.platform_ = AMD_PLATFORM; - info_.profile_ = "FULL_PROFILE"; - strcpy(info_.vendor_, "Advanced Micro Devices, Inc."); + return HSA_STATUS_SUCCESS; +} - info_.addressBits_ = LP64_SWITCH(32, 64); - info_.maxSamplers_ = 16; - info_.bufferFromImageSupport_ = CL_FALSE; - info_.oclcVersion_ = "OpenCL C " IF(IS_LIGHTNING,OPENCL_VERSION_STR,"1.2") " "; +hsa_status_t Device::iterateCpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, void* data) { + if (data == nullptr) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } - uint16_t major, minor; - if (hsa_agent_get_info(_bkendDevice, HSA_AGENT_INFO_VERSION_MAJOR, &major) - != HSA_STATUS_SUCCESS - || hsa_agent_get_info(_bkendDevice, HSA_AGENT_INFO_VERSION_MINOR, &minor) - != HSA_STATUS_SUCCESS) { - return false; + hsa_region_segment_t segment_type = (hsa_region_segment_t)0; + hsa_status_t stat = + hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment_type); + if (stat != HSA_STATUS_SUCCESS) { + return stat; + } + + Device* dev = reinterpret_cast(data); + switch (segment_type) { + case HSA_REGION_SEGMENT_GLOBAL: { + uint32_t global_flag = 0; + hsa_status_t stat = + hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag); + if (stat != HSA_STATUS_SUCCESS) { + return stat; + } + + if ((global_flag & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) != 0) { + dev->system_segment_ = pool; + } else { + dev->system_coarse_segment_ = 
pool; + } + break; } - std::stringstream ss; - ss << major << "." << minor << " (HSA," IF(IS_LIGHTNING,"LC","HSAIL") ")"; + default: + break; + } - strcpy(info_.driverVersion_, ss.str().c_str()); - info_.version_ = "OpenCL " /*OPENCL_VERSION_STR*/"1.2" " "; + return HSA_STATUS_SUCCESS; +} - info_.builtInKernels_ = ""; - info_.linkerAvailable_ = true; - info_.preferredInteropUserSync_ = true; - info_.printfBufferSize_ = PrintfDbg::WorkitemDebugSize * info().maxWorkGroupSize_; - info_.vendorId_ = 0x1002; // AMD's PCIe vendor id +bool Device::populateOCLDeviceConstants() { + info_.available_ = true; - info_.maxGlobalVariableSize_ = static_cast(info_.maxMemAllocSize_); - info_.globalVariablePreferredTotalSize_ = - static_cast(info_.globalMemSize_); + roc::Settings* hsa_settings = static_cast(settings_); - // Populate the single config setting. - info_.singleFPConfig_ = CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO | - CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_FMA; + int gfxipMajor = deviceInfo_.gfxipVersion_ / 100; + int gfxipMinor = deviceInfo_.gfxipVersion_ / 10 % 10; + int gfxipStepping = deviceInfo_.gfxipVersion_ % 10; - if (hsa_settings->doublePrecision_) { - info_.doubleFPConfig_ = info_.singleFPConfig_ | CL_FP_DENORM; - info_.singleFPConfig_ |= CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT; - } + std::ostringstream oss; + oss << "gfx" << gfxipMajor << gfxipMinor << gfxipStepping; + ::strcpy(info_.name_, oss.str().c_str()); - if (hsa_settings->singleFpDenorm_) { - info_.singleFPConfig_ |= CL_FP_DENORM; - } + char device_name[64] = {0}; + if (HSA_STATUS_SUCCESS == hsa_agent_get_info(_bkendDevice, + (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, + device_name)) { + ::strcpy(info_.boardName_, device_name); + } - info_.preferredPlatformAtomicAlignment_ = 0; - info_.preferredGlobalAtomicAlignment_ = 0; - info_.preferredLocalAtomicAlignment_ = 0; + if (HSA_STATUS_SUCCESS != + hsa_agent_get_info(_bkendDevice, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, + 
&info_.maxComputeUnits_)) { + return false; + } + assert(info_.maxComputeUnits_ > 0); - uint8_t hsa_extensions[128]; - if (HSA_STATUS_SUCCESS != hsa_agent_get_info(_bkendDevice, - HSA_AGENT_INFO_EXTENSIONS, - hsa_extensions)) { + if (HSA_STATUS_SUCCESS != hsa_agent_get_info(_bkendDevice, + (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE, + &info_.globalMemCacheLineSize_)) { + return false; + } + assert(info_.globalMemCacheLineSize_ > 0); + + uint32_t cachesize[4] = {0}; + if (HSA_STATUS_SUCCESS != + hsa_agent_get_info(_bkendDevice, HSA_AGENT_INFO_CACHE_SIZE, cachesize)) { + return false; + } + assert(cachesize[0] > 0); + info_.globalMemCacheSize_ = cachesize[0]; + + info_.globalMemCacheType_ = CL_READ_WRITE_CACHE; + + info_.type_ = CL_DEVICE_TYPE_GPU; + + uint32_t hsa_bdf_id = 0; + if (HSA_STATUS_SUCCESS != + hsa_agent_get_info(_bkendDevice, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_BDFID, &hsa_bdf_id)) { + return false; + } + + info_.deviceTopology_.pcie.type = CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD; + info_.deviceTopology_.pcie.bus = (hsa_bdf_id & (0xFF << 8)) >> 8; + info_.deviceTopology_.pcie.device = (hsa_bdf_id & (0x1F << 3)) >> 3; + info_.deviceTopology_.pcie.function = (hsa_bdf_id & 0x07); + info_.extensions_ = getExtensionString(); + info_.nativeVectorWidthDouble_ = info_.preferredVectorWidthDouble_ = + (settings().doublePrecision_) ? 
1 : 0; + + if (HSA_STATUS_SUCCESS != + hsa_agent_get_info(_bkendDevice, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, + &info_.maxClockFrequency_)) { + return false; + } + assert(info_.maxClockFrequency_ > 0); + + if (HSA_STATUS_SUCCESS != + hsa_amd_agent_iterate_memory_pools(cpu_agent_, Device::iterateCpuMemoryPoolCallback, this)) { + return false; + } + + assert(system_segment_.handle != 0); + + if (HSA_STATUS_SUCCESS != hsa_amd_agent_iterate_memory_pools( + _bkendDevice, Device::iterateGpuMemoryPoolCallback, this)) { + return false; + } + + assert(group_segment_.handle != 0); + + size_t group_segment_size = 0; + if (HSA_STATUS_SUCCESS != hsa_amd_memory_pool_get_info(group_segment_, + HSA_AMD_MEMORY_POOL_INFO_SIZE, + &group_segment_size)) { + return false; + } + assert(group_segment_size > 0); + + info_.localMemSizePerCU_ = group_segment_size; + info_.localMemSize_ = group_segment_size; + + info_.maxWorkItemDimensions_ = 3; + + if (settings().enableLocalMemory_ && gpuvm_segment_.handle != 0) { + size_t global_segment_size = 0; + if (HSA_STATUS_SUCCESS != hsa_amd_memory_pool_get_info(gpuvm_segment_, + HSA_AMD_MEMORY_POOL_INFO_SIZE, + &global_segment_size)) { return false; } - assert(HSA_EXTENSION_IMAGES < 8); - const bool image_is_supported = - ((hsa_extensions[0] & (1 << HSA_EXTENSION_IMAGES)) != 0); - if (image_is_supported) { - // Images - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info(_bkendDevice, - static_cast( - HSA_EXT_AGENT_INFO_MAX_SAMPLER_HANDLERS), - &info_.maxSamplers_)) { - return false; - } + assert(global_segment_size > 0); + info_.globalMemSize_ = static_cast(global_segment_size); - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info(_bkendDevice, - static_cast( - HSA_EXT_AGENT_INFO_MAX_IMAGE_RD_HANDLES), - &info_.maxReadImageArgs_)) { - return false; - } + gpuvm_segment_max_alloc_ = + cl_ulong(info_.globalMemSize_ * std::min(GPU_SINGLE_ALLOC_PERCENT, 100u) / 100u); + assert(gpuvm_segment_max_alloc_ > 0); - // TODO: no attribute for write 
image. - info_.maxWriteImageArgs_ = 8; + info_.maxMemAllocSize_ = static_cast(gpuvm_segment_max_alloc_); - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info(_bkendDevice, - static_cast( - HSA_EXT_AGENT_INFO_MAX_IMAGE_RORW_HANDLES), - &info_.maxReadWriteImageArgs_)) { - return false; - } - - uint32_t image_max_dim[3]; - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info(_bkendDevice, - static_cast( - HSA_EXT_AGENT_INFO_IMAGE_2D_MAX_ELEMENTS), - &image_max_dim)) { - return false; - } - - info_.image2DMaxWidth_ = image_max_dim[0]; - info_.image2DMaxHeight_ = image_max_dim[1]; - - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info(_bkendDevice, - static_cast( - HSA_EXT_AGENT_INFO_IMAGE_3D_MAX_ELEMENTS), - &image_max_dim)) { - return false; - } - - info_.image3DMaxWidth_ = image_max_dim[0]; - info_.image3DMaxHeight_ = image_max_dim[1]; - info_.image3DMaxDepth_ = image_max_dim[2]; - - uint32_t max_array_size = 0; - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info(_bkendDevice, - static_cast( - HSA_EXT_AGENT_INFO_IMAGE_ARRAY_MAX_LAYERS), - &max_array_size)) { - return false; - } - - info_.imageMaxArraySize_ = max_array_size; - - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info(_bkendDevice, - static_cast( - HSA_EXT_AGENT_INFO_IMAGE_1DB_MAX_ELEMENTS), - &image_max_dim)) { - return false; - } - info_.imageMaxBufferSize_ = image_max_dim[0]; - - info_.imagePitchAlignment_ = 256; - - info_.imageBaseAddressAlignment_ = 256; - - info_.bufferFromImageSupport_ = CL_FALSE; - - info_.imageSupport_ = - (info_.maxReadWriteImageArgs_ > 0) ? CL_TRUE : CL_FALSE; + if (HSA_STATUS_SUCCESS != + hsa_amd_memory_pool_get_info(gpuvm_segment_, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, + &alloc_granularity_)) { + return false; } - // Enable SVM Capabilities of Hsa device. 
Ensure - // user has not setup memory to be non-coherent - info_.svmCapabilities_ = 0; - if (hsa_settings->enableNCMode_ == false) { - info_.svmCapabilities_ = CL_DEVICE_SVM_COARSE_GRAIN_BUFFER; - info_.svmCapabilities_ |= CL_DEVICE_SVM_FINE_GRAIN_BUFFER; - // Report fine-grain system only on full profile - if (agent_profile_ == HSA_PROFILE_FULL) { - info_.svmCapabilities_ |= CL_DEVICE_SVM_FINE_GRAIN_SYSTEM; - } -#if !defined(WITH_LIGHTNING_COMPILER) - // Report atomics capability based on GFX IP, control on Hawaii - // and Vega10. - if (info_.hostUnifiedMemory_ || ((deviceInfo_.gfxipVersion_ >= 800) && - (deviceInfo_.gfxipVersion_ < 900))) { - info_.svmCapabilities_ |= CL_DEVICE_SVM_ATOMICS; - } -#endif // !defined(WITH_LIGHTNING_COMPILER) + assert(alloc_granularity_ > 0); + } else { + static const cl_ulong kDefaultGlobalMemSize = cl_ulong(1 * Gi); + info_.globalMemSize_ = kDefaultGlobalMemSize; + info_.maxMemAllocSize_ = info_.globalMemSize_ / 4; + + if (HSA_STATUS_SUCCESS != + hsa_amd_memory_pool_get_info( + system_segment_, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &alloc_granularity_)) { + return false; + } + } + + // Make sure the max allocation size is not larger than the available + // memory size. 
+ info_.maxMemAllocSize_ = std::min(info_.maxMemAllocSize_, info_.globalMemSize_); + + /*make sure we don't run anything over 8 params for now*/ + info_.maxParameterSize_ = 1024; // [TODO]: CAL stack values: 1024* + // constant + + uint32_t max_work_group_size = 0; + if (HSA_STATUS_SUCCESS != + hsa_agent_get_info(_bkendDevice, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &max_work_group_size)) { + return false; + } + assert(max_work_group_size > 0); + max_work_group_size = + std::min(max_work_group_size, static_cast(settings().maxWorkGroupSize_)); + info_.maxWorkGroupSize_ = max_work_group_size; + + uint16_t max_workgroup_size[3] = {0, 0, 0}; + if (HSA_STATUS_SUCCESS != + hsa_agent_get_info(_bkendDevice, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, &max_workgroup_size)) { + return false; + } + assert(max_workgroup_size[0] != 0 && max_workgroup_size[1] != 0 && max_workgroup_size[2] != 0); + + uint16_t max_work_item_size = static_cast(max_work_group_size); + info_.maxWorkItemSizes_[0] = std::min(max_workgroup_size[0], max_work_item_size); + info_.maxWorkItemSizes_[1] = std::min(max_workgroup_size[1], max_work_item_size); + info_.maxWorkItemSizes_[2] = std::min(max_workgroup_size[2], max_work_item_size); + + info_.nativeVectorWidthChar_ = info_.preferredVectorWidthChar_ = 4; + info_.nativeVectorWidthShort_ = info_.preferredVectorWidthShort_ = 2; + info_.nativeVectorWidthInt_ = info_.preferredVectorWidthInt_ = 1; + info_.nativeVectorWidthLong_ = info_.preferredVectorWidthLong_ = 1; + info_.nativeVectorWidthFloat_ = info_.preferredVectorWidthFloat_ = 1; + + if (agent_profile_ == HSA_PROFILE_FULL) { // full-profile = participating in coherent memory, + // base-profile = NUMA based non-coherent memory + info_.hostUnifiedMemory_ = CL_TRUE; + } + info_.memBaseAddrAlign_ = + 8 * (flagIsDefault(MEMOBJ_BASE_ADDR_ALIGN) ? 
sizeof(cl_long16) : MEMOBJ_BASE_ADDR_ALIGN); + info_.minDataTypeAlignSize_ = sizeof(cl_long16); + + info_.maxConstantArgs_ = 8; + info_.maxConstantBufferSize_ = info_.maxMemAllocSize_; + info_.localMemType_ = CL_LOCAL; + info_.errorCorrectionSupport_ = false; + info_.profilingTimerResolution_ = 1; + info_.littleEndian_ = true; + info_.compilerAvailable_ = true; + info_.executionCapabilities_ = CL_EXEC_KERNEL; + info_.queueProperties_ = CL_QUEUE_PROFILING_ENABLE; + info_.platform_ = AMD_PLATFORM; + info_.profile_ = "FULL_PROFILE"; + strcpy(info_.vendor_, "Advanced Micro Devices, Inc."); + + info_.addressBits_ = LP64_SWITCH(32, 64); + info_.maxSamplers_ = 16; + info_.bufferFromImageSupport_ = CL_FALSE; + info_.oclcVersion_ = "OpenCL C " IF(IS_LIGHTNING, OPENCL_VERSION_STR, "1.2") " "; + + uint16_t major, minor; + if (hsa_agent_get_info(_bkendDevice, HSA_AGENT_INFO_VERSION_MAJOR, &major) != + HSA_STATUS_SUCCESS || + hsa_agent_get_info(_bkendDevice, HSA_AGENT_INFO_VERSION_MINOR, &minor) != + HSA_STATUS_SUCCESS) { + return false; + } + std::stringstream ss; + ss << major << "." << minor << " (HSA," IF(IS_LIGHTNING, "LC", "HSAIL") ")"; + + strcpy(info_.driverVersion_, ss.str().c_str()); + info_.version_ = "OpenCL " /*OPENCL_VERSION_STR*/"1.2" " "; + + info_.builtInKernels_ = ""; + info_.linkerAvailable_ = true; + info_.preferredInteropUserSync_ = true; + info_.printfBufferSize_ = PrintfDbg::WorkitemDebugSize * info().maxWorkGroupSize_; + info_.vendorId_ = 0x1002; // AMD's PCIe vendor id + + info_.maxGlobalVariableSize_ = static_cast(info_.maxMemAllocSize_); + info_.globalVariablePreferredTotalSize_ = static_cast(info_.globalMemSize_); + + // Populate the single config setting. 
+ info_.singleFPConfig_ = + CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_FMA; + + if (hsa_settings->doublePrecision_) { + info_.doubleFPConfig_ = info_.singleFPConfig_ | CL_FP_DENORM; + info_.singleFPConfig_ |= CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT; + } + + if (hsa_settings->singleFpDenorm_) { + info_.singleFPConfig_ |= CL_FP_DENORM; + } + + info_.preferredPlatformAtomicAlignment_ = 0; + info_.preferredGlobalAtomicAlignment_ = 0; + info_.preferredLocalAtomicAlignment_ = 0; + + uint8_t hsa_extensions[128]; + if (HSA_STATUS_SUCCESS != + hsa_agent_get_info(_bkendDevice, HSA_AGENT_INFO_EXTENSIONS, hsa_extensions)) { + return false; + } + + assert(HSA_EXTENSION_IMAGES < 8); + const bool image_is_supported = ((hsa_extensions[0] & (1 << HSA_EXTENSION_IMAGES)) != 0); + if (image_is_supported) { + // Images + if (HSA_STATUS_SUCCESS != + hsa_agent_get_info(_bkendDevice, + static_cast(HSA_EXT_AGENT_INFO_MAX_SAMPLER_HANDLERS), + &info_.maxSamplers_)) { + return false; } if (HSA_STATUS_SUCCESS != hsa_agent_get_info(_bkendDevice, - HSA_AGENT_INFO_WAVEFRONT_SIZE, - &info_.wavefrontWidth_)) { - return false; + static_cast(HSA_EXT_AGENT_INFO_MAX_IMAGE_RD_HANDLES), + &info_.maxReadImageArgs_)) { + return false; } - return true; -} + // TODO: no attribute for write image. + info_.maxWriteImageArgs_ = 8; -device::VirtualDevice* -Device::createVirtualDevice(amd::CommandQueue* queue) -{ - bool profiling = (queue != nullptr) && - queue->properties().test(CL_QUEUE_PROFILING_ENABLE); - - // Initialization of heap and other resources occur during the command - // queue creation time. 
- VirtualGPU *virtualDevice = new VirtualGPU(*this); - - if (!virtualDevice->create(profiling)) { - delete virtualDevice; - return nullptr; + if (HSA_STATUS_SUCCESS != + hsa_agent_get_info(_bkendDevice, + static_cast(HSA_EXT_AGENT_INFO_MAX_IMAGE_RORW_HANDLES), + &info_.maxReadWriteImageArgs_)) { + return false; } - if (profiling) { - hsa_amd_profiling_set_profiler_enabled(virtualDevice->gpu_queue(), 1); + uint32_t image_max_dim[3]; + if (HSA_STATUS_SUCCESS != + hsa_agent_get_info(_bkendDevice, + static_cast(HSA_EXT_AGENT_INFO_IMAGE_2D_MAX_ELEMENTS), + &image_max_dim)) { + return false; } - return virtualDevice; -} + info_.image2DMaxWidth_ = image_max_dim[0]; + info_.image2DMaxHeight_ = image_max_dim[1]; -bool -Device::globalFreeMemory(size_t *freeMemory) const -{ + if (HSA_STATUS_SUCCESS != + hsa_agent_get_info(_bkendDevice, + static_cast(HSA_EXT_AGENT_INFO_IMAGE_3D_MAX_ELEMENTS), + &image_max_dim)) { + return false; + } + + info_.image3DMaxWidth_ = image_max_dim[0]; + info_.image3DMaxHeight_ = image_max_dim[1]; + info_.image3DMaxDepth_ = image_max_dim[2]; + + uint32_t max_array_size = 0; + if (HSA_STATUS_SUCCESS != + hsa_agent_get_info(_bkendDevice, + static_cast(HSA_EXT_AGENT_INFO_IMAGE_ARRAY_MAX_LAYERS), + &max_array_size)) { + return false; + } + + info_.imageMaxArraySize_ = max_array_size; + + if (HSA_STATUS_SUCCESS != + hsa_agent_get_info(_bkendDevice, + static_cast(HSA_EXT_AGENT_INFO_IMAGE_1DB_MAX_ELEMENTS), + &image_max_dim)) { + return false; + } + info_.imageMaxBufferSize_ = image_max_dim[0]; + + info_.imagePitchAlignment_ = 256; + + info_.imageBaseAddressAlignment_ = 256; + + info_.bufferFromImageSupport_ = CL_FALSE; + + info_.imageSupport_ = (info_.maxReadWriteImageArgs_ > 0) ? CL_TRUE : CL_FALSE; + } + + // Enable SVM Capabilities of Hsa device. 
Ensure + // user has not setup memory to be non-coherent + info_.svmCapabilities_ = 0; + if (hsa_settings->enableNCMode_ == false) { + info_.svmCapabilities_ = CL_DEVICE_SVM_COARSE_GRAIN_BUFFER; + info_.svmCapabilities_ |= CL_DEVICE_SVM_FINE_GRAIN_BUFFER; + // Report fine-grain system only on full profile + if (agent_profile_ == HSA_PROFILE_FULL) { + info_.svmCapabilities_ |= CL_DEVICE_SVM_FINE_GRAIN_SYSTEM; + } +#if !defined(WITH_LIGHTNING_COMPILER) + // Report atomics capability based on GFX IP, control on Hawaii + // and Vega10. + if (info_.hostUnifiedMemory_ || + ((deviceInfo_.gfxipVersion_ >= 800) && (deviceInfo_.gfxipVersion_ < 900))) { + info_.svmCapabilities_ |= CL_DEVICE_SVM_ATOMICS; + } +#endif // !defined(WITH_LIGHTNING_COMPILER) + } + + if (HSA_STATUS_SUCCESS != + hsa_agent_get_info(_bkendDevice, HSA_AGENT_INFO_WAVEFRONT_SIZE, &info_.wavefrontWidth_)) { return false; + } + + return true; } -bool -Device::bindExternalDevice( - uint flags, - void* const gfxDevice[], - void* gfxContext, - bool validateOnly) -{ +device::VirtualDevice* Device::createVirtualDevice(amd::CommandQueue* queue) { + bool profiling = (queue != nullptr) && queue->properties().test(CL_QUEUE_PROFILING_ENABLE); + + // Initialization of heap and other resources occur during the command + // queue creation time. 
+ VirtualGPU* virtualDevice = new VirtualGPU(*this); + + if (!virtualDevice->create(profiling)) { + delete virtualDevice; + return nullptr; + } + + if (profiling) { + hsa_amd_profiling_set_profiler_enabled(virtualDevice->gpu_queue(), 1); + } + + return virtualDevice; +} + +bool Device::globalFreeMemory(size_t* freeMemory) const { return false; } + +bool Device::bindExternalDevice(uint flags, void* const gfxDevice[], void* gfxContext, + bool validateOnly) { #if defined(_WIN32) return false; #else - if((flags&amd::Context::GLDeviceKhr)==0) - return false; + if ((flags & amd::Context::GLDeviceKhr) == 0) return false; - MesaInterop::MESA_INTEROP_KIND kind=MesaInterop::MESA_INTEROP_NONE; + MesaInterop::MESA_INTEROP_KIND kind = MesaInterop::MESA_INTEROP_NONE; MesaInterop::DisplayHandle display; MesaInterop::ContextHandle context; - if((flags&amd::Context::EGLDeviceKhr)!=0) - { - kind=MesaInterop::MESA_INTEROP_EGL; - display.eglDisplay=reinterpret_cast(gfxDevice[amd::Context::GLDeviceKhrIdx]); - context.eglContext=reinterpret_cast(gfxContext); - } - else - { - kind=MesaInterop::MESA_INTEROP_GLX; - display.glxDisplay=reinterpret_cast(gfxDevice[amd::Context::GLDeviceKhrIdx]); - context.glxContext=reinterpret_cast(gfxContext); + if ((flags & amd::Context::EGLDeviceKhr) != 0) { + kind = MesaInterop::MESA_INTEROP_EGL; + display.eglDisplay = reinterpret_cast(gfxDevice[amd::Context::GLDeviceKhrIdx]); + context.eglContext = reinterpret_cast(gfxContext); + } else { + kind = MesaInterop::MESA_INTEROP_GLX; + display.glxDisplay = reinterpret_cast(gfxDevice[amd::Context::GLDeviceKhrIdx]); + context.glxContext = reinterpret_cast(gfxContext); } mesa_glinterop_device_info info; - info.size=sizeof(mesa_glinterop_device_info); + info.size = sizeof(mesa_glinterop_device_info); MesaInterop temp; - if(!temp.Bind(kind, display, context)) - { + if (!temp.Bind(kind, display, context)) { assert(false && "Failed mesa interop bind."); return false; } - if(!temp.GetInfo(info)) - { + if 
(!temp.GetInfo(info)) { assert(false && "Failed to get mesa interop device info."); return false; } - bool match=true; - match &= info_.deviceTopology_.pcie.bus==info.pci_bus; - match &= info_.deviceTopology_.pcie.device==info.pci_device; - match &= info_.deviceTopology_.pcie.function==info.pci_function; - match &= info_.vendorId_==info.vendor_id; - match &= deviceInfo_.pciDeviceId_==info.device_id; + bool match = true; + match &= info_.deviceTopology_.pcie.bus == info.pci_bus; + match &= info_.deviceTopology_.pcie.device == info.pci_device; + match &= info_.deviceTopology_.pcie.function == info.pci_function; + match &= info_.vendorId_ == info.vendor_id; + match &= deviceInfo_.pciDeviceId_ == info.device_id; - if(!validateOnly) - mesa_=temp; + if (!validateOnly) mesa_ = temp; return match; #endif } -bool -Device::unbindExternalDevice( - uint flags, - void* const gfxDevice[], - void* gfxContext, - bool validateOnly) -{ +bool Device::unbindExternalDevice(uint flags, void* const gfxDevice[], void* gfxContext, + bool validateOnly) { #if defined(_WIN32) - return false; + return false; #else - if ((flags&amd::Context::GLDeviceKhr)==0) - return false; - if(!validateOnly) - mesa_.Unbind(); + if ((flags & amd::Context::GLDeviceKhr) == 0) return false; + if (!validateOnly) mesa_.Unbind(); return true; #endif } -amd::Memory* -Device::findMapTarget(size_t size) const -{ - // Must be serialised for access - amd::ScopedLock lk(*mapCacheOps_); +amd::Memory* Device::findMapTarget(size_t size) const { + // Must be serialised for access + amd::ScopedLock lk(*mapCacheOps_); - amd::Memory* map = nullptr; - size_t minSize = 0; - size_t maxSize = 0; - uint mapId = mapCache_->size(); - uint releaseId = mapCache_->size(); + amd::Memory* map = nullptr; + size_t minSize = 0; + size_t maxSize = 0; + uint mapId = mapCache_->size(); + uint releaseId = mapCache_->size(); - // Find if the list has a map target of appropriate size - for (uint i = 0; i < mapCache_->size(); i++) { - if 
((*mapCache_)[i] != nullptr) { - // Requested size is smaller than the entry size - if (size < (*mapCache_)[i]->getSize()) { - if ((minSize == 0) || - (minSize > (*mapCache_)[i]->getSize())) { - minSize = (*mapCache_)[i]->getSize(); - mapId = i; - } - } - // Requeted size matches the entry size - else if (size == (*mapCache_)[i]->getSize()) { - mapId = i; - break; - } - else { - // Find the biggest map target in the list - if (maxSize < (*mapCache_)[i]->getSize()) { - maxSize = (*mapCache_)[i]->getSize(); - releaseId = i; - } - } + // Find if the list has a map target of appropriate size + for (uint i = 0; i < mapCache_->size(); i++) { + if ((*mapCache_)[i] != nullptr) { + // Requested size is smaller than the entry size + if (size < (*mapCache_)[i]->getSize()) { + if ((minSize == 0) || (minSize > (*mapCache_)[i]->getSize())) { + minSize = (*mapCache_)[i]->getSize(); + mapId = i; } + } + // Requeted size matches the entry size + else if (size == (*mapCache_)[i]->getSize()) { + mapId = i; + break; + } else { + // Find the biggest map target in the list + if (maxSize < (*mapCache_)[i]->getSize()) { + maxSize = (*mapCache_)[i]->getSize(); + releaseId = i; + } + } } + } - // Check if we found any map target - if (mapId < mapCache_->size()) { - map = (*mapCache_)[mapId]; - (*mapCache_)[mapId] = nullptr; - } - // If cache is full, then release the biggest map target - else if (releaseId < mapCache_->size()) { - (*mapCache_)[releaseId]->release(); - (*mapCache_)[releaseId] = nullptr; - } + // Check if we found any map target + if (mapId < mapCache_->size()) { + map = (*mapCache_)[mapId]; + (*mapCache_)[mapId] = nullptr; + } + // If cache is full, then release the biggest map target + else if (releaseId < mapCache_->size()) { + (*mapCache_)[releaseId]->release(); + (*mapCache_)[releaseId] = nullptr; + } - return map; + return map; } -bool -Device::addMapTarget(amd::Memory* memory) const -{ - // Must be serialised for access - amd::ScopedLock lk(*mapCacheOps_); +bool 
Device::addMapTarget(amd::Memory* memory) const { + // Must be serialised for access + amd::ScopedLock lk(*mapCacheOps_); - //the svm memory shouldn't be cached - if (!memory->canBeCached()) { - return false; - } - // Find if the list has a map target of appropriate size - for (uint i = 0; i < mapCache_->size(); ++i) { - if ((*mapCache_)[i] == nullptr) { - (*mapCache_)[i] = memory; - return true; - } + // the svm memory shouldn't be cached + if (!memory->canBeCached()) { + return false; + } + // Find if the list has a map target of appropriate size + for (uint i = 0; i < mapCache_->size(); ++i) { + if ((*mapCache_)[i] == nullptr) { + (*mapCache_)[i] = memory; + return true; } + } - // Add a new entry - mapCache_->push_back(memory); + // Add a new entry + mapCache_->push_back(memory); - return true; + return true; } -Memory* -Device::getRocMemory(amd::Memory* mem) const -{ - return static_cast(mem->getDeviceMemory(*this)); +Memory* Device::getRocMemory(amd::Memory* mem) const { + return static_cast(mem->getDeviceMemory(*this)); } -device::Memory* -Device::createMemory(amd::Memory &owner) const -{ - roc::Memory* memory = nullptr; - if (owner.asBuffer()) { - memory = new roc::Buffer(*this, owner); - } - else if (owner.asImage()) { - memory = new roc::Image(*this, owner); - } - else { - LogError("Unknown memory type"); - } +device::Memory* Device::createMemory(amd::Memory& owner) const { + roc::Memory* memory = nullptr; + if (owner.asBuffer()) { + memory = new roc::Buffer(*this, owner); + } else if (owner.asImage()) { + memory = new roc::Image(*this, owner); + } else { + LogError("Unknown memory type"); + } - if (memory == nullptr) { - return nullptr; - } + if (memory == nullptr) { + return nullptr; + } - bool result = memory->create(); + bool result = memory->create(); - if (!result) { - LogError("Failed creating memory"); - delete memory; - return nullptr; - } + if (!result) { + LogError("Failed creating memory"); + delete memory; + return nullptr; + } - // Transfer 
data only if OCL context has one device. - // Cache coherency layer will update data for multiple devices - if (!memory->isHostMemDirectAccess() && owner.asImage() && - (owner.parent() == nullptr) && - (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) && - (owner.getContext().devices().size() == 1)) { - // To avoid recurssive call to Device::createMemory, we perform - // data transfer to the view of the image. - amd::Image* imageView = owner.asImage()->createView( - owner.getContext(), owner.asImage()->getImageFormat(), xferQueue()); + // Transfer data only if OCL context has one device. + // Cache coherency layer will update data for multiple devices + if (!memory->isHostMemDirectAccess() && owner.asImage() && (owner.parent() == nullptr) && + (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) && (owner.getContext().devices().size() == 1)) { + // To avoid recurssive call to Device::createMemory, we perform + // data transfer to the view of the image. + amd::Image* imageView = owner.asImage()->createView( + owner.getContext(), owner.asImage()->getImageFormat(), xferQueue()); - if (imageView == nullptr) { - LogError("[OCL] Fail to allocate view of image object"); - return nullptr; - } - - Image* devImageView = - new roc::Image(static_cast(*this), *imageView); - if (devImageView == nullptr) { - LogError("[OCL] Fail to allocate device mem object for the view"); - imageView->release(); - return nullptr; - } - - if (devImageView != nullptr && - !devImageView->createView(static_cast(*memory))) { - LogError("[OCL] Fail to create device mem object for the view"); - delete devImageView; - imageView->release(); - return nullptr; - } - - imageView->replaceDeviceMemory(this, devImageView); - - result = xferMgr().writeImage(owner.getHostMem(), *devImageView, - amd::Coord3D(0, 0, 0), imageView->getRegion(), - 0, - 0, true); - - // Release host memory, since runtime copied data - owner.setHostMem(nullptr); - - imageView->release(); - } - - // Prepin sysmem buffer for possible data 
synchronization between CPU and GPU - if (!memory->isHostMemDirectAccess() && (owner.getHostMem() != nullptr)) { - memory->pinSystemMemory(owner.getHostMem(), owner.getSize()); - } - - if (!result) { - delete memory; - return nullptr; - } - - return memory; -} - -void* -Device::hostAlloc(size_t size, size_t alignment, bool atomics) const { - void* ptr = nullptr; - const hsa_amd_memory_pool_t segment = - (!atomics) - ? (system_coarse_segment_.handle != 0) ? system_coarse_segment_ - : system_segment_ - : system_segment_; - assert(segment.handle != 0); - hsa_status_t stat = hsa_amd_memory_pool_allocate(segment, size, 0, &ptr); - if (stat != HSA_STATUS_SUCCESS) { - LogError("Fail allocation host memory"); - return nullptr; - } - - stat = hsa_amd_agents_allow_access(gpu_agents_.size(), &gpu_agents_[0], - nullptr, ptr); - if (stat != HSA_STATUS_SUCCESS) { - LogError("Fail hsa_amd_agents_allow_access"); + if (imageView == nullptr) { + LogError("[OCL] Fail to allocate view of image object"); return nullptr; } - return ptr; + Image* devImageView = new roc::Image(static_cast(*this), *imageView); + if (devImageView == nullptr) { + LogError("[OCL] Fail to allocate device mem object for the view"); + imageView->release(); + return nullptr; + } + + if (devImageView != nullptr && !devImageView->createView(static_cast(*memory))) { + LogError("[OCL] Fail to create device mem object for the view"); + delete devImageView; + imageView->release(); + return nullptr; + } + + imageView->replaceDeviceMemory(this, devImageView); + + result = xferMgr().writeImage(owner.getHostMem(), *devImageView, amd::Coord3D(0, 0, 0), + imageView->getRegion(), 0, 0, true); + + // Release host memory, since runtime copied data + owner.setHostMem(nullptr); + + imageView->release(); + } + + // Prepin sysmem buffer for possible data synchronization between CPU and GPU + if (!memory->isHostMemDirectAccess() && (owner.getHostMem() != nullptr)) { + memory->pinSystemMemory(owner.getHostMem(), owner.getSize()); + } 
+ + if (!result) { + delete memory; + return nullptr; + } + + return memory; } -void -Device::hostFree(void* ptr, size_t size) const -{ +void* Device::hostAlloc(size_t size, size_t alignment, bool atomics) const { + void* ptr = nullptr; + const hsa_amd_memory_pool_t segment = (!atomics) + ? (system_coarse_segment_.handle != 0) ? system_coarse_segment_ : system_segment_ + : system_segment_; + assert(segment.handle != 0); + hsa_status_t stat = hsa_amd_memory_pool_allocate(segment, size, 0, &ptr); + if (stat != HSA_STATUS_SUCCESS) { + LogError("Fail allocation host memory"); + return nullptr; + } + + stat = hsa_amd_agents_allow_access(gpu_agents_.size(), &gpu_agents_[0], nullptr, ptr); + if (stat != HSA_STATUS_SUCCESS) { + LogError("Fail hsa_amd_agents_allow_access"); + return nullptr; + } + + return ptr; +} + +void Device::hostFree(void* ptr, size_t size) const { memFree(ptr, size); } + +void* Device::deviceLocalAlloc(size_t size) const { + if (gpuvm_segment_.handle == 0 || gpuvm_segment_max_alloc_ == 0) { + return nullptr; + } + + void* ptr = nullptr; + hsa_status_t stat = hsa_amd_memory_pool_allocate(gpuvm_segment_, size, 0, &ptr); + if (stat != HSA_STATUS_SUCCESS) { + LogError("Fail allocation local memory"); + return nullptr; + } + + stat = hsa_memory_assign_agent(ptr, _bkendDevice, HSA_ACCESS_PERMISSION_RW); + if (stat != HSA_STATUS_SUCCESS) { + LogError("Fail assigning local memory to agent"); memFree(ptr, size); + return nullptr; + } + + return ptr; } -void * -Device::deviceLocalAlloc(size_t size) const -{ - if (gpuvm_segment_.handle == 0 || gpuvm_segment_max_alloc_ == 0) { - return nullptr; - } +void Device::memFree(void* ptr, size_t size) const { + hsa_status_t stat = hsa_amd_memory_pool_free(ptr); + if (stat != HSA_STATUS_SUCCESS) { + LogError("Fail freeing local memory"); + } +} - void *ptr = nullptr; - hsa_status_t stat = - hsa_amd_memory_pool_allocate(gpuvm_segment_, size, 0, &ptr); - if (stat != HSA_STATUS_SUCCESS) { - LogError("Fail allocation local 
memory"); - return nullptr; - } +void* Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags, + void* svmPtr) const { + amd::Memory* mem = nullptr; + if (nullptr == svmPtr) { + bool atomics = (flags & CL_MEM_SVM_ATOMICS) != 0; + void* ptr = hostAlloc(size, alignment, atomics); - stat = hsa_memory_assign_agent(ptr, _bkendDevice, HSA_ACCESS_PERMISSION_RW); - if (stat != HSA_STATUS_SUCCESS) { - LogError("Fail assigning local memory to agent"); - memFree(ptr, size); + if (ptr != nullptr) { + // Copy paste from ORCA code. + // create a hidden buffer, which will allocated on the device later + mem = new (context) amd::Buffer(context, CL_MEM_USE_HOST_PTR, size, ptr); + if (mem == nullptr) { + LogError("failed to create a svm mem object!"); + return nullptr; + } + + if (!mem->create(ptr)) { + LogError("failed to create a svm hidden buffer!"); + mem->release(); + return nullptr; + } + + // add the information to context so that we can use it later. + amd::SvmManager::AddSvmBuffer(ptr, mem); + + return ptr; + } else { + return nullptr; + } + } else { + // Copy paste from ORCA code. 
+ // Find the existing amd::mem object + mem = amd::SvmManager::FindSvmBuffer(svmPtr); + + if (nullptr == mem) { return nullptr; } - return ptr; + return svmPtr; + } } -void -Device::memFree(void *ptr, size_t size) const -{ - hsa_status_t stat = - hsa_amd_memory_pool_free(ptr); - if (stat != HSA_STATUS_SUCCESS) { - LogError("Fail freeing local memory"); - } +void Device::svmFree(void* ptr) const { + amd::Memory* svmMem = nullptr; + svmMem = amd::SvmManager::FindSvmBuffer(ptr); + if (nullptr != svmMem) { + svmMem->release(); + amd::SvmManager::RemoveSvmBuffer(ptr); + hostFree(ptr); + } } -void* -Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags, void* svmPtr) const -{ - amd::Memory* mem = nullptr; - if (nullptr == svmPtr) { - bool atomics = (flags & CL_MEM_SVM_ATOMICS) != 0; - void* ptr = hostAlloc(size, alignment, atomics); - - if (ptr != nullptr) { - // Copy paste from ORCA code. - // create a hidden buffer, which will allocated on the device later - mem = new (context) - amd::Buffer(context, CL_MEM_USE_HOST_PTR, size, ptr); - if (mem == nullptr) { - LogError("failed to create a svm mem object!"); - return nullptr; - } - - if (!mem->create(ptr)) { - LogError("failed to create a svm hidden buffer!"); - mem->release(); - return nullptr; - } - - // add the information to context so that we can use it later. - amd::SvmManager::AddSvmBuffer(ptr, mem); - - return ptr; - } - else { - return nullptr; - } - } else { - // Copy paste from ORCA code. 
- // Find the existing amd::mem object - mem = amd::SvmManager::FindSvmBuffer(svmPtr); - - if (nullptr == mem) { - return nullptr; - } - - return svmPtr; - } -} - -void -Device::svmFree(void* ptr) const -{ - amd::Memory * svmMem = nullptr; - svmMem = amd::SvmManager::FindSvmBuffer(ptr); - if (nullptr != svmMem) { - svmMem->release(); - amd::SvmManager::RemoveSvmBuffer(ptr); - hostFree(ptr); - } -} - -VirtualGPU* -Device::xferQueue() const -{ +VirtualGPU* Device::xferQueue() const { + if (!xferQueue_) { + // Create virtual device for internal memory transfer + Device* thisDevice = const_cast(this); + thisDevice->xferQueue_ = reinterpret_cast(thisDevice->createVirtualDevice()); if (!xferQueue_) { - // Create virtual device for internal memory transfer - Device* thisDevice = const_cast(this); - thisDevice->xferQueue_ = reinterpret_cast( - thisDevice->createVirtualDevice()); - if (!xferQueue_) { - LogError("Couldn't create the device transfer manager!"); - } + LogError("Couldn't create the device transfer manager!"); } - xferQueue_->enableSyncBlit(); - return xferQueue_; + } + xferQueue_->enableSyncBlit(); + return xferQueue_; } - } #endif // WITHOUT_HSA_BACKEND diff --git a/rocclr/runtime/device/rocm/rocdevice.hpp b/rocclr/runtime/device/rocm/rocdevice.hpp index f381545cd1..37d1bd74b3 100644 --- a/rocclr/runtime/device/rocm/rocdevice.hpp +++ b/rocclr/runtime/device/rocm/rocdevice.hpp @@ -59,396 +59,369 @@ class Resource; class VirtualDevice; class PrintfDbg; -//A NULL Device type used only for offline compilation +// A NULL Device type used only for offline compilation // Only functions that are used for compilation will be in this device class NullDevice : public amd::Device { -public: - //! constructor - NullDevice(){}; + public: + //! constructor + NullDevice(){}; - //!create the device - bool create(const AMDDeviceInfo& deviceInfo); + //! create the device + bool create(const AMDDeviceInfo& deviceInfo); - //! 
Initialise all the offline devices that can be used for compilation - static bool init(); - //! Teardown for offline devices - static void tearDown(); + //! Initialise all the offline devices that can be used for compilation + static bool init(); + //! Teardown for offline devices + static void tearDown(); - //! Destructor for the Null device - virtual ~NullDevice(); + //! Destructor for the Null device + virtual ~NullDevice(); - Compiler* compiler() const { return compilerHandle_; } + Compiler* compiler() const { return compilerHandle_; } - const Settings &settings() const { return reinterpret_cast(*settings_); } + const Settings& settings() const { return reinterpret_cast(*settings_); } - //! Construct an HSAIL program object from the ELF assuming it is valid - virtual device::Program *createProgram(amd::option::Options* options = nullptr); - const AMDDeviceInfo& deviceInfo() const { - return deviceInfo_; - } - //! Gets the backend device for the Null device type - virtual hsa_agent_t getBackendDevice() const { - ShouldNotReachHere(); - const hsa_agent_t kInvalidAgent = { 0 }; - return kInvalidAgent; - } + //! Construct an HSAIL program object from the ELF assuming it is valid + virtual device::Program* createProgram(amd::option::Options* options = nullptr); + const AMDDeviceInfo& deviceInfo() const { return deviceInfo_; } + //! Gets the backend device for the Null device type + virtual hsa_agent_t getBackendDevice() const { + ShouldNotReachHere(); + const hsa_agent_t kInvalidAgent = {0}; + return kInvalidAgent; + } - //List of dummy functions which are disabled for NullDevice + // List of dummy functions which are disabled for NullDevice - //! Create sub-devices according to the given partition scheme. - virtual cl_int createSubDevices( - device::CreateSubDevicesInfo& create_info, - cl_uint num_entries, - cl_device_id* devices, - cl_uint* num_devices) { - ShouldNotReachHere(); - return CL_INVALID_VALUE; }; + //! 
Create sub-devices according to the given partition scheme. + virtual cl_int createSubDevices(device::CreateSubDevicesInfo& create_info, cl_uint num_entries, + cl_device_id* devices, cl_uint* num_devices) { + ShouldNotReachHere(); + return CL_INVALID_VALUE; + }; - //! Create a new virtual device environment. - virtual device::VirtualDevice* createVirtualDevice( - amd::CommandQueue* queue = nullptr) { - ShouldNotReachHere(); - return nullptr; - } + //! Create a new virtual device environment. + virtual device::VirtualDevice* createVirtualDevice(amd::CommandQueue* queue = nullptr) { + ShouldNotReachHere(); + return nullptr; + } - virtual bool registerSvmMemory(void* ptr, size_t size) const { - ShouldNotReachHere(); - return false; - } + virtual bool registerSvmMemory(void* ptr, size_t size) const { + ShouldNotReachHere(); + return false; + } - virtual void deregisterSvmMemory(void* ptr) const { - ShouldNotReachHere(); - } + virtual void deregisterSvmMemory(void* ptr) const { ShouldNotReachHere(); } - //! Just returns nullptr for the dummy device - virtual device::Memory* createMemory(amd::Memory& owner) const { - ShouldNotReachHere(); - return nullptr; } + //! Just returns nullptr for the dummy device + virtual device::Memory* createMemory(amd::Memory& owner) const { + ShouldNotReachHere(); + return nullptr; + } - //! Sampler object allocation - virtual bool createSampler( - const amd::Sampler& owner, //!< abstraction layer sampler object - device::Sampler** sampler //!< device sampler object - ) const - { - ShouldNotReachHere(); - return true; - } + //! Sampler object allocation + virtual bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object + device::Sampler** sampler //!< device sampler object + ) const { + ShouldNotReachHere(); + return true; + } - //! 
Just returns nullptr for the dummy device - virtual device::Memory* createView( - amd::Memory& owner, //!< Owner memory object - const device::Memory& parent //!< Parent device memory object for the view - ) const { - ShouldNotReachHere(); - return nullptr; - } + //! Just returns nullptr for the dummy device + virtual device::Memory* createView( + amd::Memory& owner, //!< Owner memory object + const device::Memory& parent //!< Parent device memory object for the view + ) const { + ShouldNotReachHere(); + return nullptr; + } - //! Just returns nullptr for the dummy device - virtual void* svmAlloc( - amd::Context& context, //!< The context used to create a buffer - size_t size, //!< size of svm spaces - size_t alignment, //!< alignment requirement of svm spaces - cl_svm_mem_flags flags, //!< flags of creation svm spaces - void* svmPtr //!< existing svm pointer for mGPU case - ) const { - ShouldNotReachHere(); - return nullptr; - } + //! Just returns nullptr for the dummy device + virtual void* svmAlloc(amd::Context& context, //!< The context used to create a buffer + size_t size, //!< size of svm spaces + size_t alignment, //!< alignment requirement of svm spaces + cl_svm_mem_flags flags, //!< flags of creation svm spaces + void* svmPtr //!< existing svm pointer for mGPU case + ) const { + ShouldNotReachHere(); + return nullptr; + } - //! Just returns nullptr for the dummy device - virtual void svmFree( - void* ptr //!< svm pointer needed to be freed - ) const { - ShouldNotReachHere(); - return; - } + //! Just returns nullptr for the dummy device + virtual void svmFree(void* ptr //!< svm pointer needed to be freed + ) const { + ShouldNotReachHere(); + return; + } - //! Reallocates the provided buffer object - virtual bool reallocMemory(amd::Memory& owner) const { - ShouldNotReachHere(); - return false; - } + //! Reallocates the provided buffer object + virtual bool reallocMemory(amd::Memory& owner) const { + ShouldNotReachHere(); + return false; + } - //! 
Acquire external graphics API object in the host thread - //! Needed for OpenGL objects on CPU device + //! Acquire external graphics API object in the host thread + //! Needed for OpenGL objects on CPU device - virtual bool bindExternalDevice( - uint flags, void* const pDevice[], void* pContext, bool validateOnly) { - ShouldNotReachHere(); - return false; - } + virtual bool bindExternalDevice(uint flags, void* const pDevice[], void* pContext, + bool validateOnly) { + ShouldNotReachHere(); + return false; + } - virtual bool unbindExternalDevice( - uint flags, void* const pDevice[], void* pContext, bool validateOnly) { - ShouldNotReachHere(); - return false; - } + virtual bool unbindExternalDevice(uint flags, void* const pDevice[], void* pContext, + bool validateOnly) { + ShouldNotReachHere(); + return false; + } - //! Releases non-blocking map target memory - virtual void freeMapTarget(amd::Memory& mem, void* target) { ShouldNotReachHere();} + //! Releases non-blocking map target memory + virtual void freeMapTarget(amd::Memory& mem, void* target) { ShouldNotReachHere(); } - //! Empty implementation on Null device - virtual bool globalFreeMemory(size_t* freeMemory) const { - ShouldNotReachHere(); - return false; - } + //! Empty implementation on Null device + virtual bool globalFreeMemory(size_t* freeMemory) const { + ShouldNotReachHere(); + return false; + } #if defined(WITH_LIGHTNING_COMPILER) - amd::CacheCompilation* cacheCompilation() const { return cacheCompilation_.get(); } + amd::CacheCompilation* cacheCompilation() const { return cacheCompilation_.get(); } #endif -protected: - //! Initialize compiler instance and handle - static bool initCompiler(bool isOffline); - //! destroy compiler instance and handle - static bool destroyCompiler(); - //! Handle to the the compiler - static Compiler* compilerHandle_; - //! Device Id for an HsaDevice - AMDDeviceInfo deviceInfo_; + protected: + //! 
Initialize compiler instance and handle + static bool initCompiler(bool isOffline); + //! destroy compiler instance and handle + static bool destroyCompiler(); + //! Handle to the the compiler + static Compiler* compilerHandle_; + //! Device Id for an HsaDevice + AMDDeviceInfo deviceInfo_; #if defined(WITH_LIGHTNING_COMPILER) - //! Compilation with cache support - std::unique_ptr cacheCompilation_; + //! Compilation with cache support + std::unique_ptr cacheCompilation_; #endif -private: - static const bool offlineDevice_; + private: + static const bool offlineDevice_; }; //! A HSA device ordinal (physical HSA device) class Device : public NullDevice { -public: - //! Transfer buffers - class XferBuffers : public amd::HeapObject - { - public: - static const size_t MaxXferBufListSize = 8; + public: + //! Transfer buffers + class XferBuffers : public amd::HeapObject { + public: + static const size_t MaxXferBufListSize = 8; - //! Default constructor - XferBuffers(const Device& device, size_t bufSize) - : bufSize_(bufSize) - , acquiredCnt_(0) - , gpuDevice_(device) - {} + //! Default constructor + XferBuffers(const Device& device, size_t bufSize) + : bufSize_(bufSize), acquiredCnt_(0), gpuDevice_(device) {} - //! Default destructor - ~XferBuffers(); - - //! Creates the xfer buffers object - bool create(); - - //! Acquires an instance of the transfer buffers - Memory& acquire(); - - //! Releases transfer buffer - void release( - VirtualGPU& gpu, //!< Virual GPU object used with the buffer - Memory& buffer //!< Transfer buffer for release - ); - - //! Returns the buffer's size for transfer - size_t bufSize() const { return bufSize_; } - - private: - //! Disable copy constructor - XferBuffers(const XferBuffers&); - - //! Disable assignment operator - XferBuffers& operator=(const XferBuffers&); - - //! 
Get device object - const Device& dev() const { return gpuDevice_; } - - size_t bufSize_; //!< Staged buffer size - std::list freeBuffers_; //!< The list of free buffers - amd::Atomic acquiredCnt_; //!< The total number of acquired buffers - amd::Monitor lock_; //!< Stgaed buffer acquire/release lock - const Device& gpuDevice_; //!< GPU device object - }; - - //! Initialise the whole HSA device subsystem (CAL init, device enumeration, etc). - static bool init(); - static void tearDown(); - - //! Lookup all AMD HSA devices and memory regions. - static hsa_status_t iterateAgentCallback(hsa_agent_t agent, void *data); - static hsa_status_t iterateGpuMemoryPoolCallback( - hsa_amd_memory_pool_t region, void* data); - static hsa_status_t iterateCpuMemoryPoolCallback( - hsa_amd_memory_pool_t region, void* data); - static hsa_status_t loaderQueryHostAddress( - const void* device, const void** host); - - static bool loadHsaModules(); + //! Default destructor + ~XferBuffers(); + //! Creates the xfer buffers object bool create(); - //! Construct a new physical HSA device - Device(hsa_agent_t bkendDevice); - virtual hsa_agent_t getBackendDevice() const { return _bkendDevice; } + //! Acquires an instance of the transfer buffers + Memory& acquire(); - static const std::vector& getGpuAgents() { - return gpu_agents_; - } + //! Releases transfer buffer + void release(VirtualGPU& gpu, //!< Virual GPU object used with the buffer + Memory& buffer //!< Transfer buffer for release + ); - static hsa_agent_t getCpuAgent() - { - return cpu_agent_; - } + //! Returns the buffer's size for transfer + size_t bufSize() const { return bufSize_; } - //! Destructor for the physical HSA device - virtual ~Device(); + private: + //! Disable copy constructor + XferBuffers(const XferBuffers&); - bool mapHSADeviceToOpenCLDevice(hsa_agent_t hsadevice); + //! 
Disable assignment operator + XferBuffers& operator=(const XferBuffers&); - // Temporary, delete it later when HSA Runtime and KFD is fully fucntional. - void fake_device(); + //! Get device object + const Device& dev() const { return gpuDevice_; } - /////////////////////////////////////////////////////////////////////////////// - // TODO: Below are all mocked up virtual functions from amd::Device, they may - // need real implementation. - /////////////////////////////////////////////////////////////////////////////// + size_t bufSize_; //!< Staged buffer size + std::list freeBuffers_; //!< The list of free buffers + amd::Atomic acquiredCnt_; //!< The total number of acquired buffers + amd::Monitor lock_; //!< Stgaed buffer acquire/release lock + const Device& gpuDevice_; //!< GPU device object + }; -// #ifdef cl_ext_device_fission - //! Create sub-devices according to the given partition scheme. - virtual cl_int createSubDevices( - device::CreateSubDevicesInfo &create_inf, - cl_uint num_entries, - cl_device_id *devices, - cl_uint *num_devices) - { return CL_INVALID_VALUE; } -// #endif // cl_ext_device_fission + //! Initialise the whole HSA device subsystem (CAL init, device enumeration, etc). + static bool init(); + static void tearDown(); - // bool Device::create(CALuint ordinal); + //! Lookup all AMD HSA devices and memory regions. + static hsa_status_t iterateAgentCallback(hsa_agent_t agent, void* data); + static hsa_status_t iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t region, void* data); + static hsa_status_t iterateCpuMemoryPoolCallback(hsa_amd_memory_pool_t region, void* data); + static hsa_status_t loaderQueryHostAddress(const void* device, const void** host); - //! Instantiate a new virtual device - virtual device::VirtualDevice *createVirtualDevice( - amd::CommandQueue* queue = nullptr); + static bool loadHsaModules(); - //! 
Construct an HSAIL program object from the ELF assuming it is valid - virtual device::Program *createProgram(amd::option::Options* options = nullptr); + bool create(); - virtual device::Memory *createMemory(amd::Memory &owner) const; + //! Construct a new physical HSA device + Device(hsa_agent_t bkendDevice); + virtual hsa_agent_t getBackendDevice() const { return _bkendDevice; } - //! Sampler object allocation - virtual bool createSampler( - const amd::Sampler& owner, //!< abstraction layer sampler object - device::Sampler** sampler //!< device sampler object - ) const - { - //! \todo HSA team has to implement sampler allocation - *sampler = nullptr; - return true; - } + static const std::vector& getGpuAgents() { return gpu_agents_; } + + static hsa_agent_t getCpuAgent() { return cpu_agent_; } + + //! Destructor for the physical HSA device + virtual ~Device(); + + bool mapHSADeviceToOpenCLDevice(hsa_agent_t hsadevice); + + // Temporary, delete it later when HSA Runtime and KFD is fully fucntional. + void fake_device(); + + /////////////////////////////////////////////////////////////////////////////// + // TODO: Below are all mocked up virtual functions from amd::Device, they may + // need real implementation. + /////////////////////////////////////////////////////////////////////////////// + + // #ifdef cl_ext_device_fission + //! Create sub-devices according to the given partition scheme. + virtual cl_int createSubDevices(device::CreateSubDevicesInfo& create_inf, cl_uint num_entries, + cl_device_id* devices, cl_uint* num_devices) { + return CL_INVALID_VALUE; + } + // #endif // cl_ext_device_fission + + // bool Device::create(CALuint ordinal); + + //! Instantiate a new virtual device + virtual device::VirtualDevice* createVirtualDevice(amd::CommandQueue* queue = nullptr); + + //! 
Construct an HSAIL program object from the ELF assuming it is valid + virtual device::Program* createProgram(amd::option::Options* options = nullptr); + + virtual device::Memory* createMemory(amd::Memory& owner) const; + + //! Sampler object allocation + virtual bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object + device::Sampler** sampler //!< device sampler object + ) const { + //! \todo HSA team has to implement sampler allocation + *sampler = nullptr; + return true; + } - //! Just returns nullptr for the dummy device - virtual device::Memory *createView( - amd::Memory &owner, //!< Owner memory object - const device::Memory &parent //!< Parent device memory object for the view - ) const { return nullptr; } + //! Just returns nullptr for the dummy device + virtual device::Memory* createView( + amd::Memory& owner, //!< Owner memory object + const device::Memory& parent //!< Parent device memory object for the view + ) const { + return nullptr; + } - //! Reallocates the provided buffer object - virtual bool reallocMemory(amd::Memory &owner) const {return true; } + //! Reallocates the provided buffer object + virtual bool reallocMemory(amd::Memory& owner) const { return true; } - //! Acquire external graphics API object in the host thread - //! Needed for OpenGL objects on CPU device - virtual bool bindExternalDevice( - uint flags, void * const pDevice[], void *pContext, bool validateOnly); + //! Acquire external graphics API object in the host thread + //! Needed for OpenGL objects on CPU device + virtual bool bindExternalDevice(uint flags, void* const pDevice[], void* pContext, + bool validateOnly); - /** - * @brief Removes the external device as an available device. - * - * @note: The current implementation is to avoid build break - * and does not represent actual / correct implementation. This - * needs to be done. - */ - bool unbindExternalDevice( - uint flags, //!< Enum val. for ext.API type: GL, D3D10, etc. 
- void * const gfxDevice[], //!< D3D device do D3D, HDC/Display handle of X Window for GL - void *gfxContext, //!< HGLRC/GLXContext handle - bool validateOnly //!< Only validate if the device can inter-operate with - //!< pDevice/pContext, do not bind. - ); + /** + * @brief Removes the external device as an available device. + * + * @note: The current implementation is to avoid build break + * and does not represent actual / correct implementation. This + * needs to be done. + */ + bool unbindExternalDevice( + uint flags, //!< Enum val. for ext.API type: GL, D3D10, etc. + void* const gfxDevice[], //!< D3D device do D3D, HDC/Display handle of X Window for GL + void* gfxContext, //!< HGLRC/GLXContext handle + bool validateOnly //!< Only validate if the device can inter-operate with + //!< pDevice/pContext, do not bind. + ); - //! Gets free memory on a GPU device - virtual bool globalFreeMemory(size_t *freeMemory) const; + //! Gets free memory on a GPU device + virtual bool globalFreeMemory(size_t* freeMemory) const; - virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const; + virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const; - virtual void hostFree(void* ptr, size_t size = 0) const; + virtual void hostFree(void* ptr, size_t size = 0) const; - void *deviceLocalAlloc(size_t size) const; + void* deviceLocalAlloc(size_t size) const; - void memFree(void *ptr, size_t size) const; + void memFree(void* ptr, size_t size) const; - virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags = CL_MEM_READ_WRITE, void* svmPtr = nullptr) const; + virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment, + cl_svm_mem_flags flags = CL_MEM_READ_WRITE, void* svmPtr = nullptr) const; - virtual void svmFree(void* ptr) const; + virtual void svmFree(void* ptr) const; - //! 
Returns transfer engine object - const device::BlitManager& xferMgr() const { return xferQueue()->blitMgr(); } + //! Returns transfer engine object + const device::BlitManager& xferMgr() const { return xferQueue()->blitMgr(); } - const size_t alloc_granularity() const { return alloc_granularity_; } + const size_t alloc_granularity() const { return alloc_granularity_; } - const hsa_profile_t agent_profile() const { return agent_profile_; } + const hsa_profile_t agent_profile() const { return agent_profile_; } - const MesaInterop& mesa() const { return mesa_; } + const MesaInterop& mesa() const { return mesa_; } - //! Finds an appropriate map target - amd::Memory* findMapTarget(size_t size) const; + //! Finds an appropriate map target + amd::Memory* findMapTarget(size_t size) const; - //! Adds a map target to the cache - bool addMapTarget(amd::Memory* memory) const; + //! Adds a map target to the cache + bool addMapTarget(amd::Memory* memory) const; - //! Returns transfer buffer object - XferBuffers& xferWrite() const { return *xferWrite_; } + //! Returns transfer buffer object + XferBuffers& xferWrite() const { return *xferWrite_; } - //! Returns transfer buffer object - XferBuffers& xferRead() const { return *xferRead_; } + //! Returns transfer buffer object + XferBuffers& xferRead() const { return *xferRead_; } - //! Returns a ROC memory object from AMD memory object - roc::Memory* getRocMemory( - amd::Memory* mem //!< Pointer to AMD memory object - ) const; + //! 
Returns a ROC memory object from AMD memory object + roc::Memory* getRocMemory(amd::Memory* mem //!< Pointer to AMD memory object + ) const; - amd::Context& context() const { return *context_; } + amd::Context& context() const { return *context_; } -private: - static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table; + private: + static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table; - amd::Monitor* mapCacheOps_; //!< Lock to serialise cache for the map resources - std::vector* mapCache_; //!< Map cache info structure + amd::Monitor* mapCacheOps_; //!< Lock to serialise cache for the map resources + std::vector* mapCache_; //!< Map cache info structure - bool populateOCLDeviceConstants(); - static bool isHsaInitialized_; - static hsa_agent_t cpu_agent_; - static std::vector gpu_agents_; - MesaInterop mesa_; - hsa_agent_t _bkendDevice; - hsa_profile_t agent_profile_; - hsa_amd_memory_pool_t group_segment_; - hsa_amd_memory_pool_t system_segment_; - hsa_amd_memory_pool_t system_coarse_segment_; - hsa_amd_memory_pool_t gpuvm_segment_; - size_t gpuvm_segment_max_alloc_; - size_t alloc_granularity_; - static const bool offlineDevice_; - amd::Context* context_; //!< A dummy context for internal data transfer - VirtualGPU* xferQueue_; //!< Transfer queue, created on demand + bool populateOCLDeviceConstants(); + static bool isHsaInitialized_; + static hsa_agent_t cpu_agent_; + static std::vector gpu_agents_; + MesaInterop mesa_; + hsa_agent_t _bkendDevice; + hsa_profile_t agent_profile_; + hsa_amd_memory_pool_t group_segment_; + hsa_amd_memory_pool_t system_segment_; + hsa_amd_memory_pool_t system_coarse_segment_; + hsa_amd_memory_pool_t gpuvm_segment_; + size_t gpuvm_segment_max_alloc_; + size_t alloc_granularity_; + static const bool offlineDevice_; + amd::Context* context_; //!< A dummy context for internal data transfer + VirtualGPU* xferQueue_; //!< Transfer queue, created on demand - VirtualGPU* xferQueue() const; + VirtualGPU* xferQueue() const; - XferBuffers* 
xferRead_; //!< Transfer buffers read - XferBuffers* xferWrite_; //!< Transfer buffers write + XferBuffers* xferRead_; //!< Transfer buffers read + XferBuffers* xferWrite_; //!< Transfer buffers write -public: - amd::Atomic numOfVgpus_; //!< Virtual gpu unique index -}; // class roc::Device + public: + amd::Atomic numOfVgpus_; //!< Virtual gpu unique index +}; // class roc::Device } // namespace roc /** * @} */ -#endif /*WITHOUT_HSA_BACKEND*/ - +#endif /*WITHOUT_HSA_BACKEND*/ diff --git a/rocclr/runtime/device/rocm/rocglinterop.cpp b/rocclr/runtime/device/rocm/rocglinterop.cpp index 0db28ab973..463ba31545 100644 --- a/rocclr/runtime/device/rocm/rocglinterop.cpp +++ b/rocclr/runtime/device/rocm/rocglinterop.cpp @@ -13,20 +13,18 @@ #include #endif -namespace roc -{ +namespace roc { #if !defined(_WIN32) -static PFNMESAGLINTEROPGLXQUERYDEVICEINFOPROC GlxInfo = nullptr; -static PFNMESAGLINTEROPGLXEXPORTOBJECTPROC GlxExport = nullptr; -static PFNMESAGLINTEROPEGLQUERYDEVICEINFOPROC EglInfo = nullptr; -static PFNMESAGLINTEROPEGLEXPORTOBJECTPROC EglExport = nullptr; +static PFNMESAGLINTEROPGLXQUERYDEVICEINFOPROC GlxInfo = nullptr; +static PFNMESAGLINTEROPGLXEXPORTOBJECTPROC GlxExport = nullptr; +static PFNMESAGLINTEROPEGLQUERYDEVICEINFOPROC EglInfo = nullptr; +static PFNMESAGLINTEROPEGLEXPORTOBJECTPROC EglExport = nullptr; #endif std::atomic MesaInterop::refCount(0); -bool MesaInterop::Supported() -{ +bool MesaInterop::Supported() { #ifdef _WIN32 return false; #else @@ -34,45 +32,42 @@ bool MesaInterop::Supported() #endif } -//Attempt to locate Mesa interop APIs. Return which of glx/egl are supported. -bool MesaInterop::Bind(MESA_INTEROP_KIND Kind, const DisplayHandle& Display, const ContextHandle& Context) -{ +// Attempt to locate Mesa interop APIs. Return which of glx/egl are supported. 
+bool MesaInterop::Bind(MESA_INTEROP_KIND Kind, const DisplayHandle& Display, + const ContextHandle& Context) { #if defined(_WIN32) return false; #else - if(Kind==MESA_INTEROP_NONE) - return false; + if (Kind == MESA_INTEROP_NONE) return false; - if(kind!=MESA_INTEROP_NONE) - { + if (kind != MESA_INTEROP_NONE) { LogError("Error - MesaInterop Bind while already bound."); return false; } - void* glxinfo=dlsym(RTLD_DEFAULT, "MesaGLInteropGLXQueryDeviceInfo"); - void* eglinfo=dlsym(RTLD_DEFAULT, "MesaGLInteropEGLQueryDeviceInfo"); - - if(((glxinfo!=GlxInfo) || (eglinfo!=EglInfo)) && (refCount!=0)) + void* glxinfo = dlsym(RTLD_DEFAULT, "MesaGLInteropGLXQueryDeviceInfo"); + void* eglinfo = dlsym(RTLD_DEFAULT, "MesaGLInteropEGLQueryDeviceInfo"); + + if (((glxinfo != GlxInfo) || (eglinfo != EglInfo)) && (refCount != 0)) LogWarning("Warning - Mesa changed while holding interop contexts."); - GlxInfo=(PFNMESAGLINTEROPGLXQUERYDEVICEINFOPROC)glxinfo; - EglInfo=(PFNMESAGLINTEROPEGLQUERYDEVICEINFOPROC)eglinfo; + GlxInfo = (PFNMESAGLINTEROPGLXQUERYDEVICEINFOPROC)glxinfo; + EglInfo = (PFNMESAGLINTEROPEGLQUERYDEVICEINFOPROC)eglinfo; - GlxExport=(PFNMESAGLINTEROPGLXEXPORTOBJECTPROC)dlsym(RTLD_DEFAULT, "MesaGLInteropGLXExportObject"); - EglExport=(PFNMESAGLINTEROPEGLEXPORTOBJECTPROC)dlsym(RTLD_DEFAULT, "MesaGLInteropEGLExportObject"); + GlxExport = + (PFNMESAGLINTEROPGLXEXPORTOBJECTPROC)dlsym(RTLD_DEFAULT, "MesaGLInteropGLXExportObject"); + EglExport = + (PFNMESAGLINTEROPEGLEXPORTOBJECTPROC)dlsym(RTLD_DEFAULT, "MesaGLInteropEGLExportObject"); - uint32_t ret=MESA_INTEROP_NONE; - if(GlxInfo && GlxExport) - ret|=MESA_INTEROP_GLX; - if(EglInfo && EglExport) - ret|=MESA_INTEROP_EGL; + uint32_t ret = MESA_INTEROP_NONE; + if (GlxInfo && GlxExport) ret |= MESA_INTEROP_GLX; + if (EglInfo && EglExport) ret |= MESA_INTEROP_EGL; kind = MESA_INTEROP_KIND(ret & Kind); - display=Display; - context=Context; + display = Display; + context = Context; - if(kind!=MESA_INTEROP_NONE) - { + if (kind != 
MESA_INTEROP_NONE) { refCount++; return true; } @@ -81,40 +76,35 @@ bool MesaInterop::Bind(MESA_INTEROP_KIND Kind, const DisplayHandle& Display, con #endif } -bool MesaInterop::GetInfo(mesa_glinterop_device_info& info) const -{ +bool MesaInterop::GetInfo(mesa_glinterop_device_info& info) const { #ifdef _WIN32 return false; #else - switch(kind) - { - case MESA_INTEROP_GLX: - return GlxInfo(display.glxDisplay, context.glxContext, &info)==MESA_GLINTEROP_SUCCESS; - case MESA_INTEROP_EGL: - return EglInfo(display.eglDisplay, context.eglContext, &info)==MESA_GLINTEROP_SUCCESS; - default: - return false; + switch (kind) { + case MESA_INTEROP_GLX: + return GlxInfo(display.glxDisplay, context.glxContext, &info) == MESA_GLINTEROP_SUCCESS; + case MESA_INTEROP_EGL: + return EglInfo(display.eglDisplay, context.eglContext, &info) == MESA_GLINTEROP_SUCCESS; + default: + return false; } #endif } -bool MesaInterop::Export (mesa_glinterop_export_in& in, mesa_glinterop_export_out& out) const -{ +bool MesaInterop::Export(mesa_glinterop_export_in& in, mesa_glinterop_export_out& out) const { #ifdef _WIN32 return false; #else - switch(kind) - { - case MESA_INTEROP_GLX: - return GlxExport(display.glxDisplay, context.glxContext, &in, &out)==MESA_GLINTEROP_SUCCESS; - case MESA_INTEROP_EGL: - return EglExport(display.eglDisplay, context.eglContext, &in, &out)==MESA_GLINTEROP_SUCCESS; - default: - return false; + switch (kind) { + case MESA_INTEROP_GLX: + return GlxExport(display.glxDisplay, context.glxContext, &in, &out) == MESA_GLINTEROP_SUCCESS; + case MESA_INTEROP_EGL: + return EglExport(display.eglDisplay, context.eglContext, &in, &out) == MESA_GLINTEROP_SUCCESS; + default: + return false; } #endif } - } #endif // WITHOUT_HSA_BACKEND diff --git a/rocclr/runtime/device/rocm/rocglinterop.hpp b/rocclr/runtime/device/rocm/rocglinterop.hpp index 219baa8cdd..a899096d9c 100644 --- a/rocclr/runtime/device/rocm/rocglinterop.hpp +++ b/rocclr/runtime/device/rocm/rocglinterop.hpp @@ -6,10 +6,10 @@ 
#ifndef WITHOUT_HSA_BACKEND #ifdef _WIN32 -//GLX header cannot be included in Windows due to X11 header dependency +// GLX header cannot be included in Windows due to X11 header dependency #define MESA_GLINTEROP_NO_GLX #include "device/rocm/mesa_glinterop.h" -//Give GLX parameters void* size +// Give GLX parameters void* size typedef void Display; typedef void* GLXContext; #undef MESA_GLINTEROP_NO_GLX @@ -22,137 +22,119 @@ typedef void* GLXContext; #include -namespace roc -{ - - //Specific typed container for version 1 - typedef struct metadata_amd_ci_vi_s { - uint32_t version; // Must be 1 - uint32_t vendorID; // AMD | CZ - SQ_IMG_RSRC_WORD0 word0; - SQ_IMG_RSRC_WORD1 word1; - SQ_IMG_RSRC_WORD2 word2; - SQ_IMG_RSRC_WORD3 word3; - SQ_IMG_RSRC_WORD4 word4; - SQ_IMG_RSRC_WORD5 word5; - SQ_IMG_RSRC_WORD6 word6; - SQ_IMG_RSRC_WORD7 word7; - uint32_t mip_offsets[0]; //Mip level offset bits [39:8] for each level (if any) - } metadata_amd_ci_vi_t; +namespace roc { - class image_metadata - { - private: - metadata_amd_ci_vi_t* data; +// Specific typed container for version 1 +typedef struct metadata_amd_ci_vi_s { + uint32_t version; // Must be 1 + uint32_t vendorID; // AMD | CZ + SQ_IMG_RSRC_WORD0 word0; + SQ_IMG_RSRC_WORD1 word1; + SQ_IMG_RSRC_WORD2 word2; + SQ_IMG_RSRC_WORD3 word3; + SQ_IMG_RSRC_WORD4 word4; + SQ_IMG_RSRC_WORD5 word5; + SQ_IMG_RSRC_WORD6 word6; + SQ_IMG_RSRC_WORD7 word7; + uint32_t mip_offsets[0]; // Mip level offset bits [39:8] for each level (if any) +} metadata_amd_ci_vi_t; - image_metadata(const image_metadata&)=delete; - image_metadata& operator=(const image_metadata&)=delete; +class image_metadata { + private: + metadata_amd_ci_vi_t* data; - public: - image_metadata() : data(nullptr) {} - ~image_metadata() { data=nullptr; } + image_metadata(const image_metadata&) = delete; + image_metadata& operator=(const image_metadata&) = delete; - bool create(hsa_amd_image_descriptor_t* image_desc) - { - if((image_desc->version!=1) || 
((image_desc->deviceID>>16)!=0x1002)) return false; - data=reinterpret_cast(image_desc); - return true; - } + public: + image_metadata() : data(nullptr) {} + ~image_metadata() { data = nullptr; } - bool setMipLevel(uint32_t level) - { - if(level>data->word3.bits.last_level) - return false; - data->word3.bits.base_level=level; - data->word3.bits.last_level=level; - return true; - } + bool create(hsa_amd_image_descriptor_t* image_desc) { + if ((image_desc->version != 1) || ((image_desc->deviceID >> 16) != 0x1002)) return false; + data = reinterpret_cast(image_desc); + return true; + } - bool setLayer(uint32_t layer) - { - data->word3.bits.type=SQ_RSRC_IMG_2D_ARRAY; - data->word5.bits.last_array=layer; - data->word5.bits.base_array=layer; - return true; - } + bool setMipLevel(uint32_t level) { + if (level > data->word3.bits.last_level) return false; + data->word3.bits.base_level = level; + data->word3.bits.last_level = level; + return true; + } - bool setFace(GLenum face) - { - int index=face-GL_TEXTURE_CUBE_MAP_POSITIVE_X; - if(index<0 || index>5) - return false; - if(data->word3.bits.type!=SQ_RSRC_IMG_CUBE) - return false; - return setLayer(index); - } + bool setLayer(uint32_t layer) { + data->word3.bits.type = SQ_RSRC_IMG_2D_ARRAY; + data->word5.bits.last_array = layer; + data->word5.bits.base_array = layer; + return true; + } + + bool setFace(GLenum face) { + int index = face - GL_TEXTURE_CUBE_MAP_POSITIVE_X; + if (index < 0 || index > 5) return false; + if (data->word3.bits.type != SQ_RSRC_IMG_CUBE) return false; + return setLayer(index); + } +}; + +class MesaInterop { + public: + enum MESA_INTEROP_KIND { MESA_INTEROP_NONE = 0, MESA_INTEROP_GLX = 1, MESA_INTEROP_EGL = 2 }; + + union DisplayHandle { + Display* glxDisplay; + EGLDisplay eglDisplay; }; - class MesaInterop - { - public: - - enum MESA_INTEROP_KIND { MESA_INTEROP_NONE=0, MESA_INTEROP_GLX=1, MESA_INTEROP_EGL=2 }; - - union DisplayHandle - { - Display* glxDisplay; - EGLDisplay eglDisplay; - }; - - union 
ContextHandle - { - GLXContext glxContext; - EGLContext eglContext; - }; - - //True if the configuration supports the indicated interop ability. - static bool Supported(); - - MesaInterop() { kind=MESA_INTEROP_NONE; } - MesaInterop(const MesaInterop& rhs) { *this=rhs; } - ~MesaInterop() { Unbind(); } - - const MesaInterop& operator=(const MesaInterop& rhs) - { - display=rhs.display; - context=rhs.context; - kind=rhs.kind; - if(kind!=MESA_INTEROP_NONE) - refCount++; - return *this; - } - - /* - Loads Mesa interop APIs and sets this interface object to use the indicated - subsystem (GLX/EGL). Returns true if the required subsystem is found. - */ - bool Bind(MESA_INTEROP_KIND Kind, const DisplayHandle& Display, const ContextHandle& Context); - - /* - Releases use of Mesa interop APIs. - Used to check for bad load/unload sequences. - */ - void Unbind() - { - if(kind==MESA_INTEROP_NONE) return; - assert(refCount>0 && "Invalid refCount in MesaInterop."); - refCount--; - kind=MESA_INTEROP_NONE; - } - - bool GetInfo(mesa_glinterop_device_info& info) const; - - bool Export (mesa_glinterop_export_in& in, mesa_glinterop_export_out& out) const; - - private: - static std::atomic refCount; - - DisplayHandle display; - ContextHandle context; - MESA_INTEROP_KIND kind; + union ContextHandle { + GLXContext glxContext; + EGLContext eglContext; }; + // True if the configuration supports the indicated interop ability. + static bool Supported(); + + MesaInterop() { kind = MESA_INTEROP_NONE; } + MesaInterop(const MesaInterop& rhs) { *this = rhs; } + ~MesaInterop() { Unbind(); } + + const MesaInterop& operator=(const MesaInterop& rhs) { + display = rhs.display; + context = rhs.context; + kind = rhs.kind; + if (kind != MESA_INTEROP_NONE) refCount++; + return *this; + } + + /* + Loads Mesa interop APIs and sets this interface object to use the indicated + subsystem (GLX/EGL). Returns true if the required subsystem is found. 
+ */ + bool Bind(MESA_INTEROP_KIND Kind, const DisplayHandle& Display, const ContextHandle& Context); + + /* + Releases use of Mesa interop APIs. + Used to check for bad load/unload sequences. + */ + void Unbind() { + if (kind == MESA_INTEROP_NONE) return; + assert(refCount > 0 && "Invalid refCount in MesaInterop."); + refCount--; + kind = MESA_INTEROP_NONE; + } + + bool GetInfo(mesa_glinterop_device_info& info) const; + + bool Export(mesa_glinterop_export_in& in, mesa_glinterop_export_out& out) const; + + private: + static std::atomic refCount; + + DisplayHandle display; + ContextHandle context; + MESA_INTEROP_KIND kind; +}; } -#endif /*WITHOUT_HSA_BACKEND*/ - +#endif /*WITHOUT_HSA_BACKEND*/ diff --git a/rocclr/runtime/device/rocm/rockernel.cpp b/rocclr/runtime/device/rocm/rockernel.cpp index 16b6396fbe..2054e89b2c 100644 --- a/rocclr/runtime/device/rocm/rockernel.cpp +++ b/rocclr/runtime/device/rocm/rockernel.cpp @@ -19,679 +19,623 @@ using llvm::AMDGPU::CodeObject::AddressSpaceQualifier; using llvm::AMDGPU::CodeObject::ValueKind; using llvm::AMDGPU::CodeObject::ValueType; -static inline ROC_ARG_TYPE -GetKernelArgType(const KernelArgMD& lcArg) -{ - switch (lcArg.mValueKind) { +static inline ROC_ARG_TYPE GetKernelArgType(const KernelArgMD& lcArg) { + switch (lcArg.mValueKind) { case ValueKind::GlobalBuffer: case ValueKind::DynamicSharedPointer: - return ROC_ARGTYPE_POINTER; + return ROC_ARGTYPE_POINTER; case ValueKind::ByValue: - return ROC_ARGTYPE_VALUE; + return ROC_ARGTYPE_VALUE; case ValueKind::Image: - return ROC_ARGTYPE_IMAGE; + return ROC_ARGTYPE_IMAGE; case ValueKind::Sampler: - return ROC_ARGTYPE_SAMPLER; + return ROC_ARGTYPE_SAMPLER; case ValueKind::HiddenGlobalOffsetX: - return ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X; + return ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X; case ValueKind::HiddenGlobalOffsetY: - return ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y; + return ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y; case ValueKind::HiddenGlobalOffsetZ: - return 
ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z; + return ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z; case ValueKind::HiddenPrintfBuffer: - return ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER; + return ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER; case ValueKind::HiddenDefaultQueue: - return ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE; + return ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE; case ValueKind::HiddenCompletionAction: - return ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION; + return ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION; case ValueKind::HiddenNone: - return ROC_ARGTYPE_HIDDEN_NONE; + return ROC_ARGTYPE_HIDDEN_NONE; default: - return ROC_ARGTYPE_ERROR; - } + return ROC_ARGTYPE_ERROR; + } } -#endif // defined(WITH_LIGHTNING_COMPILER) +#endif // defined(WITH_LIGHTNING_COMPILER) -static inline ROC_ARG_TYPE -GetKernelArgType(const aclArgData* argInfo) -{ - if (argInfo->argStr[0] == '_' && argInfo->argStr[1] == '.') { - if (strcmp(&argInfo->argStr[2], "global_offset_0") == 0) { - return ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X; - } - else if (strcmp(&argInfo->argStr[2], "global_offset_1") == 0) { - return ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y; - } - else if (strcmp(&argInfo->argStr[2], "global_offset_2") == 0) { - return ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z; - } - else if (strcmp(&argInfo->argStr[2], "printf_buffer") == 0) { - return ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER; - } - else if (strcmp(&argInfo->argStr[2], "vqueue_pointer") == 0) { - return ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE; - } - else if (strcmp(&argInfo->argStr[2], "aqlwrap_pointer") == 0) { - return ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION; - } - return ROC_ARGTYPE_HIDDEN_NONE; +static inline ROC_ARG_TYPE GetKernelArgType(const aclArgData* argInfo) { + if (argInfo->argStr[0] == '_' && argInfo->argStr[1] == '.') { + if (strcmp(&argInfo->argStr[2], "global_offset_0") == 0) { + return ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X; + } else if (strcmp(&argInfo->argStr[2], "global_offset_1") == 0) { + return ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y; + } else if (strcmp(&argInfo->argStr[2], "global_offset_2") == 0) { + 
return ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z; + } else if (strcmp(&argInfo->argStr[2], "printf_buffer") == 0) { + return ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER; + } else if (strcmp(&argInfo->argStr[2], "vqueue_pointer") == 0) { + return ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE; + } else if (strcmp(&argInfo->argStr[2], "aqlwrap_pointer") == 0) { + return ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION; } + return ROC_ARGTYPE_HIDDEN_NONE; + } - switch (argInfo->type) { + switch (argInfo->type) { case ARG_TYPE_POINTER: - return ROC_ARGTYPE_POINTER; + return ROC_ARGTYPE_POINTER; case ARG_TYPE_VALUE: - return (argInfo->arg.value.data == DATATYPE_struct) - ? ROC_ARGTYPE_REFERENCE : ROC_ARGTYPE_VALUE; + return (argInfo->arg.value.data == DATATYPE_struct) ? ROC_ARGTYPE_REFERENCE + : ROC_ARGTYPE_VALUE; case ARG_TYPE_IMAGE: - return ROC_ARGTYPE_IMAGE; + return ROC_ARGTYPE_IMAGE; case ARG_TYPE_SAMPLER: - return ROC_ARGTYPE_SAMPLER; + return ROC_ARGTYPE_SAMPLER; case ARG_TYPE_ERROR: default: - return ROC_ARGTYPE_ERROR; - } + return ROC_ARGTYPE_ERROR; + } } #if defined(WITH_LIGHTNING_COMPILER) -static inline size_t -GetKernelArgAlignment(const KernelArgMD& lcArg) -{ - return lcArg.mAlign; -} -#endif // defined(WITH_LIGHTNING_COMPILER) +static inline size_t GetKernelArgAlignment(const KernelArgMD& lcArg) { return lcArg.mAlign; } +#endif // defined(WITH_LIGHTNING_COMPILER) -static inline size_t -GetKernelArgAlignment(const aclArgData* argInfo) -{ - switch (argInfo->type) { +static inline size_t GetKernelArgAlignment(const aclArgData* argInfo) { + switch (argInfo->type) { case ARG_TYPE_POINTER: - return sizeof(void*); + return sizeof(void*); case ARG_TYPE_VALUE: - switch (argInfo->arg.value.data) { + switch (argInfo->arg.value.data) { case DATATYPE_i8: case DATATYPE_u8: - return 1; + return 1; case DATATYPE_u16: case DATATYPE_i16: case DATATYPE_f16: - return 2; + return 2; case DATATYPE_u32: case DATATYPE_i32: case DATATYPE_f32: - return 4; + return 4; case DATATYPE_i64: case DATATYPE_u64: case DATATYPE_f64: 
- return 8; + return 8; case DATATYPE_struct: - return 128; + return 128; case DATATYPE_ERROR: default: - return -1; - } - case ARG_TYPE_IMAGE: return sizeof(cl_mem); - case ARG_TYPE_SAMPLER: return sizeof(cl_sampler); - default: return -1; - } + return -1; + } + case ARG_TYPE_IMAGE: + return sizeof(cl_mem); + case ARG_TYPE_SAMPLER: + return sizeof(cl_sampler); + default: + return -1; + } } #if defined(WITH_LIGHTNING_COMPILER) -static inline size_t -GetKernelArgPointeeAlignment(const KernelArgMD& lcArg) -{ - if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) { - uint32_t align = lcArg.mPointeeAlign; - if (align == 0) { - LogWarning("Missing DynamicSharedPointer alignment"); - align = 128; /* worst case alignment */; - } - return align; +static inline size_t GetKernelArgPointeeAlignment(const KernelArgMD& lcArg) { + if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) { + uint32_t align = lcArg.mPointeeAlign; + if (align == 0) { + LogWarning("Missing DynamicSharedPointer alignment"); + align = 128; /* worst case alignment */ + ; } - return 1; + return align; + } + return 1; } -#endif // defined(WITH_LIGHTNING_COMPILER) +#endif // defined(WITH_LIGHTNING_COMPILER) -static inline size_t -GetKernelArgPointeeAlignment(const aclArgData* argInfo) -{ - if (argInfo->type == ARG_TYPE_POINTER) { - return argInfo->arg.pointer.align; - } - return 1; +static inline size_t GetKernelArgPointeeAlignment(const aclArgData* argInfo) { + if (argInfo->type == ARG_TYPE_POINTER) { + return argInfo->arg.pointer.align; + } + return 1; } #if defined(WITH_LIGHTNING_COMPILER) -static inline ROC_ACCESS_TYPE -GetKernelArgAccessType(const KernelArgMD& lcArg) -{ - if (lcArg.mValueKind == ValueKind::GlobalBuffer || - lcArg.mValueKind == ValueKind::Image) { - switch (lcArg.mAccQual) { - case AccessQualifier::ReadOnly: - return ROC_ACCESS_TYPE_RO; - case AccessQualifier::WriteOnly: - return ROC_ACCESS_TYPE_WO; - case AccessQualifier::ReadWrite: - default: - return ROC_ACCESS_TYPE_RW; - } - } 
- return ROC_ACCESS_TYPE_NONE; -} -#endif // defined(WITH_LIGHTNING_COMPILER) - -static inline ROC_ACCESS_TYPE -GetKernelArgAccessType(const aclArgData* argInfo) -{ - aclAccessType accessType; - - if (argInfo->type == ARG_TYPE_POINTER) { - accessType = argInfo->arg.pointer.type; - } - else if (argInfo->type == ARG_TYPE_IMAGE) { - accessType = argInfo->arg.image.type; - } - else { - return ROC_ACCESS_TYPE_NONE; - } - if (accessType == ACCESS_TYPE_RO) { +static inline ROC_ACCESS_TYPE GetKernelArgAccessType(const KernelArgMD& lcArg) { + if (lcArg.mValueKind == ValueKind::GlobalBuffer || lcArg.mValueKind == ValueKind::Image) { + switch (lcArg.mAccQual) { + case AccessQualifier::ReadOnly: return ROC_ACCESS_TYPE_RO; - } - else if (accessType == ACCESS_TYPE_WO) { + case AccessQualifier::WriteOnly: return ROC_ACCESS_TYPE_WO; + case AccessQualifier::ReadWrite: + default: + return ROC_ACCESS_TYPE_RW; } + } + return ROC_ACCESS_TYPE_NONE; +} +#endif // defined(WITH_LIGHTNING_COMPILER) - return ROC_ACCESS_TYPE_RW; +static inline ROC_ACCESS_TYPE GetKernelArgAccessType(const aclArgData* argInfo) { + aclAccessType accessType; + + if (argInfo->type == ARG_TYPE_POINTER) { + accessType = argInfo->arg.pointer.type; + } else if (argInfo->type == ARG_TYPE_IMAGE) { + accessType = argInfo->arg.image.type; + } else { + return ROC_ACCESS_TYPE_NONE; + } + if (accessType == ACCESS_TYPE_RO) { + return ROC_ACCESS_TYPE_RO; + } else if (accessType == ACCESS_TYPE_WO) { + return ROC_ACCESS_TYPE_WO; + } + + return ROC_ACCESS_TYPE_RW; } #if defined(WITH_LIGHTNING_COMPILER) -static inline ROC_ADDRESS_QUALIFIER -GetKernelAddrQual(const KernelArgMD& lcArg) -{ - if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) { - return ROC_ADDRESS_LOCAL; +static inline ROC_ADDRESS_QUALIFIER GetKernelAddrQual(const KernelArgMD& lcArg) { + if (lcArg.mValueKind == ValueKind::DynamicSharedPointer) { + return ROC_ADDRESS_LOCAL; + } else if (lcArg.mValueKind == ValueKind::GlobalBuffer) { + if (lcArg.mAddrSpaceQual == 
AddressSpaceQualifier::Global) { + return ROC_ADDRESS_GLOBAL; + } else if (lcArg.mAddrSpaceQual == AddressSpaceQualifier::Constant) { + return ROC_ADDRESS_CONSTANT; } - else if (lcArg.mValueKind == ValueKind::GlobalBuffer) { - if (lcArg.mAddrSpaceQual == AddressSpaceQualifier::Global) { - return ROC_ADDRESS_GLOBAL; - } - else if (lcArg.mAddrSpaceQual == AddressSpaceQualifier::Constant) { - return ROC_ADDRESS_CONSTANT; - } + LogError("Unsupported address type"); + return ROC_ADDRESS_ERROR; + } else if (lcArg.mValueKind == ValueKind::Image || lcArg.mValueKind == ValueKind::Sampler) { + return ROC_ADDRESS_GLOBAL; + } + return ROC_ADDRESS_ERROR; +} +#endif // defined(WITH_LIGHTNING_COMPILER) + +static inline ROC_ADDRESS_QUALIFIER GetKernelAddrQual(const aclArgData* argInfo) { + if (argInfo->type == ARG_TYPE_POINTER) { + switch (argInfo->arg.pointer.memory) { + case PTR_MT_CONSTANT_EMU: + case PTR_MT_UAV_CONSTANT: + case PTR_MT_CONSTANT: + return ROC_ADDRESS_CONSTANT; + case PTR_MT_UAV: + case PTR_MT_GLOBAL: + return ROC_ADDRESS_GLOBAL; + case PTR_MT_LDS_EMU: + case PTR_MT_LDS: + return ROC_ADDRESS_LOCAL; + case PTR_MT_ERROR: + default: LogError("Unsupported address type"); return ROC_ADDRESS_ERROR; } - else if (lcArg.mValueKind == ValueKind::Image - || lcArg.mValueKind == ValueKind::Sampler) { - return ROC_ADDRESS_GLOBAL; - } - return ROC_ADDRESS_ERROR; -} -#endif // defined(WITH_LIGHTNING_COMPILER) - -static inline ROC_ADDRESS_QUALIFIER -GetKernelAddrQual(const aclArgData* argInfo) -{ - if (argInfo->type == ARG_TYPE_POINTER) { - switch (argInfo->arg.pointer.memory) { - case PTR_MT_CONSTANT_EMU: - case PTR_MT_UAV_CONSTANT: - case PTR_MT_CONSTANT: - return ROC_ADDRESS_CONSTANT; - case PTR_MT_UAV: - case PTR_MT_GLOBAL: - return ROC_ADDRESS_GLOBAL; - case PTR_MT_LDS_EMU: - case PTR_MT_LDS: - return ROC_ADDRESS_LOCAL; - case PTR_MT_ERROR: - default: - LogError("Unsupported address type"); - return ROC_ADDRESS_ERROR; - } - } - else if ((argInfo->type == ARG_TYPE_IMAGE) || - 
(argInfo->type == ARG_TYPE_SAMPLER)) { - return ROC_ADDRESS_GLOBAL; - } - return ROC_ADDRESS_ERROR; + } else if ((argInfo->type == ARG_TYPE_IMAGE) || (argInfo->type == ARG_TYPE_SAMPLER)) { + return ROC_ADDRESS_GLOBAL; + } + return ROC_ADDRESS_ERROR; } #if defined(WITH_LIGHTNING_COMPILER) -static inline ROC_DATA_TYPE -GetKernelDataType(const KernelArgMD& lcArg) -{ - aclArgDataType dataType; +static inline ROC_DATA_TYPE GetKernelDataType(const KernelArgMD& lcArg) { + aclArgDataType dataType; - if (lcArg.mValueKind != ValueKind::ByValue) { - return ROC_DATATYPE_ERROR; - } + if (lcArg.mValueKind != ValueKind::ByValue) { + return ROC_DATATYPE_ERROR; + } - switch (lcArg.mValueType) { + switch (lcArg.mValueType) { case ValueType::I8: - return ROC_DATATYPE_S8; + return ROC_DATATYPE_S8; case ValueType::I16: - return ROC_DATATYPE_S16; + return ROC_DATATYPE_S16; case ValueType::I32: - return ROC_DATATYPE_S32; + return ROC_DATATYPE_S32; case ValueType::I64: - return ROC_DATATYPE_S64; + return ROC_DATATYPE_S64; case ValueType::U8: - return ROC_DATATYPE_U8; + return ROC_DATATYPE_U8; case ValueType::U16: - return ROC_DATATYPE_U16; + return ROC_DATATYPE_U16; case ValueType::U32: - return ROC_DATATYPE_U32; + return ROC_DATATYPE_U32; case ValueType::U64: - return ROC_DATATYPE_U64; + return ROC_DATATYPE_U64; case ValueType::F16: - return ROC_DATATYPE_F16; + return ROC_DATATYPE_F16; case ValueType::F32: - return ROC_DATATYPE_F32; + return ROC_DATATYPE_F32; case ValueType::F64: - return ROC_DATATYPE_F64; + return ROC_DATATYPE_F64; case ValueType::Struct: - return ROC_DATATYPE_STRUCT; + return ROC_DATATYPE_STRUCT; default: - return ROC_DATATYPE_ERROR; - } + return ROC_DATATYPE_ERROR; + } } -#endif // defined(WITH_LIGHTNING_COMPILER) +#endif // defined(WITH_LIGHTNING_COMPILER) /* f16 returns f32 - workaround due to comp lib */ -static inline ROC_DATA_TYPE -GetKernelDataType(const aclArgData* argInfo) -{ - aclArgDataType dataType; +static inline ROC_DATA_TYPE GetKernelDataType(const 
aclArgData* argInfo) { + aclArgDataType dataType; - if (argInfo->type == ARG_TYPE_POINTER) { - dataType = argInfo->arg.pointer.data; - } - else if (argInfo->type == ARG_TYPE_VALUE) { - dataType = argInfo->arg.value.data; - } - else { - return ROC_DATATYPE_ERROR; - } - switch (dataType) { + if (argInfo->type == ARG_TYPE_POINTER) { + dataType = argInfo->arg.pointer.data; + } else if (argInfo->type == ARG_TYPE_VALUE) { + dataType = argInfo->arg.value.data; + } else { + return ROC_DATATYPE_ERROR; + } + switch (dataType) { case DATATYPE_i1: - return ROC_DATATYPE_B1; + return ROC_DATATYPE_B1; case DATATYPE_i8: - return ROC_DATATYPE_S8; + return ROC_DATATYPE_S8; case DATATYPE_i16: - return ROC_DATATYPE_S16; + return ROC_DATATYPE_S16; case DATATYPE_i32: - return ROC_DATATYPE_S32; + return ROC_DATATYPE_S32; case DATATYPE_i64: - return ROC_DATATYPE_S64; + return ROC_DATATYPE_S64; case DATATYPE_u8: - return ROC_DATATYPE_U8; + return ROC_DATATYPE_U8; case DATATYPE_u16: - return ROC_DATATYPE_U16; + return ROC_DATATYPE_U16; case DATATYPE_u32: - return ROC_DATATYPE_U32; + return ROC_DATATYPE_U32; case DATATYPE_u64: - return ROC_DATATYPE_U64; + return ROC_DATATYPE_U64; case DATATYPE_f16: - return ROC_DATATYPE_F32; + return ROC_DATATYPE_F32; case DATATYPE_f32: - return ROC_DATATYPE_F32; + return ROC_DATATYPE_F32; case DATATYPE_f64: - return ROC_DATATYPE_F64; + return ROC_DATATYPE_F64; case DATATYPE_struct: - return ROC_DATATYPE_STRUCT; + return ROC_DATATYPE_STRUCT; case DATATYPE_opaque: - return ROC_DATATYPE_OPAQUE; + return ROC_DATATYPE_OPAQUE; case DATATYPE_ERROR: default: - return ROC_DATATYPE_ERROR; - } + return ROC_DATATYPE_ERROR; + } } -static inline int -GetKernelArgSize(const aclArgData* argInfo) -{ - switch (argInfo->type) { - case ARG_TYPE_POINTER: return sizeof(void *); - case ARG_TYPE_VALUE: - switch (argInfo->arg.value.data) { - case DATATYPE_i8: - case DATATYPE_u8: - case DATATYPE_struct: - return 1 * argInfo->arg.value.numElements; - case DATATYPE_u16: - case 
DATATYPE_i16: - case DATATYPE_f16: - return 2 * argInfo->arg.value.numElements; - case DATATYPE_u32: - case DATATYPE_i32: - case DATATYPE_f32: - return 4 * argInfo->arg.value.numElements; - case DATATYPE_i64: - case DATATYPE_u64: - case DATATYPE_f64: - return 8 * argInfo->arg.value.numElements; - case DATATYPE_ERROR: - default: return -1; - } - case ARG_TYPE_IMAGE: return sizeof(cl_mem); - case ARG_TYPE_SAMPLER: return sizeof(cl_sampler); - default: return -1; - } -} - -static inline clk_value_type_t -GetOclType(const Kernel::Argument* arg) -{ - static const clk_value_type_t ClkValueMapType[6][6] = { - { T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16 }, - { T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16 }, - { T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16 }, - { T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16 }, - { T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16 }, - { T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16 }, - }; - - uint sizeType; - uint numElements; - if (arg->type_ == ROC_ARGTYPE_POINTER || arg->type_ == ROC_ARGTYPE_IMAGE) { - return T_POINTER; - } - else if (arg->type_ == ROC_ARGTYPE_VALUE - || arg->type_ == ROC_ARGTYPE_REFERENCE) { - switch (arg->dataType_) { - case ROC_DATATYPE_S8: - case ROC_DATATYPE_U8: - sizeType = 0; - numElements = arg->size_; - break; - case ROC_DATATYPE_S16: - case ROC_DATATYPE_U16: - sizeType = 1; - numElements = arg->size_ / 2; - break; - case ROC_DATATYPE_S32: - case ROC_DATATYPE_U32: - sizeType = 2; - numElements = arg->size_ / 4; - break; - case ROC_DATATYPE_S64: - case ROC_DATATYPE_U64: - sizeType = 3; - numElements = arg->size_ / 8; - break; - case ROC_DATATYPE_F16: - sizeType = 4; - numElements = arg->size_ / 2; - break; - case ROC_DATATYPE_F32: - sizeType = 4; - numElements = arg->size_ / 4; - break; - case ROC_DATATYPE_F64: - sizeType = 5; - numElements = arg->size_ / 8; - break; +static inline int GetKernelArgSize(const aclArgData* argInfo) { + switch 
(argInfo->type) { + case ARG_TYPE_POINTER: + return sizeof(void*); + case ARG_TYPE_VALUE: + switch (argInfo->arg.value.data) { + case DATATYPE_i8: + case DATATYPE_u8: + case DATATYPE_struct: + return 1 * argInfo->arg.value.numElements; + case DATATYPE_u16: + case DATATYPE_i16: + case DATATYPE_f16: + return 2 * argInfo->arg.value.numElements; + case DATATYPE_u32: + case DATATYPE_i32: + case DATATYPE_f32: + return 4 * argInfo->arg.value.numElements; + case DATATYPE_i64: + case DATATYPE_u64: + case DATATYPE_f64: + return 8 * argInfo->arg.value.numElements; + case DATATYPE_ERROR: default: - return T_VOID; - } + return -1; + } + case ARG_TYPE_IMAGE: + return sizeof(cl_mem); + case ARG_TYPE_SAMPLER: + return sizeof(cl_sampler); + default: + return -1; + } +} - switch (numElements) { - case 1: return ClkValueMapType[sizeType][0]; - case 2: return ClkValueMapType[sizeType][1]; - case 3: return ClkValueMapType[sizeType][2]; - case 4: return ClkValueMapType[sizeType][3]; - case 8: return ClkValueMapType[sizeType][4]; - case 16: return ClkValueMapType[sizeType][5]; - default: return T_VOID; - } - } - else if (arg->type_ == ROC_ARGTYPE_SAMPLER) { - return T_SAMPLER; - } - else { +static inline clk_value_type_t GetOclType(const Kernel::Argument* arg) { + static const clk_value_type_t ClkValueMapType[6][6] = { + {T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16}, + {T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16}, + {T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16}, + {T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16}, + {T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16}, + {T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16}, + }; + + uint sizeType; + uint numElements; + if (arg->type_ == ROC_ARGTYPE_POINTER || arg->type_ == ROC_ARGTYPE_IMAGE) { + return T_POINTER; + } else if (arg->type_ == ROC_ARGTYPE_VALUE || arg->type_ == ROC_ARGTYPE_REFERENCE) { + switch (arg->dataType_) { + case ROC_DATATYPE_S8: + case ROC_DATATYPE_U8: + sizeType 
= 0; + numElements = arg->size_; + break; + case ROC_DATATYPE_S16: + case ROC_DATATYPE_U16: + sizeType = 1; + numElements = arg->size_ / 2; + break; + case ROC_DATATYPE_S32: + case ROC_DATATYPE_U32: + sizeType = 2; + numElements = arg->size_ / 4; + break; + case ROC_DATATYPE_S64: + case ROC_DATATYPE_U64: + sizeType = 3; + numElements = arg->size_ / 8; + break; + case ROC_DATATYPE_F16: + sizeType = 4; + numElements = arg->size_ / 2; + break; + case ROC_DATATYPE_F32: + sizeType = 4; + numElements = arg->size_ / 4; + break; + case ROC_DATATYPE_F64: + sizeType = 5; + numElements = arg->size_ / 8; + break; + default: return T_VOID; } + + switch (numElements) { + case 1: + return ClkValueMapType[sizeType][0]; + case 2: + return ClkValueMapType[sizeType][1]; + case 3: + return ClkValueMapType[sizeType][2]; + case 4: + return ClkValueMapType[sizeType][3]; + case 8: + return ClkValueMapType[sizeType][4]; + case 16: + return ClkValueMapType[sizeType][5]; + default: + return T_VOID; + } + } else if (arg->type_ == ROC_ARGTYPE_SAMPLER) { + return T_SAMPLER; + } else { + return T_VOID; + } } -static inline cl_kernel_arg_address_qualifier -GetOclAddrQual(const Kernel::Argument* arg) -{ - if (arg->type_ == ROC_ARGTYPE_POINTER) { - switch (arg->addrQual_) { - case ROC_ADDRESS_GLOBAL: - return CL_KERNEL_ARG_ADDRESS_GLOBAL; - case ROC_ADDRESS_CONSTANT: - return CL_KERNEL_ARG_ADDRESS_CONSTANT; - case ROC_ADDRESS_LOCAL: - return CL_KERNEL_ARG_ADDRESS_LOCAL; - default: - return CL_KERNEL_ARG_ADDRESS_PRIVATE; - } - } - else if (arg->type_ == ROC_ARGTYPE_IMAGE) { +static inline cl_kernel_arg_address_qualifier GetOclAddrQual(const Kernel::Argument* arg) { + if (arg->type_ == ROC_ARGTYPE_POINTER) { + switch (arg->addrQual_) { + case ROC_ADDRESS_GLOBAL: return CL_KERNEL_ARG_ADDRESS_GLOBAL; + case ROC_ADDRESS_CONSTANT: + return CL_KERNEL_ARG_ADDRESS_CONSTANT; + case ROC_ADDRESS_LOCAL: + return CL_KERNEL_ARG_ADDRESS_LOCAL; + default: + return CL_KERNEL_ARG_ADDRESS_PRIVATE; } - //default for 
all other cases - return CL_KERNEL_ARG_ADDRESS_PRIVATE; + } else if (arg->type_ == ROC_ARGTYPE_IMAGE) { + return CL_KERNEL_ARG_ADDRESS_GLOBAL; + } + // default for all other cases + return CL_KERNEL_ARG_ADDRESS_PRIVATE; } -static inline cl_kernel_arg_access_qualifier -GetOclAccessQual(const Kernel::Argument* arg) -{ - if (arg->type_ == ROC_ARGTYPE_IMAGE) { - switch (arg->access_) { - case ROC_ACCESS_TYPE_RO: - return CL_KERNEL_ARG_ACCESS_READ_ONLY; - case ROC_ACCESS_TYPE_WO: - return CL_KERNEL_ARG_ACCESS_WRITE_ONLY; - case ROC_ACCESS_TYPE_RW: - return CL_KERNEL_ARG_ACCESS_READ_WRITE; - default: - return CL_KERNEL_ARG_ACCESS_NONE; - } +static inline cl_kernel_arg_access_qualifier GetOclAccessQual(const Kernel::Argument* arg) { + if (arg->type_ == ROC_ARGTYPE_IMAGE) { + switch (arg->access_) { + case ROC_ACCESS_TYPE_RO: + return CL_KERNEL_ARG_ACCESS_READ_ONLY; + case ROC_ACCESS_TYPE_WO: + return CL_KERNEL_ARG_ACCESS_WRITE_ONLY; + case ROC_ACCESS_TYPE_RW: + return CL_KERNEL_ARG_ACCESS_READ_WRITE; + default: + return CL_KERNEL_ARG_ACCESS_NONE; } - return CL_KERNEL_ARG_ACCESS_NONE; + } + return CL_KERNEL_ARG_ACCESS_NONE; } #if defined(WITH_LIGHTNING_COMPILER) -static inline cl_kernel_arg_type_qualifier -GetOclTypeQual(const KernelArgMD& lcArg) -{ - cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE; - if (lcArg.mValueKind == ValueKind::GlobalBuffer || - lcArg.mValueKind == ValueKind::DynamicSharedPointer) { - if (lcArg.mIsVolatile) { - rv |= CL_KERNEL_ARG_TYPE_VOLATILE; - } - if (lcArg.mIsRestrict) { - rv |= CL_KERNEL_ARG_TYPE_RESTRICT; - } - if (lcArg.mIsConst) { - rv |= CL_KERNEL_ARG_TYPE_CONST; - } +static inline cl_kernel_arg_type_qualifier GetOclTypeQual(const KernelArgMD& lcArg) { + cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE; + if (lcArg.mValueKind == ValueKind::GlobalBuffer || + lcArg.mValueKind == ValueKind::DynamicSharedPointer) { + if (lcArg.mIsVolatile) { + rv |= CL_KERNEL_ARG_TYPE_VOLATILE; } - return rv; + if (lcArg.mIsRestrict) { + rv 
|= CL_KERNEL_ARG_TYPE_RESTRICT; + } + if (lcArg.mIsConst) { + rv |= CL_KERNEL_ARG_TYPE_CONST; + } + } + return rv; } -#endif // defined(WITH_LIGHTNING_COMPILER) +#endif // defined(WITH_LIGHTNING_COMPILER) -static inline cl_kernel_arg_type_qualifier -GetOclTypeQual(const aclArgData* argInfo) -{ - cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE; - if (argInfo->type == ARG_TYPE_POINTER) { - if (argInfo->arg.pointer.isVolatile) { - rv |= CL_KERNEL_ARG_TYPE_VOLATILE; - } - if (argInfo->arg.pointer.isRestrict) { - rv |= CL_KERNEL_ARG_TYPE_RESTRICT; - } - if (argInfo->isConst) { - rv |= CL_KERNEL_ARG_TYPE_CONST; - } - switch (argInfo->arg.pointer.memory) { - case PTR_MT_CONSTANT: - case PTR_MT_UAV_CONSTANT: - case PTR_MT_CONSTANT_EMU: - rv |= CL_KERNEL_ARG_TYPE_CONST; - break; - default: - break; - } +static inline cl_kernel_arg_type_qualifier GetOclTypeQual(const aclArgData* argInfo) { + cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE; + if (argInfo->type == ARG_TYPE_POINTER) { + if (argInfo->arg.pointer.isVolatile) { + rv |= CL_KERNEL_ARG_TYPE_VOLATILE; } - return rv; + if (argInfo->arg.pointer.isRestrict) { + rv |= CL_KERNEL_ARG_TYPE_RESTRICT; + } + if (argInfo->isConst) { + rv |= CL_KERNEL_ARG_TYPE_CONST; + } + switch (argInfo->arg.pointer.memory) { + case PTR_MT_CONSTANT: + case PTR_MT_UAV_CONSTANT: + case PTR_MT_CONSTANT_EMU: + rv |= CL_KERNEL_ARG_TYPE_CONST; + break; + default: + break; + } + } + return rv; } -void -Kernel::initArguments(const aclArgData* aclArg) -{ - device::Kernel::parameters_t params; +void Kernel::initArguments(const aclArgData* aclArg) { + device::Kernel::parameters_t params; - // Iterate through the arguments and insert into parameterList - for (size_t offset = 0; aclArg->struct_size != 0; aclArg++) { + // Iterate through the arguments and insert into parameterList + for (size_t offset = 0; aclArg->struct_size != 0; aclArg++) { + // Initialize HSAIL kernel argument + Kernel::Argument* arg = new Kernel::Argument; + 
arg->name_ = aclArg->argStr; + arg->typeName_ = aclArg->typeStr; + arg->size_ = GetKernelArgSize(aclArg); + arg->type_ = GetKernelArgType(aclArg); + arg->addrQual_ = GetKernelAddrQual(aclArg); + arg->dataType_ = GetKernelDataType(aclArg); + arg->alignment_ = GetKernelArgAlignment(aclArg); + arg->access_ = GetKernelArgAccessType(aclArg); + arg->pointeeAlignment_ = GetKernelArgPointeeAlignment(aclArg); - // Initialize HSAIL kernel argument - Kernel::Argument* arg = new Kernel::Argument; - arg->name_ = aclArg->argStr; - arg->typeName_ = aclArg->typeStr; - arg->size_ = GetKernelArgSize(aclArg); - arg->type_ = GetKernelArgType(aclArg); - arg->addrQual_ = GetKernelAddrQual(aclArg); - arg->dataType_ = GetKernelDataType(aclArg); - arg->alignment_ = GetKernelArgAlignment(aclArg); - arg->access_ = GetKernelArgAccessType(aclArg); - arg->pointeeAlignment_ = GetKernelArgPointeeAlignment(aclArg); + bool isHidden = arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X || + arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y || + arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z || + arg->type_ == ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER || + arg->type_ == ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE || + arg->type_ == ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION || arg->type_ == ROC_ARGTYPE_HIDDEN_NONE; - bool isHidden = arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X - || arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y - || arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z - || arg->type_ == ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER - || arg->type_ == ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE - || arg->type_ == ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION - || arg->type_ == ROC_ARGTYPE_HIDDEN_NONE; + arg->index_ = isHidden ? uint(-1) : params.size(); + hsailArgList_.push_back(arg); - arg->index_ = isHidden ? 
uint(-1) : params.size(); - hsailArgList_.push_back(arg); - - if (isHidden) { - continue; - } - - amd::KernelParameterDescriptor desc; - desc.name_ = arg->name_.c_str(); - desc.type_ = GetOclType(arg); - desc.addressQualifier_ = GetOclAddrQual(arg); - desc.accessQualifier_ = GetOclAccessQual(arg); - desc.typeQualifier_ = GetOclTypeQual(aclArg); - desc.typeName_ = arg->typeName_.c_str(); - - // Make a check if it is local or global - if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) { - desc.size_ = 0; - } - else { - desc.size_ = arg->size_; - } - - // Make offset alignment to match CPU metadata, since - // in multidevice config abstraction layer has a single signature - // and CPU sends the parameters as they are allocated in memory - size_t size = desc.size_; - if (size == 0) { - // Local memory for CPU - size = sizeof(cl_mem); - } - offset = amd::alignUp(offset, std::min(size, size_t(16))); - desc.offset_ = offset; - offset += amd::alignUp(size, sizeof(uint32_t)); - - params.push_back(desc); + if (isHidden) { + continue; } - createSignature(params); + + amd::KernelParameterDescriptor desc; + desc.name_ = arg->name_.c_str(); + desc.type_ = GetOclType(arg); + desc.addressQualifier_ = GetOclAddrQual(arg); + desc.accessQualifier_ = GetOclAccessQual(arg); + desc.typeQualifier_ = GetOclTypeQual(aclArg); + desc.typeName_ = arg->typeName_.c_str(); + + // Make a check if it is local or global + if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) { + desc.size_ = 0; + } else { + desc.size_ = arg->size_; + } + + // Make offset alignment to match CPU metadata, since + // in multidevice config abstraction layer has a single signature + // and CPU sends the parameters as they are allocated in memory + size_t size = desc.size_; + if (size == 0) { + // Local memory for CPU + size = sizeof(cl_mem); + } + offset = amd::alignUp(offset, std::min(size, size_t(16))); + desc.offset_ = offset; + offset += amd::alignUp(size, sizeof(uint32_t)); + + params.push_back(desc); 
+ } + createSignature(params); } #if defined(WITH_LIGHTNING_COMPILER) -void -Kernel::initArguments_LC(const KernelMD& kernelMD) -{ - device::Kernel::parameters_t params; +void Kernel::initArguments_LC(const KernelMD& kernelMD) { + device::Kernel::parameters_t params; - size_t offset = 0; + size_t offset = 0; - for (size_t i = 0; i < kernelMD.mArgs.size(); ++i) { - const KernelArgMD& lcArg = kernelMD.mArgs[i]; + for (size_t i = 0; i < kernelMD.mArgs.size(); ++i) { + const KernelArgMD& lcArg = kernelMD.mArgs[i]; - // Initialize HSAIL kernel argument - Kernel::Argument* arg = new Kernel::Argument; - arg->name_ = lcArg.mName; - arg->typeName_ = lcArg.mTypeName; - arg->size_ = lcArg.mSize; - arg->type_ = GetKernelArgType(lcArg); - arg->addrQual_ = GetKernelAddrQual(lcArg); - arg->dataType_ = GetKernelDataType(lcArg); - arg->alignment_ = GetKernelArgAlignment(lcArg); - arg->access_ = GetKernelArgAccessType(lcArg); - arg->pointeeAlignment_ = GetKernelArgPointeeAlignment(lcArg); + // Initialize HSAIL kernel argument + Kernel::Argument* arg = new Kernel::Argument; + arg->name_ = lcArg.mName; + arg->typeName_ = lcArg.mTypeName; + arg->size_ = lcArg.mSize; + arg->type_ = GetKernelArgType(lcArg); + arg->addrQual_ = GetKernelAddrQual(lcArg); + arg->dataType_ = GetKernelDataType(lcArg); + arg->alignment_ = GetKernelArgAlignment(lcArg); + arg->access_ = GetKernelArgAccessType(lcArg); + arg->pointeeAlignment_ = GetKernelArgPointeeAlignment(lcArg); - bool isHidden = arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X - || arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y - || arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z - || arg->type_ == ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER - || arg->type_ == ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE - || arg->type_ == ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION - || arg->type_ == ROC_ARGTYPE_HIDDEN_NONE; + bool isHidden = arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X || + arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y || + arg->type_ == 
ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z || + arg->type_ == ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER || + arg->type_ == ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE || + arg->type_ == ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION || arg->type_ == ROC_ARGTYPE_HIDDEN_NONE; - arg->index_ = isHidden ? uint(-1) : params.size(); - hsailArgList_.push_back(arg); + arg->index_ = isHidden ? uint(-1) : params.size(); + hsailArgList_.push_back(arg); - if (isHidden) { - continue; - } - - // Initialize Device kernel parameters - amd::KernelParameterDescriptor desc; - - desc.name_ = lcArg.mName.c_str(); - desc.type_ = GetOclType(arg); - desc.addressQualifier_ = GetOclAddrQual(arg); - desc.accessQualifier_ = GetOclAccessQual(arg); - desc.typeQualifier_ = GetOclTypeQual(lcArg); - desc.typeName_ = lcArg.mTypeName.c_str(); - - // Make a check if it is local or global - if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) { - desc.size_ = 0; - } - else { - desc.size_ = arg->size_; - } - - // Make offset alignment to match CPU metadata, since - // in multidevice config abstraction layer has a single signature - // and CPU sends the parameters as they are allocated in memory - size_t size = desc.size_; - if (size == 0) { - // Local memory for CPU - size = sizeof(cl_mem); - } - offset = (size_t) amd::alignUp(offset, std::min(size, size_t(16))); - desc.offset_ = offset; - offset += amd::alignUp(size, sizeof(uint32_t)); - - params.push_back(desc); + if (isHidden) { + continue; } - createSignature(params); -} -#endif // defined(WITH_LIGHTNING_COMPILER) + // Initialize Device kernel parameters + amd::KernelParameterDescriptor desc; -Kernel::Kernel( - std::string name, HSAILProgram* prog, - const uint64_t& kernelCodeHandle, + desc.name_ = lcArg.mName.c_str(); + desc.type_ = GetOclType(arg); + desc.addressQualifier_ = GetOclAddrQual(arg); + desc.accessQualifier_ = GetOclAccessQual(arg); + desc.typeQualifier_ = GetOclTypeQual(lcArg); + desc.typeName_ = lcArg.mTypeName.c_str(); + + // Make a check if it is local or global + 
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) { + desc.size_ = 0; + } else { + desc.size_ = arg->size_; + } + + // Make offset alignment to match CPU metadata, since + // in multidevice config abstraction layer has a single signature + // and CPU sends the parameters as they are allocated in memory + size_t size = desc.size_; + if (size == 0) { + // Local memory for CPU + size = sizeof(cl_mem); + } + offset = (size_t)amd::alignUp(offset, std::min(size, size_t(16))); + desc.offset_ = offset; + offset += amd::alignUp(size, sizeof(uint32_t)); + + params.push_back(desc); + } + + createSignature(params); +} +#endif // defined(WITH_LIGHTNING_COMPILER) + +Kernel::Kernel(std::string name, HSAILProgram* prog, const uint64_t& kernelCodeHandle, const uint32_t workgroupGroupSegmentByteSize, - const uint32_t workitemPrivateSegmentByteSize, - const uint32_t kernargSegmentByteSize, + const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize, const uint32_t kernargSegmentAlignment) : device::Kernel(name), program_(prog), @@ -703,219 +647,188 @@ Kernel::Kernel( #if defined(WITH_LIGHTNING_COMPILER) -static const KernelMD* FindKernelMetadata(const CodeObjectMD* programMD, - const std::string& name) { - for (const KernelMD& kernelMD : programMD->mKernels) { - if (kernelMD.mName == name) { return &kernelMD; } +static const KernelMD* FindKernelMetadata(const CodeObjectMD* programMD, const std::string& name) { + for (const KernelMD& kernelMD : programMD->mKernels) { + if (kernelMD.mName == name) { + return &kernelMD; } - return nullptr; + } + return nullptr; } -bool Kernel::init_LC() -{ - hsa_agent_t hsaDevice = program_->hsaDevice(); +bool Kernel::init_LC() { + hsa_agent_t hsaDevice = program_->hsaDevice(); - // Pull out metadata from the ELF - const CodeObjectMD* programMD = program_->metadata(); - assert(programMD != nullptr); + // Pull out metadata from the ELF + const CodeObjectMD* programMD = program_->metadata(); + assert(programMD != 
nullptr); - const KernelMD* kernelMD = FindKernelMetadata(programMD, name()); - if (kernelMD == nullptr) { - return false; - } - initArguments_LC(*kernelMD); - - //Set the workgroup information for the kernel - memset(&workGroupInfo_, 0, sizeof(workGroupInfo_)); - workGroupInfo_.availableLDSSize_ = program_->dev().info().localMemSizePerCU_; - assert(workGroupInfo_.availableLDSSize_ > 0); - workGroupInfo_.availableSGPRs_ = 0; - workGroupInfo_.availableVGPRs_ = 0; - - if (!kernelMD->mAttrs.mReqdWorkGroupSize.empty()) { - const auto& requiredWorkgroupSize = kernelMD->mAttrs.mReqdWorkGroupSize; - workGroupInfo_.compileSize_[0] = requiredWorkgroupSize[0]; - workGroupInfo_.compileSize_[1] = requiredWorkgroupSize[1]; - workGroupInfo_.compileSize_[2] = requiredWorkgroupSize[2]; - } - - if (!kernelMD->mAttrs.mWorkGroupSizeHint.empty()) { - const auto& workgroupSizeHint = kernelMD->mAttrs.mWorkGroupSizeHint; - workGroupInfo_.compileSizeHint_[0] = workgroupSizeHint[0]; - workGroupInfo_.compileSizeHint_[1] = workgroupSizeHint[1]; - workGroupInfo_.compileSizeHint_[2] = workgroupSizeHint[2]; - } - - if (!kernelMD->mAttrs.mVecTypeHint.empty()) { - workGroupInfo_.compileVecTypeHint_ = - kernelMD->mAttrs.mVecTypeHint.c_str(); - } - - uint32_t wavefront_size = 0; - if (hsa_agent_get_info( - program_->hsaDevice(), - HSA_AGENT_INFO_WAVEFRONT_SIZE, - &wavefront_size) != HSA_STATUS_SUCCESS) { - return false; - } - assert(wavefront_size > 0); - - workGroupInfo_.privateMemSize_ = workitemPrivateSegmentByteSize_; - workGroupInfo_.localMemSize_ = workgroupGroupSegmentByteSize_; - workGroupInfo_.usedLDSSize_ = workgroupGroupSegmentByteSize_; - - workGroupInfo_.preferredSizeMultiple_ = wavefront_size; - - /// TODO: Are there any other fields that are getting queried from akc? - /// If so, code properties metadata should be used instead. 
- workGroupInfo_.usedSGPRs_ = kernelMD->mCodeProps.mWavefrontNumSGPRs; - workGroupInfo_.usedVGPRs_ = kernelMD->mCodeProps.mWorkitemNumVGPRs; - - workGroupInfo_.usedStackSize_ = 0; - - workGroupInfo_.wavefrontPerSIMD_ = - program_->dev().info().maxWorkItemSizes_[0] / wavefront_size; - - workGroupInfo_.wavefrontSize_ = wavefront_size; - - if (workGroupInfo_.compileSize_[0] != 0) { - workGroupInfo_.size_ = - workGroupInfo_.compileSize_[0] * - workGroupInfo_.compileSize_[1] * - workGroupInfo_.compileSize_[2]; - } - else { - workGroupInfo_.size_ = program_->dev().info().maxWorkGroupSize_; - } - - initPrintf_LC(programMD->mPrintf); - - return true; -} -#endif // defined(WITH_LIGHTNING_COMPILER) - -bool Kernel::init() -{ -#if defined(WITH_LIGHTNING_COMPILER) - return init_LC(); -#else // !defined(WITH_LIGHTNING_COMPILER) - acl_error errorCode; - //compile kernel down to ISA - hsa_agent_t hsaDevice = program_->hsaDevice(); - // Pull out metadata from the ELF - size_t sizeOfArgList; - aclCompiler* compileHandle = program_->dev().compiler(); - std::string openClKernelName("&__OpenCL_" + name() + "_kernel"); - errorCode = g_complibApi._aclQueryInfo(compileHandle, - program_->binaryElf(), - RT_ARGUMENT_ARRAY, - openClKernelName.c_str(), - nullptr, - &sizeOfArgList); - if (errorCode != ACL_SUCCESS) { - return false; - } - std::unique_ptr argList(new char[sizeOfArgList]); - errorCode = g_complibApi._aclQueryInfo(compileHandle, - program_->binaryElf(), - RT_ARGUMENT_ARRAY, - openClKernelName.c_str(), - argList.get(), - &sizeOfArgList); - if (errorCode != ACL_SUCCESS) { - return false; - } - - //Set the argList - initArguments((const aclArgData *) argList.get()); - - //Set the workgroup information for the kernel - memset(&workGroupInfo_, 0, sizeof(workGroupInfo_)); - workGroupInfo_.availableLDSSize_ = program_->dev().info().localMemSizePerCU_; - assert(workGroupInfo_.availableLDSSize_ > 0); - workGroupInfo_.availableSGPRs_ = 0; - workGroupInfo_.availableVGPRs_ = 0; - size_t 
sizeOfWorkGroupSize; - errorCode = g_complibApi._aclQueryInfo(compileHandle, - program_->binaryElf(), - RT_WORK_GROUP_SIZE, - openClKernelName.c_str(), - nullptr, - &sizeOfWorkGroupSize); - if (errorCode != ACL_SUCCESS) { - return false; - } - errorCode = g_complibApi._aclQueryInfo(compileHandle, - program_->binaryElf(), - RT_WORK_GROUP_SIZE, - openClKernelName.c_str(), - workGroupInfo_.compileSize_, - &sizeOfWorkGroupSize); - if (errorCode != ACL_SUCCESS) { - return false; - } - - uint32_t wavefront_size = 0; - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info( - program_->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, - &wavefront_size)) { - return false; - } - assert(wavefront_size > 0); - - // Setting it the same as used LDS. - workGroupInfo_.localMemSize_ = workgroupGroupSegmentByteSize_; - workGroupInfo_.privateMemSize_ = workitemPrivateSegmentByteSize_; - workGroupInfo_.usedLDSSize_ = workgroupGroupSegmentByteSize_; - workGroupInfo_.preferredSizeMultiple_ = wavefront_size; - - // Query kernel header object to initialize the number of - // SGPR's and VGPR's used by the kernel - const void* kernelHostPtr = nullptr; - if (Device::loaderQueryHostAddress( - reinterpret_cast(kernelCodeHandle_), &kernelHostPtr - ) == HSA_STATUS_SUCCESS) { - auto akc = reinterpret_cast(kernelHostPtr); - workGroupInfo_.usedSGPRs_ = akc->wavefront_sgpr_count; - workGroupInfo_.usedVGPRs_ = akc->workitem_vgpr_count; - } - else { - workGroupInfo_.usedSGPRs_ = 0; - workGroupInfo_.usedVGPRs_ = 0; - } - - workGroupInfo_.usedStackSize_ = 0; - workGroupInfo_.wavefrontPerSIMD_ = - program_->dev().info().maxWorkItemSizes_[0] / wavefront_size; - workGroupInfo_.wavefrontSize_ = wavefront_size; - if (workGroupInfo_.compileSize_[0] != 0) { - workGroupInfo_.size_ = - workGroupInfo_.compileSize_[0] * - workGroupInfo_.compileSize_[1] * - workGroupInfo_.compileSize_[2]; - } - else { - workGroupInfo_.size_ = program_->dev().info().maxWorkGroupSize_; - } - - // Pull out printf metadata from the ELF - size_t 
sizeOfPrintfList; - errorCode = g_complibApi._aclQueryInfo(compileHandle, program_->binaryElf(), RT_GPU_PRINTF_ARRAY, - openClKernelName.c_str(), nullptr, &sizeOfPrintfList); - if (errorCode != ACL_SUCCESS){ + const KernelMD* kernelMD = FindKernelMetadata(programMD, name()); + if (kernelMD == nullptr) { return false; - } + } + initArguments_LC(*kernelMD); - // Make sure kernel has any printf info + // Set the workgroup information for the kernel + memset(&workGroupInfo_, 0, sizeof(workGroupInfo_)); + workGroupInfo_.availableLDSSize_ = program_->dev().info().localMemSizePerCU_; + assert(workGroupInfo_.availableLDSSize_ > 0); + workGroupInfo_.availableSGPRs_ = 0; + workGroupInfo_.availableVGPRs_ = 0; + + if (!kernelMD->mAttrs.mReqdWorkGroupSize.empty()) { + const auto& requiredWorkgroupSize = kernelMD->mAttrs.mReqdWorkGroupSize; + workGroupInfo_.compileSize_[0] = requiredWorkgroupSize[0]; + workGroupInfo_.compileSize_[1] = requiredWorkgroupSize[1]; + workGroupInfo_.compileSize_[2] = requiredWorkgroupSize[2]; + } + + if (!kernelMD->mAttrs.mWorkGroupSizeHint.empty()) { + const auto& workgroupSizeHint = kernelMD->mAttrs.mWorkGroupSizeHint; + workGroupInfo_.compileSizeHint_[0] = workgroupSizeHint[0]; + workGroupInfo_.compileSizeHint_[1] = workgroupSizeHint[1]; + workGroupInfo_.compileSizeHint_[2] = workgroupSizeHint[2]; + } + + if (!kernelMD->mAttrs.mVecTypeHint.empty()) { + workGroupInfo_.compileVecTypeHint_ = kernelMD->mAttrs.mVecTypeHint.c_str(); + } + + uint32_t wavefront_size = 0; + if (hsa_agent_get_info(program_->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size) != + HSA_STATUS_SUCCESS) { + return false; + } + assert(wavefront_size > 0); + + workGroupInfo_.privateMemSize_ = workitemPrivateSegmentByteSize_; + workGroupInfo_.localMemSize_ = workgroupGroupSegmentByteSize_; + workGroupInfo_.usedLDSSize_ = workgroupGroupSegmentByteSize_; + + workGroupInfo_.preferredSizeMultiple_ = wavefront_size; + + /// TODO: Are there any other fields that are getting 
queried from akc? + /// If so, code properties metadata should be used instead. + workGroupInfo_.usedSGPRs_ = kernelMD->mCodeProps.mWavefrontNumSGPRs; + workGroupInfo_.usedVGPRs_ = kernelMD->mCodeProps.mWorkitemNumVGPRs; + + workGroupInfo_.usedStackSize_ = 0; + + workGroupInfo_.wavefrontPerSIMD_ = program_->dev().info().maxWorkItemSizes_[0] / wavefront_size; + + workGroupInfo_.wavefrontSize_ = wavefront_size; + + if (workGroupInfo_.compileSize_[0] != 0) { + workGroupInfo_.size_ = workGroupInfo_.compileSize_[0] * workGroupInfo_.compileSize_[1] * + workGroupInfo_.compileSize_[2]; + } else { + workGroupInfo_.size_ = program_->dev().info().maxWorkGroupSize_; + } + + initPrintf_LC(programMD->mPrintf); + + return true; +} +#endif // defined(WITH_LIGHTNING_COMPILER) + +bool Kernel::init() { +#if defined(WITH_LIGHTNING_COMPILER) + return init_LC(); +#else // !defined(WITH_LIGHTNING_COMPILER) + acl_error errorCode; + // compile kernel down to ISA + hsa_agent_t hsaDevice = program_->hsaDevice(); + // Pull out metadata from the ELF + size_t sizeOfArgList; + aclCompiler* compileHandle = program_->dev().compiler(); + std::string openClKernelName("&__OpenCL_" + name() + "_kernel"); + errorCode = g_complibApi._aclQueryInfo(compileHandle, program_->binaryElf(), RT_ARGUMENT_ARRAY, + openClKernelName.c_str(), nullptr, &sizeOfArgList); + if (errorCode != ACL_SUCCESS) { + return false; + } + std::unique_ptr argList(new char[sizeOfArgList]); + errorCode = g_complibApi._aclQueryInfo(compileHandle, program_->binaryElf(), RT_ARGUMENT_ARRAY, + openClKernelName.c_str(), argList.get(), &sizeOfArgList); + if (errorCode != ACL_SUCCESS) { + return false; + } + + // Set the argList + initArguments((const aclArgData*)argList.get()); + + // Set the workgroup information for the kernel + memset(&workGroupInfo_, 0, sizeof(workGroupInfo_)); + workGroupInfo_.availableLDSSize_ = program_->dev().info().localMemSizePerCU_; + assert(workGroupInfo_.availableLDSSize_ > 0); + workGroupInfo_.availableSGPRs_ = 
0; + workGroupInfo_.availableVGPRs_ = 0; + size_t sizeOfWorkGroupSize; + errorCode = g_complibApi._aclQueryInfo(compileHandle, program_->binaryElf(), RT_WORK_GROUP_SIZE, + openClKernelName.c_str(), nullptr, &sizeOfWorkGroupSize); + if (errorCode != ACL_SUCCESS) { + return false; + } + errorCode = g_complibApi._aclQueryInfo(compileHandle, program_->binaryElf(), RT_WORK_GROUP_SIZE, + openClKernelName.c_str(), workGroupInfo_.compileSize_, + &sizeOfWorkGroupSize); + if (errorCode != ACL_SUCCESS) { + return false; + } + + uint32_t wavefront_size = 0; + if (HSA_STATUS_SUCCESS != + hsa_agent_get_info(program_->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size)) { + return false; + } + assert(wavefront_size > 0); + + // Setting it the same as used LDS. + workGroupInfo_.localMemSize_ = workgroupGroupSegmentByteSize_; + workGroupInfo_.privateMemSize_ = workitemPrivateSegmentByteSize_; + workGroupInfo_.usedLDSSize_ = workgroupGroupSegmentByteSize_; + workGroupInfo_.preferredSizeMultiple_ = wavefront_size; + + // Query kernel header object to initialize the number of + // SGPR's and VGPR's used by the kernel + const void* kernelHostPtr = nullptr; + if (Device::loaderQueryHostAddress(reinterpret_cast(kernelCodeHandle_), + &kernelHostPtr) == HSA_STATUS_SUCCESS) { + auto akc = reinterpret_cast(kernelHostPtr); + workGroupInfo_.usedSGPRs_ = akc->wavefront_sgpr_count; + workGroupInfo_.usedVGPRs_ = akc->workitem_vgpr_count; + } else { + workGroupInfo_.usedSGPRs_ = 0; + workGroupInfo_.usedVGPRs_ = 0; + } + + workGroupInfo_.usedStackSize_ = 0; + workGroupInfo_.wavefrontPerSIMD_ = program_->dev().info().maxWorkItemSizes_[0] / wavefront_size; + workGroupInfo_.wavefrontSize_ = wavefront_size; + if (workGroupInfo_.compileSize_[0] != 0) { + workGroupInfo_.size_ = workGroupInfo_.compileSize_[0] * workGroupInfo_.compileSize_[1] * + workGroupInfo_.compileSize_[2]; + } else { + workGroupInfo_.size_ = program_->dev().info().maxWorkGroupSize_; + } + + // Pull out printf metadata from 
the ELF + size_t sizeOfPrintfList; + errorCode = g_complibApi._aclQueryInfo(compileHandle, program_->binaryElf(), RT_GPU_PRINTF_ARRAY, + openClKernelName.c_str(), nullptr, &sizeOfPrintfList); + if (errorCode != ACL_SUCCESS) { + return false; + } + + // Make sure kernel has any printf info if (0 != sizeOfPrintfList) { std::unique_ptr aclPrintfList(new char[sizeOfPrintfList]); if (!aclPrintfList) { return false; } - errorCode = g_complibApi._aclQueryInfo( - compileHandle, program_->binaryElf(), RT_GPU_PRINTF_ARRAY, - openClKernelName.c_str(), aclPrintfList.get(), &sizeOfPrintfList); + errorCode = g_complibApi._aclQueryInfo(compileHandle, program_->binaryElf(), + RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(), + aclPrintfList.get(), &sizeOfPrintfList); if (errorCode != ACL_SUCCESS) { return false; } @@ -924,177 +837,172 @@ bool Kernel::init() initPrintf(reinterpret_cast(aclPrintfList.get())); } return true; -#endif // !defined(WITH_LIGHTNING_COMPILER) +#endif // !defined(WITH_LIGHTNING_COMPILER) } #if defined(WITH_LIGHTNING_COMPILER) -void -Kernel::initPrintf_LC(const std::vector& printfInfoStrings) -{ - for (auto str : printfInfoStrings) { - std::vector tokens; +void Kernel::initPrintf_LC(const std::vector& printfInfoStrings) { + for (auto str : printfInfoStrings) { + std::vector tokens; - size_t end, pos = 0; - do { - end = str.find_first_of(':', pos); - tokens.push_back(str.substr(pos, end-pos)); - pos = end + 1; - } while (end != std::string::npos); + size_t end, pos = 0; + do { + end = str.find_first_of(':', pos); + tokens.push_back(str.substr(pos, end - pos)); + pos = end + 1; + } while (end != std::string::npos); - if (tokens.size() < 2) { - LogPrintfWarning("Invalid PrintInfo string: \"%s\"", str.c_str()); - continue; - } - - pos = 0; - size_t printfInfoID = std::stoi(tokens[pos++]); - if (printf_.size() <= printfInfoID) { - printf_.resize(printfInfoID + 1); - } - PrintfInfo& info = printf_[printfInfoID]; - - size_t numSizes = std::stoi(tokens[pos++]); - end = 
pos + numSizes; - - // ensure that we have the correct number of tokens - if (tokens.size() < end + 1/*last token is the fmtString*/) { - LogPrintfWarning("Invalid PrintInfo string: \"%s\"", str.c_str()); - continue; - } - - // push the argument sizes - while (pos < end) { - info.arguments_.push_back(std::stoi(tokens[pos++])); - } - - // FIXME: We should not need this! [ - std::string& fmt = tokens[pos]; - bool need_nl = true; - - for (pos = 0; pos < fmt.size(); ++pos) { - char symbol = fmt[pos]; - need_nl = true; - if (symbol == '\\') { - switch (fmt[pos+1]) { - case 'a': - pos++; - symbol = '\a'; - break; - case 'b': - pos++; - symbol = '\b'; - break; - case 'f': - pos++; - symbol = '\f'; - break; - case 'n': - pos++; - symbol = '\n'; - need_nl = false; - break; - case 'r': - pos++; - symbol = '\r'; - break; - case 'v': - pos++; - symbol = '\v'; - break; - case '7': - if (fmt[pos+2] == '2') { - pos += 2; - symbol = '\72'; - } - break; - default: - break; - } - } - info.fmtString_.push_back(symbol); - } - if (need_nl) { - info.fmtString_ += "\n"; - } - // ] + if (tokens.size() < 2) { + LogPrintfWarning("Invalid PrintInfo string: \"%s\"", str.c_str()); + continue; } + + pos = 0; + size_t printfInfoID = std::stoi(tokens[pos++]); + if (printf_.size() <= printfInfoID) { + printf_.resize(printfInfoID + 1); + } + PrintfInfo& info = printf_[printfInfoID]; + + size_t numSizes = std::stoi(tokens[pos++]); + end = pos + numSizes; + + // ensure that we have the correct number of tokens + if (tokens.size() < end + 1 /*last token is the fmtString*/) { + LogPrintfWarning("Invalid PrintInfo string: \"%s\"", str.c_str()); + continue; + } + + // push the argument sizes + while (pos < end) { + info.arguments_.push_back(std::stoi(tokens[pos++])); + } + + // FIXME: We should not need this! 
[ + std::string& fmt = tokens[pos]; + bool need_nl = true; + + for (pos = 0; pos < fmt.size(); ++pos) { + char symbol = fmt[pos]; + need_nl = true; + if (symbol == '\\') { + switch (fmt[pos + 1]) { + case 'a': + pos++; + symbol = '\a'; + break; + case 'b': + pos++; + symbol = '\b'; + break; + case 'f': + pos++; + symbol = '\f'; + break; + case 'n': + pos++; + symbol = '\n'; + need_nl = false; + break; + case 'r': + pos++; + symbol = '\r'; + break; + case 'v': + pos++; + symbol = '\v'; + break; + case '7': + if (fmt[pos + 2] == '2') { + pos += 2; + symbol = '\72'; + } + break; + default: + break; + } + } + info.fmtString_.push_back(symbol); + } + if (need_nl) { + info.fmtString_ += "\n"; + } + // ] + } } -#endif // defined(WITH_LIGHTNING_COMPILER) +#endif // defined(WITH_LIGHTNING_COMPILER) -void -Kernel::initPrintf(const aclPrintfFmt* aclPrintf) -{ - PrintfInfo info; - uint index = 0; - for (; aclPrintf->struct_size != 0; aclPrintf++) { - index = aclPrintf->ID; - if (printf_.size() <= index) { - printf_.resize(index + 1); - } - std::string pfmt = aclPrintf->fmtStr; - bool need_nl = true; - for (size_t pos = 0; pos < pfmt.size(); ++pos) { - char symbol = pfmt[pos]; - need_nl = true; - if (symbol == '\\') { - switch (pfmt[pos+1]) { - case 'a': - pos++; - symbol = '\a'; - break; - case 'b': - pos++; - symbol = '\b'; - break; - case 'f': - pos++; - symbol = '\f'; - break; - case 'n': - pos++; - symbol = '\n'; - need_nl = false; - break; - case 'r': - pos++; - symbol = '\r'; - break; - case 'v': - pos++; - symbol = '\v'; - break; - case '7': - if (pfmt[pos+2] == '2') { - pos += 2; - symbol = '\72'; - } - break; - default: - break; - } - } - info.fmtString_.push_back(symbol); - } - if (need_nl) { - info.fmtString_ += "\n"; - } - uint32_t* tmp_ptr = const_cast(aclPrintf->argSizes); - for (uint i = 0; i < aclPrintf->numSizes; i++, tmp_ptr++) { - info.arguments_.push_back(*tmp_ptr); - } - printf_[index] = info; - info.arguments_.clear(); +void Kernel::initPrintf(const 
aclPrintfFmt* aclPrintf) { + PrintfInfo info; + uint index = 0; + for (; aclPrintf->struct_size != 0; aclPrintf++) { + index = aclPrintf->ID; + if (printf_.size() <= index) { + printf_.resize(index + 1); } + std::string pfmt = aclPrintf->fmtStr; + bool need_nl = true; + for (size_t pos = 0; pos < pfmt.size(); ++pos) { + char symbol = pfmt[pos]; + need_nl = true; + if (symbol == '\\') { + switch (pfmt[pos + 1]) { + case 'a': + pos++; + symbol = '\a'; + break; + case 'b': + pos++; + symbol = '\b'; + break; + case 'f': + pos++; + symbol = '\f'; + break; + case 'n': + pos++; + symbol = '\n'; + need_nl = false; + break; + case 'r': + pos++; + symbol = '\r'; + break; + case 'v': + pos++; + symbol = '\v'; + break; + case '7': + if (pfmt[pos + 2] == '2') { + pos += 2; + symbol = '\72'; + } + break; + default: + break; + } + } + info.fmtString_.push_back(symbol); + } + if (need_nl) { + info.fmtString_ += "\n"; + } + uint32_t* tmp_ptr = const_cast(aclPrintf->argSizes); + for (uint i = 0; i < aclPrintf->numSizes; i++, tmp_ptr++) { + info.arguments_.push_back(*tmp_ptr); + } + printf_[index] = info; + info.arguments_.clear(); + } } -Kernel::~Kernel() -{ - while (!hsailArgList_.empty()) { - Argument* kernelArgPointer = hsailArgList_.back(); - delete kernelArgPointer; - hsailArgList_.pop_back(); - } +Kernel::~Kernel() { + while (!hsailArgList_.empty()) { + Argument* kernelArgPointer = hsailArgList_.back(); + delete kernelArgPointer; + hsailArgList_.pop_back(); + } } } // namespace roc diff --git a/rocclr/runtime/device/rocm/rockernel.hpp b/rocclr/runtime/device/rocm/rockernel.hpp index cef92431c7..954324a6d8 100644 --- a/rocclr/runtime/device/rocm/rockernel.hpp +++ b/rocclr/runtime/device/rocm/rockernel.hpp @@ -15,180 +15,156 @@ namespace roc { #define MAX_INFO_STRING_LEN 0x40 -enum ROC_ARG_TYPE -{ - ROC_ARGTYPE_ERROR = 0, - ROC_ARGTYPE_POINTER, - ROC_ARGTYPE_VALUE, - ROC_ARGTYPE_REFERENCE, - ROC_ARGTYPE_IMAGE, - ROC_ARGTYPE_SAMPLER, - ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X, - 
ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y, - ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z, - ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER, - ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE, - ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION, - ROC_ARGTYPE_HIDDEN_NONE, - ROC_ARGMAX_ARG_TYPES +enum ROC_ARG_TYPE { + ROC_ARGTYPE_ERROR = 0, + ROC_ARGTYPE_POINTER, + ROC_ARGTYPE_VALUE, + ROC_ARGTYPE_REFERENCE, + ROC_ARGTYPE_IMAGE, + ROC_ARGTYPE_SAMPLER, + ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X, + ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y, + ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z, + ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER, + ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE, + ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION, + ROC_ARGTYPE_HIDDEN_NONE, + ROC_ARGMAX_ARG_TYPES }; -enum ROC_ADDRESS_QUALIFIER -{ - ROC_ADDRESS_ERROR = 0, - ROC_ADDRESS_GLOBAL, - ROC_ADDRESS_CONSTANT, - ROC_ADDRESS_LOCAL, - ROC_MAX_ADDRESS_QUALIFIERS +enum ROC_ADDRESS_QUALIFIER { + ROC_ADDRESS_ERROR = 0, + ROC_ADDRESS_GLOBAL, + ROC_ADDRESS_CONSTANT, + ROC_ADDRESS_LOCAL, + ROC_MAX_ADDRESS_QUALIFIERS }; -enum ROC_DATA_TYPE -{ - ROC_DATATYPE_ERROR = 0, - ROC_DATATYPE_B1, - ROC_DATATYPE_B8, - ROC_DATATYPE_B16, - ROC_DATATYPE_B32, - ROC_DATATYPE_B64, - ROC_DATATYPE_S8, - ROC_DATATYPE_S16, - ROC_DATATYPE_S32, - ROC_DATATYPE_S64, - ROC_DATATYPE_U8, - ROC_DATATYPE_U16, - ROC_DATATYPE_U32, - ROC_DATATYPE_U64, - ROC_DATATYPE_F16, - ROC_DATATYPE_F32, - ROC_DATATYPE_F64, - ROC_DATATYPE_STRUCT, - ROC_DATATYPE_OPAQUE, - ROC_DATATYPE_MAX_TYPES +enum ROC_DATA_TYPE { + ROC_DATATYPE_ERROR = 0, + ROC_DATATYPE_B1, + ROC_DATATYPE_B8, + ROC_DATATYPE_B16, + ROC_DATATYPE_B32, + ROC_DATATYPE_B64, + ROC_DATATYPE_S8, + ROC_DATATYPE_S16, + ROC_DATATYPE_S32, + ROC_DATATYPE_S64, + ROC_DATATYPE_U8, + ROC_DATATYPE_U16, + ROC_DATATYPE_U32, + ROC_DATATYPE_U64, + ROC_DATATYPE_F16, + ROC_DATATYPE_F32, + ROC_DATATYPE_F64, + ROC_DATATYPE_STRUCT, + ROC_DATATYPE_OPAQUE, + ROC_DATATYPE_MAX_TYPES }; -enum ROC_ACCESS_TYPE -{ - ROC_ACCESS_TYPE_NONE = 0, - ROC_ACCESS_TYPE_RO, - ROC_ACCESS_TYPE_WO, - ROC_ACCESS_TYPE_RW +enum ROC_ACCESS_TYPE { + 
ROC_ACCESS_TYPE_NONE = 0, + ROC_ACCESS_TYPE_RO, + ROC_ACCESS_TYPE_WO, + ROC_ACCESS_TYPE_RW }; -class Kernel : public device::Kernel -{ -public: - struct Argument - { - uint index_; //!< Argument's index in the OCL signature - std::string name_; //!< Argument's name - std::string typeName_; //!< Argument's type name - uint size_; //!< Size in bytes - uint alignment_; //!< Argument's alignment - uint pointeeAlignment_; //!< Alignment of the data pointed to - ROC_ARG_TYPE type_; //!< Type of the argument - ROC_ADDRESS_QUALIFIER addrQual_; //!< Address qualifier of the argument - ROC_DATA_TYPE dataType_; //!< The type of data - ROC_ACCESS_TYPE access_; //!< Access type for the argument +class Kernel : public device::Kernel { + public: + struct Argument { + uint index_; //!< Argument's index in the OCL signature + std::string name_; //!< Argument's name + std::string typeName_; //!< Argument's type name + uint size_; //!< Size in bytes + uint alignment_; //!< Argument's alignment + uint pointeeAlignment_; //!< Alignment of the data pointed to + ROC_ARG_TYPE type_; //!< Type of the argument + ROC_ADDRESS_QUALIFIER addrQual_; //!< Address qualifier of the argument + ROC_DATA_TYPE dataType_; //!< The type of data + ROC_ACCESS_TYPE access_; //!< Access type for the argument + }; + + Kernel(std::string name, HSAILProgram* prog, const uint64_t& kernelCodeHandle, + const uint32_t workgroupGroupSegmentByteSize, + const uint32_t workitemPrivateSegmentByteSize, const uint32_t kernargSegmentByteSize, + const uint32_t kernargSegmentAlignment); + + const uint64_t& KernelCodeHandle() { return kernelCodeHandle_; } + + const uint32_t WorkgroupGroupSegmentByteSize() const { return workgroupGroupSegmentByteSize_; } + + const uint32_t workitemPrivateSegmentByteSize() const { return workitemPrivateSegmentByteSize_; } + + const uint64_t KernargSegmentByteSize() const { return kernargSegmentByteSize_; } + + const uint8_t KernargSegmentAlignment() const { return kernargSegmentAlignment_; } + 
+ ~Kernel(); + + //! Initializes the metadata required for this kernel + bool init(); +#if defined(WITH_LIGHTNING_COMPILER) + //! Initializes the metadata required for this kernel + bool init_LC(); +#endif // defined(WITH_LIGHTNING_COMPILER) + + const HSAILProgram* program() const { return static_cast(program_); } + + //! Returns the kernel argument list + const std::vector& hsailArgs() const { return hsailArgList_; } + + //! Returns a pointer to the hsail argument at the specified index + Argument* hsailArgAt(size_t index) const { + for (auto arg : hsailArgList_) + if (arg->index_ == index) return arg; + assert(!"Should not reach here"); + return nullptr; + } + + //! Return printf info array + const std::vector& printfInfo() const { return printf_; } + + //! Return TRUE if kernel is internal blit kernel + bool isInternalKernel() const { return (flags_.internalKernel_) ? true : false; } + + //! set internal kernel flag + void setInternalKernelFlag(bool flag) { flags_.internalKernel_ = flag; } + + private: + union Flags { + struct { + uint internalKernel_ : 1; //!< Is a blit kernel? }; + uint value_; + Flags() : value_(0) {} + } flags_; - Kernel(std::string name, - HSAILProgram* prog, - const uint64_t &kernelCodeHandle, - const uint32_t workgroupGroupSegmentByteSize, - const uint32_t workitemPrivateSegmentByteSize, - const uint32_t kernargSegmentByteSize, - const uint32_t kernargSegmentAlignment); - - const uint64_t& KernelCodeHandle() { - return kernelCodeHandle_; - } - - const uint32_t WorkgroupGroupSegmentByteSize() const { - return workgroupGroupSegmentByteSize_; - } - - const uint32_t workitemPrivateSegmentByteSize() const { - return workitemPrivateSegmentByteSize_; - } - - const uint64_t KernargSegmentByteSize() const { - return kernargSegmentByteSize_; - } - - const uint8_t KernargSegmentAlignment() const { - return kernargSegmentAlignment_; - } - - ~Kernel(); - - //! Initializes the metadata required for this kernel - bool init(); + //! 
Populates hsailArgList_ + void initArguments(const aclArgData* aclArg); #if defined(WITH_LIGHTNING_COMPILER) - //! Initializes the metadata required for this kernel - bool init_LC(); -#endif // defined(WITH_LIGHTNING_COMPILER) + //! Initializes Hsail Argument metadata and info for LC + void initArguments_LC(const KernelMD& kernelMD); +#endif // defined(WITH_LIGHTNING_COMPILER) - const HSAILProgram* program() const { - return static_cast(program_); - } - - //! Returns the kernel argument list - const std::vector& hsailArgs() const { - return hsailArgList_; - } - - //! Returns a pointer to the hsail argument at the specified index - Argument* hsailArgAt(size_t index) const { - for (auto arg : hsailArgList_) if (arg->index_ == index) return arg; - assert(!"Should not reach here"); - return nullptr; - } - - //! Return printf info array - const std::vector& printfInfo() const {return printf_;} - - //! Return TRUE if kernel is internal blit kernel - bool isInternalKernel() const { return (flags_.internalKernel_) ? true : false; } - - //! set internal kernel flag - void setInternalKernelFlag(bool flag) { flags_.internalKernel_ = flag; } - -private: - union Flags { - struct { - uint internalKernel_: 1; //!< Is a blit kernel? - }; - uint value_; - Flags(): value_(0) {} - } flags_; - - //! Populates hsailArgList_ - void initArguments(const aclArgData* aclArg); + //! Initializes HSAIL Printf metadata and info + void initPrintf(const aclPrintfFmt* aclPrintf); #if defined(WITH_LIGHTNING_COMPILER) - //! Initializes Hsail Argument metadata and info for LC - void initArguments_LC(const KernelMD& kernelMD); -#endif // defined(WITH_LIGHTNING_COMPILER) + //! Initializes HSAIL Printf metadata and info for LC + void initPrintf_LC(const std::vector& printfInfoStrings); +#endif // defined(WITH_LIGHTNING_COMPILER) - //! Initializes HSAIL Printf metadata and info - void initPrintf(const aclPrintfFmt* aclPrintf); -#if defined(WITH_LIGHTNING_COMPILER) - //! 
Initializes HSAIL Printf metadata and info for LC - void initPrintf_LC(const std::vector& printfInfoStrings); -#endif // defined(WITH_LIGHTNING_COMPILER) - - HSAILProgram *program_; //!< The roc::HSAILProgram context - std::vector hsailArgList_; //!< Vector list of HSAIL Arguments - uint64_t kernelCodeHandle_; //!< Kernel code handle (aka amd_kernel_code_t) - const uint32_t workgroupGroupSegmentByteSize_; - const uint32_t workitemPrivateSegmentByteSize_; - const uint32_t kernargSegmentByteSize_; - const uint32_t kernargSegmentAlignment_; - size_t kernelDirectiveOffset_; - std::vector printf_; + HSAILProgram* program_; //!< The roc::HSAILProgram context + std::vector hsailArgList_; //!< Vector list of HSAIL Arguments + uint64_t kernelCodeHandle_; //!< Kernel code handle (aka amd_kernel_code_t) + const uint32_t workgroupGroupSegmentByteSize_; + const uint32_t workitemPrivateSegmentByteSize_; + const uint32_t kernargSegmentByteSize_; + const uint32_t kernargSegmentAlignment_; + size_t kernelDirectiveOffset_; + std::vector printf_; }; -} // namespace roc - -#endif // WITHOUT_HSA_BACKEND - +} // namespace roc +#endif // WITHOUT_HSA_BACKEND diff --git a/rocclr/runtime/device/rocm/rocmemory.cpp b/rocclr/runtime/device/rocm/rocmemory.cpp index 55627d4c28..78dce676a6 100644 --- a/rocclr/runtime/device/rocm/rocmemory.cpp +++ b/rocclr/runtime/device/rocm/rocmemory.cpp @@ -24,180 +24,150 @@ namespace roc { /////////////////////////////////roc::Memory////////////////////////////// -Memory::Memory(const roc::Device &dev, amd::Memory &owner) - : device::Memory(owner) - , dev_(dev) - , deviceMemory_(nullptr) - , kind_(MEMORY_KIND_NORMAL) - , pinnedMemory_(nullptr) -{ +Memory::Memory(const roc::Device& dev, amd::Memory& owner) + : device::Memory(owner), + dev_(dev), + deviceMemory_(nullptr), + kind_(MEMORY_KIND_NORMAL), + pinnedMemory_(nullptr) {} + +Memory::Memory(const roc::Device& dev, size_t size) + : device::Memory(size), + dev_(dev), + deviceMemory_(nullptr), + 
kind_(MEMORY_KIND_NORMAL), + pinnedMemory_(nullptr) {} + +Memory::~Memory() { + // Destory pinned memory + if (flags_ & PinnedMemoryAlloced) { + pinnedMemory_->release(); + } + + dev().removeVACache(this); + if (nullptr != mapMemory_) { + mapMemory_->release(); + } } -Memory::Memory(const roc::Device &dev, size_t size) - : device::Memory(size) - , dev_(dev) - , deviceMemory_(nullptr) - , kind_(MEMORY_KIND_NORMAL) - , pinnedMemory_(nullptr) -{ +bool Memory::allocateMapMemory(size_t allocationSize) { + assert(mapMemory_ == nullptr); + + void* mapData = nullptr; + + amd::Memory* mapMemory = dev().findMapTarget(owner()->getSize()); + if (mapMemory == nullptr) { + // Create buffer object to contain the map target. + mapMemory = new (dev().context()) + amd::Buffer(dev().context(), CL_MEM_ALLOC_HOST_PTR, owner()->getSize()); + + if ((mapMemory == nullptr) || (!mapMemory->create())) { + LogError("[OCL] Fail to allocate map target object"); + if (mapMemory) { + mapMemory->release(); + } + return false; + } + + roc::Memory* hsaMapMemory = reinterpret_cast(mapMemory->getDeviceMemory(dev_)); + if (hsaMapMemory == nullptr) { + mapMemory->release(); + return false; + } + } + + mapMemory_ = mapMemory; + + return true; } -Memory::~Memory() -{ - // Destory pinned memory - if (flags_ & PinnedMemoryAlloced) { - pinnedMemory_->release(); +void* Memory::allocMapTarget(const amd::Coord3D& origin, const amd::Coord3D& region, uint mapFlags, + size_t* rowPitch, size_t* slicePitch) { + // Map/Unmap must be serialized. + amd::ScopedLock lock(owner()->lockMemoryOps()); + + incIndMapCount(); + // If the device backing storage is direct accessible, use it. + if (isHostMemDirectAccess()) { + if (owner()->getHostMem() != nullptr) { + return (static_cast(owner()->getHostMem()) + origin[0]); } - dev().removeVACache(this); - if (nullptr != mapMemory_) { - mapMemory_->release(); + return (static_cast(deviceMemory_) + origin[0]); + } + + // Otherwise, check for host memory. 
+ void* hostMem = owner()->getHostMem(); + if (hostMem != nullptr) { + return (static_cast(hostMem) + origin[0]); + } + + // Allocate one if needed. + if (indirectMapCount_ == 1) { + if (!allocateMapMemory(owner()->getSize())) { + decIndMapCount(); + return nullptr; } + } else { + // Did the map resource allocation fail? + if (mapMemory_ == nullptr) { + LogError("Could not map target resource"); + return nullptr; + } + } + return reinterpret_cast
(mapMemory_->getHostMem()) + origin[0]; } -bool -Memory::allocateMapMemory(size_t allocationSize) -{ - assert(mapMemory_ == nullptr); +void Memory::decIndMapCount() { + // Map/Unmap must be serialized. + amd::ScopedLock lock(owner()->lockMemoryOps()); - void *mapData = nullptr; + if (indirectMapCount_ == 0) { + LogError("decIndMapCount() called when indirectMapCount_ already zero"); + return; + } - amd::Memory* mapMemory = dev().findMapTarget(owner()->getSize()); - if (mapMemory == nullptr) { - // Create buffer object to contain the map target. - mapMemory = new (dev().context()) amd::Buffer( - dev().context(), CL_MEM_ALLOC_HOST_PTR, owner()->getSize()); - - if ((mapMemory == nullptr) || (!mapMemory->create())) { - LogError("[OCL] Fail to allocate map target object"); - if (mapMemory) { - mapMemory->release(); - } - return false; - } - - roc::Memory* hsaMapMemory = reinterpret_cast( - mapMemory->getDeviceMemory(dev_)); - if (hsaMapMemory == nullptr) { - mapMemory->release(); - return false; - } + // Decrement the counter and release indirect map if it's the last op + if (--indirectMapCount_ == 0 && mapMemory_ != nullptr) { + if (!dev().addMapTarget(mapMemory_)) { + // Release the buffer object containing the map data. + mapMemory_->release(); } - - mapMemory_ = mapMemory; - - return true; + mapMemory_ = nullptr; + } } -void* -Memory::allocMapTarget( - const amd::Coord3D &origin, - const amd::Coord3D ®ion, - uint mapFlags, - size_t *rowPitch, - size_t *slicePitch) -{ - // Map/Unmap must be serialized. - amd::ScopedLock lock(owner()->lockMemoryOps()); +void* Memory::cpuMap(device::VirtualDevice& vDev, uint flags, uint startLayer, uint numLayers, + size_t* rowPitch, size_t* slicePitch) { + // Create the map target. + void* mapTarget = allocMapTarget(amd::Coord3D(0), amd::Coord3D(0), 0, rowPitch, slicePitch); - incIndMapCount(); - // If the device backing storage is direct accessible, use it. 
- if (isHostMemDirectAccess()) { - if (owner()->getHostMem() != nullptr) { - return (static_cast(owner()->getHostMem()) + origin[0]); - } + assert(mapTarget != nullptr); - return (static_cast(deviceMemory_) + origin[0]); + if (!isHostMemDirectAccess()) { + if (!vDev.blitMgr().readBuffer(*this, mapTarget, amd::Coord3D(0), amd::Coord3D(size()), true)) { + decIndMapCount(); + return nullptr; } + } - // Otherwise, check for host memory. - void *hostMem = owner()->getHostMem(); - if (hostMem != nullptr) { - return (static_cast(hostMem) + origin[0]); - } - - // Allocate one if needed. - if (indirectMapCount_ == 1) { - if (!allocateMapMemory(owner()->getSize())) { - decIndMapCount(); - return nullptr; - } - } - else { - // Did the map resource allocation fail? - if (mapMemory_ == nullptr) { - LogError("Could not map target resource"); - return nullptr; - } - } - return reinterpret_cast
(mapMemory_->getHostMem()) + origin[0]; + return mapTarget; } -void -Memory::decIndMapCount() -{ - // Map/Unmap must be serialized. - amd::ScopedLock lock(owner()->lockMemoryOps()); - - if (indirectMapCount_ == 0) { - LogError("decIndMapCount() called when indirectMapCount_ already zero"); - return; +void Memory::cpuUnmap(device::VirtualDevice& vDev) { + if (!isHostMemDirectAccess()) { + if (!vDev.blitMgr().writeBuffer(mapMemory_->getHostMem(), *this, amd::Coord3D(0), + amd::Coord3D(size()), true)) { + LogError("[OCL] Fail sync the device memory on cpuUnmap"); } + } - // Decrement the counter and release indirect map if it's the last op - if (--indirectMapCount_ == 0 && - mapMemory_ != nullptr) { - if (!dev().addMapTarget(mapMemory_)) { - // Release the buffer object containing the map data. - mapMemory_->release(); - } - mapMemory_ = nullptr; - } -} - -void * -Memory::cpuMap( - device::VirtualDevice& vDev, - uint flags, - uint startLayer, - uint numLayers, - size_t* rowPitch, - size_t* slicePitch) -{ - // Create the map target. 
- void * mapTarget = - allocMapTarget(amd::Coord3D(0), amd::Coord3D(0), 0, rowPitch, slicePitch); - - assert(mapTarget != nullptr); - - if (!isHostMemDirectAccess()) { - if (!vDev.blitMgr().readBuffer( - *this, mapTarget, amd::Coord3D(0), amd::Coord3D(size()), true)) { - decIndMapCount(); - return nullptr; - } - } - - return mapTarget; -} - -void -Memory::cpuUnmap(device::VirtualDevice& vDev) -{ - if (!isHostMemDirectAccess()) { - if (!vDev.blitMgr().writeBuffer( - mapMemory_->getHostMem(), *this, amd::Coord3D(0), - amd::Coord3D(size()), true)) { - LogError("[OCL] Fail sync the device memory on cpuUnmap"); - } - } - - decIndMapCount(); + decIndMapCount(); } // Setup an interop buffer (dmabuf handle) as an OpenCL buffer -bool Memory::createInteropBuffer(GLenum targetType, int miplevel, size_t* metadata_size, const hsa_amd_image_descriptor_t** metadata) -{ +bool Memory::createInteropBuffer(GLenum targetType, int miplevel, size_t* metadata_size, + const hsa_amd_image_descriptor_t** metadata) { #if defined(_WIN32) return false; #else @@ -206,610 +176,551 @@ bool Memory::createInteropBuffer(GLenum targetType, int miplevel, size_t* metada mesa_glinterop_export_in in; mesa_glinterop_export_out out; - in.size=sizeof(mesa_glinterop_export_in); - out.size=sizeof(mesa_glinterop_export_out); + in.size = sizeof(mesa_glinterop_export_in); + out.size = sizeof(mesa_glinterop_export_out); - if(owner()->getMemFlags() & CL_MEM_READ_ONLY) - in.access=MESA_GLINTEROP_ACCESS_READ_ONLY; - else if(owner()->getMemFlags() & CL_MEM_WRITE_ONLY) - in.access=MESA_GLINTEROP_ACCESS_WRITE_ONLY; + if (owner()->getMemFlags() & CL_MEM_READ_ONLY) + in.access = MESA_GLINTEROP_ACCESS_READ_ONLY; + else if (owner()->getMemFlags() & CL_MEM_WRITE_ONLY) + in.access = MESA_GLINTEROP_ACCESS_WRITE_ONLY; else - in.access=MESA_GLINTEROP_ACCESS_READ_WRITE; + in.access = MESA_GLINTEROP_ACCESS_READ_WRITE; in.target = targetType; - in.obj=owner()->getInteropObj()->asGLObject()->getGLName(); - in.miplevel=miplevel; - 
in.out_driver_data_size=0; - in.out_driver_data=nullptr; + in.obj = owner()->getInteropObj()->asGLObject()->getGLName(); + in.miplevel = miplevel; + in.out_driver_data_size = 0; + in.out_driver_data = nullptr; - if(!dev().mesa().Export(in, out)) - return false; + if (!dev().mesa().Export(in, out)) return false; size_t size; - hsa_agent_t agent=dev().getBackendDevice(); - hsa_status_t status=hsa_amd_interop_map_buffer(1, &agent, out.dmabuf_fd, 0, &size, &deviceMemory_, metadata_size, (const void**)metadata); + hsa_agent_t agent = dev().getBackendDevice(); + hsa_status_t status = hsa_amd_interop_map_buffer( + 1, &agent, out.dmabuf_fd, 0, &size, &deviceMemory_, metadata_size, (const void**)metadata); close(out.dmabuf_fd); - if(status!=HSA_STATUS_SUCCESS) - return false; + if (status != HSA_STATUS_SUCCESS) return false; - kind_=MEMORY_KIND_INTEROP; - assert(deviceMemory_!=nullptr && "Interop map failed to produce a pointer!"); + kind_ = MEMORY_KIND_INTEROP; + assert(deviceMemory_ != nullptr && "Interop map failed to produce a pointer!"); return true; #endif } -void Memory::destroyInteropBuffer() -{ - assert(kind_==MEMORY_KIND_INTEROP && "Memory must be interop type."); +void Memory::destroyInteropBuffer() { + assert(kind_ == MEMORY_KIND_INTEROP && "Memory must be interop type."); hsa_amd_interop_unmap_buffer(deviceMemory_); - deviceMemory_=nullptr; + deviceMemory_ = nullptr; } -bool -Memory::pinSystemMemory(void* hostPtr, size_t size) -{ - size_t pinAllocSize; - const static bool SysMem = true; - amd::Memory* amdMemory = nullptr; +bool Memory::pinSystemMemory(void* hostPtr, size_t size) { + size_t pinAllocSize; + const static bool SysMem = true; + amd::Memory* amdMemory = nullptr; + amd::Memory* amdParent = owner()->parent(); + + // If memory has a direct access already, then skip the host memory pinning + if (isHostMemDirectAccess()) { + return true; + } + + // Memory was pinned already + if (flags_ & PinnedMemoryAlloced) { + return true; + } + + // Check if runtime 
allocates a parent object + if (amdParent != nullptr) { + Memory* parent = dev().getRocMemory(amdParent); + amd::Memory* amdPinned = parent->pinnedMemory_; + if (amdPinned != nullptr) { + // Create view on the parent's pinned memory + amdMemory = new (amdPinned->getContext()) + amd::Buffer(*amdPinned, 0, owner()->getOrigin(), owner()->getSize()); + if ((amdMemory != nullptr) && !amdMemory->create()) { + amdMemory->release(); + amdMemory = nullptr; + } + } + } + + if (amdMemory == nullptr) { + amdMemory = new (dev().context()) amd::Buffer(dev().context(), CL_MEM_USE_HOST_PTR, size); + if ((amdMemory != nullptr) && !amdMemory->create(hostPtr, SysMem)) { + amdMemory->release(); + return false; + } + } + + // Get device memory for this virtual device + // @note: This will force real memory pinning + Memory* srcMemory = dev().getRocMemory(amdMemory); + + if (srcMemory == nullptr) { + // Release memory + amdMemory->release(); + return false; + } else { + pinnedMemory_ = amdMemory; + flags_ |= PinnedMemoryAlloced; + } + + return true; +} + +void Memory::syncCacheFromHost(VirtualGPU& gpu, device::Memory::SyncFlags syncFlags) { + // If the last writer was another GPU, then make a writeback + if (!isHostMemDirectAccess() && (owner()->getLastWriter() != nullptr) && + (&dev() != owner()->getLastWriter())) { + mgpuCacheWriteBack(); + } + + // If host memory doesn't have direct access, then we have to synchronize + if (!isHostMemDirectAccess() && (nullptr != owner()->getHostMem())) { + bool hasUpdates = true; amd::Memory* amdParent = owner()->parent(); - // If memory has a direct access already, then skip the host memory pinning - if (isHostMemDirectAccess()) { - return true; + // Make sure the parent of subbuffer is up to date + if (!syncFlags.skipParent_ && (amdParent != nullptr)) { + Memory* gpuMemory = dev().getRocMemory(amdParent); + + //! \note: Skipping the sync for a view doesn't reflect the parent settings, + //! 
since a view is a small portion of parent + device::Memory::SyncFlags syncFlagsTmp; + + // Sync parent from a view, so views have to be skipped + syncFlagsTmp.skipViews_ = true; + + // Make sure the parent sync is an unique operation. + // If the app uses multiple subbuffers from multiple queues, + // then the parent sync can be called from multiple threads + amd::ScopedLock lock(owner()->parent()->lockMemoryOps()); + gpuMemory->syncCacheFromHost(gpu, syncFlagsTmp); + //! \note Don't do early exit here, since we still have to sync + //! this view, if the parent sync operation was a NOP. + //! If parent was synchronized, then this view sync will be a NOP } - // Memory was pinned already + // Is this a NOP? + if ((version_ == owner()->getVersion()) || (&dev() == owner()->getLastWriter())) { + hasUpdates = false; + } + + // Update all available views, since we sync the parent + if ((owner()->subBuffers().size() != 0) && (hasUpdates || !syncFlags.skipViews_)) { + device::Memory::SyncFlags syncFlagsTmp; + + // Sync views from parent, so parent has to be skipped + syncFlagsTmp.skipParent_ = true; + + if (hasUpdates) { + // Parent will be synced so update all views with a skip + syncFlagsTmp.skipEntire_ = true; + } else { + // Passthrough the skip entire flag to the views, since + // any view is a submemory of the parent + syncFlagsTmp.skipEntire_ = syncFlags.skipEntire_; + } + + amd::ScopedLock lock(owner()->lockMemoryOps()); + for (auto& sub : owner()->subBuffers()) { + //! \note Don't allow subbuffer's allocation in the worker thread. + //! It may cause a system lock, because possible resource + //! 
destruction, heap reallocation or subbuffer allocation + static const bool AllocSubBuffer = false; + device::Memory* devSub = sub->getDeviceMemory(dev(), AllocSubBuffer); + if (nullptr != devSub) { + Memory* gpuSub = reinterpret_cast(devSub); + gpuSub->syncCacheFromHost(gpu, syncFlagsTmp); + } + } + } + + // Make sure we didn't have a NOP, + // because this GPU device was the last writer + if (&dev() != owner()->getLastWriter()) { + // Update the latest version + version_ = owner()->getVersion(); + } + + // Exit if sync is a NOP or sync can be skipped + if (!hasUpdates || syncFlags.skipEntire_) { + return; + } + + bool result = false; + static const bool Entire = true; + amd::Coord3D origin(0, 0, 0); + + // If host memory was pinned then make a transfer if (flags_ & PinnedMemoryAlloced) { - return true; + Memory& pinned = *dev().getRocMemory(pinnedMemory_); + if (owner()->getType() == CL_MEM_OBJECT_BUFFER) { + amd::Coord3D region(owner()->getSize()); + result = gpu.blitMgr().copyBuffer(pinned, *this, origin, origin, region, Entire); + } else { + amd::Image& image = static_cast(*owner()); + result = + gpu.blitMgr().copyBufferToImage(pinned, *this, origin, origin, image.getRegion(), + Entire, image.getRowPitch(), image.getSlicePitch()); + } } - // Check if runtime allocates a parent object - if (amdParent != nullptr) { - Memory* parent = dev().getRocMemory(amdParent); - amd::Memory* amdPinned = parent->pinnedMemory_; - if (amdPinned != nullptr) { - // Create view on the parent's pinned memory - amdMemory = new (amdPinned->getContext()) amd::Buffer( - *amdPinned, 0, owner()->getOrigin(), owner()->getSize()); - if ((amdMemory != nullptr) && !amdMemory->create()) { - amdMemory->release(); - amdMemory = nullptr; - } - } + if (!result) { + if (owner()->getType() == CL_MEM_OBJECT_BUFFER) { + amd::Coord3D region(owner()->getSize()); + result = gpu.blitMgr().writeBuffer(owner()->getHostMem(), *this, origin, region, Entire); + } else { + amd::Image& image = 
static_cast(*owner()); + result = gpu.blitMgr().writeImage(owner()->getHostMem(), *this, origin, image.getRegion(), + image.getRowPitch(), image.getSlicePitch(), Entire); + } } - if (amdMemory == nullptr) { - amdMemory = new (dev().context()) - amd::Buffer(dev().context(), CL_MEM_USE_HOST_PTR, size); - if ((amdMemory != nullptr) && !amdMemory->create(hostPtr, SysMem)) { - amdMemory->release(); - return false; - } - } + //!@todo A wait isn't really necessary. However processMemObjects() + // may lose the track of dependencies with a compute transfer(if sdma failed). + wait(gpu); - // Get device memory for this virtual device - // @note: This will force real memory pinning - Memory* srcMemory = dev().getRocMemory(amdMemory); - - if (srcMemory == nullptr) { - // Release memory - amdMemory->release(); - return false; - } - else { - pinnedMemory_ = amdMemory; - flags_ |= PinnedMemoryAlloced; - } - - return true; + // Should never fail + assert(result && "Memory synchronization failed!"); + } } -void -Memory::syncCacheFromHost(VirtualGPU& gpu, device::Memory::SyncFlags syncFlags) -{ - // If the last writer was another GPU, then make a writeback - if (!isHostMemDirectAccess() && - (owner()->getLastWriter() != nullptr) && - (&dev() != owner()->getLastWriter())) { - mgpuCacheWriteBack(); +void Memory::syncHostFromCache(device::Memory::SyncFlags syncFlags) { + // Sanity checks + assert(owner() != nullptr); + + // If host memory doesn't have direct access, then we have to synchronize + if (!isHostMemDirectAccess()) { + bool hasUpdates = true; + amd::Memory* amdParent = owner()->parent(); + + // Make sure the parent of subbuffer is up to date + if (!syncFlags.skipParent_ && (amdParent != nullptr)) { + device::Memory* m = dev().getRocMemory(amdParent); + + //! \note: Skipping the sync for a view doesn't reflect the parent settings, + //! 
since a view is a small portion of parent + device::Memory::SyncFlags syncFlagsTmp; + + // Sync parent from a view, so views have to be skipped + syncFlagsTmp.skipViews_ = true; + + // Make sure the parent sync is an unique operation. + // If the app uses multiple subbuffers from multiple queues, + // then the parent sync can be called from multiple threads + amd::ScopedLock lock(owner()->parent()->lockMemoryOps()); + m->syncHostFromCache(syncFlagsTmp); + //! \note Don't do early exit here, since we still have to sync + //! this view, if the parent sync operation was a NOP. + //! If parent was synchronized, then this view sync will be a NOP } - // If host memory doesn't have direct access, then we have to synchronize - if (!isHostMemDirectAccess() && (nullptr != owner()->getHostMem())) { - bool hasUpdates = true; - amd::Memory* amdParent = owner()->parent(); - - // Make sure the parent of subbuffer is up to date - if (!syncFlags.skipParent_ && (amdParent != nullptr)) { - Memory* gpuMemory = dev().getRocMemory(amdParent); - - //! \note: Skipping the sync for a view doesn't reflect the parent settings, - //! since a view is a small portion of parent - device::Memory::SyncFlags syncFlagsTmp; - - // Sync parent from a view, so views have to be skipped - syncFlagsTmp.skipViews_ = true; - - // Make sure the parent sync is an unique operation. - // If the app uses multiple subbuffers from multiple queues, - // then the parent sync can be called from multiple threads - amd::ScopedLock lock(owner()->parent()->lockMemoryOps()); - gpuMemory->syncCacheFromHost(gpu, syncFlagsTmp); - //! \note Don't do early exit here, since we still have to sync - //! this view, if the parent sync operation was a NOP. - //! If parent was synchronized, then this view sync will be a NOP - } - - // Is this a NOP? 
- if ((version_ == owner()->getVersion()) || - (&dev() == owner()->getLastWriter())) { - hasUpdates = false; - } - - // Update all available views, since we sync the parent - if ((owner()->subBuffers().size() != 0) && - (hasUpdates || !syncFlags.skipViews_)) { - device::Memory::SyncFlags syncFlagsTmp; - - // Sync views from parent, so parent has to be skipped - syncFlagsTmp.skipParent_ = true; - - if (hasUpdates) { - // Parent will be synced so update all views with a skip - syncFlagsTmp.skipEntire_ = true; - } - else { - // Passthrough the skip entire flag to the views, since - // any view is a submemory of the parent - syncFlagsTmp.skipEntire_ = syncFlags.skipEntire_; - } - - amd::ScopedLock lock(owner()->lockMemoryOps()); - for (auto& sub : owner()->subBuffers()) { - //! \note Don't allow subbuffer's allocation in the worker thread. - //! It may cause a system lock, because possible resource - //! destruction, heap reallocation or subbuffer allocation - static const bool AllocSubBuffer = false; - device::Memory* devSub = - sub->getDeviceMemory(dev(), AllocSubBuffer); - if (nullptr != devSub) { - Memory* gpuSub = reinterpret_cast(devSub); - gpuSub->syncCacheFromHost(gpu, syncFlagsTmp); - } - } - } - - // Make sure we didn't have a NOP, - // because this GPU device was the last writer - if (&dev() != owner()->getLastWriter()) { - // Update the latest version - version_ = owner()->getVersion(); - } - - // Exit if sync is a NOP or sync can be skipped - if (!hasUpdates || syncFlags.skipEntire_) { - return; - } - - bool result = false; - static const bool Entire = true; - amd::Coord3D origin(0, 0, 0); - - // If host memory was pinned then make a transfer - if (flags_ & PinnedMemoryAlloced) { - Memory& pinned = *dev().getRocMemory(pinnedMemory_); - if (owner()->getType() == CL_MEM_OBJECT_BUFFER) { - amd::Coord3D region(owner()->getSize()); - result = gpu.blitMgr().copyBuffer(pinned, - *this, origin, origin, region, Entire); - } - else { - amd::Image& image = 
static_cast(*owner()); - result = gpu.blitMgr().copyBufferToImage(pinned, - *this, origin, origin, image.getRegion(), Entire, - image.getRowPitch(), image.getSlicePitch()); - } - } - - if (!result) { - if (owner()->getType() == CL_MEM_OBJECT_BUFFER) { - amd::Coord3D region(owner()->getSize()); - result = gpu.blitMgr().writeBuffer(owner()->getHostMem(), - *this, origin, region, Entire); - } - else { - amd::Image& image = static_cast(*owner()); - result = gpu.blitMgr().writeImage(owner()->getHostMem(), - *this, origin, image.getRegion(), - image.getRowPitch(), image.getSlicePitch(), Entire); - } - } - - //!@todo A wait isn't really necessary. However processMemObjects() - // may lose the track of dependencies with a compute transfer(if sdma failed). - wait(gpu); - - // Should never fail - assert(result && "Memory synchronization failed!"); + // Is this a NOP? + if ((nullptr == owner()->getLastWriter()) || (version_ == owner()->getVersion())) { + hasUpdates = false; } + + // Update all available views, since we sync the parent + if ((owner()->subBuffers().size() != 0) && (hasUpdates || !syncFlags.skipViews_)) { + device::Memory::SyncFlags syncFlagsTmp; + + // Sync views from parent, so parent has to be skipped + syncFlagsTmp.skipParent_ = true; + + if (hasUpdates) { + // Parent will be synced so update all views with a skip + syncFlagsTmp.skipEntire_ = true; + } else { + // Passthrough the skip entire flag to the views, since + // any view is a submemory of the parent + syncFlagsTmp.skipEntire_ = syncFlags.skipEntire_; + } + + amd::ScopedLock lock(owner()->lockMemoryOps()); + for (auto& sub : owner()->subBuffers()) { + //! \note Don't allow subbuffer's allocation in the worker thread. + //! It may cause a system lock, because possible resource + //! 
destruction, heap reallocation or subbuffer allocation + static const bool AllocSubBuffer = false; + device::Memory* devSub = sub->getDeviceMemory(dev(), AllocSubBuffer); + if (nullptr != devSub) { + Memory* gpuSub = reinterpret_cast(devSub); + gpuSub->syncHostFromCache(syncFlagsTmp); + } + } + } + + // Make sure we didn't have a NOP, + // because CPU was the last writer + if (nullptr != owner()->getLastWriter()) { + // Mark parent as up to date, set our version accordingly + version_ = owner()->getVersion(); + } + + // Exit if sync is a NOP or sync can be skipped + if (!hasUpdates || syncFlags.skipEntire_) { + return; + } + + bool result = false; + static const bool Entire = true; + amd::Coord3D origin(0, 0, 0); + + // If backing store was pinned then make a transfer + if (flags_ & PinnedMemoryAlloced) { + Memory& pinned = *dev().getRocMemory(pinnedMemory_); + if (owner()->getType() == CL_MEM_OBJECT_BUFFER) { + amd::Coord3D region(owner()->getSize()); + result = dev().xferMgr().copyBuffer(*this, pinned, origin, origin, region, Entire); + } else { + amd::Image& image = static_cast(*owner()); + result = + dev().xferMgr().copyImageToBuffer(*this, pinned, origin, origin, image.getRegion(), + Entire, image.getRowPitch(), image.getSlicePitch()); + } + } + + // Just do a basic host read + if (!result) { + if (owner()->getType() == CL_MEM_OBJECT_BUFFER) { + amd::Coord3D region(owner()->getSize()); + result = dev().xferMgr().readBuffer(*this, owner()->getHostMem(), origin, region, Entire); + } else { + amd::Image& image = static_cast(*owner()); + result = dev().xferMgr().readImage(*this, owner()->getHostMem(), origin, image.getRegion(), + image.getRowPitch(), image.getSlicePitch(), Entire); + } + } + + // Should never fail + assert(result && "Memory synchronization failed!"); + } } -void -Memory::syncHostFromCache(device::Memory::SyncFlags syncFlags) -{ - // Sanity checks - assert(owner() != nullptr); +void Memory::mgpuCacheWriteBack() { + // Lock memory object, so only 
one write back can occur + amd::ScopedLock lock(owner()->lockMemoryOps()); - // If host memory doesn't have direct access, then we have to synchronize - if (!isHostMemDirectAccess()) { - bool hasUpdates = true; - amd::Memory* amdParent = owner()->parent(); - - // Make sure the parent of subbuffer is up to date - if (!syncFlags.skipParent_ && (amdParent != nullptr)) { - device::Memory* m = dev().getRocMemory(amdParent); - - //! \note: Skipping the sync for a view doesn't reflect the parent settings, - //! since a view is a small portion of parent - device::Memory::SyncFlags syncFlagsTmp; - - // Sync parent from a view, so views have to be skipped - syncFlagsTmp.skipViews_ = true; - - // Make sure the parent sync is an unique operation. - // If the app uses multiple subbuffers from multiple queues, - // then the parent sync can be called from multiple threads - amd::ScopedLock lock(owner()->parent()->lockMemoryOps()); - m->syncHostFromCache(syncFlagsTmp); - //! \note Don't do early exit here, since we still have to sync - //! this view, if the parent sync operation was a NOP. - //! If parent was synchronized, then this view sync will be a NOP - } - - // Is this a NOP? - if ((nullptr == owner()->getLastWriter()) || - (version_ == owner()->getVersion())) { - hasUpdates = false; - } - - // Update all available views, since we sync the parent - if ((owner()->subBuffers().size() != 0) && - (hasUpdates || !syncFlags.skipViews_)) { - device::Memory::SyncFlags syncFlagsTmp; - - // Sync views from parent, so parent has to be skipped - syncFlagsTmp.skipParent_ = true; - - if (hasUpdates) { - // Parent will be synced so update all views with a skip - syncFlagsTmp.skipEntire_ = true; - } - else { - // Passthrough the skip entire flag to the views, since - // any view is a submemory of the parent - syncFlagsTmp.skipEntire_ = syncFlags.skipEntire_; - } - - amd::ScopedLock lock(owner()->lockMemoryOps()); - for (auto& sub : owner()->subBuffers()) { - //! 
\note Don't allow subbuffer's allocation in the worker thread. - //! It may cause a system lock, because possible resource - //! destruction, heap reallocation or subbuffer allocation - static const bool AllocSubBuffer = false; - device::Memory* devSub = - sub->getDeviceMemory(dev(), AllocSubBuffer); - if (nullptr != devSub) { - Memory* gpuSub = reinterpret_cast(devSub); - gpuSub->syncHostFromCache(syncFlagsTmp); - } - } - } - - // Make sure we didn't have a NOP, - // because CPU was the last writer - if (nullptr != owner()->getLastWriter()) { - // Mark parent as up to date, set our version accordingly - version_ = owner()->getVersion(); - } - - // Exit if sync is a NOP or sync can be skipped - if (!hasUpdates || syncFlags.skipEntire_) { - return; - } - - bool result = false; - static const bool Entire = true; - amd::Coord3D origin(0, 0, 0); - - // If backing store was pinned then make a transfer - if (flags_ & PinnedMemoryAlloced) { - Memory& pinned = *dev().getRocMemory(pinnedMemory_); - if (owner()->getType() == CL_MEM_OBJECT_BUFFER) { - amd::Coord3D region(owner()->getSize()); - result = dev().xferMgr().copyBuffer(*this, - pinned, origin, origin, region, Entire); - } - else { - amd::Image& image = static_cast(*owner()); - result = dev().xferMgr().copyImageToBuffer(*this, - pinned, origin, origin, image.getRegion(), Entire, - image.getRowPitch(), image.getSlicePitch()); - } - } - - // Just do a basic host read - if (!result) { - if (owner()->getType() == CL_MEM_OBJECT_BUFFER) { - amd::Coord3D region(owner()->getSize()); - result = dev().xferMgr().readBuffer(*this, - owner()->getHostMem(), origin, region, Entire); - } - else { - amd::Image& image = static_cast(*owner()); - result = dev().xferMgr().readImage(*this, - owner()->getHostMem(), origin, image.getRegion(), - image.getRowPitch(), image.getSlicePitch(), Entire); - } - } - - // Should never fail - assert(result && "Memory synchronization failed!"); + // Attempt to allocate a staging buffer if don't have any 
+ if (owner()->getHostMem() == nullptr) { + if (nullptr != owner()->getSvmPtr()) { + owner()->commitSvmMemory(); + owner()->setHostMem(owner()->getSvmPtr()); + } else { + static const bool forceAllocHostMem = true; + owner()->allocHostMemory(nullptr, forceAllocHostMem); } -} + } -void -Memory::mgpuCacheWriteBack() -{ - // Lock memory object, so only one write back can occur - amd::ScopedLock lock(owner()->lockMemoryOps()); - - // Attempt to allocate a staging buffer if don't have any - if (owner()->getHostMem() == nullptr) { - if (nullptr != owner()->getSvmPtr()) { - owner()->commitSvmMemory(); - owner()->setHostMem(owner()->getSvmPtr()); - } - else { - static const bool forceAllocHostMem = true; - owner()->allocHostMemory(nullptr, forceAllocHostMem); - } - } - - // Make synchronization - if (owner()->getHostMem() != nullptr) { - //! \note Ignore pinning result - bool ok = pinSystemMemory(owner()->getHostMem(), owner()->getSize()); - owner()->cacheWriteBack(); - } + // Make synchronization + if (owner()->getHostMem() != nullptr) { + //! 
\note Ignore pinning result + bool ok = pinSystemMemory(owner()->getHostMem(), owner()->getSize()); + owner()->cacheWriteBack(); + } } /////////////////////////////////roc::Buffer////////////////////////////// -Buffer::Buffer(const roc::Device &dev, amd::Memory &owner) - : roc::Memory(dev, owner) -{} +Buffer::Buffer(const roc::Device& dev, amd::Memory& owner) : roc::Memory(dev, owner) {} -Buffer::Buffer(const roc::Device &dev, size_t size) - : roc::Memory(dev, size) -{} +Buffer::Buffer(const roc::Device& dev, size_t size) : roc::Memory(dev, size) {} -Buffer::~Buffer() -{ - if (owner() == nullptr) { - dev().hostFree(deviceMemory_, size()); - } - else { - destroy(); - } +Buffer::~Buffer() { + if (owner() == nullptr) { + dev().hostFree(deviceMemory_, size()); + } else { + destroy(); + } } -void -Buffer::destroy() -{ - if (owner()->parent() != nullptr) { - return; +void Buffer::destroy() { + if (owner()->parent() != nullptr) { + return; + } + + if (kind_ == MEMORY_KIND_INTEROP) { + destroyInteropBuffer(); + return; + } + + const cl_mem_flags memFlags = owner()->getMemFlags(); + + if ((deviceMemory_ != nullptr) && (deviceMemory_ != owner()->getHostMem())) { + // if they are identical, the host pointer will be + // deallocated later on => avoid double deallocation + if (isHostMemDirectAccess()) { + if (memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)) { + if (dev().agent_profile() != HSA_PROFILE_FULL) { + hsa_amd_memory_unlock(owner()->getHostMem()); + } + } + } else { + dev().memFree(deviceMemory_, size()); } + } - if(kind_==MEMORY_KIND_INTEROP) - { - destroyInteropBuffer(); - return; - } - - const cl_mem_flags memFlags = owner()->getMemFlags(); - - if ((deviceMemory_ != nullptr) && - (deviceMemory_ != owner()->getHostMem())) { - // if they are identical, the host pointer will be - // deallocated later on => avoid double deallocation - if (isHostMemDirectAccess()) { - if (memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)) { - if (dev().agent_profile() != 
HSA_PROFILE_FULL) { - hsa_amd_memory_unlock(owner()->getHostMem()); - } - } - } - else { - dev().memFree(deviceMemory_, size()); - } - } - - if (memFlags & CL_MEM_USE_HOST_PTR) { - if (dev().agent_profile() == HSA_PROFILE_FULL) { - hsa_memory_deregister(owner()->getHostMem(), size()); - } - } -} - -bool -Buffer::create() -{ - if (owner() == nullptr) { - deviceMemory_ = dev().hostAlloc(size(), 1, false); - if (deviceMemory_ != nullptr) { - flags_ |= HostMemoryDirectAccess; - return true; - } - return false; - } - - //Interop buffer - if(owner()->isInterop()) - return createInteropBuffer(GL_ARRAY_BUFFER, 0, nullptr, nullptr); - - if (nullptr != owner()->parent()) { - amd::Memory& parent = *owner()->parent(); - // Sub-Buffer creation. - roc:Memory* parentBuffer = - static_cast(parent.getDeviceMemory(dev_)); - - if (parentBuffer == nullptr) { - LogError("[OCL] Fail to allocate parent buffer"); - return false; - } - - const size_t offset = owner()->getOrigin(); - deviceMemory_ = parentBuffer->getDeviceMemory() + offset; - - flags_ |= parentBuffer->isHostMemDirectAccess() ? - HostMemoryDirectAccess : 0; - - // Explicitly set the host memory location, - // because the parent location could change after reallocation - if (nullptr != parent.getHostMem()) { - owner()->setHostMem( - reinterpret_cast(parent.getHostMem()) + offset); - } - else { - owner()->setHostMem(nullptr); - } - - return true; - } - - // Allocate backing storage in device local memory unless UHP or AHP are set - const cl_mem_flags memFlags = owner()->getMemFlags(); - if (!(memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR))) { - deviceMemory_ = dev().deviceLocalAlloc(size()); - - if (deviceMemory_ == nullptr) { - // TODO: device memory is not enabled yet. - // Fallback to system memory if exist. 
- flags_ |= HostMemoryDirectAccess; - if (dev().agent_profile() == HSA_PROFILE_FULL && - owner()->getHostMem() != nullptr) { - deviceMemory_ = owner()->getHostMem(); - assert( - amd::isMultipleOf( - deviceMemory_, - static_cast(dev().info().memBaseAddrAlign_))); - return true; - } - - deviceMemory_ = dev().hostAlloc(size(), 1, false); - owner()->setHostMem(deviceMemory_); - } - - assert( - amd::isMultipleOf( - deviceMemory_, - static_cast(dev().info().memBaseAddrAlign_))); - - // Transfer data only if OCL context has one device. - // Cache coherency layer will update data for multiple devices - if (deviceMemory_ && (memFlags & CL_MEM_COPY_HOST_PTR) && - (owner()->getContext().devices().size() == 1) ) { - // To avoid recurssive call to Device::createMemory, we perform - // data transfer to the view of the buffer. - amd::Buffer *bufferView = new (owner()->getContext()) amd::Buffer( - *owner(), 0, owner()->getOrigin(), owner()->getSize()); - bufferView->create(); - - roc::Buffer *devBufferView = - new roc::Buffer(dev_, *bufferView); - devBufferView->deviceMemory_ = deviceMemory_; - - bufferView->replaceDeviceMemory(&dev_, devBufferView); - - bool ret = dev().xferMgr().writeBuffer( - owner()->getHostMem(), *devBufferView, amd::Coord3D(0), - amd::Coord3D(size()), true); - - // Release host memory, since runtime copied data - owner()->setHostMem(nullptr); - bufferView->release(); - return ret; - } - - return deviceMemory_ != nullptr; - } - assert(owner()->getHostMem() != nullptr); - - flags_ |= HostMemoryDirectAccess; - + if (memFlags & CL_MEM_USE_HOST_PTR) { if (dev().agent_profile() == HSA_PROFILE_FULL) { - deviceMemory_ = owner()->getHostMem(); + hsa_memory_deregister(owner()->getHostMem(), size()); + } + } +} - if (memFlags & CL_MEM_USE_HOST_PTR) { - hsa_memory_register(deviceMemory_, size()); - } +bool Buffer::create() { + if (owner() == nullptr) { + deviceMemory_ = dev().hostAlloc(size(), 1, false); + if (deviceMemory_ != nullptr) { + flags_ |= 
HostMemoryDirectAccess; + return true; + } + return false; + } - return deviceMemory_ != nullptr; + // Interop buffer + if (owner()->isInterop()) return createInteropBuffer(GL_ARRAY_BUFFER, 0, nullptr, nullptr); + + if (nullptr != owner()->parent()) { + amd::Memory& parent = *owner()->parent(); + // Sub-Buffer creation. + roc: + Memory* parentBuffer = static_cast(parent.getDeviceMemory(dev_)); + + if (parentBuffer == nullptr) { + LogError("[OCL] Fail to allocate parent buffer"); + return false; } - if (owner()->getSvmPtr() != owner()->getHostMem()) { - if (memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)) { - hsa_status_t status = hsa_amd_memory_lock( - owner()->getHostMem(), owner()->getSize(), nullptr, 0, &deviceMemory_); - if (status != HSA_STATUS_SUCCESS) { - deviceMemory_ = nullptr; - } - } - else { - deviceMemory_ = owner()->getHostMem(); - } + const size_t offset = owner()->getOrigin(); + deviceMemory_ = parentBuffer->getDeviceMemory() + offset; + + flags_ |= parentBuffer->isHostMemDirectAccess() ? HostMemoryDirectAccess : 0; + + // Explicitly set the host memory location, + // because the parent location could change after reallocation + if (nullptr != parent.getHostMem()) { + owner()->setHostMem(reinterpret_cast(parent.getHostMem()) + offset); + } else { + owner()->setHostMem(nullptr); } - else { + + return true; + } + + // Allocate backing storage in device local memory unless UHP or AHP are set + const cl_mem_flags memFlags = owner()->getMemFlags(); + if (!(memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR))) { + deviceMemory_ = dev().deviceLocalAlloc(size()); + + if (deviceMemory_ == nullptr) { + // TODO: device memory is not enabled yet. + // Fallback to system memory if exist. 
+ flags_ |= HostMemoryDirectAccess; + if (dev().agent_profile() == HSA_PROFILE_FULL && owner()->getHostMem() != nullptr) { deviceMemory_ = owner()->getHostMem(); + assert( + amd::isMultipleOf(deviceMemory_, static_cast(dev().info().memBaseAddrAlign_))); + return true; + } + + deviceMemory_ = dev().hostAlloc(size(), 1, false); + owner()->setHostMem(deviceMemory_); + } + + assert(amd::isMultipleOf(deviceMemory_, static_cast(dev().info().memBaseAddrAlign_))); + + // Transfer data only if OCL context has one device. + // Cache coherency layer will update data for multiple devices + if (deviceMemory_ && (memFlags & CL_MEM_COPY_HOST_PTR) && + (owner()->getContext().devices().size() == 1)) { + // To avoid recurssive call to Device::createMemory, we perform + // data transfer to the view of the buffer. + amd::Buffer* bufferView = new (owner()->getContext()) + amd::Buffer(*owner(), 0, owner()->getOrigin(), owner()->getSize()); + bufferView->create(); + + roc::Buffer* devBufferView = new roc::Buffer(dev_, *bufferView); + devBufferView->deviceMemory_ = deviceMemory_; + + bufferView->replaceDeviceMemory(&dev_, devBufferView); + + bool ret = dev().xferMgr().writeBuffer(owner()->getHostMem(), *devBufferView, amd::Coord3D(0), + amd::Coord3D(size()), true); + + // Release host memory, since runtime copied data + owner()->setHostMem(nullptr); + bufferView->release(); + return ret; } return deviceMemory_ != nullptr; + } + assert(owner()->getHostMem() != nullptr); + + flags_ |= HostMemoryDirectAccess; + + if (dev().agent_profile() == HSA_PROFILE_FULL) { + deviceMemory_ = owner()->getHostMem(); + + if (memFlags & CL_MEM_USE_HOST_PTR) { + hsa_memory_register(deviceMemory_, size()); + } + + return deviceMemory_ != nullptr; + } + + if (owner()->getSvmPtr() != owner()->getHostMem()) { + if (memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)) { + hsa_status_t status = hsa_amd_memory_lock(owner()->getHostMem(), owner()->getSize(), nullptr, + 0, &deviceMemory_); + if (status != 
HSA_STATUS_SUCCESS) { + deviceMemory_ = nullptr; + } + } else { + deviceMemory_ = owner()->getHostMem(); + } + } else { + deviceMemory_ = owner()->getHostMem(); + } + + return deviceMemory_ != nullptr; } /////////////////////////////////roc::Image////////////////////////////// typedef struct ChannelOrderMap { - uint32_t cl_channel_order; - hsa_ext_image_channel_order_t hsa_channel_order; + uint32_t cl_channel_order; + hsa_ext_image_channel_order_t hsa_channel_order; } ChannelOrderMap; typedef struct ChannelTypeMap { - uint32_t cl_channel_type; - hsa_ext_image_channel_type_t hsa_channel_type; + uint32_t cl_channel_type; + hsa_ext_image_channel_type_t hsa_channel_type; } ChannelTypeMap; static const ChannelOrderMap kChannelOrderMapping[] = { - { CL_R, HSA_EXT_IMAGE_CHANNEL_ORDER_R }, - { CL_A, HSA_EXT_IMAGE_CHANNEL_ORDER_A }, - { CL_RG, HSA_EXT_IMAGE_CHANNEL_ORDER_RG }, - { CL_RA, HSA_EXT_IMAGE_CHANNEL_ORDER_RA }, - { CL_RGB, HSA_EXT_IMAGE_CHANNEL_ORDER_RGB }, - { CL_RGBA, HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA }, - { CL_BGRA, HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA }, - { CL_ARGB, HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB }, - { CL_INTENSITY, HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY }, - { CL_LUMINANCE, HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE }, - { CL_Rx, HSA_EXT_IMAGE_CHANNEL_ORDER_RX }, - { CL_RGx, HSA_EXT_IMAGE_CHANNEL_ORDER_RGX }, - { CL_RGBx, HSA_EXT_IMAGE_CHANNEL_ORDER_RGBX }, - { CL_DEPTH, HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH }, - { CL_DEPTH_STENCIL, HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL }, - { CL_sRGB, HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB }, - { CL_sRGBx, HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX }, - { CL_sRGBA, HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA }, - { CL_sBGRA, HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA }, - { CL_ABGR, HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR }, + {CL_R, HSA_EXT_IMAGE_CHANNEL_ORDER_R}, + {CL_A, HSA_EXT_IMAGE_CHANNEL_ORDER_A}, + {CL_RG, HSA_EXT_IMAGE_CHANNEL_ORDER_RG}, + {CL_RA, HSA_EXT_IMAGE_CHANNEL_ORDER_RA}, + {CL_RGB, HSA_EXT_IMAGE_CHANNEL_ORDER_RGB}, + {CL_RGBA, 
HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA}, + {CL_BGRA, HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA}, + {CL_ARGB, HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB}, + {CL_INTENSITY, HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY}, + {CL_LUMINANCE, HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE}, + {CL_Rx, HSA_EXT_IMAGE_CHANNEL_ORDER_RX}, + {CL_RGx, HSA_EXT_IMAGE_CHANNEL_ORDER_RGX}, + {CL_RGBx, HSA_EXT_IMAGE_CHANNEL_ORDER_RGBX}, + {CL_DEPTH, HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH}, + {CL_DEPTH_STENCIL, HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL}, + {CL_sRGB, HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB}, + {CL_sRGBx, HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX}, + {CL_sRGBA, HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA}, + {CL_sBGRA, HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA}, + {CL_ABGR, HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR}, }; static const ChannelTypeMap kChannelTypeMapping[] = { @@ -832,362 +743,319 @@ static const ChannelTypeMap kChannelTypeMapping[] = { }; -static hsa_access_permission_t -GetHsaAccessPermission(const cl_mem_flags flags) { - if(flags & CL_MEM_READ_ONLY) - return HSA_ACCESS_PERMISSION_RO; - else if(flags & CL_MEM_WRITE_ONLY) - return HSA_ACCESS_PERMISSION_WO; +static hsa_access_permission_t GetHsaAccessPermission(const cl_mem_flags flags) { + if (flags & CL_MEM_READ_ONLY) + return HSA_ACCESS_PERMISSION_RO; + else if (flags & CL_MEM_WRITE_ONLY) + return HSA_ACCESS_PERMISSION_WO; else return HSA_ACCESS_PERMISSION_RW; } -Image::Image(const roc::Device& dev, amd::Memory& owner) : - roc::Memory(dev, owner) -{ - flags_ &= (~HostMemoryDirectAccess & ~HostMemoryRegistered); - populateImageDescriptor(); - hsaImageObject_.handle = 0; - originalDeviceMemory_ = nullptr; +Image::Image(const roc::Device& dev, amd::Memory& owner) : roc::Memory(dev, owner) { + flags_ &= (~HostMemoryDirectAccess & ~HostMemoryRegistered); + populateImageDescriptor(); + hsaImageObject_.handle = 0; + originalDeviceMemory_ = nullptr; } -void -Image::populateImageDescriptor() -{ - amd::Image* image = owner()->asImage(); +void Image::populateImageDescriptor() { + amd::Image* image = 
owner()->asImage(); - // build HSA runtime image descriptor - imageDescriptor_.width = image->getWidth(); - imageDescriptor_.height = image->getHeight(); - imageDescriptor_.depth = image->getDepth(); - imageDescriptor_.array_size = 0; + // build HSA runtime image descriptor + imageDescriptor_.width = image->getWidth(); + imageDescriptor_.height = image->getHeight(); + imageDescriptor_.depth = image->getDepth(); + imageDescriptor_.array_size = 0; - switch (image->getType()) - { + switch (image->getType()) { case CL_MEM_OBJECT_IMAGE1D: - imageDescriptor_.geometry = HSA_EXT_IMAGE_GEOMETRY_1D; - imageDescriptor_.height = 1; - imageDescriptor_.depth = 1; - break; + imageDescriptor_.geometry = HSA_EXT_IMAGE_GEOMETRY_1D; + imageDescriptor_.height = 1; + imageDescriptor_.depth = 1; + break; case CL_MEM_OBJECT_IMAGE1D_BUFFER: - imageDescriptor_.geometry = HSA_EXT_IMAGE_GEOMETRY_1DB; - imageDescriptor_.height = 1; - imageDescriptor_.depth = 1; - break; + imageDescriptor_.geometry = HSA_EXT_IMAGE_GEOMETRY_1DB; + imageDescriptor_.height = 1; + imageDescriptor_.depth = 1; + break; case CL_MEM_OBJECT_IMAGE1D_ARRAY: - //@todo - arraySize = height ?! - imageDescriptor_.geometry = HSA_EXT_IMAGE_GEOMETRY_1DA; - imageDescriptor_.height = 1; - imageDescriptor_.array_size = image->getHeight(); - break; + //@todo - arraySize = height ?! + imageDescriptor_.geometry = HSA_EXT_IMAGE_GEOMETRY_1DA; + imageDescriptor_.height = 1; + imageDescriptor_.array_size = image->getHeight(); + break; case CL_MEM_OBJECT_IMAGE2D: - imageDescriptor_.geometry = HSA_EXT_IMAGE_GEOMETRY_2D; - imageDescriptor_.depth = 1; - break; + imageDescriptor_.geometry = HSA_EXT_IMAGE_GEOMETRY_2D; + imageDescriptor_.depth = 1; + break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: - //@todo - arraySize = depth ?! - imageDescriptor_.geometry = HSA_EXT_IMAGE_GEOMETRY_2DA; - imageDescriptor_.depth = 1; - imageDescriptor_.array_size = image->getDepth(); - break; + //@todo - arraySize = depth ?! 
+ imageDescriptor_.geometry = HSA_EXT_IMAGE_GEOMETRY_2DA; + imageDescriptor_.depth = 1; + imageDescriptor_.array_size = image->getDepth(); + break; case CL_MEM_OBJECT_IMAGE3D: - imageDescriptor_.geometry = HSA_EXT_IMAGE_GEOMETRY_3D; - break; - } + imageDescriptor_.geometry = HSA_EXT_IMAGE_GEOMETRY_3D; + break; + } - const int kChannelOrderCount = - sizeof(kChannelOrderMapping) / sizeof(ChannelOrderMap); - for (int i = 0; i < kChannelOrderCount; i++) { - if (image->getImageFormat().image_channel_order == - kChannelOrderMapping[i].cl_channel_order) { - imageDescriptor_.format.channel_order = - kChannelOrderMapping[i].hsa_channel_order; - break; - } + const int kChannelOrderCount = sizeof(kChannelOrderMapping) / sizeof(ChannelOrderMap); + for (int i = 0; i < kChannelOrderCount; i++) { + if (image->getImageFormat().image_channel_order == kChannelOrderMapping[i].cl_channel_order) { + imageDescriptor_.format.channel_order = kChannelOrderMapping[i].hsa_channel_order; + break; } + } - const int kChannelTypeCount = - sizeof(kChannelTypeMapping) / sizeof(ChannelTypeMap); - for (int i = 0; i < kChannelTypeCount; i++) { - if (image->getImageFormat().image_channel_data_type == - kChannelTypeMapping[i].cl_channel_type) { - imageDescriptor_.format.channel_type = - kChannelTypeMapping[i].hsa_channel_type; - break; - } + const int kChannelTypeCount = sizeof(kChannelTypeMapping) / sizeof(ChannelTypeMap); + for (int i = 0; i < kChannelTypeCount; i++) { + if (image->getImageFormat().image_channel_data_type == kChannelTypeMapping[i].cl_channel_type) { + imageDescriptor_.format.channel_type = kChannelTypeMapping[i].hsa_channel_type; + break; } + } - permission_ = - GetHsaAccessPermission(owner()->getMemFlags()); + permission_ = GetHsaAccessPermission(owner()->getMemFlags()); } -bool -Image::createInteropImage() -{ - auto obj=owner()->getInteropObj()->asGLObject(); - assert(obj->getCLGLObjectType()!=CL_GL_OBJECT_BUFFER && "Non-image OpenGL object used with interop image API."); +bool 
Image::createInteropImage() { + auto obj = owner()->getInteropObj()->asGLObject(); + assert(obj->getCLGLObjectType() != CL_GL_OBJECT_BUFFER && + "Non-image OpenGL object used with interop image API."); const hsa_amd_image_descriptor_t* meta; - size_t size=0; + size_t size = 0; GLenum glTarget = obj->getGLTarget(); if (glTarget == GL_TEXTURE_CUBE_MAP) { glTarget = obj->getCubemapFace(); } - if(!createInteropBuffer(glTarget, obj->getGLMipLevel(), &size, &meta)) - { + if (!createInteropBuffer(glTarget, obj->getGLMipLevel(), &size, &meta)) { assert(false && "Failed to map image buffer."); return false; } - MAKE_SCOPE_GUARD(BufferGuard, [&](){ destroyInteropBuffer(); }); + MAKE_SCOPE_GUARD(BufferGuard, [&]() { destroyInteropBuffer(); }); - amdImageDesc_=(hsa_amd_image_descriptor_t*)malloc(size); - if(amdImageDesc_==nullptr) - return false; - MAKE_SCOPE_GUARD(DescGuard, [&](){ free(amdImageDesc_); amdImageDesc_=nullptr; }); + amdImageDesc_ = (hsa_amd_image_descriptor_t*)malloc(size); + if (amdImageDesc_ == nullptr) return false; + MAKE_SCOPE_GUARD(DescGuard, [&]() { + free(amdImageDesc_); + amdImageDesc_ = nullptr; + }); memcpy(amdImageDesc_, meta, size); image_metadata desc; - if(!desc.create(amdImageDesc_)) - return false; + if (!desc.create(amdImageDesc_)) return false; - if(!desc.setMipLevel(obj->getGLMipLevel())) - return false; + if (!desc.setMipLevel(obj->getGLMipLevel())) return false; - if (obj->getGLTarget()==GL_TEXTURE_CUBE_MAP) - desc.setFace(obj->getCubemapFace()); + if (obj->getGLTarget() == GL_TEXTURE_CUBE_MAP) desc.setFace(obj->getCubemapFace()); - originalDeviceMemory_=deviceMemory_; + originalDeviceMemory_ = deviceMemory_; - hsa_status_t err=hsa_amd_image_create(dev().getBackendDevice(), &imageDescriptor_, amdImageDesc_, originalDeviceMemory_, permission_, &hsaImageObject_); - if(err!=HSA_STATUS_SUCCESS) - return false; + hsa_status_t err = + hsa_amd_image_create(dev().getBackendDevice(), &imageDescriptor_, amdImageDesc_, + originalDeviceMemory_, 
permission_, &hsaImageObject_); + if (err != HSA_STATUS_SUCCESS) return false; BufferGuard.Dismiss(); DescGuard.Dismiss(); return true; } -bool -Image::create() -{ - if (owner()->parent()) { - // Image view creation - roc::Memory *parent = - static_cast(owner()->parent()->getDeviceMemory(dev_)); +bool Image::create() { + if (owner()->parent()) { + // Image view creation + roc::Memory* parent = static_cast(owner()->parent()->getDeviceMemory(dev_)); - if (parent == nullptr) { - LogError("[OCL] Fail to allocate parent image"); - return false; - } - - return createView(*parent); + if (parent == nullptr) { + LogError("[OCL] Fail to allocate parent image"); + return false; } - //Interop image - if (owner()->isInterop()) { - return createInteropImage(); - } + return createView(*parent); + } - // Get memory size requirement for device specific image. - hsa_status_t status = hsa_ext_image_data_get_info( - dev().getBackendDevice(), &imageDescriptor_, - permission_, &deviceImageInfo_); + // Interop image + if (owner()->isInterop()) { + return createInteropImage(); + } - if (status != HSA_STATUS_SUCCESS) { - LogError("[OCL] Fail to allocate image memory"); - return false; - } + // Get memory size requirement for device specific image. + hsa_status_t status = hsa_ext_image_data_get_info(dev().getBackendDevice(), &imageDescriptor_, + permission_, &deviceImageInfo_); - // roc::Device::hostAlloc and deviceLocalAlloc implementation does not - // support alignment larger than HSA memory region allocation granularity. - // In this case, the user manages the alignment. - const size_t alloc_size = - (deviceImageInfo_.alignment <= dev().alloc_granularity()) - ? 
deviceImageInfo_.size - : deviceImageInfo_.size + deviceImageInfo_.alignment; + if (status != HSA_STATUS_SUCCESS) { + LogError("[OCL] Fail to allocate image memory"); + return false; + } - if (!(owner()->getMemFlags() & CL_MEM_ALLOC_HOST_PTR)) { - originalDeviceMemory_ = dev().deviceLocalAlloc(alloc_size); - } + // roc::Device::hostAlloc and deviceLocalAlloc implementation does not + // support alignment larger than HSA memory region allocation granularity. + // In this case, the user manages the alignment. + const size_t alloc_size = (deviceImageInfo_.alignment <= dev().alloc_granularity()) + ? deviceImageInfo_.size + : deviceImageInfo_.size + deviceImageInfo_.alignment; - if (originalDeviceMemory_ == nullptr) { - originalDeviceMemory_ = dev().hostAlloc(alloc_size, 1, false); - } + if (!(owner()->getMemFlags() & CL_MEM_ALLOC_HOST_PTR)) { + originalDeviceMemory_ = dev().deviceLocalAlloc(alloc_size); + } - deviceMemory_ = reinterpret_cast( - amd::alignUp(reinterpret_cast(originalDeviceMemory_), - deviceImageInfo_.alignment)); + if (originalDeviceMemory_ == nullptr) { + originalDeviceMemory_ = dev().hostAlloc(alloc_size, 1, false); + } - assert(amd::isMultipleOf( - deviceMemory_, static_cast(deviceImageInfo_.alignment))); + deviceMemory_ = reinterpret_cast( + amd::alignUp(reinterpret_cast(originalDeviceMemory_), deviceImageInfo_.alignment)); - status = hsa_ext_image_create( - dev().getBackendDevice(), &imageDescriptor_, deviceMemory_, - permission_, &hsaImageObject_); + assert(amd::isMultipleOf(deviceMemory_, static_cast(deviceImageInfo_.alignment))); - if (status != HSA_STATUS_SUCCESS) { - LogError("[OCL] Fail to allocate image memory"); - return false; - } + status = hsa_ext_image_create(dev().getBackendDevice(), &imageDescriptor_, deviceMemory_, + permission_, &hsaImageObject_); - return true; + if (status != HSA_STATUS_SUCCESS) { + LogError("[OCL] Fail to allocate image memory"); + return false; + } + + return true; } -bool -Image::createView(const Memory 
&parent) -{ - deviceMemory_ = parent.getDeviceMemory(); +bool Image::createView(const Memory& parent) { + deviceMemory_ = parent.getDeviceMemory(); - originalDeviceMemory_ = (parent.owner()->asBuffer() != nullptr) - ? deviceMemory_ - : static_cast(parent).originalDeviceMemory_; + originalDeviceMemory_ = (parent.owner()->asBuffer() != nullptr) + ? deviceMemory_ + : static_cast(parent).originalDeviceMemory_; - //Detect image view from buffer to distinguish linear paths from tiled. - amd::Memory* ancestor = parent.owner(); - while ((ancestor->asBuffer() == nullptr) && (ancestor->parent() != nullptr)) { - ancestor = ancestor->parent(); - } - bool linearLayout = (ancestor->asBuffer() != nullptr); + // Detect image view from buffer to distinguish linear paths from tiled. + amd::Memory* ancestor = parent.owner(); + while ((ancestor->asBuffer() == nullptr) && (ancestor->parent() != nullptr)) { + ancestor = ancestor->parent(); + } + bool linearLayout = (ancestor->asBuffer() != nullptr); - kind_ = parent.getKind(); - version_ = parent.version(); + kind_ = parent.getKind(); + version_ = parent.version(); - hsa_status_t status; - if (linearLayout) { - size_t rowPitch; - amd::Image& ownerImage = *owner()->asImage(); - size_t elementSize = ownerImage.getImageFormat().getElementSize(); - // First get the row pitch in pixels - if (ownerImage.getRowPitch() != 0) { - rowPitch = ownerImage.getRowPitch() / elementSize; - } - else { - rowPitch = ownerImage.getWidth(); - } - - // Make sure the row pitch is aligned to pixels - rowPitch = elementSize * - amd::alignUp(rowPitch, dev().info().imagePitchAlignment_); - - status = hsa_ext_image_create_with_layout(dev().getBackendDevice(), - &imageDescriptor_, deviceMemory_, permission_, - HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, rowPitch, 0, - &hsaImageObject_); - } else if (kind_ == MEMORY_KIND_INTEROP) { - amdImageDesc_ = static_cast(parent.owner()->getDeviceMemory(dev()))->amdImageDesc_; - status = hsa_amd_image_create(dev().getBackendDevice(), 
&imageDescriptor_, - amdImageDesc_, deviceMemory_, permission_, &hsaImageObject_); + hsa_status_t status; + if (linearLayout) { + size_t rowPitch; + amd::Image& ownerImage = *owner()->asImage(); + size_t elementSize = ownerImage.getImageFormat().getElementSize(); + // First get the row pitch in pixels + if (ownerImage.getRowPitch() != 0) { + rowPitch = ownerImage.getRowPitch() / elementSize; } else { - status= hsa_ext_image_create(dev().getBackendDevice(), &imageDescriptor_, - deviceMemory_, permission_, &hsaImageObject_); + rowPitch = ownerImage.getWidth(); } - if (status != HSA_STATUS_SUCCESS) { - LogError("[OCL] Fail to allocate image memory"); - return false; - } + // Make sure the row pitch is aligned to pixels + rowPitch = elementSize * amd::alignUp(rowPitch, dev().info().imagePitchAlignment_); - return true; + status = hsa_ext_image_create_with_layout( + dev().getBackendDevice(), &imageDescriptor_, deviceMemory_, permission_, + HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, rowPitch, 0, &hsaImageObject_); + } else if (kind_ == MEMORY_KIND_INTEROP) { + amdImageDesc_ = static_cast(parent.owner()->getDeviceMemory(dev()))->amdImageDesc_; + status = hsa_amd_image_create(dev().getBackendDevice(), &imageDescriptor_, amdImageDesc_, + deviceMemory_, permission_, &hsaImageObject_); + } else { + status = hsa_ext_image_create(dev().getBackendDevice(), &imageDescriptor_, deviceMemory_, + permission_, &hsaImageObject_); + } + + if (status != HSA_STATUS_SUCCESS) { + LogError("[OCL] Fail to allocate image memory"); + return false; + } + + return true; } -void* -Image::allocMapTarget( - const amd::Coord3D& origin, - const amd::Coord3D& region, - uint mapFlags, - size_t* rowPitch, - size_t* slicePitch) -{ - amd::ScopedLock lock(owner()->lockMemoryOps()); +void* Image::allocMapTarget(const amd::Coord3D& origin, const amd::Coord3D& region, uint mapFlags, + size_t* rowPitch, size_t* slicePitch) { + amd::ScopedLock lock(owner()->lockMemoryOps()); - incIndMapCount(); + incIndMapCount(); - 
void* pHostMem = owner()->getHostMem(); + void* pHostMem = owner()->getHostMem(); - amd::Image* image = owner()->asImage(); + amd::Image* image = owner()->asImage(); - size_t elementSize = image->getImageFormat().getElementSize(); + size_t elementSize = image->getImageFormat().getElementSize(); - size_t offset = origin[0] * elementSize; + size_t offset = origin[0] * elementSize; - if (pHostMem == nullptr) { - if (indirectMapCount_ == 1) { - if (!allocateMapMemory(owner()->getSize())) { - decIndMapCount(); - return nullptr; - } - } - else { - // Did the map resource allocation fail? - if (mapMemory_ == nullptr) { - LogError("Could not map target resource"); - return nullptr; - } - } - - pHostMem = mapMemory_->getHostMem(); - - *rowPitch = region[0] * elementSize; - - size_t slicePitchTmp = 0; - - if (imageDescriptor_.geometry == HSA_EXT_IMAGE_GEOMETRY_1DA) { - slicePitchTmp = *rowPitch; - } - else { - slicePitchTmp = *rowPitch * region[1]; - } - if (slicePitch != nullptr) { - *slicePitch = slicePitchTmp; - } - - return pHostMem; + if (pHostMem == nullptr) { + if (indirectMapCount_ == 1) { + if (!allocateMapMemory(owner()->getSize())) { + decIndMapCount(); + return nullptr; + } + } else { + // Did the map resource allocation fail? 
+ if (mapMemory_ == nullptr) { + LogError("Could not map target resource"); + return nullptr; + } } - // Adjust offset with Y dimension - offset += image->getRowPitch() * origin[1]; + pHostMem = mapMemory_->getHostMem(); - // Adjust offset with Z dimension - offset += image->getSlicePitch() * origin[2]; + *rowPitch = region[0] * elementSize; - *rowPitch = image->getRowPitch(); + size_t slicePitchTmp = 0; + + if (imageDescriptor_.geometry == HSA_EXT_IMAGE_GEOMETRY_1DA) { + slicePitchTmp = *rowPitch; + } else { + slicePitchTmp = *rowPitch * region[1]; + } if (slicePitch != nullptr) { - *slicePitch = image->getSlicePitch(); + *slicePitch = slicePitchTmp; } - return (static_cast(pHostMem)+offset); + return pHostMem; + } + + // Adjust offset with Y dimension + offset += image->getRowPitch() * origin[1]; + + // Adjust offset with Z dimension + offset += image->getSlicePitch() * origin[2]; + + *rowPitch = image->getRowPitch(); + if (slicePitch != nullptr) { + *slicePitch = image->getSlicePitch(); + } + + return (static_cast(pHostMem) + offset); } -Image::~Image() -{ - destroy(); -} +Image::~Image() { destroy(); } -void -Image::destroy() -{ +void Image::destroy() { if (hsaImageObject_.handle != 0) { - hsa_status_t status = - hsa_ext_image_destroy(dev().getBackendDevice(), hsaImageObject_); - assert(status == HSA_STATUS_SUCCESS); + hsa_status_t status = hsa_ext_image_destroy(dev().getBackendDevice(), hsaImageObject_); + assert(status == HSA_STATUS_SUCCESS); } if (owner()->parent() != nullptr) { - return; + return; } - if(kind_==MEMORY_KIND_INTEROP) - { + if (kind_ == MEMORY_KIND_INTEROP) { free(amdImageDesc_); - amdImageDesc_=nullptr; + amdImageDesc_ = nullptr; destroyInteropBuffer(); return; } if (originalDeviceMemory_ != nullptr) { - dev().memFree(originalDeviceMemory_, deviceImageInfo_.size); + dev().memFree(originalDeviceMemory_, deviceImageInfo_.size); } } } diff --git a/rocclr/runtime/device/rocm/rocmemory.hpp b/rocclr/runtime/device/rocm/rocmemory.hpp index 
f0b76479f2..3aa7de869e 100644 --- a/rocclr/runtime/device/rocm/rocmemory.hpp +++ b/rocclr/runtime/device/rocm/rocmemory.hpp @@ -14,187 +14,180 @@ namespace roc { class Memory : public device::Memory { public: - enum MEMORY_KIND { MEMORY_KIND_NORMAL=0, MEMORY_KIND_LOCK, MEMORY_KIND_GART, MEMORY_KIND_INTEROP }; + enum MEMORY_KIND { + MEMORY_KIND_NORMAL = 0, + MEMORY_KIND_LOCK, + MEMORY_KIND_GART, + MEMORY_KIND_INTEROP + }; - Memory(const roc::Device &dev, amd::Memory &owner); + Memory(const roc::Device& dev, amd::Memory& owner); - Memory(const roc::Device &dev, size_t size); + Memory(const roc::Device& dev, size_t size); - virtual ~Memory(); + virtual ~Memory(); - // Getter for deviceMemory_ - address getDeviceMemory() const { return reinterpret_cast
(deviceMemory_); } + // Getter for deviceMemory_ + address getDeviceMemory() const { return reinterpret_cast
(deviceMemory_); } - // Gets a pointer to a region of host-visible memory for use as the target - // of an indirect map for a given memory object - virtual void *allocMapTarget(const amd::Coord3D &origin, - const amd::Coord3D ®ion, - uint mapFlags, - size_t *rowPitch, - size_t *slicePitch); + // Gets a pointer to a region of host-visible memory for use as the target + // of an indirect map for a given memory object + virtual void* allocMapTarget(const amd::Coord3D& origin, const amd::Coord3D& region, + uint mapFlags, size_t* rowPitch, size_t* slicePitch); - // Create device memory according to OpenCL memory flag. - virtual bool create() = 0; + // Create device memory according to OpenCL memory flag. + virtual bool create() = 0; - // Pins system memory associated with this memory object. - virtual bool pinSystemMemory(void *hostPtr, // System memory address - size_t size // Size of allocated system memory - ); + // Pins system memory associated with this memory object. + virtual bool pinSystemMemory(void* hostPtr, // System memory address + size_t size // Size of allocated system memory + ); - //! Updates device memory from the owner's host allocation - void syncCacheFromHost( - VirtualGPU& gpu, //!< Virtual GPU device object - //! Synchronization flags - device::Memory::SyncFlags syncFlags = device::Memory::SyncFlags() - ); + //! Updates device memory from the owner's host allocation + void syncCacheFromHost(VirtualGPU& gpu, //!< Virtual GPU device object + //! Synchronization flags + device::Memory::SyncFlags syncFlags = device::Memory::SyncFlags()); - // Immediate blocking write from device cache to owners's backing store. - // Marks owner as "current" by resetting the last writer to nullptr. - virtual void syncHostFromCache(SyncFlags syncFlags = SyncFlags()); + // Immediate blocking write from device cache to owners's backing store. + // Marks owner as "current" by resetting the last writer to nullptr. 
+ virtual void syncHostFromCache(SyncFlags syncFlags = SyncFlags()); - //! Allocates host memory for synchronization with MGPU context - void mgpuCacheWriteBack(); + //! Allocates host memory for synchronization with MGPU context + void mgpuCacheWriteBack(); - // Releases indirect map surface - void releaseIndirectMap() { decIndMapCount(); } + // Releases indirect map surface + void releaseIndirectMap() { decIndMapCount(); } - //! Map the device memory to CPU visible - virtual void* cpuMap( - device::VirtualDevice& vDev, //!< Virtual device for map operaiton - uint flags = 0, //!< flags for the map operation - // Optimization for multilayer map/unmap - uint startLayer = 0, //!< Start layer for multilayer map - uint numLayers = 0, //!< End layer for multilayer map - size_t* rowPitch = nullptr, //!< Row pitch for the device memory - size_t* slicePitch = nullptr //!< Slice pitch for the device memory - ); + //! Map the device memory to CPU visible + virtual void* cpuMap(device::VirtualDevice& vDev, //!< Virtual device for map operaiton + uint flags = 0, //!< flags for the map operation + // Optimization for multilayer map/unmap + uint startLayer = 0, //!< Start layer for multilayer map + uint numLayers = 0, //!< End layer for multilayer map + size_t* rowPitch = nullptr, //!< Row pitch for the device memory + size_t* slicePitch = nullptr //!< Slice pitch for the device memory + ); - //! Unmap the device memory - virtual void cpuUnmap( - device::VirtualDevice& vDev //!< Virtual device for unmap operaiton - ); + //! Unmap the device memory + virtual void cpuUnmap(device::VirtualDevice& vDev //!< Virtual device for unmap operaiton + ); - //Mesa has already decomressed if needed and also does acquire at the start of every command batch. - virtual bool processGLResource(GLResourceOP operation) { return true; } + // Mesa has already decomressed if needed and also does acquire at the start of every command + // batch. 
+ virtual bool processGLResource(GLResourceOP operation) { return true; } - // Accessors for indirect map memory object - amd::Memory *mapMemory() const { return mapMemory_; } + // Accessors for indirect map memory object + amd::Memory* mapMemory() const { return mapMemory_; } - MEMORY_KIND getKind() const { return kind_; } + MEMORY_KIND getKind() const { return kind_; } - const roc::Device& dev() const { return dev_; } + const roc::Device& dev() const { return dev_; } - size_t version() const { return version_; } + size_t version() const { return version_; } protected: + bool allocateMapMemory(size_t allocationSize); - bool allocateMapMemory(size_t allocationSize); + // Decrement map count + virtual void decIndMapCount(); - // Decrement map count - virtual void decIndMapCount(); + // Free / deregister device memory. + virtual void destroy() = 0; - // Free / deregister device memory. - virtual void destroy() = 0; + // Place interop object into HSA's flat address space + bool createInteropBuffer(GLenum targetType, int miplevel, size_t* metadata_size, + const hsa_amd_image_descriptor_t** metadata); - // Place interop object into HSA's flat address space - bool createInteropBuffer(GLenum targetType, int miplevel, size_t* metadata_size, const hsa_amd_image_descriptor_t** metadata); + void destroyInteropBuffer(); - void destroyInteropBuffer(); + // Pointer to the device associated with this memory object. + const roc::Device& dev_; - // Pointer to the device associated with this memory object. - const roc::Device &dev_; + // Pointer to the device memory. This could be in system or device local mem. + void* deviceMemory_; - // Pointer to the device memory. This could be in system or device local mem. - void* deviceMemory_; + // Track if this memory is interop, lock, gart, or normal. + MEMORY_KIND kind_; - // Track if this memory is interop, lock, gart, or normal. 
- MEMORY_KIND kind_; + private: + // Disable copy constructor + Memory(const Memory&); -private: - // Disable copy constructor - Memory(const Memory &); + // Disable operator= + Memory& operator=(const Memory&); - // Disable operator= - Memory &operator=(const Memory &); - - amd::Memory* pinnedMemory_; //!< Memory used as pinned system memory + amd::Memory* pinnedMemory_; //!< Memory used as pinned system memory }; class Buffer : public roc::Memory { public: - Buffer(const roc::Device &dev, amd::Memory &owner); - Buffer(const roc::Device &dev, size_t size); + Buffer(const roc::Device& dev, amd::Memory& owner); + Buffer(const roc::Device& dev, size_t size); - virtual ~Buffer(); + virtual ~Buffer(); - // Create device memory according to OpenCL memory flag. - virtual bool create(); + // Create device memory according to OpenCL memory flag. + virtual bool create(); - // Recreate the device memory using new size and alignment. - bool recreate(size_t newSize, size_t newAlignment, bool forceSystem); + // Recreate the device memory using new size and alignment. + bool recreate(size_t newSize, size_t newAlignment, bool forceSystem); private: - // Disable copy constructor - Buffer(const Buffer &); + // Disable copy constructor + Buffer(const Buffer&); - // Disable operator= - Buffer &operator=(const Buffer &); + // Disable operator= + Buffer& operator=(const Buffer&); - // Free device memory. - void destroy(); + // Free device memory. + void destroy(); }; -class Image : public roc::Memory -{ -public: - Image(const roc::Device& dev, amd::Memory& owner); +class Image : public roc::Memory { + public: + Image(const roc::Device& dev, amd::Memory& owner); - virtual ~Image(); + virtual ~Image(); - //! Create device memory according to OpenCL memory flag. - virtual bool create(); + //! Create device memory according to OpenCL memory flag. + virtual bool create(); - //! Create an image view - bool createView(const Memory &parent); + //! 
Create an image view + bool createView(const Memory& parent); - //! Gets a pointer to a region of host-visible memory for use as the target - //! of an indirect map for a given memory object - virtual void* allocMapTarget( - const amd::Coord3D& origin, - const amd::Coord3D& region, - uint mapFlags, - size_t* rowPitch, - size_t* slicePitch); + //! Gets a pointer to a region of host-visible memory for use as the target + //! of an indirect map for a given memory object + virtual void* allocMapTarget(const amd::Coord3D& origin, const amd::Coord3D& region, + uint mapFlags, size_t* rowPitch, size_t* slicePitch); - size_t getDeviceDataSize() { return deviceImageInfo_.size; } - size_t getDeviceDataAlignment() { return deviceImageInfo_.alignment; } + size_t getDeviceDataSize() { return deviceImageInfo_.size; } + size_t getDeviceDataAlignment() { return deviceImageInfo_.alignment; } - hsa_ext_image_t getHsaImageObject() { return hsaImageObject_; } - const hsa_ext_image_descriptor_t& getHsaImageDescriptor() const { return imageDescriptor_; } -private: - //! Disable copy constructor - Image(const Buffer&); + hsa_ext_image_t getHsaImageObject() { return hsaImageObject_; } + const hsa_ext_image_descriptor_t& getHsaImageDescriptor() const { return imageDescriptor_; } - //! Disable operator= - Image& operator=(const Buffer&); + private: + //! Disable copy constructor + Image(const Buffer&); - // Setup an interop image - bool createInteropImage(); + //! Disable operator= + Image& operator=(const Buffer&); - // Free / deregister device memory. - void destroy(); + // Setup an interop image + bool createInteropImage(); - void populateImageDescriptor(); + // Free / deregister device memory. 
+ void destroy(); - hsa_ext_image_descriptor_t imageDescriptor_; - hsa_access_permission_t permission_; - hsa_ext_image_data_info_t deviceImageInfo_; - hsa_ext_image_t hsaImageObject_; - hsa_amd_image_descriptor_t* amdImageDesc_; + void populateImageDescriptor(); - void* originalDeviceMemory_; + hsa_ext_image_descriptor_t imageDescriptor_; + hsa_access_permission_t permission_; + hsa_ext_image_data_info_t deviceImageInfo_; + hsa_ext_image_t hsaImageObject_; + hsa_amd_image_descriptor_t* amdImageDesc_; + + void* originalDeviceMemory_; }; - } #endif - diff --git a/rocclr/runtime/device/rocm/rocprintf.cpp b/rocclr/runtime/device/rocm/rocprintf.cpp index 9dc25c810d..9d97d7c934 100644 --- a/rocclr/runtime/device/rocm/rocprintf.cpp +++ b/rocclr/runtime/device/rocm/rocprintf.cpp @@ -18,18 +18,14 @@ namespace roc { PrintfDbg::PrintfDbg(Device& device, FILE* file) - : dbgBuffer_(nullptr), - dbgBuffer_size_(0), - dbgFile_(file), - gpuDevice_(device) {} + : dbgBuffer_(nullptr), dbgBuffer_size_(0), dbgFile_(file), gpuDevice_(device) {} PrintfDbg::~PrintfDbg() { dev().hostFree(dbgBuffer_, dbgBuffer_size_); } bool PrintfDbg::allocate(bool realloc) { if (nullptr == dbgBuffer_) { dbgBuffer_size_ = dev().info().printfBufferSize_; - dbgBuffer_ = reinterpret_cast
( - dev().hostAlloc(dbgBuffer_size_, sizeof(void*))); + dbgBuffer_ = reinterpret_cast
(dev().hostAlloc(dbgBuffer_size_, sizeof(void*))); } else if (realloc) { LogWarning("Debug buffer reallocation!"); // Double the buffer size if it's not big enough @@ -62,8 +58,7 @@ bool PrintfDbg::checkString(const std::string& fmt) const { return false; } -int PrintfDbg::checkVectorSpecifier(const std::string& fmt, size_t startPos, - size_t& curPos) const { +int PrintfDbg::checkVectorSpecifier(const std::string& fmt, size_t startPos, size_t& curPos) const { int vectorSize = 0; size_t pos = curPos; size_t size = curPos - startPos; @@ -115,8 +110,8 @@ int PrintfDbg::checkVectorSpecifier(const std::string& fmt, size_t startPos, static const size_t ConstStr = 0xffffffff; static const char Separator[] = ",\0"; -size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, - size_t size, const uint32_t* argument) const { +size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, size_t size, + const uint32_t* argument) const { // Serialize the output to the screen // amd::ScopedLock k(dev().lockAsyncOps()); @@ -131,8 +126,7 @@ size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, // copiedBytes = strlen("(null)") copiedBytes = 6; } else { - const unsigned char* argumentStr = - reinterpret_cast(argument); + const unsigned char* argumentStr = reinterpret_cast(argument); amd::Os::printf(fmt.data(), argumentStr); // copiedBytes = strlen(argumentStr) while (argumentStr[copiedBytes++] != 0) @@ -157,8 +151,7 @@ size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, ; } break; case 1: - amd::Os::printf(fmt.data(), - *(reinterpret_cast(argument))); + amd::Os::printf(fmt.data(), *(reinterpret_cast(argument))); break; case 2: case 4: @@ -199,9 +192,7 @@ size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, // fmt should be updated not to contain "hh" modifier std::string hhFmt = fmt; hhFmt.erase(hhFmt.find_first_of("h"), 2); - amd::Os::printf( - hhFmt.data(), - *(reinterpret_cast(argument))); + 
amd::Os::printf(hhFmt.data(), *(reinterpret_cast(argument))); } else if (hlModifier) { amd::Os::printf(hlFmt.data(), *argument); } else { @@ -212,18 +203,15 @@ size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, case 8: if (printFloat) { if (hlModifier) { - amd::Os::printf(hlFmt.data(), - *(reinterpret_cast(argument))); + amd::Os::printf(hlFmt.data(), *(reinterpret_cast(argument))); } else { - amd::Os::printf(fmt.data(), - *(reinterpret_cast(argument))); + amd::Os::printf(fmt.data(), *(reinterpret_cast(argument))); } } else { std::string out = fmt; // Use 'll' for 64 bit printf out.insert((out.size() - 1), 1, 'l'); - amd::Os::printf(out.data(), - *(reinterpret_cast(argument))); + amd::Os::printf(out.data(), *(reinterpret_cast(argument))); } break; case ConstStr: { @@ -240,8 +228,8 @@ size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, return copiedBytes; } -void PrintfDbg::outputDbgBuffer(const PrintfInfo& info, - const uint32_t* workitemData, size_t& i) const { +void PrintfDbg::outputDbgBuffer(const PrintfInfo& info, const uint32_t* workitemData, + size_t& i) const { static const char* specifiers = "cdieEfgGaosuxXp"; static const char* modifiers = "hl"; static const char* special = "%n"; @@ -278,15 +266,13 @@ void PrintfDbg::outputDbgBuffer(const PrintfInfo& info, fmt = str.substr(pos, posEnd - pos); fmt.erase(posStart - pos - 1, 1); pos = posStart = posEnd; - outputArgument(sepStr, false, ConstStr, - reinterpret_cast(fmt.data())); + outputArgument(sepStr, false, ConstStr, reinterpret_cast(fmt.data())); continue; } break; } else if (pos < str.length()) { - outputArgument( - sepStr, false, ConstStr, - reinterpret_cast((str.substr(pos)).data())); + outputArgument(sepStr, false, ConstStr, + reinterpret_cast((str.substr(pos)).data())); } } while (posStart != std::string::npos); @@ -338,8 +324,7 @@ void PrintfDbg::outputDbgBuffer(const PrintfInfo& info, } else { // 3-component vector's size is defined as 4 * size of each 
scalar // component - size_t elemSize = - info.arguments_[j] / (vectorSize == 3 ? 4 : vectorSize); + size_t elemSize = info.arguments_[j] / (vectorSize == 3 ? 4 : vectorSize); size_t k = i * sizeof(uint32_t); std::string elementStr = fmt.substr(idPos, fmt.size()); @@ -352,16 +337,13 @@ void PrintfDbg::outputDbgBuffer(const PrintfInfo& info, for (int e = 1; e < vectorSize; ++e) { const char* t = reinterpret_cast(s); // Output the vector separator - outputArgument(sepStr, false, ConstStr, - reinterpret_cast(Separator)); + outputArgument(sepStr, false, ConstStr, reinterpret_cast(Separator)); // Output the next element - outputArgument( - elementStr, printFloat, elemSize, - reinterpret_cast(&t[k + e * elemSize])); + outputArgument(elementStr, printFloat, elemSize, + reinterpret_cast(&t[k + e * elemSize])); } - i += (amd::alignUp(info.arguments_[j], sizeof(uint32_t))) / - sizeof(uint32_t); + i += (amd::alignUp(info.arguments_[j], sizeof(uint32_t))) / sizeof(uint32_t); } } else { amd::Os::printf( @@ -374,8 +356,7 @@ void PrintfDbg::outputDbgBuffer(const PrintfInfo& info, if (pos != std::string::npos) { fmt = str.substr(pos, str.size() - pos); - outputArgument(sepStr, false, ConstStr, - reinterpret_cast(fmt.data())); + outputArgument(sepStr, false, ConstStr, reinterpret_cast(fmt.data())); } } @@ -399,8 +380,7 @@ bool PrintfDbg::init(bool printfEnabled) { // Copy offset and number of bytes available for printf data // into the corresponding location in the debug buffer - hsa_status_t err = - hsa_memory_copy(dbgBuffer_, sysMem, 2 * sizeof(uint32_t)); + hsa_status_t err = hsa_memory_copy(dbgBuffer_, sysMem, 2 * sizeof(uint32_t)); if (err != HSA_STATUS_SUCCESS) { LogError("\n Can't copy offset and bytes available data to dgbBuffer_!"); return false; @@ -430,8 +410,7 @@ bool PrintfDbg::output(VirtualGPU& gpu, bool printfEnabled, } // Get a pointer to the buffer data - dbgBufferPtr = - reinterpret_cast(dbgBuffer_ + 2 * sizeof(uint32_t)); + dbgBufferPtr = 
reinterpret_cast(dbgBuffer_ + 2 * sizeof(uint32_t)); if (nullptr == dbgBufferPtr) { return false; } @@ -442,8 +421,7 @@ bool PrintfDbg::output(VirtualGPU& gpu, bool printfEnabled, // parse the debug buffer while (sbt < offsetSize) { - assert(((*dbgBufferPtr) < printfInfo.size()) && - "Cound't find the reported PrintfID!"); + assert(((*dbgBufferPtr) < printfInfo.size()) && "Cound't find the reported PrintfID!"); const PrintfInfo& info = printfInfo[(*dbgBufferPtr)]; sb += sizeof(uint32_t); for (ita = info.arguments_.begin(); ita != info.arguments_.end(); ++ita) { diff --git a/rocclr/runtime/device/rocm/rocprintf.hpp b/rocclr/runtime/device/rocm/rocprintf.hpp index bcc2f252b5..b27b156936 100644 --- a/rocclr/runtime/device/rocm/rocprintf.hpp +++ b/rocclr/runtime/device/rocm/rocprintf.hpp @@ -71,9 +71,8 @@ class PrintfDbg : public amd::HeapObject { Device& dev() const { return gpuDevice_; } //! Allocates the debug buffer - bool allocate( - bool realloc = false //!< If TRUE then reallocate the debug memory - ); + bool allocate(bool realloc = false //!< If TRUE then reallocate the debug memory + ); //! Returns TRUE if a float value has to be printed bool checkFloat(const std::string& fmt //!< Format string @@ -85,8 +84,8 @@ class PrintfDbg : public amd::HeapObject { //! Finds the specifier in the format string int checkVectorSpecifier(const std::string& fmt, //!< Format string - size_t startPos, //!< Start position for processing - size_t& curPos //!< End position for processing + size_t startPos, //!< Start position for processing + size_t& curPos //!< End position for processing ) const; //! Outputs an argument @@ -97,11 +96,10 @@ class PrintfDbg : public amd::HeapObject { ) const; //! 
Displays the PrintfDbg - void outputDbgBuffer( - const PrintfInfo& info, //!< printf info - const uint32_t* workitemData, //!< The PrintfDbg dump buffer - size_t& i //!< index to the data in the buffer - ) const; + void outputDbgBuffer(const PrintfInfo& info, //!< printf info + const uint32_t* workitemData, //!< The PrintfDbg dump buffer + size_t& i //!< index to the data in the buffer + ) const; private: //! Disable copy constructor @@ -112,4 +110,3 @@ class PrintfDbg : public amd::HeapObject { }; /*@}*/} // namespace roc - diff --git a/rocclr/runtime/device/rocm/rocprogram.cpp b/rocclr/runtime/device/rocm/rocprogram.cpp index 2f6f0e2a3a..74651dbd97 100644 --- a/rocclr/runtime/device/rocm/rocprogram.cpp +++ b/rocclr/runtime/device/rocm/rocprogram.cpp @@ -14,10 +14,10 @@ #include "libelf/gelf.h" #include "driver/AmdCompiler.h" #include "libraries.amdgcn.inc" -#else // !defined(WITH_LIGHTNING_COMPILER) +#else // !defined(WITH_LIGHTNING_COMPILER) #include "roccompilerlib.hpp" #include "amd_hsa_code.hpp" -#endif // !defined(WITH_LIGHTNING_COMPILER) +#endif // !defined(WITH_LIGHTNING_COMPILER) #include "utils/bif_section_labels.hpp" #include "amd_hsa_kernel_code.h" @@ -33,1457 +33,1357 @@ namespace roc { -static hsa_status_t -GetKernelNamesCallback( - hsa_executable_t exec, - hsa_agent_t agent, - hsa_executable_symbol_t symbol, - void *data) -{ - std::vector* symNameList = reinterpret_cast*>(data); +static hsa_status_t GetKernelNamesCallback(hsa_executable_t exec, hsa_agent_t agent, + hsa_executable_symbol_t symbol, void* data) { + std::vector* symNameList = reinterpret_cast*>(data); - hsa_symbol_kind_t sym_type; - hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &sym_type); + hsa_symbol_kind_t sym_type; + hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &sym_type); - if (sym_type == HSA_SYMBOL_KIND_KERNEL) { - uint32_t len; - hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &len); + if 
(sym_type == HSA_SYMBOL_KIND_KERNEL) { + uint32_t len; + hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &len); - char* symName = (char*) alloca(len+1); - hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, symName); - symName[len] = '\0'; + char* symName = (char*)alloca(len + 1); + hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, symName); + symName[len] = '\0'; - std::string kernelName(symName); - symNameList->push_back(kernelName); - } + std::string kernelName(symName); + symNameList->push_back(kernelName); + } - return HSA_STATUS_SUCCESS; + return HSA_STATUS_SUCCESS; } /* Temporary log function for the compiler library */ -static void -logFunction(const char *msg, size_t size) -{ - std::cout << "Compiler Library log :" << msg << std::endl; +static void logFunction(const char* msg, size_t size) { + std::cout << "Compiler Library log :" << msg << std::endl; } -HSAILProgram::~HSAILProgram() -{ +HSAILProgram::~HSAILProgram() { #if !defined(WITH_LIGHTNING_COMPILER) - acl_error error; - // Free the elf binary - if (binaryElf_ != nullptr) { - error = g_complibApi._aclBinaryFini(binaryElf_); - if (error != ACL_SUCCESS) { - LogWarning( "Error while destroying the acl binary \n" ); - } + acl_error error; + // Free the elf binary + if (binaryElf_ != nullptr) { + error = g_complibApi._aclBinaryFini(binaryElf_); + if (error != ACL_SUCCESS) { + LogWarning("Error while destroying the acl binary \n"); } -#endif // !defined(WITH_LIGHTNING_COMPILER) - // Destroy the executable. - if (hsaExecutable_.handle != 0) { - hsa_executable_destroy(hsaExecutable_); - } - // Destroy the program handle. - if (hsaProgramHandle_.handle != 0) { - hsa_ext_program_destroy(hsaProgramHandle_); - } - releaseClBinary(); + } +#endif // !defined(WITH_LIGHTNING_COMPILER) + // Destroy the executable. + if (hsaExecutable_.handle != 0) { + hsa_executable_destroy(hsaExecutable_); + } + // Destroy the program handle. 
+ if (hsaProgramHandle_.handle != 0) { + hsa_ext_program_destroy(hsaProgramHandle_); + } + releaseClBinary(); #if defined(WITH_LIGHTNING_COMPILER) - delete metadata_; -#endif // defined(WITH_LIGHTNING_COMPILER) + delete metadata_; +#endif // defined(WITH_LIGHTNING_COMPILER) } -HSAILProgram::HSAILProgram(roc::NullDevice& device) - : Program(device), - binaryElf_(nullptr) -{ - memset(&binOpts_, 0, sizeof(binOpts_)); - binOpts_.struct_size = sizeof(binOpts_); - //binOpts_.elfclass = LP64_SWITCH( ELFCLASS32, ELFCLASS64 ); - //Setting as 32 bit because hsail64 returns an invalid aclTargetInfo - //when aclGetTargetInfo is called - EPR# 377910 - binOpts_.elfclass = ELFCLASS32; - binOpts_.bitness = ELFDATA2LSB; - binOpts_.alloc = &::malloc; - binOpts_.dealloc = &::free; +HSAILProgram::HSAILProgram(roc::NullDevice& device) : Program(device), binaryElf_(nullptr) { + memset(&binOpts_, 0, sizeof(binOpts_)); + binOpts_.struct_size = sizeof(binOpts_); + // binOpts_.elfclass = LP64_SWITCH( ELFCLASS32, ELFCLASS64 ); + // Setting as 32 bit because hsail64 returns an invalid aclTargetInfo + // when aclGetTargetInfo is called - EPR# 377910 + binOpts_.elfclass = ELFCLASS32; + binOpts_.bitness = ELFDATA2LSB; + binOpts_.alloc = &::malloc; + binOpts_.dealloc = &::free; - hsaProgramHandle_.handle = 0; - hsaExecutable_.handle = 0; + hsaProgramHandle_.handle = 0; + hsaExecutable_.handle = 0; - hasGlobalStores_ = false; + hasGlobalStores_ = false; #if defined(WITH_LIGHTNING_COMPILER) - metadata_ = nullptr; -#endif // defined(WITH_LIGHTNING_COMPILER) + metadata_ = nullptr; +#endif // defined(WITH_LIGHTNING_COMPILER) } -bool -HSAILProgram::initClBinary(char *binaryIn, size_t size) -{ - // Save the original binary that isn't owned by ClBinary - clBinary()->saveOrigBinary(binaryIn, size); +bool HSAILProgram::initClBinary(char* binaryIn, size_t size) { + // Save the original binary that isn't owned by ClBinary + clBinary()->saveOrigBinary(binaryIn, size); - char *bin = binaryIn; - size_t sz = 
size; + char* bin = binaryIn; + size_t sz = size; - int encryptCode; + int encryptCode; - char *decryptedBin; - size_t decryptedSize; - if (!clBinary()->decryptElf(binaryIn, size, - &decryptedBin, &decryptedSize, &encryptCode)) { - return false; - } + char* decryptedBin; + size_t decryptedSize; + if (!clBinary()->decryptElf(binaryIn, size, &decryptedBin, &decryptedSize, &encryptCode)) { + return false; + } + if (decryptedBin != nullptr) { + // It is decrypted binary. + bin = decryptedBin; + sz = decryptedSize; + } + + // Both 32-bit and 64-bit are allowed! + if (!amd::isElfMagic(bin)) { + // Invalid binary. if (decryptedBin != nullptr) { - // It is decrypted binary. - bin = decryptedBin; - sz = decryptedSize; + delete[] decryptedBin; } + return false; + } - // Both 32-bit and 64-bit are allowed! - if (!amd::isElfMagic(bin)) { - // Invalid binary. - if (decryptedBin != nullptr) { - delete[]decryptedBin; - } - return false; - } + clBinary()->setFlags(encryptCode); - clBinary()->setFlags(encryptCode); - - return clBinary()->setBinary(bin, sz, (decryptedBin != nullptr)); + return clBinary()->setBinary(bin, sz, (decryptedBin != nullptr)); } -bool -HSAILProgram::initBuild(amd::option::Options *options) -{ - compileOptions_ = options->origOptionStr; +bool HSAILProgram::initBuild(amd::option::Options* options) { + compileOptions_ = options->origOptionStr; - if (!device::Program::initBuild(options)) { - return false; - } + if (!device::Program::initBuild(options)) { + return false; + } - const char* devName = dev().deviceInfo().machineTarget_; - options->setPerBuildInfo( - (devName && (devName[0] != '\0')) ? devName : "gpu", - clBinary()->getEncryptCode(), true); + const char* devName = dev().deviceInfo().machineTarget_; + options->setPerBuildInfo((devName && (devName[0] != '\0')) ? 
devName : "gpu", + clBinary()->getEncryptCode(), true); - // Elf Binary setup - std::string outFileName; + // Elf Binary setup + std::string outFileName; - // true means hsail required - clBinary()->init(options, true); - if (options->isDumpFlagSet(amd::option::DUMP_BIF)) { - outFileName = options->getDumpFileName(".bin"); - } + // true means hsail required + clBinary()->init(options, true); + if (options->isDumpFlagSet(amd::option::DUMP_BIF)) { + outFileName = options->getDumpFileName(".bin"); + } #if defined(WITH_LIGHTNING_COMPILER) - bool useELF64 = true; -#else // !defined(WITH_LIGHTNING_COMPILER) - bool useELF64 = getCompilerOptions()->oVariables->EnableGpuElf64; -#endif // !defined(WITH_LIGHTNING_COMPILER) - if (!clBinary()->setElfOut(useELF64 ? ELFCLASS64 : ELFCLASS32, - (outFileName.size() > - 0) ? outFileName.c_str() : nullptr)) { - LogError("Setup elf out for gpu failed"); - return false; - } - return true; + bool useELF64 = true; +#else // !defined(WITH_LIGHTNING_COMPILER) + bool useELF64 = getCompilerOptions()->oVariables->EnableGpuElf64; +#endif // !defined(WITH_LIGHTNING_COMPILER) + if (!clBinary()->setElfOut(useELF64 ? ELFCLASS64 : ELFCLASS32, + (outFileName.size() > 0) ? outFileName.c_str() : nullptr)) { + LogError("Setup elf out for gpu failed"); + return false; + } + return true; } // ! 
post-compile setup for GPU -bool -HSAILProgram::finiBuild(bool isBuildGood) -{ - clBinary()->resetElfOut(); - clBinary()->resetElfIn(); +bool HSAILProgram::finiBuild(bool isBuildGood) { + clBinary()->resetElfOut(); + clBinary()->resetElfIn(); - if (!isBuildGood) { - // Prevent the encrypted binary form leaking out - clBinary()->setBinary(nullptr, 0); + if (!isBuildGood) { + // Prevent the encrypted binary form leaking out + clBinary()->setBinary(nullptr, 0); + } - } - - return device::Program::finiBuild(isBuildGood); + return device::Program::finiBuild(isBuildGood); } -aclType -HSAILProgram::getCompilationStagesFromBinary(std::vector& completeStages, bool& needOptionsCheck) -{ - acl_error errorCode; - size_t secSize = 0; - completeStages.clear(); - aclType from = ACL_TYPE_DEFAULT; - needOptionsCheck = true; - size_t boolSize = sizeof(bool); - //! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT? - // Checking llvmir in .llvmir section - bool containsHsailText = false; - bool containsBrig = false; - bool containsLlvmirText = (type() == TYPE_COMPILED); - bool containsShaderIsa = (type() == TYPE_EXECUTABLE); - bool containsOpts = !(compileOptions_.empty() && linkOptions_.empty()); -#if !defined(WITH_LIGHTNING_COMPILER) // !defined(WITH_LIGHTNING_COMPILER) - errorCode = g_complibApi._aclQueryInfo(device().compiler(), binaryElf_, RT_CONTAINS_LLVMIR, nullptr, &containsLlvmirText, &boolSize); - if (errorCode != ACL_SUCCESS) { - containsLlvmirText = false; +aclType HSAILProgram::getCompilationStagesFromBinary(std::vector& completeStages, + bool& needOptionsCheck) { + acl_error errorCode; + size_t secSize = 0; + completeStages.clear(); + aclType from = ACL_TYPE_DEFAULT; + needOptionsCheck = true; + size_t boolSize = sizeof(bool); + //! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT? 
+ // Checking llvmir in .llvmir section + bool containsHsailText = false; + bool containsBrig = false; + bool containsLlvmirText = (type() == TYPE_COMPILED); + bool containsShaderIsa = (type() == TYPE_EXECUTABLE); + bool containsOpts = !(compileOptions_.empty() && linkOptions_.empty()); +#if !defined(WITH_LIGHTNING_COMPILER) // !defined(WITH_LIGHTNING_COMPILER) + errorCode = g_complibApi._aclQueryInfo(device().compiler(), binaryElf_, RT_CONTAINS_LLVMIR, + nullptr, &containsLlvmirText, &boolSize); + if (errorCode != ACL_SUCCESS) { + containsLlvmirText = false; + } + // Checking compile & link options in .comment section + errorCode = g_complibApi._aclQueryInfo(device().compiler(), binaryElf_, RT_CONTAINS_OPTIONS, + nullptr, &containsOpts, &boolSize); + if (errorCode != ACL_SUCCESS) { + containsOpts = false; + } + // Checking HSAIL in .cg section + containsHsailText = true; + errorCode = g_complibApi._aclQueryInfo(device().compiler(), binaryElf_, RT_CONTAINS_HSAIL, + nullptr, &containsHsailText, &boolSize); + if (errorCode != ACL_SUCCESS) { + containsHsailText = false; + } + // Checking BRIG sections + containsBrig = true; + errorCode = g_complibApi._aclQueryInfo(device().compiler(), binaryElf_, RT_CONTAINS_BRIG, nullptr, + &containsBrig, &boolSize); + if (errorCode != ACL_SUCCESS) { + containsBrig = false; + } + if (containsBrig) { + completeStages.push_back(from); + from = ACL_TYPE_HSAIL_BINARY; + // Here we should check that CG stage was done. + // Right now there are 2 criterions to check it (besides BRIG itself): + // 1. matadata symbols symOpenclKernel for every kernel. + // 2. HSAIL text in aclCODEGEN section. + // Unfortunately there is no appropriate way in Compiler Lib to check 1. + // because kernel names are unknown here, therefore only 2. 
+ if (containsHsailText) { + completeStages.push_back(from); + from = ACL_TYPE_CG; } - // Checking compile & link options in .comment section - errorCode = g_complibApi._aclQueryInfo(device().compiler(), binaryElf_, RT_CONTAINS_OPTIONS, nullptr, &containsOpts, &boolSize); - if (errorCode != ACL_SUCCESS) { - containsOpts = false; - } - // Checking HSAIL in .cg section - containsHsailText = true; - errorCode = g_complibApi._aclQueryInfo(device().compiler(), binaryElf_, RT_CONTAINS_HSAIL, nullptr, &containsHsailText, &boolSize); - if (errorCode != ACL_SUCCESS) { - containsHsailText = false; - } - // Checking BRIG sections - containsBrig = true; - errorCode = g_complibApi._aclQueryInfo(device().compiler(), binaryElf_, RT_CONTAINS_BRIG, nullptr, &containsBrig, &boolSize); - if (errorCode != ACL_SUCCESS) { - containsBrig = false; - } - if (containsBrig) { - completeStages.push_back(from); - from = ACL_TYPE_HSAIL_BINARY; - // Here we should check that CG stage was done. - // Right now there are 2 criterions to check it (besides BRIG itself): - // 1. matadata symbols symOpenclKernel for every kernel. - // 2. HSAIL text in aclCODEGEN section. - // Unfortunately there is no appropriate way in Compiler Lib to check 1. - // because kernel names are unknown here, therefore only 2. 
- if (containsHsailText) { - completeStages.push_back(from); - from = ACL_TYPE_CG; - } - } - else if (containsHsailText) { - completeStages.push_back(from); - from = ACL_TYPE_HSAIL_TEXT; - } - errorCode = g_complibApi._aclQueryInfo(device().compiler(), binaryElf_, RT_CONTAINS_ISA, nullptr, &containsShaderIsa, &boolSize); - if (errorCode != ACL_SUCCESS) { - containsShaderIsa = false; - } -#endif // !defined(WITH_LIGHTNING_COMPILER) + } else if (containsHsailText) { + completeStages.push_back(from); + from = ACL_TYPE_HSAIL_TEXT; + } + errorCode = g_complibApi._aclQueryInfo(device().compiler(), binaryElf_, RT_CONTAINS_ISA, nullptr, + &containsShaderIsa, &boolSize); + if (errorCode != ACL_SUCCESS) { + containsShaderIsa = false; + } +#endif // !defined(WITH_LIGHTNING_COMPILER) - if (containsLlvmirText && containsOpts) { - completeStages.push_back(from); - from = ACL_TYPE_LLVMIR_BINARY; - } - if (containsShaderIsa) { - completeStages.push_back(from); - from = ACL_TYPE_ISA; - } - std::string sCurOptions = compileOptions_ + linkOptions_; - amd::option::Options curOptions; - if (!amd::option::parseAllOptions(sCurOptions, curOptions)) { - buildLog_ += curOptions.optionsLog(); - LogError("Parsing compile options failed."); - return ACL_TYPE_DEFAULT; - } - switch (from) { + if (containsLlvmirText && containsOpts) { + completeStages.push_back(from); + from = ACL_TYPE_LLVMIR_BINARY; + } + if (containsShaderIsa) { + completeStages.push_back(from); + from = ACL_TYPE_ISA; + } + std::string sCurOptions = compileOptions_ + linkOptions_; + amd::option::Options curOptions; + if (!amd::option::parseAllOptions(sCurOptions, curOptions)) { + buildLog_ += curOptions.optionsLog(); + LogError("Parsing compile options failed."); + return ACL_TYPE_DEFAULT; + } + switch (from) { // compile from HSAIL text, no matter prev. 
stages and options case ACL_TYPE_HSAIL_TEXT: - needOptionsCheck = false; - break; + needOptionsCheck = false; + break; case ACL_TYPE_HSAIL_BINARY: case ACL_TYPE_CG: - // do not check options, if LLVMIR is absent or might be absent or options are absent - if (curOptions.oVariables->BinLLVMIR || !containsLlvmirText || !containsOpts) { - needOptionsCheck = false; - } - break; + // do not check options, if LLVMIR is absent or might be absent or options are absent + if (curOptions.oVariables->BinLLVMIR || !containsLlvmirText || !containsOpts) { + needOptionsCheck = false; + } + break; case ACL_TYPE_ISA: - // do not check options, if LLVMIR is absent or might be absent or options are absent - if (curOptions.oVariables->BinLLVMIR || !containsLlvmirText || !containsOpts) { - needOptionsCheck = false; - } + // do not check options, if LLVMIR is absent or might be absent or options are absent + if (curOptions.oVariables->BinLLVMIR || !containsLlvmirText || !containsOpts) { + needOptionsCheck = false; + } #if !defined(WITH_LIGHTNING_COMPILER) - if (containsBrig && containsHsailText && curOptions.oVariables->BinHSAIL) { - needOptionsCheck = false; - // recompile from prev. stage, if BRIG || HSAIL are absent - } else { - from = completeStages.back(); - completeStages.pop_back(); - needOptionsCheck = true; - } + if (containsBrig && containsHsailText && curOptions.oVariables->BinHSAIL) { + needOptionsCheck = false; + // recompile from prev. 
stage, if BRIG || HSAIL are absent + } else { + from = completeStages.back(); + completeStages.pop_back(); + needOptionsCheck = true; + } #endif - break; - // recompilation might be needed + break; + // recompilation might be needed case ACL_TYPE_LLVMIR_BINARY: case ACL_TYPE_DEFAULT: default: + break; + } + return from; +} + +aclType HSAILProgram::getNextCompilationStageFromBinary(amd::option::Options* options) { + aclType continueCompileFrom = ACL_TYPE_DEFAULT; + binary_t binary = this->binary(); + // If the binary already exists + if ((binary.first != nullptr) && (binary.second > 0)) { +#if defined(WITH_LIGHTNING_COMPILER) + void* mem = (void*)binary.first; +#else // !defined(WITH_LIGHTNING_COMPILER) + void* mem = const_cast(binary.first); + acl_error errorCode; + binaryElf_ = g_complibApi._aclReadFromMem(mem, binary.second, &errorCode); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error while BRIG Codegen phase: aclReadFromMem failure \n"; + return continueCompileFrom; + } +#endif // !defined(WITH_LIGHTNING_COMPILER) + + // save the current options + std::string sCurCompileOptions = compileOptions_; + std::string sCurLinkOptions = linkOptions_; + std::string sCurOptions = compileOptions_ + linkOptions_; + + // Saving binary in the interface class, + // which also load compile & link options from binary + setBinary(static_cast(mem), binary.second); + + // Calculate the next stage to compile from, based on sections in binaryElf_; + // No any validity checks here + std::vector completeStages; + bool needOptionsCheck = true; + continueCompileFrom = getCompilationStagesFromBinary(completeStages, needOptionsCheck); + if (!options || !needOptionsCheck) { + return continueCompileFrom; + } + bool recompile = false; + //! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT? 
+ switch (continueCompileFrom) { + case ACL_TYPE_HSAIL_BINARY: + case ACL_TYPE_CG: + case ACL_TYPE_ISA: { + // Compare options loaded from binary with current ones, recompile if differ; + // If compile options are absent in binary, do not compare and recompile + if (compileOptions_.empty()) break; + +#if defined(WITH_LIGHTNING_COMPILER) + std::string sBinOptions = compileOptions_ + linkOptions_; +#else // !defined(WITH_LIGHTNING_COMPILER) + const oclBIFSymbolStruct* symbol = findBIF30SymStruct(symOpenclCompilerOptions); + assert(symbol && "symbol not found"); + std::string symName = + std::string(symbol->str[bif::PRE]) + std::string(symbol->str[bif::POST]); + size_t symSize = 0; + + const void* opts = g_complibApi._aclExtractSymbol(device().compiler(), binaryElf_, &symSize, + aclCOMMENT, symName.c_str(), &errorCode); + if (errorCode != ACL_SUCCESS) { + recompile = true; + break; + } + std::string sBinOptions = std::string((char*)opts, symSize); +#endif // !defined(WITH_LIGHTNING_COMPILER) + + compileOptions_ = sCurCompileOptions; + linkOptions_ = sCurLinkOptions; + + amd::option::Options curOptions, binOptions; + if (!amd::option::parseAllOptions(sBinOptions, binOptions)) { + buildLog_ += binOptions.optionsLog(); + LogError("Parsing compile options from binary failed."); + return ACL_TYPE_DEFAULT; + } + if (!amd::option::parseAllOptions(sCurOptions, curOptions)) { + buildLog_ += curOptions.optionsLog(); + LogError("Parsing compile options failed."); + return ACL_TYPE_DEFAULT; + } + if (!curOptions.equals(binOptions)) { + recompile = true; + } + break; + } + default: break; } - return from; + if (recompile) { + while (!completeStages.empty()) { + continueCompileFrom = completeStages.back(); + if (continueCompileFrom == ACL_TYPE_LLVMIR_BINARY || + continueCompileFrom == ACL_TYPE_DEFAULT) { + break; + } + completeStages.pop_back(); + } + } + } + return continueCompileFrom; } -aclType -HSAILProgram::getNextCompilationStageFromBinary(amd::option::Options* options) -{ - 
aclType continueCompileFrom = ACL_TYPE_DEFAULT; - binary_t binary = this->binary(); - // If the binary already exists - if ((binary.first != nullptr) && (binary.second > 0)) { -#if defined(WITH_LIGHTNING_COMPILER) - void *mem = (void *) binary.first; -#else // !defined(WITH_LIGHTNING_COMPILER) - void *mem = const_cast(binary.first); - acl_error errorCode; - binaryElf_ = g_complibApi._aclReadFromMem(mem, binary.second, &errorCode); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error while BRIG Codegen phase: aclReadFromMem failure \n" ; - return continueCompileFrom; - } -#endif // !defined(WITH_LIGHTNING_COMPILER) +static hsa_status_t allocFunc(size_t size, hsa_callback_data_t data, void** address) { + if (!address || 0 == size) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } - // save the current options - std::string sCurCompileOptions = compileOptions_; - std::string sCurLinkOptions = linkOptions_; - std::string sCurOptions = compileOptions_ + linkOptions_; + *address = (char*)malloc(size); + if (!*address) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } - // Saving binary in the interface class, - // which also load compile & link options from binary - setBinary(static_cast(mem), binary.second); - - // Calculate the next stage to compile from, based on sections in binaryElf_; - // No any validity checks here - std::vector completeStages; - bool needOptionsCheck = true; - continueCompileFrom = getCompilationStagesFromBinary(completeStages, needOptionsCheck); - if (!options || !needOptionsCheck) { - return continueCompileFrom; - } - bool recompile = false; - //! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT? 
- switch (continueCompileFrom) { - case ACL_TYPE_HSAIL_BINARY: - case ACL_TYPE_CG: - case ACL_TYPE_ISA: { - // Compare options loaded from binary with current ones, recompile if differ; - // If compile options are absent in binary, do not compare and recompile - if (compileOptions_.empty()) - break; - -#if defined(WITH_LIGHTNING_COMPILER) - std::string sBinOptions = compileOptions_ + linkOptions_; -#else // !defined(WITH_LIGHTNING_COMPILER) - const oclBIFSymbolStruct* symbol = findBIF30SymStruct(symOpenclCompilerOptions); - assert(symbol && "symbol not found"); - std::string symName = std::string(symbol->str[bif::PRE]) + std::string(symbol->str[bif::POST]); - size_t symSize = 0; - - const void *opts = g_complibApi._aclExtractSymbol(device().compiler(), - binaryElf_, &symSize, aclCOMMENT, symName.c_str(), &errorCode); - if (errorCode != ACL_SUCCESS) { - recompile = true; - break; - } - std::string sBinOptions = std::string((char*)opts, symSize); -#endif // !defined(WITH_LIGHTNING_COMPILER) - - compileOptions_ = sCurCompileOptions; - linkOptions_ = sCurLinkOptions; - - amd::option::Options curOptions, binOptions; - if (!amd::option::parseAllOptions(sBinOptions, binOptions)) { - buildLog_ += binOptions.optionsLog(); - LogError("Parsing compile options from binary failed."); - return ACL_TYPE_DEFAULT; - } - if (!amd::option::parseAllOptions(sCurOptions, curOptions)) { - buildLog_ += curOptions.optionsLog(); - LogError("Parsing compile options failed."); - return ACL_TYPE_DEFAULT; - } - if (!curOptions.equals(binOptions)) { - recompile = true; - } - break; - } - default: - break; - } - if (recompile) { - while (!completeStages.empty()) { - continueCompileFrom = completeStages.back(); - if (continueCompileFrom == ACL_TYPE_LLVMIR_BINARY || - continueCompileFrom == ACL_TYPE_DEFAULT) { - break; - } - completeStages.pop_back(); - } - } - } - return continueCompileFrom; + return HSA_STATUS_SUCCESS; } -static hsa_status_t -allocFunc(size_t size, hsa_callback_data_t data, void 
**address) -{ - if (!address || 0 == size) { - return HSA_STATUS_ERROR_INVALID_ARGUMENT; - } - - *address = (char*) malloc(size); - if (!*address) { - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - } - - return HSA_STATUS_SUCCESS; -} - -bool -HSAILProgram::saveBinaryAndSetType(type_t type, void* rawBinary, size_t size) -{ - //Write binary to memory +bool HSAILProgram::saveBinaryAndSetType(type_t type, void* rawBinary, size_t size) { +// Write binary to memory #if defined(WITH_LIGHTNING_COMPILER) - if (type == TYPE_EXECUTABLE) { // handle code object binary - assert(rawBinary != nullptr && size != 0 && "must pass in the binary"); + if (type == TYPE_EXECUTABLE) { // handle code object binary + assert(rawBinary != nullptr && size != 0 && "must pass in the binary"); + } else { // handle LLVM binary + if (llvmBinary_.empty()) { + buildLog_ += "ERROR: Tried to save emtpy LLVM binary \n"; + return false; } - else { // handle LLVM binary - if (llvmBinary_.empty()) { - buildLog_ += "ERROR: Tried to save emtpy LLVM binary \n"; - return false; - } - rawBinary = (void*) llvmBinary_.data(); - size = llvmBinary_.size(); - } -#else // !defined(WITH_LIGHTNING_COMPILER) - if (g_complibApi._aclWriteToMem(binaryElf_, &rawBinary, &size) - != ACL_SUCCESS) { - buildLog_ += "Failed to write binary to memory \n"; - return false; - } -#endif // !defined(WITH_LIGHTNING_COMPILER) - clBinary()->saveBIFBinary((char*)rawBinary, size); - //Set the type of binary - setType(type); + rawBinary = (void*)llvmBinary_.data(); + size = llvmBinary_.size(); + } +#else // !defined(WITH_LIGHTNING_COMPILER) + if (g_complibApi._aclWriteToMem(binaryElf_, &rawBinary, &size) != ACL_SUCCESS) { + buildLog_ += "Failed to write binary to memory \n"; + return false; + } +#endif // !defined(WITH_LIGHTNING_COMPILER) + clBinary()->saveBIFBinary((char*)rawBinary, size); + // Set the type of binary + setType(type); - //Free memory containing rawBinary +// Free memory containing rawBinary #if !defined(WITH_LIGHTNING_COMPILER) 
- binaryElf_->binOpts.dealloc(rawBinary); + binaryElf_->binOpts.dealloc(rawBinary); #endif - return true; + return true; } #if defined(WITH_LIGHTNING_COMPILER) -bool -HSAILProgram::linkImpl_LC( - const std::vector &inputPrograms, - amd::option::Options *options, - bool createLibrary) -{ - using namespace amd::opencl_driver; - std::unique_ptr C(newCompilerInstance()); +bool HSAILProgram::linkImpl_LC(const std::vector& inputPrograms, + amd::option::Options* options, bool createLibrary) { + using namespace amd::opencl_driver; + std::unique_ptr C(newCompilerInstance()); - std::vector inputs; - for (auto program : (const std::vector&)inputPrograms) { - if (program->llvmBinary_.empty()) { - if (program->clBinary() == nullptr) { - buildLog_ += "Internal error: Input program not compiled!\n"; - return false; - } - - // We are using CL binary directly. - // Setup elfIn() and try to load llvmIR from binary - // This elfIn() will be released at the end of build by finiBuild(). - if (!program->clBinary()->setElfIn(ELFCLASS64)) { - buildLog_ += "Internal error: Setting input OCL binary failed!\n"; - return false; - } - if (!program->clBinary()->loadLlvmBinary(program->llvmBinary_, - program->elfSectionType_)) { - buildLog_ += "Internal error: Failed loading compiled binary!\n"; - return false; - } - } - - if (program->elfSectionType_ != amd::OclElf::LLVMIR) { - buildLog_ += "Error: Input binary format is not supported\n."; - return false; - } - - Data* input = C->NewBufferReference(DT_LLVM_BC, - (const char*) program->llvmBinary_.data(), - program->llvmBinary_.size()); - - if (!input) { - buildLog_ += "Internal error: Failed to open the compiled programs.\n"; - return false; - } - - // release elfIn() for the program - program->clBinary()->resetElfIn(); - - inputs.push_back(input); - } - - // open the linked output - Buffer* output = C->NewBuffer(DT_LLVM_BC); - - if (!output) { - buildLog_ += "Error: Failed to open the linked program.\n"; + std::vector inputs; + for (auto 
program : (const std::vector&)inputPrograms) { + if (program->llvmBinary_.empty()) { + if (program->clBinary() == nullptr) { + buildLog_ += "Internal error: Input program not compiled!\n"; return false; - } + } - std::vector linkOptions; - - // NOTE: The linkOptions parameter is also used to identy cached code object. This parameter - // should not contain any dyanamically generated filename. - bool ret = dev().cacheCompilation()->linkLLVMBitcode(C.get(), inputs, output, linkOptions, buildLog_); - buildLog_ += C->Output(); - if (!ret) { - buildLog_ += "Error: Linking bitcode failed: linking source & IR libraries.\n"; + // We are using CL binary directly. + // Setup elfIn() and try to load llvmIR from binary + // This elfIn() will be released at the end of build by finiBuild(). + if (!program->clBinary()->setElfIn(ELFCLASS64)) { + buildLog_ += "Internal error: Setting input OCL binary failed!\n"; return false; - } - - llvmBinary_.assign(output->Buf().data(), output->Size()); - elfSectionType_ = amd::OclElf::LLVMIR; - - if (clBinary()->saveLLVMIR()) { - clBinary()->elfOut()->addSection( - amd::OclElf::LLVMIR, llvmBinary_.data(), llvmBinary_.size(), false); - // store the original link options - clBinary()->storeLinkOptions(linkOptions_); - // store the original compile options - clBinary()->storeCompileOptions(compileOptions_); - } - - // skip the rest if we are building an opencl library - if (createLibrary) { - setType(TYPE_LIBRARY); - if (!createBinary(options)) { - buildLog_ += "Internal error: creating OpenCL binary failed\n"; - return false; - } - return true; - } - - return linkImpl_LC(options); -} -#endif // defined(WITH_LIGHTNING_COMPILER) - -bool -HSAILProgram::linkImpl( - const std::vector &inputPrograms, - amd::option::Options *options, - bool createLibrary) -{ -#if defined(WITH_LIGHTNING_COMPILER) - return linkImpl_LC(inputPrograms, options, createLibrary); -#else // !defined(WITH_LIGHTNING_COMPILER) - std::vector::const_iterator it - = 
inputPrograms.begin(); - std::vector::const_iterator itEnd - = inputPrograms.end(); - acl_error errorCode; - - // For each program we need to extract the LLVMIR and create - // aclBinary for each - std::vector binaries_to_link; - - for (size_t i = 0; it != itEnd; ++it, ++i) { - HSAILProgram *program = (HSAILProgram *)*it; - // Check if the program was created with clCreateProgramWIthBinary - binary_t binary = program->binary(); - if ((binary.first != nullptr) && (binary.second > 0)) { - // Binary already exists -- we can also check if there is no - // opencl source code - // Need to check if LLVMIR exists in the binary - // If LLVMIR does not exist then is it valid - // We need to pull out all the compiled kernels - // We cannot do this at present because we need at least - // Hsail text to pull the kernels oout - void *mem = const_cast(binary.first); - binaryElf_ = g_complibApi._aclReadFromMem(mem, - binary.second, - &errorCode); - - if (errorCode != ACL_SUCCESS) { - LogWarning("Error while linking : Could not read from raw binary"); - return false; - } - } - // At this stage each HSAILProgram contains a valid binary_elf - // Check if LLVMIR is in the binary - size_t boolSize = sizeof(bool); - bool containsLLLVMIR = false; - errorCode = g_complibApi._aclQueryInfo(device().compiler(), binaryElf_, - RT_CONTAINS_LLVMIR, nullptr, &containsLLLVMIR, &boolSize); - if (errorCode != ACL_SUCCESS || !containsLLLVMIR) { - buildLog_ +="Error while linking : Invalid binary (Missing LLVMIR section)"; - return false; - } - // Create a new aclBinary for each LLVMIR and save it in a list - aclBIFVersion ver = g_complibApi._aclBinaryVersion(binaryElf_); - aclBinary *bin = g_complibApi._aclCreateFromBinary(binaryElf_, ver); - binaries_to_link.push_back(bin); - } - - // At this stage each HSAILProgram in the list has an aclBinary initialized - // and contains LLVMIR - // We can now go ahead and link them. 
- if (binaries_to_link.size() > 1) { - errorCode = g_complibApi._aclLink(device().compiler(), - binaries_to_link[0], - binaries_to_link.size() - 1, - &binaries_to_link[1], - ACL_TYPE_LLVMIR_BINARY, - "-create-library", - nullptr); - } - else { - errorCode = g_complibApi._aclLink(device().compiler(), - binaries_to_link[0], - 0, - nullptr, - ACL_TYPE_LLVMIR_BINARY, - "-create-library", - nullptr); - } - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Failed to link programs"; + } + if (!program->clBinary()->loadLlvmBinary(program->llvmBinary_, program->elfSectionType_)) { + buildLog_ += "Internal error: Failed loading compiled binary!\n"; return false; - } - // Store the newly linked aclBinary for this program. - binaryElf_ = binaries_to_link[0]; - // Free all the other aclBinaries - for (size_t i = 1; i < binaries_to_link.size(); i++) { - g_complibApi._aclBinaryFini(binaries_to_link[i]); - } - if (createLibrary) { - saveBinaryAndSetType(TYPE_LIBRARY); - return true; + } } - // Now call linkImpl with the new options - return linkImpl(options); -#endif // !defined(WITH_LIGHTNING_COMPILER) -} - -static inline const char* -hsa_strerror(hsa_status_t status) -{ - const char* str = nullptr; - if (hsa_status_string(status, &str) == HSA_STATUS_SUCCESS) { - return str; + if (program->elfSectionType_ != amd::OclElf::LLVMIR) { + buildLog_ += "Error: Input binary format is not supported\n."; + return false; } - return "Unknown error"; -} -#if defined(WITH_LIGHTNING_COMPILER) -bool -HSAILProgram::linkImpl_LC(amd::option::Options *options) -{ - using namespace amd::opencl_driver; - std::unique_ptr C(newCompilerInstance()); - - // call LinkLLVMBitcode - std::vector inputs; - - // open the input IR source - Data* input = C->NewBufferReference( - DT_LLVM_BC, llvmBinary_.data(), llvmBinary_.size()); + Data* input = C->NewBufferReference(DT_LLVM_BC, (const char*)program->llvmBinary_.data(), + program->llvmBinary_.size()); if (!input) { - buildLog_ += "Error: Failed to open the compiled 
program.\n"; - return false; + buildLog_ += "Internal error: Failed to open the compiled programs.\n"; + return false; } - inputs.push_back(input); //< must be the first input + // release elfIn() for the program + program->clBinary()->resetElfIn(); - // open the bitcode libraries - Data* opencl_bc = C->NewBufferReference(DT_LLVM_BC, - (const char*) opencl_amdgcn, opencl_amdgcn_size); - Data* ocml_bc = C->NewBufferReference(DT_LLVM_BC, - (const char*) ocml_amdgcn, ocml_amdgcn_size); - Data* ockl_bc = C->NewBufferReference(DT_LLVM_BC, - (const char*) ockl_amdgcn, ockl_amdgcn_size); - Data* irif_bc = C->NewBufferReference(DT_LLVM_BC, - (const char*) irif_amdgcn, irif_amdgcn_size); + inputs.push_back(input); + } - if (!opencl_bc || !ocml_bc || !ockl_bc || !irif_bc) { - buildLog_ += "Error: Failed to open the bitcode library.\n"; - return false; + // open the linked output + Buffer* output = C->NewBuffer(DT_LLVM_BC); + + if (!output) { + buildLog_ += "Error: Failed to open the linked program.\n"; + return false; + } + + std::vector linkOptions; + + // NOTE: The linkOptions parameter is also used to identy cached code object. This parameter + // should not contain any dyanamically generated filename. 
+ bool ret = + dev().cacheCompilation()->linkLLVMBitcode(C.get(), inputs, output, linkOptions, buildLog_); + buildLog_ += C->Output(); + if (!ret) { + buildLog_ += "Error: Linking bitcode failed: linking source & IR libraries.\n"; + return false; + } + + llvmBinary_.assign(output->Buf().data(), output->Size()); + elfSectionType_ = amd::OclElf::LLVMIR; + + if (clBinary()->saveLLVMIR()) { + clBinary()->elfOut()->addSection(amd::OclElf::LLVMIR, llvmBinary_.data(), llvmBinary_.size(), + false); + // store the original link options + clBinary()->storeLinkOptions(linkOptions_); + // store the original compile options + clBinary()->storeCompileOptions(compileOptions_); + } + + // skip the rest if we are building an opencl library + if (createLibrary) { + setType(TYPE_LIBRARY); + if (!createBinary(options)) { + buildLog_ += "Internal error: creating OpenCL binary failed\n"; + return false; } - - inputs.push_back(opencl_bc); // depends on oclm & ockl - inputs.push_back(ockl_bc); // depends on irif - inputs.push_back(ocml_bc); // depends on irif - inputs.push_back(irif_bc); - - // open the control functions - auto isa_version = get_oclc_isa_version(dev().deviceInfo().gfxipVersion_); - if (!isa_version.first) { - buildLog_ += "Error: Linking for this device is not supported\n"; - return false; - } - - Data* isa_version_bc = C->NewBufferReference(DT_LLVM_BC, - (const char*) isa_version.first, isa_version.second); - - if (!isa_version_bc) { - buildLog_ += "Error: Failed to open the control functions.\n"; - return false; - } - - inputs.push_back(isa_version_bc); - - auto correctly_rounded_sqrt = get_oclc_correctly_rounded_sqrt( - options->oVariables->FP32RoundDivideSqrt); - Data* correctly_rounded_sqrt_bc = C->NewBufferReference(DT_LLVM_BC, - correctly_rounded_sqrt.first, correctly_rounded_sqrt.second); - - auto daz_opt = get_oclc_daz_opt(dev().deviceInfo().gfxipVersion_ < 900 - || options->oVariables->DenormsAreZero); - Data* daz_opt_bc = C->NewBufferReference(DT_LLVM_BC, - 
daz_opt.first, daz_opt.second); - - auto finite_only = get_oclc_finite_only(options->oVariables->FiniteMathOnly - || options->oVariables->FastRelaxedMath); - Data* finite_only_bc = C->NewBufferReference(DT_LLVM_BC, - finite_only.first, finite_only.second); - - auto unsafe_math = get_oclc_unsafe_math(options->oVariables->UnsafeMathOpt - || options->oVariables->FastRelaxedMath); - Data* unsafe_math_bc = C->NewBufferReference(DT_LLVM_BC, - unsafe_math.first, unsafe_math.second); - - if (!correctly_rounded_sqrt_bc || !daz_opt_bc - || !finite_only_bc || !unsafe_math_bc) { - buildLog_ += "Error: Failed to open the control functions.\n"; - return false; - } - - inputs.push_back(correctly_rounded_sqrt_bc); - inputs.push_back(daz_opt_bc); - inputs.push_back(finite_only_bc); - inputs.push_back(unsafe_math_bc); - - // open the linked output - std::vector linkOptions; - Buffer* linked_bc = C->NewBuffer(DT_LLVM_BC); - - if (!linked_bc) { - buildLog_ += "Error: Failed to open the linked program.\n"; - return false; - } - - // NOTE: The linkOptions parameter is also used to identy cached code object. This parameter - // should not contain any dyanamically generated filename. 
- bool ret = dev().cacheCompilation()->linkLLVMBitcode(C.get(), inputs, linked_bc, linkOptions, buildLog_); - buildLog_ += C->Output(); - if (!ret) { - buildLog_ += "Error: Linking bitcode failed: linking source & IR libraries.\n"; - return false; - } - - if (options->isDumpFlagSet(amd::option::DUMP_BC_LINKED)) { - std::ofstream f(options->getDumpFileName("_linked.bc").c_str(), std::ios::trunc); - if(f.is_open()) { - f.write(linked_bc->Buf().data(), linked_bc->Size()); - } else { - buildLog_ += - "Warning: opening the file to dump the linked IR failed.\n"; - } - } - - inputs.clear(); - inputs.push_back(linked_bc); - - Buffer* out_exec = C->NewBuffer(DT_EXECUTABLE); - if (!out_exec) { - buildLog_ += "Error: Failed to create the linked executable.\n"; - return false; - } - - std::string codegenOptions(options->llvmOptions); - - // Set the machine target - codegenOptions.append(" -mcpu="); - codegenOptions.append(dev().deviceInfo().machineTarget_); - - // Set the -O# - std::ostringstream optLevel; - optLevel << "-O" << options->oVariables->OptLevel; - codegenOptions.append(" ").append(optLevel.str()); - - // Pass clang options - std::ostringstream ostrstr; - std::copy(options->clangOptions.begin(), options->clangOptions.end(), - std::ostream_iterator(ostrstr, " ")); - codegenOptions.append(" ").append(ostrstr.str()); - - // Set whole program mode - codegenOptions.append(" -mllvm -amdgpu-internalize-symbols -mllvm -amdgpu-early-inline-all"); - - // Tokenize the options string into a vector of strings - std::istringstream strstr(codegenOptions); - std::istream_iterator sit(strstr), end; - std::vector params(sit, end); - - // NOTE: The params is also used to identy cached code object. This paramete - // should not contain any dyanamically generated filename. 
- ret = dev().cacheCompilation()->compileAndLinkExecutable(C.get(), inputs, out_exec, params, buildLog_); - buildLog_ += C->Output(); - if (!ret) { - buildLog_ += "Error: Creating the executable failed: Compiling LLVM IRs to executable\n"; - return false; - } - - if (options->isDumpFlagSet(amd::option::DUMP_O)) { - std::ofstream f(options->getDumpFileName(".so").c_str(), std::ios::trunc); - if(f.is_open()) { - f.write(out_exec->Buf().data(), out_exec->Size()); - } else { - buildLog_ += - "Warning: opening the file to dump the code object failed.\n"; - } - } - - if (options->isDumpFlagSet(amd::option::DUMP_ISA)) { - std::string name = options->getDumpFileName(".s"); - File *dump = C->NewFile(DT_INTERNAL, name); - if (!C->DumpExecutableAsText(out_exec, dump)) { - buildLog_ += "Warning: failed to dump code object.\n"; - } - } - - return setKernels_LC( options, out_exec->Buf().data(), out_exec->Size() ); -} - -bool -HSAILProgram::setKernels_LC(amd::option::Options *options, void* binary, size_t binSize) -{ - hsa_agent_t agent = dev().getBackendDevice(); - hsa_status_t status; - - status = hsa_executable_create_alt( - HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, - nullptr, &hsaExecutable_ ); - if (status != HSA_STATUS_SUCCESS) { - buildLog_ += "Error: Executable for AMD HSA Code Object isn't created: "; - buildLog_ += hsa_strerror(status); - buildLog_ += "\n"; - return false; - } - - // Load the code object. 
- hsa_code_object_reader_t codeObjectReader; - status = hsa_code_object_reader_create_from_memory( - binary, binSize, &codeObjectReader); - if (status != HSA_STATUS_SUCCESS) { - buildLog_ += "Error: AMD HSA Code Object Reader create failed: "; - buildLog_ += hsa_strerror(status); - buildLog_ += "\n"; - return false; - } - - status = hsa_executable_load_agent_code_object( - hsaExecutable_, agent, codeObjectReader, nullptr, nullptr ); - if (status != HSA_STATUS_SUCCESS) { - buildLog_ += "Error: AMD HSA Code Object loading failed: "; - buildLog_ += hsa_strerror(status); - buildLog_ += "\n"; - return false; - } - - hsa_code_object_reader_destroy(codeObjectReader); - - // Freeze the executable. - status = hsa_executable_freeze( hsaExecutable_, nullptr ); - if (status != HSA_STATUS_SUCCESS) { - buildLog_ += "Error: Freezing the executable failed: "; - buildLog_ += hsa_strerror(status); - buildLog_ += "\n"; - return false; - } - - size_t progvarsTotalSize = 0; - size_t dynamicSize = 0; - size_t progvarsWriteSize = 0; - - // Begin the Elf image from memory - Elf* e = elf_memory((char*) binary, binSize, nullptr); - if (elf_kind(e) != ELF_K_ELF) { - buildLog_ += "Error while reading the ELF program binary\n"; - return false; - } - - size_t numpHdrs; - if (elf_getphdrnum(e, &numpHdrs) != 0) { - buildLog_ += "Error while reading the ELF program binary\n"; - return false; - } - - for (size_t i = 0; i < numpHdrs; ++i) { - GElf_Phdr pHdr; - if (gelf_getphdr(e, i, &pHdr) != &pHdr) { - continue; - } - // Look for the runtime metadata note - if (pHdr.p_type == PT_NOTE && pHdr.p_align >= sizeof(int)) { - // Iterate over the notes in this segment - address ptr = (address) binary + pHdr.p_offset; - address segmentEnd = ptr + pHdr.p_filesz; - - while (ptr < segmentEnd) { - Elf_Note* note = (Elf_Note*) ptr; - address name = (address) ¬e[1]; - address desc = name + amd::alignUp(note->n_namesz, sizeof(int)); - - if (note->n_type == 7 || note->n_type == 8) { - buildLog_ += "Error: object 
code with old metadata is not " \ - "supported\n"; - return false; - } - else if (note->n_type == AMDGPU::ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_METADATA - && note->n_namesz == sizeof AMDGPU::ElfNote::NoteName - && !memcmp(name, AMDGPU::ElfNote::NoteName, note->n_namesz)) { - std::string metadataStr((const char *) desc, (size_t) note->n_descsz); - metadata_ = new CodeObjectMD(); - if (CodeObjectMD::fromYamlString(metadataStr, *metadata_)) { - buildLog_ += "Error: failed to process metadata\n"; - return false; - } - // We've found and loaded the runtime metadata, exit the - // note record loop now. - break; - } - ptr += sizeof(*note) - + amd::alignUp(note->n_namesz, sizeof(int)) - + amd::alignUp(note->n_descsz, sizeof(int)); - } - } - // Accumulate the size of R & !X loadable segments - else if (pHdr.p_type == PT_LOAD && !(pHdr.p_flags & PF_X)) { - if (pHdr.p_flags & PF_R) { - progvarsTotalSize += pHdr.p_memsz; - } - if (pHdr.p_flags & PF_W) { - progvarsWriteSize += pHdr.p_memsz; - } - } - else if (pHdr.p_type == PT_DYNAMIC) { - dynamicSize += pHdr.p_memsz; - } - } - - elf_end(e); - - if (!metadata_) { - buildLog_ += "Error: runtime metadata section not present in " \ - "ELF program binary\n"; - return false; - } - - if (progvarsWriteSize != dynamicSize) { - hasGlobalStores_ = true; - } - progvarsTotalSize -= dynamicSize; - setGlobalVariableTotalSize(progvarsTotalSize); - - saveBinaryAndSetType(TYPE_EXECUTABLE, binary, binSize); - - // Get the list of kernels - std::vector kernelNameList; - status = hsa_executable_iterate_agent_symbols( hsaExecutable_, - agent, GetKernelNamesCallback, (void *) &kernelNameList ); - if (status != HSA_STATUS_SUCCESS) { - buildLog_ += "Error: Failed to get kernel names: "; - buildLog_ += hsa_strerror(status); - buildLog_ += "\n"; - return false; - } - - for (auto &kernelName : kernelNameList) { - hsa_executable_symbol_t kernelSymbol; - - status = hsa_executable_get_symbol_by_name( - hsaExecutable_, kernelName.c_str(), &agent, 
&kernelSymbol); - if (status != HSA_STATUS_SUCCESS) { - buildLog_ += "Error: Failed to get the symbol: "; - buildLog_ += hsa_strerror(status); - buildLog_ += "\n"; - return false; - } - - uint64_t kernelCodeHandle; - status = hsa_executable_symbol_get_info( - kernelSymbol, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, - &kernelCodeHandle); - if (status != HSA_STATUS_SUCCESS) { - buildLog_ += "Error: Failed to get the kernel code: "; - buildLog_ += hsa_strerror(status); - buildLog_ += "\n"; - return false; - } - - uint32_t workgroupGroupSegmentByteSize; - status = hsa_executable_symbol_get_info( - kernelSymbol, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, - &workgroupGroupSegmentByteSize); - if (status != HSA_STATUS_SUCCESS) { - buildLog_ += "Error: Failed to get group segment size info: "; - buildLog_ += hsa_strerror(status); - buildLog_ += "\n"; - return false; - } - - uint32_t workitemPrivateSegmentByteSize; - status = hsa_executable_symbol_get_info( - kernelSymbol, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, - &workitemPrivateSegmentByteSize); - if (status != HSA_STATUS_SUCCESS) { - buildLog_ += "Error: Failed to get private segment size info: "; - buildLog_ += hsa_strerror(status); - buildLog_ += "\n"; - return false; - } - - uint32_t kernargSegmentByteSize; - status = hsa_executable_symbol_get_info( - kernelSymbol, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, - &kernargSegmentByteSize); - if (status != HSA_STATUS_SUCCESS) { - buildLog_ += "Error: Failed to get kernarg segment size info: "; - buildLog_ += hsa_strerror(status); - buildLog_ += "\n"; - return false; - } - - uint32_t kernargSegmentAlignment; - status = hsa_executable_symbol_get_info( - kernelSymbol, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT, - &kernargSegmentAlignment); - if (status != HSA_STATUS_SUCCESS) { - buildLog_ += "Error: Failed to get kernarg segment alignment info: "; - buildLog_ += hsa_strerror(status); - buildLog_ += "\n"; - return 
false; - } - - // FIME_lmoriche: the compiler should set the kernarg alignment based - // on the alignment requirement of the parameters. For now, bump it to - // the worse case: 128byte aligned. - kernargSegmentAlignment = std::max(kernargSegmentAlignment, 128u); - - Kernel *aKernel = new roc::Kernel( - kernelName, - this, - kernelCodeHandle, - workgroupGroupSegmentByteSize, - workitemPrivateSegmentByteSize, - kernargSegmentByteSize, - amd::alignUp(kernargSegmentAlignment,device().info().globalMemCacheLineSize_)); - if (!aKernel->init()) { - return false; - } - aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize); - aKernel->setInternalKernelFlag(compileOptions_.find("-cl-internal-kernel") != std::string::npos); - kernels()[kernelName] = aKernel; - } - return true; -} -#endif // defined(WITH_LIGHTNING_COMPILER) + } -bool -HSAILProgram::linkImpl(amd::option::Options *options) -{ - acl_error errorCode; - aclType continueCompileFrom = ACL_TYPE_LLVMIR_BINARY; - bool finalize = true; -#if !defined(WITH_LIGHTNING_COMPILER) - // If !binaryElf_ then program must have been created using clCreateProgramWithBinary - if (!binaryElf_) -#else // defined(WITH_LIGHTNING_COMPILER) - if (llvmBinary_.empty()) -#endif // defined(WITH_LIGHTNING_COMPILER) - { - continueCompileFrom = getNextCompilationStageFromBinary(options); + return linkImpl_LC(options); +} +#endif // defined(WITH_LIGHTNING_COMPILER) + +bool HSAILProgram::linkImpl(const std::vector& inputPrograms, + amd::option::Options* options, bool createLibrary) { +#if defined(WITH_LIGHTNING_COMPILER) + return linkImpl_LC(inputPrograms, options, createLibrary); +#else // !defined(WITH_LIGHTNING_COMPILER) + std::vector::const_iterator it = inputPrograms.begin(); + std::vector::const_iterator itEnd = inputPrograms.end(); + acl_error errorCode; + + // For each program we need to extract the LLVMIR and create + // aclBinary for each + std::vector binaries_to_link; + + for (size_t i = 0; it != itEnd; ++it, ++i) 
{ + HSAILProgram* program = (HSAILProgram*)*it; + // Check if the program was created with clCreateProgramWIthBinary + binary_t binary = program->binary(); + if ((binary.first != nullptr) && (binary.second > 0)) { + // Binary already exists -- we can also check if there is no + // opencl source code + // Need to check if LLVMIR exists in the binary + // If LLVMIR does not exist then is it valid + // We need to pull out all the compiled kernels + // We cannot do this at present because we need at least + // Hsail text to pull the kernels oout + void* mem = const_cast(binary.first); + binaryElf_ = g_complibApi._aclReadFromMem(mem, binary.second, &errorCode); + + if (errorCode != ACL_SUCCESS) { + LogWarning("Error while linking : Could not read from raw binary"); + return false; + } + } + // At this stage each HSAILProgram contains a valid binary_elf + // Check if LLVMIR is in the binary + size_t boolSize = sizeof(bool); + bool containsLLLVMIR = false; + errorCode = g_complibApi._aclQueryInfo(device().compiler(), binaryElf_, RT_CONTAINS_LLVMIR, + nullptr, &containsLLLVMIR, &boolSize); + if (errorCode != ACL_SUCCESS || !containsLLLVMIR) { + buildLog_ += "Error while linking : Invalid binary (Missing LLVMIR section)"; + return false; + } + // Create a new aclBinary for each LLVMIR and save it in a list + aclBIFVersion ver = g_complibApi._aclBinaryVersion(binaryElf_); + aclBinary* bin = g_complibApi._aclCreateFromBinary(binaryElf_, ver); + binaries_to_link.push_back(bin); + } + + // At this stage each HSAILProgram in the list has an aclBinary initialized + // and contains LLVMIR + // We can now go ahead and link them. 
+ if (binaries_to_link.size() > 1) { + errorCode = g_complibApi._aclLink(device().compiler(), binaries_to_link[0], + binaries_to_link.size() - 1, &binaries_to_link[1], + ACL_TYPE_LLVMIR_BINARY, "-create-library", nullptr); + } else { + errorCode = g_complibApi._aclLink(device().compiler(), binaries_to_link[0], 0, nullptr, + ACL_TYPE_LLVMIR_BINARY, "-create-library", nullptr); + } + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Failed to link programs"; + return false; + } + // Store the newly linked aclBinary for this program. + binaryElf_ = binaries_to_link[0]; + // Free all the other aclBinaries + for (size_t i = 1; i < binaries_to_link.size(); i++) { + g_complibApi._aclBinaryFini(binaries_to_link[i]); + } + if (createLibrary) { + saveBinaryAndSetType(TYPE_LIBRARY); + return true; + } + + // Now call linkImpl with the new options + return linkImpl(options); +#endif // !defined(WITH_LIGHTNING_COMPILER) +} + +static inline const char* hsa_strerror(hsa_status_t status) { + const char* str = nullptr; + if (hsa_status_string(status, &str) == HSA_STATUS_SUCCESS) { + return str; + } + return "Unknown error"; +} + +#if defined(WITH_LIGHTNING_COMPILER) +bool HSAILProgram::linkImpl_LC(amd::option::Options* options) { + using namespace amd::opencl_driver; + std::unique_ptr C(newCompilerInstance()); + + // call LinkLLVMBitcode + std::vector inputs; + + // open the input IR source + Data* input = C->NewBufferReference(DT_LLVM_BC, llvmBinary_.data(), llvmBinary_.size()); + + if (!input) { + buildLog_ += "Error: Failed to open the compiled program.\n"; + return false; + } + + inputs.push_back(input); //< must be the first input + + // open the bitcode libraries + Data* opencl_bc = + C->NewBufferReference(DT_LLVM_BC, (const char*)opencl_amdgcn, opencl_amdgcn_size); + Data* ocml_bc = C->NewBufferReference(DT_LLVM_BC, (const char*)ocml_amdgcn, ocml_amdgcn_size); + Data* ockl_bc = C->NewBufferReference(DT_LLVM_BC, (const char*)ockl_amdgcn, ockl_amdgcn_size); + Data* irif_bc = 
C->NewBufferReference(DT_LLVM_BC, (const char*)irif_amdgcn, irif_amdgcn_size); + + if (!opencl_bc || !ocml_bc || !ockl_bc || !irif_bc) { + buildLog_ += "Error: Failed to open the bitcode library.\n"; + return false; + } + + inputs.push_back(opencl_bc); // depends on oclm & ockl + inputs.push_back(ockl_bc); // depends on irif + inputs.push_back(ocml_bc); // depends on irif + inputs.push_back(irif_bc); + + // open the control functions + auto isa_version = get_oclc_isa_version(dev().deviceInfo().gfxipVersion_); + if (!isa_version.first) { + buildLog_ += "Error: Linking for this device is not supported\n"; + return false; + } + + Data* isa_version_bc = + C->NewBufferReference(DT_LLVM_BC, (const char*)isa_version.first, isa_version.second); + + if (!isa_version_bc) { + buildLog_ += "Error: Failed to open the control functions.\n"; + return false; + } + + inputs.push_back(isa_version_bc); + + auto correctly_rounded_sqrt = + get_oclc_correctly_rounded_sqrt(options->oVariables->FP32RoundDivideSqrt); + Data* correctly_rounded_sqrt_bc = C->NewBufferReference(DT_LLVM_BC, correctly_rounded_sqrt.first, + correctly_rounded_sqrt.second); + + auto daz_opt = get_oclc_daz_opt(dev().deviceInfo().gfxipVersion_ < 900 || + options->oVariables->DenormsAreZero); + Data* daz_opt_bc = C->NewBufferReference(DT_LLVM_BC, daz_opt.first, daz_opt.second); + + auto finite_only = get_oclc_finite_only(options->oVariables->FiniteMathOnly || + options->oVariables->FastRelaxedMath); + Data* finite_only_bc = C->NewBufferReference(DT_LLVM_BC, finite_only.first, finite_only.second); + + auto unsafe_math = get_oclc_unsafe_math(options->oVariables->UnsafeMathOpt || + options->oVariables->FastRelaxedMath); + Data* unsafe_math_bc = C->NewBufferReference(DT_LLVM_BC, unsafe_math.first, unsafe_math.second); + + if (!correctly_rounded_sqrt_bc || !daz_opt_bc || !finite_only_bc || !unsafe_math_bc) { + buildLog_ += "Error: Failed to open the control functions.\n"; + return false; + } + + 
inputs.push_back(correctly_rounded_sqrt_bc); + inputs.push_back(daz_opt_bc); + inputs.push_back(finite_only_bc); + inputs.push_back(unsafe_math_bc); + + // open the linked output + std::vector linkOptions; + Buffer* linked_bc = C->NewBuffer(DT_LLVM_BC); + + if (!linked_bc) { + buildLog_ += "Error: Failed to open the linked program.\n"; + return false; + } + + // NOTE: The linkOptions parameter is also used to identy cached code object. This parameter + // should not contain any dyanamically generated filename. + bool ret = + dev().cacheCompilation()->linkLLVMBitcode(C.get(), inputs, linked_bc, linkOptions, buildLog_); + buildLog_ += C->Output(); + if (!ret) { + buildLog_ += "Error: Linking bitcode failed: linking source & IR libraries.\n"; + return false; + } + + if (options->isDumpFlagSet(amd::option::DUMP_BC_LINKED)) { + std::ofstream f(options->getDumpFileName("_linked.bc").c_str(), std::ios::trunc); + if (f.is_open()) { + f.write(linked_bc->Buf().data(), linked_bc->Size()); + } else { + buildLog_ += "Warning: opening the file to dump the linked IR failed.\n"; + } + } + + inputs.clear(); + inputs.push_back(linked_bc); + + Buffer* out_exec = C->NewBuffer(DT_EXECUTABLE); + if (!out_exec) { + buildLog_ += "Error: Failed to create the linked executable.\n"; + return false; + } + + std::string codegenOptions(options->llvmOptions); + + // Set the machine target + codegenOptions.append(" -mcpu="); + codegenOptions.append(dev().deviceInfo().machineTarget_); + + // Set the -O# + std::ostringstream optLevel; + optLevel << "-O" << options->oVariables->OptLevel; + codegenOptions.append(" ").append(optLevel.str()); + + // Pass clang options + std::ostringstream ostrstr; + std::copy(options->clangOptions.begin(), options->clangOptions.end(), + std::ostream_iterator(ostrstr, " ")); + codegenOptions.append(" ").append(ostrstr.str()); + + // Set whole program mode + codegenOptions.append(" -mllvm -amdgpu-internalize-symbols -mllvm -amdgpu-early-inline-all"); + + // Tokenize the 
options string into a vector of strings + std::istringstream strstr(codegenOptions); + std::istream_iterator sit(strstr), end; + std::vector params(sit, end); + + // NOTE: The params is also used to identy cached code object. This paramete + // should not contain any dyanamically generated filename. + ret = dev().cacheCompilation()->compileAndLinkExecutable(C.get(), inputs, out_exec, params, + buildLog_); + buildLog_ += C->Output(); + if (!ret) { + buildLog_ += "Error: Creating the executable failed: Compiling LLVM IRs to executable\n"; + return false; + } + + if (options->isDumpFlagSet(amd::option::DUMP_O)) { + std::ofstream f(options->getDumpFileName(".so").c_str(), std::ios::trunc); + if (f.is_open()) { + f.write(out_exec->Buf().data(), out_exec->Size()); + } else { + buildLog_ += "Warning: opening the file to dump the code object failed.\n"; + } + } + + if (options->isDumpFlagSet(amd::option::DUMP_ISA)) { + std::string name = options->getDumpFileName(".s"); + File* dump = C->NewFile(DT_INTERNAL, name); + if (!C->DumpExecutableAsText(out_exec, dump)) { + buildLog_ += "Warning: failed to dump code object.\n"; + } + } + + return setKernels_LC(options, out_exec->Buf().data(), out_exec->Size()); +} + +bool HSAILProgram::setKernels_LC(amd::option::Options* options, void* binary, size_t binSize) { + hsa_agent_t agent = dev().getBackendDevice(); + hsa_status_t status; + + status = hsa_executable_create_alt(HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, + nullptr, &hsaExecutable_); + if (status != HSA_STATUS_SUCCESS) { + buildLog_ += "Error: Executable for AMD HSA Code Object isn't created: "; + buildLog_ += hsa_strerror(status); + buildLog_ += "\n"; + return false; + } + + // Load the code object. 
+ hsa_code_object_reader_t codeObjectReader; + status = hsa_code_object_reader_create_from_memory(binary, binSize, &codeObjectReader); + if (status != HSA_STATUS_SUCCESS) { + buildLog_ += "Error: AMD HSA Code Object Reader create failed: "; + buildLog_ += hsa_strerror(status); + buildLog_ += "\n"; + return false; + } + + status = hsa_executable_load_agent_code_object(hsaExecutable_, agent, codeObjectReader, nullptr, + nullptr); + if (status != HSA_STATUS_SUCCESS) { + buildLog_ += "Error: AMD HSA Code Object loading failed: "; + buildLog_ += hsa_strerror(status); + buildLog_ += "\n"; + return false; + } + + hsa_code_object_reader_destroy(codeObjectReader); + + // Freeze the executable. + status = hsa_executable_freeze(hsaExecutable_, nullptr); + if (status != HSA_STATUS_SUCCESS) { + buildLog_ += "Error: Freezing the executable failed: "; + buildLog_ += hsa_strerror(status); + buildLog_ += "\n"; + return false; + } + + size_t progvarsTotalSize = 0; + size_t dynamicSize = 0; + size_t progvarsWriteSize = 0; + + // Begin the Elf image from memory + Elf* e = elf_memory((char*)binary, binSize, nullptr); + if (elf_kind(e) != ELF_K_ELF) { + buildLog_ += "Error while reading the ELF program binary\n"; + return false; + } + + size_t numpHdrs; + if (elf_getphdrnum(e, &numpHdrs) != 0) { + buildLog_ += "Error while reading the ELF program binary\n"; + return false; + } + + for (size_t i = 0; i < numpHdrs; ++i) { + GElf_Phdr pHdr; + if (gelf_getphdr(e, i, &pHdr) != &pHdr) { + continue; + } + // Look for the runtime metadata note + if (pHdr.p_type == PT_NOTE && pHdr.p_align >= sizeof(int)) { + // Iterate over the notes in this segment + address ptr = (address)binary + pHdr.p_offset; + address segmentEnd = ptr + pHdr.p_filesz; + + while (ptr < segmentEnd) { + Elf_Note* note = (Elf_Note*)ptr; + address name = (address)¬e[1]; + address desc = name + amd::alignUp(note->n_namesz, sizeof(int)); + + if (note->n_type == 7 || note->n_type == 8) { + buildLog_ += + "Error: object code with 
old metadata is not " + "supported\n"; + return false; + } else if (note->n_type == AMDGPU::ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_METADATA && + note->n_namesz == sizeof AMDGPU::ElfNote::NoteName && + !memcmp(name, AMDGPU::ElfNote::NoteName, note->n_namesz)) { + std::string metadataStr((const char*)desc, (size_t)note->n_descsz); + metadata_ = new CodeObjectMD(); + if (CodeObjectMD::fromYamlString(metadataStr, *metadata_)) { + buildLog_ += "Error: failed to process metadata\n"; + return false; + } + // We've found and loaded the runtime metadata, exit the + // note record loop now. + break; } - switch (continueCompileFrom) { + ptr += sizeof(*note) + amd::alignUp(note->n_namesz, sizeof(int)) + + amd::alignUp(note->n_descsz, sizeof(int)); + } + } + // Accumulate the size of R & !X loadable segments + else if (pHdr.p_type == PT_LOAD && !(pHdr.p_flags & PF_X)) { + if (pHdr.p_flags & PF_R) { + progvarsTotalSize += pHdr.p_memsz; + } + if (pHdr.p_flags & PF_W) { + progvarsWriteSize += pHdr.p_memsz; + } + } else if (pHdr.p_type == PT_DYNAMIC) { + dynamicSize += pHdr.p_memsz; + } + } + + elf_end(e); + + if (!metadata_) { + buildLog_ += + "Error: runtime metadata section not present in " + "ELF program binary\n"; + return false; + } + + if (progvarsWriteSize != dynamicSize) { + hasGlobalStores_ = true; + } + progvarsTotalSize -= dynamicSize; + setGlobalVariableTotalSize(progvarsTotalSize); + + saveBinaryAndSetType(TYPE_EXECUTABLE, binary, binSize); + + // Get the list of kernels + std::vector kernelNameList; + status = hsa_executable_iterate_agent_symbols(hsaExecutable_, agent, GetKernelNamesCallback, + (void*)&kernelNameList); + if (status != HSA_STATUS_SUCCESS) { + buildLog_ += "Error: Failed to get kernel names: "; + buildLog_ += hsa_strerror(status); + buildLog_ += "\n"; + return false; + } + + for (auto& kernelName : kernelNameList) { + hsa_executable_symbol_t kernelSymbol; + + status = hsa_executable_get_symbol_by_name(hsaExecutable_, kernelName.c_str(), &agent, + 
&kernelSymbol); + if (status != HSA_STATUS_SUCCESS) { + buildLog_ += "Error: Failed to get the symbol: "; + buildLog_ += hsa_strerror(status); + buildLog_ += "\n"; + return false; + } + + uint64_t kernelCodeHandle; + status = hsa_executable_symbol_get_info(kernelSymbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, + &kernelCodeHandle); + if (status != HSA_STATUS_SUCCESS) { + buildLog_ += "Error: Failed to get the kernel code: "; + buildLog_ += hsa_strerror(status); + buildLog_ += "\n"; + return false; + } + + uint32_t workgroupGroupSegmentByteSize; + status = hsa_executable_symbol_get_info(kernelSymbol, + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, + &workgroupGroupSegmentByteSize); + if (status != HSA_STATUS_SUCCESS) { + buildLog_ += "Error: Failed to get group segment size info: "; + buildLog_ += hsa_strerror(status); + buildLog_ += "\n"; + return false; + } + + uint32_t workitemPrivateSegmentByteSize; + status = hsa_executable_symbol_get_info(kernelSymbol, + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, + &workitemPrivateSegmentByteSize); + if (status != HSA_STATUS_SUCCESS) { + buildLog_ += "Error: Failed to get private segment size info: "; + buildLog_ += hsa_strerror(status); + buildLog_ += "\n"; + return false; + } + + uint32_t kernargSegmentByteSize; + status = hsa_executable_symbol_get_info(kernelSymbol, + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, + &kernargSegmentByteSize); + if (status != HSA_STATUS_SUCCESS) { + buildLog_ += "Error: Failed to get kernarg segment size info: "; + buildLog_ += hsa_strerror(status); + buildLog_ += "\n"; + return false; + } + + uint32_t kernargSegmentAlignment; + status = hsa_executable_symbol_get_info( + kernelSymbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT, + &kernargSegmentAlignment); + if (status != HSA_STATUS_SUCCESS) { + buildLog_ += "Error: Failed to get kernarg segment alignment info: "; + buildLog_ += hsa_strerror(status); + buildLog_ += "\n"; + return false; + } + + 
// FIME_lmoriche: the compiler should set the kernarg alignment based + // on the alignment requirement of the parameters. For now, bump it to + // the worse case: 128byte aligned. + kernargSegmentAlignment = std::max(kernargSegmentAlignment, 128u); + + Kernel* aKernel = new roc::Kernel( + kernelName, this, kernelCodeHandle, workgroupGroupSegmentByteSize, + workitemPrivateSegmentByteSize, kernargSegmentByteSize, + amd::alignUp(kernargSegmentAlignment, device().info().globalMemCacheLineSize_)); + if (!aKernel->init()) { + return false; + } + aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize); + aKernel->setInternalKernelFlag(compileOptions_.find("-cl-internal-kernel") != + std::string::npos); + kernels()[kernelName] = aKernel; + } + + return true; +} +#endif // defined(WITH_LIGHTNING_COMPILER) + +bool HSAILProgram::linkImpl(amd::option::Options* options) { + acl_error errorCode; + aclType continueCompileFrom = ACL_TYPE_LLVMIR_BINARY; + bool finalize = true; +#if !defined(WITH_LIGHTNING_COMPILER) + // If !binaryElf_ then program must have been created using clCreateProgramWithBinary + if (!binaryElf_) +#else // defined(WITH_LIGHTNING_COMPILER) + if (llvmBinary_.empty()) +#endif // defined(WITH_LIGHTNING_COMPILER) + { + continueCompileFrom = getNextCompilationStageFromBinary(options); + } + switch (continueCompileFrom) { // Compilation from ACL_TYPE_LLVMIR_BINARY to ACL_TYPE_CG in cases: // 1. if the program is not created with binary; // 2. if the program is created with binary and contains only .llvmir & .comment // 3. if the program is created with binary, contains .llvmir, .comment, brig sections, // but the binary's compile & link options differ from current ones (recompilation); case ACL_TYPE_LLVMIR_BINARY: - // Compilation from ACL_TYPE_HSAIL_BINARY to ACL_TYPE_CG in cases: - // 1. if the program is created with binary and contains only brig sections + // Compilation from ACL_TYPE_HSAIL_BINARY to ACL_TYPE_CG in cases: + // 1. 
if the program is created with binary and contains only brig sections case ACL_TYPE_HSAIL_BINARY: - // Compilation from ACL_TYPE_HSAIL_TEXT to ACL_TYPE_CG in cases: - // 1. if the program is created with binary and contains only hsail text + // Compilation from ACL_TYPE_HSAIL_TEXT to ACL_TYPE_CG in cases: + // 1. if the program is created with binary and contains only hsail text case ACL_TYPE_HSAIL_TEXT: { #if defined(WITH_LIGHTNING_COMPILER) - if (!linkImpl_LC(options)) { - return false; - } -#else // !defined(WITH_LIGHTNING_COMPILER) - std::string curOptions = options->origOptionStr - + preprocessorOptions(options) + codegenOptions(options); - errorCode = g_complibApi._aclCompile(device().compiler(), binaryElf_, - curOptions.c_str(), continueCompileFrom, ACL_TYPE_CG, logFunction); - buildLog_ += g_complibApi._aclGetCompilerLog(device().compiler()); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error while BRIG Codegen phase: compilation error \n" ; - return false; - } -#endif // !defined(WITH_LIGHTNING_COMPILER) - break; + if (!linkImpl_LC(options)) { + return false; + } +#else // !defined(WITH_LIGHTNING_COMPILER) + std::string curOptions = + options->origOptionStr + preprocessorOptions(options) + codegenOptions(options); + errorCode = g_complibApi._aclCompile(device().compiler(), binaryElf_, curOptions.c_str(), + continueCompileFrom, ACL_TYPE_CG, logFunction); + buildLog_ += g_complibApi._aclGetCompilerLog(device().compiler()); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error while BRIG Codegen phase: compilation error \n"; + return false; + } +#endif // !defined(WITH_LIGHTNING_COMPILER) + break; } case ACL_TYPE_CG: - break; + break; case ACL_TYPE_ISA: { #if defined(WITH_LIGHTNING_COMPILER) - binary_t isaBinary = binary(); - if ((isaBinary.first != nullptr) && (isaBinary.second > 0)) { - return setKernels_LC(options, (void*) isaBinary.first, isaBinary.second ); - } - else { - buildLog_ += "Error: code object is empty \n" ; - return false; - } -#endif 
// !defined(WITH_LIGHTNING_COMPILER) - finalize = false; - break; + binary_t isaBinary = binary(); + if ((isaBinary.first != nullptr) && (isaBinary.second > 0)) { + return setKernels_LC(options, (void*)isaBinary.first, isaBinary.second); + } else { + buildLog_ += "Error: code object is empty \n"; + return false; + } +#endif // !defined(WITH_LIGHTNING_COMPILER) + finalize = false; + break; } default: - buildLog_ += "Error while BRIG Codegen phase: the binary is incomplete \n" ; - return false; - } - //Stop compilation if it is an offline device - HSA runtime does not - //support ISA compiled offline - if (!dev().isOnline()) { - return true; - } + buildLog_ += "Error while BRIG Codegen phase: the binary is incomplete \n"; + return false; + } + // Stop compilation if it is an offline device - HSA runtime does not + // support ISA compiled offline + if (!dev().isOnline()) { + return true; + } #if !defined(WITH_LIGHTNING_COMPILER) - hsa_agent_t hsaDevice = dev().getBackendDevice(); + hsa_agent_t hsaDevice = dev().getBackendDevice(); - std::string fin_options(options->origOptionStr); - // Append an option so that we can selectively enable a SCOption on CZ - // whenever IOMMUv2 is enabled. - if (dev().isFineGrainedSystem(true)) { - fin_options.append(" -sc-xnack-iommu"); - } - errorCode = aclCompile(dev().compiler(), binaryElf_, - fin_options.c_str(), ACL_TYPE_CG, ACL_TYPE_ISA, logFunction); - buildLog_ += aclGetCompilerLog(dev().compiler()); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error: BRIG finalization to ISA failed.\n"; - return false; - } - size_t secSize; - void *data = (void*)aclExtractSection(device().compiler(), - binaryElf_, &secSize, aclTEXT, &errorCode); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error: cannot extract ISA from compiled binary.\n"; - return false; - } - - // Create an executable. 
- hsa_status_t status = hsa_executable_create_alt( - HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, - nullptr, &hsaExecutable_ - ); - if (status != HSA_STATUS_SUCCESS) { - buildLog_ += "Error: Failed to create executable: "; - buildLog_ += hsa_strerror(status); - buildLog_ += "\n"; - return false; - } - - // Load the code object. - hsa_code_object_reader_t codeObjectReader; - status = hsa_code_object_reader_create_from_memory( - data, secSize, &codeObjectReader); - if (status != HSA_STATUS_SUCCESS) { - buildLog_ += "Error: AMD HSA Code Object Reader create failed: "; - buildLog_ += hsa_strerror(status); - buildLog_ += "\n"; - return false; - } - - status = hsa_executable_load_agent_code_object( - hsaExecutable_, hsaDevice, codeObjectReader, nullptr, nullptr ); - if (status != HSA_STATUS_SUCCESS) { - buildLog_ += "Error: AMD HSA Code Object loading failed: "; - buildLog_ += hsa_strerror(status); - buildLog_ += "\n"; - return false; - } - - hsa_code_object_reader_destroy(codeObjectReader); - - // Freeze the executable. - status = hsa_executable_freeze(hsaExecutable_, nullptr); - if (status != HSA_STATUS_SUCCESS) { - buildLog_ += "Error: Failed to freeze executable: "; - buildLog_ += hsa_strerror(status); - buildLog_ += "\n"; - return false; - } - - // Get the list of kernels - std::vector kernelNameList; - status = hsa_executable_iterate_agent_symbols( hsaExecutable_, - hsaDevice, GetKernelNamesCallback, (void *) &kernelNameList ); - if (status != HSA_STATUS_SUCCESS) { - buildLog_ += "Error: Failed to get kernel names: "; - buildLog_ += hsa_strerror(status); - buildLog_ += "\n"; - return false; - } - - for (auto &kernelName : kernelNameList) { - // Query symbol handle for this symbol. 
- hsa_executable_symbol_t kernelSymbol; - status = hsa_executable_get_symbol_by_name( - hsaExecutable_, kernelName.c_str(), &hsaDevice, &kernelSymbol - ); - if (status != HSA_STATUS_SUCCESS) { - buildLog_ += "Error: Failed to get executable symbol: "; - buildLog_ += hsa_strerror(status); - buildLog_ += "\n"; - return false; - } - - // Query code handle for this symbol. - uint64_t kernelCodeHandle; - status = hsa_executable_symbol_get_info( - kernelSymbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernelCodeHandle - ); - if (status != HSA_STATUS_SUCCESS) { - buildLog_ += "Error: Failed to get executable symbol info: "; - buildLog_ += hsa_strerror(status); - buildLog_ += "\n"; - return false; - } - - std::string openclKernelName = kernelName; - // Strip the opencl and kernel name - kernelName = kernelName.substr(strlen("&__OpenCL_"), kernelName.size()); - kernelName = kernelName.substr(0,kernelName.size() - strlen("_kernel")); - aclMetadata md; - md.numHiddenKernelArgs = 0; - - size_t sizeOfnumHiddenKernelArgs = sizeof(md.numHiddenKernelArgs); - errorCode = g_complibApi._aclQueryInfo(device().compiler(), binaryElf_, RT_NUM_KERNEL_HIDDEN_ARGS, - openclKernelName.c_str(), &md.numHiddenKernelArgs, &sizeOfnumHiddenKernelArgs); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error while Finalization phase: Kernel extra arguments count querying from the ELF failed\n"; - return false; - } - - uint32_t workgroupGroupSegmentByteSize; - status = hsa_executable_symbol_get_info( - kernelSymbol, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, - &workgroupGroupSegmentByteSize); - if (status != HSA_STATUS_SUCCESS) { - buildLog_ += "Error: Failed to get group segment size info: "; - buildLog_ += hsa_strerror(status); - buildLog_ += "\n"; - return false; - } - - uint32_t workitemPrivateSegmentByteSize; - status = hsa_executable_symbol_get_info( - kernelSymbol, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, - &workitemPrivateSegmentByteSize); - if (status != 
HSA_STATUS_SUCCESS) { - buildLog_ += "Error: Failed to get private segment size info: "; - buildLog_ += hsa_strerror(status); - buildLog_ += "\n"; - return false; - } - - uint32_t kernargSegmentByteSize; - status = hsa_executable_symbol_get_info( - kernelSymbol, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, - &kernargSegmentByteSize); - if (status != HSA_STATUS_SUCCESS) { - buildLog_ += "Error: Failed to get kernarg segment size info: "; - buildLog_ += hsa_strerror(status); - buildLog_ += "\n"; - return false; - } - - uint32_t kernargSegmentAlignment; - status = hsa_executable_symbol_get_info( - kernelSymbol, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT, - &kernargSegmentAlignment); - if (status != HSA_STATUS_SUCCESS) { - buildLog_ += "Error: Failed to get kernarg segment alignment info: "; - buildLog_ += hsa_strerror(status); - buildLog_ += "\n"; - return false; - } - - Kernel *aKernel = new roc::Kernel( - kernelName, - this, - kernelCodeHandle, - workgroupGroupSegmentByteSize, - workitemPrivateSegmentByteSize, - kernargSegmentByteSize, - kernargSegmentAlignment); - if (!aKernel->init()) { - return false; - } - aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize); - aKernel->setInternalKernelFlag(compileOptions_.find("-cl-internal-kernel") != std::string::npos); - kernels()[kernelName] = aKernel; - } - saveBinaryAndSetType(TYPE_EXECUTABLE); - buildLog_ += g_complibApi._aclGetCompilerLog(device().compiler()); -#endif // !defined(WITH_LIGHTNING_COMPILER) - return true; -} - -bool -HSAILProgram::createBinary(amd::option::Options *options) -{ -#if defined(WITH_LIGHTNING_COMPILER) - if (!clBinary()->createElfBinary(options->oVariables->BinEncrypt, type())) { - LogError("Failed to create ELF binary image!"); - return false; - } - return true; -#else // !defined(WITH_LIGHTNING_COMPILER) + std::string fin_options(options->origOptionStr); + // Append an option so that we can selectively enable a SCOption on CZ + // 
whenever IOMMUv2 is enabled. + if (dev().isFineGrainedSystem(true)) { + fin_options.append(" -sc-xnack-iommu"); + } + errorCode = aclCompile(dev().compiler(), binaryElf_, fin_options.c_str(), ACL_TYPE_CG, + ACL_TYPE_ISA, logFunction); + buildLog_ += aclGetCompilerLog(dev().compiler()); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error: BRIG finalization to ISA failed.\n"; return false; -#endif // !defined(WITH_LIGHTNING_COMPILER) + } + size_t secSize; + void* data = + (void*)aclExtractSection(device().compiler(), binaryElf_, &secSize, aclTEXT, &errorCode); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error: cannot extract ISA from compiled binary.\n"; + return false; + } + + // Create an executable. + hsa_status_t status = hsa_executable_create_alt( + HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, nullptr, &hsaExecutable_); + if (status != HSA_STATUS_SUCCESS) { + buildLog_ += "Error: Failed to create executable: "; + buildLog_ += hsa_strerror(status); + buildLog_ += "\n"; + return false; + } + + // Load the code object. + hsa_code_object_reader_t codeObjectReader; + status = hsa_code_object_reader_create_from_memory(data, secSize, &codeObjectReader); + if (status != HSA_STATUS_SUCCESS) { + buildLog_ += "Error: AMD HSA Code Object Reader create failed: "; + buildLog_ += hsa_strerror(status); + buildLog_ += "\n"; + return false; + } + + status = hsa_executable_load_agent_code_object(hsaExecutable_, hsaDevice, codeObjectReader, + nullptr, nullptr); + if (status != HSA_STATUS_SUCCESS) { + buildLog_ += "Error: AMD HSA Code Object loading failed: "; + buildLog_ += hsa_strerror(status); + buildLog_ += "\n"; + return false; + } + + hsa_code_object_reader_destroy(codeObjectReader); + + // Freeze the executable. 
+ status = hsa_executable_freeze(hsaExecutable_, nullptr); + if (status != HSA_STATUS_SUCCESS) { + buildLog_ += "Error: Failed to freeze executable: "; + buildLog_ += hsa_strerror(status); + buildLog_ += "\n"; + return false; + } + + // Get the list of kernels + std::vector kernelNameList; + status = hsa_executable_iterate_agent_symbols(hsaExecutable_, hsaDevice, GetKernelNamesCallback, + (void*)&kernelNameList); + if (status != HSA_STATUS_SUCCESS) { + buildLog_ += "Error: Failed to get kernel names: "; + buildLog_ += hsa_strerror(status); + buildLog_ += "\n"; + return false; + } + + for (auto& kernelName : kernelNameList) { + // Query symbol handle for this symbol. + hsa_executable_symbol_t kernelSymbol; + status = hsa_executable_get_symbol_by_name(hsaExecutable_, kernelName.c_str(), &hsaDevice, + &kernelSymbol); + if (status != HSA_STATUS_SUCCESS) { + buildLog_ += "Error: Failed to get executable symbol: "; + buildLog_ += hsa_strerror(status); + buildLog_ += "\n"; + return false; + } + + // Query code handle for this symbol. 
+ uint64_t kernelCodeHandle; + status = hsa_executable_symbol_get_info(kernelSymbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, + &kernelCodeHandle); + if (status != HSA_STATUS_SUCCESS) { + buildLog_ += "Error: Failed to get executable symbol info: "; + buildLog_ += hsa_strerror(status); + buildLog_ += "\n"; + return false; + } + + std::string openclKernelName = kernelName; + // Strip the opencl and kernel name + kernelName = kernelName.substr(strlen("&__OpenCL_"), kernelName.size()); + kernelName = kernelName.substr(0, kernelName.size() - strlen("_kernel")); + aclMetadata md; + md.numHiddenKernelArgs = 0; + + size_t sizeOfnumHiddenKernelArgs = sizeof(md.numHiddenKernelArgs); + errorCode = g_complibApi._aclQueryInfo(device().compiler(), binaryElf_, + RT_NUM_KERNEL_HIDDEN_ARGS, openclKernelName.c_str(), + &md.numHiddenKernelArgs, &sizeOfnumHiddenKernelArgs); + if (errorCode != ACL_SUCCESS) { + buildLog_ += + "Error while Finalization phase: Kernel extra arguments count querying from the ELF " + "failed\n"; + return false; + } + + uint32_t workgroupGroupSegmentByteSize; + status = hsa_executable_symbol_get_info(kernelSymbol, + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, + &workgroupGroupSegmentByteSize); + if (status != HSA_STATUS_SUCCESS) { + buildLog_ += "Error: Failed to get group segment size info: "; + buildLog_ += hsa_strerror(status); + buildLog_ += "\n"; + return false; + } + + uint32_t workitemPrivateSegmentByteSize; + status = hsa_executable_symbol_get_info(kernelSymbol, + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, + &workitemPrivateSegmentByteSize); + if (status != HSA_STATUS_SUCCESS) { + buildLog_ += "Error: Failed to get private segment size info: "; + buildLog_ += hsa_strerror(status); + buildLog_ += "\n"; + return false; + } + + uint32_t kernargSegmentByteSize; + status = hsa_executable_symbol_get_info(kernelSymbol, + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, + &kernargSegmentByteSize); + if (status != 
HSA_STATUS_SUCCESS) { + buildLog_ += "Error: Failed to get kernarg segment size info: "; + buildLog_ += hsa_strerror(status); + buildLog_ += "\n"; + return false; + } + + uint32_t kernargSegmentAlignment; + status = hsa_executable_symbol_get_info( + kernelSymbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT, + &kernargSegmentAlignment); + if (status != HSA_STATUS_SUCCESS) { + buildLog_ += "Error: Failed to get kernarg segment alignment info: "; + buildLog_ += hsa_strerror(status); + buildLog_ += "\n"; + return false; + } + + Kernel* aKernel = new roc::Kernel(kernelName, this, kernelCodeHandle, + workgroupGroupSegmentByteSize, workitemPrivateSegmentByteSize, + kernargSegmentByteSize, kernargSegmentAlignment); + if (!aKernel->init()) { + return false; + } + aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize); + aKernel->setInternalKernelFlag(compileOptions_.find("-cl-internal-kernel") != + std::string::npos); + kernels()[kernelName] = aKernel; + } + saveBinaryAndSetType(TYPE_EXECUTABLE); + buildLog_ += g_complibApi._aclGetCompilerLog(device().compiler()); +#endif // !defined(WITH_LIGHTNING_COMPILER) + return true; } -bool -HSAILProgram::initClBinary() -{ +bool HSAILProgram::createBinary(amd::option::Options* options) { +#if defined(WITH_LIGHTNING_COMPILER) + if (!clBinary()->createElfBinary(options->oVariables->BinEncrypt, type())) { + LogError("Failed to create ELF binary image!"); + return false; + } + return true; +#else // !defined(WITH_LIGHTNING_COMPILER) + return false; +#endif // !defined(WITH_LIGHTNING_COMPILER) +} + +bool HSAILProgram::initClBinary() { + if (clBinary_ == nullptr) { + clBinary_ = new ClBinary(static_cast(device())); if (clBinary_ == nullptr) { - clBinary_ = new ClBinary(static_cast(device())); - if (clBinary_ == nullptr) { - return false; - } + return false; } - return true; + } + return true; } -void -HSAILProgram::releaseClBinary() -{ - if (clBinary_ != nullptr) { - delete clBinary_; - clBinary_ = 
nullptr; - } +void HSAILProgram::releaseClBinary() { + if (clBinary_ != nullptr) { + delete clBinary_; + clBinary_ = nullptr; + } } -std::string -HSAILProgram::codegenOptions(amd::option::Options* options) -{ - std::string optionsStr; +std::string HSAILProgram::codegenOptions(amd::option::Options* options) { + std::string optionsStr; #if !defined(WITH_LIGHTNING_COMPILER) - if (dev().deviceInfo().gfxipVersion_ < 900 || - !dev().settings().singleFpDenorm_) { - optionsStr.append(" -cl-denorms-are-zero"); - } -#endif // !defined(WITH_LIGHTNING_COMPILER) + if (dev().deviceInfo().gfxipVersion_ < 900 || !dev().settings().singleFpDenorm_) { + optionsStr.append(" -cl-denorms-are-zero"); + } +#endif // !defined(WITH_LIGHTNING_COMPILER) - //check if the host is 64 bit or 32 bit - LP64_ONLY(optionsStr.append(" -m64")); + // check if the host is 64 bit or 32 bit + LP64_ONLY(optionsStr.append(" -m64")); - return optionsStr; + return optionsStr; } -std::string -HSAILProgram::preprocessorOptions(amd::option::Options* options) -{ - std::string optionsStr; +std::string HSAILProgram::preprocessorOptions(amd::option::Options* options) { + std::string optionsStr; - //Set options for the standard device specific options + // Set options for the standard device specific options - optionsStr.append(" -D__AMD__=1"); + optionsStr.append(" -D__AMD__=1"); - optionsStr.append(" -D__").append(device().info().name_).append("__=1"); - optionsStr.append(" -D__").append(device().info().name_).append("=1"); + optionsStr.append(" -D__").append(device().info().name_).append("__=1"); + optionsStr.append(" -D__").append(device().info().name_).append("=1"); - int major, minor; - ::sscanf(device().info().version_, "OpenCL %d.%d ", &major, &minor); + int major, minor; + ::sscanf(device().info().version_, "OpenCL %d.%d ", &major, &minor); - std::stringstream ss; - ss << " -D__OPENCL_VERSION__=" << (major * 100 + minor * 10); - optionsStr.append(ss.str()); + std::stringstream ss; + ss << " 
-D__OPENCL_VERSION__=" << (major * 100 + minor * 10); + optionsStr.append(ss.str()); - if (device().info().imageSupport_ && options->oVariables->ImageSupport) { - optionsStr.append(" -D__IMAGE_SUPPORT__=1"); - } + if (device().info().imageSupport_ && options->oVariables->ImageSupport) { + optionsStr.append(" -D__IMAGE_SUPPORT__=1"); + } - //This is just for legacy compiler code - // All our devices support these options now - if (options->oVariables->FastFMA) { - optionsStr.append(" -DFP_FAST_FMA=1"); - } - if (options->oVariables->FastFMAF) { - optionsStr.append(" -DFP_FAST_FMAF=1"); - } + // This is just for legacy compiler code + // All our devices support these options now + if (options->oVariables->FastFMA) { + optionsStr.append(" -DFP_FAST_FMA=1"); + } + if (options->oVariables->FastFMAF) { + optionsStr.append(" -DFP_FAST_FMAF=1"); + } - uint clcStd = (options->oVariables->CLStd[2] - '0') * 100 - + (options->oVariables->CLStd[4] - '0') * 10; + uint clcStd = + (options->oVariables->CLStd[2] - '0') * 100 + (options->oVariables->CLStd[4] - '0') * 10; - if (clcStd >= 200) { - std::stringstream opts; - //Add only for CL2.0 and later - opts << " -D" << "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE=" - << device().info().maxGlobalVariableSize_; - optionsStr.append(opts.str()); - } + if (clcStd >= 200) { + std::stringstream opts; + // Add only for CL2.0 and later + opts << " -D" + << "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE=" << device().info().maxGlobalVariableSize_; + optionsStr.append(opts.str()); + } - // Tokenize the extensions string into a vector of strings - std::istringstream istrstr(device().info().extensions_); - std::istream_iterator sit(istrstr), end; - std::vector extensions(sit, end); + // Tokenize the extensions string into a vector of strings + std::istringstream istrstr(device().info().extensions_); + std::istream_iterator sit(istrstr), end; + std::vector extensions(sit, end); #if defined(WITH_LIGHTNING_COMPILER) - // FIXME_lmoriche: opencl-c.h defines 
'cl_khr_depth_images', so - // remove it from the command line. Should we fix opencl-c.h? - auto found = std::find(extensions.begin(), extensions.end(), - "cl_khr_depth_images"); - if (found != extensions.end()) { - extensions.erase(found); - } + // FIXME_lmoriche: opencl-c.h defines 'cl_khr_depth_images', so + // remove it from the command line. Should we fix opencl-c.h? + auto found = std::find(extensions.begin(), extensions.end(), "cl_khr_depth_images"); + if (found != extensions.end()) { + extensions.erase(found); + } - if (!extensions.empty()) { - std::ostringstream clext; + if (!extensions.empty()) { + std::ostringstream clext; - clext << " -Xclang -cl-ext=+"; - std::copy(extensions.begin(), extensions.end() - 1, - std::ostream_iterator(clext, ",+")); - clext << extensions.back(); + clext << " -Xclang -cl-ext=+"; + std::copy(extensions.begin(), extensions.end() - 1, + std::ostream_iterator(clext, ",+")); + clext << extensions.back(); - optionsStr.append(clext.str()); - } -#else // !defined(WITH_LIGHTNING_COMPILER) - for (auto e : extensions) { - optionsStr.append(" -D").append(e).append("=1"); - } -#endif // !defined(WITH_LIGHTNING_COMPILER) + optionsStr.append(clext.str()); + } +#else // !defined(WITH_LIGHTNING_COMPILER) + for (auto e : extensions) { + optionsStr.append(" -D").append(e).append("=1"); + } +#endif // !defined(WITH_LIGHTNING_COMPILER) - return optionsStr; + return optionsStr; } } // namespace roc diff --git a/rocclr/runtime/device/rocm/rocprogram.hpp b/rocclr/runtime/device/rocm/rocprogram.hpp index a9ab7d2191..b9909bd0d3 100644 --- a/rocclr/runtime/device/rocm/rocprogram.hpp +++ b/rocclr/runtime/device/rocm/rocprogram.hpp @@ -8,7 +8,7 @@ #include "rocbinary.hpp" #if !defined(WITH_LIGHTNING_COMPILER) #include "roccompilerlib.hpp" -#endif // !defined(WITH_LIGHTNING_COMPILER) +#endif // !defined(WITH_LIGHTNING_COMPILER) #include "acl.h" #include #include @@ -24,162 +24,149 @@ typedef llvm::AMDGPU::CodeObject::Metadata CodeObjectMD; typedef 
llvm::AMDGPU::CodeObject::Kernel::Metadata KernelMD; typedef llvm::AMDGPU::CodeObject::Kernel::Arg::Metadata KernelArgMD; -#endif // defined(WITH_LIGHTNING_COMPILER) +#endif // defined(WITH_LIGHTNING_COMPILER) using namespace HSAIL_ASM; //! \namespace roc HSA Device Implementation namespace roc { //! \class empty program -class HSAILProgram : public device::Program -{ - friend class ClBinary; -public: - //! Default constructor - HSAILProgram(roc::NullDevice& device); - //! Default destructor - ~HSAILProgram(); +class HSAILProgram : public device::Program { + friend class ClBinary; - // Initialize Binary for GPU (used only for clCreateProgramWithBinary()). - virtual bool initClBinary(char *binaryIn, size_t size); + public: + //! Default constructor + HSAILProgram(roc::NullDevice& device); + //! Default destructor + ~HSAILProgram(); - //! Returns the aclBinary associated with the program - const aclBinary* binaryElf() const { - return static_cast(binaryElf_); - } + // Initialize Binary for GPU (used only for clCreateProgramWithBinary()). + virtual bool initClBinary(char* binaryIn, size_t size); + + //! Returns the aclBinary associated with the program + const aclBinary* binaryElf() const { return static_cast(binaryElf_); } #if defined(WITH_LIGHTNING_COMPILER) - //! Returns the program metadata. - const CodeObjectMD* metadata() const { return metadata_; } -#endif // defined(WITH_LIGHTNING_COMPILER) + //! Returns the program metadata. + const CodeObjectMD* metadata() const { return metadata_; } +#endif // defined(WITH_LIGHTNING_COMPILER) - //! Return a typecasted GPU device - const NullDevice& dev() const - { return static_cast(device()); } + //! Return a typecasted GPU device + const NullDevice& dev() const { return static_cast(device()); } - //! Returns the hsaBinary associated with the program - hsa_agent_t hsaDevice() const { - return dev().getBackendDevice(); - } + //! 
Returns the hsaBinary associated with the program + hsa_agent_t hsaDevice() const { return dev().getBackendDevice(); } - bool hasGlobalStores() const { return hasGlobalStores_; } + bool hasGlobalStores() const { return hasGlobalStores_; } -protected: - //! pre-compile setup for GPU - virtual bool initBuild(amd::option::Options* options); + protected: + //! pre-compile setup for GPU + virtual bool initBuild(amd::option::Options* options); - //! post-compile setup for GPU - virtual bool finiBuild(bool isBuildGood); + //! post-compile setup for GPU + virtual bool finiBuild(bool isBuildGood); - /*! \brief Compiles GPU CL program to LLVM binary (compiler frontend) - * - * \return True if we successfully compiled a GPU program - */ - virtual bool compileImpl( - const std::string& sourceCode, //!< the program's source code - const std::vector& headers, - const char** headerIncludeNames, - amd::option::Options* options //!< compile options's object - ); + /*! \brief Compiles GPU CL program to LLVM binary (compiler frontend) + * + * \return True if we successfully compiled a GPU program + */ + virtual bool compileImpl(const std::string& sourceCode, //!< the program's source code + const std::vector& headers, + const char** headerIncludeNames, + amd::option::Options* options //!< compile options's object + ); #if defined(WITH_LIGHTNING_COMPILER) - virtual bool compileImpl_LC( - const std::string& sourceCode, //!< the program's source code - const std::vector& headers, - const char** headerIncludeNames, - amd::option::Options* options //!< compile options's object - ); -#endif // defined(WITH_LIGHTNING_COMPILER) + virtual bool compileImpl_LC(const std::string& sourceCode, //!< the program's source code + const std::vector& headers, + const char** headerIncludeNames, + amd::option::Options* options //!< compile options's object + ); +#endif // defined(WITH_LIGHTNING_COMPILER) - /*! 
\brief Compiles LLVM binary to HSAIL code (compiler backend: link+opt+codegen) - * - * \return The build error code - */ - int compileBinaryToHSAIL( - amd::option::Options* options //!< options for compilation - ); + /*! \brief Compiles LLVM binary to HSAIL code (compiler backend: link+opt+codegen) + * + * \return The build error code + */ + int compileBinaryToHSAIL(amd::option::Options* options //!< options for compilation + ); - virtual bool linkImpl(amd::option::Options* options); + virtual bool linkImpl(amd::option::Options* options); #if defined(WITH_LIGHTNING_COMPILER) - virtual bool linkImpl_LC(amd::option::Options* options); - bool setKernels_LC(amd::option::Options* options, void *binary, size_t binSize); -#endif // defined(WITH_LIGHTNING_COMPILER) + virtual bool linkImpl_LC(amd::option::Options* options); + bool setKernels_LC(amd::option::Options* options, void* binary, size_t binSize); +#endif // defined(WITH_LIGHTNING_COMPILER) - //! Link the device programs. - virtual bool linkImpl (const std::vector& inputPrograms, - amd::option::Options* options, - bool createLibrary); + //! Link the device programs. + virtual bool linkImpl(const std::vector& inputPrograms, amd::option::Options* options, + bool createLibrary); #if defined(WITH_LIGHTNING_COMPILER) - virtual bool linkImpl_LC(const std::vector& inputPrograms, - amd::option::Options* options, - bool createLibrary); -#endif // defined(WITH_LIGHTNING_COMPILER) + virtual bool linkImpl_LC(const std::vector& inputPrograms, + amd::option::Options* options, bool createLibrary); +#endif // defined(WITH_LIGHTNING_COMPILER) - virtual bool createBinary(amd::option::Options* options); + virtual bool createBinary(amd::option::Options* options); - //! Initialize Binary - virtual bool initClBinary(); + //! Initialize Binary + virtual bool initClBinary(); - //! Release the Binary - virtual void releaseClBinary(); + //! 
Release the Binary + virtual void releaseClBinary(); - virtual const aclTargetInfo & info(const char * str = ""){ - return info_; - } + virtual const aclTargetInfo& info(const char* str = "") { return info_; } - virtual bool isElf(const char* bin) const { - return amd::isElfMagic(bin); - //return false; - } + virtual bool isElf(const char* bin) const { + return amd::isElfMagic(bin); + // return false; + } - //! Returns the binary - // This should ensure that the binary is updated with all the kernels - // ClBinary& clBinary() { return binary_; } - ClBinary* clBinary() { - return static_cast(device::Program::clBinary()); - } - const ClBinary* clBinary() const { - return static_cast(device::Program::clBinary()); - } -private: - /* \brief Returns the next stage to compile from, based on sections in binary, - * also returns completeStages in a vector, which contains at least ACL_TYPE_DEFAULT, - * sets needOptionsCheck to true if options check is needed to decide whether or not to recompile - */ - aclType getCompilationStagesFromBinary(std::vector& completeStages, bool& needOptionsCheck); + //! 
Returns the binary + // This should ensure that the binary is updated with all the kernels + // ClBinary& clBinary() { return binary_; } + ClBinary* clBinary() { return static_cast(device::Program::clBinary()); } + const ClBinary* clBinary() const { + return static_cast(device::Program::clBinary()); + } - /* \brief Returns the next stage to compile from, based on sections and options in binary - */ - aclType getNextCompilationStageFromBinary(amd::option::Options* options); - bool saveBinaryAndSetType(type_t type, void* binary = nullptr, size_t size = 0); + private: + /* \brief Returns the next stage to compile from, based on sections in binary, + * also returns completeStages in a vector, which contains at least ACL_TYPE_DEFAULT, + * sets needOptionsCheck to true if options check is needed to decide whether or not to recompile + */ + aclType getCompilationStagesFromBinary(std::vector& completeStages, + bool& needOptionsCheck); - //! Disable default copy constructor - HSAILProgram(const HSAILProgram&) = delete; - //! Disable operator= - HSAILProgram& operator=(const HSAILProgram&) = delete; + /* \brief Returns the next stage to compile from, based on sections and options in binary + */ + aclType getNextCompilationStageFromBinary(amd::option::Options* options); + bool saveBinaryAndSetType(type_t type, void* binary = nullptr, size_t size = 0); - //! Returns all the options to be appended while passing to the - //compiler - std::string preprocessorOptions(amd::option::Options* options); - std::string codegenOptions(amd::option::Options* options); + //! Disable default copy constructor + HSAILProgram(const HSAILProgram&) = delete; + //! 
Disable operator= + HSAILProgram& operator=(const HSAILProgram&) = delete; - // aclBinary and aclCompiler - for the compiler library - aclBinary* binaryElf_; //!< Binary for the new compiler library - aclBinaryOptions binOpts_; //!< Binary options to create aclBinary - bool hasGlobalStores_; //!< program has writable program scope variables + //! Returns all the options to be appended while passing to the + // compiler + std::string preprocessorOptions(amd::option::Options* options); + std::string codegenOptions(amd::option::Options* options); - /* HSA executable */ - hsa_ext_program_t hsaProgramHandle_; //!< Handle to HSA runtime program - hsa_executable_t hsaExecutable_; //!< Handle to HSA executable + // aclBinary and aclCompiler - for the compiler library + aclBinary* binaryElf_; //!< Binary for the new compiler library + aclBinaryOptions binOpts_; //!< Binary options to create aclBinary + bool hasGlobalStores_; //!< program has writable program scope variables + + /* HSA executable */ + hsa_ext_program_t hsaProgramHandle_; //!< Handle to HSA runtime program + hsa_executable_t hsaExecutable_; //!< Handle to HSA executable #if defined(WITH_LIGHTNING_COMPILER) - CodeObjectMD* metadata_; //!< Runtime metadata - //! Return a new transient compiler instance. - static amd::opencl_driver::Compiler* newCompilerInstance(); -#endif // defined(WITH_LIGHTNING_COMPILER) + CodeObjectMD* metadata_; //!< Runtime metadata + //! Return a new transient compiler instance. + static amd::opencl_driver::Compiler* newCompilerInstance(); +#endif // defined(WITH_LIGHTNING_COMPILER) }; /*@}*/} // namespace roc #endif /*WITHOUT_HSA_BACKEND*/ - diff --git a/rocclr/runtime/device/rocm/rocregisters.hpp b/rocclr/runtime/device/rocm/rocregisters.hpp index f6736b6099..f521bcac56 100644 --- a/rocclr/runtime/device/rocm/rocregisters.hpp +++ b/rocclr/runtime/device/rocm/rocregisters.hpp @@ -22,177 +22,176 @@ WORD7 is defined in mesa but has no fields and isn't in GCN3 doc. 
Can I use thi namespace roc { - enum SQ_RSRC_IMG_TYPES { - SQ_RSRC_IMG_1D = 0x08, - SQ_RSRC_IMG_2D = 0x09, - SQ_RSRC_IMG_3D = 0x0A, - SQ_RSRC_IMG_CUBE = 0x0B, - SQ_RSRC_IMG_1D_ARRAY = 0x0C, - SQ_RSRC_IMG_2D_ARRAY = 0x0D, - SQ_RSRC_IMG_2D_MSAA = 0x0E, - SQ_RSRC_IMG_2D_MSAA_ARRAY = 0x0F - }; +enum SQ_RSRC_IMG_TYPES { + SQ_RSRC_IMG_1D = 0x08, + SQ_RSRC_IMG_2D = 0x09, + SQ_RSRC_IMG_3D = 0x0A, + SQ_RSRC_IMG_CUBE = 0x0B, + SQ_RSRC_IMG_1D_ARRAY = 0x0C, + SQ_RSRC_IMG_2D_ARRAY = 0x0D, + SQ_RSRC_IMG_2D_MSAA = 0x0E, + SQ_RSRC_IMG_2D_MSAA_ARRAY = 0x0F +}; - union SQ_IMG_RSRC_WORD0 { - struct { +union SQ_IMG_RSRC_WORD0 { + struct { #if defined(LITTLEENDIAN_CPU) - unsigned int base_address : 32; + unsigned int base_address : 32; #elif defined(BIGENDIAN_CPU) - unsigned int base_address : 32; + unsigned int base_address : 32; #endif - } bitfields, bits; - unsigned int u32_all; - signed int i32_all; - float f32_all; - }; + } bitfields, bits; + unsigned int u32_all; + signed int i32_all; + float f32_all; +}; - union SQ_IMG_RSRC_WORD1 { - struct { +union SQ_IMG_RSRC_WORD1 { + struct { #if defined(LITTLEENDIAN_CPU) - unsigned int base_address_hi : 8; - unsigned int min_lod : 12; - unsigned int data_format : 6; - unsigned int num_format : 4; - unsigned int mtype : 2; + unsigned int base_address_hi : 8; + unsigned int min_lod : 12; + unsigned int data_format : 6; + unsigned int num_format : 4; + unsigned int mtype : 2; #elif defined(BIGENDIAN_CPU) - unsigned int mtype : 2; - unsigned int num_format : 4; - unsigned int data_format : 6; - unsigned int min_lod : 12; - unsigned int base_address_hi : 8; + unsigned int mtype : 2; + unsigned int num_format : 4; + unsigned int data_format : 6; + unsigned int min_lod : 12; + unsigned int base_address_hi : 8; #endif - } bitfields, bits; - unsigned int u32_all; - signed int i32_all; - float f32_all; - }; + } bitfields, bits; + unsigned int u32_all; + signed int i32_all; + float f32_all; +}; - union SQ_IMG_RSRC_WORD2 { - struct { +union 
SQ_IMG_RSRC_WORD2 { + struct { #if defined(LITTLEENDIAN_CPU) - unsigned int width : 14; - unsigned int height : 14; - unsigned int perf_mod : 3; - unsigned int interlaced : 1; + unsigned int width : 14; + unsigned int height : 14; + unsigned int perf_mod : 3; + unsigned int interlaced : 1; #elif defined(BIGENDIAN_CPU) - unsigned int interlaced : 1; - unsigned int perf_mod : 3; - unsigned int height : 14; - unsigned int width : 14; + unsigned int interlaced : 1; + unsigned int perf_mod : 3; + unsigned int height : 14; + unsigned int width : 14; #endif - } bitfields, bits; - unsigned int u32_all; - signed int i32_all; - float f32_all; - }; + } bitfields, bits; + unsigned int u32_all; + signed int i32_all; + float f32_all; +}; - union SQ_IMG_RSRC_WORD3 { - struct { +union SQ_IMG_RSRC_WORD3 { + struct { #if defined(LITTLEENDIAN_CPU) - unsigned int dst_sel_x : 3; - unsigned int dst_sel_y : 3; - unsigned int dst_sel_z : 3; - unsigned int dst_sel_w : 3; - unsigned int base_level : 4; - unsigned int last_level : 4; - unsigned int tiling_index : 5; - unsigned int pow2_pad : 1; - unsigned int mtype : 1; - unsigned int atc : 1; - unsigned int type : 4; + unsigned int dst_sel_x : 3; + unsigned int dst_sel_y : 3; + unsigned int dst_sel_z : 3; + unsigned int dst_sel_w : 3; + unsigned int base_level : 4; + unsigned int last_level : 4; + unsigned int tiling_index : 5; + unsigned int pow2_pad : 1; + unsigned int mtype : 1; + unsigned int atc : 1; + unsigned int type : 4; #elif defined(BIGENDIAN_CPU) - unsigned int type : 4; - unsigned int atc : 1; - unsigned int mtype : 1; - unsigned int pow2_pad : 1; - unsigned int tiling_index : 5; - unsigned int last_level : 4; - unsigned int base_level : 4; - unsigned int dst_sel_w : 3; - unsigned int dst_sel_z : 3; - unsigned int dst_sel_y : 3; - unsigned int dst_sel_x : 3; + unsigned int type : 4; + unsigned int atc : 1; + unsigned int mtype : 1; + unsigned int pow2_pad : 1; + unsigned int tiling_index : 5; + unsigned int last_level : 4; + 
unsigned int base_level : 4; + unsigned int dst_sel_w : 3; + unsigned int dst_sel_z : 3; + unsigned int dst_sel_y : 3; + unsigned int dst_sel_x : 3; #endif - } bitfields, bits; - unsigned int u32_all; - signed int i32_all; - float f32_all; - }; + } bitfields, bits; + unsigned int u32_all; + signed int i32_all; + float f32_all; +}; - union SQ_IMG_RSRC_WORD4 { - struct { +union SQ_IMG_RSRC_WORD4 { + struct { #if defined(LITTLEENDIAN_CPU) - unsigned int depth : 13; - unsigned int pitch : 14; - unsigned int : 5; + unsigned int depth : 13; + unsigned int pitch : 14; + unsigned int : 5; #elif defined(BIGENDIAN_CPU) - unsigned int : 5; - unsigned int pitch : 14; - unsigned int depth : 13; + unsigned int : 5; + unsigned int pitch : 14; + unsigned int depth : 13; #endif - } bitfields, bits; - unsigned int u32_all; - signed int i32_all; - float f32_all; - }; + } bitfields, bits; + unsigned int u32_all; + signed int i32_all; + float f32_all; +}; - union SQ_IMG_RSRC_WORD5 { - struct { +union SQ_IMG_RSRC_WORD5 { + struct { #if defined(LITTLEENDIAN_CPU) - unsigned int base_array : 13; - unsigned int last_array : 13; - unsigned int : 6; + unsigned int base_array : 13; + unsigned int last_array : 13; + unsigned int : 6; #elif defined(BIGENDIAN_CPU) - unsigned int : 6; - unsigned int last_array : 13; - unsigned int base_array : 13; + unsigned int : 6; + unsigned int last_array : 13; + unsigned int base_array : 13; #endif - } bitfields, bits; - unsigned int u32_all; - signed int i32_all; - float f32_all; - }; + } bitfields, bits; + unsigned int u32_all; + signed int i32_all; + float f32_all; +}; - union SQ_IMG_RSRC_WORD6 { - struct { -#if defined(LITTLEENDIAN_CPU) - unsigned int min_lod_warn : 12; - unsigned int counter_bank_id : 8; - unsigned int lod_hdw_cnt_en : 1; - unsigned int compression_en : 1; - unsigned int alpha_is_on_msb : 1; - unsigned int color_transform : 1; - unsigned int lost_alpha_bits : 4; - unsigned int lost_color_bits : 4; -#elif defined(BIGENDIAN_CPU) - unsigned 
int lost_color_bits : 4; - unsigned int lost_alpha_bits : 4; - unsigned int color_transform : 1; - unsigned int alpha_is_on_msb : 1; - unsigned int compression_en : 1; - unsigned int lod_hdw_cnt_en : 1; - unsigned int counter_bank_id : 8; - unsigned int min_lod_warn : 12; +union SQ_IMG_RSRC_WORD6 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int min_lod_warn : 12; + unsigned int counter_bank_id : 8; + unsigned int lod_hdw_cnt_en : 1; + unsigned int compression_en : 1; + unsigned int alpha_is_on_msb : 1; + unsigned int color_transform : 1; + unsigned int lost_alpha_bits : 4; + unsigned int lost_color_bits : 4; +#elif defined(BIGENDIAN_CPU) + unsigned int lost_color_bits : 4; + unsigned int lost_alpha_bits : 4; + unsigned int color_transform : 1; + unsigned int alpha_is_on_msb : 1; + unsigned int compression_en : 1; + unsigned int lod_hdw_cnt_en : 1; + unsigned int counter_bank_id : 8; + unsigned int min_lod_warn : 12; #endif - } bitfields, bits; - unsigned int u32All; - signed int i32All; - float f32All; - }; + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; +}; - union SQ_IMG_RSRC_WORD7 { - struct { -#if defined(LITTLEENDIAN_CPU) - unsigned int meta_data_address : 32; -#elif defined(BIGENDIAN_CPU) - unsigned int meta_data_address : 32; +union SQ_IMG_RSRC_WORD7 { + struct { +#if defined(LITTLEENDIAN_CPU) + unsigned int meta_data_address : 32; +#elif defined(BIGENDIAN_CPU) + unsigned int meta_data_address : 32; #endif - } bitfields, bits; - unsigned int u32All; - signed int i32All; - float f32All; - }; - + } bitfields, bits; + unsigned int u32All; + signed int i32All; + float f32All; +}; } #endif diff --git a/rocclr/runtime/device/rocm/rocsettings.cpp b/rocclr/runtime/device/rocm/rocsettings.cpp index 07d36ca733..3bb8c9d9e3 100644 --- a/rocclr/runtime/device/rocm/rocsettings.cpp +++ b/rocclr/runtime/device/rocm/rocsettings.cpp @@ -12,160 +12,154 @@ namespace roc { -Settings::Settings() -{ - // Initialize the HSA device default 
settings +Settings::Settings() { + // Initialize the HSA device default settings - // Set this to true when we drop the flag - doublePrecision_ = ::CL_KHR_FP64; - pollCompletion_ = ENVVAR_HSA_POLL_KERNEL_COMPLETION; + // Set this to true when we drop the flag + doublePrecision_ = ::CL_KHR_FP64; + pollCompletion_ = ENVVAR_HSA_POLL_KERNEL_COMPLETION; - enableLocalMemory_ = HSA_LOCAL_MEMORY_ENABLE; - enableImageHandle_ = true; + enableLocalMemory_ = HSA_LOCAL_MEMORY_ENABLE; + enableImageHandle_ = true; - maxWorkGroupSize_ = 256; - maxWorkGroupSize2DX_ = 16; - maxWorkGroupSize2DY_ = 16; - maxWorkGroupSize3DX_ = 4; - maxWorkGroupSize3DY_ = 4; - maxWorkGroupSize3DZ_ = 4; + maxWorkGroupSize_ = 256; + maxWorkGroupSize2DX_ = 16; + maxWorkGroupSize2DY_ = 16; + maxWorkGroupSize3DX_ = 4; + maxWorkGroupSize3DY_ = 4; + maxWorkGroupSize3DZ_ = 4; - kernargPoolSize_ = HSA_KERNARG_POOL_SIZE; - signalPoolSize_ = HSA_SIGNAL_POOL_SIZE; + kernargPoolSize_ = HSA_KERNARG_POOL_SIZE; + signalPoolSize_ = HSA_SIGNAL_POOL_SIZE; - // Determine if user is requesting Non-Coherent mode - // for system memory. By default system memory is - // operates or is programmed to be in Coherent mode. - // Users can turn it off for hardware that does not - // support this feature naturally - char *nonCoherentMode = nullptr; - nonCoherentMode = getenv("OPENCL_USE_NC_MEMORY_POLICY"); - enableNCMode_ = (nonCoherentMode)? true : false; + // Determine if user is requesting Non-Coherent mode + // for system memory. By default system memory is + // operates or is programmed to be in Coherent mode. + // Users can turn it off for hardware that does not + // support this feature naturally + char* nonCoherentMode = nullptr; + nonCoherentMode = getenv("OPENCL_USE_NC_MEMORY_POLICY"); + enableNCMode_ = (nonCoherentMode) ? true : false; - // Determine if user wishes to disable support for - // partial dispatch. By default support for partial - // dispatch is enabled. 
Users can turn it off for - // devices that do not support this feature. - // - // @note Update appropriate field of device::Settings - char *partialDispatch = nullptr; - partialDispatch = getenv("OPENCL_DISABLE_PARTIAL_DISPATCH"); - enablePartialDispatch_ = (partialDispatch) ? false : true; - partialDispatch_ = (partialDispatch) ? false : true; - commandQueues_ = 100; //!< Field value set to maximum number - //!< concurrent Virtual GPUs for ROCm backend + // Determine if user wishes to disable support for + // partial dispatch. By default support for partial + // dispatch is enabled. Users can turn it off for + // devices that do not support this feature. + // + // @note Update appropriate field of device::Settings + char* partialDispatch = nullptr; + partialDispatch = getenv("OPENCL_DISABLE_PARTIAL_DISPATCH"); + enablePartialDispatch_ = (partialDispatch) ? false : true; + partialDispatch_ = (partialDispatch) ? false : true; + commandQueues_ = 100; //!< Field value set to maximum number + //!< concurrent Virtual GPUs for ROCm backend - // Disable image DMA by default (ROCM runtime doesn't support it) - imageDMA_ = false; + // Disable image DMA by default (ROCM runtime doesn't support it) + imageDMA_ = false; - stagedXferRead_ = true; - stagedXferWrite_ = true; - stagedXferSize_ = GPU_STAGING_BUFFER_SIZE * Ki; + stagedXferRead_ = true; + stagedXferWrite_ = true; + stagedXferSize_ = GPU_STAGING_BUFFER_SIZE * Ki; - // Initialize transfer buffer size to 1MB by default - xferBufSize_ = 1024 * Ki; + // Initialize transfer buffer size to 1MB by default + xferBufSize_ = 1024 * Ki; - const static size_t MaxPinnedXferSize = 32; - pinnedXferSize_ = std::min(GPU_PINNED_XFER_SIZE, MaxPinnedXferSize) * Mi; - pinnedMinXferSize_ = std::min(GPU_PINNED_MIN_XFER_SIZE * Ki, pinnedXferSize_); + const static size_t MaxPinnedXferSize = 32; + pinnedXferSize_ = std::min(GPU_PINNED_XFER_SIZE, MaxPinnedXferSize) * Mi; + pinnedMinXferSize_ = std::min(GPU_PINNED_MIN_XFER_SIZE * Ki, 
pinnedXferSize_); - // Don't support Denormals for single precision by default - singleFpDenorm_ = false; + // Don't support Denormals for single precision by default + singleFpDenorm_ = false; } -bool -Settings::create(bool fullProfile, int gfxipVersion) -{ - customHostAllocator_ = false; +bool Settings::create(bool fullProfile, int gfxipVersion) { + customHostAllocator_ = false; - if (fullProfile) { - pinnedXferSize_ = 0; - stagedXferSize_ = 0; - xferBufSize_ = 0; - } - else { - pinnedXferSize_ = std::max(pinnedXferSize_, pinnedMinXferSize_); - stagedXferSize_ = std::max(stagedXferSize_, pinnedMinXferSize_ + 4 * Ki); - } + if (fullProfile) { + pinnedXferSize_ = 0; + stagedXferSize_ = 0; + xferBufSize_ = 0; + } else { + pinnedXferSize_ = std::max(pinnedXferSize_, pinnedMinXferSize_); + stagedXferSize_ = std::max(stagedXferSize_, pinnedMinXferSize_ + 4 * Ki); + } - // Enable extensions - enableExtension(ClKhrByteAddressableStore); - enableExtension(ClKhrGlobalInt32BaseAtomics); - enableExtension(ClKhrGlobalInt32ExtendedAtomics); - enableExtension(ClKhrLocalInt32BaseAtomics); - enableExtension(ClKhrLocalInt32ExtendedAtomics); - enableExtension(ClKhrInt64BaseAtomics); - enableExtension(ClKhrInt64ExtendedAtomics); - enableExtension(ClKhr3DImageWrites); - enableExtension(ClAmdMediaOps); - enableExtension(ClAmdMediaOps2); - if(MesaInterop::Supported()) { - enableExtension(ClKhrGlSharing); - } + // Enable extensions + enableExtension(ClKhrByteAddressableStore); + enableExtension(ClKhrGlobalInt32BaseAtomics); + enableExtension(ClKhrGlobalInt32ExtendedAtomics); + enableExtension(ClKhrLocalInt32BaseAtomics); + enableExtension(ClKhrLocalInt32ExtendedAtomics); + enableExtension(ClKhrInt64BaseAtomics); + enableExtension(ClKhrInt64ExtendedAtomics); + enableExtension(ClKhr3DImageWrites); + enableExtension(ClAmdMediaOps); + enableExtension(ClAmdMediaOps2); + if (MesaInterop::Supported()) { + enableExtension(ClKhrGlSharing); + } - // Enable KHR double precision extension - 
enableExtension(ClKhrFp64); + // Enable KHR double precision extension + enableExtension(ClKhrFp64); #if !defined(WITH_LIGHTNING_COMPILER) - // Also enable AMD double precision extension? - enableExtension(ClAmdFp64); -#endif // !defined(WITH_LIGHTNING_COMPILER) - enableExtension(ClKhrSubGroups); + // Also enable AMD double precision extension? + enableExtension(ClAmdFp64); +#endif // !defined(WITH_LIGHTNING_COMPILER) + enableExtension(ClKhrSubGroups); - enableExtension(ClKhrDepthImages); - supportDepthsRGB_ = true; + enableExtension(ClKhrDepthImages); + supportDepthsRGB_ = true; #if defined(WITH_LIGHTNING_COMPILER) - switch (gfxipVersion) { + switch (gfxipVersion) { case 900: + singleFpDenorm_ = true; + break; + } +#endif // WITH_LIGHTNING_COMPILER + + // Override current device settings + override(); + + return true; +} + +void Settings::override() { + // Limit reported workgroup size + if (GPU_MAX_WORKGROUP_SIZE != 0) { + maxWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE; + } + + if (!flagIsDefault(GPU_MAX_COMMAND_QUEUES)) { + commandQueues_ = GPU_MAX_COMMAND_QUEUES; + } + + if (!flagIsDefault(GPU_IMAGE_DMA)) { + commandQueues_ = GPU_IMAGE_DMA; + } + + if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) { + xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki; + } + + if (!flagIsDefault(GPU_PINNED_MIN_XFER_SIZE)) { + pinnedMinXferSize_ = std::min(GPU_PINNED_MIN_XFER_SIZE * Ki, pinnedXferSize_); + } + + if (!flagIsDefault(AMD_GPU_FORCE_SINGLE_FP_DENORM)) { + switch (AMD_GPU_FORCE_SINGLE_FP_DENORM) { + case 0: + singleFpDenorm_ = false; + break; + case 1: singleFpDenorm_ = true; break; + default: + break; } -#endif // WITH_LIGHTNING_COMPILER - - // Override current device settings - override(); - - return true; + } } -void -Settings::override() -{ - // Limit reported workgroup size - if (GPU_MAX_WORKGROUP_SIZE != 0) { - maxWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE; - } +} // namespace roc - if (!flagIsDefault(GPU_MAX_COMMAND_QUEUES)) { - commandQueues_ = GPU_MAX_COMMAND_QUEUES; - } - - if 
(!flagIsDefault(GPU_IMAGE_DMA)) { - commandQueues_ = GPU_IMAGE_DMA; - } - - if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) { - xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki; - } - - if (!flagIsDefault(GPU_PINNED_MIN_XFER_SIZE)) { - pinnedMinXferSize_ = std::min(GPU_PINNED_MIN_XFER_SIZE * Ki, pinnedXferSize_); - } - - if (!flagIsDefault(AMD_GPU_FORCE_SINGLE_FP_DENORM)) { - switch (AMD_GPU_FORCE_SINGLE_FP_DENORM) { - case 0: - singleFpDenorm_ = false; - break; - case 1: - singleFpDenorm_ = true; - break; - default: - break; - } - } -} - -} // namespace roc - -#endif // WITHOUT_GPU_BACKEND +#endif // WITHOUT_GPU_BACKEND diff --git a/rocclr/runtime/device/rocm/rocsettings.hpp b/rocclr/runtime/device/rocm/rocsettings.hpp index 40f0e75dfa..d09d93044b 100644 --- a/rocclr/runtime/device/rocm/rocsettings.hpp +++ b/rocclr/runtime/device/rocm/rocsettings.hpp @@ -15,64 +15,62 @@ namespace roc { //! Device settings -class Settings : public device::Settings -{ -public: - union { - struct { - uint doublePrecision_: 1; //!< Enables double precision support - uint pollCompletion_: 1; //!< Enables polling in HSA - uint enableLocalMemory_ : 1; //!< Enable GPUVM memory - uint enableImageHandle_: 1; //!< Use HSAIL image/sampler pointer - uint enableNCMode_: 1; //!< Enable Non Coherent mode for system memory - uint enablePartialDispatch_: 1; //!< Enable support for Partial Dispatch - uint imageDMA_: 1; //!< Enable direct image DMA transfers - uint stagedXferRead_: 1; //!< Uses a staged buffer read - uint stagedXferWrite_: 1; //!< Uses a staged buffer write - uint singleFpDenorm_: 1; //!< Support Single FP Denorm - uint reserved_: 21; - }; - uint value_; +class Settings : public device::Settings { + public: + union { + struct { + uint doublePrecision_ : 1; //!< Enables double precision support + uint pollCompletion_ : 1; //!< Enables polling in HSA + uint enableLocalMemory_ : 1; //!< Enable GPUVM memory + uint enableImageHandle_ : 1; //!< Use HSAIL image/sampler pointer + uint enableNCMode_ : 1; 
//!< Enable Non Coherent mode for system memory + uint enablePartialDispatch_ : 1; //!< Enable support for Partial Dispatch + uint imageDMA_ : 1; //!< Enable direct image DMA transfers + uint stagedXferRead_ : 1; //!< Uses a staged buffer read + uint stagedXferWrite_ : 1; //!< Uses a staged buffer write + uint singleFpDenorm_ : 1; //!< Support Single FP Denorm + uint reserved_ : 21; }; + uint value_; + }; - //! Default max workgroup size for 1D - int maxWorkGroupSize_; + //! Default max workgroup size for 1D + int maxWorkGroupSize_; - //! Default max workgroup sizes for 2D - int maxWorkGroupSize2DX_; - int maxWorkGroupSize2DY_; + //! Default max workgroup sizes for 2D + int maxWorkGroupSize2DX_; + int maxWorkGroupSize2DY_; - //! Default max workgroup sizes for 3D - int maxWorkGroupSize3DX_; - int maxWorkGroupSize3DY_; - int maxWorkGroupSize3DZ_; + //! Default max workgroup sizes for 3D + int maxWorkGroupSize3DX_; + int maxWorkGroupSize3DY_; + int maxWorkGroupSize3DZ_; - uint kernargPoolSize_; - uint signalPoolSize_; + uint kernargPoolSize_; + uint signalPoolSize_; - size_t xferBufSize_; //!< Transfer buffer size for image copy optimization - size_t stagedXferSize_; //!< Staged buffer size - size_t pinnedXferSize_; //!< Pinned buffer size for transfer - size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer + size_t xferBufSize_; //!< Transfer buffer size for image copy optimization + size_t stagedXferSize_; //!< Staged buffer size + size_t pinnedXferSize_; //!< Pinned buffer size for transfer + size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer - //! Default constructor - Settings(); + //! Default constructor + Settings(); - //! Creates settings - bool create(bool fullProfile, int gfxipVersion); + //! Creates settings + bool create(bool fullProfile, int gfxipVersion); -private: - //! Disable copy constructor - Settings(const Settings&); + private: + //! Disable copy constructor + Settings(const Settings&); - //! 
Disable assignment - Settings& operator=(const Settings&); + //! Disable assignment + Settings& operator=(const Settings&); - //! Overrides current settings based on registry/environment - void override(); + //! Overrides current settings based on registry/environment + void override(); }; /*@}*/} // namespace roc #endif /*WITHOUT_HSA_BACKEND*/ - diff --git a/rocclr/runtime/device/rocm/rocvirtual.cpp b/rocclr/runtime/device/rocm/rocvirtual.cpp index 490f376e47..0e214a7cb8 100644 --- a/rocclr/runtime/device/rocm/rocvirtual.cpp +++ b/rocclr/runtime/device/rocm/rocvirtual.cpp @@ -29,7 +29,7 @@ /** * HSA image object alignment in bytes (see HSAIL spec) */ -#define HSA_IMAGE_OBJECT_ALIGNMENT 16 +#define HSA_IMAGE_OBJECT_ALIGNMENT 16 /** * HSA sampler object size in bytes (see HSAIL spec) @@ -39,11 +39,12 @@ /** * HSA sampler object alignment in bytes (see HSAIL spec) */ -#define HSA_SAMPLER_OBJECT_ALIGNMENT 16 +#define HSA_SAMPLER_OBJECT_ALIGNMENT 16 namespace roc { // (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) invalidates I, K and L1 -// (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE) invalidates L1, L2 and flushes L2 +// (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE) invalidates L1, L2 and flushes +// L2 static const uint16_t kDispatchPacketHeaderNoSync = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | @@ -51,47 +52,43 @@ static const uint16_t kDispatchPacketHeaderNoSync = (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); static const uint16_t kDispatchPacketHeader = - (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | - (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); static const uint16_t kBarrierPacketHeader = - (HSA_PACKET_TYPE_BARRIER_AND << 
HSA_PACKET_HEADER_TYPE) | - (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); static const uint16_t kBarrierPacketAcquireHeader = - (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | - (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); static const uint16_t kBarrierPacketReleaseHeader = - (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | - (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); -static const hsa_barrier_and_packet_t kBarrierAcquirePacket = - {kBarrierPacketAcquireHeader,0,0,0,0,0,0,0,0,0}; +static const hsa_barrier_and_packet_t kBarrierAcquirePacket = { + kBarrierPacketAcquireHeader, 0, 0, 0, 0, 0, 0, 0, 0, 0}; -static const hsa_barrier_and_packet_t kBarrierReleasePacket = - {kBarrierPacketReleaseHeader,0,0,0,0,0,0,0,0,0}; +static const hsa_barrier_and_packet_t kBarrierReleasePacket = { + kBarrierPacketReleaseHeader, 0, 0, 0, 0, 0, 0, 0, 0, 0}; -double Timestamp::ticksToTime_=0; +double Timestamp::ticksToTime_ = 0; /** * Set the ocl correlation handle (essentially the cl_event handle) * to correlate the cl kernel launch and HSA kernel dispatch */ -typedef hsa_status_t - (*hsa_ext_tools_set_correlation_handle)(const hsa_agent_t agent, - void *correlation_handle); -static void SetOclCorrelationHandle(void *tools_lib, const hsa_agent_t agent, void *handle) { +typedef hsa_status_t 
(*hsa_ext_tools_set_correlation_handle)(const hsa_agent_t agent, + void* correlation_handle); +static void SetOclCorrelationHandle(void* tools_lib, const hsa_agent_t agent, void* handle) { hsa_ext_tools_set_correlation_handle func = - (hsa_ext_tools_set_correlation_handle)amd::Os::getSymbol(tools_lib, "hsa_ext_tools_set_correlation_handler"); + (hsa_ext_tools_set_correlation_handle)amd::Os::getSymbol( + tools_lib, "hsa_ext_tools_set_correlation_handler"); if (func) { func(agent, handle); } @@ -99,333 +96,294 @@ static void SetOclCorrelationHandle(void *tools_lib, const hsa_agent_t agent, vo return; } -bool -VirtualGPU::MemoryDependency::create(size_t numMemObj) -{ - if (numMemObj > 0) { - // Allocate the array of memory objects for dependency tracking - memObjectsInQueue_ = new MemoryState[numMemObj]; - if (nullptr == memObjectsInQueue_) { - return false; - } - memset(memObjectsInQueue_, 0, sizeof(MemoryState) * numMemObj); - maxMemObjectsInQueue_ = numMemObj; +bool VirtualGPU::MemoryDependency::create(size_t numMemObj) { + if (numMemObj > 0) { + // Allocate the array of memory objects for dependency tracking + memObjectsInQueue_ = new MemoryState[numMemObj]; + if (nullptr == memObjectsInQueue_) { + return false; } + memset(memObjectsInQueue_, 0, sizeof(MemoryState) * numMemObj); + maxMemObjectsInQueue_ = numMemObj; + } - return true; + return true; } -void -VirtualGPU::MemoryDependency::validate( - VirtualGPU& gpu, - const Memory* memory, - bool readOnly) -{ - bool flushL1Cache = false; +void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memory, bool readOnly) { + bool flushL1Cache = false; - if (maxMemObjectsInQueue_ == 0) { - // Sync AQL packets - gpu.setAqlHeader(kDispatchPacketHeader); - return; + if (maxMemObjectsInQueue_ == 0) { + // Sync AQL packets + gpu.setAqlHeader(kDispatchPacketHeader); + return; + } + + uint64_t curStart = reinterpret_cast(memory->getDeviceMemory()); + uint64_t curEnd = curStart + memory->size(); + + // Loop 
through all memory objects in the queue and find dependency + // @note don't include objects from the current kernel + for (size_t j = 0; j < endMemObjectsInQueue_; ++j) { + // Check if the queue already contains this mem object and + // GPU operations aren't readonly + uint64_t busyStart = memObjectsInQueue_[j].start_; + uint64_t busyEnd = memObjectsInQueue_[j].end_; + + // Check if the start inside the busy region + if ((((curStart >= busyStart) && (curStart < busyEnd)) || + // Check if the end inside the busy region + ((curEnd > busyStart) && (curEnd <= busyEnd)) || + // Check if the start/end cover the busy region + ((curStart <= busyStart) && (curEnd >= busyEnd))) && + // If the buys region was written or the current one is for write + (!memObjectsInQueue_[j].readOnly_ || !readOnly)) { + flushL1Cache = true; + break; } + } - uint64_t curStart = reinterpret_cast(memory->getDeviceMemory()); - uint64_t curEnd = curStart + memory->size(); + // Did we reach the limit? + if (maxMemObjectsInQueue_ <= (numMemObjectsInQueue_ + 1)) { + flushL1Cache = true; + } - // Loop through all memory objects in the queue and find dependency - // @note don't include objects from the current kernel - for (size_t j = 0; j < endMemObjectsInQueue_; ++j) { - // Check if the queue already contains this mem object and - // GPU operations aren't readonly - uint64_t busyStart = memObjectsInQueue_[j].start_; - uint64_t busyEnd = memObjectsInQueue_[j].end_; + if (flushL1Cache) { + // Sync AQL packets + gpu.setAqlHeader(kDispatchPacketHeader); - // Check if the start inside the busy region - if ((((curStart >= busyStart) && (curStart < busyEnd)) || - // Check if the end inside the busy region - ((curEnd > busyStart) && (curEnd <= busyEnd)) || - // Check if the start/end cover the busy region - ((curStart <= busyStart) && (curEnd >= busyEnd))) && - // If the buys region was written or the current one is for write - (!memObjectsInQueue_[j].readOnly_ || !readOnly)) { - flushL1Cache = true; - 
break; - } - } + // Clear memory dependency state + const static bool All = true; + clear(!All); + } - // Did we reach the limit? - if (maxMemObjectsInQueue_ <= (numMemObjectsInQueue_ + 1)) { - flushL1Cache = true; - } - - if (flushL1Cache) { - // Sync AQL packets - gpu.setAqlHeader(kDispatchPacketHeader); - - // Clear memory dependency state - const static bool All = true; - clear(!All); - } - - // Insert current memory object into the queue always, - // since runtime calls flush before kernel execution and it has to keep - // current kernel in tracking - memObjectsInQueue_ - [numMemObjectsInQueue_].start_ = curStart; - memObjectsInQueue_ - [numMemObjectsInQueue_].end_ = curEnd; - memObjectsInQueue_ - [numMemObjectsInQueue_].readOnly_ = readOnly; - numMemObjectsInQueue_++; + // Insert current memory object into the queue always, + // since runtime calls flush before kernel execution and it has to keep + // current kernel in tracking + memObjectsInQueue_[numMemObjectsInQueue_].start_ = curStart; + memObjectsInQueue_[numMemObjectsInQueue_].end_ = curEnd; + memObjectsInQueue_[numMemObjectsInQueue_].readOnly_ = readOnly; + numMemObjectsInQueue_++; } -void -VirtualGPU::MemoryDependency::clear(bool all) -{ - if (numMemObjectsInQueue_ > 0) { - size_t i, j; - if (all) { - endMemObjectsInQueue_ = numMemObjectsInQueue_; - } - - // Preserve all objects from the current kernel - for (i = 0, j = endMemObjectsInQueue_; j < numMemObjectsInQueue_; i++, j++) { - memObjectsInQueue_[i].start_ = memObjectsInQueue_[j].start_; - memObjectsInQueue_[i].end_ = memObjectsInQueue_[j].end_; - memObjectsInQueue_[i].readOnly_ = memObjectsInQueue_[j].readOnly_; - } - // Clear all objects except current kernel - memset(&memObjectsInQueue_[i], 0, sizeof(amd::Memory*) * numMemObjectsInQueue_); - numMemObjectsInQueue_ -= endMemObjectsInQueue_; - endMemObjectsInQueue_ = 0; +void VirtualGPU::MemoryDependency::clear(bool all) { + if (numMemObjectsInQueue_ > 0) { + size_t i, j; + if (all) { + 
endMemObjectsInQueue_ = numMemObjectsInQueue_; } + + // Preserve all objects from the current kernel + for (i = 0, j = endMemObjectsInQueue_; j < numMemObjectsInQueue_; i++, j++) { + memObjectsInQueue_[i].start_ = memObjectsInQueue_[j].start_; + memObjectsInQueue_[i].end_ = memObjectsInQueue_[j].end_; + memObjectsInQueue_[i].readOnly_ = memObjectsInQueue_[j].readOnly_; + } + // Clear all objects except current kernel + memset(&memObjectsInQueue_[i], 0, sizeof(amd::Memory*) * numMemObjectsInQueue_); + numMemObjectsInQueue_ -= endMemObjectsInQueue_; + endMemObjectsInQueue_ = 0; + } } -bool -VirtualGPU::processMemObjects( - const amd::Kernel& kernel, - const_address params) -{ - static const bool NoAlias = true; - const Kernel& hsaKernel = static_cast - (*(kernel.getDeviceKernel(dev(), NoAlias))); - const amd::KernelSignature& signature = kernel.signature(); - const amd::KernelParameters& kernelParams = kernel.parameters(); +bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address params) { + static const bool NoAlias = true; + const Kernel& hsaKernel = static_cast(*(kernel.getDeviceKernel(dev(), NoAlias))); + const amd::KernelSignature& signature = kernel.signature(); + const amd::KernelParameters& kernelParams = kernel.parameters(); - // AQL packets - setAqlHeader(kDispatchPacketHeaderNoSync); + // AQL packets + setAqlHeader(kDispatchPacketHeaderNoSync); - // Mark the tracker with a new kernel, - // so we can avoid checks of the aliased objects - memoryDependency().newKernel(); + // Mark the tracker with a new kernel, + // so we can avoid checks of the aliased objects + memoryDependency().newKernel(); - bool deviceSupportFGS = 0 != dev().isFineGrainedSystem(true); - bool supportFineGrainedSystem = deviceSupportFGS; - FGSStatus status = kernelParams.getSvmSystemPointersSupport(); - switch (status) { - case FGS_YES: - if (!deviceSupportFGS) { - return false; - } - supportFineGrainedSystem = true; - break; - case FGS_NO: - supportFineGrainedSystem = 
false; - break; - case FGS_DEFAULT: - default: - break; - } + bool deviceSupportFGS = 0 != dev().isFineGrainedSystem(true); + bool supportFineGrainedSystem = deviceSupportFGS; + FGSStatus status = kernelParams.getSvmSystemPointersSupport(); + switch (status) { + case FGS_YES: + if (!deviceSupportFGS) { + return false; + } + supportFineGrainedSystem = true; + break; + case FGS_NO: + supportFineGrainedSystem = false; + break; + case FGS_DEFAULT: + default: + break; + } - size_t count = kernelParams.getNumberOfSvmPtr(); - size_t execInfoOffset = kernelParams.getExecInfoOffset(); - bool sync = true; + size_t count = kernelParams.getNumberOfSvmPtr(); + size_t execInfoOffset = kernelParams.getExecInfoOffset(); + bool sync = true; - amd::Memory* memory = nullptr; - //get svm non arugment information - void* const* svmPtrArray = - reinterpret_cast(params + execInfoOffset); - for (size_t i = 0; i < count; i++) { - memory = amd::SvmManager::FindSvmBuffer(svmPtrArray[i]); - if (nullptr == memory) { - if (!supportFineGrainedSystem) { - return false; - } - else if (sync) { - // Sync AQL packets - setAqlHeader(kDispatchPacketHeader); - // Clear memory dependency state - const static bool All = true; - memoryDependency().clear(!All); - continue; - } - } - else { - Memory* rocMemory = static_cast(memory->getDeviceMemory(dev())); - if (nullptr != rocMemory) { - // Synchronize data with other memory instances if necessary - rocMemory->syncCacheFromHost(*this); - - const static bool IsReadOnly = false; - // Validate SVM passed in the non argument list - memoryDependency().validate(*this, rocMemory, IsReadOnly); - } - else { - return false; - } - } - } - - // Check all parameters for the current kernel - for (size_t i = 0; i < signature.numParameters(); ++i) { - const amd::KernelParameterDescriptor& desc = signature.at(i); - const Kernel::Argument* arg = hsaKernel.hsailArgAt(i); - Memory* memory = nullptr; - bool readOnly = false; - amd::Memory* svmMem = nullptr; - - // Find if 
current argument is a buffer - if ((desc.type_ == T_POINTER) && (arg->addrQual_ != ROC_ADDRESS_LOCAL)) { - if (kernelParams.boundToSvmPointer(dev(), params, i)) { - svmMem = amd::SvmManager::FindSvmBuffer( - *reinterpret_cast(params + desc.offset_)); - if (!svmMem) { - // Sync AQL packets - setAqlHeader(kDispatchPacketHeader); - // Clear memory dependency state - const static bool All = true; - memoryDependency().clear(!All); - continue; - } - } - - if (*reinterpret_cast - (params + desc.offset_) != nullptr) { - if (nullptr == svmMem) { - memory = static_cast((*reinterpret_cast - (params + desc.offset_))->getDeviceMemory(dev())); - } - else { - memory = static_cast(svmMem->getDeviceMemory(dev())); - } - // Don't sync for internal objects, - // since they are not shared between devices - if (memory->owner()->getVirtualDevice() == nullptr) { - // Synchronize data with other memory instances if necessary - memory->syncCacheFromHost(*this); - } - } - - if (memory != nullptr) { - readOnly |= (arg->access_ == ROC_ACCESS_TYPE_RO); - // Validate memory for a dependency in the queue - memoryDependency().validate(*this, memory, readOnly); - } - } - } - - if (hsaKernel.program()->hasGlobalStores()) { + amd::Memory* memory = nullptr; + // get svm non arugment information + void* const* svmPtrArray = reinterpret_cast(params + execInfoOffset); + for (size_t i = 0; i < count; i++) { + memory = amd::SvmManager::FindSvmBuffer(svmPtrArray[i]); + if (nullptr == memory) { + if (!supportFineGrainedSystem) { + return false; + } else if (sync) { // Sync AQL packets setAqlHeader(kDispatchPacketHeader); // Clear memory dependency state const static bool All = true; memoryDependency().clear(!All); - } + continue; + } + } else { + Memory* rocMemory = static_cast(memory->getDeviceMemory(dev())); + if (nullptr != rocMemory) { + // Synchronize data with other memory instances if necessary + rocMemory->syncCacheFromHost(*this); - return true; + const static bool IsReadOnly = false; + // Validate 
SVM passed in the non argument list + memoryDependency().validate(*this, rocMemory, IsReadOnly); + } else { + return false; + } + } + } + + // Check all parameters for the current kernel + for (size_t i = 0; i < signature.numParameters(); ++i) { + const amd::KernelParameterDescriptor& desc = signature.at(i); + const Kernel::Argument* arg = hsaKernel.hsailArgAt(i); + Memory* memory = nullptr; + bool readOnly = false; + amd::Memory* svmMem = nullptr; + + // Find if current argument is a buffer + if ((desc.type_ == T_POINTER) && (arg->addrQual_ != ROC_ADDRESS_LOCAL)) { + if (kernelParams.boundToSvmPointer(dev(), params, i)) { + svmMem = + amd::SvmManager::FindSvmBuffer(*reinterpret_cast(params + desc.offset_)); + if (!svmMem) { + // Sync AQL packets + setAqlHeader(kDispatchPacketHeader); + // Clear memory dependency state + const static bool All = true; + memoryDependency().clear(!All); + continue; + } + } + + if (*reinterpret_cast(params + desc.offset_) != nullptr) { + if (nullptr == svmMem) { + memory = + static_cast((*reinterpret_cast(params + desc.offset_)) + ->getDeviceMemory(dev())); + } else { + memory = static_cast(svmMem->getDeviceMemory(dev())); + } + // Don't sync for internal objects, + // since they are not shared between devices + if (memory->owner()->getVirtualDevice() == nullptr) { + // Synchronize data with other memory instances if necessary + memory->syncCacheFromHost(*this); + } + } + + if (memory != nullptr) { + readOnly |= (arg->access_ == ROC_ACCESS_TYPE_RO); + // Validate memory for a dependency in the queue + memoryDependency().validate(*this, memory, readOnly); + } + } + } + + if (hsaKernel.program()->hasGlobalStores()) { + // Sync AQL packets + setAqlHeader(kDispatchPacketHeader); + // Clear memory dependency state + const static bool All = true; + memoryDependency().clear(!All); + } + + return true; } -template -bool -VirtualGPU::dispatchGenericAqlPacket( - AqlPacket* packet, - bool blocking) -{ - const uint32_t queueSize = 
gpu_queue_->size; - const uint32_t queueMask = queueSize - 1; +template +bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, bool blocking) { + const uint32_t queueSize = gpu_queue_->size; + const uint32_t queueMask = queueSize - 1; - //Check for queue full and wait if needed. - uint64_t index = hsa_queue_load_write_index_relaxed(gpu_queue_); - uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_); - hsa_signal_t signal; + // Check for queue full and wait if needed. + uint64_t index = hsa_queue_load_write_index_relaxed(gpu_queue_); + uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_); + hsa_signal_t signal; - // TODO: placeholder to setup the kernel to populate start and end timestamp. - if (timestamp_ != nullptr) { - // Find signal slot - ProfilingSignal* profilingSignal = &signal_pool_[index & queueMask]; - // Make sure we save the old results in the TS structure - if (profilingSignal->ts_ != nullptr) { - profilingSignal->ts_->checkGpuTime(); - } - // Update the new TS with the signal info - timestamp_->setProfilingSignal(profilingSignal); - packet->completion_signal = profilingSignal->signal_; - profilingSignal->ts_ = timestamp_; - timestamp_->setAgent(gpu_device_); + // TODO: placeholder to setup the kernel to populate start and end timestamp. 
+ if (timestamp_ != nullptr) { + // Find signal slot + ProfilingSignal* profilingSignal = &signal_pool_[index & queueMask]; + // Make sure we save the old results in the TS structure + if (profilingSignal->ts_ != nullptr) { + profilingSignal->ts_->checkGpuTime(); + } + // Update the new TS with the signal info + timestamp_->setProfilingSignal(profilingSignal); + packet->completion_signal = profilingSignal->signal_; + profilingSignal->ts_ = timestamp_; + timestamp_->setAgent(gpu_device_); + } + + if (blocking || (index - read) == queueMask) { + if (packet->completion_signal.handle == 0) { + packet->completion_signal = barrier_signal_; + } + signal = packet->completion_signal; + // Initialize signal for a wait + hsa_signal_store_relaxed(signal, InitSignalValue); + blocking = true; + } + + // Insert packet + ((AqlPacket*)(gpu_queue_->base_address))[index & queueMask] = *packet; + hsa_queue_store_write_index_release(gpu_queue_, index + 1); + hsa_signal_store_relaxed(gpu_queue_->doorbell_signal, index); + + // Wait on signal ? + if (blocking) { + if (hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, uint64_t(-1), + HSA_WAIT_STATE_BLOCKED) != 0) { + LogPrintfError("Failed signal [0x%lx] wait", signal.handle); + return false; } - if (blocking || (index - read) == queueMask) { - if (packet->completion_signal.handle == 0) { - packet->completion_signal = barrier_signal_; - } - signal = packet->completion_signal; - // Initialize signal for a wait - hsa_signal_store_relaxed(signal, InitSignalValue); - blocking = true; - } + // Release the pool, since runtime just drained the entire queue + resetKernArgPool(); + } - //Insert packet - ((AqlPacket*)( - gpu_queue_->base_address))[index & queueMask] = *packet; - hsa_queue_store_write_index_release(gpu_queue_, index + 1); - hsa_signal_store_relaxed(gpu_queue_->doorbell_signal, index); - - //Wait on signal ? 
- if (blocking) { - if (hsa_signal_wait_acquire( - signal, HSA_SIGNAL_CONDITION_LT, 1, uint64_t(-1), - HSA_WAIT_STATE_BLOCKED) != 0) { - LogPrintfError("Failed signal [0x%lx] wait", signal.handle); - return false; - } - - // Release the pool, since runtime just drained the entire queue - resetKernArgPool(); - } - - return true; + return true; } -bool -VirtualGPU::dispatchAqlPacket( - hsa_kernel_dispatch_packet_t* packet, - bool blocking) -{ +bool VirtualGPU::dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, bool blocking) { return dispatchGenericAqlPacket(packet, blocking); } -bool -VirtualGPU::dispatchAqlPacket( - hsa_barrier_and_packet_t* packet, - bool blocking) -{ +bool VirtualGPU::dispatchAqlPacket(hsa_barrier_and_packet_t* packet, bool blocking) { return dispatchGenericAqlPacket(packet, blocking); } -void -VirtualGPU::dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet) -{ - assert(packet->completion_signal.handle != 0); - const uint32_t queueSize = gpu_queue_->size; - const uint32_t queueMask = queueSize - 1; +void VirtualGPU::dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet) { + assert(packet->completion_signal.handle != 0); + const uint32_t queueSize = gpu_queue_->size; + const uint32_t queueMask = queueSize - 1; - uint64_t index = hsa_queue_load_write_index_relaxed(gpu_queue_); - ((hsa_barrier_and_packet_t*)( - gpu_queue_->base_address))[index&queueMask] = *packet; + uint64_t index = hsa_queue_load_write_index_relaxed(gpu_queue_); + ((hsa_barrier_and_packet_t*)(gpu_queue_->base_address))[index & queueMask] = *packet; - hsa_queue_store_write_index_relaxed(gpu_queue_, index + 1); + hsa_queue_store_write_index_relaxed(gpu_queue_, index + 1); - hsa_signal_store_relaxed(gpu_queue_->doorbell_signal, index); + hsa_signal_store_relaxed(gpu_queue_->doorbell_signal, index); } /** @@ -436,274 +394,256 @@ VirtualGPU::dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet) * otherwise */ bool VirtualGPU::releaseGpuMemoryFence() { - // 
Return if there is no pending dispatch - if (!hasPendingDispatch_) { + // Return if there is no pending dispatch + if (!hasPendingDispatch_) { + return false; + } + + // Initialize signal for the barrier packet. + hsa_signal_store_relaxed(barrier_signal_, InitSignalValue); + + // Dispatch barrier packet into the queue and wait till it finishes. + dispatchBarrierPacket(&barrier_packet_); + if (hsa_signal_wait_acquire(barrier_signal_, HSA_SIGNAL_CONDITION_EQ, 0, uint64_t(-1), + HSA_WAIT_STATE_BLOCKED) != 0) { + LogError("Barrier packet submission failed"); + return false; + } + + hasPendingDispatch_ = false; + + // Release all transfer buffers on this command queue + releaseXferWrite(); + + // Release all memory dependencies + memoryDependency().clear(); + + // Release the pool, since runtime just completed a barrier + resetKernArgPool(); + + return true; +} + +VirtualGPU::VirtualGPU(Device& device) + : device::VirtualDevice(device), + roc_device_(device), + index_(device.numOfVgpus_++) // Virtual gpu unique index incrementing +{ + gpu_device_ = device.getBackendDevice(); + printfdbg_ = nullptr; + + // Initialize the last signal and dispatch flags + timestamp_ = nullptr; + hasPendingDispatch_ = false; + tools_lib_ = nullptr; + + kernarg_pool_base_ = nullptr; + kernarg_pool_size_ = 0; + kernarg_pool_cur_offset_ = 0; + aqlHeader_ = kDispatchPacketHeaderNoSync; + barrier_signal_.handle = 0; +} + +VirtualGPU::~VirtualGPU() { + releasePinnedMem(); + + if (timestamp_ != nullptr) { + delete timestamp_; + timestamp_ = nullptr; + LogError("There was a timestamp that was not used; deleting."); + } + if (printfdbg_ != nullptr) { + delete printfdbg_; + printfdbg_ = nullptr; + } + + tools_lib_ = nullptr; + --roc_device_.numOfVgpus_; // Virtual gpu unique index decrementing +} + +bool VirtualGPU::create(bool profilingEna) { + // Set the event handle to the tools lib if the env var + // Load the library using its advertised "soname" + std::string lib_name = 
amd::Os::getEnvironment("HSA_TOOLS_LIB"); + if (lib_name != "") { +#if defined(_WIN32) || defined(__CYGWIN__) + const char* tools_lib_name = "hsa-runtime-tools" LP64_SWITCH("", "64") ".dll"; +#else + const char* tools_lib_name = "libhsa-runtime-tools" LP64_SWITCH("", "64") ".so.1"; +#endif + tools_lib_ = amd::Os::loadLibrary(tools_lib_name); + } + + // Checking Virtual gpu unique index for ROCm backend + if (index() > device().settings().commandQueues_) { + return false; + } + + uint32_t queue_max_packets = 0; + if (HSA_STATUS_SUCCESS != + hsa_agent_get_info(gpu_device_, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_max_packets)) { + return false; + } + + // Pick a reasonable queue size + uint32_t queue_size = 1024; + queue_size = (queue_max_packets < queue_size) ? queue_max_packets : queue_size; + while (hsa_queue_create(gpu_device_, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr, nullptr, UINT_MAX, + UINT_MAX, &gpu_queue_) != HSA_STATUS_SUCCESS) { + queue_size >>= 1; + if (queue_size < 64) { return false; } + } - // Initialize signal for the barrier packet. - hsa_signal_store_relaxed(barrier_signal_, InitSignalValue); + if (!initPool(dev().settings().kernargPoolSize_, (profilingEna) ? queue_size : 0)) { + LogError("Couldn't allocate arguments/signals for the queue"); + return false; + } - // Dispatch barrier packet into the queue and wait till it finishes. - dispatchBarrierPacket(&barrier_packet_); - if (hsa_signal_wait_acquire( - barrier_signal_, HSA_SIGNAL_CONDITION_EQ, 0, uint64_t(-1), - HSA_WAIT_STATE_BLOCKED) != 0) { - LogError("Barrier packet submission failed"); - return false; - } + device::BlitManager::Setup blitSetup; + blitMgr_ = new KernelBlitManager(*this, blitSetup); + if ((nullptr == blitMgr_) || !blitMgr_->create(roc_device_)) { + LogError("Could not create BlitManager!"); + return false; + } - hasPendingDispatch_ = false; + // Create signal for the barrier packet. 
+ hsa_signal_t signal = {0}; + if (HSA_STATUS_SUCCESS != hsa_signal_create(InitSignalValue, 0, nullptr, &signal)) { + return false; + } + barrier_signal_ = signal; - // Release all transfer buffers on this command queue - releaseXferWrite(); + // Initialize barrier packet. + memset(&barrier_packet_, 0, sizeof(barrier_packet_)); + barrier_packet_.header = kBarrierPacketHeader; + barrier_packet_.completion_signal = barrier_signal_; - // Release all memory dependencies - memoryDependency().clear(); + // Create a object of PrintfDbg + printfdbg_ = new PrintfDbg(roc_device_); + if (nullptr == printfdbg_) { + LogError("\nCould not create printfDbg Object!"); + return false; + } - // Release the pool, since runtime just completed a barrier - resetKernArgPool(); + // Initialize timestamp conversion factor + if (Timestamp::getGpuTicksToTime() == 0) { + uint64_t frequency; + hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &frequency); + Timestamp::setGpuTicksToTime(1e9 / double(frequency)); + } - return true; + if (!memoryDependency().create(GPU_NUM_MEM_DEPENDENCY)) { + LogError("Could not create the array of memory objects!"); + return false; + } + + return true; } -VirtualGPU::VirtualGPU(Device &device) - : device::VirtualDevice(device) - , roc_device_(device) - , index_(device.numOfVgpus_++) // Virtual gpu unique index incrementing -{ - gpu_device_ = device.getBackendDevice(); - printfdbg_ = nullptr; +bool VirtualGPU::terminate() { + delete blitMgr_; - // Initialize the last signal and dispatch flags - timestamp_ = nullptr; - hasPendingDispatch_ = false; + // Release the resources of signal + releaseGpuMemoryFence(); + hsa_status_t err = hsa_queue_destroy(gpu_queue_); + if (err != HSA_STATUS_SUCCESS) { + return false; + } + + if (barrier_signal_.handle != 0) { + hsa_signal_destroy(barrier_signal_); + } + + if (tools_lib_) { + amd::Os::unloadLibrary(tools_lib_); tools_lib_ = nullptr; + } - kernarg_pool_base_ = nullptr; - kernarg_pool_size_ = 0; - 
kernarg_pool_cur_offset_ = 0; - aqlHeader_ = kDispatchPacketHeaderNoSync; - barrier_signal_.handle = 0; + destroyPool(); + + return true; } -VirtualGPU::~VirtualGPU() -{ - releasePinnedMem(); +bool VirtualGPU::initPool(size_t kernarg_pool_size, uint signal_pool_count) { + kernarg_pool_size_ = kernarg_pool_size; + kernarg_pool_base_ = reinterpret_cast(roc_device_.hostAlloc(kernarg_pool_size_, 1, true)); + if (kernarg_pool_base_ == nullptr) { + return false; + } - if (timestamp_ != nullptr) { - delete timestamp_; - timestamp_ = nullptr; - LogError("There was a timestamp that was not used; deleting."); - } - if (printfdbg_ != nullptr){ - delete printfdbg_; - printfdbg_ = nullptr; + if (signal_pool_count != 0) { + signal_pool_.resize(signal_pool_count); + for (uint i = 0; i < signal_pool_count; ++i) { + ProfilingSignal profilingSignal; + if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &profilingSignal.signal_)) { + return false; + } + signal_pool_[i] = profilingSignal; } + } - tools_lib_ = nullptr; - --roc_device_.numOfVgpus_; // Virtual gpu unique index decrementing + return true; } -bool -VirtualGPU::create(bool profilingEna) -{ - // Set the event handle to the tools lib if the env var - // Load the library using its advertised "soname" - std::string lib_name = amd::Os::getEnvironment("HSA_TOOLS_LIB"); - if (lib_name != "") { -#if defined(_WIN32) || defined(__CYGWIN__) - const char *tools_lib_name = "hsa-runtime-tools" LP64_SWITCH("", "64") ".dll"; -#else - const char *tools_lib_name = "libhsa-runtime-tools" LP64_SWITCH("", "64") ".so.1"; -#endif - tools_lib_ = amd::Os::loadLibrary(tools_lib_name); +void VirtualGPU::destroyPool() { + if (kernarg_pool_base_ != nullptr) { + roc_device_.hostFree(kernarg_pool_base_, kernarg_pool_size_); + } + + if (signal_pool_.size() > 0) { + for (uint i = 0; i < signal_pool_.size(); ++i) { + hsa_signal_destroy(signal_pool_[i].signal_); } - - // Checking Virtual gpu unique index for ROCm backend - if (index() > 
device().settings().commandQueues_) { - return false; - } - - uint32_t queue_max_packets = 0; - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info( - gpu_device_, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_max_packets)) { - return false; - } - - //Pick a reasonable queue size - uint32_t queue_size = 1024; - queue_size = (queue_max_packets < queue_size) ? queue_max_packets : queue_size; - while (hsa_queue_create(gpu_device_, - queue_size, HSA_QUEUE_TYPE_MULTI, nullptr, nullptr, UINT_MAX, UINT_MAX, - &gpu_queue_) != HSA_STATUS_SUCCESS) { - queue_size >>= 1; - if (queue_size < 64) { - return false; - } - } - - if (!initPool(dev().settings().kernargPoolSize_, (profilingEna) ? queue_size : 0)) { - LogError("Couldn't allocate arguments/signals for the queue"); - return false; - } - - device::BlitManager::Setup blitSetup; - blitMgr_ = new KernelBlitManager(*this, blitSetup); - if ((nullptr == blitMgr_) || !blitMgr_->create(roc_device_)) { - LogError("Could not create BlitManager!"); - return false; - } - - // Create signal for the barrier packet. - hsa_signal_t signal = { 0 }; - if (HSA_STATUS_SUCCESS != - hsa_signal_create(InitSignalValue, 0, nullptr, &signal)) { - return false; - } - barrier_signal_ = signal; - - // Initialize barrier packet. 
- memset(&barrier_packet_, 0, sizeof(barrier_packet_)); - barrier_packet_.header = kBarrierPacketHeader; - barrier_packet_.completion_signal = barrier_signal_; - - // Create a object of PrintfDbg - printfdbg_ = new PrintfDbg(roc_device_); - if (nullptr == printfdbg_) { - LogError("\nCould not create printfDbg Object!"); - return false; - } - - // Initialize timestamp conversion factor - if (Timestamp::getGpuTicksToTime() == 0) { - uint64_t frequency; - hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &frequency); - Timestamp::setGpuTicksToTime(1e9/double(frequency)); - } - - if (!memoryDependency().create(GPU_NUM_MEM_DEPENDENCY)) { - LogError("Could not create the array of memory objects!"); - return false; - } - - return true; + } } -bool -VirtualGPU::terminate() -{ - delete blitMgr_; +void* VirtualGPU::allocKernArg(size_t size, size_t alignment) { + char* result = nullptr; + do { + result = amd::alignUp(kernarg_pool_base_ + kernarg_pool_cur_offset_, alignment); + const size_t pool_new_usage = (result + size) - kernarg_pool_base_; + if (pool_new_usage <= kernarg_pool_size_) { + kernarg_pool_cur_offset_ = pool_new_usage; + return result; + } else { + //! We run out of the arguments space! + //! That means the app didn't call clFlush/clFinish for very long time. + //! We can issue a barrier to avoid expensive extra memory allocations. - // Release the resources of signal - releaseGpuMemoryFence(); - hsa_status_t err = hsa_queue_destroy(gpu_queue_); - if (err != HSA_STATUS_SUCCESS) { - return false; + // Initialize signal for the barrier packet. + hsa_signal_store_relaxed(barrier_signal_, InitSignalValue); + + // Dispatch barrier packet into the queue and wait till it finishes. 
+ dispatchBarrierPacket(&barrier_packet_); + if (hsa_signal_wait_acquire(barrier_signal_, HSA_SIGNAL_CONDITION_EQ, 0, uint64_t(-1), + HSA_WAIT_STATE_BLOCKED) != 0) { + LogError("Kernel arguments reset failed"); + } + + resetKernArgPool(); } + } while (true); - if (barrier_signal_.handle != 0) { - hsa_signal_destroy(barrier_signal_); - } - - if (tools_lib_) { - amd::Os::unloadLibrary(tools_lib_); - tools_lib_ = nullptr; - } - - destroyPool(); - - return true; -} - -bool -VirtualGPU::initPool(size_t kernarg_pool_size, uint signal_pool_count) -{ - kernarg_pool_size_ = kernarg_pool_size; - kernarg_pool_base_ = reinterpret_cast( - roc_device_.hostAlloc(kernarg_pool_size_, 1, true)); - if (kernarg_pool_base_ == nullptr) { - return false; - } - - if (signal_pool_count != 0) { - signal_pool_.resize(signal_pool_count); - for (uint i = 0; i < signal_pool_count; ++i) { - ProfilingSignal profilingSignal; - if (HSA_STATUS_SUCCESS != hsa_signal_create( - 0, 0, nullptr, &profilingSignal.signal_)) { - return false; - } - signal_pool_[i] = profilingSignal; - } - } - - return true; -} - -void -VirtualGPU::destroyPool() { - if (kernarg_pool_base_ != nullptr) { - roc_device_.hostFree(kernarg_pool_base_, kernarg_pool_size_); - } - - if (signal_pool_.size() > 0) { - for (uint i = 0; i < signal_pool_.size(); ++i) { - hsa_signal_destroy(signal_pool_[i].signal_); - } - } -} - -void* -VirtualGPU::allocKernArg(size_t size, size_t alignment) -{ - char* result = nullptr; - do { - result = amd::alignUp(kernarg_pool_base_ + kernarg_pool_cur_offset_, alignment); - const size_t pool_new_usage = (result + size) - kernarg_pool_base_; - if (pool_new_usage <= kernarg_pool_size_) { - kernarg_pool_cur_offset_ = pool_new_usage; - return result; - } - else { - //! We run out of the arguments space! - //! That means the app didn't call clFlush/clFinish for very long time. - //! We can issue a barrier to avoid expensive extra memory allocations. - - // Initialize signal for the barrier packet. 
- hsa_signal_store_relaxed(barrier_signal_, InitSignalValue); - - // Dispatch barrier packet into the queue and wait till it finishes. - dispatchBarrierPacket(&barrier_packet_); - if (hsa_signal_wait_acquire( - barrier_signal_, HSA_SIGNAL_CONDITION_EQ, 0, uint64_t(-1), - HSA_WAIT_STATE_BLOCKED) != 0) { - LogError("Kernel arguments reset failed"); - } - - resetKernArgPool(); - } - } while (true); - - return result; + return result; } /* profilingBegin, when profiling is enabled, creates a timestamp to save in * virtualgpu's timestamp_, and calls start() to get the current host * timestamp. */ -void VirtualGPU::profilingBegin(amd::Command &command, bool drmProfiling) -{ - if (command.profilingInfo().enabled_) { - if (timestamp_ != nullptr) { - LogWarning("Trying to create a second timestamp in VirtualGPU. \ +void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) { + if (command.profilingInfo().enabled_) { + if (timestamp_ != nullptr) { + LogWarning( + "Trying to create a second timestamp in VirtualGPU. \ This could have unintended consequences."); - return; - } - timestamp_ = new Timestamp; - timestamp_->start(); + return; } + timestamp_ = new Timestamp; + timestamp_->start(); + } } /* profilingEnd, when profiling is enabled, checks to see if a signal was @@ -711,721 +651,646 @@ void VirtualGPU::profilingBegin(amd::Command &command, bool drmProfiling) * current host timestamp if no signal is available. It then saves the pointer * timestamp_ to the command's data. 
*/ -void VirtualGPU::profilingEnd(amd::Command &command) -{ - if (command.profilingInfo().enabled_) { - if (timestamp_->getProfilingSignal() == nullptr) { - timestamp_->end(); - } - command.setData(reinterpret_cast(timestamp_)); - timestamp_ = nullptr; +void VirtualGPU::profilingEnd(amd::Command& command) { + if (command.profilingInfo().enabled_) { + if (timestamp_->getProfilingSignal() == nullptr) { + timestamp_->end(); } + command.setData(reinterpret_cast(timestamp_)); + timestamp_ = nullptr; + } } -struct DestroySampler : public std::binary_function { - bool operator() (hsa_ext_sampler_t &sampler, - hsa_agent_t agent) const { - hsa_status_t status = hsa_ext_sampler_destroy(agent, sampler); - return status == HSA_STATUS_SUCCESS; - } +struct DestroySampler : public std::binary_function { + bool operator()(hsa_ext_sampler_t& sampler, hsa_agent_t agent) const { + hsa_status_t status = hsa_ext_sampler_destroy(agent, sampler); + return status == HSA_STATUS_SUCCESS; + } }; -void VirtualGPU::updateCommandsState(amd::Command *list) -{ - Timestamp *ts = nullptr; +void VirtualGPU::updateCommandsState(amd::Command* list) { + Timestamp* ts = nullptr; - amd::Command* current = list; - amd::Command* next = nullptr; + amd::Command* current = list; + amd::Command* next = nullptr; - if (current == nullptr) { - return; - } + if (current == nullptr) { + return; + } - uint64_t endTimeStamp = 0; - uint64_t startTimeStamp = endTimeStamp; + uint64_t endTimeStamp = 0; + uint64_t startTimeStamp = endTimeStamp; - if (current->profilingInfo().enabled_) { - // TODO: use GPU timestamp when available. - endTimeStamp = amd::Os::timeNanos(); - startTimeStamp = endTimeStamp; + if (current->profilingInfo().enabled_) { + // TODO: use GPU timestamp when available. + endTimeStamp = amd::Os::timeNanos(); + startTimeStamp = endTimeStamp; - // This block gets the first valid timestamp from the first command - // that has one. 
This timestamp is used below to mark any command that - // came before it to start and end with this first valid start time. - current = list; - while (current != nullptr) { - if (current->data() != nullptr) { - ts = reinterpret_cast(current->data()); - startTimeStamp = ts->getStart(); - endTimeStamp = ts->getStart(); - break; - } - current = current->getNext(); - } - } - - // Iterate through the list of commands, and set timestamps as appropriate - // Note, if a command does not have a timestamp, it does one of two things: - // - if the command (without a timestamp), A, precedes another command, C, - // that _does_ contain a valid timestamp, command A will set RUNNING and - // COMPLETE with the RUNNING (start) timestamp from command C. This would - // also be true for command B, which is between A and C. These timestamps - // are actually retrieved in the block above (startTimeStamp, endTimeStamp). - // - if the command (without a timestamp), C, follows another command, A, - // that has a valid timestamp, command C will be set RUNNING and COMPLETE - // with the COMPLETE (end) timestamp of the previous command, A. This is - // also true for any command B, which falls between A and C. + // This block gets the first valid timestamp from the first command + // that has one. This timestamp is used below to mark any command that + // came before it to start and end with this first valid start time. current = list; while (current != nullptr) { - if (current->profilingInfo().enabled_) { - if (current->data() != nullptr) { - // Since this is a valid command to get a timestamp, we use the - // timestamp provided by the runtime (saved in the data()) - ts = reinterpret_cast(current->data()); - startTimeStamp = ts->getStart(); - endTimeStamp = ts->getEnd(); - delete ts; - current->setData(nullptr); - } - else { - // If we don't have a command that contains a valid timestamp, - // we simply use the end timestamp of the previous command. 
- // Note, if this is a command before the first valid timestamp, - // this will be equal to the start timestamp of the first valid - // timestamp at this point. - startTimeStamp = endTimeStamp; - } - } + if (current->data() != nullptr) { + ts = reinterpret_cast(current->data()); + startTimeStamp = ts->getStart(); + endTimeStamp = ts->getStart(); + break; + } + current = current->getNext(); + } + } - if (current->status() == CL_SUBMITTED) { - current->setStatus(CL_RUNNING, startTimeStamp); - current->setStatus(CL_COMPLETE, endTimeStamp); - } - else if (current->status() != CL_COMPLETE) { - LogPrintfError("Unexpected command status - %d.", current->status()); - } - - next = current->getNext(); - current->release(); - current = next; + // Iterate through the list of commands, and set timestamps as appropriate + // Note, if a command does not have a timestamp, it does one of two things: + // - if the command (without a timestamp), A, precedes another command, C, + // that _does_ contain a valid timestamp, command A will set RUNNING and + // COMPLETE with the RUNNING (start) timestamp from command C. This would + // also be true for command B, which is between A and C. These timestamps + // are actually retrieved in the block above (startTimeStamp, endTimeStamp). + // - if the command (without a timestamp), C, follows another command, A, + // that has a valid timestamp, command C will be set RUNNING and COMPLETE + // with the COMPLETE (end) timestamp of the previous command, A. This is + // also true for any command B, which falls between A and C. 
+ current = list; + while (current != nullptr) { + if (current->profilingInfo().enabled_) { + if (current->data() != nullptr) { + // Since this is a valid command to get a timestamp, we use the + // timestamp provided by the runtime (saved in the data()) + ts = reinterpret_cast(current->data()); + startTimeStamp = ts->getStart(); + endTimeStamp = ts->getEnd(); + delete ts; + current->setData(nullptr); + } else { + // If we don't have a command that contains a valid timestamp, + // we simply use the end timestamp of the previous command. + // Note, if this is a command before the first valid timestamp, + // this will be equal to the start timestamp of the first valid + // timestamp at this point. + startTimeStamp = endTimeStamp; + } } - // Release the sampler handles allocated for the various - // on one or more kernel submissions - std::for_each(samplerList_.begin(), - samplerList_.end(), - std::bind2nd(DestroySampler(), gpu_device_)); - samplerList_.clear(); + if (current->status() == CL_SUBMITTED) { + current->setStatus(CL_RUNNING, startTimeStamp); + current->setStatus(CL_COMPLETE, endTimeStamp); + } else if (current->status() != CL_COMPLETE) { + LogPrintfError("Unexpected command status - %d.", current->status()); + } - return; + next = current->getNext(); + current->release(); + current = next; + } + + // Release the sampler handles allocated for the various + // on one or more kernel submissions + std::for_each(samplerList_.begin(), samplerList_.end(), + std::bind2nd(DestroySampler(), gpu_device_)); + samplerList_.clear(); + + return; } -void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand &cmd) -{ - // Wait on a kernel if one is outstanding - releaseGpuMemoryFence(); +void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) { + // Wait on a kernel if one is outstanding + releaseGpuMemoryFence(); - profilingBegin(cmd); + profilingBegin(cmd); - size_t offset = 0; - // Find if virtual address is a CL allocation - device::Memory* hostMemory = 
dev().findMemoryFromVA(cmd.destination(), &offset); + size_t offset = 0; + // Find if virtual address is a CL allocation + device::Memory* hostMemory = dev().findMemoryFromVA(cmd.destination(), &offset); - Memory* devMem = dev().getRocMemory(&cmd.source()); - // Synchronize data with other memory instances if necessary - devMem->syncCacheFromHost(*this); + Memory* devMem = dev().getRocMemory(&cmd.source()); + // Synchronize data with other memory instances if necessary + devMem->syncCacheFromHost(*this); - void *dst = cmd.destination(); - amd::Coord3D size = cmd.size(); + void* dst = cmd.destination(); + amd::Coord3D size = cmd.size(); - //! @todo: add multi-devices synchronization when supported. + //! @todo: add multi-devices synchronization when supported. - cl_command_type type = cmd.type(); - bool result = false; - bool imageBuffer = false; + cl_command_type type = cmd.type(); + bool result = false; + bool imageBuffer = false; - // Force buffer read for IMAGE1D_BUFFER - if ((type == CL_COMMAND_READ_IMAGE) && - (cmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { - type = CL_COMMAND_READ_BUFFER; - imageBuffer = true; + // Force buffer read for IMAGE1D_BUFFER + if ((type == CL_COMMAND_READ_IMAGE) && (cmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { + type = CL_COMMAND_READ_BUFFER; + imageBuffer = true; + } + + switch (type) { + case CL_COMMAND_READ_BUFFER: { + amd::Coord3D origin(cmd.origin()[0]); + if (imageBuffer) { + size_t elemSize = cmd.source().asImage()->getImageFormat().getElementSize(); + origin.c[0] *= elemSize; + size.c[0] *= elemSize; + } + if (hostMemory != nullptr) { + // Accelerated transfer without pinning + amd::Coord3D dstOrigin(offset); + result = blitMgr().copyBuffer(*devMem, *hostMemory, origin, dstOrigin, size, + cmd.isEntireMemory()); + } else { + result = blitMgr().readBuffer(*devMem, dst, origin, size, cmd.isEntireMemory()); + } + break; } - - switch (type) { - case CL_COMMAND_READ_BUFFER: { - amd::Coord3D 
origin(cmd.origin()[0]); - if (imageBuffer) { - size_t elemSize = - cmd.source().asImage()->getImageFormat().getElementSize(); - origin.c[0] *= elemSize; - size.c[0] *= elemSize; - } - if (hostMemory != nullptr) { - // Accelerated transfer without pinning - amd::Coord3D dstOrigin(offset); - result = blitMgr().copyBuffer(*devMem, *hostMemory, - origin, dstOrigin, size, cmd.isEntireMemory()); - } - else { - result = blitMgr().readBuffer( - *devMem, dst, origin, size, - cmd.isEntireMemory()); - } - break; - } - case CL_COMMAND_READ_BUFFER_RECT: { - result = blitMgr().readBufferRect( - *devMem, dst, cmd.bufRect(), cmd.hostRect(), size, - cmd.isEntireMemory()); - break; - } - case CL_COMMAND_READ_IMAGE: { - result = blitMgr().readImage( - *devMem, dst, cmd.origin(), size, cmd.rowPitch(), - cmd.slicePitch(), cmd.isEntireMemory()); - break; - } - default: - ShouldNotReachHere(); - break; + case CL_COMMAND_READ_BUFFER_RECT: { + result = blitMgr().readBufferRect(*devMem, dst, cmd.bufRect(), cmd.hostRect(), size, + cmd.isEntireMemory()); + break; } - - if (!result) { - LogError("submitReadMemory failed!"); - cmd.setStatus(CL_OUT_OF_RESOURCES); + case CL_COMMAND_READ_IMAGE: { + result = blitMgr().readImage(*devMem, dst, cmd.origin(), size, cmd.rowPitch(), + cmd.slicePitch(), cmd.isEntireMemory()); + break; } + default: + ShouldNotReachHere(); + break; + } - profilingEnd(cmd); + if (!result) { + LogError("submitReadMemory failed!"); + cmd.setStatus(CL_OUT_OF_RESOURCES); + } + + profilingEnd(cmd); } -void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand &cmd) -{ - // Wait on a kernel if one is outstanding - releaseGpuMemoryFence(); +void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& cmd) { + // Wait on a kernel if one is outstanding + releaseGpuMemoryFence(); - profilingBegin(cmd); + profilingBegin(cmd); - size_t offset = 0; - // Find if virtual address is a CL allocation - device::Memory* hostMemory = dev().findMemoryFromVA(cmd.source(), &offset); + size_t offset 
= 0; + // Find if virtual address is a CL allocation + device::Memory* hostMemory = dev().findMemoryFromVA(cmd.source(), &offset); - Memory* devMem = dev().getRocMemory(&cmd.destination()); + Memory* devMem = dev().getRocMemory(&cmd.destination()); - // Synchronize memory from host if necessary - device::Memory::SyncFlags syncFlags; - syncFlags.skipEntire_ = cmd.isEntireMemory(); - devMem->syncCacheFromHost(*this, syncFlags); + // Synchronize memory from host if necessary + device::Memory::SyncFlags syncFlags; + syncFlags.skipEntire_ = cmd.isEntireMemory(); + devMem->syncCacheFromHost(*this, syncFlags); - const char* src = static_cast(cmd.source()); - amd::Coord3D size = cmd.size(); + const char* src = static_cast(cmd.source()); + amd::Coord3D size = cmd.size(); - //! @todo add multi-devices synchronization when supported. + //! @todo add multi-devices synchronization when supported. - cl_command_type type = cmd.type(); - bool result = false; - bool imageBuffer = false; + cl_command_type type = cmd.type(); + bool result = false; + bool imageBuffer = false; - // Force buffer write for IMAGE1D_BUFFER - if ((type == CL_COMMAND_WRITE_IMAGE) && - (cmd.destination().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { - type = CL_COMMAND_WRITE_BUFFER; - imageBuffer = true; + // Force buffer write for IMAGE1D_BUFFER + if ((type == CL_COMMAND_WRITE_IMAGE) && + (cmd.destination().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { + type = CL_COMMAND_WRITE_BUFFER; + imageBuffer = true; + } + + switch (type) { + case CL_COMMAND_WRITE_BUFFER: { + amd::Coord3D origin(cmd.origin()[0]); + if (imageBuffer) { + size_t elemSize = cmd.destination().asImage()->getImageFormat().getElementSize(); + origin.c[0] *= elemSize; + size.c[0] *= elemSize; + } + if (hostMemory != nullptr) { + // Accelerated transfer without pinning + amd::Coord3D srcOrigin(offset); + result = blitMgr().copyBuffer(*hostMemory, *devMem, srcOrigin, origin, size, + cmd.isEntireMemory()); + } else { + result = 
blitMgr().writeBuffer(src, *devMem, origin, size, cmd.isEntireMemory()); + } + break; } - - switch (type) { - case CL_COMMAND_WRITE_BUFFER: { - amd::Coord3D origin(cmd.origin()[0]); - if (imageBuffer) { - size_t elemSize = - cmd.destination().asImage()->getImageFormat().getElementSize(); - origin.c[0] *= elemSize; - size.c[0] *= elemSize; - } - if (hostMemory != nullptr) { - // Accelerated transfer without pinning - amd::Coord3D srcOrigin(offset); - result = blitMgr().copyBuffer(*hostMemory, *devMem, - srcOrigin, origin, size, cmd.isEntireMemory()); - } - else { - result = blitMgr().writeBuffer( - src, *devMem , origin, size, - cmd.isEntireMemory()); - } - break; - } - case CL_COMMAND_WRITE_BUFFER_RECT: { - result = blitMgr().writeBufferRect( - src, *devMem, cmd.hostRect(), cmd.bufRect(), size, - cmd.isEntireMemory()); - break; - } - case CL_COMMAND_WRITE_IMAGE: { - result = blitMgr().writeImage( - src, *devMem, cmd.origin(), size, cmd.rowPitch(), - cmd.slicePitch(), cmd.isEntireMemory()); - break; - } - default: - ShouldNotReachHere(); - break; + case CL_COMMAND_WRITE_BUFFER_RECT: { + result = blitMgr().writeBufferRect(src, *devMem, cmd.hostRect(), cmd.bufRect(), size, + cmd.isEntireMemory()); + break; } - - if (!result) { - LogError("submitWriteMemory failed!"); - cmd.setStatus(CL_OUT_OF_RESOURCES); - } - else { - cmd.destination().signalWrite(&dev()); - } - - profilingEnd(cmd); -} - -void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd) -{ - // in-order semantics: previous commands need to be done before we start - releaseGpuMemoryFence(); - - profilingBegin(cmd); - const std::vector& svmPointers = cmd.svmPointers(); - if (cmd.pfnFreeFunc() == nullptr) { - // pointers allocated using clSVMAlloc - for (cl_uint i = 0; i < svmPointers.size(); i++) { - amd::SvmBuffer::free(cmd.context(), svmPointers[i]); - } - } - else { - cmd.pfnFreeFunc()(as_cl(cmd.queue()->asCommandQueue()), svmPointers.size(), - (void**) (&(svmPointers[0])), cmd.userData()); - } - 
profilingEnd(cmd); -} - -void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) -{ - // in-order semantics: previous commands need to be done before we start - releaseGpuMemoryFence(); - profilingBegin(cmd); - amd::SvmBuffer::memFill(cmd.dst(), cmd.src(), cmd.srcSize(), 1); - profilingEnd(cmd); -} - -void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) -{ - // in-order semantics: previous commands need to be done before we start - releaseGpuMemoryFence(); - profilingBegin(cmd); - amd::SvmBuffer::memFill(cmd.dst(), cmd.pattern(), cmd.patternSize(), cmd.times()); - profilingEnd(cmd); -} - -void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand &cmd) -{ - // Wait on a kernel if one is outstanding - releaseGpuMemoryFence(); - - profilingBegin(cmd); - - Memory* srcDevMem = dev().getRocMemory(&cmd.source()); - Memory* dstDevMem = dev().getRocMemory(&cmd.destination()); - - // Synchronize source and destination memory - device::Memory::SyncFlags syncFlags; - syncFlags.skipEntire_ = cmd.isEntireMemory(); - dstDevMem->syncCacheFromHost(*this, syncFlags); - srcDevMem->syncCacheFromHost(*this); - - amd::Coord3D size = cmd.size(); - - cl_command_type type = cmd.type(); - bool result = false; - bool srcImageBuffer = false; - bool dstImageBuffer = false; - - // Force buffer copy for IMAGE1D_BUFFER - if (cmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) { - srcImageBuffer = true; - type = CL_COMMAND_COPY_BUFFER; - } - if (cmd.destination().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) { - dstImageBuffer = true; - type = CL_COMMAND_COPY_BUFFER; - } - - switch (cmd.type()) { - case CL_COMMAND_COPY_BUFFER: { - amd::Coord3D srcOrigin(cmd.srcOrigin()[0]); - amd::Coord3D dstOrigin(cmd.dstOrigin()[0]); - - if (srcImageBuffer) { - const size_t elemSize = - cmd.source().asImage()->getImageFormat().getElementSize(); - srcOrigin.c[0] *= elemSize; - if (dstImageBuffer) { - dstOrigin.c[0] *= elemSize; - } - size.c[0] *= elemSize; - } - else if 
(dstImageBuffer) { - const size_t elemSize = - cmd.destination().asImage()->getImageFormat().getElementSize(); - dstOrigin.c[0] *= elemSize; - size.c[0] *= elemSize; - } - - result = blitMgr().copyBuffer( - *srcDevMem, *dstDevMem, srcOrigin, - dstOrigin, size, cmd.isEntireMemory()); - break; - } - case CL_COMMAND_COPY_BUFFER_RECT: { - result = blitMgr().copyBufferRect( - *srcDevMem, *dstDevMem, cmd.srcRect(), - cmd.dstRect(), size, cmd.isEntireMemory()); - break; - } - case CL_COMMAND_COPY_IMAGE: { - result = blitMgr().copyImage( - *srcDevMem, *dstDevMem, cmd.srcOrigin(), - cmd.dstOrigin(), size, cmd.isEntireMemory()); - break; - } - case CL_COMMAND_COPY_IMAGE_TO_BUFFER: { - result = blitMgr().copyImageToBuffer( - *srcDevMem, *dstDevMem, cmd.srcOrigin(), - cmd.dstOrigin(), size, cmd.isEntireMemory()); - break; - } - case CL_COMMAND_COPY_BUFFER_TO_IMAGE: { - result = blitMgr().copyBufferToImage( - *srcDevMem, *dstDevMem, cmd.srcOrigin(), - cmd.dstOrigin(), size, cmd.isEntireMemory()); - break; - } - default: - ShouldNotReachHere(); - break; - } - - if (!result) { - LogError("submitCopyMemory failed!"); - cmd.setStatus(CL_OUT_OF_RESOURCES); + case CL_COMMAND_WRITE_IMAGE: { + result = blitMgr().writeImage(src, *devMem, cmd.origin(), size, cmd.rowPitch(), + cmd.slicePitch(), cmd.isEntireMemory()); + break; } + default: + ShouldNotReachHere(); + break; + } + if (!result) { + LogError("submitWriteMemory failed!"); + cmd.setStatus(CL_OUT_OF_RESOURCES); + } else { cmd.destination().signalWrite(&dev()); + } - profilingEnd(cmd); + profilingEnd(cmd); } -void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) -{ - // No fence is needed since this is a no-op: the - // command will be completed only after all the - // previous commands are complete - profilingBegin(cmd); - profilingEnd(cmd); +void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd) { + // in-order semantics: previous commands need to be done before we start + releaseGpuMemoryFence(); + + 
profilingBegin(cmd); + const std::vector& svmPointers = cmd.svmPointers(); + if (cmd.pfnFreeFunc() == nullptr) { + // pointers allocated using clSVMAlloc + for (cl_uint i = 0; i < svmPointers.size(); i++) { + amd::SvmBuffer::free(cmd.context(), svmPointers[i]); + } + } else { + cmd.pfnFreeFunc()(as_cl(cmd.queue()->asCommandQueue()), svmPointers.size(), + (void**)(&(svmPointers[0])), cmd.userData()); + } + profilingEnd(cmd); } -void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) -{ - // No fence is needed since this is a no-op: the - // command will be completed only after all the - // previous commands are complete - profilingBegin(cmd); - profilingEnd(cmd); +void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) { + // in-order semantics: previous commands need to be done before we start + releaseGpuMemoryFence(); + profilingBegin(cmd); + amd::SvmBuffer::memFill(cmd.dst(), cmd.src(), cmd.srcSize(), 1); + profilingEnd(cmd); } -void VirtualGPU::submitMapMemory(amd::MapMemoryCommand &cmd) -{ - // Wait on a kernel if one is outstanding - releaseGpuMemoryFence(); - - profilingBegin(cmd); - - //! @todo add multi-devices synchronization when supported. - - roc::Memory* devMemory = reinterpret_cast( - cmd.memory().getDeviceMemory(dev(), false)); - - cl_command_type type = cmd.type(); - bool imageBuffer = false; - - // Save map requirement. - cl_map_flags mapFlag = cmd.mapFlags(); - - // Treat no map flag as read-write. - if (mapFlag == 0) { - mapFlag = CL_MAP_READ | CL_MAP_WRITE; - } - - devMemory->saveMapInfo(cmd.mapPtr(), cmd.origin(), cmd.size(), - mapFlag, cmd.isEntireMemory()); - - // Sync to the map target. 
- // If we have host memory, use it - if (devMemory->owner()->getHostMem() != nullptr) { - // Target is the backing store, so just ensure that owner is up-to-date - devMemory->owner()->cacheWriteBack(); - - if (devMemory->isHostMemDirectAccess()) { - // Add memory to VA cache, so rutnime can detect direct access to VA - dev().addVACache(devMemory); - } - } - else if (mapFlag & (CL_MAP_READ | CL_MAP_WRITE)) { - bool result = false; - roc::Memory *hsaMemory = static_cast(devMemory); - - amd::Memory* mapMemory = hsaMemory->mapMemory(); - void *hostPtr = mapMemory == nullptr ? - hsaMemory->owner()->getHostMem() : - mapMemory->getHostMem(); - - if (type == CL_COMMAND_MAP_BUFFER) { - amd::Coord3D origin(cmd.origin()[0]); - amd::Coord3D size(cmd.size()[0]); - amd::Coord3D dstOrigin(cmd.origin()[0], 0, 0); - if (imageBuffer) { - size_t elemSize = - cmd.memory().asImage()->getImageFormat().getElementSize(); - origin.c[0] *= elemSize; - size.c[0] *= elemSize; - } - - if (mapMemory != nullptr) { - roc::Memory *hsaMapMemory = static_cast( - mapMemory->getDeviceMemory(dev(), false)); - result = blitMgr().copyBuffer(*hsaMemory, - *hsaMapMemory, origin, dstOrigin, - size, cmd.isEntireMemory()); - } - else { - result = blitMgr().readBuffer( - *hsaMemory, static_cast(hostPtr)+origin[0], - origin, size, cmd.isEntireMemory()); - } - } - else if (type == CL_COMMAND_MAP_IMAGE) { - amd::Image* image = cmd.memory().asImage(); - if (mapMemory != nullptr) { - roc::Memory *mapMemory = static_cast( - devMemory->mapMemory()->getDeviceMemory(dev(), false)); - result = blitMgr().copyImageToBuffer( - *hsaMemory, *mapMemory, cmd.origin(), - amd::Coord3D(0,0,0), cmd.size(), cmd.isEntireMemory()); - } - else { - result = blitMgr().readImage( - *hsaMemory, hostPtr, amd::Coord3D(0), - image->getRegion(), image->getRowPitch(), - image->getSlicePitch(), true); - } - } - else { - ShouldNotReachHere(); - } - - if (!result) { - LogError("submitMapMemory failed!"); - cmd.setStatus(CL_OUT_OF_RESOURCES); - } 
- } - - profilingEnd(cmd); +void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) { + // in-order semantics: previous commands need to be done before we start + releaseGpuMemoryFence(); + profilingBegin(cmd); + amd::SvmBuffer::memFill(cmd.dst(), cmd.pattern(), cmd.patternSize(), cmd.times()); + profilingEnd(cmd); } -void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand &cmd) -{ - roc::Memory* devMemory = static_cast( - cmd.memory().getDeviceMemory(dev(), false)); +void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& cmd) { + // Wait on a kernel if one is outstanding + releaseGpuMemoryFence(); - const device::Memory::WriteMapInfo* mapInfo = - devMemory->writeMapInfo(cmd.mapPtr()); - if (nullptr == mapInfo) { - LogError("Unmap without map call"); - return; - } - // Wait on a kernel if one is outstanding - releaseGpuMemoryFence(); - profilingBegin(cmd); + profilingBegin(cmd); - // Force buffer write for IMAGE1D_BUFFER - bool imageBuffer = (cmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER); + Memory* srcDevMem = dev().getRocMemory(&cmd.source()); + Memory* dstDevMem = dev().getRocMemory(&cmd.destination()); - // We used host memory - if (devMemory->owner()->getHostMem() != nullptr) { - if (mapInfo->isUnmapWrite()) { - // Target is the backing store, so sync - devMemory->owner()->signalWrite(nullptr); - devMemory->syncCacheFromHost(*this); - } - if (devMemory->isHostMemDirectAccess()) { - // Remove memory from VA cache - dev().removeVACache(devMemory); + // Synchronize source and destination memory + device::Memory::SyncFlags syncFlags; + syncFlags.skipEntire_ = cmd.isEntireMemory(); + dstDevMem->syncCacheFromHost(*this, syncFlags); + srcDevMem->syncCacheFromHost(*this); + + amd::Coord3D size = cmd.size(); + + cl_command_type type = cmd.type(); + bool result = false; + bool srcImageBuffer = false; + bool dstImageBuffer = false; + + // Force buffer copy for IMAGE1D_BUFFER + if (cmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) { + 
srcImageBuffer = true; + type = CL_COMMAND_COPY_BUFFER; + } + if (cmd.destination().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) { + dstImageBuffer = true; + type = CL_COMMAND_COPY_BUFFER; + } + + switch (cmd.type()) { + case CL_COMMAND_COPY_BUFFER: { + amd::Coord3D srcOrigin(cmd.srcOrigin()[0]); + amd::Coord3D dstOrigin(cmd.dstOrigin()[0]); + + if (srcImageBuffer) { + const size_t elemSize = cmd.source().asImage()->getImageFormat().getElementSize(); + srcOrigin.c[0] *= elemSize; + if (dstImageBuffer) { + dstOrigin.c[0] *= elemSize; } + size.c[0] *= elemSize; + } else if (dstImageBuffer) { + const size_t elemSize = cmd.destination().asImage()->getImageFormat().getElementSize(); + dstOrigin.c[0] *= elemSize; + size.c[0] *= elemSize; + } + + result = blitMgr().copyBuffer(*srcDevMem, *dstDevMem, srcOrigin, dstOrigin, size, + cmd.isEntireMemory()); + break; } - else if (mapInfo->isUnmapWrite()) { - // Commit the changes made by the user. - if (!devMemory->isHostMemDirectAccess()) { - bool result = false; - - if (cmd.memory().asImage() && !imageBuffer) { - amd::Image *image = cmd.memory().asImage(); - amd::Memory* mapMemory = devMemory->mapMemory(); - if (devMemory->mapMemory() != nullptr) { - roc::Memory *mapMemory = static_cast( - devMemory->mapMemory()->getDeviceMemory(dev(), false)); - result = blitMgr().copyBufferToImage( - *mapMemory, *devMemory, amd::Coord3D(0,0,0), - mapInfo->origin_, mapInfo->region_, mapInfo->isEntire()); - } - else { - void *hostPtr = devMemory->owner()->getHostMem(); - - result = blitMgr().writeImage( - hostPtr, *devMemory, - amd::Coord3D(0), image->getRegion(), - image->getRowPitch(), image->getSlicePitch(), true); - } - } - else { - amd::Coord3D origin(mapInfo->origin_[0]); - amd::Coord3D size(mapInfo->region_[0]); - if (imageBuffer) { - size_t elemSize = - cmd.memory().asImage()->getImageFormat().getElementSize(); - origin.c[0] *= elemSize; - size.c[0] *= elemSize; - } - if (devMemory->mapMemory() != nullptr) { - roc::Memory *mapMemory = 
static_cast( - devMemory->mapMemory()->getDeviceMemory(dev(), false)); - - result = blitMgr().copyBuffer( - *mapMemory, *devMemory, - mapInfo->origin_, - mapInfo->origin_, - mapInfo->region_, - mapInfo->isEntire()); - } - else { - result = blitMgr().writeBuffer( - cmd.mapPtr(), *devMemory, origin, size); - } - } - if (!result) { - LogError("submitMapMemory failed!"); - cmd.setStatus(CL_OUT_OF_RESOURCES); - } - } - - cmd.memory().signalWrite(&dev()); + case CL_COMMAND_COPY_BUFFER_RECT: { + result = blitMgr().copyBufferRect(*srcDevMem, *dstDevMem, cmd.srcRect(), cmd.dstRect(), size, + cmd.isEntireMemory()); + break; } + case CL_COMMAND_COPY_IMAGE: { + result = blitMgr().copyImage(*srcDevMem, *dstDevMem, cmd.srcOrigin(), cmd.dstOrigin(), size, + cmd.isEntireMemory()); + break; + } + case CL_COMMAND_COPY_IMAGE_TO_BUFFER: { + result = blitMgr().copyImageToBuffer(*srcDevMem, *dstDevMem, cmd.srcOrigin(), cmd.dstOrigin(), + size, cmd.isEntireMemory()); + break; + } + case CL_COMMAND_COPY_BUFFER_TO_IMAGE: { + result = blitMgr().copyBufferToImage(*srcDevMem, *dstDevMem, cmd.srcOrigin(), cmd.dstOrigin(), + size, cmd.isEntireMemory()); + break; + } + default: + ShouldNotReachHere(); + break; + } - devMemory->clearUnmapInfo(cmd.mapPtr()); + if (!result) { + LogError("submitCopyMemory failed!"); + cmd.setStatus(CL_OUT_OF_RESOURCES); + } - profilingEnd(cmd); + cmd.destination().signalWrite(&dev()); + + profilingEnd(cmd); } -void VirtualGPU::submitFillMemory(amd::FillMemoryCommand &cmd) -{ - // Wait on a kernel if one is outstanding - releaseGpuMemoryFence(); +void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) { + // No fence is needed since this is a no-op: the + // command will be completed only after all the + // previous commands are complete + profilingBegin(cmd); + profilingEnd(cmd); +} - profilingBegin(cmd); +void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) { + // No fence is needed since this is a no-op: the + // command will be 
completed only after all the + // previous commands are complete + profilingBegin(cmd); + profilingEnd(cmd); +} - Memory* memory = dev().getRocMemory(&cmd.memory()); +void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) { + // Wait on a kernel if one is outstanding + releaseGpuMemoryFence(); - bool entire = cmd.isEntireMemory(); - // Synchronize memory from host if necessary - device::Memory::SyncFlags syncFlags; - syncFlags.skipEntire_ = entire; - memory->syncCacheFromHost(*this, syncFlags); + profilingBegin(cmd); - cl_command_type type = cmd.type(); + //! @todo add multi-devices synchronization when supported. + + roc::Memory* devMemory = + reinterpret_cast(cmd.memory().getDeviceMemory(dev(), false)); + + cl_command_type type = cmd.type(); + bool imageBuffer = false; + + // Save map requirement. + cl_map_flags mapFlag = cmd.mapFlags(); + + // Treat no map flag as read-write. + if (mapFlag == 0) { + mapFlag = CL_MAP_READ | CL_MAP_WRITE; + } + + devMemory->saveMapInfo(cmd.mapPtr(), cmd.origin(), cmd.size(), mapFlag, cmd.isEntireMemory()); + + // Sync to the map target. + // If we have host memory, use it + if (devMemory->owner()->getHostMem() != nullptr) { + // Target is the backing store, so just ensure that owner is up-to-date + devMemory->owner()->cacheWriteBack(); + + if (devMemory->isHostMemDirectAccess()) { + // Add memory to VA cache, so rutnime can detect direct access to VA + dev().addVACache(devMemory); + } + } else if (mapFlag & (CL_MAP_READ | CL_MAP_WRITE)) { bool result = false; - bool imageBuffer = false; - float fillValue[4]; + roc::Memory* hsaMemory = static_cast(devMemory); - // Force fill buffer for IMAGE1D_BUFFER - if ((type == CL_COMMAND_FILL_IMAGE) && - (cmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { - type = CL_COMMAND_FILL_BUFFER; - imageBuffer = true; - } + amd::Memory* mapMemory = hsaMemory->mapMemory(); + void* hostPtr = + mapMemory == nullptr ? 
hsaMemory->owner()->getHostMem() : mapMemory->getHostMem(); - // Find the the right fill operation - switch (type) { - case CL_COMMAND_FILL_BUFFER: { - const void* pattern = cmd.pattern(); - size_t patternSize = cmd.patternSize(); - amd::Coord3D origin(cmd.origin()[0]); - amd::Coord3D size(cmd.size()[0]); - // Reprogram fill parameters if it's an IMAGE1D_BUFFER object - if (imageBuffer) { - size_t elemSize = - cmd.memory().asImage()->getImageFormat().getElementSize(); - origin.c[0] *= elemSize; - size.c[0] *= elemSize; - memset(fillValue, 0, sizeof(fillValue)); - cmd.memory().asImage()->getImageFormat().formatColor(pattern, - fillValue); - pattern = fillValue; - patternSize = elemSize; - } - result = blitMgr().fillBuffer( - *memory, pattern, patternSize, origin, size, entire); - break; - } - case CL_COMMAND_FILL_IMAGE: { - result = blitMgr().fillImage( - *memory, cmd.pattern(), cmd.origin(), cmd.size(), entire); - break; - } - default: - ShouldNotReachHere(); - break; + if (type == CL_COMMAND_MAP_BUFFER) { + amd::Coord3D origin(cmd.origin()[0]); + amd::Coord3D size(cmd.size()[0]); + amd::Coord3D dstOrigin(cmd.origin()[0], 0, 0); + if (imageBuffer) { + size_t elemSize = cmd.memory().asImage()->getImageFormat().getElementSize(); + origin.c[0] *= elemSize; + size.c[0] *= elemSize; + } + + if (mapMemory != nullptr) { + roc::Memory* hsaMapMemory = + static_cast(mapMemory->getDeviceMemory(dev(), false)); + result = blitMgr().copyBuffer(*hsaMemory, *hsaMapMemory, origin, dstOrigin, size, + cmd.isEntireMemory()); + } else { + result = blitMgr().readBuffer(*hsaMemory, static_cast(hostPtr) + origin[0], origin, + size, cmd.isEntireMemory()); + } + } else if (type == CL_COMMAND_MAP_IMAGE) { + amd::Image* image = cmd.memory().asImage(); + if (mapMemory != nullptr) { + roc::Memory* mapMemory = + static_cast(devMemory->mapMemory()->getDeviceMemory(dev(), false)); + result = + blitMgr().copyImageToBuffer(*hsaMemory, *mapMemory, cmd.origin(), amd::Coord3D(0, 0, 0), + cmd.size(), 
cmd.isEntireMemory()); + } else { + result = blitMgr().readImage(*hsaMemory, hostPtr, amd::Coord3D(0), image->getRegion(), + image->getRowPitch(), image->getSlicePitch(), true); + } + } else { + ShouldNotReachHere(); } if (!result) { - LogError("submitFillMemory failed!"); + LogError("submitMapMemory failed!"); + cmd.setStatus(CL_OUT_OF_RESOURCES); + } + } + + profilingEnd(cmd); +} + +void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& cmd) { + roc::Memory* devMemory = static_cast(cmd.memory().getDeviceMemory(dev(), false)); + + const device::Memory::WriteMapInfo* mapInfo = devMemory->writeMapInfo(cmd.mapPtr()); + if (nullptr == mapInfo) { + LogError("Unmap without map call"); + return; + } + // Wait on a kernel if one is outstanding + releaseGpuMemoryFence(); + profilingBegin(cmd); + + // Force buffer write for IMAGE1D_BUFFER + bool imageBuffer = (cmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER); + + // We used host memory + if (devMemory->owner()->getHostMem() != nullptr) { + if (mapInfo->isUnmapWrite()) { + // Target is the backing store, so sync + devMemory->owner()->signalWrite(nullptr); + devMemory->syncCacheFromHost(*this); + } + if (devMemory->isHostMemDirectAccess()) { + // Remove memory from VA cache + dev().removeVACache(devMemory); + } + } else if (mapInfo->isUnmapWrite()) { + // Commit the changes made by the user. 
+ if (!devMemory->isHostMemDirectAccess()) { + bool result = false; + + if (cmd.memory().asImage() && !imageBuffer) { + amd::Image* image = cmd.memory().asImage(); + amd::Memory* mapMemory = devMemory->mapMemory(); + if (devMemory->mapMemory() != nullptr) { + roc::Memory* mapMemory = + static_cast(devMemory->mapMemory()->getDeviceMemory(dev(), false)); + result = + blitMgr().copyBufferToImage(*mapMemory, *devMemory, amd::Coord3D(0, 0, 0), + mapInfo->origin_, mapInfo->region_, mapInfo->isEntire()); + } else { + void* hostPtr = devMemory->owner()->getHostMem(); + + result = blitMgr().writeImage(hostPtr, *devMemory, amd::Coord3D(0), image->getRegion(), + image->getRowPitch(), image->getSlicePitch(), true); + } + } else { + amd::Coord3D origin(mapInfo->origin_[0]); + amd::Coord3D size(mapInfo->region_[0]); + if (imageBuffer) { + size_t elemSize = cmd.memory().asImage()->getImageFormat().getElementSize(); + origin.c[0] *= elemSize; + size.c[0] *= elemSize; + } + if (devMemory->mapMemory() != nullptr) { + roc::Memory* mapMemory = + static_cast(devMemory->mapMemory()->getDeviceMemory(dev(), false)); + + result = blitMgr().copyBuffer(*mapMemory, *devMemory, mapInfo->origin_, mapInfo->origin_, + mapInfo->region_, mapInfo->isEntire()); + } else { + result = blitMgr().writeBuffer(cmd.mapPtr(), *devMemory, origin, size); + } + } + if (!result) { + LogError("submitMapMemory failed!"); cmd.setStatus(CL_OUT_OF_RESOURCES); + } } cmd.memory().signalWrite(&dev()); + } - profilingEnd(cmd); + devMemory->clearUnmapInfo(cmd.mapPtr()); + + profilingEnd(cmd); } -void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand &vcmd) -{ - // Wait on a kernel if one is outstanding - releaseGpuMemoryFence(); +void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& cmd) { + // Wait on a kernel if one is outstanding + releaseGpuMemoryFence(); - profilingBegin(vcmd); + profilingBegin(cmd); - for (auto itr : vcmd.memObjects()) { - // Find device memory - Memory* memory = 
dev().getRocMemory(&(*itr)); + Memory* memory = dev().getRocMemory(&cmd.memory()); - if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_HOST) { - memory->mgpuCacheWriteBack(); - } - else if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) { - // Synchronize memory from host if necessary. - // The sync function will perform memory migration from - // another device if necessary - device::Memory::SyncFlags syncFlags; - memory->syncCacheFromHost(*this, syncFlags); - } - else { - LogWarning("Unknown operation for memory migration!"); - } + bool entire = cmd.isEntireMemory(); + // Synchronize memory from host if necessary + device::Memory::SyncFlags syncFlags; + syncFlags.skipEntire_ = entire; + memory->syncCacheFromHost(*this, syncFlags); + + cl_command_type type = cmd.type(); + bool result = false; + bool imageBuffer = false; + float fillValue[4]; + + // Force fill buffer for IMAGE1D_BUFFER + if ((type == CL_COMMAND_FILL_IMAGE) && (cmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { + type = CL_COMMAND_FILL_BUFFER; + imageBuffer = true; + } + + // Find the the right fill operation + switch (type) { + case CL_COMMAND_FILL_BUFFER: { + const void* pattern = cmd.pattern(); + size_t patternSize = cmd.patternSize(); + amd::Coord3D origin(cmd.origin()[0]); + amd::Coord3D size(cmd.size()[0]); + // Reprogram fill parameters if it's an IMAGE1D_BUFFER object + if (imageBuffer) { + size_t elemSize = cmd.memory().asImage()->getImageFormat().getElementSize(); + origin.c[0] *= elemSize; + size.c[0] *= elemSize; + memset(fillValue, 0, sizeof(fillValue)); + cmd.memory().asImage()->getImageFormat().formatColor(pattern, fillValue); + pattern = fillValue; + patternSize = elemSize; + } + result = blitMgr().fillBuffer(*memory, pattern, patternSize, origin, size, entire); + break; } + case CL_COMMAND_FILL_IMAGE: { + result = blitMgr().fillImage(*memory, cmd.pattern(), cmd.origin(), cmd.size(), entire); + break; + } + default: + ShouldNotReachHere(); + break; + } - 
profilingEnd(vcmd); + if (!result) { + LogError("submitFillMemory failed!"); + cmd.setStatus(CL_OUT_OF_RESOURCES); + } + + cmd.memory().signalWrite(&dev()); + + profilingEnd(cmd); +} + +void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) { + // Wait on a kernel if one is outstanding + releaseGpuMemoryFence(); + + profilingBegin(vcmd); + + for (auto itr : vcmd.memObjects()) { + // Find device memory + Memory* memory = dev().getRocMemory(&(*itr)); + + if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_HOST) { + memory->mgpuCacheWriteBack(); + } else if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) { + // Synchronize memory from host if necessary. + // The sync function will perform memory migration from + // another device if necessary + device::Memory::SyncFlags syncFlags; + memory->syncCacheFromHost(*this, syncFlags); + } else { + LogWarning("Unknown operation for memory migration!"); + } + } + + profilingEnd(vcmd); } /*! \brief Writes to the buffer and increments the write pointer to the @@ -1437,382 +1302,352 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand &vcmd) * @param size The size in bytes to copy * @param alignment The alignment to follow while writing to the buffer */ -static inline address -addArg(address dst, const void* src, size_t size, uint32_t alignment) -{ - dst = amd::alignUp(dst, alignment); - ::memcpy(dst, src, size); - return dst + size; +static inline address addArg(address dst, const void* src, size_t size, uint32_t alignment) { + dst = amd::alignUp(dst, alignment); + ::memcpy(dst, src, size); + return dst + size; } -static inline address -addArg(address dst, const void* src, size_t size) -{ - assert(size < UINT32_MAX); - return addArg(dst, src, size, size); +static inline address addArg(address dst, const void* src, size_t size) { + assert(size < UINT32_MAX); + return addArg(dst, src, size, size); } - //Over rides the workgroup size fields in the packet with 
runtime/compiler set sizes - void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket, - amd::NDRangeContainer sizes, - const size_t* compile_size, - const roc::Device &dev){ - //Todo (sramalin) need to check if compile_size is set to 0 if dimension is not valid - // else this error check is incorrect - if (compile_size[0] || compile_size[1] || compile_size[2]) { - dispatchPacket.workgroup_size_x = sizes.dimensions()>0 ? compile_size[0] : 1; - dispatchPacket.workgroup_size_y = sizes.dimensions()>1 ? compile_size[1] : 1; - dispatchPacket.workgroup_size_z = sizes.dimensions()>2 ? compile_size[2] : 1; - } - else { - //Runtime must set the group size - dispatchPacket.workgroup_size_x = 1; - dispatchPacket.workgroup_size_y = 1; - dispatchPacket.workgroup_size_z = 1; +// Over rides the workgroup size fields in the packet with runtime/compiler set sizes +void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket, + amd::NDRangeContainer sizes, const size_t* compile_size, + const roc::Device& dev) { + // Todo (sramalin) need to check if compile_size is set to 0 if dimension is not valid + // else this error check is incorrect + if (compile_size[0] || compile_size[1] || compile_size[2]) { + dispatchPacket.workgroup_size_x = sizes.dimensions() > 0 ? compile_size[0] : 1; + dispatchPacket.workgroup_size_y = sizes.dimensions() > 1 ? compile_size[1] : 1; + dispatchPacket.workgroup_size_z = sizes.dimensions() > 2 ? 
compile_size[2] : 1; + } else { + // Runtime must set the group size + dispatchPacket.workgroup_size_x = 1; + dispatchPacket.workgroup_size_y = 1; + dispatchPacket.workgroup_size_z = 1; - if (sizes.dimensions() == 1) { - dispatchPacket.workgroup_size_x = dev.settings().maxWorkGroupSize_; - } - else if (sizes.dimensions() == 2) { - dispatchPacket.workgroup_size_x = dev.settings().maxWorkGroupSize2DX_; - dispatchPacket.workgroup_size_y = dev.settings().maxWorkGroupSize2DY_; - } - else if (sizes.dimensions() == 3) { - dispatchPacket.workgroup_size_x = dev.settings().maxWorkGroupSize3DX_; - dispatchPacket.workgroup_size_y = dev.settings().maxWorkGroupSize3DY_; - dispatchPacket.workgroup_size_z = dev.settings().maxWorkGroupSize3DZ_; - } + if (sizes.dimensions() == 1) { + dispatchPacket.workgroup_size_x = dev.settings().maxWorkGroupSize_; + } else if (sizes.dimensions() == 2) { + dispatchPacket.workgroup_size_x = dev.settings().maxWorkGroupSize2DX_; + dispatchPacket.workgroup_size_y = dev.settings().maxWorkGroupSize2DY_; + } else if (sizes.dimensions() == 3) { + dispatchPacket.workgroup_size_x = dev.settings().maxWorkGroupSize3DX_; + dispatchPacket.workgroup_size_y = dev.settings().maxWorkGroupSize3DY_; + dispatchPacket.workgroup_size_z = dev.settings().maxWorkGroupSize3DZ_; } + } } - static void -fillSampleDescriptor( - hsa_ext_sampler_descriptor_t& samplerDescriptor, - const amd::Sampler& sampler) - { - samplerDescriptor.filter_mode = sampler.filterMode() == CL_FILTER_NEAREST ? - HSA_EXT_SAMPLER_FILTER_MODE_NEAREST : HSA_EXT_SAMPLER_FILTER_MODE_LINEAR; - samplerDescriptor.coordinate_mode = sampler.normalizedCoords() ? 
- HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED : - HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED; - switch (sampler.addressingMode()) { - case CL_ADDRESS_CLAMP_TO_EDGE: - samplerDescriptor.address_mode = - HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE; - break; - case CL_ADDRESS_REPEAT: - samplerDescriptor.address_mode = - HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT; - break; - case CL_ADDRESS_CLAMP: - samplerDescriptor.address_mode = - HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER; - break; - case CL_ADDRESS_MIRRORED_REPEAT: - samplerDescriptor.address_mode = - HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT; - break; - case CL_ADDRESS_NONE: - samplerDescriptor.address_mode = - HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED; - break; - default: - return; - } - } +static void fillSampleDescriptor(hsa_ext_sampler_descriptor_t& samplerDescriptor, + const amd::Sampler& sampler) { + samplerDescriptor.filter_mode = sampler.filterMode() == CL_FILTER_NEAREST + ? HSA_EXT_SAMPLER_FILTER_MODE_NEAREST + : HSA_EXT_SAMPLER_FILTER_MODE_LINEAR; + samplerDescriptor.coordinate_mode = sampler.normalizedCoords() + ? 
HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED + : HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED; + switch (sampler.addressingMode()) { + case CL_ADDRESS_CLAMP_TO_EDGE: + samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE; + break; + case CL_ADDRESS_REPEAT: + samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT; + break; + case CL_ADDRESS_CLAMP: + samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER; + break; + case CL_ADDRESS_MIRRORED_REPEAT: + samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT; + break; + case CL_ADDRESS_NONE: + samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED; + break; + default: + return; + } +} -bool -VirtualGPU::submitKernelInternal( - const amd::NDRangeContainer& sizes, - const amd::Kernel& kernel, - const_address parameters, - void *eventHandle) -{ - if (tools_lib_) { - SetOclCorrelationHandle(tools_lib_, this->gpu_device_, eventHandle); - } +bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel, + const_address parameters, void* eventHandle) { + if (tools_lib_) { + SetOclCorrelationHandle(tools_lib_, this->gpu_device_, eventHandle); + } - device::Kernel *devKernel = const_cast( - kernel.getDeviceKernel(dev())); - Kernel &gpuKernel = static_cast(*devKernel); + device::Kernel* devKernel = const_cast(kernel.getDeviceKernel(dev())); + Kernel& gpuKernel = static_cast(*devKernel); - const size_t compilerLdsUsage = gpuKernel.WorkgroupGroupSegmentByteSize(); - size_t ldsUsage = compilerLdsUsage; + const size_t compilerLdsUsage = gpuKernel.WorkgroupGroupSegmentByteSize(); + size_t ldsUsage = compilerLdsUsage; - // Check memory dependency and SVM objects - if (!processMemObjects(kernel, parameters)) { - LogError("Wrong memory objects!"); - return false; - } + // Check memory dependency and SVM objects + if (!processMemObjects(kernel, parameters)) { + LogError("Wrong memory 
objects!"); + return false; + } - // Init PrintfDbg object if printf is enabled. - bool printfEnabled = (gpuKernel.printfInfo().size() > 0) ? true : false; - if (!printfDbg()->init(printfEnabled)){ - LogError("\nPrintfDbg object initialization failed!"); - return false; - } + // Init PrintfDbg object if printf is enabled. + bool printfEnabled = (gpuKernel.printfInfo().size() > 0) ? true : false; + if (!printfDbg()->init(printfEnabled)) { + LogError("\nPrintfDbg object initialization failed!"); + return false; + } - const amd::KernelSignature& signature = kernel.signature(); - const amd::KernelParameters& kernelParams = kernel.parameters(); + const amd::KernelSignature& signature = kernel.signature(); + const amd::KernelParameters& kernelParams = kernel.parameters(); - size_t newOffset[3] = {0, 0, 0}; - size_t newGlobalSize[3] = {0, 0, 0}; + size_t newOffset[3] = {0, 0, 0}; + size_t newGlobalSize[3] = {0, 0, 0}; - int dim = -1; - int iteration = 1; - size_t globalStep = 0; + int dim = -1; + int iteration = 1; + size_t globalStep = 0; + for (uint i = 0; i < sizes.dimensions(); i++) { + newGlobalSize[i] = sizes.global()[i]; + newOffset[i] = sizes.offset()[i]; + } + + if (gpuKernel.isInternalKernel()) { + // Calculate new group size for each submission for (uint i = 0; i < sizes.dimensions(); i++) { - newGlobalSize[i] = sizes.global()[i]; - newOffset[i] = sizes.offset()[i]; + if (sizes.global()[i] > static_cast(0xffffffff)) { + dim = i; + iteration = sizes.global()[i] / 0xC0000000 + ((sizes.global()[i] % 0xC0000000) ? 
1 : 0); + globalStep = (sizes.global()[i] / sizes.local()[i]) / iteration * sizes.local()[dim]; + if (timestamp_ != nullptr) { + timestamp_->setSplittedDispatch(); + } + break; + } + } + } + + for (int j = 0; j < iteration; j++) { + // Reset global size for dimension dim if split is needed + if (dim != -1) { + newOffset[dim] = sizes.offset()[dim] + globalStep * j; + if (((newOffset[dim] + globalStep) < sizes.global()[dim]) && (j != (iteration - 1))) { + newGlobalSize[dim] = globalStep; + } else { + newGlobalSize[dim] = sizes.global()[dim] - newOffset[dim]; + } } - if (gpuKernel.isInternalKernel()) { - // Calculate new group size for each submission - for (uint i = 0; i < sizes.dimensions(); i++) { - if (sizes.global()[i] > static_cast(0xffffffff)) { - dim = i; - iteration = sizes.global()[i] / 0xC0000000 - + ((sizes.global()[i] % 0xC0000000) ? 1: 0); - globalStep = (sizes.global()[i] / sizes.local()[i]) / iteration - * sizes.local()[dim]; - if (timestamp_ != nullptr) { - timestamp_->setSplittedDispatch(); - } - break; - } - } + // Find all parameters for the current kernel + + // Allocate buffer to hold kernel arguments + address argBuffer = (address)allocKernArg(gpuKernel.KernargSegmentByteSize(), + gpuKernel.KernargSegmentAlignment()); + + if (argBuffer == nullptr) { + LogError("Out of memory"); + return false; } - for (int j = 0; j < iteration; j++) { - // Reset global size for dimension dim if split is needed - if (dim != -1) { - newOffset[dim] = sizes.offset()[dim] + globalStep * j; - if (((newOffset[dim] + globalStep) < sizes.global()[dim]) && - (j != (iteration - 1))) { - newGlobalSize[dim] = globalStep; - } - else { - newGlobalSize[dim] = sizes.global()[dim] - newOffset[dim]; - } + address argPtr = argBuffer; + for (auto arg : gpuKernel.hsailArgs()) { + const_address srcArgPtr = nullptr; + if (arg->index_ != uint(-1)) { + srcArgPtr = parameters + signature.at(arg->index_).offset_; + } + + // Handle the hidden arguments first, as they do not have a + // 
matching parameter in the OCL signature (not a valid arg->index_) + switch (arg->type_) { + case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: { + size_t offset_x = sizes.dimensions() >= 1 ? newOffset[0] : 0; + assert(arg->size_ == sizeof(offset_x) && "check the sizes"); + argPtr = addArg(argPtr, &offset_x, arg->size_, arg->alignment_); + break; } + case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: { + size_t offset_y = sizes.dimensions() >= 2 ? newOffset[1] : 0; + assert(arg->size_ == sizeof(offset_y) && "check the sizes"); + argPtr = addArg(argPtr, &offset_y, arg->size_, arg->alignment_); + break; + } + case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: { + size_t offset_z = sizes.dimensions() == 3 ? newOffset[2] : 0; + assert(arg->size_ == sizeof(offset_z) && "check the sizes"); + argPtr = addArg(argPtr, &offset_z, arg->size_, arg->alignment_); + break; + } + case ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER: { + address bufferPtr = printfDbg()->dbgBuffer(); + assert(arg->size_ == sizeof(bufferPtr) && "check the sizes"); + argPtr = addArg(argPtr, &bufferPtr, arg->size_, arg->alignment_); + break; + } + case ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE: + case ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION: + case ROC_ARGTYPE_HIDDEN_NONE: { + void* zero = 0; + assert(arg->size_ <= sizeof(zero) && "check the sizes"); + argPtr = addArg(argPtr, &zero, arg->size_, arg->alignment_); + break; + } + case ROC_ARGTYPE_POINTER: { + if (arg->addrQual_ == ROC_ADDRESS_LOCAL) { + // Align the LDS on the alignment requirement of type pointed to + ldsUsage = amd::alignUp(ldsUsage, arg->pointeeAlignment_); + argPtr = addArg(argPtr, &ldsUsage, arg->size_, arg->alignment_); + ldsUsage += *reinterpret_cast(srcArgPtr); + break; + } + assert((arg->addrQual_ == ROC_ADDRESS_GLOBAL || arg->addrQual_ == ROC_ADDRESS_CONSTANT) && + "Unsupported address qualifier"); + if (kernelParams.boundToSvmPointer(dev(), parameters, arg->index_)) { + argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_); + break; + } + amd::Memory* mem = 
*reinterpret_cast(srcArgPtr); + if (mem == nullptr) { + argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_); + break; + } - // Find all parameters for the current kernel + Memory* devMem = static_cast(mem->getDeviceMemory(dev())); + //! @todo add multi-devices synchronization when supported. + void* globalAddress = devMem->getDeviceMemory(); + argPtr = addArg(argPtr, &globalAddress, arg->size_, arg->alignment_); - // Allocate buffer to hold kernel arguments - address argBuffer = (address)allocKernArg( - gpuKernel.KernargSegmentByteSize(), - gpuKernel.KernargSegmentAlignment()); - - if (argBuffer == nullptr) { + //! @todo Compiler has to return read/write attributes + if ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0) { + mem->signalWrite(&dev()); + } + break; + } + case ROC_ARGTYPE_REFERENCE: { + void* mem = allocKernArg(arg->size_, arg->alignment_); + if (mem == nullptr) { LogError("Out of memory"); return false; + } + memcpy(mem, srcArgPtr, arg->size_); + argPtr = addArg(argPtr, &mem, sizeof(void*)); + break; } - - address argPtr = argBuffer; - for (auto arg : gpuKernel.hsailArgs()) { - const_address srcArgPtr = nullptr; - if (arg->index_ != uint(-1)) { - srcArgPtr = parameters + signature.at(arg->index_).offset_; - } - - // Handle the hidden arguments first, as they do not have a - // matching parameter in the OCL signature (not a valid arg->index_) - switch (arg->type_) { - case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: { - size_t offset_x = sizes.dimensions() >= 1 ? newOffset[0] : 0; - assert(arg->size_ == sizeof(offset_x) && "check the sizes"); - argPtr = addArg(argPtr, &offset_x, arg->size_, arg->alignment_); - break; - } - case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: { - size_t offset_y = sizes.dimensions() >= 2 ? 
newOffset[1] : 0; - assert(arg->size_ == sizeof(offset_y) && "check the sizes"); - argPtr = addArg(argPtr, &offset_y, arg->size_, arg->alignment_); - break; - } - case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: { - size_t offset_z = sizes.dimensions() == 3 ? newOffset[2] : 0; - assert(arg->size_ == sizeof(offset_z) && "check the sizes"); - argPtr = addArg(argPtr, &offset_z, arg->size_, arg->alignment_); - break; - } - case ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER: { - address bufferPtr = printfDbg()->dbgBuffer(); - assert(arg->size_ == sizeof(bufferPtr) && "check the sizes"); - argPtr = addArg(argPtr, &bufferPtr, arg->size_, arg->alignment_); - break; - } - case ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE: - case ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION: - case ROC_ARGTYPE_HIDDEN_NONE: { - void* zero = 0; - assert(arg->size_ <= sizeof(zero) && "check the sizes"); - argPtr = addArg(argPtr, &zero, arg->size_, arg->alignment_); - break; - } - case ROC_ARGTYPE_POINTER: { - if (arg->addrQual_ == ROC_ADDRESS_LOCAL) { - // Align the LDS on the alignment requirement of type pointed to - ldsUsage = amd::alignUp(ldsUsage, arg->pointeeAlignment_); - argPtr = addArg(argPtr, &ldsUsage, arg->size_, arg->alignment_); - ldsUsage += *reinterpret_cast(srcArgPtr); - break; - } - assert((arg->addrQual_ == ROC_ADDRESS_GLOBAL - || arg->addrQual_ == ROC_ADDRESS_CONSTANT) - && "Unsupported address qualifier"); - if (kernelParams.boundToSvmPointer(dev(), parameters, arg->index_)) { - argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_); - break; - } - amd::Memory* mem = *reinterpret_cast(srcArgPtr); - if (mem == nullptr) { - argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_); - break; - } - - Memory *devMem = static_cast(mem->getDeviceMemory(dev())); - //! @todo add multi-devices synchronization when supported. - void* globalAddress = devMem->getDeviceMemory(); - argPtr = addArg(argPtr, &globalAddress, arg->size_, arg->alignment_); - - //! 
@todo Compiler has to return read/write attributes - if ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0) { - mem->signalWrite(&dev()); - } - break; - } - case ROC_ARGTYPE_REFERENCE: { - void *mem = allocKernArg(arg->size_, arg->alignment_); - if (mem == nullptr) { - LogError("Out of memory"); - return false; - } - memcpy(mem, srcArgPtr, arg->size_); - argPtr = addArg(argPtr, &mem, sizeof(void*)); - break; - } - case ROC_ARGTYPE_VALUE: - argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_); - break; - case ROC_ARGTYPE_IMAGE: { - amd::Memory* mem = *reinterpret_cast(srcArgPtr); - Image* image = static_cast(mem->getDeviceMemory(dev())); - if (image == nullptr) { - LogError("Kernel image argument is not an image object"); - return false; - } - - if (dev().settings().enableImageHandle_) { - const uint64_t image_srd = image->getHsaImageObject().handle; - assert(amd::isMultipleOf(image_srd, sizeof(image_srd))); - argPtr = addArg(argPtr, &image_srd, sizeof(image_srd)); - } - else { - // Image arguments are of size 48 bytes and are aligned to 16 bytes - argPtr = addArg(argPtr, (void *)image->getHsaImageObject().handle, - HSA_IMAGE_OBJECT_SIZE, HSA_IMAGE_OBJECT_ALIGNMENT); - } - - //! 
@todo Compiler has to return read/write attributes - if ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0) { - mem->signalWrite(&dev()); - } - break; - } - case ROC_ARGTYPE_SAMPLER: { - amd::Sampler* sampler = *reinterpret_cast(srcArgPtr); - if (sampler == nullptr) { - LogError("Kernel sampler argument is not an sampler object"); - return false; - } - - hsa_ext_sampler_descriptor_t samplerDescriptor; - fillSampleDescriptor(samplerDescriptor, *sampler); - - hsa_ext_sampler_t hsa_sampler; - hsa_status_t status = hsa_ext_sampler_create(dev().getBackendDevice(), - &samplerDescriptor, &hsa_sampler); - if (status != HSA_STATUS_SUCCESS) { - LogError("Error creating device sampler object!"); - return false; - } - - if (dev().settings().enableImageHandle_) { - uint64_t sampler_srd = hsa_sampler.handle; - argPtr = addArg(argPtr, &sampler_srd, sizeof(sampler_srd)); - samplerList_.push_back(hsa_sampler); - // TODO: destroy sampler. - } - else { - argPtr = amd::alignUp(argPtr, HSA_SAMPLER_OBJECT_ALIGNMENT); - - memcpy(argPtr, (void*)hsa_sampler.handle, HSA_SAMPLER_OBJECT_SIZE); - argPtr += HSA_SAMPLER_OBJECT_SIZE; - hsa_ext_sampler_destroy(dev().getBackendDevice(), hsa_sampler); - } - break; - } - default: - return false; - } - } - - // Check there is no arguments' buffer overflow - assert(argPtr <= argBuffer + gpuKernel.KernargSegmentByteSize()); - - // Check for group memory overflow - //! 
@todo Check should be in HSA - here we should have at most an assert - assert(roc_device_.info().localMemSizePerCU_ > 0); - if (ldsUsage > roc_device_.info().localMemSizePerCU_) { - LogError("No local memory available\n"); + case ROC_ARGTYPE_VALUE: + argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_); + break; + case ROC_ARGTYPE_IMAGE: { + amd::Memory* mem = *reinterpret_cast(srcArgPtr); + Image* image = static_cast(mem->getDeviceMemory(dev())); + if (image == nullptr) { + LogError("Kernel image argument is not an image object"); return false; + } + + if (dev().settings().enableImageHandle_) { + const uint64_t image_srd = image->getHsaImageObject().handle; + assert(amd::isMultipleOf(image_srd, sizeof(image_srd))); + argPtr = addArg(argPtr, &image_srd, sizeof(image_srd)); + } else { + // Image arguments are of size 48 bytes and are aligned to 16 bytes + argPtr = addArg(argPtr, (void*)image->getHsaImageObject().handle, HSA_IMAGE_OBJECT_SIZE, + HSA_IMAGE_OBJECT_ALIGNMENT); + } + + //! @todo Compiler has to return read/write attributes + if ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0) { + mem->signalWrite(&dev()); + } + break; } - - //Initialize the dispatch Packet - hsa_kernel_dispatch_packet_t dispatchPacket; - memset(&dispatchPacket, 0, sizeof(dispatchPacket)); - - dispatchPacket.kernel_object = gpuKernel.KernelCodeHandle(); - - dispatchPacket.header = aqlHeader_; - dispatchPacket.setup |= sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; - dispatchPacket.grid_size_x = sizes.dimensions()>0 ? newGlobalSize[0] : 1; - dispatchPacket.grid_size_y = sizes.dimensions()>1 ? newGlobalSize[1] : 1; - dispatchPacket.grid_size_z = sizes.dimensions()>2 ? newGlobalSize[2] : 1; - - const size_t* compile_size = devKernel->workGroupInfo()->compileSize_; - if (sizes.local().product() != 0) { - dispatchPacket.workgroup_size_x = sizes.dimensions()>0 ? sizes.local()[0] : 1; - dispatchPacket.workgroup_size_y = sizes.dimensions()>1 ? 
sizes.local()[1] : 1; - dispatchPacket.workgroup_size_z = sizes.dimensions()>2 ? sizes.local()[2] : 1; - } else { - amd::NDRangeContainer tmpSizes(sizes.dimensions(), - &newOffset[0], &newGlobalSize[0], - &(const_cast(sizes).local()[0])); - - setRuntimeCompilerLocalSize(dispatchPacket, tmpSizes, compile_size, dev()); - } - dispatchPacket.kernarg_address = argBuffer; - dispatchPacket.group_segment_size = ldsUsage; - dispatchPacket.private_segment_size = devKernel->workGroupInfo()->privateMemSize_; - - //Dispatch the packet - if (!dispatchAqlPacket(&dispatchPacket, GPU_FLUSH_ON_EXECUTION)){ + case ROC_ARGTYPE_SAMPLER: { + amd::Sampler* sampler = *reinterpret_cast(srcArgPtr); + if (sampler == nullptr) { + LogError("Kernel sampler argument is not an sampler object"); return false; + } + + hsa_ext_sampler_descriptor_t samplerDescriptor; + fillSampleDescriptor(samplerDescriptor, *sampler); + + hsa_ext_sampler_t hsa_sampler; + hsa_status_t status = + hsa_ext_sampler_create(dev().getBackendDevice(), &samplerDescriptor, &hsa_sampler); + if (status != HSA_STATUS_SUCCESS) { + LogError("Error creating device sampler object!"); + return false; + } + + if (dev().settings().enableImageHandle_) { + uint64_t sampler_srd = hsa_sampler.handle; + argPtr = addArg(argPtr, &sampler_srd, sizeof(sampler_srd)); + samplerList_.push_back(hsa_sampler); + // TODO: destroy sampler. + } else { + argPtr = amd::alignUp(argPtr, HSA_SAMPLER_OBJECT_ALIGNMENT); + + memcpy(argPtr, (void*)hsa_sampler.handle, HSA_SAMPLER_OBJECT_SIZE); + argPtr += HSA_SAMPLER_OBJECT_SIZE; + hsa_ext_sampler_destroy(dev().getBackendDevice(), hsa_sampler); + } + break; } + default: + return false; + } } - // Mark the flag indicating if a dispatch is outstanding. - // We are not waiting after every dispatch. 
- hasPendingDispatch_ = true; + // Check there is no arguments' buffer overflow + assert(argPtr <= argBuffer + gpuKernel.KernargSegmentByteSize()); - // Output printf buffer - if(!printfDbg()->output(*this, printfEnabled, gpuKernel.printfInfo())){ - LogError("\nCould not print data from the printf buffer!"); - return false; + // Check for group memory overflow + //! @todo Check should be in HSA - here we should have at most an assert + assert(roc_device_.info().localMemSizePerCU_ > 0); + if (ldsUsage > roc_device_.info().localMemSizePerCU_) { + LogError("No local memory available\n"); + return false; } - return true; + + // Initialize the dispatch Packet + hsa_kernel_dispatch_packet_t dispatchPacket; + memset(&dispatchPacket, 0, sizeof(dispatchPacket)); + + dispatchPacket.kernel_object = gpuKernel.KernelCodeHandle(); + + dispatchPacket.header = aqlHeader_; + dispatchPacket.setup |= sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; + dispatchPacket.grid_size_x = sizes.dimensions() > 0 ? newGlobalSize[0] : 1; + dispatchPacket.grid_size_y = sizes.dimensions() > 1 ? newGlobalSize[1] : 1; + dispatchPacket.grid_size_z = sizes.dimensions() > 2 ? newGlobalSize[2] : 1; + + const size_t* compile_size = devKernel->workGroupInfo()->compileSize_; + if (sizes.local().product() != 0) { + dispatchPacket.workgroup_size_x = sizes.dimensions() > 0 ? sizes.local()[0] : 1; + dispatchPacket.workgroup_size_y = sizes.dimensions() > 1 ? sizes.local()[1] : 1; + dispatchPacket.workgroup_size_z = sizes.dimensions() > 2 ? 
sizes.local()[2] : 1; + } else { + amd::NDRangeContainer tmpSizes(sizes.dimensions(), &newOffset[0], &newGlobalSize[0], + &(const_cast(sizes).local()[0])); + + setRuntimeCompilerLocalSize(dispatchPacket, tmpSizes, compile_size, dev()); + } + dispatchPacket.kernarg_address = argBuffer; + dispatchPacket.group_segment_size = ldsUsage; + dispatchPacket.private_segment_size = devKernel->workGroupInfo()->privateMemSize_; + + // Dispatch the packet + if (!dispatchAqlPacket(&dispatchPacket, GPU_FLUSH_ON_EXECUTION)) { + return false; + } + } + + // Mark the flag indicating if a dispatch is outstanding. + // We are not waiting after every dispatch. + hasPendingDispatch_ = true; + + // Output printf buffer + if (!printfDbg()->output(*this, printfEnabled, gpuKernel.printfInfo())) { + LogError("\nCould not print data from the printf buffer!"); + return false; + } + return true; } /** * @brief Api to dispatch a kernel for execution. The implementation @@ -1823,110 +1658,92 @@ VirtualGPU::submitKernelInternal( * It also parses the kernel arguments buffer to inject into Hsa Runtime * the list of kernel parameters. 
*/ -void VirtualGPU::submitKernel(amd::NDRangeKernelCommand &vcmd) { - profilingBegin(vcmd); - - // Submit kernel to HW - if (!submitKernelInternal( - vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), - static_cast(as_cl(&vcmd.event())))) { - LogError("AQL dispatch failed!"); - vcmd.setStatus(CL_INVALID_OPERATION); - } - - profilingEnd(vcmd); -} - -void VirtualGPU::submitNativeFn(amd::NativeFnCommand &cmd) { - // std::cout<<__FUNCTION__<<" not implemented"<<"*********"<(as_cl(&vcmd.event())))) { + LogError("AQL dispatch failed!"); + vcmd.setStatus(CL_INVALID_OPERATION); + } + + profilingEnd(vcmd); +} + +void VirtualGPU::submitNativeFn(amd::NativeFnCommand& cmd) { + // std::cout<<__FUNCTION__<<" not implemented"<<"*********"< 7) { - dev().xferWrite().release(*this, *xferWriteBuffers_.front()); - xferWriteBuffers_.erase(xferWriteBuffers_.begin()); +void VirtualGPU::addXferWrite(Memory& memory) { + if (xferWriteBuffers_.size() > 7) { + dev().xferWrite().release(*this, *xferWriteBuffers_.front()); + xferWriteBuffers_.erase(xferWriteBuffers_.begin()); + } + + // Delay destruction + xferWriteBuffers_.push_back(&memory); +} + +void VirtualGPU::releaseXferWrite() { + for (auto& memory : xferWriteBuffers_) { + dev().xferWrite().release(*this, *memory); + } + xferWriteBuffers_.resize(0); +} + +void VirtualGPU::addPinnedMem(amd::Memory* mem) { + if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) { + if (pinnedMems_.size() > 7) { + pinnedMems_.front()->release(); + pinnedMems_.erase(pinnedMems_.begin()); } // Delay destruction - xferWriteBuffers_.push_back(&memory); + pinnedMems_.push_back(mem); + } } -void -VirtualGPU::releaseXferWrite() -{ - for (auto& memory : xferWriteBuffers_) { - dev().xferWrite().release(*this, *memory); +void VirtualGPU::releasePinnedMem() { + for (auto& amdMemory : pinnedMems_) { + amdMemory->release(); + } + pinnedMems_.resize(0); +} + +amd::Memory* VirtualGPU::findPinnedMem(void* addr, size_t size) { + for (auto& amdMemory : pinnedMems_) 
{ + if ((amdMemory->getHostMem() == addr) && (size <= amdMemory->getSize())) { + return amdMemory; } - xferWriteBuffers_.resize(0); + } + return nullptr; } -void -VirtualGPU::addPinnedMem(amd::Memory* mem) -{ - if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) { - if (pinnedMems_.size() > 7) { - pinnedMems_.front()->release(); - pinnedMems_.erase(pinnedMems_.begin()); - } - - // Delay destruction - pinnedMems_.push_back(mem); - } -} - -void -VirtualGPU::releasePinnedMem() -{ - for (auto& amdMemory : pinnedMems_) { - amdMemory->release(); - } - pinnedMems_.resize(0); -} - -amd::Memory* -VirtualGPU::findPinnedMem(void* addr, size_t size) -{ - for (auto& amdMemory : pinnedMems_) { - if ((amdMemory->getHostMem() == addr) && (size <= amdMemory->getSize())) { - return amdMemory; - } - } - return nullptr; -} - -void -VirtualGPU::enableSyncBlit() const -{ - blitMgr_->enableSynchronization(); -} +void VirtualGPU::enableSyncBlit() const { blitMgr_->enableSynchronization(); } } // End of roc namespace diff --git a/rocclr/runtime/device/rocm/rocvirtual.hpp b/rocclr/runtime/device/rocm/rocvirtual.hpp index 64f4ace5e3..4af1002580 100644 --- a/rocclr/runtime/device/rocm/rocvirtual.hpp +++ b/rocclr/runtime/device/rocm/rocvirtual.hpp @@ -17,284 +17,274 @@ class Device; class Memory; class Timestamp; -struct ProfilingSignal : public amd::HeapObject -{ - hsa_signal_t signal_; //!< HSA signal to track profiling information - Timestamp* ts_; //!< Timestamp object associated with the signal +struct ProfilingSignal : public amd::HeapObject { + hsa_signal_t signal_; //!< HSA signal to track profiling information + Timestamp* ts_; //!< Timestamp object associated with the signal - ProfilingSignal(): ts_(nullptr) { signal_.handle = 0; } + ProfilingSignal() : ts_(nullptr) { signal_.handle = 0; } }; // Timestamp for keeping track of some profiling information for various commands // including EnqueueNDRangeKernel and clEnqueueCopyBuffer. 
class Timestamp { -private: - uint64_t start_; - uint64_t end_; - ProfilingSignal* profilingSignal_; - hsa_agent_t agent_; - static double ticksToTime_; - bool splittedDispatch_; - std::vector splittedSignals_; + private: + uint64_t start_; + uint64_t end_; + ProfilingSignal* profilingSignal_; + hsa_agent_t agent_; + static double ticksToTime_; + bool splittedDispatch_; + std::vector splittedSignals_; -public: - uint64_t getStart() { checkGpuTime(); return start_; } + public: + uint64_t getStart() { + checkGpuTime(); + return start_; + } - uint64_t getEnd() { checkGpuTime(); return end_; } + uint64_t getEnd() { + checkGpuTime(); + return end_; + } - void setProfilingSignal(ProfilingSignal* signal) { - profilingSignal_ = signal; - if (splittedDispatch_) { - splittedSignals_.push_back(profilingSignal_->signal_); + void setProfilingSignal(ProfilingSignal* signal) { + profilingSignal_ = signal; + if (splittedDispatch_) { + splittedSignals_.push_back(profilingSignal_->signal_); + } + } + const ProfilingSignal* getProfilingSignal() const { return profilingSignal_; } + + void setAgent(hsa_agent_t agent) { agent_ = agent; } + + Timestamp() : start_(0), end_(0), profilingSignal_(nullptr), splittedDispatch_(false) { + agent_.handle = 0; + } + + ~Timestamp() {} + + //! 
Finds execution ticks on GPU + void checkGpuTime() { + if (profilingSignal_ != nullptr) { + hsa_amd_profiling_dispatch_time_t time; + + if (splittedDispatch_) { + uint64_t start = UINT64_MAX; + uint64_t end = 0; + for (auto it = splittedSignals_.begin(); it < splittedSignals_.end(); it++) { + hsa_amd_profiling_get_dispatch_time(agent_, *it, &time); + if (time.start < start) { + start = time.start; + } + if (time.end > end) { + end = time.end; + } } + start_ = start * ticksToTime_; + end_ = end * ticksToTime_; + } else { + hsa_amd_profiling_get_dispatch_time(agent_, profilingSignal_->signal_, &time); + start_ = time.start * ticksToTime_; + end_ = time.end * ticksToTime_; + } + profilingSignal_->ts_ = nullptr; + profilingSignal_ = nullptr; } - const ProfilingSignal* getProfilingSignal() const { return profilingSignal_; } + } - void setAgent(hsa_agent_t agent) { agent_ = agent; } + // Start a timestamp (get timestamp from OS) + void start() { start_ = amd::Os::timeNanos(); } - Timestamp() : start_(0), end_(0), profilingSignal_(nullptr), splittedDispatch_(false) { - agent_.handle = 0; - } + // End a timestamp (get timestamp from OS) + void end() { end_ = amd::Os::timeNanos(); } - ~Timestamp() {} + bool isSplittedDispatch() const { return splittedDispatch_; } + void setSplittedDispatch() { splittedDispatch_ = true; } - //! 
Finds execution ticks on GPU - void checkGpuTime() { - if (profilingSignal_ != nullptr) { - hsa_amd_profiling_dispatch_time_t time; - - if (splittedDispatch_) { - uint64_t start = UINT64_MAX; - uint64_t end = 0; - for (auto it = splittedSignals_.begin(); it < splittedSignals_.end(); it++) { - hsa_amd_profiling_get_dispatch_time(agent_, *it, &time); - if (time.start < start) { - start = time.start; - } - if (time.end > end) { - end = time.end; - } - } - start_ = start * ticksToTime_; - end_ = end * ticksToTime_; - } - else { - hsa_amd_profiling_get_dispatch_time(agent_, profilingSignal_->signal_, &time); - start_ = time.start * ticksToTime_; - end_ = time.end * ticksToTime_; - } - profilingSignal_->ts_ = nullptr; - profilingSignal_ = nullptr; - } - } - - // Start a timestamp (get timestamp from OS) - void start() { - start_ = amd::Os::timeNanos(); - } - - // End a timestamp (get timestamp from OS) - void end() { - end_ = amd::Os::timeNanos(); - } - - bool isSplittedDispatch() const { return splittedDispatch_; } - void setSplittedDispatch() { splittedDispatch_ = true; } - - static void setGpuTicksToTime(double ticksToTime) { ticksToTime_=ticksToTime; } - static double getGpuTicksToTime() { return ticksToTime_; } + static void setGpuTicksToTime(double ticksToTime) { ticksToTime_ = ticksToTime; } + static double getGpuTicksToTime() { return ticksToTime_; } }; class VirtualGPU : public device::VirtualDevice { -public: - //! Initial signal value - static const hsa_signal_value_t InitSignalValue = 1; + public: + //! Initial signal value + static const hsa_signal_value_t InitSignalValue = 1; - class MemoryDependency : public amd::EmbeddedObject - { - public: - //! Default constructor - MemoryDependency() - : memObjectsInQueue_(nullptr) - , numMemObjectsInQueue_(0) - , maxMemObjectsInQueue_(0) {} + class MemoryDependency : public amd::EmbeddedObject { + public: + //! 
Default constructor + MemoryDependency() + : memObjectsInQueue_(nullptr), numMemObjectsInQueue_(0), maxMemObjectsInQueue_(0) {} - ~MemoryDependency() { delete [] memObjectsInQueue_; } + ~MemoryDependency() { delete[] memObjectsInQueue_; } - //! Creates memory dependecy structure - bool create(size_t numMemObj); + //! Creates memory dependecy structure + bool create(size_t numMemObj); - //! Notify the tracker about new kernel - void newKernel() { endMemObjectsInQueue_ = numMemObjectsInQueue_; } + //! Notify the tracker about new kernel + void newKernel() { endMemObjectsInQueue_ = numMemObjectsInQueue_; } - //! Validates memory object on dependency - void validate(VirtualGPU& gpu, const Memory* memory, bool readOnly); + //! Validates memory object on dependency + void validate(VirtualGPU& gpu, const Memory* memory, bool readOnly); - //! Clear memory dependency - void clear(bool all = true); + //! Clear memory dependency + void clear(bool all = true); - private: - struct MemoryState { - uint64_t start_; //! Busy memory start address - uint64_t end_; //! Busy memory end address - bool readOnly_; //! Current GPU state in the queue - }; - - MemoryState* memObjectsInQueue_; //!< Memory object state in the queue - size_t endMemObjectsInQueue_; //!< End of mem objects in the queue - size_t numMemObjectsInQueue_; //!< Number of mem objects in the queue - size_t maxMemObjectsInQueue_; //!< Maximum number of mem objects in the queue + private: + struct MemoryState { + uint64_t start_; //! Busy memory start address + uint64_t end_; //! Busy memory end address + bool readOnly_; //! 
Current GPU state in the queue }; - VirtualGPU(Device &device); - ~VirtualGPU(); + MemoryState* memObjectsInQueue_; //!< Memory object state in the queue + size_t endMemObjectsInQueue_; //!< End of mem objects in the queue + size_t numMemObjectsInQueue_; //!< Number of mem objects in the queue + size_t maxMemObjectsInQueue_; //!< Maximum number of mem objects in the queue + }; - bool create(bool profilingEna); - bool terminate(); - const Device& dev() const { return roc_device_; } + VirtualGPU(Device& device); + ~VirtualGPU(); - void profilingBegin(amd::Command &command, bool drmProfiling = false); - void profilingEnd(amd::Command &command); + bool create(bool profilingEna); + bool terminate(); + const Device& dev() const { return roc_device_; } - void updateCommandsState(amd::Command* list); + void profilingBegin(amd::Command& command, bool drmProfiling = false); + void profilingEnd(amd::Command& command); - void submitReadMemory(amd::ReadMemoryCommand& cmd); - void submitWriteMemory(amd::WriteMemoryCommand& cmd); - void submitCopyMemory(amd::CopyMemoryCommand& cmd); - void submitMapMemory(amd::MapMemoryCommand& cmd); - void submitUnmapMemory(amd::UnmapMemoryCommand& cmd); - void submitKernel(amd::NDRangeKernelCommand& cmd); - bool submitKernelInternal( - const amd::NDRangeContainer& sizes, //!< Workload sizes - const amd::Kernel& kernel, //!< Kernel for execution - const_address parameters, //!< Parameters for the kernel - void *event_handle //!< Handle to OCL event for debugging - ); - void submitNativeFn(amd::NativeFnCommand& cmd); - void submitMarker(amd::Marker& cmd); + void updateCommandsState(amd::Command* list); - void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& cmd); - void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& cmd); - void submitPerfCounter(amd::PerfCounterCommand& cmd){}; + void submitReadMemory(amd::ReadMemoryCommand& cmd); + void submitWriteMemory(amd::WriteMemoryCommand& cmd); + void 
submitCopyMemory(amd::CopyMemoryCommand& cmd); + void submitMapMemory(amd::MapMemoryCommand& cmd); + void submitUnmapMemory(amd::UnmapMemoryCommand& cmd); + void submitKernel(amd::NDRangeKernelCommand& cmd); + bool submitKernelInternal(const amd::NDRangeContainer& sizes, //!< Workload sizes + const amd::Kernel& kernel, //!< Kernel for execution + const_address parameters, //!< Parameters for the kernel + void* event_handle //!< Handle to OCL event for debugging + ); + void submitNativeFn(amd::NativeFnCommand& cmd); + void submitMarker(amd::Marker& cmd); - void flush(amd::Command* list = nullptr, bool wait = false); - void submitFillMemory(amd::FillMemoryCommand& cmd); - void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd); + void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& cmd); + void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& cmd); + void submitPerfCounter(amd::PerfCounterCommand& cmd){}; -// { roc OpenCL integration -// Added these stub (no-ops) implementation of pure virtual methods, -// when integrating HSA and OpenCL branches. -// TODO: After inegration, whoever is working on VirtualGPU should write -// actual implemention. - virtual void submitSignal(amd::SignalCommand &cmd) {} - virtual void submitMakeBuffersResident(amd::MakeBuffersResidentCommand &cmd) {} + void flush(amd::Command* list = nullptr, bool wait = false); + void submitFillMemory(amd::FillMemoryCommand& cmd); + void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd); - virtual void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd); - virtual void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd); - virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd); - virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd); - virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd); + // { roc OpenCL integration + // Added these stub (no-ops) implementation of pure virtual methods, + // when integrating HSA and OpenCL branches. 
+ // TODO: After inegration, whoever is working on VirtualGPU should write + // actual implemention. + virtual void submitSignal(amd::SignalCommand& cmd) {} + virtual void submitMakeBuffersResident(amd::MakeBuffersResidentCommand& cmd) {} - void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand &cmd) {} - void submitThreadTrace(amd::ThreadTraceCommand &vcmd) {} + virtual void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd); + virtual void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd); + virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd); + virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd); + virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd); - /** - * @brief Waits on an outstanding kernel without regard to how - * it was dispatched - with or without a signal - * - * @return bool true if Wait returned successfully, false - * otherwise - */ - bool releaseGpuMemoryFence(); + void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd) {} + void submitThreadTrace(amd::ThreadTraceCommand& vcmd) {} - hsa_agent_t gpu_device() { return gpu_device_; } - hsa_queue_t* gpu_queue() { return gpu_queue_; } + /** + * @brief Waits on an outstanding kernel without regard to how + * it was dispatched - with or without a signal + * + * @return bool true if Wait returned successfully, false + * otherwise + */ + bool releaseGpuMemoryFence(); - // Return pointer to PrintfDbg - PrintfDbg* printfDbg() const {return printfdbg_;} + hsa_agent_t gpu_device() { return gpu_device_; } + hsa_queue_t* gpu_queue() { return gpu_queue_; } - //! Returns memory dependency class - MemoryDependency& memoryDependency() { return memoryDependency_; } + // Return pointer to PrintfDbg + PrintfDbg* printfDbg() const { return printfdbg_; } - //! 
Detects memory dependency for HSAIL kernels and uses appropriate AQL header - bool processMemObjects( - const amd::Kernel& kernel, //!< AMD kernel object for execution - const_address params //!< Pointer to the param's store - ); - //Retun the virtual gpu unique index - uint index() const { return index_; } + //! Returns memory dependency class + MemoryDependency& memoryDependency() { return memoryDependency_; } - //! Adds a stage write buffer into a list - void addXferWrite(Memory& memory); + //! Detects memory dependency for HSAIL kernels and uses appropriate AQL header + bool processMemObjects(const amd::Kernel& kernel, //!< AMD kernel object for execution + const_address params //!< Pointer to the param's store + ); + // Retun the virtual gpu unique index + uint index() const { return index_; } - //! Releases stage write buffers - void releaseXferWrite(); + //! Adds a stage write buffer into a list + void addXferWrite(Memory& memory); - //! Adds a pinned memory object into a map - void addPinnedMem(amd::Memory* mem); + //! Releases stage write buffers + void releaseXferWrite(); - //! Release pinned memory objects - void releasePinnedMem(); + //! Adds a pinned memory object into a map + void addPinnedMem(amd::Memory* mem); - //! Finds if pinned memory is cached - amd::Memory* findPinnedMem(void* addr, size_t size); + //! Release pinned memory objects + void releasePinnedMem(); - void enableSyncBlit() const; + //! 
Finds if pinned memory is cached + amd::Memory* findPinnedMem(void* addr, size_t size); -// } roc OpenCL integration -private: - bool dispatchAqlPacket( - hsa_kernel_dispatch_packet_t* packet, bool blocking = true); - bool dispatchAqlPacket( - hsa_barrier_and_packet_t* packet, bool blocking = true); - template bool dispatchGenericAqlPacket( - AqlPacket* packet, bool blocking); - void dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet); - void initializeDispatchPacket(hsa_kernel_dispatch_packet_t* packet, - amd::NDRangeContainer& sizes); + void enableSyncBlit() const; - bool initPool(size_t kernarg_pool_size, uint signal_pool_count); - void destroyPool(); + // } roc OpenCL integration + private: + bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, bool blocking = true); + bool dispatchAqlPacket(hsa_barrier_and_packet_t* packet, bool blocking = true); + template bool dispatchGenericAqlPacket(AqlPacket* packet, bool blocking); + void dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet); + void initializeDispatchPacket(hsa_kernel_dispatch_packet_t* packet, amd::NDRangeContainer& sizes); - void* allocKernArg(size_t size, size_t alignment); - void resetKernArgPool() { kernarg_pool_cur_offset_ = 0; } + bool initPool(size_t kernarg_pool_size, uint signal_pool_count); + void destroyPool(); - //! Updates AQL header for the upcomming dispatch - void setAqlHeader(uint16_t header) { aqlHeader_ = header; } + void* allocKernArg(size_t size, size_t alignment); + void resetKernArgPool() { kernarg_pool_cur_offset_ = 0; } - std::vector xferWriteBuffers_; //!< Stage write buffers - std::vector pinnedMems_; //!< Pinned memory list + //! Updates AQL header for the upcomming dispatch + void setAqlHeader(uint16_t header) { aqlHeader_ = header; } - /** - * @brief Maintains the list of sampler allocated for one or more kernel - * submissions. 
- */ - std::vector samplerList_; + std::vector xferWriteBuffers_; //!< Stage write buffers + std::vector pinnedMems_; //!< Pinned memory list - /** - * @brief Indicates if a kernel dispatch is outstanding. This flag is - * used to synchronized on kernel outputs. - */ - bool hasPendingDispatch_; - Timestamp* timestamp_; - hsa_agent_t gpu_device_; //!< Physical device - hsa_queue_t* gpu_queue_; //!< Queue associated with a gpu - hsa_barrier_and_packet_t barrier_packet_; - hsa_signal_t barrier_signal_; - uint32_t dispatch_id_; //!< This variable must be updated atomically. - Device& roc_device_; //!< roc device object - void * tools_lib_; - PrintfDbg* printfdbg_; - MemoryDependency memoryDependency_; //!< Memory dependency class - uint16_t aqlHeader_; //!< AQL header for dispatch + /** + * @brief Maintains the list of sampler allocated for one or more kernel + * submissions. + */ + std::vector samplerList_; - char* kernarg_pool_base_; - size_t kernarg_pool_size_; - uint kernarg_pool_cur_offset_; + /** + * @brief Indicates if a kernel dispatch is outstanding. This flag is + * used to synchronized on kernel outputs. + */ + bool hasPendingDispatch_; + Timestamp* timestamp_; + hsa_agent_t gpu_device_; //!< Physical device + hsa_queue_t* gpu_queue_; //!< Queue associated with a gpu + hsa_barrier_and_packet_t barrier_packet_; + hsa_signal_t barrier_signal_; + uint32_t dispatch_id_; //!< This variable must be updated atomically. 
+ Device& roc_device_; //!< roc device object + void* tools_lib_; + PrintfDbg* printfdbg_; + MemoryDependency memoryDependency_; //!< Memory dependency class + uint16_t aqlHeader_; //!< AQL header for dispatch - std::vector signal_pool_; //!< Pool of signals for profiling - const uint index_; //!< Virtual gpu unique index - friend class Timestamp; + char* kernarg_pool_base_; + size_t kernarg_pool_size_; + uint kernarg_pool_cur_offset_; + + std::vector signal_pool_; //!< Pool of signals for profiling + const uint index_; //!< Virtual gpu unique index + friend class Timestamp; }; } - diff --git a/rocclr/runtime/os/alloc.cpp b/rocclr/runtime/os/alloc.cpp index 53e5f61499..a95212ea71 100644 --- a/rocclr/runtime/os/alloc.cpp +++ b/rocclr/runtime/os/alloc.cpp @@ -10,78 +10,60 @@ namespace amd { -void* -AlignedMemory::allocate(size_t size, size_t alignment) -{ - return Os::alignedMalloc(size, alignment); -} - -void* -GuardedMemory::allocate(size_t size, size_t alignment, size_t guardSize) -{ - size_t sizeToAllocate = guardSize + alignment; - sizeToAllocate += size + guardSize + Os::pageSize(); - - sizeToAllocate = amd::alignUp(sizeToAllocate, Os::pageSize()); - address userHostMemGuarded = Os::reserveMemory(NULL, sizeToAllocate); - if (!userHostMemGuarded || !Os::commitMemory( - userHostMemGuarded, sizeToAllocate, Os::MEM_PROT_RW)) { - return NULL; - } - - address userHostMem = userHostMemGuarded + sizeToAllocate; - userHostMem = amd::alignDown(userHostMem - guardSize, Os::pageSize()); - - // Protect the guard pages after the end of the users's buffer. - if (!Os::protectMemory(userHostMem, guardSize, Os::MEM_PROT_NONE)) { - fatal("Protect memory (up) failed"); - } - - userHostMem = userHostMem - size; - userHostMem = amd::alignDown(userHostMem, alignment); - // Write the actual size allocated including all the guard pages, - // alignment, page file size... as well as the size of guarded byte - // count before the beginning of the user's buffer. 
- size_t* temp = reinterpret_cast(userHostMem); - *--temp = sizeToAllocate; - *--temp = userHostMem - userHostMemGuarded; - - // Protect the guard pages before the beginning of the user's buffer. - if (!Os::protectMemory(userHostMemGuarded, guardSize, Os::MEM_PROT_NONE)) { - fatal("Protect memory (down) failed"); - } - - return userHostMem; +void* AlignedMemory::allocate(size_t size, size_t alignment) { + return Os::alignedMalloc(size, alignment); } -void -AlignedMemory::deallocate(void* ptr) -{ - Os::alignedFree(ptr); +void* GuardedMemory::allocate(size_t size, size_t alignment, size_t guardSize) { + size_t sizeToAllocate = guardSize + alignment; + sizeToAllocate += size + guardSize + Os::pageSize(); + + sizeToAllocate = amd::alignUp(sizeToAllocate, Os::pageSize()); + address userHostMemGuarded = Os::reserveMemory(NULL, sizeToAllocate); + if (!userHostMemGuarded || + !Os::commitMemory(userHostMemGuarded, sizeToAllocate, Os::MEM_PROT_RW)) { + return NULL; + } + + address userHostMem = userHostMemGuarded + sizeToAllocate; + userHostMem = amd::alignDown(userHostMem - guardSize, Os::pageSize()); + + // Protect the guard pages after the end of the users's buffer. + if (!Os::protectMemory(userHostMem, guardSize, Os::MEM_PROT_NONE)) { + fatal("Protect memory (up) failed"); + } + + userHostMem = userHostMem - size; + userHostMem = amd::alignDown(userHostMem, alignment); + // Write the actual size allocated including all the guard pages, + // alignment, page file size... as well as the size of guarded byte + // count before the beginning of the user's buffer. + size_t* temp = reinterpret_cast(userHostMem); + *--temp = sizeToAllocate; + *--temp = userHostMem - userHostMemGuarded; + + // Protect the guard pages before the beginning of the user's buffer. 
+ if (!Os::protectMemory(userHostMemGuarded, guardSize, Os::MEM_PROT_NONE)) { + fatal("Protect memory (down) failed"); + } + + return userHostMem; } -void -GuardedMemory::deallocate(void* ptr) -{ - size_t* userHostMem = static_cast(ptr); +void AlignedMemory::deallocate(void* ptr) { Os::alignedFree(ptr); } - size_t size = *--userHostMem; - size_t offset = *--userHostMem; +void GuardedMemory::deallocate(void* ptr) { + size_t* userHostMem = static_cast(ptr); - Os::releaseMemory(static_cast
(ptr) - offset, size); + size_t size = *--userHostMem; + size_t offset = *--userHostMem; + + Os::releaseMemory(static_cast
(ptr) - offset, size); } -void* -HeapObject::operator new(size_t size) -{ - return malloc(size); -} +void* HeapObject::operator new(size_t size) { return malloc(size); } -void -HeapObject::operator delete(void* obj) -{ - free(obj); -} +void HeapObject::operator delete(void* obj) { free(obj); } -} // namespace amd +} // namespace amd diff --git a/rocclr/runtime/os/alloc.hpp b/rocclr/runtime/os/alloc.hpp index c243d2b0d0..f9f9e0fb3b 100644 --- a/rocclr/runtime/os/alloc.hpp +++ b/rocclr/runtime/os/alloc.hpp @@ -9,22 +9,20 @@ namespace amd { -class AlignedMemory : public AllStatic -{ -public: - static void* allocate(size_t size, size_t alignment); +class AlignedMemory : public AllStatic { + public: + static void* allocate(size_t size, size_t alignment); - static void deallocate(void* ptr); + static void deallocate(void* ptr); }; -class GuardedMemory : public AllStatic -{ -public: - static void* allocate(size_t size, size_t alignment, size_t guardSize); +class GuardedMemory : public AllStatic { + public: + static void* allocate(size_t size, size_t alignment, size_t guardSize); - static void deallocate(void* ptr); + static void deallocate(void* ptr); }; -} // namespace amd +} // namespace amd #endif /*ALLOC_HPP_*/ diff --git a/rocclr/runtime/os/os.cpp b/rocclr/runtime/os/os.cpp index 9a80044b31..f3079cddcd 100644 --- a/rocclr/runtime/os/os.cpp +++ b/rocclr/runtime/os/os.cpp @@ -9,167 +9,151 @@ #include #if defined(_WIN32) || defined(__CYGWIN__) -# include -#else // !_WIN32 -# include -# include -#endif // !_WIN32 +#include +#else // !_WIN32 +#include +#include +#endif // !_WIN32 #if defined(ATI_ARCH_X86) -#include // for _mm_pause -#endif // ATI_ARCH_X86 +#include // for _mm_pause +#endif // ATI_ARCH_X86 namespace amd { -void* -Os::loadLibrary(const char* libraryname) -{ - void* handle; +void* Os::loadLibrary(const char* libraryname) { + void* handle; - // Try with the system library prefix and extension instead. 
- std::string str = libraryname; + // Try with the system library prefix and extension instead. + std::string str = libraryname; - size_t namestart = str.rfind(fileSeparator()); - namestart = (namestart != std::string::npos) ? namestart + 1 : 0; + size_t namestart = str.rfind(fileSeparator()); + namestart = (namestart != std::string::npos) ? namestart + 1 : 0; - if (namestart == 0) { + if (namestart == 0) { #if defined(ATI_OS_WIN) - // Try with the path of the current loaded dll(OCL runtime) first - HMODULE hm = NULL; - if (!GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS - | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, - (LPCSTR)&loadLibrary, &hm)) return NULL; + // Try with the path of the current loaded dll(OCL runtime) first + HMODULE hm = NULL; + if (!GetModuleHandleExA( + GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, + (LPCSTR)&loadLibrary, &hm)) + return NULL; - char cszDllPath[1024] = { 0 }; - if (!GetModuleFileNameA(hm, cszDllPath, sizeof(cszDllPath))) - return NULL; + char cszDllPath[1024] = {0}; + if (!GetModuleFileNameA(hm, cszDllPath, sizeof(cszDllPath))) return NULL; - LPSTR cszFileName; - char buffer[1024] = { 0 }; - if (!GetFullPathNameA(cszDllPath, sizeof(buffer), buffer, &cszFileName)) - return NULL; + LPSTR cszFileName; + char buffer[1024] = {0}; + if (!GetFullPathNameA(cszDllPath, sizeof(buffer), buffer, &cszFileName)) return NULL; - std::string newPath; - newPath = cszDllPath; - newPath.replace(newPath.find(cszFileName), strlen(libraryname), libraryname); + std::string newPath; + newPath = cszDllPath; + newPath.replace(newPath.find(cszFileName), strlen(libraryname), libraryname); - handle = Os::loadLibrary_(newPath.c_str()); - if (handle != NULL) { - return handle; - } -#endif - } - - handle = Os::loadLibrary_(libraryname); + handle = Os::loadLibrary_(newPath.c_str()); if (handle != NULL) { - return handle; + return handle; } +#endif + } - const char* prefix = Os::libraryPrefix(); - if (prefix 
!= NULL - && str.compare(namestart, strlen(prefix), prefix) == 0) { - // It is alread present, not need to prepend it. - prefix = NULL; - } - size_t dot = str.rfind('.'); - if (dot != std::string::npos) { - // check that the dot was on the filename not a dir name. - if (namestart < dot) { - // strip the previous extension. - str.resize(dot); - } - } - if (prefix != NULL && prefix[0] != '\0') { - str.insert(namestart, prefix); - } - str.append(Os::libraryExtension()); + handle = Os::loadLibrary_(libraryname); + if (handle != NULL) { + return handle; + } - handle = Os::loadLibrary_(str.c_str()); - if (handle != NULL || str.find(fileSeparator()) != std::string::npos) { - return handle; + const char* prefix = Os::libraryPrefix(); + if (prefix != NULL && str.compare(namestart, strlen(prefix), prefix) == 0) { + // It is alread present, not need to prepend it. + prefix = NULL; + } + size_t dot = str.rfind('.'); + if (dot != std::string::npos) { + // check that the dot was on the filename not a dir name. + if (namestart < dot) { + // strip the previous extension. + str.resize(dot); } + } + if (prefix != NULL && prefix[0] != '\0') { + str.insert(namestart, prefix); + } + str.append(Os::libraryExtension()); - // Try to find the lib in the current directory. - return Os::loadLibrary((std::string(".") + fileSeparator() - + std::string(libraryname)).c_str()); + handle = Os::loadLibrary_(str.c_str()); + if (handle != NULL || str.find(fileSeparator()) != std::string::npos) { + return handle; + } + + // Try to find the lib in the current directory. 
+ return Os::loadLibrary((std::string(".") + fileSeparator() + std::string(libraryname)).c_str()); } size_t Os::pageSize_ = 0; int Os::processorCount_ = 0; -void -Os::spinPause() -{ +void Os::spinPause() { #if defined(ATI_ARCH_X86) - _mm_pause(); + _mm_pause(); #elif defined(__ARM_ARCH_7A__) - __asm__ __volatile__("yield"); + __asm__ __volatile__("yield"); #endif } -void -Os::sleep(long n) -{ - // FIXME_lmoriche: Should be nano-seconds not seconds. +void Os::sleep(long n) { +// FIXME_lmoriche: Should be nano-seconds not seconds. #ifdef _WIN32 - ::Sleep(n); -#else // !_WIN32 - time_t seconds = (time_t) n / 1000; - long nanoseconds = ((long) n - seconds * 1000) * 1000000; - timespec ts = { seconds, nanoseconds }; - ::nanosleep(&ts, NULL); -#endif // !_WIN32 + ::Sleep(n); +#else // !_WIN32 + time_t seconds = (time_t)n / 1000; + long nanoseconds = ((long)n - seconds * 1000) * 1000000; + timespec ts = {seconds, nanoseconds}; + ::nanosleep(&ts, NULL); +#endif // !_WIN32 } -void -Os::touchStackPages(address bottom, address top) -{ - top = alignDown(top, pageSize_) - pageSize_; - while (top >= bottom) { - *top = 0; - top -= pageSize_; - } +void Os::touchStackPages(address bottom, address top) { + top = alignDown(top, pageSize_) - pageSize_; + while (top >= bottom) { + *top = 0; + top -= pageSize_; + } } -bool -Os::skipIDIV(address& pc) -{ - address insn = pc; - if (insn[0] == 0x66) { // LCP prefix - insn += 1; - } - if ((insn[0] & 0xf0) == 0x40) { // REX prefix - insn += 1; - } - if (insn[0] == 0xf6 || insn[0] == 0xf7) { // IDIV - // This is a DivisionError: skip the insn and resume execution - char mod = insn[1] >> 6; - char rm = insn[1] & 0x7; - insn += 2; // skip opcode and mod/rm +bool Os::skipIDIV(address& pc) { + address insn = pc; + if (insn[0] == 0x66) { // LCP prefix + insn += 1; + } + if ((insn[0] & 0xf0) == 0x40) { // REX prefix + insn += 1; + } + if (insn[0] == 0xf6 || insn[0] == 0xf7) { // IDIV + // This is a DivisionError: skip the insn and resume execution + 
char mod = insn[1] >> 6; + char rm = insn[1] & 0x7; + insn += 2; // skip opcode and mod/rm - if (rm == 0x4 && mod != 0x3) { - insn += 1; // sib follows mod/rm - } - - if ((mod == 0x0 && rm == 0x5) || mod == 0x2) { - insn += 4; // disp32 - } - else if (mod == 0x1) { - insn += 1; // disp8 - } - pc = insn; - return true; + if (rm == 0x4 && mod != 0x3) { + insn += 1; // sib follows mod/rm } - return false; + + if ((mod == 0x0 && rm == 0x5) || mod == 0x2) { + insn += 4; // disp32 + } else if (mod == 0x1) { + insn += 1; // disp8 + } + pc = insn; + return true; + } + return false; } -void -Os::setThreadAffinity(const void* handle, unsigned int cpu) -{ - ThreadAffinityMask mask; - mask.set(cpu); - setThreadAffinity(handle, mask); +void Os::setThreadAffinity(const void* handle, unsigned int cpu) { + ThreadAffinityMask mask; + mask.set(cpu); + setThreadAffinity(handle, mask); } -} // namespace amd +} // namespace amd diff --git a/rocclr/runtime/os/os.hpp b/rocclr/runtime/os/os.hpp index 5299162e18..1febb40c99 100644 --- a/rocclr/runtime/os/os.hpp +++ b/rocclr/runtime/os/os.hpp @@ -12,16 +12,16 @@ #include #if defined(__linux__) -# include +#include #endif #ifdef _WIN32 -# include // For KAFFINITY -#endif // _WIN32 +#include // For KAFFINITY +#endif // _WIN32 // Smallest supported VM page size. 
#define MIN_PAGE_SHIFT 12 -#define MIN_PAGE_SIZE (1UL << MIN_PAGE_SHIFT) +#define MIN_PAGE_SIZE (1UL << MIN_PAGE_SHIFT) namespace amd { @@ -32,494 +32,420 @@ namespace amd { * @{ */ -class Thread; // For Os::createOsThread() +class Thread; // For Os::createOsThread() -class Os : AllStatic -{ -public: - enum MemProt - { - MEM_PROT_NONE = 0, - MEM_PROT_READ, - MEM_PROT_RW, - MEM_PROT_RWX - }; +class Os : AllStatic { + public: + enum MemProt { MEM_PROT_NONE = 0, MEM_PROT_READ, MEM_PROT_RW, MEM_PROT_RWX }; - class ThreadAffinityMask - { + class ThreadAffinityMask { friend class Os; - private: + + private: #if defined(__linux__) - cpu_set_t mask_; -#else // _WIN32 - #if !defined(_WIN32) - typedef uint KAFFINITY; - #endif - KAFFINITY mask_[512 / sizeof(KAFFINITY)]; + cpu_set_t mask_; +#else // _WIN32 +#if !defined(_WIN32) + typedef uint KAFFINITY; +#endif + KAFFINITY mask_[512 / sizeof(KAFFINITY)]; #endif - public: - ThreadAffinityMask() { init(); } + public: + ThreadAffinityMask() { init(); } - inline void init(); - inline void set(uint cpu); - inline void clear(uint cpu); - inline bool isSet(uint cpu) const; - inline bool isEmpty() const; - inline uint countSet() const; + inline void init(); + inline void set(uint cpu); + inline void clear(uint cpu); + inline bool isSet(uint cpu) const; + inline bool isEmpty() const; + inline uint countSet() const; - inline uint getFirstSet() const; - inline uint getNextSet(uint cpu) const; + inline uint getFirstSet() const; + inline uint getNextSet(uint cpu) const; #if defined(__linux__) - inline void set(const cpu_set_t& mask); - inline void clear(const cpu_set_t& mask); - inline void adjust(cpu_set_t& mask) const; - inline cpu_set_t& getNative() { return mask_; } + inline void set(const cpu_set_t& mask); + inline void clear(const cpu_set_t& mask); + inline void adjust(cpu_set_t& mask) const; + inline cpu_set_t& getNative() { return mask_; } #else - inline void set(size_t group, KAFFINITY affinity); - inline void adjust(size_t group, 
KAFFINITY& affinity) const; + inline void set(size_t group, KAFFINITY affinity); + inline void adjust(size_t group, KAFFINITY& affinity) const; #endif - }; + }; -private: - static const size_t FILE_PATH_MAX_LENGTH = 1024; + private: + static const size_t FILE_PATH_MAX_LENGTH = 1024; - static size_t pageSize_; //!< The default os page size. - static int processorCount_; //!< The number of active processors. + static size_t pageSize_; //!< The default os page size. + static int processorCount_; //!< The number of active processors. -private: - //! Load the shared library named by \a filename - static void* loadLibrary_(const char* filename); + private: + //! Load the shared library named by \a filename + static void* loadLibrary_(const char* filename); -public: - //! Initialize the Os package. - static bool init(); - //! Tear down the Os package. - static void tearDown(); + public: + //! Initialize the Os package. + static bool init(); + //! Tear down the Os package. + static void tearDown(); - // Topology helper routines: - // + // Topology helper routines: + // - //! Return the number of active processors in the system. - inline static int processorCount(); + //! Return the number of active processors in the system. + inline static int processorCount(); #if defined(ATI_ARCH_X86) - //! Query the processor information about supported features and CPU type. - static void cpuid(int regs[4], int info); - //! Get value of extended control register - static uint64_t xgetbv(uint32_t which); -#endif // ATI_ARCH_X86 + //! Query the processor information about supported features and CPU type. + static void cpuid(int regs[4], int info); + //! Get value of extended control register + static uint64_t xgetbv(uint32_t which); +#endif // ATI_ARCH_X86 - // Stack helper routines: - // + // Stack helper routines: + // - //! Return the current stack base and size information. - static void currentStackInfo(address* base, size_t *size); + //! 
Return the current stack base and size information. + static void currentStackInfo(address* base, size_t* size); - //! Return the value of the current stack pointer. - static NOT_WIN64(inline) address currentStackPtr(); - //! Set the value of the current stack pointer. - static WIN64_ONLY(inline) void WINDOWS_ONLY(__stdcall/*callee cleanup*/) - setCurrentStackPtr(address sp); - //! Touches all stack pages between [bottom,top[ - static void touchStackPages(address bottom, address top); + //! Return the value of the current stack pointer. + static NOT_WIN64(inline) address currentStackPtr(); + //! Set the value of the current stack pointer. + static WIN64_ONLY(inline) void WINDOWS_ONLY(__stdcall /*callee cleanup*/) + setCurrentStackPtr(address sp); + //! Touches all stack pages between [bottom,top[ + static void touchStackPages(address bottom, address top); - // Thread routines: - // + // Thread routines: + // - //! Create a native thread and link it to the given OsThread. - static const void* createOsThread(Thread* osThread); - //! Set the thread's affinity to the given cpu ordinal. - static void setThreadAffinity(const void* handle, unsigned int cpu); - //! Set the thread's affinity to the given cpu mask. - static void setThreadAffinity(const void* handle, const ThreadAffinityMask& mask); - //! Set the currently running thread's name. - static void setCurrentThreadName(const char* name); - //! Check if the thread is alive - static bool isThreadAlive(const Thread& osThread); + //! Create a native thread and link it to the given OsThread. + static const void* createOsThread(Thread* osThread); + //! Set the thread's affinity to the given cpu ordinal. + static void setThreadAffinity(const void* handle, unsigned int cpu); + //! Set the thread's affinity to the given cpu mask. + static void setThreadAffinity(const void* handle, const ThreadAffinityMask& mask); + //! Set the currently running thread's name. + static void setCurrentThreadName(const char* name); + //! 
Check if the thread is alive + static bool isThreadAlive(const Thread& osThread); - //! Sleep for n milli-seconds. - static void sleep(long n); - //! Yield to threads of the same or lower priority - static void yield(); - //! Execute a pause instruction (for spin loops). - static void spinPause(); + //! Sleep for n milli-seconds. + static void sleep(long n); + //! Yield to threads of the same or lower priority + static void yield(); + //! Execute a pause instruction (for spin loops). + static void spinPause(); - // Memory routines: - // + // Memory routines: + // - //! Return the default os page size. - inline static size_t pageSize(); - //! Return the amount of host total physical memory in bytes. - static uint64_t hostTotalPhysicalMemory(); + //! Return the default os page size. + inline static size_t pageSize(); + //! Return the amount of host total physical memory in bytes. + static uint64_t hostTotalPhysicalMemory(); - //! Reserve a chunk of memory (priv | anon | noreserve). - static address reserveMemory(address start, size_t size, size_t alignment = 0, MemProt prot = MEM_PROT_NONE); - //! Release a chunk of memory reserved with reserveMemory. - static bool releaseMemory(void* addr, size_t size); - //! Commit a chunk of memory previously reserved with reserveMemory. - static bool commitMemory(void* addr, size_t size, MemProt prot = MEM_PROT_NONE); - //! Uncommit a chunk of memory previously committed with commitMemory. - static bool uncommitMemory(void* addr, size_t size); - //! Set the page protections for the given memory region. - static bool protectMemory(void* addr, size_t size, MemProt prot); + //! Reserve a chunk of memory (priv | anon | noreserve). + static address reserveMemory(address start, size_t size, size_t alignment = 0, + MemProt prot = MEM_PROT_NONE); + //! Release a chunk of memory reserved with reserveMemory. + static bool releaseMemory(void* addr, size_t size); + //! Commit a chunk of memory previously reserved with reserveMemory. 
+ static bool commitMemory(void* addr, size_t size, MemProt prot = MEM_PROT_NONE); + //! Uncommit a chunk of memory previously committed with commitMemory. + static bool uncommitMemory(void* addr, size_t size); + //! Set the page protections for the given memory region. + static bool protectMemory(void* addr, size_t size, MemProt prot); - //! Allocate an aligned chunk of memory. - static void* alignedMalloc(size_t size, size_t alignment); - //! Deallocate an aligned chunk of memory. - static void alignedFree(void* mem); + //! Allocate an aligned chunk of memory. + static void* alignedMalloc(size_t size, size_t alignment); + //! Deallocate an aligned chunk of memory. + static void alignedFree(void* mem); - //! Platform-specific optimized memcpy() - static void* fastMemcpy(void *dest, const void *src, size_t n); + //! Platform-specific optimized memcpy() + static void* fastMemcpy(void* dest, const void* src, size_t n); - // File/Path helper routines: - // + // File/Path helper routines: + // - //! Return the shared library extension string. - static const char* libraryExtension(); - //! Return the shared library prefix string. - static const char* libraryPrefix(); - //! Return the object extension string. - static const char* objectExtension(); - //! Return the file separator char. - static char fileSeparator(); - //! Return the path separator char. - static char pathSeparator(); - //! Return whether the path exists - static bool pathExists(const std::string& path); - //! Create the path if it does not exist - static bool createPath(const std::string& path); - //! Remove the path if it is empty - static bool removePath(const std::string& path); - //! Printf re-implementation (due to MS CRT problem) - static int printf(const char*fmt,...); - /*! \brief Invokes the command processor for the command execution - * - * \result Returns the operation result - */ - static int systemCall( - const std::string& command); //!< command for execution + //! 
Return the shared library extension string. + static const char* libraryExtension(); + //! Return the shared library prefix string. + static const char* libraryPrefix(); + //! Return the object extension string. + static const char* objectExtension(); + //! Return the file separator char. + static char fileSeparator(); + //! Return the path separator char. + static char pathSeparator(); + //! Return whether the path exists + static bool pathExists(const std::string& path); + //! Create the path if it does not exist + static bool createPath(const std::string& path); + //! Remove the path if it is empty + static bool removePath(const std::string& path); + //! Printf re-implementation (due to MS CRT problem) + static int printf(const char* fmt, ...); + /*! \brief Invokes the command processor for the command execution + * + * \result Returns the operation result + */ + static int systemCall(const std::string& command); //!< command for execution - /*! \brief Retrieves a string containing the value - * of the environment variable - * - * \result Returns the environment variable value - */ - static std::string getEnvironment( - const std::string& name); //!< the environment variable's name + /*! \brief Retrieves a string containing the value + * of the environment variable + * + * \result Returns the environment variable value + */ + static std::string getEnvironment(const std::string& name); //!< the environment variable's name - /*! \brief Retrieves the path of the directory designated for temporary - * files - * - * \result Returns the temporary path - */ - static std::string getTempPath(); + /*! \brief Retrieves the path of the directory designated for temporary + * files + * + * \result Returns the temporary path + */ + static std::string getTempPath(); - /*! \brief Creates a name for a temporary file - * - * \result Returns the name of temporary file - */ - static std::string getTempFileName(); + /*! 
\brief Creates a name for a temporary file + * + * \result Returns the name of temporary file + */ + static std::string getTempFileName(); - //! Deletes file - static int unlink(const std::string& path); + //! Deletes file + static int unlink(const std::string& path); - // Library routines: - // - typedef bool (*SymbolCallback)(std::string, const void*, void*); + // Library routines: + // + typedef bool (*SymbolCallback)(std::string, const void*, void*); - //! Load the shared library named by \a filename - static void* loadLibrary(const char* filename); - //! Unload the shared library. - static void unloadLibrary(void* handle); - //! Return the address of the function identified by \a name. - static void* getSymbol(void* handle, const char* name); - //! Get all the __kernel functions in the given shared library. - static bool iterateSymbols(void* handle, SymbolCallback func, void* data); + //! Load the shared library named by \a filename + static void* loadLibrary(const char* filename); + //! Unload the shared library. + static void unloadLibrary(void* handle); + //! Return the address of the function identified by \a name. + static void* getSymbol(void* handle, const char* name); + //! Get all the __kernel functions in the given shared library. + static bool iterateSymbols(void* handle, SymbolCallback func, void* data); - // Time routines: - // + // Time routines: + // - //! Return the current system time counter in nanoseconds. - static uint64_t timeNanos(); - //! Return the system timer's resolution in nanoseconds. - static uint64_t timerResolutionNanos(); - //! Return the timeNanos starting point offset to Epoch. - static uint64_t offsetToEpochNanos(); + //! Return the current system time counter in nanoseconds. + static uint64_t timeNanos(); + //! Return the system timer's resolution in nanoseconds. + static uint64_t timerResolutionNanos(); + //! Return the timeNanos starting point offset to Epoch. 
+ static uint64_t offsetToEpochNanos(); - // X86 Instructions helpers: - // + // X86 Instructions helpers: + // - //! Skip an IDIV (F6/F7) instruction and return a pointer to the next insn. - static bool skipIDIV(address& insn); + //! Skip an IDIV (F6/F7) instruction and return a pointer to the next insn. + static bool skipIDIV(address& insn); - // return gloabal memory size to be assigned to device info - static size_t getPhysicalMemSize(); + // return gloabal memory size to be assigned to device info + static size_t getPhysicalMemSize(); - //! get Application file name - static std::string getAppFileName(); + //! get Application file name + static std::string getAppFileName(); - //! Install SIGFPE handler for CPU device - static bool installSigfpeHandler(); + //! Install SIGFPE handler for CPU device + static bool installSigfpeHandler(); - //! Uninstall SIGFPE handler for CPU device - static void uninstallSigfpeHandler(); + //! Uninstall SIGFPE handler for CPU device + static void uninstallSigfpeHandler(); }; /*@}*/ -inline size_t -Os::pageSize() -{ - assert(pageSize_ != 0 && "runtime is not initialized"); - return pageSize_; +inline size_t Os::pageSize() { + assert(pageSize_ != 0 && "runtime is not initialized"); + return pageSize_; } -inline int -Os::processorCount() -{ - return processorCount_; -} +inline int Os::processorCount() { return processorCount_; } #if defined(_WIN64) extern "C" void _Os_setCurrentStackPtr(address sp); -ALWAYSINLINE void -Os::setCurrentStackPtr(address sp) -{ - _Os_setCurrentStackPtr(sp); -} +ALWAYSINLINE void Os::setCurrentStackPtr(address sp) { _Os_setCurrentStackPtr(sp); } -#else // !_WIN64 +#else // !_WIN64 -ALWAYSINLINE address -Os::currentStackPtr() -{ - intptr_t value; +ALWAYSINLINE address Os::currentStackPtr() { + intptr_t value; #if defined(__GNUC__) - __asm__ __volatile__ ( -# if defined(ATI_ARCH_X86) - LP64_SWITCH("movl %%esp", "movq %%rsp") ",%0" : "=r"(value) -# elif defined(ATI_ARCH_ARM) - "mov %0,sp" : "=r"(value) -# 
endif - ); -#else // !__GNUC__ - __asm mov value, esp; -#endif // !__GNUC__ + __asm__ __volatile__( +#if defined(ATI_ARCH_X86) + LP64_SWITCH("movl %%esp", "movq %%rsp") ",%0" + : "=r"(value) +#elif defined(ATI_ARCH_ARM) + "mov %0,sp" + : "=r"(value) +#endif + ); +#else // !__GNUC__ + __asm mov value, esp; +#endif // !__GNUC__ - return (address)value; + return (address)value; } -#endif // !_WIN64 +#endif // !_WIN64 #if defined(__linux__) -inline void -Os::ThreadAffinityMask::init() -{ - CPU_ZERO(&mask_); +inline void Os::ThreadAffinityMask::init() { CPU_ZERO(&mask_); } + +inline void Os::ThreadAffinityMask::set(uint cpu) { CPU_SET(cpu, &mask_); } + +inline void Os::ThreadAffinityMask::clear(uint cpu) { CPU_CLR(cpu, &mask_); } + +inline bool Os::ThreadAffinityMask::isSet(uint cpu) const { return CPU_ISSET(cpu, &mask_); } + +inline bool Os::ThreadAffinityMask::isEmpty() const { + const uint32_t* bits = (const uint32_t*)mask_.__bits; + for (uint i = 0; i < sizeof(mask_.__bits) / sizeof(uint32_t); ++i) { + if (bits[i] != 0) { + return false; + } + } + return true; } -inline void -Os::ThreadAffinityMask::set(uint cpu) -{ - CPU_SET(cpu, &mask_); +inline void Os::ThreadAffinityMask::set(const cpu_set_t& mask) { mask_ = mask; } + +inline void Os::ThreadAffinityMask::clear(const cpu_set_t& mask) { + const uint32_t* bitsClear = (const uint32_t*)mask.__bits; + uint32_t* bits = (uint32_t*)mask_.__bits; + for (uint i = 0; i < sizeof(mask_.__bits) / sizeof(uint32_t); ++i) { + bits[i] &= ~bitsClear[i]; + } } -inline void -Os::ThreadAffinityMask::clear(uint cpu) -{ - CPU_CLR(cpu, &mask_); +inline void Os::ThreadAffinityMask::adjust(cpu_set_t& mask) const { + uint32_t* bitsOut = (uint32_t*)mask.__bits; + const uint32_t* bits = (const uint32_t*)mask_.__bits; + for (uint i = 0; i < sizeof(mask_.__bits) / sizeof(uint32_t); ++i) { + bitsOut[i] &= bits[i]; + } } -inline bool -Os::ThreadAffinityMask::isSet(uint cpu) const -{ - return CPU_ISSET(cpu, &mask_); +inline uint 
Os::ThreadAffinityMask::countSet() const { + uint count = 0; + const uint32_t* bits = (const uint32_t*)mask_.__bits; + for (uint i = 0; i < sizeof(mask_.__bits) / sizeof(uint32_t); ++i) { + count += countBitsSet(bits[i]); + } + return count; } -inline bool -Os::ThreadAffinityMask::isEmpty() const -{ - const uint32_t* bits = (const uint32_t*)mask_.__bits; - for (uint i = 0; i < sizeof(mask_.__bits) / sizeof(uint32_t); ++i) { - if (bits[i] != 0) { - return false; +inline uint Os::ThreadAffinityMask::getFirstSet() const { + const uint32_t* bits = (const uint32_t*)mask_.__bits; + for (uint i = 0; i < sizeof(mask_.__bits) / sizeof(uint32_t); ++i) { + if (bits[i] != 0) { + return leastBitSet(bits[i]) + (i * (8 * sizeof(uint32_t))); + } + } + return (uint)-1; +} + +inline uint Os::ThreadAffinityMask::getNextSet(uint cpu) const { + const uint32_t* bits = (const uint32_t*)mask_.__bits; + ++cpu; + uint j = cpu % (8 * sizeof(uint32_t)); + for (uint i = cpu / (8 * sizeof(uint32_t)); i < sizeof(mask_.__bits) / sizeof(uint32_t); ++i) { + if (bits[i] != 0) { + for (; j < (8 * sizeof(uint32_t)); ++j) { + if (0 != (bits[i] & ((uint32_t)1 << j))) { + return i * (8 * sizeof(uint32_t)) + j; } + } } - return true; -} - -inline void -Os::ThreadAffinityMask::set(const cpu_set_t& mask) -{ - mask_ = mask; -} - -inline void -Os::ThreadAffinityMask::clear(const cpu_set_t& mask) -{ - const uint32_t* bitsClear = (const uint32_t*)mask.__bits; - uint32_t* bits = (uint32_t*)mask_.__bits; - for (uint i = 0; i < sizeof(mask_.__bits) / sizeof(uint32_t); ++i) { - bits[i] &= ~bitsClear[i]; - } -} - -inline void -Os::ThreadAffinityMask::adjust(cpu_set_t& mask) const -{ - uint32_t* bitsOut = (uint32_t*)mask.__bits; - const uint32_t* bits = (const uint32_t*)mask_.__bits; - for (uint i = 0; i < sizeof(mask_.__bits) / sizeof(uint32_t); ++i) { - bitsOut[i] &= bits[i]; - } -} - -inline uint -Os::ThreadAffinityMask::countSet() const -{ - uint count = 0; - const uint32_t* bits = (const uint32_t*)mask_.__bits; 
- for (uint i = 0; i < sizeof(mask_.__bits) / sizeof(uint32_t); ++i) { - count += countBitsSet(bits[i]); - } - return count; -} - -inline uint -Os::ThreadAffinityMask::getFirstSet() const -{ - const uint32_t* bits = (const uint32_t*)mask_.__bits; - for (uint i = 0; i < sizeof(mask_.__bits) / sizeof(uint32_t); ++i) { - if (bits[i] != 0) { - return leastBitSet(bits[i]) + (i * (8*sizeof(uint32_t))); - } - } - return (uint)-1; -} - -inline uint -Os::ThreadAffinityMask::getNextSet(uint cpu) const -{ - const uint32_t* bits = (const uint32_t*)mask_.__bits; - ++cpu; - uint j = cpu % (8*sizeof(uint32_t)); - for (uint i = cpu / (8*sizeof(uint32_t)); - i < sizeof(mask_.__bits) / sizeof(uint32_t); ++i) { - if (bits[i] != 0) { - for (; j < (8*sizeof(uint32_t)); ++j) { - if (0 != (bits[i] & ((uint32_t)1 << j))) { - return i * (8*sizeof(uint32_t)) + j; - } - } - } - j = 0; - } - return (uint)-1; + j = 0; + } + return (uint)-1; } #else -inline void -Os::ThreadAffinityMask::init() -{ - for (uint i = 0; i < sizeof(mask_) / sizeof(KAFFINITY); ++i) { - mask_[i] = (KAFFINITY)0; +inline void Os::ThreadAffinityMask::init() { + for (uint i = 0; i < sizeof(mask_) / sizeof(KAFFINITY); ++i) { + mask_[i] = (KAFFINITY)0; + } +} + +inline void Os::ThreadAffinityMask::set(uint cpu) { + mask_[cpu / (8 * sizeof(KAFFINITY))] |= (KAFFINITY)1 << (cpu % (8 * sizeof(KAFFINITY))); +} + +inline void Os::ThreadAffinityMask::clear(uint cpu) { + mask_[cpu / (8 * sizeof(KAFFINITY))] &= ~((KAFFINITY)1 << (cpu % (8 * sizeof(KAFFINITY)))); +} + +inline bool Os::ThreadAffinityMask::isSet(uint cpu) const { + return (KAFFINITY)0 != + (mask_[cpu / (8 * sizeof(KAFFINITY))] & ((KAFFINITY)1 << (cpu % (8 * sizeof(KAFFINITY))))); +} + +inline bool Os::ThreadAffinityMask::isEmpty() const { + for (uint i = 0; i < sizeof(mask_) / sizeof(KAFFINITY); ++i) { + if (mask_[i] != (KAFFINITY)0) { + return false; } + } + return true; } -inline void -Os::ThreadAffinityMask::set(uint cpu) -{ - mask_[cpu / (8*sizeof(KAFFINITY))] |= - 
(KAFFINITY)1 << (cpu % (8*sizeof(KAFFINITY))); +inline void Os::ThreadAffinityMask::set(size_t group, KAFFINITY affinity) { + mask_[group] |= affinity; } -inline void -Os::ThreadAffinityMask::clear(uint cpu) -{ - mask_[cpu / (8*sizeof(KAFFINITY))] &= - ~( (KAFFINITY)1 << (cpu % (8*sizeof(KAFFINITY))) ); +inline void Os::ThreadAffinityMask::adjust(size_t group, KAFFINITY& affinity) const { + affinity &= mask_[group]; } -inline bool -Os::ThreadAffinityMask::isSet(uint cpu) const -{ - return (KAFFINITY)0 != (mask_[cpu / (8*sizeof(KAFFINITY))] & - ((KAFFINITY)1 << (cpu % (8*sizeof(KAFFINITY))))); +inline uint Os::ThreadAffinityMask::countSet() const { + uint count = 0; + for (uint i = 0; i < sizeof(mask_) / sizeof(KAFFINITY); ++i) { + count += countBitsSet(mask_[i]); + } + return count; } -inline bool -Os::ThreadAffinityMask::isEmpty() const -{ - for (uint i = 0; i < sizeof(mask_) / sizeof(KAFFINITY); ++i) { - if (mask_[i] != (KAFFINITY)0) { - return false; +inline uint Os::ThreadAffinityMask::getFirstSet() const { + for (uint i = 0; i < sizeof(mask_) / sizeof(KAFFINITY); ++i) { + if (mask_[i] != 0) { + return leastBitSet(mask_[i]) + (i * (8 * sizeof(KAFFINITY))); + } + } + return (uint)-1; +} + +inline uint Os::ThreadAffinityMask::getNextSet(uint cpu) const { + ++cpu; + uint j = cpu % (8 * sizeof(KAFFINITY)); + for (uint i = cpu / (8 * sizeof(KAFFINITY)); i < sizeof(mask_) / sizeof(KAFFINITY); ++i) { + if (mask_[i] != 0) { + for (; j < (8 * sizeof(KAFFINITY)); ++j) { + if (0 != (mask_[i] & ((KAFFINITY)1 << j))) { + return i * (8 * sizeof(KAFFINITY)) + j; } + } } - return true; -} - -inline void -Os::ThreadAffinityMask::set(size_t group, KAFFINITY affinity) -{ - mask_[group] |= affinity; -} - -inline void -Os::ThreadAffinityMask::adjust(size_t group, KAFFINITY& affinity) const -{ - affinity &= mask_[group]; -} - -inline uint -Os::ThreadAffinityMask::countSet() const -{ - uint count = 0; - for (uint i = 0; i < sizeof(mask_) / sizeof(KAFFINITY); ++i) { - count += 
countBitsSet(mask_[i]); - } - return count; -} - -inline uint -Os::ThreadAffinityMask::getFirstSet() const -{ - for (uint i = 0; i < sizeof(mask_) / sizeof(KAFFINITY); ++i) { - if (mask_[i] != 0) { - return leastBitSet(mask_[i]) + (i * (8*sizeof(KAFFINITY))); - } - } - return (uint)-1; -} - -inline uint -Os::ThreadAffinityMask::getNextSet(uint cpu) const -{ - ++cpu; - uint j = cpu % (8*sizeof(KAFFINITY)); - for (uint i = cpu / (8*sizeof(KAFFINITY)); - i < sizeof(mask_) / sizeof(KAFFINITY); ++i) { - if (mask_[i] != 0) { - for (; j < (8*sizeof(KAFFINITY)); ++j) { - if (0 != (mask_[i] & ((KAFFINITY)1 << j))) { - return i * (8*sizeof(KAFFINITY)) + j; - } - } - } - j = 0; - } - return (uint)-1; + j = 0; + } + return (uint)-1; } #endif -} // namespace amd +} // namespace amd #endif /*OS_HPP_*/ diff --git a/rocclr/runtime/os/os_posix.cpp b/rocclr/runtime/os/os_posix.cpp index 26cde31c9c..b82ca1861e 100644 --- a/rocclr/runtime/os/os_posix.cpp +++ b/rocclr/runtime/os/os_posix.cpp @@ -29,582 +29,476 @@ #include #include #ifndef DT_GNU_HASH -# define DT_GNU_HASH 0x6ffffef5 -#endif // DT_GNU_HASH +#define DT_GNU_HASH 0x6ffffef5 +#endif // DT_GNU_HASH #include #include #include #include -#include // for strncmp +#include // for strncmp #include -#include // for tempnam +#include // for tempnam #include #include #include - - namespace amd { static struct sigaction oldSigAction; -static bool -callOldSignalHandler(int sig, siginfo_t* info, void* ptr) -{ - if (oldSigAction.sa_handler == SIG_DFL) { - // no signal handler was previously installed. 
- return false; - } - else if (oldSigAction.sa_handler != SIG_IGN) { - - if ((oldSigAction.sa_flags & SA_NODEFER) == 0) { - sigaddset(&oldSigAction.sa_mask, sig); - } - - void (*handler)(int) = oldSigAction.sa_handler; - if (oldSigAction.sa_flags & SA_RESETHAND) { - oldSigAction.sa_handler = SIG_DFL; - } - - sigset_t savedSigSet; - pthread_sigmask(SIG_SETMASK, &oldSigAction.sa_mask, &savedSigSet); - - if (oldSigAction.sa_flags & SA_SIGINFO) { - oldSigAction.sa_sigaction(sig, info, ptr); - } - else { - handler(sig); - } - - pthread_sigmask(SIG_SETMASK, &savedSigSet, NULL); +static bool callOldSignalHandler(int sig, siginfo_t* info, void* ptr) { + if (oldSigAction.sa_handler == SIG_DFL) { + // no signal handler was previously installed. + return false; + } else if (oldSigAction.sa_handler != SIG_IGN) { + if ((oldSigAction.sa_flags & SA_NODEFER) == 0) { + sigaddset(&oldSigAction.sa_mask, sig); } - return true; + void (*handler)(int) = oldSigAction.sa_handler; + if (oldSigAction.sa_flags & SA_RESETHAND) { + oldSigAction.sa_handler = SIG_DFL; + } + + sigset_t savedSigSet; + pthread_sigmask(SIG_SETMASK, &oldSigAction.sa_mask, &savedSigSet); + + if (oldSigAction.sa_flags & SA_SIGINFO) { + oldSigAction.sa_sigaction(sig, info, ptr); + } else { + handler(sig); + } + + pthread_sigmask(SIG_SETMASK, &savedSigSet, NULL); + } + + return true; } -static void -divisionErrorHandler(int sig, siginfo_t* info, void* ptr) -{ - assert(info != NULL && ptr != NULL && "just checking"); - ucontext_t* uc = (ucontext_t*) ptr; - address insn; +static void divisionErrorHandler(int sig, siginfo_t* info, void* ptr) { + assert(info != NULL && ptr != NULL && "just checking"); + ucontext_t* uc = (ucontext_t*)ptr; + address insn; #if defined(ATI_ARCH_X86) - insn = (address)uc->uc_mcontext.gregs[LP64_SWITCH(REG_EIP,REG_RIP)]; + insn = (address)uc->uc_mcontext.gregs[LP64_SWITCH(REG_EIP, REG_RIP)]; #else - assert(!"Unimplemented"); + assert(!"Unimplemented"); #endif - 
if(Thread::current()->isWorkerThread()) { - if (Os::skipIDIV(insn)) { + if (Thread::current()->isWorkerThread()) { + if (Os::skipIDIV(insn)) { #if defined(ATI_ARCH_X86) - uc->uc_mcontext.gregs[LP64_SWITCH(REG_EIP,REG_RIP)] = (greg_t)insn; + uc->uc_mcontext.gregs[LP64_SWITCH(REG_EIP, REG_RIP)] = (greg_t)insn; #else - assert(!"Unimplemented"); + assert(!"Unimplemented"); #endif - return; - } + return; } + } - // Call the chained signal handler - if (callOldSignalHandler(sig, info, ptr)) { - return; - } + // Call the chained signal handler + if (callOldSignalHandler(sig, info, ptr)) { + return; + } - std::cerr << "Unhandled signal in divisionErrorHandler()" << std::endl; - ::abort(); + std::cerr << "Unhandled signal in divisionErrorHandler()" << std::endl; + ::abort(); } -typedef int (*pthread_setaffinity_fn)(pthread_t, size_t , const cpu_set_t *); +typedef int (*pthread_setaffinity_fn)(pthread_t, size_t, const cpu_set_t*); static pthread_setaffinity_fn pthread_setaffinity_fptr; static void init() __attribute__((constructor(101))); static void init() { Os::init(); } bool Os::installSigfpeHandler() { - // Install a SIGFPE signal handler @todo: Chain the handlers - struct sigaction sa; - sigfillset(&sa.sa_mask); - sa.sa_handler = SIG_DFL; - sa.sa_sigaction = divisionErrorHandler; - sa.sa_flags = SA_SIGINFO | SA_RESTART; + // Install a SIGFPE signal handler @todo: Chain the handlers + struct sigaction sa; + sigfillset(&sa.sa_mask); + sa.sa_handler = SIG_DFL; + sa.sa_sigaction = divisionErrorHandler; + sa.sa_flags = SA_SIGINFO | SA_RESTART; - if (sigaction(SIGFPE, &sa, &oldSigAction) != 0) { - return false; - } + if (sigaction(SIGFPE, &sa, &oldSigAction) != 0) { + return false; + } + return true; +} + +void Os::uninstallSigfpeHandler() {} + +bool Os::init() { + static bool initialized_ = false; + + // We could use pthread_once here: + if (initialized_) { return true; -} + } + initialized_ = true; -void Os::uninstallSigfpeHandler() { -} + pageSize_ = 
(size_t)::sysconf(_SC_PAGESIZE); + processorCount_ = ::sysconf(_SC_NPROCESSORS_CONF); -bool -Os::init() -{ - static bool initialized_ = false; + pthread_setaffinity_fptr = (pthread_setaffinity_fn)dlsym(RTLD_NEXT, "pthread_setaffinity_np"); - // We could use pthread_once here: - if (initialized_) { - return true; - } - initialized_ = true; - - pageSize_ = (size_t) ::sysconf(_SC_PAGESIZE); - processorCount_ = ::sysconf(_SC_NPROCESSORS_CONF); - - pthread_setaffinity_fptr = (pthread_setaffinity_fn) - dlsym(RTLD_NEXT, "pthread_setaffinity_np"); - - return Thread::init(); + return Thread::init(); } static void __exit() __attribute__((destructor(101))); static void __exit() { Os::tearDown(); } -void -Os::tearDown() -{ - Thread::tearDown(); -} +void Os::tearDown() { Thread::tearDown(); } -bool -Os::iterateSymbols(void* handle, Os::SymbolCallback callback, void* data) -{ - const char magic[] = "__OpenCL_"; - const size_t len = sizeof(magic) - 1; +bool Os::iterateSymbols(void* handle, Os::SymbolCallback callback, void* data) { + const char magic[] = "__OpenCL_"; + const size_t len = sizeof(magic) - 1; - struct link_map *link_map = NULL; - if (::dlinfo(handle, RTLD_DI_LINKMAP, &link_map) != 0) { - return false; + struct link_map* link_map = NULL; + if (::dlinfo(handle, RTLD_DI_LINKMAP, &link_map) != 0) { + return false; + } + + assert(link_map != NULL && "just checking"); + const ElfW(Dyn)* dyn = (ElfW(Dyn)*)(link_map->l_ld); + + const Elf32_Word* gnuhash = NULL; + const Elf_Symndx* hash = NULL; + const ElfW(Sym)* symbols = NULL; + const char* stringTable = NULL; + size_t tableSize = 0; + + // Search for the string table address and size. 
+ while (dyn->d_tag != DT_NULL) { + switch (dyn->d_tag) { + case DT_HASH: + hash = (Elf_Symndx*)dyn->d_un.d_ptr; + break; + case DT_GNU_HASH: + gnuhash = (Elf32_Word*)dyn->d_un.d_ptr; + break; + case DT_SYMTAB: + symbols = (ElfW(Sym)*)dyn->d_un.d_ptr; + break; + case DT_STRTAB: + stringTable = (const char*)dyn->d_un.d_ptr; + break; + case DT_STRSZ: + tableSize = dyn->d_un.d_val; + break; + default: + break; } + ++dyn; + } + if (stringTable == NULL || tableSize == 0 || symbols == NULL || + (hash == NULL && gnuhash == NULL)) { + // Could not find the string table + return false; + } - assert(link_map != NULL && "just checking"); - const ElfW(Dyn)* dyn = (ElfW(Dyn)*)(link_map->l_ld); - - const Elf32_Word* gnuhash = NULL; - const Elf_Symndx* hash = NULL; - const ElfW(Sym)* symbols = NULL; - const char* stringTable = NULL; - size_t tableSize = 0; - - // Search for the string table address and size. - while (dyn->d_tag != DT_NULL) { - switch (dyn->d_tag) { - case DT_HASH: - hash = (Elf_Symndx*) dyn->d_un.d_ptr; - break; - case DT_GNU_HASH: - gnuhash = (Elf32_Word*) dyn->d_un.d_ptr; - break; - case DT_SYMTAB: - symbols = (ElfW(Sym)*) dyn->d_un.d_ptr; - break; - case DT_STRTAB: - stringTable = (const char*) dyn->d_un.d_ptr; - break; - case DT_STRSZ: - tableSize = dyn->d_un.d_val; - break; - default: - break; - } - ++dyn; - } - if (stringTable == NULL || tableSize == 0 || symbols == NULL - || (hash == NULL && gnuhash == NULL)) { - // Could not find the string table - return false; - } - - if (gnuhash == NULL) { - // Read the defined symbols out of the classic SYSV hashtable. 
- - Elf_Symndx nbuckets = hash[1]; - for (Elf_Symndx i = 0; i < nbuckets; ++i) { - - if (symbols[i].st_shndx == SHN_UNDEF - && symbols[i].st_value == 0) { - continue; - } - - const char* name = &stringTable[symbols[i].st_name]; - if (::strncmp(name, magic, len) == 0) { - callback(name, (const void*) - (link_map->l_addr + symbols[i].st_value), data); - } - } - return true; - } - - // Read the defined symbols out of the GNU hashtable. - - Elf_Symndx nbuckets = gnuhash[0]; - Elf32_Word bias = gnuhash[1]; - Elf32_Word nwords = gnuhash[2]; - const Elf32_Word* buckets = &gnuhash[4 + __ELF_NATIVE_CLASS / 32 * nwords]; - const Elf32_Word* chain0 = &buckets[nbuckets] - bias; + if (gnuhash == NULL) { + // Read the defined symbols out of the classic SYSV hashtable. + Elf_Symndx nbuckets = hash[1]; for (Elf_Symndx i = 0; i < nbuckets; ++i) { - size_t index = buckets[i]; - const Elf32_Word *hasharr = &chain0[index]; - do { - if (symbols[index].st_shndx != SHN_UNDEF - || symbols[index].st_value != 0) { - const char* name = &stringTable[symbols[index].st_name]; - if (::strncmp(name, magic, len) == 0) { - callback(name, (const void*) - (link_map->l_addr + symbols[index].st_value), data); - } - } - ++index; - } while ((*hasharr++ & 1) == 0); - } + if (symbols[i].st_shndx == SHN_UNDEF && symbols[i].st_value == 0) { + continue; + } + const char* name = &stringTable[symbols[i].st_name]; + if (::strncmp(name, magic, len) == 0) { + callback(name, (const void*)(link_map->l_addr + symbols[i].st_value), data); + } + } return true; -} + } -void* -Os::loadLibrary_(const char *filename) -{ - return (*filename == '\0') ? NULL : ::dlopen(filename, RTLD_LAZY); -} + // Read the defined symbols out of the GNU hashtable. 
-void -Os::unloadLibrary(void* handle) -{ - ::dlclose(handle); -} + Elf_Symndx nbuckets = gnuhash[0]; + Elf32_Word bias = gnuhash[1]; + Elf32_Word nwords = gnuhash[2]; + const Elf32_Word* buckets = &gnuhash[4 + __ELF_NATIVE_CLASS / 32 * nwords]; + const Elf32_Word* chain0 = &buckets[nbuckets] - bias; -void* -Os::getSymbol(void* handle, const char* name) -{ - return ::dlsym(handle, name); -} - -static inline int -memProtToOsProt(Os::MemProt prot) -{ - switch (prot) { - case Os::MEM_PROT_NONE: return PROT_NONE; - case Os::MEM_PROT_READ: return PROT_READ; - case Os::MEM_PROT_RW: return PROT_READ | PROT_WRITE; - case Os::MEM_PROT_RWX: return PROT_READ | PROT_WRITE | PROT_EXEC; - default: break; - } - ShouldNotReachHere(); - return -1; -} - -address -Os::reserveMemory(address start, size_t size, size_t alignment, MemProt prot) -{ - size = alignUp(size, pageSize()); - alignment = std::max(pageSize(), alignUp(alignment, pageSize())); - assert(isPowerOfTwo(alignment) && "not a power of 2"); - - size_t requested = size + alignment - pageSize(); - address mem = (address) ::mmap(start, requested, memProtToOsProt(prot), - MAP_PRIVATE | MAP_NORESERVE | MAP_ANONYMOUS, 0, 0); - - // check for out of memory - if (mem == NULL) return NULL; - - address aligned = alignUp(mem, alignment); - - // return the unused leading pages to the free state - if (&aligned[0] != &mem[0]) { - assert(&aligned[0] > &mem[0] && "check this code"); - if (::munmap(&mem[0], &aligned[0] - &mem[0]) != 0) { - assert(!"::munmap failed"); + for (Elf_Symndx i = 0; i < nbuckets; ++i) { + size_t index = buckets[i]; + const Elf32_Word* hasharr = &chain0[index]; + do { + if (symbols[index].st_shndx != SHN_UNDEF || symbols[index].st_value != 0) { + const char* name = &stringTable[symbols[index].st_name]; + if (::strncmp(name, magic, len) == 0) { + callback(name, (const void*)(link_map->l_addr + symbols[index].st_value), data); } + } + ++index; + } while ((*hasharr++ & 1) == 0); + } + + return true; +} + +void* 
Os::loadLibrary_(const char* filename) { + return (*filename == '\0') ? NULL : ::dlopen(filename, RTLD_LAZY); +} + +void Os::unloadLibrary(void* handle) { ::dlclose(handle); } + +void* Os::getSymbol(void* handle, const char* name) { return ::dlsym(handle, name); } + +static inline int memProtToOsProt(Os::MemProt prot) { + switch (prot) { + case Os::MEM_PROT_NONE: + return PROT_NONE; + case Os::MEM_PROT_READ: + return PROT_READ; + case Os::MEM_PROT_RW: + return PROT_READ | PROT_WRITE; + case Os::MEM_PROT_RWX: + return PROT_READ | PROT_WRITE | PROT_EXEC; + default: + break; + } + ShouldNotReachHere(); + return -1; +} + +address Os::reserveMemory(address start, size_t size, size_t alignment, MemProt prot) { + size = alignUp(size, pageSize()); + alignment = std::max(pageSize(), alignUp(alignment, pageSize())); + assert(isPowerOfTwo(alignment) && "not a power of 2"); + + size_t requested = size + alignment - pageSize(); + address mem = (address)::mmap(start, requested, memProtToOsProt(prot), + MAP_PRIVATE | MAP_NORESERVE | MAP_ANONYMOUS, 0, 0); + + // check for out of memory + if (mem == NULL) return NULL; + + address aligned = alignUp(mem, alignment); + + // return the unused leading pages to the free state + if (&aligned[0] != &mem[0]) { + assert(&aligned[0] > &mem[0] && "check this code"); + if (::munmap(&mem[0], &aligned[0] - &mem[0]) != 0) { + assert(!"::munmap failed"); } - // return the unused trailing pages to the free state - if (&aligned[size] != &mem[requested]) { - assert(&aligned[size] < &mem[requested] && "check this code"); - if (::munmap(&aligned[size], &mem[requested] - &aligned[size]) != 0) { - assert(!"::munmap failed"); - } + } + // return the unused trailing pages to the free state + if (&aligned[size] != &mem[requested]) { + assert(&aligned[size] < &mem[requested] && "check this code"); + if (::munmap(&aligned[size], &mem[requested] - &aligned[size]) != 0) { + assert(!"::munmap failed"); } + } - return aligned; + return aligned; } -bool 
-Os::releaseMemory(void* addr, size_t size) -{ - assert(isMultipleOf(addr, pageSize()) && "not page aligned!"); - size = alignUp(size, pageSize()); +bool Os::releaseMemory(void* addr, size_t size) { + assert(isMultipleOf(addr, pageSize()) && "not page aligned!"); + size = alignUp(size, pageSize()); - return 0 == ::munmap(addr, size); + return 0 == ::munmap(addr, size); } -bool -Os::commitMemory(void* addr, size_t size, MemProt prot) -{ - assert(isMultipleOf(addr, pageSize()) && "not page aligned!"); - size = alignUp(size, pageSize()); +bool Os::commitMemory(void* addr, size_t size, MemProt prot) { + assert(isMultipleOf(addr, pageSize()) && "not page aligned!"); + size = alignUp(size, pageSize()); - return ::mmap(addr, size, memProtToOsProt(prot), - MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, - -1, 0) != MAP_FAILED; + return ::mmap(addr, size, memProtToOsProt(prot), MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, -1, + 0) != MAP_FAILED; } -bool -Os::uncommitMemory(void* addr, size_t size) -{ - assert(isMultipleOf(addr, pageSize()) && "not page aligned!"); - size = alignUp(size, pageSize()); +bool Os::uncommitMemory(void* addr, size_t size) { + assert(isMultipleOf(addr, pageSize()) && "not page aligned!"); + size = alignUp(size, pageSize()); - return ::mmap(addr, size, PROT_NONE, - MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE | MAP_ANONYMOUS, - -1, 0) != MAP_FAILED; + return ::mmap(addr, size, PROT_NONE, MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE | MAP_ANONYMOUS, -1, + 0) != MAP_FAILED; } -bool -Os::protectMemory(void* addr, size_t size, MemProt prot) -{ - assert(isMultipleOf(addr, pageSize()) && "not page aligned!"); - size = alignUp(size, pageSize()); +bool Os::protectMemory(void* addr, size_t size, MemProt prot) { + assert(isMultipleOf(addr, pageSize()) && "not page aligned!"); + size = alignUp(size, pageSize()); - return 0 == ::mprotect(addr, size, memProtToOsProt(prot)); + return 0 == ::mprotect(addr, size, memProtToOsProt(prot)); } -uint64_t -Os::hostTotalPhysicalMemory() -{ - 
static uint64_t totalPhys = 0; +uint64_t Os::hostTotalPhysicalMemory() { + static uint64_t totalPhys = 0; - if (totalPhys != 0) { - return totalPhys; - } - - totalPhys = sysconf(_SC_PAGESIZE) * sysconf(_SC_PHYS_PAGES); + if (totalPhys != 0) { return totalPhys; + } + + totalPhys = sysconf(_SC_PAGESIZE) * sysconf(_SC_PHYS_PAGES); + return totalPhys; } -void* -Os::alignedMalloc(size_t size, size_t alignment) -{ - void * ptr = NULL; - if (0 == ::posix_memalign(&ptr, alignment, size)) { - return ptr; +void* Os::alignedMalloc(size_t size, size_t alignment) { + void* ptr = NULL; + if (0 == ::posix_memalign(&ptr, alignment, size)) { + return ptr; + } + return NULL; +} + +void Os::alignedFree(void* mem) { ::free(mem); } + +void Os::currentStackInfo(address* base, size_t* size) { + // There could be some issue trying to get the pthread_attr of + // the primordial thread if the pthread library is not present + // at load time (a binary loads the OpenCL app/runtime dynamically. + // We should look into this... 
-laurent + + pthread_t self = ::pthread_self(); + + pthread_attr_t threadAttr; + if (0 != ::pthread_getattr_np(self, &threadAttr)) { + fatal("pthread_getattr_np() failed"); + } + + if (0 != ::pthread_attr_getstack(&threadAttr, (void**)base, size)) { + fatal("pthread_attr_getstack() failed"); + } + *base += *size; + + ::pthread_attr_destroy(&threadAttr); + + assert(Os::currentStackPtr() >= *base - *size && Os::currentStackPtr() < *base && + "just checking"); +} + +void Os::setCurrentThreadName(const char* name) { ::prctl(PR_SET_NAME, name); } + + +void* Thread::entry(Thread* thread) { + sigset_t set; + + sigfillset(&set); + pthread_sigmask(SIG_BLOCK, &set, NULL); + + sigemptyset(&set); + sigaddset(&set, SIGFPE); + pthread_sigmask(SIG_UNBLOCK, &set, NULL); + + return thread->main(); +} + +bool Os::isThreadAlive(const Thread& thread) { return true; } + +const void* Os::createOsThread(amd::Thread* thread) { + pthread_attr_t threadAttr; + ::pthread_attr_init(&threadAttr); + + if (thread->stackSize_ != 0) { + size_t guardsize = 0; + if (0 != ::pthread_attr_getguardsize(&threadAttr, &guardsize)) { + fatal("pthread_attr_getguardsize() failed"); } - return NULL; -} + ::pthread_attr_setstacksize(&threadAttr, thread->stackSize_ + guardsize); + } -void -Os::alignedFree(void *mem) -{ - ::free(mem); -} + // We never plan the use join, so free the resources now. + ::pthread_attr_setdetachstate(&threadAttr, PTHREAD_CREATE_DETACHED); -void -Os::currentStackInfo(address* base, size_t *size) -{ - // There could be some issue trying to get the pthread_attr of - // the primordial thread if the pthread library is not present - // at load time (a binary loads the OpenCL app/runtime dynamically. - // We should look into this... 
-laurent + pthread_t handle = 0; + if (0 != ::pthread_create(&handle, &threadAttr, (void* (*)(void*)) & Thread::entry, thread)) { + thread->setState(Thread::FAILED); + } - pthread_t self = ::pthread_self(); - - pthread_attr_t threadAttr; - if (0 != ::pthread_getattr_np(self, &threadAttr)) { - fatal("pthread_getattr_np() failed"); - } - - if (0 != ::pthread_attr_getstack(&threadAttr, - (void **) base, size)) { - fatal("pthread_attr_getstack() failed"); - } - *base += *size; - - ::pthread_attr_destroy(&threadAttr); - - assert(Os::currentStackPtr() >= *base - *size - && Os::currentStackPtr() < *base - && "just checking"); -} - -void -Os::setCurrentThreadName(const char* name) -{ - ::prctl(PR_SET_NAME, name); + ::pthread_attr_destroy(&threadAttr); + return reinterpret_cast(handle); } -void* -Thread::entry(Thread* thread) -{ - sigset_t set; - - sigfillset(&set); - pthread_sigmask(SIG_BLOCK, &set, NULL); - - sigemptyset(&set); - sigaddset(&set, SIGFPE); - pthread_sigmask(SIG_UNBLOCK, &set, NULL); - - return thread->main(); +void Os::setThreadAffinity(const void* handle, const Os::ThreadAffinityMask& mask) { + if (pthread_setaffinity_fptr != NULL) { + pthread_setaffinity_fptr((pthread_t)handle, sizeof(cpu_set_t), &mask.mask_); + } } -bool -Os::isThreadAlive(const Thread& thread) -{ - return true; +void Os::yield() { ::sched_yield(); } + +uint64_t Os::timeNanos() { + struct timespec tp; + ::clock_gettime(CLOCK_MONOTONIC, &tp); + return (uint64_t)tp.tv_sec * (1000ULL * 1000ULL * 1000ULL) + (uint64_t)tp.tv_nsec; } -const void* -Os::createOsThread(amd::Thread* thread) -{ - pthread_attr_t threadAttr; - ::pthread_attr_init(&threadAttr); - - if (thread->stackSize_ != 0) { - size_t guardsize = 0; - if (0 != ::pthread_attr_getguardsize(&threadAttr, - &guardsize)) { - fatal("pthread_attr_getguardsize() failed"); - } - ::pthread_attr_setstacksize(&threadAttr, thread->stackSize_ + guardsize); - } - - // We never plan the use join, so free the resources now. 
- ::pthread_attr_setdetachstate(&threadAttr, PTHREAD_CREATE_DETACHED); - - pthread_t handle = 0; - if (0 != ::pthread_create(&handle, &threadAttr, - (void* (*)(void*)) &Thread::entry, thread)) { - thread->setState(Thread::FAILED); - } - - ::pthread_attr_destroy(&threadAttr); - return reinterpret_cast(handle); -} - - -void -Os::setThreadAffinity(const void* handle, const Os::ThreadAffinityMask& mask) -{ - if (pthread_setaffinity_fptr != NULL) { - pthread_setaffinity_fptr((pthread_t)handle, sizeof(cpu_set_t), &mask.mask_); - } -} - -void -Os::yield() -{ - ::sched_yield(); -} - -uint64_t -Os::timeNanos() -{ +uint64_t Os::timerResolutionNanos() { + static uint64_t resolution = 0; + if (resolution == 0) { struct timespec tp; - ::clock_gettime(CLOCK_MONOTONIC, &tp); - return (uint64_t) tp.tv_sec * (1000ULL*1000ULL*1000ULL) - + (uint64_t) tp.tv_nsec; + ::clock_getres(CLOCK_MONOTONIC, &tp); + resolution = (uint64_t)tp.tv_sec * (1000ULL * 1000ULL * 1000ULL) + (uint64_t)tp.tv_nsec; + } + return resolution; } -uint64_t -Os::timerResolutionNanos() -{ - static uint64_t resolution = 0; - if (resolution == 0) { - struct timespec tp; - ::clock_getres(CLOCK_MONOTONIC, &tp); - resolution = (uint64_t) tp.tv_sec * (1000ULL*1000ULL*1000ULL) - + (uint64_t) tp.tv_nsec; + +const char* Os::libraryExtension() { return MACOS_SWITCH(".dylib", ".so"); } + +const char* Os::libraryPrefix() { return "lib"; } + +const char* Os::objectExtension() { return ".o"; } + +char Os::fileSeparator() { return '/'; } + +char Os::pathSeparator() { return ':'; } + +bool Os::pathExists(const std::string& path) { + struct stat st; + if (stat(path.c_str(), &st) != 0) return false; + return S_ISDIR(st.st_mode); +} + +bool Os::createPath(const std::string& path) { + mode_t mode = S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH; + size_t pos = 0; + while (true) { + pos = path.find(fileSeparator(), pos); + const std::string currPath = path.substr(0, pos); + if (!currPath.empty() && !pathExists(currPath)) { + int ret = 
mkdir(currPath.c_str(), mode); + if (ret == -1) return false; } - return resolution; + if (pos == std::string::npos) break; + ++pos; + } + return true; } - -const char* -Os::libraryExtension() -{ - return MACOS_SWITCH(".dylib", ".so"); -} - -const char* -Os::libraryPrefix() -{ - return "lib"; -} - -const char* -Os::objectExtension() -{ - return ".o"; -} - -char -Os::fileSeparator() -{ - return '/'; -} - -char -Os::pathSeparator() -{ - return ':'; -} - -bool Os::pathExists(const std::string& path) -{ - struct stat st; - if (stat(path.c_str(), &st) != 0) - return false; - return S_ISDIR(st.st_mode); -} - -bool Os::createPath(const std::string& path) -{ - mode_t mode = S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH; - size_t pos = 0; - while (true) { - pos = path.find(fileSeparator(), pos); - const std::string currPath = path.substr(0, pos); - if (!currPath.empty() && !pathExists(currPath)) { - int ret = mkdir(currPath.c_str(), mode); - if (ret == -1) return false; - } - if (pos == std::string::npos) break; - ++pos; +bool Os::removePath(const std::string& path) { + size_t pos = std::string::npos; + bool removed = false; + while (true) { + const std::string currPath = path.substr(0, pos); + if (!currPath.empty()) { + int ret = rmdir(currPath.c_str()); + if (ret == -1) return removed; + removed = true; } - return true; + if (pos == 0) break; + pos = path.rfind(fileSeparator(), pos == std::string::npos ? pos : pos - 1); + if (pos == std::string::npos) break; + } + return true; } -bool Os::removePath(const std::string& path) -{ - size_t pos = std::string::npos; - bool removed =false; - while (true) { - const std::string currPath = path.substr(0, pos); - if (!currPath.empty()) { - int ret = rmdir(currPath.c_str()); - if (ret == -1) return removed; - removed = true; - } - if (pos == 0) break; - pos = path.rfind(fileSeparator(), pos == std::string::npos?pos:pos-1); - if (pos == std::string::npos) break; - } - return true; -} +int Os::printf(const char* fmt, ...) 
{ + va_list ap; -int Os::printf(const char* fmt, ...) -{ - va_list ap; + va_start(ap, fmt); + int len = ::vprintf(fmt, ap); + va_end(ap); - va_start(ap, fmt); - int len = ::vprintf(fmt, ap); - va_end(ap); - - return len; + return len; } // Os::systemCall() @@ -625,137 +519,126 @@ int Os::printf(const char* fmt, ...) // // Note that stdin/stdout/stderr of the command are sent to /dev/null. // -int -Os::systemCall(const std::string& command) -{ +int Os::systemCall(const std::string& command) { #if 1 - size_t len = command.size(); - char* cmd = new char[len + 1]; - fastMemcpy(cmd, command.c_str(), len); - cmd[len] = 0; + size_t len = command.size(); + char* cmd = new char[len + 1]; + fastMemcpy(cmd, command.c_str(), len); + cmd[len] = 0; - // Split the command into arguments. This is a very - // simple parser that only takes care of quotes and - // doesn't support escaping with back-slash. In - // the future, Os::systemCall() will either - // disappear or it will be replaced with an - // argc/argv interface. This parser also assumes - // that if an argument is quoted, the whole - // argument starts and ends with a double-quote. - bool inQuote = false; - int argLength = 0; - int n = 0; - char* cp = cmd; - while(*cp) { - switch(static_cast(*cp)) { - case ' ': - if(inQuote) { - ++argLength; - } - else { - *cp = '\0'; - argLength = 0; - } - break; - case '"': - if(inQuote) { - inQuote = false; - *cp = '\0'; - } - else { - inQuote = true; - *cp = '\0'; - argLength = 1; - ++n; - } - break; - default: - if(++argLength == 1) { - ++n; - } - break; + // Split the command into arguments. This is a very + // simple parser that only takes care of quotes and + // doesn't support escaping with back-slash. In + // the future, Os::systemCall() will either + // disappear or it will be replaced with an + // argc/argv interface. This parser also assumes + // that if an argument is quoted, the whole + // argument starts and ends with a double-quote. 
+ bool inQuote = false; + int argLength = 0; + int n = 0; + char* cp = cmd; + while (*cp) { + switch (static_cast(*cp)) { + case ' ': + if (inQuote) { + ++argLength; + } else { + *cp = '\0'; + argLength = 0; } - ++cp; + break; + case '"': + if (inQuote) { + inQuote = false; + *cp = '\0'; + } else { + inQuote = true; + *cp = '\0'; + argLength = 1; + ++n; + } + break; + default: + if (++argLength == 1) { + ++n; + } + break; } + ++cp; + } - char** argv = new char*[n + 1]; - int argc = 0; - cp = cmd; - do { - while('\0' == *cp) { - ++cp; - } - argv[argc++] = cp; - while('\0' != *cp) { - ++cp; - } - } while(argc < n); - argv[argc] = NULL; - - int ret = -1; - pid_t pid = vfork(); - if(0 == pid) { - // Child. Redirect stdin/stdout/stderr to /dev/null - int fdIn = open("/dev/null", O_RDONLY); - int fdOut = open("/dev/null", O_WRONLY); - if(0 <= fdIn || 0 <= fdOut) { - dup2(fdIn, 0); - dup2(fdOut, 1); - dup2(fdOut, 2); - - // Execute the program - execvp(argv[0], argv); - } - _exit(-1); + char** argv = new char*[n + 1]; + int argc = 0; + cp = cmd; + do { + while ('\0' == *cp) { + ++cp; } - else if(0 > pid) { - // Can't vfork + argv[argc++] = cp; + while ('\0' != *cp) { + ++cp; } - else { - // Parent - wait for program to complete and get exit code. - int exitCode; - if(0 <= waitpid(pid, &exitCode, 0)) { - ret = exitCode; - } - } - delete [] argv; - delete [] cmd; + } while (argc < n); + argv[argc] = NULL; - return ret; + int ret = -1; + pid_t pid = vfork(); + if (0 == pid) { + // Child. Redirect stdin/stdout/stderr to /dev/null + int fdIn = open("/dev/null", O_RDONLY); + int fdOut = open("/dev/null", O_WRONLY); + if (0 <= fdIn || 0 <= fdOut) { + dup2(fdIn, 0); + dup2(fdOut, 1); + dup2(fdOut, 2); + + // Execute the program + execvp(argv[0], argv); + } + _exit(-1); + } else if (0 > pid) { + // Can't vfork + } else { + // Parent - wait for program to complete and get exit code. 
+ int exitCode; + if (0 <= waitpid(pid, &exitCode, 0)) { + ret = exitCode; + } + } + delete[] argv; + delete[] cmd; + + return ret; #else - return ::system(command.c_str()); + return ::system(command.c_str()); #endif } -std::string -Os::getEnvironment(const std::string& name) -{ - char* dstBuf; +std::string Os::getEnvironment(const std::string& name) { + char* dstBuf; - dstBuf = ::getenv(name.c_str()); - if (dstBuf == NULL) { - return std::string(""); - } - return std::string(dstBuf); + dstBuf = ::getenv(name.c_str()); + if (dstBuf == NULL) { + return std::string(""); + } + return std::string(dstBuf); } -std::string -Os::getTempPath() -{ - std::string tempFolder = amd::Os::getEnvironment("TEMP"); - if (tempFolder.empty()) { - tempFolder = amd::Os::getEnvironment("TMP"); - } +std::string Os::getTempPath() { + std::string tempFolder = amd::Os::getEnvironment("TEMP"); + if (tempFolder.empty()) { + tempFolder = amd::Os::getEnvironment("TMP"); + } - if (tempFolder.empty()) { - tempFolder = "/tmp";; - } - return tempFolder; + if (tempFolder.empty()) { + tempFolder = "/tmp"; + ; + } + return tempFolder; } -std::string -Os::getTempFileName() -{ +std::string Os::getTempFileName() { static std::atomic_size_t counter(0); std::string tempPath = getTempPath(); @@ -765,120 +648,98 @@ Os::getTempFileName() return tempFileName.str(); } -int -Os::unlink(const std::string& path) -{ - return ::unlink(path.c_str()); -} +int Os::unlink(const std::string& path) { return ::unlink(path.c_str()); } #if defined(ATI_ARCH_X86) -void -Os::cpuid(int regs[4], int info) -{ +void Os::cpuid(int regs[4], int info) { #ifdef _LP64 - __asm__ __volatile__ ( - "movq %%rbx, %%rsi;" - "cpuid;" - "xchgq %%rbx, %%rsi;" - : "=a" (regs[0]), "=S" (regs[1]), "=c" (regs[2]), "=d" (regs[3]) - : "a" (info)); + __asm__ __volatile__( + "movq %%rbx, %%rsi;" + "cpuid;" + "xchgq %%rbx, %%rsi;" + : "=a"(regs[0]), "=S"(regs[1]), "=c"(regs[2]), "=d"(regs[3]) + : "a"(info)); #else - __asm__ __volatile__ ( - "movl %%ebx, 
%%esi;" - "cpuid;" - "xchgl %%ebx, %%esi;" - : "=a" (regs[0]), "=S" (regs[1]), "=c" (regs[2]), "=d" (regs[3]) - : "a" (info)); + __asm__ __volatile__( + "movl %%ebx, %%esi;" + "cpuid;" + "xchgl %%ebx, %%esi;" + : "=a"(regs[0]), "=S"(regs[1]), "=c"(regs[2]), "=d"(regs[3]) + : "a"(info)); #endif } -uint64_t -Os::xgetbv(uint32_t ecx) -{ - uint32_t eax, edx; +uint64_t Os::xgetbv(uint32_t ecx) { + uint32_t eax, edx; - __asm__ __volatile__( - ".byte 0x0f,0x01,0xd0" // in case assembler doesn't recognize xgetbv - : "=a"(eax), "=d"(edx) - : "c"(ecx)); + __asm__ __volatile__(".byte 0x0f,0x01,0xd0" // in case assembler doesn't recognize xgetbv + : "=a"(eax), "=d"(edx) + : "c"(ecx)); - return ((uint64_t)edx << 32) | (uint64_t)eax; + return ((uint64_t)edx << 32) | (uint64_t)eax; } -#endif // ATI_ARCH_X86 +#endif // ATI_ARCH_X86 -void* -Os::fastMemcpy(void *dest, const void *src, size_t n) -{ - return memcpy(dest, src, n); -} +void* Os::fastMemcpy(void* dest, const void* src, size_t n) { return memcpy(dest, src, n); } -uint64_t -Os::offsetToEpochNanos() -{ - static uint64_t offset = 0; - - if (offset != 0) { - return offset; - } - - struct timeval now; - if (::gettimeofday(&now, NULL) != 0) { - return 0; - } - - offset = (now.tv_sec * UINT64_C(1000000) + now.tv_usec) - * UINT64_C(1000) - timeNanos(); +uint64_t Os::offsetToEpochNanos() { + static uint64_t offset = 0; + if (offset != 0) { return offset; + } + + struct timeval now; + if (::gettimeofday(&now, NULL) != 0) { + return 0; + } + + offset = (now.tv_sec * UINT64_C(1000000) + now.tv_usec) * UINT64_C(1000) - timeNanos(); + + return offset; } -void -Os::setCurrentStackPtr(address sp) -{ - sp -= sizeof(void*); - *(void**) sp = __builtin_return_address(0); +void Os::setCurrentStackPtr(address sp) { + sp -= sizeof(void*); + *(void**)sp = __builtin_return_address(0); #if defined(ATI_ARCH_ARM) - assert(!"Unimplemented"); + assert(!"Unimplemented"); #else - __asm__ __volatile__ ( + __asm__ __volatile__( #if 
!defined(OMIT_FRAME_POINTER) - LP64_SWITCH("movl (%%ebp),%%ebp;","movq (%%rbp),%%rbp;") -#endif // !OMIT_FRAME_POINTER - LP64_SWITCH("movl %0,%%esp; ret;","movq %0,%%rsp; ret;") - :: "r"(sp) - ); + LP64_SWITCH("movl (%%ebp),%%ebp;", "movq (%%rbp),%%rbp;") +#endif // !OMIT_FRAME_POINTER + LP64_SWITCH("movl %0,%%esp; ret;", "movq %0,%%rsp; ret;")::"r"(sp)); #endif } -size_t Os::getPhysicalMemSize() -{ - struct ::sysinfo si; +size_t Os::getPhysicalMemSize() { + struct ::sysinfo si; - if (::sysinfo(&si) != 0) { - return 0; - } + if (::sysinfo(&si) != 0) { + return 0; + } - if (si.mem_unit == 0) { - // Linux kernels prior to 2.3.23 return sizes in bytes. - si.mem_unit = 1; - } + if (si.mem_unit == 0) { + // Linux kernels prior to 2.3.23 return sizes in bytes. + si.mem_unit = 1; + } - return (size_t) si.totalram * si.mem_unit; + return (size_t)si.totalram * si.mem_unit; } -std::string Os::getAppFileName() -{ - std::unique_ptr buff(new char[FILE_PATH_MAX_LENGTH]()); +std::string Os::getAppFileName() { + std::unique_ptr buff(new char[FILE_PATH_MAX_LENGTH]()); - if (readlink("/proc/self/exe", buff.get(), FILE_PATH_MAX_LENGTH) > 0) { - // Get filename without path and extension. - return std::string(basename(buff.get())); - } + if (readlink("/proc/self/exe", buff.get(), FILE_PATH_MAX_LENGTH) > 0) { + // Get filename without path and extension. 
+ return std::string(basename(buff.get())); + } - return ""; + return ""; } -} // namespace amd +} // namespace amd -#endif // !defined(_WIN32) && !defined(__CYGWIN__) +#endif // !defined(_WIN32) && !defined(__CYGWIN__) diff --git a/rocclr/runtime/os/os_win32.cpp b/rocclr/runtime/os/os_win32.cpp index 0d66e09374..909d6cb305 100644 --- a/rocclr/runtime/os/os_win32.cpp +++ b/rocclr/runtime/os/os_win32.cpp @@ -24,7 +24,7 @@ #endif -BOOL (WINAPI *pfnGetNumaNodeProcessorMaskEx)(USHORT,PGROUP_AFFINITY) = NULL; +BOOL(WINAPI* pfnGetNumaNodeProcessorMaskEx)(USHORT, PGROUP_AFFINITY) = NULL; namespace amd { @@ -34,57 +34,51 @@ static LONG WINAPI divExceptionFilter(struct _EXCEPTION_POINTERS* ep); #ifdef _WIN64 PVOID divExceptionHandler = NULL; -#endif // _WIN64 +#endif // _WIN64 static double PerformanceFrequency; -typedef BOOL (WINAPI *SetThreadGroupAffinity_fn)( - __in HANDLE, __in CONST GROUP_AFFINITY *, __out_opt PGROUP_AFFINITY); +typedef BOOL(WINAPI* SetThreadGroupAffinity_fn)(__in HANDLE, __in CONST GROUP_AFFINITY*, + __out_opt PGROUP_AFFINITY); static SetThreadGroupAffinity_fn pfnSetThreadGroupAffinity = NULL; -#pragma section(".CRT$XCU",long,read) +#pragma section(".CRT$XCU", long, read) __declspec(allocate(".CRT$XCU")) bool (*__init)(void) = Os::init; -bool -Os::init() -{ - static bool initialized_ = false; +bool Os::init() { + static bool initialized_ = false; - // We could use InitOnceExecuteOnce here: - if (initialized_) { - return true; - } - initialized_ = true; + // We could use InitOnceExecuteOnce here: + if (initialized_) { + return true; + } + initialized_ = true; - SYSTEM_INFO si; - ::GetSystemInfo(&si); - pageSize_ = si.dwPageSize; - allocationGranularity_ = (size_t) si.dwAllocationGranularity; - processorCount_ = si.dwNumberOfProcessors; + SYSTEM_INFO si; + ::GetSystemInfo(&si); + pageSize_ = si.dwPageSize; + allocationGranularity_ = (size_t)si.dwAllocationGranularity; + processorCount_ = si.dwNumberOfProcessors; - LARGE_INTEGER frequency; - 
QueryPerformanceFrequency(&frequency); - PerformanceFrequency = (double) frequency.QuadPart; + LARGE_INTEGER frequency; + QueryPerformanceFrequency(&frequency); + PerformanceFrequency = (double)frequency.QuadPart; - HMODULE handle = ::LoadLibrary("kernel32.dll"); - if (handle != NULL) { - pfnSetThreadGroupAffinity = (SetThreadGroupAffinity_fn) - ::GetProcAddress(handle, "SetThreadGroupAffinity"); - pfnGetNumaNodeProcessorMaskEx = (BOOL(WINAPI *)(USHORT,PGROUP_AFFINITY)) - ::GetProcAddress(handle, "GetNumaNodeProcessorMaskEx"); - } + HMODULE handle = ::LoadLibrary("kernel32.dll"); + if (handle != NULL) { + pfnSetThreadGroupAffinity = + (SetThreadGroupAffinity_fn)::GetProcAddress(handle, "SetThreadGroupAffinity"); + pfnGetNumaNodeProcessorMaskEx = (BOOL(WINAPI*)(USHORT, PGROUP_AFFINITY))::GetProcAddress( + handle, "GetNumaNodeProcessorMaskEx"); + } - return Thread::init(); + return Thread::init(); } -#pragma section(".CRT$XTU",long,read) +#pragma section(".CRT$XTU", long, read) __declspec(allocate(".CRT$XTU")) void (*__exit)(void) = Os::tearDown; -void -Os::tearDown() -{ - Thread::tearDown(); -} +void Os::tearDown() { Thread::tearDown(); } //#define DEBUG_getExportsFromMemory /** @@ -92,646 +86,535 @@ Os::tearDown() of dll in memory and push_back addresses and names of exports into \param kernels */ -static void -getExportsFromMemory( - PIMAGE_DOS_HEADER dosHeader, - Os::SymbolCallback callback, - void* data) -{ - PCHAR base = (PCHAR)dosHeader; - PIMAGE_NT_HEADERS pNTHeader = (PIMAGE_NT_HEADERS) - (base + dosHeader->e_lfanew); +static void getExportsFromMemory(PIMAGE_DOS_HEADER dosHeader, Os::SymbolCallback callback, + void* data) { + PCHAR base = (PCHAR)dosHeader; + PIMAGE_NT_HEADERS pNTHeader = (PIMAGE_NT_HEADERS)(base + dosHeader->e_lfanew); - DWORD exportsStart = pNTHeader->OptionalHeader. 
- DataDirectory[IMAGE_DIRECTORY_ENTRY_EXPORT].VirtualAddress; + DWORD exportsStart = + pNTHeader->OptionalHeader.DataDirectory[IMAGE_DIRECTORY_ENTRY_EXPORT].VirtualAddress; - if (exportsStart == 0) { return; } + if (exportsStart == 0) { + return; + } - PIMAGE_EXPORT_DIRECTORY exportDir = (PIMAGE_EXPORT_DIRECTORY) - (base + exportsStart); + PIMAGE_EXPORT_DIRECTORY exportDir = (PIMAGE_EXPORT_DIRECTORY)(base + exportsStart); - PSTR filename = (PSTR)(exportDir->Name + base); + PSTR filename = (PSTR)(exportDir->Name + base); - #if defined(DEBUG_getExportsFromMemory) - printf("\nExports Table:\n"); - printf(" Name: %s\n", filename); - printf(" Characteristics: %08X\n", exportDir->Characteristics); - printf(" TimeDateStamp: %08X -> %s", - exportDir->TimeDateStamp, - ctime((const time_t *)&exportDir->TimeDateStamp) ); - printf(" Version: %u.%02u\n", exportDir->MajorVersion, - exportDir->MinorVersion); - printf(" Ordinal base: %08X\n", exportDir->Base); - printf(" # of functions: %08X\n", exportDir->NumberOfFunctions); - printf(" # of Names: %08X\n", exportDir->NumberOfNames); - #endif +#if defined(DEBUG_getExportsFromMemory) + printf("\nExports Table:\n"); + printf(" Name: %s\n", filename); + printf(" Characteristics: %08X\n", exportDir->Characteristics); + printf(" TimeDateStamp: %08X -> %s", exportDir->TimeDateStamp, + ctime((const time_t*)&exportDir->TimeDateStamp)); + printf(" Version: %u.%02u\n", exportDir->MajorVersion, exportDir->MinorVersion); + printf(" Ordinal base: %08X\n", exportDir->Base); + printf(" # of functions: %08X\n", exportDir->NumberOfFunctions); + printf(" # of Names: %08X\n", exportDir->NumberOfNames); +#endif - /* address of Export Address table (EAT). */ - PDWORD functions = (PDWORD)(base + (DWORD)exportDir->AddressOfFunctions); - DWORD numberOfFunctions = exportDir->NumberOfFunctions; + /* address of Export Address table (EAT). 
*/ + PDWORD functions = (PDWORD)(base + (DWORD)exportDir->AddressOfFunctions); + DWORD numberOfFunctions = exportDir->NumberOfFunctions; - /* address of the Export Name Table (ENT). - ENT is an array of RVAs to ASCII strings - each string corresponds to - a symbol (function or variable) exported by name. */ - DWORD* name = (DWORD *)(base + (DWORD)exportDir->AddressOfNames); - /* \note: number below is always <= numberOfFunctions */ - DWORD numberOfNames = exportDir->NumberOfNames; + /* address of the Export Name Table (ENT). + ENT is an array of RVAs to ASCII strings - each string corresponds to + a symbol (function or variable) exported by name. */ + DWORD* name = (DWORD*)(base + (DWORD)exportDir->AddressOfNames); + /* \note: number below is always <= numberOfFunctions */ + DWORD numberOfNames = exportDir->NumberOfNames; - /* address of the Export Ordinal Table. - This table maps an array index from ENT into - the corresponding index in EAT. - */ - PWORD ordinals = (PWORD)(base + (DWORD)exportDir->AddressOfNameOrdinals); + /* address of the Export Ordinal Table. + This table maps an array index from ENT into + the corresponding index in EAT. + */ + PWORD ordinals = (PWORD)(base + (DWORD)exportDir->AddressOfNameOrdinals); - #if defined (DEBUG_getExportsFromMemory) - /* \note On Ordinals and Algorithm Below. +#if defined(DEBUG_getExportsFromMemory) + /* \note On Ordinals and Algorithm Below. - Each exported symbol has an ordinal number associated with it that can - be used to look the exported symbol up. Also, there is almost always - an ASCII name associated with the symbol. Expectedly, the exported - symbol name is the same as the name of the function or variable, but - in general it is not guaranteed. Usually, when an executable imports - a symbol, it uses the symbol name rather than its ordinal. If it was - always a case the algorithm below could be much simple - just go over - all the names and print them, but some functions may be exported only - by ordinals. 
When importing by name, the system just uses the name to - look up the export ordinal of the desired symbol, and retrieves the - address using the ordinal value. It might be slightly faster if an - ordinal had been used in the first place. Exporting and importing by - name is solely a convenience for programmers. - The use of the ORDINAL keyword in the Exports section of a .DEF file - tells the linker to create an import library that forces an API to be - imported by ordinal, not by name. - The algorithm in the comments shows how to retrieve all the exports in - the general case. If we assume that all is exported by names then a - simple version (code below) is sufficient. + Each exported symbol has an ordinal number associated with it that can + be used to look the exported symbol up. Also, there is almost always + an ASCII name associated with the symbol. Expectedly, the exported + symbol name is the same as the name of the function or variable, but + in general it is not guaranteed. Usually, when an executable imports + a symbol, it uses the symbol name rather than its ordinal. If it was + always a case the algorithm below could be much simple - just go over + all the names and print them, but some functions may be exported only + by ordinals. When importing by name, the system just uses the name to + look up the export ordinal of the desired symbol, and retrieves the + address using the ordinal value. It might be slightly faster if an + ordinal had been used in the first place. Exporting and importing by + name is solely a convenience for programmers. + The use of the ORDINAL keyword in the Exports section of a .DEF file + tells the linker to create an import library that forces an API to be + imported by ordinal, not by name. + The algorithm in the comments shows how to retrieve all the exports in + the general case. If we assume that all is exported by names then a + simple version (code below) is sufficient. 
- \note removed file exportdump.cpp contains examples of reading - exported symbols from DLL loaded in memory or file. - */ - DWORD exportsEnd = pNTHeader->OptionalHeader. - DataDirectory[IMAGE_DIRECTORY_ENTRY_EXPORT].Size; + \note removed file exportdump.cpp contains examples of reading + exported symbols from DLL loaded in memory or file. + */ + DWORD exportsEnd = pNTHeader->OptionalHeader.DataDirectory[IMAGE_DIRECTORY_ENTRY_EXPORT].Size; - printf("\n Entry Pt Ordn Name\n"); - for (DWORD ii=0; ii < numberOfFunctions; ii++) { - DWORD entryPoint = functions[ii]; + printf("\n Entry Pt Ordn Name\n"); + for (DWORD ii = 0; ii < numberOfFunctions; ii++) { + DWORD entryPoint = functions[ii]; - if (entryPoint == 0) { // Skip over gaps in exported function - continue; // ordinals (the entrypoint is 0 for - } // these functions). - printf(" %08X %4u", entryPoint, ii + exportDir->Base); + if (entryPoint == 0) { // Skip over gaps in exported function + continue; // ordinals (the entrypoint is 0 for + } // these functions). + printf(" %08X %4u", entryPoint, ii + exportDir->Base); - // Browse thru all names and check out if a function has - // an associated exported name. - for (DWORD jj=0; jj < exportDir->NumberOfNames; jj++) { - if (ordinals[jj] == ii) { - printf(" %s", name[jj] + base); - } - } - // Is it a forwarder? 
If so, the entry point RVA is inside the - // .edata section, and is an RVA to the DllName.EntryPointName - if ((entryPoint >= exportsStart) && (entryPoint <= exportsEnd)) { - printf(" (forwarder -> %s)", entryPoint + base); - } - printf("\n"); - } - #endif - - char OpenCL_prefix[] = "___OpenCL_"; - size_t OpenCL_prefix_sz = sizeof( OpenCL_prefix ) - 1; - - for (DWORD jj=0; jj < numberOfNames; jj++) { - const char* OpenCL_name = (const char*)(base + name[jj]); - if (strncmp(OpenCL_name, OpenCL_prefix, OpenCL_prefix_sz) == 0) { - address addr = (address)(base + functions[ordinals[jj]]); - - unsigned char opcode = *(unsigned char*)addr; - if (opcode == 0xE9) { // jmp instruction at address of export name - long disp = *(long*)(addr+1); // dislacement in jmp - addr += 5 /* skip instruction */ + disp; - } - - #if defined (DEBUG_getExportsFromMemory) - printf("%08X: %s\n", addr, OpenCL_name); - #endif - callback(&OpenCL_name[1], (const void*)addr, data); - } - else if (strncmp( - OpenCL_name, &OpenCL_prefix[1], OpenCL_prefix_sz-1) == 0) { - - address addr = (address)(base + functions[ordinals[jj]]); - #if defined (DEBUG_getExportsFromMemory) - printf("%08X: %s\n", addr, OpenCL_name); - #endif - callback(OpenCL_name, (const void*)addr, data); - } + // Browse thru all names and check out if a function has + // an associated exported name. + for (DWORD jj = 0; jj < exportDir->NumberOfNames; jj++) { + if (ordinals[jj] == ii) { + printf(" %s", name[jj] + base); + } } -} - -bool -Os::iterateSymbols(void* handle, SymbolCallback callback, void* data) -{ - PIMAGE_DOS_HEADER dosHeader = (PIMAGE_DOS_HEADER)handle; - if (dosHeader->e_magic == IMAGE_DOS_SIGNATURE) { - // checking validity of NT header was removed since we do not want - // exception handling. It can be found in rev #21. - getExportsFromMemory((PIMAGE_DOS_HEADER)handle, callback, data); - return TRUE; + // Is it a forwarder? 
If so, the entry point RVA is inside the + // .edata section, and is an RVA to the DllName.EntryPointName + if ((entryPoint >= exportsStart) && (entryPoint <= exportsEnd)) { + printf(" (forwarder -> %s)", entryPoint + base); } - return FALSE; -} + printf("\n"); + } +#endif -void* -Os::loadLibrary_(const char *filename) -{ - if (filename != NULL) { - HMODULE hModule = ::LoadLibrary(filename); - return hModule; + char OpenCL_prefix[] = "___OpenCL_"; + size_t OpenCL_prefix_sz = sizeof(OpenCL_prefix) - 1; + + for (DWORD jj = 0; jj < numberOfNames; jj++) { + const char* OpenCL_name = (const char*)(base + name[jj]); + if (strncmp(OpenCL_name, OpenCL_prefix, OpenCL_prefix_sz) == 0) { + address addr = (address)(base + functions[ordinals[jj]]); + + unsigned char opcode = *(unsigned char*)addr; + if (opcode == 0xE9) { // jmp instruction at address of export name + long disp = *(long*)(addr + 1); // dislacement in jmp + addr += 5 /* skip instruction */ + disp; + } + +#if defined(DEBUG_getExportsFromMemory) + printf("%08X: %s\n", addr, OpenCL_name); +#endif + callback(&OpenCL_name[1], (const void*)addr, data); + } else if (strncmp(OpenCL_name, &OpenCL_prefix[1], OpenCL_prefix_sz - 1) == 0) { + address addr = (address)(base + functions[ordinals[jj]]); +#if defined(DEBUG_getExportsFromMemory) + printf("%08X: %s\n", addr, OpenCL_name); +#endif + callback(OpenCL_name, (const void*)addr, data); } - return NULL; + } } -void -Os::unloadLibrary(void* handle) -{ - ::FreeLibrary((HMODULE) handle); +bool Os::iterateSymbols(void* handle, SymbolCallback callback, void* data) { + PIMAGE_DOS_HEADER dosHeader = (PIMAGE_DOS_HEADER)handle; + if (dosHeader->e_magic == IMAGE_DOS_SIGNATURE) { + // checking validity of NT header was removed since we do not want + // exception handling. It can be found in rev #21. 
+ getExportsFromMemory((PIMAGE_DOS_HEADER)handle, callback, data); + return TRUE; + } + return FALSE; } -void* -Os::getSymbol(void* handle, const char* name) -{ - return ::GetProcAddress((HMODULE) handle, name); +void* Os::loadLibrary_(const char* filename) { + if (filename != NULL) { + HMODULE hModule = ::LoadLibrary(filename); + return hModule; + } + return NULL; } -static inline int -memProtToOsProt(Os::MemProt prot) -{ - switch (prot) { - case Os::MEM_PROT_NONE: return PAGE_NOACCESS; - case Os::MEM_PROT_READ: return PAGE_READONLY; - case Os::MEM_PROT_RW: return PAGE_READWRITE; - case Os::MEM_PROT_RWX: return PAGE_EXECUTE_READWRITE; - default: break; - } - ShouldNotReachHere(); - return -1; +void Os::unloadLibrary(void* handle) { ::FreeLibrary((HMODULE)handle); } + +void* Os::getSymbol(void* handle, const char* name) { + return ::GetProcAddress((HMODULE)handle, name); } -address -Os::reserveMemory(address start, size_t size, size_t alignment, MemProt prot) -{ - size = alignUp(size, pageSize()); - alignment = std::max(allocationGranularity_, - alignUp(alignment, allocationGranularity_)); - assert(isPowerOfTwo(alignment) && "not a power of 2"); - - size_t requested = size + alignment - allocationGranularity_; - address mem, aligned; - do { - mem = (address)VirtualAlloc(start, requested, - MEM_RESERVE, memProtToOsProt(prot)); - - // check for out of memory. - if (mem == NULL) return NULL; - - aligned = alignUp(mem, alignment); - - // check for already aligned memory. - if (aligned == mem && size == requested) { - return mem; - } - - // try to reserve the aligned address. 
- if (VirtualFree(mem, 0, MEM_RELEASE) == 0) { - assert(!"VirtualFree failed"); - } - - mem = (address)VirtualAlloc(aligned, size, - MEM_RESERVE, memProtToOsProt(prot)); - assert((mem == NULL || mem == aligned) && "VirtualAlloc failed"); - - } while (mem != aligned); - - return mem; +static inline int memProtToOsProt(Os::MemProt prot) { + switch (prot) { + case Os::MEM_PROT_NONE: + return PAGE_NOACCESS; + case Os::MEM_PROT_READ: + return PAGE_READONLY; + case Os::MEM_PROT_RW: + return PAGE_READWRITE; + case Os::MEM_PROT_RWX: + return PAGE_EXECUTE_READWRITE; + default: + break; + } + ShouldNotReachHere(); + return -1; } -bool -Os::releaseMemory(void* addr, size_t size) -{ - return VirtualFree(addr, 0, MEM_RELEASE) != 0; -} +address Os::reserveMemory(address start, size_t size, size_t alignment, MemProt prot) { + size = alignUp(size, pageSize()); + alignment = std::max(allocationGranularity_, alignUp(alignment, allocationGranularity_)); + assert(isPowerOfTwo(alignment) && "not a power of 2"); -bool -Os::commitMemory(void* addr, size_t size, MemProt prot) -{ - return VirtualAlloc(addr, size, - MEM_COMMIT, memProtToOsProt(prot)) != NULL; -} + size_t requested = size + alignment - allocationGranularity_; + address mem, aligned; + do { + mem = (address)VirtualAlloc(start, requested, MEM_RESERVE, memProtToOsProt(prot)); -bool -Os::uncommitMemory(void* addr, size_t size) -{ - return VirtualFree(addr, size, MEM_DECOMMIT) != 0; -} + // check for out of memory. + if (mem == NULL) return NULL; -bool -Os::protectMemory(void* addr, size_t size, MemProt prot) -{ - DWORD OldProtect; - return VirtualProtect(addr, size, memProtToOsProt(prot), &OldProtect) != 0; -} + aligned = alignUp(mem, alignment); - -uint64_t -Os::hostTotalPhysicalMemory() -{ - static uint64_t totalPhys = 0; - - if (totalPhys != 0) { - return totalPhys; + // check for already aligned memory. 
+ if (aligned == mem && size == requested) { + return mem; } - MEMORYSTATUSEX mstatus; - mstatus.dwLength = sizeof(mstatus); + // try to reserve the aligned address. + if (VirtualFree(mem, 0, MEM_RELEASE) == 0) { + assert(!"VirtualFree failed"); + } - ::GlobalMemoryStatusEx (&mstatus); + mem = (address)VirtualAlloc(aligned, size, MEM_RESERVE, memProtToOsProt(prot)); + assert((mem == NULL || mem == aligned) && "VirtualAlloc failed"); - totalPhys = mstatus.ullTotalPhys; + } while (mem != aligned); + + return mem; +} + +bool Os::releaseMemory(void* addr, size_t size) { return VirtualFree(addr, 0, MEM_RELEASE) != 0; } + +bool Os::commitMemory(void* addr, size_t size, MemProt prot) { + return VirtualAlloc(addr, size, MEM_COMMIT, memProtToOsProt(prot)) != NULL; +} + +bool Os::uncommitMemory(void* addr, size_t size) { + return VirtualFree(addr, size, MEM_DECOMMIT) != 0; +} + +bool Os::protectMemory(void* addr, size_t size, MemProt prot) { + DWORD OldProtect; + return VirtualProtect(addr, size, memProtToOsProt(prot), &OldProtect) != 0; +} + + +uint64_t Os::hostTotalPhysicalMemory() { + static uint64_t totalPhys = 0; + + if (totalPhys != 0) { return totalPhys; + } + + MEMORYSTATUSEX mstatus; + mstatus.dwLength = sizeof(mstatus); + + ::GlobalMemoryStatusEx(&mstatus); + + totalPhys = mstatus.ullTotalPhys; + return totalPhys; } -void* -Os::alignedMalloc(size_t size, size_t alignment) -{ - return ::_aligned_malloc(size, alignment); +void* Os::alignedMalloc(size_t size, size_t alignment) { + return ::_aligned_malloc(size, alignment); } -void -Os::alignedFree(void *mem) -{ - ::_aligned_free(mem); -} +void Os::alignedFree(void* mem) { ::_aligned_free(mem); } -void -Os::currentStackInfo(address* base, size_t *size) -{ - MEMORY_BASIC_INFORMATION mbInfo; +void Os::currentStackInfo(address* base, size_t* size) { + MEMORY_BASIC_INFORMATION mbInfo; - address currentStackPage = (address) alignDown( - (intptr_t) currentStackPtr(), pageSize()); + address currentStackPage = 
(address)alignDown((intptr_t)currentStackPtr(), pageSize()); - ::VirtualQuery(currentStackPage, &mbInfo, sizeof(mbInfo)); + ::VirtualQuery(currentStackPage, &mbInfo, sizeof(mbInfo)); - address stackBottom = (address) mbInfo.AllocationBase; - size_t stackSize = 0; + address stackBottom = (address)mbInfo.AllocationBase; + size_t stackSize = 0; - do { - stackSize += mbInfo.RegionSize; - ::VirtualQuery(stackBottom + stackSize, &mbInfo, sizeof(mbInfo)); - } while (stackBottom == (address) mbInfo.AllocationBase); + do { + stackSize += mbInfo.RegionSize; + ::VirtualQuery(stackBottom + stackSize, &mbInfo, sizeof(mbInfo)); + } while (stackBottom == (address)mbInfo.AllocationBase); - *base = stackBottom + stackSize; - *size = stackSize; + *base = stackBottom + stackSize; + *size = stackSize; - assert(currentStackPtr() >= *base - *size && currentStackPtr() < *base - && "just checking"); + assert(currentStackPtr() >= *base - *size && currentStackPtr() < *base && "just checking"); } #define MS_VC_EXCEPTION 0x406D1388 -#pragma pack(push,8) -struct THREADNAME_INFO -{ - DWORD dwType; // Must be 0x1000. - LPCSTR szName; // Pointer to name (in user addr space). - DWORD dwThreadID; // Thread ID (-1=caller thread). - DWORD dwFlags; // Reserved for future use, must be zero. +#pragma pack(push, 8) +struct THREADNAME_INFO { + DWORD dwType; // Must be 0x1000. + LPCSTR szName; // Pointer to name (in user addr space). + DWORD dwThreadID; // Thread ID (-1=caller thread). + DWORD dwFlags; // Reserved for future use, must be zero. 
}; #pragma pack(pop) -static void -SetThreadName(DWORD threadId, const char* name) -{ - if (name == NULL || *name == '\0') { - return; - } +static void SetThreadName(DWORD threadId, const char* name) { + if (name == NULL || *name == '\0') { + return; + } - THREADNAME_INFO info; - info.dwType = 0x1000; - info.szName = name; - info.dwThreadID = threadId; - info.dwFlags = 0; + THREADNAME_INFO info; + info.dwType = 0x1000; + info.szName = name; + info.dwThreadID = threadId; + info.dwFlags = 0; - __try { - ::RaiseException( - 0x406D1388, 0, sizeof(info)/sizeof(ULONG_PTR), (ULONG_PTR*)&info); - } - __except(EXCEPTION_EXECUTE_HANDLER) { } + __try { + ::RaiseException(0x406D1388, 0, sizeof(info) / sizeof(ULONG_PTR), (ULONG_PTR*)&info); + } __except (EXCEPTION_EXECUTE_HANDLER) { + } } -void -Os::setCurrentThreadName(const char* name) -{ - SetThreadName(GetCurrentThreadId(), name); -} +void Os::setCurrentThreadName(const char* name) { SetThreadName(GetCurrentThreadId(), name); } -static LONG WINAPI -divExceptionFilter(struct _EXCEPTION_POINTERS* ep) -{ - DWORD code = ep->ExceptionRecord->ExceptionCode; +static LONG WINAPI divExceptionFilter(struct _EXCEPTION_POINTERS* ep) { + DWORD code = ep->ExceptionRecord->ExceptionCode; - if ((code == EXCEPTION_INT_DIVIDE_BY_ZERO || - code == EXCEPTION_INT_OVERFLOW) && - Thread::current()->isWorkerThread()) { - address insn = (address)ep->ContextRecord->LP64_SWITCH(Eip,Rip); + if ((code == EXCEPTION_INT_DIVIDE_BY_ZERO || code == EXCEPTION_INT_OVERFLOW) && + Thread::current()->isWorkerThread()) { + address insn = (address)ep->ContextRecord->LP64_SWITCH(Eip, Rip); - if (Os::skipIDIV(insn)) { - ep->ContextRecord->LP64_SWITCH(Eip,Rip) = (uintptr_t)insn; - return EXCEPTION_CONTINUE_EXECUTION; - } + if (Os::skipIDIV(insn)) { + ep->ContextRecord->LP64_SWITCH(Eip, Rip) = (uintptr_t)insn; + return EXCEPTION_CONTINUE_EXECUTION; } - return EXCEPTION_CONTINUE_SEARCH; + } + return EXCEPTION_CONTINUE_SEARCH; } bool Os::installSigfpeHandler() { #ifdef 
_WIN64 - divExceptionHandler = AddVectoredExceptionHandler(1, divExceptionFilter); -#endif // _WIN64 - return true; + divExceptionHandler = AddVectoredExceptionHandler(1, divExceptionFilter); +#endif // _WIN64 + return true; } void Os::uninstallSigfpeHandler() { #ifdef _WIN64 - if (divExceptionHandler != NULL) { - RemoveVectoredExceptionHandler(divExceptionHandler); - divExceptionHandler = NULL; - } -#endif // _WIN64 + if (divExceptionHandler != NULL) { + RemoveVectoredExceptionHandler(divExceptionHandler); + divExceptionHandler = NULL; + } +#endif // _WIN64 } -void* -Thread::entry(Thread* thread) -{ - void* ret = NULL; +void* Thread::entry(Thread* thread) { + void* ret = NULL; #if !defined(_WIN64) - __try { - ret = thread->main(); - } - __except(divExceptionFilter(GetExceptionInformation())) { - // nothing to do here. - } -#else // _WIN64 + __try { ret = thread->main(); -#endif // _WIN64 + } __except (divExceptionFilter(GetExceptionInformation())) { + // nothing to do here. + } +#else // _WIN64 + ret = thread->main(); +#endif // _WIN64 - // The current thread exits, thus clear the pointer +// The current thread exits, thus clear the pointer #if defined(USE_DECLSPEC_THREAD) - details::thread_ = NULL; -#else // !USE_DECLSPEC_THREAD - TlsSetValue(details::threadIndex_, NULL); -#endif // !USE_DECLSPEC_THREAD - return ret; + details::thread_ = NULL; +#else // !USE_DECLSPEC_THREAD + TlsSetValue(details::threadIndex_, NULL); +#endif // !USE_DECLSPEC_THREAD + return ret; } -bool -Os::isThreadAlive(const Thread& thread) -{ - HANDLE handle = (HANDLE)(thread.handle()); +bool Os::isThreadAlive(const Thread& thread) { + HANDLE handle = (HANDLE)(thread.handle()); - DWORD exitCode = 0; - if (GetExitCodeThread(handle, &exitCode)) { - return exitCode == STILL_ACTIVE; + DWORD exitCode = 0; + if (GetExitCodeThread(handle, &exitCode)) { + return exitCode == STILL_ACTIVE; + } else { + // Could not get thread's exitcode + return false; + } +} + +const void* Os::createOsThread(Thread* 
thread) { + HANDLE handle = ::CreateThread(NULL, thread->stackSize_, (LPTHREAD_START_ROUTINE)Thread::entry, + thread, 0, NULL); + if (handle == NULL) { + thread->setState(Thread::FAILED); + } + return reinterpret_cast<const void*>(handle); +} + +void Os::setThreadAffinity(const void* handle, const Os::ThreadAffinityMask& mask) { + if (pfnSetThreadGroupAffinity != NULL) { + GROUP_AFFINITY group = {0}; + for (WORD i = 0; i < sizeof(mask.mask_) / sizeof(KAFFINITY); ++i) { + group.Mask = mask.mask_[i]; + group.Group = i; + if (group.Mask != 0) { + pfnSetThreadGroupAffinity((HANDLE)handle, &group, NULL); + } } - else { - // Could not get thread's exitcode - return false; + } else { // pfnSetThreadGroupAffinity == NULL + DWORD_PTR threadAffinityMask = (DWORD_PTR)mask.mask_[0]; + if (threadAffinityMask != 0) { + ::SetThreadAffinityMask((HANDLE)handle, threadAffinityMask); } + } } -const void* -Os::createOsThread(Thread* thread) -{ - HANDLE handle = ::CreateThread(NULL, thread->stackSize_, - (LPTHREAD_START_ROUTINE) Thread::entry, thread, 0, NULL); - if (handle == NULL) { - thread->setState(Thread::FAILED); +void Os::yield() { ::SwitchToThread(); } + +uint64_t Os::timeNanos() { + LARGE_INTEGER current; + QueryPerformanceCounter(&current); + return (uint64_t)((double)current.QuadPart / PerformanceFrequency * 1e9); +} + +uint64_t Os::timerResolutionNanos() { return (uint64_t)(1e9 / PerformanceFrequency); } + + +const char* Os::libraryExtension() { return ".DLL"; } + +const char* Os::libraryPrefix() { return NULL; } + +const char* Os::objectExtension() { return ".OBJ"; } + +char Os::fileSeparator() { return '\\'; } + +char Os::pathSeparator() { return ';'; } + +bool Os::pathExists(const std::string& path) { + return GetFileAttributes(path.c_str()) != INVALID_FILE_ATTRIBUTES; +} + +bool Os::createPath(const std::string& path) { + size_t pos = 0; + while (true) { + pos = path.find(fileSeparator(), pos); + const std::string currPath = path.substr(0, pos); + if (!currPath.empty() && 
!pathExists(currPath)) { + if (!CreateDirectory(currPath.c_str(), NULL)) return false; } - return reinterpret_cast<const void*>(handle); + if (pos == std::string::npos) break; + ++pos; + } + return true; } -void -Os::setThreadAffinity(const void* handle, const Os::ThreadAffinityMask& mask) -{ - if (pfnSetThreadGroupAffinity != NULL) { - GROUP_AFFINITY group = {0}; - for (WORD i = 0; i < sizeof(mask.mask_) / sizeof(KAFFINITY); ++i) { - group.Mask = mask.mask_[i]; - group.Group = i; - if (group.Mask != 0) { - pfnSetThreadGroupAffinity((HANDLE)handle, &group, NULL); - } - } - } - else { // pfnSetThreadGroupAffinity == NULL - DWORD_PTR threadAffinityMask = (DWORD_PTR)mask.mask_[0]; - if (threadAffinityMask != 0) { - ::SetThreadAffinityMask((HANDLE)handle, threadAffinityMask); - } +bool Os::removePath(const std::string& path) { + size_t pos = std::string::npos; + bool removed = false; + while (true) { + const std::string currPath = path.substr(0, pos); + if (!currPath.empty()) { + if (!RemoveDirectory(currPath.c_str())) return removed; + removed = true; } + if (pos == 0) break; + pos = path.rfind(fileSeparator(), pos == std::string::npos ? pos : pos - 1); + if (pos == std::string::npos) break; + } + return true; } -void -Os::yield() -{ - ::SwitchToThread(); +int Os::printf(const char* fmt, ...) 
{ + va_list ap; + DWORD dwBytesWritten; + + va_start(ap, fmt); + int len = ::_vsnprintf(NULL, 0, fmt, ap); + va_end(ap); + if (len <= 0) return len; + + va_start(ap, fmt); + char* str = static_cast<char*>(alloca(len + 1)); + len = ::_vsnprintf(str, len + 1, fmt, ap); + va_end(ap); + if (len <= 0) return len; + + ::WriteFile(::GetStdHandle(STD_OUTPUT_HANDLE), str, len, &dwBytesWritten, NULL); + + return len; } -uint64_t -Os::timeNanos() -{ - LARGE_INTEGER current; - QueryPerformanceCounter(&current); - return (uint64_t) ((double) current.QuadPart / PerformanceFrequency * 1e9); -} - -uint64_t -Os::timerResolutionNanos() -{ - return (uint64_t) (1e9 / PerformanceFrequency); -} - - -const char* -Os::libraryExtension() -{ - return ".DLL"; -} - -const char* -Os::libraryPrefix() -{ - return NULL; -} - -const char* -Os::objectExtension() -{ - return ".OBJ"; -} - -char -Os::fileSeparator() -{ - return '\\'; -} - -char -Os::pathSeparator() -{ - return ';'; -} - -bool Os::pathExists(const std::string& path) -{ - return GetFileAttributes(path.c_str()) != INVALID_FILE_ATTRIBUTES; -} - -bool Os::createPath(const std::string& path) -{ - size_t pos = 0; - while (true) { - pos = path.find(fileSeparator(), pos); - const std::string currPath = path.substr(0, pos); - if (!currPath.empty() && !pathExists(currPath)) { - if (!CreateDirectory(currPath.c_str(), NULL)) return false; - } - if (pos == std::string::npos) break; - ++pos; - } - return true; -} - -bool Os::removePath(const std::string& path) -{ - size_t pos = std::string::npos; - bool removed = false; - while (true) { - const std::string currPath = path.substr(0, pos); - if (!currPath.empty()) { - if (!RemoveDirectory(currPath.c_str())) return removed; - removed = true; - } - if (pos == 0) break; - pos = path.rfind(fileSeparator(), pos == std::string::npos?pos:pos-1); - if (pos == std::string::npos) break; - } - return true; -} - -int Os::printf(const char* fmt, ...) 
-{ - va_list ap; - DWORD dwBytesWritten; - - va_start(ap, fmt); - int len = ::_vsnprintf(NULL, 0, fmt, ap); - va_end(ap); - if (len <= 0) return len; - - va_start(ap, fmt); - char* str = static_cast<char*>(alloca(len + 1)); - len = ::_vsnprintf(str, len + 1, fmt, ap); - va_end(ap); - if (len <= 0) return len; - - ::WriteFile(::GetStdHandle(STD_OUTPUT_HANDLE), str, len, &dwBytesWritten, NULL); - - return len; -} - -int -Os::systemCall(const std::string& command) -{ +int Os::systemCall(const std::string& command) { #if 1 - char* cmd = new char[command.size()+1]; - fastMemcpy(cmd, command.c_str(), command.size()); - cmd[command.size()] = 0; + char* cmd = new char[command.size() + 1]; + fastMemcpy(cmd, command.c_str(), command.size()); + cmd[command.size()] = 0; - STARTUPINFO si = {0}; - si.cb = sizeof(si); - PROCESS_INFORMATION pi; + STARTUPINFO si = {0}; + si.cb = sizeof(si); + PROCESS_INFORMATION pi; - if (::CreateProcess( NULL, cmd, NULL, NULL, - FALSE, CREATE_NO_WINDOW, NULL, NULL, - &si, &pi ) == 0 ) { - delete [] cmd; - return -1; // failed - }; + if (::CreateProcess(NULL, cmd, NULL, NULL, FALSE, CREATE_NO_WINDOW, NULL, NULL, &si, &pi) == 0) { + delete[] cmd; + return -1; // failed + }; - // Wait until child process exits. - ::WaitForSingleObject( pi.hProcess, INFINITE ); + // Wait until child process exits. + ::WaitForSingleObject(pi.hProcess, INFINITE); - DWORD ExitCode = 0; - ::GetExitCodeProcess(pi.hProcess, &ExitCode); + DWORD ExitCode = 0; + ::GetExitCodeProcess(pi.hProcess, &ExitCode); - // Close process and thread handles. 
+ ::CloseHandle(pi.hProcess); + ::CloseHandle(pi.hThread); - delete [] cmd; - return (int)ExitCode; + delete[] cmd; + return (int)ExitCode; #else - std::stringstream str; - str << "\"" << command << "\""; - return ::system(str.str().c_str()); + std::stringstream str; + str << "\"" << command << "\""; + return ::system(str.str().c_str()); #endif } -std::string -Os::getEnvironment(const std::string& name) -{ - char dstBuf[MAX_PATH]; - size_t dstSize; +std::string Os::getEnvironment(const std::string& name) { + char dstBuf[MAX_PATH]; + size_t dstSize; - if (::getenv_s(&dstSize, dstBuf, MAX_PATH, name.c_str())) { - return std::string(""); - } - return std::string(dstBuf); + if (::getenv_s(&dstSize, dstBuf, MAX_PATH, name.c_str())) { + return std::string(""); + } + return std::string(dstBuf); } -std::string -Os::getTempPath() -{ - char tempPath[MAX_PATH]; - uint ret = GetTempPath(MAX_PATH, tempPath); - if (ret == 0 || (ret == 1 && tempPath[0] == '?')) { - return std::string("."); - } +std::string Os::getTempPath() { + char tempPath[MAX_PATH]; + uint ret = GetTempPath(MAX_PATH, tempPath); + if (ret == 0 || (ret == 1 && tempPath[0] == '?')) { + return std::string("."); + } - // If the app was started from an UNC path instead of a DOS path, - // the temp env var won't be set correctly and will point to windows - // system directory instead (usually c:/windows/temp), which will be - // blocked. So we check if the temp path returned by GetTempPath is - // under windows directory, use . 
instead - std::string tempPathStr(tempPath); - char winPath[MAX_PATH]; - if (GetWindowsDirectory(winPath, MAX_PATH) > 0) { - // Need to check if tempPath is C:\Windows or C:\Windows\ // - if (tempPath[strlen(tempPath)-1] == '\\') { - tempPath[strlen(tempPath)-1] = '\0' ; - ret--; - } - if (_memicmp(tempPath, winPath, ret) == 0) { - return std::string("."); - } + // If the app was started from an UNC path instead of a DOS path, + // the temp env var won't be set correctly and will point to windows + // system directory instead (usually c:/windows/temp), which will be + // blocked. So we check if the temp path returned by GetTempPath is + // under windows directory, use . instead + std::string tempPathStr(tempPath); + char winPath[MAX_PATH]; + if (GetWindowsDirectory(winPath, MAX_PATH) > 0) { + // Need to check if tempPath is C:\Windows or C:\Windows\ // + if (tempPath[strlen(tempPath) - 1] == '\\') { + tempPath[strlen(tempPath) - 1] = '\0'; + ret--; } - return tempPathStr; + if (_memicmp(tempPath, winPath, ret) == 0) { + return std::string("."); + } + } + return tempPathStr; } -std::string -Os::getTempFileName() -{ +std::string Os::getTempFileName() { static std::atomic_size_t counter(0); std::string tempPath = getTempPath(); @@ -741,23 +624,11 @@ Os::getTempFileName() return tempFileName.str(); } -int -Os::unlink(const std::string& path) -{ - return ::_unlink(path.c_str()); -} +int Os::unlink(const std::string& path) { return ::_unlink(path.c_str()); } -void -Os::cpuid(int regs[4], int info) -{ - return __cpuid(regs, info); -} +void Os::cpuid(int regs[4], int info) { return __cpuid(regs, info); } -uint64_t -Os::xgetbv(uint32_t ecx) -{ - return (uint64_t)_xgetbv(ecx); -} +uint64_t Os::xgetbv(uint32_t ecx) { return (uint64_t)_xgetbv(ecx); } // Various "fast" memcpy implementation (currently win32 only due to compiler limitations) @@ -770,7 +641,7 @@ Os::xgetbv(uint32_t ecx) // "Streaming Store"), and also uses the software prefetchnta instructions, // be sure youre 
running on Athlon/Duron or other recent CPU before calling! -#define TINY_BLOCK_COPY 64 // upper limit for movsd type copy +#define TINY_BLOCK_COPY 64 // upper limit for movsd type copy // The smallest copy uses the X86 "movsd" instruction, in an optimized // form which is an "unrolled loop". @@ -779,15 +650,15 @@ Os::xgetbv(uint32_t ecx) // also using the "unrolled loop" optimization. This code uses // the software prefetch instruction to get the data into the cache. -#define UNCACHED_COPY 197 * 1024 // upper limit for movq/movntq w/SW prefetch +#define UNCACHED_COPY 197 * 1024 // upper limit for movq/movntq w/SW prefetch // For larger blocks, which will spill beyond the cache, its faster to // use the Streaming Store instruction MOVNTQ. This write instruction // bypasses the cache and writes straight to main memory. This code also // uses the software prefetch instruction to pre-read the data. // USE 64 * 1024 FOR THIS VALUE IF YOURE ALWAYS FILLING A "CLEAN CACHE" -#define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch -#define CACHEBLOCK 80h // number of 64-byte blocks (cache lines) for block prefetch +#define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch +#define CACHEBLOCK 80h // number of 64-byte blocks (cache lines) for block prefetch // For the largest size blocks, a special technique called Block Prefetch // can be used to accelerate the read operations. Block Prefetch reads // one address per cache line, for a series of cache lines, in a short loop. @@ -796,9 +667,7 @@ Os::xgetbv(uint32_t ecx) // Inline assembly syntax for use with Visual C++ -void* -Os::fastMemcpy(void *dest, const void *src, size_t n) -{ +void* Os::fastMemcpy(void* dest, const void* src, size_t n) { #if !defined(_WIN64) __asm { @@ -843,9 +712,9 @@ $memcpy_align_done: ; destination is dword aligned cmp ecx, IN_CACHE_COPY/64 ; too big 4 cache? 
use uncached copy jae $memcpy_uc_test -// This is small block copy that uses the MMX registers to copy 8 bytes -// at a time. It uses the "unrolled loop" optimization, and also uses -// the software prefetch instruction to get the data into the cache. + // This is small block copy that uses the MMX registers to copy 8 bytes + // at a time. It uses the "unrolled loop" optimization, and also uses + // the software prefetch instruction to get the data into the cache. align 16 $memcpy_ic_1: ; 64-byte block copies, in-cache copy @@ -890,10 +759,10 @@ $memcpy_64_test: or ecx, ecx ; tail end of block prefetch will jump here jz $memcpy_ic_2 ; no more 64-byte blocks left -// For larger blocks, which will spill beyond the cache, its faster to -// use the Streaming Store instruction MOVNTQ. This write instruction -// bypasses the cache and writes straight to main memory. This code also -// uses the software prefetch instruction to pre-read the data. + // For larger blocks, which will spill beyond the cache, its faster to + // use the Streaming Store instruction MOVNTQ. This write instruction + // bypasses the cache and writes straight to main memory. This code also + // uses the software prefetch instruction to pre-read the data. align 16 $memcpy_uc_1: ; 64-byte blocks, uncached copy @@ -922,12 +791,12 @@ $memcpy_uc_1: ; 64-byte blocks, uncached copy jmp $memcpy_ic_2 ; almost done -// For the largest size blocks, a special technique called Block Prefetch -// can be used to accelerate the read operations. Block Prefetch reads -// one address per cache line, for a series of cache lines, in a short loop. -// This is faster than using software prefetch, in this case. -// The technique is great for getting maximum read bandwidth, -// especially in DDR memory systems. + // For the largest size blocks, a special technique called Block Prefetch + // can be used to accelerate the read operations. 
Block Prefetch reads + // one address per cache line, for a series of cache lines, in a short loop. + // This is faster than using software prefetch, in this case. + // The technique is great for getting maximum read bandwidth, + // especially in DDR memory systems. $memcpy_bp_1: ; large blocks, block prefetch copy cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop? @@ -971,8 +840,8 @@ $memcpy_bp_3: sub ecx, CACHEBLOCK ; update the 64-byte block count jmp $memcpy_bp_1 ; keep processing chunks -// The smallest copy uses the X86 "movsd" instruction, in an optimized -// form which is an "unrolled loop". Then it handles the last few bytes. + // The smallest copy uses the X86 "movsd" instruction, in an optimized + // form which is an "unrolled loop". Then it handles the last few bytes. align 4 movsd movsd ; perform last 1-15 dword copies @@ -1002,89 +871,79 @@ $memcpy_final: sfence ; flush the write buffer mov eax, [dest] ; ret value = destination pointer - } -#else // !defined(_WIN64)) + } +#else // !defined(_WIN64)) - return memcpy(dest, src, n); + return memcpy(dest, src, n); #endif } -uint64_t -Os::offsetToEpochNanos() -{ - static uint64_t offset = 0; - - if (offset != 0) { - return offset; - } - - FILETIME ft; - GetSystemTimeAsFileTime(&ft); - - LARGE_INTEGER li; - li.LowPart = ft.dwLowDateTime; - li.HighPart = ft.dwHighDateTime; - - uint64_t now = (li.QuadPart - 116444736000000000ull) * 100; - offset = now - timeNanos(); +uint64_t Os::offsetToEpochNanos() { + static uint64_t offset = 0; + if (offset != 0) { return offset; + } + + FILETIME ft; + GetSystemTimeAsFileTime(&ft); + + LARGE_INTEGER li; + li.LowPart = ft.dwLowDateTime; + li.HighPart = ft.dwHighDateTime; + + uint64_t now = (li.QuadPart - 116444736000000000ull) * 100; + offset = now - timeNanos(); + + return offset; } #ifdef _WIN64 -address -Os::currentStackPtr() -{ - return (address) _AddressOfReturnAddress() + sizeof(void*); -} +address Os::currentStackPtr() { return 
(address)_AddressOfReturnAddress() + sizeof(void*); } -#else // !_WIN64 +#else // !_WIN64 -#pragma warning(disable:4731) +#pragma warning(disable : 4731) -void __stdcall -Os::setCurrentStackPtr(address newSp) -{ - newSp -= sizeof(void*); - *(void**) newSp = *(void**) _AddressOfReturnAddress(); - __asm { +void __stdcall Os::setCurrentStackPtr(address newSp) { + newSp -= sizeof(void*); + *(void**)newSp = *(void**)_AddressOfReturnAddress(); + __asm { mov esp,newSp mov ebp,[ebp] ret - } + } } -#endif // !_WIN64 +#endif // !_WIN64 -size_t Os::getPhysicalMemSize() -{ - MEMORYSTATUSEX statex; +size_t Os::getPhysicalMemSize() { + MEMORYSTATUSEX statex; - statex.dwLength = sizeof (statex); + statex.dwLength = sizeof(statex); - if (GlobalMemoryStatusEx (&statex) == 0) { - return 0; - } + if (GlobalMemoryStatusEx(&statex) == 0) { + return 0; + } - return (size_t) statex.ullTotalPhys; + return (size_t)statex.ullTotalPhys; } -std::string Os::getAppFileName() -{ - std::string strFileName; - char* buff = new char[FILE_PATH_MAX_LENGTH]; +std::string Os::getAppFileName() { + std::string strFileName; + char* buff = new char[FILE_PATH_MAX_LENGTH]; - if (GetModuleFileNameA(NULL, buff, FILE_PATH_MAX_LENGTH) != 0) { - // Get filename without path and extension. - strFileName = strrchr(buff, '\\') ? strrchr(buff, '\\') + 1 : buff; - } + if (GetModuleFileNameA(NULL, buff, FILE_PATH_MAX_LENGTH) != 0) { + // Get filename without path and extension. + strFileName = strrchr(buff, '\\') ? 
strrchr(buff, '\\') + 1 : buff; + } - delete buff; - return strFileName; + delete buff; + return strFileName; } -} // namespace amd +} // namespace amd -#endif // _WIN32 || __CYGWIN__ +#endif // _WIN32 || __CYGWIN__ diff --git a/rocclr/runtime/platform/agent.cpp b/rocclr/runtime/platform/agent.cpp index 221c8abf70..f9e5869c3a 100644 --- a/rocclr/runtime/platform/agent.cpp +++ b/rocclr/runtime/platform/agent.cpp @@ -15,516 +15,425 @@ namespace amd { -typedef cl_int (CL_CALLBACK * clAgent_OnLoad_fn)(cl_agent * agent); -typedef void (CL_CALLBACK * clAgent_OnUnload_fn)(cl_agent * agent); +typedef cl_int(CL_CALLBACK* clAgent_OnLoad_fn)(cl_agent* agent); +typedef void(CL_CALLBACK* clAgent_OnUnload_fn)(cl_agent* agent); -Agent::Agent(const char* moduleName) : - ready_(false) -{ - ::memset(&callbacks_, '\0', sizeof(callbacks_)); - ::memset(&capabilities_, '\0', sizeof(capabilities_)); +Agent::Agent(const char* moduleName) : ready_(false) { + ::memset(&callbacks_, '\0', sizeof(callbacks_)); + ::memset(&capabilities_, '\0', sizeof(capabilities_)); - library_ = Os::loadLibrary(moduleName); - if (library_ == NULL) { - return; - } + library_ = Os::loadLibrary(moduleName); + if (library_ == NULL) { + return; + } - clAgent_OnLoad_fn onLoad = reinterpret_cast( - Os::getSymbol(library_, "clAgent_OnLoad")); - if (onLoad == NULL) { - return; - } + clAgent_OnLoad_fn onLoad = + reinterpret_cast(Os::getSymbol(library_, "clAgent_OnLoad")); + if (onLoad == NULL) { + return; + } - _cl_agent* agent = static_cast<_cl_agent*>(this); - ::memcpy(agent, &entryPoints_, sizeof(entryPoints_)); + _cl_agent* agent = static_cast<_cl_agent*>(this); + ::memcpy(agent, &entryPoints_, sizeof(entryPoints_)); - // Register in the agents linked-list. - next_ = list_; - list_ = this; + // Register in the agents linked-list. 
+ next_ = list_; + list_ = this; - if (onLoad(agent) != CL_SUCCESS) { - list_ = list_->next_; - } + if (onLoad(agent) != CL_SUCCESS) { + list_ = list_->next_; + } - // Mark this instance as ready for use. - ready_ = true; + // Mark this instance as ready for use. + ready_ = true; } -Agent::~Agent() -{ - if (library_ != NULL) { - clAgent_OnUnload_fn onUnload = reinterpret_cast( - Os::getSymbol(library_, "clAgent_OnUnload")); +Agent::~Agent() { + if (library_ != NULL) { + clAgent_OnUnload_fn onUnload = + reinterpret_cast(Os::getSymbol(library_, "clAgent_OnUnload")); - if (onUnload != NULL) { - onUnload(static_cast(this)); - } - - Os::unloadLibrary(library_); - } -} - -cl_int -Agent::setCallbacks(const cl_agent_callbacks *callbacks, size_t size) -{ - // FIXME_lmoriche: check size - memcpy(&callbacks_, callbacks, size); - return CL_SUCCESS; -} - -cl_int -Agent::getCapabilities(cl_agent_capabilities* caps) -{ - if (caps == NULL) { - return CL_INVALID_VALUE; - } - *caps = capabilities_; - return CL_SUCCESS; -} - -static inline cl_agent_capabilities -operator ~ (const cl_agent_capabilities& src) -{ - cl_agent_capabilities result; - - const char* a = reinterpret_cast(&src); - char *b = reinterpret_cast(&result); - for (size_t i = 0; i < sizeof(cl_agent_capabilities); ++i) { - *b++ = ~*a++; + if (onUnload != NULL) { + onUnload(static_cast(this)); } - return result; + Os::unloadLibrary(library_); + } } -static inline cl_agent_capabilities -operator | (const cl_agent_capabilities& lhs, const cl_agent_capabilities& rhs) -{ - cl_agent_capabilities result; - - const char* a = reinterpret_cast(&lhs); - const char* b = reinterpret_cast(&rhs); - char *c = reinterpret_cast(&result); - for (size_t i = 0; i < sizeof(cl_agent_capabilities); ++i) { - *c++ = *a++ | *b++; - } - - return result; +cl_int Agent::setCallbacks(const cl_agent_callbacks* callbacks, size_t size) { + // FIXME_lmoriche: check size + memcpy(&callbacks_, callbacks, size); + return CL_SUCCESS; } -static inline 
cl_agent_capabilities -operator & (const cl_agent_capabilities& lhs, const cl_agent_capabilities& rhs) -{ - cl_agent_capabilities result; - - const char* a = reinterpret_cast(&lhs); - const char* b = reinterpret_cast(&rhs); - char *c = reinterpret_cast(&result); - for (size_t i = 0; i < sizeof(cl_agent_capabilities); ++i) { - *c++ = *a++ & *b++; - } - - return result; +cl_int Agent::getCapabilities(cl_agent_capabilities* caps) { + if (caps == NULL) { + return CL_INVALID_VALUE; + } + *caps = capabilities_; + return CL_SUCCESS; } -static inline bool -operator == (const cl_agent_capabilities& lhs, const cl_agent_capabilities& rhs) -{ - const char* a = reinterpret_cast(&lhs); - const char* b = reinterpret_cast(&rhs); - for (size_t i = 0; i < sizeof(cl_agent_capabilities); ++i) { - if (*a++ != *b++) { - return false; - } - } +static inline cl_agent_capabilities operator~(const cl_agent_capabilities& src) { + cl_agent_capabilities result; + const char* a = reinterpret_cast(&src); + char* b = reinterpret_cast(&result); + for (size_t i = 0; i < sizeof(cl_agent_capabilities); ++i) { + *b++ = ~*a++; + } + + return result; +} + +static inline cl_agent_capabilities operator|(const cl_agent_capabilities& lhs, + const cl_agent_capabilities& rhs) { + cl_agent_capabilities result; + + const char* a = reinterpret_cast(&lhs); + const char* b = reinterpret_cast(&rhs); + char* c = reinterpret_cast(&result); + for (size_t i = 0; i < sizeof(cl_agent_capabilities); ++i) { + *c++ = *a++ | *b++; + } + + return result; +} + +static inline cl_agent_capabilities operator&(const cl_agent_capabilities& lhs, + const cl_agent_capabilities& rhs) { + cl_agent_capabilities result; + + const char* a = reinterpret_cast(&lhs); + const char* b = reinterpret_cast(&rhs); + char* c = reinterpret_cast(&result); + for (size_t i = 0; i < sizeof(cl_agent_capabilities); ++i) { + *c++ = *a++ & *b++; + } + + return result; +} + +static inline bool operator==(const cl_agent_capabilities& lhs, const 
cl_agent_capabilities& rhs) { + const char* a = reinterpret_cast(&lhs); + const char* b = reinterpret_cast(&rhs); + for (size_t i = 0; i < sizeof(cl_agent_capabilities); ++i) { + if (*a++ != *b++) { + return false; + } + } + + return true; +} + +static inline bool operator!=(const cl_agent_capabilities& lhs, const cl_agent_capabilities& rhs) { + return !(lhs == rhs); +} + +cl_int Agent::setCapabilities(const cl_agent_capabilities* caps, bool install) { + ScopedLock sl(capabilitiesLock_); + + if (caps == NULL || *caps != (*caps & potentialCapabilities_)) { + return CL_INVALID_VALUE; + } + + if (install) { + capabilities_ = capabilities_ | *caps; + } else { + capabilities_ = capabilities_ & ~*caps; + } + + memset(&enabledCapabilities_, '\0', sizeof(enabledCapabilities_)); + for (Agent* agent = list_; agent != NULL; agent = agent->next_) { + enabledCapabilities_ = enabledCapabilities_ | agent->capabilities_; + } + + return CL_SUCCESS; +} + +bool Agent::init() { + ::memset(&potentialCapabilities_, '\0', sizeof(potentialCapabilities_)); + potentialCapabilities_.canGenerateContextEvents = 1; + potentialCapabilities_.canGenerateCommandQueueEvents = 1; + potentialCapabilities_.canGenerateEventEvents = 1; + // potentialCapabilities_.canGenerateMemObjectEvents = 1; + // potentialCapabilities_.canGenerateSamplerEvents = 1; + // potentialCapabilities_.canGenerateProgramEvents = 1; + // potentialCapabilities_.canGenerateKernelEvents = 1; + + const char* envVar = ::getenv("CL_AGENT"); + if (envVar == NULL) { return true; + } + + std::string token, modules = envVar; + std::istringstream iss(modules); + + while (getline(iss, token, ',')) { + Agent* agent = new Agent(token.c_str()); + if (agent == NULL || !agent->isReady()) { + delete agent; + + // Only return an error if we failed the Agent allocation. Other + // error (the agent is not ready) can be ignored. 
+ return agent != NULL; + } + } + + return true; } -static inline bool -operator != (const cl_agent_capabilities& lhs, const cl_agent_capabilities& rhs) -{ - return !(lhs == rhs); -} - -cl_int -Agent::setCapabilities(const cl_agent_capabilities* caps, bool install) -{ - ScopedLock sl(capabilitiesLock_); - - if (caps == NULL || *caps != (*caps & potentialCapabilities_)) { - return CL_INVALID_VALUE; - } - - if (install) { - capabilities_ = capabilities_ | *caps; - } - else { - capabilities_ = capabilities_ & ~*caps; - } - - memset(&enabledCapabilities_, '\0', sizeof(enabledCapabilities_)); - for (Agent* agent = list_; agent != NULL; agent = agent->next_) { - enabledCapabilities_ = enabledCapabilities_ | agent->capabilities_; - } - - return CL_SUCCESS; -} - -bool -Agent::init() -{ - ::memset(&potentialCapabilities_, '\0', sizeof(potentialCapabilities_)); - potentialCapabilities_.canGenerateContextEvents = 1; - potentialCapabilities_.canGenerateCommandQueueEvents = 1; - potentialCapabilities_.canGenerateEventEvents = 1; -// potentialCapabilities_.canGenerateMemObjectEvents = 1; -// potentialCapabilities_.canGenerateSamplerEvents = 1; -// potentialCapabilities_.canGenerateProgramEvents = 1; -// potentialCapabilities_.canGenerateKernelEvents = 1; - - const char* envVar = ::getenv("CL_AGENT"); - if (envVar == NULL) { - return true; - } - - std::string token, modules = envVar; - std::istringstream iss(modules); - - while (getline(iss, token, ',')) { - Agent* agent = new Agent(token.c_str()); - if (agent == NULL || !agent->isReady()) { - delete agent; - - // Only return an error if we failed the Agent allocation. Other - // error (the agent is not ready) can be ignored. 
- return agent != NULL; - } - } - - return true; -} - -void -Agent::tearDown() -{ - while (list_ != NULL) { - Agent* agent = list_; - list_ = list_->next_; - delete agent; - } +void Agent::tearDown() { + while (list_ != NULL) { + Agent* agent = list_; + list_ = list_->next_; + delete agent; + } } namespace agent { -static cl_int CL_API_CALL -GetVersionNumber( - cl_agent* agent, cl_int* version_ret) -{ - if (version_ret == NULL) { - return CL_INVALID_VALUE; +static cl_int CL_API_CALL GetVersionNumber(cl_agent* agent, cl_int* version_ret) { + if (version_ret == NULL) { + return CL_INVALID_VALUE; + } + *version_ret = CL_AGENT_VERSION_1_0; + return CL_SUCCESS; +} + +static cl_int CL_API_CALL GetPlatform(cl_agent* agent, cl_platform_id* platform_id_ret) { + if (platform_id_ret == NULL) { + return CL_INVALID_VALUE; + } + *platform_id_ret = AMD_PLATFORM; + return CL_SUCCESS; +} + +static cl_int CL_API_CALL GetTime(cl_agent* agent, cl_long* time_nanos) { + if (time_nanos == NULL) { + return CL_INVALID_VALUE; + } + *time_nanos = Os::timeNanos() + Os::offsetToEpochNanos(); + return CL_SUCCESS; +} + +static cl_int CL_API_CALL SetCallbacks(cl_agent* agent, const cl_agent_callbacks* callbacks, + size_t size) { + return Agent::get(agent)->setCallbacks(callbacks, size); +} + +static cl_int CL_API_CALL GetPotentialCapabilities(cl_agent* agent, + cl_agent_capabilities* capabilities) { + if (capabilities == NULL) { + return CL_INVALID_VALUE; + } + + *capabilities = Agent::potentialCapabilities(); + return CL_SUCCESS; +} + +static cl_int CL_API_CALL GetCapabilities(cl_agent* agent, cl_agent_capabilities* capabilities) { + return Agent::get(agent)->getCapabilities(capabilities); +} + +static cl_int CL_API_CALL SetCapabilities(cl_agent* agent, + const cl_agent_capabilities* capabilities, + cl_agent_capability_action action) { + return Agent::get(agent)->setCapabilities(capabilities, action == CL_AGENT_ADD_CAPABILITIES); +} + +static cl_int CL_API_CALL GetICDDispatchTable(cl_agent* 
agent, cl_icd_dispatch_table* table, + size_t size) { + // FIXME_lmoriche: check size + memcpy(table, amd::ICDDispatchedObject::icdVendorDispatch_, size); + return CL_SUCCESS; +} + +static cl_int CL_API_CALL SetICDDispatchTable(cl_agent* agent, const cl_icd_dispatch_table* table, + size_t size) { + // FIXME_lmoriche: check size + memcpy(amd::ICDDispatchedObject::icdVendorDispatch_, table, size); + return CL_SUCCESS; +} + +} // namespace agent + +cl_agent Agent::entryPoints_ = {agent::GetVersionNumber, + agent::GetPlatform, + agent::GetTime, + agent::SetCallbacks, + agent::GetPotentialCapabilities, + agent::GetCapabilities, + agent::SetCapabilities, + agent::GetICDDispatchTable, + agent::SetICDDispatchTable}; + +void Agent::postContextCreate(cl_context context) { + for (Agent* agent = list_; agent != NULL; agent = agent->next_) { + acContextCreate_fn callback = agent->callbacks_.ContextCreate; + if (callback != NULL && agent->canGenerateContextEvents()) { + callback(agent, context); } - *version_ret = CL_AGENT_VERSION_1_0; - return CL_SUCCESS; + } } -static cl_int CL_API_CALL -GetPlatform( - cl_agent* agent, cl_platform_id* platform_id_ret) -{ - if (platform_id_ret == NULL) { - return CL_INVALID_VALUE; - +void Agent::postContextFree(cl_context context) { + for (Agent* agent = list_; agent != NULL; agent = agent->next_) { + acContextFree_fn callback = agent->callbacks_.ContextFree; + if (callback != NULL && agent->canGenerateContextEvents()) { + callback(agent, context); } - *platform_id_ret = AMD_PLATFORM; - return CL_SUCCESS; + } } -static cl_int CL_API_CALL -GetTime( - cl_agent* agent, cl_long* time_nanos) -{ - if (time_nanos == NULL) { - return CL_INVALID_VALUE; - +void Agent::postCommandQueueCreate(cl_command_queue queue) { + for (Agent* agent = list_; agent != NULL; agent = agent->next_) { + acCommandQueueCreate_fn callback = agent->callbacks_.CommandQueueCreate; + if (callback != NULL && agent->canGenerateCommandQueueEvents()) { + callback(agent, queue); } - 
*time_nanos = Os::timeNanos() + Os::offsetToEpochNanos(); - return CL_SUCCESS; + } } -static cl_int CL_API_CALL -SetCallbacks( - cl_agent* agent, - const cl_agent_callbacks* callbacks, - size_t size) -{ - return Agent::get(agent)->setCallbacks(callbacks, size); -} - -static cl_int CL_API_CALL -GetPotentialCapabilities( - cl_agent* agent, cl_agent_capabilities* capabilities) -{ - if (capabilities == NULL) { - return CL_INVALID_VALUE; +void Agent::postCommandQueueFree(cl_command_queue queue) { + for (Agent* agent = list_; agent != NULL; agent = agent->next_) { + acCommandQueueFree_fn callback = agent->callbacks_.CommandQueueFree; + if (callback != NULL && agent->canGenerateCommandQueueEvents()) { + callback(agent, queue); } - - *capabilities = Agent::potentialCapabilities(); - return CL_SUCCESS; + } } -static cl_int CL_API_CALL -GetCapabilities( - cl_agent* agent, cl_agent_capabilities* capabilities) -{ - return Agent::get(agent)->getCapabilities(capabilities); -} - -static cl_int CL_API_CALL -SetCapabilities( - cl_agent* agent, - const cl_agent_capabilities* capabilities, - cl_agent_capability_action action) -{ - return Agent::get(agent)->setCapabilities( - capabilities, action == CL_AGENT_ADD_CAPABILITIES); -} - -static cl_int CL_API_CALL -GetICDDispatchTable( - cl_agent* agent, cl_icd_dispatch_table* table, size_t size) -{ - // FIXME_lmoriche: check size - memcpy(table, amd::ICDDispatchedObject::icdVendorDispatch_, size); - return CL_SUCCESS; -} - -static cl_int CL_API_CALL -SetICDDispatchTable( - cl_agent* agent, const cl_icd_dispatch_table* table, size_t size) -{ - // FIXME_lmoriche: check size - memcpy(amd::ICDDispatchedObject::icdVendorDispatch_, table, size); - return CL_SUCCESS; -} - -} // namespace agent - -cl_agent -Agent::entryPoints_ = { - agent::GetVersionNumber, - agent::GetPlatform, - agent::GetTime, - agent::SetCallbacks, - agent::GetPotentialCapabilities, - agent::GetCapabilities, - agent::SetCapabilities, - agent::GetICDDispatchTable, - 
agent::SetICDDispatchTable -}; - -void -Agent::postContextCreate(cl_context context) -{ - for (Agent* agent = list_; agent != NULL; agent = agent->next_) { - acContextCreate_fn callback = agent->callbacks_.ContextCreate; - if (callback != NULL && agent->canGenerateContextEvents()) { - callback(agent, context); - } +void Agent::postEventCreate(cl_event event, cl_command_type type) { + for (Agent* agent = list_; agent != NULL; agent = agent->next_) { + acEventCreate_fn callback = agent->callbacks_.EventCreate; + if (callback != NULL && agent->canGenerateEventEvents()) { + callback(agent, event, type); } + } } -void -Agent::postContextFree(cl_context context) -{ - for (Agent* agent = list_; agent != NULL; agent = agent->next_) { - acContextFree_fn callback = agent->callbacks_.ContextFree; - if (callback != NULL && agent->canGenerateContextEvents()) { - callback(agent, context); - } +void Agent::postEventFree(cl_event event) { + for (Agent* agent = list_; agent != NULL; agent = agent->next_) { + acEventFree_fn callback = agent->callbacks_.EventFree; + if (callback != NULL && agent->canGenerateEventEvents()) { + callback(agent, event); } + } } -void -Agent::postCommandQueueCreate(cl_command_queue queue) -{ - for (Agent* agent = list_; agent != NULL; agent = agent->next_) { - acCommandQueueCreate_fn callback = agent->callbacks_.CommandQueueCreate; - if (callback != NULL && agent->canGenerateCommandQueueEvents()) { - callback(agent, queue); - } +void Agent::postEventStatusChanged(cl_event event, cl_int status, cl_long ts) { + for (Agent* agent = list_; agent != NULL; agent = agent->next_) { + acEventStatusChanged_fn callback = agent->callbacks_.EventStatusChanged; + if (callback != NULL && agent->canGenerateEventEvents()) { + callback(agent, event, status, ts); } + } } -void -Agent::postCommandQueueFree(cl_command_queue queue) -{ - for (Agent* agent = list_; agent != NULL; agent = agent->next_) { - acCommandQueueFree_fn callback = agent->callbacks_.CommandQueueFree; - if 
(callback != NULL && agent->canGenerateCommandQueueEvents()) { - callback(agent, queue); - } +void Agent::postMemObjectCreate(cl_mem memobj) { + for (Agent* agent = list_; agent != NULL; agent = agent->next_) { + acMemObjectCreate_fn callback = agent->callbacks_.MemObjectCreate; + if (callback != NULL && agent->canGenerateMemObjectEvents()) { + callback(agent, memobj); } + } } -void -Agent::postEventCreate(cl_event event, cl_command_type type) -{ - for (Agent* agent = list_; agent != NULL; agent = agent->next_) { - acEventCreate_fn callback = agent->callbacks_.EventCreate; - if (callback != NULL && agent->canGenerateEventEvents()) { - callback(agent, event, type); - } +void Agent::postMemObjectFree(cl_mem memobj) { + for (Agent* agent = list_; agent != NULL; agent = agent->next_) { + acMemObjectFree_fn callback = agent->callbacks_.MemObjectFree; + if (callback != NULL && agent->canGenerateMemObjectEvents()) { + callback(agent, memobj); } + } } -void -Agent::postEventFree(cl_event event) -{ - for (Agent* agent = list_; agent != NULL; agent = agent->next_) { - acEventFree_fn callback = agent->callbacks_.EventFree; - if (callback != NULL && agent->canGenerateEventEvents()) { - callback(agent, event); - } +void Agent::postMemObjectAcquired(cl_mem memobj, cl_device_id device, cl_long elapsed) { + for (Agent* agent = list_; agent != NULL; agent = agent->next_) { + acMemObjectAcquired_fn callback = agent->callbacks_.MemObjectAcquired; + if (callback != NULL && agent->canGenerateMemObjectEvents()) { + callback(agent, memobj, device, elapsed); } + } } -void -Agent::postEventStatusChanged(cl_event event, cl_int status, cl_long ts) -{ - for (Agent* agent = list_; agent != NULL; agent = agent->next_) { - acEventStatusChanged_fn callback = agent->callbacks_.EventStatusChanged; - if (callback != NULL && agent->canGenerateEventEvents()) { - callback(agent, event, status, ts); - } +void Agent::postSamplerCreate(cl_sampler sampler) { + for (Agent* agent = list_; agent != NULL; 
agent = agent->next_) { + acSamplerCreate_fn callback = agent->callbacks_.SamplerCreate; + if (callback != NULL && agent->canGenerateSamplerEvents()) { + callback(agent, sampler); } + } } -void -Agent::postMemObjectCreate(cl_mem memobj) -{ - for (Agent* agent = list_; agent != NULL; agent = agent->next_) { - acMemObjectCreate_fn callback = agent->callbacks_.MemObjectCreate; - if (callback != NULL && agent->canGenerateMemObjectEvents()) { - callback(agent, memobj); - } +void Agent::postSamplerFree(cl_sampler sampler) { + for (Agent* agent = list_; agent != NULL; agent = agent->next_) { + acSamplerFree_fn callback = agent->callbacks_.SamplerFree; + if (callback != NULL && agent->canGenerateSamplerEvents()) { + callback(agent, sampler); } + } } -void -Agent::postMemObjectFree(cl_mem memobj) -{ - for (Agent* agent = list_; agent != NULL; agent = agent->next_) { - acMemObjectFree_fn callback = agent->callbacks_.MemObjectFree; - if (callback != NULL && agent->canGenerateMemObjectEvents()) { - callback(agent, memobj); - } +void Agent::postProgramCreate(cl_program program) { + for (Agent* agent = list_; agent != NULL; agent = agent->next_) { + acProgramCreate_fn callback = agent->callbacks_.ProgramCreate; + if (callback != NULL && agent->canGenerateProgramEvents()) { + callback(agent, program); } + } } -void -Agent::postMemObjectAcquired( - cl_mem memobj, cl_device_id device, cl_long elapsed) -{ - for (Agent* agent = list_; agent != NULL; agent = agent->next_) { - acMemObjectAcquired_fn callback = agent->callbacks_.MemObjectAcquired; - if (callback != NULL && agent->canGenerateMemObjectEvents()) { - callback(agent, memobj, device, elapsed); - } +void Agent::postProgramFree(cl_program program) { + for (Agent* agent = list_; agent != NULL; agent = agent->next_) { + acProgramFree_fn callback = agent->callbacks_.ProgramFree; + if (callback != NULL && agent->canGenerateProgramEvents()) { + callback(agent, program); } + } } -void -Agent::postSamplerCreate(cl_sampler sampler) -{ 
- for (Agent* agent = list_; agent != NULL; agent = agent->next_) { - acSamplerCreate_fn callback = agent->callbacks_.SamplerCreate; - if (callback != NULL && agent->canGenerateSamplerEvents()) { - callback(agent, sampler); - } +void Agent::postProgramBuild(cl_program program) { + for (Agent* agent = list_; agent != NULL; agent = agent->next_) { + acProgramBuild_fn callback = agent->callbacks_.ProgramBuild; + if (callback != NULL && agent->canGenerateProgramEvents()) { + callback(agent, program); } + } } -void -Agent::postSamplerFree(cl_sampler sampler) -{ - for (Agent* agent = list_; agent != NULL; agent = agent->next_) { - acSamplerFree_fn callback = agent->callbacks_.SamplerFree; - if (callback != NULL && agent->canGenerateSamplerEvents()) { - callback(agent, sampler); - } +void Agent::postKernelCreate(cl_kernel kernel) { + for (Agent* agent = list_; agent != NULL; agent = agent->next_) { + acKernelCreate_fn callback = agent->callbacks_.KernelCreate; + if (callback != NULL && agent->canGenerateKernelEvents()) { + callback(agent, kernel); } + } } -void -Agent::postProgramCreate(cl_program program) -{ - for (Agent* agent = list_; agent != NULL; agent = agent->next_) { - acProgramCreate_fn callback = agent->callbacks_.ProgramCreate; - if (callback != NULL && agent->canGenerateProgramEvents()) { - callback(agent, program); - } +void Agent::postKernelFree(cl_kernel kernel) { + for (Agent* agent = list_; agent != NULL; agent = agent->next_) { + acKernelFree_fn callback = agent->callbacks_.KernelFree; + if (callback != NULL && agent->canGenerateKernelEvents()) { + callback(agent, kernel); } + } } -void -Agent::postProgramFree(cl_program program) -{ - for (Agent* agent = list_; agent != NULL; agent = agent->next_) { - acProgramFree_fn callback = agent->callbacks_.ProgramFree; - if (callback != NULL && agent->canGenerateProgramEvents()) { - callback(agent, program); - } - } -} - -void -Agent::postProgramBuild(cl_program program) -{ - for (Agent* agent = list_; agent != 
NULL; agent = agent->next_) { - acProgramBuild_fn callback = agent->callbacks_.ProgramBuild; - if (callback != NULL && agent->canGenerateProgramEvents()) { - callback(agent, program); - } - } -} - -void -Agent::postKernelCreate(cl_kernel kernel) -{ - for (Agent* agent = list_; agent != NULL; agent = agent->next_) { - acKernelCreate_fn callback = agent->callbacks_.KernelCreate; - if (callback != NULL && agent->canGenerateKernelEvents()) { - callback(agent, kernel); - } - } -} - -void -Agent::postKernelFree(cl_kernel kernel) -{ - for (Agent* agent = list_; agent != NULL; agent = agent->next_) { - acKernelFree_fn callback = agent->callbacks_.KernelFree; - if (callback != NULL && agent->canGenerateKernelEvents()) { - callback(agent, kernel); - } - } -} - -void -Agent::postKernelSetArg( - cl_kernel kernel, cl_int index, size_t size, const void* value_ptr) -{ - for (Agent* agent = list_; agent != NULL; agent = agent->next_) { - acKernelSetArg_fn callback = agent->callbacks_.KernelSetArg; - if (callback != NULL && agent->canGenerateKernelEvents()) { - callback(agent, kernel, index, size, value_ptr); - } +void Agent::postKernelSetArg(cl_kernel kernel, cl_int index, size_t size, const void* value_ptr) { + for (Agent* agent = list_; agent != NULL; agent = agent->next_) { + acKernelSetArg_fn callback = agent->callbacks_.KernelSetArg; + if (callback != NULL && agent->canGenerateKernelEvents()) { + callback(agent, kernel, index, size, value_ptr); } + } } Agent* Agent::list_ = NULL; Monitor Agent::capabilitiesLock_; -cl_agent_capabilities Agent::enabledCapabilities_ = { 0 }; -cl_agent_capabilities Agent::potentialCapabilities_ = { 0 }; +cl_agent_capabilities Agent::enabledCapabilities_ = {0}; +cl_agent_capabilities Agent::potentialCapabilities_ = {0}; -} // namespace amd +} // namespace amd diff --git a/rocclr/runtime/platform/agent.hpp b/rocclr/runtime/platform/agent.hpp index c7d02c83a4..fad63c3fc8 100644 --- a/rocclr/runtime/platform/agent.hpp +++ 
b/rocclr/runtime/platform/agent.hpp @@ -12,139 +12,131 @@ namespace amd { -class Agent : public _cl_agent -{ -private: - //! Linked list of agent instances - static Agent* list_; - //! Agent API entry points - static cl_agent entryPoints_; - //! Capabilities supported by this Agent implementation - static cl_agent_capabilities potentialCapabilities_; - //! Union of all agent's enabled capabilities - static cl_agent_capabilities enabledCapabilities_; - //! Monitor to protect the global capabilities - static Monitor capabilitiesLock_; +class Agent : public _cl_agent { + private: + //! Linked list of agent instances + static Agent* list_; + //! Agent API entry points + static cl_agent entryPoints_; + //! Capabilities supported by this Agent implementation + static cl_agent_capabilities potentialCapabilities_; + //! Union of all agent's enabled capabilities + static cl_agent_capabilities enabledCapabilities_; + //! Monitor to protect the global capabilities + static Monitor capabilitiesLock_; -public: - //! Initialize the OpenCL agent - static bool init(); - //! Teardown the agent. - static void tearDown(); - //! Return the capabilities supported by this agent. - static cl_agent_capabilities potentialCapabilities() { - return potentialCapabilities_; - } + public: + //! Initialize the OpenCL agent + static bool init(); + //! Teardown the agent. + static void tearDown(); + //! Return the capabilities supported by this agent. 
+ static cl_agent_capabilities potentialCapabilities() { return potentialCapabilities_; } -#define AGENT_FLAG(name) \ - inline static bool shouldPost##name() { \ - return enabledCapabilities_.canGenerate##name != 0; \ - } +#define AGENT_FLAG(name) \ + inline static bool shouldPost##name() { return enabledCapabilities_.canGenerate##name != 0; } - AGENT_FLAG(ContextEvents); - AGENT_FLAG(CommandQueueEvents); - AGENT_FLAG(EventEvents); - AGENT_FLAG(MemObjectEvents); - AGENT_FLAG(SamplerEvents); - AGENT_FLAG(ProgramEvents); - AGENT_FLAG(KernelEvents); + AGENT_FLAG(ContextEvents); + AGENT_FLAG(CommandQueueEvents); + AGENT_FLAG(EventEvents); + AGENT_FLAG(MemObjectEvents); + AGENT_FLAG(SamplerEvents); + AGENT_FLAG(ProgramEvents); + AGENT_FLAG(KernelEvents); #undef AGENT_FLAG - //! Post a context creation event - static void postContextCreate(cl_context context); - //! Post a context destruction event - static void postContextFree(cl_context context); + //! Post a context creation event + static void postContextCreate(cl_context context); + //! Post a context destruction event + static void postContextFree(cl_context context); - //! Post a command queue creation event - static void postCommandQueueCreate(cl_command_queue queue); - //! Post a command queue destruction event - static void postCommandQueueFree(cl_command_queue queue); + //! Post a command queue creation event + static void postCommandQueueCreate(cl_command_queue queue); + //! Post a command queue destruction event + static void postCommandQueueFree(cl_command_queue queue); - //! Post an event creation event - static void postEventCreate(cl_event event, cl_command_type type); - //! Post an event destruction event - static void postEventFree(cl_event event); - //! Post and event status change event. - static void postEventStatusChanged( - cl_event event, cl_int execution_status, cl_long epoch_timestamp); + //! Post an event creation event + static void postEventCreate(cl_event event, cl_command_type type); + //! 
Post an event destruction event + static void postEventFree(cl_event event); + //! Post and event status change event. + static void postEventStatusChanged(cl_event event, cl_int execution_status, + cl_long epoch_timestamp); - //! Post a memory object creation event - static void postMemObjectCreate(cl_mem memobj); - //! Post a memory object destruction event - static void postMemObjectFree(cl_mem memobj); - //! Post a memory transfer (acquired by device) event - static void postMemObjectAcquired( - cl_mem memobj, cl_device_id device, cl_long elapsed_time); + //! Post a memory object creation event + static void postMemObjectCreate(cl_mem memobj); + //! Post a memory object destruction event + static void postMemObjectFree(cl_mem memobj); + //! Post a memory transfer (acquired by device) event + static void postMemObjectAcquired(cl_mem memobj, cl_device_id device, cl_long elapsed_time); - //! Post a sampler creation event - static void postSamplerCreate(cl_sampler sampler); - //! Post a sampler destruction event - static void postSamplerFree(cl_sampler sampler); + //! Post a sampler creation event + static void postSamplerCreate(cl_sampler sampler); + //! Post a sampler destruction event + static void postSamplerFree(cl_sampler sampler); - //! Post a program creation event - static void postProgramCreate(cl_program program); - //! Post a program destruction event - static void postProgramFree(cl_program program); - //! Post a program build event - static void postProgramBuild(cl_program program); + //! Post a program creation event + static void postProgramCreate(cl_program program); + //! Post a program destruction event + static void postProgramFree(cl_program program); + //! Post a program build event + static void postProgramBuild(cl_program program); - //! Post a kernel creation event - static void postKernelCreate(cl_kernel kernel); - //! Post a kernel destruction event - static void postKernelFree(cl_kernel kernel); - //! 
Post a kernel set argument event - static void postKernelSetArg( - cl_kernel kernel, cl_int arg_index, size_t size, const void* value_ptr); + //! Post a kernel creation event + static void postKernelCreate(cl_kernel kernel); + //! Post a kernel destruction event + static void postKernelFree(cl_kernel kernel); + //! Post a kernel set argument event + static void postKernelSetArg(cl_kernel kernel, cl_int arg_index, size_t size, + const void* value_ptr); -private: - Agent* next_; //!< Next agent in the linked-list. - void* library_; //!< Handle to the loaded module. - bool ready_; //!< Is this instance ready? + private: + Agent* next_; //!< Next agent in the linked-list. + void* library_; //!< Handle to the loaded module. + bool ready_; //!< Is this instance ready? - //! Callbacks vector. - cl_agent_callbacks callbacks_; - //! Capabilities for this agent. - cl_agent_capabilities capabilities_; + //! Callbacks vector. + cl_agent_callbacks callbacks_; + //! Capabilities for this agent. + cl_agent_capabilities capabilities_; -#define AGENT_FLAG(name) \ - inline bool canGenerate##name() { \ - return capabilities_.canGenerate##name != 0; \ - } +#define AGENT_FLAG(name) \ + inline bool canGenerate##name() { return capabilities_.canGenerate##name != 0; } - AGENT_FLAG(ContextEvents); - AGENT_FLAG(CommandQueueEvents); - AGENT_FLAG(EventEvents); - AGENT_FLAG(MemObjectEvents); - AGENT_FLAG(SamplerEvents); - AGENT_FLAG(ProgramEvents); - AGENT_FLAG(KernelEvents); + AGENT_FLAG(ContextEvents); + AGENT_FLAG(CommandQueueEvents); + AGENT_FLAG(EventEvents); + AGENT_FLAG(MemObjectEvents); + AGENT_FLAG(SamplerEvents); + AGENT_FLAG(ProgramEvents); + AGENT_FLAG(KernelEvents); #undef AGENT_FLAG -public: - //! Construct a new agent. - Agent(const char* moduleName); - //! Destroy the agent - ~Agent(); + public: + //! Construct a new agent. + Agent(const char* moduleName); + //! Destroy the agent + ~Agent(); - //! Return true if this instance is ready for use. 
- bool isReady() const { return ready_; } + //! Return true if this instance is ready for use. + bool isReady() const { return ready_; } - //! Set the callback vector for this agent - cl_int setCallbacks(const cl_agent_callbacks *callbacks, size_t size); + //! Set the callback vector for this agent + cl_int setCallbacks(const cl_agent_callbacks* callbacks, size_t size); - //! Return the current capabilities. - cl_int getCapabilities(cl_agent_capabilities* caps); - //! Set the current capabilities. - cl_int setCapabilities(const cl_agent_capabilities* caps, bool install); + //! Return the current capabilities. + cl_int getCapabilities(cl_agent_capabilities* caps); + //! Set the current capabilities. + cl_int setCapabilities(const cl_agent_capabilities* caps, bool install); - //! Return the Agent instance from the given cl_agent - inline static Agent* get(cl_agent* agent) { - return const_cast(static_cast(agent)); - } + //! Return the Agent instance from the given cl_agent + inline static Agent* get(cl_agent* agent) { + return const_cast(static_cast(agent)); + } }; -} // namespace amd +} // namespace amd -#endif // AGENT_HPP_ +#endif // AGENT_HPP_ diff --git a/rocclr/runtime/platform/command.cpp b/rocclr/runtime/platform/command.cpp index 003fa38302..efe02a6802 100644 --- a/rocclr/runtime/platform/command.cpp +++ b/rocclr/runtime/platform/command.cpp @@ -26,634 +26,536 @@ namespace amd { Event::Event(HostQueue& queue) - : callbacks_(NULL) - , status_(CL_INT_MAX) - , profilingInfo_( - queue.properties().test(CL_QUEUE_PROFILING_ENABLE) - || Agent::shouldPostEventEvents()) -{ notified_.clear(); } - -Event::Event() - : callbacks_(NULL) - , status_(CL_SUBMITTED) -{ notified_.clear(); } - -Event::~Event() -{ - CallBackEntry* callback = callbacks_; - while (callback != NULL) { - CallBackEntry* next = callback->next_; - delete callback; - callback = next; - } + : callbacks_(NULL), + status_(CL_INT_MAX), + profilingInfo_(queue.properties().test(CL_QUEUE_PROFILING_ENABLE) || + 
Agent::shouldPostEventEvents()) { + notified_.clear(); } -uint64_t -Event::recordProfilingInfo(cl_int status, uint64_t timeStamp) -{ - if (timeStamp == 0) { - timeStamp = Os::timeNanos(); - } - switch (status) { +Event::Event() : callbacks_(NULL), status_(CL_SUBMITTED) { notified_.clear(); } + +Event::~Event() { + CallBackEntry* callback = callbacks_; + while (callback != NULL) { + CallBackEntry* next = callback->next_; + delete callback; + callback = next; + } +} + +uint64_t Event::recordProfilingInfo(cl_int status, uint64_t timeStamp) { + if (timeStamp == 0) { + timeStamp = Os::timeNanos(); + } + switch (status) { case CL_QUEUED: - profilingInfo_.queued_ = timeStamp; - break; + profilingInfo_.queued_ = timeStamp; + break; case CL_SUBMITTED: - profilingInfo_.submitted_ = timeStamp; - break; + profilingInfo_.submitted_ = timeStamp; + break; case CL_RUNNING: - profilingInfo_.start_ = timeStamp; - break; + profilingInfo_.start_ = timeStamp; + break; default: - profilingInfo_.end_ = timeStamp; - if (profilingInfo_.callback_ != NULL) { - profilingInfo_.callback_->callback(timeStamp - profilingInfo_.start_); - } - break; - } - return timeStamp; + profilingInfo_.end_ = timeStamp; + if (profilingInfo_.callback_ != NULL) { + profilingInfo_.callback_->callback(timeStamp - profilingInfo_.start_); + } + break; + } + return timeStamp; } -bool -Event::setStatus(cl_int status, uint64_t timeStamp) -{ - assert(status <= CL_QUEUED && "invalid status"); +bool Event::setStatus(cl_int status, uint64_t timeStamp) { + assert(status <= CL_QUEUED && "invalid status"); - cl_int currentStatus = status_; - if (currentStatus <= CL_COMPLETE || currentStatus <= status) { - // We can only move forward in the execution status. - return false; + cl_int currentStatus = status_; + if (currentStatus <= CL_COMPLETE || currentStatus <= status) { + // We can only move forward in the execution status. 
+ return false; + } + + if (profilingInfo().enabled_) { + timeStamp = recordProfilingInfo(status, timeStamp); + } + + if (!make_atomic(status_).compareAndSet(currentStatus, status)) { + // Somebody else beat us to it, let them deal with the release/signal. + return false; + } + + if (callbacks_ != (CallBackEntry*)0) { + processCallbacks(status); + } + + if (Agent::shouldPostEventEvents() && command().type() != 0) { + Agent::postEventStatusChanged(as_cl(this), status, timeStamp + Os::offsetToEpochNanos()); + } + + if (status <= CL_COMPLETE) { + // Before we notify the waiters that this event reached the CL_COMPLETE + // status, we release all the resources associated with this instance. + releaseResources(); + + // Broadcast all the waiters. + if (referenceCount() > 1) { + signal(); } + release(); + } - if (profilingInfo().enabled_) { - timeStamp = recordProfilingInfo(status, timeStamp); - } - - if (!make_atomic(status_).compareAndSet(currentStatus, status)) { - // Somebody else beat us to it, let them deal with the release/signal. - return false; - } - - if (callbacks_ != (CallBackEntry*)0) { - processCallbacks(status); - } - - if (Agent::shouldPostEventEvents() && command().type() != 0) { - Agent::postEventStatusChanged( - as_cl(this), status, timeStamp + Os::offsetToEpochNanos()); - } - - if (status <= CL_COMPLETE) { - // Before we notify the waiters that this event reached the CL_COMPLETE - // status, we release all the resources associated with this instance. - releaseResources(); - - // Broadcast all the waiters. 
- if (referenceCount() > 1) { - signal(); - } - release(); - } - - return true; + return true; } -bool -Event::setCallback(cl_int status, Event::CallBackFunction callback, void* data) -{ - assert(status >= CL_COMPLETE && status <= CL_QUEUED && "invalid status"); +bool Event::setCallback(cl_int status, Event::CallBackFunction callback, void* data) { + assert(status >= CL_COMPLETE && status <= CL_QUEUED && "invalid status"); - CallBackEntry* entry = new CallBackEntry(status, callback, data); - if (entry == NULL) { - return false; + CallBackEntry* entry = new CallBackEntry(status, callback, data); + if (entry == NULL) { + return false; + } + + entry->next_ = callbacks_; + while (!callbacks_.compare_exchange_weak(entry->next_, entry)) + ; // Someone else is also updating the head of the linked list! reload. + + // Check if the event has already reached 'status' + if (status_ <= status && entry->callback_ != CallBackFunction(0)) { + if (entry->callback_.exchange(NULL) != NULL) { + callback(as_cl(this), status, entry->data_); } + } - entry->next_ = callbacks_; - while (!callbacks_.compare_exchange_weak(entry->next_, entry)) - ; // Someone else is also updating the head of the linked list! reload. - - // Check if the event has already reached 'status' - if (status_ <= status && entry->callback_ != CallBackFunction(0)) { - if (entry->callback_.exchange(NULL) != NULL) { - callback(as_cl(this), status, entry->data_); - } - } - - return true; + return true; } -void -Event::processCallbacks(cl_int status) const -{ - cl_event event = const_cast(as_cl(this)); - const cl_int mask = (status > CL_COMPLETE) ? status : CL_COMPLETE; +void Event::processCallbacks(cl_int status) const { + cl_event event = const_cast(as_cl(this)); + const cl_int mask = (status > CL_COMPLETE) ? 
status : CL_COMPLETE; - // For_each callback: - CallBackEntry* entry; - for (entry = callbacks_; entry != NULL; entry = entry->next_) { - // If the entry's status matches the mask, - if (entry->status_ == mask && entry->callback_ != CallBackFunction(0)) { - // invoke the callback function. - CallBackFunction callback = entry->callback_.exchange(NULL); - if (callback != NULL) { - callback(event, status, entry->data_); - } - } + // For_each callback: + CallBackEntry* entry; + for (entry = callbacks_; entry != NULL; entry = entry->next_) { + // If the entry's status matches the mask, + if (entry->status_ == mask && entry->callback_ != CallBackFunction(0)) { + // invoke the callback function. + CallBackFunction callback = entry->callback_.exchange(NULL); + if (callback != NULL) { + callback(event, status, entry->data_); + } } + } } -bool -Event::awaitCompletion() -{ - if (status_ > CL_COMPLETE) { - // Notifies current command queue about waiting - if (!notifyCmdQueue()) { - return false; - } - - ScopedLock lock(lock_); - - // Wait until the status becomes CL_COMPLETE or negative. - while (status_ > CL_COMPLETE) { - lock_.wait(); - } +bool Event::awaitCompletion() { + if (status_ > CL_COMPLETE) { + // Notifies current command queue about waiting + if (!notifyCmdQueue()) { + return false; } - return status_ == CL_COMPLETE; + ScopedLock lock(lock_); + + // Wait until the status becomes CL_COMPLETE or negative. + while (status_ > CL_COMPLETE) { + lock_.wait(); + } + } + + return status_ == CL_COMPLETE; } -bool -Event::notifyCmdQueue() -{ - HostQueue* queue = command().queue(); - if ((NULL != queue) && !notified_.test_and_set()) { - // Make sure the queue is draining the enqueued commands. 
- amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this); - if (command == NULL) { - notified_.clear(); - return false; - } - command->enqueue(); - command->release(); +bool Event::notifyCmdQueue() { + HostQueue* queue = command().queue(); + if ((NULL != queue) && !notified_.test_and_set()) { + // Make sure the queue is draining the enqueued commands. + amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this); + if (command == NULL) { + notified_.clear(); + return false; } - return true; + command->enqueue(); + command->release(); + } + return true; } const Event::EventWaitList Event::nullWaitList(0); -Command::Command( - HostQueue& queue, - cl_command_type type, - const EventWaitList& eventWaitList) : - Event(queue), queue_(&queue), next_(NULL), type_(type), - exception_(0), data_(NULL), eventWaitList_(eventWaitList) -{ - // Retain the commands from the event wait list. - std::for_each( - eventWaitList.begin(), - eventWaitList.end(), - std::mem_fun(&Command::retain)); +Command::Command(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList) + : Event(queue), + queue_(&queue), + next_(NULL), + type_(type), + exception_(0), + data_(NULL), + eventWaitList_(eventWaitList) { + // Retain the commands from the event wait list. + std::for_each(eventWaitList.begin(), eventWaitList.end(), std::mem_fun(&Command::retain)); } -void -Command::releaseResources() -{ - const Command::EventWaitList& events = eventWaitList(); +void Command::releaseResources() { + const Command::EventWaitList& events = eventWaitList(); - // Release the commands from the event wait list. - std::for_each( - events.begin(), - events.end(), - std::mem_fun(&Command::release)); + // Release the commands from the event wait list. 
+ std::for_each(events.begin(), events.end(), std::mem_fun(&Command::release)); } -void -Command::enqueue() -{ - assert(queue_ != NULL && "Cannot be enqueued"); +void Command::enqueue() { + assert(queue_ != NULL && "Cannot be enqueued"); - if (Agent::shouldPostEventEvents() && type_ != 0) { - Agent::postEventCreate(as_cl(static_cast(this)), type_); - } + if (Agent::shouldPostEventEvents() && type_ != 0) { + Agent::postEventCreate(as_cl(static_cast(this)), type_); + } - queue_->append(*this); - queue_->flush(); - if (queue_->device().settings().waitCommand_ && (type_ != 0)) { - awaitCompletion(); - } + queue_->append(*this); + queue_->flush(); + if (queue_->device().settings().waitCommand_ && (type_ != 0)) { + awaitCompletion(); + } } -const Context& -Command::context() const -{ - return queue_->context(); -} +const Context& Command::context() const { return queue_->context(); } -NDRangeKernelCommand::NDRangeKernelCommand( - HostQueue& queue, - const EventWaitList& eventWaitList, - Kernel& kernel, - const NDRangeContainer& sizes) : - Command(queue, CL_COMMAND_NDRANGE_KERNEL, eventWaitList), - kernel_(kernel), sizes_(sizes) -{ - parameters_ = kernel.parameters().capture(queue.device()); - auto& device = queue.device(); - auto devKernel = const_cast(kernel.getDeviceKernel(device)); - profilingInfo_.setCallback(devKernel->getProfilingCallback(queue.vdev())); - fixme_guarantee(parameters_ != NULL && "out of memory"); - kernel_.retain(); +NDRangeKernelCommand::NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList, + Kernel& kernel, const NDRangeContainer& sizes) + : Command(queue, CL_COMMAND_NDRANGE_KERNEL, eventWaitList), kernel_(kernel), sizes_(sizes) { + parameters_ = kernel.parameters().capture(queue.device()); + auto& device = queue.device(); + auto devKernel = const_cast(kernel.getDeviceKernel(device)); + profilingInfo_.setCallback(devKernel->getProfilingCallback(queue.vdev())); + fixme_guarantee(parameters_ != NULL && "out of memory"); + 
kernel_.retain(); } void NDRangeKernelCommand::releaseResources() { - kernel_.parameters().release(parameters_, queue()->device()); - DEBUG_ONLY(parameters_ = NULL); - kernel_.release(); - Command::releaseResources(); + kernel_.parameters().release(parameters_, queue()->device()); + DEBUG_ONLY(parameters_ = NULL); + kernel_.release(); + Command::releaseResources(); } -NativeFnCommand::NativeFnCommand( - HostQueue& queue, const EventWaitList& eventWaitList, - void (CL_CALLBACK *nativeFn)(void*), const void* args, size_t argsSize, - size_t numMemObjs, const cl_mem* memObjs, const void** memLocs) : - Command(queue, CL_COMMAND_NATIVE_KERNEL, eventWaitList), - nativeFn_(nativeFn), argsSize_(argsSize) -{ - args_ = new char[argsSize_]; - if (args_ == NULL) { - return; - } - ::memcpy(args_, args, argsSize_); - - memObjects_.resize(numMemObjs); - memOffsets_.resize(numMemObjs); - for (size_t i = 0; i < numMemObjs; ++i) { - Memory* obj = as_amd(memObjs[i]); - - obj->retain(); - memObjects_[i] = obj; - memOffsets_[i] = (const_address) memLocs[i] - (const_address) args; +NativeFnCommand::NativeFnCommand(HostQueue& queue, const EventWaitList& eventWaitList, + void(CL_CALLBACK* nativeFn)(void*), const void* args, + size_t argsSize, size_t numMemObjs, const cl_mem* memObjs, + const void** memLocs) + : Command(queue, CL_COMMAND_NATIVE_KERNEL, eventWaitList), + nativeFn_(nativeFn), + argsSize_(argsSize) { + args_ = new char[argsSize_]; + if (args_ == NULL) { + return; + } + ::memcpy(args_, args, argsSize_); + + memObjects_.resize(numMemObjs); + memOffsets_.resize(numMemObjs); + for (size_t i = 0; i < numMemObjs; ++i) { + Memory* obj = as_amd(memObjs[i]); + + obj->retain(); + memObjects_[i] = obj; + memOffsets_[i] = (const_address)memLocs[i] - (const_address)args; + } +} + +cl_int NativeFnCommand::invoke() { + size_t numMemObjs = memObjects_.size(); + for (size_t i = 0; i < numMemObjs; ++i) { + void* hostMemPtr = memObjects_[i]->getHostMem(); + if (hostMemPtr == NULL) { + return 
CL_MEM_OBJECT_ALLOCATION_FAILURE; } + *reinterpret_cast(&args_[memOffsets_[i]]) = hostMemPtr; + } + nativeFn_(args_); + return CL_SUCCESS; } -cl_int -NativeFnCommand::invoke() -{ - size_t numMemObjs = memObjects_.size(); - for (size_t i = 0; i < numMemObjs; ++i) { - void* hostMemPtr = memObjects_[i]->getHostMem(); - if (hostMemPtr == NULL) { - return CL_MEM_OBJECT_ALLOCATION_FAILURE; - } - *reinterpret_cast(&args_[memOffsets_[i]]) = hostMemPtr; - } - nativeFn_(args_); - return CL_SUCCESS; -} - -bool -OneMemoryArgCommand::validateMemory() - { - if (queue()->device().info().type_ & CL_DEVICE_TYPE_GPU) { - device::Memory* mem = memory_->getDeviceMemory(queue()->device()); - if (NULL == mem) { - LogPrintfError("Can't allocate memory size - 0x%08X bytes!", - memory_->getSize()); - return false; - } - } - return true; -} - -bool -TwoMemoryArgsCommand::validateMemory() -{ - if (queue()->device().info().type_ & CL_DEVICE_TYPE_GPU) { - device::Memory* mem = memory1_->getDeviceMemory(queue()->device()); - if (NULL == mem) { - LogPrintfError("Can't allocate memory size - 0x%08X bytes!", - memory1_->getSize()); - return false; - } - mem = memory2_->getDeviceMemory(queue()->device()); - if (NULL == mem) { - LogPrintfError("Can't allocate memory size - 0x%08X bytes!", - memory2_->getSize()); - return false; - } - } - return true; -} -bool -ReadMemoryCommand::isEntireMemory() const -{ - return source().isEntirelyCovered(origin(), size()); -} - -bool -WriteMemoryCommand::isEntireMemory() const -{ - return destination().isEntirelyCovered(origin(), size()); -} - -bool -SvmMapMemoryCommand::isEntireMemory() const -{ - return getSvmMem()->isEntirelyCovered(origin(), size()); -} - -bool -FillMemoryCommand::isEntireMemory() const -{ - return memory().isEntirelyCovered(origin(), size()); -} - -bool -CopyMemoryCommand::isEntireMemory() const -{ - bool result = false; - - switch (type()) { - case CL_COMMAND_COPY_IMAGE_TO_BUFFER: { - Coord3D imageSize(size()[0] * size()[1] * size()[2] * - 
source().asImage()->getImageFormat().getElementSize()); - result = source().isEntirelyCovered(srcOrigin(), size()) && - destination().isEntirelyCovered(dstOrigin(), imageSize); - } - break; - case CL_COMMAND_COPY_BUFFER_TO_IMAGE: { - Coord3D imageSize(size()[0] * size()[1] * size()[2] * - destination().asImage()->getImageFormat().getElementSize()); - result = source().isEntirelyCovered(srcOrigin(), imageSize) && - destination().isEntirelyCovered(dstOrigin(), size()); - } - break; - case CL_COMMAND_COPY_BUFFER_RECT: { - Coord3D rectSize(size()[0] * size()[1] * size()[2]); - Coord3D srcOffs(srcRect().start_); - Coord3D dstOffs(dstRect().start_); - result = source().isEntirelyCovered(srcOffs, rectSize) && - destination().isEntirelyCovered(dstOffs, rectSize); - } - break; - default: - result = source().isEntirelyCovered(srcOrigin(), size()) && - destination().isEntirelyCovered(dstOrigin(), size()); - break; - } - return result; -} - -bool -MapMemoryCommand::isEntireMemory() const -{ - return memory().isEntirelyCovered(origin(), size()); -} - -void -UnmapMemoryCommand::releaseResources() -{ - if (queue()->device().info().type_ & CL_DEVICE_TYPE_GPU) { - //! @todo This is a workaround to a deadlock on indirect map release. - //! Remove this code when CAL will have a refcounter on memory. - //! 
decIndMapCount() has to go back to submitUnmapMemory() - device::Memory* mem = memory_->getDeviceMemory(queue()->device()); - if (NULL != mem) { - mem->releaseIndirectMap(); - } - } - OneMemoryArgCommand::releaseResources(); -} - -bool -MigrateMemObjectsCommand::validateMemory() -{ - if (queue()->device().info().type_ & CL_DEVICE_TYPE_GPU) { - std::vector::const_iterator itr; - for (itr = memObjects_.begin(); itr != memObjects_.end(); itr++) { - device::Memory* mem = (*itr)->getDeviceMemory(queue()->device()); - if (NULL == mem) { - LogPrintfError("Can't allocate memory size - 0x%08X bytes!", - (*itr)->getSize()); - return false; - } - } - } - - return true; -} - -cl_int -NDRangeKernelCommand::validateMemory() -{ - const amd::Device& device = queue()->device(); - if (device.info().type_ & CL_DEVICE_TYPE_GPU) { - // Validate the kernel before submission - if (!queue()->device().validateKernel(kernel(), queue()->vdev())) { - return CL_OUT_OF_RESOURCES; - } - - const amd::KernelSignature& signature = kernel().signature(); - for (uint i = 0; i != signature.numParameters(); ++i) { - const amd::KernelParameterDescriptor& desc = signature.at(i); - // Check if it's a memory object - if ((desc.type_ == T_POINTER) && (desc.size_ != 0)) { - amd::Memory* amdMemory; - if (kernel().parameters().boundToSvmPointer(device, - parameters_, - i)) { - //find the real mem object from svm ptr from the list - amdMemory = amd::SvmManager::FindSvmBuffer( - *reinterpret_cast(parameters() + desc.offset_)); - } - else { - amdMemory = *reinterpret_cast - (parameters() + desc.offset_); - } - if (amdMemory != NULL) { - if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_CONSTANT) { - // Make sure argument size isn't bigger than the device limit - if (amdMemory->getSize() > device.info().maxConstantBufferSize_) { - LogPrintfError("HW constant buffer is too big (0x%X bytes)!", - amdMemory->getSize()); - return CL_OUT_OF_RESOURCES; - } - } - device::Memory* mem = - 
amdMemory->getDeviceMemory(device); - if (!kernel().getDeviceKernel( - device)->validateMemory(i, amdMemory)) { - if (device.reallocMemory(*amdMemory)) { - mem = amdMemory->getDeviceMemory(device); - } - else { - mem = NULL; - } - } - if (NULL == mem) { - LogPrintfError("Can't allocate memory size - 0x%08X bytes!", - amdMemory->getSize()); - return CL_MEM_OBJECT_ALLOCATION_FAILURE; - } - } - } - } - } - return CL_SUCCESS; -} - -bool ExtObjectsCommand::validateMemory() -{ - bool retVal = true; - if (queue()->device().info().type_ & CL_DEVICE_TYPE_GPU) { - for(std::vector::const_iterator itr = memObjects_.begin(); - itr != memObjects_.end(); itr++) { - device::Memory* mem = (*itr)->getDeviceMemory(queue()->device()); - if (NULL == mem) { - LogPrintfError("Can't allocate memory size - 0x%08X bytes!", - (*itr)->getSize()); - return false; - } - retVal = processGLResource(mem); - } - } - return retVal; -} - -bool AcquireExtObjectsCommand::processGLResource(device::Memory * mem) -{ - return mem->processGLResource(device::Memory::GLDecompressResource); -} - -bool ReleaseExtObjectsCommand::processGLResource(device::Memory * mem) -{ - return mem->processGLResource(device::Memory::GLInvalidateFBO); -} - -bool -MakeBuffersResidentCommand::validateMemory() -{ - if (queue()->device().info().type_ & CL_DEVICE_TYPE_GPU) { - for(std::vector::const_iterator itr = memObjects_.begin(); - itr != memObjects_.end(); itr++) { - device::Memory* mem = (*itr)->getDeviceMemory(queue()->device()); - if (NULL == mem) { - LogPrintfError("Can't allocate memory size - 0x%08X bytes!", - (*itr)->getSize()); - return false; - } - } - } - - return true; - -} -bool -ThreadTraceMemObjectsCommand::validateMemory() -{ - if (queue()->device().info().type_ & CL_DEVICE_TYPE_GPU) { - for(std::vector::const_iterator itr = memObjects_.begin(); - itr != memObjects_.end(); itr++) { - device::Memory* mem = (*itr)->getDeviceMemory(queue()->device()); - if (NULL == mem) { - std::vector::const_iterator tmpItr; - for 
(tmpItr = memObjects_.begin(); tmpItr != itr; tmpItr++) { - device::Memory* tmpMem = (*tmpItr)->getDeviceMemory(queue()->device()); - delete tmpMem; - } - LogPrintfError("Can't allocate memory size - 0x%08X bytes!", - (*itr)->getSize()); - return false; - } - } - } - - return true; -} - -void -TransferBufferFileCommand::releaseResources() -{ - for (uint i = 0; i < NumStagingBuffers; ++i) { - if (NULL != staging_[i]) { - staging_[i]->release(); - } - } - - // Call the parent - OneMemoryArgCommand::releaseResources(); -} - -void -TransferBufferFileCommand::submit(device::VirtualDevice& device) -{ +bool OneMemoryArgCommand::validateMemory() { + if (queue()->device().info().type_ & CL_DEVICE_TYPE_GPU) { device::Memory* mem = memory_->getDeviceMemory(queue()->device()); - if (memory_->getMemFlags() & (CL_MEM_USE_HOST_PTR | - CL_MEM_ALLOC_HOST_PTR | CL_MEM_USE_PERSISTENT_MEM_AMD)) { - void* srcDstBuffer = nullptr; - if (memory_->getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) { - // Lock protected multiple maps for persistent memory - amd::ScopedLock lock(mem->owner()->lockMemoryOps()); - srcDstBuffer = mem->cpuMap(device); - } - else { - srcDstBuffer = mem->cpuMap(device); - } - // Make HD transfer to the host accessible memory - bool writeBuffer(type() == CL_COMMAND_READ_SSG_FILE_AMD); - if (!file()->transferBlock(writeBuffer, srcDstBuffer, mem->size(), - fileOffset(), origin()[0], size()[0])) { - setStatus(CL_INVALID_OPERATION); - return; - } - if (memory_->getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) { - // Lock protected multiple maps for persistent memory - amd::ScopedLock lock(mem->owner()->lockMemoryOps()); - mem->cpuUnmap(device); - } - else { - mem->cpuUnmap(device); - } - } - else { - device.submitTransferBufferFromFile(*this); + if (NULL == mem) { + LogPrintfError("Can't allocate memory size - 0x%08X bytes!", memory_->getSize()); + return false; } + } + return true; } -bool -TransferBufferFileCommand::validateMemory() -{ - if (queue()->device().info().type_ 
& CL_DEVICE_TYPE_GPU) { - // Check if the destination buffer has direct host access - if (!(memory_->getMemFlags() & (CL_MEM_USE_HOST_PTR | - CL_MEM_ALLOC_HOST_PTR | CL_MEM_USE_PERSISTENT_MEM_AMD))) { - // Allocate staging buffers - for (uint i = 0; i < NumStagingBuffers; ++i) { - staging_[i] = new (memory_->getContext()) - Buffer(memory_->getContext(), - StagingBufferMemType, StagingBufferSize); - if (NULL == staging_[i] || !staging_[i]->create(nullptr)) { - return false; - } - device::Memory* mem = staging_[i]->getDeviceMemory(queue()->device()); - if (NULL == mem) { - LogPrintfError("Can't allocate staging buffer - 0x%08X bytes!", - staging_[i]->getSize()); - return false; - } +bool TwoMemoryArgsCommand::validateMemory() { + if (queue()->device().info().type_ & CL_DEVICE_TYPE_GPU) { + device::Memory* mem = memory1_->getDeviceMemory(queue()->device()); + if (NULL == mem) { + LogPrintfError("Can't allocate memory size - 0x%08X bytes!", memory1_->getSize()); + return false; + } + mem = memory2_->getDeviceMemory(queue()->device()); + if (NULL == mem) { + LogPrintfError("Can't allocate memory size - 0x%08X bytes!", memory2_->getSize()); + return false; + } + } + return true; +} +bool ReadMemoryCommand::isEntireMemory() const { + return source().isEntirelyCovered(origin(), size()); +} + +bool WriteMemoryCommand::isEntireMemory() const { + return destination().isEntirelyCovered(origin(), size()); +} + +bool SvmMapMemoryCommand::isEntireMemory() const { + return getSvmMem()->isEntirelyCovered(origin(), size()); +} + +bool FillMemoryCommand::isEntireMemory() const { + return memory().isEntirelyCovered(origin(), size()); +} + +bool CopyMemoryCommand::isEntireMemory() const { + bool result = false; + + switch (type()) { + case CL_COMMAND_COPY_IMAGE_TO_BUFFER: { + Coord3D imageSize(size()[0] * size()[1] * size()[2] * + source().asImage()->getImageFormat().getElementSize()); + result = source().isEntirelyCovered(srcOrigin(), size()) && + 
destination().isEntirelyCovered(dstOrigin(), imageSize); + } break; + case CL_COMMAND_COPY_BUFFER_TO_IMAGE: { + Coord3D imageSize(size()[0] * size()[1] * size()[2] * + destination().asImage()->getImageFormat().getElementSize()); + result = source().isEntirelyCovered(srcOrigin(), imageSize) && + destination().isEntirelyCovered(dstOrigin(), size()); + } break; + case CL_COMMAND_COPY_BUFFER_RECT: { + Coord3D rectSize(size()[0] * size()[1] * size()[2]); + Coord3D srcOffs(srcRect().start_); + Coord3D dstOffs(dstRect().start_); + result = source().isEntirelyCovered(srcOffs, rectSize) && + destination().isEntirelyCovered(dstOffs, rectSize); + } break; + default: + result = source().isEntirelyCovered(srcOrigin(), size()) && + destination().isEntirelyCovered(dstOrigin(), size()); + break; + } + return result; +} + +bool MapMemoryCommand::isEntireMemory() const { + return memory().isEntirelyCovered(origin(), size()); +} + +void UnmapMemoryCommand::releaseResources() { + if (queue()->device().info().type_ & CL_DEVICE_TYPE_GPU) { + //! @todo This is a workaround to a deadlock on indirect map release. + //! Remove this code when CAL will have a refcounter on memory. + //! 
decIndMapCount() has to go back to submitUnmapMemory() + device::Memory* mem = memory_->getDeviceMemory(queue()->device()); + if (NULL != mem) { + mem->releaseIndirectMap(); + } + } + OneMemoryArgCommand::releaseResources(); +} + +bool MigrateMemObjectsCommand::validateMemory() { + if (queue()->device().info().type_ & CL_DEVICE_TYPE_GPU) { + std::vector::const_iterator itr; + for (itr = memObjects_.begin(); itr != memObjects_.end(); itr++) { + device::Memory* mem = (*itr)->getDeviceMemory(queue()->device()); + if (NULL == mem) { + LogPrintfError("Can't allocate memory size - 0x%08X bytes!", (*itr)->getSize()); + return false; + } + } + } + + return true; +} + +cl_int NDRangeKernelCommand::validateMemory() { + const amd::Device& device = queue()->device(); + if (device.info().type_ & CL_DEVICE_TYPE_GPU) { + // Validate the kernel before submission + if (!queue()->device().validateKernel(kernel(), queue()->vdev())) { + return CL_OUT_OF_RESOURCES; + } + + const amd::KernelSignature& signature = kernel().signature(); + for (uint i = 0; i != signature.numParameters(); ++i) { + const amd::KernelParameterDescriptor& desc = signature.at(i); + // Check if it's a memory object + if ((desc.type_ == T_POINTER) && (desc.size_ != 0)) { + amd::Memory* amdMemory; + if (kernel().parameters().boundToSvmPointer(device, parameters_, i)) { + // find the real mem object from svm ptr from the list + amdMemory = amd::SvmManager::FindSvmBuffer( + *reinterpret_cast(parameters() + desc.offset_)); + } else { + amdMemory = *reinterpret_cast(parameters() + desc.offset_); + } + if (amdMemory != NULL) { + if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_CONSTANT) { + // Make sure argument size isn't bigger than the device limit + if (amdMemory->getSize() > device.info().maxConstantBufferSize_) { + LogPrintfError("HW constant buffer is too big (0x%X bytes)!", amdMemory->getSize()); + return CL_OUT_OF_RESOURCES; } + } + device::Memory* mem = amdMemory->getDeviceMemory(device); + if 
(!kernel().getDeviceKernel(device)->validateMemory(i, amdMemory)) { + if (device.reallocMemory(*amdMemory)) { + mem = amdMemory->getDeviceMemory(device); + } else { + mem = NULL; + } + } + if (NULL == mem) { + LogPrintfError("Can't allocate memory size - 0x%08X bytes!", amdMemory->getSize()); + return CL_MEM_OBJECT_ALLOCATION_FAILURE; + } } - - device::Memory* mem = memory_->getDeviceMemory(queue()->device()); - if (NULL == mem) { - LogPrintfError("Can't allocate memory size - 0x%08X bytes!", - memory_->getSize()); - return false; - } + } } - return true; + } + return CL_SUCCESS; } -} // namespace amd +bool ExtObjectsCommand::validateMemory() { + bool retVal = true; + if (queue()->device().info().type_ & CL_DEVICE_TYPE_GPU) { + for (std::vector::const_iterator itr = memObjects_.begin(); + itr != memObjects_.end(); itr++) { + device::Memory* mem = (*itr)->getDeviceMemory(queue()->device()); + if (NULL == mem) { + LogPrintfError("Can't allocate memory size - 0x%08X bytes!", (*itr)->getSize()); + return false; + } + retVal = processGLResource(mem); + } + } + return retVal; +} + +bool AcquireExtObjectsCommand::processGLResource(device::Memory* mem) { + return mem->processGLResource(device::Memory::GLDecompressResource); +} + +bool ReleaseExtObjectsCommand::processGLResource(device::Memory* mem) { + return mem->processGLResource(device::Memory::GLInvalidateFBO); +} + +bool MakeBuffersResidentCommand::validateMemory() { + if (queue()->device().info().type_ & CL_DEVICE_TYPE_GPU) { + for (std::vector::const_iterator itr = memObjects_.begin(); + itr != memObjects_.end(); itr++) { + device::Memory* mem = (*itr)->getDeviceMemory(queue()->device()); + if (NULL == mem) { + LogPrintfError("Can't allocate memory size - 0x%08X bytes!", (*itr)->getSize()); + return false; + } + } + } + + return true; +} +bool ThreadTraceMemObjectsCommand::validateMemory() { + if (queue()->device().info().type_ & CL_DEVICE_TYPE_GPU) { + for (std::vector::const_iterator itr = memObjects_.begin(); + 
itr != memObjects_.end(); itr++) { + device::Memory* mem = (*itr)->getDeviceMemory(queue()->device()); + if (NULL == mem) { + std::vector::const_iterator tmpItr; + for (tmpItr = memObjects_.begin(); tmpItr != itr; tmpItr++) { + device::Memory* tmpMem = (*tmpItr)->getDeviceMemory(queue()->device()); + delete tmpMem; + } + LogPrintfError("Can't allocate memory size - 0x%08X bytes!", (*itr)->getSize()); + return false; + } + } + } + + return true; +} + +void TransferBufferFileCommand::releaseResources() { + for (uint i = 0; i < NumStagingBuffers; ++i) { + if (NULL != staging_[i]) { + staging_[i]->release(); + } + } + + // Call the parent + OneMemoryArgCommand::releaseResources(); +} + +void TransferBufferFileCommand::submit(device::VirtualDevice& device) { + device::Memory* mem = memory_->getDeviceMemory(queue()->device()); + if (memory_->getMemFlags() & + (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_USE_PERSISTENT_MEM_AMD)) { + void* srcDstBuffer = nullptr; + if (memory_->getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) { + // Lock protected multiple maps for persistent memory + amd::ScopedLock lock(mem->owner()->lockMemoryOps()); + srcDstBuffer = mem->cpuMap(device); + } else { + srcDstBuffer = mem->cpuMap(device); + } + // Make HD transfer to the host accessible memory + bool writeBuffer(type() == CL_COMMAND_READ_SSG_FILE_AMD); + if (!file()->transferBlock(writeBuffer, srcDstBuffer, mem->size(), fileOffset(), origin()[0], + size()[0])) { + setStatus(CL_INVALID_OPERATION); + return; + } + if (memory_->getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) { + // Lock protected multiple maps for persistent memory + amd::ScopedLock lock(mem->owner()->lockMemoryOps()); + mem->cpuUnmap(device); + } else { + mem->cpuUnmap(device); + } + } else { + device.submitTransferBufferFromFile(*this); + } +} + +bool TransferBufferFileCommand::validateMemory() { + if (queue()->device().info().type_ & CL_DEVICE_TYPE_GPU) { + // Check if the destination buffer has direct host access + 
if (!(memory_->getMemFlags() & + (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_USE_PERSISTENT_MEM_AMD))) { + // Allocate staging buffers + for (uint i = 0; i < NumStagingBuffers; ++i) { + staging_[i] = new (memory_->getContext()) + Buffer(memory_->getContext(), StagingBufferMemType, StagingBufferSize); + if (NULL == staging_[i] || !staging_[i]->create(nullptr)) { + return false; + } + device::Memory* mem = staging_[i]->getDeviceMemory(queue()->device()); + if (NULL == mem) { + LogPrintfError("Can't allocate staging buffer - 0x%08X bytes!", staging_[i]->getSize()); + return false; + } + } + } + + device::Memory* mem = memory_->getDeviceMemory(queue()->device()); + if (NULL == mem) { + LogPrintfError("Can't allocate memory size - 0x%08X bytes!", memory_->getSize()); + return false; + } + } + return true; +} + +} // namespace amd diff --git a/rocclr/runtime/platform/command.hpp b/rocclr/runtime/platform/command.hpp index 94d35ab071..5cfa667cca 100644 --- a/rocclr/runtime/platform/command.hpp +++ b/rocclr/runtime/platform/command.hpp @@ -51,139 +51,130 @@ class HostQueue; * it is associated with and can be used to synchronize operations * in a Context. */ -class Event : public RuntimeObject -{ - typedef void (CL_CALLBACK * CallBackFunction)( - cl_event event, cl_int command_exec_status, void *user_data); +class Event : public RuntimeObject { + typedef void(CL_CALLBACK* CallBackFunction)(cl_event event, cl_int command_exec_status, + void* user_data); - struct CallBackEntry : public HeapObject - { - struct CallBackEntry* next_; //!< the next entry in the callback list. + struct CallBackEntry : public HeapObject { + struct CallBackEntry* next_; //!< the next entry in the callback list. - std::atomic callback_; //!< callback function pointer. - void* data_; //!< user data passed to the callback function. - cl_int status_; //!< execution status triggering the callback. + std::atomic callback_; //!< callback function pointer. 
+ void* data_; //!< user data passed to the callback function. + cl_int status_; //!< execution status triggering the callback. - CallBackEntry(cl_int status, CallBackFunction callback, void* data) : - callback_(callback), data_(data), status_(status) - { } - }; + CallBackEntry(cl_int status, CallBackFunction callback, void* data) + : callback_(callback), data_(data), status_(status) {} + }; -public: + public: + typedef std::vector EventWaitList; - typedef std::vector EventWaitList; + private: + Monitor lock_; -private: + std::atomic callbacks_; //!< linked list of callback entries. + volatile cl_int status_; //!< current execution status. + std::atomic_flag notified_; //!< Command queue was notified - Monitor lock_; + protected: + static const EventWaitList nullWaitList; - std::atomic callbacks_; //!< linked list of callback entries. - volatile cl_int status_; //!< current execution status. - std::atomic_flag notified_; //!< Command queue was notified - -protected: - - static const EventWaitList nullWaitList; - - struct ProfilingInfo - { - ProfilingInfo(bool enabled = false) : enabled_(enabled) - { - if (enabled) { - clear(); - callback_ = NULL; - } - } - - uint64_t queued_; - uint64_t submitted_; - uint64_t start_; - uint64_t end_; - bool enabled_; - ProfilingCallback *callback_; - void clear() { - queued_ = 0ULL; - submitted_ = 0ULL; - start_ = 0ULL; - end_ = 0ULL; - } - void setCallback(ProfilingCallback *callback) { - if (callback == NULL) { - return; - } - enabled_ = true; - clear(); - callback_ = callback; - } - - } profilingInfo_; - - //! Construct a new event. - Event(); - - //! Construct a new event associated to the given command \a queue. - Event(HostQueue& queue); - - //! Destroy the event. - virtual ~Event(); - - //! Release the resources associated with this event. - virtual void releaseResources() { } - - //! Record the profiling info for the given change of \a status. 
- // If the given \a timeStamp is 0 and profiling is enabled, - // use the current host clock time instead. - uint64_t recordProfilingInfo(cl_int status, uint64_t timeStamp = 0); - - //! Process the callbacks for the given \a status change. - void processCallbacks(cl_int status) const; - -public: - //! Return the context for this event. - virtual const Context& context() const = 0; - - //! Return the command this event is associated with. - inline Command& command(); - inline const Command& command() const; - - //! Return the profiling info. - const ProfilingInfo& profilingInfo() const { return profilingInfo_; } - - //! Return this command's execution status. - cl_int status() const { return status_; } - - //! Insert the given \a callback into the callback stack. - bool setCallback(cl_int status, CallBackFunction callback, void* data); - - /*! \brief Set the event status. - * - * \details If the status becomes CL_COMPLETE, notify all threads - * awaiting this command's completion. If the given \a timeStamp is 0 - * and profiling is enabled, use the current host clock time instead. - * - * \see amd::Event::awaitCompletion - */ - bool setStatus(cl_int status, uint64_t timeStamp = 0); - - //! Signal all threads waiting on this event. - void signal() - { - ScopedLock lock(lock_); - lock_.notifyAll(); + struct ProfilingInfo { + ProfilingInfo(bool enabled = false) : enabled_(enabled) { + if (enabled) { + clear(); + callback_ = NULL; + } } - /*! \brief Suspend the current thread until the status of the Command - * associated with this event changes to CL_COMPLETE. Return true if the - * command successfully completed. 
- */ - virtual bool awaitCompletion(); + uint64_t queued_; + uint64_t submitted_; + uint64_t start_; + uint64_t end_; + bool enabled_; + ProfilingCallback* callback_; + void clear() { + queued_ = 0ULL; + submitted_ = 0ULL; + start_ = 0ULL; + end_ = 0ULL; + } + void setCallback(ProfilingCallback* callback) { + if (callback == NULL) { + return; + } + enabled_ = true; + clear(); + callback_ = callback; + } - /*! \brief Notifies current command queue about execution status - */ - bool notifyCmdQueue(); + } profilingInfo_; - //! RTTI internal implementation - virtual ObjectType objectType() const {return ObjectTypeEvent;} + //! Construct a new event. + Event(); + + //! Construct a new event associated to the given command \a queue. + Event(HostQueue& queue); + + //! Destroy the event. + virtual ~Event(); + + //! Release the resources associated with this event. + virtual void releaseResources() {} + + //! Record the profiling info for the given change of \a status. + // If the given \a timeStamp is 0 and profiling is enabled, + // use the current host clock time instead. + uint64_t recordProfilingInfo(cl_int status, uint64_t timeStamp = 0); + + //! Process the callbacks for the given \a status change. + void processCallbacks(cl_int status) const; + + public: + //! Return the context for this event. + virtual const Context& context() const = 0; + + //! Return the command this event is associated with. + inline Command& command(); + inline const Command& command() const; + + //! Return the profiling info. + const ProfilingInfo& profilingInfo() const { return profilingInfo_; } + + //! Return this command's execution status. + cl_int status() const { return status_; } + + //! Insert the given \a callback into the callback stack. + bool setCallback(cl_int status, CallBackFunction callback, void* data); + + /*! \brief Set the event status. + * + * \details If the status becomes CL_COMPLETE, notify all threads + * awaiting this command's completion. 
If the given \a timeStamp is 0 + * and profiling is enabled, use the current host clock time instead. + * + * \see amd::Event::awaitCompletion + */ + bool setStatus(cl_int status, uint64_t timeStamp = 0); + + //! Signal all threads waiting on this event. + void signal() { + ScopedLock lock(lock_); + lock_.notifyAll(); + } + + /*! \brief Suspend the current thread until the status of the Command + * associated with this event changes to CL_COMPLETE. Return true if the + * command successfully completed. + */ + virtual bool awaitCompletion(); + + /*! \brief Notifies current command queue about execution status + */ + bool notifyCmdQueue(); + + //! RTTI internal implementation + virtual ObjectType objectType() const { return ObjectTypeEvent; } }; /*! \brief An operation that is submitted to a command queue. @@ -194,202 +185,174 @@ public: * */ -class Command : public Event -{ +class Command : public Event { + private: + //! The command queue this command is enqueue into. NULL if not yet enqueue. + HostQueue* queue_; + //! Next GPU command in the queue list + Command* next_; -private: - //! The command queue this command is enqueue into. NULL if not yet enqueue. - HostQueue* queue_; - //! Next GPU command in the queue list - Command* next_; + const cl_command_type type_; //!< This command's OpenCL type. + volatile cl_int exception_; //!< The first raised exception. + void* data_; - const cl_command_type type_; //!< This command's OpenCL type. - volatile cl_int exception_; //!< The first raised exception. - void* data_; + protected: + //! The Events that need to complete before this command is submitted. + EventWaitList eventWaitList_; -protected: - //! The Events that need to complete before this command is submitted. - EventWaitList eventWaitList_; + //! Construct a new command of the given OpenCL type. + Command(HostQueue& queue, cl_command_type type, + const EventWaitList& eventWaitList = nullWaitList); - //! Construct a new command of the given OpenCL type. 
- Command( - HostQueue& queue, - cl_command_type type, - const EventWaitList& eventWaitList = nullWaitList); + //! Construct a new command of the given OpenCL type. + Command(cl_command_type type) + : Event(), + queue_(NULL), + next_(NULL), + type_(type), + exception_(0), + data_(NULL), + eventWaitList_(nullWaitList) {} - //! Construct a new command of the given OpenCL type. - Command(cl_command_type type) : - Event(), queue_(NULL), next_(NULL), type_(type), - exception_(0), data_(NULL), eventWaitList_(nullWaitList) - { } - - bool terminate() { - if (Agent::shouldPostEventEvents() && type() != 0) { - Agent::postEventFree(as_cl(static_cast(this))); - } - return true; + bool terminate() { + if (Agent::shouldPostEventEvents() && type() != 0) { + Agent::postEventFree(as_cl(static_cast(this))); } + return true; + } -public: - //! Return the queue this command is enqueued into. - HostQueue* queue() const { return queue_; } + public: + //! Return the queue this command is enqueued into. + HostQueue* queue() const { return queue_; } - //! Enqueue this command into the associated command queue. - void enqueue(); + //! Enqueue this command into the associated command queue. + void enqueue(); - //! Return the event encapsulating this command's status. - const Event& event() const { return *this; } - Event& event() { return *this; } + //! Return the event encapsulating this command's status. + const Event& event() const { return *this; } + Event& event() { return *this; } - //! Return the list of events this command needs to wait on before dispatch - const EventWaitList& eventWaitList() const { return eventWaitList_; } + //! Return the list of events this command needs to wait on before dispatch + const EventWaitList& eventWaitList() const { return eventWaitList_; } - //! Return this command's OpenCL type. - cl_command_type type() const { return type_; } + //! Return this command's OpenCL type. + cl_command_type type() const { return type_; } - //! 
Return the first raised exception or 0 if none. - cl_int exception() const { return exception_; } + //! Return the first raised exception or 0 if none. + cl_int exception() const { return exception_; } - //! Set the exception for this command. - void setException(cl_int exception) { exception_ = exception; } + //! Set the exception for this command. + void setException(cl_int exception) { exception_ = exception; } - //! Return the opaque, device specific data for this command. - void* data() const { return data_; } + //! Return the opaque, device specific data for this command. + void* data() const { return data_; } - //! Set the opaque, device specific data for this command. - void setData(void* data) { data_ = data; } + //! Set the opaque, device specific data for this command. + void setData(void* data) { data_ = data; } - /*! \brief The execution engine for this command. - * - * \details All derived class must implement this virtual function. - * - * \note This function will execute in the command queue thread. - */ - virtual void submit(device::VirtualDevice& device) = 0; + /*! \brief The execution engine for this command. + * + * \details All derived class must implement this virtual function. + * + * \note This function will execute in the command queue thread. + */ + virtual void submit(device::VirtualDevice& device) = 0; - //! Release the resources associated with this event. - virtual void releaseResources(); + //! Release the resources associated with this event. + virtual void releaseResources(); - //! Set the next GPU command - void setNext(Command* next) { next_ = next; } + //! Set the next GPU command + void setNext(Command* next) { next_ = next; } - //! Get the next GPU command - Command* getNext() const { return next_; } + //! Get the next GPU command + Command* getNext() const { return next_; } - //! Return the context for this event. - virtual const Context& context() const; + //! Return the context for this event. 
+ virtual const Context& context() const; }; -class UserEvent : public Command -{ - const Context& context_; +class UserEvent : public Command { + const Context& context_; -public: - UserEvent(Context& context) : Command(CL_COMMAND_USER), context_(context) { - setStatus(CL_SUBMITTED); - } + public: + UserEvent(Context& context) : Command(CL_COMMAND_USER), context_(context) { + setStatus(CL_SUBMITTED); + } - virtual void submit(device::VirtualDevice& device) { - ShouldNotCallThis(); - } + virtual void submit(device::VirtualDevice& device) { ShouldNotCallThis(); } - virtual const Context& context() const { return context_; } + virtual const Context& context() const { return context_; } }; -class ClGlEvent : public Command -{ -private: - const Context& context_; - bool waitForFence(); +class ClGlEvent : public Command { + private: + const Context& context_; + bool waitForFence(); -public: - ClGlEvent(Context& context) - : Command(CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR) - , context_(context) { - setStatus(CL_SUBMITTED); - } + public: + ClGlEvent(Context& context) : Command(CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR), context_(context) { + setStatus(CL_SUBMITTED); + } - virtual void submit(device::VirtualDevice& device) { - ShouldNotCallThis(); - } + virtual void submit(device::VirtualDevice& device) { ShouldNotCallThis(); } - bool awaitCompletion() { - return waitForFence(); - } + bool awaitCompletion() { return waitForFence(); } - virtual const Context& context() const { return context_; } + virtual const Context& context() const { return context_; } }; -inline Command& -Event::command() -{ - return *static_cast(this); -} +inline Command& Event::command() { return *static_cast(this); } -inline const Command& -Event::command() const -{ - return *static_cast(this); -} +inline const Command& Event::command() const { return *static_cast(this); } class Kernel; class NDRangeContainer; //! A memory command that holds a single memory object reference. 
// -class OneMemoryArgCommand : public Command -{ -protected: - Memory* memory_; +class OneMemoryArgCommand : public Command { + protected: + Memory* memory_; -public: - OneMemoryArgCommand( - HostQueue& queue, - cl_command_type type, - const EventWaitList& eventWaitList, - Memory& memory) : - Command(queue, type, eventWaitList), memory_(&memory) { - memory_->retain(); - } + public: + OneMemoryArgCommand(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList, + Memory& memory) + : Command(queue, type, eventWaitList), memory_(&memory) { + memory_->retain(); + } - virtual void releaseResources() { - memory_->release(); - DEBUG_ONLY(memory_ = NULL); - Command::releaseResources(); - } + virtual void releaseResources() { + memory_->release(); + DEBUG_ONLY(memory_ = NULL); + Command::releaseResources(); + } - bool validateMemory(); + bool validateMemory(); }; //! A memory command that holds a single memory object reference. // -class TwoMemoryArgsCommand : public Command -{ -protected: - Memory* memory1_; - Memory* memory2_; +class TwoMemoryArgsCommand : public Command { + protected: + Memory* memory1_; + Memory* memory2_; -public: - TwoMemoryArgsCommand( - HostQueue& queue, - cl_command_type type, - const EventWaitList& eventWaitList, - Memory& memory1, Memory& memory2) : - Command(queue, type, eventWaitList), - memory1_(&memory1), memory2_(&memory2) { - memory1_->retain(); - memory2_->retain(); + public: + TwoMemoryArgsCommand(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList, + Memory& memory1, Memory& memory2) + : Command(queue, type, eventWaitList), memory1_(&memory1), memory2_(&memory2) { + memory1_->retain(); + memory2_->retain(); + } - } + virtual void releaseResources() { + memory1_->release(); + memory2_->release(); + DEBUG_ONLY(memory1_ = memory2_ = NULL); + Command::releaseResources(); + } - virtual void releaseResources() { - memory1_->release(); - memory2_->release(); - DEBUG_ONLY(memory1_ = memory2_ = NULL); - 
Command::releaseResources(); - } - - bool validateMemory(); + bool validateMemory(); }; /*! \brief A generic read memory command. @@ -403,84 +366,73 @@ public: * */ -class ReadMemoryCommand : public OneMemoryArgCommand -{ -private: - Coord3D origin_; //!< Origin of the region to read. - Coord3D size_; //!< Size of the region to read. - void* hostPtr_; //!< The host pointer destination. - size_t rowPitch_; //!< Row pitch (for image operations) - size_t slicePitch_; //!< Slice pitch (for image operations) +class ReadMemoryCommand : public OneMemoryArgCommand { + private: + Coord3D origin_; //!< Origin of the region to read. + Coord3D size_; //!< Size of the region to read. + void* hostPtr_; //!< The host pointer destination. + size_t rowPitch_; //!< Row pitch (for image operations) + size_t slicePitch_; //!< Slice pitch (for image operations) - BufferRect bufRect_; //!< Buffer rectangle information - BufferRect hostRect_; //!< Host memory rectangle information + BufferRect bufRect_; //!< Buffer rectangle information + BufferRect hostRect_; //!< Host memory rectangle information -public: - //! Construct a new ReadMemoryCommand - ReadMemoryCommand( - HostQueue& queue, - cl_command_type cmdType, - const EventWaitList& eventWaitList, - Memory& memory, Coord3D origin, - Coord3D size, void* hostPtr, - size_t rowPitch = 0, size_t slicePitch = 0) : - OneMemoryArgCommand(queue, cmdType, eventWaitList, memory), - origin_(origin), size_(size), hostPtr_(hostPtr), - rowPitch_(rowPitch), slicePitch_(slicePitch) - { - // Sanity checks - assert(hostPtr != NULL && "hostPtr cannot be null"); - assert(size.c[0] > 0 && "invalid"); - } + public: + //! 
Construct a new ReadMemoryCommand + ReadMemoryCommand(HostQueue& queue, cl_command_type cmdType, const EventWaitList& eventWaitList, + Memory& memory, Coord3D origin, Coord3D size, void* hostPtr, + size_t rowPitch = 0, size_t slicePitch = 0) + : OneMemoryArgCommand(queue, cmdType, eventWaitList, memory), + origin_(origin), + size_(size), + hostPtr_(hostPtr), + rowPitch_(rowPitch), + slicePitch_(slicePitch) { + // Sanity checks + assert(hostPtr != NULL && "hostPtr cannot be null"); + assert(size.c[0] > 0 && "invalid"); + } - //! Construct a new ReadMemoryCommand - ReadMemoryCommand( - HostQueue& queue, - cl_command_type cmdType, - const EventWaitList& eventWaitList, - Memory& memory, Coord3D origin, - Coord3D size, void* hostPtr, - const BufferRect& bufRect, - const BufferRect& hostRect) - : OneMemoryArgCommand(queue, cmdType, eventWaitList, memory) - , origin_(origin) - , size_(size) - , hostPtr_(hostPtr) - , rowPitch_(0) - , slicePitch_(0) - , bufRect_(bufRect) - , hostRect_(hostRect) - { - // Sanity checks - assert(hostPtr != NULL && "hostPtr cannot be null"); - assert(size.c[0] > 0 && "invalid"); - } + //! Construct a new ReadMemoryCommand + ReadMemoryCommand(HostQueue& queue, cl_command_type cmdType, const EventWaitList& eventWaitList, + Memory& memory, Coord3D origin, Coord3D size, void* hostPtr, + const BufferRect& bufRect, const BufferRect& hostRect) + : OneMemoryArgCommand(queue, cmdType, eventWaitList, memory), + origin_(origin), + size_(size), + hostPtr_(hostPtr), + rowPitch_(0), + slicePitch_(0), + bufRect_(bufRect), + hostRect_(hostRect) { + // Sanity checks + assert(hostPtr != NULL && "hostPtr cannot be null"); + assert(size.c[0] > 0 && "invalid"); + } - virtual void submit(device::VirtualDevice& device) { - device.submitReadMemory(*this); - } + virtual void submit(device::VirtualDevice& device) { device.submitReadMemory(*this); } - //! Return the memory object to read from. - Memory& source() const { return *memory_; } - //! 
Return the host memory to write to - void* destination() const { return hostPtr_; } + //! Return the memory object to read from. + Memory& source() const { return *memory_; } + //! Return the host memory to write to + void* destination() const { return hostPtr_; } - //! Return the origin of the region to read - const Coord3D& origin() const { return origin_; } - //! Return the size of the region to read - const Coord3D& size() const { return size_; } - //! Return the row pitch - size_t rowPitch() const { return rowPitch_; } - //! Return the slice pitch - size_t slicePitch() const { return slicePitch_; } + //! Return the origin of the region to read + const Coord3D& origin() const { return origin_; } + //! Return the size of the region to read + const Coord3D& size() const { return size_; } + //! Return the row pitch + size_t rowPitch() const { return rowPitch_; } + //! Return the slice pitch + size_t slicePitch() const { return slicePitch_; } - //! Return the buffer rectangle information - const BufferRect& bufRect() const { return bufRect_; } - //! Return the host rectangle information - const BufferRect& hostRect() const { return hostRect_; } + //! Return the buffer rectangle information + const BufferRect& bufRect() const { return bufRect_; } + //! Return the host rectangle information + const BufferRect& hostRect() const { return hostRect_; } - //! Return true if the entire memory object is read. - bool isEntireMemory() const; + //! Return true if the entire memory object is read. + bool isEntireMemory() const; }; /*! \brief A generic write memory command. @@ -491,78 +443,71 @@ public: * are equivalent to offset_ and count_ respectively. */ -class WriteMemoryCommand : public OneMemoryArgCommand -{ -private: - Coord3D origin_; //!< Origin of the region to write to. - Coord3D size_; //!< Size of the region to write to. - const void* hostPtr_; //!< The host pointer source. 
- size_t rowPitch_; //!< Row pitch (for image operations) - size_t slicePitch_; //!< Slice pitch (for image operations) +class WriteMemoryCommand : public OneMemoryArgCommand { + private: + Coord3D origin_; //!< Origin of the region to write to. + Coord3D size_; //!< Size of the region to write to. + const void* hostPtr_; //!< The host pointer source. + size_t rowPitch_; //!< Row pitch (for image operations) + size_t slicePitch_; //!< Slice pitch (for image operations) - BufferRect bufRect_; //!< Buffer rectangle information - BufferRect hostRect_; //!< Host memory rectangle information + BufferRect bufRect_; //!< Buffer rectangle information + BufferRect hostRect_; //!< Host memory rectangle information -public: - WriteMemoryCommand( - HostQueue& queue, - cl_command_type cmdType, - const EventWaitList& eventWaitList, - Memory& memory, Coord3D origin, - Coord3D size, const void* hostPtr, - size_t rowPitch = 0, size_t slicePitch = 0) : - OneMemoryArgCommand(queue, cmdType, eventWaitList, memory), - origin_(origin), size_(size), hostPtr_(hostPtr), - rowPitch_(rowPitch), slicePitch_(slicePitch) - { - // Sanity checks - assert(hostPtr != NULL && "hostPtr cannot be null"); - assert(size.c[0] > 0 && "invalid"); - } + public: + WriteMemoryCommand(HostQueue& queue, cl_command_type cmdType, const EventWaitList& eventWaitList, + Memory& memory, Coord3D origin, Coord3D size, const void* hostPtr, + size_t rowPitch = 0, size_t slicePitch = 0) + : OneMemoryArgCommand(queue, cmdType, eventWaitList, memory), + origin_(origin), + size_(size), + hostPtr_(hostPtr), + rowPitch_(rowPitch), + slicePitch_(slicePitch) { + // Sanity checks + assert(hostPtr != NULL && "hostPtr cannot be null"); + assert(size.c[0] > 0 && "invalid"); + } - WriteMemoryCommand( - HostQueue& queue, - cl_command_type cmdType, - const EventWaitList& eventWaitList, - Memory& memory, Coord3D origin, - Coord3D size, const void* hostPtr, - const BufferRect& bufRect, - const BufferRect& hostRect) - : 
OneMemoryArgCommand(queue, cmdType, eventWaitList, memory), - origin_(origin), size_(size), hostPtr_(hostPtr), - rowPitch_(0), slicePitch_(0), - bufRect_(bufRect), hostRect_(hostRect) - { - // Sanity checks - assert(hostPtr != NULL && "hostPtr cannot be null"); - assert(size.c[0] > 0 && "invalid"); - } + WriteMemoryCommand(HostQueue& queue, cl_command_type cmdType, const EventWaitList& eventWaitList, + Memory& memory, Coord3D origin, Coord3D size, const void* hostPtr, + const BufferRect& bufRect, const BufferRect& hostRect) + : OneMemoryArgCommand(queue, cmdType, eventWaitList, memory), + origin_(origin), + size_(size), + hostPtr_(hostPtr), + rowPitch_(0), + slicePitch_(0), + bufRect_(bufRect), + hostRect_(hostRect) { + // Sanity checks + assert(hostPtr != NULL && "hostPtr cannot be null"); + assert(size.c[0] > 0 && "invalid"); + } - virtual void submit(device::VirtualDevice& device) { - device.submitWriteMemory(*this); - } + virtual void submit(device::VirtualDevice& device) { device.submitWriteMemory(*this); } - //! Return the host memory to read from - const void* source() const { return hostPtr_; } - //! Return the memory object to write to. - Memory& destination() const { return *memory_; } + //! Return the host memory to read from + const void* source() const { return hostPtr_; } + //! Return the memory object to write to. + Memory& destination() const { return *memory_; } - //! Return the region origin - const Coord3D& origin() const { return origin_; } - //! Return the region size - const Coord3D& size() const { return size_; } - //! Return the row pitch - size_t rowPitch() const { return rowPitch_; } - //! Return the slice pitch - size_t slicePitch() const { return slicePitch_; } + //! Return the region origin + const Coord3D& origin() const { return origin_; } + //! Return the region size + const Coord3D& size() const { return size_; } + //! Return the row pitch + size_t rowPitch() const { return rowPitch_; } + //! 
Return the slice pitch + size_t slicePitch() const { return slicePitch_; } - //! Return the buffer rectangle information - const BufferRect& bufRect() const { return bufRect_; } - //! Return the host rectangle information - const BufferRect& hostRect() const { return hostRect_; } + //! Return the buffer rectangle information + const BufferRect& bufRect() const { return bufRect_; } + //! Return the host rectangle information + const BufferRect& hostRect() const { return hostRect_; } - //! Return true if the entire memory object is written. - bool isEntireMemory() const; + //! Return true if the entire memory object is written. + bool isEntireMemory() const; }; /*! \brief A generic fill memory command. @@ -573,54 +518,46 @@ public: * are equivalent to offset_ and count_ respectively. */ -class FillMemoryCommand : public OneMemoryArgCommand -{ -public: - const static size_t MaxFillPatterSize = sizeof(cl_double16); +class FillMemoryCommand : public OneMemoryArgCommand { + public: + const static size_t MaxFillPatterSize = sizeof(cl_double16); -private: - Coord3D origin_; //!< Origin of the region to write to. - Coord3D size_; //!< Size of the region to write to. - char pattern_[MaxFillPatterSize]; //!< The fill pattern - size_t patternSize_; //!< Pattern size + private: + Coord3D origin_; //!< Origin of the region to write to. + Coord3D size_; //!< Size of the region to write to. 
+ char pattern_[MaxFillPatterSize]; //!< The fill pattern + size_t patternSize_; //!< Pattern size -public: - FillMemoryCommand( - HostQueue& queue, - cl_command_type cmdType, - const EventWaitList& eventWaitList, - Memory& memory, - const void* pattern, size_t patternSize, - Coord3D origin, Coord3D size) - : OneMemoryArgCommand(queue, cmdType, eventWaitList, memory) - , origin_(origin) - , size_(size) - , patternSize_(patternSize) - { - // Sanity checks - assert(pattern != NULL && "pattern cannot be null"); - assert(size.c[0] > 0 && "invalid"); - memcpy(pattern_, pattern, patternSize); - } + public: + FillMemoryCommand(HostQueue& queue, cl_command_type cmdType, const EventWaitList& eventWaitList, + Memory& memory, const void* pattern, size_t patternSize, Coord3D origin, + Coord3D size) + : OneMemoryArgCommand(queue, cmdType, eventWaitList, memory), + origin_(origin), + size_(size), + patternSize_(patternSize) { + // Sanity checks + assert(pattern != NULL && "pattern cannot be null"); + assert(size.c[0] > 0 && "invalid"); + memcpy(pattern_, pattern, patternSize); + } - virtual void submit(device::VirtualDevice& device) { - device.submitFillMemory(*this); - } + virtual void submit(device::VirtualDevice& device) { device.submitFillMemory(*this); } - //! Return the pattern memory to fill with - const void* pattern() const { return reinterpret_cast(pattern_); } - //! Return the pattern size - const size_t patternSize() const { return patternSize_; } - //! Return the memory object to write to. - Memory& memory() const { return *memory_; } + //! Return the pattern memory to fill with + const void* pattern() const { return reinterpret_cast(pattern_); } + //! Return the pattern size + const size_t patternSize() const { return patternSize_; } + //! Return the memory object to write to. + Memory& memory() const { return *memory_; } - //! Return the region origin - const Coord3D& origin() const { return origin_; } - //! 
Return the region size - const Coord3D& size() const { return size_; } + //! Return the region origin + const Coord3D& origin() const { return origin_; } + //! Return the region size + const Coord3D& size() const { return size_; } - //! Return true if the entire memory object is written. - bool isEntireMemory() const; + //! Return true if the entire memory object is written. + bool isEntireMemory() const; }; /*! \brief A generic copy memory command @@ -631,73 +568,61 @@ public: * equivalent to offset_ and count_ respectively. */ -class CopyMemoryCommand : public TwoMemoryArgsCommand -{ -private: - Coord3D srcOrigin_; //!< Origin of the source region. - Coord3D dstOrigin_; //!< Origin of the destination region. - Coord3D size_; //!< Size of the region to copy. +class CopyMemoryCommand : public TwoMemoryArgsCommand { + private: + Coord3D srcOrigin_; //!< Origin of the source region. + Coord3D dstOrigin_; //!< Origin of the destination region. + Coord3D size_; //!< Size of the region to copy. 
- BufferRect srcRect_; //!< Source buffer rectangle information - BufferRect dstRect_; //!< Destination buffer rectangle information + BufferRect srcRect_; //!< Source buffer rectangle information + BufferRect dstRect_; //!< Destination buffer rectangle information -public: - CopyMemoryCommand( - HostQueue& queue, - cl_command_type cmdType, - const EventWaitList& eventWaitList, - Memory& srcMemory, Memory& dstMemory, - Coord3D srcOrigin, Coord3D dstOrigin, - Coord3D size) - : TwoMemoryArgsCommand( - queue, cmdType, eventWaitList, srcMemory, dstMemory), - srcOrigin_(srcOrigin), dstOrigin_(dstOrigin), size_(size) - { - // Sanity checks - assert(size.c[0] > 0 && "invalid"); - } + public: + CopyMemoryCommand(HostQueue& queue, cl_command_type cmdType, const EventWaitList& eventWaitList, + Memory& srcMemory, Memory& dstMemory, Coord3D srcOrigin, Coord3D dstOrigin, + Coord3D size) + : TwoMemoryArgsCommand(queue, cmdType, eventWaitList, srcMemory, dstMemory), + srcOrigin_(srcOrigin), + dstOrigin_(dstOrigin), + size_(size) { + // Sanity checks + assert(size.c[0] > 0 && "invalid"); + } - CopyMemoryCommand( - HostQueue& queue, - cl_command_type cmdType, - const EventWaitList& eventWaitList, - Memory& srcMemory, Memory& dstMemory, - Coord3D srcOrigin, Coord3D dstOrigin, - Coord3D size, - const BufferRect& srcRect, - const BufferRect& dstRect) - : TwoMemoryArgsCommand( - queue, cmdType, eventWaitList, srcMemory, dstMemory), - srcOrigin_(srcOrigin), dstOrigin_(dstOrigin), size_(size), - srcRect_(srcRect), dstRect_(dstRect) - { - // Sanity checks - assert(size.c[0] > 0 && "invalid"); - } + CopyMemoryCommand(HostQueue& queue, cl_command_type cmdType, const EventWaitList& eventWaitList, + Memory& srcMemory, Memory& dstMemory, Coord3D srcOrigin, Coord3D dstOrigin, + Coord3D size, const BufferRect& srcRect, const BufferRect& dstRect) + : TwoMemoryArgsCommand(queue, cmdType, eventWaitList, srcMemory, dstMemory), + srcOrigin_(srcOrigin), + dstOrigin_(dstOrigin), + size_(size), + 
srcRect_(srcRect), + dstRect_(dstRect) { + // Sanity checks + assert(size.c[0] > 0 && "invalid"); + } - virtual void submit(device::VirtualDevice& device) { - device.submitCopyMemory(*this); - } + virtual void submit(device::VirtualDevice& device) { device.submitCopyMemory(*this); } - //! Return the host memory to read from - Memory& source() const { return *memory1_; } - //! Return the memory object to write to. - Memory& destination() const { return *memory2_; } + //! Return the host memory to read from + Memory& source() const { return *memory1_; } + //! Return the memory object to write to. + Memory& destination() const { return *memory2_; } - //! Return the source origin - const Coord3D& srcOrigin() const { return srcOrigin_; } - //! Return the offset in bytes in the destination. - const Coord3D& dstOrigin() const { return dstOrigin_; } - //! Return the number of bytes to copy. - const Coord3D& size() const { return size_; } + //! Return the source origin + const Coord3D& srcOrigin() const { return srcOrigin_; } + //! Return the offset in bytes in the destination. + const Coord3D& dstOrigin() const { return dstOrigin_; } + //! Return the number of bytes to copy. + const Coord3D& size() const { return size_; } - //! Return the source buffer rectangle information - const BufferRect& srcRect() const { return srcRect_; } - //! Return the destination buffer rectangle information - const BufferRect& dstRect() const { return dstRect_; } + //! Return the source buffer rectangle information + const BufferRect& srcRect() const { return srcRect_; } + //! Return the destination buffer rectangle information + const BufferRect& dstRect() const { return dstRect_; } - //! Return true if the both memories are is read/written in their entirety. - bool isEntireMemory() const; + //! Return true if the both memories are is read/written in their entirety. + bool isEntireMemory() const; }; /*! \brief A generic map memory command. Makes a memory object accessible to the host. 
@@ -706,91 +631,74 @@ public: * the context of unified buffer/image commands. */ -class MapMemoryCommand: public OneMemoryArgCommand -{ -private: - cl_map_flags mapFlags_; //!< Flags controlling the map. - bool blocking_; //!< True for blocking maps - Coord3D origin_; //!< Origin of the region to map. - Coord3D size_; //!< Size of the region to map. - const void* mapPtr_; //!< Host-space pointer that the object is currently mapped at +class MapMemoryCommand : public OneMemoryArgCommand { + private: + cl_map_flags mapFlags_; //!< Flags controlling the map. + bool blocking_; //!< True for blocking maps + Coord3D origin_; //!< Origin of the region to map. + Coord3D size_; //!< Size of the region to map. + const void* mapPtr_; //!< Host-space pointer that the object is currently mapped at -public: - //! Construct a new MapMemoryCommand - MapMemoryCommand( - HostQueue& queue, - cl_command_type cmdType, - const EventWaitList& eventWaitList, - Memory& memory, cl_map_flags mapFlags, - bool blocking, - Coord3D origin, Coord3D size, - size_t* imgRowPitch = nullptr, - size_t* imgSlicePitch = nullptr, - void* mapPtr = nullptr) : - OneMemoryArgCommand(queue, cmdType, eventWaitList, memory), - mapFlags_(mapFlags), blocking_(blocking), - origin_(origin), size_(size), mapPtr_(mapPtr) - { - // Sanity checks - assert(size.c[0] > 0 && "invalid"); - } + public: + //! 
Construct a new MapMemoryCommand + MapMemoryCommand(HostQueue& queue, cl_command_type cmdType, const EventWaitList& eventWaitList, + Memory& memory, cl_map_flags mapFlags, bool blocking, Coord3D origin, + Coord3D size, size_t* imgRowPitch = nullptr, size_t* imgSlicePitch = nullptr, + void* mapPtr = nullptr) + : OneMemoryArgCommand(queue, cmdType, eventWaitList, memory), + mapFlags_(mapFlags), + blocking_(blocking), + origin_(origin), + size_(size), + mapPtr_(mapPtr) { + // Sanity checks + assert(size.c[0] > 0 && "invalid"); + } - virtual void submit(device::VirtualDevice& device) { - device.submitMapMemory(*this); - } - - //! Read the memory object - Memory& memory() const { return *memory_; } - //! Read the map control flags - cl_map_flags mapFlags() const { return mapFlags_; } - //! Read the origin - const Coord3D& origin() const { return origin_; } - //! Read the size - const Coord3D& size() const { return size_; } - //! Read the blocking flag - bool blocking() const { return blocking_; } - //! Returns true if the entire memory object is mapped - bool isEntireMemory() const; - //! Read the map pointer - const void* mapPtr() const { return mapPtr_; } + virtual void submit(device::VirtualDevice& device) { device.submitMapMemory(*this); } + //! Read the memory object + Memory& memory() const { return *memory_; } + //! Read the map control flags + cl_map_flags mapFlags() const { return mapFlags_; } + //! Read the origin + const Coord3D& origin() const { return origin_; } + //! Read the size + const Coord3D& size() const { return size_; } + //! Read the blocking flag + bool blocking() const { return blocking_; } + //! Returns true if the entire memory object is mapped + bool isEntireMemory() const; + //! Read the map pointer + const void* mapPtr() const { return mapPtr_; } }; - /*! \brief A generic unmap memory command. * * @todo:dgladdin Need to think more about how the pitch parameters operate in * the context of unified buffer/image commands. 
*/ -class UnmapMemoryCommand: public OneMemoryArgCommand -{ -private: - //! Host-space pointer that the object is currently mapped at - void* mapPtr_; +class UnmapMemoryCommand : public OneMemoryArgCommand { + private: + //! Host-space pointer that the object is currently mapped at + void* mapPtr_; -public: - //! Construct a new MapMemoryCommand - UnmapMemoryCommand( - HostQueue& queue, - cl_command_type cmdType, - const EventWaitList& eventWaitList, - Memory& memory, void* mapPtr) : - OneMemoryArgCommand(queue, cmdType, eventWaitList, memory), - mapPtr_(mapPtr) - { } + public: + //! Construct a new MapMemoryCommand + UnmapMemoryCommand(HostQueue& queue, cl_command_type cmdType, const EventWaitList& eventWaitList, + Memory& memory, void* mapPtr) + : OneMemoryArgCommand(queue, cmdType, eventWaitList, memory), mapPtr_(mapPtr) {} - virtual void submit(device::VirtualDevice& device) { - device.submitUnmapMemory(*this); - } + virtual void submit(device::VirtualDevice& device) { device.submitUnmapMemory(*this); } - virtual void releaseResources(); + virtual void releaseResources(); - //! Read the memory object - Memory& memory() const { return *memory_; } - //! Read the map pointer - void* mapPtr() const { return mapPtr_; } + //! Read the memory object + Memory& memory() const { return *memory_; } + //! Read the map pointer + void* mapPtr() const { return mapPtr_; } }; /*! \brief Migrate memory objects command. @@ -798,681 +706,530 @@ public: * \details Used for operations on both buffers and images. Backends * are expected to handle any required translations. */ -class MigrateMemObjectsCommand: public Command -{ -private: - cl_mem_migration_flags migrationFlags_; //!< Migration flags - std::vector memObjects_; //!< The list of memory objects +class MigrateMemObjectsCommand : public Command { + private: + cl_mem_migration_flags migrationFlags_; //!< Migration flags + std::vector memObjects_; //!< The list of memory objects -public: - //! 
Construct a new AcquireExtObjectsCommand - MigrateMemObjectsCommand( - HostQueue& queue, - cl_command_type type, - const EventWaitList& eventWaitList, - const std::vector& memObjects, - cl_mem_migration_flags flags) - : Command(queue, type, eventWaitList) - , migrationFlags_(flags) - { - std::vector::const_iterator itr; - for (itr = memObjects.begin(); itr != memObjects.end(); itr++) { - (*itr)->retain(); - memObjects_.push_back(*itr); - } + public: + //! Construct a new AcquireExtObjectsCommand + MigrateMemObjectsCommand(HostQueue& queue, cl_command_type type, + const EventWaitList& eventWaitList, + const std::vector& memObjects, + cl_mem_migration_flags flags) + : Command(queue, type, eventWaitList), migrationFlags_(flags) { + std::vector::const_iterator itr; + for (itr = memObjects.begin(); itr != memObjects.end(); itr++) { + (*itr)->retain(); + memObjects_.push_back(*itr); } + } - virtual void submit(device::VirtualDevice& device) { - device.submitMigrateMemObjects(*this); + virtual void submit(device::VirtualDevice& device) { device.submitMigrateMemObjects(*this); } + + //! Release all resources associated with this command + void releaseResources() { + std::vector::const_iterator itr; + for (itr = memObjects_.begin(); itr != memObjects_.end(); itr++) { + (*itr)->release(); } + Command::releaseResources(); + } - //! Release all resources associated with this command - void releaseResources() { - std::vector::const_iterator itr; - for (itr = memObjects_.begin(); itr != memObjects_.end(); itr++) { - (*itr)->release(); - } - Command::releaseResources(); - } + //! Returns the migration flags + cl_mem_migration_flags migrationFlags() const { return migrationFlags_; } + //! Returns the number of memory objects in the command + cl_uint numMemObjects() const { return (cl_uint)memObjects_.size(); } + //! Returns a pointer to the memory objects + const std::vector& memObjects() const { return memObjects_; } - //! 
Returns the migration flags - cl_mem_migration_flags migrationFlags() const { return migrationFlags_; } - //! Returns the number of memory objects in the command - cl_uint numMemObjects() const { return (cl_uint) memObjects_.size(); } - //! Returns a pointer to the memory objects - const std::vector& memObjects() const { return memObjects_; } - - bool validateMemory(); + bool validateMemory(); }; //! To execute a kernel on a specific device. -class NDRangeKernelCommand : public Command -{ -private: - Kernel& kernel_; - NDRangeContainer sizes_; - address parameters_; +class NDRangeKernelCommand : public Command { + private: + Kernel& kernel_; + NDRangeContainer sizes_; + address parameters_; -public: - //! Construct an ExecuteKernel command - NDRangeKernelCommand( - HostQueue& queue, - const EventWaitList& eventWaitList, - Kernel& kernel, - const NDRangeContainer& sizes); + public: + //! Construct an ExecuteKernel command + NDRangeKernelCommand(HostQueue& queue, const EventWaitList& eventWaitList, Kernel& kernel, + const NDRangeContainer& sizes); - virtual void submit(device::VirtualDevice& device) { - device.submitKernel(*this); - } + virtual void submit(device::VirtualDevice& device) { device.submitKernel(*this); } - //! Release all resources associated with this command ( - void releaseResources(); + //! Release all resources associated with this command ( + void releaseResources(); - //! Return the kernel. - const Kernel& kernel() const { return kernel_; } + //! Return the kernel. + const Kernel& kernel() const { return kernel_; } - //! Return the parameters given to this kernel. - const_address parameters() const { return parameters_; } + //! Return the parameters given to this kernel. + const_address parameters() const { return parameters_; } - //! Return the kernel NDRange. - const NDRangeContainer& sizes() const { return sizes_; } + //! Return the kernel NDRange. + const NDRangeContainer& sizes() const { return sizes_; } - //! Set the local work size. 
- void setLocalWorkSize(const NDRange& local) { sizes_.local() = local; } + //! Set the local work size. + void setLocalWorkSize(const NDRange& local) { sizes_.local() = local; } - cl_int validateMemory(); + cl_int validateMemory(); }; -class NativeFnCommand : public Command -{ -private: - void (CL_CALLBACK *nativeFn_)(void *); +class NativeFnCommand : public Command { + private: + void(CL_CALLBACK* nativeFn_)(void*); - char* args_; - size_t argsSize_; + char* args_; + size_t argsSize_; - std::vector memObjects_; - std::vector memOffsets_; + std::vector memObjects_; + std::vector memOffsets_; -public: - NativeFnCommand( - HostQueue& queue, const EventWaitList& eventWaitList, - void (CL_CALLBACK * nativeFn)(void*), const void* args, size_t argsSize, - size_t numMemObjs, const cl_mem* memObjs, const void** memLocs); + public: + NativeFnCommand(HostQueue& queue, const EventWaitList& eventWaitList, + void(CL_CALLBACK* nativeFn)(void*), const void* args, size_t argsSize, + size_t numMemObjs, const cl_mem* memObjs, const void** memLocs); - ~NativeFnCommand() { - delete[] args_; - } + ~NativeFnCommand() { delete[] args_; } - void releaseResources() { - std::for_each(memObjects_.begin(), memObjects_.end(), - std::mem_fun(&Memory::release)); - Command::releaseResources(); - } + void releaseResources() { + std::for_each(memObjects_.begin(), memObjects_.end(), std::mem_fun(&Memory::release)); + Command::releaseResources(); + } - virtual void submit(device::VirtualDevice& device) { - device.submitNativeFn(*this); - } + virtual void submit(device::VirtualDevice& device) { device.submitNativeFn(*this); } - cl_int invoke(); + cl_int invoke(); }; -class Marker : public Command -{ -public: - //! Create a new Marker - Marker( - HostQueue& queue, bool userVisible, - const EventWaitList& eventWaitList = nullWaitList, const Event* waitingEvent = NULL) - : Command(queue, userVisible ? 
CL_COMMAND_MARKER : 0, eventWaitList) - , waitingEvent_(waitingEvent) - { } +class Marker : public Command { + public: + //! Create a new Marker + Marker(HostQueue& queue, bool userVisible, const EventWaitList& eventWaitList = nullWaitList, + const Event* waitingEvent = NULL) + : Command(queue, userVisible ? CL_COMMAND_MARKER : 0, eventWaitList), + waitingEvent_(waitingEvent) {} - //! The actual command implementation. - virtual void submit(device::VirtualDevice& device) { - device.submitMarker(*this); - } + //! The actual command implementation. + virtual void submit(device::VirtualDevice& device) { device.submitMarker(*this); } - const Event* waitingEvent() const { return waitingEvent_; } + const Event* waitingEvent() const { return waitingEvent_; } -private: - const Event* waitingEvent_; //!< Waiting event associated with the marker + private: + const Event* waitingEvent_; //!< Waiting event associated with the marker }; /*! \brief Maps CL objects created from external ones and syncs the contents (blocking). * */ -class ExtObjectsCommand: public Command -{ -private: - std::vector memObjects_; //!< The list of Memory based classes +class ExtObjectsCommand : public Command { + private: + std::vector memObjects_; //!< The list of Memory based classes -public: - //! Construct a new AcquireExtObjectsCommand - ExtObjectsCommand( - HostQueue& queue, - const EventWaitList& eventWaitList, - cl_uint num_objects, - const std::vector& memoryObjects, - cl_command_type type) : - Command(queue, type, eventWaitList) - { - for(std::vector::const_iterator itr = memoryObjects.begin(); - itr != memoryObjects.end(); itr++) { - (*itr)->retain(); - memObjects_.push_back(*itr); - } + public: + //! 
Construct a new AcquireExtObjectsCommand + ExtObjectsCommand(HostQueue& queue, const EventWaitList& eventWaitList, cl_uint num_objects, + const std::vector& memoryObjects, cl_command_type type) + : Command(queue, type, eventWaitList) { + for (std::vector::const_iterator itr = memoryObjects.begin(); + itr != memoryObjects.end(); itr++) { + (*itr)->retain(); + memObjects_.push_back(*itr); } + } - //! Release all resources associated with this command - void releaseResources() { - for(std::vector::const_iterator itr = memObjects_.begin(); - itr != memObjects_.end(); itr++) { - (*itr)->release(); - } - Command::releaseResources(); + //! Release all resources associated with this command + void releaseResources() { + for (std::vector::const_iterator itr = memObjects_.begin(); + itr != memObjects_.end(); itr++) { + (*itr)->release(); } + Command::releaseResources(); + } - //! Get number of GL objects - cl_uint getNumObjects() {return (cl_uint) memObjects_.size();} - //! Get pointer to GL object list - const std::vector& getMemList() const {return memObjects_;} - bool validateMemory(); - virtual bool processGLResource(device::Memory * mem) = 0 ; - + //! Get number of GL objects + cl_uint getNumObjects() { return (cl_uint)memObjects_.size(); } + //! Get pointer to GL object list + const std::vector& getMemList() const { return memObjects_; } + bool validateMemory(); + virtual bool processGLResource(device::Memory* mem) = 0; }; -class AcquireExtObjectsCommand: public ExtObjectsCommand -{ -public: - //! Construct a new AcquireExtObjectsCommand - AcquireExtObjectsCommand( - HostQueue& queue, - const EventWaitList& eventWaitList, - cl_uint num_objects, - const std::vector& memoryObjects, - cl_command_type type) : - ExtObjectsCommand(queue, eventWaitList, num_objects, - memoryObjects, type) - { - } +class AcquireExtObjectsCommand : public ExtObjectsCommand { + public: + //! 
Construct a new AcquireExtObjectsCommand + AcquireExtObjectsCommand(HostQueue& queue, const EventWaitList& eventWaitList, + cl_uint num_objects, const std::vector& memoryObjects, + cl_command_type type) + : ExtObjectsCommand(queue, eventWaitList, num_objects, memoryObjects, type) {} - virtual void submit(device::VirtualDevice& device) { - device.submitAcquireExtObjects(*this); - } + virtual void submit(device::VirtualDevice& device) { device.submitAcquireExtObjects(*this); } - virtual bool processGLResource(device::Memory * mem); + virtual bool processGLResource(device::Memory* mem); }; -class ReleaseExtObjectsCommand: public ExtObjectsCommand -{ -public: - //! Construct a new ReleaseExtObjectsCommand - ReleaseExtObjectsCommand( - HostQueue& queue, - const EventWaitList& eventWaitList, - cl_uint num_objects, - const std::vector& memoryObjects, - cl_command_type type) : - ExtObjectsCommand(queue, eventWaitList, num_objects, - memoryObjects, type) - { - } +class ReleaseExtObjectsCommand : public ExtObjectsCommand { + public: + //! 
Construct a new ReleaseExtObjectsCommand + ReleaseExtObjectsCommand(HostQueue& queue, const EventWaitList& eventWaitList, + cl_uint num_objects, const std::vector& memoryObjects, + cl_command_type type) + : ExtObjectsCommand(queue, eventWaitList, num_objects, memoryObjects, type) {} - virtual void submit(device::VirtualDevice& device) { - device.submitReleaseExtObjects(*this); - } + virtual void submit(device::VirtualDevice& device) { device.submitReleaseExtObjects(*this); } - virtual bool processGLResource(device::Memory * mem); + virtual bool processGLResource(device::Memory* mem); }; -class PerfCounterCommand : public Command -{ -public: - typedef std::vector PerfCounterList; +class PerfCounterCommand : public Command { + public: + typedef std::vector PerfCounterList; - enum State - { - Begin = 0, //!< Issue a begin command - End = 1 //!< Issue an end command - }; + enum State { + Begin = 0, //!< Issue a begin command + End = 1 //!< Issue an end command + }; - //! Construct a new PerfCounterCommand - PerfCounterCommand( - HostQueue& queue, - const EventWaitList& eventWaitList, - const PerfCounterList& counterList, - State state) - : Command(queue, 0, eventWaitList) - , counterList_(counterList) - , state_(state) - { - for (uint i = 0; i < counterList_.size(); ++i) { - counterList_[i]->retain(); - } + //! Construct a new PerfCounterCommand + PerfCounterCommand(HostQueue& queue, const EventWaitList& eventWaitList, + const PerfCounterList& counterList, State state) + : Command(queue, 0, eventWaitList), counterList_(counterList), state_(state) { + for (uint i = 0; i < counterList_.size(); ++i) { + counterList_[i]->retain(); } + } - void releaseResources() { - for (uint i = 0; i < counterList_.size(); ++i) { - counterList_[i]->release(); - } - Command::releaseResources(); + void releaseResources() { + for (uint i = 0; i < counterList_.size(); ++i) { + counterList_[i]->release(); } + Command::releaseResources(); + } - //! 
Gets the number of PerfCounter objects - size_t getNumCounters() const { return counterList_.size(); } + //! Gets the number of PerfCounter objects + size_t getNumCounters() const { return counterList_.size(); } - //! Gets the list of all counters - const PerfCounterList& getCounters() const { return counterList_; } + //! Gets the list of all counters + const PerfCounterList& getCounters() const { return counterList_; } - //! Gets the performance counter state - State getState() const { return state_; } + //! Gets the performance counter state + State getState() const { return state_; } - //! Process the command on the device queue - virtual void submit(device::VirtualDevice& device) { - device.submitPerfCounter(*this); - } + //! Process the command on the device queue + virtual void submit(device::VirtualDevice& device) { device.submitPerfCounter(*this); } -private: - PerfCounterList counterList_; //!< The list of performance counters - State state_; //!< State of the issued command + private: + PerfCounterList counterList_; //!< The list of performance counters + State state_; //!< State of the issued command }; /*! \brief Thread Trace memory objects command. * * \details Used for bindig memory objects to therad trace mechanism. */ -class ThreadTraceMemObjectsCommand: public Command -{ -public: - //! Construct a new ThreadTraceMemObjectsCommand - ThreadTraceMemObjectsCommand( - HostQueue& queue, - const EventWaitList& eventWaitList, - size_t numMemoryObjects, - const cl_mem* memoryObjects, - size_t sizeMemoryObject, - ThreadTrace& threadTrace, - cl_command_type type) : - Command(queue, type, eventWaitList), - sizeMemObjects_(sizeMemoryObject), - threadTrace_(threadTrace) - { - memObjects_.resize(numMemoryObjects); - for (size_t i = 0; i < numMemoryObjects; ++i) { - Memory* obj = as_amd(memoryObjects[i]); - obj->retain(); - memObjects_[i] = obj; - } - threadTrace_.retain(); +class ThreadTraceMemObjectsCommand : public Command { + public: + //! 
Construct a new ThreadTraceMemObjectsCommand + ThreadTraceMemObjectsCommand(HostQueue& queue, const EventWaitList& eventWaitList, + size_t numMemoryObjects, const cl_mem* memoryObjects, + size_t sizeMemoryObject, ThreadTrace& threadTrace, + cl_command_type type) + : Command(queue, type, eventWaitList), + sizeMemObjects_(sizeMemoryObject), + threadTrace_(threadTrace) { + memObjects_.resize(numMemoryObjects); + for (size_t i = 0; i < numMemoryObjects; ++i) { + Memory* obj = as_amd(memoryObjects[i]); + obj->retain(); + memObjects_[i] = obj; } - //! Release all resources associated with this command - void releaseResources() { - threadTrace_.release(); - for(std::vector::const_iterator itr = memObjects_.begin(); - itr != memObjects_.end(); itr++) { - (*itr)->release(); - } - Command::releaseResources(); + threadTrace_.retain(); + } + //! Release all resources associated with this command + void releaseResources() { + threadTrace_.release(); + for (std::vector::const_iterator itr = memObjects_.begin(); + itr != memObjects_.end(); itr++) { + (*itr)->release(); } + Command::releaseResources(); + } - //! Get number of CL memory objects - cl_uint getNumObjects() {return (cl_uint) memObjects_.size();} + //! Get number of CL memory objects + cl_uint getNumObjects() { return (cl_uint)memObjects_.size(); } - //! Get pointer to CL memory object list - const std::vector& getMemList() const {return memObjects_;} + //! Get pointer to CL memory object list + const std::vector& getMemList() const { return memObjects_; } - //! Submit command to bind memory object to the Thread Trace mechanism - virtual void submit(device::VirtualDevice& device) { - device.submitThreadTraceMemObjects(*this); - } + //! Submit command to bind memory object to the Thread Trace mechanism + virtual void submit(device::VirtualDevice& device) { device.submitThreadTraceMemObjects(*this); } - //! Return the thread trace object. - ThreadTrace& getThreadTrace() const { return threadTrace_; } + //! 
Return the thread trace object. + ThreadTrace& getThreadTrace() const { return threadTrace_; } - //! Get memory object size - const size_t getMemoryObjectSize() const {return sizeMemObjects_;} + //! Get memory object size + const size_t getMemoryObjectSize() const { return sizeMemObjects_; } - //! Validate memory bound to the thread thrace - bool validateMemory(); -private: - std::vector memObjects_; //!< The list of memory objects,bound to the thread trace - size_t sizeMemObjects_; //!< The size of each memory object from memObjects_ list (all memory objects have the smae size) - ThreadTrace& threadTrace_; //!< The Thread Trace object + //! Validate memory bound to the thread thrace + bool validateMemory(); + + private: + std::vector memObjects_; //!< The list of memory objects,bound to the thread trace + size_t sizeMemObjects_; //!< The size of each memory object from memObjects_ list (all memory + //!objects have the smae size) + ThreadTrace& threadTrace_; //!< The Thread Trace object }; /*! \brief Thread Trace command. * * \details Used for issue begin/end/pause/resume for therad trace object. */ -class ThreadTraceCommand : public Command -{ -private: - void *threadTraceConfig_; -public: +class ThreadTraceCommand : public Command { + private: + void* threadTraceConfig_; - enum State - { - Begin = 0, //!< Issue a begin command - End = 1, //!< Issue an end command - Pause = 2, //!< Issue a pause command - Resume = 3 //!< Issue a resume command - }; + public: + enum State { + Begin = 0, //!< Issue a begin command + End = 1, //!< Issue an end command + Pause = 2, //!< Issue a pause command + Resume = 3 //!< Issue a resume command + }; - //! 
Construct a new ThreadTraceCommand - ThreadTraceCommand( - HostQueue& queue, - const EventWaitList& eventWaitList, - const void *threadTraceConfig, - ThreadTrace& threadTrace, - State state, - cl_command_type type) - : Command(queue, type, eventWaitList) - , threadTrace_(threadTrace) - , state_(state) - { - const unsigned int size = *static_cast(threadTraceConfig); - threadTraceConfig_ = static_cast(new char[size]); - if (threadTraceConfig_) { - memcpy(threadTraceConfig_, threadTraceConfig, size); - } - threadTrace_.retain(); + //! Construct a new ThreadTraceCommand + ThreadTraceCommand(HostQueue& queue, const EventWaitList& eventWaitList, + const void* threadTraceConfig, ThreadTrace& threadTrace, State state, + cl_command_type type) + : Command(queue, type, eventWaitList), threadTrace_(threadTrace), state_(state) { + const unsigned int size = *static_cast(threadTraceConfig); + threadTraceConfig_ = static_cast(new char[size]); + if (threadTraceConfig_) { + memcpy(threadTraceConfig_, threadTraceConfig, size); } + threadTrace_.retain(); + } - //! Release all resources associated with this command - void releaseResources() { - threadTrace_.release(); - Command::releaseResources(); - } + //! Release all resources associated with this command + void releaseResources() { + threadTrace_.release(); + Command::releaseResources(); + } - //! Get the thread trace object - ThreadTrace& getThreadTrace() const { return threadTrace_; } + //! Get the thread trace object + ThreadTrace& getThreadTrace() const { return threadTrace_; } - //! Get the thread trace command state - State getState() const { return state_; } + //! Get the thread trace command state + State getState() const { return state_; } - //! Process the command on the device queue - virtual void submit(device::VirtualDevice& device) { - device.submitThreadTrace(*this); - } - // Accessor methods - void* threadTraceConfig() const { return threadTraceConfig_; } + //! 
Process the command on the device queue + virtual void submit(device::VirtualDevice& device) { device.submitThreadTrace(*this); } + // Accessor methods + void* threadTraceConfig() const { return threadTraceConfig_; } -private: - ThreadTrace& threadTrace_; //!< The list of performance counters - State state_; //!< State of the issued command + private: + ThreadTrace& threadTrace_; //!< The list of performance counters + State state_; //!< State of the issued command }; -class SignalCommand:public OneMemoryArgCommand -{ +class SignalCommand : public OneMemoryArgCommand { + private: + cl_uint markerValue_; + cl_ulong markerOffset_; -private: - cl_uint markerValue_; - cl_ulong markerOffset_; + public: + SignalCommand(HostQueue& queue, cl_command_type cmdType, const EventWaitList& eventWaitList, + Memory& memory, cl_uint value, cl_ulong offset = 0) + : OneMemoryArgCommand(queue, cmdType, eventWaitList, memory), + markerValue_(value), + markerOffset_(offset) {} -public: - - SignalCommand( - HostQueue& queue, - cl_command_type cmdType, - const EventWaitList& eventWaitList, - Memory& memory, - cl_uint value, - cl_ulong offset = 0): - OneMemoryArgCommand(queue, cmdType, eventWaitList, memory), - markerValue_(value), - markerOffset_(offset) - { - - } - - virtual void submit(device::VirtualDevice& device) - { - device.submitSignal(*this); - } - - const cl_uint markerValue() {return markerValue_;} - Memory& memory() {return *memory_;} - const cl_ulong markerOffset() {return markerOffset_;} + virtual void submit(device::VirtualDevice& device) { device.submitSignal(*this); } + const cl_uint markerValue() { return markerValue_; } + Memory& memory() { return *memory_; } + const cl_ulong markerOffset() { return markerOffset_; } }; -class MakeBuffersResidentCommand: public Command -{ -private: - std::vector memObjects_; - cl_bus_address_amd* busAddresses_; +class MakeBuffersResidentCommand : public Command { + private: + std::vector memObjects_; + cl_bus_address_amd* busAddresses_; 
-public: - MakeBuffersResidentCommand( - HostQueue& queue, - cl_command_type type, - const EventWaitList& eventWaitList, - const std::vector& memObjects, - cl_bus_address_amd* busAddr) - : Command(queue, type, eventWaitList), - busAddresses_(busAddr) - { - std::vector::const_iterator itr; - for (itr = memObjects.begin(); itr != memObjects.end(); itr++) { - (*itr)->retain(); - memObjects_.push_back(*itr); - } + public: + MakeBuffersResidentCommand(HostQueue& queue, cl_command_type type, + const EventWaitList& eventWaitList, + const std::vector& memObjects, + cl_bus_address_amd* busAddr) + : Command(queue, type, eventWaitList), busAddresses_(busAddr) { + std::vector::const_iterator itr; + for (itr = memObjects.begin(); itr != memObjects.end(); itr++) { + (*itr)->retain(); + memObjects_.push_back(*itr); } + } - virtual void submit(device::VirtualDevice& device) - { - device.submitMakeBuffersResident(*this); + virtual void submit(device::VirtualDevice& device) { device.submitMakeBuffersResident(*this); } + + void releaseResources() { + std::vector::const_iterator itr; + for (itr = memObjects_.begin(); itr != memObjects_.end(); itr++) { + (*itr)->release(); } + Command::releaseResources(); + } - void releaseResources() - { - std::vector::const_iterator itr; - for (itr = memObjects_.begin(); itr != memObjects_.end(); itr++) { - (*itr)->release(); - } - Command::releaseResources(); - } - - bool validateMemory(); - const std::vector& memObjects() const { return memObjects_; } - cl_bus_address_amd* busAddress() const {return busAddresses_;} - + bool validateMemory(); + const std::vector& memObjects() const { return memObjects_; } + cl_bus_address_amd* busAddress() const { return busAddresses_; } }; //! A deallocation command used to free SVM or system pointers. 
-class SvmFreeMemoryCommand : public Command -{ -public: - typedef void (CL_CALLBACK *freeCallBack) - (cl_command_queue, cl_uint, void**, void*); +class SvmFreeMemoryCommand : public Command { + public: + typedef void(CL_CALLBACK* freeCallBack)(cl_command_queue, cl_uint, void**, void*); -private: - std::vector svmPointers_; //!< List of pointers to deallocate - freeCallBack pfnFreeFunc_; //!< User-defined deallocation callback - void* userData_; //!< Data passed to user-defined callback + private: + std::vector svmPointers_; //!< List of pointers to deallocate + freeCallBack pfnFreeFunc_; //!< User-defined deallocation callback + void* userData_; //!< Data passed to user-defined callback -public: - SvmFreeMemoryCommand( - HostQueue& queue, - const EventWaitList& eventWaitList, - cl_uint numSvmPointers, - void** svmPointers, - freeCallBack pfnFreeFunc, - void* userData) : - Command(queue, CL_COMMAND_SVM_FREE, eventWaitList), - //! We copy svmPointers since it can be reused/deallocated after - // command creation - svmPointers_(svmPointers, svmPointers + numSvmPointers), - pfnFreeFunc_(pfnFreeFunc), - userData_(userData) { } + public: + SvmFreeMemoryCommand(HostQueue& queue, const EventWaitList& eventWaitList, cl_uint numSvmPointers, + void** svmPointers, freeCallBack pfnFreeFunc, void* userData) + : Command(queue, CL_COMMAND_SVM_FREE, eventWaitList), + //! 
We copy svmPointers since it can be reused/deallocated after + // command creation + svmPointers_(svmPointers, svmPointers + numSvmPointers), + pfnFreeFunc_(pfnFreeFunc), + userData_(userData) {} - virtual void submit(device::VirtualDevice& device) - { - device.submitSvmFreeMemory(*this); - } + virtual void submit(device::VirtualDevice& device) { device.submitSvmFreeMemory(*this); } - std::vector& svmPointers() { return svmPointers_; } + std::vector& svmPointers() { return svmPointers_; } - freeCallBack pfnFreeFunc() const { return pfnFreeFunc_; } + freeCallBack pfnFreeFunc() const { return pfnFreeFunc_; } - void* userData() const { return userData_; } + void* userData() const { return userData_; } }; //! A copy command where the origin and destination memory locations are SVM // pointers. -class SvmCopyMemoryCommand : public Command -{ -private: - void* dst_; //!< Destination pointer - const void* src_; //!< Source pointer - size_t srcSize_; //!< Size (in bytes) of the source buffer +class SvmCopyMemoryCommand : public Command { + private: + void* dst_; //!< Destination pointer + const void* src_; //!< Source pointer + size_t srcSize_; //!< Size (in bytes) of the source buffer -public: - SvmCopyMemoryCommand( - HostQueue& queue, - const EventWaitList& eventWaitList, - void* dst, - const void* src, - size_t srcSize) : - Command(queue, CL_COMMAND_SVM_MEMCPY, eventWaitList), - dst_(dst), - src_(src), - srcSize_(srcSize) { } + public: + SvmCopyMemoryCommand(HostQueue& queue, const EventWaitList& eventWaitList, void* dst, + const void* src, size_t srcSize) + : Command(queue, CL_COMMAND_SVM_MEMCPY, eventWaitList), + dst_(dst), + src_(src), + srcSize_(srcSize) {} - virtual void submit(device::VirtualDevice& device) - { - device.submitSvmCopyMemory(*this); - } + virtual void submit(device::VirtualDevice& device) { device.submitSvmCopyMemory(*this); } - void* dst() const { return dst_; } + void* dst() const { return dst_; } - const void* src() const { return src_; } + 
const void* src() const { return src_; } - size_t srcSize() const { return srcSize_; } + size_t srcSize() const { return srcSize_; } }; //! A fill command where the pattern and destination memory locations are SVM // pointers. -class SvmFillMemoryCommand : public Command -{ -private: - void* dst_; //!< Destination pointer - char pattern_[FillMemoryCommand::MaxFillPatterSize]; //!< The fill pattern - size_t patternSize_; //!< Pattern size - size_t times_; //!< Number of times to fill the - // destination buffer with the source buffer +class SvmFillMemoryCommand : public Command { + private: + void* dst_; //!< Destination pointer + char pattern_[FillMemoryCommand::MaxFillPatterSize]; //!< The fill pattern + size_t patternSize_; //!< Pattern size + size_t times_; //!< Number of times to fill the + // destination buffer with the source buffer -public: - SvmFillMemoryCommand( - HostQueue& queue, - const EventWaitList& eventWaitList, - void* dst, - const void* pattern, - size_t patternSize, - size_t size) : - Command(queue, CL_COMMAND_SVM_MEMFILL, eventWaitList), - dst_(dst), - patternSize_(patternSize), - times_(size / patternSize) - { - assert(amd::isMultipleOf(size, patternSize)); - //! We copy the pattern buffer since it can be reused/deallocated after - // command creation - memcpy(pattern_, pattern, patternSize); - } + public: + SvmFillMemoryCommand(HostQueue& queue, const EventWaitList& eventWaitList, void* dst, + const void* pattern, size_t patternSize, size_t size) + : Command(queue, CL_COMMAND_SVM_MEMFILL, eventWaitList), + dst_(dst), + patternSize_(patternSize), + times_(size / patternSize) { + assert(amd::isMultipleOf(size, patternSize)); + //! 
We copy the pattern buffer since it can be reused/deallocated after + // command creation + memcpy(pattern_, pattern, patternSize); + } - virtual void submit(device::VirtualDevice& device) - { - device.submitSvmFillMemory(*this); - } + virtual void submit(device::VirtualDevice& device) { device.submitSvmFillMemory(*this); } - void* dst() const { return dst_; } + void* dst() const { return dst_; } - const char* pattern() const { return pattern_; } + const char* pattern() const { return pattern_; } - size_t patternSize() const { return patternSize_; } + size_t patternSize() const { return patternSize_; } - size_t times() const { return times_; } + size_t times() const { return times_; } }; /*! \brief A map memory command where the pointer to be mapped is a SVM shared * buffer */ -class SvmMapMemoryCommand : public Command -{ -private: - Memory* svmMem_; //!< the pointer to the amd::Memory object corresponding the svm pointer mapped - Coord3D size_; //!< the map size - Coord3D origin_; //!< the origin of the mapped svm pointer shift from the beginning of svm space allocated - cl_map_flags flags_; //!< map flags - void* svmPtr_; +class SvmMapMemoryCommand : public Command { + private: + Memory* svmMem_; //!< the pointer to the amd::Memory object corresponding the svm pointer mapped + Coord3D size_; //!< the map size + Coord3D origin_; //!< the origin of the mapped svm pointer shift from the beginning of svm space + //!allocated + cl_map_flags flags_; //!< map flags + void* svmPtr_; -public: - SvmMapMemoryCommand( - HostQueue& queue, - const EventWaitList& eventWaitList, - Memory* svmMem, - const size_t size, - const size_t offset, - cl_map_flags flags, - void* svmPtr) - : Command(queue, CL_COMMAND_SVM_MAP, eventWaitList) - , svmMem_(svmMem) - , size_(size) - , origin_(offset) - , flags_(flags) - , svmPtr_(svmPtr) - { - } + public: + SvmMapMemoryCommand(HostQueue& queue, const EventWaitList& eventWaitList, Memory* svmMem, + const size_t size, const size_t offset, 
cl_map_flags flags, void* svmPtr) + : Command(queue, CL_COMMAND_SVM_MAP, eventWaitList), + svmMem_(svmMem), + size_(size), + origin_(offset), + flags_(flags), + svmPtr_(svmPtr) {} - virtual void submit(device::VirtualDevice& device) - { - device.submitSvmMapMemory(*this); - } + virtual void submit(device::VirtualDevice& device) { device.submitSvmMapMemory(*this); } - Memory* getSvmMem() const {return svmMem_;} + Memory* getSvmMem() const { return svmMem_; } - Coord3D size() const {return size_;} + Coord3D size() const { return size_; } - cl_map_flags mapFlags() const {return flags_;} + cl_map_flags mapFlags() const { return flags_; } - Coord3D origin() const {return origin_;} + Coord3D origin() const { return origin_; } - void* svmPtr() const { return svmPtr_; } + void* svmPtr() const { return svmPtr_; } - bool isEntireMemory() const; + bool isEntireMemory() const; }; /*! \brief An unmap memory command where the unmapped pointer is a SVM shared * buffer */ -class SvmUnmapMemoryCommand : public Command -{ -private: - Memory* svmMem_; //!< the pointer to the amd::Memory object corresponding the svm pointer mapped - void* svmPtr_; //!< SVM pointer +class SvmUnmapMemoryCommand : public Command { + private: + Memory* svmMem_; //!< the pointer to the amd::Memory object corresponding the svm pointer mapped + void* svmPtr_; //!< SVM pointer -public: - SvmUnmapMemoryCommand( - HostQueue& queue, - const EventWaitList& eventWaitList, - Memory* svmMem, - void* svmPtr) - : Command(queue, CL_COMMAND_SVM_UNMAP, eventWaitList) - , svmMem_(svmMem) - , svmPtr_(svmPtr) - {} + public: + SvmUnmapMemoryCommand(HostQueue& queue, const EventWaitList& eventWaitList, Memory* svmMem, + void* svmPtr) + : Command(queue, CL_COMMAND_SVM_UNMAP, eventWaitList), svmMem_(svmMem), svmPtr_(svmPtr) {} - virtual void submit(device::VirtualDevice& device) - { - device.submitSvmUnmapMemory(*this); - } + virtual void submit(device::VirtualDevice& device) { device.submitSvmUnmapMemory(*this); } - Memory* 
getSvmMem() const { return svmMem_; } + Memory* getSvmMem() const { return svmMem_; } - void* svmPtr() const { return svmPtr_; } + void* svmPtr() const { return svmPtr_; } }; /*! \brief A generic transfer memory from/to file command. @@ -1481,69 +1238,64 @@ public: * are treated as 1D structures so origin_[0] and size_[0] * are equivalent to offset_ and count_ respectively. */ -class TransferBufferFileCommand : public OneMemoryArgCommand -{ -public: - static const uint NumStagingBuffers = 2; - static const size_t StagingBufferSize = 4 * Mi; - static const uint StagingBufferMemType = CL_MEM_USE_PERSISTENT_MEM_AMD; +class TransferBufferFileCommand : public OneMemoryArgCommand { + public: + static const uint NumStagingBuffers = 2; + static const size_t StagingBufferSize = 4 * Mi; + static const uint StagingBufferMemType = CL_MEM_USE_PERSISTENT_MEM_AMD; -protected: - const Coord3D origin_; //!< Origin of the region to write to - const Coord3D size_; //!< Size of the region to write to - LiquidFlashFile* file_; //!< The file object for data read - size_t fileOffset_; //!< Offset in the file for data read - amd::Memory* staging_[NumStagingBuffers]; //!< Staging buffers for transfer + protected: + const Coord3D origin_; //!< Origin of the region to write to + const Coord3D size_; //!< Size of the region to write to + LiquidFlashFile* file_; //!< The file object for data read + size_t fileOffset_; //!< Offset in the file for data read + amd::Memory* staging_[NumStagingBuffers]; //!< Staging buffers for transfer -public: - TransferBufferFileCommand( - cl_command_type type, - HostQueue& queue, - const EventWaitList& eventWaitList, - Memory& memory, const Coord3D& origin, - const Coord3D& size, LiquidFlashFile* file, size_t fileOffset) - : OneMemoryArgCommand(queue, type, - eventWaitList, memory) - , origin_(origin) - , size_(size) - , file_(file) - , fileOffset_(fileOffset) - { - // Sanity checks - assert(size.c[0] > 0 && "invalid"); - for (uint i = 0; i < NumStagingBuffers; 
++i) { - staging_[i] = NULL; - } + public: + TransferBufferFileCommand(cl_command_type type, HostQueue& queue, + const EventWaitList& eventWaitList, Memory& memory, + const Coord3D& origin, const Coord3D& size, LiquidFlashFile* file, + size_t fileOffset) + : OneMemoryArgCommand(queue, type, eventWaitList, memory), + origin_(origin), + size_(size), + file_(file), + fileOffset_(fileOffset) { + // Sanity checks + assert(size.c[0] > 0 && "invalid"); + for (uint i = 0; i < NumStagingBuffers; ++i) { + staging_[i] = NULL; } + } - virtual void releaseResources(); + virtual void releaseResources(); - virtual void submit(device::VirtualDevice& device); + virtual void submit(device::VirtualDevice& device); - //! Return the memory object to write to - Memory& memory() const { return *memory_; } + //! Return the memory object to write to + Memory& memory() const { return *memory_; } - //! Return the host memory to read from - LiquidFlashFile* file() const { return file_; } + //! Return the host memory to read from + LiquidFlashFile* file() const { return file_; } - //! Returns file offset - size_t fileOffset() const { return fileOffset_; } + //! Returns file offset + size_t fileOffset() const { return fileOffset_; } - //! Return the region origin - const Coord3D& origin() const { return origin_; } - //! Return the region size - const Coord3D& size() const { return size_; } + //! Return the region origin + const Coord3D& origin() const { return origin_; } + //! Return the region size + const Coord3D& size() const { return size_; } - //! Return the staging buffer for transfer - Memory& staging(uint i) const { return *staging_[i]; } + //! Return the staging buffer for transfer + Memory& staging(uint i) const { return *staging_[i]; } - bool validateMemory(); + bool validateMemory(); }; /*! 
@} * @} */ -} // namespace amd +} // namespace amd #endif /*COMMAND_HPP_*/ diff --git a/rocclr/runtime/platform/commandqueue.cpp b/rocclr/runtime/platform/commandqueue.cpp index a954685ac2..6a452042ce 100644 --- a/rocclr/runtime/platform/commandqueue.cpp +++ b/rocclr/runtime/platform/commandqueue.cpp @@ -17,173 +17,157 @@ namespace amd { -HostQueue::HostQueue( - Context& context, Device& device, - cl_command_queue_properties properties, uint queueRTCUs, Priority priority - ) - : CommandQueue(context, device, properties, device.info().queueProperties_ - | CL_QUEUE_COMMAND_INTERCEPT_ENABLE_AMD, queueRTCUs, priority) -{ - if (thread_.state() >= Thread::INITIALIZED) { - ScopedLock sl(queueLock_); - thread_.start(this); - queueLock_.wait(); - } +HostQueue::HostQueue(Context& context, Device& device, cl_command_queue_properties properties, + uint queueRTCUs, Priority priority) + : CommandQueue(context, device, properties, + device.info().queueProperties_ | CL_QUEUE_COMMAND_INTERCEPT_ENABLE_AMD, + queueRTCUs, priority) { + if (thread_.state() >= Thread::INITIALIZED) { + ScopedLock sl(queueLock_); + thread_.start(this); + queueLock_.wait(); + } } -bool -HostQueue::terminate() -{ - if (Os::isThreadAlive(thread_)) { - // Make sure all the commands are finished on the device. - finish(); +bool HostQueue::terminate() { + if (Os::isThreadAlive(thread_)) { + // Make sure all the commands are finished on the device. + finish(); - // Kill the command queue loop. - thread_.acceptingCommands_ = false; + // Kill the command queue loop. 
+ thread_.acceptingCommands_ = false; - // Wake-up the command loop, so it can exit - flush(); + // Wake-up the command loop, so it can exit + flush(); - // FIXME_lmoriche: fix termination handshake - while (thread_.state() < Thread::FINISHED) { - Os::yield(); - } + // FIXME_lmoriche: fix termination handshake + while (thread_.state() < Thread::FINISHED) { + Os::yield(); } + } - if (Agent::shouldPostCommandQueueEvents()) { - Agent::postCommandQueueFree(as_cl(this->asCommandQueue())); - } + if (Agent::shouldPostCommandQueueEvents()) { + Agent::postCommandQueueFree(as_cl(this->asCommandQueue())); + } - return true; + return true; } -void -HostQueue::finish() -{ - // Send a finish to make sure we finished all commands - Command* command = new Marker(*this, false); +void HostQueue::finish() { + // Send a finish to make sure we finished all commands + Command* command = new Marker(*this, false); + if (command == NULL) { + return; + } + + command->enqueue(); + command->awaitCompletion(); + command->release(); +} + +void HostQueue::loop(device::VirtualDevice* virtualDevice) { + cl_int(CL_CALLBACK * commandIntercept)(cl_event, cl_int*) = + properties().test(CL_QUEUE_COMMAND_INTERCEPT_ENABLE_AMD) ? context().info().commandIntercept_ + : NULL; + + // Notify the caller that the queue is ready to accept commands. + { + ScopedLock sl(queueLock_); + thread_.acceptingCommands_ = true; + queueLock_.notify(); + } + // Create a command batch with all the commands present in the queue. 
+ Command* head = NULL; + Command* tail = NULL; + while (true) { + // Get one command from the queue + Command* command = queue_.dequeue(); if (command == NULL) { - return; + ScopedLock sl(queueLock_); + while ((command = queue_.dequeue()) == NULL) { + if (!thread_.acceptingCommands_) { + return; + } + queueLock_.wait(); + } } - command->enqueue(); - command->awaitCompletion(); - command->release(); -} + command->retain(); -void -HostQueue::loop(device::VirtualDevice* virtualDevice) -{ - cl_int (CL_CALLBACK * commandIntercept)(cl_event, cl_int *) = - properties().test(CL_QUEUE_COMMAND_INTERCEPT_ENABLE_AMD) - ? context().info().commandIntercept_ : NULL; + // Process the command's event wait list. + const Command::EventWaitList& events = command->eventWaitList(); + Command::EventWaitList::const_iterator it; + bool dependencyFailed = false; - // Notify the caller that the queue is ready to accept commands. - { - ScopedLock sl(queueLock_); - thread_.acceptingCommands_ = true; - queueLock_.notify(); - } - // Create a command batch with all the commands present in the queue. - Command* head = NULL; - Command* tail = NULL; - while (true) { - // Get one command from the queue - Command* command = queue_.dequeue(); - if (command == NULL) { - ScopedLock sl(queueLock_); - while ((command = queue_.dequeue()) == NULL) { - - if (!thread_.acceptingCommands_) { - return; - } - queueLock_.wait(); - } - } - - command->retain(); - - // Process the command's event wait list. - const Command::EventWaitList& events = command->eventWaitList(); - Command::EventWaitList::const_iterator it; - bool dependencyFailed = false; - - for (it = events.begin(); it != events.end(); ++it) { - // Only wait if the command is enqueued into another queue. - if ((*it)->command().queue() != this) { - virtualDevice->flush(head, true); - tail = head = NULL; - dependencyFailed |= !(*it)->awaitCompletion(); - } - } - - // Insert the command to the linked list. 
- if (NULL == head) { //if the list is empty - head = tail = command; - } - else { - tail->setNext(command); - tail = command; - } - - if (dependencyFailed) { - command->setStatus(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST); - continue; - } - - command->setStatus(CL_SUBMITTED); - - cl_int result; - if ((commandIntercept != NULL) && - commandIntercept(as_cl(command), &result)) { - // The command was handled by the callback. - command->setStatus(CL_RUNNING, command->profilingInfo().submitted_); - command->setStatus(result); - continue; - } - - // Submit to the device queue. - command->submit(*virtualDevice); - - //if we are in intercept mode or this is a user invisible marker command - if ((0 == command->type()) || (commandIntercept != NULL)) { - virtualDevice->flush(head); - tail = head = NULL; - } - } // while (true) { -} - -void -HostQueue::append(Command& command) -{ - // We retain the command here. It will be released when its status - // changes to CL_COMPLETE - command.retain(); - command.setStatus(CL_QUEUED); - queue_.enqueue(&command); -} - -DeviceQueue::~DeviceQueue() -{ - delete virtualDevice_; - ScopedLock lock(context().lock()); - context().removeDeviceQueue(device(), this); -} - -bool -DeviceQueue::create() -{ - static const bool InteropQueue = true; - const bool defaultDeviceQueue = properties().test(CL_QUEUE_ON_DEVICE_DEFAULT); - bool result = false; - - virtualDevice_ = device().createVirtualDevice(this); - if (virtualDevice_ != NULL) { - result = true; - context().addDeviceQueue(device(), this, defaultDeviceQueue); + for (it = events.begin(); it != events.end(); ++it) { + // Only wait if the command is enqueued into another queue. + if ((*it)->command().queue() != this) { + virtualDevice->flush(head, true); + tail = head = NULL; + dependencyFailed |= !(*it)->awaitCompletion(); + } } - return result; + // Insert the command to the linked list. 
+ if (NULL == head) { // if the list is empty + head = tail = command; + } else { + tail->setNext(command); + tail = command; + } + + if (dependencyFailed) { + command->setStatus(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST); + continue; + } + + command->setStatus(CL_SUBMITTED); + + cl_int result; + if ((commandIntercept != NULL) && commandIntercept(as_cl(command), &result)) { + // The command was handled by the callback. + command->setStatus(CL_RUNNING, command->profilingInfo().submitted_); + command->setStatus(result); + continue; + } + + // Submit to the device queue. + command->submit(*virtualDevice); + + // if we are in intercept mode or this is a user invisible marker command + if ((0 == command->type()) || (commandIntercept != NULL)) { + virtualDevice->flush(head); + tail = head = NULL; + } + } // while (true) { } -} //namespace amd { +void HostQueue::append(Command& command) { + // We retain the command here. It will be released when its status + // changes to CL_COMPLETE + command.retain(); + command.setStatus(CL_QUEUED); + queue_.enqueue(&command); +} + +DeviceQueue::~DeviceQueue() { + delete virtualDevice_; + ScopedLock lock(context().lock()); + context().removeDeviceQueue(device(), this); +} + +bool DeviceQueue::create() { + static const bool InteropQueue = true; + const bool defaultDeviceQueue = properties().test(CL_QUEUE_ON_DEVICE_DEFAULT); + bool result = false; + + virtualDevice_ = device().createVirtualDevice(this); + if (virtualDevice_ != NULL) { + result = true; + context().addDeviceQueue(device(), this, defaultDeviceQueue); + } + + return result; +} + +} // namespace amd { diff --git a/rocclr/runtime/platform/commandqueue.hpp b/rocclr/runtime/platform/commandqueue.hpp index 3bea84fc45..218e36913b 100644 --- a/rocclr/runtime/platform/commandqueue.hpp +++ b/rocclr/runtime/platform/commandqueue.hpp @@ -28,231 +28,215 @@ namespace amd { class HostQueue; class DeviceQueue; -class CommandQueue : public RuntimeObject -{ -public: - static const uint 
RealTimeDisabled = 0xffffffff; - enum class Priority : uint { - Normal = 0, - Medium - }; +class CommandQueue : public RuntimeObject { + public: + static const uint RealTimeDisabled = 0xffffffff; + enum class Priority : uint { Normal = 0, Medium }; - struct Properties - { - typedef cl_command_queue_properties value_type; - const value_type mask_; - value_type value_; + struct Properties { + typedef cl_command_queue_properties value_type; + const value_type mask_; + value_type value_; - Properties(value_type mask, value_type value) : - mask_(mask), value_(value & mask) - { } + Properties(value_type mask, value_type value) : mask_(mask), value_(value & mask) {} - bool set(value_type bits) { - if ((mask_ & bits) != bits) { - return false; - } - value_ |= bits; - return true; - } + bool set(value_type bits) { + if ((mask_ & bits) != bits) { + return false; + } + value_ |= bits; + return true; + } - bool clear(value_type bits) { - if ((mask_ & bits) != bits) { - return false; - } - value_ &= ~bits; - return true; - } + bool clear(value_type bits) { + if ((mask_ & bits) != bits) { + return false; + } + value_ &= ~bits; + return true; + } - bool test(value_type bits) const { - return (value_ & bits) != 0; - } - }; + bool test(value_type bits) const { return (value_ & bits) != 0; } + }; - //! Return the context this command queue is part of. - Context& context() const { return context_(); } + //! Return the context this command queue is part of. + Context& context() const { return context_(); } - //! Return the device for this command queue. - Device& device() const { return device_; } + //! Return the device for this command queue. + Device& device() const { return device_; } - //! Return the command queue properties. - Properties properties() const { return properties_; } - Properties& properties() { return properties_; } + //! Return the command queue properties. 
+ Properties properties() const { return properties_; } + Properties& properties() { return properties_; } - //! Returns the base class object - CommandQueue* asCommandQueue() { return this; } + //! Returns the base class object + CommandQueue* asCommandQueue() { return this; } - virtual ~CommandQueue() {} + virtual ~CommandQueue() {} - //! Returns TRUE if the object was successfully created - virtual bool create() = 0; + //! Returns TRUE if the object was successfully created + virtual bool create() = 0; - //! RTTI internal implementation - virtual ObjectType objectType() const { return ObjectTypeQueue; } + //! RTTI internal implementation + virtual ObjectType objectType() const { return ObjectTypeQueue; } - //! Rturns HostQueue object - virtual HostQueue* asHostQueue() { return NULL; } + //! Rturns HostQueue object + virtual HostQueue* asHostQueue() { return NULL; } - //! Returns DeviceQueue object - virtual DeviceQueue* asDeviceQueue() { return NULL; } + //! Returns DeviceQueue object + virtual DeviceQueue* asDeviceQueue() { return NULL; } - //! Returns the number or requested real time CUs - uint rtCUs() const { return rtCUs_; } + //! Returns the number or requested real time CUs + uint rtCUs() const { return rtCUs_; } - //! Returns the queue priority - Priority priority() const { return priority_; } + //! Returns the queue priority + Priority priority() const { return priority_; } -protected: - //! CommandQueue constructor is protected - //! 
to keep the CommandQueue class as a virtual interface - CommandQueue( - Context& context, //!< Context object - Device& device, //!< Device object - cl_command_queue_properties properties, //!< Queue properties - cl_command_queue_properties propMask, //!< Queue properties mask - uint rtCUs = RealTimeDisabled, //!< Avaialble real time compute units - Priority priority = Priority::Normal //!< Queue priority - ) - : properties_(propMask, properties) - , rtCUs_(rtCUs) - , priority_(priority) - , queueLock_("CommandQueue::queueLock") - , device_(device) - , context_(context) {} + protected: + //! CommandQueue constructor is protected + //! to keep the CommandQueue class as a virtual interface + CommandQueue(Context& context, //!< Context object + Device& device, //!< Device object + cl_command_queue_properties properties, //!< Queue properties + cl_command_queue_properties propMask, //!< Queue properties mask + uint rtCUs = RealTimeDisabled, //!< Avaialble real time compute units + Priority priority = Priority::Normal //!< Queue priority + ) + : properties_(propMask, properties), + rtCUs_(rtCUs), + priority_(priority), + queueLock_("CommandQueue::queueLock"), + device_(device), + context_(context) {} - Properties properties_; //!< Queue properties - uint rtCUs_; //!< The number of used RT compute units - Priority priority_; //!< Queue priority - Monitor queueLock_; //!< Lock protecting the queue - Device& device_; //!< The device - SharedReference context_; //!< The context of this command queue + Properties properties_; //!< Queue properties + uint rtCUs_; //!< The number of used RT compute units + Priority priority_; //!< Queue priority + Monitor queueLock_; //!< Lock protecting the queue + Device& device_; //!< The device + SharedReference context_; //!< The context of this command queue -private: - //! Disable copy constructor - CommandQueue(const CommandQueue&); + private: + //! Disable copy constructor + CommandQueue(const CommandQueue&); - //! 
Disable assignment - CommandQueue& operator=(const CommandQueue&); + //! Disable assignment + CommandQueue& operator=(const CommandQueue&); }; -class HostQueue : public CommandQueue -{ - class Thread : public amd::Thread - { - public: - //! True if this command queue thread is accepting commands. - volatile bool acceptingCommands_; +class HostQueue : public CommandQueue { + class Thread : public amd::Thread { + public: + //! True if this command queue thread is accepting commands. + volatile bool acceptingCommands_; - //! Create a new thread - Thread() : amd::Thread("Command Queue Thread", CQ_THREAD_STACK_SIZE), - acceptingCommands_(false), virtualDevice_(NULL) - { } + //! Create a new thread + Thread() + : amd::Thread("Command Queue Thread", CQ_THREAD_STACK_SIZE), + acceptingCommands_(false), + virtualDevice_(NULL) {} - //! The command queue thread entry point. - void run(void *data) { - HostQueue* queue = static_cast(data); - virtualDevice_ = queue->device().createVirtualDevice(queue); - if (virtualDevice_ != NULL) { - queue->loop(virtualDevice_); - if (virtualDevice_->terminate()) { - delete virtualDevice_; - } - } - else { - acceptingCommands_ = false; - queue->flush(); - } + //! The command queue thread entry point. + void run(void* data) { + HostQueue* queue = static_cast(data); + virtualDevice_ = queue->device().createVirtualDevice(queue); + if (virtualDevice_ != NULL) { + queue->loop(virtualDevice_); + if (virtualDevice_->terminate()) { + delete virtualDevice_; } + } else { + acceptingCommands_ = false; + queue->flush(); + } + } - //! Get virtual device for the current thread - device::VirtualDevice* vdev() const { return virtualDevice_; } + //! 
Get virtual device for the current thread + device::VirtualDevice* vdev() const { return virtualDevice_; } - private: - device::VirtualDevice* virtualDevice_; //!< Virtual device for this thread + private: + device::VirtualDevice* virtualDevice_; //!< Virtual device for this thread - } thread_; //!< The command queue thread instance. + } thread_; //!< The command queue thread instance. -private: - ConcurrentLinkedQueue queue_; //!< The queue. + private: + ConcurrentLinkedQueue queue_; //!< The queue. - //! Await commands and execute them as they become ready. - void loop(device::VirtualDevice* virtualDevice); + //! Await commands and execute them as they become ready. + void loop(device::VirtualDevice* virtualDevice); -protected: - virtual bool terminate(); + protected: + virtual bool terminate(); -public: - /*! \brief Construct a new host queue. - * - * \note A new virtual device instance will be created from the - * given device. - */ - HostQueue( - Context& context, - Device& device, - cl_command_queue_properties properties, - uint queueRTCUs = 0, - Priority priority = Priority::Normal - ); + public: + /*! \brief Construct a new host queue. + * + * \note A new virtual device instance will be created from the + * given device. + */ + HostQueue(Context& context, Device& device, cl_command_queue_properties properties, + uint queueRTCUs = 0, Priority priority = Priority::Normal); - //! Returns TRUE if this command queue can accept commands. - virtual bool create() { return thread_.acceptingCommands_; } + //! Returns TRUE if this command queue can accept commands. + virtual bool create() { return thread_.acceptingCommands_; } - //! Append the given command to the queue. - void append(Command& command); + //! Append the given command to the queue. + void append(Command& command); - //! Return the thread object running the command loop. - const Thread& thread() const { return thread_; } + //! Return the thread object running the command loop. 
+ const Thread& thread() const { return thread_; } - //! Signal to start processing the commands in the queue. - void flush () { ScopedLock sl(queueLock_); queueLock_.notify(); } + //! Signal to start processing the commands in the queue. + void flush() { + ScopedLock sl(queueLock_); + queueLock_.notify(); + } - //! Finish all queued commands - void finish(); + //! Finish all queued commands + void finish(); - //! Get virtual device for the current command queue - device::VirtualDevice* vdev() const { return thread_.vdev(); } + //! Get virtual device for the current command queue + device::VirtualDevice* vdev() const { return thread_.vdev(); } - //! Return the current queue as the HostQueue - virtual HostQueue* asHostQueue() { return this; } + //! Return the current queue as the HostQueue + virtual HostQueue* asHostQueue() { return this; } }; -class DeviceQueue : public CommandQueue -{ -public: - DeviceQueue( - Context& context, //!< Context object - Device& device, //!< Device object - cl_command_queue_properties properties, //!< Queue properties - uint size //!< Device queue size - ) - : CommandQueue(context, device, properties, device.info().queueOnDeviceProperties_ - | CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT) - , size_(size) - , virtualDevice_(NULL) {} +class DeviceQueue : public CommandQueue { + public: + DeviceQueue(Context& context, //!< Context object + Device& device, //!< Device object + cl_command_queue_properties properties, //!< Queue properties + uint size //!< Device queue size + ) + : CommandQueue(context, device, properties, device.info().queueOnDeviceProperties_ | + CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT), + size_(size), + virtualDevice_(NULL) {} - virtual ~DeviceQueue(); + virtual ~DeviceQueue(); - //! Returns TRUE if device queue was successfully created - virtual bool create(); + //! Returns TRUE if device queue was successfully created + virtual bool create(); - //! 
Return the current queue as the DeviceQueue - virtual DeviceQueue* asDeviceQueue() { return this; } + //! Return the current queue as the DeviceQueue + virtual DeviceQueue* asDeviceQueue() { return this; } - //! Returns the size of device queue - uint size() const { return size_; } + //! Returns the size of device queue + uint size() const { return size_; } - //! Returns virtual device for this device queue - device::VirtualDevice* vDev() const { return virtualDevice_; } + //! Returns virtual device for this device queue + device::VirtualDevice* vDev() const { return virtualDevice_; } - //! Returns the queue lock - Monitor& lock() { return queueLock_; } + //! Returns the queue lock + Monitor& lock() { return queueLock_; } -private: - uint size_; //!< Device queue size - device::VirtualDevice* virtualDevice_; //!< Virtual device for this queue + private: + uint size_; //!< Device queue size + device::VirtualDevice* virtualDevice_; //!< Virtual device for this queue }; -} //namespace amd +} // namespace amd -#endif //COMMAND_QUEUE_HPP_ \ No newline at end of file +#endif // COMMAND_QUEUE_HPP_ \ No newline at end of file diff --git a/rocclr/runtime/platform/context.cpp b/rocclr/runtime/platform/context.cpp index e2efcbff85..71e3050c0c 100644 --- a/rocclr/runtime/platform/context.cpp +++ b/rocclr/runtime/platform/context.cpp @@ -16,389 +16,345 @@ #include "CL/cl_d3d10.h" #include "CL/cl_d3d11.h" #include "CL/cl_dx9_media_sharing.h" -#endif //_WIN32 +#endif //_WIN32 namespace amd { -Context::Context( - const std::vector& devices, - const Info& info) - : devices_(devices) - , info_(info) - , properties_(NULL) - , glenv_(NULL) - , customHostAllocDevice_(NULL) -{ - for (const auto& device : devices) { - device->retain(); - if (customHostAllocDevice_ == NULL && device->customHostAllocator()) { - customHostAllocDevice_ = device; - } - if (device->svmSupport()) { - svmAllocDevice_.push_back(device); - } +Context::Context(const std::vector& devices, const Info& info) + : 
devices_(devices), + info_(info), + properties_(NULL), + glenv_(NULL), + customHostAllocDevice_(NULL) { + for (const auto& device : devices) { + device->retain(); + if (customHostAllocDevice_ == NULL && device->customHostAllocator()) { + customHostAllocDevice_ = device; } - if (svmAllocDevice_.size() > 1) { - //make sure the CPU is the last device to do allocation. - if ((svmAllocDevice_.front()->type() == CL_DEVICE_TYPE_CPU)) { - std::swap(svmAllocDevice_.front(), svmAllocDevice_.back()); - } - - uint isFirstDeviceFGSEnabled = svmAllocDevice_.front()->isFineGrainedSystem(true); - for (auto& dev : svmAllocDevice_) { - //allocation on fine - grained system incapable device first - if (isFirstDeviceFGSEnabled && (dev->type() == CL_DEVICE_TYPE_GPU) - && (!(dev->isFineGrainedSystem(true)))) { - std::swap(svmAllocDevice_.front(), dev); - break; - } - } - + if (device->svmSupport()) { + svmAllocDevice_.push_back(device); + } + } + if (svmAllocDevice_.size() > 1) { + // make sure the CPU is the last device to do allocation. 
+ if ((svmAllocDevice_.front()->type() == CL_DEVICE_TYPE_CPU)) { + std::swap(svmAllocDevice_.front(), svmAllocDevice_.back()); } + uint isFirstDeviceFGSEnabled = svmAllocDevice_.front()->isFineGrainedSystem(true); + for (auto& dev : svmAllocDevice_) { + // allocation on fine - grained system incapable device first + if (isFirstDeviceFGSEnabled && (dev->type() == CL_DEVICE_TYPE_GPU) && + (!(dev->isFineGrainedSystem(true)))) { + std::swap(svmAllocDevice_.front(), dev); + break; + } + } + } } -Context::~Context() -{ - static const bool VALIDATE_ONLY = false; +Context::~Context() { + static const bool VALIDATE_ONLY = false; - // Dissociate OCL context with any external device - if (info_.flags_ & (GLDeviceKhr | D3D10DeviceKhr | D3D11DeviceKhr)) { - std::vector::const_iterator it; - // Loop through all devices - for (it = devices_.begin(); it != devices_.end(); it++) { - (*it)->unbindExternalDevice(info_.flags_, info_.hDev_, info_.hCtx_, VALIDATE_ONLY); - } + // Dissociate OCL context with any external device + if (info_.flags_ & (GLDeviceKhr | D3D10DeviceKhr | D3D11DeviceKhr)) { + std::vector::const_iterator it; + // Loop through all devices + for (it = devices_.begin(); it != devices_.end(); it++) { + (*it)->unbindExternalDevice(info_.flags_, info_.hDev_, info_.hCtx_, VALIDATE_ONLY); } + } - if (properties_ != NULL) { - delete [] properties_; - } - if (glenv_ != NULL) { - delete glenv_; - glenv_ = NULL; - } + if (properties_ != NULL) { + delete[] properties_; + } + if (glenv_ != NULL) { + delete glenv_; + glenv_ = NULL; + } - std::for_each(devices_.begin(), devices_.end(), - std::mem_fun(&Device::release)); + std::for_each(devices_.begin(), devices_.end(), std::mem_fun(&Device::release)); } -int -Context::checkProperties( - const cl_context_properties* properties, - Context::Info* info) -{ - cl_platform_id pfmId = 0; - uint count = 0; +int Context::checkProperties(const cl_context_properties* properties, Context::Info* info) { + cl_platform_id pfmId = 0; + uint count 
= 0; - const struct Element - { - intptr_t name; - void* ptr; - } *p = reinterpret_cast(properties); + const struct Element { + intptr_t name; + void* ptr; + }* p = reinterpret_cast(properties); - // Clear the context infor structure - ::memset(info, 0, sizeof(Context::Info)); + // Clear the context infor structure + ::memset(info, 0, sizeof(Context::Info)); - if (properties == NULL) { - return CL_SUCCESS; - } + if (properties == NULL) { + return CL_SUCCESS; + } - // Process all properties - while (p->name != 0) { - switch (p->name) { - case CL_CONTEXT_INTEROP_USER_SYNC: - if (p->ptr == reinterpret_cast(CL_TRUE)) - { - info->flags_ |= InteropUserSync; - } - break; + // Process all properties + while (p->name != 0) { + switch (p->name) { + case CL_CONTEXT_INTEROP_USER_SYNC: + if (p->ptr == reinterpret_cast(CL_TRUE)) { + info->flags_ |= InteropUserSync; + } + break; #ifdef _WIN32 - case CL_CONTEXT_D3D10_DEVICE_KHR: - if (p->ptr == NULL) { - return CL_INVALID_VALUE; - } - info->hDev_[D3D10DeviceKhrIdx] = p->ptr; - info->flags_ |= D3D10DeviceKhr; - break; - case CL_CONTEXT_D3D11_DEVICE_KHR: - if (p->ptr == NULL) { - return CL_INVALID_VALUE; - } - info->hDev_[D3D11DeviceKhrIdx] = p->ptr; - info->flags_ |= D3D11DeviceKhr; - break; - case CL_CONTEXT_ADAPTER_D3D9_KHR: - if (p->ptr == NULL) { //not supported for xp - return CL_INVALID_VALUE; - } - info->hDev_[D3D9DeviceKhrIdx] = p->ptr; - info->flags_ |= D3D9DeviceKhr; - break; - case CL_CONTEXT_ADAPTER_D3D9EX_KHR: - if (p->ptr == NULL) { - return CL_INVALID_VALUE; - } - info->hDev_[D3D9DeviceEXKhrIdx] = p->ptr; - info->flags_ |= D3D9DeviceEXKhr; - break; - case CL_CONTEXT_ADAPTER_DXVA_KHR: - if (p->ptr == NULL) { - return CL_INVALID_VALUE; - } - info->hDev_[D3D9DeviceVAKhrIdx] = p->ptr; - info->flags_ |= D3D9DeviceVAKhr; - break; -#endif //_WIN32 + case CL_CONTEXT_D3D10_DEVICE_KHR: + if (p->ptr == NULL) { + return CL_INVALID_VALUE; + } + info->hDev_[D3D10DeviceKhrIdx] = p->ptr; + info->flags_ |= D3D10DeviceKhr; + break; + 
case CL_CONTEXT_D3D11_DEVICE_KHR: + if (p->ptr == NULL) { + return CL_INVALID_VALUE; + } + info->hDev_[D3D11DeviceKhrIdx] = p->ptr; + info->flags_ |= D3D11DeviceKhr; + break; + case CL_CONTEXT_ADAPTER_D3D9_KHR: + if (p->ptr == NULL) { // not supported for xp + return CL_INVALID_VALUE; + } + info->hDev_[D3D9DeviceKhrIdx] = p->ptr; + info->flags_ |= D3D9DeviceKhr; + break; + case CL_CONTEXT_ADAPTER_D3D9EX_KHR: + if (p->ptr == NULL) { + return CL_INVALID_VALUE; + } + info->hDev_[D3D9DeviceEXKhrIdx] = p->ptr; + info->flags_ |= D3D9DeviceEXKhr; + break; + case CL_CONTEXT_ADAPTER_DXVA_KHR: + if (p->ptr == NULL) { + return CL_INVALID_VALUE; + } + info->hDev_[D3D9DeviceVAKhrIdx] = p->ptr; + info->flags_ |= D3D9DeviceVAKhr; + break; +#endif //_WIN32 - case CL_EGL_DISPLAY_KHR: - info->flags_ |= EGLDeviceKhr; + case CL_EGL_DISPLAY_KHR: + info->flags_ |= EGLDeviceKhr; #ifdef _WIN32 - case CL_WGL_HDC_KHR: -#endif //_WIN32 + case CL_WGL_HDC_KHR: +#endif //_WIN32 #if defined(__linux__) - case CL_GLX_DISPLAY_KHR: -#endif //linux - info->hDev_[GLDeviceKhrIdx] = p->ptr; + case CL_GLX_DISPLAY_KHR: +#endif // linux + info->hDev_[GLDeviceKhrIdx] = p->ptr; #if defined(__APPLE__) || defined(__MACOSX) - case CL_CGL_SHAREGROUP_KHR: - Unimplemented(); - break; -#endif //__APPLE__ || MACOS + case CL_CGL_SHAREGROUP_KHR: + Unimplemented(); + break; +#endif //__APPLE__ || MACOS - case CL_GL_CONTEXT_KHR: - if (p->ptr == NULL) { - return CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR; - } - if (p->name == CL_GL_CONTEXT_KHR) { - info->hCtx_ = p->ptr; - } - info->flags_ |= GLDeviceKhr; - break; - case CL_CONTEXT_PLATFORM: - pfmId = reinterpret_cast(p->ptr); - if ((NULL != pfmId) && (AMD_PLATFORM != pfmId)) { - return CL_INVALID_VALUE; - } - break; - case CL_CONTEXT_OFFLINE_DEVICES_AMD: - if (p->ptr != reinterpret_cast(1)) { - return CL_INVALID_VALUE; - } - // Set the offline device flag - info->flags_ |= OfflineDevices; - break; - case CL_CONTEXT_COMMAND_INTERCEPT_CALLBACK_AMD: - // Set the command 
intercept flag - info->commandIntercept_ = - (cl_int (CL_CALLBACK *)(cl_event, cl_int *)) p->ptr; - info->flags_ |= CommandIntercept; - break; - default: - return CL_INVALID_VALUE; + case CL_GL_CONTEXT_KHR: + if (p->ptr == NULL) { + return CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR; } - p++; - count++; + if (p->name == CL_GL_CONTEXT_KHR) { + info->hCtx_ = p->ptr; + } + info->flags_ |= GLDeviceKhr; + break; + case CL_CONTEXT_PLATFORM: + pfmId = reinterpret_cast(p->ptr); + if ((NULL != pfmId) && (AMD_PLATFORM != pfmId)) { + return CL_INVALID_VALUE; + } + break; + case CL_CONTEXT_OFFLINE_DEVICES_AMD: + if (p->ptr != reinterpret_cast(1)) { + return CL_INVALID_VALUE; + } + // Set the offline device flag + info->flags_ |= OfflineDevices; + break; + case CL_CONTEXT_COMMAND_INTERCEPT_CALLBACK_AMD: + // Set the command intercept flag + info->commandIntercept_ = (cl_int(CL_CALLBACK*)(cl_event, cl_int*))p->ptr; + info->flags_ |= CommandIntercept; + break; + default: + return CL_INVALID_VALUE; } + p++; + count++; + } - info->propertiesSize_ = count * sizeof(Element) + sizeof(intptr_t); - return CL_SUCCESS; + info->propertiesSize_ = count * sizeof(Element) + sizeof(intptr_t); + return CL_SUCCESS; } -int -Context::create(const intptr_t* properties) -{ - static const bool VALIDATE_ONLY = false; - int result = CL_SUCCESS; +int Context::create(const intptr_t* properties) { + static const bool VALIDATE_ONLY = false; + int result = CL_SUCCESS; - if (properties != NULL) { - properties_ = new cl_context_properties[ - info().propertiesSize_ / sizeof(cl_context_properties)]; - if (properties_ == NULL) { - return CL_OUT_OF_HOST_MEMORY; - } - - ::memcpy(properties_, properties, info().propertiesSize_); + if (properties != NULL) { + properties_ = new cl_context_properties[info().propertiesSize_ / sizeof(cl_context_properties)]; + if (properties_ == NULL) { + return CL_OUT_OF_HOST_MEMORY; } - // Check if OCL context can be associated with any external device - if (info_.flags_ & (D3D10DeviceKhr 
| D3D11DeviceKhr | GLDeviceKhr | - D3D9DeviceKhr | D3D9DeviceEXKhr | D3D9DeviceVAKhr)) { - std::vector::const_iterator it; - // Loop through all devices - for (it = devices_.begin(); it != devices_.end(); it++) { - if (!(*it)->bindExternalDevice( - info_.flags_, info_.hDev_, info_.hCtx_, VALIDATE_ONLY)) { - result = CL_INVALID_VALUE; - } - } - } + ::memcpy(properties_, properties, info().propertiesSize_); + } - // Check if the device binding wasn't successful - if (result != CL_SUCCESS) { - if (info_.flags_ & GLDeviceKhr) { - result = CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR; - } - else if (info_.flags_ & D3D10DeviceKhr) { - //return CL_INVALID_VALUE; // FIXME_odintsov: CL_INVALID_D3D_INTEROP; - } - else if (info_.flags_ & D3D11DeviceKhr) { - //return CL_INVALID_VALUE; // FIXME_odintsov: CL_INVALID_D3D_INTEROP; - } - else if (info_.flags_ & (D3D9DeviceKhr | D3D9DeviceEXKhr | D3D9DeviceVAKhr)) { - //return CL_INVALID_DX9_MEDIA_ADAPTER_KHR; - } + // Check if OCL context can be associated with any external device + if (info_.flags_ & (D3D10DeviceKhr | D3D11DeviceKhr | GLDeviceKhr | D3D9DeviceKhr | + D3D9DeviceEXKhr | D3D9DeviceVAKhr)) { + std::vector::const_iterator it; + // Loop through all devices + for (it = devices_.begin(); it != devices_.end(); it++) { + if (!(*it)->bindExternalDevice(info_.flags_, info_.hDev_, info_.hCtx_, VALIDATE_ONLY)) { + result = CL_INVALID_VALUE; + } } - else { - if (info_.flags_ & GLDeviceKhr) { - // Init context for GL interop - if(glenv_ == NULL) { - HMODULE h = (HMODULE) Os::loadLibrary( + } + + // Check if the device binding wasn't successful + if (result != CL_SUCCESS) { + if (info_.flags_ & GLDeviceKhr) { + result = CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR; + } else if (info_.flags_ & D3D10DeviceKhr) { + // return CL_INVALID_VALUE; // FIXME_odintsov: CL_INVALID_D3D_INTEROP; + } else if (info_.flags_ & D3D11DeviceKhr) { + // return CL_INVALID_VALUE; // FIXME_odintsov: CL_INVALID_D3D_INTEROP; + } else if (info_.flags_ & (D3D9DeviceKhr | 
D3D9DeviceEXKhr | D3D9DeviceVAKhr)) { + // return CL_INVALID_DX9_MEDIA_ADAPTER_KHR; + } + } else { + if (info_.flags_ & GLDeviceKhr) { + // Init context for GL interop + if (glenv_ == NULL) { + HMODULE h = (HMODULE)Os::loadLibrary( #ifdef _WIN32 - "OpenGL32.dll" -#else //!_WIN32 - "libGL.so.1" -#endif //!_WIN32 - ); + "OpenGL32.dll" +#else //!_WIN32 + "libGL.so.1" +#endif //!_WIN32 + ); - if (h && (glenv_ = new GLFunctions(h, (info_.flags_ & Flags::EGLDeviceKhr) != 0))) { - if (!glenv_->init(reinterpret_cast(info_.hDev_[GLDeviceKhrIdx]), - reinterpret_cast(info_.hCtx_))) { - delete glenv_; - glenv_ = NULL; - result = CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR; - } - } - - } + if (h && (glenv_ = new GLFunctions(h, (info_.flags_ & Flags::EGLDeviceKhr) != 0))) { + if (!glenv_->init(reinterpret_cast(info_.hDev_[GLDeviceKhrIdx]), + reinterpret_cast(info_.hCtx_))) { + delete glenv_; + glenv_ = NULL; + result = CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR; + } } + } } + } - return result; + return result; } -void* -Context::hostAlloc(size_t size, size_t alignment, bool atomics) const -{ - if (customHostAllocDevice_ != NULL) { - return customHostAllocDevice_->hostAlloc(size, alignment, atomics); - } +void* Context::hostAlloc(size_t size, size_t alignment, bool atomics) const { + if (customHostAllocDevice_ != NULL) { + return customHostAllocDevice_->hostAlloc(size, alignment, atomics); + } + return AlignedMemory::allocate(size, alignment); +} + +void Context::hostFree(void* ptr) const { + if (customHostAllocDevice_ != NULL) { + customHostAllocDevice_->hostFree(ptr); + return; + } + AlignedMemory::deallocate(ptr); +} + +void* Context::svmAlloc(size_t size, size_t alignment, cl_svm_mem_flags flags) { + unsigned int numSVMDev = svmAllocDevice_.size(); + if (numSVMDev < 1) { + return NULL; + } + + if (svmAllocDevice_.front()->type() == CL_DEVICE_TYPE_CPU) { return AlignedMemory::allocate(size, alignment); -} - -void -Context::hostFree(void* ptr) const -{ - if (customHostAllocDevice_ != 
NULL) { - customHostAllocDevice_->hostFree(ptr); - return; - } - AlignedMemory::deallocate(ptr); -} - -void* -Context::svmAlloc(size_t size, size_t alignment, cl_svm_mem_flags flags) -{ - unsigned int numSVMDev = svmAllocDevice_.size(); - if (numSVMDev < 1) { - return NULL; - } - - if (svmAllocDevice_.front()->type() == CL_DEVICE_TYPE_CPU) { - return AlignedMemory::allocate(size, alignment); - } - else { - void* svmPtrAlloced = NULL; - void* tempPtr = NULL; - - for (const auto& dev : svmAllocDevice_) { - if (dev->type() == CL_DEVICE_TYPE_GPU) { - //check if the device support svm platform atomics, - //skipped allocation for platform atomics if not supported by this device - if ((flags & CL_MEM_SVM_ATOMICS) && !(dev->info().svmCapabilities_ & CL_DEVICE_SVM_ATOMICS)) { - continue; - } - svmPtrAlloced = dev->svmAlloc(*this, size, alignment, flags, svmPtrAlloced); - if (svmPtrAlloced == NULL) { - return NULL; - } - } - } - return svmPtrAlloced; - } -} - -void -Context::svmFree(void* ptr) const -{ - if (svmAllocDevice_.front()->type() == CL_DEVICE_TYPE_CPU) { - AlignedMemory::deallocate(ptr); - return; - } + } else { + void* svmPtrAlloced = NULL; + void* tempPtr = NULL; for (const auto& dev : svmAllocDevice_) { - if (dev->type() == CL_DEVICE_TYPE_GPU) { - dev->svmFree(ptr); + if (dev->type() == CL_DEVICE_TYPE_GPU) { + // check if the device support svm platform atomics, + // skipped allocation for platform atomics if not supported by this device + if ((flags & CL_MEM_SVM_ATOMICS) && + !(dev->info().svmCapabilities_ & CL_DEVICE_SVM_ATOMICS)) { + continue; } + svmPtrAlloced = dev->svmAlloc(*this, size, alignment, flags, svmPtrAlloced); + if (svmPtrAlloced == NULL) { + return NULL; + } + } } + return svmPtrAlloced; + } +} + +void Context::svmFree(void* ptr) const { + if (svmAllocDevice_.front()->type() == CL_DEVICE_TYPE_CPU) { + AlignedMemory::deallocate(ptr); return; -} + } -bool -Context::containsDevice(const Device* device) const -{ - std::vector::const_iterator it; - - 
for (it = devices_.begin(); it != devices_.end(); ++it) { - if (device == *it || (*it)->isAncestor(device)) { - return true; - } + for (const auto& dev : svmAllocDevice_) { + if (dev->type() == CL_DEVICE_TYPE_GPU) { + dev->svmFree(ptr); } - return false; + } + return; } -DeviceQueue* -Context::defDeviceQueue(const Device& dev) const -{ - std::map::const_iterator it = - deviceQueues_.find(&dev); - if (it != deviceQueues_.end()) { - return it->second.defDeviceQueue_; - } - else { - return NULL; +bool Context::containsDevice(const Device* device) const { + std::vector::const_iterator it; + + for (it = devices_.begin(); it != devices_.end(); ++it) { + if (device == *it || (*it)->isAncestor(device)) { + return true; } + } + return false; } -bool -Context::isDevQueuePossible(const Device& dev) -{ - return (deviceQueues_[&dev].deviceQueueCnt_ < dev.info().maxOnDeviceQueues_) ? - true : false; +DeviceQueue* Context::defDeviceQueue(const Device& dev) const { + std::map::const_iterator it = deviceQueues_.find(&dev); + if (it != deviceQueues_.end()) { + return it->second.defDeviceQueue_; + } else { + return NULL; + } } -void -Context::addDeviceQueue(const Device& dev, DeviceQueue* queue, bool defDevQueue) -{ - DeviceQueueInfo& info = deviceQueues_[&dev]; - info.deviceQueueCnt_++; - if (defDevQueue) { - info.defDeviceQueue_ = queue; - } +bool Context::isDevQueuePossible(const Device& dev) { + return (deviceQueues_[&dev].deviceQueueCnt_ < dev.info().maxOnDeviceQueues_) ? 
true : false; } -void -Context::removeDeviceQueue(const Device& dev, DeviceQueue* queue) -{ - DeviceQueueInfo& info = deviceQueues_[&dev]; - assert((info.deviceQueueCnt_ != 0) && "The device queue map is empty!"); - info.deviceQueueCnt_--; - if (info.defDeviceQueue_ == queue) { - info.defDeviceQueue_ = NULL; - } +void Context::addDeviceQueue(const Device& dev, DeviceQueue* queue, bool defDevQueue) { + DeviceQueueInfo& info = deviceQueues_[&dev]; + info.deviceQueueCnt_++; + if (defDevQueue) { + info.defDeviceQueue_ = queue; + } } -} // namespace amd +void Context::removeDeviceQueue(const Device& dev, DeviceQueue* queue) { + DeviceQueueInfo& info = deviceQueues_[&dev]; + assert((info.deviceQueueCnt_ != 0) && "The device queue map is empty!"); + info.deviceQueueCnt_--; + if (info.defDeviceQueue_ == queue) { + info.defDeviceQueue_ = NULL; + } +} + +} // namespace amd diff --git a/rocclr/runtime/platform/context.hpp b/rocclr/runtime/platform/context.hpp index cd42ff4058..7721156881 100644 --- a/rocclr/runtime/platform/context.hpp +++ b/rocclr/runtime/platform/context.hpp @@ -25,192 +25,182 @@ namespace amd { class GLFunctions; class DeviceQueue; -class Context : public RuntimeObject -{ - std::vector devices_; +class Context : public RuntimeObject { + std::vector devices_; -public: - enum DeviceFlagIdx - { - GLDeviceKhrIdx = 0, //!< GL - D3D10DeviceKhrIdx, //!< D3D10 - OfflineDevicesIdx, //!< Offline devices - CommandInterceptIdx, //!< Command intercept - D3D11DeviceKhrIdx, //!< D3D11 - InteropUserSyncIdx, //!< Interop user sync enabled - D3D9DeviceKhrIdx, //!< d3d9 device - D3D9DeviceEXKhrIdx, //!< d3d9EX device - D3D9DeviceVAKhrIdx, //!< d3d9VA device - EGLDeviceKhrIdx, //!< EGL device - LastDeviceFlagIdx - }; + public: + enum DeviceFlagIdx { + GLDeviceKhrIdx = 0, //!< GL + D3D10DeviceKhrIdx, //!< D3D10 + OfflineDevicesIdx, //!< Offline devices + CommandInterceptIdx, //!< Command intercept + D3D11DeviceKhrIdx, //!< D3D11 + InteropUserSyncIdx, //!< Interop user sync 
enabled + D3D9DeviceKhrIdx, //!< d3d9 device + D3D9DeviceEXKhrIdx, //!< d3d9EX device + D3D9DeviceVAKhrIdx, //!< d3d9VA device + EGLDeviceKhrIdx, //!< EGL device + LastDeviceFlagIdx + }; - enum Flags - { - GLDeviceKhr = 1 << GLDeviceKhrIdx, //!< GL - D3D10DeviceKhr = 1 << D3D10DeviceKhrIdx, //!< D3D10 - OfflineDevices = 1 << OfflineDevicesIdx, //!< Offline devices - CommandIntercept = 1 << CommandInterceptIdx, //!< Command intercept - D3D11DeviceKhr = 1 << D3D11DeviceKhrIdx, //!< D3D11 - InteropUserSync = 1 << InteropUserSyncIdx, //!< Interop user sync enabled - D3D9DeviceKhr = 1 << D3D9DeviceKhrIdx, //!< d3d9 device - D3D9DeviceEXKhr = 1 << D3D9DeviceEXKhrIdx, //!< d3d9EX device - D3D9DeviceVAKhr = 1 << D3D9DeviceVAKhrIdx, //!< d3d9VA device - EGLDeviceKhr = 1 << EGLDeviceKhrIdx, //!< EGL device - }; + enum Flags { + GLDeviceKhr = 1 << GLDeviceKhrIdx, //!< GL + D3D10DeviceKhr = 1 << D3D10DeviceKhrIdx, //!< D3D10 + OfflineDevices = 1 << OfflineDevicesIdx, //!< Offline devices + CommandIntercept = 1 << CommandInterceptIdx, //!< Command intercept + D3D11DeviceKhr = 1 << D3D11DeviceKhrIdx, //!< D3D11 + InteropUserSync = 1 << InteropUserSyncIdx, //!< Interop user sync enabled + D3D9DeviceKhr = 1 << D3D9DeviceKhrIdx, //!< d3d9 device + D3D9DeviceEXKhr = 1 << D3D9DeviceEXKhrIdx, //!< d3d9EX device + D3D9DeviceVAKhr = 1 << D3D9DeviceVAKhrIdx, //!< d3d9VA device + EGLDeviceKhr = 1 << EGLDeviceKhrIdx, //!< EGL device + }; - //! Context info structure - struct Info - { - uint flags_; //!< Context info flags - void* hDev_[LastDeviceFlagIdx]; //!< Device object reference - void* hCtx_; //!< Context object reference - size_t propertiesSize_;//!< Size of the original properties in bytes - cl_int (CL_CALLBACK * commandIntercept_)(cl_event, cl_int *); - }; + //! 
Context info structure + struct Info { + uint flags_; //!< Context info flags + void* hDev_[LastDeviceFlagIdx]; //!< Device object reference + void* hCtx_; //!< Context object reference + size_t propertiesSize_; //!< Size of the original properties in bytes + cl_int(CL_CALLBACK* commandIntercept_)(cl_event, cl_int*); + }; - struct DeviceQueueInfo - { - DeviceQueue* defDeviceQueue_; //!< Default device queue - uint deviceQueueCnt_; //!< The number of device queues - DeviceQueueInfo(): defDeviceQueue_(NULL), deviceQueueCnt_(0) {} - }; + struct DeviceQueueInfo { + DeviceQueue* defDeviceQueue_; //!< Default device queue + uint deviceQueueCnt_; //!< The number of device queues + DeviceQueueInfo() : defDeviceQueue_(NULL), deviceQueueCnt_(0) {} + }; -private: - // Copying a Context is not allowed - Context(const Context&); - Context& operator = (const Context&); + private: + // Copying a Context is not allowed + Context(const Context&); + Context& operator=(const Context&); -protected: - bool terminate() { - if (Agent::shouldPostContextEvents()) { - Agent::postContextFree(as_cl(this)); - } - return true; + protected: + bool terminate() { + if (Agent::shouldPostContextEvents()) { + Agent::postContextFree(as_cl(this)); } - //! Context destructor - ~Context(); + return true; + } + //! Context destructor + ~Context(); -public: - /*! \brief Helper function to check the context properties and initialize - * context info structure - * - * \return An errcode if invalid, CL_SUCCESS if valid - */ - static int checkProperties( - const cl_context_properties* properties, //!< Properties - Info* info //!< Info structure - ); + public: + /*! \brief Helper function to check the context properties and initialize + * context info structure + * + * \return An errcode if invalid, CL_SUCCESS if valid + */ + static int checkProperties(const cl_context_properties* properties, //!< Properties + Info* info //!< Info structure + ); - //! 
Default constructor - Context( - const std::vector& devices, //!< List of all devices - const Info& info //!< Context info structure - ); + //! Default constructor + Context(const std::vector& devices, //!< List of all devices + const Info& info //!< Context info structure + ); - //! Compare two Context instances. - bool operator == (const Context& rhs) const { return this == &rhs; } - bool operator != (const Context& rhs) const { return !(*this == rhs); } + //! Compare two Context instances. + bool operator==(const Context& rhs) const { return this == &rhs; } + bool operator!=(const Context& rhs) const { return !(*this == rhs); } - /*! Creates the context - * - * \return An errcode if runtime fails the context creation, - * CL_SUCCESS otherwise - */ - int create( - const intptr_t* properties //!< Original context properties - ); + /*! Creates the context + * + * \return An errcode if runtime fails the context creation, + * CL_SUCCESS otherwise + */ + int create(const intptr_t* properties //!< Original context properties + ); - /** - * Allocate host memory using either a custom device allocator or a generic - * OS allocator - * - * @param size Allocation size, in bytes - * @param alignment Desired alignment, in bytes - * @param atomics The buffer should support platform (SVM) atomics - */ - void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const; + /** + * Allocate host memory using either a custom device allocator or a generic + * OS allocator + * + * @param size Allocation size, in bytes + * @param alignment Desired alignment, in bytes + * @param atomics The buffer should support platform (SVM) atomics + */ + void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const; - /** - * Release host memory - * @param ptr Pointer allocated using ::hostAlloc. 
If the pointer has been - * allocated elsewhere, the behavior is undefined - */ - void hostFree(void* ptr) const; + /** + * Release host memory + * @param ptr Pointer allocated using ::hostAlloc. If the pointer has been + * allocated elsewhere, the behavior is undefined + */ + void hostFree(void* ptr) const; - /** - * Allocate SVM buffer - * - * @param size Allocation size, in bytes - * @param alignment Desired alignment, in bytes - * @param flags The flags to create a svm space - */ - void* svmAlloc(size_t size, size_t alignment, cl_svm_mem_flags flags = CL_MEM_READ_WRITE); + /** + * Allocate SVM buffer + * + * @param size Allocation size, in bytes + * @param alignment Desired alignment, in bytes + * @param flags The flags to create a svm space + */ + void* svmAlloc(size_t size, size_t alignment, cl_svm_mem_flags flags = CL_MEM_READ_WRITE); - /** - * Release SVM buffer - * @param ptr Pointer allocated using ::svmAlloc. If the pointer has been - * allocated elsewhere, the behavior is undefined - */ - void svmFree(void* ptr) const; + /** + * Release SVM buffer + * @param ptr Pointer allocated using ::svmAlloc. If the pointer has been + * allocated elsewhere, the behavior is undefined + */ + void svmFree(void* ptr) const; - //! Return the devices associated with this context. - const std::vector& devices() const { return devices_; } + //! Return the devices associated with this context. + const std::vector& devices() const { return devices_; } - //! Return the SVM capable devices associated with this context. - const std::vector& svmDevices() const { return svmAllocDevice_; } + //! Return the SVM capable devices associated with this context. + const std::vector& svmDevices() const { return svmAllocDevice_; } - //! Returns true if the given device is associated with this context. - bool containsDevice(const Device* device) const; + //! Returns true if the given device is associated with this context. + bool containsDevice(const Device* device) const; - //! 
Returns the context info structure - const Info& info() const { return info_; } + //! Returns the context info structure + const Info& info() const { return info_; } - //! Returns a pointer to the original properties - const cl_context_properties* properties() const { return properties_; } + //! Returns a pointer to the original properties + const cl_context_properties* properties() const { return properties_; } - //! Returns a pointer to the OpenGL context - GLFunctions* glenv() const { return glenv_; } + //! Returns a pointer to the OpenGL context + GLFunctions* glenv() const { return glenv_; } - //! RTTI internal implementation - virtual ObjectType objectType() const { return ObjectTypeContext; } + //! RTTI internal implementation + virtual ObjectType objectType() const { return ObjectTypeContext; } - //! Returns context lock for the serialized access to the context - Monitor& lock() { return ctxLock_; } + //! Returns context lock for the serialized access to the context + Monitor& lock() { return ctxLock_; } - //! Returns TRUE if runtime succesfully added a device queue - DeviceQueue* defDeviceQueue(const Device& dev) const; + //! Returns TRUE if runtime succesfully added a device queue + DeviceQueue* defDeviceQueue(const Device& dev) const; - //! Returns TRUE if runtime succesfully added a device queue - bool isDevQueuePossible(const Device& dev); + //! Returns TRUE if runtime succesfully added a device queue + bool isDevQueuePossible(const Device& dev); - //! Returns TRUE if runtime succesfully added a device queue - void addDeviceQueue( - const Device& dev, //!< Device object - DeviceQueue* queue, //!< Device queue - bool defDevQueue //!< Added device queue will be the default queue - ); + //! Returns TRUE if runtime succesfully added a device queue + void addDeviceQueue(const Device& dev, //!< Device object + DeviceQueue* queue, //!< Device queue + bool defDevQueue //!< Added device queue will be the default queue + ); - //! 
Removes a device queue from the list of queues - void removeDeviceQueue( - const Device& dev, //!< Device object - DeviceQueue* queue //!< Device queue - ); + //! Removes a device queue from the list of queues + void removeDeviceQueue(const Device& dev, //!< Device object + DeviceQueue* queue //!< Device queue + ); -private: - const Info info_; //!< Context info structure - cl_context_properties* properties_; //!< Original properties - GLFunctions* glenv_; //!< OpenGL context - Device* customHostAllocDevice_; //!< Device responsible for host allocations - std::vector svmAllocDevice_; //!< Devices can support SVM allocations - std::map deviceQueues_; //!< Device queues mapping - Monitor ctxLock_; //!< Lock for the context access + private: + const Info info_; //!< Context info structure + cl_context_properties* properties_; //!< Original properties + GLFunctions* glenv_; //!< OpenGL context + Device* customHostAllocDevice_; //!< Device responsible for host allocations + std::vector svmAllocDevice_; //!< Devices can support SVM allocations + std::map deviceQueues_; //!< Device queues mapping + Monitor ctxLock_; //!< Lock for the context access }; /*! @} * @} */ -} // namespace amd +} // namespace amd #endif /*CONTEXT_HPP_*/ diff --git a/rocclr/runtime/platform/counter.hpp b/rocclr/runtime/platform/counter.hpp index 97fadc4f89..01258bf043 100644 --- a/rocclr/runtime/platform/counter.hpp +++ b/rocclr/runtime/platform/counter.hpp @@ -20,14 +20,13 @@ namespace amd { * * \brief The container class for the performance counters */ -class Counter : public RuntimeObject -{ -public: - //! RTTI internal implementation - virtual ObjectType objectType() const {return ObjectTypeCounter;} +class Counter : public RuntimeObject { + public: + //! 
RTTI internal implementation + virtual ObjectType objectType() const { return ObjectTypeCounter; } }; /*@}*/ /*@}*/ } // namespace amd -#endif // COUNTERS_HPP_ +#endif // COUNTERS_HPP_ diff --git a/rocclr/runtime/platform/interop.hpp b/rocclr/runtime/platform/interop.hpp index f2668d1ab1..29f6be6c58 100644 --- a/rocclr/runtime/platform/interop.hpp +++ b/rocclr/runtime/platform/interop.hpp @@ -15,36 +15,34 @@ class BufferGL; class D3D10Object; class D3D11Object; class D3D9Object; -#endif //_WIN32 +#endif //_WIN32 //! Base object providing common map/unmap interface for interop objects -class InteropObject -{ -public: - //! Virtual destructor to get rid of linux warning - virtual ~InteropObject() {} +class InteropObject { + public: + //! Virtual destructor to get rid of linux warning + virtual ~InteropObject() {} - // Static cast functions for interop objects - virtual GLObject* asGLObject() { return NULL; } - virtual BufferGL* asBufferGL() { return NULL; } + // Static cast functions for interop objects + virtual GLObject* asGLObject() { return NULL; } + virtual BufferGL* asBufferGL() { return NULL; } #ifdef _WIN32 - virtual D3D10Object* asD3D10Object() { return NULL; } - virtual D3D11Object* asD3D11Object() { return NULL; } - virtual D3D9Object* asD3D9Object() { return NULL; } -#endif //_WIN32 + virtual D3D10Object* asD3D10Object() { return NULL; } + virtual D3D11Object* asD3D11Object() { return NULL; } + virtual D3D9Object* asD3D9Object() { return NULL; } +#endif //_WIN32 - // On acquire copy data from original resource to shared resource - virtual bool copyOrigToShared() { return true; } - // On release copy data from shared copy to the original resource - virtual bool copySharedToOrig() { return true; } + // On acquire copy data from original resource to shared resource + virtual bool copyOrigToShared() { return true; } + // On release copy data from shared copy to the original resource + virtual bool copySharedToOrig() { return true; } - //! 
Mapping functions for interop objects - virtual bool mapExtObjectInCQThread() { return true; } - virtual bool unmapExtObjectInCQThread() { return true; } + //! Mapping functions for interop objects + virtual bool mapExtObjectInCQThread() { return true; } + virtual bool unmapExtObjectInCQThread() { return true; } }; -} // namespace amd - -#endif //!INTEROP_H_ +} // namespace amd +#endif //! INTEROP_H_ diff --git a/rocclr/runtime/platform/kernel.cpp b/rocclr/runtime/platform/kernel.cpp index 6df3ed1910..a4e3ef270c 100644 --- a/rocclr/runtime/platform/kernel.cpp +++ b/rocclr/runtime/platform/kernel.cpp @@ -12,240 +12,216 @@ namespace amd { Kernel::Kernel(Program& program, const Symbol& symbol, const std::string& name) - : program_(program), symbol_(symbol), name_(name) -{ - const KernelSignature& s = signature(); - size_t stackSize = s.paramsSize(); - parameters_ = new (s) KernelParameters(s); - fixme_guarantee(parameters_ != NULL && "out of memory"); - name_ += '\0'; + : program_(program), symbol_(symbol), name_(name) { + const KernelSignature& s = signature(); + size_t stackSize = s.paramsSize(); + parameters_ = new (s) KernelParameters(s); + fixme_guarantee(parameters_ != NULL && "out of memory"); + name_ += '\0'; } -Kernel::~Kernel() -{ - // Release kernel object itself - delete parameters_; +Kernel::~Kernel() { + // Release kernel object itself + delete parameters_; } -const device::Kernel* -Kernel::getDeviceKernel(const Device& device, bool noAlias) const -{ - return symbol_.getDeviceKernel(device, noAlias); +const device::Kernel* Kernel::getDeviceKernel(const Device& device, bool noAlias) const { + return symbol_.getDeviceKernel(device, noAlias); } -const KernelSignature& -Kernel::signature() const -{ - return symbol_.signature(); -} - -bool -KernelParameters::check() -{ - if (validated_) { - return true; - } - - for (size_t i = 0; i < signature_.numParameters(); ++i) { - if (!test(i)) { - return false; - } - } - validated_ = true; +const KernelSignature& 
Kernel::signature() const { return symbol_.signature(); } +bool KernelParameters::check() { + if (validated_) { return true; + } + + for (size_t i = 0; i < signature_.numParameters(); ++i) { + if (!test(i)) { + return false; + } + } + validated_ = true; + + return true; } -size_t -KernelParameters::localMemSize(size_t minDataTypeAlignment) const -{ - size_t memSize = 0; +size_t KernelParameters::localMemSize(size_t minDataTypeAlignment) const { + size_t memSize = 0; - for (size_t i = 0; i < signature_.numParameters(); ++i) { - const KernelParameterDescriptor& desc = signature_.at(i); - if (desc.size_ == 0) { - memSize = alignUp(memSize, minDataTypeAlignment) - + *reinterpret_cast(values_ + desc.offset_); - } + for (size_t i = 0; i < signature_.numParameters(); ++i) { + const KernelParameterDescriptor& desc = signature_.at(i); + if (desc.size_ == 0) { + memSize = alignUp(memSize, minDataTypeAlignment) + + *reinterpret_cast(values_ + desc.offset_); } - return memSize; + } + return memSize; } -void -KernelParameters::set( - size_t index, - size_t size, - const void* value, - bool svmBound) -{ - const KernelParameterDescriptor& desc = signature_.at(index); +void KernelParameters::set(size_t index, size_t size, const void* value, bool svmBound) { + const KernelParameterDescriptor& desc = signature_.at(index); - void* param = values_ + desc.offset_; - assert((desc.type_ == T_POINTER || value != NULL || desc.size_ == 0) && - "not a valid local mem arg"); + void* param = values_ + desc.offset_; + assert((desc.type_ == T_POINTER || value != NULL || desc.size_ == 0) && + "not a valid local mem arg"); - uint32_t uint32_value = 0; - uint64_t uint64_value = 0; + uint32_t uint32_value = 0; + uint64_t uint64_value = 0; - if (desc.type_ == T_POINTER && desc.size_ != 0) { - if (svmBound) { - LP64_SWITCH(uint32_value, uint64_value) = - (LP64_SWITCH(uint32_t, uint64_t)) value; - svmBound_[index] = true; - } - else if ((value == NULL) || (static_cast(value) == NULL)) { - 
LP64_SWITCH(uint32_value, uint64_value) = 0; - } - else { - // convert cl_mem to amd::Memory* - LP64_SWITCH(uint32_value, uint64_value) = - (uintptr_t) as_amd(*static_cast(value)); - } + if (desc.type_ == T_POINTER && desc.size_ != 0) { + if (svmBound) { + LP64_SWITCH(uint32_value, uint64_value) = (LP64_SWITCH(uint32_t, uint64_t))value; + svmBound_[index] = true; + } else if ((value == NULL) || (static_cast(value) == NULL)) { + LP64_SWITCH(uint32_value, uint64_value) = 0; + } else { + // convert cl_mem to amd::Memory* + LP64_SWITCH(uint32_value, uint64_value) = + (uintptr_t)as_amd(*static_cast(value)); } - else if (desc.type_ == T_SAMPLER) { - // convert cl_sampler to amd::Sampler* - amd::Sampler *sampler = as_amd(*static_cast(value)); - LP64_SWITCH(uint32_value, uint64_value) = (uintptr_t) sampler; - } - else if (desc.type_ == T_QUEUE) { - // convert cl_command_queue to amd::DeviceQueue* - amd::DeviceQueue* queue = - as_amd(*static_cast(value))->asDeviceQueue(); - LP64_SWITCH(uint32_value, uint64_value) = (uintptr_t) queue; - } - else switch (desc.size_) { - case 1: uint32_value = *static_cast(value); break; - case 2: uint32_value = *static_cast(value); break; - case 4: uint32_value = *static_cast(value); break; - case 8: uint64_value = *static_cast(value); break; - default: break; - } - + } else if (desc.type_ == T_SAMPLER) { + // convert cl_sampler to amd::Sampler* + amd::Sampler* sampler = as_amd(*static_cast(value)); + LP64_SWITCH(uint32_value, uint64_value) = (uintptr_t)sampler; + } else if (desc.type_ == T_QUEUE) { + // convert cl_command_queue to amd::DeviceQueue* + amd::DeviceQueue* queue = as_amd(*static_cast(value))->asDeviceQueue(); + LP64_SWITCH(uint32_value, uint64_value) = (uintptr_t)queue; + } else switch (desc.size_) { - case 0 /*local mem*/ : *static_cast(param) = size; break; - case sizeof(uint32_t): *static_cast(param) = uint32_value; break; - case sizeof(uint64_t): *static_cast(param) = uint64_value; break; - default: ::memcpy(param, value, 
size); break; + case 1: + uint32_value = *static_cast(value); + break; + case 2: + uint32_value = *static_cast(value); + break; + case 4: + uint32_value = *static_cast(value); + break; + case 8: + uint64_value = *static_cast(value); + break; + default: + break; } - defined_[index] = true; + switch (desc.size_) { + case 0 /*local mem*/: + *static_cast(param) = size; + break; + case sizeof(uint32_t): + *static_cast(param) = uint32_value; + break; + case sizeof(uint64_t): + *static_cast(param) = uint64_value; + break; + default: + ::memcpy(param, value, size); + break; + } + + defined_[index] = true; } -address -KernelParameters::capture(const Device& device) -{ - const size_t stackSize = signature_.paramsSize(); - //! Information about which arguments are SVM pointers is stored after - // the actual parameters, but only if the device has any SVM capability - const size_t svmInfoSize = device.info().svmCapabilities_ ? - signature_.numParameters() * sizeof(bool) : 0; - const size_t execInfoSize = getNumberOfSvmPtr() * sizeof(void*); - address mem = (address) AlignedMemory::allocate( - stackSize + svmInfoSize + execInfoSize, PARAMETERS_MIN_ALIGNMENT); +address KernelParameters::capture(const Device& device) { + const size_t stackSize = signature_.paramsSize(); + //! Information about which arguments are SVM pointers is stored after + // the actual parameters, but only if the device has any SVM capability + const size_t svmInfoSize = + device.info().svmCapabilities_ ? 
signature_.numParameters() * sizeof(bool) : 0; + const size_t execInfoSize = getNumberOfSvmPtr() * sizeof(void*); + address mem = (address)AlignedMemory::allocate(stackSize + svmInfoSize + execInfoSize, + PARAMETERS_MIN_ALIGNMENT); - address last = mem + stackSize; - if (mem != NULL) { - ::memcpy(mem, values_, stackSize); - - for (size_t i = 0; i < signature_.numParameters(); ++i) { - const KernelParameterDescriptor& desc = signature_.at(i); - if (desc.type_ == T_POINTER && desc.size_ != 0 && !svmBound_[i]) { - Memory* memArg = *(Memory**)(mem + desc.offset_); - if (memArg != NULL) { - memArg->retain(); - } - } - else if (desc.type_ == T_SAMPLER) { - Sampler* samplerArg = *(Sampler**)(mem + desc.offset_); - if (samplerArg != NULL) { - samplerArg->retain(); - } - } - else if (desc.type_ == T_QUEUE) { - DeviceQueue* queue = *(DeviceQueue**)(mem + desc.offset_); - if (queue != NULL) { - queue->retain(); - } - } - } - ::memcpy(last, svmBound_, svmInfoSize); - last += svmInfoSize; - - if (0 != execInfoSize) { - ::memcpy(last, &execSvmPtr_[0], execInfoSize); - } - execInfoOffset_ = stackSize + svmInfoSize; - } - - return mem; -} - -bool -KernelParameters::boundToSvmPointer(const Device& device, - const_address capturedParameter, - size_t index) const -{ - if (!device.info().svmCapabilities_) { - return false; - } - //! Information about which arguments are SVM pointers is stored after - // actual parameters - const bool* svmBound = reinterpret_cast(capturedParameter + - signature_.paramsSize()); - return svmBound[index]; -} - -void -KernelParameters::release(address mem, const amd::Device& device) const -{ - if (mem == NULL) { - // nothing to do! 
- return; - } + address last = mem + stackSize; + if (mem != NULL) { + ::memcpy(mem, values_, stackSize); for (size_t i = 0; i < signature_.numParameters(); ++i) { - const KernelParameterDescriptor& desc = signature_.at(i); - if (desc.type_ == T_POINTER && desc.size_ != 0 && - !boundToSvmPointer(device, mem, i)) { - Memory* memArg = *(Memory**)(mem + desc.offset_); - if (memArg != NULL) { - memArg->release(); - } + const KernelParameterDescriptor& desc = signature_.at(i); + if (desc.type_ == T_POINTER && desc.size_ != 0 && !svmBound_[i]) { + Memory* memArg = *(Memory**)(mem + desc.offset_); + if (memArg != NULL) { + memArg->retain(); } - else if (desc.type_ == T_SAMPLER) { - Sampler* samplerArg = *(Sampler**)(mem + desc.offset_); - if (samplerArg != NULL) { - samplerArg->release(); - } + } else if (desc.type_ == T_SAMPLER) { + Sampler* samplerArg = *(Sampler**)(mem + desc.offset_); + if (samplerArg != NULL) { + samplerArg->retain(); } - else if (desc.type_ == T_QUEUE) { - DeviceQueue* queue = *(DeviceQueue**)(mem + desc.offset_); - if (queue != NULL) { - queue->release(); - } + } else if (desc.type_ == T_QUEUE) { + DeviceQueue* queue = *(DeviceQueue**)(mem + desc.offset_); + if (queue != NULL) { + queue->retain(); } + } } + ::memcpy(last, svmBound_, svmInfoSize); + last += svmInfoSize; - AlignedMemory::deallocate(mem); + if (0 != execInfoSize) { + ::memcpy(last, &execSvmPtr_[0], execInfoSize); + } + execInfoOffset_ = stackSize + svmInfoSize; + } + + return mem; +} + +bool KernelParameters::boundToSvmPointer(const Device& device, const_address capturedParameter, + size_t index) const { + if (!device.info().svmCapabilities_) { + return false; + } + //! 
Information about which arguments are SVM pointers is stored after + // actual parameters + const bool* svmBound = reinterpret_cast(capturedParameter + signature_.paramsSize()); + return svmBound[index]; +} + +void KernelParameters::release(address mem, const amd::Device& device) const { + if (mem == NULL) { + // nothing to do! + return; + } + + for (size_t i = 0; i < signature_.numParameters(); ++i) { + const KernelParameterDescriptor& desc = signature_.at(i); + if (desc.type_ == T_POINTER && desc.size_ != 0 && !boundToSvmPointer(device, mem, i)) { + Memory* memArg = *(Memory**)(mem + desc.offset_); + if (memArg != NULL) { + memArg->release(); + } + } else if (desc.type_ == T_SAMPLER) { + Sampler* samplerArg = *(Sampler**)(mem + desc.offset_); + if (samplerArg != NULL) { + samplerArg->release(); + } + } else if (desc.type_ == T_QUEUE) { + DeviceQueue* queue = *(DeviceQueue**)(mem + desc.offset_); + if (queue != NULL) { + queue->release(); + } + } + } + + AlignedMemory::deallocate(mem); } -KernelSignature::KernelSignature( - const std::vector& params, - const std::string& attrib) - : params_(params), paramsSize_(0) - , attributes_(attrib) -{ - if (params.size() > 0) { - KernelParameterDescriptor last = params.back(); +KernelSignature::KernelSignature(const std::vector& params, + const std::string& attrib) + : params_(params), paramsSize_(0), attributes_(attrib) { + if (params.size() > 0) { + KernelParameterDescriptor last = params.back(); - size_t lastSize = last.size_; - if (lastSize == 0 /* local mem */) { - lastSize = sizeof(cl_mem); - } - paramsSize_ = last.offset_ + alignUp(lastSize, sizeof(intptr_t)); + size_t lastSize = last.size_; + if (lastSize == 0 /* local mem */) { + lastSize = sizeof(cl_mem); } + paramsSize_ = last.offset_ + alignUp(lastSize, sizeof(intptr_t)); + } } -} // namespace amd - +} // namespace amd diff --git a/rocclr/runtime/platform/kernel.hpp b/rocclr/runtime/platform/kernel.hpp index 62916676cc..a1e75f89f6 100644 --- 
a/rocclr/runtime/platform/kernel.hpp +++ b/rocclr/runtime/platform/kernel.hpp @@ -16,9 +16,9 @@ #include "device/device.hpp" enum FGSStatus { - FGS_DEFAULT, //!< The default kernel fine-grained system pointer support - FGS_NO, //!< no support of kernel fine-grained system pointer - FGS_YES //!< have support of kernel fine-grained system pointer + FGS_DEFAULT, //!< The default kernel fine-grained system pointer support + FGS_NO, //!< no support of kernel fine-grained system pointer + FGS_YES //!< have support of kernel fine-grained system pointer }; namespace amd { @@ -33,209 +33,199 @@ class Program; * @{ */ -class KernelSignature : public HeapObject -{ -private: - std::vector params_; - size_t paramsSize_; - std::string attributes_; //!< The kernel attributes +class KernelSignature : public HeapObject { + private: + std::vector params_; + size_t paramsSize_; + std::string attributes_; //!< The kernel attributes -public: - //! Default constructor - KernelSignature() : paramsSize_(0) { } + public: + //! Default constructor + KernelSignature() : paramsSize_(0) {} - //! Construct a new signature. - KernelSignature( - const std::vector& params, - const std::string& attrib - ); + //! Construct a new signature. + KernelSignature(const std::vector& params, const std::string& attrib); - //! Return the number of parameters - size_t numParameters() const { return params_.size(); } + //! Return the number of parameters + size_t numParameters() const { return params_.size(); } - //! Return the parameter descriptor at the given index. - const KernelParameterDescriptor& at(size_t index) const - { - assert(index < params_.size() && "index is out of bounds"); - return params_[index]; - } + //! Return the parameter descriptor at the given index. + const KernelParameterDescriptor& at(size_t index) const { + assert(index < params_.size() && "index is out of bounds"); + return params_[index]; + } - //! Return the size in bytes required for the arguments on the stack. 
- size_t paramsSize() const { return paramsSize_; } + //! Return the size in bytes required for the arguments on the stack. + size_t paramsSize() const { return paramsSize_; } - //! Return the kernel attributes - const std::string& attributes() const { return attributes_; } + //! Return the kernel attributes + const std::string& attributes() const { return attributes_; } }; // @todo: look into a copy-on-write model instead of copy-on-read. // -class KernelParameters : protected HeapObject -{ -private: - //! The signature describing these parameters. - const KernelSignature& signature_; +class KernelParameters : protected HeapObject { + private: + //! The signature describing these parameters. + const KernelSignature& signature_; - address values_; //!< pointer to the base of the values stack. - bool* defined_; //!< pointer to the isDefined flags. - bool* svmBound_; //!< True at 'i' if parameter 'i' is bound to SVM pointer - size_t execInfoOffset_; //!< The offset of execInfo - std::vector execSvmPtr_; //!< The non argument svm pointers for kernel - FGSStatus svmSystemPointersSupport_; //!< The flag for the status of the kernel - // support of fine-grain system sharing. - struct - { - uint32_t validated_ : 1; //!< True if all parameters are defined. - uint32_t execNewVcop_ : 1; //!< special new VCOP for kernel execution - uint32_t execPfpaVcop_ : 1; //!< special PFPA VCOP for kernel execution - uint32_t unused : 29; //!< unused - }; + address values_; //!< pointer to the base of the values stack. + bool* defined_; //!< pointer to the isDefined flags. + bool* svmBound_; //!< True at 'i' if parameter 'i' is bound to SVM pointer + size_t execInfoOffset_; //!< The offset of execInfo + std::vector execSvmPtr_; //!< The non argument svm pointers for kernel + FGSStatus svmSystemPointersSupport_; //!< The flag for the status of the kernel + // support of fine-grain system sharing. + struct { + uint32_t validated_ : 1; //!< True if all parameters are defined. 
+ uint32_t execNewVcop_ : 1; //!< special new VCOP for kernel execution + uint32_t execPfpaVcop_ : 1; //!< special PFPA VCOP for kernel execution + uint32_t unused : 29; //!< unused + }; -public: + public: + //! Construct a new instance of parameters for the given signature. + KernelParameters(const KernelSignature& signature) + : signature_(signature), + execInfoOffset_(0), + svmSystemPointersSupport_(FGS_DEFAULT), + validated_(0), + execNewVcop_(0), + execPfpaVcop_(0) { + values_ = (address) this + alignUp(sizeof(KernelParameters), 16); + defined_ = (bool*)(values_ + signature.paramsSize()); + svmBound_ = (bool*)((address)defined_ + signature.numParameters() * sizeof(bool)); - //! Construct a new instance of parameters for the given signature. - KernelParameters(const KernelSignature& signature) : - signature_(signature), execInfoOffset_(0), svmSystemPointersSupport_(FGS_DEFAULT), - validated_(0), execNewVcop_(0), execPfpaVcop_(0) - { - values_ = (address) this + alignUp(sizeof(KernelParameters), 16); - defined_ = (bool*) (values_ + signature.paramsSize()); - svmBound_ = (bool*) ((address) defined_ + signature.numParameters() * sizeof(bool)); + address limit = (address)&svmBound_[signature.numParameters()]; + ::memset(values_, '\0', limit - values_); + } - address limit = (address) &svmBound_[signature.numParameters()]; - ::memset(values_, '\0', limit - values_); + //! Reset the parameter at the given \a index (becomes undefined). + void reset(size_t index) { + defined_[index] = false; + svmBound_[index] = false; + validated_ = 0; + } + //! Set the parameter at the given \a index to the value pointed by \a value + // \a svmBound indicates that \a value is a SVM pointer. + void set(size_t index, size_t size, const void* value, bool svmBound = false); + + //! Return true if the parameter at the given \a index is defined. + bool test(size_t index) const { return defined_[index]; } + + //! Return true if all the parameters have been defined. + bool check(); + + //! 
The amount of memory required for local memory needed + size_t localMemSize(size_t minDataTypeAlignment) const; + + //! Capture the state of the parameters and return the stack base pointer. + address capture(const Device& device); + //! Release the captured state of the parameters. + void release(address parameters, const amd::Device& device) const; + + //! Allocate memory for this instance as well as the required storage for + // the values_, defined_, and svmBound_ arrays. + void* operator new(size_t size, const KernelSignature& signature) { + size_t requiredSize = + alignUp(size, 16) + signature.paramsSize() + signature.numParameters() * sizeof(bool) * 2; + return AlignedMemory::allocate(requiredSize, PARAMETERS_MIN_ALIGNMENT); + } + //! Deallocate the memory reserved for this instance. + void operator delete(void* ptr) { AlignedMemory::deallocate(ptr); } + + //! Deallocate the memory reserved for this instance, + // matching overloaded operator new. + void operator delete(void* ptr, const KernelSignature& signature) { + AlignedMemory::deallocate(ptr); + } + + //! Returns raw kernel parameters without capture + address values() const { return values_; } + + //! Return true if the captured parameter at the given \a index is bound to + // SVM pointer. + bool boundToSvmPointer(const Device& device, const_address capturedAddress, size_t index) const; + //! add the svmPtr execInfo into container + void addSvmPtr(void* const* execInfoArray, size_t count) { + execSvmPtr_.clear(); + for (size_t i = 0; i < count; i++) { + execSvmPtr_.push_back(execInfoArray[i]); } + } + //! get the number of svmPtr in the execInfo container + size_t getNumberOfSvmPtr() const { return execSvmPtr_.size(); } - //! Reset the parameter at the given \a index (becomes undefined). - void reset(size_t index) - { - defined_[index] = false; - svmBound_[index] = false; - validated_ = 0; - } - //! 
Set the parameter at the given \a index to the value pointed by \a value - // \a svmBound indicates that \a value is a SVM pointer. - void set(size_t index, size_t size, const void* value, bool svmBound = false); + //! get the number of svmPtr in the execInfo container + size_t getExecInfoOffset() const { return execInfoOffset_; } - //! Return true if the parameter at the given \a index is defined. - bool test(size_t index) const { return defined_[index]; } + //! set the status of kernel support fine-grained SVM system pointer sharing + void setSvmSystemPointersSupport(FGSStatus svmSystemSupport) { + svmSystemPointersSupport_ = svmSystemSupport; + } - //! Return true if all the parameters have been defined. - bool check(); + //! return the status of kernel support fine-grained SVM system pointer sharing + FGSStatus getSvmSystemPointersSupport() const { return svmSystemPointersSupport_; } - //! The amount of memory required for local memory needed - size_t localMemSize(size_t minDataTypeAlignment) const; + //! set the new VCOP in the execInfo container + void setExecNewVcop(const bool newVcop) { execNewVcop_ = (newVcop == true); } - //! Capture the state of the parameters and return the stack base pointer. - address capture(const Device& device); - //! Release the captured state of the parameters. - void release(address parameters, const amd::Device& device) const; + //! set the PFPA VCOP in the execInfo container + void setExecPfpaVcop(const bool pfpaVcop) { execPfpaVcop_ = (pfpaVcop == true); } - //! Allocate memory for this instance as well as the required storage for - // the values_, defined_, and svmBound_ arrays. - void* operator new(size_t size, const KernelSignature& signature) - { - size_t requiredSize = alignUp(size, 16) - + signature.paramsSize() - + signature.numParameters() * sizeof(bool) * 2; - return AlignedMemory::allocate(requiredSize, PARAMETERS_MIN_ALIGNMENT); - } - //! Deallocate the memory reserved for this instance. 
- void operator delete(void * ptr) { AlignedMemory::deallocate(ptr); } + //! get the new VCOP in the execInfo container + bool getExecNewVcop() const { return (execNewVcop_ == 1); } - //! Deallocate the memory reserved for this instance, - // matching overloaded operator new. - void operator delete(void * ptr, const KernelSignature& signature) - { AlignedMemory::deallocate(ptr); } - - //! Returns raw kernel parameters without capture - address values() const { return values_; } - - //! Return true if the captured parameter at the given \a index is bound to - // SVM pointer. - bool boundToSvmPointer(const Device& device, - const_address capturedAddress, - size_t index) const; - //! add the svmPtr execInfo into container - void addSvmPtr(void* const* execInfoArray, size_t count) - { - execSvmPtr_.clear(); - for (size_t i = 0; i < count; i++) { - execSvmPtr_.push_back(execInfoArray[i]); - } - } - //! get the number of svmPtr in the execInfo container - size_t getNumberOfSvmPtr() const {return execSvmPtr_.size();} - - //! get the number of svmPtr in the execInfo container - size_t getExecInfoOffset() const {return execInfoOffset_;} - - //! set the status of kernel support fine-grained SVM system pointer sharing - void setSvmSystemPointersSupport(FGSStatus svmSystemSupport) { svmSystemPointersSupport_ = svmSystemSupport; } - - //! return the status of kernel support fine-grained SVM system pointer sharing - FGSStatus getSvmSystemPointersSupport() const { return svmSystemPointersSupport_; } - - //! set the new VCOP in the execInfo container - void setExecNewVcop(const bool newVcop) { execNewVcop_ = (newVcop == true); } - - //! set the PFPA VCOP in the execInfo container - void setExecPfpaVcop(const bool pfpaVcop) { execPfpaVcop_ = (pfpaVcop == true); } - - //! get the new VCOP in the execInfo container - bool getExecNewVcop() const { return (execNewVcop_ == 1); } - - //! 
get the PFPA VCOP in the execInfo container - bool getExecPfpaVcop() const { return (execPfpaVcop_ == 1); } + //! get the PFPA VCOP in the execInfo container + bool getExecPfpaVcop() const { return (execPfpaVcop_ == 1); } }; /*! \brief Encapsulates a __kernel function and the argument values * to be used when invoking this function. */ -class Kernel : public RuntimeObject -{ -private: - //! The program where this kernel is defined. - SharedReference program_; +class Kernel : public RuntimeObject { + private: + //! The program where this kernel is defined. + SharedReference program_; - const Symbol& symbol_; //!< The symbol for this kernel. - std::string name_; //!< The kernel's name. - KernelParameters* parameters_; //!< The parameters. + const Symbol& symbol_; //!< The symbol for this kernel. + std::string name_; //!< The kernel's name. + KernelParameters* parameters_; //!< The parameters. -protected: - //! Destroy this kernel - ~Kernel(); + protected: + //! Destroy this kernel + ~Kernel(); -public: - /*! \brief Construct a kernel object from the __kernel function - * \a kernelName in the given \a program. - */ - Kernel(Program& program, const Symbol& symbol, const std::string& name); + public: + /*! \brief Construct a kernel object from the __kernel function + * \a kernelName in the given \a program. + */ + Kernel(Program& program, const Symbol& symbol, const std::string& name); - //! Return the program containing this kernel. - Program& program() const { return program_(); } + //! Return the program containing this kernel. + Program& program() const { return program_(); } - //! Return this kernel's signature. - const KernelSignature& signature() const; + //! Return this kernel's signature. + const KernelSignature& signature() const; - //! Return the kernel entry point for the given device. - const device::Kernel* getDeviceKernel( - const Device& device, //!< Device object - bool noAlias = true //!< Controls alias optimization - ) const; + //! 
Return the kernel entry point for the given device. + const device::Kernel* getDeviceKernel(const Device& device, //!< Device object + bool noAlias = true //!< Controls alias optimization + ) const; - //! Return the parameters. - KernelParameters& parameters() const { return *parameters_; } + //! Return the parameters. + KernelParameters& parameters() const { return *parameters_; } - //! Return the kernel's name. - const std::string& name() const { return name_; } + //! Return the kernel's name. + const std::string& name() const { return name_; } - virtual ObjectType objectType() const {return ObjectTypeKernel;} + virtual ObjectType objectType() const { return ObjectTypeKernel; } }; /*! @} * @} */ -} // namespace amd +} // namespace amd #endif /*KERNEL_HPP_*/ diff --git a/rocclr/runtime/platform/memory.cpp b/rocclr/runtime/platform/memory.cpp index 7c409339f7..1cf25008f5 100644 --- a/rocclr/runtime/platform/memory.cpp +++ b/rocclr/runtime/platform/memory.cpp @@ -12,821 +12,675 @@ namespace amd { -bool -BufferRect::create( - const size_t* bufferOrigin, - const size_t* region, - size_t bufferRowPitch, - size_t bufferSlicePitch) -{ - bool valid = false; - // Find the buffer's row pitch - rowPitch_ = (bufferRowPitch != 0) ? bufferRowPitch : region[0]; - // Find the buffer's slice pitch - slicePitch_ = (bufferSlicePitch != 0) ? 
bufferSlicePitch : - rowPitch_ * region[1]; - // Find the region start offset - start_ = bufferOrigin[2] * slicePitch_ + - bufferOrigin[1] * rowPitch_ + bufferOrigin[0]; - // Find the region relative end offset - end_ = (region[2] - 1) * slicePitch_ + (region[1] - 1) * rowPitch_ + region[0]; - // Make sure we have a valid region - if ((rowPitch_ >= region[0]) && - (slicePitch_ >= (region[1] * rowPitch_)) && - ((slicePitch_ % rowPitch_) == 0)) { - valid = true; - } - return valid; +bool BufferRect::create(const size_t* bufferOrigin, const size_t* region, size_t bufferRowPitch, + size_t bufferSlicePitch) { + bool valid = false; + // Find the buffer's row pitch + rowPitch_ = (bufferRowPitch != 0) ? bufferRowPitch : region[0]; + // Find the buffer's slice pitch + slicePitch_ = (bufferSlicePitch != 0) ? bufferSlicePitch : rowPitch_ * region[1]; + // Find the region start offset + start_ = bufferOrigin[2] * slicePitch_ + bufferOrigin[1] * rowPitch_ + bufferOrigin[0]; + // Find the region relative end offset + end_ = (region[2] - 1) * slicePitch_ + (region[1] - 1) * rowPitch_ + region[0]; + // Make sure we have a valid region + if ((rowPitch_ >= region[0]) && (slicePitch_ >= (region[1] * rowPitch_)) && + ((slicePitch_ % rowPitch_) == 0)) { + valid = true; + } + return valid; } -bool -HostMemoryReference::allocateMemory(size_t size, const Context& context) { - assert(!alloced_ && "Runtime should not reallocate system memory!"); - size_t memoryAlignment = ( CPU_MEMORY_ALIGNMENT_SIZE <= 0 ) ? 256 : CPU_MEMORY_ALIGNMENT_SIZE; - size_ = amd::alignUp(size, memoryAlignment); - //! \note memory size must be aligned for CAL pinning - hostMem_ = CPU_MEMORY_GUARD_PAGES - ? 
GuardedMemory::allocate(size_, MEMOBJ_BASE_ADDR_ALIGN, CPU_MEMORY_GUARD_PAGE_SIZE * Ki) - : context.hostAlloc(size_, MEMOBJ_BASE_ADDR_ALIGN); - alloced_ = (hostMem_ != NULL); - return alloced_; +bool HostMemoryReference::allocateMemory(size_t size, const Context& context) { + assert(!alloced_ && "Runtime should not reallocate system memory!"); + size_t memoryAlignment = (CPU_MEMORY_ALIGNMENT_SIZE <= 0) ? 256 : CPU_MEMORY_ALIGNMENT_SIZE; + size_ = amd::alignUp(size, memoryAlignment); + //! \note memory size must be aligned for CAL pinning + hostMem_ = CPU_MEMORY_GUARD_PAGES + ? GuardedMemory::allocate(size_, MEMOBJ_BASE_ADDR_ALIGN, CPU_MEMORY_GUARD_PAGE_SIZE * Ki) + : context.hostAlloc(size_, MEMOBJ_BASE_ADDR_ALIGN); + alloced_ = (hostMem_ != NULL); + return alloced_; } // Frees system memory if it was allocated -void -HostMemoryReference::deallocateMemory(const Context& context) -{ - if (alloced_) { - if (CPU_MEMORY_GUARD_PAGES) GuardedMemory::deallocate(hostMem_); - else context.hostFree(hostMem_); - size_ = 0; - alloced_ = false; - hostMem_ = NULL; - } +void HostMemoryReference::deallocateMemory(const Context& context) { + if (alloced_) { + if (CPU_MEMORY_GUARD_PAGES) + GuardedMemory::deallocate(hostMem_); + else + context.hostFree(hostMem_); + size_ = 0; + alloced_ = false; + hostMem_ = NULL; + } } -Memory::Memory( - Context& context, - Type type, - Flags flags, - size_t size, - void* svmPtr) - : numDevices_(0) - , deviceMemories_(NULL) - , destructorCallbacks_(NULL) - , context_(context) - , parent_(NULL) - , type_(type) - , hostMemRef_(NULL) - , origin_(0) - , size_(size) - , flags_(flags) - , version_(0) - , lastWriter_(NULL) - , interopObj_(NULL) - , isParent_(false) - , vDev_(NULL) - , forceSysMemAlloc_(false) - , mapCount_(0) - , svmHostAddress_(svmPtr) - , svmPtrCommited_(false) - , canBeCached_(true) - , lockMemoryOps_("Memory Ops Lock", true) -{ -} +Memory::Memory(Context& context, Type type, Flags flags, size_t size, void* svmPtr) + : numDevices_(0), + 
deviceMemories_(NULL), + destructorCallbacks_(NULL), + context_(context), + parent_(NULL), + type_(type), + hostMemRef_(NULL), + origin_(0), + size_(size), + flags_(flags), + version_(0), + lastWriter_(NULL), + interopObj_(NULL), + isParent_(false), + vDev_(NULL), + forceSysMemAlloc_(false), + mapCount_(0), + svmHostAddress_(svmPtr), + svmPtrCommited_(false), + canBeCached_(true), + lockMemoryOps_("Memory Ops Lock", true) {} -Memory::Memory( - Memory& parent, - Flags flags, - size_t origin, - size_t size, - Type type) - : numDevices_(0) - , deviceMemories_(NULL) - , destructorCallbacks_(NULL) - , context_(parent.getContext()) - , parent_(&parent) - , type_((type == 0) ? parent.type_ : type) - , hostMemRef_(NULL) - , origin_(origin) - , size_(size) - , flags_(flags) - , version_(parent.getVersion()) - , lastWriter_(parent.getLastWriter()) - , interopObj_(parent.getInteropObj()) - , isParent_(false) - , vDev_(NULL) - , forceSysMemAlloc_(false) - , mapCount_(0) - , svmHostAddress_(parent.getSvmPtr()) - , svmPtrCommited_(parent.isSvmPtrCommited()) - , canBeCached_(true) - , lockMemoryOps_("Memory Ops Lock", true) -{ - parent_->retain(); - parent_->isParent_ = true; +Memory::Memory(Memory& parent, Flags flags, size_t origin, size_t size, Type type) + : numDevices_(0), + deviceMemories_(NULL), + destructorCallbacks_(NULL), + context_(parent.getContext()), + parent_(&parent), + type_((type == 0) ? parent.type_ : type), + hostMemRef_(NULL), + origin_(origin), + size_(size), + flags_(flags), + version_(parent.getVersion()), + lastWriter_(parent.getLastWriter()), + interopObj_(parent.getInteropObj()), + isParent_(false), + vDev_(NULL), + forceSysMemAlloc_(false), + mapCount_(0), + svmHostAddress_(parent.getSvmPtr()), + svmPtrCommited_(parent.isSvmPtrCommited()), + canBeCached_(true), + lockMemoryOps_("Memory Ops Lock", true) { + parent_->retain(); + parent_->isParent_ = true; - if (parent.getHostMem() != nullptr) { - setHostMem(reinterpret_cast
(parent.getHostMem()) + origin); - } - // Inherit memory flags from the parent - if ((flags_ & (CL_MEM_READ_WRITE | CL_MEM_READ_ONLY | - CL_MEM_WRITE_ONLY)) == 0) { - flags_ |= parent_->getMemFlags() & - (CL_MEM_READ_WRITE | CL_MEM_READ_ONLY | CL_MEM_WRITE_ONLY); - } + if (parent.getHostMem() != nullptr) { + setHostMem(reinterpret_cast
(parent.getHostMem()) + origin); + } + // Inherit memory flags from the parent + if ((flags_ & (CL_MEM_READ_WRITE | CL_MEM_READ_ONLY | CL_MEM_WRITE_ONLY)) == 0) { + flags_ |= parent_->getMemFlags() & (CL_MEM_READ_WRITE | CL_MEM_READ_ONLY | CL_MEM_WRITE_ONLY); + } + flags_ |= + parent_->getMemFlags() & (CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR | CL_MEM_USE_HOST_PTR); + + if ((flags_ & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) == 0) { flags_ |= parent_->getMemFlags() & - (CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR | CL_MEM_USE_HOST_PTR); - - if ((flags_ & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | - CL_MEM_HOST_NO_ACCESS)) == 0) { - flags_ |= parent_->getMemFlags() & - (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | - CL_MEM_HOST_NO_ACCESS); - } + (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS); + } } -void -Memory::initDeviceMemory() -{ - deviceMemories_ = reinterpret_cast( - reinterpret_cast(this) + sizeof(Memory)); - memset(deviceMemories_, 0, - context_().devices().size() * sizeof(DeviceMemory)); +void Memory::initDeviceMemory() { + deviceMemories_ = reinterpret_cast(reinterpret_cast(this) + sizeof(Memory)); + memset(deviceMemories_, 0, context_().devices().size() * sizeof(DeviceMemory)); } -void* -Memory::operator new(size_t size, const Context& context) -{ - return RuntimeObject::operator new( - size + context.devices().size() * sizeof(DeviceMemory)); +void* Memory::operator new(size_t size, const Context& context) { + return RuntimeObject::operator new(size + context.devices().size() * sizeof(DeviceMemory)); } -void -Memory::operator delete(void* p) -{ - RuntimeObject::operator delete(p); +void Memory::operator delete(void* p) { RuntimeObject::operator delete(p); } + +void Memory::operator delete(void* p, const Context& context) { Memory::operator delete(p); } + + +void Memory::addSubBuffer(Memory* view) { + amd::ScopedLock lock(lockMemoryOps()); + subBuffers_.push_back(view); } -void 
-Memory::operator delete(void* p, const Context& context) -{ - Memory::operator delete(p); +void Memory::removeSubBuffer(Memory* view) { + amd::ScopedLock lock(lockMemoryOps()); + subBuffers_.remove(view); } +bool Memory::allocHostMemory(void* initFrom, bool allocHostMem, bool forceCopy) { + // Sanity checks (the parameters should have been prevalidated by the API) + assert(!(flags_ & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR) && (initFrom == NULL) && + !allocHostMem && !isSvmPtrCommited())); + assert( + !((initFrom != NULL) && !forceCopy && + !(flags_ & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR | CL_MEM_EXTERNAL_PHYSICAL_AMD)))); + assert(!(flags_ & CL_MEM_COPY_HOST_PTR && flags_ & CL_MEM_USE_HOST_PTR)); -void -Memory::addSubBuffer(Memory* view) -{ - amd::ScopedLock lock(lockMemoryOps()); - subBuffers_.push_back(view); + const std::vector& devices = context_().devices(); + + // Find if a non GPU device was created with the context + for (size_t i = 0; i < devices.size(); i++) { + if (!(devices[i]->info().type_ & CL_DEVICE_TYPE_GPU)) { + allocHostMem = true; + break; + } + } + + // This allocation is necessary to use coherency mechanism + // for the initialization + if (getMemFlags() & (CL_MEM_COPY_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)) { + allocHostMem = true; + } + + // Did application request to use host memory? 
+ if (getMemFlags() & CL_MEM_USE_HOST_PTR) { + setHostMem(initFrom); + + // Recalculate image size according to pitch + Image* image = asImage(); + if (image != NULL) { + if (image->getDims() < 3) { + size_ = image->getRowPitch() * image->getHeight(); + } else { + size_ = image->getSlicePitch() * image->getDepth(); + } + } + } + // Allocate host memory buffer if needed + else if (allocHostMem && !isInterop()) { + if (!hostMemRef_.allocateMemory(size_, context_())) { + return false; + } + + // Copy data to the backing store if the app has requested + if (((flags_ & CL_MEM_COPY_HOST_PTR) || forceCopy) && (initFrom != NULL)) { + copyToBackingStore(initFrom); + } + } + + if (allocHostMem && type_ == CL_MEM_OBJECT_PIPE) { + // Initialize the pipe for a CPU device + clk_pipe_t* pipe = reinterpret_cast(getHostMem()); + pipe->read_idx = 0; + pipe->write_idx = 0; + pipe->end_idx = asPipe()->getMaxNumPackets(); + } + + if ((flags_ & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR)) && (NULL == lastWriter_)) { + // Signal write, so coherency mechanism will initialize + // memory on all devices + signalWrite(NULL); + } + + return true; } -void -Memory::removeSubBuffer(Memory* view) -{ - amd::ScopedLock lock(lockMemoryOps()); - subBuffers_.remove(view); -} +bool Memory::create(void* initFrom, bool sysMemAlloc) { + static const bool forceAllocHostMem = false; -bool -Memory::allocHostMemory(void* initFrom, bool allocHostMem, bool forceCopy) -{ - // Sanity checks (the parameters should have been prevalidated by the API) - assert(!(flags_ & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR) && - (initFrom == NULL) && !allocHostMem && !isSvmPtrCommited())); - assert(!((initFrom != NULL) && !forceCopy && - !(flags_ & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR | - CL_MEM_EXTERNAL_PHYSICAL_AMD)))); - assert(!(flags_ & CL_MEM_COPY_HOST_PTR && flags_ & CL_MEM_USE_HOST_PTR)); + initDeviceMemory(); - const std::vector& devices = context_().devices(); - - // Find if a non GPU device was created with 
the context - for (size_t i = 0; i < devices.size(); i++) { - if (!(devices[i]->info().type_ & CL_DEVICE_TYPE_GPU)) { - allocHostMem = true; - break; - } + // Check if it's a subbuffer allocation + if (parent_ != NULL) { + // Find host memory pointer for subbuffer + if (parent_->getHostMem() != NULL) { + setHostMem((address)parent_->getHostMem() + origin_); } - // This allocation is necessary to use coherency mechanism - // for the initialization - if (getMemFlags() & (CL_MEM_COPY_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)) { - allocHostMem = true; + // Add a new subbuffer to the list + parent_->addSubBuffer(this); + } + // Allocate host memory if requested + else if (!allocHostMemory(initFrom, forceAllocHostMem)) { + return false; + } + + const std::vector& devices = context_().devices(); + + // Forces system memory allocation on the device, + // instead of device memory + forceSysMemAlloc_ = sysMemAlloc; + + // Create memory on all available devices + for (size_t i = 0; i < devices.size(); i++) { + deviceAlloced_[devices[i]] = AllocInit; + + // Only GPU devices have device memory objects + if (devices[i]->info().type_ & CL_DEVICE_TYPE_GPU) { + deviceMemories_[i].ref_ = devices[i]; + deviceMemories_[i].value_ = NULL; } - // Did application request to use host memory? 
- if (getMemFlags() & CL_MEM_USE_HOST_PTR) { - setHostMem(initFrom); - - // Recalculate image size according to pitch - Image* image = asImage(); - if (image != NULL) { - if (image->getDims() < 3) { - size_ = image->getRowPitch() * image->getHeight(); - } - else { - size_ = image->getSlicePitch() * image->getDepth(); - } - } - } - // Allocate host memory buffer if needed - else if (allocHostMem && !isInterop()) { - if (!hostMemRef_.allocateMemory(size_, context_())) { - return false; - } - - // Copy data to the backing store if the app has requested - if (((flags_ & CL_MEM_COPY_HOST_PTR) || forceCopy) && (initFrom != NULL)) { - copyToBackingStore(initFrom); - } - } - - if (allocHostMem && type_ == CL_MEM_OBJECT_PIPE) - { - // Initialize the pipe for a CPU device - clk_pipe_t* pipe = reinterpret_cast(getHostMem()); - pipe->read_idx = 0; - pipe->write_idx = 0; - pipe->end_idx = asPipe()->getMaxNumPackets(); - } - - if ((flags_ & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR)) && (NULL == lastWriter_)) { - // Signal write, so coherency mechanism will initialize - // memory on all devices - signalWrite(NULL); - } - - return true; -} - -bool -Memory::create(void* initFrom, bool sysMemAlloc) -{ - static const bool forceAllocHostMem = false; - - initDeviceMemory(); - - // Check if it's a subbuffer allocation - if (parent_ != NULL) { - // Find host memory pointer for subbuffer - if (parent_->getHostMem() != NULL) { - setHostMem((address)parent_->getHostMem() + origin_); - } - - // Add a new subbuffer to the list - parent_->addSubBuffer(this); - } - // Allocate host memory if requested - else if (!allocHostMemory(initFrom, forceAllocHostMem)) { + if (DISABLE_DEFERRED_ALLOC) { + device::Memory* mem = getDeviceMemory(*devices[i]); + if (NULL == mem) { + LogPrintfError("Can't allocate memory size - 0x%08X bytes!", getSize()); return false; + } } + } - const std::vector& devices = context_().devices(); - - // Forces system memory allocation on the device, - // instead of device 
memory - forceSysMemAlloc_ = sysMemAlloc; - - // Create memory on all available devices - for (size_t i = 0; i < devices.size(); i++) { - deviceAlloced_[devices[i]] = AllocInit; - - // Only GPU devices have device memory objects - if (devices[i]->info().type_ & CL_DEVICE_TYPE_GPU) { - deviceMemories_[i].ref_ = devices[i]; - deviceMemories_[i].value_ = NULL; - } - - if (DISABLE_DEFERRED_ALLOC) { - device::Memory* mem = getDeviceMemory(*devices[i]); - if (NULL == mem) { - LogPrintfError("Can't allocate memory size - 0x%08X bytes!", - getSize()); - return false; - } - } - } - - return true; + return true; } -bool -Memory::addDeviceMemory(const Device* dev) -{ - bool result = false; - AllocState create = AllocCreate; - AllocState init = AllocInit; - if (make_atomic(deviceAlloced_[dev]).compareAndSet(init, create)) { - device::Memory* dm = dev->createMemory(*this); +bool Memory::addDeviceMemory(const Device* dev) { + bool result = false; + AllocState create = AllocCreate; + AllocState init = AllocInit; + if (make_atomic(deviceAlloced_[dev]).compareAndSet(init, create)) { + device::Memory* dm = dev->createMemory(*this); - // Add the new memory allocation to the device map - if (NULL != dm) { - deviceMemories_[numDevices_].ref_ = dev; - deviceMemories_[numDevices_].value_ = dm; - numDevices_++; - assert((numDevices() <= context_().devices().size()) - && "Too many device objects"); + // Add the new memory allocation to the device map + if (NULL != dm) { + deviceMemories_[numDevices_].ref_ = dev; + deviceMemories_[numDevices_].value_ = dm; + numDevices_++; + assert((numDevices() <= context_().devices().size()) && "Too many device objects"); - // Mark the allocation with the complete flag - deviceAlloced_[dev] = AllocComplete; - if (getSvmPtr() != nullptr) { - svmBase_ = dm; - } - } - else { - // Mark the allocation as an empty - deviceAlloced_[dev] = AllocInit; - } + // Mark the allocation with the complete flag + deviceAlloced_[dev] = AllocComplete; + if (getSvmPtr() != 
nullptr) { + svmBase_ = dm; + } + } else { + // Mark the allocation as an empty + deviceAlloced_[dev] = AllocInit; } + } - // Make sure runtime finished memory allocation. - // Loop if in the create state - while (deviceAlloced_[dev] == AllocCreate) { - Os::yield(); - } + // Make sure runtime finished memory allocation. + // Loop if in the create state + while (deviceAlloced_[dev] == AllocCreate) { + Os::yield(); + } - if (deviceAlloced_[dev] == AllocComplete) { - result = true; - } + if (deviceAlloced_[dev] == AllocComplete) { + result = true; + } - return result; + return result; } -void -Memory::replaceDeviceMemory(const Device* dev, device::Memory* dm) -{ - uint i; - for (i = 0; i < numDevices_; ++i) { - if (deviceMemories_[i].ref_ == dev) { - delete deviceMemories_[i].value_; - break; - } +void Memory::replaceDeviceMemory(const Device* dev, device::Memory* dm) { + uint i; + for (i = 0; i < numDevices_; ++i) { + if (deviceMemories_[i].ref_ == dev) { + delete deviceMemories_[i].value_; + break; } + } - if (numDevices_ == 0) { - ++numDevices_; - deviceMemories_[0].ref_ = dev; - } + if (numDevices_ == 0) { + ++numDevices_; + deviceMemories_[0].ref_ = dev; + } - deviceMemories_[i].value_ = dm; - deviceAlloced_[dev] = AllocRealloced; + deviceMemories_[i].value_ = dm; + deviceAlloced_[dev] = AllocRealloced; } -device::Memory* -Memory::getDeviceMemory(const Device& dev, bool alloc) -{ - device::Memory* dm = NULL; +device::Memory* Memory::getDeviceMemory(const Device& dev, bool alloc) { + device::Memory* dm = NULL; + for (uint i = 0; i < numDevices_; ++i) { + if (deviceMemories_[i].ref_ == &dev) { + dm = deviceMemories_[i].value_; + break; + } + } + + if ((NULL == dm) && alloc) { + if (!addDeviceMemory(&dev)) { + LogError("Video memory allocation failed!"); + return NULL; + } + dm = deviceMemories_[numDevices() - 1].value_; + } + + return dm; +} + +Memory::~Memory() { + // For_each destructor callback: + DestructorCallBackEntry* entry; + for (entry = 
destructorCallbacks_; entry != NULL; entry = entry->next_) { + // invoke the callback function. + entry->callback_(const_cast(as_cl(this)), entry->data_); + } + + // Release the parent. + if (NULL != parent_) { + // Update cache if runtime destroys a subbuffer + if (NULL != parent_->getHostMem() && (vDev_ == NULL)) { + cacheWriteBack(); + } + parent_->removeSubBuffer(this); + } + + if (NULL != deviceMemories_) { + // Destroy all device memory objects for (uint i = 0; i < numDevices_; ++i) { - if (deviceMemories_[i].ref_ == &dev) { - dm = deviceMemories_[i].value_; - break; - } + delete deviceMemories_[i].value_; } + } - if ((NULL == dm) && alloc) { - if (!addDeviceMemory(&dev)) { - LogError("Video memory allocation failed!"); - return NULL; - } - dm = deviceMemories_[numDevices() - 1].value_; - } + // Sanity check + if (subBuffers_.size() != 0) { + LogError("Can't have views if parent is destroyed!"); + } - return dm; + // Destroy the destructor callback entries + DestructorCallBackEntry* callback = destructorCallbacks_; + while (callback != NULL) { + DestructorCallBackEntry* next = callback->next_; + delete callback; + callback = next; + } + + // Make sure runtime destroys the parent only after subbuffer destruction + if (NULL != parent_) { + parent_->release(); + } + hostMemRef_.deallocateMemory(context_()); } -Memory::~Memory() -{ - // For_each destructor callback: - DestructorCallBackEntry* entry; - for (entry = destructorCallbacks_; entry != NULL; entry = entry->next_) { - // invoke the callback function. - entry->callback_(const_cast(as_cl(this)), entry->data_); - } +bool Memory::setDestructorCallback(DestructorCallBackFunction callback, void* data) { + DestructorCallBackEntry* entry = new DestructorCallBackEntry(callback, data); + if (entry == NULL) { + return false; + } - // Release the parent. 
- if (NULL != parent_) { - // Update cache if runtime destroys a subbuffer - if (NULL != parent_->getHostMem() && (vDev_ == NULL)) { - cacheWriteBack(); - } - parent_->removeSubBuffer(this); - } + entry->next_ = destructorCallbacks_; + while (!destructorCallbacks_.compare_exchange_weak(entry->next_, entry)) + ; // Someone else is also updating the head of the linked list! reload. - if (NULL != deviceMemories_) { - // Destroy all device memory objects - for (uint i = 0; i < numDevices_; ++i) { - delete deviceMemories_[i].value_; - } - } - - // Sanity check - if (subBuffers_.size() != 0) { - LogError("Can't have views if parent is destroyed!"); - } - - // Destroy the destructor callback entries - DestructorCallBackEntry* callback = destructorCallbacks_; - while (callback != NULL) { - DestructorCallBackEntry* next = callback->next_; - delete callback; - callback = next; - } - - // Make sure runtime destroys the parent only after subbuffer destruction - if (NULL != parent_) { - parent_->release(); - } - hostMemRef_.deallocateMemory(context_()); + return true; } -bool -Memory::setDestructorCallback(DestructorCallBackFunction callback, void* data) -{ - DestructorCallBackEntry* entry = new DestructorCallBackEntry(callback, data); - if (entry == NULL) { - return false; +void Memory::signalWrite(const Device* writer) { + // (the potential race condition below doesn't matter, no critical + // section needed) + ++version_; + lastWriter_ = writer; + // Update all subbuffers for this object + for (auto buf : subBuffers_) { + buf->signalWrite(writer); + } +} + +void Memory::cacheWriteBack() { + if (NULL != lastWriter_) { + device::Memory* dmem = getDeviceMemory(*lastWriter_); + //! @note It's a special condition, when a subbuffer was created, + //! but never used. Thus dev memory is still NULL and lastWriter_ + //! was passed from the parent. 
+ if (NULL != dmem) { + dmem->syncHostFromCache(); } - - entry->next_ = destructorCallbacks_; - while (!destructorCallbacks_.compare_exchange_weak(entry->next_, entry)) - ; // Someone else is also updating the head of the linked list! reload. - - return true; -} - -void -Memory::signalWrite(const Device* writer) -{ - // (the potential race condition below doesn't matter, no critical - // section needed) - ++version_; - lastWriter_ = writer; - // Update all subbuffers for this object - for (auto buf : subBuffers_) { - buf->signalWrite(writer); + } else if (isParent()) { + // On CPU parent can't be synchronized, because lastWriter_ could be NULL + // and syncHostFromCache() won't be called. + for (uint i = 0; i < numDevices_; ++i) { + deviceMemories_[i].value_->syncHostFromCache(); } + } } -void -Memory::cacheWriteBack() -{ - if (NULL != lastWriter_) { - device::Memory* dmem = getDeviceMemory(*lastWriter_); - //! @note It's a special condition, when a subbuffer was created, - //! but never used. Thus dev memory is still NULL and lastWriter_ - //! was passed from the parent. - if (NULL != dmem) { - dmem->syncHostFromCache(); - } - } - else if (isParent()) { - // On CPU parent can't be synchronized, because lastWriter_ could be NULL - // and syncHostFromCache() won't be called. 
- for (uint i = 0; i < numDevices_; ++i) { - deviceMemories_[i].value_->syncHostFromCache(); - } - } +void Memory::copyToBackingStore(void* initFrom) { memcpy(getHostMem(), initFrom, size_); } + +bool Memory::usesSvmPointer() const { + if (!(flags_ & CL_MEM_USE_HOST_PTR)) { + return false; + } + // If the application host pointer lies within a SVM region, so does the + // sub-buffer host pointer - so the following check works in both cases + return (SvmBuffer::malloced(getHostMem()) || NULL != svmHostAddress_); } -void -Memory::copyToBackingStore(void* initFrom) -{ - memcpy(getHostMem(), initFrom, size_); +void Memory::commitSvmMemory() { + ScopedLock lock(lockMemoryOps_); + if (!svmPtrCommited_) { + amd::Os::commitMemory(svmHostAddress_, size_, amd::Os::MEM_PROT_RW); + svmPtrCommited_ = true; + } } -bool -Memory::usesSvmPointer() const -{ - if (!(flags_ & CL_MEM_USE_HOST_PTR)) { - return false; - } - // If the application host pointer lies within a SVM region, so does the - // sub-buffer host pointer - so the following check works in both cases - return (SvmBuffer::malloced(getHostMem()) || NULL != svmHostAddress_); +void Buffer::initDeviceMemory() { + deviceMemories_ = reinterpret_cast(reinterpret_cast(this) + sizeof(Buffer)); + memset(deviceMemories_, 0, context_().devices().size() * sizeof(DeviceMemory)); } -void -Memory::commitSvmMemory() -{ - ScopedLock lock(lockMemoryOps_); - if (!svmPtrCommited_) { - amd::Os::commitMemory(svmHostAddress_, size_, amd::Os::MEM_PROT_RW); - svmPtrCommited_ = true; - } +bool Buffer::create(void* initFrom, bool sysMemAlloc) { + if ((getMemFlags() & CL_MEM_EXTERNAL_PHYSICAL_AMD) && (initFrom != NULL)) { + busAddress_ = *(reinterpret_cast(initFrom)); + initFrom = NULL; + } else { + busAddress_.surface_bus_address = 0; + busAddress_.marker_bus_address = 0; + } + return Memory::create(initFrom, sysMemAlloc); } -void -Buffer::initDeviceMemory() -{ - deviceMemories_ = reinterpret_cast( - reinterpret_cast(this) + sizeof(Buffer)); - 
memset(deviceMemories_, 0, - context_().devices().size() * sizeof(DeviceMemory)); +bool Buffer::isEntirelyCovered(const Coord3D& origin, const Coord3D& region) const { + return ((origin[0] == 0) && (region[0] == getSize())) ? true : false; } -bool -Buffer::create(void* initFrom, bool sysMemAlloc) -{ - if ((getMemFlags() & CL_MEM_EXTERNAL_PHYSICAL_AMD) && (initFrom != NULL)) { - busAddress_ = *(reinterpret_cast(initFrom)); - initFrom = NULL; - } - else { - busAddress_.surface_bus_address = 0; - busAddress_.marker_bus_address = 0; - } - return Memory::create(initFrom, sysMemAlloc); +bool Buffer::validateRegion(const Coord3D& origin, const Coord3D& region) const { + return ((region[0] > 0) && (origin[0] < getSize()) && ((origin[0] + region[0]) <= getSize())) + ? true + : false; } -bool -Buffer::isEntirelyCovered(const Coord3D& origin, const Coord3D& region) const -{ - return ((origin[0] == 0) && (region[0] == getSize())) ? true : false; -} - -bool -Buffer::validateRegion(const Coord3D& origin, const Coord3D& region) const -{ - return ((region[0] > 0) && - (origin[0] < getSize()) && - ((origin[0] + region[0]) <= getSize())) ? true : false; -} - -void -Pipe::initDeviceMemory() -{ - deviceMemories_ = reinterpret_cast( - reinterpret_cast(this) + sizeof(Pipe)); - memset(deviceMemories_, 0, - context_().devices().size() * sizeof(DeviceMemory)); +void Pipe::initDeviceMemory() { + deviceMemories_ = reinterpret_cast(reinterpret_cast(this) + sizeof(Pipe)); + memset(deviceMemories_, 0, context_().devices().size() * sizeof(DeviceMemory)); } #define GETMIPDIM(dim, mip) (((dim >> mip) > 0) ? 
(dim >> mip) : 1) -Image::Image( - const Format& format, - Image& parent, - uint baseMipLevel, - cl_mem_flags flags) - : Memory(parent, flags, 0, parent.getWidth() * parent.getHeight() * - parent.getDepth() * format.getElementSize()) - , impl_(format, Coord3D(parent.getWidth() * - parent.getImageFormat().getElementSize() / - format.getElementSize(), parent.getHeight(), - parent.getDepth()), parent.getRowPitch(), - parent.getSlicePitch(), parent.getBytePitch()) - , mipLevels_(1) - , baseMipLevel_(baseMipLevel) -{ - if (baseMipLevel > 0) { - impl_.region_.c[0] = GETMIPDIM(parent.getWidth(), baseMipLevel) * - parent.getImageFormat().getElementSize() / format.getElementSize(); - impl_.region_.c[1] = GETMIPDIM(parent.getHeight(), baseMipLevel); - impl_.region_.c[2] = GETMIPDIM(parent.getDepth(), baseMipLevel); +Image::Image(const Format& format, Image& parent, uint baseMipLevel, cl_mem_flags flags) + : Memory(parent, flags, 0, + parent.getWidth() * parent.getHeight() * parent.getDepth() * format.getElementSize()), + impl_(format, Coord3D(parent.getWidth() * parent.getImageFormat().getElementSize() / + format.getElementSize(), + parent.getHeight(), parent.getDepth()), + parent.getRowPitch(), parent.getSlicePitch(), parent.getBytePitch()), + mipLevels_(1), + baseMipLevel_(baseMipLevel) { + if (baseMipLevel > 0) { + impl_.region_.c[0] = GETMIPDIM(parent.getWidth(), baseMipLevel) * + parent.getImageFormat().getElementSize() / format.getElementSize(); + impl_.region_.c[1] = GETMIPDIM(parent.getHeight(), baseMipLevel); + impl_.region_.c[2] = GETMIPDIM(parent.getDepth(), baseMipLevel); - if (parent.getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) { - impl_.region_.c[1] = parent.getHeight(); + if (parent.getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) { + impl_.region_.c[1] = parent.getHeight(); + } else if (parent.getType() == CL_MEM_OBJECT_IMAGE2D_ARRAY) { + impl_.region_.c[2] = parent.getDepth(); + } + size_ = getWidth() * getHeight() * parent.getDepth() * format.getElementSize(); + } + 
initDimension(); +} + +Image::Image(Context& context, Type type, Flags flags, const Format& format, size_t width, + size_t height, size_t depth, size_t rowPitch, size_t slicePitch, uint mipLevels) + : Memory(context, type, flags, width * height * depth * format.getElementSize()), + impl_(format, Coord3D(width, height, depth), rowPitch, slicePitch), + mipLevels_(mipLevels), + baseMipLevel_(0) { + initDimension(); +} + +Image::Image(Buffer& buffer, Type type, Flags flags, const Format& format, size_t width, + size_t height, size_t depth, size_t rowPitch, size_t slicePitch) + : Memory(buffer, flags, 0, buffer.getSize(), type), + impl_(format, Coord3D(width, height, depth), rowPitch, slicePitch), + mipLevels_(1), + baseMipLevel_(0) { + initDimension(); +} + +bool Image::validateDimensions(const std::vector& devices, cl_mem_object_type type, + size_t width, size_t height, size_t depth, size_t arraySize) { + bool sizePass = false; + switch (type) { + case CL_MEM_OBJECT_IMAGE3D: + if ((width == 0) || (height == 0) || (depth < 1)) { + return false; + } + for (const auto& dev : devices) { + if ((dev->info().image3DMaxWidth_ >= width) && (dev->info().image3DMaxHeight_ >= height) && + (dev->info().image3DMaxDepth_ >= depth)) { + return true; } - else if (parent.getType() == CL_MEM_OBJECT_IMAGE2D_ARRAY) { - impl_.region_.c[2] = parent.getDepth(); + } + break; + case CL_MEM_OBJECT_IMAGE2D_ARRAY: + if (arraySize == 0) { + return false; + } + for (const auto& dev : devices) { + if (dev->info().imageMaxArraySize_ >= arraySize) { + sizePass = true; + break; } - size_ = getWidth() * getHeight() * parent.getDepth() * format.getElementSize(); - } - initDimension(); + } + if (!sizePass) { + return false; + } + // Fall through... 
+ case CL_MEM_OBJECT_IMAGE2D: + if ((width == 0) || (height == 0)) { + return false; + } + for (const auto dev : devices) { + if ((dev->info().image2DMaxHeight_ >= height) && (dev->info().image2DMaxWidth_ >= width)) { + return true; + } + } + break; + case CL_MEM_OBJECT_IMAGE1D_ARRAY: + if (arraySize == 0) { + return false; + } + + for (const auto& dev : devices) { + if (dev->info().imageMaxArraySize_ >= arraySize) { + sizePass = true; + break; + } + } + if (!sizePass) { + return false; + } + // Fall through... + case CL_MEM_OBJECT_IMAGE1D: + if (width == 0) { + return false; + } + for (const auto& dev : devices) { + if (dev->info().image2DMaxWidth_ >= width) { + return true; + } + } + break; + case CL_MEM_OBJECT_IMAGE1D_BUFFER: + if (width == 0) { + return false; + } + for (const auto& dev : devices) { + if (dev->info().imageMaxBufferSize_ >= width) { + return true; + } + } + break; + default: + break; + } + + return false; } -Image::Image( - Context& context, - Type type, - Flags flags, - const Format& format, - size_t width, - size_t height, - size_t depth, - size_t rowPitch, - size_t slicePitch, - uint mipLevels) - : Memory(context, type, flags, width * height * depth * format.getElementSize()) - , impl_(format, Coord3D(width, height, depth), rowPitch, slicePitch) - , mipLevels_(mipLevels) - , baseMipLevel_(0) -{ - initDimension(); -} - -Image::Image( - Buffer& buffer, - Type type, - Flags flags, - const Format& format, - size_t width, - size_t height, - size_t depth, - size_t rowPitch, - size_t slicePitch) - : Memory(buffer, flags, 0, buffer.getSize(), type) - , impl_(format, Coord3D(width, height, depth), rowPitch, slicePitch) - , mipLevels_(1) - , baseMipLevel_(0) -{ - initDimension(); -} - -bool -Image::validateDimensions( - const std::vector& devices, - cl_mem_object_type type, - size_t width, - size_t height, - size_t depth, - size_t arraySize) -{ - bool sizePass = false; - switch (type) { - case CL_MEM_OBJECT_IMAGE3D: - if ((width == 0) || (height == 0) 
|| (depth < 1)) { - return false; - } - for (const auto& dev : devices) { - if ((dev->info().image3DMaxWidth_ >= width) && - (dev->info().image3DMaxHeight_ >= height) && - (dev->info().image3DMaxDepth_ >= depth)) { - return true; - } - } - break; - case CL_MEM_OBJECT_IMAGE2D_ARRAY: - if (arraySize == 0) { - return false; - } - for (const auto& dev : devices) { - if (dev->info().imageMaxArraySize_ >= arraySize) { - sizePass = true; - break; - } - } - if (!sizePass) { - return false; - } - // Fall through... - case CL_MEM_OBJECT_IMAGE2D: - if ((width == 0) || (height == 0)) { - return false; - } - for (const auto dev : devices) { - if ((dev->info().image2DMaxHeight_ >= height) && - (dev->info().image2DMaxWidth_ >= width)) { - return true; - } - } - break; - case CL_MEM_OBJECT_IMAGE1D_ARRAY: - if (arraySize == 0) { - return false; - } - - for (const auto& dev : devices) { - if (dev->info().imageMaxArraySize_ >= arraySize) { - sizePass = true; - break; - } - } - if (!sizePass) { - return false; - } - // Fall through... 
- case CL_MEM_OBJECT_IMAGE1D: - if (width == 0) { - return false; - } - for (const auto& dev : devices) { - if (dev->info().image2DMaxWidth_ >= width) { - return true; - } - } - break; - case CL_MEM_OBJECT_IMAGE1D_BUFFER: - if (width == 0) { - return false; - } - for (const auto& dev : devices) { - if (dev->info().imageMaxBufferSize_ >= width) { - return true; - } - } - break; - default: - break; - } - - return false; -} - -void -Image::initDimension() -{ - const size_t elemSize = impl_.format_.getElementSize(); - if (impl_.rp_ == 0) { - impl_.rp_ = impl_.region_[0] * elemSize; - } - switch (type_) { +void Image::initDimension() { + const size_t elemSize = impl_.format_.getElementSize(); + if (impl_.rp_ == 0) { + impl_.rp_ = impl_.region_[0] * elemSize; + } + switch (type_) { case CL_MEM_OBJECT_IMAGE3D: case CL_MEM_OBJECT_IMAGE2D_ARRAY: - dim_ = 3; - if (impl_.sp_ == 0) { - impl_.sp_ = impl_.region_[0] * impl_.region_[1] * elemSize; - } - break; + dim_ = 3; + if (impl_.sp_ == 0) { + impl_.sp_ = impl_.region_[0] * impl_.region_[1] * elemSize; + } + break; case CL_MEM_OBJECT_IMAGE2D: case CL_MEM_OBJECT_IMAGE1D_ARRAY: - dim_ = 2; - if ((impl_.sp_ == 0) && - (type_ == CL_MEM_OBJECT_IMAGE1D_ARRAY)) { - impl_.sp_ = impl_.rp_; - } - break; + dim_ = 2; + if ((impl_.sp_ == 0) && (type_ == CL_MEM_OBJECT_IMAGE1D_ARRAY)) { + impl_.sp_ = impl_.rp_; + } + break; case CL_MEM_OBJECT_IMAGE1D: case CL_MEM_OBJECT_IMAGE1D_BUFFER: default: - dim_ = 1; - break; - } + dim_ = 1; + break; + } } -void -Image::initDeviceMemory() -{ - deviceMemories_ = reinterpret_cast( - reinterpret_cast(this) + sizeof(Image)); - memset(deviceMemories_, 0, - context_().devices().size() * sizeof(DeviceMemory)); -} -bool -Image::create(void* initFrom) -{ - return Memory::create(initFrom); +void Image::initDeviceMemory() { + deviceMemories_ = reinterpret_cast(reinterpret_cast(this) + sizeof(Image)); + memset(deviceMemories_, 0, context_().devices().size() * sizeof(DeviceMemory)); } +bool Image::create(void* 
initFrom) { return Memory::create(initFrom); } -size_t -Image::Format::getNumChannels() const -{ - switch(image_channel_order) - { +size_t Image::Format::getNumChannels() const { + switch (image_channel_order) { case CL_RG: case CL_RA: - return 2; + return 2; case CL_RGB: case CL_sRGB: case CL_sRGBx: - return 3; + return 3; case CL_RGBA: case CL_BGRA: case CL_ARGB: case CL_sRGBA: case CL_sBGRA: - return 4; - } - return 1; + return 4; + } + return 1; } -size_t -Image::Format::getElementSize() const -{ - size_t bytesPerPixel = getNumChannels(); - switch(image_channel_data_type) - { +size_t Image::Format::getElementSize() const { + size_t bytesPerPixel = getNumChannels(); + switch (image_channel_data_type) { case CL_SNORM_INT8: case CL_UNORM_INT8: case CL_SIGNED_INT8: case CL_UNSIGNED_INT8: - break; + break; case CL_UNORM_INT_101010: - bytesPerPixel = 4; - break; + bytesPerPixel = 4; + break; case CL_SIGNED_INT32: case CL_UNSIGNED_INT32: case CL_FLOAT: - bytesPerPixel *= 4; - break; + bytesPerPixel *= 4; + break; default: - bytesPerPixel *= 2; - break; - } - return bytesPerPixel; + bytesPerPixel *= 2; + break; + } + return bytesPerPixel; } -bool -Image::Format::isValid() const -{ - switch(image_channel_data_type) - { +bool Image::Format::isValid() const { + switch (image_channel_data_type) { case CL_SNORM_INT8: case CL_SNORM_INT16: case CL_UNORM_INT8: @@ -842,156 +696,183 @@ Image::Format::isValid() const case CL_UNSIGNED_INT32: case CL_HALF_FLOAT: case CL_FLOAT: - break; + break; default: - return false; - } + return false; + } - switch(image_channel_order) - { + switch (image_channel_order) { case CL_R: case CL_A: case CL_RG: case CL_RA: case CL_RGBA: - break; + break; case CL_INTENSITY: case CL_LUMINANCE: - switch(image_channel_data_type) - { + switch (image_channel_data_type) { case CL_SNORM_INT8: case CL_SNORM_INT16: case CL_UNORM_INT8: case CL_UNORM_INT16: case CL_HALF_FLOAT: case CL_FLOAT: - break; + break; default: - return false; - } - break; + return false; 
+ } + break; case CL_RGB: - switch(image_channel_data_type) - { + switch (image_channel_data_type) { case CL_UNORM_SHORT_565: case CL_UNORM_SHORT_555: case CL_UNORM_INT_101010: - break; + break; default: - return false; - } - break; + return false; + } + break; case CL_BGRA: case CL_ARGB: - switch(image_channel_data_type) - { + switch (image_channel_data_type) { case CL_SNORM_INT8: case CL_UNORM_INT8: case CL_SIGNED_INT8: case CL_UNSIGNED_INT8: - break; + break; default: - return false; - } - break; + return false; + } + break; case CL_sRGB: case CL_sRGBx: case CL_sRGBA: case CL_sBGRA: - switch(image_channel_data_type) - { + switch (image_channel_data_type) { case CL_UNORM_INT8: - break; + break; default: - return false; - } - break; + return false; + } + break; case CL_DEPTH: - switch(image_channel_data_type) - { + switch (image_channel_data_type) { case CL_UNORM_INT16: case CL_FLOAT: - break; + break; default: - return false; - } - break; + return false; + } + break; default: - return false; - } - return true; + return false; + } + return true; } // definition of list of supported formats -cl_image_format -Image::supportedFormats[] = { +cl_image_format Image::supportedFormats[] = { // R - {CL_R, CL_SNORM_INT8}, {CL_R, CL_SNORM_INT16}, - {CL_R, CL_UNORM_INT8}, {CL_R, CL_UNORM_INT16}, + {CL_R, CL_SNORM_INT8}, + {CL_R, CL_SNORM_INT16}, + {CL_R, CL_UNORM_INT8}, + {CL_R, CL_UNORM_INT16}, - {CL_R, CL_SIGNED_INT8}, {CL_R, CL_SIGNED_INT16}, - {CL_R, CL_SIGNED_INT32}, {CL_R, CL_UNSIGNED_INT8}, - {CL_R, CL_UNSIGNED_INT16}, {CL_R, CL_UNSIGNED_INT32}, + {CL_R, CL_SIGNED_INT8}, + {CL_R, CL_SIGNED_INT16}, + {CL_R, CL_SIGNED_INT32}, + {CL_R, CL_UNSIGNED_INT8}, + {CL_R, CL_UNSIGNED_INT16}, + {CL_R, CL_UNSIGNED_INT32}, - {CL_R, CL_HALF_FLOAT}, {CL_R, CL_FLOAT}, + {CL_R, CL_HALF_FLOAT}, + {CL_R, CL_FLOAT}, // A - {CL_A, CL_SNORM_INT8}, {CL_A, CL_SNORM_INT16}, - {CL_A, CL_UNORM_INT8}, {CL_A, CL_UNORM_INT16}, + {CL_A, CL_SNORM_INT8}, + {CL_A, CL_SNORM_INT16}, + {CL_A, 
CL_UNORM_INT8}, + {CL_A, CL_UNORM_INT16}, - {CL_A, CL_SIGNED_INT8}, {CL_A, CL_SIGNED_INT16}, - {CL_A, CL_SIGNED_INT32}, {CL_A, CL_UNSIGNED_INT8}, - {CL_A, CL_UNSIGNED_INT16}, {CL_A, CL_UNSIGNED_INT32}, + {CL_A, CL_SIGNED_INT8}, + {CL_A, CL_SIGNED_INT16}, + {CL_A, CL_SIGNED_INT32}, + {CL_A, CL_UNSIGNED_INT8}, + {CL_A, CL_UNSIGNED_INT16}, + {CL_A, CL_UNSIGNED_INT32}, - {CL_A, CL_HALF_FLOAT}, {CL_A, CL_FLOAT}, + {CL_A, CL_HALF_FLOAT}, + {CL_A, CL_FLOAT}, // RG - {CL_RG, CL_SNORM_INT8}, {CL_RG, CL_SNORM_INT16}, - {CL_RG, CL_UNORM_INT8}, {CL_RG, CL_UNORM_INT16}, + {CL_RG, CL_SNORM_INT8}, + {CL_RG, CL_SNORM_INT16}, + {CL_RG, CL_UNORM_INT8}, + {CL_RG, CL_UNORM_INT16}, - {CL_RG, CL_SIGNED_INT8}, {CL_RG, CL_SIGNED_INT16}, - {CL_RG, CL_SIGNED_INT32}, {CL_RG, CL_UNSIGNED_INT8}, - {CL_RG, CL_UNSIGNED_INT16}, {CL_RG, CL_UNSIGNED_INT32}, + {CL_RG, CL_SIGNED_INT8}, + {CL_RG, CL_SIGNED_INT16}, + {CL_RG, CL_SIGNED_INT32}, + {CL_RG, CL_UNSIGNED_INT8}, + {CL_RG, CL_UNSIGNED_INT16}, + {CL_RG, CL_UNSIGNED_INT32}, - {CL_RG, CL_HALF_FLOAT}, {CL_RG, CL_FLOAT}, + {CL_RG, CL_HALF_FLOAT}, + {CL_RG, CL_FLOAT}, // RGBA - {CL_RGBA, CL_SNORM_INT8}, {CL_RGBA, CL_SNORM_INT16}, - {CL_RGBA, CL_UNORM_INT8}, {CL_RGBA, CL_UNORM_INT16}, + {CL_RGBA, CL_SNORM_INT8}, + {CL_RGBA, CL_SNORM_INT16}, + {CL_RGBA, CL_UNORM_INT8}, + {CL_RGBA, CL_UNORM_INT16}, - {CL_RGBA, CL_SIGNED_INT8}, {CL_RGBA, CL_SIGNED_INT16}, - {CL_RGBA, CL_SIGNED_INT32}, {CL_RGBA, CL_UNSIGNED_INT8}, - {CL_RGBA, CL_UNSIGNED_INT16}, {CL_RGBA, CL_UNSIGNED_INT32}, + {CL_RGBA, CL_SIGNED_INT8}, + {CL_RGBA, CL_SIGNED_INT16}, + {CL_RGBA, CL_SIGNED_INT32}, + {CL_RGBA, CL_UNSIGNED_INT8}, + {CL_RGBA, CL_UNSIGNED_INT16}, + {CL_RGBA, CL_UNSIGNED_INT32}, - {CL_RGBA, CL_HALF_FLOAT}, {CL_RGBA, CL_FLOAT}, + {CL_RGBA, CL_HALF_FLOAT}, + {CL_RGBA, CL_FLOAT}, // ARGB - {CL_ARGB, CL_SNORM_INT8}, {CL_ARGB, CL_UNORM_INT8}, - {CL_ARGB, CL_SIGNED_INT8}, {CL_ARGB, CL_UNSIGNED_INT8}, + {CL_ARGB, CL_SNORM_INT8}, + {CL_ARGB, CL_UNORM_INT8}, + {CL_ARGB, CL_SIGNED_INT8}, 
+ {CL_ARGB, CL_UNSIGNED_INT8}, // BGRA - {CL_BGRA, CL_SNORM_INT8}, {CL_BGRA, CL_UNORM_INT8}, - {CL_BGRA, CL_SIGNED_INT8}, {CL_BGRA, CL_UNSIGNED_INT8}, + {CL_BGRA, CL_SNORM_INT8}, + {CL_BGRA, CL_UNORM_INT8}, + {CL_BGRA, CL_SIGNED_INT8}, + {CL_BGRA, CL_UNSIGNED_INT8}, // LUMINANCE - {CL_LUMINANCE, CL_SNORM_INT8}, {CL_LUMINANCE, CL_SNORM_INT16}, - {CL_LUMINANCE, CL_UNORM_INT8}, {CL_LUMINANCE, CL_UNORM_INT16}, - {CL_LUMINANCE, CL_HALF_FLOAT}, {CL_LUMINANCE, CL_FLOAT}, + {CL_LUMINANCE, CL_SNORM_INT8}, + {CL_LUMINANCE, CL_SNORM_INT16}, + {CL_LUMINANCE, CL_UNORM_INT8}, + {CL_LUMINANCE, CL_UNORM_INT16}, + {CL_LUMINANCE, CL_HALF_FLOAT}, + {CL_LUMINANCE, CL_FLOAT}, // INTENSITY - {CL_INTENSITY, CL_SNORM_INT8}, {CL_INTENSITY, CL_SNORM_INT16}, - {CL_INTENSITY, CL_UNORM_INT8}, {CL_INTENSITY, CL_UNORM_INT16}, - {CL_INTENSITY, CL_HALF_FLOAT}, {CL_INTENSITY, CL_FLOAT}, + {CL_INTENSITY, CL_SNORM_INT8}, + {CL_INTENSITY, CL_SNORM_INT16}, + {CL_INTENSITY, CL_UNORM_INT8}, + {CL_INTENSITY, CL_UNORM_INT16}, + {CL_INTENSITY, CL_HALF_FLOAT}, + {CL_INTENSITY, CL_FLOAT}, // RGB {CL_RGB, CL_UNORM_INT_101010}, @@ -1000,382 +881,347 @@ Image::supportedFormats[] = { {CL_sRGBA, CL_UNORM_INT8}, // DEPTH - {CL_DEPTH, CL_UNORM_INT16}, {CL_DEPTH, CL_FLOAT}, + {CL_DEPTH, CL_UNORM_INT16}, + {CL_DEPTH, CL_FLOAT}, }; -const cl_uint NUM_CHANNEL_ORDER_OF_RGB = 1; // The number of channel orders of RGB at the end of the table supportedFormats above and before sRGB and depth. -const cl_uint NUM_CHANNEL_ORDER_OF_sRGB = 1; // The number of channel orders of sRGB at the end of the table supportedFormats above and before depth. -const cl_uint NUM_CHANNEL_ORDER_OF_DEPTH = 2; // The number of channel orders of DEPTH at the end of the table supportedFormats above. +const cl_uint NUM_CHANNEL_ORDER_OF_RGB = 1; // The number of channel orders of RGB at the end of + // the table supportedFormats above and before sRGB and + // depth. 
+const cl_uint NUM_CHANNEL_ORDER_OF_sRGB = 1; // The number of channel orders of sRGB at the end of + // the table supportedFormats above and before depth. +const cl_uint NUM_CHANNEL_ORDER_OF_DEPTH = + 2; // The number of channel orders of DEPTH at the end of the table supportedFormats above. // definition of list of supported RA formats -cl_image_format -Image::supportedFormatsRA[] = { - {CL_RA, CL_SNORM_INT8}, {CL_RA, CL_SNORM_INT16}, - {CL_RA, CL_UNORM_INT8}, {CL_RA, CL_UNORM_INT16}, - {CL_RA, CL_SIGNED_INT8}, {CL_RA, CL_SIGNED_INT16}, - {CL_RA, CL_SIGNED_INT32}, {CL_RA, CL_UNSIGNED_INT8}, - {CL_RA, CL_UNSIGNED_INT16}, {CL_RA, CL_UNSIGNED_INT32}, - {CL_RA, CL_HALF_FLOAT}, {CL_RA, CL_FLOAT}, +cl_image_format Image::supportedFormatsRA[] = { + {CL_RA, CL_SNORM_INT8}, {CL_RA, CL_SNORM_INT16}, {CL_RA, CL_UNORM_INT8}, + {CL_RA, CL_UNORM_INT16}, {CL_RA, CL_SIGNED_INT8}, {CL_RA, CL_SIGNED_INT16}, + {CL_RA, CL_SIGNED_INT32}, {CL_RA, CL_UNSIGNED_INT8}, {CL_RA, CL_UNSIGNED_INT16}, + {CL_RA, CL_UNSIGNED_INT32}, {CL_RA, CL_HALF_FLOAT}, {CL_RA, CL_FLOAT}, }; -cl_image_format -Image::supportedDepthStencilFormats[] = { - //DEPTH STENCIL - {CL_DEPTH_STENCIL, CL_FLOAT}, {CL_DEPTH_STENCIL, CL_UNORM_INT24} -}; +cl_image_format Image::supportedDepthStencilFormats[] = { + // DEPTH STENCIL + {CL_DEPTH_STENCIL, CL_FLOAT}, + {CL_DEPTH_STENCIL, CL_UNORM_INT24}}; -cl_uint -Image::numSupportedFormats(const Context& context, cl_mem_object_type image_type, cl_mem_flags flags) -{ - const std::vector& devices = context.devices(); - uint numFormats = sizeof(supportedFormats) / sizeof(cl_image_format); +cl_uint Image::numSupportedFormats(const Context& context, cl_mem_object_type image_type, + cl_mem_flags flags) { + const std::vector& devices = context.devices(); + uint numFormats = sizeof(supportedFormats) / sizeof(cl_image_format); - bool supportRA = false; - bool supportDepthsRGB = false; - bool supportDepthStencil = false; + bool supportRA = false; + bool supportDepthsRGB = false; + bool 
supportDepthStencil = false; - // Add RA if RA is supported. - for (uint i = 0; i < devices.size(); i++) { - if (devices[i]->settings().supportRA_) { - supportRA = true; - } - if (devices[i]->settings().supportDepthsRGB_) { - supportDepthsRGB = true; - } - if (devices[i]->settings().checkExtension(ClKhrGLDepthImages) && - (context.info().flags_ & Context::GLDeviceKhr)) { - supportDepthStencil = true; - } + // Add RA if RA is supported. + for (uint i = 0; i < devices.size(); i++) { + if (devices[i]->settings().supportRA_) { + supportRA = true; } - - if (supportDepthsRGB) { - if ((image_type != CL_MEM_OBJECT_IMAGE2D) && - (image_type != CL_MEM_OBJECT_IMAGE2D_ARRAY) && - (image_type != 0)) { - numFormats -= NUM_CHANNEL_ORDER_OF_DEPTH; // substract channel order of DEPTH type. - } - // Currently we are not supported sRGB for write_imagef (extension cl_khr_srgb_image_writes) - if ((image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) || - ((flags & (CL_MEM_WRITE_ONLY | CL_MEM_READ_WRITE | CL_MEM_KERNEL_READ_AND_WRITE)) != 0)) { - numFormats -= NUM_CHANNEL_ORDER_OF_sRGB; - } + if (devices[i]->settings().supportDepthsRGB_) { + supportDepthsRGB = true; } - else { - numFormats -= NUM_CHANNEL_ORDER_OF_RGB; // substract channel order of RGB type. - numFormats -= NUM_CHANNEL_ORDER_OF_sRGB; // substract channel order of sRGB type. - numFormats -= NUM_CHANNEL_ORDER_OF_DEPTH; // substract channel order of DEPTH type. + if (devices[i]->settings().checkExtension(ClKhrGLDepthImages) && + (context.info().flags_ & Context::GLDeviceKhr)) { + supportDepthStencil = true; } + } - // Add RA if RA is supported. RA isn't supported on SI. - if (supportRA) { - numFormats += sizeof(supportedFormatsRA) / sizeof(cl_image_format); // Add channel order of RA type. + if (supportDepthsRGB) { + if ((image_type != CL_MEM_OBJECT_IMAGE2D) && (image_type != CL_MEM_OBJECT_IMAGE2D_ARRAY) && + (image_type != 0)) { + numFormats -= NUM_CHANNEL_ORDER_OF_DEPTH; // substract channel order of DEPTH type. 
} - - if (supportDepthStencil) { - if (flags & CL_MEM_READ_ONLY) { - numFormats += sizeof(supportedDepthStencilFormats) / sizeof(cl_image_format); - } + // Currently we are not supported sRGB for write_imagef (extension cl_khr_srgb_image_writes) + if ((image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) || + ((flags & (CL_MEM_WRITE_ONLY | CL_MEM_READ_WRITE | CL_MEM_KERNEL_READ_AND_WRITE)) != 0)) { + numFormats -= NUM_CHANNEL_ORDER_OF_sRGB; } + } else { + numFormats -= NUM_CHANNEL_ORDER_OF_RGB; // substract channel order of RGB type. + numFormats -= NUM_CHANNEL_ORDER_OF_sRGB; // substract channel order of sRGB type. + numFormats -= NUM_CHANNEL_ORDER_OF_DEPTH; // substract channel order of DEPTH type. + } - return numFormats; + // Add RA if RA is supported. RA isn't supported on SI. + if (supportRA) { + numFormats += + sizeof(supportedFormatsRA) / sizeof(cl_image_format); // Add channel order of RA type. + } + + if (supportDepthStencil) { + if (flags & CL_MEM_READ_ONLY) { + numFormats += sizeof(supportedDepthStencilFormats) / sizeof(cl_image_format); + } + } + + return numFormats; } -cl_uint -Image::getSupportedFormats( - const Context& context, - cl_mem_object_type image_type, - const cl_uint num_entries, - cl_image_format *image_formats, - cl_mem_flags flags) -{ - const std::vector& devices = context.devices(); - uint numFormats = 0; +cl_uint Image::getSupportedFormats(const Context& context, cl_mem_object_type image_type, + const cl_uint num_entries, cl_image_format* image_formats, + cl_mem_flags flags) { + const std::vector& devices = context.devices(); + uint numFormats = 0; - bool supportRA = false; - bool supportDepthsRGB = false; - bool supportDepthStencil = false; + bool supportRA = false; + bool supportDepthsRGB = false; + bool supportDepthStencil = false; - // Add RA if RA is supported. 
- for (uint i = 0; i < devices.size(); i++) { - if (devices[i]->settings().supportRA_) { - supportRA = true; - } - if (devices[i]->settings().supportDepthsRGB_) { - supportDepthsRGB = true; - } - if (devices[i]->settings().checkExtension(ClKhrGLDepthImages) && - (context.info().flags_ & Context::GLDeviceKhr)) { - supportDepthStencil = true; - } + // Add RA if RA is supported. + for (uint i = 0; i < devices.size(); i++) { + if (devices[i]->settings().supportRA_) { + supportRA = true; } - - cl_image_format *format = image_formats; - uint numSupportedFormats = sizeof(supportedFormats) / sizeof(cl_image_format); - - bool srgbWriteSupported = true; - if (supportDepthsRGB) { - if ((image_type != CL_MEM_OBJECT_IMAGE2D) && - (image_type != CL_MEM_OBJECT_IMAGE2D_ARRAY) && - (image_type != 0)) { - numSupportedFormats -= NUM_CHANNEL_ORDER_OF_DEPTH; - } - // Currently we are not supported sRGB for write_imagef (extension cl_khr_srgb_image_writes) - if ((image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) || - ((flags & (CL_MEM_WRITE_ONLY | CL_MEM_READ_WRITE | CL_MEM_KERNEL_READ_AND_WRITE)) != 0)) { - srgbWriteSupported = false; - } + if (devices[i]->settings().supportDepthsRGB_) { + supportDepthsRGB = true; } - else { - numSupportedFormats -= NUM_CHANNEL_ORDER_OF_RGB; // substract channel order of RGB type. - numSupportedFormats -= NUM_CHANNEL_ORDER_OF_sRGB; // substract channel order of sRGB type. - numSupportedFormats -= NUM_CHANNEL_ORDER_OF_DEPTH; // substract channel order of DEPTH type. 
+ if (devices[i]->settings().checkExtension(ClKhrGLDepthImages) && + (context.info().flags_ & Context::GLDeviceKhr)) { + supportDepthStencil = true; } + } - for (uint i = 0; i < numSupportedFormats; i++) { - if (numFormats == num_entries) { - break; - } - if (!srgbWriteSupported) { - if ((amd::Image::supportedFormats[i].image_channel_order == CL_sRGBA) || - (amd::Image::supportedFormats[i].image_channel_order == CL_sRGB) || - (amd::Image::supportedFormats[i].image_channel_order == CL_sRGBx) || - (amd::Image::supportedFormats[i].image_channel_order == CL_sBGRA)) { - continue; - } - } - *format++ = amd::Image::supportedFormats[i]; - numFormats++; + cl_image_format* format = image_formats; + uint numSupportedFormats = sizeof(supportedFormats) / sizeof(cl_image_format); + + bool srgbWriteSupported = true; + if (supportDepthsRGB) { + if ((image_type != CL_MEM_OBJECT_IMAGE2D) && (image_type != CL_MEM_OBJECT_IMAGE2D_ARRAY) && + (image_type != 0)) { + numSupportedFormats -= NUM_CHANNEL_ORDER_OF_DEPTH; } - - // Add RA if RA is supported. - if (supportRA) { - for (uint i = 0; i < sizeof(supportedFormatsRA) / sizeof(cl_image_format); i++) { - if (numFormats == num_entries) { - break; - } - *format++ = amd::Image::supportedFormatsRA[i]; - numFormats++; - } + // Currently we are not supported sRGB for write_imagef (extension cl_khr_srgb_image_writes) + if ((image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) || + ((flags & (CL_MEM_WRITE_ONLY | CL_MEM_READ_WRITE | CL_MEM_KERNEL_READ_AND_WRITE)) != 0)) { + srgbWriteSupported = false; } + } else { + numSupportedFormats -= NUM_CHANNEL_ORDER_OF_RGB; // substract channel order of RGB type. + numSupportedFormats -= NUM_CHANNEL_ORDER_OF_sRGB; // substract channel order of sRGB type. + numSupportedFormats -= NUM_CHANNEL_ORDER_OF_DEPTH; // substract channel order of DEPTH type. 
+ } - if (supportDepthStencil) { - if (flags & CL_MEM_READ_ONLY) { - for (uint i = 0; i < sizeof(supportedDepthStencilFormats) / sizeof(cl_image_format); i++) { - if (numFormats == num_entries) { - break; - } - *format++ = amd::Image::supportedDepthStencilFormats[i]; - numFormats++; - } - } + for (uint i = 0; i < numSupportedFormats; i++) { + if (numFormats == num_entries) { + break; } - return numFormats; -} - -bool -Image::Format::isSupported(const Context& context, - cl_mem_object_type image_type, cl_mem_flags flags) const -{ - uint numFormats = numSupportedFormats(context, image_type, flags) ; - - std::vector image_formats(numFormats); - - getSupportedFormats(context, image_type, numFormats, image_formats.data(), flags); - - for (uint i = 0; i < numFormats; i++) { - if (*this == image_formats[i]) { - return true; - } + if (!srgbWriteSupported) { + if ((amd::Image::supportedFormats[i].image_channel_order == CL_sRGBA) || + (amd::Image::supportedFormats[i].image_channel_order == CL_sRGB) || + (amd::Image::supportedFormats[i].image_channel_order == CL_sRGBx) || + (amd::Image::supportedFormats[i].image_channel_order == CL_sBGRA)) { + continue; + } } + *format++ = amd::Image::supportedFormats[i]; + numFormats++; + } - return false; -} - -Image* -Image::createView( - const Context& context, - const Format& format, - device::VirtualDevice* vDev, - uint baseMipLevel, - cl_mem_flags flags) -{ - Image* view = NULL; - - // Find the image dimensions and create a corresponding object - view = new (context) Image(format, *this, baseMipLevel, flags); - - // Set GPU virtual device for this view - view->setVirtualDevice(vDev); - - if (view != NULL) { - // Initialize view - view->initDeviceMemory(); - } - - return view; -} - -bool -Image::isEntirelyCovered(const Coord3D& origin, const Coord3D& region) const -{ - return (origin[0] == 0 && origin[1] == 0 && origin[2] == 0 && - region[0] == getWidth() && - region[1] == getHeight() && - region[2] == getDepth()) ? 
true : false; -} - -bool -Image::validateRegion(const Coord3D& origin, const Coord3D& region) const -{ - return ((region[0] > 0) && (region[1] > 0) && (region[2] > 0) && - (origin[0] < getWidth()) && (region[0] != 0) && - (origin[1] < getHeight()) && (region[1] != 0) && - (origin[2] < getDepth()) && (region[2] != 0) && - ((origin[0] + region[0]) <= getWidth()) && - ((origin[1] + region[1]) <= getHeight()) && - ((origin[2] + region[2]) <= getDepth())) ? true : false; -} - -bool -Image::isRowSliceValid( - size_t rowPitch, - size_t slice, - size_t width, - size_t height) const -{ - size_t tmpHeight = - (getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) ? 1 : height; - - bool valid = (rowPitch == 0) || ((rowPitch != 0) && - (rowPitch >= width * getImageFormat().getElementSize())); - - return ((slice == 0) || - ((slice != 0) && - (slice >= rowPitch * tmpHeight))) ? valid : false; -} - -void -Image::copyToBackingStore(void* initFrom) -{ - char* src; - char* dst = reinterpret_cast(getHostMem()); - size_t cpySize = getWidth() * getImageFormat().getElementSize(); - - for (uint z = 0; z < getDepth(); ++z) { - src = reinterpret_cast(initFrom) + z * getSlicePitch(); - for (uint y = 0; y < getHeight(); ++y) { - memcpy(dst, src, cpySize); - dst += cpySize; - src += getRowPitch(); - } - } - - impl_.rp_ = cpySize; - if (impl_.sp_ != 0) { - impl_.sp_ = impl_.rp_; - if (getDims() == 3) { - impl_.sp_ *= getHeight(); - } - } -} - -static int -round_to_even(float v) -{ - // clamp overflow - if (v >= -(float)std::numeric_limits::min()) { - return std::numeric_limits::max(); - } - if (v <= (float)std::numeric_limits::min()) { - return std::numeric_limits::min(); - } - static const unsigned int magic[2] = { 0x4b000000u, 0xcb000000u }; - - // round fractional values to integer value - if (fabsf(v) < *reinterpret_cast(&magic[0])) { - float magicVal = *reinterpret_cast(&magic[v < 0.0f]); - v += magicVal; - v -= magicVal; - } - - return static_cast(v); -} - -static uint16_t -float2half_rtz(float f) 
-{ - union{ float f; cl_uint u; } u = {f}; - cl_uint sign = (u.u >> 16) & 0x8000; - float x = fabsf(f); - - //Nan - if (x != x) { - u.u >>= (24-11); - u.u &= 0x7fff; - u.u |= 0x0200; //silence the NaN - return u.u | sign; - } - int values[5] = { 0x47800000, 0x33800000, 0x38800000, 0x4b800000, 0x7f800000 }; - // overflow - if (x >= *reinterpret_cast(&values[0])) { - if (x == *reinterpret_cast(&values[4])) { - return 0x7c00 | sign; - } - return 0x7bff | sign; - } - - // underflow - if (x < *reinterpret_cast(&values[1])) { - return sign; // The halfway case can return 0x0001 or 0. 0 is even. - } - - // half denormal - if (x < *reinterpret_cast(&values[2])) { - x *= *reinterpret_cast(&values[3]); - return static_cast((int) x | sign); - } - - u.u &= 0xFFFFE000U; - u.u -= 0x38000000U; - - return (u.u >> (24-11)) | sign; -} - -void -Image::Format::getChannelOrder(uint8_t* channelOrder) const -{ - enum { CH_ORDER_R = 0, CH_ORDER_G, CH_ORDER_B, CH_ORDER_A }; - switch (image_channel_order) { - case CL_A: - channelOrder[0] = CH_ORDER_A; + // Add RA if RA is supported. 
+ if (supportRA) { + for (uint i = 0; i < sizeof(supportedFormatsRA) / sizeof(cl_image_format); i++) { + if (numFormats == num_entries) { break; + } + *format++ = amd::Image::supportedFormatsRA[i]; + numFormats++; + } + } + + if (supportDepthStencil) { + if (flags & CL_MEM_READ_ONLY) { + for (uint i = 0; i < sizeof(supportedDepthStencilFormats) / sizeof(cl_image_format); i++) { + if (numFormats == num_entries) { + break; + } + *format++ = amd::Image::supportedDepthStencilFormats[i]; + numFormats++; + } + } + } + return numFormats; +} + +bool Image::Format::isSupported(const Context& context, cl_mem_object_type image_type, + cl_mem_flags flags) const { + uint numFormats = numSupportedFormats(context, image_type, flags); + + std::vector image_formats(numFormats); + + getSupportedFormats(context, image_type, numFormats, image_formats.data(), flags); + + for (uint i = 0; i < numFormats; i++) { + if (*this == image_formats[i]) { + return true; + } + } + + return false; +} + +Image* Image::createView(const Context& context, const Format& format, device::VirtualDevice* vDev, + uint baseMipLevel, cl_mem_flags flags) { + Image* view = NULL; + + // Find the image dimensions and create a corresponding object + view = new (context) Image(format, *this, baseMipLevel, flags); + + // Set GPU virtual device for this view + view->setVirtualDevice(vDev); + + if (view != NULL) { + // Initialize view + view->initDeviceMemory(); + } + + return view; +} + +bool Image::isEntirelyCovered(const Coord3D& origin, const Coord3D& region) const { + return (origin[0] == 0 && origin[1] == 0 && origin[2] == 0 && region[0] == getWidth() && + region[1] == getHeight() && region[2] == getDepth()) + ? 
true + : false; +} + +bool Image::validateRegion(const Coord3D& origin, const Coord3D& region) const { + return ((region[0] > 0) && (region[1] > 0) && (region[2] > 0) && (origin[0] < getWidth()) && + (region[0] != 0) && (origin[1] < getHeight()) && (region[1] != 0) && + (origin[2] < getDepth()) && (region[2] != 0) && ((origin[0] + region[0]) <= getWidth()) && + ((origin[1] + region[1]) <= getHeight()) && ((origin[2] + region[2]) <= getDepth())) + ? true + : false; +} + +bool Image::isRowSliceValid(size_t rowPitch, size_t slice, size_t width, size_t height) const { + size_t tmpHeight = (getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) ? 1 : height; + + bool valid = (rowPitch == 0) || + ((rowPitch != 0) && (rowPitch >= width * getImageFormat().getElementSize())); + + return ((slice == 0) || ((slice != 0) && (slice >= rowPitch * tmpHeight))) ? valid : false; +} + +void Image::copyToBackingStore(void* initFrom) { + char* src; + char* dst = reinterpret_cast(getHostMem()); + size_t cpySize = getWidth() * getImageFormat().getElementSize(); + + for (uint z = 0; z < getDepth(); ++z) { + src = reinterpret_cast(initFrom) + z * getSlicePitch(); + for (uint y = 0; y < getHeight(); ++y) { + memcpy(dst, src, cpySize); + dst += cpySize; + src += getRowPitch(); + } + } + + impl_.rp_ = cpySize; + if (impl_.sp_ != 0) { + impl_.sp_ = impl_.rp_; + if (getDims() == 3) { + impl_.sp_ *= getHeight(); + } + } +} + +static int round_to_even(float v) { + // clamp overflow + if (v >= -(float)std::numeric_limits::min()) { + return std::numeric_limits::max(); + } + if (v <= (float)std::numeric_limits::min()) { + return std::numeric_limits::min(); + } + static const unsigned int magic[2] = {0x4b000000u, 0xcb000000u}; + + // round fractional values to integer value + if (fabsf(v) < *reinterpret_cast(&magic[0])) { + float magicVal = *reinterpret_cast(&magic[v < 0.0f]); + v += magicVal; + v -= magicVal; + } + + return static_cast(v); +} + +static uint16_t float2half_rtz(float f) { + union { + float f; + 
cl_uint u; + } u = {f}; + cl_uint sign = (u.u >> 16) & 0x8000; + float x = fabsf(f); + + // Nan + if (x != x) { + u.u >>= (24 - 11); + u.u &= 0x7fff; + u.u |= 0x0200; // silence the NaN + return u.u | sign; + } + int values[5] = {0x47800000, 0x33800000, 0x38800000, 0x4b800000, 0x7f800000}; + // overflow + if (x >= *reinterpret_cast(&values[0])) { + if (x == *reinterpret_cast(&values[4])) { + return 0x7c00 | sign; + } + return 0x7bff | sign; + } + + // underflow + if (x < *reinterpret_cast(&values[1])) { + return sign; // The halfway case can return 0x0001 or 0. 0 is even. + } + + // half denormal + if (x < *reinterpret_cast(&values[2])) { + x *= *reinterpret_cast(&values[3]); + return static_cast((int)x | sign); + } + + u.u &= 0xFFFFE000U; + u.u -= 0x38000000U; + + return (u.u >> (24 - 11)) | sign; +} + +void Image::Format::getChannelOrder(uint8_t* channelOrder) const { + enum { CH_ORDER_R = 0, CH_ORDER_G, CH_ORDER_B, CH_ORDER_A }; + switch (image_channel_order) { + case CL_A: + channelOrder[0] = CH_ORDER_A; + break; case CL_RA: - channelOrder[0] = CH_ORDER_R; - channelOrder[1] = CH_ORDER_A; - break; + channelOrder[0] = CH_ORDER_R; + channelOrder[1] = CH_ORDER_A; + break; case CL_BGRA: - channelOrder[0] = CH_ORDER_B; - channelOrder[1] = CH_ORDER_G; - channelOrder[2] = CH_ORDER_R; - channelOrder[3] = CH_ORDER_A; - break; + channelOrder[0] = CH_ORDER_B; + channelOrder[1] = CH_ORDER_G; + channelOrder[2] = CH_ORDER_R; + channelOrder[3] = CH_ORDER_A; + break; case CL_ARGB: - channelOrder[0] = CH_ORDER_A; - channelOrder[1] = CH_ORDER_R; - channelOrder[2] = CH_ORDER_G; - channelOrder[3] = CH_ORDER_B; - break; + channelOrder[0] = CH_ORDER_A; + channelOrder[1] = CH_ORDER_R; + channelOrder[2] = CH_ORDER_G; + channelOrder[3] = CH_ORDER_B; + break; default: - channelOrder[0] = CH_ORDER_R; - channelOrder[1] = CH_ORDER_G; - channelOrder[2] = CH_ORDER_B; - channelOrder[3] = CH_ORDER_A; - break; - } + channelOrder[0] = CH_ORDER_R; + channelOrder[1] = CH_ORDER_G; + channelOrder[2] 
= CH_ORDER_B; + channelOrder[3] = CH_ORDER_A; + break; + } } // "colorRGBA" is a four component RGBA floating-point color value if the image @@ -1383,211 +1229,171 @@ Image::Format::getChannelOrder(uint8_t* channelOrder) const // is a four component signed integer value if the image channel data type is // an unnormalized signed integer type and is a four component unsigned integer // value if the image channel data type is an unormalized unsigned integer type. -void -Image::Format::formatColor(const void* colorRGBA, void* colorFormat) const -{ - union t565 { - struct { - uint16_t r_: 5; - uint16_t g_: 6; - uint16_t b_: 5; - }; - uint16_t rgba_; +void Image::Format::formatColor(const void* colorRGBA, void* colorFormat) const { + union t565 { + struct { + uint16_t r_ : 5; + uint16_t g_ : 6; + uint16_t b_ : 5; }; + uint16_t rgba_; + }; - union t555 { - struct { - uint16_t r_: 5; - uint16_t g_: 5; - uint16_t b_: 5; - uint16_t a_: 1; - }; - uint16_t rgba_; + union t555 { + struct { + uint16_t r_ : 5; + uint16_t g_ : 5; + uint16_t b_ : 5; + uint16_t a_ : 1; }; + uint16_t rgba_; + }; - union t101010 { - struct { - uint32_t b_: 10; - uint32_t g_: 10; - uint32_t r_: 10; - uint32_t a_: 2; - }; - uint32_t rgba_; + union t101010 { + struct { + uint32_t b_ : 10; + uint32_t g_ : 10; + uint32_t r_ : 10; + uint32_t a_ : 2; }; + uint32_t rgba_; + }; - const float* colorRGBAf = reinterpret_cast(colorRGBA); - const int32_t* colorRGBAi = reinterpret_cast(colorRGBA); - const uint32_t* colorRGBAui = reinterpret_cast(colorRGBA); + const float* colorRGBAf = reinterpret_cast(colorRGBA); + const int32_t* colorRGBAi = reinterpret_cast(colorRGBA); + const uint32_t* colorRGBAui = reinterpret_cast(colorRGBA); - size_t chCount = getNumChannels(); - uint8_t chOrder[4]; - getChannelOrder(chOrder); + size_t chCount = getNumChannels(); + uint8_t chOrder[4]; + getChannelOrder(chOrder); - bool allChannels = false; - for (size_t i = 0; i < chCount && !allChannels; ++i) { - switch 
(image_channel_data_type) { - case CL_SNORM_INT8: { - int8_t* color = reinterpret_cast(colorFormat); - color[i] = round_to_even(INT8_MAX * colorRGBAf[chOrder[i]]); - } - break; - case CL_SNORM_INT16: { - int16_t* color = reinterpret_cast(colorFormat); - color[i] = round_to_even(INT16_MAX * colorRGBAf[chOrder[i]]); - } - break; - case CL_UNORM_INT8: { - uint8_t* color = reinterpret_cast(colorFormat); - color[i] = round_to_even(UINT8_MAX * colorRGBAf[chOrder[i]]); - } - break; - case CL_UNORM_INT16: { - uint16_t* color = reinterpret_cast(colorFormat); - color[i] = round_to_even(UINT16_MAX * colorRGBAf[chOrder[i]]); - } - break; - case CL_UNORM_SHORT_565: { - t565* color = reinterpret_cast(colorFormat); - color->r_ = round_to_even(0x1F * colorRGBAf[0]); - color->g_ = round_to_even(0x3F * colorRGBAf[1]); - color->b_ = round_to_even(0x1F * colorRGBAf[2]); - allChannels = true; - } - break; - case CL_UNORM_SHORT_555: { - t555* color = reinterpret_cast(colorFormat); - color->r_ = round_to_even(0x1F * colorRGBAf[0]); - color->g_ = round_to_even(0x1F * colorRGBAf[1]); - color->b_ = round_to_even(0x1F * colorRGBAf[2]); - color->a_ = round_to_even(colorRGBAf[3]); - allChannels = true; - } - break; - case CL_UNORM_INT_101010: { - t101010* color = reinterpret_cast(colorFormat); - color->r_ = round_to_even(0x3FF * colorRGBAf[0]); - color->g_ = round_to_even(0x3FF * colorRGBAf[1]); - color->b_ = round_to_even(0x3FF * colorRGBAf[2]); - color->a_ = round_to_even(0x3 * colorRGBAf[3]); - allChannels = true; - } - break; - case CL_SIGNED_INT8: { - int8_t* color = reinterpret_cast(colorFormat); - color[i] = colorRGBAi[chOrder[i]]; - } - break; - case CL_SIGNED_INT16: { - int16_t* color = reinterpret_cast(colorFormat); - color[i] = colorRGBAi[chOrder[i]]; - } - break; - case CL_SIGNED_INT32: { - int32_t* color = reinterpret_cast(colorFormat); - color[i] = colorRGBAi[chOrder[i]]; - } - break; - case CL_UNSIGNED_INT8: { - uint8_t* color = reinterpret_cast(colorFormat); - color[i] = 
colorRGBAui[chOrder[i]]; - } - break; - case CL_UNSIGNED_INT16: { - uint16_t* color = reinterpret_cast(colorFormat); - color[i] = colorRGBAui[chOrder[i]]; - } - break; - case CL_UNSIGNED_INT32: { - uint32_t* color = reinterpret_cast(colorFormat); - color[i] = colorRGBAui[chOrder[i]]; - } - break; - case CL_HALF_FLOAT: { - uint16_t* color = reinterpret_cast(colorFormat); - color[i] = float2half_rtz(colorRGBAf[chOrder[i]]); - } - break; - case CL_FLOAT: { - float* color = reinterpret_cast(colorFormat); - color[i] = colorRGBAf[chOrder[i]]; - } - break; - } + bool allChannels = false; + for (size_t i = 0; i < chCount && !allChannels; ++i) { + switch (image_channel_data_type) { + case CL_SNORM_INT8: { + int8_t* color = reinterpret_cast(colorFormat); + color[i] = round_to_even(INT8_MAX * colorRGBAf[chOrder[i]]); + } break; + case CL_SNORM_INT16: { + int16_t* color = reinterpret_cast(colorFormat); + color[i] = round_to_even(INT16_MAX * colorRGBAf[chOrder[i]]); + } break; + case CL_UNORM_INT8: { + uint8_t* color = reinterpret_cast(colorFormat); + color[i] = round_to_even(UINT8_MAX * colorRGBAf[chOrder[i]]); + } break; + case CL_UNORM_INT16: { + uint16_t* color = reinterpret_cast(colorFormat); + color[i] = round_to_even(UINT16_MAX * colorRGBAf[chOrder[i]]); + } break; + case CL_UNORM_SHORT_565: { + t565* color = reinterpret_cast(colorFormat); + color->r_ = round_to_even(0x1F * colorRGBAf[0]); + color->g_ = round_to_even(0x3F * colorRGBAf[1]); + color->b_ = round_to_even(0x1F * colorRGBAf[2]); + allChannels = true; + } break; + case CL_UNORM_SHORT_555: { + t555* color = reinterpret_cast(colorFormat); + color->r_ = round_to_even(0x1F * colorRGBAf[0]); + color->g_ = round_to_even(0x1F * colorRGBAf[1]); + color->b_ = round_to_even(0x1F * colorRGBAf[2]); + color->a_ = round_to_even(colorRGBAf[3]); + allChannels = true; + } break; + case CL_UNORM_INT_101010: { + t101010* color = reinterpret_cast(colorFormat); + color->r_ = round_to_even(0x3FF * colorRGBAf[0]); + color->g_ = 
round_to_even(0x3FF * colorRGBAf[1]); + color->b_ = round_to_even(0x3FF * colorRGBAf[2]); + color->a_ = round_to_even(0x3 * colorRGBAf[3]); + allChannels = true; + } break; + case CL_SIGNED_INT8: { + int8_t* color = reinterpret_cast(colorFormat); + color[i] = colorRGBAi[chOrder[i]]; + } break; + case CL_SIGNED_INT16: { + int16_t* color = reinterpret_cast(colorFormat); + color[i] = colorRGBAi[chOrder[i]]; + } break; + case CL_SIGNED_INT32: { + int32_t* color = reinterpret_cast(colorFormat); + color[i] = colorRGBAi[chOrder[i]]; + } break; + case CL_UNSIGNED_INT8: { + uint8_t* color = reinterpret_cast(colorFormat); + color[i] = colorRGBAui[chOrder[i]]; + } break; + case CL_UNSIGNED_INT16: { + uint16_t* color = reinterpret_cast(colorFormat); + color[i] = colorRGBAui[chOrder[i]]; + } break; + case CL_UNSIGNED_INT32: { + uint32_t* color = reinterpret_cast(colorFormat); + color[i] = colorRGBAui[chOrder[i]]; + } break; + case CL_HALF_FLOAT: { + uint16_t* color = reinterpret_cast(colorFormat); + color[i] = float2half_rtz(colorRGBAf[chOrder[i]]); + } break; + case CL_FLOAT: { + float* color = reinterpret_cast(colorFormat); + color[i] = colorRGBAf[chOrder[i]]; + } break; } + } } std::map SvmBuffer::Allocated_; Monitor SvmBuffer::AllocatedLock_("Guards SVM allocation list"); -void -SvmBuffer::Add(uintptr_t k, uintptr_t v) -{ - ScopedLock lock(AllocatedLock_); - Allocated_.insert(std::pair(k, v)); +void SvmBuffer::Add(uintptr_t k, uintptr_t v) { + ScopedLock lock(AllocatedLock_); + Allocated_.insert(std::pair(k, v)); } -void -SvmBuffer::Remove(uintptr_t k) -{ - ScopedLock lock(AllocatedLock_); - Allocated_.erase(k); +void SvmBuffer::Remove(uintptr_t k) { + ScopedLock lock(AllocatedLock_); + Allocated_.erase(k); } -bool -SvmBuffer::Contains(uintptr_t ptr) -{ - ScopedLock lock(AllocatedLock_); - auto it = Allocated_.upper_bound(ptr); - if (it == Allocated_.begin()) { - return false; - } - --it; - return ptr >= it->first && ptr < it->second; +bool SvmBuffer::Contains(uintptr_t 
ptr) { + ScopedLock lock(AllocatedLock_); + auto it = Allocated_.upper_bound(ptr); + if (it == Allocated_.begin()) { + return false; + } + --it; + return ptr >= it->first && ptr < it->second; } // The allocation flags are ignored for now. -void* -SvmBuffer::malloc( - Context& context, - cl_svm_mem_flags flags, - size_t size, - size_t alignment) -{ - bool atomics = (flags & CL_MEM_SVM_ATOMICS) != 0; - void* ret = context.svmAlloc(size, alignment, flags); - if (ret == NULL) { - LogError("Unable to allocate aligned memory"); - return NULL; - } - uintptr_t ret_u = reinterpret_cast(ret); - Add(ret_u, ret_u + size); - return ret; +void* SvmBuffer::malloc(Context& context, cl_svm_mem_flags flags, size_t size, size_t alignment) { + bool atomics = (flags & CL_MEM_SVM_ATOMICS) != 0; + void* ret = context.svmAlloc(size, alignment, flags); + if (ret == NULL) { + LogError("Unable to allocate aligned memory"); + return NULL; + } + uintptr_t ret_u = reinterpret_cast(ret); + Add(ret_u, ret_u + size); + return ret; } -void -SvmBuffer::free(const Context& context, void* ptr) -{ - Remove(reinterpret_cast(ptr)); - context.svmFree(ptr); +void SvmBuffer::free(const Context& context, void* ptr) { + Remove(reinterpret_cast(ptr)); + context.svmFree(ptr); } -void -SvmBuffer::memFill( - void* dst, - const void* src, - size_t srcSize, - size_t times) -{ - address dstAddress = reinterpret_cast
(dst); - const_address srcAddress = reinterpret_cast(src); - for (size_t i = 0; i < times; i++) { - ::memcpy(dstAddress + i * srcSize, srcAddress, srcSize); - } +void SvmBuffer::memFill(void* dst, const void* src, size_t srcSize, size_t times) { + address dstAddress = reinterpret_cast
(dst); + const_address srcAddress = reinterpret_cast(src); + for (size_t i = 0; i < times; i++) { + ::memcpy(dstAddress + i * srcSize, srcAddress, srcSize); + } } -bool SvmBuffer::malloced(const void* ptr) -{ - return Contains(reinterpret_cast(ptr)); -} +bool SvmBuffer::malloced(const void* ptr) { return Contains(reinterpret_cast(ptr)); } -} // namespace amd +} // namespace amd diff --git a/rocclr/runtime/platform/memory.hpp b/rocclr/runtime/platform/memory.hpp index 61e011cc96..539ef646ae 100644 --- a/rocclr/runtime/platform/memory.hpp +++ b/rocclr/runtime/platform/memory.hpp @@ -31,683 +31,593 @@ class Image; class Buffer; class Pipe; -struct BufferRect : public amd::EmbeddedObject -{ - //! Default constructor - BufferRect() - : rowPitch_(0) - , slicePitch_(0) - , start_(0) - , end_(0) - { } +struct BufferRect : public amd::EmbeddedObject { + //! Default constructor + BufferRect() : rowPitch_(0), slicePitch_(0), start_(0), end_(0) {} - //! Creates BufferRect object - bool create( - const size_t* bufferOrigin, //!< Start locaiton in the buffer - const size_t* region, //!< Copy region - size_t bufferRowPitch, //!< Provided buffer's row pitch - size_t bufferSlicePitch //!< Provided buffer's slice pitch - ); + //! Creates BufferRect object + bool create(const size_t* bufferOrigin, //!< Start locaiton in the buffer + const size_t* region, //!< Copy region + size_t bufferRowPitch, //!< Provided buffer's row pitch + size_t bufferSlicePitch //!< Provided buffer's slice pitch + ); - //! Returns the plain offset for the (X, Y, Z) location - size_t offset( - size_t x, //!< Coordinate in X dimension - size_t y, //!< Coordinate in Y dimension - size_t z //!< Coordinate in Z dimension - ) const - { - return start_ + x + y * rowPitch_ + z * slicePitch_; - } + //! 
Returns the plain offset for the (X, Y, Z) location + size_t offset(size_t x, //!< Coordinate in X dimension + size_t y, //!< Coordinate in Y dimension + size_t z //!< Coordinate in Z dimension + ) const { + return start_ + x + y * rowPitch_ + z * slicePitch_; + } - size_t rowPitch_; //!< Calculated row pitch for the buffer rect - size_t slicePitch_; //!< Calculated slice pitch for the buffer rect - size_t start_; //!< Start offset for the copy region - size_t end_; //!< Relative end offset from start for the copy region + size_t rowPitch_; //!< Calculated row pitch for the buffer rect + size_t slicePitch_; //!< Calculated slice pitch for the buffer rect + size_t start_; //!< Start offset for the copy region + size_t end_; //!< Relative end offset from start for the copy region }; -class HostMemoryReference -{ -public: - //! Default constructor - HostMemoryReference(void* hostMem = NULL) - : alloced_(false) - , hostMem_(hostMem) - , size_(0) - {} +class HostMemoryReference { + public: + //! Default constructor + HostMemoryReference(void* hostMem = NULL) : alloced_(false), hostMem_(hostMem), size_(0) {} - //! Default destructor - ~HostMemoryReference() - { - assert(!alloced_ && "Host buffer not deallocated"); - } + //! Default destructor + ~HostMemoryReference() { assert(!alloced_ && "Host buffer not deallocated"); } - //! Creates host memory reference object - bool allocateMemory(size_t size, const Context& context); + //! Creates host memory reference object + bool allocateMemory(size_t size, const Context& context); - // Frees system memory if it was allocated - void deallocateMemory(const Context& context); + // Frees system memory if it was allocated + void deallocateMemory(const Context& context); - //! Get the host memory pointer - void* hostMem() const { return hostMem_; } + //! Get the host memory pointer + void* hostMem() const { return hostMem_; } - //! Get the host memory size - size_t size() const { return size_; } + //! 
Get the host memory size + size_t size() const { return size_; } - //! Set the host memory pointer - void setHostMem(void* hostMem, const Context& context) - { - deallocateMemory(context); - hostMem_ = hostMem; - } + //! Set the host memory pointer + void setHostMem(void* hostMem, const Context& context) { + deallocateMemory(context); + hostMem_ = hostMem; + } - //! Returns true if the host memory has been allocated by this object, false - // if it has been allocated elsewhere. - bool alloced() const { return alloced_; } + //! Returns true if the host memory has been allocated by this object, false + // if it has been allocated elsewhere. + bool alloced() const { return alloced_; } -private: - //! Disable copy constructor - HostMemoryReference(const HostMemoryReference&); + private: + //! Disable copy constructor + HostMemoryReference(const HostMemoryReference&); - //! Disable operator= - HostMemoryReference& operator=(const HostMemoryReference&); + //! Disable operator= + HostMemoryReference& operator=(const HostMemoryReference&); - bool alloced_; //!< TRUE if memory was allocated - void* hostMem_; //!< Host memory pointer - size_t size_; //!< The host memory size + bool alloced_; //!< TRUE if memory was allocated + void* hostMem_; //!< Host memory pointer + size_t size_; //!< The host memory size }; -class Memory: public amd::RuntimeObject -{ - typedef void (CL_CALLBACK * DestructorCallBackFunction)( - cl_mem memobj, void *user_data); +class Memory : public amd::RuntimeObject { + typedef void(CL_CALLBACK* DestructorCallBackFunction)(cl_mem memobj, void* user_data); - enum AllocState { - AllocInit = 0, - AllocCreate = 1, - AllocComplete = 2, - AllocRealloced = 3 - }; + enum AllocState { AllocInit = 0, AllocCreate = 1, AllocComplete = 2, AllocRealloced = 3 }; - struct DestructorCallBackEntry - { - struct DestructorCallBackEntry* next_; + struct DestructorCallBackEntry { + struct DestructorCallBackEntry* next_; - DestructorCallBackFunction callback_; - void* data_; 
+ DestructorCallBackFunction callback_; + void* data_; - DestructorCallBackEntry( - DestructorCallBackFunction callback, void* data) : - callback_(callback), data_(data) - { } - }; + DestructorCallBackEntry(DestructorCallBackFunction callback, void* data) + : callback_(callback), data_(data) {} + }; -protected: - typedef cl_mem_object_type Type; - typedef cl_mem_flags Flags; - typedef DeviceMap DeviceMemory; + protected: + typedef cl_mem_object_type Type; + typedef cl_mem_flags Flags; + typedef DeviceMap DeviceMemory; - size_t numDevices_; //!< Number of devices + size_t numDevices_; //!< Number of devices - //! The device memory objects included in this memory - DeviceMemory* deviceMemories_; + //! The device memory objects included in this memory + DeviceMemory* deviceMemories_; - //! The device alloced state - std::map deviceAlloced_; + //! The device alloced state + std::map deviceAlloced_; - //! Linked list of destructor callbacks. - std::atomic destructorCallbacks_; + //! Linked list of destructor callbacks. 
+ std::atomic destructorCallbacks_; - SharedReference context_; //!< Owning context - Memory* parent_; - const Type type_; //!< Object type (Buffer, Image2D, Image3D) - HostMemoryReference hostMemRef_; //!< Host-side memory reference(or NULL if none) - size_t origin_; - size_t size_; //!< Size in bytes - Flags flags_; //!< Construction flags - size_t version_; //!< Update count, used for coherency - const Device* lastWriter_; //!< Which device wrote most recently (NULL if host) - InteropObject* interopObj_; //!< Interop object - bool isParent_; //!< This object is a parent - device::VirtualDevice* vDev_; //!< Memory object belongs to a virtual device only - bool forceSysMemAlloc_; //!< Forces system memory allocation - std::atomic_uint mapCount_; //!< Keep track of number of mappings for a memory object - void * svmHostAddress_; //!< svm host address; - bool svmPtrCommited_; //!< svm host address committed flag; - bool canBeCached_; //!< flag to if the object can be cached; + SharedReference context_; //!< Owning context + Memory* parent_; + const Type type_; //!< Object type (Buffer, Image2D, Image3D) + HostMemoryReference hostMemRef_; //!< Host-side memory reference(or NULL if none) + size_t origin_; + size_t size_; //!< Size in bytes + Flags flags_; //!< Construction flags + size_t version_; //!< Update count, used for coherency + const Device* lastWriter_; //!< Which device wrote most recently (NULL if host) + InteropObject* interopObj_; //!< Interop object + bool isParent_; //!< This object is a parent + device::VirtualDevice* vDev_; //!< Memory object belongs to a virtual device only + bool forceSysMemAlloc_; //!< Forces system memory allocation + std::atomic_uint mapCount_; //!< Keep track of number of mappings for a memory object + void* svmHostAddress_; //!< svm host address; + bool svmPtrCommited_; //!< svm host address committed flag; + bool canBeCached_; //!< flag to if the object can be cached; -private: - //! 
Disable default assignment operator - Memory& operator=(const Memory&); + private: + //! Disable default assignment operator + Memory& operator=(const Memory&); - //! Disable default copy operator - Memory(const Memory&); + //! Disable default copy operator + Memory(const Memory&); - Monitor lockMemoryOps_; //!< Lock to serialize memory operations - std::list subBuffers_; //!< List of all subbuffers for this memory object - device::Memory* svmBase_; //!< svmBase allocation for MGPU case + Monitor lockMemoryOps_; //!< Lock to serialize memory operations + std::list subBuffers_; //!< List of all subbuffers for this memory object + device::Memory* svmBase_; //!< svmBase allocation for MGPU case -protected: - //! The constructor creates a memory object but does not allocate either host memory - //! or device memory. Default parameters are appropriate for Buffer creation. - Memory( - Context& context, //!< Context object - Type type, //!< Memory type - Flags flags, //!< Object's flags - size_t size, //!< Memory size - void* svmPtr = NULL //!< svm host memory address, NULL if no SVM mem object - ); - Memory( - Memory& parent, //!< Context object - Flags flags, //!< Object's flags - size_t offset, //!< Memory offset - size_t size, //!< Memory size - Type type = 0 //!< Memory type - ); + protected: + //! The constructor creates a memory object but does not allocate either host memory + //! or device memory. Default parameters are appropriate for Buffer creation. + Memory(Context& context, //!< Context object + Type type, //!< Memory type + Flags flags, //!< Object's flags + size_t size, //!< Memory size + void* svmPtr = NULL //!< svm host memory address, NULL if no SVM mem object + ); + Memory(Memory& parent, //!< Context object + Flags flags, //!< Object's flags + size_t offset, //!< Memory offset + size_t size, //!< Memory size + Type type = 0 //!< Memory type + ); - //! Memory object destructor - virtual ~Memory(); + //! 
Memory object destructor + virtual ~Memory(); - //! Copies initialization data to the backing store - virtual void copyToBackingStore( - void* initFrom //!< Pointer to the initialization memory - ); + //! Copies initialization data to the backing store + virtual void copyToBackingStore(void* initFrom //!< Pointer to the initialization memory + ); - //! Initializes the device memory array - virtual void initDeviceMemory(); + //! Initializes the device memory array + virtual void initDeviceMemory(); - void setSize(size_t size) { size_ = size; } - void setInteropObj(InteropObject* obj) { interopObj_ = obj; } + void setSize(size_t size) { size_ = size; } + void setInteropObj(InteropObject* obj) { interopObj_ = obj; } -public: - //! Placement new operator. - void* operator new( - size_t size, //!< Original allocation size - const Context& context //!< Context this memory object is allocated in. - ); - // Provide a "matching" placement delete operator. - void operator delete( - void*, //!< Pointer to deallocate - const Context& context //!< Context this memory object is allocated in. - ); - // and a regular delete operator to satisfy synthesized methods. - void operator delete( - void* //!< Pointer to deallocate - ); + public: + //! Placement new operator. + void* operator new(size_t size, //!< Original allocation size + const Context& context //!< Context this memory object is allocated in. + ); + // Provide a "matching" placement delete operator. + void operator delete(void*, //!< Pointer to deallocate + const Context& context //!< Context this memory object is allocated in. + ); + // and a regular delete operator to satisfy synthesized methods. + void operator delete(void* //!< Pointer to deallocate + ); - //! Returns the memory lock object - amd::Monitor& lockMemoryOps() { return lockMemoryOps_; } + //! Returns the memory lock object + amd::Monitor& lockMemoryOps() { return lockMemoryOps_; } - //! Adds a view into the list - void addSubBuffer(Memory* item); + //! 
Adds a view into the list + void addSubBuffer(Memory* item); - //! virtual function used to distinguish memory objects from other CL objects - virtual ObjectType objectType() const {return ObjectTypeMemory;} + //! virtual function used to distinguish memory objects from other CL objects + virtual ObjectType objectType() const { return ObjectTypeMemory; } - //! Removes a subbuffer from the list - void removeSubBuffer(Memory* item); + //! Removes a subbuffer from the list + void removeSubBuffer(Memory* item); - //! Returns the list of all subbuffers - std::list& subBuffers() { return subBuffers_; } + //! Returns the list of all subbuffers + std::list& subBuffers() { return subBuffers_; } - //! Returns the number of devices - size_t numDevices() const { return numDevices_; } + //! Returns the number of devices + size_t numDevices() const { return numDevices_; } - //! static_cast to Buffer with sanity check - virtual Buffer* asBuffer() { return NULL; } - //! static_cast to Image with sanity check - virtual Image* asImage() { return NULL; } - //! static_cast to Pipe with sanity check - virtual Pipe* asPipe() { return NULL; } + //! static_cast to Buffer with sanity check + virtual Buffer* asBuffer() { return NULL; } + //! static_cast to Image with sanity check + virtual Image* asImage() { return NULL; } + //! static_cast to Pipe with sanity check + virtual Pipe* asPipe() { return NULL; } - //! Creates and initializes device (cache) memory for all devices - virtual bool create( - void* initFrom = NULL, //!< Pointer to the initialization data - bool sysMemAlloc = false //!< Allocate device memory in system memory - ); + //! Creates and initializes device (cache) memory for all devices + virtual bool create(void* initFrom = NULL, //!< Pointer to the initialization data + bool sysMemAlloc = false //!< Allocate device memory in system memory + ); - //! 
Allocates device (cache) memory for a specific device - bool addDeviceMemory( - const Device* dev //!< Device object - ); + //! Allocates device (cache) memory for a specific device + bool addDeviceMemory(const Device* dev //!< Device object + ); - //! Replaces device (cache) memory for a specific device - void replaceDeviceMemory( - const Device* dev, //!< Device object - device::Memory* dm //!< New device memory object for replacement - ); + //! Replaces device (cache) memory for a specific device + void replaceDeviceMemory(const Device* dev, //!< Device object + device::Memory* dm //!< New device memory object for replacement + ); - //! Find the section for the given device. Return NULL if not found. - device::Memory* getDeviceMemory( - const Device& dev, //!< Device object - bool alloc = true //!< Allocates memory - ); + //! Find the section for the given device. Return NULL if not found. + device::Memory* getDeviceMemory(const Device& dev, //!< Device object + bool alloc = true //!< Allocates memory + ); - //! Allocate host memory (as required) - bool allocHostMemory( - void* initFrom, //!< Host memory provided by the application - bool allocHostMem, //!< Force system memory allocation - bool forceCopy = false //!< Force system memory allocation - ); + //! Allocate host memory (as required) + bool allocHostMemory(void* initFrom, //!< Host memory provided by the application + bool allocHostMem, //!< Force system memory allocation + bool forceCopy = false //!< Force system memory allocation + ); - //! Checks if memory was reallocated - bool reallocedDeviceMemory(const Device* dev) - { return (AllocRealloced == deviceAlloced_[dev]) ? true : false; } + //! Checks if memory was reallocated + bool reallocedDeviceMemory(const Device* dev) { + return (AllocRealloced == deviceAlloced_[dev]) ? 
true : false; + } - // Accessors - Memory* parent() const { return parent_; } - bool isParent() const { return isParent_; } + // Accessors + Memory* parent() const { return parent_; } + bool isParent() const { return isParent_; } - size_t getOrigin() const { return origin_; } - size_t getSize() const { return size_; } - Flags getMemFlags() const { return flags_; } - Type getType() const { return type_; } + size_t getOrigin() const { return origin_; } + size_t getSize() const { return size_; } + Flags getMemFlags() const { return flags_; } + Type getType() const { return type_; } - const Device* getLastWriter() { return lastWriter_; } - const HostMemoryReference* getHostMemRef() const { return &hostMemRef_; } - void* getHostMem() const { return hostMemRef_.hostMem(); } - void setHostMem(void* mem) { hostMemRef_.setHostMem(mem, context_()); } + const Device* getLastWriter() { return lastWriter_; } + const HostMemoryReference* getHostMemRef() const { return &hostMemRef_; } + void* getHostMem() const { return hostMemRef_.hostMem(); } + void setHostMem(void* mem) { hostMemRef_.setHostMem(mem, context_()); } - size_t getVersion() const { return version_; } + size_t getVersion() const { return version_; } - Context& getContext() const { return context_(); } - bool isInterop() const { return (getInteropObj() != NULL) ? true : false ; } + Context& getContext() const { return context_(); } + bool isInterop() const { return (getInteropObj() != NULL) ? true : false; } - InteropObject* getInteropObj() const { return interopObj_; } + InteropObject* getInteropObj() const { return interopObj_; } - bool setDestructorCallback(DestructorCallBackFunction callback, void* data); + bool setDestructorCallback(DestructorCallBackFunction callback, void* data); - //! Signal that a write has occurred to a cached version - void signalWrite(const Device* writer); - //! Force an asynchronous writeback from the most-recent dirty cache to host - void cacheWriteBack(void); + //! 
Signal that a write has occurred to a cached version + void signalWrite(const Device* writer); + //! Force an asynchronous writeback from the most-recent dirty cache to host + void cacheWriteBack(void); - //! For CPU device only! - //! Base functions for mapping/unmapping GL/D3D objects - //! Functions may be left empty, if not needed - //! Virtual member function mapExtObjectInCQThread() maps a GL object - //! and store CPU memory pointer in Memory::hostMem_. - //! Returns true if ok, false 0 if error(s) - virtual bool mapExtObjectInCQThread(void) { return true;} + //! For CPU device only! + //! Base functions for mapping/unmapping GL/D3D objects + //! Functions may be left empty, if not needed + //! Virtual member function mapExtObjectInCQThread() maps a GL object + //! and store CPU memory pointer in Memory::hostMem_. + //! Returns true if ok, false 0 if error(s) + virtual bool mapExtObjectInCQThread(void) { return true; } - //! Virtual member functions unmapExtObjectInCQThread() unmaps a GL object - //! and clears pointer Memory::hostMem_. - //! Returns true if ok, false 0 if error(s) - virtual bool unmapExtObjectInCQThread(void) { return true; } + //! Virtual member functions unmapExtObjectInCQThread() unmaps a GL object + //! and clears pointer Memory::hostMem_. + //! Returns true if ok, false 0 if error(s) + virtual bool unmapExtObjectInCQThread(void) { return true; } - //! Returns true if the specified area covers memory intirely - virtual bool isEntirelyCovered( - const Coord3D& origin, //!< Origin location of the covered region - const Coord3D& region //!< Covered region dimensions - ) const = 0; + //! Returns true if the specified area covers memory intirely + virtual bool isEntirelyCovered(const Coord3D& origin, //!< Origin location of the covered region + const Coord3D& region //!< Covered region dimensions + ) const = 0; - //! 
Returns true if the specified area is not degenerate and is inside of allocated memory - virtual bool validateRegion( - const Coord3D& origin, //!< Origin location of the covered region - const Coord3D& region //!< Covered region dimensions - ) const = 0; + //! Returns true if the specified area is not degenerate and is inside of allocated memory + virtual bool validateRegion(const Coord3D& origin, //!< Origin location of the covered region + const Coord3D& region //!< Covered region dimensions + ) const = 0; - void setVirtualDevice(device::VirtualDevice* vDev) { vDev_ = vDev; } - device::VirtualDevice* getVirtualDevice() const { return vDev_; } - bool forceSysMemAlloc() const { return forceSysMemAlloc_; } + void setVirtualDevice(device::VirtualDevice* vDev) { vDev_ = vDev; } + device::VirtualDevice* getVirtualDevice() const { return vDev_; } + bool forceSysMemAlloc() const { return forceSysMemAlloc_; } - void incMapCount() { ++mapCount_; } - void decMapCount() { --mapCount_; } - uint mapCount() const { return mapCount_; } + void incMapCount() { ++mapCount_; } + void decMapCount() { --mapCount_; } + uint mapCount() const { return mapCount_; } - bool usesSvmPointer() const; + bool usesSvmPointer() const; - void * getSvmPtr() const { return svmHostAddress_; } //!< svm pointer accessor; - void setSvmPtr(void * ptr) { svmHostAddress_ = ptr; } //!< svm pointer setter; - bool isSvmPtrCommited() const { return svmPtrCommited_; } //!< svm host address committed accessor; - void commitSvmMemory(); //!< svm host address committed accessor; - void setCacheStatus(bool canBeCached) { canBeCached_ = canBeCached; }//!< set the memobject cached status; - bool canBeCached() const { return canBeCached_; } //!< get the memobject cached status; - device::Memory* svmBase() const { return svmBase_; } //!< Returns SVM base for MGPU case + void* getSvmPtr() const { return svmHostAddress_; } //!< svm pointer accessor; + void setSvmPtr(void* ptr) { svmHostAddress_ = ptr; } //!< svm pointer 
setter; + bool isSvmPtrCommited() const { + return svmPtrCommited_; + } //!< svm host address committed accessor; + void commitSvmMemory(); //!< svm host address committed accessor; + void setCacheStatus(bool canBeCached) { + canBeCached_ = canBeCached; + } //!< set the memobject cached status; + bool canBeCached() const { return canBeCached_; } //!< get the memobject cached status; + device::Memory* svmBase() const { return svmBase_; } //!< Returns SVM base for MGPU case }; //! Buffers are a specialization of memory. Just a wrapper, really, //! but this gives us flexibility for later changes. -class Buffer: public Memory -{ -protected: - cl_bus_address_amd busAddress_; +class Buffer : public Memory { + protected: + cl_bus_address_amd busAddress_; - //! Initializes the device memory array which is nested - // after'Image1DD3D10' object in memory layout. - virtual void initDeviceMemory(); + //! Initializes the device memory array which is nested + // after'Image1DD3D10' object in memory layout. 
+ virtual void initDeviceMemory(); - Buffer(Context& context, Type type, Flags flags, size_t size) : - Memory(context, type, flags, size) - { } + Buffer(Context& context, Type type, Flags flags, size_t size) + : Memory(context, type, flags, size) {} -public: - Buffer(Context& context, Flags flags, size_t size, void* svmPtr = NULL) : - Memory(context, CL_MEM_OBJECT_BUFFER, flags, size, svmPtr) - { } - Buffer(Memory& parent, Flags flags, size_t origin, size_t size) : - Memory(parent, flags, origin, size) - { } + public: + Buffer(Context& context, Flags flags, size_t size, void* svmPtr = NULL) + : Memory(context, CL_MEM_OBJECT_BUFFER, flags, size, svmPtr) {} + Buffer(Memory& parent, Flags flags, size_t origin, size_t size) + : Memory(parent, flags, origin, size) {} - bool create( - void* initFrom = NULL, //!< Pointer to the initialization data - bool sysMemAlloc = false //!< Allocate device memory in system memory - ); + bool create(void* initFrom = NULL, //!< Pointer to the initialization data + bool sysMemAlloc = false //!< Allocate device memory in system memory + ); - //! static_cast to Buffer with sanity check - virtual Buffer* asBuffer() { return this; } + //! static_cast to Buffer with sanity check + virtual Buffer* asBuffer() { return this; } - //! Returns true if the specified area covers buffer entirely - bool isEntirelyCovered( - const Coord3D& origin, //!< Origin location of the covered region - const Coord3D& region //!< Covered region dimensions - ) const; + //! Returns true if the specified area covers buffer entirely + bool isEntirelyCovered(const Coord3D& origin, //!< Origin location of the covered region + const Coord3D& region //!< Covered region dimensions + ) const; - //! Returns true if the specified area is not degenerate and is inside of allocated memory - bool validateRegion( - const Coord3D& origin, //!< Origin location of the covered region - const Coord3D& region //!< Covered region dimensions - ) const; + //! 
Returns true if the specified area is not degenerate and is inside of allocated memory + bool validateRegion(const Coord3D& origin, //!< Origin location of the covered region + const Coord3D& region //!< Covered region dimensions + ) const; - cl_bus_address_amd busAddress() const { return busAddress_; } + cl_bus_address_amd busAddress() const { return busAddress_; } }; //! Pipes are a specialization of Buffers. -class Pipe: public Buffer -{ -protected: - size_t packetSize_; //!< Size in bytes of pipe packet - size_t maxPackets_; //!< Number of max pipe packets - bool initialized_; //!< Mark if the pipe is initialized +class Pipe : public Buffer { + protected: + size_t packetSize_; //!< Size in bytes of pipe packet + size_t maxPackets_; //!< Number of max pipe packets + bool initialized_; //!< Mark if the pipe is initialized - virtual void initDeviceMemory(); -public: - Pipe(Context& context, Flags flags, size_t size, size_t pipe_packet_size, size_t pipe_max_packets) - : Buffer(context, CL_MEM_OBJECT_PIPE, flags, size) - , initialized_(false) - { - packetSize_ = pipe_packet_size; - maxPackets_ = pipe_max_packets; - } + virtual void initDeviceMemory(); - //! static_cast to Pipe with sanity check - virtual Pipe* asPipe() { return this; } + public: + Pipe(Context& context, Flags flags, size_t size, size_t pipe_packet_size, size_t pipe_max_packets) + : Buffer(context, CL_MEM_OBJECT_PIPE, flags, size), initialized_(false) { + packetSize_ = pipe_packet_size; + maxPackets_ = pipe_max_packets; + } - //! Returns pipe size pitch in bytes - size_t getPacketSize() const { return packetSize_; } + //! static_cast to Pipe with sanity check + virtual Pipe* asPipe() { return this; } - //! return max number of pipe packets - size_t getMaxNumPackets() const { return maxPackets_; } + //! Returns pipe size pitch in bytes + size_t getPacketSize() const { return packetSize_; } + + //! return max number of pipe packets + size_t getMaxNumPackets() const { return maxPackets_; } }; //! 
Images are a specialization of memory -class Image : public Memory -{ -public: - // declaration of list of supported formats - static cl_image_format supportedFormats[]; - static cl_image_format supportedFormatsRA[]; - static cl_image_format supportedDepthStencilFormats[]; - static cl_uint numSupportedFormats(const Context& context, cl_mem_object_type image_type, cl_mem_flags flags = 0); - static cl_uint getSupportedFormats( - const Context& context, - cl_mem_object_type image_type, - const cl_uint num_entries, - cl_image_format *image_formats, - cl_mem_flags flags = 0); +class Image : public Memory { + public: + // declaration of list of supported formats + static cl_image_format supportedFormats[]; + static cl_image_format supportedFormatsRA[]; + static cl_image_format supportedDepthStencilFormats[]; + static cl_uint numSupportedFormats(const Context& context, cl_mem_object_type image_type, + cl_mem_flags flags = 0); + static cl_uint getSupportedFormats(const Context& context, cl_mem_object_type image_type, + const cl_uint num_entries, cl_image_format* image_formats, + cl_mem_flags flags = 0); - //! Helper struct to manipulate image formats. - struct Format : public cl_image_format - { - //! Construct a new ImageFormat wrapper. - Format(const cl_image_format& format) { - image_channel_order = format.image_channel_order; - image_channel_data_type = format.image_channel_data_type; - } + //! Helper struct to manipulate image formats. + struct Format : public cl_image_format { + //! Construct a new ImageFormat wrapper. + Format(const cl_image_format& format) { + image_channel_order = format.image_channel_order; + image_channel_data_type = format.image_channel_data_type; + } - //! Return true if this is a valid image format, false otherwise. - bool isValid() const; + //! Return true if this is a valid image format, false otherwise. + bool isValid() const; - //! 
Returns true if this format is supported by runtime, false otherwise - bool isSupported(const Context& context, - cl_mem_object_type image_type = 0, cl_mem_flags flags = 0) const; + //! Returns true if this format is supported by runtime, false otherwise + bool isSupported(const Context& context, cl_mem_object_type image_type = 0, + cl_mem_flags flags = 0) const; - //! Compare 2 image formats. - bool operator == (const Format& rhs) const { - return image_channel_order == rhs.image_channel_order - && image_channel_data_type == rhs.image_channel_data_type; - } - bool operator != (const Format& rhs) const { return !(*this == rhs); } + //! Compare 2 image formats. + bool operator==(const Format& rhs) const { + return image_channel_order == rhs.image_channel_order && + image_channel_data_type == rhs.image_channel_data_type; + } + bool operator!=(const Format& rhs) const { return !(*this == rhs); } - //! Return the number of channels. - size_t getNumChannels() const; + //! Return the number of channels. + size_t getNumChannels() const; - //! Return the element size in bytes. - size_t getElementSize() const; + //! Return the element size in bytes. + size_t getElementSize() const; - //! Get the channel order by indices. R = 0, G = 1, B = 2, A = 3. - void getChannelOrder(uint8_t* channelOrder) const; + //! Get the channel order by indices. R = 0, G = 1, B = 2, A = 3. + void getChannelOrder(uint8_t* channelOrder) const; - //! Adjust colorRGBA according to format, and set it in colorFormat. - void formatColor(const void* colorRGBA, void* colorFormat) const; - }; + //! Adjust colorRGBA according to format, and set it in colorFormat. 
+ void formatColor(const void* colorRGBA, void* colorFormat) const; + }; - struct Impl - { - amd::Coord3D region_; //!< Image size - size_t rp_; //!< Image row pitch - size_t sp_; //!< Image slice pitch - const Format format_; //!< Image format - void* reserved_; - size_t bp_; + struct Impl { + amd::Coord3D region_; //!< Image size + size_t rp_; //!< Image row pitch + size_t sp_; //!< Image slice pitch + const Format format_; //!< Image format + void* reserved_; + size_t bp_; - Impl(const Format& format, Coord3D region, size_t rp, size_t sp = 0, size_t bp = 0) - : region_(region), rp_(rp), sp_(sp), format_(format), bp_(bp) - { DEBUG_ONLY(reserved_ = NULL); } - }; + Impl(const Format& format, Coord3D region, size_t rp, size_t sp = 0, size_t bp = 0) + : region_(region), rp_(rp), sp_(sp), format_(format), bp_(bp) { + DEBUG_ONLY(reserved_ = NULL); + } + }; -private: - Impl impl_; //!< Image object description - size_t dim_; //!< Image dimension - uint mipLevels_; //!< The number of mip levels - uint baseMipLevel_; //!< The base mip level for a view + private: + Impl impl_; //!< Image object description + size_t dim_; //!< Image dimension + uint mipLevels_; //!< The number of mip levels + uint baseMipLevel_; //!< The base mip level for a view -protected: - Image( - const Format& format, - Image& parent, - uint baseMipLevel = 0, - cl_mem_flags flags = 0); + protected: + Image(const Format& format, Image& parent, uint baseMipLevel = 0, cl_mem_flags flags = 0); - ///! Initializes the device memory array which is nested - // after'Image' object in memory layout. - virtual void initDeviceMemory(); + ///! Initializes the device memory array which is nested + // after'Image' object in memory layout. + virtual void initDeviceMemory(); - //! Copies initialization data to the backing store - virtual void copyToBackingStore( - void* initFrom //!< Pointer to the initialization memory - ); + //! 
Copies initialization data to the backing store + virtual void copyToBackingStore(void* initFrom //!< Pointer to the initialization memory + ); - void initDimension(); + void initDimension(); -public: - Image( - Context& context, - Type type, - Flags flags, - const Format& format, - size_t width, - size_t height, - size_t depth, - size_t rowPitch, - size_t slicePitch, - uint mipLevels = 1); + public: + Image(Context& context, Type type, Flags flags, const Format& format, size_t width, size_t height, + size_t depth, size_t rowPitch, size_t slicePitch, uint mipLevels = 1); - Image( - Buffer& buffer, - Type type, - Flags flags, - const Format& format, - size_t width, - size_t height, - size_t depth, - size_t rowPitch, - size_t slicePitch); + Image(Buffer& buffer, Type type, Flags flags, const Format& format, size_t width, size_t height, + size_t depth, size_t rowPitch, size_t slicePitch); - //! Validate image dimensions with supported sizes - static bool validateDimensions( - const std::vector& devices, //!< List of devices for validation - cl_mem_object_type type, //!< Image type - size_t width, //!< Image width - size_t height, //!< Image height - size_t depth, //!< Image depth - size_t arraySize //!< Image array size - ); + //! Validate image dimensions with supported sizes + static bool validateDimensions( + const std::vector& devices, //!< List of devices for validation + cl_mem_object_type type, //!< Image type + size_t width, //!< Image width + size_t height, //!< Image height + size_t depth, //!< Image depth + size_t arraySize //!< Image array size + ); - const Format& getImageFormat() const {return impl_.format_;} + const Format& getImageFormat() const { return impl_.format_; } - //! static_cast to Buffer with sanity check - virtual Image* asImage() { return this; } + //! static_cast to Buffer with sanity check + virtual Image* asImage() { return this; } - //! 
Returns true if specified area covers image entirely - bool isEntirelyCovered( - const Coord3D& origin, //!< Origin location of the covered region - const Coord3D& region //!< Covered region dimensions - ) const; + //! Returns true if specified area covers image entirely + bool isEntirelyCovered(const Coord3D& origin, //!< Origin location of the covered region + const Coord3D& region //!< Covered region dimensions + ) const; - //! Returns true if the specified area is not degenerate and is inside of allocated memory - bool validateRegion( - const Coord3D& origin, //!< Origin location of the covered region - const Coord3D& region //!< Covered region dimensions - ) const; + //! Returns true if the specified area is not degenerate and is inside of allocated memory + bool validateRegion(const Coord3D& origin, //!< Origin location of the covered region + const Coord3D& region //!< Covered region dimensions + ) const; - //! Returns true if the slice value for the image is valid - bool isRowSliceValid( - size_t rowPitch, //!< The row pitch value - size_t slicePitch, //!< The slice pitch value - size_t width, //!< The width of the copy region - size_t height //!< The height of the copy region - ) const; + //! Returns true if the slice value for the image is valid + bool isRowSliceValid(size_t rowPitch, //!< The row pitch value + size_t slicePitch, //!< The slice pitch value + size_t width, //!< The width of the copy region + size_t height //!< The height of the copy region + ) const; - //! Creates a view memory object - virtual Image* createView( - const Context& context, //!< Context for a view creation - const Format& format, //!< The new format for a view - device::VirtualDevice* vDev, //!< Virtual device object - uint baseMipLevel = 0, //!< Base mip level for a view - cl_mem_flags flags = 0 //!< Memory allocation flags - ); + //! 
Creates a view memory object + virtual Image* createView(const Context& context, //!< Context for a view creation + const Format& format, //!< The new format for a view + device::VirtualDevice* vDev, //!< Virtual device object + uint baseMipLevel = 0, //!< Base mip level for a view + cl_mem_flags flags = 0 //!< Memory allocation flags + ); - //! Returns the impl for this image. - Impl& getImpl() { return impl_; } + //! Returns the impl for this image. + Impl& getImpl() { return impl_; } - //! Returns the number of dimensions. - size_t getDims() const { return dim_; } + //! Returns the number of dimensions. + size_t getDims() const { return dim_; } - //! Base virtual methods to be overridden in derived image classes - //! - //! Returns width of image in pixels - size_t getWidth() const { return impl_.region_[0]; } + //! Base virtual methods to be overridden in derived image classes + //! + //! Returns width of image in pixels + size_t getWidth() const { return impl_.region_[0]; } - //! Returns height of image in pixels - size_t getHeight() const { return impl_.region_[1]; } + //! Returns height of image in pixels + size_t getHeight() const { return impl_.region_[1]; } - //! Returns image's row pitch in bytes - size_t getRowPitch() const { return impl_.rp_; } + //! Returns image's row pitch in bytes + size_t getRowPitch() const { return impl_.rp_; } - //! Returns image's byte pitch - size_t getBytePitch() const { return impl_.bp_; } + //! Returns image's byte pitch + size_t getBytePitch() const { return impl_.bp_; } - //! Returns depth of the image in pixels/slices - size_t getDepth() const { return impl_.region_[2]; } + //! Returns depth of the image in pixels/slices + size_t getDepth() const { return impl_.region_[2]; } - //! Returns image's slice pitch in bytes - size_t getSlicePitch() const { return impl_.sp_; } + //! Returns image's slice pitch in bytes + size_t getSlicePitch() const { return impl_.sp_; } - //! 
Returns image's slice pitch in bytes - uint getMipLevels() const { return mipLevels_; } + //! Returns image's slice pitch in bytes + uint getMipLevels() const { return mipLevels_; } - //! Returns image's slice pitch in bytes - uint getBaseMipLevel() const { return baseMipLevel_; } + //! Returns image's slice pitch in bytes + uint getBaseMipLevel() const { return baseMipLevel_; } - //! Get the image covered region - const Coord3D& getRegion() const { return impl_.region_; } + //! Get the image covered region + const Coord3D& getRegion() const { return impl_.region_; } - //! Sets the byte pitch obtained from HWL - void setBytePitch(size_t bytePitch) { impl_.bp_ = bytePitch; } + //! Sets the byte pitch obtained from HWL + void setBytePitch(size_t bytePitch) { impl_.bp_ = bytePitch; } - //! Creates and initializes device (cache) memory for all devices - bool create( - void* initFrom = NULL //!< Pointer to the initialization data - ); + //! Creates and initializes device (cache) memory for all devices + bool create(void* initFrom = NULL //!< Pointer to the initialization data + ); }; //! SVM-related functionality. -class SvmBuffer : AllStatic -{ -public: - //! Allocate a shared buffer that is accessible by all devices in the context - static void* malloc( - Context& context, - cl_svm_mem_flags flags, - size_t size, - size_t alignment); +class SvmBuffer : AllStatic { + public: + //! Allocate a shared buffer that is accessible by all devices in the context + static void* malloc(Context& context, cl_svm_mem_flags flags, size_t size, size_t alignment); - //! Release shared buffer - static void free(const Context& context, void* ptr); + //! Release shared buffer + static void free(const Context& context, void* ptr); - //! Fill the destination buffer \a dst with the contents of the source - //! buffer \a src \times times. - static void memFill( - void* dst, - const void* src, - size_t srcSize, - size_t times); + //! 
Fill the destination buffer \a dst with the contents of the source + //! buffer \a src \times times. + static void memFill(void* dst, const void* src, size_t srcSize, size_t times); - //! Return true if \a ptr is a pointer allocated using SvmBuffer::malloc - //! that has not been deallocated afterwards - static bool malloced(const void* ptr); + //! Return true if \a ptr is a pointer allocated using SvmBuffer::malloc + //! that has not been deallocated afterwards + static bool malloced(const void* ptr); -private: - static void Add(uintptr_t k, uintptr_t v); - static void Remove(uintptr_t k); - static bool Contains(uintptr_t ptr); + private: + static void Add(uintptr_t k, uintptr_t v); + static void Remove(uintptr_t k); + static bool Contains(uintptr_t ptr); - static std::map Allocated_; // !< Allocated buffers - static Monitor AllocatedLock_; + static std::map Allocated_; // !< Allocated buffers + static Monitor AllocatedLock_; }; //! Liquid flash extension -class LiquidFlashFile : public RuntimeObject -{ -private: - const wchar_t* name_; - cl_file_flags_amd flags_; - void* handle_; - uint32_t blockSize_; - uint64_t fileSize_; -public: - LiquidFlashFile(const wchar_t* name, cl_file_flags_amd flags) - : name_(name), flags_(flags), handle_(NULL) ,blockSize_(0),fileSize_(0) { } +class LiquidFlashFile : public RuntimeObject { + private: + const wchar_t* name_; + cl_file_flags_amd flags_; + void* handle_; + uint32_t blockSize_; + uint64_t fileSize_; - ~LiquidFlashFile(); + public: + LiquidFlashFile(const wchar_t* name, cl_file_flags_amd flags) + : name_(name), flags_(flags), handle_(NULL), blockSize_(0), fileSize_(0) {} - bool open(); - void close(); + ~LiquidFlashFile(); - uint32_t blockSize() const { return blockSize_; }; - uint64_t fileSize() const { return fileSize_; }; + bool open(); + void close(); - bool transferBlock( - bool read, - void* dst, - uint64_t bufferSize, - uint64_t fileOffset, - uint64_t bufferOffset, - uint64_t size) const; + uint32_t blockSize() 
const { return blockSize_; }; + uint64_t fileSize() const { return fileSize_; }; - virtual ObjectType objectType() const { return ObjectTypeLiquidFlashFile; } + bool transferBlock(bool read, void* dst, uint64_t bufferSize, uint64_t fileOffset, + uint64_t bufferOffset, uint64_t size) const; + + virtual ObjectType objectType() const { return ObjectTypeLiquidFlashFile; } }; -} // namespace amd +} // namespace amd -#endif // MEMORY_H_ +#endif // MEMORY_H_ diff --git a/rocclr/runtime/platform/ndrange.cpp b/rocclr/runtime/platform/ndrange.cpp index 1f1cdb80aa..08cc4d7b0e 100644 --- a/rocclr/runtime/platform/ndrange.cpp +++ b/rocclr/runtime/platform/ndrange.cpp @@ -6,65 +6,47 @@ namespace amd { -NDRange::NDRange(size_t dimensions) - : dimensions_(dimensions) -{ - *this = 0; +NDRange::NDRange(size_t dimensions) : dimensions_(dimensions) { *this = 0; } + +NDRange::NDRange(const NDRange& space) : dimensions_(space.dimensions_) { *this = space; } + +NDRange& NDRange::operator=(size_t x) { + for (size_t i = 0; i < dimensions_; ++i) { + data_[i] = x; + } + return *this; } -NDRange::NDRange(const NDRange& space) - : dimensions_(space.dimensions_) -{ - *this = space; -} +NDRange::~NDRange() {} -NDRange& -NDRange::operator = (size_t x) -{ - for (size_t i = 0; i < dimensions_; ++i) { - data_[i] = x; +bool NDRange::operator==(const NDRange& x) const { + assert(dimensions_ == x.dimensions_ && "dimensions mismatch"); + + for (size_t i = 0; i < dimensions_; ++i) { + if (data_[i] != x.data_[i]) { + return false; } - return *this; + } + return true; } -NDRange::~NDRange() -{ -} - -bool -NDRange::operator == (const NDRange& x) const -{ - assert(dimensions_ == x.dimensions_ && "dimensions mismatch"); - - for (size_t i = 0; i < dimensions_; ++i) { - if (data_[i] != x.data_[i]) { - return false; - } +bool NDRange::operator==(size_t x) const { + for (size_t i = 0; i < dimensions_; ++i) { + if (data_[i] != x) { + return false; } - return true; -} - -bool -NDRange::operator == (size_t x) const 
-{ - for (size_t i = 0; i < dimensions_; ++i) { - if (data_[i] != x) { - return false; - } - } - return true; + } + return true; } #ifdef DEBUG -void -NDRange::printOn(FILE* file) const -{ - fprintf(file, "["); - for (size_t i = dimensions_ - 1; i > 0; --i) { - fprintf(file, SIZE_T_FMT ", ", data_[i]); - } - fprintf(file, SIZE_T_FMT "]", data_[0]); +void NDRange::printOn(FILE* file) const { + fprintf(file, "["); + for (size_t i = dimensions_ - 1; i > 0; --i) { + fprintf(file, SIZE_T_FMT ", ", data_[i]); + } + fprintf(file, SIZE_T_FMT "]", data_[0]); } -#endif // DEBUG +#endif // DEBUG -} // namespace amd +} // namespace amd diff --git a/rocclr/runtime/platform/ndrange.hpp b/rocclr/runtime/platform/ndrange.hpp index 403c7b72d3..3c92f2b183 100644 --- a/rocclr/runtime/platform/ndrange.hpp +++ b/rocclr/runtime/platform/ndrange.hpp @@ -10,8 +10,8 @@ #include #ifdef DEBUG -# include -#endif // DEBUG +#include +#endif // DEBUG namespace amd { @@ -23,182 +23,159 @@ namespace amd { */ //! An N-dimensions index space. -class NDRange : public EmbeddedObject -{ -private: +class NDRange : public EmbeddedObject { + private: + const size_t dimensions_ : 2; //!< Number of dimensions [0-3] + size_t data_[3]; //!< indexes array - const size_t dimensions_ : 2; //!< Number of dimensions [0-3] - size_t data_[3]; //!< indexes array - -private: - - //! Construct a new index space for an array of elements (no-copy) - NDRange(size_t dimensions, size_t* elements) - : dimensions_(dimensions) - { - for (uint i = 0; i < dimensions_; ++i) { - data_[i] = elements[i]; - } + private: + //! Construct a new index space for an array of elements (no-copy) + NDRange(size_t dimensions, size_t* elements) : dimensions_(dimensions) { + for (uint i = 0; i < dimensions_; ++i) { + data_[i] = elements[i]; } + } -public: - //! Construct a new index space of the given dimensions. - explicit NDRange(size_t dimensions); + public: + //! Construct a new index space of the given dimensions. 
+ explicit NDRange(size_t dimensions); - //! Copy constructor. - NDRange(const NDRange& space); + //! Copy constructor. + NDRange(const NDRange& space); - //! Destroy the index space. - ~NDRange(); + //! Destroy the index space. + ~NDRange(); - //! Copy operator - inline NDRange& operator = (const NDRange& space); + //! Copy operator + inline NDRange& operator=(const NDRange& space); - //! Make all elements of this space equal to x. - NDRange& operator = (size_t x); + //! Make all elements of this space equal to x. + NDRange& operator=(size_t x); - //! Return the number of dimensions. - size_t dimensions() const { return dimensions_; } + //! Return the number of dimensions. + size_t dimensions() const { return dimensions_; } - //! Return the element at the given \a index. - size_t& operator [] (size_t index) - { - assert(index < dimensions_ && "index is out of bounds"); - return data_[index]; - } + //! Return the element at the given \a index. + size_t& operator[](size_t index) { + assert(index < dimensions_ && "index is out of bounds"); + return data_[index]; + } - //! Return the element at the given \a index. - size_t operator [] (size_t index) const - { - assert(index < dimensions_ && "index is out of bounds"); - return data_[index]; - } + //! Return the element at the given \a index. + size_t operator[](size_t index) const { + assert(index < dimensions_ && "index is out of bounds"); + return data_[index]; + } - //! Return the sum of this index space elements. - inline size_t sum() const; + //! Return the sum of this index space elements. + inline size_t sum() const; - //! Return the product of this index space elements (size) - inline size_t product() const; + //! 
Return the product of this index space elements (size) + inline size_t product() const; - // Binary operators: - inline friend NDRange operator + (const NDRange& x, const NDRange& y); - inline friend NDRange operator - (const NDRange& x, const NDRange& y); - inline friend NDRange operator * (const NDRange& x, const NDRange& y); - inline friend NDRange operator / (const NDRange& x, const NDRange& y); - inline friend NDRange operator % (const NDRange& x, const NDRange& y); + // Binary operators: + inline friend NDRange operator+(const NDRange& x, const NDRange& y); + inline friend NDRange operator-(const NDRange& x, const NDRange& y); + inline friend NDRange operator*(const NDRange& x, const NDRange& y); + inline friend NDRange operator/(const NDRange& x, const NDRange& y); + inline friend NDRange operator%(const NDRange& x, const NDRange& y); - //! Return true if this index space is identical to \a x. - bool operator == (const NDRange& x) const; + //! Return true if this index space is identical to \a x. + bool operator==(const NDRange& x) const; - //! Return true if this index space and \a x are different. - bool operator != (const NDRange& x) const { return !(*this == x); } + //! Return true if this index space and \a x are different. + bool operator!=(const NDRange& x) const { return !(*this == x); } - //! Return true if all elements are equal to \a x. - bool operator == (size_t x) const; + //! Return true if all elements are equal to \a x. + bool operator==(size_t x) const; - //! Return true if one element of this space is not equal to \a x. - bool operator != (size_t x) const { return !(*this == x); } + //! Return true if one element of this space is not equal to \a x. + bool operator!=(size_t x) const { return !(*this == x); } #ifdef DEBUG - //! Print this index space on the given stream. - void printOn(FILE* file) const; -#endif // DEBUG - + //! Print this index space on the given stream. + void printOn(FILE* file) const; +#endif // DEBUG }; //! 
A container for the local and global worksizes. -class NDRangeContainer : public HeapObject -{ -private: - const size_t dimensions_; //!< Number of dimensions. - NDRange offset_; //!< Global work-item offset. - NDRange global_; //!< Total number of work-items in N-dims - NDRange local_; //!< Number of work-items in N-dims in a workgroup. +class NDRangeContainer : public HeapObject { + private: + const size_t dimensions_; //!< Number of dimensions. + NDRange offset_; //!< Global work-item offset. + NDRange global_; //!< Total number of work-items in N-dims + NDRange local_; //!< Number of work-items in N-dims in a workgroup. -public: - /*! \brief Construct a new nd-range container with the given local - * and global worksizes in \a nDimensions dimensions. - */ - NDRangeContainer( - size_t dimensions, - const size_t* globalWorkOffset, - const size_t* globalWorkSize, - const size_t* localWorkSize - ) : dimensions_(dimensions), - offset_(dimensions), global_(dimensions), local_(dimensions) - { - for (size_t i = 0; i < dimensions; ++i) { - offset_[i] = globalWorkOffset != NULL ? globalWorkOffset[i] : 0; - global_[i] = globalWorkSize[i]; - local_[i] = localWorkSize[i]; - } + public: + /*! \brief Construct a new nd-range container with the given local + * and global worksizes in \a nDimensions dimensions. + */ + NDRangeContainer(size_t dimensions, const size_t* globalWorkOffset, const size_t* globalWorkSize, + const size_t* localWorkSize) + : dimensions_(dimensions), offset_(dimensions), global_(dimensions), local_(dimensions) { + for (size_t i = 0; i < dimensions; ++i) { + offset_[i] = globalWorkOffset != NULL ? globalWorkOffset[i] : 0; + global_[i] = globalWorkSize[i]; + local_[i] = localWorkSize[i]; } + } - //! Return the number of dimensions. - size_t dimensions() const { return dimensions_; } + //! Return the number of dimensions. + size_t dimensions() const { return dimensions_; } - //! Return the global workoffset. 
- const NDRange& offset() const { return offset_; } - NDRange& offset() { return offset_; } - //! Return the global worksize. - const NDRange& global() const { return global_; } - NDRange& global() { return global_; } - //! Return the local worksize. - const NDRange& local() const { return local_; } - NDRange& local() { return local_; } + //! Return the global workoffset. + const NDRange& offset() const { return offset_; } + NDRange& offset() { return offset_; } + //! Return the global worksize. + const NDRange& global() const { return global_; } + NDRange& global() { return global_; } + //! Return the local worksize. + const NDRange& local() const { return local_; } + NDRange& local() { return local_; } }; - /*! @}\ * @} */ -inline size_t -NDRange::sum() const -{ - size_t result = data_[0]; - for (size_t i = 1; i < dimensions_; ++i) { - result += data_[i]; - } - return result; +inline size_t NDRange::sum() const { + size_t result = data_[0]; + for (size_t i = 1; i < dimensions_; ++i) { + result += data_[i]; + } + return result; } -inline size_t -NDRange::product() const -{ - size_t result = data_[0]; - for (size_t i = 1; i < dimensions_; ++i) { - result *= data_[i]; - } - return result; +inline size_t NDRange::product() const { + size_t result = data_[0]; + for (size_t i = 1; i < dimensions_; ++i) { + result *= data_[i]; + } + return result; } // This function is in this header file for performance improvements: -inline NDRange& -NDRange::operator = (const NDRange& space) -{ - assert(dimensions_ == space.dimensions_ && "dimensions mismatch"); - for (size_t i = 0; i < sizeof(data_)/sizeof(*data_); ++i) { - data_[i] = space.data_[i]; - } - return *this; +inline NDRange& NDRange::operator=(const NDRange& space) { + assert(dimensions_ == space.dimensions_ && "dimensions mismatch"); + for (size_t i = 0; i < sizeof(data_) / sizeof(*data_); ++i) { + data_[i] = space.data_[i]; + } + return *this; } -#define DEFINE_NDRANGE_BINARY_OP(op) \ -inline NDRange \ -operator op 
(const NDRange& x, const NDRange& y) \ -{ \ - assert(x.dimensions_ == y.dimensions_ && "dimensions mismatch"); \ - \ - size_t dimensions = x.dimensions_; \ - size_t result[3] = {0}; \ - for (size_t i = 0; i < dimensions; ++i) { \ - result[i] = x.data_[i] op y.data_[i]; \ - } \ - \ - return NDRange(dimensions, &result[0]); \ -} +#define DEFINE_NDRANGE_BINARY_OP(op) \ + inline NDRange operator op(const NDRange& x, const NDRange& y) { \ + assert(x.dimensions_ == y.dimensions_ && "dimensions mismatch"); \ + \ + size_t dimensions = x.dimensions_; \ + size_t result[3] = {0}; \ + for (size_t i = 0; i < dimensions; ++i) { \ + result[i] = x.data_[i] op y.data_[i]; \ + } \ + \ + return NDRange(dimensions, &result[0]); \ + } DEFINE_NDRANGE_BINARY_OP(+); DEFINE_NDRANGE_BINARY_OP(-); @@ -208,6 +185,6 @@ DEFINE_NDRANGE_BINARY_OP(%); #undef DEFINE_NDRANGE_BINARY_OP -} // namespace amd +} // namespace amd #endif /*NDRANGE_HPP_*/ diff --git a/rocclr/runtime/platform/object.cpp b/rocclr/runtime/platform/object.cpp index 0afc510405..c400d02813 100644 --- a/rocclr/runtime/platform/object.cpp +++ b/rocclr/runtime/platform/object.cpp @@ -8,81 +8,68 @@ namespace amd { -Atomic -ObjectMetadata::nextKey_ = 1; +Atomic ObjectMetadata::nextKey_ = 1; -ObjectMetadata::Destructor -ObjectMetadata::destructors_[OCL_MAX_KEYS] = { NULL }; +ObjectMetadata::Destructor ObjectMetadata::destructors_[OCL_MAX_KEYS] = {NULL}; -bool -ObjectMetadata::check(Key key) -{ - return key > 0 && key <= OCL_MAX_KEYS; +bool ObjectMetadata::check(Key key) { return key > 0 && key <= OCL_MAX_KEYS; } + +ObjectMetadata::Key ObjectMetadata::createKey(Destructor destructor) { + Key key = nextKey_++; + + if (!check(key)) { + return 0; + } + + destructors_[key - 1] = destructor; + return key; } -ObjectMetadata::Key -ObjectMetadata::createKey(Destructor destructor) -{ - Key key = nextKey_++; +ObjectMetadata::~ObjectMetadata() { + if (!values_) { + return; + } - if (!check(key)) { - return 0; + for (size_t i = 0; i < OCL_MAX_KEYS; 
++i) { + if (values_[i] && destructors_[i]) { + destructors_[i](values_[i]); } + } - destructors_[key-1] = destructor; - return key; + delete[] values_; } -ObjectMetadata::~ObjectMetadata() -{ - if (!values_) { - return; - } +void* ObjectMetadata::getValueForKey(Key key) const { + if (!values_ || !check(key)) { + return NULL; + } - for (size_t i = 0; i < OCL_MAX_KEYS; ++i) { - if (values_[i] && destructors_[i]) { - destructors_[i](values_[i]); - } - } - - delete[] values_; + return values_[key - 1]; } -void* -ObjectMetadata::getValueForKey(Key key) const -{ - if (!values_ || !check(key)) { - return NULL; - } +bool ObjectMetadata::setValueForKey(Key key, Value value) { + if (!check(key)) { + return false; + } - return values_[key-1]; + while (!values_) { + Value* values = new Value[OCL_MAX_KEYS]; + memset(values, '\0', sizeof(Value) * OCL_MAX_KEYS); + + if (!values_.compareAndSet(NULL, values)) { + delete[] values; + } + } + + size_t index = key - 1; + Value prev = AtomicOperation::swap(value, &values_[index]); + if (prev && destructors_[index] != NULL) { + destructors_[index](prev); + } + + return true; } -bool -ObjectMetadata::setValueForKey(Key key, Value value) -{ - if (!check(key)) { - return false; - } - - while (!values_) { - Value* values = new Value[OCL_MAX_KEYS]; - memset(values, '\0', sizeof(Value) * OCL_MAX_KEYS); - - if (!values_.compareAndSet(NULL, values)) { - delete[] values; - } - } - - size_t index = key-1; - Value prev = AtomicOperation::swap(value, &values_[index]); - if (prev && destructors_[index] != NULL) { - destructors_[index](prev); - } - - return true; -} - -} // namespace amd +} // namespace amd diff --git a/rocclr/runtime/platform/object.hpp b/rocclr/runtime/platform/object.hpp index 8dc9fcbac1..2593bd1571 100644 --- a/rocclr/runtime/platform/object.hpp +++ b/rocclr/runtime/platform/object.hpp @@ -10,27 +10,30 @@ #include "thread/monitor.hpp" #include "utils/util.hpp" -#define CL_TYPES_DO(F) \ - /* OpenCL type Runtime type */ \ - 
F(cl_context, Context) \ - F(cl_event, Event) \ - F(cl_command_queue, CommandQueue) \ - F(cl_kernel, Kernel) \ - F(cl_program, Program) \ - F(cl_device_id, Device) \ - F(cl_mem, Memory) \ - F(cl_sampler, Sampler) \ - F(cl_counter_amd, Counter) \ - F(cl_perfcounter_amd, PerfCounter) \ - F(cl_threadtrace_amd, ThreadTrace) \ - F(cl_file_amd, LiquidFlashFile) +#define CL_TYPES_DO(F) \ + /* OpenCL type Runtime type */ \ + F(cl_context, Context) \ + F(cl_event, Event) \ + F(cl_command_queue, CommandQueue) \ + F(cl_kernel, Kernel) \ + F(cl_program, Program) \ + F(cl_device_id, Device) \ + F(cl_mem, Memory) \ + F(cl_sampler, Sampler) \ + F(cl_counter_amd, Counter) \ + F(cl_perfcounter_amd, PerfCounter) \ + F(cl_threadtrace_amd, ThreadTrace) \ + F(cl_file_amd, LiquidFlashFile) // Forward declare ::cl_* types and amd::Class types // -#define DECLARE_CL_TYPES(CL,AMD) \ -namespace amd { class AMD; } \ -typedef struct _##CL { } * CL; +#define DECLARE_CL_TYPES(CL, AMD) \ + namespace amd { \ + class AMD; \ + } \ + typedef struct _##CL { \ + } * CL; CL_TYPES_DO(DECLARE_CL_TYPES); @@ -43,178 +46,127 @@ namespace amd { // Define the cl_*_type tokens for type checking. // -#define DEFINE_CL_TOKENS(CL,ignored) T##CL, +#define DEFINE_CL_TOKENS(CL, ignored) T##CL, -enum cl_token -{ - Tinvalid = 0, - CL_TYPES_DO(DEFINE_CL_TOKENS) - numTokens -}; +enum cl_token { Tinvalid = 0, CL_TYPES_DO(DEFINE_CL_TOKENS) numTokens }; #undef DEFINE_CL_TOKENS -const size_t RuntimeObjectAlignment = - NextPowerOfTwo::value; +const size_t RuntimeObjectAlignment = NextPowerOfTwo::value; //! 
\cond ignore -template -struct as_internal -{ typedef void type; }; +template struct as_internal { typedef void type; }; -template -struct as_external -{ typedef void type; }; +template struct as_external { typedef void type; }; -template -struct class_token -{ static const cl_token value = Tinvalid; }; +template struct class_token { static const cl_token value = Tinvalid; }; -#define DEFINE_CL_TRAITS(CL,AMD) \ - \ -template <> \ -struct class_token \ -{ static const cl_token value = T##CL; }; \ - \ -template <> \ -struct as_internal<_##CL> \ -{ typedef AMD type; }; \ -template <> \ -struct as_internal \ -{ typedef AMD const type; }; \ - \ -template <> \ -struct as_external \ -{ typedef _##CL type; }; \ -template <> \ -struct as_external \ -{ typedef _##CL const type; }; +#define DEFINE_CL_TRAITS(CL, AMD) \ + \ + template <> struct class_token { static const cl_token value = T##CL; }; \ + \ + template <> struct as_internal<_##CL> { typedef AMD type; }; \ + template <> struct as_internal { typedef AMD const type; }; \ + \ + template <> struct as_external { typedef _##CL type; }; \ + template <> struct as_external { typedef _##CL const type; }; CL_TYPES_DO(DEFINE_CL_TRAITS); #undef DEFINE_CL_TRAITS //! 
\endcond -struct ICDDispatchedObject -{ - static struct KHRicdVendorDispatchRec icdVendorDispatch_[]; - const struct KHRicdVendorDispatchRec* const dispatch_; +struct ICDDispatchedObject { + static struct KHRicdVendorDispatchRec icdVendorDispatch_[]; + const struct KHRicdVendorDispatchRec* const dispatch_; -protected: - ICDDispatchedObject() : dispatch_(icdVendorDispatch_) { } + protected: + ICDDispatchedObject() : dispatch_(icdVendorDispatch_) {} -public: - static bool isValidHandle(const void* handle) - { - return handle != NULL; - } + public: + static bool isValidHandle(const void* handle) { return handle != NULL; } - const void* handle() const - { - return static_cast(this); - } - void* handle() - { - return static_cast(this); - } + const void* handle() const { return static_cast(this); } + void* handle() { return static_cast(this); } - template - static const T* fromHandle(const void *handle) - { - return static_cast( - reinterpret_cast(handle)); - } - template - static T* fromHandle(void *handle) - { - return static_cast( - reinterpret_cast(handle)); - } + template static const T* fromHandle(const void* handle) { + return static_cast(reinterpret_cast(handle)); + } + template static T* fromHandle(void* handle) { + return static_cast(reinterpret_cast(handle)); + } }; #define OCL_MAX_KEYS 8 /*! The object metadata container. 
*/ -class ObjectMetadata -{ +class ObjectMetadata { + public: + typedef size_t Key; + typedef void* Value; -public: - typedef size_t Key; - typedef void* Value; + private: + typedef void(CL_CALLBACK* Destructor)(Value); -private: - typedef void (CL_CALLBACK * Destructor)(Value); + static Atomic nextKey_; + static Destructor destructors_[OCL_MAX_KEYS]; - static Atomic nextKey_; - static Destructor destructors_[OCL_MAX_KEYS]; + Atomic values_; - Atomic values_; + public: + static bool check(Key key); -public: - static bool check(Key key); + static Key createKey(Destructor destructor = NULL); - static Key createKey(Destructor destructor = NULL); + ObjectMetadata() : values_(NULL) {} + ~ObjectMetadata(); - ObjectMetadata() : values_(NULL) { } - ~ObjectMetadata(); + Value getValueForKey(Key key) const; - Value getValueForKey(Key key) const; - - bool setValueForKey(Key key, Value value); + bool setValueForKey(Key key, Value value); }; /*! \brief For all OpenCL/Runtime objects. */ -class RuntimeObject : public ReferenceCountedObject, public ICDDispatchedObject -{ -private: - ObjectMetadata metadata_; +class RuntimeObject : public ReferenceCountedObject, public ICDDispatchedObject { + private: + ObjectMetadata metadata_; -public: + public: + enum ObjectType { + ObjectTypeContext = 0, + ObjectTypeDevice = 1, + ObjectTypeMemory = 2, + ObjectTypeKernel = 3, + ObjectTypeCounter = 4, + ObjectTypePerfCounter = 5, + ObjectTypeEvent = 6, + ObjectTypeProgram = 7, + ObjectTypeQueue = 8, + ObjectTypeSampler = 9, + ObjectTypeThreadTrace = 10, + ObjectTypeLiquidFlashFile = 11 + }; - enum ObjectType { - ObjectTypeContext = 0, - ObjectTypeDevice = 1, - ObjectTypeMemory = 2, - ObjectTypeKernel = 3, - ObjectTypeCounter = 4, - ObjectTypePerfCounter = 5, - ObjectTypeEvent = 6, - ObjectTypeProgram = 7, - ObjectTypeQueue = 8, - ObjectTypeSampler = 9, - ObjectTypeThreadTrace = 10, - ObjectTypeLiquidFlashFile = 11 - }; - - ObjectMetadata& metadata() { return metadata_; } - virtual ObjectType 
objectType() const =0 ; + ObjectMetadata& metadata() { return metadata_; } + virtual ObjectType objectType() const = 0; }; -template -class SharedReference : public EmbeddedObject -{ -private: - T& reference_; +template class SharedReference : public EmbeddedObject { + private: + T& reference_; -private: - // do not copy shared references. - SharedReference& operator = (const SharedReference& sref); + private: + // do not copy shared references. + SharedReference& operator=(const SharedReference& sref); -public: - explicit SharedReference(T& reference) - : reference_(reference) - { - reference_.retain(); - } + public: + explicit SharedReference(T& reference) : reference_(reference) { reference_.retain(); } - ~SharedReference() - { - reference_.release(); - } + ~SharedReference() { reference_.release(); } - T& operator ()() const { return reference_; } + T& operator()() const { return reference_; } }; /*! \brief A 1,2 or 3D coordinate. @@ -223,48 +175,38 @@ public: *! of non-zero elements. (i.e. a 1D line is not the same as a 2D plane with width 1) */ -struct Coord3D -{ - size_t c[3]; +struct Coord3D { + size_t c[3]; - Coord3D(size_t d0, size_t d1 = 0, size_t d2 = 0) - { - c[0]=d0; c[1]=d1; c[2]=d2; - } - const size_t& operator[] (size_t idx) const - { - assert(idx < 3); - return c[idx]; - } - bool operator== (const Coord3D& rhs) const - { - return c[0] == rhs.c[0] && c[1] == rhs.c[1] && c[2] == rhs.c[2]; - } + Coord3D(size_t d0, size_t d1 = 0, size_t d2 = 0) { + c[0] = d0; + c[1] = d1; + c[2] = d2; + } + const size_t& operator[](size_t idx) const { + assert(idx < 3); + return c[idx]; + } + bool operator==(const Coord3D& rhs) const { + return c[0] == rhs.c[0] && c[1] == rhs.c[1] && c[2] == rhs.c[2]; + } }; -} // namespace amd +} // namespace amd -template -typename amd::as_internal::type* -as_amd(CL* cl_obj) -{ - return cl_obj == NULL ? 
NULL : amd::RuntimeObject::fromHandle< - typename amd::as_internal::type>(static_cast(cl_obj)); +template typename amd::as_internal::type* as_amd(CL* cl_obj) { + return cl_obj == NULL ? NULL + : amd::RuntimeObject::fromHandle::type>( + static_cast(cl_obj)); } -template -typename amd::as_external::type* -as_cl(AMD* amd_obj) -{ - return amd_obj == NULL ? NULL : static_cast< - typename amd::as_external::type*>(amd_obj->handle()); +template typename amd::as_external::type* as_cl(AMD* amd_obj) { + return amd_obj == NULL ? NULL + : static_cast::type*>(amd_obj->handle()); } -template -bool -is_valid(CL* handle) -{ - return amd::as_internal::type::isValidHandle(handle); +template bool is_valid(CL* handle) { + return amd::as_internal::type::isValidHandle(handle); } #endif /*OBJECT_HPP_*/ diff --git a/rocclr/runtime/platform/perfctr.hpp b/rocclr/runtime/platform/perfctr.hpp index b3f94e5f77..b7aed5b1ef 100644 --- a/rocclr/runtime/platform/perfctr.hpp +++ b/rocclr/runtime/platform/perfctr.hpp @@ -22,44 +22,42 @@ namespace amd { * * \brief The container class for the performance counters */ -class PerfCounter : public RuntimeObject -{ -public: - typedef std::map Properties; +class PerfCounter : public RuntimeObject { + public: + typedef std::map Properties; - //! Constructor of the performance counter object - PerfCounter( - const Device& device, //!< device object - Properties& properties) //!< a list of properties - : properties_(properties) - , deviceCounter_(NULL) - , device_(device) - { } + //! Constructor of the performance counter object + PerfCounter(const Device& device, //!< device object + Properties& properties) //!< a list of properties + : properties_(properties), + deviceCounter_(NULL), + device_(device) {} - //! Get the performance counter's result - const Device& device() const { return device_; } + //! Get the performance counter's result + const Device& device() const { return device_; } - //! 
Get the properties - const Properties& properties() const { return properties_; } + //! Get the properties + const Properties& properties() const { return properties_; } - //! Get the device performance counter - const device::PerfCounter* getDeviceCounter() const { return deviceCounter_; } + //! Get the device performance counter + const device::PerfCounter* getDeviceCounter() const { return deviceCounter_; } - //! Set the device performance counter - void setDeviceCounter(device::PerfCounter* counter) { deviceCounter_ = counter; } + //! Set the device performance counter + void setDeviceCounter(device::PerfCounter* counter) { deviceCounter_ = counter; } - //! RTTI internal implementation - virtual ObjectType objectType() const {return ObjectTypePerfCounter;} -protected: - //! Destructor for PerfCounter class - ~PerfCounter() { delete deviceCounter_; } + //! RTTI internal implementation + virtual ObjectType objectType() const { return ObjectTypePerfCounter; } - Properties properties_; //!< the perf counter properties - device::PerfCounter* deviceCounter_; //!< device performance counter - const Device& device_; //!< the device object + protected: + //! 
Destructor for PerfCounter class + ~PerfCounter() { delete deviceCounter_; } + + Properties properties_; //!< the perf counter properties + device::PerfCounter* deviceCounter_; //!< device performance counter + const Device& device_; //!< the device object }; /*@}*/ /*@}*/ } // namespace amd -#endif // PERFCTR_HPP_ +#endif // PERFCTR_HPP_ diff --git a/rocclr/runtime/platform/program.cpp b/rocclr/runtime/platform/program.cpp index a4c7b3238a..979b4ccb29 100644 --- a/rocclr/runtime/platform/program.cpp +++ b/rocclr/runtime/platform/program.cpp @@ -10,8 +10,8 @@ #include "utils/bif_section_labels.hpp" #include "acl.h" -#include // for malloc -#include // for strcmp +#include // for malloc +#include // for strcmp #include #include #include @@ -19,666 +19,619 @@ namespace amd { -Program::~Program() -{ - // Destroy all device programs - deviceprograms_t::const_iterator it, itEnd; - for (it = devicePrograms_.begin(), itEnd = devicePrograms_.end(); - it != itEnd; ++it) { - delete it->second; - } +Program::~Program() { + // Destroy all device programs + deviceprograms_t::const_iterator it, itEnd; + for (it = devicePrograms_.begin(), itEnd = devicePrograms_.end(); it != itEnd; ++it) { + delete it->second; + } - for (devicebinary_t::const_iterator IT = binary_.begin(), IE = binary_.end(); - IT != IE; ++IT) { - const binary_t& Bin = IT->second; - if (Bin.first) { - delete [] Bin.first; - } + for (devicebinary_t::const_iterator IT = binary_.begin(), IE = binary_.end(); IT != IE; ++IT) { + const binary_t& Bin = IT->second; + if (Bin.first) { + delete[] Bin.first; } + } - delete symbolTable_; - //! @todo Make sure we have destroyed all CPU specific objects + delete symbolTable_; + //! @todo Make sure we have destroyed all CPU specific objects } -const Symbol* -Program::findSymbol(const char* kernelName) const -{ - symbols_t::const_iterator it = symbolTable_->find(kernelName); - return (it == symbolTable_->end()) ? 
NULL : &it->second; +const Symbol* Program::findSymbol(const char* kernelName) const { + symbols_t::const_iterator it = symbolTable_->find(kernelName); + return (it == symbolTable_->end()) ? NULL : &it->second; } -cl_int -Program::addDeviceProgram(Device& device, const void* image, size_t length, - amd::option::Options* options) -{ +cl_int Program::addDeviceProgram(Device& device, const void* image, size_t length, + amd::option::Options* options) { #if defined(WITH_LIGHTNING_COMPILER) - // LC binary must be in ELF format - if (image != NULL && !amd::isElfMagic((const char *) image)) { - return CL_INVALID_BINARY; - } -#else // !defined(WITH_LIGHTNING_COMPILER) - if (image != NULL && - !aclValidateBinaryImage(image, length, - isSPIRV_?BINARY_TYPE_SPIRV:BINARY_TYPE_ELF|BINARY_TYPE_LLVM)) { - return CL_INVALID_BINARY; - } -#endif // !defined(WITH_LIGHTNING_COMPILER) + // LC binary must be in ELF format + if (image != NULL && !amd::isElfMagic((const char*)image)) { + return CL_INVALID_BINARY; + } +#else // !defined(WITH_LIGHTNING_COMPILER) + if (image != NULL && + !aclValidateBinaryImage(image, length, + isSPIRV_ ? 
BINARY_TYPE_SPIRV : BINARY_TYPE_ELF | BINARY_TYPE_LLVM)) { + return CL_INVALID_BINARY; + } +#endif // !defined(WITH_LIGHTNING_COMPILER) - // Check if the device is already associated with this program - if (deviceList_.find(&device) != deviceList_.end()) { - return CL_INVALID_VALUE; - } + // Check if the device is already associated with this program + if (deviceList_.find(&device) != deviceList_.end()) { + return CL_INVALID_VALUE; + } - Device& rootDev = device.rootDevice(); + Device& rootDev = device.rootDevice(); - // if the rootDev is already associated with a program - if (devicePrograms_[&rootDev] != NULL) { - return CL_SUCCESS; - } - bool emptyOptions = false; - amd::option::Options emptyOpts; - if (options == NULL) { - options = &emptyOpts; - emptyOptions = true; - } - -#if !defined(WITH_LIGHTNING_COMPILER) - if (image != NULL && length != 0 && aclValidateBinaryImage(image, length, BINARY_TYPE_ELF)) { - acl_error errorCode; - aclBinary *binary = aclReadFromMem(image, length, &errorCode); - if (errorCode != ACL_SUCCESS) { - return CL_INVALID_BINARY; - } - const oclBIFSymbolStruct* symbol = findBIF30SymStruct(symOpenclCompilerOptions); - assert(symbol && "symbol not found"); - std::string symName = std::string(symbol->str[bif::PRE]) + std::string(symbol->str[bif::POST]); - size_t symSize = 0; - const void *opts = aclExtractSymbol(device.compiler(), - binary, &symSize, aclCOMMENT, symName.c_str(), &errorCode); - // if we have options from binary and input options was not specified - if (opts != NULL && emptyOptions) { - std::string sBinOptions = std::string((char*)opts, symSize); - if (!amd::option::parseAllOptions(sBinOptions, *options)) { - programLog_ = options->optionsLog(); - LogError("Parsing compilation options from binary failed."); - return CL_INVALID_COMPILER_OPTIONS; - } - } - options->oVariables->Legacy = isAMDILTarget(*aclutGetTargetInfo(binary)); - aclBinaryFini(binary); - } -#endif // !defined(WITH_LIGHTNING_COMPILER) - 
options->oVariables->BinaryIsSpirv = isSPIRV_; - device::Program* program = rootDev.createProgram(options); - if (program == NULL) { - return CL_OUT_OF_HOST_MEMORY; - } - - if (image != NULL) { - uint8_t* memory = binary(rootDev).first; - // clone 'binary' (it is owned by the host thread). - if (memory == NULL) { - memory = new (std::nothrow) uint8_t[length]; - if (memory == NULL) { - delete program; - return CL_OUT_OF_HOST_MEMORY; - } - - ::memcpy(memory, image, length); - - // Save the original image - binary_[&rootDev] = std::make_pair(memory, length); - } - - if (!program->setBinary(reinterpret_cast(memory), length)) { - delete program; - return CL_INVALID_BINARY; - } - -#if defined(WITH_LIGHTNING_COMPILER) - // load the compiler options from the binary if it is not provided - std::string sBinOptions = program->compileOptions(); - if (!sBinOptions.empty() && emptyOptions) { - if (!amd::option::parseAllOptions(sBinOptions, *options)) { - programLog_ = options->optionsLog(); - LogError("Parsing compilation options from binary failed."); - return CL_INVALID_COMPILER_OPTIONS; - } - } -#endif - } - - devicePrograms_[&rootDev] = program; - - deviceList_.insert(&device); + // if the rootDev is already associated with a program + if (devicePrograms_[&rootDev] != NULL) { return CL_SUCCESS; -} + } + bool emptyOptions = false; + amd::option::Options emptyOpts; + if (options == NULL) { + options = &emptyOpts; + emptyOptions = true; + } -device::Program* -Program::getDeviceProgram(const Device& device) const -{ - deviceprograms_t::const_iterator it = - devicePrograms_.find(&device.rootDevice()); - if (it == devicePrograms_.end()) { - return NULL; - } - return it->second; -} - -Monitor -Program::buildLock_("OCL build program", true); - -cl_int -Program::compile( - const std::vector& devices, - size_t numHeaders, - const std::vector& headerPrograms, - const char** headerIncludeNames, - const char* options, - void (CL_CALLBACK * notifyFptr)(cl_program, void *), - void* data, - 
bool optionChangable) -{ - ScopedLock sl(buildLock_); - - cl_int retval = CL_SUCCESS; - - // Clear the program object - clear(); - - // Process build options. - std::string cppstr(options ? options : ""); - - // if there is a -ignore-env, adjust options. - if (cppstr.size() > 0) { - // Set the options to be the string after -ignore-env - size_t pos = cppstr.find("-ignore-env"); - if (pos != std::string::npos) { - cppstr = cppstr.substr(pos+sizeof("-ignore-env")); - optionChangable = false; - } - } - option::Options parsedOptions; - if (!ParseAllOptions(cppstr, parsedOptions, optionChangable)) { - programLog_ = parsedOptions.optionsLog(); - LogError("Parsing compile options failed."); - return CL_INVALID_COMPILER_OPTIONS; - } - - std::vector headers(numHeaders); - for (size_t i = 0; i < numHeaders; ++i) { - const std::string& header = headerPrograms[i]->sourceCode(); - headers[i] = &header; - } - - // Compile the program programs associated with the given devices. - std::vector::const_iterator it; - for (it = devices.begin(); it != devices.end(); ++it) { - device::Program* devProgram = getDeviceProgram(**it); - if (devProgram == NULL) { - const binary_t& bin = binary(**it); - retval = addDeviceProgram(**it, bin.first, bin.second, &parsedOptions); - if (retval != CL_SUCCESS) { - return retval; - } - devProgram = getDeviceProgram(**it); - } - - if (devProgram->type() == device::Program::TYPE_INTERMEDIATE || - isSPIRV_) { - continue; - } - // We only build a Device-Program once - if (devProgram->buildStatus() != CL_BUILD_NONE) { - continue; - } - if (sourceCode_.empty()) { - return CL_INVALID_OPERATION; - } - cl_int result = devProgram->compile( - sourceCode_, headers, - headerIncludeNames, - options, - &parsedOptions); - - // Check if the previous device failed a build - if ((result != CL_SUCCESS) && (retval != CL_SUCCESS)) { - retval = CL_INVALID_OPERATION; - } - // Update the returned value with a build error - else if (result != CL_SUCCESS) { - retval = result; - } 
- } - - if (notifyFptr != NULL) { - notifyFptr(as_cl(this), data); - } - - return retval; -} - -cl_int -Program::link( - const std::vector& devices, - size_t numInputs, - const std::vector& inputPrograms, - const char* options, - void (CL_CALLBACK * notifyFptr)(cl_program, void *), - void* data, - bool optionChangable) -{ - ScopedLock sl(buildLock_); - cl_int retval = CL_SUCCESS; - - if (symbolTable_ == NULL) { - symbolTable_ = new symbols_t; - if (symbolTable_ == NULL) { - return CL_OUT_OF_HOST_MEMORY; - } - } - - // Clear the program object - clear(); - - // Process build options. - std::string cppstr(options ? options : ""); - - // if there is a -ignore-env, adjust options. - if (cppstr.size() > 0) { - // Set the options to be the string after -ignore-env - size_t pos = cppstr.find("-ignore-env"); - if (pos != std::string::npos) { - cppstr = cppstr.substr(pos+sizeof("-ignore-env")); - optionChangable = false; - } - } - option::Options parsedOptions; - if (!ParseAllOptions(cppstr, parsedOptions, optionChangable, true)) { - programLog_ = parsedOptions.optionsLog(); - LogError("Parsing link options failed."); - return CL_INVALID_LINKER_OPTIONS; - } - - // Link the program programs associated with the given devices. 
- std::vector::const_iterator it; - for (it = devices.begin(); it != devices.end(); ++it) { - // find the corresponding device program in each input program - std::vector inputDevPrograms(numInputs); - bool found = false; - for (size_t i = 0; i < numInputs; ++i) { - Program& inputProgram = *inputPrograms[i]; - if (inputProgram.isSPIRV_) { - parsedOptions.oVariables->BinaryIsSpirv = inputProgram.isSPIRV_; - } - deviceprograms_t inputDevProgs = inputProgram.devicePrograms(); - deviceprograms_t::const_iterator findIt = inputDevProgs.find(*it); - if (findIt == inputDevProgs.end()) { - if (found) break; - continue; - } - inputDevPrograms[i] = findIt->second; - device::Program::binary_t binary = inputDevPrograms[i]->binary(); - // Check the binary's target for the first found device program. - // TODO: Revise these binary's target checks - // and possibly remove them after switching to HSAIL by default. #if !defined(WITH_LIGHTNING_COMPILER) - if (!found && binary.first != NULL && binary.second > 0) { - acl_error errorCode = ACL_SUCCESS; - void *mem = const_cast(binary.first); - aclBinary* aclBin = aclReadFromMem(mem, binary.second, &errorCode); - if (errorCode != ACL_SUCCESS) { - LogWarning("Error while linking: Could not read from raw binary."); - return CL_INVALID_BINARY; - } - if (isHSAILTarget(*aclutGetTargetInfo(aclBin))) { - parsedOptions.oVariables->Frontend = "clang"; - } else if (isAMDILTarget(*aclutGetTargetInfo(aclBin))) { - parsedOptions.oVariables->Frontend = "edg"; - } - aclBinaryFini(aclBin); - } -#endif // !defined(WITH_LIGHTNING_COMPILER) - found = true; - } - if (inputDevPrograms.size() == 0) { - continue; - } - if (inputDevPrograms.size() < numInputs) { - return CL_INVALID_VALUE; - } - - device::Program* devProgram = getDeviceProgram(**it); - if (devProgram == NULL) { - const binary_t& bin = binary(**it); - retval = addDeviceProgram(**it, bin.first, bin.second, &parsedOptions); - if (retval != CL_SUCCESS) { - return retval; - } - devProgram = 
getDeviceProgram(**it); - } - - // We only build a Device-Program once - if (devProgram->buildStatus() != CL_BUILD_NONE) { - continue; - } - cl_int result = devProgram->link( - inputDevPrograms, options, &parsedOptions); - - // Check if the previous device failed a build - if ((result != CL_SUCCESS) && (retval != CL_SUCCESS)) { - retval = CL_INVALID_OPERATION; - } - // Update the returned value with a build error - else if (result != CL_SUCCESS) { - retval = result; - } + if (image != NULL && length != 0 && aclValidateBinaryImage(image, length, BINARY_TYPE_ELF)) { + acl_error errorCode; + aclBinary* binary = aclReadFromMem(image, length, &errorCode); + if (errorCode != ACL_SUCCESS) { + return CL_INVALID_BINARY; } - - if (retval != CL_SUCCESS) { - return retval; - } - - // Rebuild the symbol table - deviceprograms_t::iterator sit; - for (sit = devicePrograms_.begin(); sit != devicePrograms_.end(); ++sit) { - const Device& device = *sit->first; - const device::Program& program = *sit->second; - - const device::Program::kernels_t& kernels = program.kernels(); - device::Program::kernels_t::const_iterator kit; - for (kit = kernels.begin(); kit != kernels.end(); ++kit) { - const std::string& name = kit->first; - const device::Kernel* devKernel = kit->second; - - Symbol& symbol = (*symbolTable_)[name]; - if (!symbol.setDeviceKernel(device, devKernel)) { - retval = CL_LINK_PROGRAM_FAILURE; - } - } - } - - // Create a string with all kernel names from the program - if (kernelNames_.length() == 0) { - amd::Program::symbols_t::const_iterator it; - for (it = symbols().begin(); it != symbols().end(); ++it) { - if (it != symbols().begin()) { - kernelNames_.append(1, ';'); - } - kernelNames_.append(it->first.c_str()); - } - } - - if (notifyFptr != NULL) { - notifyFptr(as_cl(this), data); - } - - return retval; -} - -void Program::StubProgramSource(const std::string& app_name) -{ - static uint program_counter = 0; - std::fstream stub_read; - std::stringstream file_name; - 
std::string app_name_no_ext; - - std::size_t length = app_name.rfind(".exe"); - if (length == std::string::npos) { - length = app_name.size(); - } - app_name_no_ext.assign(app_name.c_str(), length); - - // Construct a unique file name for the CL program - file_name << app_name_no_ext << "_program_" << program_counter << ".cl"; - - stub_read.open(file_name.str().c_str(), (std::fstream::in | std::fstream::binary)); - // Check if we have OpenCL program - if (stub_read.is_open()) { - // Find the stream size - stub_read.seekg(0, std::fstream::end); - size_t size = stub_read.tellg(); - stub_read.seekg(0, std::ios::beg); - - char* data = new char[size]; - stub_read.read(data, size); - stub_read.close(); - - sourceCode_.assign(data, size); - delete[] data; - } - else { - std::fstream stub_write; - stub_write.open(file_name.str().c_str(), (std::fstream::out | std::fstream::binary)); - stub_write << sourceCode_; - stub_write.close(); - } - program_counter++; -} - -cl_int -Program::build( - const std::vector& devices, - const char* options, - void (CL_CALLBACK * notifyFptr)(cl_program, void *), - void* data, - bool optionChangable) -{ - ScopedLock sl(buildLock_); - cl_int retval = CL_SUCCESS; - - if (symbolTable_ == NULL) { - symbolTable_ = new symbols_t; - if (symbolTable_ == NULL) { - return CL_OUT_OF_HOST_MEMORY; - } - } - - if (OCL_STUB_PROGRAMS && !sourceCode_.empty()) { - // The app name should be the samme for all device - StubProgramSource(devices[0]->appProfile()->appFileName()); - } - - // Clear the program object - clear(); - - // Process build options. - std::string cppstr(options ? options : ""); - - // if there is a -ignore-env, adjust options. 
- if (cppstr.size() > 0) { - // Set the options to be the string after -ignore-env - size_t pos = cppstr.find("-ignore-env"); - if (pos != std::string::npos) { - cppstr = cppstr.substr(pos+sizeof("-ignore-env")); - optionChangable = false; - } - } - option::Options parsedOptions; - if (!ParseAllOptions(cppstr, parsedOptions, optionChangable)) { - programLog_ = parsedOptions.optionsLog(); - LogError("Parsing compile options failed."); + const oclBIFSymbolStruct* symbol = findBIF30SymStruct(symOpenclCompilerOptions); + assert(symbol && "symbol not found"); + std::string symName = std::string(symbol->str[bif::PRE]) + std::string(symbol->str[bif::POST]); + size_t symSize = 0; + const void* opts = aclExtractSymbol(device.compiler(), binary, &symSize, aclCOMMENT, + symName.c_str(), &errorCode); + // if we have options from binary and input options was not specified + if (opts != NULL && emptyOptions) { + std::string sBinOptions = std::string((char*)opts, symSize); + if (!amd::option::parseAllOptions(sBinOptions, *options)) { + programLog_ = options->optionsLog(); + LogError("Parsing compilation options from binary failed."); return CL_INVALID_COMPILER_OPTIONS; + } + } + options->oVariables->Legacy = isAMDILTarget(*aclutGetTargetInfo(binary)); + aclBinaryFini(binary); + } +#endif // !defined(WITH_LIGHTNING_COMPILER) + options->oVariables->BinaryIsSpirv = isSPIRV_; + device::Program* program = rootDev.createProgram(options); + if (program == NULL) { + return CL_OUT_OF_HOST_MEMORY; + } + + if (image != NULL) { + uint8_t* memory = binary(rootDev).first; + // clone 'binary' (it is owned by the host thread). + if (memory == NULL) { + memory = new (std::nothrow) uint8_t[length]; + if (memory == NULL) { + delete program; + return CL_OUT_OF_HOST_MEMORY; + } + + ::memcpy(memory, image, length); + + // Save the original image + binary_[&rootDev] = std::make_pair(memory, length); } - // Build the program programs associated with the given devices. 
- std::vector::const_iterator it; - for (it = devices.begin(); it != devices.end(); ++it) { - device::Program* devProgram = getDeviceProgram(**it); - if (devProgram == NULL) { - const binary_t& bin = binary(**it); - if (sourceCode_.empty() && (bin.first == NULL)) { - retval = false; - continue; - } - retval = addDeviceProgram(**it, bin.first, bin.second, &parsedOptions); - if (retval != CL_SUCCESS) { - return retval; - } - devProgram = getDeviceProgram(**it); - } - - parsedOptions.oVariables->AssumeAlias = true; - - // We only build a Device-Program once - if (devProgram->buildStatus() != CL_BUILD_NONE) { - continue; - } - cl_int result = devProgram->build(sourceCode_, options, &parsedOptions); - - // Check if the previous device failed a build - if ((result != CL_SUCCESS) && (retval != CL_SUCCESS)) { - retval = CL_INVALID_OPERATION; - } - // Update the returned value with a build error - else if (result != CL_SUCCESS) { - retval = result; - } + if (!program->setBinary(reinterpret_cast(memory), length)) { + delete program; + return CL_INVALID_BINARY; } - if (retval != CL_SUCCESS) { - return retval; +#if defined(WITH_LIGHTNING_COMPILER) + // load the compiler options from the binary if it is not provided + std::string sBinOptions = program->compileOptions(); + if (!sBinOptions.empty() && emptyOptions) { + if (!amd::option::parseAllOptions(sBinOptions, *options)) { + programLog_ = options->optionsLog(); + LogError("Parsing compilation options from binary failed."); + return CL_INVALID_COMPILER_OPTIONS; + } } +#endif + } - // Rebuild the symbol table - deviceprograms_t::iterator sit; - for (sit = devicePrograms_.begin(); sit != devicePrograms_.end(); ++sit) { - const Device& device = *sit->first; - const device::Program& program = *sit->second; + devicePrograms_[&rootDev] = program; - const device::Program::kernels_t& kernels = program.kernels(); - device::Program::kernels_t::const_iterator kit; - for (kit = kernels.begin(); kit != kernels.end(); ++kit) { - const 
std::string& name = kit->first; - const device::Kernel* devKernel = kit->second; - - Symbol& symbol = (*symbolTable_)[name]; - if (!symbol.setDeviceKernel(device, devKernel)) { - retval = CL_BUILD_PROGRAM_FAILURE; - } - } - } - - // Create a string with all kernel names from the program - if (kernelNames_.length() == 0) { - amd::Program::symbols_t::const_iterator it; - for (it = symbols().begin(); it != symbols().end(); ++it) { - if (it != symbols().begin()) { - kernelNames_.append(1, ';'); - } - kernelNames_.append(it->first.c_str()); - } - } - - if (notifyFptr != NULL) { - notifyFptr(as_cl(this), data); - } - - return retval; + deviceList_.insert(&device); + return CL_SUCCESS; } -void -Program::clear() -{ - deviceprograms_t::iterator sit; - - // Destroy old programs if we have any - for (sit = devicePrograms_.begin(); sit != devicePrograms_.end(); ++sit) { - // Destroy device program - delete sit->second; - } - - devicePrograms_.clear(); - deviceList_.clear(); - if (symbolTable_) symbolTable_->clear(); - kernelNames_.clear(); -} - -int -Program::GetOclCVersion(const char* clVer) { - // default version - int version = 12; - if (clVer == NULL) { - return version; - } - std::string clStd(clVer); - if (clStd.size() != 5) { - return version; - } - clStd.erase(0,2); - clStd.erase(1,1); - return std::stoi(clStd); -} - -bool -Program::ParseAllOptions(const std::string& options, option::Options& parsedOptions, bool optionChangable, bool linkOptsOnly) { - std::string allOpts = options; - if (optionChangable) { - if (linkOptsOnly) { - if (AMD_OCL_LINK_OPTIONS != NULL) { - allOpts.append(" "); - allOpts.append(AMD_OCL_LINK_OPTIONS); - } - if (AMD_OCL_LINK_OPTIONS_APPEND != NULL) { - allOpts.append(" "); - allOpts.append(AMD_OCL_LINK_OPTIONS_APPEND); - } - } else { - if (AMD_OCL_BUILD_OPTIONS != NULL) { - allOpts.append(" "); - allOpts.append(AMD_OCL_BUILD_OPTIONS); - } - if (!Device::appProfile()->GetBuildOptsAppend().empty()) { - allOpts.append(" "); - 
allOpts.append(Device::appProfile()->GetBuildOptsAppend()); - } - if (AMD_OCL_BUILD_OPTIONS_APPEND != NULL) { - allOpts.append(" "); - allOpts.append(AMD_OCL_BUILD_OPTIONS_APPEND); - } - } - } - return amd::option::parseAllOptions(allOpts, parsedOptions, linkOptsOnly); -} - -bool -Symbol::setDeviceKernel( - const Device& device, - const device::Kernel* func, - bool noAlias) -{ - // FIXME_lmoriche: check that the signatures are compatible - if (deviceKernels_.size() == 0 || device.type() == CL_DEVICE_TYPE_CPU) { - signature_ = func->signature(); - } - - if (noAlias) { - deviceKernels_[&device] = func; - } - else { - devKernelsNoOpt_[&device] = func; - } - return true; -} - -const device::Kernel* -Symbol::getDeviceKernel(const Device& device, bool noAlias) const -{ - const devicekernels_t* devKernels = - (noAlias) ? &deviceKernels_ : &devKernelsNoOpt_; - devicekernels_t::const_iterator itEnd = devKernels->end(); - devicekernels_t::const_iterator it = devKernels->find(&device); - if (it != itEnd) { - return it->second; - } - - for (it = devKernels->begin(); it != itEnd; ++it) { - if (it->first->isAncestor(&device)) { - return it->second; - } - } - +device::Program* Program::getDeviceProgram(const Device& device) const { + deviceprograms_t::const_iterator it = devicePrograms_.find(&device.rootDevice()); + if (it == devicePrograms_.end()) { return NULL; + } + return it->second; } -} // namespace amd +Monitor Program::buildLock_("OCL build program", true); + +cl_int Program::compile(const std::vector& devices, size_t numHeaders, + const std::vector& headerPrograms, + const char** headerIncludeNames, const char* options, + void(CL_CALLBACK* notifyFptr)(cl_program, void*), void* data, + bool optionChangable) { + ScopedLock sl(buildLock_); + + cl_int retval = CL_SUCCESS; + + // Clear the program object + clear(); + + // Process build options. + std::string cppstr(options ? options : ""); + + // if there is a -ignore-env, adjust options. 
+ if (cppstr.size() > 0) { + // Set the options to be the string after -ignore-env + size_t pos = cppstr.find("-ignore-env"); + if (pos != std::string::npos) { + cppstr = cppstr.substr(pos + sizeof("-ignore-env")); + optionChangable = false; + } + } + option::Options parsedOptions; + if (!ParseAllOptions(cppstr, parsedOptions, optionChangable)) { + programLog_ = parsedOptions.optionsLog(); + LogError("Parsing compile options failed."); + return CL_INVALID_COMPILER_OPTIONS; + } + + std::vector headers(numHeaders); + for (size_t i = 0; i < numHeaders; ++i) { + const std::string& header = headerPrograms[i]->sourceCode(); + headers[i] = &header; + } + + // Compile the program programs associated with the given devices. + std::vector::const_iterator it; + for (it = devices.begin(); it != devices.end(); ++it) { + device::Program* devProgram = getDeviceProgram(**it); + if (devProgram == NULL) { + const binary_t& bin = binary(**it); + retval = addDeviceProgram(**it, bin.first, bin.second, &parsedOptions); + if (retval != CL_SUCCESS) { + return retval; + } + devProgram = getDeviceProgram(**it); + } + + if (devProgram->type() == device::Program::TYPE_INTERMEDIATE || isSPIRV_) { + continue; + } + // We only build a Device-Program once + if (devProgram->buildStatus() != CL_BUILD_NONE) { + continue; + } + if (sourceCode_.empty()) { + return CL_INVALID_OPERATION; + } + cl_int result = + devProgram->compile(sourceCode_, headers, headerIncludeNames, options, &parsedOptions); + + // Check if the previous device failed a build + if ((result != CL_SUCCESS) && (retval != CL_SUCCESS)) { + retval = CL_INVALID_OPERATION; + } + // Update the returned value with a build error + else if (result != CL_SUCCESS) { + retval = result; + } + } + + if (notifyFptr != NULL) { + notifyFptr(as_cl(this), data); + } + + return retval; +} + +cl_int Program::link(const std::vector& devices, size_t numInputs, + const std::vector& inputPrograms, const char* options, + void(CL_CALLBACK* 
notifyFptr)(cl_program, void*), void* data, + bool optionChangable) { + ScopedLock sl(buildLock_); + cl_int retval = CL_SUCCESS; + + if (symbolTable_ == NULL) { + symbolTable_ = new symbols_t; + if (symbolTable_ == NULL) { + return CL_OUT_OF_HOST_MEMORY; + } + } + + // Clear the program object + clear(); + + // Process build options. + std::string cppstr(options ? options : ""); + + // if there is a -ignore-env, adjust options. + if (cppstr.size() > 0) { + // Set the options to be the string after -ignore-env + size_t pos = cppstr.find("-ignore-env"); + if (pos != std::string::npos) { + cppstr = cppstr.substr(pos + sizeof("-ignore-env")); + optionChangable = false; + } + } + option::Options parsedOptions; + if (!ParseAllOptions(cppstr, parsedOptions, optionChangable, true)) { + programLog_ = parsedOptions.optionsLog(); + LogError("Parsing link options failed."); + return CL_INVALID_LINKER_OPTIONS; + } + + // Link the program programs associated with the given devices. + std::vector::const_iterator it; + for (it = devices.begin(); it != devices.end(); ++it) { + // find the corresponding device program in each input program + std::vector inputDevPrograms(numInputs); + bool found = false; + for (size_t i = 0; i < numInputs; ++i) { + Program& inputProgram = *inputPrograms[i]; + if (inputProgram.isSPIRV_) { + parsedOptions.oVariables->BinaryIsSpirv = inputProgram.isSPIRV_; + } + deviceprograms_t inputDevProgs = inputProgram.devicePrograms(); + deviceprograms_t::const_iterator findIt = inputDevProgs.find(*it); + if (findIt == inputDevProgs.end()) { + if (found) break; + continue; + } + inputDevPrograms[i] = findIt->second; + device::Program::binary_t binary = inputDevPrograms[i]->binary(); +// Check the binary's target for the first found device program. +// TODO: Revise these binary's target checks +// and possibly remove them after switching to HSAIL by default. 
+#if !defined(WITH_LIGHTNING_COMPILER) + if (!found && binary.first != NULL && binary.second > 0) { + acl_error errorCode = ACL_SUCCESS; + void* mem = const_cast(binary.first); + aclBinary* aclBin = aclReadFromMem(mem, binary.second, &errorCode); + if (errorCode != ACL_SUCCESS) { + LogWarning("Error while linking: Could not read from raw binary."); + return CL_INVALID_BINARY; + } + if (isHSAILTarget(*aclutGetTargetInfo(aclBin))) { + parsedOptions.oVariables->Frontend = "clang"; + } else if (isAMDILTarget(*aclutGetTargetInfo(aclBin))) { + parsedOptions.oVariables->Frontend = "edg"; + } + aclBinaryFini(aclBin); + } +#endif // !defined(WITH_LIGHTNING_COMPILER) + found = true; + } + if (inputDevPrograms.size() == 0) { + continue; + } + if (inputDevPrograms.size() < numInputs) { + return CL_INVALID_VALUE; + } + + device::Program* devProgram = getDeviceProgram(**it); + if (devProgram == NULL) { + const binary_t& bin = binary(**it); + retval = addDeviceProgram(**it, bin.first, bin.second, &parsedOptions); + if (retval != CL_SUCCESS) { + return retval; + } + devProgram = getDeviceProgram(**it); + } + + // We only build a Device-Program once + if (devProgram->buildStatus() != CL_BUILD_NONE) { + continue; + } + cl_int result = devProgram->link(inputDevPrograms, options, &parsedOptions); + + // Check if the previous device failed a build + if ((result != CL_SUCCESS) && (retval != CL_SUCCESS)) { + retval = CL_INVALID_OPERATION; + } + // Update the returned value with a build error + else if (result != CL_SUCCESS) { + retval = result; + } + } + + if (retval != CL_SUCCESS) { + return retval; + } + + // Rebuild the symbol table + deviceprograms_t::iterator sit; + for (sit = devicePrograms_.begin(); sit != devicePrograms_.end(); ++sit) { + const Device& device = *sit->first; + const device::Program& program = *sit->second; + + const device::Program::kernels_t& kernels = program.kernels(); + device::Program::kernels_t::const_iterator kit; + for (kit = kernels.begin(); kit != 
kernels.end(); ++kit) { + const std::string& name = kit->first; + const device::Kernel* devKernel = kit->second; + + Symbol& symbol = (*symbolTable_)[name]; + if (!symbol.setDeviceKernel(device, devKernel)) { + retval = CL_LINK_PROGRAM_FAILURE; + } + } + } + + // Create a string with all kernel names from the program + if (kernelNames_.length() == 0) { + amd::Program::symbols_t::const_iterator it; + for (it = symbols().begin(); it != symbols().end(); ++it) { + if (it != symbols().begin()) { + kernelNames_.append(1, ';'); + } + kernelNames_.append(it->first.c_str()); + } + } + + if (notifyFptr != NULL) { + notifyFptr(as_cl(this), data); + } + + return retval; +} + +void Program::StubProgramSource(const std::string& app_name) { + static uint program_counter = 0; + std::fstream stub_read; + std::stringstream file_name; + std::string app_name_no_ext; + + std::size_t length = app_name.rfind(".exe"); + if (length == std::string::npos) { + length = app_name.size(); + } + app_name_no_ext.assign(app_name.c_str(), length); + + // Construct a unique file name for the CL program + file_name << app_name_no_ext << "_program_" << program_counter << ".cl"; + + stub_read.open(file_name.str().c_str(), (std::fstream::in | std::fstream::binary)); + // Check if we have OpenCL program + if (stub_read.is_open()) { + // Find the stream size + stub_read.seekg(0, std::fstream::end); + size_t size = stub_read.tellg(); + stub_read.seekg(0, std::ios::beg); + + char* data = new char[size]; + stub_read.read(data, size); + stub_read.close(); + + sourceCode_.assign(data, size); + delete[] data; + } else { + std::fstream stub_write; + stub_write.open(file_name.str().c_str(), (std::fstream::out | std::fstream::binary)); + stub_write << sourceCode_; + stub_write.close(); + } + program_counter++; +} + +cl_int Program::build(const std::vector& devices, const char* options, + void(CL_CALLBACK* notifyFptr)(cl_program, void*), void* data, + bool optionChangable) { + ScopedLock sl(buildLock_); + cl_int 
retval = CL_SUCCESS; + + if (symbolTable_ == NULL) { + symbolTable_ = new symbols_t; + if (symbolTable_ == NULL) { + return CL_OUT_OF_HOST_MEMORY; + } + } + + if (OCL_STUB_PROGRAMS && !sourceCode_.empty()) { + // The app name should be the samme for all device + StubProgramSource(devices[0]->appProfile()->appFileName()); + } + + // Clear the program object + clear(); + + // Process build options. + std::string cppstr(options ? options : ""); + + // if there is a -ignore-env, adjust options. + if (cppstr.size() > 0) { + // Set the options to be the string after -ignore-env + size_t pos = cppstr.find("-ignore-env"); + if (pos != std::string::npos) { + cppstr = cppstr.substr(pos + sizeof("-ignore-env")); + optionChangable = false; + } + } + option::Options parsedOptions; + if (!ParseAllOptions(cppstr, parsedOptions, optionChangable)) { + programLog_ = parsedOptions.optionsLog(); + LogError("Parsing compile options failed."); + return CL_INVALID_COMPILER_OPTIONS; + } + + // Build the program programs associated with the given devices. 
+ std::vector::const_iterator it; + for (it = devices.begin(); it != devices.end(); ++it) { + device::Program* devProgram = getDeviceProgram(**it); + if (devProgram == NULL) { + const binary_t& bin = binary(**it); + if (sourceCode_.empty() && (bin.first == NULL)) { + retval = false; + continue; + } + retval = addDeviceProgram(**it, bin.first, bin.second, &parsedOptions); + if (retval != CL_SUCCESS) { + return retval; + } + devProgram = getDeviceProgram(**it); + } + + parsedOptions.oVariables->AssumeAlias = true; + + // We only build a Device-Program once + if (devProgram->buildStatus() != CL_BUILD_NONE) { + continue; + } + cl_int result = devProgram->build(sourceCode_, options, &parsedOptions); + + // Check if the previous device failed a build + if ((result != CL_SUCCESS) && (retval != CL_SUCCESS)) { + retval = CL_INVALID_OPERATION; + } + // Update the returned value with a build error + else if (result != CL_SUCCESS) { + retval = result; + } + } + + if (retval != CL_SUCCESS) { + return retval; + } + + // Rebuild the symbol table + deviceprograms_t::iterator sit; + for (sit = devicePrograms_.begin(); sit != devicePrograms_.end(); ++sit) { + const Device& device = *sit->first; + const device::Program& program = *sit->second; + + const device::Program::kernels_t& kernels = program.kernels(); + device::Program::kernels_t::const_iterator kit; + for (kit = kernels.begin(); kit != kernels.end(); ++kit) { + const std::string& name = kit->first; + const device::Kernel* devKernel = kit->second; + + Symbol& symbol = (*symbolTable_)[name]; + if (!symbol.setDeviceKernel(device, devKernel)) { + retval = CL_BUILD_PROGRAM_FAILURE; + } + } + } + + // Create a string with all kernel names from the program + if (kernelNames_.length() == 0) { + amd::Program::symbols_t::const_iterator it; + for (it = symbols().begin(); it != symbols().end(); ++it) { + if (it != symbols().begin()) { + kernelNames_.append(1, ';'); + } + kernelNames_.append(it->first.c_str()); + } + } + + if (notifyFptr 
!= NULL) { + notifyFptr(as_cl(this), data); + } + + return retval; +} + +void Program::clear() { + deviceprograms_t::iterator sit; + + // Destroy old programs if we have any + for (sit = devicePrograms_.begin(); sit != devicePrograms_.end(); ++sit) { + // Destroy device program + delete sit->second; + } + + devicePrograms_.clear(); + deviceList_.clear(); + if (symbolTable_) symbolTable_->clear(); + kernelNames_.clear(); +} + +int Program::GetOclCVersion(const char* clVer) { + // default version + int version = 12; + if (clVer == NULL) { + return version; + } + std::string clStd(clVer); + if (clStd.size() != 5) { + return version; + } + clStd.erase(0, 2); + clStd.erase(1, 1); + return std::stoi(clStd); +} + +bool Program::ParseAllOptions(const std::string& options, option::Options& parsedOptions, + bool optionChangable, bool linkOptsOnly) { + std::string allOpts = options; + if (optionChangable) { + if (linkOptsOnly) { + if (AMD_OCL_LINK_OPTIONS != NULL) { + allOpts.append(" "); + allOpts.append(AMD_OCL_LINK_OPTIONS); + } + if (AMD_OCL_LINK_OPTIONS_APPEND != NULL) { + allOpts.append(" "); + allOpts.append(AMD_OCL_LINK_OPTIONS_APPEND); + } + } else { + if (AMD_OCL_BUILD_OPTIONS != NULL) { + allOpts.append(" "); + allOpts.append(AMD_OCL_BUILD_OPTIONS); + } + if (!Device::appProfile()->GetBuildOptsAppend().empty()) { + allOpts.append(" "); + allOpts.append(Device::appProfile()->GetBuildOptsAppend()); + } + if (AMD_OCL_BUILD_OPTIONS_APPEND != NULL) { + allOpts.append(" "); + allOpts.append(AMD_OCL_BUILD_OPTIONS_APPEND); + } + } + } + return amd::option::parseAllOptions(allOpts, parsedOptions, linkOptsOnly); +} + +bool Symbol::setDeviceKernel(const Device& device, const device::Kernel* func, bool noAlias) { + // FIXME_lmoriche: check that the signatures are compatible + if (deviceKernels_.size() == 0 || device.type() == CL_DEVICE_TYPE_CPU) { + signature_ = func->signature(); + } + + if (noAlias) { + deviceKernels_[&device] = func; + } else { + devKernelsNoOpt_[&device] = 
func; + } + return true; +} + +const device::Kernel* Symbol::getDeviceKernel(const Device& device, bool noAlias) const { + const devicekernels_t* devKernels = (noAlias) ? &deviceKernels_ : &devKernelsNoOpt_; + devicekernels_t::const_iterator itEnd = devKernels->end(); + devicekernels_t::const_iterator it = devKernels->find(&device); + if (it != itEnd) { + return it->second; + } + + for (it = devKernels->begin(); it != itEnd; ++it) { + if (it->first->isAncestor(&device)) { + return it->second; + } + } + + return NULL; +} + +} // namespace amd diff --git a/rocclr/runtime/platform/program.hpp b/rocclr/runtime/platform/program.hpp index bcc89e0733..01008731b0 100644 --- a/rocclr/runtime/platform/program.hpp +++ b/rocclr/runtime/platform/program.hpp @@ -33,178 +33,154 @@ namespace amd { */ //! A kernel function symbol -class Symbol : public HeapObject -{ -public: - typedef std::map devicekernels_t; +class Symbol : public HeapObject { + public: + typedef std::map devicekernels_t; -private: - devicekernels_t deviceKernels_; //! All device kernels objects. - devicekernels_t devKernelsNoOpt_; //! Kernel objects without optimization - KernelSignature signature_; //! Kernel signature. + private: + devicekernels_t deviceKernels_; //! All device kernels objects. + devicekernels_t devKernelsNoOpt_; //! Kernel objects without optimization + KernelSignature signature_; //! Kernel signature. -public: - //! Default constructor - Symbol() {} + public: + //! Default constructor + Symbol() {} - //! Set the entry point and check or set the signature. - bool setDeviceKernel( - const Device& device, //!< Device object. - const device::Kernel* func, //!< Device kernel object. - bool noAlias = true //!< No-alias optimization - ); + //! Set the entry point and check or set the signature. + bool setDeviceKernel(const Device& device, //!< Device object. + const device::Kernel* func, //!< Device kernel object. + bool noAlias = true //!< No-alias optimization + ); - //! Return the device kernel. 
- const device::Kernel* getDeviceKernel( - const Device& device, //!< Device object. - bool noAlias = true //!< No-alias optimization - ) const; + //! Return the device kernel. + const device::Kernel* getDeviceKernel(const Device& device, //!< Device object. + bool noAlias = true //!< No-alias optimization + ) const; - //! Return this Symbol's signature. - const KernelSignature& signature() const { return signature_; } + //! Return this Symbol's signature. + const KernelSignature& signature() const { return signature_; } }; class Context; //! A collection of binaries for devices in the associated context. -class Program : public RuntimeObject -{ -public: - typedef std::pair binary_t; - typedef std::set devicelist_t; - typedef std::map devicebinary_t; - typedef std::map deviceprograms_t; - typedef std::map symbols_t; +class Program : public RuntimeObject { + public: + typedef std::pair binary_t; + typedef std::set devicelist_t; + typedef std::map devicebinary_t; + typedef std::map deviceprograms_t; + typedef std::map symbols_t; -private: - //! Replaces the compiled program with the new version from HD - void StubProgramSource(const std::string& app_name); + private: + //! Replaces the compiled program with the new version from HD + void StubProgramSource(const std::string& app_name); - //! The context this program is part of. - SharedReference context_; + //! The context this program is part of. 
+ SharedReference context_; - std::string sourceCode_; //!< Strings that make up the source code - bool isSPIRV_; //!< The binary image is SPIRV - devicebinary_t binary_; //!< The binary image, provided by the app - symbols_t* symbolTable_; //!< The program's kernels symbol table - std::string kernelNames_; //!< The program kernel names + std::string sourceCode_; //!< Strings that make up the source code + bool isSPIRV_; //!< The binary image is SPIRV + devicebinary_t binary_; //!< The binary image, provided by the app + symbols_t* symbolTable_; //!< The program's kernels symbol table + std::string kernelNames_; //!< The program kernel names - //! The device program objects included in this program - deviceprograms_t devicePrograms_; - devicelist_t deviceList_; + //! The device program objects included in this program + deviceprograms_t devicePrograms_; + devicelist_t deviceList_; - std::string programLog_; //!< Log for parsing options, etc. + std::string programLog_; //!< Log for parsing options, etc. -protected: - //! Destroy this program. - ~Program(); + protected: + //! Destroy this program. + ~Program(); - //! Clears the program object if the app attempts to rebuild the program - void clear(); + //! Clears the program object if the app attempts to rebuild the program + void clear(); - //! Global build lock (remove when LLVM is thread-safe). - static Monitor buildLock_; + //! Global build lock (remove when LLVM is thread-safe). + static Monitor buildLock_; -public: - //! Construct a new program to be compiled from the given source code. - Program(Context& context, const std::string& sourceCode, bool isSPIRV = false) - : context_(context) - , sourceCode_(sourceCode) - , isSPIRV_(isSPIRV) - , symbolTable_(NULL) - , programLog_() - { } + public: + //! Construct a new program to be compiled from the given source code. 
+ Program(Context& context, const std::string& sourceCode, bool isSPIRV = false) + : context_(context), + sourceCode_(sourceCode), + isSPIRV_(isSPIRV), + symbolTable_(NULL), + programLog_() {} - //! Construct a new program associated with a context. - Program(Context& context) - : context_(context) - , isSPIRV_(false) - , symbolTable_(NULL) - { } + //! Construct a new program associated with a context. + Program(Context& context) : context_(context), isSPIRV_(false), symbolTable_(NULL) {} - //! Returns context, associated with the current program. - const Context& context() const { return context_(); } + //! Returns context, associated with the current program. + const Context& context() const { return context_(); } - //! Return the sections for this program. - const deviceprograms_t& devicePrograms() const { return devicePrograms_; } + //! Return the sections for this program. + const deviceprograms_t& devicePrograms() const { return devicePrograms_; } - //! Return the associated devices. - const devicelist_t& deviceList() const { return deviceList_; } + //! Return the associated devices. + const devicelist_t& deviceList() const { return deviceList_; } - //! Return the symbols for this program. - const symbols_t& symbols() const { return *symbolTable_; } + //! Return the symbols for this program. + const symbols_t& symbols() const { return *symbolTable_; } - //! Return the pointer to symbols for this program. - const symbols_t* symbolsPtr() const { return symbolTable_; } + //! Return the pointer to symbols for this program. + const symbols_t* symbolsPtr() const { return symbolTable_; } - //! Return the program source code. - const std::string& sourceCode() const { return sourceCode_; } + //! Return the program source code. + const std::string& sourceCode() const { return sourceCode_; } - //! Return the program log. - const std::string& programLog() const { return programLog_; } + //! Return the program log. 
+ const std::string& programLog() const { return programLog_; } - //! Add a new device program with or without binary image and options. - cl_int addDeviceProgram(Device&, const void* image = NULL, size_t len = 0, - amd::option::Options* options = NULL); + //! Add a new device program with or without binary image and options. + cl_int addDeviceProgram(Device&, const void* image = NULL, size_t len = 0, + amd::option::Options* options = NULL); - //! Find the section for the given device. Return NULL if not found. - device::Program* getDeviceProgram(const Device& device) const; + //! Find the section for the given device. Return NULL if not found. + device::Program* getDeviceProgram(const Device& device) const; - //! Return the symbol for the given kernel name. - const Symbol* findSymbol(const char* name) const; + //! Return the symbol for the given kernel name. + const Symbol* findSymbol(const char* name) const; - //! Return the binary image. - const binary_t& binary(const Device& device) { - return binary_[&device.rootDevice()]; - } + //! Return the binary image. + const binary_t& binary(const Device& device) { return binary_[&device.rootDevice()]; } - //! Return the program kernel names - const std::string& kernelNames() const { return kernelNames_; } + //! Return the program kernel names + const std::string& kernelNames() const { return kernelNames_; } - //! Compile the program for the given devices. - cl_int compile( - const std::vector& devices, - size_t numHeaders, - const std::vector& headerPrograms, - const char** headerIncludeNames, - const char* options = NULL, - void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, - void* data = NULL, - bool optionChangable = true); + //! Compile the program for the given devices. 
+ cl_int compile(const std::vector& devices, size_t numHeaders, + const std::vector& headerPrograms, const char** headerIncludeNames, + const char* options = NULL, + void(CL_CALLBACK* notifyFptr)(cl_program, void*) = NULL, void* data = NULL, + bool optionChangable = true); - //! Link the programs for the given devices. - cl_int link( - const std::vector& devices, - size_t numInputs, - const std::vector& inputPrograms, - const char* options = NULL, - void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, - void* data = NULL, - bool optionChangable = true); + //! Link the programs for the given devices. + cl_int link(const std::vector& devices, size_t numInputs, + const std::vector& inputPrograms, const char* options = NULL, + void(CL_CALLBACK* notifyFptr)(cl_program, void*) = NULL, void* data = NULL, + bool optionChangable = true); - //! Build the program for the given devices. - cl_int build( - const std::vector& devices, - const char* options = NULL, - void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, - void* data = NULL, - bool optionChangable = true); + //! Build the program for the given devices. + cl_int build(const std::vector& devices, const char* options = NULL, + void(CL_CALLBACK* notifyFptr)(cl_program, void*) = NULL, void* data = NULL, + bool optionChangable = true); - //! RTTI internal implementation - virtual ObjectType objectType() const {return ObjectTypeProgram;} + //! RTTI internal implementation + virtual ObjectType objectType() const { return ObjectTypeProgram; } - static int GetOclCVersion(const char* clVer); + static int GetOclCVersion(const char* clVer); - static bool ParseAllOptions(const std::string& options, - option::Options& parsedOptions, - bool optionChangable = true, - bool linkOptsOnly = false); + static bool ParseAllOptions(const std::string& options, option::Options& parsedOptions, + bool optionChangable = true, bool linkOptsOnly = false); }; /*! 
@} * @} */ -} // namespace amd +} // namespace amd #endif /*PROGRAM_HPP_*/ diff --git a/rocclr/runtime/platform/runtime.cpp b/rocclr/runtime/platform/runtime.cpp index e3cf1e877a..e7b5f70934 100644 --- a/rocclr/runtime/platform/runtime.cpp +++ b/rocclr/runtime/platform/runtime.cpp @@ -17,9 +17,9 @@ #include #include #include "CL/cl_d3d10.h" -#endif //_WIN32 +#endif //_WIN32 -#if defined(_MSC_VER) //both Win32 and Win64 +#if defined(_MSC_VER) // both Win32 and Win64 #include #endif @@ -32,133 +32,109 @@ namespace amd { #ifdef __linux__ static void __runtime_exit() __attribute__((destructor(102))); -static void __runtime_exit() -{ - if (ENABLE_CAL_SHUTDOWN) { - Runtime::tearDown(); - } +static void __runtime_exit() { + if (ENABLE_CAL_SHUTDOWN) { + Runtime::tearDown(); + } } #endif -volatile bool -Runtime::initialized_ = false; +volatile bool Runtime::initialized_ = false; -bool -Runtime::init() -{ - if (initialized_) { - return true; - } - - // Enter a very basic critical region. We want to prevent 2 threads - // from concurrently executing the init() routines. We can't use a - // Monitor since the system is not yet initialized. - - static std::atomic_flag lock = ATOMIC_FLAG_INIT; - struct CriticalRegion - { - std::atomic_flag& lock_; - CriticalRegion(std::atomic_flag& lock) : lock_(lock) - { - while (lock.test_and_set(std::memory_order_acquire)) { - Os::yield(); - } - } - ~CriticalRegion() - { - lock_.clear(std::memory_order_release); - } - } region(lock); - - if (initialized_) { - return true; - } - - if ( !Flag::init() - || !option::init() - || !Device::init() - // Agent initializes last - || !Agent::init()) { - return false; - } - - initialized_ = true; +bool Runtime::init() { + if (initialized_) { return true; -} + } -void -Runtime::tearDown() -{ - if (!initialized_) { - return; + // Enter a very basic critical region. We want to prevent 2 threads + // from concurrently executing the init() routines. 
We can't use a + // Monitor since the system is not yet initialized. + + static std::atomic_flag lock = ATOMIC_FLAG_INIT; + struct CriticalRegion { + std::atomic_flag& lock_; + CriticalRegion(std::atomic_flag& lock) : lock_(lock) { + while (lock.test_and_set(std::memory_order_acquire)) { + Os::yield(); + } } + ~CriticalRegion() { lock_.clear(std::memory_order_release); } + } region(lock); - Agent::tearDown(); - Device::tearDown(); - option::teardown(); - Flag::tearDown(); + if (initialized_) { + return true; + } + + if (!Flag::init() || !option::init() || !Device::init() + // Agent initializes last + || !Agent::init()) { + return false; + } + + initialized_ = true; + return true; } -uint -ReferenceCountedObject::retain() -{ - return ++make_atomic(referenceCount_); +void Runtime::tearDown() { + if (!initialized_) { + return; + } + + Agent::tearDown(); + Device::tearDown(); + option::teardown(); + Flag::tearDown(); } -uint -ReferenceCountedObject::release() -{ - uint newCount = --make_atomic(referenceCount_); - if (newCount == 0) { - if (terminate()) { - delete this; - } +uint ReferenceCountedObject::retain() { return ++make_atomic(referenceCount_); } + +uint ReferenceCountedObject::release() { + uint newCount = --make_atomic(referenceCount_); + if (newCount == 0) { + if (terminate()) { + delete this; } - return newCount; + } + return newCount; } #ifdef _WIN32 #ifdef DEBUG -static int -reportHook(int reportType, char *message, int *returnValue) -{ - if (returnValue) { - *returnValue = 1; - } - std::cerr << message; - ::exit(3); - return TRUE; +static int reportHook(int reportType, char* message, int* returnValue) { + if (returnValue) { + *returnValue = 1; + } + std::cerr << message; + ::exit(3); + return TRUE; } -#endif // DEBUG +#endif // DEBUG -extern "C" BOOL WINAPI -DllMain(HINSTANCE hinst, DWORD reason, LPVOID reserved) -{ - switch (reason) { +extern "C" BOOL WINAPI DllMain(HINSTANCE hinst, DWORD reason, LPVOID reserved) { + switch (reason) { case 
DLL_PROCESS_ATTACH: -# ifdef DEBUG - if (!AMD_OCL_ENABLE_MESSAGE_BOX) { - _CrtSetReportHook(reportHook); - _set_error_mode(_OUT_TO_STDERR); - } -# endif // DEBUG - break; +#ifdef DEBUG + if (!AMD_OCL_ENABLE_MESSAGE_BOX) { + _CrtSetReportHook(reportHook); + _set_error_mode(_OUT_TO_STDERR); + } +#endif // DEBUG + break; case DLL_PROCESS_DETACH: - if (!reserved || ENABLE_CAL_SHUTDOWN) { - Runtime::tearDown(); - } - break; + if (!reserved || ENABLE_CAL_SHUTDOWN) { + Runtime::tearDown(); + } + break; case DLL_THREAD_DETACH: { - amd::Thread* thread = amd::Thread::current(); - delete thread; - } - break; + amd::Thread* thread = amd::Thread::current(); + delete thread; + } break; default: - break; - } - return true; + break; + } + return true; } #endif -} // namespace amd +} // namespace amd diff --git a/rocclr/runtime/platform/runtime.hpp b/rocclr/runtime/platform/runtime.hpp index 8754089e3d..68f91b2776 100644 --- a/rocclr/runtime/platform/runtime.hpp +++ b/rocclr/runtime/platform/runtime.hpp @@ -14,22 +14,21 @@ namespace amd { * @{ */ -class Runtime : AllStatic -{ - static volatile bool initialized_; +class Runtime : AllStatic { + static volatile bool initialized_; -public: - //! Return true if the OpencCL runtime is already initialized - inline static bool initialized(); + public: + //! Return true if the OpencCL runtime is already initialized + inline static bool initialized(); - //! Initialize the OpenCL runtime. - static bool init(); + //! Initialize the OpenCL runtime. + static bool init(); - //! Tear down the runtime. - static void tearDown(); + //! Tear down the runtime. + static void tearDown(); - //! Return true if the Runtime is still single-threaded. - static bool singleThreaded() { return !initialized(); } + //! Return true if the Runtime is still single-threaded. 
+ static bool singleThreaded() { return !initialized(); } }; #if 0 @@ -65,12 +64,8 @@ public: /*@}*/ -inline bool -Runtime::initialized() -{ - return initialized_; -} +inline bool Runtime::initialized() { return initialized_; } -} // namespace amd +} // namespace amd #endif /*RUNTIME_HPP_*/ diff --git a/rocclr/runtime/platform/sampler.hpp b/rocclr/runtime/platform/sampler.hpp index 10f9c900da..1e3b01cbbb 100644 --- a/rocclr/runtime/platform/sampler.hpp +++ b/rocclr/runtime/platform/sampler.hpp @@ -9,180 +9,160 @@ #include "platform/object.hpp" #include "device/device.hpp" -namespace amd -{ +namespace amd { //! Abstraction layer sampler class -class Sampler : public RuntimeObject -{ -public: - typedef std::map DeviceSamplers; +class Sampler : public RuntimeObject { + public: + typedef std::map DeviceSamplers; - //! \note the sampler states must match the compiler's defines. - //! See amd_ocl_sys_predef.c - enum State - { - StateNormalizedCoordsFalse = 0x00, - StateNormalizedCoordsTrue = 0x01, - StateNormalizedCoordsMask = (StateNormalizedCoordsFalse | - StateNormalizedCoordsTrue), - StateAddressNone = 0x00, - StateAddressRepeat = 0x02, - StateAddressClampToEdge = 0x04, - StateAddressClamp = 0x06, - StateAddressMirroredRepeat = 0x08, - StateAddressMask = (StateAddressNone | - StateAddressRepeat | - StateAddressMirroredRepeat | - StateAddressClampToEdge | - StateAddressClamp), - StateFilterNearest = 0x10, - StateFilterLinear = 0x20, - StateFilterMask = (StateFilterNearest | - StateFilterLinear) - }; + //! \note the sampler states must match the compiler's defines. + //! 
See amd_ocl_sys_predef.c + enum State { + StateNormalizedCoordsFalse = 0x00, + StateNormalizedCoordsTrue = 0x01, + StateNormalizedCoordsMask = (StateNormalizedCoordsFalse | StateNormalizedCoordsTrue), + StateAddressNone = 0x00, + StateAddressRepeat = 0x02, + StateAddressClampToEdge = 0x04, + StateAddressClamp = 0x06, + StateAddressMirroredRepeat = 0x08, + StateAddressMask = (StateAddressNone | StateAddressRepeat | StateAddressMirroredRepeat | + StateAddressClampToEdge | StateAddressClamp), + StateFilterNearest = 0x10, + StateFilterLinear = 0x20, + StateFilterMask = (StateFilterNearest | StateFilterLinear) + }; -private: - Context& context_; //!< OpenCL context associated with this sampler - uint32_t state_; //!< Sampler state - uint mipFilter_; //!< mip filter - float minLod_; //!< min level of detail - float maxLod_; //!< max level of detail - DeviceSamplers deviceSamplers_; //!< Container for the device samplers + private: + Context& context_; //!< OpenCL context associated with this sampler + uint32_t state_; //!< Sampler state + uint mipFilter_; //!< mip filter + float minLod_; //!< min level of detail + float maxLod_; //!< max level of detail + DeviceSamplers deviceSamplers_; //!< Container for the device samplers -public: - Sampler( - Context& context, //!< OpenCL context - bool normCoords, //!< normalized coordinates - uint addrMode, //!< adressing mode - uint filterMode, //!< filter mode - uint mipFilterMode, //!< mip filter mode - float minLod, //!< min level of detail - float maxLod //!< max level of detail - ) - : context_(context) - , mipFilter_(mipFilterMode) - , minLod_(minLod) - , maxLod_(maxLod) - { // Packs the sampler state into uint32_t for kernel execution - state_ = 0; + public: + Sampler(Context& context, //!< OpenCL context + bool normCoords, //!< normalized coordinates + uint addrMode, //!< adressing mode + uint filterMode, //!< filter mode + uint mipFilterMode, //!< mip filter mode + float minLod, //!< min level of detail + float maxLod 
//!< max level of detail + ) + : context_(context), + mipFilter_(mipFilterMode), + minLod_(minLod), + maxLod_(maxLod) { // Packs the sampler state into uint32_t for kernel execution + state_ = 0; - // Set normalized state - if (normCoords) { - state_ |= StateNormalizedCoordsTrue; - } - else { - state_ |= StateNormalizedCoordsFalse; - } - - // Program the sampler filter mode - if (filterMode == CL_FILTER_LINEAR) { - state_ |= StateFilterLinear; - } - else { - state_ |= StateFilterNearest; - } - - // Program the sampler address mode - switch (addrMode) { - case CL_ADDRESS_CLAMP_TO_EDGE: - state_ |= StateAddressClampToEdge; - break; - case CL_ADDRESS_REPEAT: - state_ |= StateAddressRepeat; - break; - case CL_ADDRESS_CLAMP: - state_ |= StateAddressClamp; - break; - case CL_ADDRESS_MIRRORED_REPEAT: - state_ |= StateAddressMirroredRepeat; - break; - case CL_ADDRESS_NONE: - state_ |= StateAddressNone; - break; - default: - break; - } + // Set normalized state + if (normCoords) { + state_ |= StateNormalizedCoordsTrue; + } else { + state_ |= StateNormalizedCoordsFalse; } - virtual ~Sampler() - { - for (const auto& it : deviceSamplers_) { - delete it.second; - } + // Program the sampler filter mode + if (filterMode == CL_FILTER_LINEAR) { + state_ |= StateFilterLinear; + } else { + state_ |= StateFilterNearest; } - bool create() - { - for (uint i = 0; i < context_.devices().size(); ++i) { - device::Sampler* sampler = NULL; - Device* dev = context_.devices()[i]; - if (!dev->createSampler(*this, &sampler)) { - return false; - } - deviceSamplers_[dev] = sampler; - } - return true; + // Program the sampler address mode + switch (addrMode) { + case CL_ADDRESS_CLAMP_TO_EDGE: + state_ |= StateAddressClampToEdge; + break; + case CL_ADDRESS_REPEAT: + state_ |= StateAddressRepeat; + break; + case CL_ADDRESS_CLAMP: + state_ |= StateAddressClamp; + break; + case CL_ADDRESS_MIRRORED_REPEAT: + state_ |= StateAddressMirroredRepeat; + break; + case CL_ADDRESS_NONE: + state_ |= 
StateAddressNone; + break; + default: + break; } + } - device::Sampler* getDeviceSampler(const Device& dev) const - { - auto it = deviceSamplers_.find(&dev); - if (it != deviceSamplers_.end()) { - return it->second; - } - return NULL; + virtual ~Sampler() { + for (const auto& it : deviceSamplers_) { + delete it.second; } + } - //! Accessor functions - Context& context() const { return context_; } - uint32_t state() const { return state_; } - uint mipFilter() const { return mipFilter_; } - float minLod() const { return minLod_; } - float maxLod() const { return maxLod_; } - - bool normalizedCoords() const - { - return (state_ & StateNormalizedCoordsTrue) ? true : false; + bool create() { + for (uint i = 0; i < context_.devices().size(); ++i) { + device::Sampler* sampler = NULL; + Device* dev = context_.devices()[i]; + if (!dev->createSampler(*this, &sampler)) { + return false; + } + deviceSamplers_[dev] = sampler; } + return true; + } - uint addressingMode() const - { - uint adressing = 0; - - // Program the sampler address mode - switch (state_ & StateAddressMask) { - case StateAddressRepeat: - adressing = CL_ADDRESS_REPEAT; - break; - case StateAddressClampToEdge: - adressing = CL_ADDRESS_CLAMP_TO_EDGE; - break; - case StateAddressClamp: - adressing = CL_ADDRESS_CLAMP; - break; - case StateAddressMirroredRepeat: - adressing = CL_ADDRESS_MIRRORED_REPEAT; - break; - case StateAddressNone: - adressing = CL_ADDRESS_NONE; - break; - default: - break; - } - return adressing; + device::Sampler* getDeviceSampler(const Device& dev) const { + auto it = deviceSamplers_.find(&dev); + if (it != deviceSamplers_.end()) { + return it->second; } + return NULL; + } - uint filterMode() const - { - return ((state_ & StateFilterMask) == StateFilterNearest) ? - CL_FILTER_NEAREST : CL_FILTER_LINEAR; + //! 
Accessor functions + Context& context() const { return context_; } + uint32_t state() const { return state_; } + uint mipFilter() const { return mipFilter_; } + float minLod() const { return minLod_; } + float maxLod() const { return maxLod_; } + + bool normalizedCoords() const { return (state_ & StateNormalizedCoordsTrue) ? true : false; } + + uint addressingMode() const { + uint adressing = 0; + + // Program the sampler address mode + switch (state_ & StateAddressMask) { + case StateAddressRepeat: + adressing = CL_ADDRESS_REPEAT; + break; + case StateAddressClampToEdge: + adressing = CL_ADDRESS_CLAMP_TO_EDGE; + break; + case StateAddressClamp: + adressing = CL_ADDRESS_CLAMP; + break; + case StateAddressMirroredRepeat: + adressing = CL_ADDRESS_MIRRORED_REPEAT; + break; + case StateAddressNone: + adressing = CL_ADDRESS_NONE; + break; + default: + break; } + return adressing; + } - //! RTTI internal implementation - virtual ObjectType objectType() const { return ObjectTypeSampler; } + uint filterMode() const { + return ((state_ & StateFilterMask) == StateFilterNearest) ? CL_FILTER_NEAREST + : CL_FILTER_LINEAR; + } + + //! 
RTTI internal implementation + virtual ObjectType objectType() const { return ObjectTypeSampler; } }; -} // namespace amd +} // namespace amd #endif /*SAMPLER_HPP_*/ diff --git a/rocclr/runtime/platform/threadtrace.hpp b/rocclr/runtime/platform/threadtrace.hpp index 80d0ec8699..bf511002bc 100644 --- a/rocclr/runtime/platform/threadtrace.hpp +++ b/rocclr/runtime/platform/threadtrace.hpp @@ -25,105 +25,106 @@ namespace amd { * * \brief The container class for the thread traces */ -class ThreadTrace : public RuntimeObject -{ -public: - enum State { - Undefined, - MemoryBound, - Begin, - End, - Pause - }; - typedef struct ThreadTraceConfigRec - { - size_t configSize_; // structure size - size_t cu_; // target compute unit [cu] - size_t sh_; // target shader array [sh],that contains target cu - size_t simdMask_; // bitmask to enable or disable target tokens for different SIMDs - size_t vmIdMask_; // virtual memory [vm] IDs to capture - size_t tokenMask_; // bitmask indicating which trace token IDs will be included in the trace - size_t regMask_; // bitmask indicating which register types should be included in the trace - size_t instMask_; // types of instruction scheduling updates which should be recorded - size_t randomSeed_; // linear feedback shift register [LFSR] seed - size_t userData_; // user data ,which is written as payload - size_t captureMode_; // indicator for the way how THREAD_TRACE_START / STOP events affect token collection - bool isUserData_; // indicator if user_data is set - bool isWrapped_; // indicator if the memory buffer should be wrapped around instead of stopping at the end - //default thread trace configuration/s initializator - ThreadTraceConfigRec():configSize_(0),cu_(0),sh_(0),simdMask_(0xF),vmIdMask_(CL_THREAD_TRACE_VM_ID_MASK_SINGLE), - tokenMask_(CL_THREAD_TRACE_TOKEN_MASK_ALL_SI),regMask_(CL_THREAD_TRACE_REG_MASK_ALL_SI), - instMask_(CL_THREAD_TRACE_INST_MASK_ALL),randomSeed_(0xFFF),userData_(0), - 
captureMode_(CL_THREAD_TRACE_CAPTURE_ALL),isUserData_(false),isWrapped_(false){ - configSize_ = sizeof(struct ThreadTraceConfigRec); - } - }ThreadTraceConfig; - - //! Constructor of the thread trace object - ThreadTrace( - const Device& device) //!< device object - : deviceThreadTrace_(NULL) - ,device_(device) - ,state_(Undefined) - { } - - //! Get the thread trace's associated device - const Device& device() const { return device_; } - - //! Get the shader engines number for thread trace`s associated device - const size_t deviceSeNumThreadTrace() const { return device_.info().numberOfShaderEngines; } - - //! Get the device thread trace - device::ThreadTrace* getDeviceThreadTrace() { return deviceThreadTrace_; } - - //! Set the device thread trace - void setDeviceThreadTrace(device::ThreadTrace* threadTrace) { deviceThreadTrace_ = threadTrace; } - - void setState(State state) {state_ = state;} - State getState() {return state_;} - - void setCU(unsigned int cu) { threadTraceConfig_.cu_ = cu; } - - void setSH(unsigned int sh) { threadTraceConfig_.sh_ = sh; } - - void setSIMD(unsigned int simdMask) { threadTraceConfig_.simdMask_ = simdMask; } - - void setUserData(unsigned int userData) { - threadTraceConfig_.isUserData_ = true; - threadTraceConfig_.userData_ = userData; - } - - void setTokenMask(unsigned int tokenMask) { threadTraceConfig_.tokenMask_ = tokenMask; } - - void setRegMask(unsigned int regMask) { threadTraceConfig_.regMask_ = regMask; } - - void setVmIdMask(unsigned int vmIdMask) { threadTraceConfig_.vmIdMask_ = vmIdMask; } - - void setInstMask(unsigned int instMask) { threadTraceConfig_.instMask_ = instMask; } - - void setRandomSeed(unsigned int randomSeed) { threadTraceConfig_.randomSeed_ = randomSeed; } - - void setCaptureMode(unsigned int captureMode) { threadTraceConfig_.captureMode_ = captureMode; } - - void setIsWrapped(bool isWrapped) { threadTraceConfig_.isWrapped_ = isWrapped; } - - const ThreadTraceConfig& threadTraceConfig() const {return 
threadTraceConfig_;} - - //! RTTI internal implementation - virtual ObjectType objectType() const {return ObjectTypeThreadTrace;} -protected: - //! Destructor for ThreadTrace class - ~ThreadTrace() { - delete deviceThreadTrace_; +class ThreadTrace : public RuntimeObject { + public: + enum State { Undefined, MemoryBound, Begin, End, Pause }; + typedef struct ThreadTraceConfigRec { + size_t configSize_; // structure size + size_t cu_; // target compute unit [cu] + size_t sh_; // target shader array [sh],that contains target cu + size_t simdMask_; // bitmask to enable or disable target tokens for different SIMDs + size_t vmIdMask_; // virtual memory [vm] IDs to capture + size_t tokenMask_; // bitmask indicating which trace token IDs will be included in the trace + size_t regMask_; // bitmask indicating which register types should be included in the trace + size_t instMask_; // types of instruction scheduling updates which should be recorded + size_t randomSeed_; // linear feedback shift register [LFSR] seed + size_t userData_; // user data ,which is written as payload + size_t captureMode_; // indicator for the way how THREAD_TRACE_START / STOP events affect token + // collection + bool isUserData_; // indicator if user_data is set + bool isWrapped_; // indicator if the memory buffer should be wrapped around instead of stopping + // at the end + // default thread trace configuration/s initializator + ThreadTraceConfigRec() + : configSize_(0), + cu_(0), + sh_(0), + simdMask_(0xF), + vmIdMask_(CL_THREAD_TRACE_VM_ID_MASK_SINGLE), + tokenMask_(CL_THREAD_TRACE_TOKEN_MASK_ALL_SI), + regMask_(CL_THREAD_TRACE_REG_MASK_ALL_SI), + instMask_(CL_THREAD_TRACE_INST_MASK_ALL), + randomSeed_(0xFFF), + userData_(0), + captureMode_(CL_THREAD_TRACE_CAPTURE_ALL), + isUserData_(false), + isWrapped_(false) { + configSize_ = sizeof(struct ThreadTraceConfigRec); } + } ThreadTraceConfig; - device::ThreadTrace* deviceThreadTrace_; //!< device thread trace object - const Device& device_; //!< 
the device object - State state_; - ThreadTraceConfig threadTraceConfig_; + //! Constructor of the thread trace object + ThreadTrace(const Device& device) //!< device object + : deviceThreadTrace_(NULL), + device_(device), + state_(Undefined) {} + + //! Get the thread trace's associated device + const Device& device() const { return device_; } + + //! Get the shader engines number for thread trace`s associated device + const size_t deviceSeNumThreadTrace() const { return device_.info().numberOfShaderEngines; } + + //! Get the device thread trace + device::ThreadTrace* getDeviceThreadTrace() { return deviceThreadTrace_; } + + //! Set the device thread trace + void setDeviceThreadTrace(device::ThreadTrace* threadTrace) { deviceThreadTrace_ = threadTrace; } + + void setState(State state) { state_ = state; } + State getState() { return state_; } + + void setCU(unsigned int cu) { threadTraceConfig_.cu_ = cu; } + + void setSH(unsigned int sh) { threadTraceConfig_.sh_ = sh; } + + void setSIMD(unsigned int simdMask) { threadTraceConfig_.simdMask_ = simdMask; } + + void setUserData(unsigned int userData) { + threadTraceConfig_.isUserData_ = true; + threadTraceConfig_.userData_ = userData; + } + + void setTokenMask(unsigned int tokenMask) { threadTraceConfig_.tokenMask_ = tokenMask; } + + void setRegMask(unsigned int regMask) { threadTraceConfig_.regMask_ = regMask; } + + void setVmIdMask(unsigned int vmIdMask) { threadTraceConfig_.vmIdMask_ = vmIdMask; } + + void setInstMask(unsigned int instMask) { threadTraceConfig_.instMask_ = instMask; } + + void setRandomSeed(unsigned int randomSeed) { threadTraceConfig_.randomSeed_ = randomSeed; } + + void setCaptureMode(unsigned int captureMode) { threadTraceConfig_.captureMode_ = captureMode; } + + void setIsWrapped(bool isWrapped) { threadTraceConfig_.isWrapped_ = isWrapped; } + + const ThreadTraceConfig& threadTraceConfig() const { return threadTraceConfig_; } + + //! 
RTTI internal implementation + virtual ObjectType objectType() const { return ObjectTypeThreadTrace; } + + protected: + //! Destructor for ThreadTrace class + ~ThreadTrace() { delete deviceThreadTrace_; } + + device::ThreadTrace* deviceThreadTrace_; //!< device thread trace object + const Device& device_; //!< the device object + State state_; + ThreadTraceConfig threadTraceConfig_; }; /*@}*/ /*@}*/ } // namespace amd -#endif // THREAD_TRACE_HPP_ +#endif // THREAD_TRACE_HPP_ diff --git a/rocclr/runtime/thread/atomic.hpp b/rocclr/runtime/thread/atomic.hpp index 8477ece1e0..943d5df53f 100644 --- a/rocclr/runtime/thread/atomic.hpp +++ b/rocclr/runtime/thread/atomic.hpp @@ -17,11 +17,11 @@ #include #ifdef _WIN32 -# include +#include #elif defined(ATI_ARCH_X86) -# include -# include -#endif // !_WIN32 +#include +#include +#endif // !_WIN32 #include #include @@ -37,135 +37,111 @@ namespace amd { /*! \brief Static functions for atomic operations. */ -class AtomicOperation : AllStatic -{ -private: - - //! Template to specialize atomic intrinsics on register size. - template - struct Intrinsics { - /*! \brief %Atomic add. - * - * Atomically add \a inc to \a *dest and return the prior value. - */ - template - static inline T add(T increment, volatile T* dest); - - /*! \brief %Atomic exchange. - * - * Atomically exchange value with *dest and return the prior value. - */ - template - static inline T swap(T value, volatile T* dest); - - /*! \brief %Atomic compare and exchange. - * - * Atomically compare and xchge value with *dest if *dest == compare. - * Return the prior value. - */ - template - static inline T compareAndSwap(T compare, volatile T* dest, T value); - - /*! \brief %Atomic increment. - * - * Atomically increment *dest and return the prior value. - */ - template - static inline T increment(volatile T* dest); - - /*! \brief %Atomic exchange. - * - * Atomically decrement *dest and return the prior value. 
- */ - template - static inline T decrement(volatile T* dest); - - /*! \brief %Atomic or. - * - * Atomically or \a mask to \a *dest and return the prior value. - */ - template - static inline T _or(T mask, volatile T* dest); - - /*! \brief %Atomic and. - * - * Atomically and \a mask to \a *dest and return the prior value. - */ - template - static inline T _and(T mask, volatile T* dest); -}; - -public: +class AtomicOperation : AllStatic { + private: + //! Template to specialize atomic intrinsics on register size. + template struct Intrinsics { /*! \brief %Atomic add. * * Atomically add \a inc to \a *dest and return the prior value. */ - template - static T add(T inc, volatile T* dest) - { - return Intrinsics::add((T) inc, dest); - } + template static inline T add(T increment, volatile T* dest); /*! \brief %Atomic exchange. * * Atomically exchange value with *dest and return the prior value. */ - template - static T swap(T value, volatile T* dest) - { - return Intrinsics::swap(value, dest); - } + template static inline T swap(T value, volatile T* dest); /*! \brief %Atomic compare and exchange. * - * Atomically compare and exchange value with *dest if *dest == compare. + * Atomically compare and xchge value with *dest if *dest == compare. * Return the prior value. */ - template - static T compareAndSwap(T compare, volatile T* dest, T value) - { - return Intrinsics::compareAndSwap(compare, dest, value); - } + template static inline T compareAndSwap(T compare, volatile T* dest, T value); /*! \brief %Atomic increment. * * Atomically increment *dest and return the prior value. */ - template - static T increment(volatile T* dest) - { - return Intrinsics::increment(dest); - } + template static inline T increment(volatile T* dest); - /*! \brief %Atomic decrement. + /*! \brief %Atomic exchange. * * Atomically decrement *dest and return the prior value. 
*/ - template - static T decrement(volatile T* dest) - { - return Intrinsics::decrement(dest); - } + template static inline T decrement(volatile T* dest); /*! \brief %Atomic or. * * Atomically or \a mask to \a *dest and return the prior value. */ - template - static T _or(T mask, volatile T* dest) - { - return Intrinsics::_or((T) mask, dest); - } + template static inline T _or(T mask, volatile T* dest); /*! \brief %Atomic and. * - * Atomically or \a mask to \a *dest and return the prior value. + * Atomically and \a mask to \a *dest and return the prior value. */ - template - static T _and(T mask, volatile T* dest) - { - return Intrinsics::_and((T) mask, dest); - } + template static inline T _and(T mask, volatile T* dest); + }; + + public: + /*! \brief %Atomic add. + * + * Atomically add \a inc to \a *dest and return the prior value. + */ + template static T add(T inc, volatile T* dest) { + return Intrinsics::add((T)inc, dest); + } + + /*! \brief %Atomic exchange. + * + * Atomically exchange value with *dest and return the prior value. + */ + template static T swap(T value, volatile T* dest) { + return Intrinsics::swap(value, dest); + } + + /*! \brief %Atomic compare and exchange. + * + * Atomically compare and exchange value with *dest if *dest == compare. + * Return the prior value. + */ + template static T compareAndSwap(T compare, volatile T* dest, T value) { + return Intrinsics::compareAndSwap(compare, dest, value); + } + + /*! \brief %Atomic increment. + * + * Atomically increment *dest and return the prior value. + */ + template static T increment(volatile T* dest) { + return Intrinsics::increment(dest); + } + + /*! \brief %Atomic decrement. + * + * Atomically decrement *dest and return the prior value. + */ + template static T decrement(volatile T* dest) { + return Intrinsics::decrement(dest); + } + + /*! \brief %Atomic or. + * + * Atomically or \a mask to \a *dest and return the prior value. 
+ */ + template static T _or(T mask, volatile T* dest) { + return Intrinsics::_or((T)mask, dest); + } + + /*! \brief %Atomic and. + * + * Atomically or \a mask to \a *dest and return the prior value. + */ + template static T _and(T mask, volatile T* dest) { + return Intrinsics::_and((T)mask, dest); + } }; /*@}*/ @@ -174,193 +150,139 @@ public: template <> template -inline T -AtomicOperation::Intrinsics<4>::add(T increment, volatile T* dest) -{ - return (T)_InterlockedExchangeAdd( - (volatile long*)dest, (long)increment); +inline T AtomicOperation::Intrinsics<4>::add(T increment, volatile T* dest) { + return (T)_InterlockedExchangeAdd((volatile long*)dest, (long)increment); } template <> template -inline T -AtomicOperation::Intrinsics<4>::swap(T value, volatile T* dest) -{ - return (T)_InterlockedExchange( - (volatile long*)dest, (long)value); +inline T AtomicOperation::Intrinsics<4>::swap(T value, volatile T* dest) { + return (T)_InterlockedExchange((volatile long*)dest, (long)value); } template <> template -inline T -AtomicOperation::Intrinsics<4>::compareAndSwap( - T compare, volatile T* dest, T value) -{ - return (T)_InterlockedCompareExchange( - (volatile long*)dest, (long)value, (long)compare); +inline T AtomicOperation::Intrinsics<4>::compareAndSwap(T compare, volatile T* dest, T value) { + return (T)_InterlockedCompareExchange((volatile long*)dest, (long)value, (long)compare); } template <> template -inline T -AtomicOperation::Intrinsics<4>::increment(volatile T* dest) -{ - return (T)(_InterlockedIncrement((volatile long*)dest) - 1L); +inline T AtomicOperation::Intrinsics<4>::increment(volatile T* dest) { + return (T)(_InterlockedIncrement((volatile long*)dest) - 1L); } template <> template -inline T -AtomicOperation::Intrinsics<4>::decrement(volatile T* dest) -{ - return (T)(_InterlockedDecrement((volatile long*)dest) + 1L); +inline T AtomicOperation::Intrinsics<4>::decrement(volatile T* dest) { + return (T)(_InterlockedDecrement((volatile long*)dest) + 
1L); } template <> template -inline T -AtomicOperation::Intrinsics<4>::_or(T mask, volatile T* dest) -{ - return (T)_InterlockedOr( - (volatile long*)dest, (long)mask); +inline T AtomicOperation::Intrinsics<4>::_or(T mask, volatile T* dest) { + return (T)_InterlockedOr((volatile long*)dest, (long)mask); } template <> template -inline T -AtomicOperation::Intrinsics<4>::_and(T mask, volatile T* dest) -{ - return (T)_InterlockedAnd( - (volatile long*)dest, (long)mask); +inline T AtomicOperation::Intrinsics<4>::_and(T mask, volatile T* dest) { + return (T)_InterlockedAnd((volatile long*)dest, (long)mask); } #ifdef _WIN64 template <> template -inline T -AtomicOperation::Intrinsics<8>::add(T increment, volatile T* dest) -{ - return (T)_InterlockedExchangeAdd64( - (volatile __int64*)dest, (__int64)increment); +inline T AtomicOperation::Intrinsics<8>::add(T increment, volatile T* dest) { + return (T)_InterlockedExchangeAdd64((volatile __int64*)dest, (__int64)increment); } template <> template -inline T -AtomicOperation::Intrinsics<8>::swap(T value, volatile T* dest) -{ - return (T)_InterlockedExchange64( - (volatile __int64*)dest, (__int64)value); +inline T AtomicOperation::Intrinsics<8>::swap(T value, volatile T* dest) { + return (T)_InterlockedExchange64((volatile __int64*)dest, (__int64)value); } template <> template -inline T -AtomicOperation::Intrinsics<8>::compareAndSwap( - T compare, volatile T* dest, T value) -{ - return (T)_InterlockedCompareExchange64( - (volatile __int64*)dest, (__int64)value, (__int64)compare); +inline T AtomicOperation::Intrinsics<8>::compareAndSwap(T compare, volatile T* dest, T value) { + return (T)_InterlockedCompareExchange64((volatile __int64*)dest, (__int64)value, + (__int64)compare); } template <> template -inline T -AtomicOperation::Intrinsics<8>::increment(volatile T* dest) -{ - return (T)(_InterlockedIncrement64((volatile __int64*)dest) - 1LL); +inline T AtomicOperation::Intrinsics<8>::increment(volatile T* dest) { + return 
(T)(_InterlockedIncrement64((volatile __int64*)dest) - 1LL); } template <> template -inline T -AtomicOperation::Intrinsics<8>::decrement(volatile T* dest) -{ - return (T)(_InterlockedDecrement64((volatile __int64*)dest) + 1LL); +inline T AtomicOperation::Intrinsics<8>::decrement(volatile T* dest) { + return (T)(_InterlockedDecrement64((volatile __int64*)dest) + 1LL); } template <> template -inline T -AtomicOperation::Intrinsics<8>::_or(T mask, volatile T* dest) -{ - return (T)_InterlockedOr64( - (volatile long*)dest, (long)mask); +inline T AtomicOperation::Intrinsics<8>::_or(T mask, volatile T* dest) { + return (T)_InterlockedOr64((volatile long*)dest, (long)mask); } template <> template -inline T -AtomicOperation::Intrinsics<8>::_and(T mask, volatile T* dest) -{ - return (T)_InterlockedAnd64( - (volatile long*)dest, (long)mask); +inline T AtomicOperation::Intrinsics<8>::_and(T mask, volatile T* dest) { + return (T)_InterlockedAnd64((volatile long*)dest, (long)mask); } -#endif // _LP64 +#endif // _LP64 #elif defined(__GNUC__) template template -inline T -AtomicOperation::Intrinsics::add(T inc, volatile T* dest) -{ - return __sync_fetch_and_add(dest, inc); -} - -template -template -inline T -AtomicOperation::Intrinsics::swap(T value, volatile T* dest) -{ - return __sync_lock_test_and_set(dest, value); +inline T AtomicOperation::Intrinsics::add(T inc, volatile T* dest) { + return __sync_fetch_and_add(dest, inc); } template template -inline T -AtomicOperation::Intrinsics::compareAndSwap( - T compare, volatile T* dest, T value) -{ - return __sync_val_compare_and_swap(dest, compare, value); -} - -template -template -inline T -AtomicOperation::Intrinsics::increment(volatile T* dest) -{ - return add(T(1), dest); -} - -template -template -inline T -AtomicOperation::Intrinsics::decrement(volatile T* dest) -{ - return add(T(-1), dest); +inline T AtomicOperation::Intrinsics::swap(T value, volatile T* dest) { + return __sync_lock_test_and_set(dest, value); } template template 
-inline T -AtomicOperation::Intrinsics::_or(T mask, volatile T* dest) -{ - return __sync_fetch_and_or(dest, mask); +inline T AtomicOperation::Intrinsics::compareAndSwap(T compare, volatile T* dest, T value) { + return __sync_val_compare_and_swap(dest, compare, value); } template template -inline T -AtomicOperation::Intrinsics::_and(T mask, volatile T* dest) -{ - return __sync_fetch_and_and(dest, mask); +inline T AtomicOperation::Intrinsics::increment(volatile T* dest) { + return add(T(1), dest); +} + +template +template +inline T AtomicOperation::Intrinsics::decrement(volatile T* dest) { + return add(T(-1), dest); +} + +template +template +inline T AtomicOperation::Intrinsics::_or(T mask, volatile T* dest) { + return __sync_fetch_and_or(dest, mask); +} + +template +template +inline T AtomicOperation::Intrinsics::_and(T mask, volatile T* dest) { + return __sync_fetch_and_and(dest, mask); } #else -# error Unimplemented +#error Unimplemented #endif /*! \addtogroup Atomic Atomic Operations @@ -369,180 +291,151 @@ AtomicOperation::Intrinsics::_and(T mask, volatile T* dest) /*! \brief A variable of type T with atomic properties. */ -template -class Atomic -{ -private: +template class Atomic { + private: + typedef typename std::remove_volatile< + typename std::remove_pointer::type>::type>::type value_type; + typename std::add_volatile::type value_; //!< \brief The variable. - typedef typename std::remove_volatile::type>::type>::type value_type; - typename std::add_volatile::type value_; //!< \brief The variable. + public: + //! Construct a new %Atomic variable of type T. + Atomic() : value_(T(0)) {} + //! Construct a new %Atomic variable of type T from \a value. + Atomic(T value) : value_(value) {} + //! Construct a new %Atomic variable of type T from another %Atomic. + Atomic(const Atomic& atomic) : value_(atomic.value_) {} + //! Copy value into this %Atomic variable. + Atomic& operator=(T value) { + value_ = value; + return *this; + } -public: - //! 
Construct a new %Atomic variable of type T. - Atomic() : value_(T(0)) {} - //! Construct a new %Atomic variable of type T from \a value. - Atomic(T value) : value_(value) {} - //! Construct a new %Atomic variable of type T from another %Atomic. - Atomic(const Atomic& atomic) : value_(atomic.value_) { } - //! Copy value into this %Atomic variable. - Atomic& operator = (T value) - { - value_ = value; - return *this; + //! Return the %Atomic variable value. + operator T() const { return T(value_); } + //! Return the %Atomic variable value. + T operator->() const { return T(value_); } + //! Return the %Atomic variable's address. + typename std::add_pointer::type>::type operator&() { + return &value_; + } + + //! Atomically add \a inc to this variable. + Atomic& operator+=(value_type inc) { + if (std::is_pointer::value) { + inc *= sizeof(typename std::remove_pointer::type); } + AtomicOperation::add(inc, &value_); + return *this; + } - //! Return the %Atomic variable value. - operator T () const { return T(value_); } - //! Return the %Atomic variable value. - T operator ->() const { return T(value_); } - //! Return the %Atomic variable's address. - typename std::add_pointer::type>:: - type operator &() { return &value_; } - - //! Atomically add \a inc to this variable. - Atomic& operator += (value_type inc) - { - if (std::is_pointer::value) { - inc *= sizeof(typename std::remove_pointer::type); - } - AtomicOperation::add(inc, &value_); - return *this; + //! Atomically subtract \a inc to this variable. + Atomic& operator-=(value_type inc) { + value_type modifier = 0; + if (std::is_pointer::value) { + inc *= sizeof(typename std::remove_pointer::type); } + AtomicOperation::add(modifier - inc, &value_); + return *this; + } - //! Atomically subtract \a inc to this variable. 
- Atomic& operator -= (value_type inc) - { - value_type modifier = 0; - if (std::is_pointer::value) { - inc *= sizeof(typename std::remove_pointer::type); - } - AtomicOperation::add(modifier - inc, &value_); - return *this; - } + //! Atomically OR \a value to this variable. + Atomic& operator|=(value_type mask) { + AtomicOperation::_or(mask, &value_); + return *this; + } - //! Atomically OR \a value to this variable. - Atomic& operator |= (value_type mask) - { - AtomicOperation::_or(mask, &value_); - return *this; - } + //! Atomically AND \a value to this variable. + Atomic& operator&=(value_type mask) { + AtomicOperation::_and(mask, &value_); + return *this; + } - //! Atomically AND \a value to this variable. - Atomic& operator &= (value_type mask) - { - AtomicOperation::_and(mask, &value_); - return *this; + //! Atomically increment this variable and return its new value. + typename std::remove_reference::type operator++() { + if (std::is_pointer::value) { + value_type inc = static_cast(sizeof(typename std::remove_pointer::type)); + return AtomicOperation::add(inc, &value_) + 1; + } else { + return AtomicOperation::increment(&value_) + 1; } + } - //! Atomically increment this variable and return its new value. - typename std::remove_reference::type operator ++ () - { - if (std::is_pointer::value) { - value_type inc = static_cast( - sizeof(typename std::remove_pointer::type)); - return AtomicOperation::add(inc, &value_) + 1; - } - else { - return AtomicOperation::increment(&value_) + 1; - } + //! Atomically decrement this variable and return its new value. + typename std::remove_reference::type operator--() { + if (std::is_pointer::value) { + value_type inc = + static_cast(-static_cast::type>( + sizeof(typename std::remove_pointer::type))); + return AtomicOperation::add(inc, &value_) - 1; + } else { + return AtomicOperation::decrement(&value_) - 1; } + } - //! Atomically decrement this variable and return its new value. 
- typename std::remove_reference::type operator -- () - { - if (std::is_pointer::value) { - value_type inc = static_cast(- - static_cast::type>( - sizeof(typename std::remove_pointer::type))); - return AtomicOperation::add(inc, &value_) - 1; - } - else { - return AtomicOperation::decrement(&value_) - 1; - } + //! Atomically increment this variable and return its previous value. + typename std::remove_reference::type operator++(int) { + if (std::is_pointer::value) { + value_type inc = static_cast(sizeof(typename std::remove_pointer::type)); + return AtomicOperation::add(inc, &value_); + } else { + return AtomicOperation::increment(&value_); } + } - //! Atomically increment this variable and return its previous value. - typename std::remove_reference::type operator ++ (int) - { - if (std::is_pointer::value) { - value_type inc = static_cast( - sizeof(typename std::remove_pointer::type)); - return AtomicOperation::add(inc, &value_); - } - else { - return AtomicOperation::increment(&value_); - } + //! Atomically decrement this variable and return its previous value. + T operator--(int) { + if (std::is_pointer::value) { + value_type inc = + static_cast(-static_cast::type>( + sizeof(typename std::remove_pointer::type))); + return AtomicOperation::add(inc, &value_); + } else { + return AtomicOperation::decrement(&value_); } + } - //! Atomically decrement this variable and return its previous value. - T operator -- (int) - { - if (std::is_pointer::value) { - value_type inc = static_cast(- - static_cast::type>( - sizeof(typename std::remove_pointer::type))); - return AtomicOperation::add(inc, &value_); - } - else { - return AtomicOperation::decrement(&value_); - } - } + /*! \brief Atomically compare this variable with \a compare and set + * to value if equals + */ + bool compareAndSet(T compare, T value) { + return compare == AtomicOperation::compareAndSwap(compare, &value_, value); + } - /*! 
\brief Atomically compare this variable with \a compare and set - * to value if equals - */ - bool compareAndSet(T compare, T value) - { - return compare == AtomicOperation::compareAndSwap( - compare, &value_, value); - } + //! Atomically set this variable to \a value and return its previous value. + T swap(T value) { return AtomicOperation::swap(value, &value_); } - //! Atomically set this variable to \a value and return its previous value. - T swap(T value) - { - return AtomicOperation::swap(value, &value_); - } + /*! \brief Execute a stores fence followed by a store to this variable. + * + * This storeRelease operation ensures that all store to memory operations + * preceding this function will be globally visible before the update to + * this variable's value. + */ + void storeRelease(T value) { + std::atomic_thread_fence(std::memory_order_release); + value_ = value; + } - /*! \brief Execute a stores fence followed by a store to this variable. - * - * This storeRelease operation ensures that all store to memory operations - * preceding this function will be globally visible before the update to - * this variable's value. - */ - void storeRelease(T value) - { - std::atomic_thread_fence(std::memory_order_release); - value_ = value; - } - - /*! \brief Execute a load from this variable followed by a loads fence. - * - * This loadAcquire operation ensures that all load from memory operations - * following this function will be globally visible after the read from - * this variable's value. - */ - T loadAcquire() const - { - T value = value_; - std::atomic_thread_fence(std::memory_order_acquire); - return value; - } + /*! \brief Execute a load from this variable followed by a loads fence. + * + * This loadAcquire operation ensures that all load from memory operations + * following this function will be globally visible after the read from + * this variable's value. 
+ */ + T loadAcquire() const { + T value = value_; + std::atomic_thread_fence(std::memory_order_acquire); + return value; + } }; //! Helper function to tie an Atomic to a variable of type T. -template -inline Atomic -make_atomic(T& t) -{ - return Atomic(t); -} +template inline Atomic make_atomic(T& t) { return Atomic(t); } /*! @} * @} */ -} // namespace amd +} // namespace amd #endif /*ATOMIC_HPP_*/ diff --git a/rocclr/runtime/thread/monitor.cpp b/rocclr/runtime/thread/monitor.cpp index 9a3d7de433..d54c19999a 100644 --- a/rocclr/runtime/thread/monitor.cpp +++ b/rocclr/runtime/thread/monitor.cpp @@ -14,308 +14,287 @@ namespace amd { -Monitor::Monitor(const char* name, bool recursive) : - contendersList_(0), onDeck_(0), waitersList_(NULL), - owner_(NULL), recursive_(recursive) -{ - const size_t maxNameLen = sizeof(name_); - if (name == NULL) { - const char* unknownName = "@unknown@"; - assert(sizeof(unknownName) < maxNameLen && "just checking"); - strcpy(name_, unknownName); +Monitor::Monitor(const char* name, bool recursive) + : contendersList_(0), onDeck_(0), waitersList_(NULL), owner_(NULL), recursive_(recursive) { + const size_t maxNameLen = sizeof(name_); + if (name == NULL) { + const char* unknownName = "@unknown@"; + assert(sizeof(unknownName) < maxNameLen && "just checking"); + strcpy(name_, unknownName); + } else { + strncpy(name_, name, maxNameLen - 1); + name_[maxNameLen - 1] = '\0'; + } +} + +bool Monitor::trySpinLock() { + if (tryLock()) { + return true; + } + + for (int s = kMaxSpinIter; s > 0; --s) { + // First, be SMT friendly + if (s >= (kMaxSpinIter - kMaxReadSpinIter)) { + Os::spinPause(); } + // and then SMP friendly else { - strncpy(name_, name, maxNameLen - 1); - name_[maxNameLen - 1] = '\0'; + Thread::yield(); } + if (!isLocked()) { + return tryLock(); + } + } + + // We could not acquire the lock in the spin loop. 
+ return false; } -bool -Monitor::trySpinLock() -{ +void Monitor::finishLock() { + Thread* thread = Thread::current(); + assert(thread != NULL && "cannot lock() from (null)"); + + if (trySpinLock()) { + return; // We succeeded, we are done. + } + + /* The lock is contended. Push the thread's semaphore onto + * the contention list. + */ + Semaphore& semaphore = thread->lockSemaphore(); + semaphore.reset(); + + LinkedNode newHead; + newHead.setItem(&semaphore); + + intptr_t head = contendersList_.load(std::memory_order_acquire); + for (;;) { + // The assumption is that lockWord is locked. Make sure we do not + // continue unless the lock bit is set. + if ((head & kLockBit) == 0) { + if (tryLock()) { + return; + } + continue; + } + + // Set the new contention list head if lockWord is unchanged. + newHead.setNext(reinterpret_cast(head & ~kLockBit)); + if (contendersList_.compare_exchange_weak(head, reinterpret_cast(&newHead) | kLockBit, + std::memory_order_acq_rel, + std::memory_order_acquire)) { + break; + } + + // We failed the CAS. yield/pause before trying again. + Thread::yield(); + } + + int32_t spinCount = 0; + // Go to sleep until we become the on-deck thread. + while ((onDeck_ & ~kLockBit) != reinterpret_cast(&semaphore)) { + // First, be SMT friendly + if (spinCount < kMaxReadSpinIter) { + Os::spinPause(); + } + // and then SMP friendly + else if (spinCount < kMaxSpinIter) { + Thread::yield(); + } + // now go to sleep + else { + semaphore.wait(); + } + spinCount++; + } + + spinCount = 0; + // + // From now-on, we are the on-deck thread. It will stay that way until + // we successfuly acquire the lock. 
+ // + for (;;) { + assert((onDeck_ & ~kLockBit) == reinterpret_cast(&semaphore) && "just checking"); if (tryLock()) { - return true; + break; } - for (int s = kMaxSpinIter; s > 0; --s) { - // First, be SMT friendly - if (s >= (kMaxSpinIter - kMaxReadSpinIter)) { - Os::spinPause(); - } - // and then SMP friendly - else { - Thread::yield(); - } - if (!isLocked()) { - return tryLock(); - } + // Somebody beat us to it. Since we are on-deck, we can just go + // back to sleep. + // First, be SMT friendly + if (spinCount < kMaxReadSpinIter) { + Os::spinPause(); } + // and then SMP friendly + else if (spinCount < kMaxSpinIter) { + Thread::yield(); + } + // now go to sleep + else { + semaphore.wait(); + } + spinCount++; + } - // We could not acquire the lock in the spin loop. - return false; + assert(newHead.next() == NULL && "Should not be linked"); + onDeck_ = 0; } -void -Monitor::finishLock() -{ - Thread* thread = Thread::current(); - assert(thread != NULL && "cannot lock() from (null)"); +void Monitor::finishUnlock() { + // If we get here, it means that there might be a thread in the contention + // list waiting to acquire the lock. We need to select a successor and + // place it on-deck. - if (trySpinLock()) { - return; // We succeeded, we are done. + for (;;) { + // Grab the onDeck_ microlock to protect the next loop (make sure only + // one semaphore is removed from the contention list). + // + intptr_t ptr = 0; + if (!onDeck_.compare_exchange_strong(ptr, ptr | kLockBit, std::memory_order_acq_rel, + std::memory_order_acquire)) { + return; // Somebody else has the microlock, let him select onDeck_ } - /* The lock is contended. Push the thread's semaphore onto - * the contention list. - */ - Semaphore& semaphore = thread->lockSemaphore(); - semaphore.reset(); - - LinkedNode newHead; - newHead.setItem(&semaphore); - intptr_t head = contendersList_.load(std::memory_order_acquire); for (;;) { - // The assumption is that lockWord is locked. 
Make sure we do not - // continue unless the lock bit is set. - if ((head & kLockBit) == 0) { - if (tryLock()) { - return; - } - continue; - } + if (head == 0) { + break; // There's nothing else to do. + } - // Set the new contention list head if lockWord is unchanged. - newHead.setNext(reinterpret_cast(head & ~kLockBit)); - if (contendersList_.compare_exchange_weak(head, - reinterpret_cast(&newHead) | kLockBit, - std::memory_order_acq_rel, std::memory_order_acquire)) { - break; - } + if ((head & kLockBit) != 0) { + // Somebody could have acquired then released the lock + // and failed to grab the onDeck_ microlock. + head = 0; + break; + } - // We failed the CAS. yield/pause before trying again. - Thread::yield(); + if (contendersList_.compare_exchange_weak( + head, reinterpret_cast(reinterpret_cast(head)->next()), + std::memory_order_acq_rel, std::memory_order_acquire)) { +#ifdef ASSERT + reinterpret_cast(head)->setNext(NULL); +#endif // ASSERT + break; + } } - int32_t spinCount = 0; - // Go to sleep until we become the on-deck thread. - while ((onDeck_ & ~kLockBit) != reinterpret_cast(&semaphore)) { - // First, be SMT friendly - if (spinCount < kMaxReadSpinIter) { - Os::spinPause(); - } - // and then SMP friendly - else if (spinCount < kMaxSpinIter) { - Thread::yield(); - } - // now go to sleep - else { - semaphore.wait(); - } - spinCount++; - } + Semaphore* semaphore = (head != 0) ? reinterpret_cast(head)->item() : NULL; - spinCount = 0; + onDeck_.store(reinterpret_cast(semaphore), std::memory_order_release); // - // From now-on, we are the on-deck thread. It will stay that way until - // we successfuly acquire the lock. - // - for (;;) { - assert((onDeck_ & ~kLockBit) == reinterpret_cast(&semaphore) - && "just checking"); - if (tryLock()) { - break; - } + // Release the onDeck_ microlock (end of critical region); - // Somebody beat us to it. Since we are on-deck, we can just go - // back to sleep. 
- // First, be SMT friendly - if (spinCount < kMaxReadSpinIter) { - Os::spinPause(); - } - // and then SMP friendly - else if (spinCount < kMaxSpinIter) { - Thread::yield(); - } - // now go to sleep - else { - semaphore.wait(); - } - spinCount++; + if (semaphore != NULL) { + semaphore->post(); + return; } - assert(newHead.next() == NULL && "Should not be linked"); - onDeck_ = 0; + // We do not have an on-deck thread (semaphore == NULL). Return if + // the contention list is empty or if the lock got acquired again. + head = contendersList_; + if (head == 0 || (head & kLockBit) != 0) { + return; + } + } } -void -Monitor::finishUnlock() -{ - // If we get here, it means that there might be a thread in the contention - // list waiting to acquire the lock. We need to select a successor and - // place it on-deck. +void Monitor::wait() { + Thread* thread = Thread::current(); + assert(isLocked() && owner_ == thread && "just checking"); - for (;;) { - // Grab the onDeck_ microlock to protect the next loop (make sure only - // one semaphore is removed from the contention list). - // - intptr_t ptr = 0; - if (!onDeck_.compare_exchange_strong(ptr, ptr | kLockBit, - std::memory_order_acq_rel, std::memory_order_acquire)) { - return; // Somebody else has the microlock, let him select onDeck_ - } + // Add the thread's resume semaphore to the list. + Semaphore& suspend = thread->suspendSemaphore(); + suspend.reset(); - intptr_t head = contendersList_.load(std::memory_order_acquire); - for (;;) { - if (head == 0) { - break; // There's nothing else to do. - } + LinkedNode newHead; + newHead.setItem(&suspend); + newHead.setNext(waitersList_); + waitersList_ = &newHead; - if ((head & kLockBit) != 0) { - // Somebody could have acquired then released the lock - // and failed to grab the onDeck_ microlock. 
- head = 0; - break; - } + // Preserve the lock count (for recursive mutexes) + uint32_t lockCount = lockCount_; + lockCount_ = 1; - if (contendersList_.compare_exchange_weak( - head, reinterpret_cast( - reinterpret_cast(head)->next()), - std::memory_order_acq_rel, std::memory_order_acquire)) { - #ifdef ASSERT - reinterpret_cast(head)->setNext(NULL); - #endif // ASSERT - break; - } - } + // Release the lock and go to sleep. + unlock(); - Semaphore* semaphore = (head != 0) - ? reinterpret_cast(head)->item() - : NULL; - - onDeck_.store(reinterpret_cast(semaphore), - std::memory_order_release); - // - // Release the onDeck_ microlock (end of critical region); - - if (semaphore != NULL) { - semaphore->post(); - return; - } - - // We do not have an on-deck thread (semaphore == NULL). Return if - // the contention list is empty or if the lock got acquired again. - head = contendersList_; - if (head == 0 || (head & kLockBit) != 0) { - return; - } + // Go to sleep until we become the on-deck thread. + int32_t spinCount = 0; + while ((onDeck_ & ~kLockBit) != reinterpret_cast(&suspend)) { + // First, be SMT friendly + if (spinCount < kMaxReadSpinIter) { + Os::spinPause(); } + // and then SMP friendly + else if (spinCount < kMaxSpinIter) { + Thread::yield(); + } + // now go to sleep + else { + suspend.wait(); + } + spinCount++; + } + + spinCount = 0; + for (;;) { + assert((onDeck_ & ~kLockBit) == reinterpret_cast(&suspend) && "just checking"); + + if (trySpinLock()) { + break; + } + + // Somebody beat us to it. Since we are on-deck, we can just go + // back to sleep. 
+ // First, be SMT friendly + if (spinCount < kMaxReadSpinIter) { + Os::spinPause(); + } + // and then SMP friendly + else if (spinCount < kMaxSpinIter) { + Thread::yield(); + } + // now go to sleep + else { + suspend.wait(); + } + spinCount++; + } + + // Restore the lock count (for recursive mutexes) + lockCount_ = lockCount; + + onDeck_.store(0, std::memory_order_release); } -void -Monitor::wait() -{ - Thread* thread = Thread::current(); - assert(isLocked() && owner_ == thread && "just checking"); +void Monitor::notify() { + assert(isLocked() && owner_ == Thread::current() && "just checking"); - // Add the thread's resume semaphore to the list. - Semaphore& suspend = thread->suspendSemaphore(); - suspend.reset(); + LinkedNode* waiter = waitersList_; + if (waiter == NULL) { + return; + } - LinkedNode newHead; - newHead.setItem(&suspend); - newHead.setNext(waitersList_); - waitersList_ = &newHead; + // Dequeue a waiter from the wait list and add it to the contention list. + waitersList_ = waiter->next(); - // Preserve the lock count (for recursive mutexes) - uint32_t lockCount = lockCount_; - lockCount_ = 1; - - // Release the lock and go to sleep. - unlock(); - - // Go to sleep until we become the on-deck thread. 
- int32_t spinCount = 0; - while ((onDeck_ & ~kLockBit) != reinterpret_cast(&suspend)) { - // First, be SMT friendly - if (spinCount < kMaxReadSpinIter) { - Os::spinPause(); - } - // and then SMP friendly - else if (spinCount < kMaxSpinIter) { - Thread::yield(); - } - // now go to sleep - else { - suspend.wait(); - } - spinCount++; + intptr_t node = contendersList_.load(std::memory_order_acquire); + for (;;) { + waiter->setNext(reinterpret_cast(node & ~kLockBit)); + if (contendersList_.compare_exchange_weak(node, reinterpret_cast(waiter) | kLockBit, + std::memory_order_acq_rel, + std::memory_order_acquire)) { + break; } - - spinCount = 0; - for (;;) { - assert((onDeck_ & ~kLockBit) == reinterpret_cast(&suspend) - && "just checking"); - - if (trySpinLock()) { - break; - } - - // Somebody beat us to it. Since we are on-deck, we can just go - // back to sleep. - // First, be SMT friendly - if (spinCount < kMaxReadSpinIter) { - Os::spinPause(); - } - // and then SMP friendly - else if (spinCount < kMaxSpinIter) { - Thread::yield(); - } - // now go to sleep - else { - suspend.wait(); - } - spinCount++; - } - - // Restore the lock count (for recursive mutexes) - lockCount_ = lockCount; - - onDeck_.store(0, std::memory_order_release); + } } -void -Monitor::notify() -{ - assert(isLocked() && owner_ == Thread::current() && "just checking"); - - LinkedNode* waiter = waitersList_; - if (waiter == NULL) { - return; - } - - // Dequeue a waiter from the wait list and add it to the contention list. - waitersList_ = waiter->next(); - - intptr_t node = contendersList_.load(std::memory_order_acquire); - for (;;) { - waiter->setNext(reinterpret_cast(node & ~kLockBit)); - if (contendersList_.compare_exchange_weak(node, - reinterpret_cast(waiter) | kLockBit, - std::memory_order_acq_rel, std::memory_order_acquire)) { - break; - } - } +void Monitor::notifyAll() { + // NOTE: We could CAS the whole list in 1 shot but this is + // not critical code. Optimize this if it becomes hot. 
+ while (waitersList_ != NULL) { + notify(); + } } -void -Monitor::notifyAll() -{ - // NOTE: We could CAS the whole list in 1 shot but this is - // not critical code. Optimize this if it becomes hot. - while (waitersList_ != NULL) { - notify(); - } -} - -} // namespace amd +} // namespace amd diff --git a/rocclr/runtime/thread/monitor.hpp b/rocclr/runtime/thread/monitor.hpp index fac75b72d8..4d72e2ae0d 100644 --- a/rocclr/runtime/thread/monitor.hpp +++ b/rocclr/runtime/thread/monitor.hpp @@ -25,239 +25,221 @@ namespace amd { namespace details { -template -struct SimplyLinkedNode : public AllocClass -{ - typedef SimplyLinkedNode Node; +template struct SimplyLinkedNode : public AllocClass { + typedef SimplyLinkedNode Node; -protected: - std::atomic next_; /*!< \brief The next element. */ - T volatile item_; + protected: + std::atomic next_; /*!< \brief The next element. */ + T volatile item_; -public: - //! \brief Return the next element in the linked-list. - Node* next() const { return next_; } - //! \brief Return the item. - T item() const { return item_; } + public: + //! \brief Return the next element in the linked-list. + Node* next() const { return next_; } + //! \brief Return the item. + T item() const { return item_; } - //! \brief Set the next element pointer. - void setNext(Node* next) { next_ = next; } - //! \brief Set the item. - void setItem(T item) { item_ = item; } + //! \brief Set the next element pointer. + void setNext(Node* next) { next_ = next; } + //! \brief Set the item. + void setItem(T item) { item_ = item; } - //! \brief Swap the next element pointer. - Node* swapNext(Node* next) { return next_.swap(next); } + //! \brief Swap the next element pointer. + Node* swapNext(Node* next) { return next_.swap(next); } - //! \brief Compare and set the next element pointer. - bool compareAndSetNext(Node* compare, Node* next) - { - return next_.compare_exchange_strong(compare, next); - } + //! \brief Compare and set the next element pointer. 
+ bool compareAndSetNext(Node* compare, Node* next) { + return next_.compare_exchange_strong(compare, next); + } }; -} // namespace details +} // namespace details -class Monitor : public HeapObject -{ - typedef details::SimplyLinkedNode LinkedNode; +class Monitor : public HeapObject { + typedef details::SimplyLinkedNode LinkedNode; -private: - static const intptr_t kLockBit = 0x1; + private: + static const intptr_t kLockBit = 0x1; - static const int kMaxSpinIter = 55; //!< Total number of spin iterations. - static const int kMaxReadSpinIter = 50; //!< Read iterations before yielding + static const int kMaxSpinIter = 55; //!< Total number of spin iterations. + static const int kMaxReadSpinIter = 50; //!< Read iterations before yielding - /*! Linked list of semaphores the contending threads are waiting on - * and main lock. - */ - std::atomic_intptr_t contendersList_; - //! The Mutex's name - char name_[64]; + /*! Linked list of semaphores the contending threads are waiting on + * and main lock. + */ + std::atomic_intptr_t contendersList_; + //! The Mutex's name + char name_[64]; - //! Semaphore of the next thread to contend for the lock. - std::atomic_intptr_t onDeck_; - //! Linked list of the suspended threads resume semaphores. - LinkedNode* volatile waitersList_; + //! Semaphore of the next thread to contend for the lock. + std::atomic_intptr_t onDeck_; + //! Linked list of the suspended threads resume semaphores. + LinkedNode* volatile waitersList_; - //! Thread owning this monitor. - Thread* volatile owner_; - //! The amount of times this monitor was acquired by the owner. - uint32_t lockCount_; - //! True if this is a recursive mutex, false otherwise. - const bool recursive_; + //! Thread owning this monitor. + Thread* volatile owner_; + //! The amount of times this monitor was acquired by the owner. + uint32_t lockCount_; + //! True if this is a recursive mutex, false otherwise. + const bool recursive_; -private: - //! 
Finish locking the mutex (contented case). - void finishLock(); - //! Finish unlocking the mutex (contented case). - void finishUnlock(); + private: + //! Finish locking the mutex (contented case). + void finishLock(); + //! Finish unlocking the mutex (contented case). + void finishUnlock(); -protected: - //! Try to spin-acquire the lock, return true if successful. - bool trySpinLock(); + protected: + //! Try to spin-acquire the lock, return true if successful. + bool trySpinLock(); - /*! \brief Return true if the lock is owned. - * - * \note The user is responsible for the memory ordering. - */ - bool isLocked() const { return (contendersList_ & kLockBit) != 0; } + /*! \brief Return true if the lock is owned. + * + * \note The user is responsible for the memory ordering. + */ + bool isLocked() const { return (contendersList_ & kLockBit) != 0; } - //! Return this monitor's owner thread (NULL if unlocked). - Thread* owner() const { return owner_; } + //! Return this monitor's owner thread (NULL if unlocked). + Thread* owner() const { return owner_; } - //! Set the owner. - void setOwner(Thread* thread) { owner_ = thread; } + //! Set the owner. + void setOwner(Thread* thread) { owner_ = thread; } -public: - explicit Monitor(const char* name = NULL, bool recursive = false); - ~Monitor() {} + public: + explicit Monitor(const char* name = NULL, bool recursive = false); + ~Monitor() {} - //! Try to acquire the lock, return true if successful. - inline bool tryLock(); + //! Try to acquire the lock, return true if successful. + inline bool tryLock(); - //! Acquire the lock or suspend the calling thread. - inline void lock(); + //! Acquire the lock or suspend the calling thread. + inline void lock(); - //! Release the lock and wake a single waiting thread if any. - inline void unlock(); + //! Release the lock and wake a single waiting thread if any. + inline void unlock(); - /*! \brief Give up the lock and go to sleep. 
- * - * Calling wait() causes the current thread to go to sleep until - * another thread calls notify()/notifyAll(). - * - * \note The monitor must be owned before calling wait(). - */ - void wait(); - /*! \brief Wake up a single thread waiting on this monitor. - * - * \note The monitor must be owned before calling notify(). - */ - void notify(); - /*! \brief Wake up all threads that are waiting on this monitor. - * - * \note The monitor must be owned before calling notifyAll(). - */ - void notifyAll(); + /*! \brief Give up the lock and go to sleep. + * + * Calling wait() causes the current thread to go to sleep until + * another thread calls notify()/notifyAll(). + * + * \note The monitor must be owned before calling wait(). + */ + void wait(); + /*! \brief Wake up a single thread waiting on this monitor. + * + * \note The monitor must be owned before calling notify(). + */ + void notify(); + /*! \brief Wake up all threads that are waiting on this monitor. + * + * \note The monitor must be owned before calling notifyAll(). + */ + void notifyAll(); - //! Return this lock's name. - const char* name() const { return name_; } + //! Return this lock's name. + const char* name() const { return name_; } }; -class ScopedLock : StackObject -{ -private: - Monitor* lock_; +class ScopedLock : StackObject { + private: + Monitor* lock_; -public: - ScopedLock(Monitor& lock) - : lock_(&lock) - { - lock_->lock(); - } + public: + ScopedLock(Monitor& lock) : lock_(&lock) { lock_->lock(); } - ScopedLock(Monitor* lock) - : lock_(lock) - { - if (lock_) lock_->lock(); - } + ScopedLock(Monitor* lock) : lock_(lock) { + if (lock_) lock_->lock(); + } - ~ScopedLock() - { - if (lock_) lock_->unlock(); - } + ~ScopedLock() { + if (lock_) lock_->unlock(); + } }; /*! 
@} * @} */ -inline bool -Monitor::tryLock() -{ - Thread* thread = Thread::current(); - assert(thread != NULL && "cannot lock() from (null)"); +inline bool Monitor::tryLock() { + Thread* thread = Thread::current(); + assert(thread != NULL && "cannot lock() from (null)"); - intptr_t ptr = contendersList_.load(std::memory_order_acquire); + intptr_t ptr = contendersList_.load(std::memory_order_acquire); - if (unlikely((ptr & kLockBit) != 0)) { - if (recursive_ && thread == owner_) { - // Recursive lock: increment the lock count and return. - ++lockCount_; - return true; - } - return false; // Already locked! + if (unlikely((ptr & kLockBit) != 0)) { + if (recursive_ && thread == owner_) { + // Recursive lock: increment the lock count and return. + ++lockCount_; + return true; } + return false; // Already locked! + } - if (unlikely(!contendersList_.compare_exchange_weak(ptr, ptr | kLockBit, - std::memory_order_acq_rel, std::memory_order_acquire))) { - return false; // We failed the CAS from unlocked to locked. - } + if (unlikely(!contendersList_.compare_exchange_weak( + ptr, ptr | kLockBit, std::memory_order_acq_rel, std::memory_order_acquire))) { + return false; // We failed the CAS from unlocked to locked. + } - setOwner(thread); // cannot move above the CAS. - lockCount_ = 1; + setOwner(thread); // cannot move above the CAS. + lockCount_ = 1; - return true; + return true; } -inline void -Monitor::lock() -{ - if (unlikely(!tryLock())) { - // The lock is contented. - finishLock(); - } +inline void Monitor::lock() { + if (unlikely(!tryLock())) { + // The lock is contented. + finishLock(); + } - // This is the beginning of the critical region. From now-on, everything - // executes single-threaded! - // + // This is the beginning of the critical region. From now-on, everything + // executes single-threaded! 
+ // } -inline void -Monitor::unlock() -{ - assert(isLocked() && owner_ == Thread::current() && "invariant"); +inline void Monitor::unlock() { + assert(isLocked() && owner_ == Thread::current() && "invariant"); - if (recursive_ && --lockCount_ > 0) { - // was a recursive lock case, simply return. - return; + if (recursive_ && --lockCount_ > 0) { + // was a recursive lock case, simply return. + return; + } + + setOwner(NULL); + + // Clear the lock bit. + intptr_t ptr = contendersList_.load(std::memory_order_acquire); + while (!contendersList_.compare_exchange_weak(ptr, ptr & ~kLockBit, std::memory_order_acq_rel, + std::memory_order_acquire)) + ; + // + // We succeeded the CAS from locked to unlocked. + // This is the end of the critical region. + + // Check if we have an on-deck thread that needs signaling. + intptr_t onDeck = onDeck_; + if (onDeck != 0) { + if ((onDeck & kLockBit) == 0) { + // Only signal if it is unmarked. + reinterpret_cast(onDeck)->post(); } + return; // We are done. + } - setOwner(NULL); + // We do not have an on-deck thread yet, we might have to walk the list in + // order to select the next onDeck_. Only one thread needs to fill onDeck_, + // so return if the list is empty or if the lock got acquired again (it's + // somebody else's problem now!) - // Clear the lock bit. - intptr_t ptr = contendersList_.load(std::memory_order_acquire); - while (!contendersList_.compare_exchange_weak(ptr, ptr & ~kLockBit, - std::memory_order_acq_rel, std::memory_order_acquire)) - ; - // - // We succeeded the CAS from locked to unlocked. - // This is the end of the critical region. + intptr_t head = contendersList_; + if (head == 0 || (head & kLockBit) != 0) { + return; + } - // Check if we have an on-deck thread that needs signaling. - intptr_t onDeck = onDeck_; - if (onDeck != 0) { - if ((onDeck & kLockBit) == 0) { - // Only signal if it is unmarked. - reinterpret_cast(onDeck)->post(); - } - return; // We are done. 
- } - - // We do not have an on-deck thread yet, we might have to walk the list in - // order to select the next onDeck_. Only one thread needs to fill onDeck_, - // so return if the list is empty or if the lock got acquired again (it's - // somebody else's problem now!) - - intptr_t head = contendersList_; - if (head == 0 || (head & kLockBit) != 0) { - return; - } - - // Finish the unlock operation: find a thread to wake up. - finishUnlock(); + // Finish the unlock operation: find a thread to wake up. + finishUnlock(); } -} // namespace amd +} // namespace amd #endif /*MONITOR_HPP_*/ diff --git a/rocclr/runtime/thread/semaphore.cpp b/rocclr/runtime/thread/semaphore.cpp index d443163b29..25ac395c76 100644 --- a/rocclr/runtime/thread/semaphore.cpp +++ b/rocclr/runtime/thread/semaphore.cpp @@ -6,90 +6,82 @@ #include "thread/thread.hpp" #if defined(_WIN32) || defined(__CYGWIN__) -# include -#else // !_WIN32 -# include -# include -#endif // !_WIN32 +#include +#else // !_WIN32 +#include +#include +#endif // !_WIN32 namespace amd { -Semaphore::Semaphore() - : state_(0) -{ +Semaphore::Semaphore() : state_(0) { #ifdef _WIN32 - handle_ = static_cast(CreateSemaphore(NULL, 0, LONG_MAX, NULL)); - assert(handle_ != NULL && "CreateSemaphore failed"); -#else // !_WIN32 - if (sem_init(&sem_, 0, 0) != 0) { - fatal("sem_init() failed"); - } -#endif // !_WIN32 + handle_ = static_cast(CreateSemaphore(NULL, 0, LONG_MAX, NULL)); + assert(handle_ != NULL && "CreateSemaphore failed"); +#else // !_WIN32 + if (sem_init(&sem_, 0, 0) != 0) { + fatal("sem_init() failed"); + } +#endif // !_WIN32 } -Semaphore::~Semaphore() -{ +Semaphore::~Semaphore() { #ifdef _WIN32 - if (!CloseHandle(static_cast(handle_))) { - fatal("CloseHandle() failed"); - } -#else // !_WIN32 - if (sem_destroy(&sem_) != 0) { - fatal("sem_destroy() failed"); - } -#endif // !WIN32 + if (!CloseHandle(static_cast(handle_))) { + fatal("CloseHandle() failed"); + } +#else // !_WIN32 + if (sem_destroy(&sem_) != 0) { + 
fatal("sem_destroy() failed"); + } +#endif // !WIN32 } -void -Semaphore::post() -{ - int state = state_.load(std::memory_order_relaxed); - for (;;) { - if (state > 0) { - int newstate = state_.load(std::memory_order_acquire); - if (state == newstate) { - return; - } - state = newstate; - continue; - } - if (state_.compare_exchange_weak(state, state+1, - std::memory_order_acq_rel, std::memory_order_acquire)) { - break; - } - } - - if (state < 0) { - // We have threads waiting on this event. -#ifdef _WIN32 - ReleaseSemaphore(static_cast(handle_), 1, NULL); -#else // !_WIN32 - if (0 != sem_post(&sem_)) { - fatal("sem_post() failed"); - } -#endif // !_WIN32 - } -} - -void -Semaphore::wait() -{ - if (state_-- > 0) { +void Semaphore::post() { + int state = state_.load(std::memory_order_relaxed); + for (;;) { + if (state > 0) { + int newstate = state_.load(std::memory_order_acquire); + if (state == newstate) { return; - } - -#ifdef _WIN32 - if (WAIT_OBJECT_0 != WaitForSingleObject( - static_cast(handle_), INFINITE)) { - fatal("WaitForSingleObject failed"); - } -#else // !_WIN32 - while (0 != sem_wait(&sem_)) { - if (EINTR != errno) { - fatal("sem_wait() failed"); } + state = newstate; + continue; } -#endif // !_WIN32 + if (state_.compare_exchange_weak(state, state + 1, std::memory_order_acq_rel, + std::memory_order_acquire)) { + break; + } + } + + if (state < 0) { +// We have threads waiting on this event. 
+#ifdef _WIN32 + ReleaseSemaphore(static_cast(handle_), 1, NULL); +#else // !_WIN32 + if (0 != sem_post(&sem_)) { + fatal("sem_post() failed"); + } +#endif // !_WIN32 + } } -} // namespace amd +void Semaphore::wait() { + if (state_-- > 0) { + return; + } + +#ifdef _WIN32 + if (WAIT_OBJECT_0 != WaitForSingleObject(static_cast(handle_), INFINITE)) { + fatal("WaitForSingleObject failed"); + } +#else // !_WIN32 + while (0 != sem_wait(&sem_)) { + if (EINTR != errno) { + fatal("sem_wait() failed"); + } + } +#endif // !_WIN32 +} + +} // namespace amd diff --git a/rocclr/runtime/thread/semaphore.hpp b/rocclr/runtime/thread/semaphore.hpp index 93c8c0f209..ed27bb9980 100644 --- a/rocclr/runtime/thread/semaphore.hpp +++ b/rocclr/runtime/thread/semaphore.hpp @@ -10,7 +10,7 @@ #include #if defined(__linux__) -# include +#include #endif /*linux*/ @@ -26,40 +26,36 @@ namespace amd { class Thread; //! \brief Counting semaphore -class Semaphore : public HeapObject -{ -private: - std::atomic_int state_; //!< This semaphore's value. +class Semaphore : public HeapObject { + private: + std::atomic_int state_; //!< This semaphore's value. #ifdef _WIN32 - void* handle_; //!< The semaphore object's handle. - char padding_[64-sizeof(void*)-sizeof(std::atomic_int)]; -#else // !_WIN32 - sem_t sem_; //!< The semaphore object's identifier. - char padding_[64-sizeof(sem_t)-sizeof(std::atomic_int)]; + void* handle_; //!< The semaphore object's handle. + char padding_[64 - sizeof(void*) - sizeof(std::atomic_int)]; +#else // !_WIN32 + sem_t sem_; //!< The semaphore object's identifier. + char padding_[64 - sizeof(sem_t) - sizeof(std::atomic_int)]; #endif /*!_WIN32*/ -public: - Semaphore(); - ~Semaphore(); + public: + Semaphore(); + ~Semaphore(); - //! \brief Decrement this semaphore - void wait(); + //! \brief Decrement this semaphore + void wait(); - //! \brief Increment this semaphore - void post(); + //! \brief Increment this semaphore + void post(); - //! \brief Reset this semaphore. 
- void reset() - { - state_.store(0, std::memory_order_release); - } + //! \brief Reset this semaphore. + void reset() { state_.store(0, std::memory_order_release); } }; /*! @} * @} */ -} // namespace amd +} // namespace amd #endif /*SEMAPHORE_HPP_*/ diff --git a/rocclr/runtime/thread/thread.cpp b/rocclr/runtime/thread/thread.cpp index ee1e6abde3..22d017453e 100644 --- a/rocclr/runtime/thread/thread.cpp +++ b/rocclr/runtime/thread/thread.cpp @@ -8,109 +8,97 @@ #include "os/os.hpp" #if defined(_WIN32) || defined(__CYGWIN__) -# include -#endif // _WIN32 +#include +#endif // _WIN32 namespace amd { -HostThread::HostThread() - : Thread("HostThread", 0, false) -{ - setCurrent(); - Os::currentStackInfo(&stackBase_, &stackSize_); - setState(RUNNABLE); +HostThread::HostThread() : Thread("HostThread", 0, false) { + setCurrent(); + Os::currentStackInfo(&stackBase_, &stackSize_); + setState(RUNNABLE); } -void -Thread::create() -{ - created_ = new Semaphore(); - lock_ = new Semaphore(); - suspend_ = new Semaphore(); +void Thread::create() { + created_ = new Semaphore(); + lock_ = new Semaphore(); + suspend_ = new Semaphore(); - selfSuspendLock_ = new Monitor(); + selfSuspendLock_ = new Monitor(); - data_ = NULL; - handle_ = NULL; - setState(CREATED); + data_ = NULL; + handle_ = NULL; + setState(CREATED); } Thread::Thread(const std::string& name, size_t stackSize, bool spawn) - : handle_(NULL), name_(name), stackSize_(stackSize) -{ - create(); + : handle_(NULL), name_(name), stackSize_(stackSize) { + create(); - if (!spawn) return; + if (!spawn) return; - if ((handle_ = Os::createOsThread(this))) { - // Now we need to wait for Thread::main to report back. - while (state() != Thread::INITIALIZED) { - created_->wait(); - } + if ((handle_ = Os::createOsThread(this))) { + // Now we need to wait for Thread::main to report back. 
+ while (state() != Thread::INITIALIZED) { + created_->wait(); } + } } -Thread::~Thread() -{ +Thread::~Thread() { #if defined(_WIN32) - if (handle_ != NULL) { - ::CloseHandle((HANDLE) handle_); - } + if (handle_ != NULL) { + ::CloseHandle((HANDLE)handle_); + } #endif - delete created_; - delete lock_; - delete suspend_; + delete created_; + delete lock_; + delete suspend_; - delete selfSuspendLock_; + delete selfSuspendLock_; } -void* -Thread::main() -{ +void* Thread::main() { #ifdef DEBUG - Os::setCurrentThreadName(name().c_str()); -#endif // DEBUG - Os::currentStackInfo(&stackBase_, &stackSize_); - setCurrent(); + Os::setCurrentThreadName(name().c_str()); +#endif // DEBUG + Os::currentStackInfo(&stackBase_, &stackSize_); + setCurrent(); - // Notify the parent thread that we are up and running. - { - ScopedLock sl(selfSuspendLock_); - setState(INITIALIZED); - created_->post(); - selfSuspendLock_->wait(); - } - - if (state() == RUNNABLE) { - run(data_); - } - - setState(FINISHED); - return NULL; -} - -bool -Thread::start(void* data) -{ - if (state() != INITIALIZED) { - return false; - } - - data_ = data; - { - ScopedLock sl(selfSuspendLock_); - setState(RUNNABLE); - selfSuspendLock_->notify(); - } - - return true; -} - -void -Thread::resume() -{ + // Notify the parent thread that we are up and running. 
+ { ScopedLock sl(selfSuspendLock_); + setState(INITIALIZED); + created_->post(); + selfSuspendLock_->wait(); + } + + if (state() == RUNNABLE) { + run(data_); + } + + setState(FINISHED); + return NULL; +} + +bool Thread::start(void* data) { + if (state() != INITIALIZED) { + return false; + } + + data_ = data; + { + ScopedLock sl(selfSuspendLock_); + setState(RUNNABLE); selfSuspendLock_->notify(); + } + + return true; +} + +void Thread::resume() { + ScopedLock sl(selfSuspendLock_); + selfSuspendLock_->notify(); } #if defined(__linux__) @@ -119,19 +107,13 @@ namespace details { __thread Thread* thread_ __attribute__((tls_model("initial-exec"))); -} // namespace details +} // namespace details -void -Thread::registerStack(address base, address top) -{ - // Nothing to do. +void Thread::registerStack(address base, address top) { + // Nothing to do. } -void -Thread::setCurrent() -{ - details::thread_ = this; -} +void Thread::setCurrent() { details::thread_ = this; } #elif defined(_WIN32) @@ -139,53 +121,45 @@ namespace details { #if defined(USE_DECLSPEC_THREAD) __declspec(thread) Thread* thread_; -#else // !USE_DECLSPEC_THREAD +#else // !USE_DECLSPEC_THREAD DWORD threadIndex_ = TlsAlloc(); -#endif // !USE_DECLSPEC_THREAD +#endif // !USE_DECLSPEC_THREAD -} // namespace details +} // namespace details -void -Thread::registerStack(address base, address top) -{ - // Nothing to do. +void Thread::registerStack(address base, address top) { + // Nothing to do. 
} -void -Thread::setCurrent() -{ +void Thread::setCurrent() { #if defined(USE_DECLSPEC_THREAD) - details::thread_ = this; -#else // !USE_DECLSPEC_THREAD - TlsSetValue(details::threadIndex_, this); -#endif // !USE_DECLSPEC_THREAD + details::thread_ = this; +#else // !USE_DECLSPEC_THREAD + TlsSetValue(details::threadIndex_, this); +#endif // !USE_DECLSPEC_THREAD } #endif -bool -Thread::init() -{ - static bool initialized_ = false; +bool Thread::init() { + static bool initialized_ = false; - // We could use InitOnceExecuteOnce/pthread_once here: - if (initialized_) { - return true; - } - initialized_ = true; + // We could use InitOnceExecuteOnce/pthread_once here: + if (initialized_) { + return true; + } + initialized_ = true; - // Register the main thread - return NULL != new HostThread(); + // Register the main thread + return NULL != new HostThread(); } -void -Thread::tearDown() -{ +void Thread::tearDown() { #if defined(_WIN32) && !defined(USE_DECLSPEC_THREAD) - if (details::threadIndex_ != TLS_OUT_OF_INDEXES) { - TlsFree(threadIndex_); - } -#endif // _WIN32 && !USE_DECLSPEC_THREAD + if (details::threadIndex_ != TLS_OUT_OF_INDEXES) { + TlsFree(threadIndex_); + } +#endif // _WIN32 && !USE_DECLSPEC_THREAD } -} // namespace amd +} // namespace amd diff --git a/rocclr/runtime/thread/thread.hpp b/rocclr/runtime/thread/thread.hpp index c2b5ca274c..8a15c61ceb 100644 --- a/rocclr/runtime/thread/thread.hpp +++ b/rocclr/runtime/thread/thread.hpp @@ -12,10 +12,10 @@ #include #if defined(_WIN32) -# define USE_DECLSPEC_THREAD 1 -# if !defined(USE_DECLSPEC_THREAD) -# include -# endif /*!USE_DECLSPEC_THREAD*/ +#define USE_DECLSPEC_THREAD 1 +#if !defined(USE_DECLSPEC_THREAD) +#include +#endif /*!USE_DECLSPEC_THREAD*/ #endif /*_WIN32*/ namespace amd { @@ -29,157 +29,137 @@ namespace amd { class Monitor; -class Thread : public HeapObject -{ - friend const void* Os::createOsThread(Thread*); +class Thread : public HeapObject { + friend const void* Os::createOsThread(Thread*); -public: 
- enum ThreadState - { - CREATED, - INITIALIZED, - RUNNABLE, - SUSPENDED, - FINISHED, - FAILED - }; + public: + enum ThreadState { CREATED, INITIALIZED, RUNNABLE, SUSPENDED, FINISHED, FAILED }; -private: - //! System thread handle. - const void* handle_; - //! The thread's name. - const std::string name_; - //! Current running state. - volatile ThreadState state_; - //! The argument passed to run() - void* data_; + private: + //! System thread handle. + const void* handle_; + //! The thread's name. + const std::string name_; + //! Current running state. + volatile ThreadState state_; + //! The argument passed to run() + void* data_; - //! \cond ignore - Semaphore* created_; //!< To notify the parent thread. - Semaphore* lock_; //!< For mutex support (during contention). - Semaphore* suspend_; //!< For wait/suspend support. - //! \endcond + //! \cond ignore + Semaphore* created_; //!< To notify the parent thread. + Semaphore* lock_; //!< For mutex support (during contention). + Semaphore* suspend_; //!< For wait/suspend support. + //! \endcond - Monitor* selfSuspendLock_; //!< For self suspend/resume. + Monitor* selfSuspendLock_; //!< For self suspend/resume. -protected: - address stackBase_; //!< Main stack base. - size_t stackSize_; //!< Main stack size. + protected: + address stackBase_; //!< Main stack base. + size_t stackSize_; //!< Main stack size. -private: + private: + /*! \brief The start wrapper for all newly create threads. + * This is called from the pthread_create start_thread. + */ + static void* entry(Thread* thread); - /*! \brief The start wrapper for all newly create threads. - * This is called from the pthread_create start_thread. - */ - static void* entry(Thread* thread); + /*! \brief Thread main (called from the main function). + * Setup the thread for running and wait for the semaphore to be signaled. + */ + void* main(); - /*! \brief Thread main (called from the main function). 
- * Setup the thread for running and wait for the semaphore to be signaled. - */ - void* main(); + //! The entry point for this thread. + virtual void run(void* data) = 0; - //! The entry point for this thread. - virtual void run(void* data) = 0; + protected: + //! Bring this thread to the created state. + void create(); -protected: - //! Bring this thread to the created state. - void create(); + //! Set the current thread state. + void setState(ThreadState state) { state_ = state; } - //! Set the current thread state. - void setState(ThreadState state) { state_ = state; } + //! Set the thread-local _thread variable (used by current()). + void setCurrent(); - //! Set the thread-local _thread variable (used by current()). - void setCurrent(); + //! Register the given memory region as a valid stack. + void registerStack(address base, address top); - //! Register the given memory region as a valid stack. - void registerStack(address base, address top); + /*! \brief Construct a new thread. + * If \a spawn is false, do not create a new OS thread, instead, + * bind to the currently running on. + */ + explicit Thread(const std::string& name, size_t stackSize = 0 /*use system default*/, + bool spawn = true /* create a new Os::thread */); - /*! \brief Construct a new thread. - * If \a spawn is false, do not create a new OS thread, instead, - * bind to the currently running on. - */ - explicit Thread( - const std::string& name, - size_t stackSize = 0 /*use system default*/, - bool spawn = true /* create a new Os::thread */); + public: + //! Return the currently running thread instance. + static inline Thread* current(); -public: - //! Return the currently running thread instance. - static inline Thread* current(); + //! Initialize the OsThread package. + static bool init(); - //! Initialize the OsThread package. - static bool init(); + //! Tear down the OsThread package. + static void tearDown(); - //! Tear down the OsThread package. - static void tearDown(); + //! 
Destroy this thread. + virtual ~Thread(); - //! Destroy this thread. - virtual ~Thread(); + //! Return the thread's name + const std::string& name() const { return name_; } - //! Return the thread's name - const std::string& name() const { return name_; } + //! Get the system thread handle. + const void* handle() const { return handle_; } - //! Get the system thread handle. - const void* handle() const { return handle_; } + //! Start the thread execution + bool start(void* data = NULL); - //! Start the thread execution - bool start(void *data = NULL); + //! Resume the thread + void resume(); - //! Resume the thread - void resume(); + //! Return true is this is the host thread. + virtual bool isHostThread() const { return false; } - //! Return true is this is the host thread. - virtual bool isHostThread() const { return false; } + //! Return true if this is a worker thread. + virtual bool isWorkerThread() const { return false; } - //! Return true if this is a worker thread. - virtual bool isWorkerThread() const { return false; } + //! Get the current thread state. + ThreadState state() const { return state_; } - //! Get the current thread state. - ThreadState state() const { return state_; } + //! Return this thread's stack base. + address stackBase() const { return stackBase_; } + //! Return this thread's stack size. + size_t stackSize() const { return stackSize_; } + //! Return this thread's stack bottom. + address stackBottom() const { return stackBase() - stackSize(); } - //! Return this thread's stack base. - address stackBase() const { return stackBase_; } - //! Return this thread's stack size. - size_t stackSize() const { return stackSize_; } - //! Return this thread's stack bottom. - address stackBottom() const { return stackBase() - stackSize(); } + //! Return this thread's contend semaphore. + Semaphore& lockSemaphore() const { return *lock_; } + //! Return this thread's resume semaphore. + Semaphore& suspendSemaphore() const { return *suspend_; } - //! 
Return this thread's contend semaphore. - Semaphore& lockSemaphore() const { return *lock_; } - //! Return this thread's resume semaphore. - Semaphore& suspendSemaphore() const { return *suspend_; } + //! Set this thread's affinity to the given cpu. + void setAffinity(uint cpu_id) const { Os::setThreadAffinity(handle_, cpu_id); } - //! Set this thread's affinity to the given cpu. - void setAffinity(uint cpu_id) const - { - Os::setThreadAffinity(handle_, cpu_id); - } + //! Set this thread's affinity to the given cpu mask. + void setAffinity(const Os::ThreadAffinityMask& mask) const { + Os::setThreadAffinity(handle_, mask); + } - //! Set this thread's affinity to the given cpu mask. - void setAffinity(const Os::ThreadAffinityMask& mask) const - { - Os::setThreadAffinity(handle_, mask); - } - - //! Yield to threads of the same priority of higher - static void yield() - { - Os::yield(); - } + //! Yield to threads of the same priority of higher + static void yield() { Os::yield(); } }; -class HostThread : public Thread -{ -private: - //! A HostThread does not have a run function - virtual void run(void* data) { ShouldNotCallThis(); } +class HostThread : public Thread { + private: + //! A HostThread does not have a run function + virtual void run(void* data) { ShouldNotCallThis(); } -public: - //! Construct a new HostThread - HostThread(); + public: + //! Construct a new HostThread + HostThread(); - //! Return true is this is the host thread. - bool isHostThread() const { return true; }; + //! Return true is this is the host thread. + bool isHostThread() const { return true; }; }; /*! 
@} @@ -192,40 +172,30 @@ namespace details { extern __thread Thread* thread_ __attribute__((tls_model("initial-exec"))); -static inline Thread* -currentThread() -{ - return thread_; -} +static inline Thread* currentThread() { return thread_; } #elif defined(_WIN32) #if defined(USE_DECLSPEC_THREAD) extern __declspec(thread) Thread* thread_; -#else // !USE_DECLSPEC_THREAD +#else // !USE_DECLSPEC_THREAD extern DWORD threadIndex_; -#endif // !USE_DECLSPEC_THREAD +#endif // !USE_DECLSPEC_THREAD -static inline Thread* -currentThread() -{ +static inline Thread* currentThread() { #if defined(USE_DECLSPEC_THREAD) - return thread_; -#else // !USE_DECLSPEC_THREAD - return (Thread*) TlsGetValue(threadIndex_); -#endif // !USE_DECLSPEC_THREAD + return thread_; +#else // !USE_DECLSPEC_THREAD + return (Thread*)TlsGetValue(threadIndex_); +#endif // !USE_DECLSPEC_THREAD } -#endif // _WIN32 +#endif // _WIN32 -} // namespace details +} // namespace details -inline Thread* -Thread::current() -{ - return details::currentThread(); -} +inline Thread* Thread::current() { return details::currentThread(); } -} // namespace amd +} // namespace amd #endif /*THREAD_HPP_*/ diff --git a/rocclr/runtime/top.hpp b/rocclr/runtime/top.hpp index 585682ccad..634dafcd54 100644 --- a/rocclr/runtime/top.hpp +++ b/rocclr/runtime/top.hpp @@ -6,14 +6,14 @@ #define TOP_HPP_ #if defined(ATI_ARCH_ARM) -# define __EXPORTED_HEADERS__ 1 +#define __EXPORTED_HEADERS__ 1 #endif /*ATI_ARCH_ARM*/ #ifdef _WIN32 // Disable unneeded features of for efficiency. 
-# define NODRAWTEXT 1 -# define NOMINMAX 1 -# define WIN32_LEAN_AND_MEAN 1 +#define NODRAWTEXT 1 +#define NOMINMAX 1 +#define WIN32_LEAN_AND_MEAN 1 #endif /*_WIN32*/ #include "utils/macros.hpp" @@ -21,8 +21,8 @@ #if defined(CL_VERSION_2_0) /* Deprecated in OpenCL 2.0 */ -# define CL_DEVICE_QUEUE_PROPERTIES 0x102A -# define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035 +#define CL_DEVICE_QUEUE_PROPERTIES 0x102A +#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035 #endif #if !defined(ATI_ARCH_ARM) @@ -35,56 +35,56 @@ typedef unsigned char* address; typedef const unsigned char* const_address; -typedef void * pointer; -typedef const void * const_pointer; +typedef void* pointer; +typedef const void* const_pointer; typedef unsigned int uint; typedef unsigned long ulong; typedef const char* cstring; #if defined(_WIN32) -# if defined(_WIN64) +#if defined(_WIN64) typedef __int64 ssize_t; -# else // !_WIN64 +#else // !_WIN64 typedef __int32 ssize_t; -# endif // !_WIN64 -#endif /*_WIN32*/ +#endif // !_WIN64 +#endif /*_WIN32*/ #ifdef _WIN32 -# define SIZE_T_FMT "%Iu" -# define PTR_FMT "0x%p" -# if _MSC_VER < 1900 -# define snprintf sprintf_s -# endif +#define SIZE_T_FMT "%Iu" +#define PTR_FMT "0x%p" +#if _MSC_VER < 1900 +#define snprintf sprintf_s +#endif #else /*!_WIN32*/ -# define SIZE_T_FMT "%zu" -# define PTR_FMT "%p" +#define SIZE_T_FMT "%zu" +#define PTR_FMT "%p" #endif /*!_WIN32*/ typedef uint32_t cl_mem_fence_flags; //! \cond ignore -#define _BAD_INT32 0xBAADBAAD -#define _BAD_INT64 0XBAADBAADBAADBAADLL -#define _BAD_INTPTR LP64_SWITCH(_BAD_INT32,_BAD_INT64) +#define _BAD_INT32 0xBAADBAAD +#define _BAD_INT64 0XBAADBAADBAADBAADLL +#define _BAD_INTPTR LP64_SWITCH(_BAD_INT32, _BAD_INT64) -const pointer badPointer = (pointer)(intptr_t) _BAD_INTPTR; -const address badAddress = (address)(intptr_t) _BAD_INTPTR; +const pointer badPointer = (pointer)(intptr_t)_BAD_INTPTR; +const address badAddress = (address)(intptr_t)_BAD_INTPTR; //! 
\endcond const size_t Ki = 1024; -const size_t Mi = Ki*Ki; -const size_t Gi = Ki*Ki*Ki; +const size_t Mi = Ki * Ki; +const size_t Gi = Ki * Ki * Ki; const size_t K = 1000; -const size_t M = K*K; -const size_t G = K*K*K; +const size_t M = K * K; +const size_t G = K * K * K; #include "utils/debug.hpp" //! \addtogroup Utils //! Namespace for AMD's OpenCL platform -namespace amd {/*@{*/ +namespace amd { /*@{*/ //! \brief The default Null object type (!= void*); struct Null {}; @@ -94,89 +94,88 @@ inline const Null null() { return Null(); } /*! \brief Equivalent to a namespace (All member functions are static). */ -class AllStatic -{ -WINDOWS_SWITCH(public,private): - AllStatic() { ShouldNotCallThis(); } - AllStatic(const AllStatic&) { ShouldNotCallThis(); } - ~AllStatic() { ShouldNotCallThis(); } +class AllStatic { + WINDOWS_SWITCH(public, private) : AllStatic() { ShouldNotCallThis(); } + AllStatic(const AllStatic&) { ShouldNotCallThis(); } + ~AllStatic() { ShouldNotCallThis(); } }; /*! \brief For embedded objects. */ -class EmbeddedObject -{ -WINDOWS_SWITCH(public,private): - void* operator new(size_t) { ShouldNotCallThis(); return badPointer; } - void operator delete(void *) { ShouldNotCallThis(); } +class EmbeddedObject { + WINDOWS_SWITCH(public, private) : void * operator new(size_t) { + ShouldNotCallThis(); + return badPointer; + } + void operator delete(void*) { ShouldNotCallThis(); } }; /*! \brief For stack allocated objects. */ -class StackObject -{ -WINDOWS_SWITCH(public,private): - void* operator new(size_t) { ShouldNotCallThis(); return badPointer; } - void operator delete(void *) { ShouldNotCallThis(); } +class StackObject { + WINDOWS_SWITCH(public, private) : void * operator new(size_t) { + ShouldNotCallThis(); + return badPointer; + } + void operator delete(void*) { ShouldNotCallThis(); } }; /*! \brief for objects allocated in a dedicate memory pool. 
- the standard 'new' should not be called, + the standard 'new' should not be called, only the in place version 'new (allocation_pointer) ()' , delete should only invoke the destructors and not release memory */ -class MemoryPoolObject -{ -public: - void* operator new(size_t) { ShouldNotCallThis(); return badPointer; } - void* operator new(size_t size,void * address) { return address; } - void operator delete(void *) { } - void operator delete( void *,void * address) { } +class MemoryPoolObject { + public: + void* operator new(size_t) { + ShouldNotCallThis(); + return badPointer; + } + void* operator new(size_t size, void* address) { return address; } + void operator delete(void*) {} + void operator delete(void*, void* address) {} }; /*! \brief For objects allocated on the C-heap. */ -class HeapObject -{ -public: - void* operator new(size_t size); - void operator delete(void* obj); - void* operator new(size_t size, size_t extSize) - { return HeapObject::operator new (size + extSize); }; - void operator delete(void* obj, size_t extSize) - { HeapObject::operator delete (obj); } +class HeapObject { + public: + void* operator new(size_t size); + void operator delete(void* obj); + void* operator new(size_t size, size_t extSize) { + return HeapObject::operator new(size + extSize); + }; + void operator delete(void* obj, size_t extSize) { HeapObject::operator delete(obj); } }; /*! \brief For all reference counted objects. 
*/ -class ReferenceCountedObject -{ - volatile uint referenceCount_; +class ReferenceCountedObject { + volatile uint referenceCount_; -protected: - virtual ~ReferenceCountedObject() { } - virtual bool terminate() { return true; } + protected: + virtual ~ReferenceCountedObject() {} + virtual bool terminate() { return true; } -public: - ReferenceCountedObject() : referenceCount_(1) { } + public: + ReferenceCountedObject() : referenceCount_(1) {} - void* operator new(size_t size) { return ::operator new(size); } - void operator delete(void* p) { return ::operator delete(p); } - void* operator new(size_t size, size_t extSize) - { return ReferenceCountedObject::operator new (size + extSize); }; - void operator delete(void* obj, size_t extSize) - { ReferenceCountedObject::operator delete (obj); } + void* operator new(size_t size) { return ::operator new(size); } + void operator delete(void* p) { return ::operator delete(p); } + void* operator new(size_t size, size_t extSize) { + return ReferenceCountedObject::operator new(size + extSize); + }; + void operator delete(void* obj, size_t extSize) { ReferenceCountedObject::operator delete(obj); } - uint referenceCount() const { return referenceCount_; } + uint referenceCount() const { return referenceCount_; } - uint retain(); - uint release(); + uint retain(); + uint release(); }; /*@}*/} // namespace amd -#undef min // using std::min -#undef max // using std::max +#undef min // using std::min +#undef max // using std::max #endif /*TOP_HPP_*/ - diff --git a/rocclr/runtime/utils/concurrent.hpp b/rocclr/runtime/utils/concurrent.hpp index 679291b1b8..5a64c01d97 100644 --- a/rocclr/runtime/utils/concurrent.hpp +++ b/rocclr/runtime/utils/concurrent.hpp @@ -13,43 +13,33 @@ //! 
\addtogroup Utils -namespace amd {/*@{*/ +namespace amd { /*@{*/ namespace details { -template -struct TaggedPointerHelper -{ - static const uintptr_t TagMask = (1u << N) - 1; +template struct TaggedPointerHelper { + static const uintptr_t TagMask = (1u << N) - 1; -private: - TaggedPointerHelper(); // Cannot instantiate - void* operator new(size_t); // allocate or - void operator delete(void*); // delete a TaggedPointerHelper. + private: + TaggedPointerHelper(); // Cannot instantiate + void* operator new(size_t); // allocate or + void operator delete(void*); // delete a TaggedPointerHelper. -public: - //! Create a tagged pointer. - static TaggedPointerHelper* make(T* ptr, size_t tag) - { - return reinterpret_cast( - (reinterpret_cast(ptr) & ~TagMask) | (tag & TagMask)); - } + public: + //! Create a tagged pointer. + static TaggedPointerHelper* make(T* ptr, size_t tag) { + return reinterpret_cast((reinterpret_cast(ptr) & ~TagMask) | + (tag & TagMask)); + } - //! Return the pointer value. - T* ptr() - { - return reinterpret_cast( - reinterpret_cast(this) & ~TagMask); - } + //! Return the pointer value. + T* ptr() { return reinterpret_cast(reinterpret_cast(this) & ~TagMask); } - //! Return the tag value. - size_t tag() const - { - return reinterpret_cast(this) & TagMask; - } + //! Return the tag value. + size_t tag() const { return reinterpret_cast(this) & TagMask; } }; -} // namespace details +} // namespace details /*! \brief An unbounded thread-safe queue. * @@ -60,153 +50,125 @@ public: * FIXME_lmoriche: Implement the new/delete operators for SimplyLinkedNode * using thread-local allocation buffers. */ -template -class ConcurrentLinkedQueue : public HeapObject -{ - //! A simply-linked node - struct Node - { - typedef details::TaggedPointerHelper TaggedPointerHelper; - typedef TaggedPointerHelper* Ptr; +template class ConcurrentLinkedQueue : public HeapObject { + //! 
A simply-linked node + struct Node { + typedef details::TaggedPointerHelper TaggedPointerHelper; + typedef TaggedPointerHelper* Ptr; - T value_; //!< The value stored in that node. - std::atomic next_; //!< Pointer to the next node + T value_; //!< The value stored in that node. + std::atomic next_; //!< Pointer to the next node - //! Create a Node::Ptr - static inline Ptr ptr(Node* ptr, size_t counter = 0) - { - return TaggedPointerHelper::make(ptr, counter); - } - }; - -private: - std::atomic head_; //! Pointer to the oldest element. - std::atomic tail_; //! Pointer to the most recent element. - -private: - //! \brief Allocate a free node. - static inline Node* allocNode() - { - return new(AlignedMemory::allocate(sizeof(Node), 1 << N)) Node(); + //! Create a Node::Ptr + static inline Ptr ptr(Node* ptr, size_t counter = 0) { + return TaggedPointerHelper::make(ptr, counter); } + }; - //! \brief Return a node to the free list. - static inline void reclaimNode(Node* node) - { - AlignedMemory::deallocate(node); - } + private: + std::atomic head_; //! Pointer to the oldest element. + std::atomic tail_; //! Pointer to the most recent element. -public: - //! \brief Initialize a new concurrent linked queue. - ConcurrentLinkedQueue(); + private: + //! \brief Allocate a free node. + static inline Node* allocNode() { + return new (AlignedMemory::allocate(sizeof(Node), 1 << N)) Node(); + } - //! \brief Destroy this concurrent linked queue. - ~ConcurrentLinkedQueue(); + //! \brief Return a node to the free list. + static inline void reclaimNode(Node* node) { AlignedMemory::deallocate(node); } - //! \brief Enqueue an element to this queue. - inline void enqueue(T elem); + public: + //! \brief Initialize a new concurrent linked queue. + ConcurrentLinkedQueue(); - //! \brief Dequeue an element from this queue. - inline T dequeue(); + //! \brief Destroy this concurrent linked queue. + ~ConcurrentLinkedQueue(); + + //! \brief Enqueue an element to this queue. 
+ inline void enqueue(T elem); + + //! \brief Dequeue an element from this queue. + inline T dequeue(); }; /*@}*/ -template -inline -ConcurrentLinkedQueue::ConcurrentLinkedQueue() -{ - // Create the first "dummy" node. - Node* dummy = allocNode(); - dummy->next_ = NULL; - DEBUG_ONLY(dummy->value_ = NULL); +template inline ConcurrentLinkedQueue::ConcurrentLinkedQueue() { + // Create the first "dummy" node. + Node* dummy = allocNode(); + dummy->next_ = NULL; + DEBUG_ONLY(dummy->value_ = NULL); - // Head and tail should now point to it (empty list). - head_ = tail_ = Node::ptr(dummy); + // Head and tail should now point to it (empty list). + head_ = tail_ = Node::ptr(dummy); - // Make sure the instance is fully initialized before it becomes - // globally visible. - std::atomic_thread_fence(std::memory_order_release); + // Make sure the instance is fully initialized before it becomes + // globally visible. + std::atomic_thread_fence(std::memory_order_release); } -template -inline -ConcurrentLinkedQueue::~ConcurrentLinkedQueue() -{ - typename Node::Ptr head = head_; - typename Node::Ptr tail = tail_; - while (head->ptr() != tail->ptr()) { - Node* node = head->ptr(); - head = head->ptr()->next_; - reclaimNode(node); - } - reclaimNode(head->ptr()); +template inline ConcurrentLinkedQueue::~ConcurrentLinkedQueue() { + typename Node::Ptr head = head_; + typename Node::Ptr tail = tail_; + while (head->ptr() != tail->ptr()) { + Node* node = head->ptr(); + head = head->ptr()->next_; + reclaimNode(node); + } + reclaimNode(head->ptr()); } -template -inline void -ConcurrentLinkedQueue::enqueue(T elem) -{ - Node* node = allocNode(); - node->value_ = elem; - node->next_ = NULL; +template inline void ConcurrentLinkedQueue::enqueue(T elem) { + Node* node = allocNode(); + node->value_ = elem; + node->next_ = NULL; - for (;;) { - typename Node::Ptr tail = tail_.load(std::memory_order_acquire); - typename Node::Ptr next = - tail->ptr()->next_.load(std::memory_order_acquire); - if 
(likely(tail == tail_.load(std::memory_order_acquire))) { - if (next->ptr() == NULL) { - if (tail->ptr()->next_.compare_exchange_weak( - next, Node::ptr(node, next->tag()+1), - std::memory_order_acq_rel, std::memory_order_acquire)) { - tail_.compare_exchange_strong( - tail, Node::ptr(node, tail->tag()+1), - std::memory_order_acq_rel, std::memory_order_acquire); - return; - } - } - else { - tail_.compare_exchange_strong( - tail, Node::ptr(next->ptr(), tail->tag()+1), - std::memory_order_acq_rel, std::memory_order_acquire); - } + for (;;) { + typename Node::Ptr tail = tail_.load(std::memory_order_acquire); + typename Node::Ptr next = tail->ptr()->next_.load(std::memory_order_acquire); + if (likely(tail == tail_.load(std::memory_order_acquire))) { + if (next->ptr() == NULL) { + if (tail->ptr()->next_.compare_exchange_weak(next, Node::ptr(node, next->tag() + 1), + std::memory_order_acq_rel, + std::memory_order_acquire)) { + tail_.compare_exchange_strong(tail, Node::ptr(node, tail->tag() + 1), + std::memory_order_acq_rel, std::memory_order_acquire); + return; } + } else { + tail_.compare_exchange_strong(tail, Node::ptr(next->ptr(), tail->tag() + 1), + std::memory_order_acq_rel, std::memory_order_acquire); + } } + } } -template -inline T -ConcurrentLinkedQueue::dequeue() -{ - for (;;) { - typename Node::Ptr head = head_.load(std::memory_order_acquire); - typename Node::Ptr tail = tail_.load(std::memory_order_acquire); - typename Node::Ptr next = - head->ptr()->next_.load(std::memory_order_acquire); - if (likely(head == head_.load(std::memory_order_acquire))) { - if (head->ptr() == tail->ptr()) { - if (next->ptr() == NULL) { - return NULL; - } - tail_.compare_exchange_strong( - tail, Node::ptr(next->ptr(), tail->tag()+1), - std::memory_order_acq_rel, std::memory_order_acquire); - } - else { - T value = next->ptr()->value_; - if (head_.compare_exchange_weak( - head, Node::ptr(next->ptr(), head->tag()+1), - std::memory_order_acq_rel, std::memory_order_acquire)) { - // we 
can reclaim head now - reclaimNode(head->ptr()); - return value; - } - } +template inline T ConcurrentLinkedQueue::dequeue() { + for (;;) { + typename Node::Ptr head = head_.load(std::memory_order_acquire); + typename Node::Ptr tail = tail_.load(std::memory_order_acquire); + typename Node::Ptr next = head->ptr()->next_.load(std::memory_order_acquire); + if (likely(head == head_.load(std::memory_order_acquire))) { + if (head->ptr() == tail->ptr()) { + if (next->ptr() == NULL) { + return NULL; } + tail_.compare_exchange_strong(tail, Node::ptr(next->ptr(), tail->tag() + 1), + std::memory_order_acq_rel, std::memory_order_acquire); + } else { + T value = next->ptr()->value_; + if (head_.compare_exchange_weak(head, Node::ptr(next->ptr(), head->tag() + 1), + std::memory_order_acq_rel, std::memory_order_acquire)) { + // we can reclaim head now + reclaimNode(head->ptr()); + return value; + } + } } + } } -} // namespace amd +} // namespace amd #endif /*CONCURRENT_HPP_*/ diff --git a/rocclr/runtime/utils/debug.cpp b/rocclr/runtime/utils/debug.cpp index c8a4ada784..4dd502070f 100644 --- a/rocclr/runtime/utils/debug.cpp +++ b/rocclr/runtime/utils/debug.cpp @@ -7,7 +7,7 @@ #include "os/os.hpp" #if !defined(LOG_LEVEL) -# include "utils/flags.hpp" +#include "utils/flags.hpp" #endif #include @@ -16,77 +16,63 @@ #ifdef _WIN32 #include -#endif // _WIN32 +#endif // _WIN32 namespace amd { //! \cond ignore -extern "C" void -breakpoint(void) -{ +extern "C" void breakpoint(void) { #ifdef _MSC_VER - DebugBreak(); -#endif // _MSC_VER + DebugBreak(); +#endif // _MSC_VER } //! 
\endcond -void -report_fatal(const char* file, int line, const char* message) -{ - // FIXME_lmoriche: Obfuscate the message string - fprintf(stderr, "%s:%d: %s\n", file, line, message); - ::abort(); +void report_fatal(const char* file, int line, const char* message) { + // FIXME_lmoriche: Obfuscate the message string + fprintf(stderr, "%s:%d: %s\n", file, line, message); + ::abort(); } -void -report_warning(const char* message) -{ - fprintf(stderr, "Warning: %s\n", message); +void report_warning(const char* message) { fprintf(stderr, "Warning: %s\n", message); } + +void log_entry(LogLevel level, const char* file, int line, const char* message) { + if (level == LOG_NONE) { + return; + } + fprintf(stderr, ":%d:%s:%d: %s\n", level, file, line, message); } -void -log_entry(LogLevel level, const char* file, int line, const char* message) -{ - if (level == LOG_NONE) { - return; - } - fprintf(stderr, ":%d:%s:%d: %s\n", level, file, line, message); -} +void log_timestamped(LogLevel level, const char* file, int line, const char* message) { + static bool gotstart = false; // not thread-safe, but not scary if fails + static uint64_t start; -void -log_timestamped(LogLevel level, const char* file, int line, const char* message) -{ - static bool gotstart = false; // not thread-safe, but not scary if fails - static uint64_t start; + if (!gotstart) { + start = Os::timeNanos(); + gotstart = true; + } - if (!gotstart) { - start = Os::timeNanos(); - gotstart = true; - } - - uint64_t time = Os::timeNanos() - start; - if (level == LOG_NONE) { - return; - } + uint64_t time = Os::timeNanos() - start; + if (level == LOG_NONE) { + return; + } #if 0 fprintf(stderr, ":%d:%s:%d: (%010lld) %s\n", level, file, line, time, message); -#else // if you prefer fixed-width fields - fprintf(stderr, ":% 2d:%15s:% 5d: (%010lld) %s\n", - level, file, line, time/100ULL, message); // timestamp is 100ns units +#else // if you prefer fixed-width fields + fprintf(stderr, ":% 2d:%15s:% 5d: (%010lld) %s\n", 
level, file, line, time / 100ULL, + message); // timestamp is 100ns units #endif } -void -log_printf(LogLevel level, const char* file, int line, const char* format, ...) -{ - va_list ap; +void log_printf(LogLevel level, const char* file, int line, const char* format, ...) { + va_list ap; - va_start(ap, format); - char message[1024]; - vsprintf(message, format, ap); - va_end(ap); + va_start(ap, format); + char message[1024]; + vsprintf(message, format, ap); + va_end(ap); - fprintf(stderr, ":%d:%s:%d: %s\n", level, file, line, message); + fprintf(stderr, ":%d:%s:%d: %s\n", level, file, line, message); } -} // namespace amd +} // namespace amd diff --git a/rocclr/runtime/utils/debug.hpp b/rocclr/runtime/utils/debug.hpp index cb7dbb7b99..c018d92752 100644 --- a/rocclr/runtime/utils/debug.hpp +++ b/rocclr/runtime/utils/debug.hpp @@ -10,45 +10,28 @@ //! \addtogroup Utils -namespace amd {/*@{*/ +namespace amd { /*@{*/ -enum LogLevel { - LOG_NONE = 0, - LOG_ERROR = 1, - LOG_WARNING = 2, - LOG_INFO = 3, - LOG_DEBUG = 4 -}; +enum LogLevel { LOG_NONE = 0, LOG_ERROR = 1, LOG_WARNING = 2, LOG_INFO = 3, LOG_DEBUG = 4 }; //! \cond ignore -extern "C" void -breakpoint(); +extern "C" void breakpoint(); //! \endcond //! \brief Report a Fatal exception message and abort. -extern void -report_fatal(const char* file, int line, const char* message); +extern void report_fatal(const char* file, int line, const char* message); //! \brief Display a warning message. -extern void -report_warning(const char* message); +extern void report_warning(const char* message); //! \brief Insert a log entry. -extern void -log_entry(LogLevel level, const char* file, int line, const char* messsage); +extern void log_entry(LogLevel level, const char* file, int line, const char* messsage); //! \brief Insert a timestamped log entry. 
-extern void -log_timestamped(LogLevel level, const char* file, int line, const char* messsage); +extern void log_timestamped(LogLevel level, const char* file, int line, const char* messsage); //! \brief Insert a printf-style log entry. -extern void -log_printf( - LogLevel level, - const char* file, - int line, - const char* format, - ...); +extern void log_printf(LogLevel level, const char* file, int line, const char* format, ...); /*@}*/} // namespace amd @@ -57,31 +40,28 @@ log_printf( // Disable ICC's warning #279: controlling expression is constant // (0!=1 && "msg") // ^ -#pragma warning ( disable : 279 ) +#pragma warning(disable : 279) -#endif // __INTEL_COMPILER +#endif // __INTEL_COMPILER //! \brief Abort the program if the invariant \a cond is false. -#define guarantee(cond) \ - if (!(cond)) \ - { \ - amd::report_fatal(__FILE__, __LINE__, \ - "guarantee(" XSTR(cond) ")"); \ - amd::breakpoint(); \ - } +#define guarantee(cond) \ + if (!(cond)) { \ + amd::report_fatal(__FILE__, __LINE__, "guarantee(" XSTR(cond) ")"); \ + amd::breakpoint(); \ + } #define fixme_guarantee(cond) guarantee(cond) //! \brief Abort the program with a fatal error message. -#define fatal(msg) do { assert(false && msg); } while (0) +#define fatal(msg) \ + do { \ + assert(false && msg); \ + } while (0) //! \brief Display a warning message. -inline void -warning(const char* msg) -{ - amd::report_warning(msg); -} +inline void warning(const char* msg) { amd::report_warning(msg); } /*! \brief Abort the program with a "ShouldNotReachHere" message. 
* \hideinitializer @@ -102,66 +82,62 @@ warning(const char* msg) * \hideinitializer */ #ifndef NDEBUG -# define Untested(msg) \ - warning("Untested(\"" msg "\")") +#define Untested(msg) warning("Untested(\"" msg "\")") #else /*NDEBUG*/ -# define Untested(msg) (void)(0) +#define Untested(msg) (void)(0) #endif /*NDEBUG*/ #ifdef DEBUG -# define Log(level,msg) \ -do \ -{ \ - if (LOG_LEVEL >= level) { \ - amd::log_entry(level, __FILE__, __LINE__, msg); \ - } \ -} while (false) -#else // !DEBUG -# define Log(level,msg) (void)(0) -#endif // !DEBUG +#define Log(level, msg) \ + do { \ + if (LOG_LEVEL >= level) { \ + amd::log_entry(level, __FILE__, __LINE__, msg); \ + } \ + } while (false) +#else // !DEBUG +#define Log(level, msg) (void)(0) +#endif // !DEBUG #ifdef DEBUG -# define LogTS(level,msg) \ -do \ -{ \ - if (LOG_LEVEL >= level) { \ - amd::log_timestamped(level, __FILE__, __LINE__, msg); \ - } \ -} while (false) -#else // !DEBUG -# define Log(level,msg) (void)(0) -#endif // !DEBUG +#define LogTS(level, msg) \ + do { \ + if (LOG_LEVEL >= level) { \ + amd::log_timestamped(level, __FILE__, __LINE__, msg); \ + } \ + } while (false) +#else // !DEBUG +#define Log(level, msg) (void)(0) +#endif // !DEBUG #ifdef DEBUG -# define Logf(level, format, ...) \ -do \ -{ \ - if (LOG_LEVEL >= level) { \ - amd::log_printf(level, __FILE__, __LINE__, format, __VA_ARGS__); \ - } \ -} while (false) -#else // !DEBUG -# define Logf(level, format, ...) (void)(0) -#endif // !DEBUG +#define Logf(level, format, ...) \ + do { \ + if (LOG_LEVEL >= level) { \ + amd::log_printf(level, __FILE__, __LINE__, format, __VA_ARGS__); \ + } \ + } while (false) +#else // !DEBUG +#define Logf(level, format, ...) 
(void)(0) +#endif // !DEBUG -#define CondLog(cond,msg) \ -do { \ - if (false DEBUG_ONLY(|| (cond))) { \ - Log(amd::LOG_INFO,msg); \ - } \ -} while (false) +#define CondLog(cond, msg) \ + do { \ + if (false DEBUG_ONLY(|| (cond))) { \ + Log(amd::LOG_INFO, msg); \ + } \ + } while (false) -#define LogInfo(msg) Log(amd::LOG_INFO,msg) -#define LogError(msg) Log(amd::LOG_ERROR,msg) -#define LogWarning(msg) Log(amd::LOG_WARNING,msg) +#define LogInfo(msg) Log(amd::LOG_INFO, msg) +#define LogError(msg) Log(amd::LOG_ERROR, msg) +#define LogWarning(msg) Log(amd::LOG_WARNING, msg) -#define LogTSInfo(msg) LogTS(amd::LOG_INFO,msg) -#define LogTSError(msg) LogTS(amd::LOG_ERROR,msg) -#define LogTSWarning(msg) LogTS(amd::LOG_WARNING,msg) +#define LogTSInfo(msg) LogTS(amd::LOG_INFO, msg) +#define LogTSError(msg) LogTS(amd::LOG_ERROR, msg) +#define LogTSWarning(msg) LogTS(amd::LOG_WARNING, msg) -#define LogPrintfDebug(format, ...) Logf(amd::LOG_DEBUG, format, __VA_ARGS__) -#define LogPrintfError(format, ...) Logf(amd::LOG_ERROR, format, __VA_ARGS__) -#define LogPrintfWarning(format, ...) Logf(amd::LOG_WARNING, format, __VA_ARGS__) -#define LogPrintfInfo(format, ...) Logf(amd::LOG_INFO, format, __VA_ARGS__) +#define LogPrintfDebug(format, ...) Logf(amd::LOG_DEBUG, format, __VA_ARGS__) +#define LogPrintfError(format, ...) Logf(amd::LOG_ERROR, format, __VA_ARGS__) +#define LogPrintfWarning(format, ...) Logf(amd::LOG_WARNING, format, __VA_ARGS__) +#define LogPrintfInfo(format, ...) 
Logf(amd::LOG_INFO, format, __VA_ARGS__) #endif /*DEBUG_HPP_*/ diff --git a/rocclr/runtime/utils/flags.cpp b/rocclr/runtime/utils/flags.cpp index 664dbe9c97..be0526414d 100644 --- a/rocclr/runtime/utils/flags.cpp +++ b/rocclr/runtime/utils/flags.cpp @@ -15,183 +15,165 @@ #include #else #include -#endif // !_WIN32 +#endif // !_WIN32 namespace { -const char* removeQuotes(const char* Value) -{ - const char *b, *e, *p; - if (Value == NULL) { - return Value; - } +const char* removeQuotes(const char* Value) { + const char *b, *e, *p; + if (Value == NULL) { + return Value; + } - // skip the leading blank - for (p = Value; *p == ' '; ++p); - if (*p != '"') { - return Value; - } - b = p; - e = NULL; - for (++p; *p != '\0'; ++p) { - if (*p == '"') { - // e points to the last '"' - e = p; - } - else if ((e != NULL) && (*p != ' ')) { - // e isn't last '"' if there is any non-blank following e - e = NULL; - } + // skip the leading blank + for (p = Value; *p == ' '; ++p) + ; + if (*p != '"') { + return Value; + } + b = p; + e = NULL; + for (++p; *p != '\0'; ++p) { + if (*p == '"') { + // e points to the last '"' + e = p; + } else if ((e != NULL) && (*p != ' ')) { + // e isn't last '"' if there is any non-blank following e + e = NULL; } + } - if (e == NULL) { - return Value; - } - // Found a valid quoted string "" with b=1st '"' and e=the last '"' - size_t len = (e - b - 1) > 0 ? (e - b - 1) : 0; + if (e == NULL) { + return Value; + } + // Found a valid quoted string "" with b=1st '"' and e=the last '"' + size_t len = (e - b - 1) > 0 ? 
(e - b - 1) : 0; #ifdef _WIN32 - char* p1 = _strdup(b+1); - p1[len] = '\0'; - p = p1; + char* p1 = _strdup(b + 1); + p1[len] = '\0'; + p = p1; #else - p = strndup(b+1, len); + p = strndup(b + 1, len); #endif - return p; + return p; } - } namespace amd { #ifdef __APPLE__ #include -#endif // __APPLE__ +#endif // __APPLE__ -//static +// static char* Flag::envstr_; -void Flag::tearDown() -{ - +void Flag::tearDown() { #ifdef _WIN32 - FreeEnvironmentStringsA(envstr_); + FreeEnvironmentStringsA(envstr_); #endif - } -bool -Flag::init() -{ - typedef std::map vars_type; - vars_type vars; +bool Flag::init() { + typedef std::map vars_type; + vars_type vars; #ifdef _WIN32 - char* str = GetEnvironmentStringsA(); - envstr_ = str; + char* str = GetEnvironmentStringsA(); + envstr_ = str; - for (; *str != '\0'; str += strlen(str) + 1) { - // For all environment variables: - std::string var = str; - size_t pos = var.find('='); - if ((pos == std::string::npos) || ((pos + 1) >= var.size())) { - continue; - } - - std::string name = var.substr(0, pos); - vars.insert(std::make_pair(name, &str[pos+1])); + for (; *str != '\0'; str += strlen(str) + 1) { + // For all environment variables: + std::string var = str; + size_t pos = var.find('='); + if ((pos == std::string::npos) || ((pos + 1) >= var.size())) { + continue; } -#else // !_WIN32 + + std::string name = var.substr(0, pos); + vars.insert(std::make_pair(name, &str[pos + 1])); + } +#else // !_WIN32 #ifdef __APPLE__ - char** environ = *_NSGetEnviron(); - if (environ == NULL) { return false; } -#endif // __APPLE__ + char** environ = *_NSGetEnviron(); + if (environ == NULL) { + return false; + } +#endif // __APPLE__ - for (const char** p = const_cast(environ);*p != NULL; ++p) { - std::string var = *p; - size_t pos = var.find('='); - if ((pos == std::string::npos) || ((pos + 1) >= var.size())) { - continue; - } - - std::string name = var.substr(0, pos); - vars.insert(std::make_pair(name, &(*p)[pos+1])); - } -#endif // !_WIN32 - - for (size_t 
i = 0; i < numFlags_; ++i) { - Flag& flag = flags_[i]; - - vars_type::iterator it = vars.find(flag.name_); - if (it != vars.end()) { - flag.setValue(it->second); - } + for (const char** p = const_cast(environ); *p != NULL; ++p) { + std::string var = *p; + size_t pos = var.find('='); + if ((pos == std::string::npos) || ((pos + 1) >= var.size())) { + continue; } - return true; + std::string name = var.substr(0, pos); + vars.insert(std::make_pair(name, &(*p)[pos + 1])); + } +#endif // !_WIN32 + + for (size_t i = 0; i < numFlags_; ++i) { + Flag& flag = flags_[i]; + + vars_type::iterator it = vars.find(flag.name_); + if (it != vars.end()) { + flag.setValue(it->second); + } + } + + return true; } -bool -Flag::setValue(const char* value) -{ - if (value_ == NULL) { - return false; // flag is constant. - } +bool Flag::setValue(const char* value) { + if (value_ == NULL) { + return false; // flag is constant. + } - isDefault_ = false; + isDefault_ = false; - switch (type_) { + switch (type_) { case Tbool: - *(bool*) value_ = (strcmp(value, "true") == 0 || atoi(value) != 0) ? true : false; - return true; + *(bool*)value_ = (strcmp(value, "true") == 0 || atoi(value) != 0) ? 
true : false; + return true; case Tint: case Tuint: - *(int*) value_ = atoi(value); - return true; + *(int*)value_ = atoi(value); + return true; case Tsize_t: - *(size_t*) value_ = atol(value); - return true; + *(size_t*)value_ = atol(value); + return true; case Tcstring: - *(const char**) value_ = removeQuotes(value); - return true; + *(const char**)value_ = removeQuotes(value); + return true; - default: break; - } - ShouldNotReachHere(); - return false; + default: + break; + } + ShouldNotReachHere(); + return false; } -#define DEFINE_RELEASE_FLAG_STRUCT(type, name, value, help) \ -{ #name, &name, T##type, true }, -#define DEFINE_DEBUG_FLAG_STRUCT(type, name, value, help) \ -{ #name, RELEASE_ONLY(NULL) DEBUG_ONLY(&name), T##type, true }, +#define DEFINE_RELEASE_FLAG_STRUCT(type, name, value, help) {#name, &name, T##type, true}, +#define DEFINE_DEBUG_FLAG_STRUCT(type, name, value, help) \ + {#name, RELEASE_ONLY(NULL) DEBUG_ONLY(&name), T##type, true}, -Flag -Flag::flags_[] = -{ - RUNTIME_FLAGS( \ - DEFINE_DEBUG_FLAG_STRUCT, \ - DEFINE_RELEASE_FLAG_STRUCT, \ - DEFINE_DEBUG_FLAG_STRUCT ) - { NULL, NULL, Tinvalid, true } -}; +Flag Flag::flags_[] = { + RUNTIME_FLAGS(DEFINE_DEBUG_FLAG_STRUCT, DEFINE_RELEASE_FLAG_STRUCT, DEFINE_DEBUG_FLAG_STRUCT) + {NULL, NULL, Tinvalid, true}}; #undef DEFINE_DEBUG_FLAG_STRUCT #undef DEFINE_RELEASE_FLAG_STRUCT -} // namespace amd +} // namespace amd #define DEFINE_RELEASE_FLAG_VALUE(type, name, value, help) type name = value; -#define DEFINE_DEBUG_FLAG_VALUE(type, name, value, help) \ - DEBUG_ONLY(type name = value); +#define DEFINE_DEBUG_FLAG_VALUE(type, name, value, help) DEBUG_ONLY(type name = value); -RUNTIME_FLAGS( \ - DEFINE_DEBUG_FLAG_VALUE, \ - DEFINE_RELEASE_FLAG_VALUE, \ - DEFINE_DEBUG_FLAG_VALUE ); +RUNTIME_FLAGS(DEFINE_DEBUG_FLAG_VALUE, DEFINE_RELEASE_FLAG_VALUE, DEFINE_DEBUG_FLAG_VALUE); #undef DEFINE_DEBUG_FLAG_VALUE #undef DEFINE_RELEASE_FLAG_VALUE - diff --git a/rocclr/runtime/utils/flags.hpp 
b/rocclr/runtime/utils/flags.hpp index 145cbcba48..ad6e938979 100644 --- a/rocclr/runtime/utils/flags.hpp +++ b/rocclr/runtime/utils/flags.hpp @@ -213,62 +213,57 @@ namespace amd { //! \addtogroup Utils // @{ -struct Flag -{ - enum Type - { - Tinvalid = 0, - Tbool, //!< A boolean type flag (true, false). - Tint, //!< An integer type flag (signed). - Tuint, //!< An integer type flag (unsigned). - Tsize_t, //!< A size_t type flag. - Tcstring //!< A string type flag. - }; +struct Flag { + enum Type { + Tinvalid = 0, + Tbool, //!< A boolean type flag (true, false). + Tint, //!< An integer type flag (signed). + Tuint, //!< An integer type flag (unsigned). + Tsize_t, //!< A size_t type flag. + Tcstring //!< A string type flag. + }; #define DEFINE_FLAG_NAME(type, name, value, help) k##name, - enum Name - { - RUNTIME_FLAGS(DEFINE_FLAG_NAME, DEFINE_FLAG_NAME, DEFINE_FLAG_NAME) - numFlags_ - }; + enum Name { + RUNTIME_FLAGS(DEFINE_FLAG_NAME, DEFINE_FLAG_NAME, DEFINE_FLAG_NAME) + numFlags_ + }; #undef DEFINE_FLAG_NAME #define CAN_SET(type, name, v, h) static const bool cannotSet##name = false; #define CANNOT_SET(type, name, v, h) static const bool cannotSet##name = true; #ifdef DEBUG - RUNTIME_FLAGS(CAN_SET, CAN_SET, CAN_SET) + RUNTIME_FLAGS(CAN_SET, CAN_SET, CAN_SET) #else // !DEBUG - RUNTIME_FLAGS(CANNOT_SET, CAN_SET, CANNOT_SET) + RUNTIME_FLAGS(CANNOT_SET, CAN_SET, CANNOT_SET) #endif // !DEBUG #undef CAN_SET #undef CANNOT_SET -private: + private: + static Flag flags_[]; - static Flag flags_[]; + public: + static char* envstr_; + const char* name_; + const void* value_; + Type type_; + bool isDefault_; -public: - static char* envstr_; - const char* name_; - const void* value_; - Type type_; - bool isDefault_; + public: + static bool init(); -public: + static void tearDown(); - static bool init(); + bool setValue(const char* value); - static void tearDown(); - - bool setValue(const char* value); - - static bool isDefault(Name name) { return flags_[name].isDefault_; } + 
static bool isDefault(Name name) { return flags_[name].isDefault_; } }; #define flagIsDefault(name) \ - (amd::Flag::cannotSet##name || amd::Flag::isDefault(amd::Flag::k##name)) + (amd::Flag::cannotSet##name || amd::Flag::isDefault(amd::Flag::k##name)) #define setIfNotDefault(var, opt, other) \ if (!flagIsDefault(opt)) \ diff --git a/rocclr/runtime/utils/macros.hpp b/rocclr/runtime/utils/macros.hpp index 7ea54b6d05..b75f5be7de 100644 --- a/rocclr/runtime/utils/macros.hpp +++ b/rocclr/runtime/utils/macros.hpp @@ -6,182 +6,178 @@ #define MACROS_HPP_ #ifndef OPENCL_EXPORTS -# define OPENCL_EXPORTS 1 -#endif // OPENCL_EXPORTS +#define OPENCL_EXPORTS 1 +#endif // OPENCL_EXPORTS #if defined(NDEBUG) -# define RELEASE 1 -#else// !NDEBUG -# define ASSERT 1 -# define DEBUG 1 -#endif // !NDEBUG +#define RELEASE 1 +#else // !NDEBUG +#define ASSERT 1 +#define DEBUG 1 +#endif // !NDEBUG #if defined(_WIN64) && !defined(_LP64) -# define _LP64 1 +#define _LP64 1 #endif #if defined(_DEBUG) && !defined(DEBUG) -# define DEBUG 1 -#endif // _DEBUG && !DEBUG +#define DEBUG 1 +#endif // _DEBUG && !DEBUG #if defined(DEBUG) && defined(RELEASE) -# error "Build Error: cannot have both -DDEBUG and -DRELEASE" +#error "Build Error: cannot have both -DDEBUG and -DRELEASE" #endif /*DEBUG && RELEASE*/ #if !defined(DEBUG) && !defined(RELEASE) -# error "Build Error: must have either -DDEBUG or -DRELEASE" +#error "Build Error: must have either -DDEBUG or -DRELEASE" #endif /*DEBUG && RELEASE*/ #ifdef DEBUG -# define DEBUG_ONLY(x) x -# define RELEASE_ONLY(x) -# define IS_DEBUG true -#else // !DEBUG -# define DEBUG_ONLY(x) -# define RELEASE_ONLY(x) x -# define IS_DEBUG false +#define DEBUG_ONLY(x) x +#define RELEASE_ONLY(x) +#define IS_DEBUG true +#else // !DEBUG +#define DEBUG_ONLY(x) +#define RELEASE_ONLY(x) x +#define IS_DEBUG false #endif /*!DEBUG*/ -#define DEBUG_SWITCH(d,r) DEBUG_ONLY(d)RELEASE_ONLY(r) -#define RELEASE_SWITCH(r,d) RELEASE_ONLY(r)DEBUG_ONLY(d) +#define DEBUG_SWITCH(d, r) 
DEBUG_ONLY(d) RELEASE_ONLY(r) +#define RELEASE_SWITCH(r, d) RELEASE_ONLY(r) DEBUG_ONLY(d) //! \brief Make a c-string of __macro__ #define STR(__macro__) #__macro__ //! \brief Make a c-string of the expansion of __macro__ #define XSTR(__macro__) STR(__macro__) //! \brief Concatenate 2 symbols -#define CONCAT(a,b) a##b -#define XCONCAT(a,b) CONCAT(a,b) +#define CONCAT(a, b) a##b +#define XCONCAT(a, b) CONCAT(a, b) //! \cond ignore #ifdef _LP64 -# define LP64_ONLY(x) x -# define NOT_LP64(x) -#else // !_LP64 -# define LP64_ONLY(x) -# define NOT_LP64(x) x +#define LP64_ONLY(x) x +#define NOT_LP64(x) +#else // !_LP64 +#define LP64_ONLY(x) +#define NOT_LP64(x) x #endif /*!_LP64*/ -#define LP64_SWITCH(lp32,lp64) NOT_LP64(lp32)LP64_ONLY(lp64) +#define LP64_SWITCH(lp32, lp64) NOT_LP64(lp32) LP64_ONLY(lp64) #ifdef __linux__ -# define IS_LINUX true -# define LINUX_ONLY(x) x -# define NOT_LINUX(x) -#else // !__linux__ -# define LINUX_ONLY(x) -# define NOT_LINUX(x) x +#define IS_LINUX true +#define LINUX_ONLY(x) x +#define NOT_LINUX(x) +#else // !__linux__ +#define LINUX_ONLY(x) +#define NOT_LINUX(x) x #endif /*!__linux__*/ #ifdef __APPLE__ -# define IS_MACOS true -# define MACOS_ONLY(x) x -# define NOT_MACOS(x) -#else // !__APPLE__ -# define MACOS_ONLY(x) -# define NOT_MACOS(x) x +#define IS_MACOS true +#define MACOS_ONLY(x) x +#define NOT_MACOS(x) +#else // !__APPLE__ +#define MACOS_ONLY(x) +#define NOT_MACOS(x) x #endif /*!__APPLE__*/ #ifdef _WIN32 -# define IS_WINDOWS true -# define WINDOWS_ONLY(x) x -# define NOT_WINDOWS(x) -#else // !_WIN32 -# define WINDOWS_ONLY(x) -# define NOT_WINDOWS(x) x +#define IS_WINDOWS true +#define WINDOWS_ONLY(x) x +#define NOT_WINDOWS(x) +#else // !_WIN32 +#define WINDOWS_ONLY(x) +#define NOT_WINDOWS(x) x #endif /*!_WIN32*/ #ifdef _WIN64 -# define WIN64_ONLY(x) x -# define NOT_WIN64(x) -#else // !_WIN64 -# define WIN64_ONLY(x) -# define NOT_WIN64(x) x +#define WIN64_ONLY(x) x +#define NOT_WIN64(x) +#else // !_WIN64 +#define WIN64_ONLY(x) 
+#define NOT_WIN64(x) x #endif /*!_WIN64*/ #ifdef WITH_LIGHTNING_COMPILER -# define LIGHTNING_ONLY(x) x -# define IS_LIGHTNING true +#define LIGHTNING_ONLY(x) x +#define IS_LIGHTNING true #else /* !WITH_LIGHTNING_COMPILER */ -# define LIGHTNING_ONLY(x) +#define LIGHTNING_ONLY(x) #endif /* !WITH_LIGHTNING_COMPILER */ #ifndef IS_LINUX -# define IS_LINUX false +#define IS_LINUX false #endif #ifndef IS_MACOS -# define IS_MACOS false +#define IS_MACOS false #endif #ifndef IS_WINDOWS -# define IS_WINDOWS false +#define IS_WINDOWS false #endif #ifndef IS_LIGHTNING -# define IS_LIGHTNING false +#define IS_LIGHTNING false #endif #define IF_LEFT_true(x) x #define IF_LEFT_false(x) -#define IF_RIGHT_true(x) +#define IF_RIGHT_true(x) #define IF_RIGHT_false(x) x -#define IF_LEFT(cond,x) IF_LEFT_##cond(x) -#define IF_RIGHT(cond,x) IF_RIGHT_##cond(x) -#define IF(cond,x,y) IF_LEFT(cond,x)IF_RIGHT(cond,y) +#define IF_LEFT(cond, x) IF_LEFT_##cond(x) +#define IF_RIGHT(cond, x) IF_RIGHT_##cond(x) +#define IF(cond, x, y) IF_LEFT(cond, x) IF_RIGHT(cond, y) -#define LINUX_SWITCH(x,other) LINUX_ONLY(x)NOT_LINUX(other) -#define MACOS_SWITCH(x,other) MACOS_ONLY(x)NOT_MACOS(other) -#define WINDOWS_SWITCH(x,other) WINDOWS_ONLY(x)NOT_WINDOWS(other) +#define LINUX_SWITCH(x, other) LINUX_ONLY(x) NOT_LINUX(other) +#define MACOS_SWITCH(x, other) MACOS_ONLY(x) NOT_MACOS(other) +#define WINDOWS_SWITCH(x, other) WINDOWS_ONLY(x) NOT_WINDOWS(other) -#ifdef OPENCL_MAINLINE -# define IS_MAINLINE true -#else // OPENCL_STAGING -# define IS_MAINLINE false -#endif +#define IS_MAINLINE true #ifdef OPTIMIZED -# define OPTIMIZED_ONLY(x) x -# define NOT_OPTIMIZED(x) -# define IS_OPTIMIZED true +#define OPTIMIZED_ONLY(x) x +#define NOT_OPTIMIZED(x) +#define IS_OPTIMIZED true #else -# define OPTIMIZED_ONLY(x) -# define NOT_OPTIMIZED(x) x -# define IS_OPTIMIZED false +#define OPTIMIZED_ONLY(x) +#define NOT_OPTIMIZED(x) x +#define IS_OPTIMIZED false #endif #if defined(__GNUC__) -# define __ALIGNED__(x) 
__attribute__((aligned(x))) +#define __ALIGNED__(x) __attribute__((aligned(x))) #elif defined(_MSC_VER) -# define __ALIGNED__(x) __declspec(align(x)) +#define __ALIGNED__(x) __declspec(align(x)) #elif defined(RC_INVOKED) -# define __ALIGNED__(x) +#define __ALIGNED__(x) #else -# error +#error #endif /*_MSC_VER*/ #if defined(__GNUC__) -# define likely(cond) __builtin_expect(!!(cond), 1) -# define unlikely(cond) __builtin_expect(!!(cond), 0) -#else // !__GNUC__ -# define likely(cond) (cond) -# define unlikely(cond) (cond) -#endif // !__GNUC__ +#define likely(cond) __builtin_expect(!!(cond), 1) +#define unlikely(cond) __builtin_expect(!!(cond), 0) +#else // !__GNUC__ +#define likely(cond) (cond) +#define unlikely(cond) (cond) +#endif // !__GNUC__ #if defined(__GNUC__) -# define NOINLINE __attribute__((noinline)) -# define ALWAYSINLINE __attribute__ ((always_inline)) +#define NOINLINE __attribute__((noinline)) +#define ALWAYSINLINE __attribute__((always_inline)) #elif defined(_MSC_VER) -# define NOINLINE __declspec(noinline) -# define ALWAYSINLINE __forceinline -#else // !_MSC_VER -# define NOINLINE -# define ALWAYSINLINE -#endif // !_MSC_VER +#define NOINLINE __declspec(noinline) +#define ALWAYSINLINE __forceinline +#else // !_MSC_VER +#define NOINLINE +#define ALWAYSINLINE +#endif // !_MSC_VER #ifdef BRAHMA -# define IS_BRAHMA true +#define IS_BRAHMA true #else -# define IS_BRAHMA false +#define IS_BRAHMA false #endif //! \endcond -#endif // MACROS_HPP_ +#endif // MACROS_HPP_ diff --git a/rocclr/runtime/utils/util.hpp b/rocclr/runtime/utils/util.hpp index 297b922f5e..842ddbe1ff 100644 --- a/rocclr/runtime/utils/util.hpp +++ b/rocclr/runtime/utils/util.hpp @@ -17,51 +17,31 @@ namespace amd { */ //! \brief Check if the given value \a val is a power of 2. -template -static inline bool -isPowerOfTwo(T val) -{ - return (val & (val - 1)) == 0; -} +template static inline bool isPowerOfTwo(T val) { return (val & (val - 1)) == 0; } //! 
\cond ignore // Compute the next power of 2 helper. -template -struct NextPowerOfTwoFunction -{ - template - static T compute(T val) - { - val = NextPowerOfTwoFunction::compute(val); - return (val >> N) | val; - } +template struct NextPowerOfTwoFunction { + template static T compute(T val) { + val = NextPowerOfTwoFunction::compute(val); + return (val >> N) | val; + } }; // Specialized version for <1> to break the recursion. -template <> -struct NextPowerOfTwoFunction<1> -{ - template - static T compute(T val) { return (val >> 1) | val; } +template <> struct NextPowerOfTwoFunction<1> { + template static T compute(T val) { return (val >> 1) | val; } }; -template -struct NextPowerOfTwoHelper -{ - static const uint prev = NextPowerOfTwoHelper::value; - static const uint value = (prev >> S) | prev; -}; -template -struct NextPowerOfTwoHelper -{ - static const int value = (N >> 1) | N; +template struct NextPowerOfTwoHelper { + static const uint prev = NextPowerOfTwoHelper::value; + static const uint value = (prev >> S) | prev; }; +template struct NextPowerOfTwoHelper { static const int value = (N >> 1) | N; }; -template -struct NextPowerOfTwo -{ - static const uint value = NextPowerOfTwoHelper::value + 1; +template struct NextPowerOfTwo { + static const uint value = NextPowerOfTwoHelper::value + 1; }; //! \endcond @@ -78,26 +58,15 @@ struct NextPowerOfTwo * * The next power of two is: 1+compute(val-1) */ -template -inline T -nextPowerOfTwo(T val) -{ - return NextPowerOfTwoFunction::compute(val - 1) + 1; +template inline T nextPowerOfTwo(T val) { + return NextPowerOfTwoFunction::compute(val - 1) + 1; } // Compute log2(N) -template -struct Log2 -{ - static const uint value = Log2::value + 1; -}; +template struct Log2 { static const uint value = Log2::value + 1; }; // Break the recursion -template <> -struct Log2<1> -{ - static const uint value = 0; -}; +template <> struct Log2<1> { static const uint value = 0; }; /*! \brief Return the log2 for a value of type T. 
* @@ -111,177 +80,120 @@ struct Log2<1> * if (val >= 1 << 1) { l |= 1; } * return l; */ -template -struct Log2Function -{ - template - static uint compute(T val) - { - uint l = 0; - if (val >= T(1) << N) { - val >>= N; l = N; - } - return l + Log2Function::compute(val); +template struct Log2Function { + template static uint compute(T val) { + uint l = 0; + if (val >= T(1) << N) { + val >>= N; + l = N; } + return l + Log2Function::compute(val); + } }; -template <> -struct Log2Function<1> -{ - template - static uint compute(T val) { - return (val >= T(1)<<1) ? 1 : 0; - } +template <> struct Log2Function<1> { + template static uint compute(T val) { return (val >= T(1) << 1) ? 1 : 0; } }; // log2 helper function -template -inline uint -log2(T val) -{ - return Log2Function::compute(val); +template inline uint log2(T val) { return Log2Function::compute(val); } + +template inline T alignDown(T value, size_t alignment) { + return (T)(value & ~(alignment - 1)); } -template -inline T -alignDown(T value, size_t alignment) -{ - return (T) (value & ~(alignment - 1)); +template inline T* alignDown(T* value, size_t alignment) { + return (T*)alignDown((intptr_t)value, alignment); } -template -inline T* -alignDown(T* value, size_t alignment) -{ - return (T*) alignDown((intptr_t) value, alignment); +template inline T alignUp(T value, size_t alignment) { + return alignDown((T)(value + alignment - 1), alignment); } -template -inline T -alignUp(T value, size_t alignment) -{ - return alignDown((T) (value + alignment - 1), alignment); +template inline T* alignUp(T* value, size_t alignment) { + return (T*)alignDown((intptr_t)(value + alignment - 1), alignment); } -template -inline T* -alignUp(T* value, size_t alignment) -{ - return (T*) alignDown((intptr_t) (value + alignment - 1), alignment); +template inline bool isMultipleOf(T value, size_t alignment) { + if (isPowerOfTwo(alignment)) { + // fast path, using logical operators + return alignUp(value, alignment) == value; + } + return 
value % alignment == 0; } -template -inline bool isMultipleOf(T value, size_t alignment) -{ - if (isPowerOfTwo(alignment)) { - // fast path, using logical operators - return alignUp(value, alignment) == value; - } - return value % alignment == 0; +template inline bool isMultipleOf(T* value, size_t alignment) { + intptr_t ptr = reinterpret_cast(value); + return isMultipleOf(ptr, alignment); } -template -inline bool isMultipleOf(T* value, size_t alignment) -{ - intptr_t ptr = reinterpret_cast(value); - return isMultipleOf(ptr, alignment); -} - -template -struct DeviceMap { - Reference ref_; - Value value_; +template struct DeviceMap { + Reference ref_; + Value value_; }; -inline uint -countBitsSet32(uint32_t value) -{ +inline uint countBitsSet32(uint32_t value) { #if __GNUC__ >= 4 - return (uint)__builtin_popcount(value); + return (uint)__builtin_popcount(value); #else - value = value - ((value >> 1) & 0x55555555); - value = (value & 0x33333333) + ((value >> 2) & 0x33333333); - return (uint)(((value + (value >> 4) & 0xF0F0F0F) * 0x1010101) >> 24); + value = value - ((value >> 1) & 0x55555555); + value = (value & 0x33333333) + ((value >> 2) & 0x33333333); + return (uint)(((value + (value >> 4) & 0xF0F0F0F) * 0x1010101) >> 24); #endif } -inline uint -countBitsSet64(uint64_t value) -{ +inline uint countBitsSet64(uint64_t value) { #if __GNUC__ >= 4 - return (uint)__builtin_popcountll(value); + return (uint)__builtin_popcountll(value); #else - value = value - ((value >> 1) & 0x5555555555555555ULL); - value = (value & 0x3333333333333333ULL) + ((value >> 2) & 0x3333333333333333ULL); - value = (value + (value >> 4)) & 0x0F0F0F0F0F0F0F0FULL; - return (uint)((uint64_t)(value * 0x0101010101010101ULL) >> 56); + value = value - ((value >> 1) & 0x5555555555555555ULL); + value = (value & 0x3333333333333333ULL) + ((value >> 2) & 0x3333333333333333ULL); + value = (value + (value >> 4)) & 0x0F0F0F0F0F0F0F0FULL; + return (uint)((uint64_t)(value * 0x0101010101010101ULL) >> 56); #endif } 
-inline uint -leastBitSet32(uint32_t value) -{ +inline uint leastBitSet32(uint32_t value) { #if defined(_WIN32) - unsigned long idx; - return _BitScanForward(&idx, (unsigned long)value) ? idx : (uint)-1; + unsigned long idx; + return _BitScanForward(&idx, (unsigned long)value) ? idx : (uint)-1; #else - return value ? __builtin_ctz(value) : (uint)-1; + return value ? __builtin_ctz(value) : (uint)-1; #endif } -inline uint -leastBitSet64(uint64_t value) -{ +inline uint leastBitSet64(uint64_t value) { #if defined(_WIN64) - unsigned long idx; - return _BitScanForward64(&idx, (unsigned __int64)value) ? idx : (uint)-1; -#elif defined (__GNUC__) - return value ? __builtin_ctzll(value) : (uint)-1; + unsigned long idx; + return _BitScanForward64(&idx, (unsigned __int64)value) ? idx : (uint)-1; +#elif defined(__GNUC__) + return value ? __builtin_ctzll(value) : (uint)-1; #else - static const uint8_t lookup67[67+1] = { - 64, 0, 1, 39, 2, 15, 40, 23, - 3, 12, 16, 59, 41, 19, 24, 54, - 4, -1, 13, 10, 17, 62, 60, 28, - 42, 30, 20, 51, 25, 44, 55, 47, - 5, 32, -1, 38, 14, 22, 11, 58, - 18, 53, 63, 9, 61, 27, 29, 50, - 43, 46, 31, 37, 21, 57, 52, 8, - 26, 49, 45, 36, 56, 7, 48, 35, - 6, 34, 33, -1 - }; + static const uint8_t lookup67[67 + 1] = { + 64, 0, 1, 39, 2, 15, 40, 23, 3, 12, 16, 59, 41, 19, 24, 54, 4, -1, 13, 10, 17, 62, 60, + 28, 42, 30, 20, 51, 25, 44, 55, 47, 5, 32, -1, 38, 14, 22, 11, 58, 18, 53, 63, 9, 61, 27, + 29, 50, 43, 46, 31, 37, 21, 57, 52, 8, 26, 49, 45, 36, 56, 7, 48, 35, 6, 34, 33, -1}; - return (uint)lookup67[((int64_t)value & -(int64_t)value) % 67]; + return (uint)lookup67[((int64_t)value & -(int64_t)value) % 67]; #endif } -template -inline uint countBitsSet(T value) -{ - return (sizeof(T) == 8) ? countBitsSet64((uint64_t)value) : - countBitsSet32((uint32_t)value); +template inline uint countBitsSet(T value) { + return (sizeof(T) == 8) ? 
countBitsSet64((uint64_t)value) : countBitsSet32((uint32_t)value); } -template -inline uint leastBitSet(T value) -{ - return (sizeof(T) == 8) ? leastBitSet64((uint64_t)value) : - leastBitSet32((uint32_t)value); +template inline uint leastBitSet(T value) { + return (sizeof(T) == 8) ? leastBitSet64((uint64_t)value) : leastBitSet32((uint32_t)value); } -static inline bool Is32Bits() { - return LP64_SWITCH(true, false); -} +static inline bool Is32Bits() { return LP64_SWITCH(true, false); } -static inline bool Is64Bits() { - return LP64_SWITCH(false, true); -} +static inline bool Is64Bits() { return LP64_SWITCH(false, true); } -template -class ScopeGuard { +template class ScopeGuard { public: - explicit ALWAYSINLINE ScopeGuard(const lambda& release) - : release_(release), dismiss_(false) {} + explicit ALWAYSINLINE ScopeGuard(const lambda& release) : release_(release), dismiss_(false) {} ScopeGuard(ScopeGuard& rhs) { *this = rhs; } @@ -300,12 +212,11 @@ class ScopeGuard { bool dismiss_; }; -#define MAKE_SCOPE_GUARD_HELPER(lname, sname, ...) \ - auto lname = __VA_ARGS__; \ +#define MAKE_SCOPE_GUARD_HELPER(lname, sname, ...) \ + auto lname = __VA_ARGS__; \ amd::ScopeGuard sname(lname); -#define MAKE_SCOPE_GUARD(name, ...) \ - MAKE_SCOPE_GUARD_HELPER(XCONCAT(scopeGuardLambda, __COUNTER__), name, \ - __VA_ARGS__) +#define MAKE_SCOPE_GUARD(name, ...) 
\ + MAKE_SCOPE_GUARD_HELPER(XCONCAT(scopeGuardLambda, __COUNTER__), name, __VA_ARGS__) /*@}*/} // namespace amd diff --git a/rocclr/runtime/utils/versions.hpp b/rocclr/runtime/utils/versions.hpp index ea0e07a022..949600d11a 100644 --- a/rocclr/runtime/utils/versions.hpp +++ b/rocclr/runtime/utils/versions.hpp @@ -8,27 +8,29 @@ #include "utils/macros.hpp" #ifndef AMD_PLATFORM_NAME -# define AMD_PLATFORM_NAME "AMD Accelerated Parallel Processing" -#endif // AMD_PLATFORM_NAME +#define AMD_PLATFORM_NAME "AMD Accelerated Parallel Processing" +#endif // AMD_PLATFORM_NAME #ifndef AMD_PLATFORM_BUILD_NUMBER -# define AMD_PLATFORM_BUILD_NUMBER 2403 -#endif // AMD_PLATFORM_BUILD_NUMBER +#define AMD_PLATFORM_BUILD_NUMBER 2403 +#endif // AMD_PLATFORM_BUILD_NUMBER #ifndef AMD_PLATFORM_REVISION_NUMBER -# define AMD_PLATFORM_REVISION_NUMBER 0 -#endif // AMD_PLATFORM_REVISION_NUMBER +#define AMD_PLATFORM_REVISION_NUMBER 0 +#endif // AMD_PLATFORM_REVISION_NUMBER #ifndef AMD_PLATFORM_RELEASE_INFO -# define AMD_PLATFORM_RELEASE_INFO -#endif // AMD_PLATFORM_RELEASE_INFO +#define AMD_PLATFORM_RELEASE_INFO +#endif // AMD_PLATFORM_RELEASE_INFO -#define AMD_BUILD_STRING XSTR(AMD_PLATFORM_BUILD_NUMBER) \ - "." XSTR(AMD_PLATFORM_REVISION_NUMBER) +#define AMD_BUILD_STRING \ + XSTR(AMD_PLATFORM_BUILD_NUMBER) \ + "." XSTR(AMD_PLATFORM_REVISION_NUMBER) #ifndef AMD_PLATFORM_INFO -# define AMD_PLATFORM_INFO "AMD-APP" AMD_PLATFORM_RELEASE_INFO \ - DEBUG_ONLY("." IF(IS_OPTIMIZED,"opt","dbg")) " (" AMD_BUILD_STRING ")" -#endif // ATI_PLATFORM_INFO +#define AMD_PLATFORM_INFO \ + "AMD-APP" AMD_PLATFORM_RELEASE_INFO DEBUG_ONLY( \ + "." IF(IS_OPTIMIZED, "opt", "dbg")) " (" AMD_BUILD_STRING ")" +#endif // ATI_PLATFORM_INFO -#endif // VERSIONS_HPP_ +#endif // VERSIONS_HPP_