/* Copyright (c) 2008 - 2023 Advanced Micro Devices, Inc. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include "device/device.hpp" #include "thread/monitor.hpp" #include "utils/options.hpp" #include "comgrctx.hpp" #include #include #include #include #if defined(WITH_HSA_DEVICE) #include "device/rocm/rocdevice.hpp" extern amd::AppProfile* rocCreateAppProfile(); #endif #if defined(WITH_PAL_DEVICE) // namespace amd::pal { extern bool PalDeviceLoad(); extern void PalDeviceUnload(); //} #endif // WITH_PAL_DEVICE #include "platform/runtime.hpp" #include "platform/program.hpp" #include "thread/monitor.hpp" #include "amdocl/cl_common.hpp" #include "utils/options.hpp" #include "utils/versions.hpp" // AMD_PLATFORM_INFO #if defined(HAVE_BLOWFISH_H) #include "blowfish/oclcrypt.hpp" #endif #if defined(WITH_COMPILER_LIB) #include "utils/bif_section_labels.hpp" #include "utils/libUtils.h" #include "spirv/spirvUtils.h" #endif #include #include #include #include #include #include #include #include #include namespace { constexpr char hsaIsaNamePrefix[] = "amdgcn-amd-amdhsa--"; } // namespace namespace amd::device { extern const char* BlitLinearSourceCode; extern const char* BlitImageSourceCode; bool VirtualDevice::ActiveWait() const { return device_().ActiveWait(); } } // namespace amd::device static_assert(static_cast(device::Memory::MemAccess::kMemAccessNone) == static_cast(amd::Device::VmmAccess::kNone), "Mem Access Flag None mismatch between Device and Memory!"); static_assert(static_cast(device::Memory::MemAccess::kMemAccessRead) == static_cast(amd::Device::VmmAccess::kReadOnly), "Mem Access Flag Read mismatch between Device and Memory!"); static_assert(static_cast(device::Memory::MemAccess::kMemAccessReadWrite) == static_cast(amd::Device::VmmAccess::kReadWrite), "Mem Access Flag Read Write mismatch between Device and Memory!"); namespace amd { amd::Monitor Device::lockP2P_("Lock P2P ON/OFF"); std::pair Isa::supportedIsas() { constexpr amd::Isa::Feature NONE = amd::Isa::Feature::Unsupported; constexpr amd::Isa::Feature ANY = amd::Isa::Feature::Any; constexpr 
amd::Isa::Feature OFF = amd::Isa::Feature::Disabled;
  constexpr amd::Isa::Feature ON = amd::Isa::Feature::Enabled;

  static constexpr Isa supportedIsas_[] = {
      // NOTE: Add new targets by adding rows for each permutation of the SRAMECC
      // and XNACK target feature values. If the target does not support the
      // feature then only NONE is used. If it supports the feature then include
      // rows for ANY, OFF and ON (but not NONE).
      //
      // Use the Target ID syntax. This comprises the processor name, followed by
      // the target feature settings in alphabetic order separated by ':'. If a
      // target feature is omitted it means either it is not supported, or it has
      // the ANY value. If the target feature is disabled then use a '-' suffix,
      // and if enabled use a '+' suffix.
      //
      // If the HSAIL or AMD IL compilers do not support the target, then use
      // nullptr for the ID.
      //
      // Column order of every row (legend rebuilt from the original column header):
      //   Compiler ............ Target ID, HSAIL ID
      //   Runtime supported ... ROC, PAL
      //   IP version .......... Maj, Min, Stp
      //   Target features ..... SRAMECC, XNACK
      //   Target properties ... SIMD/CU, SIMD width, instr width, bank width, LDS size, mem banks
      {"gfx801", nullptr, true, true, 8, 0, 1, NONE, ANY, 4, 16, 1, 256, 64 * Ki, 32},
      {"gfx801:xnack-", nullptr, true, false, 8, 0, 1, NONE, OFF, 4, 16, 1, 256, 64 * Ki, 32},
      {"gfx801:xnack+", "gfx801", true, true, 8, 0, 1, NONE, ON, 4, 16, 1, 256, 64 * Ki, 32},
      {"gfx802", "gfx802", true, true, 8, 0, 2, NONE, NONE, 4, 16, 1, 256, 64 * Ki, 32},
      {"gfx803", "gfx803", true, true, 8, 0, 3, NONE, NONE, 4, 16, 1, 256, 64 * Ki, 32},
      {"gfx805", nullptr, true, true, 8, 0, 5, NONE, NONE, 4, 16, 1, 256, 64 * Ki, 32},
      {"gfx810", nullptr, true, true, 8, 1, 0, NONE, ANY, 4, 16, 1, 256, 64 * Ki, 32},
      {"gfx810:xnack-", nullptr, true, false, 8, 1, 0, NONE, OFF, 4, 16, 1, 256, 64 * Ki, 32},
      {"gfx810:xnack+", "gfx810", true, true, 8, 1, 0, NONE, ON, 4, 16, 1, 256, 64 * Ki, 32},
      {"gfx900", "gfx901", true, true, 9, 0, 0, NONE, ANY, 4, 16, 1, 256, 64 * Ki, 32},
{"gfx900:xnack-", "gfx900", true, true, 9, 0, 0, NONE, OFF, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx900:xnack+", "gfx901", true, true, 9, 0, 0, NONE, ON, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx902", "gfx903", true, true, 9, 0, 2, NONE, ANY, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx902:xnack-", "gfx902", true, true, 9, 0, 2, NONE, OFF, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx902:xnack+", "gfx903", true, true, 9, 0, 2, NONE, ON, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx904", "gfx905", true, true, 9, 0, 4, NONE, ANY, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx904:xnack-", "gfx904", true, true, 9, 0, 4, NONE, OFF, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx904:xnack+", "gfx905", true, true, 9, 0, 4, NONE, ON, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx906", "gfx907", true, true, 9, 0, 6, ANY, ANY, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx906:sramecc-", "gfx907", true, true, 9, 0, 6, OFF, ANY, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx906:sramecc+", nullptr, true, true, 9, 0, 6, ON, ANY, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx906:xnack-", "gfx906", true, true, 9, 0, 6, ANY, OFF, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx906:xnack+", "gfx907", true, true, 9, 0, 6, ANY, ON, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx906:sramecc-:xnack-", "gfx906", true, true, 9, 0, 6, OFF, OFF, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx906:sramecc-:xnack+", "gfx907", true, true, 9, 0, 6, OFF, ON, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx906:sramecc+:xnack-", nullptr, true, true, 9, 0, 6, ON, OFF, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx906:sramecc+:xnack+", nullptr, true, true, 9, 0, 6, ON, ON, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx908", nullptr, true, false, 9, 0, 8, ANY, ANY, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx908:sramecc-", nullptr, true, false, 9, 0, 8, OFF, ANY, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx908:sramecc+", nullptr, true, false, 9, 0, 8, ON, ANY, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx908:xnack-", nullptr, true, false, 9, 0, 8, ANY, OFF, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx908:xnack+", nullptr, true, false, 9, 0, 8, ANY, ON, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx908:sramecc-:xnack-", nullptr, true, false, 9, 
0, 8, OFF, OFF, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx908:sramecc-:xnack+", nullptr, true, false, 9, 0, 8, OFF, ON, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx908:sramecc+:xnack-", nullptr, true, false, 9, 0, 8, ON, OFF, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx908:sramecc+:xnack+", nullptr, true, false, 9, 0, 8, ON, ON, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx909", nullptr, false, true, 9, 0, 2, NONE, ANY, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx909:xnack-", nullptr, false, true, 9, 0, 2, NONE, OFF, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx909:xnack+", nullptr, false, true, 9, 0, 2, NONE, ON, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx90a", nullptr, true, false, 9, 0, 10, ANY, ANY, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx90a:sramecc-", nullptr, true, false, 9, 0, 10, OFF, ANY, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx90a:sramecc+", nullptr, true, false, 9, 0, 10, ON, ANY, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx90a:xnack-", nullptr, true, false, 9, 0, 10, ANY, OFF, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx90a:xnack+", nullptr, true, false, 9, 0, 10, ANY, ON, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx90a:sramecc-:xnack-", nullptr, true, false, 9, 0, 10, OFF, OFF, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx90a:sramecc-:xnack+", nullptr, true, false, 9, 0, 10, OFF, ON, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx90a:sramecc+:xnack-", nullptr, true, false, 9, 0, 10, ON, OFF, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx90a:sramecc+:xnack+", nullptr, true, false, 9, 0, 10, ON, ON, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx942", nullptr, true, false, 9, 4, 2, ANY, ANY, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx942:sramecc-", nullptr, true, false, 9, 4, 2, OFF, ANY, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx942:sramecc+", nullptr, true, false, 9, 4, 2, ON, ANY, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx942:xnack-", nullptr, true, false, 9, 4, 2, ANY, OFF, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx942:xnack+", nullptr, true, false, 9, 4, 2, ANY, ON, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx942:sramecc-:xnack-", nullptr, true, false, 9, 4, 2, OFF, OFF, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx942:sramecc-:xnack+", nullptr, true, false, 
9, 4, 2, OFF, ON, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx942:sramecc+:xnack-", nullptr, true, false, 9, 4, 2, ON, OFF, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx942:sramecc+:xnack+", nullptr, true, false, 9, 4, 2, ON, ON, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx90c", nullptr, true, true, 9, 0, 12, NONE, ANY, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx90c:xnack-", "gfx90c", true, true, 9, 0, 12, NONE, OFF, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx90c:xnack+", "gfx90d", true, true, 9, 0, 12, NONE, ON, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx950", nullptr, true, false, 9, 5, 0, ANY, ANY, 4, 16, 1, 256, 160 * Ki, 64}, {"gfx950:sramecc-", nullptr, true, false, 9, 5, 0, OFF, ANY, 4, 16, 1, 256, 160 * Ki, 64}, {"gfx950:sramecc+", nullptr, true, false, 9, 5, 0, ON, ANY, 4, 16, 1, 256, 160 * Ki, 64}, {"gfx950:xnack-", nullptr, true, false, 9, 5, 0, ANY, OFF, 4, 16, 1, 256, 160 * Ki, 64}, {"gfx950:xnack+", nullptr, true, false, 9, 5, 0, ANY, ON, 4, 16, 1, 256, 160 * Ki, 64}, {"gfx950:sramecc-:xnack-", nullptr, true, false, 9, 5, 0, OFF, OFF, 4, 16, 1, 256, 160 * Ki, 64}, {"gfx950:sramecc-:xnack+", nullptr, true, false, 9, 5, 0, OFF, ON, 4, 16, 1, 256, 160 * Ki, 64}, {"gfx950:sramecc+:xnack-", nullptr, true, false, 9, 5, 0, ON, OFF, 4, 16, 1, 256, 160 * Ki, 64}, {"gfx950:sramecc+:xnack+", nullptr, true, false, 9, 5, 0, ON, ON, 4, 16, 1, 256, 160 * Ki, 64}, {"gfx9-generic", nullptr, true, true, 9, 0, 0, NONE, ANY, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx9-generic:xnack-", nullptr, true, true, 9, 0, 0, NONE, OFF, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx9-generic:xnack+", nullptr, true, true, 9, 0, 0, NONE, ON, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx9-4-generic", nullptr, true, true, 9, 4, 0, ANY, ANY, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx9-4-generic:sramecc-", nullptr, true, true, 9, 4, 0, OFF, ANY, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx9-4-generic:sramecc+", nullptr, true, true, 9, 4, 0, ON, ANY, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx9-4-generic:xnack-", nullptr, true, true, 9, 4, 0, ANY, OFF, 4, 16, 1, 256, 64 * Ki, 32}, 
{"gfx9-4-generic:xnack+", nullptr, true, true, 9, 4, 0, ANY, ON, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx9-4-generic:sramecc-:xnack-", nullptr, true, true, 9, 4, 0, OFF, OFF, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx9-4-generic:sramecc-:xnack+", nullptr, true, true, 9, 4, 0, OFF, ON, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx9-4-generic:sramecc+:xnack-", nullptr, true, true, 9, 4, 0, ON, OFF, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx9-4-generic:sramecc+:xnack+", nullptr, true, true, 9, 4, 0, ON, ON, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx1010", "gfx1010", true, true, 10, 1, 0, NONE, ANY, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1010:xnack-", "gfx1010", true, true, 10, 1, 0, NONE, OFF, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1010:xnack+", nullptr, true, true, 10, 1, 0, NONE, ON, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1011", "gfx1011", true, true, 10, 1, 1, NONE, ANY, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1011:xnack-", "gfx1011", true, true, 10, 1, 1, NONE, OFF, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1011:xnack+", nullptr, true, true, 10, 1, 1, NONE, ON, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1012", "gfx1012", true, true, 10, 1, 2, NONE, ANY, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1012:xnack-", "gfx1012", true, true, 10, 1, 2, NONE, OFF, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1012:xnack+", nullptr, true, true, 10, 1, 2, NONE, ON, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1013", "gfx1013", true, false, 10, 1, 3, NONE, ANY, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1013:xnack-", "gfx1013", true, false, 10, 1, 3, NONE, OFF, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1013:xnack+", nullptr, true, false, 10, 1, 3, NONE, ON, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx10-1-generic", nullptr, true, true, 10, 1, 0, NONE, ANY, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx10-1-generic:xnack-", nullptr, true, true, 10, 1, 0, NONE, OFF, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx10-1-generic:xnack+", nullptr, true, true, 10, 1, 0, NONE, ON, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1030", "gfx1030", true, true, 10, 3, 0, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1031", "gfx1031", true, true, 10, 3, 1, NONE, 
NONE, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1032", "gfx1032", true, true, 10, 3, 2, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1033", "gfx1033", true, false, 10, 3, 3, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1034", "gfx1034", true, true, 10, 3, 4, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1035", "gfx1035", true, true, 10, 3, 5, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1036", "gfx1036", true, true, 10, 3, 6, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx10-3-generic", nullptr, true, true, 10, 3, 0, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1100", "gfx1100", true, true, 11, 0, 0, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1101", "gfx1101", true, true, 11, 0, 1, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1102", "gfx1102", true, true, 11, 0, 2, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1103", "gfx1103", true, true, 11, 0, 3, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1150", "gfx1150", true, true, 11, 5, 0, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1151", "gfx1151", true, true, 11, 5, 1, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1152", "gfx1152", true, true, 11, 5, 2, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1153", "gfx1153", true, true, 11, 5, 3, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx11-generic", nullptr, true, true, 11, 0, 0, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1200", "gfx1200", true, true, 12, 0, 0, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1201", "gfx1201", true, true, 12, 0, 1, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx12-generic", nullptr, true, true, 12, 0, 0, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, }; return std::make_pair(std::begin(supportedIsas_), std::end(supportedIsas_)); } std::string Isa::processorName() const { std::string processor(targetId_); return processor.substr(0, processor.find(':')); } std::string Isa::isaName() const { return std::string(hsaIsaNamePrefix) + targetId(); } bool Isa::isCompatible(const Isa& codeObjectIsa, const Isa& agentIsa) { bool isGeneric = 
std::strstr(codeObjectIsa.targetId(), "generic") != nullptr; if (isGeneric) { if (codeObjectIsa.versionMajor() != agentIsa.versionMajor() || codeObjectIsa.versionMinor() > agentIsa.versionMinor() || (codeObjectIsa.versionMinor() == agentIsa.versionMinor() && codeObjectIsa.versionStepping() > agentIsa.versionStepping())) { return false; } #ifdef DEBUG // Only check in DEBUG mode if (std::strstr(agentIsa.targetId(), "gfx906") != nullptr) { // For the generic target of gfx906, codeObjectIsa.isSrameccSupported() == false while // agentIsa.isSrameccSupported() = true assert(agentIsa.sramecc() != Feature::Any); } else { assert(codeObjectIsa.isSrameccSupported() == agentIsa.isSrameccSupported() && agentIsa.sramecc() != Feature::Any); } #endif } else { if (codeObjectIsa.versionMajor() != agentIsa.versionMajor() || codeObjectIsa.versionMinor() != agentIsa.versionMinor() || codeObjectIsa.versionStepping() != agentIsa.versionStepping()) return false; assert(codeObjectIsa.isSrameccSupported() == agentIsa.isSrameccSupported() && agentIsa.sramecc() != Feature::Any); } if ((codeObjectIsa.sramecc() == Feature::Enabled || codeObjectIsa.sramecc() == Feature::Disabled) && codeObjectIsa.sramecc() != agentIsa.sramecc()) return false; assert(codeObjectIsa.isXnackSupported() == agentIsa.isXnackSupported() && agentIsa.xnack() != Feature::Any); if ((codeObjectIsa.xnack() == Feature::Enabled || codeObjectIsa.xnack() == Feature::Disabled) && codeObjectIsa.xnack() != agentIsa.xnack()) return false; return true; } const Isa* Isa::findIsa(const char* isaName) { if (!isaName) return nullptr; const char* prefix = std::strstr(isaName, hsaIsaNamePrefix); if (prefix != isaName) return nullptr; const char* targetId = isaName + std::strlen(hsaIsaNamePrefix); auto supportedIsas_ = supportedIsas(); auto isaIter = std::find_if(supportedIsas_.first, supportedIsas_.second, [&](const Isa& isa) { return std::strcmp(targetId, isa.targetId_) == 0; }); return isaIter == supportedIsas_.second ? 
nullptr : isaIter; } const Isa* Isa::findIsa(uint32_t versionMajor, uint32_t versionMinor, uint32_t versionStepping, Isa::Feature sramecc, Isa::Feature xnack) { auto supportedIsas_ = supportedIsas(); auto isaIter = std::find_if(supportedIsas_.first, supportedIsas_.second, [&](const Isa& isa) { return versionMajor == isa.versionMajor_ && versionMinor == isa.versionMinor_ && versionStepping == isa.versionStepping_ && (isa.sramecc_ == amd::Isa::Feature::Unsupported || isa.sramecc_ == sramecc) && (isa.xnack_ == amd::Isa::Feature::Unsupported || isa.xnack_ == xnack); }); return isaIter == supportedIsas_.second ? nullptr : isaIter; } const Isa* Isa::begin() { return supportedIsas().first; } const Isa* Isa::end() { return supportedIsas().second; } std::vector* Device::devices_ = nullptr; AppProfile Device::appProfile_; Context* Device::glb_ctx_ = nullptr; // P2P Staging Lock Monitor Device::p2p_stage_ops_(true); Memory* Device::p2p_stage_ = nullptr; cl_int Device::gpu_error_ = CL_SUCCESS; std::shared_mutex MemObjMap::AllocatedLock_ ROCCLR_INIT_PRIORITY(101); std::map MemObjMap::MemObjMap_ ROCCLR_INIT_PRIORITY(101); std::map MemObjMap::VirtualMemObjMap_ ROCCLR_INIT_PRIORITY(101); std::map MemObjMap::IpcHandleMemObjMap_ ROCCLR_INIT_PRIORITY( 101); void MemObjMap::AddMemObj(const void* k, amd::Memory* v) { std::unique_lock lock(AllocatedLock_); auto rval = MemObjMap_.insert({reinterpret_cast(k), v}); if (!rval.second) { DevLogPrintfError("Memobj map already has an entry for ptr: 0x%x", reinterpret_cast(k)); } } void MemObjMap::RemoveMemObj(const void* k) { std::unique_lock lock(AllocatedLock_); auto rval = MemObjMap_.erase(reinterpret_cast(k)); guarantee(rval == 1, "Memobj map does not have ptr: 0x%x", reinterpret_cast(k)); } amd::Memory* MemObjMap::FindMemObj(const void* k, size_t* offset) { std::shared_lock lock(AllocatedLock_); uintptr_t key = reinterpret_cast(k); auto it = MemObjMap_.upper_bound(key); if (it == MemObjMap_.begin()) { return nullptr; } --it; amd::Memory* 
mem = it->second; size_t mem_size = (mem->getMemFlags() & ROCCLR_MEM_PHYMEM) ? sizeof(mem->getUserData().hsa_handle) : mem->getSize(); if (key >= it->first && key < (it->first + mem_size)) { if (offset != nullptr) { *offset = key - it->first; } // the k is in the range return mem; } else { return nullptr; } } void MemObjMap::UpdateAccess(amd::Device* peerDev) { if (peerDev == nullptr) { return; } // Provides access to all memory allocated on peerDev but // hsa_amd_agents_allow_access was not called because there was no peer std::shared_lock lock(AllocatedLock_); for (auto it : MemObjMap_) { const std::vector& devices = it.second->getContext().devices(); if (devices.size() == 1 && devices[0] == peerDev) { device::Memory* devMem = it.second->getDeviceMemory(*devices[0]); if (!devMem->getAllowedPeerAccess()) { peerDev->deviceAllowAccess(reinterpret_cast(it.first)); devMem->setAllowedPeerAccess(true); } } } } void MemObjMap::Purge(amd::Device* dev) { assert(dev != nullptr); std::unique_lock lock(AllocatedLock_); for (auto it = MemObjMap_.cbegin(); it != MemObjMap_.cend();) { amd::Memory* memObj = it->second; unsigned int flags = memObj->getMemFlags(); const std::vector& devices = memObj->getContext().devices(); if (devices.size() == 1 && devices[0] == dev && !(flags & ROCCLR_MEM_INTERNAL_MEMORY)) { memObj->release(); it = MemObjMap_.erase(it); } else { ++it; } } } void MemObjMap::AddVirtualMemObj(const void* k, amd::Memory* v) { std::unique_lock lock(AllocatedLock_); auto rval = VirtualMemObjMap_.insert({reinterpret_cast(k), v}); if (!rval.second) { DevLogPrintfError("Virtual Memobj map already has an entry for ptr: 0x%x", reinterpret_cast(k)); } } void MemObjMap::RemoveVirtualMemObj(const void* k) { std::unique_lock lock(AllocatedLock_); auto rval = VirtualMemObjMap_.erase(reinterpret_cast(k)); guarantee(rval == 1, "Virtual Memobj map does not have ptr: 0x%x", reinterpret_cast(k)); } amd::Memory* MemObjMap::FindVirtualMemObj(const void* k) { std::shared_lock 
lock(AllocatedLock_); uintptr_t key = reinterpret_cast(k); auto it = VirtualMemObjMap_.upper_bound(key); if (it == VirtualMemObjMap_.begin()) { return nullptr; } --it; amd::Memory* mem = it->second; if (key >= it->first && key < (it->first + mem->getSize())) { // the k is in the range return mem; } else { return nullptr; } } void MemObjMap::AddIpcHandleMemObj(const IpcMemHandle& k, amd::Memory* v) { std::unique_lock lock(AllocatedLock_); auto rval = IpcHandleMemObjMap_.insert({k, v}); if (!rval.second) { DevLogPrintfError( "Error adding entry for Memobj 0x%x in IpcHandle map. The handle already exists.", v); } } void MemObjMap::RemoveIpcHandleMemObj(amd::Memory* v) { std::unique_lock lock(AllocatedLock_); for (const auto it : IpcHandleMemObjMap_) { if (it.second == v) { IpcHandleMemObjMap_.erase(it.first); break; } } } amd::Memory* MemObjMap::FindIpcHandleMemObj(const IpcMemHandle& k) { std::shared_lock lock(AllocatedLock_); auto it = IpcHandleMemObjMap_.find(k); if (it == IpcHandleMemObjMap_.cend()) { return nullptr; } return it->second; } //================================================================================================== bool Device::ValidateVirtualAddressRange(amd::Memory* vaddr_base_obj, amd::Memory* vaddr_sub_obj) { // Check if the start of the subbuffer is >= to base start. if (vaddr_base_obj->getSvmPtr() > vaddr_sub_obj->getSvmPtr()) { LogError("Sub buffer cannot start with addr lesser than base_start."); return false; } // Check if the new size belongs to the vaddr_base_obj range. address vaddr_base_end = reinterpret_cast
(vaddr_base_obj->getSvmPtr()) + vaddr_base_obj->getSize(); address vaddr_sub_end = reinterpret_cast
(vaddr_sub_obj->getSvmPtr()) + vaddr_sub_obj->getSize(); if (vaddr_sub_end > vaddr_base_end) { LogError("Sub buffer memory end cannot be greater than base_end. Return nullptr"); return false; } return true; } //================================================================================================== amd::Memory* Device::CreateVirtualBuffer(amd::Context& device_context, void* vptr, size_t size, int deviceId, bool parent, bool kForceAlloc) { amd::Memory* vaddr_base_obj = nullptr; amd::Memory* vaddr_sub_obj = nullptr; constexpr bool kSysMemAlloc = false; constexpr bool kSkipAlloc = false; if (parent) { vaddr_base_obj = new (GlbCtx()) amd::Buffer(GlbCtx(), CL_MEM_VA_RANGE_AMD, size, vptr); if (vaddr_base_obj == nullptr) { LogError("failed to new a va range curr_mem_obj object!"); return nullptr; } // This curr_mem_obj->create() does not create an actual memory but stores the memory info // with given vptr on ROCr backend. if (!vaddr_base_obj->create(nullptr, kSysMemAlloc, kSkipAlloc, kForceAlloc)) { LogError("failed to create a va range mem object"); vaddr_base_obj->release(); return nullptr; } amd::MemObjMap::AddVirtualMemObj(vaddr_base_obj->getSvmPtr(), vaddr_base_obj); } else { // If not parent, but sub-buffer/child, then validate the address range vaddr_base_obj = amd::MemObjMap::FindVirtualMemObj(vptr); if (vaddr_base_obj == nullptr) { LogPrintfError("Cannot find entry in VirtualMemObjMap: 0x%x \n", vptr); return nullptr; } assert(vaddr_base_obj->getMemFlags() & CL_MEM_VA_RANGE_AMD); size_t offset = (reinterpret_cast
(vptr) - reinterpret_cast
(vaddr_base_obj->getSvmPtr())); vaddr_sub_obj = new (device_context) amd::Buffer(device_context, CL_MEM_VA_RANGE_AMD, size, vptr); vaddr_sub_obj->SetParent(vaddr_base_obj); vaddr_sub_obj->setOrigin(offset); // This curr_mem_obj->create() does not create an actual memory but stores the memory info // with given vptr on ROCr backend. if (!vaddr_sub_obj->create(nullptr, kSysMemAlloc, kSkipAlloc, kForceAlloc)) { LogError("failed to create a va range mem object"); vaddr_sub_obj->release(); return nullptr; } vaddr_sub_obj->getUserData().deviceId = deviceId; if (!ValidateVirtualAddressRange(vaddr_base_obj, vaddr_sub_obj)) { LogError("Validation failed on address range, returning nullptr"); return nullptr; } } if (vptr != nullptr) { // Assert to make sure that amd::Memory object has set the right ptr. guarantee(vptr == (parent ? vaddr_base_obj->getSvmPtr() : vaddr_sub_obj->getSvmPtr()), "amd::Memory object does not have the right ptr"); } return parent ? vaddr_base_obj : vaddr_sub_obj; } //================================================================================================== bool Device::DestroyVirtualBuffer(amd::Memory* vaddr_mem_obj) { // Argument nullptr check. if (vaddr_mem_obj == nullptr || vaddr_mem_obj->getSvmPtr() == nullptr) { LogPrintfError("Mem obj passed is nullptr, vaddr_mem_obj: %p \n", vaddr_mem_obj); return false; } if (vaddr_mem_obj->parent() != nullptr) { // If parent is not nullptr, this is the sub-buffer object. 
amd::Memory* vaddr_base_obj = amd::MemObjMap::FindVirtualMemObj(vaddr_mem_obj->getSvmPtr()); if (vaddr_base_obj == nullptr) { LogPrintfError("Cannot find mem obj for ptr: 0x%x", vaddr_mem_obj->getSvmPtr()); return false; } vaddr_base_obj->removeSubBuffer(vaddr_mem_obj); } return true; } Device::BlitProgram::~BlitProgram() { if (program_ != nullptr) { program_->release(); } } bool Device::BlitProgram::create(amd::Device* device, const std::string& extraKernels, const std::string& extraOptions) { std::vector devices; devices.push_back(device); int32_t retval = CL_SUCCESS; std::string kernels(device::BlitLinearSourceCode); std::string image_kernels(device::BlitImageSourceCode); if (device->info().imageSupport_) { kernels += image_kernels; } if (!extraKernels.empty()) { kernels += extraKernels; } // Create a program with all blit kernels program_ = new Program(*context_, kernels.c_str(), Program::OpenCL_C); if (program_ == nullptr) { DevLogPrintfError("Program creation for Kernel: %s failed\n", kernels.c_str()); return false; } // Build all kernels std::string opt = "-cl-internal-kernel "; if (!device->settings().useLightning_) { opt += "-Wf,--force_disable_spir "; } if (!extraOptions.empty()) { opt += extraOptions; } if (!GPU_DUMP_BLIT_KERNELS) { opt += " -fno-enable-dump"; } if (device->settings().kernel_arg_opt_) { opt += " -Wb,-amdgpu-kernarg-preload-count=8 "; } #if defined(__clang__) #if __has_feature(address_sanitizer) opt += " -fsanitize=address "; #endif #endif if ((retval = program_->build(devices, opt.c_str(), nullptr, nullptr, GPU_DUMP_BLIT_KERNELS)) != CL_SUCCESS) { DevLogPrintfError("Build failed for Kernel: %s with error code %d\n", kernels.c_str(), retval); return false; } if (!program_->load()) { DevLogPrintfError("Could not load the kernels: %s \n", kernels.c_str()); return false; } return true; } bool Device::init() { assert(!Runtime::initialized() && "initialize only once"); bool ret = false; devices_ = nullptr; appProfile_.init(); // IMPORTANT: 
Note that we are initialiing HSA stack first and then // GPU stack. The order of initialization is signiicant and if changed // amd::Device::registerDevice() must be accordingly modified. #if defined(WITH_HSA_DEVICE) if ((GPU_ENABLE_PAL != 1) || flagIsDefault(GPU_ENABLE_PAL)) { // Return value of roc::Device::init() // If returned false, error initializing HSA stack. // If returned true, either HSA not installed or HSA stack // successfully initialized. ret = roc::Device::init(); if (!ret) { // abort() commentted because this is the only indication // that KFD is not installed. // Ignore the failure and assume KFD is not installed. // abort(); DevLogError("KFD is not installed \n"); } if (!amd::IS_HIP) { ret |= roc::NullDevice::init(); } } #endif // WITH_HSA_DEVICE #if defined(WITH_PAL_DEVICE) if (GPU_ENABLE_PAL != 0) { ret |= PalDeviceLoad(); } #endif // WITH_PAL_DEVICE return ret; } void Device::tearDown() { if (devices_ != nullptr) { for (uint i = 0; i < devices_->size(); ++i) { delete devices_->at(i); } devices_->clear(); delete devices_; } #if defined(WITH_HSA_DEVICE) roc::Device::tearDown(); #endif // WITH_HSA_DEVICE #if defined(WITH_PAL_DEVICE) if (GPU_ENABLE_PAL != 0) { PalDeviceUnload(); } #endif // WITH_PAL_DEVICE } Device::Device() : settings_(nullptr), online_(true), activeWait_(false), blitProgram_(nullptr), context_(nullptr), heap_buffer_(nullptr), initial_heap_buffer_(nullptr), arena_mem_obj_(nullptr), vaCacheAccess_(nullptr), vaCacheMap_(nullptr), index_(0) { memset(&info_, '\0', sizeof(info_)); // By default consider just 1 xcc per device info_.numberOfXccs_ = 1; } Device::~Device() { if (heap_buffer_ != nullptr) { delete heap_buffer_; heap_buffer_ = nullptr; } if (initial_heap_buffer_ != nullptr) { delete initial_heap_buffer_; initial_heap_buffer_ = nullptr; } if (arena_mem_obj_ != nullptr) { arena_mem_obj_->release(); } if (vaCacheMap_) { CondLog(vaCacheMap_->size() != 0, "Application didn't unmap all host memory!"); delete vaCacheMap_; } for 
(auto memory : hostcall_allocated_memories_) { if (memory != nullptr) { amd::MemObjMap::RemoveMemObj( reinterpret_cast(memory->getDeviceMemory(*this, false)->virtualAddress())); memory->release(); } } hostcall_allocated_memories_.clear(); delete vaCacheAccess_; delete settings_; delete[] info_.extensions_; } bool Device::ValidateComgr() { #if defined(USE_COMGR_LIBRARY) // Check if Lightning compiler was requested if (settings_->useLightning_) { constexpr bool kComgrVersioned = false; std::call_once(amd::Comgr::initialized, amd::Comgr::LoadLib, kComgrVersioned); // Use Lightning only if it's available settings_->useLightning_ = amd::Comgr::IsReady(); return settings_->useLightning_; } #endif return true; } bool Device::ValidateHsail() { #if defined(WITH_COMPILER_LIB) // Check if HSAIL compiler was requested if (!settings_->useLightning_) { std::call_once(amd::Hsail::initialized, amd::Hsail::LoadLib); // Use Hsail only if it's available return amd::Hsail::IsReady(); } #endif return true; } size_t GetMaxStackSize(const std::string& procName) { if (procName.find("gfx9") != std::string::npos || procName.find("gfx8") != std::string::npos) { return kMaxStackSize9X; } else if (procName.find("gfx11") != std::string::npos || procName.find("gfx10") != std::string::npos) { return kMaxStackSize11X; } else { return kMaxStackSize12X; } } bool Device::create(const Isa& isa) { assert(!vaCacheAccess_ && !vaCacheMap_); isa_ = &isa; // VA Cache Ops Lock vaCacheAccess_ = new amd::Monitor(true); if (nullptr == vaCacheAccess_) { return false; } vaCacheMap_ = new std::map(); if (nullptr == vaCacheMap_) { return false; } // For OpenCl default stack size needs to be set to 16K if (!amd::IS_HIP) { stack_size_ = 16 * Ki; } maxStackSize_ = GetMaxStackSize(isa_->processorName()); return true; } void Device::registerDevice() { assert(Runtime::singleThreaded() && "this is not thread-safe"); if (devices_ == nullptr) { devices_ = new std::vector; } if (info_.available_) { static bool 
defaultIsAssigned = false; if (!defaultIsAssigned && online_) { defaultIsAssigned = true; info_.type_ |= CL_DEVICE_TYPE_DEFAULT; } } if (isOnline()) { for (const auto& dev : devices()) { if (dev->isOnline()) { index_++; } } } devices_->push_back(this); } void Device::addVACache(device::Memory* memory) const { // Make sure system memory has direct access if (memory->isHostMemDirectAccess()) { // VA cache access must be serialised amd::ScopedLock lk(*vaCacheAccess_); void* start = memory->owner()->getHostMem(); size_t offset; device::Memory* doubleMap = findMemoryFromVA(start, &offset); if (doubleMap == nullptr) { // Insert the new entry vaCacheMap_->insert( std::pair(reinterpret_cast(start), memory)); } else { LogError("Unexpected double map() call from the app!"); } } } void Device::removeVACache(const device::Memory* memory) const { // Make sure system memory has direct access if (memory->isHostMemDirectAccess() && memory->owner()) { // VA cache access must be serialised amd::ScopedLock lk(*vaCacheAccess_); void* start = memory->owner()->getHostMem(); vaCacheMap_->erase(reinterpret_cast(start)); } } device::Memory* Device::findMemoryFromVA(const void* ptr, size_t* offset) const { // VA cache access must be serialised amd::ScopedLock lk(*vaCacheAccess_); uintptr_t key = reinterpret_cast(ptr); auto it = vaCacheMap_->upper_bound(reinterpret_cast(ptr)); if (it == vaCacheMap_->begin()) { return nullptr; } --it; device::Memory* mem = it->second; if (key >= it->first && key < (it->first + mem->size())) { // ptr is in the range *offset = key - it->first; return mem; } return nullptr; } bool Device::IsTypeMatching(cl_device_type type, bool offlineDevices) { if (!(isOnline() || offlineDevices)) { return false; } return (info_.type_ & type) != 0; } std::vector Device::getDevices(cl_device_type type, bool offlineDevices) { std::vector result; if (devices_ == nullptr) { return result; } // Create the list of available devices for (const auto& it : *devices_) { // Check if the 
device type is matched if (it->IsTypeMatching(type, offlineDevices)) { result.push_back(it); } } return result; } size_t Device::numDevices(cl_device_type type, bool offlineDevices) { size_t result = 0; if (devices_ == nullptr) { return 0; } for (const auto& it : *devices_) { // Check if the device type is matched if (it->IsTypeMatching(type, offlineDevices)) { ++result; } } return result; } bool Device::getDeviceIDs(cl_device_type deviceType, uint32_t numEntries, cl_device_id* devices, uint32_t* numDevices, bool offlineDevices) { if (numDevices != nullptr && devices == nullptr) { *numDevices = (uint32_t)amd::Device::numDevices(deviceType, offlineDevices); return (*numDevices > 0) ? true : false; } assert(devices != nullptr && "check the code above"); std::vector ret = amd::Device::getDevices(deviceType, offlineDevices); if (ret.size() == 0) { *not_null(numDevices) = 0; return false; } auto it = ret.cbegin(); uint32_t count = std::min(numEntries, (uint32_t)ret.size()); while (count--) { *devices++ = as_cl(*it++); --numEntries; } while (numEntries--) { *devices++ = (cl_device_id)0; } *not_null(numDevices) = (uint32_t)ret.size(); return true; } bool Device::enableP2P(amd::Device* ptrDev) { assert(ptrDev != nullptr); amd::ScopedLock lock(lockP2P_); Device* peerDev = static_cast(ptrDev); if (std::find(enabled_p2p_devices_.begin(), enabled_p2p_devices_.end(), peerDev) == enabled_p2p_devices_.end()) { enabled_p2p_devices_.push_back(peerDev); // Update access to all old allocations amd::MemObjMap::UpdateAccess(static_cast(this)); } return true; } bool Device::disableP2P(amd::Device* ptrDev) { assert(ptrDev != nullptr); amd::ScopedLock lock(lockP2P_); Device* peerDev = static_cast(ptrDev); // if device is present then remove auto it = std::find(enabled_p2p_devices_.begin(), enabled_p2p_devices_.end(), peerDev); if (it != enabled_p2p_devices_.end()) { enabled_p2p_devices_.erase(it); } return true; } bool Device::UpdateStackSize(uint64_t stackSize) { if (stackSize > 
maxStackSize_) { return false; } stack_size_ = amd::alignUp(stackSize, 16); return true; } bool Device::UpdateInitialHeapSize(uint64_t initialHeapSize) { if (initialHeapSize >= info().globalMemSize_) { return false; } initial_heap_size_ = initialHeapSize; return true; } char* Device::getExtensionString() { std::stringstream extStream; size_t size; char* result = nullptr; // Generate the extension string for (uint i = 0; i < ClExtTotal; ++i) { if (settings().checkExtension(i)) { extStream << OclExtensionsString[i]; } } size = extStream.str().size() + 1; // Create a single string with all extensions result = new char[size]; if (result != nullptr) { memcpy(result, extStream.str().data(), (size - 1)); result[size - 1] = 0; } return result; } // ================================================================================================ bool Device::IpcCreate(void* dev_ptr, size_t* mem_size, char* handle, size_t* mem_offset) const { amd::Memory* amd_mem_obj = amd::MemObjMap::FindMemObj(dev_ptr); if (amd_mem_obj == nullptr) { DevLogPrintfError("Cannot retrieve amd_mem_obj for dev_ptr: 0x%x", dev_ptr); return false; } // Get the original pointer from the amd::Memory object void* orig_dev_ptr = nullptr; if (amd_mem_obj->getSvmPtr() != nullptr) { orig_dev_ptr = amd_mem_obj->getSvmPtr(); } else if (amd_mem_obj->getHostMem() != nullptr) { orig_dev_ptr = amd_mem_obj->getHostMem(); } else { ShouldNotReachHere(); } // Check if the dev_ptr is lesser than original dev_ptr if (orig_dev_ptr > dev_ptr) { // If this happens, then revisit FindMemObj logic DevLogPrintfError("Original dev_ptr: 0x%x cannot be greater than dev_ptr: 0x%x", orig_dev_ptr, dev_ptr); return false; } // Calculate the memory offset from the original base ptr *mem_offset = reinterpret_cast
(dev_ptr) - reinterpret_cast
(orig_dev_ptr) + amd_mem_obj->getOffset(); *mem_size = amd_mem_obj->getSize(); auto dev_mem = static_cast(amd_mem_obj->getDeviceMemory(*this)); auto result = dev_mem->ExportHandle(handle); return result; } // ================================================================================================ bool Device::IpcAttach(const char* handle, size_t mem_size, size_t mem_offset, unsigned int flags, void** dev_ptr) const { amd::Memory* amd_mem_obj = nullptr; // Create an amd Memory object for the handle amd_mem_obj = new (context()) amd::IpcBuffer(context(), flags, mem_offset, mem_size, handle); if (amd_mem_obj == nullptr) { LogError("failed to create a mem object!"); return false; } if (!amd_mem_obj->create(nullptr)) { LogError("failed to create a svm hidden buffer!"); amd_mem_obj->release(); return false; } auto mem_obj_exist = amd::MemObjMap::FindMemObj(amd_mem_obj->getSvmPtr()); if (mem_obj_exist == nullptr) { // Add the original mem_ptr to the MemObjMap with newly created amd_mem_obj amd::MemObjMap::AddMemObj(amd_mem_obj->getSvmPtr(), amd_mem_obj); } else { amd_mem_obj->release(); amd_mem_obj = mem_obj_exist; // Memory already exists, just retain the old one. 
amd_mem_obj->retain(); } *dev_ptr = amd_mem_obj->getSvmPtr(); return true; } // ================================================================================================ void Device::IpcDetach(amd::Memory* amd_mem_obj) const { // Get the original pointer from the amd::Memory object void* orig_dev_ptr = nullptr; if (amd_mem_obj->getSvmPtr() != nullptr) { orig_dev_ptr = amd_mem_obj->getSvmPtr(); } else if (amd_mem_obj->getHostMem() != nullptr) { orig_dev_ptr = amd_mem_obj->getHostMem(); } else { ShouldNotReachHere(); } if (amd::MemObjMap::FindMemObj(orig_dev_ptr)) { amd::MemObjMap::RemoveMemObj(orig_dev_ptr); } } std::vector Device::getActiveQueues() { amd::ScopedLock lock(activeQueuesLock_); for (auto it = activeQueues.begin(); it != activeQueues.end();) { if ((*it)->referenceCount() == 0) { // It is being terminated in HostQueue::terminate(). // We should not wait for commands in a queue being terminated. it = activeQueues.erase(it); } else { // In case the queue will be destroyed in Stream::Destroy(). (*it)->retain(); ++it; } } return std::vector(activeQueues.begin(), activeQueues.end()); } // ================================================================================================= bool Device::GetHandleForAddressRange(void* dev_ptr, size_t size, void* handle) { // Check if the ptr is created through VMM APIs, if true we use different ROCr APIs. amd::Memory* amd_base_obj = amd::MemObjMap::FindVirtualMemObj(dev_ptr); bool VmmPtr = (amd_base_obj != nullptr) ? true : false; // Even if it is VMM ptr, check to make sure the memory is mapped. On hipMalloc'ed ptrs, // make sure the memory is allocated. 
amd::Memory* amd_mem_obj = amd::MemObjMap::FindMemObj(dev_ptr); if (amd_mem_obj == nullptr) { DevLogPrintfError("Cannot retrieve amd_mem_obj for dev_ptr: 0x%x", dev_ptr); return false; } device::Memory* dev_mem = amd_mem_obj->getDeviceMemory(*this); return dev_mem->GetFDHandleForMem(dev_ptr, size, VmmPtr, handle); } // ================================================================================================ void Device::TrackHostcallMemory(amd::Memory* memory) { hostcall_allocated_memories_.push_back(memory); } // ================================================================================================ void Device::RemoveHostcallMemory(amd::Memory* memory) { auto it = std::find(hostcall_allocated_memories_.begin(), hostcall_allocated_memories_.end(), memory); if (it != hostcall_allocated_memories_.end()) { hostcall_allocated_memories_.erase(it); } } } // namespace amd namespace amd::device { Settings::Settings() : value_(0) { assert((ClExtTotal < (8 * sizeof(extensions_))) && "Too many extensions!"); extensions_ = 0; supportRA_ = true; customHostAllocator_ = false; waitCommand_ = AMD_OCL_WAIT_COMMAND; supportDepthsRGB_ = false; fenceScopeAgent_ = AMD_OPT_FLUSH; // Amend certain flags for OpenCL if (!amd::IS_HIP) { if (flagIsDefault(GPU_SINGLE_ALLOC_PERCENT)) { GPU_SINGLE_ALLOC_PERCENT = 85; } if (flagIsDefault(GPU_FORCE_BLIT_COPY_SIZE)) { GPU_FORCE_BLIT_COPY_SIZE = 0; } } gwsInitSupported_ = true; } void Memory::saveMapInfo(const void* mapAddress, const amd::Coord3D origin, const amd::Coord3D region, uint mapFlags, bool entire, amd::Image* baseMip) { // Map/Unmap must be serialized. 
amd::ScopedLock lock(owner()->lockMemoryOps()); WriteMapInfo info = {}; WriteMapInfo* pInfo = &info; auto it = writeMapInfo_.find(mapAddress); if (it != writeMapInfo_.end()) { LogWarning("Double map of the same or overlapped region!"); pInfo = &it->second; } if (mapFlags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION)) { pInfo->origin_ = origin; pInfo->region_ = region; pInfo->entire_ = entire; pInfo->unmapWrite_ = true; } if (mapFlags & CL_MAP_READ) { pInfo->unmapRead_ = true; } pInfo->baseMip_ = baseMip; // Insert into the map if it's the first region if (++pInfo->count_ == 1) { writeMapInfo_.insert({mapAddress, info}); } } ClBinary::ClBinary(const amd::Device& dev, BinaryImageFormat bifVer) : dev_(dev), binary_(nullptr), size_(0), flags_(0), origBinary_(nullptr), origSize_(0), encryptCode_(0), elfIn_(nullptr), elfOut_(nullptr), format_(bifVer) {} ClBinary::~ClBinary() { release(); delete elfIn_; delete elfOut_; } bool ClBinary::setElfTarget() { static const uint32_t Target = 21; assert(((0xFFFF8000 & Target) == 0) && "ASIC target ID >= 2^15"); uint16_t elf_target = static_cast(0x7FFF & Target); return elfOut()->setTarget(elf_target, amd::Elf::CAL_PLATFORM); } #if defined(WITH_COMPILER_LIB) std::string ClBinary::getBIFSymbol(unsigned int symbolID) const { size_t nSymbols = 0; // Due to PRE & POST defines in bif_section_labels.hpp conflict with // PRE & POST struct members in sp3-si-chip-registers.h // unable to include bif_section_labels.hpp in device.hpp //! 
@todo: resolve conflict by renaming defines, // then include bif_section_labels.hpp in device.hpp & // use oclBIFSymbolID instead of unsigned int as a parameter const oclBIFSymbolID symID = static_cast(symbolID); switch (format_) { case BIF_VERSION2: { nSymbols = sizeof(BIF20) / sizeof(oclBIFSymbolStruct); const oclBIFSymbolStruct* symb = findBIFSymbolStruct(BIF20, nSymbols, symID); assert(symb && "BIF20 symbol with symbolID not found"); if (symb) { return std::string(symb->str[bif::PRE]) + std::string(symb->str[bif::POST]); } break; } case BIF_VERSION3: { nSymbols = sizeof(BIF30) / sizeof(oclBIFSymbolStruct); const oclBIFSymbolStruct* symb = findBIFSymbolStruct(BIF30, nSymbols, symID); assert(symb && "BIF30 symbol with symbolID not found"); if (symb) { return std::string(symb->str[bif::PRE]) + std::string(symb->str[bif::POST]); } break; } default: assert(0 && "unexpected BIF type"); return ""; } return ""; } #endif void ClBinary::init(amd::option::Options* optionsObj) { // option has higher priority than environment variable. if ((flags_ & BinarySourceMask) != BinaryRemoveSource) { // set to zero flags_ = (flags_ & (~BinarySourceMask)); flags_ |= (optionsObj->oVariables->BinSOURCE ? BinarySaveSource : BinaryNoSaveSource); } if ((flags_ & BinaryLlvmirMask) != BinaryRemoveLlvmir) { // set to zero flags_ = (flags_ & (~BinaryLlvmirMask)); flags_ |= (optionsObj->oVariables->BinLLVMIR ? BinarySaveLlvmir : BinaryNoSaveLlvmir); } if ((flags_ & BinaryIsaMask) != BinaryRemoveIsa) { // set to zero flags_ = (flags_ & (~BinaryIsaMask)); flags_ |= ((optionsObj->oVariables->BinEXE) ? BinarySaveIsa : BinaryNoSaveIsa); } if ((flags_ & BinaryASMask) != BinaryRemoveAS) { // set to zero flags_ = (flags_ & (~BinaryASMask)); flags_ |= ((optionsObj->oVariables->BinAS) ? 
BinarySaveAS : BinaryNoSaveAS); } } bool ClBinary::isRecompilable(std::string& llvmBinary, amd::Elf::ElfPlatform thePlatform) { /* It is recompilable if there is llvmir that was generated for the same platform (CPU or GPU) and with the same bitness. Note: the bitness has been checked in initClBinary(), no need to check it here. */ if (llvmBinary.empty()) { DevLogError("LLVM Binary string is empty \n"); return false; } uint16_t elf_target; amd::Elf::ElfPlatform platform; if (elfIn()->getTarget(elf_target, platform)) { if (platform == thePlatform) { return true; } if ((platform == amd::Elf::COMPLIB_PLATFORM) && (((thePlatform == amd::Elf::CAL_PLATFORM) && ((elf_target == (uint16_t)EM_HSAIL) || (elf_target == (uint16_t)EM_HSAIL_64))) || ((thePlatform == amd::Elf::CPU_PLATFORM) && ((elf_target == (uint16_t)EM_386) || (elf_target == (uint16_t)EM_X86_64))))) { return true; } } DevLogPrintfError("LLVM_Binary: %s is not recompilable \n", llvmBinary.c_str()); return false; } void ClBinary::release() { if (isBinaryAllocated() && (binary_ != nullptr)) { delete[] binary_; binary_ = nullptr; flags_ &= ~BinaryAllocated; } } void ClBinary::saveBIFBinary(const char* binaryIn, size_t size) { char* image = new char[size]; memcpy(image, binaryIn, size); setBinary(image, size, true); return; } bool ClBinary::createElfBinary(bool doencrypt, Program::type_t type) { release(); size_t imageSize; char* image; assert(elfOut_ && "elfOut_ should be initialized in ClBinary::data()"); // Insert Version string that builds this binary into .comment section const device::Info& devInfo = dev_.info(); std::string buildVerInfo("@(#) "); if (devInfo.version_ != nullptr) { buildVerInfo.append(devInfo.version_); buildVerInfo.append(". 
Driver version: "); buildVerInfo.append(devInfo.driverVersion_); } else { // char OpenCLVersion[256]; // size_t sz; // int32_t ret= clGetPlatformInfo(AMD_PLATFORM, CL_PLATFORM_VERSION, 256, OpenCLVersion, &sz); // if (ret == CL_SUCCESS) { // buildVerInfo.append(OpenCLVersion, sz); // } // If CAL is unavailable, just hard-code the OpenCL driver version buildVerInfo.append("OpenCL 1.1" AMD_PLATFORM_INFO); } elfOut_->addSection(amd::Elf::COMMENT, buildVerInfo.data(), buildVerInfo.size()); switch (type) { case Program::TYPE_NONE: { elfOut_->setType(ET_NONE); break; } case Program::TYPE_COMPILED: { elfOut_->setType(ET_REL); break; } case Program::TYPE_LIBRARY: { elfOut_->setType(ET_DYN); break; } case Program::TYPE_EXECUTABLE: { elfOut_->setType(ET_EXEC); break; } default: assert(0 && "unexpected elf type"); } if (!elfOut_->dumpImage(&image, &imageSize)) { DevLogError("Dump Image failed \n"); return false; } if (tempFile_) { std::remove(fname_.c_str()); } #if defined(HAVE_BLOWFISH_H) if (doencrypt) { // Increase the size by 64 to accomodate extra headers int outBufSize = (int)(imageSize + 64); char* outBuf = new char[outBufSize]; if (outBuf == nullptr) { return false; } memset(outBuf, '\0', outBufSize); int outBytes = 0; bool success = amd::oclEncrypt(0, image, imageSize, outBuf, outBufSize, &outBytes); delete[] image; if (!success) { delete[] outBuf; DevLogError("Cannot succesfully OCL Encrypt Image"); return false; } image = outBuf; imageSize = outBytes; } #endif setBinary(image, imageSize, true); return true; } Program::binary_t ClBinary::data() const { return {binary_, size_}; } Program::finfo_t ClBinary::Datafd() const { return {fdesc_, foffset_}; } std::string ClBinary::DataURI() const { return uri_; } bool ClBinary::setBinary(const char* theBinary, size_t theBinarySize, bool allocated, amd::Os::FileDesc fdesc, size_t foffset, std::string uri) { release(); size_ = theBinarySize; binary_ = theBinary; if (allocated) { flags_ |= BinaryAllocated; } fdesc_ = fdesc; 
foffset_ = foffset; uri_ = uri; return true; } void ClBinary::setFlags(int encryptCode) { encryptCode_ = encryptCode; if (encryptCode != 0) { flags_ = (flags_ & (~(BinarySourceMask | BinaryLlvmirMask | BinaryIsaMask | BinaryASMask))); flags_ |= (BinaryRemoveSource | BinaryRemoveLlvmir | BinarySaveIsa | BinaryRemoveAS); } } bool ClBinary::decryptElf(const char* binaryIn, size_t size, char** decryptBin, size_t* decryptSize, int* encryptCode) { *decryptBin = nullptr; #if defined(HAVE_BLOWFISH_H) int outBufSize = 0; if (amd::isEncryptedBIF(binaryIn, (int)size, &outBufSize)) { char* outBuf = new (std::nothrow) char[outBufSize]; if (outBuf == nullptr) { return false; } // Decrypt int outDataSize = 0; if (!amd::oclDecrypt(binaryIn, (int)size, outBuf, outBufSize, &outDataSize)) { delete[] outBuf; DevLogError("Cannot Decrypt Image \n"); return false; } *decryptBin = reinterpret_cast(outBuf); *decryptSize = outDataSize; *encryptCode = 1; } #endif return true; } bool ClBinary::setElfIn() { if (elfIn_) return true; if (binary_ == nullptr) { return false; } elfIn_ = new amd::Elf(ELFCLASSNONE, binary_, size_, nullptr, amd::Elf::ELF_C_READ); if ((elfIn_ == nullptr) || !elfIn_->isSuccessful()) { delete elfIn_; elfIn_ = nullptr; LogError("Creating input ELF object failed"); return false; } return true; } void ClBinary::resetElfIn() { delete elfIn_; elfIn_ = nullptr; } bool ClBinary::setElfOut(unsigned char eclass, const char* outFile, bool tempFile) { elfOut_ = new amd::Elf(eclass, nullptr, 0, outFile, amd::Elf::ELF_C_WRITE); if ((elfOut_ == nullptr) || !elfOut_->isSuccessful()) { delete elfOut_; elfOut_ = nullptr; LogError("Creating output ELF object failed"); return false; } fname_ = outFile; tempFile_ = tempFile; return setElfTarget(); } void ClBinary::resetElfOut() { delete elfOut_; elfOut_ = nullptr; } bool ClBinary::loadLlvmBinary(std::string& llvmBinary, amd::Elf::ElfSections& elfSectionType) const { // Check if current binary already has LLVMIR char* section = nullptr; 
size_t sz = 0; const amd::Elf::ElfSections SectionTypes[] = {amd::Elf::LLVMIR, amd::Elf::SPIR, amd::Elf::SPIRV}; for (int i = 0; i < 3; ++i) { if (elfIn_->getSection(SectionTypes[i], §ion, &sz) && section && sz > 0) { llvmBinary.append(section, sz); elfSectionType = SectionTypes[i]; return true; } } DevLogPrintfError("Cannot Load LLVM Binary: %s \n", llvmBinary.c_str()); return false; } bool ClBinary::loadCompileOptions(std::string& compileOptions) const { char* options = nullptr; size_t sz; compileOptions.clear(); #if defined(WITH_COMPILER_LIB) if (elfIn_->getSymbol(amd::Elf::COMMENT, getBIFSymbol(symOpenclCompilerOptions).c_str(), &options, &sz)) { if (sz > 0) { compileOptions.append(options, sz); } return true; } #endif return false; } bool ClBinary::loadLinkOptions(std::string& linkOptions) const { char* options = nullptr; size_t sz; linkOptions.clear(); #if defined(WITH_COMPILER_LIB) if (elfIn_->getSymbol(amd::Elf::COMMENT, getBIFSymbol(symOpenclLinkerOptions).c_str(), &options, &sz)) { if (sz > 0) { linkOptions.append(options, sz); } return true; } #endif return false; } void ClBinary::storeCompileOptions(const std::string& compileOptions) { #if defined(WITH_COMPILER_LIB) elfOut()->addSymbol(amd::Elf::COMMENT, getBIFSymbol(symOpenclCompilerOptions).c_str(), compileOptions.c_str(), compileOptions.length()); #endif } void ClBinary::storeLinkOptions(const std::string& linkOptions) { #if defined(WITH_COMPILER_LIB) elfOut()->addSymbol(amd::Elf::COMMENT, getBIFSymbol(symOpenclLinkerOptions).c_str(), linkOptions.c_str(), linkOptions.length()); #endif } bool ClBinary::isSPIR() const { char* section = nullptr; size_t sz = 0; if (elfIn_->getSection(amd::Elf::LLVMIR, §ion, &sz) && section && sz > 0) return false; if (elfIn_->getSection(amd::Elf::SPIR, §ion, &sz) && section && sz > 0) return true; return false; } bool ClBinary::isSPIRV() const { char* section = nullptr; size_t sz = 0; if (elfIn_->getSection(amd::Elf::SPIRV, §ion, &sz) && section && sz > 0) { return true; 
} return false; } } // namespace amd::device