// // Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. // #include "top.hpp" #include "os/os.hpp" #include "device/device.hpp" #include "device/pal/paldefs.hpp" #include "device/pal/palmemory.hpp" #include "device/pal/palkernel.hpp" #include "device/pal/palprogram.hpp" #include "device/pal/palprintf.hpp" #include #include #include namespace pal { PrintfDbg::PrintfDbg(Device& device, FILE* file) : dbgBuffer_(nullptr), dbgFile_(file), gpuDevice_(device), wiDbgSize_(0), initCntValue_(device, 4) {} PrintfDbg::~PrintfDbg() { delete dbgBuffer_; } bool PrintfDbg::create() { // Create a resource for the init count value if (initCntValue_.create(Resource::Remote)) { uint32_t* value = reinterpret_cast(initCntValue_.map(nullptr)); // The counter starts from 1 if (nullptr != value) { *value = 1; } else { return false; } initCntValue_.unmap(nullptr); return true; } return false; } bool PrintfDbg::init(VirtualGPU& gpu, bool printfEnabled, const amd::NDRange& size) { // Set up debug output buffer (if printf active) if (printfEnabled) { if (!allocate()) { return false; } // Make sure that the size isn't bigger than the reported max if (size.product() <= dev().settings().maxWorkGroupSize_) { size_t wiDbgSizeTmp; // Calculate the debug buffer size per workitem wiDbgSizeTmp = std::min(dbgBuffer_->size() / size.product(), dev().xferRead().bufSize()); // Make sure the size is DWORD aligned wiDbgSizeTmp = amd::alignDown(wiDbgSizeTmp, sizeof(uint32_t)); // If the new size is different, then clear the initial values if (wiDbgSize_ != wiDbgSizeTmp) { wiDbgSize_ = wiDbgSizeTmp; if (!clearWorkitems(gpu, 0, size.product())) { wiDbgSize_ = 0; return false; } } } } return true; } bool PrintfDbg::output(VirtualGPU& gpu, bool printfEnabled, const amd::NDRange& size, const std::vector& printfInfo) { // Are we expected to generate debug output? if (printfEnabled && !printfInfo.empty()) { uint32_t* workitemData; size_t i, j, k, z; bool realloc = false; // Wait for kernel execution gpu.waitAllEngines(); size_t zdim = 1; size_t ydim = 1; size_t xdim = 1; switch (size.dimensions()) { case 3: zdim = size[2]; // Fall through ... case 2: ydim = size[1]; // Fall through ... case 1: xdim = size[0]; // Fall through ... default: break; } for (k = 0; k < zdim; ++k) { for (j = 0; j < ydim; ++j) { for (i = 0; i < xdim; ++i) { size_t idx = (xdim * (ydim * k + j) + i); workitemData = mapWorkitem(gpu, idx, &realloc); if (nullptr != workitemData) { uint32_t wp = workitemData[0]; // write pointer (i.e. first unwritten element) // Walk through each PrintfDbg entry for (z = 1; (z < (wiDbgSize() / sizeof(uint32_t))) && (z < wp);) { if (printfInfo.size() < workitemData[z]) { LogError("The format string wasn't reported"); return false; } // Get the PrintfDbg info const device::PrintfInfo& info = printfInfo[workitemData[z++]]; // There's something in this buffer outputDbgBuffer(info, workitemData, z); } } unmapWorkitem(gpu, workitemData); } } } // Reallocate debug buffer if necessary if (!allocate(realloc)) { return false; } } return true; } bool PrintfDbg::allocate(bool realloc) { if (nullptr == dbgBuffer_) { dbgBuffer_ = dev().createScratchBuffer(dev().info().printfBufferSize_); } else if (realloc) { LogWarning("Debug buffer reallocation!"); // Double the buffer size if it's not big enough size_t size = dbgBuffer_->size(); delete dbgBuffer_; dbgBuffer_ = dev().createScratchBuffer(size << 1); } return (nullptr != dbgBuffer_) ? true : false; } bool PrintfDbg::checkFloat(const std::string& fmt) const { switch (fmt[fmt.size() - 1]) { case 'e': case 'E': case 'f': case 'g': case 'G': case 'a': return true; break; default: break; } return false; } bool PrintfDbg::checkString(const std::string& fmt) const { if (fmt[fmt.size() - 1] == 's') return true; return false; } int PrintfDbg::checkVectorSpecifier(const std::string& fmt, size_t startPos, size_t& curPos) const { int vectorSize = 0; size_t pos = curPos; size_t size = curPos - startPos; if (size >= 3) { size = 0; // no modifiers if (fmt[curPos - 3] == 'v') { size = 2; } // the modifiers are "h" or "l" else if (fmt[curPos - 4] == 'v') { size = 3; } // the modifier is "hh" else if ((curPos >= 5) && (fmt[curPos - 5] == 'v')) { size = 4; } if (size > 0) { curPos = size; pos -= curPos; // Get vector size vectorSize = fmt[pos++] - '0'; // PrintfDbg supports only 2, 3, 4, 8 and 16 wide vectors switch (vectorSize) { case 1: if ((fmt[pos++] - '0') == 6) { vectorSize = 16; } else { vectorSize = 0; } break; case 2: case 3: case 4: case 8: break; default: vectorSize = 0; break; } } } return vectorSize; } static const size_t ConstStr = 0xffffffff; static const char Separator[] = ",\0"; size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, size_t size, const uint32_t* argument) const { // Serialize the output to the screen amd::ScopedLock k(dev().lockAsyncOps()); size_t copiedBytes = size; // Print the string argument, using standard PrintfDbg() if (checkString(fmt.c_str())) { // copiedBytes should be as number of printed chars copiedBytes = 0; //(null) should be printed if (*argument == 0) { amd::Os::printf(fmt.data(), 0); // copiedBytes = strlen("(null)") copiedBytes = 6; } else { const unsigned char* argumentStr = reinterpret_cast(argument); amd::Os::printf(fmt.data(), argumentStr); // copiedBytes = strlen(argumentStr) while (argumentStr[copiedBytes++] != 0) ; } } // Print the argument(except for string ), using standard PrintfDbg() else { bool hlModifier = (strstr(fmt.c_str(), "hl") != nullptr); std::string hlFmt; if (hlModifier) { hlFmt = fmt; hlFmt.erase(hlFmt.find_first_of("hl"), 2); } switch (size) { case 0: { const char* str = reinterpret_cast(argument); amd::Os::printf(fmt.data(), str); // Find the string length while (str[copiedBytes++] != 0) ; } break; case 1: amd::Os::printf(fmt.data(), *(reinterpret_cast(argument))); break; case 2: case 4: if (printFloat) { static const char* fSpecifiers = "eEfgGa"; std::string fmtF = fmt; size_t posS = fmtF.find_first_of("%"); size_t posE = fmtF.find_first_of(fSpecifiers); if (posS != std::string::npos && posE != std::string::npos) { fmtF.replace(posS + 1, posE - posS, "s"); } float fArg = *(reinterpret_cast(argument)); float fSign = copysign(1.0, fArg); if (isinf(fArg) && !isnan(fArg)) { if (fSign < 0) { amd::Os::printf(fmtF.data(), "-infinity"); } else { amd::Os::printf(fmtF.data(), "infinity"); } } else if (isnan(fArg)) { if (fSign < 0) { amd::Os::printf(fmtF.data(), "-nan"); } else { amd::Os::printf(fmtF.data(), "nan"); } } else if (hlModifier) { amd::Os::printf(hlFmt.data(), fArg); } else { amd::Os::printf(fmt.data(), fArg); } } else { bool hhModifier = (strstr(fmt.c_str(), "hh") != nullptr); if (hhModifier) { // current implementation of printf in gcc 4.5.2 runtime libraries, doesn`t recognize // "hh" modifier ==> // argument should be explicitly converted to unsigned char (uchar) before printing and // fmt should be updated not to contain "hh" modifier std::string hhFmt = fmt; hhFmt.erase(hhFmt.find_first_of("h"), 2); amd::Os::printf(hhFmt.data(), *(reinterpret_cast(argument))); } else if (hlModifier) { amd::Os::printf(hlFmt.data(), *argument); } else { amd::Os::printf(fmt.data(), *argument); } } break; case 8: if (printFloat) { if (hlModifier) { amd::Os::printf(hlFmt.data(), *(reinterpret_cast(argument))); } else { amd::Os::printf(fmt.data(), *(reinterpret_cast(argument))); } } else { std::string out = fmt; // Use 'll' for 64 bit printf out.insert((out.size() - 1), 1, 'l'); amd::Os::printf(out.data(), *(reinterpret_cast(argument))); } break; case ConstStr: { const char* str = reinterpret_cast(argument); amd::Os::printf(fmt.data(), str); } break; default: amd::Os::printf("Error: Unsupported data size for PrintfDbg. %d bytes", static_cast(size)); return 0; } } fflush(stdout); return copiedBytes; } void PrintfDbg::outputDbgBuffer(const device::PrintfInfo& info, const uint32_t* workitemData, size_t& i) const { static const char* specifiers = "cdieEfgGaosuxXp"; static const char* modifiers = "hl"; static const char* special = "%n"; static const std::string sepStr = "%s"; const uint32_t* s = workitemData; size_t pos = 0; // Find the format string std::string str = info.fmtString_; std::string fmt; size_t posStart, posEnd; // Print all arguments // Note: the following code walks through all arguments, provided by the kernel and // finds the corresponding specifier in the format string. // Then it splits the original string into substrings with a single specifier and // uses standard PrintfDbg() to print each argument for (uint j = 0; j < info.arguments_.size(); ++j) { do { posStart = str.find_first_of("%", pos); if (posStart != std::string::npos) { posStart++; // Erase all spaces after % while (str[posStart] == ' ') { str.erase(posStart, 1); } size_t tmp = str.find_first_of(special, posStart); size_t tmp2 = str.find_first_of(specifiers, posStart); // Special cases. Special symbol is located before any specifier if (tmp < tmp2) { posEnd = posStart + 1; fmt = str.substr(pos, posEnd - pos); fmt.erase(posStart - pos - 1, 1); pos = posStart = posEnd; outputArgument(sepStr, false, ConstStr, reinterpret_cast(fmt.data())); continue; } break; } else if (pos < str.length()) { outputArgument(sepStr, false, ConstStr, reinterpret_cast((str.substr(pos)).data())); } } while (posStart != std::string::npos); if (posStart != std::string::npos) { bool printFloat = false; int vectorSize = 0; size_t idPos = 0; // Search for PrintfDbg specifier in the format string. // It will be a split point for the output posEnd = str.find_first_of(specifiers, posStart); if (posEnd == std::string::npos) { pos = posStart = posEnd; break; } posEnd++; size_t curPos = posEnd; vectorSize = checkVectorSpecifier(str, posStart, curPos); // Get substring from the last position to the current specifier fmt = str.substr(pos, posEnd - pos); // Readjust the string pointer if PrintfDbg outputs a vector if (vectorSize != 0) { size_t posVecSpec = fmt.length() - (curPos + 1); size_t posVecMod = fmt.find_first_of(modifiers, posVecSpec + 1); size_t posMod = str.find_first_of(modifiers, posStart); if (posMod < posEnd) { fmt = fmt.erase(posVecSpec, posVecMod - posVecSpec); } else { fmt = fmt.erase(posVecSpec, curPos); } idPos = posStart - pos - 1; } pos = posStart = posEnd; // Find out if the argument is a float printFloat = checkFloat(fmt); // Is it a scalar value? if (vectorSize == 0) { size_t length = outputArgument(fmt, printFloat, info.arguments_[j], &s[i]); if (0 == length) { return; } i += amd::alignUp(length, sizeof(uint32_t)) / sizeof(uint32_t); } else { // 3-component vector's size is defined as 4 * size of each scalar component size_t elemSize = info.arguments_[j] / (vectorSize == 3 ? 4 : vectorSize); size_t k = i * sizeof(uint32_t); std::string elementStr = fmt.substr(idPos, fmt.size()); // Print first element with full string if (0 == outputArgument(fmt, printFloat, elemSize, &s[i])) { return; } // Print other elemnts with separator if available for (int e = 1; e < vectorSize; ++e) { const char* t = reinterpret_cast(s); // Output the vector separator outputArgument(sepStr, false, ConstStr, reinterpret_cast(Separator)); // Output the next element outputArgument(elementStr, printFloat, elemSize, reinterpret_cast(&t[k + e * elemSize])); } i += (amd::alignUp(info.arguments_[j], sizeof(uint32_t))) / sizeof(uint32_t); } } } if (pos != std::string::npos) { fmt = str.substr(pos, str.size() - pos); outputArgument(sepStr, false, ConstStr, reinterpret_cast(fmt.data())); } } bool PrintfDbg::clearWorkitems(VirtualGPU& gpu, size_t idxStart, size_t number) const { // Go through all locations for every thread and copy 1 for (uint i = idxStart; i < idxStart + number; ++i) { amd::Coord3D dst(i * wiDbgSize(), 0, 0); amd::Coord3D size(sizeof(uint32_t), 0, 0); // Copy 1 into the corresponding location in the debug buffer if (!initCntValue_.partialMemCopyTo(gpu, amd::Coord3D(0, 0, 0), dst, size, *dbgBuffer_)) { return false; } } return true; } uint32_t* PrintfDbg::mapWorkitem(VirtualGPU& gpu, size_t idx, bool* realloc) { uint32_t wiSize = 0; amd::Coord3D src(idx * wiDbgSize(), 0, 0); xferBufRead_ = &(dev().xferRead().acquire()); // Copy workitem size from the corresponding location in the debug buffer if (!dbgBuffer_->partialMemCopyTo(gpu, src, amd::Coord3D(0, 0, 0), amd::Coord3D(sizeof(uint32_t), 0, 0), *xferBufRead_)) { return nullptr; } // Get memory pointer to the satged buffer uint32_t* workitem = reinterpret_cast(xferBufRead_->map(&gpu)); if (nullptr == workitem) { return nullptr; } // Copy size value wiSize = *workitem; xferBufRead_->unmap(&gpu); // Check if the cuurent workitem almost reached the size limit if ((wiDbgSize() - static_cast(wiSize)) < 3) { *realloc = true; } // If the current workitem had any output then get the data if ((wiSize > 1) && (wiSize <= wiDbgSize())) { amd::Coord3D size(wiSize * sizeof(uint32_t), 0, 0); // Copy the current workitem output data to the staged buffer if (!dbgBuffer_->partialMemCopyTo(gpu, src, amd::Coord3D(0, 0, 0), size, *xferBufRead_) || // Clear the write pointer back to index 1 for the current workitem !clearWorkitems(gpu, idx, 1)) { LogError("Reading the workitem data failed!"); return nullptr; } // Get a pointer to the workitem data workitem = reinterpret_cast(xferBufRead_->map(&gpu)); return workitem; } return nullptr; } void PrintfDbg::unmapWorkitem(VirtualGPU& gpu, const uint32_t* workitemData) const { if (nullptr != workitemData) { xferBufRead_->unmap(&gpu); } dev().xferRead().release(gpu, *xferBufRead_); } bool PrintfDbgHSA::init(VirtualGPU& gpu, bool printfEnabled) { // Set up debug output buffer (if printf active) if (printfEnabled) { if (!allocate()) { return false; } // The first two DWORDs in the printf buffer are as follows: // First DWORD = Offset to where next information is to // be written, initialized to 0 // Second DWORD = Number of bytes available for printf data // = buffer size � 2*sizeof(uint32_t) const uint8_t initSize = 2 * sizeof(uint32_t); uint8_t sysMem[initSize]; memset(sysMem, 0, initSize); uint32_t dbgBufferSize = dbgBuffer_->size() - initSize; memcpy(&sysMem[4], &dbgBufferSize, sizeof(dbgBufferSize)); // Copy offset and number of bytes available for printf data // into the corresponding location in the debug buffer dbgBuffer_->writeRawData(gpu, 0, initSize, sysMem, true); } return true; } bool PrintfDbgHSA::output(VirtualGPU& gpu, bool printfEnabled, const std::vector& printfInfo) { if (printfEnabled) { uint32_t offsetSize = 0; xferBufRead_ = &(dev().xferRead().acquire()); // Copy offset from the first DWORD in the debug buffer if (!dbgBuffer_->partialMemCopyTo(gpu, amd::Coord3D(0, 0, 0), amd::Coord3D(0, 0, 0), amd::Coord3D(sizeof(uint32_t), 0, 0), *xferBufRead_)) { return false; } // Get memory pointer to the satged buffer uint32_t* dbgBufferPtr = reinterpret_cast(xferBufRead_->map(&gpu)); if (nullptr == dbgBufferPtr) { return false; } offsetSize = *dbgBufferPtr; xferBufRead_->unmap(&gpu); if (offsetSize == 0) { LogInfo("The printf buffer is empty!"); dev().xferRead().release(gpu, *xferBufRead_); return true; } size_t bufSize = dev().xferRead().bufSize(); size_t copySize = offsetSize; while (copySize != 0) { // Copy the buffer data (i.e., the printfID followed by the // argument data for each printf call in th kernel) to the staged buffer if (!dbgBuffer_->partialMemCopyTo( gpu, amd::Coord3D(2 * sizeof(uint32_t) + offsetSize - copySize, 0, 0), amd::Coord3D(0, 0, 0), std::min(copySize, bufSize), *xferBufRead_)) { return false; } // Get a pointer to the buffer data dbgBufferPtr = reinterpret_cast(xferBufRead_->map(&gpu)); if (nullptr == dbgBufferPtr) { return false; } uint sb = 0; uint sbt = 0; // parse the debug buffer while (sbt < copySize) { if (*dbgBufferPtr >= printfInfo.size()) { LogError("Couldn't find the reported PrintfID!"); return false; } const device::PrintfInfo& info = printfInfo[(*dbgBufferPtr)]; sb += sizeof(uint32_t); for (const auto& it : info.arguments_) { sb += it; } if (sbt + sb > bufSize) { break; // Need new portion of data in staging buffer } size_t idx = 1; // There's something in the debug buffer outputDbgBuffer(info, dbgBufferPtr, idx); sbt += sb; dbgBufferPtr += sb / sizeof(uint32_t); sb = 0; } copySize -= sbt; xferBufRead_->unmap(&gpu); } dev().xferRead().release(gpu, *xferBufRead_); } return true; } } // namespace pal