diff --git a/projects/clr/rocclr/device/devhcmessages.cpp b/projects/clr/rocclr/device/devhcmessages.cpp index 34319449dc..f654958224 100644 --- a/projects/clr/rocclr/device/devhcmessages.cpp +++ b/projects/clr/rocclr/device/devhcmessages.cpp @@ -133,7 +133,9 @@ void MessageHandler::discardMessage(Message* message) { } // Defined in devhcprintf.cpp +namespace amd { void handlePrintf(uint64_t* output, const uint64_t* input, uint64_t len); +} // namespace amd bool MessageHandler::handlePayload(uint32_t service, uint64_t* payload) { Message* message = nullptr; @@ -166,7 +168,7 @@ bool MessageHandler::handlePayload(uint32_t service, uint64_t* payload) { switch (service) { case SERVICE_PRINTF: - handlePrintf(payload, message->data_.data(), message->data_.size()); + amd::handlePrintf(payload, message->data_.data(), message->data_.size()); break; default: ClPrint(amd::LOG_ERROR, amd::LOG_ALWAYS, "Hostcall: Messages not supported for service %d", diff --git a/projects/clr/rocclr/device/devhcprintf.cpp b/projects/clr/rocclr/device/devhcprintf.cpp index a5aba4a41f..56c1f04e32 100644 --- a/projects/clr/rocclr/device/devhcprintf.cpp +++ b/projects/clr/rocclr/device/devhcprintf.cpp @@ -21,6 +21,7 @@ /** \file Format string processing for printf based on hostcall messages. */ +#include "device/devkernel.hpp" #include #include #include @@ -28,6 +29,7 @@ #include #include +namespace amd { static void checkPrintf(FILE* stream, int* outCount, const char* fmt, ...) { va_list args; va_start(args, fmt); @@ -245,3 +247,45 @@ void handlePrintf(uint64_t* output, const uint64_t* input, uint64_t len) { *output = format(stream, input, end); } + +// Extract the format string hash and the format string. +// The compiler generates the amdhsa.printf metadata in the +// following format for the HIP non-hostcall case: +// "0:0:<format_string_hash>,<format_string>" +// i.e. the hash is the leading part of the format string itself, +// delimited from the rest by the character ','. 
+bool populateFormatStringHashMap(
+  const std::vector &printfInfo,
+  std::map &strMap) {
+  for (const auto& it : printfInfo) {  // bind by reference: no per-entry PrintfInfo copy
+    auto Delim = it.fmtString_.find_first_of(',');  // the hash is the text before the first ','
+    auto HashStr = it.fmtString_.substr(0, Delim);
+    auto HashVal = strtoull(HashStr.c_str(), nullptr, 16);  // 64-bit parse; strtoul clamps where long is 32 bits
+    if (strMap.find(HashVal) != strMap.end()) {
+      LogError("Hash value collision detected, printf buffer ill formed");
+      return false;
+    }
+    strMap[HashVal] = it.fmtString_.substr(Delim + 1, it.fmtString_.size());
+  }
+
+  return true;
+}
+
+void handlePrintfDelayed(const uint64_t* input, uint64_t len, uint64_t control)
+{
+  auto end = input + len;
+  FILE* stream = stdout;
+
+  // Bit 0 (the LSB) of the control word selects the output stream.
+  const uint64_t CTRL_MASK = 1;
+
+  // Output goes to stderr if the LSB is set, otherwise to stdout.
+  if (control & CTRL_MASK) {
+    stream = stderr;
+  }
+
+  format(stream, input, end);
+
+}
+
+} // namespace amd
diff --git a/projects/clr/rocclr/device/devkernel.cpp b/projects/clr/rocclr/device/devkernel.cpp
index 5526f26238..fdd20a0e37 100644
--- a/projects/clr/rocclr/device/devkernel.cpp
+++ b/projects/clr/rocclr/device/devkernel.cpp
@@ -1477,6 +1477,7 @@ void Kernel::InitParameters(const aclArgData* aclArg, uint32_t argBufferSize) {
 // ================================================================================================
 #if defined(USE_COMGR_LIBRARY)
 void Kernel::InitPrintf(const std::vector& printfInfoStrings) {
+  size_t HIPPrintfInfoID = 0;
   for (auto str : printfInfoStrings) {
     std::vector tokens;
 
@@ -1493,10 +1494,20 @@ void Kernel::InitPrintf(const std::vector& printfInfoStrings) {
     }
 
     pos = 0;
-    size_t printfInfoID = std::stoi(tokens[pos++]);
-    if (printf_.size() <= printfInfoID) {
-      printf_.resize(printfInfoID + 1);
+    size_t printfInfoID;
+
+    if(amd::IS_HIP) {
+      printfInfoID = HIPPrintfInfoID++;
+      printf_.resize(HIPPrintfInfoID);
+      pos++;
     }
+    else {
+      printfInfoID = std::stoi(tokens[pos++]);
+      if (printf_.size() <= printfInfoID) {
+        printf_.resize(printfInfoID + 1);
+      }
+    }
+
     PrintfInfo& 
info = printf_[printfInfoID]; size_t numSizes = std::stoi(tokens[pos++]); @@ -1514,7 +1525,13 @@ void Kernel::InitPrintf(const std::vector& printfInfoStrings) { } // FIXME: We should not need this! [ - std::string& fmt = tokens[pos]; + std::string fmt; + // The format string itself may contain ':' characters, so re-join all remaining tokens with ':' + for(int i = 0; pos < tokens.size(); i++) { + if(i) fmt += ':'; + fmt += tokens[pos++]; + } + bool need_nl = true; for (pos = 0; pos < fmt.size(); ++pos) { @@ -1559,7 +1576,7 @@ void Kernel::InitPrintf(const std::vector& printfInfoStrings) { } info.fmtString_.push_back(symbol); } - if (need_nl) { + if (need_nl && !amd::IS_HIP) { info.fmtString_ += "\n"; } // ] @@ -1570,12 +1587,19 @@ void Kernel::InitPrintf(const std::vector& printfInfoStrings) { // ================================================================================================ #if defined(WITH_COMPILER_LIB) void Kernel::InitPrintf(const aclPrintfFmt* aclPrintf) { - uint index = 0; + uint index = 0, HIPIndex = 0; for (; aclPrintf->struct_size != 0; aclPrintf++) { - index = aclPrintf->ID; - if (printf_.size() <= index) { - printf_.resize(index + 1); + if(amd::IS_HIP) { + index = HIPIndex++; + printf_.resize(HIPIndex); } + else { + index = aclPrintf->ID; + if (printf_.size() <= index) { + printf_.resize(index + 1); + } + } + PrintfInfo& info = printf_[index]; const std::string& pfmt = aclPrintf->fmtStr; bool need_nl = true; @@ -1621,7 +1645,7 @@ void Kernel::InitPrintf(const aclPrintfFmt* aclPrintf) { } info.fmtString_.push_back(symbol); } - if (need_nl) { + if (need_nl && !amd::IS_HIP) { info.fmtString_ += "\n"; } uint32_t* tmp_ptr = const_cast(aclPrintf->argSizes); diff --git a/projects/clr/rocclr/device/pal/palprintf.cpp b/projects/clr/rocclr/device/pal/palprintf.cpp index 61f1fead03..7a03e4553a 100644 --- a/projects/clr/rocclr/device/pal/palprintf.cpp +++ b/projects/clr/rocclr/device/pal/palprintf.cpp @@ -30,6 +30,14 @@ #include #include +// Functions defined in devhcprintf.cpp +namespace amd 
{
+void handlePrintfDelayed(const uint64_t* input, uint64_t len, uint64_t control);
+bool populateFormatStringHashMap(
+  const std::vector &printfInfo,
+  std::map &strMap);
+} // namespace amd
+
 namespace pal {
 
 PrintfDbg::PrintfDbg(Device& device, FILE* file)
@@ -599,6 +607,11 @@ bool PrintfDbgHSA::output(VirtualGPU& gpu, bool printfEnabled,
 
     size_t bufSize = dev().xferRead().bufSize();
     size_t copySize = offsetSize;
+
+    // Map between 64 bit MD5 format string hash and
+    // actual format string
+    std::map StrMap;
+
     while (copySize != 0) {
       // Copy the buffer data (i.e., the printfID followed by the
       // argument data for each printf call in th kernel) to the staged buffer
@@ -617,6 +630,66 @@ bool PrintfDbgHSA::output(VirtualGPU& gpu, bool printfEnabled,
       uint sb = 0;
       uint sbt = 0;
 
+      // Handle HIP nonhostcall printf here,
+      if (amd::IS_HIP) {
+        auto BufferForHIP = reinterpret_cast(dbgBufferPtr);
+
+        // Populate the string map only once: StrMap outlives this staging-buffer
+        // pass, and re-adding the same hashes would be reported as a collision.
+        if (StrMap.empty() && !amd::populateFormatStringHashMap(printfInfo, StrMap))
+          return false;
+
+        while (sbt < copySize) {
+          auto controlDword = *BufferForHIP++;
+          uint64_t nextOffset = controlDword >> 2;
+
+          if (sbt + nextOffset > bufSize) {
+            break; // Need new portion of data in staging buffer
+          }
+
+          auto PB = (uint64_t*)BufferForHIP;
+          std::vector PBuffer;
+          uint64_t BufferLen = 0;
+          if (controlDword & 2U) {
+            // Process the constant format string case.
+            // The first value is the 64 bit format string hash
+            // and remaining values are printf arguments.
+            // Construct a temporary buffer with actual format
+            // string followed by arguments. The format string is
+            // obtained by querying StrMap populated before.
+            auto ArgsLen = nextOffset - 12;
+            const auto& Str = StrMap[*PB++];
+            auto StrLenWithNull = Str.size() + 1;
+            BufferLen = ArgsLen + amd::alignUp(StrLenWithNull, sizeof(uint64_t));
+            PBuffer.resize(BufferLen);
+            memcpy(PBuffer.data(), Str.c_str(), StrLenWithNull);
+            // Padding after the string is already zero: resize() value-initialized the fresh buffer.
+            memcpy(PBuffer.data() + amd::alignUp(StrLenWithNull, sizeof(uint64_t)),
+                   PB, ArgsLen);
+          }
+          else {
+            // Process Non constant format string case.
+            // Here, The buffer itself contains the actual
+            // format string and hence just copy the contents
+            // of format string and arguments into a temporary
+            // buffer (BufferLen bytes, i.e. the record minus its control dword).
+            BufferLen = nextOffset - /*ControlDWord*/4;
+            PBuffer.resize(BufferLen);
+            memcpy(PBuffer.data(), BufferForHIP, BufferLen);
+          }
+
+          // Handle printing
+          amd::handlePrintfDelayed((uint64_t*)PBuffer.data(), BufferLen / 8,
+                                   controlDword);
+          BufferForHIP += (nextOffset / 4) - /*ControlDWord*/1;
+          sbt += nextOffset;
+        }
+
+        copySize -= sbt;
+        xferBufRead_->unmap(&gpu);
+        continue;
+      }
+
       // parse the debug buffer
       while (sbt < copySize) {
         if (*dbgBufferPtr >= printfInfo.size()) {
diff --git a/projects/clr/rocclr/device/rocm/rocprintf.cpp b/projects/clr/rocclr/device/rocm/rocprintf.cpp
index 490e16203d..8d8c98b751 100644
--- a/projects/clr/rocclr/device/rocm/rocprintf.cpp
+++ b/projects/clr/rocclr/device/rocm/rocprintf.cpp
@@ -31,6 +31,14 @@
 #include
 #include
 
+// Functions defined in devhcprintf.cpp
+namespace amd {
+void handlePrintfDelayed(const uint64_t *input, uint64_t len, uint64_t control);
+bool populateFormatStringHashMap(
+  const std::vector &printfInfo,
+  std::map &strMap);
+} // namespace amd
+
 namespace roc {
 
 PrintfDbg::PrintfDbg(Device& device, FILE* file)
@@ -435,6 +443,67 @@ bool PrintfDbg::output(VirtualGPU& gpu, bool printfEnabled,
     uint sb = 0;
     uint sbt = 0;
 
+    // Handle HIP nonhostcall printf here, However longterm goal
+    // should be to have common implementation for both HIP and OpenCL
+    if (amd::IS_HIP) {
+      // Map 
between 64 bit MD5 format string hash and
+      // actual format string
+      std::map StrMap;
+
+      auto BufferForHIP = reinterpret_cast(dbgBufferPtr);
+
+      // Populate string map with hashes and actual
+      // format strings.
+      if(!amd::populateFormatStringHashMap(printfInfo, StrMap))
+        return false;
+
+      while (sbt < offsetSize)
+      {
+        auto controlDword = *BufferForHIP++;
+        auto PB = (uint64_t*)BufferForHIP;
+
+        uint64_t nextOffset = controlDword >> 2;
+
+        std::vector PBuffer;
+        uint64_t BufferLen = 0;
+        if (controlDword & 2U) {
+          // Process the constant format string case.
+          // The first value is the 64 bit format string hash
+          // and remaining values are printf arguments.
+          // Construct a temporary buffer with actual format
+          // string followed by arguments. The format string is
+          // obtained by querying StrMap populated before.
+          auto ArgsLen = nextOffset - 12;
+          const auto& Str = StrMap[*PB++];
+          auto StrLenWithNull = Str.size() + 1;
+          BufferLen = ArgsLen + amd::alignUp(StrLenWithNull, sizeof(uint64_t));
+          PBuffer.resize(BufferLen);
+          memcpy(PBuffer.data(), Str.c_str(), StrLenWithNull);
+          // Padding after the string is already zero: resize() value-initialized the fresh buffer.
+          memcpy(PBuffer.data() + amd::alignUp(StrLenWithNull, sizeof(uint64_t)),
+                 PB, ArgsLen);
+        }
+        else {
+          // Process Non constant format string case.
+          // Here, The buffer itself contains the actual
+          // format string and hence just copy the contents
+          // of format string and arguments into a temporary
+          // buffer (BufferLen bytes, i.e. the record minus its control dword).
+          BufferLen = nextOffset - /*ControlDWord*/4;
+          PBuffer.resize(BufferLen);
+          memcpy(PBuffer.data(), BufferForHIP, BufferLen);
+        }
+
+        // Handle printing
+        amd::handlePrintfDelayed((uint64_t*)PBuffer.data(), BufferLen / 8,
+                                 controlDword);
+        BufferForHIP += (nextOffset / 4) - /*ControlDWord*/1;
+        sbt += nextOffset;
+      }
+
+      return true;
+    }
+
     // parse the debug buffer
     while (sbt < offsetSize) {
       if (*dbgBufferPtr >= printfInfo.size()) {