SWDEV-354050 - Runtime support for HIP non hostcall printf

Relates to https://reviews.llvm.org/D150427.

Each printf call populates the buffer with the following data:
1. Control DWord - contains info regarding the stream, format string constness, and size of the data frame
   (see http://gerrit-git.amd.com/c/lightning/ec/device-libs/+/857722 for more info)
2. Hash of the format string if it is constant; otherwise the format string itself
3. Printf arguments (each aligned to an 8-byte boundary)

Change-Id: I7e320deb343921b4b4cfaf08a2be2883e0bc1f65


[ROCm/clr commit: 7b6a8f1702]
Este commit está contenido en:
Vikram
2023-05-12 00:31:45 -04:00
padre b1266e557e
commit 5ba0fb5802
Se han modificado 5 ficheros con 223 adiciones y 11 borrados
+3 -1
Ver fichero
@@ -133,7 +133,9 @@ void MessageHandler::discardMessage(Message* message) {
}
// Defined in devhcprintf.cpp
namespace amd {
void handlePrintf(uint64_t* output, const uint64_t* input, uint64_t len);
} // namespace amd
bool MessageHandler::handlePayload(uint32_t service, uint64_t* payload) {
Message* message = nullptr;
@@ -166,7 +168,7 @@ bool MessageHandler::handlePayload(uint32_t service, uint64_t* payload) {
switch (service) {
case SERVICE_PRINTF:
handlePrintf(payload, message->data_.data(), message->data_.size());
amd::handlePrintf(payload, message->data_.data(), message->data_.size());
break;
default:
ClPrint(amd::LOG_ERROR, amd::LOG_ALWAYS, "Hostcall: Messages not supported for service %d",
+44
Ver fichero
@@ -21,6 +21,7 @@
/** \file Format string processing for printf based on hostcall messages.
*/
#include "device/devkernel.hpp"
#include <assert.h>
#include <cstdarg>
#include <cstdint>
@@ -28,6 +29,7 @@
#include <cstring>
#include <string>
namespace amd {
static void checkPrintf(FILE* stream, int* outCount, const char* fmt, ...) {
va_list args;
va_start(args, fmt);
@@ -245,3 +247,45 @@ void handlePrintf(uint64_t* output, const uint64_t* input, uint64_t len) {
*output = format(stream, input, end);
}
// Build the lookup table from format-string hash to format string.
//
// For the HIP non-hostcall case the compiler generates the amdhsa.printf
// metadata in the form
//   "0:0:<format_string_hash>,<actual_format_string>"
// i.e. the hash is part of the format string itself and is separated from
// the real format string by the first ',' character. The hash is parsed as
// a hexadecimal number.
//
// printfInfo - printf records whose fmtString_ holds "<hash>,<format>".
// strMap     - receives one "hash -> format string" entry per record;
//              entries already present are left untouched.
//
// Returns false (after logging an error) when two *different* format
// strings map to the same hash, true otherwise.
bool populateFormatStringHashMap(
    const std::vector<device::PrintfInfo> &printfInfo,
    std::map<uint64_t, std::string> &strMap) {
  // Iterate by const reference so each record is not copied.
  for (const auto &info : printfInfo) {
    const auto &entry = info.fmtString_;
    const auto delim = entry.find(',');

    // The hash is 64 bits wide. strtoul() returns unsigned long, which is
    // only 32 bits on LLP64 targets (e.g. Windows) and would clamp the value;
    // strtoull() is at least 64 bits everywhere.
    const std::string hashStr = entry.substr(0, delim);
    const uint64_t hashVal = strtoull(hashStr.c_str(), nullptr, 16);

    // Without a delimiter the whole entry is kept as the format string.
    const std::string fmt =
        entry.substr(delim == std::string::npos ? 0 : delim + 1);

    const auto found = strMap.find(hashVal);
    if (found != strMap.end()) {
      // The very same string under the same hash is not a collision. This
      // also keeps the function safe to call again on a map that was already
      // filled from the same records.
      if (found->second == fmt) {
        continue;
      }
      LogError("Hash value collision detected, printf buffer ill formed");
      return false;
    }
    strMap.emplace(hashVal, fmt);
  }
  return true;
}
// Print one printf record that was collected in a buffer.
//
// input   - start of the record, handed to format() as 64-bit words.
// len     - number of 64-bit words in the record.
// control - control word of the record; only its least significant bit is
//           examined here.
void handlePrintfDelayed(const uint64_t* input, uint64_t len, uint64_t control)
{
  // Bit 0 of the control word selects the stream: set -> stderr, clear -> stdout.
  const uint64_t streamSelectBit = 1;
  FILE* const out = ((control & streamSelectBit) != 0) ? stderr : stdout;

  const uint64_t* const last = input + len;
  format(out, input, last);
}
} // namespace amd
+34 -10
Ver fichero
@@ -1477,6 +1477,7 @@ void Kernel::InitParameters(const aclArgData* aclArg, uint32_t argBufferSize) {
// ================================================================================================
#if defined(USE_COMGR_LIBRARY)
void Kernel::InitPrintf(const std::vector<std::string>& printfInfoStrings) {
size_t HIPPrintfInfoID = 0;
for (auto str : printfInfoStrings) {
std::vector<std::string> tokens;
@@ -1493,10 +1494,20 @@ void Kernel::InitPrintf(const std::vector<std::string>& printfInfoStrings) {
}
pos = 0;
size_t printfInfoID = std::stoi(tokens[pos++]);
if (printf_.size() <= printfInfoID) {
printf_.resize(printfInfoID + 1);
size_t printfInfoID;
if(amd::IS_HIP) {
printfInfoID = HIPPrintfInfoID++;
printf_.resize(HIPPrintfInfoID);
pos++;
}
else {
printfInfoID = std::stoi(tokens[pos++]);
if (printf_.size() <= printfInfoID) {
printf_.resize(printfInfoID + 1);
}
}
PrintfInfo& info = printf_[printfInfoID];
size_t numSizes = std::stoi(tokens[pos++]);
@@ -1514,7 +1525,13 @@ void Kernel::InitPrintf(const std::vector<std::string>& printfInfoStrings) {
}
// FIXME: We should not need this! [
std::string& fmt = tokens[pos];
std::string fmt;
// Format string itself might contain ':' characters
for(int i = 0; pos < tokens.size(); i++) {
if(i) fmt += ':';
fmt += tokens[pos++];
}
bool need_nl = true;
for (pos = 0; pos < fmt.size(); ++pos) {
@@ -1559,7 +1576,7 @@ void Kernel::InitPrintf(const std::vector<std::string>& printfInfoStrings) {
}
info.fmtString_.push_back(symbol);
}
if (need_nl) {
if (need_nl && !amd::IS_HIP) {
info.fmtString_ += "\n";
}
// ]
@@ -1570,12 +1587,19 @@ void Kernel::InitPrintf(const std::vector<std::string>& printfInfoStrings) {
// ================================================================================================
#if defined(WITH_COMPILER_LIB)
void Kernel::InitPrintf(const aclPrintfFmt* aclPrintf) {
uint index = 0;
uint index = 0, HIPIndex = 0;
for (; aclPrintf->struct_size != 0; aclPrintf++) {
index = aclPrintf->ID;
if (printf_.size() <= index) {
printf_.resize(index + 1);
if(amd::IS_HIP) {
index = HIPIndex++;
printf_.resize(HIPIndex);
}
else {
index = aclPrintf->ID;
if (printf_.size() <= index) {
printf_.resize(index + 1);
}
}
PrintfInfo& info = printf_[index];
const std::string& pfmt = aclPrintf->fmtStr;
bool need_nl = true;
@@ -1621,7 +1645,7 @@ void Kernel::InitPrintf(const aclPrintfFmt* aclPrintf) {
}
info.fmtString_.push_back(symbol);
}
if (need_nl) {
if (need_nl && !amd::IS_HIP) {
info.fmtString_ += "\n";
}
uint32_t* tmp_ptr = const_cast<uint32_t*>(aclPrintf->argSizes);
+73
Ver fichero
@@ -30,6 +30,14 @@
#include <algorithm>
#include <cmath>
// Functions defined in devhcprintf.cpp
namespace amd {
void handlePrintfDelayed(const uint64_t* input, uint64_t len, uint64_t control);
bool populateFormatStringHashMap(
const std::vector<device::PrintfInfo> &printfInfo,
std::map<uint64_t, std::string> &strMap);
} // namespace amd
namespace pal {
PrintfDbg::PrintfDbg(Device& device, FILE* file)
@@ -599,6 +607,11 @@ bool PrintfDbgHSA::output(VirtualGPU& gpu, bool printfEnabled,
size_t bufSize = dev().xferRead().bufSize();
size_t copySize = offsetSize;
// Map between 64 bit MD5 format string hash and
// actual format string
std::map<uint64_t, std::string> StrMap;
while (copySize != 0) {
// Copy the buffer data (i.e., the printfID followed by the
// argument data for each printf call in the kernel) to the staged buffer
@@ -617,6 +630,66 @@ bool PrintfDbgHSA::output(VirtualGPU& gpu, bool printfEnabled,
uint sb = 0;
uint sbt = 0;
// Handle HIP nonhostcall printf here,
if (amd::IS_HIP) {
auto BufferForHIP = reinterpret_cast<uint32_t*>(dbgBufferPtr);
// Populate string map with hashes and actual
// format strings.
if(!amd::populateFormatStringHashMap(printfInfo, StrMap))
return false;
while (sbt < copySize) {
auto controlDword = *BufferForHIP++;
uint64_t nextOffset = controlDword >> 2;
if (sbt + nextOffset > bufSize) {
break; // Need new portion of data in staging buffer
}
auto PB = (uint64_t*)BufferForHIP;
std::vector<uint8_t> PBuffer;
uint64_t BufferLen = 0;
if (controlDword & 2U) {
// Process the constant format string case.
// The first value is the 64 bit format string hash
// and remaining values are printf arguments.
// Construct a temporary buffer with actual format
// string followed by arguments. The format string is
// obtained by querying StrMap populated before.
auto ArgsLen = nextOffset - 12;
auto Str = StrMap[*PB++];
auto StrLenWithNull = Str.size() + 1;
BufferLen = ArgsLen + amd::alignUp(StrLenWithNull, sizeof(uint64_t));
PBuffer.resize(BufferLen);
memcpy(PBuffer.data(), Str.c_str(), StrLenWithNull);
memset(PBuffer.data() + Str.size(), 0, 8 - (StrLenWithNull % 8 ));
memcpy(PBuffer.data() + amd::alignUp(StrLenWithNull, sizeof(uint64_t)),
PB, ArgsLen);
}
else {
// Process Non constant format string case.
// Here, The buffer itself contains the actual
// format string and hence just copy the contents
// of format string and arguments into a temporary
// buffer
BufferLen = nextOffset - /*ControlDWord*/4;
PBuffer.resize(BufferLen);
memcpy(PBuffer.data(), BufferForHIP, nextOffset);
}
// Handle printing
amd::handlePrintfDelayed((uint64_t*)PBuffer.data(), BufferLen / 8,
controlDword);
BufferForHIP += (nextOffset / 4) - /*ControlDWord*/1;
sbt += nextOffset;
}
copySize -= sbt;
xferBufRead_->unmap(&gpu);
continue;
}
// parse the debug buffer
while (sbt < copySize) {
if (*dbgBufferPtr >= printfInfo.size()) {
@@ -31,6 +31,14 @@
#include <algorithm>
#include <cmath>
// Functions defined in devhcprintf.cpp
namespace amd {
void handlePrintfDelayed(const uint64_t *input, uint64_t len, uint64_t control);
bool populateFormatStringHashMap(
const std::vector<device::PrintfInfo> &printfInfo,
std::map<uint64_t, std::string> &strMap);
} // namespace amd
namespace roc {
PrintfDbg::PrintfDbg(Device& device, FILE* file)
@@ -435,6 +443,67 @@ bool PrintfDbg::output(VirtualGPU& gpu, bool printfEnabled,
uint sb = 0;
uint sbt = 0;
// Handle HIP nonhostcall printf here, However longterm goal
// should be to have common implementation for both HIP and OpenCL
if (amd::IS_HIP) {
// Map between 64 bit MD5 format string hash and
// actual format string
std::map<uint64_t, std::string> StrMap;
auto BufferForHIP = reinterpret_cast<uint32_t*>(dbgBufferPtr);
// Populate string map with hashes and actual
// format strings.
if(!amd::populateFormatStringHashMap(printfInfo, StrMap))
return false;
while (sbt < offsetSize)
{
auto controlDword = *BufferForHIP++;
auto PB = (uint64_t*)BufferForHIP;
uint64_t nextOffset = controlDword >> 2;
std::vector<uint8_t> PBuffer;
uint64_t BufferLen = 0;
if (controlDword & 2U) {
// Process the constant format string case.
// The first value is the 64 bit format string hash
// and remaining values are printf arguments.
// Construct a temporary buffer with actual format
// string followed by arguments. The format string is
// obtained by querying StrMap populated before.
auto ArgsLen = nextOffset - 12;
auto Str = StrMap[*PB++];
auto StrLenWithNull = Str.size() + 1;
BufferLen = ArgsLen + amd::alignUp(StrLenWithNull, sizeof(uint64_t));
PBuffer.resize(BufferLen);
memcpy(PBuffer.data(), Str.c_str(), StrLenWithNull);
memset(PBuffer.data() + Str.size(), 0, 8 - (StrLenWithNull % 8 ));
memcpy(PBuffer.data() + amd::alignUp(StrLenWithNull, sizeof(uint64_t)),
PB, ArgsLen);
}
else {
// Process Non constant format string case.
// Here, The buffer itself contains the actual
// format string and hence just copy the contents
// of format string and arguments into a temporary
// buffer
BufferLen = nextOffset - /*ControlDWord*/4;
PBuffer.resize(BufferLen);
memcpy(PBuffer.data(), BufferForHIP, nextOffset);
}
// Handle printing
amd::handlePrintfDelayed((uint64_t*)PBuffer.data(), BufferLen / 8,
controlDword);
BufferForHIP += (nextOffset / 4) - /*ControlDWord*/1;
sbt += nextOffset;
}
return true;
}
// parse the debug buffer
while (sbt < offsetSize) {
if (*dbgBufferPtr >= printfInfo.size()) {