kfdtest: Fix ExtendedCuMasking on GPUs with inactive CUs (#726)

Modify the code that computes the adjusted CU mask array to take
into account additional cases of inactive CUs.

Signed-off-by: David Belanger <david.belanger@amd.com>
This commit is contained in:
David Bélanger
2025-10-17 11:26:12 -04:00
کامیت شده توسط GitHub
والد 9df655088f
کامیت 02294e3852
@@ -1012,13 +1012,74 @@ bool adjustMask(uint32_t *pAdjMask, uint32_t *pMask, mask_config_t maskConfig) {
int totalBits = maskConfig.numBits;
bool nonZero = false;
memset(pAdjMask, 0, sizeof(uint32_t) * maskConfig.numDwords);
uint32_t tempInactiveMask[maskConfig.numDwords] = { 0 };
uint32_t tempAdjustMask[maskConfig.numDwords] = { 0 };
/*
* KFD encodes all the active WGP at the lowest bits in MQD registers.
*
* If WGP3 is inactive in a SA, it will be encoded by KFD as: 0x3f.
* If WGP1 is inactive in a SA, it will be encoded by KFD also as: 0x3f.
*
* We need to adjust for that.
*
* For each SA, we need to "compress" all the active WGP settings together.
* e.g. if WGP1 is inactive, we need to put: (x WGP3 WGP2 WGP0) in our CU mask array.
*
* Note that WGPs in the same SA are not encoded consecutively in the CU mask array; we need to take that into account.
*
* Once this step is done, we need to remove any "inactive" entries from the CU mask array as they are skipped by KFD.
*
*/
// Offset to get the next WGP in a SA.
// Ex: If WGP0 is at offset n, WGP1 is at offset n+nextWGPOffset in the CU mask array
const uint32_t nextWGPOffset = 2 * maskConfig.numSEs * maskConfig.numSAperSE;
for (int i = 0; i < maskConfig.numSEs; i++) {
for (int j = 0; j < maskConfig.numSAperSE; j++) {
// Location of WGP0 for (SE: i, SA: j)
uint32_t wgp0Loc = 2 * (j * maskConfig.numSEs + i);
// Location of last WGP
uint32_t wgpLastLoc = wgp0Loc + nextWGPOffset * (maskConfig.numWGPperSA - 1);
// Where to write the next active WGP
uint32_t activeWriteIndex = wgp0Loc;
// Where to write the next inactive WGP
uint32_t inactiveWriteIndex = wgpLastLoc;
/*
* Iterate over the WGPs for (SE: i, SA: j) and write all the active ones in tempAdjustMask, leaving the inactive ones.
* This condenses the WGPs for the (SE,SA) pair being processed.
*
* Also mark the slots of the inactive ones in tempInactiveMask, filling from the last WGP slot downwards.
*
* At the end we have (e.g. with 4 WGPs per SA and WGP0 inactive):
* tempAdjustMask: ( 0 WGP3 WGP2 WGP1 ) (2 bits per WGP)
* tempInactiveMask: ( 3 0 0 0 ) (2 bits per WGP)
*/
for (int k = wgp0Loc; k < totalBits; k += nextWGPOffset) {
if ((maskConfig.pInactiveMask[k / 32] & (0x3 << (k % 32))) != 0) {
tempInactiveMask[inactiveWriteIndex / 32] |= (0x3 << (inactiveWriteIndex % 32));
inactiveWriteIndex -= nextWGPOffset;
} else {
uint32_t value = (pMask[k / 32] >> (k % 32)) & 0x3;
uint32_t newValue = value << (activeWriteIndex % 32);
tempAdjustMask[activeWriteIndex / 32] |= newValue;
activeWriteIndex += nextWGPOffset;
}
}
}
}
// Now we remove all the inactive entries and generate the final adjusted mask.
memset(pAdjMask, 0, sizeof(uint32_t) * maskConfig.numDwords);
for (int ri = 0; ri < totalBits; ri += 2) {
uint32_t value = (pMask[ri / 32] >> (ri % 32)) & 0x3;
uint32_t value = (tempAdjustMask[ri / 32] >> (ri % 32)) & 0x3;
if ((maskConfig.pInactiveMask[ri / 32] & (0x3 << (ri % 32))) != 0)
if ((tempInactiveMask[ri / 32] & (0x3 << (ri % 32))) != 0)
{
// skip that entry
}
@@ -1036,17 +1097,17 @@ bool adjustMask(uint32_t *pAdjMask, uint32_t *pMask, mask_config_t maskConfig) {
#if CUMASK_DEBUG
printf("\nAdjusting mask:\n");
printMask(" mask: ", pMask, maskConfig.numDwords);
printMask("inactive: ", maskConfig.pInactiveMask, maskConfig.numDwords);
printMask("adjusted: ", pAdjMask, maskConfig.numDwords);
printMask(" mask: ", pMask, maskConfig.numDwords);
printMask(" inactive: ", maskConfig.pInactiveMask, maskConfig.numDwords);
printMask("temp adjusted: ", tempAdjustMask, maskConfig.numDwords);
printMask("temp inactive: ", tempInactiveMask, maskConfig.numDwords);
printMask(" adjusted: ", pAdjMask, maskConfig.numDwords);
printf("\n");
#endif //CUMASK_DEBUG
return nonZero;
}
/*
* Validates the result of a test.
*
@@ -1304,14 +1365,13 @@ static void extendedCuMasking(KFDTEST_PARAMETERS* pTestParameters) {
maskConfig.pInactiveMask = inactiveMask;
std::ostringstream logStr;
logStr << nodeStr << " Inactive WGP detected: " << inactiveCount << " 0x" << std::hex << std::setw(8);
logStr << nodeStr << " Inactive CUs detected: " << inactiveCount << " 0x" << std::hex << std::setfill('0');
for (int i = maskNumDwords - 1; i >= 0; i--) {
logStr << inactiveMask[i];
logStr << std::setw(8) << inactiveMask[i];
}
LOG() << logStr.str() << std::endl;
}
/*
* Generate symmetric test configuration for all (SE, SA, WGP) combinations, one level at a time.
*