TransferBench: Adding ability to reindex GPUs based on PCIe address (#494)

[ROCm/rccl commit: 84d5fce7dd]
This commit is contained in:
gilbertlee-amd
2022-02-02 08:51:41 -07:00
committed by GitHub
parent 635c0bcc01
commit b2deea27f5
3 changed files with 70 additions and 19 deletions
@@ -51,6 +51,7 @@ public:
int numCpuPerLink; // Number of CPU child threads to use per CPU link
int sharedMemBytes; // Amount of shared memory to use per threadblock
int blockBytes; // Each CU, except the last, gets a multiple of this many bytes to copy
int usePcieIndexing; // Base GPU indexing on PCIe address instead of HIP device
std::vector<float> fillPattern; // Pattern of floats used to fill source data
@@ -75,6 +76,7 @@ public:
numCpuPerLink = GetEnvVar("NUM_CPU_PER_LINK" , DEFAULT_NUM_CPU_PER_LINK);
sharedMemBytes = GetEnvVar("SHARED_MEM_BYTES" , maxSharedMemBytes / 2 + 1);
blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0);
// Check for fill pattern
char* pattern = getenv("FILL_PATTERN");
@@ -192,6 +194,7 @@ public:
printf(" FILL_PATTERN=STR - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n");
printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n");
printf(" BLOCK_BYTES=B - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n");
printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n");
}
// Display env var settings
@@ -238,6 +241,7 @@ public:
printf("%-20s = %12s : Using %d shared mem per threadblock\n", "SHARED_MEM_BYTES",
getenv("SHARED_MEM_BYTES") ? "(specified)" : "(unset)", sharedMemBytes);
printf("%-20s = %12d : Each CU gets a multiple of %d bytes to copy\n", "BLOCK_BYTES", blockBytes, blockBytes);
printf("%-20s = %12d : Using %s-based GPU indexing\n", "USE_PCIE_INDEX", usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP"));
printf("\n");
}
};
@@ -152,13 +152,15 @@ int main(int argc, char **argv)
{
// Get some aliases to link variables
MemType const& exeMemType = links[i].exeMemType;
int const& exeIndex = links[i].exeIndex;
MemType const& srcMemType = links[i].srcMemType;
MemType const& dstMemType = links[i].dstMemType;
int const& srcIndex = links[i].srcIndex;
int const& dstIndex = links[i].dstIndex;
int const& blocksToUse = links[i].numBlocksToUse;
// Get potentially remapped device indices
int const srcIndex = RemappedIndex(links[i].srcIndex, srcMemType);
int const exeIndex = RemappedIndex(links[i].exeIndex, exeMemType);
int const dstIndex = RemappedIndex(links[i].dstIndex, dstMemType);
// Enable peer-to-peer access if necessary (can only be called once per unique pair)
if (exeMemType == MEM_GPU)
{
@@ -166,7 +168,7 @@ int main(int argc, char **argv)
if ((srcMemType == MEM_GPU || srcMemType == MEM_GPU_FINE) && srcIndex != exeIndex)
{
auto exeSrcPair = std::make_pair(exeIndex, srcIndex);
if (!peerAccessTracker.count(exeSrcPair))
if (!peerAccessTracker.count(exeSrcPair))
{
EnablePeerAccess(exeIndex, srcIndex);
peerAccessTracker.insert(exeSrcPair);
@@ -369,8 +371,8 @@ int main(int argc, char **argv)
// Release GPU memory
for (int i = 0; i < numLinks; i++)
{
DeallocateMemory(links[i].srcMemType, links[i].srcIndex, links[i].srcMem);
DeallocateMemory(links[i].dstMemType, links[i].dstIndex, links[i].dstMem);
DeallocateMemory(links[i].srcMemType, links[i].srcMem);
DeallocateMemory(links[i].dstMemType, links[i].dstMem);
if (links[i].exeMemType == MEM_GPU)
{
@@ -608,6 +610,46 @@ void GenerateConfigFile(char const* cfgFile, int numBlocks)
fclose(fp);
}
// Translates a user-facing device index into the HIP device index to use.
//
// origIdx : device index as given by the caller
// memType : memory type the index refers to; MEM_CPU indices are returned unchanged
//
// Returns the (possibly remapped) device index.
//
// The remapping table is built on the first non-CPU call and cached for the
// lifetime of the process, so USE_PCIE_INDEX is only read once:
//  - USE_PCIE_INDEX unset or 0 : identity mapping (HIP device order)
//  - USE_PCIE_INDEX non-zero   : index i maps to the HIP device whose PCIe bus ID
//                                string is i-th in sorted order
//
// Prints an error and exits if origIdx does not refer to a detected GPU, rather
// than reading outside of the remapping table.
int RemappedIndex(int const origIdx, MemType const memType)
{
  static std::vector<int> remapping;

  // No need to re-map CPU devices
  if (memType == MEM_CPU) return origIdx;

  // Build remapping on first use
  if (remapping.empty())
  {
    int numGpuDevices = 0;
    HIP_CALL(hipGetDeviceCount(&numGpuDevices));
    remapping.resize(numGpuDevices);

    // Read the environment variable once instead of querying it twice
    char const* const envStr = getenv("USE_PCIE_INDEX");
    int const usePcieIndexing = envStr ? atoi(envStr) : 0;

    if (!usePcieIndexing)
    {
      // For HIP-based indexing no remapping is necessary
      for (int i = 0; i < numGpuDevices; ++i)
        remapping[i] = i;
    }
    else
    {
      // Collect PCIe address for each GPU
      std::vector<std::pair<std::string, int>> mapping;
      mapping.reserve(numGpuDevices);
      char pciBusId[20];
      for (int i = 0; i < numGpuDevices; ++i)
      {
        HIP_CALL(hipDeviceGetPCIBusId(pciBusId, sizeof(pciBusId), i));
        mapping.push_back(std::make_pair(pciBusId, i));
      }

      // Sort GPUs by PCIe address then use that as mapping
      // NOTE(review): this is a lexicographic string sort; it presumably relies on the
      //               bus ID strings being fixed-width so that string order matches
      //               address order - confirm against hipDeviceGetPCIBusId output
      std::sort(mapping.begin(), mapping.end());
      for (int i = 0; i < numGpuDevices; ++i)
        remapping[i] = mapping[i].second;
    }
  }

  // Guard against indices that do not correspond to a detected GPU
  // (this also covers the case where no GPU devices were reported at all)
  int const numRemapped = static_cast<int>(remapping.size());
  if (origIdx < 0 || origIdx >= numRemapped)
  {
    printf("[ERROR] GPU index %d is out of range (%d GPU device(s) detected)\n", origIdx, numRemapped);
    exit(1);
  }
  return remapping[origIdx];
}
void DisplayTopology()
{
int numGpuDevices;
@@ -632,7 +674,9 @@ void DisplayTopology()
else
{
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(i, MEM_GPU),
RemappedIndex(j, MEM_GPU),
&linkType, &hopCount));
printf(" %s-%d |",
linkType == HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT ? " HT" :
linkType == HSA_AMD_LINK_INFO_TYPE_QPI ? " QPI" :
@@ -642,8 +686,8 @@ void DisplayTopology()
hopCount);
}
}
HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, i));
printf(" %11s | %d \n", pciBusId, GetClosestNumaNode(i));
HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, RemappedIndex(i, MEM_GPU)));
printf(" %11s | %d \n", pciBusId, GetClosestNumaNode(RemappedIndex(i, MEM_GPU)));
}
}
@@ -863,7 +907,7 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, float** memP
}
}
void DeallocateMemory(MemType memType, int devIndex, float* memPtr)
void DeallocateMemory(MemType memType, float* memPtr)
{
if (memType == MEM_CPU)
{
@@ -1008,7 +1052,9 @@ std::string GetDesc(MemType srcMemType, int srcIndex,
else
{
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(srcIndex, dstIndex, &linkType, &hopCount));
HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(srcIndex, MEM_GPU),
RemappedIndex(dstIndex, MEM_GPU),
&linkType, &hopCount));
return GetLinkTypeDesc(linkType, hopCount);
}
}
@@ -1032,7 +1078,7 @@ void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link)
if (link.exeMemType == MEM_GPU)
{
// Switch to executing GPU
HIP_CALL(hipSetDevice(link.exeIndex));
HIP_CALL(hipSetDevice(RemappedIndex(link.exeIndex, MEM_GPU)));
bool recordStart = (!ev.useSingleSync || iteration == 0);
bool recordStop = (!ev.useSingleSync || iteration == ev.numIterations - 1);
@@ -1188,14 +1234,14 @@ double GetPeakBandwidth(EnvVars const& ev, size_t N, int isBidirectional,
// Prepare Links
links[0].srcMemType = links[1].dstMemType = srcMemType;
links[0].srcIndex = links[1].dstIndex = srcIndex;
links[0].srcIndex = links[1].dstIndex = RemappedIndex(srcIndex, srcMemType);
links[0].dstMemType = links[1].srcMemType = dstMemType;
links[0].dstIndex = links[1].srcIndex = dstIndex;
links[0].dstIndex = links[1].srcIndex = RemappedIndex(dstIndex, dstMemType);
// Either perform local read / remote write, or remote read / local write
links[0].exeMemType = (readMode == 0 ? srcMemType : dstMemType);
links[0].exeIndex = (readMode == 0 ? srcIndex : dstIndex);
links[0].exeIndex = RemappedIndex((readMode == 0 ? srcIndex : dstIndex), links[0].exeMemType);
links[1].exeMemType = (readMode == 0 ? dstMemType : srcMemType);
links[1].exeIndex = (readMode == 0 ? dstIndex : srcIndex);
links[1].exeIndex = RemappedIndex((readMode == 0 ? dstIndex : srcIndex), links[1].exeMemType);
for (int i = 0; i <= isBidirectional; i++)
{
@@ -1281,8 +1327,8 @@ double GetPeakBandwidth(EnvVars const& ev, size_t N, int isBidirectional,
// Release GPU memory
for (int i = 0; i <= isBidirectional; i++)
{
DeallocateMemory(links[i].srcMemType, links[i].srcIndex, links[i].srcMem);
DeallocateMemory(links[i].dstMemType, links[i].dstIndex, links[i].dstMem);
DeallocateMemory(links[i].srcMemType, links[i].srcMem);
DeallocateMemory(links[i].dstMemType, links[i].dstMem);
if (links[i].exeMemType == MEM_GPU)
{
@@ -105,7 +105,7 @@ void ParseMemType(std::string const& token, int const numCpus, int const numGpus
void ParseLinks(char* line, int numCpus, int numGpus, std::vector<Link>& links); // Parse Link information
void EnablePeerAccess(int const deviceId, int const peerDeviceId);
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, float** memPtr);
void DeallocateMemory(MemType memType, int devIndex, float* memPtr);
void DeallocateMemory(MemType memType, float* memPtr);
void CheckPages(char* byteArray, size_t numBytes, int targetId);
void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, std::vector<float> const& fillPattern, float* ptr);
void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link);
@@ -119,3 +119,4 @@ std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
std::string GetDesc(MemType srcMemType, int srcIndex,
MemType dstMemType, int dstIndex);
std::string GetLinkDesc(Link const& link);
int RemappedIndex(int const origIdx, MemType const memType);