TransferBench: Adding ability to reindex GPUs based on PCIe address (#494)
[ROCm/rccl commit: 84d5fce7dd]
This commit is contained in:
@@ -51,6 +51,7 @@ public:
|
||||
int numCpuPerLink; // Number of CPU child threads to use per CPU link
|
||||
int sharedMemBytes; // Amount of shared memory to use per threadblock
|
||||
int blockBytes; // Each CU, except the last, gets a multiple of this many bytes to copy
|
||||
int usePcieIndexing; // Base GPU indexing on PCIe address instead of HIP device
|
||||
|
||||
std::vector<float> fillPattern; // Pattern of floats used to fill source data
|
||||
|
||||
@@ -75,6 +76,7 @@ public:
|
||||
numCpuPerLink = GetEnvVar("NUM_CPU_PER_LINK" , DEFAULT_NUM_CPU_PER_LINK);
|
||||
sharedMemBytes = GetEnvVar("SHARED_MEM_BYTES" , maxSharedMemBytes / 2 + 1);
|
||||
blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
|
||||
usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0);
|
||||
|
||||
// Check for fill pattern
|
||||
char* pattern = getenv("FILL_PATTERN");
|
||||
@@ -192,6 +194,7 @@ public:
|
||||
printf(" FILL_PATTERN=STR - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n");
|
||||
printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n");
|
||||
printf(" BLOCK_BYTES=B - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n");
|
||||
printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n");
|
||||
}
|
||||
|
||||
// Display env var settings
|
||||
@@ -238,6 +241,7 @@ public:
|
||||
printf("%-20s = %12s : Using %d shared mem per threadblock\n", "SHARED_MEM_BYTES",
|
||||
getenv("SHARED_MEM_BYTES") ? "(specified)" : "(unset)", sharedMemBytes);
|
||||
printf("%-20s = %12d : Each CU gets a multiple of %d bytes to copy\n", "BLOCK_BYTES", blockBytes, blockBytes);
|
||||
printf("%-20s = %12d : Using %s-based GPU indexing\n", "USE_PCIE_INDEX", usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP"));
|
||||
printf("\n");
|
||||
}
|
||||
};
|
||||
|
||||
@@ -152,13 +152,15 @@ int main(int argc, char **argv)
|
||||
{
|
||||
// Get some aliases to link variables
|
||||
MemType const& exeMemType = links[i].exeMemType;
|
||||
int const& exeIndex = links[i].exeIndex;
|
||||
MemType const& srcMemType = links[i].srcMemType;
|
||||
MemType const& dstMemType = links[i].dstMemType;
|
||||
int const& srcIndex = links[i].srcIndex;
|
||||
int const& dstIndex = links[i].dstIndex;
|
||||
int const& blocksToUse = links[i].numBlocksToUse;
|
||||
|
||||
// Get potentially remapped device indices
|
||||
int const srcIndex = RemappedIndex(links[i].srcIndex, srcMemType);
|
||||
int const exeIndex = RemappedIndex(links[i].exeIndex, exeMemType);
|
||||
int const dstIndex = RemappedIndex(links[i].dstIndex, dstMemType);
|
||||
|
||||
// Enable peer-to-peer access if necessary (can only be called once per unique pair)
|
||||
if (exeMemType == MEM_GPU)
|
||||
{
|
||||
@@ -166,7 +168,7 @@ int main(int argc, char **argv)
|
||||
if ((srcMemType == MEM_GPU || srcMemType == MEM_GPU_FINE) && srcIndex != exeIndex)
|
||||
{
|
||||
auto exeSrcPair = std::make_pair(exeIndex, srcIndex);
|
||||
if (!peerAccessTracker.count(exeSrcPair))
|
||||
if (!peerAccessTracker.count(exeSrcPair))
|
||||
{
|
||||
EnablePeerAccess(exeIndex, srcIndex);
|
||||
peerAccessTracker.insert(exeSrcPair);
|
||||
@@ -369,8 +371,8 @@ int main(int argc, char **argv)
|
||||
// Release GPU memory
|
||||
for (int i = 0; i < numLinks; i++)
|
||||
{
|
||||
DeallocateMemory(links[i].srcMemType, links[i].srcIndex, links[i].srcMem);
|
||||
DeallocateMemory(links[i].dstMemType, links[i].dstIndex, links[i].dstMem);
|
||||
DeallocateMemory(links[i].srcMemType, links[i].srcMem);
|
||||
DeallocateMemory(links[i].dstMemType, links[i].dstMem);
|
||||
|
||||
if (links[i].exeMemType == MEM_GPU)
|
||||
{
|
||||
@@ -608,6 +610,46 @@ void GenerateConfigFile(char const* cfgFile, int numBlocks)
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
int RemappedIndex(int const origIdx, MemType const memType)
|
||||
{
|
||||
static std::vector<int> remapping;
|
||||
|
||||
// No need to re-map CPU devices
|
||||
if (memType == MEM_CPU) return origIdx;
|
||||
|
||||
// Build remapping on first use
|
||||
if (remapping.empty())
|
||||
{
|
||||
int numGpuDevices;
|
||||
HIP_CALL(hipGetDeviceCount(&numGpuDevices));
|
||||
remapping.resize(numGpuDevices);
|
||||
|
||||
int const usePcieIndexing = getenv("USE_PCIE_INDEX") ? atoi(getenv("USE_PCIE_INDEX")) : 0;
|
||||
if (!usePcieIndexing)
|
||||
{
|
||||
// For HIP-based indexing no remapping is necessary
|
||||
for (int i = 0; i < numGpuDevices; ++i)
|
||||
remapping[i] = i;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Collect PCIe address for each GPU
|
||||
std::vector<std::pair<std::string, int>> mapping;
|
||||
char pciBusId[20];
|
||||
for (int i = 0; i < numGpuDevices; ++i)
|
||||
{
|
||||
HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, i));
|
||||
mapping.push_back(std::make_pair(pciBusId, i));
|
||||
}
|
||||
// Sort GPUs by PCIe address then use that as mapping
|
||||
std::sort(mapping.begin(), mapping.end());
|
||||
for (int i = 0; i < numGpuDevices; ++i)
|
||||
remapping[i] = mapping[i].second;
|
||||
}
|
||||
}
|
||||
return remapping[origIdx];
|
||||
}
|
||||
|
||||
void DisplayTopology()
|
||||
{
|
||||
int numGpuDevices;
|
||||
@@ -632,7 +674,9 @@ void DisplayTopology()
|
||||
else
|
||||
{
|
||||
uint32_t linkType, hopCount;
|
||||
HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
|
||||
HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(i, MEM_GPU),
|
||||
RemappedIndex(j, MEM_GPU),
|
||||
&linkType, &hopCount));
|
||||
printf(" %s-%d |",
|
||||
linkType == HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT ? " HT" :
|
||||
linkType == HSA_AMD_LINK_INFO_TYPE_QPI ? " QPI" :
|
||||
@@ -642,8 +686,8 @@ void DisplayTopology()
|
||||
hopCount);
|
||||
}
|
||||
}
|
||||
HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, i));
|
||||
printf(" %11s | %d \n", pciBusId, GetClosestNumaNode(i));
|
||||
HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, RemappedIndex(i, MEM_GPU)));
|
||||
printf(" %11s | %d \n", pciBusId, GetClosestNumaNode(RemappedIndex(i, MEM_GPU)));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -863,7 +907,7 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, float** memP
|
||||
}
|
||||
}
|
||||
|
||||
void DeallocateMemory(MemType memType, int devIndex, float* memPtr)
|
||||
void DeallocateMemory(MemType memType, float* memPtr)
|
||||
{
|
||||
if (memType == MEM_CPU)
|
||||
{
|
||||
@@ -1008,7 +1052,9 @@ std::string GetDesc(MemType srcMemType, int srcIndex,
|
||||
else
|
||||
{
|
||||
uint32_t linkType, hopCount;
|
||||
HIP_CALL(hipExtGetLinkTypeAndHopCount(srcIndex, dstIndex, &linkType, &hopCount));
|
||||
HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(srcIndex, MEM_GPU),
|
||||
RemappedIndex(dstIndex, MEM_GPU),
|
||||
&linkType, &hopCount));
|
||||
return GetLinkTypeDesc(linkType, hopCount);
|
||||
}
|
||||
}
|
||||
@@ -1032,7 +1078,7 @@ void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link)
|
||||
if (link.exeMemType == MEM_GPU)
|
||||
{
|
||||
// Switch to executing GPU
|
||||
HIP_CALL(hipSetDevice(link.exeIndex));
|
||||
HIP_CALL(hipSetDevice(RemappedIndex(link.exeIndex, MEM_GPU)));
|
||||
|
||||
bool recordStart = (!ev.useSingleSync || iteration == 0);
|
||||
bool recordStop = (!ev.useSingleSync || iteration == ev.numIterations - 1);
|
||||
@@ -1188,14 +1234,14 @@ double GetPeakBandwidth(EnvVars const& ev, size_t N, int isBidirectional,
|
||||
|
||||
// Prepare Links
|
||||
links[0].srcMemType = links[1].dstMemType = srcMemType;
|
||||
links[0].srcIndex = links[1].dstIndex = srcIndex;
|
||||
links[0].srcIndex = links[1].dstIndex = RemappedIndex(srcIndex, srcMemType);
|
||||
links[0].dstMemType = links[1].srcMemType = dstMemType;
|
||||
links[0].dstIndex = links[1].srcIndex = dstIndex;
|
||||
links[0].dstIndex = links[1].srcIndex = RemappedIndex(dstIndex, dstMemType);
|
||||
// Either perform local read / remote write, or remote read / local write
|
||||
links[0].exeMemType = (readMode == 0 ? srcMemType : dstMemType);
|
||||
links[0].exeIndex = (readMode == 0 ? srcIndex : dstIndex);
|
||||
links[0].exeIndex = RemappedIndex((readMode == 0 ? srcIndex : dstIndex), links[0].exeMemType);
|
||||
links[1].exeMemType = (readMode == 0 ? dstMemType : srcMemType);
|
||||
links[1].exeIndex = (readMode == 0 ? dstIndex : srcIndex);
|
||||
links[1].exeIndex = RemappedIndex((readMode == 0 ? dstIndex : srcIndex), links[1].exeMemType);
|
||||
|
||||
for (int i = 0; i <= isBidirectional; i++)
|
||||
{
|
||||
@@ -1281,8 +1327,8 @@ double GetPeakBandwidth(EnvVars const& ev, size_t N, int isBidirectional,
|
||||
// Release GPU memory
|
||||
for (int i = 0; i <= isBidirectional; i++)
|
||||
{
|
||||
DeallocateMemory(links[i].srcMemType, links[i].srcIndex, links[i].srcMem);
|
||||
DeallocateMemory(links[i].dstMemType, links[i].dstIndex, links[i].dstMem);
|
||||
DeallocateMemory(links[i].srcMemType, links[i].srcMem);
|
||||
DeallocateMemory(links[i].dstMemType, links[i].dstMem);
|
||||
|
||||
if (links[i].exeMemType == MEM_GPU)
|
||||
{
|
||||
|
||||
@@ -105,7 +105,7 @@ void ParseMemType(std::string const& token, int const numCpus, int const numGpus
|
||||
void ParseLinks(char* line, int numCpus, int numGpus, std::vector<Link>& links); // Parse Link information
|
||||
void EnablePeerAccess(int const deviceId, int const peerDeviceId);
|
||||
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, float** memPtr);
|
||||
void DeallocateMemory(MemType memType, int devIndex, float* memPtr);
|
||||
void DeallocateMemory(MemType memType, float* memPtr);
|
||||
void CheckPages(char* byteArray, size_t numBytes, int targetId);
|
||||
void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, std::vector<float> const& fillPattern, float* ptr);
|
||||
void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link);
|
||||
@@ -119,3 +119,4 @@ std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
|
||||
std::string GetDesc(MemType srcMemType, int srcIndex,
|
||||
MemType dstMemType, int dstIndex);
|
||||
std::string GetLinkDesc(Link const& link);
|
||||
int RemappedIndex(int const origIdx, MemType const memType);
|
||||
|
||||
Reference in New Issue
Block a user