Transfer bench single stream mode (#531)
- Adding single stream mode
- Removing some unused env vars
- Adding output to CSV mode for p2p benchmark, topology listing modes
[ROCm/rccl commit: def6832287]
This commit is contained in:
zatwierdzone przez
GitHub
rodzic
21415407ac
commit
e61ff3ce37
@@ -0,0 +1,16 @@
|
||||
# Changelog for TransferBench
|
||||
|
||||
## v1.01
|
||||
### Added
|
||||
- Adding USE_SINGLE_STREAM feature
|
||||
- All Links that execute on the same GPU device are executed with a single kernel launch on a single stream
|
||||
- Does not work with USE_HIP_CALL and forces USE_SINGLE_SYNC to collect timings
|
||||
- Adding ability to request coherent / fine-grained host memory ('B')
|
||||
### Changed
|
||||
- Separating TransferBench from RCCL repo
|
||||
- Peer-to-peer benchmark mode now works OUTPUT_TO_CSV
|
||||
- Toplogy display now works with OUTPUT_TO_CSV
|
||||
- Moving documentation about config file into example.cfg
|
||||
### Removed
|
||||
- Removed config file generation
|
||||
- Removed show pointer address environment variable (SHOW_ADDR)
|
||||
@@ -25,6 +25,8 @@ THE SOFTWARE.
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#define TB_VERSION "1.01"
|
||||
|
||||
// This class manages environment variable that affect TransferBench
|
||||
class EnvVars
|
||||
{
|
||||
@@ -40,9 +42,7 @@ public:
|
||||
int useMemset; // Perform a memset instead of a copy (ignores source memory)
|
||||
int useSingleSync; // Perform synchronization only once after all iterations instead of per iteration
|
||||
int useInteractive; // Pause for user-input before starting transfer loop
|
||||
int useSleep; // Adds a 100ms sleep after each synchronization
|
||||
int combineTiming; // Combines the timing with kernel launch
|
||||
int showAddr; // Print out memory addresses for each Link
|
||||
int outputToCsv; // Output in CSV format
|
||||
int byteOffset; // Byte-offset for memory allocations
|
||||
int numWarmups; // Number of un-timed warmup iterations to perform
|
||||
@@ -52,6 +52,7 @@ public:
|
||||
int sharedMemBytes; // Amount of shared memory to use per threadblock
|
||||
int blockBytes; // Each CU, except the last, gets a multiple of this many bytes to copy
|
||||
int usePcieIndexing; // Base GPU indexing on PCIe address instead of HIP device
|
||||
int useSingleStream; // Use a single stream per device instead of per Link. Can not be used with USE_HIP_CALL
|
||||
|
||||
std::vector<float> fillPattern; // Pattern of floats used to fill source data
|
||||
|
||||
@@ -67,7 +68,6 @@ public:
|
||||
useSingleSync = GetEnvVar("USE_SINGLE_SYNC" , 1);
|
||||
useInteractive = GetEnvVar("USE_INTERACTIVE" , 0);
|
||||
combineTiming = GetEnvVar("COMBINE_TIMING" , 0);
|
||||
showAddr = GetEnvVar("SHOW_ADDR" , 0);
|
||||
outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0);
|
||||
byteOffset = GetEnvVar("BYTE_OFFSET" , 0);
|
||||
numWarmups = GetEnvVar("NUM_WARMUPS" , DEFAULT_NUM_WARMUPS);
|
||||
@@ -77,6 +77,7 @@ public:
|
||||
sharedMemBytes = GetEnvVar("SHARED_MEM_BYTES" , maxSharedMemBytes / 2 + 1);
|
||||
blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
|
||||
usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0);
|
||||
useSingleStream = GetEnvVar("USE_SINGLE_STREAM", 0);
|
||||
|
||||
// Check for fill pattern
|
||||
char* pattern = getenv("FILL_PATTERN");
|
||||
@@ -172,6 +173,11 @@ public:
|
||||
printf("[ERROR] BLOCK_BYTES must be a positive multiple of 4\n");
|
||||
exit(1);
|
||||
}
|
||||
if (useSingleStream && useHipCall)
|
||||
{
|
||||
printf("[ERROR] Single stream mode cannot be used with HIP calls\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
// Display info on the env vars that can be used
|
||||
@@ -184,7 +190,6 @@ public:
|
||||
printf(" USE_SINGLE_SYNC - Perform synchronization only once after all iterations instead of per iteration\n");
|
||||
printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n");
|
||||
printf(" COMBINE_TIMING - Combines timing with launch (potentially lower timing overhead)\n");
|
||||
printf(" SHOW_ADDR - Print out memory addresses for each Link\n");
|
||||
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n");
|
||||
printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n");
|
||||
printf(" NUM_WARMUPS=W - Perform W untimed warmup iteration(s) per test\n");
|
||||
@@ -195,6 +200,7 @@ public:
|
||||
printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n");
|
||||
printf(" BLOCK_BYTES=B - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n");
|
||||
printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n");
|
||||
printf(" USE_SINGLE_STREAM - Use single stream per device instead of per link. Cannot be used with USE_HIP_CALL\n");
|
||||
}
|
||||
|
||||
// Display env var settings
|
||||
@@ -202,7 +208,7 @@ public:
|
||||
{
|
||||
if (!outputToCsv)
|
||||
{
|
||||
printf("Run configuration\n");
|
||||
printf("Run configuration (TransferBench v%s)\n", TB_VERSION);
|
||||
printf("=====================================================\n");
|
||||
printf("%-20s = %12d : Using %s for GPU-executed copies\n", "USE_HIP_CALL", useHipCall,
|
||||
useHipCall ? "HIP functions" : "custom kernels");
|
||||
@@ -220,8 +226,6 @@ public:
|
||||
useInteractive ? "interactive" : "non-interactive");
|
||||
printf("%-20s = %12d : %s\n", "COMBINE_TIMING", combineTiming,
|
||||
combineTiming ? "Using combined timing+launch" : "Using separate timing / launch");
|
||||
printf("%-20s = %12d : %s\n", "SHOW_ADDR", showAddr,
|
||||
showAddr ? "Displaying src/dst mem addresses" : "Not displaying src/dst mem addresses");
|
||||
printf("%-20s = %12d : Output to %s\n", "OUTPUT_TO_CSV", outputToCsv,
|
||||
outputToCsv ? "CSV" : "console");
|
||||
printf("%-20s = %12d : Using byte offset of %d\n", "BYTE_OFFSET", byteOffset, byteOffset);
|
||||
@@ -242,13 +246,13 @@ public:
|
||||
getenv("SHARED_MEM_BYTES") ? "(specified)" : "(unset)", sharedMemBytes);
|
||||
printf("%-20s = %12d : Each CU gets a multiple of %d bytes to copy\n", "BLOCK_BYTES", blockBytes, blockBytes);
|
||||
printf("%-20s = %12d : Using %s-based GPU indexing\n", "USE_PCIE_INDEX", usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP"));
|
||||
printf("%-20s = %12d : Using single stream per %s\n", "USE_SINGLE_STREAM", useSingleStream, (useSingleStream ? "device" : "Link"));
|
||||
printf("\n");
|
||||
}
|
||||
};
|
||||
|
||||
private:
|
||||
// Helper function that gets parses environment variable or sets to default value
|
||||
int GetEnvVar(std::string const varname, int defaultValue)
|
||||
static int GetEnvVar(std::string const varname, int defaultValue)
|
||||
{
|
||||
if (getenv(varname.c_str()))
|
||||
return atoi(getenv(varname.c_str()));
|
||||
|
||||
@@ -36,6 +36,7 @@ GpuCopyKernel(BlockParam* blockParams)
|
||||
int Nrem = blockParams[blockIdx.x].N;
|
||||
float const* src = blockParams[blockIdx.x].src;
|
||||
float* dst = blockParams[blockIdx.x].dst;
|
||||
if (threadIdx.x == 0) blockParams[blockIdx.x].startCycle = __builtin_amdgcn_s_memrealtime();
|
||||
|
||||
// Operate on wavefront granularity
|
||||
int numWaves = BLOCKSIZE / WARP_SIZE; // Number of wavefronts per threadblock
|
||||
@@ -68,30 +69,34 @@ GpuCopyKernel(BlockParam* blockParams)
|
||||
loop1Offset += loop1Inc;
|
||||
}
|
||||
Nrem -= loop1Nelem;
|
||||
if (Nrem == 0) return;
|
||||
|
||||
// 2nd loop - Each thread operates on FLOATS_PER_PACK per iteration
|
||||
int const loop2Npack = Nrem / FLOATS_PER_PACK;
|
||||
int const loop2Nelem = loop2Npack * FLOATS_PER_PACK;
|
||||
int const loop2Inc = BLOCKSIZE;
|
||||
int loop2Offset = threadIdx.x;
|
||||
|
||||
packedSrc = (PackedFloat_t const*)(src + loop1Nelem);
|
||||
packedDst = (PackedFloat_t *)(dst + loop1Nelem);
|
||||
while (loop2Offset < loop2Npack)
|
||||
if (Nrem > 0)
|
||||
{
|
||||
packedDst[loop2Offset] = packedSrc[loop2Offset];
|
||||
loop2Offset += loop2Inc;
|
||||
}
|
||||
Nrem -= loop2Nelem;
|
||||
if (Nrem == 0) return;
|
||||
// 2nd loop - Each thread operates on FLOATS_PER_PACK per iteration
|
||||
int const loop2Npack = Nrem / FLOATS_PER_PACK;
|
||||
int const loop2Nelem = loop2Npack * FLOATS_PER_PACK;
|
||||
int const loop2Inc = BLOCKSIZE;
|
||||
int loop2Offset = threadIdx.x;
|
||||
|
||||
// Deal with leftovers less than FLOATS_PER_PACK)
|
||||
if (threadIdx.x < Nrem)
|
||||
{
|
||||
int offset = loop1Nelem + loop2Nelem + threadIdx.x;
|
||||
dst[offset] = src[offset];
|
||||
packedSrc = (PackedFloat_t const*)(src + loop1Nelem);
|
||||
packedDst = (PackedFloat_t *)(dst + loop1Nelem);
|
||||
while (loop2Offset < loop2Npack)
|
||||
{
|
||||
packedDst[loop2Offset] = packedSrc[loop2Offset];
|
||||
loop2Offset += loop2Inc;
|
||||
}
|
||||
Nrem -= loop2Nelem;
|
||||
|
||||
// Deal with leftovers less than FLOATS_PER_PACK)
|
||||
if (threadIdx.x < Nrem)
|
||||
{
|
||||
int offset = loop1Nelem + loop2Nelem + threadIdx.x;
|
||||
dst[offset] = src[offset];
|
||||
}
|
||||
}
|
||||
|
||||
__threadfence_system();
|
||||
if (threadIdx.x == 0)
|
||||
blockParams[blockIdx.x].stopCycle = __builtin_amdgcn_s_memrealtime();
|
||||
}
|
||||
|
||||
#define MEMSET_UNROLL 8
|
||||
|
||||
@@ -1,12 +1,9 @@
|
||||
# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
HIP_PATH?= $(wildcard /opt/rocm/hip)
|
||||
ifeq (,$(HIP_PATH))
|
||||
HIP_PATH=../../..
|
||||
endif
|
||||
HIPCC=$(HIP_PATH)/bin/hipcc
|
||||
# Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
ROCM_PATH ?= /opt/rocm
|
||||
HIPCC=$(ROCM_PATH)/bin/hipcc
|
||||
|
||||
EXE=TransferBench
|
||||
CXXFLAGS = -O3 -I. -lnuma -L$(HIP_PATH)/../hsa/lib -lhsa-runtime64
|
||||
CXXFLAGS = -O3 -I. -lnuma -L$(ROCM_PATH)/hsa/lib -lhsa-runtime64
|
||||
|
||||
all: $(EXE)
|
||||
|
||||
|
||||
@@ -0,0 +1,14 @@
|
||||
# TransferBench
|
||||
|
||||
TransferBench is a simple utility capable of benchmarking simultaneous copies between user-specified devices (CPUs/GPUs).
|
||||
|
||||
## Requirements
|
||||
|
||||
1. ROCm stack installed on the system (HIP runtime)
|
||||
2. libnuma installed on system
|
||||
|
||||
## Building
|
||||
To build TransferBench:
|
||||
* `make`
|
||||
|
||||
If ROCm is installed in a folder other than `/opt/rocm/`, set ROCM_PATH appropriately
|
||||
Plik diff jest za duży
Load Diff
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -49,15 +49,19 @@ THE SOFTWARE.
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
// Simple configuration parameters
|
||||
size_t const DEFAULT_BYTES_PER_LINK = (1<<26); // Amount of data transferred per Link
|
||||
|
||||
// Different src/dst memory types supported
|
||||
typedef enum
|
||||
{
|
||||
MEM_CPU = 0, // Pinned CPU memory
|
||||
MEM_CPU = 0, // Coarse-grained pinned CPU memory
|
||||
MEM_GPU = 1, // Coarse-grained global GPU memory
|
||||
MEM_GPU_FINE = 2 // Fine-grained global GPU memory
|
||||
MEM_CPU_FINE = 2, // Fine-grained pinned CPU memory
|
||||
MEM_GPU_FINE = 3 // Fine-grained global GPU memory
|
||||
} MemType;
|
||||
|
||||
char const MemTypeStr[4] = "CGF";
|
||||
char const MemTypeStr[5] = "CGBF";
|
||||
|
||||
typedef enum
|
||||
{
|
||||
@@ -68,55 +72,99 @@ typedef enum
|
||||
// Each threadblock copies N floats from src to dst
|
||||
struct BlockParam
|
||||
{
|
||||
int N;
|
||||
float* src;
|
||||
float* dst;
|
||||
int N;
|
||||
float* src;
|
||||
float* dst;
|
||||
long long startCycle;
|
||||
long long stopCycle;
|
||||
};
|
||||
|
||||
// Each Link is a uni-direction operation from a src memory to dst memory executed by a specific GPU
|
||||
// Each Link is a uni-direction operation from a src memory to dst memory
|
||||
struct Link
|
||||
{
|
||||
int linkIndex; // Link identifier
|
||||
|
||||
// Link config
|
||||
MemType exeMemType; // Link executor type (CPU or GPU)
|
||||
int exeIndex; // Executor index (NUMA node for CPU / device ID for GPU)
|
||||
MemType srcMemType; // Source memory type
|
||||
int srcIndex; // Source device index
|
||||
MemType dstMemType; // Destination memory type
|
||||
int dstIndex; // Destination device index
|
||||
int numBlocksToUse; // Number of threadblocks to use for this Link
|
||||
MemType exeMemType; // Link executor type (CPU or GPU)
|
||||
int exeIndex; // Executor index (NUMA node for CPU / device ID for GPU)
|
||||
MemType srcMemType; // Source memory type
|
||||
int srcIndex; // Source device index
|
||||
MemType dstMemType; // Destination memory type
|
||||
int dstIndex; // Destination device index
|
||||
int numBlocksToUse; // Number of threadblocks to use for this Link
|
||||
|
||||
// Link implementation
|
||||
float* srcMem; // Source memory
|
||||
float* dstMem; // Destination memory
|
||||
// Memory
|
||||
float* srcMem; // Source memory
|
||||
float* dstMem; // Destination memory
|
||||
|
||||
hipEvent_t startEvent;
|
||||
hipEvent_t stopEvent;
|
||||
hipStream_t stream;
|
||||
BlockParam* blockParam;
|
||||
// How memory is split across threadblocks / CPU cores
|
||||
std::vector<BlockParam> blockParam;
|
||||
BlockParam* blockParamGpuPtr;
|
||||
|
||||
// Results
|
||||
double linkTime;
|
||||
|
||||
// Prepares src memory and how to divide N elements across threadblocks/threads
|
||||
void PrepareBlockParams(EnvVars const& ev, size_t const N);
|
||||
};
|
||||
|
||||
typedef std::pair<MemType, int> Executor;
|
||||
|
||||
struct ExecutorInfo
|
||||
{
|
||||
std::vector<Link> links; // Links to execute
|
||||
|
||||
// For GPU-Executors
|
||||
int totalBlocks; // Total number of CUs/CPU threads to use
|
||||
BlockParam* blockParamGpu; // Copy of block parameters in GPU device memory
|
||||
std::vector<hipStream_t> streams;
|
||||
std::vector<hipEvent_t> startEvents;
|
||||
std::vector<hipEvent_t> stopEvents;
|
||||
|
||||
// Results
|
||||
double totalTime;
|
||||
};
|
||||
|
||||
void DisplayUsage(char const* cmdName); // Display usage instructions
|
||||
void GenerateConfigFile(char const* cfgFile, int numBlocks); // Generate a sample config file
|
||||
void DisplayTopology(); // Display GPU topology
|
||||
void PopulateTestSizes(size_t const numBytesPerLink, int const samplingFactor, std::vector<size_t>& valuesofN);
|
||||
void ParseMemType(std::string const& token, int const numCpus, int const numGpus, MemType* memType, int* memIndex);
|
||||
void ParseLinks(char* line, int numCpus, int numGpus, std::vector<Link>& links); // Parse Link information
|
||||
typedef std::map<Executor, ExecutorInfo> LinkMap;
|
||||
|
||||
// Display usage instructions
|
||||
void DisplayUsage(char const* cmdName);
|
||||
|
||||
// Display detected GPU topology / CPU numa nodes
|
||||
void DisplayTopology(bool const outputToCsv);
|
||||
|
||||
// Build array of test sizes based on sampling factor
|
||||
void PopulateTestSizes(size_t const numBytesPerLink, int const samplingFactor,
|
||||
std::vector<size_t>& valuesofN);
|
||||
|
||||
void ParseMemType(std::string const& token, int const numCpus, int const numGpus,
|
||||
MemType* memType, int* memIndex);
|
||||
|
||||
void ParseLinks(char* line, int numCpus, int numGpus,
|
||||
LinkMap& linkMap);
|
||||
|
||||
void EnablePeerAccess(int const deviceId, int const peerDeviceId);
|
||||
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, float** memPtr);
|
||||
void DeallocateMemory(MemType memType, float* memPtr);
|
||||
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr);
|
||||
void DeallocateMemory(MemType memType, void* memPtr);
|
||||
void CheckPages(char* byteArray, size_t numBytes, int targetId);
|
||||
void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, std::vector<float> const& fillPattern, float* ptr);
|
||||
void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link);
|
||||
void RunLink(EnvVars const& ev, size_t const N, int const iteration, ExecutorInfo& exeInfo, int const linkIdx);
|
||||
void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse, int readMode, int skipCpu);
|
||||
double GetPeakBandwidth(EnvVars const& ev, size_t N, int isBidirectional,
|
||||
MemType srcMemType, int srcIndex,
|
||||
MemType dstMemType, int dstIndex,
|
||||
int readMode);
|
||||
|
||||
// Return the maximum bandwidth measured for given (src/dst) pair
|
||||
double GetPeakBandwidth(EnvVars const& ev,
|
||||
size_t const N,
|
||||
int const isBidirectional,
|
||||
int const readMode,
|
||||
int const numBlocksToUse,
|
||||
MemType const srcMemType,
|
||||
int const srcIndex,
|
||||
MemType const dstMemType,
|
||||
int const dstIndex);
|
||||
|
||||
std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
|
||||
std::string GetDesc(MemType srcMemType, int srcIndex,
|
||||
MemType dstMemType, int dstIndex);
|
||||
std::string GetLinkDesc(Link const& link);
|
||||
int RemappedIndex(int const origIdx, MemType const memType);
|
||||
int GetWallClockRate(int deviceId);
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# Configfile Format:
|
||||
# ConfigFile Format:
|
||||
# ==================
|
||||
# A Link is defined as a uni-directional transfer from src memory location to dst memory location executed by either CPU or GPU
|
||||
# A Link is defined as a uni-directional transfer from src memory location to dst memory location
|
||||
# executed by either CPU or GPU
|
||||
# Each single line in the configuration file defines a set of Links to run in parallel
|
||||
|
||||
# There are two ways to specify the configuration file:
|
||||
@@ -25,19 +26,22 @@
|
||||
# - G: GPU-executed (Indexed from 0 to 3)
|
||||
# dstMemL : Destination memory location (Where the data is to be written to)
|
||||
|
||||
# Memory locations are specified by a character indicating memory type, followed by device index (0-indexed)
|
||||
# Memory locations are specified by a character indicating memory type,
|
||||
# followed by device index (0-indexed)
|
||||
# Supported memory locations are:
|
||||
# - C: Pinned host memory (on NUMA node, indexed from 0 to 1)
|
||||
# - G: Global device memory (on GPU device indexed from 0 to 3)
|
||||
# - F: Fine-grain device memory (on GPU device indexed from 0 to 3)
|
||||
# - C: Pinned host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1])
|
||||
# - B: Fine-grain host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1])
|
||||
# - G: Global device memory (on GPU device indexed from 0 to [# GPUs - 1])
|
||||
# - F: Fine-grain device memory (on GPU device indexed from 0 to [# GPUs - 1])
|
||||
|
||||
# Examples:
|
||||
# 1 4 (G0->G0->G1) Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 0 and copies it to memory on GPU 1
|
||||
# 1 4 (G1->C0->G0) Single Link that uses 4 CUs on GPU 0 that reads memory from CPU 1 and copies it to memory on GPU 0
|
||||
# 1 4 (C0->G2->G2) Single Link that uses 4 CUs on GPU 2 that reads memory from CPU 0 and copies it to memory on GPU 2
|
||||
# 2 4 G0->G0->G1 G1->G1->G0 Runs 2 Links in parallel. GPU 0 - > GPU1, and GP1 -> GPU 0, each with 4 CUs
|
||||
# -2 (G0 G0 G1 4) (G1 G1 G0 2) Runs 2 Links in parallel. GPU 0 - > GPU 1 using four CUs, and GPU1 -> GPU 0 using two CUs
|
||||
# 1 4 (G0->G0->G1) Single link using 4 CUs on GPU0 to copy from GPU0 to GPU1
|
||||
# 1 4 (C1->G2->G0) Single link using 4 CUs on GPU2 to copy from CPU1 to GPU0
|
||||
# 2 4 G0->G0->G1 G1->G1->G0 Runs 2 Links in parallel. GPU0 to GPU1, and GPU1 to GPU0, each with 4 CUs
|
||||
# -2 (G0 G0 G1 4) (G1 G1 G0 2) Runs 2 Links in parallel. GPU0 to GPU1 with 4 CUs, and GPU1 to GPU0 with 2 CUs
|
||||
|
||||
# Round brackets and arrows' ->' may be included for human clarity, but will be ignored and are unnecessary
|
||||
# Lines starting with # will be ignored. Lines starting with ## will be echoed to output
|
||||
|
||||
# Single GPU-executed link between GPUs 0 and 1 using 4 CUs
|
||||
1 4 (G0->G0->G1)
|
||||
|
||||
Reference in New Issue
Block a user