From 5272cd16efeeb5012f17e5541a39af9de6aa9eae Mon Sep 17 00:00:00 2001 From: amd-jiali Date: Thu, 11 Dec 2025 14:00:29 -0800 Subject: [PATCH] Fix Out of Memory issue when allocating bias buffer (#160) * Add an argument to select whether the performance test runs with bias; with bias, the maximum memory usage is recalculated and the data size is reduced to avoid the Out of Memory issue; without bias, no bias buffers need to be allocated * Remove the argument option for bias; memory calculation and buffer allocation are now determined by the executable name. --------- Co-authored-by: Li --- src/common.cu | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/common.cu b/src/common.cu index f9ae4caf58..eff266bb8f 100644 --- a/src/common.cu +++ b/src/common.cu @@ -151,6 +151,9 @@ static int local_register = 0; #endif static int minCudaArch = 1<<30; +// Test bias +static int test_bias = 0; + Reporter::Reporter(std::string fileName, std::string outputFormat) : _outputFormat(outputFormat) { if (!fileName.empty()) { if (isMainThread()) { @@ -1546,8 +1549,10 @@ testResult_t run() { // Reserve 1GiB of memory for each 16GiB installed, but limit to a max of 4GiB const size_t GB = (1ULL << 30); size_t reserveMem = std::min(DIVUP(maxMem, 16*GB) * 1*GB, 4*GB); - // We need sendbuff, recvbuff, expected (when datacheck enabled), plus 1G for the rest. - size_t memMaxBytes = (maxMem - reserveMem - 1*GB) / (datacheck ? 3 : 2); + // If the program is all_reduce_bias, enable bias + if (strcmp(program_invocation_short_name, "all_reduce_bias_perf") == 0) test_bias = 1; + // We need sendbuff, recvbuff, expected (when datacheck enabled), bias (when bias enabled), plus 1G for the rest. + size_t memMaxBytes = (maxMem - reserveMem - 1*GB) / (datacheck ? (test_bias ? 4 : 3) : (test_bias ? 3 : 2)); if (maxBytes > memMaxBytes) { maxBytes = memMaxBytes; if (minBytes > maxBytes) minBytes = maxBytes; @@ -1578,7 +1583,11 @@ testResult_t run() { for (int i=0; i