From a6d621176cbfd8e91f78cb4e915d2de553734b84 Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Thu, 11 Jun 2020 13:02:45 +0800 Subject: [PATCH] Sender rank's opCount may be ahead by one if it finishes earlier --- src/collectives/device/primitives.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h index 023004cb3e..ce5c83c3e9 100644 --- a/src/collectives/device/primitives.h +++ b/src/collectives/device/primitives.h @@ -113,7 +113,7 @@ class ncclPrimitives { if (mismatch) { // In non-LL, we use _threadfence_system before incrementing opCount, yet we are still waiting for credits here, so there must be a size mismatch STORE(comm->fatalDevError, ncclDevAssertedMismatch); - } else if (conn && LOAD(conn->opCountRem) > opCount) { + } else if (conn && LOAD(conn->opCountRem) > opCount+1) { mismatch += 1; } }