Increase number of rings with XGMI connection

Improve throughput by about 20%. Also disable P2P over PCIe, which was left enabled at the initial release. Signed-off-by: Wenkai Du <wenkai.du@amd.com> [ROCm/rccl commit: f45566a8bd]
2019-05-24 20:58:51 +00:00
parent 7b4b3e2981
commit 5fdf2edd39
@@ -119,9 +119,9 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin
    link_status_print_once_mask |= (1 << (myInfo->cudaDev*8 + peerInfo->cudaDev));
  }
  if (link_type != HSA_AMD_LINK_INFO_TYPE_XGMI) {
-  // enable below lines on release only: disable PCIe P2P until HDP flush is implemented.
-  //  p2p = 0;
-  //  return ncclSuccess;
+    // disable PCIe P2P until HDP flush is implemented.
+    p2p = 0;
+    return ncclSuccess;
  }
  int nvlinkp2p = 0;
  if (link_type == HSA_AMD_LINK_INFO_TYPE_XGMI && hops == 1)
@@ -290,7 +290,11 @@ int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrin
  }

  // Duplicate the rings for direct NVLink
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+  compNrings = copyRings(nranks, rings, compNrings, compNrings*3);
+#else
  compNrings = copyRings(nranks, rings, compNrings, compNrings*2);
+#endif

  if (ncclCudaCompCap() == 6) *nthreads /= 2;
  return compNrings;