Fix crash during shared memory creation (#185)

The shared memory filename was only based on the destination. While
this was OK for rings since only one rank would send data to a given
rank, it would crash with trees because they communicate in both
directions.

Co-authored-by: Rong Ou <rong.ou@gmail.com>
Этот коммит содержится в:
Rong Ou
2019-03-04 11:42:47 -08:00
коммит произвёл Sylvain Jeaugey
родитель 1450d42675
Коммит 14e0cf644b
+24 -23
Просмотреть файл
@@ -12,17 +12,11 @@
#include <unistd.h>
#include <cuda_runtime.h>
struct shmSendConnectInfo {
struct shmConnectInfo {
uint64_t pidHash;
int id;
int rank;
int shmSize;
};
struct shmRecvConnectInfo {
uint64_t pidHash;
int id;
int rank;
int sendRank;
int recvRank;
int shmSize;
};
@@ -141,17 +135,21 @@ ncclResult_t shmSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
NCCLCHECK(ncclCalloc(&resources, 1));
send->transportResources = resources;
struct shmRecvConnectInfo info;
struct shmConnectInfo info;
info.id = channelId;
info.pidHash = myInfo->pidHash;
info.sendRank = myInfo->rank;
info.recvRank = peerInfo->rank;
char shmName[MAX_SHM_NAME_LEN];
sprintf(shmName, "nccl-shm-send-%lx-%d-%d", myInfo->pidHash, channelId, myInfo->rank);
sprintf(shmName, "nccl-shm-send-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank);
info.shmSize = resources->shmSize = sizeof(struct ncclSendMem);
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
info.id = channelId; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big");
memcpy(connectInfo, &info, sizeof(struct shmRecvConnectInfo));
static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big");
memcpy(connectInfo, &info, sizeof(struct shmConnectInfo));
return ncclSuccess;
}
@@ -160,28 +158,31 @@ ncclResult_t shmRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
NCCLCHECK(ncclCalloc(&resources, 1));
recv->transportResources = resources;
struct shmSendConnectInfo info;
struct shmConnectInfo info;
info.id = channelId;
info.pidHash = myInfo->pidHash;
info.sendRank = peerInfo->rank;
info.recvRank = myInfo->rank;
char shmName[MAX_SHM_NAME_LEN];
sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", myInfo->pidHash, channelId, myInfo->rank);
sprintf(shmName, "nccl-shm-recv-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank);
info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+buffSize;
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
info.id = channelId; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Send Info is too big");
memcpy(connectInfo, &info, sizeof(struct shmSendConnectInfo));
static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Send Info is too big");
memcpy(connectInfo, &info, sizeof(struct shmConnectInfo));
return ncclSuccess;
}
/* Connect to this peer */
ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
// Setup device pointers
struct shmSendConnectInfo* info = (struct shmSendConnectInfo*)connectInfo;
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
struct shmSendResources* resources = (struct shmSendResources*)send->transportResources;
char shmName[MAX_SHM_NAME_LEN];
sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", info->pidHash, info->id, info->rank);
sprintf(shmName, "nccl-shm-recv-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank);
resources->remShmSize = info->shmSize;
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
@@ -202,10 +203,10 @@ ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
// Setup device pointers
struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
struct shmRecvConnectInfo* info = (struct shmRecvConnectInfo*)connectInfo;
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
char shmName[MAX_SHM_NAME_LEN];
sprintf(shmName, "nccl-shm-send-%lx-%d-%d", info->pidHash, info->id, info->rank);
sprintf(shmName, "nccl-shm-send-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank);
resources->remShmSize = info->shmSize;
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));