diff --git a/Makefile b/Makefile index 87fdc6c2fe..5f93ecb266 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -49,6 +49,7 @@ endif LDFLAGS := -L$(CUDA_HOME)/lib64 -lcudart MPIFLAGS := -I$(MPI_HOME)/include -L$(MPI_HOME)/lib -lmpi +TSTINC := -Ibuild/include -Itest/include .PHONY : lib clean test mpitest install .DEFAULT : lib @@ -65,8 +66,8 @@ MPITESTS := mpi_test INCDIR := $(BUILDDIR)/include LIBDIR := $(BUILDDIR)/lib OBJDIR := $(BUILDDIR)/obj -TSTDIR := $(BUILDDIR)/test -MPITSTDIR := $(BUILDDIR)/mpitest +TSTDIR := $(BUILDDIR)/test/single +MPITSTDIR := $(BUILDDIR)/test/mpi INCTARGETS := $(patsubst %, $(INCDIR)/%, $(INCEXPORTS)) LIBSONAME := $(patsubst %,%.$(VER_MAJOR),$(LIBNAME)) @@ -108,11 +109,11 @@ clean : test : lib $(TESTBINS) -$(TSTDIR)/% : src/%.cu lib +$(TSTDIR)/% : test/single/%.cu lib @printf "Building %-25s > %-24s\n" $< $@ @mkdir -p $(TSTDIR) - @$(NVCC) -Ibuild/include $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) -lcuda -lcurand -lnvToolsExt - @$(NVCC) -M -Ibuild/include $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) -lcuda -lcurand -lnvToolsExt > $(@:%=%.d.tmp) + @$(NVCC) $(TSTINC) $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) -lcuda -lcurand -lnvToolsExt + @$(NVCC) -M $(TSTINC) $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) -lcuda -lcurand -lnvToolsExt > $(@:%=%.d.tmp) @sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%=%.d.tmp) > $(@:%=%.d) @sed -e 's/.*://' -e 's/\\$$//' < $(@:%=%.d.tmp) | fmt -1 | \ sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%=%.d) @@ -120,11 +121,11 @@ $(TSTDIR)/% : src/%.cu lib mpitest : lib $(MPITESTBINS) -$(MPITSTDIR)/% : src/%.cu lib +$(MPITSTDIR)/% : test/mpi/%.cu lib @printf "Building %-25s > %-24s\n" $< $@ @mkdir -p $(MPITSTDIR) - @$(NVCC) $(MPIFLAGS) -Ibuild/include $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) - @$(NVCC) $(MPIFLAGS) -M -Ibuild/include $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) > $(@:%=%.d.tmp) + @$(NVCC) $(MPIFLAGS) $(TSTINC) $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) + @$(NVCC) $(MPIFLAGS) -M $(TSTINC) $(CPPFLAGS) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -Lbuild/lib $(LIBLINK) $(LDFLAGS) > $(@:%=%.d.tmp) @sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%=%.d.tmp) > $(@:%=%.d) @sed -e 's/.*://' -e 's/\\$$//' < $(@:%=%.d.tmp) | fmt -1 | \ sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%=%.d) diff --git a/src/test_utilities.h b/test/include/test_utilities.h similarity index 100% rename from src/test_utilities.h rename to test/include/test_utilities.h diff --git a/src/mpi_test.cu b/test/mpi/mpi_test.cu similarity index 60% rename from src/mpi_test.cu rename to test/mpi/mpi_test.cu index 600228cb6d..87465e56c1 100644 --- a/src/mpi_test.cu +++ b/test/mpi/mpi_test.cu @@ -28,19 +28,23 @@ #include #include +#include #include "nccl.h" #include "mpi.h" #define CUDACHECK(cmd) do { \ - cudaError_t e = cmd; \ - if( e != cudaSuccess ) { \ - printf("Cuda failure %s:%d '%s'\n", \ - __FILE__,__LINE__,cudaGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ + cudaError_t e = cmd; \ + if( e != cudaSuccess ) { \ + printf("Cuda failure %s:%d '%s'\n", \ + __FILE__,__LINE__,cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ } while(false) +#define SIZE 128 +#define NITERS 1 + int main(int argc, char *argv[]) { ncclUniqueId commId; int size, rank; @@ -50,14 +54,18 @@ int main(int argc, char *argv[]) { MPI_Comm_size(MPI_COMM_WORLD, &size); MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (argc < size) { + printf("Usage : %s \n", argv[0]); + } + int gpu = atoi(argv[rank+1]); - printf("MPI Rank %d running on GPU %d\n", rank, gpu); + // We have to set our device before NCCL init CUDACHECK(cudaSetDevice(gpu)); MPI_Barrier(MPI_COMM_WORLD); + // NCCL Communicator creation ncclComm_t comm; - // Let's use rank 0 PID as job ID ncclGetUniqueId(&commId); MPI_Bcast(&commId, NCCL_UNIQUE_ID_BYTES, MPI_CHAR, 0, MPI_COMM_WORLD); ret = ncclCommInitRank(&comm, size, commId, rank); @@ -66,18 +74,48 @@ int main(int argc, char *argv[]) { exit(1); } + // CUDA stream creation + cudaStream_t stream; + cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + + // Initialize input values int *dptr; - CUDACHECK(cudaMalloc(&dptr, 1024*2*sizeof(int))); - int val = rank; - CUDACHECK(cudaMemcpy(dptr, &val, sizeof(int), cudaMemcpyHostToDevice)); + CUDACHECK(cudaMalloc(&dptr, SIZE*2*sizeof(int))); + int *val = (int*) malloc(SIZE*sizeof(int)); + for (int v=0; v