diff --git a/doc/PERFORMANCE.md b/doc/PERFORMANCE.md index 21fef60..63522aa 100644 --- a/doc/PERFORMANCE.md +++ b/doc/PERFORMANCE.md @@ -130,6 +130,16 @@ Similarly to broadcast, all data need to be sent to the root, hence : And : `B = S/t` + +### Bisection + +In the bisection operation, each rank is paired with a single other rank and sends him one message of size S, hence: + +t = S/B + +And : + +B = S/t ### Summary @@ -140,5 +150,6 @@ To obtain a bus bandwidth which should be independent of the number of ranks _n_ * AllGather : (_n_-1)/_n_ * Broadcast : 1 * Reduce : 1 +* Bisection : 1 The bus bandwidth should reflect the speed of the hardware bottleneck : NVLink, PCI, QPI, or network. diff --git a/src/Makefile b/src/Makefile index 393de8e..a2cb85b 100644 --- a/src/Makefile +++ b/src/Makefile @@ -76,7 +76,7 @@ NVLDFLAGS += $(LIBRARIES:%=-l%) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) -BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv hypercube +BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv hypercube bisection BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) build: ${BIN_FILES} diff --git a/src/bisection.cu b/src/bisection.cu new file mode 100644 index 0000000..9f43072 --- /dev/null +++ b/src/bisection.cu @@ -0,0 +1,121 @@ +/************************************************************************* + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "common.h" + +void BisectionGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = count; + *recvcount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = count; +} + +int getPeer(int rank, int n_ranks){ + if (n_ranks % 4 == 0) + return ((n_ranks / 2 + rank) % n_ranks) + (rank % 2 ? -1 : 1); + // If there is an odd number of ranks, the last rank is ignored and paired with itself + else if (n_ranks % 2 == 1 && rank == n_ranks-1) + return rank; + else + return (rank + n_ranks/2) % (n_ranks - n_ranks % 2); +} + +testResult_t BisectionInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + CUDACHECK(cudaSetDevice(args->gpus[i])); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0)); + int peer = getPeer(rank, nranks); + TESTCHECK(InitData(args->expected[i], recvcount, peer*recvcount, type, ncclSum, rep, 1, 0)); + CUDACHECK(cudaDeviceSynchronize()); + } + // We don't support in-place sendrecv + args->reportErrors = in_place ? 0 : 1; + return testSuccess; +} + +void BisectionGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + *busBw = *algBw = (double)(count * typesize) / 1.0E9 / sec; +} + +testResult_t BisectionRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + int n_ranks, comm_rank, peer; + + NCCLCHECK(ncclCommUserRank(comm, &comm_rank)); + NCCLCHECK(ncclCommCount(comm, &n_ranks)); + + peer = getPeer(comm_rank, n_ranks); + + NCCLCHECK(ncclGroupStart()); + NCCLCHECK(ncclSend(sendbuff, count, type, peer, comm, stream)); + NCCLCHECK(ncclRecv(recvbuff, count, type, peer, comm, stream)); + NCCLCHECK(ncclGroupEnd()); + + return testSuccess; +} + +struct testColl bisectionTest = { + "Bisection", + BisectionGetCollByteCount, + BisectionInitData, + BisectionGetBw, + BisectionRunColl +}; + +void BisectionGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + BisectionGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t BisectionRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &bisectionTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; i