NVIDIA · x41lakazam · Apr 4, 2024 · AddyLaddy · May 27, 2024 · x41lakazam
diff --git a/doc/PERFORMANCE.md b/doc/PERFORMANCE.md
@@ -130,6 +130,16 @@ Similarly to broadcast, all data need to be sent to the root, hence :
 And :
 
  `B = S/t`
+
+### Bisection
+
+In the bisection operation, each rank is paired with exactly one other rank and sends it a single message of size S, hence :
+
+ `t = S/B`
+
+And :
+
+ `B = S/t`
 
 ### Summary
 
@@ -140,5 +150,6 @@ To obtain a bus bandwidth which should be independent of the number of ranks _n_
 * AllGather : (_n_-1)/_n_
 * Broadcast : 1
 * Reduce : 1
+* Bisection : 1
 
 The bus bandwidth should reflect the speed of the hardware bottleneck : NVLink, PCI, QPI, or network.
diff --git a/src/Makefile b/src/Makefile
@@ -76,7 +76,7 @@ NVLDFLAGS += $(LIBRARIES:%=-l%)
 DST_DIR := $(BUILDDIR)
 SRC_FILES := $(wildcard *.cu)
 OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
-BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv hypercube
+BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv hypercube bisection
 BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf)
 
 build: ${BIN_FILES}

diff --git a/src/bisection.cu b/src/bisection.cu
@@ -0,0 +1,121 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+
+void BisectionGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+  // Send, receive and parameter counts all equal `count`; nranks does not influence the result.
+  *sendcount = *recvcount = *paramcount = count;
+  // Both in-place offsets are reported as zero.
+  *sendInplaceOffset = 0;
+  *recvInplaceOffset = 0;
+}
+
+int getPeer(int rank, int n_ranks){
+    // Returns the rank paired with `rank`; for 0 <= rank < n_ranks the pairing is its own inverse.
+    const int half = n_ranks / 2;
+    // Multiple of 4: jump half-way around, then move to the adjacent rank (+1 from even ranks, -1 from odd ones).
+    if (n_ranks % 4 == 0) return ((half + rank) % n_ranks) + ((rank % 2 != 0) ? -1 : 1);
+    // If there is an odd number of ranks, the last rank is ignored and paired with itself
+    if (n_ranks % 2 == 1 && rank == n_ranks - 1) return rank;
+    return (rank + half) % (n_ranks - n_ranks % 2);  // otherwise: half-way around the largest even rank count
+}
+
+testResult_t BisectionInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  const size_t nSend = args->sendBytes / wordSize(type);
+  const size_t nRecv = args->expectedBytes / wordSize(type);
+  const int totalRanks = args->nProcs*args->nThreads*args->nGpus;
+  const int firstRank = (args->proc*args->nThreads + args->thread)*args->nGpus;  // rank of this thread's first GPU
+  for (int gpu = 0; gpu < args->nGpus; ++gpu) {
+    CUDACHECK(cudaSetDevice(args->gpus[gpu]));
+    const int myRank = firstRank + gpu;
+    CUDACHECK(cudaMemset(args->recvbuffs[gpu], 0, args->expectedBytes));
+    // In-place runs initialise the receive buffer instead of the send buffer.
+    void* src = in_place ? args->recvbuffs[gpu] : args->sendbuffs[gpu];
+    TESTCHECK(InitData(src, nSend, myRank*nSend, type, ncclSum, rep, 1, 0));
+    const int peerRank = getPeer(myRank, totalRanks);
+    TESTCHECK(InitData(args->expected[gpu], nRecv, peerRank*nRecv, type, ncclSum, rep, 1, 0));  // expected = the peer's pattern
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+  args->reportErrors = in_place ? 0 : 1;  // in-place send/recv is not supported, so error reporting is off for it
+  return testSuccess;
+}
+
+void BisectionGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {  // nranks is unused
+  *busBw = *algBw = (double)(count * typesize) / 1.0E9 / sec;  // count*typesize scaled by 1e9 per sec; bus and algorithm bandwidth are equal
+}
+
+testResult_t BisectionRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+  // Query this rank and the communicator size, then derive the partner rank from them.
+  int myRank, commSize, partner;
+  NCCLCHECK(ncclCommUserRank(comm, &myRank));
+  NCCLCHECK(ncclCommCount(comm, &commSize));
+  partner = getPeer(myRank, commSize);
+
+  // The send to and the receive from the partner are issued inside a single NCCL group.
+  NCCLCHECK(ncclGroupStart());
+  NCCLCHECK(ncclSend(sendbuff, count, type, partner, comm, stream));
+  NCCLCHECK(ncclRecv(recvbuff, count, type, partner, comm, stream));
+  NCCLCHECK(ncclGroupEnd());
+
+  return testSuccess;
+}
+
+struct testColl bisectionTest = {  // callback table; installed as args->collTest in BisectionRunTest
+  "Bisection",
+  BisectionGetCollByteCount,
+  BisectionInitData,
+  BisectionGetBw,
+  BisectionRunColl
+};
+
+void BisectionGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+  size_t unusedParam, unusedSendOff, unusedRecvOff;  // outputs this wrapper discards
+  BisectionGetCollByteCount(sendcount, recvcount, &unusedParam, &unusedSendOff, &unusedRecvOff, count, nranks);
+}
+
+testResult_t BisectionRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &bisectionTest;
+
+  // Default to sweeping every entry of the global type table...
+  ncclDataType_t *types = test_types;
+  const char **typeNames = test_typenames;
+  int nTypes = test_typenum;
+  // ...unless one specific type was requested (-1 selects the whole table).
+  if ((int)type != -1) {
+    types = &type;
+    typeNames = &typeName;
+    nTypes = 1;
+  }
+
+  // Same selection for the reduction operator: whole table by default...
+  ncclRedOp_t *ops = test_ops;
+  const char **opNames = test_opnames;
+  int nOps = test_opnum;
+  // ...or just the single operator that was requested.
+  if ((int)op != -1) {
+    ops = &op;
+    opNames = &opName;
+    nOps = 1;
+  }
+
+  // Time every selected (type, op) combination; the last TimeTest argument is fixed to -1.
+  for (int t = 0; t < nTypes; ++t) {
+    for (int o = 0; o < nOps; ++o) {
+      TESTCHECK(TimeTest(args, types[t], typeNames[t], ops[o], opNames[o], -1));
+    }
+  }
+
+  return testSuccess;
+}
+
+struct testEngine bisectionEngine = {  // aliased to ncclTestEngine by the #pragma weak at the end of this file
+  BisectionGetBuffSize,
+  BisectionRunTest
+};
+
+#pragma weak ncclTestEngine=bisectionEngine