[RcclReplayer] Compile without the need for RCCL to be compiled (#2039)

This commit is contained in:
Kapil S. Pawar
2025-11-10 15:38:48 -06:00
کامیت شده توسط GitHub
والد 05f914c997
کامیت acdafac49f
5فایلهای تغییر یافته به همراه34 افزوده شده و 5 حذف شده
-1
مشاهده پرونده
@@ -5,7 +5,6 @@
#include <vector>
#include <mutex>
#include <chrono>
#include "debug.h"
namespace rccl
{
+1
مشاهده پرونده
@@ -7,6 +7,7 @@
#include <string>
#include <iomanip>
#include <sys/syscall.h>
#include "debug.h"
using namespace std::chrono;
+1 -1
مشاهده پرونده
@@ -2,7 +2,7 @@ ROCM_DIR ?= /opt/rocm
RCCL_DIR ?= ../../build/release
MPI_DIR ?= /opt/ompi
INCLUDES = -I$(MPI_DIR)/include -I$(RCCL_DIR)/include -I$(RCCL_DIR)/hipify/src/include
INCLUDES = -I$(MPI_DIR)/include -I$(RCCL_DIR)/include -I../../src/include
LDFLAGS = -L$(MPI_DIR)/lib -L$(RCCL_DIR) -lmpi -lrccl
main: rcclReplayer.cpp
+1 -1
مشاهده پرونده
@@ -73,7 +73,7 @@ Replayer is a separate tool which aims to re-run the same set of RCCL calls as r
## Installation
* Replayer relies on MPI for out of band communication.
* Under `rccl/tools/RcclReplayer`, run `MPI_DIR=${MPI_PATH} make`
* Replayer has to be built from RCCL source. Furthermore, it requires the RCCL library to be built from the same source in `../../build/release`. For compatibility reasons, it is recommended that the logs are collected using the same RCCL library as well.
* Replayer can be built from RCCL source using internal headers directly. It links against an RCCL library specified by `RCCL_DIR` (defaults to `../../build/release`). For compatibility, it is recommended that logs are collected using the same RCCL library version.
## Running
* Replayer requires the exact same number of processes and processes per node as the recorded job. All log files must also be accessible by all processes in Replayer, either through a shared filesystem or through copies.
* To run Replayer, simply call `mpirun -np ${np} ./rcclReplayer ${filename}.${extension}`
@@ -5,13 +5,17 @@
#include <unordered_map>
#include <chrono>
#include <cstring>
#include <iostream>
#include <rccl/rccl.h>
#include <hip/hip_bfloat16.h>
#include "hip/hip_fp16.h"
#include "rccl_float8.h"
#include "info.h"
// Forward declaration for ncclInfo
// - recorder.h declares functions that take 'const ncclInfo&' as parameter
// - These functions are only used during recording (by recorder.cc), not during replay
// - RcclReplayer only uses rcclApiCall struct
struct ncclInfo;
#include "recorder.h"
// NOTE: Parsing is based on this line logging collective information in enqueue.cc
@@ -63,6 +67,31 @@ struct DeviceGraphInfo
int counter = 0;
};
// ncclTypeSize() - local copy of the helper from collectives.h, kept here so
// the replayer does not have to pull in info.h/collectives.h and their
// dependencies.
//
// Maps an ncclDataType_t to the width of one element in bytes:
//   1 for the 8-bit integer and fp8 types,
//   2 for the 16-bit floating-point types,
//   4 for the 32-bit integer and floating-point types,
//   8 for the 64-bit integer and floating-point types.
// Any value not listed below yields -1.
static inline int ncclTypeSize(ncclDataType_t type) {
  int elemBytes = -1;  // result for any type not handled below

  switch (type) {
    // 8-bit element types
    case ncclInt8:
    case ncclUint8:
    case ncclFloat8e4m3:
    case ncclFloat8e5m2:
      elemBytes = 1;
      break;

    // 16-bit element types
    case ncclFloat16:
    case ncclBfloat16:
      elemBytes = 2;
      break;

    // 32-bit element types
    case ncclInt32:
    case ncclUint32:
    case ncclFloat32:
      elemBytes = 4;
      break;

    // 64-bit element types
    case ncclInt64:
    case ncclUint64:
    case ncclFloat64:
      elemBytes = 8;
      break;

    default:
      break;
  }

  return elemBytes;
}
class Replayer
{
private: