diff --git a/README.md b/README.md index 475c1957ed..32edc232f4 100644 --- a/README.md +++ b/README.md @@ -99,33 +99,7 @@ RCCL package install requires sudo/root access because it installs under `/opt/r ## Docker build -Assuming you have docker installed on your system: - -#### To build the docker image : - -By default, the given Dockerfile uses `docker.io/rocm/dev-ubuntu-22.04:latest` as the base docker image, and then installs RCCL (develop branch) and RCCL-Tests (develop branch). -```shell -$ docker build -t rccl-tests -f Dockerfile.ubuntu --pull . -``` - -The base docker image, rccl repo, and rccl-tests repo can be modified using `--build-args` in the `docker build` command above. E.g., to use a different base docker image: -```shell -$ docker build -t rccl-tests -f Dockerfile.ubuntu --build-arg="ROCM_IMAGE_NAME=rocm/dev-ubuntu-20.04" --build-arg="ROCM_IMAGE_TAG=6.2" --pull . -``` - -#### To start an interactive docker container on a system with AMD GPUs : - -```shell -$ docker run -it --rm --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --network=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined rccl-tests /bin/bash -``` - -#### To run rccl-tests (all_reduce_perf) on 8 AMD GPUs (inside the docker container) : - -```shell -$ mpirun --allow-run-as-root -np 8 --mca pml ucx --mca btl ^openib -x NCCL_DEBUG=VERSION /workspace/rccl-tests/build/all_reduce_perf -b 1 -e 16G -f 2 -g 1 -``` - -For more information on rccl-tests options, refer to the [Usage](https://github.com/ROCm/rccl-tests#usage) section of rccl-tests. 
+Refer to [docker/README.md](docker/README.md "docker/README.md") ## Tests diff --git a/docker/Dockerfile.ubuntu b/docker/Dockerfile.ubuntu index 1866ba1694..7d87152ef0 100644 --- a/docker/Dockerfile.ubuntu +++ b/docker/Dockerfile.ubuntu @@ -11,10 +11,13 @@ ARG RCCL_BRANCH=develop ARG RCCL_TESTS_REPO=https://github.com/ROCm/rccl-tests ARG RCCL_TESTS_BRANCH=develop +## AMD GPU Targets +ARG GPU_TARGETS=gfx942 ## creating scratch space -RUN mkdir -p /workspace -WORKDIR /workspace +ENV WORKDIR /workspace +RUN mkdir -p ${WORKDIR} +WORKDIR ${WORKDIR} ## install dependencies RUN apt-get update \ @@ -55,6 +58,8 @@ RUN apt-get update \ python3-dev \ python3-tk \ python3-yaml \ + vim \ + less \ && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* @@ -64,6 +69,9 @@ RUN wget https://github.com/Kitware/CMake/releases/download/v3.28.0/cmake-3.28.0 && bash ./cmake-3.28.0-linux-x86_64.sh --prefix=/usr --exclude-subdir --skip-license \ && rm cmake-3.28.0-linux-x86_64.sh +## Set ROCm path +ENV ROCM_PATH=/opt/rocm + ## Install UCX ENV UCX_INSTALL_PREFIX=/opt/ucx RUN wget https://github.com/openucx/ucx/releases/download/v1.16.0/ucx-1.16.0.tar.gz \ @@ -72,7 +80,7 @@ RUN wget https://github.com/openucx/ucx/releases/download/v1.16.0/ucx-1.16.0.tar && cd ucx \ && mkdir build \ && cd build \ - && ../configure --prefix=${UCX_INSTALL_PREFIX} --with-rocm=/opt/rocm \ + && ../configure --prefix=${UCX_INSTALL_PREFIX} --with-rocm=${ROCM_PATH} \ && make -j16 install \ && cd ../.. 
\ && rm -rf ucx ucx-1.16.0.tar.gz @@ -92,18 +100,25 @@ RUN wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.6.tar.g ## building RCCL -ENV RCCL_INSTALL_PREFIX=/opt/rocm -RUN git clone --recurse-submodules -b "${RCCL_BRANCH}" "${RCCL_REPO}" ./rccl \ - && cd ./rccl \ - && ./install.sh -t --prefix=${RCCL_INSTALL_PREFIX} +ENV RCCL_INSTALL_PREFIX=${WORKDIR}/rccl_develop/build/release +RUN git clone --recurse-submodules -b "${RCCL_BRANCH}" "${RCCL_REPO}" ./rccl_develop \ + && cd ./rccl_develop \ + && ./install.sh --amdgpu_targets=${GPU_TARGETS} ## building RCCL-Tests RUN git clone -b "${RCCL_TESTS_BRANCH}" "${RCCL_TESTS_REPO}" ./rccl-tests \ && cd ./rccl-tests \ - && make MPI=1 MPI_HOME=${MPI_INSTALL_PREFIX} NCCL_HOME=${RCCL_INSTALL_PREFIX} -j16 + && mkdir build \ + && cd build \ + && CXX=${ROCM_PATH}/bin/amdclang++ MPI_HOME=${MPI_INSTALL_PREFIX} cmake -DCMAKE_BUILD_TYPE=Release -DUSE_MPI=ON -DAMDGPU_TARGETS=${GPU_TARGETS} .. \ + && make -j16 ## set environment variables -ENV PATH="${RCCL_INSTALL_PREFIX}/bin:${MPI_INSTALL_PREFIX}/bin:${PATH}" -ENV LD_LIBRARY_PATH="${RCCL_INSTALL_PREFIX}/lib:${MPI_INSTALL_PREFIX}/lib:${LD_LIBRARY_PATH}" +ENV PATH="${MPI_INSTALL_PREFIX}/bin:${ROCM_PATH}/bin:${PATH}" +ENV LD_LIBRARY_PATH="${RCCL_INSTALL_PREFIX}:${MPI_INSTALL_PREFIX}/lib:${ROCM_PATH}/lib:${LD_LIBRARY_PATH}" +ENV UCX_WARN_UNUSED_ENV_VARS=n +ENV OMPI_ALLOW_RUN_AS_ROOT=1 +ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 +ENV NCCL_DEBUG=VERSION diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 0000000000..30bf58aed4 --- /dev/null +++ b/docker/README.md @@ -0,0 +1,42 @@ +# Using RCCL/RCCL-Tests in a docker environment + +## Docker build + +Assuming you have docker installed on your system: + +### To build the docker image : + +By default, the given Dockerfile uses `docker.io/rocm/dev-ubuntu-22.04:latest` as the base docker image, and then installs RCCL (develop branch) and RCCL-Tests (develop branch), targeting `gfx942` GPUs.
+```shell +$ docker build -t rccl-tests -f Dockerfile.ubuntu --pull . +``` + +The base docker image, rccl repo, rccl-tests repo, and GPU targets can be modified using `--build-arg` in the `docker build` command above. E.g., to use a different base docker image for the MI250 GPU: +```shell +$ docker build -t rccl-tests -f Dockerfile.ubuntu --build-arg="ROCM_IMAGE_NAME=rocm/dev-ubuntu-20.04" --build-arg="ROCM_IMAGE_TAG=6.2" --build-arg="GPU_TARGETS=gfx90a" --pull . +``` + +### To start an interactive docker container on a system with AMD GPUs : + +```shell +$ docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --network=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -it rccl-tests /bin/bash +``` + +### To run rccl-tests (all\_reduce\_perf) on 8 AMD GPUs (inside the docker container) : + +If using ROCm 6.3.x or earlier +```shell +$ mpirun --allow-run-as-root -np 8 --mca pml ucx --mca btl ^openib -x NCCL_DEBUG=VERSION -x HSA_NO_SCRATCH_RECLAIM=1 /workspace/rccl-tests/build/all_reduce_perf -b 1 -e 16G -f 2 -g 1 +``` + +If using ROCm 6.4.0 or later +```shell +$ mpirun --allow-run-as-root -np 8 --mca pml ucx --mca btl ^openib -x NCCL_DEBUG=VERSION /workspace/rccl-tests/build/all_reduce_perf -b 1 -e 16G -f 2 -g 1 +``` + +For more information on rccl-tests options, refer to the [Usage](https://github.com/ROCm/rccl-tests#usage) section of rccl-tests. + + +## Copyright + +All modifications are copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved. diff --git a/docs/install/docker-install.rst b/docs/install/docker-install.rst index 3b0c780bb3..32e65c97ee 100644 --- a/docs/install/docker-install.rst +++ b/docs/install/docker-install.rst @@ -14,7 +14,7 @@ To build the Docker image and run the container, follow these steps. #. Build the Docker image By default, the Dockerfile uses ``docker.io/rocm/dev-ubuntu-22.04:latest`` as the base Docker image.
- It then installs RCCL and rccl-tests (in both cases, it uses the version from the RCCL ``develop`` branch). + It then installs RCCL and rccl-tests (in both cases, it uses the version from the ``develop`` branch). Use this command to build the Docker image: @@ -22,24 +22,30 @@ To build the Docker image and run the container, follow these steps. docker build -t rccl-tests -f Dockerfile.ubuntu --pull . - The base Docker image, rccl repository, and rccl-tests repository can be modified - by using ``--build-args`` in the ``docker build`` command above. For example, to use a different base Docker image, + The base Docker image, rccl repository, rccl-tests repository, and GPU targets can be modified + by using ``--build-arg`` in the ``docker build`` command above. For example, to use a different base Docker image for the MI250 GPU, use this command: .. code-block:: shell - docker build -t rccl-tests -f Dockerfile.ubuntu --build-arg="ROCM_IMAGE_NAME=rocm/dev-ubuntu-20.04" --build-arg="ROCM_IMAGE_TAG=6.2" --pull . + docker build -t rccl-tests -f Dockerfile.ubuntu --build-arg="ROCM_IMAGE_NAME=rocm/dev-ubuntu-20.04" --build-arg="ROCM_IMAGE_TAG=6.2" --build-arg="GPU_TARGETS=gfx90a" --pull . #. Launch an interactive Docker container on a system with AMD GPUs: .. code-block:: shell - docker run -it --rm --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --network=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined rccl-tests /bin/bash + docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --network=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -it rccl-tests /bin/bash To run, for example, the ``all_reduce_perf`` test from rccl-tests on 8 AMD GPUs from inside the Docker container, use this command: +If using ROCm 6.3.x or earlier +..
code-block:: shell + + mpirun --allow-run-as-root -np 8 --mca pml ucx --mca btl ^openib -x NCCL_DEBUG=VERSION -x HSA_NO_SCRATCH_RECLAIM=1 /workspace/rccl-tests/build/all_reduce_perf -b 1 -e 16G -f 2 -g 1 + +If using ROCm 6.4.0 or later .. code-block:: shell mpirun --allow-run-as-root -np 8 --mca pml ucx --mca btl ^openib -x NCCL_DEBUG=VERSION /workspace/rccl-tests/build/all_reduce_perf -b 1 -e 16G -f 2 -g 1 -For more information on the rccl-tests options, see the `Usage guidelines <https://github.com/ROCm/rccl-tests#usage>`_ in the GitHub repository. \ No newline at end of file +For more information on the rccl-tests options, see the `Usage guidelines <https://github.com/ROCm/rccl-tests#usage>`_ in the GitHub repository.