From e228ea8485dcb6afbc3f7c2489f0374fc87ce66b Mon Sep 17 00:00:00 2001 From: "Bhardwaj, Gopesh" Date: Fri, 7 Feb 2025 12:01:31 +0530 Subject: [PATCH] SWDEV-510794 Adding MPI usage with rocprofv3 (#183) * swdev-510794 Adding MPI usage with rocprofv3 * update doc * Fixed build issues * updating doc * doc update * Fixed Typos * csv format * change format to shell [ROCm/rocprofiler-sdk commit: 7821657d65073f1e2d2d2526171adf493da58cf2] --- projects/rocprofiler-sdk/CHANGELOG.md | 1 + .../rocprofiler-sdk/source/docs/_toc.yml.in | 1 + .../docs/how-to/using-rocprofv3-with-mpi.rst | 175 ++++++++++++++++++ .../rocprofiler-sdk/source/docs/index.rst | 1 + 4 files changed, 178 insertions(+) create mode 100644 projects/rocprofiler-sdk/source/docs/how-to/using-rocprofv3-with-mpi.rst diff --git a/projects/rocprofiler-sdk/CHANGELOG.md b/projects/rocprofiler-sdk/CHANGELOG.md index a86f43cc4d..581c035814 100644 --- a/projects/rocprofiler-sdk/CHANGELOG.md +++ b/projects/rocprofiler-sdk/CHANGELOG.md @@ -154,6 +154,7 @@ Full documentation for ROCprofiler-SDK is available at [rocm.docs.amd.com/projec - Added deprecation notice for rocprofiler(v1) and rocprofiler(v2). - Added support for rocDecode API Tracing - Added usage documentation for ROCTx +- Added usage documentation for MPI applications ### Changed diff --git a/projects/rocprofiler-sdk/source/docs/_toc.yml.in b/projects/rocprofiler-sdk/source/docs/_toc.yml.in index aa5a096abd..1d97d7a8de 100644 --- a/projects/rocprofiler-sdk/source/docs/_toc.yml.in +++ b/projects/rocprofiler-sdk/source/docs/_toc.yml.in @@ -14,6 +14,7 @@ subtrees: - file: how-to/using-rocprofv3 - file: how-to/using-rocprofiler-sdk-roctx - file: how-to/samples + - file: how-to/using-rocprofv3-with-mpi - caption: API reference entries: - file: api-reference/buffered_services diff --git a/projects/rocprofiler-sdk/source/docs/how-to/using-rocprofv3-with-mpi.rst b/projects/rocprofiler-sdk/source/docs/how-to/using-rocprofv3-with-mpi.rst new file mode 100644 index 0000000000..4bbe55ceab --- /dev/null +++ b/projects/rocprofiler-sdk/source/docs/how-to/using-rocprofv3-with-mpi.rst @@ -0,0 +1,175 @@ +.. meta:: + :description: Documentation of the mpi usage for rocprofv3 + :keywords: ROCprofiler-SDK tool, mpirun, rocprofv3, rocprofv3 tool usage, mpich, ROCprofiler-SDK command line tool, ROCprofiler-SDK CLI + + +.. _using-rocprofv3-with-mpi: + +Using rocprofv3 with MPI ++++++++++++++++++++++++++ + +Message Passing Interface (MPI) is a standardized and portable message-passing system designed to function on a wide variety of parallel computing architectures. MPI is widely used for developing parallel applications and is considered the de facto standard for communication in high-performance computing (HPC) environments. +MPI applications are parallel applications that run across multiple processes, which can be distributed over one or more nodes. + +For ``MPI`` applications or other job launchers such as ``SLURM``, place ``rocprofv3`` inside the job launcher. The following example demonstrates how to use ``rocprofv3`` with MPI: + +.. code-block:: bash + + mpirun -n 4 rocprofv3 --hip-trace -- + +The above command runs the application with `rocprofv3` and generates the trace file for each rank. The trace files are prefixed with the process ID. + +.. code-block:: bash + + 2293213_agent_info.csv + 2293213_hip_api_trace.csv + 2293214_agent_info.csv + 2293214_hip_api_trace.csv + 2293212_agent_info.csv + 2293212_hip_api_trace.csv + 2293215_agent_info.csv + 2293215_hip_api_trace.csv + +Since we do the data collection in-process, it is ideal to be in the process(es) launched by ``MPI``. Outside of ``mpirun``, the tool library is loaded into the ``mpirun`` executable. +It will ideally work but you will get agent info for the ``mpirun`` process too. Example: + +.. code-block:: bash + + rocprofv3 --hip-trace -d %h.%p.%env{OMPI_COMM_WORLD_RANK}% -- mpirun -n 2 + +In the above example, an extra agent info file is generated for the ``mpirun`` process. The trace files are prefixed with the hostname, process ID, and the MPI rank. + +.. code-block:: bash + + 3000020_agent_info.csv + 3000019_agent_info.csv + 3000020_hip_api_trace.csv + 3000019_hip_api_trace.csv + 3164458_agent_info.csv + +`ROCTx` annotations +=================== +For an MPI application, you can use `ROCTx` annotations to mark the start and end of the MPI code region. The following example demonstrates how to use `ROCTx` annotations with MPI: + +.. code-block:: cpp + + #include + #include + ... + + void run(int rank, int tid, int dev_id, int argc, char** argv) + { + auto roctx_run_id = roctxRangeStart("run"); + + const auto mark = [rank, tid, dev_id](std::string_view suffix) { + auto _ss = std::stringstream{}; + _ss << "run/rank-" << rank << "/thread-" << tid << "/device-" << dev_id << "/" << suffix; + roctxMark(_ss.str().c_str()); + }; + + mark("begin"); + + constexpr unsigned int M = 4960 * 2; + constexpr unsigned int N = 4960 * 2; + + unsigned long long nitr = 0; + unsigned long long nsync = 0; + + if(argc > 2) nitr = atoll(argv[2]); + if(argc > 3) nsync = atoll(argv[3]); + + hipStream_t stream = {}; + + printf("[transpose] Rank %i, thread %i assigned to device %i\n", rank, tid, dev_id); + HIP_API_CALL(hipSetDevice(dev_id)); + HIP_API_CALL(hipStreamCreate(&stream)); + + auto_lock_t _lk{print_lock}; + std::cout << "[transpose][" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl; + _lk.unlock(); + + std::default_random_engine _engine{std::random_device{}() * (rank + 1) * (tid + 1)}; + std::uniform_int_distribution _dist{0, 1000}; + + ... + + auto t1 = std::chrono::high_resolution_clock::now(); + for(size_t i = 0; i < nitr; ++i) + { + roctxRangePush("run/iteration"); + transpose<<>>(in, out, M, N); + check_hip_error(); + if(i % nsync == (nsync - 1)) + { + roctxRangePush("run/iteration/sync"); + HIP_API_CALL(hipStreamSynchronize(stream)); + roctxRangePop(); + } + roctxRangePop(); + } + auto t2 = std::chrono::high_resolution_clock::now(); + HIP_API_CALL(hipStreamSynchronize(stream)); + HIP_API_CALL(hipMemcpyAsync(out_matrix, out, size, hipMemcpyDeviceToHost, stream)); + double time = std::chrono::duration_cast>(t2 - t1).count(); + float GB = (float) size * nitr * 2 / (1 << 30); + + print_lock.lock(); + std::cout << "[transpose][" << rank << "][" << tid << "] Runtime of transpose is " << time + << " sec\n"; + std::cout << "[transpose][" << rank << "][" << tid + << "] The average performance of transpose is " << GB / time << " GBytes/sec" + << std::endl; + print_lock.unlock(); + + ... + + mark("end"); + + roctxRangeStop(roctx_run_id); + } + +This gives you output similar to the following: + +.. code-block:: shell + + "MARKER_CORE_API","run/rank-0/thread-0/device-0/begin",2936128,2936128,5,432927100747635,432927100747635 + "MARKER_CORE_API","run/rank-0/thread-1/device-1/begin",2936128,2936397,7,432927100811475,432927100811475 + "MARKER_CORE_API","run/iteration",2936128,2936397,22,432928615598809,432928648197081 + "MARKER_CORE_API","run/iteration",2936128,2936397,61,432928648229081,432928648234041 + "MARKER_CORE_API","run/iteration",2936128,2936397,67,432928648234701,432928648239621 + "MARKER_CORE_API","run/iteration",2936128,2936397,73,432928648239971,432928648244141 + "MARKER_CORE_API","run/iteration/sync",2936128,2936397,84,432928648249791,432928664871094 + ... + + "MARKER_CORE_API","run/iteration",2936128,2936128,6313,432929397644269,432929397648369 + "MARKER_CORE_API","run/iteration/sync",2936128,2936128,6324,432929397653119,432929401455250 + "MARKER_CORE_API","run/iteration",2936128,2936128,6319,432929397648779,432929401455640 + "MARKER_CORE_API","run/rank-0/thread-1/device-1/end",2936128,2936397,6339,432929527301990,432929527301990 + "MARKER_CORE_API","run",2936128,2936397,6,432927100787035,432929527313480 + "MARKER_CORE_API","run/rank-0/thread-0/device-0/end",2936128,2936128,6342,432929612438185,432929612438185 + "MARKER_CORE_API","run",2936128,2936128,4,432927100729745,432929612448285 + +Output Format Features: +======================= +To use ``rocprofv3`` to collect the profiles of the individual MPI processes, you must tell ``rocprofv3`` to send its output to unique files. +This is done using the following placeholders: + +Output directory option supports following placeholders: +- %hostname%: Hostname of the machine +- %pid%: Process ID +- %env{USER}% - Consistent with other output key formats (start+end with %) +- $ENV{USER} - Similar to CMake +- %q{USER}% - Compatibility with NVIDIA + +.. code-block:: bash + + mpirun -n 2 rocprofv3 --hip-trace -d %h.%p.%env{OMPI_COMM_WORLD_RANK}% -- + +Assuming the hostname is `ubuntu-latest`, the process ID is `3000020` and `3000019`, the output file names are: + +.. code-block:: bash + + ubuntu-latest.3000020.1/ubuntu-latest/3000020_agent_info.csv + ubuntu-latest.3000019.0/ubuntu-latest/3000019_agent_info.csv + ubuntu-latest.3000020.1/ubuntu-latest/3000020_hip_api_trace.csv + ubuntu-latest.3000019.0/ubuntu-latest/3000019_hip_api_trace.csv diff --git a/projects/rocprofiler-sdk/source/docs/index.rst b/projects/rocprofiler-sdk/source/docs/index.rst index 6d185afdf0..cbc9ab49ea 100644 --- a/projects/rocprofiler-sdk/source/docs/index.rst +++ b/projects/rocprofiler-sdk/source/docs/index.rst @@ -34,6 +34,7 @@ The documentation is structured as follows: * :ref:`using-rocprofv3` * :ref:`using-rocprofiler-sdk-roctx` * :doc:`Samples ` + * :ref:`using-rocprofv3-with-mpi` .. grid-item-card:: API reference