GDA support for alltoall via rocshmem integration (#2099)

* ROCSHMEM linking/building to match MSCCL++ style

* add rocSHMEM as a submodule

* Move rocSHMEM submodule to ext-src/rocSHMEM

* Adding submodule support proper, as well as a patch for rocshmem

* Cleaning up INCLUDE_DIR vs INCLUDE_DIRS mixup

* updating patch file

* Pointing rocshmem submodule to edgars fixup patch

* Adding IBVERBS link to the submodule build

* More IBVERBS patching

* pin rocshmem submodule to b534423

* Adding IPC support in rocSHMEM build

* updating rocshmem submodule to resolve CQ errors

* Updating submodule to include recent a2a optimizations

* invoke rocshmem alltoall from rccl

* Updating submodule to CQ error number hang

* Updating submodule to include a2a improvements and bug fixes

* Updating submodule to point to Yiltan's fork and doorbell ring removal commit

* Updating hash to correspond with submodule change

* Updating to no-ctx wg call and updating submodule

* copy-in/copy-out using multiples CUs

* Updating rocSHMEM submodule to include doorbell improvs

* updating gitmodule to point to upstream

* code cleanup and adjust threashold

* guard rocshmem a2a invocation

* Only build with rocshmem when specified

* code cleanup

* address review comments

* Removing debugging failure case

Signed-off-by: Thomas Huber <thomas.huber@amd.com>

* whitespace fix

* Adding rocshmem compile guard

* Removing unneccesary comment

Signed-off-by: Thomas Huber <thomas.huber@amd.com>

* remove commented lines

* address review comments

* cleanup

---------

Signed-off-by: Thomas Huber <thomas.huber@amd.com>
Co-authored-by: Thomas Huber <thomas.huber@amd.com>
Co-authored-by: Nusrat Islam <nusislam@dell300x-ccs-aus-k12-27.cs-aus.dcgpu>
Co-authored-by: Nusrat Islam <nusislam@dell300x-ccs-aus-k13-09.cs-aus.dcgpu>
Co-authored-by: Islam <nusislam@amd.com>
Co-authored-by: Nusrat Islam <nusislam@dell300x-ccs-aus-k13-03.cs-aus.dcgpu>
Цей коміт міститься в:
Nusrat Islam
2026-01-09 14:04:54 -06:00
зафіксовано GitHub
джерело 11e0f4445e
коміт 27648b0900
16 змінених файлів з 427 додано та 16 видалено
+12 -1
Переглянути файл
@@ -41,6 +41,7 @@ force_reduce_pipeline=false
generate_sym_kernels=false
warp_speed_enabled=true # note that this flag will be overridden to false for non MI350/MI300 platforms
quiet_warnings=false
build_rocshmem_support=false
# #################################################
# helper functions
@@ -82,6 +83,7 @@ function display_help()
echo " --force-reduce-pipeline Force reduce_copy sw pipeline to be used for every reduce-based collectives and datatypes"
echo " --generate-sym-kernels Generate symmetric memory kernels"
echo " -q|--quiet-warnings Suppress majority of compiler warnings (not recommended)"
echo " --rocshmem Build with rocSHMEM support"
}
# #################################################
@@ -91,7 +93,7 @@ function display_help()
# check if we have a modern version of getopt that can handle whitespace and long parameters
getopt -T
if [[ "$?" -eq 4 ]]; then
GETOPT_PARSE=$(getopt --name "${0}" --options cdfhij:lprtq --longoptions address-sanitizer,dependencies,debug,dump-asm,enable-code-coverage,enable_backtrace,disable-colltrace,disable-msccl-kernel,enable-mscclpp,fast,help,install,jobs:,kernel-resource-use,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,log-trace,openmp-test-enable,roctx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,force-reduce-pipeline,generate-sym-kernels,quiet-warnings,disable-warp-speed,verbose -- "$@")
GETOPT_PARSE=$(getopt --name "${0}" --options cdfhij:lprtq --longoptions address-sanitizer,dependencies,debug,dump-asm,enable-code-coverage,enable_backtrace,disable-colltrace,disable-msccl-kernel,enable-mscclpp,fast,help,install,jobs:,kernel-resource-use,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,log-trace,openmp-test-enable,roctx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,force-reduce-pipeline,generate-sym-kernels,quiet-warnings,disable-warp-speed,verbose,rocshmem -- "$@")
else
echo "Need a new version of getopt"
exit 1
@@ -140,6 +142,7 @@ while true; do
--generate-sym-kernels) generate_sym_kernels=true; shift ;;
--disable-warp-speed) warp_speed_enabled=false; shift ;;
-q | --quiet-warnings) quiet_warnings=true; shift ;;
--rocshmem) build_rocshmem_support=true; shift ;;
--) shift ; break ;;
*) echo "Unexpected command line parameter received; aborting";
exit 1
@@ -329,6 +332,14 @@ if [[ "${quiet_warnings}" == true ]]; then
fi
# Enable rocSHMEM support
if [[ "${build_rocshmem_support}" == true ]]; then
cmake_common_options="${cmake_common_options} -DENABLE_ROCSHMEM=ON"
cmake_common_options="${cmake_common_options} -DROCSHMEM_INSTALL_DIR=${ROCSHMEM_INSTALL_DIR}"
else
cmake_common_options="${cmake_common_options} -DENABLE_ROCSHMEM=OFF"
fi
check_exit_code "$?"
# Enable ninja build for time tracing