From da93542c40b9a7eb60ea69ca577173cbd3abc84b Mon Sep 17 00:00:00 2001 From: Brandon Potter Date: Tue, 6 Aug 2024 14:32:14 -0700 Subject: [PATCH 01/18] Add simple fine test --- tests/unit_tests/CMakeLists.txt | 1 + .../unit_tests/ipc_impl_simple_fine_gtest.cpp | 1030 +++++++++++++++++ .../unit_tests/ipc_impl_simple_fine_gtest.hpp | 236 ++++ 3 files changed, 1267 insertions(+) create mode 100644 tests/unit_tests/ipc_impl_simple_fine_gtest.cpp create mode 100644 tests/unit_tests/ipc_impl_simple_fine_gtest.hpp diff --git a/tests/unit_tests/CMakeLists.txt b/tests/unit_tests/CMakeLists.txt index 349bb2c2bf..b47da1c154 100644 --- a/tests/unit_tests/CMakeLists.txt +++ b/tests/unit_tests/CMakeLists.txt @@ -90,6 +90,7 @@ target_sources( free_list_gtest.cpp #context_ipc_gtest.cpp ipc_impl_simple_coarse_gtest.cpp + ipc_impl_simple_fine_gtest.cpp ) ############################################################################### diff --git a/tests/unit_tests/ipc_impl_simple_fine_gtest.cpp b/tests/unit_tests/ipc_impl_simple_fine_gtest.cpp new file mode 100644 index 0000000000..1c4c409f59 --- /dev/null +++ b/tests/unit_tests/ipc_impl_simple_fine_gtest.cpp @@ -0,0 +1,1030 @@ +/****************************************************************************** + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#include "ipc_impl_simple_fine_gtest.hpp" + +using namespace rocshmem; + +TEST_F(IPCImplSimpleFineTestFixture, ptr_check) { + ASSERT_NE(heap_mem_.get_ptr(), nullptr); +} + +TEST_F(IPCImplSimpleFineTestFixture, MPI_num_pes) { + ASSERT_EQ(mpi_.num_pes(), 2); +} + +TEST_F(IPCImplSimpleFineTestFixture, IPC_bases) { + for(int i{0}; i < mpi_.num_pes(); i++) { + ASSERT_NE(ipc_impl_.ipc_bases[i], nullptr); + } +} + +TEST_F(IPCImplSimpleFineTestFixture, golden_1048576_int) { + iota_golden(1048576); + validate_golden(1048576); +} + +//============================================================================= + +TEST_F(IPCImplSimpleFineTestFixture, write_wg_1x1x1_1024x1x1_32_int) { + dim3 grid {1,1,1}; + dim3 block {1024,1,1}; + write_wg(grid, block, 32); +} + +//============================================================================= + +TEST_F(IPCImplSimpleFineTestFixture, read_wg_1x1x1_1024x1x1_32_int) { + dim3 grid {1,1,1}; + dim3 block {1024,1,1}; + read_wg(grid, block, 32); +} + +//============================================================================= + +TEST_F(IPCImplSimpleFineTestFixture, write_wg_1x1x1_1x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {1,1,1}; + write_wg(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wg_1x1x1_2x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {2,1,1}; + write_wg(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wg_1x1x1_4x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {4,1,1}; + write_wg(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wg_1x1x1_8x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {8,1,1}; + write_wg(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wg_1x1x1_16x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {16,1,1}; + write_wg(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wg_1x1x1_32x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {32,1,1}; + write_wg(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wg_1x1x1_64x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {64,1,1}; + write_wg(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wg_1x1x1_128x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {128,1,1}; + write_wg(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wg_1x1x1_256x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {256,1,1}; + write_wg(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wg_1x1x1_512x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {512,1,1}; + write_wg(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wg_1x1x1_768x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {768,1,1}; + write_wg(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wg_1x1x1_1024x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {1024,1,1}; + write_wg(grid, block, 1048576); +} + +//============================================================================= + +TEST_F(IPCImplSimpleFineTestFixture, read_wg_1x1x1_1x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {1,1,1}; + read_wg(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wg_1x1x1_2x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {2,1,1}; + read_wg(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wg_1x1x1_4x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {4,1,1}; + read_wg(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wg_1x1x1_8x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {8,1,1}; + read_wg(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wg_1x1x1_16x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {16,1,1}; + read_wg(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wg_1x1x1_32x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {32,1,1}; + read_wg(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wg_1x1x1_64x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {64,1,1}; + read_wg(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wg_1x1x1_128x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {128,1,1}; + read_wg(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wg_1x1x1_256x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {256,1,1}; + read_wg(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wg_1x1x1_512x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {512,1,1}; + read_wg(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wg_1x1x1_768x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {768,1,1}; + read_wg(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wg_1x1x1_1024x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {1024,1,1}; + read_wg(grid, block, 1048576); +} + +//============================================================================= + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_64x1x1_1_int) { + dim3 grid {1,1,1}; + dim3 block {64,1,1}; + write_wave(grid, block, 1); +} + +//============================================================================= + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_64x1x1_1_int) { + dim3 grid {1,1,1}; + dim3 block {64,1,1}; + read_wave(grid, block, 1); +} + +//============================================================================= + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_64x1x1_32_int) { + dim3 grid {1,1,1}; + dim3 block {64,1,1}; + write_wave(grid, block, 32); +} + +//============================================================================= + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_64x1x1_32_int) { + dim3 grid {1,1,1}; + dim3 block {64,1,1}; + read_wave(grid, block, 32); +} + +//============================================================================= + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_1x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {1,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_2x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {2,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_3x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {3,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_4x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {4,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_5x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {5,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_6x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {6,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_7x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {7,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_8x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {8,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_9x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {9,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_10x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {10,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_11x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {11,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_12x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {12,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_13x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {13,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_14x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {14,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_15x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {15,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_16x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {16,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_17x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {17,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_18x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {18,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_19x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {19,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_20x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {20,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_21x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {21,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_22x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {22,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_23x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {23,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_24x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {24,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_25x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {25,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_26x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {26,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_27x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {27,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_28x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {28,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_29x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {29,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_30x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {30,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_31x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {31,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_32x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {32,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_33x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {33,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_34x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {34,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_35x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {35,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_36x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {36,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_37x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {37,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_38x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {38,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_39x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {39,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_40x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {40,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_41x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {41,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_42x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {42,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_43x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {43,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_44x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {44,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_45x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {45,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_46x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {46,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_47x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {47,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_48x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {48,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_49x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {49,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_50x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {50,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_51x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {51,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_52x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {52,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_53x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {53,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_54x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {54,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_55x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {55,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_56x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {56,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_57x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {57,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_58x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {58,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_59x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {59,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_60x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {60,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_61x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {61,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_62x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {62,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_63x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {63,1,1}; + write_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, write_wave_1x1x1_64x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {64,1,1}; + write_wave(grid, block, 1048576); +} + +//============================================================================= + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_1x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {1,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_2x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {2,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_3x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {3,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_4x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {4,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_5x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {5,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_6x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {6,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_7x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {7,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_8x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {8,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_9x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {9,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_10x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {10,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_11x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {11,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_12x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {12,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_13x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {13,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_14x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {14,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_15x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {15,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_16x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {16,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_17x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {17,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_18x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {18,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_19x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {19,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_20x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {20,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_21x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {21,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_22x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {22,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_23x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {23,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_24x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {24,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_25x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {25,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_26x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {26,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_27x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {27,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_28x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {28,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_29x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {29,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_30x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {30,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_31x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {31,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_32x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {32,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_33x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {33,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_34x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {34,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_35x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {35,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_36x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {36,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_37x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {37,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_38x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {38,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_39x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {39,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_40x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {40,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_41x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {41,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_42x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {42,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_43x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {43,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_44x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {44,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_45x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {45,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_46x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {46,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_47x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {47,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_48x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {48,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_49x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {49,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_50x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {50,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_51x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {51,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_52x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {52,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_53x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {53,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_54x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {54,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_55x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {55,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_56x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {56,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_57x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {57,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_58x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {58,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_59x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {59,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_60x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {60,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_61x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {61,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_62x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {62,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_63x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {63,1,1}; + read_wave(grid, block, 1048576); +} + +TEST_F(IPCImplSimpleFineTestFixture, read_wave_1x1x1_64x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {64,1,1}; + read_wave(grid, block, 1048576); +} + +//============================================================================= + +TEST_F(IPCImplSimpleFineTestFixture, write_1x1x1_1x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {1,1,1}; + write(grid, block, 1048576); +} + +//============================================================================= + +TEST_F(IPCImplSimpleFineTestFixture, read_1x1x1_1x1x1_1048576_int) { + dim3 grid {1,1,1}; + dim3 block {1,1,1}; + read(grid, block, 1048576); +} + + diff --git a/tests/unit_tests/ipc_impl_simple_fine_gtest.hpp b/tests/unit_tests/ipc_impl_simple_fine_gtest.hpp new file mode 100644 index 0000000000..3f85528e7d --- /dev/null +++ b/tests/unit_tests/ipc_impl_simple_fine_gtest.hpp @@ -0,0 +1,236 @@ +/****************************************************************************** + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#ifndef ROCSHMEM_IPC_IMPL_SIMPLE_FINE_GTEST_HPP +#define ROCSHMEM_IPC_IMPL_SIMPLE_FINE_GTEST_HPP + +#include "gtest/gtest.h" + +#include + +#include +#include "../src/memory/symmetric_heap.hpp" +#include "../src/ipc/ipc_policy.hpp" + +namespace rocshmem { + +__global__ +void +kernel_simple_fine_copy(IpcImpl *ipc_impl, int *src, int *dest, size_t bytes) { + if (!threadIdx.x) { + ipc_impl->ipcCopy(dest, src, bytes); + ipc_impl->ipcFence(); + } + __syncthreads(); +} + +__global__ +void +kernel_simple_fine_copy_wg(IpcImpl *ipc_impl, int *src, int *dest, size_t bytes) { + ipc_impl->ipcCopy_wg(dest, src, bytes); + ipc_impl->ipcFence(); + __syncthreads(); +} + +__global__ +void +kernel_simple_fine_copy_wave(IpcImpl *ipc_impl, int *src, int *dest, size_t bytes) { + ipc_impl->ipcCopy_wave(dest, src, bytes); + ipc_impl->ipcFence(); + __syncthreads(); +} + +class IPCImplSimpleFineTestFixture : public ::testing::Test { + + using HEAP_T = HeapMemory; + + using MPI_T = RemoteHeapInfo; + + using FN_T = void (*)(IpcImpl*, int*, int*, size_t); + + public: + IPCImplSimpleFineTestFixture() { + ipc_impl_.ipcHostInit(mpi_.my_pe(), mpi_.get_heap_bases() , MPI_COMM_WORLD); + + assert(ipc_impl_dptr_ == nullptr); + hip_allocator_.allocate((void**)&ipc_impl_dptr_, sizeof(IpcImpl)); + + CHECK_HIP(hipMemcpy(ipc_impl_dptr_, &ipc_impl_, + sizeof(IpcImpl), hipMemcpyHostToDevice)); + } + + ~IPCImplSimpleFineTestFixture() { + if (ipc_impl_dptr_) { + hip_allocator_.deallocate(ipc_impl_dptr_); + } + + ipc_impl_.ipcHostStop(); + } + + void launch(FN_T f, const dim3 grid, const dim3 block, int* src, int* dest, size_t bytes) { + f<<>>(ipc_impl_dptr_, src, dest, bytes); + CHECK_HIP(hipStreamSynchronize(nullptr)); + } + + enum TestType { + READ = 0, + WRITE = 1 + }; + + void write(const dim3 grid, const dim3 block, size_t elems) { + iota_golden(elems); + initialize_src_buffer(WRITE); + copy(WRITE, grid, block); + validate_dest_buffer(WRITE); + } + + void write_wg(const dim3 grid, const dim3 block, size_t elems) { + iota_golden(elems); + initialize_src_buffer(WRITE); + copy_wg(WRITE, grid, block); + validate_dest_buffer(WRITE); + } + + void write_wave(const dim3 grid, const dim3 block, size_t elems) { + iota_golden(elems); + initialize_src_buffer(WRITE); + copy_wave(WRITE, grid, block); + validate_dest_buffer(WRITE); + } + + void read(const dim3 grid, const dim3 block, size_t elems) { + iota_golden(elems); + initialize_src_buffer(READ); + copy(READ, grid, block); + validate_dest_buffer(READ); + } + + void read_wg(const dim3 grid, const dim3 block, size_t elems) { + iota_golden(elems); + initialize_src_buffer(READ); + copy_wg(READ, grid, block); + validate_dest_buffer(READ); + } + + void read_wave(const dim3 grid, const dim3 block, size_t elems) { + iota_golden(elems); + initialize_src_buffer(READ); + copy_wave(READ, grid, block); + validate_dest_buffer(READ); + } + + void iota_golden(size_t elems) { + golden_.resize(elems); + std::iota(golden_.begin(), golden_.end(), 0); + } + + void validate_golden(size_t elems) { + ASSERT_EQ(golden_.size(), elems); + for (int i{0}; i < golden_.size(); i++) { + ASSERT_EQ(golden_[i], i); + } + } + + void initialize_src_buffer(TestType test) { + if (!pe_initializes_src_buffer(test)) { + return; + } + size_t bytes = golden_.size() * sizeof(int); + auto dev_src = reinterpret_cast(ipc_impl_.ipc_bases[mpi_.my_pe()]); + CHECK_HIP(hipMemcpy(dev_src, golden_.data(), bytes, hipMemcpyHostToDevice)); + CHECK_HIP(hipStreamSynchronize(nullptr)); + } + + bool pe_initializes_src_buffer(TestType test) { + bool is_write_test = test; + bool is_read_test = !test; + return (is_write_test && mpi_.my_pe() == 0) || + (is_read_test && mpi_.my_pe() == 1); + } + + void execute(TestType test, FN_T fn, const dim3 grid, const dim3 block) { + if (mpi_.my_pe()) { + mpi_.barrier(); + mpi_.barrier(); + return; + } + int *src{nullptr}; + int *dest{nullptr}; + if (test == WRITE) { + src = reinterpret_cast(ipc_impl_.ipc_bases[0]); + dest = reinterpret_cast(ipc_impl_.ipc_bases[1]); + } else { + src = reinterpret_cast(ipc_impl_.ipc_bases[1]); + dest = reinterpret_cast(ipc_impl_.ipc_bases[0]); + } + size_t bytes = golden_.size() * sizeof(int); + mpi_.barrier(); + launch(fn, grid, block, src, dest, bytes); + mpi_.barrier(); + } + + void copy(TestType test, dim3 grid, dim3 block) { + execute(test, kernel_simple_fine_copy, grid, block); + } + + void copy_wg(TestType test, dim3 grid, dim3 block) { + execute(test, kernel_simple_fine_copy_wg, grid, block); + } + + void copy_wave(TestType test, dim3 grid, dim3 block) { + execute(test, kernel_simple_fine_copy_wave, grid, block); + } + + void validate_dest_buffer(TestType test) { + if (!pe_validates_dest_buffer(test)) { + return; + } + + auto dev_dest = reinterpret_cast(ipc_impl_.ipc_bases[mpi_.my_pe()]); + for (int i{0}; i < golden_.size(); i++) { + ASSERT_EQ(golden_[i], dev_dest[i]); + } + } + + bool pe_validates_dest_buffer(TestType test) { + return !pe_initializes_src_buffer(test); + } + + protected: + std::vector golden_; + + std::vector output_; + + HEAP_T heap_mem_ {}; + + MPI_T mpi_ {heap_mem_.get_ptr(), heap_mem_.get_size()}; + + IpcImpl ipc_impl_ {}; + + IpcImpl *ipc_impl_dptr_ {nullptr}; + + HIPAllocator hip_allocator_ {}; +}; + +} // namespace rocshmem + +#endif // ROCSHMEM_IPC_IMPL_SIMPLE_FINE_GTEST_HPP From c4b7e0d91becd8bc20c4b3eefac6146f76d2a82f Mon Sep 17 00:00:00 2001 From: Brandon Potter Date: Wed, 7 Aug 2024 11:03:18 -0700 Subject: [PATCH 02/18] Partial notifier --- scripts/build_configs/ro_net | 2 +- src/atomic.hpp | 131 +++++++++++++++++++++++++++++++++++ src/memory/notifier.hpp | 25 +++---- 3 files changed, 140 insertions(+), 18 deletions(-) create mode 100644 src/atomic.hpp diff --git a/scripts/build_configs/ro_net b/scripts/build_configs/ro_net index 17809fa0a9..47690bb2d9 100755 --- a/scripts/build_configs/ro_net +++ b/scripts/build_configs/ro_net @@ -24,4 +24,4 @@ cmake \ -DUSE_COHERENT_HEAP=ON \ $src_path cmake --build . --parallel 8 -cmake --install . +#cmake --install . diff --git a/src/atomic.hpp b/src/atomic.hpp new file mode 100644 index 0000000000..85a140f75a --- /dev/null +++ b/src/atomic.hpp @@ -0,0 +1,131 @@ +/****************************************************************************** + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + *****************************************************************************/ + +#ifndef LIBRARY_SRC_ATOMIC_HPP +#define LIBRARY_SRC_ATOMIC_HPP + +namespace rocshmem { +namespace detail { +namespace atomic { + +typedef enum rocshmem_memory_scope { + memory_scope_thread = __HIP_MEMORY_SCOPE_SINGLETHREAD, + memory_scope_wavefront = __HIP_MEMORY_SCOPE_WAVEFRONT, + memory_scope_workgroup = __HIP_MEMORY_SCOPE_WORKGROUP, + memory_scope_agent = __HIP_MEMORY_SCOPE_AGENT, + memory_scope_system = __HIP_MEMORY_SCOPE_SYSTEM, +} rocshmem_memory_scope; + +typedef enum rocshmem_memory_order { + memory_order_relaxed = __ATOMIC_RELAXED, + memory_order_consume = __ATOMIC_CONSUME, + memory_order_acquire = __ATOMIC_ACQUIRE, + memory_order_release = __ATOMIC_RELEASE, + memory_order_acq_rel = __ATOMIC_ACQ_REL, + memory_order_seq_cst = __ATOMIC_SEQ_CST +} rocshmem_memory_order; + +template +__host__ __device__ +T load(const T* address, rocshmem_memory_order order) { + return __hip_atomic_load(address, order, Scope); +} + +template +__host__ __device__ +void store(const T value, const T* address, rocshmem_memory_order order) { + return __hip_atomic_store(value, address, order, Scope); +} + +template +__host__ __device__ +bool compare_exchange_weak(T& expected, T desired, rocshmem_memory_order success, + rocshmem_memory_order failure) { + return __hip_atomic_compare_exchange_weak(expected, desired, success, failure, Scope); +} + +template +__host__ __device__ +bool compare_exchange_strong(T& expected, T desired, rocshmem_memory_order success, + rocshmem_memory_order failure) { + return __hip_atomic_compare_exchange_strong(expected, desired, success, failure, Scope); +} + +template +__host__ __device__ +T fetch_add(T* obj, U arg, rocshmem_memory_order order) { + return __hip_atomic_fetch_add(obj, arg, order, Scope); +} + +template +__host__ __device__ +T fetch_sub(T* obj, U arg, rocshmem_memory_order order) { + return __hip_atomic_fetch_sub(obj, arg, order, Scope); +} + +template +__host__ __device__ +T fetch_and(T* obj, U arg, rocshmem_memory_order order) { + return __hip_atomic_fetch_and(obj, arg, order, Scope); +} + +template +__host__ __device__ +T fetch_or(T* obj, U arg, rocshmem_memory_order order) { + return __hip_atomic_fetch_or(obj, arg, order, Scope); +} + +template +__host__ __device__ +T fetch_xor(T* obj, U arg, rocshmem_memory_order order) { + return __hip_atomic_fetch_xor(obj, arg, order, Scope); +} + +template +__host__ __device__ +T fetch_max(T* obj, U arg, rocshmem_memory_order order) { + return __hip_atomic_fetch_max(obj, arg, order, Scope); +} + +template +__host__ __device__ +T fetch_min(T* obj, U arg, rocshmem_memory_order order) { + return __hip_atomic_fetch_min(obj, arg, order, Scope); +} + +template +__device__ +void thread_fence([[maybe_unused]] rocshmem_memory_order order) { + if constexpr (Scope == memory_scope_system) { + __threadfence_system(); + } else if constexpr (Scope == memory_scope_agent) { + __threadfence(); + } else if constexpr (Scope == memory_scope_workgroup) { + __threadfence_block(); + } +} + +} // namespace atomic +} // namespace detail +} // namespace rocshmem + +#endif // LIBRARY_SRC_ATOMIC_HPP_ diff --git a/src/memory/notifier.hpp b/src/memory/notifier.hpp index 0d8cd92f2f..d398110e9a 100644 --- a/src/memory/notifier.hpp +++ b/src/memory/notifier.hpp @@ -20,30 +20,21 @@ * IN THE SOFTWARE. *****************************************************************************/ -/** - * @file notifier.hpp - * - * @brief Contains the notification memory space for threads to communicate - * results with one another. - * - * Assume one thread does work on behalf of other threads (as a leader) and - * that work needs to be communicated to the other threads as a result. - * To expose the result, the threads need to share a memory space where the - * result can be written. The leader thread writes the result out to this - * memory space and all threads synchronize on it. - * - * This class allows the leader thread to notify other threads of the update. - */ - #ifndef LIBRARY_SRC_MEMORY_NOTIFIER_HPP_ #define LIBRARY_SRC_MEMORY_NOTIFIER_HPP_ #include "../device_proxy.hpp" #include "../util.hpp" +#include "../atomic.hpp" namespace rocshmem { +template class Notifier { +}; + +template +class Notifier { public: __device__ uint64_t read() { return value_; } @@ -67,9 +58,9 @@ class Notifier { uint64_t value_{}; }; -template +template class NotifierProxy { - using ProxyT = DeviceProxy; + using ProxyT = DeviceProxy, 1>; public: __host__ __device__ Notifier* get() { return proxy_.get(); } From 0c53a075f2faaf5717ac12d32473b59b68160c81 Mon Sep 17 00:00:00 2001 From: Brandon Potter Date: Mon, 12 Aug 2024 11:29:31 -0700 Subject: [PATCH 03/18] Fix problems with Notifier --- src/atomic.hpp | 87 ++++++++++++++++------------- src/memory/notifier.hpp | 23 ++++---- src/memory/slab_heap.hpp | 2 +- tests/unit_tests/notifier_gtest.hpp | 4 +- 4 files changed, 61 insertions(+), 55 deletions(-) diff --git a/src/atomic.hpp b/src/atomic.hpp index 85a140f75a..7b1ce9300a 100644 --- a/src/atomic.hpp +++ b/src/atomic.hpp @@ -44,82 +44,91 @@ typedef enum rocshmem_memory_order { memory_order_seq_cst = __ATOMIC_SEQ_CST } rocshmem_memory_order; -template +struct rocshmem_memory_orders { + rocshmem_memory_order load {memory_order_acquire}; + rocshmem_memory_order store {memory_order_release}; + rocshmem_memory_order fence {memory_order_acq_rel}; + rocshmem_memory_order atomic {memory_order_acq_rel}; + rocshmem_memory_order weak_cas_success {memory_order_acq_rel}; + rocshmem_memory_order weak_cas_failure {memory_order_acq_rel}; + rocshmem_memory_order strong_cas_success {memory_order_acq_rel}; + rocshmem_memory_order strong_cas_failure {memory_order_acq_rel}; +}; + +template __host__ __device__ -T load(const T* address, rocshmem_memory_order order) { - return __hip_atomic_load(address, order, Scope); +T load(const T* address, rocshmem_memory_orders o) { + return __hip_atomic_load(address, o.load, s); } -template +template __host__ __device__ -void store(const T value, const T* address, rocshmem_memory_order order) { - return __hip_atomic_store(value, address, order, Scope); +void store(T* address, const T value, rocshmem_memory_orders o) { + return __hip_atomic_store(address, value, o.store, s); } -template +template __host__ __device__ -bool compare_exchange_weak(T& expected, T desired, rocshmem_memory_order success, - rocshmem_memory_order failure) { - return __hip_atomic_compare_exchange_weak(expected, desired, success, failure, Scope); +bool compare_exchange_weak(T& expected, T desired, rocshmem_memory_orders o) { + return __hip_atomic_compare_exchange_weak(expected, desired, o.weak_cas_success, o.weak_cas_failure, s); } -template +template __host__ __device__ -bool compare_exchange_strong(T& expected, T desired, rocshmem_memory_order success, - rocshmem_memory_order failure) { - return __hip_atomic_compare_exchange_strong(expected, desired, success, failure, Scope); +bool compare_exchange_strong(T& expected, T desired, rocshmem_memory_orders o) { + return __hip_atomic_compare_exchange_strong(expected, desired, o.strong_cas_success, o.strong_cas_failure, s); } -template +template __host__ __device__ -T fetch_add(T* obj, U arg, rocshmem_memory_order order) { - return __hip_atomic_fetch_add(obj, arg, order, Scope); +T fetch_add(T* obj, U arg, rocshmem_memory_orders o) { + return __hip_atomic_fetch_add(obj, arg, o.atomic, s); } -template +template __host__ __device__ -T fetch_sub(T* obj, U arg, rocshmem_memory_order order) { - return __hip_atomic_fetch_sub(obj, arg, order, Scope); +T fetch_sub(T* obj, U arg, rocshmem_memory_orders o) { + return __hip_atomic_fetch_sub(obj, arg, o.atomic, s); } -template +template __host__ __device__ -T fetch_and(T* obj, U arg, rocshmem_memory_order order) { - return __hip_atomic_fetch_and(obj, arg, order, Scope); +T fetch_and(T* obj, U arg, rocshmem_memory_orders o) { + return __hip_atomic_fetch_and(obj, arg, o.atomic, s); } -template +template __host__ __device__ -T fetch_or(T* obj, U arg, rocshmem_memory_order order) { - return __hip_atomic_fetch_or(obj, arg, order, Scope); +T fetch_or(T* obj, U arg, rocshmem_memory_orders o) { + return __hip_atomic_fetch_or(obj, arg, o, s); } -template +template __host__ __device__ -T fetch_xor(T* obj, U arg, rocshmem_memory_order order) { - return __hip_atomic_fetch_xor(obj, arg, order, Scope); +T fetch_xor(T* obj, U arg, rocshmem_memory_orders o) { + return __hip_atomic_fetch_xor(obj, arg, o.atomic, s); } -template +template __host__ __device__ -T fetch_max(T* obj, U arg, rocshmem_memory_order order) { - return __hip_atomic_fetch_max(obj, arg, order, Scope); +T fetch_max(T* obj, U arg, rocshmem_memory_orders o) { + return __hip_atomic_fetch_max(obj, arg, o.atomic, s); } -template +template __host__ __device__ -T fetch_min(T* obj, U arg, rocshmem_memory_order order) { - return __hip_atomic_fetch_min(obj, arg, order, Scope); +T fetch_min(T* obj, U arg, rocshmem_memory_orders o) { + return __hip_atomic_fetch_min(obj, arg, o.atomic, s); } -template +template __device__ void thread_fence([[maybe_unused]] rocshmem_memory_order order) { - if constexpr (Scope == memory_scope_system) { + if constexpr (s == memory_scope_system) { __threadfence_system(); - } else if constexpr (Scope == memory_scope_agent) { + } else if constexpr (s == memory_scope_agent) { __threadfence(); - } else if constexpr (Scope == memory_scope_workgroup) { + } else if constexpr (s == memory_scope_workgroup) { __threadfence_block(); } } diff --git a/src/memory/notifier.hpp b/src/memory/notifier.hpp index d398110e9a..e6364a26cf 100644 --- a/src/memory/notifier.hpp +++ b/src/memory/notifier.hpp @@ -29,20 +29,15 @@ namespace rocshmem { -template +template class Notifier { -}; - -template -class Notifier { public: - __device__ uint64_t read() { return value_; } + __device__ uint64_t read() { + return detail::atomic::load(&value_, orders); + } __device__ void write(uint64_t val) { - if (is_thread_zero_in_block()) { - value_ = val; - } - publish(); + detail::atomic::store(&value_, val, orders); } __device__ void done() { __syncthreads(); } @@ -55,15 +50,17 @@ class Notifier { __syncthreads(); } + detail::atomic::rocshmem_memory_orders orders; + uint64_t value_{}; }; -template +template class NotifierProxy { - using ProxyT = DeviceProxy, 1>; + using ProxyT = DeviceProxy, 1>; public: - __host__ __device__ Notifier* get() { return proxy_.get(); } + __host__ __device__ Notifier* get() { return proxy_.get(); } private: ProxyT proxy_{}; diff --git a/src/memory/slab_heap.hpp b/src/memory/slab_heap.hpp index a3655b27c1..171332bee3 100644 --- a/src/memory/slab_heap.hpp +++ b/src/memory/slab_heap.hpp @@ -48,7 +48,7 @@ class SlabHeap { /** * @brief Helper type for notifier */ - using NOTIFIER_PROXY_T = NotifierProxy; + using NOTIFIER_PROXY_T = NotifierProxy; /** * @brief Helper type for notifier diff --git a/tests/unit_tests/notifier_gtest.hpp b/tests/unit_tests/notifier_gtest.hpp index ecb2e7a619..4a72212f5d 100644 --- a/tests/unit_tests/notifier_gtest.hpp +++ b/tests/unit_tests/notifier_gtest.hpp @@ -52,7 +52,7 @@ write_to_memory(uint8_t* raw_memory) { __global__ void all_threads_once(uint8_t* raw_memory, - Notifier* notifier) { + Notifier * notifier) { notifier->write(NOTIFIER_OFFSET); uint64_t offset_u64 {notifier->read()}; notifier->done(); @@ -65,7 +65,7 @@ all_threads_once(uint8_t* raw_memory, } class NotifierTestFixture : public ::testing::Test { - using NotifierProxyT = NotifierProxy; + using NotifierProxyT = NotifierProxy; public: NotifierTestFixture() { From 039ea82777df665645ae630c51572958bd963903 Mon Sep 17 00:00:00 2001 From: Brandon Potter Date: Mon, 12 Aug 2024 14:37:47 -0700 Subject: [PATCH 04/18] Change read/write to load/store in Nofitier API --- src/memory/notifier.hpp | 4 ++-- src/memory/slab_heap.cpp | 4 ++-- tests/unit_tests/notifier_gtest.hpp | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/memory/notifier.hpp b/src/memory/notifier.hpp index e6364a26cf..b2211f0cc5 100644 --- a/src/memory/notifier.hpp +++ b/src/memory/notifier.hpp @@ -32,11 +32,11 @@ namespace rocshmem { template class Notifier { public: - __device__ uint64_t read() { + __device__ uint64_t load() { return detail::atomic::load(&value_, orders); } - __device__ void write(uint64_t val) { + __device__ void store(uint64_t val) { detail::atomic::store(&value_, val, orders); } diff --git a/src/memory/slab_heap.cpp b/src/memory/slab_heap.cpp index 89067b78da..80b8a8cef9 100644 --- a/src/memory/slab_heap.cpp +++ b/src/memory/slab_heap.cpp @@ -75,8 +75,8 @@ __device__ void SlabHeap::malloc(void** ptr, size_t size) { * Notify other threads in block about the allocation result. */ auto notifier{notifier_.get()}; - notifier->write(ptr_deref_u64); - uint64_t notification_u64{notifier->read()}; + notifier->store(ptr_deref_u64); + uint64_t notification_u64{notifier->load()}; notifier->done(); /* diff --git a/tests/unit_tests/notifier_gtest.hpp b/tests/unit_tests/notifier_gtest.hpp index 4a72212f5d..b914ca4f22 100644 --- a/tests/unit_tests/notifier_gtest.hpp +++ b/tests/unit_tests/notifier_gtest.hpp @@ -53,8 +53,8 @@ __global__ void all_threads_once(uint8_t* raw_memory, Notifier * notifier) { - notifier->write(NOTIFIER_OFFSET); - uint64_t offset_u64 {notifier->read()}; + notifier->store(NOTIFIER_OFFSET); + uint64_t offset_u64 {notifier->load()}; notifier->done(); uint64_t raw_memory_u64 {reinterpret_cast(raw_memory)}; From 51c33b2a6670b7e287169d30d73e739f65902ff3 Mon Sep 17 00:00:00 2001 From: Brandon Potter Date: Mon, 12 Aug 2024 15:11:48 -0700 Subject: [PATCH 05/18] Updates to Notifier --- src/atomic.hpp | 3 +-- src/memory/notifier.hpp | 10 +++------- src/memory/slab_heap.cpp | 7 +++++-- tests/unit_tests/notifier_gtest.hpp | 8 +++++--- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/atomic.hpp b/src/atomic.hpp index 7b1ce9300a..5d9dc30886 100644 --- a/src/atomic.hpp +++ b/src/atomic.hpp @@ -47,7 +47,6 @@ typedef enum rocshmem_memory_order { struct rocshmem_memory_orders { rocshmem_memory_order load {memory_order_acquire}; rocshmem_memory_order store {memory_order_release}; - rocshmem_memory_order fence {memory_order_acq_rel}; rocshmem_memory_order atomic {memory_order_acq_rel}; rocshmem_memory_order weak_cas_success {memory_order_acq_rel}; rocshmem_memory_order weak_cas_failure {memory_order_acq_rel}; @@ -123,7 +122,7 @@ T fetch_min(T* obj, U arg, rocshmem_memory_orders o) { template __device__ -void thread_fence([[maybe_unused]] rocshmem_memory_order order) { +void thread_fence() { if constexpr (s == memory_scope_system) { __threadfence_system(); } else if constexpr (s == memory_scope_agent) { diff --git a/src/memory/notifier.hpp b/src/memory/notifier.hpp index b2211f0cc5..1f9f6efe76 100644 --- a/src/memory/notifier.hpp +++ b/src/memory/notifier.hpp @@ -40,15 +40,11 @@ class Notifier { detail::atomic::store(&value_, val, orders); } - __device__ void done() { __syncthreads(); } + __device__ void fence() { + detail::atomic::thread_fence(); + } private: - __device__ void publish() { - if (is_thread_zero_in_block()) { - __threadfence(); - } - __syncthreads(); - } detail::atomic::rocshmem_memory_orders orders; diff --git a/src/memory/slab_heap.cpp b/src/memory/slab_heap.cpp index 80b8a8cef9..faf3a84b1b 100644 --- a/src/memory/slab_heap.cpp +++ b/src/memory/slab_heap.cpp @@ -75,9 +75,12 @@ __device__ void SlabHeap::malloc(void** ptr, size_t size) { * Notify other threads in block about the allocation result. */ auto notifier{notifier_.get()}; - notifier->store(ptr_deref_u64); + if (!threadIdx.x) { + notifier->store(ptr_deref_u64); + notifier->fence(); + } + __syncthreads(); uint64_t notification_u64{notifier->load()}; - notifier->done(); /* * Write to the ptr parameter (to return it back up the call stack). diff --git a/tests/unit_tests/notifier_gtest.hpp b/tests/unit_tests/notifier_gtest.hpp index b914ca4f22..303f1932c9 100644 --- a/tests/unit_tests/notifier_gtest.hpp +++ b/tests/unit_tests/notifier_gtest.hpp @@ -53,10 +53,12 @@ __global__ void all_threads_once(uint8_t* raw_memory, Notifier * notifier) { - notifier->store(NOTIFIER_OFFSET); + if (!threadIdx.x) { + notifier->store(NOTIFIER_OFFSET); + notifier->fence(); + } + __syncthreads(); uint64_t offset_u64 {notifier->load()}; - notifier->done(); - uint64_t raw_memory_u64 {reinterpret_cast(raw_memory)}; uint64_t address_u64 {raw_memory_u64 + offset_u64}; uint8_t* address {reinterpret_cast(address_u64)}; From 5b42cff96cea239991c144cfbb84f1abd4d5c5bb Mon Sep 17 00:00:00 2001 From: Brandon Potter Date: Tue, 13 Aug 2024 12:01:24 -0700 Subject: [PATCH 06/18] Update Notifier fixture to Block --- src/atomic.hpp | 12 +++++++++++- src/memory/notifier.hpp | 10 +++++++--- tests/unit_tests/notifier_gtest.cpp | 28 ++++++++++++++-------------- tests/unit_tests/notifier_gtest.hpp | 23 +++++++++++------------ 4 files changed, 43 insertions(+), 30 deletions(-) diff --git a/src/atomic.hpp b/src/atomic.hpp index 5d9dc30886..330bd82f86 100644 --- a/src/atomic.hpp +++ b/src/atomic.hpp @@ -122,7 +122,7 @@ T fetch_min(T* obj, U arg, rocshmem_memory_orders o) { template __device__ -void thread_fence() { +void threadfence() { if constexpr (s == memory_scope_system) { __threadfence_system(); } else if constexpr (s == memory_scope_agent) { @@ -132,6 +132,16 @@ void thread_fence() { } } +template +__device__ +void sync() { + if constexpr (s == memory_scope_workgroup) { + __syncthreads(); + } else { + assert(false); + } +} + } // namespace atomic } // namespace detail } // namespace rocshmem diff --git a/src/memory/notifier.hpp b/src/memory/notifier.hpp index 1f9f6efe76..12f53a4c8b 100644 --- a/src/memory/notifier.hpp +++ b/src/memory/notifier.hpp @@ -33,15 +33,19 @@ template class Notifier { public: __device__ uint64_t load() { - return detail::atomic::load(&value_, orders); + return detail::atomic::load(&value_, orders); } __device__ void store(uint64_t val) { - detail::atomic::store(&value_, val, orders); + detail::atomic::store(&value_, val, orders); } __device__ void fence() { - detail::atomic::thread_fence(); + detail::atomic::threadfence(); + } + + __device__ void sync() { + detail::atomic::sync(); } private: diff --git a/tests/unit_tests/notifier_gtest.cpp b/tests/unit_tests/notifier_gtest.cpp index 9f79c62397..e1ebea0210 100644 --- a/tests/unit_tests/notifier_gtest.cpp +++ b/tests/unit_tests/notifier_gtest.cpp @@ -28,30 +28,30 @@ using namespace rocshmem; ******************************* Fixture Tests ******************************* *****************************************************************************/ -TEST_F(NotifierTestFixture, run_all_threads_once_1_1) { - run_all_threads_once(1, 1); +TEST_F(NotifierBlockTestFixture, run_all_threads_once_1_1) { + run_all_threads_once_block(1, 1); } -TEST_F(NotifierTestFixture, run_all_threads_once_2_1) { - run_all_threads_once(2, 1); +TEST_F(NotifierBlockTestFixture, run_all_threads_once_2_1) { + run_all_threads_once_block(2, 1); } -TEST_F(NotifierTestFixture, run_all_threads_once_64_1) { - run_all_threads_once(64, 1); +TEST_F(NotifierBlockTestFixture, run_all_threads_once_64_1) { + run_all_threads_once_block(64, 1); } -TEST_F(NotifierTestFixture, run_all_threads_once_128_1) { - run_all_threads_once(128, 1); +TEST_F(NotifierBlockTestFixture, run_all_threads_once_128_1) { + run_all_threads_once_block(128, 1); } -TEST_F(NotifierTestFixture, run_all_threads_once_256_1) { - run_all_threads_once(256, 1); +TEST_F(NotifierBlockTestFixture, run_all_threads_once_256_1) { + run_all_threads_once_block(256, 1); } -TEST_F(NotifierTestFixture, run_all_threads_once_512_1) { - run_all_threads_once(512, 1); +TEST_F(NotifierBlockTestFixture, run_all_threads_once_512_1) { + run_all_threads_once_block(512, 1); } -TEST_F(NotifierTestFixture, run_all_threads_once_1024_1) { - run_all_threads_once(1024, 1); +TEST_F(NotifierBlockTestFixture, run_all_threads_once_1024_1) { + run_all_threads_once_block(1024, 1); } diff --git a/tests/unit_tests/notifier_gtest.hpp b/tests/unit_tests/notifier_gtest.hpp index 303f1932c9..453b93680d 100644 --- a/tests/unit_tests/notifier_gtest.hpp +++ b/tests/unit_tests/notifier_gtest.hpp @@ -43,7 +43,7 @@ static const uint64_t NOTIFIER_OFFSET {0x100B00}; inline __device__ void -write_to_memory(uint8_t* raw_memory) { +write_to_memory_block(uint8_t* raw_memory) { auto thread_idx {get_flat_block_id()}; raw_memory[thread_idx] = THREAD_VALUE; __threadfence(); @@ -51,44 +51,43 @@ write_to_memory(uint8_t* raw_memory) { __global__ void -all_threads_once(uint8_t* raw_memory, - Notifier * notifier) { +all_threads_once_block(uint8_t* raw_memory, + Notifier * notifier) { if (!threadIdx.x) { notifier->store(NOTIFIER_OFFSET); notifier->fence(); } - __syncthreads(); + notifier->sync(); uint64_t offset_u64 {notifier->load()}; uint64_t raw_memory_u64 {reinterpret_cast(raw_memory)}; uint64_t address_u64 {raw_memory_u64 + offset_u64}; uint8_t* address {reinterpret_cast(address_u64)}; - write_to_memory(address); - __syncthreads(); + write_to_memory_block(address); } -class NotifierTestFixture : public ::testing::Test { +class NotifierBlockTestFixture : public ::testing::Test { using NotifierProxyT = NotifierProxy; public: - NotifierTestFixture() { + NotifierBlockTestFixture() { assert(raw_memory_ == nullptr); hip_allocator_.allocate((void**)&raw_memory_, GIBIBYTE_); assert(raw_memory_); } - ~NotifierTestFixture() { + ~NotifierBlockTestFixture() { if (raw_memory_) { hip_allocator_.deallocate(raw_memory_); } } void - run_all_threads_once(uint32_t x_block_dim, - uint32_t x_grid_dim) { + run_all_threads_once_block(uint32_t x_block_dim, + uint32_t x_grid_dim) { const dim3 hip_blocksize(x_block_dim, 1, 1); const dim3 hip_gridsize(x_grid_dim, 1, 1); - hipLaunchKernelGGL(all_threads_once, + hipLaunchKernelGGL(all_threads_once_block, hip_gridsize, hip_blocksize, 0, From 1289d50be5e213a10269926cdf5219ab9dca5a82 Mon Sep 17 00:00:00 2001 From: Brandon Potter Date: Thu, 15 Aug 2024 12:22:38 -0700 Subject: [PATCH 07/18] Change notifier fixture to prep for other fixtures --- tests/unit_tests/notifier_gtest.cpp | 14 +++---- tests/unit_tests/notifier_gtest.hpp | 64 ++++++++++++++--------------- 2 files changed, 38 insertions(+), 40 deletions(-) diff --git a/tests/unit_tests/notifier_gtest.cpp b/tests/unit_tests/notifier_gtest.cpp index e1ebea0210..073c37a495 100644 --- a/tests/unit_tests/notifier_gtest.cpp +++ b/tests/unit_tests/notifier_gtest.cpp @@ -29,29 +29,29 @@ using namespace rocshmem; *****************************************************************************/ TEST_F(NotifierBlockTestFixture, run_all_threads_once_1_1) { - run_all_threads_once_block(1, 1); + run_all_threads_once(1, 1); } TEST_F(NotifierBlockTestFixture, run_all_threads_once_2_1) { - run_all_threads_once_block(2, 1); + run_all_threads_once(2, 1); } TEST_F(NotifierBlockTestFixture, run_all_threads_once_64_1) { - run_all_threads_once_block(64, 1); + run_all_threads_once(64, 1); } TEST_F(NotifierBlockTestFixture, run_all_threads_once_128_1) { - run_all_threads_once_block(128, 1); + run_all_threads_once(128, 1); } TEST_F(NotifierBlockTestFixture, run_all_threads_once_256_1) { - run_all_threads_once_block(256, 1); + run_all_threads_once(256, 1); } TEST_F(NotifierBlockTestFixture, run_all_threads_once_512_1) { - run_all_threads_once_block(512, 1); + run_all_threads_once(512, 1); } TEST_F(NotifierBlockTestFixture, run_all_threads_once_1024_1) { - run_all_threads_once_block(1024, 1); + run_all_threads_once(1024, 1); } diff --git a/tests/unit_tests/notifier_gtest.hpp b/tests/unit_tests/notifier_gtest.hpp index 453b93680d..b82f47b1d9 100644 --- a/tests/unit_tests/notifier_gtest.hpp +++ b/tests/unit_tests/notifier_gtest.hpp @@ -43,17 +43,18 @@ static const uint64_t NOTIFIER_OFFSET {0x100B00}; inline __device__ void -write_to_memory_block(uint8_t* raw_memory) { - auto thread_idx {get_flat_block_id()}; +write_to_memory(uint8_t* raw_memory) { + auto thread_idx {get_flat_id()}; raw_memory[thread_idx] = THREAD_VALUE; __threadfence(); } +template __global__ void -all_threads_once_block(uint8_t* raw_memory, - Notifier * notifier) { - if (!threadIdx.x) { +all_threads_once(uint8_t* raw_memory, + Notifier * notifier) { + if (!get_flat_id()) { notifier->store(NOTIFIER_OFFSET); notifier->fence(); } @@ -62,49 +63,26 @@ all_threads_once_block(uint8_t* raw_memory, uint64_t raw_memory_u64 {reinterpret_cast(raw_memory)}; uint64_t address_u64 {raw_memory_u64 + offset_u64}; uint8_t* address {reinterpret_cast(address_u64)}; - write_to_memory_block(address); + write_to_memory(address); } -class NotifierBlockTestFixture : public ::testing::Test { - using NotifierProxyT = NotifierProxy; - +class NotifierBase : public ::testing::Test { public: - NotifierBlockTestFixture() { + NotifierBase() { assert(raw_memory_ == nullptr); hip_allocator_.allocate((void**)&raw_memory_, GIBIBYTE_); assert(raw_memory_); } - ~NotifierBlockTestFixture() { + ~NotifierBase() { if (raw_memory_) { hip_allocator_.deallocate(raw_memory_); } } void - run_all_threads_once_block(uint32_t x_block_dim, - uint32_t x_grid_dim) { - const dim3 hip_blocksize(x_block_dim, 1, 1); - const dim3 hip_gridsize(x_grid_dim, 1, 1); - - hipLaunchKernelGGL(all_threads_once_block, - hip_gridsize, - hip_blocksize, - 0, - nullptr, - raw_memory_, - notifier_.get()); - - hipError_t return_code = hipStreamSynchronize(nullptr); - if (return_code != hipSuccess) { - printf("Failed in stream synchronize\n"); - assert(return_code == hipSuccess); - } - - size_t number_threads {x_block_dim * x_grid_dim}; - + verify(size_t number_threads) { uint8_t* offset_addr {compute_offset_addr()}; - for (size_t i {0}; i < number_threads; i++) { ASSERT_EQ(offset_addr[i], THREAD_VALUE); } @@ -137,6 +115,26 @@ class NotifierBlockTestFixture : public ::testing::Test { */ uint8_t *raw_memory_ {nullptr}; +}; + +class NotifierBlockTestFixture : public NotifierBase { + using NotifierT = Notifier; + using NotifierProxyT = NotifierProxy; + + public: + void + run_all_threads_once(uint32_t x_block_dim, + uint32_t x_grid_dim) { + const dim3 block(x_block_dim, 1, 1); + const dim3 grid(x_grid_dim, 1, 1); + + all_threads_once<<>>(raw_memory_, notifier_.get()); + + CHECK_HIP(hipStreamSynchronize(nullptr)); + + verify(x_block_dim * x_grid_dim); + } + /** * @brief Used to broadcast base offset for writing. */ From 359d6be797bcafd6e5766990d171535bf89b96be Mon Sep 17 00:00:00 2001 From: Brandon Potter Date: Fri, 16 Aug 2024 10:45:33 -0700 Subject: [PATCH 08/18] Add sync method to notifier class --- src/atomic.hpp | 10 ----- src/memory/notifier.hpp | 50 ++++++++++++++++++--- tests/unit_tests/notifier_gtest.cpp | 68 +++++++++++++++++++++++++++++ tests/unit_tests/notifier_gtest.hpp | 27 ++++++++++-- 4 files changed, 135 insertions(+), 20 deletions(-) diff --git a/src/atomic.hpp b/src/atomic.hpp index 330bd82f86..eadacf7284 100644 --- a/src/atomic.hpp +++ b/src/atomic.hpp @@ -132,16 +132,6 @@ void threadfence() { } } -template -__device__ -void sync() { - if constexpr (s == memory_scope_workgroup) { - __syncthreads(); - } else { - assert(false); - } -} - } // namespace atomic } // namespace detail } // namespace rocshmem diff --git a/src/memory/notifier.hpp b/src/memory/notifier.hpp index 12f53a4c8b..f01f7f8301 100644 --- a/src/memory/notifier.hpp +++ b/src/memory/notifier.hpp @@ -31,13 +31,14 @@ namespace rocshmem { template class Notifier { + public: __device__ uint64_t load() { - return detail::atomic::load(&value_, orders); + return detail::atomic::load(&value_, orders_); } __device__ void store(uint64_t val) { - detail::atomic::store(&value_, val, orders); + detail::atomic::store(&value_, val, orders_); } __device__ void fence() { @@ -45,19 +46,56 @@ class Notifier { } __device__ void sync() { - detail::atomic::sync(); + if constexpr (scope == detail::atomic::memory_scope_thread || + scope == detail::atomic::memory_scope_wavefront) { + return; + } + if constexpr (scope == detail::atomic::memory_scope_workgroup) { + __syncthreads(); + return; + } + if constexpr (scope == detail::atomic::memory_scope_system) { + assert(false); + return; + } + + uint32_t done = signal_ + 1; + __syncthreads(); + + uint32_t retval = 0; + bool executor {!threadIdx.x && !threadIdx.y && !threadIdx.z}; + if (executor) { + retval = detail::atomic::fetch_add(&count_, 1, orders_); + detail::atomic::threadfence(); + } + __syncthreads(); + + if (retval == ((gridDim.x * gridDim.y * gridDim.z) - 1)) { + if (executor) { + detail::atomic::store(&count_, 0, orders_); + detail::atomic::threadfence(); + auto x = detail::atomic::fetch_add(&signal_, 1, orders_); + detail::atomic::threadfence(); + } + } + while (detail::atomic::load(&signal_, orders_) != done) { + ; + } } private: - - detail::atomic::rocshmem_memory_orders orders; + detail::atomic::rocshmem_memory_orders orders_{}; uint64_t value_{}; + + uint32_t signal_ {}; + + uint32_t count_ {}; }; template class NotifierProxy { - using ProxyT = DeviceProxy, 1>; + using ProxyT = DeviceProxy>; public: __host__ __device__ Notifier* get() { return proxy_.get(); } diff --git a/tests/unit_tests/notifier_gtest.cpp b/tests/unit_tests/notifier_gtest.cpp index 073c37a495..e6275b10a1 100644 --- a/tests/unit_tests/notifier_gtest.cpp +++ b/tests/unit_tests/notifier_gtest.cpp @@ -55,3 +55,71 @@ TEST_F(NotifierBlockTestFixture, run_all_threads_once_512_1) { TEST_F(NotifierBlockTestFixture, run_all_threads_once_1024_1) { run_all_threads_once(1024, 1); } + +TEST_F(NotifierAgentTestFixture, run_all_threads_once_1_1) { + run_all_threads_once(1, 1); +} + +TEST_F(NotifierAgentTestFixture, run_all_threads_once_2_1) { + run_all_threads_once(2, 1); +} + +TEST_F(NotifierAgentTestFixture, run_all_threads_once_64_1) { + run_all_threads_once(64, 1); +} + +TEST_F(NotifierAgentTestFixture, run_all_threads_once_128_1) { + run_all_threads_once(128, 1); +} + +TEST_F(NotifierAgentTestFixture, run_all_threads_once_256_1) { + run_all_threads_once(256, 1); +} + +TEST_F(NotifierAgentTestFixture, run_all_threads_once_512_1) { + run_all_threads_once(512, 1); +} + +TEST_F(NotifierAgentTestFixture, run_all_threads_once_1024_1) { + run_all_threads_once(1024, 1); +} + +TEST_F(NotifierAgentTestFixture, run_all_threads_once_1_2) { + run_all_threads_once(1, 2); +} + +TEST_F(NotifierAgentTestFixture, run_all_threads_once_1024_2) { + run_all_threads_once(1024, 2); +} + +TEST_F(NotifierAgentTestFixture, run_all_threads_once_1_4) { + run_all_threads_once(1, 4); +} + +TEST_F(NotifierAgentTestFixture, run_all_threads_once_1024_4) { + run_all_threads_once(1024, 4); +} + +TEST_F(NotifierAgentTestFixture, run_all_threads_once_1_8) { + run_all_threads_once(1, 8); +} + +TEST_F(NotifierAgentTestFixture, run_all_threads_once_1024_8) { + run_all_threads_once(1024, 8); +} + +TEST_F(NotifierAgentTestFixture, run_all_threads_once_1_32) { + run_all_threads_once(1, 32); +} + +TEST_F(NotifierAgentTestFixture, run_all_threads_once_1024_32) { + run_all_threads_once(1024, 32); +} + +TEST_F(NotifierAgentTestFixture, run_all_threads_once_1_64) { + run_all_threads_once(1, 64); +} + +TEST_F(NotifierAgentTestFixture, run_all_threads_once_1024_64) { + run_all_threads_once(1024, 64); +} diff --git a/tests/unit_tests/notifier_gtest.hpp b/tests/unit_tests/notifier_gtest.hpp index b82f47b1d9..e130159b2e 100644 --- a/tests/unit_tests/notifier_gtest.hpp +++ b/tests/unit_tests/notifier_gtest.hpp @@ -53,7 +53,7 @@ template __global__ void all_threads_once(uint8_t* raw_memory, - Notifier * notifier) { + NotifierT * notifier) { if (!get_flat_id()) { notifier->store(NOTIFIER_OFFSET); notifier->fence(); @@ -125,13 +125,11 @@ class NotifierBlockTestFixture : public NotifierBase { void run_all_threads_once(uint32_t x_block_dim, uint32_t x_grid_dim) { + new (notifier_.get()) NotifierT(); const dim3 block(x_block_dim, 1, 1); const dim3 grid(x_grid_dim, 1, 1); - all_threads_once<<>>(raw_memory_, notifier_.get()); - CHECK_HIP(hipStreamSynchronize(nullptr)); - verify(x_block_dim * x_grid_dim); } @@ -141,6 +139,27 @@ class NotifierBlockTestFixture : public NotifierBase { NotifierProxyT notifier_ {}; }; +class NotifierAgentTestFixture : public NotifierBase { + using NotifierT = Notifier; + using NotifierProxyT = NotifierProxy; + + public: + void + run_all_threads_once(uint32_t x_block_dim, + uint32_t x_grid_dim) { + new (notifier_.get()) NotifierT(); + const dim3 block(x_block_dim, 1, 1); + const dim3 grid(x_grid_dim, 1, 1); + all_threads_once<<>>(raw_memory_, notifier_.get()); + CHECK_HIP(hipStreamSynchronize(nullptr)); + verify(x_block_dim * x_grid_dim); + } + + /** + * @brief Used to broadcast base offset for writing. + */ + NotifierProxyT notifier_ {}; +}; } // namespace rocshmem From 45c29e7734385ceb5de6ae26db197e208c543752 Mon Sep 17 00:00:00 2001 From: Brandon Potter Date: Fri, 16 Aug 2024 12:33:13 -0700 Subject: [PATCH 09/18] Minor updates to Nofifier sync method --- src/memory/notifier.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/memory/notifier.hpp b/src/memory/notifier.hpp index f01f7f8301..bcc030ce0a 100644 --- a/src/memory/notifier.hpp +++ b/src/memory/notifier.hpp @@ -59,25 +59,25 @@ class Notifier { return; } - uint32_t done = signal_ + 1; + uint32_t done {signal_ + 1}; __syncthreads(); - uint32_t retval = 0; + uint32_t retval {0}; bool executor {!threadIdx.x && !threadIdx.y && !threadIdx.z}; if (executor) { retval = detail::atomic::fetch_add(&count_, 1, orders_); - detail::atomic::threadfence(); + fence(); } __syncthreads(); if (retval == ((gridDim.x * gridDim.y * gridDim.z) - 1)) { if (executor) { detail::atomic::store(&count_, 0, orders_); - detail::atomic::threadfence(); - auto x = detail::atomic::fetch_add(&signal_, 1, orders_); - detail::atomic::threadfence(); + fence(); + detail::atomic::fetch_add(&signal_, 1, orders_); } } + while (detail::atomic::load(&signal_, orders_) != done) { ; } From 678564ba3ce1d772745b16c9d1c4b2908a7351c0 Mon Sep 17 00:00:00 2001 From: Brandon Potter Date: Mon, 9 Sep 2024 10:14:06 -0700 Subject: [PATCH 10/18] Add an extra assertion check for nullptr --- tests/unit_tests/ipc_impl_simple_coarse_gtest.cpp | 1 + tests/unit_tests/ipc_impl_simple_fine_gtest.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/unit_tests/ipc_impl_simple_coarse_gtest.cpp b/tests/unit_tests/ipc_impl_simple_coarse_gtest.cpp index dbf72923b5..37833f9cfe 100644 --- a/tests/unit_tests/ipc_impl_simple_coarse_gtest.cpp +++ b/tests/unit_tests/ipc_impl_simple_coarse_gtest.cpp @@ -33,6 +33,7 @@ TEST_F(IPCImplSimpleCoarseTestFixture, MPI_num_pes) { } TEST_F(IPCImplSimpleCoarseTestFixture, IPC_bases) { + ASSERT_NE(ipc_impl_.ipc_bases, nullptr); for(int i{0}; i < mpi_.num_pes(); i++) { ASSERT_NE(ipc_impl_.ipc_bases[i], nullptr); } diff --git a/tests/unit_tests/ipc_impl_simple_fine_gtest.cpp b/tests/unit_tests/ipc_impl_simple_fine_gtest.cpp index 1c4c409f59..7c752e2f6a 100644 --- a/tests/unit_tests/ipc_impl_simple_fine_gtest.cpp +++ b/tests/unit_tests/ipc_impl_simple_fine_gtest.cpp @@ -33,6 +33,7 @@ TEST_F(IPCImplSimpleFineTestFixture, MPI_num_pes) { } TEST_F(IPCImplSimpleFineTestFixture, IPC_bases) { + ASSERT_NE(ipc_impl_.ipc_bases, nullptr); for(int i{0}; i < mpi_.num_pes(); i++) { ASSERT_NE(ipc_impl_.ipc_bases[i], nullptr); } From 2806e1be79a6de385d0db21d526a0a3ad402a95e Mon Sep 17 00:00:00 2001 From: Brandon Potter Date: Tue, 10 Sep 2024 07:08:56 -0700 Subject: [PATCH 11/18] Intermediate commit for rebase --- scripts/build_configs/ro_net_debug | 2 +- src/util.hpp | 8 ++ .../ipc_impl_simple_coarse_gtest.hpp | 2 - .../unit_tests/ipc_impl_simple_fine_gtest.hpp | 82 +++++++++++++++---- 4 files changed, 77 insertions(+), 17 deletions(-) diff --git a/scripts/build_configs/ro_net_debug b/scripts/build_configs/ro_net_debug index 67c3f2d0a5..e8309b06d8 100755 --- a/scripts/build_configs/ro_net_debug +++ b/scripts/build_configs/ro_net_debug @@ -18,7 +18,7 @@ cmake \ -DPROFILE=OFF \ -DUSE_GPU_IB=OFF \ -DUSE_DC=OFF \ - -DUSE_IPC=OFF \ + -DUSE_IPC=ON \ -DUSE_THREADS=ON \ -DUSE_WF_COAL=OFF \ -DUSE_COHERENT_HEAP=ON \ diff --git a/src/util.hpp b/src/util.hpp index c209750d49..c02f891dc4 100644 --- a/src/util.hpp +++ b/src/util.hpp @@ -93,6 +93,7 @@ __device__ __forceinline__ bool is_thread_zero_in_block() { __device__ __forceinline__ bool is_block_zero_in_grid() { return hipBlockIdx_x == 0 && hipBlockIdx_y == 0 && hipBlockIdx_z == 0; } + /* * Returns the number of threads in the caller's flattened thread block. */ @@ -100,6 +101,13 @@ __device__ __forceinline__ int get_flat_block_size() { return hipBlockDim_x * hipBlockDim_y * hipBlockDim_z; } +/* + * Returns the number of threads in the caller's flattened grid. + */ +__device__ __forceinline__ int get_flat_grid_size() { + return get_flat_block_size() * hipGridDim_x * hipGridDim_y * hipGridDim_z; +} + /* * Returns the flattened thread index of the calling thread within its * thread block. diff --git a/tests/unit_tests/ipc_impl_simple_coarse_gtest.hpp b/tests/unit_tests/ipc_impl_simple_coarse_gtest.hpp index 02dfd8c55a..083d73f418 100644 --- a/tests/unit_tests/ipc_impl_simple_coarse_gtest.hpp +++ b/tests/unit_tests/ipc_impl_simple_coarse_gtest.hpp @@ -218,8 +218,6 @@ class IPCImplSimpleCoarseTestFixture : public ::testing::Test { protected: std::vector golden_; - std::vector output_; - HEAP_T heap_mem_ {}; MPI_T mpi_ {heap_mem_.get_ptr(), heap_mem_.get_size()}; diff --git a/tests/unit_tests/ipc_impl_simple_fine_gtest.hpp b/tests/unit_tests/ipc_impl_simple_fine_gtest.hpp index 3f85528e7d..fb60acdec5 100644 --- a/tests/unit_tests/ipc_impl_simple_fine_gtest.hpp +++ b/tests/unit_tests/ipc_impl_simple_fine_gtest.hpp @@ -33,9 +33,27 @@ namespace rocshmem { +enum TestType { + READ = 0, + WRITE = 1 +}; + __global__ void -kernel_simple_fine_copy(IpcImpl *ipc_impl, int *src, int *dest, size_t bytes) { +kernel_simple_fine_copy(IpcImpl *ipc_impl, int *src, int *dest, size_t bytes, TestType test) { + if (!threadIdx.x) { + ipc_impl->ipcCopy(dest, src, bytes); + ipc_impl->ipcFence(); + if (test == WRITE) { + ipc_impl->ipc + } + } + __syncthreads(); +} + +__global__ +void +kernel_simple_fine_copy_signal_validate(IpcImpl *ipc_impl, int *src, int *dest, size_t bytes) { if (!threadIdx.x) { ipc_impl->ipcCopy(dest, src, bytes); ipc_impl->ipcFence(); @@ -51,6 +69,14 @@ kernel_simple_fine_copy_wg(IpcImpl *ipc_impl, int *src, int *dest, size_t bytes) __syncthreads(); } +__global__ +void +kernel_simple_fine_copy_wg_signal_validate(IpcImpl *ipc_impl, int *src, int *dest, size_t bytes) { + ipc_impl->ipcCopy_wg(dest, src, bytes); + ipc_impl->ipcFence(); + __syncthreads(); +} + __global__ void kernel_simple_fine_copy_wave(IpcImpl *ipc_impl, int *src, int *dest, size_t bytes) { @@ -59,6 +85,14 @@ kernel_simple_fine_copy_wave(IpcImpl *ipc_impl, int *src, int *dest, size_t byte __syncthreads(); } +__global__ +void +kernel_simple_fine_copy_wave_signal_validate(IpcImpl *ipc_impl, int *src, int *dest, size_t bytes) { + ipc_impl->ipcCopy_wave(dest, src, bytes); + ipc_impl->ipcFence(); + __syncthreads(); +} + class IPCImplSimpleFineTestFixture : public ::testing::Test { using HEAP_T = HeapMemory; @@ -91,51 +125,46 @@ class IPCImplSimpleFineTestFixture : public ::testing::Test { CHECK_HIP(hipStreamSynchronize(nullptr)); } - enum TestType { - READ = 0, - WRITE = 1 - }; - void write(const dim3 grid, const dim3 block, size_t elems) { iota_golden(elems); initialize_src_buffer(WRITE); copy(WRITE, grid, block); - validate_dest_buffer(WRITE); + check_device_validation_errors(WRITE); } void write_wg(const dim3 grid, const dim3 block, size_t elems) { iota_golden(elems); initialize_src_buffer(WRITE); copy_wg(WRITE, grid, block); - validate_dest_buffer(WRITE); + check_device_validation_errors(WRITE); } void write_wave(const dim3 grid, const dim3 block, size_t elems) { iota_golden(elems); initialize_src_buffer(WRITE); copy_wave(WRITE, grid, block); - validate_dest_buffer(WRITE); + check_device_validation_errors(WRITE); } void read(const dim3 grid, const dim3 block, size_t elems) { iota_golden(elems); initialize_src_buffer(READ); copy(READ, grid, block); - validate_dest_buffer(READ); + check_device_validation_errors(READ); } void read_wg(const dim3 grid, const dim3 block, size_t elems) { iota_golden(elems); initialize_src_buffer(READ); copy_wg(READ, grid, block); - validate_dest_buffer(READ); + check_device_validation_errors(READ); } void read_wave(const dim3 grid, const dim3 block, size_t elems) { iota_golden(elems); initialize_src_buffer(READ); copy_wave(READ, grid, block); - validate_dest_buffer(READ); + check_device_validation_errors(READ); } void iota_golden(size_t elems) { @@ -160,6 +189,7 @@ class IPCImplSimpleFineTestFixture : public ::testing::Test { CHECK_HIP(hipStreamSynchronize(nullptr)); } + __host__ __device__ bool pe_initializes_src_buffer(TestType test) { bool is_write_test = test; bool is_read_test = !test; @@ -184,7 +214,7 @@ class IPCImplSimpleFineTestFixture : public ::testing::Test { } size_t bytes = golden_.size() * sizeof(int); mpi_.barrier(); - launch(fn, grid, block, src, dest, bytes); + launch(fn, grid, block, src, dest, bytes, test); mpi_.barrier(); } @@ -200,6 +230,13 @@ class IPCImplSimpleFineTestFixture : public ::testing::Test { execute(test, kernel_simple_fine_copy_wave, grid, block); } + void check_device_validation_errors(TestType test) { + if (!pe_validates_dest_buffer(test)) { + return; + } + ASSERT_EQ(validation_error, false); + } + void validate_dest_buffer(TestType test) { if (!pe_validates_dest_buffer(test)) { return; @@ -211,6 +248,21 @@ class IPCImplSimpleFineTestFixture : public ::testing::Test { } } + __device__ + void validate_dest_buffer(TestType test) { + if (!pe_validates_dest_buffer(test)) { + return; + } + + auto dev_dest = reinterpret_cast(ipc_impl_.ipc_bases[mpi_.my_pe()]); + for (int i {get_flat_id()}; i < golden_.size(); i += get_flat_grid_size()) { + if (dev_golden_[i] != dev_dest[i]) { + validation_error = true; + } + } + } + + __host__ __device__ bool pe_validates_dest_buffer(TestType test) { return !pe_initializes_src_buffer(test); } @@ -218,7 +270,7 @@ class IPCImplSimpleFineTestFixture : public ::testing::Test { protected: std::vector golden_; - std::vector output_; + std::vector device_golden_; HEAP_T heap_mem_ {}; @@ -229,6 +281,8 @@ class IPCImplSimpleFineTestFixture : public ::testing::Test { IpcImpl *ipc_impl_dptr_ {nullptr}; HIPAllocator hip_allocator_ {}; + + bool validation_error {false}; }; } // namespace rocshmem From 7411c4559151e963c173b89910b89ffb3eb03b35 Mon Sep 17 00:00:00 2001 From: Brandon Potter Date: Tue, 10 Sep 2024 09:34:45 -0700 Subject: [PATCH 12/18] Conservatively use SEQ_CST atomics in IPC conduit --- src/ipc_policy.hpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/ipc_policy.hpp b/src/ipc_policy.hpp index c0190198ca..48b7b7fd40 100644 --- a/src/ipc_policy.hpp +++ b/src/ipc_policy.hpp @@ -68,34 +68,34 @@ class IpcOnImpl { template __device__ T ipcAMOFetchAdd(T *val, T value) { - return __hip_atomic_fetch_add(val, value, __ATOMIC_RELAXED, - __HIP_MEMORY_SCOPE_AGENT); + return __hip_atomic_fetch_add(val, value, __ATOMIC_SEQ_CST, + __HIP_MEMORY_SCOPE_SYSTEM); } template __device__ T ipcAMOFetchCas(T *val, T cond, T value) { - __hip_atomic_compare_exchange_strong(val, &cond, value, __ATOMIC_RELAXED, - __ATOMIC_RELAXED, - __HIP_MEMORY_SCOPE_AGENT); + __hip_atomic_compare_exchange_strong(val, &cond, value, __ATOMIC_SEQ_CST, + __ATOMIC_SEQ_CST, + __HIP_MEMORY_SCOPE_SYSTEM); return cond; } template __device__ void ipcAMOAdd(T *val, T value) { - __hip_atomic_fetch_add(val, value, __ATOMIC_RELAXED, - __HIP_MEMORY_SCOPE_AGENT); + __hip_atomic_fetch_add(val, value, __ATOMIC_SEQ_CST, + __HIP_MEMORY_SCOPE_SYSTEM); } template __device__ void ipcAMOCas(T *val, T cond, T value) { - __hip_atomic_compare_exchange_strong(val, &cond, value, __ATOMIC_RELAXED, - __ATOMIC_RELAXED, - __HIP_MEMORY_SCOPE_AGENT); + __hip_atomic_compare_exchange_strong(val, &cond, value, __ATOMIC_SEQ_CST, + __ATOMIC_SEQ_CST, + __HIP_MEMORY_SCOPE_SYSTEM); } template __device__ void ipcAMOSet(T *val, T value) { - __hip_atomic_store(val, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + __hip_atomic_store(val, value, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM); } __device__ void zero_byte_read(int pe) { From 86a2f34539c327ad106efb1ae722c4f84ebe40b8 Mon Sep 17 00:00:00 2001 From: Brandon Potter Date: Tue, 10 Sep 2024 09:35:02 -0700 Subject: [PATCH 13/18] Add missing header file --- src/atomic.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/atomic.hpp b/src/atomic.hpp index eadacf7284..f0828e4145 100644 --- a/src/atomic.hpp +++ b/src/atomic.hpp @@ -23,6 +23,8 @@ #ifndef LIBRARY_SRC_ATOMIC_HPP #define LIBRARY_SRC_ATOMIC_HPP +#include + namespace rocshmem { namespace detail { namespace atomic { From 46fdb1851c067267ad40208508b6ed5b82d40005 Mon Sep 17 00:00:00 2001 From: Brandon Potter Date: Tue, 10 Sep 2024 09:35:41 -0700 Subject: [PATCH 14/18] Update fine-grained simple tests --- .../unit_tests/ipc_impl_simple_fine_gtest.hpp | 205 +++++++++++------- 1 file changed, 122 insertions(+), 83 deletions(-) diff --git a/tests/unit_tests/ipc_impl_simple_fine_gtest.hpp b/tests/unit_tests/ipc_impl_simple_fine_gtest.hpp index fb60acdec5..8e9c28ac46 100644 --- a/tests/unit_tests/ipc_impl_simple_fine_gtest.hpp +++ b/tests/unit_tests/ipc_impl_simple_fine_gtest.hpp @@ -26,10 +26,13 @@ #include "gtest/gtest.h" #include - #include + +#include "../src/atomic.hpp" +#include "../src/ipc_policy.hpp" +#include "../src/memory/notifier.hpp" #include "../src/memory/symmetric_heap.hpp" -#include "../src/ipc/ipc_policy.hpp" +#include "../src/util.hpp" namespace rocshmem { @@ -38,68 +41,96 @@ enum TestType { WRITE = 1 }; +const uint32_t SIGNAL_OFFSET {67108864}; + +__device__ +void +validator(bool *error, int *golden, int *dest, size_t bytes) { + size_t elements {bytes / sizeof(int)}; + for (int i {get_flat_id()}; i < elements; i += get_flat_grid_size()) { + if (golden[i] != dest[i]) { + *error = true; + } + } +} + +template __global__ void -kernel_simple_fine_copy(IpcImpl *ipc_impl, int *src, int *dest, size_t bytes, TestType test) { - if (!threadIdx.x) { - ipc_impl->ipcCopy(dest, src, bytes); - ipc_impl->ipcFence(); - if (test == WRITE) { - ipc_impl->ipc - } +kernel_put_with_signal_validator(bool *error, int *golden, int *dest, size_t bytes, NotifierT *notifier) { + detail::atomic::rocshmem_memory_orders orders{}; + if (!get_flat_id()) { + while (detail::atomic::load(dest + SIGNAL_OFFSET, orders) == 0) { + ; + } + } + notifier->sync(); + validator(error, golden, dest, bytes); +} + +template +__global__ +void +kernel_simple_fine_copy(IpcImpl *ipc_impl, bool *error, int *golden, int *src, int *dest, size_t bytes, TestType test, NotifierT *notifier) { + if (!get_flat_id()) { + ipc_impl->ipcCopy(dest, src, bytes); + ipc_impl->ipcFence(); + if (test == WRITE) { + ipc_impl->ipcAMOFetchAdd(dest + SIGNAL_OFFSET, 1); + } + } + if (test == READ) { + notifier->sync(); + validator(error, golden, dest, bytes); + } +} + +template +__global__ +void +kernel_simple_fine_copy_wg(IpcImpl *ipc_impl, bool *error, int *golden, int *src, int *dest, size_t bytes, TestType test, NotifierT *notifier) { + if (!blockIdx.x) { + ipc_impl->ipcCopy_wg(dest, src, bytes); + ipc_impl->ipcFence(); + if (test == WRITE) { + if (!threadIdx.x) { + ipc_impl->ipcAMOFetchAdd(dest + SIGNAL_OFFSET, 1); + } + } + } + if (test == READ) { + notifier->sync(); + validator(error, golden, dest, bytes); + } +} + +template +__global__ +void +kernel_simple_fine_copy_wave(IpcImpl *ipc_impl, bool *error, int *golden, int *src, int *dest, size_t bytes, TestType test, NotifierT *notifier) { + if (!blockIdx.x && threadIdx.x < 64) { + ipc_impl->ipcCopy_wave(dest, src, bytes); + ipc_impl->ipcFence(); + if (test == WRITE) { + if (!threadIdx.x) { + ipc_impl->ipcAMOFetchAdd(dest + SIGNAL_OFFSET, 1); + } + } } __syncthreads(); -} - -__global__ -void -kernel_simple_fine_copy_signal_validate(IpcImpl *ipc_impl, int *src, int *dest, size_t bytes) { - if (!threadIdx.x) { - ipc_impl->ipcCopy(dest, src, bytes); - ipc_impl->ipcFence(); + if (test == READ) { + notifier->sync(); + validator(error, golden, dest, bytes); } - __syncthreads(); -} - -__global__ -void -kernel_simple_fine_copy_wg(IpcImpl *ipc_impl, int *src, int *dest, size_t bytes) { - ipc_impl->ipcCopy_wg(dest, src, bytes); - ipc_impl->ipcFence(); - __syncthreads(); -} - -__global__ -void -kernel_simple_fine_copy_wg_signal_validate(IpcImpl *ipc_impl, int *src, int *dest, size_t bytes) { - ipc_impl->ipcCopy_wg(dest, src, bytes); - ipc_impl->ipcFence(); - __syncthreads(); -} - -__global__ -void -kernel_simple_fine_copy_wave(IpcImpl *ipc_impl, int *src, int *dest, size_t bytes) { - ipc_impl->ipcCopy_wave(dest, src, bytes); - ipc_impl->ipcFence(); - __syncthreads(); -} - -__global__ -void -kernel_simple_fine_copy_wave_signal_validate(IpcImpl *ipc_impl, int *src, int *dest, size_t bytes) { - ipc_impl->ipcCopy_wave(dest, src, bytes); - ipc_impl->ipcFence(); - __syncthreads(); } class IPCImplSimpleFineTestFixture : public ::testing::Test { - using HEAP_T = HeapMemory; - using MPI_T = RemoteHeapInfo; - - using FN_T = void (*)(IpcImpl*, int*, int*, size_t); + using NotifierT = Notifier; + using NotifierProxyT = NotifierProxy; + using FN_T1 = void (*)(IpcImpl*, bool*, int*, int*, int*, size_t, TestType, NotifierT*); + using FN_T2 = void (*)(bool*, int*, int*, size_t, NotifierT*); public: IPCImplSimpleFineTestFixture() { @@ -107,21 +138,34 @@ class IPCImplSimpleFineTestFixture : public ::testing::Test { assert(ipc_impl_dptr_ == nullptr); hip_allocator_.allocate((void**)&ipc_impl_dptr_, sizeof(IpcImpl)); + CHECK_HIP(hipMemcpy(ipc_impl_dptr_, &ipc_impl_, sizeof(IpcImpl), hipMemcpyHostToDevice)); - CHECK_HIP(hipMemcpy(ipc_impl_dptr_, &ipc_impl_, - sizeof(IpcImpl), hipMemcpyHostToDevice)); + assert(error_dptr_ == nullptr); + hip_allocator_.allocate((void**)&error_dptr_, sizeof(bool)); + *error_dptr_ = false; } ~IPCImplSimpleFineTestFixture() { if (ipc_impl_dptr_) { hip_allocator_.deallocate(ipc_impl_dptr_); } + if (error_dptr_) { + hip_allocator_.deallocate(error_dptr_); + } + if (golden_dptr_) { + hip_allocator_.deallocate(golden_dptr_); + } ipc_impl_.ipcHostStop(); } - void launch(FN_T f, const dim3 grid, const dim3 block, int* src, int* dest, size_t bytes) { - f<<>>(ipc_impl_dptr_, src, dest, bytes); + void launch(FN_T1 f, const dim3 grid, const dim3 block, int* src, int* dest, size_t bytes, TestType test) { + f<<>>(ipc_impl_dptr_, error_dptr_, golden_dptr_, src, dest, bytes, test, notifier_.get()); + CHECK_HIP(hipStreamSynchronize(nullptr)); + } + + void launch(FN_T2 f, const dim3 grid, const dim3 block, int* dest, size_t bytes) { + f<<>>(error_dptr_, golden_dptr_, dest, bytes, notifier_.get()); CHECK_HIP(hipStreamSynchronize(nullptr)); } @@ -170,6 +214,11 @@ class IPCImplSimpleFineTestFixture : public ::testing::Test { void iota_golden(size_t elems) { golden_.resize(elems); std::iota(golden_.begin(), golden_.end(), 0); + + assert(golden_dptr_ == nullptr); + size_t golden_dptr_bytes {golden_.size() * sizeof(int)}; + hip_allocator_.allocate((void**)&golden_dptr_, golden_dptr_bytes); + CHECK_HIP(hipMemcpy(golden_dptr_, golden_.data(), golden_dptr_bytes, hipMemcpyHostToDevice)); } void validate_golden(size_t elems) { @@ -186,10 +235,8 @@ class IPCImplSimpleFineTestFixture : public ::testing::Test { size_t bytes = golden_.size() * sizeof(int); auto dev_src = reinterpret_cast(ipc_impl_.ipc_bases[mpi_.my_pe()]); CHECK_HIP(hipMemcpy(dev_src, golden_.data(), bytes, hipMemcpyHostToDevice)); - CHECK_HIP(hipStreamSynchronize(nullptr)); } - __host__ __device__ bool pe_initializes_src_buffer(TestType test) { bool is_write_test = test; bool is_read_test = !test; @@ -197,9 +244,15 @@ class IPCImplSimpleFineTestFixture : public ::testing::Test { (is_read_test && mpi_.my_pe() == 1); } - void execute(TestType test, FN_T fn, const dim3 grid, const dim3 block) { + void execute(TestType test, FN_T1 fn, const dim3 grid, const dim3 block) { + size_t bytes = golden_.size() * sizeof(int); if (mpi_.my_pe()) { mpi_.barrier(); + if (test == WRITE) { + int *dest = reinterpret_cast(ipc_impl_.ipc_bases[1]); + FN_T2 val_fn = kernel_put_with_signal_validator; + launch(val_fn, grid, block, dest, bytes); + } mpi_.barrier(); return; } @@ -212,7 +265,6 @@ class IPCImplSimpleFineTestFixture : public ::testing::Test { src = reinterpret_cast(ipc_impl_.ipc_bases[1]); dest = reinterpret_cast(ipc_impl_.ipc_bases[0]); } - size_t bytes = golden_.size() * sizeof(int); mpi_.barrier(); launch(fn, grid, block, src, dest, bytes, test); mpi_.barrier(); @@ -234,7 +286,7 @@ class IPCImplSimpleFineTestFixture : public ::testing::Test { if (!pe_validates_dest_buffer(test)) { return; } - ASSERT_EQ(validation_error, false); + ASSERT_EQ(*error_dptr_, false); } void validate_dest_buffer(TestType test) { @@ -248,41 +300,28 @@ class IPCImplSimpleFineTestFixture : public ::testing::Test { } } - __device__ - void validate_dest_buffer(TestType test) { - if (!pe_validates_dest_buffer(test)) { - return; - } - - auto dev_dest = reinterpret_cast(ipc_impl_.ipc_bases[mpi_.my_pe()]); - for (int i {get_flat_id()}; i < golden_.size(); i += get_flat_grid_size()) { - if (dev_golden_[i] != dev_dest[i]) { - validation_error = true; - } - } - } - - __host__ __device__ bool pe_validates_dest_buffer(TestType test) { return !pe_initializes_src_buffer(test); } protected: - std::vector golden_; + HIPDefaultFinegrainedAllocator hip_allocator_ {}; - std::vector device_golden_; + NotifierProxyT notifier_ {}; HEAP_T heap_mem_ {}; MPI_T mpi_ {heap_mem_.get_ptr(), heap_mem_.get_size()}; + std::vector golden_; + + int *golden_dptr_ {nullptr}; + IpcImpl ipc_impl_ {}; IpcImpl *ipc_impl_dptr_ {nullptr}; - HIPAllocator hip_allocator_ {}; - - bool validation_error {false}; + bool *error_dptr_ {nullptr}; }; } // namespace rocshmem From f85c46ec0a4c34ac599949c09c65da9fa4d279e3 Mon Sep 17 00:00:00 2001 From: Brandon Potter Date: Thu, 26 Sep 2024 13:40:05 -0500 Subject: [PATCH 15/18] Bugfixes for the ipc unit tests --- src/ipc_policy.hpp | 2 +- src/memory/notifier.hpp | 8 ++++++++ tests/unit_tests/ipc_impl_simple_fine_gtest.cpp | 6 ++++++ tests/unit_tests/ipc_impl_simple_fine_gtest.hpp | 15 +++++++++++++++ 4 files changed, 30 insertions(+), 1 deletion(-) diff --git a/src/ipc_policy.hpp b/src/ipc_policy.hpp index 48b7b7fd40..1b84c52613 100644 --- a/src/ipc_policy.hpp +++ b/src/ipc_policy.hpp @@ -64,7 +64,7 @@ class IpcOnImpl { __device__ void ipcCopy_wave(void *dst, void *src, size_t size); - __device__ void ipcFence() { __threadfence(); } + __device__ void ipcFence() { __threadfence_system(); } template __device__ T ipcAMOFetchAdd(T *val, T value) { diff --git a/src/memory/notifier.hpp b/src/memory/notifier.hpp index bcc030ce0a..b72a256f15 100644 --- a/src/memory/notifier.hpp +++ b/src/memory/notifier.hpp @@ -98,6 +98,14 @@ class NotifierProxy { using ProxyT = DeviceProxy>; public: + NotifierProxy() { + new (proxy_.get()) Notifier(); + } + + ~NotifierProxy() { + proxy_.get()->~Notifier(); + } + __host__ __device__ Notifier* get() { return proxy_.get(); } private: diff --git a/tests/unit_tests/ipc_impl_simple_fine_gtest.cpp b/tests/unit_tests/ipc_impl_simple_fine_gtest.cpp index 7c752e2f6a..71cb85dc9e 100644 --- a/tests/unit_tests/ipc_impl_simple_fine_gtest.cpp +++ b/tests/unit_tests/ipc_impl_simple_fine_gtest.cpp @@ -46,6 +46,12 @@ TEST_F(IPCImplSimpleFineTestFixture, golden_1048576_int) { //============================================================================= +TEST_F(IPCImplSimpleFineTestFixture, write_wg_1x1x1_1x1x1_32_int) { + dim3 grid {1,1,1}; + dim3 block {1,1,1}; + write_wg(grid, block, 32); +} + TEST_F(IPCImplSimpleFineTestFixture, write_wg_1x1x1_1024x1x1_32_int) { dim3 grid {1,1,1}; dim3 block {1024,1,1}; diff --git a/tests/unit_tests/ipc_impl_simple_fine_gtest.hpp b/tests/unit_tests/ipc_impl_simple_fine_gtest.hpp index 8e9c28ac46..ba4c20e288 100644 --- a/tests/unit_tests/ipc_impl_simple_fine_gtest.hpp +++ b/tests/unit_tests/ipc_impl_simple_fine_gtest.hpp @@ -49,6 +49,7 @@ validator(bool *error, int *golden, int *dest, size_t bytes) { size_t elements {bytes / sizeof(int)}; for (int i {get_flat_id()}; i < elements; i += get_flat_grid_size()) { if (golden[i] != dest[i]) { + printf("golden[%d] %d != dest[%d] %d\n", i, golden[i], i, dest[i]); *error = true; } } @@ -171,6 +172,7 @@ class IPCImplSimpleFineTestFixture : public ::testing::Test { void write(const dim3 grid, const dim3 block, size_t elems) { iota_golden(elems); + initialize_signal(WRITE); initialize_src_buffer(WRITE); copy(WRITE, grid, block); check_device_validation_errors(WRITE); @@ -178,6 +180,7 @@ class IPCImplSimpleFineTestFixture : public ::testing::Test { void write_wg(const dim3 grid, const dim3 block, size_t elems) { iota_golden(elems); + initialize_signal(WRITE); initialize_src_buffer(WRITE); copy_wg(WRITE, grid, block); check_device_validation_errors(WRITE); @@ -185,6 +188,7 @@ class IPCImplSimpleFineTestFixture : public ::testing::Test { void write_wave(const dim3 grid, const dim3 block, size_t elems) { iota_golden(elems); + initialize_signal(WRITE); initialize_src_buffer(WRITE); copy_wave(WRITE, grid, block); check_device_validation_errors(WRITE); @@ -192,6 +196,7 @@ class IPCImplSimpleFineTestFixture : public ::testing::Test { void read(const dim3 grid, const dim3 block, size_t elems) { iota_golden(elems); + initialize_signal(READ); initialize_src_buffer(READ); copy(READ, grid, block); check_device_validation_errors(READ); @@ -199,6 +204,7 @@ class IPCImplSimpleFineTestFixture : public ::testing::Test { void read_wg(const dim3 grid, const dim3 block, size_t elems) { iota_golden(elems); + initialize_signal(READ); initialize_src_buffer(READ); copy_wg(READ, grid, block); check_device_validation_errors(READ); @@ -206,6 +212,7 @@ class IPCImplSimpleFineTestFixture : public ::testing::Test { void read_wave(const dim3 grid, const dim3 block, size_t elems) { iota_golden(elems); + initialize_signal(READ); initialize_src_buffer(READ); copy_wave(READ, grid, block); check_device_validation_errors(READ); @@ -228,6 +235,14 @@ class IPCImplSimpleFineTestFixture : public ::testing::Test { } } + void initialize_signal(TestType test) { + bool is_write_test = test; + if (is_write_test && mpi_.my_pe() == 0) { + int *dest = reinterpret_cast(ipc_impl_.ipc_bases[1]); + *(dest + SIGNAL_OFFSET) = 0; + } + } + void initialize_src_buffer(TestType test) { if (!pe_initializes_src_buffer(test)) { return; From 56b2ed699bd04c3bf7fe3cee03c4b6bc46ad1182 Mon Sep 17 00:00:00 2001 From: Brandon Potter Date: Fri, 27 Sep 2024 11:16:55 -0500 Subject: [PATCH 16/18] Reset config options to original values --- scripts/build_configs/ro_net | 4 ++-- scripts/build_configs/ro_net_debug | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/build_configs/ro_net b/scripts/build_configs/ro_net index 47690bb2d9..95ea2950a8 100755 --- a/scripts/build_configs/ro_net +++ b/scripts/build_configs/ro_net @@ -21,7 +21,7 @@ cmake \ -DUSE_IPC=ON \ -DUSE_THREADS=ON \ -DUSE_WF_COAL=OFF \ - -DUSE_COHERENT_HEAP=ON \ + -DUSE_COHERENT_HEAP=OFF \ $src_path cmake --build . --parallel 8 -#cmake --install . +cmake --install . diff --git a/scripts/build_configs/ro_net_debug b/scripts/build_configs/ro_net_debug index e8309b06d8..c2b8afdc00 100755 --- a/scripts/build_configs/ro_net_debug +++ b/scripts/build_configs/ro_net_debug @@ -18,10 +18,10 @@ cmake \ -DPROFILE=OFF \ -DUSE_GPU_IB=OFF \ -DUSE_DC=OFF \ - -DUSE_IPC=ON \ + -DUSE_IPC=OFF \ -DUSE_THREADS=ON \ -DUSE_WF_COAL=OFF \ - -DUSE_COHERENT_HEAP=ON \ + -DUSE_COHERENT_HEAP=OFF \ $src_path cmake --build . --parallel 8 cmake --install . From db221b022abb18139f85142102005104afb2e692 Mon Sep 17 00:00:00 2001 From: Brandon Potter Date: Fri, 27 Sep 2024 11:17:53 -0500 Subject: [PATCH 17/18] Change notifier max thread block value to account for MI300 CPX --- tests/unit_tests/notifier_gtest.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit_tests/notifier_gtest.cpp b/tests/unit_tests/notifier_gtest.cpp index e6275b10a1..d3b699e3d9 100644 --- a/tests/unit_tests/notifier_gtest.cpp +++ b/tests/unit_tests/notifier_gtest.cpp @@ -116,10 +116,10 @@ TEST_F(NotifierAgentTestFixture, run_all_threads_once_1024_32) { run_all_threads_once(1024, 32); } -TEST_F(NotifierAgentTestFixture, run_all_threads_once_1_64) { - run_all_threads_once(1, 64); +TEST_F(NotifierAgentTestFixture, run_all_threads_once_1_38) { + run_all_threads_once(1, 38); // MI300 CPX } -TEST_F(NotifierAgentTestFixture, run_all_threads_once_1024_64) { - run_all_threads_once(1024, 64); +TEST_F(NotifierAgentTestFixture, run_all_threads_once_1024_38) { + run_all_threads_once(1024, 38); // MI300 CPX } From 24b928a0075ca9425028d719b41262f14aa1135a Mon Sep 17 00:00:00 2001 From: Brandon Potter Date: Fri, 27 Sep 2024 15:17:57 -0500 Subject: [PATCH 18/18] Poll the signal from one thread instead of all --- src/memory/notifier.hpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/memory/notifier.hpp b/src/memory/notifier.hpp index b72a256f15..946a9c291e 100644 --- a/src/memory/notifier.hpp +++ b/src/memory/notifier.hpp @@ -78,9 +78,12 @@ class Notifier { } } - while (detail::atomic::load(&signal_, orders_) != done) { - ; + if (executor) { + while (detail::atomic::load(&signal_, orders_) != done) { + ; + } } + __syncthreads(); } private: