diff --git a/scripts/functional_tests/driver.sh b/scripts/functional_tests/driver.sh index 77eb3fe2f5..099dba9772 100755 --- a/scripts/functional_tests/driver.sh +++ b/scripts/functional_tests/driver.sh @@ -461,25 +461,25 @@ TestGDA() { ExecTest "teamctxput" 2 4 128 1024 ExecTest "teamctxput" 2 16 256 1024 - ExecTest "get" 2 1 1 1048576 - ExecTest "get" 2 1 1024 512 - ExecTest "get" 2 8 1 1048576 - ExecTest "get" 2 16 128 8 - ExecTest "get" 2 32 256 512 - ExecTest "get" 2 64 1024 8 + ExecTest "get" 2 1 1 1048576 + ExecTest "get" 2 1 1024 512 + ExecTest "get" 2 8 1 1048576 + ExecTest "get" 2 16 128 8 + ExecTest "get" 2 32 256 512 + ExecTest "get" 2 64 1024 8 - ExecTest "wgget" 2 1 64 1048576 - ExecTest "wgget" 2 2 64 1048576 - ExecTest "wgget" 2 16 64 8 + ExecTest "wgget" 2 1 64 1048576 + ExecTest "wgget" 2 2 64 1048576 + ExecTest "wgget" 2 16 64 8 - ExecTest "waveget" 2 1 64 1048576 - ExecTest "waveget" 2 2 64 1048576 - ExecTest "waveget" 2 2 128 1048576 - ExecTest "waveget" 2 16 128 8 + ExecTest "waveget" 2 1 64 1048576 + ExecTest "waveget" 2 2 64 1048576 + ExecTest "waveget" 2 2 128 1048576 + ExecTest "waveget" 2 16 128 8 - ExecTest "defaultctxget" 2 4 128 1024 - ExecTest "teamctxget" 2 4 128 1024 - ExecTest "teamctxget" 2 16 256 1024 + ExecTest "defaultctxget" 2 4 128 1024 + ExecTest "teamctxget" 2 4 128 1024 + ExecTest "teamctxget" 2 16 256 1024 # ExecTest "g" 2 1 1 128 # ExecTest "g" 2 1 1024 2 @@ -513,25 +513,25 @@ TestGDA() { ExecTest "teamctxputnbi" 2 4 128 1024 ExecTest "teamctxputnbi" 2 16 256 1024 - ExecTest "getnbi" 2 1 1 1048576 - ExecTest "getnbi" 2 1 1024 512 - ExecTest "getnbi" 2 8 1 1048576 - ExecTest "getnbi" 2 16 128 8 - ExecTest "getnbi" 2 32 256 512 - ExecTest "getnbi" 2 64 1024 8 + ExecTest "getnbi" 2 1 1 1048576 + ExecTest "getnbi" 2 1 1024 512 + ExecTest "getnbi" 2 8 1 1048576 + ExecTest "getnbi" 2 16 128 8 + ExecTest "getnbi" 2 32 256 512 + ExecTest "getnbi" 2 64 1024 8 - ExecTest "wggetnbi" 2 1 64 1048576 - ExecTest "wggetnbi" 2 2 64 1048576 - ExecTest "wggetnbi" 2 16 64 8 + ExecTest "wggetnbi" 2 1 64 1048576 + ExecTest "wggetnbi" 2 2 64 1048576 + ExecTest "wggetnbi" 2 16 64 8 - ExecTest "wavegetnbi" 2 1 64 1048576 - ExecTest "wavegetnbi" 2 2 64 1048576 - ExecTest "wavegetnbi" 2 2 128 1048576 - ExecTest "wavegetnbi" 2 16 128 8 + ExecTest "wavegetnbi" 2 1 64 1048576 + ExecTest "wavegetnbi" 2 2 64 1048576 + ExecTest "wavegetnbi" 2 2 128 1048576 + ExecTest "wavegetnbi" 2 16 128 8 - ExecTest "defaultctxgetnbi" 2 4 128 1024 - ExecTest "teamctxgetnbi" 2 4 128 1024 - ExecTest "teamctxgetnbi" 2 16 256 1024 + ExecTest "defaultctxgetnbi" 2 4 128 1024 + ExecTest "teamctxgetnbi" 2 4 128 1024 + ExecTest "teamctxgetnbi" 2 16 256 1024 #TestAMO() { ############################################################################## @@ -550,7 +550,7 @@ TestGDA() { # ExecTest "amo_fcswap" 2 32 1 # ExecTest "amo_fcswap" 2 8 1 -#Works on CX7, not implemented on BNXT +#This works but tester requires 32bit amos to be implemented # ExecTest "amo_finc" 2 1 1 # ExecTest "amo_finc" 2 1 1024 # ExecTest "amo_finc" 2 8 1 @@ -562,7 +562,7 @@ TestGDA() { # ExecTest "amo_inc" 2 8 1 # ExecTest "amo_inc" 2 32 128 -#Works on CX7, not implemented on BNXT +#This works but tester requires 32bit amos to be implemented # ExecTest "amo_fadd" 2 1 1 # ExecTest "amo_fadd" 2 1 1024 # ExecTest "amo_fadd" 2 8 1 diff --git a/src/gda/bnxt/queue_pair_bnxt.cpp b/src/gda/bnxt/queue_pair_bnxt.cpp index e185133322..e6319da754 100644 --- a/src/gda/bnxt/queue_pair_bnxt.cpp +++ b/src/gda/bnxt/queue_pair_bnxt.cpp @@ -336,6 +336,7 @@ __device__ uint64_t QueuePair::post_wqe_amo(int pe, int32_t length, uintptr_t *r uint64_t active_lane_mask; uint8_t active_lane_count; uint8_t active_lane_id; + uint32_t atomic_idx = 0; active_lane_mask = get_active_lane_mask(); active_lane_count = get_active_lane_count(active_lane_mask); @@ -378,8 +379,14 @@ __device__ uint64_t QueuePair::post_wqe_amo(int pe, int32_t length, uintptr_t *r amo.swp_dt = atomic_data; /* Populate SG Segment - (Return address of atomic) */ - sge.pa = (uint64_t) nonfetching_atomic; - sge.lkey = nonfetching_atomic_lkey; + if (fetching) { + atomic_idx = fetching_atomic_idx++ % FETCHING_ATOMIC_CNT; + sge.pa = (uint64_t) &fetching_atomic[atomic_idx]; + sge.lkey = fetching_atomic_lkey; + } else { + sge.pa = (uint64_t) nonfetching_atomic; + sge.lkey = nonfetching_atomic_lkey; + } sge.length = length; /* Write WQE to SQ */ @@ -406,6 +413,10 @@ __device__ uint64_t QueuePair::post_wqe_amo(int pe, int32_t length, uintptr_t *r release_lock(&sq.lock); } + if (fetching) { + return fetching_atomic[atomic_idx]; + } + return 0; } diff --git a/src/gda/queue_pair.hpp b/src/gda/queue_pair.hpp index 4356116341..3d1801f254 100644 --- a/src/gda/queue_pair.hpp +++ b/src/gda/queue_pair.hpp @@ -311,9 +311,12 @@ class QueuePair { uint64_t* nonfetching_atomic{nullptr}; uint32_t nonfetching_atomic_lkey{0}; + struct ibv_mr *mr_nonfetching_atomic; uint64_t* fetching_atomic{nullptr}; uint32_t fetching_atomic_lkey{0}; + uint32_t fetching_atomic_idx{0}; + struct ibv_mr *mr_fetching_atomic; static const uint32_t FETCHING_ATOMIC_CNT{1024}; static_assert(FETCHING_ATOMIC_CNT % WF_SIZE == 0); @@ -322,8 +325,6 @@ class QueuePair { HIPAllocator allocator{}; - struct ibv_mr *mr_nonfetching_atomic; - struct ibv_mr *mr_fetching_atomic; }; } // namespace rocshmem