gda ionic: use all threads in wave operations (#295)

Use all available threads for polling the cq to increase the maximum
message rate.  Even when posting a single wqe in the wave, use all
available theads for polling the cq to reserve space in the sq.

Changes were needed in the rocshmem abstraction to avoid disabling gpu
threads, like taking turns or using only the first thread in a wave or
wavefront.  To avoid breaking other gda implementations, reimplement
turn-based or single thread strategy in post_wqe_rma_turn and
post_wqe_rma_single.

Signed-off-by: Allen Hubbe <allen.hubbe@amd.com>
このコミットが含まれているのは:
Allen Hubbe
2025-11-05 12:01:14 -05:00
committed by GitHub
コミット 6de67d5d7c
8個のファイルの変更114行の追加104行の削除
+15 -3
ファイルの表示
@@ -171,7 +171,8 @@ __device__ void QueuePair::ionic_quiet_internal(uint64_t activemask, uint32_t co
}
__device__ void QueuePair::ionic_ring_doorbell(uint32_t pos) {
// TODO When threads write at once to the same address, not all writes reach the bus.
// When threads write at once to the same address, not all writes reach the bus.
// Take turns and insert a thread fence between writes to the same address.
for (int i = 0; i < 64; ++i) {
if (__lane_id() == i) {
__threadfence();
@@ -185,11 +186,22 @@ __device__ void QueuePair::ionic_quiet() {
ionic_quiet_internal(get_same_qp_lane_mask(), sq_prod);
}
__device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode) {
__device__ void QueuePair::ionic_post_wqe_rma(int pe, int32_t size, uintptr_t *laddr, uintptr_t *raddr, uint8_t opcode, Collectivity cy) {
uint64_t activemask = get_same_qp_lane_mask();
uint32_t num_wqes = get_active_lane_count(activemask);
uint32_t my_logical_lane_id = get_active_lane_num(activemask);
uint32_t num_wqes = 1;
if (cy == THREAD) {
num_wqes = get_active_lane_count(activemask);
}
uint32_t my_sq_prod = reserve_sq(activemask, num_wqes);
if (cy == WAVE) {
if (!is_first_active_lane(activemask)) {
return;
}
activemask &= activemask ^ (activemask - 1);
}
uint32_t my_sq_pos = my_sq_prod + my_logical_lane_id;
struct ionic_v1_wqe *wqe = &ionic_sq_buf[my_sq_pos & sq_mask];
uint16_t wqe_flags = 0;