Remove comparisons of signed to unsigned values
This commit is contained in:
@@ -55,7 +55,7 @@ inline const unsigned MAX_WG_SIZE{1024};
|
||||
*
|
||||
* @note Wavefront size on most systems is either 32 or 64.
|
||||
*/
|
||||
inline const unsigned WF_SIZE{64};
|
||||
inline const int WF_SIZE{64};
|
||||
|
||||
} // namespace rocshmem
|
||||
|
||||
|
||||
@@ -137,7 +137,7 @@ IPCBackend::~IPCBackend() {
|
||||
|
||||
void IPCBackend::setup_ctxs() {
|
||||
CHECK_HIP(hipMalloc(&ctx_array, sizeof(IPCContext) * maximum_num_contexts_));
|
||||
for (int i = 0; i < maximum_num_contexts_; i++) {
|
||||
for (size_t i = 0; i < maximum_num_contexts_; i++) {
|
||||
new (&ctx_array[i]) IPCContext(this);
|
||||
ctx_free_list.get()->push_back(ctx_array + i);
|
||||
}
|
||||
@@ -367,7 +367,7 @@ void IPCBackend::init_wrk_sync_buffer() {
|
||||
* For all local processing elements, initialize the device-side array
|
||||
* with the IPC work/sync buffer addresses.
|
||||
*/
|
||||
for (size_t i = 0; i < num_pes; i++) {
|
||||
for (int i = 0; i < num_pes; i++) {
|
||||
if (i != my_pe) {
|
||||
CHECK_HIP(hipIpcOpenMemHandle(
|
||||
reinterpret_cast<void**>(&Wrk_Sync_buffer_bases_[i]),
|
||||
@@ -380,7 +380,7 @@ void IPCBackend::init_wrk_sync_buffer() {
|
||||
}
|
||||
|
||||
void IPCBackend::cleanup_wrk_sync_buffer() {
|
||||
for (size_t i = 0; i < num_pes; i++) {
|
||||
for (int i = 0; i < num_pes; i++) {
|
||||
if (i != my_pe) {
|
||||
CHECK_HIP(hipIpcCloseMemHandle(Wrk_Sync_buffer_bases_[i]));
|
||||
}
|
||||
@@ -444,7 +444,7 @@ void IPCBackend::teams_init() {
|
||||
|
||||
/* Accommodating for largest possible data type for pWrk */
|
||||
pWrk_pool = reinterpret_cast<void *>(temp_Wrk_Sync_buff_ptr_);
|
||||
temp_Wrk_Sync_buff_ptr_ += sizeof(double) * ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE
|
||||
temp_Wrk_Sync_buff_ptr_ += sizeof(double) * ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE
|
||||
* max_num_teams;
|
||||
|
||||
|
||||
@@ -466,16 +466,16 @@ void IPCBackend::teams_init() {
|
||||
alltoall_pSync = reinterpret_cast<long *>(
|
||||
&alltoall_pSync_pool[team_i * ROCSHMEM_ALLTOALL_SYNC_SIZE]);
|
||||
|
||||
for (int i = 0; i < ROCSHMEM_BARRIER_SYNC_SIZE; i++) {
|
||||
for (size_t i = 0; i < ROCSHMEM_BARRIER_SYNC_SIZE; i++) {
|
||||
barrier_pSync[i] = ROCSHMEM_SYNC_VALUE;
|
||||
}
|
||||
for (int i = 0; i < ROCSHMEM_REDUCE_SYNC_SIZE; i++) {
|
||||
for (size_t i = 0; i < ROCSHMEM_REDUCE_SYNC_SIZE; i++) {
|
||||
reduce_pSync[i] = ROCSHMEM_SYNC_VALUE;
|
||||
}
|
||||
for (int i = 0; i < ROCSHMEM_BCAST_SYNC_SIZE; i++) {
|
||||
for (size_t i = 0; i < ROCSHMEM_BCAST_SYNC_SIZE; i++) {
|
||||
bcast_pSync[i] = ROCSHMEM_SYNC_VALUE;
|
||||
}
|
||||
for (int i = 0; i < ROCSHMEM_ALLTOALL_SYNC_SIZE; i++) {
|
||||
for (size_t i = 0; i < ROCSHMEM_ALLTOALL_SYNC_SIZE; i++) {
|
||||
alltoall_pSync[i] = ROCSHMEM_SYNC_VALUE;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -38,14 +38,14 @@ __device__ void IPCContext::internal_direct_barrier(int pe, int PE_start,
|
||||
#if defined(__gfx90a__)
|
||||
__threadfence_system();
|
||||
#endif /* __gfx90a__ */
|
||||
for (size_t i = 1; i < n_pes; i++) {
|
||||
for (int i = 1; i < n_pes; i++) {
|
||||
wait_until(&pSync[i], ROCSHMEM_CMP_EQ, flag_val);
|
||||
pSync[i] = ROCSHMEM_SYNC_VALUE;
|
||||
}
|
||||
threadfence_system();
|
||||
|
||||
// Announce to other PEs that all have reached
|
||||
for (size_t i = 1, j = PE_start + stride; i < n_pes; ++i, j += stride) {
|
||||
for (int i = 1, j = PE_start + stride; i < n_pes; ++i, j += stride) {
|
||||
internal_putmem(&pSync[0], &flag_val, sizeof(*pSync), j);
|
||||
#if defined(__gfx90a__)
|
||||
__threadfence_system();
|
||||
@@ -73,7 +73,7 @@ __device__ void IPCContext::internal_atomic_barrier(int pe, int PE_start,
|
||||
pSync[0] = ROCSHMEM_SYNC_VALUE;
|
||||
threadfence_system();
|
||||
|
||||
for (size_t i = 1, j = PE_start + stride; i < n_pes; ++i, j += stride) {
|
||||
for (int i = 1, j = PE_start + stride; i < n_pes; ++i, j += stride) {
|
||||
internal_putmem(&pSync[0], &flag_val, sizeof(*pSync), j);
|
||||
}
|
||||
} else {
|
||||
|
||||
@@ -156,7 +156,7 @@ __device__ T IPCContext::amo_fetch_cas(void *dest, T value, T cond, int pe) {
|
||||
template <typename T, ROCSHMEM_OP Op>
|
||||
__device__ void compute_reduce(T *src, T *dst, int size, int wg_id,
|
||||
int wg_size) {
|
||||
for (size_t i = wg_id; i < size; i += wg_size) {
|
||||
for (int i = wg_id; i < size; i += wg_size) {
|
||||
OpWrap<Op>::Calc(src, dst, i);
|
||||
}
|
||||
__syncthreads();
|
||||
@@ -299,12 +299,12 @@ __device__ void IPCContext::internal_ring_allreduce(
|
||||
int wg_size = get_flat_block_size();
|
||||
int wg_id = get_flat_block_id();
|
||||
|
||||
for (size_t i = wg_id; i < nelems; i += wg_size) {
|
||||
for (int i = wg_id; i < nelems; i += wg_size) {
|
||||
dst[i] = src[i];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
for (size_t seg = 0; seg < n_seg; seg++) {
|
||||
for (int seg = 0; seg < n_seg; seg++) {
|
||||
off_seg = seg * seg_size;
|
||||
// Loop 2 in the algorithm above
|
||||
for (int iter = 0; iter < PE_size - 1; iter++) {
|
||||
@@ -331,7 +331,7 @@ __device__ void IPCContext::internal_ring_allreduce(
|
||||
}
|
||||
|
||||
// Loop 2 in the example above
|
||||
for (size_t iter = PE_size - 1; iter < 2 * PE_size - 2; iter++) {
|
||||
for (int iter = PE_size - 1; iter < 2 * PE_size - 2; iter++) {
|
||||
off_send = (((my_pe_in_team + 1 - iter + 2 * PE_size) % PE_size) * chunk_size);
|
||||
putmem_nbi_wg(reinterpret_cast<void *>(&dst[off_send + off_seg]),
|
||||
reinterpret_cast<void *>(&dst[off_send + off_seg]),
|
||||
@@ -351,7 +351,7 @@ __device__ void IPCContext::internal_ring_allreduce(
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
for (size_t i = wg_id; i < 2 * num_pes - 2; i += wg_size) {
|
||||
for (int i = wg_id; i < 2 * num_pes - 2; i += wg_size) {
|
||||
pSync[i] = ROCSHMEM_SYNC_VALUE;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
@@ -86,7 +86,7 @@ __host__ void IpcOnImpl::ipcHostInit(int my_pe, const HEAP_BASES_T &heap_bases,
|
||||
* For all local processing elements, initialize the device-side array
|
||||
* with the IPC symmetric heap base addresses.
|
||||
*/
|
||||
for (size_t i = 0; i < shm_size; i++) {
|
||||
for (int i = 0; i < shm_size; i++) {
|
||||
if (i != shm_rank) {
|
||||
void **ipc_base_uncast = reinterpret_cast<void **>(&ipc_base[i]);
|
||||
CHECK_HIP(hipIpcOpenMemHandle(ipc_base_uncast, vec_ipc_handle[i],
|
||||
@@ -109,7 +109,7 @@ __host__ void IpcOnImpl::ipcHostInit(int my_pe, const HEAP_BASES_T &heap_bases,
|
||||
}
|
||||
|
||||
__host__ void IpcOnImpl::ipcHostStop() {
|
||||
for (size_t i = 0; i < shm_size; i++) {
|
||||
for (int i = 0; i < shm_size; i++) {
|
||||
if (i != shm_rank) {
|
||||
CHECK_HIP(hipIpcCloseMemHandle(ipc_bases[i]));
|
||||
}
|
||||
|
||||
@@ -44,7 +44,7 @@ class IpcOnImpl {
|
||||
public:
|
||||
int shm_rank{0};
|
||||
|
||||
uint32_t shm_size{0};
|
||||
int shm_size{0};
|
||||
|
||||
char **ipc_bases{nullptr};
|
||||
|
||||
|
||||
+1
-1
@@ -171,7 +171,7 @@ __device__ __forceinline__ void memcpy(void* dst, void* src, size_t size) {
|
||||
uint8_t* dst_bytes{static_cast<uint8_t*>(dst)};
|
||||
uint8_t* src_bytes{static_cast<uint8_t*>(src)};
|
||||
|
||||
for (int i = 8; i > 1; i >>= 1) {
|
||||
for (size_t i = 8; i > 1; i >>= 1) {
|
||||
while (size >= i) {
|
||||
store_asm(src_bytes, dst_bytes, i);
|
||||
src_bytes += i;
|
||||
|
||||
@@ -142,7 +142,7 @@ template <typename T1>
|
||||
void AlltoallTester<T1>::resetBuffers(uint64_t size) {
|
||||
int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD);
|
||||
for (int i = 0; i < n_pes; i++) {
|
||||
for (int j = 0; j < size; j++) {
|
||||
for (uint64_t j = 0; j < size; j++) {
|
||||
init_buf(source_buf[i * size + j], dest_buf[i * size + j], (T1)i);
|
||||
}
|
||||
}
|
||||
@@ -152,10 +152,10 @@ template <typename T1>
|
||||
void AlltoallTester<T1>::verifyResults(uint64_t size) {
|
||||
int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD);
|
||||
for (int i = 0; i < n_pes; i++) {
|
||||
for (int j = 0; j < size; j++) {
|
||||
for (uint64_t j = 0; j < size; j++) {
|
||||
auto r = verify_buf(dest_buf[i * size + j], i);
|
||||
if (r.first == false) {
|
||||
fprintf(stderr, "Data validation error at idx %d\n", j);
|
||||
fprintf(stderr, "Data validation error at idx %lu\n", j);
|
||||
fprintf(stderr, "%s.\n", r.second.c_str());
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
@@ -141,7 +141,7 @@ template <typename T1>
|
||||
void FcollectTester<T1>::resetBuffers(uint64_t size) {
|
||||
int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD);
|
||||
for (int i = 0; i < n_pes; i++) {
|
||||
for (int j = 0; j < size; j++) {
|
||||
for (uint64_t j = 0; j < size; j++) {
|
||||
// Note: This is redundant work,
|
||||
// source is being reinitialized multiple times
|
||||
init_buf(source_buf[j], dest_buf[i * size + j]);
|
||||
@@ -153,10 +153,10 @@ template <typename T1>
|
||||
void FcollectTester<T1>::verifyResults(uint64_t size) {
|
||||
int n_pes = rocshmem_team_n_pes(ROCSHMEM_TEAM_WORLD);
|
||||
for (int i = 0; i < n_pes; i++) {
|
||||
for (int j = 0; j < size; j++) {
|
||||
for (uint64_t j = 0; j < size; j++) {
|
||||
auto r = verify_buf(dest_buf[i * size + j], i);
|
||||
if (r.first == false) {
|
||||
fprintf(stderr, "Data validation error at idx %d\n", j);
|
||||
fprintf(stderr, "Data validation error at idx %lu\n", j);
|
||||
fprintf(stderr, "%s.\n", r.second.c_str());
|
||||
// exit(-1);
|
||||
return;
|
||||
|
||||
@@ -100,9 +100,9 @@ void PrimitiveMRTester::verifyResults(uint64_t size) {
|
||||
: 1;
|
||||
|
||||
if (args.myid == check_id) {
|
||||
for (int i = 0; i < size; i++) {
|
||||
for (uint64_t i = 0; i < size; i++) {
|
||||
if (r_buf[i] != '0') {
|
||||
fprintf(stderr, "Data validation error at idx %d\n", i);
|
||||
fprintf(stderr, "Data validation error at idx %lu\n", i);
|
||||
fprintf(stderr, "Got %c, Expected %c\n", r_buf[i], '0');
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
@@ -123,9 +123,9 @@ void PrimitiveTester::verifyResults(uint64_t size) {
|
||||
: 1;
|
||||
|
||||
if (args.myid == check_id) {
|
||||
for (int i = 0; i < size; i++) {
|
||||
for (uint64_t i = 0; i < size; i++) {
|
||||
if (r_buf[i] != '0') {
|
||||
fprintf(stderr, "Data validation error at idx %d\n", i);
|
||||
fprintf(stderr, "Data validation error at idx %lu\n", i);
|
||||
fprintf(stderr, "Got %c, Expected %c\n", r_buf[i], '0');
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
@@ -154,7 +154,7 @@ RandomAccessTester::~RandomAccessTester() {
|
||||
}
|
||||
|
||||
void RandomAccessTester::resetBuffers(uint64_t size) {
|
||||
for (int i = 0; i < args.max_msg_size / sizeof(int) * args.wg_size * space;
|
||||
for (size_t i = 0; i < args.max_msg_size / sizeof(int) * args.wg_size * space;
|
||||
i++) {
|
||||
s_buf[i] = 1;
|
||||
r_buf[i] = 0;
|
||||
@@ -187,21 +187,21 @@ void RandomAccessTester::launchKernel(dim3 gridSize, dim3 blockSize, int loop,
|
||||
}
|
||||
|
||||
void RandomAccessTester::verifyResults(uint64_t size) {
|
||||
int offset, i, j;
|
||||
uint64_t offset;
|
||||
for (int k = 0; k < _num_waves; k++) {
|
||||
for (i = 0; i < _num_bins; i++) {
|
||||
for (int i = 0; i < _num_bins; i++) {
|
||||
int index = i + _num_bins * k;
|
||||
if (args.op_type == PutType) {
|
||||
if (_PE_bins[index] == args.myid) {
|
||||
if (_PE_bins[index] == static_cast<uint32_t>(args.myid)) {
|
||||
offset = _off_bins[index];
|
||||
for (j = 0; j < ((size / sizeof(int)) * args.coal_coef); j++) {
|
||||
for (uint64_t j = 0; j < ((size / sizeof(int)) * args.coal_coef); j++) {
|
||||
h_buf[offset + j] = 1;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (args.myid == 0) {
|
||||
offset = _off_bins[index];
|
||||
for (j = 0; j < ((size / sizeof(int)) * args.coal_coef); j++) {
|
||||
for (uint64_t j = 0; j < ((size / sizeof(int)) * args.coal_coef); j++) {
|
||||
h_buf[offset + j] = 1;
|
||||
}
|
||||
}
|
||||
@@ -212,9 +212,9 @@ void RandomAccessTester::verifyResults(uint64_t size) {
|
||||
CHECK_HIP(hipMemcpy(h_dev_buf, r_buf, space * args.wg_size * size,
|
||||
hipMemcpyDeviceToHost));
|
||||
CHECK_HIP(hipDeviceSynchronize());
|
||||
for (i = 0; i < (space * args.wg_size * size / sizeof(int)); i++) {
|
||||
for (uint64_t i = 0; i < (space * args.wg_size * size / sizeof(int)); i++) {
|
||||
if (h_dev_buf[i] != h_buf[i]) {
|
||||
printf("PE %d Got Data Validation: expecting %d got %d at %d \n",
|
||||
printf("PE %d Got Data Validation: expecting %d got %d at %lu\n",
|
||||
args.myid, h_buf[i], h_dev_buf[i], i);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
@@ -143,9 +143,9 @@ void SignalingOperationsTester::verifyResults(uint64_t size) {
|
||||
? 0 : -1; // do not check if it doesn't match a test
|
||||
|
||||
if (args.myid == check_data_id) {
|
||||
for (int i = 0; i < size; i++) {
|
||||
for (uint64_t i = 0; i < size; i++) {
|
||||
if (r_buf[i] != '0') {
|
||||
fprintf(stderr, "Data validation error at idx %d\n", i);
|
||||
fprintf(stderr, "Data validation error at idx %lu\n", i);
|
||||
fprintf(stderr, "Got %c, Expected %c\n", r_buf[i], '0');
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
@@ -80,9 +80,9 @@ void GetSwarmTester::launchKernel(dim3 gridSize, dim3 blockSize, int loop,
|
||||
|
||||
void GetSwarmTester::verifyResults(uint64_t size) {
|
||||
if (args.myid == 0) {
|
||||
for (int i = 0; i < size * args.wg_size; i++) {
|
||||
for (uint64_t i = 0; i < size * args.wg_size; i++) {
|
||||
if (r_buf[i] != '0') {
|
||||
fprintf(stderr, "Data validation error at idx %d\n", i);
|
||||
fprintf(stderr, "Data validation error at idx %lu\n", i);
|
||||
fprintf(stderr, "Got %c, Expected %c\n", r_buf[i], '0');
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
@@ -145,17 +145,17 @@ void TeamBroadcastTester<T1>::postLaunchKernel() {
|
||||
|
||||
template <typename T1>
|
||||
void TeamBroadcastTester<T1>::resetBuffers(uint64_t size) {
|
||||
for (int i = 0; i < args.max_msg_size; i++) {
|
||||
for (uint64_t i = 0; i < args.max_msg_size; i++) {
|
||||
init_buf(source_buf[i], dest_buf[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T1>
|
||||
void TeamBroadcastTester<T1>::verifyResults(uint64_t size) {
|
||||
for (int i = 0; i < size; i++) {
|
||||
for (uint64_t i = 0; i < size; i++) {
|
||||
auto r = verify_buf(dest_buf[i]);
|
||||
if (r.first == false) {
|
||||
fprintf(stderr, "Data validation error at idx %d\n", i);
|
||||
fprintf(stderr, "Data validation error at idx %lu\n", i);
|
||||
fprintf(stderr, "%s.\n", r.second.c_str());
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
@@ -120,9 +120,9 @@ void TeamCtxPrimitiveTester::verifyResults(uint64_t size) {
|
||||
(_type == TeamCtxGetTestType || _type == TeamCtxGetNBITestType) ? 0 : 1;
|
||||
|
||||
if (args.myid == check_id) {
|
||||
for (int i = 0; i < size; i++) {
|
||||
for (uint64_t i = 0; i < size; i++) {
|
||||
if (r_buf[i] != '0') {
|
||||
fprintf(stderr, "Data validation error at idx %d\n", i);
|
||||
fprintf(stderr, "Data validation error at idx %lu\n", i);
|
||||
fprintf(stderr, "Got %c, Expected %c\n", r_buf[i], '0');
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
@@ -153,7 +153,7 @@ void TeamReductionTester<T1, T2>::postLaunchKernel() {
|
||||
|
||||
template <typename T1, ROCSHMEM_OP T2>
|
||||
void TeamReductionTester<T1, T2>::resetBuffers(uint64_t size) {
|
||||
for (int i = 0; i < args.max_msg_size; i++) {
|
||||
for (uint64_t i = 0; i < args.max_msg_size; i++) {
|
||||
init_buf(s_buf[i], r_buf[i]);
|
||||
}
|
||||
}
|
||||
@@ -161,10 +161,10 @@ void TeamReductionTester<T1, T2>::resetBuffers(uint64_t size) {
|
||||
template <typename T1, ROCSHMEM_OP T2>
|
||||
void TeamReductionTester<T1, T2>::verifyResults(uint64_t size) {
|
||||
int n_pes = rocshmem_n_pes();
|
||||
for (int i = 0; i < size; i++) {
|
||||
for (uint64_t i = 0; i < size; i++) {
|
||||
auto r = verify_buf(r_buf[i], (T1)n_pes);
|
||||
if (r.first == false) {
|
||||
fprintf(stderr, "Data validation error at idx %d\n", i);
|
||||
fprintf(stderr, "Data validation error at idx %lu\n", i);
|
||||
fprintf(stderr, "%s.\n", r.second.c_str());
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
@@ -653,7 +653,7 @@ uint64_t Tester::timerAvgInMicroseconds() {
|
||||
* TODO: (bpotter/avinash) Modify the calcuation for the Tiled version of
|
||||
* puts and gets at wavefront level
|
||||
*/
|
||||
for (int i = 0; i < args.num_wgs; i++) {
|
||||
for (uint64_t i = 0; i < args.num_wgs; i++) {
|
||||
sum += gpuCyclesToMicroseconds(timer[i]);
|
||||
}
|
||||
|
||||
|
||||
@@ -62,8 +62,8 @@ class TesterArguments {
|
||||
/**
|
||||
* Arguments obtained from rocshmem
|
||||
*/
|
||||
unsigned numprocs = UINT_MAX;
|
||||
unsigned myid = UINT_MAX;
|
||||
int numprocs = INT_MAX;
|
||||
int myid = INT_MAX;
|
||||
|
||||
/**
|
||||
* Defaults tester values
|
||||
@@ -71,7 +71,7 @@ class TesterArguments {
|
||||
int loop = 10;
|
||||
int skip = 10;
|
||||
int loop_large = 10;
|
||||
int large_message_size = 32768;
|
||||
uint64_t large_message_size = 32768;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
@@ -116,7 +116,7 @@ class IPCImplSimpleCoarse : public ::testing::TestWithParam<std::tuple<int, int,
|
||||
|
||||
void validate_golden(size_t elems) {
|
||||
ASSERT_EQ(golden_.size(), elems);
|
||||
for (int i{0}; i < golden_.size(); i++) {
|
||||
for (int i = 0; i < static_cast<int>(golden_.size()); i++) {
|
||||
ASSERT_EQ(golden_[i], i);
|
||||
}
|
||||
}
|
||||
@@ -165,7 +165,7 @@ class IPCImplSimpleCoarse : public ::testing::TestWithParam<std::tuple<int, int,
|
||||
}
|
||||
|
||||
auto dev_dest = reinterpret_cast<int*>(ipc_impl_.ipc_bases[mpi_.my_pe()]);
|
||||
for (int i{0}; i < golden_.size(); i++) {
|
||||
for (int i = 0; i < static_cast<int>(golden_.size()); i++) {
|
||||
ASSERT_EQ(golden_[i], dev_dest[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -47,9 +47,9 @@ __device__
|
||||
void
|
||||
simple_validator(bool *error, int *golden, int *dest, size_t bytes) {
|
||||
size_t elements {bytes / sizeof(int)};
|
||||
for (int i {get_flat_id()}; i < elements; i += get_flat_grid_size()) {
|
||||
for (size_t i = get_flat_id(); i < elements; i += get_flat_grid_size()) {
|
||||
if (golden[i] != dest[i]) {
|
||||
printf("golden[%d] %d != dest[%d] %d\n", i, golden[i], i, dest[i]);
|
||||
printf("golden[%zu] %d != dest[%zu] %d\n", i, golden[i], i, dest[i]);
|
||||
*error = true;
|
||||
}
|
||||
}
|
||||
@@ -202,7 +202,7 @@ class IPCImplSimpleFine : public ::testing::TestWithParam<std::tuple<int, int, i
|
||||
|
||||
void validate_golden(size_t elems) {
|
||||
ASSERT_EQ(golden_.size(), elems);
|
||||
for (int i{0}; i < golden_.size(); i++) {
|
||||
for (int i = 0; i < static_cast<int>(golden_.size()); i++) {
|
||||
ASSERT_EQ(golden_[i], i);
|
||||
}
|
||||
}
|
||||
@@ -270,7 +270,7 @@ class IPCImplSimpleFine : public ::testing::TestWithParam<std::tuple<int, int, i
|
||||
}
|
||||
|
||||
auto dev_dest = reinterpret_cast<int*>(ipc_impl_.ipc_bases[mpi_.my_pe()]);
|
||||
for (int i{0}; i < golden_.size(); i++) {
|
||||
for (int i = 0; i < static_cast<int>(golden_.size()); i++) {
|
||||
ASSERT_EQ(golden_[i], dev_dest[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -57,9 +57,9 @@ __device__
|
||||
void
|
||||
tiled_validator(bool *error, int *golden, int *dest, size_t bytes) {
|
||||
size_t elements {bytes / sizeof(int)};
|
||||
for (int i {get_flat_id()}; i < elements; i += get_flat_grid_size()) {
|
||||
for (size_t i = get_flat_id(); i < elements; i += get_flat_grid_size()) {
|
||||
if (golden[i] != dest[i]) {
|
||||
printf("golden[%d] %d != dest[%d] %d\n", i, golden[i], i, dest[i]);
|
||||
printf("golden[%zu] %d != dest[%zu] %d\n", i, golden[i], i, dest[i]);
|
||||
*error = true;
|
||||
}
|
||||
}
|
||||
@@ -100,13 +100,13 @@ template <typename NotifierT>
|
||||
__global__
|
||||
void
|
||||
kernel_tiled_fine_copy_block(IpcImpl *ipc_impl, bool *error, int *golden, int *src, int *dest, size_t bytes, TestType test, NotifierT *notifier) {
|
||||
int block_bytes = blockDim.x * THREAD_TRANSFER_GRANULARITY;
|
||||
int block_byte_offset = blockIdx.x * block_bytes;
|
||||
for (int i {block_byte_offset}; i < bytes; i += get_flat_grid_size() * THREAD_TRANSFER_GRANULARITY) {
|
||||
int chunk = min(block_bytes, bytes - i);
|
||||
size_t block_bytes = blockDim.x * THREAD_TRANSFER_GRANULARITY;
|
||||
size_t block_byte_offset = blockIdx.x * block_bytes;
|
||||
for (size_t i = block_byte_offset; i < bytes; i += get_flat_grid_size() * THREAD_TRANSFER_GRANULARITY) {
|
||||
int chunk = min(block_bytes, bytes - i);
|
||||
ipc_impl->ipcCopy_wg((char*)dest + i, (char*)src + i, chunk);
|
||||
ipc_impl->ipcFence();
|
||||
__syncthreads();
|
||||
__syncthreads();
|
||||
if (test == WRITE) {
|
||||
if (!threadIdx.x) {
|
||||
ipc_impl->ipcAMOFetchAdd(dest + SIGNAL_OFFSET, -1);
|
||||
@@ -123,10 +123,10 @@ template <typename NotifierT>
|
||||
__global__
|
||||
void
|
||||
kernel_tiled_fine_copy_warp(IpcImpl *ipc_impl, bool *error, int *golden, int *src, int *dest, size_t bytes, TestType test, NotifierT *notifier) {
|
||||
int warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / WARP_SIZE;
|
||||
int warp_bytes = WARP_SIZE * THREAD_TRANSFER_GRANULARITY;
|
||||
int warp_byte_offset = warp_id * warp_bytes;
|
||||
for (int i {warp_byte_offset}; i < bytes; i += get_flat_grid_size() * THREAD_TRANSFER_GRANULARITY) {
|
||||
size_t warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / WARP_SIZE;
|
||||
size_t warp_bytes = WARP_SIZE * THREAD_TRANSFER_GRANULARITY;
|
||||
size_t warp_byte_offset = warp_id * warp_bytes;
|
||||
for (size_t i = warp_byte_offset; i < bytes; i += get_flat_grid_size() * THREAD_TRANSFER_GRANULARITY) {
|
||||
int chunk = min(warp_bytes, bytes - i);
|
||||
ipc_impl->ipcCopy_wave(((char*)dest) + i, ((char*)src) + i, chunk);
|
||||
ipc_impl->ipcFence();
|
||||
@@ -220,7 +220,7 @@ class IPCImplTiledFine : public ::testing::TestWithParam<std::tuple<int, int, in
|
||||
|
||||
void validate_golden(size_t elems) {
|
||||
ASSERT_EQ(golden_.size(), elems);
|
||||
for (int i{0}; i < golden_.size(); i++) {
|
||||
for (int i = 0; i < static_cast<int>(golden_.size()); i++) {
|
||||
ASSERT_EQ(golden_[i], i);
|
||||
}
|
||||
}
|
||||
@@ -288,7 +288,7 @@ class IPCImplTiledFine : public ::testing::TestWithParam<std::tuple<int, int, in
|
||||
}
|
||||
|
||||
auto dev_dest = reinterpret_cast<int*>(ipc_impl_.ipc_bases[mpi_.my_pe()]);
|
||||
for (int i{0}; i < golden_.size(); i++) {
|
||||
for (int i = 0; i < static_cast<int>(golden_.size()); i++) {
|
||||
ASSERT_EQ(golden_[i], dev_dest[i]);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user