fix(transpose): correct host allocation and GB/s calculation (#860)
이 커밋은 다음에 포함됨:
@@ -116,10 +116,12 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv)
|
||||
std::default_random_engine _engine{ std::random_device{}() * (rank + 1) * (tid + 1) };
|
||||
std::uniform_int_distribution<int> _dist{ 0, 1000 };
|
||||
|
||||
- size_t size = sizeof(int) * M * N;
|
||||
- int* inp_matrix = new int[size];
|
||||
- int* out_matrix = new int[size];
|
||||
- for(size_t i = 0; i < M * N; i++)
|
||||
+ const size_t elems = static_cast<size_t>(M) * static_cast<size_t>(N);
|
||||
+ const size_t size = elems * sizeof(int);
|
||||
+ int* inp_matrix = new int[elems];
|
||||
+ int* out_matrix = new int[elems];
|
||||
|
||||
+ for(size_t i = 0; i < elems; i++)
|
||||
{
|
||||
inp_matrix[i] = _dist(_engine);
|
||||
out_matrix[i] = 0;
|
||||
@@ -149,7 +151,7 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv)
|
||||
HIP_API_CALL(hipMemcpyAsync(out_matrix, out, size, hipMemcpyDeviceToHost, stream));
|
||||
double time =
|
||||
std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count();
|
||||
- float GB = (float) size * nitr * 2 / (1 << 30);
|
||||
+ float GB = static_cast<float>(size) * nitr * 2 / (1 << 30);
|
||||
|
||||
print_lock.lock();
|
||||
std::cout << "[" << rank << "][" << tid << "] Runtime of transpose is " << time
|
||||
|
||||
새 이슈에서 참조
사용자 차단