Extending hipMallocManaged tests (#2670)

* Extending hipMallocManaged tests

* Fixed compilation error

* Added tests skips for hipMallocManaged tests on devices that don't support managed memory

* Removed unused stream


[ROCm/hip-tests commit: 4b07ea6125]
Этот коммит содержится в:
Dylan Angus
2022-07-29 04:05:27 +01:00
коммит произвёл GitHub
родитель b99fdc689c
Коммит 4a96554d63
6 изменённых файлов: 636 добавлений и 634 удалений
+1 -1
Просмотреть файл
@@ -166,7 +166,7 @@ static inline unsigned setNumBlocks(unsigned blocksPerCU, unsigned threadsPerBlo
HIP_CHECK(hipGetDeviceProperties(&props, device));
unsigned blocks = props.multiProcessorCount * blocksPerCU;
if (blocks * threadsPerBlock > N) {
if (blocks * threadsPerBlock < N) {
blocks = (N + threadsPerBlock - 1) / threadsPerBlock;
}
+78 -116
Просмотреть файл
@@ -1,5 +1,5 @@
/*
Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
@@ -24,15 +24,14 @@
only on HMM enabled devices
*/
#include <hip_test_common.hh>
#include "hipMallocManagedCommon.hh"
#include <hip_test_kernels.hh>
#include <hip_test_checkers.hh>
// Kernel functions
__global__ void KernelMul_MngdMem(int *Hmm, int *Dptr, size_t n) {
__global__ void KernelMul_MngdMem(int* Hmm, int* Dptr, size_t n) {
size_t index = blockIdx.x * blockDim.x + threadIdx.x;
size_t stride = blockDim.x * gridDim.x;
for (size_t i = index; i < n; i += stride) {
@@ -40,7 +39,7 @@ __global__ void KernelMul_MngdMem(int *Hmm, int *Dptr, size_t n) {
}
}
__global__ void KernelMulAdd_MngdMem(int *Hmm, size_t n) {
__global__ void KernelMulAdd_MngdMem(int* Hmm, size_t n) {
size_t index = blockIdx.x * blockDim.x + threadIdx.x;
size_t stride = blockDim.x * gridDim.x;
for (size_t i = index; i < n; i += stride) {
@@ -48,130 +47,93 @@ __global__ void KernelMulAdd_MngdMem(int *Hmm, size_t n) {
}
}
__global__ void KrnlWth2MemTypesC(unsigned char *Hmm, unsigned char *Dptr,
size_t n) {
size_t index = blockIdx.x * blockDim.x + threadIdx.x;
size_t stride = blockDim.x * gridDim.x;
for (size_t i = index; i < n; i += stride) {
Hmm[i] = Dptr[i] + 10;
}
}
static int HmmAttrPrint() {
int managed = 0;
INFO("The following are the attribute values related to HMM for"
" device 0:\n");
HIP_CHECK(hipDeviceGetAttribute(&managed,
hipDeviceAttributeDirectManagedMemAccessFromHost, 0));
INFO("hipDeviceAttributeDirectManagedMemAccessFromHost: " << managed);
HIP_CHECK(hipDeviceGetAttribute(&managed,
hipDeviceAttributeConcurrentManagedAccess, 0));
INFO("hipDeviceAttributeConcurrentManagedAccess: " << managed);
HIP_CHECK(hipDeviceGetAttribute(&managed,
hipDeviceAttributePageableMemoryAccess, 0));
INFO("hipDeviceAttributePageableMemoryAccess: " << managed);
HIP_CHECK(hipDeviceGetAttribute(&managed,
hipDeviceAttributePageableMemoryAccessUsesHostPageTables, 0));
INFO("hipDeviceAttributePageableMemoryAccessUsesHostPageTables:"
<< managed);
HIP_CHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeManagedMemory,
0));
INFO("hipDeviceAttributeManagedMemory: " << managed);
return managed;
}
static size_t N{4 * 1024 * 1024};
static size_t numElements{64 * 1024 * 1024};
static unsigned blocksPerCU{6};
static unsigned threadsPerBlock{256};
/*
This testcase verifies the hipMallocManaged basic scenario - supported on all devices
*/
TEST_CASE("Unit_hipMallocManaged_Basic") {
int numElements = (N < (64 * 1024 * 1024)) ? 64 * 1024 * 1024 : N;
float *A, *B, *C;
auto managed = HmmAttrPrint();
if (managed != 1) {
WARN(
"GPU doesn't support hipDeviceAttributeManagedMemory attribute so defaulting to system "
"memory.");
}
HIP_CHECK(hipMallocManaged(&A, numElements*sizeof(float)));
HIP_CHECK(hipMallocManaged(&B, numElements*sizeof(float)));
HIP_CHECK(hipMallocManaged(&C, numElements*sizeof(float)));
float *A, *B, *C;
HIP_CHECK(hipMallocManaged(&A, numElements * sizeof(float)));
HIP_CHECK(hipMallocManaged(&B, numElements * sizeof(float)));
HIP_CHECK(hipMallocManaged(&C, numElements * sizeof(float)));
}
/*
This testcase verifies the hipMallocManaged basic scenario - supported only on HMM enabled devices
This testcase verifies the hipMallocManaged advanced scenario - supported only on HMM enabled
devices
*/
TEST_CASE("Unit_hipMallocManaged_Advanced") {
int managed = HmmAttrPrint();
if (managed == 1) {
int numElements = (N < (64 * 1024 * 1024)) ? 64 * 1024 * 1024 : N;
float *A, *B, *C;
HIP_CHECK(hipMallocManaged(&A, numElements*sizeof(float)));
HIP_CHECK(hipMallocManaged(&B, numElements*sizeof(float)));
HIP_CHECK(hipMallocManaged(&C, numElements*sizeof(float)));
HipTest::setDefaultData(numElements, A, B, C);
hipDevice_t device = hipCpuDeviceId;
HIP_CHECK(hipMemAdvise(A, numElements*sizeof(float),
hipMemAdviseSetReadMostly, device));
HIP_CHECK(hipMemPrefetchAsync(A, numElements*sizeof(float), 0));
HIP_CHECK(hipMemPrefetchAsync(B, numElements*sizeof(float), 0));
HIP_CHECK(hipDeviceSynchronize());
HIP_CHECK(hipMemRangeGetAttribute(&device, sizeof(device),
hipMemRangeAttributeLastPrefetchLocation,
A, numElements*sizeof(float)));
if (device != 0) {
INFO("hipMemRangeGetAttribute error, device = " << device);
}
uint32_t read_only = 0xf;
HIP_CHECK(hipMemRangeGetAttribute(&read_only, sizeof(read_only),
hipMemRangeAttributeReadMostly,
A, numElements*sizeof(float)));
if (read_only != 1) {
SUCCEED("hipMemRangeGetAttribute error, read_only = " << read_only);
}
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock,
numElements);
hipEvent_t event0, event1;
HIP_CHECK(hipEventCreate(&event0));
HIP_CHECK(hipEventCreate(&event1));
HIP_CHECK(hipEventRecord(event0, 0));
hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
0, 0, static_cast<const float*>(A),
static_cast<const float*>(B), C, numElements);
HIP_CHECK(hipEventRecord(event1, 0));
HIP_CHECK(hipDeviceSynchronize());
float time = 0.0f;
HIP_CHECK(hipEventElapsedTime(&time, event0, event1));
printf("Time %.3f ms\n", time);
float maxError = 0.0f;
HIP_CHECK(hipMemPrefetchAsync(B, numElements*sizeof(float),
hipCpuDeviceId));
HIP_CHECK(hipDeviceSynchronize());
device = 0;
HIP_CHECK(hipMemRangeGetAttribute(&device, sizeof(device),
hipMemRangeAttributeLastPrefetchLocation,
A, numElements*sizeof(float)));
if (device != hipCpuDeviceId) {
SUCCEED("hipMemRangeGetAttribute error device = " << device);
}
for (int i = 0; i < numElements; i++) {
maxError = fmax(maxError, fabs(B[i]-3.0f));
}
HIP_CHECK(hipFree(A));
HIP_CHECK(hipFree(B));
REQUIRE(maxError != 0.0f);
} else {
SUCCEED("GPU 0 doesn't support hipDeviceAttributeManagedMemory "
"attribute. Hence skipping the testing with Pass result.\n");
auto managed = HmmAttrPrint();
if (managed != 1) {
HipTest::HIP_SKIP_TEST("GPU doesn't support managed memory so skipping test.");
return;
}
}
float *A, *B, *C;
HIP_CHECK(hipMallocManaged(&A, numElements * sizeof(float)));
HIP_CHECK(hipMallocManaged(&B, numElements * sizeof(float)));
HIP_CHECK(hipMallocManaged(&C, numElements * sizeof(float)));
HipTest::setDefaultData(numElements, A, B, C);
hipDevice_t device = hipCpuDeviceId;
HIP_CHECK(hipMemAdvise(A, numElements * sizeof(float), hipMemAdviseSetReadMostly, device));
HIP_CHECK(hipMemPrefetchAsync(A, numElements * sizeof(float), 0));
HIP_CHECK(hipMemPrefetchAsync(B, numElements * sizeof(float), 0));
HIP_CHECK(hipDeviceSynchronize());
HIP_CHECK(hipMemRangeGetAttribute(&device, sizeof(device),
hipMemRangeAttributeLastPrefetchLocation, A,
numElements * sizeof(float)));
if (device != 0) {
INFO("hipMemRangeGetAttribute error, device = " << device);
}
uint32_t read_only = 0xf;
HIP_CHECK(hipMemRangeGetAttribute(&read_only, sizeof(read_only), hipMemRangeAttributeReadMostly,
A, numElements * sizeof(float)));
if (read_only != 1) {
SUCCEED("hipMemRangeGetAttribute error, read_only = " << read_only);
}
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
hipEvent_t event0, event1;
HIP_CHECK(hipEventCreate(&event0));
HIP_CHECK(hipEventCreate(&event1));
HIP_CHECK(hipEventRecord(event0, 0));
hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
static_cast<const float*>(A), static_cast<const float*>(B), C, numElements);
HIP_CHECK(hipEventRecord(event1, 0));
HIP_CHECK(hipDeviceSynchronize());
float time = 0.0f;
HIP_CHECK(hipEventElapsedTime(&time, event0, event1));
printf("Time %.3f ms\n", time);
float maxError = 0.0f;
HIP_CHECK(hipMemPrefetchAsync(B, numElements * sizeof(float), hipCpuDeviceId));
HIP_CHECK(hipDeviceSynchronize());
device = 0;
HIP_CHECK(hipMemRangeGetAttribute(&device, sizeof(device),
hipMemRangeAttributeLastPrefetchLocation, A,
numElements * sizeof(float)));
if (device != hipCpuDeviceId) {
SUCCEED("hipMemRangeGetAttribute error device = " << device);
}
for (size_t i = 0; i < numElements; i++) {
maxError = fmax(maxError, fabs(B[i] - 3.0f));
}
HIP_CHECK(hipFree(A));
HIP_CHECK(hipFree(B));
REQUIRE(maxError != 0.0f);
}
+26
Просмотреть файл
@@ -0,0 +1,26 @@
#include <hip_test_common.hh>
static int HmmAttrPrint() {
int managed = 0;
INFO(
"The following are the attribute values related to HMM for"
" device 0:\n");
HIP_CHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeDirectManagedMemAccessFromHost, 0));
INFO("hipDeviceAttributeDirectManagedMemAccessFromHost: " << managed);
HIP_CHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeConcurrentManagedAccess, 0));
INFO("hipDeviceAttributeConcurrentManagedAccess: " << managed);
HIP_CHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributePageableMemoryAccess, 0));
INFO("hipDeviceAttributePageableMemoryAccess: " << managed);
HIP_CHECK(
hipDeviceGetAttribute(&managed, hipDeviceAttributePageableMemoryAccessUsesHostPageTables, 0));
INFO("hipDeviceAttributePageableMemoryAccessUsesHostPageTables:" << managed);
HIP_CHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeManagedMemory, 0));
INFO("hipDeviceAttributeManagedMemory: " << managed);
if (managed != 1) {
WARN(
"GPU 0 doesn't support hipDeviceAttributeManagedMemory attribute so defaulting to system "
"memory.");
}
return managed;
}
+191 -226
Просмотреть файл
@@ -1,5 +1,5 @@
/*
Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -20,245 +20,210 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
#include "hipMallocManagedCommon.hh"
#include <atomic>
// Kernel function
__global__ void MallcMangdFlgTst(int n, float *x, float *y) {
__global__ void MallcMangdFlgTst(int n, float* x, float* y) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < n; i += stride)
y[i] = x[i] * x[i];
}
// The following function prints info on attributes related to HMM
static int HmmAttrPrint() {
int managed = 0;
INFO("The following are the attribute values related to HMM for"
" device 0:\n");
HIP_CHECK(hipDeviceGetAttribute(&managed,
hipDeviceAttributeDirectManagedMemAccessFromHost, 0));
INFO("hipDeviceAttributeDirectManagedMemAccessFromHost: " << managed);
HIP_CHECK(hipDeviceGetAttribute(&managed,
hipDeviceAttributeConcurrentManagedAccess, 0));
INFO("hipDeviceAttributeConcurrentManagedAccess: " << managed);
HIP_CHECK(hipDeviceGetAttribute(&managed,
hipDeviceAttributePageableMemoryAccess, 0));
INFO("hipDeviceAttributePageableMemoryAccess: " << managed);
HIP_CHECK(hipDeviceGetAttribute(&managed,
hipDeviceAttributePageableMemoryAccessUsesHostPageTables, 0));
INFO("hipDeviceAttributePageableMemoryAccessUsesHostPageTables:"
<< managed);
HIP_CHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeManagedMemory,
0));
INFO("hipDeviceAttributeManagedMemory: " << managed);
return managed;
for (int i = index; i < n; i += stride) y[i] = x[i] * x[i];
}
// The following section tests working of hipMallocManaged with flag parameters
TEST_CASE("Unit_hipMallocManaged_FlgParam") {
int managed = HmmAttrPrint();
if (managed == 1) {
std::atomic<int> DataMismatch{0};
bool IfTestPassed = true;
float *HmmAG = NULL, *HmmAH1 = NULL, *HmmAH2 = NULL, INIT_VAL = 2.5;
int NumDevs = 0, NUM_ELMS = 4096;
HIP_CHECK(hipGetDeviceCount(&NumDevs));
float *Ad = NULL, *Ah = NULL;
Ah = new float[NUM_ELMS];
// Testing hipMemAttachGlobal Flag
HIP_CHECK(hipMallocManaged(&HmmAG, NUM_ELMS * sizeof(float),
hipMemAttachGlobal));
// Initializing HmmAG memory
for (int i = 0; i < NUM_ELMS; i++) {
HmmAG[i] = INIT_VAL;
Ah[i] = 0;
}
int blockSize = 256;
int numBlocks = (NUM_ELMS + blockSize - 1) / blockSize;
dim3 dimGrid(numBlocks, 1, 1);
dim3 dimBlock(blockSize, 1, 1);
hipStream_t strm;
for (int i = 0; i < NumDevs; i++) {
HIP_CHECK(hipSetDevice(i));
HIP_CHECK(hipStreamCreate(&strm));
HIP_CHECK(hipMalloc(&Ad, NUM_ELMS * sizeof(float)));
HIP_CHECK(hipMemset(Ad, 0, NUM_ELMS * sizeof(float)));
MallcMangdFlgTst<<<dimGrid, dimBlock, 0, strm>>>(NUM_ELMS, HmmAG, Ad);
HIP_CHECK(hipStreamSynchronize(strm));
HIP_CHECK(hipMemcpy(Ah, Ad, NUM_ELMS * sizeof(float),
hipMemcpyDeviceToHost));
for (int j = 0; j < NUM_ELMS; ++j) {
if (Ah[j] != (INIT_VAL * INIT_VAL)) {
DataMismatch++;
}
}
if (DataMismatch != 0) {
WARN("Data Mismatch observed when kernel launched on");
WARN(" device: " << i);
IfTestPassed = false;
}
DataMismatch = 0;
HIP_CHECK(hipFree(Ad));
HIP_CHECK(hipStreamDestroy(strm));
}
delete[] Ah;
HIP_CHECK(hipFree(HmmAG));
DataMismatch = 0;
HIP_CHECK(hipMallocManaged(&HmmAH1, NUM_ELMS * sizeof(float),
hipMemAttachHost));
HIP_CHECK(hipMallocManaged(&HmmAH2, NUM_ELMS * sizeof(float),
hipMemAttachHost));
// Initializing HmmAH memory
for (int i = 0; i < NUM_ELMS; i++) {
HmmAH1[i] = INIT_VAL;
HmmAH2[i] = 0;
}
for (int i = 0; i < NumDevs; i++) {
HIP_CHECK(hipSetDevice(i));
HIP_CHECK(hipStreamCreate(&strm));
HIP_CHECK(hipMemset(HmmAH2, 0, NUM_ELMS * sizeof(float)));
MallcMangdFlgTst<<<dimGrid, dimBlock, 0, strm>>>(NUM_ELMS,
HmmAH1, HmmAH2);
HIP_CHECK(hipStreamSynchronize(strm));
for (int j = 0; j < NUM_ELMS; ++j) {
if (HmmAH2[j] != (INIT_VAL * INIT_VAL)) {
DataMismatch++;
}
}
if (DataMismatch != 0) {
WARN("Data Mismatch observed when kernel launched on");
WARN(" device: " << i);
IfTestPassed = false;
}
HIP_CHECK(hipStreamDestroy(strm));
}
HIP_CHECK(hipFree(HmmAH1));
HIP_CHECK(hipFree(HmmAH2));
REQUIRE(IfTestPassed);
} else {
SUCCEED("Gpu doesnt support HMM! Hence skipping the test with PASS result");
auto managed = HmmAttrPrint();
if (managed != 1) {
HipTest::HIP_SKIP_TEST("GPU doesn't support managed memory so skipping test.");
return;
}
std::atomic<int> DataMismatch{0};
bool IfTestPassed = true;
float *HmmAG = NULL, *HmmAH1 = NULL, *HmmAH2 = NULL, INIT_VAL = 2.5;
int NumDevs = 0, NUM_ELMS = 4096;
HIP_CHECK(hipGetDeviceCount(&NumDevs));
float *Ad = NULL, *Ah = NULL;
Ah = new float[NUM_ELMS];
// Testing hipMemAttachGlobal Flag
HIP_CHECK(hipMallocManaged(&HmmAG, NUM_ELMS * sizeof(float), hipMemAttachGlobal));
// Initializing HmmAG memory
for (int i = 0; i < NUM_ELMS; i++) {
HmmAG[i] = INIT_VAL;
Ah[i] = 0;
}
int blockSize = 256;
int numBlocks = (NUM_ELMS + blockSize - 1) / blockSize;
dim3 dimGrid(numBlocks, 1, 1);
dim3 dimBlock(blockSize, 1, 1);
hipStream_t strm;
for (int i = 0; i < NumDevs; i++) {
HIP_CHECK(hipSetDevice(i));
HIP_CHECK(hipStreamCreate(&strm));
HIP_CHECK(hipMalloc(&Ad, NUM_ELMS * sizeof(float)));
HIP_CHECK(hipMemset(Ad, 0, NUM_ELMS * sizeof(float)));
MallcMangdFlgTst<<<dimGrid, dimBlock, 0, strm>>>(NUM_ELMS, HmmAG, Ad);
HIP_CHECK(hipStreamSynchronize(strm));
HIP_CHECK(hipMemcpy(Ah, Ad, NUM_ELMS * sizeof(float), hipMemcpyDeviceToHost));
for (int j = 0; j < NUM_ELMS; ++j) {
if (Ah[j] != (INIT_VAL * INIT_VAL)) {
DataMismatch++;
}
}
if (DataMismatch != 0) {
WARN("Data Mismatch observed when kernel launched on");
WARN(" device: " << i);
IfTestPassed = false;
}
DataMismatch = 0;
HIP_CHECK(hipFree(Ad));
HIP_CHECK(hipStreamDestroy(strm));
}
delete[] Ah;
HIP_CHECK(hipFree(HmmAG));
DataMismatch = 0;
HIP_CHECK(hipMallocManaged(&HmmAH1, NUM_ELMS * sizeof(float), hipMemAttachHost));
HIP_CHECK(hipMallocManaged(&HmmAH2, NUM_ELMS * sizeof(float), hipMemAttachHost));
// Initializing HmmAH memory
for (int i = 0; i < NUM_ELMS; i++) {
HmmAH1[i] = INIT_VAL;
HmmAH2[i] = 0;
}
for (int i = 0; i < NumDevs; i++) {
HIP_CHECK(hipSetDevice(i));
HIP_CHECK(hipStreamCreate(&strm));
HIP_CHECK(hipMemset(HmmAH2, 0, NUM_ELMS * sizeof(float)));
MallcMangdFlgTst<<<dimGrid, dimBlock, 0, strm>>>(NUM_ELMS, HmmAH1, HmmAH2);
HIP_CHECK(hipStreamSynchronize(strm));
for (int j = 0; j < NUM_ELMS; ++j) {
if (HmmAH2[j] != (INIT_VAL * INIT_VAL)) {
DataMismatch++;
}
}
if (DataMismatch != 0) {
WARN("Data Mismatch observed when kernel launched on");
WARN(" device: " << i);
IfTestPassed = false;
}
HIP_CHECK(hipStreamDestroy(strm));
}
HIP_CHECK(hipFree(HmmAH1));
HIP_CHECK(hipFree(HmmAH2));
REQUIRE(IfTestPassed);
}
// The following function tests Memory access allocated using hipMallocManaged
// in multiple streams
TEST_CASE("Unit_hipMallocManaged_AccessMultiStream") {
int managed = HmmAttrPrint();
if (managed == 1) {
std::atomic<int> DataMismatch{0};
bool IfTestPassed = true;
float *HmmAG = NULL, *HmmAH1 = NULL, *HmmAH2 = NULL, INIT_VAL = 2.5;
int NumStrms = 0, MultiDevice = 0, NUM_ELMS = 4096;
HIP_CHECK(hipGetDeviceCount(&MultiDevice));
if (MultiDevice >= 2) {
HIP_CHECK(hipGetDeviceCount(&NumStrms));
} else {
NumStrms = 4;
}
hipStream_t **Stream = new hipStream_t*[NumStrms];
for (int i = 0; i < NumStrms; ++i) {
Stream[i] = reinterpret_cast<hipStream_t*>(malloc(sizeof(hipStream_t)));
}
float *Ad = NULL, *Ah = NULL;
Ah = new float[NUM_ELMS];
for (int i = 0; i < NumStrms; ++i) {
if (MultiDevice >= 2) {
HIP_CHECK(hipSetDevice(i));
}
HIP_CHECK(hipStreamCreate(Stream[i]));
}
HIP_CHECK(hipSetDevice(0));
// Testing hipMemAttachGlobal Flag
HIP_CHECK(hipMallocManaged(&HmmAG, NUM_ELMS * sizeof(float),
hipMemAttachGlobal));
// Initializing HmmAG memory
for (int i = 0; i < NUM_ELMS; i++) {
HmmAG[i] = INIT_VAL;
Ah[i] = 0;
}
int blockSize = 256;
int numBlocks = (NUM_ELMS + blockSize - 1) / blockSize;
dim3 dimGrid(numBlocks, 1, 1);
dim3 dimBlock(blockSize, 1, 1);
for (int i = 0; i < NumStrms; i++) {
if (MultiDevice >= 2) {
HIP_CHECK(hipSetDevice(i));
}
HIP_CHECK(hipMalloc(&Ad, NUM_ELMS * sizeof(float)));
HIP_CHECK(hipMemset(Ad, 0, NUM_ELMS * sizeof(float)));
MallcMangdFlgTst<<<dimGrid, dimBlock, 0, *(Stream[i])>>>(NUM_ELMS,
HmmAG, Ad);
HIP_CHECK(hipStreamSynchronize(*(Stream[i])));
// Validating the results
HIP_CHECK(hipMemcpy(Ah, Ad, NUM_ELMS * sizeof(float),
hipMemcpyDeviceToHost));
for (int j = 0; j < NUM_ELMS; ++j) {
if (Ah[j] != (INIT_VAL * INIT_VAL)) {
DataMismatch++;
}
}
if (DataMismatch != 0) {
WARN("Data Mismatch observed when kernel launched on");
WARN(" device: " << i);
IfTestPassed = false;
}
DataMismatch = 0;
HIP_CHECK(hipFree(Ad));
}
delete[] Ah;
HIP_CHECK(hipFree(HmmAG));
DataMismatch = 0;
HIP_CHECK(hipMallocManaged(&HmmAH1, NUM_ELMS * sizeof(float),
hipMemAttachHost));
HIP_CHECK(hipMallocManaged(&HmmAH2, NUM_ELMS * sizeof(float),
hipMemAttachHost));
// Initializing HmmAH memory
for (int i = 0; i < NUM_ELMS; i++) {
HmmAH1[i] = INIT_VAL;
HmmAH2[i] = 0;
}
for (int i = 0; i < NumStrms; i++) {
if (MultiDevice >= 2) {
HIP_CHECK(hipSetDevice(i));
}
HIP_CHECK(hipMemset(HmmAH2, 0, NUM_ELMS * sizeof(float)));
MallcMangdFlgTst<<<dimGrid, dimBlock, 0, *(Stream[i])>>>(NUM_ELMS,
HmmAH1, HmmAH2);
HIP_CHECK(hipStreamSynchronize(*(Stream[i])));
for (int j = 0; j < NUM_ELMS; ++j) {
if (HmmAH2[j] != (INIT_VAL * INIT_VAL)) {
DataMismatch++;
break;
}
}
if (DataMismatch != 0) {
WARN("Data Mismatch observed when kernel launched on");
WARN(" device: " << i);
IfTestPassed = false;
}
}
HIP_CHECK(hipFree(HmmAH1));
HIP_CHECK(hipFree(HmmAH2));
for (int i = 0; i < NumStrms; ++i) {
HIP_CHECK(hipStreamDestroy(*(Stream[i])));
}
REQUIRE(IfTestPassed);
} else {
SUCCEED("Gpu doesnt support HMM! Hence skipping the test with PASS result");
auto managed = HmmAttrPrint();
if (managed != 1) {
HipTest::HIP_SKIP_TEST("GPU doesn't support managed memory so skipping test.");
return;
}
}
std::atomic<int> DataMismatch{0};
bool IfTestPassed = true;
float *HmmAG = NULL, *HmmAH1 = NULL, *HmmAH2 = NULL, INIT_VAL = 2.5;
int NumStrms = 0, MultiDevice = 0, NUM_ELMS = 4096;
HIP_CHECK(hipGetDeviceCount(&MultiDevice));
if (MultiDevice >= 2) {
HIP_CHECK(hipGetDeviceCount(&NumStrms));
} else {
NumStrms = 4;
}
hipStream_t** Stream = new hipStream_t*[NumStrms];
for (int i = 0; i < NumStrms; ++i) {
Stream[i] = reinterpret_cast<hipStream_t*>(malloc(sizeof(hipStream_t)));
}
float *Ad = NULL, *Ah = NULL;
Ah = new float[NUM_ELMS];
for (int i = 0; i < NumStrms; ++i) {
if (MultiDevice >= 2) {
HIP_CHECK(hipSetDevice(i));
}
HIP_CHECK(hipStreamCreate(Stream[i]));
}
HIP_CHECK(hipSetDevice(0));
// Testing hipMemAttachGlobal Flag
HIP_CHECK(hipMallocManaged(&HmmAG, NUM_ELMS * sizeof(float), hipMemAttachGlobal));
// Initializing HmmAG memory
for (int i = 0; i < NUM_ELMS; i++) {
HmmAG[i] = INIT_VAL;
Ah[i] = 0;
}
int blockSize = 256;
int numBlocks = (NUM_ELMS + blockSize - 1) / blockSize;
dim3 dimGrid(numBlocks, 1, 1);
dim3 dimBlock(blockSize, 1, 1);
for (int i = 0; i < NumStrms; i++) {
if (MultiDevice >= 2) {
HIP_CHECK(hipSetDevice(i));
}
HIP_CHECK(hipMalloc(&Ad, NUM_ELMS * sizeof(float)));
HIP_CHECK(hipMemset(Ad, 0, NUM_ELMS * sizeof(float)));
MallcMangdFlgTst<<<dimGrid, dimBlock, 0, *(Stream[i])>>>(NUM_ELMS, HmmAG, Ad);
HIP_CHECK(hipStreamSynchronize(*(Stream[i])));
// Validating the results
HIP_CHECK(hipMemcpy(Ah, Ad, NUM_ELMS * sizeof(float), hipMemcpyDeviceToHost));
for (int j = 0; j < NUM_ELMS; ++j) {
if (Ah[j] != (INIT_VAL * INIT_VAL)) {
DataMismatch++;
}
}
if (DataMismatch != 0) {
WARN("Data Mismatch observed when kernel launched on");
WARN(" device: " << i);
IfTestPassed = false;
}
DataMismatch = 0;
HIP_CHECK(hipFree(Ad));
}
delete[] Ah;
HIP_CHECK(hipFree(HmmAG));
DataMismatch = 0;
HIP_CHECK(hipMallocManaged(&HmmAH1, NUM_ELMS * sizeof(float), hipMemAttachHost));
HIP_CHECK(hipMallocManaged(&HmmAH2, NUM_ELMS * sizeof(float), hipMemAttachHost));
// Initializing HmmAH memory
for (int i = 0; i < NUM_ELMS; i++) {
HmmAH1[i] = INIT_VAL;
HmmAH2[i] = 0;
}
for (int i = 0; i < NumStrms; i++) {
if (MultiDevice >= 2) {
HIP_CHECK(hipSetDevice(i));
}
HIP_CHECK(hipMemset(HmmAH2, 0, NUM_ELMS * sizeof(float)));
MallcMangdFlgTst<<<dimGrid, dimBlock, 0, *(Stream[i])>>>(NUM_ELMS, HmmAH1, HmmAH2);
HIP_CHECK(hipStreamSynchronize(*(Stream[i])));
for (int j = 0; j < NUM_ELMS; ++j) {
if (HmmAH2[j] != (INIT_VAL * INIT_VAL)) {
DataMismatch++;
break;
}
}
if (DataMismatch != 0) {
WARN("Data Mismatch observed when kernel launched on");
WARN(" device: " << i);
IfTestPassed = false;
}
}
HIP_CHECK(hipFree(HmmAH1));
HIP_CHECK(hipFree(HmmAH2));
for (int i = 0; i < NumStrms; ++i) {
HIP_CHECK(hipStreamDestroy(*(Stream[i])));
}
REQUIRE(IfTestPassed);
}
+150 -62
Просмотреть файл
@@ -1,5 +1,5 @@
/*
Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
@@ -27,28 +27,81 @@
6. Multiple Pointers
*/
#include <hip_test_common.hh>
#include "hipMallocManagedCommon.hh"
#include <hip_test_kernels.hh>
#include <hip_test_checkers.hh>
#include <atomic>
const size_t MAX_GPU{256};
static size_t N{4*1024*1024};
static size_t N{4 * 1024 * 1024};
static unsigned blocksPerCU{6};
static unsigned threadsPerBlock{256};
#define INIT_VAL 123
/*
* Kernel function to perform addition operation.
*/
template <typename T>
__global__ void
vector_sum(T *Ad1, T *Ad2, size_t NUM_ELMTS) {
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x;
template <typename T> __global__ void vector_sum(T* Ad1, T* Ad2, size_t NUM_ELMTS) {
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x;
for (size_t i = offset; i < NUM_ELMTS; i += stride) {
Ad2[i] = Ad1[i] + Ad1[i];
}
for (size_t i = offset; i < NUM_ELMTS; i += stride) {
Ad2[i] = Ad1[i] + Ad1[i];
}
}
/*
* Kernel function to perform multiplication
*/
__global__ void KernelDouble(float* Hmm, float* dPtr, size_t n) {
size_t index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < n) {
dPtr[index] = 2 * Hmm[index];
}
}
/*
* Host function to perform multiplication
*/
void HostKernelDouble(float* Hmm, float* hPtr, size_t n) {
for (size_t i = 0; i < n; i++) {
hPtr[i] = 2 * Hmm[i];
}
}
/*
This testcase verifies the concurrent access of hipMallocManaged Memory on host and device.
*/
TEST_CASE("Unit_hipMallocManaged_HostDeviceConcurrent") {
auto managed = HmmAttrPrint();
if (managed != 1) {
HipTest::HIP_SKIP_TEST("GPU doesn't support managed memory so skipping test.");
return;
}
float *Hmm = nullptr, *hPtr = nullptr, *dPtr = nullptr, *resPtr = nullptr;
hPtr = reinterpret_cast<float*>(malloc(N * sizeof(float)));
resPtr = reinterpret_cast<float*>(malloc(N * sizeof(float)));
HIP_CHECK(hipMalloc(&dPtr, N * sizeof(float)));
HIP_CHECK(hipMallocManaged(&Hmm, N * sizeof(float)));
memset(Hmm, 2.0, N * sizeof(float));
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
std::thread host_thread(HostKernelDouble, Hmm, hPtr, N);
KernelDouble<<<dim3(blocks), dim3(threadsPerBlock), 0, 0>>>(Hmm, dPtr, N);
host_thread.join();
hipMemcpy(resPtr, dPtr, N * sizeof(float), hipMemcpyDeviceToHost);
for (size_t i = 0; i < N; i++) {
REQUIRE(hPtr[i] == resPtr[i]);
}
free(hPtr);
HIP_CHECK(hipFree(dPtr));
HIP_CHECK(hipFree(Hmm));
}
// The following Test case tests the following scenario:
@@ -57,7 +110,13 @@ vector_sum(T *Ad1, T *Ad2, size_t NUM_ELMTS) {
// kernel is launched on acessed chunk of hmm memory
// and checks if there are any inconsistencies or access issues
TEST_CASE("Unit_hipMallocManaged_MultiChunkSingleDevice") {
std::atomic<int> DataMismatch{0};
auto managed = HmmAttrPrint();
if (managed != 1) {
HipTest::HIP_SKIP_TEST("GPU doesn't support managed memory so skipping test.");
return;
}
std::atomic<int> DataMismatch{0};
constexpr int Chunks = 4;
int Counter = 0;
int NUM_ELMS = (1024 * 1024);
@@ -74,16 +133,14 @@ std::atomic<int> DataMismatch{0};
Hmm[Counter] = (INIT_VAL + i);
}
}
const unsigned threadsPerBlock = 256;
const unsigned blocks = (NUM_ELMS + 255)/256;
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
for (int k = 0; k < Chunks; ++k) {
vector_sum<float> <<<blocks, threadsPerBlock, 0, stream[k]>>>
(&Hmm[k * NUM_ELMS], Ad[k], NUM_ELMS);
vector_sum<float>
<<<blocks, threadsPerBlock, 0, stream[k]>>>(&Hmm[k * NUM_ELMS], Ad[k], NUM_ELMS);
}
HIP_CHECK(hipDeviceSynchronize());
for (int m = 0; m < Chunks; ++m) {
HIP_CHECK(hipMemcpy(Ah, Ad[m], NUM_ELMS * sizeof(float),
hipMemcpyDeviceToHost));
HIP_CHECK(hipMemcpy(Ah, Ad[m], NUM_ELMS * sizeof(float), hipMemcpyDeviceToHost));
for (int n = 0; n < NUM_ELMS; ++n) {
if (Ah[n] != ((INIT_VAL + m) * 2)) {
DataMismatch++;
@@ -96,7 +153,7 @@ std::atomic<int> DataMismatch{0};
HIP_CHECK(hipStreamDestroy(stream[i]));
}
HIP_CHECK(hipFree(Hmm));
delete [] Ah;
delete[] Ah;
}
// The following Test case tests the following scenario:
@@ -105,10 +162,20 @@ std::atomic<int> DataMismatch{0};
// kernel is launched on acessed chunk of hmm memory
// and checks if there are any inconsistencies or access issues
TEST_CASE("Unit_hipMallocManaged_MultiChunkMultiDevice") {
auto managed = HmmAttrPrint();
if (managed != 1) {
HipTest::HIP_SKIP_TEST("GPU doesn't support managed memory so skipping test.");
return;
}
std::atomic<int> DataMismatch{0};
int Counter = 0;
int NumDevices = 0;
HIP_CHECK(hipGetDeviceCount(&NumDevices));
if (NumDevices < 2) {
HipTest::HIP_SKIP_TEST("Skipping test because more than one device was not found.");
return;
}
unsigned int NUM_ELMS = (1024 * 1024);
float *Ad[MAX_GPU], *Hmm = NULL, *Ah = new float[NUM_ELMS];
hipStream_t stream[MAX_GPU];
@@ -124,17 +191,15 @@ TEST_CASE("Unit_hipMallocManaged_MultiChunkMultiDevice") {
Hmm[Counter] = INIT_VAL + i;
}
}
const unsigned threadsPerBlock = 256;
const unsigned blocks = (NUM_ELMS + 255)/256;
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
for (int Klaunch = 0; Klaunch < NumDevices; ++Klaunch) {
HIP_CHECK(hipSetDevice(Klaunch));
vector_sum<float> <<<blocks, threadsPerBlock, 0, stream[Klaunch]>>>
(&Hmm[Klaunch * NUM_ELMS], Ad[Klaunch], NUM_ELMS);
vector_sum<float><<<blocks, threadsPerBlock, 0, stream[Klaunch]>>>(&Hmm[Klaunch * NUM_ELMS],
Ad[Klaunch], NUM_ELMS);
}
HIP_CHECK(hipDeviceSynchronize());
for (int m = 0; m < NumDevices; ++m) {
HIP_CHECK(hipMemcpy(Ah, Ad[m], NUM_ELMS * sizeof(float),
hipMemcpyDeviceToHost));
HIP_CHECK(hipMemcpy(Ah, Ad[m], NUM_ELMS * sizeof(float), hipMemcpyDeviceToHost));
for (size_t n = 0; n < NUM_ELMS; ++n) {
if (Ah[n] != ((INIT_VAL + m) * 2)) {
DataMismatch++;
@@ -148,32 +213,38 @@ TEST_CASE("Unit_hipMallocManaged_MultiChunkMultiDevice") {
HIP_CHECK(hipStreamDestroy(stream[i]));
}
HIP_CHECK(hipFree(Hmm));
delete [] Ah;
delete[] Ah;
}
// The following tests oversubscription hipMallocManaged() api
// Currently disabled.
TEST_CASE("Unit_hipMallocManaged_OverSubscription") {
void *A = nullptr;
auto managed = HmmAttrPrint();
if (managed != 1) {
HipTest::HIP_SKIP_TEST("GPU doesn't support managed memory so skipping test.");
return;
}
void* A = nullptr;
size_t total = 0, free = 0;
HIP_CHECK(hipMemGetInfo(&free, &total));
// ToDo: In case of HMM, memory over-subscription is allowed. Hence, relook
// into how out of memory can be tested.
// Demanding more mem size than available
#if HT_AMD
REQUIRE(hipMallocManaged(&A, (free +1), hipMemAttachGlobal) != hipSuccess);
HIP_CHECK_ERROR(hipMallocManaged(&A, (free + 1), hipMemAttachGlobal), hipErrorOutOfMemory);
#endif
}
// The following test does negative testing of hipMallocManaged() api
// by passing invalid values and check if the behavior is as expected
TEST_CASE("Unit_hipMallocManaged_Negative") {
void *A;
void* A;
size_t total = 0, free = 0;
HIP_CHECK(hipMemGetInfo(&free, &total));
SECTION("Nullptr to devPtr") {
REQUIRE(hipMallocManaged(NULL, 1024, hipMemAttachGlobal) != hipSuccess);
HIP_CHECK_ERROR(hipMallocManaged(NULL, 1024, hipMemAttachGlobal), hipErrorInvalidValue);
}
// cuda api doc says : If size is 0, cudaMallocManaged returns
@@ -184,14 +255,14 @@ TEST_CASE("Unit_hipMallocManaged_Negative") {
// reset ptr while returning success (to accommodate cuda 11.2 api behavior).
SECTION("size 0 with flag hipMemAttachGlobal") {
#if HT_AMD
REQUIRE(hipMallocManaged(&A, 0, hipMemAttachGlobal) != hipSuccess);
HIP_CHECK_ERROR(hipMallocManaged(&A, 0, hipMemAttachGlobal), hipErrorInvalidValue);
#else
REQUIRE(hipMallocManaged(&A, 0, hipMemAttachHost) == hipSuccess);
HIP_CHECK(hipMallocManaged(&A, 0, hipMemAttachGlobal));
#endif
}
SECTION("devptr is nullptr with flag hipMemAttachHost") {
REQUIRE(hipMallocManaged(NULL, 1024, hipMemAttachHost) != hipSuccess);
HIP_CHECK_ERROR(hipMallocManaged(NULL, 1024, hipMemAttachHost), hipErrorInvalidValue);
}
// cuda api doc says : If size is 0, cudaMallocManaged returns
@@ -202,32 +273,47 @@ TEST_CASE("Unit_hipMallocManaged_Negative") {
// reset ptr while returning success (to accommodate cuda 11.2 api behavior).
SECTION("size 0 with flag hipMemAttachHost") {
#if HT_AMD
REQUIRE(hipMallocManaged(&A, 0, hipMemAttachHost) != hipSuccess);
HIP_CHECK_ERROR(hipMallocManaged(&A, 0, hipMemAttachHost), hipErrorInvalidValue);
#else
REQUIRE(hipMallocManaged(&A, 0, hipMemAttachHost) == hipSuccess);
HIP_CHECK(hipMallocManaged(&A, 0, hipMemAttachHost));
#endif
}
SECTION("nullptr to devptr, size 0 and flag 0") {
REQUIRE(hipMallocManaged(NULL, 0, 0) != hipSuccess);
HIP_CHECK_ERROR(hipMallocManaged(NULL, 0, 0), hipErrorInvalidValue);
}
SECTION("Numeric value to flag parameter") {
REQUIRE(hipMallocManaged(&A, 1024, 145) != hipSuccess);
SECTION("Invalid flag parameter") {
HIP_CHECK_ERROR(hipMallocManaged(&A, 1024, 145), hipErrorInvalidValue);
}
SECTION("Invalid flag parameter- flag set to 0") {
HIP_CHECK_ERROR(hipMallocManaged(&A, 1024, 0), hipErrorInvalidValue);
}
SECTION("Invalid flag parameter- Both flags set") {
HIP_CHECK_ERROR(hipMallocManaged(&A, 1024, hipMemAttachGlobal | hipMemAttachHost),
hipErrorInvalidValue);
}
SECTION("Negative value to size") {
REQUIRE(hipMallocManaged(&A, -10, hipMemAttachGlobal));
SECTION("Max value to size") {
HIP_CHECK_ERROR(hipMallocManaged(&A, std::numeric_limits<size_t>::max(), hipMemAttachGlobal),
hipErrorOutOfMemory);
}
}
// Allocate two pointers using hipMallocManaged(), initialize,
// then launch kernel using these pointers directly and
// later validate the content without using any Memcpy.
TEMPLATE_TEST_CASE("Unit_hipMallocManaged_TwoPointers", "",
int, float, double) {
TEMPLATE_TEST_CASE("Unit_hipMallocManaged_TwoPointers", "", int, float, double) {
auto managed = HmmAttrPrint();
if (managed != 1) {
HipTest::HIP_SKIP_TEST("GPU doesn't support managed memory so skipping test.");
return;
}
int NumDevices = 0;
HIP_CHECK(hipGetDeviceCount(&NumDevices));
TestType *Hmm1 = nullptr, *Hmm2 = nullptr;
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
for (int i = 0; i < NumDevices; ++i) {
HIP_CHECK(hipSetDevice(i));
@@ -238,10 +324,8 @@ TEMPLATE_TEST_CASE("Unit_hipMallocManaged_TwoPointers", "",
Hmm1[m] = m;
Hmm2[m] = 0;
}
const unsigned threadsPerBlock = 256;
const unsigned blocks = (N + 255)/256;
// Kernel launch
vector_sum <<<blocks, threadsPerBlock>>> (Hmm1, Hmm2, N);
vector_sum<<<blocks, threadsPerBlock>>>(Hmm1, Hmm2, N);
HIP_CHECK(hipDeviceSynchronize());
for (size_t v = 0; v < N; ++v) {
if (Hmm2[v] != static_cast<TestType>(v + v)) {
@@ -259,18 +343,28 @@ TEMPLATE_TEST_CASE("Unit_hipMallocManaged_TwoPointers", "",
// to all other devices. This include verification and Device two Device
// transfers and kernel launch o discover if there any access issues.
TEMPLATE_TEST_CASE("Unit_hipMallocManaged_DeviceContextChange", "",
unsigned char, int, float, double) {
TEMPLATE_TEST_CASE("Unit_hipMallocManaged_DeviceContextChange", "", unsigned char, int, float,
double) {
auto managed = HmmAttrPrint();
if (managed != 1) {
HipTest::HIP_SKIP_TEST("GPU doesn't support managed memory so skipping test.");
return;
}
std::atomic<unsigned int> DataMismatch;
TestType *Ah1 = new TestType[N], *Ah2 = new TestType[N], *Ad = nullptr,
*Hmm = nullptr;
TestType *Ah1 = new TestType[N], *Ah2 = new TestType[N], *Ad = nullptr, *Hmm = nullptr;
int NumDevices = 0;
HIP_CHECK(hipGetDeviceCount(&NumDevices));
if (NumDevices < 2) {
HipTest::HIP_SKIP_TEST("Skipping test because more than one device was not found.");
return;
}
for (size_t i =0; i < N; ++i) {
for (size_t i = 0; i < N; ++i) {
Ah1[i] = INIT_VAL;
Ah2[i] = 0;
}
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
for (int Oloop = 0; Oloop < NumDevices; ++Oloop) {
DataMismatch = 0;
HIP_CHECK(hipSetDevice(Oloop));
@@ -279,8 +373,7 @@ TEMPLATE_TEST_CASE("Unit_hipMallocManaged_DeviceContextChange", "",
HIP_CHECK(hipSetDevice(Iloop));
HIP_CHECK(hipMalloc(&Ad, N * sizeof(TestType)));
// Copy data from host to hipMallocMananged memory and verify
HIP_CHECK(hipMemcpy(Hmm, Ah1, N * sizeof(TestType),
hipMemcpyHostToDevice));
HIP_CHECK(hipMemcpy(Hmm, Ah1, N * sizeof(TestType), hipMemcpyHostToDevice));
for (size_t v = 0; v < N; ++v) {
if (Hmm[v] != INIT_VAL) {
DataMismatch++;
@@ -289,10 +382,8 @@ TEMPLATE_TEST_CASE("Unit_hipMallocManaged_DeviceContextChange", "",
REQUIRE(DataMismatch.load() == 0);
// Executing D2D transfer with hipMallocManaged memory and verify
HIP_CHECK(hipMemcpy(Ad, Hmm, N * sizeof(TestType),
hipMemcpyDeviceToDevice));
HIP_CHECK(hipMemcpy(Ah2, Ad, N * sizeof(TestType),
hipMemcpyDeviceToHost));
HIP_CHECK(hipMemcpy(Ad, Hmm, N * sizeof(TestType), hipMemcpyDeviceToDevice));
HIP_CHECK(hipMemcpy(Ah2, Ad, N * sizeof(TestType), hipMemcpyDeviceToHost));
for (size_t k = 0; k < N; ++k) {
if (Ah2[k] != INIT_VAL) {
DataMismatch++;
@@ -300,14 +391,11 @@ TEMPLATE_TEST_CASE("Unit_hipMallocManaged_DeviceContextChange", "",
}
REQUIRE(DataMismatch.load() == 0);
HIP_CHECK(hipMemset(Ad, 0, N * sizeof(TestType)));
const unsigned threadsPerBlock = 256;
const unsigned blocks = (N + 255)/256;
// Launching the kernel to check if there is any access issue with
// hipMallocManaged memory and local device's memory
vector_sum <<<blocks, threadsPerBlock>>> (Hmm, Ad, N);
hipDeviceSynchronize();
HIP_CHECK(hipMemcpy(Ah2, Ad, N * sizeof(TestType),
hipMemcpyDeviceToHost));
vector_sum<<<blocks, threadsPerBlock>>>(Hmm, Ad, N);
HIP_CHECK(hipDeviceSynchronize());
HIP_CHECK(hipMemcpy(Ah2, Ad, N * sizeof(TestType), hipMemcpyDeviceToHost));
for (size_t m = 0; m < N; ++m) {
if (Ah2[m] != 246) {
DataMismatch++;
+190 -229
Просмотреть файл
@@ -20,25 +20,25 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
#include "hipMallocManagedCommon.hh"
#include <atomic>
// Kernel functions
__global__ void HmmMultiThread(int n, float *x, float *y) {
__global__ void HmmMultiThread(int n, float* x, float* y) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < n; i += stride)
y[i] = x[i] * x[i];
for (int i = index; i < n; i += stride) y[i] = x[i] * x[i];
}
__global__ void KrnlWth2MemTypes(int *Hmm, int *Dptr, size_t n) {
__global__ void KrnlWth2MemTypes(int* Hmm, int* Dptr, size_t n) {
size_t index = blockIdx.x * blockDim.x + threadIdx.x;
for (size_t i = index; i < n; i++) {
Hmm[i] = Dptr[i] + 10;
}
}
__global__ void KernelMul_MngdMem123(int *Hmm, int *Dptr, size_t n) {
__global__ void KernelMul_MngdMem123(int* Hmm, int* Dptr, size_t n) {
size_t index = blockIdx.x * blockDim.x + threadIdx.x;
size_t stride = blockDim.x * gridDim.x;
for (size_t i = index; i < n; i += stride) {
@@ -47,32 +47,27 @@ __global__ void KernelMul_MngdMem123(int *Hmm, int *Dptr, size_t n) {
}
// The following variable is used to determine the failure of test case
static bool IfTestPassed = true;
static bool IfTestPassed = true;
static void LaunchKrnl(int *Hmm1, size_t NumElms, int InitVal, int GpuOrdnl,
int AdviseFlg) {
int *Hmm2 = NULL;
static void LaunchKrnl(int* Hmm1, size_t NumElms, int InitVal, int GpuOrdnl, int AdviseFlg) {
int* Hmm2 = NULL;
hipStream_t strm;
HIPCHECK(hipSetDevice(GpuOrdnl));
HIPCHECK(hipStreamCreate(&strm));
if (AdviseFlg == 0) {
HIPCHECK(hipMemAdvise(Hmm1 , NumElms * sizeof(int),
hipMemAdviseSetReadMostly, GpuOrdnl));
HIPCHECK(hipMemAdvise(Hmm1, NumElms * sizeof(int), hipMemAdviseSetReadMostly, GpuOrdnl));
} else if (AdviseFlg == 1) {
HIPCHECK(hipMemAdvise(Hmm1 , NumElms * sizeof(int),
hipMemAdviseSetPreferredLocation, GpuOrdnl));
HIPCHECK(hipMemAdvise(Hmm1, NumElms * sizeof(int), hipMemAdviseSetPreferredLocation, GpuOrdnl));
} else if (AdviseFlg == 2) {
HIPCHECK(hipMemAdvise(Hmm1 , NumElms * sizeof(int),
hipMemAdviseSetAccessedBy, GpuOrdnl));
HIPCHECK(hipMemAdvise(Hmm1, NumElms * sizeof(int), hipMemAdviseSetAccessedBy, GpuOrdnl));
} else if (AdviseFlg == 3) {
HIPCHECK(hipMemPrefetchAsync(Hmm1, NumElms * sizeof(int), GpuOrdnl, strm));
HIPCHECK(hipStreamSynchronize(strm));
}
HIPCHECK(hipMallocManaged(&Hmm2, (sizeof(int) * NumElms)));
for (int i = 0; i < 2; ++i) {
KrnlWth2MemTypes<<<((NumElms + 63)/64), 64, 0, strm>>>(Hmm2, Hmm1, NumElms);
KrnlWth2MemTypes<<<((NumElms + 63) / 64), 64, 0, strm>>>(Hmm2, Hmm1, NumElms);
HIPCHECK(hipStreamSynchronize(strm));
}
// Verifying the result
@@ -88,7 +83,7 @@ static void LaunchKrnl(int *Hmm1, size_t NumElms, int InitVal, int GpuOrdnl,
}
}
static void LaunchKrnl2(int *Hmm, size_t NumElms, int InitVal, int HmmMem) {
static void LaunchKrnl2(int* Hmm, size_t NumElms, int InitVal, int HmmMem) {
int *ptr = nullptr, blockSize = 64, *HstPtr = nullptr;
hipStream_t strm;
HIPCHECK(hipStreamCreate(&strm));
@@ -99,7 +94,7 @@ static void LaunchKrnl2(int *Hmm, size_t NumElms, int InitVal, int HmmMem) {
HIPCHECK(hipMallocManaged(&ptr, (sizeof(int) * NumElms)));
}
dim3 dimBlock(blockSize, 1, 1);
dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1);
dim3 dimGrid((NumElms + blockSize - 1) / blockSize, 1, 1);
for (int i = 0; i < 2; ++i) {
KrnlWth2MemTypes<<<dimGrid, dimBlock, 0, strm>>>(ptr, Hmm, NumElms);
}
@@ -107,8 +102,7 @@ static void LaunchKrnl2(int *Hmm, size_t NumElms, int InitVal, int HmmMem) {
// Verifying the result
int DataMismatch = 0;
if (HmmMem == 0) {
HIPCHECK(hipMemcpy(HstPtr, ptr, (sizeof(int) * NumElms),
hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(HstPtr, ptr, (sizeof(int) * NumElms), hipMemcpyDeviceToHost));
for (size_t i = 0; i < NumElms; ++i) {
if (HstPtr[i] != (InitVal + 10)) {
DataMismatch++;
@@ -127,13 +121,13 @@ static void LaunchKrnl2(int *Hmm, size_t NumElms, int InitVal, int HmmMem) {
}
}
static void LaunchKrnl3(int *Dptr, size_t NumElms, int InitVal) {
static void LaunchKrnl3(int* Dptr, size_t NumElms, int InitVal) {
int *Hmm = NULL, blockSize = 64;
hipStream_t strm;
HIPCHECK(hipStreamCreate(&strm));
HIPCHECK(hipMallocManaged(&Hmm, (sizeof(int) * NumElms)));
dim3 dimBlock(blockSize, 1, 1);
dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1);
dim3 dimGrid((NumElms + blockSize - 1) / blockSize, 1, 1);
for (int i = 0; i < 2; ++i) {
KrnlWth2MemTypes<<<dimGrid, dimBlock, 0, strm>>>(Hmm, Dptr, NumElms);
}
@@ -152,14 +146,13 @@ static void LaunchKrnl3(int *Dptr, size_t NumElms, int InitVal) {
}
static void LaunchKrnl5(int *Hmm1, size_t NumElms, int InitVal,
int KerneltoLaunch) {
static void LaunchKrnl5(int* Hmm1, size_t NumElms, int InitVal, int KerneltoLaunch) {
int *Hmm2 = NULL, blockSize = 64;
hipStream_t strm;
HIPCHECK(hipStreamCreate(&strm));
HIPCHECK(hipMallocManaged(&Hmm2, (sizeof(int) * NumElms)));
dim3 dimBlock(blockSize, 1, 1);
dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1);
dim3 dimGrid((NumElms + blockSize - 1) / blockSize, 1, 1);
for (int i = 0; i < 2; ++i) {
if (KerneltoLaunch == 0) {
KrnlWth2MemTypes<<<dimGrid, dimBlock, 0, strm>>>(Hmm2, Hmm1, NumElms);
@@ -200,8 +193,7 @@ static void TestFlagParamGlobal(int dev) {
HIPCHECK(hipSetDevice(dev));
HIPCHECK(hipStreamCreate(&strm));
// Testing hipMemAttachGlobal Flag
HIPCHECK(hipMallocManaged(&HmmAG, NUM_ELMS * sizeof(float),
hipMemAttachGlobal));
HIPCHECK(hipMallocManaged(&HmmAG, NUM_ELMS * sizeof(float), hipMemAttachGlobal));
// Initializing HmmAG memory
for (int i = 0; i < NUM_ELMS; i++) {
@@ -246,10 +238,8 @@ static void TestFlagParamHost(int dev) {
hipStream_t strm;
HIPCHECK(hipSetDevice(dev));
HIPCHECK(hipStreamCreate(&strm));
HIPCHECK(hipMallocManaged(&HmmAH1, NUM_ELMS * sizeof(float),
hipMemAttachHost));
HIPCHECK(hipMallocManaged(&HmmAH2, NUM_ELMS * sizeof(float),
hipMemAttachHost));
HIPCHECK(hipMallocManaged(&HmmAH1, NUM_ELMS * sizeof(float), hipMemAttachHost));
HIPCHECK(hipMallocManaged(&HmmAH2, NUM_ELMS * sizeof(float), hipMemAttachHost));
// Initializing HmmAH memory
for (int i = 0; i < NUM_ELMS; i++) {
HmmAH1[i] = INIT_VAL;
@@ -294,77 +284,54 @@ static void AllocateHmmMemory(int flag, int device) {
}
}
static int HmmAttrPrint() {
int managed = 0;
INFO("The following are the attribute values related to HMM for"
" device 0:\n");
HIP_CHECK(hipDeviceGetAttribute(&managed,
hipDeviceAttributeDirectManagedMemAccessFromHost, 0));
INFO("hipDeviceAttributeDirectManagedMemAccessFromHost: " << managed);
HIP_CHECK(hipDeviceGetAttribute(&managed,
hipDeviceAttributeConcurrentManagedAccess, 0));
INFO("hipDeviceAttributeConcurrentManagedAccess: " << managed);
HIP_CHECK(hipDeviceGetAttribute(&managed,
hipDeviceAttributePageableMemoryAccess, 0));
INFO("hipDeviceAttributePageableMemoryAccess: " << managed);
HIP_CHECK(hipDeviceGetAttribute(&managed,
hipDeviceAttributePageableMemoryAccessUsesHostPageTables, 0));
INFO("hipDeviceAttributePageableMemoryAccessUsesHostPageTables:"
<< managed);
HIP_CHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeManagedMemory,
0));
INFO("hipDeviceAttributeManagedMemory: " << managed);
return managed;
}
TEST_CASE("Unit_hipMallocManaged_MultiThread") {
auto managed = HmmAttrPrint();
if (managed != 1) {
HipTest::HIP_SKIP_TEST("GPU doesn't support managed memory so skipping test.");
return;
}
IfTestPassed = true;
int NumDevs = 0, managed = 0, ATTACH_GLOBAL = 0, ATTACH_HOST = 1;
int NumDevs = 0, ATTACH_GLOBAL = 0, ATTACH_HOST = 1;
int ITERATIONS = 10;
managed = HmmAttrPrint();
if (managed) {
HIP_CHECK(hipGetDeviceCount(&NumDevs));
std::vector<std::thread> T1;
std::vector<std::thread> T2;
for (int i = 0; i < NumDevs; ++i) {
for (int j = 0; j < ITERATIONS; ++j) {
T1.push_back(std::thread(TestFlagParamGlobal, i));
T2.push_back(std::thread(AllocateHmmMemory, ATTACH_GLOBAL, i));
}
for (auto &t1 : T1) {
if (t1.joinable()) {
t1.join();
}
}
for (auto &t2 : T2) {
if (t2.joinable()) {
t2.join();
}
HIP_CHECK(hipGetDeviceCount(&NumDevs));
std::vector<std::thread> T1;
std::vector<std::thread> T2;
for (int i = 0; i < NumDevs; ++i) {
for (int j = 0; j < ITERATIONS; ++j) {
T1.push_back(std::thread(TestFlagParamGlobal, i));
T2.push_back(std::thread(AllocateHmmMemory, ATTACH_GLOBAL, i));
}
for (auto& t1 : T1) {
if (t1.joinable()) {
t1.join();
}
}
T1.clear();
T2.clear();
for (int i = 0; i < NumDevs; ++i) {
for (int j = 0; j < ITERATIONS; ++j) {
T1.push_back(std::thread(TestFlagParamHost, i));
T2.push_back(std::thread(AllocateHmmMemory, ATTACH_HOST, i));
}
for (auto &t1 : T1) {
if (t1.joinable()) {
t1.join();
}
}
for (auto &t2 : T2) {
if (t2.joinable()) {
t2.join();
}
for (auto& t2 : T2) {
if (t2.joinable()) {
t2.join();
}
}
}
T1.clear();
T2.clear();
for (int i = 0; i < NumDevs; ++i) {
for (int j = 0; j < ITERATIONS; ++j) {
T1.push_back(std::thread(TestFlagParamHost, i));
T2.push_back(std::thread(AllocateHmmMemory, ATTACH_HOST, i));
}
for (auto& t1 : T1) {
if (t1.joinable()) {
t1.join();
}
}
for (auto& t2 : T2) {
if (t2.joinable()) {
t2.join();
}
}
} else {
SUCCEED("GPU 0 doesn't support hipDeviceAttributeManagedMemory"
"attribute. Hence skipping the testing with Pass result.\n");
}
REQUIRE(IfTestPassed);
}
@@ -372,175 +339,169 @@ TEST_CASE("Unit_hipMallocManaged_MultiThread") {
// The following test checks what happens when same Hmm memory is used to
// launch multiple threads over multiple gpus
TEST_CASE("Unit_hipMallocManaged_MGpuMThread") {
auto managed = HmmAttrPrint();
if (managed != 1) {
HipTest::HIP_SKIP_TEST("GPU doesn't support managed memory so skipping test.");
return;
}
IfTestPassed = true;
int Ngpus = 0;
HIP_CHECK(hipGetDeviceCount(&Ngpus));
if (Ngpus < 2) {
WARN("This test needs atleast 2 or more gpus, but the system");
WARN(" has only " << Ngpus);
WARN(" gpus. Hence skipping the test.");
SUCCEED("\n");
HipTest::HIP_SKIP_TEST("Skipping test because more than one device was not found.");
return;
}
int managed = HmmAttrPrint();
if (managed == 1) {
int InitVal = 123, *Hmm1 = NULL, NumElms = 4096*4;
HIP_CHECK(hipMallocManaged(&Hmm1, (NumElms * sizeof(int))));
for (int i = 0; i < NumElms; ++i) {
Hmm1[i] = InitVal;
}
std::vector<std::thread> Thrds;
// AdviseFlg=0 for ReadMostly to be applied
// AdviseFlg=1 for PreferredLocation to be applied
// AdviseFlg=2 for AccessedBy to be applied
// AdviseFlg=3 to prefetch the memory to particular gpu
for (int AdviseFlg = 0; AdviseFlg < 4; ++AdviseFlg) {
for (int i = 0; i < Ngpus; ++i) {
Thrds.push_back(std::thread(LaunchKrnl, Hmm1, NumElms, InitVal, i,
AdviseFlg));
}
for (auto &thr : Thrds) {
if (thr.joinable()) {
thr.join();
}
int InitVal = 123, *Hmm1 = NULL, NumElms = 4096 * 4;
HIP_CHECK(hipMallocManaged(&Hmm1, (NumElms * sizeof(int))));
for (int i = 0; i < NumElms; ++i) {
Hmm1[i] = InitVal;
}
std::vector<std::thread> Thrds;
// AdviseFlg=0 for ReadMostly to be applied
// AdviseFlg=1 for PreferredLocation to be applied
// AdviseFlg=2 for AccessedBy to be applied
// AdviseFlg=3 to prefetch the memory to particular gpu
for (int AdviseFlg = 0; AdviseFlg < 4; ++AdviseFlg) {
for (int i = 0; i < Ngpus; ++i) {
Thrds.push_back(std::thread(LaunchKrnl, Hmm1, NumElms, InitVal, i, AdviseFlg));
}
for (auto& thr : Thrds) {
if (thr.joinable()) {
thr.join();
}
}
REQUIRE(IfTestPassed);
} else {
SUCCEED("GPU 0 doesn't support hipDeviceAttributeManagedMemory "
"attribute. Hence skipping the testing with Pass result.\n");
}
REQUIRE(IfTestPassed);
}
// The following test checks what happens when multiple kernels are launched
// with same Hmm memory
TEST_CASE("Unit_hipMallocManaged_MultiKrnlComnHmm") {
IfTestPassed = true;
int managed = HmmAttrPrint();
if (managed == 1) {
int InitVal = 123, *Hmm = NULL, NumElms = 1024*4, TotThrds = 2;
int HmmMem2 = 0, *HstPtr = nullptr; // to indicate the thread that
// hipMalloc() memory has to be used
HstPtr = reinterpret_cast<int*>(new int[NumElms]);
HIP_CHECK(hipMalloc(&Hmm, (NumElms * sizeof(int))));
for (int i = 0; i < NumElms; ++i) {
HstPtr[i] = InitVal;
}
HIP_CHECK(hipMemcpy(Hmm, HstPtr, (NumElms * sizeof(int)),
hipMemcpyHostToDevice));
std::vector<std::thread> Thrds;
for (int i = 0; i < TotThrds; ++i) {
Thrds.push_back(std::thread(LaunchKrnl2, Hmm, NumElms, InitVal, HmmMem2));
}
for (auto &thr : Thrds) {
if (thr.joinable()) {
thr.join();
}
}
delete[] HstPtr;
} else {
SUCCEED("GPU 0 doesn't support hipDeviceAttributeManagedMemory "
"attribute. Hence skipping the testing with Pass result.\n");
auto managed = HmmAttrPrint();
if (managed != 1) {
HipTest::HIP_SKIP_TEST("GPU doesn't support managed memory so skipping test.");
return;
}
IfTestPassed = true;
int InitVal = 123, *Hmm = NULL, NumElms = 1024 * 4, TotThrds = 2;
int HmmMem2 = 0, *HstPtr = nullptr; // to indicate the thread that
// hipMalloc() memory has to be used
HstPtr = reinterpret_cast<int*>(new int[NumElms]);
HIP_CHECK(hipMalloc(&Hmm, (NumElms * sizeof(int))));
for (int i = 0; i < NumElms; ++i) {
HstPtr[i] = InitVal;
}
HIP_CHECK(hipMemcpy(Hmm, HstPtr, (NumElms * sizeof(int)), hipMemcpyHostToDevice));
std::vector<std::thread> Thrds;
for (int i = 0; i < TotThrds; ++i) {
Thrds.push_back(std::thread(LaunchKrnl2, Hmm, NumElms, InitVal, HmmMem2));
}
for (auto& thr : Thrds) {
if (thr.joinable()) {
thr.join();
}
}
delete[] HstPtr;
}
// The following test checks what happens when multiple kernels are launched
// with same hipMalloc() memory
TEST_CASE("Unit_hipMallocManaged_MultiKrnlComnMalloc") {
IfTestPassed = true;
int managed = HmmAttrPrint();
if (managed) {
int InitVal = 123, *Dptr = NULL, NumElms = 4096*8, TotThrds = 2;
int *HstPtr = reinterpret_cast<int*>(new int[NumElms]);
HIP_CHECK(hipMalloc(&Dptr, (NumElms * sizeof(int))));
for (int i = 0; i < NumElms; ++i) {
HstPtr[i] = InitVal;
}
HIP_CHECK(hipMemcpy(Dptr, HstPtr, (NumElms * sizeof(int)),
hipMemcpyHostToDevice));
std::vector<std::thread> Thrds;
for (int i = 0; i < TotThrds; ++i) {
Thrds.push_back(std::thread(LaunchKrnl3, Dptr, NumElms, InitVal));
}
for (auto &thr : Thrds) {
if (thr.joinable()) {
thr.join();
}
}
delete[] HstPtr;
HIP_CHECK(hipFree(Dptr));
} else {
SUCCEED("GPU 0 doesn't support hipDeviceAttributeManagedMemory "
"attribute. Hence skipping the testing with Pass result.\n");
auto managed = HmmAttrPrint();
if (managed != 1) {
HipTest::HIP_SKIP_TEST("GPU doesn't support managed memory so skipping test.");
return;
}
IfTestPassed = true;
int InitVal = 123, *Dptr = NULL, NumElms = 4096 * 8, TotThrds = 2;
int* HstPtr = reinterpret_cast<int*>(new int[NumElms]);
HIP_CHECK(hipMalloc(&Dptr, (NumElms * sizeof(int))));
for (int i = 0; i < NumElms; ++i) {
HstPtr[i] = InitVal;
}
HIP_CHECK(hipMemcpy(Dptr, HstPtr, (NumElms * sizeof(int)), hipMemcpyHostToDevice));
std::vector<std::thread> Thrds;
for (int i = 0; i < TotThrds; ++i) {
Thrds.push_back(std::thread(LaunchKrnl3, Dptr, NumElms, InitVal));
}
for (auto& thr : Thrds) {
if (thr.joinable()) {
thr.join();
}
}
delete[] HstPtr;
HIP_CHECK(hipFree(Dptr));
}
// The following section tests the scenario wherein multiple threads use their
// own stream to launch kernel on common Hmm memory
TEST_CASE("Unit_hipMallocManaged_MultiThrdMultiStrm") {
IfTestPassed = true;
int managed = HmmAttrPrint();
if (managed == 1) {
int NumElms = 4096*4;
int *Hmm1 = NULL, TotlThrds = 4, InitVal = 123;
int HmmMem = 1; // to indicate the thread that Hmm memory need to be
// used inside it
HIP_CHECK(hipMallocManaged(&Hmm1, (NumElms * sizeof(int))));
for (int i = 0; i < NumElms; ++i) {
Hmm1[i] = InitVal;
}
std::vector<std::thread> Thrds;
for (int i = 0; i < TotlThrds; ++i) {
Thrds.push_back(std::thread(LaunchKrnl2, Hmm1, NumElms, InitVal, HmmMem));
}
auto managed = HmmAttrPrint();
if (managed != 1) {
HipTest::HIP_SKIP_TEST("GPU doesn't support managed memory so skipping test.");
return;
}
for (auto &thr : Thrds) {
if (thr.joinable()) {
thr.join();
}
IfTestPassed = true;
int NumElms = 4096 * 4;
int *Hmm1 = NULL, TotlThrds = 4, InitVal = 123;
int HmmMem = 1; // to indicate the thread that Hmm memory need to be
// used inside it
HIP_CHECK(hipMallocManaged(&Hmm1, (NumElms * sizeof(int))));
for (int i = 0; i < NumElms; ++i) {
Hmm1[i] = InitVal;
}
std::vector<std::thread> Thrds;
for (int i = 0; i < TotlThrds; ++i) {
Thrds.push_back(std::thread(LaunchKrnl2, Hmm1, NumElms, InitVal, HmmMem));
}
for (auto& thr : Thrds) {
if (thr.joinable()) {
thr.join();
}
} else {
SUCCEED("GPU 0 doesn't support hipDeviceAttributeManagedMemory "
"attribute. Hence skipping the testing with Pass result.\n");
}
}
// The following section tests the scenario wherein two threads each use
// different kernel but common HMM memory
TEST_CASE("Unit_hipMallocManaged_TwoKrnlsComnHmmMem") {
IfTestPassed = true;
int managed = HmmAttrPrint();
if (managed == 1) {
int InitVal = 123, *Dptr = NULL, NumElms = 4096*4, TotThrds = 2;
int *HstPtr = reinterpret_cast<int*>(new int[NumElms]);
HIP_CHECK(hipMalloc(&Dptr, (NumElms * sizeof(int))));
for (int i = 0; i < NumElms; ++i) {
HstPtr[i] = InitVal;
}
HIP_CHECK(hipMemcpy(Dptr, HstPtr, (NumElms * sizeof(int)),
hipMemcpyHostToDevice));
std::vector<std::thread> Thrds;
for (int i = 0; i < TotThrds; ++i) {
Thrds.push_back(std::thread(LaunchKrnl5, Dptr, NumElms, InitVal, i));
}
for (auto &thr : Thrds) {
if (thr.joinable()) {
thr.join();
}
}
delete[] HstPtr;
HIP_CHECK(hipFree(Dptr));
} else {
SUCCEED("GPU 0 doesn't support hipDeviceAttributeManagedMemory "
"attribute. Hence skipping the testing with Pass result.\n");
auto managed = HmmAttrPrint();
if (managed != 1) {
HipTest::HIP_SKIP_TEST("GPU doesn't support managed memory so skipping test.");
return;
}
IfTestPassed = true;
int InitVal = 123, *Dptr = NULL, NumElms = 4096 * 4, TotThrds = 2;
int* HstPtr = reinterpret_cast<int*>(new int[NumElms]);
HIP_CHECK(hipMalloc(&Dptr, (NumElms * sizeof(int))));
for (int i = 0; i < NumElms; ++i) {
HstPtr[i] = InitVal;
}
HIP_CHECK(hipMemcpy(Dptr, HstPtr, (NumElms * sizeof(int)), hipMemcpyHostToDevice));
std::vector<std::thread> Thrds;
for (int i = 0; i < TotThrds; ++i) {
Thrds.push_back(std::thread(LaunchKrnl5, Dptr, NumElms, InitVal, i));
}
for (auto& thr : Thrds) {
if (thr.joinable()) {
thr.join();
}
}
delete[] HstPtr;
HIP_CHECK(hipFree(Dptr));
}