Extending hipMallocManaged tests (#2670)
* Extending hipMallocManaged tests
* Fixed compilation error
* Added tests skips for hipMallocManaged tests on devices that don't support managed memory
* Removed unused stream
[ROCm/hip-tests commit: 4b07ea6125]
Этот коммит содержится в:
@@ -166,7 +166,7 @@ static inline unsigned setNumBlocks(unsigned blocksPerCU, unsigned threadsPerBlo
|
||||
HIP_CHECK(hipGetDeviceProperties(&props, device));
|
||||
|
||||
unsigned blocks = props.multiProcessorCount * blocksPerCU;
|
||||
if (blocks * threadsPerBlock > N) {
|
||||
if (blocks * threadsPerBlock < N) {
|
||||
blocks = (N + threadsPerBlock - 1) / threadsPerBlock;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
@@ -24,15 +24,14 @@
|
||||
only on HMM enabled devices
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include "hipMallocManagedCommon.hh"
|
||||
#include <hip_test_kernels.hh>
|
||||
#include <hip_test_checkers.hh>
|
||||
|
||||
|
||||
|
||||
// Kernel functions
|
||||
|
||||
__global__ void KernelMul_MngdMem(int *Hmm, int *Dptr, size_t n) {
|
||||
__global__ void KernelMul_MngdMem(int* Hmm, int* Dptr, size_t n) {
|
||||
size_t index = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
size_t stride = blockDim.x * gridDim.x;
|
||||
for (size_t i = index; i < n; i += stride) {
|
||||
@@ -40,7 +39,7 @@ __global__ void KernelMul_MngdMem(int *Hmm, int *Dptr, size_t n) {
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void KernelMulAdd_MngdMem(int *Hmm, size_t n) {
|
||||
__global__ void KernelMulAdd_MngdMem(int* Hmm, size_t n) {
|
||||
size_t index = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
size_t stride = blockDim.x * gridDim.x;
|
||||
for (size_t i = index; i < n; i += stride) {
|
||||
@@ -48,130 +47,93 @@ __global__ void KernelMulAdd_MngdMem(int *Hmm, size_t n) {
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void KrnlWth2MemTypesC(unsigned char *Hmm, unsigned char *Dptr,
|
||||
size_t n) {
|
||||
size_t index = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
size_t stride = blockDim.x * gridDim.x;
|
||||
for (size_t i = index; i < n; i += stride) {
|
||||
Hmm[i] = Dptr[i] + 10;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static int HmmAttrPrint() {
|
||||
int managed = 0;
|
||||
INFO("The following are the attribute values related to HMM for"
|
||||
" device 0:\n");
|
||||
HIP_CHECK(hipDeviceGetAttribute(&managed,
|
||||
hipDeviceAttributeDirectManagedMemAccessFromHost, 0));
|
||||
INFO("hipDeviceAttributeDirectManagedMemAccessFromHost: " << managed);
|
||||
HIP_CHECK(hipDeviceGetAttribute(&managed,
|
||||
hipDeviceAttributeConcurrentManagedAccess, 0));
|
||||
INFO("hipDeviceAttributeConcurrentManagedAccess: " << managed);
|
||||
HIP_CHECK(hipDeviceGetAttribute(&managed,
|
||||
hipDeviceAttributePageableMemoryAccess, 0));
|
||||
INFO("hipDeviceAttributePageableMemoryAccess: " << managed);
|
||||
HIP_CHECK(hipDeviceGetAttribute(&managed,
|
||||
hipDeviceAttributePageableMemoryAccessUsesHostPageTables, 0));
|
||||
INFO("hipDeviceAttributePageableMemoryAccessUsesHostPageTables:"
|
||||
<< managed);
|
||||
|
||||
HIP_CHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeManagedMemory,
|
||||
0));
|
||||
INFO("hipDeviceAttributeManagedMemory: " << managed);
|
||||
return managed;
|
||||
}
|
||||
|
||||
|
||||
|
||||
static size_t N{4 * 1024 * 1024};
|
||||
static size_t numElements{64 * 1024 * 1024};
|
||||
static unsigned blocksPerCU{6};
|
||||
static unsigned threadsPerBlock{256};
|
||||
|
||||
/*
|
||||
This testcase verifies the hipMallocManaged basic scenario - supported on all devices
|
||||
*/
|
||||
|
||||
TEST_CASE("Unit_hipMallocManaged_Basic") {
|
||||
int numElements = (N < (64 * 1024 * 1024)) ? 64 * 1024 * 1024 : N;
|
||||
float *A, *B, *C;
|
||||
auto managed = HmmAttrPrint();
|
||||
if (managed != 1) {
|
||||
WARN(
|
||||
"GPU doesn't support hipDeviceAttributeManagedMemory attribute so defaulting to system "
|
||||
"memory.");
|
||||
}
|
||||
|
||||
HIP_CHECK(hipMallocManaged(&A, numElements*sizeof(float)));
|
||||
HIP_CHECK(hipMallocManaged(&B, numElements*sizeof(float)));
|
||||
HIP_CHECK(hipMallocManaged(&C, numElements*sizeof(float)));
|
||||
float *A, *B, *C;
|
||||
|
||||
HIP_CHECK(hipMallocManaged(&A, numElements * sizeof(float)));
|
||||
HIP_CHECK(hipMallocManaged(&B, numElements * sizeof(float)));
|
||||
HIP_CHECK(hipMallocManaged(&C, numElements * sizeof(float)));
|
||||
}
|
||||
|
||||
/*
|
||||
This testcase verifies the hipMallocManaged basic scenario - supported only on HMM enabled devices
|
||||
This testcase verifies the hipMallocManaged advanced scenario - supported only on HMM enabled
|
||||
devices
|
||||
*/
|
||||
|
||||
TEST_CASE("Unit_hipMallocManaged_Advanced") {
|
||||
int managed = HmmAttrPrint();
|
||||
if (managed == 1) {
|
||||
int numElements = (N < (64 * 1024 * 1024)) ? 64 * 1024 * 1024 : N;
|
||||
float *A, *B, *C;
|
||||
|
||||
HIP_CHECK(hipMallocManaged(&A, numElements*sizeof(float)));
|
||||
HIP_CHECK(hipMallocManaged(&B, numElements*sizeof(float)));
|
||||
HIP_CHECK(hipMallocManaged(&C, numElements*sizeof(float)));
|
||||
HipTest::setDefaultData(numElements, A, B, C);
|
||||
|
||||
hipDevice_t device = hipCpuDeviceId;
|
||||
|
||||
HIP_CHECK(hipMemAdvise(A, numElements*sizeof(float),
|
||||
hipMemAdviseSetReadMostly, device));
|
||||
HIP_CHECK(hipMemPrefetchAsync(A, numElements*sizeof(float), 0));
|
||||
HIP_CHECK(hipMemPrefetchAsync(B, numElements*sizeof(float), 0));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
HIP_CHECK(hipMemRangeGetAttribute(&device, sizeof(device),
|
||||
hipMemRangeAttributeLastPrefetchLocation,
|
||||
A, numElements*sizeof(float)));
|
||||
if (device != 0) {
|
||||
INFO("hipMemRangeGetAttribute error, device = " << device);
|
||||
}
|
||||
uint32_t read_only = 0xf;
|
||||
HIP_CHECK(hipMemRangeGetAttribute(&read_only, sizeof(read_only),
|
||||
hipMemRangeAttributeReadMostly,
|
||||
A, numElements*sizeof(float)));
|
||||
if (read_only != 1) {
|
||||
SUCCEED("hipMemRangeGetAttribute error, read_only = " << read_only);
|
||||
}
|
||||
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock,
|
||||
numElements);
|
||||
hipEvent_t event0, event1;
|
||||
HIP_CHECK(hipEventCreate(&event0));
|
||||
HIP_CHECK(hipEventCreate(&event1));
|
||||
HIP_CHECK(hipEventRecord(event0, 0));
|
||||
hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
|
||||
0, 0, static_cast<const float*>(A),
|
||||
static_cast<const float*>(B), C, numElements);
|
||||
HIP_CHECK(hipEventRecord(event1, 0));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
float time = 0.0f;
|
||||
HIP_CHECK(hipEventElapsedTime(&time, event0, event1));
|
||||
printf("Time %.3f ms\n", time);
|
||||
float maxError = 0.0f;
|
||||
HIP_CHECK(hipMemPrefetchAsync(B, numElements*sizeof(float),
|
||||
hipCpuDeviceId));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
device = 0;
|
||||
HIP_CHECK(hipMemRangeGetAttribute(&device, sizeof(device),
|
||||
hipMemRangeAttributeLastPrefetchLocation,
|
||||
A, numElements*sizeof(float)));
|
||||
if (device != hipCpuDeviceId) {
|
||||
SUCCEED("hipMemRangeGetAttribute error device = " << device);
|
||||
}
|
||||
|
||||
for (int i = 0; i < numElements; i++) {
|
||||
maxError = fmax(maxError, fabs(B[i]-3.0f));
|
||||
}
|
||||
HIP_CHECK(hipFree(A));
|
||||
HIP_CHECK(hipFree(B));
|
||||
REQUIRE(maxError != 0.0f);
|
||||
} else {
|
||||
SUCCEED("GPU 0 doesn't support hipDeviceAttributeManagedMemory "
|
||||
"attribute. Hence skipping the testing with Pass result.\n");
|
||||
auto managed = HmmAttrPrint();
|
||||
if (managed != 1) {
|
||||
HipTest::HIP_SKIP_TEST("GPU doesn't support managed memory so skipping test.");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
float *A, *B, *C;
|
||||
|
||||
HIP_CHECK(hipMallocManaged(&A, numElements * sizeof(float)));
|
||||
HIP_CHECK(hipMallocManaged(&B, numElements * sizeof(float)));
|
||||
HIP_CHECK(hipMallocManaged(&C, numElements * sizeof(float)));
|
||||
HipTest::setDefaultData(numElements, A, B, C);
|
||||
|
||||
hipDevice_t device = hipCpuDeviceId;
|
||||
|
||||
HIP_CHECK(hipMemAdvise(A, numElements * sizeof(float), hipMemAdviseSetReadMostly, device));
|
||||
HIP_CHECK(hipMemPrefetchAsync(A, numElements * sizeof(float), 0));
|
||||
HIP_CHECK(hipMemPrefetchAsync(B, numElements * sizeof(float), 0));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
HIP_CHECK(hipMemRangeGetAttribute(&device, sizeof(device),
|
||||
hipMemRangeAttributeLastPrefetchLocation, A,
|
||||
numElements * sizeof(float)));
|
||||
if (device != 0) {
|
||||
INFO("hipMemRangeGetAttribute error, device = " << device);
|
||||
}
|
||||
uint32_t read_only = 0xf;
|
||||
HIP_CHECK(hipMemRangeGetAttribute(&read_only, sizeof(read_only), hipMemRangeAttributeReadMostly,
|
||||
A, numElements * sizeof(float)));
|
||||
if (read_only != 1) {
|
||||
SUCCEED("hipMemRangeGetAttribute error, read_only = " << read_only);
|
||||
}
|
||||
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
|
||||
hipEvent_t event0, event1;
|
||||
HIP_CHECK(hipEventCreate(&event0));
|
||||
HIP_CHECK(hipEventCreate(&event1));
|
||||
HIP_CHECK(hipEventRecord(event0, 0));
|
||||
hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
|
||||
static_cast<const float*>(A), static_cast<const float*>(B), C, numElements);
|
||||
HIP_CHECK(hipEventRecord(event1, 0));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
float time = 0.0f;
|
||||
HIP_CHECK(hipEventElapsedTime(&time, event0, event1));
|
||||
printf("Time %.3f ms\n", time);
|
||||
float maxError = 0.0f;
|
||||
HIP_CHECK(hipMemPrefetchAsync(B, numElements * sizeof(float), hipCpuDeviceId));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
device = 0;
|
||||
HIP_CHECK(hipMemRangeGetAttribute(&device, sizeof(device),
|
||||
hipMemRangeAttributeLastPrefetchLocation, A,
|
||||
numElements * sizeof(float)));
|
||||
if (device != hipCpuDeviceId) {
|
||||
SUCCEED("hipMemRangeGetAttribute error device = " << device);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < numElements; i++) {
|
||||
maxError = fmax(maxError, fabs(B[i] - 3.0f));
|
||||
}
|
||||
HIP_CHECK(hipFree(A));
|
||||
HIP_CHECK(hipFree(B));
|
||||
REQUIRE(maxError != 0.0f);
|
||||
}
|
||||
|
||||
@@ -0,0 +1,26 @@
|
||||
#include <hip_test_common.hh>
|
||||
|
||||
static int HmmAttrPrint() {
|
||||
int managed = 0;
|
||||
INFO(
|
||||
"The following are the attribute values related to HMM for"
|
||||
" device 0:\n");
|
||||
HIP_CHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeDirectManagedMemAccessFromHost, 0));
|
||||
INFO("hipDeviceAttributeDirectManagedMemAccessFromHost: " << managed);
|
||||
HIP_CHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeConcurrentManagedAccess, 0));
|
||||
INFO("hipDeviceAttributeConcurrentManagedAccess: " << managed);
|
||||
HIP_CHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributePageableMemoryAccess, 0));
|
||||
INFO("hipDeviceAttributePageableMemoryAccess: " << managed);
|
||||
HIP_CHECK(
|
||||
hipDeviceGetAttribute(&managed, hipDeviceAttributePageableMemoryAccessUsesHostPageTables, 0));
|
||||
INFO("hipDeviceAttributePageableMemoryAccessUsesHostPageTables:" << managed);
|
||||
|
||||
HIP_CHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeManagedMemory, 0));
|
||||
INFO("hipDeviceAttributeManagedMemory: " << managed);
|
||||
if (managed != 1) {
|
||||
WARN(
|
||||
"GPU 0 doesn't support hipDeviceAttributeManagedMemory attribute so defaulting to system "
|
||||
"memory.");
|
||||
}
|
||||
return managed;
|
||||
}
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -20,245 +20,210 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include "hipMallocManagedCommon.hh"
|
||||
#include <atomic>
|
||||
|
||||
// Kernel function
|
||||
__global__ void MallcMangdFlgTst(int n, float *x, float *y) {
|
||||
__global__ void MallcMangdFlgTst(int n, float* x, float* y) {
|
||||
int index = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int stride = blockDim.x * gridDim.x;
|
||||
for (int i = index; i < n; i += stride)
|
||||
y[i] = x[i] * x[i];
|
||||
}
|
||||
|
||||
|
||||
// The following function prints info on attributes related to HMM
|
||||
static int HmmAttrPrint() {
|
||||
int managed = 0;
|
||||
INFO("The following are the attribute values related to HMM for"
|
||||
" device 0:\n");
|
||||
HIP_CHECK(hipDeviceGetAttribute(&managed,
|
||||
hipDeviceAttributeDirectManagedMemAccessFromHost, 0));
|
||||
INFO("hipDeviceAttributeDirectManagedMemAccessFromHost: " << managed);
|
||||
HIP_CHECK(hipDeviceGetAttribute(&managed,
|
||||
hipDeviceAttributeConcurrentManagedAccess, 0));
|
||||
INFO("hipDeviceAttributeConcurrentManagedAccess: " << managed);
|
||||
HIP_CHECK(hipDeviceGetAttribute(&managed,
|
||||
hipDeviceAttributePageableMemoryAccess, 0));
|
||||
INFO("hipDeviceAttributePageableMemoryAccess: " << managed);
|
||||
HIP_CHECK(hipDeviceGetAttribute(&managed,
|
||||
hipDeviceAttributePageableMemoryAccessUsesHostPageTables, 0));
|
||||
INFO("hipDeviceAttributePageableMemoryAccessUsesHostPageTables:"
|
||||
<< managed);
|
||||
|
||||
HIP_CHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeManagedMemory,
|
||||
0));
|
||||
INFO("hipDeviceAttributeManagedMemory: " << managed);
|
||||
return managed;
|
||||
for (int i = index; i < n; i += stride) y[i] = x[i] * x[i];
|
||||
}
|
||||
|
||||
// The following section tests working of hipMallocManaged with flag parameters
|
||||
TEST_CASE("Unit_hipMallocManaged_FlgParam") {
|
||||
int managed = HmmAttrPrint();
|
||||
if (managed == 1) {
|
||||
std::atomic<int> DataMismatch{0};
|
||||
bool IfTestPassed = true;
|
||||
float *HmmAG = NULL, *HmmAH1 = NULL, *HmmAH2 = NULL, INIT_VAL = 2.5;
|
||||
int NumDevs = 0, NUM_ELMS = 4096;
|
||||
HIP_CHECK(hipGetDeviceCount(&NumDevs));
|
||||
float *Ad = NULL, *Ah = NULL;
|
||||
Ah = new float[NUM_ELMS];
|
||||
// Testing hipMemAttachGlobal Flag
|
||||
HIP_CHECK(hipMallocManaged(&HmmAG, NUM_ELMS * sizeof(float),
|
||||
hipMemAttachGlobal));
|
||||
|
||||
// Initializing HmmAG memory
|
||||
for (int i = 0; i < NUM_ELMS; i++) {
|
||||
HmmAG[i] = INIT_VAL;
|
||||
Ah[i] = 0;
|
||||
}
|
||||
|
||||
int blockSize = 256;
|
||||
int numBlocks = (NUM_ELMS + blockSize - 1) / blockSize;
|
||||
dim3 dimGrid(numBlocks, 1, 1);
|
||||
dim3 dimBlock(blockSize, 1, 1);
|
||||
hipStream_t strm;
|
||||
for (int i = 0; i < NumDevs; i++) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
HIP_CHECK(hipStreamCreate(&strm));
|
||||
HIP_CHECK(hipMalloc(&Ad, NUM_ELMS * sizeof(float)));
|
||||
HIP_CHECK(hipMemset(Ad, 0, NUM_ELMS * sizeof(float)));
|
||||
MallcMangdFlgTst<<<dimGrid, dimBlock, 0, strm>>>(NUM_ELMS, HmmAG, Ad);
|
||||
HIP_CHECK(hipStreamSynchronize(strm));
|
||||
HIP_CHECK(hipMemcpy(Ah, Ad, NUM_ELMS * sizeof(float),
|
||||
hipMemcpyDeviceToHost));
|
||||
for (int j = 0; j < NUM_ELMS; ++j) {
|
||||
if (Ah[j] != (INIT_VAL * INIT_VAL)) {
|
||||
DataMismatch++;
|
||||
}
|
||||
}
|
||||
if (DataMismatch != 0) {
|
||||
WARN("Data Mismatch observed when kernel launched on");
|
||||
WARN(" device: " << i);
|
||||
IfTestPassed = false;
|
||||
}
|
||||
DataMismatch = 0;
|
||||
|
||||
HIP_CHECK(hipFree(Ad));
|
||||
HIP_CHECK(hipStreamDestroy(strm));
|
||||
}
|
||||
delete[] Ah;
|
||||
HIP_CHECK(hipFree(HmmAG));
|
||||
|
||||
DataMismatch = 0;
|
||||
HIP_CHECK(hipMallocManaged(&HmmAH1, NUM_ELMS * sizeof(float),
|
||||
hipMemAttachHost));
|
||||
HIP_CHECK(hipMallocManaged(&HmmAH2, NUM_ELMS * sizeof(float),
|
||||
hipMemAttachHost));
|
||||
|
||||
// Initializing HmmAH memory
|
||||
for (int i = 0; i < NUM_ELMS; i++) {
|
||||
HmmAH1[i] = INIT_VAL;
|
||||
HmmAH2[i] = 0;
|
||||
}
|
||||
for (int i = 0; i < NumDevs; i++) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
HIP_CHECK(hipStreamCreate(&strm));
|
||||
HIP_CHECK(hipMemset(HmmAH2, 0, NUM_ELMS * sizeof(float)));
|
||||
MallcMangdFlgTst<<<dimGrid, dimBlock, 0, strm>>>(NUM_ELMS,
|
||||
HmmAH1, HmmAH2);
|
||||
HIP_CHECK(hipStreamSynchronize(strm));
|
||||
for (int j = 0; j < NUM_ELMS; ++j) {
|
||||
if (HmmAH2[j] != (INIT_VAL * INIT_VAL)) {
|
||||
DataMismatch++;
|
||||
}
|
||||
}
|
||||
if (DataMismatch != 0) {
|
||||
WARN("Data Mismatch observed when kernel launched on");
|
||||
WARN(" device: " << i);
|
||||
IfTestPassed = false;
|
||||
}
|
||||
HIP_CHECK(hipStreamDestroy(strm));
|
||||
}
|
||||
HIP_CHECK(hipFree(HmmAH1));
|
||||
HIP_CHECK(hipFree(HmmAH2));
|
||||
REQUIRE(IfTestPassed);
|
||||
} else {
|
||||
SUCCEED("Gpu doesnt support HMM! Hence skipping the test with PASS result");
|
||||
|
||||
auto managed = HmmAttrPrint();
|
||||
if (managed != 1) {
|
||||
HipTest::HIP_SKIP_TEST("GPU doesn't support managed memory so skipping test.");
|
||||
return;
|
||||
}
|
||||
|
||||
std::atomic<int> DataMismatch{0};
|
||||
bool IfTestPassed = true;
|
||||
float *HmmAG = NULL, *HmmAH1 = NULL, *HmmAH2 = NULL, INIT_VAL = 2.5;
|
||||
int NumDevs = 0, NUM_ELMS = 4096;
|
||||
HIP_CHECK(hipGetDeviceCount(&NumDevs));
|
||||
float *Ad = NULL, *Ah = NULL;
|
||||
Ah = new float[NUM_ELMS];
|
||||
// Testing hipMemAttachGlobal Flag
|
||||
HIP_CHECK(hipMallocManaged(&HmmAG, NUM_ELMS * sizeof(float), hipMemAttachGlobal));
|
||||
|
||||
// Initializing HmmAG memory
|
||||
for (int i = 0; i < NUM_ELMS; i++) {
|
||||
HmmAG[i] = INIT_VAL;
|
||||
Ah[i] = 0;
|
||||
}
|
||||
|
||||
int blockSize = 256;
|
||||
int numBlocks = (NUM_ELMS + blockSize - 1) / blockSize;
|
||||
dim3 dimGrid(numBlocks, 1, 1);
|
||||
dim3 dimBlock(blockSize, 1, 1);
|
||||
hipStream_t strm;
|
||||
for (int i = 0; i < NumDevs; i++) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
HIP_CHECK(hipStreamCreate(&strm));
|
||||
HIP_CHECK(hipMalloc(&Ad, NUM_ELMS * sizeof(float)));
|
||||
HIP_CHECK(hipMemset(Ad, 0, NUM_ELMS * sizeof(float)));
|
||||
MallcMangdFlgTst<<<dimGrid, dimBlock, 0, strm>>>(NUM_ELMS, HmmAG, Ad);
|
||||
HIP_CHECK(hipStreamSynchronize(strm));
|
||||
HIP_CHECK(hipMemcpy(Ah, Ad, NUM_ELMS * sizeof(float), hipMemcpyDeviceToHost));
|
||||
for (int j = 0; j < NUM_ELMS; ++j) {
|
||||
if (Ah[j] != (INIT_VAL * INIT_VAL)) {
|
||||
DataMismatch++;
|
||||
}
|
||||
}
|
||||
if (DataMismatch != 0) {
|
||||
WARN("Data Mismatch observed when kernel launched on");
|
||||
WARN(" device: " << i);
|
||||
IfTestPassed = false;
|
||||
}
|
||||
DataMismatch = 0;
|
||||
|
||||
HIP_CHECK(hipFree(Ad));
|
||||
HIP_CHECK(hipStreamDestroy(strm));
|
||||
}
|
||||
delete[] Ah;
|
||||
HIP_CHECK(hipFree(HmmAG));
|
||||
|
||||
DataMismatch = 0;
|
||||
HIP_CHECK(hipMallocManaged(&HmmAH1, NUM_ELMS * sizeof(float), hipMemAttachHost));
|
||||
HIP_CHECK(hipMallocManaged(&HmmAH2, NUM_ELMS * sizeof(float), hipMemAttachHost));
|
||||
|
||||
// Initializing HmmAH memory
|
||||
for (int i = 0; i < NUM_ELMS; i++) {
|
||||
HmmAH1[i] = INIT_VAL;
|
||||
HmmAH2[i] = 0;
|
||||
}
|
||||
for (int i = 0; i < NumDevs; i++) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
HIP_CHECK(hipStreamCreate(&strm));
|
||||
HIP_CHECK(hipMemset(HmmAH2, 0, NUM_ELMS * sizeof(float)));
|
||||
MallcMangdFlgTst<<<dimGrid, dimBlock, 0, strm>>>(NUM_ELMS, HmmAH1, HmmAH2);
|
||||
HIP_CHECK(hipStreamSynchronize(strm));
|
||||
for (int j = 0; j < NUM_ELMS; ++j) {
|
||||
if (HmmAH2[j] != (INIT_VAL * INIT_VAL)) {
|
||||
DataMismatch++;
|
||||
}
|
||||
}
|
||||
if (DataMismatch != 0) {
|
||||
WARN("Data Mismatch observed when kernel launched on");
|
||||
WARN(" device: " << i);
|
||||
IfTestPassed = false;
|
||||
}
|
||||
HIP_CHECK(hipStreamDestroy(strm));
|
||||
}
|
||||
HIP_CHECK(hipFree(HmmAH1));
|
||||
HIP_CHECK(hipFree(HmmAH2));
|
||||
REQUIRE(IfTestPassed);
|
||||
}
|
||||
|
||||
// The following function tests Memory access allocated using hipMallocManaged
|
||||
// in multiple streams
|
||||
TEST_CASE("Unit_hipMallocManaged_AccessMultiStream") {
|
||||
int managed = HmmAttrPrint();
|
||||
if (managed == 1) {
|
||||
std::atomic<int> DataMismatch{0};
|
||||
bool IfTestPassed = true;
|
||||
float *HmmAG = NULL, *HmmAH1 = NULL, *HmmAH2 = NULL, INIT_VAL = 2.5;
|
||||
int NumStrms = 0, MultiDevice = 0, NUM_ELMS = 4096;
|
||||
HIP_CHECK(hipGetDeviceCount(&MultiDevice));
|
||||
if (MultiDevice >= 2) {
|
||||
HIP_CHECK(hipGetDeviceCount(&NumStrms));
|
||||
} else {
|
||||
NumStrms = 4;
|
||||
}
|
||||
hipStream_t **Stream = new hipStream_t*[NumStrms];
|
||||
for (int i = 0; i < NumStrms; ++i) {
|
||||
Stream[i] = reinterpret_cast<hipStream_t*>(malloc(sizeof(hipStream_t)));
|
||||
}
|
||||
float *Ad = NULL, *Ah = NULL;
|
||||
Ah = new float[NUM_ELMS];
|
||||
for (int i = 0; i < NumStrms; ++i) {
|
||||
if (MultiDevice >= 2) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
}
|
||||
HIP_CHECK(hipStreamCreate(Stream[i]));
|
||||
}
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
// Testing hipMemAttachGlobal Flag
|
||||
HIP_CHECK(hipMallocManaged(&HmmAG, NUM_ELMS * sizeof(float),
|
||||
hipMemAttachGlobal));
|
||||
|
||||
// Initializing HmmAG memory
|
||||
for (int i = 0; i < NUM_ELMS; i++) {
|
||||
HmmAG[i] = INIT_VAL;
|
||||
Ah[i] = 0;
|
||||
}
|
||||
|
||||
int blockSize = 256;
|
||||
int numBlocks = (NUM_ELMS + blockSize - 1) / blockSize;
|
||||
dim3 dimGrid(numBlocks, 1, 1);
|
||||
dim3 dimBlock(blockSize, 1, 1);
|
||||
for (int i = 0; i < NumStrms; i++) {
|
||||
if (MultiDevice >= 2) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
}
|
||||
HIP_CHECK(hipMalloc(&Ad, NUM_ELMS * sizeof(float)));
|
||||
HIP_CHECK(hipMemset(Ad, 0, NUM_ELMS * sizeof(float)));
|
||||
MallcMangdFlgTst<<<dimGrid, dimBlock, 0, *(Stream[i])>>>(NUM_ELMS,
|
||||
HmmAG, Ad);
|
||||
HIP_CHECK(hipStreamSynchronize(*(Stream[i])));
|
||||
// Validating the results
|
||||
HIP_CHECK(hipMemcpy(Ah, Ad, NUM_ELMS * sizeof(float),
|
||||
hipMemcpyDeviceToHost));
|
||||
for (int j = 0; j < NUM_ELMS; ++j) {
|
||||
if (Ah[j] != (INIT_VAL * INIT_VAL)) {
|
||||
DataMismatch++;
|
||||
}
|
||||
}
|
||||
if (DataMismatch != 0) {
|
||||
WARN("Data Mismatch observed when kernel launched on");
|
||||
WARN(" device: " << i);
|
||||
IfTestPassed = false;
|
||||
}
|
||||
DataMismatch = 0;
|
||||
|
||||
HIP_CHECK(hipFree(Ad));
|
||||
}
|
||||
delete[] Ah;
|
||||
HIP_CHECK(hipFree(HmmAG));
|
||||
|
||||
DataMismatch = 0;
|
||||
HIP_CHECK(hipMallocManaged(&HmmAH1, NUM_ELMS * sizeof(float),
|
||||
hipMemAttachHost));
|
||||
HIP_CHECK(hipMallocManaged(&HmmAH2, NUM_ELMS * sizeof(float),
|
||||
hipMemAttachHost));
|
||||
|
||||
// Initializing HmmAH memory
|
||||
for (int i = 0; i < NUM_ELMS; i++) {
|
||||
HmmAH1[i] = INIT_VAL;
|
||||
HmmAH2[i] = 0;
|
||||
}
|
||||
for (int i = 0; i < NumStrms; i++) {
|
||||
if (MultiDevice >= 2) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
}
|
||||
HIP_CHECK(hipMemset(HmmAH2, 0, NUM_ELMS * sizeof(float)));
|
||||
MallcMangdFlgTst<<<dimGrid, dimBlock, 0, *(Stream[i])>>>(NUM_ELMS,
|
||||
HmmAH1, HmmAH2);
|
||||
HIP_CHECK(hipStreamSynchronize(*(Stream[i])));
|
||||
for (int j = 0; j < NUM_ELMS; ++j) {
|
||||
if (HmmAH2[j] != (INIT_VAL * INIT_VAL)) {
|
||||
DataMismatch++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (DataMismatch != 0) {
|
||||
WARN("Data Mismatch observed when kernel launched on");
|
||||
WARN(" device: " << i);
|
||||
IfTestPassed = false;
|
||||
}
|
||||
}
|
||||
|
||||
HIP_CHECK(hipFree(HmmAH1));
|
||||
HIP_CHECK(hipFree(HmmAH2));
|
||||
for (int i = 0; i < NumStrms; ++i) {
|
||||
HIP_CHECK(hipStreamDestroy(*(Stream[i])));
|
||||
}
|
||||
REQUIRE(IfTestPassed);
|
||||
} else {
|
||||
SUCCEED("Gpu doesnt support HMM! Hence skipping the test with PASS result");
|
||||
|
||||
auto managed = HmmAttrPrint();
|
||||
if (managed != 1) {
|
||||
HipTest::HIP_SKIP_TEST("GPU doesn't support managed memory so skipping test.");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
std::atomic<int> DataMismatch{0};
|
||||
bool IfTestPassed = true;
|
||||
float *HmmAG = NULL, *HmmAH1 = NULL, *HmmAH2 = NULL, INIT_VAL = 2.5;
|
||||
int NumStrms = 0, MultiDevice = 0, NUM_ELMS = 4096;
|
||||
HIP_CHECK(hipGetDeviceCount(&MultiDevice));
|
||||
if (MultiDevice >= 2) {
|
||||
HIP_CHECK(hipGetDeviceCount(&NumStrms));
|
||||
} else {
|
||||
NumStrms = 4;
|
||||
}
|
||||
hipStream_t** Stream = new hipStream_t*[NumStrms];
|
||||
for (int i = 0; i < NumStrms; ++i) {
|
||||
Stream[i] = reinterpret_cast<hipStream_t*>(malloc(sizeof(hipStream_t)));
|
||||
}
|
||||
float *Ad = NULL, *Ah = NULL;
|
||||
Ah = new float[NUM_ELMS];
|
||||
for (int i = 0; i < NumStrms; ++i) {
|
||||
if (MultiDevice >= 2) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
}
|
||||
HIP_CHECK(hipStreamCreate(Stream[i]));
|
||||
}
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
// Testing hipMemAttachGlobal Flag
|
||||
HIP_CHECK(hipMallocManaged(&HmmAG, NUM_ELMS * sizeof(float), hipMemAttachGlobal));
|
||||
|
||||
// Initializing HmmAG memory
|
||||
for (int i = 0; i < NUM_ELMS; i++) {
|
||||
HmmAG[i] = INIT_VAL;
|
||||
Ah[i] = 0;
|
||||
}
|
||||
|
||||
int blockSize = 256;
|
||||
int numBlocks = (NUM_ELMS + blockSize - 1) / blockSize;
|
||||
dim3 dimGrid(numBlocks, 1, 1);
|
||||
dim3 dimBlock(blockSize, 1, 1);
|
||||
for (int i = 0; i < NumStrms; i++) {
|
||||
if (MultiDevice >= 2) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
}
|
||||
HIP_CHECK(hipMalloc(&Ad, NUM_ELMS * sizeof(float)));
|
||||
HIP_CHECK(hipMemset(Ad, 0, NUM_ELMS * sizeof(float)));
|
||||
MallcMangdFlgTst<<<dimGrid, dimBlock, 0, *(Stream[i])>>>(NUM_ELMS, HmmAG, Ad);
|
||||
HIP_CHECK(hipStreamSynchronize(*(Stream[i])));
|
||||
// Validating the results
|
||||
HIP_CHECK(hipMemcpy(Ah, Ad, NUM_ELMS * sizeof(float), hipMemcpyDeviceToHost));
|
||||
for (int j = 0; j < NUM_ELMS; ++j) {
|
||||
if (Ah[j] != (INIT_VAL * INIT_VAL)) {
|
||||
DataMismatch++;
|
||||
}
|
||||
}
|
||||
if (DataMismatch != 0) {
|
||||
WARN("Data Mismatch observed when kernel launched on");
|
||||
WARN(" device: " << i);
|
||||
IfTestPassed = false;
|
||||
}
|
||||
DataMismatch = 0;
|
||||
|
||||
HIP_CHECK(hipFree(Ad));
|
||||
}
|
||||
delete[] Ah;
|
||||
HIP_CHECK(hipFree(HmmAG));
|
||||
|
||||
DataMismatch = 0;
|
||||
HIP_CHECK(hipMallocManaged(&HmmAH1, NUM_ELMS * sizeof(float), hipMemAttachHost));
|
||||
HIP_CHECK(hipMallocManaged(&HmmAH2, NUM_ELMS * sizeof(float), hipMemAttachHost));
|
||||
|
||||
// Initializing HmmAH memory
|
||||
for (int i = 0; i < NUM_ELMS; i++) {
|
||||
HmmAH1[i] = INIT_VAL;
|
||||
HmmAH2[i] = 0;
|
||||
}
|
||||
for (int i = 0; i < NumStrms; i++) {
|
||||
if (MultiDevice >= 2) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
}
|
||||
HIP_CHECK(hipMemset(HmmAH2, 0, NUM_ELMS * sizeof(float)));
|
||||
MallcMangdFlgTst<<<dimGrid, dimBlock, 0, *(Stream[i])>>>(NUM_ELMS, HmmAH1, HmmAH2);
|
||||
HIP_CHECK(hipStreamSynchronize(*(Stream[i])));
|
||||
for (int j = 0; j < NUM_ELMS; ++j) {
|
||||
if (HmmAH2[j] != (INIT_VAL * INIT_VAL)) {
|
||||
DataMismatch++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (DataMismatch != 0) {
|
||||
WARN("Data Mismatch observed when kernel launched on");
|
||||
WARN(" device: " << i);
|
||||
IfTestPassed = false;
|
||||
}
|
||||
}
|
||||
|
||||
HIP_CHECK(hipFree(HmmAH1));
|
||||
HIP_CHECK(hipFree(HmmAH2));
|
||||
for (int i = 0; i < NumStrms; ++i) {
|
||||
HIP_CHECK(hipStreamDestroy(*(Stream[i])));
|
||||
}
|
||||
REQUIRE(IfTestPassed);
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
@@ -27,28 +27,81 @@
|
||||
6. Multiple Pointers
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include "hipMallocManagedCommon.hh"
|
||||
#include <hip_test_kernels.hh>
|
||||
#include <hip_test_checkers.hh>
|
||||
#include <atomic>
|
||||
|
||||
const size_t MAX_GPU{256};
|
||||
static size_t N{4*1024*1024};
|
||||
static size_t N{4 * 1024 * 1024};
|
||||
static unsigned blocksPerCU{6};
|
||||
static unsigned threadsPerBlock{256};
|
||||
#define INIT_VAL 123
|
||||
|
||||
|
||||
/*
|
||||
* Kernel function to perform addition operation.
|
||||
*/
|
||||
template <typename T>
|
||||
__global__ void
|
||||
vector_sum(T *Ad1, T *Ad2, size_t NUM_ELMTS) {
|
||||
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
size_t stride = blockDim.x * gridDim.x;
|
||||
template <typename T> __global__ void vector_sum(T* Ad1, T* Ad2, size_t NUM_ELMTS) {
|
||||
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
size_t stride = blockDim.x * gridDim.x;
|
||||
|
||||
for (size_t i = offset; i < NUM_ELMTS; i += stride) {
|
||||
Ad2[i] = Ad1[i] + Ad1[i];
|
||||
}
|
||||
for (size_t i = offset; i < NUM_ELMTS; i += stride) {
|
||||
Ad2[i] = Ad1[i] + Ad1[i];
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Kernel function to perform multiplication
|
||||
*/
|
||||
__global__ void KernelDouble(float* Hmm, float* dPtr, size_t n) {
|
||||
size_t index = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (index < n) {
|
||||
dPtr[index] = 2 * Hmm[index];
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Host function to perform multiplication
|
||||
*/
|
||||
void HostKernelDouble(float* Hmm, float* hPtr, size_t n) {
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
hPtr[i] = 2 * Hmm[i];
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
This testcase verifies the concurrent access of hipMallocManaged Memory on host and device.
|
||||
*/
|
||||
TEST_CASE("Unit_hipMallocManaged_HostDeviceConcurrent") {
|
||||
auto managed = HmmAttrPrint();
|
||||
if (managed != 1) {
|
||||
HipTest::HIP_SKIP_TEST("GPU doesn't support managed memory so skipping test.");
|
||||
return;
|
||||
}
|
||||
|
||||
float *Hmm = nullptr, *hPtr = nullptr, *dPtr = nullptr, *resPtr = nullptr;
|
||||
|
||||
hPtr = reinterpret_cast<float*>(malloc(N * sizeof(float)));
|
||||
resPtr = reinterpret_cast<float*>(malloc(N * sizeof(float)));
|
||||
|
||||
HIP_CHECK(hipMalloc(&dPtr, N * sizeof(float)));
|
||||
HIP_CHECK(hipMallocManaged(&Hmm, N * sizeof(float)));
|
||||
memset(Hmm, 2.0, N * sizeof(float));
|
||||
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
std::thread host_thread(HostKernelDouble, Hmm, hPtr, N);
|
||||
KernelDouble<<<dim3(blocks), dim3(threadsPerBlock), 0, 0>>>(Hmm, dPtr, N);
|
||||
host_thread.join();
|
||||
hipMemcpy(resPtr, dPtr, N * sizeof(float), hipMemcpyDeviceToHost);
|
||||
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
REQUIRE(hPtr[i] == resPtr[i]);
|
||||
}
|
||||
|
||||
free(hPtr);
|
||||
HIP_CHECK(hipFree(dPtr));
|
||||
HIP_CHECK(hipFree(Hmm));
|
||||
}
|
||||
|
||||
// The following Test case tests the following scenario:
|
||||
@@ -57,7 +110,13 @@ vector_sum(T *Ad1, T *Ad2, size_t NUM_ELMTS) {
|
||||
// kernel is launched on acessed chunk of hmm memory
|
||||
// and checks if there are any inconsistencies or access issues
|
||||
TEST_CASE("Unit_hipMallocManaged_MultiChunkSingleDevice") {
|
||||
std::atomic<int> DataMismatch{0};
|
||||
auto managed = HmmAttrPrint();
|
||||
if (managed != 1) {
|
||||
HipTest::HIP_SKIP_TEST("GPU doesn't support managed memory so skipping test.");
|
||||
return;
|
||||
}
|
||||
|
||||
std::atomic<int> DataMismatch{0};
|
||||
constexpr int Chunks = 4;
|
||||
int Counter = 0;
|
||||
int NUM_ELMS = (1024 * 1024);
|
||||
@@ -74,16 +133,14 @@ std::atomic<int> DataMismatch{0};
|
||||
Hmm[Counter] = (INIT_VAL + i);
|
||||
}
|
||||
}
|
||||
const unsigned threadsPerBlock = 256;
|
||||
const unsigned blocks = (NUM_ELMS + 255)/256;
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
for (int k = 0; k < Chunks; ++k) {
|
||||
vector_sum<float> <<<blocks, threadsPerBlock, 0, stream[k]>>>
|
||||
(&Hmm[k * NUM_ELMS], Ad[k], NUM_ELMS);
|
||||
vector_sum<float>
|
||||
<<<blocks, threadsPerBlock, 0, stream[k]>>>(&Hmm[k * NUM_ELMS], Ad[k], NUM_ELMS);
|
||||
}
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
for (int m = 0; m < Chunks; ++m) {
|
||||
HIP_CHECK(hipMemcpy(Ah, Ad[m], NUM_ELMS * sizeof(float),
|
||||
hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(Ah, Ad[m], NUM_ELMS * sizeof(float), hipMemcpyDeviceToHost));
|
||||
for (int n = 0; n < NUM_ELMS; ++n) {
|
||||
if (Ah[n] != ((INIT_VAL + m) * 2)) {
|
||||
DataMismatch++;
|
||||
@@ -96,7 +153,7 @@ std::atomic<int> DataMismatch{0};
|
||||
HIP_CHECK(hipStreamDestroy(stream[i]));
|
||||
}
|
||||
HIP_CHECK(hipFree(Hmm));
|
||||
delete [] Ah;
|
||||
delete[] Ah;
|
||||
}
|
||||
|
||||
// The following Test case tests the following scenario:
|
||||
@@ -105,10 +162,20 @@ std::atomic<int> DataMismatch{0};
|
||||
// kernel is launched on acessed chunk of hmm memory
|
||||
// and checks if there are any inconsistencies or access issues
|
||||
TEST_CASE("Unit_hipMallocManaged_MultiChunkMultiDevice") {
|
||||
auto managed = HmmAttrPrint();
|
||||
if (managed != 1) {
|
||||
HipTest::HIP_SKIP_TEST("GPU doesn't support managed memory so skipping test.");
|
||||
return;
|
||||
}
|
||||
|
||||
std::atomic<int> DataMismatch{0};
|
||||
int Counter = 0;
|
||||
int NumDevices = 0;
|
||||
HIP_CHECK(hipGetDeviceCount(&NumDevices));
|
||||
if (NumDevices < 2) {
|
||||
HipTest::HIP_SKIP_TEST("Skipping test because more than one device was not found.");
|
||||
return;
|
||||
}
|
||||
unsigned int NUM_ELMS = (1024 * 1024);
|
||||
float *Ad[MAX_GPU], *Hmm = NULL, *Ah = new float[NUM_ELMS];
|
||||
hipStream_t stream[MAX_GPU];
|
||||
@@ -124,17 +191,15 @@ TEST_CASE("Unit_hipMallocManaged_MultiChunkMultiDevice") {
|
||||
Hmm[Counter] = INIT_VAL + i;
|
||||
}
|
||||
}
|
||||
const unsigned threadsPerBlock = 256;
|
||||
const unsigned blocks = (NUM_ELMS + 255)/256;
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
for (int Klaunch = 0; Klaunch < NumDevices; ++Klaunch) {
|
||||
HIP_CHECK(hipSetDevice(Klaunch));
|
||||
vector_sum<float> <<<blocks, threadsPerBlock, 0, stream[Klaunch]>>>
|
||||
(&Hmm[Klaunch * NUM_ELMS], Ad[Klaunch], NUM_ELMS);
|
||||
vector_sum<float><<<blocks, threadsPerBlock, 0, stream[Klaunch]>>>(&Hmm[Klaunch * NUM_ELMS],
|
||||
Ad[Klaunch], NUM_ELMS);
|
||||
}
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
for (int m = 0; m < NumDevices; ++m) {
|
||||
HIP_CHECK(hipMemcpy(Ah, Ad[m], NUM_ELMS * sizeof(float),
|
||||
hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(Ah, Ad[m], NUM_ELMS * sizeof(float), hipMemcpyDeviceToHost));
|
||||
for (size_t n = 0; n < NUM_ELMS; ++n) {
|
||||
if (Ah[n] != ((INIT_VAL + m) * 2)) {
|
||||
DataMismatch++;
|
||||
@@ -148,32 +213,38 @@ TEST_CASE("Unit_hipMallocManaged_MultiChunkMultiDevice") {
|
||||
HIP_CHECK(hipStreamDestroy(stream[i]));
|
||||
}
|
||||
HIP_CHECK(hipFree(Hmm));
|
||||
delete [] Ah;
|
||||
delete[] Ah;
|
||||
}
|
||||
|
||||
// The following tests oversubscription hipMallocManaged() api
|
||||
// Currently disabled.
|
||||
TEST_CASE("Unit_hipMallocManaged_OverSubscription") {
|
||||
void *A = nullptr;
|
||||
auto managed = HmmAttrPrint();
|
||||
if (managed != 1) {
|
||||
HipTest::HIP_SKIP_TEST("GPU doesn't support managed memory so skipping test.");
|
||||
return;
|
||||
}
|
||||
|
||||
void* A = nullptr;
|
||||
size_t total = 0, free = 0;
|
||||
HIP_CHECK(hipMemGetInfo(&free, &total));
|
||||
// ToDo: In case of HMM, memory over-subscription is allowed. Hence, relook
|
||||
// into how out of memory can be tested.
|
||||
// Demanding more mem size than available
|
||||
#if HT_AMD
|
||||
REQUIRE(hipMallocManaged(&A, (free +1), hipMemAttachGlobal) != hipSuccess);
|
||||
HIP_CHECK_ERROR(hipMallocManaged(&A, (free + 1), hipMemAttachGlobal), hipErrorOutOfMemory);
|
||||
#endif
|
||||
}
|
||||
|
||||
// The following test does negative testing of hipMallocManaged() api
|
||||
// by passing invalid values and check if the behavior is as expected
|
||||
TEST_CASE("Unit_hipMallocManaged_Negative") {
|
||||
void *A;
|
||||
void* A;
|
||||
size_t total = 0, free = 0;
|
||||
HIP_CHECK(hipMemGetInfo(&free, &total));
|
||||
|
||||
SECTION("Nullptr to devPtr") {
|
||||
REQUIRE(hipMallocManaged(NULL, 1024, hipMemAttachGlobal) != hipSuccess);
|
||||
HIP_CHECK_ERROR(hipMallocManaged(NULL, 1024, hipMemAttachGlobal), hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
// cuda api doc says : If size is 0, cudaMallocManaged returns
|
||||
@@ -184,14 +255,14 @@ TEST_CASE("Unit_hipMallocManaged_Negative") {
|
||||
// reset ptr while returning success (to accommodate cuda 11.2 api behavior).
|
||||
SECTION("size 0 with flag hipMemAttachGlobal") {
|
||||
#if HT_AMD
|
||||
REQUIRE(hipMallocManaged(&A, 0, hipMemAttachGlobal) != hipSuccess);
|
||||
HIP_CHECK_ERROR(hipMallocManaged(&A, 0, hipMemAttachGlobal), hipErrorInvalidValue);
|
||||
#else
|
||||
REQUIRE(hipMallocManaged(&A, 0, hipMemAttachHost) == hipSuccess);
|
||||
HIP_CHECK(hipMallocManaged(&A, 0, hipMemAttachGlobal));
|
||||
#endif
|
||||
}
|
||||
|
||||
SECTION("devptr is nullptr with flag hipMemAttachHost") {
|
||||
REQUIRE(hipMallocManaged(NULL, 1024, hipMemAttachHost) != hipSuccess);
|
||||
HIP_CHECK_ERROR(hipMallocManaged(NULL, 1024, hipMemAttachHost), hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
// cuda api doc says : If size is 0, cudaMallocManaged returns
|
||||
@@ -202,32 +273,47 @@ TEST_CASE("Unit_hipMallocManaged_Negative") {
|
||||
// reset ptr while returning success (to accommodate cuda 11.2 api behavior).
|
||||
SECTION("size 0 with flag hipMemAttachHost") {
|
||||
#if HT_AMD
|
||||
REQUIRE(hipMallocManaged(&A, 0, hipMemAttachHost) != hipSuccess);
|
||||
HIP_CHECK_ERROR(hipMallocManaged(&A, 0, hipMemAttachHost), hipErrorInvalidValue);
|
||||
#else
|
||||
REQUIRE(hipMallocManaged(&A, 0, hipMemAttachHost) == hipSuccess);
|
||||
HIP_CHECK(hipMallocManaged(&A, 0, hipMemAttachHost));
|
||||
#endif
|
||||
}
|
||||
|
||||
SECTION("nullptr to devptr, size 0 and flag 0") {
|
||||
REQUIRE(hipMallocManaged(NULL, 0, 0) != hipSuccess);
|
||||
HIP_CHECK_ERROR(hipMallocManaged(NULL, 0, 0), hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("Numeric value to flag parameter") {
|
||||
REQUIRE(hipMallocManaged(&A, 1024, 145) != hipSuccess);
|
||||
SECTION("Invalid flag parameter") {
|
||||
HIP_CHECK_ERROR(hipMallocManaged(&A, 1024, 145), hipErrorInvalidValue);
|
||||
}
|
||||
SECTION("Invalid flag parameter- flag set to 0") {
|
||||
HIP_CHECK_ERROR(hipMallocManaged(&A, 1024, 0), hipErrorInvalidValue);
|
||||
}
|
||||
SECTION("Invalid flag parameter- Both flags set") {
|
||||
HIP_CHECK_ERROR(hipMallocManaged(&A, 1024, hipMemAttachGlobal | hipMemAttachHost),
|
||||
hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("Negative value to size") {
|
||||
REQUIRE(hipMallocManaged(&A, -10, hipMemAttachGlobal));
|
||||
SECTION("Max value to size") {
|
||||
HIP_CHECK_ERROR(hipMallocManaged(&A, std::numeric_limits<size_t>::max(), hipMemAttachGlobal),
|
||||
hipErrorOutOfMemory);
|
||||
}
|
||||
}
|
||||
|
||||
// Allocate two pointers using hipMallocManaged(), initialize,
|
||||
// then launch kernel using these pointers directly and
|
||||
// later validate the content without using any Memcpy.
|
||||
TEMPLATE_TEST_CASE("Unit_hipMallocManaged_TwoPointers", "",
|
||||
int, float, double) {
|
||||
TEMPLATE_TEST_CASE("Unit_hipMallocManaged_TwoPointers", "", int, float, double) {
|
||||
auto managed = HmmAttrPrint();
|
||||
if (managed != 1) {
|
||||
HipTest::HIP_SKIP_TEST("GPU doesn't support managed memory so skipping test.");
|
||||
return;
|
||||
}
|
||||
|
||||
int NumDevices = 0;
|
||||
HIP_CHECK(hipGetDeviceCount(&NumDevices));
|
||||
TestType *Hmm1 = nullptr, *Hmm2 = nullptr;
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
|
||||
for (int i = 0; i < NumDevices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
@@ -238,10 +324,8 @@ TEMPLATE_TEST_CASE("Unit_hipMallocManaged_TwoPointers", "",
|
||||
Hmm1[m] = m;
|
||||
Hmm2[m] = 0;
|
||||
}
|
||||
const unsigned threadsPerBlock = 256;
|
||||
const unsigned blocks = (N + 255)/256;
|
||||
// Kernel launch
|
||||
vector_sum <<<blocks, threadsPerBlock>>> (Hmm1, Hmm2, N);
|
||||
vector_sum<<<blocks, threadsPerBlock>>>(Hmm1, Hmm2, N);
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
for (size_t v = 0; v < N; ++v) {
|
||||
if (Hmm2[v] != static_cast<TestType>(v + v)) {
|
||||
@@ -259,18 +343,28 @@ TEMPLATE_TEST_CASE("Unit_hipMallocManaged_TwoPointers", "",
|
||||
// to all other devices. This include verification and Device two Device
|
||||
// transfers and kernel launch o discover if there any access issues.
|
||||
|
||||
TEMPLATE_TEST_CASE("Unit_hipMallocManaged_DeviceContextChange", "",
|
||||
unsigned char, int, float, double) {
|
||||
TEMPLATE_TEST_CASE("Unit_hipMallocManaged_DeviceContextChange", "", unsigned char, int, float,
|
||||
double) {
|
||||
auto managed = HmmAttrPrint();
|
||||
if (managed != 1) {
|
||||
HipTest::HIP_SKIP_TEST("GPU doesn't support managed memory so skipping test.");
|
||||
return;
|
||||
}
|
||||
|
||||
std::atomic<unsigned int> DataMismatch;
|
||||
TestType *Ah1 = new TestType[N], *Ah2 = new TestType[N], *Ad = nullptr,
|
||||
*Hmm = nullptr;
|
||||
TestType *Ah1 = new TestType[N], *Ah2 = new TestType[N], *Ad = nullptr, *Hmm = nullptr;
|
||||
int NumDevices = 0;
|
||||
HIP_CHECK(hipGetDeviceCount(&NumDevices));
|
||||
if (NumDevices < 2) {
|
||||
HipTest::HIP_SKIP_TEST("Skipping test because more than one device was not found.");
|
||||
return;
|
||||
}
|
||||
|
||||
for (size_t i =0; i < N; ++i) {
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
Ah1[i] = INIT_VAL;
|
||||
Ah2[i] = 0;
|
||||
}
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
for (int Oloop = 0; Oloop < NumDevices; ++Oloop) {
|
||||
DataMismatch = 0;
|
||||
HIP_CHECK(hipSetDevice(Oloop));
|
||||
@@ -279,8 +373,7 @@ TEMPLATE_TEST_CASE("Unit_hipMallocManaged_DeviceContextChange", "",
|
||||
HIP_CHECK(hipSetDevice(Iloop));
|
||||
HIP_CHECK(hipMalloc(&Ad, N * sizeof(TestType)));
|
||||
// Copy data from host to hipMallocMananged memory and verify
|
||||
HIP_CHECK(hipMemcpy(Hmm, Ah1, N * sizeof(TestType),
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipMemcpy(Hmm, Ah1, N * sizeof(TestType), hipMemcpyHostToDevice));
|
||||
for (size_t v = 0; v < N; ++v) {
|
||||
if (Hmm[v] != INIT_VAL) {
|
||||
DataMismatch++;
|
||||
@@ -289,10 +382,8 @@ TEMPLATE_TEST_CASE("Unit_hipMallocManaged_DeviceContextChange", "",
|
||||
REQUIRE(DataMismatch.load() == 0);
|
||||
|
||||
// Executing D2D transfer with hipMallocManaged memory and verify
|
||||
HIP_CHECK(hipMemcpy(Ad, Hmm, N * sizeof(TestType),
|
||||
hipMemcpyDeviceToDevice));
|
||||
HIP_CHECK(hipMemcpy(Ah2, Ad, N * sizeof(TestType),
|
||||
hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(Ad, Hmm, N * sizeof(TestType), hipMemcpyDeviceToDevice));
|
||||
HIP_CHECK(hipMemcpy(Ah2, Ad, N * sizeof(TestType), hipMemcpyDeviceToHost));
|
||||
for (size_t k = 0; k < N; ++k) {
|
||||
if (Ah2[k] != INIT_VAL) {
|
||||
DataMismatch++;
|
||||
@@ -300,14 +391,11 @@ TEMPLATE_TEST_CASE("Unit_hipMallocManaged_DeviceContextChange", "",
|
||||
}
|
||||
REQUIRE(DataMismatch.load() == 0);
|
||||
HIP_CHECK(hipMemset(Ad, 0, N * sizeof(TestType)));
|
||||
const unsigned threadsPerBlock = 256;
|
||||
const unsigned blocks = (N + 255)/256;
|
||||
// Launching the kernel to check if there is any access issue with
|
||||
// hipMallocManaged memory and local device's memory
|
||||
vector_sum <<<blocks, threadsPerBlock>>> (Hmm, Ad, N);
|
||||
hipDeviceSynchronize();
|
||||
HIP_CHECK(hipMemcpy(Ah2, Ad, N * sizeof(TestType),
|
||||
hipMemcpyDeviceToHost));
|
||||
vector_sum<<<blocks, threadsPerBlock>>>(Hmm, Ad, N);
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
HIP_CHECK(hipMemcpy(Ah2, Ad, N * sizeof(TestType), hipMemcpyDeviceToHost));
|
||||
for (size_t m = 0; m < N; ++m) {
|
||||
if (Ah2[m] != 246) {
|
||||
DataMismatch++;
|
||||
|
||||
@@ -20,25 +20,25 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#include <hip_test_common.hh>
|
||||
#include "hipMallocManagedCommon.hh"
|
||||
#include <atomic>
|
||||
|
||||
|
||||
// Kernel functions
|
||||
__global__ void HmmMultiThread(int n, float *x, float *y) {
|
||||
__global__ void HmmMultiThread(int n, float* x, float* y) {
|
||||
int index = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int stride = blockDim.x * gridDim.x;
|
||||
for (int i = index; i < n; i += stride)
|
||||
y[i] = x[i] * x[i];
|
||||
for (int i = index; i < n; i += stride) y[i] = x[i] * x[i];
|
||||
}
|
||||
|
||||
__global__ void KrnlWth2MemTypes(int *Hmm, int *Dptr, size_t n) {
|
||||
__global__ void KrnlWth2MemTypes(int* Hmm, int* Dptr, size_t n) {
|
||||
size_t index = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
for (size_t i = index; i < n; i++) {
|
||||
Hmm[i] = Dptr[i] + 10;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void KernelMul_MngdMem123(int *Hmm, int *Dptr, size_t n) {
|
||||
__global__ void KernelMul_MngdMem123(int* Hmm, int* Dptr, size_t n) {
|
||||
size_t index = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
size_t stride = blockDim.x * gridDim.x;
|
||||
for (size_t i = index; i < n; i += stride) {
|
||||
@@ -47,32 +47,27 @@ __global__ void KernelMul_MngdMem123(int *Hmm, int *Dptr, size_t n) {
|
||||
}
|
||||
|
||||
|
||||
|
||||
// The following variable is used to determine the failure of test case
|
||||
static bool IfTestPassed = true;
|
||||
static bool IfTestPassed = true;
|
||||
|
||||
static void LaunchKrnl(int *Hmm1, size_t NumElms, int InitVal, int GpuOrdnl,
|
||||
int AdviseFlg) {
|
||||
int *Hmm2 = NULL;
|
||||
static void LaunchKrnl(int* Hmm1, size_t NumElms, int InitVal, int GpuOrdnl, int AdviseFlg) {
|
||||
int* Hmm2 = NULL;
|
||||
hipStream_t strm;
|
||||
HIPCHECK(hipSetDevice(GpuOrdnl));
|
||||
HIPCHECK(hipStreamCreate(&strm));
|
||||
if (AdviseFlg == 0) {
|
||||
HIPCHECK(hipMemAdvise(Hmm1 , NumElms * sizeof(int),
|
||||
hipMemAdviseSetReadMostly, GpuOrdnl));
|
||||
HIPCHECK(hipMemAdvise(Hmm1, NumElms * sizeof(int), hipMemAdviseSetReadMostly, GpuOrdnl));
|
||||
} else if (AdviseFlg == 1) {
|
||||
HIPCHECK(hipMemAdvise(Hmm1 , NumElms * sizeof(int),
|
||||
hipMemAdviseSetPreferredLocation, GpuOrdnl));
|
||||
HIPCHECK(hipMemAdvise(Hmm1, NumElms * sizeof(int), hipMemAdviseSetPreferredLocation, GpuOrdnl));
|
||||
} else if (AdviseFlg == 2) {
|
||||
HIPCHECK(hipMemAdvise(Hmm1 , NumElms * sizeof(int),
|
||||
hipMemAdviseSetAccessedBy, GpuOrdnl));
|
||||
HIPCHECK(hipMemAdvise(Hmm1, NumElms * sizeof(int), hipMemAdviseSetAccessedBy, GpuOrdnl));
|
||||
} else if (AdviseFlg == 3) {
|
||||
HIPCHECK(hipMemPrefetchAsync(Hmm1, NumElms * sizeof(int), GpuOrdnl, strm));
|
||||
HIPCHECK(hipStreamSynchronize(strm));
|
||||
}
|
||||
HIPCHECK(hipMallocManaged(&Hmm2, (sizeof(int) * NumElms)));
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
KrnlWth2MemTypes<<<((NumElms + 63)/64), 64, 0, strm>>>(Hmm2, Hmm1, NumElms);
|
||||
KrnlWth2MemTypes<<<((NumElms + 63) / 64), 64, 0, strm>>>(Hmm2, Hmm1, NumElms);
|
||||
HIPCHECK(hipStreamSynchronize(strm));
|
||||
}
|
||||
// Verifying the result
|
||||
@@ -88,7 +83,7 @@ static void LaunchKrnl(int *Hmm1, size_t NumElms, int InitVal, int GpuOrdnl,
|
||||
}
|
||||
}
|
||||
|
||||
static void LaunchKrnl2(int *Hmm, size_t NumElms, int InitVal, int HmmMem) {
|
||||
static void LaunchKrnl2(int* Hmm, size_t NumElms, int InitVal, int HmmMem) {
|
||||
int *ptr = nullptr, blockSize = 64, *HstPtr = nullptr;
|
||||
hipStream_t strm;
|
||||
HIPCHECK(hipStreamCreate(&strm));
|
||||
@@ -99,7 +94,7 @@ static void LaunchKrnl2(int *Hmm, size_t NumElms, int InitVal, int HmmMem) {
|
||||
HIPCHECK(hipMallocManaged(&ptr, (sizeof(int) * NumElms)));
|
||||
}
|
||||
dim3 dimBlock(blockSize, 1, 1);
|
||||
dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1);
|
||||
dim3 dimGrid((NumElms + blockSize - 1) / blockSize, 1, 1);
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
KrnlWth2MemTypes<<<dimGrid, dimBlock, 0, strm>>>(ptr, Hmm, NumElms);
|
||||
}
|
||||
@@ -107,8 +102,7 @@ static void LaunchKrnl2(int *Hmm, size_t NumElms, int InitVal, int HmmMem) {
|
||||
// Verifying the result
|
||||
int DataMismatch = 0;
|
||||
if (HmmMem == 0) {
|
||||
HIPCHECK(hipMemcpy(HstPtr, ptr, (sizeof(int) * NumElms),
|
||||
hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(HstPtr, ptr, (sizeof(int) * NumElms), hipMemcpyDeviceToHost));
|
||||
for (size_t i = 0; i < NumElms; ++i) {
|
||||
if (HstPtr[i] != (InitVal + 10)) {
|
||||
DataMismatch++;
|
||||
@@ -127,13 +121,13 @@ static void LaunchKrnl2(int *Hmm, size_t NumElms, int InitVal, int HmmMem) {
|
||||
}
|
||||
}
|
||||
|
||||
static void LaunchKrnl3(int *Dptr, size_t NumElms, int InitVal) {
|
||||
static void LaunchKrnl3(int* Dptr, size_t NumElms, int InitVal) {
|
||||
int *Hmm = NULL, blockSize = 64;
|
||||
hipStream_t strm;
|
||||
HIPCHECK(hipStreamCreate(&strm));
|
||||
HIPCHECK(hipMallocManaged(&Hmm, (sizeof(int) * NumElms)));
|
||||
dim3 dimBlock(blockSize, 1, 1);
|
||||
dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1);
|
||||
dim3 dimGrid((NumElms + blockSize - 1) / blockSize, 1, 1);
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
KrnlWth2MemTypes<<<dimGrid, dimBlock, 0, strm>>>(Hmm, Dptr, NumElms);
|
||||
}
|
||||
@@ -152,14 +146,13 @@ static void LaunchKrnl3(int *Dptr, size_t NumElms, int InitVal) {
|
||||
}
|
||||
|
||||
|
||||
static void LaunchKrnl5(int *Hmm1, size_t NumElms, int InitVal,
|
||||
int KerneltoLaunch) {
|
||||
static void LaunchKrnl5(int* Hmm1, size_t NumElms, int InitVal, int KerneltoLaunch) {
|
||||
int *Hmm2 = NULL, blockSize = 64;
|
||||
hipStream_t strm;
|
||||
HIPCHECK(hipStreamCreate(&strm));
|
||||
HIPCHECK(hipMallocManaged(&Hmm2, (sizeof(int) * NumElms)));
|
||||
dim3 dimBlock(blockSize, 1, 1);
|
||||
dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1);
|
||||
dim3 dimGrid((NumElms + blockSize - 1) / blockSize, 1, 1);
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
if (KerneltoLaunch == 0) {
|
||||
KrnlWth2MemTypes<<<dimGrid, dimBlock, 0, strm>>>(Hmm2, Hmm1, NumElms);
|
||||
@@ -200,8 +193,7 @@ static void TestFlagParamGlobal(int dev) {
|
||||
HIPCHECK(hipSetDevice(dev));
|
||||
HIPCHECK(hipStreamCreate(&strm));
|
||||
// Testing hipMemAttachGlobal Flag
|
||||
HIPCHECK(hipMallocManaged(&HmmAG, NUM_ELMS * sizeof(float),
|
||||
hipMemAttachGlobal));
|
||||
HIPCHECK(hipMallocManaged(&HmmAG, NUM_ELMS * sizeof(float), hipMemAttachGlobal));
|
||||
|
||||
// Initializing HmmAG memory
|
||||
for (int i = 0; i < NUM_ELMS; i++) {
|
||||
@@ -246,10 +238,8 @@ static void TestFlagParamHost(int dev) {
|
||||
hipStream_t strm;
|
||||
HIPCHECK(hipSetDevice(dev));
|
||||
HIPCHECK(hipStreamCreate(&strm));
|
||||
HIPCHECK(hipMallocManaged(&HmmAH1, NUM_ELMS * sizeof(float),
|
||||
hipMemAttachHost));
|
||||
HIPCHECK(hipMallocManaged(&HmmAH2, NUM_ELMS * sizeof(float),
|
||||
hipMemAttachHost));
|
||||
HIPCHECK(hipMallocManaged(&HmmAH1, NUM_ELMS * sizeof(float), hipMemAttachHost));
|
||||
HIPCHECK(hipMallocManaged(&HmmAH2, NUM_ELMS * sizeof(float), hipMemAttachHost));
|
||||
// Initializing HmmAH memory
|
||||
for (int i = 0; i < NUM_ELMS; i++) {
|
||||
HmmAH1[i] = INIT_VAL;
|
||||
@@ -294,77 +284,54 @@ static void AllocateHmmMemory(int flag, int device) {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static int HmmAttrPrint() {
|
||||
int managed = 0;
|
||||
INFO("The following are the attribute values related to HMM for"
|
||||
" device 0:\n");
|
||||
HIP_CHECK(hipDeviceGetAttribute(&managed,
|
||||
hipDeviceAttributeDirectManagedMemAccessFromHost, 0));
|
||||
INFO("hipDeviceAttributeDirectManagedMemAccessFromHost: " << managed);
|
||||
HIP_CHECK(hipDeviceGetAttribute(&managed,
|
||||
hipDeviceAttributeConcurrentManagedAccess, 0));
|
||||
INFO("hipDeviceAttributeConcurrentManagedAccess: " << managed);
|
||||
HIP_CHECK(hipDeviceGetAttribute(&managed,
|
||||
hipDeviceAttributePageableMemoryAccess, 0));
|
||||
INFO("hipDeviceAttributePageableMemoryAccess: " << managed);
|
||||
HIP_CHECK(hipDeviceGetAttribute(&managed,
|
||||
hipDeviceAttributePageableMemoryAccessUsesHostPageTables, 0));
|
||||
INFO("hipDeviceAttributePageableMemoryAccessUsesHostPageTables:"
|
||||
<< managed);
|
||||
|
||||
HIP_CHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeManagedMemory,
|
||||
0));
|
||||
INFO("hipDeviceAttributeManagedMemory: " << managed);
|
||||
return managed;
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipMallocManaged_MultiThread") {
|
||||
auto managed = HmmAttrPrint();
|
||||
if (managed != 1) {
|
||||
HipTest::HIP_SKIP_TEST("GPU doesn't support managed memory so skipping test.");
|
||||
return;
|
||||
}
|
||||
|
||||
IfTestPassed = true;
|
||||
int NumDevs = 0, managed = 0, ATTACH_GLOBAL = 0, ATTACH_HOST = 1;
|
||||
int NumDevs = 0, ATTACH_GLOBAL = 0, ATTACH_HOST = 1;
|
||||
int ITERATIONS = 10;
|
||||
managed = HmmAttrPrint();
|
||||
if (managed) {
|
||||
HIP_CHECK(hipGetDeviceCount(&NumDevs));
|
||||
std::vector<std::thread> T1;
|
||||
std::vector<std::thread> T2;
|
||||
for (int i = 0; i < NumDevs; ++i) {
|
||||
for (int j = 0; j < ITERATIONS; ++j) {
|
||||
T1.push_back(std::thread(TestFlagParamGlobal, i));
|
||||
T2.push_back(std::thread(AllocateHmmMemory, ATTACH_GLOBAL, i));
|
||||
}
|
||||
for (auto &t1 : T1) {
|
||||
if (t1.joinable()) {
|
||||
t1.join();
|
||||
}
|
||||
}
|
||||
for (auto &t2 : T2) {
|
||||
if (t2.joinable()) {
|
||||
t2.join();
|
||||
}
|
||||
|
||||
|
||||
HIP_CHECK(hipGetDeviceCount(&NumDevs));
|
||||
std::vector<std::thread> T1;
|
||||
std::vector<std::thread> T2;
|
||||
for (int i = 0; i < NumDevs; ++i) {
|
||||
for (int j = 0; j < ITERATIONS; ++j) {
|
||||
T1.push_back(std::thread(TestFlagParamGlobal, i));
|
||||
T2.push_back(std::thread(AllocateHmmMemory, ATTACH_GLOBAL, i));
|
||||
}
|
||||
for (auto& t1 : T1) {
|
||||
if (t1.joinable()) {
|
||||
t1.join();
|
||||
}
|
||||
}
|
||||
T1.clear();
|
||||
T2.clear();
|
||||
for (int i = 0; i < NumDevs; ++i) {
|
||||
for (int j = 0; j < ITERATIONS; ++j) {
|
||||
T1.push_back(std::thread(TestFlagParamHost, i));
|
||||
T2.push_back(std::thread(AllocateHmmMemory, ATTACH_HOST, i));
|
||||
}
|
||||
for (auto &t1 : T1) {
|
||||
if (t1.joinable()) {
|
||||
t1.join();
|
||||
}
|
||||
}
|
||||
for (auto &t2 : T2) {
|
||||
if (t2.joinable()) {
|
||||
t2.join();
|
||||
}
|
||||
for (auto& t2 : T2) {
|
||||
if (t2.joinable()) {
|
||||
t2.join();
|
||||
}
|
||||
}
|
||||
}
|
||||
T1.clear();
|
||||
T2.clear();
|
||||
for (int i = 0; i < NumDevs; ++i) {
|
||||
for (int j = 0; j < ITERATIONS; ++j) {
|
||||
T1.push_back(std::thread(TestFlagParamHost, i));
|
||||
T2.push_back(std::thread(AllocateHmmMemory, ATTACH_HOST, i));
|
||||
}
|
||||
for (auto& t1 : T1) {
|
||||
if (t1.joinable()) {
|
||||
t1.join();
|
||||
}
|
||||
}
|
||||
for (auto& t2 : T2) {
|
||||
if (t2.joinable()) {
|
||||
t2.join();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
SUCCEED("GPU 0 doesn't support hipDeviceAttributeManagedMemory"
|
||||
"attribute. Hence skipping the testing with Pass result.\n");
|
||||
}
|
||||
REQUIRE(IfTestPassed);
|
||||
}
|
||||
@@ -372,175 +339,169 @@ TEST_CASE("Unit_hipMallocManaged_MultiThread") {
|
||||
// The following test checks what happens when same Hmm memory is used to
|
||||
// launch multiple threads over multiple gpus
|
||||
TEST_CASE("Unit_hipMallocManaged_MGpuMThread") {
|
||||
auto managed = HmmAttrPrint();
|
||||
if (managed != 1) {
|
||||
HipTest::HIP_SKIP_TEST("GPU doesn't support managed memory so skipping test.");
|
||||
return;
|
||||
}
|
||||
|
||||
IfTestPassed = true;
|
||||
int Ngpus = 0;
|
||||
HIP_CHECK(hipGetDeviceCount(&Ngpus));
|
||||
if (Ngpus < 2) {
|
||||
WARN("This test needs atleast 2 or more gpus, but the system");
|
||||
WARN(" has only " << Ngpus);
|
||||
WARN(" gpus. Hence skipping the test.");
|
||||
SUCCEED("\n");
|
||||
HipTest::HIP_SKIP_TEST("Skipping test because more than one device was not found.");
|
||||
return;
|
||||
}
|
||||
int managed = HmmAttrPrint();
|
||||
if (managed == 1) {
|
||||
int InitVal = 123, *Hmm1 = NULL, NumElms = 4096*4;
|
||||
HIP_CHECK(hipMallocManaged(&Hmm1, (NumElms * sizeof(int))));
|
||||
for (int i = 0; i < NumElms; ++i) {
|
||||
Hmm1[i] = InitVal;
|
||||
}
|
||||
|
||||
std::vector<std::thread> Thrds;
|
||||
// AdviseFlg=0 for ReadMostly to be applied
|
||||
// AdviseFlg=1 for PreferredLocation to be applied
|
||||
// AdviseFlg=2 for AccessedBy to be applied
|
||||
// AdviseFlg=3 to prefetch the memory to particular gpu
|
||||
for (int AdviseFlg = 0; AdviseFlg < 4; ++AdviseFlg) {
|
||||
for (int i = 0; i < Ngpus; ++i) {
|
||||
Thrds.push_back(std::thread(LaunchKrnl, Hmm1, NumElms, InitVal, i,
|
||||
AdviseFlg));
|
||||
}
|
||||
for (auto &thr : Thrds) {
|
||||
if (thr.joinable()) {
|
||||
thr.join();
|
||||
}
|
||||
int InitVal = 123, *Hmm1 = NULL, NumElms = 4096 * 4;
|
||||
HIP_CHECK(hipMallocManaged(&Hmm1, (NumElms * sizeof(int))));
|
||||
for (int i = 0; i < NumElms; ++i) {
|
||||
Hmm1[i] = InitVal;
|
||||
}
|
||||
|
||||
std::vector<std::thread> Thrds;
|
||||
// AdviseFlg=0 for ReadMostly to be applied
|
||||
// AdviseFlg=1 for PreferredLocation to be applied
|
||||
// AdviseFlg=2 for AccessedBy to be applied
|
||||
// AdviseFlg=3 to prefetch the memory to particular gpu
|
||||
for (int AdviseFlg = 0; AdviseFlg < 4; ++AdviseFlg) {
|
||||
for (int i = 0; i < Ngpus; ++i) {
|
||||
Thrds.push_back(std::thread(LaunchKrnl, Hmm1, NumElms, InitVal, i, AdviseFlg));
|
||||
}
|
||||
for (auto& thr : Thrds) {
|
||||
if (thr.joinable()) {
|
||||
thr.join();
|
||||
}
|
||||
}
|
||||
REQUIRE(IfTestPassed);
|
||||
} else {
|
||||
SUCCEED("GPU 0 doesn't support hipDeviceAttributeManagedMemory "
|
||||
"attribute. Hence skipping the testing with Pass result.\n");
|
||||
}
|
||||
REQUIRE(IfTestPassed);
|
||||
}
|
||||
|
||||
|
||||
// The following test checks what happens when multiple kernels are launched
|
||||
// with same Hmm memory
|
||||
TEST_CASE("Unit_hipMallocManaged_MultiKrnlComnHmm") {
|
||||
IfTestPassed = true;
|
||||
int managed = HmmAttrPrint();
|
||||
if (managed == 1) {
|
||||
int InitVal = 123, *Hmm = NULL, NumElms = 1024*4, TotThrds = 2;
|
||||
int HmmMem2 = 0, *HstPtr = nullptr; // to indicate the thread that
|
||||
// hipMalloc() memory has to be used
|
||||
HstPtr = reinterpret_cast<int*>(new int[NumElms]);
|
||||
HIP_CHECK(hipMalloc(&Hmm, (NumElms * sizeof(int))));
|
||||
for (int i = 0; i < NumElms; ++i) {
|
||||
HstPtr[i] = InitVal;
|
||||
}
|
||||
HIP_CHECK(hipMemcpy(Hmm, HstPtr, (NumElms * sizeof(int)),
|
||||
hipMemcpyHostToDevice));
|
||||
std::vector<std::thread> Thrds;
|
||||
for (int i = 0; i < TotThrds; ++i) {
|
||||
Thrds.push_back(std::thread(LaunchKrnl2, Hmm, NumElms, InitVal, HmmMem2));
|
||||
}
|
||||
|
||||
for (auto &thr : Thrds) {
|
||||
if (thr.joinable()) {
|
||||
thr.join();
|
||||
}
|
||||
}
|
||||
delete[] HstPtr;
|
||||
} else {
|
||||
SUCCEED("GPU 0 doesn't support hipDeviceAttributeManagedMemory "
|
||||
"attribute. Hence skipping the testing with Pass result.\n");
|
||||
auto managed = HmmAttrPrint();
|
||||
if (managed != 1) {
|
||||
HipTest::HIP_SKIP_TEST("GPU doesn't support managed memory so skipping test.");
|
||||
return;
|
||||
}
|
||||
|
||||
IfTestPassed = true;
|
||||
|
||||
int InitVal = 123, *Hmm = NULL, NumElms = 1024 * 4, TotThrds = 2;
|
||||
int HmmMem2 = 0, *HstPtr = nullptr; // to indicate the thread that
|
||||
// hipMalloc() memory has to be used
|
||||
HstPtr = reinterpret_cast<int*>(new int[NumElms]);
|
||||
HIP_CHECK(hipMalloc(&Hmm, (NumElms * sizeof(int))));
|
||||
for (int i = 0; i < NumElms; ++i) {
|
||||
HstPtr[i] = InitVal;
|
||||
}
|
||||
HIP_CHECK(hipMemcpy(Hmm, HstPtr, (NumElms * sizeof(int)), hipMemcpyHostToDevice));
|
||||
std::vector<std::thread> Thrds;
|
||||
for (int i = 0; i < TotThrds; ++i) {
|
||||
Thrds.push_back(std::thread(LaunchKrnl2, Hmm, NumElms, InitVal, HmmMem2));
|
||||
}
|
||||
|
||||
for (auto& thr : Thrds) {
|
||||
if (thr.joinable()) {
|
||||
thr.join();
|
||||
}
|
||||
}
|
||||
delete[] HstPtr;
|
||||
}
|
||||
|
||||
|
||||
// The following test checks what happens when multiple kernels are launched
|
||||
// with same hipMalloc() memory
|
||||
TEST_CASE("Unit_hipMallocManaged_MultiKrnlComnMalloc") {
|
||||
IfTestPassed = true;
|
||||
int managed = HmmAttrPrint();
|
||||
if (managed) {
|
||||
int InitVal = 123, *Dptr = NULL, NumElms = 4096*8, TotThrds = 2;
|
||||
int *HstPtr = reinterpret_cast<int*>(new int[NumElms]);
|
||||
HIP_CHECK(hipMalloc(&Dptr, (NumElms * sizeof(int))));
|
||||
for (int i = 0; i < NumElms; ++i) {
|
||||
HstPtr[i] = InitVal;
|
||||
}
|
||||
HIP_CHECK(hipMemcpy(Dptr, HstPtr, (NumElms * sizeof(int)),
|
||||
hipMemcpyHostToDevice));
|
||||
std::vector<std::thread> Thrds;
|
||||
for (int i = 0; i < TotThrds; ++i) {
|
||||
Thrds.push_back(std::thread(LaunchKrnl3, Dptr, NumElms, InitVal));
|
||||
}
|
||||
|
||||
for (auto &thr : Thrds) {
|
||||
if (thr.joinable()) {
|
||||
thr.join();
|
||||
}
|
||||
}
|
||||
delete[] HstPtr;
|
||||
HIP_CHECK(hipFree(Dptr));
|
||||
} else {
|
||||
SUCCEED("GPU 0 doesn't support hipDeviceAttributeManagedMemory "
|
||||
"attribute. Hence skipping the testing with Pass result.\n");
|
||||
auto managed = HmmAttrPrint();
|
||||
if (managed != 1) {
|
||||
HipTest::HIP_SKIP_TEST("GPU doesn't support managed memory so skipping test.");
|
||||
return;
|
||||
}
|
||||
|
||||
IfTestPassed = true;
|
||||
int InitVal = 123, *Dptr = NULL, NumElms = 4096 * 8, TotThrds = 2;
|
||||
int* HstPtr = reinterpret_cast<int*>(new int[NumElms]);
|
||||
HIP_CHECK(hipMalloc(&Dptr, (NumElms * sizeof(int))));
|
||||
for (int i = 0; i < NumElms; ++i) {
|
||||
HstPtr[i] = InitVal;
|
||||
}
|
||||
HIP_CHECK(hipMemcpy(Dptr, HstPtr, (NumElms * sizeof(int)), hipMemcpyHostToDevice));
|
||||
std::vector<std::thread> Thrds;
|
||||
for (int i = 0; i < TotThrds; ++i) {
|
||||
Thrds.push_back(std::thread(LaunchKrnl3, Dptr, NumElms, InitVal));
|
||||
}
|
||||
|
||||
for (auto& thr : Thrds) {
|
||||
if (thr.joinable()) {
|
||||
thr.join();
|
||||
}
|
||||
}
|
||||
delete[] HstPtr;
|
||||
HIP_CHECK(hipFree(Dptr));
|
||||
}
|
||||
|
||||
// The following section tests the scenario wherein multiple threads use their
|
||||
// own stream to launch kernel on common Hmm memory
|
||||
TEST_CASE("Unit_hipMallocManaged_MultiThrdMultiStrm") {
|
||||
IfTestPassed = true;
|
||||
int managed = HmmAttrPrint();
|
||||
if (managed == 1) {
|
||||
int NumElms = 4096*4;
|
||||
int *Hmm1 = NULL, TotlThrds = 4, InitVal = 123;
|
||||
int HmmMem = 1; // to indicate the thread that Hmm memory need to be
|
||||
// used inside it
|
||||
HIP_CHECK(hipMallocManaged(&Hmm1, (NumElms * sizeof(int))));
|
||||
for (int i = 0; i < NumElms; ++i) {
|
||||
Hmm1[i] = InitVal;
|
||||
}
|
||||
std::vector<std::thread> Thrds;
|
||||
for (int i = 0; i < TotlThrds; ++i) {
|
||||
Thrds.push_back(std::thread(LaunchKrnl2, Hmm1, NumElms, InitVal, HmmMem));
|
||||
}
|
||||
auto managed = HmmAttrPrint();
|
||||
if (managed != 1) {
|
||||
HipTest::HIP_SKIP_TEST("GPU doesn't support managed memory so skipping test.");
|
||||
return;
|
||||
}
|
||||
|
||||
for (auto &thr : Thrds) {
|
||||
if (thr.joinable()) {
|
||||
thr.join();
|
||||
}
|
||||
IfTestPassed = true;
|
||||
|
||||
int NumElms = 4096 * 4;
|
||||
int *Hmm1 = NULL, TotlThrds = 4, InitVal = 123;
|
||||
int HmmMem = 1; // to indicate the thread that Hmm memory need to be
|
||||
// used inside it
|
||||
HIP_CHECK(hipMallocManaged(&Hmm1, (NumElms * sizeof(int))));
|
||||
for (int i = 0; i < NumElms; ++i) {
|
||||
Hmm1[i] = InitVal;
|
||||
}
|
||||
std::vector<std::thread> Thrds;
|
||||
for (int i = 0; i < TotlThrds; ++i) {
|
||||
Thrds.push_back(std::thread(LaunchKrnl2, Hmm1, NumElms, InitVal, HmmMem));
|
||||
}
|
||||
|
||||
for (auto& thr : Thrds) {
|
||||
if (thr.joinable()) {
|
||||
thr.join();
|
||||
}
|
||||
} else {
|
||||
SUCCEED("GPU 0 doesn't support hipDeviceAttributeManagedMemory "
|
||||
"attribute. Hence skipping the testing with Pass result.\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
// The following section tests the scenario wherein two threads each use
|
||||
// different kernel but common HMM memory
|
||||
TEST_CASE("Unit_hipMallocManaged_TwoKrnlsComnHmmMem") {
|
||||
IfTestPassed = true;
|
||||
int managed = HmmAttrPrint();
|
||||
if (managed == 1) {
|
||||
int InitVal = 123, *Dptr = NULL, NumElms = 4096*4, TotThrds = 2;
|
||||
int *HstPtr = reinterpret_cast<int*>(new int[NumElms]);
|
||||
HIP_CHECK(hipMalloc(&Dptr, (NumElms * sizeof(int))));
|
||||
for (int i = 0; i < NumElms; ++i) {
|
||||
HstPtr[i] = InitVal;
|
||||
}
|
||||
HIP_CHECK(hipMemcpy(Dptr, HstPtr, (NumElms * sizeof(int)),
|
||||
hipMemcpyHostToDevice));
|
||||
std::vector<std::thread> Thrds;
|
||||
for (int i = 0; i < TotThrds; ++i) {
|
||||
Thrds.push_back(std::thread(LaunchKrnl5, Dptr, NumElms, InitVal, i));
|
||||
}
|
||||
|
||||
for (auto &thr : Thrds) {
|
||||
if (thr.joinable()) {
|
||||
thr.join();
|
||||
}
|
||||
}
|
||||
delete[] HstPtr;
|
||||
HIP_CHECK(hipFree(Dptr));
|
||||
} else {
|
||||
SUCCEED("GPU 0 doesn't support hipDeviceAttributeManagedMemory "
|
||||
"attribute. Hence skipping the testing with Pass result.\n");
|
||||
auto managed = HmmAttrPrint();
|
||||
if (managed != 1) {
|
||||
HipTest::HIP_SKIP_TEST("GPU doesn't support managed memory so skipping test.");
|
||||
return;
|
||||
}
|
||||
|
||||
IfTestPassed = true;
|
||||
int InitVal = 123, *Dptr = NULL, NumElms = 4096 * 4, TotThrds = 2;
|
||||
int* HstPtr = reinterpret_cast<int*>(new int[NumElms]);
|
||||
HIP_CHECK(hipMalloc(&Dptr, (NumElms * sizeof(int))));
|
||||
for (int i = 0; i < NumElms; ++i) {
|
||||
HstPtr[i] = InitVal;
|
||||
}
|
||||
HIP_CHECK(hipMemcpy(Dptr, HstPtr, (NumElms * sizeof(int)), hipMemcpyHostToDevice));
|
||||
std::vector<std::thread> Thrds;
|
||||
for (int i = 0; i < TotThrds; ++i) {
|
||||
Thrds.push_back(std::thread(LaunchKrnl5, Dptr, NumElms, InitVal, i));
|
||||
}
|
||||
|
||||
for (auto& thr : Thrds) {
|
||||
if (thr.joinable()) {
|
||||
thr.join();
|
||||
}
|
||||
}
|
||||
delete[] HstPtr;
|
||||
HIP_CHECK(hipFree(Dptr));
|
||||
}
|
||||
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user