273 lignes
8.4 KiB
C++
273 lignes
8.4 KiB
C++
/*
|
|
Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
/**
|
|
Testcase Scenarios :
|
|
1) Validate Async behavior of hipMemset3DAsync with commands queued
|
|
concurrently from multiple threads.
|
|
2) Validate hipMemset3DAsync behavior when api is queued along with kernel
|
|
function operating on same memory.
|
|
3) Perform regression of hipMemset3D api in loop with device memory allocated
|
|
on different gpus.
|
|
4) Perform regression of hipMemset3DAsync api in loop with device memory
|
|
allocated on different gpus.
|
|
*/
|
|
|
|
#include <hip_test_common.hh>
|
|
|
|
|
|
/*
|
|
* Defines
|
|
*/
|
|
#define MAX_REGRESS_ITERS 2
|
|
#define MAX_THREADS 10
|
|
|
|
/**
|
|
* kernel function sets device memory with value passed
|
|
*/
|
|
static __global__ void func_set_value(hipPitchedPtr devicePitchedPointer, hipExtent extent,
|
|
unsigned char val) {
|
|
// Index Calculation
|
|
size_t x = threadIdx.x + blockDim.x * blockIdx.x;
|
|
size_t y = threadIdx.y + blockDim.y * blockIdx.y;
|
|
size_t z = threadIdx.z + blockDim.z * blockIdx.z;
|
|
|
|
// Get attributes from device pitched pointer
|
|
char* devicePointer = reinterpret_cast<char*>(devicePitchedPointer.ptr);
|
|
size_t pitch = devicePitchedPointer.pitch;
|
|
size_t slicePitch = pitch * extent.height;
|
|
|
|
// Loop over the device buffer
|
|
if (z < extent.depth) {
|
|
char* current_slice_index = devicePointer + z * slicePitch;
|
|
if (y < extent.height) {
|
|
// Get data array containing all elements from the current row
|
|
char* current_row = reinterpret_cast<char*>(current_slice_index + y * pitch);
|
|
if (x < extent.width) {
|
|
current_row[x] = val;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
* Thread function queues kernel function and memset cmds
|
|
*/
|
|
static void threadFunc(hipStream_t stream, hipPitchedPtr devpPtr, int memsetval, int testval,
|
|
hipExtent extent, hipMemcpy3DParms myparms) {
|
|
// Kernel Launch Configuration
|
|
constexpr auto size = 8;
|
|
dim3 threadsPerBlock = dim3(size, size, size);
|
|
dim3 blocks;
|
|
blocks = dim3((extent.width + threadsPerBlock.x - 1) / threadsPerBlock.x,
|
|
(extent.height + threadsPerBlock.y - 1) / threadsPerBlock.y,
|
|
(extent.depth + threadsPerBlock.z - 1) / threadsPerBlock.z);
|
|
|
|
hipLaunchKernelGGL(func_set_value, dim3(blocks), dim3(threadsPerBlock), 0, stream, devpPtr,
|
|
extent, memsetval);
|
|
HIP_CHECK(hipGetLastError());
|
|
HIPCHECK(hipMemset3DAsync(devpPtr, testval, extent, stream));
|
|
HIPCHECK(hipMemcpy3DAsync(&myparms, stream));
|
|
}
|
|
|
|
|
|
/**
|
|
* Performs api regression in loop
|
|
*/
|
|
bool loopRegression(bool bAsync) {
|
|
bool testPassed = true;
|
|
char* A_h;
|
|
constexpr int memsetval = 1;
|
|
constexpr size_t numH = 256, numW = 100, depth = 10;
|
|
int numGpu = 0, hasPeerAccess = 0;
|
|
size_t width = numW * sizeof(char);
|
|
hipExtent extent = make_hipExtent(width, numH, depth);
|
|
size_t sizeElements = width * numH * depth;
|
|
size_t elements = numW * numH * depth;
|
|
std::vector<hipPitchedPtr> devPitchedPtrlist;
|
|
hipPitchedPtr pitchedPtr, devpPtr;
|
|
|
|
A_h = reinterpret_cast<char*>(malloc(sizeElements));
|
|
REQUIRE(A_h != nullptr);
|
|
memset(A_h, 0, sizeElements);
|
|
|
|
// Populate hipMemcpy3D parameters
|
|
hipMemcpy3DParms myparms{};
|
|
myparms.srcPos = make_hipPos(0, 0, 0);
|
|
myparms.dstPos = make_hipPos(0, 0, 0);
|
|
myparms.dstPtr = make_hipPitchedPtr(A_h, width, numW, numH);
|
|
myparms.extent = extent;
|
|
|
|
#if HT_NVIDIA
|
|
myparms.kind = hipMemcpyKindToCudaMemcpyKind(hipMemcpyDeviceToHost);
|
|
#else
|
|
myparms.kind = hipMemcpyDeviceToHost;
|
|
#endif
|
|
|
|
HIP_CHECK(hipGetDeviceCount(&numGpu));
|
|
REQUIRE(numGpu > 0);
|
|
|
|
// Alloc 3D arrays in all GPUs
|
|
for (int j = 0; j < numGpu; j++) {
|
|
HIP_CHECK(hipSetDevice(j));
|
|
HIP_CHECK(hipMalloc3D(&pitchedPtr, extent));
|
|
devPitchedPtrlist.push_back(pitchedPtr);
|
|
}
|
|
|
|
for (int itern = 0; itern < MAX_REGRESS_ITERS; itern++) {
|
|
// Validate hipMemset3D data consistency in multiple iters
|
|
for (int i = 0; i < numGpu; i++) {
|
|
for (int j = 0; j < numGpu; j++) {
|
|
HIP_CHECK(hipDeviceCanAccessPeer(&hasPeerAccess, i, j));
|
|
if (!hasPeerAccess) {
|
|
// Skip and continue if no peer access
|
|
continue;
|
|
}
|
|
|
|
HIP_CHECK(hipSetDevice(i));
|
|
devpPtr = devPitchedPtrlist[j];
|
|
HIP_CHECK(hipDeviceEnablePeerAccess(j, 0));
|
|
HIP_CHECK(hipMemset3D(devpPtr, 0, extent));
|
|
|
|
if (bAsync) {
|
|
hipStream_t stream;
|
|
HIP_CHECK(hipStreamCreate(&stream));
|
|
HIP_CHECK(hipMemset3DAsync(devpPtr, memsetval, extent, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
HIP_CHECK(hipStreamDestroy(stream));
|
|
} else {
|
|
HIP_CHECK(hipMemset3D(devpPtr, memsetval, extent));
|
|
}
|
|
|
|
myparms.srcPtr = devpPtr;
|
|
memset(A_h, 0, sizeElements);
|
|
HIP_CHECK(hipMemcpy3D(&myparms));
|
|
|
|
for (size_t indx = 0; indx < elements; indx++) {
|
|
if (A_h[indx] != memsetval) {
|
|
testPassed = false;
|
|
printf(
|
|
"RegressIter : mismatch at index:%d computed:%02x, "
|
|
"memsetval:%02x\n",
|
|
static_cast<int>(indx), static_cast<int>(A_h[indx]), static_cast<int>(memsetval));
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for (int j = 0; j < numGpu; j++) {
|
|
HIP_CHECK(hipFree(devPitchedPtrlist[j].ptr));
|
|
}
|
|
|
|
free(A_h);
|
|
return testPassed;
|
|
}
|
|
|
|
/**
|
|
* Perform regression of hipMemset3D api with device memory allocated
|
|
* on different gpus.
|
|
*/
|
|
TEST_CASE("Unit_hipMemset3D_RegressInLoop", "[multigpu]") {
|
|
CHECK_IMAGE_SUPPORT
|
|
|
|
bool TestPassed = false;
|
|
|
|
TestPassed = loopRegression(0);
|
|
REQUIRE(TestPassed == true);
|
|
}
|
|
|
|
/**
|
|
* Perform regression of hipMemset3DAsync api with device memory allocated
|
|
* on different gpus.
|
|
*/
|
|
TEST_CASE("Unit_hipMemset3DAsync_RegressInLoop", "[multigpu]") {
|
|
CHECK_IMAGE_SUPPORT
|
|
|
|
bool TestPassed = false;
|
|
|
|
TestPassed = loopRegression(1);
|
|
REQUIRE(TestPassed == true);
|
|
}
|
|
|
|
/**
|
|
* Async commands queued concurrently and executed
|
|
*/
|
|
TEST_CASE("Unit_hipMemset3DAsync_ConcurrencyMthread") {
|
|
CHECK_IMAGE_SUPPORT
|
|
|
|
char* A_h;
|
|
constexpr int memsetval = 1, testval = 2;
|
|
constexpr size_t numH = 256, numW = 100, depth = 10;
|
|
size_t width = numW * sizeof(char);
|
|
hipExtent extent = make_hipExtent(width, numH, depth);
|
|
size_t sizeElements = width * numH * depth;
|
|
size_t elements = numW * numH * depth;
|
|
hipPitchedPtr devpPtr;
|
|
hipStream_t stream;
|
|
|
|
HIP_CHECK(hipStreamCreate(&stream));
|
|
HIP_CHECK(hipMalloc3D(&devpPtr, extent));
|
|
|
|
A_h = reinterpret_cast<char*>(malloc(sizeElements));
|
|
REQUIRE(A_h != nullptr);
|
|
memset(A_h, 0, sizeElements);
|
|
|
|
// Populate hipMemcpy3D parameters
|
|
hipMemcpy3DParms myparms{};
|
|
myparms.srcPos = make_hipPos(0, 0, 0);
|
|
myparms.srcPtr = devpPtr;
|
|
myparms.dstPos = make_hipPos(0, 0, 0);
|
|
myparms.dstPtr = make_hipPitchedPtr(A_h, width, numW, numH);
|
|
myparms.extent = extent;
|
|
|
|
#if HT_NVIDIA
|
|
myparms.kind = hipMemcpyKindToCudaMemcpyKind(hipMemcpyDeviceToHost);
|
|
#else
|
|
myparms.kind = hipMemcpyDeviceToHost;
|
|
#endif
|
|
|
|
std::vector<std::thread> threadlist;
|
|
|
|
// Queue cmds concurrently from multiple threads on same stream
|
|
for (int i = 0; i < MAX_THREADS; i++) {
|
|
threadlist.push_back(
|
|
std::thread(threadFunc, stream, devpPtr, memsetval, testval, extent, myparms));
|
|
}
|
|
|
|
for (auto& t : threadlist) {
|
|
t.join();
|
|
}
|
|
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
for (size_t k = 0; k < elements; k++) {
|
|
if (A_h[k] != testval) {
|
|
CAPTURE(A_h[k], testval, k);
|
|
REQUIRE(false);
|
|
}
|
|
}
|
|
|
|
HIP_CHECK(hipStreamDestroy(stream));
|
|
free(A_h);
|
|
HIP_CHECK(hipFree(devpPtr.ptr));
|
|
}
|