Merge pull request #933 from ROCm-Developer-Tools/fix_hipmemset

Add HIP memset APIs to cope with non-zero initial values of integer types
Этот коммит содержится в:
Maneesh Gupta
2019-03-05 14:31:38 +05:30
коммит произвёл GitHub
родитель de4a9b8446 51f29b9cee
Коммит 2ed31e9f6c
6 изменённых файлов: 142 добавлений и 13 удалений
+29
Просмотреть файл
@@ -1504,6 +1504,17 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes);
*/
hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t sizeBytes);
/**
* @brief Fills the memory area pointed to by dest with the constant integer
* value for specified number of times.
*
* @param[out] dst Data being filled
* @param[in] constant value to be set
* @param[in] number of values to be set
* @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
*/
hipError_t hipMemsetD32(hipDeviceptr_t dest, int value, size_t count);
/**
* @brief Fills the first sizeBytes bytes of the memory area pointed to by dev with the constant
* byte value value.
@@ -1521,6 +1532,24 @@ hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t sizeByte
*/
hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t stream __dparm(0));
/**
* @brief Fills the memory area pointed to by dev with the constant integer
* value for specified number of times.
*
* hipMemsetD32Async() is asynchronous with respect to the host, so the call may return before the
* memset is complete. The operation can optionally be associated to a stream by passing a non-zero
* stream argument. If stream is non-zero, the operation may overlap with operations in other
* streams.
*
* @param[out] dst Pointer to device memory
* @param[in] value - Value to set for each byte of specified memory
* @param[in] count - number of values to be set
* @param[in] stream - Stream identifier
* @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree
*/
hipError_t hipMemsetD32Async(hipDeviceptr_t dst, int value, size_t count,
hipStream_t stream __dparm(0));
/**
* @brief Fills the memory area pointed to by dst with the constant value.
*
+9
Просмотреть файл
@@ -655,11 +655,20 @@ inline static hipError_t hipMemset(void* devPtr, int value, size_t count) {
return hipCUDAErrorTohipError(cudaMemset(devPtr, value, count));
}
inline static hipError_t hipMemsetD32(hipDeviceptr_t devPtr, int value, size_t count) {
return hipCUResultTohipError(cuMemsetD32(devPtr, value, count));
}
inline static hipError_t hipMemsetAsync(void* devPtr, int value, size_t count,
hipStream_t stream __dparm(0)) {
return hipCUDAErrorTohipError(cudaMemsetAsync(devPtr, value, count, stream));
}
inline static hipError_t hipMemsetD32Async(hipDeviceptr_t devPtr, int value, size_t count,
hipStream_t stream __dparm(0)) {
return hipCUResultTohipError(cuMemsetD32Async(devPtr, value, count, stream));
}
inline static hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t sizeBytes) {
return hipCUResultTohipError(cuMemsetD8(dest, value, sizeBytes));
}
+38 -10
Просмотреть файл
@@ -1508,13 +1508,13 @@ __global__ void hip_copy2d_n(T* dst, const T* src, size_t width, size_t height,
} // namespace
template <typename T>
void ihipMemsetKernel(hipStream_t stream, T* ptr, T val, size_t sizeBytes) {
void ihipMemsetKernel(hipStream_t stream, T* ptr, T val, size_t count) {
static constexpr uint32_t block_dim = 256;
const uint32_t grid_dim = clamp_integer<size_t>(sizeBytes / block_dim, 1, UINT32_MAX);
const uint32_t grid_dim = clamp_integer<size_t>(count / block_dim, 1, UINT32_MAX);
hipLaunchKernelGGL(hip_fill_n<block_dim>, dim3(grid_dim), dim3{block_dim}, 0u, stream, ptr,
sizeBytes, std::move(val));
count, std::move(val));
}
template <typename T>
@@ -1533,20 +1533,20 @@ typedef enum ihipMemsetDataType {
ihipMemsetDataTypeInt = 2
}ihipMemsetDataType;
hipError_t ihipMemset(void* dst, int value, size_t sizeBytes, hipStream_t stream, enum ihipMemsetDataType copyDataType )
hipError_t ihipMemset(void* dst, int value, size_t count, hipStream_t stream, enum ihipMemsetDataType copyDataType )
{
hipError_t e = hipSuccess;
if (sizeBytes == 0) return e;
if (count == 0) return e;
if (stream && (dst != NULL)) {
if(copyDataType == ihipMemsetDataTypeChar){
if ((sizeBytes & 0x3) == 0) {
if ((count & 0x3) == 0) {
// use a faster dword-per-workitem copy:
try {
value = value & 0xff;
uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t));
ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value32, count/sizeof(uint32_t));
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
@@ -1554,7 +1554,7 @@ hipError_t ihipMemset(void* dst, int value, size_t sizeBytes, hipStream_t strea
} else {
// use a slow byte-per-workitem copy:
try {
ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, count);
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
@@ -1563,14 +1563,14 @@ hipError_t ihipMemset(void* dst, int value, size_t sizeBytes, hipStream_t strea
} else {
if(copyDataType == ihipMemsetDataTypeInt) { // 4 Bytes value
try {
ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value, sizeBytes);
ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value, count);
} catch (std::exception &ex) {
e = hipErrorInvalidValue;
}
} else if(copyDataType == ihipMemsetDataTypeShort) {
try {
value = value & 0xffff;
ihipMemsetKernel<uint16_t> (stream, static_cast<uint16_t*> (dst), value, sizeBytes);
ihipMemsetKernel<uint16_t> (stream, static_cast<uint16_t*> (dst), value, count);
} catch (std::exception &ex) {
e = hipErrorInvalidValue;
}
@@ -1719,6 +1719,18 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t st
return ihipLogStatus(e);
};
hipError_t hipMemsetD32Async(hipDeviceptr_t dst, int value, size_t count, hipStream_t stream) {
HIP_INIT_SPECIAL_API(hipMemsetD32Async, (TRACE_MCMD), dst, value, count, stream);
hipError_t e = hipSuccess;
stream = ihipSyncAndResolveStream(stream);
e = ihipMemset(dst, value, count, stream, ihipMemsetDataTypeInt);
return ihipLogStatus(e);
};
hipError_t hipMemset(void* dst, int value, size_t sizeBytes) {
HIP_INIT_SPECIAL_API(hipMemset, (TRACE_MCMD), dst, value, sizeBytes);
@@ -1787,6 +1799,22 @@ hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeBytes
return ihipLogStatus(e);
}
hipError_t hipMemsetD32(hipDeviceptr_t dst, int value, size_t count) {
HIP_INIT_SPECIAL_API(hipMemsetD32, (TRACE_MCMD), dst, value, count);
hipError_t e = hipSuccess;
hipStream_t stream = hipStreamNull;
stream = ihipSyncAndResolveStream(stream);
if (stream) {
e = ihipMemset(dst, value, count, stream, ihipMemsetDataTypeInt);
stream->locked_wait();
} else {
e = hipErrorInvalidValue;
}
return ihipLogStatus(e);
}
hipError_t hipMemset3D(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent )
{
HIP_INIT_SPECIAL_API(hipMemset3D, (TRACE_MCMD), &pitchedDevPtr, value, &extent);
+58 -3
Просмотреть файл
@@ -26,11 +26,11 @@ THE SOFTWARE.
* BUILD: %t %s ../../test_common.cpp
* RUN: %t
* //Small copy
* RUN: %t -N 10 --memsetval 0x42
* RUN: %t -N 10 --memsetval 0x42 --memsetD32val 0x101
* // Oddball size
* RUN: %t -N 10013 --memsetval 0x5a
* RUN: %t -N 10013 --memsetval 0x5a --memsetD32val 0xDEADBEEF
* // Big copy
* RUN: %t -N 256M --memsetval 0xa6
* RUN: %t -N 256M --memsetval 0xa6 --memsetD32val 0xCAFEBABE
* HIT_END
*/
@@ -62,6 +62,30 @@ bool testhipMemset(int memsetval,int p_gpuDevice)
return testResult;
}
bool testhipMemsetD32(int memsetD32val,int p_gpuDevice)
{
size_t Nbytes = N*sizeof(int);
printf ("testhipMemsetD32 N=%zu memsetD32val=%8x device=%d\n", N, memsetD32val, p_gpuDevice);
int *A_d;
int *A_h;
bool testResult = true;
HIPCHECK ( hipMalloc(&A_d, Nbytes) );
A_h = (int*)malloc(Nbytes);
HIPCHECK ( hipMemsetD32((hipDeviceptr_t)A_d, memsetD32val, N) );
HIPCHECK ( hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost));
for (int i=0; i<N; i++) {
if (A_h[i] != memsetD32val) {
testResult = false; printf("mismatch at index:%d computed:%08x, memsetD32val:%08x\n", i, A_h[i], memsetD32val);
break;
}
}
HIPCHECK(hipFree(A_d));
free(A_h);
return testResult;
}
bool testhipMemsetAsync(int memsetval,int p_gpuDevice)
{
size_t Nbytes = N*sizeof(int);
@@ -91,6 +115,35 @@ bool testhipMemsetAsync(int memsetval,int p_gpuDevice)
return testResult;
}
bool testhipMemsetD32Async(int memsetD32val,int p_gpuDevice)
{
size_t Nbytes = N*sizeof(int);
printf ("testhipMemsetD32Async N=%zu memsetval=%8x device=%d\n", N, memsetD32val, p_gpuDevice);
int *A_d;
int *A_h;
bool testResult = true;
HIPCHECK ( hipMalloc((void**)&A_d, Nbytes) );
A_h = (int*)malloc(Nbytes);
hipStream_t stream;
HIPCHECK(hipStreamCreate(&stream));
HIPCHECK ( hipMemsetD32Async((hipDeviceptr_t)A_d, memsetD32val, N, stream ));
HIPCHECK ( hipStreamSynchronize(stream));
HIPCHECK ( hipMemcpy(A_h, (void*)A_d, Nbytes, hipMemcpyDeviceToHost));
for (int i=0; i<N; i++) {
if (A_h[i] != memsetD32val) {
testResult = false;
printf("mismatch at index:%d computed:%02x, memsetD32val:%02x\n", i, A_h[i], memsetD32val);
break;
}
}
HIPCHECK(hipFree((void*)A_d));
HIPCHECK(hipStreamDestroy(stream));
free(A_h);
return testResult;
}
int main(int argc, char *argv[])
{
HipTest::parseStandardArguments(argc, argv, true);
@@ -98,6 +151,8 @@ int main(int argc, char *argv[])
HIPCHECK(hipSetDevice(p_gpuDevice));
testResult &= testhipMemset(memsetval, p_gpuDevice);
testResult &= testhipMemsetAsync(memsetval, p_gpuDevice);
testResult &= testhipMemsetD32(memsetD32val, p_gpuDevice);
testResult &= testhipMemsetD32Async(memsetD32val, p_gpuDevice);
if (testResult) passed();
failed("Output Mismatch\n");
}
+7
Просмотреть файл
@@ -24,6 +24,7 @@ THE SOFTWARE.
// standard global variables that can be set on command line
size_t N = 4 * 1024 * 1024;
char memsetval = 0x42;
int memsetD32val = 0xDEADBEEF;
int iterations = 1;
unsigned blocksPerCU = 6; // to hide latency
unsigned threadsPerBlock = 256;
@@ -99,6 +100,12 @@ int parseStandardArguments(int argc, char* argv[], bool failOnUndefinedArg) {
failed("Bad memsetval argument");
}
memsetval = ex;
} else if (!strcmp(arg, "--memsetD32val")) {
int ex;
if (++i >= argc || !HipTest::parseInt(argv[i], &ex)) {
failed("Bad memsetD32val argument");
}
memsetD32val = ex;
} else if (!strcmp(arg, "--iterations") || (!strcmp(arg, "-i"))) {
if (++i >= argc || !HipTest::parseInt(argv[i], &iterations)) {
failed("Bad iterations argument");
+1
Просмотреть файл
@@ -98,6 +98,7 @@ THE SOFTWARE.
// standard command-line variables:
extern size_t N;
extern char memsetval;
extern int memsetD32val;
extern int iterations;
extern unsigned blocksPerCU;
extern unsigned threadsPerBlock;