Merge branch 'amd-master-next' into amd-npi-next

Change-Id: I75be5fbe5ddd6552a6a316ec99ca8833ee02cb6c


[ROCm/hip commit: e19c8e176f]
Αυτή η υποβολή περιλαμβάνεται σε:
Vlad Sytchenko
2020-05-21 19:17:10 -04:00
γονέας 13afb72279 02d5a7c4b9
υποβολή 2cefbe38e1
13 αρχεία άλλαξαν με 289 προσθήκες και 321 διαγραφές
+36 -19
Προβολή Αρχείου
@@ -620,7 +620,17 @@ foreach $arg (@ARGV)
$toolArgs = substr $toolArgs, 0, -8;
chomp $toolArgs;
}
} elsif ($arg eq 'c' and $prevArg eq '-x') {
$hasC = 1;
$hasCXX = 0;
$hasHIP = 0;
} elsif ($arg eq 'c++' and $prevArg eq '-x') {
$hasC = 0;
$hasCXX = 1;
$hasHIP = 0;
} elsif ($arg eq 'hip' and $prevArg eq '-x') {
$hasC = 0;
$hasCXX = 0;
$hasHIP = 1;
} elsif ($arg =~ m/^-/) {
# options start with -
@@ -647,28 +657,35 @@ foreach $arg (@ARGV)
#print "O: <$arg>\n";
} elsif ($prevArg ne '-o') {
# input files and libraries
if ($arg =~ /\.c$/) {
$hasC = 1;
# Skip guessing if `-x {c|c++|hip}` is already specified.
if (not ($hasC or $hasCXX or $hasHIP)) {
if ($arg =~ /\.c$/) {
$hasC = 1;
$needCFLAGS = 1;
$toolArgs .= " -x c"
} elsif (($arg =~ /\.cpp$/) or ($arg =~ /\.cxx$/) or ($arg =~ /\.cc$/) ) {
$needCXXFLAGS = 1;
if ($HIP_COMPILE_CXX_AS_HIP eq '0' or $HIP_COMPILER ne "clang") {
$hasCXX = 1;
} else {
$hasHIP = 1;
$toolArgs .= " -x hip";
}
} elsif ((($arg =~ /\.cu$/ or $arg =~ /\.cuh$/) and $HIP_COMPILE_CXX_AS_HIP ne '0') or ($arg =~ /\.hip$/)) {
$needCXXFLAGS = 1;
if ($HIP_PLATFORM eq "hcc" and $HIP_COMPILER eq "clang") {
$hasHIP = 1;
$toolArgs .= " -x hip";
} else {
$hasCU = 1;
}
}
} elsif ($hasC) {
$needCFLAGS = 1;
$toolArgs .= " -x c"
}
elsif (($arg =~ /\.cpp$/) or ($arg =~ /\.cxx$/) or ($arg =~ /\.cc$/) ) {
} elsif ($hasCXX) {
$needCXXFLAGS = 1;
if ($HIP_COMPILE_CXX_AS_HIP eq '0' or $HIP_COMPILER ne "clang") {
$hasCXX = 1;
} else {
$hasHIP = 1;
$toolArgs .= " -x hip";
}
}
elsif ((($arg =~ /\.cu$/ or $arg =~ /\.cuh$/) and $HIP_COMPILE_CXX_AS_HIP ne '0') or ($arg =~ /\.hip$/)) {
} else {
$needCXXFLAGS = 1;
if ($HIP_PLATFORM eq "hcc" and $HIP_COMPILER eq "clang") {
$hasHIP = 1;
$toolArgs .= " -x hip";
} else {
$hasCU = 1;
}
}
push (@inputs, $arg);
#print "I: <$arg>\n";
@@ -35,7 +35,8 @@ set(HIP_CLANG_PARALLEL_BUILD_COMPILE_OPTIONS "@HIP_CLANG_PARALLEL_BUILD_COMPILE_
@_HIP_HCC_FLAGS@
@_HIP_CLANG_FLAGS@
@_HIP_NVCC_FLAGS@
set(HIP_HIPCC_INCLUDE_ARGS "@HIP_HIPCC_INCLUDE_ARGS@") # list (needs to be in quotes to handle spaces properly)
#Needed to bring the HIP_HIPCC_INCLUDE_ARGS variable in scope
set(HIP_HIPCC_INCLUDE_ARGS @HIP_HIPCC_INCLUDE_ARGS@) # list
set(cmake_dependency_file "@cmake_dependency_file@") # path
set(source_file "@source_file@") # path
@@ -513,7 +513,7 @@ hipError_t hipDeviceGetSharedMemConfig(hipSharedMemConfig* pConfig);
*
* @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
*/
hipError_t hipGetDeviceFlags(unsigned *flags);
hipError_t hipGetDeviceFlags(unsigned int* flags);
/**
* @brief The bank width of shared memory on current device is set
@@ -132,6 +132,8 @@ target_include_directories(hip64
${PROJECT_SOURCE_DIR}
${PROJECT_SOURCE_DIR}/amdocl
${PROJECT_SOURCE_DIR}/include/hip/hcc_detail/elfio
# FIXME: Remove ROCclr_DIr explicit references
${ROCclr_DIR}
${ROCclr_DIR}/include
${ROCclr_DIR}/compiler/lib
@@ -139,6 +141,7 @@ target_include_directories(hip64
${ROCclr_DIR}/elf/utils/common
${ROCclr_DIR}/elf/utils/libelf
${ROCR_INCLUDES}
$<TARGET_PROPERTY:amdrocclr_static,INTERFACE_INCLUDE_DIRECTORIES>
$<TARGET_PROPERTY:amd_comgr,INTERFACE_INCLUDE_DIRECTORIES>)
target_compile_definitions(hip64
PRIVATE
@@ -99,7 +99,7 @@ hipError_t hipDeviceComputeCapability(int *major, int *minor, hipDevice_t device
}
hipError_t hipDeviceGetCount(int* count) {
HIP_INIT_API(NONE, count);
HIP_INIT_API(hipDeviceGetCount, count);
HIP_RETURN(ihipDeviceGetCount(count));
}
@@ -238,7 +238,7 @@ hipError_t hipGetDeviceProperties ( hipDeviceProp_t* props, hipDevice_t device )
}
hipError_t hipHccGetAccelerator(int deviceId, hc::accelerator* acc) {
HIP_INIT_API(NONE, deviceId, acc);
HIP_INIT_API(hipHccGetAccelerator, deviceId, acc);
assert(0 && "Unimplemented");
@@ -246,7 +246,7 @@ hipError_t hipHccGetAccelerator(int deviceId, hc::accelerator* acc) {
}
hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view** av) {
HIP_INIT_API(NONE, stream, av);
HIP_INIT_API(hipHccGetAcceleratorView, stream, av);
assert(0 && "Unimplemented");
@@ -480,7 +480,7 @@ hipError_t hipGetDeviceFlags ( unsigned int* flags ) {
}
hipError_t hipIpcGetEventHandle ( hipIpcEventHandle_t* handle, hipEvent_t event ) {
HIP_INIT_API(NONE, handle, event);
HIP_INIT_API(hipIpcGetEventHandle, handle, event);
assert(0 && "Unimplemented");
@@ -488,7 +488,7 @@ hipError_t hipIpcGetEventHandle ( hipIpcEventHandle_t* handle, hipEvent_t event
}
hipError_t hipIpcOpenEventHandle ( hipEvent_t* event, hipIpcEventHandle_t handle ) {
HIP_INIT_API(NONE, event, handle);
HIP_INIT_API(hipIpcOpenEventHandle, event, handle);
assert(0 && "Unimplemented");
@@ -542,7 +542,7 @@ hipError_t hipSetDeviceFlags ( unsigned int flags ) {
}
hipError_t hipSetValidDevices ( int* device_arr, int len ) {
HIP_INIT_API(NONE, device_arr, len);
HIP_INIT_API(hipSetValidDevices, device_arr, len);
assert(0 && "Unimplemented");
@@ -90,9 +90,15 @@ hipError_t Event::elapsedTime(Event& eStop, float& ms) {
return hipErrorNotReady;
}
ms = static_cast<float>(static_cast<int64_t>(eStop.event_->profilingInfo().end_ -
// For certain HIP Api's that take start and stop event
// the command is the same
if (event_ == eStop.event_) {
ms = static_cast<float>(static_cast<int64_t>(eStop.event_->profilingInfo().end_ -
event_->profilingInfo().start_))/1000000.f;
} else {
ms = static_cast<float>(static_cast<int64_t>(eStop.event_->profilingInfo().end_ -
event_->profilingInfo().end_))/1000000.f;
}
return hipSuccess;
}
@@ -110,6 +110,9 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin
// Skip if nothing needs writing.
return hipSuccess;
}
if (dst == nullptr || src == nullptr) {
return hipErrorInvalidValue;
}
amd::Command* command = nullptr;
amd::Command::EventWaitList waitList;
@@ -1717,7 +1720,9 @@ hipError_t ihipMemset(void* dst, int64_t value, size_t valueSize, size_t sizeByt
}
size_t offset = 0;
amd::Memory* memory = getMemoryObject(dst, offset);
auto aligned_dst = amd::alignUp(reinterpret_cast<address>(dst), sizeof(uint64_t));
amd::Memory* memory = getMemoryObject(aligned_dst, offset);
if (memory == nullptr) {
// Host alloced memory
memset(dst, value, sizeBytes);
@@ -1727,44 +1732,52 @@ hipError_t ihipMemset(void* dst, int64_t value, size_t valueSize, size_t sizeByt
hipError_t hip_error = hipSuccess;
amd::HostQueue* queue = hip::getQueue(stream);
size_t n_head_bytes = 0;
size_t n_tail_bytes = 0;
int64_t value64 = 0;
const size_t uint64ModSize = (sizeBytes % sizeof(int64_t));
if (sizeBytes/sizeof(int64_t) > 0) {
n_head_bytes = static_cast<uint8_t*>(aligned_dst) - static_cast<uint8_t*>(dst);
if (valueSize == sizeof(int8_t)) {
value = value & 0xff;
value64 = ((value << 56) | (value << 48) | (value << 40) | (value << 32)
| (value << 24) | (value << 16) | (value << 8) | (value));
} else if (valueSize == sizeof(int16_t)) {
value = value & 0xffff;
value64 = ((value << 48) | (value << 32) | (value<<16) | (value));
} else if(valueSize == sizeof(int32_t)) {
value64 = ((value << 48) | (value << 32) | (value << 16) | (value));
} else if (valueSize == sizeof(int32_t)) {
value = value & 0xffffffff;
value64 = ((value<<32) | (value));
value64 = ((value << 32) | (value));
} else if (valueSize == sizeof(int64_t)) {
value64 = value;
} else {
LogPrintfError("Unsupported Pattern size: %u \n", valueSize);
return hipErrorInvalidValue;
}
// If uint64ModSize is != 0 then we will do a second fillBuffer Command
n_tail_bytes = ((sizeBytes - n_head_bytes) % sizeof(int64_t));
// If n_tail_bytes is != 0 then we will do a second fillBuffer Command
// on the same stream below, dont wait, do the first call async.
hip_error = packFillMemoryCommand(memory, offset, value64, sizeof(int64_t),
sizeBytes - uint64ModSize, queue,
((uint64ModSize != 0) || isAsync));
if(hip_error != hipSuccess) {
sizeBytes - n_tail_bytes - n_head_bytes, queue,
((n_head_bytes != 0) || (n_tail_bytes != 0) || isAsync));
if (hip_error != hipSuccess) {
return hip_error;
}
} else {
n_head_bytes = sizeBytes;
}
if (uint64ModSize != 0) {
void* new_dst = reinterpret_cast<void*>((reinterpret_cast<address>(dst)
+ sizeBytes) - uint64ModSize);
if (n_head_bytes != 0) {
memory = getMemoryObject(dst, offset);
hip_error = packFillMemoryCommand(memory, offset, value, valueSize,
n_head_bytes , queue, isAsync);
}
if (n_tail_bytes != 0) {
void* new_dst = (reinterpret_cast<address>(dst) + sizeBytes) - n_tail_bytes;
memory = getMemoryObject(new_dst, offset);
hip_error = packFillMemoryCommand(memory, offset, value, valueSize,
uint64ModSize, queue, isAsync);
n_tail_bytes, queue, isAsync);
}
return hip_error;
}
@@ -2090,7 +2103,7 @@ hipError_t hipArrayGetDescriptor(HIP_ARRAY_DESCRIPTOR* pArrayDescriptor,
hipError_t hipMemcpyParam2DAsync(const hip_Memcpy2D* pCopy,
hipStream_t stream) {
HIP_INIT_API(hipMemcpyParam2D, pCopy);
HIP_INIT_API(hipMemcpyParam2DAsync, pCopy);
HIP_RETURN(ihipMemcpyParam2D(pCopy, stream, true));
}
@@ -119,7 +119,7 @@ hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image,
void** optionsValues)
{
/* TODO: Pass options to Program */
HIP_INIT_API(hipModuleLoadData, module, image);
HIP_INIT_API(hipModuleLoadDataEx, module, image);
HIP_RETURN(ihipModuleLoadData(module, image, 0));
}
@@ -369,7 +369,7 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0,
uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0,
uint64_t prevGridSum = 0, uint64_t allGridSum = 0, uint32_t firstDevice = 0) {
HIP_INIT_API(NONE, f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ,
HIP_INIT_API(ihipModuleLaunchKernel, f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ,
sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, flags, params);
hip::Function* function = hip::Function::asFunction(f);
@@ -493,7 +493,7 @@ hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
hipStream_t hStream, void** kernelParams, void** extra,
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags)
{
HIP_INIT_API(NONE, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ,
HIP_INIT_API(hipExtModuleLaunchKernel, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ,
localWorkSizeX, localWorkSizeY, localWorkSizeZ,
sharedMemBytes, hStream,
kernelParams, extra, startEvent, stopEvent, flags);
@@ -512,7 +512,7 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t gridDimX,
hipEvent_t startEvent,
hipEvent_t stopEvent)
{
HIP_INIT_API(NONE, f, gridDimX, gridDimY, gridDimZ,
HIP_INIT_API(hipHccModuleLaunchKernel, f, gridDimX, gridDimY, gridDimZ,
blockDimX, blockDimY, blockDimZ,
sharedMemBytes, hStream,
kernelParams, extra, startEvent, stopEvent);
@@ -529,7 +529,7 @@ hipError_t hipModuleLaunchKernelExt(hipFunction_t f, uint32_t gridDimX,
hipEvent_t startEvent,
hipEvent_t stopEvent)
{
HIP_INIT_API(NONE, f, gridDimX, gridDimY, gridDimZ,
HIP_INIT_API(hipModuleLaunchKernelExt, f, gridDimX, gridDimY, gridDimZ,
blockDimX, blockDimY, blockDimZ,
sharedMemBytes, hStream,
kernelParams, extra, startEvent, stopEvent);
@@ -191,13 +191,6 @@ void PlatformState::init()
for (auto& it : vars_) {
it.second.rvars.resize(g_devices.size());
}
if (!HIP_ENABLE_LAZY_KERNEL_LOADING) {
for (size_t i = 0; i < g_devices.size(); ++i) {
for (auto& it: functions_) {
getFunc(it.first, i);
}
}
}
}
bool PlatformState::unregisterFunc(hipModule_t hmod) {
@@ -599,6 +592,13 @@ void PlatformState::popExec(ihipExec_t& exec) {
execStack_.pop();
}
namespace {
const int HIP_ENABLE_DEFERRED_LOADING{[] () {
char *var = getenv("HIP_ENABLE_DEFERRED_LOADING");
return var ? atoi(var) : 1;
}()};
} /* namespace */
extern "C" void __hipRegisterFunction(
std::vector<std::pair<hipModule_t,bool> >* modules,
const void* hostFunction,
@@ -613,9 +613,12 @@ extern "C" void __hipRegisterFunction(
{
PlatformState::DeviceFunction func{ std::string{deviceName}, modules, std::vector<hipFunction_t>{g_devices.size()}};
PlatformState::instance().registerFunction(hostFunction, func);
// for (size_t i = 0; i < g_devices.size(); ++i) {
// PlatformState::instance().getFunc(hostFunction, i);
// }
if (!HIP_ENABLE_DEFERRED_LOADING) {
HIP_INIT();
for (size_t i = 0; i < g_devices.size(); ++i) {
PlatformState::instance().getFunc(hostFunction, i);
}
}
}
// Registers a device-side global variable.
@@ -706,7 +709,7 @@ extern "C" hipError_t hipConfigureCall(
size_t sharedMem,
hipStream_t stream)
{
HIP_INIT_API(NONE, gridDim, blockDim, sharedMem, stream);
HIP_INIT_API(hipConfigureCall, gridDim, blockDim, sharedMem, stream);
PlatformState::instance().configureCall(gridDim, blockDim, sharedMem, stream);
@@ -719,7 +722,7 @@ extern "C" hipError_t __hipPushCallConfiguration(
size_t sharedMem,
hipStream_t stream)
{
HIP_INIT_API(NONE, gridDim, blockDim, sharedMem, stream);
HIP_INIT_API(__hipPushCallConfiguration, gridDim, blockDim, sharedMem, stream);
PlatformState::instance().configureCall(gridDim, blockDim, sharedMem, stream);
@@ -730,7 +733,7 @@ extern "C" hipError_t __hipPopCallConfiguration(dim3 *gridDim,
dim3 *blockDim,
size_t *sharedMem,
hipStream_t *stream) {
HIP_INIT_API(NONE, gridDim, blockDim, sharedMem, stream);
HIP_INIT_API(__hipPopCallConfiguration, gridDim, blockDim, sharedMem, stream);
ihipExec_t exec;
PlatformState::instance().popExec(exec);
@@ -747,7 +750,7 @@ extern "C" hipError_t hipSetupArgument(
size_t size,
size_t offset)
{
HIP_INIT_API(NONE, arg, size, offset);
HIP_INIT_API(hipSetupArgument, arg, size, offset);
PlatformState::instance().setupArgument(arg, size, offset);
@@ -756,7 +759,7 @@ extern "C" hipError_t hipSetupArgument(
extern "C" hipError_t hipLaunchByPtr(const void *hostFunction)
{
HIP_INIT_API(NONE, hostFunction);
HIP_INIT_API(hipLaunchByPtr, hostFunction);
ihipExec_t exec;
PlatformState::instance().popExec(exec);
@@ -1301,7 +1304,7 @@ extern "C" hipError_t hipLaunchKernel(const void *hostFunction,
size_t sharedMemBytes,
hipStream_t stream)
{
HIP_INIT_API(NONE, hostFunction, gridDim, blockDim, args, sharedMemBytes,
HIP_INIT_API(hipLaunchKernel, hostFunction, gridDim, blockDim, args, sharedMemBytes,
stream);
hip::Stream* s = reinterpret_cast<hip::Stream*>(stream);
@@ -68,8 +68,8 @@ def filtr_api_name(name):
return name
def filtr_api_decl(record):
record = re.sub("\s__dparm\([^\)]*\)", '', record);
record = re.sub("\(void\*\)", '', record);
record = re.sub("\s__dparm\([^\)]*\)", r'', record);
record = re.sub("\(void\*\)", r'', record);
return record
# Normalizing API arguments
@@ -216,6 +216,8 @@ def parse_content(inp_file_p, api_map, out):
api_name = ""
# Valid public API found flag
api_valid = 0
# API overload (parameters mismatch)
api_overload = 0
# Input file patched content
content = ''
@@ -240,6 +242,7 @@ def parse_content(inp_file_p, api_map, out):
# Looking for API begin
if found == 0:
record = re.sub(r'\s*extern\s+"C"\s+', r'', record);
if beg_pattern.match(record):
found = 1
record = filtr_api_decl(record)
@@ -255,6 +258,8 @@ def parse_content(inp_file_p, api_map, out):
# Checking if complete API matched
if m:
found = 2
api_valid = 0
api_overload = 0
api_name = filtr_api_name(m.group(2))
# Checking if API name is in the API map
if (private_check_mode == 0) or (api_name in api_map):
@@ -280,48 +285,34 @@ def parse_content(inp_file_p, api_map, out):
out[api_name] = filtr_api_opts(api_args)
# Register missmatched API methods
else:
api_overload = 1
# Warning about mismatched API, possible non public overloaded version
api_diff = '\t\t' + inp_file + " line(" + str(line_num) + ")\n\t\tapi: " + api_types + "\n\t\teta: " + eta_types
message("\t" + api_name + ' args mismatch:\n' + api_diff + '\n')
if hip_patch_mode != 0:
# Looking for INIT macro
m = init_pattern.match(line)
if m:
if api_valid == 0: api_name = 'NONE'
if api_name == m.group(3):
if hip_patch_mode == 1: hip_patch_mode = 0
else: fatal("patching failed")
else:
hip_patch_mode = 2
init_args = m.group(2)
if init_args != '': init_args = ', ' + init_args
line = m.group(1) + '(' + api_name + init_args + m.group(5) + '\n'
non_public_api = 0
# API found action
if found == 2:
# Looking for INIT macro
if hip_patch_mode != 0:
# Looking for INIT macro
m = init_pattern.match(line)
if m:
init_name = api_name
if api_overload == 1: init_name = 'NONE'
init_args = m.group(4)
line = m.group(1) + '(' + init_name + init_args + m.group(5) + '\n'
m = init_pattern.match(line)
if m:
found = 0
non_public_api = 0
if api_valid == 1:
api_valid = 0
message("\t" + api_name)
else:
non_public_api = 1
if non_public_api == 1:
if api_valid == 1: message("\t" + api_name)
# Ignore if it is initialized as NONE
init_name = m.group(3)
if init_name != 'NONE':
# Check if init name matching API name
if init_name != api_name:
fatal("init name mismatch: '" + init_name + "' <> '" + api_name + "'")
# Registering dummy API for non public API if the name in INIT is not NONE
init_name = m.group(3)
# Ignore if it is initialized as NONE
if init_name != 'NONE':
# Check if init name matching API name
if init_name != api_name:
fatal("init name mismatch: '" + init_name + "' <> '" + api_name + "'")
if api_valid == 0:
# If init name is not in public API map then it is private API
# else it was not identified and will be checked on finish
if not init_name in api_map:
@@ -333,7 +324,6 @@ def parse_content(inp_file_p, api_map, out):
# Expect INIT macro for valid public API
# Removing and registering non-conformant APIs with missing HIP_INIT macro
if api_valid == 1:
api_valid = 0
if api_name in out:
del out[api_name]
del api_map[api_name]
@@ -44,8 +44,6 @@ void multiplyCPU(float* C, float* A, float* B, int N){
}
}
#if defined(__HIP_PLATFORM_HCC__) && GENERIC_GRID_LAUNCH == 1 && defined(__HCC__)
void launchKernel(float* C, float* A, float* B, bool manual){
hipDeviceProp_t devProp;
@@ -95,10 +93,8 @@ void launchKernel(float* C, float* A, float* B, bool manual){
std::cout << "Theoretical Occupancy is " << (double)numBlock* blockSize/devProp.maxThreadsPerMultiProcessor * 100 << "%" << std::endl;
}
}
#endif
int main() {
#if defined(__HIP_PLATFORM_HCC__) && GENERIC_GRID_LAUNCH == 1 && defined(__HCC__)
float *A, *B, *C0, *C1, *cpuC;
float *Ad, *Bd, *C0d, *C1d;
int errors=0;
@@ -177,8 +173,5 @@ int main() {
free(C0);
free(C1);
free(cpuC);
#else
std::cout <<"hipOccupancyMaxPotentialBlockSize template not support for Clang compiler"<<std::endl;
#endif
return 0;
}
@@ -1,16 +1,13 @@
/*
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -33,236 +30,181 @@ THE SOFTWARE.
* TEST: %t -N 256M --memsetval 0xa6 --memsetD32val 0xCAFEBABE --memsetD16val 0xCAFE --memsetD8val 0xCA
* HIT_END
*/
#define MAX_OFFSET 3
// To test memset on unaligned pointer
#define loop(offset, offsetMax) for (int offset = offsetMax; offset >= 0; offset --)
#include <vector>
#include "hip/hip_runtime.h"
#include "test_common.h"
enum MemsetType {
hipMemsetTypeDefault,
hipMemsetTypeD8,
hipMemsetTypeD16,
hipMemsetTypeD32
};
bool testhipMemset(int memsetval,int p_gpuDevice)
bool testhipMemsetSmallSize(int memsetval, int p_gpuDevice)
{
size_t Nbytes = N*sizeof(char);
printf ("testhipMemset N=%zu memsetval=%2x device=%d\n", N, memsetval, p_gpuDevice);
char *A_d;
char *A_h;
bool testResult = true;
char *A_d;
char *A_h;
bool testResult = true;
for ( size_t iSize = 1; iSize < 4; iSize++ ) {
size_t Nbytes = iSize * sizeof(char);
HIPCHECK(hipMalloc(&A_d, Nbytes));
A_h = reinterpret_cast<char*> (malloc(Nbytes));
printf("testhipMemsetSmallSize N=%zu memsetval=%2x device=%d\n",
iSize, memsetval, p_gpuDevice);
HIPCHECK(hipMemset(A_d, memsetval, Nbytes));
HIPCHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost));
HIPCHECK ( hipMalloc(&A_d, Nbytes) );
A_h = (char*)malloc(Nbytes);
HIPCHECK ( hipMemset(A_d, memsetval, Nbytes) );
HIPCHECK ( hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost));
for (int i=0; i<N; i++) {
if (A_h[i] != memsetval) {
testResult = false;
printf("mismatch at index:%d computed:%02x, memsetval:%02x\n", i, (int)A_h[i], (int)memsetval);
break;
}
for ( int i = 0; i < iSize; i++ ) {
if ( A_h[i] != memsetval ) {
testResult = false;
printf("mismatch at index:%d computed:%02x, memsetval:%02x\n",
i, static_cast<int> (A_h[i]), static_cast<int> (memsetval));
break;
}
}
HIPCHECK(hipFree(A_d));
free(A_h);
return testResult;
HIPCHECK(hipFree(A_d));
free(A_h);
}
return testResult;
}
bool testhipMemsetD32(int memsetD32val,int p_gpuDevice)
{
size_t Nbytes = N*sizeof(int);
printf ("testhipMemsetD32 N=%zu memsetD32val=%8x device=%d\n", N, memsetD32val, p_gpuDevice);
int *A_d;
int *A_h;
bool testResult = true;
HIPCHECK ( hipMalloc(&A_d, Nbytes) );
A_h = (int*)malloc(Nbytes);
HIPCHECK ( hipMemsetD32((hipDeviceptr_t)A_d, memsetD32val, N) );
HIPCHECK ( hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost));
for (int i=0; i<N; i++) {
if (A_h[i] != memsetD32val) {
testResult = false; printf("mismatch at index:%d computed:%08x, memsetD32val:%08x\n", i, A_h[i], memsetD32val);
break;
}
template<typename T>
bool testhipMemset(T*A_h, T*A_d, T memsetval, enum MemsetType type,
int p_gpuDevice) {
size_t Nbytes = N * sizeof(T);
bool testResult = true;
HIPCHECK(hipMalloc(&A_d, Nbytes));
A_h = reinterpret_cast<T*> (malloc(Nbytes));
loop(offset, MAX_OFFSET) {
if (type == hipMemsetTypeDefault) {
printf("testhipMemset N=%zu memsetval=%2x device=%d\n",
(N - offset), memsetval, p_gpuDevice);
HIPCHECK(hipMemset(A_d + offset, memsetval, N - offset));
} else if (type == hipMemsetTypeD8) {
printf("testhipMemsetD8 N=%zu memsetD8val=%4x device=%d\n",
(N - offset), memsetval, p_gpuDevice);
HIPCHECK(hipMemsetD8((hipDeviceptr_t)(A_d + offset), memsetval, N - offset));
} else if (type == hipMemsetTypeD16) {
printf("testhipMemsetD16 N=%zu memsetD16val=%4x device=%d\n",
(N - offset), memsetval, p_gpuDevice);
HIPCHECK(hipMemsetD16((hipDeviceptr_t)(A_d + offset), memsetval, N - offset));
} else if (type == hipMemsetTypeD32) {
printf("testhipMemsetD32 N=%zu memsetD32val=%8x device=%d\n",
(N - offset), memsetval, p_gpuDevice);
HIPCHECK(hipMemsetD32((hipDeviceptr_t)(A_d + offset), memsetval, N - offset));
}
HIPCHECK(hipFree(A_d));
free(A_h);
return testResult;
}
bool testhipMemsetD16(short memsetD16val,int p_gpuDevice)
{
size_t Nbytes = N*sizeof(int);
printf ("testhipMemsetD16 N=%zu memsetD16val=%4x device=%d\n", N, memsetD16val, p_gpuDevice);
short *A_d;
short *A_h;
bool testResult = true;
HIPCHECK ( hipMalloc(&A_d, Nbytes) );
A_h = (short*)malloc(Nbytes);
HIPCHECK ( hipMemsetD16((hipDeviceptr_t)A_d, memsetD16val, N) );
HIPCHECK ( hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost));
for (int i=0; i<N; i++) {
if (A_h[i] != memsetD16val) {
testResult = false; printf("mismatch at index:%d computed:%08x, memsetD16val:%08x\n", i, A_h[i], memsetD32val);
break;
}
HIPCHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost) );
for ( int i = offset; i < N; i++ ) {
if (A_h[i] != memsetval) {
testResult = false;
printf("mismatch at index:%d computed:%02x, memsetval:%02x\n",
i, static_cast<int> (A_h[i]), static_cast<int> (memsetval));
break;
}
}
HIPCHECK(hipFree(A_d));
free(A_h);
return testResult;
}
HIPCHECK(hipFree(A_d));
free(A_h);
return testResult;
}
bool testhipMemsetD8(char memsetD8val,int p_gpuDevice)
{
size_t Nbytes = N*sizeof(int);
printf ("testhipMemsetD8 N=%zu memsetD8val=%4x device=%d\n", N, memsetD8val, p_gpuDevice);
char *A_d;
char *A_h;
bool testResult = true;
HIPCHECK ( hipMalloc(&A_d, Nbytes) );
A_h = (char*)malloc(Nbytes);
HIPCHECK ( hipMemsetD8((hipDeviceptr_t)A_d, memsetD8val, N) );
HIPCHECK ( hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost));
for (int i=0; i<N; i++) {
if (A_h[i] != memsetD8val) {
testResult = false; printf("mismatch at index:%d computed:%08x, memsetD8val:%08x\n", i, A_h[i], memsetD8val);
break;
}
template<typename T>
bool testhipMemsetAsync(T*A_h, T*A_d, T memsetval, enum MemsetType type,
int p_gpuDevice) {
size_t Nbytes = N * sizeof(T);
bool testResult = true;
HIPCHECK(hipMalloc(reinterpret_cast<void**> (&A_d), Nbytes));
A_h = reinterpret_cast<T*> (malloc(Nbytes));
hipStream_t stream;
HIPCHECK(hipStreamCreate(&stream));
loop(offset, MAX_OFFSET) {
if (type == hipMemsetTypeDefault) {
printf("testhipMemsetAsync N=%zu memsetval=%2x device=%d\n",
(N - offset), memsetval, p_gpuDevice);
HIPCHECK(hipMemsetAsync(A_d+offset, memsetval, Nbytes-offset, stream));
} else if (type == hipMemsetTypeD8) {
printf("testhipMemsetD8Async N=%zu memsetD8val=%2x device=%d\n",
(N - offset), memsetval, p_gpuDevice);
HIPCHECK(hipMemsetD8Async((hipDeviceptr_t)(A_d + offset), memsetval, N - offset, stream));
} else if (type == hipMemsetTypeD16) {
printf("testhipMemsetD16Async N=%zu memsetD16val=%8x device=%d\n",
(N - offset), memsetval, p_gpuDevice);
HIPCHECK(hipMemsetD16Async((hipDeviceptr_t)(A_d + offset), memsetval, N - offset, stream));
} else if (type == hipMemsetTypeD32) {
printf("testhipMemsetD32Async N=%zu memsetD32val=%8x device=%d\n",
(N - offset), memsetval, p_gpuDevice);
HIPCHECK(hipMemsetD32Async((hipDeviceptr_t)(A_d + offset), memsetval, N - offset, stream));
}
HIPCHECK(hipFree(A_d));
free(A_h);
return testResult;
}
HIPCHECK(hipStreamSynchronize(stream));
HIPCHECK(hipMemcpy(A_h, reinterpret_cast<void*> (A_d), Nbytes, hipMemcpyDeviceToHost));
bool testhipMemsetAsync(int memsetval,int p_gpuDevice)
{
size_t Nbytes = N*sizeof(int);
printf ("testhipMemsetAsync N=%zu memsetval=%2x device=%d\n", N, memsetval, p_gpuDevice);
char *A_d;
char *A_h;
bool testResult = true;
HIPCHECK ( hipMalloc((void**)&A_d, Nbytes) );
A_h = (char*)malloc(Nbytes);
hipStream_t stream;
HIPCHECK(hipStreamCreate(&stream));
HIPCHECK ( hipMemsetAsync(A_d, memsetval, Nbytes, stream ));
HIPCHECK ( hipStreamSynchronize(stream));
HIPCHECK ( hipMemcpy(A_h, (void*)A_d, Nbytes, hipMemcpyDeviceToHost));
for (int i=0; i<N; i++) {
if (A_h[i] != memsetval) {
testResult = false;
printf("mismatch at index:%d computed:%02x, memsetval:%02x\n", i, (int)A_h[i], (int)memsetval);
break;
}
for ( int i = offset; i < N; i++ ) {
if (A_h[i] != memsetval) {
testResult = false;
printf("mismatch at index:%d computed:%02x\n", i, static_cast<int> (A_h[i]));
break;
}
}
HIPCHECK(hipFree((void*)A_d));
HIPCHECK(hipStreamDestroy(stream));
free(A_h);
return testResult;
}
HIPCHECK(hipFree(reinterpret_cast<void*> (A_d)) );
HIPCHECK(hipStreamDestroy(stream));
free(A_h);
return testResult;
}
bool testhipMemsetD32Async(int memsetD32val,int p_gpuDevice)
{
size_t Nbytes = N*sizeof(int);
printf ("testhipMemsetD32Async N=%zu memsetval=%8x device=%d\n", N, memsetD32val, p_gpuDevice);
int *A_d;
int *A_h;
bool testResult = true;
HIPCHECK ( hipMalloc((void**)&A_d, Nbytes) );
A_h = (int*)malloc(Nbytes);
hipStream_t stream;
HIPCHECK(hipStreamCreate(&stream));
HIPCHECK ( hipMemsetD32Async((hipDeviceptr_t)A_d, memsetD32val, N, stream ));
HIPCHECK ( hipStreamSynchronize(stream));
HIPCHECK ( hipMemcpy(A_h, (void*)A_d, Nbytes, hipMemcpyDeviceToHost));
for (int i=0; i<N; i++) {
if (A_h[i] != memsetD32val) {
testResult = false;
printf("mismatch at index:%d computed:%02x, memsetD32val:%02x\n", i, A_h[i], memsetD32val);
break;
}
}
HIPCHECK(hipFree((void*)A_d));
HIPCHECK(hipStreamDestroy(stream));
free(A_h);
return testResult;
bool testhipMemset2AsyncOps() {
printf("testhipMemset2AsyncOps 2 memset operations at the same time\n");
std::vector<float> v;
v.resize(2048);
float* p2, *p3;
hipMalloc(reinterpret_cast<void**>(&p2), 4096 + 4096*2);
p3 = p2+2048;
hipStream_t s;
hipStreamCreate(&s);
hipMemsetAsync(p2, 0, 32*32*4, s);
hipMemsetD32Async(p3, 0x3fe00000, 32*32, s );
hipStreamSynchronize(s);
for (int i = 0; i < 256; ++i) {
hipMemsetAsync(p2, 0, 32*32*4, s);
hipMemsetD32Async(p3, 0x3fe00000, 32*32, s );
}
hipStreamSynchronize(s);
hipDeviceSynchronize();
hipMemcpy(&v[0], p2, 1024, hipMemcpyDeviceToHost);
hipMemcpy(&v[1024], p3, 1024, hipMemcpyDeviceToHost);
if ((v[0] != 0) || (v[1024] != 1.75f)) {
printf("mismatch (%f != 0) or (%f != 1.75f)\n", v[0], v[1024]);
return false;
}
return true;
}
bool testhipMemsetD16Async(short memsetD16val,int p_gpuDevice)
{
size_t Nbytes = N*sizeof(int);
printf ("testhipMemsetD16Async N=%zu memsetval=%8x device=%d\n", N, memsetD16val, p_gpuDevice);
short *A_d;
short *A_h;
bool testResult = true;
int main(int argc, char *argv[]) {
HipTest::parseStandardArguments(argc, argv, true);
bool testResult = true;
char * cA_d;
char * cA_h;
int16_t * siA_d;
int16_t * siA_h;
int32_t * iA_d;
int32_t * iA_h;
HIPCHECK(hipSetDevice(p_gpuDevice));
testResult &= testhipMemsetSmallSize(memsetval, p_gpuDevice);
HIPCHECK ( hipMalloc((void**)&A_d, Nbytes) );
A_h = (short*)malloc(Nbytes);
hipStream_t stream;
HIPCHECK(hipStreamCreate(&stream));
HIPCHECK ( hipMemsetD16Async((hipDeviceptr_t)A_d, memsetD16val, N, stream ));
HIPCHECK ( hipStreamSynchronize(stream));
HIPCHECK ( hipMemcpy(A_h, (void*)A_d, Nbytes, hipMemcpyDeviceToHost));
testResult &= testhipMemset(cA_h, cA_d, memsetval, hipMemsetTypeDefault, p_gpuDevice);
testResult &= testhipMemset(iA_h, iA_d, memsetD32val, hipMemsetTypeD32, p_gpuDevice);
testResult &= testhipMemset(siA_h, siA_d, memsetD16val, hipMemsetTypeD16, p_gpuDevice);
testResult &= testhipMemset(cA_h, cA_d, memsetD8val, hipMemsetTypeD8, p_gpuDevice);
for (int i=0; i<N; i++) {
if (A_h[i] != memsetD16val) {
testResult = false;
printf("mismatch at index:%d computed:%02x, memsetD16val:%02x\n", i, A_h[i], memsetD16val);
break;
}
}
HIPCHECK(hipFree((void*)A_d));
HIPCHECK(hipStreamDestroy(stream));
free(A_h);
return testResult;
}
bool testhipMemsetD8Async(char memsetD8val,int p_gpuDevice)
{
size_t Nbytes = N*sizeof(int);
printf ("testhipMemsetD8Async N=%zu memsetD8val=%2x device=%d\n", N, memsetD8val, p_gpuDevice);
char *A_d;
char *A_h;
bool testResult = true;
HIPCHECK ( hipMalloc((void**)&A_d, Nbytes) );
A_h = (char*)malloc(Nbytes);
hipStream_t stream;
HIPCHECK(hipStreamCreate(&stream));
HIPCHECK ( hipMemsetD8Async((hipDeviceptr_t)A_d, memsetD8val, N, stream ));
HIPCHECK ( hipStreamSynchronize(stream));
HIPCHECK ( hipMemcpy(A_h, (void*)A_d, Nbytes, hipMemcpyDeviceToHost));
for (int i=0; i<N; i++) {
if (A_h[i] != memsetD8val) {
testResult = false;
printf("mismatch at index:%d computed:%02x, memsetD8val:%02x\n", i, A_h[i], memsetD8val);
break;
}
}
HIPCHECK(hipFree((void*)A_d));
HIPCHECK(hipStreamDestroy(stream));
free(A_h);
return testResult;
}
int main(int argc, char *argv[])
{
HipTest::parseStandardArguments(argc, argv, true);
bool testResult = true;
HIPCHECK(hipSetDevice(p_gpuDevice));
testResult &= testhipMemset(memsetval, p_gpuDevice);
testResult &= testhipMemsetAsync(memsetval, p_gpuDevice);
testResult &= testhipMemsetD32(memsetD32val, p_gpuDevice);
testResult &= testhipMemsetD32Async(memsetD32val, p_gpuDevice);
testResult &= testhipMemsetD16(memsetD16val, p_gpuDevice);
testResult &= testhipMemsetD16Async(memsetD16val, p_gpuDevice);
testResult &= testhipMemsetD8(memsetD8val, p_gpuDevice);
testResult &= testhipMemsetD8Async(memsetD8val, p_gpuDevice);
if (testResult) passed();
failed("Output Mismatch\n");
testResult &= testhipMemsetAsync(cA_h, cA_d, memsetval, hipMemsetTypeDefault, p_gpuDevice);
testResult &= testhipMemsetAsync(iA_h, iA_d, memsetD32val, hipMemsetTypeD32, p_gpuDevice);
testResult &= testhipMemsetAsync(siA_h, siA_d, memsetD16val, hipMemsetTypeD16, p_gpuDevice);
testResult &= testhipMemsetAsync(cA_h, cA_d, memsetD8val, hipMemsetTypeD8, p_gpuDevice);
testResult &= testhipMemset2AsyncOps();
if (testResult) passed();
failed("Output Mismatch\n");
}