diff --git a/projects/hip/bin/hipcc b/projects/hip/bin/hipcc index fd88e63c33..8f19c69881 100755 --- a/projects/hip/bin/hipcc +++ b/projects/hip/bin/hipcc @@ -620,7 +620,17 @@ foreach $arg (@ARGV) $toolArgs = substr $toolArgs, 0, -8; chomp $toolArgs; } + } elsif ($arg eq 'c' and $prevArg eq '-x') { + $hasC = 1; + $hasCXX = 0; + $hasHIP = 0; + } elsif ($arg eq 'c++' and $prevArg eq '-x') { + $hasC = 0; + $hasCXX = 1; + $hasHIP = 0; } elsif ($arg eq 'hip' and $prevArg eq '-x') { + $hasC = 0; + $hasCXX = 0; $hasHIP = 1; } elsif ($arg =~ m/^-/) { # options start with - @@ -647,28 +657,35 @@ foreach $arg (@ARGV) #print "O: <$arg>\n"; } elsif ($prevArg ne '-o') { # input files and libraries - if ($arg =~ /\.c$/) { - $hasC = 1; + # Skip guessing if `-x {c|c++|hip}` is already specified. + if (not ($hasC or $hasCXX or $hasHIP)) { + if ($arg =~ /\.c$/) { + $hasC = 1; + $needCFLAGS = 1; + $toolArgs .= " -x c" + } elsif (($arg =~ /\.cpp$/) or ($arg =~ /\.cxx$/) or ($arg =~ /\.cc$/) ) { + $needCXXFLAGS = 1; + if ($HIP_COMPILE_CXX_AS_HIP eq '0' or $HIP_COMPILER ne "clang") { + $hasCXX = 1; + } else { + $hasHIP = 1; + $toolArgs .= " -x hip"; + } + } elsif ((($arg =~ /\.cu$/ or $arg =~ /\.cuh$/) and $HIP_COMPILE_CXX_AS_HIP ne '0') or ($arg =~ /\.hip$/)) { + $needCXXFLAGS = 1; + if ($HIP_PLATFORM eq "hcc" and $HIP_COMPILER eq "clang") { + $hasHIP = 1; + $toolArgs .= " -x hip"; + } else { + $hasCU = 1; + } + } + } elsif ($hasC) { $needCFLAGS = 1; - $toolArgs .= " -x c" - } - elsif (($arg =~ /\.cpp$/) or ($arg =~ /\.cxx$/) or ($arg =~ /\.cc$/) ) { + } elsif ($hasCXX) { $needCXXFLAGS = 1; - if ($HIP_COMPILE_CXX_AS_HIP eq '0' or $HIP_COMPILER ne "clang") { - $hasCXX = 1; - } else { - $hasHIP = 1; - $toolArgs .= " -x hip"; - } - } - elsif ((($arg =~ /\.cu$/ or $arg =~ /\.cuh$/) and $HIP_COMPILE_CXX_AS_HIP ne '0') or ($arg =~ /\.hip$/)) { + } else { $needCXXFLAGS = 1; - if ($HIP_PLATFORM eq "hcc" and $HIP_COMPILER eq "clang") { - $hasHIP = 1; - $toolArgs .= " -x hip"; - } else { - $hasCU = 1; - } } push (@inputs, $arg); #print "I: <$arg>\n"; diff --git a/projects/hip/cmake/FindHIP/run_hipcc.cmake b/projects/hip/cmake/FindHIP/run_hipcc.cmake index 24b754c874..1d8f3b38ad 100644 --- a/projects/hip/cmake/FindHIP/run_hipcc.cmake +++ b/projects/hip/cmake/FindHIP/run_hipcc.cmake @@ -35,7 +35,8 @@ set(HIP_CLANG_PARALLEL_BUILD_COMPILE_OPTIONS "@HIP_CLANG_PARALLEL_BUILD_COMPILE_ @_HIP_HCC_FLAGS@ @_HIP_CLANG_FLAGS@ @_HIP_NVCC_FLAGS@ -set(HIP_HIPCC_INCLUDE_ARGS "@HIP_HIPCC_INCLUDE_ARGS@") # list (needs to be in quotes to handle spaces properly) +#Needed to bring the HIP_HIPCC_INCLUDE_ARGS variable in scope +set(HIP_HIPCC_INCLUDE_ARGS @HIP_HIPCC_INCLUDE_ARGS@) # list set(cmake_dependency_file "@cmake_dependency_file@") # path set(source_file "@source_file@") # path diff --git a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h index 17c34b0ad5..744885923f 100644 --- a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h @@ -513,7 +513,7 @@ hipError_t hipDeviceGetSharedMemConfig(hipSharedMemConfig* pConfig); * * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue */ -hipError_t hipGetDeviceFlags(unsigned *flags); +hipError_t hipGetDeviceFlags(unsigned int* flags); /** * @brief The bank width of shared memory on current device is set diff --git a/projects/hip/rocclr/CMakeLists.txt b/projects/hip/rocclr/CMakeLists.txt index aaee0a4d0b..9d05de586d 100644 --- a/projects/hip/rocclr/CMakeLists.txt +++ b/projects/hip/rocclr/CMakeLists.txt @@ -132,6 +132,8 @@ target_include_directories(hip64 ${PROJECT_SOURCE_DIR} ${PROJECT_SOURCE_DIR}/amdocl ${PROJECT_SOURCE_DIR}/include/hip/hcc_detail/elfio + + # FIXME: Remove ROCclr_DIr explicit references ${ROCclr_DIR} ${ROCclr_DIR}/include ${ROCclr_DIR}/compiler/lib @@ -139,6 +141,7 @@ target_include_directories(hip64 ${ROCclr_DIR}/elf/utils/common ${ROCclr_DIR}/elf/utils/libelf ${ROCR_INCLUDES} + $ $) target_compile_definitions(hip64 PRIVATE diff --git a/projects/hip/rocclr/hip_device.cpp b/projects/hip/rocclr/hip_device.cpp index 8695ef43e5..b8a5e65d29 100644 --- a/projects/hip/rocclr/hip_device.cpp +++ b/projects/hip/rocclr/hip_device.cpp @@ -99,7 +99,7 @@ hipError_t hipDeviceComputeCapability(int *major, int *minor, hipDevice_t device } hipError_t hipDeviceGetCount(int* count) { - HIP_INIT_API(NONE, count); + HIP_INIT_API(hipDeviceGetCount, count); HIP_RETURN(ihipDeviceGetCount(count)); } @@ -238,7 +238,7 @@ hipError_t hipGetDeviceProperties ( hipDeviceProp_t* props, hipDevice_t device ) } hipError_t hipHccGetAccelerator(int deviceId, hc::accelerator* acc) { - HIP_INIT_API(NONE, deviceId, acc); + HIP_INIT_API(hipHccGetAccelerator, deviceId, acc); assert(0 && "Unimplemented"); @@ -246,7 +246,7 @@ hipError_t hipHccGetAccelerator(int deviceId, hc::accelerator* acc) { } hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view** av) { - HIP_INIT_API(NONE, stream, av); + HIP_INIT_API(hipHccGetAcceleratorView, stream, av); assert(0 && "Unimplemented"); diff --git a/projects/hip/rocclr/hip_device_runtime.cpp b/projects/hip/rocclr/hip_device_runtime.cpp index 531f35c732..3823af5206 100644 --- a/projects/hip/rocclr/hip_device_runtime.cpp +++ b/projects/hip/rocclr/hip_device_runtime.cpp @@ -480,7 +480,7 @@ hipError_t hipGetDeviceFlags ( unsigned int* flags ) { } hipError_t hipIpcGetEventHandle ( hipIpcEventHandle_t* handle, hipEvent_t event ) { - HIP_INIT_API(NONE, handle, event); + HIP_INIT_API(hipIpcGetEventHandle, handle, event); assert(0 && "Unimplemented"); @@ -488,7 +488,7 @@ hipError_t hipIpcGetEventHandle ( hipIpcEventHandle_t* handle, hipEvent_t event } hipError_t hipIpcOpenEventHandle ( hipEvent_t* event, hipIpcEventHandle_t handle ) { - HIP_INIT_API(NONE, event, handle); + HIP_INIT_API(hipIpcOpenEventHandle, event, handle); assert(0 && "Unimplemented"); @@ -542,7 +542,7 @@ hipError_t hipSetDeviceFlags ( unsigned int flags ) { } hipError_t hipSetValidDevices ( int* device_arr, int len ) { - HIP_INIT_API(NONE, device_arr, len); + HIP_INIT_API(hipSetValidDevices, device_arr, len); assert(0 && "Unimplemented"); diff --git a/projects/hip/rocclr/hip_event.cpp b/projects/hip/rocclr/hip_event.cpp index dddadd1bfd..2e4870834f 100644 --- a/projects/hip/rocclr/hip_event.cpp +++ b/projects/hip/rocclr/hip_event.cpp @@ -90,9 +90,15 @@ hipError_t Event::elapsedTime(Event& eStop, float& ms) { return hipErrorNotReady; } - ms = static_cast(static_cast(eStop.event_->profilingInfo().end_ - + // For certain HIP Api's that take start and stop event + // the command is the same + if (event_ == eStop.event_) { + ms = static_cast(static_cast(eStop.event_->profilingInfo().end_ - + event_->profilingInfo().start_))/1000000.f; + } else { + ms = static_cast(static_cast(eStop.event_->profilingInfo().end_ - event_->profilingInfo().end_))/1000000.f; - + } return hipSuccess; } diff --git a/projects/hip/rocclr/hip_memory.cpp b/projects/hip/rocclr/hip_memory.cpp index 4bd75b8ffb..3aad7f57ba 100755 --- a/projects/hip/rocclr/hip_memory.cpp +++ b/projects/hip/rocclr/hip_memory.cpp @@ -110,6 +110,9 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin // Skip if nothing needs writing. return hipSuccess; } + if (dst == nullptr || src == nullptr) { + return hipErrorInvalidValue; + } amd::Command* command = nullptr; amd::Command::EventWaitList waitList; @@ -1717,7 +1720,9 @@ hipError_t ihipMemset(void* dst, int64_t value, size_t valueSize, size_t sizeByt } size_t offset = 0; - amd::Memory* memory = getMemoryObject(dst, offset); + auto aligned_dst = amd::alignUp(reinterpret_cast
(dst), sizeof(uint64_t)); + + amd::Memory* memory = getMemoryObject(aligned_dst, offset); if (memory == nullptr) { // Host alloced memory memset(dst, value, sizeBytes); @@ -1727,44 +1732,52 @@ hipError_t ihipMemset(void* dst, int64_t value, size_t valueSize, size_t sizeByt hipError_t hip_error = hipSuccess; amd::HostQueue* queue = hip::getQueue(stream); + size_t n_head_bytes = 0; + size_t n_tail_bytes = 0; int64_t value64 = 0; - const size_t uint64ModSize = (sizeBytes % sizeof(int64_t)); - if (sizeBytes/sizeof(int64_t) > 0) { + n_head_bytes = static_cast(aligned_dst) - static_cast(dst); if (valueSize == sizeof(int8_t)) { value = value & 0xff; value64 = ((value << 56) | (value << 48) | (value << 40) | (value << 32) | (value << 24) | (value << 16) | (value << 8) | (value)); } else if (valueSize == sizeof(int16_t)) { value = value & 0xffff; - value64 = ((value << 48) | (value << 32) | (value<<16) | (value)); - } else if(valueSize == sizeof(int32_t)) { + value64 = ((value << 48) | (value << 32) | (value << 16) | (value)); + } else if (valueSize == sizeof(int32_t)) { value = value & 0xffffffff; - value64 = ((value<<32) | (value)); + value64 = ((value << 32) | (value)); } else if (valueSize == sizeof(int64_t)) { value64 = value; } else { LogPrintfError("Unsupported Pattern size: %u \n", valueSize); return hipErrorInvalidValue; } - // If uint64ModSize is != 0 then we will do a second fillBuffer Command + n_tail_bytes = ((sizeBytes - n_head_bytes) % sizeof(int64_t)); + // If n_tail_bytes is != 0 then we will do a second fillBuffer Command // on the same stream below, dont wait, do the first call async. hip_error = packFillMemoryCommand(memory, offset, value64, sizeof(int64_t), - sizeBytes - uint64ModSize, queue, - ((uint64ModSize != 0) || isAsync)); - if(hip_error != hipSuccess) { + sizeBytes - n_tail_bytes - n_head_bytes, queue, + ((n_head_bytes != 0) || (n_tail_bytes != 0) || isAsync)); + if (hip_error != hipSuccess) { return hip_error; } + } else { + n_head_bytes = sizeBytes; } - if (uint64ModSize != 0) { - void* new_dst = reinterpret_cast((reinterpret_cast
(dst) - + sizeBytes) - uint64ModSize); + if (n_head_bytes != 0) { + memory = getMemoryObject(dst, offset); + hip_error = packFillMemoryCommand(memory, offset, value, valueSize, + n_head_bytes , queue, isAsync); + } + + if (n_tail_bytes != 0) { + void* new_dst = (reinterpret_cast
(dst) + sizeBytes) - n_tail_bytes; memory = getMemoryObject(new_dst, offset); hip_error = packFillMemoryCommand(memory, offset, value, valueSize, - uint64ModSize, queue, isAsync); + n_tail_bytes, queue, isAsync); } - return hip_error; } @@ -2090,7 +2103,7 @@ hipError_t hipArrayGetDescriptor(HIP_ARRAY_DESCRIPTOR* pArrayDescriptor, hipError_t hipMemcpyParam2DAsync(const hip_Memcpy2D* pCopy, hipStream_t stream) { - HIP_INIT_API(hipMemcpyParam2D, pCopy); + HIP_INIT_API(hipMemcpyParam2DAsync, pCopy); HIP_RETURN(ihipMemcpyParam2D(pCopy, stream, true)); } diff --git a/projects/hip/rocclr/hip_module.cpp b/projects/hip/rocclr/hip_module.cpp index 7cda6864c6..3a51001063 100755 --- a/projects/hip/rocclr/hip_module.cpp +++ b/projects/hip/rocclr/hip_module.cpp @@ -119,7 +119,7 @@ hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image, void** optionsValues) { /* TODO: Pass options to Program */ - HIP_INIT_API(hipModuleLoadData, module, image); + HIP_INIT_API(hipModuleLoadDataEx, module, image); HIP_RETURN(ihipModuleLoadData(module, image, 0)); } @@ -369,7 +369,7 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0, uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0, uint64_t prevGridSum = 0, uint64_t allGridSum = 0, uint32_t firstDevice = 0) { - HIP_INIT_API(NONE, f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, + HIP_INIT_API(ihipModuleLaunchKernel, f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, flags, params); hip::Function* function = hip::Function::asFunction(f); @@ -493,7 +493,7 @@ hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, hipStream_t hStream, void** kernelParams, void** extra, hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags) { - HIP_INIT_API(NONE, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, + HIP_INIT_API(hipExtModuleLaunchKernel, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY, localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, flags); @@ -512,7 +512,7 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t gridDimX, hipEvent_t startEvent, hipEvent_t stopEvent) { - HIP_INIT_API(NONE, f, gridDimX, gridDimY, gridDimZ, + HIP_INIT_API(hipHccModuleLaunchKernel, f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent); @@ -529,7 +529,7 @@ hipError_t hipModuleLaunchKernelExt(hipFunction_t f, uint32_t gridDimX, hipEvent_t startEvent, hipEvent_t stopEvent) { - HIP_INIT_API(NONE, f, gridDimX, gridDimY, gridDimZ, + HIP_INIT_API(hipModuleLaunchKernelExt, f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent); diff --git a/projects/hip/rocclr/hip_platform.cpp b/projects/hip/rocclr/hip_platform.cpp index 8759ef47f1..0be928f9a4 100755 --- a/projects/hip/rocclr/hip_platform.cpp +++ b/projects/hip/rocclr/hip_platform.cpp @@ -191,13 +191,6 @@ void PlatformState::init() for (auto& it : vars_) { it.second.rvars.resize(g_devices.size()); } - if (!HIP_ENABLE_LAZY_KERNEL_LOADING) { - for (size_t i = 0; i < g_devices.size(); ++i) { - for (auto& it: functions_) { - getFunc(it.first, i); - } - } - } } bool PlatformState::unregisterFunc(hipModule_t hmod) { @@ -599,6 +592,13 @@ void PlatformState::popExec(ihipExec_t& exec) { execStack_.pop(); } +namespace { +const int HIP_ENABLE_DEFERRED_LOADING{[] () { + char *var = getenv("HIP_ENABLE_DEFERRED_LOADING"); + return var ? atoi(var) : 1; +}()}; +} /* namespace */ + extern "C" void __hipRegisterFunction( std::vector >* modules, const void* hostFunction, @@ -613,9 +613,12 @@ extern "C" void __hipRegisterFunction( { PlatformState::DeviceFunction func{ std::string{deviceName}, modules, std::vector{g_devices.size()}}; PlatformState::instance().registerFunction(hostFunction, func); -// for (size_t i = 0; i < g_devices.size(); ++i) { -// PlatformState::instance().getFunc(hostFunction, i); -// } + if (!HIP_ENABLE_DEFERRED_LOADING) { + HIP_INIT(); + for (size_t i = 0; i < g_devices.size(); ++i) { + PlatformState::instance().getFunc(hostFunction, i); + } + } } // Registers a device-side global variable. @@ -706,7 +709,7 @@ extern "C" hipError_t hipConfigureCall( size_t sharedMem, hipStream_t stream) { - HIP_INIT_API(NONE, gridDim, blockDim, sharedMem, stream); + HIP_INIT_API(hipConfigureCall, gridDim, blockDim, sharedMem, stream); PlatformState::instance().configureCall(gridDim, blockDim, sharedMem, stream); @@ -719,7 +722,7 @@ extern "C" hipError_t __hipPushCallConfiguration( size_t sharedMem, hipStream_t stream) { - HIP_INIT_API(NONE, gridDim, blockDim, sharedMem, stream); + HIP_INIT_API(__hipPushCallConfiguration, gridDim, blockDim, sharedMem, stream); PlatformState::instance().configureCall(gridDim, blockDim, sharedMem, stream); @@ -730,7 +733,7 @@ extern "C" hipError_t __hipPopCallConfiguration(dim3 *gridDim, dim3 *blockDim, size_t *sharedMem, hipStream_t *stream) { - HIP_INIT_API(NONE, gridDim, blockDim, sharedMem, stream); + HIP_INIT_API(__hipPopCallConfiguration, gridDim, blockDim, sharedMem, stream); ihipExec_t exec; PlatformState::instance().popExec(exec); @@ -747,7 +750,7 @@ extern "C" hipError_t hipSetupArgument( size_t size, size_t offset) { - HIP_INIT_API(NONE, arg, size, offset); + HIP_INIT_API(hipSetupArgument, arg, size, offset); PlatformState::instance().setupArgument(arg, size, offset); @@ -756,7 +759,7 @@ extern "C" hipError_t hipSetupArgument( extern "C" hipError_t hipLaunchByPtr(const void *hostFunction) { - HIP_INIT_API(NONE, hostFunction); + HIP_INIT_API(hipLaunchByPtr, hostFunction); ihipExec_t exec; PlatformState::instance().popExec(exec); @@ -1301,7 +1304,7 @@ extern "C" hipError_t hipLaunchKernel(const void *hostFunction, size_t sharedMemBytes, hipStream_t stream) { - HIP_INIT_API(NONE, hostFunction, gridDim, blockDim, args, sharedMemBytes, + HIP_INIT_API(hipLaunchKernel, hostFunction, gridDim, blockDim, args, sharedMemBytes, stream); hip::Stream* s = reinterpret_cast(stream); diff --git a/projects/hip/rocclr/hip_prof_gen.py b/projects/hip/rocclr/hip_prof_gen.py index c20df3c1aa..c508888140 100755 --- a/projects/hip/rocclr/hip_prof_gen.py +++ b/projects/hip/rocclr/hip_prof_gen.py @@ -68,8 +68,8 @@ def filtr_api_name(name): return name def filtr_api_decl(record): - record = re.sub("\s__dparm\([^\)]*\)", '', record); - record = re.sub("\(void\*\)", '', record); + record = re.sub("\s__dparm\([^\)]*\)", r'', record); + record = re.sub("\(void\*\)", r'', record); return record # Normalizing API arguments @@ -216,6 +216,8 @@ def parse_content(inp_file_p, api_map, out): api_name = "" # Valid public API found flag api_valid = 0 + # API overload (parameters mismatch) + api_overload = 0 # Input file patched content content = '' @@ -240,6 +242,7 @@ def parse_content(inp_file_p, api_map, out): # Looking for API begin if found == 0: + record = re.sub(r'\s*extern\s+"C"\s+', r'', record); if beg_pattern.match(record): found = 1 record = filtr_api_decl(record) @@ -255,6 +258,8 @@ def parse_content(inp_file_p, api_map, out): # Checking if complete API matched if m: found = 2 + api_valid = 0 + api_overload = 0 api_name = filtr_api_name(m.group(2)) # Checking if API name is in the API map if (private_check_mode == 0) or (api_name in api_map): @@ -280,48 +285,34 @@ def parse_content(inp_file_p, api_map, out): out[api_name] = filtr_api_opts(api_args) # Register missmatched API methods else: + api_overload = 1 # Warning about mismatched API, possible non public overloaded version api_diff = '\t\t' + inp_file + " line(" + str(line_num) + ")\n\t\tapi: " + api_types + "\n\t\teta: " + eta_types message("\t" + api_name + ' args mismatch:\n' + api_diff + '\n') - if hip_patch_mode != 0: - # Looking for INIT macro - m = init_pattern.match(line) - if m: - if api_valid == 0: api_name = 'NONE' - - if api_name == m.group(3): - if hip_patch_mode == 1: hip_patch_mode = 0 - else: fatal("patching failed") - else: - hip_patch_mode = 2 - init_args = m.group(2) - if init_args != '': init_args = ', ' + init_args - line = m.group(1) + '(' + api_name + init_args + m.group(5) + '\n' - non_public_api = 0 - # API found action if found == 2: - # Looking for INIT macro + if hip_patch_mode != 0: + # Looking for INIT macro + m = init_pattern.match(line) + if m: + init_name = api_name + if api_overload == 1: init_name = 'NONE' + init_args = m.group(4) + line = m.group(1) + '(' + init_name + init_args + m.group(5) + '\n' + m = init_pattern.match(line) if m: found = 0 - non_public_api = 0 - - if api_valid == 1: - api_valid = 0 - message("\t" + api_name) - else: - non_public_api = 1 - - if non_public_api == 1: + if api_valid == 1: message("\t" + api_name) + # Ignore if it is initialized as NONE + init_name = m.group(3) + if init_name != 'NONE': + # Check if init name matching API name + if init_name != api_name: + fatal("init name mismatch: '" + init_name + "' <> '" + api_name + "'") # Registering dummy API for non public API if the name in INIT is not NONE - init_name = m.group(3) - # Ignore if it is initialized as NONE - if init_name != 'NONE': - # Check if init name matching API name - if init_name != api_name: - fatal("init name mismatch: '" + init_name + "' <> '" + api_name + "'") + if api_valid == 0: # If init name is not in public API map then it is private API # else it was not identified and will be checked on finish if not init_name in api_map: @@ -333,7 +324,6 @@ def parse_content(inp_file_p, api_map, out): # Expect INIT macro for valid public API # Removing and registering non-conformant APIs with missing HIP_INIT macro if api_valid == 1: - api_valid = 0 if api_name in out: del out[api_name] del api_map[api_name] diff --git a/projects/hip/samples/2_Cookbook/13_occupancy/occupancy.cpp b/projects/hip/samples/2_Cookbook/13_occupancy/occupancy.cpp index e772e82b1d..7b32b6d0fa 100644 --- a/projects/hip/samples/2_Cookbook/13_occupancy/occupancy.cpp +++ b/projects/hip/samples/2_Cookbook/13_occupancy/occupancy.cpp @@ -44,8 +44,6 @@ void multiplyCPU(float* C, float* A, float* B, int N){ } } -#if defined(__HIP_PLATFORM_HCC__) && GENERIC_GRID_LAUNCH == 1 && defined(__HCC__) - void launchKernel(float* C, float* A, float* B, bool manual){ hipDeviceProp_t devProp; @@ -95,10 +93,8 @@ void launchKernel(float* C, float* A, float* B, bool manual){ std::cout << "Theoretical Occupancy is " << (double)numBlock* blockSize/devProp.maxThreadsPerMultiProcessor * 100 << "%" << std::endl; } } -#endif int main() { -#if defined(__HIP_PLATFORM_HCC__) && GENERIC_GRID_LAUNCH == 1 && defined(__HCC__) float *A, *B, *C0, *C1, *cpuC; float *Ad, *Bd, *C0d, *C1d; int errors=0; @@ -177,8 +173,5 @@ int main() { free(C0); free(C1); free(cpuC); -#else - std::cout <<"hipOccupancyMaxPotentialBlockSize template not support for Clang compiler"<= 0; offset --) +#include #include "hip/hip_runtime.h" #include "test_common.h" +enum MemsetType { + hipMemsetTypeDefault, + hipMemsetTypeD8, + hipMemsetTypeD16, + hipMemsetTypeD32 +}; -bool testhipMemset(int memsetval,int p_gpuDevice) +bool testhipMemsetSmallSize(int memsetval, int p_gpuDevice) { - size_t Nbytes = N*sizeof(char); - printf ("testhipMemset N=%zu memsetval=%2x device=%d\n", N, memsetval, p_gpuDevice); - char *A_d; - char *A_h; - bool testResult = true; + char *A_d; + char *A_h; + bool testResult = true; + for ( size_t iSize = 1; iSize < 4; iSize++ ) { + size_t Nbytes = iSize * sizeof(char); + HIPCHECK(hipMalloc(&A_d, Nbytes)); + A_h = reinterpret_cast (malloc(Nbytes)); + printf("testhipMemsetSmallSize N=%zu memsetval=%2x device=%d\n", + iSize, memsetval, p_gpuDevice); + HIPCHECK(hipMemset(A_d, memsetval, Nbytes)); + HIPCHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost)); - HIPCHECK ( hipMalloc(&A_d, Nbytes) ); - A_h = (char*)malloc(Nbytes); - HIPCHECK ( hipMemset(A_d, memsetval, Nbytes) ); - HIPCHECK ( hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost)); - - for (int i=0; i (A_h[i]), static_cast (memsetval)); + break; + } } - HIPCHECK(hipFree(A_d)); - free(A_h); - return testResult; + HIPCHECK(hipFree(A_d)); + free(A_h); + } + return testResult; } -bool testhipMemsetD32(int memsetD32val,int p_gpuDevice) -{ - size_t Nbytes = N*sizeof(int); - printf ("testhipMemsetD32 N=%zu memsetD32val=%8x device=%d\n", N, memsetD32val, p_gpuDevice); - int *A_d; - int *A_h; - bool testResult = true; - - HIPCHECK ( hipMalloc(&A_d, Nbytes) ); - A_h = (int*)malloc(Nbytes); - HIPCHECK ( hipMemsetD32((hipDeviceptr_t)A_d, memsetD32val, N) ); - HIPCHECK ( hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost)); - - for (int i=0; i +bool testhipMemset(T*A_h, T*A_d, T memsetval, enum MemsetType type, + int p_gpuDevice) { + size_t Nbytes = N * sizeof(T); + bool testResult = true; + HIPCHECK(hipMalloc(&A_d, Nbytes)); + A_h = reinterpret_cast (malloc(Nbytes)); + loop(offset, MAX_OFFSET) { + if (type == hipMemsetTypeDefault) { + printf("testhipMemset N=%zu memsetval=%2x device=%d\n", + (N - offset), memsetval, p_gpuDevice); + HIPCHECK(hipMemset(A_d + offset, memsetval, N - offset)); + } else if (type == hipMemsetTypeD8) { + printf("testhipMemsetD8 N=%zu memsetD8val=%4x device=%d\n", + (N - offset), memsetval, p_gpuDevice); + HIPCHECK(hipMemsetD8((hipDeviceptr_t)(A_d + offset), memsetval, N - offset)); + } else if (type == hipMemsetTypeD16) { + printf("testhipMemsetD16 N=%zu memsetD16val=%4x device=%d\n", + (N - offset), memsetval, p_gpuDevice); + HIPCHECK(hipMemsetD16((hipDeviceptr_t)(A_d + offset), memsetval, N - offset)); + } else if (type == hipMemsetTypeD32) { + printf("testhipMemsetD32 N=%zu memsetD32val=%8x device=%d\n", + (N - offset), memsetval, p_gpuDevice); + HIPCHECK(hipMemsetD32((hipDeviceptr_t)(A_d + offset), memsetval, N - offset)); } - HIPCHECK(hipFree(A_d)); - free(A_h); - return testResult; -} - -bool testhipMemsetD16(short memsetD16val,int p_gpuDevice) -{ - size_t Nbytes = N*sizeof(int); - printf ("testhipMemsetD16 N=%zu memsetD16val=%4x device=%d\n", N, memsetD16val, p_gpuDevice); - short *A_d; - short *A_h; - bool testResult = true; - - HIPCHECK ( hipMalloc(&A_d, Nbytes) ); - A_h = (short*)malloc(Nbytes); - HIPCHECK ( hipMemsetD16((hipDeviceptr_t)A_d, memsetD16val, N) ); - HIPCHECK ( hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost)); - - for (int i=0; i (A_h[i]), static_cast (memsetval)); + break; + } } - HIPCHECK(hipFree(A_d)); - free(A_h); - return testResult; + } + HIPCHECK(hipFree(A_d)); + free(A_h); + return testResult; } -bool testhipMemsetD8(char memsetD8val,int p_gpuDevice) -{ - size_t Nbytes = N*sizeof(int); - printf ("testhipMemsetD8 N=%zu memsetD8val=%4x device=%d\n", N, memsetD8val, p_gpuDevice); - char *A_d; - char *A_h; - bool testResult = true; - - HIPCHECK ( hipMalloc(&A_d, Nbytes) ); - A_h = (char*)malloc(Nbytes); - HIPCHECK ( hipMemsetD8((hipDeviceptr_t)A_d, memsetD8val, N) ); - HIPCHECK ( hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost)); - - for (int i=0; i +bool testhipMemsetAsync(T*A_h, T*A_d, T memsetval, enum MemsetType type, + int p_gpuDevice) { + size_t Nbytes = N * sizeof(T); + bool testResult = true; + HIPCHECK(hipMalloc(reinterpret_cast (&A_d), Nbytes)); + A_h = reinterpret_cast (malloc(Nbytes)); + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); + loop(offset, MAX_OFFSET) { + if (type == hipMemsetTypeDefault) { + printf("testhipMemsetAsync N=%zu memsetval=%2x device=%d\n", + (N - offset), memsetval, p_gpuDevice); + HIPCHECK(hipMemsetAsync(A_d+offset, memsetval, Nbytes-offset, stream)); + } else if (type == hipMemsetTypeD8) { + printf("testhipMemsetD8Async N=%zu memsetD8val=%2x device=%d\n", + (N - offset), memsetval, p_gpuDevice); + HIPCHECK(hipMemsetD8Async((hipDeviceptr_t)(A_d + offset), memsetval, N - offset, stream)); + } else if (type == hipMemsetTypeD16) { + printf("testhipMemsetD16Async N=%zu memsetD16val=%8x device=%d\n", + (N - offset), memsetval, p_gpuDevice); + HIPCHECK(hipMemsetD16Async((hipDeviceptr_t)(A_d + offset), memsetval, N - offset, stream)); + } else if (type == hipMemsetTypeD32) { + printf("testhipMemsetD32Async N=%zu memsetD32val=%8x device=%d\n", + (N - offset), memsetval, p_gpuDevice); + HIPCHECK(hipMemsetD32Async((hipDeviceptr_t)(A_d + offset), memsetval, N - offset, stream)); } - HIPCHECK(hipFree(A_d)); - free(A_h); - return testResult; -} + HIPCHECK(hipStreamSynchronize(stream)); + HIPCHECK(hipMemcpy(A_h, reinterpret_cast (A_d), Nbytes, hipMemcpyDeviceToHost)); -bool testhipMemsetAsync(int memsetval,int p_gpuDevice) -{ - size_t Nbytes = N*sizeof(int); - printf ("testhipMemsetAsync N=%zu memsetval=%2x device=%d\n", N, memsetval, p_gpuDevice); - char *A_d; - char *A_h; - bool testResult = true; - - HIPCHECK ( hipMalloc((void**)&A_d, Nbytes) ); - A_h = (char*)malloc(Nbytes); - hipStream_t stream; - HIPCHECK(hipStreamCreate(&stream)); - HIPCHECK ( hipMemsetAsync(A_d, memsetval, Nbytes, stream )); - HIPCHECK ( hipStreamSynchronize(stream)); - HIPCHECK ( hipMemcpy(A_h, (void*)A_d, Nbytes, hipMemcpyDeviceToHost)); - - for (int i=0; i (A_h[i])); + break; + } } - HIPCHECK(hipFree((void*)A_d)); - HIPCHECK(hipStreamDestroy(stream)); - free(A_h); - return testResult; + } + HIPCHECK(hipFree(reinterpret_cast (A_d)) ); + HIPCHECK(hipStreamDestroy(stream)); + free(A_h); + return testResult; } -bool testhipMemsetD32Async(int memsetD32val,int p_gpuDevice) -{ - size_t Nbytes = N*sizeof(int); - printf ("testhipMemsetD32Async N=%zu memsetval=%8x device=%d\n", N, memsetD32val, p_gpuDevice); - int *A_d; - int *A_h; - bool testResult = true; - - HIPCHECK ( hipMalloc((void**)&A_d, Nbytes) ); - A_h = (int*)malloc(Nbytes); - hipStream_t stream; - HIPCHECK(hipStreamCreate(&stream)); - HIPCHECK ( hipMemsetD32Async((hipDeviceptr_t)A_d, memsetD32val, N, stream )); - HIPCHECK ( hipStreamSynchronize(stream)); - HIPCHECK ( hipMemcpy(A_h, (void*)A_d, Nbytes, hipMemcpyDeviceToHost)); - - for (int i=0; i v; + v.resize(2048); + float* p2, *p3; + hipMalloc(reinterpret_cast(&p2), 4096 + 4096*2); + p3 = p2+2048; + hipStream_t s; + hipStreamCreate(&s); + hipMemsetAsync(p2, 0, 32*32*4, s); + hipMemsetD32Async(p3, 0x3fe00000, 32*32, s ); + hipStreamSynchronize(s); + for (int i = 0; i < 256; ++i) { + hipMemsetAsync(p2, 0, 32*32*4, s); + hipMemsetD32Async(p3, 0x3fe00000, 32*32, s ); + } + hipStreamSynchronize(s); + hipDeviceSynchronize(); + hipMemcpy(&v[0], p2, 1024, hipMemcpyDeviceToHost); + hipMemcpy(&v[1024], p3, 1024, hipMemcpyDeviceToHost); + if ((v[0] != 0) || (v[1024] != 1.75f)) { + printf("mismatch (%f != 0) or (%f != 1.75f)\n", v[0], v[1024]); + return false; + } + return true; } -bool testhipMemsetD16Async(short memsetD16val,int p_gpuDevice) -{ - size_t Nbytes = N*sizeof(int); - printf ("testhipMemsetD16Async N=%zu memsetval=%8x device=%d\n", N, memsetD16val, p_gpuDevice); - short *A_d; - short *A_h; - bool testResult = true; +int main(int argc, char *argv[]) { + HipTest::parseStandardArguments(argc, argv, true); + bool testResult = true; + char * cA_d; + char * cA_h; + int16_t * siA_d; + int16_t * siA_h; + int32_t * iA_d; + int32_t * iA_h; + HIPCHECK(hipSetDevice(p_gpuDevice)); + testResult &= testhipMemsetSmallSize(memsetval, p_gpuDevice); - HIPCHECK ( hipMalloc((void**)&A_d, Nbytes) ); - A_h = (short*)malloc(Nbytes); - hipStream_t stream; - HIPCHECK(hipStreamCreate(&stream)); - HIPCHECK ( hipMemsetD16Async((hipDeviceptr_t)A_d, memsetD16val, N, stream )); - HIPCHECK ( hipStreamSynchronize(stream)); - HIPCHECK ( hipMemcpy(A_h, (void*)A_d, Nbytes, hipMemcpyDeviceToHost)); + testResult &= testhipMemset(cA_h, cA_d, memsetval, hipMemsetTypeDefault, p_gpuDevice); + testResult &= testhipMemset(iA_h, iA_d, memsetD32val, hipMemsetTypeD32, p_gpuDevice); + testResult &= testhipMemset(siA_h, siA_d, memsetD16val, hipMemsetTypeD16, p_gpuDevice); + testResult &= testhipMemset(cA_h, cA_d, memsetD8val, hipMemsetTypeD8, p_gpuDevice); - for (int i=0; i