diff --git a/CMakeLists.txt b/CMakeLists.txt index 9a1d940421..3eedf35b09 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,6 +20,7 @@ endmacro() # Setup version information ############################# # Determine HIP_BASE_VERSION +set(ENV{HIP_PATH} "") execute_process(COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bin/hipconfig --version OUTPUT_VARIABLE HIP_BASE_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) diff --git a/hipify-clang/packaging/hipify-clang.txt b/hipify-clang/packaging/hipify-clang.txt index 5f78e7e67e..b189eff1e6 100644 --- a/hipify-clang/packaging/hipify-clang.txt +++ b/hipify-clang/packaging/hipify-clang.txt @@ -48,7 +48,7 @@ endif() set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}) set(CPACK_GENERATOR "TGZ;DEB;RPM") set(CPACK_BINARY_DEB "ON") -set(CPACK_DEBIAN_PACKAGE_DEPENDS "cuda >= 7.0") +set(CPACK_DEBIAN_PACKAGE_DEPENDS "cuda (>= 7.0)") set(CPACK_BINARY_RPM "ON") set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}") set(CPACK_RPM_PACKAGE_AUTOREQPROV "NO") diff --git a/include/hip/hcc_detail/hip_cooperative_groups_helper.h b/include/hip/hcc_detail/hip_cooperative_groups_helper.h index b74d16d23b..9738448d94 100644 --- a/include/hip/hcc_detail/hip_cooperative_groups_helper.h +++ b/include/hip/hcc_detail/hip_cooperative_groups_helper.h @@ -106,7 +106,7 @@ __CG_STATIC_QUALIFIER__ uint32_t size() { } __CG_STATIC_QUALIFIER__ uint32_t thread_rank() { - // Compute global id of the workgroup to which the current threads belongs to + // Compute global id of the workgroup to which the current thread belongs to uint32_t blkIdx = (uint32_t)((hipBlockIdx_z * hipGridDim_y * hipGridDim_x) + (hipBlockIdx_y * hipGridDim_x) + @@ -115,7 +115,7 @@ __CG_STATIC_QUALIFIER__ uint32_t thread_rank() { // Compute total number of threads being passed to reach current workgroup // within grid uint32_t num_threads_till_current_workgroup = - (uint32_t)(blkIdx * (hipBlockIdx_x * hipBlockIdx_y * hipBlockIdx_z)); + (uint32_t)(blkIdx * (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z)); // Compute thread local rank within current workgroup uint32_t local_thread_rank = diff --git a/packaging/hip-nvcc.postinst b/packaging/hip-nvcc.postinst index 2f901324cb..b70cf2848f 100755 --- a/packaging/hip-nvcc.postinst +++ b/packaging/hip-nvcc.postinst @@ -3,6 +3,6 @@ ROCMDIR=@ROCM_PATH@ HIPDIR=$ROCMDIR/hip -if [ -d $ROCMDIR] +if [ -d $ROCMDIR] ; then ln -s -f $ROCMDIR /opt/rocm fi diff --git a/packaging/hip-nvcc.prerm b/packaging/hip-nvcc.prerm index 96875e4a9c..baa0e6f5c7 100755 --- a/packaging/hip-nvcc.prerm +++ b/packaging/hip-nvcc.prerm @@ -1,5 +1,5 @@ #!/bin/bash -if [ -L "/opt/rocm" ] +if [ -L "/opt/rocm" ] ; then unlink /opt/rocm fi diff --git a/src/program_state.cpp b/src/program_state.cpp index dbd7d3ebc4..5e9f9976be 100644 --- a/src/program_state.cpp +++ b/src/program_state.cpp @@ -61,7 +61,7 @@ namespace hip_impl { if (it == impl->get_globals().end()) return nullptr; else - return it->second; + return it->second.first; } hsa_executable_t program_state::load_executable(const char* data, diff --git a/src/program_state.inl b/src/program_state.inl index 993418de96..8861558e04 100644 --- a/src/program_state.inl +++ b/src/program_state.inl @@ -18,6 +18,7 @@ #include #include #include +#include "hc.hpp" #include @@ -193,7 +194,8 @@ public: std::tuple< std::once_flag, std::mutex, - std::unordered_map> globals; + // map from string to pair + std::unordered_map>> globals; using RAII_code_reader = std::unique_ptr& get_globals() { + std::unordered_map>& get_globals() { std::call_once(std::get<0>(globals), [this]() { std::get<2>(globals).reserve(get_symbol_addresses().size()); }); @@ -349,30 +351,52 @@ public: auto& g_mutex = get_globals_mutex(); for (auto&& x : undefined_symbols) { - if (g.find(x) != g.cend()) return; - const auto it1 = get_symbol_addresses().find(x); - if (it1 == get_symbol_addresses().cend()) { hip_throw(std::runtime_error{ "Global symbol: " + x + " is undefined."}); } - std::lock_guard lck{g_mutex}; + hsa_status_t status; + auto check_hsa_global_var_define_error = [&x](hsa_status_t s) { + if (s != HSA_STATUS_SUCCESS) { + const char* es; + hsa_status_string(s, &es); + hip_throw(std::runtime_error{ "Error when defining symbol " + x + " : " + es}); + } + }; - if (g.find(x) != g.cend()) return; + auto retrieve_pinned_address_from_cache = [](decltype(g) g, decltype(x) x) { + const auto& global_addr = g.find(x); + if (global_addr != g.cend()) { + return global_addr->second.second; + } + return (void*)nullptr; + }; - g.emplace(x, (void*)(it1->second.first)); - void* p = nullptr; - hsa_amd_memory_lock( - reinterpret_cast(it1->second.first), - it1->second.second, - nullptr, // All agents. - 0, - &p); - - hsa_executable_agent_global_variable_define( - executable, agent, x.c_str(), p); + void* p = retrieve_pinned_address_from_cache(g, x); + if (p == nullptr) { + std::lock_guard lck{g_mutex}; + p = retrieve_pinned_address_from_cache(g, x); + if (p == nullptr) { + if (x == "_ZN2hc13printf_bufferE") { + // This is the printf buffer, get the pinned address from HCC + p = Kalmar::getContext()->getPrintfBufferPointerVA(); + } + else { + status = hsa_amd_memory_lock(reinterpret_cast(it1->second.first), + it1->second.second, + nullptr, // All agents. + 0, &p); + check_hsa_global_var_define_error(status); + } + // cache the global address and its pinned address + g.emplace(x, std::make_pair(reinterpret_cast(it1->second.first), p)); + } + } + status = hsa_executable_agent_global_variable_define( + executable, agent, x.c_str(), p); + check_hsa_global_var_define_error(status); } } diff --git a/tests/src/kernel/hipPrintfKernel.cpp b/tests/src/kernel/hipPrintfKernel.cpp index 1d4fa5fe30..5675f2e6bd 100644 --- a/tests/src/kernel/hipPrintfKernel.cpp +++ b/tests/src/kernel/hipPrintfKernel.cpp @@ -30,7 +30,12 @@ THE SOFTWARE. __global__ void run_printf() { printf("Hello World\n"); } int main() { - hipLaunchKernelGGL(HIP_KERNEL_NAME(run_printf), dim3(1), dim3(1), 0, 0); - hipDeviceSynchronize(); + int device_count = 0; + hipGetDeviceCount(&device_count); + for (int i = 0; i < device_count; ++i) { + hipSetDevice(i); + hipLaunchKernelGGL(HIP_KERNEL_NAME(run_printf), dim3(1), dim3(1), 0, 0); + hipDeviceSynchronize(); + } passed(); }