From 294ffbb51f1e67d9ebae30e4cf37bfe330c18075 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 5 Apr 2017 19:40:00 -0500 Subject: [PATCH] Doc cleanup & add bug descript for restrict specifier issue --- hipamd/docs/markdown/hip_bugs.md | 92 ++++++++++++-------------------- 1 file changed, 34 insertions(+), 58 deletions(-) diff --git a/hipamd/docs/markdown/hip_bugs.md b/hipamd/docs/markdown/hip_bugs.md index 234dec4e0e..c53b68d796 100644 --- a/hipamd/docs/markdown/hip_bugs.md +++ b/hipamd/docs/markdown/hip_bugs.md @@ -15,9 +15,11 @@ Some common code practices may lead to hipcc generating a error with the form : undefined reference to `__hcLaunchKernel__ZN15vecAddNamespace6vecAddIidEEv16grid_launch_parmPT0_S3_S3_T_ To workaround, try: -- Avoid calling hcLaunchKernel from a function with the __host__ attribute +- Avoid calling hipLaunchKernel from a function with the __host__ attribute +``` __host__ MyFunc(…) { hipLaunchKernel(myKernel, …) +``` - Avoid use of static with kernel definition: static __global__ MyKernel - Avoid defining kernels in anonymous namespace @@ -25,25 +27,6 @@ namespace { __global__ MyKernel … - Avoid calling member functions -If hipLaunchKernel takes parameters that request explicitly memcpy, then it will cause application hang. -Reason is that the hipLaunchKernel macro locks the stream. -If kernel paramters are actually function calls which invoke other hip apis (i.e. memcpy) to the same stream, then deadlock occurs. - -To workaround, try: -Move the function calls so they occur outside the hipLaunchKernel macro, store results in temps, then use the tems inside the kernel. - -``` -// Example pseudo code causing system hang: -// "bottom[0]->gpu_data()" calls hipMemcpy() implicitly and using the same stream, cause deadlock condition. -hipLaunchKernel(HIP_KERNEL_NAME(LRNComputeDiff),dim3(CAFFE_GET_BLOCKS(n_threads)), dim3(CAFFE_HIP_NUM_THREADS), 0, 0, n_threads, - bottom[0]->gpu_data()); - -// Move "gpu_data()" ouside of hipLaunchKernel to avoid hang. -auto bot_gpu_data = bottom[0]->gpu_data(); -hipLaunchKernel( LRNComputeDiff, dim3(CAFFE_GET_BLOCKS(n_threads)), dim3(CAFFE_HIP_NUM_THREADS), 0, 0, n_threads, - bot_gpu_data); - -``` ### What is the current limitation of HIP Generic Grid Launch method? 1. __global__ functions cannot be marked as static or put in an unnamed namespace i.e. they cannot be given internal linkage (this would clash with __attribute__((weak))); @@ -115,87 +98,80 @@ Rather than create serializer functions, another workaround is to pass the membe ### HIP is more restrictive in enforcing restrictions -By the language specification, both for HIP and CUDA it is forbidden to call a +The language specification for HIP and CUDA forbid calling a `__device__` function in a `__host__` context. In practice, you may observe differences in the strictness of this restriction, with HIP exhibiting a tighter -adherence to the specification i.e. being less tolerant of infringing code. The -solution is to always ensure that all functions which are to be called in a +adherence to the specification and thus less tolerant of infringing code. The +solution is to ensure that all functions which are called in a `__device__` context are correctly annotated to reflect it. An interesting case -where these differences emerge is shown below (this has been lifted from -production code, and relies on a the common [C++ Member Detector idiom][1], as it -would be implemented pre C++11): +where these differences emerge is shown below. This relies on a the common +[C++ Member Detector idiom][1], as it would be implemented pre C++11): + ```c++ #include #include -struct meta_yes { char a[1]; }; -struct meta_no { char a[2]; }; +struct aye { bool a[1]; }; +struct nay { bool a[2]; }; // Dual restriction is necessary in HIP if the detector is to work for // __device__ contexts as well as __host__ ones. NVCC is less strict. template __host__ __device__ -const T& return_ref(); +const T& cref_t(); template -struct has_nullary_operator { +struct Has_call_operator { // Dual restriction is necessary in HIP if the detector is to work for // __device__ contexts as well as __host__ ones. NVCC is less strict. template __host__ __device__ static - meta_yes testFunctor( + aye test( C const *, typename std::enable_if< - (sizeof(return_ref().operator()()) > 0)>::type* = nullptr); + (sizeof(cref_t().operator()()) > 0)>::type* = nullptr); static - meta_no testFunctor(...); + nay test(...); - enum { - value = sizeof(testFunctor(static_cast(0))) == sizeof(meta_yes) }; + enum { value = sizeof(test(static_cast(0))) == sizeof(aye) }; }; -template< - typename Scalar, - typename NullaryOp, - bool has_nullary = has_nullary_operator::value> -struct nullary_wrapper { - template - T packetOp() const { return T{1}; } +template::value> +struct Wrapper { + template + V f() const { return T{1}; } }; -template -struct nullary_wrapper { - template - T packetOp() const { return T{10}; } +template +struct Wrapper { + template + V f() const { return T{10}; } }; -// This specialisation will fail to compile. -template -struct nullary_wrapper {}; +// This specialisation will yield a compile-time error, if selected. +template +struct Wrapper {}; template -struct UniformRandomGenerator; +struct Functor; -template<> struct UniformRandomGenerator { - float operator()() const [[hc]] { return 42.0; } +template<> struct Functor { + __device__ + float operator()() const { return 42.0f; } }; __device__ void this_will_not_compile_if_detector_is_not_marked_device() { - float f = - nullary_wrapper< - float, UniformRandomGenerator>().packetOp(); + float f = Wrapper>().f(); } __host__ void this_will_not_compile_if_detector_is_marked_device_only() { - float f = - nullary_wrapper< - float, UniformRandomGenerator>().packetOp(); + float f = Wrapper>().f(); } ``` [1]: https://en.wikibooks.org/wiki/More_C%2B%2B_Idioms/Member_Detector