From 294ffbb51f1e67d9ebae30e4cf37bfe330c18075 Mon Sep 17 00:00:00 2001
From: Ben Sander <ben.sander@amd.com>
Date: Wed, 5 Apr 2017 19:40:00 -0500
Subject: [PATCH] Doc cleanup & add bug descript for restrict specifier issue

---
 hipamd/docs/markdown/hip_bugs.md | 92 ++++++++++++--------------------
 1 file changed, 34 insertions(+), 58 deletions(-)
diff --git a/hipamd/docs/markdown/hip_bugs.md b/hipamd/docs/markdown/hip_bugs.md
index 234dec4e0e..c53b68d796 100644
--- a/hipamd/docs/markdown/hip_bugs.md
+++ b/hipamd/docs/markdown/hip_bugs.md
@@ -15,9 +15,11 @@ Some common code practices may lead to hipcc generating a error with the form :
 undefined reference to `__hcLaunchKernel__ZN15vecAddNamespace6vecAddIidEEv16grid_launch_parmPT0_S3_S3_T_
 
 To workaround, try:
-- Avoid calling hcLaunchKernel from a function with the __host__ attribute
+- Avoid calling hipLaunchKernel from a function with the __host__ attribute
+```
 __host__ MyFunc(…) {
 hipLaunchKernel(myKernel, …)
+```
 - Avoid use of static with kernel definition:
 static __global__ MyKernel 
 - Avoid defining kernels in anonymous namespace
@@ -25,25 +27,6 @@ namespace {
 __global__ MyKernel …
 - Avoid calling member functions 
 
-If hipLaunchKernel takes parameters that request explicitly memcpy, then it will cause application hang. 
-Reason is that the hipLaunchKernel macro locks the stream.  
-If kernel paramters are actually function calls which invoke other hip apis (i.e. memcpy) to the same stream, then deadlock occurs.
-
-To workaround, try:
-Move the function calls so they occur outside the hipLaunchKernel macro, store results in temps, then use the tems inside the kernel.
-
-```
-// Example pseudo code causing system hang:
-// "bottom[0]->gpu_data()" calls hipMemcpy() implicitly and using the same stream, cause deadlock condition.
-hipLaunchKernel(HIP_KERNEL_NAME(LRNComputeDiff),dim3(CAFFE_GET_BLOCKS(n_threads)), dim3(CAFFE_HIP_NUM_THREADS), 0, 0, n_threads,
-       bottom[0]->gpu_data());
-       
-// Move "gpu_data()" ouside of hipLaunchKernel to avoid hang. 
-auto bot_gpu_data = bottom[0]->gpu_data();
-hipLaunchKernel( LRNComputeDiff, dim3(CAFFE_GET_BLOCKS(n_threads)), dim3(CAFFE_HIP_NUM_THREADS), 0, 0, n_threads, 
-       bot_gpu_data);
-
-```
 
 ### What is the current limitation of HIP Generic Grid Launch method?
 1. __global__ functions cannot be marked as static or put in an unnamed namespace i.e. they cannot be given internal linkage (this would clash with __attribute__((weak)));
@@ -115,87 +98,80 @@ Rather than create serializer functions, another workaround is to pass the membe
 
 
 ### HIP is more restrictive in enforcing restrictions
-By the language specification, both for HIP and CUDA it is forbidden to call a
+The language specification for HIP and CUDA forbid calling a
 `__device__` function in a `__host__` context. In practice, you may observe
 differences in the strictness of this restriction, with HIP exhibiting a tighter
-adherence to the specification i.e. being less tolerant of infringing code. The
-solution is to always ensure that all functions which are to be called in a
+adherence to the specification and thus less tolerant of infringing code. The
+solution is to ensure that all functions which are called in a
 `__device__` context are correctly annotated to reflect it. An interesting case
-where these differences emerge is shown below (this has been lifted from
-production code, and relies on a the common [C++ Member Detector idiom][1], as it
-would be implemented pre C++11):
+where these differences emerge is shown below.  This relies on a the common 
+[C++ Member Detector idiom][1], as it would be implemented pre C++11):  
+
 ```c++
 #include <cassert>
 #include <type_traits>
 
-struct meta_yes { char a[1]; };
-struct meta_no  { char a[2]; };
+struct aye { bool a[1]; };
+struct nay { bool a[2]; };
 
 // Dual restriction is necessary in HIP if the detector is to work for
 // __device__ contexts as well as __host__ ones. NVCC is less strict.
 template<typename T>
 __host__ __device__
-const T& return_ref();
+const T& cref_t();
 
 template<typename T>
-struct has_nullary_operator {
+struct Has_call_operator {
     // Dual restriction is necessary in HIP if the detector is to work for
     // __device__ contexts as well as __host__ ones. NVCC is less strict.
     template<typename C>
     __host__ __device__
     static
-    meta_yes testFunctor(
+    aye test(
         C const *,
         typename std::enable_if<
-            (sizeof(return_ref<C>().operator()()) > 0)>::type* = nullptr);
+            (sizeof(cref_t<C>().operator()()) > 0)>::type* = nullptr);
     static
-    meta_no testFunctor(...);
+    nay test(...);
 
-    enum {
-        value = sizeof(testFunctor(static_cast<T*>(0))) == sizeof(meta_yes) };
+    enum { value = sizeof(test(static_cast<T*>(0))) == sizeof(aye) };
 };
 
-template<
-    typename Scalar,
-    typename NullaryOp,
-    bool has_nullary = has_nullary_operator<NullaryOp>::value>
-struct nullary_wrapper {
-    template<typename T>
-    T packetOp() const { return T{1}; }
+template<typename T, typename U, bool callable = has_call_operator<U>::value>
+struct Wrapper {
+    template<typename V>
+    V f() const { return T{1}; }
 };
 
 
-template<typename Scalar, typename NullaryOp>
-struct nullary_wrapper<Scalar, NullaryOp, true> {
-    template<typename T>
-    T packetOp() const { return T{10}; }
+template<typename T, typename U>
+struct Wrapper<T, U, true> {
+    template<typename V>
+    V f() const { return T{10}; }
 };
 
-// This specialisation will fail to compile.
-template<typename Scalar, typename NullaryOp>
-struct nullary_wrapper<Scalar, NullaryOp, false> {};
+// This specialisation will yield a compile-time error, if selected.
+template<typename T, typename U>
+struct Wrapper<T, U, false> {};
 
 template<typename T>
-struct UniformRandomGenerator;
+struct Functor;
 
-template<> struct UniformRandomGenerator<float> {
-    float operator()() const [[hc]] { return 42.0; }
+template<> struct Functor<float> {
+    __device__
+    float operator()() const { return 42.0f; }
 };
 
 __device__
 void this_will_not_compile_if_detector_is_not_marked_device()
 {
-    float f =
-        nullary_wrapper<
-            float, UniformRandomGenerator<float>>().packetOp<float>();
+    float f = Wrapper<float, Functor<float>>().f<float>();
 }
 
 __host__
 void this_will_not_compile_if_detector_is_marked_device_only()
 {
-    float f =
-        nullary_wrapper<
-            float, UniformRandomGenerator<float>>().packetOp<float>();
+    float f = Wrapper<float, Functor<float>>().f<float>();
 }
 ```
 [1]: https://en.wikibooks.org/wiki/More_C%2B%2B_Idioms/Member_Detector