Merge branch 'amd-develop' into amd-master

Change-Id: I53d5a8916d769c4f0fe60d2ee3b240551da80b4f (cherry picked from commit 01c523f6c9)
2017-04-07 15:40:09 +05:30
@@ -1,49 +1,177 @@
-# HIP Bugs 
+# HIP Bugs

 <!-- toc -->

- [Errors related to undefined reference to `__hcLaunchKernel__***__grid_launch_parm**](#errors-related-to-undefined-reference-to-hclaunchkernel__grid_launch_parm)
- [Application hangs after a hipLaunchKernel call](#what-if-i-see-application-hangs-after-a-hiplaunchkernel-call)
+- [Errors related to undefined reference to `__hcLaunchKernel__***__grid_launch_parm**`](#errors-related-to-undefined-reference-to-__hclaunchkernel____grid_launch_parm)
 - [What is the current limitation of HIP Generic Grid Launch method?](#what-is-the-current-limitation-of-hip-generic-grid-launch-method)
+- [Errors related to `no matching constructor`](#errors-related-to-no-matching-constructor)
+- [HIP is more restrictive in enforcing restrictions](#hip-is-more-restrictive-in-enforcing-restrictions)

 <!-- tocstop -->

-### Errors related to undefined reference to `__hcLaunchKernel__***__grid_launch_parm**
+### Errors related to undefined reference to `__hcLaunchKernel__***__grid_launch_parm**`

 Some common code practices may lead to hipcc generating a error with the form :
 undefined reference to `__hcLaunchKernel__ZN15vecAddNamespace6vecAddIidEEv16grid_launch_parmPT0_S3_S3_T_

-To workaround, try:
- Avoid calling hcLaunchKernel from a function with the __host__ attribute
-__host__ MyFunc(…) {
-hipLaunchKernel(myKernel, …)
+Suggested workarounds:
 - Avoid use of static with kernel definition:
+```c++
 static __global__ MyKernel 
- Avoid defining kernels in anonymous namespace
+```
+
+- Avoid defining kernels in anonymous namespace :
+```c++
 namespace {
-__global__ MyKernel …
- Avoid calling member functions 
-
-If hipLaunchKernel takes parameters that request explicitly memcpy, then it will cause application hang. 
-Reason is that the hipLaunchKernel macro locks the stream.  
-If kernel paramters are actually function calls which invoke other hip apis (i.e. memcpy) to the same stream, then deadlock occurs.
-
-To workaround, try:
-Move the function calls so they occur outside the hipLaunchKernel macro, store results in temps, then use the tems inside the kernel.
-
+  __global__ MyKernel
+}
 ```
-// Example pseudo code causing system hang:
-// "bottom[0]->gpu_data()" calls hipMemcpy() implicitly and using the same stream, cause deadlock condition.
-hipLaunchKernel(HIP_KERNEL_NAME(LRNComputeDiff),dim3(CAFFE_GET_BLOCKS(n_threads)), dim3(CAFFE_HIP_NUM_THREADS), 0, 0, n_threads,
-       bottom[0]->gpu_data());
-       
-// Move "gpu_data()" ouside of hipLaunchKernel to avoid hang. 
-auto bot_gpu_data = bottom[0]->gpu_data();
-hipLaunchKernel( LRNComputeDiff, dim3(CAFFE_GET_BLOCKS(n_threads)), dim3(CAFFE_HIP_NUM_THREADS), 0, 0, n_threads, 
-       bot_gpu_data);

-```

 ### What is the current limitation of HIP Generic Grid Launch method?
 1. __global__ functions cannot be marked as static or put in an unnamed namespace i.e. they cannot be given internal linkage (this would clash with __attribute__((weak)));
-2. using the macro based dispatch mechanism i.e. hipLaunchKernel* only works for functions that take no more than 20 arguments (this limit can be increased up to 126, and is temporary until we can enable C++14 mode and use variadic generic lambdas); no such limitation applies do dispatching directly through grid_launch.
+2. using the macro based dispatch mechanism i.e. hipLaunchKernel* only works for functions that take no more than 20 arguments (this limit can be increased up to 126, and is temporary until we can enable C++14 mode and use variadic generic lambdas); no such limitation applies do dispatching directly through grid_launch.
+
+
+### Errors related to `no matching constructor`
+
+The symptom is the compiler would complain about errors like `no matching constructor` for classes/structs passed as arguments into a GPU kernel. Often, this is caused by a design limitation in HCC where array-typed member variables inside a class/struct can’t be correctly passed into GPU kernels. To mitigate this issue, a custom serializer/deserializer pair is provided.
+
+For example, `Foo` in the code snippets below contains an array-typed member variable `table`, which would fail the compiler if used as a kernel argument.
+
+```
+struct Foo {
+  // table is an array, which makes foo
+  int table[3];
+};
+```
+
+An workaround is to provide a custom serializer on CPU side, and append the contents of the array as kernel arguments:
+
+```
+
+struct Foo {
+  int table[3];
+
+  // user-provided CPU serializer
+  // must append the contents of the array member as kernel arguments
+#ifdef __HCC__
+  __attribute__((annotate(“serialize”)))
+  void __cxxamp_serialize(Kalmar::Serialize &s) const {
+    for (int i = 0; i < 3; ++i)
+      s.Append(sizeof(int), &table[i]);
+  }
+#endif
+};
+```
+
+Then, provide a custom deserializer on GPU side, to help reconstruct the array within GPU kernels. Notice that the deserializer can not be a function template, and should have scalar-typed parameters of the number equals to the length of the array-typed member variable. For example:
+
+```
+struct Foo {
+  int table[3];
+
+  // user-provided GPU deserializer
+  // table has 3 int elements, so deserializer must have 3 int parameters.
+#ifdef __HCC__
+  __attribute__((annotate(“user_deserialize”)))
+  Foo(int x0, int x1, int x2) [[cpu]][[hc]] {
+    table[0] = x0;
+    table[1] = x1;
+    table[2] = x2;
+  }
+#endif
+
+#ifdef __HCC__
+  __attribute__((annotate(“serialize”)))
+  void __cxxamp_serialize(Kalmar::Serialize &s) const {
+    s.Append(sizeof(int), &table[0]);
+    s.Append(sizeof(int), &table[1]);
+    s.Append(sizeof(int), &table[2]);
+  }
+#endif
+};
+```
+
+
+Rather than create serializer functions, another workaround is to pass the member fields from the structure as simple data types.
+
+
+### HIP is more restrictive in enforcing restrictions
+The language specification for HIP and CUDA forbid calling a
+`__device__` function in a `__host__` context. In practice, you may observe
+differences in the strictness of this restriction, with HIP exhibiting a tighter
+adherence to the specification and thus less tolerant of infringing code. The
+solution is to ensure that all functions which are called in a
+`__device__` context are correctly annotated to reflect it. An interesting case
+where these differences emerge is shown below.  This relies on a the common 
+[C++ Member Detector idiom][1], as it would be implemented pre C++11):  
+
+```c++
+#include <cassert>
+#include <type_traits>
+
+struct aye { bool a[1]; };
+struct nay { bool a[2]; };
+
+// Dual restriction is necessary in HIP if the detector is to work for
+// __device__ contexts as well as __host__ ones. NVCC is less strict.
+template<typename T>
+__host__ __device__
+const T& cref_t();
+
+template<typename T>
+struct Has_call_operator {
+    // Dual restriction is necessary in HIP if the detector is to work for
+    // __device__ contexts as well as __host__ ones. NVCC is less strict.
+    template<typename C>
+    __host__ __device__
+    static
+    aye test(
+        C const *,
+        typename std::enable_if<
+            (sizeof(cref_t<C>().operator()()) > 0)>::type* = nullptr);
+    static
+    nay test(...);
+
+    enum { value = sizeof(test(static_cast<T*>(0))) == sizeof(aye) };
+};
+
+template<typename T, typename U, bool callable = has_call_operator<U>::value>
+struct Wrapper {
+    template<typename V>
+    V f() const { return T{1}; }
+};
+
+
+template<typename T, typename U>
+struct Wrapper<T, U, true> {
+    template<typename V>
+    V f() const { return T{10}; }
+};
+
+// This specialisation will yield a compile-time error, if selected.
+template<typename T, typename U>
+struct Wrapper<T, U, false> {};
+
+template<typename T>
+struct Functor;
+
+template<> struct Functor<float> {
+    __device__
+    float operator()() const { return 42.0f; }
+};
+
+__device__
+void this_will_not_compile_if_detector_is_not_marked_device()
+{
+    float f = Wrapper<float, Functor<float>>().f<float>();
+}
+
+__host__
+void this_will_not_compile_if_detector_is_marked_device_only()
+{
+    float f = Wrapper<float, Functor<float>>().f<float>();
+}
+```
+[1]: https://en.wikibooks.org/wiki/More_C%2B%2B_Idioms/Member_Detector
@@ -4,7 +4,7 @@

 - [What APIs and features does HIP support?](#what-apis-and-features-does-hip-support)
 - [What is not supported?](#what-is-not-supported)
-  * [Run-time features](#run-time-features)
+  * [Runtime/Driver API features](#runtimedriver-api-features)
  * [Kernel language features](#kernel-language-features)
 - [Is HIP a drop-in replacement for CUDA?](#is-hip-a-drop-in-replacement-for-cuda)
 - [What specific version of CUDA does HIP support?](#what-specific-version-of-cuda-does-hip-support)
@@ -23,10 +23,11 @@
 - [On HCC, can I link HIP code with host code compiled with another compiler such as gcc, icc, or clang ?](#on-hcc-can-i-link-hip-code-with-host-code-compiled-with-another-compiler-such-as-gcc-icc-or-clang-)
 - [HIP detected my platform (hcc vs nvcc) incorrectly - what should I do?](#hip-detected-my-platform-hcc-vs-nvcc-incorrectly---what-should-i-do)
 - [Can I install both CUDA SDK and HCC on same machine?](#can-i-install-both-cuda-sdk-and-hcc-on-same-machine)
+- [On CUDA, can I mix CUDA code with HIP code?](#on-cuda-can-i-mix-cuda-code-with-hip-code)
+- [On HCC, can I use HC functionality with HIP?](#on-hcc-can-i-use-hc-functionality-with-hip)
 - [How do I trace HIP application flow?](#how-do-i-trace-hip-application-flow)
-  * [Using CodeXL markers for HIP Functions](#using-codexl-markers-for-hip-functions)
-  * [Using HIP_TRACE_API](#using-hip_trace_api)
- [How do I enable HIP Generic Grid Launch option?](#how-do-i-enable-hip-generic-grid-launch-option)
+- [What if HIP generates error of "symbol multiply defined!" only on AMD machine?](#what-if-hip-generates-error-of-symbol-multiply-defined-only-on-amd-machine)
+- [How do I disable HIP Generic Grid Launch option?](#how-do-i-disable-hip-generic-grid-launch-option)

 <!-- tocstop -->

@@ -44,6 +44,7 @@
 - [Pragma Unroll](#pragma-unroll)
 - [In-Line Assembly](#in-line-assembly)
 - [C++ Support](#c-support)
+- [Kernel Compilation](#kernel-compilation)

 <!-- tocstop -->

@@ -21,6 +21,7 @@ and provides practical suggestions on how to port CUDA code and work through com
  * [Device-Architecture Properties](#device-architecture-properties)
  * [Table of Architecture Properties](#table-of-architecture-properties)
 - [Finding HIP](#finding-hip)
+- [hipLaunchKernel](#hiplaunchkernel)
 - [Compiler Options](#compiler-options)
 - [Linking Issues](#linking-issues)
  * [Linking With hipcc](#linking-with-hipcc)
@@ -31,9 +32,11 @@ and provides practical suggestions on how to port CUDA code and work through com
  * [Using a Standard C++ Compiler](#using-a-standard-c-compiler)
    + [cuda.h](#cudah)
  * [Choosing HIP File Extensions](#choosing-hip-file-extensions)
-  * [Workarounds](#workarounds)
-    + [warpSize](#warpsize)
-    + [Textures and Cache Control](#textures-and-cache-control)
+- [Workarounds](#workarounds)
+  * [warpSize](#warpsize)
+- [memcpyToSymbol](#memcpytosymbol)
+- [threadfence_system](#threadfence_system)
+  * [Textures and Cache Control](#textures-and-cache-control)
 - [More Tips](#more-tips)
  * [HIPTRACE Mode](#hiptrace-mode)
  * [Environment Variables](#environment-variables)
@@ -4,26 +4,32 @@ This section describes the profiling and debugging capabilities that HIP provide
 Profiling information can viewed in the CodeXL visualization tool or printed directly to stderr as the application runs.
 This document starts with some of the general capabilities of CodeXL and then describes some of the additional HIP marker and debug features.

- * [CodeXL Profiling](#codexl-profiling)
-   * [Collecting and Viewing Traces](#collecting-and-viewing-traces)
-     * [Using rocm-profiler timestamp profiling](#using-rocm-profiler-timestamp-profiling)
-     * [Using rocm-profiler performance counter collection:](#using-rocm-profiler-performance-counter-collection)
-     * [Using CodeXL to view profiling results:](#using-codexl-to-view-profiling-results)
-     * [More information on CodeXL](#more-information-on-codexl)
-   * [HIP Markers](#hip-markers)
-     * [Profiling HIP APIs](#profiling-hip-apis)
-     * [Adding markers to applications](#adding-markers-to-applications)
-   * [Additional HIP Profiling Features](#additional-hip-profiling-features)
-     * [Demangling C   Kernel Names](#demangling-c-kernel-names)
-     * [Controlling when profiling starts and ends](#controlling-when-profiling-starts-and-ends)
-     * [Reducing timeline trace output file size](#reducing-timeline-trace-output-file-size)
-     * [How to enable profiling at HIP build time](#how-to-enable-profiling-at-hip-build-time)
- * [Tracing and Debug](#tracing-and-debug)
-   * [Tracing HIP APIs](#tracing-hip-apis)
-     * [Color](#color)
-   * [Using HIP_DB](#using-hip_db)
-   * [Using ltrace](#using-ltrace)
-   * [Chicken bits](#chicken-bits)
+<!-- toc -->
+
+- [CodeXL Profiling](#codexl-profiling)
+  * [Collecting and Viewing Traces](#collecting-and-viewing-traces)
+    + [Using rocm-profiler timestamp profiling](#using-rocm-profiler-timestamp-profiling)
+    + [Using rocm-profiler performance counter collection:](#using-rocm-profiler-performance-counter-collection)
+    + [Using CodeXL to view profiling results:](#using-codexl-to-view-profiling-results)
+    + [More information on CodeXL](#more-information-on-codexl)
+  * [HIP Markers](#hip-markers)
+    + [Profiling HIP APIs](#profiling-hip-apis)
+    + [Adding markers to applications](#adding-markers-to-applications)
+  * [Additional HIP Profiling Features](#additional-hip-profiling-features)
+    + [Demangling C++ Kernel Names](#demangling-c-kernel-names)
+    + [Controlling when profiling starts and ends](#controlling-when-profiling-starts-and-ends)
+    + [Reducing timeline trace output file size](#reducing-timeline-trace-output-file-size)
+    + [How to enable profiling at HIP build time](#how-to-enable-profiling-at-hip-build-time)
+- [Tracing and Debug](#tracing-and-debug)
+  * [Tracing HIP APIs](#tracing-hip-apis)
+    + [Color](#color)
+  * [Using HIP_DB](#using-hip_db)
+  * [Using ltrace](#using-ltrace)
+  * [Chicken bits](#chicken-bits)
+  * [Debugging HIP Applications](#debugging-hip-applications)
+  * [General Debugging Tips](#general-debugging-tips)
+
+<!-- tocstop -->

 ## CodeXL Profiling

@@ -2343,9 +2343,6 @@ private:
    LangOptions DefaultLangOptions;
    SmallString<40> XStr;
    raw_svector_ostream OS(XStr);
-    StringRef initialParamList;
-    OS << "hipLaunchParm lp";
-    size_t repLength = OS.str().size();
    SourceLocation sl = kernelDecl->getNameInfo().getEndLoc();
    SourceLocation kernelArgListStart = Lexer::findLocationAfterToken(sl, tok::l_paren, *SM, DefaultLangOptions, true);
    DEBUG(dbgs() << kernelArgListStart.printToString(*SM));
@@ -2355,14 +2352,12 @@ private:
      SourceLocation kernelArgListStart(pvdFirst->getLocStart());
      SourceLocation kernelArgListEnd(pvdLast->getLocEnd());
      SourceLocation stop = Lexer::getLocForEndOfToken(kernelArgListEnd, 0, *SM, DefaultLangOptions);
-      repLength += SM->getCharacterData(stop) - SM->getCharacterData(kernelArgListStart);
-      initialParamList = StringRef(SM->getCharacterData(kernelArgListStart), repLength);
-      OS << ", " << initialParamList;
+      size_t repLength = SM->getCharacterData(stop) - SM->getCharacterData(kernelArgListStart);
+      OS << StringRef(SM->getCharacterData(kernelArgListStart), repLength);
+      Replacement Rep0(*(Result.SourceManager), kernelArgListStart, repLength, OS.str());
+      FullSourceLoc fullSL(sl, *(Result.SourceManager));
+      insertReplacement(Rep0, fullSL);
    }
-    DEBUG(dbgs() << "initial paramlist: " << initialParamList << "\n" << "new paramlist: " << OS.str() << "\n");
-    Replacement Rep0(*(Result.SourceManager), kernelArgListStart, repLength, OS.str());
-    FullSourceLoc fullSL(sl, *(Result.SourceManager));
-    insertReplacement(Rep0, fullSL);
  }

  bool cudaCall(const MatchFinder::MatchResult &Result) {
@@ -2431,9 +2426,9 @@ private:
      XStr.clear();
      if (calleeName.find(',') != StringRef::npos) {
        SmallString<128> tmpData;
-        calleeName = Twine("HIP_KERNEL_NAME(" + calleeName + ")").toStringRef(tmpData);
+        calleeName = Twine("(" + calleeName + ")").toStringRef(tmpData);
      }
-      OS << "hipLaunchKernel(" << calleeName << ",";
+      OS << "hipLaunchKernelGGL(" << calleeName << ",";
      const CallExpr *config = launchKernel->getConfig();
      DEBUG(dbgs() << "Kernel config arguments:" << "\n");
      SourceManager *SM = Result.SourceManager;
@@ -2473,7 +2468,7 @@ private:
      Replacement Rep(*SM, launchKernel->getLocStart(), length, OS.str());
      FullSourceLoc fullSL(launchKernel->getLocStart(), *SM);
      insertReplacement(Rep, fullSL);
-      hipCounter counter = {"hipLaunchKernel", CONV_KERN, API_RUNTIME};
+      hipCounter counter = {"hipLaunchKernelGGL", CONV_KERN, API_RUNTIME};
      updateCounters(counter, refName.str());
      return true;
    }
@@ -21,6 +21,7 @@ THE SOFTWARE.
 */

 #pragma once
+#if GENERIC_GRID_LAUNCH == 1

 #include "concepts.hpp"
 #include "helpers.hpp"
@@ -840,14 +841,16 @@ namespace hip_impl
        group_mem_bytes,\
        stream,\
        ...)\
-    {\
-        hipLaunchKernelGGL(\
-            kernel_name,\
-            num_blocks,\
-            dim_blocks,\
-            group_mem_bytes,\
-            stream,\
-            hipLaunchParm{},\
-            ##__VA_ARGS__);\
-    }
+   do {\
+    hipLaunchKernelGGL(\
+        kernel_name,\
+        num_blocks,\
+        dim_blocks,\
+        group_mem_bytes,\
+        stream,\
+        hipLaunchParm{},\
+        ##__VA_ARGS__);\
+   } while(0)
+
 }
+#endif //GENERIC_GRID_LAUNCH
@@ -23,8 +23,7 @@ THE SOFTWARE.
 #ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COMPLEX_H
 #define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COMPLEX_H

-#include "./hip_fp16.h"
-#include "./hip_vector_types.h"
+#include "hip/hcc_detail/hip_vector_types.h"

 #if __cplusplus
 #define COMPLEX_ADD_OP_OVERLOAD(type) \
@@ -177,45 +176,45 @@ COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long long)

 #endif

-__device__ static inline float hipCrealf(hipFloatComplex z){
+__device__ __host__ static inline float hipCrealf(hipFloatComplex z){
    return z.x;
 }

-__device__ static inline float hipCimagf(hipFloatComplex z){
+__device__ __host__ static inline float hipCimagf(hipFloatComplex z){
    return z.y;
 }

-__device__ static inline hipFloatComplex make_hipFloatComplex(float a, float b){
+__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b){
    hipFloatComplex z;
    z.x = a;
    z.y = b;
    return z;
 }

-__device__ static inline hipFloatComplex hipConjf(hipFloatComplex z){
+__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z){
    hipFloatComplex ret;
    ret.x = z.x;
    ret.y = -z.y;
    return ret;
 }

-__device__ static inline float hipCsqabsf(hipFloatComplex z){
+__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z){
    return z.x * z.x + z.y * z.y;
 }

-__device__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q){
+__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q){
    return make_hipFloatComplex(p.x + q.x, p.y + q.y);
 }

-__device__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q){
+__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q){
    return make_hipFloatComplex(p.x - q.x, p.y - q.y);
 }

-__device__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q){
+__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q){
    return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
 }

-__device__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q){
+__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q){
    float sqabs = hipCsqabsf(q);
    hipFloatComplex ret;
    ret.x = (p.x * q.x + p.y * q.y)/sqabs;
@@ -223,51 +222,51 @@ __device__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatCom
    return ret;
 }

-__device__ static inline float hipCabsf(hipFloatComplex z){
+__device__ __host__ static inline float hipCabsf(hipFloatComplex z){
    return sqrtf(hipCsqabsf(z));
 }



-__device__ static inline double hipCreal(hipDoubleComplex z){
+__device__ __host__ static inline double hipCreal(hipDoubleComplex z){
    return z.x;
 }

-__device__ static inline double hipCimag(hipDoubleComplex z){
+__device__ __host__ static inline double hipCimag(hipDoubleComplex z){
    return z.y;
 }

-__device__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b){
+__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b){
    hipDoubleComplex z;
    z.x = a;
    z.y = b;
    return z;
 }

-__device__ static inline hipDoubleComplex hipConj(hipDoubleComplex z){
+__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z){
    hipDoubleComplex ret;
    ret.x = z.x;
    ret.y = z.y;
    return ret;
 }

-__device__ static inline double hipCsqabs(hipDoubleComplex z){
+__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z){
    return z.x * z.x + z.y * z.y;
 }

-__device__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q){
+__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q){
    return make_hipDoubleComplex(p.x + q.x, p.y + q.y);
 }

-__device__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q){
+__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q){
    return make_hipDoubleComplex(p.x - q.x, p.y - q.y);
 }

-__device__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q){
+__device__ __host__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q){
    return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
 }

-__device__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q){
+__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q){
    double sqabs = hipCsqabs(q);
    hipDoubleComplex ret;
    ret.x = (p.x * q.x + p.y * q.y)/sqabs;
@@ -275,28 +274,28 @@ __device__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleC
    return ret;
 }

-__device__ static inline double hipCabs(hipDoubleComplex z){
+__device__ __host__ static inline double hipCabs(hipDoubleComplex z){
    return sqrtf(hipCsqabs(z));
 }

 typedef hipFloatComplex hipComplex;

-__device__ static inline hipComplex make_hipComplex(float x,
+__device__ __host__ static inline hipComplex make_hipComplex(float x,
                                            float y){
    return make_hipFloatComplex(x, y);
 }

-__device__ static inline hipFloatComplex hipComplexDoubleToFloat
+__device__ __host__ static inline hipFloatComplex hipComplexDoubleToFloat
 (hipDoubleComplex z){
    return make_hipFloatComplex((float)z.x, (float)z.y);
 }

-__device__ static inline hipDoubleComplex hipComplexFloatToDouble
+__device__ __host__ static inline hipDoubleComplex hipComplexFloatToDouble
 (hipFloatComplex z){
    return make_hipDoubleComplex((double)z.x, (double)z.y);
 }

-__device__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r){
+__device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r){
    float real = (p.x * q.x) + r.x;
    float imag = (q.x * p.y) + r.y;

@@ -306,7 +305,7 @@ __device__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComp
    return make_hipComplex(real, imag);
 }

-__device__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q, hipDoubleComplex r){
+__device__ __host__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q, hipDoubleComplex r){
    float real = (p.x * q.x) + r.x;
    float imag = (q.x * p.y) + r.y;

@@ -23,7 +23,7 @@ THE SOFTWARE.
 #ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_FP16_H
 #define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_FP16_H

-#include "hip/hip_runtime.h"
+#include "hip/hcc_detail/hip_vector_types.h"

 #if __clang_major__ > 3

@@ -28,6 +28,17 @@ THE SOFTWARE.
 #if __cplusplus
 #ifdef __HCC__
 #include <hc.hpp>
+
+
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ *  @defgroup HCC-specific features
+ *  @warning These APIs provide access to special features of HCC compiler and are not available through the CUDA path.
+ *  @{
+ */
+
+
 /**
 * @brief Return hc::accelerator associated with the specified deviceId
 * @return #hipSuccess, #hipErrorInvalidDevice
@@ -45,6 +56,29 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a

 #endif // #ifdef __HCC__

+/**
+ * @brief launches kernel f with launch parameters and shared memory on stream with arguments passed to kernelparams or extra
+ *
+ * @param [in[ f	 Kernel to launch.
+ * @param [in] gridDimX  X grid dimension specified in work-items
+ * @param [in] gridDimY  Y grid dimension specified in work-items
+ * @param [in] gridDimZ  Z grid dimension specified in work-items
+ * @param [in] blockDimX X block dimensions specified in work-items
+ * @param [in] blockDimY Y grid dimension specified in work-items
+ * @param [in] blockDimZ Z grid dimension specified in work-items
+ * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel.  The kernel can access this with HIP_DYNAMIC_SHARED.
+ * @param [in] stream Stream where the kernel should be dispatched.  May be 0, in which case th default stream is used with associated synchronization rules.
+ * @param [in] kernelParams 
+ * @param [in] extra     Pointer to kernel arguments.   These are passed directly to the kernel and must be in the memory layout and alignment expected by the kernel.
+ * @param [in] startEvent  If non-null, specified event will be updated to track the start time of the kernel launch.  The event must be created before calling this API. 
+ * @param [in] stopEvent   If non-null, specified event will be updated to track the stop time of the kernel launch.  The event must be created before calling this API.
+ *
+ * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
+ * 
+ * @warning kernellParams argument is not yet implemented in HIP. Please use extra instead. Please refer to hip_porting_driver_api.md for sample usage.
+
+ * HIP/ROCm actually updates the start event when the associated kernel completes.
+ */
 hipError_t hipHccModuleLaunchKernel(hipFunction_t f,
                                    uint32_t globalWorkSizeX,
                                    uint32_t globalWorkSizeY,
@@ -55,8 +89,15 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f,
                                    size_t sharedMemBytes,
                                    hipStream_t hStream,
                                    void **kernelParams,
-                                    void **extra);
+                                    void **extra,
+                                    hipEvent_t startEvent=nullptr,
+                                    hipEvent_t stopEvent=nullptr
+                                    );

+// doxygen end HCC-specific features
+/**
+ * @}
+ */
 #endif // #if __cplusplus

 #endif //
@@ -1913,19 +1913,18 @@ hipError_t hipModuleLoadData(hipModule_t *module, const void *image);
 /**
 * @brief launches kernel f with launch parameters and shared memory on stream with arguments passed to kernelparams or extra
 *
- * @param [in[ f
- * @param [in] gridDimX
- * @param [in] gridDimY
- * @param [in] gridDimZ
- * @param [in] blockDimX
- * @param [in] blockDimY
- * @param [in] blockDimZ
- * @param [in] sharedMemBytes
- * @param [in] stream
- * @param [in] kernelParams
- * @param [in] extraa
+ * @param [in[ f	 Kernel to launch.
+ * @param [in] gridDimX  X grid dimension specified as multiple of blockDimX.
+ * @param [in] gridDimY  Y grid dimension specified as multiple of blockDimY.
+ * @param [in] gridDimZ  Z grid dimension specified as multiple of blockDimZ.
+ * @param [in] blockDimX X block dimensions specified in work-items
+ * @param [in] blockDimY Y grid dimension specified in work-items
+ * @param [in] blockDimZ Z grid dimension specified in work-items
+ * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel.  The kernel can access this with HIP_DYNAMIC_SHARED.
+ * @param [in] stream Stream where the kernel should be dispatched.  May be 0, in which case th default stream is used with associated synchronization rules.
+ * @param [in] kernelParams 
+ * @param [in] extra     Pointer to kernel arguments.   These are passed directly to the kernel and must be in the memory layout and alignment expected by the kernel.
 *
- * The function takes the above arguments and run the kernel in hipFunction_t f.  with launch parameters specified in gridDimX, gridDimY, gridDimZ,  blockDimX, blockDimY and blockDimmZ. The amount of shared memory is specificed and can be used with HIP_DYNAMIC_SHARED. The arguemt extra is used to pass in the arguments for the kernel.
 * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
 *
 * @warning kernellParams argument is not yet implemented in HIP. Please use extra instead. Please refer to hip_porting_driver_api.md for sample usage.
@@ -32,7 +32,7 @@ THE SOFTWARE.
 #error("This version of HIP requires a newer version of HCC.");
 #endif

-#include "host_defines.h"
+#include "hip/hcc_detail/host_defines.h"

 #define MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(type) \
 __device__ __host__ type() {} \
@@ -1,23 +0,0 @@
-#!/bin/bash
-function die {
-    echo "${1-Died}." >&2
-    exit 1
-}
-
-payload=$1
-script=$2
-[ "$payload" != "" ] || [ "$script" != "" ] || die "Invalid arguments!"
-tmp=__extract__$RANDOM
-
-printf "#!/bin/bash
-samples_dir=\$1
-[ \"\$samples_dir\" != \"\" ] || read -e -p \"Enter the path to extract the HIP samples: \" samples_dir
-mkdir -p \$samples_dir
-PAYLOAD=\`awk '/^__PAYLOAD_BELOW__/ {print NR + 1; exit 0; }' \$0\`
-tail -n+\$PAYLOAD \$0 | tar -xz -C \$samples_dir
-echo \"HIP samples installed in \$samples_dir\"
-exit 0
-__PAYLOAD_BELOW__\n" > "$tmp"
-
-cat "$tmp" "$payload" > "$script" && rm "$tmp"
-chmod +x "$script"
@@ -1,12 +1,19 @@
 cmake_minimum_required(VERSION 2.8.3)
 project(hip_doc)

-add_custom_target(build_doxygen ALL
+find_program(DOXYGEN_EXE doxygen)
+if(DOXYGEN_EXE)
+    add_custom_target(build_doxygen ALL
                  COMMAND HIP_PATH=@hip_SOURCE_DIR@ doxygen @hip_SOURCE_DIR@/docs/doxygen-input/doxy.cfg)
-add_custom_target(convert_md_to_html ALL
+    install(DIRECTORY RuntimeAPI/html DESTINATION docs/docs/RuntimeAPI)
+endif()
+
+find_program(GRIP_EXE grip)
+if(GRIP_EXE)
+    add_custom_target(convert_md_to_html ALL
                  COMMAND @hip_SOURCE_DIR@/packaging/convert_md_to_html.sh @hip_SOURCE_DIR@ ${PROJECT_BINARY_DIR}/md2html)
-install(DIRECTORY RuntimeAPI/html DESTINATION docs/docs/RuntimeAPI)
-install(DIRECTORY md2html/ DESTINATION docs)
+    install(DIRECTORY md2html/ DESTINATION docs)
+endif()

 #############################
 # Packaging steps
@@ -11,10 +11,6 @@ HIPCC=$(HIP_PATH)/bin/hipcc
 ifeq (${HIP_PLATFORM}, nvcc)
 	HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20 
 endif
-ifeq (${HIP_PLATFORM}, hcc)
-	HIPCC_FLAGS = -stdlib=libc++
-endif
-

 EXE=bit_extract

@@ -10,9 +10,6 @@ OPT=-O3
 CXXFLAGS =  $(OPT) --std=c++11

 HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform)
-ifeq (${HIP_PLATFORM}, hcc)
-	CXXFLAGS += " -stdlib=libc++"
-endif

 CODE_OBJECTS=nullkernel.hsaco

@@ -11,7 +11,6 @@
 #include <elf.h>
 #include <hsa/hsa.h>
 #include <hc.hpp>
-#include <hip/hcc_detail/hcc_acc.h>
 #endif

 #include <sys/time.h>
@@ -30,6 +30,54 @@ THE SOFTWARE.
 //---


+ihipEvent_t::ihipEvent_t(unsigned flags)
+{
+    _state  = hipEventStatusCreated;
+    _stream = NULL;
+    _flags  = flags;
+    _timestamp  = 0;
+    _type   = hipEventTypeIndependent;
+};
+
+
+
+// Attach to an existing completion future:
+void ihipEvent_t::attachToCompletionFuture(const hc::completion_future *cf, ihipEventType_t eventType)
+{
+    _state  = hipEventStatusRecording;
+    _marker = *cf;
+    _type   = eventType;
+}
+
+
+
+void ihipEvent_t::setTimestamp()
+{
+    if (_state == hipEventStatusRecorded) {
+        // already recorded, done:
+        return;
+    } else {
+        // TODO - use completion-future functions to obtain ticks and timestamps:
+        hsa_signal_t *sig  = static_cast<hsa_signal_t*> (_marker.get_native_handle());
+        if (sig) {
+            if (hsa_signal_load_acquire(*sig) == 0) {
+
+                if ((_type == hipEventTypeIndependent) || (_type == hipEventTypeStopCommand)) {
+                    _timestamp =  _marker.get_end_tick();
+                } else if (_type == hipEventTypeStartCommand) {
+                    _timestamp =  _marker.get_begin_tick();
+                } else {
+                    assert(0); // TODO - move to debug assert
+                    _timestamp =  0;
+                }
+
+                _state = hipEventStatusRecorded;
+            }
+        }
+    }
+}
+
+
 hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags)
 {
    hipError_t e = hipSuccess;
@@ -37,12 +85,8 @@ hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags)
    // TODO-IPC - support hipEventInterprocess.
    unsigned supportedFlags = hipEventDefault | hipEventBlockingSync | hipEventDisableTiming;
    if ((flags & ~supportedFlags) == 0) {
-        ihipEvent_t *eh = new ihipEvent_t();
+        ihipEvent_t *eh = new ihipEvent_t(flags);

-        eh->_state  = hipEventStatusCreated;
-        eh->_stream = NULL;
-        eh->_flags  = flags;
-        eh->_timestamp  = 0;
        *event = eh;
    } else {
        e = hipErrorInvalidValue;
@@ -141,8 +185,8 @@ hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop)
    ihipEvent_t *start_eh = start;
    ihipEvent_t *stop_eh = stop;

-    ihipSetTs(start);
-    ihipSetTs(stop);
+    start->setTimestamp();
+    stop->setTimestamp();

    hipError_t status = hipSuccess;
    *ms = 0.0f;
@@ -151,7 +195,7 @@ hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop)
        if ((start_eh->_state == hipEventStatusRecorded) && (stop_eh->_state == hipEventStatusRecorded)) {
            // Common case, we have good information for both events.

-            int64_t tickDiff = (stop_eh->_timestamp - start_eh->_timestamp);
+            int64_t tickDiff = (stop_eh->timestamp() - start_eh->timestamp());

            uint64_t freqHz;
            hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freqHz);
@@ -1641,23 +1641,6 @@ const char *ihipErrorString(hipError_t hip_error)
 };


-void ihipSetTs(hipEvent_t e)
-{
-    ihipEvent_t *eh = e;
-    if (eh->_state == hipEventStatusRecorded) {
-        // already recorded, done:
-        return;
-    } else {
-        // TODO - use completion-future functions to obtain ticks and timestamps:
-        hsa_signal_t *sig  = static_cast<hsa_signal_t*> (eh->_marker.get_native_handle());
-        if (sig) {
-            if (hsa_signal_load_acquire(*sig) == 0) {
-                eh->_timestamp = eh->_marker.get_end_tick();
-                eh->_state = hipEventStatusRecorded;
-            }
-        }
-    }
-}


 // Returns true if copyEngineCtx can see the memory allocated on dstCtx and srcCtx.
@@ -584,22 +584,40 @@ private: // Data
 //----
 // Internal event structure:
 enum hipEventStatus_t {
-   hipEventStatusUnitialized = 0, // event is unutilized, must be "Created" before use.
-   hipEventStatusCreated     = 1,
-   hipEventStatusRecording   = 2, // event has been enqueued to record something.
-   hipEventStatusRecorded    = 3, // event has been recorded - timestamps are valid.
+    hipEventStatusUnitialized = 0, // event is unutilized, must be "Created" before use.
+    hipEventStatusCreated     = 1,
+    hipEventStatusRecording   = 2, // event has been enqueued to record something.
+    hipEventStatusRecorded    = 3, // event has been recorded - timestamps are valid.
 } ;

+// TODO - rename to ihip type of some kind
+enum ihipEventType_t {
+    hipEventTypeIndependent,
+    hipEventTypeStartCommand,
+    hipEventTypeStopCommand,
+};

 // internal hip event structure.
-struct ihipEvent_t {
-    hipEventStatus_t       _state;
+class ihipEvent_t {
+public:
+    ihipEvent_t(unsigned flags);
+    void attachToCompletionFuture(const hc::completion_future *cf, ihipEventType_t eventType);
+    void setTimestamp();
+    uint64_t timestamp() const { return _timestamp; } ;
+    ihipEventType_t type() const { return _type; };
+
+public:
+    hipEventStatus_t      _state;

    hipStream_t           _stream;  // Stream where the event is recorded, or NULL if all streams.
    unsigned              _flags;

    hc::completion_future _marker;
+
+private:
+    ihipEventType_t       _type;
    uint64_t              _timestamp;  // store timestamp, may be set on host or by marker.
+friend hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream);
 } ;


@@ -822,8 +840,6 @@ extern hipError_t    ihipDeviceSetState();
 extern ihipDevice_t *ihipGetDevice(int);
 ihipCtx_t * ihipGetPrimaryCtx(unsigned deviceIndex);

-extern void ihipSetTs(hipEvent_t e);
-

 hipStream_t ihipSyncAndResolveStream(hipStream_t);

@@ -1260,10 +1260,15 @@ hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned

    ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) &handle;
    //Attach ipc memory
-    hsa_status_t hsa_status =
-        hsa_amd_ipc_memory_attach((hsa_amd_ipc_memory_t*)&(iHandle->ipc_handle), iHandle->psize, 1, agent, devPtr);
-    if(hsa_status != HSA_STATUS_SUCCESS)
-        hipStatus = hipErrorMapBufferObjectFailed;
+    auto ctx= ihipGetTlsDefaultCtx();
+    {
+        LockedAccessor_CtxCrit_t crit(ctx->criticalData());
+        // the peerCnt always stores self so make sure the trace actually
+        hsa_status_t hsa_status =
+         hsa_amd_ipc_memory_attach((hsa_amd_ipc_memory_t*)&(iHandle->ipc_handle), iHandle->psize, crit->peerCnt(), crit->peerAgents(), devPtr);
+        if(hsa_status != HSA_STATUS_SUCCESS)
+            hipStatus = hipErrorMapBufferObjectFailed;
+    }    
 #else
    hipStatus = hipErrorRuntimeOther;
 #endif
@@ -364,10 +364,11 @@ hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod,


 hipError_t ihipModuleLaunchKernel(hipFunction_t f,
-            uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
-            uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ,
-            size_t sharedMemBytes, hipStream_t hStream,
-            void **kernelParams, void **extra)
+                                  uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
+                                  uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ,
+                                  size_t sharedMemBytes, hipStream_t hStream,
+                                  void **kernelParams, void **extra,
+                                  hipEvent_t startEvent, hipEvent_t stopEvent)
 {

    auto ctx = ihipGetTlsDefaultCtx();
@@ -446,7 +447,20 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
                          (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
        };

-        lp.av->dispatch_hsa_kernel(&aql, config[1] /* kernarg*/, kernArgSize, nullptr/*completion_future*/);
+
+        hc::completion_future cf;
+
+        lp.av->dispatch_hsa_kernel(&aql, config[1] /* kernarg*/, kernArgSize, 
+                                  (startEvent || stopEvent) ? &cf : nullptr);
+
+
+        if (startEvent) {
+            startEvent->attachToCompletionFuture(&cf, hipEventTypeStartCommand);
+        }
+        if (stopEvent) {
+            stopEvent->attachToCompletionFuture (&cf, hipEventTypeStopCommand);
+        }
+

        if(kernelParams != NULL){
          free(config[1]);
@@ -470,7 +484,8 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f,
    return ihipLogStatus(ihipModuleLaunchKernel(f,
                blockDimX * gridDimX, blockDimY * gridDimY, gridDimZ * blockDimZ,
                blockDimX, blockDimY, blockDimZ,
-                sharedMemBytes, hStream, kernelParams, extra));
+                sharedMemBytes, hStream, kernelParams, extra,
+                nullptr, nullptr));
 }


@@ -478,7 +493,8 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f,
            uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
            uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ,
            size_t sharedMemBytes, hipStream_t hStream,
-            void **kernelParams, void **extra)
+            void **kernelParams, void **extra,
+            hipEvent_t startEvent, hipEvent_t stopEvent)
 {
    HIP_INIT_API(f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ,
                 localWorkSizeX, localWorkSizeY, localWorkSizeZ,
@@ -486,7 +502,7 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f,
                 kernelParams, extra);
    return ihipLogStatus(ihipModuleLaunchKernel(f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ,
                localWorkSizeX, localWorkSizeY, localWorkSizeZ,
-                sharedMemBytes, hStream, kernelParams, extra));
+                sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent));
 }

 hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
@@ -1,18 +1,29 @@
-#include<iostream>
+#include <iostream>
 #include "hip/hip_runtime.h"
 #include "hip/hip_runtime_api.h"
+#include "../test_common.h"
+

 #define LEN 1030
 #define SIZE LEN << 2

-__global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In, uint32_t *Vald)
+/* HIT_START
+ * BUILD: %t %s ../test_common.cpp
+ * RUN: %t
+ * HIT_END
+ */
+
+
+__global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In)
 {
-    memcpy(Out, In, SIZE, Vald);
+    int tx = hipThreadIdx_x;
+    memcpy(Out + tx, In + tx, SIZE/LEN);
 }

 __global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size)
 {
-    memset(ptr, val, size);
+    int tx = hipThreadIdx_x;
+    memset(ptr + tx, val, size);
 }

 int main()
@@ -24,19 +35,29 @@ int main()
    Val = new uint32_t;
    *Val = 0;
    for(int i=0;i<LEN;i++){
-        A[i] = i *1.0f;
-        B[i] = 0.0f;
+        A[i] = i;
+        B[i] = 0;
    }
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, SIZE);
-    hipMalloc((void**)&Vald, sizeof(uint32_t));
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(cpy, dim3(1), dim3(LEN/4), 0, 0, Bd, Ad, Vald);
-    hipLaunchKernel(set, dim3(1), dim3(LEN/4), 0, 0, Bd, 0x1, SIZE);
+
+    hipLaunchKernel(cpy, dim3(1), dim3(LEN), 0, 0, Bd, Ad);
+
    hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
-    hipMemcpy(Val, Vald, sizeof(uint32_t), hipMemcpyDeviceToHost);
    for(int i=LEN-16;i<LEN;i++){
-        std::cout<<A[i]<<" "<<B[i]<<std::endl;
+        if(A[i]!=B[i]){
+            return 0;
+        }
    }
-    std::cout<<*Val<<std::endl;
+    hipLaunchKernel(set, dim3(1), dim3(LEN), 0, 0, Bd, 0x1, LEN);
+
+    hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
+    for(int i=LEN-16;i<LEN;i++){
+        if(0x01010101!=B[i]){
+            return 0;
+        }
+    }
+ 
+    passed();
 }