diff --git a/projects/clr/hipamd/bin/hipcc b/projects/clr/hipamd/bin/hipcc
index d001c6febe..1ab4cf2759 100755
--- a/projects/clr/hipamd/bin/hipcc
+++ b/projects/clr/hipamd/bin/hipcc
@@ -71,6 +71,7 @@ if ($HIP_PLATFORM eq "hcc") {
 	$HIPLDFLAGS .= " -L$HSA_PATH/lib -lhsa-runtime64 -lhc_am";
 	# Add C++ libs for GCC.
 	$HIPLDFLAGS .= " -lstdc++";
+	$HIPLDFLAGS .= " -lm";
 
     if ($verbose & 0x2) {
         print ("HSA_PATH=$HSA_PATH\n");
@@ -163,7 +164,7 @@ if ($needHipHcc) {
     if ((not -e $object) or ((stat($source))[9] > (stat($object))[9]))  {
         my $CMD = "$HCC $HCCFLAGS  -I$HSA_PATH/include -I$HIP_PATH/include -Wall -c $source -o $object";
         if ($verbose & 0x10) {
-            $CMD .= " -g" ;
+            $CMD .= " -g -O2" ;
         } else {
             $CMD .= " -O3" ;
         }
diff --git a/projects/clr/hipamd/bin/hipconfig b/projects/clr/hipamd/bin/hipconfig
index db53d6014e..1687983330 100755
--- a/projects/clr/hipamd/bin/hipconfig
+++ b/projects/clr/hipamd/bin/hipconfig
@@ -82,6 +82,7 @@ if ($p_full) {
         system("$HCC_HOME/bin/hcc-config --cxxflags");
         print ("HCC-ldflags : "); 
         system("$HCC_HOME/bin/hcc-config --ldflags");
+        printf("\n");
     }
     if ($HIP_PLATFORM eq "nvcc")  {
         print "\n" ;
@@ -98,6 +99,8 @@ if ($p_full) {
     print "\n" ;
     print "== Linux Kernel\n";
     system ("uname -a");
+
+    print "\n" ;
     $printed = 1;
 }
 
diff --git a/projects/clr/hipamd/bin/hipify b/projects/clr/hipamd/bin/hipify
index d143bdff37..e3b6c64c88 100755
--- a/projects/clr/hipamd/bin/hipify
+++ b/projects/clr/hipamd/bin/hipify
@@ -277,6 +277,8 @@ while (@ARGV) {
 
         $ft{'mem'} += s/\bcudaMemcpyKind\b/hipMemcpyKind/g;
 
+        $ft{'mem'} += s/\bcudaPointerAttributes\b/hipPointerAttribute_t/g;
+
 
         #--------
         # Memory management:
diff --git a/projects/clr/hipamd/include/hcc_detail/hip_runtime.h b/projects/clr/hipamd/include/hcc_detail/hip_runtime.h
index 8474f066df..aeed53348e 100644
--- a/projects/clr/hipamd/include/hcc_detail/hip_runtime.h
+++ b/projects/clr/hipamd/include/hcc_detail/hip_runtime.h
@@ -108,6 +108,12 @@ THE SOFTWARE.
 #define __HCC_C__
 #endif
 
+
+// TODO - hipify-clang - change to use the function call.
+//#define warpSize hc::__wavesize()
+const int warpSize  = 64;
+
+
 #define clock_t long long int
 __device__ inline long long int clock64() { return (long long int)hc::__clock_u64(); };
 __device__ inline clock_t clock() { return (clock_t)hc::__clock_u64(); };
@@ -344,42 +350,42 @@ __device__ inline unsigned long long int __ballot( int input)
 }
 
 // warp shuffle functions
-__device__ inline int __shfl(int input, int lane, int width)
+__device__ inline int __shfl(int input, int lane, int width=warpSize)
 {
   return hc::__shfl(input,lane,width);
 }
 
-__device__ inline int __shfl_up(int input, unsigned int lane_delta, int width)
+__device__ inline int __shfl_up(int input, unsigned int lane_delta, int width=warpSize)
 {
   return hc::__shfl_up(input,lane_delta,width);
 }
 
-__device__ inline int __shfl_down(int input, unsigned int lane_delta, int width)
+__device__ inline int __shfl_down(int input, unsigned int lane_delta, int width=warpSize)
 {
   return hc::__shfl_down(input,lane_delta,width);
 }
 
-__device__ inline int __shfl_xor(int input, int lane_mask, int width)
+__device__ inline int __shfl_xor(int input, int lane_mask, int width=warpSize)
 {
   return hc::__shfl_xor(input,lane_mask,width);
 }
 
-__device__ inline float __shfl(float input, int lane, int width)
+__device__ inline float __shfl(float input, int lane, int width=warpSize)
 {
   return hc::__shfl(input,lane,width);
 }
 
-__device__ inline float __shfl_up(float input, unsigned int lane_delta, int width)
+__device__ inline float __shfl_up(float input, unsigned int lane_delta, int width=warpSize)
 {
   return hc::__shfl_up(input,lane_delta,width);
 }
 
-__device__ inline float __shfl_down(float input, unsigned int lane_delta, int width)
+__device__ inline float __shfl_down(float input, unsigned int lane_delta, int width=warpSize)
 {
   return hc::__shfl_down(input,lane_delta,width);
 }
 
-__device__ inline float __shfl_xor(float input, int lane_mask, int width)
+__device__ inline float __shfl_xor(float input, int lane_mask, int width=warpSize)
 {
   return hc::__shfl_xor(input,lane_mask,width);
 }
@@ -438,7 +444,6 @@ __device__ inline float __dsqrt_rz(double x) {return hc::fast_math::sqrt(x); };
 #define hipGridDim_z   (hc_get_num_groups(0))
 
 
-extern int warpSize ;
 
 
 #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)
@@ -476,7 +481,8 @@ extern int warpSize ;
 
 
 #ifdef __HCC_CPP__
-hc::accelerator_view *ihipLaunchKernel(hipStream_t stream);
+hipStream_t ihipPreLaunchKernel(hipStream_t stream, hc::accelerator_view **av);
+void ihipPostLaunchKernel(hipStream_t stream, hc::completion_future &cf);
 
 #if not defined(DISABLE_GRID_LAUNCH)
 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
@@ -491,12 +497,13 @@ do {\
   lp.groupMemBytes = _groupMemBytes;\
   hc::completion_future cf;\
   lp.cf = &cf;  \
-  lp.av = (ihipLaunchKernel(_stream)); \
+  hipStream_t trueStream = (ihipPreLaunchKernel(_stream, &lp.av)); \
     if (HIP_TRACE_API) {\
         fprintf(stderr, "hiptrace1: launch '%s' gridDim:[%d.%d.%d] groupDim:[%d.%d.%d] groupMem:+%d stream=%p\n", \
                 #_kernelName, lp.gridDim.z, lp.gridDim.y, lp.gridDim.x, lp.groupDim.z, lp.groupDim.y, lp.groupDim.x, lp.groupMemBytes, (void*)(_stream));\
     }\
   _kernelName (lp, __VA_ARGS__);\
+  ihipPostLaunchKernel(trueStream, cf);\
 } while(0)
 
 #else
@@ -514,12 +521,13 @@ do {\
   lp.groupMemBytes = _groupMemBytes;\
   hc::completion_future cf;\
   lp.cf = &cf;  \
-  lp.av = (ihipLaunchKernel(_stream)); \
+  hipStream_t trueStream = (ihipPreLaunchKernel(_stream, &lp.av)); \
     if (HIP_TRACE_API) {\
         fprintf(stderr, "hiptrace1: launch '%s' gridDim:[%d.%d.%d] groupDim:[%d.%d.%d] groupMem:+%d stream=%p\n", \
                 #_kernelName, lp.gridDim.z, lp.gridDim.y, lp.gridDim.x, lp.groupDim.z, lp.groupDim.y, lp.groupDim.x, lp.groupMemBytes, (void*)(_stream));\
     }\
   _kernelName (lp, __VA_ARGS__);\
+  ihipPostLaunchKernel(trueStream, cf);\
 } while(0)
 /*end hipLaunchKernel */
 #endif
diff --git a/projects/clr/hipamd/include/hcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hcc_detail/hip_runtime_api.h
index 225b065654..5fe398b84c 100644
--- a/projects/clr/hipamd/include/hcc_detail/hip_runtime_api.h
+++ b/projects/clr/hipamd/include/hcc_detail/hip_runtime_api.h
@@ -105,6 +105,8 @@ enum hipMemcpyKind {
 } ;
 
 
+
+
 // Doxygen end group GlobalDefs
 /**  @} */
 
@@ -113,7 +115,7 @@ enum hipMemcpyKind {
 
 
 // The handle allows the async commands to use the stream even if the parent hipStream_t goes out-of-scope.
-typedef struct ihipStream_t * hipStream_t;
+typedef class ihipStream_t * hipStream_t;
 
 
 /*
@@ -128,6 +130,7 @@ typedef struct hipEvent_t {
 
 
 
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
@@ -634,6 +637,11 @@ hipError_t hipEventQuery(hipEvent_t event) ;
  */
 
 
+/**
+ *  @brief Return attributes for the specified pointer
+ */
+hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) ;
+
 
 /**
  *  @brief Allocate memory on the default accelerator
diff --git a/projects/clr/hipamd/include/hip_runtime_api.h b/projects/clr/hipamd/include/hip_runtime_api.h
index 5191bc5d54..239be13843 100644
--- a/projects/clr/hipamd/include/hip_runtime_api.h
+++ b/projects/clr/hipamd/include/hip_runtime_api.h
@@ -98,6 +98,30 @@ typedef struct hipDeviceProp_t {
  } hipDeviceProp_t;
 
 
+/**
+ * Memory type (for pointer attributes)
+ */
+enum hipMemoryType {
+    hipMemoryTypeHost,   ///< Memory is physically located on host
+    hipMemoryTypeDevice  ///< Memory is physically located on device. (see deviceId for specific device)
+};
+
+
+
+/**
+ * Pointer attributes
+ */
+typedef struct hipPointerAttribute_t {
+    enum hipMemoryType memoryType;
+    int device;
+    void *devicePointer;
+    void *hostPointer;
+    int isManaged;
+    unsigned allocationFlags; /* flags specified when memory was allocated*/
+    /* peers? */
+} hipPointerAttribute_t;
+
+
 // hack to get these to show up in Doxygen:
 /**
  *     @defgroup GlobalDefs Global enum and defines
@@ -111,6 +135,7 @@ typedef struct hipDeviceProp_t {
  * @enum
  * @ingroup Enumerations
  */
+// Developer note - when updating these, update the hipErrorName and hipErrorString functions
 typedef enum hipError_t {
      hipSuccess = 0                   ///< Successful completion.
     ,hipErrorMemoryAllocation         ///< Memory allocation error.
@@ -120,6 +145,8 @@ typedef enum hipError_t {
     ,hipErrorInvalidValue             ///< One or more of the parameters passed to the API call is NULL or not in an acceptable range.
     ,hipErrorInvalidResourceHandle    ///< Resource handle (hipEvent_t or hipStream_t) invalid.
     ,hipErrorInvalidDevice            ///< DeviceID must be in range 0...#compute-devices.
+    ,hipErrorInvalidMemcpyDirection   ///< Invalid memory copy direction 
+
     ,hipErrorNoDevice                 ///< Call to hipGetDeviceCount returned 0 devices
     ,hipErrorNotReady                 ///< Indicates that asynchronous operations enqueued earlier are not ready.  This is not actually an error, but is used to distinguish from hipSuccess (which indicates completion).  APIs that return this error include hipEventQuery and hipStreamQuery.
     ,hipErrorUnknown                  ///< Unknown error.
diff --git a/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/LICENSE.txt b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/LICENSE.txt
new file mode 100644
index 0000000000..5d0d603232
--- /dev/null
+++ b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/LICENSE.txt
@@ -0,0 +1,27 @@
+
+Copyright (c) 2011, UT-Battelle, LLC
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+* Neither the name of Oak Ridge National Laboratory, nor UT-Battelle, LLC, nor
+  the names of its contributors may be used to endorse or promote products
+  derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/Makefile b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/Makefile
new file mode 100644
index 0000000000..77a92fb1a6
--- /dev/null
+++ b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/Makefile
@@ -0,0 +1,17 @@
+HIP_PATH?=$(shell hipconfig -p)
+HIPCC=$(HIP_PATH)/bin/hipcc
+
+EXE=hipBusBandwidth
+CXXFLAGS = -O3 -g
+
+all: install
+
+$(EXE): hipBusBandwidth.cpp ResultDatabase.cpp
+	$(HIPCC) $(CXXFLAGS) $^ -o $@
+
+install: $(EXE)
+	cp $(EXE) $(HIP_PATH)/bin
+
+
+clean:
+	rm -f *.o $(EXE)
diff --git a/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp
new file mode 100644
index 0000000000..7d2f3aef84
--- /dev/null
+++ b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp
@@ -0,0 +1,523 @@
+#include "ResultDatabase.h"
+
+#include <cfloat>
+#include <algorithm>
+#include <cmath>
+#include <iomanip>
+
+using namespace std;
+
+bool ResultDatabase::Result::operator<(const Result &rhs) const
+{
+    if (test < rhs.test)
+        return true;
+    if (test > rhs.test)
+        return false;
+    if (atts < rhs.atts)
+        return true;
+    if (atts > rhs.atts)
+        return false;
+    return false; // less-operator returns false on equal
+}
+
+double ResultDatabase::Result::GetMin() const
+{
+    double r = FLT_MAX;
+    for (int i=0; i<value.size(); i++)
+    {
+        r = min(r, value[i]);
+    }
+    return r;
+}
+
+double ResultDatabase::Result::GetMax() const
+{
+    double r = -FLT_MAX;
+    for (int i=0; i<value.size(); i++)
+    {
+        r = max(r, value[i]);
+    }
+    return r;
+}
+
+double ResultDatabase::Result::GetMedian() const
+{
+    return GetPercentile(50);
+}
+
+double ResultDatabase::Result::GetPercentile(double q) const
+{
+    int n = value.size();
+    if (n == 0)
+        return FLT_MAX;
+    if (n == 1)
+        return value[0];
+
+    if (q <= 0)
+        return value[0];
+    if (q >= 100)
+        return value[n-1];
+
+    double index = ((n + 1.) * q / 100.) - 1;
+
+    vector<double> sorted = value;
+    sort(sorted.begin(), sorted.end());
+
+    if (n == 2)
+        return (sorted[0] * (1 - q/100.)  +  sorted[1] * (q/100.));
+
+    int index_lo = int(index);
+    double frac = index - index_lo;
+    if (frac == 0)
+        return sorted[index_lo];
+
+    double lo = sorted[index_lo];
+    double hi = sorted[index_lo + 1];
+    return lo + (hi-lo)*frac;
+}
+
+double ResultDatabase::Result::GetMean() const
+{
+    double r = 0;
+    for (int i=0; i<value.size(); i++)
+    {
+        r += value[i];
+    }
+    return r / double(value.size());
+}
+
+double ResultDatabase::Result::GetStdDev() const
+{
+    double r = 0;
+    double u = GetMean();
+    if (u == FLT_MAX)
+        return FLT_MAX;
+    for (int i=0; i<value.size(); i++)
+    {
+        r += (value[i] - u) * (value[i] - u);
+    }
+    r = sqrt(r / value.size());
+    return r;
+}
+
+
+void ResultDatabase::AddResults(const string &test,
+                                const string &atts,
+                                const string &unit,
+                                const vector<double> &values)
+{
+    for (int i=0; i<values.size(); i++)
+    {
+        AddResult(test, atts, unit, values[i]);
+    }
+}
+
+static string RemoveAllButLeadingSpaces(const string &a)
+{
+    string b;
+    int n = a.length();
+    int i = 0;
+    while (i<n && a[i] == ' ')
+    {
+        b += a[i];
+        ++i;
+    }
+    for (; i<n; i++)
+    {
+        if (a[i] != ' ' && a[i] != '\t')
+            b += a[i];
+    }
+    return b;
+}
+
+void ResultDatabase::AddResult(const string &test_orig,
+                               const string &atts_orig,
+                               const string &unit_orig,
+                               double value)
+{
+    string test = RemoveAllButLeadingSpaces(test_orig);
+    string atts = RemoveAllButLeadingSpaces(atts_orig);
+    string unit = RemoveAllButLeadingSpaces(unit_orig);
+    int index;
+    for (index = 0; index < results.size(); index++)
+    {
+        if (results[index].test == test &&
+            results[index].atts == atts)
+        {
+            if (results[index].unit != unit)
+                throw "Internal error: mixed units";
+
+            break;
+        }
+    }
+
+    if (index >= results.size())
+    {
+        Result r;
+        r.test = test;
+        r.atts = atts;
+        r.unit = unit;
+        results.push_back(r);
+    }
+
+    results[index].value.push_back(value);
+}
+
+// ****************************************************************************
+//  Method:  ResultDatabase::DumpDetailed
+//
+//  Purpose:
+//    Writes the full results, including all trials.
+//
+//  Arguments:
+//    out        where to print
+//
+//  Programmer:  Jeremy Meredith
+//  Creation:    August 14, 2009
+//
+//  Modifications:
+//    Jeremy Meredith, Wed Nov 10 14:25:17 EST 2010
+//    Renamed to DumpDetailed to make room for a DumpSummary.
+//
+//    Jeremy Meredith, Thu Nov 11 11:39:57 EST 2010
+//    Added note about (*) missing value tag.
+//
+//    Jeremy Meredith, Tue Nov 23 13:57:02 EST 2010
+//    Changed note about missing values to be worded a little better.
+//
+// ****************************************************************************
+void ResultDatabase::DumpDetailed(ostream &out)
+{
+    vector<Result> sorted(results);
+
+    sort(sorted.begin(), sorted.end());
+
+    int maxtrials = 1;
+    for (int i=0; i<sorted.size(); i++)
+    {
+        if (sorted[i].value.size() > maxtrials)
+            maxtrials = sorted[i].value.size();
+    }
+
+    // TODO: in big parallel runs, the "trials" are the procs
+    // and we really don't want to print them all out....
+    out << "test\t"
+        << "atts\t"
+        << "units\t"
+        << "median\t"
+        << "mean\t"
+        << "stddev\t"
+        << "min\t"
+        << "max\t";
+    for (int i=0; i<maxtrials; i++)
+        out << "trial"<<i<<"\t";
+    out << endl;
+
+    for (int i=0; i<sorted.size(); i++)
+    {
+        Result &r = sorted[i];
+        out << r.test << "\t";
+        out << r.atts << "\t";
+        out << r.unit << "\t";
+        if (r.GetMedian() == FLT_MAX)
+            out << "N/A\t";
+        else
+            out << r.GetMedian() << "\t";
+        if (r.GetMean() == FLT_MAX)
+            out << "N/A\t";
+        else
+            out << r.GetMean()   << "\t";
+        if (r.GetStdDev() == FLT_MAX)
+            out << "N/A\t";
+        else
+            out << r.GetStdDev() << "\t";
+        if (r.GetMin() == FLT_MAX)
+            out << "N/A\t";
+        else
+            out << r.GetMin()    << "\t";
+        if (r.GetMax() == FLT_MAX)
+            out << "N/A\t";
+        else
+            out << r.GetMax()    << "\t";
+        for (int j=0; j<r.value.size(); j++)
+        {
+            if (r.value[j] == FLT_MAX)
+                out << "N/A\t";
+            else
+                out << r.value[j] << "\t";
+        }
+
+        out << endl;
+    }
+    out << endl
+        << "Note: Any results marked with (*) had missing values." << endl
+        << "      This can occur on systems with a mixture of" << endl
+        << "      device types or architectural capabilities." << endl;
+}
+
+
+// ****************************************************************************
+//  Method:  ResultDatabase::DumpDetailed
+//
+//  Purpose:
+//    Writes the summary results (min/max/stddev/med/mean), but not
+//    every individual trial.
+//
+//  Arguments:
+//    out        where to print
+//
+//  Programmer:  Jeremy Meredith
+//  Creation:    November 10, 2010
+//
+//  Modifications:
+//    Jeremy Meredith, Thu Nov 11 11:39:57 EST 2010
+//    Added note about (*) missing value tag.
+//
+// ****************************************************************************
+void ResultDatabase::DumpSummary(ostream &out)
+{
+    vector<Result> sorted(results);
+
+    int testW = 15 ;
+    const int fieldW = 9;
+
+    sort(sorted.begin(), sorted.end());
+
+    out << std::fixed << right << std::setprecision(4);
+
+    // TODO: in big parallel runs, the "trials" are the procs
+    // and we really don't want to print them all out....
+    out << setw(testW) << "test\t"  << setw(fieldW)
+        << "atts\t"
+        << "units\t"
+        << "median\t"
+        << "mean\t"
+        << "stddev\t"
+        << "min\t"
+        << "max\t";
+    out << endl;
+
+    for (int i=0; i<sorted.size(); i++)
+    {
+        Result &r = sorted[i];
+        out << setw(testW) << r.test << setw(fieldW) << "\t";
+        out << r.atts << "\t";
+        out << r.unit << "\t";
+        if (r.GetMedian() == FLT_MAX)
+            out << "N/A\t";
+        else
+            out << r.GetMedian() << "\t";
+        if (r.GetMean() == FLT_MAX)
+            out << "N/A\t";
+        else
+            out << r.GetMean()   << "\t";
+        if (r.GetStdDev() == FLT_MAX)
+            out << "N/A\t";
+        else
+            out << r.GetStdDev() << "\t";
+        if (r.GetMin() == FLT_MAX)
+            out << "N/A\t";
+        else
+            out << r.GetMin()    << "\t";
+        if (r.GetMax() == FLT_MAX)
+            out << "N/A\t";
+        else
+            out << r.GetMax()    << "\t";
+
+        out << endl;
+    }
+    out << endl
+        << "Note: results marked with (*) had missing values such as" << endl
+        << "might occur with a mixture of architectural capabilities." << endl;
+}
+
+// ****************************************************************************
+//  Method:  ResultDatabase::ClearAllResults
+//
+//  Purpose:
+//    Clears all existing results from the ResultDatabase; used for multiple passes
+//    of the same test or multiple tests.
+//
+//  Arguments:
+//
+//  Programmer:  Jeffrey Young
+//  Creation:    September 10th, 2014
+//
+//  Modifications:
+//
+//
+// ****************************************************************************
+void ResultDatabase::ClearAllResults()
+{
+	results.clear();	
+}
+
+// ****************************************************************************
+//  Method:  ResultDatabase::DumpCsv
+//
+//  Purpose:
+//    Writes either detailed or summary results (min/max/stddev/med/mean), but not
+//    every individual trial.
+//
+//  Arguments:
+//    out        file to print CSV results
+//
+//  Programmer:  Jeffrey Young
+//  Creation:    August 28th, 2014
+//
+//  Modifications:
+//
+// ****************************************************************************
+void ResultDatabase::DumpCsv(string fileName)
+{
+    bool emptyFile;
+    vector<Result> sorted(results);
+
+    sort(sorted.begin(), sorted.end());
+
+    //Check to see if the file is empty - if so, add the headers
+    emptyFile = this->IsFileEmpty(fileName);
+
+    //Open file and append by default
+    ofstream out;
+    out.open(fileName.c_str(), std::ofstream::out | std::ofstream::app); 
+
+    //Add headers only for empty files
+    if(emptyFile)
+    {
+    // TODO: in big parallel runs, the "trials" are the procs
+    // and we really don't want to print them all out....
+    out << "test, "
+        << "atts, "
+        << "units, "
+        << "median, "
+        << "mean, "
+        << "stddev, "
+        << "min, "
+        << "max, ";
+    out << endl;
+    }
+
+    for (int i=0; i<sorted.size(); i++)
+    {
+        Result &r = sorted[i];
+        out << r.test << ", ";
+        out << r.atts << ", ";
+        out << r.unit << ", ";
+        if (r.GetMedian() == FLT_MAX)
+            out << "N/A, ";
+        else
+            out << r.GetMedian() << ", ";
+        if (r.GetMean() == FLT_MAX)
+            out << "N/A, ";
+        else
+            out << r.GetMean()   << ", ";
+        if (r.GetStdDev() == FLT_MAX)
+            out << "N/A, ";
+        else
+            out << r.GetStdDev() << ", ";
+        if (r.GetMin() == FLT_MAX)
+            out << "N/A, ";
+        else
+            out << r.GetMin()    << ", ";
+        if (r.GetMax() == FLT_MAX)
+            out << "N/A, ";
+        else
+            out << r.GetMax()    << ", ";
+
+        out << endl;
+    }
+    out << endl;
+
+    out.close();
+}
+
+// ****************************************************************************
+//  Method:  ResultDatabase::IsFileEmpty
+//
+//  Purpose:
+//    Returns whether a file is empty - used as a helper for CSV printing
+//
+//  Arguments:
+//    file  The input file to check for emptiness
+//
+//  Programmer:  Jeffrey Young
+//  Creation:    August 28th, 2014
+//
+//  Modifications:
+//
+// ****************************************************************************
+
+bool ResultDatabase::IsFileEmpty(string fileName)
+{
+      bool fileEmpty;
+
+      ifstream file(fileName.c_str());
+
+      //If the file doesn't exist it is by definition empty
+      if(!file.good())
+      {
+        return true;
+      }
+      else
+      {
+        fileEmpty = (bool)(file.peek() == ifstream::traits_type::eof());
+        file.close();
+        
+	return fileEmpty;
+      }
+  
+      //Otherwise, return false  
+        return false;
+}
+
+
+
+// ****************************************************************************
+//  Method:  ResultDatabase::GetResultsForTest
+//
+//  Purpose:
+//    Returns a vector of results for just one test name.
+//
+//  Arguments:
+//    test       the name of the test results to search for
+//
+//  Programmer:  Jeremy Meredith
+//  Creation:    December  3, 2010
+//
+//  Modifications:
+//
+// ****************************************************************************
+vector<ResultDatabase::Result>
+ResultDatabase::GetResultsForTest(const string &test)
+{
+    // get only the given test results
+    vector<Result> retval;
+    for (int i=0; i<results.size(); i++)
+    {
+        Result &r = results[i];
+        if (r.test == test)
+            retval.push_back(r);
+    }
+    return retval;
+}
+
+// ****************************************************************************
+//  Method:  ResultDatabase::GetResults
+//
+//  Purpose:
+//    Returns all the results.
+//
+//  Arguments:
+//
+//  Programmer:  Jeremy Meredith
+//  Creation:    December  3, 2010
+//
+//  Modifications:
+//
+// ****************************************************************************
+const vector<ResultDatabase::Result> &
+ResultDatabase::GetResults() const
+{
+    return results;
+}
diff --git a/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/ResultDatabase.h b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/ResultDatabase.h
new file mode 100644
index 0000000000..4b63a02a1f
--- /dev/null
+++ b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/ResultDatabase.h
@@ -0,0 +1,100 @@
+#ifndef RESULT_DATABASE_H
+#define RESULT_DATABASE_H
+
+#include <string>
+#include <vector>
+#include <iostream>
+#include <fstream>
+#include <cfloat>
+using std::string;
+using std::vector;
+using std::ostream;
+using std::ofstream;
+using std::ifstream;
+
+
+// ****************************************************************************
+// Class:  ResultDatabase
+//
+// Purpose:
+//   Track numerical results as they are generated.
+//   Print statistics of raw results.
+//
+// Programmer:  Jeremy Meredith
+// Creation:    June 12, 2009
+//
+// Modifications:
+//    Jeremy Meredith, Wed Nov 10 14:20:47 EST 2010
+//    Split timing reports into detailed and summary.  E.g. for serial code,
+//    we might report all trial values, but skip them in parallel.
+//
+//    Jeremy Meredith, Thu Nov 11 11:40:18 EST 2010
+//    Added check for missing value tag.
+//
+//    Jeremy Meredith, Mon Nov 22 13:37:10 EST 2010
+//    Added percentile statistic.
+//
+//    Jeremy Meredith, Fri Dec  3 16:30:31 EST 2010
+//    Added a method to extract a subset of results based on test name.  Also,
+//    the Result class is now public, so that clients can use them directly.
+//    Added a GetResults method as well, and made several functions const.
+//
+// ****************************************************************************
+class ResultDatabase
+{
+  public:
+    //
+    // A performance result for a single SHOC benchmark run.
+    //
+    struct Result
+    {
+        string test;  // e.g. "readback"
+        string atts;  // e.g. "pagelocked 4k^2"
+        string unit;  // e.g. "MB/sec"
+        vector<double> value; // e.g. "837.14"
+        double GetMin() const;
+        double GetMax() const;
+        double GetMedian() const;
+        double GetPercentile(double q) const;
+        double GetMean() const;
+        double GetStdDev() const;
+
+        bool operator<(const Result &rhs) const;
+
+        bool HadAnyFLTMAXValues() const
+        {
+            for (int i=0; i<value.size(); ++i)
+            {
+                if (value[i] >= FLT_MAX)
+                    return true;
+            }
+            return false;
+        }
+    };
+
+  protected:
+    vector<Result> results;
+
+  public:
+    void AddResult(const string &test,
+                   const string &atts,
+                   const string &unit,
+                   double value);
+    void AddResults(const string &test,
+                    const string &atts,
+                    const string &unit,
+                    const vector<double> &values);
+    vector<Result>        GetResultsForTest(const string &test);
+    const vector<Result> &GetResults() const;
+    void ClearAllResults();
+    void DumpDetailed(ostream&);
+    void DumpSummary(ostream&);
+    void DumpCsv(string fileName);
+
+  private:
+    bool IsFileEmpty(string fileName);
+
+};
+
+
+#endif
diff --git a/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp
new file mode 100644
index 0000000000..d276725921
--- /dev/null
+++ b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp
@@ -0,0 +1,387 @@
+#include <stdio.h>
+#include <iostream>
+#include <hip_runtime.h>
+
+#include "ResultDatabase.h"
+
+// Cmdline parms:
+bool          p_verbose = false;
+bool          p_pinned  = true;
+int           p_iterations   = 10;
+int           p_device  = 0;
+int           p_detailed  = 0;
+
+bool          p_h2d = true;
+bool          p_d2h = true;
+
+
+#define CHECK_HIP_ERROR()                                                    \
+{                                                                             \
+    hipError_t err = hipGetLastError();                                     \
+    if (err != hipSuccess)                                                   \
+    {                                                                         \
+        printf("error=%d name=%s at "                                         \
+               "ln: %d\n  ",err,hipGetErrorString(err),__LINE__);            \
+        exit(EXIT_FAILURE);                                                  \
+    }                                                                         \
+}
+
+
+// ****************************************************************************
+// Function: runBenchmark
+//
+// Purpose:
+//   Measures the bandwidth of the bus connecting the host processor to the
+//   OpenCL device.  This benchmark repeatedly transfers data chunks of various
+//   sizes across the bus to the OpenCL device, and calculates the bandwidth.
+//
+//
+// Arguments:
+//
+// Returns:  nothing
+//
+// Programmer: Jeremy Meredith
+// Creation: September 08, 2009
+//
+// Modifications:
+//    Jeremy Meredith, Wed Dec  1 17:05:27 EST 2010
+//    Added calculation of latency estimate.
+//    Ben Sander - moved to standalone test 
+//
+// ****************************************************************************
+void RunBenchmark_H2D(ResultDatabase &resultDB) 
+{
+    // Sizes are in kb
+    int sizes[] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384, 32768,65536,131072,262144,524288};
+    int nSizes  = sizeof(sizes) / sizeof(int);
+
+    long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
+
+    hipSetDevice(p_device);
+
+    // Create some host memory pattern
+    float *hostMem = NULL;
+    if (p_pinned)
+    {
+        hipMallocHost((void**)&hostMem, sizeof(float) * numMaxFloats);
+        while (hipGetLastError() != hipSuccess)
+        {
+ 	    // drop the size and try again
+	    if (p_verbose) std::cout << " - dropping size allocating pinned mem\n";
+	    --nSizes;
+	    if (nSizes < 1)
+	    {
+            std::cerr << "Error: Couldn't allocated any pinned buffer\n";
+		return;
+	    }
+	    numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
+            hipMallocHost((void**)&hostMem, sizeof(float) * numMaxFloats);
+        }
+    }
+    else
+    {
+        hostMem = new float[numMaxFloats];
+    }
+
+    for (int i = 0; i < numMaxFloats; i++)
+    {
+        hostMem[i] = i % 77;
+    }
+
+    float *device;
+    hipMalloc((void**)&device, sizeof(float) * numMaxFloats);
+    while (hipGetLastError() != hipSuccess)
+    {
+	// drop the size and try again
+	if (p_verbose) std::cout << " - dropping size allocating device mem\n";
+	--nSizes;
+	if (nSizes < 1)
+	{
+        std::cerr << "Error: Couldn't allocated any device buffer\n";
+	    return;
+	}
+	numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
+        hipMalloc((void**)&device, sizeof(float) * numMaxFloats);
+    }
+
+
+    hipEvent_t start, stop;
+    hipEventCreate(&start);
+    hipEventCreate(&stop);
+    CHECK_HIP_ERROR();
+
+    // Three passes, forward and backward both
+    for (int pass = 0; pass < p_iterations; pass++)
+    {
+        // store the times temporarily to estimate latency
+        //float times[nSizes];
+        // Step through sizes forward on even passes and backward on odd
+        for (int i = 0; i < nSizes; i++)
+        {
+            int sizeIndex;
+            if ((pass % 2) == 0)
+                sizeIndex = i;
+            else
+                sizeIndex = (nSizes - 1) - i;
+
+            int nbytes = sizes[sizeIndex] * 1024;
+
+            hipEventRecord(start, 0);
+            hipMemcpy(device, hostMem, nbytes, hipMemcpyHostToDevice);
+            hipEventRecord(stop, 0);
+            hipEventSynchronize(stop);
+            float t = 0;
+            hipEventElapsedTime(&t, start, stop);
+            //times[sizeIndex] = t;
+
+            // Convert to GB/sec
+            if (p_verbose)
+            {
+                std::cerr << "size " << sizes[sizeIndex] << "k took " << t <<
+                        " ms\n";
+            }
+
+            double speed = (double(sizes[sizeIndex]) * 1024. / (1000*1000)) / t;
+            char sizeStr[256];
+            sprintf(sizeStr, "% 7dkB", sizes[sizeIndex]);
+            resultDB.AddResult("H2D_Bandwidth", sizeStr, "GB/sec", speed);
+            resultDB.AddResult("H2D_Time", sizeStr, "ms", t);
+        }
+    }
+
+    // Cleanup
+    hipFree((void*)device);
+    CHECK_HIP_ERROR();
+    if (p_pinned)
+    {
+        hipFreeHost((void*)hostMem);
+        CHECK_HIP_ERROR();
+    }
+    else
+    {
+        delete[] hostMem;
+    }
+    hipEventDestroy(start);
+    hipEventDestroy(stop);
+}
+
+
+void RunBenchmark_D2H(ResultDatabase &resultDB)
+{
+
+    // Sizes are in kb
+    int nSizes  = 20;
+    int sizes[20] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,
+		     32768,65536,131072,262144,524288};
+    long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
+
+    // Create some host memory pattern
+    float *hostMem1;
+    float *hostMem2;
+    if (p_pinned)
+    {
+        hipMallocHost((void**)&hostMem1, sizeof(float)*numMaxFloats);
+        hipError_t err1 = hipGetLastError();
+        hipMallocHost((void**)&hostMem2, sizeof(float)*numMaxFloats);
+        hipError_t err2 = hipGetLastError();
+	while (err1 != hipSuccess || err2 != hipSuccess)
+	{
+	    // free the first buffer if only the second failed
+	    if (err1 == hipSuccess)
+	        hipFreeHost((void*)hostMem1);
+
+	    // drop the size and try again
+	    if (p_verbose) std::cout << " - dropping size allocating pinned mem\n";
+	    --nSizes;
+	    if (nSizes < 1)
+	    {
+            std::cerr << "Error: Couldn't allocated any pinned buffer\n";
+		return;
+	    }
+	    numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
+            hipMallocHost((void**)&hostMem1, sizeof(float)*numMaxFloats);
+            err1 = hipGetLastError();
+            hipMallocHost((void**)&hostMem2, sizeof(float)*numMaxFloats);
+            err2 = hipGetLastError();
+	}
+   }
+    else
+    {
+        hostMem1 = new float[numMaxFloats];
+        hostMem2 = new float[numMaxFloats];
+    }
+    for (int i=0; i<numMaxFloats; i++)
+        hostMem1[i] = i % 77;
+
+    float *device;
+    hipMalloc((void**)&device, sizeof(float) * numMaxFloats);
+    while (hipGetLastError() != hipSuccess)
+    {
+	// drop the size and try again
+	if (p_verbose) std::cout << " - dropping size allocating device mem\n";
+	--nSizes;
+	if (nSizes < 1)
+	{
+        std::cerr << "Error: Couldn't allocated any device buffer\n";
+	    return;
+	}
+	numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
+        hipMalloc((void**)&device, sizeof(float) * numMaxFloats);
+    }
+
+    hipMemcpy(device, hostMem1,
+               numMaxFloats*sizeof(float), hipMemcpyHostToDevice);
+    hipDeviceSynchronize();
+
+    hipEvent_t start, stop;
+    hipEventCreate(&start);
+    hipEventCreate(&stop);
+    CHECK_HIP_ERROR();
+
+    // Three passes, forward and backward both
+    for (int pass = 0; pass < p_iterations; pass++)
+    {
+        // store the times temporarily to estimate latency
+        //float times[nSizes];
+        // Step through sizes forward on even passes and backward on odd
+        for (int i = 0; i < nSizes; i++)
+        {
+            int sizeIndex;
+            if ((pass % 2) == 0)
+                sizeIndex = i;
+            else
+                sizeIndex = (nSizes - 1) - i;
+
+            int nbytes = sizes[sizeIndex] * 1024;
+
+            hipEventRecord(start, 0);
+            hipMemcpy(hostMem2, device,
+                       nbytes, hipMemcpyDeviceToHost);
+            hipEventRecord(stop, 0);
+            hipEventSynchronize(stop);
+            float t = 0;
+            hipEventElapsedTime(&t, start, stop);
+            //times[sizeIndex] = t;
+
+            // Convert to GB/sec
+            if (p_verbose)
+            {
+                std::cerr << "size " <<sizes[sizeIndex] << "k took " << t <<
+                        " ms\n";
+            }
+
+            double speed = (double(sizes[sizeIndex]) * 1024. / (1000*1000)) / t;
+            char sizeStr[256];
+            sprintf(sizeStr, "% 7dkB", sizes[sizeIndex]);
+            resultDB.AddResult("D2H_Bandwidth", sizeStr, "GB/sec", speed);
+            resultDB.AddResult("D2H_Time", sizeStr, "ms", t);
+        }
+	//resultDB.AddResult("ReadbackLatencyEstimate", "1-2kb", "ms", times[0]-(times[1]-times[0])/1.);
+	//resultDB.AddResult("ReadbackLatencyEstimate", "1-4kb", "ms", times[0]-(times[2]-times[0])/3.);
+	//resultDB.AddResult("ReadbackLatencyEstimate", "2-4kb", "ms", times[1]-(times[2]-times[1])/1.);
+    }
+
+    // Cleanup
+    hipFree((void*)device);
+    CHECK_HIP_ERROR();
+    if (p_pinned)
+    {
+        hipFreeHost((void*)hostMem1);
+        CHECK_HIP_ERROR();
+        hipFreeHost((void*)hostMem2);
+        CHECK_HIP_ERROR();
+    }
+    else
+    {
+        delete[] hostMem1;
+        delete[] hostMem2;
+        hipEventDestroy(start);
+	    hipEventDestroy(stop);
+    }
+}
+
+
+#define failed(...) \
+    printf ("error: ");\
+    printf (__VA_ARGS__);\
+    printf ("\n");\
+    exit(EXIT_FAILURE);
+
+int parseInt(const char *str, int *output)
+{
+    char *next;
+    *output = strtol(str, &next, 0);
+    return !strlen(next);
+}
+
+void help() {
+};
+
+int parseStandardArguments(int argc, char *argv[])
+{
+    for (int i = 1; i < argc; i++) {
+        const char *arg = argv[i];
+
+        if (!strcmp(arg, " ")) {
+            // skip NULL args.
+        } else if (!strcmp(arg, "--iterations") || (!strcmp(arg, "-i"))) {
+            if (++i >= argc || !parseInt(argv[i], &p_iterations)) {
+               failed("Bad iterations argument"); 
+            }
+        } else if (!strcmp(arg, "--device") || (!strcmp(arg, "-d"))) {
+            if (++i >= argc || !parseInt(argv[i], &p_device)) {
+               failed("Bad device argument"); 
+            }
+        } else if (!strcmp(arg, "--unpinned")) {
+            p_pinned = 0;
+        } else if (!strcmp(arg, "--h2d")) {
+            p_h2d = true;
+            p_d2h = false;
+
+        } else if (!strcmp(arg, "--d2h")) {
+            p_h2d = false;
+            p_d2h = true;
+
+        } else if (!strcmp(arg, "--help")  || (!strcmp(arg, "-h"))) {
+            help();
+
+        } else if (!strcmp(arg, "--verbose")) {
+            p_verbose = 1;
+        } else if (!strcmp(arg, "--detailed")) {
+            p_detailed = 1;
+        } else {
+            failed("Bad argument '%s'", arg);
+        }
+    } 
+
+    return 0;
+};
+
+
+
+int main(int argc, char *argv[])
+{
+    parseStandardArguments(argc, argv);
+
+    if (p_h2d) {
+        ResultDatabase resultDB;
+        RunBenchmark_H2D(resultDB);
+
+        resultDB.DumpSummary(std::cout);
+
+        if (p_detailed) {
+            resultDB.DumpDetailed(std::cout);
+        }
+    }
+
+    if (p_d2h) {
+        ResultDatabase resultDB;
+        RunBenchmark_D2H(resultDB);
+
+        resultDB.DumpSummary(std::cout);
+
+        if (p_detailed) {
+            resultDB.DumpDetailed(std::cout);
+        }
+    }
+}
diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp
index beba7c2775..8a0d0df1d4 100644
--- a/projects/clr/hipamd/src/hip_hcc.cpp
+++ b/projects/clr/hipamd/src/hip_hcc.cpp
@@ -31,6 +31,8 @@ THE SOFTWARE.
 #include <list>
 #include <sys/types.h>
 #include <unistd.h>
+#include <deque>
+
 #include <hc.hpp>
 #include <hc_am.hpp>
 
@@ -38,32 +40,60 @@ THE SOFTWARE.
 
 #include "hsa_ext_amd.h"
 
-#define USE_PINNED_HOST (__hcc_workweek__ >= 1601)
 
-//#define USE_ASYNC_COPY 
+
+#define USE_AM_TRACKER 1  /* >0 = use new AM memory tracker features. */
+#define USE_ROCR_V2    1  /* use the ROCR v2 async copy API with dst and src agents */
+
+#if (USE_AM_TRACKER) and (__hcc_workweek__ < 16074)
+#error (USE_AM_TRACKER requries HCC version of 16074 or newer)
+#endif
+
+
+#if (USE_ROCR_V2) and (USE_AM_TRACKER == 0)
+#error (USE_ROCR_V2 requires USE_AM_TRACKER>0)
+#endif
+
+
 
 #define INLINE static inline
 
 //---
 // Environment variables:
-// TODO-HCC - map this to the HC instruction that uses HSAIL to get the wave size.
-int warpSize = 64;
 
 // Intended to distinguish whether an environment variable should be visible only in debug mode, or in debug+release.
 //static const int debug = 0;
 static const int release = 1;
 
-int HIP_PRINT_ENV = 0;
-int HIP_TRACE_API= 0;
+#define ENABLE_CHECKS 1
+
 int HIP_LAUNCH_BLOCKING = 0;
 
-#define TRACE_API  0x1 /* trace API calls and return values */
-#define TRACE_SYNC 0x2 /* trace synchronization pieces */
-#define TRACE_MEM  0x4 /* trace memory allocation / deallocation */
+int HIP_PRINT_ENV = 0;
+int HIP_TRACE_API= 0;
+int HIP_STAGING_SIZE = 64;   /* size of staging buffers, in KB */
+int HIP_STAGING_BUFFERS = 2;  
+int HIP_STREAM_SIGNALS = 2;  /* number of signals to allocate at stream creation */
+
+
+//---
+// Chicken bits for disabling functionality to work around potential issues:
+int HIP_DISABLE_ENQ_BARRIER = 1;
+int HIP_DISABLE_BIDIR_MEMCPY = 1;
+int HIP_ONESHOT_COPY_DEP     = 1;  // this is a good thing
+
+
+//---
+//Debug flags:
+#define TRACE_API    0x01 /* trace API calls and return values */
+#define TRACE_SYNC   0x02 /* trace synchronization pieces */
+#define TRACE_MEM    0x04 /* trace memory allocation / deallocation */
+#define TRACE_COPY2  0x08 /* trace memory copy commands. Detailed. */
+#define TRACE_SIGNAL 0x10 /* trace signal pool commands */
 
 #define tprintf(trace_level, ...) {\
     if (HIP_TRACE_API & trace_level) {\
-        fprintf (stderr, "hiptrace%d: ", trace_level); \
+        fprintf (stderr, "hiptrace%x: ", trace_level); \
         fprintf (stderr, __VA_ARGS__);\
     }\
 }
@@ -75,21 +105,77 @@ struct ihipDevice_t;
 
 enum ihipCommand_t {
     ihipCommandKernel,
-    ihipCommandData,
+    ihipCommandCopyH2D,
+    ihipCommandCopyD2H,
 };
 
+const char* ihipCommandName[] = {
+    "Kernel", "CopyH2D", "CopyD2H"
+};
+
+
+
+typedef uint64_t SIGSEQNUM;
+
+//---
+// Small wrapper around signals.
+// Designed to be used from stream.
+// TODO-someday refactor this class so it can be stored in a vector<>  
+// we already store the index here so we can use for garbage collection.
+struct ihipSignal_t {
+    hsa_signal_t   _hsa_signal; // hsa signal handle
+    int            _index;      // Index in pool, used for garbage collection.
+    SIGSEQNUM      _sig_id;     // unique sequentially increasing ID.
+
+    ihipSignal_t(); 
+    ~ihipSignal_t();
+
+    inline void release();
+};
+
+
+
+
 // Internal stream structure.
-struct ihipStream_t {
-    unsigned             _device_index;
+class ihipStream_t {
+public:
+
+    ihipStream_t(unsigned device_index, hc::accelerator_view av, unsigned int flags);
+    ~ihipStream_t();
+
+    inline void                 reclaimSignals(SIGSEQNUM sigNum);
+    inline void                 waitAndReclaimOlder(ihipSignal_t *signal);
+    inline void                 wait();
+
+    inline ihipDevice_t *       getDevice() const;
+
+    ihipSignal_t *              getSignal() ;
+
+    inline bool                 preKernelCommand();
+    inline void                 postKernelCommand(hc::completion_future &kernel_future);
+    inline int                  copyCommand(ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType);
+
+    inline void                 resetToEmpty();
+
+    inline SIGSEQNUM            lastCopySeqId() { return _last_copy_signal ? _last_copy_signal->_sig_id : 0; };
+
+    //---
     hc::accelerator_view _av;
     unsigned            _flags;
-    ihipCommand_t        _last_command;
+private:
+    void                        enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal);
 
-    //ihipStream_t() : _av(){ };
-    ihipStream_t(unsigned device_index, hc::accelerator_view av, unsigned int flags) :
-        _device_index(device_index), _av(av), _flags(flags), _last_command(ihipCommandKernel)
-         {};
-} ;
+    unsigned                    _device_index;
+    ihipCommand_t               _last_command_type;  // type of the last command
+    ihipSignal_t                *_last_copy_signal;  // signal of last copy command sent to the stream.  Copy can be either H2D or D2H.
+    hc::completion_future       _last_kernel_future;  // Completion future of last kernel command sent to GPU.
+    
+    int                         _signalCursor;
+    
+    SIGSEQNUM                   _stream_sig_id;      // Monotonically increasing unique signal id.
+    SIGSEQNUM                   _oldest_live_sig_id; // oldest live seq_id, anything < this can be allocated.
+    std::deque<ihipSignal_t>    _signalPool;   // Pool of signals for use by this stream.
+};
 
 
 
@@ -112,9 +198,34 @@ struct ihipEvent_t {
 
     hc::completion_future _marker;
     uint64_t              _timestamp;  // store timestamp, may be set on host or by marker.
+
+    SIGSEQNUM             _copy_seq_id;
 } ;
 
 
+//-------------------------------------------------------------------------------------------------
+struct StagingBuffer {
+
+    static const int _max_buffers = 4;
+
+    StagingBuffer(ihipDevice_t *device, size_t bufferSize, int numBuffers) ;
+    ~StagingBuffer();
+
+    void CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor);
+    void CopyHostToDevice(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor);
+
+private:
+    ihipDevice_t    *_device;
+    size_t          _bufferSize;  // Size of the buffers.
+    int             _numBuffers;
+
+    char            *_pinnedStagingBuffer[_max_buffers];
+    hsa_signal_t     _completion_signal[_max_buffers];
+};
+
+
+
+//-------------------------------------------------------------------------------------------------
 struct ihipDevice_t
 {
     unsigned                _device_index; // index into g_devices.
@@ -131,20 +242,305 @@ struct ihipDevice_t
 
     unsigned                _compute_units;
 
+    hsa_signal_t             _copy_signal;         // signal to use for copies
+    std::mutex               _copy_lock[2];      // mutex for each direction.
+    StagingBuffer           *_staging_buffer[2]; // one buffer for each direction.
+
 public:
-    ihipDevice_t(unsigned device_index, hc::accelerator acc);
+    void reset();
+    void init(unsigned device_index, hc::accelerator acc);
     hipError_t getProperties(hipDeviceProp_t* prop);
 
-    // TODO- create a copy constructor.
-    //~ihipDevice_t();
+    ~ihipDevice_t();
 };
 
 
 //=================================================================================================
-ihipDevice_t::ihipDevice_t(unsigned device_index, hc::accelerator acc)
-    :   _device_index(device_index),
-        _acc(acc)
+// Global Data Structures:
+//=================================================================================================
+//TLS - must be initialized here.
+thread_local hipError_t tls_lastHipError = hipSuccess;
+thread_local int tls_defaultDevice = 0;
+
+// Global initialization.
+std::once_flag hip_initialized;
+ihipDevice_t *g_devices; // Array of all non-emulated (ie GPU) accelerators in the system.
+unsigned g_deviceCnt;
+//=================================================================================================
+
+
+//=================================================================================================
+//Forward Declarations:
+//=================================================================================================
+INLINE bool ihipIsValidDevice(unsigned deviceIndex);
+
+//=================================================================================================
+// Implementation:
+//=================================================================================================
+
+
+//=================================================================================================
+// ihipSignal_t:
+//=================================================================================================
+//
+//---
+ihipSignal_t::ihipSignal_t() :  _sig_id(0) 
 {
+  if (hsa_signal_create(0/*value*/, 0, NULL, &_hsa_signal) != HSA_STATUS_SUCCESS) {
+      throw;
+  }
+  tprintf (TRACE_SIGNAL, "  allocated hsa_signal=%lu\n", (_hsa_signal.handle));
+}
+
+//---
+ihipSignal_t::~ihipSignal_t() 
+{
+    tprintf (TRACE_SIGNAL, "  destroy hsa_signal #%lu (#%lu)\n", (_hsa_signal.handle), _sig_id);
+    if (hsa_signal_destroy(_hsa_signal) != HSA_STATUS_SUCCESS) {
+        throw; // TODO
+    }
+};
+
+
+
+//=================================================================================================
+// ihipStream_t:
+//=================================================================================================
+//---
+ihipStream_t::ihipStream_t(unsigned device_index, hc::accelerator_view av, unsigned int flags) :
+    _av(av), 
+    _flags(flags), 
+    _device_index(device_index), 
+    _last_copy_signal(0),
+    _signalCursor(0),
+    _stream_sig_id(0),
+    _oldest_live_sig_id(1)
+{
+    tprintf(TRACE_SYNC, " streamCreate: stream=%p\n", this);
+    _signalPool.resize(HIP_STREAM_SIGNALS > 0 ? HIP_STREAM_SIGNALS : 1);
+
+    resetToEmpty();
+};
+
+
+//---
+ihipStream_t::~ihipStream_t()
+{
+    _signalPool.clear();
+}
+
+
+//---
+// Reset the stream to "empty" - next command will not set up an inpute dependency on any older signal.
+void ihipStream_t::resetToEmpty() 
+{
+    _last_command_type = ihipCommandCopyH2D;
+    _last_copy_signal = NULL;
+}
+
+//---
+void ihipStream_t::reclaimSignals(SIGSEQNUM sigNum)
+{
+    tprintf(TRACE_SIGNAL, "reclaim signal #%lu\n", sigNum);
+    // Mark all signals older and including this one as available for 
+    _oldest_live_sig_id = sigNum+1;
+}
+
+
+//---
+void ihipStream_t::waitAndReclaimOlder(ihipSignal_t *signal) 
+{
+    hsa_signal_wait_acquire(_last_copy_signal->_hsa_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE);
+
+    reclaimSignals(_last_copy_signal->_sig_id); 
+
+}
+
+
+//---
+//Wait for all queues kernels in the associated accelerator_view to complete.
+void ihipStream_t::wait() 
+{
+    tprintf (TRACE_SYNC, "stream %p wait for queue-empty and lastCopy:#%lu...\n", this, _last_copy_signal ? _last_copy_signal->_sig_id: 0x0 );
+    _av.wait();
+    if (_last_copy_signal) {
+        this->waitAndReclaimOlder(_last_copy_signal);
+    }
+
+    resetToEmpty();
+};
+
+
+//---
+inline ihipDevice_t * ihipStream_t::getDevice() const 
+{ 
+    if (ihipIsValidDevice(_device_index)) {
+        return &g_devices[_device_index]; 
+    } else {
+        return NULL;
+    }
+};
+
+
+//---
+// Allocate a new signal from the signal pool.
+// Returned signals have value of 0.
+// Signals are intended for use in this stream and are always reclaimed "in-order".
+ihipSignal_t *ihipStream_t::getSignal() 
+{
+    int numToScan = _signalPool.size();
+    do {
+        auto thisCursor = _signalCursor;
+        if (++_signalCursor == _signalPool.size()) {
+            _signalCursor = 0;
+        }
+
+        if (_signalPool[thisCursor]._sig_id < _oldest_live_sig_id) {
+            _signalPool[thisCursor]._index = thisCursor;
+            _signalPool[thisCursor]._sig_id  =  ++_stream_sig_id;  // allocate it.
+
+
+            return &_signalPool[thisCursor];
+        }
+
+    } while (--numToScan) ;
+
+    assert(numToScan == 0);
+
+    // Have to grow the pool:
+    _signalCursor = _signalPool.size(); // set to the beginning of the new entries:
+    _signalPool.resize(_signalPool.size() * 2);
+    tprintf (TRACE_SIGNAL, "grow signal pool to %zu entries, cursor=%d\n", _signalPool.size(), _signalCursor);
+    return getSignal();  // try again, 
+
+    // Should never reach here.
+    assert(0);
+}
+
+
+//---
+void ihipStream_t::enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal) 
+{
+
+    // Obtain the write index for the command queue
+    uint64_t index = hsa_queue_load_write_index_relaxed(queue);
+    const uint32_t queueMask = queue->size - 1;
+
+    // Define the barrier packet to be at the calculated queue index address
+    hsa_barrier_and_packet_t* barrier = &(((hsa_barrier_and_packet_t*)(queue->base_address))[index&queueMask]);
+    memset(barrier, 0, sizeof(hsa_barrier_and_packet_t));
+
+    // setup header
+    uint16_t header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
+    header |= 1 << HSA_PACKET_HEADER_BARRIER;
+    //header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
+    //header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
+    barrier->header = header;
+
+    barrier->dep_signal[0] = depSignal->_hsa_signal;
+
+    barrier->completion_signal.handle = 0;
+
+    // TODO - check queue overflow, return error:
+    // Increment write index and ring doorbell to dispatch the kernel
+    hsa_queue_store_write_index_relaxed(queue, index+1);
+    hsa_signal_store_relaxed(queue->doorbell_signal, index);
+}
+
+
+//--
+//When the commands in a stream change types (ie kernel command follows a data command,
+//or data command follows a kernel command), then we need to add a barrier packet
+//into the stream to mimic CUDA stream semantics.  (some hardware uses separate
+//queues for data commands and kernel commands, and no implicit ordering is provided).
+//
+inline bool ihipStream_t::preKernelCommand()
+{
+    bool addedSync = false;
+    // If switching command types, we need to add a barrier packet to synchronize things.
+    if (_last_command_type != ihipCommandKernel) {
+        if (_last_copy_signal) {
+            addedSync = true;
+
+            hsa_queue_t * q =  (hsa_queue_t*)_av.get_hsa_queue();
+            if (! HIP_DISABLE_ENQ_BARRIER) {
+                this->enqueueBarrier(q, _last_copy_signal);
+                tprintf (TRACE_SYNC, "stream %p switch %s to %s (barrier pkt inserted with wait on #%lu)\n", 
+                        this, ihipCommandName[_last_command_type], ihipCommandName[ihipCommandKernel], _last_copy_signal->_sig_id)
+
+            } else {
+                tprintf (TRACE_SYNC, "stream %p switch %s to %s (wait for previous...)\n", 
+                        this, ihipCommandName[_last_command_type], ihipCommandName[ihipCommandKernel]);
+                this->waitAndReclaimOlder(_last_copy_signal);
+            }
+        }
+        _last_command_type = ihipCommandKernel;
+    }
+
+    return addedSync;
+}
+
+
+//---
+inline void ihipStream_t::postKernelCommand(hc::completion_future &kernelFuture)
+{
+    _last_kernel_future = kernelFuture;
+};
+
+
+
+//---
+// Called whenever a copy command is set to the stream.
+// Examines the last command sent to this stream and returns a signal to wait on, if required.
+inline int ihipStream_t::copyCommand(ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType)
+{
+    int needSync = 0;
+
+    waitSignal->handle = 0;
+    // If switching command types, we need to add a barrier packet to synchronize things.
+    if (_last_command_type != copyType) {
+
+
+        if (_last_command_type == ihipCommandKernel) {
+            tprintf (TRACE_SYNC, "stream %p switch %s to %s (async copy dep on prev kernel)\n", 
+                    this, ihipCommandName[_last_command_type], ihipCommandName[copyType]);
+            needSync = 1;
+            hsa_signal_t *hsaSignal = (static_cast<hsa_signal_t*> (_last_kernel_future.get_native_handle()));
+            if (hsaSignal) {
+                *waitSignal = * hsaSignal;
+            }
+        } else if (_last_copy_signal) {
+            needSync = 1;
+            tprintf (TRACE_SYNC, "stream %p switch %s to %s (async copy dep on other copy #%lu)\n", 
+                    this, ihipCommandName[_last_command_type], ihipCommandName[copyType], _last_copy_signal->_sig_id);
+            *waitSignal = _last_copy_signal->_hsa_signal;
+        }
+
+        _last_command_type = copyType;
+    }
+
+    _last_copy_signal = lastCopy;
+
+    return needSync;
+}
+
+
+//=================================================================================================
+//
+//Reset the device - this is called from hipDeviceReset.  
+//Device may be reset multiple times, and may be reset after init.
+void ihipDevice_t::reset() 
+{
+    _staging_buffer[0] = new StagingBuffer(this, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS);
+    _staging_buffer[1] = new StagingBuffer(this, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS);
+};
+
+
+//---
+void ihipDevice_t::init(unsigned device_index, hc::accelerator acc)
+{
+    _device_index = device_index;
+    _acc = acc;
     hsa_agent_t *agent = static_cast<hsa_agent_t*> (acc.get_default_view().get_hsa_agent());
     if (agent) {
         int err = hsa_agent_get_info(*agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &_compute_units);
@@ -162,30 +558,30 @@ ihipDevice_t::ihipDevice_t(unsigned device_index, hc::accelerator acc)
     _null_stream = new ihipStream_t(device_index, acc.get_default_view(), hipStreamDefault);
     this->_streams.push_back(_null_stream);
     tprintf(TRACE_SYNC, "created device with null_stream=%p\n", _null_stream);
+
+    hsa_signal_create(0, 0, NULL, &_copy_signal);
+
+    this->reset();
 };
 
-#if 0
+
 ihipDevice_t::~ihipDevice_t()
 {
     if (_null_stream) {
         delete _null_stream;
         _null_stream = NULL;
     }
+
+    for (int i=0; i<2; i++) {
+        if (_staging_buffer[i]) {
+            delete _staging_buffer[i];
+        }
+    }
+    hsa_signal_destroy(_copy_signal);
 }
-#endif
 
 //----
 
-//=================================================================================================
-//TLS - must be initialized here.
-thread_local hipError_t tls_lastHipError = hipSuccess;
-thread_local int tls_defaultDevice = 0;
-
-// Global initialization.
-std::once_flag hip_initialized;
-std::vector<ihipDevice_t> g_devices; // Vector of all non-emulated (ie GPU) accelerators in the system.
-
-//=================================================================================================
 
 
 
@@ -338,8 +734,7 @@ hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop)
     DeviceErrorCheck(err);
     prop->l2CacheSize = cache_size[1];
 
-    /* Computemode for HSA Devices is always : cudaComputeModeDefault :/
-                             Default compute mode (Multiple threads can use cudaSetDevice() with this device)  */
+    /* Computemode for HSA Devices is always : cudaComputeModeDefault */
     prop->computeMode = 0;
 
     // Get Max Threads Per Multiprocessor
@@ -464,24 +859,44 @@ void ihipReadEnv_I(int *var_ptr, const char *var_name1, const char *var_name2, c
 //It is called with C++11 call_once, which provided thread-safety.
 void ihipInit()
 {
+    /*
+     * Environment variables
+     */
+    READ_ENV_I(release, HIP_PRINT_ENV, 0,  "Print HIP environment variables.");  
+    //-- READ HIP_PRINT_ENV env first, since it has impact on later env var reading
+
+    READ_ENV_I(release, HIP_LAUNCH_BLOCKING, CUDA_LAUNCH_BLOCKING, "Make HIP APIs 'host-synchronous', so they block until any kernel launches or data copy commands complete. Alias: CUDA_LAUNCH_BLOCKING." );
+    READ_ENV_I(release, HIP_TRACE_API, 0,  "Trace each HIP API call.  Print function name and return code to stderr as program executes.");
+    READ_ENV_I(release, HIP_STAGING_SIZE, 0, "Size of each staging buffer (in KB)" );
+    READ_ENV_I(release, HIP_STAGING_BUFFERS, 0, "Number of staging buffers to use in each direction");
+    READ_ENV_I(release, HIP_STREAM_SIGNALS, 0, "Number of signals to allocate when new stream is created (signal pool will grow on demand)");
+
+    READ_ENV_I(release, HIP_DISABLE_ENQ_BARRIER, 0, "Disable enqueue of barrier packet - instead wait for copy completion on host.");
+    READ_ENV_I(release, HIP_DISABLE_BIDIR_MEMCPY, 0, "Disable simultaneous H2D memcpy and D2H memcpy to same device");
+    READ_ENV_I(release, HIP_ONESHOT_COPY_DEP, 0, "If set, only set the copy input dependency for the first copy command in a staged copy.  If clear, set the dep for each copy.");
 
     /*
      * Build a table of valid compute devices.
      */
     auto accs = hc::accelerator::get_all();
-    g_devices.reserve(accs.size());
+    int deviceCnt = 0;
     for (int i=0; i<accs.size(); i++) {
         if (! accs[i].get_is_emulated()) {
-           g_devices.emplace_back(ihipDevice_t(g_devices.size(), accs[i]));
+            deviceCnt++;
+        }
+    };
+
+    g_devices = new ihipDevice_t[deviceCnt];
+    g_deviceCnt = 0;
+    for (int i=0; i<accs.size(); i++) {
+        if (! accs[i].get_is_emulated()) {
+           g_devices[g_deviceCnt].init(g_deviceCnt, accs[i]);
+           g_deviceCnt++;
         }
     }
 
-    /*
-     * Environment variables
-     */
-    READ_ENV_I(release, HIP_PRINT_ENV, 0,  "Print HIP environment variables.");
-    READ_ENV_I(release, HIP_TRACE_API, 0,  "Trace each HIP API call.  Print function name and return code to stderr as program executes.");
-    READ_ENV_I(release, HIP_LAUNCH_BLOCKING, CUDA_LAUNCH_BLOCKING, "Make HIP APIs 'host-synchronous', so they block until any kernel launches or data copy commands complete. Alias: CUDA_LAUNCH_BLOCKING." );
+    assert(deviceCnt == g_deviceCnt);
+
 
     tprintf(TRACE_API, "pid=%u %-30s\n", getpid(), "<ihipInit>");
 
@@ -490,7 +905,7 @@ void ihipInit()
 INLINE bool ihipIsValidDevice(unsigned deviceIndex)
 {
     // deviceIndex is unsigned so always > 0
-    return (deviceIndex < g_devices.size());
+    return (deviceIndex < g_deviceCnt);
 }
 
 
@@ -509,7 +924,7 @@ INLINE ihipDevice_t *ihipGetTlsDefaultDevice()
 //---
 INLINE ihipDevice_t *ihipGetDevice(int deviceId)
 {
-    if ((deviceId >= 0) && (deviceId < g_devices.size())) {
+    if ((deviceId >= 0) && (deviceId < g_deviceCnt)) {
         return &g_devices[deviceId];
     } else {
         return NULL;
@@ -524,7 +939,7 @@ static inline void ihipWaitAllStreams(ihipDevice_t *device)
 {
     tprintf(TRACE_SYNC, "waitAllStream\n");
     for (auto streamI=device->_streams.begin(); streamI!=device->_streams.end(); streamI++) {
-        (*streamI)->_av.wait();
+        (*streamI)->wait();
     }
 }
 
@@ -540,7 +955,7 @@ inline void ihipWaitNullStream(ihipDevice_t *device)
         if (!(stream->_flags & hipStreamNonBlocking)) {
             // TODO-hcc - use blocking or active wait here?
             // TODO-sync - cudaDeviceBlockingSync
-            stream->_av.wait();
+            stream->wait();
         }
     }
 }
@@ -563,78 +978,35 @@ inline hipStream_t ihipSyncAndResolveStream(hipStream_t stream)
     }
 }
 
-#if 0
-inline hsa_status_t
-HSABarrier::enqueueBarrier(hsa_queue_t* queue) {
-    hsa_status_t status = HSA_STATUS_SUCCESS;
-
-    hc::completion_future marker = stream->_av.create_marker();
-
-    // Create a signal to wait for the barrier to finish.
-    std::pair<hsa_signal_t, int> ret = Kalmar::ctx.getSignal();
-    signal = ret.first;
-    signalIndex = ret.second;
-
-    // Obtain the write index for the command queue
-    uint64_t index = hsa_queue_load_write_index_relaxed(queue);
-    const uint32_t queueMask = queue->size - 1;
-
-    // Define the barrier packet to be at the calculated queue index address
-    hsa_barrier_and_packet_t* barrier = &(((hsa_barrier_and_packet_t*)(queue->base_address))[index&queueMask]);
-    memset(barrier, 0, sizeof(hsa_barrier_and_packet_t));
-
-    // setup header
-    uint16_t header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
-    header |= 1 << HSA_PACKET_HEADER_BARRIER;
-    header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
-    header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
-    barrier->header = header;
-
-    barrier->completion_signal = signal;
-
-    // Increment write index and ring doorbell to dispatch the kernel
-    hsa_queue_store_write_index_relaxed(queue, index+1);
-    hsa_signal_store_relaxed(queue->doorbell_signal, index);
-
-    isDispatched = true;
-
-    return status;
-}
-#endif
-
-//--
-//When the commands in a stream change types (ie kernel command follows a data command,
-//or data command follows a kernel command), then we need to add a barrier packet
-//into the stream to mimic CUDA stream semantics.  (some hardware uses separate
-//queues for data commands and kernel commands, and no implicit ordering is provided).
-//
-inline bool ihipCheckCommandSwitchSync(hipStream_t stream, ihipCommand_t new_command, hc::completion_future *marker)
-{
-    bool addedSync = false;
-    // If switching command types, we need to add a barrier packet to synchronize things.
-    if (stream->_last_command != new_command) {
-        addedSync = true;
-        *marker = stream->_av.create_marker();
-
-        tprintf (TRACE_SYNC, "stream %p switch to %s (barrier pkt inserted)\n", (void*)stream, new_command == ihipCommandKernel ? "Kernel" : "Data");
-        stream->_last_command = new_command;
-    }
-
-    return addedSync;
-}
 
 
+
+
+
+
+// TODO - data-up to data-down:
 // Called just before a kernel is launched from hipLaunchKernel.
 // Allows runtime to track some information about the stream.
-hc::accelerator_view *ihipLaunchKernel(hipStream_t stream)
+hipStream_t ihipPreLaunchKernel(hipStream_t stream, hc::accelerator_view **av)
 {
-
     stream = ihipSyncAndResolveStream(stream);
 
-    hc::completion_future marker;
-    ihipCheckCommandSwitchSync(stream, ihipCommandKernel, &marker);
+    stream->preKernelCommand();
 
-    return &(stream->_av);
+    *av = &stream->_av;
+
+    return (stream);
+}
+
+
+//---
+//Called after kernel finishes execution.
+void ihipPostLaunchKernel(hipStream_t stream, hc::completion_future &kernelFuture)
+{
+    stream->postKernelCommand(kernelFuture);
+    if (HIP_LAUNCH_BLOCKING) {
+        tprintf(TRACE_SYNC, " stream:%p LAUNCH_BLOCKING for kernel completion\n", stream);
+    }
 }
 
 
@@ -676,7 +1048,7 @@ hipError_t hipGetDeviceCount(int *count)
 {
     std::call_once(hip_initialized, ihipInit);
 
-    *count = g_devices.size();
+    *count = g_deviceCnt;
 
     if (*count > 0) {
         return ihipLogStatus(hipSuccess);
@@ -765,7 +1137,7 @@ hipError_t hipSetDevice(int device)
 {
     std::call_once(hip_initialized, ihipInit);
 
-    if ((device < 0) || (device > g_devices.size())) {
+    if ((device < 0) || (device >= g_deviceCnt)) {
         return ihipLogStatus(hipErrorInvalidDevice);
     } else {
         tls_defaultDevice = device;
@@ -802,6 +1174,14 @@ hipError_t hipDeviceReset(void)
     // It should destroy and clean up all resources allocated with the default device in the current process.
     // and needs to destroy all queues as well.
     //
+#if USE_AM_TRACKER
+    // TODO - remove bug above.
+    ihipDevice_t *device = ihipGetTlsDefaultDevice();
+    if (device) {
+        am_memtracker_reset(device->_acc);
+        device->reset(); // re-allocate required resources.
+    }
+#endif
 
     return ihipLogStatus(hipSuccess);
 }
@@ -941,6 +1321,7 @@ const char *hipGetErrorName(hipError_t hip_error)
         case hipErrorInvalidValue           : return "hipErrorInvalidValue";
         case hipErrorInvalidResourceHandle  : return "hipErrorInvalidResourceHandle";
         case hipErrorInvalidDevice          : return "hipErrorInvalidDevice";
+        case hipErrorInvalidMemcpyDirection : return "hipErrorInvalidMemcpyDirection";
         case hipErrorNoDevice               : return "hipErrorNoDevice";
         case hipErrorNotReady               : return "hipErrorNotReady";
         case hipErrorUnknown                : return "hipErrorUnknown";
@@ -1004,7 +1385,7 @@ hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int
 
     {
         // Super-conservative version of this - TODO - remove me:
-        stream->_av.wait();
+        stream->wait();
         e = hipSuccess;
     }
 
@@ -1022,7 +1403,7 @@ hipError_t hipStreamSynchronize(hipStream_t stream)
         ihipDevice_t *device = ihipGetTlsDefaultDevice();
         ihipWaitNullStream(device);
     } else {
-        stream->_av.wait();
+        stream->wait();
         e = hipSuccess;
     }
 
@@ -1041,20 +1422,16 @@ hipError_t hipStreamDestroy(hipStream_t stream)
 
     hipError_t e = hipSuccess;
 
-    if (ihipIsValidDevice(stream->_device_index)) {
-
-        ihipDevice_t *device = &g_devices[stream->_device_index];
+    ihipDevice_t *device = stream->getDevice();
 
+    if (device) {
         device->_streams.remove(stream);
-
         delete stream;
-
-        e = hipSuccess;
     } else {
         e = hipErrorInvalidResourceHandle;
     }
 
-    return ihipLogStatus(hipSuccess);
+    return ihipLogStatus(e);
 }
 
 
@@ -1095,6 +1472,8 @@ hipError_t hipEventCreateWithFlags(hipEvent_t* event, unsigned flags)
         eh->_state  = hipEventStatusCreated;
         eh->_stream = NULL;
         eh->_flags  = flags;
+        eh->_timestamp  = 0;
+        eh->_copy_seq_id  = 0;
     } else {
         e = hipErrorInvalidValue;
     }
@@ -1129,6 +1508,7 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream)
             // Clear timestamps
             eh->_timestamp = 0;
             eh->_marker = stream->_av.create_marker();
+            eh->_copy_seq_id = stream->lastCopySeqId();
 
             return ihipLogStatus(hipSuccess);
         }
@@ -1176,6 +1556,8 @@ hipError_t hipEventSynchronize(hipEvent_t event)
 #else
             eh->_marker.wait();
 #endif
+            eh->_stream->reclaimSignals(eh->_copy_seq_id);
+
             return ihipLogStatus(hipSuccess);
         }
     } else {
@@ -1191,7 +1573,7 @@ void ihipSetTs(hipEvent_t e)
         // already recorded, done:
         return;
     } else {
-        // Test this code:
+        // TODO - use completion-future functions to obtain ticks and timestamps:
         hsa_signal_t *sig  = static_cast<hsa_signal_t*> (eh->_marker.get_native_handle());
         if (sig) {
             if (hsa_signal_load_acquire(*sig) == 0) {
@@ -1272,13 +1654,95 @@ hipError_t hipEventQuery(hipEvent_t event)
 // Memory
 //
 //
+//
+
+//---
+/**
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDevice
+ */
+hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) 
+{
+    std::call_once(hip_initialized, ihipInit);
+
+    hipError_t e = hipSuccess;
+
+#if USE_AM_TRACKER
+    hc::accelerator acc;
+    hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0);
+    am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr);
+    if (status == AM_SUCCESS) {
+
+        attributes->memoryType    = amPointerInfo._isInDeviceMem ? hipMemoryTypeDevice: hipMemoryTypeHost;
+        attributes->hostPointer   = amPointerInfo._hostPointer;
+        attributes->devicePointer = amPointerInfo._devicePointer;
+        attributes->isManaged     = 0;
+
+        attributes->allocationFlags = amPointerInfo._appAllocationFlags;
+        attributes->device          = amPointerInfo._appId;
+
+        if (attributes->device < 0) {
+            e = hipErrorInvalidDevice;
+        }
+
+
+    } else {
+        attributes->memoryType    = hipMemoryTypeDevice;
+        attributes->hostPointer   = 0;
+        attributes->devicePointer = 0;
+        attributes->device        = -1;
+        attributes->isManaged     = 0;
+        attributes->allocationFlags = 0;
+
+        e = hipErrorInvalidValue;
+    }
+#else
+    e = hipErrorInvalidValue;
+#endif
+
+    return ihipLogStatus(e);
+}
+
+
+#if USE_AM_TRACKER
+// TODO - test this function:
+/**
+ * @returns #hipSuccess, 
+ * @returns #hipErrorInvalidValue if flags are not 0
+ * @returns #hipErrorMemoryAllocation if hostPointer is not a tracked allocation.
+ */
+hipError_t hipHostGetDevicePointer(void **devicePointer, void *hostPointer, unsigned flags)
+{
+    std::call_once(hip_initialized, ihipInit);
+
+    hipError_t e = hipSuccess;
+
+    // Flags must be 0:
+    if (flags == 0) {
+        e = hipErrorInvalidValue;
+    } else {
+        hc::accelerator acc;
+        hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0);
+        am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, hostPointer);
+        if (status == AM_SUCCESS) {
+            *devicePointer = amPointerInfo._devicePointer;
+        } else {
+            e = hipErrorMemoryAllocation;  
+            *devicePointer = NULL;
+        }
+    }
+
+    return ihipLogStatus(e);
+}
+#endif
+
+
 
 // kernel for launching memcpy operations:
 template <typename T>
 hc::completion_future
 ihipMemcpyKernel(hipStream_t stream, T * c, const T * a, size_t sizeBytes)
 {
-    int wg = std::min((unsigned)8, g_devices[stream->_device_index]._compute_units);
+    int wg = std::min((unsigned)8, stream->getDevice()->_compute_units);
     const int threads_per_wg = 256;
 
     int threads = wg * threads_per_wg;
@@ -1315,7 +1779,7 @@ template <typename T>
 hc::completion_future
 ihipMemsetKernel(hipStream_t stream, T * ptr, T val, size_t sizeBytes)
 {
-    int wg = std::min((unsigned)8, g_devices[stream->_device_index]._compute_units);
+    int wg = std::min((unsigned)8, stream->getDevice()->_compute_units);
     const int threads_per_wg = 256;
 
     int threads = wg * threads_per_wg;
@@ -1348,24 +1812,33 @@ ihipMemsetKernel(hipStream_t stream, T * ptr, T val, size_t sizeBytes)
 }
 
 //---
+/**
+ * @returns #hipSuccess #hipErrorMemoryAllocation 
+ */
 hipError_t hipMalloc(void** ptr, size_t sizeBytes)
 {
     std::call_once(hip_initialized, ihipInit);
 
     hipError_t  hip_status = hipSuccess;
 
-    const unsigned am_flags = 0;
-    *ptr = hc::am_alloc(sizeBytes, ihipGetTlsDefaultDevice()->_acc, am_flags);
+	auto device = ihipGetTlsDefaultDevice();
 
-    if (*ptr == NULL) {
-        hip_status = hipErrorMemoryAllocation;
+    if (device) {
+        const unsigned am_flags = 0;
+        *ptr = hc::am_alloc(sizeBytes, device->_acc, am_flags);
+
+        if (sizeBytes && (*ptr == NULL)) {
+            hip_status = hipErrorMemoryAllocation;
+        } else {
+#if USE_AM_TRACKER
+            hc::am_memtracker_update(*ptr, device->_device_index, 0);
+#endif
+        }
     } else {
-        hip_status = hipSuccess;
+        hip_status = hipErrorMemoryAllocation;
     }
 
-    ihipLogStatus(hip_status);
-
-    return hip_status;
+    return ihipLogStatus(hip_status);
 }
 
 
@@ -1373,40 +1846,29 @@ hipError_t hipMallocHost(void** ptr, size_t sizeBytes)
 {
     std::call_once(hip_initialized, ihipInit);
 
-#if USE_PINNED_HOST
+    hipError_t  hip_status = hipSuccess;
 
     const unsigned am_flags = amHostPinned;
+	auto device = ihipGetTlsDefaultDevice();
 
-    *ptr = hc::am_alloc(sizeBytes, ihipGetTlsDefaultDevice()->_acc, am_flags);
-    hipError_t  hip_status = hipSuccess;
-    if (*ptr == NULL) {
-        hip_status = hipErrorMemoryAllocation;
-    } else {
-        hip_status = hipSuccess;
-    }
-
-    tprintf (TRACE_MEM, "  %s: pinned ptr=%p\n", __func__, *ptr);
-
-    ihipLogStatus(hip_status);
-
-    return hip_status;
-
-#else
-    // TODO-hcc remove-me
-
-	// This code only works on Kaveri:
-    *ptr = malloc(sizeBytes); // TODO - call am_alloc for device memory, this will only on KV HSA.
-    if (*ptr != NULL) {
-        //TODO-hsart : need memory pin APIs to implement this correctly.
-        // FOr now do our best to allocate the memory, but return an error since
-        // the returned pointer can only be used on the HOST not the GPU.
-        return ihipLogStatus(hipErrorMemoryAllocation);
-    } else {
-        return ihipLogStatus(hipErrorMemoryAllocation);
-    }
+    if (device) {
+        *ptr = hc::am_alloc(sizeBytes, device->_acc, am_flags);
+        if (sizeBytes && (*ptr == NULL)) {
+            hip_status = hipErrorMemoryAllocation;
+        } else {
+#if USE_AM_TRACKER
+            hc::am_memtracker_update(*ptr, device->_device_index, 0);
 #endif
+        }
+
+        tprintf (TRACE_MEM, "  %s: pinned ptr=%p\n", __func__, *ptr);
+    }
+
+    return ihipLogStatus(hip_status);
+
 }
 
+//---
 hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t count, size_t offset, hipMemcpyKind kind)
 {
 #ifdef USE_MEMCPYTOSYMBOL
@@ -1416,8 +1878,9 @@ hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t cou
 	}
 	auto device = ihipGetTlsDefaultDevice();
 
-    hc::completion_future marker;
-    ihipCheckCommandSwitchSync(device._null_stream, ihipCommandData, &marker);
+    //hsa_signal_t depSignal;
+    //int depSignalCnt = device._null_stream->copyCommand(NULL, &depSignal, ihipCommandCopyH2D);
+    assert(0);  // Need to properly synchronize the copy - do something with depSignal if != NULL.
 
 	device->_acc.memcpy_symbol(symbolName, (void*) src,count, offset);
 #endif
@@ -1425,6 +1888,247 @@ hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t cou
 }
 
 
+//-------------------------------------------------------------------------------------------------
+StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize, int numBuffers) :
+    _device(device),
+    _bufferSize(bufferSize),
+    _numBuffers(numBuffers > _max_buffers ? _max_buffers : numBuffers)
+{
+    
+
+    
+    for (int i=0; i<_numBuffers; i++) {
+        // TODO - experiment with alignment here.
+        _pinnedStagingBuffer[i] = hc::am_alloc(_bufferSize, device->_acc, amHostPinned);
+        if (_pinnedStagingBuffer[i] == NULL) {
+            throw;
+        }
+        hsa_signal_create(0, 0, NULL, &_completion_signal[i]);
+    }
+};
+
+//---
+StagingBuffer::~StagingBuffer()
+{
+    for (int i=0; i<_numBuffers; i++) {
+        if (_pinnedStagingBuffer[i]) {
+            hc::am_free(_pinnedStagingBuffer[i]);
+            _pinnedStagingBuffer[i] = NULL;
+        }
+        hsa_signal_destroy(_completion_signal[i]);
+    }
+}
+
+
+//---
+//Copies sizeBytes from src to dst, using either a copy to a staging buffer or a staged pin-in-place strategy
+//IN: dst - dest pointer - must be accessible from host CPU.
+//IN: src - src pointer for copy.  Must be accessible from agent this buffer is associated with (via _device)
+//IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved.  May be NULL indicating no dependency.
+void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) 
+{
+    const char *srcp = static_cast<const char*> (src); 
+    char *dstp = static_cast<char*> (dst); 
+
+    for (int i=0; i<_numBuffers; i++) {
+        hsa_signal_store_relaxed(_completion_signal[i], 0);
+    }
+
+    assert(sizeBytes < UINT64_MAX/2); // TODO
+    int bufferIndex = 0;
+    for (int64_t bytesRemaining=sizeBytes; bytesRemaining>0 ;  bytesRemaining -= _bufferSize) {
+
+        size_t theseBytes = (bytesRemaining > _bufferSize) ? _bufferSize : bytesRemaining;
+
+        tprintf (TRACE_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completion_signal[bufferIndex].handle);
+        hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); 
+
+        tprintf (TRACE_COPY2, "H2D: bytesRemaining=%zu: copy %zu bytes %p to stagingBuf[%d]:%p\n", bytesRemaining, theseBytes, srcp, bufferIndex, _pinnedStagingBuffer[bufferIndex]);
+        // TODO - use uncached memcpy, someday.
+        memcpy(_pinnedStagingBuffer[bufferIndex], srcp, theseBytes);
+
+
+        hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1);
+
+#if USE_ROCR_V2
+        hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _device->_hsa_agent, _pinnedStagingBuffer[bufferIndex], _device->_hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]);
+#else
+        hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _pinnedStagingBuffer[bufferIndex], theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[bufferIndex]);
+#endif
+        tprintf (TRACE_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status);
+
+        assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw 
+
+        srcp += theseBytes;
+        dstp += theseBytes;
+        if (++bufferIndex >= _numBuffers) {
+            bufferIndex = 0;
+        }
+
+        if (HIP_ONESHOT_COPY_DEP) {
+            waitFor = NULL; // TODO - don't need dependency after first copy submitted?
+        }
+    }
+
+
+    for (int i=0; i<_numBuffers; i++) {
+        hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); 
+    }
+}
+
+//---
+//Copies sizeBytes from src to dst, using either a copy to a staging buffer or a staged pin-in-place strategy
+//IN: dst - dest pointer - must be accessible from agent this buffer is assocaited with (via _device).
+//IN: src - src pointer for copy.  Must be accessible from host CPU.
+//IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved.  May be NULL indicating no dependency.
+void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) 
+{
+    const char *srcp0 = static_cast<const char*> (src); 
+    char *dstp1 = static_cast<char*> (dst); 
+
+    for (int i=0; i<_numBuffers; i++) {
+        hsa_signal_store_relaxed(_completion_signal[i], 0);
+    }
+
+    assert(sizeBytes < UINT64_MAX/2); // TODO
+
+    int64_t bytesRemaining0 = sizeBytes; // bytes to copy from dest into staging buffer.
+    int64_t bytesRemaining1 = sizeBytes; // bytes to copy from staging buffer into final dest
+
+    while (bytesRemaining1 > 0) {
+        // First launch the async copies to copy from dest to host
+        for (int bufferIndex = 0; (bytesRemaining0>0) && (bufferIndex < _numBuffers);  bytesRemaining0 -= _bufferSize, bufferIndex++) {
+
+            size_t theseBytes = (bytesRemaining0 > _bufferSize) ? _bufferSize : bytesRemaining0;
+
+            tprintf (TRACE_COPY2, "D2H: bytesRemaining0=%zu  async_copy %zu bytes src:%p to staging:%p\n", bytesRemaining0, theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]);
+            hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1);
+#if USE_ROCR_V2
+            hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _device->_hsa_agent, srcp0, _device->_hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]);
+#else
+            hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], srcp0, theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[bufferIndex]);
+#endif
+            assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw 
+
+            srcp0 += theseBytes;
+
+           
+            if (HIP_ONESHOT_COPY_DEP) {
+                waitFor = NULL; // TODO - don't need dependency after first copy submitted?
+            }
+        }
+
+        // Now unload the staging buffers:
+        for (int bufferIndex=0; (bytesRemaining1>0) && (bufferIndex < _numBuffers);  bytesRemaining1 -= _bufferSize, bufferIndex++) {
+
+            size_t theseBytes = (bytesRemaining1 > _bufferSize) ? _bufferSize : bytesRemaining1;
+
+            tprintf (TRACE_COPY2, "D2H: wait_completion[%d] bytesRemaining=%zu\n", bufferIndex, bytesRemaining1);
+            hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); 
+
+            tprintf (TRACE_COPY2, "D2H: bytesRemaining1=%zu copy %zu bytes stagingBuf[%d]:%p to dst:%p\n", bytesRemaining1, theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1);
+            memcpy(dstp1, _pinnedStagingBuffer[bufferIndex], theseBytes);
+
+            dstp1 += theseBytes;
+        }
+    }
+
+
+    //for (int i=0; i<_numBuffers; i++) {
+    //    hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); 
+    //}
+}
+
+
+
+
+#if USE_AM_TRACKER
+void ihipSyncCopy(ihipStream_t *stream, void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind)
+{
+    ihipDevice_t *device = stream->getDevice();
+
+    if (device == NULL) {
+        throw;
+    }
+
+    hc::accelerator acc;
+    hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0);
+    hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0);
+
+    bool dstNotTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) != AM_SUCCESS);
+    bool srcNotTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) != AM_SUCCESS);
+
+    bool useStagingBuffer = true;  
+
+    // Resolve default to a specific Kind so we know which algorithm to use:
+    if (kind == hipMemcpyDefault) {
+        bool dstIsHost = (dstNotTracked || !dstPtrInfo._isInDeviceMem);
+        bool srcIsHost = (srcNotTracked || !srcPtrInfo._isInDeviceMem);
+        if (srcIsHost && !dstIsHost) {
+            kind = hipMemcpyHostToDevice;
+        } else if (!srcIsHost && dstIsHost) {
+            kind = hipMemcpyDeviceToHost;
+        } else if (srcIsHost && dstIsHost) {
+            kind = hipMemcpyHostToHost;
+        } else if (srcIsHost && dstIsHost) {
+            kind = hipMemcpyDeviceToDevice;
+        }
+    }
+
+
+    if ((kind == hipMemcpyHostToDevice) && (srcNotTracked)) {
+        if (useStagingBuffer) {
+            std::lock_guard<std::mutex> l (device->_copy_lock[0]);
+            //printf ("staged-copy- read dep signals\n");
+
+            hsa_signal_t depSignal;
+            int depSignalCnt = stream->copyCommand(NULL, &depSignal, ihipCommandCopyH2D);
+            device->_staging_buffer[0]->CopyHostToDevice(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); 
+
+            // The copy waits for inputs and then completes before returning.
+            stream->resetToEmpty();
+        } else {
+            // TODO - remove, slow path.
+            hc::am_copy(dst, src, sizeBytes);
+        }
+    } else if ((kind == hipMemcpyDeviceToHost) && (dstNotTracked)) {
+        if (useStagingBuffer) {
+            std::lock_guard<std::mutex> l (device->_copy_lock[HIP_DISABLE_BIDIR_MEMCPY ? 0:1]);
+            //printf ("staged-copy- read dep signals\n");
+            hsa_signal_t depSignal;
+            int depSignalCnt = stream->copyCommand(NULL, &depSignal, ihipCommandCopyD2H);
+            device->_staging_buffer[1]->CopyDeviceToHost(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); 
+        } else {
+            // TODO - remove, slow path.
+            hc::am_copy(dst, src, sizeBytes);
+        }
+    } else if (kind == hipMemcpyHostToHost)  {
+        memcpy(dst, src, sizeBytes);  
+
+    } else {
+        // Let HSA runtime handle it:
+        // TODO - need buffer pool for the signals:
+
+        device->_copy_lock[HIP_DISABLE_BIDIR_MEMCPY? 0:1].lock();
+
+        hsa_signal_store_relaxed(device->_copy_signal, 1);
+#if USE_ROCR_V2
+        hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, device->_hsa_agent, src, device->_hsa_agent, sizeBytes, 0, NULL, device->_copy_signal);
+#else
+        hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, sizeBytes, device->_hsa_agent, 0, NULL, device->_copy_signal);
+#endif
+
+        if (hsa_status == HSA_STATUS_SUCCESS) {
+            hsa_signal_wait_relaxed(device->_copy_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); 
+        }
+
+        device->_copy_lock[HIP_DISABLE_BIDIR_MEMCPY ? 0:1].unlock();
+
+    }
+}
+#endif
+
+
 //---
 hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind)
 {
@@ -1433,83 +2137,114 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind
     hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
 
     hc::completion_future marker;
-    ihipCheckCommandSwitchSync(stream, ihipCommandData, &marker);
 
     hipError_t e = hipSuccess;
 
-#ifdef USE_ASYNC_COPY
-    if (ihipIsValidDevice(stream->_device_index)) {
-
-        ihipDevice_t *device = &g_devices[stream->_device_index];
-
-        hsa_signal_t completion_signal; // init/obtain from pool.
-
-        hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, size, device->_hsa_agent, 0, NULL, &completion_signal);
-
-        e = (hsa_status == HSA_STATUS_SUCCESS) ? hipSuccess : hipErrorTbd;
-    } else {
+#if USE_AM_TRACKER
+    try {
+        ihipSyncCopy(stream, dst, src, sizeBytes, kind);
+    } 
+    catch (...) {
         e = hipErrorInvalidResourceHandle;
     }
         
 
 #else
-    // TODO-hsart - what synchronization does hsa_copy provide?
     hc::am_copy(dst, src, sizeBytes);
     e = hipSuccess;
 #endif
 
-    // TODO - when am_copy becomes async, and we have HIP_LAUNCH_BLOCKING set, then we would wait for copy operation to complete here.
-
     return ihipLogStatus(e);
 }
 
 
-//---
-/*
+#if USE_AM_TRACKER==0
+/**
  * @warning on HCC hipMemcpyAsync uses a synchronous copy.
  */
+#endif
+/**
+ * @result #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidMemcpyDirection, #hipErrorInvalidValue
+ */
+//---
 hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream)
 {
     std::call_once(hip_initialized, ihipInit);
 
     hipError_t e = hipSuccess;
 
-    stream =  ihipSyncAndResolveStream(stream);
+    stream = ihipSyncAndResolveStream(stream);
 
-    hc::completion_future marker;
-    ihipCheckCommandSwitchSync(stream, ihipCommandData, &marker);
+#if USE_AM_TRACKER
+    if (stream) {
+        ihipDevice_t *device = stream->getDevice();
 
-    // Dispatch async memory copy to synchronize with items in the specified stream.
+        if (device == NULL) {
+            e = hipErrorInvalidDevice;
 
-    // Async - need to set up dependency on the last command queued to the device?
+        } else if (kind == hipMemcpyDefault) {
+            e = hipErrorInvalidMemcpyDirection;
 
-    // TODO-hsart This routine needs to ensure that dst and src are mapped on the GPU.
-    // This is a synchronous copy - remove and replace with code below when we have appropriate LOCK APIs.
-    hc::am_copy(dst, src, sizeBytes);
+        } else if (kind == hipMemcpyHostToHost) {
+            tprintf (TRACE_COPY2, "H2H copy with memcpy");
 
-#if 0
+            memcpy(dst, src, sizeBytes);
 
-    hipStream_t s =ihipGetStream(stream);
+        } else {
+            ihipSignal_t *ihip_signal = stream->getSignal();
+            hsa_signal_store_relaxed(ihip_signal->_hsa_signal, 1);
+
+            ihipCommand_t copyType;
+            if ((kind == hipMemcpyHostToDevice) || (kind == hipMemcpyDeviceToDevice)) {
+                copyType = ihipCommandCopyH2D;
+            } else if (kind == hipMemcpyDeviceToHost) {
+                copyType = ihipCommandCopyD2H;
+            } else {
+                e = hipErrorInvalidMemcpyDirection;
+                copyType = ihipCommandCopyD2H;
+            }
+
+#if USE_ROCR_V2
+            hsa_signal_t depSignal;
+            int depSignalCnt = stream->copyCommand(ihip_signal, &depSignal, copyType);
+
+            tprintf (TRACE_SYNC, " copy-async, waitFor=%lu completion=#%lu(%lu)\n", depSignalCnt? depSignal.handle:0x0, ihip_signal->_sig_id, ihip_signal->_hsa_signal.handle);
+
+            hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, device->_hsa_agent, src, device->_hsa_agent, sizeBytes, depSignalCnt, depSignalCnt ? &depSignal:0x0, ihip_signal->_hsa_signal);
+#else
+            hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, sizeBytes, device->_hsa_agent, 0, NULL, ihip_signal->_hsa_signal);
+#endif
 
 
-    if (s) {
-        hc::completion_future cf = ihipMemcpyKernel<char> (s, static_cast<char*> (dst), static_cast<const char*> (src), sizeBytes);
-
-        //cf.wait();
-
-        e =  hipSuccess;
+            if (hsa_status == HSA_STATUS_SUCCESS) {
+                // TODO-stream - fix release-signal calls here.
+                if (HIP_LAUNCH_BLOCKING) {
+                    tprintf(TRACE_SYNC, "LAUNCH_BLOCKING for completion of hipMemcpyAsync(%zu)\n", sizeBytes);
+                    stream->wait();
+                }  
+            } else {
+                // This path can be hit if src or dst point to unpinned host memory.
+                // TODO-stream - does async-copy fall back to sync if input pointers are not pinned?
+                e = hipErrorInvalidValue;
+            }
+        }
     } else {
         e = hipErrorInvalidValue;
     }
+#else
+    // TODO-hsart This routine needs to ensure that dst and src are mapped on the GPU.
+    // This is a synchronous copy - remove and replace with code below when we have appropriate LOCK APIs.
+    hc::am_copy(dst, src, sizeBytes);
 #endif
 
-    // TODO - if am_copy becomes async, and we have HIP_LAUNCH_BLOCKING set, then we would wait for copy operation to complete here.
 
     return ihipLogStatus(e);
 }
 
 
 // TODO-sync: function is async unless target is pinned host memory - then these are fully sync.
+/** @return #hipErrorInvalidValue
+ */
 hipError_t hipMemsetAsync(void* dst, int  value, size_t sizeBytes, hipStream_t stream )
 {
     std::call_once(hip_initialized, ihipInit);
@@ -1517,37 +2252,42 @@ hipError_t hipMemsetAsync(void* dst, int  value, size_t sizeBytes, hipStream_t s
     hipError_t e = hipSuccess;
 
     stream =  ihipSyncAndResolveStream(stream);
-    hc::completion_future marker;
-    ihipCheckCommandSwitchSync(stream, ihipCommandData, &marker);
+    stream->preKernelCommand();
 
+    if (stream) {
 
-    hc::completion_future cf ;
+        hc::completion_future cf ;
 
-    if ((sizeBytes & 0x3) == 0) {
-        // use a faster word-per-workitem copy:
-        try {
-            value = value & 0xff;
-            unsigned value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
-            cf = ihipMemsetKernel<unsigned> (stream, static_cast<unsigned*> (dst), value32, sizeBytes/sizeof(unsigned));
-        }
-        catch (std::exception &ex) {
-            e = hipErrorInvalidValue;
+        if ((sizeBytes & 0x3) == 0) {
+            // use a faster word-per-workitem copy:
+            try {
+                value = value & 0xff;
+                unsigned value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
+                cf = ihipMemsetKernel<unsigned> (stream, static_cast<unsigned*> (dst), value32, sizeBytes/sizeof(unsigned));
+            }
+            catch (std::exception &ex) {
+                e = hipErrorInvalidValue;
+            }
+        } else {
+            // use a slow byte-per-workitem copy:
+            try {
+                cf = ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
+            }
+            catch (std::exception &ex) {
+                e = hipErrorInvalidValue;
+            }
         }
+
+        stream->postKernelCommand(cf);
+
+
+        if (HIP_LAUNCH_BLOCKING) {
+            tprintf (TRACE_SYNC, "'%s' LAUNCH_BLOCKING wait for completion [stream:%p].\n", __func__, (void*)stream);
+            cf.wait();
+            tprintf (TRACE_SYNC, "'%s' LAUNCH_BLOCKING completed [stream:%p].\n", __func__, (void*)stream);
+        } 
     } else {
-        // use a slow byte-per-workitem copy:
-        try {
-            cf = ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
-        }
-        catch (std::exception &ex) {
-            e = hipErrorInvalidValue;
-        }
-    }
-
-
-    if (HIP_LAUNCH_BLOCKING) {
-        tprintf (TRACE_SYNC, "'%s' LAUNCH_BLOCKING wait for completion [stream:%p].\n", __func__, (void*)stream);
-        cf.wait();
-        tprintf (TRACE_SYNC, "'%s' LAUNCH_BLOCKING completed [stream:%p].\n", __func__, (void*)stream);
+        e = hipErrorInvalidValue;
     }
 
 
@@ -1557,15 +2297,16 @@ hipError_t hipMemsetAsync(void* dst, int  value, size_t sizeBytes, hipStream_t s
 
 hipError_t hipMemset(void* dst, int  value, size_t sizeBytes )
 {
+    // TODO - call an ihip memset so HIP_TRACE is correct.
     return hipMemsetAsync(dst, value, sizeBytes, hipStreamNull);
 }
 
 
 /*
- * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue (if free != NULL due to bug)
- * @bug - on hcc free always returns 50% of peak regardless of current allocations.  hipMemGetInfo returns hipErrorInvalidValue to indicate this.
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue (if free != NULL due to bug)S
+ * @warning On HCC, the free memory only accounts for memory allocated by this process and may be optimistic.
  */
-hipError_t hipMemGetInfo  (   size_t *    free, size_t *    total    )
+hipError_t hipMemGetInfo  (size_t *free, size_t *total)
 {
     std::call_once(hip_initialized, ihipInit);
 
@@ -1578,23 +2319,29 @@ hipError_t hipMemGetInfo  (   size_t *    free, size_t *    total    )
         }
 
         if (free) {
-            *free =  hipDevice->_props.totalGlobalMem  * 0.5; // TODO
+#if USE_AM_TRACKER
+            // TODO - replace with kernel-level for reporting free memory:
+            size_t deviceMemSize, hostMemSize, userMemSize;
+            hc::am_memtracker_sizeinfo(hipDevice->_acc, &deviceMemSize, &hostMemSize, &userMemSize);
+            *free =  hipDevice->_props.totalGlobalMem - deviceMemSize;
+#else
+            *free =  hipDevice->_props.totalGlobalMem * 0.5; // TODO
             e=hipErrorInvalidValue;
+#endif
         }
 
     } else {
         e = hipErrorInvalidDevice;
     }
 
-    // TODO-runtime  - when we fix the 50% bug.
-    //return ihipLogStatus(hipErrorSuccess);
-    return ihipLogStatus(hipErrorInvalidValue);
+    return ihipLogStatus(e);
 }
 
 
 //---
 hipError_t hipFree(void* ptr)
 {
+    // TODO - ensure this pointer was created by hipMalloc and not hipMallocHost
     std::call_once(hip_initialized, ihipInit);
 
 
@@ -1611,15 +2358,12 @@ hipError_t hipFree(void* ptr)
 
 hipError_t hipFreeHost(void* ptr)
 {
+    // TODO - ensure this pointer was created by hipMallocHost and not hipMalloc
     std::call_once(hip_initialized, ihipInit);
 
     if (ptr) {
-#if USE_PINNED_HOST
         tprintf (TRACE_MEM, "  %s: %p\n", __func__, ptr);
         hc::am_free(ptr);
-#else
-        free(ptr);
-#endif
     }
 
     return ihipLogStatus(hipSuccess);
@@ -1741,3 +2485,5 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a
     hipError_t err = hipSuccess;
     return ihipLogStatus(err);
 }
+
+// TODO - review signal / error reporting code.
diff --git a/projects/clr/hipamd/tests/src/CMakeLists.txt b/projects/clr/hipamd/tests/src/CMakeLists.txt
index 0ec287b334..09c0ca7162 100644
--- a/projects/clr/hipamd/tests/src/CMakeLists.txt
+++ b/projects/clr/hipamd/tests/src/CMakeLists.txt
@@ -19,8 +19,10 @@ MESSAGE ("HIP_PATH=" ${HIP_PATH})
 
 if (${HIP_PLATFORM} STREQUAL "hcc")
 	MESSAGE ("HIP_PLATFORM=hcc")
-	set (HC_PATH ${HIP_PATH}/hc)
-	set (HSA_PATH /opt/hsa)
+    set (HSA_PATH $ENV{HSA_PATH})
+    if (NOT DEFINED HSA_PATH)
+        set (HSA_PATH /opt/hsa)
+    endif()
 
 	#---
 	# Add HSA library:
@@ -30,7 +32,7 @@ if (${HIP_PLATFORM} STREQUAL "hcc")
 	#These includes are used for all files.
 	#Include HIP and HC since the tests need both of these:
 	#Note below HSA path is surgically included only where necessary.
-	include_directories(${HIP_PATH}/include ${HC_PATH}/include)
+	include_directories(${HIP_PATH}/include)
 
 	# hip_hcc.o:
 	add_library(hip_hcc OBJECT ${HIP_PATH}/src/hip_hcc.cpp)
@@ -105,6 +107,7 @@ make_hip_executable (hip_brev hip_brev.cpp)
 make_hip_executable (hip_ffs hip_ffs.cpp) 
 make_hip_executable (hipGetDeviceAttribute hipGetDeviceAttribute.cpp) 
 make_hip_executable (hipMemcpy hipMemcpy.cpp) 
+make_hip_executable (hipMemcpyAsync hipMemcpyAsync.cpp) 
 make_hip_executable (hipMemset hipMemset.cpp) 
 make_hip_executable (hipEventRecord hipEventRecord.cpp) 
 make_hip_executable (hipLanguageExtensions hipLanguageExtensions.cpp) 
@@ -114,6 +117,7 @@ make_hip_executable (hipSimpleAtomicsTest hipSimpleAtomicsTest.cpp)
 make_hip_executable (hipMathFunctionsHost hipMathFunctions.cpp hipSinglePrecisionMathHost.cpp hipDoublePrecisionMathHost.cpp)
 make_hip_executable (hipMathFunctionsDevice hipMathFunctions.cpp hipSinglePrecisionMathDevice.cpp hipDoublePrecisionMathDevice.cpp)
 make_hip_executable (hipIntrinsics hipMathFunctions.cpp hipSinglePrecisionIntrinsics.cpp hipDoublePrecisionIntrinsics.cpp hipIntegerIntrinsics.cpp)
+make_hip_executable (hipPointerAttrib hipPointerAttrib.cpp) 
 target_link_libraries(hipMathFunctionsHost m)
 
 make_test(hip_ballot " " )
@@ -128,8 +132,10 @@ make_test(hipMemset --N 10    --memsetval 0x42 )  # small copy, just 10 bytes.
 make_test(hipMemset --N 10013 --memsetval 0x5a )  # oddball size.
 make_test(hipMemset --N 256M  --memsetval 0xa6 )  # big copy
 make_test(hipGridLaunch " " )
+make_test(hipPointerAttrib " " )
 
 make_test(hipMemcpy  " " )
+make_test(hipMemcpyAsync  " " )
 
 make_test(hipHcc  " " )
 
diff --git a/projects/clr/hipamd/tests/src/hipMemcpy.cpp b/projects/clr/hipamd/tests/src/hipMemcpy.cpp
index 5db2b270d6..8286454098 100644
--- a/projects/clr/hipamd/tests/src/hipMemcpy.cpp
+++ b/projects/clr/hipamd/tests/src/hipMemcpy.cpp
@@ -23,24 +23,28 @@ THE SOFTWARE.
 #include "test_common.h"
 
 
-
-int main(int argc, char *argv[])
+void printSep()
 {
-    HipTest::parseStandardArguments(argc, argv, true);
+    printf ("======================================================================================\n");
+}
 
+//---
+// Test simple H2D copies and back.
+// Designed to stress a small number of simple smoke tests
+void simpleTest1()
+{
+    printf ("test: %s\n", __func__);
     size_t Nbytes = N*sizeof(int);
-
-    printf ("N=%zu \n", N);
+    printf ("N=%zu Nbytes=%6.2fMB\n", N, Nbytes/1024.0/1024.0);
 
     int *A_d, *B_d, *C_d;
     int *A_h, *B_h, *C_h;
 
-    HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N);
-
+    HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
 
+    printf ("A_d=%p B_d=%p C_d=%p  A_h=%p B_h=%p C_h=%p\n", A_d, B_d, C_d, A_h, B_d, C_h);
     unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
 
-
     HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
     HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
 
@@ -50,8 +54,212 @@ int main(int argc, char *argv[])
 
     HIPCHECK (hipDeviceSynchronize());
 
-
     HipTest::checkVectorADD(A_h, B_h, C_h, N);
+
+    HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, false);
+    HIPCHECK (hipDeviceReset());
+
+    printf ("  %s success\n", __func__);
+}
+
+
+//---
+// Test many different kinds of memory copies.
+// THe subroutine allocates memory , copies to device, runs a vector add kernel, copies back, and checks the result.
+//
+// IN: numElements  controls the number of elements used for allocations.
+// IN: usePinnedHost : If true, allocate host with hipMallocHost and is pinned ; else allocate host memory with malloc.
+// IN: useHostToHost : If true, add an extra host-to-host copy.
+// IN: useDeviceToDevice : If true, add an extra deviceto-device copy after result is produced.
+// IN: useMemkindDefault : If true, use memkinddefault (runtime figures out direction).  if false, use explicit memcpy direction.
+//
+template <typename T>
+void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault)
+{
+    size_t sizeElements = numElements * sizeof(T);
+    printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n", 
+            __func__, 
+            typeid(T).name(),
+            sizeElements, sizeElements/1024.0/1024.0,
+            usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault);
+
+
+    T *A_d, *B_d, *C_d;
+    T *A_h, *B_h, *C_h;
+
+
+    HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, numElements, usePinnedHost);
+    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
+
+    T *A_hh = NULL;
+    T *B_hh = NULL;
+    T *C_dd = NULL;
+
+
+
+    if (useHostToHost) {
+        if (usePinnedHost) {
+            HIPCHECK ( hipMallocHost(&A_hh, sizeElements) );
+            HIPCHECK ( hipMallocHost(&B_hh, sizeElements) );
+        } else {
+            A_hh = (T*)malloc(sizeElements);
+            B_hh = (T*)malloc(sizeElements);
+        }
+
+
+        // Do some extra host-to-host copies here to mix things up:
+        HIPCHECK ( hipMemcpy(A_hh, A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost));
+        HIPCHECK ( hipMemcpy(B_hh, B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost));
+
+
+        HIPCHECK ( hipMemcpy(A_d, A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
+        HIPCHECK ( hipMemcpy(B_d, B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
+    } else {
+        HIPCHECK ( hipMemcpy(A_d, A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
+        HIPCHECK ( hipMemcpy(B_d, B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
+    }
+
+    hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements);
+
+    if (useDeviceToDevice) {
+        HIPCHECK ( hipMalloc(&C_dd, sizeElements) );
+
+        // Do an extra device-to-device copies here to mix things up:
+        HIPCHECK ( hipMemcpy(C_dd, C_d,  sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice));
+
+        //Destroy the original C_d:
+        HIPCHECK ( hipMemset(C_d, 0x5A, sizeElements));
+
+        HIPCHECK ( hipMemcpy(C_h, C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost));
+    } else {
+        HIPCHECK ( hipMemcpy(C_h, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost));
+    }
+
+    HIPCHECK ( hipDeviceSynchronize() );
+    HipTest::checkVectorADD(A_h, B_h, C_h, numElements);
+
+    HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost);
+
+    printf ("  %s success\n", __func__);
+}
+
+
+//---
+//Try all the 16 possible combinations to memcpytest2 - usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault
+template<typename T>
+void memcpytest2_loop(size_t numElements)
+{
+    printSep();
+
+    for (int usePinnedHost =0; usePinnedHost<=1; usePinnedHost++) {
+        for (int useHostToHost =0; useHostToHost<=1; useHostToHost++) {  // TODO
+            for (int useDeviceToDevice =0; useDeviceToDevice<=1; useDeviceToDevice++) {
+                for (int useMemkindDefault =0; useMemkindDefault<=1; useMemkindDefault++) {
+                    memcpytest2<T>(numElements, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault);
+                }
+            }
+        }
+    }
+}
+
+
+//---
+//Try many different sizes to memory copy.
+template<typename T>
+void memcpytest2_sizes(size_t maxElem=0, size_t offset=0)
+{
+    printSep();
+    printf ("test: %s<%s>\n", __func__,  typeid(T).name());
+
+    int deviceId;
+    HIPCHECK(hipGetDevice(&deviceId));
+
+    size_t free, total;
+    HIPCHECK(hipMemGetInfo(&free, &total));
+
+    if (maxElem == 0) {
+        maxElem = free/sizeof(T)/5;
+    }
+
+    printf ("  device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB)    maxSize=%6.1fMB offset=%lu\n", 
+            deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0, offset);
+
+    for (size_t elem=64; elem+offset<=maxElem; elem*=2) {
+        HIPCHECK ( hipDeviceReset() );
+        memcpytest2<T>(elem+offset, 0, 1, 1, 0);  // unpinned host
+        HIPCHECK ( hipDeviceReset() );
+        memcpytest2<T>(elem+offset, 1, 1, 1, 0);  // pinned host
+    }
+}
+
+
+//---
+//Create multiple threads to stress multi-thread locking behavior in the allocation/deallocation/tracking logic:
+template<typename T>
+void multiThread_1(bool serialize, bool usePinnedHost)
+{
+    printSep();
+    printf ("test: %s<%s> serialize=%d usePinnedHost=%d\n", __func__,  typeid(T).name(), serialize, usePinnedHost);
+    std::thread t1 (memcpytest2<T>,N, usePinnedHost,0,0,0);
+    if (serialize) {
+        t1.join();
+    }
+
+    
+    std::thread t2 (memcpytest2<T>,N, usePinnedHost,0,0,0);
+    if (serialize) {
+        t2.join();
+    }
+
+    if (!serialize) {
+        t1.join();
+        t2.join();
+    }
+}
+
+
+
+int main(int argc, char *argv[])
+{
+    HipTest::parseStandardArguments(argc, argv, true);
+
+    printf ("info: set device to %d\n", p_gpuDevice);
+    HIPCHECK(hipSetDevice(p_gpuDevice));
+
+
+    if (p_tests & 0x1) {
+        HIPCHECK ( hipDeviceReset() );
+        simpleTest1();
+    }
+
+    if (p_tests & 0x2) {
+        HIPCHECK ( hipDeviceReset() );
+        memcpytest2_loop<float>(N);
+        memcpytest2_loop<double>(N);
+        memcpytest2_loop<char>(N);
+        memcpytest2_loop<int>(N);
+    }
+
+    if (p_tests & 0x4) {
+        HIPCHECK ( hipDeviceReset() );
+        printSep();
+        memcpytest2_sizes<float>(0,0);
+        printSep();
+        memcpytest2_sizes<float>(0,64);
+        printSep();
+        memcpytest2_sizes<float>(1024*1024, 13);
+        printSep();
+        memcpytest2_sizes<float>(1024*1024, 50);
+    }
+
+    if (p_tests & 0x8) {
+        HIPCHECK ( hipDeviceReset() );
+        printSep();
+        multiThread_1<float>(true, true);
+        multiThread_1<float>(false, true);
+        multiThread_1<float>(false, false); // TODO
+    }
+
     passed();
 
 }
diff --git a/projects/clr/hipamd/tests/src/hipMemcpyAsync.cpp b/projects/clr/hipamd/tests/src/hipMemcpyAsync.cpp
new file mode 100644
index 0000000000..4b92e2fc1e
--- /dev/null
+++ b/projects/clr/hipamd/tests/src/hipMemcpyAsync.cpp
@@ -0,0 +1,349 @@
+// Test under-development.  Calls async mem-copy API, experiment with functionality.
+
+#include "hip_runtime.h"
+#include "test_common.h"
+
+unsigned p_streams = 2;
+
+
+void simpleNegTest()
+{
+    printf ("testing: %s\n",__func__);
+    hipError_t e;
+    float *A_malloc, *A_pinned, *A_d;
+
+    size_t Nbytes = N*sizeof(float);
+    A_malloc = (float*)malloc(Nbytes);
+    HIPCHECK(hipMallocHost(&A_pinned, Nbytes));
+    HIPCHECK(hipMalloc(&A_d, Nbytes));
+
+
+    // Can't use default with async copy
+    e = hipMemcpyAsync(A_pinned, A_d, Nbytes, hipMemcpyDefault, NULL);
+    HIPASSERT (e==hipErrorInvalidMemcpyDirection); // TODO 
+    HIPASSERT (e!= hipSuccess);
+
+
+    // Not sure what happens here, the memory must be pinned.
+    e = hipMemcpyAsync(A_malloc, A_d, Nbytes, hipMemcpyHostToDevice, NULL);
+
+    printf ("  async memcpy of A_malloc to A_d. Result=%d\n", e);
+    //HIPASSERT (e==hipErrorInvalidValue);
+}
+
+class Pinned;
+class Unpinned;
+
+template <typename T> struct HostTraits;
+
+template<>
+struct HostTraits<Pinned>
+{
+    static const char *Name() { return "Pinned"; } ;
+
+    static void *Alloc(size_t sizeBytes) {
+        void *p; 
+        HIPCHECK(hipMallocHost(&p, sizeBytes));
+        return p;
+    };
+};
+
+
+template<typename T>
+__global__ void 
+addK (hipLaunchParm lp, T *A, T K, size_t numElements)
+{
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;
+
+    for (size_t i=offset; i<numElements; i+=stride) {
+        A[i] = A[i] + K;
+	}
+}
+
+
+
+//---
+//Tests propert dependency resolution between H2D and D2H commands in same stream:
+//IN: numInflight : number of copies inflight at any time:
+//IN: numPongs = number of iterations to run (iteration)
+template<typename T, class AllocType>
+void test_pingpong(hipStream_t stream, size_t numElements, int numInflight, int numPongs, bool doHostSide) 
+{
+    HIPASSERT(numElements % numInflight == 0); // Must be evenly divisible.
+    size_t Nbytes = numElements*sizeof(T);
+    size_t eachCopyElements = numElements / numInflight;
+    size_t eachCopyBytes = eachCopyElements * sizeof(T);
+
+    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
+
+    printf ("-----------------------------------------------------------------------------------------------\n");
+    printf ("testing: %s<%s>  Nbytes=%zu (%6.1f MB) numPongs=%d numInflight=%d eachCopyElements=%zu eachCopyBytes=%zu\n", 
+            __func__, HostTraits<AllocType>::Name(), Nbytes, (double)(Nbytes)/1024.0/1024.0, numPongs, numInflight, eachCopyElements, eachCopyBytes);
+
+    T *A_h;
+    T *A_d;
+
+    A_h = (T*)(HostTraits<AllocType>::Alloc(Nbytes));
+    HIPCHECK(hipMalloc(&A_d, Nbytes));
+
+    // Initialize the host array:
+    const T initValue = 13;
+    const T deviceConst = 2;
+    const T hostConst = 10000;
+    for (size_t i=0; i<numElements; i++) {
+        A_h[i] = initValue + i;
+    }
+
+
+    for (int k=0; k<numPongs; k++ ) {
+        for (int i=0; i<numInflight; i++) {
+            HIPCHECK(hipMemcpyAsync(&A_d[i*eachCopyElements], &A_h[i*eachCopyElements], eachCopyBytes, hipMemcpyHostToDevice, stream));
+        }
+
+        hipLaunchKernel(addK<T>, dim3(blocks), dim3(threadsPerBlock), 0, stream,   A_d, 2, numElements);
+
+        for (int i=0; i<numInflight; i++ ) {
+            HIPCHECK(hipMemcpyAsync(&A_h[i*eachCopyElements], &A_d[i*eachCopyElements], eachCopyBytes, hipMemcpyDeviceToHost, stream));
+        }
+
+        if (doHostSide) {
+            assert(0);
+#if 0
+            hipEvent_t e;
+            HIPCHECK(hipEventCreate(&e));
+#endif
+            HIPCHECK(hipDeviceSynchronize());
+            for (size_t i=0; i<numElements; i++) {
+                A_h[i] += hostConst;
+            }
+        }
+    };
+
+    HIPCHECK(hipDeviceSynchronize());
+
+
+    // Verify we copied back all the data correctly:
+    for (size_t i=0; i<numElements; i++) {
+        T gold = initValue + i;
+        // Perform calcs in same order as test above to replicate FP order-of-operations:
+        for (int k=0; k<numPongs; k++) {
+            gold += deviceConst;
+            if (doHostSide) {
+                gold += hostConst;
+            }
+        }
+
+        if (gold != A_h[i]) {
+            std::cout << i << ": gold=" << gold << " out=" << A_h[i] << std::endl;
+            HIPASSERT(gold == A_h[i]);
+        }
+    }
+
+
+    HIPCHECK(hipFreeHost(A_h));
+    HIPCHECK(hipFree(A_d));
+}
+
+
+//---
+//Send many async copies to the same stream.
+//This requires runtime to keep track of many outstanding commands, and in the case of HCC requires growing/tracking the signal pool:
+template<typename T>
+void test_manyInflightCopies(hipStream_t stream, int numElements, int numCopies, bool syncBetweenCopies)
+{
+    size_t Nbytes = numElements*sizeof(T);
+    size_t eachCopyElements = numElements / numCopies;
+    size_t eachCopyBytes = eachCopyElements * sizeof(T);
+
+    printf ("-----------------------------------------------------------------------------------------------\n");
+    printf ("testing: %s  Nbytes=%zu (%6.1f MB) numCopies=%d eachCopyElements=%zu eachCopyBytes=%zu\n", 
+            __func__, Nbytes, (double)(Nbytes)/1024.0/1024.0, numCopies, eachCopyElements, eachCopyBytes);
+
+    T *A_d;
+    T *A_h1, *A_h2;
+
+    HIPCHECK(hipMallocHost(&A_h1, Nbytes));
+    HIPCHECK(hipMallocHost(&A_h2, Nbytes));
+    HIPCHECK(hipMalloc(&A_d, Nbytes));
+
+    for (int i=0; i<numElements; i++) {
+        A_h1[i] = 3.14f + static_cast<T> (i);
+    }
+
+
+    //stream=0; // fixme TODO
+
+
+    for (int i=0; i<numCopies; i++) 
+    {
+        HIPCHECK(hipMemcpyAsync(&A_d[i*eachCopyElements], &A_h1[i*eachCopyElements], eachCopyBytes, hipMemcpyHostToDevice, stream));
+    }
+
+    if (syncBetweenCopies) {
+        HIPCHECK(hipDeviceSynchronize());
+    }
+
+    for (int i=0; i<numCopies; i++) 
+    {
+        HIPCHECK(hipMemcpyAsync(&A_h2[i*eachCopyElements], &A_d[i*eachCopyElements], eachCopyBytes, hipMemcpyDeviceToHost, stream));
+    }
+
+    HIPCHECK(hipDeviceSynchronize());
+
+
+    // Verify we copied back all the data correctly:
+    for (int i=0; i<numElements; i++) {
+        HIPASSERT(A_h1[i] == A_h2[i]);
+    }
+
+
+    HIPCHECK(hipFreeHost(A_h1));
+    HIPCHECK(hipFreeHost(A_h2));
+    HIPCHECK(hipFree(A_d));
+}
+
+
+//---
+//Classic example showing how to overlap data transfer with compute.
+//We divide the work into "chunks" and create a stream for each chunk.
+//Each chunk then runs a H2D copy, followed by kernel execution, followed by D2H copyback.
+//Work in separate streams is independent which enables concurrency.
+
+// IN: nStreams : number of streams to use for the test
+// IN :useNullStream - use NULL stream.  Synchronizes everything.
+// IN: useSyncMemcpyH2D - use sync memcpy (no overlap) for H2D
+// IN: useSyncMemcpyD2H - use sync memcpy (no overlap) for D2H
+void test_chunkedAsyncExample(int nStreams, bool useNullStream, bool useSyncMemcpyH2D, bool useSyncMemcpyD2H)
+{
+
+    size_t Nbytes = N*sizeof(int);
+    printf ("testing: %s(useNullStream=%d, useSyncMemcpyH2D=%d, useSyncMemcpyD2H=%d)  ",__func__, useNullStream, useSyncMemcpyH2D, useSyncMemcpyD2H);
+    printf ("Nbytes=%zu (%6.1f MB)\n", Nbytes, (double)(Nbytes)/1024.0/1024.0);
+
+    int *A_d, *B_d, *C_d;
+    int *A_h, *B_h, *C_h;
+
+    HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, true);
+
+
+    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+
+
+    hipStream_t *stream = (hipStream_t*)malloc(sizeof(hipStream_t) * nStreams);
+    if (useNullStream) { 
+        nStreams = 1;
+        stream[0] = NULL;
+    } else  {
+        for (int i = 0; i < nStreams; ++i) {
+            HIPCHECK (hipStreamCreate(&stream[i]));
+        }
+    }
+
+
+    size_t workLeft = N; 
+    size_t workPerStream = N / nStreams;
+    for (int i = 0; i < nStreams; ++i) {
+        size_t work = (workLeft < workPerStream) ? workLeft : workPerStream;
+        size_t workBytes = work * sizeof(int);
+
+        size_t offset = i*workPerStream;
+
+        if (useSyncMemcpyH2D) {
+            HIPCHECK ( hipMemcpy(&A_d[offset], &A_h[offset], workBytes, hipMemcpyHostToDevice));
+            HIPCHECK ( hipMemcpy(&B_d[offset], &B_h[offset], workBytes, hipMemcpyHostToDevice));
+        } else {
+            HIPCHECK ( hipMemcpyAsync(&A_d[offset], &A_h[offset], workBytes, hipMemcpyHostToDevice, stream[i]));
+            HIPCHECK ( hipMemcpyAsync(&B_d[offset], &B_h[offset], workBytes, hipMemcpyHostToDevice, stream[i]));
+        };
+
+        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, stream[i], &A_d[offset], &B_d[offset], &C_d[offset], work);
+
+        if (useSyncMemcpyD2H) {
+            HIPCHECK ( hipMemcpy(&C_h[offset], &C_d[offset], workBytes, hipMemcpyDeviceToHost));
+        } else {
+            HIPCHECK ( hipMemcpyAsync(&C_h[offset], &C_d[offset], workBytes, hipMemcpyDeviceToHost, stream[i]));
+        }
+    } 
+
+
+    HIPCHECK (hipDeviceSynchronize());
+
+
+    HipTest::checkVectorADD(A_h, B_h, C_h, N);
+
+    HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, true);
+
+    free(stream);
+};
+
+
+//---
+//Parse arguments specific to this test.
+void parseMyArguments(int argc, char *argv[])
+{
+    int more_argc = HipTest::parseStandardArguments(argc, argv, false);
+
+    // parse args for this test:
+    for (int i = 1; i < more_argc; i++) {
+        const char *arg = argv[i];
+
+        if (!strcmp(arg, "--streams")) {
+            if (++i >= argc || !HipTest::parseUInt(argv[i], &p_streams)) {
+               failed("Bad streams argument");
+            }
+        } else {
+            failed("Bad argument '%s'", arg);
+        }
+    };
+};
+
+
+
+
+int main(int argc, char *argv[])
+{
+    HipTest::parseStandardArguments(argc, argv, true);
+    parseMyArguments(argc, argv);
+
+
+    printf ("info: set device to %d\n", p_gpuDevice);
+    HIPCHECK(hipSetDevice(p_gpuDevice));
+
+    if (p_tests & 0x01) {
+        simpleNegTest();
+    }
+
+    if (p_tests & 0x02) {
+        hipStream_t stream;
+        HIPCHECK (hipStreamCreate(&stream));
+
+        test_manyInflightCopies<float>(stream, 1024,   16,  true);
+        test_manyInflightCopies<float>(stream, 1024,    4,  true); // verify we re-use the same entries instead of growing pool.
+        test_manyInflightCopies<float>(stream, 1024*8, 64, false);
+
+        HIPCHECK(hipStreamDestroy(stream));
+    }
+
+
+    if (p_tests & 0x04) {
+        test_chunkedAsyncExample(p_streams, true, true, true); // Easy sync version
+        test_chunkedAsyncExample(p_streams, false, true, true); // Easy sync version
+        test_chunkedAsyncExample(p_streams, false, false, true); // Some async
+        test_chunkedAsyncExample(p_streams, false, false, false); // All async
+    }
+
+    if (p_tests & 0x08) {
+        hipStream_t stream;
+        HIPCHECK (hipStreamCreate(&stream));
+
+        test_pingpong<int, Pinned>(stream, 1024*1024*32, 1, 1, false);
+        test_pingpong<int, Pinned>(stream, 1024*1024*32, 1, 10, false);
+
+        HIPCHECK(hipStreamDestroy(stream));
+    }
+
+
+    passed();
+
+}
diff --git a/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp b/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp
new file mode 100644
index 0000000000..6928ec9a64
--- /dev/null
+++ b/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp
@@ -0,0 +1,524 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+
+// Test pointer tracking logic: allocate memory and retrieve stats with hipPointerGetAttributes
+
+#include "hip_runtime.h"
+#include "test_common.h"
+
+#ifdef __HIP_PLATFORM_HCC__
+//#include "hcc_detail/AM.h"
+#include "hc_am.hpp"
+
+#endif
+
+size_t Nbytes = 0;
+
+//=================================================================================================
+// Utility Functions:
+//=================================================================================================
+
+bool operator==(const hipPointerAttribute_t &lhs, const hipPointerAttribute_t &rhs) 
+{
+    return ((lhs.hostPointer == rhs.hostPointer) &&
+            (lhs.devicePointer == rhs.devicePointer) &&
+            (lhs.memoryType == rhs.memoryType) &&
+            (lhs.device == rhs.device) &&
+            (lhs.allocationFlags == rhs.allocationFlags)
+            ) ;
+
+};
+
+
+bool operator!=(const hipPointerAttribute_t &lhs, const hipPointerAttribute_t &rhs) 
+{
+    return ! (lhs == rhs);
+}
+
+
+const char *memoryTypeToString(hipMemoryType memoryType)
+{
+    switch (memoryType) {
+        case hipMemoryTypeHost   : return "[Host]";
+        case hipMemoryTypeDevice : return "[Device]";
+        default:                   return "[Unknown]";
+    };
+}
+
+
+void resetAttribs(hipPointerAttribute_t *attribs) 
+{
+    attribs->hostPointer = (void*) (-1);
+    attribs->devicePointer = (void*) (-1);
+    attribs->memoryType = hipMemoryTypeHost;
+    attribs->device = -2;
+    attribs->isManaged = -1;
+    attribs->allocationFlags = 0xffff;
+};
+
+
+void printAttribs(const hipPointerAttribute_t *attribs) 
+{
+    printf ("hostPointer:%p devicePointer:%p  memoryType:%s deviceId:%d isManaged:%d allocationFlags:%u\n", 
+            attribs->hostPointer,
+            attribs->devicePointer,
+            memoryTypeToString(attribs->memoryType),
+            attribs->device,
+            attribs->isManaged,
+            attribs->allocationFlags
+            );
+};
+
+
+inline int zrand(int max) 
+{
+    return rand() % max;
+}
+
+
+//=================================================================================================
+// Functins to run tests
+//=================================================================================================
+//--
+//Run through a couple simple cases to test lookups and host pointer arithmetic:
+void testSimple() 
+{
+    printf ("\n");
+    printf ("===========================================================================\n");
+    printf ("Simple Tests\n");
+    printf ("===========================================================================\n");
+
+    char *A_d;
+    char *A_Pinned_h;
+    char *A_OSAlloc_h;
+    hipError_t e;
+
+    HIPCHECK ( hipMalloc(&A_d, Nbytes) );
+    HIPCHECK ( hipMallocHost(&A_Pinned_h, Nbytes) );
+    A_OSAlloc_h = (char*)malloc(Nbytes);
+
+    size_t free, total;
+    HIPCHECK(hipMemGetInfo(&free, &total));
+    printf ("hipMemGetInfo: free=%zu (%4.2f) Nbytes=%lu total=%zu (%4.2f)\n", free, (float)(free/1024.0/1024.0), Nbytes, total, (float)(total/1024.0/1024.0));
+    HIPASSERT(free + Nbytes <= total);
+
+
+    hipPointerAttribute_t attribs;
+    hipPointerAttribute_t attribs2;
+
+    // Device memory
+    printf ("\nDevice memory (hipMalloc)\n");
+    HIPCHECK( hipPointerGetAttributes(&attribs, A_d));
+    printf("getAttr:%-20s", "A_d"); printAttribs(&attribs);
+
+    // Check pointer arithmetic cases:
+    resetAttribs(&attribs2);
+    HIPCHECK( hipPointerGetAttributes(&attribs2, A_d+100));
+    printf("getAttr:%-20s", "A_d+100"); printAttribs(&attribs2);
+    HIPASSERT(attribs == attribs2);
+
+    // Corner case at end of array:
+    resetAttribs(&attribs2);
+    HIPCHECK( hipPointerGetAttributes(&attribs2, A_d+Nbytes-1));
+    printf("getAttr:%-20s", "A_d+NBytes-1"); printAttribs(&attribs2);
+    HIPASSERT(attribs == attribs2);
+
+    // Pointer just beyond array - must be invalid or at least a different pointer
+    resetAttribs(&attribs2);
+    e = hipPointerGetAttributes(&attribs2, A_d+Nbytes+1);
+    printf("getAttr:%-20s err=%d (%s), neg-test expected\n", "A_d+NBytes", e, hipGetErrorString(e)); 
+    if (e != hipErrorInvalidValue) {
+        // We might have strayed into another pointer area.
+        printf("getAttr:%-20s", "A_d+NBytes"); printAttribs(&attribs2);
+        HIPASSERT(attribs.devicePointer != attribs2.devicePointer);
+    }
+
+
+    resetAttribs(&attribs2);
+    e = hipPointerGetAttributes(&attribs2, A_d+Nbytes);
+    if (e != hipErrorInvalidValue) {
+        printf("%-20s", "A_d+Nbytes"); printAttribs(&attribs2);
+        HIPASSERT(attribs.devicePointer != attribs2.devicePointer);
+    }
+
+    hipFree(A_d);
+    e = hipPointerGetAttributes(&attribs, A_d);
+    HIPASSERT(e == hipErrorInvalidValue); // Just freed the pointer, this should return an error.
+
+
+    // Device-visible host memory
+    printf ("\nDevice-visible host memory (hipMallocHost)\n");
+    HIPCHECK( hipPointerGetAttributes(&attribs, A_Pinned_h));
+    printf("getAttr:%-20s", "A_pinned_h"); printAttribs(&attribs);
+
+    resetAttribs(&attribs2);
+    HIPCHECK( hipPointerGetAttributes(&attribs2, A_Pinned_h+Nbytes/2));
+    printf("getAttr:%-20s", "A_pinned_h+NBytes/2"); printAttribs(&attribs2);
+    HIPASSERT(attribs == attribs2);
+
+
+    hipFreeHost(A_Pinned_h);
+    e = hipPointerGetAttributes(&attribs, A_Pinned_h);
+    HIPASSERT(e == hipErrorInvalidValue); // Just freed the pointer, this should return an error.
+    printf("getAttr:%-20s err=%d (%s), neg-test expected\n", "A_d+NBytes", e, hipGetErrorString(e)); 
+
+
+    // OS memory
+    printf ("\nOS-allocated memory (malloc)\n");
+    e = hipPointerGetAttributes(&attribs, A_OSAlloc_h);
+    printf("getAttr:%-20s err=%d (%s), neg-test expected\n", "A_OSAlloc_h", e, hipGetErrorString(e)); 
+    HIPASSERT(e == hipErrorInvalidValue); // OS-allocated pointers should return hipErrorInvalidValue.
+}
+
+//---
+//Reset the memory tracker (remove allocations from all known devices):
+//This frees any memory allocated through the runtime.
+//The routine will not release any 
+void resetTracker ()
+{
+    if (p_verbose & 0x1) {
+        printf ("info: reset tracker for all devices in platform\n");
+    }
+
+    int numDevices;
+    HIPCHECK(hipGetDeviceCount(&numDevices));
+
+    // Clean up:
+    for (int i=0; i<numDevices; i++) {
+        HIPCHECK(hipSetDevice(i));
+        HIPCHECK(hipDeviceReset());
+    };
+}
+
+
+// Store the hipPointer attrib and some extra info so can later compare the looked-up info against the reference expectation
+struct SuperPointerAttribute {
+    void *                  _pointer;
+    size_t                  _sizeBytes;
+    hipPointerAttribute_t   _attrib;
+};
+
+
+//---
+//Support function to check result against a reference:
+void checkPointer(SuperPointerAttribute &ref, int major, int minor, void *pointer)
+{
+    hipPointerAttribute_t attribs;
+    resetAttribs(&attribs);
+
+    hipError_t e = hipPointerGetAttributes(&attribs, pointer);
+    if ((e != hipSuccess) || (attribs != ref._attrib)) {
+        printf("Test %d.%d (err=%d)\n", major, minor, e);
+        HIPCHECK(e);
+        printf("  ref    ::  "); printAttribs(&ref._attrib);
+        printf("  getattr::  "); printAttribs(&attribs);
+        
+        HIPASSERT(attribs == ref._attrib);
+    } else {
+        if (p_verbose & 0x1) {
+            printf("#%4d.%d GOOD:%p getattr ::  ",major, minor, pointer); printAttribs(&attribs);
+        }
+    }
+}
+
+
+//---
+//Test that allocates memory across all 4 devices withing the specified size range (minSize...maxSize).
+//Then does lookups to make sure the info reported by the tracker matches expecations
+//Then deallocates it all.
+//
+//Multiple threads can call this funtion and in fact we do this in the testMultiThreaded_1 test.
+void clusterAllocs(int numAllocs, size_t minSize, size_t maxSize)
+{
+    printf ("  clusterAllocs numAllocs=%d size=%lu..%lu\n", numAllocs, minSize, maxSize);
+    std::vector <SuperPointerAttribute> reference(numAllocs);
+
+    HIPASSERT(minSize > 0);
+    HIPASSERT(maxSize >= minSize);
+
+    int numDevices;
+    HIPCHECK(hipGetDeviceCount(&numDevices));
+
+    //---
+    //Populate with device and host allocations.
+    size_t totalDeviceAllocated[numDevices];
+    for (int i =0; i<numDevices; i++) {
+        totalDeviceAllocated[i] = 0;
+    }
+    for (int i=0; i<numAllocs; i++) {
+        bool isDevice = rand() & 0x1;
+        reference[i]._sizeBytes = zrand(maxSize-minSize) + minSize;
+
+        reference[i]._attrib.device = zrand(numDevices);
+        HIPCHECK(hipSetDevice(reference[i]._attrib.device));
+        reference[i]._attrib.isManaged = 0;
+
+        void * ptr;
+        if (isDevice) {
+            totalDeviceAllocated[reference[i]._attrib.device] += reference[i]._sizeBytes;
+            HIPCHECK(hipMalloc(&ptr, reference[i]._sizeBytes));
+            reference[i]._attrib.memoryType    = hipMemoryTypeDevice;
+            reference[i]._attrib.devicePointer = ptr;
+            reference[i]._attrib.hostPointer   = NULL;
+            reference[i]._attrib.allocationFlags = 0; // TODO-randomize these.
+        } else {
+            HIPCHECK(hipMallocHost(&ptr, reference[i]._sizeBytes));
+            reference[i]._attrib.memoryType    = hipMemoryTypeHost;
+            reference[i]._attrib.devicePointer = ptr;
+            reference[i]._attrib.hostPointer   = ptr;
+            reference[i]._attrib.allocationFlags = 0; // TODO-randomize these.
+        }
+        reference[i]._pointer = ptr;
+    }
+
+#ifdef __HIP_PLATFORM_HCC__
+    if (p_verbose & 0x2) {
+        printf ("Tracker after insertions:\n");
+        hc::am_memtracker_print();
+    }
+#endif
+
+
+    for (int i =0; i<numDevices; i++) {
+        size_t free, total;
+        HIPCHECK(hipSetDevice(i));
+        HIPCHECK(hipMemGetInfo(&free, &total));
+        printf ("  device#%d: hipMemGetInfo: free=%zu (%4.2fMB) clusterAllocTotalDevice=%lu (%4.2fMB) total=%zu (%4.2fMB)\n", 
+                i, free, (float)(free/1024.0/1024.0), totalDeviceAllocated[i], (float)(totalDeviceAllocated[i])/1024.0/1024.0, total, (float)(total/1024.0/1024.0));
+        HIPASSERT(free + totalDeviceAllocated[i] <= total);
+    }
+
+
+    // Now look up each pointer we inserted and verify we can find it:
+    for (int i=0; i<numAllocs; i++) {
+        SuperPointerAttribute &ref = reference[i];
+        checkPointer(ref, i, 0, ref._pointer);
+        checkPointer(ref, i, 1, (char *)ref._pointer + ref._sizeBytes/2);
+        if (ref._sizeBytes > 1) {
+            checkPointer(ref, i, 2, (char *)ref._pointer + ref._sizeBytes-1);
+        }
+
+        if (ref._attrib.memoryType == hipMemoryTypeDevice) {
+            hipFree(ref._pointer);
+        } else {
+            hipFreeHost(ref._pointer);
+        }
+
+    }
+
+#ifdef __HIP_PLATFORM_HCC__
+    if (p_verbose & 0x2) {
+        printf ("Tracker after cleanup:\n");
+        hc::am_memtracker_print();
+    }
+#endif
+}
+
+
+//---
+// Multi-threaded test with many simul allocs.
+// IN : serialize will force the test to run in serial fashion.
+// Seems like this does not hit MT corner cases in the tracker very often - testMultiThreaded_2 below seems more effective.
+void testMultiThreaded_1(bool serialize=false)
+{
+    printf ("\n===========================================================================\n");
+    printf ("MultiThreaded_1\n");
+    if (serialize) printf ("[SERIALIZE]\n");
+    printf ("===========================================================================\n");
+    std::thread t1(clusterAllocs, 1000, 101, 1000);
+    if (serialize) t1.join();
+
+    std::thread t2(clusterAllocs, 1000,  11,  100);
+    if (serialize) t2.join();
+
+    std::thread t3(clusterAllocs, 1000,   5,  10);
+    if (serialize) t3.join();
+
+    std::thread t4(clusterAllocs, 1000,   1,   4);
+    if (serialize) t4.join();
+
+    if (!serialize) {
+        t1.join();
+        t2.join();
+        t3.join();
+        t4.join();
+    }
+
+    resetTracker();
+}
+
+
+///================================================================================================
+
+//---
+//Repeatedly query a single entry:
+void thread_query(void *ptr, const hipPointerAttribute_t *refAttrib)
+{
+    int count = 0;
+
+    for (int count=0; count< 1000000; count++) {
+        hipPointerAttribute_t a;
+        hipError_t e = hipPointerGetAttributes(&a, ptr);
+        if ((e != hipSuccess) || (a!= *refAttrib)) {
+            printf("Test %d (err=%d)\n", count, e);
+            HIPCHECK(e);
+
+            printf("  ref    ::  "); printAttribs(refAttrib);
+            printf("  getattr::  "); printAttribs(&a);
+        }
+    }
+}
+
+
+#ifdef __HIP_PLATFORM_HCC__
+//---
+// Add pointers to tracker very quickly, then remove them quickly:
+enum Dir {Up, Down};
+void thread_noise_generator(int iters, size_t numBuffers, Dir addDir, Dir removeDir)
+{
+    const size_t bufferSize = 16;
+    size_t maxSize = numBuffers*bufferSize;
+    HIPASSERT((maxSize % bufferSize) == 0); // loop logic assumes this is true
+
+
+    for (int i=0; i<iters; i++) {
+        char * basePtr = (char*)malloc(maxSize);
+
+        auto acc = hc::accelerator();
+
+        if (addDir == Up) {
+            for (char *p = basePtr; p<basePtr + maxSize; p+=bufferSize) {
+                hc::am_memtracker_add(p, bufferSize, acc, false);
+            }
+        } else if (addDir == Down) {
+            for (char *p = basePtr+maxSize-bufferSize; p>=0; p-=bufferSize) {
+                hc::am_memtracker_add(p, bufferSize, acc, false);
+            }
+        }
+
+        if (removeDir == Up) {
+            for (char *p = basePtr; p<basePtr + maxSize; p+=bufferSize) {
+                hc::am_memtracker_remove(p);
+            }
+        } else if (removeDir == Down) {
+            for (char *p = basePtr+maxSize-bufferSize; p>=0; p-=bufferSize)  {
+                hc::am_memtracker_remove(p);
+            }
+        }
+    }
+}
+
+
+//---
+//Multi-thread test that is effective at catching locking errors in the alloc/dealloc/tracker.
+//The query thread repeately requests information on the same block of memory.
+//Meanwhile, the thread_noise_generator registers a large number of blocks, and 
+//then unregisters them.   This causes a large amount of rebalancing in the tree
+//structure and will generate errors unless the locks in the tracker are preventing reading 
+//while writing.
+void testMultiThreaded_2()
+{
+    std::atomic<int> inflight(2);
+
+    printf ("\n===========================================================================\n");
+    printf ("MultiThreaded_2\n");
+    printf ("===========================================================================\n");
+
+    hipSetDevice(0);
+    hipDeviceReset();
+
+    // Create some entries in the tracker:
+    for (int i=0; i<1000; i++) {
+        void *C_d;
+        HIPCHECK(hipMalloc(&C_d, 32));
+    }
+
+
+    // Allocate a pointer that we will repeatedly lookup:
+    void *A_d;
+    HIPCHECK(hipMalloc(&A_d, 10000));
+    hipPointerAttribute_t attrib1;
+    HIPCHECK(hipPointerGetAttributes(&attrib1, A_d));
+    std::thread t1(thread_query, A_d, &attrib1);
+
+    std::thread t2(thread_noise_generator, 10000, 1000, Up, Up);
+
+    t1.join();
+    t2.join();
+
+    hipSetDevice(0);
+    hipDeviceReset();
+}
+#endif
+
+
+
+int main(int argc, char *argv[])
+{
+    N= 1000000;
+    HipTest::parseStandardArguments(argc, argv, true);
+
+
+    Nbytes = N*sizeof(char);
+
+    printf ("N=%zu (%6.2f MB) device=%d\n", N, Nbytes/(1024.0*1024.0), p_gpuDevice);
+
+
+    if (p_tests & 0x01) {
+        printf ("info: set device to %d\n", p_gpuDevice);
+        HIPCHECK(hipSetDevice(p_gpuDevice));
+        testSimple();
+    }
+
+    if (p_tests & 0x02) {
+        srand(0x100);
+        printf ("\n===========================================================================\n");
+        clusterAllocs(100, 1024*1, 1024*1024);
+        resetTracker();
+    }
+
+    if (p_tests & 0x04) {
+        srand(0x200);
+        printf ("\n===========================================================================\n");
+        clusterAllocs(1000, 1, 10); //  Many tiny allocations;
+        resetTracker();
+    }
+
+    if (p_tests & 0x08) {
+        srand(0x300);
+        testMultiThreaded_1(true);
+        testMultiThreaded_1(false);
+    }
+
+
+#ifdef __HIP_PLATFORM_HCC__
+    if (p_tests & 0x10) {
+        srand(0x400);
+        testMultiThreaded_2();
+        resetTracker();
+    }
+#endif
+
+    printf ("\n");
+    passed();
+}
diff --git a/projects/clr/hipamd/tests/src/test_common.cpp b/projects/clr/hipamd/tests/src/test_common.cpp
index d7a108a11b..3da5568b7c 100644
--- a/projects/clr/hipamd/tests/src/test_common.cpp
+++ b/projects/clr/hipamd/tests/src/test_common.cpp
@@ -28,6 +28,8 @@ int iterations = 1;
 unsigned blocksPerCU = 6; // to hide latency
 unsigned threadsPerBlock = 256; 
 int p_gpuDevice = 0;
+unsigned p_verbose  = 0;
+int p_tests = -1; /*which tests to run. Interpretation is left to each test.  default:all*/
 
 
 
@@ -86,7 +88,7 @@ int parseStandardArguments(int argc, char *argv[], bool failOnUndefinedArg)
 
         if (!strcmp(arg, " ")) {
             // skip NULL args.
-        } else if (!strcmp(arg, "--N")) {
+        } else if (!strcmp(arg, "--N") || (!strcmp(arg, "-N"))) {
             if (++i >= argc || !HipTest::parseSize(argv[i], &N)) {
                failed("Bad N size argument"); 
             }
@@ -114,8 +116,16 @@ int parseStandardArguments(int argc, char *argv[], bool failOnUndefinedArg)
                failed("Bad gpuDevice argument"); 
             }
 
-        } 
-        else {
+        } else if (!strcmp(arg, "--verbose") || (!strcmp(arg, "-v"))) {
+            if (++i >= argc || !HipTest::parseUInt(argv[i], &p_verbose)) {
+               failed("Bad verbose argument"); 
+            }
+        } else if (!strcmp(arg, "--tests") || (!strcmp(arg, "-t"))) {
+            if (++i >= argc || !HipTest::parseInt(argv[i], &p_tests)) {
+               failed("Bad tests argument"); 
+            }
+
+        } else {
             if (failOnUndefinedArg) {
                 failed("Bad argument '%s'", arg);
             } else {
diff --git a/projects/clr/hipamd/tests/src/test_common.h b/projects/clr/hipamd/tests/src/test_common.h
index fee052c1ad..1bf89f1604 100644
--- a/projects/clr/hipamd/tests/src/test_common.h
+++ b/projects/clr/hipamd/tests/src/test_common.h
@@ -25,7 +25,7 @@
     printf (__VA_ARGS__);\
     printf ("\n");\
     printf ("error: TEST FAILED\n%s", KNRM );\
-    exit(EXIT_FAILURE);
+    abort();
 
 
 #define HIPCHECK(error) \
@@ -53,6 +53,8 @@ extern int iterations;
 extern unsigned blocksPerCU;
 extern unsigned threadsPerBlock;
 extern int p_gpuDevice;
+extern unsigned p_verbose;
+extern int p_tests;
 
 namespace HipTest {
 
@@ -86,7 +88,7 @@ vectorADD(hipLaunchParm lp,
     size_t stride = hipBlockDim_x * hipGridDim_x ;
 
     for (size_t i=offset; i<N; i+=stride) {
-		C_d[i] = A_d[i] + B_d[i];
+        C_d[i] = A_d[i] + B_d[i];
 	}
 }
 
@@ -94,7 +96,7 @@ vectorADD(hipLaunchParm lp,
 template <typename T>
 void initArrays(T **A_d, T **B_d, T **C_d,
                 T **A_h, T **B_h, T **C_h, 
-                size_t N) 
+                size_t N, bool usePinnedHost=false) 
 {
     size_t Nbytes = N*sizeof(T);
 
@@ -108,14 +110,32 @@ void initArrays(T **A_d, T **B_d, T **C_d,
         HIPCHECK ( hipMalloc(C_d, Nbytes) );
     }
 
-    if (A_h)
-        *A_h = (T*)malloc(Nbytes);
-    
-    if (B_h)
-        *B_h = (T*)malloc(Nbytes);
+    if (usePinnedHost) {
+        if (A_h) {
+            HIPCHECK ( hipMallocHost(A_h, Nbytes) );
+        }
+        if (B_h) {
+            HIPCHECK ( hipMallocHost(B_h, Nbytes) );
+        }
+        if (C_h) {
+            HIPCHECK ( hipMallocHost(C_h, Nbytes) );
+        }
+    } else {
+        if (A_h) {
+            *A_h = (T*)malloc(Nbytes);
+            HIPASSERT(*A_h != NULL);
+        }
+        
+        if (B_h) {
+            *B_h = (T*)malloc(Nbytes);
+            HIPASSERT(*B_h != NULL);
+        }
 
-    if (C_h)
-        *C_h = (T*)malloc(Nbytes);
+        if (C_h) {
+            *C_h = (T*)malloc(Nbytes);
+            HIPASSERT(*C_h != NULL);
+        }
+    }
 
 
     // Initialize the host data:
@@ -128,7 +148,43 @@ void initArrays(T **A_d, T **B_d, T **C_d,
 }
 
 
+template <typename T>
+void freeArrays(T *A_d, T *B_d, T *C_d,
+                T *A_h, T *B_h, T *C_h, bool usePinnedHost) 
+{
+    if (A_d) {
+        HIPCHECK ( hipFree(A_d) );
+    }
+    if (B_d) {
+        HIPCHECK ( hipFree(B_d) );
+    }
+    if (C_d) {
+        HIPCHECK ( hipFree(C_d) );
+    }
 
+    if (usePinnedHost) {
+        if (A_h) {
+            HIPCHECK (hipFreeHost(A_h));
+        }
+        if (B_h) {
+            HIPCHECK (hipFreeHost(B_h));
+        }
+        if (C_h) {
+            HIPCHECK (hipFreeHost(C_h));
+        }
+    } else {
+        if (A_h) {
+            free (A_h);
+        }
+        if (B_h) {
+            free (B_h);
+        }
+        if (C_h) {
+            free (C_h);
+        }
+    }
+    
+}
 
 
 // Assumes C_h contains vector add of A_h + B_h
diff --git a/projects/clr/hipamd/util/vim/hip.vim b/projects/clr/hipamd/util/vim/hip.vim
index 01f3b3f2ad..e4ea0a4a9e 100644
--- a/projects/clr/hipamd/util/vim/hip.vim
+++ b/projects/clr/hipamd/util/vim/hip.vim
@@ -91,6 +91,7 @@ syn keyword hipFunctionName hipD3D9UnmapResources
 syn keyword hipFunctionName hipD3D9UnregisterResource 
 syn keyword hipFunctionName hipDeviceGetProperties 
 syn keyword hipFunctionName hipDeviceSynchronize 
+syn keyword hipFunctionName hipDeviceReset 
 syn keyword hipFunctionName hipEventCreate 
 syn keyword hipFunctionName hipEventDestroy 
 syn keyword hipFunctionName hipEventElapsedTime 
@@ -151,6 +152,9 @@ syn keyword hipFunctionName hipUnbindTexture
 syn keyword hipFlags hipFilterModePoint 
 syn keyword hipFlags hipMemcpyHostToDevice 
 syn keyword hipFlags hipMemcpyDeviceToHost 
+syn keyword hipFlags hipMemcpyHostToHost 
+syn keyword hipFlags hipMemcpyDeviceToDevice 
+syn keyword hipFlags hipMemcpyDefault 
 syn keyword hipFlags hipReadModeElementType 
 syn keyword hipFlags hipSuccess 
 syn keyword hipFlags hipTextureType1D