Merge branch 'privatestaging' of https://github.com/AMDComputeLibraries/HIP-privatestaging into privatestaging

[ROCm/clr commit: c3b854fba4]
2016-04-14 09:17:30 -05:00
@@ -64,7 +64,7 @@ if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT AND CMAKE_INSTALL_PREFIX MATCHES
    if(CMAKE_BUILD_TYPE MATCHES Debug)
            set(CMAKE_INSTALL_PREFIX ${CMAKE_CURRENT_SOURCE_DIR} CACHE PATH "Installation path for HIP" FORCE)
    elseif(CMAKE_BUILD_TYPE MATCHES Release)
-            set(CMAKE_INSTALL_PREFIX "/opt/hip" CACHE PATH "Installation path for HIP" FORCE)
+            set(CMAKE_INSTALL_PREFIX "/opt/rocm/hip" CACHE PATH "Installation path for HIP" FORCE)
    else()
        message(FATAL_ERROR "Invalid CMAKE_BUILD_TYPE specified. Valid values are Debug and Release")
    endif()
@@ -34,30 +34,34 @@ Make sure HIP_PATH is pointed to `/where/to/install/hip` and PATH includes `$HIP
 ## How do I get set up?

 ### Prerequisites - Choose Your Platform
-HIP code can be developed either on AMD HSA or Boltzmann platform using hcc compiler, or a CUDA platform with nvcc installed:
+HIP code can be developed either on AMD ROCm platform using hcc compiler, or a CUDA platform with nvcc installed:

 #### AMD (hcc):

 * Install [hcc](https://bitbucket.org/multicoreware/hcc/wiki/Home) including supporting HSA kernel and runtime driver stack 
-* By default HIP looks for hcc in /opt/hcc (can be overridden by setting HCC_HOME environment variable)
-* By default HIP looks for HSA in /opt/hsa (can be overridden by setting HSA_PATH environment variable) 
+* By default HIP looks for hcc in /opt/rocm/hcc (can be overridden by setting HCC_HOME environment variable)
+* By default HIP looks for HSA in /opt/rocm/hsa (can be overridden by setting HSA_PATH environment variable) 
 * Ensure that ROCR runtime is installed and added to LD_LIBRARY_PATH
+* Install HIP (from this GitHub repot).  By default HIP is installed into /opt/rocm/hip (can be overridden by setting HIP_PATH environment variable).
+
+* Optionally, consider adding /opt/rocm/bin to your path to make it easier to use the tools.
   
 #### NVIDIA (nvcc)
 * Install CUDA SDK from manufacturer website
 * By default HIP looks for CUDA SDK in /usr/local/cuda (can be overriden by setting CUDA_PATH env variable)

-### Add HIP/bin to your path.
-For example, if this repot is cloned to ~/HIP, and you are running bash:
 ```
-> export PATH=$PATH:~/HIP/bin
+
+#### Verify your installation
+Run hipconfig (instructions below assume default installation path) :
 ```
-Verify your can find hipconfig (one of the hip tools in bin dir):
-```
->  hipconfig -pn
-/home/me/HIP
+>  /opt/rocm/bin/hipconfig --full
 ```

+Compile and run the [square sample](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/tree/master/samples/0_Intro/square). 
+
+
+
 ### HCC Options

 #### Compiling CodeXL markers for HIP Functions
@@ -196,7 +196,7 @@ if ($needHipHcc) {
    $HIP_USE_SHARED_LIBRARY = $ENV{'HIP_USE_SHARED_LIBRARY'};
    $HIP_USE_SHARED_LIBRARY = 0 unless defined $HIP_USE_SHARED_LIBRARY;

-    #$HIPLDFLAGS .= " -L/opt/hip/lib -lhip_hcc" ;
+    #$HIPLDFLAGS .= " -L/opt/rocm/hip/lib -lhip_hcc" ;
    if ($HIP_USE_SHARED_LIBRARY) {
        $HIPLDFLAGS .= " -L$HIP_PATH/lib -Wl,--rpath=$HIP_PATH/lib -lhip_hcc";
    } else {
@@ -107,9 +107,15 @@ HIP is a portable C++ language that supports a strong subset of the CUDA run-tim

 A C++ dialect, hc is supported by the AMD HCC compiler. It provides C++ run time, C++ kernel-launch APIs (parallel_for_each), C++ kernel language, and several memory-management options, including pointers, arrays and array_view (with implicit data synchronization). It's intended to be a leading indicator of the ISO C++ standard.

+
 ### HIP detected my platform (hcc vs nvcc) incorrectly - what should I do?
+
 HIP will set the platform to HCC if it sees that the AMD graphics driver is installed and has detected an AMD GPU.
 Sometimes this isn't what you want - you can force HIP to recognize the platform by setting HIP_PLATFORM to hcc (or nvcc)
 ```
 export HIP_PLATFORM=hcc
+
 ```
+One symptom of this problem is the message "error: 'unknown error'(11) at square.hipref.cpp:56".  This can occur if you have a CUDA installation on an AMD platform, and HIP incorrectly detects the platform as nvcc.  HIP may be able to compile the application using the nvcc tool-chain, but will generate this error at runtime since the platform does not have a CUDA device. The fix is to set HIP_PLATFORM=hcc and rebuild the issue. 
+
+If you see issues related to incorrect platform detection, please file an issue with the GitHub issue tracker so we can improve HIP's platform detection logic.
@@ -0,0 +1 @@
+../include
@@ -1,6 +1,6 @@
 #Dependencies : [MYHIP]/bin must be in user's path.

-HIP_PATH=?../../..
+HIP_PATH?=../../..
 HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform)
 HIPCC=$(HIP_PATH)/bin/hipcc

@@ -24,11 +24,14 @@ THE SOFTWARE.
 #include <hip_runtime.h>


-#define CHECK(error) \
+#define CHECK(cmd) \
+{\
+    hipError_t error  = cmd;\
    if (error != hipSuccess) { \
      fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error,__FILE__, __LINE__); \
    exit(EXIT_FAILURE);\
-	}
+	}\
+}

 void __global__
 bit_extract_kernel(hipLaunchParm lp, uint32_t *C_d, const uint32_t *A_d, size_t N)
@@ -0,0 +1,66 @@
+HCC_HOME?=/opt/rocm/hcc
+HCC = $(HCC_HOME)/bin/hcc
+
+HCC_CFLAGS= `$(HCC_HOME)/bin/hcc-config --cxxflags` 
+HCC_LDFLAGS= `$(HCC_HOME)/bin/hcc-config --ldflags`
+
+CPPAMP_CFLAGS= -std=c++amp -stdlib=libc++ -I/opt/hcc/include
+CPPAMP_LDFLAGS=  -std=c++amp -L/opt/hcc/lib -Wl,--rpath=/opt/hcc/lib -lc++ -lc++abi -ldl -lpthread -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive 
+
+HIP_PATH?=/opt/rocm/hip
+HIPCC=$(HIP_PATH)/bin/hipcc
+HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform)
+
+ifneq (${HIP_PLATFORM}, hcc)
+$(error hcc_dialects requires hcc compiler and only runs on hcc platform)
+endif
+
+
+TARGETS=vadd_hc_arrayview vadd_hc_array vadd_amp_arrayview  vadd_hip  
+
+all: $(TARGETS)
+
+clean: 
+	rm -f $(TARGETS) *.o
+
+run: $(TARGETS)
+	@for t in $(TARGETS); do\
+		echo "Running $$t"; \
+		./$$t; \
+	done
+	
+
+# HCC version:
+vadd_hc_arrayview.o: vadd_hc_arrayview.cpp
+	$(HCC) $(HCC_CFLAGS) -c $< -o $@
+vadd_hc_arrayview: vadd_hc_arrayview.o
+	$(HCC) $(HCC_LDFLAGS) $< -o $@
+
+
+# HCC version, using explicit arrays:
+vadd_hc_array.o: vadd_hc_array.cpp
+	$(HCC) $(HCC_CFLAGS) -c $< -o $@
+vadd_hc_array: vadd_hc_array.o
+	$(HCC) $(HCC_LDFLAGS) $< -o $@
+
+
+# HCC version, using AM (accelerator memory) pointer
+vadd_hc_am.o: vadd_hc_am.cpp
+	$(HCC) $(HCC_CFLAGS) -c $< -o $@
+vadd_hc_am: vadd_hc_am.o
+	$(HCC) $(HCC_LDFLAGS) $< -o $@
+
+
+
+# HIP version:
+vadd_hip.o: vadd_hip.cpp
+	$(HIPCC) -c $< -o $@
+vadd_hip: vadd_hip.o
+	$(HIPCC) $< -o $@
+
+
+# AMP version:
+vadd_amp_arrayview.o: vadd_amp_arrayview.cpp
+	$(HCC) $(CPPAMP_CFLAGS) -c $< -o $@
+vadd_amp_arrayview: vadd_amp_arrayview.o
+	$(HCC) $(CPPAMP_LDFLAGS) $< -o $@
@@ -0,0 +1,48 @@
+// Simple test showing how to use C++AMP syntax with array_view.
+// The code uses AMP's array_view class, which provides automatic data synchronization
+// of data between the host and the accelerator. As noted below, the HCC runtime
+// will automatically copy data to and from the host, without the user needing
+// to manually perform such copies.  This is an excellent mode for developers
+// new to GPU programming and matches the memory models provided by recent systems where
+// CPU and GPU share the same memory pool.  Advanced programmers may prefer 
+// more explicit control over the data movement - shown in the other vadd_hc_array and 
+// vadd_hc_am examples.
+// This example shows the similarity between C++AMP and and HC for simple cases where
+// implicit data transfer is used - really the only difference is the namespace.
+// Other examples show some of the more advanced controls.
+
+#include <amp.h>
+
+int main(int argc, char *argv[])
+{
+    int sizeElements = 1000000;
+
+    // Allocate auto-managed host/device views of data:
+    concurrency::array_view<float> A(sizeElements);
+    concurrency::array_view<float> B(sizeElements);
+    concurrency::array_view<float> C(sizeElements);
+
+    // Initialize host data
+    for (int i=0; i<sizeElements; i++) {
+        A[i] = 1.618f * i; 
+        B[i] = 3.142f * i;
+    }
+    C.discard_data(); // tell runtime not to copy CPU host data.
+
+
+    // Launch kernel onto default accelerator
+    // The HCC runtime will ensure that A and B are available on the accelerator before launching the kernel.
+    concurrency::parallel_for_each(concurrency::extent<1> (sizeElements),
+      [=] (concurrency::index<1> idx) restrict(amp) { 
+        int i = idx[0];
+        C[i] = A[i] + B[i];
+    });
+
+    for (int i=0; i<sizeElements; i++) {
+        float ref= 1.618f * i + 3.142f * i;
+        // Because C is an array_view, the HCC runtime will copy C back to host at first access here:
+        if (C[i] != ref) {
+            printf ("error:%d computed=%6.2f, reference=%6.2f\n", i, C[i], ref);
+        }
+    };
+}
@@ -0,0 +1,59 @@
+// Simple test showing how to use HC syntax with AM (accelerator memory).
+// AM provides a set of c-style memory management routines for allocating,
+// freeing, and copying memory.   am_alloc returns a device pointer
+// which can only be used on the device.  The programmer has full control
+// over when data is copied.  
+
+#include <hc.hpp>
+#include <hc_am.hpp>
+
+int main(int argc, char *argv[])
+{
+    int sizeElements = 1000000;
+    size_t sizeBytes = sizeElements * sizeof(float);
+
+    // Allocate host memory
+    float *A_h = (float*)malloc(sizeBytes);
+    float *B_h = (float*)malloc(sizeBytes);
+    float *C_h = (float*)malloc(sizeBytes);
+
+    // Allocate device pointers:
+    // Unlike array_view, these must be explicitly managed by user:
+    hc::accelerator acc;  // grab default accelerator where we want to allocate memory:
+    hc::accelerator_view av = acc.get_default_view();
+
+    float *A_d, *B_d, *C_d;
+    A_d = hc::am_alloc(sizeBytes, acc, 0);
+    B_d = hc::am_alloc(sizeBytes, acc, 0);
+    C_d = hc::am_alloc(sizeBytes, acc, 0);
+
+    // Initialize host data
+    for (int i=0; i<sizeElements; i++) {
+        A_h[i] = 1.618f * i; 
+        B_h[i] = 3.142f * i;
+    }
+
+    av.copy(A_h, A_d); // C++ copy H2D
+    av.copy(B_h, B_d); //C++  copy H2D
+
+    // Launch kernel onto AV.  
+    // Because the kernel PFE and the copies are submitted to same AV, they will execute in order
+    // and we don't need additional synchronization to ensure the copies complete before the PFE begins.
+    hc::parallel_for_each(av,  hc::extent<1> (sizeElements),
+      [&] (hc::index<1> idx) [[hc]] { 
+        int i = idx[0];
+        C_d[i] = A_d[i] + B_d[i];
+    });
+
+   
+    // This copy is in same AV as the kernel and thus will wait for the kernel to finish before executing.
+    av.copy(C_d, C_h); // C++ copy D2H
+
+
+    for (int i=0; i<sizeElements; i++) {
+        float ref= 1.618f * i + 3.142f * i;
+        if (C_h[i] != ref) {
+            printf ("error:%d computed=%6.2f, reference=%6.2f\n", i, C_h[i], ref);
+        }
+    };
+}
@@ -0,0 +1,53 @@
+// Simple test showing how to use HC syntax with array.
+// Array provides a type-safe C++ mechanism to allocate accelerator memory.  
+// Like array_view, hc::array provides multi-dimensional indexing capability,
+// and is typed.  However, unlike array_view, hc::array does not provide
+// automatic data management capabilities - instead the programmer
+// takes the reins and controls when copies are executed.
+
+#include <hc.hpp>
+
+int main(int argc, char *argv[])
+{
+    int sizeElements = 1000000;
+    size_t sizeBytes = sizeElements * sizeof(float);
+
+    // Allocate host memory
+    float *A_h = (float*)malloc(sizeBytes);
+    float *B_h = (float*)malloc(sizeBytes);
+    float *C_h = (float*)malloc(sizeBytes);
+
+    // Allocate device arrays<>
+    // Unlike array_view, these must be explicitly managed by user:
+    hc::array<float> A_d(sizeElements);
+    hc::array<float> B_d(sizeElements);
+    hc::array<float> C_d(sizeElements);
+
+    // Initialize host data
+    for (int i=0; i<sizeElements; i++) {
+        A_h[i] = 1.618f * i; 
+        B_h[i] = 3.142f * i;
+    }
+
+    hc::copy(A_h, A_d); // C++ copy H2D
+    hc::copy(B_h, B_d); // C++ copy H2D
+
+    // Launch kernel onto default accelerator:
+    // array<> types are not implicitly copied, so we performed copies above.
+    hc::parallel_for_each(hc::extent<1> (sizeElements),
+      [&] (hc::index<1> idx) [[hc]] { 
+        int i = idx[0];
+        C_d[i] = A_d[i] + B_d[i];
+    });
+
+    // HCC runtime knows that C_d depends on previous PFE and will force the copy to wait for the PFE to complte.
+    hc::copy(C_d, C_h); // C++ copy D2H
+
+
+    for (int i=0; i<sizeElements; i++) {
+        float ref= 1.618f * i + 3.142f * i;
+        if (C_h[i] != ref) {
+            printf ("error:%d computed=%6.2f, reference=%6.2f\n", i, C_h[i], ref);
+        }
+    };
+}
@@ -0,0 +1,33 @@
+#include <hc.hpp>
+
+int main(int argc, char *argv[])
+{
+    int size = 1000000;
+
+    // Allocate auto-managed host/device views of data:
+    hc::array_view<float> A(size);
+    hc::array_view<float> B(size);
+    hc::array_view<float> C(size);
+
+    // Initialize host data
+    for (int i=0; i<size; i++) {
+        A[i] = 1.618f * i; 
+        B[i] = 3.142f * i;
+    }
+    C.discard_data(); // tell runtime not to copy CPU host data.
+
+
+    // Launch kernel onto default accelerator:
+    hc::parallel_for_each(hc::extent<1> (size),
+      [=] (hc::index<1> idx) [[hc]] { 
+        int i = idx[0];
+        C[i] = A[i] + B[i];
+    });
+
+    for (int i=0; i<size; i++) {
+        float ref= 1.618f * i + 3.142f * i;
+        if (C[i] != ref) {
+            printf ("error:%d computed=%6.2f, reference=%6.2f\n", i, C[i], ref);
+        }
+    };
+}
@@ -0,0 +1,48 @@
+// Simple test showing how to use HC syntax with array_view.
+// The code uses AMP's array_view class, which provides automatic data synchronization
+// of data between the host and the accelerator. As noted below, the HCC runtime
+// will automatically copy data to and from the host, without the user needing
+// to manually perform such copies.  This is an excellent mode for developers
+// new to GPU programming and matches the memory models provided by recent systems where
+// CPU and GPU share the same memory pool.  Advanced programmers may prefer 
+// more explicit control over the data movement - shown in the other vadd_hc_array and 
+// vadd_hc_am examples.
+// This example shows the similarity between C++AMP and and HC for simple cases where
+// implicit data transfer is used - really the only difference is the namespace.
+// Other examples show some of the more advanced controls.
+
+#include <hc.hpp>
+
+int main(int argc, char *argv[])
+{
+    int sizeElements = 1000000;
+
+    // Allocate auto-managed host/device views of data:
+    hc::array_view<float> A(sizeElements);
+    hc::array_view<float> B(sizeElements);
+    hc::array_view<float> C(sizeElements);
+
+    // Initialize host data
+    for (int i=0; i<sizeElements; i++) {
+        A[i] = 1.618f * i; 
+        B[i] = 3.142f * i;
+    }
+    C.discard_data(); // tell runtime not to copy CPU host data.
+
+
+    // Launch kernel onto default accelerator:
+    // The HCC runtime will ensure that A and B are available on the accelerator before launching the kernel.
+    hc::parallel_for_each(hc::extent<1> (sizeElements),
+      [=] (hc::index<1> idx) [[hc]] { 
+        int i = idx[0];
+        C[i] = A[i] + B[i];
+    });
+
+    for (int i=0; i<sizeElements; i++) {
+        float ref= 1.618f * i + 3.142f * i;
+        // Because C is an array_view, the HCC runtime will copy C back to host at first access here:
+        if (C[i] != ref) {
+            printf ("error:%d computed=%6.2f, reference=%6.2f\n", i, C[i], ref);
+        }
+    };
+}
@@ -0,0 +1,51 @@
+#include <hip_runtime.h>
+
+__global__ void vadd_hip(hipLaunchParm lp, const float *a, const float *b, float *c, int N)
+{
+    int idx = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+
+    if (idx < N) {
+        c[idx] = a[idx] + b[idx];
+    }
+}
+
+
+int main(int argc, char *argv[])
+{
+    int sizeElements = 1000000;
+    size_t sizeBytes = sizeElements * sizeof(float);
+
+    // Allocate host memory
+    float *A_h = (float*)malloc(sizeBytes);
+    float *B_h = (float*)malloc(sizeBytes);
+    float *C_h = (float*)malloc(sizeBytes);
+
+    // Allocate device memory:
+    float *A_d, *B_d, *C_d;
+    hipMalloc(&A_d, sizeBytes);
+    hipMalloc(&B_d, sizeBytes);
+    hipMalloc(&C_d, sizeBytes);
+
+    // Initialize host data
+    for (int i=0; i<sizeElements; i++) {
+        A_h[i] = 1.618f * i; 
+        B_h[i] = 3.142f * i;
+    }
+
+    hipMemcpy(A_d, A_h, sizeBytes, hipMemcpyHostToDevice);
+    hipMemcpy(B_d, B_h, sizeBytes, hipMemcpyHostToDevice);
+
+    // Launch kernel onto default accelerator:
+    int blockSize = 256;  // pick arbitrary block size
+    int blocks = (sizeElements+blockSize-1)/blockSize; // round up to launch enough blocks
+    hipLaunchKernel(vadd_hip, dim3(blocks), dim3(blockSize), 0, 0, A_d, B_d, C_d, sizeElements);
+
+    hipMemcpy(C_h, C_d, sizeBytes, hipMemcpyDeviceToHost);
+
+    for (int i=0; i<sizeElements; i++) {
+        float ref= 1.618f * i + 3.142f * i;
+        if (C_h[i] != ref) {
+            printf ("error:%d computed=%6.2f, reference=%6.2f\n", i, C_h[i], ref);
+        }
+    };
+}
@@ -22,11 +22,14 @@ THE SOFTWARE.
 #include <stdio.h>
 #include <cuda_runtime.h>

-#define CHECK(error) \
+#define CHECK(cmd) \
+{\
+    hipError_t error  = cmd;\
    if (error != cudaSuccess) { \
      fprintf(stderr, "error: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error,__FILE__, __LINE__); \
    exit(EXIT_FAILURE);\
-	}
+	}\
+}


 /* 
@@ -22,11 +22,14 @@ THE SOFTWARE.
 #include <stdio.h>
 #include <hip_runtime.h>

-#define CHECK(error) \
+#define CHECK(cmd) \
+{\
+    hipError_t error  = cmd;\
    if (error != hipSuccess) { \
      fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error,__FILE__, __LINE__); \
    exit(EXIT_FAILURE);\
-	}
+	}\
+}


 /*