Merge branch 'privatestaging' of https://github.com/AMDComputeLibraries/HIP-privatestaging into privatestaging

[ROCm/clr commit: c3b854fba4]
Tá an tiomantas seo le fáil i:
Aditya Atluri
2016-04-14 09:17:30 -05:00
tuismitheoir 8fdf04a64b 2a311f25ef
tiomantas 0eab4527d7
D'athraigh 16 comhad le 397 breiseanna agus 19 scriosta
+1 -1
Féach ar an gComhad
@@ -64,7 +64,7 @@ if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT AND CMAKE_INSTALL_PREFIX MATCHES
if(CMAKE_BUILD_TYPE MATCHES Debug)
set(CMAKE_INSTALL_PREFIX ${CMAKE_CURRENT_SOURCE_DIR} CACHE PATH "Installation path for HIP" FORCE)
elseif(CMAKE_BUILD_TYPE MATCHES Release)
set(CMAKE_INSTALL_PREFIX "/opt/hip" CACHE PATH "Installation path for HIP" FORCE)
set(CMAKE_INSTALL_PREFIX "/opt/rocm/hip" CACHE PATH "Installation path for HIP" FORCE)
else()
message(FATAL_ERROR "Invalid CMAKE_BUILD_TYPE specified. Valid values are Debug and Release")
endif()
+14 -10
Féach ar an gComhad
@@ -34,30 +34,34 @@ Make sure HIP_PATH is pointed to `/where/to/install/hip` and PATH includes `$HIP
## How do I get set up?
### Prerequisites - Choose Your Platform
HIP code can be developed either on AMD HSA or Boltzmann platform using hcc compiler, or a CUDA platform with nvcc installed:
HIP code can be developed either on AMD ROCm platform using hcc compiler, or a CUDA platform with nvcc installed:
#### AMD (hcc):
* Install [hcc](https://bitbucket.org/multicoreware/hcc/wiki/Home) including supporting HSA kernel and runtime driver stack
* By default HIP looks for hcc in /opt/hcc (can be overridden by setting HCC_HOME environment variable)
* By default HIP looks for HSA in /opt/hsa (can be overridden by setting HSA_PATH environment variable)
* By default HIP looks for hcc in /opt/rocm/hcc (can be overridden by setting HCC_HOME environment variable)
* By default HIP looks for HSA in /opt/rocm/hsa (can be overridden by setting HSA_PATH environment variable)
* Ensure that ROCR runtime is installed and added to LD_LIBRARY_PATH
* Install HIP (from this GitHub repot). By default HIP is installed into /opt/rocm/hip (can be overridden by setting HIP_PATH environment variable).
* Optionally, consider adding /opt/rocm/bin to your path to make it easier to use the tools.
#### NVIDIA (nvcc)
* Install CUDA SDK from manufacturer website
* By default HIP looks for CUDA SDK in /usr/local/cuda (can be overriden by setting CUDA_PATH env variable)
### Add HIP/bin to your path.
For example, if this repot is cloned to ~/HIP, and you are running bash:
```
> export PATH=$PATH:~/HIP/bin
#### Verify your installation
Run hipconfig (instructions below assume default installation path) :
```
Verify your can find hipconfig (one of the hip tools in bin dir):
```
> hipconfig -pn
/home/me/HIP
> /opt/rocm/bin/hipconfig --full
```
Compile and run the [square sample](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/tree/master/samples/0_Intro/square).
### HCC Options
#### Compiling CodeXL markers for HIP Functions
+1 -1
Féach ar an gComhad
@@ -196,7 +196,7 @@ if ($needHipHcc) {
$HIP_USE_SHARED_LIBRARY = $ENV{'HIP_USE_SHARED_LIBRARY'};
$HIP_USE_SHARED_LIBRARY = 0 unless defined $HIP_USE_SHARED_LIBRARY;
#$HIPLDFLAGS .= " -L/opt/hip/lib -lhip_hcc" ;
#$HIPLDFLAGS .= " -L/opt/rocm/hip/lib -lhip_hcc" ;
if ($HIP_USE_SHARED_LIBRARY) {
$HIPLDFLAGS .= " -L$HIP_PATH/lib -Wl,--rpath=$HIP_PATH/lib -lhip_hcc";
} else {
+6
Féach ar an gComhad
@@ -107,9 +107,15 @@ HIP is a portable C++ language that supports a strong subset of the CUDA run-tim
A C++ dialect, hc is supported by the AMD HCC compiler. It provides C++ run time, C++ kernel-launch APIs (parallel_for_each), C++ kernel language, and several memory-management options, including pointers, arrays and array_view (with implicit data synchronization). It's intended to be a leading indicator of the ISO C++ standard.
### HIP detected my platform (hcc vs nvcc) incorrectly - what should I do?
HIP will set the platform to HCC if it sees that the AMD graphics driver is installed and has detected an AMD GPU.
Sometimes this isn't what you want - you can force HIP to recognize the platform by setting HIP_PLATFORM to hcc (or nvcc)
```
export HIP_PLATFORM=hcc
```
One symptom of this problem is the message "error: 'unknown error'(11) at square.hipref.cpp:56". This can occur if you have a CUDA installation on an AMD platform, and HIP incorrectly detects the platform as nvcc. HIP may be able to compile the application using the nvcc tool-chain, but will generate this error at runtime since the platform does not have a CUDA device. The fix is to set HIP_PLATFORM=hcc and rebuild the issue.
If you see issues related to incorrect platform detection, please file an issue with the GitHub issue tracker so we can improve HIP's platform detection logic.
+1
Féach ar an gComhad
@@ -0,0 +1 @@
../include
@@ -1,6 +1,6 @@
#Dependencies : [MYHIP]/bin must be in user's path.
HIP_PATH=?../../..
HIP_PATH?=../../..
HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform)
HIPCC=$(HIP_PATH)/bin/hipcc
@@ -24,11 +24,14 @@ THE SOFTWARE.
#include <hip_runtime.h>
#define CHECK(error) \
#define CHECK(cmd) \
{\
hipError_t error = cmd;\
if (error != hipSuccess) { \
fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error,__FILE__, __LINE__); \
exit(EXIT_FAILURE);\
}
}\
}
void __global__
bit_extract_kernel(hipLaunchParm lp, uint32_t *C_d, const uint32_t *A_d, size_t N)
@@ -0,0 +1,66 @@
HCC_HOME?=/opt/rocm/hcc
HCC = $(HCC_HOME)/bin/hcc
HCC_CFLAGS= `$(HCC_HOME)/bin/hcc-config --cxxflags`
HCC_LDFLAGS= `$(HCC_HOME)/bin/hcc-config --ldflags`
CPPAMP_CFLAGS= -std=c++amp -stdlib=libc++ -I/opt/hcc/include
CPPAMP_LDFLAGS= -std=c++amp -L/opt/hcc/lib -Wl,--rpath=/opt/hcc/lib -lc++ -lc++abi -ldl -lpthread -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive
HIP_PATH?=/opt/rocm/hip
HIPCC=$(HIP_PATH)/bin/hipcc
HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform)
ifneq (${HIP_PLATFORM}, hcc)
$(error hcc_dialects requires hcc compiler and only runs on hcc platform)
endif
TARGETS=vadd_hc_arrayview vadd_hc_array vadd_amp_arrayview vadd_hip
all: $(TARGETS)
clean:
rm -f $(TARGETS) *.o
run: $(TARGETS)
@for t in $(TARGETS); do\
echo "Running $$t"; \
./$$t; \
done
# HCC version:
vadd_hc_arrayview.o: vadd_hc_arrayview.cpp
$(HCC) $(HCC_CFLAGS) -c $< -o $@
vadd_hc_arrayview: vadd_hc_arrayview.o
$(HCC) $(HCC_LDFLAGS) $< -o $@
# HCC version, using explicit arrays:
vadd_hc_array.o: vadd_hc_array.cpp
$(HCC) $(HCC_CFLAGS) -c $< -o $@
vadd_hc_array: vadd_hc_array.o
$(HCC) $(HCC_LDFLAGS) $< -o $@
# HCC version, using AM (accelerator memory) pointer
vadd_hc_am.o: vadd_hc_am.cpp
$(HCC) $(HCC_CFLAGS) -c $< -o $@
vadd_hc_am: vadd_hc_am.o
$(HCC) $(HCC_LDFLAGS) $< -o $@
# HIP version:
vadd_hip.o: vadd_hip.cpp
$(HIPCC) -c $< -o $@
vadd_hip: vadd_hip.o
$(HIPCC) $< -o $@
# AMP version:
vadd_amp_arrayview.o: vadd_amp_arrayview.cpp
$(HCC) $(CPPAMP_CFLAGS) -c $< -o $@
vadd_amp_arrayview: vadd_amp_arrayview.o
$(HCC) $(CPPAMP_LDFLAGS) $< -o $@
@@ -0,0 +1,48 @@
// Simple test showing how to use C++AMP syntax with array_view.
// The code uses AMP's array_view class, which provides automatic data synchronization
// of data between the host and the accelerator. As noted below, the HCC runtime
// will automatically copy data to and from the host, without the user needing
// to manually perform such copies. This is an excellent mode for developers
// new to GPU programming and matches the memory models provided by recent systems where
// CPU and GPU share the same memory pool. Advanced programmers may prefer
// more explicit control over the data movement - shown in the other vadd_hc_array and
// vadd_hc_am examples.
// This example shows the similarity between C++AMP and and HC for simple cases where
// implicit data transfer is used - really the only difference is the namespace.
// Other examples show some of the more advanced controls.
#include <amp.h>
int main(int argc, char *argv[])
{
int sizeElements = 1000000;
// Allocate auto-managed host/device views of data:
concurrency::array_view<float> A(sizeElements);
concurrency::array_view<float> B(sizeElements);
concurrency::array_view<float> C(sizeElements);
// Initialize host data
for (int i=0; i<sizeElements; i++) {
A[i] = 1.618f * i;
B[i] = 3.142f * i;
}
C.discard_data(); // tell runtime not to copy CPU host data.
// Launch kernel onto default accelerator
// The HCC runtime will ensure that A and B are available on the accelerator before launching the kernel.
concurrency::parallel_for_each(concurrency::extent<1> (sizeElements),
[=] (concurrency::index<1> idx) restrict(amp) {
int i = idx[0];
C[i] = A[i] + B[i];
});
for (int i=0; i<sizeElements; i++) {
float ref= 1.618f * i + 3.142f * i;
// Because C is an array_view, the HCC runtime will copy C back to host at first access here:
if (C[i] != ref) {
printf ("error:%d computed=%6.2f, reference=%6.2f\n", i, C[i], ref);
}
};
}
@@ -0,0 +1,59 @@
// Simple test showing how to use HC syntax with AM (accelerator memory).
// AM provides a set of c-style memory management routines for allocating,
// freeing, and copying memory. am_alloc returns a device pointer
// which can only be used on the device. The programmer has full control
// over when data is copied.
#include <hc.hpp>
#include <hc_am.hpp>
int main(int argc, char *argv[])
{
int sizeElements = 1000000;
size_t sizeBytes = sizeElements * sizeof(float);
// Allocate host memory
float *A_h = (float*)malloc(sizeBytes);
float *B_h = (float*)malloc(sizeBytes);
float *C_h = (float*)malloc(sizeBytes);
// Allocate device pointers:
// Unlike array_view, these must be explicitly managed by user:
hc::accelerator acc; // grab default accelerator where we want to allocate memory:
hc::accelerator_view av = acc.get_default_view();
float *A_d, *B_d, *C_d;
A_d = hc::am_alloc(sizeBytes, acc, 0);
B_d = hc::am_alloc(sizeBytes, acc, 0);
C_d = hc::am_alloc(sizeBytes, acc, 0);
// Initialize host data
for (int i=0; i<sizeElements; i++) {
A_h[i] = 1.618f * i;
B_h[i] = 3.142f * i;
}
av.copy(A_h, A_d); // C++ copy H2D
av.copy(B_h, B_d); //C++ copy H2D
// Launch kernel onto AV.
// Because the kernel PFE and the copies are submitted to same AV, they will execute in order
// and we don't need additional synchronization to ensure the copies complete before the PFE begins.
hc::parallel_for_each(av, hc::extent<1> (sizeElements),
[&] (hc::index<1> idx) [[hc]] {
int i = idx[0];
C_d[i] = A_d[i] + B_d[i];
});
// This copy is in same AV as the kernel and thus will wait for the kernel to finish before executing.
av.copy(C_d, C_h); // C++ copy D2H
for (int i=0; i<sizeElements; i++) {
float ref= 1.618f * i + 3.142f * i;
if (C_h[i] != ref) {
printf ("error:%d computed=%6.2f, reference=%6.2f\n", i, C_h[i], ref);
}
};
}
@@ -0,0 +1,53 @@
// Simple test showing how to use HC syntax with array.
// Array provides a type-safe C++ mechanism to allocate accelerator memory.
// Like array_view, hc::array provides multi-dimensional indexing capability,
// and is typed. However, unlike array_view, hc::array does not provide
// automatic data management capabilities - instead the programmer
// takes the reins and controls when copies are executed.
#include <hc.hpp>
int main(int argc, char *argv[])
{
int sizeElements = 1000000;
size_t sizeBytes = sizeElements * sizeof(float);
// Allocate host memory
float *A_h = (float*)malloc(sizeBytes);
float *B_h = (float*)malloc(sizeBytes);
float *C_h = (float*)malloc(sizeBytes);
// Allocate device arrays<>
// Unlike array_view, these must be explicitly managed by user:
hc::array<float> A_d(sizeElements);
hc::array<float> B_d(sizeElements);
hc::array<float> C_d(sizeElements);
// Initialize host data
for (int i=0; i<sizeElements; i++) {
A_h[i] = 1.618f * i;
B_h[i] = 3.142f * i;
}
hc::copy(A_h, A_d); // C++ copy H2D
hc::copy(B_h, B_d); // C++ copy H2D
// Launch kernel onto default accelerator:
// array<> types are not implicitly copied, so we performed copies above.
hc::parallel_for_each(hc::extent<1> (sizeElements),
[&] (hc::index<1> idx) [[hc]] {
int i = idx[0];
C_d[i] = A_d[i] + B_d[i];
});
// HCC runtime knows that C_d depends on previous PFE and will force the copy to wait for the PFE to complte.
hc::copy(C_d, C_h); // C++ copy D2H
for (int i=0; i<sizeElements; i++) {
float ref= 1.618f * i + 3.142f * i;
if (C_h[i] != ref) {
printf ("error:%d computed=%6.2f, reference=%6.2f\n", i, C_h[i], ref);
}
};
}
@@ -0,0 +1,33 @@
#include <hc.hpp>
int main(int argc, char *argv[])
{
int size = 1000000;
// Allocate auto-managed host/device views of data:
hc::array_view<float> A(size);
hc::array_view<float> B(size);
hc::array_view<float> C(size);
// Initialize host data
for (int i=0; i<size; i++) {
A[i] = 1.618f * i;
B[i] = 3.142f * i;
}
C.discard_data(); // tell runtime not to copy CPU host data.
// Launch kernel onto default accelerator:
hc::parallel_for_each(hc::extent<1> (size),
[=] (hc::index<1> idx) [[hc]] {
int i = idx[0];
C[i] = A[i] + B[i];
});
for (int i=0; i<size; i++) {
float ref= 1.618f * i + 3.142f * i;
if (C[i] != ref) {
printf ("error:%d computed=%6.2f, reference=%6.2f\n", i, C[i], ref);
}
};
}
@@ -0,0 +1,48 @@
// Simple test showing how to use HC syntax with array_view.
// The code uses AMP's array_view class, which provides automatic data synchronization
// of data between the host and the accelerator. As noted below, the HCC runtime
// will automatically copy data to and from the host, without the user needing
// to manually perform such copies. This is an excellent mode for developers
// new to GPU programming and matches the memory models provided by recent systems where
// CPU and GPU share the same memory pool. Advanced programmers may prefer
// more explicit control over the data movement - shown in the other vadd_hc_array and
// vadd_hc_am examples.
// This example shows the similarity between C++AMP and and HC for simple cases where
// implicit data transfer is used - really the only difference is the namespace.
// Other examples show some of the more advanced controls.
#include <hc.hpp>
int main(int argc, char *argv[])
{
int sizeElements = 1000000;
// Allocate auto-managed host/device views of data:
hc::array_view<float> A(sizeElements);
hc::array_view<float> B(sizeElements);
hc::array_view<float> C(sizeElements);
// Initialize host data
for (int i=0; i<sizeElements; i++) {
A[i] = 1.618f * i;
B[i] = 3.142f * i;
}
C.discard_data(); // tell runtime not to copy CPU host data.
// Launch kernel onto default accelerator:
// The HCC runtime will ensure that A and B are available on the accelerator before launching the kernel.
hc::parallel_for_each(hc::extent<1> (sizeElements),
[=] (hc::index<1> idx) [[hc]] {
int i = idx[0];
C[i] = A[i] + B[i];
});
for (int i=0; i<sizeElements; i++) {
float ref= 1.618f * i + 3.142f * i;
// Because C is an array_view, the HCC runtime will copy C back to host at first access here:
if (C[i] != ref) {
printf ("error:%d computed=%6.2f, reference=%6.2f\n", i, C[i], ref);
}
};
}
@@ -0,0 +1,51 @@
#include <hip_runtime.h>
__global__ void vadd_hip(hipLaunchParm lp, const float *a, const float *b, float *c, int N)
{
int idx = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
if (idx < N) {
c[idx] = a[idx] + b[idx];
}
}
int main(int argc, char *argv[])
{
int sizeElements = 1000000;
size_t sizeBytes = sizeElements * sizeof(float);
// Allocate host memory
float *A_h = (float*)malloc(sizeBytes);
float *B_h = (float*)malloc(sizeBytes);
float *C_h = (float*)malloc(sizeBytes);
// Allocate device memory:
float *A_d, *B_d, *C_d;
hipMalloc(&A_d, sizeBytes);
hipMalloc(&B_d, sizeBytes);
hipMalloc(&C_d, sizeBytes);
// Initialize host data
for (int i=0; i<sizeElements; i++) {
A_h[i] = 1.618f * i;
B_h[i] = 3.142f * i;
}
hipMemcpy(A_d, A_h, sizeBytes, hipMemcpyHostToDevice);
hipMemcpy(B_d, B_h, sizeBytes, hipMemcpyHostToDevice);
// Launch kernel onto default accelerator:
int blockSize = 256; // pick arbitrary block size
int blocks = (sizeElements+blockSize-1)/blockSize; // round up to launch enough blocks
hipLaunchKernel(vadd_hip, dim3(blocks), dim3(blockSize), 0, 0, A_d, B_d, C_d, sizeElements);
hipMemcpy(C_h, C_d, sizeBytes, hipMemcpyDeviceToHost);
for (int i=0; i<sizeElements; i++) {
float ref= 1.618f * i + 3.142f * i;
if (C_h[i] != ref) {
printf ("error:%d computed=%6.2f, reference=%6.2f\n", i, C_h[i], ref);
}
};
}
@@ -22,11 +22,14 @@ THE SOFTWARE.
#include <stdio.h>
#include <cuda_runtime.h>
#define CHECK(error) \
#define CHECK(cmd) \
{\
hipError_t error = cmd;\
if (error != cudaSuccess) { \
fprintf(stderr, "error: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error,__FILE__, __LINE__); \
exit(EXIT_FAILURE);\
}
}\
}
/*
@@ -22,11 +22,14 @@ THE SOFTWARE.
#include <stdio.h>
#include <hip_runtime.h>
#define CHECK(error) \
#define CHECK(cmd) \
{\
hipError_t error = cmd;\
if (error != hipSuccess) { \
fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error,__FILE__, __LINE__); \
exit(EXIT_FAILURE);\
}
}\
}
/*