Merge branch 'amd-develop' into amd-master

Change-Id: I53d5a8916d769c4f0fe60d2ee3b240551da80b4f
(cherry picked from commit 01c523f6c9)
此提交包含在:
Maneesh Gupta
2017-04-07 15:40:09 +05:30
提交者 Maneesh Gupta
父節點 83dd6b4bec
當前提交 cfdb828e6c
共有 23 個檔案被更改,包括 451 行新增214 行删除
+158 -30
查看文件
@@ -1,49 +1,177 @@
# HIP Bugs
# HIP Bugs
<!-- toc -->
- [Errors related to undefined reference to `__hcLaunchKernel__***__grid_launch_parm**](#errors-related-to-undefined-reference-to-hclaunchkernel__grid_launch_parm)
- [Application hangs after a hipLaunchKernel call](#what-if-i-see-application-hangs-after-a-hiplaunchkernel-call)
- [Errors related to undefined reference to `__hcLaunchKernel__***__grid_launch_parm**`](#errors-related-to-undefined-reference-to-__hclaunchkernel____grid_launch_parm)
- [What is the current limitation of HIP Generic Grid Launch method?](#what-is-the-current-limitation-of-hip-generic-grid-launch-method)
- [Errors related to `no matching constructor`](#errors-related-to-no-matching-constructor)
- [HIP is more restrictive in enforcing restrictions](#hip-is-more-restrictive-in-enforcing-restrictions)
<!-- tocstop -->
### Errors related to undefined reference to `__hcLaunchKernel__***__grid_launch_parm**
### Errors related to undefined reference to `__hcLaunchKernel__***__grid_launch_parm**`
Some common code practices may lead to hipcc generating a error with the form :
undefined reference to `__hcLaunchKernel__ZN15vecAddNamespace6vecAddIidEEv16grid_launch_parmPT0_S3_S3_T_
To workaround, try:
- Avoid calling hcLaunchKernel from a function with the __host__ attribute
__host__ MyFunc(…) {
hipLaunchKernel(myKernel, …)
Suggested workarounds:
- Avoid use of static with kernel definition:
```c++
static __global__ MyKernel
- Avoid defining kernels in anonymous namespace
```
- Avoid defining kernels in anonymous namespace :
```c++
namespace {
__global__ MyKernel
- Avoid calling member functions
If hipLaunchKernel takes parameters that request explicitly memcpy, then it will cause application hang.
Reason is that the hipLaunchKernel macro locks the stream.
If kernel paramters are actually function calls which invoke other hip apis (i.e. memcpy) to the same stream, then deadlock occurs.
To workaround, try:
Move the function calls so they occur outside the hipLaunchKernel macro, store results in temps, then use the tems inside the kernel.
__global__ MyKernel
}
```
// Example pseudo code causing system hang:
// "bottom[0]->gpu_data()" calls hipMemcpy() implicitly and using the same stream, cause deadlock condition.
hipLaunchKernel(HIP_KERNEL_NAME(LRNComputeDiff),dim3(CAFFE_GET_BLOCKS(n_threads)), dim3(CAFFE_HIP_NUM_THREADS), 0, 0, n_threads,
bottom[0]->gpu_data());
// Move "gpu_data()" ouside of hipLaunchKernel to avoid hang.
auto bot_gpu_data = bottom[0]->gpu_data();
hipLaunchKernel( LRNComputeDiff, dim3(CAFFE_GET_BLOCKS(n_threads)), dim3(CAFFE_HIP_NUM_THREADS), 0, 0, n_threads,
bot_gpu_data);
```
### What is the current limitation of HIP Generic Grid Launch method?
1. __global__ functions cannot be marked as static or put in an unnamed namespace i.e. they cannot be given internal linkage (this would clash with __attribute__((weak)));
2. using the macro based dispatch mechanism i.e. hipLaunchKernel* only works for functions that take no more than 20 arguments (this limit can be increased up to 126, and is temporary until we can enable C++14 mode and use variadic generic lambdas); no such limitation applies do dispatching directly through grid_launch.
2. using the macro based dispatch mechanism i.e. hipLaunchKernel* only works for functions that take no more than 20 arguments (this limit can be increased up to 126, and is temporary until we can enable C++14 mode and use variadic generic lambdas); no such limitation applies do dispatching directly through grid_launch.
### Errors related to `no matching constructor`
The symptom is the compiler would complain about errors like `no matching constructor` for classes/structs passed as arguments into a GPU kernel. Often, this is caused by a design limitation in HCC where array-typed member variables inside a class/struct can’t be correctly passed into GPU kernels. To mitigate this issue, a custom serializer/deserializer pair is provided.
For example, `Foo` in the code snippets below contains an array-typed member variable `table`, which would fail the compiler if used as a kernel argument.
```
struct Foo {
// table is an array, which makes foo
int table[3];
};
```
An workaround is to provide a custom serializer on CPU side, and append the contents of the array as kernel arguments:
```
struct Foo {
int table[3];
// user-provided CPU serializer
// must append the contents of the array member as kernel arguments
#ifdef __HCC__
__attribute__((annotate(“serialize”)))
void __cxxamp_serialize(Kalmar::Serialize &s) const {
for (int i = 0; i < 3; ++i)
s.Append(sizeof(int), &table[i]);
}
#endif
};
```
Then, provide a custom deserializer on GPU side, to help reconstruct the array within GPU kernels. Notice that the deserializer can not be a function template, and should have scalar-typed parameters of the number equals to the length of the array-typed member variable. For example:
```
struct Foo {
int table[3];
// user-provided GPU deserializer
// table has 3 int elements, so deserializer must have 3 int parameters.
#ifdef __HCC__
__attribute__((annotate(“user_deserialize”)))
Foo(int x0, int x1, int x2) [[cpu]][[hc]] {
table[0] = x0;
table[1] = x1;
table[2] = x2;
}
#endif
#ifdef __HCC__
__attribute__((annotate(“serialize”)))
void __cxxamp_serialize(Kalmar::Serialize &s) const {
s.Append(sizeof(int), &table[0]);
s.Append(sizeof(int), &table[1]);
s.Append(sizeof(int), &table[2]);
}
#endif
};
```
Rather than create serializer functions, another workaround is to pass the member fields from the structure as simple data types.
### HIP is more restrictive in enforcing restrictions
The language specification for HIP and CUDA forbid calling a
`__device__` function in a `__host__` context. In practice, you may observe
differences in the strictness of this restriction, with HIP exhibiting a tighter
adherence to the specification and thus less tolerant of infringing code. The
solution is to ensure that all functions which are called in a
`__device__` context are correctly annotated to reflect it. An interesting case
where these differences emerge is shown below. This relies on a the common
[C++ Member Detector idiom][1], as it would be implemented pre C++11):
```c++
#include <cassert>
#include <type_traits>
struct aye { bool a[1]; };
struct nay { bool a[2]; };
// Dual restriction is necessary in HIP if the detector is to work for
// __device__ contexts as well as __host__ ones. NVCC is less strict.
template<typename T>
__host__ __device__
const T& cref_t();
template<typename T>
struct Has_call_operator {
// Dual restriction is necessary in HIP if the detector is to work for
// __device__ contexts as well as __host__ ones. NVCC is less strict.
template<typename C>
__host__ __device__
static
aye test(
C const *,
typename std::enable_if<
(sizeof(cref_t<C>().operator()()) > 0)>::type* = nullptr);
static
nay test(...);
enum { value = sizeof(test(static_cast<T*>(0))) == sizeof(aye) };
};
template<typename T, typename U, bool callable = has_call_operator<U>::value>
struct Wrapper {
template<typename V>
V f() const { return T{1}; }
};
template<typename T, typename U>
struct Wrapper<T, U, true> {
template<typename V>
V f() const { return T{10}; }
};
// This specialisation will yield a compile-time error, if selected.
template<typename T, typename U>
struct Wrapper<T, U, false> {};
template<typename T>
struct Functor;
template<> struct Functor<float> {
__device__
float operator()() const { return 42.0f; }
};
__device__
void this_will_not_compile_if_detector_is_not_marked_device()
{
float f = Wrapper<float, Functor<float>>().f<float>();
}
__host__
void this_will_not_compile_if_detector_is_marked_device_only()
{
float f = Wrapper<float, Functor<float>>().f<float>();
}
```
[1]: https://en.wikibooks.org/wiki/More_C%2B%2B_Idioms/Member_Detector
+5 -4
查看文件
@@ -4,7 +4,7 @@
- [What APIs and features does HIP support?](#what-apis-and-features-does-hip-support)
- [What is not supported?](#what-is-not-supported)
* [Run-time features](#run-time-features)
* [Runtime/Driver API features](#runtimedriver-api-features)
* [Kernel language features](#kernel-language-features)
- [Is HIP a drop-in replacement for CUDA?](#is-hip-a-drop-in-replacement-for-cuda)
- [What specific version of CUDA does HIP support?](#what-specific-version-of-cuda-does-hip-support)
@@ -23,10 +23,11 @@
- [On HCC, can I link HIP code with host code compiled with another compiler such as gcc, icc, or clang ?](#on-hcc-can-i-link-hip-code-with-host-code-compiled-with-another-compiler-such-as-gcc-icc-or-clang-)
- [HIP detected my platform (hcc vs nvcc) incorrectly - what should I do?](#hip-detected-my-platform-hcc-vs-nvcc-incorrectly---what-should-i-do)
- [Can I install both CUDA SDK and HCC on same machine?](#can-i-install-both-cuda-sdk-and-hcc-on-same-machine)
- [On CUDA, can I mix CUDA code with HIP code?](#on-cuda-can-i-mix-cuda-code-with-hip-code)
- [On HCC, can I use HC functionality with HIP?](#on-hcc-can-i-use-hc-functionality-with-hip)
- [How do I trace HIP application flow?](#how-do-i-trace-hip-application-flow)
* [Using CodeXL markers for HIP Functions](#using-codexl-markers-for-hip-functions)
* [Using HIP_TRACE_API](#using-hip_trace_api)
- [How do I enable HIP Generic Grid Launch option?](#how-do-i-enable-hip-generic-grid-launch-option)
- [What if HIP generates error of "symbol multiply defined!" only on AMD machine?](#what-if-hip-generates-error-of-symbol-multiply-defined-only-on-amd-machine)
- [How do I disable HIP Generic Grid Launch option?](#how-do-i-disable-hip-generic-grid-launch-option)
<!-- tocstop -->
+1
查看文件
@@ -44,6 +44,7 @@
- [Pragma Unroll](#pragma-unroll)
- [In-Line Assembly](#in-line-assembly)
- [C++ Support](#c-support)
- [Kernel Compilation](#kernel-compilation)
<!-- tocstop -->
+6 -3
查看文件
@@ -21,6 +21,7 @@ and provides practical suggestions on how to port CUDA code and work through com
* [Device-Architecture Properties](#device-architecture-properties)
* [Table of Architecture Properties](#table-of-architecture-properties)
- [Finding HIP](#finding-hip)
- [hipLaunchKernel](#hiplaunchkernel)
- [Compiler Options](#compiler-options)
- [Linking Issues](#linking-issues)
* [Linking With hipcc](#linking-with-hipcc)
@@ -31,9 +32,11 @@ and provides practical suggestions on how to port CUDA code and work through com
* [Using a Standard C++ Compiler](#using-a-standard-c-compiler)
+ [cuda.h](#cudah)
* [Choosing HIP File Extensions](#choosing-hip-file-extensions)
* [Workarounds](#workarounds)
+ [warpSize](#warpsize)
+ [Textures and Cache Control](#textures-and-cache-control)
- [Workarounds](#workarounds)
* [warpSize](#warpsize)
- [memcpyToSymbol](#memcpytosymbol)
- [threadfence_system](#threadfence_system)
* [Textures and Cache Control](#textures-and-cache-control)
- [More Tips](#more-tips)
* [HIPTRACE Mode](#hiptrace-mode)
* [Environment Variables](#environment-variables)
+26 -20
查看文件
@@ -4,26 +4,32 @@ This section describes the profiling and debugging capabilities that HIP provide
Profiling information can viewed in the CodeXL visualization tool or printed directly to stderr as the application runs.
This document starts with some of the general capabilities of CodeXL and then describes some of the additional HIP marker and debug features.
* [CodeXL Profiling](#codexl-profiling)
* [Collecting and Viewing Traces](#collecting-and-viewing-traces)
* [Using rocm-profiler timestamp profiling](#using-rocm-profiler-timestamp-profiling)
* [Using rocm-profiler performance counter collection:](#using-rocm-profiler-performance-counter-collection)
* [Using CodeXL to view profiling results:](#using-codexl-to-view-profiling-results)
* [More information on CodeXL](#more-information-on-codexl)
* [HIP Markers](#hip-markers)
* [Profiling HIP APIs](#profiling-hip-apis)
* [Adding markers to applications](#adding-markers-to-applications)
* [Additional HIP Profiling Features](#additional-hip-profiling-features)
* [Demangling C Kernel Names](#demangling-c-kernel-names)
* [Controlling when profiling starts and ends](#controlling-when-profiling-starts-and-ends)
* [Reducing timeline trace output file size](#reducing-timeline-trace-output-file-size)
* [How to enable profiling at HIP build time](#how-to-enable-profiling-at-hip-build-time)
* [Tracing and Debug](#tracing-and-debug)
* [Tracing HIP APIs](#tracing-hip-apis)
* [Color](#color)
* [Using HIP_DB](#using-hip_db)
* [Using ltrace](#using-ltrace)
* [Chicken bits](#chicken-bits)
<!-- toc -->
- [CodeXL Profiling](#codexl-profiling)
* [Collecting and Viewing Traces](#collecting-and-viewing-traces)
+ [Using rocm-profiler timestamp profiling](#using-rocm-profiler-timestamp-profiling)
+ [Using rocm-profiler performance counter collection:](#using-rocm-profiler-performance-counter-collection)
+ [Using CodeXL to view profiling results:](#using-codexl-to-view-profiling-results)
+ [More information on CodeXL](#more-information-on-codexl)
* [HIP Markers](#hip-markers)
+ [Profiling HIP APIs](#profiling-hip-apis)
+ [Adding markers to applications](#adding-markers-to-applications)
* [Additional HIP Profiling Features](#additional-hip-profiling-features)
+ [Demangling C++ Kernel Names](#demangling-c-kernel-names)
+ [Controlling when profiling starts and ends](#controlling-when-profiling-starts-and-ends)
+ [Reducing timeline trace output file size](#reducing-timeline-trace-output-file-size)
+ [How to enable profiling at HIP build time](#how-to-enable-profiling-at-hip-build-time)
- [Tracing and Debug](#tracing-and-debug)
* [Tracing HIP APIs](#tracing-hip-apis)
+ [Color](#color)
* [Using HIP_DB](#using-hip_db)
* [Using ltrace](#using-ltrace)
* [Chicken bits](#chicken-bits)
* [Debugging HIP Applications](#debugging-hip-applications)
* [General Debugging Tips](#general-debugging-tips)
<!-- tocstop -->
## CodeXL Profiling
+8 -13
查看文件
@@ -2343,9 +2343,6 @@ private:
LangOptions DefaultLangOptions;
SmallString<40> XStr;
raw_svector_ostream OS(XStr);
StringRef initialParamList;
OS << "hipLaunchParm lp";
size_t repLength = OS.str().size();
SourceLocation sl = kernelDecl->getNameInfo().getEndLoc();
SourceLocation kernelArgListStart = Lexer::findLocationAfterToken(sl, tok::l_paren, *SM, DefaultLangOptions, true);
DEBUG(dbgs() << kernelArgListStart.printToString(*SM));
@@ -2355,14 +2352,12 @@ private:
SourceLocation kernelArgListStart(pvdFirst->getLocStart());
SourceLocation kernelArgListEnd(pvdLast->getLocEnd());
SourceLocation stop = Lexer::getLocForEndOfToken(kernelArgListEnd, 0, *SM, DefaultLangOptions);
repLength += SM->getCharacterData(stop) - SM->getCharacterData(kernelArgListStart);
initialParamList = StringRef(SM->getCharacterData(kernelArgListStart), repLength);
OS << ", " << initialParamList;
size_t repLength = SM->getCharacterData(stop) - SM->getCharacterData(kernelArgListStart);
OS << StringRef(SM->getCharacterData(kernelArgListStart), repLength);
Replacement Rep0(*(Result.SourceManager), kernelArgListStart, repLength, OS.str());
FullSourceLoc fullSL(sl, *(Result.SourceManager));
insertReplacement(Rep0, fullSL);
}
DEBUG(dbgs() << "initial paramlist: " << initialParamList << "\n" << "new paramlist: " << OS.str() << "\n");
Replacement Rep0(*(Result.SourceManager), kernelArgListStart, repLength, OS.str());
FullSourceLoc fullSL(sl, *(Result.SourceManager));
insertReplacement(Rep0, fullSL);
}
bool cudaCall(const MatchFinder::MatchResult &Result) {
@@ -2431,9 +2426,9 @@ private:
XStr.clear();
if (calleeName.find(',') != StringRef::npos) {
SmallString<128> tmpData;
calleeName = Twine("HIP_KERNEL_NAME(" + calleeName + ")").toStringRef(tmpData);
calleeName = Twine("(" + calleeName + ")").toStringRef(tmpData);
}
OS << "hipLaunchKernel(" << calleeName << ",";
OS << "hipLaunchKernelGGL(" << calleeName << ",";
const CallExpr *config = launchKernel->getConfig();
DEBUG(dbgs() << "Kernel config arguments:" << "\n");
SourceManager *SM = Result.SourceManager;
@@ -2473,7 +2468,7 @@ private:
Replacement Rep(*SM, launchKernel->getLocStart(), length, OS.str());
FullSourceLoc fullSL(launchKernel->getLocStart(), *SM);
insertReplacement(Rep, fullSL);
hipCounter counter = {"hipLaunchKernel", CONV_KERN, API_RUNTIME};
hipCounter counter = {"hipLaunchKernelGGL", CONV_KERN, API_RUNTIME};
updateCounters(counter, refName.str());
return true;
}
+13 -10
查看文件
@@ -21,6 +21,7 @@ THE SOFTWARE.
*/
#pragma once
#if GENERIC_GRID_LAUNCH == 1
#include "concepts.hpp"
#include "helpers.hpp"
@@ -840,14 +841,16 @@ namespace hip_impl
group_mem_bytes,\
stream,\
...)\
{\
hipLaunchKernelGGL(\
kernel_name,\
num_blocks,\
dim_blocks,\
group_mem_bytes,\
stream,\
hipLaunchParm{},\
##__VA_ARGS__);\
}
do {\
hipLaunchKernelGGL(\
kernel_name,\
num_blocks,\
dim_blocks,\
group_mem_bytes,\
stream,\
hipLaunchParm{},\
##__VA_ARGS__);\
} while(0)
}
#endif //GENERIC_GRID_LAUNCH
+26 -27
查看文件
@@ -23,8 +23,7 @@ THE SOFTWARE.
#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COMPLEX_H
#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COMPLEX_H
#include "./hip_fp16.h"
#include "./hip_vector_types.h"
#include "hip/hcc_detail/hip_vector_types.h"
#if __cplusplus
#define COMPLEX_ADD_OP_OVERLOAD(type) \
@@ -177,45 +176,45 @@ COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long long)
#endif
__device__ static inline float hipCrealf(hipFloatComplex z){
__device__ __host__ static inline float hipCrealf(hipFloatComplex z){
return z.x;
}
__device__ static inline float hipCimagf(hipFloatComplex z){
__device__ __host__ static inline float hipCimagf(hipFloatComplex z){
return z.y;
}
__device__ static inline hipFloatComplex make_hipFloatComplex(float a, float b){
__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b){
hipFloatComplex z;
z.x = a;
z.y = b;
return z;
}
__device__ static inline hipFloatComplex hipConjf(hipFloatComplex z){
__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z){
hipFloatComplex ret;
ret.x = z.x;
ret.y = -z.y;
return ret;
}
__device__ static inline float hipCsqabsf(hipFloatComplex z){
__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z){
return z.x * z.x + z.y * z.y;
}
__device__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q){
__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q){
return make_hipFloatComplex(p.x + q.x, p.y + q.y);
}
__device__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q){
__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q){
return make_hipFloatComplex(p.x - q.x, p.y - q.y);
}
__device__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q){
__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q){
return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
}
__device__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q){
__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q){
float sqabs = hipCsqabsf(q);
hipFloatComplex ret;
ret.x = (p.x * q.x + p.y * q.y)/sqabs;
@@ -223,51 +222,51 @@ __device__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatCom
return ret;
}
__device__ static inline float hipCabsf(hipFloatComplex z){
__device__ __host__ static inline float hipCabsf(hipFloatComplex z){
return sqrtf(hipCsqabsf(z));
}
__device__ static inline double hipCreal(hipDoubleComplex z){
__device__ __host__ static inline double hipCreal(hipDoubleComplex z){
return z.x;
}
__device__ static inline double hipCimag(hipDoubleComplex z){
__device__ __host__ static inline double hipCimag(hipDoubleComplex z){
return z.y;
}
__device__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b){
__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b){
hipDoubleComplex z;
z.x = a;
z.y = b;
return z;
}
__device__ static inline hipDoubleComplex hipConj(hipDoubleComplex z){
__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z){
hipDoubleComplex ret;
ret.x = z.x;
ret.y = z.y;
return ret;
}
__device__ static inline double hipCsqabs(hipDoubleComplex z){
__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z){
return z.x * z.x + z.y * z.y;
}
__device__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q){
__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q){
return make_hipDoubleComplex(p.x + q.x, p.y + q.y);
}
__device__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q){
__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q){
return make_hipDoubleComplex(p.x - q.x, p.y - q.y);
}
__device__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q){
__device__ __host__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q){
return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
}
__device__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q){
__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q){
double sqabs = hipCsqabs(q);
hipDoubleComplex ret;
ret.x = (p.x * q.x + p.y * q.y)/sqabs;
@@ -275,28 +274,28 @@ __device__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleC
return ret;
}
__device__ static inline double hipCabs(hipDoubleComplex z){
__device__ __host__ static inline double hipCabs(hipDoubleComplex z){
return sqrtf(hipCsqabs(z));
}
typedef hipFloatComplex hipComplex;
__device__ static inline hipComplex make_hipComplex(float x,
__device__ __host__ static inline hipComplex make_hipComplex(float x,
float y){
return make_hipFloatComplex(x, y);
}
__device__ static inline hipFloatComplex hipComplexDoubleToFloat
__device__ __host__ static inline hipFloatComplex hipComplexDoubleToFloat
(hipDoubleComplex z){
return make_hipFloatComplex((float)z.x, (float)z.y);
}
__device__ static inline hipDoubleComplex hipComplexFloatToDouble
__device__ __host__ static inline hipDoubleComplex hipComplexFloatToDouble
(hipFloatComplex z){
return make_hipDoubleComplex((double)z.x, (double)z.y);
}
__device__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r){
__device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r){
float real = (p.x * q.x) + r.x;
float imag = (q.x * p.y) + r.y;
@@ -306,7 +305,7 @@ __device__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComp
return make_hipComplex(real, imag);
}
__device__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q, hipDoubleComplex r){
__device__ __host__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q, hipDoubleComplex r){
float real = (p.x * q.x) + r.x;
float imag = (q.x * p.y) + r.y;
+1 -1
查看文件
@@ -23,7 +23,7 @@ THE SOFTWARE.
#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_FP16_H
#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_FP16_H
#include "hip/hip_runtime.h"
#include "hip/hcc_detail/hip_vector_types.h"
#if __clang_major__ > 3
+42 -1
查看文件
@@ -28,6 +28,17 @@ THE SOFTWARE.
#if __cplusplus
#ifdef __HCC__
#include <hc.hpp>
/**
*-------------------------------------------------------------------------------------------------
*-------------------------------------------------------------------------------------------------
* @defgroup HCC-specific features
* @warning These APIs provide access to special features of HCC compiler and are not available through the CUDA path.
* @{
*/
/**
* @brief Return hc::accelerator associated with the specified deviceId
* @return #hipSuccess, #hipErrorInvalidDevice
@@ -45,6 +56,29 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a
#endif // #ifdef __HCC__
/**
* @brief launches kernel f with launch parameters and shared memory on stream with arguments passed to kernelparams or extra
*
* @param [in[ f Kernel to launch.
* @param [in] gridDimX X grid dimension specified in work-items
* @param [in] gridDimY Y grid dimension specified in work-items
* @param [in] gridDimZ Z grid dimension specified in work-items
* @param [in] blockDimX X block dimensions specified in work-items
* @param [in] blockDimY Y grid dimension specified in work-items
* @param [in] blockDimZ Z grid dimension specified in work-items
* @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The kernel can access this with HIP_DYNAMIC_SHARED.
* @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case th default stream is used with associated synchronization rules.
* @param [in] kernelParams
* @param [in] extra Pointer to kernel arguments. These are passed directly to the kernel and must be in the memory layout and alignment expected by the kernel.
* @param [in] startEvent If non-null, specified event will be updated to track the start time of the kernel launch. The event must be created before calling this API.
* @param [in] stopEvent If non-null, specified event will be updated to track the stop time of the kernel launch. The event must be created before calling this API.
*
* @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
*
* @warning kernellParams argument is not yet implemented in HIP. Please use extra instead. Please refer to hip_porting_driver_api.md for sample usage.
* HIP/ROCm actually updates the start event when the associated kernel completes.
*/
hipError_t hipHccModuleLaunchKernel(hipFunction_t f,
uint32_t globalWorkSizeX,
uint32_t globalWorkSizeY,
@@ -55,8 +89,15 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f,
size_t sharedMemBytes,
hipStream_t hStream,
void **kernelParams,
void **extra);
void **extra,
hipEvent_t startEvent=nullptr,
hipEvent_t stopEvent=nullptr
);
// doxygen end HCC-specific features
/**
* @}
*/
#endif // #if __cplusplus
#endif //
+11 -12
查看文件
@@ -1913,19 +1913,18 @@ hipError_t hipModuleLoadData(hipModule_t *module, const void *image);
/**
* @brief launches kernel f with launch parameters and shared memory on stream with arguments passed to kernelparams or extra
*
* @param [in[ f
* @param [in] gridDimX
* @param [in] gridDimY
* @param [in] gridDimZ
* @param [in] blockDimX
* @param [in] blockDimY
* @param [in] blockDimZ
* @param [in] sharedMemBytes
* @param [in] stream
* @param [in] kernelParams
* @param [in] extraa
* @param [in[ f Kernel to launch.
* @param [in] gridDimX X grid dimension specified as multiple of blockDimX.
* @param [in] gridDimY Y grid dimension specified as multiple of blockDimY.
* @param [in] gridDimZ Z grid dimension specified as multiple of blockDimZ.
* @param [in] blockDimX X block dimensions specified in work-items
* @param [in] blockDimY Y grid dimension specified in work-items
* @param [in] blockDimZ Z grid dimension specified in work-items
* @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The kernel can access this with HIP_DYNAMIC_SHARED.
* @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case th default stream is used with associated synchronization rules.
* @param [in] kernelParams
* @param [in] extra Pointer to kernel arguments. These are passed directly to the kernel and must be in the memory layout and alignment expected by the kernel.
*
* The function takes the above arguments and run the kernel in hipFunction_t f. with launch parameters specified in gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY and blockDimmZ. The amount of shared memory is specificed and can be used with HIP_DYNAMIC_SHARED. The arguemt extra is used to pass in the arguments for the kernel.
* @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
*
* @warning kernellParams argument is not yet implemented in HIP. Please use extra instead. Please refer to hip_porting_driver_api.md for sample usage.
+1 -1
查看文件
@@ -32,7 +32,7 @@ THE SOFTWARE.
#error("This version of HIP requires a newer version of HCC.");
#endif
#include "host_defines.h"
#include "hip/hcc_detail/host_defines.h"
#define MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(type) \
__device__ __host__ type() {} \
-23
查看文件
@@ -1,23 +0,0 @@
#!/bin/bash
function die {
echo "${1-Died}." >&2
exit 1
}
payload=$1
script=$2
[ "$payload" != "" ] || [ "$script" != "" ] || die "Invalid arguments!"
tmp=__extract__$RANDOM
printf "#!/bin/bash
samples_dir=\$1
[ \"\$samples_dir\" != \"\" ] || read -e -p \"Enter the path to extract the HIP samples: \" samples_dir
mkdir -p \$samples_dir
PAYLOAD=\`awk '/^__PAYLOAD_BELOW__/ {print NR + 1; exit 0; }' \$0\`
tail -n+\$PAYLOAD \$0 | tar -xz -C \$samples_dir
echo \"HIP samples installed in \$samples_dir\"
exit 0
__PAYLOAD_BELOW__\n" > "$tmp"
cat "$tmp" "$payload" > "$script" && rm "$tmp"
chmod +x "$script"
+11 -4
查看文件
@@ -1,12 +1,19 @@
cmake_minimum_required(VERSION 2.8.3)
project(hip_doc)
add_custom_target(build_doxygen ALL
find_program(DOXYGEN_EXE doxygen)
if(DOXYGEN_EXE)
add_custom_target(build_doxygen ALL
COMMAND HIP_PATH=@hip_SOURCE_DIR@ doxygen @hip_SOURCE_DIR@/docs/doxygen-input/doxy.cfg)
add_custom_target(convert_md_to_html ALL
install(DIRECTORY RuntimeAPI/html DESTINATION docs/docs/RuntimeAPI)
endif()
find_program(GRIP_EXE grip)
if(GRIP_EXE)
add_custom_target(convert_md_to_html ALL
COMMAND @hip_SOURCE_DIR@/packaging/convert_md_to_html.sh @hip_SOURCE_DIR@ ${PROJECT_BINARY_DIR}/md2html)
install(DIRECTORY RuntimeAPI/html DESTINATION docs/docs/RuntimeAPI)
install(DIRECTORY md2html/ DESTINATION docs)
install(DIRECTORY md2html/ DESTINATION docs)
endif()
#############################
# Packaging steps
-4
查看文件
@@ -11,10 +11,6 @@ HIPCC=$(HIP_PATH)/bin/hipcc
ifeq (${HIP_PLATFORM}, nvcc)
HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20
endif
ifeq (${HIP_PLATFORM}, hcc)
HIPCC_FLAGS = -stdlib=libc++
endif
EXE=bit_extract
-3
查看文件
@@ -10,9 +10,6 @@ OPT=-O3
CXXFLAGS = $(OPT) --std=c++11
HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform)
ifeq (${HIP_PLATFORM}, hcc)
CXXFLAGS += " -stdlib=libc++"
endif
CODE_OBJECTS=nullkernel.hsaco
-1
查看文件
@@ -11,7 +11,6 @@
#include <elf.h>
#include <hsa/hsa.h>
#include <hc.hpp>
#include <hip/hcc_detail/hcc_acc.h>
#endif
#include <sys/time.h>
+52 -8
查看文件
@@ -30,6 +30,54 @@ THE SOFTWARE.
//---
ihipEvent_t::ihipEvent_t(unsigned flags)
{
_state = hipEventStatusCreated;
_stream = NULL;
_flags = flags;
_timestamp = 0;
_type = hipEventTypeIndependent;
};
// Attach to an existing completion future:
void ihipEvent_t::attachToCompletionFuture(const hc::completion_future *cf, ihipEventType_t eventType)
{
_state = hipEventStatusRecording;
_marker = *cf;
_type = eventType;
}
void ihipEvent_t::setTimestamp()
{
if (_state == hipEventStatusRecorded) {
// already recorded, done:
return;
} else {
// TODO - use completion-future functions to obtain ticks and timestamps:
hsa_signal_t *sig = static_cast<hsa_signal_t*> (_marker.get_native_handle());
if (sig) {
if (hsa_signal_load_acquire(*sig) == 0) {
if ((_type == hipEventTypeIndependent) || (_type == hipEventTypeStopCommand)) {
_timestamp = _marker.get_end_tick();
} else if (_type == hipEventTypeStartCommand) {
_timestamp = _marker.get_begin_tick();
} else {
assert(0); // TODO - move to debug assert
_timestamp = 0;
}
_state = hipEventStatusRecorded;
}
}
}
}
hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags)
{
hipError_t e = hipSuccess;
@@ -37,12 +85,8 @@ hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags)
// TODO-IPC - support hipEventInterprocess.
unsigned supportedFlags = hipEventDefault | hipEventBlockingSync | hipEventDisableTiming;
if ((flags & ~supportedFlags) == 0) {
ihipEvent_t *eh = new ihipEvent_t();
ihipEvent_t *eh = new ihipEvent_t(flags);
eh->_state = hipEventStatusCreated;
eh->_stream = NULL;
eh->_flags = flags;
eh->_timestamp = 0;
*event = eh;
} else {
e = hipErrorInvalidValue;
@@ -141,8 +185,8 @@ hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop)
ihipEvent_t *start_eh = start;
ihipEvent_t *stop_eh = stop;
ihipSetTs(start);
ihipSetTs(stop);
start->setTimestamp();
stop->setTimestamp();
hipError_t status = hipSuccess;
*ms = 0.0f;
@@ -151,7 +195,7 @@ hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop)
if ((start_eh->_state == hipEventStatusRecorded) && (stop_eh->_state == hipEventStatusRecorded)) {
// Common case, we have good information for both events.
int64_t tickDiff = (stop_eh->_timestamp - start_eh->_timestamp);
int64_t tickDiff = (stop_eh->timestamp() - start_eh->timestamp());
uint64_t freqHz;
hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freqHz);
-17
查看文件
@@ -1641,23 +1641,6 @@ const char *ihipErrorString(hipError_t hip_error)
};
void ihipSetTs(hipEvent_t e)
{
ihipEvent_t *eh = e;
if (eh->_state == hipEventStatusRecorded) {
// already recorded, done:
return;
} else {
// TODO - use completion-future functions to obtain ticks and timestamps:
hsa_signal_t *sig = static_cast<hsa_signal_t*> (eh->_marker.get_native_handle());
if (sig) {
if (hsa_signal_load_acquire(*sig) == 0) {
eh->_timestamp = eh->_marker.get_end_tick();
eh->_state = hipEventStatusRecorded;
}
}
}
}
// Returns true if copyEngineCtx can see the memory allocated on dstCtx and srcCtx.
+24 -8
查看文件
@@ -584,22 +584,40 @@ private: // Data
//----
// Internal event structure:
enum hipEventStatus_t {
hipEventStatusUnitialized = 0, // event is unutilized, must be "Created" before use.
hipEventStatusCreated = 1,
hipEventStatusRecording = 2, // event has been enqueued to record something.
hipEventStatusRecorded = 3, // event has been recorded - timestamps are valid.
hipEventStatusUnitialized = 0, // event is unutilized, must be "Created" before use.
hipEventStatusCreated = 1,
hipEventStatusRecording = 2, // event has been enqueued to record something.
hipEventStatusRecorded = 3, // event has been recorded - timestamps are valid.
} ;
// TODO - rename to ihip type of some kind
enum ihipEventType_t {
hipEventTypeIndependent,
hipEventTypeStartCommand,
hipEventTypeStopCommand,
};
// internal hip event structure.
struct ihipEvent_t {
hipEventStatus_t _state;
class ihipEvent_t {
public:
ihipEvent_t(unsigned flags);
void attachToCompletionFuture(const hc::completion_future *cf, ihipEventType_t eventType);
void setTimestamp();
uint64_t timestamp() const { return _timestamp; } ;
ihipEventType_t type() const { return _type; };
public:
hipEventStatus_t _state;
hipStream_t _stream; // Stream where the event is recorded, or NULL if all streams.
unsigned _flags;
hc::completion_future _marker;
private:
ihipEventType_t _type;
uint64_t _timestamp; // store timestamp, may be set on host or by marker.
friend hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream);
} ;
@@ -822,8 +840,6 @@ extern hipError_t ihipDeviceSetState();
extern ihipDevice_t *ihipGetDevice(int);
ihipCtx_t * ihipGetPrimaryCtx(unsigned deviceIndex);
extern void ihipSetTs(hipEvent_t e);
hipStream_t ihipSyncAndResolveStream(hipStream_t);
+9 -4
查看文件
@@ -1260,10 +1260,15 @@ hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned
ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) &handle;
//Attach ipc memory
hsa_status_t hsa_status =
hsa_amd_ipc_memory_attach((hsa_amd_ipc_memory_t*)&(iHandle->ipc_handle), iHandle->psize, 1, agent, devPtr);
if(hsa_status != HSA_STATUS_SUCCESS)
hipStatus = hipErrorMapBufferObjectFailed;
auto ctx= ihipGetTlsDefaultCtx();
{
LockedAccessor_CtxCrit_t crit(ctx->criticalData());
// the peerCnt always stores self so make sure the trace actually
hsa_status_t hsa_status =
hsa_amd_ipc_memory_attach((hsa_amd_ipc_memory_t*)&(iHandle->ipc_handle), iHandle->psize, crit->peerCnt(), crit->peerAgents(), devPtr);
if(hsa_status != HSA_STATUS_SUCCESS)
hipStatus = hipErrorMapBufferObjectFailed;
}
#else
hipStatus = hipErrorRuntimeOther;
#endif
+24 -8
查看文件
@@ -364,10 +364,11 @@ hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod,
hipError_t ihipModuleLaunchKernel(hipFunction_t f,
uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ,
size_t sharedMemBytes, hipStream_t hStream,
void **kernelParams, void **extra)
uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ,
size_t sharedMemBytes, hipStream_t hStream,
void **kernelParams, void **extra,
hipEvent_t startEvent, hipEvent_t stopEvent)
{
auto ctx = ihipGetTlsDefaultCtx();
@@ -446,7 +447,20 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
};
lp.av->dispatch_hsa_kernel(&aql, config[1] /* kernarg*/, kernArgSize, nullptr/*completion_future*/);
hc::completion_future cf;
lp.av->dispatch_hsa_kernel(&aql, config[1] /* kernarg*/, kernArgSize,
(startEvent || stopEvent) ? &cf : nullptr);
if (startEvent) {
startEvent->attachToCompletionFuture(&cf, hipEventTypeStartCommand);
}
if (stopEvent) {
stopEvent->attachToCompletionFuture (&cf, hipEventTypeStopCommand);
}
if(kernelParams != NULL){
free(config[1]);
@@ -470,7 +484,8 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f,
return ihipLogStatus(ihipModuleLaunchKernel(f,
blockDimX * gridDimX, blockDimY * gridDimY, gridDimZ * blockDimZ,
blockDimX, blockDimY, blockDimZ,
sharedMemBytes, hStream, kernelParams, extra));
sharedMemBytes, hStream, kernelParams, extra,
nullptr, nullptr));
}
@@ -478,7 +493,8 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f,
uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ,
size_t sharedMemBytes, hipStream_t hStream,
void **kernelParams, void **extra)
void **kernelParams, void **extra,
hipEvent_t startEvent, hipEvent_t stopEvent)
{
HIP_INIT_API(f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ,
localWorkSizeX, localWorkSizeY, localWorkSizeZ,
@@ -486,7 +502,7 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f,
kernelParams, extra);
return ihipLogStatus(ihipModuleLaunchKernel(f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ,
localWorkSizeX, localWorkSizeY, localWorkSizeZ,
sharedMemBytes, hStream, kernelParams, extra));
sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent));
}
hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
+33 -12
查看文件
@@ -1,18 +1,29 @@
#include<iostream>
#include <iostream>
#include "hip/hip_runtime.h"
#include "hip/hip_runtime_api.h"
#include "../test_common.h"
#define LEN 1030
#define SIZE LEN << 2
__global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In, uint32_t *Vald)
/* HIT_START
* BUILD: %t %s ../test_common.cpp
* RUN: %t
* HIT_END
*/
__global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In)
{
memcpy(Out, In, SIZE, Vald);
int tx = hipThreadIdx_x;
memcpy(Out + tx, In + tx, SIZE/LEN);
}
__global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size)
{
memset(ptr, val, size);
int tx = hipThreadIdx_x;
memset(ptr + tx, val, size);
}
int main()
@@ -24,19 +35,29 @@ int main()
Val = new uint32_t;
*Val = 0;
for(int i=0;i<LEN;i++){
A[i] = i *1.0f;
B[i] = 0.0f;
A[i] = i;
B[i] = 0;
}
hipMalloc((void**)&Ad, SIZE);
hipMalloc((void**)&Bd, SIZE);
hipMalloc((void**)&Vald, sizeof(uint32_t));
hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
hipLaunchKernel(cpy, dim3(1), dim3(LEN/4), 0, 0, Bd, Ad, Vald);
hipLaunchKernel(set, dim3(1), dim3(LEN/4), 0, 0, Bd, 0x1, SIZE);
hipLaunchKernel(cpy, dim3(1), dim3(LEN), 0, 0, Bd, Ad);
hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
hipMemcpy(Val, Vald, sizeof(uint32_t), hipMemcpyDeviceToHost);
for(int i=LEN-16;i<LEN;i++){
std::cout<<A[i]<<" "<<B[i]<<std::endl;
if(A[i]!=B[i]){
return 0;
}
}
std::cout<<*Val<<std::endl;
hipLaunchKernel(set, dim3(1), dim3(LEN), 0, 0, Bd, 0x1, LEN);
hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
for(int i=LEN-16;i<LEN;i++){
if(0x01010101!=B[i]){
return 0;
}
}
passed();
}