Merge branch 'amd-develop' into amd-master
Change-Id: I53d5a8916d769c4f0fe60d2ee3b240551da80b4f
(cherry picked from commit 01c523f6c9)
此提交包含在:
+158
-30
@@ -1,49 +1,177 @@
|
||||
# HIP Bugs
|
||||
# HIP Bugs
|
||||
|
||||
<!-- toc -->
|
||||
|
||||
- [Errors related to undefined reference to `__hcLaunchKernel__***__grid_launch_parm**](#errors-related-to-undefined-reference-to-hclaunchkernel__grid_launch_parm)
|
||||
- [Application hangs after a hipLaunchKernel call](#what-if-i-see-application-hangs-after-a-hiplaunchkernel-call)
|
||||
- [Errors related to undefined reference to `__hcLaunchKernel__***__grid_launch_parm**`](#errors-related-to-undefined-reference-to-__hclaunchkernel____grid_launch_parm)
|
||||
- [What is the current limitation of HIP Generic Grid Launch method?](#what-is-the-current-limitation-of-hip-generic-grid-launch-method)
|
||||
- [Errors related to `no matching constructor`](#errors-related-to-no-matching-constructor)
|
||||
- [HIP is more restrictive in enforcing restrictions](#hip-is-more-restrictive-in-enforcing-restrictions)
|
||||
|
||||
<!-- tocstop -->
|
||||
|
||||
### Errors related to undefined reference to `__hcLaunchKernel__***__grid_launch_parm**
|
||||
### Errors related to undefined reference to `__hcLaunchKernel__***__grid_launch_parm**`
|
||||
|
||||
Some common code practices may lead to hipcc generating a error with the form :
|
||||
undefined reference to `__hcLaunchKernel__ZN15vecAddNamespace6vecAddIidEEv16grid_launch_parmPT0_S3_S3_T_
|
||||
|
||||
To workaround, try:
|
||||
- Avoid calling hcLaunchKernel from a function with the __host__ attribute
|
||||
__host__ MyFunc(…) {
|
||||
hipLaunchKernel(myKernel, …)
|
||||
Suggested workarounds:
|
||||
- Avoid use of static with kernel definition:
|
||||
```c++
|
||||
static __global__ MyKernel
|
||||
- Avoid defining kernels in anonymous namespace
|
||||
```
|
||||
|
||||
- Avoid defining kernels in anonymous namespace :
|
||||
```c++
|
||||
namespace {
|
||||
__global__ MyKernel …
|
||||
- Avoid calling member functions
|
||||
|
||||
If hipLaunchKernel takes parameters that request explicitly memcpy, then it will cause application hang.
|
||||
Reason is that the hipLaunchKernel macro locks the stream.
|
||||
If kernel paramters are actually function calls which invoke other hip apis (i.e. memcpy) to the same stream, then deadlock occurs.
|
||||
|
||||
To workaround, try:
|
||||
Move the function calls so they occur outside the hipLaunchKernel macro, store results in temps, then use the tems inside the kernel.
|
||||
|
||||
__global__ MyKernel
|
||||
}
|
||||
```
|
||||
// Example pseudo code causing system hang:
|
||||
// "bottom[0]->gpu_data()" calls hipMemcpy() implicitly and using the same stream, cause deadlock condition.
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(LRNComputeDiff),dim3(CAFFE_GET_BLOCKS(n_threads)), dim3(CAFFE_HIP_NUM_THREADS), 0, 0, n_threads,
|
||||
bottom[0]->gpu_data());
|
||||
|
||||
// Move "gpu_data()" ouside of hipLaunchKernel to avoid hang.
|
||||
auto bot_gpu_data = bottom[0]->gpu_data();
|
||||
hipLaunchKernel( LRNComputeDiff, dim3(CAFFE_GET_BLOCKS(n_threads)), dim3(CAFFE_HIP_NUM_THREADS), 0, 0, n_threads,
|
||||
bot_gpu_data);
|
||||
|
||||
```
|
||||
|
||||
### What is the current limitation of HIP Generic Grid Launch method?
|
||||
1. __global__ functions cannot be marked as static or put in an unnamed namespace i.e. they cannot be given internal linkage (this would clash with __attribute__((weak)));
|
||||
2. using the macro based dispatch mechanism i.e. hipLaunchKernel* only works for functions that take no more than 20 arguments (this limit can be increased up to 126, and is temporary until we can enable C++14 mode and use variadic generic lambdas); no such limitation applies do dispatching directly through grid_launch.
|
||||
2. using the macro based dispatch mechanism i.e. hipLaunchKernel* only works for functions that take no more than 20 arguments (this limit can be increased up to 126, and is temporary until we can enable C++14 mode and use variadic generic lambdas); no such limitation applies do dispatching directly through grid_launch.
|
||||
|
||||
|
||||
### Errors related to `no matching constructor`
|
||||
|
||||
The symptom is the compiler would complain about errors like `no matching constructor` for classes/structs passed as arguments into a GPU kernel. Often, this is caused by a design limitation in HCC where array-typed member variables inside a class/struct can’t be correctly passed into GPU kernels. To mitigate this issue, a custom serializer/deserializer pair is provided.
|
||||
|
||||
For example, `Foo` in the code snippets below contains an array-typed member variable `table`, which would fail the compiler if used as a kernel argument.
|
||||
|
||||
```
|
||||
struct Foo {
|
||||
// table is an array, which makes foo
|
||||
int table[3];
|
||||
};
|
||||
```
|
||||
|
||||
An workaround is to provide a custom serializer on CPU side, and append the contents of the array as kernel arguments:
|
||||
|
||||
```
|
||||
|
||||
struct Foo {
|
||||
int table[3];
|
||||
|
||||
// user-provided CPU serializer
|
||||
// must append the contents of the array member as kernel arguments
|
||||
#ifdef __HCC__
|
||||
__attribute__((annotate(“serialize”)))
|
||||
void __cxxamp_serialize(Kalmar::Serialize &s) const {
|
||||
for (int i = 0; i < 3; ++i)
|
||||
s.Append(sizeof(int), &table[i]);
|
||||
}
|
||||
#endif
|
||||
};
|
||||
```
|
||||
|
||||
Then, provide a custom deserializer on GPU side, to help reconstruct the array within GPU kernels. Notice that the deserializer can not be a function template, and should have scalar-typed parameters of the number equals to the length of the array-typed member variable. For example:
|
||||
|
||||
```
|
||||
struct Foo {
|
||||
int table[3];
|
||||
|
||||
// user-provided GPU deserializer
|
||||
// table has 3 int elements, so deserializer must have 3 int parameters.
|
||||
#ifdef __HCC__
|
||||
__attribute__((annotate(“user_deserialize”)))
|
||||
Foo(int x0, int x1, int x2) [[cpu]][[hc]] {
|
||||
table[0] = x0;
|
||||
table[1] = x1;
|
||||
table[2] = x2;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __HCC__
|
||||
__attribute__((annotate(“serialize”)))
|
||||
void __cxxamp_serialize(Kalmar::Serialize &s) const {
|
||||
s.Append(sizeof(int), &table[0]);
|
||||
s.Append(sizeof(int), &table[1]);
|
||||
s.Append(sizeof(int), &table[2]);
|
||||
}
|
||||
#endif
|
||||
};
|
||||
```
|
||||
|
||||
|
||||
Rather than create serializer functions, another workaround is to pass the member fields from the structure as simple data types.
|
||||
|
||||
|
||||
### HIP is more restrictive in enforcing restrictions
|
||||
The language specification for HIP and CUDA forbid calling a
|
||||
`__device__` function in a `__host__` context. In practice, you may observe
|
||||
differences in the strictness of this restriction, with HIP exhibiting a tighter
|
||||
adherence to the specification and thus less tolerant of infringing code. The
|
||||
solution is to ensure that all functions which are called in a
|
||||
`__device__` context are correctly annotated to reflect it. An interesting case
|
||||
where these differences emerge is shown below. This relies on a the common
|
||||
[C++ Member Detector idiom][1], as it would be implemented pre C++11):
|
||||
|
||||
```c++
|
||||
#include <cassert>
|
||||
#include <type_traits>
|
||||
|
||||
struct aye { bool a[1]; };
|
||||
struct nay { bool a[2]; };
|
||||
|
||||
// Dual restriction is necessary in HIP if the detector is to work for
|
||||
// __device__ contexts as well as __host__ ones. NVCC is less strict.
|
||||
template<typename T>
|
||||
__host__ __device__
|
||||
const T& cref_t();
|
||||
|
||||
template<typename T>
|
||||
struct Has_call_operator {
|
||||
// Dual restriction is necessary in HIP if the detector is to work for
|
||||
// __device__ contexts as well as __host__ ones. NVCC is less strict.
|
||||
template<typename C>
|
||||
__host__ __device__
|
||||
static
|
||||
aye test(
|
||||
C const *,
|
||||
typename std::enable_if<
|
||||
(sizeof(cref_t<C>().operator()()) > 0)>::type* = nullptr);
|
||||
static
|
||||
nay test(...);
|
||||
|
||||
enum { value = sizeof(test(static_cast<T*>(0))) == sizeof(aye) };
|
||||
};
|
||||
|
||||
template<typename T, typename U, bool callable = has_call_operator<U>::value>
|
||||
struct Wrapper {
|
||||
template<typename V>
|
||||
V f() const { return T{1}; }
|
||||
};
|
||||
|
||||
|
||||
template<typename T, typename U>
|
||||
struct Wrapper<T, U, true> {
|
||||
template<typename V>
|
||||
V f() const { return T{10}; }
|
||||
};
|
||||
|
||||
// This specialisation will yield a compile-time error, if selected.
|
||||
template<typename T, typename U>
|
||||
struct Wrapper<T, U, false> {};
|
||||
|
||||
template<typename T>
|
||||
struct Functor;
|
||||
|
||||
template<> struct Functor<float> {
|
||||
__device__
|
||||
float operator()() const { return 42.0f; }
|
||||
};
|
||||
|
||||
__device__
|
||||
void this_will_not_compile_if_detector_is_not_marked_device()
|
||||
{
|
||||
float f = Wrapper<float, Functor<float>>().f<float>();
|
||||
}
|
||||
|
||||
__host__
|
||||
void this_will_not_compile_if_detector_is_marked_device_only()
|
||||
{
|
||||
float f = Wrapper<float, Functor<float>>().f<float>();
|
||||
}
|
||||
```
|
||||
[1]: https://en.wikibooks.org/wiki/More_C%2B%2B_Idioms/Member_Detector
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
|
||||
- [What APIs and features does HIP support?](#what-apis-and-features-does-hip-support)
|
||||
- [What is not supported?](#what-is-not-supported)
|
||||
* [Run-time features](#run-time-features)
|
||||
* [Runtime/Driver API features](#runtimedriver-api-features)
|
||||
* [Kernel language features](#kernel-language-features)
|
||||
- [Is HIP a drop-in replacement for CUDA?](#is-hip-a-drop-in-replacement-for-cuda)
|
||||
- [What specific version of CUDA does HIP support?](#what-specific-version-of-cuda-does-hip-support)
|
||||
@@ -23,10 +23,11 @@
|
||||
- [On HCC, can I link HIP code with host code compiled with another compiler such as gcc, icc, or clang ?](#on-hcc-can-i-link-hip-code-with-host-code-compiled-with-another-compiler-such-as-gcc-icc-or-clang-)
|
||||
- [HIP detected my platform (hcc vs nvcc) incorrectly - what should I do?](#hip-detected-my-platform-hcc-vs-nvcc-incorrectly---what-should-i-do)
|
||||
- [Can I install both CUDA SDK and HCC on same machine?](#can-i-install-both-cuda-sdk-and-hcc-on-same-machine)
|
||||
- [On CUDA, can I mix CUDA code with HIP code?](#on-cuda-can-i-mix-cuda-code-with-hip-code)
|
||||
- [On HCC, can I use HC functionality with HIP?](#on-hcc-can-i-use-hc-functionality-with-hip)
|
||||
- [How do I trace HIP application flow?](#how-do-i-trace-hip-application-flow)
|
||||
* [Using CodeXL markers for HIP Functions](#using-codexl-markers-for-hip-functions)
|
||||
* [Using HIP_TRACE_API](#using-hip_trace_api)
|
||||
- [How do I enable HIP Generic Grid Launch option?](#how-do-i-enable-hip-generic-grid-launch-option)
|
||||
- [What if HIP generates error of "symbol multiply defined!" only on AMD machine?](#what-if-hip-generates-error-of-symbol-multiply-defined-only-on-amd-machine)
|
||||
- [How do I disable HIP Generic Grid Launch option?](#how-do-i-disable-hip-generic-grid-launch-option)
|
||||
|
||||
<!-- tocstop -->
|
||||
|
||||
|
||||
@@ -44,6 +44,7 @@
|
||||
- [Pragma Unroll](#pragma-unroll)
|
||||
- [In-Line Assembly](#in-line-assembly)
|
||||
- [C++ Support](#c-support)
|
||||
- [Kernel Compilation](#kernel-compilation)
|
||||
|
||||
<!-- tocstop -->
|
||||
|
||||
|
||||
@@ -21,6 +21,7 @@ and provides practical suggestions on how to port CUDA code and work through com
|
||||
* [Device-Architecture Properties](#device-architecture-properties)
|
||||
* [Table of Architecture Properties](#table-of-architecture-properties)
|
||||
- [Finding HIP](#finding-hip)
|
||||
- [hipLaunchKernel](#hiplaunchkernel)
|
||||
- [Compiler Options](#compiler-options)
|
||||
- [Linking Issues](#linking-issues)
|
||||
* [Linking With hipcc](#linking-with-hipcc)
|
||||
@@ -31,9 +32,11 @@ and provides practical suggestions on how to port CUDA code and work through com
|
||||
* [Using a Standard C++ Compiler](#using-a-standard-c-compiler)
|
||||
+ [cuda.h](#cudah)
|
||||
* [Choosing HIP File Extensions](#choosing-hip-file-extensions)
|
||||
* [Workarounds](#workarounds)
|
||||
+ [warpSize](#warpsize)
|
||||
+ [Textures and Cache Control](#textures-and-cache-control)
|
||||
- [Workarounds](#workarounds)
|
||||
* [warpSize](#warpsize)
|
||||
- [memcpyToSymbol](#memcpytosymbol)
|
||||
- [threadfence_system](#threadfence_system)
|
||||
* [Textures and Cache Control](#textures-and-cache-control)
|
||||
- [More Tips](#more-tips)
|
||||
* [HIPTRACE Mode](#hiptrace-mode)
|
||||
* [Environment Variables](#environment-variables)
|
||||
|
||||
@@ -4,26 +4,32 @@ This section describes the profiling and debugging capabilities that HIP provide
|
||||
Profiling information can viewed in the CodeXL visualization tool or printed directly to stderr as the application runs.
|
||||
This document starts with some of the general capabilities of CodeXL and then describes some of the additional HIP marker and debug features.
|
||||
|
||||
* [CodeXL Profiling](#codexl-profiling)
|
||||
* [Collecting and Viewing Traces](#collecting-and-viewing-traces)
|
||||
* [Using rocm-profiler timestamp profiling](#using-rocm-profiler-timestamp-profiling)
|
||||
* [Using rocm-profiler performance counter collection:](#using-rocm-profiler-performance-counter-collection)
|
||||
* [Using CodeXL to view profiling results:](#using-codexl-to-view-profiling-results)
|
||||
* [More information on CodeXL](#more-information-on-codexl)
|
||||
* [HIP Markers](#hip-markers)
|
||||
* [Profiling HIP APIs](#profiling-hip-apis)
|
||||
* [Adding markers to applications](#adding-markers-to-applications)
|
||||
* [Additional HIP Profiling Features](#additional-hip-profiling-features)
|
||||
* [Demangling C Kernel Names](#demangling-c-kernel-names)
|
||||
* [Controlling when profiling starts and ends](#controlling-when-profiling-starts-and-ends)
|
||||
* [Reducing timeline trace output file size](#reducing-timeline-trace-output-file-size)
|
||||
* [How to enable profiling at HIP build time](#how-to-enable-profiling-at-hip-build-time)
|
||||
* [Tracing and Debug](#tracing-and-debug)
|
||||
* [Tracing HIP APIs](#tracing-hip-apis)
|
||||
* [Color](#color)
|
||||
* [Using HIP_DB](#using-hip_db)
|
||||
* [Using ltrace](#using-ltrace)
|
||||
* [Chicken bits](#chicken-bits)
|
||||
<!-- toc -->
|
||||
|
||||
- [CodeXL Profiling](#codexl-profiling)
|
||||
* [Collecting and Viewing Traces](#collecting-and-viewing-traces)
|
||||
+ [Using rocm-profiler timestamp profiling](#using-rocm-profiler-timestamp-profiling)
|
||||
+ [Using rocm-profiler performance counter collection:](#using-rocm-profiler-performance-counter-collection)
|
||||
+ [Using CodeXL to view profiling results:](#using-codexl-to-view-profiling-results)
|
||||
+ [More information on CodeXL](#more-information-on-codexl)
|
||||
* [HIP Markers](#hip-markers)
|
||||
+ [Profiling HIP APIs](#profiling-hip-apis)
|
||||
+ [Adding markers to applications](#adding-markers-to-applications)
|
||||
* [Additional HIP Profiling Features](#additional-hip-profiling-features)
|
||||
+ [Demangling C++ Kernel Names](#demangling-c-kernel-names)
|
||||
+ [Controlling when profiling starts and ends](#controlling-when-profiling-starts-and-ends)
|
||||
+ [Reducing timeline trace output file size](#reducing-timeline-trace-output-file-size)
|
||||
+ [How to enable profiling at HIP build time](#how-to-enable-profiling-at-hip-build-time)
|
||||
- [Tracing and Debug](#tracing-and-debug)
|
||||
* [Tracing HIP APIs](#tracing-hip-apis)
|
||||
+ [Color](#color)
|
||||
* [Using HIP_DB](#using-hip_db)
|
||||
* [Using ltrace](#using-ltrace)
|
||||
* [Chicken bits](#chicken-bits)
|
||||
* [Debugging HIP Applications](#debugging-hip-applications)
|
||||
* [General Debugging Tips](#general-debugging-tips)
|
||||
|
||||
<!-- tocstop -->
|
||||
|
||||
## CodeXL Profiling
|
||||
|
||||
|
||||
@@ -2343,9 +2343,6 @@ private:
|
||||
LangOptions DefaultLangOptions;
|
||||
SmallString<40> XStr;
|
||||
raw_svector_ostream OS(XStr);
|
||||
StringRef initialParamList;
|
||||
OS << "hipLaunchParm lp";
|
||||
size_t repLength = OS.str().size();
|
||||
SourceLocation sl = kernelDecl->getNameInfo().getEndLoc();
|
||||
SourceLocation kernelArgListStart = Lexer::findLocationAfterToken(sl, tok::l_paren, *SM, DefaultLangOptions, true);
|
||||
DEBUG(dbgs() << kernelArgListStart.printToString(*SM));
|
||||
@@ -2355,14 +2352,12 @@ private:
|
||||
SourceLocation kernelArgListStart(pvdFirst->getLocStart());
|
||||
SourceLocation kernelArgListEnd(pvdLast->getLocEnd());
|
||||
SourceLocation stop = Lexer::getLocForEndOfToken(kernelArgListEnd, 0, *SM, DefaultLangOptions);
|
||||
repLength += SM->getCharacterData(stop) - SM->getCharacterData(kernelArgListStart);
|
||||
initialParamList = StringRef(SM->getCharacterData(kernelArgListStart), repLength);
|
||||
OS << ", " << initialParamList;
|
||||
size_t repLength = SM->getCharacterData(stop) - SM->getCharacterData(kernelArgListStart);
|
||||
OS << StringRef(SM->getCharacterData(kernelArgListStart), repLength);
|
||||
Replacement Rep0(*(Result.SourceManager), kernelArgListStart, repLength, OS.str());
|
||||
FullSourceLoc fullSL(sl, *(Result.SourceManager));
|
||||
insertReplacement(Rep0, fullSL);
|
||||
}
|
||||
DEBUG(dbgs() << "initial paramlist: " << initialParamList << "\n" << "new paramlist: " << OS.str() << "\n");
|
||||
Replacement Rep0(*(Result.SourceManager), kernelArgListStart, repLength, OS.str());
|
||||
FullSourceLoc fullSL(sl, *(Result.SourceManager));
|
||||
insertReplacement(Rep0, fullSL);
|
||||
}
|
||||
|
||||
bool cudaCall(const MatchFinder::MatchResult &Result) {
|
||||
@@ -2431,9 +2426,9 @@ private:
|
||||
XStr.clear();
|
||||
if (calleeName.find(',') != StringRef::npos) {
|
||||
SmallString<128> tmpData;
|
||||
calleeName = Twine("HIP_KERNEL_NAME(" + calleeName + ")").toStringRef(tmpData);
|
||||
calleeName = Twine("(" + calleeName + ")").toStringRef(tmpData);
|
||||
}
|
||||
OS << "hipLaunchKernel(" << calleeName << ",";
|
||||
OS << "hipLaunchKernelGGL(" << calleeName << ",";
|
||||
const CallExpr *config = launchKernel->getConfig();
|
||||
DEBUG(dbgs() << "Kernel config arguments:" << "\n");
|
||||
SourceManager *SM = Result.SourceManager;
|
||||
@@ -2473,7 +2468,7 @@ private:
|
||||
Replacement Rep(*SM, launchKernel->getLocStart(), length, OS.str());
|
||||
FullSourceLoc fullSL(launchKernel->getLocStart(), *SM);
|
||||
insertReplacement(Rep, fullSL);
|
||||
hipCounter counter = {"hipLaunchKernel", CONV_KERN, API_RUNTIME};
|
||||
hipCounter counter = {"hipLaunchKernelGGL", CONV_KERN, API_RUNTIME};
|
||||
updateCounters(counter, refName.str());
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -21,6 +21,7 @@ THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#if GENERIC_GRID_LAUNCH == 1
|
||||
|
||||
#include "concepts.hpp"
|
||||
#include "helpers.hpp"
|
||||
@@ -840,14 +841,16 @@ namespace hip_impl
|
||||
group_mem_bytes,\
|
||||
stream,\
|
||||
...)\
|
||||
{\
|
||||
hipLaunchKernelGGL(\
|
||||
kernel_name,\
|
||||
num_blocks,\
|
||||
dim_blocks,\
|
||||
group_mem_bytes,\
|
||||
stream,\
|
||||
hipLaunchParm{},\
|
||||
##__VA_ARGS__);\
|
||||
}
|
||||
do {\
|
||||
hipLaunchKernelGGL(\
|
||||
kernel_name,\
|
||||
num_blocks,\
|
||||
dim_blocks,\
|
||||
group_mem_bytes,\
|
||||
stream,\
|
||||
hipLaunchParm{},\
|
||||
##__VA_ARGS__);\
|
||||
} while(0)
|
||||
|
||||
}
|
||||
#endif //GENERIC_GRID_LAUNCH
|
||||
|
||||
@@ -23,8 +23,7 @@ THE SOFTWARE.
|
||||
#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COMPLEX_H
|
||||
#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COMPLEX_H
|
||||
|
||||
#include "./hip_fp16.h"
|
||||
#include "./hip_vector_types.h"
|
||||
#include "hip/hcc_detail/hip_vector_types.h"
|
||||
|
||||
#if __cplusplus
|
||||
#define COMPLEX_ADD_OP_OVERLOAD(type) \
|
||||
@@ -177,45 +176,45 @@ COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long long)
|
||||
|
||||
#endif
|
||||
|
||||
__device__ static inline float hipCrealf(hipFloatComplex z){
|
||||
__device__ __host__ static inline float hipCrealf(hipFloatComplex z){
|
||||
return z.x;
|
||||
}
|
||||
|
||||
__device__ static inline float hipCimagf(hipFloatComplex z){
|
||||
__device__ __host__ static inline float hipCimagf(hipFloatComplex z){
|
||||
return z.y;
|
||||
}
|
||||
|
||||
__device__ static inline hipFloatComplex make_hipFloatComplex(float a, float b){
|
||||
__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b){
|
||||
hipFloatComplex z;
|
||||
z.x = a;
|
||||
z.y = b;
|
||||
return z;
|
||||
}
|
||||
|
||||
__device__ static inline hipFloatComplex hipConjf(hipFloatComplex z){
|
||||
__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z){
|
||||
hipFloatComplex ret;
|
||||
ret.x = z.x;
|
||||
ret.y = -z.y;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ static inline float hipCsqabsf(hipFloatComplex z){
|
||||
__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z){
|
||||
return z.x * z.x + z.y * z.y;
|
||||
}
|
||||
|
||||
__device__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q){
|
||||
__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q){
|
||||
return make_hipFloatComplex(p.x + q.x, p.y + q.y);
|
||||
}
|
||||
|
||||
__device__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q){
|
||||
__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q){
|
||||
return make_hipFloatComplex(p.x - q.x, p.y - q.y);
|
||||
}
|
||||
|
||||
__device__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q){
|
||||
__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q){
|
||||
return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
|
||||
}
|
||||
|
||||
__device__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q){
|
||||
__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q){
|
||||
float sqabs = hipCsqabsf(q);
|
||||
hipFloatComplex ret;
|
||||
ret.x = (p.x * q.x + p.y * q.y)/sqabs;
|
||||
@@ -223,51 +222,51 @@ __device__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatCom
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ static inline float hipCabsf(hipFloatComplex z){
|
||||
__device__ __host__ static inline float hipCabsf(hipFloatComplex z){
|
||||
return sqrtf(hipCsqabsf(z));
|
||||
}
|
||||
|
||||
|
||||
|
||||
__device__ static inline double hipCreal(hipDoubleComplex z){
|
||||
__device__ __host__ static inline double hipCreal(hipDoubleComplex z){
|
||||
return z.x;
|
||||
}
|
||||
|
||||
__device__ static inline double hipCimag(hipDoubleComplex z){
|
||||
__device__ __host__ static inline double hipCimag(hipDoubleComplex z){
|
||||
return z.y;
|
||||
}
|
||||
|
||||
__device__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b){
|
||||
__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b){
|
||||
hipDoubleComplex z;
|
||||
z.x = a;
|
||||
z.y = b;
|
||||
return z;
|
||||
}
|
||||
|
||||
__device__ static inline hipDoubleComplex hipConj(hipDoubleComplex z){
|
||||
__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z){
|
||||
hipDoubleComplex ret;
|
||||
ret.x = z.x;
|
||||
ret.y = z.y;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ static inline double hipCsqabs(hipDoubleComplex z){
|
||||
__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z){
|
||||
return z.x * z.x + z.y * z.y;
|
||||
}
|
||||
|
||||
__device__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q){
|
||||
__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q){
|
||||
return make_hipDoubleComplex(p.x + q.x, p.y + q.y);
|
||||
}
|
||||
|
||||
__device__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q){
|
||||
__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q){
|
||||
return make_hipDoubleComplex(p.x - q.x, p.y - q.y);
|
||||
}
|
||||
|
||||
__device__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q){
|
||||
__device__ __host__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q){
|
||||
return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
|
||||
}
|
||||
|
||||
__device__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q){
|
||||
__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q){
|
||||
double sqabs = hipCsqabs(q);
|
||||
hipDoubleComplex ret;
|
||||
ret.x = (p.x * q.x + p.y * q.y)/sqabs;
|
||||
@@ -275,28 +274,28 @@ __device__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleC
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ static inline double hipCabs(hipDoubleComplex z){
|
||||
__device__ __host__ static inline double hipCabs(hipDoubleComplex z){
|
||||
return sqrtf(hipCsqabs(z));
|
||||
}
|
||||
|
||||
typedef hipFloatComplex hipComplex;
|
||||
|
||||
__device__ static inline hipComplex make_hipComplex(float x,
|
||||
__device__ __host__ static inline hipComplex make_hipComplex(float x,
|
||||
float y){
|
||||
return make_hipFloatComplex(x, y);
|
||||
}
|
||||
|
||||
__device__ static inline hipFloatComplex hipComplexDoubleToFloat
|
||||
__device__ __host__ static inline hipFloatComplex hipComplexDoubleToFloat
|
||||
(hipDoubleComplex z){
|
||||
return make_hipFloatComplex((float)z.x, (float)z.y);
|
||||
}
|
||||
|
||||
__device__ static inline hipDoubleComplex hipComplexFloatToDouble
|
||||
__device__ __host__ static inline hipDoubleComplex hipComplexFloatToDouble
|
||||
(hipFloatComplex z){
|
||||
return make_hipDoubleComplex((double)z.x, (double)z.y);
|
||||
}
|
||||
|
||||
__device__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r){
|
||||
__device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r){
|
||||
float real = (p.x * q.x) + r.x;
|
||||
float imag = (q.x * p.y) + r.y;
|
||||
|
||||
@@ -306,7 +305,7 @@ __device__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComp
|
||||
return make_hipComplex(real, imag);
|
||||
}
|
||||
|
||||
__device__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q, hipDoubleComplex r){
|
||||
__device__ __host__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q, hipDoubleComplex r){
|
||||
float real = (p.x * q.x) + r.x;
|
||||
float imag = (q.x * p.y) + r.y;
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@ THE SOFTWARE.
|
||||
#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_FP16_H
|
||||
#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_FP16_H
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "hip/hcc_detail/hip_vector_types.h"
|
||||
|
||||
#if __clang_major__ > 3
|
||||
|
||||
|
||||
@@ -28,6 +28,17 @@ THE SOFTWARE.
|
||||
#if __cplusplus
|
||||
#ifdef __HCC__
|
||||
#include <hc.hpp>
|
||||
|
||||
|
||||
/**
|
||||
*-------------------------------------------------------------------------------------------------
|
||||
*-------------------------------------------------------------------------------------------------
|
||||
* @defgroup HCC-specific features
|
||||
* @warning These APIs provide access to special features of HCC compiler and are not available through the CUDA path.
|
||||
* @{
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* @brief Return hc::accelerator associated with the specified deviceId
|
||||
* @return #hipSuccess, #hipErrorInvalidDevice
|
||||
@@ -45,6 +56,29 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a
|
||||
|
||||
#endif // #ifdef __HCC__
|
||||
|
||||
/**
|
||||
* @brief launches kernel f with launch parameters and shared memory on stream with arguments passed to kernelparams or extra
|
||||
*
|
||||
* @param [in[ f Kernel to launch.
|
||||
* @param [in] gridDimX X grid dimension specified in work-items
|
||||
* @param [in] gridDimY Y grid dimension specified in work-items
|
||||
* @param [in] gridDimZ Z grid dimension specified in work-items
|
||||
* @param [in] blockDimX X block dimensions specified in work-items
|
||||
* @param [in] blockDimY Y grid dimension specified in work-items
|
||||
* @param [in] blockDimZ Z grid dimension specified in work-items
|
||||
* @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The kernel can access this with HIP_DYNAMIC_SHARED.
|
||||
* @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case th default stream is used with associated synchronization rules.
|
||||
* @param [in] kernelParams
|
||||
* @param [in] extra Pointer to kernel arguments. These are passed directly to the kernel and must be in the memory layout and alignment expected by the kernel.
|
||||
* @param [in] startEvent If non-null, specified event will be updated to track the start time of the kernel launch. The event must be created before calling this API.
|
||||
* @param [in] stopEvent If non-null, specified event will be updated to track the stop time of the kernel launch. The event must be created before calling this API.
|
||||
*
|
||||
* @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
|
||||
*
|
||||
* @warning kernellParams argument is not yet implemented in HIP. Please use extra instead. Please refer to hip_porting_driver_api.md for sample usage.
|
||||
|
||||
* HIP/ROCm actually updates the start event when the associated kernel completes.
|
||||
*/
|
||||
hipError_t hipHccModuleLaunchKernel(hipFunction_t f,
|
||||
uint32_t globalWorkSizeX,
|
||||
uint32_t globalWorkSizeY,
|
||||
@@ -55,8 +89,15 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f,
|
||||
size_t sharedMemBytes,
|
||||
hipStream_t hStream,
|
||||
void **kernelParams,
|
||||
void **extra);
|
||||
void **extra,
|
||||
hipEvent_t startEvent=nullptr,
|
||||
hipEvent_t stopEvent=nullptr
|
||||
);
|
||||
|
||||
// doxygen end HCC-specific features
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
#endif // #if __cplusplus
|
||||
|
||||
#endif //
|
||||
|
||||
@@ -1913,19 +1913,18 @@ hipError_t hipModuleLoadData(hipModule_t *module, const void *image);
|
||||
/**
|
||||
* @brief launches kernel f with launch parameters and shared memory on stream with arguments passed to kernelparams or extra
|
||||
*
|
||||
* @param [in[ f
|
||||
* @param [in] gridDimX
|
||||
* @param [in] gridDimY
|
||||
* @param [in] gridDimZ
|
||||
* @param [in] blockDimX
|
||||
* @param [in] blockDimY
|
||||
* @param [in] blockDimZ
|
||||
* @param [in] sharedMemBytes
|
||||
* @param [in] stream
|
||||
* @param [in] kernelParams
|
||||
* @param [in] extraa
|
||||
* @param [in[ f Kernel to launch.
|
||||
* @param [in] gridDimX X grid dimension specified as multiple of blockDimX.
|
||||
* @param [in] gridDimY Y grid dimension specified as multiple of blockDimY.
|
||||
* @param [in] gridDimZ Z grid dimension specified as multiple of blockDimZ.
|
||||
* @param [in] blockDimX X block dimensions specified in work-items
|
||||
* @param [in] blockDimY Y grid dimension specified in work-items
|
||||
* @param [in] blockDimZ Z grid dimension specified in work-items
|
||||
* @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The kernel can access this with HIP_DYNAMIC_SHARED.
|
||||
* @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case th default stream is used with associated synchronization rules.
|
||||
* @param [in] kernelParams
|
||||
* @param [in] extra Pointer to kernel arguments. These are passed directly to the kernel and must be in the memory layout and alignment expected by the kernel.
|
||||
*
|
||||
* The function takes the above arguments and run the kernel in hipFunction_t f. with launch parameters specified in gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY and blockDimmZ. The amount of shared memory is specificed and can be used with HIP_DYNAMIC_SHARED. The arguemt extra is used to pass in the arguments for the kernel.
|
||||
* @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
|
||||
*
|
||||
* @warning kernellParams argument is not yet implemented in HIP. Please use extra instead. Please refer to hip_porting_driver_api.md for sample usage.
|
||||
|
||||
@@ -32,7 +32,7 @@ THE SOFTWARE.
|
||||
#error("This version of HIP requires a newer version of HCC.");
|
||||
#endif
|
||||
|
||||
#include "host_defines.h"
|
||||
#include "hip/hcc_detail/host_defines.h"
|
||||
|
||||
#define MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(type) \
|
||||
__device__ __host__ type() {} \
|
||||
|
||||
@@ -1,23 +0,0 @@
|
||||
#!/bin/bash
|
||||
function die {
|
||||
echo "${1-Died}." >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
payload=$1
|
||||
script=$2
|
||||
[ "$payload" != "" ] || [ "$script" != "" ] || die "Invalid arguments!"
|
||||
tmp=__extract__$RANDOM
|
||||
|
||||
printf "#!/bin/bash
|
||||
samples_dir=\$1
|
||||
[ \"\$samples_dir\" != \"\" ] || read -e -p \"Enter the path to extract the HIP samples: \" samples_dir
|
||||
mkdir -p \$samples_dir
|
||||
PAYLOAD=\`awk '/^__PAYLOAD_BELOW__/ {print NR + 1; exit 0; }' \$0\`
|
||||
tail -n+\$PAYLOAD \$0 | tar -xz -C \$samples_dir
|
||||
echo \"HIP samples installed in \$samples_dir\"
|
||||
exit 0
|
||||
__PAYLOAD_BELOW__\n" > "$tmp"
|
||||
|
||||
cat "$tmp" "$payload" > "$script" && rm "$tmp"
|
||||
chmod +x "$script"
|
||||
+11
-4
@@ -1,12 +1,19 @@
|
||||
cmake_minimum_required(VERSION 2.8.3)
|
||||
project(hip_doc)
|
||||
|
||||
add_custom_target(build_doxygen ALL
|
||||
find_program(DOXYGEN_EXE doxygen)
|
||||
if(DOXYGEN_EXE)
|
||||
add_custom_target(build_doxygen ALL
|
||||
COMMAND HIP_PATH=@hip_SOURCE_DIR@ doxygen @hip_SOURCE_DIR@/docs/doxygen-input/doxy.cfg)
|
||||
add_custom_target(convert_md_to_html ALL
|
||||
install(DIRECTORY RuntimeAPI/html DESTINATION docs/docs/RuntimeAPI)
|
||||
endif()
|
||||
|
||||
find_program(GRIP_EXE grip)
|
||||
if(GRIP_EXE)
|
||||
add_custom_target(convert_md_to_html ALL
|
||||
COMMAND @hip_SOURCE_DIR@/packaging/convert_md_to_html.sh @hip_SOURCE_DIR@ ${PROJECT_BINARY_DIR}/md2html)
|
||||
install(DIRECTORY RuntimeAPI/html DESTINATION docs/docs/RuntimeAPI)
|
||||
install(DIRECTORY md2html/ DESTINATION docs)
|
||||
install(DIRECTORY md2html/ DESTINATION docs)
|
||||
endif()
|
||||
|
||||
#############################
|
||||
# Packaging steps
|
||||
|
||||
@@ -11,10 +11,6 @@ HIPCC=$(HIP_PATH)/bin/hipcc
|
||||
ifeq (${HIP_PLATFORM}, nvcc)
|
||||
HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20
|
||||
endif
|
||||
ifeq (${HIP_PLATFORM}, hcc)
|
||||
HIPCC_FLAGS = -stdlib=libc++
|
||||
endif
|
||||
|
||||
|
||||
EXE=bit_extract
|
||||
|
||||
|
||||
@@ -10,9 +10,6 @@ OPT=-O3
|
||||
CXXFLAGS = $(OPT) --std=c++11
|
||||
|
||||
HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform)
|
||||
ifeq (${HIP_PLATFORM}, hcc)
|
||||
CXXFLAGS += " -stdlib=libc++"
|
||||
endif
|
||||
|
||||
CODE_OBJECTS=nullkernel.hsaco
|
||||
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
#include <elf.h>
|
||||
#include <hsa/hsa.h>
|
||||
#include <hc.hpp>
|
||||
#include <hip/hcc_detail/hcc_acc.h>
|
||||
#endif
|
||||
|
||||
#include <sys/time.h>
|
||||
|
||||
+52
-8
@@ -30,6 +30,54 @@ THE SOFTWARE.
|
||||
//---
|
||||
|
||||
|
||||
ihipEvent_t::ihipEvent_t(unsigned flags)
|
||||
{
|
||||
_state = hipEventStatusCreated;
|
||||
_stream = NULL;
|
||||
_flags = flags;
|
||||
_timestamp = 0;
|
||||
_type = hipEventTypeIndependent;
|
||||
};
|
||||
|
||||
|
||||
|
||||
// Attach to an existing completion future:
|
||||
void ihipEvent_t::attachToCompletionFuture(const hc::completion_future *cf, ihipEventType_t eventType)
|
||||
{
|
||||
_state = hipEventStatusRecording;
|
||||
_marker = *cf;
|
||||
_type = eventType;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void ihipEvent_t::setTimestamp()
|
||||
{
|
||||
if (_state == hipEventStatusRecorded) {
|
||||
// already recorded, done:
|
||||
return;
|
||||
} else {
|
||||
// TODO - use completion-future functions to obtain ticks and timestamps:
|
||||
hsa_signal_t *sig = static_cast<hsa_signal_t*> (_marker.get_native_handle());
|
||||
if (sig) {
|
||||
if (hsa_signal_load_acquire(*sig) == 0) {
|
||||
|
||||
if ((_type == hipEventTypeIndependent) || (_type == hipEventTypeStopCommand)) {
|
||||
_timestamp = _marker.get_end_tick();
|
||||
} else if (_type == hipEventTypeStartCommand) {
|
||||
_timestamp = _marker.get_begin_tick();
|
||||
} else {
|
||||
assert(0); // TODO - move to debug assert
|
||||
_timestamp = 0;
|
||||
}
|
||||
|
||||
_state = hipEventStatusRecorded;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags)
|
||||
{
|
||||
hipError_t e = hipSuccess;
|
||||
@@ -37,12 +85,8 @@ hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags)
|
||||
// TODO-IPC - support hipEventInterprocess.
|
||||
unsigned supportedFlags = hipEventDefault | hipEventBlockingSync | hipEventDisableTiming;
|
||||
if ((flags & ~supportedFlags) == 0) {
|
||||
ihipEvent_t *eh = new ihipEvent_t();
|
||||
ihipEvent_t *eh = new ihipEvent_t(flags);
|
||||
|
||||
eh->_state = hipEventStatusCreated;
|
||||
eh->_stream = NULL;
|
||||
eh->_flags = flags;
|
||||
eh->_timestamp = 0;
|
||||
*event = eh;
|
||||
} else {
|
||||
e = hipErrorInvalidValue;
|
||||
@@ -141,8 +185,8 @@ hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop)
|
||||
ihipEvent_t *start_eh = start;
|
||||
ihipEvent_t *stop_eh = stop;
|
||||
|
||||
ihipSetTs(start);
|
||||
ihipSetTs(stop);
|
||||
start->setTimestamp();
|
||||
stop->setTimestamp();
|
||||
|
||||
hipError_t status = hipSuccess;
|
||||
*ms = 0.0f;
|
||||
@@ -151,7 +195,7 @@ hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop)
|
||||
if ((start_eh->_state == hipEventStatusRecorded) && (stop_eh->_state == hipEventStatusRecorded)) {
|
||||
// Common case, we have good information for both events.
|
||||
|
||||
int64_t tickDiff = (stop_eh->_timestamp - start_eh->_timestamp);
|
||||
int64_t tickDiff = (stop_eh->timestamp() - start_eh->timestamp());
|
||||
|
||||
uint64_t freqHz;
|
||||
hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freqHz);
|
||||
|
||||
@@ -1641,23 +1641,6 @@ const char *ihipErrorString(hipError_t hip_error)
|
||||
};
|
||||
|
||||
|
||||
void ihipSetTs(hipEvent_t e)
|
||||
{
|
||||
ihipEvent_t *eh = e;
|
||||
if (eh->_state == hipEventStatusRecorded) {
|
||||
// already recorded, done:
|
||||
return;
|
||||
} else {
|
||||
// TODO - use completion-future functions to obtain ticks and timestamps:
|
||||
hsa_signal_t *sig = static_cast<hsa_signal_t*> (eh->_marker.get_native_handle());
|
||||
if (sig) {
|
||||
if (hsa_signal_load_acquire(*sig) == 0) {
|
||||
eh->_timestamp = eh->_marker.get_end_tick();
|
||||
eh->_state = hipEventStatusRecorded;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Returns true if copyEngineCtx can see the memory allocated on dstCtx and srcCtx.
|
||||
|
||||
@@ -584,22 +584,40 @@ private: // Data
|
||||
//----
|
||||
// Internal event structure:
|
||||
enum hipEventStatus_t {
|
||||
hipEventStatusUnitialized = 0, // event is unutilized, must be "Created" before use.
|
||||
hipEventStatusCreated = 1,
|
||||
hipEventStatusRecording = 2, // event has been enqueued to record something.
|
||||
hipEventStatusRecorded = 3, // event has been recorded - timestamps are valid.
|
||||
hipEventStatusUnitialized = 0, // event is unutilized, must be "Created" before use.
|
||||
hipEventStatusCreated = 1,
|
||||
hipEventStatusRecording = 2, // event has been enqueued to record something.
|
||||
hipEventStatusRecorded = 3, // event has been recorded - timestamps are valid.
|
||||
} ;
|
||||
|
||||
// TODO - rename to ihip type of some kind
|
||||
enum ihipEventType_t {
|
||||
hipEventTypeIndependent,
|
||||
hipEventTypeStartCommand,
|
||||
hipEventTypeStopCommand,
|
||||
};
|
||||
|
||||
// internal hip event structure.
|
||||
struct ihipEvent_t {
|
||||
hipEventStatus_t _state;
|
||||
class ihipEvent_t {
|
||||
public:
|
||||
ihipEvent_t(unsigned flags);
|
||||
void attachToCompletionFuture(const hc::completion_future *cf, ihipEventType_t eventType);
|
||||
void setTimestamp();
|
||||
uint64_t timestamp() const { return _timestamp; } ;
|
||||
ihipEventType_t type() const { return _type; };
|
||||
|
||||
public:
|
||||
hipEventStatus_t _state;
|
||||
|
||||
hipStream_t _stream; // Stream where the event is recorded, or NULL if all streams.
|
||||
unsigned _flags;
|
||||
|
||||
hc::completion_future _marker;
|
||||
|
||||
private:
|
||||
ihipEventType_t _type;
|
||||
uint64_t _timestamp; // store timestamp, may be set on host or by marker.
|
||||
friend hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream);
|
||||
} ;
|
||||
|
||||
|
||||
@@ -822,8 +840,6 @@ extern hipError_t ihipDeviceSetState();
|
||||
extern ihipDevice_t *ihipGetDevice(int);
|
||||
ihipCtx_t * ihipGetPrimaryCtx(unsigned deviceIndex);
|
||||
|
||||
extern void ihipSetTs(hipEvent_t e);
|
||||
|
||||
|
||||
hipStream_t ihipSyncAndResolveStream(hipStream_t);
|
||||
|
||||
|
||||
@@ -1260,10 +1260,15 @@ hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned
|
||||
|
||||
ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) &handle;
|
||||
//Attach ipc memory
|
||||
hsa_status_t hsa_status =
|
||||
hsa_amd_ipc_memory_attach((hsa_amd_ipc_memory_t*)&(iHandle->ipc_handle), iHandle->psize, 1, agent, devPtr);
|
||||
if(hsa_status != HSA_STATUS_SUCCESS)
|
||||
hipStatus = hipErrorMapBufferObjectFailed;
|
||||
auto ctx= ihipGetTlsDefaultCtx();
|
||||
{
|
||||
LockedAccessor_CtxCrit_t crit(ctx->criticalData());
|
||||
// the peerCnt always stores self so make sure the trace actually
|
||||
hsa_status_t hsa_status =
|
||||
hsa_amd_ipc_memory_attach((hsa_amd_ipc_memory_t*)&(iHandle->ipc_handle), iHandle->psize, crit->peerCnt(), crit->peerAgents(), devPtr);
|
||||
if(hsa_status != HSA_STATUS_SUCCESS)
|
||||
hipStatus = hipErrorMapBufferObjectFailed;
|
||||
}
|
||||
#else
|
||||
hipStatus = hipErrorRuntimeOther;
|
||||
#endif
|
||||
|
||||
+24
-8
@@ -364,10 +364,11 @@ hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod,
|
||||
|
||||
|
||||
hipError_t ihipModuleLaunchKernel(hipFunction_t f,
|
||||
uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
|
||||
uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ,
|
||||
size_t sharedMemBytes, hipStream_t hStream,
|
||||
void **kernelParams, void **extra)
|
||||
uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
|
||||
uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ,
|
||||
size_t sharedMemBytes, hipStream_t hStream,
|
||||
void **kernelParams, void **extra,
|
||||
hipEvent_t startEvent, hipEvent_t stopEvent)
|
||||
{
|
||||
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
@@ -446,7 +447,20 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
|
||||
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
|
||||
};
|
||||
|
||||
lp.av->dispatch_hsa_kernel(&aql, config[1] /* kernarg*/, kernArgSize, nullptr/*completion_future*/);
|
||||
|
||||
hc::completion_future cf;
|
||||
|
||||
lp.av->dispatch_hsa_kernel(&aql, config[1] /* kernarg*/, kernArgSize,
|
||||
(startEvent || stopEvent) ? &cf : nullptr);
|
||||
|
||||
|
||||
if (startEvent) {
|
||||
startEvent->attachToCompletionFuture(&cf, hipEventTypeStartCommand);
|
||||
}
|
||||
if (stopEvent) {
|
||||
stopEvent->attachToCompletionFuture (&cf, hipEventTypeStopCommand);
|
||||
}
|
||||
|
||||
|
||||
if(kernelParams != NULL){
|
||||
free(config[1]);
|
||||
@@ -470,7 +484,8 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f,
|
||||
return ihipLogStatus(ihipModuleLaunchKernel(f,
|
||||
blockDimX * gridDimX, blockDimY * gridDimY, gridDimZ * blockDimZ,
|
||||
blockDimX, blockDimY, blockDimZ,
|
||||
sharedMemBytes, hStream, kernelParams, extra));
|
||||
sharedMemBytes, hStream, kernelParams, extra,
|
||||
nullptr, nullptr));
|
||||
}
|
||||
|
||||
|
||||
@@ -478,7 +493,8 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f,
|
||||
uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
|
||||
uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ,
|
||||
size_t sharedMemBytes, hipStream_t hStream,
|
||||
void **kernelParams, void **extra)
|
||||
void **kernelParams, void **extra,
|
||||
hipEvent_t startEvent, hipEvent_t stopEvent)
|
||||
{
|
||||
HIP_INIT_API(f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ,
|
||||
localWorkSizeX, localWorkSizeY, localWorkSizeZ,
|
||||
@@ -486,7 +502,7 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f,
|
||||
kernelParams, extra);
|
||||
return ihipLogStatus(ihipModuleLaunchKernel(f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ,
|
||||
localWorkSizeX, localWorkSizeY, localWorkSizeZ,
|
||||
sharedMemBytes, hStream, kernelParams, extra));
|
||||
sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent));
|
||||
}
|
||||
|
||||
hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
|
||||
|
||||
@@ -1,18 +1,29 @@
|
||||
#include<iostream>
|
||||
#include <iostream>
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "hip/hip_runtime_api.h"
|
||||
#include "../test_common.h"
|
||||
|
||||
|
||||
#define LEN 1030
|
||||
#define SIZE LEN << 2
|
||||
|
||||
__global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In, uint32_t *Vald)
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../test_common.cpp
|
||||
* RUN: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
|
||||
__global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In)
|
||||
{
|
||||
memcpy(Out, In, SIZE, Vald);
|
||||
int tx = hipThreadIdx_x;
|
||||
memcpy(Out + tx, In + tx, SIZE/LEN);
|
||||
}
|
||||
|
||||
__global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size)
|
||||
{
|
||||
memset(ptr, val, size);
|
||||
int tx = hipThreadIdx_x;
|
||||
memset(ptr + tx, val, size);
|
||||
}
|
||||
|
||||
int main()
|
||||
@@ -24,19 +35,29 @@ int main()
|
||||
Val = new uint32_t;
|
||||
*Val = 0;
|
||||
for(int i=0;i<LEN;i++){
|
||||
A[i] = i *1.0f;
|
||||
B[i] = 0.0f;
|
||||
A[i] = i;
|
||||
B[i] = 0;
|
||||
}
|
||||
hipMalloc((void**)&Ad, SIZE);
|
||||
hipMalloc((void**)&Bd, SIZE);
|
||||
hipMalloc((void**)&Vald, sizeof(uint32_t));
|
||||
hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
|
||||
hipLaunchKernel(cpy, dim3(1), dim3(LEN/4), 0, 0, Bd, Ad, Vald);
|
||||
hipLaunchKernel(set, dim3(1), dim3(LEN/4), 0, 0, Bd, 0x1, SIZE);
|
||||
|
||||
hipLaunchKernel(cpy, dim3(1), dim3(LEN), 0, 0, Bd, Ad);
|
||||
|
||||
hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
|
||||
hipMemcpy(Val, Vald, sizeof(uint32_t), hipMemcpyDeviceToHost);
|
||||
for(int i=LEN-16;i<LEN;i++){
|
||||
std::cout<<A[i]<<" "<<B[i]<<std::endl;
|
||||
if(A[i]!=B[i]){
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
std::cout<<*Val<<std::endl;
|
||||
hipLaunchKernel(set, dim3(1), dim3(LEN), 0, 0, Bd, 0x1, LEN);
|
||||
|
||||
hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
|
||||
for(int i=LEN-16;i<LEN;i++){
|
||||
if(0x01010101!=B[i]){
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
passed();
|
||||
}
|
||||
|
||||
新增問題並參考
封鎖使用者