[ROCm/hip commit: 841dd69c90]
This commit is contained in:
Evgeny Mankov
2017-12-05 18:19:15 +03:00
56 changed files with 7552 additions and 2224 deletions
+3 -1
View File
@@ -167,6 +167,7 @@ if(HIP_PLATFORM STREQUAL "hcc")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${HIP_HCC_BUILD_FLAGS}")
set(SOURCE_FILES_RUNTIME
src/code_object_bundle.cpp
src/hip_hcc.cpp
src/hip_context.cpp
src/hip_device.cpp
@@ -179,7 +180,8 @@ if(HIP_PLATFORM STREQUAL "hcc")
src/hip_db.cpp
src/grid_launch.cpp
src/hip_texture.cpp
src/env.cpp)
src/env.cpp
src/program_state.cpp)
set(SOURCE_FILES_DEVICE
src/device_util.cpp
+19 -1
View File
@@ -8,6 +8,24 @@ We have attempted to document known bugs and limitations - in particular the [HI
## Revision History:
===================================================================================================
Release: 1.5
Date:
- Support threadIdx, blockIdx, blockDim directly (no need for hipify conversions in kernels.) HIP
Kernel syntax is now identical to CUDA kernel syntax - no need for extra parms or conversions.
- Refactor launch syntax. HIP now extracts kernels from the executable and launches them using the
existing module interface. Kernels dispatch no longer flows through HCC. Result is faster
kernel launches and with less resource usage (no signals required).
- Remove requirement for manual "serializers" previously required when passing complex structures
into kernels.
- Remove need for manual destructors
- Provide printf in device code
- Support for globals when using module API
- hipify-clang now supports using newer versions of clang
- HIP texture support equivalent to CUDA texture driver APIs
- Updates to hipify-perl, hipify-clang and documentation
===================================================================================================
Release: 1.4
Date: 2017.10.06
@@ -23,7 +41,7 @@ Date: 2017.10.06
Release: 1.3
Date: 2017.08.16
- hipcc now auto-detects amdgcn arch. No need to specify the arch when building for same system.
- HIP texture support
- HIP texture support (run-time APIs)
- Implemented __threadfence_support
- Improvements in HIP context management logic
- Bug fixes in several APIs including hipDeviceGetPCIBusId, hipEventDestroy, hipMemcpy2DAsync
+1 -1
View File
@@ -1,7 +1,7 @@
#!/usr/bin/perl -w
$HIP_BASE_VERSION_MAJOR = "1";
$HIP_BASE_VERSION_MINOR = "4";
$HIP_BASE_VERSION_MINOR = "5";
# Need perl > 5.10 to use logic-defined or
use 5.006; use v5.10.1;
-19
View File
@@ -309,25 +309,6 @@ while (@ARGV) {
$ft{'mem'} += s/\bcudaMallocPitch\b/hipMallocPitch/g;
#--------
# Coordinate Indexing and Dimensions:
$ft{'coord_func'} += s/\bthreadIdx\.x\b/hipThreadIdx_x/g;
$ft{'coord_func'} += s/\bthreadIdx\.y\b/hipThreadIdx_y/g;
$ft{'coord_func'} += s/\bthreadIdx\.z\b/hipThreadIdx_z/g;
$ft{'coord_func'} += s/\bblockIdx\.x\b/hipBlockIdx_x/g;
$ft{'coord_func'} += s/\bblockIdx\.y\b/hipBlockIdx_y/g;
$ft{'coord_func'} += s/\bblockIdx\.z\b/hipBlockIdx_z/g;
$ft{'coord_func'} += s/\bblockDim\.x\b/hipBlockDim_x/g;
$ft{'coord_func'} += s/\bblockDim\.y\b/hipBlockDim_y/g;
$ft{'coord_func'} += s/\bblockDim\.z\b/hipBlockDim_z/g;
$ft{'coord_func'} += s/\bgridDim\.x\b/hipGridDim_x/g;
$ft{'coord_func'} += s/\bgridDim\.y\b/hipGridDim_y/g;
$ft{'coord_func'} += s/\bgridDim\.z\b/hipGridDim_z/g;
#--------
# Events
$ft{'event'} += s/\bcudaEvent_t\b/hipEvent_t/g;
-134
View File
@@ -2,7 +2,6 @@
<!-- toc -->
- [Errors related to undefined reference to `__hcLaunchKernel__***__grid_launch_parm**`](#errors-related-to-undefined-reference-to-__hclaunchkernel____grid_launch_parm)
- [Can't find kernels inside dynamic linked library](#cant-find-kernels-inside-dynamic-linked-library)
- [What is the current limitation of HIP Generic Grid Launch method?](#what-is-the-current-limitation-of-hip-generic-grid-launch-method)
- [Errors related to `no matching constructor`](#errors-related-to-no-matching-constructor)
@@ -10,139 +9,6 @@
<!-- tocstop -->
### Errors related to undefined reference to `__hcLaunchKernel__***__grid_launch_parm**`
Some common code practices may lead to hipcc generating a error with the form :
```
undefined reference to `__hcLaunchKernel__ZN15vecAddNamespace6vecAddIidEEv16grid_launch_parmPT0_S3_S3_T_
```
Or:
```
error: weak declaration cannot have internal linkage
```
Suggested workarounds:
- Avoid use of static with kernel definition:
```c++
static __global__ MyKernel
```
- Avoid defining kernels in anonymous namespace :
```c++
namespace {
__global__ MyKernel
}
```
### Can't find kernels inside dynamic linked library
HCC requires use of the "-Bdynamic" flag when creating a dynamic library which contains kernels. The dynamic flag causes the symbols to be created with a signature which allows HCC to discover and load the kernels in the dynamic library. This flag is often not set by default and must be added to the link step of the library. If not done, HCC will be unable to find the kernels defined in the library, and will emit a message such as:
```
HSADevice::CreateKernel(): Unable to create kernel"
```
To correct, add the following flag to hcc or hipcc:
```
$ hipcc -Wl,-Bsymbolic ...
```
Ensure there is no space in the "Wl,-Bsymbolic" option.
### What is the current limitation of HIP Generic Grid Launch method?
1. __global__ functions cannot be marked as static or put in an unnamed namespace i.e. they cannot be given internal linkage (this would clash with __attribute__((weak)));
2. using the macro based dispatch mechanism i.e. hipLaunchKernel* only works for functions that take no more than 20 arguments (this limit can be increased up to 126, and is temporary until we can enable C++14 mode and use variadic generic lambdas); no such limitation applies do dispatching directly through grid_launch.
### Errors related to `no matching constructor`
The symptom is the compiler would complain about errors like `no matching constructor` for classes/structs passed as arguments into a GPU kernel. Often, this is caused by a design limitation in HCC where array-typed member variables inside a class/struct cant be correctly passed into GPU kernels. To mitigate this issue, a custom serializer/deserializer pair is provided.
For example, `Foo` in the code snippets below contains an array-typed member variable `table`, which would fail the compiler if used as a kernel argument.
```
struct Foo {
float _data;
// table is an array, which makes foo
int table[3];
};
```
A workaround is to provide a custom serializer on host side which appends the contents of the array as kernel arguments, and a custome deserializaer on the device path to reconstruct the array inside the GPU kernels.
The deserializer can not be a function template, and should have scalar-typed parameters of the number equals to the length of the array-typed member variable. For example:
```
struct Foo {
float _data;
int _table[3];
#ifdef __HCC__
// user-provided CPU serializer
// Append the contents of the array member as kernel arguments
__attribute__((annotate(“serialize”)))
void __cxxamp_serialize(Kalmar::Serialize &s) const {
s.Append(sizeof(float), &_data);
for (int i = 0; i < 3; ++i)
s.Append(sizeof(int), &_table[i]);
}
// user-provided GPU deserializer
// table has 3 int elements, so deserializer must have 3 int parameters.
__attribute__((annotate(“user_deserialize”)))
Foo(float d, int x0, int x1, int x2) [[cpu]][[hc]] {
_data = d;
_table[0] = x0;
_table[1] = x1;
_table[2] = x2;
}
#endif
};
```
Rather than create serializer functions, another workaround is to pass the member fields from the structure as simple data types.
Note a class or struct can contain only one "user_deserialize" constructor.
For types which contain arrays which are based on template parameter, you can use partial template instantiation to implement one constructor per specialization.
However, an easier approach may be to create one user_deserializer which processes the maximum supported dimension.
This will take more memory in the structure and also require additional kernel arguments, but this may have little performance impact and the conversion is easier than partial template specialization. An example:
```
#define MAX_Dim 4
template<typename T, int Dim> struct MyArray {
T* dataPtr_;
//int size_[Dim]; // Original code with template-sized Dims
int size_[MAX_dim]; // Workaround code - allocate an array big enough for all dims so one serializer works.
...
#ifdef __HCC__
__attribute__((annotate("serialize")))
void __cxxamp_serialize(Kalmar::Serialize &s) const {
s.Append(sizeof(float), &_dataPtr);
for (int i=0; i<MAX_Dim; i++) {
s.Append(sizeof(size_[0]), &size_[i]);
}
}
__attribute__((annotate("user_deserialize")))
MyArray(T* data, int size0, int size1, int size2, int size3) [[cpu]][[hc]] {
data_ = data;
size_[0] = size0;
size_[1] = size1;
size_[2] = size2;
size_[3] = size3;
}
#endif
```
### HIP is more restrictive in enforcing restrictions
@@ -231,3 +231,45 @@ int main(){
return 0;
}
```
## HIP Module and Texture Driver API
HIP supports texture driver APIs however texture reference should be declared in host scope. Following code explains the use of texture reference for __HIP_PLATFORM_HCC__ platform.
```
// Code to generate code object
#include "hip/hip_runtime.h"
extern texture<float, 2, hipReadModeElementType> tex;
__global__ void tex2dKernel(hipLaunchParm lp, float* outputData,
int width,
int height)
{
int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x;
int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y;
outputData[y*width + x] = tex2D(tex, x, y);
}
```
```
// Host code:
texture<float, 2, hipReadModeElementType> tex;
void myFunc ()
{
// ...
textureReference* texref;
hipModuleGetTexRef(&texref, Module1, "tex");
hipTexRefSetAddressMode(texref, 0, hipAddressModeWrap);
hipTexRefSetAddressMode(texref, 1, hipAddressModeWrap);
hipTexRefSetFilterMode(texref, hipFilterModePoint);
hipTexRefSetFlags(texref, 0);
hipTexRefSetFormat(texref, HIP_AD_FORMAT_FLOAT, 1);
hipTexRefSetArray(texref, array, HIP_TRSA_OVERRIDE_FORMAT);
// ...
}
```
+23 -29
View File
@@ -465,34 +465,36 @@ a performance impact.
### Textures and Cache Control
>Texture support is under-development and not yet supported by HIP.
Compute programs sometimes use textures either to access dedicated texture caches or to use the texture-sampling hardware for interpolation and clamping. The former approach uses simple point samplers with linear interpolation, essentially only reading a single point. The latter approach uses the sampler hardware to interpolate and combine multiple
point samples. AMD hardware, as well as recent competing hardware,
has a unified texture/L1 cache, so it no longer has a dedicated texture cache. But the nvcc path often caches global loads in the L2 cache, and some programs may benefit from explicit control of the L1 cache contents. We recommend the __ldg instruction for this purpose.
HIP currently lacks texture support; a future revision will add this capability. Also, AMD compilers currently load all data into both the L1 and L2 caches, so __ldg is treated as a no-op.
AMD compilers currently load all data into both the L1 and L2 caches, so __ldg is treated as a no-op.
We recommend the following for functional portability:
- For programs that use textures only to benefit from improved caching, use the __ldg instruction
- Alternatively, use conditional compilation (see [Identify HIP Target Platform](#identify-hip-target-platform))
- For the `__HIP_PLATFORM_NVCC__` path, use the full texture path
- For the `__HIP_PLATFORM_HCC__` path, pass an additional pointer to the kernel and reference it using regular device memory-load instructions rather than texture loads. Some applications may already take this step, since it allows experimentation with caching behavior.
- Programs that use texture object APIs, work well on HIP
- For program that use texture reference APIs, use conditional compilation (see [Identify HIP Target Platform](#identify-hip-target-platform))
- For the `__HIP_PLATFORM_HCC__` path, pass an additional argument to the kernel and in texture fetch API inside kernel as shown below:-
```
texture<float, 1, cudaReadModeElementType> t_features;
texture<float, 2, hipReadModeElementType> tex;
void __global__ MyKernel(float *d_features /* pass pointer parameter, if not already available */...)
{
// ...
#ifdef __HIP_PLATFORM_NVCC__
float tval = tex1Dfetch(t_features,addr);
#else
float tval = d_features[addr];
__global__ void tex2DKernel(float* outputData,
#ifdef __HIP_PLATFORM_HCC__
hipTextureObject_t textureObject,
#endif
int width,
int height)
{
int x = blockIdx.x*blockDim.x + threadIdx.x;
int y = blockIdx.y*blockDim.y + threadIdx.y;
#ifdef __HIP_PLATFORM_HCC__
outputData[y*width + x] = tex2D(tex, textureObject, x, y);
#else
outputData[y*width + x] = tex2D(tex, x, y);
#endif
}
// Host code:
@@ -500,23 +502,15 @@ void myFunc ()
{
// ...
#ifdef __HIP_PLATFORM_NVCC__
cudaChannelFormatDesc chDesc0 = cudaCreateChannelDesc<float>();
t_features.filterMode = cudaFilterModePoint;
t_features.normalized = false;
t_features.channelDesc = chDesc0;
cudaBindTexture(NULL, &t_features, d_features, &chDesc0, npoints*nfeatures*sizeof(float));
#ifdef __HIP_PLATFORM_HCC__
hipLaunchKernelGGL(tex2DKernel, dim3(dimGrid), dim3(dimBlock), 0, 0, dData, tex.textureObject, width, height);
#else
hipLaunchKernelGGL(tex2DKernel, dim3(dimGrid), dim3(dimBlock), 0, 0, dData, width, height);
#endif
```
Additionally, many of the Rodinia benchmarks demonstrate how to modify hipified programs so that textures are not required - search for USE_TEXTURES define in the rodinia source directory.
For example, [here
Cuda programs that employ sampler hardware must either wait for hcc texture support or use more-sophisticated workarounds.
## More Tips
### HIPTRACE Mode
@@ -0,0 +1,158 @@
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include <hsa/hsa.h>
#include <algorithm>
#include <cstdint>
#include <istream>
#include <iterator>
#include <string>
#include <utility>
#include <vector>
namespace hip_impl
{
hsa_isa_t triple_to_hsa_isa(const std::string& triple);
struct Bundled_code {
union {
struct {
std::uint64_t offset;
std::uint64_t bundle_sz;
std::uint64_t triple_sz;
};
std::uint8_t cbuf[
sizeof(offset) + sizeof(bundle_sz) + sizeof(triple_sz)];
};
std::string triple;
std::vector<std::uint8_t> blob;
};
class Bundled_code_header {
// DATA - STATICS
static constexpr const char magic_string_[] =
"__CLANG_OFFLOAD_BUNDLE__";
static constexpr auto magic_string_sz_ = sizeof(magic_string_) - 1;
// DATA
union {
struct {
std::uint8_t bundler_magic_string_[magic_string_sz_];
std::uint64_t bundle_cnt_;
};
std::uint8_t cbuf_[
sizeof(bundler_magic_string_) + sizeof(bundle_cnt_)];
};
std::vector<Bundled_code> bundles_;
// FRIENDS - MANIPULATORS
template<typename RandomAccessIterator>
friend
inline
bool read(
RandomAccessIterator f,
RandomAccessIterator l,
Bundled_code_header& x)
{
if (f == l) return false;
std::copy_n(f, sizeof(x.cbuf_), x.cbuf_);
if (valid(x)) {
x.bundles_.resize(x.bundle_cnt_);
auto it = f + sizeof(x.cbuf_);
for (auto&& y : x.bundles_) {
std::copy_n(it, sizeof(y.cbuf), y.cbuf);
it += sizeof(y.cbuf);
y.triple.insert(y.triple.cend(), it, it + y.triple_sz);
std::copy_n(
f + y.offset, y.bundle_sz, std::back_inserter(y.blob));
it += y.triple_sz;
}
return true;
}
return false;
}
friend
inline
bool read(const std::vector<std::uint8_t>& blob, Bundled_code_header& x)
{
return read(blob.cbegin(), blob.cend(), x);
}
friend
inline
bool read(std::istream& is, Bundled_code_header& x)
{
return read(std::vector<std::uint8_t>{
std::istreambuf_iterator<char>{is},
std::istreambuf_iterator<char>{}},
x);
}
// FRIENDS - ACCESSORS
friend
inline
bool valid(const Bundled_code_header& x)
{
return std::equal(
x.bundler_magic_string_,
x.bundler_magic_string_ + magic_string_sz_,
x.magic_string_);
}
friend
inline
const std::vector<Bundled_code>& bundles(const Bundled_code_header& x)
{
return x.bundles_;
}
public:
// CREATORS
Bundled_code_header() = default;
template<typename RandomAccessIterator>
Bundled_code_header(RandomAccessIterator f, RandomAccessIterator l);
explicit
Bundled_code_header(const std::vector<std::uint8_t>& blob);
Bundled_code_header(const Bundled_code_header&) = default;
Bundled_code_header(Bundled_code_header&&) = default;
~Bundled_code_header() = default;
// MANIPULATORS
Bundled_code_header& operator=(const Bundled_code_header&) = default;
Bundled_code_header& operator=(Bundled_code_header&&) = default;
};
// CREATORS
template<typename I>
Bundled_code_header::Bundled_code_header(I f, I l) : Bundled_code_header{}
{
read(f, l, *this);
}
} // Namespace hip_impl.
@@ -1,5 +1,5 @@
/*
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -23,6 +23,7 @@ THE SOFTWARE.
#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_DRIVER_TYPES_H
#define HIP_INCLUDE_HIP_HCC_DETAIL_DRIVER_TYPES_H
typedef void* hipDeviceptr_t;
enum hipChannelFormatKind
{
hipChannelFormatKindSigned = 0,
@@ -40,6 +41,29 @@ struct hipChannelFormatDesc
enum hipChannelFormatKind f;
};
#define HIP_TRSF_NORMALIZED_COORDINATES 0x02
#define HIP_TRSF_READ_AS_INTEGER 0x01
#define HIP_TRSA_OVERRIDE_FORMAT 0x01
enum hipArray_Format
{
HIP_AD_FORMAT_UNSIGNED_INT8 = 0x01,
HIP_AD_FORMAT_UNSIGNED_INT16 = 0x02,
HIP_AD_FORMAT_UNSIGNED_INT32 = 0x03,
HIP_AD_FORMAT_SIGNED_INT8 = 0x08,
HIP_AD_FORMAT_SIGNED_INT16 = 0x09,
HIP_AD_FORMAT_SIGNED_INT32 = 0x0a,
HIP_AD_FORMAT_HALF = 0x10,
HIP_AD_FORMAT_FLOAT = 0x20
};
struct HIP_ARRAY_DESCRIPTOR {
enum hipArray_Format format;
unsigned int numChannels;
size_t width;
size_t height;
};
struct hipArray {
void* data; //FIXME: generalize this
struct hipChannelFormatDesc desc;
@@ -47,8 +71,30 @@ struct hipArray {
unsigned int width;
unsigned int height;
unsigned int depth;
struct HIP_ARRAY_DESCRIPTOR drvDesc;
bool isDrv;
};
typedef struct hip_Memcpy2D {
size_t height;
size_t widthInBytes;
hipArray* dstArray;
hipDeviceptr_t dstDevice;
void * dstHost;
hipMemoryType dstMemoryType;
size_t dstPitch;
size_t dstXInBytes;
size_t dstY;
hipArray* srcArray;
hipDeviceptr_t srcDevice;
const void * srcHost;
hipMemoryType srcMemoryType;
size_t srcPitch;
size_t srcXInBytes;
size_t srcY;
}hip_Memcpy2D;
typedef struct hipArray* hipArray_t;
typedef const struct hipArray* hipArray_const_t;
@@ -0,0 +1,159 @@
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include "code_object_bundle.hpp"
#include "concepts.hpp"
#include "helpers.hpp"
#include "program_state.hpp"
#include "hc.hpp"
#include "hip/hip_hcc.h"
#include "hip_runtime.h"
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <mutex>
#include <stdexcept>
#include <string>
#include <tuple>
#include <type_traits>
#include <unordered_map>
#include <utility>
#include <vector>
namespace hip_impl
{
template<
typename T,
typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>
inline
T round_up_to_next_multiple_nonnegative(T x, T y)
{
T tmp = x + y - 1;
return tmp - tmp % y;
}
inline
std::vector<std::uint8_t> make_kernarg()
{
return {};
}
inline
std::vector<std::uint8_t> make_kernarg(std::vector<std::uint8_t> kernarg)
{
return kernarg;
}
template<typename T>
inline
std::vector<std::uint8_t> make_kernarg(std::vector<uint8_t> kernarg, T x)
{
kernarg.resize(
round_up_to_next_multiple_nonnegative(kernarg.size(), alignof(T)) +
sizeof(T));
new (kernarg.data() + kernarg.size() - sizeof(T)) T{std::move(x)};
return kernarg;
}
template<typename T, typename... Ts>
inline
std::vector<std::uint8_t> make_kernarg(
std::vector<std::uint8_t> kernarg, T x, Ts... xs)
{
return make_kernarg(
make_kernarg(std::move(kernarg), std::move(x)), std::move(xs)...);
}
template<typename... Ts>
inline
std::vector<std::uint8_t> make_kernarg(Ts... xs)
{
std::vector<std::uint8_t> kernarg;
kernarg.reserve(sizeof(std::tuple<Ts...>));
return make_kernarg(std::move(kernarg), std::move(xs)...);
}
void hipLaunchKernelGGLImpl(
std::uintptr_t function_address,
const dim3& numBlocks,
const dim3& dimBlocks,
std::uint32_t sharedMemBytes,
hipStream_t stream,
void** kernarg);
} // Namespace hip_impl.
template<typename... Args, typename F = void (*)(Args...)>
inline
void hipLaunchKernelGGL(
F kernel,
const dim3& numBlocks,
const dim3& dimBlocks,
std::uint32_t sharedMemBytes,
hipStream_t stream,
Args... args)
{
auto kernarg = hip_impl::make_kernarg(std::move(args)...);
std::size_t kernarg_size = kernarg.size();
void* config[] = {
HIP_LAUNCH_PARAM_BUFFER_POINTER, kernarg.data(),
HIP_LAUNCH_PARAM_BUFFER_SIZE, &kernarg_size,
HIP_LAUNCH_PARAM_END
};
hip_impl::hipLaunchKernelGGLImpl(
reinterpret_cast<std::uintptr_t>(kernel),
numBlocks,
dimBlocks,
sharedMemBytes,
stream,
&config[0]);
}
template<typename... Args, typename F = void (*)(hipLaunchParm, Args...)>
inline
void hipLaunchKernel(
F kernel,
const dim3& numBlocks,
const dim3& dimBlocks,
std::uint32_t groupMemBytes,
hipStream_t stream,
Args... args)
{
hipLaunchKernelGGL(
kernel,
numBlocks,
dimBlocks,
groupMemBytes,
stream,
hipLaunchParm{},
std::move(args)...);
}
@@ -19,989 +19,12 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#if GENERIC_GRID_LAUNCH == 1
#include "concepts.hpp"
#include "helpers.hpp"
#include "hc.hpp"
#include "hip/hip_hcc.h"
#include "hip_runtime.h"
#include <functional>
#include <iostream>
#include <stdexcept>
#include <type_traits>
#include <utility>
namespace hip_impl
{
namespace
{
struct New_grid_launch_tag {};
struct Old_grid_launch_tag {};
template<typename C, typename D>
class RAII_guard {
D dtor_;
public:
RAII_guard() = default;
RAII_guard(const C& ctor, D dtor) : dtor_{std::move(dtor)}
{
ctor();
}
RAII_guard(const RAII_guard&) = default;
RAII_guard(RAII_guard&&) = default;
RAII_guard& operator=(const RAII_guard&) = default;
RAII_guard& operator=(RAII_guard&&) = default;
~RAII_guard() { dtor_(); }
};
template<typename C, typename D>
RAII_guard<C, D> make_RAII_guard(const C& ctor, D dtor)
{
return RAII_guard<C, D>{ctor, std::move(dtor)};
}
template<FunctionalProcedure F, typename... Ts>
using is_new_grid_launch_t = typename std::conditional<
is_callable<F(Ts...)>{},
New_grid_launch_tag,
Old_grid_launch_tag>::type;
}
// TODO: - dispatch rank should be derived from the domain dimensions passed
// in, and not always assumed to be 3;
template<FunctionalProcedure K, typename... Ts>
requires(Domain<K> == {Ts...})
inline
void grid_launch_hip_impl_(
New_grid_launch_tag,
dim3 num_blocks,
dim3 dim_blocks,
int group_mem_bytes,
const hc::accelerator_view& acc_v,
K k)
{
const auto d = hc::extent<3>{
num_blocks.z * dim_blocks.z,
num_blocks.y * dim_blocks.y,
num_blocks.x * dim_blocks.x}.tile_with_dynamic(
dim_blocks.z,
dim_blocks.y,
dim_blocks.x,
group_mem_bytes);
try {
hc::parallel_for_each(acc_v, d, k);
}
catch (std::exception& ex) {
std::cerr << "Failed in " << __func__ << ", with exception: "
<< ex.what() << std::endl;
throw;
}
}
// TODO: these are workarounds, they should be removed.
hc::accelerator_view lock_stream_hip_(hipStream_t&, void*&);
void print_prelaunch_trace_(const char*, dim3, dim3, int, hipStream_t);
void unlock_stream_hip_(
hipStream_t, void*, const char*, hc::accelerator_view*);
template<FunctionalProcedure K, typename... Ts>
requires(Domain<K> == {Ts...})
inline
void grid_launch_hip_impl_(
New_grid_launch_tag,
dim3 num_blocks,
dim3 dim_blocks,
int group_mem_bytes,
hipStream_t stream,
const char* kernel_name,
K k)
{
void* lck_stream = nullptr;
auto acc_v = lock_stream_hip_(stream, lck_stream);
auto stream_guard = make_RAII_guard(
std::bind(
print_prelaunch_trace_,
kernel_name,
num_blocks,
dim_blocks,
group_mem_bytes,
stream),
std::bind(
unlock_stream_hip_, stream, lck_stream, kernel_name, &acc_v));
try {
grid_launch_hip_impl_(
New_grid_launch_tag{},
std::move(num_blocks),
std::move(dim_blocks),
group_mem_bytes,
acc_v,
std::move(k));
}
catch (std::exception& ex) {
std::cerr << "Failed in " << __func__ << ", with exception: "
<< ex.what() << std::endl;
throw;
}
}
template<FunctionalProcedure K, typename... Ts>
requires(Domain<K> == {hipLaunchParm, Ts...})
inline
void grid_launch_hip_impl_(
Old_grid_launch_tag,
dim3 num_blocks,
dim3 dim_blocks,
int group_mem_bytes,
hipStream_t stream,
K k)
{
grid_launch_hip_impl_(
New_grid_launch_tag{},
std::move(num_blocks),
std::move(dim_blocks),
group_mem_bytes,
std::move(stream),
std::move(k));
}
template<FunctionalProcedure K, typename... Ts>
requires(Domain<K> == {hipLaunchParm, Ts...})
inline
void grid_launch_hip_impl_(
Old_grid_launch_tag,
dim3 num_blocks,
dim3 dim_blocks,
int group_mem_bytes,
hipStream_t stream,
const char* kernel_name,
K k)
{
grid_launch_hip_impl_(
New_grid_launch_tag{},
std::move(num_blocks),
std::move(dim_blocks),
group_mem_bytes,
std::move(stream),
kernel_name,
std::move(k));
}
template<FunctionalProcedure K, typename... Ts>
requires(Domain<K> == {Ts...})
inline
std::enable_if_t<!std::is_function<K>::value> grid_launch_hip_(
dim3 num_blocks,
dim3 dim_blocks,
int group_mem_bytes,
hipStream_t stream,
const char* kernel_name,
K k)
{
grid_launch_hip_impl_(
is_new_grid_launch_t<K, Ts...>{},
std::move(num_blocks),
std::move(dim_blocks),
group_mem_bytes,
std::move(stream),
kernel_name,
std::move(k));
}
template<FunctionalProcedure K, typename... Ts>
requires(Domain<K> == {Ts...})
inline
std::enable_if_t<!std::is_function<K>::value> grid_launch_hip_(
dim3 num_blocks,
dim3 dim_blocks,
int group_mem_bytes,
hipStream_t stream,
K k)
{
grid_launch_hip_impl_(
is_new_grid_launch_t<K, Ts...>{},
std::move(num_blocks),
std::move(dim_blocks),
group_mem_bytes,
std::move(stream),
std::move(k));
}
// TODO: these are temporary and purposefully noisy and disruptive.
#define make_kernel_name_hip(k, n)\
HIP_kernel_functor_name_begin ## _ ## k ## _ ## \
HIP_kernel_functor_name_end ## _ ## n
#define make_kernel_functor_hip_30(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\
p24, p25, p26, p27)\
struct make_kernel_name_hip(function_name, 28) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
std::decay_t<decltype(p18)> _p18_;\
std::decay_t<decltype(p19)> _p19_;\
std::decay_t<decltype(p20)> _p20_;\
std::decay_t<decltype(p21)> _p21_;\
std::decay_t<decltype(p22)> _p22_;\
std::decay_t<decltype(p23)> _p23_;\
std::decay_t<decltype(p24)> _p24_;\
std::decay_t<decltype(p25)> _p25_;\
std::decay_t<decltype(p26)> _p26_;\
std::decay_t<decltype(p27)> _p27_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
_p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\
_p26_, _p27_);\
}\
}
#define make_kernel_functor_hip_29(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\
p24, p25, p26)\
struct make_kernel_name_hip(function_name, 27) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
std::decay_t<decltype(p18)> _p18_;\
std::decay_t<decltype(p19)> _p19_;\
std::decay_t<decltype(p20)> _p20_;\
std::decay_t<decltype(p21)> _p21_;\
std::decay_t<decltype(p22)> _p22_;\
std::decay_t<decltype(p23)> _p23_;\
std::decay_t<decltype(p24)> _p24_;\
std::decay_t<decltype(p25)> _p25_;\
std::decay_t<decltype(p26)> _p26_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
_p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\
_p26_);\
}\
}
#define make_kernel_functor_hip_28(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\
p24, p25)\
struct make_kernel_name_hip(function_name, 26) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
std::decay_t<decltype(p18)> _p18_;\
std::decay_t<decltype(p19)> _p19_;\
std::decay_t<decltype(p20)> _p20_;\
std::decay_t<decltype(p21)> _p21_;\
std::decay_t<decltype(p22)> _p22_;\
std::decay_t<decltype(p23)> _p23_;\
std::decay_t<decltype(p24)> _p24_;\
std::decay_t<decltype(p25)> _p25_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
_p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_);\
}\
}
#define make_kernel_functor_hip_27(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\
p24)\
struct make_kernel_name_hip(function_name, 25) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
std::decay_t<decltype(p18)> _p18_;\
std::decay_t<decltype(p19)> _p19_;\
std::decay_t<decltype(p20)> _p20_;\
std::decay_t<decltype(p21)> _p21_;\
std::decay_t<decltype(p22)> _p22_;\
std::decay_t<decltype(p23)> _p23_;\
std::decay_t<decltype(p24)> _p24_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
_p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_);\
}\
}
#define make_kernel_functor_hip_26(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23)\
struct make_kernel_name_hip(function_name, 24) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
std::decay_t<decltype(p18)> _p18_;\
std::decay_t<decltype(p19)> _p19_;\
std::decay_t<decltype(p20)> _p20_;\
std::decay_t<decltype(p21)> _p21_;\
std::decay_t<decltype(p22)> _p22_;\
std::decay_t<decltype(p23)> _p23_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
_p18_, _p19_, _p20_, _p21_, _p22_, _p23_);\
}\
}
#define make_kernel_functor_hip_25(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22)\
struct make_kernel_name_hip(function_name, 23) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
std::decay_t<decltype(p18)> _p18_;\
std::decay_t<decltype(p19)> _p19_;\
std::decay_t<decltype(p20)> _p20_;\
std::decay_t<decltype(p21)> _p21_;\
std::decay_t<decltype(p22)> _p22_;\
__attribute__((used, flatten))\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
_p18_, _p19_, _p20_, _p21_, _p22_);\
}\
}
#define make_kernel_functor_hip_24(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21)\
struct make_kernel_name_hip(function_name, 22) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
std::decay_t<decltype(p18)> _p18_;\
std::decay_t<decltype(p19)> _p19_;\
std::decay_t<decltype(p20)> _p20_;\
std::decay_t<decltype(p21)> _p21_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
_p18_, _p19_, _p20_, _p21_);\
}\
}
#define make_kernel_functor_hip_23(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20)\
struct make_kernel_name_hip(function_name, 21) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
std::decay_t<decltype(p18)> _p18_;\
std::decay_t<decltype(p19)> _p19_;\
std::decay_t<decltype(p20)> _p20_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
_p18_, _p19_, _p20_);\
}\
}
#define make_kernel_functor_hip_22(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17, p18, p19)\
struct make_kernel_name_hip(function_name, 20) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
std::decay_t<decltype(p18)> _p18_;\
std::decay_t<decltype(p19)> _p19_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
_p18_, _p19_);\
}\
}
#define make_kernel_functor_hip_21(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17, p18)\
struct make_kernel_name_hip(function_name, 19) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
std::decay_t<decltype(p18)> _p18_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
_p18_);\
}\
}
#define make_kernel_functor_hip_20(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16, p17)\
struct make_kernel_name_hip(function_name, 18) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
std::decay_t<decltype(p17)> _p17_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_);\
}\
}
#define make_kernel_functor_hip_19(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15, p16)\
struct make_kernel_name_hip(function_name, 17) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
std::decay_t<decltype(p16)> _p16_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_);\
}\
}
#define make_kernel_functor_hip_18(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14, p15)\
struct make_kernel_name_hip(function_name, 16) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
std::decay_t<decltype(p15)> _p15_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_, _p15_);\
}\
}
#define make_kernel_functor_hip_17(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13, p14)\
struct make_kernel_name_hip(function_name, 15) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
std::decay_t<decltype(p14)> _p14_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_, _p14_);\
}\
}
#define make_kernel_functor_hip_16(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12, p13)\
struct make_kernel_name_hip(function_name, 14) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
std::decay_t<decltype(p13)> _p13_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_, _p13_);\
}\
}
#define make_kernel_functor_hip_15(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11, p12)\
struct make_kernel_name_hip(function_name, 13) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
std::decay_t<decltype(p12)> _p12_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_, _p12_);\
}\
}
#define make_kernel_functor_hip_14(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
p10, p11)\
struct make_kernel_name_hip(function_name, 12) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
std::decay_t<decltype(p11)> _p11_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_, _p11_);\
}\
}
#define make_kernel_functor_hip_13(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10)\
struct make_kernel_name_hip(function_name, 11) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
std::decay_t<decltype(p10)> _p10_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
_p10_);\
}\
}
#define make_kernel_functor_hip_12(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9)\
struct make_kernel_name_hip(function_name, 10) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
std::decay_t<decltype(p9)> _p9_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\
_p9_);\
}\
}
#define make_kernel_functor_hip_11(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8)\
struct make_kernel_name_hip(function_name, 9) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
std::decay_t<decltype(p8)> _p8_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(\
_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_);\
}\
}
#define make_kernel_functor_hip_10(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7)\
struct make_kernel_name_hip(function_name, 8) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
std::decay_t<decltype(p7)> _p7_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_);\
}\
}
#define make_kernel_functor_hip_9(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6)\
struct make_kernel_name_hip(function_name, 7) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
std::decay_t<decltype(p6)> _p6_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_);\
}\
}
#define make_kernel_functor_hip_8(\
function_name, kernel_name, p0, p1, p2, p3, p4, p5)\
struct make_kernel_name_hip(function_name, 6) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
std::decay_t<decltype(p5)> _p5_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_);\
}\
}
#define make_kernel_functor_hip_7(\
function_name, kernel_name, p0, p1, p2, p3, p4)\
struct make_kernel_name_hip(function_name, 5) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
std::decay_t<decltype(p4)> _p4_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_);\
}\
}
#define make_kernel_functor_hip_6(function_name, kernel_name, p0, p1, p2, p3)\
struct make_kernel_name_hip(function_name, 4) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
std::decay_t<decltype(p3)> _p3_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(_p0_, _p1_, _p2_, _p3_);\
}\
}
#define make_kernel_functor_hip_5(function_name, kernel_name, p0, p1, p2)\
struct make_kernel_name_hip(function_name, 3) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
std::decay_t<decltype(p2)> _p2_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(_p0_, _p1_, _p2_);\
}\
}
#define make_kernel_functor_hip_4(function_name, kernel_name, p0, p1)\
struct make_kernel_name_hip(function_name, 2) {\
std::decay_t<decltype(p0)> _p0_;\
std::decay_t<decltype(p1)> _p1_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(_p0_, _p1_);\
}\
}
#define fofo(f, n) kernel_prefix_hip ## f ## kernel_suffix_hip ## n
#define make_kernel_functor_hip_3(function_name, kernel_name, p0)\
struct make_kernel_name_hip(function_name, 1) {\
std::decay_t<decltype(p0)> _p0_;\
void operator()(const hc::tiled_index<3>&) const [[hc]]\
{\
kernel_name(_p0_);\
}\
}
#define make_kernel_functor_hip_2(function_name, kernel_name)\
struct make_kernel_name_hip(function_name, 0) {\
void operator()(const hc::tiled_index<3>&) [[hc]]\
{\
return kernel_name(hipLaunchParm{});\
}\
}
#define make_kernel_functor_hip_1(...)
#define make_kernel_functor_hip_0(...)
#define make_kernel_functor_hip_(...)\
overload_macro_hip_(make_kernel_functor_hip_, __VA_ARGS__)
#define hipLaunchNamedKernelGGL(\
function_name,\
kernel_name,\
num_blocks,\
dim_blocks,\
group_mem_bytes,\
stream,\
...)\
do {\
make_kernel_functor_hip_(function_name, kernel_name, __VA_ARGS__)\
hip_kernel_functor_impl_{__VA_ARGS__};\
hip_impl::grid_launch_hip_(\
num_blocks,\
dim_blocks,\
group_mem_bytes,\
stream,\
#kernel_name,\
hip_kernel_functor_impl_);\
} while(0)
#define hipLaunchKernelGGL(\
kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\
do {\
hipLaunchNamedKernelGGL(\
unnamed,\
kernel_name,\
num_blocks,\
dim_blocks,\
group_mem_bytes,\
stream,\
##__VA_ARGS__);\
} while (0)
#define hipLaunchKernel(\
kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\
do {\
hipLaunchKernelGGL(\
kernel_name,\
num_blocks,\
dim_blocks,\
group_mem_bytes,\
stream,\
hipLaunchParm{},\
##__VA_ARGS__);\
} while(0)
}
#endif //GENERIC_GRID_LAUNCH
#if __hcc_workweek__ >= 17481
#include "functional_grid_launch.hpp"
#else
#include "macro_based_grid_launch.hpp"
#endif
#endif //GENERIC_GRID_LAUNCH
@@ -84,8 +84,6 @@ typedef struct ihipModule_t *hipModule_t;
typedef struct ihipModuleSymbol_t *hipFunction_t;
typedef void* hipDeviceptr_t;
typedef struct ihipEvent_t *hipEvent_t;
enum hipLimit_t
@@ -621,7 +619,7 @@ hipError_t hipStreamQuery(hipStream_t stream);
*
* This command is host-synchronous : the host will block until the specified stream is empty.
*
* This command follows standard null-stream semantics. Specifically, specifying the null stream will cause the
* This command follows standard null-stream semantics. Specifically, specifying the null stream will cause the
* command to wait for other streams on the same device to complete all pending operations.
*
* This command honors the hipDeviceLaunchBlocking flag, which controls whether the wait is active or blocking.
@@ -644,9 +642,9 @@ hipError_t hipStreamSynchronize(hipStream_t stream);
* This function inserts a wait operation into the specified stream.
* All future work submitted to @p stream will wait until @p event reports completion before beginning execution.
*
* This function only waits for commands in the current stream to complete. Notably,, this function does
* not impliciy wait for commands in the default stream to complete, even if the specified stream is
* created with hipStreamNonBlocking = 0.
* This function only waits for commands in the current stream to complete. Notably,, this function does
* not impliciy wait for commands in the default stream to complete, even if the specified stream is
* created with hipStreamNonBlocking = 0.
*
* @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamSynchronize, hipStreamDestroy
*/
@@ -756,7 +754,7 @@ hipError_t hipEventCreate(hipEvent_t* event);
* If hipEventRecord() has been previously called on this event, then this call will overwrite any existing state in event.
*
* If this function is called on a an event that is currently being recorded, results are undefined - either
* outstanding recording may save state into the event, and the order is not guaranteed.
* outstanding recording may save state into the event, and the order is not guaranteed.
*
* @see hipEventCreate, hipEventCreateWithFlags, hipEventQuery, hipEventSynchronize, hipEventDestroy, hipEventElapsedTime
*
@@ -1318,6 +1316,7 @@ hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc,
hipError_t hipMallocArray(hipArray** array, const struct hipChannelFormatDesc* desc,
size_t width, size_t height, unsigned int flags);
#endif
hipError_t hipArrayCreate ( hipArray** pHandle, const HIP_ARRAY_DESCRIPTOR* pAllocateArray );
/**
* @brief Frees an array on the device.
*
@@ -1359,6 +1358,7 @@ hipError_t hipMalloc3DArray(hipArray_t *array,
* @see hipMemcpy, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, hipMemcpyToSymbol, hipMemcpyAsync
*/
hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind);
hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy);
/**
* @brief Copies data between host and device.
@@ -1968,6 +1968,7 @@ hipError_t hipModuleGetFunction(hipFunction_t *function, hipModule_t module, con
*/
hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, hipModule_t hmod, const char *name);
hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const char* name);
/**
* @brief builds module from code object which resides in host memory. Image is pointer to that location.
*
@@ -2172,12 +2173,9 @@ hipError_t ihipBindTextureImpl(int dim,
enum hipTextureReadMode readMode,
size_t *offset,
const void *devPtr,
const struct hipChannelFormatDesc& desc,
const struct hipChannelFormatDesc* desc,
size_t size,
enum hipTextureAddressMode addressMode,
enum hipTextureFilterMode filterMode,
int normalizedCoords,
hipTextureObject_t& textureObject);
textureReference* tex);
/*
* @brief hipBindTexture Binds size bytes of the memory area pointed to by @p devPtr to the texture reference tex.
@@ -2199,9 +2197,7 @@ hipError_t hipBindTexture(size_t *offset,
const struct hipChannelFormatDesc& desc,
size_t size = UINT_MAX)
{
return ihipBindTextureImpl(dim, readMode, offset, devPtr, desc, size,
tex.addressMode[0], tex.filterMode, tex.normalized,
tex.textureObject);
return ihipBindTextureImpl(dim, readMode, offset, devPtr, &desc, size, &tex);
}
/*
@@ -2222,9 +2218,7 @@ hipError_t hipBindTexture(size_t *offset,
const void *devPtr,
size_t size = UINT_MAX)
{
return ihipBindTextureImpl(dim, readMode, offset, devPtr, tex.channelDesc, size,
tex.addressMode[0], tex.filterMode, tex.normalized,
tex.textureObject);
return ihipBindTextureImpl(dim, readMode, offset, devPtr, &(tex.channelDesc), size, &tex);
}
// C API
@@ -2240,13 +2234,10 @@ hipError_t ihipBindTexture2DImpl(int dim,
enum hipTextureReadMode readMode,
size_t *offset,
const void *devPtr,
const struct hipChannelFormatDesc& desc,
const struct hipChannelFormatDesc* desc,
size_t width,
size_t height,
enum hipTextureAddressMode addressMode,
enum hipTextureFilterMode filterMode,
int normalizedCoords,
hipTextureObject_t& textureObject);
textureReference* tex);
template <class T, int dim, enum hipTextureReadMode readMode>
hipError_t hipBindTexture2D(size_t *offset,
@@ -2256,9 +2247,7 @@ hipError_t hipBindTexture2D(size_t *offset,
size_t height,
size_t pitch)
{
return ihipBindTexture2DImpl(dim, readMode, offset, devPtr, tex.channelDesc, width, height,
tex.addressMode[0], tex.filterMode, tex.normalized,
tex.textureObject);
return ihipBindTexture2DImpl(dim, readMode, offset, devPtr, &(tex.channelDesc), width, height, &tex);
}
template <class T, int dim, enum hipTextureReadMode readMode>
@@ -2270,9 +2259,7 @@ hipError_t hipBindTexture2D(size_t *offset,
size_t height,
size_t pitch)
{
return ihipBindTexture2DImpl(dim, readMode, offset, devPtr, desc, width, height,
tex.addressMode[0], tex.filterMode, tex.normalized,
tex.textureObject);
return ihipBindTexture2DImpl(dim, readMode, offset, devPtr, &desc, width, height, &tex);
}
//C API
@@ -2284,18 +2271,13 @@ hipError_t ihipBindTextureToArrayImpl(int dim,
enum hipTextureReadMode readMode,
hipArray_const_t array,
const struct hipChannelFormatDesc& desc,
enum hipTextureAddressMode addressMode,
enum hipTextureFilterMode filterMode,
int normalizedCoords,
hipTextureObject_t& textureObject);
textureReference* tex);
template <class T, int dim, enum hipTextureReadMode readMode>
hipError_t hipBindTextureToArray(struct texture<T, dim, readMode>& tex,
hipArray_const_t array)
{
return ihipBindTextureToArrayImpl(dim, readMode, array, tex.channelDesc,
tex.addressMode[0], tex.filterMode, tex.normalized,
tex.textureObject);
return ihipBindTextureToArrayImpl(dim, readMode, array, tex.channelDesc, &tex);
}
template <class T, int dim, enum hipTextureReadMode readMode>
@@ -2303,9 +2285,7 @@ hipError_t hipBindTextureToArray(struct texture<T, dim, readMode>& tex,
hipArray_const_t array,
const struct hipChannelFormatDesc& desc)
{
return ihipBindTextureToArrayImpl(dim, readMode, array, desc,
tex.addressMode[0], tex.filterMode, tex.normalized,
tex.textureObject);
return ihipBindTextureToArrayImpl(dim, readMode, array, desc, &tex);
}
//C API
@@ -2359,6 +2339,19 @@ hipError_t hipDestroyTextureObject(hipTextureObject_t textureObject);
hipError_t hipGetTextureObjectResourceDesc(hipResourceDesc* pResDesc, hipTextureObject_t textureObject);
hipError_t hipGetTextureObjectResourceViewDesc(hipResourceViewDesc* pResViewDesc, hipTextureObject_t textureObject);
hipError_t hipGetTextureObjectTextureDesc(hipTextureDesc* pTexDesc, hipTextureObject_t textureObject);
hipError_t hipTexRefSetArray ( textureReference* tex, hipArray_const_t array, unsigned int flags );
hipError_t hipTexRefSetAddressMode ( textureReference* tex, int dim, hipTextureAddressMode am );
hipError_t hipTexRefSetFilterMode ( textureReference* tex, hipTextureFilterMode fm );
hipError_t hipTexRefSetFlags ( textureReference* tex, unsigned int flags );
hipError_t hipTexRefSetFormat (textureReference* tex, hipArray_Format fmt, int NumPackedComponents );
hipError_t hipTexRefSetAddress( size_t* offset, textureReference* tex, hipDeviceptr_t devPtr, size_t size );
hipError_t hipTexRefSetAddress2D( textureReference* tex, const HIP_ARRAY_DESCRIPTOR* desc, hipDeviceptr_t devPtr, size_t pitch );
// doxygen end Texture
/**
@@ -44,7 +44,12 @@ THE SOFTWARE.
#if GENERIC_GRID_LAUNCH == 0
#define __global__ __attribute__((hc_grid_launch)) __attribute__((used))
#else
#define __global__ __attribute__((annotate("hip__global__"), hc, used, weak))
#if __hcc_workweek__ >= 17481
#define __global__ \
__attribute__((annotate("__HIP_global_function__"), cpu, hc, used))
#else
#define __global__ __attribute__((hc, used))
#endif
#endif //GENERIC_GRID_LAUNCH
#define __noinline__ __attribute__((noinline))
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,86 @@
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#include <cstddef>
#include <istream>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
struct ihipModuleSymbol_t;
using hipFunction_t = ihipModuleSymbol_t*;
namespace std
{
template<>
struct hash<hsa_agent_t> {
size_t operator()(hsa_agent_t x) const
{
return hash<decltype(x.handle)>{}(x.handle);
}
};
}
inline
constexpr
bool operator==(hsa_agent_t x, hsa_agent_t y)
{
return x.handle == y.handle;
}
namespace hip_impl
{
struct Kernel_descriptor {
std::uint64_t kernel_object_;
std::uint32_t group_size_;
std::uint32_t private_size_;
std::string name_;
operator hipFunction_t() const
{ // TODO: this is awful and only meant for illustration.
return reinterpret_cast<hipFunction_t>(
const_cast<Kernel_descriptor*>(this));
}
};
using RAII_global = std::unique_ptr<void, decltype(hsa_amd_memory_unlock)*>;
const std::unordered_map<
hsa_agent_t, std::vector<hsa_executable_t>>& executables();
const std::unordered_map<
std::uintptr_t,
std::vector<std::pair<hsa_agent_t, Kernel_descriptor>>>& functions();
const std::unordered_map<std::uintptr_t, std::string>& function_names();
std::unordered_map<std::string, RAII_global>& globals();
hsa_executable_t load_executable(
const std::string& file,
hsa_executable_t executable,
hsa_agent_t agent);
} // Namespace hip_impl.
File diff suppressed because it is too large Load Diff
@@ -93,6 +93,8 @@ struct textureReference
float maxMipmapLevelClamp;
hipTextureObject_t textureObject;
int numChannels;
enum hipArray_Format format;
};
/**
+4 -2
View File
@@ -116,8 +116,10 @@ typedef struct hipDeviceProp_t {
* Memory type (for pointer attributes)
*/
enum hipMemoryType {
hipMemoryTypeHost, ///< Memory is physically located on host
hipMemoryTypeDevice ///< Memory is physically located on device. (see deviceId for specific device)
hipMemoryTypeHost, ///< Memory is physically located on host
hipMemoryTypeDevice, ///< Memory is physically located on device. (see deviceId for specific device)
hipMemoryTypeArray, ///< Array memory, physically located on device. (see deviceId for specific device)
hipMemoryTypeUnified ///< Not used currently
};
+16 -12
View File
@@ -1,20 +1,24 @@
HIP_PATH?= $(wildcard /opt/rocm/hip)
ifeq (,$(HIP_PATH))
HIP_PATH=../../..
endif
HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform)
HIPCC=$(HIP_PATH)/bin/hipcc
all: square.hip.out
square.cuda.out : square.cu
nvcc square.cu -o $@
#hipify square.cu > square.cpp
# Then review & finish port in square.cpp
#
square.hip.out: square.hipref.cpp
$(HIPCC) $(CXXFLAGS) square.hipref.cpp -o $@
ifeq (${HIP_PLATFORM}, nvcc)
SOURCES=square.cu
else
SOURCES=square.cpp
endif
all: square.out
# Step
square.cpp: square.cu
$(HIP_PATH)/bin/hipify-perl square.cu > square.cpp
square.out: $(SOURCES)
$(HIPCC) $(CXXFLAGS) $(SOURCES) -o $@
clean:
rm -f *.o *.out
rm -f *.o *.out square.cpp
@@ -1,16 +1,13 @@
# Square.md
Simple test which shows how to use hipify to port CUDA code to HIP.
See related [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that explains the example.
See related [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that explains the example.
Now it is even simpler and requires no manual modification to the hipified source code - just hipify and compile:
1. Add hip/bin path to the PATH :
<code>export PATH=$PATH:[MYHIP]/bin</code>
2. Do <code>$ hipify square.cu > square.cpp </code>
3. Manually edit square.cpp to add hipLaunchParms lp to kernel parms:
<code>vector_square(hipLaunchParm lp, T *C_d, const T *A_d, size_t N)</code>
(see square.hipref.cpp for the correct output after running hipify and the above manual step)
4. make
2. <code>$ make </code>
Make runs these steps. This can be performed on either CUDA or AMD platform:
<code>hipify-perl square.cu > square.cpp </code> # convert cuda code to hip code
<code>hipcc square.cpp</code> # compile into executable
@@ -38,7 +38,7 @@ THE SOFTWARE.
*/
template <typename T>
__global__ void
vector_square(T *C_d, const T *A_d, size_t N)
vector_square(T *C_d, T *A_d, size_t N)
{
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
@@ -0,0 +1,17 @@
HIP_PATH?= $(wildcard /opt/rocm/hip)
ifeq (,$(HIP_PATH))
HIP_PATH=../../..
endif
HIPCC=$(HIP_PATH)/bin/hipcc
HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --compiler)
all: tex2dKernel.code texture2dDrv.out
texture2dDrv.out: texture2dDrv.cpp
$(HIPCC) $(HIPCC_FLAGS) $< -o $@
tex2dKernel.code: tex2dKernel.cpp
$(HIPCC) --genco $(GENCO_FLAGS) $^ -o $@
clean:
rm -f *.code *.out
@@ -0,0 +1,33 @@
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "hip/hip_runtime.h"
extern texture<float, 2, hipReadModeElementType> tex;
__global__ void tex2dKernel(hipLaunchParm lp, float* outputData,
int width,
int height)
{
int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x;
int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y;
outputData[y*width + x] = tex2D(tex, x, y);
}
@@ -0,0 +1,156 @@
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "hip/hip_runtime.h"
#include "hip/hip_runtime_api.h"
#include <iostream>
#include <fstream>
#include <vector>
#include <hip/hip_hcc.h>
#define fileName "tex2dKernel.code"
texture<float, 2, hipReadModeElementType> tex;
bool testResult = false;
#define HIP_CHECK(cmd) \
{\
hipError_t status = cmd;\
if(status != hipSuccess) {std::cout<<"error: #"<<status<<" ("<< hipGetErrorString(status) << ") at line:"<<__LINE__<<": "<<#cmd<<std::endl;abort();}\
}
bool runTest(int argc, char **argv)
{
unsigned int width = 256;
unsigned int height = 256;
unsigned int size = width * height * sizeof(float);
float* hData = (float*) malloc(size);
memset(hData, 0, size);
for (int i = 0; i < height; i++) {
for (int j = 0; j < width; j++) {
hData[i*width+j] = i*width+j;
}
}
hipModule_t Module;
HIP_CHECK(hipModuleLoad(&Module, fileName));
hipArray* array;
HIP_ARRAY_DESCRIPTOR desc;
desc.format = HIP_AD_FORMAT_FLOAT;
desc.numChannels = 1;
desc.width = width;
desc.height = height;
hipArrayCreate(&array, &desc);
hip_Memcpy2D copyParam;
memset(&copyParam, 0, sizeof(copyParam));
copyParam.dstMemoryType = hipMemoryTypeArray;
copyParam.dstArray = array;
copyParam.srcMemoryType = hipMemoryTypeHost;
copyParam.srcHost = hData;
copyParam.srcPitch = width * sizeof(float);
copyParam.widthInBytes = copyParam.srcPitch;
copyParam.height = height;
hipMemcpyParam2D(&copyParam);
textureReference* texref;
hipModuleGetTexRef(&texref, Module, "tex");
hipTexRefSetAddressMode(texref, 0, hipAddressModeWrap);
hipTexRefSetAddressMode(texref, 1, hipAddressModeWrap);
hipTexRefSetFilterMode(texref, hipFilterModePoint);
hipTexRefSetFlags(texref, 0);
hipTexRefSetFormat(texref, HIP_AD_FORMAT_FLOAT, 1);
hipTexRefSetArray(texref, array, HIP_TRSA_OVERRIDE_FORMAT);
float* dData = NULL;
hipMalloc((void **) &dData, size);
#ifdef __HIP_PLATFORM_HCC__
struct {
uint32_t _hidden[6]; // genco path + wrapper-gen pass used hidden arguments.
void * _Ad;
unsigned int _Bd;
unsigned int _Cd;
} args;
args._Ad = dData;
args._Bd = width;
args._Cd = height;
#endif
#ifdef __HIP_PLATFORM_NVCC__
struct {
uint32_t _hidden[1];
void * _Ad;
unsigned int _Bd;
unsigned int _Cd;
} args;
args._hidden[0] = 0;
args._Ad = dData;
args._Bd = width;
args._Cd = height;
#endif
size_t sizeTemp = sizeof(args);
void *config[] = {
HIP_LAUNCH_PARAM_BUFFER_POINTER, &args,
HIP_LAUNCH_PARAM_BUFFER_SIZE, &sizeTemp,
HIP_LAUNCH_PARAM_END
};
hipFunction_t Function;
HIP_CHECK(hipModuleGetFunction(&Function, Module, "tex2dKernel"));
int temp1= width/16;
int temp2 = height/16;
HIP_CHECK(hipModuleLaunchKernel(Function, 16, 16, 1, temp1, temp2, 1, 0, 0, NULL, (void**)&config));
hipDeviceSynchronize();
float *hOutputData = (float *) malloc(size);
memset(hOutputData, 0, size);
hipMemcpy(hOutputData, dData, size, hipMemcpyDeviceToHost);
for (int i = 0; i < height; i++) {
for (int j = 0; j < width; j++) {
if (hData[i*width+j] != hOutputData[i*width+j]) {
printf("Difference [ %d %d ]:%f ----%f\n",i, j, hData[i*width+j] , hOutputData[i*width+j]);
testResult = false;
break;
}
}
}
hipFree(dData);
hipFreeArray(array);
return true;
}
int main(int argc, char **argv){
hipInit(0);
testResult = runTest(argc, argv);
printf("%s ...\n", testResult ? "PASSED" : "FAILED");
exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
return 0;
}
+39
View File
@@ -0,0 +1,39 @@
#include "../include/hip/hcc_detail/code_object_bundle.hpp"
#include <hsa/hsa.h>
#include <cstdint>
#include <string>
#include <vector>
hsa_isa_t hip_impl::triple_to_hsa_isa(const std::string& triple)
{
static constexpr const char prefix[] = "hcc-amdgcn--amdhsa-gfx";
static constexpr std::size_t prefix_sz = sizeof(prefix) - 1;
hsa_isa_t r = {};
auto idx = triple.find(prefix);
if (idx != std::string::npos) {
idx += prefix_sz;
std::string tmp = "AMD:AMDGPU";
while (idx != triple.size()) {
tmp.push_back(':');
tmp.push_back(triple[idx++]);
}
hsa_isa_from_name(tmp.c_str(), &r);
}
return r;
}
// DATA - STATICS
constexpr const char hip_impl::Bundled_code_header::magic_string_[];
// CREATORS
hip_impl::Bundled_code_header::Bundled_code_header(
const std::vector<std::uint8_t>& x)
: Bundled_code_header{x.cbegin(), x.cend()}
{}
+57 -41
View File
@@ -28,27 +28,6 @@ extern "C" float __ocml_rint_f32(float);
extern "C" float __ocml_ceil_f32(float);
extern "C" float __ocml_trunc_f32(float);
struct holder64Bit{
union{
double d;
unsigned long int uli;
signed long int sli;
signed int si[2];
unsigned int ui[2];
};
} __attribute__((aligned(8)));
struct holder32Bit {
union {
float f;
unsigned int ui;
signed int si;
};
} __attribute__((aligned(4)));
__device__ struct holder64Bit hold64;
__device__ struct holder32Bit hold32;
__device__ float __double2float_rd(double x)
{
return (double)x;
@@ -69,13 +48,21 @@ __device__ float __double2float_rz(double x)
__device__ int __double2hiint(double x)
{
hold64.d = x;
return hold64.si[1];
static_assert(sizeof(double) == 2 * sizeof(int), "");
int tmp[2];
__builtin_memcpy(tmp, &x, sizeof(tmp));
return tmp[1];
}
__device__ int __double2loint(double x)
{
hold64.d = x;
return hold64.si[0];
static_assert(sizeof(double) == 2 * sizeof(int), "");
int tmp[2];
__builtin_memcpy(tmp, &x, sizeof(tmp));
return tmp[0];
}
@@ -150,8 +137,12 @@ __device__ unsigned long long int __double2ull_rz(double x)
__device__ long long int __double_as_longlong(double x)
{
hold64.d = x;
return hold64.sli;
static_assert(sizeof(long long) == sizeof(double), "");
long long tmp;
__builtin_memcpy(&tmp, &x, sizeof(tmp));
return tmp;
}
__device__ int __float2int_rd(float x)
@@ -224,19 +215,32 @@ __device__ unsigned long long int __float2ull_rz(float x)
__device__ int __float_as_int(float x)
{
hold32.f = x;
return hold32.si;
static_assert(sizeof(int) == sizeof(float), "");
int tmp;
__builtin_memcpy(&tmp, &x, sizeof(tmp));
return tmp;
}
__device__ unsigned int __float_as_uint(float x)
{
hold32.f = x;
return hold32.ui;
static_assert(sizeof(unsigned int) == sizeof(float), "");
unsigned int tmp;
__builtin_memcpy(&tmp, &x, sizeof(tmp));
return tmp;
}
__device__ double __hiloint2double(int hi, int lo)
__device__ double __hiloint2double(int32_t hi, int32_t lo)
{
hold64.si[1] = hi;
hold64.si[0] = lo;
return hold64.d;
static_assert(sizeof(double) == sizeof(uint64_t), "");
uint64_t tmp0 =
(static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
double tmp1;
__builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
__device__ double __int2double_rn(int x)
{
@@ -262,8 +266,12 @@ __device__ float __int2float_rz(int x)
__device__ float __int_as_float(int x)
{
hold32.si = x;
return hold32.f;
static_assert(sizeof(float) == sizeof(int), "");
float tmp;
__builtin_memcpy(&tmp, &x, sizeof(tmp));
return tmp;
}
__device__ double __ll2double_rd(long long int x)
@@ -302,8 +310,12 @@ __device__ float __ll2float_rz(long long int x)
__device__ double __longlong_as_double(long long int x)
{
hold64.sli = x;
return hold64.d;
static_assert(sizeof(double) == sizeof(long long), "");
double tmp;
__builtin_memcpy(&tmp, &x, sizeof(tmp));
return x;
}
__device__ double __uint2double_rn(int x)
@@ -330,8 +342,12 @@ __device__ float __uint2float_rz(unsigned int x)
__device__ float __uint_as_float(unsigned int x)
{
hold32.ui = x;
return hold32.f;
static_assert(sizeof(float) == sizeof(unsigned int), "");
float tmp;
__builtin_memcpy(&tmp, &x, sizeof(tmp));
return tmp;
}
__device__ double __ull2double_rd(unsigned long long int x)
+39 -94
View File
@@ -102,111 +102,56 @@ __device__ void* __hip_hc_free(void *ptr)
// loop unrolling
__device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size)
{
uint8_t *dstPtr, *srcPtr;
dstPtr = (uint8_t*)dst;
srcPtr = (uint8_t*)src;
for(uint32_t i=0;i<size;i++) {
dstPtr[i] = srcPtr[i];
auto dstPtr = static_cast<uint8_t*>(dst);
auto srcPtr = static_cast<const uint8_t*>(src);
while (size >= 4u) {
dstPtr[0] = srcPtr[0];
dstPtr[1] = srcPtr[1];
dstPtr[2] = srcPtr[2];
dstPtr[3] = srcPtr[3];
size -= 4u;
srcPtr += 4u;
dstPtr += 4u;
}
return nullptr;
switch (size) {
case 3: dstPtr[2] = srcPtr[2];
case 2: dstPtr[1] = srcPtr[1];
case 1: dstPtr[0] = srcPtr[0];
}
return dst;
}
__device__ void* __hip_hc_memset(void* ptr, uint8_t val, size_t size)
__device__ void* __hip_hc_memset(void* dst, uint8_t val, size_t size)
{
uint8_t *dstPtr;
dstPtr = (uint8_t*)ptr;
for(uint32_t i=0;i<size;i++) {
dstPtr[i] = val;
auto dstPtr = static_cast<uint8_t*>(dst);
while (size >= 4u) {
dstPtr[0] = val;
dstPtr[1] = val;
dstPtr[2] = val;
dstPtr[3] = val;
size -= 4u;
dstPtr += 4u;
}
return nullptr;
switch (size) {
case 3: dstPtr[2] = val;
case 2: dstPtr[1] = val;
case 1: dstPtr[0] = val;
}
return dst;
}
__device__ float __hip_erfinvf(float x){
float ret;
int sign;
if (x < -1 || x > 1){
return NAN;
}
if (x == 0){
return 0;
}
if (x > 0){
sign = 1;
} else {
sign = -1;
x = -x;
}
if (x <= 0.7) {
float x1 = x * x;
float x2 = __hip_erfinva3 * x1 + __hip_erfinva2;
float x3 = x2 * x1 + __hip_erfinva1;
float x4 = x * (x3 * x1 + __hip_erfinva0);
float r1 = __hip_erfinvb4 * x1 + __hip_erfinvb3;
float r2 = r1 * x1 + __hip_erfinvb2;
float r3 = r2 * x1 + __hip_erfinvb1;
ret = x4 / (r3 * x1 + __hip_erfinvb0);
} else {
float x1 = hc::precise_math::sqrtf(-hc::precise_math::logf((1 - x) / 2));
float x2 = __hip_erfinvc3 * x1 + __hip_erfinvc2;
float x3 = x2 * x1 + __hip_erfinvc1;
float x4 = x3 * x1 + __hip_erfinvc0;
float r1 = __hip_erfinvd2 * x1 + __hip_erfinvd1;
ret = x4 / (r1 * x1 + __hip_erfinvd0);
}
ret = ret * sign;
x = x * sign;
ret -= (hc::precise_math::erff(ret) - x) / (2 / HIP_SQRT_PI * hc::precise_math::expf(-ret * ret));
ret -= (hc::precise_math::erff(ret) - x) / (2 / HIP_SQRT_PI * hc::precise_math::expf(-ret * ret));
return ret;
return hc::precise_math::erfinvf(x);
}
__device__ double __hip_erfinv(double x){
double ret;
int sign;
if (x < -1 || x > 1){
return NAN;
}
if (x == 0){
return 0;
}
if (x > 0){
sign = 1;
} else {
sign = -1;
x = -x;
}
if (x <= 0.7) {
double x1 = x * x;
double x2 = __hip_erfinva3 * x1 + __hip_erfinva2;
double x3 = x2 * x1 + __hip_erfinva1;
double x4 = x * (x3 * x1 + __hip_erfinva0);
double r1 = __hip_erfinvb4 * x1 + __hip_erfinvb3;
double r2 = r1 * x1 + __hip_erfinvb2;
double r3 = r2 * x1 + __hip_erfinvb1;
ret = x4 / (r3 * x1 + __hip_erfinvb0);
} else {
double x1 = hc::precise_math::sqrt(-hc::precise_math::log((1 - x) / 2));
double x2 = __hip_erfinvc3 * x1 + __hip_erfinvc2;
double x3 = x2 * x1 + __hip_erfinvc1;
double x4 = x3 * x1 + __hip_erfinvc0;
double r1 = __hip_erfinvd2 * x1 + __hip_erfinvd1;
ret = x4 / (r1 * x1 + __hip_erfinvd0);
}
ret = ret * sign;
x = x * sign;
ret -= (hc::precise_math::erf(ret) - x) / (2 / HIP_SQRT_PI * hc::precise_math::exp(-ret * ret));
ret -= (hc::precise_math::erf(ret) - x) / (2 / HIP_SQRT_PI * hc::precise_math::exp(-ret * ret));
return ret;
return hc::precise_math::erfinv(x);
}
#define __hip_j0a1 57568490574.0
+137
View File
@@ -0,0 +1,137 @@
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "hip/hcc_detail/program_state.hpp"
#include "hip/hip_runtime_api.h"
// Internal header, do not percolate upwards.
#include "hip_hcc_internal.h"
#include "hc.hpp"
#include "trace_helper.h"
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <iostream>
using namespace hc;
using namespace std;
namespace hip_impl
{
namespace
{
inline
string name(uintptr_t function_address)
{
const auto it = function_names().find(function_address);
if (it == function_names().cend()) {
throw runtime_error{
"Invalid function passed to hipLaunchKernelGGL."};
}
return it->second;
}
inline
string name(hsa_agent_t agent)
{
char n[64] = {};
hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, n);
return string{n};
}
inline
hsa_agent_t target_agent(hipStream_t stream)
{
if (stream) {
return *static_cast<hsa_agent_t*>(
stream->locked_getAv()->get_hsa_agent());
}
else if (
ihipGetTlsDefaultCtx() && ihipGetTlsDefaultCtx()->getDevice()) {
return ihipGetDevice(
ihipGetTlsDefaultCtx()->getDevice()->_deviceId)->_hsaAgent;
}
else {
return *static_cast<hsa_agent_t*>(
accelerator{}.get_default_view().get_hsa_agent());
}
}
}
void hipLaunchKernelGGLImpl(
uintptr_t function_address,
const dim3& numBlocks,
const dim3& dimBlocks,
uint32_t sharedMemBytes,
hipStream_t stream,
void** kernarg)
{
const auto it0 = functions().find(function_address);
if (it0 == functions().cend()) {
throw runtime_error{
"No device code available for function: " +
name(function_address)
};
}
auto agent = target_agent(stream);
const auto it1 = find_if(
it0->second.cbegin(),
it0->second.cend(),
[=](const pair<hsa_agent_t, Kernel_descriptor>& x) {
return x.first.handle == agent.handle;
});
if (it1 == it0->second.cend()) {
throw runtime_error{
"No code available for function: " + name(function_address) +
", for agent: " + name(agent)
};
}
for (auto&& agent_kernel : it0->second) {
if (agent.handle == agent_kernel.first.handle) {
hipModuleLaunchKernel(
agent_kernel.second,
numBlocks.x,
numBlocks.y,
numBlocks.z,
dimBlocks.x,
dimBlocks.y,
dimBlocks.z,
sharedMemBytes,
stream,
nullptr,
kernarg);
}
}
}
}
+5 -75
View File
@@ -22,78 +22,8 @@ THE SOFTWARE.
#include "hip/hcc_detail/grid_launch_GGL.hpp"
// Internal header, do not percolate upwards.
#include "hip_hcc_internal.h"
#include "hc.hpp"
#include "trace_helper.h"
#include <iostream>
#include <sstream>
namespace hip_impl
{
hc::accelerator_view lock_stream_hip_(
hipStream_t& stream, void*& locked_stream)
{ // This allocated but does not take ownership of locked_stream. If it is
// not deleted elsewhere it will leak.
using L = decltype(stream->lockopen_preKernelCommand());
HIP_INIT();
stream = ihipSyncAndResolveStream(stream);
locked_stream = new L{stream->lockopen_preKernelCommand()};
return (*static_cast<L*>(locked_stream))->_av;
}
void print_prelaunch_trace_(
const char* kernel_name,
dim3 num_blocks,
dim3 dim_blocks,
int group_mem_bytes,
hipStream_t stream)
{
if ((HIP_TRACE_API & (1 << TRACE_KCMD)) ||
HIP_PROFILE_API ||
(COMPILE_HIP_DB && (HIP_TRACE_API & (1<<TRACE_ALL)))) {
std::stringstream os;
os << tls_tidInfo.tid() << "." << tls_tidInfo.apiSeqNum()
<< " hipLaunchKernel '" << kernel_name << "'"
<< " gridDim:" << num_blocks
<< " groupDim:" << dim_blocks
<< " sharedMem:+" << group_mem_bytes
<< " " << *stream;
if (HIP_PROFILE_API == 0x1) {
std::string shortAtpString("hipLaunchKernel:");
shortAtpString += kernel_name;
MARKER_BEGIN(shortAtpString.c_str(), "HIP");
} else if (HIP_PROFILE_API == 0x2) {
MARKER_BEGIN(os.str().c_str(), "HIP");
}
if (COMPILE_HIP_DB && HIP_TRACE_API) {
std::string fullStr;
recordApiTrace(&fullStr, os.str());
}
}
}
void unlock_stream_hip_(
hipStream_t stream,
void* locked_stream,
const char* kernel_name,
hc::accelerator_view* acc_v)
{ // Precondition: acc_v is the accelerator_view associated with stream
// which is guarded by locked_stream;
// locked_stream is deletable.
using L = decltype(stream->lockopen_preKernelCommand());
stream->lockclose_postKernelCommand(kernel_name, acc_v);
delete static_cast<L*>(locked_stream);
locked_stream = nullptr;
if(HIP_PROFILE_API) {
MARKER_END();
}
}
}
#if __hcc_workweek__ >= 17481
#include "functional_grid_launch.inl"
#else
#include "macro_based_grid_launch.inl"
#endif
+3 -3
View File
@@ -446,14 +446,14 @@ hipError_t hipChooseDevice( int* device, const hipDeviceProp_t* prop )
{
HIP_INIT_API(device,prop);
hipDeviceProp_t tempProp;
int deviceCount;
int inPropCount = 0;
int matchedPropCount = 0;
hipError_t e = hipSuccess;
if((device == NULL) || (prop == NULL)) {
e = hipErrorInvalidValue;
}
if(e == hipSuccess) {
int deviceCount;
int inPropCount = 0;
int matchedPropCount = 0;
ihipGetDeviceCount( &deviceCount );
*device = 0;
for (int i = 0; i < deviceCount; i++) {
+2
View File
@@ -779,6 +779,8 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop)
// Get agent name
err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, &(prop->name));
DeviceErrorCheck(err);
char archName[256];
err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_NAME, &archName);
+22 -19
View File
@@ -25,8 +25,9 @@ THE SOFTWARE.
#include <hc.hpp>
#include <hsa/hsa.h>
#include "hsa/hsa_ext_amd.h"
#include <unordered_map>
#include "hsa/hsa_ext_amd.h"
#include "hip/hip_runtime.h"
#include "hip_util.h"
#include "env.h"
@@ -248,7 +249,7 @@ static const DbName dbName [] =
#define tprintf(trace_level, ...) {\
if (HIP_DB & (1<<(trace_level))) {\
char msgStr[1000];\
snprintf(msgStr, 2000, __VA_ARGS__);\
snprintf(msgStr, sizeof(msgStr), __VA_ARGS__);\
fprintf (stderr, " %ship-%s tid:%d:%s%s", dbName[trace_level]._color, dbName[trace_level]._shortName, tls_tidInfo.tid(), msgStr, KNRM); \
}\
}
@@ -268,7 +269,7 @@ extern uint64_t recordApiTrace(std::string *fullStr, const std::string &apiStr);
#if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1)
#define API_TRACE(forceTrace, ...)\
uint64_t hipApiStartTick;\
uint64_t hipApiStartTick=0;\
{\
tls_tidInfo.incApiSeqNum();\
if (forceTrace || (HIP_PROFILE_API || (COMPILE_HIP_DB && (HIP_TRACE_API & (1<<TRACE_ALL))))) {\
@@ -338,7 +339,7 @@ uint64_t hipApiStartTick;\
class ihipException : public std::exception
{
public:
ihipException(hipError_t e) : _code(e) {};
explicit ihipException(hipError_t e) : _code(e) {};
hipError_t _code;
};
@@ -371,15 +372,16 @@ public:
};
class ihipModule_t {
public:
hsa_executable_t executable;
hsa_code_object_t object;
std::string fileName;
void *ptr;
size_t size;
std::list<hipFunction_t> funcTrack;
ihipModule_t() : executable(), object(), fileName(), ptr(nullptr), size(0) {}
struct ihipModule_t {
std::string fileName;
hsa_executable_t executable = {};
hsa_code_object_reader_t coReader = {};
~ihipModule_t()
{
if (executable.handle) hsa_executable_destroy(executable);
if (coReader.handle) hsa_code_object_reader_destroy(coReader);
}
};
@@ -667,11 +669,11 @@ template <typename MUTEX_TYPE>
class ihipEventCriticalBase_t : LockedBase<MUTEX_TYPE>
{
public:
ihipEventCriticalBase_t(const ihipEvent_t *parentEvent) :
explicit ihipEventCriticalBase_t(const ihipEvent_t *parentEvent) :
_parent(parentEvent)
{}
~ihipEventCriticalBase_t() {};
// Keep data in structure so it can be easily copied into snapshots
// (used to reduce lock contention and preserve correct lock order)
ihipEventData_t _eventData;
@@ -688,7 +690,7 @@ typedef LockedAccessor<ihipEventCritical_t> LockedAccessor_EventCrit_t;
// internal hip event structure.
class ihipEvent_t {
public:
ihipEvent_t(unsigned flags);
explicit ihipEvent_t(unsigned flags);
void attachToCompletionFuture(const hc::completion_future *cf, hipStream_t stream, ihipEventType_t eventType);
std::pair<hipEventStatus_t, uint64_t> refreshEventStatus(); // returns pair <state, timestamp>
@@ -696,7 +698,7 @@ public:
// Return a copy of the critical state. The critical data is locked during the copy.
ihipEventData_t locked_copyCrit() {
LockedAccessor_EventCrit_t crit(_criticalData);
return _criticalData._eventData;
return _criticalData._eventData;
};
ihipEventCritical_t &criticalData() { return _criticalData; };
@@ -718,8 +720,9 @@ template <typename MUTEX_TYPE>
class ihipDeviceCriticalBase_t : LockedBase<MUTEX_TYPE>
{
public:
ihipDeviceCriticalBase_t(ihipDevice_t *parentDevice) :
_parent(parentDevice)
explicit ihipDeviceCriticalBase_t(ihipDevice_t *parentDevice) :
_parent(parentDevice),
_ctxCount(0)
{
};
+241 -102
View File
@@ -44,7 +44,7 @@ hipError_t memcpyAsync (void* dst, const void* src, size_t sizeBytes, hipMemcpyK
try {
stream->locked_copyAsync(dst, src, sizeBytes, kind);
}
catch (ihipException ex) {
catch (ihipException &ex) {
e = ex._code;
}
} else {
@@ -407,8 +407,113 @@ hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w, hipChannel
extern void getChannelOrderAndType(const hipChannelFormatDesc& desc,
enum hipTextureReadMode readMode,
hsa_ext_image_channel_order_t& channelOrder,
hsa_ext_image_channel_type_t& channelType);
hsa_ext_image_channel_order_t* channelOrder,
hsa_ext_image_channel_type_t* channelType);
hipError_t hipArrayCreate ( hipArray** array, const HIP_ARRAY_DESCRIPTOR* pAllocateArray )
{
HIP_INIT_SPECIAL_API((TRACE_MEM), array, pAllocateArray);
HIP_SET_DEVICE();
hipError_t hip_status = hipSuccess;
if(pAllocateArray->width >0) {
auto ctx = ihipGetTlsDefaultCtx();
*array = (hipArray*)malloc(sizeof(hipArray));
array[0]->drvDesc = *pAllocateArray;
array[0]->width = pAllocateArray->width;
array[0]->height = pAllocateArray->height;
array[0]->isDrv = true;
void ** ptr = &array[0]->data;
if (ctx) {
const unsigned am_flags = 0;
size_t size = pAllocateArray->width;
if(pAllocateArray->height > 0) {
size = size * pAllocateArray->height;
}
hsa_ext_image_channel_type_t channelType;
size_t allocSize = 0;
switch(pAllocateArray->format) {
case HIP_AD_FORMAT_UNSIGNED_INT8:
allocSize = size * sizeof(uint8_t);
channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8;
break;
case HIP_AD_FORMAT_UNSIGNED_INT16:
allocSize = size * sizeof(uint16_t);
channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16;
break;
case HIP_AD_FORMAT_UNSIGNED_INT32:
allocSize = size * sizeof(uint32_t);
channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32;
break;
case HIP_AD_FORMAT_SIGNED_INT8:
allocSize = size * sizeof(int8_t);
channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8;
break;
case HIP_AD_FORMAT_SIGNED_INT16:
allocSize = size * sizeof(int16_t);
channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16;
break;
case HIP_AD_FORMAT_SIGNED_INT32:
allocSize = size * sizeof(int32_t);
channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32;
break;
case HIP_AD_FORMAT_HALF:
allocSize = size * sizeof(int16_t);
channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT;
break;
case HIP_AD_FORMAT_FLOAT:
allocSize = size * sizeof(float);
channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT;
break;
default:
hip_status = hipErrorUnknown;
break;
}
hc::accelerator acc = ctx->getDevice()->_acc;
hsa_agent_t* agent =static_cast<hsa_agent_t*>(acc.get_hsa_agent());
size_t allocGranularity = 0;
hsa_amd_memory_pool_t *allocRegion = static_cast<hsa_amd_memory_pool_t*>(acc.get_hsa_am_region());
hsa_amd_memory_pool_get_info(*allocRegion, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &allocGranularity);
hsa_ext_image_descriptor_t imageDescriptor;
imageDescriptor.width = pAllocateArray->width;
imageDescriptor.height = pAllocateArray->height;
imageDescriptor.depth = 0;
imageDescriptor.array_size = 0;
imageDescriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_2D;
hsa_ext_image_channel_order_t channelOrder;
if (pAllocateArray->numChannels == 4) {
channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA;
} else if (pAllocateArray->numChannels == 2) {
channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RG;
} else if (pAllocateArray->numChannels == 1) {
channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_R;
}
imageDescriptor.format.channel_order = channelOrder;
imageDescriptor.format.channel_type = channelType;
hsa_access_permission_t permission = HSA_ACCESS_PERMISSION_RW;
hsa_ext_image_data_info_t imageInfo;
hsa_status_t status = hsa_ext_image_data_get_info(*agent, &imageDescriptor, permission, &imageInfo);
size_t alignment = imageInfo.alignment <= allocGranularity ? 0 : imageInfo.alignment;
*ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, false/*shareWithAll*/, am_flags, 0, alignment);
if (size && (*ptr == NULL)) {
hip_status = hipErrorMemoryAllocation;
}
} else {
hip_status = hipErrorMemoryAllocation;
}
} else {
hip_status = hipErrorInvalidValue;
}
return ihipLogStatus(hip_status);
}
hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc,
size_t width, size_t height, unsigned int flags)
@@ -425,6 +530,7 @@ hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc,
array[0]->height = height;
array[0]->depth = 1;
array[0]->desc = *desc;
array[0]->isDrv = false;
void ** ptr = &array[0]->data;
@@ -480,7 +586,7 @@ hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc,
}
hsa_ext_image_channel_order_t channelOrder;
hsa_ext_image_channel_type_t channelType;
getChannelOrderAndType(*desc, hipReadModeElementType, channelOrder, channelType);
getChannelOrderAndType(*desc, hipReadModeElementType, &channelOrder, &channelType);
imageDescriptor.format.channel_order = channelOrder;
imageDescriptor.format.channel_type = channelType;
@@ -577,7 +683,7 @@ hipError_t hipMalloc3DArray(hipArray_t *array,
}
hsa_ext_image_channel_order_t channelOrder;
hsa_ext_image_channel_type_t channelType;
getChannelOrderAndType(*desc, hipReadModeElementType, channelOrder, channelType);
getChannelOrderAndType(*desc, hipReadModeElementType, &channelOrder, &channelType);
imageDescriptor.format.channel_order = channelOrder;
imageDescriptor.format.channel_type = channelType;
@@ -702,6 +808,26 @@ hipError_t hipHostUnregister(void *hostPtr)
return ihipLogStatus(hip_status);
}
namespace
{
inline
hipDeviceptr_t agent_address_for_symbol(const char* symbolName)
{
hipDeviceptr_t r = nullptr;
#if __hcc_workweek__ >= 17481
size_t byte_cnt = 0u;
hipModuleGetGlobal(&r, &byte_cnt, 0, symbolName);
#else
auto ctx = ihipGetTlsDefaultCtx();
auto acc = ctx->getDevice()->_acc;
r = acc.get_symbol_address(symbolName);
#endif
return r;
}
}
hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t count, size_t offset, hipMemcpyKind kind)
{
HIP_INIT_SPECIAL_API((TRACE_MCMD), symbolName, src, count, offset, kind);
@@ -715,7 +841,8 @@ hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t cou
hc::accelerator acc = ctx->getDevice()->_acc;
void *dst = acc.get_symbol_address((const char*) symbolName);
hipDeviceptr_t dst =
agent_address_for_symbol(static_cast<const char*>(symbolName));
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst);
if(dst == nullptr)
@@ -750,7 +877,8 @@ hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t count,
hc::accelerator acc = ctx->getDevice()->_acc;
void *src = acc.get_symbol_address((const char*) symbolName);
hipDeviceptr_t src =
agent_address_for_symbol(static_cast<const char*>(symbolName));
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst);
if(dst == nullptr)
@@ -787,7 +915,8 @@ hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_
hc::accelerator acc = ctx->getDevice()->_acc;
void *dst = acc.get_symbol_address((const char*) symbolName);
hipDeviceptr_t dst =
agent_address_for_symbol(static_cast<const char*>(symbolName));
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, dst);
if(dst == nullptr)
@@ -799,7 +928,7 @@ hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_
try {
stream->lockedSymbolCopyAsync(acc, dst, (void*)src, count, offset, kind);
}
catch (ihipException ex) {
catch (ihipException &ex) {
e = ex._code;
}
} else {
@@ -825,7 +954,8 @@ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, size_t co
hc::accelerator acc = ctx->getDevice()->_acc;
void *src = acc.get_symbol_address((const char*) symbolName);
hipDeviceptr_t src =
agent_address_for_symbol(static_cast<const char*>(symbolName));
tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, src);
if(src == nullptr || dst == nullptr)
@@ -838,7 +968,7 @@ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, size_t co
try {
stream->lockedSymbolCopyAsync(acc, dst, src, count, offset, kind);
}
catch (ihipException ex) {
catch (ihipException &ex) {
e = ex._code;
}
} else {
@@ -863,7 +993,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind
stream->locked_copySync(dst, src, sizeBytes, kind);
}
catch (ihipException ex) {
catch (ihipException &ex) {
e = ex._code;
}
@@ -885,7 +1015,7 @@ hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes)
stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyHostToDevice, false);
}
catch (ihipException ex) {
catch (ihipException &ex) {
e = ex._code;
}
@@ -907,7 +1037,7 @@ hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes)
stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyDeviceToHost, false);
}
catch (ihipException ex) {
catch (ihipException &ex) {
e = ex._code;
}
@@ -929,7 +1059,7 @@ hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeByte
stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyDeviceToDevice, false);
}
catch (ihipException ex) {
catch (ihipException &ex) {
e = ex._code;
}
@@ -951,7 +1081,7 @@ hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes)
stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyHostToHost, false);
}
catch (ihipException ex) {
catch (ihipException &ex) {
e = ex._code;
}
@@ -993,13 +1123,11 @@ hipError_t hipMemcpyDtoHAsync(void* dst, hipDeviceptr_t src, size_t sizeBytes, h
}
// TODO - review and optimize
hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch,
size_t width, size_t height, hipMemcpyKind kind) {
HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, dpitch, src, spitch, width, height, kind);
hipError_t ihipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch,
size_t width, size_t height, hipMemcpyKind kind)
{
if(width > dpitch || width > spitch)
return ihipLogStatus(hipErrorUnknown);
return hipErrorUnknown;
hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
@@ -1012,10 +1140,30 @@ hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch,
stream->locked_copySync((unsigned char*)dst + i*dpitch, (unsigned char*)src + i*spitch, width, kind);
}
}
catch (ihipException ex) {
catch (ihipException &ex) {
e = ex._code;
}
return e;
}
hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch,
size_t width, size_t height, hipMemcpyKind kind)
{
HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, dpitch, src, spitch, width, height, kind);
hipError_t e = hipSuccess;
e = ihipMemcpy2D(dst,dpitch, src, spitch, width, height, kind);
return ihipLogStatus(e);
}
hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy)
{
HIP_INIT_SPECIAL_API((TRACE_MCMD), pCopy);
hipError_t e = hipSuccess;
if(pCopy == nullptr) {
e = hipErrorInvalidValue;
}
e = ihipMemcpy2D(pCopy->dstArray->data, pCopy->widthInBytes, pCopy->srcHost, pCopy->srcPitch, pCopy->widthInBytes, pCopy->height, hipMemcpyDefault);
return ihipLogStatus(e);
}
@@ -1030,7 +1178,7 @@ hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t sp
e = hip_internal::memcpyAsync((unsigned char*)dst + i*dpitch, (unsigned char*)src + i*spitch, width, kind,stream);
}
}
catch (ihipException ex) {
catch (ihipException &ex) {
e = ex._code;
}
@@ -1083,7 +1231,7 @@ hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, con
stream->locked_copySync((unsigned char*)dst->data + i*dst_w, (unsigned char*)src + i*src_w, width, kind);
}
}
catch (ihipException ex) {
catch (ihipException &ex) {
e = ex._code;
}
@@ -1104,7 +1252,7 @@ hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset,
try {
stream->locked_copySync((char *)dst->data + wOffset, src, count, kind);
}
catch (ihipException ex) {
catch (ihipException &ex) {
e = ex._code;
}
@@ -1154,49 +1302,66 @@ hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p)
}
}
}
catch (ihipException ex) {
catch (ihipException &ex) {
e = ex._code;
}
return ihipLogStatus(e);
}
// TODO - make member function of stream?
namespace
{
template<
uint32_t block_dim,
typename RandomAccessIterator,
typename N,
typename T>
__global__
void hip_fill_n(RandomAccessIterator f, N n, T value)
{
const uint32_t grid_dim = gridDim.x * blockDim.x;
size_t idx = blockIdx.x * block_dim + threadIdx.x;
while (idx < n) {
__builtin_memcpy(
reinterpret_cast<void*>(&f[idx]),
reinterpret_cast<const void*>(&value),
sizeof(T));
idx += grid_dim;
}
}
template<
typename T,
typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>
inline
const T& clamp_integer(const T& x, const T& lower, const T& upper)
{
assert(!(upper < lower));
return std::min(upper, std::max(x, lower));
}
}
template <typename T>
void
ihipMemsetKernel(hipStream_t stream,
LockedAccessor_StreamCrit_t &crit,
T * ptr, T val, size_t sizeBytes,
hc::completion_future *cf)
T * ptr, T val, size_t sizeBytes)
{
int wg = std::min((unsigned)8, stream->getDevice()->_computeUnits);
const int threads_per_wg = 256;
static constexpr uint32_t block_dim = 256;
int threads = wg * threads_per_wg;
if (threads > sizeBytes) {
threads = ((sizeBytes + threads_per_wg - 1) / threads_per_wg) * threads_per_wg;
}
hc::extent<1> ext(threads);
auto ext_tile = ext.tile(threads_per_wg);
*cf =
hc::parallel_for_each(
crit->_av,
ext_tile,
[=] (hc::tiled_index<1> idx)
__attribute__((hc))
{
int offset = amp_get_global_id(0);
// TODO-HCC - change to hc_get_local_size()
int stride = amp_get_local_size(0) * hc_get_num_groups(0) ;
for (int i=offset; i<sizeBytes; i+=stride) {
ptr[i] = val;
}
});
const uint32_t grid_dim = clamp_integer<size_t>(
sizeBytes / block_dim, 1, UINT32_MAX);
hipLaunchKernelGGL(
hip_fill_n<block_dim>,
dim3(grid_dim),
dim3{block_dim},
0u,
stream,
ptr,
sizeBytes,
std::move(val));
}
@@ -1210,17 +1375,12 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s
stream = ihipSyncAndResolveStream(stream);
if (stream) {
auto crit = stream->lockopen_preKernelCommand();
hc::completion_future cf ;
if ((sizeBytes & 0x3) == 0) {
// use a faster dword-per-workitem copy:
try {
value = value & 0xff;
uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
ihipMemsetKernel<uint32_t> (stream, crit, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t), &cf);
ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t));
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
@@ -1228,19 +1388,16 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s
} else {
// use a slow byte-per-workitem copy:
try {
ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes, &cf);
ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
}
}
stream->lockclose_postKernelCommand("hipMemsetAsync", &crit->_av);
if (HIP_API_BLOCKING) {
tprintf (DB_SYNC, "%s LAUNCH_BLOCKING wait for hipMemsetAsync.\n", ToString(stream).c_str());
cf.wait();
stream->locked_wait();
}
} else {
e = hipErrorInvalidValue;
@@ -1261,16 +1418,12 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes)
stream = ihipSyncAndResolveStream(stream);
if (stream) {
auto crit = stream->lockopen_preKernelCommand();
hc::completion_future cf ;
if ((sizeBytes & 0x3) == 0) {
// use a faster dword-per-workitem copy:
try {
value = value & 0xff;
uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
ihipMemsetKernel<uint32_t> (stream, crit, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t), &cf);
ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t));
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
@@ -1278,21 +1431,18 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes)
} else {
// use a slow byte-per-workitem copy:
try {
ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes, &cf);
ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
}
}
// TODO - is hipMemset supposed to be async?
cf.wait();
stream->lockclose_postKernelCommand("hipMemset", &crit->_av);
stream->locked_wait();
if (HIP_LAUNCH_BLOCKING) {
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str());
cf.wait();
stream->locked_wait();
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str());
}
} else {
@@ -1313,17 +1463,13 @@ hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t
stream = ihipSyncAndResolveStream(stream);
if (stream) {
auto crit = stream->lockopen_preKernelCommand();
hc::completion_future cf ;
size_t sizeBytes = pitch * height;
if ((sizeBytes & 0x3) == 0) {
// use a faster dword-per-workitem copy:
try {
value = value & 0xff;
uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
ihipMemsetKernel<uint32_t> (stream, crit, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t), &cf);
ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t));
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
@@ -1331,20 +1477,18 @@ hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t
} else {
// use a slow byte-per-workitem copy:
try {
ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes, &cf);
ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
}
}
// TODO - is hipMemset supposed to be async?
cf.wait();
stream->lockclose_postKernelCommand("hipMemset", &crit->_av);
stream->locked_wait();
if (HIP_LAUNCH_BLOCKING) {
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str());
cf.wait();
stream->locked_wait();
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str());
}
} else {
@@ -1365,36 +1509,30 @@ hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeByte
stream = ihipSyncAndResolveStream(stream);
if (stream) {
auto crit = stream->lockopen_preKernelCommand();
hc::completion_future cf ;
if ((sizeBytes & 0x3) == 0) {
// use a faster dword-per-workitem copy:
try {
uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ;
ihipMemsetKernel<uint32_t> (stream, crit, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t), &cf);
ihipMemsetKernel<uint32_t> (stream, static_cast<uint32_t*> (dst), value32, sizeBytes/sizeof(uint32_t));
}
catch (std::exception &ex) {
std::cout << ex.what() << std::endl;
e = hipErrorInvalidValue;
}
} else {
// use a slow byte-per-workitem copy:
try {
ihipMemsetKernel<char> (stream, crit, static_cast<char*> (dst), value, sizeBytes, &cf);
ihipMemsetKernel<char> (stream, static_cast<char*> (dst), value, sizeBytes);
}
catch (std::exception &ex) {
e = hipErrorInvalidValue;
}
}
cf.wait();
stream->lockclose_postKernelCommand("hipMemsetD8", &crit->_av);
stream->locked_wait();
if (HIP_LAUNCH_BLOCKING) {
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str());
cf.wait();
stream->locked_wait();
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str());
}
} else {
@@ -1593,7 +1731,7 @@ hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr){
HIP_INIT_API ( handle, devPtr);
hipError_t hipStatus = hipSuccess;
// Get the size of allocated pointer
size_t psize;
size_t psize = 0u;
hc::accelerator acc;
if((handle == NULL) || (devPtr == NULL)) {
hipStatus = hipErrorInvalidResourceHandle;
@@ -1606,8 +1744,9 @@ hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr){
am_status_t status = hc::am_memtracker_getinfo( &amPointerInfo , devPtr );
if (status == AM_SUCCESS) {
psize = (size_t)amPointerInfo._sizeBytes;
} else
} else {
hipStatus = hipErrorInvalidResourceHandle;
}
ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) handle;
// Save the size of the pointer to hipIpcMemHandle
iHandle->psize = psize;
+238 -499
View File
@@ -20,62 +20,65 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <fstream>
#include <stdio.h>
#include <stdlib.h>
#include <cstdint>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>
#include <map>
#include "elfio/elfio.hpp"
#include "hip/hip_runtime.h"
#include "hip/hcc_detail/program_state.hpp"
#include "hip_hcc_internal.h"
#include "hsa_helpers.hpp"
#include "trace_helper.h"
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#include <hsa/amd_hsa_kernel_code.h>
#include "elfio/elfio.hpp"
#include "hip/hip_runtime.h"
#include "hip_hcc_internal.h"
#include "trace_helper.h"
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <map>
#include <memory>
#include <mutex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <tuple>
#include <unordered_map>
#include <utility>
#include <vector>
//TODO Use Pool APIs from HCC to get memory regions.
#include <cassert>
using namespace ELFIO;
using namespace hip_impl;
using namespace std;
inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
assert(Align != 0u && "Align can't be 0.");
Skew %= Align;
return (Value + Align - 1 - Skew) / Align * Align + Skew;
}
struct ihipKernArgInfo{
std::vector<uint32_t> Size;
std::vector<uint32_t> Align;
std::vector<std::string> ArgType;
std::vector<std::string> ArgName;
vector<uint32_t> Size;
vector<uint32_t> Align;
vector<string> ArgType;
vector<string> ArgName;
uint32_t totalSize;
};
std::map<std::string,struct ihipKernArgInfo> kernelArguments;
struct MyElfNote {
uint32_t n_namesz = 0;
uint32_t n_descsz = 0;
uint32_t n_type = 0;
MyElfNote() = default;
};
map<string, ihipKernArgInfo> kernelArguments;
struct ihipModuleSymbol_t{
uint64_t _object; // The kernel object.
uint32_t _groupSegmentSize;
uint32_t _privateSegmentSize;
std::string _name; // TODO - review for performance cost. Name is just used for debug.
uint64_t _object; // The kernel object.
uint32_t _groupSegmentSize;
uint32_t _privateSegmentSize;
string _name; // TODO - review for performance cost. Name is just used for debug.
};
template <>
std::string ToString(hipFunction_t v)
string ToString(hipFunction_t v)
{
std::ostringstream ss;
ss << "0x" << std::hex << v->_object;
@@ -93,283 +96,20 @@ if (hsaStatus != HSA_STATUS_SUCCESS) {\
return ihipLogStatus(hipStatus);\
}
namespace hipdrv {
hsa_status_t findSystemRegions(hsa_region_t region, void *data){
hsa_region_segment_t segment_id;
hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id);
if(segment_id != HSA_REGION_SEGMENT_GLOBAL){
return HSA_STATUS_SUCCESS;
}
hsa_region_global_flag_t flags;
hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
hsa_region_t *reg = (hsa_region_t*)data;
if(flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED){
*reg = region;
}
return HSA_STATUS_SUCCESS;
}
} // End namespace hipdrv
uint64_t PrintSymbolSizes(const void *emi, const char *name){
using namespace ELFIO;
const Elf64_Ehdr *ehdr = (const Elf64_Ehdr*)emi;
if(NULL == ehdr || EV_CURRENT != ehdr->e_version){}
const Elf64_Shdr * shdr = (const Elf64_Shdr*)((char*)emi + ehdr->e_shoff);
for(uint16_t i=0;i<ehdr->e_shnum;++i){
if(shdr[i].sh_type == SHT_SYMTAB){
const Elf64_Sym *syms = (const Elf64_Sym*)((char*)emi + shdr[i].sh_offset);
assert(syms);
uint64_t numSyms = shdr[i].sh_size/shdr[i].sh_entsize;
const char* strtab = (const char*)((char*)emi + shdr[shdr[i].sh_link].sh_offset);
assert(strtab);
for(uint64_t i=0;i<numSyms;++i){
const char *symname = strtab + syms[i].st_name;
assert(symname);
uint64_t size = syms[i].st_size;
if(strcmp(name, symname) == 0){
return size;
}
}
}
}
return 0;
}
uint64_t ElfSize(const void *emi){
using namespace ELFIO;
const Elf64_Ehdr *ehdr = (const Elf64_Ehdr*)emi;
const Elf64_Shdr *shdr = (const Elf64_Shdr*)((char*)emi + ehdr->e_shoff);
uint64_t max_offset = ehdr->e_shoff;
uint64_t total_size = max_offset + ehdr->e_shentsize * ehdr->e_shnum;
for(uint16_t i=0;i < ehdr->e_shnum;++i){
uint64_t cur_offset = static_cast<uint64_t>(shdr[i].sh_offset);
if(max_offset < cur_offset){
max_offset = cur_offset;
total_size = max_offset;
if(SHT_NOBITS != shdr[i].sh_type){
total_size += static_cast<uint64_t>(shdr[i].sh_size);
}
}
}
return total_size;
}
namespace
{
template<typename P>
inline
ELFIO::section* find_section_if(ELFIO::elfio& reader, P p)
{
using namespace std;
const auto it = find_if(
reader.sections.begin(), reader.sections.end(), move(p));
return it != reader.sections.end() ? *it : nullptr;
}
inline
std::vector<std::string> copy_names_of_undefined_symbols(
const ELFIO::symbol_section_accessor& section)
{
using namespace ELFIO;
using namespace std;
vector<string> r;
for (auto i = 0u; i != section.get_symbols_num(); ++i) {
// TODO: this is boyscout code, caching the temporaries
// may be of worth.
string name;
Elf64_Addr value = 0;
Elf_Xword size = 0;
Elf_Half sect_idx = 0;
uint8_t bind = 0;
uint8_t type = 0;
uint8_t other = 0;
section.get_symbol(
i, name, value, size, bind, type, sect_idx, other);
if (sect_idx == SHN_UNDEF && !name.empty()) {
r.push_back(std::move(name));
}
}
return r;
}
inline
std::pair<ELFIO::Elf64_Addr, ELFIO::Elf_Xword> find_symbol_address(
const ELFIO::symbol_section_accessor& section,
const std::string& symbol_name)
{
using namespace ELFIO;
using namespace std;
static const pair<Elf64_Addr, Elf_Xword> r{0, 0};
for (auto i = 0u; i != section.get_symbols_num(); ++i) {
// TODO: this is boyscout code, caching the temporaries
// may be of worth.
string name;
Elf64_Addr value = 0;
Elf_Xword size = 0;
Elf_Half sect_idx = 0;
uint8_t bind = 0;
uint8_t type = 0;
uint8_t other = 0;
section.get_symbol(
i, name, value, size, bind, type, sect_idx, other);
if (name == symbol_name) return make_pair(value, size);
}
return r;
}
inline
void associate_code_object_symbols_with_host_allocation(
const ELFIO::elfio& reader,
const ELFIO::elfio& self_reader,
ELFIO::section* code_object_dynsym,
ELFIO::section* process_symtab,
hsa_agent_t agent,
hsa_executable_t executable)
{
using namespace ELFIO;
using namespace std;
if (!code_object_dynsym || !process_symtab) return;
const auto undefined_symbols = copy_names_of_undefined_symbols(
symbol_section_accessor{reader, code_object_dynsym});
for (auto&& x : undefined_symbols) {
const auto tmp = find_symbol_address(
symbol_section_accessor{self_reader, process_symtab}, x);
assert(tmp.first);
void* p = nullptr;
hsa_amd_memory_lock(
reinterpret_cast<void*>(tmp.first), tmp.second, &agent, 1, &p);
hsa_executable_agent_global_variable_define(
executable, agent, x.c_str(), p);
static vector<
unique_ptr<void, decltype(hsa_amd_memory_unlock)*>> globals;
static mutex mtx;
lock_guard<std::mutex> lck{mtx};
globals.emplace_back(p, hsa_amd_memory_unlock);
}
}
inline
void load_code_object_and_freeze_executable(
const char* file, hsa_agent_t agent, hsa_executable_t executable)
{ // TODO: the following sequence is inefficient, should be refactored
// into a single load of the file and subsequent ELFIO
// processing.
using namespace std;
static const auto cor_deleter = [](hsa_code_object_reader_t* p) {
hsa_code_object_reader_destroy(*p);
};
using RAII_code_reader = unique_ptr<
hsa_code_object_reader_t, decltype(cor_deleter)>;
unique_ptr<FILE, decltype(fclose)*> cobj{fopen(file, "r"), fclose};
RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter};
hsa_code_object_reader_create_from_file(fileno(cobj.get()), tmp.get());
hsa_executable_load_agent_code_object(
executable, agent, *tmp, nullptr, nullptr);
hsa_executable_freeze(executable, nullptr);
static vector<RAII_code_reader> code_readers;
static mutex mtx;
lock_guard<mutex> lck{mtx};
code_readers.push_back(move(tmp));
}
}
hipError_t hipModuleLoad(hipModule_t *module, const char *fname)
{
using namespace ELFIO;
HIP_INIT_API(module, fname);
hipError_t ret = hipSuccess;
*module = new ihipModule_t;
if(module == NULL){
return ihipLogStatus(hipErrorInvalidValue);
}
if (!fname) return ihipLogStatus(hipErrorInvalidValue);
auto ctx = ihipGetTlsDefaultCtx();
if(ctx == nullptr){
ret = hipErrorInvalidContext;
ifstream file{fname};
}else{
int deviceId = ctx->getDevice()->_deviceId;
ihipDevice_t *currentDevice = ihipGetDevice(deviceId);
if (!file.is_open()) return ihipLogStatus(hipErrorFileNotFound);
hsa_executable_create_alt(
HSA_PROFILE_FULL,
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
nullptr,
&(*module)->executable);
vector<char> tmp{
istreambuf_iterator<char>{file}, istreambuf_iterator<char>{}};
elfio reader;
if (!reader.load(fname)) {
return ihipLogStatus(hipErrorFileNotFound);
}
else {
// TODO: this may benefit from caching as well.
elfio self_reader;
self_reader.load("/proc/self/exe");
const auto symtab =
find_section_if(self_reader, [](const ELFIO::section* x) {
return x->get_type() == SHT_SYMTAB;
});
const auto code_object_dynsym =
find_section_if(reader, [](const ELFIO::section* x) {
return x->get_type() == SHT_DYNSYM;
});
associate_code_object_symbols_with_host_allocation(
reader,
self_reader,
code_object_dynsym,
symtab,
currentDevice->_hsaAgent,
(*module)->executable);
load_code_object_and_freeze_executable(
fname, currentDevice->_hsaAgent, (*module)->executable);
}
}
return ihipLogStatus(ret);
return hipModuleLoadData(module, tmp.data());
}
@@ -381,92 +121,13 @@ hipError_t hipModuleUnload(hipModule_t hmod)
// Currently we want for all inflight activity to complete, but don't prevent another
// thread from launching new kernels before we finish this operation.
ihipSynchronize();
hipError_t ret = hipSuccess;
hsa_status_t status = hsa_executable_destroy(hmod->executable);
if(status != HSA_STATUS_SUCCESS)
{
ret = hipErrorInvalidValue;
}
// status = hsa_code_object_destroy(hmod->object);
// if(status != HSA_STATUS_SUCCESS)
// {
// ret = hipErrorInvalidValue;
// }
// status = hsa_memory_free(hmod->ptr);
// if(status != HSA_STATUS_SUCCESS)
// {
// ret = hipErrorInvalidValue;
// }
for(auto f = hmod->funcTrack.begin(); f != hmod->funcTrack.end(); ++f) {
delete *f;
}
delete hmod;
return ihipLogStatus(ret);
delete hmod; // The ihipModule_t dtor will clean everything up.
hmod = nullptr;
return ihipLogStatus(hipSuccess);
}
hipError_t ihipModuleGetSymbol(hipFunction_t *func, hipModule_t hmod, const char *name)
{
auto ctx = ihipGetTlsDefaultCtx();
hipError_t ret = hipSuccess;
if (name == nullptr){
return (hipErrorInvalidValue);
}
if (ctx == nullptr){
ret = hipErrorInvalidContext;
} else {
std::string str(name);
for(auto f = hmod->funcTrack.begin(); f != hmod->funcTrack.end(); ++f) {
if((*f)->_name == str) {
*func = *f;
return ret;
}
}
ihipModuleSymbol_t *sym = new ihipModuleSymbol_t;
int deviceId = ctx->getDevice()->_deviceId;
ihipDevice_t *currentDevice = ihipGetDevice(deviceId);
hsa_agent_t gpuAgent = (hsa_agent_t)currentDevice->_hsaAgent;
hsa_status_t status;
hsa_executable_symbol_t symbol;
status = hsa_executable_get_symbol(hmod->executable, NULL, name, gpuAgent, 0, &symbol);
if(status != HSA_STATUS_SUCCESS){
return hipErrorNotFound;
}
status = hsa_executable_symbol_get_info(symbol,
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
&sym->_object);
CHECK_HSA(status, hipErrorNotFound);
status = hsa_executable_symbol_get_info(symbol,
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
&sym->_groupSegmentSize);
CHECK_HSA(status, hipErrorNotFound);
status = hsa_executable_symbol_get_info(symbol,
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
&sym->_privateSegmentSize);
CHECK_HSA(status, hipErrorNotFound);
sym->_name = name;
*func = sym;
hmod->funcTrack.push_back(*func);
}
return ret;
}
hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod,
const char *name){
HIP_INIT_API(hfunc, hmod, name);
return ihipLogStatus(ihipModuleGetSymbol(hfunc, hmod, name));
}
hipError_t ihipModuleLaunchKernel(hipFunction_t f,
uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ,
@@ -617,45 +278,11 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f,
namespace
{
struct Agent_global {
std::string name;
string name;
hipDeviceptr_t address;
std::uint32_t byte_cnt;
uint32_t byte_cnt;
};
inline
void* address(hsa_executable_symbol_t x)
{
void* r = nullptr;
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &r);
return r;
}
inline
std::string name(hsa_executable_symbol_t x)
{
uint32_t sz = 0u;
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &sz);
std::string r(sz, '\0');
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_NAME, &r.front());
return r;
}
inline
std::uint32_t size(hsa_executable_symbol_t x)
{
std::uint32_t r = 0;
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &r);
return r;
}
inline
void track(const Agent_global& x)
{
@@ -680,7 +307,7 @@ namespace
hc::am_memtracker_update(x.address, device->_deviceId, 0u);
}
template<typename Container = std::vector<Agent_global>>
template<typename Container = vector<Agent_global>>
inline
hsa_status_t copy_agent_global_variables(
hsa_executable_t, hsa_agent_t, hsa_executable_symbol_t x, void* out)
@@ -705,56 +332,58 @@ namespace
{
auto ctx = ihipGetTlsDefaultCtx();
if (!ctx) throw std::runtime_error{"No active HIP context."};
if (!ctx) throw runtime_error{"No active HIP context."};
auto device = ctx->getDevice();
if (!device) throw std::runtime_error{"No device available for HIP."};
if (!device) throw runtime_error{"No device available for HIP."};
ihipDevice_t *currentDevice = ihipGetDevice(device->_deviceId);
if (!currentDevice) {
throw std::runtime_error{"No active device for HIP"};
}
if (!currentDevice) throw runtime_error{"No active device for HIP."};
return currentDevice->_hsaAgent;
}
inline
std::vector<Agent_global> read_agent_globals(hipModule_t hmodule)
vector<Agent_global> read_agent_globals(
hsa_agent_t agent, hsa_executable_t executable)
{
std::vector<Agent_global> r;
vector<Agent_global> r;
hsa_executable_iterate_agent_symbols(
hmodule->executable, this_agent(), copy_agent_global_variables, &r);
executable, agent, copy_agent_global_variables, &r);
return r;
}
}
hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
hipModule_t hmod, const char* name)
{
HIP_INIT_API(dptr, bytes, hmod, name);
hipError_t ret = hipSuccess;
if(dptr == NULL || bytes == NULL){
return ihipLogStatus(hipErrorInvalidValue);
template<typename ForwardIterator>
pair<hipDeviceptr_t, size_t> read_global_description(
ForwardIterator f, ForwardIterator l, const char* name)
{
const auto it = std::find_if(
f, l, [=](const Agent_global& x) { return x.name == name; });
return it == l ?
make_pair(nullptr, 0u) : make_pair(it->address, it->byte_cnt);
}
if(name == NULL || hmod == NULL){
return ihipLogStatus(hipErrorNotInitialized);
}
else{
static std::unordered_map<
hipModule_t, std::vector<Agent_global>> agent_globals;
hipError_t read_agent_global_from_module(
hipDeviceptr_t *dptr,
size_t* bytes,
hipModule_t hmod,
const char* name)
{
static unordered_map<hipModule_t, vector<Agent_global>> agent_globals;
// TODO: this is not particularly robust.
if (agent_globals.count(hmod) == 0) {
static std::mutex mtx;
std::lock_guard<std::mutex> lck{mtx};
static mutex mtx;
lock_guard<mutex> lck{mtx};
if (agent_globals.count(hmod) == 0) {
agent_globals.emplace(hmod, read_agent_globals(hmod));
agent_globals.emplace(
hmod, read_agent_globals(this_agent(), hmod->executable));
}
}
@@ -762,74 +391,184 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
// It will have to be properly fleshed out in the future.
const auto it0 = agent_globals.find(hmod);
if (it0 == agent_globals.cend()) {
throw std::runtime_error{"agent_globals data structure corrupted."};
throw runtime_error{"agent_globals data structure corrupted."};
}
const auto it1 = std::find_if(
it0->second.cbegin(),
it0->second.cend(),
[=](const Agent_global& x) { return x.name == name; });
tie(*dptr, *bytes) = read_global_description(
it0->second.cbegin(), it0->second.cend(), name);
if (it1 == it0->second.cend()) return ihipLogStatus(hipErrorNotFound);
*dptr = it1->address;
*bytes = it1->byte_cnt;
return ihipLogStatus(hipSuccess);
return dptr ? hipSuccess : hipErrorNotFound;
}
hipError_t read_agent_global_from_process(
hipDeviceptr_t *dptr, size_t* bytes, const char* name)
{
static unordered_map<hsa_agent_t, vector<Agent_global>> agent_globals;
static std::once_flag f;
call_once(f, []() {
for (auto&& agent_executables : hip_impl::executables()) {
vector<Agent_global> tmp0;
for (auto&& executable : agent_executables.second) {
auto tmp1 = read_agent_globals(
agent_executables.first, executable);
tmp0.insert(
tmp0.end(),
make_move_iterator(tmp1.begin()),
make_move_iterator(tmp1.end()));
}
agent_globals.emplace(agent_executables.first, move(tmp0));
}
});
const auto it = agent_globals.find(this_agent());
if (it == agent_globals.cend()) return hipErrorNotInitialized;
tie(*dptr, *bytes) = read_global_description(
it->second.cbegin(), it->second.cend(), name);
return dptr ? hipSuccess : hipErrorNotFound;
}
hsa_executable_symbol_t find_kernel_by_name(
hsa_executable_t executable, const char* kname)
{
pair<const char*, hsa_executable_symbol_t> r{kname, {}};
hsa_executable_iterate_agent_symbols(
executable,
this_agent(),
[](hsa_executable_t, hsa_agent_t, hsa_executable_symbol_t x, void* s) {
auto p =
static_cast<pair<const char*, hsa_executable_symbol_t>*>(s);
if (type(x) != HSA_SYMBOL_KIND_KERNEL) {
return HSA_STATUS_SUCCESS;
}
if (name(x) != p->first) return HSA_STATUS_SUCCESS;
p->second = x;
return HSA_STATUS_INFO_BREAK;
}, &r);
return r.second;
}
string read_elf_file_as_string(const void* file)
{ // Precondition: file points to an ELF image that was BITWISE loaded
// into process accessible memory, and not one loaded by
// the loader. This is because in the latter case
// alignment may differ, which will break the size
// computation.
// the image is Elf64, and matches endianness i.e. it is
// Little Endian.
if (!file) return {};
auto h = static_cast<const Elf64_Ehdr*>(file);
auto s = static_cast<const char*>(file);
// This assumes the common case of SHT being the last part of the ELF.
auto sz = sizeof(Elf64_Ehdr) + h->e_shoff + h->e_shentsize * h->e_shnum;
return string{s, s + sz};
}
} // Anonymous namespace, internal linkage.
hipError_t ihipModuleGetFunction(
hipFunction_t *func, hipModule_t hmod, const char *name)
{
HIP_INIT_API(func, hmod, name);
if (!func || !name) return ihipLogStatus(hipErrorInvalidValue);
auto ctx = ihipGetTlsDefaultCtx();
if (!ctx) return ihipLogStatus(hipErrorInvalidContext);
hipError_t ret = hipSuccess;
*func = new ihipModuleSymbol_t;
if (!*func) return ihipLogStatus(hipErrorInvalidValue);
auto kernel = find_kernel_by_name(hmod->executable, name);
if (kernel.handle == 0u) return ihipLogStatus(hipErrorNotFound);
(*func)->_object = kernel_object(kernel);
(*func)->_groupSegmentSize = group_size(kernel);
(*func)->_privateSegmentSize = private_size(kernel);
(*func)->_name = name;
return ihipLogStatus(hipSuccess);
}
hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod,
const char *name){
HIP_INIT_API(hfunc, hmod, name);
return ihipLogStatus(ihipModuleGetFunction(hfunc, hmod, name));
}
hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
hipModule_t hmod, const char* name)
{
HIP_INIT_API(dptr, bytes, hmod, name);
if(!dptr || !bytes) return ihipLogStatus(hipErrorInvalidValue);
if(!name) return ihipLogStatus(hipErrorNotInitialized);
const auto r = hmod ?
read_agent_global_from_module(dptr, bytes, hmod, name) :
read_agent_global_from_process(dptr, bytes, name);
return ihipLogStatus(r);
}
hipError_t hipModuleLoadData(hipModule_t *module, const void *image)
{
HIP_INIT_API(module, image);
hipError_t ret = hipSuccess;
if(image == NULL || module == NULL){
return ihipLogStatus(hipErrorNotInitialized);
} else {
auto ctx = ihipGetTlsDefaultCtx();
*module = new ihipModule_t;
int deviceId = ctx->getDevice()->_deviceId;
ihipDevice_t *currentDevice = ihipGetDevice(deviceId);
void *p;
uint64_t size = ElfSize(image);
hsa_agent_t agent = currentDevice->_hsaAgent;
hsa_region_t sysRegion;
hsa_status_t status = hsa_agent_iterate_regions(agent, hipdrv::findSystemRegions, &sysRegion);
status = hsa_memory_allocate(sysRegion, size, (void**)&p);
if (!module) return ihipLogStatus(hipErrorInvalidValue);
if(status != HSA_STATUS_SUCCESS){
return ihipLogStatus(hipErrorOutOfMemory);
}
*module = new ihipModule_t;
char *ptr = (char*)p;
if(!ptr){
return ihipLogStatus(hipErrorOutOfMemory);
}
(*module)->ptr = p;
(*module)->size = size;
auto ctx = ihipGetTlsDefaultCtx();
if (!ctx) return ihipLogStatus(hipErrorInvalidContext);
memcpy(ptr, image, size);
hsa_executable_create_alt(
HSA_PROFILE_FULL,
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
nullptr,
&(*module)->executable);
status = hsa_code_object_deserialize(ptr, size, NULL, &(*module)->object);
(*module)->executable = hip_impl::load_executable(
read_elf_file_as_string(image), (*module)->executable, this_agent());
if(status != HSA_STATUS_SUCCESS){
return ihipLogStatus(hipErrorSharedObjectInitFailed);
}
status = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, NULL, &(*module)->executable);
CHECKLOG_HSA(status, hipErrorNotInitialized);
status = hsa_executable_load_code_object((*module)->executable, agent, (*module)->object, NULL);
CHECKLOG_HSA(status, hipErrorNotInitialized);
status = hsa_executable_freeze((*module)->executable, NULL);
CHECKLOG_HSA(status, hipErrorNotInitialized);
}
return ihipLogStatus(ret);
return ihipLogStatus(
(*module)->executable.handle ? hipSuccess : hipErrorUnknown);
}
hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues)
{
return hipModuleLoadData(module, image);
}
hipError_t hipModuleGetTexRef(
textureReference** texRef, hipModule_t hmod, const char* name)
{
HIP_INIT_API(texRef, hmod, name);
hipError_t ret = hipErrorNotFound;
if(!texRef) return ihipLogStatus(hipErrorInvalidValue);
if(!hmod || !name) return ihipLogStatus(hipErrorNotInitialized);
const auto it = globals().find(name);
if (it == globals().end()) return ihipLogStatus(hipErrorInvalidValue);
*texRef = static_cast<textureReference*>(it->second.get());
return ihipLogStatus(hipSuccess);
}
+190 -88
View File
@@ -32,19 +32,61 @@ void saveTextureInfo(const hipTexture* pTexture,
}
}
void getDrvChannelOrderAndType(const enum hipArray_Format Format,
unsigned int NumChannels,
hsa_ext_image_channel_order_t* channelOrder,
hsa_ext_image_channel_type_t* channelType)
{
switch(Format) {
case HIP_AD_FORMAT_UNSIGNED_INT8:
*channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8;
break;
case HIP_AD_FORMAT_UNSIGNED_INT16:
*channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16;
break;
case HIP_AD_FORMAT_UNSIGNED_INT32:
*channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32;
break;
case HIP_AD_FORMAT_SIGNED_INT8:
*channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8;
break;
case HIP_AD_FORMAT_SIGNED_INT16:
*channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16;
break;
case HIP_AD_FORMAT_SIGNED_INT32:
*channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32;
break;
case HIP_AD_FORMAT_HALF:
*channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT;
break;
case HIP_AD_FORMAT_FLOAT:
*channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT;
break;
default:
break;
}
if (NumChannels == 4) {
*channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA;
} else if (NumChannels == 2) {
*channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RG;
} else if (NumChannels == 1) {
*channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_R;
}
}
void getChannelOrderAndType(const hipChannelFormatDesc& desc,
enum hipTextureReadMode readMode,
hsa_ext_image_channel_order_t& channelOrder,
hsa_ext_image_channel_type_t& channelType)
hsa_ext_image_channel_order_t* channelOrder,
hsa_ext_image_channel_type_t* channelType)
{
if (desc.x != 0 && desc.y != 0 && desc.z != 0 && desc.w != 0) {
channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA;
*channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA;
} else if (desc.x != 0 && desc.y != 0 && desc.z != 0 && desc.w == 0) {
channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RGB;
*channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RGB;
} else if (desc.x != 0 && desc.y != 0 && desc.z == 0 && desc.w == 0) {
channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RG;
*channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RG;
} else if (desc.x != 0 && desc.y == 0 && desc.z == 0 && desc.w == 0) {
channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_R;
*channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_R;
} else {
}
@@ -52,49 +94,49 @@ void getChannelOrderAndType(const hipChannelFormatDesc& desc,
case hipChannelFormatKindUnsigned:
switch(desc.x) {
case 32:
channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32;
*channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32;
break;
case 16:
channelType = readMode == hipReadModeNormalizedFloat ? HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16 :
*channelType = readMode == hipReadModeNormalizedFloat ? HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16 :
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16;
break;
case 8:
channelType = readMode == hipReadModeNormalizedFloat ? HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8 :
*channelType = readMode == hipReadModeNormalizedFloat ? HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8 :
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8;
break;
default:
channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32;
*channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32;
}
break;
case hipChannelFormatKindSigned:
switch(desc.x) {
case 32:
channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32;
*channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32;
break;
case 16:
channelType = readMode == hipReadModeNormalizedFloat ? HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16 :
*channelType = readMode == hipReadModeNormalizedFloat ? HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16 :
HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16;
break;
case 8:
channelType = readMode == hipReadModeNormalizedFloat ? HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8 :
*channelType = readMode == hipReadModeNormalizedFloat ? HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8 :
HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8;
break;
default:
channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32;
*channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32;
}
break;
case hipChannelFormatKindFloat:
switch(desc.x) {
case 32:
channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT;
*channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT;
break;
case 16:
channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT;
*channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT;
break;
case 8:
break;
default:
channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT;
*channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT;
}
break;
case hipChannelFormatKindNone:
@@ -168,8 +210,6 @@ hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject,
const hipResourceViewDesc* pResViewDesc)
{
HIP_INIT_API(pTexObject, pResDesc, pTexDesc, pResViewDesc);
HIP_SET_DEVICE();
hipError_t hip_status = hipSuccess;
auto ctx = ihipGetTlsDefaultCtx();
@@ -215,7 +255,7 @@ hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject,
imageDescriptor.array_size = 0;
break;
}
getChannelOrderAndType(pResDesc->res.array.array->desc, pTexDesc->readMode, channelOrder, channelType);
getChannelOrderAndType(pResDesc->res.array.array->desc, pTexDesc->readMode, &channelOrder, &channelType);
break;
case hipResourceTypeMipmappedArray:
devPtr = pResDesc->res.mipmap.mipmap->data;
@@ -224,7 +264,7 @@ hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject,
imageDescriptor.depth = pResDesc->res.mipmap.mipmap->depth;
imageDescriptor.array_size = 0;
imageDescriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_2D;
getChannelOrderAndType(pResDesc->res.mipmap.mipmap->desc, pTexDesc->readMode, channelOrder, channelType);
getChannelOrderAndType(pResDesc->res.mipmap.mipmap->desc, pTexDesc->readMode, &channelOrder, &channelType);
break;
case hipResourceTypeLinear:
devPtr = pResDesc->res.linear.devPtr;
@@ -233,7 +273,7 @@ hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject,
imageDescriptor.depth = 0;
imageDescriptor.array_size = 0;
imageDescriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_1D; // ? HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR
getChannelOrderAndType(pResDesc->res.linear.desc, pTexDesc->readMode, channelOrder, channelType);
getChannelOrderAndType(pResDesc->res.linear.desc, pTexDesc->readMode, &channelOrder, &channelType);
break;
case hipResourceTypePitch2D:
devPtr = pResDesc->res.pitch2D.devPtr;
@@ -242,7 +282,7 @@ hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject,
imageDescriptor.depth = 0;
imageDescriptor.array_size = 0;
imageDescriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_2D;
getChannelOrderAndType(pResDesc->res.pitch2D.desc, pTexDesc->readMode, channelOrder, channelType);
getChannelOrderAndType(pResDesc->res.pitch2D.desc, pTexDesc->readMode, &channelOrder, &channelType);
break;
default:
break;
@@ -271,7 +311,6 @@ hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject,
hipError_t hipDestroyTextureObject(hipTextureObject_t textureObject)
{
HIP_INIT_API(textureObject);
HIP_SET_DEVICE();
hipError_t hip_status = hipSuccess;
@@ -296,8 +335,6 @@ hipError_t hipDestroyTextureObject(hipTextureObject_t textureObject)
hipError_t hipGetTextureObjectResourceDesc(hipResourceDesc* pResDesc, hipTextureObject_t textureObject)
{
HIP_INIT_API(pResDesc, textureObject);
HIP_SET_DEVICE();
hipError_t hip_status = hipSuccess;
auto ctx = ihipGetTlsDefaultCtx();
@@ -313,8 +350,6 @@ hipError_t hipGetTextureObjectResourceDesc(hipResourceDesc* pResDesc, hipTexture
hipError_t hipGetTextureObjectResourceViewDesc(hipResourceViewDesc* pResViewDesc, hipTextureObject_t textureObject)
{
HIP_INIT_API(pResViewDesc, textureObject);
HIP_SET_DEVICE();
hipError_t hip_status = hipSuccess;
auto ctx = ihipGetTlsDefaultCtx();
@@ -330,7 +365,6 @@ hipError_t hipGetTextureObjectResourceViewDesc(hipResourceViewDesc* pResViewDesc
hipError_t hipGetTextureObjectTextureDesc(hipTextureDesc* pTexDesc, hipTextureObject_t textureObject)
{
HIP_INIT_API(pTexDesc, textureObject);
HIP_SET_DEVICE();
hipError_t hip_status = hipSuccess;
@@ -349,18 +383,14 @@ hipError_t ihipBindTextureImpl(int dim,
enum hipTextureReadMode readMode,
size_t *offset,
const void *devPtr,
const struct hipChannelFormatDesc& desc,
size_t size,
enum hipTextureAddressMode addressMode,
enum hipTextureFilterMode filterMode,
int normalizedCoords,
hipTextureObject_t& textureObject)
const struct hipChannelFormatDesc* desc,
size_t size, textureReference* tex )
{
HIP_INIT_API();
HIP_SET_DEVICE();
hipError_t hip_status = hipSuccess;
enum hipTextureAddressMode addressMode = tex->addressMode[0];
enum hipTextureFilterMode filterMode = tex->filterMode;
int normalizedCoords = tex->normalized;
hipTextureObject_t& textureObject = tex->textureObject;
auto ctx = ihipGetTlsDefaultCtx();
if (ctx) {
hc::accelerator acc = ctx->getDevice()->_acc;
@@ -385,7 +415,11 @@ hipError_t ihipBindTextureImpl(int dim,
hsa_ext_image_channel_order_t channelOrder;
hsa_ext_image_channel_type_t channelType;
getChannelOrderAndType(desc, readMode, channelOrder, channelType);
if(NULL == desc) {
getDrvChannelOrderAndType(tex->format, tex->numChannels, &channelOrder, &channelType);
} else {
getChannelOrderAndType(*desc, readMode, &channelOrder, &channelType);
}
imageDescriptor.format.channel_order = channelOrder;
imageDescriptor.format.channel_type = channelType;
@@ -396,13 +430,13 @@ hipError_t ihipBindTextureImpl(int dim,
if (HSA_STATUS_SUCCESS != hsa_ext_image_create_with_layout(*agent, &imageDescriptor, devPtr, permission, HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, 0, 0, &(pTexture->image)) ||
HSA_STATUS_SUCCESS != hsa_ext_sampler_create(*agent, &samplerDescriptor, &(pTexture->sampler))) {
return ihipLogStatus(hipErrorRuntimeOther);
return hipErrorRuntimeOther;
}
getHipTextureObject(&textureObject, pTexture->image, pTexture->sampler);
textureHash[textureObject] = pTexture;
}
return ihipLogStatus(hip_status);
return hip_status;
}
hipError_t hipBindTexture(size_t* offset,
@@ -411,30 +445,28 @@ hipError_t hipBindTexture(size_t* offset,
const hipChannelFormatDesc* desc,
size_t size)
{
HIP_INIT_API(offset, tex, devPtr, desc, size);
hipError_t hip_status = hipSuccess;
// TODO: hipReadModeElementType is default.
return ihipBindTextureImpl(hipTextureType1D, hipReadModeElementType,
offset, devPtr, *desc, size,
tex->addressMode[0], tex->filterMode, tex->normalized,
tex->textureObject);
hip_status = ihipBindTextureImpl(hipTextureType1D, hipReadModeElementType,
offset, devPtr, desc, size, tex);
return ihipLogStatus(hip_status);
}
hipError_t ihipBindTexture2DImpl(int dim,
enum hipTextureReadMode readMode,
size_t *offset,
const void *devPtr,
const struct hipChannelFormatDesc& desc,
const struct hipChannelFormatDesc* desc,
size_t width,
size_t height,
enum hipTextureAddressMode addressMode,
enum hipTextureFilterMode filterMode,
int normalizedCoords,
hipTextureObject_t& textureObject)
textureReference* tex)
{
HIP_INIT_API();
HIP_SET_DEVICE();
hipError_t hip_status = hipSuccess;
enum hipTextureAddressMode addressMode = tex->addressMode[0];
enum hipTextureFilterMode filterMode = tex->filterMode;
int normalizedCoords = tex->normalized;
hipTextureObject_t& textureObject = tex->textureObject;
auto ctx = ihipGetTlsDefaultCtx();
if (ctx) {
hc::accelerator acc = ctx->getDevice()->_acc;
@@ -459,7 +491,12 @@ hipError_t ihipBindTexture2DImpl(int dim,
hsa_ext_image_channel_order_t channelOrder;
hsa_ext_image_channel_type_t channelType;
getChannelOrderAndType(desc, readMode, channelOrder, channelType);
if(NULL == desc) {
getDrvChannelOrderAndType(tex->format, tex->numChannels, &channelOrder, &channelType);
} else {
getChannelOrderAndType(*desc, readMode, &channelOrder, &channelType);
}
imageDescriptor.format.channel_order = channelOrder;
imageDescriptor.format.channel_type = channelType;
@@ -470,13 +507,13 @@ hipError_t ihipBindTexture2DImpl(int dim,
if (HSA_STATUS_SUCCESS != hsa_ext_image_create_with_layout(*agent, &imageDescriptor, devPtr, permission, HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, 0, 0, &(pTexture->image)) ||
HSA_STATUS_SUCCESS != hsa_ext_sampler_create(*agent, &samplerDescriptor, &(pTexture->sampler))) {
return ihipLogStatus(hipErrorRuntimeOther);
return hipErrorRuntimeOther;
}
getHipTextureObject(&textureObject, pTexture->image, pTexture->sampler);
textureHash[textureObject] = pTexture;
}
return ihipLogStatus(hip_status);
return hip_status;
}
hipError_t hipBindTexture2D(size_t* offset,
@@ -487,27 +524,24 @@ hipError_t hipBindTexture2D(size_t* offset,
size_t height,
size_t pitch)
{
// TODO: hipReadModeElementType is default.
return ihipBindTexture2DImpl(hipTextureType2D, hipReadModeElementType,
offset, devPtr, *desc, width, height,
tex->addressMode[0], tex->filterMode, tex->normalized,
tex->textureObject);
HIP_INIT_API(offset, tex, devPtr, desc, width, height, pitch);
hipError_t hip_status = hipSuccess;
hip_status = ihipBindTexture2DImpl(hipTextureType2D, hipReadModeElementType,
offset, devPtr, desc, width, height, tex);
return ihipLogStatus(hip_status);
}
hipError_t ihipBindTextureToArrayImpl(int dim,
enum hipTextureReadMode readMode,
hipArray_const_t array,
const struct hipChannelFormatDesc& desc,
enum hipTextureAddressMode addressMode,
enum hipTextureFilterMode filterMode,
int normalizedCoords,
hipTextureObject_t& textureObject)
textureReference* tex)
{
HIP_INIT_API();
HIP_SET_DEVICE();
hipError_t hip_status = hipSuccess;
enum hipTextureAddressMode addressMode = tex->addressMode[0];
enum hipTextureFilterMode filterMode = tex->filterMode;
int normalizedCoords = tex->normalized;
hipTextureObject_t& textureObject = tex->textureObject;
auto ctx = ihipGetTlsDefaultCtx();
if (ctx) {
hc::accelerator acc = ctx->getDevice()->_acc;
@@ -558,7 +592,11 @@ hipError_t ihipBindTextureToArrayImpl(int dim,
hsa_ext_image_channel_order_t channelOrder;
hsa_ext_image_channel_type_t channelType;
getChannelOrderAndType(desc, readMode, channelOrder, channelType);
if(array->isDrv) {
getDrvChannelOrderAndType(array->drvDesc.format, array->drvDesc.numChannels, &channelOrder, &channelType);
} else {
getChannelOrderAndType(desc, readMode, &channelOrder, &channelType);
}
imageDescriptor.format.channel_order = channelOrder;
imageDescriptor.format.channel_type = channelType;
@@ -569,38 +607,38 @@ hipError_t ihipBindTextureToArrayImpl(int dim,
if (HSA_STATUS_SUCCESS != hsa_ext_image_create_with_layout(*agent, &imageDescriptor, array->data, permission, HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, 0, 0, &(pTexture->image)) ||
HSA_STATUS_SUCCESS != hsa_ext_sampler_create(*agent, &samplerDescriptor, &(pTexture->sampler))) {
return ihipLogStatus(hipErrorRuntimeOther);
return hipErrorRuntimeOther;
}
getHipTextureObject(&textureObject, pTexture->image, pTexture->sampler);
textureHash[textureObject] = pTexture;
}
return ihipLogStatus(hip_status);
return hip_status;
}
hipError_t hipBindTextureToArray(textureReference* tex,
hipArray_const_t array,
const hipChannelFormatDesc* desc)
{
HIP_INIT_API(tex, array, desc);
hipError_t hip_status = hipSuccess;
// TODO: hipReadModeElementType is default.
return ihipBindTextureToArrayImpl(hipTextureType2D, hipReadModeElementType,
array, *desc,
tex->addressMode[0], tex->filterMode, tex->normalized,
tex->textureObject);
hip_status = ihipBindTextureToArrayImpl(hipTextureType2D, hipReadModeElementType,
array, *desc, tex);
return ihipLogStatus(hip_status);
}
hipError_t hipBindTextureToMipmappedArray(textureReference* tex,
hipMipmappedArray_const_t mipmappedArray,
const hipChannelFormatDesc* desc)
{
return hipSuccess;
HIP_INIT_API(tex, mipmappedArray, desc);
hipError_t hip_status = hipSuccess;
return ihipLogStatus(hip_status);
}
hipError_t ihipUnbindTextureImpl(const hipTextureObject_t& textureObject)
{
HIP_INIT_API();
HIP_SET_DEVICE();
hipError_t hip_status = hipSuccess;
auto ctx = ihipGetTlsDefaultCtx();
@@ -619,19 +657,20 @@ hipError_t ihipUnbindTextureImpl(const hipTextureObject_t& textureObject)
}
}
return ihipLogStatus(hip_status);
return hip_status;
}
hipError_t hipUnbindTexture(const textureReference* tex)
{
return ihipUnbindTextureImpl(tex->textureObject);
HIP_INIT_API(tex);
hipError_t hip_status = hipSuccess;
hip_status = ihipUnbindTextureImpl(tex->textureObject);
return ihipLogStatus(hip_status);
}
hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, hipArray_const_t array)
{
HIP_INIT_API(desc, array);
HIP_SET_DEVICE();
hipError_t hip_status = hipSuccess;
auto ctx = ihipGetTlsDefaultCtx();
@@ -644,7 +683,6 @@ hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, hipArray_const_t array)
hipError_t hipGetTextureAlignmentOffset(size_t* offset, const textureReference* tex)
{
HIP_INIT_API(offset, tex);
HIP_SET_DEVICE();
hipError_t hip_status = hipSuccess;
@@ -657,7 +695,6 @@ hipError_t hipGetTextureAlignmentOffset(size_t* offset, const textureReference*
hipError_t hipGetTextureReference(const textureReference** tex, const void* symbol)
{
HIP_INIT_API(tex, symbol);
HIP_SET_DEVICE();
hipError_t hip_status = hipSuccess;
@@ -666,3 +703,68 @@ hipError_t hipGetTextureReference(const textureReference** tex, const void* symb
}
return ihipLogStatus(hip_status);
}
hipError_t hipTexRefSetFormat (textureReference* tex, hipArray_Format fmt, int NumPackedComponents )
{
HIP_INIT_API(tex, fmt, NumPackedComponents);
hipError_t hip_status = hipSuccess;
tex->format = fmt;
tex->numChannels = NumPackedComponents;
return ihipLogStatus(hip_status);
}
hipError_t hipTexRefSetFlags ( textureReference* tex, unsigned int flags )
{
HIP_INIT_API(tex, flags);
hipError_t hip_status = hipSuccess;
tex->normalized = flags;
return ihipLogStatus(hip_status);
}
hipError_t hipTexRefSetFilterMode ( textureReference* tex, hipTextureFilterMode fm )
{
HIP_INIT_API(tex, fm);
hipError_t hip_status = hipSuccess;
tex->filterMode = fm;
return ihipLogStatus(hip_status);
}
hipError_t hipTexRefSetAddressMode ( textureReference* tex, int dim, hipTextureAddressMode am )
{
HIP_INIT_API(tex, dim, am);
hipError_t hip_status = hipSuccess;
tex->addressMode[dim] = am;
return ihipLogStatus(hip_status);
}
hipError_t hipTexRefSetArray ( textureReference* tex, hipArray_const_t array, unsigned int flags )
{
HIP_INIT_API(tex, array, flags);
hipError_t hip_status = hipSuccess;
hip_status = ihipBindTextureToArrayImpl(hipTextureType2D, hipReadModeElementType,
array, array->desc,tex );
return ihipLogStatus(hip_status);
}
hipError_t hipTexRefSetAddress( size_t* offset, textureReference* tex, hipDeviceptr_t devPtr, size_t size )
{
HIP_INIT_API(offset, tex, devPtr, size);
hipError_t hip_status = hipSuccess;
// TODO: hipReadModeElementType is default.
hip_status = ihipBindTextureImpl(hipTextureType1D, hipReadModeElementType,
offset, devPtr, NULL, size, tex);
return ihipLogStatus(hip_status);
}
hipError_t hipTexRefSetAddress2D( textureReference* tex, const HIP_ARRAY_DESCRIPTOR* desc, hipDeviceptr_t devPtr, size_t pitch )
{
HIP_INIT_API(tex, desc, devPtr, pitch);
size_t offset;
hipError_t hip_status = hipSuccess;
// TODO: hipReadModeElementType is default.
hip_status = ihipBindTexture2DImpl(hipTextureType2D, hipReadModeElementType,
&offset, devPtr, NULL, desc->width, desc->height, tex);
return ihipLogStatus(hip_status);
}
+112
View File
@@ -0,0 +1,112 @@
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include <hsa/hsa.h>
#include <cstdint>
#include <string>
namespace hip_impl
{
inline
void* address(hsa_executable_symbol_t x)
{
void* r = nullptr;
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &r);
return r;
}
inline
hsa_agent_t agent(hsa_executable_symbol_t x)
{
hsa_agent_t r = {};
hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_AGENT, &r);
return r;
}
inline
std::uint32_t group_size(hsa_executable_symbol_t x)
{
std::uint32_t r = 0u;
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &r);
return r;
}
inline
std::uint64_t kernel_object(hsa_executable_symbol_t x)
{
std::uint64_t r = 0u;
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &r);
return r;
}
inline
std::string name(hsa_executable_symbol_t x)
{
std::uint32_t sz = 0u;
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &sz);
std::string r(sz, '\0');
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_NAME, &r.front());
return r;
}
inline
std::uint32_t private_size(hsa_executable_symbol_t x)
{
std::uint32_t r = 0u;
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &r);
return r;
}
inline
std::uint32_t size(hsa_executable_symbol_t x)
{
std::uint32_t r = 0;
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &r);
return r;
}
inline
hsa_symbol_kind_t type(hsa_executable_symbol_t x)
{
hsa_symbol_kind_t r = {};
hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &r);
return r;
}
}
@@ -0,0 +1,97 @@
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// Internal header, do not percolate upwards.
#include "hip_hcc_internal.h"
#include "hc.hpp"
#include "trace_helper.h"
#include <iostream>
#include <sstream>
namespace hip_impl
{
hc::accelerator_view lock_stream_hip_(
hipStream_t& stream, void*& locked_stream)
{ // This allocated but does not take ownership of locked_stream. If it is
// not deleted elsewhere it will leak.
using L = decltype(stream->lockopen_preKernelCommand());
HIP_INIT();
stream = ihipSyncAndResolveStream(stream);
locked_stream = new L{stream->lockopen_preKernelCommand()};
return (*static_cast<L*>(locked_stream))->_av;
}
void print_prelaunch_trace_(
const char* kernel_name,
dim3 num_blocks,
dim3 dim_blocks,
int group_mem_bytes,
hipStream_t stream)
{
if ((HIP_TRACE_API & (1 << TRACE_KCMD)) ||
HIP_PROFILE_API ||
(COMPILE_HIP_DB && (HIP_TRACE_API & (1<<TRACE_ALL)))) {
std::stringstream os;
os << tls_tidInfo.tid() << "." << tls_tidInfo.apiSeqNum()
<< " hipLaunchKernel '" << kernel_name << "'"
<< " gridDim:" << num_blocks
<< " groupDim:" << dim_blocks
<< " sharedMem:+" << group_mem_bytes
<< " " << *stream;
if (HIP_PROFILE_API == 0x1) {
std::string shortAtpString("hipLaunchKernel:");
shortAtpString += kernel_name;
MARKER_BEGIN(shortAtpString.c_str(), "HIP");
} else if (HIP_PROFILE_API == 0x2) {
MARKER_BEGIN(os.str().c_str(), "HIP");
}
if (COMPILE_HIP_DB && HIP_TRACE_API) {
std::string fullStr;
recordApiTrace(&fullStr, os.str());
}
}
}
void unlock_stream_hip_(
hipStream_t stream,
void* locked_stream,
const char* kernel_name,
hc::accelerator_view* acc_v)
{ // Precondition: acc_v is the accelerator_view associated with stream
// which is guarded by locked_stream;
// locked_stream is deletable.
using L = decltype(stream->lockopen_preKernelCommand());
stream->lockclose_postKernelCommand(kernel_name, acc_v);
delete static_cast<L*>(locked_stream);
locked_stream = nullptr;
if(HIP_PROFILE_API) {
MARKER_END();
}
}
}
+3 -8
View File
@@ -84,7 +84,7 @@ __device__ float erfcf(float x)
}
__device__ float erfcinvf(float y)
{
return __hip_erfinvf(1 - y);
return hc::precise_math::erfcinvf(y);
}
__device__ float erfcxf(float x)
{
@@ -96,7 +96,7 @@ __device__ float erff(float x)
}
__device__ float erfinvf(float y)
{
return __hip_erfinvf(y);
return hc::precise_math::erfinvf(y);
}
__device__ float exp10f(float x)
{
@@ -192,12 +192,7 @@ __device__ float ldexpf(float x, int exp)
}
__device__ float lgammaf(float x)
{
float val = 0.0f;
float y = x - 1;
while(y > 0){
val += logf(y--);
}
return val;
return hc::precise_math::lgammaf(x);
}
__device__ long long int llrintf(float x)
{
+494
View File
@@ -0,0 +1,494 @@
#include "../include/hip/hcc_detail/program_state.hpp"
#include "../include/hip/hcc_detail/code_object_bundle.hpp"
#include "hip_hcc_internal.h"
#include "hsa_helpers.hpp"
#include "trace_helper.h"
#include "elfio/elfio.hpp"
#include <link.h>
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <mutex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
using namespace ELFIO;
using namespace hip_impl;
using namespace std;
namespace std
{
template<>
struct hash<hsa_isa_t> {
size_t operator()(hsa_isa_t x) const
{
return hash<decltype(x.handle)>{}(x.handle);
}
};
}
inline
constexpr
bool operator==(hsa_isa_t x, hsa_isa_t y)
{
return x.handle == y.handle;
}
namespace
{
struct Symbol {
std::string name;
ELFIO::Elf64_Addr value = 0;
ELFIO::Elf_Xword size = 0;
ELFIO::Elf_Half sect_idx = 0;
std::uint8_t bind = 0;
std::uint8_t type = 0;
std::uint8_t other = 0;
};
inline
Symbol read_symbol(const symbol_section_accessor& section, unsigned int idx)
{
assert(idx < section.get_symbols_num());
Symbol r;
section.get_symbol(
idx, r.name, r.value, r.size, r.bind, r.type, r.sect_idx, r.other);
return r;
}
template<typename P>
inline
section* find_section_if(elfio& reader, P p)
{
const auto it = find_if(
reader.sections.begin(), reader.sections.end(), move(p));
return it != reader.sections.end() ? *it : nullptr;
}
vector<string> copy_names_of_undefined_symbols(
const symbol_section_accessor& section)
{
vector<string> r;
for (auto i = 0u; i != section.get_symbols_num(); ++i) {
// TODO: this is boyscout code, caching the temporaries
// may be of worth.
auto tmp = read_symbol(section, i);
if (tmp.sect_idx == SHN_UNDEF && !tmp.name.empty()) {
r.push_back(std::move(tmp.name));
}
}
return r;
}
const std::unordered_map<
std::string,
std::pair<ELFIO::Elf64_Addr, ELFIO::Elf_Xword>>& symbol_addresses()
{
static unordered_map<string, pair<Elf64_Addr, Elf_Xword>> r;
static once_flag f;
call_once(f, []() {
dl_iterate_phdr([](dl_phdr_info* info, size_t, void*) {
static constexpr const char self[] = "/proc/self/exe";
elfio reader;
static unsigned int iter = 0u;
if (reader.load(!iter ? self : info->dlpi_name)) {
auto it = find_section_if(
reader, [](const class section* x) {
return x->get_type() == SHT_SYMTAB;
});
if (it) {
const symbol_section_accessor symtab{reader, it};
for (auto i = 0u; i != symtab.get_symbols_num(); ++i) {
auto tmp = read_symbol(symtab, i);
if (tmp.type == STT_OBJECT &&
tmp.sect_idx != SHN_UNDEF) {
const auto addr =
tmp.value + (iter ? info->dlpi_addr : 0);
r.emplace(
move(tmp.name), make_pair(addr, tmp.size));
}
}
}
++iter;
}
return 0;
}, nullptr);
});
return r;
}
void associate_code_object_symbols_with_host_allocation(
const elfio& reader,
section* code_object_dynsym,
hsa_agent_t agent,
hsa_executable_t executable)
{
if (!code_object_dynsym) return;
const auto undefined_symbols = copy_names_of_undefined_symbols(
symbol_section_accessor{reader, code_object_dynsym});
for (auto&& x : undefined_symbols) {
if (globals().find(x) != globals().cend()) return;
const auto it1 = symbol_addresses().find(x);
if (it1 == symbol_addresses().cend()) {
throw runtime_error{"Global symbol: " + x + " is undefined."};
}
static mutex mtx;
lock_guard<mutex> lck{mtx};
if (globals().find(x) != globals().cend()) return;
void* p = nullptr;
hsa_amd_memory_lock(
reinterpret_cast<void*>(it1->second.first),
it1->second.second,
nullptr, // All agents.
0,
&p);
hsa_executable_agent_global_variable_define(
executable, agent, x.c_str(), p);
globals().emplace(x, RAII_global{p, hsa_amd_memory_unlock});
}
}
vector<uint8_t> code_object_blob_for_process()
{
static constexpr const char self[] = "/proc/self/exe";
static constexpr const char kernel_section[] = ".kernel";
elfio reader;
if (!reader.load(self)) {
throw runtime_error{"Failed to load ELF file for current process."};
}
auto kernels = find_section_if(reader, [](const section* x) {
return x->get_name() == kernel_section;
});
vector<uint8_t> r;
if (kernels) {
r.insert(
r.end(),
kernels->get_data(),
kernels->get_data() + kernels->get_size());
}
return r;
}
const unordered_map<hsa_isa_t, vector<vector<uint8_t>>>& code_object_blobs()
{
static unordered_map<hsa_isa_t, vector<vector<uint8_t>>> r;
static once_flag f;
call_once(f, []() {
static vector<vector<uint8_t>> blobs{
code_object_blob_for_process()};
dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void*) {
elfio tmp;
if (tmp.load(info->dlpi_name)) {
const auto it = find_section_if(tmp, [](const section* x) {
return x->get_name() == ".kernel";
});
if (it) blobs.emplace_back(
it->get_data(), it->get_data() + it->get_size());
}
return 0;
}, nullptr);
for (auto&& blob : blobs) {
Bundled_code_header tmp{blob};
if (valid(tmp)) {
for (auto&& bundle : bundles(tmp)) {
r[triple_to_hsa_isa(bundle.triple)].push_back(
bundle.blob);
}
}
}
});
return r;
}
vector<pair<uintptr_t, string>> function_names_for(
const elfio& reader, section* symtab)
{
vector<pair<uintptr_t, string>> r;
symbol_section_accessor symbols{reader, symtab};
for (auto i = 0u; i != symbols.get_symbols_num(); ++i) {
// TODO: this is boyscout code, caching the temporaries
// may be of worth.
auto tmp = read_symbol(symbols, i);
if (tmp.type == STT_FUNC &&
tmp.sect_idx != SHN_UNDEF &&
!tmp.name.empty()) {
r.emplace_back(tmp.value, tmp.name);
}
}
return r;
}
const vector<pair<uintptr_t, string>>& function_names_for_process()
{
static constexpr const char self[] = "/proc/self/exe";
static vector<pair<uintptr_t, string>> r;
static once_flag f;
call_once(f, []() {
elfio reader;
if (!reader.load(self)) {
throw runtime_error{
"Failed to load the ELF file for the current process."};
}
auto symtab = find_section_if(reader, [](const section* x) {
return x->get_type() == SHT_SYMTAB;
});
if (symtab) r = function_names_for(reader, symtab);
});
return r;
}
const unordered_map<string, vector<hsa_executable_symbol_t>>& kernels()
{
static unordered_map<string, vector<hsa_executable_symbol_t>> r;
static once_flag f;
call_once(f, []() {
static const auto copy_kernels = [](
hsa_executable_t, hsa_agent_t, hsa_executable_symbol_t s, void*) {
if (type(s) == HSA_SYMBOL_KIND_KERNEL) r[name(s)].push_back(s);
return HSA_STATUS_SUCCESS;
};
for (auto&& agent_executables : executables()) {
for (auto&& executable : agent_executables.second) {
hsa_executable_iterate_agent_symbols(
executable,
agent_executables.first,
copy_kernels,
nullptr);
}
}
});
return r;
}
void load_code_object_and_freeze_executable(
const string& file, hsa_agent_t agent, hsa_executable_t executable)
{ // TODO: the following sequence is inefficient, should be refactored
// into a single load of the file and subsequent ELFIO
// processing.
static const auto cor_deleter = [](hsa_code_object_reader_t* p) {
if (p) {
hsa_code_object_reader_destroy(*p);
delete p;
}
};
using RAII_code_reader = unique_ptr<
hsa_code_object_reader_t, decltype(cor_deleter)>;
if (!file.empty()) {
RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter};
hsa_code_object_reader_create_from_memory(
file.data(), file.size(), tmp.get());
hsa_executable_load_agent_code_object(
executable, agent, *tmp, nullptr, nullptr);
hsa_executable_freeze(executable, nullptr);
static vector<RAII_code_reader> code_readers;
static mutex mtx;
lock_guard<mutex> lck{mtx};
code_readers.push_back(move(tmp));
}
}
}
namespace hip_impl
{
const unordered_map<hsa_agent_t, vector<hsa_executable_t>>& executables()
{ // TODO: This leaks the hsa_executable_ts, it should use RAII.
static unordered_map<hsa_agent_t, vector<hsa_executable_t>> r;
static once_flag f;
call_once(f, []() {
static const auto accelerators = hc::accelerator::get_all();
for (auto&& acc : accelerators) {
auto agent = static_cast<hsa_agent_t*>(acc.get_hsa_agent());
if (!agent || !acc.is_hsa_accelerator()) continue;
hsa_agent_iterate_isas(*agent, [](hsa_isa_t x, void* pa) {
const auto it = code_object_blobs().find(x);
if (it != code_object_blobs().cend()) {
hsa_agent_t a = *static_cast<hsa_agent_t*>(pa);
for (auto&& blob : it->second) {
hsa_executable_t tmp = {};
hsa_executable_create_alt(
HSA_PROFILE_FULL,
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
nullptr,
&tmp);
// TODO: this is massively inefficient and only
// meant for illustration.
string blob_to_str{blob.cbegin(), blob.cend()};
tmp = load_executable(blob_to_str, tmp, a);
if (tmp.handle) r[a].push_back(tmp);
}
}
return HSA_STATUS_SUCCESS;
}, agent);
}
});
return r;
}
const unordered_map<uintptr_t, string>& function_names()
{
static unordered_map<uintptr_t, string> r{
function_names_for_process().cbegin(),
function_names_for_process().cend()};
static once_flag f;
call_once(f, []() {
dl_iterate_phdr([](dl_phdr_info* info, size_t, void*) {
elfio tmp;
if (tmp.load(info->dlpi_name)) {
const auto it = find_section_if(tmp, [](const section* x) {
return x->get_type() == SHT_SYMTAB;
});
if (it) {
auto n = function_names_for(tmp, it);
for (auto&& f : n) f.first += info->dlpi_addr;
r.insert(
make_move_iterator(n.begin()),
make_move_iterator(n.end()));
}
}
return 0;
}, nullptr);
});
return r;
}
const unordered_map<
uintptr_t, vector<pair<hsa_agent_t, Kernel_descriptor>>>& functions()
{
static unordered_map<
uintptr_t, vector<pair<hsa_agent_t, Kernel_descriptor>>> r;
static once_flag f;
call_once(f, []() {
for (auto&& function : function_names()) {
const auto it = kernels().find(function.second);
if (it != kernels().cend()) {
for (auto&& kernel_symbol : it->second) {
r[function.first].emplace_back(
agent(kernel_symbol),
Kernel_descriptor{
kernel_object(kernel_symbol),
group_size(kernel_symbol),
private_size(kernel_symbol),
it->first});
}
}
}
});
return r;
}
unordered_map<string, RAII_global>& globals()
{
static unordered_map<string, RAII_global> r;
static once_flag f;
call_once(f, []() { r.reserve(symbol_addresses().size()); });
return r;
}
hsa_executable_t load_executable(
const string& file, hsa_executable_t executable, hsa_agent_t agent)
{
elfio reader;
stringstream tmp{file};
if (!reader.load(tmp)) return hsa_executable_t{};
const auto code_object_dynsym =
find_section_if(reader, [](const ELFIO::section* x) {
return x->get_type() == SHT_DYNSYM;
});
associate_code_object_symbols_with_host_allocation(
reader, code_object_dynsym, agent, executable);
load_code_object_and_freeze_executable(file, agent, executable);
return executable;
}
} // Namespace hip_impl.
@@ -23,7 +23,7 @@ __global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In)
__global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size)
{
int tx = threadIdx.x;
memset(ptr + tx, val, (sizeof(uint32_t)*(size/LEN)));
memset(ptr + tx, val, sizeof(uint32_t));
}
int main()
@@ -18,8 +18,8 @@ THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../test_common.cpp
* RUN: %t
* XXBUILD: %t %s ../test_common.cpp
* XXRUN: %t
* HIT_END
*/
@@ -159,11 +159,16 @@ bool dataTypesRun(){
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
hipLaunchKernel(vectoradd_float,
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0, 0,
deviceA ,deviceB ,WIDTH ,HEIGHT);
hipLaunchKernel(
vectoradd_float,
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0,
0,
deviceA,
static_cast<const T*>(deviceB),
WIDTH,
HEIGHT);
HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost));
@@ -221,11 +226,16 @@ bool dataTypesRun2(){
HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T)));
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
hipLaunchKernel(vectoradd_float,
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0, 0,
deviceA ,deviceB,WIDTH ,HEIGHT);
hipLaunchKernel(
vectoradd_float,
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0,
0,
deviceA,
static_cast<const T*>(deviceB),
WIDTH,
HEIGHT);
HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost));
@@ -281,11 +291,16 @@ bool dataTypesRun4(){
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
hipLaunchKernel(vectoradd_float,
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0, 0,
deviceA ,deviceB ,WIDTH ,HEIGHT);
hipLaunchKernel(
vectoradd_float,
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0,
0,
deviceA,
static_cast<const T*>(deviceB),
WIDTH,
HEIGHT);
HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost));
@@ -69,7 +69,16 @@ int main(int argc, char *argv[])
// Record the start event
HIPCHECK (hipEventRecord(start, NULL));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const float*>(A_d),
static_cast<const float*>(B_d),
C_d,
N);
HIPCHECK (hipEventRecord(stop, NULL));
@@ -77,7 +77,16 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_
HIPCHECK(hipEventRecord(timingDisabled, stream));
// sandwhich a kernel:
HIPCHECK(hipEventRecord(start, stream));
hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, stream, C_d, C_h, numElements, count);
hipLaunchKernelGGL(
HipTest::addCountReverse,
dim3(blocks),
dim3(threadsPerBlock),
0,
stream,
static_cast<const int*>(C_d),
C_h,
numElements,
count);
HIPCHECK(hipEventRecord(stop, stream));
@@ -243,7 +243,16 @@ void memcpytest2(DeviceMemory<T> *dmem, HostMemory<T> *hmem, size_t numElements,
HIPCHECK ( hipMemcpy(dmem->B_d(), hmem->B_h(), sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
}
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, dmem->A_d(), dmem->B_d(), dmem->C_d(), numElements);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const T*>(dmem->A_d()),
static_cast<const T*>(dmem->B_d()),
dmem->C_d(),
numElements);
if (useDeviceToDevice) {
// Do an extra device-to-device copy here to mix things up:
@@ -49,21 +49,39 @@ int main()
HIPCHECK(hipMalloc(&Y_d,Nbytes));
HIPCHECK(hipMalloc(&Z_d,Nbytes));
HIPCHECK(hipSetDevice(0));
HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const int*>(A_d),
static_cast<const int*>(B_d),
C_d,
N);
HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
HIPCHECK(hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HIPCHECK(hipSetDevice(1));
HIPCHECK(hipMemcpyDtoD(X_d, A_d, Nbytes));
HIPCHECK(hipMemcpyDtoD(X_d, A_d, Nbytes));
HIPCHECK(hipMemcpyDtoD(Y_d, B_d, Nbytes));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const int*>(X_d),
static_cast<const int*>(Y_d),
Z_d,
N);
HIPCHECK(hipMemcpyDtoH(C_h, Z_d, Nbytes));
HIPCHECK(hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
@@ -73,8 +91,8 @@ int main()
HIPCHECK(hipFree(Y_d));
HIPCHECK(hipFree(Z_d));
}
passed();
}
@@ -50,25 +50,43 @@ int main()
HIPCHECK(hipMalloc(&Y_d,Nbytes));
HIPCHECK(hipMalloc(&Z_d,Nbytes));
HIPCHECK(hipSetDevice(0));
HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const int*>(A_d),
static_cast<const int*>(B_d),
C_d,
N);
HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
HIPCHECK(hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HIPCHECK(hipStreamCreate(&s));
HIPCHECK(hipStreamCreate(&s));
HIPCHECK(hipSetDevice(1));
HIPCHECK(hipMemcpyDtoDAsync(X_d, A_d, Nbytes, s));
HIPCHECK(hipMemcpyDtoDAsync(X_d, A_d, Nbytes, s));
HIPCHECK(hipMemcpyDtoDAsync(Y_d, B_d, Nbytes, s));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const int*>(X_d),
static_cast<const int*>(Y_d),
Z_d,
N);
HIPCHECK(hipMemcpyDtoHAsync(C_h, Z_d, Nbytes, s));
HIPCHECK(hipStreamSynchronize(s));
HIPCHECK(hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HIPCHECK(hipStreamDestroy(s));
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
@@ -78,7 +96,7 @@ int main()
}
passed();
}
@@ -48,24 +48,42 @@ int main()
HIPCHECK(hipMalloc(&X_d,Nbytes));
HIPCHECK(hipMalloc(&Y_d,Nbytes));
HIPCHECK(hipMalloc(&Z_d,Nbytes));
HIPCHECK(hipSetDevice(0));
HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const int*>(A_d),
static_cast<const int*>(B_d),
C_d,
N);
HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
HIPCHECK(hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HIPCHECK(hipSetDevice(1));
hipMemcpyPeer(X_d, 1, A_d, 0, Nbytes); //this call is eqv to hipMemcpy(hipMemcpyD2D) which goes via stg bufs.
hipMemcpyPeer(Y_d, 1, B_d, 0, Nbytes);
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const int*>(X_d),
static_cast<const int*>(Y_d),
Z_d,
N);
HIPCHECK(hipMemcpy(C_h, Z_d, Nbytes, hipMemcpyDeviceToHost));
HIPCHECK(hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
HIPCHECK(hipFree(X_d));
HIPCHECK(hipFree(Y_d));
@@ -74,7 +92,7 @@ int main()
passed();
}
@@ -51,26 +51,44 @@ int main()
HIPCHECK(hipMalloc(&Y_d,Nbytes));
HIPCHECK(hipMalloc(&Z_d,Nbytes));
HIPCHECK(hipSetDevice(0));
HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const int*>(A_d),
static_cast<const int*>(B_d),
C_d,
N);
HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
HIPCHECK (hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HIPCHECK(hipStreamCreate(&s));
HIPCHECK(hipStreamCreate(&s));
HIPCHECK(hipSetDevice(1));
HIPCHECK(hipMemcpyPeerAsync(X_d, 1, A_d, 0, Nbytes, s));
HIPCHECK(hipMemcpyPeerAsync(Y_d, 1, B_d, 0, Nbytes, s));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const int*>(X_d),
static_cast<const int*>(Y_d),
Z_d,
N);
HIPCHECK ( hipMemcpy(C_h, Z_d, Nbytes, hipMemcpyDeviceToHost));
HIPCHECK (hipDeviceSynchronize());
HIPCHECK (hipStreamSynchronize(s));
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HIPCHECK(hipStreamDestroy(s));
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
HIPCHECK(hipFree(X_d));
@@ -79,7 +97,7 @@ int main()
}
passed();
}
@@ -63,7 +63,16 @@ void simpleTest1()
HIPCHECK ( memcopy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
HIPCHECK ( memcopy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N);
hipLaunchKernel(
HipTest::vectorADD,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const int*>(A_d),
static_cast<const int*>(B_d),
C_d,
N);
HIPCHECK ( memcopy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
@@ -90,7 +90,16 @@ void simpleVectorAdd(size_t numElements, int iters, hipStream_t stream)
// This is the null stream?
//hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements);
hipLaunchKernel(HipTest::vectorADDReverse, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements);
hipLaunchKernel(
HipTest::vectorADDReverse,
dim3(blocks),
dim3(threadsPerBlock),
0,
0,
static_cast<const T*>(A_d),
static_cast<const T*>(B_d),
C_d,
numElements);
MemTraits<C>::Copy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream);
@@ -128,7 +128,17 @@ void Streamer<T>::enqueAsync()
{
printf ("testing: %s numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0);
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements);
hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _B_d, _C_d, _numElements, p_repeat);
hipLaunchKernel(
vectorADDRepeat,
dim3(blocks),
dim3(threadsPerBlock),
0,
_stream,
static_cast<const T*>(_A_d),
static_cast<const T*>(_B_d),
_C_d,
_numElements,
p_repeat);
}
@@ -225,7 +235,17 @@ int main(int argc, char *argv[])
auto lastStreamer = streamers[s - 1];
// Dispatch to NULL stream, should wait for prior async activity to complete before beginning:
hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/);
hipLaunchKernel(
vectorADDRepeat,
dim3(blocks),
dim3(threadsPerBlock),
0,
0/*nullstream*/,
static_cast<const int*>(lastStreamer->_C_d),
static_cast<const int*>(lastStreamer->_C_d),
nullStreamer->_C_d,
numElements,
1/*repeat*/);
if (p_db) {
@@ -257,7 +277,17 @@ int main(int argc, char *argv[])
auto lastStreamer = streamers[s - 1];
// Dispatch to NULL stream, should wait for prior async activity to complete before beginning:
hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/);
hipLaunchKernel(
vectorADDRepeat,
dim3(blocks),
dim3(threadsPerBlock),
0,
0/*nullstream*/,
static_cast<const int*>(lastStreamer->_C_d),
static_cast<const int*>(lastStreamer->_C_d),
nullStreamer->_C_d,
numElements,
1/*repeat*/);
nullStreamer->D2H();
@@ -97,7 +97,16 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode s
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
// Launch kernel into null stream, should result in C_h == count.
hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, 0 /*stream*/, C_d, C_h, numElements, count);
hipLaunchKernelGGL(
HipTest::addCountReverse,
dim3(blocks),
dim3(threadsPerBlock),
0,
0 /*stream*/,
static_cast<const int*>(C_d),
C_h,
numElements,
count);
HIPCHECK(hipEventRecord(stop, 0/*default*/));
switch (syncMode) {
@@ -18,8 +18,8 @@ THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
* RUN: %t
* ZZZBUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
* ZZZRUN: %t
* HIT_END
*/
@@ -163,9 +163,27 @@ void Streamer<T>::runAsyncAfter(Streamer<T> *depStreamer, bool waitSameStream)
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements);
if (_commandType == COMMAND_ADD_REVERSE) {
hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count);
hipLaunchKernelGGL(
HipTest::addCountReverse,
dim3(blocks),
dim3(threadsPerBlock),
0,
_stream,
static_cast<const T*>(_A_d),
_C_d,
static_cast<int64_t>(_numElements),
static_cast<int>(p_count));
} else if (_commandType == COMMAND_ADD_FORWARD) {
hipLaunchKernelGGL(HipTest::addCount, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count);
hipLaunchKernelGGL(
HipTest::addCount,
dim3(blocks),
dim3(threadsPerBlock),
0,
_stream,
static_cast<const T*>(_A_d),
_C_d,
_numElements,
static_cast<int>(p_count));
} else if (_commandType == COMMAND_COPY) {
HIPCHECK(hipMemcpyAsync(_C_d, _A_d, _numElements * sizeof(T), hipMemcpyDeviceToDevice, _stream));
} else {
@@ -14,7 +14,9 @@ texture<float, 2, hipReadModeElementType> tex;
bool testResult = true;
__global__ void tex2DKernel(float* outputData,
#ifdef __HIP_PLATFORM_HCC__
hipTextureObject_t textureObject,
#endif
int width,
int height)
{
@@ -78,7 +80,7 @@ void runTest(int argc, char **argv)
#ifdef __HIP_PLATFORM_HCC__
hipLaunchKernelGGL(tex2DKernel, dim3(dimGrid), dim3(dimBlock), 0, 0, dData, tex.textureObject, width, height);
#else
hipLaunchKernelGGL(tex2DKernel, dim3(dimGrid), dim3(dimBlock), 0, 0, dData, 0, width, height);
hipLaunchKernelGGL(tex2DKernel, dim3(dimGrid), dim3(dimBlock), 0, 0, dData, width, height);
#endif
hipDeviceSynchronize();