This switches HIP from its currently convoluted macro + pfe based dispatch mechanism to a more natural one partially based on the existing module API. The basic idea is that HCC will always correctly emit __global__ functions: as empty-bodied stubs, on host, and as kernels, on device. It then becomes trivial to obtain the mangled name on host, at dispatch, from the function's address, and then to use the mangled name to retrieve the kernel. This should address all problems stemming from serialisation, dubious mismatches due to the manufactured functor, macro-isms et al. It also immediately enables support for generalised globals as a consequence of that being available in the module API. Finally, it will make debug much easier, since the actual names of the __global__ functions will automatically be used in traces etc. One detail is that due to how dispatch works now (hipLaunchKernel and hipLaunchKernelGGL are themselves variadic function templates which deduce the function type of the callee), in certain cases it may be necesssary to insert explicit casts to ensure that the variadic argument list selects a viable overload - this can be observed in some unit tests. Eventually we may be able to remove this limitation, but for now it does not appear terribly onerous. The code is not extremely HIPpie, nor is it fully optimised, but rather is intended as a starting point for the HIP team to make its own.
[ROCm/clr commit: 28f87f7d2e]
Este commit está contenido en:
@@ -0,0 +1,134 @@
|
||||
#pragma once
|
||||
|
||||
#include <hsa/hsa.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <istream>
|
||||
#include <iterator>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace hip_impl
|
||||
{
|
||||
hsa_isa_t triple_to_hsa_isa(const std::string& triple);
|
||||
|
||||
struct Bundled_code {
|
||||
union {
|
||||
struct {
|
||||
std::uint64_t offset;
|
||||
std::uint64_t bundle_sz;
|
||||
std::uint64_t triple_sz;
|
||||
};
|
||||
std::uint8_t cbuf[
|
||||
sizeof(offset) + sizeof(bundle_sz) + sizeof(triple_sz)];
|
||||
};
|
||||
std::string triple;
|
||||
std::vector<std::uint8_t> blob;
|
||||
};
|
||||
|
||||
class Bundled_code_header {
|
||||
// DATA - STATICS
|
||||
static constexpr const char magic_string_[] =
|
||||
"__CLANG_OFFLOAD_BUNDLE__";
|
||||
static constexpr auto magic_string_sz_ = sizeof(magic_string_) - 1;
|
||||
|
||||
// DATA
|
||||
union {
|
||||
struct {
|
||||
std::uint8_t bundler_magic_string_[magic_string_sz_];
|
||||
std::uint64_t bundle_cnt_;
|
||||
};
|
||||
std::uint8_t cbuf_[
|
||||
sizeof(bundler_magic_string_) + sizeof(bundle_cnt_)];
|
||||
};
|
||||
std::vector<Bundled_code> bundles_;
|
||||
|
||||
// FRIENDS - MANIPULATORS
|
||||
template<typename RandomAccessIterator>
|
||||
friend
|
||||
inline
|
||||
bool read(
|
||||
RandomAccessIterator f,
|
||||
RandomAccessIterator l,
|
||||
Bundled_code_header& x)
|
||||
{
|
||||
std::copy_n(f, sizeof(x.cbuf_), x.cbuf_);
|
||||
|
||||
if (valid(x)) {
|
||||
x.bundles_.resize(x.bundle_cnt_);
|
||||
|
||||
auto it = f + sizeof(x.cbuf_);
|
||||
for (auto&& y : x.bundles_) {
|
||||
std::copy_n(it, sizeof(y.cbuf), y.cbuf);
|
||||
it += sizeof(y.cbuf);
|
||||
|
||||
y.triple.insert(y.triple.cend(), it, it + y.triple_sz);
|
||||
|
||||
std::copy_n(
|
||||
f + y.offset, y.bundle_sz, std::back_inserter(y.blob));
|
||||
|
||||
it += y.triple_sz;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
friend
|
||||
inline
|
||||
bool read(const std::vector<std::uint8_t>& blob, Bundled_code_header& x)
|
||||
{
|
||||
return read(blob.cbegin(), blob.cend(), x);
|
||||
}
|
||||
friend
|
||||
inline
|
||||
bool read(std::istream& is, Bundled_code_header& x)
|
||||
{
|
||||
return read(std::vector<std::uint8_t>{
|
||||
std::istreambuf_iterator<char>{is},
|
||||
std::istreambuf_iterator<char>{}},
|
||||
x);
|
||||
}
|
||||
|
||||
// FRIENDS - ACCESSORS
|
||||
friend
|
||||
inline
|
||||
bool valid(const Bundled_code_header& x)
|
||||
{
|
||||
return std::equal(
|
||||
x.bundler_magic_string_,
|
||||
x.bundler_magic_string_ + magic_string_sz_,
|
||||
x.magic_string_);
|
||||
}
|
||||
friend
|
||||
inline
|
||||
const std::vector<Bundled_code>& bundles(const Bundled_code_header& x)
|
||||
{
|
||||
return x.bundles_;
|
||||
}
|
||||
public:
|
||||
// CREATORS
|
||||
Bundled_code_header() = default;
|
||||
template<typename RandomAccessIterator>
|
||||
Bundled_code_header(RandomAccessIterator f, RandomAccessIterator l);
|
||||
explicit
|
||||
Bundled_code_header(const std::vector<std::uint8_t>& blob);
|
||||
Bundled_code_header(const Bundled_code_header&) = default;
|
||||
Bundled_code_header(Bundled_code_header&&) = default;
|
||||
~Bundled_code_header() = default;
|
||||
|
||||
// MANIPULATORS
|
||||
Bundled_code_header& operator=(const Bundled_code_header&) = default;
|
||||
Bundled_code_header& operator=(Bundled_code_header&&) = default;
|
||||
};
|
||||
|
||||
// CREATORS
|
||||
template<typename I>
|
||||
Bundled_code_header::Bundled_code_header(I f, I l) : Bundled_code_header{}
|
||||
{
|
||||
read(f, l, *this);
|
||||
}
|
||||
} // Namespace hip_impl.
|
||||
La diferencia del archivo ha sido suprimido porque es demasiado grande
Cargar Diff
@@ -53,7 +53,7 @@ THE SOFTWARE.
|
||||
// define HIP_ENABLE_PRINTF to enable printf
|
||||
#ifdef HIP_ENABLE_PRINTF
|
||||
#define HCC_ENABLE_ACCELERATOR_PRINTF 1
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//---
|
||||
// Remainder of this file only compiles with HCC
|
||||
@@ -481,7 +481,7 @@ do {\
|
||||
type* var = \
|
||||
(type*)__get_dynamicgroupbaseptr(); \
|
||||
|
||||
#define HIP_DYNAMIC_SHARED_ATTRIBUTE
|
||||
#define HIP_DYNAMIC_SHARED_ATTRIBUTE
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -44,7 +44,8 @@ THE SOFTWARE.
|
||||
#if GENERIC_GRID_LAUNCH == 0
|
||||
#define __global__ __attribute__((hc_grid_launch)) __attribute__((used))
|
||||
#else
|
||||
#define __global__ __attribute__((annotate("hip__global__"), hc, used, weak))
|
||||
#define __global__ \
|
||||
__attribute__((annotate("__HIP_global_function__"), cpu, hc, used))
|
||||
#endif //GENERIC_GRID_LAUNCH
|
||||
|
||||
#define __noinline__ __attribute__((noinline))
|
||||
|
||||
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <hsa/hsa.h>
|
||||
|
||||
#include <cstddef>
|
||||
#include <istream>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
struct ihipModuleSymbol_t;
|
||||
using hipFunction_t = ihipModuleSymbol_t*;
|
||||
|
||||
namespace hip_impl
|
||||
{
|
||||
struct Kernel_descriptor {
|
||||
std::uint64_t kernel_object_;
|
||||
std::uint32_t group_size_;
|
||||
std::uint32_t private_size_;
|
||||
std::string name_;
|
||||
|
||||
operator hipFunction_t() const
|
||||
{ // TODO: this is awful and only meant for illustration.
|
||||
return reinterpret_cast<hipFunction_t>(
|
||||
const_cast<Kernel_descriptor*>(this));
|
||||
}
|
||||
};
|
||||
|
||||
const std::unordered_map<
|
||||
std::uintptr_t,
|
||||
std::vector<std::pair<hsa_agent_t, Kernel_descriptor>>>& functions();
|
||||
const std::unordered_map<std::uintptr_t, std::string>& function_names();
|
||||
|
||||
hsa_executable_t load_executable(
|
||||
hsa_executable_t executable, hsa_agent_t agent, std::istream& file);
|
||||
} // Namespace hip_impl.
|
||||
@@ -0,0 +1,39 @@
|
||||
#include "../include/hip/hcc_detail/code_object_bundle.hpp"
|
||||
|
||||
#include <hsa/hsa.h>
|
||||
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
hsa_isa_t hip_impl::triple_to_hsa_isa(const std::string& triple)
|
||||
{
|
||||
static constexpr const char prefix[] = "hcc-amdgcn--amdhsa-gfx";
|
||||
static constexpr std::size_t prefix_sz = sizeof(prefix) - 1;
|
||||
|
||||
hsa_isa_t r = {};
|
||||
|
||||
auto idx = triple.find(prefix);
|
||||
|
||||
if (idx != std::string::npos) {
|
||||
idx += prefix_sz;
|
||||
std::string tmp = "AMD:AMDGPU";
|
||||
while (idx != triple.size()) {
|
||||
tmp.push_back(':');
|
||||
tmp.push_back(triple[idx++]);
|
||||
}
|
||||
|
||||
hsa_isa_from_name(tmp.c_str(), &r);
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
// DATA - STATICS
|
||||
constexpr const char hip_impl::Bundled_code_header::magic_string_[];
|
||||
|
||||
// CREATORS
|
||||
hip_impl::Bundled_code_header::Bundled_code_header(
|
||||
const std::vector<std::uint8_t>& x)
|
||||
: Bundled_code_header{x.cbegin(), x.cend()}
|
||||
{}
|
||||
@@ -21,76 +21,118 @@ THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "hip/hcc_detail/grid_launch_GGL.hpp"
|
||||
#include "hip/hcc_detail/program_state.hpp"
|
||||
|
||||
#include "hip/hip_runtime_api.h"
|
||||
|
||||
// Internal header, do not percolate upwards.
|
||||
#include "hip_hcc_internal.h"
|
||||
#include "hc.hpp"
|
||||
#include "trace_helper.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <stdexcept>
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
|
||||
using namespace hc;
|
||||
using namespace std;
|
||||
|
||||
namespace hip_impl
|
||||
{
|
||||
hc::accelerator_view lock_stream_hip_(
|
||||
hipStream_t& stream, void*& locked_stream)
|
||||
{ // This allocated but does not take ownership of locked_stream. If it is
|
||||
// not deleted elsewhere it will leak.
|
||||
using L = decltype(stream->lockopen_preKernelCommand());
|
||||
|
||||
HIP_INIT();
|
||||
|
||||
stream = ihipSyncAndResolveStream(stream);
|
||||
locked_stream = new L{stream->lockopen_preKernelCommand()};
|
||||
return (*static_cast<L*>(locked_stream))->_av;
|
||||
}
|
||||
|
||||
void print_prelaunch_trace_(
|
||||
const char* kernel_name,
|
||||
dim3 num_blocks,
|
||||
dim3 dim_blocks,
|
||||
int group_mem_bytes,
|
||||
hipStream_t stream)
|
||||
namespace
|
||||
{
|
||||
if ((HIP_TRACE_API & (1 << TRACE_KCMD)) ||
|
||||
HIP_PROFILE_API ||
|
||||
(COMPILE_HIP_DB && (HIP_TRACE_API & (1<<TRACE_ALL)))) {
|
||||
std::stringstream os;
|
||||
os << tls_tidInfo.tid() << "." << tls_tidInfo.apiSeqNum()
|
||||
<< " hipLaunchKernel '" << kernel_name << "'"
|
||||
<< " gridDim:" << num_blocks
|
||||
<< " groupDim:" << dim_blocks
|
||||
<< " sharedMem:+" << group_mem_bytes
|
||||
<< " " << *stream;
|
||||
inline
|
||||
string name(uintptr_t function_address)
|
||||
{
|
||||
const auto it = function_names().find(function_address);
|
||||
|
||||
if (HIP_PROFILE_API == 0x1) {
|
||||
std::string shortAtpString("hipLaunchKernel:");
|
||||
shortAtpString += kernel_name;
|
||||
MARKER_BEGIN(shortAtpString.c_str(), "HIP");
|
||||
} else if (HIP_PROFILE_API == 0x2) {
|
||||
MARKER_BEGIN(os.str().c_str(), "HIP");
|
||||
if (it == function_names().cend()) {
|
||||
throw runtime_error{
|
||||
"Invalid function passed to hipLaunchKernelGGL."};
|
||||
}
|
||||
|
||||
if (COMPILE_HIP_DB && HIP_TRACE_API) {
|
||||
std::string fullStr;
|
||||
recordApiTrace(&fullStr, os.str());
|
||||
return it->second;
|
||||
}
|
||||
|
||||
inline
|
||||
string name(hsa_agent_t agent)
|
||||
{
|
||||
char n[64] = {};
|
||||
hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, n);
|
||||
|
||||
return string{n};
|
||||
}
|
||||
|
||||
inline
|
||||
hsa_agent_t target_agent(hipStream_t stream)
|
||||
{
|
||||
if (stream) {
|
||||
return *static_cast<hsa_agent_t*>(
|
||||
stream->locked_getAv()->get_hsa_agent());
|
||||
}
|
||||
else if (
|
||||
ihipGetTlsDefaultCtx() && ihipGetTlsDefaultCtx()->getDevice()) {
|
||||
return ihipGetDevice(
|
||||
ihipGetTlsDefaultCtx()->getDevice()->_deviceId)->_hsaAgent;
|
||||
}
|
||||
else {
|
||||
return *static_cast<hsa_agent_t*>(
|
||||
accelerator{}.get_default_view().get_hsa_agent());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void unlock_stream_hip_(
|
||||
void hipLaunchKernelGGLImpl(
|
||||
uintptr_t function_address,
|
||||
const dim3& numBlocks,
|
||||
const dim3& dimBlocks,
|
||||
uint32_t sharedMemBytes,
|
||||
hipStream_t stream,
|
||||
void* locked_stream,
|
||||
const char* kernel_name,
|
||||
hc::accelerator_view* acc_v)
|
||||
{ // Precondition: acc_v is the accelerator_view associated with stream
|
||||
// which is guarded by locked_stream;
|
||||
// locked_stream is deletable.
|
||||
using L = decltype(stream->lockopen_preKernelCommand());
|
||||
void** kernarg)
|
||||
{
|
||||
const auto it0 = functions().find(function_address);
|
||||
|
||||
stream->lockclose_postKernelCommand(kernel_name, acc_v);
|
||||
if (it0 == functions().cend()) {
|
||||
throw runtime_error{
|
||||
"No device code available for function: " +
|
||||
name(function_address)
|
||||
};
|
||||
}
|
||||
|
||||
delete static_cast<L*>(locked_stream);
|
||||
locked_stream = nullptr;
|
||||
auto agent = target_agent(stream);
|
||||
|
||||
const auto it1 = find_if(
|
||||
it0->second.cbegin(),
|
||||
it0->second.cend(),
|
||||
[=](const pair<hsa_agent_t, Kernel_descriptor>& x) {
|
||||
return x.first.handle == agent.handle;
|
||||
});
|
||||
|
||||
if (it1 == it0->second.cend()) {
|
||||
throw runtime_error{
|
||||
"No code available for function: " + name(function_address) +
|
||||
", for agent: " + name(agent)
|
||||
};
|
||||
}
|
||||
|
||||
for (auto&& agent_kernel : it0->second) {
|
||||
if (agent.handle == agent_kernel.first.handle) {
|
||||
hipModuleLaunchKernel(
|
||||
agent_kernel.second,
|
||||
numBlocks.x,
|
||||
numBlocks.y,
|
||||
numBlocks.z,
|
||||
dimBlocks.x,
|
||||
dimBlocks.y,
|
||||
dimBlocks.z,
|
||||
sharedMemBytes,
|
||||
stream,
|
||||
nullptr,
|
||||
kernarg);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -233,7 +233,7 @@ static const DbName dbName [] =
|
||||
#if COMPILE_HIP_DB
|
||||
#define tprintf(trace_level, ...) {\
|
||||
if (HIP_DB & (1<<(trace_level))) {\
|
||||
char msgStr[1000];\
|
||||
char msgStr[2000];\
|
||||
snprintf(msgStr, 2000, __VA_ARGS__);\
|
||||
fprintf (stderr, " %ship-%s tid:%d:%s%s", dbName[trace_level]._color, dbName[trace_level]._shortName, tls_tidInfo.tid(), msgStr, KNRM); \
|
||||
}\
|
||||
|
||||
@@ -65,7 +65,7 @@ int sharePtr(void *ptr, ihipCtx_t *ctx, bool shareWithAll, unsigned hipFlags)
|
||||
|
||||
if (shareWithAll) {
|
||||
hsa_status_t s = hsa_amd_agents_allow_access(g_deviceCnt+1, g_allAgents, NULL, ptr);
|
||||
tprintf (DB_MEM, " allow access to CPU + all %d GPUs (shareWithAll)\n", g_deviceCnt);
|
||||
tprintf (DB_MEM, " allow access to CPU + all %d GPUs (shareWithAll)\n", g_deviceCnt);
|
||||
if (s != HSA_STATUS_SUCCESS) {
|
||||
ret = -1;
|
||||
}
|
||||
@@ -122,7 +122,7 @@ void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, bool
|
||||
if (HIP_INIT_ALLOC != -1) {
|
||||
// TODO , dont' call HIP API directly here:
|
||||
hipMemset(ptr, HIP_INIT_ALLOC, sizeBytes);
|
||||
}
|
||||
}
|
||||
|
||||
if (ptr != nullptr) {
|
||||
int r = sharePtr(ptr, ctx, shareWithAll, hipFlags);
|
||||
@@ -251,7 +251,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes)
|
||||
hip_status = hipErrorMemoryAllocation;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return ihipLogStatus(hip_status);
|
||||
@@ -284,10 +284,10 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
|
||||
}
|
||||
|
||||
|
||||
const unsigned supportedFlags = hipHostMallocPortable
|
||||
| hipHostMallocMapped
|
||||
| hipHostMallocWriteCombined
|
||||
| hipHostMallocCoherent
|
||||
const unsigned supportedFlags = hipHostMallocPortable
|
||||
| hipHostMallocMapped
|
||||
| hipHostMallocWriteCombined
|
||||
| hipHostMallocCoherent
|
||||
| hipHostMallocNonCoherent;
|
||||
|
||||
|
||||
@@ -300,7 +300,7 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
|
||||
hip_status = hipErrorInvalidValue;
|
||||
} else {
|
||||
auto device = ctx->getWriteableDevice();
|
||||
|
||||
|
||||
unsigned amFlags = 0;
|
||||
if (flags & hipHostMallocCoherent) {
|
||||
amFlags = amHostCoherent;
|
||||
@@ -581,7 +581,7 @@ hipError_t hipMalloc3DArray(hipArray_t *array,
|
||||
hsa_ext_image_data_info_t imageInfo;
|
||||
hsa_status_t status = hsa_ext_image_data_get_info(*agent, &imageDescriptor, permission, &imageInfo);
|
||||
size_t alignment = imageInfo.alignment <= allocGranularity ? 0 : imageInfo.alignment;
|
||||
|
||||
|
||||
*ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, false, am_flags, 0, alignment);
|
||||
|
||||
if (size && (*ptr == NULL)) {
|
||||
@@ -1585,7 +1585,7 @@ hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr){
|
||||
HIP_INIT_API ( handle, devPtr);
|
||||
hipError_t hipStatus = hipSuccess;
|
||||
// Get the size of allocated pointer
|
||||
size_t psize;
|
||||
size_t psize = 0u;
|
||||
hc::accelerator acc;
|
||||
if((handle == NULL) || (devPtr == NULL)) {
|
||||
hipStatus = hipErrorInvalidResourceHandle;
|
||||
|
||||
@@ -119,15 +119,18 @@ namespace hipdrv {
|
||||
uint64_t PrintSymbolSizes(const void *emi, const char *name){
|
||||
using namespace ELFIO;
|
||||
|
||||
const Elf64_Ehdr *ehdr = (const Elf64_Ehdr*)emi;
|
||||
const ELFIO::Elf64_Ehdr *ehdr = (const ELFIO::Elf64_Ehdr*)emi;
|
||||
if(NULL == ehdr || EV_CURRENT != ehdr->e_version){}
|
||||
const Elf64_Shdr * shdr = (const Elf64_Shdr*)((char*)emi + ehdr->e_shoff);
|
||||
const ELFIO::Elf64_Shdr * shdr =
|
||||
(const ELFIO::Elf64_Shdr*)((char*)emi + ehdr->e_shoff);
|
||||
for(uint16_t i=0;i<ehdr->e_shnum;++i){
|
||||
if(shdr[i].sh_type == SHT_SYMTAB){
|
||||
const Elf64_Sym *syms = (const Elf64_Sym*)((char*)emi + shdr[i].sh_offset);
|
||||
const ELFIO::Elf64_Sym *syms =
|
||||
(const ELFIO::Elf64_Sym*)((char*)emi + shdr[i].sh_offset);
|
||||
assert(syms);
|
||||
uint64_t numSyms = shdr[i].sh_size/shdr[i].sh_entsize;
|
||||
const char* strtab = (const char*)((char*)emi + shdr[shdr[i].sh_link].sh_offset);
|
||||
const char* strtab =
|
||||
(const char*)((char*)emi + shdr[shdr[i].sh_link].sh_offset);
|
||||
assert(strtab);
|
||||
for(uint64_t i=0;i<numSyms;++i){
|
||||
const char *symname = strtab + syms[i].st_name;
|
||||
@@ -145,8 +148,8 @@ uint64_t PrintSymbolSizes(const void *emi, const char *name){
|
||||
uint64_t ElfSize(const void *emi){
|
||||
using namespace ELFIO;
|
||||
|
||||
const Elf64_Ehdr *ehdr = (const Elf64_Ehdr*)emi;
|
||||
const Elf64_Shdr *shdr = (const Elf64_Shdr*)((char*)emi + ehdr->e_shoff);
|
||||
const ELFIO::Elf64_Ehdr *ehdr = (const ELFIO::Elf64_Ehdr*)emi;
|
||||
const ELFIO::Elf64_Shdr *shdr = (const ELFIO::Elf64_Shdr*)((char*)emi + ehdr->e_shoff);
|
||||
|
||||
uint64_t max_offset = ehdr->e_shoff;
|
||||
uint64_t total_size = max_offset + ehdr->e_shentsize * ehdr->e_shnum;
|
||||
@@ -164,156 +167,8 @@ uint64_t ElfSize(const void *emi){
|
||||
return total_size;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
template<typename P>
|
||||
inline
|
||||
ELFIO::section* find_section_if(ELFIO::elfio& reader, P p)
|
||||
{
|
||||
using namespace std;
|
||||
|
||||
const auto it = find_if(
|
||||
reader.sections.begin(), reader.sections.end(), move(p));
|
||||
|
||||
return it != reader.sections.end() ? *it : nullptr;
|
||||
}
|
||||
|
||||
inline
|
||||
std::vector<std::string> copy_names_of_undefined_symbols(
|
||||
const ELFIO::symbol_section_accessor& section)
|
||||
{
|
||||
using namespace ELFIO;
|
||||
using namespace std;
|
||||
|
||||
vector<string> r;
|
||||
|
||||
for (auto i = 0u; i != section.get_symbols_num(); ++i) {
|
||||
// TODO: this is boyscout code, caching the temporaries
|
||||
// may be of worth.
|
||||
string name;
|
||||
Elf64_Addr value = 0;
|
||||
Elf_Xword size = 0;
|
||||
Elf_Half sect_idx = 0;
|
||||
uint8_t bind = 0;
|
||||
uint8_t type = 0;
|
||||
uint8_t other = 0;
|
||||
|
||||
section.get_symbol(
|
||||
i, name, value, size, bind, type, sect_idx, other);
|
||||
|
||||
if (sect_idx == SHN_UNDEF && !name.empty()) {
|
||||
r.push_back(std::move(name));
|
||||
}
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
std::pair<ELFIO::Elf64_Addr, ELFIO::Elf_Xword> find_symbol_address(
|
||||
const ELFIO::symbol_section_accessor& section,
|
||||
const std::string& symbol_name)
|
||||
{
|
||||
using namespace ELFIO;
|
||||
using namespace std;
|
||||
|
||||
static constexpr pair<Elf64_Addr, Elf_Xword> r{0, 0};
|
||||
|
||||
for (auto i = 0u; i != section.get_symbols_num(); ++i) {
|
||||
// TODO: this is boyscout code, caching the temporaries
|
||||
// may be of worth.
|
||||
string name;
|
||||
Elf64_Addr value = 0;
|
||||
Elf_Xword size = 0;
|
||||
Elf_Half sect_idx = 0;
|
||||
uint8_t bind = 0;
|
||||
uint8_t type = 0;
|
||||
uint8_t other = 0;
|
||||
|
||||
section.get_symbol(
|
||||
i, name, value, size, bind, type, sect_idx, other);
|
||||
|
||||
if (name == symbol_name) return make_pair(value, size);
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
void associate_code_object_symbols_with_host_allocation(
|
||||
const ELFIO::elfio& reader,
|
||||
const ELFIO::elfio& self_reader,
|
||||
ELFIO::section* code_object_dynsym,
|
||||
ELFIO::section* process_symtab,
|
||||
hsa_agent_t agent,
|
||||
hsa_executable_t executable)
|
||||
{
|
||||
using namespace ELFIO;
|
||||
using namespace std;
|
||||
|
||||
if (!code_object_dynsym || !process_symtab) return;
|
||||
|
||||
const auto undefined_symbols = copy_names_of_undefined_symbols(
|
||||
symbol_section_accessor{reader, code_object_dynsym});
|
||||
|
||||
for (auto&& x : undefined_symbols) {
|
||||
const auto tmp = find_symbol_address(
|
||||
symbol_section_accessor{self_reader, process_symtab}, x);
|
||||
|
||||
assert(tmp.first);
|
||||
|
||||
void* p = nullptr;
|
||||
hsa_amd_memory_lock(
|
||||
reinterpret_cast<void*>(tmp.first), tmp.second, &agent, 1, &p);
|
||||
|
||||
hsa_executable_agent_global_variable_define(
|
||||
executable, agent, x.c_str(), p);
|
||||
|
||||
static vector<
|
||||
unique_ptr<void, decltype(hsa_amd_memory_unlock)*>> globals;
|
||||
static mutex mtx;
|
||||
|
||||
lock_guard<std::mutex> lck{mtx};
|
||||
globals.emplace_back(p, hsa_amd_memory_unlock);
|
||||
}
|
||||
}
|
||||
|
||||
inline
|
||||
void load_code_object_and_freeze_executable(
|
||||
const char* file, hsa_agent_t agent, hsa_executable_t executable)
|
||||
{ // TODO: the following sequence is inefficient, should be refactored
|
||||
// into a single load of the file and subsequent ELFIO
|
||||
// processing.
|
||||
using namespace std;
|
||||
|
||||
static const auto cor_deleter = [](hsa_code_object_reader_t* p) {
|
||||
hsa_code_object_reader_destroy(*p);
|
||||
};
|
||||
|
||||
using RAII_code_reader = unique_ptr<
|
||||
hsa_code_object_reader_t, decltype(cor_deleter)>;
|
||||
|
||||
unique_ptr<FILE, decltype(fclose)*> cobj{fopen(file, "r"), fclose};
|
||||
RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter};
|
||||
hsa_code_object_reader_create_from_file(fileno(cobj.get()), tmp.get());
|
||||
|
||||
hsa_executable_load_agent_code_object(
|
||||
executable, agent, *tmp, nullptr, nullptr);
|
||||
|
||||
hsa_executable_freeze(executable, nullptr);
|
||||
|
||||
static vector<RAII_code_reader> code_readers;
|
||||
static mutex mtx;
|
||||
|
||||
lock_guard<mutex> lck{mtx};
|
||||
code_readers.push_back(move(tmp));
|
||||
}
|
||||
}
|
||||
|
||||
hipError_t hipModuleLoad(hipModule_t *module, const char *fname)
|
||||
{
|
||||
using namespace ELFIO;
|
||||
|
||||
HIP_INIT_API(module, fname);
|
||||
hipError_t ret = hipSuccess;
|
||||
*module = new ihipModule_t;
|
||||
@@ -336,36 +191,14 @@ hipError_t hipModuleLoad(hipModule_t *module, const char *fname)
|
||||
nullptr,
|
||||
&(*module)->executable);
|
||||
|
||||
elfio reader;
|
||||
if (!reader.load(fname)) {
|
||||
std::ifstream file{fname};
|
||||
|
||||
if (!file.is_open()) {
|
||||
return ihipLogStatus(hipErrorFileNotFound);
|
||||
}
|
||||
else {
|
||||
// TODO: this may benefit from caching as well.
|
||||
elfio self_reader;
|
||||
self_reader.load("/proc/self/exe");
|
||||
|
||||
const auto symtab =
|
||||
find_section_if(self_reader, [](const ELFIO::section* x) {
|
||||
return x->get_type() == SHT_SYMTAB;
|
||||
});
|
||||
|
||||
const auto code_object_dynsym =
|
||||
find_section_if(reader, [](const ELFIO::section* x) {
|
||||
return x->get_type() == SHT_DYNSYM;
|
||||
});
|
||||
|
||||
associate_code_object_symbols_with_host_allocation(
|
||||
reader,
|
||||
self_reader,
|
||||
code_object_dynsym,
|
||||
symtab,
|
||||
currentDevice->_hsaAgent,
|
||||
(*module)->executable);
|
||||
|
||||
load_code_object_and_freeze_executable(
|
||||
fname, currentDevice->_hsaAgent, (*module)->executable);
|
||||
}
|
||||
(*module)->executable = hip_impl::load_executable(
|
||||
(*module)->executable, currentDevice->_hsaAgent, file);
|
||||
ret = (*module)->executable.handle ? hipSuccess : hipErrorUnknown;
|
||||
}
|
||||
|
||||
return ihipLogStatus(ret);
|
||||
|
||||
@@ -0,0 +1,498 @@
|
||||
#include "../include/hip/hcc_detail/program_state.hpp"
|
||||
|
||||
#include "../include/hip/hcc_detail/code_object_bundle.hpp"
|
||||
|
||||
#include "hip_hcc_internal.h"
|
||||
#include "trace_helper.h"
|
||||
|
||||
#include "elfio/elfio.hpp"
|
||||
|
||||
#include <link.h>
|
||||
|
||||
#include <hsa/hsa.h>
|
||||
#include <hsa/hsa_ext_amd.h>
|
||||
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
using namespace ELFIO;
|
||||
using namespace hip_impl;
|
||||
using namespace std;
|
||||
|
||||
namespace std
|
||||
{
|
||||
template<>
|
||||
struct hash<hsa_agent_t> {
|
||||
size_t operator()(hsa_agent_t x) const
|
||||
{
|
||||
return hash<decltype(x.handle)>{}(x.handle);
|
||||
}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct hash<hsa_isa_t> {
|
||||
size_t operator()(hsa_isa_t x) const
|
||||
{
|
||||
return hash<decltype(x.handle)>{}(x.handle);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
inline
|
||||
constexpr
|
||||
bool operator==(hsa_agent_t x, hsa_agent_t y)
|
||||
{
|
||||
return x.handle == y.handle;
|
||||
}
|
||||
|
||||
inline
|
||||
constexpr
|
||||
bool operator==(hsa_isa_t x, hsa_isa_t y)
|
||||
{
|
||||
return x.handle == y.handle;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
vector<string> copy_names_of_undefined_symbols(
|
||||
const symbol_section_accessor& section)
|
||||
{
|
||||
vector<string> r;
|
||||
|
||||
for (auto i = 0u; i != section.get_symbols_num(); ++i) {
|
||||
// TODO: this is boyscout code, caching the temporaries
|
||||
// may be of worth.
|
||||
string name;
|
||||
Elf64_Addr value = 0;
|
||||
Elf_Xword size = 0;
|
||||
Elf_Half sect_idx = 0;
|
||||
uint8_t bind = 0;
|
||||
uint8_t type = 0;
|
||||
uint8_t other = 0;
|
||||
|
||||
section.get_symbol(
|
||||
i, name, value, size, bind, type, sect_idx, other);
|
||||
|
||||
if (sect_idx == SHN_UNDEF && !name.empty()) {
|
||||
r.push_back(std::move(name));
|
||||
}
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
pair<Elf64_Addr, Elf_Xword> find_symbol_address(
|
||||
const symbol_section_accessor& section,
|
||||
const string& symbol_name)
|
||||
{
|
||||
static constexpr pair<Elf64_Addr, Elf_Xword> r{0, 0};
|
||||
|
||||
for (auto i = 0u; i != section.get_symbols_num(); ++i) {
|
||||
// TODO: this is boyscout code, caching the temporaries
|
||||
// may be of worth.
|
||||
string name;
|
||||
Elf64_Addr value = 0;
|
||||
Elf_Xword size = 0;
|
||||
Elf_Half sect_idx = 0;
|
||||
uint8_t bind = 0;
|
||||
uint8_t type = 0;
|
||||
uint8_t other = 0;
|
||||
|
||||
section.get_symbol(
|
||||
i, name, value, size, bind, type, sect_idx, other);
|
||||
|
||||
if (name == symbol_name) return make_pair(value, size);
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
void associate_code_object_symbols_with_host_allocation(
|
||||
const elfio& reader,
|
||||
const elfio& self_reader,
|
||||
section* code_object_dynsym,
|
||||
section* process_symtab,
|
||||
hsa_agent_t agent,
|
||||
hsa_executable_t executable)
|
||||
{
|
||||
if (!code_object_dynsym || !process_symtab) return;
|
||||
|
||||
const auto undefined_symbols = copy_names_of_undefined_symbols(
|
||||
symbol_section_accessor{reader, code_object_dynsym});
|
||||
|
||||
for (auto&& x : undefined_symbols) {
|
||||
const auto tmp = find_symbol_address(
|
||||
symbol_section_accessor{self_reader, process_symtab}, x);
|
||||
|
||||
assert(tmp.first);
|
||||
|
||||
void* p = nullptr;
|
||||
hsa_amd_memory_lock(
|
||||
reinterpret_cast<void*>(tmp.first), tmp.second, &agent, 1, &p);
|
||||
|
||||
hsa_executable_agent_global_variable_define(
|
||||
executable, agent, x.c_str(), p);
|
||||
|
||||
static vector<
|
||||
unique_ptr<void, decltype(hsa_amd_memory_unlock)*>> globals;
|
||||
static mutex mtx;
|
||||
|
||||
lock_guard<std::mutex> lck{mtx};
|
||||
globals.emplace_back(p, hsa_amd_memory_unlock);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename P>
|
||||
inline
|
||||
section* find_section_if(elfio& reader, P p)
|
||||
{
|
||||
const auto it = find_if(
|
||||
reader.sections.begin(), reader.sections.end(), std::move(p));
|
||||
|
||||
return it != reader.sections.end() ? *it : nullptr;
|
||||
}
|
||||
|
||||
vector<uint8_t> code_object_blob_for_process()
|
||||
{
|
||||
static constexpr const char self[] = "/proc/self/exe";
|
||||
static constexpr const char kernel_section[] = ".kernel";
|
||||
|
||||
elfio reader;
|
||||
|
||||
if (!reader.load(self)) {
|
||||
throw runtime_error{"Failed to load ELF file for current process."};
|
||||
}
|
||||
|
||||
auto kernels = find_section_if(reader, [](const section* x) {
|
||||
return x->get_name() == kernel_section;
|
||||
});
|
||||
|
||||
vector<uint8_t> r;
|
||||
if (kernels) {
|
||||
r.insert(
|
||||
r.end(),
|
||||
kernels->get_data(),
|
||||
kernels->get_data() + kernels->get_size());
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
const unordered_map<hsa_isa_t, vector<vector<uint8_t>>>& code_object_blobs()
|
||||
{
|
||||
static unordered_map<hsa_isa_t, vector<vector<uint8_t>>> r;
|
||||
static once_flag f;
|
||||
|
||||
call_once(f, []() {
|
||||
static vector<vector<uint8_t>> blobs{
|
||||
code_object_blob_for_process()};
|
||||
|
||||
dl_iterate_phdr([](dl_phdr_info* i, std::size_t, void*) {
|
||||
elfio tmp;
|
||||
if (tmp.load(i->dlpi_name)) {
|
||||
const auto it = find_section_if(tmp, [](const section* x) {
|
||||
return x->get_name() == ".kernel";
|
||||
});
|
||||
|
||||
if (it) blobs.emplace_back(
|
||||
it->get_data(), it->get_data() + it->get_size());
|
||||
}
|
||||
return 0;
|
||||
}, nullptr);
|
||||
|
||||
for (auto&& blob : blobs) {
|
||||
Bundled_code_header tmp{blob};
|
||||
if (valid(tmp)) {
|
||||
for (auto&& bundle : bundles(tmp)) {
|
||||
r[triple_to_hsa_isa(bundle.triple)]
|
||||
.push_back(bundle.blob);
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
const unordered_map<hsa_agent_t, vector<hsa_executable_t>>& executables()
|
||||
{
|
||||
static unordered_map<hsa_agent_t, vector<hsa_executable_t>> r;
|
||||
static once_flag f;
|
||||
|
||||
call_once(f, []() {
|
||||
static const auto accelerators = hc::accelerator::get_all();
|
||||
|
||||
for (auto&& acc : accelerators) {
|
||||
auto agent = static_cast<hsa_agent_t*>(acc.get_hsa_agent());
|
||||
|
||||
if (!agent) continue;
|
||||
|
||||
hsa_agent_iterate_isas(*agent, [](hsa_isa_t x, void* pa) {
|
||||
const auto it = code_object_blobs().find(x);
|
||||
|
||||
if (it != code_object_blobs().cend()) {
|
||||
hsa_agent_t a = *static_cast<hsa_agent_t*>(pa);
|
||||
|
||||
for (auto&& blob : it->second) {
|
||||
hsa_executable_t tmp = {};
|
||||
|
||||
hsa_executable_create_alt(
|
||||
HSA_PROFILE_FULL,
|
||||
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
|
||||
nullptr,
|
||||
&tmp);
|
||||
|
||||
// TODO: this is massively inefficient and only
|
||||
// meant for illustration.
|
||||
string blob_to_str{blob.cbegin(), blob.cend()};
|
||||
stringstream istr{blob_to_str};
|
||||
tmp = load_executable(tmp, a, istr);
|
||||
|
||||
if (tmp.handle) r[a].push_back(tmp);
|
||||
}
|
||||
}
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}, agent);
|
||||
}
|
||||
});
|
||||
|
||||
cout << r.size() << endl;
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
hsa_agent_t agent(hsa_executable_symbol_t x)
|
||||
{
|
||||
hsa_agent_t r = {};
|
||||
hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_AGENT, &r);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
uint32_t group_size(hsa_executable_symbol_t x)
|
||||
{
|
||||
uint32_t r = 0u;
|
||||
hsa_executable_symbol_get_info(
|
||||
x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &r);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
uint64_t kernel_object(hsa_executable_symbol_t x)
|
||||
{
|
||||
uint64_t r = 0u;
|
||||
hsa_executable_symbol_get_info(
|
||||
x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &r);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
string name(hsa_executable_symbol_t x)
|
||||
{
|
||||
uint32_t sz = 0u;
|
||||
hsa_executable_symbol_get_info(
|
||||
x, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &sz);
|
||||
|
||||
string r(sz, '\0');
|
||||
hsa_executable_symbol_get_info(
|
||||
x, HSA_EXECUTABLE_SYMBOL_INFO_NAME, &r.front());
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
uint32_t private_size(hsa_executable_symbol_t x)
|
||||
{
|
||||
uint32_t r = 0u;
|
||||
hsa_executable_symbol_get_info(
|
||||
x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &r);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
hsa_symbol_kind_t type(hsa_executable_symbol_t x)
|
||||
{
|
||||
hsa_symbol_kind_t r = {};
|
||||
hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &r);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
const unordered_map<string, vector<hsa_executable_symbol_t>>& kernels()
|
||||
{
|
||||
static unordered_map<string, vector<hsa_executable_symbol_t>> r;
|
||||
static once_flag f;
|
||||
|
||||
call_once(f, []() {
|
||||
static const auto copy_kernels = [](
|
||||
hsa_executable_t, hsa_agent_t, hsa_executable_symbol_t s, void*) {
|
||||
if (type(s) == HSA_SYMBOL_KIND_KERNEL) r[name(s)].push_back(s);
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
};
|
||||
|
||||
for (auto&& agent_executables : executables()) {
|
||||
for (auto&& executable : agent_executables.second) {
|
||||
hsa_executable_iterate_agent_symbols(
|
||||
executable,
|
||||
agent_executables.first,
|
||||
copy_kernels,
|
||||
nullptr);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
void load_code_object_and_freeze_executable(
|
||||
istream& file, hsa_agent_t agent, hsa_executable_t executable)
|
||||
{ // TODO: the following sequence is inefficient, should be refactored
|
||||
// into a single load of the file and subsequent ELFIO
|
||||
// processing.
|
||||
static const auto cor_deleter = [](hsa_code_object_reader_t* p) {
|
||||
hsa_code_object_reader_destroy(*p);
|
||||
};
|
||||
|
||||
using RAII_code_reader = unique_ptr<
|
||||
hsa_code_object_reader_t, decltype(cor_deleter)>;
|
||||
|
||||
file.seekg(0);
|
||||
|
||||
vector<uint8_t> blob{
|
||||
istreambuf_iterator<char>{file}, istreambuf_iterator<char>{}};
|
||||
RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter};
|
||||
hsa_code_object_reader_create_from_memory(
|
||||
blob.data(), blob.size(), tmp.get());
|
||||
|
||||
hsa_executable_load_agent_code_object(
|
||||
executable, agent, *tmp, nullptr, nullptr);
|
||||
|
||||
hsa_executable_freeze(executable, nullptr);
|
||||
|
||||
static vector<RAII_code_reader> code_readers;
|
||||
static mutex mtx;
|
||||
|
||||
lock_guard<mutex> lck{mtx};
|
||||
code_readers.push_back(move(tmp));
|
||||
}
|
||||
}
|
||||
|
||||
namespace hip_impl
|
||||
{
|
||||
const unordered_map<uintptr_t, string>& function_names()
|
||||
{
|
||||
static constexpr const char self[] = "/proc/self/exe";
|
||||
|
||||
static unordered_map<uintptr_t, string> r;
|
||||
static once_flag f;
|
||||
|
||||
call_once(f, []() {
|
||||
elfio reader;
|
||||
|
||||
if (!reader.load(self)) {
|
||||
throw runtime_error{
|
||||
"Failed to load the ELF file for the current process."};
|
||||
}
|
||||
|
||||
auto symtab = find_section_if(reader, [](const section* x) {
|
||||
return x->get_type() == SHT_SYMTAB;
|
||||
});
|
||||
|
||||
symbol_section_accessor symbols{reader, symtab};
|
||||
|
||||
for (auto i = 0u; i != symbols.get_symbols_num(); ++i) {
|
||||
// TODO: this is boyscout code, caching the temporaries
|
||||
// may be of worth.
|
||||
string name;
|
||||
Elf64_Addr value = 0;
|
||||
Elf_Xword size = 0;
|
||||
Elf_Half sect_idx = 0;
|
||||
uint8_t bind = 0;
|
||||
uint8_t type = 0;
|
||||
uint8_t other = 0;
|
||||
|
||||
symbols.get_symbol(
|
||||
i, name, value, size, bind, type, sect_idx, other);
|
||||
|
||||
if (type == STT_FUNC && sect_idx != SHN_UNDEF && !name.empty()) {
|
||||
r.emplace(value, name);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
const unordered_map<
|
||||
uintptr_t, vector<pair<hsa_agent_t, Kernel_descriptor>>>& functions()
|
||||
{
|
||||
static unordered_map<
|
||||
uintptr_t, vector<pair<hsa_agent_t, Kernel_descriptor>>> r;
|
||||
static once_flag f;
|
||||
|
||||
call_once(f, []() {
|
||||
for (auto&& function : function_names()) {
|
||||
const auto it = kernels().find(function.second);
|
||||
|
||||
if (it != kernels().cend()) {
|
||||
for (auto&& kernel_symbol : it->second) {
|
||||
r[function.first].emplace_back(
|
||||
agent(kernel_symbol),
|
||||
Kernel_descriptor{
|
||||
kernel_object(kernel_symbol),
|
||||
group_size(kernel_symbol),
|
||||
private_size(kernel_symbol),
|
||||
it->first});
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
hsa_executable_t load_executable(
|
||||
hsa_executable_t executable, hsa_agent_t agent, istream& file)
|
||||
{
|
||||
elfio reader;
|
||||
if (!reader.load(file)) {
|
||||
return hsa_executable_t{};
|
||||
}
|
||||
else {
|
||||
// TODO: this may benefit from caching as well.
|
||||
elfio self_reader;
|
||||
self_reader.load("/proc/self/exe");
|
||||
|
||||
const auto symtab =
|
||||
find_section_if(self_reader, [](const ELFIO::section* x) {
|
||||
return x->get_type() == SHT_SYMTAB;
|
||||
});
|
||||
|
||||
const auto code_object_dynsym =
|
||||
find_section_if(reader, [](const ELFIO::section* x) {
|
||||
return x->get_type() == SHT_DYNSYM;
|
||||
});
|
||||
|
||||
associate_code_object_symbols_with_host_allocation(
|
||||
reader, self_reader, code_object_dynsym, symtab, agent, executable);
|
||||
|
||||
load_code_object_and_freeze_executable(file, agent, executable);
|
||||
|
||||
return executable;
|
||||
}
|
||||
}
|
||||
} // Namespace hip_impl.
|
||||
@@ -46,7 +46,6 @@ int main(int argc, char *argv[])
|
||||
A_h = new char[Nbytes];
|
||||
|
||||
HIPCHECK ( hipMalloc((void **) &A_d, Nbytes) );
|
||||
A_h = (char*)malloc(Nbytes);
|
||||
|
||||
printf ("Size=%zu memsetval=%2x \n", Nbytes, memsetval);
|
||||
HIPCHECK ( hipMemsetD8(A_d, memsetval, Nbytes) );
|
||||
@@ -61,7 +60,7 @@ int main(int argc, char *argv[])
|
||||
}
|
||||
|
||||
hipFree((void *) A_d);
|
||||
free(A_h);
|
||||
delete [] A_h;
|
||||
passed();
|
||||
|
||||
}
|
||||
|
||||
@@ -139,7 +139,14 @@ for(int i=0;i<512;i++){
|
||||
passed = 1;
|
||||
}
|
||||
}
|
||||
free(A);
|
||||
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
delete [] C;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
hipFree(Cd);
|
||||
|
||||
if(passed == 1){
|
||||
return true;
|
||||
}
|
||||
@@ -174,7 +181,14 @@ for(int i=0;i<512;i++){
|
||||
passed = 1;
|
||||
}
|
||||
}
|
||||
free(A);
|
||||
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
delete [] C;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
hipFree(Cd);
|
||||
|
||||
if(passed == 1){
|
||||
return true;
|
||||
}
|
||||
@@ -205,7 +219,13 @@ for(int i=0;i<512;i++){
|
||||
}
|
||||
}
|
||||
|
||||
free(A);
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
delete [] C;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
hipFree(Cd);
|
||||
|
||||
if(passed == 1){
|
||||
return true;
|
||||
}
|
||||
@@ -234,7 +254,12 @@ for(int i=0;i<512;i++){
|
||||
passed = 1;
|
||||
}
|
||||
}
|
||||
free(A);
|
||||
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
|
||||
if(passed == 1){
|
||||
return true;
|
||||
}
|
||||
@@ -263,7 +288,12 @@ for(int i=0;i<512;i++){
|
||||
passed = 1;
|
||||
}
|
||||
}
|
||||
free(A);
|
||||
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
|
||||
if(passed == 1){
|
||||
return true;
|
||||
}
|
||||
@@ -291,7 +321,12 @@ for(int i=0;i<512;i++){
|
||||
passed = 1;
|
||||
}
|
||||
}
|
||||
free(A);
|
||||
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
|
||||
if(passed == 1){
|
||||
return true;
|
||||
}
|
||||
@@ -321,7 +356,12 @@ for(int i=0;i<512;i++){
|
||||
passed = 1;
|
||||
}
|
||||
}
|
||||
free(A);
|
||||
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
|
||||
if(passed == 1){
|
||||
return true;
|
||||
}
|
||||
@@ -350,7 +390,12 @@ for(int i=0;i<512;i++){
|
||||
passed = 1;
|
||||
}
|
||||
}
|
||||
free(A);
|
||||
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
|
||||
if(passed == 1){
|
||||
return true;
|
||||
}
|
||||
@@ -387,7 +432,16 @@ for(int i=0;i<512;i++){
|
||||
passed = 1;
|
||||
}
|
||||
}
|
||||
free(A);
|
||||
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
delete [] C;
|
||||
delete [] D;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
hipFree(Cd);
|
||||
hipFree(Dd);
|
||||
|
||||
if(passed == 1){
|
||||
return true;
|
||||
}
|
||||
@@ -427,7 +481,18 @@ for(int i=0;i<512;i++){
|
||||
passed = 1;
|
||||
}
|
||||
}
|
||||
free(A);
|
||||
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
delete [] C;
|
||||
delete [] D;
|
||||
delete [] E;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
hipFree(Cd);
|
||||
hipFree(Dd);
|
||||
hipFree(Ed);
|
||||
|
||||
if(passed == 1){
|
||||
return true;
|
||||
}
|
||||
@@ -457,7 +522,12 @@ for(int i=0;i<512;i++){
|
||||
passed = 1;
|
||||
}
|
||||
}
|
||||
free(A);
|
||||
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
|
||||
if(passed == 1){
|
||||
return true;
|
||||
}
|
||||
@@ -489,7 +559,14 @@ for(int i=0;i<512;i++){
|
||||
passed = 1;
|
||||
}
|
||||
}
|
||||
free(A);
|
||||
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
delete [] C;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
hipFree(Cd);
|
||||
|
||||
if(passed == 1){
|
||||
return true;
|
||||
}
|
||||
@@ -525,7 +602,16 @@ for(int i=0;i<512;i++){
|
||||
passed = 1;
|
||||
}
|
||||
}
|
||||
free(A);
|
||||
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
delete [] C;
|
||||
delete [] D;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
hipFree(Cd);
|
||||
hipFree(Dd);
|
||||
|
||||
if(passed == 1){
|
||||
return true;
|
||||
}
|
||||
@@ -565,7 +651,18 @@ for(int i=0;i<512;i++){
|
||||
passed = 1;
|
||||
}
|
||||
}
|
||||
free(A);
|
||||
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
delete [] C;
|
||||
delete [] D;
|
||||
delete [] E;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
hipFree(Cd);
|
||||
hipFree(Dd);
|
||||
hipFree(Ed);
|
||||
|
||||
if(passed == 1){
|
||||
return true;
|
||||
}
|
||||
@@ -595,7 +692,12 @@ for(int i=0;i<512;i++){
|
||||
passed = 1;
|
||||
}
|
||||
}
|
||||
free(A);
|
||||
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
|
||||
if(passed == 1){
|
||||
return true;
|
||||
}
|
||||
@@ -622,7 +724,12 @@ for(int i=0;i<512;i++){
|
||||
passed = 1;
|
||||
}
|
||||
}
|
||||
free(A);
|
||||
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
|
||||
if(passed == 1){
|
||||
return true;
|
||||
}
|
||||
@@ -631,7 +738,7 @@ return false;
|
||||
}
|
||||
|
||||
int main(){
|
||||
if(run_sincosf() && run_sincospif() && run_fdividef() &&
|
||||
if(run_sincosf() && run_sincospif() && run_fdividef() &&
|
||||
run_llrintf() && run_norm3df() && run_norm4df() &&
|
||||
run_normf() && run_rnorm3df() && run_rnorm4df() &&
|
||||
run_rnormf() && run_lroundf() && run_llroundf() &&
|
||||
|
||||
@@ -128,7 +128,14 @@ for(int i=0;i<512;i++){
|
||||
passed = 1;
|
||||
}
|
||||
}
|
||||
free(A);
|
||||
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
delete [] C;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
hipFree(Cd);
|
||||
|
||||
if(passed == 1){
|
||||
return true;
|
||||
}
|
||||
@@ -163,7 +170,14 @@ for(int i=0;i<512;i++){
|
||||
passed = 1;
|
||||
}
|
||||
}
|
||||
free(A);
|
||||
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
delete [] C;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
hipFree(Cd);
|
||||
|
||||
if(passed == 1){
|
||||
return true;
|
||||
}
|
||||
@@ -193,7 +207,12 @@ for(int i=0;i<512;i++){
|
||||
passed = 1;
|
||||
}
|
||||
}
|
||||
free(A);
|
||||
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
|
||||
if(passed == 1){
|
||||
return true;
|
||||
}
|
||||
@@ -221,7 +240,12 @@ for(int i=0;i<512;i++){
|
||||
passed = 1;
|
||||
}
|
||||
}
|
||||
free(A);
|
||||
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
|
||||
if(passed == 1){
|
||||
return true;
|
||||
}
|
||||
@@ -249,7 +273,12 @@ for(int i=0;i<512;i++){
|
||||
passed = 1;
|
||||
}
|
||||
}
|
||||
free(A);
|
||||
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
|
||||
if(passed == 1){
|
||||
return true;
|
||||
}
|
||||
@@ -278,7 +307,12 @@ for(int i=0;i<512;i++){
|
||||
passed = 1;
|
||||
}
|
||||
}
|
||||
free(A);
|
||||
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
|
||||
if(passed == 1){
|
||||
return true;
|
||||
}
|
||||
@@ -306,7 +340,12 @@ for(int i=0;i<512;i++){
|
||||
passed = 1;
|
||||
}
|
||||
}
|
||||
free(A);
|
||||
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
|
||||
if(passed == 1){
|
||||
return true;
|
||||
}
|
||||
@@ -343,7 +382,16 @@ for(int i=0;i<512;i++){
|
||||
passed = 1;
|
||||
}
|
||||
}
|
||||
free(A);
|
||||
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
delete [] C;
|
||||
delete [] D;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
hipFree(Cd);
|
||||
hipFree(Dd);
|
||||
|
||||
if(passed == 1){
|
||||
return true;
|
||||
}
|
||||
@@ -383,7 +431,18 @@ for(int i=0;i<512;i++){
|
||||
passed = 1;
|
||||
}
|
||||
}
|
||||
free(A);
|
||||
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
delete [] C;
|
||||
delete [] D;
|
||||
delete [] E;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
hipFree(Cd);
|
||||
hipFree(Dd);
|
||||
hipFree(Ed);
|
||||
|
||||
if(passed == 1){
|
||||
return true;
|
||||
}
|
||||
@@ -416,7 +475,14 @@ for(int i=0;i<512;i++){
|
||||
passed = 1;
|
||||
}
|
||||
}
|
||||
free(A);
|
||||
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
delete [] C;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
hipFree(Cd);
|
||||
|
||||
if(passed == 1){
|
||||
return true;
|
||||
}
|
||||
@@ -452,7 +518,16 @@ for(int i=0;i<512;i++){
|
||||
passed = 1;
|
||||
}
|
||||
}
|
||||
free(A);
|
||||
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
delete [] C;
|
||||
delete [] D;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
hipFree(Cd);
|
||||
hipFree(Dd);
|
||||
|
||||
if(passed == 1){
|
||||
return true;
|
||||
}
|
||||
@@ -492,7 +567,18 @@ for(int i=0;i<512;i++){
|
||||
passed = 1;
|
||||
}
|
||||
}
|
||||
free(A);
|
||||
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
delete [] C;
|
||||
delete [] D;
|
||||
delete [] E;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
hipFree(Cd);
|
||||
hipFree(Dd);
|
||||
hipFree(Ed);
|
||||
|
||||
if(passed == 1){
|
||||
return true;
|
||||
}
|
||||
@@ -522,7 +608,12 @@ for(int i=0;i<512;i++){
|
||||
passed = 1;
|
||||
}
|
||||
}
|
||||
free(A);
|
||||
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
|
||||
if(passed == 1){
|
||||
return true;
|
||||
}
|
||||
@@ -549,7 +640,12 @@ for(int i=0;i<512;i++){
|
||||
passed = 1;
|
||||
}
|
||||
}
|
||||
free(A);
|
||||
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
|
||||
if(passed == 1){
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -159,11 +159,16 @@ bool dataTypesRun(){
|
||||
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
|
||||
|
||||
|
||||
hipLaunchKernel(vectoradd_float,
|
||||
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
|
||||
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
|
||||
0, 0,
|
||||
deviceA ,deviceB ,WIDTH ,HEIGHT);
|
||||
hipLaunchKernel(
|
||||
vectoradd_float,
|
||||
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
|
||||
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
|
||||
0,
|
||||
0,
|
||||
deviceA,
|
||||
static_cast<const T*>(deviceB),
|
||||
WIDTH,
|
||||
HEIGHT);
|
||||
|
||||
|
||||
HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost));
|
||||
@@ -221,11 +226,16 @@ bool dataTypesRun2(){
|
||||
HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T)));
|
||||
|
||||
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
|
||||
hipLaunchKernel(vectoradd_float,
|
||||
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
|
||||
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
|
||||
0, 0,
|
||||
deviceA ,deviceB,WIDTH ,HEIGHT);
|
||||
hipLaunchKernel(
|
||||
vectoradd_float,
|
||||
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
|
||||
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
|
||||
0,
|
||||
0,
|
||||
deviceA,
|
||||
static_cast<const T*>(deviceB),
|
||||
WIDTH,
|
||||
HEIGHT);
|
||||
|
||||
|
||||
HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost));
|
||||
@@ -281,11 +291,16 @@ bool dataTypesRun4(){
|
||||
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
|
||||
|
||||
|
||||
hipLaunchKernel(vectoradd_float,
|
||||
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
|
||||
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
|
||||
0, 0,
|
||||
deviceA ,deviceB ,WIDTH ,HEIGHT);
|
||||
hipLaunchKernel(
|
||||
vectoradd_float,
|
||||
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
|
||||
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
|
||||
0,
|
||||
0,
|
||||
deviceA,
|
||||
static_cast<const T*>(deviceB),
|
||||
WIDTH,
|
||||
HEIGHT);
|
||||
|
||||
|
||||
HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost));
|
||||
|
||||
@@ -36,17 +36,23 @@ __global__ void Kern(hipLaunchParm lp, float *A)
|
||||
|
||||
int main()
|
||||
{
|
||||
float *A, *Ad;
|
||||
float A[len];
|
||||
float *Ad;
|
||||
|
||||
for(int i=0;i<len;i++)
|
||||
{
|
||||
A[i] = 1.0f;
|
||||
}
|
||||
|
||||
Ad = (float*)mallocHip(size);
|
||||
memcpyHipH2D(Ad, A, size);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(Kern), dim3(len/1024), dim3(1024), 0, 0, A);
|
||||
hipLaunchKernel(
|
||||
HIP_KERNEL_NAME(Kern), dim3(len/1024), dim3(1024), 0, 0, Ad);
|
||||
memcpyHipD2H(A, Ad, size);
|
||||
for(int i=0;i<len;i++)
|
||||
{
|
||||
assert(A[i] == 2.0f);
|
||||
}
|
||||
|
||||
hipFree(Ad);
|
||||
}
|
||||
|
||||
@@ -74,8 +74,8 @@ __global__ void MyKernel (const hipLaunchParm lp, const float *a, const float *b
|
||||
void callMyKernel()
|
||||
{
|
||||
float *a, *b, *c;
|
||||
unsigned N;
|
||||
const unsigned blockSize = 256;
|
||||
unsigned N = blockSize;
|
||||
|
||||
hipLaunchKernel(MyKernel, dim3(N/blockSize), dim3(blockSize), 0, 0, a,b,c,N);
|
||||
}
|
||||
@@ -102,7 +102,7 @@ vectorADD(const hipLaunchParm lp,
|
||||
int a = __shfl_up(x, 1);
|
||||
#endif
|
||||
|
||||
float x;
|
||||
float x = 1.0;
|
||||
float z = sin(x);
|
||||
#ifdef NOT_YET
|
||||
float fastZ = __sin(x);
|
||||
|
||||
@@ -107,9 +107,12 @@ int main(){
|
||||
assert(C[i] == 1);
|
||||
}
|
||||
|
||||
delete A;
|
||||
delete B;
|
||||
delete C;
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
delete [] C;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
hipFree(Cd);
|
||||
|
||||
A = new uint8_t[LEN9];
|
||||
B = new uint8_t[LEN9];
|
||||
@@ -132,9 +135,12 @@ int main(){
|
||||
assert(C[i] == 1);
|
||||
}
|
||||
|
||||
delete A;
|
||||
delete B;
|
||||
delete C;
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
delete [] C;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
hipFree(Cd);
|
||||
|
||||
A = new uint8_t[LEN10];
|
||||
B = new uint8_t[LEN10];
|
||||
@@ -157,9 +163,12 @@ int main(){
|
||||
assert(C[i] == 1);
|
||||
}
|
||||
|
||||
delete A;
|
||||
delete B;
|
||||
delete C;
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
delete [] C;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
hipFree(Cd);
|
||||
|
||||
A = new uint8_t[LEN11];
|
||||
B = new uint8_t[LEN11];
|
||||
@@ -182,9 +191,12 @@ int main(){
|
||||
assert(C[i] == 1);
|
||||
}
|
||||
|
||||
delete A;
|
||||
delete B;
|
||||
delete C;
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
delete [] C;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
hipFree(Cd);
|
||||
|
||||
A = new uint8_t[LEN12];
|
||||
B = new uint8_t[LEN12];
|
||||
@@ -207,9 +219,12 @@ int main(){
|
||||
assert(C[i] == 1);
|
||||
}
|
||||
|
||||
delete A;
|
||||
delete B;
|
||||
delete C;
|
||||
delete [] A;
|
||||
delete [] B;
|
||||
delete [] C;
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
hipFree(Cd);
|
||||
|
||||
passed();
|
||||
}
|
||||
|
||||
@@ -69,7 +69,16 @@ int main(int argc, char *argv[])
|
||||
// Record the start event
|
||||
HIPCHECK (hipEventRecord(start, NULL));
|
||||
|
||||
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N);
|
||||
hipLaunchKernel(
|
||||
HipTest::vectorADD,
|
||||
dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0,
|
||||
0,
|
||||
static_cast<const float*>(A_d),
|
||||
static_cast<const float*>(B_d),
|
||||
C_d,
|
||||
N);
|
||||
|
||||
|
||||
HIPCHECK (hipEventRecord(stop, NULL));
|
||||
|
||||
@@ -52,7 +52,7 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_
|
||||
if (!(testMask & p_tests)) {
|
||||
return;
|
||||
}
|
||||
printf ("\ntest 0x%3x: stream=%p waitStart=%d syncMode=%s\n",
|
||||
printf ("\ntest 0x%3x: stream=%p waitStart=%d syncMode=%s\n",
|
||||
testMask, stream, waitStart, syncModeString(syncMode));
|
||||
|
||||
size_t sizeBytes = numElements * sizeof(int);
|
||||
@@ -77,7 +77,16 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_
|
||||
HIPCHECK(hipEventRecord(timingDisabled, stream));
|
||||
// sandwhich a kernel:
|
||||
HIPCHECK(hipEventRecord(start, stream));
|
||||
hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, stream, C_d, C_h, numElements, count);
|
||||
hipLaunchKernelGGL(
|
||||
HipTest::addCountReverse,
|
||||
dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0,
|
||||
stream,
|
||||
static_cast<const int*>(C_d),
|
||||
C_h,
|
||||
numElements,
|
||||
count);
|
||||
HIPCHECK(hipEventRecord(stop, stream));
|
||||
|
||||
|
||||
@@ -85,8 +94,8 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_
|
||||
HIPCHECK(hipEventSynchronize(start));
|
||||
}
|
||||
|
||||
|
||||
hipError_t expectedStopError = hipSuccess;
|
||||
|
||||
hipError_t expectedStopError = hipSuccess;
|
||||
|
||||
// How to wait for the events to finish:
|
||||
switch (syncMode) {
|
||||
@@ -97,12 +106,12 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_
|
||||
HIPCHECK(hipStreamSynchronize(stream)); // wait for recording to finish...
|
||||
break;
|
||||
case syncStopEvent:
|
||||
HIPCHECK(hipEventSynchronize(stop));
|
||||
HIPCHECK(hipEventSynchronize(stop));
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
};
|
||||
|
||||
|
||||
|
||||
float t;
|
||||
|
||||
@@ -111,25 +120,25 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_
|
||||
failed ("start event not in expected state, was %d=%s\n", e, hipGetErrorName(e));
|
||||
}
|
||||
|
||||
if (e == hipSuccess)
|
||||
if (e == hipSuccess)
|
||||
assert (t==0.0f);
|
||||
|
||||
|
||||
|
||||
// stop usually ready unless we skipped the synchronization (syncNone)
|
||||
HIPCHECK_API(hipEventElapsedTime(&t, stop, stop), expectedStopError);
|
||||
if (e == hipSuccess)
|
||||
if (e == hipSuccess)
|
||||
assert (t==0.0f);
|
||||
|
||||
|
||||
e = hipEventElapsedTime(&t, start, stop);
|
||||
HIPCHECK_API(e, expectedStopError);
|
||||
if (expectedStopError == hipSuccess)
|
||||
if (expectedStopError == hipSuccess)
|
||||
assert (t>0.0f);
|
||||
printf ("time=%6.2f error=%s\n", t, hipGetErrorName(e));
|
||||
|
||||
e = hipEventElapsedTime(&t, stop, start);
|
||||
HIPCHECK_API(e, expectedStopError);
|
||||
if (expectedStopError == hipSuccess)
|
||||
if (expectedStopError == hipSuccess)
|
||||
assert (t<0.0f);
|
||||
printf ("negtime=%6.2f error=%s\n", t, hipGetErrorName(e));
|
||||
|
||||
|
||||
@@ -58,7 +58,7 @@ public:
|
||||
|
||||
void offset(int offset) { _offset = offset; };
|
||||
int offset() const { return _offset; };
|
||||
|
||||
|
||||
private:
|
||||
T * _A_d;
|
||||
T* _B_d;
|
||||
@@ -72,7 +72,7 @@ private:
|
||||
|
||||
template<typename T>
|
||||
DeviceMemory<T>::DeviceMemory(size_t numElements)
|
||||
: _maxNumElements(numElements),
|
||||
: _maxNumElements(numElements),
|
||||
_offset(0)
|
||||
{
|
||||
T ** np = nullptr;
|
||||
@@ -93,7 +93,7 @@ DeviceMemory<T>::~DeviceMemory ()
|
||||
HipTest::freeArrays (_A_d, _B_d, _C_d, np, np, np, 0);
|
||||
|
||||
HIPCHECK (hipFree(_C_dd));
|
||||
|
||||
|
||||
_C_dd = NULL;
|
||||
};
|
||||
|
||||
@@ -125,7 +125,7 @@ public:
|
||||
T * A_hh;
|
||||
T* B_hh;
|
||||
|
||||
bool _usePinnedHost;
|
||||
bool _usePinnedHost;
|
||||
private:
|
||||
size_t _maxNumElements;
|
||||
|
||||
@@ -165,11 +165,11 @@ HostMemory<T>::HostMemory(size_t numElements, bool usePinnedHost)
|
||||
|
||||
template<typename T>
|
||||
void
|
||||
HostMemory<T>::reset(size_t numElements, bool full)
|
||||
HostMemory<T>::reset(size_t numElements, bool full)
|
||||
{
|
||||
// Initialize the host data:
|
||||
for (size_t i=0; i<numElements; i++) {
|
||||
(A_hh)[i] = 1097.0 + i;
|
||||
(A_hh)[i] = 1097.0 + i;
|
||||
(B_hh)[i] = 1492.0 + i; // Phi
|
||||
|
||||
if (full) {
|
||||
@@ -213,8 +213,8 @@ template <typename T>
|
||||
void memcpytest2(DeviceMemory<T> *dmem, HostMemory<T> *hmem, size_t numElements, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault)
|
||||
{
|
||||
size_t sizeElements = numElements * sizeof(T);
|
||||
printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d, offsets:dev:%+d host:+%d\n",
|
||||
__func__,
|
||||
printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d, offsets:dev:%+d host:+%d\n",
|
||||
__func__,
|
||||
TYPENAME(T),
|
||||
sizeElements, sizeElements/1024.0/1024.0,
|
||||
hmem->_usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault,
|
||||
@@ -243,7 +243,16 @@ void memcpytest2(DeviceMemory<T> *dmem, HostMemory<T> *hmem, size_t numElements,
|
||||
HIPCHECK ( hipMemcpy(dmem->B_d(), hmem->B_h(), sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, dmem->A_d(), dmem->B_d(), dmem->C_d(), numElements);
|
||||
hipLaunchKernel(
|
||||
HipTest::vectorADD,
|
||||
dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0,
|
||||
0,
|
||||
static_cast<const T*>(dmem->A_d()),
|
||||
static_cast<const T*>(dmem->B_d()),
|
||||
dmem->C_d(),
|
||||
numElements);
|
||||
|
||||
if (useDeviceToDevice) {
|
||||
// Do an extra device-to-device copy here to mix things up:
|
||||
@@ -273,8 +282,8 @@ void memcpytest2_for_type(size_t numElements)
|
||||
{
|
||||
printSep();
|
||||
|
||||
DeviceMemory<T> memD(numElements);
|
||||
HostMemory<T> memU(numElements, 0/*usePinnedHost*/);
|
||||
DeviceMemory<T> memD(numElements);
|
||||
HostMemory<T> memU(numElements, 0/*usePinnedHost*/);
|
||||
HostMemory<T> memP(numElements, 1/*usePinnedHost*/);
|
||||
|
||||
for (int usePinnedHost =0; usePinnedHost<=1; usePinnedHost++) {
|
||||
@@ -307,11 +316,11 @@ void memcpytest2_sizes(size_t maxElem=0)
|
||||
maxElem = free/sizeof(T)/8;
|
||||
}
|
||||
|
||||
printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n",
|
||||
printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n",
|
||||
deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0);
|
||||
HIPCHECK ( hipDeviceReset() );
|
||||
DeviceMemory<T> memD(maxElem);
|
||||
HostMemory<T> memU(maxElem, 0/*usePinnedHost*/);
|
||||
DeviceMemory<T> memD(maxElem);
|
||||
HostMemory<T> memU(maxElem, 0/*usePinnedHost*/);
|
||||
HostMemory<T> memP(maxElem, 1/*usePinnedHost*/);
|
||||
|
||||
for (size_t elem=1; elem<=maxElem; elem*=2) {
|
||||
@@ -336,11 +345,11 @@ void memcpytest2_offsets(size_t maxElem, bool devOffsets, bool hostOffsets)
|
||||
HIPCHECK(hipMemGetInfo(&free, &total));
|
||||
|
||||
|
||||
printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n",
|
||||
printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n",
|
||||
deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0);
|
||||
HIPCHECK ( hipDeviceReset() );
|
||||
DeviceMemory<T> memD(maxElem);
|
||||
HostMemory<T> memU(maxElem, 0/*usePinnedHost*/);
|
||||
DeviceMemory<T> memD(maxElem);
|
||||
HostMemory<T> memU(maxElem, 0/*usePinnedHost*/);
|
||||
HostMemory<T> memP(maxElem, 1/*usePinnedHost*/);
|
||||
|
||||
size_t elem = maxElem / 2;
|
||||
@@ -380,16 +389,16 @@ void multiThread_1(bool serialize, bool usePinnedHost)
|
||||
{
|
||||
printSep();
|
||||
printf ("test: %s<%s> serialize=%d usePinnedHost=%d\n", __func__, TYPENAME(T), serialize, usePinnedHost);
|
||||
DeviceMemory<T> memD(N);
|
||||
HostMemory<T> mem1(N, usePinnedHost);
|
||||
HostMemory<T> mem2(N, usePinnedHost);
|
||||
DeviceMemory<T> memD(N);
|
||||
HostMemory<T> mem1(N, usePinnedHost);
|
||||
HostMemory<T> mem2(N, usePinnedHost);
|
||||
|
||||
std::thread t1 (memcpytest2<T>, &memD, &mem1, N, 0,0,0);
|
||||
if (serialize) {
|
||||
t1.join();
|
||||
}
|
||||
|
||||
|
||||
|
||||
std::thread t2 (memcpytest2<T>,&memD, &mem2, N, 0,0,0);
|
||||
if (serialize) {
|
||||
t2.join();
|
||||
@@ -427,21 +436,21 @@ int main(int argc, char *argv[])
|
||||
// Some tests around the 64KB boundary which have historically shown issues:
|
||||
printf ("\n\n=== tests&0x2 (64KB boundary)\n");
|
||||
size_t maxElem = 32*1024*1024;
|
||||
DeviceMemory<float> memD(maxElem);
|
||||
HostMemory<float> memU(maxElem, 0/*usePinnedHost*/);
|
||||
HostMemory<float> memP(maxElem, 0/*usePinnedHost*/);
|
||||
DeviceMemory<float> memD(maxElem);
|
||||
HostMemory<float> memU(maxElem, 0/*usePinnedHost*/);
|
||||
HostMemory<float> memP(maxElem, 0/*usePinnedHost*/);
|
||||
// These all pass:
|
||||
memcpytest2<float>(&memD, &memP, 15*1024*1024, 0, 0, 0);
|
||||
memcpytest2<float>(&memD, &memP, 16*1024*1024, 0, 0, 0);
|
||||
memcpytest2<float>(&memD, &memP, 16*1024*1024+16*1024, 0, 0, 0);
|
||||
memcpytest2<float>(&memD, &memP, 15*1024*1024, 0, 0, 0);
|
||||
memcpytest2<float>(&memD, &memP, 16*1024*1024, 0, 0, 0);
|
||||
memcpytest2<float>(&memD, &memP, 16*1024*1024+16*1024, 0, 0, 0);
|
||||
|
||||
// Just over 64MB:
|
||||
memcpytest2<float>(&memD, &memP, 16*1024*1024+512*1024, 0, 0, 0);
|
||||
memcpytest2<float>(&memD, &memP, 17*1024*1024+1024, 0, 0, 0);
|
||||
memcpytest2<float>(&memD, &memP, 32*1024*1024, 0, 0, 0);
|
||||
memcpytest2<float>(&memD, &memU, 32*1024*1024, 0, 0, 0);
|
||||
memcpytest2<float>(&memD, &memP, 32*1024*1024, 1, 1, 0);
|
||||
memcpytest2<float>(&memD, &memP, 32*1024*1024, 1, 1, 0);
|
||||
memcpytest2<float>(&memD, &memP, 16*1024*1024+512*1024, 0, 0, 0);
|
||||
memcpytest2<float>(&memD, &memP, 17*1024*1024+1024, 0, 0, 0);
|
||||
memcpytest2<float>(&memD, &memP, 32*1024*1024, 0, 0, 0);
|
||||
memcpytest2<float>(&memD, &memU, 32*1024*1024, 0, 0, 0);
|
||||
memcpytest2<float>(&memD, &memP, 32*1024*1024, 1, 1, 0);
|
||||
memcpytest2<float>(&memD, &memP, 32*1024*1024, 1, 1, 0);
|
||||
|
||||
|
||||
}
|
||||
@@ -464,7 +473,7 @@ int main(int argc, char *argv[])
|
||||
|
||||
// Simplest cases: serialize the threads, and also used pinned memory:
|
||||
// This verifies that the sub-calls to memcpytest2 are correct.
|
||||
multiThread_1<float>(true, true);
|
||||
multiThread_1<float>(true, true);
|
||||
|
||||
// Serialize, but use unpinned memory to stress the unpinned memory xfer path.
|
||||
multiThread_1<float>(true, false);
|
||||
|
||||
@@ -63,7 +63,16 @@ void simpleTest1()
|
||||
HIPCHECK ( memcopy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
|
||||
HIPCHECK ( memcopy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
|
||||
|
||||
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N);
|
||||
hipLaunchKernel(
|
||||
HipTest::vectorADD,
|
||||
dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0,
|
||||
0,
|
||||
static_cast<const int*>(A_d),
|
||||
static_cast<const int*>(B_d),
|
||||
C_d,
|
||||
N);
|
||||
|
||||
HIPCHECK ( memcopy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
|
||||
|
||||
@@ -41,8 +41,8 @@ void printSep()
|
||||
// Designed to stress a small number of simple smoke tests
|
||||
|
||||
template<
|
||||
typename T=float,
|
||||
class P=HipTest::Unpinned,
|
||||
typename T=float,
|
||||
class P=HipTest::Unpinned,
|
||||
class C=HipTest::Memcpy
|
||||
>
|
||||
void simpleVectorAdd(size_t numElements, int iters, hipStream_t stream)
|
||||
@@ -90,7 +90,16 @@ void simpleVectorAdd(size_t numElements, int iters, hipStream_t stream)
|
||||
|
||||
// This is the null stream?
|
||||
//hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements);
|
||||
hipLaunchKernel(HipTest::vectorADDReverse, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements);
|
||||
hipLaunchKernel(
|
||||
HipTest::vectorADDReverse,
|
||||
dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0,
|
||||
0,
|
||||
static_cast<const T*>(A_d),
|
||||
static_cast<const T*>(B_d),
|
||||
C_d,
|
||||
numElements);
|
||||
|
||||
MemTraits<C>::Copy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream);
|
||||
|
||||
|
||||
@@ -119,7 +119,7 @@ void Streamer<T>::reset()
|
||||
{
|
||||
HipTest::setDefaultData(_numElements, _A_h, _B_h, _C_h);
|
||||
H2D();
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -128,7 +128,17 @@ void Streamer<T>::enqueAsync()
|
||||
{
|
||||
printf ("testing: %s numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0);
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements);
|
||||
hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _B_d, _C_d, _numElements, p_repeat);
|
||||
hipLaunchKernel(
|
||||
vectorADDRepeat,
|
||||
dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0,
|
||||
_stream,
|
||||
static_cast<const T*>(_A_d),
|
||||
static_cast<const T*>(_B_d),
|
||||
_C_d,
|
||||
_numElements,
|
||||
p_repeat);
|
||||
|
||||
}
|
||||
|
||||
@@ -225,7 +235,17 @@ int main(int argc, char *argv[])
|
||||
auto lastStreamer = streamers[s - 1];
|
||||
|
||||
// Dispatch to NULL stream, should wait for prior async activity to complete before beginning:
|
||||
hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/);
|
||||
hipLaunchKernel(
|
||||
vectorADDRepeat,
|
||||
dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0,
|
||||
0/*nullstream*/,
|
||||
static_cast<const int*>(lastStreamer->_C_d),
|
||||
static_cast<const int*>(lastStreamer->_C_d),
|
||||
nullStreamer->_C_d,
|
||||
numElements,
|
||||
1/*repeat*/);
|
||||
|
||||
|
||||
if (p_db) {
|
||||
@@ -238,7 +258,7 @@ int main(int argc, char *argv[])
|
||||
nullStreamer->D2H();
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
|
||||
HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements);
|
||||
HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -257,13 +277,23 @@ int main(int argc, char *argv[])
|
||||
auto lastStreamer = streamers[s - 1];
|
||||
|
||||
// Dispatch to NULL stream, should wait for prior async activity to complete before beginning:
|
||||
hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/);
|
||||
hipLaunchKernel(
|
||||
vectorADDRepeat,
|
||||
dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0,
|
||||
0/*nullstream*/,
|
||||
static_cast<const int*>(lastStreamer->_C_d),
|
||||
static_cast<const int*>(lastStreamer->_C_d),
|
||||
nullStreamer->_C_d,
|
||||
numElements,
|
||||
1/*repeat*/);
|
||||
|
||||
nullStreamer->D2H();
|
||||
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
|
||||
HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements);
|
||||
HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -289,10 +319,10 @@ int main(int argc, char *argv[])
|
||||
// Copy with stream1, this could go async if the streamSync doesn't synchronize ALL the streams.
|
||||
HIPCHECK(hipMemcpyAsync(streamers[0]->_C_h, streamers[0]->_C_d, streamers[0]->_numElements*sizeof(int), hipMemcpyDeviceToHost, streamers[1]->_stream));
|
||||
|
||||
|
||||
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
|
||||
HipTest::checkTest(expected_H, streamers[0]->_C_h, numElements);
|
||||
HipTest::checkTest(expected_H, streamers[0]->_C_h, numElements);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -59,23 +59,23 @@ const char *syncModeString(int syncMode) {
|
||||
void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expectMismatch)
|
||||
{
|
||||
|
||||
// This test sends a long-running kernel to the null stream, then tests to see if the
|
||||
// This test sends a long-running kernel to the null stream, then tests to see if the
|
||||
// specified synchronization technique is effective.
|
||||
//
|
||||
// Some syncMode are not expected to correctly sync (for example "syncNone"). in these
|
||||
// Some syncMode are not expected to correctly sync (for example "syncNone"). in these
|
||||
// cases the test sets expectMismatch and the check logic below will attempt to ensure that
|
||||
// the undesired synchronization did not occur - ie ensure the kernel is still running and did
|
||||
// not yet update the stop event. This can be tricky since if the kernel runs fast enough it
|
||||
// may complete before the check. To prevent this, the addCountReverse has a count parameter
|
||||
// which causes it to loop repeatedly, and the results are checked in reverse order.
|
||||
// may complete before the check. To prevent this, the addCountReverse has a count parameter
|
||||
// which causes it to loop repeatedly, and the results are checked in reverse order.
|
||||
//
|
||||
// Tests with expectMismatch=true should ensure the kernel finishes correctly. This results
|
||||
// are checked and we test to make sure stop event has completed.
|
||||
|
||||
|
||||
if (!(testMask & p_tests)) {
|
||||
return;
|
||||
}
|
||||
printf ("\ntest 0x%02x: syncMode=%s expectMismatch=%d\n",
|
||||
printf ("\ntest 0x%02x: syncMode=%s expectMismatch=%d\n",
|
||||
testMask, syncModeString(syncMode), expectMismatch);
|
||||
|
||||
size_t sizeBytes = numElements * sizeof(int);
|
||||
@@ -97,8 +97,17 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode s
|
||||
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
|
||||
// Launch kernel into null stream, should result in C_h == count.
|
||||
hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, 0 /*stream*/, C_d, C_h, numElements, count);
|
||||
HIPCHECK(hipEventRecord(stop, 0/*default*/));
|
||||
hipLaunchKernelGGL(
|
||||
HipTest::addCountReverse,
|
||||
dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0,
|
||||
0 /*stream*/,
|
||||
static_cast<const int*>(C_d),
|
||||
C_h,
|
||||
numElements,
|
||||
count);
|
||||
HIPCHECK(hipEventRecord(stop, 0/*default*/));
|
||||
|
||||
switch (syncMode) {
|
||||
case syncNone:
|
||||
@@ -108,18 +117,18 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode s
|
||||
break;
|
||||
case syncOtherStream:
|
||||
// Does this synchronize with the null stream?
|
||||
HIPCHECK(hipStreamSynchronize(otherStream));
|
||||
HIPCHECK(hipStreamSynchronize(otherStream));
|
||||
break;
|
||||
case syncMarkerThenOtherStream:
|
||||
case syncMarkerThenOtherNonBlockingStream:
|
||||
|
||||
// this may wait for NULL stream depending hipStreamNonBlocking flag above
|
||||
HIPCHECK(hipEventRecord(otherStreamEvent, otherStream));
|
||||
|
||||
HIPCHECK(hipStreamSynchronize(otherStream));
|
||||
// this may wait for NULL stream depending hipStreamNonBlocking flag above
|
||||
HIPCHECK(hipEventRecord(otherStreamEvent, otherStream));
|
||||
|
||||
HIPCHECK(hipStreamSynchronize(otherStream));
|
||||
break;
|
||||
case syncDevice:
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
@@ -197,7 +206,7 @@ void runTests(int64_t numElements)
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
// Can' destroy the default stream:// TODO - move to another test
|
||||
HIPCHECK_API(hipStreamDestroy(0), hipErrorInvalidResourceHandle);
|
||||
HIPCHECK_API(hipStreamDestroy(0), hipErrorInvalidResourceHandle);
|
||||
|
||||
HipTest::parseStandardArguments(argc, argv, true /*failOnUndefinedArg*/);
|
||||
|
||||
|
||||
@@ -88,7 +88,7 @@ private:
|
||||
|
||||
template <typename T>
|
||||
Streamer<T>::Streamer(int deviceId, T * A_d, size_t numElements, int commandType) :
|
||||
_preA_d(NULL),
|
||||
_preA_d(NULL),
|
||||
_A_d(A_d),
|
||||
_deviceId(deviceId),
|
||||
_numElements(numElements),
|
||||
@@ -163,9 +163,27 @@ void Streamer<T>::runAsyncAfter(Streamer<T> *depStreamer, bool waitSameStream)
|
||||
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements);
|
||||
if (_commandType == COMMAND_ADD_REVERSE) {
|
||||
hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count);
|
||||
hipLaunchKernelGGL(
|
||||
HipTest::addCountReverse,
|
||||
dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0,
|
||||
_stream,
|
||||
static_cast<const T*>(_A_d),
|
||||
_C_d,
|
||||
static_cast<int64_t>(_numElements),
|
||||
static_cast<int>(p_count));
|
||||
} else if (_commandType == COMMAND_ADD_FORWARD) {
|
||||
hipLaunchKernelGGL(HipTest::addCount, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count);
|
||||
hipLaunchKernelGGL(
|
||||
HipTest::addCount,
|
||||
dim3(blocks),
|
||||
dim3(threadsPerBlock),
|
||||
0,
|
||||
_stream,
|
||||
static_cast<const T*>(_A_d),
|
||||
_C_d,
|
||||
_numElements,
|
||||
static_cast<int>(p_count));
|
||||
} else if (_commandType == COMMAND_COPY) {
|
||||
HIPCHECK(hipMemcpyAsync(_C_d, _A_d, _numElements * sizeof(T), hipMemcpyDeviceToDevice, _stream));
|
||||
} else {
|
||||
@@ -239,7 +257,7 @@ size_t Streamer<T>::check(int streamerNum, T initValue, T expectedOffset, bool e
|
||||
return _mismatchCount;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//---
|
||||
//Parse arguments specific to this test.
|
||||
@@ -300,7 +318,7 @@ void checkAll(int initValue, std::vector<IntStreamer *> &streamers, std::vector<
|
||||
for (int i=0; i<streamers.size(); i++) {
|
||||
|
||||
expected += streamers[i]->expectedAdd();
|
||||
|
||||
|
||||
mismatchCount += streamers[i]->check(i+1, initValue, expected, expectPass);
|
||||
|
||||
}
|
||||
@@ -330,7 +348,7 @@ void checkAll(int initValue, std::vector<IntStreamer *> &streamers, std::vector<
|
||||
|
||||
void sync_none(void) {};
|
||||
|
||||
void sync_allDevices(int numDevices)
|
||||
void sync_allDevices(int numDevices)
|
||||
{
|
||||
for (int d=0; d<numDevices; d++) {
|
||||
HIPCHECK(hipSetDevice(d));
|
||||
@@ -339,7 +357,7 @@ void sync_allDevices(int numDevices)
|
||||
}
|
||||
|
||||
|
||||
void sync_queryAllUntilComplete(std::vector<IntStreamer *> streamers)
|
||||
void sync_queryAllUntilComplete(std::vector<IntStreamer *> streamers)
|
||||
{
|
||||
for (int i=streamers.size()-1; i>=0; i--) {
|
||||
streamers[i]->queryUntilComplete();
|
||||
@@ -347,7 +365,7 @@ void sync_queryAllUntilComplete(std::vector<IntStreamer *> streamers)
|
||||
}
|
||||
|
||||
|
||||
void sync_streamWaitEvent(hipEvent_t lastEvent, int sideDeviceId, hipStream_t sideStream, bool waitHere)
|
||||
void sync_streamWaitEvent(hipEvent_t lastEvent, int sideDeviceId, hipStream_t sideStream, bool waitHere)
|
||||
{
|
||||
HIPCHECK(hipSetDevice(sideDeviceId));
|
||||
|
||||
@@ -389,7 +407,7 @@ int main(int argc, char *argv[])
|
||||
initArray_h[i] = initValue;
|
||||
}
|
||||
HIPCHECK(hipMemcpy(initArray_d, initArray_h, sizeElements, hipMemcpyHostToDevice));
|
||||
|
||||
|
||||
|
||||
int numDevices;
|
||||
HIPCHECK(hipGetDeviceCount(&numDevices));
|
||||
@@ -414,7 +432,7 @@ int main(int argc, char *argv[])
|
||||
|
||||
|
||||
// A sideband stream channel that is independent from above.
|
||||
// Used to check to ensure the WaitEvent or other synchronization is working correctly since by default sideStream is
|
||||
// Used to check to ensure the WaitEvent or other synchronization is working correctly since by default sideStream is
|
||||
// asynchronous wrt the other streams.
|
||||
std::vector<hipStream_t> sideStreams;
|
||||
for (int d=0; d<numDevices; d++) {
|
||||
@@ -446,7 +464,7 @@ int main(int argc, char *argv[])
|
||||
|
||||
|
||||
if (p_tests & 0x1000) {
|
||||
printf ("==> Test 0x1000 simple null stream tests\n");
|
||||
printf ("==> Test 0x1000 simple null stream tests\n");
|
||||
|
||||
// try some null stream:
|
||||
hipStreamQuery(0);
|
||||
@@ -463,7 +481,7 @@ int main(int argc, char *argv[])
|
||||
HIPCHECK(hipEventRecord(e1, s1))
|
||||
|
||||
HIPCHECK(hipStreamWaitEvent(hipStream_t(0), e1, 0/*flags*/));
|
||||
|
||||
|
||||
HIPCHECK(hipStreamDestroy(s1));
|
||||
HIPCHECK(hipEventDestroy(e1));
|
||||
}
|
||||
@@ -476,11 +494,11 @@ int main(int argc, char *argv[])
|
||||
HIPCHECK(hipEventRecord(e1, hipStream_t(0)))
|
||||
|
||||
HIPCHECK(hipStreamWaitEvent(s1, e1, 0/*flags*/));
|
||||
|
||||
|
||||
HIPCHECK(hipStreamDestroy(s1));
|
||||
HIPCHECK(hipEventDestroy(e1));
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -57,5 +57,8 @@ int main(){
|
||||
}
|
||||
std::cout<<std::endl;
|
||||
hipDeviceSynchronize();
|
||||
|
||||
free(A);
|
||||
hipFree(Ad);
|
||||
}
|
||||
}
|
||||
|
||||
Referencia en una nueva incidencia
Block a user