d4a51e4102
* Adding att v3 support * misc fix * bug fix * Python linting workflow and rules * fix regex * Adding temporary args * fix temporary args * fix format * remove att_perfcounters from test input * Review comments (#163) Co-authored-by: Giovanni Baraldi <gbaraldi@amd.com> * Revert "Review comments (#163)" This reverts commit 9ef0f8e5a4489d5581255e1b70ced2aef5c1c1d0. * Address review comments 2 * review changes * review comments * review * cmake alias * review * review * review * review * Enabling percounter in v3 script * review * formatting * formatting --------- Co-authored-by: Jonathan R. Madsen <jonathanrmadsen@gmail.com> Co-authored-by: Baraldi, Giovanni <Giovanni.Baraldi@amd.com> Co-authored-by: Giovanni Baraldi <gbaraldi@amd.com>
416 linhas
15 KiB
C++
416 linhas
15 KiB
C++
// MIT License
|
|
//
|
|
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
|
//
|
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
// of this software and associated documentation files (the "Software"), to deal
|
|
// in the Software without restriction, including without limitation the rights
|
|
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
// copies of the Software, and to permit persons to whom the Software is
|
|
// furnished to do so, subject to the following conditions:
|
|
//
|
|
// The above copyright notice and this permission notice shall be included in
|
|
// all copies or substantial portions of the Software.
|
|
//
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
// THE SOFTWARE.
|
|
|
|
#pragma once
|
|
|
|
#include "common/filesystem.hpp"
|
|
|
|
#include <hsa/hsa.h>
|
|
#include <hsa/hsa_ext_amd.h>
|
|
|
|
#include <dlfcn.h>
|
|
#include <fcntl.h>
|
|
#include <unistd.h>
|
|
#include <cassert>
|
|
#include <cstdio>
|
|
#include <cstdlib>
|
|
#include <cstring>
|
|
#include <fstream>
|
|
#include <iostream>
|
|
#include <string>
|
|
#include <vector>
|
|
namespace fs = common::fs;
|
|
|
|
#define RET_IF_HSA_ERR(err) \
|
|
{ \
|
|
if((err) != HSA_STATUS_SUCCESS) \
|
|
{ \
|
|
char err_val[12]; \
|
|
char* err_str = nullptr; \
|
|
if(hsa_status_string(err, (const char**) &err_str) != HSA_STATUS_SUCCESS) \
|
|
{ \
|
|
sprintf(&(err_val[0]), "%#x", (uint32_t) err); \
|
|
err_str = &(err_val[0]); \
|
|
} \
|
|
printf("hsa api call failure at: %s:%d\n", __FILE__, __LINE__); \
|
|
printf("Call returned %s\n", err_str); \
|
|
abort(); \
|
|
} \
|
|
}
|
|
|
|
struct Device
|
|
{
|
|
struct Memory
|
|
{
|
|
hsa_amd_memory_pool_t pool;
|
|
bool fine;
|
|
bool kernarg;
|
|
size_t size;
|
|
size_t granule;
|
|
};
|
|
|
|
hsa_agent_t agent;
|
|
char name[64];
|
|
std::vector<Memory> pools;
|
|
uint32_t fine;
|
|
uint32_t coarse;
|
|
static std::vector<hsa_agent_t> all_devices;
|
|
};
|
|
|
|
class MQDependencyTest
|
|
{
|
|
public:
|
|
MQDependencyTest() { hsa_init(); }
|
|
~MQDependencyTest() { hsa_shut_down(); }
|
|
|
|
std::vector<Device> cpu;
|
|
std::vector<Device> gpu;
|
|
Device::Memory kernarg;
|
|
std::vector<hsa_agent_t> all_devices = {};
|
|
|
|
struct CodeObject
|
|
{
|
|
hsa_file_t file = 0;
|
|
hsa_code_object_reader_t code_obj_rdr = {};
|
|
hsa_executable_t executable = {};
|
|
};
|
|
|
|
struct Kernel
|
|
{
|
|
uint64_t handle = 0;
|
|
uint32_t scratch = 0;
|
|
uint32_t group = 0;
|
|
uint32_t kernarg_size = 0;
|
|
uint32_t kernarg_align = 0;
|
|
};
|
|
|
|
union AqlHeader
|
|
{
|
|
struct
|
|
{
|
|
uint16_t type : 8;
|
|
uint16_t barrier : 1;
|
|
uint16_t acquire : 2;
|
|
uint16_t release : 2;
|
|
uint16_t reserved : 3;
|
|
};
|
|
uint16_t raw = 0;
|
|
};
|
|
|
|
struct BarrierValue
|
|
{
|
|
AqlHeader header = {};
|
|
uint8_t AmdFormat = 0;
|
|
uint8_t reserved = 0;
|
|
uint32_t reserved1 = 0;
|
|
hsa_signal_t signal = {};
|
|
hsa_signal_value_t value = 0;
|
|
hsa_signal_value_t mask = 0;
|
|
uint32_t cond = 0;
|
|
uint32_t reserved2 = 0;
|
|
uint64_t reserved3 = 0;
|
|
uint64_t reserved4 = 0;
|
|
hsa_signal_t completion_signal = {};
|
|
};
|
|
|
|
union Aql
|
|
{
|
|
AqlHeader header;
|
|
hsa_kernel_dispatch_packet_t dispatch;
|
|
hsa_barrier_and_packet_t barrier_and;
|
|
hsa_barrier_or_packet_t barrier_or;
|
|
BarrierValue barrier_value = {};
|
|
};
|
|
|
|
struct OCLHiddenArgs
|
|
{
|
|
uint64_t offset_x = 0;
|
|
uint64_t offset_y = 0;
|
|
uint64_t offset_z = 0;
|
|
void* printf_buffer = nullptr;
|
|
void* enqueue = nullptr;
|
|
void* enqueue2 = nullptr;
|
|
void* multi_grid = nullptr;
|
|
};
|
|
|
|
bool load_code_object(const std::string& filename, hsa_agent_t agent, CodeObject& code_object)
|
|
{
|
|
hsa_status_t err;
|
|
code_object.file = open(filename.c_str(), O_RDONLY);
|
|
if(code_object.file == -1)
|
|
{
|
|
fprintf(stderr, "%s:%s\n", "Could not load code object", filename.c_str());
|
|
abort();
|
|
return false;
|
|
}
|
|
|
|
err = hsa_code_object_reader_create_from_file(code_object.file, &code_object.code_obj_rdr);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
err = hsa_executable_create_alt(HSA_PROFILE_FULL,
|
|
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
|
|
nullptr,
|
|
&code_object.executable);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
err = hsa_executable_load_agent_code_object(
|
|
code_object.executable, agent, code_object.code_obj_rdr, nullptr, nullptr);
|
|
if(err != HSA_STATUS_SUCCESS) return false;
|
|
|
|
err = hsa_executable_freeze(code_object.executable, nullptr);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
return true;
|
|
}
|
|
|
|
bool load_code_object_memory(const std::string& filename,
|
|
hsa_agent_t agent,
|
|
CodeObject& code_object)
|
|
{
|
|
hsa_status_t err;
|
|
size_t buffer_size = 0;
|
|
std::ifstream code_object_file(filename.c_str(), std::ios::binary | std::ios::ate);
|
|
if(!code_object_file.good()) return false;
|
|
buffer_size = code_object_file.tellg();
|
|
code_object_file.seekg(0, std::ios::beg);
|
|
uint8_t* binary = new(std::nothrow) uint8_t[buffer_size];
|
|
if(binary && !code_object_file.read(reinterpret_cast<char*>(binary), buffer_size))
|
|
{
|
|
delete[] binary;
|
|
binary = nullptr;
|
|
return false;
|
|
}
|
|
|
|
err = hsa_code_object_reader_create_from_memory(
|
|
binary, buffer_size, &code_object.code_obj_rdr);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
err = hsa_executable_create_alt(HSA_PROFILE_FULL,
|
|
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
|
|
nullptr,
|
|
&code_object.executable);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
err = hsa_executable_load_agent_code_object(
|
|
code_object.executable, agent, code_object.code_obj_rdr, nullptr, nullptr);
|
|
if(err != HSA_STATUS_SUCCESS) return false;
|
|
|
|
err = hsa_executable_freeze(code_object.executable, nullptr);
|
|
RET_IF_HSA_ERR(err);
|
|
delete[] binary;
|
|
binary = nullptr;
|
|
return true;
|
|
}
|
|
|
|
bool get_kernel(const CodeObject& code_object,
|
|
const std::string& kernel,
|
|
hsa_agent_t agent,
|
|
Kernel& kern)
|
|
{
|
|
hsa_executable_symbol_t symbol;
|
|
hsa_status_t err = hsa_executable_get_symbol_by_name(
|
|
code_object.executable, kernel.c_str(), &agent, &symbol);
|
|
if(err != HSA_STATUS_SUCCESS)
|
|
{
|
|
err = hsa_executable_get_symbol_by_name(
|
|
code_object.executable, (kernel + ".kd").c_str(), &agent, &symbol);
|
|
if(err != HSA_STATUS_SUCCESS)
|
|
{
|
|
return false;
|
|
}
|
|
}
|
|
printf("\nkernel-name: %s\n", kernel.c_str());
|
|
err = hsa_executable_symbol_get_info(
|
|
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kern.handle);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
return true;
|
|
}
|
|
|
|
// Not for parallel insertion.
|
|
bool submit_packet(hsa_queue_t* queue, Aql& pkt)
|
|
{
|
|
size_t mask = queue->size - 1;
|
|
Aql* ring = static_cast<Aql*>(queue->base_address);
|
|
|
|
uint64_t write = hsa_queue_load_write_index_relaxed(queue);
|
|
uint64_t read = hsa_queue_load_read_index_relaxed(queue);
|
|
if(write - read + 1 > queue->size) return false;
|
|
|
|
Aql& dst = ring[write & mask];
|
|
|
|
uint16_t header = pkt.header.raw;
|
|
pkt.header.raw = dst.header.raw;
|
|
dst = pkt;
|
|
__atomic_store_n(&dst.header.raw, header, __ATOMIC_RELEASE);
|
|
pkt.header.raw = header;
|
|
|
|
hsa_queue_store_write_index_release(queue, write + 1);
|
|
hsa_signal_store_screlease(queue->doorbell_signal, write);
|
|
|
|
return true;
|
|
}
|
|
|
|
void* hsa_malloc(size_t size, const Device::Memory& mem)
|
|
{
|
|
void* ret;
|
|
hsa_status_t err = hsa_amd_memory_pool_allocate(mem.pool, size, 0, &ret);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
err = hsa_amd_agents_allow_access(all_devices.size(), all_devices.data(), nullptr, ret);
|
|
RET_IF_HSA_ERR(err);
|
|
return ret;
|
|
}
|
|
|
|
void* hsa_malloc(size_t size, const Device& dev, bool fine)
|
|
{
|
|
uint32_t index = fine ? dev.fine : dev.coarse;
|
|
assert(index != -1u && "Memory type unavailable.");
|
|
return hsa_malloc(size, dev.pools[index]);
|
|
}
|
|
|
|
bool device_discovery()
|
|
{
|
|
hsa_status_t err;
|
|
|
|
err = hsa_iterate_agents(
|
|
[](hsa_agent_t agent, void* obj) {
|
|
hsa_status_t error;
|
|
|
|
Device dev;
|
|
dev.agent = agent;
|
|
|
|
dev.fine = -1u;
|
|
dev.coarse = -1u;
|
|
MQDependencyTest* _obj = reinterpret_cast<MQDependencyTest*>(obj);
|
|
error = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, dev.name);
|
|
RET_IF_HSA_ERR(error)
|
|
|
|
hsa_device_type_t type;
|
|
error = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type);
|
|
RET_IF_HSA_ERR(error)
|
|
|
|
error = hsa_amd_agent_iterate_memory_pools(
|
|
agent,
|
|
[](hsa_amd_memory_pool_t pool, void* data) {
|
|
auto& pools = *reinterpret_cast<std::vector<Device::Memory>*>(data);
|
|
hsa_status_t status;
|
|
|
|
bool allowed = false;
|
|
status = hsa_amd_memory_pool_get_info(
|
|
pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &allowed);
|
|
|
|
if(!allowed) return HSA_STATUS_SUCCESS;
|
|
|
|
hsa_amd_segment_t segment;
|
|
status = hsa_amd_memory_pool_get_info(
|
|
pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
|
|
RET_IF_HSA_ERR(status)
|
|
|
|
if(segment != HSA_AMD_SEGMENT_GLOBAL) return HSA_STATUS_SUCCESS;
|
|
|
|
uint32_t flags;
|
|
status = hsa_amd_memory_pool_get_info(
|
|
pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags);
|
|
RET_IF_HSA_ERR(status)
|
|
|
|
Device::Memory mem;
|
|
mem.pool = pool;
|
|
mem.fine = ((flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) != 0u);
|
|
mem.kernarg =
|
|
((flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) != 0u);
|
|
|
|
status = hsa_amd_memory_pool_get_info(
|
|
pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, &mem.size);
|
|
RET_IF_HSA_ERR(status)
|
|
|
|
status = hsa_amd_memory_pool_get_info(
|
|
pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &mem.granule);
|
|
RET_IF_HSA_ERR(status)
|
|
|
|
pools.push_back(mem);
|
|
return HSA_STATUS_SUCCESS;
|
|
},
|
|
static_cast<void*>(&dev.pools));
|
|
|
|
if(!dev.pools.empty())
|
|
{
|
|
for(size_t i = 0; i < dev.pools.size(); i++)
|
|
{
|
|
if(dev.pools[i].fine && dev.pools[i].kernarg && dev.fine == -1u)
|
|
dev.fine = i;
|
|
if(dev.pools[i].fine && !dev.pools[i].kernarg) dev.fine = i;
|
|
if(!dev.pools[i].fine) dev.coarse = i;
|
|
}
|
|
|
|
if(type == HSA_DEVICE_TYPE_CPU)
|
|
_obj->cpu.push_back(dev);
|
|
else
|
|
_obj->gpu.push_back(dev);
|
|
|
|
_obj->all_devices.push_back(dev.agent);
|
|
}
|
|
|
|
return HSA_STATUS_SUCCESS;
|
|
},
|
|
this);
|
|
|
|
bool is_break = false;
|
|
for(auto& dev : cpu)
|
|
{
|
|
for(auto& mem : dev.pools)
|
|
{
|
|
if(mem.fine && mem.kernarg)
|
|
{
|
|
kernarg = mem;
|
|
is_break = true;
|
|
break;
|
|
}
|
|
}
|
|
if(is_break) break;
|
|
}
|
|
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
if(cpu.empty() || gpu.empty() || kernarg.pool.handle == 0) return false;
|
|
return true;
|
|
}
|
|
|
|
void search_hasco(const fs::path& directory, std::string& filename)
|
|
{
|
|
for(const auto& entry : fs::directory_iterator(directory))
|
|
{
|
|
if(fs::is_regular_file(entry))
|
|
{
|
|
if(entry.path().filename() == filename)
|
|
{
|
|
filename = entry.path();
|
|
}
|
|
}
|
|
else if(fs::is_directory(entry))
|
|
{
|
|
search_hasco(entry, filename); // Recursive call for subdirectories
|
|
}
|
|
}
|
|
}
|
|
};
|