Files

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

416 baris
15 KiB
C++

2025-02-04 04:05:38 -06:00
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#pragma once
#include "common/filesystem.hpp"
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#include <dlfcn.h>
#include <fcntl.h>
#include <unistd.h>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
namespace fs = common::fs;
#define RET_IF_HSA_ERR(err) \
{ \
if((err) != HSA_STATUS_SUCCESS) \
{ \
char err_val[12]; \
char* err_str = nullptr; \
if(hsa_status_string(err, (const char**) &err_str) != HSA_STATUS_SUCCESS) \
{ \
sprintf(&(err_val[0]), "%#x", (uint32_t) err); \
err_str = &(err_val[0]); \
} \
printf("hsa api call failure at: %s:%d\n", __FILE__, __LINE__); \
printf("Call returned %s\n", err_str); \
abort(); \
} \
}
struct Device
{
struct Memory
{
hsa_amd_memory_pool_t pool;
bool fine;
bool kernarg;
size_t size;
size_t granule;
};
hsa_agent_t agent;
char name[64];
std::vector<Memory> pools;
uint32_t fine;
uint32_t coarse;
static std::vector<hsa_agent_t> all_devices;
};
class MQDependencyTest
{
public:
MQDependencyTest() { hsa_init(); }
~MQDependencyTest() { hsa_shut_down(); }
std::vector<Device> cpu;
std::vector<Device> gpu;
Device::Memory kernarg;
std::vector<hsa_agent_t> all_devices = {};
struct CodeObject
{
hsa_file_t file = 0;
hsa_code_object_reader_t code_obj_rdr = {};
hsa_executable_t executable = {};
};
struct Kernel
{
uint64_t handle = 0;
uint32_t scratch = 0;
uint32_t group = 0;
uint32_t kernarg_size = 0;
uint32_t kernarg_align = 0;
};
union AqlHeader
{
struct
{
uint16_t type : 8;
uint16_t barrier : 1;
uint16_t acquire : 2;
uint16_t release : 2;
uint16_t reserved : 3;
};
uint16_t raw = 0;
};
struct BarrierValue
{
AqlHeader header = {};
uint8_t AmdFormat = 0;
uint8_t reserved = 0;
uint32_t reserved1 = 0;
hsa_signal_t signal = {};
hsa_signal_value_t value = 0;
hsa_signal_value_t mask = 0;
uint32_t cond = 0;
uint32_t reserved2 = 0;
uint64_t reserved3 = 0;
uint64_t reserved4 = 0;
hsa_signal_t completion_signal = {};
};
union Aql
{
AqlHeader header;
hsa_kernel_dispatch_packet_t dispatch;
hsa_barrier_and_packet_t barrier_and;
hsa_barrier_or_packet_t barrier_or;
BarrierValue barrier_value = {};
};
struct OCLHiddenArgs
{
uint64_t offset_x = 0;
uint64_t offset_y = 0;
uint64_t offset_z = 0;
void* printf_buffer = nullptr;
void* enqueue = nullptr;
void* enqueue2 = nullptr;
void* multi_grid = nullptr;
};
bool load_code_object(const std::string& filename, hsa_agent_t agent, CodeObject& code_object)
{
hsa_status_t err;
code_object.file = open(filename.c_str(), O_RDONLY);
if(code_object.file == -1)
{
fprintf(stderr, "%s:%s\n", "Could not load code object", filename.c_str());
abort();
return false;
}
err = hsa_code_object_reader_create_from_file(code_object.file, &code_object.code_obj_rdr);
RET_IF_HSA_ERR(err);
err = hsa_executable_create_alt(HSA_PROFILE_FULL,
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
nullptr,
&code_object.executable);
RET_IF_HSA_ERR(err);
err = hsa_executable_load_agent_code_object(
code_object.executable, agent, code_object.code_obj_rdr, nullptr, nullptr);
if(err != HSA_STATUS_SUCCESS) return false;
err = hsa_executable_freeze(code_object.executable, nullptr);
RET_IF_HSA_ERR(err);
return true;
}
bool load_code_object_memory(const std::string& filename,
hsa_agent_t agent,
CodeObject& code_object)
{
hsa_status_t err;
size_t buffer_size = 0;
std::ifstream code_object_file(filename.c_str(), std::ios::binary | std::ios::ate);
if(!code_object_file.good()) return false;
buffer_size = code_object_file.tellg();
code_object_file.seekg(0, std::ios::beg);
uint8_t* binary = new(std::nothrow) uint8_t[buffer_size];
if(binary && !code_object_file.read(reinterpret_cast<char*>(binary), buffer_size))
{
delete[] binary;
binary = nullptr;
return false;
}
err = hsa_code_object_reader_create_from_memory(
binary, buffer_size, &code_object.code_obj_rdr);
RET_IF_HSA_ERR(err);
err = hsa_executable_create_alt(HSA_PROFILE_FULL,
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
nullptr,
&code_object.executable);
RET_IF_HSA_ERR(err);
err = hsa_executable_load_agent_code_object(
code_object.executable, agent, code_object.code_obj_rdr, nullptr, nullptr);
if(err != HSA_STATUS_SUCCESS) return false;
err = hsa_executable_freeze(code_object.executable, nullptr);
RET_IF_HSA_ERR(err);
delete[] binary;
binary = nullptr;
return true;
}
bool get_kernel(const CodeObject& code_object,
const std::string& kernel,
hsa_agent_t agent,
Kernel& kern)
{
hsa_executable_symbol_t symbol;
hsa_status_t err = hsa_executable_get_symbol_by_name(
code_object.executable, kernel.c_str(), &agent, &symbol);
if(err != HSA_STATUS_SUCCESS)
{
err = hsa_executable_get_symbol_by_name(
code_object.executable, (kernel + ".kd").c_str(), &agent, &symbol);
if(err != HSA_STATUS_SUCCESS)
{
return false;
}
}
printf("\nkernel-name: %s\n", kernel.c_str());
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kern.handle);
RET_IF_HSA_ERR(err);
return true;
}
// Not for parallel insertion.
bool submit_packet(hsa_queue_t* queue, Aql& pkt)
{
size_t mask = queue->size - 1;
Aql* ring = static_cast<Aql*>(queue->base_address);
uint64_t write = hsa_queue_load_write_index_relaxed(queue);
uint64_t read = hsa_queue_load_read_index_relaxed(queue);
if(write - read + 1 > queue->size) return false;
Aql& dst = ring[write & mask];
uint16_t header = pkt.header.raw;
pkt.header.raw = dst.header.raw;
dst = pkt;
__atomic_store_n(&dst.header.raw, header, __ATOMIC_RELEASE);
pkt.header.raw = header;
hsa_queue_store_write_index_release(queue, write + 1);
hsa_signal_store_screlease(queue->doorbell_signal, write);
return true;
}
void* hsa_malloc(size_t size, const Device::Memory& mem)
{
void* ret;
hsa_status_t err = hsa_amd_memory_pool_allocate(mem.pool, size, 0, &ret);
RET_IF_HSA_ERR(err);
err = hsa_amd_agents_allow_access(all_devices.size(), all_devices.data(), nullptr, ret);
RET_IF_HSA_ERR(err);
return ret;
}
void* hsa_malloc(size_t size, const Device& dev, bool fine)
{
uint32_t index = fine ? dev.fine : dev.coarse;
assert(index != -1u && "Memory type unavailable.");
return hsa_malloc(size, dev.pools[index]);
}
bool device_discovery()
{
hsa_status_t err;
err = hsa_iterate_agents(
[](hsa_agent_t agent, void* obj) {
hsa_status_t error;
Device dev;
dev.agent = agent;
dev.fine = -1u;
dev.coarse = -1u;
MQDependencyTest* _obj = reinterpret_cast<MQDependencyTest*>(obj);
error = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, dev.name);
RET_IF_HSA_ERR(error)
hsa_device_type_t type;
error = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type);
RET_IF_HSA_ERR(error)
error = hsa_amd_agent_iterate_memory_pools(
agent,
[](hsa_amd_memory_pool_t pool, void* data) {
auto& pools = *reinterpret_cast<std::vector<Device::Memory>*>(data);
hsa_status_t status;
bool allowed = false;
status = hsa_amd_memory_pool_get_info(
pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &allowed);
if(!allowed) return HSA_STATUS_SUCCESS;
hsa_amd_segment_t segment;
status = hsa_amd_memory_pool_get_info(
pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
RET_IF_HSA_ERR(status)
if(segment != HSA_AMD_SEGMENT_GLOBAL) return HSA_STATUS_SUCCESS;
uint32_t flags;
status = hsa_amd_memory_pool_get_info(
pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags);
RET_IF_HSA_ERR(status)
Device::Memory mem;
mem.pool = pool;
mem.fine = ((flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) != 0u);
mem.kernarg =
((flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) != 0u);
status = hsa_amd_memory_pool_get_info(
pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, &mem.size);
RET_IF_HSA_ERR(status)
status = hsa_amd_memory_pool_get_info(
pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &mem.granule);
RET_IF_HSA_ERR(status)
pools.push_back(mem);
return HSA_STATUS_SUCCESS;
},
static_cast<void*>(&dev.pools));
if(!dev.pools.empty())
{
for(size_t i = 0; i < dev.pools.size(); i++)
{
if(dev.pools[i].fine && dev.pools[i].kernarg && dev.fine == -1u)
dev.fine = i;
if(dev.pools[i].fine && !dev.pools[i].kernarg) dev.fine = i;
if(!dev.pools[i].fine) dev.coarse = i;
}
if(type == HSA_DEVICE_TYPE_CPU)
_obj->cpu.push_back(dev);
else
_obj->gpu.push_back(dev);
_obj->all_devices.push_back(dev.agent);
}
return HSA_STATUS_SUCCESS;
},
this);
bool is_break = false;
for(auto& dev : cpu)
{
for(auto& mem : dev.pools)
{
if(mem.fine && mem.kernarg)
{
kernarg = mem;
is_break = true;
break;
}
}
if(is_break) break;
}
RET_IF_HSA_ERR(err);
if(cpu.empty() || gpu.empty() || kernarg.pool.handle == 0) return false;
return true;
}
void search_hasco(const fs::path& directory, std::string& filename)
{
for(const auto& entry : fs::directory_iterator(directory))
{
if(fs::is_regular_file(entry))
{
if(entry.path().filename() == filename)
{
filename = entry.path();
}
}
else if(fs::is_directory(entry))
{
search_hasco(entry, filename); // Recursive call for subdirectories
}
}
}
};