Merge branch 'master' into implicit-kernarg

此提交包含在:
Maneesh Gupta
2019-06-04 13:24:19 +05:30
提交者 GitHub
當前提交 d4fa74ff09
共有 31 個檔案被更改,包括 888 行新增和 290 行删除
+64 -20
查看文件
@@ -59,6 +59,18 @@ using namespace std;
// Size and alignment reserved for HIP's implicit kernel arguments.
// NOTE(review): units are presumably bytes; the consumers of these constants
// are not visible in this part of the file -- confirm against their use sites.
static const size_t HIP_IMPLICIT_KERNARG_SIZE = 48;
static const size_t HIP_IMPLICIT_KERNARG_ALIGNMENT = 8;
// Kernel descriptor layout used for code object v3 kernels, i.e. symbols whose
// name carries the ".kd" suffix.
//
// The runtime never constructs this struct; it is overlaid with
// reinterpret_cast on the header pointer of a loaded kernel symbol. Field
// order, field widths and the reserved padding therefore must not be changed.
struct amd_kernel_code_v3_t {
    // Fixed group segment size; the launch path adds the dynamic shared
    // memory size to it to form the dispatch packet's group_segment_size.
    uint32_t group_segment_fixed_size;
    // Fixed private segment size; copied into the dispatch packet's
    // private_segment_size.
    uint32_t private_segment_fixed_size;
    uint8_t reserved0[8];
    // NOTE(review): presumably the byte offset from this descriptor to the
    // kernel's entry point; not read by the code visible here -- confirm.
    int64_t kernel_code_entry_byte_offset;
    uint8_t reserved1[24];
    // The low 6 bits are decoded into the VGPR count when function attributes
    // are queried.
    uint32_t compute_pgm_rsrc1;
    uint32_t compute_pgm_rsrc2;
    uint16_t kernel_code_properties;
    uint8_t reserved2[6];
};

// Guard the overlay: any accidental change to the fields above would make the
// reinterpret_cast'ed view read the wrong bytes without any runtime error.
static_assert(sizeof(amd_kernel_code_v3_t) == 64,
              "amd_kernel_code_v3_t must stay exactly 64 bytes");
// calculate a checksum (64-bit FNV-1a hash, not MD5)
inline std::string checksum(size_t size, const char *source) {
// FNV-1a hashing, 64-bit version
@@ -191,7 +203,7 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
lp.dynamic_group_mem_bytes =
sharedMemBytes; // TODO - this should be part of preLaunchKernel.
hStream = ihipPreLaunchKernel(
hStream, dim3(globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ),
hStream, dim3(globalWorkSizeX/localWorkSizeX, globalWorkSizeY/localWorkSizeY, globalWorkSizeZ/localWorkSizeZ),
dim3(localWorkSizeX, localWorkSizeY, localWorkSizeZ), &lp, f->_name.c_str());
@@ -208,10 +220,20 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
aql.grid_size_x = globalWorkSizeX;
aql.grid_size_y = globalWorkSizeY;
aql.grid_size_z = globalWorkSizeZ;
aql.group_segment_size =
f->_header->workgroup_group_segment_byte_size + sharedMemBytes;
aql.private_segment_size =
f->_header->workitem_private_segment_byte_size;
bool is_code_object_v3 = f->_name.find(".kd") != std::string::npos;
if (is_code_object_v3) {
const auto* header =
reinterpret_cast<const amd_kernel_code_v3_t*>(f->_header);
aql.group_segment_size =
header->group_segment_fixed_size + sharedMemBytes;
aql.private_segment_size =
header->private_segment_fixed_size;
} else {
aql.group_segment_size =
f->_header->workgroup_group_segment_byte_size + sharedMemBytes;
aql.private_segment_size =
f->_header->workitem_private_segment_byte_size;
}
aql.kernel_object = f->_object;
aql.setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
aql.header =
@@ -464,6 +486,12 @@ hipError_t ihipModuleGetFunction(hipFunction_t* func, hipModule_t hmod, const ch
auto kernel = find_kernel_by_name(hmod->executable, name, agent);
if (kernel.handle == 0u) {
std::string name_str(name);
name_str.append(".kd");
kernel = find_kernel_by_name(hmod->executable, name_str.c_str(), agent);
}
if (kernel.handle == 0u) return hipErrorNotFound;
// TODO: refactor the whole ihipThisThat, which is a mess and yields the
@@ -488,7 +516,11 @@ hipError_t hipModuleGetFunctionEx(hipFunction_t* hfunc, hipModule_t hmod,
}
namespace {
hipFuncAttributes make_function_attributes(const amd_kernel_code_t& header) {
// Views the header pointer stored in a module symbol as a code object v3
// kernel descriptor. Nothing is validated here: callers are expected to have
// established that the symbol is a v3 kernel before reading the result.
const amd_kernel_code_v3_t *header_v3(const ihipModuleSymbol_t& kd) {
    using descriptor_ptr = const amd_kernel_code_v3_t*;
    const auto raw_header = kd._header;
    return reinterpret_cast<descriptor_ptr>(raw_header);
}
hipFuncAttributes make_function_attributes(const ihipModuleSymbol_t& kd) {
hipFuncAttributes r{};
hipDeviceProp_t prop{};
@@ -497,16 +529,31 @@ hipFuncAttributes make_function_attributes(const amd_kernel_code_t& header) {
// available per CU, therefore we hardcode it to 64 KiRegisters.
prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;
r.localSizeBytes = header.workitem_private_segment_byte_size;
r.sharedSizeBytes = header.workgroup_group_segment_byte_size;
bool is_code_object_v3 = kd._name.find(".kd") != std::string::npos;
if (is_code_object_v3) {
r.localSizeBytes = header_v3(kd)->private_segment_fixed_size;
r.sharedSizeBytes = header_v3(kd)->group_segment_fixed_size;
} else {
r.localSizeBytes = kd._header->workitem_private_segment_byte_size;
r.sharedSizeBytes = kd._header->workgroup_group_segment_byte_size;
}
r.maxDynamicSharedSizeBytes = prop.sharedMemPerBlock - r.sharedSizeBytes;
r.numRegs = header.workitem_vgpr_count;
if (is_code_object_v3) {
r.numRegs = ((header_v3(kd)->compute_pgm_rsrc1 & 0x3F) + 1) << 2;
} else {
r.numRegs = kd._header->workitem_vgpr_count;
}
r.maxThreadsPerBlock = r.numRegs ?
std::min(prop.maxThreadsPerBlock, prop.regsPerBlock / r.numRegs) :
prop.maxThreadsPerBlock;
r.binaryVersion =
header.amd_machine_version_major * 10 +
header.amd_machine_version_minor;
if (is_code_object_v3) {
r.binaryVersion = 0; // FIXME: should it be the ISA version or code
// object format version?
} else {
r.binaryVersion =
kd._header->amd_machine_version_major * 10 +
kd._header->amd_machine_version_minor;
}
r.ptxVersion = prop.major * 10 + prop.minor; // HIP currently presents itself as PTX 3.0.
return r;
@@ -522,11 +569,10 @@ hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const void* func)
auto agent = this_agent();
auto kd = get_program_state().kernel_descriptor(reinterpret_cast<uintptr_t>(func), agent);
const auto header = kd->_header;
if (!header) throw runtime_error{"Ill-formed Kernel_descriptor."};
if (!kd->_header) throw runtime_error{"Ill-formed Kernel_descriptor."};
*attr = make_function_attributes(*header);
*attr = make_function_attributes(*kd);
return hipSuccess;
}
@@ -557,11 +603,9 @@ hipError_t ihipModuleLoadData(hipModule_t* module, const void* image) {
(*module)->executable = get_program_state().load_executable(
content.data(), content.size(), (*module)->executable,
this_agent());
istringstream elf{content};
ELFIO::elfio reader;
if (reader.load(elf)) {
program_state_impl::read_kernarg_metadata(reader, (*module)->kernargs);
}
std::vector<char> blob(content.cbegin(), content.cend());
program_state_impl::read_kernarg_metadata(blob, (*module)->kernargs);
// compute the hash of the code object
(*module)->hash = checksum(content.length(), content.data());