d64e33520f
Fix some potentially unreleased memory, null value checks, files not closed, and other such issues reported by codeql Change-Id: Ia679aff97a773a642d8c8cbadeae30955554a62e Signed-off-by: Kent Russell <kent.russell@amd.com>
900 خطوط
32 KiB
C++
Executable File
900 خطوط
32 KiB
C++
Executable File
/*
|
|
* =============================================================================
|
|
* ROC Runtime Conformance Release License
|
|
* =============================================================================
|
|
* The University of Illinois/NCSA
|
|
* Open Source License (NCSA)
|
|
*
|
|
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
|
* All rights reserved.
|
|
*
|
|
* Developed by:
|
|
*
|
|
* AMD Research and AMD ROC Software Development
|
|
*
|
|
* Advanced Micro Devices, Inc.
|
|
*
|
|
* www.amd.com
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
* of this software and associated documentation files (the "Software"), to
|
|
* deal with the Software without restriction, including without limitation
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
*
|
|
* - Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimers.
|
|
* - Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimers in
|
|
* the documentation and/or other materials provided with the distribution.
|
|
* - Neither the names of <Name of Development Group, Name of Institution>,
|
|
* nor the names of its contributors may be used to endorse or promote
|
|
* products derived from this Software without specific prior written
|
|
* permission.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
|
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
|
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
* DEALINGS WITH THE SOFTWARE.
|
|
*
|
|
*/
|
|
|
|
#include <assert.h>
|
|
#include <stdint.h>
|
|
#include <string.h>
|
|
#include <fcntl.h>
|
|
#include <unistd.h>
|
|
#include <string>
|
|
#include <iostream>
|
|
#include <climits>
|
|
#include "hsa/hsa.h"
|
|
#include "hsa/hsa_ext_amd.h"
|
|
|
|
#define RET_IF_HSA_ERR(err) { \
|
|
if ((err) != HSA_STATUS_SUCCESS) { \
|
|
std::cout << "hsa api call failure at line " << __LINE__ << ", file: " << \
|
|
__FILE__ << ". Call returned " << err << std::endl; \
|
|
return (err); \
|
|
} \
|
|
}
|
|
|
|
#ifndef ROCRTST_EMULATOR_BUILD
|
|
static const uint32_t kBinarySearchLength = 512;
|
|
static const uint32_t kBinarySearchFindMe = 108;
|
|
static const uint32_t kWorkGroupSize = 256;
|
|
#else
|
|
static const uint32_t kBinarySearchLength = 16;
|
|
static const uint32_t kBinarySearchFindMe = 6;
|
|
static const uint32_t kWorkGroupSize = 8;
|
|
#endif
|
|
|
|
// Hold all the info specific to binary search
|
|
typedef struct BinarySearch {
|
|
// Binary Search parameters
|
|
uint32_t length;
|
|
uint32_t work_group_size;
|
|
uint32_t work_grid_size;
|
|
uint32_t num_sub_divisions;
|
|
uint32_t find_me;
|
|
|
|
// Buffers needed for this application
|
|
uint32_t* input;
|
|
uint32_t* input_arr;
|
|
uint32_t* input_arr_local;
|
|
uint32_t* output;
|
|
// Keneral argument buffers and addresses
|
|
void* kern_arg_buffer; // Begin of allocated memory
|
|
// this pointer to be deallocated
|
|
void* kern_arg_address; // Properly aligned address to be used in aql
|
|
// packet (don't use for deallocation)
|
|
|
|
// Kernel code
|
|
std::string kernel_file_name;
|
|
std::string kernel_name;
|
|
uint32_t kernarg_size;
|
|
uint32_t kernarg_align;
|
|
|
|
// HSA/RocR objects needed for this application
|
|
hsa_agent_t gpu_dev;
|
|
hsa_agent_t cpu_dev;
|
|
hsa_signal_t signal;
|
|
hsa_queue_t* queue;
|
|
hsa_amd_memory_pool_t cpu_pool;
|
|
hsa_amd_memory_pool_t gpu_pool;
|
|
hsa_amd_memory_pool_t kern_arg_pool;
|
|
|
|
// Other items we need to populate AQL packet
|
|
uint64_t kernel_object;
|
|
uint32_t group_segment_size; ///< Kernel group seg size
|
|
uint32_t private_segment_size; ///< Kernel private seg size
|
|
} BinarySearch;
|
|
|
|
void InitializeBinarySearch(BinarySearch* bs) {
|
|
bs->kernel_file_name = "./binary_search_kernels.hsaco";
|
|
bs->kernel_name = "binarySearch.kd";
|
|
bs->length = kBinarySearchLength;
|
|
bs->find_me = kBinarySearchFindMe;
|
|
bs->work_group_size = kWorkGroupSize;
|
|
bs->num_sub_divisions = bs->length / bs->work_group_size;
|
|
}
|
|
|
|
// This function is called by the call-back functions used to find an agent of
|
|
// the specified hsa_device_type_t. Note that it cannot be called directly from
|
|
// hsa_iterate_agents() as it does not match the prototype of the call-back
|
|
// function. It must be wrapped by a function with the correct prototype.
|
|
//
|
|
// Return values:
|
|
// HSA_STATUS_INFO_BREAK -- "agent" is of the specified type (dev_type)
|
|
// HSA_STATUS_SUCCESS -- "agent" is not of the specified type
|
|
// Other -- Some error occurred
|
|
static hsa_status_t FindAgent(hsa_agent_t agent, void* data,
|
|
hsa_device_type_t dev_type) {
|
|
if (data == nullptr) {
|
|
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
|
}
|
|
|
|
// See if the provided agent matches the input type (dev_type)
|
|
hsa_device_type_t hsa_device_type;
|
|
hsa_status_t hsa_error_code = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE,
|
|
&hsa_device_type);
|
|
RET_IF_HSA_ERR(hsa_error_code);
|
|
|
|
if (hsa_device_type == dev_type) {
|
|
*(reinterpret_cast<hsa_agent_t*>(data)) = agent;
|
|
return HSA_STATUS_INFO_BREAK;
|
|
}
|
|
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
// This is the call-back function used to find a GPU type agent. Note that the
|
|
// prototype of this function is dictated by the HSA specification
|
|
hsa_status_t FindGPUDevice(hsa_agent_t agent, void* data) {
|
|
return FindAgent(agent, data, HSA_DEVICE_TYPE_GPU);
|
|
}
|
|
|
|
// This is the call-back function used to find a CPU type agent. Note that the
|
|
// prototype of this function is dictated by the HSA specification
|
|
hsa_status_t FindCPUDevice(hsa_agent_t agent, void* data) {
|
|
return FindAgent(agent, data, HSA_DEVICE_TYPE_CPU);
|
|
}
|
|
|
|
// Find the CPU and GPU agents we need to run this sample, and save them in the
|
|
// BinarySearch structure for later use.
|
|
hsa_status_t FindDevices(BinarySearch* bs) {
|
|
hsa_status_t err;
|
|
|
|
// Note that hsa_iterate_agents iterate through all known agents until
|
|
// HSA_STATUS_SUCCESS is not returned. The call-backs are implemented such
|
|
// that HSA_STATUS_INFO_BREAK means we found an agent of the specified type.
|
|
// This value is returned by hsa_iterate_agents.
|
|
bs->gpu_dev.handle = 0;
|
|
err = hsa_iterate_agents(FindGPUDevice, &bs->gpu_dev);
|
|
|
|
if (err != HSA_STATUS_INFO_BREAK) {
|
|
return HSA_STATUS_ERROR;
|
|
}
|
|
|
|
bs->cpu_dev.handle = 0;
|
|
err = hsa_iterate_agents(FindCPUDevice, &bs->cpu_dev);
|
|
|
|
if (err != HSA_STATUS_INFO_BREAK) {
|
|
return HSA_STATUS_ERROR;
|
|
}
|
|
|
|
if (0 == bs->gpu_dev.handle) {
|
|
std::cout << "GPU Device is not Created properly!" << std::endl;
|
|
RET_IF_HSA_ERR(HSA_STATUS_ERROR);
|
|
}
|
|
|
|
if (0 == bs->cpu_dev.handle) {
|
|
std::cout << "CPU Device is not Created properly!" << std::endl;
|
|
RET_IF_HSA_ERR(HSA_STATUS_ERROR);
|
|
}
|
|
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
// This function checks to see if the provided
|
|
// pool has the HSA_AMD_SEGMENT_GLOBAL property. If the kern_arg flag is true,
|
|
// the function adds an additional requirement that the pool have the
|
|
// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT property. If kern_arg is false,
|
|
// pools must NOT have this property.
|
|
// Upon finding a pool that meets these conditions, HSA_STATUS_INFO_BREAK is
|
|
// returned. HSA_STATUS_SUCCESS is returned if no errors were encountered, but
|
|
// no pool was found meeting the requirements. If an error is encountered, we
|
|
// return that error.
|
|
|
|
// Note that this function does not match the required prototype for the
|
|
// hsa_amd_agent_iterate_memory_pools call back function, and therefore must be
|
|
// wrapped by a function with the correct prototype.
|
|
static hsa_status_t
|
|
FindGlobalPool(hsa_amd_memory_pool_t pool, void* data, bool kern_arg) {
|
|
hsa_status_t err;
|
|
hsa_amd_segment_t segment;
|
|
uint32_t flag;
|
|
|
|
if (nullptr == data) {
|
|
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
|
}
|
|
|
|
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT,
|
|
&segment);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
if (HSA_AMD_SEGMENT_GLOBAL != segment) {
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
err = hsa_amd_memory_pool_get_info(pool,
|
|
HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
uint32_t karg_st = flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT;
|
|
|
|
if ((karg_st == 0 && kern_arg) ||
|
|
(karg_st != 0 && !kern_arg)) {
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
*(reinterpret_cast<hsa_amd_memory_pool_t*>(data)) = pool;
|
|
return HSA_STATUS_INFO_BREAK;
|
|
}
|
|
|
|
// This is the call-back function for hsa_amd_agent_iterate_memory_pools() that
|
|
// finds a pool with the properties of HSA_AMD_SEGMENT_GLOBAL and that is NOT
|
|
// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT
|
|
hsa_status_t FindStandardPool(hsa_amd_memory_pool_t pool, void* data) {
|
|
return FindGlobalPool(pool, data, false);
|
|
}
|
|
|
|
// This is the call-back function for hsa_amd_agent_iterate_memory_pools() that
|
|
// finds a pool with the properties of HSA_AMD_SEGMENT_GLOBAL and that IS
|
|
// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT
|
|
hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) {
|
|
return FindGlobalPool(pool, data, true);
|
|
}
|
|
|
|
// Find memory pools that we will need to allocate from for this sample
|
|
// application. We will need memory associated with the host CPU, the GPU
|
|
// executing the kernels, and for kernel arguments. This function will
|
|
// save the found pools to the BinarySearch structure for use elsewhere
|
|
// in this program.
|
|
hsa_status_t FindPools(BinarySearch* bs) {
|
|
hsa_status_t err;
|
|
|
|
err = hsa_amd_agent_iterate_memory_pools(bs->cpu_dev, FindStandardPool,
|
|
&bs->cpu_pool);
|
|
|
|
if (err != HSA_STATUS_INFO_BREAK) {
|
|
return HSA_STATUS_ERROR;
|
|
}
|
|
|
|
err = hsa_amd_agent_iterate_memory_pools(bs->gpu_dev, FindStandardPool,
|
|
&bs->gpu_pool);
|
|
|
|
if (err != HSA_STATUS_INFO_BREAK) {
|
|
return HSA_STATUS_ERROR;
|
|
}
|
|
|
|
err = hsa_amd_agent_iterate_memory_pools(bs->cpu_dev,
|
|
FindKernArgPool, &bs->kern_arg_pool);
|
|
|
|
if (err != HSA_STATUS_INFO_BREAK) {
|
|
return HSA_STATUS_ERROR;
|
|
}
|
|
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
// Once the needed memory pools have been found and the BinarySearch structure
|
|
// has been updated with these handles, this function is then used to allocate
|
|
// memory from those pools.
|
|
// Devices with which a pool is associated already have access to the pool.
|
|
// However, other devices may also need to read or write to that memory. Below,
|
|
// we see how we can grant access to other devices to address this issue.
|
|
hsa_status_t AllocateAndInitBuffers(BinarySearch* bs) {
|
|
hsa_status_t err;
|
|
uint32_t out_length = 4 * sizeof(uint32_t);
|
|
uint32_t in_length = bs->num_sub_divisions * 2 * sizeof(uint32_t);
|
|
|
|
// In all of these examples, we want both the cpu and gpu to have access to
|
|
// the buffer in question. We use the array of agents below in the susequent
|
|
// calls to hsa_amd_agents_allow_access() for this purpose.
|
|
hsa_agent_t ag_list[2] = {bs->gpu_dev, bs->cpu_dev};
|
|
|
|
err = hsa_amd_memory_pool_allocate(bs->cpu_pool, in_length, 0,
|
|
reinterpret_cast<void**>(&bs->input));
|
|
RET_IF_HSA_ERR(err);
|
|
err = hsa_amd_agents_allow_access(2, ag_list, NULL, bs->input);
|
|
RET_IF_HSA_ERR(err);
|
|
(void)memset(bs->input, 0, in_length);
|
|
|
|
err = hsa_amd_memory_pool_allocate(bs->cpu_pool, out_length, 0,
|
|
reinterpret_cast<void**>(&bs->output));
|
|
RET_IF_HSA_ERR(err);
|
|
err = hsa_amd_agents_allow_access(2, ag_list, NULL, bs->output);
|
|
RET_IF_HSA_ERR(err);
|
|
(void)memset(bs->input, 0, in_length);
|
|
|
|
err = hsa_amd_memory_pool_allocate(bs->cpu_pool, in_length, 0,
|
|
reinterpret_cast<void**>(&bs->input_arr));
|
|
RET_IF_HSA_ERR(err);
|
|
err = hsa_amd_agents_allow_access(2, ag_list, NULL, bs->input_arr);
|
|
RET_IF_HSA_ERR(err);
|
|
(void)memset(bs->input, 0, in_length);
|
|
|
|
err = hsa_amd_memory_pool_allocate(bs->cpu_pool, in_length, 0,
|
|
reinterpret_cast<void**>(&bs->input_arr_local));
|
|
RET_IF_HSA_ERR(err);
|
|
err = hsa_amd_agents_allow_access(2, ag_list, NULL, bs->input_arr_local);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
// Binary-search application specific code...
|
|
// Initialize input buffer with random values in an increasing order
|
|
uint32_t max = bs->length * 20;
|
|
bs->input[0] = 0;
|
|
|
|
uint32_t seed = (unsigned int)time(NULL);
|
|
srand(seed);
|
|
|
|
for (uint32_t i = 1; i < bs->length; ++i) {
|
|
bs->input[i] = bs->input[i - 1] +
|
|
static_cast<uint32_t>(max * rand_r(&seed) / static_cast<float>(RAND_MAX));
|
|
}
|
|
|
|
// #define VERBOSE 1
|
|
#ifdef VERBOSE
|
|
std::cout << "Input array values:" << std::endl;
|
|
|
|
for (uint32_t i = 0; i < bs->length; ++i) {
|
|
std::cout << "input[" << i << "] = " << bs->input[i] << " ";
|
|
|
|
if (i % 4 == 0) {
|
|
std::cout << std::endl;
|
|
}
|
|
}
|
|
|
|
std::cout << std::endl;
|
|
#endif
|
|
|
|
return err;
|
|
}
|
|
|
|
// The code in this function illustrates how to load a kernel from
|
|
// pre-compiled code. The goal is to get a handle that can be later
|
|
// used in an AQL packet and also to extract information about kernel
|
|
// that we will need. All of the information hand kernel handle will
|
|
// be saved to the BinarySearch structure. It will be used when we
|
|
// populate the AQL packet.
|
|
hsa_status_t LoadKernelFromObjFile(BinarySearch* bs) {
|
|
hsa_status_t err;
|
|
hsa_code_object_reader_t code_obj_rdr = {0};
|
|
hsa_executable_t executable = {0};
|
|
|
|
hsa_file_t file_handle = open(bs->kernel_file_name.c_str(), O_RDONLY);
|
|
|
|
if (file_handle == -1) {
|
|
char agent_name[64];
|
|
err = hsa_agent_get_info(bs->gpu_dev, HSA_AGENT_INFO_NAME, agent_name);
|
|
RET_IF_HSA_ERR(err);
|
|
std::string fileName = std::string("./") + agent_name + "/" + bs->kernel_file_name;
|
|
hsa_file_t file_handle = open(fileName.c_str(), O_RDONLY);
|
|
}
|
|
|
|
if (file_handle == -1) {
|
|
std::cout << "failed to open " << bs->kernel_file_name.c_str() <<
|
|
" at line " << __LINE__ << ", errno: " << errno << std::endl;
|
|
return HSA_STATUS_ERROR;
|
|
}
|
|
|
|
err = hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr);
|
|
close(file_handle);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
err = hsa_executable_create_alt(HSA_PROFILE_FULL,
|
|
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, &executable);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
err = hsa_executable_load_agent_code_object(executable, bs->gpu_dev,
|
|
code_obj_rdr, NULL, NULL);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
err = hsa_executable_freeze(executable, NULL);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
hsa_executable_symbol_t kern_sym;
|
|
err = hsa_executable_get_symbol(executable, NULL, bs->kernel_name.c_str(),
|
|
bs->gpu_dev, 0, &kern_sym);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
err = hsa_executable_symbol_get_info(kern_sym,
|
|
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
|
|
&bs->kernel_object);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
err = hsa_executable_symbol_get_info(kern_sym,
|
|
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
|
|
&bs->private_segment_size);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
err = hsa_executable_symbol_get_info(kern_sym,
|
|
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
|
|
&bs->group_segment_size);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
// Remaining queries not supported on code object v3.
|
|
err = hsa_executable_symbol_get_info(kern_sym,
|
|
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
|
|
&bs->kernarg_size);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
err = hsa_executable_symbol_get_info(kern_sym,
|
|
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT,
|
|
&bs->kernarg_align);
|
|
RET_IF_HSA_ERR(err);
|
|
assert(bs->kernarg_align >= 16 && "Reported kernarg size is too small.");
|
|
bs->kernarg_align = (bs->kernarg_align == 0) ? 16 : bs->kernarg_align;
|
|
|
|
return err;
|
|
}
|
|
|
|
// This function shows how to do an asynchronous copy. We have to create a
|
|
// signal and use the signal to notify us when the copy has completed.
|
|
hsa_status_t AgentMemcpy(void* dst, const void* src,
|
|
size_t size, hsa_agent_t dst_ag, hsa_agent_t src_ag) {
|
|
hsa_signal_t s;
|
|
hsa_status_t err;
|
|
|
|
err = hsa_signal_create(1, 0, NULL, &s);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
err = hsa_amd_memory_async_copy(dst, dst_ag, src, src_ag, size, 0, NULL, s);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
if (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1,
|
|
UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0) {
|
|
err = HSA_STATUS_ERROR;
|
|
std::cout << "Async copy signal error" << std::endl;
|
|
|
|
RET_IF_HSA_ERR(err);
|
|
}
|
|
|
|
err = hsa_signal_destroy(s);
|
|
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
return err;
|
|
}
|
|
|
|
// AlignDown and AlignUp are 2 utility functions we use to find an aligned
|
|
// boundary either below or above a given value (address). The function will
|
|
// return a value that has the specified alignment.
|
|
static intptr_t
|
|
AlignDown(intptr_t value, size_t alignment) {
|
|
assert(alignment != 0 && "Zero alignment");
|
|
return (intptr_t) (value & ~(alignment - 1));
|
|
}
|
|
static void*
|
|
AlignUp(void* value, size_t alignment) {
|
|
return reinterpret_cast<void*>(AlignDown((uintptr_t)
|
|
(reinterpret_cast<uintptr_t>(value) + alignment - 1), alignment));
|
|
}
|
|
|
|
// This function populates the AQL patch with the information
|
|
// we have collected and stored in the BinarySearch structure thus far.
|
|
void PopulateAQLPacket(BinarySearch const* bs,
|
|
hsa_kernel_dispatch_packet_t* aql) {
|
|
aql->header = 0; // Dummy val. for now. Set this right before doorbell ring
|
|
aql->setup = 1;
|
|
aql->workgroup_size_x = bs->work_group_size;
|
|
aql->workgroup_size_y = 1;
|
|
aql->workgroup_size_z = 1;
|
|
aql->grid_size_x = bs->work_grid_size;
|
|
aql->grid_size_y = 1;
|
|
aql->grid_size_z = 1;
|
|
aql->private_segment_size = bs->private_segment_size;
|
|
aql->group_segment_size = bs->group_segment_size;
|
|
aql->kernel_object = bs->kernel_object;
|
|
aql->kernarg_address = bs->kern_arg_address;
|
|
aql->completion_signal = bs->signal;
|
|
|
|
return;
|
|
}
|
|
/*
|
|
* Write everything in the provided AQL packet to the queue except the first 32
|
|
* bits which include the header and setup fields. That should be done
|
|
* last.
|
|
*/
|
|
void WriteAQLToQueue(hsa_kernel_dispatch_packet_t const* in_aql,
|
|
hsa_queue_t* q) {
|
|
void* queue_base = q->base_address;
|
|
const uint32_t queue_mask = q->size - 1;
|
|
uint64_t que_idx = hsa_queue_add_write_index_relaxed(q, 1);
|
|
|
|
hsa_kernel_dispatch_packet_t* queue_aql_packet;
|
|
|
|
queue_aql_packet =
|
|
&(reinterpret_cast<hsa_kernel_dispatch_packet_t*>(queue_base))
|
|
[que_idx & queue_mask];
|
|
|
|
queue_aql_packet->workgroup_size_x = in_aql->workgroup_size_x;
|
|
queue_aql_packet->workgroup_size_y = in_aql->workgroup_size_y;
|
|
queue_aql_packet->workgroup_size_z = in_aql->workgroup_size_z;
|
|
queue_aql_packet->grid_size_x = in_aql->grid_size_x;
|
|
queue_aql_packet->grid_size_y = in_aql->grid_size_y;
|
|
queue_aql_packet->grid_size_z = in_aql->grid_size_z;
|
|
queue_aql_packet->private_segment_size = in_aql->private_segment_size;
|
|
queue_aql_packet->group_segment_size = in_aql->group_segment_size;
|
|
queue_aql_packet->kernel_object = in_aql->kernel_object;
|
|
queue_aql_packet->kernarg_address = in_aql->kernarg_address;
|
|
queue_aql_packet->completion_signal = in_aql->completion_signal;
|
|
}
|
|
|
|
// This function allocates memory from the kern_arg pool we already found, and
|
|
// then sets the argument values needed by the kernel code.
|
|
hsa_status_t AllocAndSetKernArgs(BinarySearch* bs, void* args,
|
|
size_t arg_size, void** aql_buf_ptr) {
|
|
void* kern_arg_buf = nullptr;
|
|
hsa_status_t err;
|
|
size_t buf_size;
|
|
size_t req_align;
|
|
|
|
// The kernel code must be written to memory at the correct alignment. We
|
|
// already queried the executable to get the correct alignment, which is
|
|
// stored in bs->kernarg_align. In case the memory returned from
|
|
// hsa_amd_memory_pool is not of the correct alignment, we request a little
|
|
// more than what we need in case we need to adjust.
|
|
req_align = bs->kernarg_align;
|
|
// Allocate enough extra space for alignment adjustments if ncessary
|
|
buf_size = arg_size + (req_align << 1);
|
|
|
|
err = hsa_amd_memory_pool_allocate(bs->kern_arg_pool, buf_size, 0,
|
|
reinterpret_cast<void**>(&kern_arg_buf));
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
// Address of the allocated buffer
|
|
bs->kern_arg_buffer = kern_arg_buf;
|
|
|
|
// Addr. of kern arg start.
|
|
bs->kern_arg_address = AlignUp(kern_arg_buf, req_align);
|
|
|
|
assert(arg_size >= bs->kernarg_size);
|
|
assert(((uintptr_t)bs->kern_arg_address + arg_size) <
|
|
((uintptr_t)bs->kern_arg_buffer + buf_size));
|
|
|
|
(void)memcpy(bs->kern_arg_address, args, arg_size);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
// Make sure both the CPU and GPU can access the kernel arguments
|
|
hsa_agent_t ag_list[2] = {bs->gpu_dev, bs->cpu_dev};
|
|
err = hsa_amd_agents_allow_access(2, ag_list, NULL, bs->kern_arg_buffer);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
// Save this info in our BinarySearch structure for later.
|
|
*aql_buf_ptr = bs->kern_arg_address;
|
|
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
// This wrapper atomically writes the provided header and setup to the
|
|
// provided AQL packet. The provided AQL packet address should be in the
|
|
// queue memory space.
|
|
inline void AtomicSetPacketHeader(uint16_t header, uint16_t setup,
|
|
hsa_kernel_dispatch_packet_t* queue_packet) {
|
|
__atomic_store_n(reinterpret_cast<uint32_t*>(queue_packet),
|
|
header | (setup << 16), __ATOMIC_RELEASE);
|
|
}
|
|
|
|
// Once all the required data for kernel execution is collected (in this
|
|
// application it is stored in the BinarySearch structure) we can put it in
|
|
// an AQL packet and ring the queue door bell to tell the command processor to
|
|
// execute it.
|
|
hsa_status_t Run(BinarySearch* bs) {
|
|
hsa_status_t err;
|
|
|
|
std::cout << "Executing kernel " << bs->kernel_name << std::endl;
|
|
|
|
// Adjust the size of workgroup
|
|
// This is mostly application specific.
|
|
if (bs->work_group_size > 64) {
|
|
bs->work_group_size = 64;
|
|
bs->num_sub_divisions = bs->length / bs->work_group_size;
|
|
}
|
|
if (bs->num_sub_divisions < bs->work_group_size) {
|
|
bs->num_sub_divisions = bs->work_group_size;
|
|
}
|
|
|
|
bs->work_grid_size = bs->num_sub_divisions;
|
|
|
|
// Explanation of BinarySearch algorithm.
|
|
/*
|
|
* Since a plain binary search on the GPU would not achieve much benefit
|
|
* over the GPU we are doing an N'ary search. We split the array into N
|
|
* segments every pass and therefore get log (base N) passes instead of log
|
|
* (base 2) passes.
|
|
*
|
|
* In every pass, only the thread that can potentially have the element we
|
|
* are looking for writes to the output array. For ex: if we are looking to
|
|
* find 4567 in the array and every thread is searching over a segment of
|
|
* 1000 values and the input array is 1, 2, 3, 4,... then the first thread
|
|
* is searching in 1 to 1000, the second one from 1001 to 2000, etc. The
|
|
* first one does not write to the output. The second one doesn't either.
|
|
* The fifth one however is from 4001 to 5000. So it can potentially have
|
|
* the element 4567 which lies between them.
|
|
*
|
|
* This particular thread writes to the output the lower bound, upper bound
|
|
* and whether the element equals the lower bound element. So, it would be
|
|
* 4001, 5000, 0
|
|
*
|
|
* The next pass would subdivide 4001 to 5000 into smaller segments and
|
|
* continue the same process from there.
|
|
*
|
|
* When a pass returns 1 in the third element, it means the element has been
|
|
* found and we can stop executing the kernel. If the element is not found,
|
|
* then the execution stops after looking at segment of size 1.
|
|
*/
|
|
|
|
uint32_t global_lower_bound = 0;
|
|
uint32_t global_upper_bound = bs->length - 1;
|
|
uint32_t sub_div_size = (global_upper_bound - global_lower_bound + 1) /
|
|
bs->num_sub_divisions;
|
|
|
|
if ((bs->input[0] > bs->find_me) ||
|
|
(bs->input[bs->length - 1] < bs->find_me)) {
|
|
bs->output[0] = 0;
|
|
bs->output[1] = bs->length - 1;
|
|
bs->output[2] = 0;
|
|
std::cout << "Returning too early" << std::endl;
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
bs->output[3] = 1;
|
|
|
|
// Setup the kernel args
|
|
// See the meta-data for the compiled OpenCL kernel code to ascertain
|
|
// the sizes, padding and alignment required for kernel arguments.
|
|
// This can be seen by executing
|
|
// $ amdgcn-amd-amdhsa-readelf -aw ./binary_search_kernels.hsaco
|
|
// The kernel code will expect the following arguments aligned as shown.
|
|
typedef uint32_t uint2[2];
|
|
typedef uint32_t uint4[4];
|
|
struct __attribute__((aligned(16))) local_args_t {
|
|
uint4* outputArray;
|
|
uint2* sortedArray;
|
|
uint32_t findMe;
|
|
uint32_t pad;
|
|
uint64_t global_offset_x;
|
|
uint64_t global_offset_y;
|
|
uint64_t global_offset_z;
|
|
uint64_t printf_buffer;
|
|
uint64_t default_queue;
|
|
uint64_t completion_action;
|
|
} local_args;
|
|
|
|
local_args.outputArray = reinterpret_cast<uint4*>(bs->output);
|
|
local_args.sortedArray = reinterpret_cast<uint2*>(bs->input_arr_local);
|
|
local_args.findMe = bs->find_me;
|
|
local_args.global_offset_x = 0;
|
|
local_args.global_offset_y = 0;
|
|
local_args.global_offset_z = 0;
|
|
local_args.printf_buffer = 0;
|
|
local_args.default_queue = 0;
|
|
local_args.completion_action = 0;
|
|
|
|
// Copy the kernel args structure into kernel arg memory
|
|
err = AllocAndSetKernArgs(bs, &local_args, sizeof(local_args),
|
|
&bs->kern_arg_address);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
// Populate an AQL packet with the info we've gathered
|
|
hsa_kernel_dispatch_packet_t aql;
|
|
PopulateAQLPacket(bs, &aql);
|
|
|
|
uint32_t in_length = bs->num_sub_divisions * 2 * sizeof(uint32_t);
|
|
|
|
while ((sub_div_size > 1) && (bs->output[3] != 0)) {
|
|
for (uint32_t i = 0 ; i < bs->num_sub_divisions; i++) {
|
|
int idx1 = i * sub_div_size;
|
|
int idx2 = ((i + 1) * sub_div_size) - 1;
|
|
bs->input_arr[2 * i] = bs->input[idx1];
|
|
bs->input_arr[2 * i + 1] = bs->input[idx2];
|
|
}
|
|
|
|
// Copy kernel parameter from system memory to local memory
|
|
err = AgentMemcpy(reinterpret_cast<uint8_t*>(bs->input_arr_local),
|
|
reinterpret_cast<uint8_t*>(bs->input_arr),
|
|
in_length, bs->gpu_dev, bs->cpu_dev);
|
|
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
// Reset output buffer to zero
|
|
bs->output[3] = 0;
|
|
|
|
// Dispatch kernel with global work size, work group size with ONE dimesion
|
|
// and wait for kernel to complete
|
|
|
|
// Compute the write index of queue and copy Aql packet into it
|
|
uint64_t que_idx = hsa_queue_load_write_index_relaxed(bs->queue);
|
|
|
|
const uint32_t mask = bs->queue->size - 1;
|
|
|
|
// This function simply copies the data we've collected so far into our
|
|
// local AQL packet, except the the setup and header fields.
|
|
WriteAQLToQueue(&aql, bs->queue);
|
|
|
|
uint32_t aql_header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
|
|
aql_header |= HSA_FENCE_SCOPE_SYSTEM <<
|
|
HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
|
|
aql_header |= HSA_FENCE_SCOPE_SYSTEM <<
|
|
HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
|
|
|
|
// Set the packet's type, acquire and release fences. This should be done
|
|
// atomically after all the other fields have been set, using release
|
|
// memory ordering to ensure all the fields are set when the door bell
|
|
// signal is activated.
|
|
void* q_base = bs->queue->base_address;
|
|
|
|
AtomicSetPacketHeader(aql_header, aql.setup,
|
|
&(reinterpret_cast<hsa_kernel_dispatch_packet_t*>
|
|
(q_base))[que_idx & mask]);
|
|
|
|
// Increment the write index and ring the doorbell to dispatch kernel.
|
|
hsa_queue_store_write_index_relaxed(bs->queue, (que_idx + 1));
|
|
hsa_signal_store_relaxed(bs->queue->doorbell_signal, que_idx);
|
|
|
|
// Wait on the dispatch signal until the kernel is finished.
|
|
// Modify the wait condition to HSA_WAIT_STATE_ACTIVE (instead of
|
|
// HSA_WAIT_STATE_BLOCKED) if polling is needed instead of blocking, as we
|
|
// have below.
|
|
// The call below will block until the condition is met. Below we have said
|
|
// the condition is that the signal value (initiailzed to 1) associated with
|
|
// the queue is less than 1. When the kernel associated with the queued AQL
|
|
// packet has completed execution, the signal value is automatically
|
|
// decremented by the packet processor.
|
|
hsa_signal_value_t value = hsa_signal_wait_scacquire(bs->signal,
|
|
HSA_SIGNAL_CONDITION_LT, 1,
|
|
UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
|
|
|
|
// value should be 0, or we timed-out
|
|
if (value) {
|
|
std::cout << "Timed out waiting for kernel to complete?" << std::endl;
|
|
RET_IF_HSA_ERR(HSA_STATUS_ERROR);
|
|
}
|
|
|
|
// Reset the signal to its initial value for the next iteration
|
|
hsa_signal_store_screlease(bs->signal, 1);
|
|
|
|
// Binary search algorithm stuff...
|
|
global_lower_bound = bs->output[0] * sub_div_size;
|
|
global_upper_bound = global_lower_bound + sub_div_size - 1;
|
|
sub_div_size = (global_upper_bound - global_lower_bound + 1) /
|
|
bs->num_sub_divisions;
|
|
}
|
|
|
|
uint32_t element_index = UINT_MAX;
|
|
|
|
for (uint32_t i = global_lower_bound; i <= global_upper_bound; i++) {
|
|
if (bs->input[i] == bs->find_me) {
|
|
element_index = i;
|
|
bs->output[0] = i;
|
|
bs->output[1] = i + 1;
|
|
bs->output[2] = 1;
|
|
break;
|
|
}
|
|
|
|
// Element is not found in region specified
|
|
// by global lower bound to global upper bound
|
|
bs->output[2] = 0;
|
|
}
|
|
|
|
uint32_t is_elem_found = bs->output[2];
|
|
|
|
std::cout << "Lower bound = " << global_lower_bound << std::endl;
|
|
std::cout << "Upper bound = " << global_upper_bound << std::endl;
|
|
std::cout << "Element search for = " << bs->find_me << std::endl;
|
|
|
|
|
|
if (is_elem_found == 1) {
|
|
std::cout << "Element found at index " << element_index << std::endl;
|
|
} else {
|
|
std::cout << "Element value " << bs->find_me << " not found" << std::endl;
|
|
}
|
|
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
// Release all the RocR resources we have acquired in this application.
|
|
hsa_status_t CleanUp(BinarySearch* bs) {
|
|
hsa_status_t err;
|
|
|
|
err = hsa_amd_memory_pool_free(bs->input);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
err = hsa_amd_memory_pool_free(bs->output);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
err = hsa_amd_memory_pool_free(bs->input_arr);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
err = hsa_amd_memory_pool_free(bs->kern_arg_buffer);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
err = hsa_queue_destroy(bs->queue);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
err = hsa_signal_destroy(bs->signal);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
err = hsa_shut_down();
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
int main(int argc, char* argv[]) {
|
|
// This BinarySearch structure (bs) below holds all of the appl. specific
|
|
// info we need to run the sample. This includes algorithm specific
|
|
// information as well as handles to RocR/HSA objects.
|
|
|
|
// The basic structure of this sample is to fill in this structure with the
|
|
// required RocR/HSA handles to RocR resources (e.g., agents, memory pools,
|
|
// queues, etc.) and then dispatch the packets to the queue, and examine the
|
|
// output.
|
|
|
|
BinarySearch bs;
|
|
hsa_status_t err;
|
|
|
|
// Set some working values specific to this application
|
|
InitializeBinarySearch(&bs);
|
|
|
|
// hsa_init() initializes internal data structures and causes devices
|
|
// (agents), memory pools and other resources to be discovered.
|
|
err = hsa_init();
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
// Find the agents needed for the sample
|
|
err = FindDevices(&bs);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
// Create the completion signal used when dispatching a packet
|
|
err = hsa_signal_create(1, 0, NULL, &bs.signal);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
// Create a queue to submit our binary search AQL packets
|
|
err = hsa_queue_create(bs.gpu_dev, 128, HSA_QUEUE_TYPE_MULTI, NULL, NULL,
|
|
UINT32_MAX, UINT32_MAX, &bs.queue);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
// Find the HSA memory pools we need to run this sample
|
|
err = FindPools(&bs);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
// Allocate memory from the correct memory pool, and initialize them as
|
|
// neeeded for the algorihm.
|
|
err = AllocateAndInitBuffers(&bs);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
// Create a kernel object from the pre-compiled kernel, and read some
|
|
// attributes associated with the kernel that we will need.
|
|
err = LoadKernelFromObjFile(&bs);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
// Fill in the AQL packet, assign the kernel arguments, enqueue the packet,
|
|
// "ring" the doorbell, and wait for completion.
|
|
err = Run(&bs);
|
|
RET_IF_HSA_ERR(err);
|
|
|
|
// Release all the RocR resources we've acquired and shutdown HSA.
|
|
err = CleanUp(&bs);
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
#undef RET_IF_HSA_ERR
|