From 9d9ea2ad3b3bdcc9b6b9871f19b2ed3aa166d54a Mon Sep 17 00:00:00 2001 From: "Ding, Wei (xN/A) TX" Date: Wed, 25 Feb 2015 13:05:59 -0500 Subject: [PATCH] ECR #333755 - Added two HSA samples: MatrixMultiplication, MatrixTranspose [git-p4: depot-paths = "//depot/stg/hsa/drivers/hsa/runtime/": change = 1124951] --- samples/Makefile | 7 + samples/build/Makefile | 9 ++ samples/build/Makefile.samples | 53 +++++++ samples/build/lnx/Makefile | 11 ++ samples/build/lnx64a/Makefile | 11 ++ samples/common/Makefile | 7 + samples/common/common_utility.cpp | 67 +++++++++ samples/common/common_utility.h | 13 ++ samples/common/elf_utils.cpp | 222 +++++++++++++++++++++++++++++ samples/common/elf_utils.h | 41 ++++++ samples/common/hsa_base.cpp | 225 ++++++++++++++++++++++++++++++ samples/common/hsa_base.h | 43 ++++++ samples/common/hsatimer.cpp | 190 +++++++++++++++++++++++++ samples/common/hsatimer.h | 64 +++++++++ samples/common/utilities.cpp | 185 ++++++++++++++++++++++++ samples/common/utilities.h | 207 +++++++++++++++++++++++++++ 16 files changed, 1355 insertions(+) create mode 100644 samples/Makefile create mode 100644 samples/build/Makefile create mode 100644 samples/build/Makefile.samples create mode 100644 samples/build/lnx/Makefile create mode 100644 samples/build/lnx64a/Makefile create mode 100644 samples/common/Makefile create mode 100644 samples/common/common_utility.cpp create mode 100644 samples/common/common_utility.h create mode 100644 samples/common/elf_utils.cpp create mode 100644 samples/common/elf_utils.h create mode 100644 samples/common/hsa_base.cpp create mode 100644 samples/common/hsa_base.h create mode 100644 samples/common/hsatimer.cpp create mode 100644 samples/common/hsatimer.h create mode 100644 samples/common/utilities.cpp create mode 100644 samples/common/utilities.h diff --git a/samples/Makefile b/samples/Makefile new file mode 100644 index 0000000000..47c3a90a10 --- /dev/null +++ b/samples/Makefile @@ -0,0 +1,7 @@ +OPENCL_DEPTH = ../.. + +include $(OPENCL_DEPTH)/runtimenew/runtimedefs + +SUBDIRS = MatrixTranspose MatrixMultiplication + +include $(OPENCL_DEPTH)/runtimenew/runtimerules diff --git a/samples/build/Makefile b/samples/build/Makefile new file mode 100644 index 0000000000..639cb73d07 --- /dev/null +++ b/samples/build/Makefile @@ -0,0 +1,9 @@ +OPENCL_DEPTH = ../../.. + +include $(OPENCL_DEPTH)/hsadefs + +BUILD_SUBDIRS = $(DEFAULT_TARGETS) +BUILD_MAKEFILE = Makefile.samples + + +include $(OPENCL_DEPTH)/hsarules diff --git a/samples/build/Makefile.samples b/samples/build/Makefile.samples new file mode 100644 index 0000000000..204e549733 --- /dev/null +++ b/samples/build/Makefile.samples @@ -0,0 +1,53 @@ +include $(OPENCL_DEPTH)/hsadefs + +EXE_TARGET = Test +vpath %.cpp $(COMPONENT_DEPTH) +CPPFILES := $(notdir $(wildcard $(COMPONENT_DEPTH)/*.cpp)) + +ifdef ATI_BITS_64 + LIB_SUFFIX = 64 + NBITS = 64 +else + LIB_SUFFIX = + ifndef ATI_OS_WINDOWS + NBITS := 32 + endif +endif + +ifdef ATI_OS_WINDOWS + CORE_LIB = dll + LFLAGS += /subsystem:console + LIB_PREFIX = +else + CORE_LIB = so + LIB_PREFIX = lib +endif + +ifdef ATI_OS_LINUX + GCXXOPTS := $(filter-out -fno-rtti,$(GCXXOPTS)) + GCXXOPTS := $(filter-out -fno-exceptions,$(GCXXOPTS)) + LFLAGS += -L$(DIST_LIB_DEST) -lpthread $(LIBSTDCXX) -lm -ldl -lrt +endif + +export BUILD_HSA_TARGET=yes + +LCINCS := $(INCSWITCH) "$(OPENCL_DEPTH)/compiler/finalizer/HSAIL/hsail-tools/libHSAIL" +LCINCS += $(INCSWITCH) "$(OPENCL_DEPTH)/compiler/finalizer/HSAIL/hsail-tools/libHSAIL/$(FULL_BUILD_DIR)" +LCINCS += $(INCSWITCH) "$(OPENCL_DEPTH)/compiler/finalizer/Interface" +LCINCS += $(INCSWITCH) "$(OPENCL_DEPTH)/contrib/gtest-1.6.0/include" +LCINCS += $(INCSWITCH) "$(OPENCL_DEPTH)/runtime/inc" + +LLLIBS := $(OPENCL_DEPTH)/contrib/gtest-1.6.0/$(FULL_BUILD_DIR)/libgtest$(LIB_EXT) +LLLIBS += $(OPENCL_DEPTH)/runtime/test/common/$(FULL_BUILD_DIR)/testcommon$(LIB_EXT) +LLLIBS += $(OPENCL_DEPTH)/runtime/test/gcommon/$(FULL_BUILD_DIR)/gtestcommon$(LIB_EXT) +LLLIBS += $(OPENCL_DEPTH)/compiler/finalizer/HSAIL/hsail-tools/libHSAIL/$(FULL_BUILD_DIR)/libhsail$(LIB_EXT) + +RUNTIME_BUILD = build/$(OS_TYPE)/$(CORE_LIB)/$(BUILD_DIR) + +ifdef ATI_OS_LINUX + LFLAGS += -L$(OPENCL_DEPTH)/runtime/core/$(RUNTIME_BUILD) -lhsa-runtime$(LIB_SUFFIX) +else + LLLIBS += $(OPENCL_DEPTH)/runtime/core/$(RUNTIME_BUILD)/hsa-runtime$(LIB_SUFFIX)$(LIB_EXT) +endif + +include $(OPENCL_DEPTH)/hsarules diff --git a/samples/build/lnx/Makefile b/samples/build/lnx/Makefile new file mode 100644 index 0000000000..6ce4ef59d0 --- /dev/null +++ b/samples/build/lnx/Makefile @@ -0,0 +1,11 @@ +# Generated Makefile lnx + +OPENCL_DEPTH = ..$(BUILD_DIR_DEPTH)/../../.. +SCLIB_DEPTH = ..$(BUILD_DIR_DEPTH)/../../../../../drivers/hsa/compiler/finalizer +MODULE_DEPTH = ../..$(BUILD_DIR_DEPTH) +COMPONENT_DEPTH = $(MODULE_DEPTH) + +OPENCL_TARGET = lnx +SCLIB_TARGET = lnx + +include $(MODULE_DEPTH)/build/Makefile.samples diff --git a/samples/build/lnx64a/Makefile b/samples/build/lnx64a/Makefile new file mode 100644 index 0000000000..3c1d2facdc --- /dev/null +++ b/samples/build/lnx64a/Makefile @@ -0,0 +1,11 @@ +# Generated Makefile lnx64a + +OPENCL_DEPTH = ..$(BUILD_DIR_DEPTH)/../../.. +SCLIB_DEPTH = ..$(BUILD_DIR_DEPTH)/../../../../../drivers/hsa/compiler/finalizer +MODULE_DEPTH = ../..$(BUILD_DIR_DEPTH) +COMPONENT_DEPTH = $(MODULE_DEPTH) + +OPENCL_TARGET = lnx64a +SCLIB_TARGET = lnx64a + +include $(MODULE_DEPTH)/build/Makefile.samples diff --git a/samples/common/Makefile b/samples/common/Makefile new file mode 100644 index 0000000000..ae780dfdcf --- /dev/null +++ b/samples/common/Makefile @@ -0,0 +1,7 @@ +OPENCL_DEPTH = ../../.. + +include $(OPENCL_DEPTH)/runtimenew/runtimedefs + +SUBDIRS = build + +include $(OPENCL_DEPTH)/runtimenew/runtimerules diff --git a/samples/common/common_utility.cpp b/samples/common/common_utility.cpp new file mode 100644 index 0000000000..939aedc854 --- /dev/null +++ b/samples/common/common_utility.cpp @@ -0,0 +1,67 @@ +#include "common_utility.h" + + +double CalcMedian(vector scores) +{ + double median; + size_t size = scores.size(); + + if (size % 2 == 0) + median = (scores[size / 2 - 1] + scores[size / 2]) / 2; + else + median = scores[size / 2]; + + return median; +} + +double CalcMean(vector scores) +{ + double mean; + size_t size = scores.size(); + + for (int i=0; i scores, int score_mean) +{ + double ret = 0.0; + for (int i=0; i scores) +{ + int num_of_concurrent_queues = 0; + vectorexecpted_exec_time_array; + + for (int i=0; i +#include +#include +#include +#include "hsa.h" +#include "elf_utils.h" +#include "hsa_ext_finalize.h" + +enum { + SECTION_HSA_DATA = 0, + SECTION_HSA_CODE, + SECTION_HSA_OPERAND, +}; + +typedef struct SectionDesc SectionDesc; +struct SectionDesc { + int sectionId; + const char *brigName; + const char *bifName; +} + +sectionDescs[] = { + { SECTION_HSA_DATA, "hsa_data",".brig_hsa_data" }, + { SECTION_HSA_CODE, "hsa_code",".brig_hsa_code" }, + { SECTION_HSA_OPERAND,"hsa_operand",".brig_hsa_operand"}, +}; + +extern int fileno(FILE* stream); + +const SectionDesc* get_section_desc(int sectionId) { + const int NUM_PREDEFINED_SECTIONS = sizeof(sectionDescs)/sizeof(sectionDescs[0]); + for(int i=0; id_buf + shdr->sh_name; + if (sectionName && + ((strcmp(sectionName, desc->brigName) == 0) || + (strcmp(sectionName, desc->bifName) == 0))) { + return scn; + } + } + + return NULL; +} + +/* Extract section and copy into HsaBrig */ +static status_t extract_section_and_copy (Elf *elfP, + Elf_Data *secHdr, + const SectionDesc* desc, + hsa_ext_brig_module_t* brig_module, + hsa_ext_brig_section_id_t section_id) { + Elf_Scn* scn = NULL; + Elf_Data* data = NULL; + void* address_to_copy; + size_t section_size=0; + + scn = extract_elf_section(elfP, secHdr, desc); + + if (scn) { + if ((data = elf_getdata(scn, NULL)) == NULL) { + return STATUS_UNKNOWN; + } + section_size = data->d_size; + if (section_size > 0) { + address_to_copy = malloc(section_size); + memcpy(address_to_copy, data->d_buf, section_size); + } + } + + if ((!scn || section_size == 0)) { + return STATUS_UNKNOWN; + } + + /* Create a section header */ + brig_module->section[section_id] = (hsa_ext_brig_section_header_t*) address_to_copy; + + return STATUS_SUCCESS; +} + +/* Reads binary of BRIG and BIF format */ +status_t read_binary(hsa_ext_brig_module_t **brig_module_t, FILE* binary) { + /* Create the brig_module */ + uint32_t number_of_sections = 3; + hsa_ext_brig_module_t* brig_module; + + brig_module = (hsa_ext_brig_module_t*) + (malloc (sizeof(hsa_ext_brig_module_t) + sizeof(void*)*number_of_sections)); + brig_module->section_count = number_of_sections; + + status_t status; + Elf* elfP = NULL; + Elf32_Ehdr* ehdr = NULL; + Elf_Data *secHdr = NULL; + Elf_Scn* scn = NULL; + int fd; + + if (elf_version ( EV_CURRENT ) == EV_NONE) { + return STATUS_KERNEL_ELF_INITIALIZATION_FAILED; + } + + fd = fileno(binary); + if ((elfP = elf_begin(fd, ELF_C_READ, (Elf *)0)) == NULL) { + return STATUS_KERNEL_INVALID_ELF_CONTAINER; + } + + if (elf_kind (elfP) != ELF_K_ELF) { + return STATUS_KERNEL_INVALID_ELF_CONTAINER; + } + + if (((ehdr = elf32_getehdr(elfP)) == NULL) || + ((scn = elf_getscn(elfP, ehdr->e_shstrndx)) == NULL) || + ((secHdr = elf_getdata(scn, NULL)) == NULL)) { + return STATUS_KERNEL_INVALID_SECTION_HEADER; + } + + status = extract_section_and_copy(elfP, + secHdr, + get_section_desc(SECTION_HSA_DATA), + brig_module, + HSA_EXT_BRIG_SECTION_DATA); + + if (status != STATUS_SUCCESS) { + return STATUS_KERNEL_MISSING_DATA_SECTION; + } + + status = extract_section_and_copy(elfP, + secHdr, + get_section_desc(SECTION_HSA_CODE), + brig_module, + HSA_EXT_BRIG_SECTION_CODE); + + if (status != STATUS_SUCCESS) { + return STATUS_KERNEL_MISSING_CODE_SECTION; + } + + status = extract_section_and_copy(elfP, + secHdr, + get_section_desc(SECTION_HSA_OPERAND), + brig_module, + HSA_EXT_BRIG_SECTION_OPERAND); + + if (status != STATUS_SUCCESS) { + return STATUS_KERNEL_MISSING_OPERAND_SECTION; + } + + elf_end(elfP); + *brig_module_t = brig_module; + + return STATUS_SUCCESS; +} + +status_t create_brig_module_from_brig_file(const char* file_name, hsa_ext_brig_module_t** brig_module) { + FILE *fp = fopen(file_name, "rb"); + + status_t status = read_binary(brig_module, fp); + + if (status != STATUS_SUCCESS) { + printf("Could not create BRIG module: %d\n", status); + if (status == STATUS_KERNEL_INVALID_SECTION_HEADER || + status == STATUS_KERNEL_ELF_INITIALIZATION_FAILED || + status == STATUS_KERNEL_INVALID_ELF_CONTAINER) { + printf("The ELF file is invalid or possibley corrupted.\n"); + } + if (status == STATUS_KERNEL_MISSING_DATA_SECTION || + status == STATUS_KERNEL_MISSING_CODE_SECTION || + status == STATUS_KERNEL_MISSING_OPERAND_SECTION) { + printf("One or more ELF sections are missing. Use readelf command to \ + to check if hsa_data, hsa_code and hsa_operands exist.\n"); + } + } + + fclose(fp); + + return status; +} + +void destroy_brig_module(hsa_ext_brig_module_t* brig_module) { + for (int i=0; isection_count; i++) { + free (brig_module->section[i]); + } + free (brig_module); +} diff --git a/samples/common/elf_utils.h b/samples/common/elf_utils.h new file mode 100644 index 0000000000..3305d33c00 --- /dev/null +++ b/samples/common/elf_utils.h @@ -0,0 +1,41 @@ +/* Copyright 2014 HSA Foundation Inc. All Rights Reserved. + * + * HSAF is granting you permission to use this software and documentation (if + * any) (collectively, the "Materials") pursuant to the terms and conditions + * of the Software License Agreement included with the Materials. If you do + * not have a copy of the Software License Agreement, contact the HSA Foundation for a copy. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE. + */ + +#pragma once + +#include "hsa_ext_finalize.h" + +//typedef enum status_t status_t; +typedef enum status_t { + STATUS_SUCCESS=0, + STATUS_KERNEL_INVALID_SECTION_HEADER=1, + STATUS_KERNEL_ELF_INITIALIZATION_FAILED=2, + STATUS_KERNEL_INVALID_ELF_CONTAINER=3, + STATUS_KERNEL_MISSING_DATA_SECTION=4, + STATUS_KERNEL_MISSING_CODE_SECTION=5, + STATUS_KERNEL_MISSING_OPERAND_SECTION=6, + STATUS_UNKNOWN=7, +} status_t; + +status_t create_brig_module_from_brig_file(const char* file_name, hsa_ext_brig_module_t** brig_module); + +void destroy_brig_module(hsa_ext_brig_module_t* brig_module); diff --git a/samples/common/hsa_base.cpp b/samples/common/hsa_base.cpp new file mode 100644 index 0000000000..56bea80b15 --- /dev/null +++ b/samples/common/hsa_base.cpp @@ -0,0 +1,225 @@ +#include "hsa_base.h" + +void HSA::SetBrigFileAndKernelName(char * brig_file_name, char *kernel_name) +{ + strcpy(hsa_brig_file_name, brig_file_name); + strcpy(hsa_kernel_name, kernel_name); +} + +HSA::HSA() +{ + +} + +HSA::~HSA() +{ + +} + + +bool HSA::HsaInit() +{ + err = hsa_init(); + check(Initializing the hsa runtime, err); + + /* + * Iterate over the agents and pick the gpu agent using + * the find_gpu callback. + */ + err = hsa_iterate_agents(find_gpu, &device); + check(Calling hsa_iterate_agents, err); + + err = (device.handle== 0) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS; + check(Checking if the GPU device is non-zero, err); + + if (err == HSA_STATUS_ERROR) + return false; + + /* + * Query the maximum size of the queue. + */ + err = hsa_agent_get_info(device, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size); + check(Querying the device maximum queue size, err); + + return true; +} + +double HSA::Run(int dim, int group_x, int group_y, int group_z, int s_size, int grid_x, int grid_y, int grid_z, void* kernel_args, int kernel_args_size) +{ + hsa_queue_t* local_command_queue; + /* + * Create a queue using the maximum size. + */ + err = hsa_queue_create(device, queue_size, HSA_QUEUE_TYPE_MULTI, NULL, NULL, 0, 0, &local_command_queue); + check(Creating the queue, err); + + /* + * Load BRIG, encapsulated in an ELF container, into a BRIG module. + */ + //char file_name[128] = "transpose_kernel.brig"; + hsa_ext_brig_module_t* local_brig_module; + err = (hsa_status_t)create_brig_module_from_brig_file(hsa_brig_file_name, &local_brig_module); + check(Creating the brig module from vector_copy.brig, err); + + /* + * Create hsa program. + */ + hsa_ext_program_handle_t local_hsa_program; + err = hsa_ext_program_create(&device, 1, HSA_EXT_BRIG_MACHINE_LARGE, HSA_EXT_BRIG_PROFILE_FULL, &local_hsa_program); + check(Creating the hsa program, err); + + /* + * Add the BRIG module to hsa program. + */ + hsa_ext_brig_module_handle_t local_module; + err = hsa_ext_add_module(local_hsa_program, local_brig_module, &local_module); + check(Adding the local brig module to the program, err); + + /* + * Construct finalization request list. + */ + hsa_ext_finalization_request_t local_finalization_request_list; + local_finalization_request_list.module = local_module; + local_finalization_request_list.program_call_convention = 0; + //char kernel_name[128] = "&__OpenCL_matrixTranspose_kernel"; + err = find_symbol_offset(local_brig_module, hsa_kernel_name, &local_finalization_request_list.symbol); + check(Finding the symbol offset for the kernel, err); + + /* + * Finalize the hsa program. + */ + err = hsa_ext_finalize_program(local_hsa_program, device, 1, &local_finalization_request_list, NULL, NULL, 0, NULL, 0); + check(Finalizing the program, err); + + /* + * Destroy the brig module. The program was successfully created the kernel + * symbol was found and the program was finalized, so it is no longer needed. + */ + destroy_brig_module(local_brig_module); + + /* + * Get the hsa code descriptor address. + */ + hsa_ext_code_descriptor_t *local_hsa_code_descriptor; + err = hsa_ext_query_kernel_descriptor_address(local_hsa_program, local_module, local_finalization_request_list.symbol, &local_hsa_code_descriptor); + check(Querying the kernel descriptor address, err); + + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /* + * Create a signal to wait for the dispatch to finish. + */ + hsa_signal_t local_signal; + err=hsa_signal_create(1, 0, NULL, &local_signal); + check(Creating a HSA signal, err); + + + /* Initialize the dispatch packet */ + hsa_kernel_dispatch_packet_t local_dispatch_packet; + memset(&local_dispatch_packet, 0, sizeof(hsa_kernel_dispatch_packet_t)); + /* + * Setup the dispatch information. + */ + local_dispatch_packet.completion_signal=local_signal; + local_dispatch_packet.setup |= dim<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; + local_dispatch_packet.workgroup_size_x = group_x; + local_dispatch_packet.workgroup_size_y = group_y; + local_dispatch_packet.workgroup_size_z = group_z; + local_dispatch_packet.group_segment_size = s_size; + local_dispatch_packet.grid_size_x = grid_x; + local_dispatch_packet.grid_size_y = grid_y; + local_dispatch_packet.grid_size_z = grid_z; + local_dispatch_packet.header |= HSA_PACKET_TYPE_KERNEL_DISPATCH; + local_dispatch_packet.header |= HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; + local_dispatch_packet.header |= HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + /* + * Find a memory region that supports kernel arguments. + */ + hsa_region_t local_kernarg_region; + local_kernarg_region.handle = 0; + + hsa_agent_iterate_regions(device, get_kernarg, &local_kernarg_region); + err = (local_kernarg_region.handle== 0) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS; + check(Finding a kernarg memory region, err); + void* local_kernel_arg_buffer = NULL; + + size_t local_kernel_arg_buffer_size = local_hsa_code_descriptor->kernarg_segment_byte_size; + + /* + * Allocate the kernel argument buffer from the correct region. + */ + err = hsa_memory_allocate(local_kernarg_region, local_kernel_arg_buffer_size, &local_kernel_arg_buffer); + check(Allocating kernel argument memory buffer, err); + memcpy(local_kernel_arg_buffer, kernel_args, kernel_args_size); + + local_dispatch_packet.kernel_object = local_hsa_code_descriptor->code.handle; + // Assume our kernel receives no arguments + local_dispatch_packet.kernarg_address = local_kernel_arg_buffer; + + /* + * Register the memory region for the argument buffer. + */ + err = hsa_memory_register(kernel_args, kernel_args_size); + + check(Registering the argument buffer, err); + + /* + * Obtain the current queue write index. + */ + uint64_t index = hsa_queue_load_write_index_relaxed(local_command_queue); + + /* + * Write the aql packet at the calculated queue index address. + */ + const uint32_t queueMask = local_command_queue->size - 1; + ((hsa_kernel_dispatch_packet_t*)(local_command_queue->base_address))[index&queueMask]=local_dispatch_packet; + + /* + * Increment the write index and ring the doorbell to dispatch the kernel. + */ + hsa_queue_store_write_index_relaxed(local_command_queue, index+1); + +#ifdef TIME + PerfTimer perf_timer_0; + int timer_idx_0 = perf_timer_0.CreateTimer(); + perf_timer_0.StartTimer(timer_idx_0); +#endif + hsa_signal_store_release(local_command_queue->doorbell_signal, index); + + /* + * Wait on the dispatch signal until all kernel are finished. + */ + while (hsa_signal_wait_acquire(local_signal, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0); + +#ifdef TIME + perf_timer_0.StopTimer(timer_idx_0); +#endif + /* + * Cleanup all allocated resources. + */ + + err=hsa_signal_destroy(local_signal); + check(Destroying the local_signal, err); + + err=hsa_ext_program_destroy(local_hsa_program); + check(Destroying the program, err); + + err=hsa_queue_destroy(local_command_queue); + check(Destroying the queue, err); + +#ifdef TIME + double ret = perf_timer_0.ReadTimer(timer_idx_0); +#endif + + return 0; + +} + + +void HSA::Close() +{ + err=hsa_shut_down(); + check(Shutting down the runtime, err); +} + diff --git a/samples/common/hsa_base.h b/samples/common/hsa_base.h new file mode 100644 index 0000000000..a8c30bfd91 --- /dev/null +++ b/samples/common/hsa_base.h @@ -0,0 +1,43 @@ +#ifndef __HSA_BASE__ +#define __HSA_BASE__ + + +#include +#include "hsa.h" +#include "hsa_ext_finalize.h" +#include "elf_utils.h" +#include "hsatimer.h" +#include "utilities.h" + +class HSA{ + public: + HSA(); + ~HSA(); + + public: + void SetBrigFileAndKernelName(char *brig_file_name, char *kernel_name); + bool HsaInit(); + void Close(); + double Run(int dim, int group_x, int group_y, int group_z, int s_size, int grid_x, int grid_y, int grid_z, void* kernel_args, int kernel_args_size); + + public: + hsa_status_t err; + uint32_t queue_size; + hsa_agent_t device; + + char hsa_brig_file_name[128]; + char hsa_kernel_name[128]; + + hsa_queue_t* command_queue; + hsa_signal_t hsa_signal; + hsa_ext_brig_module_t* brig_module; + hsa_ext_brig_module_handle_t module; + hsa_ext_program_handle_t hsa_program; + hsa_ext_code_descriptor_t *hsa_code_descriptor; + hsa_kernel_dispatch_packet_t dispatch_packet; // needs to be set manually each time + hsa_region_t hsa_kernarg_region; +}; + + +#endif + diff --git a/samples/common/hsatimer.cpp b/samples/common/hsatimer.cpp new file mode 100644 index 0000000000..1ce6726d35 --- /dev/null +++ b/samples/common/hsatimer.cpp @@ -0,0 +1,190 @@ +#include "hsatimer.h" + +PerfTimer::PerfTimer() +{ + freq_in_100mhz = MeasureTSCFreqHz(); +} + +PerfTimer::~PerfTimer() +{ + while(!_timers.empty()) + { + Timer *temp = _timers.back(); + _timers.pop_back(); + delete temp; + } +} + +//a new cretaed timer instantance index will be returned +int PerfTimer::CreateTimer() +{ + Timer *newTimer = new Timer; + newTimer->_start = 0; + newTimer->_clocks = 0; + +#ifdef _WIN32 + QueryPerformanceFrequency((LARGE_INTEGER*)&newTimer->_freq); +#else + newTimer->_freq = (long long)1.0E3; +#endif + + /* Push back the address of new Timer instance created */ + _timers.push_back(newTimer); + return (int)(_timers.size() - 1); +} + +int PerfTimer::StartTimer(int index) +{ + if(index >= (int)_timers.size()) + { + Error("Cannot reset timer. Invalid handle."); + return HSA_FAILURE; + } + +#ifdef _WIN32 + // General Windows timing method + #ifndef _AMD + long long tmpStart; + QueryPerformanceCounter((LARGE_INTEGER*)&(tmpStart)); + _timers[index]->_start = (double)tmpStart; + #else + // AMD Windows timing method + + #endif + +#else + // General Linux timing method + #ifndef _AMD + struct timeval s; + gettimeofday(&s, 0); + _timers[index]->_start = s.tv_sec * 1.0E3 + ((double)(s.tv_usec / 1.0E3)); + #else + + // AMD timing method + + unsigned int unused; + _timers[index]->_start = __rdtscp(&unused); + + #endif + +#endif + + return HSA_SUCCESS; +} + + +int PerfTimer::StopTimer(int index) +{ + double n=0; + if(index >= (int)_timers.size()) + { + Error("Cannot reset timer. Invalid handle."); + return HSA_FAILURE; + } +#ifdef _WIN32 + #ifndef _AMD + long long n1; + QueryPerformanceCounter((LARGE_INTEGER*)&(n1)); + n = (double) n1; + #else + + // AMD Window Timing + + #endif + +#else + // General Linux timing method + #ifndef _AMD + struct timeval s; + gettimeofday(&s, 0); + n = s.tv_sec * 1.0E3+ (double)(s.tv_usec/1.0E3); + #else + // AMD Linux timing + + unsigned int unused; + n = __rdtscp(&unused); + #endif + +#endif + + n -= _timers[index]->_start; + _timers[index]->_start = 0; + + #ifndef _AMD + _timers[index]->_clocks += n; + #else + //_timers[index]->_clocks += 10 * n /freq_in_100mhz; // unit is ns + _timers[index]->_clocks += 1.0E-6 * 10 * n /freq_in_100mhz; // convert to ms + cout << "_AMD is enabled!!!" << endl; + #endif + + return HSA_SUCCESS; +} + +void PerfTimer::Error(string str) +{ + cout << str << endl; +} + + +double PerfTimer::ReadTimer(int index) +{ + + if(index >= (int)_timers.size()) + { + Error("Cannot read timer. Invalid handle."); + return HSA_FAILURE; + } + + double reading = double(_timers[index]->_clocks); + + reading = double(reading / _timers[index]->_freq); + + return reading; +} + + +uint64_t PerfTimer::CoarseTimestampUs() +{ +#ifdef _WIN32 + uint64_t freqHz, ticks; + QueryPerformanceFrequency((LARGE_INTEGER *)&freqHz); + QueryPerformanceCounter((LARGE_INTEGER *)&ticks); + + // Scale numerator and divisor until (ticks * 1000000) fits in uint64_t. + while (ticks > (1ULL << 44)) { + ticks /= 16; + freqHz /= 16; + } + + return (ticks * 1000000) / freqHz; +#else + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC_RAW, &ts); + return uint64_t(ts.tv_sec) * 1000000 + ts.tv_nsec / 1000; +#endif +} + +uint64_t PerfTimer::MeasureTSCFreqHz() +{ + // Make a coarse interval measurement of TSC ticks for 1 gigacycles. + unsigned int unused; + uint64_t tscTicksEnd; + + uint64_t coarseBeginUs = CoarseTimestampUs(); + uint64_t tscTicksBegin = __rdtscp(&unused); + do + { + tscTicksEnd = __rdtscp(&unused); + } + while (tscTicksEnd - tscTicksBegin < 1000000000); + + uint64_t coarseEndUs = CoarseTimestampUs(); + + // Compute the TSC frequency and round to nearest 100MHz. + uint64_t coarseIntervalNs = (coarseEndUs - coarseBeginUs) * 1000; + uint64_t tscIntervalTicks = tscTicksEnd - tscTicksBegin; + return (tscIntervalTicks * 10 + (coarseIntervalNs / 2)) / coarseIntervalNs; +} + + diff --git a/samples/common/hsatimer.h b/samples/common/hsatimer.h new file mode 100644 index 0000000000..a15b06f583 --- /dev/null +++ b/samples/common/hsatimer.h @@ -0,0 +1,64 @@ +#ifndef __MYTIME__ +#define __MYTIME__ + +// Will use AMD timer and general Linux timer based on users' need --> compilation flag + +// need to consider platform is Windows or Linux + +#include +#include +#include +#include +#include + +#include +#include +#include +using namespace std; + +#include + +#define HSA_FAILURE 1 +#define HSA_SUCCESS 0 + +class PerfTimer { + private: + struct Timer + { + string name; /* < name name of time object*/ + long long _freq; /* < _freq frequency*/ + double _clocks; /* < _clocks number of ticks at end*/ + double _start; /* < _start start point ticks*/ + }; + + std::vector _timers; /*< _timers vector to Timer objects */ + double freq_in_100mhz; + + public: + PerfTimer(); + ~PerfTimer(); + + private: + //AMD timing method + uint64_t CoarseTimestampUs(); + uint64_t MeasureTSCFreqHz(); + + //General Linux timing method + + public: + int CreateTimer(); + int StartTimer(int index); + int StopTimer(int index); + + public: + // retrieve time + double ReadTimer(int index); + // write into a file + double WriteTimer(int index); + + public: + void Error(string str); +}; + +#endif + diff --git a/samples/common/utilities.cpp b/samples/common/utilities.cpp new file mode 100644 index 0000000000..96a5d34cda --- /dev/null +++ b/samples/common/utilities.cpp @@ -0,0 +1,185 @@ +#include "utilities.h" + +/* + * Prints no more than 256 elements of the given array. + * Prints full array if length is less than 256. + * Prints Array name followed by elements. + */ +template +void PrintArray( + string header, + const T * data, + const int width, + const int height) +{ + cout<<"\n"< +int IsPowerOf2(T val) +{ + long long _val = val; + if((_val & (-_val))-_val == 0 && _val != 0) + return 0; + else + return -1; +} + + +template +T RoundToPowerOf2(T val) +{ + int bytes = sizeof(T); + + val--; + for(int i = 0; i < bytes; i++) + val |= val >> (1< +int FillRandom( + T * arrayPtr, + const int width, + const int height, + const T rangeMin, + const T rangeMax, + unsigned int seed=123) +{ + if(!arrayPtr) + { + printf("Cannot fill array. NULL pointer."); + return -1; + } + + if(!seed) + seed = (unsigned int)time(NULL); + + srand(seed); + double range = double(rangeMax - rangeMin) + 1.0; + + /* random initialisation of input */ + for(int i = 0; i < height; i++) + for(int j = 0; j < width; j++) + { + int index = i*width + j; + arrayPtr[index] = rangeMin + T(range*rand()/(RAND_MAX + 1.0)); + } + + return 0; +} + + +//get a memory region that can be used for global memory allocations. +hsa_status_t get_global_region(hsa_region_t region, void* data) +{ + hsa_region_segment_t segment; + hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment); + if (HSA_REGION_SEGMENT_GLOBAL == segment) + { + hsa_region_t* ret = (hsa_region_t*) data; + *ret = region; + } + return HSA_STATUS_SUCCESS; +} + + +/* + * Finds the specified symbols offset in the specified brig_module. + * If the symbol is found the function returns HSA_STATUS_SUCCESS, + * otherwise it returns HSA_STATUS_ERROR. + */ +hsa_status_t find_symbol_offset(hsa_ext_brig_module_t* brig_module, + char* symbol_name, + hsa_ext_brig_code_section_offset32_t* offset) +{ + + /* + * Get the data section + */ + hsa_ext_brig_section_header_t* data_section_header = + brig_module->section[HSA_EXT_BRIG_SECTION_DATA]; + /* + * Get the code section + */ + hsa_ext_brig_section_header_t* code_section_header = + brig_module->section[HSA_EXT_BRIG_SECTION_CODE]; + + /* + * First entry into the BRIG code section + */ + BrigCodeOffset32_t code_offset = code_section_header->header_byte_count; + BrigBase* code_entry = (BrigBase*) ((char*)code_section_header + code_offset); + while (code_offset != code_section_header->byte_count) + { + if (code_entry->kind == BRIG_KIND_DIRECTIVE_KERNEL) + { + /* + * Now find the data in the data section + */ + BrigDirectiveExecutable* directive_kernel = (BrigDirectiveExecutable*) (code_entry); + BrigDataOffsetString32_t data_name_offset = directive_kernel->name; + BrigData* data_entry = (BrigData*)((char*) data_section_header + data_name_offset); + if (!strncmp(symbol_name, (char*) data_entry->bytes, strlen(symbol_name))) + { + *offset = code_offset; + return HSA_STATUS_SUCCESS; + } + } + code_offset += code_entry->byteCount; + code_entry = (BrigBase*) ((char*)code_section_header + code_offset); + } + return HSA_STATUS_ERROR; +} + +/* + * Determines if the given agent is of type HSA_DEVICE_TYPE_GPU + * and sets the value of data to the agent handle if it is. + */ +hsa_status_t find_gpu(hsa_agent_t agent, void *data) +{ + if (data == NULL) + { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + hsa_device_type_t device_type; + hsa_status_t stat = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type); + if (stat != HSA_STATUS_SUCCESS) + { + return stat; + } + if (device_type == HSA_DEVICE_TYPE_GPU) + { + *((hsa_agent_t *)data) = agent; + } + return HSA_STATUS_SUCCESS; +} + +/* + * Determines if a memory region can be used for kernarg + * allocations. + */ +hsa_status_t get_kernarg(hsa_region_t region, void* data) +{ + hsa_region_global_flag_t flags; + hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags); + if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG) + { + hsa_region_t* ret = (hsa_region_t*) data; + *ret = region; + } + return HSA_STATUS_SUCCESS; +} + diff --git a/samples/common/utilities.h b/samples/common/utilities.h new file mode 100644 index 0000000000..9239712caf --- /dev/null +++ b/samples/common/utilities.h @@ -0,0 +1,207 @@ +#ifndef __HSA_UTILITY__ +#define __HSA_UTILITY__ + +#include +#include + +#include "hsa.h" +#include "hsa_ext_finalize.h" +#include "elf_utils.h" + +#include +#include +using namespace std; + +#define HSA_ARGUMENT_ALIGN_BYTES 16 + +#define SDK_FAILURE 1 +#define SDK_SUCCESS 0 + +/* +#define check(msg, status) \ +if (status != HSA_STATUS_SUCCESS) { \ + printf("%s failed.\n", #msg); \ + exit(1); \ +} else { \ + printf("%s succeeded.\n", #msg); \ +} +*/ +#define check(msg, status) \ +if (status != HSA_STATUS_SUCCESS) { \ + printf("%s failed.\n", #msg); \ + exit(1); \ +} else { \ + ; \ +} + +/* + * Define required BRIG data structures. + */ + +typedef uint32_t BrigCodeOffset32_t; + +typedef uint32_t BrigDataOffset32_t; + +typedef uint16_t BrigKinds16_t; + +typedef uint8_t BrigLinkage8_t; + +typedef uint8_t BrigExecutableModifier8_t; + +typedef BrigDataOffset32_t BrigDataOffsetString32_t; + +enum BrigKinds { + BRIG_KIND_NONE = 0x0000, + BRIG_KIND_DIRECTIVE_BEGIN = 0x1000, + BRIG_KIND_DIRECTIVE_KERNEL = 0x1008, +}; + +typedef struct BrigBase BrigBase; +struct BrigBase { + uint16_t byteCount; + BrigKinds16_t kind; +}; + +typedef struct BrigExecutableModifier BrigExecutableModifier; +struct BrigExecutableModifier { + BrigExecutableModifier8_t allBits; +}; + +typedef struct BrigDirectiveExecutable BrigDirectiveExecutable; +struct BrigDirectiveExecutable { + uint16_t byteCount; + BrigKinds16_t kind; + BrigDataOffsetString32_t name; + uint16_t outArgCount; + uint16_t inArgCount; + BrigCodeOffset32_t firstInArg; + BrigCodeOffset32_t firstCodeBlockEntry; + BrigCodeOffset32_t nextModuleEntry; + uint32_t codeBlockEntryCount; + BrigExecutableModifier modifier; + BrigLinkage8_t linkage; + uint16_t reserved; +}; + +typedef struct BrigData BrigData; +struct BrigData { + uint32_t byteCount; + uint8_t bytes[1]; +}; + +struct float2 +{ + float s0; + float s1; + + + float2 operator * (float2 &fl) + { + float2 temp; + temp.s0 = (this->s0) * fl.s0; + temp.s1 = (this->s1) * fl.s1; + return temp; + } + + float2 operator * (float scalar) + { + float2 temp; + temp.s0 = (this->s0) * scalar; + temp.s1 = (this->s1) * scalar; + return temp; + } + + float2 operator + (float2 &fl) + { + float2 temp; + temp.s0 = (this->s0) + fl.s0; + temp.s1 = (this->s1) + fl.s1; + return temp; + } + + float2 operator - (float2 fl) + { + float2 temp; + temp.s0 = (this->s0) - fl.s0; + temp.s1 = (this->s1) - fl.s1; + return temp; + } +}; + + +struct uint2 +{ + uint s0; + uint s1; + + + uint2 operator * (uint2 &fl) + { + uint2 temp; + temp.s0 = (this->s0) * fl.s0; + temp.s1 = (this->s1) * fl.s1; + return temp; + } + + uint2 operator * (float scalar) + { + uint2 temp; + temp.s0 = (this->s0) * scalar; + temp.s1 = (this->s1) * scalar; + return temp; + } + + uint2 operator + (uint2 &fl) + { + uint2 temp; + temp.s0 = (this->s0) + fl.s0; + temp.s1 = (this->s1) + fl.s1; + return temp; + } + + uint2 operator - (uint2 fl) + { + uint2 temp; + temp.s0 = (this->s0) - fl.s0; + temp.s1 = (this->s1) - fl.s1; + return temp; + } +}; + + +/* + * Prints no more than 256 elements of the given array. + * Prints full array if length is less than 256. + * Prints Array name followed by elements. + */ +template void PrintArray(string header, const T * data, const int width, const int height); + +template int IsPowerOf2(T val); + +template T RoundToPowerOf2(T val); + +template int FillRandom(T * arrayPtr, const int width, const int height, const T rangeMin, const T rangeMax, unsigned int seed=123); + +//get a memory region that can be used for global memory allocations. +hsa_status_t get_global_region(hsa_region_t region, void* data); + +/* + * Finds the specified symbols offset in the specified brig_module. + * If the symbol is found the function returns HSA_STATUS_SUCCESS, + * otherwise it returns HSA_STATUS_ERROR. + */ +hsa_status_t find_symbol_offset(hsa_ext_brig_module_t* brig_module, char* symbol_name, hsa_ext_brig_code_section_offset32_t* offset); + +/* + * Determines if the given agent is of type HSA_DEVICE_TYPE_GPU + * and sets the value of data to the agent handle if it is. + */ +hsa_status_t find_gpu(hsa_agent_t agent, void *data); + +/* + * Determines if a memory region can be used for kernarg + * allocations. + */ +hsa_status_t get_kernarg(hsa_region_t region, void* data); + +#endif