ECR #333755 - Added two HSA samples: MatrixMultiplication, MatrixTranspose

[git-p4: depot-paths = "//depot/stg/hsa/drivers/hsa/runtime/": change = 1124951]
Bu işleme şunda yer alıyor:
Ding, Wei (xN/A) TX
2015-02-25 13:05:59 -05:00
işleme 9d9ea2ad3b
16 değiştirilmiş dosya ile 1355 ekleme ve 0 silme
+7
Dosyayı Görüntüle
@@ -0,0 +1,7 @@
OPENCL_DEPTH = ../..
include $(OPENCL_DEPTH)/runtimenew/runtimedefs
SUBDIRS = MatrixTranspose MatrixMultiplication
include $(OPENCL_DEPTH)/runtimenew/runtimerules
+9
Dosyayı Görüntüle
@@ -0,0 +1,9 @@
OPENCL_DEPTH = ../../..
include $(OPENCL_DEPTH)/hsadefs
BUILD_SUBDIRS = $(DEFAULT_TARGETS)
BUILD_MAKEFILE = Makefile.samples
include $(OPENCL_DEPTH)/hsarules
+53
Dosyayı Görüntüle
@@ -0,0 +1,53 @@
include $(OPENCL_DEPTH)/hsadefs
EXE_TARGET = Test
vpath %.cpp $(COMPONENT_DEPTH)
CPPFILES := $(notdir $(wildcard $(COMPONENT_DEPTH)/*.cpp))
ifdef ATI_BITS_64
LIB_SUFFIX = 64
NBITS = 64
else
LIB_SUFFIX =
ifndef ATI_OS_WINDOWS
NBITS := 32
endif
endif
ifdef ATI_OS_WINDOWS
CORE_LIB = dll
LFLAGS += /subsystem:console
LIB_PREFIX =
else
CORE_LIB = so
LIB_PREFIX = lib
endif
ifdef ATI_OS_LINUX
GCXXOPTS := $(filter-out -fno-rtti,$(GCXXOPTS))
GCXXOPTS := $(filter-out -fno-exceptions,$(GCXXOPTS))
LFLAGS += -L$(DIST_LIB_DEST) -lpthread $(LIBSTDCXX) -lm -ldl -lrt
endif
export BUILD_HSA_TARGET=yes
LCINCS := $(INCSWITCH) "$(OPENCL_DEPTH)/compiler/finalizer/HSAIL/hsail-tools/libHSAIL"
LCINCS += $(INCSWITCH) "$(OPENCL_DEPTH)/compiler/finalizer/HSAIL/hsail-tools/libHSAIL/$(FULL_BUILD_DIR)"
LCINCS += $(INCSWITCH) "$(OPENCL_DEPTH)/compiler/finalizer/Interface"
LCINCS += $(INCSWITCH) "$(OPENCL_DEPTH)/contrib/gtest-1.6.0/include"
LCINCS += $(INCSWITCH) "$(OPENCL_DEPTH)/runtime/inc"
LLLIBS := $(OPENCL_DEPTH)/contrib/gtest-1.6.0/$(FULL_BUILD_DIR)/libgtest$(LIB_EXT)
LLLIBS += $(OPENCL_DEPTH)/runtime/test/common/$(FULL_BUILD_DIR)/testcommon$(LIB_EXT)
LLLIBS += $(OPENCL_DEPTH)/runtime/test/gcommon/$(FULL_BUILD_DIR)/gtestcommon$(LIB_EXT)
LLLIBS += $(OPENCL_DEPTH)/compiler/finalizer/HSAIL/hsail-tools/libHSAIL/$(FULL_BUILD_DIR)/libhsail$(LIB_EXT)
RUNTIME_BUILD = build/$(OS_TYPE)/$(CORE_LIB)/$(BUILD_DIR)
ifdef ATI_OS_LINUX
LFLAGS += -L$(OPENCL_DEPTH)/runtime/core/$(RUNTIME_BUILD) -lhsa-runtime$(LIB_SUFFIX)
else
LLLIBS += $(OPENCL_DEPTH)/runtime/core/$(RUNTIME_BUILD)/hsa-runtime$(LIB_SUFFIX)$(LIB_EXT)
endif
include $(OPENCL_DEPTH)/hsarules
+11
Dosyayı Görüntüle
@@ -0,0 +1,11 @@
# Generated Makefile lnx
OPENCL_DEPTH = ..$(BUILD_DIR_DEPTH)/../../..
SCLIB_DEPTH = ..$(BUILD_DIR_DEPTH)/../../../../../drivers/hsa/compiler/finalizer
MODULE_DEPTH = ../..$(BUILD_DIR_DEPTH)
COMPONENT_DEPTH = $(MODULE_DEPTH)
OPENCL_TARGET = lnx
SCLIB_TARGET = lnx
include $(MODULE_DEPTH)/build/Makefile.samples
+11
Dosyayı Görüntüle
@@ -0,0 +1,11 @@
# Generated Makefile lnx64a
OPENCL_DEPTH = ..$(BUILD_DIR_DEPTH)/../../..
SCLIB_DEPTH = ..$(BUILD_DIR_DEPTH)/../../../../../drivers/hsa/compiler/finalizer
MODULE_DEPTH = ../..$(BUILD_DIR_DEPTH)
COMPONENT_DEPTH = $(MODULE_DEPTH)
OPENCL_TARGET = lnx64a
SCLIB_TARGET = lnx64a
include $(MODULE_DEPTH)/build/Makefile.samples
+7
Dosyayı Görüntüle
@@ -0,0 +1,7 @@
OPENCL_DEPTH = ../../..
include $(OPENCL_DEPTH)/runtimenew/runtimedefs
SUBDIRS = build
include $(OPENCL_DEPTH)/runtimenew/runtimerules
+67
Dosyayı Görüntüle
@@ -0,0 +1,67 @@
#include "common_utility.h"
double CalcMedian(vector<double> scores)
{
double median;
size_t size = scores.size();
if (size % 2 == 0)
median = (scores[size / 2 - 1] + scores[size / 2]) / 2;
else
median = scores[size / 2];
return median;
}
double CalcMean(vector<double> scores)
{
double mean;
size_t size = scores.size();
for (int i=0; i<size; ++i)
mean += scores[i];
return mean/size;
}
double CalcStdDeviation(vector<double> scores, int score_mean)
{
double ret = 0.0;
for (int i=0; i<scores.size(); ++i)
{
ret += (scores[i] - score_mean) * (scores[i] - score_mean);
}
ret /= scores.size();
return sqrt(ret);
}
int CalcConcurrentQueues(vector<double> scores)
{
int num_of_concurrent_queues = 0;
vector<double>execpted_exec_time_array;
for (int i=0; i<scores.size(); ++i)
{
execpted_exec_time_array.push_back(scores[0]/(1<<i));
}
for (int i=0; i<scores.size(); ++i)
{
cout << "expected exe time = " << execpted_exec_time_array[i] << endl;
}
for (int i=1; i<scores.size(); ++i)
{
if ((execpted_exec_time_array[i] - scores[i]) < 0.1 * execpted_exec_time_array[i])
++num_of_concurrent_queues;
}
return num_of_concurrent_queues;
}
+13
Dosyayı Görüntüle
@@ -0,0 +1,13 @@
#include <iostream>
#include <algorithm>
#include <cmath>
#include <vector>
using namespace std;
double CalcMean(vector<double> scores);
double CalcMedian(vector<double> scores);
double CalcStdDeviation(vector<double> scores, int score_mean);
int CalcConcurrentQueues(vector<double> scores);
+222
Dosyayı Görüntüle
@@ -0,0 +1,222 @@
/* Copyright 2014 HSA Foundation Inc. All Rights Reserved.
*
* HSAF is granting you permission to use this software and documentation (if
* any) (collectively, the "Materials") pursuant to the terms and conditions
* of the Software License Agreement included with the Materials. If you do
* not have a copy of the Software License Agreement, contact the HSA Foundation for a copy.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
*/
#include <stdlib.h>
#include <libelf.h>
#include <string.h>
#include <stdio.h>
#include "hsa.h"
#include "elf_utils.h"
#include "hsa_ext_finalize.h"
enum {
SECTION_HSA_DATA = 0,
SECTION_HSA_CODE,
SECTION_HSA_OPERAND,
};
typedef struct SectionDesc SectionDesc;
struct SectionDesc {
int sectionId;
const char *brigName;
const char *bifName;
}
sectionDescs[] = {
{ SECTION_HSA_DATA, "hsa_data",".brig_hsa_data" },
{ SECTION_HSA_CODE, "hsa_code",".brig_hsa_code" },
{ SECTION_HSA_OPERAND,"hsa_operand",".brig_hsa_operand"},
};
extern int fileno(FILE* stream);
const SectionDesc* get_section_desc(int sectionId) {
const int NUM_PREDEFINED_SECTIONS = sizeof(sectionDescs)/sizeof(sectionDescs[0]);
for(int i=0; i<NUM_PREDEFINED_SECTIONS; ++i) {
if (sectionDescs[i].sectionId == sectionId) {
return &sectionDescs[i];
}
}
return NULL;
}
static Elf_Scn* extract_elf_section (Elf *elfP,
Elf_Data *secHdr,
const SectionDesc* desc) {
int cnt = 0;
Elf_Scn* scn = NULL;
Elf32_Shdr* shdr = NULL;
char* sectionName = NULL;
/* Iterate thru the elf sections */
for (cnt = 1, scn = NULL; scn = elf_nextscn(elfP, scn); cnt++) {
if (((shdr = elf32_getshdr(scn)) == NULL)) {
return NULL;
}
sectionName = (char *)secHdr->d_buf + shdr->sh_name;
if (sectionName &&
((strcmp(sectionName, desc->brigName) == 0) ||
(strcmp(sectionName, desc->bifName) == 0))) {
return scn;
}
}
return NULL;
}
/* Extract section and copy into HsaBrig */
static status_t extract_section_and_copy (Elf *elfP,
Elf_Data *secHdr,
const SectionDesc* desc,
hsa_ext_brig_module_t* brig_module,
hsa_ext_brig_section_id_t section_id) {
Elf_Scn* scn = NULL;
Elf_Data* data = NULL;
void* address_to_copy;
size_t section_size=0;
scn = extract_elf_section(elfP, secHdr, desc);
if (scn) {
if ((data = elf_getdata(scn, NULL)) == NULL) {
return STATUS_UNKNOWN;
}
section_size = data->d_size;
if (section_size > 0) {
address_to_copy = malloc(section_size);
memcpy(address_to_copy, data->d_buf, section_size);
}
}
if ((!scn || section_size == 0)) {
return STATUS_UNKNOWN;
}
/* Create a section header */
brig_module->section[section_id] = (hsa_ext_brig_section_header_t*) address_to_copy;
return STATUS_SUCCESS;
}
/* Reads binary of BRIG and BIF format */
status_t read_binary(hsa_ext_brig_module_t **brig_module_t, FILE* binary) {
/* Create the brig_module */
uint32_t number_of_sections = 3;
hsa_ext_brig_module_t* brig_module;
brig_module = (hsa_ext_brig_module_t*)
(malloc (sizeof(hsa_ext_brig_module_t) + sizeof(void*)*number_of_sections));
brig_module->section_count = number_of_sections;
status_t status;
Elf* elfP = NULL;
Elf32_Ehdr* ehdr = NULL;
Elf_Data *secHdr = NULL;
Elf_Scn* scn = NULL;
int fd;
if (elf_version ( EV_CURRENT ) == EV_NONE) {
return STATUS_KERNEL_ELF_INITIALIZATION_FAILED;
}
fd = fileno(binary);
if ((elfP = elf_begin(fd, ELF_C_READ, (Elf *)0)) == NULL) {
return STATUS_KERNEL_INVALID_ELF_CONTAINER;
}
if (elf_kind (elfP) != ELF_K_ELF) {
return STATUS_KERNEL_INVALID_ELF_CONTAINER;
}
if (((ehdr = elf32_getehdr(elfP)) == NULL) ||
((scn = elf_getscn(elfP, ehdr->e_shstrndx)) == NULL) ||
((secHdr = elf_getdata(scn, NULL)) == NULL)) {
return STATUS_KERNEL_INVALID_SECTION_HEADER;
}
status = extract_section_and_copy(elfP,
secHdr,
get_section_desc(SECTION_HSA_DATA),
brig_module,
HSA_EXT_BRIG_SECTION_DATA);
if (status != STATUS_SUCCESS) {
return STATUS_KERNEL_MISSING_DATA_SECTION;
}
status = extract_section_and_copy(elfP,
secHdr,
get_section_desc(SECTION_HSA_CODE),
brig_module,
HSA_EXT_BRIG_SECTION_CODE);
if (status != STATUS_SUCCESS) {
return STATUS_KERNEL_MISSING_CODE_SECTION;
}
status = extract_section_and_copy(elfP,
secHdr,
get_section_desc(SECTION_HSA_OPERAND),
brig_module,
HSA_EXT_BRIG_SECTION_OPERAND);
if (status != STATUS_SUCCESS) {
return STATUS_KERNEL_MISSING_OPERAND_SECTION;
}
elf_end(elfP);
*brig_module_t = brig_module;
return STATUS_SUCCESS;
}
status_t create_brig_module_from_brig_file(const char* file_name, hsa_ext_brig_module_t** brig_module) {
FILE *fp = fopen(file_name, "rb");
status_t status = read_binary(brig_module, fp);
if (status != STATUS_SUCCESS) {
printf("Could not create BRIG module: %d\n", status);
if (status == STATUS_KERNEL_INVALID_SECTION_HEADER ||
status == STATUS_KERNEL_ELF_INITIALIZATION_FAILED ||
status == STATUS_KERNEL_INVALID_ELF_CONTAINER) {
printf("The ELF file is invalid or possibley corrupted.\n");
}
if (status == STATUS_KERNEL_MISSING_DATA_SECTION ||
status == STATUS_KERNEL_MISSING_CODE_SECTION ||
status == STATUS_KERNEL_MISSING_OPERAND_SECTION) {
printf("One or more ELF sections are missing. Use readelf command to \
to check if hsa_data, hsa_code and hsa_operands exist.\n");
}
}
fclose(fp);
return status;
}
void destroy_brig_module(hsa_ext_brig_module_t* brig_module) {
for (int i=0; i<brig_module->section_count; i++) {
free (brig_module->section[i]);
}
free (brig_module);
}
+41
Dosyayı Görüntüle
@@ -0,0 +1,41 @@
/* Copyright 2014 HSA Foundation Inc. All Rights Reserved.
*
* HSAF is granting you permission to use this software and documentation (if
* any) (collectively, the "Materials") pursuant to the terms and conditions
* of the Software License Agreement included with the Materials. If you do
* not have a copy of the Software License Agreement, contact the HSA Foundation for a copy.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
*/
#pragma once
#include "hsa_ext_finalize.h"
//typedef enum status_t status_t;
typedef enum status_t {
STATUS_SUCCESS=0,
STATUS_KERNEL_INVALID_SECTION_HEADER=1,
STATUS_KERNEL_ELF_INITIALIZATION_FAILED=2,
STATUS_KERNEL_INVALID_ELF_CONTAINER=3,
STATUS_KERNEL_MISSING_DATA_SECTION=4,
STATUS_KERNEL_MISSING_CODE_SECTION=5,
STATUS_KERNEL_MISSING_OPERAND_SECTION=6,
STATUS_UNKNOWN=7,
} status_t;
status_t create_brig_module_from_brig_file(const char* file_name, hsa_ext_brig_module_t** brig_module);
void destroy_brig_module(hsa_ext_brig_module_t* brig_module);
+225
Dosyayı Görüntüle
@@ -0,0 +1,225 @@
#include "hsa_base.h"
void HSA::SetBrigFileAndKernelName(char * brig_file_name, char *kernel_name)
{
strcpy(hsa_brig_file_name, brig_file_name);
strcpy(hsa_kernel_name, kernel_name);
}
HSA::HSA()
{
}
HSA::~HSA()
{
}
bool HSA::HsaInit()
{
err = hsa_init();
check(Initializing the hsa runtime, err);
/*
* Iterate over the agents and pick the gpu agent using
* the find_gpu callback.
*/
err = hsa_iterate_agents(find_gpu, &device);
check(Calling hsa_iterate_agents, err);
err = (device.handle== 0) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS;
check(Checking if the GPU device is non-zero, err);
if (err == HSA_STATUS_ERROR)
return false;
/*
* Query the maximum size of the queue.
*/
err = hsa_agent_get_info(device, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size);
check(Querying the device maximum queue size, err);
return true;
}
double HSA::Run(int dim, int group_x, int group_y, int group_z, int s_size, int grid_x, int grid_y, int grid_z, void* kernel_args, int kernel_args_size)
{
hsa_queue_t* local_command_queue;
/*
* Create a queue using the maximum size.
*/
err = hsa_queue_create(device, queue_size, HSA_QUEUE_TYPE_MULTI, NULL, NULL, 0, 0, &local_command_queue);
check(Creating the queue, err);
/*
* Load BRIG, encapsulated in an ELF container, into a BRIG module.
*/
//char file_name[128] = "transpose_kernel.brig";
hsa_ext_brig_module_t* local_brig_module;
err = (hsa_status_t)create_brig_module_from_brig_file(hsa_brig_file_name, &local_brig_module);
check(Creating the brig module from vector_copy.brig, err);
/*
* Create hsa program.
*/
hsa_ext_program_handle_t local_hsa_program;
err = hsa_ext_program_create(&device, 1, HSA_EXT_BRIG_MACHINE_LARGE, HSA_EXT_BRIG_PROFILE_FULL, &local_hsa_program);
check(Creating the hsa program, err);
/*
* Add the BRIG module to hsa program.
*/
hsa_ext_brig_module_handle_t local_module;
err = hsa_ext_add_module(local_hsa_program, local_brig_module, &local_module);
check(Adding the local brig module to the program, err);
/*
* Construct finalization request list.
*/
hsa_ext_finalization_request_t local_finalization_request_list;
local_finalization_request_list.module = local_module;
local_finalization_request_list.program_call_convention = 0;
//char kernel_name[128] = "&__OpenCL_matrixTranspose_kernel";
err = find_symbol_offset(local_brig_module, hsa_kernel_name, &local_finalization_request_list.symbol);
check(Finding the symbol offset for the kernel, err);
/*
* Finalize the hsa program.
*/
err = hsa_ext_finalize_program(local_hsa_program, device, 1, &local_finalization_request_list, NULL, NULL, 0, NULL, 0);
check(Finalizing the program, err);
/*
* Destroy the brig module. The program was successfully created the kernel
* symbol was found and the program was finalized, so it is no longer needed.
*/
destroy_brig_module(local_brig_module);
/*
* Get the hsa code descriptor address.
*/
hsa_ext_code_descriptor_t *local_hsa_code_descriptor;
err = hsa_ext_query_kernel_descriptor_address(local_hsa_program, local_module, local_finalization_request_list.symbol, &local_hsa_code_descriptor);
check(Querying the kernel descriptor address, err);
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/*
* Create a signal to wait for the dispatch to finish.
*/
hsa_signal_t local_signal;
err=hsa_signal_create(1, 0, NULL, &local_signal);
check(Creating a HSA signal, err);
/* Initialize the dispatch packet */
hsa_kernel_dispatch_packet_t local_dispatch_packet;
memset(&local_dispatch_packet, 0, sizeof(hsa_kernel_dispatch_packet_t));
/*
* Setup the dispatch information.
*/
local_dispatch_packet.completion_signal=local_signal;
local_dispatch_packet.setup |= dim<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
local_dispatch_packet.workgroup_size_x = group_x;
local_dispatch_packet.workgroup_size_y = group_y;
local_dispatch_packet.workgroup_size_z = group_z;
local_dispatch_packet.group_segment_size = s_size;
local_dispatch_packet.grid_size_x = grid_x;
local_dispatch_packet.grid_size_y = grid_y;
local_dispatch_packet.grid_size_z = grid_z;
local_dispatch_packet.header |= HSA_PACKET_TYPE_KERNEL_DISPATCH;
local_dispatch_packet.header |= HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
local_dispatch_packet.header |= HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/*
* Find a memory region that supports kernel arguments.
*/
hsa_region_t local_kernarg_region;
local_kernarg_region.handle = 0;
hsa_agent_iterate_regions(device, get_kernarg, &local_kernarg_region);
err = (local_kernarg_region.handle== 0) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS;
check(Finding a kernarg memory region, err);
void* local_kernel_arg_buffer = NULL;
size_t local_kernel_arg_buffer_size = local_hsa_code_descriptor->kernarg_segment_byte_size;
/*
* Allocate the kernel argument buffer from the correct region.
*/
err = hsa_memory_allocate(local_kernarg_region, local_kernel_arg_buffer_size, &local_kernel_arg_buffer);
check(Allocating kernel argument memory buffer, err);
memcpy(local_kernel_arg_buffer, kernel_args, kernel_args_size);
local_dispatch_packet.kernel_object = local_hsa_code_descriptor->code.handle;
// Assume our kernel receives no arguments
local_dispatch_packet.kernarg_address = local_kernel_arg_buffer;
/*
* Register the memory region for the argument buffer.
*/
err = hsa_memory_register(kernel_args, kernel_args_size);
check(Registering the argument buffer, err);
/*
* Obtain the current queue write index.
*/
uint64_t index = hsa_queue_load_write_index_relaxed(local_command_queue);
/*
* Write the aql packet at the calculated queue index address.
*/
const uint32_t queueMask = local_command_queue->size - 1;
((hsa_kernel_dispatch_packet_t*)(local_command_queue->base_address))[index&queueMask]=local_dispatch_packet;
/*
* Increment the write index and ring the doorbell to dispatch the kernel.
*/
hsa_queue_store_write_index_relaxed(local_command_queue, index+1);
#ifdef TIME
PerfTimer perf_timer_0;
int timer_idx_0 = perf_timer_0.CreateTimer();
perf_timer_0.StartTimer(timer_idx_0);
#endif
hsa_signal_store_release(local_command_queue->doorbell_signal, index);
/*
* Wait on the dispatch signal until all kernel are finished.
*/
while (hsa_signal_wait_acquire(local_signal, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0);
#ifdef TIME
perf_timer_0.StopTimer(timer_idx_0);
#endif
/*
* Cleanup all allocated resources.
*/
err=hsa_signal_destroy(local_signal);
check(Destroying the local_signal, err);
err=hsa_ext_program_destroy(local_hsa_program);
check(Destroying the program, err);
err=hsa_queue_destroy(local_command_queue);
check(Destroying the queue, err);
#ifdef TIME
double ret = perf_timer_0.ReadTimer(timer_idx_0);
#endif
return 0;
}
void HSA::Close()
{
err=hsa_shut_down();
check(Shutting down the runtime, err);
}
+43
Dosyayı Görüntüle
@@ -0,0 +1,43 @@
#ifndef __HSA_BASE__
#define __HSA_BASE__
#include <vector>
#include "hsa.h"
#include "hsa_ext_finalize.h"
#include "elf_utils.h"
#include "hsatimer.h"
#include "utilities.h"
class HSA{
public:
HSA();
~HSA();
public:
void SetBrigFileAndKernelName(char *brig_file_name, char *kernel_name);
bool HsaInit();
void Close();
double Run(int dim, int group_x, int group_y, int group_z, int s_size, int grid_x, int grid_y, int grid_z, void* kernel_args, int kernel_args_size);
public:
hsa_status_t err;
uint32_t queue_size;
hsa_agent_t device;
char hsa_brig_file_name[128];
char hsa_kernel_name[128];
hsa_queue_t* command_queue;
hsa_signal_t hsa_signal;
hsa_ext_brig_module_t* brig_module;
hsa_ext_brig_module_handle_t module;
hsa_ext_program_handle_t hsa_program;
hsa_ext_code_descriptor_t *hsa_code_descriptor;
hsa_kernel_dispatch_packet_t dispatch_packet; // needs to be set manually each time
hsa_region_t hsa_kernarg_region;
};
#endif
+190
Dosyayı Görüntüle
@@ -0,0 +1,190 @@
#include "hsatimer.h"
PerfTimer::PerfTimer()
{
freq_in_100mhz = MeasureTSCFreqHz();
}
PerfTimer::~PerfTimer()
{
while(!_timers.empty())
{
Timer *temp = _timers.back();
_timers.pop_back();
delete temp;
}
}
//a new cretaed timer instantance index will be returned
int PerfTimer::CreateTimer()
{
Timer *newTimer = new Timer;
newTimer->_start = 0;
newTimer->_clocks = 0;
#ifdef _WIN32
QueryPerformanceFrequency((LARGE_INTEGER*)&newTimer->_freq);
#else
newTimer->_freq = (long long)1.0E3;
#endif
/* Push back the address of new Timer instance created */
_timers.push_back(newTimer);
return (int)(_timers.size() - 1);
}
int PerfTimer::StartTimer(int index)
{
if(index >= (int)_timers.size())
{
Error("Cannot reset timer. Invalid handle.");
return HSA_FAILURE;
}
#ifdef _WIN32
// General Windows timing method
#ifndef _AMD
long long tmpStart;
QueryPerformanceCounter((LARGE_INTEGER*)&(tmpStart));
_timers[index]->_start = (double)tmpStart;
#else
// AMD Windows timing method
#endif
#else
// General Linux timing method
#ifndef _AMD
struct timeval s;
gettimeofday(&s, 0);
_timers[index]->_start = s.tv_sec * 1.0E3 + ((double)(s.tv_usec / 1.0E3));
#else
// AMD timing method
unsigned int unused;
_timers[index]->_start = __rdtscp(&unused);
#endif
#endif
return HSA_SUCCESS;
}
int PerfTimer::StopTimer(int index)
{
double n=0;
if(index >= (int)_timers.size())
{
Error("Cannot reset timer. Invalid handle.");
return HSA_FAILURE;
}
#ifdef _WIN32
#ifndef _AMD
long long n1;
QueryPerformanceCounter((LARGE_INTEGER*)&(n1));
n = (double) n1;
#else
// AMD Window Timing
#endif
#else
// General Linux timing method
#ifndef _AMD
struct timeval s;
gettimeofday(&s, 0);
n = s.tv_sec * 1.0E3+ (double)(s.tv_usec/1.0E3);
#else
// AMD Linux timing
unsigned int unused;
n = __rdtscp(&unused);
#endif
#endif
n -= _timers[index]->_start;
_timers[index]->_start = 0;
#ifndef _AMD
_timers[index]->_clocks += n;
#else
//_timers[index]->_clocks += 10 * n /freq_in_100mhz; // unit is ns
_timers[index]->_clocks += 1.0E-6 * 10 * n /freq_in_100mhz; // convert to ms
cout << "_AMD is enabled!!!" << endl;
#endif
return HSA_SUCCESS;
}
void PerfTimer::Error(string str)
{
cout << str << endl;
}
double PerfTimer::ReadTimer(int index)
{
if(index >= (int)_timers.size())
{
Error("Cannot read timer. Invalid handle.");
return HSA_FAILURE;
}
double reading = double(_timers[index]->_clocks);
reading = double(reading / _timers[index]->_freq);
return reading;
}
uint64_t PerfTimer::CoarseTimestampUs()
{
#ifdef _WIN32
uint64_t freqHz, ticks;
QueryPerformanceFrequency((LARGE_INTEGER *)&freqHz);
QueryPerformanceCounter((LARGE_INTEGER *)&ticks);
// Scale numerator and divisor until (ticks * 1000000) fits in uint64_t.
while (ticks > (1ULL << 44)) {
ticks /= 16;
freqHz /= 16;
}
return (ticks * 1000000) / freqHz;
#else
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
return uint64_t(ts.tv_sec) * 1000000 + ts.tv_nsec / 1000;
#endif
}
uint64_t PerfTimer::MeasureTSCFreqHz()
{
// Make a coarse interval measurement of TSC ticks for 1 gigacycles.
unsigned int unused;
uint64_t tscTicksEnd;
uint64_t coarseBeginUs = CoarseTimestampUs();
uint64_t tscTicksBegin = __rdtscp(&unused);
do
{
tscTicksEnd = __rdtscp(&unused);
}
while (tscTicksEnd - tscTicksBegin < 1000000000);
uint64_t coarseEndUs = CoarseTimestampUs();
// Compute the TSC frequency and round to nearest 100MHz.
uint64_t coarseIntervalNs = (coarseEndUs - coarseBeginUs) * 1000;
uint64_t tscIntervalTicks = tscTicksEnd - tscTicksBegin;
return (tscIntervalTicks * 10 + (coarseIntervalNs / 2)) / coarseIntervalNs;
}
+64
Dosyayı Görüntüle
@@ -0,0 +1,64 @@
#ifndef __MYTIME__
#define __MYTIME__
// Will use AMD timer and general Linux timer based on users' need --> compilation flag
// need to consider platform is Windows or Linux
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <x86intrin.h>
#include <string.h>
#include <iostream>
#include <vector>
#include <string>
using namespace std;
#include <sys/time.h>
#define HSA_FAILURE 1
#define HSA_SUCCESS 0
class PerfTimer {
private:
struct Timer
{
string name; /* < name name of time object*/
long long _freq; /* < _freq frequency*/
double _clocks; /* < _clocks number of ticks at end*/
double _start; /* < _start start point ticks*/
};
std::vector<Timer*> _timers; /*< _timers vector to Timer objects */
double freq_in_100mhz;
public:
PerfTimer();
~PerfTimer();
private:
//AMD timing method
uint64_t CoarseTimestampUs();
uint64_t MeasureTSCFreqHz();
//General Linux timing method
public:
int CreateTimer();
int StartTimer(int index);
int StopTimer(int index);
public:
// retrieve time
double ReadTimer(int index);
// write into a file
double WriteTimer(int index);
public:
void Error(string str);
};
#endif
+185
Dosyayı Görüntüle
@@ -0,0 +1,185 @@
#include "utilities.h"
/*
* Prints no more than 256 elements of the given array.
* Prints full array if length is less than 256.
* Prints Array name followed by elements.
*/
template<typename T>
void PrintArray(
string header,
const T * data,
const int width,
const int height)
{
cout<<"\n"<<header<<"\n";
for(int i = 0; i < height; i++)
{
for(int j = 0; j < width; j++)
{
cout<<data[i*width+j]<<" ";
}
cout<<"\n";
}
cout<<"\n";
}
template<typename T>
int IsPowerOf2(T val)
{
long long _val = val;
if((_val & (-_val))-_val == 0 && _val != 0)
return 0;
else
return -1;
}
template<typename T>
T RoundToPowerOf2(T val)
{
int bytes = sizeof(T);
val--;
for(int i = 0; i < bytes; i++)
val |= val >> (1<<i);
val++;
return val;
}
template<typename T>
int FillRandom(
T * arrayPtr,
const int width,
const int height,
const T rangeMin,
const T rangeMax,
unsigned int seed=123)
{
if(!arrayPtr)
{
printf("Cannot fill array. NULL pointer.");
return -1;
}
if(!seed)
seed = (unsigned int)time(NULL);
srand(seed);
double range = double(rangeMax - rangeMin) + 1.0;
/* random initialisation of input */
for(int i = 0; i < height; i++)
for(int j = 0; j < width; j++)
{
int index = i*width + j;
arrayPtr[index] = rangeMin + T(range*rand()/(RAND_MAX + 1.0));
}
return 0;
}
//get a memory region that can be used for global memory allocations.
hsa_status_t get_global_region(hsa_region_t region, void* data)
{
hsa_region_segment_t segment;
hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment);
if (HSA_REGION_SEGMENT_GLOBAL == segment)
{
hsa_region_t* ret = (hsa_region_t*) data;
*ret = region;
}
return HSA_STATUS_SUCCESS;
}
/*
* Finds the specified symbols offset in the specified brig_module.
* If the symbol is found the function returns HSA_STATUS_SUCCESS,
* otherwise it returns HSA_STATUS_ERROR.
*/
hsa_status_t find_symbol_offset(hsa_ext_brig_module_t* brig_module,
char* symbol_name,
hsa_ext_brig_code_section_offset32_t* offset)
{
/*
* Get the data section
*/
hsa_ext_brig_section_header_t* data_section_header =
brig_module->section[HSA_EXT_BRIG_SECTION_DATA];
/*
* Get the code section
*/
hsa_ext_brig_section_header_t* code_section_header =
brig_module->section[HSA_EXT_BRIG_SECTION_CODE];
/*
* First entry into the BRIG code section
*/
BrigCodeOffset32_t code_offset = code_section_header->header_byte_count;
BrigBase* code_entry = (BrigBase*) ((char*)code_section_header + code_offset);
while (code_offset != code_section_header->byte_count)
{
if (code_entry->kind == BRIG_KIND_DIRECTIVE_KERNEL)
{
/*
* Now find the data in the data section
*/
BrigDirectiveExecutable* directive_kernel = (BrigDirectiveExecutable*) (code_entry);
BrigDataOffsetString32_t data_name_offset = directive_kernel->name;
BrigData* data_entry = (BrigData*)((char*) data_section_header + data_name_offset);
if (!strncmp(symbol_name, (char*) data_entry->bytes, strlen(symbol_name)))
{
*offset = code_offset;
return HSA_STATUS_SUCCESS;
}
}
code_offset += code_entry->byteCount;
code_entry = (BrigBase*) ((char*)code_section_header + code_offset);
}
return HSA_STATUS_ERROR;
}
/*
* Determines if the given agent is of type HSA_DEVICE_TYPE_GPU
* and sets the value of data to the agent handle if it is.
*/
hsa_status_t find_gpu(hsa_agent_t agent, void *data)
{
if (data == NULL)
{
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
hsa_device_type_t device_type;
hsa_status_t stat = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
if (stat != HSA_STATUS_SUCCESS)
{
return stat;
}
if (device_type == HSA_DEVICE_TYPE_GPU)
{
*((hsa_agent_t *)data) = agent;
}
return HSA_STATUS_SUCCESS;
}
/*
* Determines if a memory region can be used for kernarg
* allocations.
*/
hsa_status_t get_kernarg(hsa_region_t region, void* data)
{
hsa_region_global_flag_t flags;
hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
{
hsa_region_t* ret = (hsa_region_t*) data;
*ret = region;
}
return HSA_STATUS_SUCCESS;
}
+207
Dosyayı Görüntüle
@@ -0,0 +1,207 @@
#ifndef __HSA_UTILITY__
#define __HSA_UTILITY__
#include <vector>
#include <thread>
#include "hsa.h"
#include "hsa_ext_finalize.h"
#include "elf_utils.h"
#include <string.h>
#include<iostream>
using namespace std;
#define HSA_ARGUMENT_ALIGN_BYTES 16
#define SDK_FAILURE 1
#define SDK_SUCCESS 0
/*
#define check(msg, status) \
if (status != HSA_STATUS_SUCCESS) { \
printf("%s failed.\n", #msg); \
exit(1); \
} else { \
printf("%s succeeded.\n", #msg); \
}
*/
#define check(msg, status) \
if (status != HSA_STATUS_SUCCESS) { \
printf("%s failed.\n", #msg); \
exit(1); \
} else { \
; \
}
/*
* Define required BRIG data structures.
*/
typedef uint32_t BrigCodeOffset32_t;
typedef uint32_t BrigDataOffset32_t;
typedef uint16_t BrigKinds16_t;
typedef uint8_t BrigLinkage8_t;
typedef uint8_t BrigExecutableModifier8_t;
typedef BrigDataOffset32_t BrigDataOffsetString32_t;
enum BrigKinds {
BRIG_KIND_NONE = 0x0000,
BRIG_KIND_DIRECTIVE_BEGIN = 0x1000,
BRIG_KIND_DIRECTIVE_KERNEL = 0x1008,
};
typedef struct BrigBase BrigBase;
struct BrigBase {
uint16_t byteCount;
BrigKinds16_t kind;
};
typedef struct BrigExecutableModifier BrigExecutableModifier;
struct BrigExecutableModifier {
BrigExecutableModifier8_t allBits;
};
typedef struct BrigDirectiveExecutable BrigDirectiveExecutable;
struct BrigDirectiveExecutable {
uint16_t byteCount;
BrigKinds16_t kind;
BrigDataOffsetString32_t name;
uint16_t outArgCount;
uint16_t inArgCount;
BrigCodeOffset32_t firstInArg;
BrigCodeOffset32_t firstCodeBlockEntry;
BrigCodeOffset32_t nextModuleEntry;
uint32_t codeBlockEntryCount;
BrigExecutableModifier modifier;
BrigLinkage8_t linkage;
uint16_t reserved;
};
typedef struct BrigData BrigData;
struct BrigData {
uint32_t byteCount;
uint8_t bytes[1];
};
struct float2
{
float s0;
float s1;
float2 operator * (float2 &fl)
{
float2 temp;
temp.s0 = (this->s0) * fl.s0;
temp.s1 = (this->s1) * fl.s1;
return temp;
}
float2 operator * (float scalar)
{
float2 temp;
temp.s0 = (this->s0) * scalar;
temp.s1 = (this->s1) * scalar;
return temp;
}
float2 operator + (float2 &fl)
{
float2 temp;
temp.s0 = (this->s0) + fl.s0;
temp.s1 = (this->s1) + fl.s1;
return temp;
}
float2 operator - (float2 fl)
{
float2 temp;
temp.s0 = (this->s0) - fl.s0;
temp.s1 = (this->s1) - fl.s1;
return temp;
}
};
struct uint2
{
uint s0;
uint s1;
uint2 operator * (uint2 &fl)
{
uint2 temp;
temp.s0 = (this->s0) * fl.s0;
temp.s1 = (this->s1) * fl.s1;
return temp;
}
uint2 operator * (float scalar)
{
uint2 temp;
temp.s0 = (this->s0) * scalar;
temp.s1 = (this->s1) * scalar;
return temp;
}
uint2 operator + (uint2 &fl)
{
uint2 temp;
temp.s0 = (this->s0) + fl.s0;
temp.s1 = (this->s1) + fl.s1;
return temp;
}
uint2 operator - (uint2 fl)
{
uint2 temp;
temp.s0 = (this->s0) - fl.s0;
temp.s1 = (this->s1) - fl.s1;
return temp;
}
};
/*
* Prints no more than 256 elements of the given array.
* Prints full array if length is less than 256.
* Prints Array name followed by elements.
*/
template<typename T> void PrintArray(string header, const T * data, const int width, const int height);
template<typename T> int IsPowerOf2(T val);
template<typename T> T RoundToPowerOf2(T val);
template<typename T> int FillRandom(T * arrayPtr, const int width, const int height, const T rangeMin, const T rangeMax, unsigned int seed=123);
//get a memory region that can be used for global memory allocations.
hsa_status_t get_global_region(hsa_region_t region, void* data);
/*
* Finds the specified symbols offset in the specified brig_module.
* If the symbol is found the function returns HSA_STATUS_SUCCESS,
* otherwise it returns HSA_STATUS_ERROR.
*/
hsa_status_t find_symbol_offset(hsa_ext_brig_module_t* brig_module, char* symbol_name, hsa_ext_brig_code_section_offset32_t* offset);
/*
* Determines if the given agent is of type HSA_DEVICE_TYPE_GPU
* and sets the value of data to the agent handle if it is.
*/
hsa_status_t find_gpu(hsa_agent_t agent, void *data);
/*
* Determines if a memory region can be used for kernarg
* allocations.
*/
hsa_status_t get_kernarg(hsa_region_t region, void* data);
#endif