code object tracking and v3 code object support
Change-Id: I081ada13f6364ea4401a97a485dedfa9bf8a45fe
[ROCm/rocprofiler commit: 60043d198b]
This commit is contained in:
@@ -179,6 +179,7 @@ usage() {
|
||||
echo ""
|
||||
echo " --trace-period <dealy:length:rate> - to enable trace with initial delay, with periodic sample length and rate"
|
||||
echo " Supported time formats: <number(m|s|ms|us)>"
|
||||
echo " --obj-tracking <on|off> - to turn on/off kernels code objects tracking [off]"
|
||||
echo ""
|
||||
echo "Configuration file:"
|
||||
echo " You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:${HOME}:<package path>"
|
||||
@@ -394,6 +395,10 @@ while [ 1 ] ; do
|
||||
convert_time_val period_rate
|
||||
errck "Option '$ARG_IN', rate value"
|
||||
export ROCP_CTRL_RATE="$period_delay:$period_len:$period_rate"
|
||||
elif [ "$1" = "--obj-tracking" ] ; then
|
||||
if [ "$2" = "on" ] ; then
|
||||
export ROCP_OBJ_TRACKING=1
|
||||
fi
|
||||
elif [ "$1" = "--verbose" ] ; then
|
||||
ARG_VAL=0
|
||||
export ROCP_VERBOSE_MODE=1
|
||||
|
||||
@@ -64,6 +64,7 @@ uint32_t rocprofiler_version_minor();
|
||||
|
||||
typedef struct {
|
||||
uint32_t intercept_mode;
|
||||
uint32_t code_obj_tracking;
|
||||
uint32_t memcopy_tracking;
|
||||
uint32_t trace_size;
|
||||
uint32_t trace_local;
|
||||
@@ -222,7 +223,7 @@ typedef struct {
|
||||
hsa_signal_t completion_signal; // Completion signal
|
||||
const hsa_kernel_dispatch_packet_t* packet; // HSA dispatch packet
|
||||
const char* kernel_name; // Kernel name
|
||||
uint64_t kernel_object; // Kernel object pointer
|
||||
uint64_t kernel_object; // Kernel object address
|
||||
const amd_kernel_code_t* kernel_code; // Kernel code pointer
|
||||
int64_t thread_id; // Thread id
|
||||
const rocprofiler_dispatch_record_t* record; // Dispatch record
|
||||
|
||||
@@ -148,9 +148,20 @@ class InterceptQueue {
|
||||
}
|
||||
|
||||
// Prepareing dispatch callback data
|
||||
const amd_kernel_code_t* kernel_code = GetKernelCode(dispatch_packet);
|
||||
const uint64_t kernel_symbol = kernel_code->runtime_loader_kernel_symbol;
|
||||
const char* kernel_name = GetKernelName(kernel_symbol);
|
||||
uint64_t kernel_object = dispatch_packet->kernel_object;
|
||||
const amd_kernel_code_t* kernel_code = GetKernelCode(kernel_object);
|
||||
|
||||
const uint16_t kernel_object_flag = *((uint64_t*)kernel_code + 1);
|
||||
if (kernel_object_flag == 0) {
|
||||
if (!util::HsaRsrcFactory::IsExecutableTracking()) {
|
||||
fprintf(stderr, "Error: V3 code object detected - code objects tracking should be enabled\n");
|
||||
abort();
|
||||
}
|
||||
}
|
||||
const char* kernel_name = (util::HsaRsrcFactory::IsExecutableTracking()) ?
|
||||
util::HsaRsrcFactory::GetKernelName(kernel_object) :
|
||||
GetKernelName(kernel_code->runtime_loader_kernel_symbol);
|
||||
|
||||
rocprofiler_callback_data_t data = {obj->agent_info_->dev_id,
|
||||
obj->agent_info_->dev_index,
|
||||
obj->queue_,
|
||||
@@ -159,7 +170,7 @@ class InterceptQueue {
|
||||
completion_signal,
|
||||
dispatch_packet,
|
||||
kernel_name,
|
||||
kernel_symbol,
|
||||
kernel_object,
|
||||
kernel_code,
|
||||
syscall(__NR_gettid),
|
||||
(tracker_entry) ? tracker_entry->record : NULL};
|
||||
@@ -243,14 +254,14 @@ class InterceptQueue {
|
||||
return static_cast<hsa_packet_type_t>((*header >> HSA_PACKET_HEADER_TYPE) & header_type_mask);
|
||||
}
|
||||
|
||||
static const amd_kernel_code_t* GetKernelCode(const hsa_kernel_dispatch_packet_t* dispatch_packet) {
|
||||
static const amd_kernel_code_t* GetKernelCode(uint64_t kernel_object) {
|
||||
const amd_kernel_code_t* kernel_code = NULL;
|
||||
hsa_status_t status =
|
||||
util::HsaRsrcFactory::Instance().LoaderApi()->hsa_ven_amd_loader_query_host_address(
|
||||
reinterpret_cast<const void*>(dispatch_packet->kernel_object),
|
||||
reinterpret_cast<const void*>(kernel_object),
|
||||
reinterpret_cast<const void**>(&kernel_code));
|
||||
if (HSA_STATUS_SUCCESS != status) {
|
||||
kernel_code = reinterpret_cast<amd_kernel_code_t*>(dispatch_packet->kernel_object);
|
||||
kernel_code = reinterpret_cast<amd_kernel_code_t*>(kernel_object);
|
||||
}
|
||||
return kernel_code;
|
||||
}
|
||||
|
||||
@@ -144,7 +144,8 @@ void * tool_handle = NULL;
|
||||
// Return true if intercepting mode is enabled
|
||||
enum {
|
||||
DISPATCH_INTERCEPT_MODE = 0x1,
|
||||
MEMCOPY_INTERCEPT_MODE = 0x2
|
||||
CODE_OBJ_TRACKING_MODE = 0x2,
|
||||
MEMCOPY_INTERCEPT_MODE = 0x4,
|
||||
};
|
||||
uint32_t LoadTool() {
|
||||
uint32_t intercept_mode = 0;
|
||||
@@ -188,6 +189,7 @@ uint32_t LoadTool() {
|
||||
util::HsaRsrcFactory::SetTimeoutNs(settings.timeout);
|
||||
InterceptQueue::TrackerOn(settings.timestamp_on != 0);
|
||||
if (settings.intercept_mode != 0) intercept_mode = DISPATCH_INTERCEPT_MODE;
|
||||
if (settings.code_obj_tracking) intercept_mode |= CODE_OBJ_TRACKING_MODE;
|
||||
if (settings.memcopy_tracking) intercept_mode |= MEMCOPY_INTERCEPT_MODE;
|
||||
}
|
||||
|
||||
@@ -432,7 +434,13 @@ PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t fa
|
||||
|
||||
// Loading a tool lib and setting of intercept mode
|
||||
const uint32_t intercept_mode_mask = rocprofiler::LoadTool();
|
||||
if (intercept_mode_mask & rocprofiler::DISPATCH_INTERCEPT_MODE) intercept_mode = true;
|
||||
if (intercept_mode_mask & rocprofiler::DISPATCH_INTERCEPT_MODE) {
|
||||
intercept_mode = true;
|
||||
}
|
||||
if (intercept_mode_mask & rocprofiler::CODE_OBJ_TRACKING_MODE) {
|
||||
if (intercept_mode == false) EXC_RAISING(HSA_STATUS_ERROR, "code objects tracking without intercept mode enabled");
|
||||
rocprofiler::util::HsaRsrcFactory::EnableExecutableTracking(table);
|
||||
}
|
||||
if (intercept_mode_mask & rocprofiler::MEMCOPY_INTERCEPT_MODE) {
|
||||
hsa_status_t status = hsa_amd_profiling_async_copy_enable(true);
|
||||
if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "hsa_amd_profiling_async_copy_enable");
|
||||
|
||||
@@ -193,6 +193,8 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) {
|
||||
hsa_api_.hsa_executable_load_agent_code_object = table->core_->hsa_executable_load_agent_code_object_fn;
|
||||
hsa_api_.hsa_executable_freeze = table->core_->hsa_executable_freeze_fn;
|
||||
hsa_api_.hsa_executable_get_symbol = table->core_->hsa_executable_get_symbol_fn;
|
||||
hsa_api_.hsa_executable_symbol_get_info = table->core_->hsa_executable_symbol_get_info_fn;
|
||||
hsa_api_.hsa_executable_iterate_symbols = table->core_->hsa_executable_iterate_symbols_fn;
|
||||
|
||||
hsa_api_.hsa_system_get_info = table->core_->hsa_system_get_info_fn;
|
||||
hsa_api_.hsa_system_get_major_extension_table = table->core_->hsa_system_get_major_extension_table_fn;
|
||||
@@ -231,6 +233,8 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) {
|
||||
hsa_api_.hsa_executable_load_agent_code_object = hsa_executable_load_agent_code_object;
|
||||
hsa_api_.hsa_executable_freeze = hsa_executable_freeze;
|
||||
hsa_api_.hsa_executable_get_symbol = hsa_executable_get_symbol;
|
||||
hsa_api_.hsa_executable_symbol_get_info = hsa_executable_symbol_get_info;
|
||||
hsa_api_.hsa_executable_iterate_symbols = hsa_executable_iterate_symbols;
|
||||
|
||||
hsa_api_.hsa_system_get_info = hsa_system_get_info;
|
||||
hsa_api_.hsa_system_get_major_extension_table = hsa_system_get_major_extension_table;
|
||||
@@ -681,10 +685,60 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t s
|
||||
return write_idx;
|
||||
}
|
||||
|
||||
const char* HsaRsrcFactory::GetKernelName(uint64_t addr) {
|
||||
std::lock_guard<mutex_t> lck(mutex_);
|
||||
const auto it = symbols_map_->find(addr);
|
||||
if (it == symbols_map_->end()) {
|
||||
fprintf(stderr, "HsaRsrcFactory::kernel addr (0x%lx) is not found\n", addr);
|
||||
abort();
|
||||
}
|
||||
return strdup(it->second);
|
||||
}
|
||||
|
||||
void HsaRsrcFactory::EnableExecutableTracking(HsaApiTable* table) {
|
||||
std::lock_guard<mutex_t> lck(mutex_);
|
||||
executable_tracking_on_ = true;
|
||||
table->core_->hsa_executable_freeze_fn = hsa_executable_freeze_interceptor;
|
||||
}
|
||||
|
||||
hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void *data) {
|
||||
hsa_symbol_kind_t value = (hsa_symbol_kind_t)0;
|
||||
hsa_status_t status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &value);
|
||||
CHECK_STATUS("Error in getting symbol info", status);
|
||||
if (value == HSA_SYMBOL_KIND_KERNEL) {
|
||||
uint64_t addr = 0;
|
||||
uint32_t len = 0;
|
||||
status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &addr);
|
||||
CHECK_STATUS("Error in getting kernel object", status);
|
||||
status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &len);
|
||||
CHECK_STATUS("Error in getting name len", status);
|
||||
char *name = new char[len + 1];
|
||||
status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
|
||||
CHECK_STATUS("Error in getting kernel name", status);
|
||||
name[len] = 0;
|
||||
auto ret = symbols_map_->insert({addr, name});
|
||||
if (ret.second == false) {
|
||||
delete[] ret.first->second;
|
||||
ret.first->second = name;
|
||||
}
|
||||
}
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_status_t HsaRsrcFactory::hsa_executable_freeze_interceptor(hsa_executable_t executable, const char *options) {
|
||||
std::lock_guard<mutex_t> lck(mutex_);
|
||||
if (symbols_map_ == NULL) symbols_map_ = new symbols_map_t;
|
||||
hsa_status_t status = hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, NULL);
|
||||
CHECK_STATUS("Error in iterating executable symbols", status);
|
||||
return hsa_api_.hsa_executable_freeze(executable, options);;
|
||||
}
|
||||
|
||||
std::atomic<HsaRsrcFactory*> HsaRsrcFactory::instance_{};
|
||||
HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_;
|
||||
HsaRsrcFactory::timestamp_t HsaRsrcFactory::timeout_ns_ = HsaTimer::TIMESTAMP_MAX;
|
||||
hsa_pfn_t HsaRsrcFactory::hsa_api_{};
|
||||
bool HsaRsrcFactory::executable_tracking_on_ = false;
|
||||
HsaRsrcFactory::symbols_map_t* HsaRsrcFactory::symbols_map_ = NULL;
|
||||
|
||||
} // namespace util
|
||||
} // namespace rocprofiler
|
||||
|
||||
@@ -95,6 +95,8 @@ struct hsa_pfn_t {
|
||||
decltype(hsa_executable_load_agent_code_object)* hsa_executable_load_agent_code_object;
|
||||
decltype(hsa_executable_freeze)* hsa_executable_freeze;
|
||||
decltype(hsa_executable_get_symbol)* hsa_executable_get_symbol;
|
||||
decltype(hsa_executable_symbol_get_info)* hsa_executable_symbol_get_info;
|
||||
decltype(hsa_executable_iterate_symbols)* hsa_executable_iterate_symbols;
|
||||
|
||||
decltype(hsa_system_get_info)* hsa_system_get_info;
|
||||
decltype(hsa_system_get_major_extension_table)* hsa_system_get_major_extension_table;
|
||||
@@ -323,6 +325,11 @@ class HsaRsrcFactory {
|
||||
static uint64_t Submit(hsa_queue_t* queue, const void* packet);
|
||||
static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes);
|
||||
|
||||
// Enable executables loading tracking
|
||||
static bool IsExecutableTracking() { return executable_tracking_on_; }
|
||||
static void EnableExecutableTracking(HsaApiTable* table);
|
||||
static const char* GetKernelName(uint64_t addr);
|
||||
|
||||
// Initialize HSA API table
|
||||
void static InitHsaApiTable(HsaApiTable* table);
|
||||
static const hsa_pfn_t* HsaApi() { return &hsa_api_; }
|
||||
@@ -387,6 +394,13 @@ class HsaRsrcFactory {
|
||||
// System agents map
|
||||
std::map<hsa_agent_handle_t, const AgentInfo*> agent_map_;
|
||||
|
||||
// Executables loading tracking
|
||||
typedef std::map<uint64_t, const char*> symbols_map_t;
|
||||
static symbols_map_t* symbols_map_;
|
||||
static bool executable_tracking_on_;
|
||||
static hsa_status_t hsa_executable_freeze_interceptor(hsa_executable_t executable, const char *options);
|
||||
static hsa_status_t executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void *data);
|
||||
|
||||
// HSA runtime API table
|
||||
static hsa_pfn_t hsa_api_;
|
||||
|
||||
|
||||
@@ -94,3 +94,6 @@ add_custom_target( mbench
|
||||
COMMAND sh -xc "cp -r ${TEST_DIR}/memory_validation ${PROJECT_BINARY_DIR}/test/."
|
||||
COMMAND make -C "${PROJECT_BINARY_DIR}/test/memory_validation"
|
||||
)
|
||||
|
||||
## Copy OCL test
|
||||
execute_process ( COMMAND sh -xc "cp -r ${TEST_DIR}/ocl ${PROJECT_BINARY_DIR}/test" )
|
||||
|
||||
@@ -73,7 +73,7 @@ void dump_context_entry(context_entry_t* entry) {
|
||||
const rocprofiler_dispatch_record_t* record = entry->data.record;
|
||||
|
||||
fflush(stdout);
|
||||
fprintf(stdout, "kernel symbol(0x%lx) name(\"%s\")", entry->data.kernel_object, kernel_name.c_str());
|
||||
fprintf(stdout, "kernel-object(0x%lx) name(\"%s\")", entry->data.kernel_object, kernel_name.c_str());
|
||||
if (record) fprintf(stdout, ", gpu-id(%u), time(%lu,%lu,%lu,%lu)",
|
||||
HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index,
|
||||
record->dispatch,
|
||||
|
||||
BIN
Binary file not shown.
@@ -0,0 +1,175 @@
|
||||
/**********************************************************************
|
||||
Copyright ©2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
|
||||
|
||||
• Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
|
||||
• Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
|
||||
other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
|
||||
OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
********************************************************************/
|
||||
|
||||
/**
|
||||
* The kernel has two implementation of convolution.
|
||||
* 1. Non-Separable Convolution
|
||||
* 2. Separable Convolution
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* NonSeparableConvolution
|
||||
* is where each pixel of the output image
|
||||
* is the weighted sum of the neighbourhood pixels of the input image
|
||||
* The neighbourhood is defined by the dimensions of the mask and
|
||||
* weight of each neighbour is defined by the mask itself.
|
||||
* @param input Padded Input matrix on which convolution is to be performed
|
||||
* @param mask mask matrix using which convolution was to be performed
|
||||
* @param output Output matrix after performing convolution
|
||||
* @param inputDimensions dimensions of the input matrix
|
||||
* @param maskDimensions dimensions of the mask matrix
|
||||
* @param nExWidth Size of padded input width
|
||||
*/
|
||||
|
||||
__kernel void simpleNonSeparableConvolution(__global uint * input,
|
||||
__global float * mask,
|
||||
__global int * output,
|
||||
const uint2 inputDimensions,
|
||||
const uint2 maskDimensions,
|
||||
const uint nExWidth)
|
||||
{
|
||||
uint tid = get_global_id(0);
|
||||
|
||||
uint width = inputDimensions.x;
|
||||
uint height = inputDimensions.y;
|
||||
|
||||
uint x = tid%width;
|
||||
uint y = tid/width;
|
||||
|
||||
uint maskWidth = maskDimensions.x;
|
||||
uint maskHeight = maskDimensions.y;
|
||||
|
||||
if(x >= width || y >= height)
|
||||
return;
|
||||
|
||||
/*
|
||||
* initializing weighted sum value
|
||||
*/
|
||||
float sumFX = 0.0f;
|
||||
int m = 0, n = 0;
|
||||
|
||||
//performing weighted sum within the mask boundaries
|
||||
for(uint j = y ; j < (y + maskHeight); ++j, m++)
|
||||
{
|
||||
n = 0;
|
||||
for(uint i = x; i < (x + maskWidth); ++i, n++)
|
||||
{
|
||||
uint maskIndex = m * maskWidth + n;
|
||||
uint index = j * nExWidth + i;
|
||||
|
||||
sumFX += ((float)input[index] * mask[maskIndex]);
|
||||
}
|
||||
}
|
||||
|
||||
sumFX += 0.5f;
|
||||
output[tid] = (int)sumFX;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* SeparableConvolution
|
||||
* is product of 2 one-dimensional convolution.
|
||||
* A 2-dimensional convolution operation is separated into 2 one one-dimensional convolution.
|
||||
* SeparableConvolution is implemented in two passes.
|
||||
* The first pass is called Row-wise convolution.
|
||||
* And second pass is called Column-wise convolution.
|
||||
*/
|
||||
|
||||
/**
|
||||
* First Pass - Row-wise convolution
|
||||
* @param input Input matrix on which convolution is to be performed
|
||||
* @param rowFilter rowFilter vector using which row-wise convolution was to be performed
|
||||
* @param tmpOutput Output matrix after performing first pass convolution
|
||||
* @param inputDimensions dimensions of the input matrix
|
||||
* @param filterSize length of row filter vector
|
||||
* @param exInputDimensions dimensions of padded input
|
||||
*/
|
||||
__kernel void simpleSeparableConvolutionPass1(__global uint * input,
|
||||
__global float * rowFilter,
|
||||
__global float * tmpOutput,
|
||||
const uint2 inputDimensions,
|
||||
const uint filterSize,
|
||||
const uint2 exInputDimensions)
|
||||
{
|
||||
int i = 0, cnt = 0;
|
||||
|
||||
uint width = inputDimensions.x;
|
||||
uint height = inputDimensions.y;
|
||||
|
||||
uint tid = get_global_id(0);
|
||||
uint x = tid%width;
|
||||
uint y = tid/width;
|
||||
|
||||
if(x >= width || y >= (height+filterSize-1))
|
||||
return;
|
||||
|
||||
/*
|
||||
* initializing weighted sum value
|
||||
*/
|
||||
float sum = 0.0f;
|
||||
|
||||
for(uint i = x; i < (x + filterSize); ++i) {
|
||||
sum = mad((float)input[y * exInputDimensions.x + i], rowFilter[cnt++], sum);
|
||||
}
|
||||
|
||||
/* Transposed save */
|
||||
tmpOutput[x * exInputDimensions.y + y] = sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* Second Pass - Column-wise convolution
|
||||
* @param input Input matrix on which convolution is to be performed
|
||||
* @param colFilter colFilter vector using which column-wise convolution was to be performed
|
||||
* @param Output Output matrix after performing second pass convolution
|
||||
* @param inputDimensions dimensions of the input matrix
|
||||
* @param filterSize length of col filter vector
|
||||
* @param exInputDimensions dimensions of padded input
|
||||
*/
|
||||
__kernel void simpleSeparableConvolutionPass2(__global float * input,
|
||||
__global float * colFilter,
|
||||
__global int * output,
|
||||
const uint2 inputDimensions,
|
||||
const uint filterSize,
|
||||
const uint2 exInputDimensions)
|
||||
{
|
||||
int i = 0, cnt = 0;
|
||||
|
||||
uint width = inputDimensions.x;
|
||||
uint height = inputDimensions.y;
|
||||
|
||||
uint tid = get_global_id(0);
|
||||
uint x = tid%height;
|
||||
uint y = tid/height;
|
||||
|
||||
if(y >= width || x >= height)
|
||||
return;
|
||||
|
||||
/*
|
||||
* initializing wighted sum value
|
||||
*/
|
||||
float sum = 0.0f;
|
||||
|
||||
for(uint i = x; i < (x + filterSize); ++i) {
|
||||
sum = mad(input[y * exInputDimensions.y + i], colFilter[cnt++], sum);
|
||||
}
|
||||
|
||||
/* Tranposed save */
|
||||
sum += 0.5f;
|
||||
output[x * width + y] = (int)sum;
|
||||
}
|
||||
@@ -74,7 +74,7 @@ export ROCP_TOOL_LIB=./test/libintercept_test.so
|
||||
export ROCP_KITER=50
|
||||
export ROCP_DITER=50
|
||||
export ROCP_AGENTS=1
|
||||
export ROCP_THRS=1
|
||||
export ROCP_THRS=3
|
||||
eval_test "Intercepting usage model test" "../bin/run_tool.sh ./test/ctrl"
|
||||
|
||||
## Standalone sampling usage model test
|
||||
@@ -129,6 +129,12 @@ export ROCP_DITER=4
|
||||
export ROCP_INPUT=input2.xml
|
||||
eval_test "libtool test, counter sets" ./test/ctrl
|
||||
|
||||
## OpenCL test
|
||||
|
||||
export ROCP_OBJ_TRACKING=1
|
||||
export ROCP_INPUT=input1.xml
|
||||
eval_test "libtool test, OpenCL sample" ./test/ocl/SimpleConvolution
|
||||
|
||||
#valgrind --leak-check=full $tbin
|
||||
#valgrind --tool=massif $tbin
|
||||
#ms_print massif.out.<N>
|
||||
|
||||
@@ -881,6 +881,8 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings)
|
||||
}
|
||||
it = opts.find("trace-local");
|
||||
if (it != opts.end()) { settings->trace_local = (it->second == "on"); }
|
||||
it = opts.find("obj-tracking");
|
||||
if (it != opts.end()) { settings->code_obj_tracking = (it->second == "on"); }
|
||||
it = opts.find("memcopies");
|
||||
if (it != opts.end()) { settings->memcopy_tracking = (it->second == "on"); }
|
||||
}
|
||||
@@ -901,6 +903,8 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings)
|
||||
check_env_var("ROCP_TRACE_SIZE", settings->trace_size);
|
||||
// Set trace local buffer
|
||||
check_env_var("ROCP_TRACE_LOCAL", settings->trace_local);
|
||||
// Set code objects tracking
|
||||
check_env_var("ROCP_OBJ_TRACKING", settings->code_obj_tracking);
|
||||
// Set memcopies tracking
|
||||
check_env_var("ROCP_MCOPY_TRACKING", settings->memcopy_tracking);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user