code object tracking and v3 code object support

Change-Id: I081ada13f6364ea4401a97a485dedfa9bf8a45fe


[ROCm/rocprofiler commit: 60043d198b]
This commit is contained in:
Evgeny
2019-11-19 20:18:09 -06:00
parent c6607f3f62
commit 9b183d909e
12 changed files with 293 additions and 12 deletions
+5
View File
@@ -179,6 +179,7 @@ usage() {
echo ""
echo " --trace-period <dealy:length:rate> - to enable trace with initial delay, with periodic sample length and rate"
echo " Supported time formats: <number(m|s|ms|us)>"
echo " --obj-tracking <on|off> - to turn on/off kernels code objects tracking [off]"
echo ""
echo "Configuration file:"
echo " You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:${HOME}:<package path>"
@@ -394,6 +395,10 @@ while [ 1 ] ; do
convert_time_val period_rate
errck "Option '$ARG_IN', rate value"
export ROCP_CTRL_RATE="$period_delay:$period_len:$period_rate"
elif [ "$1" = "--obj-tracking" ] ; then
if [ "$2" = "on" ] ; then
export ROCP_OBJ_TRACKING=1
fi
elif [ "$1" = "--verbose" ] ; then
ARG_VAL=0
export ROCP_VERBOSE_MODE=1
+2 -1
View File
@@ -64,6 +64,7 @@ uint32_t rocprofiler_version_minor();
typedef struct {
uint32_t intercept_mode;
uint32_t code_obj_tracking;
uint32_t memcopy_tracking;
uint32_t trace_size;
uint32_t trace_local;
@@ -222,7 +223,7 @@ typedef struct {
hsa_signal_t completion_signal; // Completion signal
const hsa_kernel_dispatch_packet_t* packet; // HSA dispatch packet
const char* kernel_name; // Kernel name
uint64_t kernel_object; // Kernel object pointer
uint64_t kernel_object; // Kernel object address
const amd_kernel_code_t* kernel_code; // Kernel code pointer
int64_t thread_id; // Thread id
const rocprofiler_dispatch_record_t* record; // Dispatch record
@@ -148,9 +148,20 @@ class InterceptQueue {
}
// Prepareing dispatch callback data
const amd_kernel_code_t* kernel_code = GetKernelCode(dispatch_packet);
const uint64_t kernel_symbol = kernel_code->runtime_loader_kernel_symbol;
const char* kernel_name = GetKernelName(kernel_symbol);
uint64_t kernel_object = dispatch_packet->kernel_object;
const amd_kernel_code_t* kernel_code = GetKernelCode(kernel_object);
const uint16_t kernel_object_flag = *((uint64_t*)kernel_code + 1);
if (kernel_object_flag == 0) {
if (!util::HsaRsrcFactory::IsExecutableTracking()) {
fprintf(stderr, "Error: V3 code object detected - code objects tracking should be enabled\n");
abort();
}
}
const char* kernel_name = (util::HsaRsrcFactory::IsExecutableTracking()) ?
util::HsaRsrcFactory::GetKernelName(kernel_object) :
GetKernelName(kernel_code->runtime_loader_kernel_symbol);
rocprofiler_callback_data_t data = {obj->agent_info_->dev_id,
obj->agent_info_->dev_index,
obj->queue_,
@@ -159,7 +170,7 @@ class InterceptQueue {
completion_signal,
dispatch_packet,
kernel_name,
kernel_symbol,
kernel_object,
kernel_code,
syscall(__NR_gettid),
(tracker_entry) ? tracker_entry->record : NULL};
@@ -243,14 +254,14 @@ class InterceptQueue {
return static_cast<hsa_packet_type_t>((*header >> HSA_PACKET_HEADER_TYPE) & header_type_mask);
}
static const amd_kernel_code_t* GetKernelCode(const hsa_kernel_dispatch_packet_t* dispatch_packet) {
static const amd_kernel_code_t* GetKernelCode(uint64_t kernel_object) {
const amd_kernel_code_t* kernel_code = NULL;
hsa_status_t status =
util::HsaRsrcFactory::Instance().LoaderApi()->hsa_ven_amd_loader_query_host_address(
reinterpret_cast<const void*>(dispatch_packet->kernel_object),
reinterpret_cast<const void*>(kernel_object),
reinterpret_cast<const void**>(&kernel_code));
if (HSA_STATUS_SUCCESS != status) {
kernel_code = reinterpret_cast<amd_kernel_code_t*>(dispatch_packet->kernel_object);
kernel_code = reinterpret_cast<amd_kernel_code_t*>(kernel_object);
}
return kernel_code;
}
+10 -2
View File
@@ -144,7 +144,8 @@ void * tool_handle = NULL;
// Return true if intercepting mode is enabled
enum {
DISPATCH_INTERCEPT_MODE = 0x1,
MEMCOPY_INTERCEPT_MODE = 0x2
CODE_OBJ_TRACKING_MODE = 0x2,
MEMCOPY_INTERCEPT_MODE = 0x4,
};
uint32_t LoadTool() {
uint32_t intercept_mode = 0;
@@ -188,6 +189,7 @@ uint32_t LoadTool() {
util::HsaRsrcFactory::SetTimeoutNs(settings.timeout);
InterceptQueue::TrackerOn(settings.timestamp_on != 0);
if (settings.intercept_mode != 0) intercept_mode = DISPATCH_INTERCEPT_MODE;
if (settings.code_obj_tracking) intercept_mode |= CODE_OBJ_TRACKING_MODE;
if (settings.memcopy_tracking) intercept_mode |= MEMCOPY_INTERCEPT_MODE;
}
@@ -432,7 +434,13 @@ PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t fa
// Loading a tool lib and setting of intercept mode
const uint32_t intercept_mode_mask = rocprofiler::LoadTool();
if (intercept_mode_mask & rocprofiler::DISPATCH_INTERCEPT_MODE) intercept_mode = true;
if (intercept_mode_mask & rocprofiler::DISPATCH_INTERCEPT_MODE) {
intercept_mode = true;
}
if (intercept_mode_mask & rocprofiler::CODE_OBJ_TRACKING_MODE) {
if (intercept_mode == false) EXC_RAISING(HSA_STATUS_ERROR, "code objects tracking without intercept mode enabled");
rocprofiler::util::HsaRsrcFactory::EnableExecutableTracking(table);
}
if (intercept_mode_mask & rocprofiler::MEMCOPY_INTERCEPT_MODE) {
hsa_status_t status = hsa_amd_profiling_async_copy_enable(true);
if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "hsa_amd_profiling_async_copy_enable");
@@ -193,6 +193,8 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) {
hsa_api_.hsa_executable_load_agent_code_object = table->core_->hsa_executable_load_agent_code_object_fn;
hsa_api_.hsa_executable_freeze = table->core_->hsa_executable_freeze_fn;
hsa_api_.hsa_executable_get_symbol = table->core_->hsa_executable_get_symbol_fn;
hsa_api_.hsa_executable_symbol_get_info = table->core_->hsa_executable_symbol_get_info_fn;
hsa_api_.hsa_executable_iterate_symbols = table->core_->hsa_executable_iterate_symbols_fn;
hsa_api_.hsa_system_get_info = table->core_->hsa_system_get_info_fn;
hsa_api_.hsa_system_get_major_extension_table = table->core_->hsa_system_get_major_extension_table_fn;
@@ -231,6 +233,8 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) {
hsa_api_.hsa_executable_load_agent_code_object = hsa_executable_load_agent_code_object;
hsa_api_.hsa_executable_freeze = hsa_executable_freeze;
hsa_api_.hsa_executable_get_symbol = hsa_executable_get_symbol;
hsa_api_.hsa_executable_symbol_get_info = hsa_executable_symbol_get_info;
hsa_api_.hsa_executable_iterate_symbols = hsa_executable_iterate_symbols;
hsa_api_.hsa_system_get_info = hsa_system_get_info;
hsa_api_.hsa_system_get_major_extension_table = hsa_system_get_major_extension_table;
@@ -681,10 +685,60 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t s
return write_idx;
}
const char* HsaRsrcFactory::GetKernelName(uint64_t addr) {
std::lock_guard<mutex_t> lck(mutex_);
const auto it = symbols_map_->find(addr);
if (it == symbols_map_->end()) {
fprintf(stderr, "HsaRsrcFactory::kernel addr (0x%lx) is not found\n", addr);
abort();
}
return strdup(it->second);
}
void HsaRsrcFactory::EnableExecutableTracking(HsaApiTable* table) {
std::lock_guard<mutex_t> lck(mutex_);
executable_tracking_on_ = true;
table->core_->hsa_executable_freeze_fn = hsa_executable_freeze_interceptor;
}
hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void *data) {
hsa_symbol_kind_t value = (hsa_symbol_kind_t)0;
hsa_status_t status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &value);
CHECK_STATUS("Error in getting symbol info", status);
if (value == HSA_SYMBOL_KIND_KERNEL) {
uint64_t addr = 0;
uint32_t len = 0;
status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &addr);
CHECK_STATUS("Error in getting kernel object", status);
status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &len);
CHECK_STATUS("Error in getting name len", status);
char *name = new char[len + 1];
status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
CHECK_STATUS("Error in getting kernel name", status);
name[len] = 0;
auto ret = symbols_map_->insert({addr, name});
if (ret.second == false) {
delete[] ret.first->second;
ret.first->second = name;
}
}
return HSA_STATUS_SUCCESS;
}
hsa_status_t HsaRsrcFactory::hsa_executable_freeze_interceptor(hsa_executable_t executable, const char *options) {
std::lock_guard<mutex_t> lck(mutex_);
if (symbols_map_ == NULL) symbols_map_ = new symbols_map_t;
hsa_status_t status = hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, NULL);
CHECK_STATUS("Error in iterating executable symbols", status);
return hsa_api_.hsa_executable_freeze(executable, options);;
}
std::atomic<HsaRsrcFactory*> HsaRsrcFactory::instance_{};
HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_;
HsaRsrcFactory::timestamp_t HsaRsrcFactory::timeout_ns_ = HsaTimer::TIMESTAMP_MAX;
hsa_pfn_t HsaRsrcFactory::hsa_api_{};
bool HsaRsrcFactory::executable_tracking_on_ = false;
HsaRsrcFactory::symbols_map_t* HsaRsrcFactory::symbols_map_ = NULL;
} // namespace util
} // namespace rocprofiler
@@ -95,6 +95,8 @@ struct hsa_pfn_t {
decltype(hsa_executable_load_agent_code_object)* hsa_executable_load_agent_code_object;
decltype(hsa_executable_freeze)* hsa_executable_freeze;
decltype(hsa_executable_get_symbol)* hsa_executable_get_symbol;
decltype(hsa_executable_symbol_get_info)* hsa_executable_symbol_get_info;
decltype(hsa_executable_iterate_symbols)* hsa_executable_iterate_symbols;
decltype(hsa_system_get_info)* hsa_system_get_info;
decltype(hsa_system_get_major_extension_table)* hsa_system_get_major_extension_table;
@@ -323,6 +325,11 @@ class HsaRsrcFactory {
static uint64_t Submit(hsa_queue_t* queue, const void* packet);
static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes);
// Enable executables loading tracking
static bool IsExecutableTracking() { return executable_tracking_on_; }
static void EnableExecutableTracking(HsaApiTable* table);
static const char* GetKernelName(uint64_t addr);
// Initialize HSA API table
void static InitHsaApiTable(HsaApiTable* table);
static const hsa_pfn_t* HsaApi() { return &hsa_api_; }
@@ -387,6 +394,13 @@ class HsaRsrcFactory {
// System agents map
std::map<hsa_agent_handle_t, const AgentInfo*> agent_map_;
// Executables loading tracking
typedef std::map<uint64_t, const char*> symbols_map_t;
static symbols_map_t* symbols_map_;
static bool executable_tracking_on_;
static hsa_status_t hsa_executable_freeze_interceptor(hsa_executable_t executable, const char *options);
static hsa_status_t executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void *data);
// HSA runtime API table
static hsa_pfn_t hsa_api_;
+3
View File
@@ -94,3 +94,6 @@ add_custom_target( mbench
COMMAND sh -xc "cp -r ${TEST_DIR}/memory_validation ${PROJECT_BINARY_DIR}/test/."
COMMAND make -C "${PROJECT_BINARY_DIR}/test/memory_validation"
)
## Copy OCL test
execute_process ( COMMAND sh -xc "cp -r ${TEST_DIR}/ocl ${PROJECT_BINARY_DIR}/test" )
@@ -73,7 +73,7 @@ void dump_context_entry(context_entry_t* entry) {
const rocprofiler_dispatch_record_t* record = entry->data.record;
fflush(stdout);
fprintf(stdout, "kernel symbol(0x%lx) name(\"%s\")", entry->data.kernel_object, kernel_name.c_str());
fprintf(stdout, "kernel-object(0x%lx) name(\"%s\")", entry->data.kernel_object, kernel_name.c_str());
if (record) fprintf(stdout, ", gpu-id(%u), time(%lu,%lu,%lu,%lu)",
HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index,
record->dispatch,
Binary file not shown.
@@ -0,0 +1,175 @@
/**********************************************************************
Copyright ©2015 Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
********************************************************************/
/**
* The kernel has two implementation of convolution.
* 1. Non-Separable Convolution
* 2. Separable Convolution
*/
/**
* NonSeparableConvolution
* is where each pixel of the output image
* is the weighted sum of the neighbourhood pixels of the input image
* The neighbourhood is defined by the dimensions of the mask and
* weight of each neighbour is defined by the mask itself.
* @param input Padded Input matrix on which convolution is to be performed
* @param mask mask matrix using which convolution was to be performed
* @param output Output matrix after performing convolution
* @param inputDimensions dimensions of the input matrix
* @param maskDimensions dimensions of the mask matrix
* @param nExWidth Size of padded input width
*/
__kernel void simpleNonSeparableConvolution(__global uint * input,
__global float * mask,
__global int * output,
const uint2 inputDimensions,
const uint2 maskDimensions,
const uint nExWidth)
{
uint tid = get_global_id(0);
uint width = inputDimensions.x;
uint height = inputDimensions.y;
uint x = tid%width;
uint y = tid/width;
uint maskWidth = maskDimensions.x;
uint maskHeight = maskDimensions.y;
if(x >= width || y >= height)
return;
/*
* initializing weighted sum value
*/
float sumFX = 0.0f;
int m = 0, n = 0;
//performing weighted sum within the mask boundaries
for(uint j = y ; j < (y + maskHeight); ++j, m++)
{
n = 0;
for(uint i = x; i < (x + maskWidth); ++i, n++)
{
uint maskIndex = m * maskWidth + n;
uint index = j * nExWidth + i;
sumFX += ((float)input[index] * mask[maskIndex]);
}
}
sumFX += 0.5f;
output[tid] = (int)sumFX;
}
/**
* SeparableConvolution
* is product of 2 one-dimensional convolution.
* A 2-dimensional convolution operation is separated into 2 one one-dimensional convolution.
* SeparableConvolution is implemented in two passes.
* The first pass is called Row-wise convolution.
* And second pass is called Column-wise convolution.
*/
/**
* First Pass - Row-wise convolution
* @param input Input matrix on which convolution is to be performed
* @param rowFilter rowFilter vector using which row-wise convolution was to be performed
* @param tmpOutput Output matrix after performing first pass convolution
* @param inputDimensions dimensions of the input matrix
* @param filterSize length of row filter vector
* @param exInputDimensions dimensions of padded input
*/
__kernel void simpleSeparableConvolutionPass1(__global uint * input,
__global float * rowFilter,
__global float * tmpOutput,
const uint2 inputDimensions,
const uint filterSize,
const uint2 exInputDimensions)
{
int i = 0, cnt = 0;
uint width = inputDimensions.x;
uint height = inputDimensions.y;
uint tid = get_global_id(0);
uint x = tid%width;
uint y = tid/width;
if(x >= width || y >= (height+filterSize-1))
return;
/*
* initializing weighted sum value
*/
float sum = 0.0f;
for(uint i = x; i < (x + filterSize); ++i) {
sum = mad((float)input[y * exInputDimensions.x + i], rowFilter[cnt++], sum);
}
/* Transposed save */
tmpOutput[x * exInputDimensions.y + y] = sum;
}
/**
* Second Pass - Column-wise convolution
* @param input Input matrix on which convolution is to be performed
* @param colFilter colFilter vector using which column-wise convolution was to be performed
* @param Output Output matrix after performing second pass convolution
* @param inputDimensions dimensions of the input matrix
* @param filterSize length of col filter vector
* @param exInputDimensions dimensions of padded input
*/
__kernel void simpleSeparableConvolutionPass2(__global float * input,
__global float * colFilter,
__global int * output,
const uint2 inputDimensions,
const uint filterSize,
const uint2 exInputDimensions)
{
int i = 0, cnt = 0;
uint width = inputDimensions.x;
uint height = inputDimensions.y;
uint tid = get_global_id(0);
uint x = tid%height;
uint y = tid/height;
if(y >= width || x >= height)
return;
/*
* initializing wighted sum value
*/
float sum = 0.0f;
for(uint i = x; i < (x + filterSize); ++i) {
sum = mad(input[y * exInputDimensions.y + i], colFilter[cnt++], sum);
}
/* Tranposed save */
sum += 0.5f;
output[x * width + y] = (int)sum;
}
+7 -1
View File
@@ -74,7 +74,7 @@ export ROCP_TOOL_LIB=./test/libintercept_test.so
export ROCP_KITER=50
export ROCP_DITER=50
export ROCP_AGENTS=1
export ROCP_THRS=1
export ROCP_THRS=3
eval_test "Intercepting usage model test" "../bin/run_tool.sh ./test/ctrl"
## Standalone sampling usage model test
@@ -129,6 +129,12 @@ export ROCP_DITER=4
export ROCP_INPUT=input2.xml
eval_test "libtool test, counter sets" ./test/ctrl
## OpenCL test
export ROCP_OBJ_TRACKING=1
export ROCP_INPUT=input1.xml
eval_test "libtool test, OpenCL sample" ./test/ocl/SimpleConvolution
#valgrind --leak-check=full $tbin
#valgrind --tool=massif $tbin
#ms_print massif.out.<N>
+4
View File
@@ -881,6 +881,8 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings)
}
it = opts.find("trace-local");
if (it != opts.end()) { settings->trace_local = (it->second == "on"); }
it = opts.find("obj-tracking");
if (it != opts.end()) { settings->code_obj_tracking = (it->second == "on"); }
it = opts.find("memcopies");
if (it != opts.end()) { settings->memcopy_tracking = (it->second == "on"); }
}
@@ -901,6 +903,8 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings)
check_env_var("ROCP_TRACE_SIZE", settings->trace_size);
// Set trace local buffer
check_env_var("ROCP_TRACE_LOCAL", settings->trace_local);
// Set code objects tracking
check_env_var("ROCP_OBJ_TRACKING", settings->code_obj_tracking);
// Set memcopies tracking
check_env_var("ROCP_MCOPY_TRACKING", settings->memcopy_tracking);