Adding Make for bulding code samples
Change-Id: I7b62a4c65c5560239e69ea121c6fdaef188f709d
このコミットが含まれているのは:
@@ -0,0 +1,50 @@
|
||||
ROCM_PATH ?=/opt/rocm
|
||||
ROCPROFILER_LIBS_PATH ?=$(ROCM_PATH)/lib
|
||||
ROCM_INCLUDES=-I$(ROCM_PATH)/include
|
||||
ROCPROFILER_INCLUDES=-I$(ROCM_PATH)/include/rocprofiler/
|
||||
|
||||
LIBS=-L$(ROCPROFILER_LIBS_PATH) -lrocprofiler64 -lamd_comgr -lsystemd
|
||||
|
||||
ifndef ROCPROFILER_LIBS_PATH
|
||||
$(warning You may need to set ROCPROFILER_LIBS_PATH to the path of the rocprofiler source)
|
||||
endif
|
||||
|
||||
.PHONY: all
|
||||
all: kernel_profiling_no_replay_sample device_profiling_sample kernel_replay_sample application_replay_sample user_replay_sample tracer_sample pc_sampler
|
||||
|
||||
CXX=hipcc -std=c++17 -Wall
|
||||
|
||||
DEBUG=1
|
||||
ifeq ($(DEBUG), 1)
|
||||
CFLAGS =-DDEBUG -g
|
||||
else
|
||||
CFLAGS=-DNDEBUG
|
||||
endif
|
||||
|
||||
kernel_profiling_no_replay_sample: profiler/kernel_profiling_no_replay_sample.cpp common/helper.cpp
|
||||
$(CXX) $^ $(LDFLAGS) -o $@ $(LIBS) $(ROCPROFILER_INCLUDES) $(ROCM_INCLUDES) $(HIP_INCLUDES) $(CFLAGS)
|
||||
|
||||
device_profiling_sample: profiler/device_profiling_sample.cpp common/helper.cpp
|
||||
$(CXX) $^ $(LDFLAGS) -o $@ $(LIBS) $(ROCPROFILER_INCLUDES) $(ROCM_INCLUDES) $(HIP_INCLUDES) $(CFLAGS)
|
||||
|
||||
kernel_replay_sample: profiler/kernel_replay_sample.cpp common/helper.cpp
|
||||
$(CXX) $^ $(LDFLAGS) -o $@ $(LIBS) $(ROCPROFILER_INCLUDES) $(ROCM_INCLUDES) $(HIP_INCLUDES) $(CFLAGS)
|
||||
|
||||
application_replay_sample: profiler/application_replay_sample.cpp common/helper.cpp
|
||||
$(CXX) $^ $(LDFLAGS) -o $@ $(LIBS) $(ROCPROFILER_INCLUDES) $(ROCM_INCLUDES) $(HIP_INCLUDES) $(CFLAGS)
|
||||
|
||||
user_replay_sample: profiler/user_replay_sample.cpp common/helper.cpp
|
||||
$(CXX) $^ $(LDFLAGS) -o $@ $(LIBS) $(ROCPROFILER_INCLUDES) $(ROCM_INCLUDES) $(HIP_INCLUDES) $(CFLAGS)
|
||||
|
||||
tracer_sample: tracer/sample.cpp common/helper.cpp
|
||||
$(CXX) $^ $(LDFLAGS) -o $@ $(LIBS) $(ROCPROFILER_INCLUDES) $(ROCM_INCLUDES) $(HIP_INCLUDES) $(CFLAGS)
|
||||
|
||||
pc_sampler :
|
||||
cd pcsampler/code_printing_sample && $(MAKE)
|
||||
|
||||
|
||||
|
||||
.PHONY: clean
|
||||
|
||||
clean:
|
||||
$(RM) -rf kernel_profiling_no_replay_sample device_profiling_sample kernel_replay_sample application_replay_sample user_replay_sample tracer_sample && cd pcsampler/code_printing_sample && $(MAKE) clean
|
||||
@@ -0,0 +1,92 @@
|
||||
## DISCLAIMER
|
||||
|
||||
The information presented in this document is for informational purposes only and may contain technical inaccuracies, omissions, and typographical errors. The information contained herein is subject to change and may be rendered inaccurate for many reasons, including but not limited to product and roadmap changes, component and motherboard version changes, new model and/or product releases, product differences between differing manufacturers, software changes, BIOS flashes, firmware upgrades, or the like. Any computer system has risks of security vulnerabilities that cannot be completely prevented or mitigated. AMD assumes no obligation to update or otherwise correct or revise this information. However, AMD reserves the right to revise this information and to make changes from time to time to the content hereof without obligation of AMD to notify any person of such revisions or changes.THIS INFORMATION IS PROVIDED ‘AS IS.” AMD MAKES NO REPRESENTATIONS OR WARRANTIES WITH RESPECT TO THE CONTENTS HEREOF AND ASSUMES NO RESPONSIBILITY FOR ANY INACCURACIES, ERRORS, OR OMISSIONS THAT MAY APPEAR IN THIS INFORMATION. AMD SPECIFICALLY DISCLAIMS ANY IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR ANY PARTICULAR PURPOSE. IN NO EVENT WILL AMD BE LIABLE TO ANY PERSON FOR ANY RELIANCE, DIRECT, INDIRECT, SPECIAL, OR OTHER CONSEQUENTIAL DAMAGES ARISING FROM THE USE OF ANY INFORMATION CONTAINED HEREIN, EVEN IF AMD IS EXPRESSLY ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. AMD, the AMD Arrow logo, and combinations thereof are trademarks of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies.
|
||||
|
||||
© 2023 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
|
||||
|
||||
## ROCProfiler API Concepts
|
||||
- Session
|
||||
- Filter
|
||||
- Buffer
|
||||
|
||||
|
||||
## API Philosophy
|
||||
|
||||
The APIs provide a common interface to the users for different
|
||||
features such as profiling, tracing.
|
||||
|
||||
In order to make use of any functionality of rocprofv2, one needs to create
|
||||
a "Session" object. This session could be a profiling session/tracing/pc-sampling session etc.
|
||||
|
||||
In order to set user inputs, one needs to provide a "Filter" to a session object.
|
||||
This filter could be for counters/traces/pc-samples etc.
|
||||
|
||||
Now that the input is taken care of, one also needs to provide a "Buffer" which
|
||||
will store the output results generated during a session. This buffer will contain
|
||||
different records corresponding to the filter type chosen. A flush function can also
|
||||
be specified for the buffer, which will be used to flush the buffer records.
|
||||
A filter and buffer are associated together.
|
||||
|
||||
Once a Session, Buffer, Filter have all been created, the session can be started.
|
||||
One can control when the session can be started, stopped and destroyed.
|
||||
|
||||
## Descriptions of Code Samples
|
||||
### kernel_profiling_sample.cpp
|
||||
This code sample demonstrates how to use the APIs to collect performance counters and metrics for every kernel dispatch.
|
||||
|
||||
### tracer_sample.cpp
|
||||
This code sample demonstrates how to use the APIs to collect different API and activity traces:
|
||||
- HIP API
|
||||
- HIP OPS
|
||||
- HSA API
|
||||
- HSA OPS
|
||||
- ROCTX
|
||||
|
||||
### device_profiling_sample.cpp
|
||||
This code sample demonstrates how to use the APIs to collect counters and metrics from the GPU via user defined sampling, instead of per-kernel dipatch measurements.
|
||||
|
||||
|
||||
## How to compile
|
||||
In order to get the samples to compile, make sure to copy rocmtools binaries into /opt/rocm/lib
|
||||
Running 'make install' inside the rocmtools/build folder will copy the binaries to /opt/rocm/lib
|
||||
|
||||
Alternately, change the 'ROCMTOOLS_LIBS_PATH' variable in the Makefile to point to the rocmtools/build folder.
|
||||
After modifications to Makefile are done, run:
|
||||
|
||||
```bash
|
||||
# compile all samples
|
||||
make
|
||||
```
|
||||
|
||||
```bash
|
||||
# compile kernel_profiling_no_replay_sample.cpp
|
||||
make kernel_profiling_no_replay_sample
|
||||
```
|
||||
|
||||
### How to run
|
||||
Before running, ROCPROFILER_METRICS_PATH needs to be set to point to 'derived_counters.xml'
|
||||
If the rocprofiler binaries are present in the rocm installation path /opt/rocm
|
||||
then below command will work:
|
||||
```bash
|
||||
export ROCPROFILER_METRICS_PATH=/opt/rocm/libexec/rocprofiler/counters/derived_counters.xml
|
||||
```
|
||||
|
||||
Otherwise, make it point to rocprofiler/build/counters/derived_counters.xml like below:
|
||||
```bash
|
||||
export ROCPROFILER_METRICS_PATH=<path_to_rocprofiler>/rocprofiler/buid/counters/derived_counters.xml
|
||||
```
|
||||
|
||||
Finally, run a sample:
|
||||
```bash
|
||||
./kernel_profiling_no_replay_sample
|
||||
```
|
||||
|
||||
## PC-Sampler
|
||||
The ROCProfiler library includes an API to enable periodic sampling of the GPU
|
||||
program counter during kernel execution. An example program is included that demonstrates the PC
|
||||
sampling API, with additional code to illustrate a typical non-trivial use case:
|
||||
correlation of sampled PC addresses with their disassembled machine code, as
|
||||
well as source code and symbolic debugging information if available.
|
||||
|
||||
See [PC-Sampler README](pcsampler/code_printing_sample/README.md)
|
||||
@@ -26,7 +26,7 @@
|
||||
#include <string>
|
||||
#include <mutex>
|
||||
|
||||
#include "src/utils/helper.h"
|
||||
#include "helper.h"
|
||||
|
||||
// Custom assert to print error messages
|
||||
#define ASSERTM(exp, msg) assert(((void)msg, exp))
|
||||
@@ -103,6 +103,7 @@ const char* GetDomainName(rocprofiler_tracer_activity_domain_t domain) {
|
||||
break;
|
||||
default:
|
||||
return "";
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -117,7 +118,7 @@ void FlushTracerRecord(rocprofiler_record_tracer_t tracer_record, rocprofiler_se
|
||||
std::string kernel_name;
|
||||
std::string function_name;
|
||||
std::string roctx_message;
|
||||
uint64_t roctx_id;
|
||||
uint64_t roctx_id = 0;
|
||||
if ((tracer_record.operation_id.id == 0 && tracer_record.domain == ACTIVITY_DOMAIN_HIP_OPS)) {
|
||||
if (tracer_record.api_data_handle.handle &&
|
||||
strlen(reinterpret_cast<const char*>(tracer_record.api_data_handle.handle)) > 1)
|
||||
@@ -207,12 +208,11 @@ void FlushProfilerRecord(const rocprofiler_record_profiler_t* profiler_record,
|
||||
rocprofiler_session_id_t session_id, rocprofiler_buffer_id_t buffer_id) {
|
||||
std::lock_guard<std::mutex> lock(writing_lock);
|
||||
size_t name_length = 0;
|
||||
bool is_counter = true;
|
||||
CHECK_ROCPROFILER(rocprofiler_query_kernel_info_size(ROCPROFILER_KERNEL_NAME,
|
||||
profiler_record->kernel_id, &name_length));
|
||||
// Taken from rocprofiler: The size hasn't changed in recent past
|
||||
static const uint32_t lds_block_size = 128 * 4;
|
||||
const char* kernel_name_c;
|
||||
const char* kernel_name_c = "";
|
||||
if (name_length > 1) {
|
||||
kernel_name_c = static_cast<const char*>(malloc(name_length * sizeof(char)));
|
||||
CHECK_ROCPROFILER(rocprofiler_query_kernel_info(ROCPROFILER_KERNEL_NAME, profiler_record->kernel_id,
|
||||
@@ -345,6 +345,7 @@ void kernelCalls(char c) {
|
||||
}
|
||||
default: {
|
||||
fprintf(stderr, "Error: Wrong Kernel character (%c) Given for kernelCalls!\n", c);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,180 @@
|
||||
/* Copyright (c) 2022 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE. */
|
||||
|
||||
#include "helper.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdarg>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <amd_comgr/amd_comgr.h>
|
||||
|
||||
#define amd_comgr_(call) \
|
||||
do { \
|
||||
if (amd_comgr_status_t status = amd_comgr_##call; status != AMD_COMGR_STATUS_SUCCESS) { \
|
||||
const char* reason = ""; \
|
||||
amd_comgr_status_string(status, &reason); \
|
||||
fatal(#call " failed: %s", reason); \
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
namespace rocmtools {
|
||||
|
||||
std::string string_vprintf(const char* format, va_list va) {
|
||||
va_list copy;
|
||||
|
||||
va_copy(copy, va);
|
||||
size_t size = vsnprintf(NULL, 0, format, copy);
|
||||
va_end(copy);
|
||||
|
||||
std::string str(size, '\0');
|
||||
vsprintf(&str[0], format, va);
|
||||
|
||||
return str;
|
||||
}
|
||||
|
||||
std::string string_printf(const char* format, ...) {
|
||||
va_list va;
|
||||
va_start(va, format);
|
||||
std::string str(string_vprintf(format, va));
|
||||
va_end(va);
|
||||
|
||||
return str;
|
||||
}
|
||||
|
||||
[[maybe_unused]] void warning(const char* format, ...) {
|
||||
va_list va;
|
||||
va_start(va, format);
|
||||
vfprintf(stderr, format, va);
|
||||
va_end(va);
|
||||
}
|
||||
|
||||
[[maybe_unused]] void fatal [[noreturn]] (const char* format, ...) {
|
||||
va_list va;
|
||||
va_start(va, format);
|
||||
std::string message = string_vprintf(format, va);
|
||||
va_end(va);
|
||||
|
||||
#if defined(ENABLE_BACKTRACE)
|
||||
BackTraceInfo info;
|
||||
|
||||
info.sstream << std::endl << "Backtrace:";
|
||||
info.state = ::backtrace_create_state("/proc/self/exe", 0, errorCallback, &info);
|
||||
::backtrace_full(info.state, 1, fullCallback, errorCallback, &info);
|
||||
|
||||
message += info.sstream.str();
|
||||
#endif /* defined (ENABLE_BACKTRACE) */
|
||||
|
||||
std::string errmsg("ROCMTools: fatal error: " + message);
|
||||
fputs(errmsg.c_str(), stderr);
|
||||
|
||||
throw(errmsg);
|
||||
}
|
||||
|
||||
/* The function extracts the kernel name from
|
||||
input string. By using the iterators it finds the
|
||||
window in the string which contains only the kernel name.
|
||||
For example 'Foo<int, float>::foo(a[], int (int))' -> 'foo'*/
|
||||
std::string truncate_name(const std::string& name) {
|
||||
auto rit = name.rbegin();
|
||||
auto rend = name.rend();
|
||||
uint32_t counter = 0;
|
||||
char open_token = 0;
|
||||
char close_token = 0;
|
||||
while (rit != rend) {
|
||||
if (counter == 0) {
|
||||
switch (*rit) {
|
||||
case ')':
|
||||
counter = 1;
|
||||
open_token = ')';
|
||||
close_token = '(';
|
||||
break;
|
||||
case '>':
|
||||
counter = 1;
|
||||
open_token = '>';
|
||||
close_token = '<';
|
||||
break;
|
||||
case ']':
|
||||
counter = 1;
|
||||
open_token = ']';
|
||||
close_token = '[';
|
||||
break;
|
||||
case ' ':
|
||||
++rit;
|
||||
continue;
|
||||
}
|
||||
if (counter == 0) break;
|
||||
} else {
|
||||
if (*rit == open_token) counter++;
|
||||
if (*rit == close_token) counter--;
|
||||
}
|
||||
++rit;
|
||||
}
|
||||
auto rbeg = rit;
|
||||
while ((rit != rend) && (*rit != ' ') && (*rit != ':')) rit++;
|
||||
return name.substr(rend - rit, rit - rbeg);
|
||||
}
|
||||
|
||||
// C++ symbol demangle
|
||||
std::string cxx_demangle(const std::string& symbol) {
|
||||
amd_comgr_data_t mangled_data;
|
||||
amd_comgr_(create_data(AMD_COMGR_DATA_KIND_BYTES, &mangled_data));
|
||||
amd_comgr_(set_data(mangled_data, symbol.size(), symbol.data()));
|
||||
|
||||
amd_comgr_data_t demangled_data;
|
||||
amd_comgr_(demangle_symbol_name(mangled_data, &demangled_data));
|
||||
|
||||
size_t demangled_size = 0;
|
||||
amd_comgr_(get_data(demangled_data, &demangled_size, nullptr));
|
||||
|
||||
std::string demangled_str;
|
||||
demangled_str.resize(demangled_size);
|
||||
// amd_comgr_(get_data(demangled_data, &demangled_size, demangled_str.data())); // TODO: uncomment
|
||||
|
||||
amd_comgr_(release_data(mangled_data));
|
||||
amd_comgr_(release_data(demangled_data));
|
||||
return demangled_str;
|
||||
}
|
||||
|
||||
// check if string has special char
|
||||
bool has_special_char(std::string const& str) {
|
||||
return std::find_if(str.begin(), str.end(), [](unsigned char ch) {
|
||||
return !(isalnum(ch) || ch == '_' || ch == ':' || ch == ' ');
|
||||
}) != str.end();
|
||||
}
|
||||
|
||||
// check if string has correct counter format
|
||||
bool has_counter_format(std::string const& str) {
|
||||
return std::find_if(str.begin(), str.end(), [](unsigned char ch) {
|
||||
return (isalnum(ch) || ch == '_' || ch != ':');
|
||||
}) != str.end();
|
||||
}
|
||||
|
||||
// trims the begining of the line for spaces
|
||||
std::string left_trim(const std::string& s) {
|
||||
const std::string WHITESPACE = " \n\r\t\f\v";
|
||||
size_t start = s.find_first_not_of(WHITESPACE);
|
||||
return (start == std::string::npos) ? "" : s.substr(start);
|
||||
}
|
||||
|
||||
|
||||
} // namespace rocmtools
|
||||
@@ -0,0 +1,72 @@
|
||||
/* Copyright (c) 2022 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE. */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdarg>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include <cxxabi.h>
|
||||
// #include "exception.h"
|
||||
|
||||
namespace rocmtools {
|
||||
|
||||
std::string string_vprintf(const char* format, va_list va);
|
||||
|
||||
std::string string_printf(const char* format, ...);
|
||||
|
||||
[[maybe_unused]] void warning(const char* format, ...)
|
||||
#if defined(__GNUC__)
|
||||
__attribute__((format(printf, 1, 2)))
|
||||
#endif /* defined (__GNUC__) */
|
||||
;
|
||||
[[maybe_unused]] void fatal [[noreturn]] (const char* format, ...)
|
||||
#if defined(__GNUC__)
|
||||
__attribute__((format(printf, 1, 2)))
|
||||
#endif /* defined (__GNUC__) */
|
||||
;
|
||||
|
||||
[[maybe_unused]] void warning(const char* format, ...);
|
||||
|
||||
[[maybe_unused]] void fatal [[noreturn]] (const char* format, ...);
|
||||
|
||||
/* The function extracts the kernel name from
|
||||
input string. By using the iterators it finds the
|
||||
window in the string which contains only the kernel name.
|
||||
For example 'Foo<int, float>::foo(a[], int (int))' -> 'foo'*/
|
||||
std::string truncate_name(const std::string& name);
|
||||
|
||||
// C++ symbol demangle
|
||||
std::string cxx_demangle(const std::string& symbol);
|
||||
|
||||
// check if string has special char
|
||||
bool has_special_char(std::string const& str);
|
||||
|
||||
// check if string has correct counter format
|
||||
bool has_counter_format(std::string const& str);
|
||||
|
||||
// trims the begining of the line for spaces
|
||||
std::string left_trim(const std::string& s);
|
||||
|
||||
} // namespace rocmtools
|
||||
@@ -0,0 +1,68 @@
|
||||
# -*- makefile-gmake -*-
|
||||
|
||||
ROCM_PATH ?= /opt/rocm
|
||||
HIP_PATH ?= $(ROCM_PATH)/hip
|
||||
HIPCC := $(HIP_PATH)/bin/hipcc
|
||||
|
||||
ifndef ROCPROFILER_PATH
|
||||
$(warning You may need to set ROCPROFILER_PATH to the path of the rocprofiler source)
|
||||
endif
|
||||
|
||||
ROCPROFILER_PATH ?= $(ROCM_PATH)/include/rocprofiler
|
||||
|
||||
CXXFLAGS += -std=c++17 -Wall
|
||||
|
||||
ifdef DEBUG
|
||||
CXXFLAGS += -gdwarf-4 -O0
|
||||
else
|
||||
ifdef DEBUGOPT
|
||||
CXXFLAGS += -gdwarf-4 -Og
|
||||
else
|
||||
CXXFLAGS += -gdwarf-4 -O2
|
||||
endif
|
||||
endif
|
||||
|
||||
###
|
||||
|
||||
srcs := $(wildcard *.cpp)
|
||||
prog := main
|
||||
|
||||
objs := $(srcs:%.cpp=%.o)
|
||||
deps := $(srcs:%.cpp=%.d)
|
||||
|
||||
# Kernel program
|
||||
|
||||
CPPFLAGS += -DHAVE_MEMFD_CREATE
|
||||
|
||||
$(prog): CC = $(HIPCC)
|
||||
$(prog): CPPFLAGS += -I$(ROCM_PATH)/include -I$(ROCPROFILER_PATH)
|
||||
$(prog): LDFLAGS := -L$(ROCM_PATH)/lib -L$(ROCPROFILER_PATH)/build
|
||||
$(prog): LDLIBS += -ldl -lpthread -lhsa-runtime64 -lrocprofiler64 -lrocm-dbgapi -ldw -lelf
|
||||
$(objs): CXX = $(HIPCC)
|
||||
|
||||
# Targets
|
||||
|
||||
all: $(prog)
|
||||
|
||||
$(prog): $(objs)
|
||||
|
||||
-include $(deps)
|
||||
|
||||
OUTPUT_OPTION = -MMD -MP -o $@
|
||||
|
||||
%.so: %.o
|
||||
$(LINK.o) $(OUTPUT_OPTION) $^ $(LDLIBS)
|
||||
|
||||
#COMPILE.hip = $(COMPILE.cpp)
|
||||
#LINK.hip = $(LINK.cpp)
|
||||
|
||||
#%.o: %.hip
|
||||
# $(COMPILE.hip) $(OUTPUT_OPTION) $<
|
||||
|
||||
clean:
|
||||
$(RM) $(prog) $(objs) $(deps)
|
||||
|
||||
distclean: | clean
|
||||
$(RM) compile_commands.json
|
||||
|
||||
.PHONY: all clean distclean
|
||||
@@ -0,0 +1,149 @@
|
||||
# ROCProfiler PC sampling example code
|
||||
|
||||
The ROCProfiler library includes an API to enable periodic sampling of the GPU
|
||||
program counter during kernel execution. This program demonstrates the PC
|
||||
sampling API, with additional code to illustrate a typical non-trivial use case:
|
||||
correlation of sampled PC addresses with their disassembled machine code, as
|
||||
well as source code and symbolic debugging information if available.
|
||||
|
||||
## Building the demo program
|
||||
|
||||
If your ROCm installation already includes ROCProfiler, the only requirements to
|
||||
build the demo program are:
|
||||
|
||||
* GNU `make`
|
||||
* libdw (**not** libdwarf)
|
||||
* libelf
|
||||
|
||||
If ROCm is installed in the standard location (`/opt/rocm`), running `make` in
|
||||
the same directory as this README should work; otherwise, set `ROCM_PATH` to the
|
||||
location of the ROCm installation in your environment and `ROCPROFILER_PATH` to
|
||||
the location of the ROCProfiler source repo before running `make`.
|
||||
|
||||
If your ROCm installation does **not** include ROCProfiler, you will need to build
|
||||
it yourself. This demo program will be built as part of that process. See the
|
||||
main ROCProfiler README for additional requirements and directions.
|
||||
|
||||
## Running the demo program
|
||||
|
||||
The demo program simply fills a vector with random 64-bit unsigned integers and
|
||||
tallies the count of those greater than the mandatory `MIN` argument:
|
||||
|
||||
```
|
||||
usage: code_printing_sample [OPTION]... MIN [SEED]
|
||||
-d DEV HIP device number
|
||||
-n LEN Length of random integer array
|
||||
-D Print kernel disassembly
|
||||
-P Print source and disassembly of sampled PC locations
|
||||
where
|
||||
DEV : i32
|
||||
MIN : u64
|
||||
LEN : u64
|
||||
SEED : u64
|
||||
```
|
||||
|
||||
### Defaults and troubleshooting
|
||||
|
||||
* `-d`: use HIP device 0
|
||||
* `-n`: 4194304 (1024 * 1024 * 4)
|
||||
* `-D`: false
|
||||
* `-P`: false
|
||||
* `SEED`: random seed; taken from the system's monotonic clock
|
||||
|
||||
The program contains two trivial GPU kernels: an implementation of `memset`, and
|
||||
the parallel counting procedure. Because the actual point is to demonstrate the
|
||||
PC sampling functionality, it is recommended to use the `-n` option with an
|
||||
argument such that the allocated vector fits in the smaller of available host as
|
||||
well as device memory, but sufficiently large argument such that the kernels run
|
||||
long enough for ROCProfiler to actually collect some samples.
|
||||
|
||||
In order for the `-P` option to display source, the demo program must have been
|
||||
built with debug symbols (at least `-gdwarf-4`). Any optimization level is
|
||||
fine, but if the kernels run too quickly for ROCProfiler to collect any samples
|
||||
even when a very large vector is given with the `-n` option, try rebuilding the
|
||||
demo program without optimizations by adding `-O0` to the `hipcc` compilation
|
||||
flags.
|
||||
|
||||
## Files
|
||||
|
||||
* `main.cpp`: initializes ROCProfiler and PC sampling and runs the GPU kernels
|
||||
* `code_printing.cpp`: inspects the ELF and DWARF info for the GPU programs
|
||||
embedded in the host binary and uses amd-dbgapi to print disassembly and
|
||||
source
|
||||
* `disassembly.cpp`: wrapper for `code_printing.cpp`
|
||||
|
||||
## PC sampling API
|
||||
|
||||
Adding PC sampling to a program already using the ROCProfiler API requires only
|
||||
two changes:
|
||||
|
||||
1. Call `rocprofiler_create_filter` to create a `ROCPROFILER_PC_SAMPLING_COLLECTION`
|
||||
filter, then `rocprofiler_set_filter_buffer` to add the filter to the desired
|
||||
buffer (see functions `main` and `run_kernel` in `main.cpp`)
|
||||
|
||||
2. Handle records of kind `ROCPROFILER_PC_SAMPLING_RECORD` in the buffer callback
|
||||
function. These should be cast to `rocprofiler_record_pc_sample_t *` (see
|
||||
function `callback_flush_fn` in `main.cpp`)
|
||||
|
||||
Like all ROCProfiler records, PC sample records contain a standard header followed
|
||||
by one or more payloads:
|
||||
|
||||
```c
|
||||
/**
|
||||
* PC sample record: contains the program counter/instruction pointer observed
|
||||
* during periodic sampling of a kernel
|
||||
*/
|
||||
typedef struct {
|
||||
/**
|
||||
* ROCMtool General Record base header to identify the id and kind of every
|
||||
* record
|
||||
*/
|
||||
rocprofiler_record_header_t header;
|
||||
/**
|
||||
* PC sample data
|
||||
*/
|
||||
rocprofiler_pc_sample_t pc_sample;
|
||||
} rocprofiler_record_pc_sample_t;
|
||||
```
|
||||
|
||||
PC samples are delivered via the normal ROCProfiler buffer callback mechanism,
|
||||
along with some additional information allowing each sample to be associated
|
||||
with a unique, individual kernel execution:
|
||||
|
||||
```c
|
||||
/**
|
||||
* An individual PC sample
|
||||
*/
|
||||
typedef struct {
|
||||
/**
|
||||
* Kernel dispatch ID. This is used by PC sampling to associate samples with
|
||||
* individual dispatches and is unrelated to any user-supplied correlation ID
|
||||
*/
|
||||
rocprofiler_kernel_dispatch_id_t dispatch_id;
|
||||
union {
|
||||
/**
|
||||
* Host timestamp
|
||||
*/
|
||||
rocprofiler_timestamp_t timestamp;
|
||||
/**
|
||||
* GPU clock counter (not currently used)
|
||||
*/
|
||||
uint64_t cycle;
|
||||
};
|
||||
/**
|
||||
* Sampled program counter
|
||||
*/
|
||||
uint64_t pc;
|
||||
/**
|
||||
* Sampled shader element
|
||||
*/
|
||||
uint32_t se;
|
||||
/**
|
||||
* Sampled GPU agent
|
||||
*/
|
||||
rocprofiler_agent_id_t gpu_id;
|
||||
} rocprofiler_pc_sample_t;
|
||||
```
|
||||
|
||||
PC sampling is started and stopped with `rocprofiler_start_session` and
|
||||
`rocprofiler_terminate_session`, just like other profiling activities.
|
||||
@@ -41,7 +41,7 @@
|
||||
#include <hsa/amd_hsa_kernel_code.h>
|
||||
#include <hsa/hsa_ven_amd_loader.h>
|
||||
|
||||
#include "inc/rocprofiler.h"
|
||||
#include "rocprofiler.h"
|
||||
|
||||
#include "code_printing.hpp"
|
||||
#include "program.hpp"
|
||||
|
||||
@@ -9,12 +9,15 @@ int main(int argc, char** argv) {
|
||||
int poll_duration = 5;
|
||||
if (argc > 1) poll_duration = atoi(argv[1]);
|
||||
|
||||
hipDeviceProp_t devProp;
|
||||
HIP_CALL(hipGetDeviceProperties(&devProp, 0));
|
||||
|
||||
CHECK_ROCPROFILER(rocprofiler_initialize());
|
||||
printf("initialize\n");
|
||||
|
||||
rocprofiler_session_id_t dp_session_id;
|
||||
std::vector<const char*> counters;
|
||||
counters.emplace_back("FETCH_SIZE");
|
||||
counters.emplace_back("GRBM_COUNT");
|
||||
|
||||
printf("session create\n");
|
||||
|
||||
|
||||
@@ -0,0 +1,68 @@
|
||||
#include "../common/common.h"
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
int* gpuMem;
|
||||
prepare();
|
||||
// Initialize the tools
|
||||
CHECK_ROCPROFILER(rocprofiler_initialize());
|
||||
|
||||
// Creating the session with given replay mode
|
||||
rocprofiler_session_id_t session_id;
|
||||
CHECK_ROCPROFILER(rocprofiler_create_session(ROCPROFILER_NONE_REPLAY_MODE, &session_id));
|
||||
|
||||
// Creating Output Buffer for the data
|
||||
rocprofiler_buffer_id_t buffer_id;
|
||||
CHECK_ROCPROFILER(rocprofiler_create_buffer(
|
||||
session_id,
|
||||
[](const rocprofiler_record_header_t* record, const rocprofiler_record_header_t* end_record,
|
||||
rocprofiler_session_id_t session_id, rocprofiler_buffer_id_t buffer_id) {
|
||||
WriteBufferRecords(record, end_record, session_id, buffer_id);
|
||||
},
|
||||
0x9999, &buffer_id));
|
||||
|
||||
// Counter Collection Filter
|
||||
std::vector<const char*> counters;
|
||||
counters.emplace_back("GRBM_COUNT");
|
||||
rocprofiler_filter_id_t filter_id;
|
||||
[[maybe_unused]] rocprofiler_filter_property_t property = {};
|
||||
CHECK_ROCPROFILER(rocprofiler_create_filter(session_id, ROCPROFILER_COUNTERS_COLLECTION,
|
||||
rocprofiler_filter_data_t{.counters_names = &counters[0]},
|
||||
counters.size(), &filter_id, property));
|
||||
CHECK_ROCPROFILER(rocprofiler_set_filter_buffer(session_id, filter_id, buffer_id));
|
||||
|
||||
// Normal HIP Calls
|
||||
hipDeviceProp_t devProp;
|
||||
HIP_CALL(hipGetDeviceProperties(&devProp, 0));
|
||||
HIP_CALL(hipMalloc((void**)&gpuMem, 1 * sizeof(int)));
|
||||
|
||||
// KernelA and KernelB won't be profiled
|
||||
kernelCalls('A');
|
||||
kernelCalls('B');
|
||||
|
||||
// Activating Profiling Session to profile whatever kernel launches occurs up
|
||||
// till the next terminate session
|
||||
CHECK_ROCPROFILER(rocprofiler_start_session(session_id));
|
||||
|
||||
// KernelC, KernelD, KernelE and KernelF to be profiled as part of the session
|
||||
kernelCalls('C');
|
||||
kernelCalls('D');
|
||||
kernelCalls('E');
|
||||
kernelCalls('F');
|
||||
// Normal HIP Calls
|
||||
HIP_CALL(hipFree(gpuMem));
|
||||
|
||||
// Deactivating session
|
||||
CHECK_ROCPROFILER(rocprofiler_terminate_session(session_id));
|
||||
|
||||
// Manual Flush user buffer request
|
||||
CHECK_ROCPROFILER(rocprofiler_flush_data(session_id, buffer_id));
|
||||
|
||||
// Destroy sessions
|
||||
CHECK_ROCPROFILER(rocprofiler_destroy_session(session_id));
|
||||
|
||||
// Destroy all profiling related objects(User buffer, sessions, filters,
|
||||
// etc..)
|
||||
CHECK_ROCPROFILER(rocprofiler_finalize());
|
||||
|
||||
return 0;
|
||||
}
|
||||
新しいイシューから参照
ユーザーをブロックする