6f17da7ade
* Adding Benchmarking Stg1
* config fix
* reset
* add jpeg and decode traces in iteration
* address comments benchmark config files.
* address comments.
* address comments.
* address comments: revert cntrl ctx.
* address comments: revert csv output.
* resolve merge conflits.
* format.
* build fix.
* fix hip runtime api traces.
* loop cb services.
* format.
* bug fix.
* Fix operator>
- public C++ comparison operator
* Update configuration options
- support selected regions (--selected-regions)
- support writing output config json (--output-config)
- update serialization data
* rocprofv3 tool library misc updates
- lambda for starting context
- support for writing config json
* Tool library updates
- Finished support for all benchmarking modes
- Added build spec support to config json
* Fix ROCPROFILER_SOVERSION
- this value should not be multiplied by 10,000
* Minor tweak to rocprofv3
* Benchmarking scripts
* formatting
* Fix duplicate include
* Add reproducible-dispatch-count test app
- used in benchmarking
* registration logging
- report number of registered contexts and active contexts after client initialization
* Serialize environment in rocprofv3 output config
* ROCPROFILER_BUILD_BENCHMARK CMake option
* Update benchmark SQL schema
- hash_id is text
- add md5sum to benchmarked_app
- remove app_id from benchmarked_sdk
- add sdk_id to benchmark_config
- separate hip_trace into hip_runtime_trace and hip_compiler_trace
- use INT instead of INTEGER for MySQL compatibility
- add count column in benchmark_statistics
- allow std_dev to be NULL in benchmark_statistics
* Update rocprofv3-benchmark.py
- use md5 instead of python hash (which includes random seed)
- use args.mysql_database
- compute md5sum of executable
- fix insert_benchmark_config
- marker trace fixes
- memory allocation fixes
- split hip_trace into hip_{runtime,compiler}_trace
- remove app_id from benchmarked_sdk
- support warmup runs
- count field in benchmark_statistics
* Support launcher and environment in YAML
* Update reproducible-dispatch-count.cpp
- support mode which doesn't use hip event timing
* Misc rocprofv3-benchmark.py updates
- fix some MySQL support
- remove some unnecessary logging
* support mysql db.
* Format.
* Updated SQL input files
- moved benchmark_schema.sql to benchmark_table.sql
- added benchmark_views.sql
- uses {{metric}} syntax for variable substitution
* cmake formatting
* update rocprofv3-benchmark.py
- benchmark config labels
- overhead views
* Encode rocprofv3-benchmark PID in rocprofv3 and timem output files
* Minor tweak to benchmark_views.sql
- include count
- reorder fields for readability
* split statements and use IS if values is NONE.
* use backtick instead of double quotes and add IS before NOT NULL.:
* Adding Mandelbrot Benchmark App
* Adding Dockerfile example
* Update dockerfile
* Update dockerfile
* [SDK] rocprofiler_query_external_correlation_id_request_kind_name
* Execution-profile benchmark mode
* Execution profile SQL support
* Rename mandlebrot folder + misc clang-tidy
* [rocprofv3-benchmark] Execution profile support
* Update installation
* add work dir when setting git revision, useful when building outside src.
* Set FULL_VERSION_STRING and ROCPROFILER_SDK_GIT_REVISION
- when benchmark folder is top-level
* Remove unused python packages from requirements.txt
* Use ldd/pyelftools to include linked libs for md5sum
- also add --filter-benchmark and --filter-rocprofv3 options
- support labeling the rocprofv3 options
- use more argparse groups
- more generic application of filters
- support variable substitution in environment, e.g. PATH=/some/path:$PATH
* Environment improvements
- improve reproducibility when env set via input file vs. shell
- support "environment-ignore" to remove environment variables
* Misc formatting
* Misc. fix
* use backticks for defining new columns name
* Support shuffling the order of benchmark modes/rocprofv3 args
* Address review comments
* Update Dockerfile
- rename to Dockerfile
- reduce to one layer
* Support docker build arg BRANCH
---------
Co-authored-by: Ammar ELWazir <aelwazir@amd.com>
Co-authored-by: Kandula, Venkateshwar reddy <Venkateshwarreddy.Kandula@amd.com>
Co-authored-by: Venkateshwar Reddy Kandula <vkandula@amd.com>
Co-authored-by: Madsen, Jonathan <Jonathan.Madsen@amd.com>
Co-authored-by: Jonathan R. Madsen <jonathanrmadsen@gmail.com>
325 lines
9.2 KiB
C++
325 lines
9.2 KiB
C++
/*
|
|
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
#include "utils.hpp"
|
|
|
|
#include <thread>
|
|
#ifdef __linux__
|
|
# include <sys/sysinfo.h>
|
|
#elif defined(_WIN32)
|
|
# include <windows.h>
|
|
#endif
|
|
|
|
// standard global variables that can be set on command line
|
|
size_t N = 4 * 1024 * 1024;
|
|
char memsetval = 0x42;
|
|
int memsetD32val = 0xDEADBEEF;
|
|
short memsetD16val = 0xDEAD;
|
|
char memsetD8val = 0xDE;
|
|
int iterations = 1;
|
|
unsigned blocksPerCU = 6; // to hide latency
|
|
unsigned threadsPerBlock = 256;
|
|
int textureFilterMode = 0; // 0: hipFilterModePoint; 1: hipFilterModeLinear
|
|
int p_gpuDevice = 0;
|
|
unsigned p_verbose = 0;
|
|
int p_tests = -1; /*which tests to run. Interpretation is left to each test. default:all*/
|
|
int debug_test = 0;
|
|
#ifdef _WIN64
|
|
const char* HIP_VISIBLE_DEVICES_STR = "HIP_VISIBLE_DEVICES=";
|
|
const char* CUDA_VISIBLE_DEVICES_STR = "CUDA_VISIBLE_DEVICES=";
|
|
const char* PATH_SEPERATOR_STR = "\\";
|
|
const char* NULL_DEVICE = "NUL:";
|
|
#else
|
|
const char* HIP_VISIBLE_DEVICES_STR = "HIP_VISIBLE_DEVICES";
|
|
const char* CUDA_VISIBLE_DEVICES_STR = "CUDA_VISIBLE_DEVICES";
|
|
const char* PATH_SEPERATOR_STR = "/";
|
|
const char* NULL_DEVICE = "/dev/null";
|
|
#endif
|
|
|
|
#ifdef _WIN64
|
|
// Windows does not have rand_r, use srand and rand instead.
|
|
int
|
|
rand_r(unsigned int* s)
|
|
{
|
|
srand(*s);
|
|
return rand();
|
|
}
|
|
#endif
|
|
|
|
// Get Free Memory from the system
|
|
static size_t
|
|
getMemoryAmount()
|
|
{
|
|
#if __linux__
|
|
struct sysinfo info;
|
|
int _ = sysinfo(&info);
|
|
return info.freeram / (1024 * 1024); // MB
|
|
#elif defined(_WIN32)
|
|
MEMORYSTATUSEX statex;
|
|
statex.dwLength = sizeof(statex);
|
|
GlobalMemoryStatusEx(&statex);
|
|
return (statex.ullAvailPhys / (1024 * 1024)); // MB
|
|
#endif
|
|
}
|
|
|
|
size_t
|
|
getHostThreadCount(const size_t memPerThread, const size_t maxThreads)
|
|
{
|
|
if(memPerThread == 0) return 0;
|
|
auto memAmount = getMemoryAmount();
|
|
const auto processor_count = std::thread::hardware_concurrency();
|
|
if(processor_count == 0 || memAmount == 0) return 0;
|
|
size_t thread_count = 0;
|
|
if((processor_count * memPerThread) < memAmount)
|
|
thread_count = processor_count;
|
|
else
|
|
thread_count = reinterpret_cast<size_t>(memAmount / memPerThread);
|
|
if(maxThreads > 0)
|
|
{
|
|
return (thread_count > maxThreads) ? maxThreads : thread_count;
|
|
}
|
|
return thread_count;
|
|
}
|
|
|
|
// Function to determine if the device is of gfx11 architecture
|
|
bool
|
|
IsGfx11()
|
|
{
|
|
#if defined(__HIP_PLATFORM_NVIDIA__)
|
|
return false;
|
|
#elif defined(__HIP_PLATFORM_AMD__)
|
|
int device = -1;
|
|
hipDeviceProp_t props{};
|
|
HIPCHECK(hipGetDevice(&device));
|
|
HIPCHECK(hipGetDeviceProperties(&props, device));
|
|
|
|
// Get GCN Arch Name and compare to check if it is gfx11
|
|
std::string arch = std::string(props.gcnArchName);
|
|
auto pos = arch.find(":");
|
|
if(pos != std::string::npos) arch = arch.substr(0, pos);
|
|
|
|
if(arch.size() >= 5) arch = arch.substr(0, 5);
|
|
|
|
return (arch == std::string("gfx11")) ? true : false;
|
|
#else
|
|
std::cout << "Have to be either Nvidia or AMD platform, asserting" << std::endl;
|
|
assert(false);
|
|
#endif
|
|
}
|
|
|
|
namespace HipTest
|
|
{
|
|
double
|
|
elapsed_time(long long startTimeUs, long long stopTimeUs)
|
|
{
|
|
return ((double) (stopTimeUs - startTimeUs)) / ((double) (1000));
|
|
}
|
|
|
|
int
|
|
parseSize(const char* str, size_t* output)
|
|
{
|
|
char* next;
|
|
*output = strtoull(str, &next, 0);
|
|
int l = strlen(str);
|
|
if(l)
|
|
{
|
|
char c = str[l - 1]; // last char.
|
|
if((c == 'k') || (c == 'K'))
|
|
{
|
|
*output *= 1024;
|
|
}
|
|
if((c == 'm') || (c == 'M'))
|
|
{
|
|
*output *= (1024 * 1024);
|
|
}
|
|
if((c == 'g') || (c == 'G'))
|
|
{
|
|
*output *= (1024 * 1024 * 1024);
|
|
}
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
int
|
|
parseUInt(const char* str, unsigned int* output)
|
|
{
|
|
char* next;
|
|
*output = strtoul(str, &next, 0);
|
|
return !strlen(next);
|
|
}
|
|
|
|
int
|
|
parseInt(const char* str, int* output)
|
|
{
|
|
char* next;
|
|
*output = strtol(str, &next, 0);
|
|
return !strlen(next);
|
|
}
|
|
|
|
int
|
|
parseStandardArguments(int argc, char* argv[], bool failOnUndefinedArg)
|
|
{
|
|
int extraArgs = 1;
|
|
for(int i = 1; i < argc; i++)
|
|
{
|
|
const char* arg = argv[i];
|
|
|
|
if(!strcmp(arg, " "))
|
|
{
|
|
// skip NULL args.
|
|
}
|
|
else if(!strcmp(arg, "--N") || (!strcmp(arg, "-N")))
|
|
{
|
|
if(++i >= argc || !HipTest::parseSize(argv[i], &N))
|
|
{
|
|
failed("Bad N size argument");
|
|
}
|
|
}
|
|
else if(!strcmp(arg, "--threadsPerBlock"))
|
|
{
|
|
if(++i >= argc || !HipTest::parseUInt(argv[i], &threadsPerBlock))
|
|
{
|
|
failed("Bad threadsPerBlock argument");
|
|
}
|
|
}
|
|
else if(!strcmp(arg, "--blocksPerCU"))
|
|
{
|
|
if(++i >= argc || !HipTest::parseUInt(argv[i], &blocksPerCU))
|
|
{
|
|
failed("Bad blocksPerCU argument");
|
|
}
|
|
}
|
|
else if(!strcmp(arg, "--memsetval"))
|
|
{
|
|
int ex;
|
|
if(++i >= argc || !HipTest::parseInt(argv[i], &ex))
|
|
{
|
|
failed("Bad memsetval argument");
|
|
}
|
|
memsetval = ex;
|
|
}
|
|
else if(!strcmp(arg, "--memsetD32val"))
|
|
{
|
|
int ex;
|
|
if(++i >= argc || !HipTest::parseInt(argv[i], &ex))
|
|
{
|
|
failed("Bad memsetD32val argument");
|
|
}
|
|
memsetD32val = ex;
|
|
}
|
|
else if(!strcmp(arg, "--memsetD16val"))
|
|
{
|
|
int ex;
|
|
if(++i >= argc || !HipTest::parseInt(argv[i], &ex))
|
|
{
|
|
failed("Bad memsetD16val argument");
|
|
}
|
|
memsetD16val = ex;
|
|
}
|
|
else if(!strcmp(arg, "--memsetD8val"))
|
|
{
|
|
int ex;
|
|
if(++i >= argc || !HipTest::parseInt(argv[i], &ex))
|
|
{
|
|
failed("Bad memsetD8val argument");
|
|
}
|
|
memsetD8val = ex;
|
|
}
|
|
else if(!strcmp(arg, "--textureFilterMode"))
|
|
{
|
|
int mode;
|
|
if(++i >= argc || !HipTest::parseInt(argv[i], &mode))
|
|
{
|
|
failed("Bad textureFilterMode argument");
|
|
}
|
|
textureFilterMode = mode;
|
|
}
|
|
else if(!strcmp(arg, "--iterations") || (!strcmp(arg, "-i")))
|
|
{
|
|
if(++i >= argc || !HipTest::parseInt(argv[i], &iterations))
|
|
{
|
|
failed("Bad iterations argument");
|
|
}
|
|
}
|
|
else if(!strcmp(arg, "--gpu") || (!strcmp(arg, "-gpuDevice")) || (!strcmp(arg, "-g")))
|
|
{
|
|
if(++i >= argc || !HipTest::parseInt(argv[i], &p_gpuDevice))
|
|
{
|
|
failed("Bad gpuDevice argument");
|
|
}
|
|
}
|
|
else if(!strcmp(arg, "--verbose") || (!strcmp(arg, "-v")))
|
|
{
|
|
if(++i >= argc || !HipTest::parseUInt(argv[i], &p_verbose))
|
|
{
|
|
failed("Bad verbose argument");
|
|
}
|
|
}
|
|
else if(!strcmp(arg, "--tests") || (!strcmp(arg, "-t")))
|
|
{
|
|
if(++i >= argc || !HipTest::parseInt(argv[i], &p_tests))
|
|
{
|
|
failed("Bad tests argument");
|
|
}
|
|
}
|
|
else if(!strcmp(arg, "--debug") || (!strcmp(arg, "-d")))
|
|
{
|
|
if(++i >= argc || !HipTest::parseInt(argv[i], &debug_test))
|
|
{
|
|
failed("Bad tests argument");
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if(failOnUndefinedArg)
|
|
{
|
|
failed("Bad argument '%s'", arg);
|
|
}
|
|
else
|
|
{
|
|
argv[extraArgs++] = argv[i];
|
|
}
|
|
}
|
|
};
|
|
|
|
return extraArgs;
|
|
}
|
|
|
|
unsigned
|
|
setNumBlocks(unsigned blocksPerCU, unsigned threadsPerBlock, size_t N)
|
|
{
|
|
int device;
|
|
HIPCHECK(hipGetDevice(&device));
|
|
hipDeviceProp_t props;
|
|
HIPCHECK(hipGetDeviceProperties(&props, device));
|
|
|
|
unsigned blocks = props.multiProcessorCount * blocksPerCU;
|
|
if(blocks * threadsPerBlock > N)
|
|
{
|
|
blocks = (N + threadsPerBlock - 1) / threadsPerBlock;
|
|
}
|
|
|
|
return blocks;
|
|
}
|
|
|
|
} // namespace HipTest
|