Merge branch 'memtracker' of https://github.com/AMDComputeLibraries/HIP-privatestaging into memtracker
[ROCm/clr commit: 16b04fc0d3]
This commit is contained in:
@@ -71,6 +71,7 @@ if ($HIP_PLATFORM eq "hcc") {
|
||||
$HIPLDFLAGS .= " -L$HSA_PATH/lib -lhsa-runtime64 -lhc_am";
|
||||
# Add C++ libs for GCC.
|
||||
$HIPLDFLAGS .= " -lstdc++";
|
||||
$HIPLDFLAGS .= " -lm";
|
||||
|
||||
if ($verbose & 0x2) {
|
||||
print ("HSA_PATH=$HSA_PATH\n");
|
||||
@@ -163,7 +164,7 @@ if ($needHipHcc) {
|
||||
if ((not -e $object) or ((stat($source))[9] > (stat($object))[9])) {
|
||||
my $CMD = "$HCC $HCCFLAGS -I$HSA_PATH/include -I$HIP_PATH/include -Wall -c $source -o $object";
|
||||
if ($verbose & 0x10) {
|
||||
$CMD .= " -g" ;
|
||||
$CMD .= " -g -O2" ;
|
||||
} else {
|
||||
$CMD .= " -O3" ;
|
||||
}
|
||||
|
||||
@@ -82,6 +82,7 @@ if ($p_full) {
|
||||
system("$HCC_HOME/bin/hcc-config --cxxflags");
|
||||
print ("HCC-ldflags : ");
|
||||
system("$HCC_HOME/bin/hcc-config --ldflags");
|
||||
printf("\n");
|
||||
}
|
||||
if ($HIP_PLATFORM eq "nvcc") {
|
||||
print "\n" ;
|
||||
@@ -98,6 +99,8 @@ if ($p_full) {
|
||||
print "\n" ;
|
||||
print "== Linux Kernel\n";
|
||||
system ("uname -a");
|
||||
|
||||
print "\n" ;
|
||||
$printed = 1;
|
||||
}
|
||||
|
||||
|
||||
@@ -277,6 +277,8 @@ while (@ARGV) {
|
||||
|
||||
$ft{'mem'} += s/\bcudaMemcpyKind\b/hipMemcpyKind/g;
|
||||
|
||||
$ft{'mem'} += s/\bcudaPointerAttributes\b/hipPointerAttribute_t/g;
|
||||
|
||||
|
||||
#--------
|
||||
# Memory management:
|
||||
|
||||
@@ -108,6 +108,12 @@ THE SOFTWARE.
|
||||
#define __HCC_C__
|
||||
#endif
|
||||
|
||||
|
||||
// TODO - hipify-clang - change to use the function call.
|
||||
//#define warpSize hc::__wavesize()
|
||||
const int warpSize = 64;
|
||||
|
||||
|
||||
#define clock_t long long int
|
||||
__device__ inline long long int clock64() { return (long long int)hc::__clock_u64(); };
|
||||
__device__ inline clock_t clock() { return (clock_t)hc::__clock_u64(); };
|
||||
@@ -344,42 +350,42 @@ __device__ inline unsigned long long int __ballot( int input)
|
||||
}
|
||||
|
||||
// warp shuffle functions
|
||||
__device__ inline int __shfl(int input, int lane, int width)
|
||||
__device__ inline int __shfl(int input, int lane, int width=warpSize)
|
||||
{
|
||||
return hc::__shfl(input,lane,width);
|
||||
}
|
||||
|
||||
__device__ inline int __shfl_up(int input, unsigned int lane_delta, int width)
|
||||
__device__ inline int __shfl_up(int input, unsigned int lane_delta, int width=warpSize)
|
||||
{
|
||||
return hc::__shfl_up(input,lane_delta,width);
|
||||
}
|
||||
|
||||
__device__ inline int __shfl_down(int input, unsigned int lane_delta, int width)
|
||||
__device__ inline int __shfl_down(int input, unsigned int lane_delta, int width=warpSize)
|
||||
{
|
||||
return hc::__shfl_down(input,lane_delta,width);
|
||||
}
|
||||
|
||||
__device__ inline int __shfl_xor(int input, int lane_mask, int width)
|
||||
__device__ inline int __shfl_xor(int input, int lane_mask, int width=warpSize)
|
||||
{
|
||||
return hc::__shfl_xor(input,lane_mask,width);
|
||||
}
|
||||
|
||||
__device__ inline float __shfl(float input, int lane, int width)
|
||||
__device__ inline float __shfl(float input, int lane, int width=warpSize)
|
||||
{
|
||||
return hc::__shfl(input,lane,width);
|
||||
}
|
||||
|
||||
__device__ inline float __shfl_up(float input, unsigned int lane_delta, int width)
|
||||
__device__ inline float __shfl_up(float input, unsigned int lane_delta, int width=warpSize)
|
||||
{
|
||||
return hc::__shfl_up(input,lane_delta,width);
|
||||
}
|
||||
|
||||
__device__ inline float __shfl_down(float input, unsigned int lane_delta, int width)
|
||||
__device__ inline float __shfl_down(float input, unsigned int lane_delta, int width=warpSize)
|
||||
{
|
||||
return hc::__shfl_down(input,lane_delta,width);
|
||||
}
|
||||
|
||||
__device__ inline float __shfl_xor(float input, int lane_mask, int width)
|
||||
__device__ inline float __shfl_xor(float input, int lane_mask, int width=warpSize)
|
||||
{
|
||||
return hc::__shfl_xor(input,lane_mask,width);
|
||||
}
|
||||
@@ -438,7 +444,6 @@ __device__ inline float __dsqrt_rz(double x) {return hc::fast_math::sqrt(x); };
|
||||
#define hipGridDim_z (hc_get_num_groups(0))
|
||||
|
||||
|
||||
extern int warpSize ;
|
||||
|
||||
|
||||
#define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)
|
||||
@@ -476,7 +481,8 @@ extern int warpSize ;
|
||||
|
||||
|
||||
#ifdef __HCC_CPP__
|
||||
hc::accelerator_view *ihipLaunchKernel(hipStream_t stream);
|
||||
hipStream_t ihipPreLaunchKernel(hipStream_t stream, hc::accelerator_view **av);
|
||||
void ihipPostLaunchKernel(hipStream_t stream, hc::completion_future &cf);
|
||||
|
||||
#if not defined(DISABLE_GRID_LAUNCH)
|
||||
#define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
|
||||
@@ -491,12 +497,13 @@ do {\
|
||||
lp.groupMemBytes = _groupMemBytes;\
|
||||
hc::completion_future cf;\
|
||||
lp.cf = &cf; \
|
||||
lp.av = (ihipLaunchKernel(_stream)); \
|
||||
hipStream_t trueStream = (ihipPreLaunchKernel(_stream, &lp.av)); \
|
||||
if (HIP_TRACE_API) {\
|
||||
fprintf(stderr, "hiptrace1: launch '%s' gridDim:[%d.%d.%d] groupDim:[%d.%d.%d] groupMem:+%d stream=%p\n", \
|
||||
#_kernelName, lp.gridDim.z, lp.gridDim.y, lp.gridDim.x, lp.groupDim.z, lp.groupDim.y, lp.groupDim.x, lp.groupMemBytes, (void*)(_stream));\
|
||||
}\
|
||||
_kernelName (lp, __VA_ARGS__);\
|
||||
ihipPostLaunchKernel(trueStream, cf);\
|
||||
} while(0)
|
||||
|
||||
#else
|
||||
@@ -514,12 +521,13 @@ do {\
|
||||
lp.groupMemBytes = _groupMemBytes;\
|
||||
hc::completion_future cf;\
|
||||
lp.cf = &cf; \
|
||||
lp.av = (ihipLaunchKernel(_stream)); \
|
||||
hipStream_t trueStream = (ihipPreLaunchKernel(_stream, &lp.av)); \
|
||||
if (HIP_TRACE_API) {\
|
||||
fprintf(stderr, "hiptrace1: launch '%s' gridDim:[%d.%d.%d] groupDim:[%d.%d.%d] groupMem:+%d stream=%p\n", \
|
||||
#_kernelName, lp.gridDim.z, lp.gridDim.y, lp.gridDim.x, lp.groupDim.z, lp.groupDim.y, lp.groupDim.x, lp.groupMemBytes, (void*)(_stream));\
|
||||
}\
|
||||
_kernelName (lp, __VA_ARGS__);\
|
||||
ihipPostLaunchKernel(trueStream, cf);\
|
||||
} while(0)
|
||||
/*end hipLaunchKernel */
|
||||
#endif
|
||||
|
||||
@@ -105,6 +105,8 @@ enum hipMemcpyKind {
|
||||
} ;
|
||||
|
||||
|
||||
|
||||
|
||||
// Doxygen end group GlobalDefs
|
||||
/** @} */
|
||||
|
||||
@@ -113,7 +115,7 @@ enum hipMemcpyKind {
|
||||
|
||||
|
||||
// The handle allows the async commands to use the stream even if the parent hipStream_t goes out-of-scope.
|
||||
typedef struct ihipStream_t * hipStream_t;
|
||||
typedef class ihipStream_t * hipStream_t;
|
||||
|
||||
|
||||
/*
|
||||
@@ -128,6 +130,7 @@ typedef struct hipEvent_t {
|
||||
|
||||
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
@@ -634,6 +637,11 @@ hipError_t hipEventQuery(hipEvent_t event) ;
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* @brief Return attributes for the specified pointer
|
||||
*/
|
||||
hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) ;
|
||||
|
||||
|
||||
/**
|
||||
* @brief Allocate memory on the default accelerator
|
||||
|
||||
@@ -98,6 +98,30 @@ typedef struct hipDeviceProp_t {
|
||||
} hipDeviceProp_t;
|
||||
|
||||
|
||||
/**
|
||||
* Memory type (for pointer attributes)
|
||||
*/
|
||||
enum hipMemoryType {
|
||||
hipMemoryTypeHost, ///< Memory is physically located on host
|
||||
hipMemoryTypeDevice ///< Memory is physically located on device. (see deviceId for specific device)
|
||||
};
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Pointer attributes
|
||||
*/
|
||||
typedef struct hipPointerAttribute_t {
|
||||
enum hipMemoryType memoryType;
|
||||
int device;
|
||||
void *devicePointer;
|
||||
void *hostPointer;
|
||||
int isManaged;
|
||||
unsigned allocationFlags; /* flags specified when memory was allocated*/
|
||||
/* peers? */
|
||||
} hipPointerAttribute_t;
|
||||
|
||||
|
||||
// hack to get these to show up in Doxygen:
|
||||
/**
|
||||
* @defgroup GlobalDefs Global enum and defines
|
||||
@@ -111,6 +135,7 @@ typedef struct hipDeviceProp_t {
|
||||
* @enum
|
||||
* @ingroup Enumerations
|
||||
*/
|
||||
// Developer note - when updating these, update the hipErrorName and hipErrorString functions
|
||||
typedef enum hipError_t {
|
||||
hipSuccess = 0 ///< Successful completion.
|
||||
,hipErrorMemoryAllocation ///< Memory allocation error.
|
||||
@@ -120,6 +145,8 @@ typedef enum hipError_t {
|
||||
,hipErrorInvalidValue ///< One or more of the parameters passed to the API call is NULL or not in an acceptable range.
|
||||
,hipErrorInvalidResourceHandle ///< Resource handle (hipEvent_t or hipStream_t) invalid.
|
||||
,hipErrorInvalidDevice ///< DeviceID must be in range 0...#compute-devices.
|
||||
,hipErrorInvalidMemcpyDirection ///< Invalid memory copy direction
|
||||
|
||||
,hipErrorNoDevice ///< Call to hipGetDeviceCount returned 0 devices
|
||||
,hipErrorNotReady ///< Indicates that asynchronous operations enqueued earlier are not ready. This is not actually an error, but is used to distinguish from hipSuccess (which indicates completion). APIs that return this error include hipEventQuery and hipStreamQuery.
|
||||
,hipErrorUnknown ///< Unknown error.
|
||||
|
||||
@@ -0,0 +1,27 @@
|
||||
|
||||
Copyright (c) 2011, UT-Battelle, LLC
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of Oak Ridge National Laboratory, nor UT-Battelle, LLC, nor
|
||||
the names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
@@ -0,0 +1,17 @@
|
||||
HIP_PATH?=$(shell hipconfig -p)
|
||||
HIPCC=$(HIP_PATH)/bin/hipcc
|
||||
|
||||
EXE=hipBusBandwidth
|
||||
CXXFLAGS = -O3 -g
|
||||
|
||||
all: install
|
||||
|
||||
$(EXE): hipBusBandwidth.cpp ResultDatabase.cpp
|
||||
$(HIPCC) $(CXXFLAGS) $^ -o $@
|
||||
|
||||
install: $(EXE)
|
||||
cp $(EXE) $(HIP_PATH)/bin
|
||||
|
||||
|
||||
clean:
|
||||
rm -f *.o $(EXE)
|
||||
@@ -0,0 +1,523 @@
|
||||
#include "ResultDatabase.h"
|
||||
|
||||
#include <cfloat>
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <iomanip>
|
||||
|
||||
using namespace std;
|
||||
|
||||
bool ResultDatabase::Result::operator<(const Result &rhs) const
|
||||
{
|
||||
if (test < rhs.test)
|
||||
return true;
|
||||
if (test > rhs.test)
|
||||
return false;
|
||||
if (atts < rhs.atts)
|
||||
return true;
|
||||
if (atts > rhs.atts)
|
||||
return false;
|
||||
return false; // less-operator returns false on equal
|
||||
}
|
||||
|
||||
double ResultDatabase::Result::GetMin() const
|
||||
{
|
||||
double r = FLT_MAX;
|
||||
for (int i=0; i<value.size(); i++)
|
||||
{
|
||||
r = min(r, value[i]);
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
double ResultDatabase::Result::GetMax() const
|
||||
{
|
||||
double r = -FLT_MAX;
|
||||
for (int i=0; i<value.size(); i++)
|
||||
{
|
||||
r = max(r, value[i]);
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
double ResultDatabase::Result::GetMedian() const
|
||||
{
|
||||
return GetPercentile(50);
|
||||
}
|
||||
|
||||
double ResultDatabase::Result::GetPercentile(double q) const
|
||||
{
|
||||
int n = value.size();
|
||||
if (n == 0)
|
||||
return FLT_MAX;
|
||||
if (n == 1)
|
||||
return value[0];
|
||||
|
||||
if (q <= 0)
|
||||
return value[0];
|
||||
if (q >= 100)
|
||||
return value[n-1];
|
||||
|
||||
double index = ((n + 1.) * q / 100.) - 1;
|
||||
|
||||
vector<double> sorted = value;
|
||||
sort(sorted.begin(), sorted.end());
|
||||
|
||||
if (n == 2)
|
||||
return (sorted[0] * (1 - q/100.) + sorted[1] * (q/100.));
|
||||
|
||||
int index_lo = int(index);
|
||||
double frac = index - index_lo;
|
||||
if (frac == 0)
|
||||
return sorted[index_lo];
|
||||
|
||||
double lo = sorted[index_lo];
|
||||
double hi = sorted[index_lo + 1];
|
||||
return lo + (hi-lo)*frac;
|
||||
}
|
||||
|
||||
double ResultDatabase::Result::GetMean() const
|
||||
{
|
||||
double r = 0;
|
||||
for (int i=0; i<value.size(); i++)
|
||||
{
|
||||
r += value[i];
|
||||
}
|
||||
return r / double(value.size());
|
||||
}
|
||||
|
||||
double ResultDatabase::Result::GetStdDev() const
|
||||
{
|
||||
double r = 0;
|
||||
double u = GetMean();
|
||||
if (u == FLT_MAX)
|
||||
return FLT_MAX;
|
||||
for (int i=0; i<value.size(); i++)
|
||||
{
|
||||
r += (value[i] - u) * (value[i] - u);
|
||||
}
|
||||
r = sqrt(r / value.size());
|
||||
return r;
|
||||
}
|
||||
|
||||
|
||||
void ResultDatabase::AddResults(const string &test,
|
||||
const string &atts,
|
||||
const string &unit,
|
||||
const vector<double> &values)
|
||||
{
|
||||
for (int i=0; i<values.size(); i++)
|
||||
{
|
||||
AddResult(test, atts, unit, values[i]);
|
||||
}
|
||||
}
|
||||
|
||||
static string RemoveAllButLeadingSpaces(const string &a)
|
||||
{
|
||||
string b;
|
||||
int n = a.length();
|
||||
int i = 0;
|
||||
while (i<n && a[i] == ' ')
|
||||
{
|
||||
b += a[i];
|
||||
++i;
|
||||
}
|
||||
for (; i<n; i++)
|
||||
{
|
||||
if (a[i] != ' ' && a[i] != '\t')
|
||||
b += a[i];
|
||||
}
|
||||
return b;
|
||||
}
|
||||
|
||||
void ResultDatabase::AddResult(const string &test_orig,
|
||||
const string &atts_orig,
|
||||
const string &unit_orig,
|
||||
double value)
|
||||
{
|
||||
string test = RemoveAllButLeadingSpaces(test_orig);
|
||||
string atts = RemoveAllButLeadingSpaces(atts_orig);
|
||||
string unit = RemoveAllButLeadingSpaces(unit_orig);
|
||||
int index;
|
||||
for (index = 0; index < results.size(); index++)
|
||||
{
|
||||
if (results[index].test == test &&
|
||||
results[index].atts == atts)
|
||||
{
|
||||
if (results[index].unit != unit)
|
||||
throw "Internal error: mixed units";
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (index >= results.size())
|
||||
{
|
||||
Result r;
|
||||
r.test = test;
|
||||
r.atts = atts;
|
||||
r.unit = unit;
|
||||
results.push_back(r);
|
||||
}
|
||||
|
||||
results[index].value.push_back(value);
|
||||
}
|
||||
|
||||
// ****************************************************************************
|
||||
// Method: ResultDatabase::DumpDetailed
|
||||
//
|
||||
// Purpose:
|
||||
// Writes the full results, including all trials.
|
||||
//
|
||||
// Arguments:
|
||||
// out where to print
|
||||
//
|
||||
// Programmer: Jeremy Meredith
|
||||
// Creation: August 14, 2009
|
||||
//
|
||||
// Modifications:
|
||||
// Jeremy Meredith, Wed Nov 10 14:25:17 EST 2010
|
||||
// Renamed to DumpDetailed to make room for a DumpSummary.
|
||||
//
|
||||
// Jeremy Meredith, Thu Nov 11 11:39:57 EST 2010
|
||||
// Added note about (*) missing value tag.
|
||||
//
|
||||
// Jeremy Meredith, Tue Nov 23 13:57:02 EST 2010
|
||||
// Changed note about missing values to be worded a little better.
|
||||
//
|
||||
// ****************************************************************************
|
||||
void ResultDatabase::DumpDetailed(ostream &out)
|
||||
{
|
||||
vector<Result> sorted(results);
|
||||
|
||||
sort(sorted.begin(), sorted.end());
|
||||
|
||||
int maxtrials = 1;
|
||||
for (int i=0; i<sorted.size(); i++)
|
||||
{
|
||||
if (sorted[i].value.size() > maxtrials)
|
||||
maxtrials = sorted[i].value.size();
|
||||
}
|
||||
|
||||
// TODO: in big parallel runs, the "trials" are the procs
|
||||
// and we really don't want to print them all out....
|
||||
out << "test\t"
|
||||
<< "atts\t"
|
||||
<< "units\t"
|
||||
<< "median\t"
|
||||
<< "mean\t"
|
||||
<< "stddev\t"
|
||||
<< "min\t"
|
||||
<< "max\t";
|
||||
for (int i=0; i<maxtrials; i++)
|
||||
out << "trial"<<i<<"\t";
|
||||
out << endl;
|
||||
|
||||
for (int i=0; i<sorted.size(); i++)
|
||||
{
|
||||
Result &r = sorted[i];
|
||||
out << r.test << "\t";
|
||||
out << r.atts << "\t";
|
||||
out << r.unit << "\t";
|
||||
if (r.GetMedian() == FLT_MAX)
|
||||
out << "N/A\t";
|
||||
else
|
||||
out << r.GetMedian() << "\t";
|
||||
if (r.GetMean() == FLT_MAX)
|
||||
out << "N/A\t";
|
||||
else
|
||||
out << r.GetMean() << "\t";
|
||||
if (r.GetStdDev() == FLT_MAX)
|
||||
out << "N/A\t";
|
||||
else
|
||||
out << r.GetStdDev() << "\t";
|
||||
if (r.GetMin() == FLT_MAX)
|
||||
out << "N/A\t";
|
||||
else
|
||||
out << r.GetMin() << "\t";
|
||||
if (r.GetMax() == FLT_MAX)
|
||||
out << "N/A\t";
|
||||
else
|
||||
out << r.GetMax() << "\t";
|
||||
for (int j=0; j<r.value.size(); j++)
|
||||
{
|
||||
if (r.value[j] == FLT_MAX)
|
||||
out << "N/A\t";
|
||||
else
|
||||
out << r.value[j] << "\t";
|
||||
}
|
||||
|
||||
out << endl;
|
||||
}
|
||||
out << endl
|
||||
<< "Note: Any results marked with (*) had missing values." << endl
|
||||
<< " This can occur on systems with a mixture of" << endl
|
||||
<< " device types or architectural capabilities." << endl;
|
||||
}
|
||||
|
||||
|
||||
// ****************************************************************************
|
||||
// Method: ResultDatabase::DumpDetailed
|
||||
//
|
||||
// Purpose:
|
||||
// Writes the summary results (min/max/stddev/med/mean), but not
|
||||
// every individual trial.
|
||||
//
|
||||
// Arguments:
|
||||
// out where to print
|
||||
//
|
||||
// Programmer: Jeremy Meredith
|
||||
// Creation: November 10, 2010
|
||||
//
|
||||
// Modifications:
|
||||
// Jeremy Meredith, Thu Nov 11 11:39:57 EST 2010
|
||||
// Added note about (*) missing value tag.
|
||||
//
|
||||
// ****************************************************************************
|
||||
void ResultDatabase::DumpSummary(ostream &out)
|
||||
{
|
||||
vector<Result> sorted(results);
|
||||
|
||||
int testW = 15 ;
|
||||
const int fieldW = 9;
|
||||
|
||||
sort(sorted.begin(), sorted.end());
|
||||
|
||||
out << std::fixed << right << std::setprecision(4);
|
||||
|
||||
// TODO: in big parallel runs, the "trials" are the procs
|
||||
// and we really don't want to print them all out....
|
||||
out << setw(testW) << "test\t" << setw(fieldW)
|
||||
<< "atts\t"
|
||||
<< "units\t"
|
||||
<< "median\t"
|
||||
<< "mean\t"
|
||||
<< "stddev\t"
|
||||
<< "min\t"
|
||||
<< "max\t";
|
||||
out << endl;
|
||||
|
||||
for (int i=0; i<sorted.size(); i++)
|
||||
{
|
||||
Result &r = sorted[i];
|
||||
out << setw(testW) << r.test << setw(fieldW) << "\t";
|
||||
out << r.atts << "\t";
|
||||
out << r.unit << "\t";
|
||||
if (r.GetMedian() == FLT_MAX)
|
||||
out << "N/A\t";
|
||||
else
|
||||
out << r.GetMedian() << "\t";
|
||||
if (r.GetMean() == FLT_MAX)
|
||||
out << "N/A\t";
|
||||
else
|
||||
out << r.GetMean() << "\t";
|
||||
if (r.GetStdDev() == FLT_MAX)
|
||||
out << "N/A\t";
|
||||
else
|
||||
out << r.GetStdDev() << "\t";
|
||||
if (r.GetMin() == FLT_MAX)
|
||||
out << "N/A\t";
|
||||
else
|
||||
out << r.GetMin() << "\t";
|
||||
if (r.GetMax() == FLT_MAX)
|
||||
out << "N/A\t";
|
||||
else
|
||||
out << r.GetMax() << "\t";
|
||||
|
||||
out << endl;
|
||||
}
|
||||
out << endl
|
||||
<< "Note: results marked with (*) had missing values such as" << endl
|
||||
<< "might occur with a mixture of architectural capabilities." << endl;
|
||||
}
|
||||
|
||||
// ****************************************************************************
|
||||
// Method: ResultDatabase::ClearAllResults
|
||||
//
|
||||
// Purpose:
|
||||
// Clears all existing results from the ResultDatabase; used for multiple passes
|
||||
// of the same test or multiple tests.
|
||||
//
|
||||
// Arguments:
|
||||
//
|
||||
// Programmer: Jeffrey Young
|
||||
// Creation: September 10th, 2014
|
||||
//
|
||||
// Modifications:
|
||||
//
|
||||
//
|
||||
// ****************************************************************************
|
||||
void ResultDatabase::ClearAllResults()
|
||||
{
|
||||
results.clear();
|
||||
}
|
||||
|
||||
// ****************************************************************************
|
||||
// Method: ResultDatabase::DumpCsv
|
||||
//
|
||||
// Purpose:
|
||||
// Writes either detailed or summary results (min/max/stddev/med/mean), but not
|
||||
// every individual trial.
|
||||
//
|
||||
// Arguments:
|
||||
// out file to print CSV results
|
||||
//
|
||||
// Programmer: Jeffrey Young
|
||||
// Creation: August 28th, 2014
|
||||
//
|
||||
// Modifications:
|
||||
//
|
||||
// ****************************************************************************
|
||||
void ResultDatabase::DumpCsv(string fileName)
|
||||
{
|
||||
bool emptyFile;
|
||||
vector<Result> sorted(results);
|
||||
|
||||
sort(sorted.begin(), sorted.end());
|
||||
|
||||
//Check to see if the file is empty - if so, add the headers
|
||||
emptyFile = this->IsFileEmpty(fileName);
|
||||
|
||||
//Open file and append by default
|
||||
ofstream out;
|
||||
out.open(fileName.c_str(), std::ofstream::out | std::ofstream::app);
|
||||
|
||||
//Add headers only for empty files
|
||||
if(emptyFile)
|
||||
{
|
||||
// TODO: in big parallel runs, the "trials" are the procs
|
||||
// and we really don't want to print them all out....
|
||||
out << "test, "
|
||||
<< "atts, "
|
||||
<< "units, "
|
||||
<< "median, "
|
||||
<< "mean, "
|
||||
<< "stddev, "
|
||||
<< "min, "
|
||||
<< "max, ";
|
||||
out << endl;
|
||||
}
|
||||
|
||||
for (int i=0; i<sorted.size(); i++)
|
||||
{
|
||||
Result &r = sorted[i];
|
||||
out << r.test << ", ";
|
||||
out << r.atts << ", ";
|
||||
out << r.unit << ", ";
|
||||
if (r.GetMedian() == FLT_MAX)
|
||||
out << "N/A, ";
|
||||
else
|
||||
out << r.GetMedian() << ", ";
|
||||
if (r.GetMean() == FLT_MAX)
|
||||
out << "N/A, ";
|
||||
else
|
||||
out << r.GetMean() << ", ";
|
||||
if (r.GetStdDev() == FLT_MAX)
|
||||
out << "N/A, ";
|
||||
else
|
||||
out << r.GetStdDev() << ", ";
|
||||
if (r.GetMin() == FLT_MAX)
|
||||
out << "N/A, ";
|
||||
else
|
||||
out << r.GetMin() << ", ";
|
||||
if (r.GetMax() == FLT_MAX)
|
||||
out << "N/A, ";
|
||||
else
|
||||
out << r.GetMax() << ", ";
|
||||
|
||||
out << endl;
|
||||
}
|
||||
out << endl;
|
||||
|
||||
out.close();
|
||||
}
|
||||
|
||||
// ****************************************************************************
|
||||
// Method: ResultDatabase::IsFileEmpty
|
||||
//
|
||||
// Purpose:
|
||||
// Returns whether a file is empty - used as a helper for CSV printing
|
||||
//
|
||||
// Arguments:
|
||||
// file The input file to check for emptiness
|
||||
//
|
||||
// Programmer: Jeffrey Young
|
||||
// Creation: August 28th, 2014
|
||||
//
|
||||
// Modifications:
|
||||
//
|
||||
// ****************************************************************************
|
||||
|
||||
bool ResultDatabase::IsFileEmpty(string fileName)
|
||||
{
|
||||
bool fileEmpty;
|
||||
|
||||
ifstream file(fileName.c_str());
|
||||
|
||||
//If the file doesn't exist it is by definition empty
|
||||
if(!file.good())
|
||||
{
|
||||
return true;
|
||||
}
|
||||
else
|
||||
{
|
||||
fileEmpty = (bool)(file.peek() == ifstream::traits_type::eof());
|
||||
file.close();
|
||||
|
||||
return fileEmpty;
|
||||
}
|
||||
|
||||
//Otherwise, return false
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// ****************************************************************************
|
||||
// Method: ResultDatabase::GetResultsForTest
|
||||
//
|
||||
// Purpose:
|
||||
// Returns a vector of results for just one test name.
|
||||
//
|
||||
// Arguments:
|
||||
// test the name of the test results to search for
|
||||
//
|
||||
// Programmer: Jeremy Meredith
|
||||
// Creation: December 3, 2010
|
||||
//
|
||||
// Modifications:
|
||||
//
|
||||
// ****************************************************************************
|
||||
vector<ResultDatabase::Result>
|
||||
ResultDatabase::GetResultsForTest(const string &test)
|
||||
{
|
||||
// get only the given test results
|
||||
vector<Result> retval;
|
||||
for (int i=0; i<results.size(); i++)
|
||||
{
|
||||
Result &r = results[i];
|
||||
if (r.test == test)
|
||||
retval.push_back(r);
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
|
||||
// ****************************************************************************
|
||||
// Method: ResultDatabase::GetResults
|
||||
//
|
||||
// Purpose:
|
||||
// Returns all the results.
|
||||
//
|
||||
// Arguments:
|
||||
//
|
||||
// Programmer: Jeremy Meredith
|
||||
// Creation: December 3, 2010
|
||||
//
|
||||
// Modifications:
|
||||
//
|
||||
// ****************************************************************************
|
||||
const vector<ResultDatabase::Result> &
|
||||
ResultDatabase::GetResults() const
|
||||
{
|
||||
return results;
|
||||
}
|
||||
@@ -0,0 +1,100 @@
|
||||
#ifndef RESULT_DATABASE_H
|
||||
#define RESULT_DATABASE_H
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <cfloat>
|
||||
using std::string;
|
||||
using std::vector;
|
||||
using std::ostream;
|
||||
using std::ofstream;
|
||||
using std::ifstream;
|
||||
|
||||
|
||||
// ****************************************************************************
|
||||
// Class: ResultDatabase
|
||||
//
|
||||
// Purpose:
|
||||
// Track numerical results as they are generated.
|
||||
// Print statistics of raw results.
|
||||
//
|
||||
// Programmer: Jeremy Meredith
|
||||
// Creation: June 12, 2009
|
||||
//
|
||||
// Modifications:
|
||||
// Jeremy Meredith, Wed Nov 10 14:20:47 EST 2010
|
||||
// Split timing reports into detailed and summary. E.g. for serial code,
|
||||
// we might report all trial values, but skip them in parallel.
|
||||
//
|
||||
// Jeremy Meredith, Thu Nov 11 11:40:18 EST 2010
|
||||
// Added check for missing value tag.
|
||||
//
|
||||
// Jeremy Meredith, Mon Nov 22 13:37:10 EST 2010
|
||||
// Added percentile statistic.
|
||||
//
|
||||
// Jeremy Meredith, Fri Dec 3 16:30:31 EST 2010
|
||||
// Added a method to extract a subset of results based on test name. Also,
|
||||
// the Result class is now public, so that clients can use them directly.
|
||||
// Added a GetResults method as well, and made several functions const.
|
||||
//
|
||||
// ****************************************************************************
|
||||
class ResultDatabase
|
||||
{
|
||||
public:
|
||||
//
|
||||
// A performance result for a single SHOC benchmark run.
|
||||
//
|
||||
struct Result
|
||||
{
|
||||
string test; // e.g. "readback"
|
||||
string atts; // e.g. "pagelocked 4k^2"
|
||||
string unit; // e.g. "MB/sec"
|
||||
vector<double> value; // e.g. "837.14"
|
||||
double GetMin() const;
|
||||
double GetMax() const;
|
||||
double GetMedian() const;
|
||||
double GetPercentile(double q) const;
|
||||
double GetMean() const;
|
||||
double GetStdDev() const;
|
||||
|
||||
bool operator<(const Result &rhs) const;
|
||||
|
||||
bool HadAnyFLTMAXValues() const
|
||||
{
|
||||
for (int i=0; i<value.size(); ++i)
|
||||
{
|
||||
if (value[i] >= FLT_MAX)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
protected:
|
||||
vector<Result> results;
|
||||
|
||||
public:
|
||||
void AddResult(const string &test,
|
||||
const string &atts,
|
||||
const string &unit,
|
||||
double value);
|
||||
void AddResults(const string &test,
|
||||
const string &atts,
|
||||
const string &unit,
|
||||
const vector<double> &values);
|
||||
vector<Result> GetResultsForTest(const string &test);
|
||||
const vector<Result> &GetResults() const;
|
||||
void ClearAllResults();
|
||||
void DumpDetailed(ostream&);
|
||||
void DumpSummary(ostream&);
|
||||
void DumpCsv(string fileName);
|
||||
|
||||
private:
|
||||
bool IsFileEmpty(string fileName);
|
||||
|
||||
};
|
||||
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,387 @@
|
||||
#include <stdio.h>
|
||||
#include <iostream>
|
||||
#include <hip_runtime.h>
|
||||
|
||||
#include "ResultDatabase.h"
|
||||
|
||||
// Cmdline parms:
|
||||
bool p_verbose = false;
|
||||
bool p_pinned = true;
|
||||
int p_iterations = 10;
|
||||
int p_device = 0;
|
||||
int p_detailed = 0;
|
||||
|
||||
bool p_h2d = true;
|
||||
bool p_d2h = true;
|
||||
|
||||
|
||||
#define CHECK_HIP_ERROR() \
|
||||
{ \
|
||||
hipError_t err = hipGetLastError(); \
|
||||
if (err != hipSuccess) \
|
||||
{ \
|
||||
printf("error=%d name=%s at " \
|
||||
"ln: %d\n ",err,hipGetErrorString(err),__LINE__); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
}
|
||||
|
||||
|
||||
// ****************************************************************************
|
||||
// Function: runBenchmark
|
||||
//
|
||||
// Purpose:
|
||||
// Measures the bandwidth of the bus connecting the host processor to the
|
||||
// OpenCL device. This benchmark repeatedly transfers data chunks of various
|
||||
// sizes across the bus to the OpenCL device, and calculates the bandwidth.
|
||||
//
|
||||
//
|
||||
// Arguments:
|
||||
//
|
||||
// Returns: nothing
|
||||
//
|
||||
// Programmer: Jeremy Meredith
|
||||
// Creation: September 08, 2009
|
||||
//
|
||||
// Modifications:
|
||||
// Jeremy Meredith, Wed Dec 1 17:05:27 EST 2010
|
||||
// Added calculation of latency estimate.
|
||||
// Ben Sander - moved to standalone test
|
||||
//
|
||||
// ****************************************************************************
|
||||
void RunBenchmark_H2D(ResultDatabase &resultDB)
|
||||
{
|
||||
// Sizes are in kb
|
||||
int sizes[] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384, 32768,65536,131072,262144,524288};
|
||||
int nSizes = sizeof(sizes) / sizeof(int);
|
||||
|
||||
long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
|
||||
|
||||
hipSetDevice(p_device);
|
||||
|
||||
// Create some host memory pattern
|
||||
float *hostMem = NULL;
|
||||
if (p_pinned)
|
||||
{
|
||||
hipMallocHost((void**)&hostMem, sizeof(float) * numMaxFloats);
|
||||
while (hipGetLastError() != hipSuccess)
|
||||
{
|
||||
// drop the size and try again
|
||||
if (p_verbose) std::cout << " - dropping size allocating pinned mem\n";
|
||||
--nSizes;
|
||||
if (nSizes < 1)
|
||||
{
|
||||
std::cerr << "Error: Couldn't allocated any pinned buffer\n";
|
||||
return;
|
||||
}
|
||||
numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
|
||||
hipMallocHost((void**)&hostMem, sizeof(float) * numMaxFloats);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
hostMem = new float[numMaxFloats];
|
||||
}
|
||||
|
||||
for (int i = 0; i < numMaxFloats; i++)
|
||||
{
|
||||
hostMem[i] = i % 77;
|
||||
}
|
||||
|
||||
float *device;
|
||||
hipMalloc((void**)&device, sizeof(float) * numMaxFloats);
|
||||
while (hipGetLastError() != hipSuccess)
|
||||
{
|
||||
// drop the size and try again
|
||||
if (p_verbose) std::cout << " - dropping size allocating device mem\n";
|
||||
--nSizes;
|
||||
if (nSizes < 1)
|
||||
{
|
||||
std::cerr << "Error: Couldn't allocated any device buffer\n";
|
||||
return;
|
||||
}
|
||||
numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
|
||||
hipMalloc((void**)&device, sizeof(float) * numMaxFloats);
|
||||
}
|
||||
|
||||
|
||||
hipEvent_t start, stop;
|
||||
hipEventCreate(&start);
|
||||
hipEventCreate(&stop);
|
||||
CHECK_HIP_ERROR();
|
||||
|
||||
// Three passes, forward and backward both
|
||||
for (int pass = 0; pass < p_iterations; pass++)
|
||||
{
|
||||
// store the times temporarily to estimate latency
|
||||
//float times[nSizes];
|
||||
// Step through sizes forward on even passes and backward on odd
|
||||
for (int i = 0; i < nSizes; i++)
|
||||
{
|
||||
int sizeIndex;
|
||||
if ((pass % 2) == 0)
|
||||
sizeIndex = i;
|
||||
else
|
||||
sizeIndex = (nSizes - 1) - i;
|
||||
|
||||
int nbytes = sizes[sizeIndex] * 1024;
|
||||
|
||||
hipEventRecord(start, 0);
|
||||
hipMemcpy(device, hostMem, nbytes, hipMemcpyHostToDevice);
|
||||
hipEventRecord(stop, 0);
|
||||
hipEventSynchronize(stop);
|
||||
float t = 0;
|
||||
hipEventElapsedTime(&t, start, stop);
|
||||
//times[sizeIndex] = t;
|
||||
|
||||
// Convert to GB/sec
|
||||
if (p_verbose)
|
||||
{
|
||||
std::cerr << "size " << sizes[sizeIndex] << "k took " << t <<
|
||||
" ms\n";
|
||||
}
|
||||
|
||||
double speed = (double(sizes[sizeIndex]) * 1024. / (1000*1000)) / t;
|
||||
char sizeStr[256];
|
||||
sprintf(sizeStr, "% 7dkB", sizes[sizeIndex]);
|
||||
resultDB.AddResult("H2D_Bandwidth", sizeStr, "GB/sec", speed);
|
||||
resultDB.AddResult("H2D_Time", sizeStr, "ms", t);
|
||||
}
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
hipFree((void*)device);
|
||||
CHECK_HIP_ERROR();
|
||||
if (p_pinned)
|
||||
{
|
||||
hipFreeHost((void*)hostMem);
|
||||
CHECK_HIP_ERROR();
|
||||
}
|
||||
else
|
||||
{
|
||||
delete[] hostMem;
|
||||
}
|
||||
hipEventDestroy(start);
|
||||
hipEventDestroy(stop);
|
||||
}
|
||||
|
||||
|
||||
void RunBenchmark_D2H(ResultDatabase &resultDB)
|
||||
{
|
||||
|
||||
// Sizes are in kb
|
||||
int nSizes = 20;
|
||||
int sizes[20] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,
|
||||
32768,65536,131072,262144,524288};
|
||||
long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
|
||||
|
||||
// Create some host memory pattern
|
||||
float *hostMem1;
|
||||
float *hostMem2;
|
||||
if (p_pinned)
|
||||
{
|
||||
hipMallocHost((void**)&hostMem1, sizeof(float)*numMaxFloats);
|
||||
hipError_t err1 = hipGetLastError();
|
||||
hipMallocHost((void**)&hostMem2, sizeof(float)*numMaxFloats);
|
||||
hipError_t err2 = hipGetLastError();
|
||||
while (err1 != hipSuccess || err2 != hipSuccess)
|
||||
{
|
||||
// free the first buffer if only the second failed
|
||||
if (err1 == hipSuccess)
|
||||
hipFreeHost((void*)hostMem1);
|
||||
|
||||
// drop the size and try again
|
||||
if (p_verbose) std::cout << " - dropping size allocating pinned mem\n";
|
||||
--nSizes;
|
||||
if (nSizes < 1)
|
||||
{
|
||||
std::cerr << "Error: Couldn't allocated any pinned buffer\n";
|
||||
return;
|
||||
}
|
||||
numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
|
||||
hipMallocHost((void**)&hostMem1, sizeof(float)*numMaxFloats);
|
||||
err1 = hipGetLastError();
|
||||
hipMallocHost((void**)&hostMem2, sizeof(float)*numMaxFloats);
|
||||
err2 = hipGetLastError();
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
hostMem1 = new float[numMaxFloats];
|
||||
hostMem2 = new float[numMaxFloats];
|
||||
}
|
||||
for (int i=0; i<numMaxFloats; i++)
|
||||
hostMem1[i] = i % 77;
|
||||
|
||||
float *device;
|
||||
hipMalloc((void**)&device, sizeof(float) * numMaxFloats);
|
||||
while (hipGetLastError() != hipSuccess)
|
||||
{
|
||||
// drop the size and try again
|
||||
if (p_verbose) std::cout << " - dropping size allocating device mem\n";
|
||||
--nSizes;
|
||||
if (nSizes < 1)
|
||||
{
|
||||
std::cerr << "Error: Couldn't allocated any device buffer\n";
|
||||
return;
|
||||
}
|
||||
numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
|
||||
hipMalloc((void**)&device, sizeof(float) * numMaxFloats);
|
||||
}
|
||||
|
||||
hipMemcpy(device, hostMem1,
|
||||
numMaxFloats*sizeof(float), hipMemcpyHostToDevice);
|
||||
hipDeviceSynchronize();
|
||||
|
||||
hipEvent_t start, stop;
|
||||
hipEventCreate(&start);
|
||||
hipEventCreate(&stop);
|
||||
CHECK_HIP_ERROR();
|
||||
|
||||
// Three passes, forward and backward both
|
||||
for (int pass = 0; pass < p_iterations; pass++)
|
||||
{
|
||||
// store the times temporarily to estimate latency
|
||||
//float times[nSizes];
|
||||
// Step through sizes forward on even passes and backward on odd
|
||||
for (int i = 0; i < nSizes; i++)
|
||||
{
|
||||
int sizeIndex;
|
||||
if ((pass % 2) == 0)
|
||||
sizeIndex = i;
|
||||
else
|
||||
sizeIndex = (nSizes - 1) - i;
|
||||
|
||||
int nbytes = sizes[sizeIndex] * 1024;
|
||||
|
||||
hipEventRecord(start, 0);
|
||||
hipMemcpy(hostMem2, device,
|
||||
nbytes, hipMemcpyDeviceToHost);
|
||||
hipEventRecord(stop, 0);
|
||||
hipEventSynchronize(stop);
|
||||
float t = 0;
|
||||
hipEventElapsedTime(&t, start, stop);
|
||||
//times[sizeIndex] = t;
|
||||
|
||||
// Convert to GB/sec
|
||||
if (p_verbose)
|
||||
{
|
||||
std::cerr << "size " <<sizes[sizeIndex] << "k took " << t <<
|
||||
" ms\n";
|
||||
}
|
||||
|
||||
double speed = (double(sizes[sizeIndex]) * 1024. / (1000*1000)) / t;
|
||||
char sizeStr[256];
|
||||
sprintf(sizeStr, "% 7dkB", sizes[sizeIndex]);
|
||||
resultDB.AddResult("D2H_Bandwidth", sizeStr, "GB/sec", speed);
|
||||
resultDB.AddResult("D2H_Time", sizeStr, "ms", t);
|
||||
}
|
||||
//resultDB.AddResult("ReadbackLatencyEstimate", "1-2kb", "ms", times[0]-(times[1]-times[0])/1.);
|
||||
//resultDB.AddResult("ReadbackLatencyEstimate", "1-4kb", "ms", times[0]-(times[2]-times[0])/3.);
|
||||
//resultDB.AddResult("ReadbackLatencyEstimate", "2-4kb", "ms", times[1]-(times[2]-times[1])/1.);
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
hipFree((void*)device);
|
||||
CHECK_HIP_ERROR();
|
||||
if (p_pinned)
|
||||
{
|
||||
hipFreeHost((void*)hostMem1);
|
||||
CHECK_HIP_ERROR();
|
||||
hipFreeHost((void*)hostMem2);
|
||||
CHECK_HIP_ERROR();
|
||||
}
|
||||
else
|
||||
{
|
||||
delete[] hostMem1;
|
||||
delete[] hostMem2;
|
||||
hipEventDestroy(start);
|
||||
hipEventDestroy(stop);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#define failed(...) \
|
||||
printf ("error: ");\
|
||||
printf (__VA_ARGS__);\
|
||||
printf ("\n");\
|
||||
exit(EXIT_FAILURE);
|
||||
|
||||
int parseInt(const char *str, int *output)
|
||||
{
|
||||
char *next;
|
||||
*output = strtol(str, &next, 0);
|
||||
return !strlen(next);
|
||||
}
|
||||
|
||||
void help() {
|
||||
};
|
||||
|
||||
int parseStandardArguments(int argc, char *argv[])
|
||||
{
|
||||
for (int i = 1; i < argc; i++) {
|
||||
const char *arg = argv[i];
|
||||
|
||||
if (!strcmp(arg, " ")) {
|
||||
// skip NULL args.
|
||||
} else if (!strcmp(arg, "--iterations") || (!strcmp(arg, "-i"))) {
|
||||
if (++i >= argc || !parseInt(argv[i], &p_iterations)) {
|
||||
failed("Bad iterations argument");
|
||||
}
|
||||
} else if (!strcmp(arg, "--device") || (!strcmp(arg, "-d"))) {
|
||||
if (++i >= argc || !parseInt(argv[i], &p_device)) {
|
||||
failed("Bad device argument");
|
||||
}
|
||||
} else if (!strcmp(arg, "--unpinned")) {
|
||||
p_pinned = 0;
|
||||
} else if (!strcmp(arg, "--h2d")) {
|
||||
p_h2d = true;
|
||||
p_d2h = false;
|
||||
|
||||
} else if (!strcmp(arg, "--d2h")) {
|
||||
p_h2d = false;
|
||||
p_d2h = true;
|
||||
|
||||
} else if (!strcmp(arg, "--help") || (!strcmp(arg, "-h"))) {
|
||||
help();
|
||||
|
||||
} else if (!strcmp(arg, "--verbose")) {
|
||||
p_verbose = 1;
|
||||
} else if (!strcmp(arg, "--detailed")) {
|
||||
p_detailed = 1;
|
||||
} else {
|
||||
failed("Bad argument '%s'", arg);
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
};
|
||||
|
||||
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
parseStandardArguments(argc, argv);
|
||||
|
||||
if (p_h2d) {
|
||||
ResultDatabase resultDB;
|
||||
RunBenchmark_H2D(resultDB);
|
||||
|
||||
resultDB.DumpSummary(std::cout);
|
||||
|
||||
if (p_detailed) {
|
||||
resultDB.DumpDetailed(std::cout);
|
||||
}
|
||||
}
|
||||
|
||||
if (p_d2h) {
|
||||
ResultDatabase resultDB;
|
||||
RunBenchmark_D2H(resultDB);
|
||||
|
||||
resultDB.DumpSummary(std::cout);
|
||||
|
||||
if (p_detailed) {
|
||||
resultDB.DumpDetailed(std::cout);
|
||||
}
|
||||
}
|
||||
}
|
||||
File diff ditekan karena terlalu besar
Load Diff
@@ -19,8 +19,10 @@ MESSAGE ("HIP_PATH=" ${HIP_PATH})
|
||||
|
||||
if (${HIP_PLATFORM} STREQUAL "hcc")
|
||||
MESSAGE ("HIP_PLATFORM=hcc")
|
||||
set (HC_PATH ${HIP_PATH}/hc)
|
||||
set (HSA_PATH /opt/hsa)
|
||||
set (HSA_PATH $ENV{HSA_PATH})
|
||||
if (NOT DEFINED HSA_PATH)
|
||||
set (HSA_PATH /opt/hsa)
|
||||
endif()
|
||||
|
||||
#---
|
||||
# Add HSA library:
|
||||
@@ -30,7 +32,7 @@ if (${HIP_PLATFORM} STREQUAL "hcc")
|
||||
#These includes are used for all files.
|
||||
#Include HIP and HC since the tests need both of these:
|
||||
#Note below HSA path is surgically included only where necessary.
|
||||
include_directories(${HIP_PATH}/include ${HC_PATH}/include)
|
||||
include_directories(${HIP_PATH}/include)
|
||||
|
||||
# hip_hcc.o:
|
||||
add_library(hip_hcc OBJECT ${HIP_PATH}/src/hip_hcc.cpp)
|
||||
@@ -105,6 +107,7 @@ make_hip_executable (hip_brev hip_brev.cpp)
|
||||
make_hip_executable (hip_ffs hip_ffs.cpp)
|
||||
make_hip_executable (hipGetDeviceAttribute hipGetDeviceAttribute.cpp)
|
||||
make_hip_executable (hipMemcpy hipMemcpy.cpp)
|
||||
make_hip_executable (hipMemcpyAsync hipMemcpyAsync.cpp)
|
||||
make_hip_executable (hipMemset hipMemset.cpp)
|
||||
make_hip_executable (hipEventRecord hipEventRecord.cpp)
|
||||
make_hip_executable (hipLanguageExtensions hipLanguageExtensions.cpp)
|
||||
@@ -114,6 +117,7 @@ make_hip_executable (hipSimpleAtomicsTest hipSimpleAtomicsTest.cpp)
|
||||
make_hip_executable (hipMathFunctionsHost hipMathFunctions.cpp hipSinglePrecisionMathHost.cpp hipDoublePrecisionMathHost.cpp)
|
||||
make_hip_executable (hipMathFunctionsDevice hipMathFunctions.cpp hipSinglePrecisionMathDevice.cpp hipDoublePrecisionMathDevice.cpp)
|
||||
make_hip_executable (hipIntrinsics hipMathFunctions.cpp hipSinglePrecisionIntrinsics.cpp hipDoublePrecisionIntrinsics.cpp hipIntegerIntrinsics.cpp)
|
||||
make_hip_executable (hipPointerAttrib hipPointerAttrib.cpp)
|
||||
target_link_libraries(hipMathFunctionsHost m)
|
||||
|
||||
make_test(hip_ballot " " )
|
||||
@@ -128,8 +132,10 @@ make_test(hipMemset --N 10 --memsetval 0x42 ) # small copy, just 10 bytes.
|
||||
make_test(hipMemset --N 10013 --memsetval 0x5a ) # oddball size.
|
||||
make_test(hipMemset --N 256M --memsetval 0xa6 ) # big copy
|
||||
make_test(hipGridLaunch " " )
|
||||
make_test(hipPointerAttrib " " )
|
||||
|
||||
make_test(hipMemcpy " " )
|
||||
make_test(hipMemcpyAsync " " )
|
||||
|
||||
make_test(hipHcc " " )
|
||||
|
||||
|
||||
@@ -23,24 +23,28 @@ THE SOFTWARE.
|
||||
#include "test_common.h"
|
||||
|
||||
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
void printSep()
|
||||
{
|
||||
HipTest::parseStandardArguments(argc, argv, true);
|
||||
printf ("======================================================================================\n");
|
||||
}
|
||||
|
||||
//---
|
||||
// Test simple H2D copies and back.
|
||||
// Designed to stress a small number of simple smoke tests
|
||||
void simpleTest1()
|
||||
{
|
||||
printf ("test: %s\n", __func__);
|
||||
size_t Nbytes = N*sizeof(int);
|
||||
|
||||
printf ("N=%zu \n", N);
|
||||
printf ("N=%zu Nbytes=%6.2fMB\n", N, Nbytes/1024.0/1024.0);
|
||||
|
||||
int *A_d, *B_d, *C_d;
|
||||
int *A_h, *B_h, *C_h;
|
||||
|
||||
HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N);
|
||||
|
||||
HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
||||
|
||||
printf ("A_d=%p B_d=%p C_d=%p A_h=%p B_h=%p C_h=%p\n", A_d, B_d, C_d, A_h, B_d, C_h);
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
|
||||
|
||||
HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
|
||||
HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
|
||||
|
||||
@@ -50,8 +54,212 @@ int main(int argc, char *argv[])
|
||||
|
||||
HIPCHECK (hipDeviceSynchronize());
|
||||
|
||||
|
||||
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
||||
|
||||
HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, false);
|
||||
HIPCHECK (hipDeviceReset());
|
||||
|
||||
printf (" %s success\n", __func__);
|
||||
}
|
||||
|
||||
|
||||
//---
|
||||
// Test many different kinds of memory copies.
|
||||
// THe subroutine allocates memory , copies to device, runs a vector add kernel, copies back, and checks the result.
|
||||
//
|
||||
// IN: numElements controls the number of elements used for allocations.
|
||||
// IN: usePinnedHost : If true, allocate host with hipMallocHost and is pinned ; else allocate host memory with malloc.
|
||||
// IN: useHostToHost : If true, add an extra host-to-host copy.
|
||||
// IN: useDeviceToDevice : If true, add an extra deviceto-device copy after result is produced.
|
||||
// IN: useMemkindDefault : If true, use memkinddefault (runtime figures out direction). if false, use explicit memcpy direction.
|
||||
//
|
||||
template <typename T>
|
||||
void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault)
|
||||
{
|
||||
size_t sizeElements = numElements * sizeof(T);
|
||||
printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n",
|
||||
__func__,
|
||||
typeid(T).name(),
|
||||
sizeElements, sizeElements/1024.0/1024.0,
|
||||
usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault);
|
||||
|
||||
|
||||
T *A_d, *B_d, *C_d;
|
||||
T *A_h, *B_h, *C_h;
|
||||
|
||||
|
||||
HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, numElements, usePinnedHost);
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
|
||||
|
||||
T *A_hh = NULL;
|
||||
T *B_hh = NULL;
|
||||
T *C_dd = NULL;
|
||||
|
||||
|
||||
|
||||
if (useHostToHost) {
|
||||
if (usePinnedHost) {
|
||||
HIPCHECK ( hipMallocHost(&A_hh, sizeElements) );
|
||||
HIPCHECK ( hipMallocHost(&B_hh, sizeElements) );
|
||||
} else {
|
||||
A_hh = (T*)malloc(sizeElements);
|
||||
B_hh = (T*)malloc(sizeElements);
|
||||
}
|
||||
|
||||
|
||||
// Do some extra host-to-host copies here to mix things up:
|
||||
HIPCHECK ( hipMemcpy(A_hh, A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost));
|
||||
HIPCHECK ( hipMemcpy(B_hh, B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost));
|
||||
|
||||
|
||||
HIPCHECK ( hipMemcpy(A_d, A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
|
||||
HIPCHECK ( hipMemcpy(B_d, B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
|
||||
} else {
|
||||
HIPCHECK ( hipMemcpy(A_d, A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
|
||||
HIPCHECK ( hipMemcpy(B_d, B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements);
|
||||
|
||||
if (useDeviceToDevice) {
|
||||
HIPCHECK ( hipMalloc(&C_dd, sizeElements) );
|
||||
|
||||
// Do an extra device-to-device copies here to mix things up:
|
||||
HIPCHECK ( hipMemcpy(C_dd, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice));
|
||||
|
||||
//Destroy the original C_d:
|
||||
HIPCHECK ( hipMemset(C_d, 0x5A, sizeElements));
|
||||
|
||||
HIPCHECK ( hipMemcpy(C_h, C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost));
|
||||
} else {
|
||||
HIPCHECK ( hipMemcpy(C_h, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
HIPCHECK ( hipDeviceSynchronize() );
|
||||
HipTest::checkVectorADD(A_h, B_h, C_h, numElements);
|
||||
|
||||
HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost);
|
||||
|
||||
printf (" %s success\n", __func__);
|
||||
}
|
||||
|
||||
|
||||
//---
|
||||
//Try all the 16 possible combinations to memcpytest2 - usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault
|
||||
template<typename T>
|
||||
void memcpytest2_loop(size_t numElements)
|
||||
{
|
||||
printSep();
|
||||
|
||||
for (int usePinnedHost =0; usePinnedHost<=1; usePinnedHost++) {
|
||||
for (int useHostToHost =0; useHostToHost<=1; useHostToHost++) { // TODO
|
||||
for (int useDeviceToDevice =0; useDeviceToDevice<=1; useDeviceToDevice++) {
|
||||
for (int useMemkindDefault =0; useMemkindDefault<=1; useMemkindDefault++) {
|
||||
memcpytest2<T>(numElements, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//---
|
||||
//Try many different sizes to memory copy.
|
||||
template<typename T>
|
||||
void memcpytest2_sizes(size_t maxElem=0, size_t offset=0)
|
||||
{
|
||||
printSep();
|
||||
printf ("test: %s<%s>\n", __func__, typeid(T).name());
|
||||
|
||||
int deviceId;
|
||||
HIPCHECK(hipGetDevice(&deviceId));
|
||||
|
||||
size_t free, total;
|
||||
HIPCHECK(hipMemGetInfo(&free, &total));
|
||||
|
||||
if (maxElem == 0) {
|
||||
maxElem = free/sizeof(T)/5;
|
||||
}
|
||||
|
||||
printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB offset=%lu\n",
|
||||
deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0, offset);
|
||||
|
||||
for (size_t elem=64; elem+offset<=maxElem; elem*=2) {
|
||||
HIPCHECK ( hipDeviceReset() );
|
||||
memcpytest2<T>(elem+offset, 0, 1, 1, 0); // unpinned host
|
||||
HIPCHECK ( hipDeviceReset() );
|
||||
memcpytest2<T>(elem+offset, 1, 1, 1, 0); // pinned host
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//---
|
||||
//Create multiple threads to stress multi-thread locking behavior in the allocation/deallocation/tracking logic:
|
||||
template<typename T>
|
||||
void multiThread_1(bool serialize, bool usePinnedHost)
|
||||
{
|
||||
printSep();
|
||||
printf ("test: %s<%s> serialize=%d usePinnedHost=%d\n", __func__, typeid(T).name(), serialize, usePinnedHost);
|
||||
std::thread t1 (memcpytest2<T>,N, usePinnedHost,0,0,0);
|
||||
if (serialize) {
|
||||
t1.join();
|
||||
}
|
||||
|
||||
|
||||
std::thread t2 (memcpytest2<T>,N, usePinnedHost,0,0,0);
|
||||
if (serialize) {
|
||||
t2.join();
|
||||
}
|
||||
|
||||
if (!serialize) {
|
||||
t1.join();
|
||||
t2.join();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
HipTest::parseStandardArguments(argc, argv, true);
|
||||
|
||||
printf ("info: set device to %d\n", p_gpuDevice);
|
||||
HIPCHECK(hipSetDevice(p_gpuDevice));
|
||||
|
||||
|
||||
if (p_tests & 0x1) {
|
||||
HIPCHECK ( hipDeviceReset() );
|
||||
simpleTest1();
|
||||
}
|
||||
|
||||
if (p_tests & 0x2) {
|
||||
HIPCHECK ( hipDeviceReset() );
|
||||
memcpytest2_loop<float>(N);
|
||||
memcpytest2_loop<double>(N);
|
||||
memcpytest2_loop<char>(N);
|
||||
memcpytest2_loop<int>(N);
|
||||
}
|
||||
|
||||
if (p_tests & 0x4) {
|
||||
HIPCHECK ( hipDeviceReset() );
|
||||
printSep();
|
||||
memcpytest2_sizes<float>(0,0);
|
||||
printSep();
|
||||
memcpytest2_sizes<float>(0,64);
|
||||
printSep();
|
||||
memcpytest2_sizes<float>(1024*1024, 13);
|
||||
printSep();
|
||||
memcpytest2_sizes<float>(1024*1024, 50);
|
||||
}
|
||||
|
||||
if (p_tests & 0x8) {
|
||||
HIPCHECK ( hipDeviceReset() );
|
||||
printSep();
|
||||
multiThread_1<float>(true, true);
|
||||
multiThread_1<float>(false, true);
|
||||
multiThread_1<float>(false, false); // TODO
|
||||
}
|
||||
|
||||
passed();
|
||||
|
||||
}
|
||||
|
||||
@@ -0,0 +1,349 @@
|
||||
// Test under-development. Calls async mem-copy API, experiment with functionality.
|
||||
|
||||
#include "hip_runtime.h"
|
||||
#include "test_common.h"
|
||||
|
||||
unsigned p_streams = 2;
|
||||
|
||||
|
||||
void simpleNegTest()
|
||||
{
|
||||
printf ("testing: %s\n",__func__);
|
||||
hipError_t e;
|
||||
float *A_malloc, *A_pinned, *A_d;
|
||||
|
||||
size_t Nbytes = N*sizeof(float);
|
||||
A_malloc = (float*)malloc(Nbytes);
|
||||
HIPCHECK(hipMallocHost(&A_pinned, Nbytes));
|
||||
HIPCHECK(hipMalloc(&A_d, Nbytes));
|
||||
|
||||
|
||||
// Can't use default with async copy
|
||||
e = hipMemcpyAsync(A_pinned, A_d, Nbytes, hipMemcpyDefault, NULL);
|
||||
HIPASSERT (e==hipErrorInvalidMemcpyDirection); // TODO
|
||||
HIPASSERT (e!= hipSuccess);
|
||||
|
||||
|
||||
// Not sure what happens here, the memory must be pinned.
|
||||
e = hipMemcpyAsync(A_malloc, A_d, Nbytes, hipMemcpyHostToDevice, NULL);
|
||||
|
||||
printf (" async memcpy of A_malloc to A_d. Result=%d\n", e);
|
||||
//HIPASSERT (e==hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
class Pinned;
|
||||
class Unpinned;
|
||||
|
||||
template <typename T> struct HostTraits;
|
||||
|
||||
template<>
|
||||
struct HostTraits<Pinned>
|
||||
{
|
||||
static const char *Name() { return "Pinned"; } ;
|
||||
|
||||
static void *Alloc(size_t sizeBytes) {
|
||||
void *p;
|
||||
HIPCHECK(hipMallocHost(&p, sizeBytes));
|
||||
return p;
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
template<typename T>
|
||||
__global__ void
|
||||
addK (hipLaunchParm lp, T *A, T K, size_t numElements)
|
||||
{
|
||||
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
|
||||
size_t stride = hipBlockDim_x * hipGridDim_x ;
|
||||
|
||||
for (size_t i=offset; i<numElements; i+=stride) {
|
||||
A[i] = A[i] + K;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//---
|
||||
//Tests propert dependency resolution between H2D and D2H commands in same stream:
|
||||
//IN: numInflight : number of copies inflight at any time:
|
||||
//IN: numPongs = number of iterations to run (iteration)
|
||||
template<typename T, class AllocType>
|
||||
void test_pingpong(hipStream_t stream, size_t numElements, int numInflight, int numPongs, bool doHostSide)
|
||||
{
|
||||
HIPASSERT(numElements % numInflight == 0); // Must be evenly divisible.
|
||||
size_t Nbytes = numElements*sizeof(T);
|
||||
size_t eachCopyElements = numElements / numInflight;
|
||||
size_t eachCopyBytes = eachCopyElements * sizeof(T);
|
||||
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
|
||||
|
||||
printf ("-----------------------------------------------------------------------------------------------\n");
|
||||
printf ("testing: %s<%s> Nbytes=%zu (%6.1f MB) numPongs=%d numInflight=%d eachCopyElements=%zu eachCopyBytes=%zu\n",
|
||||
__func__, HostTraits<AllocType>::Name(), Nbytes, (double)(Nbytes)/1024.0/1024.0, numPongs, numInflight, eachCopyElements, eachCopyBytes);
|
||||
|
||||
T *A_h;
|
||||
T *A_d;
|
||||
|
||||
A_h = (T*)(HostTraits<AllocType>::Alloc(Nbytes));
|
||||
HIPCHECK(hipMalloc(&A_d, Nbytes));
|
||||
|
||||
// Initialize the host array:
|
||||
const T initValue = 13;
|
||||
const T deviceConst = 2;
|
||||
const T hostConst = 10000;
|
||||
for (size_t i=0; i<numElements; i++) {
|
||||
A_h[i] = initValue + i;
|
||||
}
|
||||
|
||||
|
||||
for (int k=0; k<numPongs; k++ ) {
|
||||
for (int i=0; i<numInflight; i++) {
|
||||
HIPCHECK(hipMemcpyAsync(&A_d[i*eachCopyElements], &A_h[i*eachCopyElements], eachCopyBytes, hipMemcpyHostToDevice, stream));
|
||||
}
|
||||
|
||||
hipLaunchKernel(addK<T>, dim3(blocks), dim3(threadsPerBlock), 0, stream, A_d, 2, numElements);
|
||||
|
||||
for (int i=0; i<numInflight; i++ ) {
|
||||
HIPCHECK(hipMemcpyAsync(&A_h[i*eachCopyElements], &A_d[i*eachCopyElements], eachCopyBytes, hipMemcpyDeviceToHost, stream));
|
||||
}
|
||||
|
||||
if (doHostSide) {
|
||||
assert(0);
|
||||
#if 0
|
||||
hipEvent_t e;
|
||||
HIPCHECK(hipEventCreate(&e));
|
||||
#endif
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
for (size_t i=0; i<numElements; i++) {
|
||||
A_h[i] += hostConst;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
|
||||
|
||||
// Verify we copied back all the data correctly:
|
||||
for (size_t i=0; i<numElements; i++) {
|
||||
T gold = initValue + i;
|
||||
// Perform calcs in same order as test above to replicate FP order-of-operations:
|
||||
for (int k=0; k<numPongs; k++) {
|
||||
gold += deviceConst;
|
||||
if (doHostSide) {
|
||||
gold += hostConst;
|
||||
}
|
||||
}
|
||||
|
||||
if (gold != A_h[i]) {
|
||||
std::cout << i << ": gold=" << gold << " out=" << A_h[i] << std::endl;
|
||||
HIPASSERT(gold == A_h[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
HIPCHECK(hipFreeHost(A_h));
|
||||
HIPCHECK(hipFree(A_d));
|
||||
}
|
||||
|
||||
|
||||
//---
|
||||
//Send many async copies to the same stream.
|
||||
//This requires runtime to keep track of many outstanding commands, and in the case of HCC requires growing/tracking the signal pool:
|
||||
template<typename T>
|
||||
void test_manyInflightCopies(hipStream_t stream, int numElements, int numCopies, bool syncBetweenCopies)
|
||||
{
|
||||
size_t Nbytes = numElements*sizeof(T);
|
||||
size_t eachCopyElements = numElements / numCopies;
|
||||
size_t eachCopyBytes = eachCopyElements * sizeof(T);
|
||||
|
||||
printf ("-----------------------------------------------------------------------------------------------\n");
|
||||
printf ("testing: %s Nbytes=%zu (%6.1f MB) numCopies=%d eachCopyElements=%zu eachCopyBytes=%zu\n",
|
||||
__func__, Nbytes, (double)(Nbytes)/1024.0/1024.0, numCopies, eachCopyElements, eachCopyBytes);
|
||||
|
||||
T *A_d;
|
||||
T *A_h1, *A_h2;
|
||||
|
||||
HIPCHECK(hipMallocHost(&A_h1, Nbytes));
|
||||
HIPCHECK(hipMallocHost(&A_h2, Nbytes));
|
||||
HIPCHECK(hipMalloc(&A_d, Nbytes));
|
||||
|
||||
for (int i=0; i<numElements; i++) {
|
||||
A_h1[i] = 3.14f + static_cast<T> (i);
|
||||
}
|
||||
|
||||
|
||||
//stream=0; // fixme TODO
|
||||
|
||||
|
||||
for (int i=0; i<numCopies; i++)
|
||||
{
|
||||
HIPCHECK(hipMemcpyAsync(&A_d[i*eachCopyElements], &A_h1[i*eachCopyElements], eachCopyBytes, hipMemcpyHostToDevice, stream));
|
||||
}
|
||||
|
||||
if (syncBetweenCopies) {
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
}
|
||||
|
||||
for (int i=0; i<numCopies; i++)
|
||||
{
|
||||
HIPCHECK(hipMemcpyAsync(&A_h2[i*eachCopyElements], &A_d[i*eachCopyElements], eachCopyBytes, hipMemcpyDeviceToHost, stream));
|
||||
}
|
||||
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
|
||||
|
||||
// Verify we copied back all the data correctly:
|
||||
for (int i=0; i<numElements; i++) {
|
||||
HIPASSERT(A_h1[i] == A_h2[i]);
|
||||
}
|
||||
|
||||
|
||||
HIPCHECK(hipFreeHost(A_h1));
|
||||
HIPCHECK(hipFreeHost(A_h2));
|
||||
HIPCHECK(hipFree(A_d));
|
||||
}
|
||||
|
||||
|
||||
//---
|
||||
//Classic example showing how to overlap data transfer with compute.
|
||||
//We divide the work into "chunks" and create a stream for each chunk.
|
||||
//Each chunk then runs a H2D copy, followed by kernel execution, followed by D2H copyback.
|
||||
//Work in separate streams is independent which enables concurrency.
|
||||
|
||||
// IN: nStreams : number of streams to use for the test
|
||||
// IN :useNullStream - use NULL stream. Synchronizes everything.
|
||||
// IN: useSyncMemcpyH2D - use sync memcpy (no overlap) for H2D
|
||||
// IN: useSyncMemcpyD2H - use sync memcpy (no overlap) for D2H
|
||||
void test_chunkedAsyncExample(int nStreams, bool useNullStream, bool useSyncMemcpyH2D, bool useSyncMemcpyD2H)
|
||||
{
|
||||
|
||||
size_t Nbytes = N*sizeof(int);
|
||||
printf ("testing: %s(useNullStream=%d, useSyncMemcpyH2D=%d, useSyncMemcpyD2H=%d) ",__func__, useNullStream, useSyncMemcpyH2D, useSyncMemcpyD2H);
|
||||
printf ("Nbytes=%zu (%6.1f MB)\n", Nbytes, (double)(Nbytes)/1024.0/1024.0);
|
||||
|
||||
int *A_d, *B_d, *C_d;
|
||||
int *A_h, *B_h, *C_h;
|
||||
|
||||
HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, true);
|
||||
|
||||
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
|
||||
|
||||
hipStream_t *stream = (hipStream_t*)malloc(sizeof(hipStream_t) * nStreams);
|
||||
if (useNullStream) {
|
||||
nStreams = 1;
|
||||
stream[0] = NULL;
|
||||
} else {
|
||||
for (int i = 0; i < nStreams; ++i) {
|
||||
HIPCHECK (hipStreamCreate(&stream[i]));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
size_t workLeft = N;
|
||||
size_t workPerStream = N / nStreams;
|
||||
for (int i = 0; i < nStreams; ++i) {
|
||||
size_t work = (workLeft < workPerStream) ? workLeft : workPerStream;
|
||||
size_t workBytes = work * sizeof(int);
|
||||
|
||||
size_t offset = i*workPerStream;
|
||||
|
||||
if (useSyncMemcpyH2D) {
|
||||
HIPCHECK ( hipMemcpy(&A_d[offset], &A_h[offset], workBytes, hipMemcpyHostToDevice));
|
||||
HIPCHECK ( hipMemcpy(&B_d[offset], &B_h[offset], workBytes, hipMemcpyHostToDevice));
|
||||
} else {
|
||||
HIPCHECK ( hipMemcpyAsync(&A_d[offset], &A_h[offset], workBytes, hipMemcpyHostToDevice, stream[i]));
|
||||
HIPCHECK ( hipMemcpyAsync(&B_d[offset], &B_h[offset], workBytes, hipMemcpyHostToDevice, stream[i]));
|
||||
};
|
||||
|
||||
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, stream[i], &A_d[offset], &B_d[offset], &C_d[offset], work);
|
||||
|
||||
if (useSyncMemcpyD2H) {
|
||||
HIPCHECK ( hipMemcpy(&C_h[offset], &C_d[offset], workBytes, hipMemcpyDeviceToHost));
|
||||
} else {
|
||||
HIPCHECK ( hipMemcpyAsync(&C_h[offset], &C_d[offset], workBytes, hipMemcpyDeviceToHost, stream[i]));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
HIPCHECK (hipDeviceSynchronize());
|
||||
|
||||
|
||||
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
||||
|
||||
HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, true);
|
||||
|
||||
free(stream);
|
||||
};
|
||||
|
||||
|
||||
//---
|
||||
//Parse arguments specific to this test.
|
||||
void parseMyArguments(int argc, char *argv[])
|
||||
{
|
||||
int more_argc = HipTest::parseStandardArguments(argc, argv, false);
|
||||
|
||||
// parse args for this test:
|
||||
for (int i = 1; i < more_argc; i++) {
|
||||
const char *arg = argv[i];
|
||||
|
||||
if (!strcmp(arg, "--streams")) {
|
||||
if (++i >= argc || !HipTest::parseUInt(argv[i], &p_streams)) {
|
||||
failed("Bad streams argument");
|
||||
}
|
||||
} else {
|
||||
failed("Bad argument '%s'", arg);
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
HipTest::parseStandardArguments(argc, argv, true);
|
||||
parseMyArguments(argc, argv);
|
||||
|
||||
|
||||
printf ("info: set device to %d\n", p_gpuDevice);
|
||||
HIPCHECK(hipSetDevice(p_gpuDevice));
|
||||
|
||||
if (p_tests & 0x01) {
|
||||
simpleNegTest();
|
||||
}
|
||||
|
||||
if (p_tests & 0x02) {
|
||||
hipStream_t stream;
|
||||
HIPCHECK (hipStreamCreate(&stream));
|
||||
|
||||
test_manyInflightCopies<float>(stream, 1024, 16, true);
|
||||
test_manyInflightCopies<float>(stream, 1024, 4, true); // verify we re-use the same entries instead of growing pool.
|
||||
test_manyInflightCopies<float>(stream, 1024*8, 64, false);
|
||||
|
||||
HIPCHECK(hipStreamDestroy(stream));
|
||||
}
|
||||
|
||||
|
||||
if (p_tests & 0x04) {
|
||||
test_chunkedAsyncExample(p_streams, true, true, true); // Easy sync version
|
||||
test_chunkedAsyncExample(p_streams, false, true, true); // Easy sync version
|
||||
test_chunkedAsyncExample(p_streams, false, false, true); // Some async
|
||||
test_chunkedAsyncExample(p_streams, false, false, false); // All async
|
||||
}
|
||||
|
||||
if (p_tests & 0x08) {
|
||||
hipStream_t stream;
|
||||
HIPCHECK (hipStreamCreate(&stream));
|
||||
|
||||
test_pingpong<int, Pinned>(stream, 1024*1024*32, 1, 1, false);
|
||||
test_pingpong<int, Pinned>(stream, 1024*1024*32, 1, 10, false);
|
||||
|
||||
HIPCHECK(hipStreamDestroy(stream));
|
||||
}
|
||||
|
||||
|
||||
passed();
|
||||
|
||||
}
|
||||
@@ -0,0 +1,524 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
// Test pointer tracking logic: allocate memory and retrieve stats with hipPointerGetAttributes
|
||||
|
||||
#include "hip_runtime.h"
|
||||
#include "test_common.h"
|
||||
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
//#include "hcc_detail/AM.h"
|
||||
#include "hc_am.hpp"
|
||||
|
||||
#endif
|
||||
|
||||
size_t Nbytes = 0;
|
||||
|
||||
//=================================================================================================
|
||||
// Utility Functions:
|
||||
//=================================================================================================
|
||||
|
||||
bool operator==(const hipPointerAttribute_t &lhs, const hipPointerAttribute_t &rhs)
|
||||
{
|
||||
return ((lhs.hostPointer == rhs.hostPointer) &&
|
||||
(lhs.devicePointer == rhs.devicePointer) &&
|
||||
(lhs.memoryType == rhs.memoryType) &&
|
||||
(lhs.device == rhs.device) &&
|
||||
(lhs.allocationFlags == rhs.allocationFlags)
|
||||
) ;
|
||||
|
||||
};
|
||||
|
||||
|
||||
bool operator!=(const hipPointerAttribute_t &lhs, const hipPointerAttribute_t &rhs)
|
||||
{
|
||||
return ! (lhs == rhs);
|
||||
}
|
||||
|
||||
|
||||
const char *memoryTypeToString(hipMemoryType memoryType)
|
||||
{
|
||||
switch (memoryType) {
|
||||
case hipMemoryTypeHost : return "[Host]";
|
||||
case hipMemoryTypeDevice : return "[Device]";
|
||||
default: return "[Unknown]";
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
void resetAttribs(hipPointerAttribute_t *attribs)
|
||||
{
|
||||
attribs->hostPointer = (void*) (-1);
|
||||
attribs->devicePointer = (void*) (-1);
|
||||
attribs->memoryType = hipMemoryTypeHost;
|
||||
attribs->device = -2;
|
||||
attribs->isManaged = -1;
|
||||
attribs->allocationFlags = 0xffff;
|
||||
};
|
||||
|
||||
|
||||
void printAttribs(const hipPointerAttribute_t *attribs)
|
||||
{
|
||||
printf ("hostPointer:%p devicePointer:%p memoryType:%s deviceId:%d isManaged:%d allocationFlags:%u\n",
|
||||
attribs->hostPointer,
|
||||
attribs->devicePointer,
|
||||
memoryTypeToString(attribs->memoryType),
|
||||
attribs->device,
|
||||
attribs->isManaged,
|
||||
attribs->allocationFlags
|
||||
);
|
||||
};
|
||||
|
||||
|
||||
inline int zrand(int max)
|
||||
{
|
||||
return rand() % max;
|
||||
}
|
||||
|
||||
|
||||
//=================================================================================================
|
||||
// Functins to run tests
|
||||
//=================================================================================================
|
||||
//--
|
||||
//Run through a couple simple cases to test lookups and host pointer arithmetic:
|
||||
void testSimple()
|
||||
{
|
||||
printf ("\n");
|
||||
printf ("===========================================================================\n");
|
||||
printf ("Simple Tests\n");
|
||||
printf ("===========================================================================\n");
|
||||
|
||||
char *A_d;
|
||||
char *A_Pinned_h;
|
||||
char *A_OSAlloc_h;
|
||||
hipError_t e;
|
||||
|
||||
HIPCHECK ( hipMalloc(&A_d, Nbytes) );
|
||||
HIPCHECK ( hipMallocHost(&A_Pinned_h, Nbytes) );
|
||||
A_OSAlloc_h = (char*)malloc(Nbytes);
|
||||
|
||||
size_t free, total;
|
||||
HIPCHECK(hipMemGetInfo(&free, &total));
|
||||
printf ("hipMemGetInfo: free=%zu (%4.2f) Nbytes=%lu total=%zu (%4.2f)\n", free, (float)(free/1024.0/1024.0), Nbytes, total, (float)(total/1024.0/1024.0));
|
||||
HIPASSERT(free + Nbytes <= total);
|
||||
|
||||
|
||||
hipPointerAttribute_t attribs;
|
||||
hipPointerAttribute_t attribs2;
|
||||
|
||||
// Device memory
|
||||
printf ("\nDevice memory (hipMalloc)\n");
|
||||
HIPCHECK( hipPointerGetAttributes(&attribs, A_d));
|
||||
printf("getAttr:%-20s", "A_d"); printAttribs(&attribs);
|
||||
|
||||
// Check pointer arithmetic cases:
|
||||
resetAttribs(&attribs2);
|
||||
HIPCHECK( hipPointerGetAttributes(&attribs2, A_d+100));
|
||||
printf("getAttr:%-20s", "A_d+100"); printAttribs(&attribs2);
|
||||
HIPASSERT(attribs == attribs2);
|
||||
|
||||
// Corner case at end of array:
|
||||
resetAttribs(&attribs2);
|
||||
HIPCHECK( hipPointerGetAttributes(&attribs2, A_d+Nbytes-1));
|
||||
printf("getAttr:%-20s", "A_d+NBytes-1"); printAttribs(&attribs2);
|
||||
HIPASSERT(attribs == attribs2);
|
||||
|
||||
// Pointer just beyond array - must be invalid or at least a different pointer
|
||||
resetAttribs(&attribs2);
|
||||
e = hipPointerGetAttributes(&attribs2, A_d+Nbytes+1);
|
||||
printf("getAttr:%-20s err=%d (%s), neg-test expected\n", "A_d+NBytes", e, hipGetErrorString(e));
|
||||
if (e != hipErrorInvalidValue) {
|
||||
// We might have strayed into another pointer area.
|
||||
printf("getAttr:%-20s", "A_d+NBytes"); printAttribs(&attribs2);
|
||||
HIPASSERT(attribs.devicePointer != attribs2.devicePointer);
|
||||
}
|
||||
|
||||
|
||||
resetAttribs(&attribs2);
|
||||
e = hipPointerGetAttributes(&attribs2, A_d+Nbytes);
|
||||
if (e != hipErrorInvalidValue) {
|
||||
printf("%-20s", "A_d+Nbytes"); printAttribs(&attribs2);
|
||||
HIPASSERT(attribs.devicePointer != attribs2.devicePointer);
|
||||
}
|
||||
|
||||
hipFree(A_d);
|
||||
e = hipPointerGetAttributes(&attribs, A_d);
|
||||
HIPASSERT(e == hipErrorInvalidValue); // Just freed the pointer, this should return an error.
|
||||
|
||||
|
||||
// Device-visible host memory
|
||||
printf ("\nDevice-visible host memory (hipMallocHost)\n");
|
||||
HIPCHECK( hipPointerGetAttributes(&attribs, A_Pinned_h));
|
||||
printf("getAttr:%-20s", "A_pinned_h"); printAttribs(&attribs);
|
||||
|
||||
resetAttribs(&attribs2);
|
||||
HIPCHECK( hipPointerGetAttributes(&attribs2, A_Pinned_h+Nbytes/2));
|
||||
printf("getAttr:%-20s", "A_pinned_h+NBytes/2"); printAttribs(&attribs2);
|
||||
HIPASSERT(attribs == attribs2);
|
||||
|
||||
|
||||
hipFreeHost(A_Pinned_h);
|
||||
e = hipPointerGetAttributes(&attribs, A_Pinned_h);
|
||||
HIPASSERT(e == hipErrorInvalidValue); // Just freed the pointer, this should return an error.
|
||||
printf("getAttr:%-20s err=%d (%s), neg-test expected\n", "A_d+NBytes", e, hipGetErrorString(e));
|
||||
|
||||
|
||||
// OS memory
|
||||
printf ("\nOS-allocated memory (malloc)\n");
|
||||
e = hipPointerGetAttributes(&attribs, A_OSAlloc_h);
|
||||
printf("getAttr:%-20s err=%d (%s), neg-test expected\n", "A_OSAlloc_h", e, hipGetErrorString(e));
|
||||
HIPASSERT(e == hipErrorInvalidValue); // OS-allocated pointers should return hipErrorInvalidValue.
|
||||
}
|
||||
|
||||
//---
|
||||
//Reset the memory tracker (remove allocations from all known devices):
|
||||
//This frees any memory allocated through the runtime.
|
||||
//The routine will not release any
|
||||
void resetTracker ()
|
||||
{
|
||||
if (p_verbose & 0x1) {
|
||||
printf ("info: reset tracker for all devices in platform\n");
|
||||
}
|
||||
|
||||
int numDevices;
|
||||
HIPCHECK(hipGetDeviceCount(&numDevices));
|
||||
|
||||
// Clean up:
|
||||
for (int i=0; i<numDevices; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
HIPCHECK(hipDeviceReset());
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
// Store the hipPointer attrib and some extra info so can later compare the looked-up info against the reference expectation
|
||||
struct SuperPointerAttribute {
|
||||
void * _pointer;
|
||||
size_t _sizeBytes;
|
||||
hipPointerAttribute_t _attrib;
|
||||
};
|
||||
|
||||
|
||||
//---
|
||||
//Support function to check result against a reference:
|
||||
void checkPointer(SuperPointerAttribute &ref, int major, int minor, void *pointer)
|
||||
{
|
||||
hipPointerAttribute_t attribs;
|
||||
resetAttribs(&attribs);
|
||||
|
||||
hipError_t e = hipPointerGetAttributes(&attribs, pointer);
|
||||
if ((e != hipSuccess) || (attribs != ref._attrib)) {
|
||||
printf("Test %d.%d (err=%d)\n", major, minor, e);
|
||||
HIPCHECK(e);
|
||||
printf(" ref :: "); printAttribs(&ref._attrib);
|
||||
printf(" getattr:: "); printAttribs(&attribs);
|
||||
|
||||
HIPASSERT(attribs == ref._attrib);
|
||||
} else {
|
||||
if (p_verbose & 0x1) {
|
||||
printf("#%4d.%d GOOD:%p getattr :: ",major, minor, pointer); printAttribs(&attribs);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//---
|
||||
//Test that allocates memory across all 4 devices withing the specified size range (minSize...maxSize).
|
||||
//Then does lookups to make sure the info reported by the tracker matches expecations
|
||||
//Then deallocates it all.
|
||||
//
|
||||
//Multiple threads can call this funtion and in fact we do this in the testMultiThreaded_1 test.
|
||||
void clusterAllocs(int numAllocs, size_t minSize, size_t maxSize)
|
||||
{
|
||||
printf (" clusterAllocs numAllocs=%d size=%lu..%lu\n", numAllocs, minSize, maxSize);
|
||||
std::vector <SuperPointerAttribute> reference(numAllocs);
|
||||
|
||||
HIPASSERT(minSize > 0);
|
||||
HIPASSERT(maxSize >= minSize);
|
||||
|
||||
int numDevices;
|
||||
HIPCHECK(hipGetDeviceCount(&numDevices));
|
||||
|
||||
//---
|
||||
//Populate with device and host allocations.
|
||||
size_t totalDeviceAllocated[numDevices];
|
||||
for (int i =0; i<numDevices; i++) {
|
||||
totalDeviceAllocated[i] = 0;
|
||||
}
|
||||
for (int i=0; i<numAllocs; i++) {
|
||||
bool isDevice = rand() & 0x1;
|
||||
reference[i]._sizeBytes = zrand(maxSize-minSize) + minSize;
|
||||
|
||||
reference[i]._attrib.device = zrand(numDevices);
|
||||
HIPCHECK(hipSetDevice(reference[i]._attrib.device));
|
||||
reference[i]._attrib.isManaged = 0;
|
||||
|
||||
void * ptr;
|
||||
if (isDevice) {
|
||||
totalDeviceAllocated[reference[i]._attrib.device] += reference[i]._sizeBytes;
|
||||
HIPCHECK(hipMalloc(&ptr, reference[i]._sizeBytes));
|
||||
reference[i]._attrib.memoryType = hipMemoryTypeDevice;
|
||||
reference[i]._attrib.devicePointer = ptr;
|
||||
reference[i]._attrib.hostPointer = NULL;
|
||||
reference[i]._attrib.allocationFlags = 0; // TODO-randomize these.
|
||||
} else {
|
||||
HIPCHECK(hipMallocHost(&ptr, reference[i]._sizeBytes));
|
||||
reference[i]._attrib.memoryType = hipMemoryTypeHost;
|
||||
reference[i]._attrib.devicePointer = ptr;
|
||||
reference[i]._attrib.hostPointer = ptr;
|
||||
reference[i]._attrib.allocationFlags = 0; // TODO-randomize these.
|
||||
}
|
||||
reference[i]._pointer = ptr;
|
||||
}
|
||||
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
if (p_verbose & 0x2) {
|
||||
printf ("Tracker after insertions:\n");
|
||||
hc::am_memtracker_print();
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
for (int i =0; i<numDevices; i++) {
|
||||
size_t free, total;
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
HIPCHECK(hipMemGetInfo(&free, &total));
|
||||
printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) clusterAllocTotalDevice=%lu (%4.2fMB) total=%zu (%4.2fMB)\n",
|
||||
i, free, (float)(free/1024.0/1024.0), totalDeviceAllocated[i], (float)(totalDeviceAllocated[i])/1024.0/1024.0, total, (float)(total/1024.0/1024.0));
|
||||
HIPASSERT(free + totalDeviceAllocated[i] <= total);
|
||||
}
|
||||
|
||||
|
||||
// Now look up each pointer we inserted and verify we can find it:
|
||||
for (int i=0; i<numAllocs; i++) {
|
||||
SuperPointerAttribute &ref = reference[i];
|
||||
checkPointer(ref, i, 0, ref._pointer);
|
||||
checkPointer(ref, i, 1, (char *)ref._pointer + ref._sizeBytes/2);
|
||||
if (ref._sizeBytes > 1) {
|
||||
checkPointer(ref, i, 2, (char *)ref._pointer + ref._sizeBytes-1);
|
||||
}
|
||||
|
||||
if (ref._attrib.memoryType == hipMemoryTypeDevice) {
|
||||
hipFree(ref._pointer);
|
||||
} else {
|
||||
hipFreeHost(ref._pointer);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
if (p_verbose & 0x2) {
|
||||
printf ("Tracker after cleanup:\n");
|
||||
hc::am_memtracker_print();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
//---
|
||||
// Multi-threaded test with many simul allocs.
|
||||
// IN : serialize will force the test to run in serial fashion.
|
||||
// Seems like this does not hit MT corner cases in the tracker very often - testMultiThreaded_2 below seems more effective.
|
||||
void testMultiThreaded_1(bool serialize=false)
|
||||
{
|
||||
printf ("\n===========================================================================\n");
|
||||
printf ("MultiThreaded_1\n");
|
||||
if (serialize) printf ("[SERIALIZE]\n");
|
||||
printf ("===========================================================================\n");
|
||||
std::thread t1(clusterAllocs, 1000, 101, 1000);
|
||||
if (serialize) t1.join();
|
||||
|
||||
std::thread t2(clusterAllocs, 1000, 11, 100);
|
||||
if (serialize) t2.join();
|
||||
|
||||
std::thread t3(clusterAllocs, 1000, 5, 10);
|
||||
if (serialize) t3.join();
|
||||
|
||||
std::thread t4(clusterAllocs, 1000, 1, 4);
|
||||
if (serialize) t4.join();
|
||||
|
||||
if (!serialize) {
|
||||
t1.join();
|
||||
t2.join();
|
||||
t3.join();
|
||||
t4.join();
|
||||
}
|
||||
|
||||
resetTracker();
|
||||
}
|
||||
|
||||
|
||||
///================================================================================================
|
||||
|
||||
//---
|
||||
//Repeatedly query a single entry:
|
||||
void thread_query(void *ptr, const hipPointerAttribute_t *refAttrib)
|
||||
{
|
||||
int count = 0;
|
||||
|
||||
for (int count=0; count< 1000000; count++) {
|
||||
hipPointerAttribute_t a;
|
||||
hipError_t e = hipPointerGetAttributes(&a, ptr);
|
||||
if ((e != hipSuccess) || (a!= *refAttrib)) {
|
||||
printf("Test %d (err=%d)\n", count, e);
|
||||
HIPCHECK(e);
|
||||
|
||||
printf(" ref :: "); printAttribs(refAttrib);
|
||||
printf(" getattr:: "); printAttribs(&a);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
//---
|
||||
// Add pointers to tracker very quickly, then remove them quickly:
|
||||
enum Dir {Up, Down};
|
||||
void thread_noise_generator(int iters, size_t numBuffers, Dir addDir, Dir removeDir)
|
||||
{
|
||||
const size_t bufferSize = 16;
|
||||
size_t maxSize = numBuffers*bufferSize;
|
||||
HIPASSERT((maxSize % bufferSize) == 0); // loop logic assumes this is true
|
||||
|
||||
|
||||
for (int i=0; i<iters; i++) {
|
||||
char * basePtr = (char*)malloc(maxSize);
|
||||
|
||||
auto acc = hc::accelerator();
|
||||
|
||||
if (addDir == Up) {
|
||||
for (char *p = basePtr; p<basePtr + maxSize; p+=bufferSize) {
|
||||
hc::am_memtracker_add(p, bufferSize, acc, false);
|
||||
}
|
||||
} else if (addDir == Down) {
|
||||
for (char *p = basePtr+maxSize-bufferSize; p>=0; p-=bufferSize) {
|
||||
hc::am_memtracker_add(p, bufferSize, acc, false);
|
||||
}
|
||||
}
|
||||
|
||||
if (removeDir == Up) {
|
||||
for (char *p = basePtr; p<basePtr + maxSize; p+=bufferSize) {
|
||||
hc::am_memtracker_remove(p);
|
||||
}
|
||||
} else if (removeDir == Down) {
|
||||
for (char *p = basePtr+maxSize-bufferSize; p>=0; p-=bufferSize) {
|
||||
hc::am_memtracker_remove(p);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//---
|
||||
//Multi-thread test that is effective at catching locking errors in the alloc/dealloc/tracker.
|
||||
//The query thread repeately requests information on the same block of memory.
|
||||
//Meanwhile, the thread_noise_generator registers a large number of blocks, and
|
||||
//then unregisters them. This causes a large amount of rebalancing in the tree
|
||||
//structure and will generate errors unless the locks in the tracker are preventing reading
|
||||
//while writing.
|
||||
void testMultiThreaded_2()
|
||||
{
|
||||
std::atomic<int> inflight(2);
|
||||
|
||||
printf ("\n===========================================================================\n");
|
||||
printf ("MultiThreaded_2\n");
|
||||
printf ("===========================================================================\n");
|
||||
|
||||
hipSetDevice(0);
|
||||
hipDeviceReset();
|
||||
|
||||
// Create some entries in the tracker:
|
||||
for (int i=0; i<1000; i++) {
|
||||
void *C_d;
|
||||
HIPCHECK(hipMalloc(&C_d, 32));
|
||||
}
|
||||
|
||||
|
||||
// Allocate a pointer that we will repeatedly lookup:
|
||||
void *A_d;
|
||||
HIPCHECK(hipMalloc(&A_d, 10000));
|
||||
hipPointerAttribute_t attrib1;
|
||||
HIPCHECK(hipPointerGetAttributes(&attrib1, A_d));
|
||||
std::thread t1(thread_query, A_d, &attrib1);
|
||||
|
||||
std::thread t2(thread_noise_generator, 10000, 1000, Up, Up);
|
||||
|
||||
t1.join();
|
||||
t2.join();
|
||||
|
||||
hipSetDevice(0);
|
||||
hipDeviceReset();
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
N= 1000000;
|
||||
HipTest::parseStandardArguments(argc, argv, true);
|
||||
|
||||
|
||||
Nbytes = N*sizeof(char);
|
||||
|
||||
printf ("N=%zu (%6.2f MB) device=%d\n", N, Nbytes/(1024.0*1024.0), p_gpuDevice);
|
||||
|
||||
|
||||
if (p_tests & 0x01) {
|
||||
printf ("info: set device to %d\n", p_gpuDevice);
|
||||
HIPCHECK(hipSetDevice(p_gpuDevice));
|
||||
testSimple();
|
||||
}
|
||||
|
||||
if (p_tests & 0x02) {
|
||||
srand(0x100);
|
||||
printf ("\n===========================================================================\n");
|
||||
clusterAllocs(100, 1024*1, 1024*1024);
|
||||
resetTracker();
|
||||
}
|
||||
|
||||
if (p_tests & 0x04) {
|
||||
srand(0x200);
|
||||
printf ("\n===========================================================================\n");
|
||||
clusterAllocs(1000, 1, 10); // Many tiny allocations;
|
||||
resetTracker();
|
||||
}
|
||||
|
||||
if (p_tests & 0x08) {
|
||||
srand(0x300);
|
||||
testMultiThreaded_1(true);
|
||||
testMultiThreaded_1(false);
|
||||
}
|
||||
|
||||
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
if (p_tests & 0x10) {
|
||||
srand(0x400);
|
||||
testMultiThreaded_2();
|
||||
resetTracker();
|
||||
}
|
||||
#endif
|
||||
|
||||
printf ("\n");
|
||||
passed();
|
||||
}
|
||||
@@ -28,6 +28,8 @@ int iterations = 1;
|
||||
unsigned blocksPerCU = 6; // to hide latency
|
||||
unsigned threadsPerBlock = 256;
|
||||
int p_gpuDevice = 0;
|
||||
unsigned p_verbose = 0;
|
||||
int p_tests = -1; /*which tests to run. Interpretation is left to each test. default:all*/
|
||||
|
||||
|
||||
|
||||
@@ -86,7 +88,7 @@ int parseStandardArguments(int argc, char *argv[], bool failOnUndefinedArg)
|
||||
|
||||
if (!strcmp(arg, " ")) {
|
||||
// skip NULL args.
|
||||
} else if (!strcmp(arg, "--N")) {
|
||||
} else if (!strcmp(arg, "--N") || (!strcmp(arg, "-N"))) {
|
||||
if (++i >= argc || !HipTest::parseSize(argv[i], &N)) {
|
||||
failed("Bad N size argument");
|
||||
}
|
||||
@@ -114,8 +116,16 @@ int parseStandardArguments(int argc, char *argv[], bool failOnUndefinedArg)
|
||||
failed("Bad gpuDevice argument");
|
||||
}
|
||||
|
||||
}
|
||||
else {
|
||||
} else if (!strcmp(arg, "--verbose") || (!strcmp(arg, "-v"))) {
|
||||
if (++i >= argc || !HipTest::parseUInt(argv[i], &p_verbose)) {
|
||||
failed("Bad verbose argument");
|
||||
}
|
||||
} else if (!strcmp(arg, "--tests") || (!strcmp(arg, "-t"))) {
|
||||
if (++i >= argc || !HipTest::parseInt(argv[i], &p_tests)) {
|
||||
failed("Bad tests argument");
|
||||
}
|
||||
|
||||
} else {
|
||||
if (failOnUndefinedArg) {
|
||||
failed("Bad argument '%s'", arg);
|
||||
} else {
|
||||
|
||||
@@ -25,7 +25,7 @@
|
||||
printf (__VA_ARGS__);\
|
||||
printf ("\n");\
|
||||
printf ("error: TEST FAILED\n%s", KNRM );\
|
||||
exit(EXIT_FAILURE);
|
||||
abort();
|
||||
|
||||
|
||||
#define HIPCHECK(error) \
|
||||
@@ -53,6 +53,8 @@ extern int iterations;
|
||||
extern unsigned blocksPerCU;
|
||||
extern unsigned threadsPerBlock;
|
||||
extern int p_gpuDevice;
|
||||
extern unsigned p_verbose;
|
||||
extern int p_tests;
|
||||
|
||||
namespace HipTest {
|
||||
|
||||
@@ -86,7 +88,7 @@ vectorADD(hipLaunchParm lp,
|
||||
size_t stride = hipBlockDim_x * hipGridDim_x ;
|
||||
|
||||
for (size_t i=offset; i<N; i+=stride) {
|
||||
C_d[i] = A_d[i] + B_d[i];
|
||||
C_d[i] = A_d[i] + B_d[i];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -94,7 +96,7 @@ vectorADD(hipLaunchParm lp,
|
||||
template <typename T>
|
||||
void initArrays(T **A_d, T **B_d, T **C_d,
|
||||
T **A_h, T **B_h, T **C_h,
|
||||
size_t N)
|
||||
size_t N, bool usePinnedHost=false)
|
||||
{
|
||||
size_t Nbytes = N*sizeof(T);
|
||||
|
||||
@@ -108,14 +110,32 @@ void initArrays(T **A_d, T **B_d, T **C_d,
|
||||
HIPCHECK ( hipMalloc(C_d, Nbytes) );
|
||||
}
|
||||
|
||||
if (A_h)
|
||||
*A_h = (T*)malloc(Nbytes);
|
||||
|
||||
if (B_h)
|
||||
*B_h = (T*)malloc(Nbytes);
|
||||
if (usePinnedHost) {
|
||||
if (A_h) {
|
||||
HIPCHECK ( hipMallocHost(A_h, Nbytes) );
|
||||
}
|
||||
if (B_h) {
|
||||
HIPCHECK ( hipMallocHost(B_h, Nbytes) );
|
||||
}
|
||||
if (C_h) {
|
||||
HIPCHECK ( hipMallocHost(C_h, Nbytes) );
|
||||
}
|
||||
} else {
|
||||
if (A_h) {
|
||||
*A_h = (T*)malloc(Nbytes);
|
||||
HIPASSERT(*A_h != NULL);
|
||||
}
|
||||
|
||||
if (B_h) {
|
||||
*B_h = (T*)malloc(Nbytes);
|
||||
HIPASSERT(*B_h != NULL);
|
||||
}
|
||||
|
||||
if (C_h)
|
||||
*C_h = (T*)malloc(Nbytes);
|
||||
if (C_h) {
|
||||
*C_h = (T*)malloc(Nbytes);
|
||||
HIPASSERT(*C_h != NULL);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Initialize the host data:
|
||||
@@ -128,7 +148,43 @@ void initArrays(T **A_d, T **B_d, T **C_d,
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
void freeArrays(T *A_d, T *B_d, T *C_d,
|
||||
T *A_h, T *B_h, T *C_h, bool usePinnedHost)
|
||||
{
|
||||
if (A_d) {
|
||||
HIPCHECK ( hipFree(A_d) );
|
||||
}
|
||||
if (B_d) {
|
||||
HIPCHECK ( hipFree(B_d) );
|
||||
}
|
||||
if (C_d) {
|
||||
HIPCHECK ( hipFree(C_d) );
|
||||
}
|
||||
|
||||
if (usePinnedHost) {
|
||||
if (A_h) {
|
||||
HIPCHECK (hipFreeHost(A_h));
|
||||
}
|
||||
if (B_h) {
|
||||
HIPCHECK (hipFreeHost(B_h));
|
||||
}
|
||||
if (C_h) {
|
||||
HIPCHECK (hipFreeHost(C_h));
|
||||
}
|
||||
} else {
|
||||
if (A_h) {
|
||||
free (A_h);
|
||||
}
|
||||
if (B_h) {
|
||||
free (B_h);
|
||||
}
|
||||
if (C_h) {
|
||||
free (C_h);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
// Assumes C_h contains vector add of A_h + B_h
|
||||
|
||||
@@ -91,6 +91,7 @@ syn keyword hipFunctionName hipD3D9UnmapResources
|
||||
syn keyword hipFunctionName hipD3D9UnregisterResource
|
||||
syn keyword hipFunctionName hipDeviceGetProperties
|
||||
syn keyword hipFunctionName hipDeviceSynchronize
|
||||
syn keyword hipFunctionName hipDeviceReset
|
||||
syn keyword hipFunctionName hipEventCreate
|
||||
syn keyword hipFunctionName hipEventDestroy
|
||||
syn keyword hipFunctionName hipEventElapsedTime
|
||||
@@ -151,6 +152,9 @@ syn keyword hipFunctionName hipUnbindTexture
|
||||
syn keyword hipFlags hipFilterModePoint
|
||||
syn keyword hipFlags hipMemcpyHostToDevice
|
||||
syn keyword hipFlags hipMemcpyDeviceToHost
|
||||
syn keyword hipFlags hipMemcpyHostToHost
|
||||
syn keyword hipFlags hipMemcpyDeviceToDevice
|
||||
syn keyword hipFlags hipMemcpyDefault
|
||||
syn keyword hipFlags hipReadModeElementType
|
||||
syn keyword hipFlags hipSuccess
|
||||
syn keyword hipFlags hipTextureType1D
|
||||
|
||||
Reference in New Issue
Block a user