diff --git a/projects/clr/hipamd/bin/hipcc b/projects/clr/hipamd/bin/hipcc index d001c6febe..1ab4cf2759 100755 --- a/projects/clr/hipamd/bin/hipcc +++ b/projects/clr/hipamd/bin/hipcc @@ -71,6 +71,7 @@ if ($HIP_PLATFORM eq "hcc") { $HIPLDFLAGS .= " -L$HSA_PATH/lib -lhsa-runtime64 -lhc_am"; # Add C++ libs for GCC. $HIPLDFLAGS .= " -lstdc++"; + $HIPLDFLAGS .= " -lm"; if ($verbose & 0x2) { print ("HSA_PATH=$HSA_PATH\n"); @@ -163,7 +164,7 @@ if ($needHipHcc) { if ((not -e $object) or ((stat($source))[9] > (stat($object))[9])) { my $CMD = "$HCC $HCCFLAGS -I$HSA_PATH/include -I$HIP_PATH/include -Wall -c $source -o $object"; if ($verbose & 0x10) { - $CMD .= " -g" ; + $CMD .= " -g -O2" ; } else { $CMD .= " -O3" ; } diff --git a/projects/clr/hipamd/bin/hipconfig b/projects/clr/hipamd/bin/hipconfig index db53d6014e..1687983330 100755 --- a/projects/clr/hipamd/bin/hipconfig +++ b/projects/clr/hipamd/bin/hipconfig @@ -82,6 +82,7 @@ if ($p_full) { system("$HCC_HOME/bin/hcc-config --cxxflags"); print ("HCC-ldflags : "); system("$HCC_HOME/bin/hcc-config --ldflags"); + printf("\n"); } if ($HIP_PLATFORM eq "nvcc") { print "\n" ; @@ -98,6 +99,8 @@ if ($p_full) { print "\n" ; print "== Linux Kernel\n"; system ("uname -a"); + + print "\n" ; $printed = 1; } diff --git a/projects/clr/hipamd/bin/hipify b/projects/clr/hipamd/bin/hipify index d143bdff37..e3b6c64c88 100755 --- a/projects/clr/hipamd/bin/hipify +++ b/projects/clr/hipamd/bin/hipify @@ -277,6 +277,8 @@ while (@ARGV) { $ft{'mem'} += s/\bcudaMemcpyKind\b/hipMemcpyKind/g; + $ft{'mem'} += s/\bcudaPointerAttributes\b/hipPointerAttribute_t/g; + #-------- # Memory management: diff --git a/projects/clr/hipamd/include/hcc_detail/hip_runtime.h b/projects/clr/hipamd/include/hcc_detail/hip_runtime.h index 8474f066df..aeed53348e 100644 --- a/projects/clr/hipamd/include/hcc_detail/hip_runtime.h +++ b/projects/clr/hipamd/include/hcc_detail/hip_runtime.h @@ -108,6 +108,12 @@ THE SOFTWARE. #define __HCC_C__ #endif + +// TODO - hipify-clang - change to use the function call. +//#define warpSize hc::__wavesize() +const int warpSize = 64; + + #define clock_t long long int __device__ inline long long int clock64() { return (long long int)hc::__clock_u64(); }; __device__ inline clock_t clock() { return (clock_t)hc::__clock_u64(); }; @@ -344,42 +350,42 @@ __device__ inline unsigned long long int __ballot( int input) } // warp shuffle functions -__device__ inline int __shfl(int input, int lane, int width) +__device__ inline int __shfl(int input, int lane, int width=warpSize) { return hc::__shfl(input,lane,width); } -__device__ inline int __shfl_up(int input, unsigned int lane_delta, int width) +__device__ inline int __shfl_up(int input, unsigned int lane_delta, int width=warpSize) { return hc::__shfl_up(input,lane_delta,width); } -__device__ inline int __shfl_down(int input, unsigned int lane_delta, int width) +__device__ inline int __shfl_down(int input, unsigned int lane_delta, int width=warpSize) { return hc::__shfl_down(input,lane_delta,width); } -__device__ inline int __shfl_xor(int input, int lane_mask, int width) +__device__ inline int __shfl_xor(int input, int lane_mask, int width=warpSize) { return hc::__shfl_xor(input,lane_mask,width); } -__device__ inline float __shfl(float input, int lane, int width) +__device__ inline float __shfl(float input, int lane, int width=warpSize) { return hc::__shfl(input,lane,width); } -__device__ inline float __shfl_up(float input, unsigned int lane_delta, int width) +__device__ inline float __shfl_up(float input, unsigned int lane_delta, int width=warpSize) { return hc::__shfl_up(input,lane_delta,width); } -__device__ inline float __shfl_down(float input, unsigned int lane_delta, int width) +__device__ inline float __shfl_down(float input, unsigned int lane_delta, int width=warpSize) { return hc::__shfl_down(input,lane_delta,width); } -__device__ inline float __shfl_xor(float input, int lane_mask, int width) +__device__ inline float __shfl_xor(float input, int lane_mask, int width=warpSize) { return hc::__shfl_xor(input,lane_mask,width); } @@ -438,7 +444,6 @@ __device__ inline float __dsqrt_rz(double x) {return hc::fast_math::sqrt(x); }; #define hipGridDim_z (hc_get_num_groups(0)) -extern int warpSize ; #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE) @@ -476,7 +481,8 @@ extern int warpSize ; #ifdef __HCC_CPP__ -hc::accelerator_view *ihipLaunchKernel(hipStream_t stream); +hipStream_t ihipPreLaunchKernel(hipStream_t stream, hc::accelerator_view **av); +void ihipPostLaunchKernel(hipStream_t stream, hc::completion_future &cf); #if not defined(DISABLE_GRID_LAUNCH) #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \ @@ -491,12 +497,13 @@ do {\ lp.groupMemBytes = _groupMemBytes;\ hc::completion_future cf;\ lp.cf = &cf; \ - lp.av = (ihipLaunchKernel(_stream)); \ + hipStream_t trueStream = (ihipPreLaunchKernel(_stream, &lp.av)); \ if (HIP_TRACE_API) {\ fprintf(stderr, "hiptrace1: launch '%s' gridDim:[%d.%d.%d] groupDim:[%d.%d.%d] groupMem:+%d stream=%p\n", \ #_kernelName, lp.gridDim.z, lp.gridDim.y, lp.gridDim.x, lp.groupDim.z, lp.groupDim.y, lp.groupDim.x, lp.groupMemBytes, (void*)(_stream));\ }\ _kernelName (lp, __VA_ARGS__);\ + ihipPostLaunchKernel(trueStream, cf);\ } while(0) #else @@ -514,12 +521,13 @@ do {\ lp.groupMemBytes = _groupMemBytes;\ hc::completion_future cf;\ lp.cf = &cf; \ - lp.av = (ihipLaunchKernel(_stream)); \ + hipStream_t trueStream = (ihipPreLaunchKernel(_stream, &lp.av)); \ if (HIP_TRACE_API) {\ fprintf(stderr, "hiptrace1: launch '%s' gridDim:[%d.%d.%d] groupDim:[%d.%d.%d] groupMem:+%d stream=%p\n", \ #_kernelName, lp.gridDim.z, lp.gridDim.y, lp.gridDim.x, lp.groupDim.z, lp.groupDim.y, lp.groupDim.x, lp.groupMemBytes, (void*)(_stream));\ }\ _kernelName (lp, __VA_ARGS__);\ + ihipPostLaunchKernel(trueStream, cf);\ } while(0) /*end hipLaunchKernel */ #endif diff --git a/projects/clr/hipamd/include/hcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hcc_detail/hip_runtime_api.h index 225b065654..5fe398b84c 100644 --- a/projects/clr/hipamd/include/hcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hcc_detail/hip_runtime_api.h @@ -105,6 +105,8 @@ enum hipMemcpyKind { } ; + + // Doxygen end group GlobalDefs /** @} */ @@ -113,7 +115,7 @@ enum hipMemcpyKind { // The handle allows the async commands to use the stream even if the parent hipStream_t goes out-of-scope. -typedef struct ihipStream_t * hipStream_t; +typedef class ihipStream_t * hipStream_t; /* @@ -128,6 +130,7 @@ typedef struct hipEvent_t { + #ifdef __cplusplus } /* extern "C" */ #endif @@ -634,6 +637,11 @@ hipError_t hipEventQuery(hipEvent_t event) ; */ +/** + * @brief Return attributes for the specified pointer + */ +hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) ; + /** * @brief Allocate memory on the default accelerator diff --git a/projects/clr/hipamd/include/hip_runtime_api.h b/projects/clr/hipamd/include/hip_runtime_api.h index 5191bc5d54..239be13843 100644 --- a/projects/clr/hipamd/include/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip_runtime_api.h @@ -98,6 +98,30 @@ typedef struct hipDeviceProp_t { } hipDeviceProp_t; +/** + * Memory type (for pointer attributes) + */ +enum hipMemoryType { + hipMemoryTypeHost, ///< Memory is physically located on host + hipMemoryTypeDevice ///< Memory is physically located on device. (see deviceId for specific device) +}; + + + +/** + * Pointer attributes + */ +typedef struct hipPointerAttribute_t { + enum hipMemoryType memoryType; + int device; + void *devicePointer; + void *hostPointer; + int isManaged; + unsigned allocationFlags; /* flags specified when memory was allocated*/ + /* peers? */ +} hipPointerAttribute_t; + + // hack to get these to show up in Doxygen: /** * @defgroup GlobalDefs Global enum and defines @@ -111,6 +135,7 @@ typedef struct hipDeviceProp_t { * @enum * @ingroup Enumerations */ +// Developer note - when updating these, update the hipErrorName and hipErrorString functions typedef enum hipError_t { hipSuccess = 0 ///< Successful completion. ,hipErrorMemoryAllocation ///< Memory allocation error. @@ -120,6 +145,8 @@ typedef enum hipError_t { ,hipErrorInvalidValue ///< One or more of the parameters passed to the API call is NULL or not in an acceptable range. ,hipErrorInvalidResourceHandle ///< Resource handle (hipEvent_t or hipStream_t) invalid. ,hipErrorInvalidDevice ///< DeviceID must be in range 0...#compute-devices. + ,hipErrorInvalidMemcpyDirection ///< Invalid memory copy direction + ,hipErrorNoDevice ///< Call to hipGetDeviceCount returned 0 devices ,hipErrorNotReady ///< Indicates that asynchronous operations enqueued earlier are not ready. This is not actually an error, but is used to distinguish from hipSuccess (which indicates completion). APIs that return this error include hipEventQuery and hipStreamQuery. ,hipErrorUnknown ///< Unknown error. diff --git a/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/LICENSE.txt b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/LICENSE.txt new file mode 100644 index 0000000000..5d0d603232 --- /dev/null +++ b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/LICENSE.txt @@ -0,0 +1,27 @@ + +Copyright (c) 2011, UT-Battelle, LLC +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +* Neither the name of Oak Ridge National Laboratory, nor UT-Battelle, LLC, nor + the names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/Makefile b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/Makefile new file mode 100644 index 0000000000..77a92fb1a6 --- /dev/null +++ b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/Makefile @@ -0,0 +1,17 @@ +HIP_PATH?=$(shell hipconfig -p) +HIPCC=$(HIP_PATH)/bin/hipcc + +EXE=hipBusBandwidth +CXXFLAGS = -O3 -g + +all: install + +$(EXE): hipBusBandwidth.cpp ResultDatabase.cpp + $(HIPCC) $(CXXFLAGS) $^ -o $@ + +install: $(EXE) + cp $(EXE) $(HIP_PATH)/bin + + +clean: + rm -f *.o $(EXE) diff --git a/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp new file mode 100644 index 0000000000..7d2f3aef84 --- /dev/null +++ b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp @@ -0,0 +1,523 @@ +#include "ResultDatabase.h" + +#include +#include +#include +#include + +using namespace std; + +bool ResultDatabase::Result::operator<(const Result &rhs) const +{ + if (test < rhs.test) + return true; + if (test > rhs.test) + return false; + if (atts < rhs.atts) + return true; + if (atts > rhs.atts) + return false; + return false; // less-operator returns false on equal +} + +double ResultDatabase::Result::GetMin() const +{ + double r = FLT_MAX; + for (int i=0; i= 100) + return value[n-1]; + + double index = ((n + 1.) * q / 100.) - 1; + + vector sorted = value; + sort(sorted.begin(), sorted.end()); + + if (n == 2) + return (sorted[0] * (1 - q/100.) + sorted[1] * (q/100.)); + + int index_lo = int(index); + double frac = index - index_lo; + if (frac == 0) + return sorted[index_lo]; + + double lo = sorted[index_lo]; + double hi = sorted[index_lo + 1]; + return lo + (hi-lo)*frac; +} + +double ResultDatabase::Result::GetMean() const +{ + double r = 0; + for (int i=0; i &values) +{ + for (int i=0; i= results.size()) + { + Result r; + r.test = test; + r.atts = atts; + r.unit = unit; + results.push_back(r); + } + + results[index].value.push_back(value); +} + +// **************************************************************************** +// Method: ResultDatabase::DumpDetailed +// +// Purpose: +// Writes the full results, including all trials. +// +// Arguments: +// out where to print +// +// Programmer: Jeremy Meredith +// Creation: August 14, 2009 +// +// Modifications: +// Jeremy Meredith, Wed Nov 10 14:25:17 EST 2010 +// Renamed to DumpDetailed to make room for a DumpSummary. +// +// Jeremy Meredith, Thu Nov 11 11:39:57 EST 2010 +// Added note about (*) missing value tag. +// +// Jeremy Meredith, Tue Nov 23 13:57:02 EST 2010 +// Changed note about missing values to be worded a little better. +// +// **************************************************************************** +void ResultDatabase::DumpDetailed(ostream &out) +{ + vector sorted(results); + + sort(sorted.begin(), sorted.end()); + + int maxtrials = 1; + for (int i=0; i maxtrials) + maxtrials = sorted[i].value.size(); + } + + // TODO: in big parallel runs, the "trials" are the procs + // and we really don't want to print them all out.... + out << "test\t" + << "atts\t" + << "units\t" + << "median\t" + << "mean\t" + << "stddev\t" + << "min\t" + << "max\t"; + for (int i=0; i sorted(results); + + int testW = 15 ; + const int fieldW = 9; + + sort(sorted.begin(), sorted.end()); + + out << std::fixed << right << std::setprecision(4); + + // TODO: in big parallel runs, the "trials" are the procs + // and we really don't want to print them all out.... + out << setw(testW) << "test\t" << setw(fieldW) + << "atts\t" + << "units\t" + << "median\t" + << "mean\t" + << "stddev\t" + << "min\t" + << "max\t"; + out << endl; + + for (int i=0; i sorted(results); + + sort(sorted.begin(), sorted.end()); + + //Check to see if the file is empty - if so, add the headers + emptyFile = this->IsFileEmpty(fileName); + + //Open file and append by default + ofstream out; + out.open(fileName.c_str(), std::ofstream::out | std::ofstream::app); + + //Add headers only for empty files + if(emptyFile) + { + // TODO: in big parallel runs, the "trials" are the procs + // and we really don't want to print them all out.... + out << "test, " + << "atts, " + << "units, " + << "median, " + << "mean, " + << "stddev, " + << "min, " + << "max, "; + out << endl; + } + + for (int i=0; i +ResultDatabase::GetResultsForTest(const string &test) +{ + // get only the given test results + vector retval; + for (int i=0; i & +ResultDatabase::GetResults() const +{ + return results; +} diff --git a/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/ResultDatabase.h b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/ResultDatabase.h new file mode 100644 index 0000000000..4b63a02a1f --- /dev/null +++ b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/ResultDatabase.h @@ -0,0 +1,100 @@ +#ifndef RESULT_DATABASE_H +#define RESULT_DATABASE_H + +#include +#include +#include +#include +#include +using std::string; +using std::vector; +using std::ostream; +using std::ofstream; +using std::ifstream; + + +// **************************************************************************** +// Class: ResultDatabase +// +// Purpose: +// Track numerical results as they are generated. +// Print statistics of raw results. +// +// Programmer: Jeremy Meredith +// Creation: June 12, 2009 +// +// Modifications: +// Jeremy Meredith, Wed Nov 10 14:20:47 EST 2010 +// Split timing reports into detailed and summary. E.g. for serial code, +// we might report all trial values, but skip them in parallel. +// +// Jeremy Meredith, Thu Nov 11 11:40:18 EST 2010 +// Added check for missing value tag. +// +// Jeremy Meredith, Mon Nov 22 13:37:10 EST 2010 +// Added percentile statistic. +// +// Jeremy Meredith, Fri Dec 3 16:30:31 EST 2010 +// Added a method to extract a subset of results based on test name. Also, +// the Result class is now public, so that clients can use them directly. +// Added a GetResults method as well, and made several functions const. +// +// **************************************************************************** +class ResultDatabase +{ + public: + // + // A performance result for a single SHOC benchmark run. + // + struct Result + { + string test; // e.g. "readback" + string atts; // e.g. "pagelocked 4k^2" + string unit; // e.g. "MB/sec" + vector value; // e.g. "837.14" + double GetMin() const; + double GetMax() const; + double GetMedian() const; + double GetPercentile(double q) const; + double GetMean() const; + double GetStdDev() const; + + bool operator<(const Result &rhs) const; + + bool HadAnyFLTMAXValues() const + { + for (int i=0; i= FLT_MAX) + return true; + } + return false; + } + }; + + protected: + vector results; + + public: + void AddResult(const string &test, + const string &atts, + const string &unit, + double value); + void AddResults(const string &test, + const string &atts, + const string &unit, + const vector &values); + vector GetResultsForTest(const string &test); + const vector &GetResults() const; + void ClearAllResults(); + void DumpDetailed(ostream&); + void DumpSummary(ostream&); + void DumpCsv(string fileName); + + private: + bool IsFileEmpty(string fileName); + +}; + + +#endif diff --git a/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp new file mode 100644 index 0000000000..d276725921 --- /dev/null +++ b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -0,0 +1,387 @@ +#include +#include +#include + +#include "ResultDatabase.h" + +// Cmdline parms: +bool p_verbose = false; +bool p_pinned = true; +int p_iterations = 10; +int p_device = 0; +int p_detailed = 0; + +bool p_h2d = true; +bool p_d2h = true; + + +#define CHECK_HIP_ERROR() \ +{ \ + hipError_t err = hipGetLastError(); \ + if (err != hipSuccess) \ + { \ + printf("error=%d name=%s at " \ + "ln: %d\n ",err,hipGetErrorString(err),__LINE__); \ + exit(EXIT_FAILURE); \ + } \ +} + + +// **************************************************************************** +// Function: runBenchmark +// +// Purpose: +// Measures the bandwidth of the bus connecting the host processor to the +// OpenCL device. This benchmark repeatedly transfers data chunks of various +// sizes across the bus to the OpenCL device, and calculates the bandwidth. +// +// +// Arguments: +// +// Returns: nothing +// +// Programmer: Jeremy Meredith +// Creation: September 08, 2009 +// +// Modifications: +// Jeremy Meredith, Wed Dec 1 17:05:27 EST 2010 +// Added calculation of latency estimate. +// Ben Sander - moved to standalone test +// +// **************************************************************************** +void RunBenchmark_H2D(ResultDatabase &resultDB) +{ + // Sizes are in kb + int sizes[] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384, 32768,65536,131072,262144,524288}; + int nSizes = sizeof(sizes) / sizeof(int); + + long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + + hipSetDevice(p_device); + + // Create some host memory pattern + float *hostMem = NULL; + if (p_pinned) + { + hipMallocHost((void**)&hostMem, sizeof(float) * numMaxFloats); + while (hipGetLastError() != hipSuccess) + { + // drop the size and try again + if (p_verbose) std::cout << " - dropping size allocating pinned mem\n"; + --nSizes; + if (nSizes < 1) + { + std::cerr << "Error: Couldn't allocated any pinned buffer\n"; + return; + } + numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + hipMallocHost((void**)&hostMem, sizeof(float) * numMaxFloats); + } + } + else + { + hostMem = new float[numMaxFloats]; + } + + for (int i = 0; i < numMaxFloats; i++) + { + hostMem[i] = i % 77; + } + + float *device; + hipMalloc((void**)&device, sizeof(float) * numMaxFloats); + while (hipGetLastError() != hipSuccess) + { + // drop the size and try again + if (p_verbose) std::cout << " - dropping size allocating device mem\n"; + --nSizes; + if (nSizes < 1) + { + std::cerr << "Error: Couldn't allocated any device buffer\n"; + return; + } + numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + hipMalloc((void**)&device, sizeof(float) * numMaxFloats); + } + + + hipEvent_t start, stop; + hipEventCreate(&start); + hipEventCreate(&stop); + CHECK_HIP_ERROR(); + + // Three passes, forward and backward both + for (int pass = 0; pass < p_iterations; pass++) + { + // store the times temporarily to estimate latency + //float times[nSizes]; + // Step through sizes forward on even passes and backward on odd + for (int i = 0; i < nSizes; i++) + { + int sizeIndex; + if ((pass % 2) == 0) + sizeIndex = i; + else + sizeIndex = (nSizes - 1) - i; + + int nbytes = sizes[sizeIndex] * 1024; + + hipEventRecord(start, 0); + hipMemcpy(device, hostMem, nbytes, hipMemcpyHostToDevice); + hipEventRecord(stop, 0); + hipEventSynchronize(stop); + float t = 0; + hipEventElapsedTime(&t, start, stop); + //times[sizeIndex] = t; + + // Convert to GB/sec + if (p_verbose) + { + std::cerr << "size " << sizes[sizeIndex] << "k took " << t << + " ms\n"; + } + + double speed = (double(sizes[sizeIndex]) * 1024. / (1000*1000)) / t; + char sizeStr[256]; + sprintf(sizeStr, "% 7dkB", sizes[sizeIndex]); + resultDB.AddResult("H2D_Bandwidth", sizeStr, "GB/sec", speed); + resultDB.AddResult("H2D_Time", sizeStr, "ms", t); + } + } + + // Cleanup + hipFree((void*)device); + CHECK_HIP_ERROR(); + if (p_pinned) + { + hipFreeHost((void*)hostMem); + CHECK_HIP_ERROR(); + } + else + { + delete[] hostMem; + } + hipEventDestroy(start); + hipEventDestroy(stop); +} + + +void RunBenchmark_D2H(ResultDatabase &resultDB) +{ + + // Sizes are in kb + int nSizes = 20; + int sizes[20] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384, + 32768,65536,131072,262144,524288}; + long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + + // Create some host memory pattern + float *hostMem1; + float *hostMem2; + if (p_pinned) + { + hipMallocHost((void**)&hostMem1, sizeof(float)*numMaxFloats); + hipError_t err1 = hipGetLastError(); + hipMallocHost((void**)&hostMem2, sizeof(float)*numMaxFloats); + hipError_t err2 = hipGetLastError(); + while (err1 != hipSuccess || err2 != hipSuccess) + { + // free the first buffer if only the second failed + if (err1 == hipSuccess) + hipFreeHost((void*)hostMem1); + + // drop the size and try again + if (p_verbose) std::cout << " - dropping size allocating pinned mem\n"; + --nSizes; + if (nSizes < 1) + { + std::cerr << "Error: Couldn't allocated any pinned buffer\n"; + return; + } + numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + hipMallocHost((void**)&hostMem1, sizeof(float)*numMaxFloats); + err1 = hipGetLastError(); + hipMallocHost((void**)&hostMem2, sizeof(float)*numMaxFloats); + err2 = hipGetLastError(); + } + } + else + { + hostMem1 = new float[numMaxFloats]; + hostMem2 = new float[numMaxFloats]; + } + for (int i=0; i= argc || !parseInt(argv[i], &p_iterations)) { + failed("Bad iterations argument"); + } + } else if (!strcmp(arg, "--device") || (!strcmp(arg, "-d"))) { + if (++i >= argc || !parseInt(argv[i], &p_device)) { + failed("Bad device argument"); + } + } else if (!strcmp(arg, "--unpinned")) { + p_pinned = 0; + } else if (!strcmp(arg, "--h2d")) { + p_h2d = true; + p_d2h = false; + + } else if (!strcmp(arg, "--d2h")) { + p_h2d = false; + p_d2h = true; + + } else if (!strcmp(arg, "--help") || (!strcmp(arg, "-h"))) { + help(); + + } else if (!strcmp(arg, "--verbose")) { + p_verbose = 1; + } else if (!strcmp(arg, "--detailed")) { + p_detailed = 1; + } else { + failed("Bad argument '%s'", arg); + } + } + + return 0; +}; + + + +int main(int argc, char *argv[]) +{ + parseStandardArguments(argc, argv); + + if (p_h2d) { + ResultDatabase resultDB; + RunBenchmark_H2D(resultDB); + + resultDB.DumpSummary(std::cout); + + if (p_detailed) { + resultDB.DumpDetailed(std::cout); + } + } + + if (p_d2h) { + ResultDatabase resultDB; + RunBenchmark_D2H(resultDB); + + resultDB.DumpSummary(std::cout); + + if (p_detailed) { + resultDB.DumpDetailed(std::cout); + } + } +} diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index beba7c2775..8a0d0df1d4 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -31,6 +31,8 @@ THE SOFTWARE. #include #include #include +#include + #include #include @@ -38,32 +40,60 @@ THE SOFTWARE. #include "hsa_ext_amd.h" -#define USE_PINNED_HOST (__hcc_workweek__ >= 1601) -//#define USE_ASYNC_COPY + +#define USE_AM_TRACKER 1 /* >0 = use new AM memory tracker features. */ +#define USE_ROCR_V2 1 /* use the ROCR v2 async copy API with dst and src agents */ + +#if (USE_AM_TRACKER) and (__hcc_workweek__ < 16074) +#error (USE_AM_TRACKER requries HCC version of 16074 or newer) +#endif + + +#if (USE_ROCR_V2) and (USE_AM_TRACKER == 0) +#error (USE_ROCR_V2 requires USE_AM_TRACKER>0) +#endif + + #define INLINE static inline //--- // Environment variables: -// TODO-HCC - map this to the HC instruction that uses HSAIL to get the wave size. -int warpSize = 64; // Intended to distinguish whether an environment variable should be visible only in debug mode, or in debug+release. //static const int debug = 0; static const int release = 1; -int HIP_PRINT_ENV = 0; -int HIP_TRACE_API= 0; +#define ENABLE_CHECKS 1 + int HIP_LAUNCH_BLOCKING = 0; -#define TRACE_API 0x1 /* trace API calls and return values */ -#define TRACE_SYNC 0x2 /* trace synchronization pieces */ -#define TRACE_MEM 0x4 /* trace memory allocation / deallocation */ +int HIP_PRINT_ENV = 0; +int HIP_TRACE_API= 0; +int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */ +int HIP_STAGING_BUFFERS = 2; +int HIP_STREAM_SIGNALS = 2; /* number of signals to allocate at stream creation */ + + +//--- +// Chicken bits for disabling functionality to work around potential issues: +int HIP_DISABLE_ENQ_BARRIER = 1; +int HIP_DISABLE_BIDIR_MEMCPY = 1; +int HIP_ONESHOT_COPY_DEP = 1; // this is a good thing + + +//--- +//Debug flags: +#define TRACE_API 0x01 /* trace API calls and return values */ +#define TRACE_SYNC 0x02 /* trace synchronization pieces */ +#define TRACE_MEM 0x04 /* trace memory allocation / deallocation */ +#define TRACE_COPY2 0x08 /* trace memory copy commands. Detailed. */ +#define TRACE_SIGNAL 0x10 /* trace signal pool commands */ #define tprintf(trace_level, ...) {\ if (HIP_TRACE_API & trace_level) {\ - fprintf (stderr, "hiptrace%d: ", trace_level); \ + fprintf (stderr, "hiptrace%x: ", trace_level); \ fprintf (stderr, __VA_ARGS__);\ }\ } @@ -75,21 +105,77 @@ struct ihipDevice_t; enum ihipCommand_t { ihipCommandKernel, - ihipCommandData, + ihipCommandCopyH2D, + ihipCommandCopyD2H, }; +const char* ihipCommandName[] = { + "Kernel", "CopyH2D", "CopyD2H" +}; + + + +typedef uint64_t SIGSEQNUM; + +//--- +// Small wrapper around signals. +// Designed to be used from stream. +// TODO-someday refactor this class so it can be stored in a vector<> +// we already store the index here so we can use for garbage collection. +struct ihipSignal_t { + hsa_signal_t _hsa_signal; // hsa signal handle + int _index; // Index in pool, used for garbage collection. + SIGSEQNUM _sig_id; // unique sequentially increasing ID. + + ihipSignal_t(); + ~ihipSignal_t(); + + inline void release(); +}; + + + + // Internal stream structure. -struct ihipStream_t { - unsigned _device_index; +class ihipStream_t { +public: + + ihipStream_t(unsigned device_index, hc::accelerator_view av, unsigned int flags); + ~ihipStream_t(); + + inline void reclaimSignals(SIGSEQNUM sigNum); + inline void waitAndReclaimOlder(ihipSignal_t *signal); + inline void wait(); + + inline ihipDevice_t * getDevice() const; + + ihipSignal_t * getSignal() ; + + inline bool preKernelCommand(); + inline void postKernelCommand(hc::completion_future &kernel_future); + inline int copyCommand(ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType); + + inline void resetToEmpty(); + + inline SIGSEQNUM lastCopySeqId() { return _last_copy_signal ? _last_copy_signal->_sig_id : 0; }; + + //--- hc::accelerator_view _av; unsigned _flags; - ihipCommand_t _last_command; +private: + void enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal); - //ihipStream_t() : _av(){ }; - ihipStream_t(unsigned device_index, hc::accelerator_view av, unsigned int flags) : - _device_index(device_index), _av(av), _flags(flags), _last_command(ihipCommandKernel) - {}; -} ; + unsigned _device_index; + ihipCommand_t _last_command_type; // type of the last command + ihipSignal_t *_last_copy_signal; // signal of last copy command sent to the stream. Copy can be either H2D or D2H. + hc::completion_future _last_kernel_future; // Completion future of last kernel command sent to GPU. + + int _signalCursor; + + SIGSEQNUM _stream_sig_id; // Monotonically increasing unique signal id. + SIGSEQNUM _oldest_live_sig_id; // oldest live seq_id, anything < this can be allocated. + std::deque _signalPool; // Pool of signals for use by this stream. +}; @@ -112,9 +198,34 @@ struct ihipEvent_t { hc::completion_future _marker; uint64_t _timestamp; // store timestamp, may be set on host or by marker. + + SIGSEQNUM _copy_seq_id; } ; +//------------------------------------------------------------------------------------------------- +struct StagingBuffer { + + static const int _max_buffers = 4; + + StagingBuffer(ihipDevice_t *device, size_t bufferSize, int numBuffers) ; + ~StagingBuffer(); + + void CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); + void CopyHostToDevice(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); + +private: + ihipDevice_t *_device; + size_t _bufferSize; // Size of the buffers. + int _numBuffers; + + char *_pinnedStagingBuffer[_max_buffers]; + hsa_signal_t _completion_signal[_max_buffers]; +}; + + + +//------------------------------------------------------------------------------------------------- struct ihipDevice_t { unsigned _device_index; // index into g_devices. @@ -131,20 +242,305 @@ struct ihipDevice_t unsigned _compute_units; + hsa_signal_t _copy_signal; // signal to use for copies + std::mutex _copy_lock[2]; // mutex for each direction. + StagingBuffer *_staging_buffer[2]; // one buffer for each direction. + public: - ihipDevice_t(unsigned device_index, hc::accelerator acc); + void reset(); + void init(unsigned device_index, hc::accelerator acc); hipError_t getProperties(hipDeviceProp_t* prop); - // TODO- create a copy constructor. - //~ihipDevice_t(); + ~ihipDevice_t(); }; //================================================================================================= -ihipDevice_t::ihipDevice_t(unsigned device_index, hc::accelerator acc) - : _device_index(device_index), - _acc(acc) +// Global Data Structures: +//================================================================================================= +//TLS - must be initialized here. +thread_local hipError_t tls_lastHipError = hipSuccess; +thread_local int tls_defaultDevice = 0; + +// Global initialization. +std::once_flag hip_initialized; +ihipDevice_t *g_devices; // Array of all non-emulated (ie GPU) accelerators in the system. +unsigned g_deviceCnt; +//================================================================================================= + + +//================================================================================================= +//Forward Declarations: +//================================================================================================= +INLINE bool ihipIsValidDevice(unsigned deviceIndex); + +//================================================================================================= +// Implementation: +//================================================================================================= + + +//================================================================================================= +// ihipSignal_t: +//================================================================================================= +// +//--- +ihipSignal_t::ihipSignal_t() : _sig_id(0) { + if (hsa_signal_create(0/*value*/, 0, NULL, &_hsa_signal) != HSA_STATUS_SUCCESS) { + throw; + } + tprintf (TRACE_SIGNAL, " allocated hsa_signal=%lu\n", (_hsa_signal.handle)); +} + +//--- +ihipSignal_t::~ihipSignal_t() +{ + tprintf (TRACE_SIGNAL, " destroy hsa_signal #%lu (#%lu)\n", (_hsa_signal.handle), _sig_id); + if (hsa_signal_destroy(_hsa_signal) != HSA_STATUS_SUCCESS) { + throw; // TODO + } +}; + + + +//================================================================================================= +// ihipStream_t: +//================================================================================================= +//--- +ihipStream_t::ihipStream_t(unsigned device_index, hc::accelerator_view av, unsigned int flags) : + _av(av), + _flags(flags), + _device_index(device_index), + _last_copy_signal(0), + _signalCursor(0), + _stream_sig_id(0), + _oldest_live_sig_id(1) +{ + tprintf(TRACE_SYNC, " streamCreate: stream=%p\n", this); + _signalPool.resize(HIP_STREAM_SIGNALS > 0 ? HIP_STREAM_SIGNALS : 1); + + resetToEmpty(); +}; + + +//--- +ihipStream_t::~ihipStream_t() +{ + _signalPool.clear(); +} + + +//--- +// Reset the stream to "empty" - next command will not set up an inpute dependency on any older signal. +void ihipStream_t::resetToEmpty() +{ + _last_command_type = ihipCommandCopyH2D; + _last_copy_signal = NULL; +} + +//--- +void ihipStream_t::reclaimSignals(SIGSEQNUM sigNum) +{ + tprintf(TRACE_SIGNAL, "reclaim signal #%lu\n", sigNum); + // Mark all signals older and including this one as available for + _oldest_live_sig_id = sigNum+1; +} + + +//--- +void ihipStream_t::waitAndReclaimOlder(ihipSignal_t *signal) +{ + hsa_signal_wait_acquire(_last_copy_signal->_hsa_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + + reclaimSignals(_last_copy_signal->_sig_id); + +} + + +//--- +//Wait for all queues kernels in the associated accelerator_view to complete. +void ihipStream_t::wait() +{ + tprintf (TRACE_SYNC, "stream %p wait for queue-empty and lastCopy:#%lu...\n", this, _last_copy_signal ? _last_copy_signal->_sig_id: 0x0 ); + _av.wait(); + if (_last_copy_signal) { + this->waitAndReclaimOlder(_last_copy_signal); + } + + resetToEmpty(); +}; + + +//--- +inline ihipDevice_t * ihipStream_t::getDevice() const +{ + if (ihipIsValidDevice(_device_index)) { + return &g_devices[_device_index]; + } else { + return NULL; + } +}; + + +//--- +// Allocate a new signal from the signal pool. +// Returned signals have value of 0. +// Signals are intended for use in this stream and are always reclaimed "in-order". +ihipSignal_t *ihipStream_t::getSignal() +{ + int numToScan = _signalPool.size(); + do { + auto thisCursor = _signalCursor; + if (++_signalCursor == _signalPool.size()) { + _signalCursor = 0; + } + + if (_signalPool[thisCursor]._sig_id < _oldest_live_sig_id) { + _signalPool[thisCursor]._index = thisCursor; + _signalPool[thisCursor]._sig_id = ++_stream_sig_id; // allocate it. + + + return &_signalPool[thisCursor]; + } + + } while (--numToScan) ; + + assert(numToScan == 0); + + // Have to grow the pool: + _signalCursor = _signalPool.size(); // set to the beginning of the new entries: + _signalPool.resize(_signalPool.size() * 2); + tprintf (TRACE_SIGNAL, "grow signal pool to %zu entries, cursor=%d\n", _signalPool.size(), _signalCursor); + return getSignal(); // try again, + + // Should never reach here. + assert(0); +} + + +//--- +void ihipStream_t::enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal) +{ + + // Obtain the write index for the command queue + uint64_t index = hsa_queue_load_write_index_relaxed(queue); + const uint32_t queueMask = queue->size - 1; + + // Define the barrier packet to be at the calculated queue index address + hsa_barrier_and_packet_t* barrier = &(((hsa_barrier_and_packet_t*)(queue->base_address))[index&queueMask]); + memset(barrier, 0, sizeof(hsa_barrier_and_packet_t)); + + // setup header + uint16_t header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; + header |= 1 << HSA_PACKET_HEADER_BARRIER; + //header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; + //header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; + barrier->header = header; + + barrier->dep_signal[0] = depSignal->_hsa_signal; + + barrier->completion_signal.handle = 0; + + // TODO - check queue overflow, return error: + // Increment write index and ring doorbell to dispatch the kernel + hsa_queue_store_write_index_relaxed(queue, index+1); + hsa_signal_store_relaxed(queue->doorbell_signal, index); +} + + +//-- +//When the commands in a stream change types (ie kernel command follows a data command, +//or data command follows a kernel command), then we need to add a barrier packet +//into the stream to mimic CUDA stream semantics. (some hardware uses separate +//queues for data commands and kernel commands, and no implicit ordering is provided). +// +inline bool ihipStream_t::preKernelCommand() +{ + bool addedSync = false; + // If switching command types, we need to add a barrier packet to synchronize things. + if (_last_command_type != ihipCommandKernel) { + if (_last_copy_signal) { + addedSync = true; + + hsa_queue_t * q = (hsa_queue_t*)_av.get_hsa_queue(); + if (! HIP_DISABLE_ENQ_BARRIER) { + this->enqueueBarrier(q, _last_copy_signal); + tprintf (TRACE_SYNC, "stream %p switch %s to %s (barrier pkt inserted with wait on #%lu)\n", + this, ihipCommandName[_last_command_type], ihipCommandName[ihipCommandKernel], _last_copy_signal->_sig_id) + + } else { + tprintf (TRACE_SYNC, "stream %p switch %s to %s (wait for previous...)\n", + this, ihipCommandName[_last_command_type], ihipCommandName[ihipCommandKernel]); + this->waitAndReclaimOlder(_last_copy_signal); + } + } + _last_command_type = ihipCommandKernel; + } + + return addedSync; +} + + +//--- +inline void ihipStream_t::postKernelCommand(hc::completion_future &kernelFuture) +{ + _last_kernel_future = kernelFuture; +}; + + + +//--- +// Called whenever a copy command is set to the stream. +// Examines the last command sent to this stream and returns a signal to wait on, if required. +inline int ihipStream_t::copyCommand(ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType) +{ + int needSync = 0; + + waitSignal->handle = 0; + // If switching command types, we need to add a barrier packet to synchronize things. + if (_last_command_type != copyType) { + + + if (_last_command_type == ihipCommandKernel) { + tprintf (TRACE_SYNC, "stream %p switch %s to %s (async copy dep on prev kernel)\n", + this, ihipCommandName[_last_command_type], ihipCommandName[copyType]); + needSync = 1; + hsa_signal_t *hsaSignal = (static_cast (_last_kernel_future.get_native_handle())); + if (hsaSignal) { + *waitSignal = * hsaSignal; + } + } else if (_last_copy_signal) { + needSync = 1; + tprintf (TRACE_SYNC, "stream %p switch %s to %s (async copy dep on other copy #%lu)\n", + this, ihipCommandName[_last_command_type], ihipCommandName[copyType], _last_copy_signal->_sig_id); + *waitSignal = _last_copy_signal->_hsa_signal; + } + + _last_command_type = copyType; + } + + _last_copy_signal = lastCopy; + + return needSync; +} + + +//================================================================================================= +// +//Reset the device - this is called from hipDeviceReset. +//Device may be reset multiple times, and may be reset after init. +void ihipDevice_t::reset() +{ + _staging_buffer[0] = new StagingBuffer(this, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS); + _staging_buffer[1] = new StagingBuffer(this, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS); +}; + + +//--- +void ihipDevice_t::init(unsigned device_index, hc::accelerator acc) +{ + _device_index = device_index; + _acc = acc; hsa_agent_t *agent = static_cast (acc.get_default_view().get_hsa_agent()); if (agent) { int err = hsa_agent_get_info(*agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &_compute_units); @@ -162,30 +558,30 @@ ihipDevice_t::ihipDevice_t(unsigned device_index, hc::accelerator acc) _null_stream = new ihipStream_t(device_index, acc.get_default_view(), hipStreamDefault); this->_streams.push_back(_null_stream); tprintf(TRACE_SYNC, "created device with null_stream=%p\n", _null_stream); + + hsa_signal_create(0, 0, NULL, &_copy_signal); + + this->reset(); }; -#if 0 + ihipDevice_t::~ihipDevice_t() { if (_null_stream) { delete _null_stream; _null_stream = NULL; } + + for (int i=0; i<2; i++) { + if (_staging_buffer[i]) { + delete _staging_buffer[i]; + } + } + hsa_signal_destroy(_copy_signal); } -#endif //---- -//================================================================================================= -//TLS - must be initialized here. -thread_local hipError_t tls_lastHipError = hipSuccess; -thread_local int tls_defaultDevice = 0; - -// Global initialization. -std::once_flag hip_initialized; -std::vector g_devices; // Vector of all non-emulated (ie GPU) accelerators in the system. - -//================================================================================================= @@ -338,8 +734,7 @@ hipError_t ihipDevice_t::getProperties(hipDeviceProp_t* prop) DeviceErrorCheck(err); prop->l2CacheSize = cache_size[1]; - /* Computemode for HSA Devices is always : cudaComputeModeDefault :/ - Default compute mode (Multiple threads can use cudaSetDevice() with this device) */ + /* Computemode for HSA Devices is always : cudaComputeModeDefault */ prop->computeMode = 0; // Get Max Threads Per Multiprocessor @@ -464,24 +859,44 @@ void ihipReadEnv_I(int *var_ptr, const char *var_name1, const char *var_name2, c //It is called with C++11 call_once, which provided thread-safety. void ihipInit() { + /* + * Environment variables + */ + READ_ENV_I(release, HIP_PRINT_ENV, 0, "Print HIP environment variables."); + //-- READ HIP_PRINT_ENV env first, since it has impact on later env var reading + + READ_ENV_I(release, HIP_LAUNCH_BLOCKING, CUDA_LAUNCH_BLOCKING, "Make HIP APIs 'host-synchronous', so they block until any kernel launches or data copy commands complete. Alias: CUDA_LAUNCH_BLOCKING." ); + READ_ENV_I(release, HIP_TRACE_API, 0, "Trace each HIP API call. Print function name and return code to stderr as program executes."); + READ_ENV_I(release, HIP_STAGING_SIZE, 0, "Size of each staging buffer (in KB)" ); + READ_ENV_I(release, HIP_STAGING_BUFFERS, 0, "Number of staging buffers to use in each direction"); + READ_ENV_I(release, HIP_STREAM_SIGNALS, 0, "Number of signals to allocate when new stream is created (signal pool will grow on demand)"); + + READ_ENV_I(release, HIP_DISABLE_ENQ_BARRIER, 0, "Disable enqueue of barrier packet - instead wait for copy completion on host."); + READ_ENV_I(release, HIP_DISABLE_BIDIR_MEMCPY, 0, "Disable simultaneous H2D memcpy and D2H memcpy to same device"); + READ_ENV_I(release, HIP_ONESHOT_COPY_DEP, 0, "If set, only set the copy input dependency for the first copy command in a staged copy. If clear, set the dep for each copy."); /* * Build a table of valid compute devices. */ auto accs = hc::accelerator::get_all(); - g_devices.reserve(accs.size()); + int deviceCnt = 0; for (int i=0; i"); @@ -490,7 +905,7 @@ void ihipInit() INLINE bool ihipIsValidDevice(unsigned deviceIndex) { // deviceIndex is unsigned so always > 0 - return (deviceIndex < g_devices.size()); + return (deviceIndex < g_deviceCnt); } @@ -509,7 +924,7 @@ INLINE ihipDevice_t *ihipGetTlsDefaultDevice() //--- INLINE ihipDevice_t *ihipGetDevice(int deviceId) { - if ((deviceId >= 0) && (deviceId < g_devices.size())) { + if ((deviceId >= 0) && (deviceId < g_deviceCnt)) { return &g_devices[deviceId]; } else { return NULL; @@ -524,7 +939,7 @@ static inline void ihipWaitAllStreams(ihipDevice_t *device) { tprintf(TRACE_SYNC, "waitAllStream\n"); for (auto streamI=device->_streams.begin(); streamI!=device->_streams.end(); streamI++) { - (*streamI)->_av.wait(); + (*streamI)->wait(); } } @@ -540,7 +955,7 @@ inline void ihipWaitNullStream(ihipDevice_t *device) if (!(stream->_flags & hipStreamNonBlocking)) { // TODO-hcc - use blocking or active wait here? // TODO-sync - cudaDeviceBlockingSync - stream->_av.wait(); + stream->wait(); } } } @@ -563,78 +978,35 @@ inline hipStream_t ihipSyncAndResolveStream(hipStream_t stream) } } -#if 0 -inline hsa_status_t -HSABarrier::enqueueBarrier(hsa_queue_t* queue) { - hsa_status_t status = HSA_STATUS_SUCCESS; - - hc::completion_future marker = stream->_av.create_marker(); - - // Create a signal to wait for the barrier to finish. - std::pair ret = Kalmar::ctx.getSignal(); - signal = ret.first; - signalIndex = ret.second; - - // Obtain the write index for the command queue - uint64_t index = hsa_queue_load_write_index_relaxed(queue); - const uint32_t queueMask = queue->size - 1; - - // Define the barrier packet to be at the calculated queue index address - hsa_barrier_and_packet_t* barrier = &(((hsa_barrier_and_packet_t*)(queue->base_address))[index&queueMask]); - memset(barrier, 0, sizeof(hsa_barrier_and_packet_t)); - - // setup header - uint16_t header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; - header |= 1 << HSA_PACKET_HEADER_BARRIER; - header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; - header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; - barrier->header = header; - - barrier->completion_signal = signal; - - // Increment write index and ring doorbell to dispatch the kernel - hsa_queue_store_write_index_relaxed(queue, index+1); - hsa_signal_store_relaxed(queue->doorbell_signal, index); - - isDispatched = true; - - return status; -} -#endif - -//-- -//When the commands in a stream change types (ie kernel command follows a data command, -//or data command follows a kernel command), then we need to add a barrier packet -//into the stream to mimic CUDA stream semantics. (some hardware uses separate -//queues for data commands and kernel commands, and no implicit ordering is provided). -// -inline bool ihipCheckCommandSwitchSync(hipStream_t stream, ihipCommand_t new_command, hc::completion_future *marker) -{ - bool addedSync = false; - // If switching command types, we need to add a barrier packet to synchronize things. - if (stream->_last_command != new_command) { - addedSync = true; - *marker = stream->_av.create_marker(); - - tprintf (TRACE_SYNC, "stream %p switch to %s (barrier pkt inserted)\n", (void*)stream, new_command == ihipCommandKernel ? "Kernel" : "Data"); - stream->_last_command = new_command; - } - - return addedSync; -} + + + + +// TODO - data-up to data-down: // Called just before a kernel is launched from hipLaunchKernel. // Allows runtime to track some information about the stream. -hc::accelerator_view *ihipLaunchKernel(hipStream_t stream) +hipStream_t ihipPreLaunchKernel(hipStream_t stream, hc::accelerator_view **av) { - stream = ihipSyncAndResolveStream(stream); - hc::completion_future marker; - ihipCheckCommandSwitchSync(stream, ihipCommandKernel, &marker); + stream->preKernelCommand(); - return &(stream->_av); + *av = &stream->_av; + + return (stream); +} + + +//--- +//Called after kernel finishes execution. +void ihipPostLaunchKernel(hipStream_t stream, hc::completion_future &kernelFuture) +{ + stream->postKernelCommand(kernelFuture); + if (HIP_LAUNCH_BLOCKING) { + tprintf(TRACE_SYNC, " stream:%p LAUNCH_BLOCKING for kernel completion\n", stream); + } } @@ -676,7 +1048,7 @@ hipError_t hipGetDeviceCount(int *count) { std::call_once(hip_initialized, ihipInit); - *count = g_devices.size(); + *count = g_deviceCnt; if (*count > 0) { return ihipLogStatus(hipSuccess); @@ -765,7 +1137,7 @@ hipError_t hipSetDevice(int device) { std::call_once(hip_initialized, ihipInit); - if ((device < 0) || (device > g_devices.size())) { + if ((device < 0) || (device >= g_deviceCnt)) { return ihipLogStatus(hipErrorInvalidDevice); } else { tls_defaultDevice = device; @@ -802,6 +1174,14 @@ hipError_t hipDeviceReset(void) // It should destroy and clean up all resources allocated with the default device in the current process. // and needs to destroy all queues as well. // +#if USE_AM_TRACKER + // TODO - remove bug above. + ihipDevice_t *device = ihipGetTlsDefaultDevice(); + if (device) { + am_memtracker_reset(device->_acc); + device->reset(); // re-allocate required resources. + } +#endif return ihipLogStatus(hipSuccess); } @@ -941,6 +1321,7 @@ const char *hipGetErrorName(hipError_t hip_error) case hipErrorInvalidValue : return "hipErrorInvalidValue"; case hipErrorInvalidResourceHandle : return "hipErrorInvalidResourceHandle"; case hipErrorInvalidDevice : return "hipErrorInvalidDevice"; + case hipErrorInvalidMemcpyDirection : return "hipErrorInvalidMemcpyDirection"; case hipErrorNoDevice : return "hipErrorNoDevice"; case hipErrorNotReady : return "hipErrorNotReady"; case hipErrorUnknown : return "hipErrorUnknown"; @@ -1004,7 +1385,7 @@ hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int { // Super-conservative version of this - TODO - remove me: - stream->_av.wait(); + stream->wait(); e = hipSuccess; } @@ -1022,7 +1403,7 @@ hipError_t hipStreamSynchronize(hipStream_t stream) ihipDevice_t *device = ihipGetTlsDefaultDevice(); ihipWaitNullStream(device); } else { - stream->_av.wait(); + stream->wait(); e = hipSuccess; } @@ -1041,20 +1422,16 @@ hipError_t hipStreamDestroy(hipStream_t stream) hipError_t e = hipSuccess; - if (ihipIsValidDevice(stream->_device_index)) { - - ihipDevice_t *device = &g_devices[stream->_device_index]; + ihipDevice_t *device = stream->getDevice(); + if (device) { device->_streams.remove(stream); - delete stream; - - e = hipSuccess; } else { e = hipErrorInvalidResourceHandle; } - return ihipLogStatus(hipSuccess); + return ihipLogStatus(e); } @@ -1095,6 +1472,8 @@ hipError_t hipEventCreateWithFlags(hipEvent_t* event, unsigned flags) eh->_state = hipEventStatusCreated; eh->_stream = NULL; eh->_flags = flags; + eh->_timestamp = 0; + eh->_copy_seq_id = 0; } else { e = hipErrorInvalidValue; } @@ -1129,6 +1508,7 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) // Clear timestamps eh->_timestamp = 0; eh->_marker = stream->_av.create_marker(); + eh->_copy_seq_id = stream->lastCopySeqId(); return ihipLogStatus(hipSuccess); } @@ -1176,6 +1556,8 @@ hipError_t hipEventSynchronize(hipEvent_t event) #else eh->_marker.wait(); #endif + eh->_stream->reclaimSignals(eh->_copy_seq_id); + return ihipLogStatus(hipSuccess); } } else { @@ -1191,7 +1573,7 @@ void ihipSetTs(hipEvent_t e) // already recorded, done: return; } else { - // Test this code: + // TODO - use completion-future functions to obtain ticks and timestamps: hsa_signal_t *sig = static_cast (eh->_marker.get_native_handle()); if (sig) { if (hsa_signal_load_acquire(*sig) == 0) { @@ -1272,13 +1654,95 @@ hipError_t hipEventQuery(hipEvent_t event) // Memory // // +// + +//--- +/** + * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDevice + */ +hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) +{ + std::call_once(hip_initialized, ihipInit); + + hipError_t e = hipSuccess; + +#if USE_AM_TRACKER + hc::accelerator acc; + hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0); + am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr); + if (status == AM_SUCCESS) { + + attributes->memoryType = amPointerInfo._isInDeviceMem ? hipMemoryTypeDevice: hipMemoryTypeHost; + attributes->hostPointer = amPointerInfo._hostPointer; + attributes->devicePointer = amPointerInfo._devicePointer; + attributes->isManaged = 0; + + attributes->allocationFlags = amPointerInfo._appAllocationFlags; + attributes->device = amPointerInfo._appId; + + if (attributes->device < 0) { + e = hipErrorInvalidDevice; + } + + + } else { + attributes->memoryType = hipMemoryTypeDevice; + attributes->hostPointer = 0; + attributes->devicePointer = 0; + attributes->device = -1; + attributes->isManaged = 0; + attributes->allocationFlags = 0; + + e = hipErrorInvalidValue; + } +#else + e = hipErrorInvalidValue; +#endif + + return ihipLogStatus(e); +} + + +#if USE_AM_TRACKER +// TODO - test this function: +/** + * @returns #hipSuccess, + * @returns #hipErrorInvalidValue if flags are not 0 + * @returns #hipErrorMemoryAllocation if hostPointer is not a tracked allocation. + */ +hipError_t hipHostGetDevicePointer(void **devicePointer, void *hostPointer, unsigned flags) +{ + std::call_once(hip_initialized, ihipInit); + + hipError_t e = hipSuccess; + + // Flags must be 0: + if (flags == 0) { + e = hipErrorInvalidValue; + } else { + hc::accelerator acc; + hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0); + am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, hostPointer); + if (status == AM_SUCCESS) { + *devicePointer = amPointerInfo._devicePointer; + } else { + e = hipErrorMemoryAllocation; + *devicePointer = NULL; + } + } + + return ihipLogStatus(e); +} +#endif + + // kernel for launching memcpy operations: template hc::completion_future ihipMemcpyKernel(hipStream_t stream, T * c, const T * a, size_t sizeBytes) { - int wg = std::min((unsigned)8, g_devices[stream->_device_index]._compute_units); + int wg = std::min((unsigned)8, stream->getDevice()->_compute_units); const int threads_per_wg = 256; int threads = wg * threads_per_wg; @@ -1315,7 +1779,7 @@ template hc::completion_future ihipMemsetKernel(hipStream_t stream, T * ptr, T val, size_t sizeBytes) { - int wg = std::min((unsigned)8, g_devices[stream->_device_index]._compute_units); + int wg = std::min((unsigned)8, stream->getDevice()->_compute_units); const int threads_per_wg = 256; int threads = wg * threads_per_wg; @@ -1348,24 +1812,33 @@ ihipMemsetKernel(hipStream_t stream, T * ptr, T val, size_t sizeBytes) } //--- +/** + * @returns #hipSuccess #hipErrorMemoryAllocation + */ hipError_t hipMalloc(void** ptr, size_t sizeBytes) { std::call_once(hip_initialized, ihipInit); hipError_t hip_status = hipSuccess; - const unsigned am_flags = 0; - *ptr = hc::am_alloc(sizeBytes, ihipGetTlsDefaultDevice()->_acc, am_flags); + auto device = ihipGetTlsDefaultDevice(); - if (*ptr == NULL) { - hip_status = hipErrorMemoryAllocation; + if (device) { + const unsigned am_flags = 0; + *ptr = hc::am_alloc(sizeBytes, device->_acc, am_flags); + + if (sizeBytes && (*ptr == NULL)) { + hip_status = hipErrorMemoryAllocation; + } else { +#if USE_AM_TRACKER + hc::am_memtracker_update(*ptr, device->_device_index, 0); +#endif + } } else { - hip_status = hipSuccess; + hip_status = hipErrorMemoryAllocation; } - ihipLogStatus(hip_status); - - return hip_status; + return ihipLogStatus(hip_status); } @@ -1373,40 +1846,29 @@ hipError_t hipMallocHost(void** ptr, size_t sizeBytes) { std::call_once(hip_initialized, ihipInit); -#if USE_PINNED_HOST + hipError_t hip_status = hipSuccess; const unsigned am_flags = amHostPinned; + auto device = ihipGetTlsDefaultDevice(); - *ptr = hc::am_alloc(sizeBytes, ihipGetTlsDefaultDevice()->_acc, am_flags); - hipError_t hip_status = hipSuccess; - if (*ptr == NULL) { - hip_status = hipErrorMemoryAllocation; - } else { - hip_status = hipSuccess; - } - - tprintf (TRACE_MEM, " %s: pinned ptr=%p\n", __func__, *ptr); - - ihipLogStatus(hip_status); - - return hip_status; - -#else - // TODO-hcc remove-me - - // This code only works on Kaveri: - *ptr = malloc(sizeBytes); // TODO - call am_alloc for device memory, this will only on KV HSA. - if (*ptr != NULL) { - //TODO-hsart : need memory pin APIs to implement this correctly. - // FOr now do our best to allocate the memory, but return an error since - // the returned pointer can only be used on the HOST not the GPU. - return ihipLogStatus(hipErrorMemoryAllocation); - } else { - return ihipLogStatus(hipErrorMemoryAllocation); - } + if (device) { + *ptr = hc::am_alloc(sizeBytes, device->_acc, am_flags); + if (sizeBytes && (*ptr == NULL)) { + hip_status = hipErrorMemoryAllocation; + } else { +#if USE_AM_TRACKER + hc::am_memtracker_update(*ptr, device->_device_index, 0); #endif + } + + tprintf (TRACE_MEM, " %s: pinned ptr=%p\n", __func__, *ptr); + } + + return ihipLogStatus(hip_status); + } +//--- hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t count, size_t offset, hipMemcpyKind kind) { #ifdef USE_MEMCPYTOSYMBOL @@ -1416,8 +1878,9 @@ hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t cou } auto device = ihipGetTlsDefaultDevice(); - hc::completion_future marker; - ihipCheckCommandSwitchSync(device._null_stream, ihipCommandData, &marker); + //hsa_signal_t depSignal; + //int depSignalCnt = device._null_stream->copyCommand(NULL, &depSignal, ihipCommandCopyH2D); + assert(0); // Need to properly synchronize the copy - do something with depSignal if != NULL. device->_acc.memcpy_symbol(symbolName, (void*) src,count, offset); #endif @@ -1425,6 +1888,247 @@ hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t cou } +//------------------------------------------------------------------------------------------------- +StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize, int numBuffers) : + _device(device), + _bufferSize(bufferSize), + _numBuffers(numBuffers > _max_buffers ? _max_buffers : numBuffers) +{ + + + + for (int i=0; i<_numBuffers; i++) { + // TODO - experiment with alignment here. + _pinnedStagingBuffer[i] = hc::am_alloc(_bufferSize, device->_acc, amHostPinned); + if (_pinnedStagingBuffer[i] == NULL) { + throw; + } + hsa_signal_create(0, 0, NULL, &_completion_signal[i]); + } +}; + +//--- +StagingBuffer::~StagingBuffer() +{ + for (int i=0; i<_numBuffers; i++) { + if (_pinnedStagingBuffer[i]) { + hc::am_free(_pinnedStagingBuffer[i]); + _pinnedStagingBuffer[i] = NULL; + } + hsa_signal_destroy(_completion_signal[i]); + } +} + + +//--- +//Copies sizeBytes from src to dst, using either a copy to a staging buffer or a staged pin-in-place strategy +//IN: dst - dest pointer - must be accessible from host CPU. +//IN: src - src pointer for copy. Must be accessible from agent this buffer is associated with (via _device) +//IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. +void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) +{ + const char *srcp = static_cast (src); + char *dstp = static_cast (dst); + + for (int i=0; i<_numBuffers; i++) { + hsa_signal_store_relaxed(_completion_signal[i], 0); + } + + assert(sizeBytes < UINT64_MAX/2); // TODO + int bufferIndex = 0; + for (int64_t bytesRemaining=sizeBytes; bytesRemaining>0 ; bytesRemaining -= _bufferSize) { + + size_t theseBytes = (bytesRemaining > _bufferSize) ? _bufferSize : bytesRemaining; + + tprintf (TRACE_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completion_signal[bufferIndex].handle); + hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + + tprintf (TRACE_COPY2, "H2D: bytesRemaining=%zu: copy %zu bytes %p to stagingBuf[%d]:%p\n", bytesRemaining, theseBytes, srcp, bufferIndex, _pinnedStagingBuffer[bufferIndex]); + // TODO - use uncached memcpy, someday. + memcpy(_pinnedStagingBuffer[bufferIndex], srcp, theseBytes); + + + hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); + +#if USE_ROCR_V2 + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _device->_hsa_agent, _pinnedStagingBuffer[bufferIndex], _device->_hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); +#else + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _pinnedStagingBuffer[bufferIndex], theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[bufferIndex]); +#endif + tprintf (TRACE_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status); + + assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw + + srcp += theseBytes; + dstp += theseBytes; + if (++bufferIndex >= _numBuffers) { + bufferIndex = 0; + } + + if (HIP_ONESHOT_COPY_DEP) { + waitFor = NULL; // TODO - don't need dependency after first copy submitted? + } + } + + + for (int i=0; i<_numBuffers; i++) { + hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + } +} + +//--- +//Copies sizeBytes from src to dst, using either a copy to a staging buffer or a staged pin-in-place strategy +//IN: dst - dest pointer - must be accessible from agent this buffer is assocaited with (via _device). +//IN: src - src pointer for copy. Must be accessible from host CPU. +//IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. +void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) +{ + const char *srcp0 = static_cast (src); + char *dstp1 = static_cast (dst); + + for (int i=0; i<_numBuffers; i++) { + hsa_signal_store_relaxed(_completion_signal[i], 0); + } + + assert(sizeBytes < UINT64_MAX/2); // TODO + + int64_t bytesRemaining0 = sizeBytes; // bytes to copy from dest into staging buffer. + int64_t bytesRemaining1 = sizeBytes; // bytes to copy from staging buffer into final dest + + while (bytesRemaining1 > 0) { + // First launch the async copies to copy from dest to host + for (int bufferIndex = 0; (bytesRemaining0>0) && (bufferIndex < _numBuffers); bytesRemaining0 -= _bufferSize, bufferIndex++) { + + size_t theseBytes = (bytesRemaining0 > _bufferSize) ? _bufferSize : bytesRemaining0; + + tprintf (TRACE_COPY2, "D2H: bytesRemaining0=%zu async_copy %zu bytes src:%p to staging:%p\n", bytesRemaining0, theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]); + hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); +#if USE_ROCR_V2 + hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _device->_hsa_agent, srcp0, _device->_hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); +#else + hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], srcp0, theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[bufferIndex]); +#endif + assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw + + srcp0 += theseBytes; + + + if (HIP_ONESHOT_COPY_DEP) { + waitFor = NULL; // TODO - don't need dependency after first copy submitted? + } + } + + // Now unload the staging buffers: + for (int bufferIndex=0; (bytesRemaining1>0) && (bufferIndex < _numBuffers); bytesRemaining1 -= _bufferSize, bufferIndex++) { + + size_t theseBytes = (bytesRemaining1 > _bufferSize) ? _bufferSize : bytesRemaining1; + + tprintf (TRACE_COPY2, "D2H: wait_completion[%d] bytesRemaining=%zu\n", bufferIndex, bytesRemaining1); + hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + + tprintf (TRACE_COPY2, "D2H: bytesRemaining1=%zu copy %zu bytes stagingBuf[%d]:%p to dst:%p\n", bytesRemaining1, theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1); + memcpy(dstp1, _pinnedStagingBuffer[bufferIndex], theseBytes); + + dstp1 += theseBytes; + } + } + + + //for (int i=0; i<_numBuffers; i++) { + // hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + //} +} + + + + +#if USE_AM_TRACKER +void ihipSyncCopy(ihipStream_t *stream, void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) +{ + ihipDevice_t *device = stream->getDevice(); + + if (device == NULL) { + throw; + } + + hc::accelerator acc; + hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0); + hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0); + + bool dstNotTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) != AM_SUCCESS); + bool srcNotTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) != AM_SUCCESS); + + bool useStagingBuffer = true; + + // Resolve default to a specific Kind so we know which algorithm to use: + if (kind == hipMemcpyDefault) { + bool dstIsHost = (dstNotTracked || !dstPtrInfo._isInDeviceMem); + bool srcIsHost = (srcNotTracked || !srcPtrInfo._isInDeviceMem); + if (srcIsHost && !dstIsHost) { + kind = hipMemcpyHostToDevice; + } else if (!srcIsHost && dstIsHost) { + kind = hipMemcpyDeviceToHost; + } else if (srcIsHost && dstIsHost) { + kind = hipMemcpyHostToHost; + } else if (srcIsHost && dstIsHost) { + kind = hipMemcpyDeviceToDevice; + } + } + + + if ((kind == hipMemcpyHostToDevice) && (srcNotTracked)) { + if (useStagingBuffer) { + std::lock_guard l (device->_copy_lock[0]); + //printf ("staged-copy- read dep signals\n"); + + hsa_signal_t depSignal; + int depSignalCnt = stream->copyCommand(NULL, &depSignal, ihipCommandCopyH2D); + device->_staging_buffer[0]->CopyHostToDevice(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + + // The copy waits for inputs and then completes before returning. + stream->resetToEmpty(); + } else { + // TODO - remove, slow path. + hc::am_copy(dst, src, sizeBytes); + } + } else if ((kind == hipMemcpyDeviceToHost) && (dstNotTracked)) { + if (useStagingBuffer) { + std::lock_guard l (device->_copy_lock[HIP_DISABLE_BIDIR_MEMCPY ? 0:1]); + //printf ("staged-copy- read dep signals\n"); + hsa_signal_t depSignal; + int depSignalCnt = stream->copyCommand(NULL, &depSignal, ihipCommandCopyD2H); + device->_staging_buffer[1]->CopyDeviceToHost(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + } else { + // TODO - remove, slow path. + hc::am_copy(dst, src, sizeBytes); + } + } else if (kind == hipMemcpyHostToHost) { + memcpy(dst, src, sizeBytes); + + } else { + // Let HSA runtime handle it: + // TODO - need buffer pool for the signals: + + device->_copy_lock[HIP_DISABLE_BIDIR_MEMCPY? 0:1].lock(); + + hsa_signal_store_relaxed(device->_copy_signal, 1); +#if USE_ROCR_V2 + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, device->_hsa_agent, src, device->_hsa_agent, sizeBytes, 0, NULL, device->_copy_signal); +#else + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, sizeBytes, device->_hsa_agent, 0, NULL, device->_copy_signal); +#endif + + if (hsa_status == HSA_STATUS_SUCCESS) { + hsa_signal_wait_relaxed(device->_copy_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + } + + device->_copy_lock[HIP_DISABLE_BIDIR_MEMCPY ? 0:1].unlock(); + + } +} +#endif + + //--- hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) { @@ -1433,83 +2137,114 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull); hc::completion_future marker; - ihipCheckCommandSwitchSync(stream, ihipCommandData, &marker); hipError_t e = hipSuccess; -#ifdef USE_ASYNC_COPY - if (ihipIsValidDevice(stream->_device_index)) { - - ihipDevice_t *device = &g_devices[stream->_device_index]; - - hsa_signal_t completion_signal; // init/obtain from pool. - - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, size, device->_hsa_agent, 0, NULL, &completion_signal); - - e = (hsa_status == HSA_STATUS_SUCCESS) ? hipSuccess : hipErrorTbd; - } else { +#if USE_AM_TRACKER + try { + ihipSyncCopy(stream, dst, src, sizeBytes, kind); + } + catch (...) { e = hipErrorInvalidResourceHandle; } #else - // TODO-hsart - what synchronization does hsa_copy provide? hc::am_copy(dst, src, sizeBytes); e = hipSuccess; #endif - // TODO - when am_copy becomes async, and we have HIP_LAUNCH_BLOCKING set, then we would wait for copy operation to complete here. - return ihipLogStatus(e); } -//--- -/* +#if USE_AM_TRACKER==0 +/** * @warning on HCC hipMemcpyAsync uses a synchronous copy. */ +#endif +/** + * @result #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidMemcpyDirection, #hipErrorInvalidValue + */ +//--- hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) { std::call_once(hip_initialized, ihipInit); hipError_t e = hipSuccess; - stream = ihipSyncAndResolveStream(stream); + stream = ihipSyncAndResolveStream(stream); - hc::completion_future marker; - ihipCheckCommandSwitchSync(stream, ihipCommandData, &marker); +#if USE_AM_TRACKER + if (stream) { + ihipDevice_t *device = stream->getDevice(); - // Dispatch async memory copy to synchronize with items in the specified stream. + if (device == NULL) { + e = hipErrorInvalidDevice; - // Async - need to set up dependency on the last command queued to the device? + } else if (kind == hipMemcpyDefault) { + e = hipErrorInvalidMemcpyDirection; - // TODO-hsart This routine needs to ensure that dst and src are mapped on the GPU. - // This is a synchronous copy - remove and replace with code below when we have appropriate LOCK APIs. - hc::am_copy(dst, src, sizeBytes); + } else if (kind == hipMemcpyHostToHost) { + tprintf (TRACE_COPY2, "H2H copy with memcpy"); -#if 0 + memcpy(dst, src, sizeBytes); - hipStream_t s =ihipGetStream(stream); + } else { + ihipSignal_t *ihip_signal = stream->getSignal(); + hsa_signal_store_relaxed(ihip_signal->_hsa_signal, 1); + + ihipCommand_t copyType; + if ((kind == hipMemcpyHostToDevice) || (kind == hipMemcpyDeviceToDevice)) { + copyType = ihipCommandCopyH2D; + } else if (kind == hipMemcpyDeviceToHost) { + copyType = ihipCommandCopyD2H; + } else { + e = hipErrorInvalidMemcpyDirection; + copyType = ihipCommandCopyD2H; + } + +#if USE_ROCR_V2 + hsa_signal_t depSignal; + int depSignalCnt = stream->copyCommand(ihip_signal, &depSignal, copyType); + + tprintf (TRACE_SYNC, " copy-async, waitFor=%lu completion=#%lu(%lu)\n", depSignalCnt? depSignal.handle:0x0, ihip_signal->_sig_id, ihip_signal->_hsa_signal.handle); + + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, device->_hsa_agent, src, device->_hsa_agent, sizeBytes, depSignalCnt, depSignalCnt ? &depSignal:0x0, ihip_signal->_hsa_signal); +#else + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, sizeBytes, device->_hsa_agent, 0, NULL, ihip_signal->_hsa_signal); +#endif - if (s) { - hc::completion_future cf = ihipMemcpyKernel (s, static_cast (dst), static_cast (src), sizeBytes); - - //cf.wait(); - - e = hipSuccess; + if (hsa_status == HSA_STATUS_SUCCESS) { + // TODO-stream - fix release-signal calls here. + if (HIP_LAUNCH_BLOCKING) { + tprintf(TRACE_SYNC, "LAUNCH_BLOCKING for completion of hipMemcpyAsync(%zu)\n", sizeBytes); + stream->wait(); + } + } else { + // This path can be hit if src or dst point to unpinned host memory. + // TODO-stream - does async-copy fall back to sync if input pointers are not pinned? + e = hipErrorInvalidValue; + } + } } else { e = hipErrorInvalidValue; } +#else + // TODO-hsart This routine needs to ensure that dst and src are mapped on the GPU. + // This is a synchronous copy - remove and replace with code below when we have appropriate LOCK APIs. + hc::am_copy(dst, src, sizeBytes); #endif - // TODO - if am_copy becomes async, and we have HIP_LAUNCH_BLOCKING set, then we would wait for copy operation to complete here. return ihipLogStatus(e); } // TODO-sync: function is async unless target is pinned host memory - then these are fully sync. +/** @return #hipErrorInvalidValue + */ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t stream ) { std::call_once(hip_initialized, ihipInit); @@ -1517,37 +2252,42 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s hipError_t e = hipSuccess; stream = ihipSyncAndResolveStream(stream); - hc::completion_future marker; - ihipCheckCommandSwitchSync(stream, ihipCommandData, &marker); + stream->preKernelCommand(); + if (stream) { - hc::completion_future cf ; + hc::completion_future cf ; - if ((sizeBytes & 0x3) == 0) { - // use a faster word-per-workitem copy: - try { - value = value & 0xff; - unsigned value32 = (value << 24) | (value << 16) | (value << 8) | (value) ; - cf = ihipMemsetKernel (stream, static_cast (dst), value32, sizeBytes/sizeof(unsigned)); - } - catch (std::exception &ex) { - e = hipErrorInvalidValue; + if ((sizeBytes & 0x3) == 0) { + // use a faster word-per-workitem copy: + try { + value = value & 0xff; + unsigned value32 = (value << 24) | (value << 16) | (value << 8) | (value) ; + cf = ihipMemsetKernel (stream, static_cast (dst), value32, sizeBytes/sizeof(unsigned)); + } + catch (std::exception &ex) { + e = hipErrorInvalidValue; + } + } else { + // use a slow byte-per-workitem copy: + try { + cf = ihipMemsetKernel (stream, static_cast (dst), value, sizeBytes); + } + catch (std::exception &ex) { + e = hipErrorInvalidValue; + } } + + stream->postKernelCommand(cf); + + + if (HIP_LAUNCH_BLOCKING) { + tprintf (TRACE_SYNC, "'%s' LAUNCH_BLOCKING wait for completion [stream:%p].\n", __func__, (void*)stream); + cf.wait(); + tprintf (TRACE_SYNC, "'%s' LAUNCH_BLOCKING completed [stream:%p].\n", __func__, (void*)stream); + } } else { - // use a slow byte-per-workitem copy: - try { - cf = ihipMemsetKernel (stream, static_cast (dst), value, sizeBytes); - } - catch (std::exception &ex) { - e = hipErrorInvalidValue; - } - } - - - if (HIP_LAUNCH_BLOCKING) { - tprintf (TRACE_SYNC, "'%s' LAUNCH_BLOCKING wait for completion [stream:%p].\n", __func__, (void*)stream); - cf.wait(); - tprintf (TRACE_SYNC, "'%s' LAUNCH_BLOCKING completed [stream:%p].\n", __func__, (void*)stream); + e = hipErrorInvalidValue; } @@ -1557,15 +2297,16 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s hipError_t hipMemset(void* dst, int value, size_t sizeBytes ) { + // TODO - call an ihip memset so HIP_TRACE is correct. return hipMemsetAsync(dst, value, sizeBytes, hipStreamNull); } /* - * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue (if free != NULL due to bug) - * @bug - on hcc free always returns 50% of peak regardless of current allocations. hipMemGetInfo returns hipErrorInvalidValue to indicate this. + * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue (if free != NULL due to bug)S + * @warning On HCC, the free memory only accounts for memory allocated by this process and may be optimistic. */ -hipError_t hipMemGetInfo ( size_t * free, size_t * total ) +hipError_t hipMemGetInfo (size_t *free, size_t *total) { std::call_once(hip_initialized, ihipInit); @@ -1578,23 +2319,29 @@ hipError_t hipMemGetInfo ( size_t * free, size_t * total ) } if (free) { - *free = hipDevice->_props.totalGlobalMem * 0.5; // TODO +#if USE_AM_TRACKER + // TODO - replace with kernel-level for reporting free memory: + size_t deviceMemSize, hostMemSize, userMemSize; + hc::am_memtracker_sizeinfo(hipDevice->_acc, &deviceMemSize, &hostMemSize, &userMemSize); + *free = hipDevice->_props.totalGlobalMem - deviceMemSize; +#else + *free = hipDevice->_props.totalGlobalMem * 0.5; // TODO e=hipErrorInvalidValue; +#endif } } else { e = hipErrorInvalidDevice; } - // TODO-runtime - when we fix the 50% bug. - //return ihipLogStatus(hipErrorSuccess); - return ihipLogStatus(hipErrorInvalidValue); + return ihipLogStatus(e); } //--- hipError_t hipFree(void* ptr) { + // TODO - ensure this pointer was created by hipMalloc and not hipMallocHost std::call_once(hip_initialized, ihipInit); @@ -1611,15 +2358,12 @@ hipError_t hipFree(void* ptr) hipError_t hipFreeHost(void* ptr) { + // TODO - ensure this pointer was created by hipMallocHost and not hipMalloc std::call_once(hip_initialized, ihipInit); if (ptr) { -#if USE_PINNED_HOST tprintf (TRACE_MEM, " %s: %p\n", __func__, ptr); hc::am_free(ptr); -#else - free(ptr); -#endif } return ihipLogStatus(hipSuccess); @@ -1741,3 +2485,5 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a hipError_t err = hipSuccess; return ihipLogStatus(err); } + +// TODO - review signal / error reporting code. diff --git a/projects/clr/hipamd/tests/src/CMakeLists.txt b/projects/clr/hipamd/tests/src/CMakeLists.txt index 0ec287b334..09c0ca7162 100644 --- a/projects/clr/hipamd/tests/src/CMakeLists.txt +++ b/projects/clr/hipamd/tests/src/CMakeLists.txt @@ -19,8 +19,10 @@ MESSAGE ("HIP_PATH=" ${HIP_PATH}) if (${HIP_PLATFORM} STREQUAL "hcc") MESSAGE ("HIP_PLATFORM=hcc") - set (HC_PATH ${HIP_PATH}/hc) - set (HSA_PATH /opt/hsa) + set (HSA_PATH $ENV{HSA_PATH}) + if (NOT DEFINED HSA_PATH) + set (HSA_PATH /opt/hsa) + endif() #--- # Add HSA library: @@ -30,7 +32,7 @@ if (${HIP_PLATFORM} STREQUAL "hcc") #These includes are used for all files. #Include HIP and HC since the tests need both of these: #Note below HSA path is surgically included only where necessary. - include_directories(${HIP_PATH}/include ${HC_PATH}/include) + include_directories(${HIP_PATH}/include) # hip_hcc.o: add_library(hip_hcc OBJECT ${HIP_PATH}/src/hip_hcc.cpp) @@ -105,6 +107,7 @@ make_hip_executable (hip_brev hip_brev.cpp) make_hip_executable (hip_ffs hip_ffs.cpp) make_hip_executable (hipGetDeviceAttribute hipGetDeviceAttribute.cpp) make_hip_executable (hipMemcpy hipMemcpy.cpp) +make_hip_executable (hipMemcpyAsync hipMemcpyAsync.cpp) make_hip_executable (hipMemset hipMemset.cpp) make_hip_executable (hipEventRecord hipEventRecord.cpp) make_hip_executable (hipLanguageExtensions hipLanguageExtensions.cpp) @@ -114,6 +117,7 @@ make_hip_executable (hipSimpleAtomicsTest hipSimpleAtomicsTest.cpp) make_hip_executable (hipMathFunctionsHost hipMathFunctions.cpp hipSinglePrecisionMathHost.cpp hipDoublePrecisionMathHost.cpp) make_hip_executable (hipMathFunctionsDevice hipMathFunctions.cpp hipSinglePrecisionMathDevice.cpp hipDoublePrecisionMathDevice.cpp) make_hip_executable (hipIntrinsics hipMathFunctions.cpp hipSinglePrecisionIntrinsics.cpp hipDoublePrecisionIntrinsics.cpp hipIntegerIntrinsics.cpp) +make_hip_executable (hipPointerAttrib hipPointerAttrib.cpp) target_link_libraries(hipMathFunctionsHost m) make_test(hip_ballot " " ) @@ -128,8 +132,10 @@ make_test(hipMemset --N 10 --memsetval 0x42 ) # small copy, just 10 bytes. make_test(hipMemset --N 10013 --memsetval 0x5a ) # oddball size. make_test(hipMemset --N 256M --memsetval 0xa6 ) # big copy make_test(hipGridLaunch " " ) +make_test(hipPointerAttrib " " ) make_test(hipMemcpy " " ) +make_test(hipMemcpyAsync " " ) make_test(hipHcc " " ) diff --git a/projects/clr/hipamd/tests/src/hipMemcpy.cpp b/projects/clr/hipamd/tests/src/hipMemcpy.cpp index 5db2b270d6..8286454098 100644 --- a/projects/clr/hipamd/tests/src/hipMemcpy.cpp +++ b/projects/clr/hipamd/tests/src/hipMemcpy.cpp @@ -23,24 +23,28 @@ THE SOFTWARE. #include "test_common.h" - -int main(int argc, char *argv[]) +void printSep() { - HipTest::parseStandardArguments(argc, argv, true); + printf ("======================================================================================\n"); +} +//--- +// Test simple H2D copies and back. +// Designed to stress a small number of simple smoke tests +void simpleTest1() +{ + printf ("test: %s\n", __func__); size_t Nbytes = N*sizeof(int); - - printf ("N=%zu \n", N); + printf ("N=%zu Nbytes=%6.2fMB\n", N, Nbytes/1024.0/1024.0); int *A_d, *B_d, *C_d; int *A_h, *B_h, *C_h; - HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N); - + HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + printf ("A_d=%p B_d=%p C_d=%p A_h=%p B_h=%p C_h=%p\n", A_d, B_d, C_d, A_h, B_d, C_h); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); - HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); @@ -50,8 +54,212 @@ int main(int argc, char *argv[]) HIPCHECK (hipDeviceSynchronize()); - HipTest::checkVectorADD(A_h, B_h, C_h, N); + + HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, false); + HIPCHECK (hipDeviceReset()); + + printf (" %s success\n", __func__); +} + + +//--- +// Test many different kinds of memory copies. +// THe subroutine allocates memory , copies to device, runs a vector add kernel, copies back, and checks the result. +// +// IN: numElements controls the number of elements used for allocations. +// IN: usePinnedHost : If true, allocate host with hipMallocHost and is pinned ; else allocate host memory with malloc. +// IN: useHostToHost : If true, add an extra host-to-host copy. +// IN: useDeviceToDevice : If true, add an extra deviceto-device copy after result is produced. +// IN: useMemkindDefault : If true, use memkinddefault (runtime figures out direction). if false, use explicit memcpy direction. +// +template +void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) +{ + size_t sizeElements = numElements * sizeof(T); + printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n", + __func__, + typeid(T).name(), + sizeElements, sizeElements/1024.0/1024.0, + usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); + + + T *A_d, *B_d, *C_d; + T *A_h, *B_h, *C_h; + + + HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, numElements, usePinnedHost); + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); + + T *A_hh = NULL; + T *B_hh = NULL; + T *C_dd = NULL; + + + + if (useHostToHost) { + if (usePinnedHost) { + HIPCHECK ( hipMallocHost(&A_hh, sizeElements) ); + HIPCHECK ( hipMallocHost(&B_hh, sizeElements) ); + } else { + A_hh = (T*)malloc(sizeElements); + B_hh = (T*)malloc(sizeElements); + } + + + // Do some extra host-to-host copies here to mix things up: + HIPCHECK ( hipMemcpy(A_hh, A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + HIPCHECK ( hipMemcpy(B_hh, B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + + + HIPCHECK ( hipMemcpy(A_d, A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(B_d, B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + } else { + HIPCHECK ( hipMemcpy(A_d, A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(B_d, B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + } + + hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); + + if (useDeviceToDevice) { + HIPCHECK ( hipMalloc(&C_dd, sizeElements) ); + + // Do an extra device-to-device copies here to mix things up: + HIPCHECK ( hipMemcpy(C_dd, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice)); + + //Destroy the original C_d: + HIPCHECK ( hipMemset(C_d, 0x5A, sizeElements)); + + HIPCHECK ( hipMemcpy(C_h, C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); + } else { + HIPCHECK ( hipMemcpy(C_h, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); + } + + HIPCHECK ( hipDeviceSynchronize() ); + HipTest::checkVectorADD(A_h, B_h, C_h, numElements); + + HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost); + + printf (" %s success\n", __func__); +} + + +//--- +//Try all the 16 possible combinations to memcpytest2 - usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault +template +void memcpytest2_loop(size_t numElements) +{ + printSep(); + + for (int usePinnedHost =0; usePinnedHost<=1; usePinnedHost++) { + for (int useHostToHost =0; useHostToHost<=1; useHostToHost++) { // TODO + for (int useDeviceToDevice =0; useDeviceToDevice<=1; useDeviceToDevice++) { + for (int useMemkindDefault =0; useMemkindDefault<=1; useMemkindDefault++) { + memcpytest2(numElements, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); + } + } + } + } +} + + +//--- +//Try many different sizes to memory copy. +template +void memcpytest2_sizes(size_t maxElem=0, size_t offset=0) +{ + printSep(); + printf ("test: %s<%s>\n", __func__, typeid(T).name()); + + int deviceId; + HIPCHECK(hipGetDevice(&deviceId)); + + size_t free, total; + HIPCHECK(hipMemGetInfo(&free, &total)); + + if (maxElem == 0) { + maxElem = free/sizeof(T)/5; + } + + printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB offset=%lu\n", + deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0, offset); + + for (size_t elem=64; elem+offset<=maxElem; elem*=2) { + HIPCHECK ( hipDeviceReset() ); + memcpytest2(elem+offset, 0, 1, 1, 0); // unpinned host + HIPCHECK ( hipDeviceReset() ); + memcpytest2(elem+offset, 1, 1, 1, 0); // pinned host + } +} + + +//--- +//Create multiple threads to stress multi-thread locking behavior in the allocation/deallocation/tracking logic: +template +void multiThread_1(bool serialize, bool usePinnedHost) +{ + printSep(); + printf ("test: %s<%s> serialize=%d usePinnedHost=%d\n", __func__, typeid(T).name(), serialize, usePinnedHost); + std::thread t1 (memcpytest2,N, usePinnedHost,0,0,0); + if (serialize) { + t1.join(); + } + + + std::thread t2 (memcpytest2,N, usePinnedHost,0,0,0); + if (serialize) { + t2.join(); + } + + if (!serialize) { + t1.join(); + t2.join(); + } +} + + + +int main(int argc, char *argv[]) +{ + HipTest::parseStandardArguments(argc, argv, true); + + printf ("info: set device to %d\n", p_gpuDevice); + HIPCHECK(hipSetDevice(p_gpuDevice)); + + + if (p_tests & 0x1) { + HIPCHECK ( hipDeviceReset() ); + simpleTest1(); + } + + if (p_tests & 0x2) { + HIPCHECK ( hipDeviceReset() ); + memcpytest2_loop(N); + memcpytest2_loop(N); + memcpytest2_loop(N); + memcpytest2_loop(N); + } + + if (p_tests & 0x4) { + HIPCHECK ( hipDeviceReset() ); + printSep(); + memcpytest2_sizes(0,0); + printSep(); + memcpytest2_sizes(0,64); + printSep(); + memcpytest2_sizes(1024*1024, 13); + printSep(); + memcpytest2_sizes(1024*1024, 50); + } + + if (p_tests & 0x8) { + HIPCHECK ( hipDeviceReset() ); + printSep(); + multiThread_1(true, true); + multiThread_1(false, true); + multiThread_1(false, false); // TODO + } + passed(); } diff --git a/projects/clr/hipamd/tests/src/hipMemcpyAsync.cpp b/projects/clr/hipamd/tests/src/hipMemcpyAsync.cpp new file mode 100644 index 0000000000..4b92e2fc1e --- /dev/null +++ b/projects/clr/hipamd/tests/src/hipMemcpyAsync.cpp @@ -0,0 +1,349 @@ +// Test under-development. Calls async mem-copy API, experiment with functionality. + +#include "hip_runtime.h" +#include "test_common.h" + +unsigned p_streams = 2; + + +void simpleNegTest() +{ + printf ("testing: %s\n",__func__); + hipError_t e; + float *A_malloc, *A_pinned, *A_d; + + size_t Nbytes = N*sizeof(float); + A_malloc = (float*)malloc(Nbytes); + HIPCHECK(hipMallocHost(&A_pinned, Nbytes)); + HIPCHECK(hipMalloc(&A_d, Nbytes)); + + + // Can't use default with async copy + e = hipMemcpyAsync(A_pinned, A_d, Nbytes, hipMemcpyDefault, NULL); + HIPASSERT (e==hipErrorInvalidMemcpyDirection); // TODO + HIPASSERT (e!= hipSuccess); + + + // Not sure what happens here, the memory must be pinned. + e = hipMemcpyAsync(A_malloc, A_d, Nbytes, hipMemcpyHostToDevice, NULL); + + printf (" async memcpy of A_malloc to A_d. Result=%d\n", e); + //HIPASSERT (e==hipErrorInvalidValue); +} + +class Pinned; +class Unpinned; + +template struct HostTraits; + +template<> +struct HostTraits +{ + static const char *Name() { return "Pinned"; } ; + + static void *Alloc(size_t sizeBytes) { + void *p; + HIPCHECK(hipMallocHost(&p, sizeBytes)); + return p; + }; +}; + + +template +__global__ void +addK (hipLaunchParm lp, T *A, T K, size_t numElements) +{ + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; + + for (size_t i=offset; i +void test_pingpong(hipStream_t stream, size_t numElements, int numInflight, int numPongs, bool doHostSide) +{ + HIPASSERT(numElements % numInflight == 0); // Must be evenly divisible. + size_t Nbytes = numElements*sizeof(T); + size_t eachCopyElements = numElements / numInflight; + size_t eachCopyBytes = eachCopyElements * sizeof(T); + + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); + + printf ("-----------------------------------------------------------------------------------------------\n"); + printf ("testing: %s<%s> Nbytes=%zu (%6.1f MB) numPongs=%d numInflight=%d eachCopyElements=%zu eachCopyBytes=%zu\n", + __func__, HostTraits::Name(), Nbytes, (double)(Nbytes)/1024.0/1024.0, numPongs, numInflight, eachCopyElements, eachCopyBytes); + + T *A_h; + T *A_d; + + A_h = (T*)(HostTraits::Alloc(Nbytes)); + HIPCHECK(hipMalloc(&A_d, Nbytes)); + + // Initialize the host array: + const T initValue = 13; + const T deviceConst = 2; + const T hostConst = 10000; + for (size_t i=0; i, dim3(blocks), dim3(threadsPerBlock), 0, stream, A_d, 2, numElements); + + for (int i=0; i (i); + } + + + //stream=0; // fixme TODO + + + for (int i=0; i= argc || !HipTest::parseUInt(argv[i], &p_streams)) { + failed("Bad streams argument"); + } + } else { + failed("Bad argument '%s'", arg); + } + }; +}; + + + + +int main(int argc, char *argv[]) +{ + HipTest::parseStandardArguments(argc, argv, true); + parseMyArguments(argc, argv); + + + printf ("info: set device to %d\n", p_gpuDevice); + HIPCHECK(hipSetDevice(p_gpuDevice)); + + if (p_tests & 0x01) { + simpleNegTest(); + } + + if (p_tests & 0x02) { + hipStream_t stream; + HIPCHECK (hipStreamCreate(&stream)); + + test_manyInflightCopies(stream, 1024, 16, true); + test_manyInflightCopies(stream, 1024, 4, true); // verify we re-use the same entries instead of growing pool. + test_manyInflightCopies(stream, 1024*8, 64, false); + + HIPCHECK(hipStreamDestroy(stream)); + } + + + if (p_tests & 0x04) { + test_chunkedAsyncExample(p_streams, true, true, true); // Easy sync version + test_chunkedAsyncExample(p_streams, false, true, true); // Easy sync version + test_chunkedAsyncExample(p_streams, false, false, true); // Some async + test_chunkedAsyncExample(p_streams, false, false, false); // All async + } + + if (p_tests & 0x08) { + hipStream_t stream; + HIPCHECK (hipStreamCreate(&stream)); + + test_pingpong(stream, 1024*1024*32, 1, 1, false); + test_pingpong(stream, 1024*1024*32, 1, 10, false); + + HIPCHECK(hipStreamDestroy(stream)); + } + + + passed(); + +} diff --git a/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp b/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp new file mode 100644 index 0000000000..6928ec9a64 --- /dev/null +++ b/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp @@ -0,0 +1,524 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +// Test pointer tracking logic: allocate memory and retrieve stats with hipPointerGetAttributes + +#include "hip_runtime.h" +#include "test_common.h" + +#ifdef __HIP_PLATFORM_HCC__ +//#include "hcc_detail/AM.h" +#include "hc_am.hpp" + +#endif + +size_t Nbytes = 0; + +//================================================================================================= +// Utility Functions: +//================================================================================================= + +bool operator==(const hipPointerAttribute_t &lhs, const hipPointerAttribute_t &rhs) +{ + return ((lhs.hostPointer == rhs.hostPointer) && + (lhs.devicePointer == rhs.devicePointer) && + (lhs.memoryType == rhs.memoryType) && + (lhs.device == rhs.device) && + (lhs.allocationFlags == rhs.allocationFlags) + ) ; + +}; + + +bool operator!=(const hipPointerAttribute_t &lhs, const hipPointerAttribute_t &rhs) +{ + return ! (lhs == rhs); +} + + +const char *memoryTypeToString(hipMemoryType memoryType) +{ + switch (memoryType) { + case hipMemoryTypeHost : return "[Host]"; + case hipMemoryTypeDevice : return "[Device]"; + default: return "[Unknown]"; + }; +} + + +void resetAttribs(hipPointerAttribute_t *attribs) +{ + attribs->hostPointer = (void*) (-1); + attribs->devicePointer = (void*) (-1); + attribs->memoryType = hipMemoryTypeHost; + attribs->device = -2; + attribs->isManaged = -1; + attribs->allocationFlags = 0xffff; +}; + + +void printAttribs(const hipPointerAttribute_t *attribs) +{ + printf ("hostPointer:%p devicePointer:%p memoryType:%s deviceId:%d isManaged:%d allocationFlags:%u\n", + attribs->hostPointer, + attribs->devicePointer, + memoryTypeToString(attribs->memoryType), + attribs->device, + attribs->isManaged, + attribs->allocationFlags + ); +}; + + +inline int zrand(int max) +{ + return rand() % max; +} + + +//================================================================================================= +// Functins to run tests +//================================================================================================= +//-- +//Run through a couple simple cases to test lookups and host pointer arithmetic: +void testSimple() +{ + printf ("\n"); + printf ("===========================================================================\n"); + printf ("Simple Tests\n"); + printf ("===========================================================================\n"); + + char *A_d; + char *A_Pinned_h; + char *A_OSAlloc_h; + hipError_t e; + + HIPCHECK ( hipMalloc(&A_d, Nbytes) ); + HIPCHECK ( hipMallocHost(&A_Pinned_h, Nbytes) ); + A_OSAlloc_h = (char*)malloc(Nbytes); + + size_t free, total; + HIPCHECK(hipMemGetInfo(&free, &total)); + printf ("hipMemGetInfo: free=%zu (%4.2f) Nbytes=%lu total=%zu (%4.2f)\n", free, (float)(free/1024.0/1024.0), Nbytes, total, (float)(total/1024.0/1024.0)); + HIPASSERT(free + Nbytes <= total); + + + hipPointerAttribute_t attribs; + hipPointerAttribute_t attribs2; + + // Device memory + printf ("\nDevice memory (hipMalloc)\n"); + HIPCHECK( hipPointerGetAttributes(&attribs, A_d)); + printf("getAttr:%-20s", "A_d"); printAttribs(&attribs); + + // Check pointer arithmetic cases: + resetAttribs(&attribs2); + HIPCHECK( hipPointerGetAttributes(&attribs2, A_d+100)); + printf("getAttr:%-20s", "A_d+100"); printAttribs(&attribs2); + HIPASSERT(attribs == attribs2); + + // Corner case at end of array: + resetAttribs(&attribs2); + HIPCHECK( hipPointerGetAttributes(&attribs2, A_d+Nbytes-1)); + printf("getAttr:%-20s", "A_d+NBytes-1"); printAttribs(&attribs2); + HIPASSERT(attribs == attribs2); + + // Pointer just beyond array - must be invalid or at least a different pointer + resetAttribs(&attribs2); + e = hipPointerGetAttributes(&attribs2, A_d+Nbytes+1); + printf("getAttr:%-20s err=%d (%s), neg-test expected\n", "A_d+NBytes", e, hipGetErrorString(e)); + if (e != hipErrorInvalidValue) { + // We might have strayed into another pointer area. + printf("getAttr:%-20s", "A_d+NBytes"); printAttribs(&attribs2); + HIPASSERT(attribs.devicePointer != attribs2.devicePointer); + } + + + resetAttribs(&attribs2); + e = hipPointerGetAttributes(&attribs2, A_d+Nbytes); + if (e != hipErrorInvalidValue) { + printf("%-20s", "A_d+Nbytes"); printAttribs(&attribs2); + HIPASSERT(attribs.devicePointer != attribs2.devicePointer); + } + + hipFree(A_d); + e = hipPointerGetAttributes(&attribs, A_d); + HIPASSERT(e == hipErrorInvalidValue); // Just freed the pointer, this should return an error. + + + // Device-visible host memory + printf ("\nDevice-visible host memory (hipMallocHost)\n"); + HIPCHECK( hipPointerGetAttributes(&attribs, A_Pinned_h)); + printf("getAttr:%-20s", "A_pinned_h"); printAttribs(&attribs); + + resetAttribs(&attribs2); + HIPCHECK( hipPointerGetAttributes(&attribs2, A_Pinned_h+Nbytes/2)); + printf("getAttr:%-20s", "A_pinned_h+NBytes/2"); printAttribs(&attribs2); + HIPASSERT(attribs == attribs2); + + + hipFreeHost(A_Pinned_h); + e = hipPointerGetAttributes(&attribs, A_Pinned_h); + HIPASSERT(e == hipErrorInvalidValue); // Just freed the pointer, this should return an error. + printf("getAttr:%-20s err=%d (%s), neg-test expected\n", "A_d+NBytes", e, hipGetErrorString(e)); + + + // OS memory + printf ("\nOS-allocated memory (malloc)\n"); + e = hipPointerGetAttributes(&attribs, A_OSAlloc_h); + printf("getAttr:%-20s err=%d (%s), neg-test expected\n", "A_OSAlloc_h", e, hipGetErrorString(e)); + HIPASSERT(e == hipErrorInvalidValue); // OS-allocated pointers should return hipErrorInvalidValue. +} + +//--- +//Reset the memory tracker (remove allocations from all known devices): +//This frees any memory allocated through the runtime. +//The routine will not release any +void resetTracker () +{ + if (p_verbose & 0x1) { + printf ("info: reset tracker for all devices in platform\n"); + } + + int numDevices; + HIPCHECK(hipGetDeviceCount(&numDevices)); + + // Clean up: + for (int i=0; i reference(numAllocs); + + HIPASSERT(minSize > 0); + HIPASSERT(maxSize >= minSize); + + int numDevices; + HIPCHECK(hipGetDeviceCount(&numDevices)); + + //--- + //Populate with device and host allocations. + size_t totalDeviceAllocated[numDevices]; + for (int i =0; i 1) { + checkPointer(ref, i, 2, (char *)ref._pointer + ref._sizeBytes-1); + } + + if (ref._attrib.memoryType == hipMemoryTypeDevice) { + hipFree(ref._pointer); + } else { + hipFreeHost(ref._pointer); + } + + } + +#ifdef __HIP_PLATFORM_HCC__ + if (p_verbose & 0x2) { + printf ("Tracker after cleanup:\n"); + hc::am_memtracker_print(); + } +#endif +} + + +//--- +// Multi-threaded test with many simul allocs. +// IN : serialize will force the test to run in serial fashion. +// Seems like this does not hit MT corner cases in the tracker very often - testMultiThreaded_2 below seems more effective. +void testMultiThreaded_1(bool serialize=false) +{ + printf ("\n===========================================================================\n"); + printf ("MultiThreaded_1\n"); + if (serialize) printf ("[SERIALIZE]\n"); + printf ("===========================================================================\n"); + std::thread t1(clusterAllocs, 1000, 101, 1000); + if (serialize) t1.join(); + + std::thread t2(clusterAllocs, 1000, 11, 100); + if (serialize) t2.join(); + + std::thread t3(clusterAllocs, 1000, 5, 10); + if (serialize) t3.join(); + + std::thread t4(clusterAllocs, 1000, 1, 4); + if (serialize) t4.join(); + + if (!serialize) { + t1.join(); + t2.join(); + t3.join(); + t4.join(); + } + + resetTracker(); +} + + +///================================================================================================ + +//--- +//Repeatedly query a single entry: +void thread_query(void *ptr, const hipPointerAttribute_t *refAttrib) +{ + int count = 0; + + for (int count=0; count< 1000000; count++) { + hipPointerAttribute_t a; + hipError_t e = hipPointerGetAttributes(&a, ptr); + if ((e != hipSuccess) || (a!= *refAttrib)) { + printf("Test %d (err=%d)\n", count, e); + HIPCHECK(e); + + printf(" ref :: "); printAttribs(refAttrib); + printf(" getattr:: "); printAttribs(&a); + } + } +} + + +#ifdef __HIP_PLATFORM_HCC__ +//--- +// Add pointers to tracker very quickly, then remove them quickly: +enum Dir {Up, Down}; +void thread_noise_generator(int iters, size_t numBuffers, Dir addDir, Dir removeDir) +{ + const size_t bufferSize = 16; + size_t maxSize = numBuffers*bufferSize; + HIPASSERT((maxSize % bufferSize) == 0); // loop logic assumes this is true + + + for (int i=0; i=0; p-=bufferSize) { + hc::am_memtracker_add(p, bufferSize, acc, false); + } + } + + if (removeDir == Up) { + for (char *p = basePtr; p=0; p-=bufferSize) { + hc::am_memtracker_remove(p); + } + } + } +} + + +//--- +//Multi-thread test that is effective at catching locking errors in the alloc/dealloc/tracker. +//The query thread repeately requests information on the same block of memory. +//Meanwhile, the thread_noise_generator registers a large number of blocks, and +//then unregisters them. This causes a large amount of rebalancing in the tree +//structure and will generate errors unless the locks in the tracker are preventing reading +//while writing. +void testMultiThreaded_2() +{ + std::atomic inflight(2); + + printf ("\n===========================================================================\n"); + printf ("MultiThreaded_2\n"); + printf ("===========================================================================\n"); + + hipSetDevice(0); + hipDeviceReset(); + + // Create some entries in the tracker: + for (int i=0; i<1000; i++) { + void *C_d; + HIPCHECK(hipMalloc(&C_d, 32)); + } + + + // Allocate a pointer that we will repeatedly lookup: + void *A_d; + HIPCHECK(hipMalloc(&A_d, 10000)); + hipPointerAttribute_t attrib1; + HIPCHECK(hipPointerGetAttributes(&attrib1, A_d)); + std::thread t1(thread_query, A_d, &attrib1); + + std::thread t2(thread_noise_generator, 10000, 1000, Up, Up); + + t1.join(); + t2.join(); + + hipSetDevice(0); + hipDeviceReset(); +} +#endif + + + +int main(int argc, char *argv[]) +{ + N= 1000000; + HipTest::parseStandardArguments(argc, argv, true); + + + Nbytes = N*sizeof(char); + + printf ("N=%zu (%6.2f MB) device=%d\n", N, Nbytes/(1024.0*1024.0), p_gpuDevice); + + + if (p_tests & 0x01) { + printf ("info: set device to %d\n", p_gpuDevice); + HIPCHECK(hipSetDevice(p_gpuDevice)); + testSimple(); + } + + if (p_tests & 0x02) { + srand(0x100); + printf ("\n===========================================================================\n"); + clusterAllocs(100, 1024*1, 1024*1024); + resetTracker(); + } + + if (p_tests & 0x04) { + srand(0x200); + printf ("\n===========================================================================\n"); + clusterAllocs(1000, 1, 10); // Many tiny allocations; + resetTracker(); + } + + if (p_tests & 0x08) { + srand(0x300); + testMultiThreaded_1(true); + testMultiThreaded_1(false); + } + + +#ifdef __HIP_PLATFORM_HCC__ + if (p_tests & 0x10) { + srand(0x400); + testMultiThreaded_2(); + resetTracker(); + } +#endif + + printf ("\n"); + passed(); +} diff --git a/projects/clr/hipamd/tests/src/test_common.cpp b/projects/clr/hipamd/tests/src/test_common.cpp index d7a108a11b..3da5568b7c 100644 --- a/projects/clr/hipamd/tests/src/test_common.cpp +++ b/projects/clr/hipamd/tests/src/test_common.cpp @@ -28,6 +28,8 @@ int iterations = 1; unsigned blocksPerCU = 6; // to hide latency unsigned threadsPerBlock = 256; int p_gpuDevice = 0; +unsigned p_verbose = 0; +int p_tests = -1; /*which tests to run. Interpretation is left to each test. default:all*/ @@ -86,7 +88,7 @@ int parseStandardArguments(int argc, char *argv[], bool failOnUndefinedArg) if (!strcmp(arg, " ")) { // skip NULL args. - } else if (!strcmp(arg, "--N")) { + } else if (!strcmp(arg, "--N") || (!strcmp(arg, "-N"))) { if (++i >= argc || !HipTest::parseSize(argv[i], &N)) { failed("Bad N size argument"); } @@ -114,8 +116,16 @@ int parseStandardArguments(int argc, char *argv[], bool failOnUndefinedArg) failed("Bad gpuDevice argument"); } - } - else { + } else if (!strcmp(arg, "--verbose") || (!strcmp(arg, "-v"))) { + if (++i >= argc || !HipTest::parseUInt(argv[i], &p_verbose)) { + failed("Bad verbose argument"); + } + } else if (!strcmp(arg, "--tests") || (!strcmp(arg, "-t"))) { + if (++i >= argc || !HipTest::parseInt(argv[i], &p_tests)) { + failed("Bad tests argument"); + } + + } else { if (failOnUndefinedArg) { failed("Bad argument '%s'", arg); } else { diff --git a/projects/clr/hipamd/tests/src/test_common.h b/projects/clr/hipamd/tests/src/test_common.h index fee052c1ad..1bf89f1604 100644 --- a/projects/clr/hipamd/tests/src/test_common.h +++ b/projects/clr/hipamd/tests/src/test_common.h @@ -25,7 +25,7 @@ printf (__VA_ARGS__);\ printf ("\n");\ printf ("error: TEST FAILED\n%s", KNRM );\ - exit(EXIT_FAILURE); + abort(); #define HIPCHECK(error) \ @@ -53,6 +53,8 @@ extern int iterations; extern unsigned blocksPerCU; extern unsigned threadsPerBlock; extern int p_gpuDevice; +extern unsigned p_verbose; +extern int p_tests; namespace HipTest { @@ -86,7 +88,7 @@ vectorADD(hipLaunchParm lp, size_t stride = hipBlockDim_x * hipGridDim_x ; for (size_t i=offset; i void initArrays(T **A_d, T **B_d, T **C_d, T **A_h, T **B_h, T **C_h, - size_t N) + size_t N, bool usePinnedHost=false) { size_t Nbytes = N*sizeof(T); @@ -108,14 +110,32 @@ void initArrays(T **A_d, T **B_d, T **C_d, HIPCHECK ( hipMalloc(C_d, Nbytes) ); } - if (A_h) - *A_h = (T*)malloc(Nbytes); - - if (B_h) - *B_h = (T*)malloc(Nbytes); + if (usePinnedHost) { + if (A_h) { + HIPCHECK ( hipMallocHost(A_h, Nbytes) ); + } + if (B_h) { + HIPCHECK ( hipMallocHost(B_h, Nbytes) ); + } + if (C_h) { + HIPCHECK ( hipMallocHost(C_h, Nbytes) ); + } + } else { + if (A_h) { + *A_h = (T*)malloc(Nbytes); + HIPASSERT(*A_h != NULL); + } + + if (B_h) { + *B_h = (T*)malloc(Nbytes); + HIPASSERT(*B_h != NULL); + } - if (C_h) - *C_h = (T*)malloc(Nbytes); + if (C_h) { + *C_h = (T*)malloc(Nbytes); + HIPASSERT(*C_h != NULL); + } + } // Initialize the host data: @@ -128,7 +148,43 @@ void initArrays(T **A_d, T **B_d, T **C_d, } +template +void freeArrays(T *A_d, T *B_d, T *C_d, + T *A_h, T *B_h, T *C_h, bool usePinnedHost) +{ + if (A_d) { + HIPCHECK ( hipFree(A_d) ); + } + if (B_d) { + HIPCHECK ( hipFree(B_d) ); + } + if (C_d) { + HIPCHECK ( hipFree(C_d) ); + } + if (usePinnedHost) { + if (A_h) { + HIPCHECK (hipFreeHost(A_h)); + } + if (B_h) { + HIPCHECK (hipFreeHost(B_h)); + } + if (C_h) { + HIPCHECK (hipFreeHost(C_h)); + } + } else { + if (A_h) { + free (A_h); + } + if (B_h) { + free (B_h); + } + if (C_h) { + free (C_h); + } + } + +} // Assumes C_h contains vector add of A_h + B_h diff --git a/projects/clr/hipamd/util/vim/hip.vim b/projects/clr/hipamd/util/vim/hip.vim index 01f3b3f2ad..e4ea0a4a9e 100644 --- a/projects/clr/hipamd/util/vim/hip.vim +++ b/projects/clr/hipamd/util/vim/hip.vim @@ -91,6 +91,7 @@ syn keyword hipFunctionName hipD3D9UnmapResources syn keyword hipFunctionName hipD3D9UnregisterResource syn keyword hipFunctionName hipDeviceGetProperties syn keyword hipFunctionName hipDeviceSynchronize +syn keyword hipFunctionName hipDeviceReset syn keyword hipFunctionName hipEventCreate syn keyword hipFunctionName hipEventDestroy syn keyword hipFunctionName hipEventElapsedTime @@ -151,6 +152,9 @@ syn keyword hipFunctionName hipUnbindTexture syn keyword hipFlags hipFilterModePoint syn keyword hipFlags hipMemcpyHostToDevice syn keyword hipFlags hipMemcpyDeviceToHost +syn keyword hipFlags hipMemcpyHostToHost +syn keyword hipFlags hipMemcpyDeviceToDevice +syn keyword hipFlags hipMemcpyDefault syn keyword hipFlags hipReadModeElementType syn keyword hipFlags hipSuccess syn keyword hipFlags hipTextureType1D