Step1 in staging buffer copy.
- use StagingBuffer class for copies.
- refactor g_device to use array rather than vector.
(keeps pointers from moving).
[ROCm/hip commit: 24c1fdb864]
Этот коммит содержится в:
@@ -22,8 +22,8 @@ struct AmPointerInfo {
|
||||
bool _isInDeviceMem; ///< Memory is physically resident on a device (if false, memory is located on host)
|
||||
bool _isAmManaged; ///< Memory was allocated by AM and should be freed when am_reset is called.
|
||||
|
||||
int _appId; ///< App-specific storage. Used by HIP to store deviceID.
|
||||
unsigned _appAllocationFlags; ///< App-specific allocation flags. Used by HIP to store allocation flags.
|
||||
int _appId; ///< App-specific storage. (Used by HIP to store deviceID.)
|
||||
unsigned _appAllocationFlags; ///< App-specific allocation flags. (Used by HIP to store allocation flags.)
|
||||
|
||||
AmPointerInfo() {};
|
||||
|
||||
@@ -91,7 +91,7 @@ am_status_t AM_copy(void* dst, const void* src, size_t size);
|
||||
*
|
||||
* @see AM_memtracker_add,
|
||||
*/
|
||||
am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, void *ptr);
|
||||
am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, const void *ptr);
|
||||
|
||||
|
||||
//TODO-doc
|
||||
@@ -99,7 +99,7 @@ am_status_t am_memtracker_add(void* ptr, size_t sizeBytes, hc::accelerator acc,
|
||||
|
||||
|
||||
//TODO-doc
|
||||
am_status_t am_memtracker_update(void* ptr, int appId, unsigned allocationFlags);
|
||||
am_status_t am_memtracker_update(const void* ptr, int appId, unsigned allocationFlags);
|
||||
|
||||
|
||||
/**
|
||||
|
||||
@@ -24,10 +24,10 @@
|
||||
//#include <shared_mutex>
|
||||
|
||||
struct AmMemoryRange {
|
||||
void * _basePointer;
|
||||
void * _endPointer;
|
||||
AmMemoryRange(void *basePointer, size_t sizeBytes) :
|
||||
_basePointer(basePointer), _endPointer((unsigned char*)basePointer + sizeBytes - 1) {};
|
||||
const void * _basePointer;
|
||||
const void * _endPointer;
|
||||
AmMemoryRange(const void *basePointer, size_t sizeBytes) :
|
||||
_basePointer(basePointer), _endPointer((const unsigned char*)basePointer + sizeBytes - 1) {};
|
||||
};
|
||||
|
||||
// Functor to compare ranges:
|
||||
@@ -63,7 +63,7 @@ public:
|
||||
void insert(void *pointer, const hc::AmPointerInfo &p);
|
||||
int remove(void *pointer);
|
||||
|
||||
MapTrackerType::iterator find(void *hostPtr) ;
|
||||
MapTrackerType::iterator find(const void *hostPtr) ;
|
||||
|
||||
MapTrackerType::iterator readerLockBegin() { _mutex.lock(); return _tracker.begin(); } ;
|
||||
MapTrackerType::iterator end() { return _tracker.end(); } ;
|
||||
@@ -107,7 +107,7 @@ int AmPointerTracker::remove (void *pointer)
|
||||
|
||||
|
||||
//---
|
||||
AmPointerTracker::MapTrackerType::iterator AmPointerTracker::find (void *pointer)
|
||||
AmPointerTracker::MapTrackerType::iterator AmPointerTracker::find (const void *pointer)
|
||||
{
|
||||
// TODO-mutex- read lock
|
||||
std::lock_guard<std::mutex> l (_mutex);
|
||||
@@ -144,7 +144,7 @@ size_t AmPointerTracker::reset (hc::accelerator acc)
|
||||
for (auto iter = _tracker.begin() ; iter != _tracker.end(); ) {
|
||||
if (iter->second._acc == acc) {
|
||||
if (iter->second._isAmManaged) {
|
||||
hsa_memory_free(iter->first._basePointer);
|
||||
hsa_memory_free(const_cast<void*> (iter->first._basePointer));
|
||||
}
|
||||
count++;
|
||||
|
||||
@@ -278,7 +278,7 @@ am_status_t AM_copy(void* dst, const void* src, size_t sizeBytes)
|
||||
}
|
||||
|
||||
|
||||
am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, void *ptr)
|
||||
am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, const void *ptr)
|
||||
{
|
||||
auto infoI = g_amPointerTracker.find(ptr);
|
||||
if (infoI != g_amPointerTracker.end()) {
|
||||
@@ -290,7 +290,7 @@ am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, void *ptr)
|
||||
}
|
||||
|
||||
|
||||
am_status_t am_memtracker_update(void* ptr, int appId, unsigned allocationFlags)
|
||||
am_status_t am_memtracker_update(const void* ptr, int appId, unsigned allocationFlags)
|
||||
{
|
||||
auto iter = g_amPointerTracker.find(ptr);
|
||||
if (iter != g_amPointerTracker.end()) {
|
||||
|
||||
@@ -43,7 +43,7 @@ THE SOFTWARE.
|
||||
|
||||
#include "hc_AM.cpp"
|
||||
|
||||
#define USE_ASYNC_COPY 0
|
||||
#define USE_ASYNC_COPY 1
|
||||
#define USE_AM_TRACKER 1 /* use new AM memory tracker features */
|
||||
|
||||
#define INLINE static inline
|
||||
@@ -60,10 +60,12 @@ static const int release = 1;
|
||||
int HIP_PRINT_ENV = 0;
|
||||
int HIP_TRACE_API= 0;
|
||||
int HIP_LAUNCH_BLOCKING = 0;
|
||||
int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */
|
||||
|
||||
#define TRACE_API 0x1 /* trace API calls and return values */
|
||||
#define TRACE_SYNC 0x2 /* trace synchronization pieces */
|
||||
#define TRACE_MEM 0x4 /* trace memory allocation / deallocation */
|
||||
#define TRACE_API 0x1 /* trace API calls and return values */
|
||||
#define TRACE_SYNC 0x2 /* trace synchronization pieces */
|
||||
#define TRACE_MEM 0x4 /* trace memory allocation / deallocation */
|
||||
#define TRACE_COPY2 0x8 /* trace memory copy commands. Detailed. */
|
||||
|
||||
#define tprintf(trace_level, ...) {\
|
||||
if (HIP_TRACE_API & trace_level) {\
|
||||
@@ -119,6 +121,28 @@ struct ihipEvent_t {
|
||||
} ;
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------------------------------
|
||||
struct StagingBuffer {
|
||||
static const int numBuffers = 2;
|
||||
|
||||
int _bufferIndex; // Operating on buffer 0 or 1?
|
||||
|
||||
ihipDevice_t *_device;
|
||||
size_t _bufferSize; // Size of the buffers.
|
||||
|
||||
|
||||
StagingBuffer(ihipDevice_t *device, size_t bufferSize) ;
|
||||
|
||||
void CopyHostToDevice(void* dst, const void* src, size_t sizeBytes);
|
||||
|
||||
private:
|
||||
char *_pinnedStagingBuffer[numBuffers];
|
||||
hsa_signal_t _completion_signal[numBuffers];
|
||||
};
|
||||
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------------------------------
|
||||
struct ihipDevice_t
|
||||
{
|
||||
unsigned _device_index; // index into g_devices.
|
||||
@@ -135,8 +159,11 @@ struct ihipDevice_t
|
||||
|
||||
unsigned _compute_units;
|
||||
|
||||
StagingBuffer *_staging_host2device;
|
||||
StagingBuffer *_staging_device2host;
|
||||
|
||||
public:
|
||||
ihipDevice_t(unsigned device_index, hc::accelerator acc);
|
||||
void init(unsigned device_index, hc::accelerator acc);
|
||||
hipError_t getProperties(hipDeviceProp_t* prop);
|
||||
|
||||
// TODO- create a copy constructor.
|
||||
@@ -145,10 +172,10 @@ public:
|
||||
|
||||
|
||||
//=================================================================================================
|
||||
ihipDevice_t::ihipDevice_t(unsigned device_index, hc::accelerator acc)
|
||||
: _device_index(device_index),
|
||||
_acc(acc)
|
||||
void ihipDevice_t::init(unsigned device_index, hc::accelerator acc)
|
||||
{
|
||||
_device_index = device_index;
|
||||
_acc = acc;
|
||||
hsa_agent_t *agent = static_cast<hsa_agent_t*> (acc.get_default_view().get_hsa_agent());
|
||||
if (agent) {
|
||||
int err = hsa_agent_get_info(*agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &_compute_units);
|
||||
@@ -166,6 +193,9 @@ ihipDevice_t::ihipDevice_t(unsigned device_index, hc::accelerator acc)
|
||||
_null_stream = new ihipStream_t(device_index, acc.get_default_view(), hipStreamDefault);
|
||||
this->_streams.push_back(_null_stream);
|
||||
tprintf(TRACE_SYNC, "created device with null_stream=%p\n", _null_stream);
|
||||
|
||||
_staging_host2device = new StagingBuffer(this, HIP_STAGING_SIZE*1024);
|
||||
_staging_device2host = NULL;
|
||||
};
|
||||
|
||||
#if 0
|
||||
@@ -187,7 +217,8 @@ thread_local int tls_defaultDevice = 0;
|
||||
|
||||
// Global initialization.
|
||||
std::once_flag hip_initialized;
|
||||
std::vector<ihipDevice_t> g_devices; // Vector of all non-emulated (ie GPU) accelerators in the system.
|
||||
ihipDevice_t *g_devices; // Array of all non-emulated (ie GPU) accelerators in the system.
|
||||
unsigned g_deviceCnt;
|
||||
|
||||
//=================================================================================================
|
||||
|
||||
@@ -462,25 +493,36 @@ void ihipReadEnv_I(int *var_ptr, const char *var_name1, const char *var_name2, c
|
||||
//It is called with C++11 call_once, which provided thread-safety.
|
||||
void ihipInit()
|
||||
{
|
||||
|
||||
/*
|
||||
* Build a table of valid compute devices.
|
||||
*/
|
||||
auto accs = hc::accelerator::get_all();
|
||||
g_devices.reserve(accs.size());
|
||||
for (int i=0; i<accs.size(); i++) {
|
||||
if (! accs[i].get_is_emulated()) {
|
||||
int deviceId = g_devices.size();
|
||||
g_devices.emplace_back(ihipDevice_t(deviceId, accs[i]));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Environment variables
|
||||
*/
|
||||
READ_ENV_I(release, HIP_PRINT_ENV, 0, "Print HIP environment variables.");
|
||||
READ_ENV_I(release, HIP_TRACE_API, 0, "Trace each HIP API call. Print function name and return code to stderr as program executes.");
|
||||
READ_ENV_I(release, HIP_LAUNCH_BLOCKING, CUDA_LAUNCH_BLOCKING, "Make HIP APIs 'host-synchronous', so they block until any kernel launches or data copy commands complete. Alias: CUDA_LAUNCH_BLOCKING." );
|
||||
READ_ENV_I(release, HIP_STAGING_SIZE, 0, "Size of staging buffer, in KB" );
|
||||
|
||||
/*
|
||||
* Build a table of valid compute devices.
|
||||
*/
|
||||
auto accs = hc::accelerator::get_all();
|
||||
int deviceCnt = 0;
|
||||
for (int i=0; i<accs.size(); i++) {
|
||||
if (! accs[i].get_is_emulated()) {
|
||||
deviceCnt++;
|
||||
}
|
||||
};
|
||||
|
||||
g_devices = new ihipDevice_t[deviceCnt];
|
||||
g_deviceCnt = 0;
|
||||
for (int i=0; i<accs.size(); i++) {
|
||||
if (! accs[i].get_is_emulated()) {
|
||||
g_devices[g_deviceCnt].init(g_deviceCnt, accs[i]);
|
||||
g_deviceCnt++;
|
||||
}
|
||||
}
|
||||
|
||||
assert(deviceCnt == g_deviceCnt);
|
||||
|
||||
|
||||
tprintf(TRACE_API, "pid=%u %-30s\n", getpid(), "<ihipInit>");
|
||||
|
||||
@@ -489,7 +531,7 @@ void ihipInit()
|
||||
INLINE bool ihipIsValidDevice(unsigned deviceIndex)
|
||||
{
|
||||
// deviceIndex is unsigned so always > 0
|
||||
return (deviceIndex < g_devices.size());
|
||||
return (deviceIndex < g_deviceCnt);
|
||||
}
|
||||
|
||||
|
||||
@@ -508,7 +550,7 @@ INLINE ihipDevice_t *ihipGetTlsDefaultDevice()
|
||||
//---
|
||||
INLINE ihipDevice_t *ihipGetDevice(int deviceId)
|
||||
{
|
||||
if ((deviceId >= 0) && (deviceId < g_devices.size())) {
|
||||
if ((deviceId >= 0) && (deviceId < g_deviceCnt)) {
|
||||
return &g_devices[deviceId];
|
||||
} else {
|
||||
return NULL;
|
||||
@@ -675,7 +717,7 @@ hipError_t hipGetDeviceCount(int *count)
|
||||
{
|
||||
std::call_once(hip_initialized, ihipInit);
|
||||
|
||||
*count = g_devices.size();
|
||||
*count = g_deviceCnt;
|
||||
|
||||
if (*count > 0) {
|
||||
return ihipLogStatus(hipSuccess);
|
||||
@@ -764,7 +806,7 @@ hipError_t hipSetDevice(int device)
|
||||
{
|
||||
std::call_once(hip_initialized, ihipInit);
|
||||
|
||||
if ((device < 0) || (device >= g_devices.size())) {
|
||||
if ((device < 0) || (device >= g_deviceCnt)) {
|
||||
return ihipLogStatus(hipErrorInvalidDevice);
|
||||
} else {
|
||||
tls_defaultDevice = device;
|
||||
@@ -1299,6 +1341,10 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr)
|
||||
attributes->allocationFlags = amPointerInfo._appAllocationFlags;
|
||||
attributes->device = amPointerInfo._appId;
|
||||
|
||||
if (attributes->device < 0) {
|
||||
e = hipErrorInvalidDevice;
|
||||
}
|
||||
|
||||
|
||||
} else {
|
||||
attributes->memoryType = hipMemoryTypeDevice;
|
||||
@@ -1482,6 +1528,7 @@ hipError_t hipMallocHost(void** ptr, size_t sizeBytes)
|
||||
|
||||
}
|
||||
|
||||
//---
|
||||
hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t count, size_t offset, hipMemcpyKind kind)
|
||||
{
|
||||
#ifdef USE_MEMCPYTOSYMBOL
|
||||
@@ -1500,6 +1547,102 @@ hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t cou
|
||||
}
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------------------------------
|
||||
StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize) :
|
||||
_bufferIndex(0),
|
||||
_device(device),
|
||||
_bufferSize(bufferSize)
|
||||
{
|
||||
for (int i=0; i<numBuffers; i++) {
|
||||
_pinnedStagingBuffer[i] = hc::AM_alloc(_bufferSize, device->_acc, amHostPinned);
|
||||
if (_pinnedStagingBuffer[i] == NULL) {
|
||||
throw;
|
||||
}
|
||||
hsa_signal_create(0, 0, NULL, &_completion_signal[i]);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
//---
|
||||
void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeBytes) {
|
||||
const char *srcp = static_cast<const char*> (src);
|
||||
char *dstp = static_cast<char*> (dst);
|
||||
|
||||
assert(sizeBytes < UINT64_MAX/2); // TODO
|
||||
for (int64_t bytesRemaining=sizeBytes; bytesRemaining>0; bytesRemaining -= _bufferSize) {
|
||||
|
||||
// TODO - double-buffer these guys.
|
||||
size_t theseBytes = (bytesRemaining > _bufferSize) ? _bufferSize : bytesRemaining;
|
||||
|
||||
tprintf (TRACE_COPY2, "copy %zu bytes %p to stagingBuf[%d]:%p\n", theseBytes, srcp, _bufferIndex, _pinnedStagingBuffer[_bufferIndex]);
|
||||
|
||||
memcpy(_pinnedStagingBuffer[_bufferIndex], srcp, theseBytes);
|
||||
|
||||
tprintf (TRACE_COPY2, "async_copy %zu bytes %p to %p\n", theseBytes, _pinnedStagingBuffer[_bufferIndex], dstp);
|
||||
|
||||
hsa_signal_store_relaxed(_completion_signal[_bufferIndex], 1);
|
||||
hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _pinnedStagingBuffer[_bufferIndex], theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[_bufferIndex]);
|
||||
|
||||
tprintf (TRACE_COPY2, "waiting... status=%d\n", hsa_status);
|
||||
if (hsa_status == HSA_STATUS_SUCCESS) {
|
||||
hsa_signal_wait_acquire(_completion_signal[_bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE);
|
||||
}
|
||||
|
||||
srcp += theseBytes;
|
||||
dstp += theseBytes;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
#if USE_AM_TRACKER
|
||||
// TODO - add mutex to limit in/out:
|
||||
void ihipAsyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind)
|
||||
{
|
||||
hc::AmPointerInfo dstPtrInfo, srcPtrInfo;
|
||||
|
||||
bool dstNotTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) != AM_SUCCESS);
|
||||
bool srcNotTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) != AM_SUCCESS);
|
||||
|
||||
bool useStagingBuffer = true;
|
||||
|
||||
// Resolve default to a specific Kind, since we use different algorithms:
|
||||
if (kind == hipMemcpyDefault) {
|
||||
bool dstIsHost = (dstNotTracked || dstPtrInfo._isInDeviceMem);
|
||||
bool srcIsHost = (srcNotTracked || srcPtrInfo._isInDeviceMem);
|
||||
if (srcIsHost && !dstIsHost) {
|
||||
kind = hipMemcpyHostToDevice;
|
||||
} else if (!srcIsHost && dstIsHost) {
|
||||
kind = hipMemcpyDeviceToHost;
|
||||
} else if (srcIsHost && dstIsHost) {
|
||||
kind = hipMemcpyHostToHost;
|
||||
} else if (srcIsHost && dstIsHost) {
|
||||
kind = hipMemcpyDeviceToDevice;
|
||||
}
|
||||
}
|
||||
|
||||
switch (kind) {
|
||||
case hipMemcpyHostToDevice:
|
||||
if (srcNotTracked) {
|
||||
device->_staging_host2device->CopyHostToDevice(dst, src, sizeBytes);
|
||||
} else {
|
||||
assert(0); // TODO
|
||||
//hsa_signal_wait_relaxed(completion_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE);
|
||||
}
|
||||
break;
|
||||
case hipMemcpyDeviceToHost:
|
||||
// TODO - optimize the copy here.
|
||||
hc::AM_copy(dst, src, sizeBytes);
|
||||
break;
|
||||
default:
|
||||
assert(0); // TODO
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
//---
|
||||
hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind)
|
||||
{
|
||||
@@ -1517,11 +1660,8 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind
|
||||
|
||||
ihipDevice_t *device = &g_devices[stream->_device_index];
|
||||
|
||||
hsa_signal_t completion_signal; // init/obtain from pool.
|
||||
ihipAsyncCopy(device, dst, src, sizeBytes, kind);
|
||||
|
||||
hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, size, device->_hsa_agent, 0, NULL, &completion_signal);
|
||||
|
||||
e = (hsa_status == HSA_STATUS_SUCCESS) ? hipSuccess : hipErrorTbd;
|
||||
} else {
|
||||
e = hipErrorInvalidResourceHandle;
|
||||
}
|
||||
|
||||
@@ -30,7 +30,7 @@ int main(int argc, char *argv[])
|
||||
|
||||
size_t Nbytes = N*sizeof(int);
|
||||
|
||||
printf ("N=%zu \n", N);
|
||||
printf ("N=%zu Nbytes=%6.2fMB\n", N, Nbytes/1024.0/1024.0);
|
||||
|
||||
int *A_d, *B_d, *C_d;
|
||||
int *A_h, *B_h, *C_h;
|
||||
|
||||
@@ -88,7 +88,7 @@ int parseStandardArguments(int argc, char *argv[], bool failOnUndefinedArg)
|
||||
|
||||
if (!strcmp(arg, " ")) {
|
||||
// skip NULL args.
|
||||
} else if (!strcmp(arg, "--N")) {
|
||||
} else if (!strcmp(arg, "--N") || (!strcmp(arg, "-N"))) {
|
||||
if (++i >= argc || !HipTest::parseSize(argv[i], &N)) {
|
||||
failed("Bad N size argument");
|
||||
}
|
||||
|
||||
@@ -151,6 +151,9 @@ syn keyword hipFunctionName hipUnbindTexture
|
||||
syn keyword hipFlags hipFilterModePoint
|
||||
syn keyword hipFlags hipMemcpyHostToDevice
|
||||
syn keyword hipFlags hipMemcpyDeviceToHost
|
||||
syn keyword hipFlags hipMemcpyHostToHost
|
||||
syn keyword hipFlags hipMemcpyDeviceToDevice
|
||||
syn keyword hipFlags hipMemcpyDefault
|
||||
syn keyword hipFlags hipReadModeElementType
|
||||
syn keyword hipFlags hipSuccess
|
||||
syn keyword hipFlags hipTextureType1D
|
||||
|
||||
Ссылка в новой задаче
Block a user