From 29119bd3d40f102dc3aef9dbce73dbeb8596df03 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Wed, 23 Mar 2016 11:13:02 -0500 Subject: [PATCH] reverted back to old infra with cmake added --- hipamd/CMakeLists.txt | 2 +- hipamd/include/hcc_detail/hip_common.h | 334 ---------- hipamd/include/hcc_detail/hip_inline.h | 67 -- hipamd/include/hcc_detail/hip_util.h | 22 - hipamd/include/hcc_detail/staging_buffer.h | 2 +- hipamd/include/hcc_detail/trace_helper.h | 3 +- hipamd/src/hip_device.cpp | 278 --------- hipamd/src/hip_hcc.cpp | 673 ++++++++++++++++++++- hipamd/src/hip_inline.h | 6 - hipamd/src/staging_buffer.cpp | 5 +- 10 files changed, 666 insertions(+), 726 deletions(-) delete mode 100644 hipamd/include/hcc_detail/hip_common.h delete mode 100644 hipamd/include/hcc_detail/hip_inline.h delete mode 100644 hipamd/include/hcc_detail/hip_util.h delete mode 100644 hipamd/src/hip_device.cpp delete mode 100644 hipamd/src/hip_inline.h diff --git a/hipamd/CMakeLists.txt b/hipamd/CMakeLists.txt index a62a33c340..d3249aee2f 100644 --- a/hipamd/CMakeLists.txt +++ b/hipamd/CMakeLists.txt @@ -12,7 +12,7 @@ set(CMAKE_C_COMPILER "hcc") set(CMAKE_C_FLAGS " -hc -I${HCC_PATH}/include -I${HSA_PATH}/include -I${HIP_PATH}/include -stdlib=libc++ ") set(CMAKE_CXX_FLAGS " -hc -I${HCC_PATH}/include -I${HSA_PATH}/include -I${HIP_PATH}/include -stdlib=libc++ ") -set(SOURCE_FILES src/hip_hcc.cpp src/hip_device.cpp src/staging_buffer.cpp) +set(SOURCE_FILES src/hip_hcc.cpp src/staging_buffer.cpp) add_library(hip_hcc STATIC ${SOURCE_FILES}) diff --git a/hipamd/include/hcc_detail/hip_common.h b/hipamd/include/hcc_detail/hip_common.h deleted file mode 100644 index e45f26e551..0000000000 --- a/hipamd/include/hcc_detail/hip_common.h +++ /dev/null @@ -1,334 +0,0 @@ -#pragma once -#ifndef HIP_COMMON_H -#define HIP_COMMON_H - -#include "hcc_detail/hip_util.h" -#include "hcc_detail/staging_buffer.h" - -#define HIP_INIT_API(...) 
\ - std::call_once(hip_initialized, ihipInit);\ - API_TRACE(__VA_ARGS__); - -const char *ihipErrorString(hipError_t); - -#define USE_AV_COPY 0 - -#define INLINE static inline - -//--- -// Environment variables: - -// Intended to distinguish whether an environment variable should be visible only in debug mode, or in debug+release. -//static const int debug = 0; -static const int release = 1; -static unsigned g_deviceCnt; - - -int HIP_LAUNCH_BLOCKING = 0; - -int HIP_PRINT_ENV = 0; -int HIP_TRACE_API= 0; -int HIP_DB= 0; -int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */ -int HIP_STAGING_BUFFERS = 2; // TODO - remove, two buffers should be enough. -int HIP_PININPLACE = 0; -int HIP_STREAM_SIGNALS = 2; /* number of signals to allocate at stream creation */ -static int HIP_VISIBLE_DEVICES = 0; /* Contains a comma-separated sequence of GPU identifiers */ - - - -//--- -// Chicken bits for disabling functionality to work around potential issues: -int HIP_DISABLE_HW_KERNEL_DEP = 1; -int HIP_DISABLE_HW_COPY_DEP = 1; - - -// Color defs for debug messages: -#define KNRM "\x1B[0m" -#define KRED "\x1B[31m" -#define KGRN "\x1B[32m" -#define KYEL "\x1B[33m" -#define KBLU "\x1B[34m" -#define KMAG "\x1B[35m" -#define KCYN "\x1B[36m" -#define KWHT "\x1B[37m" - -#define API_COLOR KGRN - - -#define HIP_HCC - -// If set, thread-safety is enforced on all stream functions. -// Stream functions will acquire a mutex before entering critical sections. -#define STREAM_THREAD_SAFE 1 - -// If FORCE_COPY_DEP=1 , HIP runtime will add -// synchronization for copy commands in the same stream, regardless of command type. -// If FORCE_COPY_DEP=0 data copies of the same kind (H2H, H2D, D2H, D2D) are assumed to be implicitly ordered. -// ROCR runtime implementation currently provides this guarantee when using SDMA queues but not -// when using shader queues. -// TODO - measure if this matters for performance, in particular for back-to-back small copies. 
-// If not, we can simplify the copy dependency tracking by collapsing to a single Copy type, and always forcing dependencies for copy commands. -#define FORCE_SAMEDIR_COPY_DEP 1 - - -// Compile debug trace mode - this prints debug messages to stderr when env var HIP_DB is set. -// May be set to 0 to remove debug if checks - possible code size and performance difference? -#define COMPILE_HIP_DB 1 - - -// Compile HIP tracing capability. -// 0x1 = print a string at function entry with arguments. -// 0x2 = prints a simple message with function name + return code when function exits. -// 0x3 = print both. -// Must be enabled at runtime with HIP_TRACE_API -#define COMPILE_HIP_TRACE_API 0x3 - - -// Compile code that generates trace markers for CodeXL ATP at HIP function begin/end. -// ATP is standard CodeXL format that includes timestamps for kernels, HSA RT APIs, and HIP APIs. -#ifndef COMPILE_TRACE_MARKER -#define COMPILE_TRACE_MARKER 0 -#endif - - -// #include CPP files to produce one object file -#define ONE_OBJECT_FILE 1 - - -// Compile support for trace markers that are displayed on CodeXL GUI at start/stop of each function boundary. -// TODO - currently we print the trace message at the beginning. if we waited, we could also include return codes, and any values returned -// through ptr-to-args (ie the pointers allocated by hipMalloc). -#if COMPILE_TRACE_MARKER -#include "AMDTActivityLogger.h" -#define SCOPED_MARKER(markerName,group,userString) amdtScopedMarker(markerName, group, userString) -#else -// Swallow scoped markers: -#define SCOPED_MARKER(markerName,group,userString) -#endif - - -#if COMPILE_TRACE_MARKER || (COMPILE_HIP_TRACE_API & 0x1) -#define API_TRACE(...)\ -{\ - std::string s = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')';\ - if (COMPILE_HIP_DB && HIP_TRACE_API) {\ - fprintf (stderr, API_COLOR "<>\n" KNRM, (_hip_status == 0) ? 
API_COLOR:KRED, __func__, _hip_status, ihipErrorString(_hip_status));\ - }\ - _hip_status;\ - }) - - - -extern thread_local hipError_t tls_lastHipError; -const char *ihipErrorString(hipError_t); -typedef uint64_t SIGSEQNUM; -// Used to remove lock, for performance or stimulating bugs. -class FakeMutex -{ - public: - void lock() { } - bool try_lock() {return true; } - void unlock() { } -}; - - -#if STREAM_THREAD_SAFE -typedef std::mutex StreamMutex; -#else -typedef FakeMutex StreamMutex; -#endif - - -enum ihipCommand_t { - ihipCommandCopyH2H, - ihipCommandCopyH2D, - ihipCommandCopyD2H, - ihipCommandCopyD2D, - ihipCommandKernel, -}; - -struct ihipSignal_t; -struct ihipStream_t; -struct ihipDevice_t; -extern ihipDevice_t *g_devices; - -//--- -// Small wrapper around signals. -// Designed to be used from stream. -// TODO-someday refactor this class so it can be stored in a vector<> -// we already store the index here so we can use for garbage collection. -struct ihipSignal_t { - hsa_signal_t _hsa_signal; // hsa signal handle - int _index; // Index in pool, used for garbage collection. - SIGSEQNUM _sig_id; // unique sequentially increasing ID. - - ihipSignal_t(); - ~ihipSignal_t(); - - inline void release(); -}; - -// Internal stream structure. 
- -class ihipStream_t { -public: -typedef uint64_t SeqNum_t ; - - ihipStream_t(unsigned device_index, hc::accelerator_view av, SeqNum_t id, unsigned int flags); - ~ihipStream_t(); - - - void copySync (void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind); - void copyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind); - - //--- - // Thread-safe accessors - these acquire / release mutex: - inline bool preKernelCommand(); - inline void postKernelCommand(hc::completion_future &kernel_future); - - inline int preCopyCommand(ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType); - - inline void reclaimSignals_ts(SIGSEQNUM sigNum); - inline void wait(bool assertQueueEmpty=false); - - - - // Non-threadsafe accessors - must be protected by high-level stream lock: - inline SIGSEQNUM lastCopySeqId() { return _last_copy_signal ? _last_copy_signal->_sig_id : 0; }; - ihipSignal_t * allocSignal(); - - - //-- Non-racy accessors: - // These functions access fields set at initialization time and are non-racy (so do not acquire mutex) - inline ihipDevice_t * getDevice() const; - StreamMutex & mutex() {return _mutex;}; - - //--- - //Member vars - these are set at initialization: - SeqNum_t _id; // monotonic sequence ID - hc::accelerator_view _av; - unsigned _flags; -private: - void enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal); - inline void waitCopy(ihipSignal_t *signal); - - - hipMemcpyKind resolveMemcpyDirection(bool srcInDeviceMem, bool dstInDeviceMem); - void setCopyAgents(hipMemcpyKind kind, ihipCommand_t *commandType, hsa_agent_t *srcAgent, hsa_agent_t *dstAgent); - - //--- - - unsigned _device_index; - ihipCommand_t _last_command_type; // type of the last command - - // signal of last copy command sent to the stream. - // May be NULL, indicating the previous command has completley finished and future commands don't need to create a dependency. - // Copy can be either H2D or D2H. 
- ihipSignal_t *_last_copy_signal; - hc::completion_future _last_kernel_future; // Completion future of last kernel command sent to GPU. - - int _signalCursor; - - SIGSEQNUM _stream_sig_id; // Monotonically increasing unique signal id. - SIGSEQNUM _oldest_live_sig_id; // oldest live seq_id, anything < this can be allocated. - std::deque _signalPool; // Pool of signals for use by this stream. - - StreamMutex _mutex; -}; - - -struct ihipDevice_t -{ - unsigned _device_index; // index into g_devices. - - hipDeviceProp_t _props; // saved device properties. - hc::accelerator _acc; - hsa_agent_t _hsa_agent; // hsa agent handle - - // The NULL stream is used if no other stream is specified. - // NULL has special synchronization properties with other streams. - ihipStream_t *_default_stream; - - std::list _streams; // streams associated with this device. - - unsigned _compute_units; - - StagingBuffer *_staging_buffer[2]; // one buffer for each direction. - - ihipStream_t::SeqNum_t _stream_id; - -public: - void init(unsigned device_index, hc::accelerator acc); - ~ihipDevice_t(); - void reset(); - hipError_t getProperties(hipDeviceProp_t* prop); - - inline void waitAllStreams(); - inline void syncDefaultStream(bool waitOnSelf); - -private: - -}; - - -#endif diff --git a/hipamd/include/hcc_detail/hip_inline.h b/hipamd/include/hcc_detail/hip_inline.h deleted file mode 100644 index 3e2ae0fce7..0000000000 --- a/hipamd/include/hcc_detail/hip_inline.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef HIP_INLINE_H -#define HIP_INLINE_H - -#include "trace_helper.h" - -#define INLINE static inline -extern ihipDevice_t *g_devices; -extern thread_local int tls_defaultDevice; -extern const hipStream_t hipStreamNull; - -INLINE bool ihipIsValidDevice(unsigned deviceIndex) -{ - // deviceIndex is unsigned so always > 0 - return (deviceIndex < g_deviceCnt); -} - -/*// check if the device ID is set as visible*/ -//INLINE bool ihipIsVisibleDevice(unsigned deviceIndex) -//{ - //return 
std::find(g_hip_visible_devices.begin(), g_hip_visible_devices.end(), - //(int)deviceIndex) != g_hip_visible_devices.end(); -/*}*/ - -//--- -INLINE ihipDevice_t *ihipGetTlsDefaultDevice() -{ - // If this is invalid, the TLS state is corrupt. - // This can fire if called before devices are initialized. - // TODO - consider replacing assert with error code - assert (ihipIsValidDevice(tls_defaultDevice)); - - return &g_devices[tls_defaultDevice]; -} - - -//--- -INLINE ihipDevice_t *ihipGetDevice(int deviceId) -{ - if ((deviceId >= 0) && (deviceId < g_deviceCnt)) { - return &g_devices[deviceId]; - } else { - return NULL; - } - -} - -inline hipStream_t ihipSyncAndResolveStream(hipStream_t stream) -{ - if (stream == hipStreamNull ) { - ihipDevice_t *device = ihipGetTlsDefaultDevice(); - -#ifndef HIP_API_PER_THREAD_DEFAULT_STREAM - device->syncDefaultStream(false); -#endif - return device->_default_stream; - } else { - // Have to wait for legacy default stream to be empty: - if (!(stream->_flags & hipStreamNonBlocking)) { - tprintf(DB_SYNC, "stream %p wait default stream\n", stream); - stream->getDevice()->_default_stream->wait(); - } - - return stream; - } -} - -#endif diff --git a/hipamd/include/hcc_detail/hip_util.h b/hipamd/include/hcc_detail/hip_util.h deleted file mode 100644 index 2e89dd4e8d..0000000000 --- a/hipamd/include/hcc_detail/hip_util.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef HIP_UTIL_H -#define HIP_UTIL_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "hip_runtime.h" - -#include "hsa_ext_amd.h" - -#endif diff --git a/hipamd/include/hcc_detail/staging_buffer.h b/hipamd/include/hcc_detail/staging_buffer.h index f570842cc8..23a748e2cc 100644 --- a/hipamd/include/hcc_detail/staging_buffer.h +++ b/hipamd/include/hcc_detail/staging_buffer.h @@ -1,6 +1,6 @@ #pragma once -#include "hcc_detail/hip_util.h" +#include "hsa.h" 
//------------------------------------------------------------------------------------------------- diff --git a/hipamd/include/hcc_detail/trace_helper.h b/hipamd/include/hcc_detail/trace_helper.h index 43d37f9c4e..a4542f76da 100644 --- a/hipamd/include/hcc_detail/trace_helper.h +++ b/hipamd/include/hcc_detail/trace_helper.h @@ -1,7 +1,7 @@ #include #include #include -#include "hcc_detail/hip_common.h" + //--- // Helper functions to convert HIP function arguments into strings. // Handles POD data types as well as enumerations (ie hipMemcpyKind). @@ -12,6 +12,7 @@ // Handy macro to convert an enumeration to a stringified version of same: #define CASE_STR(x) case x: return #x; + // Building block functions: template std::string ToHexString(T v) diff --git a/hipamd/src/hip_device.cpp b/hipamd/src/hip_device.cpp deleted file mode 100644 index 8f176ab7e9..0000000000 --- a/hipamd/src/hip_device.cpp +++ /dev/null @@ -1,278 +0,0 @@ -#include "hcc_detail/hip_runtime.h" -//#include "hcc_detail/trace_helper.h" -#include "hcc_detail/hip_inline.h" -#include "hsa.h" -extern std::once_flag hip_initialized; -extern void ihipInit(); -extern thread_local int tls_defaultDevice; - -//--- -/** - * @returns #hipSuccess - */ -hipError_t hipDeviceGetCacheConfig ( hipFuncCache *cacheConfig ) -{ - std::call_once(hip_initialized, ihipInit); - - *cacheConfig = hipFuncCachePreferNone; - - return ihipLogStatus(hipSuccess); -} - - - -//--- -/** - * @returns #hipSuccess - */ -hipError_t hipDeviceGetSharedMemConfig ( hipSharedMemConfig * pConfig ) -{ - std::call_once(hip_initialized, ihipInit); - - *pConfig = hipSharedMemBankSizeFourByte; - - return ihipLogStatus(hipSuccess); -} - - -//--- -/** - * @return @ref hipSuccess - */ -hipError_t hipDeviceReset(void) -{ - HIP_INIT_API(); - - ihipDevice_t *device = ihipGetTlsDefaultDevice(); - - // TODO-HCC - // This function currently does a user-level cleanup of known resources. 
- // It could benefit from KFD support to perform a more "nuclear" clean that would include any associated kernel resources and page table entries. - - - if (device) { - //--- - //Wait for pending activity to complete? - //TODO - check if this is required behavior: - for (auto streamI=device->_streams.begin(); streamI!=device->_streams.end(); streamI++) { - ihipStream_t *stream = *streamI; - stream->wait(); - } - - // Release device resources (streams and memory): - device->reset(); - } - - return ihipLogStatus(hipSuccess); -} - - -//--- -/** - * @returns #hipSuccess - */ -hipError_t hipDeviceSetCacheConfig ( hipFuncCache cacheConfig ) -{ - std::call_once(hip_initialized, ihipInit); - - // Nop, AMD does not support variable cache configs. - - return ihipLogStatus(hipSuccess); -} - - -//--- -/** - * @returns #hipSuccess - */ -hipError_t hipDeviceSetSharedMemConfig ( hipSharedMemConfig config ) -{ - std::call_once(hip_initialized, ihipInit); - - // Nop, AMD does not support variable shared mem configs. - - return ihipLogStatus(hipSuccess); -} - -//--- -/** - * @return #hipSuccess - */ -hipError_t hipDeviceSynchronize(void) -{ - HIP_INIT_API(); - - ihipGetTlsDefaultDevice()->waitAllStreams(); // ignores non-blocking streams, this waits for all activity to finish. 
- - - return ihipLogStatus(hipSuccess); -} - -//--- -/** - * @return #hipSuccess - */ -hipError_t hipGetDevice(int *device) -{ - HIP_INIT_API(device); - - *device = tls_defaultDevice; - return ihipLogStatus(hipSuccess); -} - -//--- -/** - * @return #hipSuccess, #hipErrorNoDevice - */ -hipError_t hipGetDeviceCount(int *count) -{ - HIP_INIT_API(count); - - *count = g_deviceCnt; - - if (*count > 0) { - return ihipLogStatus(hipSuccess); - } else { - return ihipLogStatus(hipErrorNoDevice); - } -} - - -/** - * @return #hipSuccess, #hipErrorInvalidDevice - * @bug HCC always returns 0 for maxThreadsPerMultiProcessor - * @bug HCC always returns 0 for regsPerBlock - * @bug HCC always returns 0 for l2CacheSize - */ -hipError_t hipGetDeviceProperties(hipDeviceProp_t* props, int device) -{ - HIP_INIT_API(props, device); - - hipError_t e; - - ihipDevice_t * hipDevice = ihipGetDevice(device); - if (hipDevice) { - // copy saved props - *props = hipDevice->_props; - e = hipSuccess; - } else { - e = hipErrorInvalidDevice; - } - - return ihipLogStatus(e); -} - - -//--- -/** - * @return #hipSuccess, #hipErrorInvalidDevice - */ -hipError_t hipSetDevice(int device) -{ - HIP_INIT_API(device); - - if ((device < 0) || (device >= g_deviceCnt)) { - return ihipLogStatus(hipErrorInvalidDevice); - } else { - tls_defaultDevice = device; - return ihipLogStatus(hipSuccess); - } -} - -/** - * - */ -hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) -{ - std::call_once(hip_initialized, ihipInit); - - hipError_t e = hipSuccess; - - ihipDevice_t * hipDevice = ihipGetDevice(device); - hipDeviceProp_t *prop = &hipDevice->_props; - if (hipDevice) { - switch (attr) { - case hipDeviceAttributeMaxThreadsPerBlock: - *pi = prop->maxThreadsPerBlock; break; - case hipDeviceAttributeMaxBlockDimX: - *pi = prop->maxThreadsDim[0]; break; - case hipDeviceAttributeMaxBlockDimY: - *pi = prop->maxThreadsDim[1]; break; - case hipDeviceAttributeMaxBlockDimZ: - *pi = prop->maxThreadsDim[2]; 
break; - case hipDeviceAttributeMaxGridDimX: - *pi = prop->maxGridSize[0]; break; - case hipDeviceAttributeMaxGridDimY: - *pi = prop->maxGridSize[1]; break; - case hipDeviceAttributeMaxGridDimZ: - *pi = prop->maxGridSize[2]; break; - case hipDeviceAttributeMaxSharedMemoryPerBlock: - *pi = prop->sharedMemPerBlock; break; - case hipDeviceAttributeTotalConstantMemory: - *pi = prop->totalConstMem; break; - case hipDeviceAttributeWarpSize: - *pi = prop->warpSize; break; - case hipDeviceAttributeMaxRegistersPerBlock: - *pi = prop->regsPerBlock; break; - case hipDeviceAttributeClockRate: - *pi = prop->clockRate; break; - case hipDeviceAttributeMemoryClockRate: - *pi = prop->memoryClockRate; break; - case hipDeviceAttributeMemoryBusWidth: - *pi = prop->memoryBusWidth; break; - case hipDeviceAttributeMultiprocessorCount: - *pi = prop->multiProcessorCount; break; - case hipDeviceAttributeComputeMode: - *pi = prop->computeMode; break; - case hipDeviceAttributeL2CacheSize: - *pi = prop->l2CacheSize; break; - case hipDeviceAttributeMaxThreadsPerMultiProcessor: - *pi = prop->maxThreadsPerMultiProcessor; break; - case hipDeviceAttributeComputeCapabilityMajor: - *pi = prop->major; break; - case hipDeviceAttributeComputeCapabilityMinor: - *pi = prop->minor; break; - case hipDeviceAttributePciBusId: - *pi = prop->pciBusID; break; - case hipDeviceAttributeConcurrentKernels: - *pi = prop->concurrentKernels; break; - case hipDeviceAttributePciDeviceId: - *pi = prop->pciDeviceID; break; - case hipDeviceAttributeMaxSharedMemoryPerMultiprocessor: - *pi = prop->maxSharedMemoryPerMultiProcessor; break; - case hipDeviceAttributeIsMultiGpuBoard: - *pi = prop->isMultiGpuBoard; break; - default: - e = hipErrorInvalidValue; break; - } - } else { - e = hipErrorInvalidDevice; - } - return ihipLogStatus(e); -} - - -/** - * @return #hipSuccess, #hipErrorInvalidDevice - * @bug HCC always returns 0 for maxThreadsPerMultiProcessor - * @bug HCC always returns 0 for regsPerBlock - * @bug HCC always 
returns 0 for l2CacheSize - */ -hipError_t hipGetDeviceProperties(hipDeviceProp_t* props, int device) -{ - HIP_INIT_API(props, device); - - hipError_t e; - - ihipDevice_t * hipDevice = ihipGetDevice(device); - if (hipDevice) { - // copy saved props - *props = hipDevice->_props; - e = hipSuccess; - } else { - e = hipErrorInvalidDevice; - } - - return ihipLogStatus(e); -} - diff --git a/hipamd/src/hip_hcc.cpp b/hipamd/src/hip_hcc.cpp index bc1f314574..0ef8b3407b 100644 --- a/hipamd/src/hip_hcc.cpp +++ b/hipamd/src/hip_hcc.cpp @@ -25,19 +25,191 @@ THE SOFTWARE. * Contains definitions for functions that are large enough that we don't want to inline them everywhere. * This file is compiled and linked into apps running HIP / HCC path. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "hip_runtime.h" + +#include "hsa_ext_amd.h" // HIP includes: #define HIP_HCC #include "hcc_detail/staging_buffer.h" -#include "hcc_detail/hip_common.h" -#include "hcc_detail/hip_util.h" -#include "hcc_detail/hip_inline.h" + + // TODO, re-org header order. extern const char *ihipErrorString(hipError_t hip_error); +#include "hcc_detail/trace_helper.h" // #define USE_MEMCPYTOSYMBOL // //Use the new HCC accelerator_view::copy instead of am_copy +#define USE_AV_COPY 0 + +#define INLINE static inline + +//--- +// Environment variables: + +// Intended to distinguish whether an environment variable should be visible only in debug mode, or in debug+release. +//static const int debug = 0; +static const int release = 1; + + +int HIP_LAUNCH_BLOCKING = 0; + +int HIP_PRINT_ENV = 0; +int HIP_TRACE_API= 0; +int HIP_DB= 0; +int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */ +int HIP_STAGING_BUFFERS = 2; // TODO - remove, two buffers should be enough. 
+int HIP_PININPLACE = 0; +int HIP_STREAM_SIGNALS = 2; /* number of signals to allocate at stream creation */ +int HIP_VISIBLE_DEVICES = 0; /* Contains a comma-separated sequence of GPU identifiers */ + + + +//--- +// Chicken bits for disabling functionality to work around potential issues: +int HIP_DISABLE_HW_KERNEL_DEP = 1; +int HIP_DISABLE_HW_COPY_DEP = 1; + + +// Color defs for debug messages: +#define KNRM "\x1B[0m" +#define KRED "\x1B[31m" +#define KGRN "\x1B[32m" +#define KYEL "\x1B[33m" +#define KBLU "\x1B[34m" +#define KMAG "\x1B[35m" +#define KCYN "\x1B[36m" +#define KWHT "\x1B[37m" + +#define API_COLOR KGRN + + +#define HIP_HCC + +// If set, thread-safety is enforced on all stream functions. +// Stream functions will acquire a mutex before entering critical sections. +#define STREAM_THREAD_SAFE 1 + +// If FORCE_COPY_DEP=1 , HIP runtime will add +// synchronization for copy commands in the same stream, regardless of command type. +// If FORCE_COPY_DEP=0 data copies of the same kind (H2H, H2D, D2H, D2D) are assumed to be implicitly ordered. +// ROCR runtime implementation currently provides this guarantee when using SDMA queues but not +// when using shader queues. +// TODO - measure if this matters for performance, in particular for back-to-back small copies. +// If not, we can simplify the copy dependency tracking by collapsing to a single Copy type, and always forcing dependencies for copy commands. +#define FORCE_SAMEDIR_COPY_DEP 1 + + +// Compile debug trace mode - this prints debug messages to stderr when env var HIP_DB is set. +// May be set to 0 to remove debug if checks - possible code size and performance difference? +#define COMPILE_HIP_DB 1 + + +// Compile HIP tracing capability. +// 0x1 = print a string at function entry with arguments. +// 0x2 = prints a simple message with function name + return code when function exits. +// 0x3 = print both. 
+// Must be enabled at runtime with HIP_TRACE_API +#define COMPILE_HIP_TRACE_API 0x3 + + +// Compile code that generates trace markers for CodeXL ATP at HIP function begin/end. +// ATP is standard CodeXL format that includes timestamps for kernels, HSA RT APIs, and HIP APIs. +#ifndef COMPILE_TRACE_MARKER +#define COMPILE_TRACE_MARKER 0 +#endif + + +// #include CPP files to produce one object file +#define ONE_OBJECT_FILE 1 + + +// Compile support for trace markers that are displayed on CodeXL GUI at start/stop of each function boundary. +// TODO - currently we print the trace message at the beginning. if we waited, we could also include return codes, and any values returned +// through ptr-to-args (ie the pointers allocated by hipMalloc). +#if COMPILE_TRACE_MARKER +#include "AMDTActivityLogger.h" +#define SCOPED_MARKER(markerName,group,userString) amdtScopedMarker(markerName, group, userString) +#else +// Swallow scoped markers: +#define SCOPED_MARKER(markerName,group,userString) +#endif + + +#if COMPILE_TRACE_MARKER || (COMPILE_HIP_TRACE_API & 0x1) +#define API_TRACE(...)\ +{\ + std::string s = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')';\ + if (COMPILE_HIP_DB && HIP_TRACE_API) {\ + fprintf (stderr, API_COLOR "< +// we already store the index here so we can use for garbage collection. +struct ihipSignal_t { + hsa_signal_t _hsa_signal; // hsa signal handle + int _index; // Index in pool, used for garbage collection. + SIGSEQNUM _sig_id; // unique sequentially increasing ID. + + ihipSignal_t(); + ~ihipSignal_t(); + + inline void release(); +}; + + +// Used to remove lock, for performance or stimulating bugs. +class FakeMutex +{ + public: + void lock() { } + bool try_lock() {return true; } + void unlock() { } +}; + + +#if STREAM_THREAD_SAFE +typedef std::mutex StreamMutex; +#else +typedef FakeMutex StreamMutex; +#endif // TODO - move async copy code into stream? Stream->async-copy. 
@@ -67,7 +281,71 @@ const char* ihipCommandName[] = { // Internal stream structure. +class ihipStream_t { +public: +typedef uint64_t SeqNum_t ; + ihipStream_t(unsigned device_index, hc::accelerator_view av, SeqNum_t id, unsigned int flags); + ~ihipStream_t(); + + + void copySync (void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind); + void copyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind); + + //--- + // Thread-safe accessors - these acquire / release mutex: + inline bool preKernelCommand(); + inline void postKernelCommand(hc::completion_future &kernel_future); + + inline int preCopyCommand(ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType); + + inline void reclaimSignals_ts(SIGSEQNUM sigNum); + inline void wait(bool assertQueueEmpty=false); + + + + // Non-threadsafe accessors - must be protected by high-level stream lock: + inline SIGSEQNUM lastCopySeqId() { return _last_copy_signal ? _last_copy_signal->_sig_id : 0; }; + ihipSignal_t * allocSignal(); + + + //-- Non-racy accessors: + // These functions access fields set at initialization time and are non-racy (so do not acquire mutex) + inline ihipDevice_t * getDevice() const; + StreamMutex & mutex() {return _mutex;}; + + //--- + //Member vars - these are set at initialization: + SeqNum_t _id; // monotonic sequence ID + hc::accelerator_view _av; + unsigned _flags; +private: + void enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal); + inline void waitCopy(ihipSignal_t *signal); + + + hipMemcpyKind resolveMemcpyDirection(bool srcInDeviceMem, bool dstInDeviceMem); + void setCopyAgents(hipMemcpyKind kind, ihipCommand_t *commandType, hsa_agent_t *srcAgent, hsa_agent_t *dstAgent); + + //--- + + unsigned _device_index; + ihipCommand_t _last_command_type; // type of the last command + + // signal of last copy command sent to the stream. 
+ // May be NULL, indicating the previous command has completley finished and future commands don't need to create a dependency. + // Copy can be either H2D or D2H. + ihipSignal_t *_last_copy_signal; + hc::completion_future _last_kernel_future; // Completion future of last kernel command sent to GPU. + + int _signalCursor; + + SIGSEQNUM _stream_sig_id; // Monotonically increasing unique signal id. + SIGSEQNUM _oldest_live_sig_id; // oldest live seq_id, anything < this can be allocated. + std::deque _signalPool; // Pool of signals for use by this stream. + + StreamMutex _mutex; +}; @@ -99,7 +377,38 @@ struct ihipEvent_t { //------------------------------------------------------------------------------------------------- +struct ihipDevice_t +{ + unsigned _device_index; // index into g_devices. + hipDeviceProp_t _props; // saved device properties. + hc::accelerator _acc; + hsa_agent_t _hsa_agent; // hsa agent handle + + // The NULL stream is used if no other stream is specified. + // NULL has special synchronization properties with other streams. + ihipStream_t *_default_stream; + + std::list _streams; // streams associated with this device. + + unsigned _compute_units; + + StagingBuffer *_staging_buffer[2]; // one buffer for each direction. + + ihipStream_t::SeqNum_t _stream_id; + +public: + void init(unsigned device_index, hc::accelerator acc); + ~ihipDevice_t(); + void reset(); + hipError_t getProperties(hipDeviceProp_t* prop); + + inline void waitAllStreams(); + inline void syncDefaultStream(bool waitOnSelf); + +private: + +}; //================================================================================================= @@ -113,7 +422,7 @@ thread_local int tls_defaultDevice = 0; std::once_flag hip_initialized; ihipDevice_t *g_devices; // Array of all non-emulated (ie GPU) accelerators in the system. 
bool g_visible_device = false; // Set the flag when HIP_VISIBLE_DEVICES is set -//unsigned g_deviceCnt; +unsigned g_deviceCnt; std::vector g_hip_visible_devices; /* vector of integers that contains the visible device IDs */ hsa_agent_t g_cpu_agent ; // the CPU agent. //================================================================================================= @@ -122,6 +431,7 @@ hsa_agent_t g_cpu_agent ; // the CPU agent. //================================================================================================= //Forward Declarations: //================================================================================================= +INLINE bool ihipIsValidDevice(unsigned deviceIndex); //================================================================================================= // Implementation: @@ -179,16 +489,6 @@ ihipStream_t::~ihipStream_t() _signalPool.clear(); } -//--- -inline ihipDevice_t * ihipStream_t::getDevice() const -{ - if (ihipIsValidDevice(_device_index)) { - return &g_devices[_device_index]; - } else { - return NULL; - } -}; - //--- @@ -235,6 +535,16 @@ void ihipStream_t::wait(bool assertQueueEmpty) }; +//--- +inline ihipDevice_t * ihipStream_t::getDevice() const +{ + if (ihipIsValidDevice(_device_index)) { + return &g_devices[_device_index]; + } else { + return NULL; + } +}; + //--- // Allocate a new signal from the signal pool. @@ -729,6 +1039,16 @@ void ihipDevice_t::waitAllStreams() +#define ihipLogStatus(_hip_status) \ + ({\ + tls_lastHipError = _hip_status;\ + \ + if ((COMPILE_HIP_TRACE_API & 0x2) && HIP_TRACE_API) {\ + fprintf(stderr, " %ship-api: %-30s ret=%2d (%s)>>\n" KNRM, (_hip_status == 0) ? API_COLOR:KRED, __func__, _hip_status, ihipErrorString(_hip_status));\ + }\ + _hip_status;\ + }) + // Read environment variables. 
@@ -913,6 +1233,67 @@ void ihipInit() } +INLINE bool ihipIsValidDevice(unsigned deviceIndex) +{ + // deviceIndex is unsigned so always > 0 + return (deviceIndex < g_deviceCnt); +} + +/*// check if the device ID is set as visible*/ +//INLINE bool ihipIsVisibleDevice(unsigned deviceIndex) +//{ + //return std::find(g_hip_visible_devices.begin(), g_hip_visible_devices.end(), + //(int)deviceIndex) != g_hip_visible_devices.end(); +/*}*/ + +//--- +INLINE ihipDevice_t *ihipGetTlsDefaultDevice() +{ + // If this is invalid, the TLS state is corrupt. + // This can fire if called before devices are initialized. + // TODO - consider replacing assert with error code + assert (ihipIsValidDevice(tls_defaultDevice)); + + return &g_devices[tls_defaultDevice]; +} + + +//--- +INLINE ihipDevice_t *ihipGetDevice(int deviceId) +{ + if ((deviceId >= 0) && (deviceId < g_deviceCnt)) { + return &g_devices[deviceId]; + } else { + return NULL; + } + +} + +//--- +// Get the stream to use for a command submission. +// +// If stream==NULL synchronize appropriately with other streams and return the default av for the device. +// If stream is valid, return the AV to use. +inline hipStream_t ihipSyncAndResolveStream(hipStream_t stream) +{ + if (stream == hipStreamNull ) { + ihipDevice_t *device = ihipGetTlsDefaultDevice(); + +#ifndef HIP_API_PER_THREAD_DEFAULT_STREAM + device->syncDefaultStream(false); +#endif + return device->_default_stream; + } else { + // Have to wait for legacy default stream to be empty: + if (!(stream->_flags & hipStreamNonBlocking)) { + tprintf(DB_SYNC, "stream %p wait default stream\n", stream); + stream->getDevice()->_default_stream->wait(); + } + + return stream; + } +} + // TODO - data-up to data-down: // Called just before a kernel is launched from hipLaunchKernel. 
@@ -955,7 +1336,270 @@ void ihipPostLaunchKernel(hipStream_t stream, hc::completion_future &kernelFutur
 
 
 //---
+//-------------------------------------------------------------------------------------------------
+//Devices
+//-------------------------------------------------------------------------------------------------
+//---
+/**
+ * @return #hipSuccess
+ */
+hipError_t hipGetDevice(int *device)
+{
+    HIP_INIT_API(device);
+    *device = tls_defaultDevice;
+    return ihipLogStatus(hipSuccess);
+}
+
+
+//---
+/**
+ * @return #hipSuccess, #hipErrorNoDevice
+ */
+hipError_t hipGetDeviceCount(int *count)
+{
+    HIP_INIT_API(count);
+
+    *count = g_deviceCnt;
+
+    if (*count > 0) {
+        return ihipLogStatus(hipSuccess);
+    } else {
+        return ihipLogStatus(hipErrorNoDevice);
+    }
+}
+
+
+//---
+/**
+ * @returns #hipSuccess
+ */
+hipError_t hipDeviceSetCacheConfig ( hipFuncCache cacheConfig )
+{
+    std::call_once(hip_initialized, ihipInit);
+
+    // Nop, AMD does not support variable cache configs.
+
+    return ihipLogStatus(hipSuccess);
+}
+
+
+//---
+/**
+ * @returns #hipSuccess
+ */
+hipError_t hipDeviceGetCacheConfig ( hipFuncCache *cacheConfig )
+{
+    std::call_once(hip_initialized, ihipInit);
+
+    *cacheConfig = hipFuncCachePreferNone;
+
+    return ihipLogStatus(hipSuccess);
+}
+
+
+//---
+/**
+ * @returns #hipSuccess
+ */
+hipError_t hipFuncSetCacheConfig ( hipFuncCache cacheConfig )
+{
+    std::call_once(hip_initialized, ihipInit);
+
+    // Nop, AMD does not support variable cache configs.
+
+    return ihipLogStatus(hipSuccess);
+}
+
+
+
+//---
+/**
+ * @returns #hipSuccess
+ */
+hipError_t hipDeviceSetSharedMemConfig ( hipSharedMemConfig config )
+{
+    std::call_once(hip_initialized, ihipInit);
+
+    // Nop, AMD does not support variable shared mem configs.
+
+    return ihipLogStatus(hipSuccess);
+}
+
+
+
+//---
+/**
+ * @returns #hipSuccess
+ */
+hipError_t hipDeviceGetSharedMemConfig ( hipSharedMemConfig * pConfig )
+{
+    std::call_once(hip_initialized, ihipInit);
+
+    *pConfig = hipSharedMemBankSizeFourByte;
+
+    return ihipLogStatus(hipSuccess);
+}
+
+//---
+/**
+ * @return #hipSuccess, #hipErrorInvalidDevice
+ */
+hipError_t hipSetDevice(int device)
+{
+    HIP_INIT_API(device);
+
+    if ((device < 0) || !ihipIsValidDevice(static_cast<unsigned>(device))) {
+        return ihipLogStatus(hipErrorInvalidDevice);
+    } else {
+        tls_defaultDevice = device;
+        return ihipLogStatus(hipSuccess);
+    }
+}
+
+
+//---
+/**
+ * @return #hipSuccess
+ */
+hipError_t hipDeviceSynchronize(void)
+{
+    HIP_INIT_API();
+
+    ihipGetTlsDefaultDevice()->waitAllStreams(); // ignores non-blocking streams, this waits for all activity to finish.
+
+
+    return ihipLogStatus(hipSuccess);
+}
+
+
+//---
+/**
+ * @return @ref hipSuccess
+ */
+hipError_t hipDeviceReset(void)
+{
+    HIP_INIT_API();
+
+    ihipDevice_t *device = ihipGetTlsDefaultDevice();
+
+    // TODO-HCC
+    // This function currently does a user-level cleanup of known resources.
+    // It could benefit from KFD support to perform a more "nuclear" clean that would include any associated kernel resources and page table entries.
+
+
+    if (device) {
+        //---
+        //Wait for pending activity to complete?
+        //TODO - check if this is required behavior:
+        for (auto streamI=device->_streams.begin(); streamI!=device->_streams.end(); streamI++) {
+            ihipStream_t *stream = *streamI;
+            stream->wait();
+        }
+
+        // Release device resources (streams and memory):
+        device->reset();
+    }
+
+    return ihipLogStatus(hipSuccess);
+}
+
+/**
+ * @brief Store the value of attribute @p attr of device ordinal @p device in @p pi, read from the device's saved properties.
+ * @return #hipSuccess, #hipErrorInvalidValue (unrecognized @p attr), #hipErrorInvalidDevice (no such device)
+ */
+hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device)
+{
+    std::call_once(hip_initialized, ihipInit);
+    hipError_t e = hipSuccess;
+    ihipDevice_t * hipDevice = ihipGetDevice(device);
+    if (hipDevice) {
+        // Form &hipDevice->_props only after the NULL check; doing so through a NULL pointer is undefined behavior.
+        hipDeviceProp_t *prop = &hipDevice->_props;
+        switch (attr) {
+        case hipDeviceAttributeMaxThreadsPerBlock:
+            *pi = prop->maxThreadsPerBlock; break;
+        case hipDeviceAttributeMaxBlockDimX:
+            *pi = prop->maxThreadsDim[0]; break;
+        case hipDeviceAttributeMaxBlockDimY:
+            *pi = prop->maxThreadsDim[1]; break;
+        case hipDeviceAttributeMaxBlockDimZ:
+            *pi = prop->maxThreadsDim[2]; break;
+        case hipDeviceAttributeMaxGridDimX:
+            *pi = prop->maxGridSize[0]; break;
+        case hipDeviceAttributeMaxGridDimY:
+            *pi = prop->maxGridSize[1]; break;
+        case hipDeviceAttributeMaxGridDimZ:
+            *pi = prop->maxGridSize[2]; break;
+        case hipDeviceAttributeMaxSharedMemoryPerBlock:
+            *pi = prop->sharedMemPerBlock; break;
+        case hipDeviceAttributeTotalConstantMemory:
+            *pi = prop->totalConstMem; break;
+        case hipDeviceAttributeWarpSize:
+            *pi = prop->warpSize; break;
+        case hipDeviceAttributeMaxRegistersPerBlock:
+            *pi = prop->regsPerBlock; break;
+        case hipDeviceAttributeClockRate:
+            *pi = prop->clockRate; break;
+        case hipDeviceAttributeMemoryClockRate:
+            *pi = prop->memoryClockRate; break;
+        case hipDeviceAttributeMemoryBusWidth:
+            *pi = prop->memoryBusWidth; break;
+        case hipDeviceAttributeMultiprocessorCount:
+            *pi = prop->multiProcessorCount; break;
+        case hipDeviceAttributeComputeMode:
+            *pi = prop->computeMode; break;
+        case hipDeviceAttributeL2CacheSize:
+            *pi = prop->l2CacheSize; break;
+        case hipDeviceAttributeMaxThreadsPerMultiProcessor:
+            *pi = prop->maxThreadsPerMultiProcessor; break;
+        case hipDeviceAttributeComputeCapabilityMajor:
+            *pi = prop->major; break;
+        case hipDeviceAttributeComputeCapabilityMinor:
+            *pi = prop->minor; break;
+        case hipDeviceAttributePciBusId:
+            *pi = prop->pciBusID; break;
+        case hipDeviceAttributeConcurrentKernels:
+            *pi = prop->concurrentKernels; break;
+        case hipDeviceAttributePciDeviceId:
+            *pi = prop->pciDeviceID; break;
+        case hipDeviceAttributeMaxSharedMemoryPerMultiprocessor:
+            *pi = prop->maxSharedMemoryPerMultiProcessor; break;
+        case hipDeviceAttributeIsMultiGpuBoard:
+            *pi = prop->isMultiGpuBoard; break;
+        default:
+            e = hipErrorInvalidValue; break;
+        }
+    } else {
+        e = hipErrorInvalidDevice;
+    }
+    return ihipLogStatus(e);
+}
+
+
+/**
+ * @return #hipSuccess, #hipErrorInvalidDevice
+ * @bug HCC always returns 0 for maxThreadsPerMultiProcessor
+ * @bug HCC always returns 0 for regsPerBlock
+ * @bug HCC always returns 0 for l2CacheSize
+ */
+hipError_t hipGetDeviceProperties(hipDeviceProp_t* props, int device)
+{
+    HIP_INIT_API(props, device);
+
+    hipError_t e;
+
+    ihipDevice_t * hipDevice = ihipGetDevice(device);
+    if (hipDevice) {
+        // copy saved props
+        *props = hipDevice->_props;
+        e = hipSuccess;
+    } else {
+        e = hipErrorInvalidDevice;
+    }
+
+    return ihipLogStatus(e);
+}
 
 
 
 
@@ -2264,5 +2908,4 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a
 
 #if ONE_OBJECT_FILE
 #include "staging_buffer.cpp"
-//#include "hip_device.cpp"
 #endif
diff --git a/hipamd/src/hip_inline.h b/hipamd/src/hip_inline.h
deleted file mode 100644
index 65ebe7ceb0..0000000000
--- a/hipamd/src/hip_inline.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef HIP_INLINE_H
-#define HIP_INLINE_H
-
-
-
-#endif
diff --git a/hipamd/src/staging_buffer.cpp b/hipamd/src/staging_buffer.cpp
index c2d1210791..dc2f5d715a 100644
--- a/hipamd/src/staging_buffer.cpp
+++ b/hipamd/src/staging_buffer.cpp
@@ -1,5 +1,8 @@
+#include
+
+#include "hsa_ext_amd.h"
+
 #include "hcc_detail/staging_buffer.h"
-#include "hcc_detail/hip_util.h"
 
 #ifdef HIP_HCC
 #define THROW_ERROR(e) throw ihipException(e)