From 89bfc0e375460c6a58aa6e13d43319590eb30da6 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Wed, 23 Mar 2016 07:54:01 -0500 Subject: [PATCH] WIP added modular feature for device apis --- CMakeLists.txt | 24 + bin/hipcc | 32 +- include/hcc_detail/hip_common.h | 334 ++++++++++++++ include/hcc_detail/hip_inline.h | 67 +++ include/hcc_detail/hip_util.h | 22 + include/hcc_detail/staging_buffer.h | 2 +- include/hcc_detail/trace_helper.h | 3 +- src/hip_device.cpp | 278 ++++++++++++ src/hip_hcc.cpp | 673 +--------------------------- src/hip_inline.h | 6 + src/staging_buffer.cpp | 5 +- tests/src/CMakeLists.txt | 4 +- 12 files changed, 767 insertions(+), 683 deletions(-) create mode 100644 CMakeLists.txt create mode 100644 include/hcc_detail/hip_common.h create mode 100644 include/hcc_detail/hip_inline.h create mode 100644 include/hcc_detail/hip_util.h create mode 100644 src/hip_device.cpp create mode 100644 src/hip_inline.h diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000000..a62a33c340 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,24 @@ +cmake_minimum_required(VERSION 2.6) +project(hip_hcc) + +set(HCC_PATH "/opt/hcc") +set(HSA_PATH "/opt/hsa") + +include_directories(${PROJECT_SOURCE_DIR}/include) + +set(CMAKE_CXX_COMPILER "hcc") +set(CMAKE_C_COMPILER "hcc") + +set(CMAKE_C_FLAGS " -hc -I${HCC_PATH}/include -I${HSA_PATH}/include -I${HIP_PATH}/include -stdlib=libc++ ") +set(CMAKE_CXX_FLAGS " -hc -I${HCC_PATH}/include -I${HSA_PATH}/include -I${HIP_PATH}/include -stdlib=libc++ ") + +set(SOURCE_FILES src/hip_hcc.cpp src/hip_device.cpp src/staging_buffer.cpp) + +add_library(hip_hcc STATIC ${SOURCE_FILES}) + +install(TARGETS hip_hcc DESTINATION /opt/hip/lib) +install(DIRECTORY src DESTINATION /opt/hip/src) +install(DIRECTORY bin DESTINATION /opt/hip/bin) +install(DIRECTORY include DESTINATION /opt/hip/include) + + diff --git a/bin/hipcc b/bin/hipcc index 1c850bbfe0..17efaf2280 100755 --- a/bin/hipcc +++ b/bin/hipcc @@ -173,30 +173,30 @@ if (($verbose & 0x18) and ($HIP_PLATFORM eq 'hcc')) { if ($needHipHcc) { # See if we need to remake the .o file: - my $source = "$HIP_PATH/src/hip_hcc.cpp" ; - my $object = "$HIP_PATH/src/hip_hcc.o" ; +# my $source = "$HIP_PATH/src/hip_hcc.cpp" ; +# my $object = "$HIP_PATH/src/hip_hcc.o" ; # remake if object does not exist, or if source is newer than object: - if ((not -e $object) or ((stat($source))[9] > (stat($object))[9])) { - my $CMD = "$HCC $HCCFLAGS -I$HSA_PATH/include -I$HIP_PATH/include -Wall -c $source -o $object"; - if ($verbose & 0x10) { - $CMD .= " -g -O0" ; - } else { - $CMD .= " -O3" ; - } +# if ((not -e $object) or ((stat($source))[9] > (stat($object))[9])) { +# my $CMD = "$HCC $HCCFLAGS -I$HSA_PATH/include -I$HIP_PATH/include -Wall -c $source -o $object"; +# if ($verbose & 0x10) { +# $CMD .= " -g -O0" ; +# } else { +# $CMD .= " -O3" ; +# } - $CMD .= " -I$marker_path/include"; +# $CMD .= " -I$marker_path/include"; - if ($verbose & 0x1) { - print "remake-deps:", $CMD, "\n"; - } - system ("$CMD") and die ("remake-deps failed"); +# if ($verbose & 0x1) { +# print "remake-deps:", $CMD, "\n"; +# } +# system ("$CMD") and die ("remake-deps failed"); - } +# } - $HIPLDFLAGS .= " $HIP_PATH/src/hip_hcc.o" ; + $HIPLDFLAGS .= " -L/opt/hip/lib -lhip_hcc" ; } # hipcc currrently requires separate compilation of source files, ie it is not possible to pass diff --git a/include/hcc_detail/hip_common.h b/include/hcc_detail/hip_common.h new file mode 100644 index 0000000000..e45f26e551 --- /dev/null +++ b/include/hcc_detail/hip_common.h @@ -0,0 +1,334 @@ +#pragma once +#ifndef HIP_COMMON_H +#define HIP_COMMON_H + +#include "hcc_detail/hip_util.h" +#include "hcc_detail/staging_buffer.h" + +#define HIP_INIT_API(...) \ + std::call_once(hip_initialized, ihipInit);\ + API_TRACE(__VA_ARGS__); + +const char *ihipErrorString(hipError_t); + +#define USE_AV_COPY 0 + +#define INLINE static inline + +//--- +// Environment variables: + +// Intended to distinguish whether an environment variable should be visible only in debug mode, or in debug+release. +//static const int debug = 0; +static const int release = 1; +static unsigned g_deviceCnt; + + +int HIP_LAUNCH_BLOCKING = 0; + +int HIP_PRINT_ENV = 0; +int HIP_TRACE_API= 0; +int HIP_DB= 0; +int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */ +int HIP_STAGING_BUFFERS = 2; // TODO - remove, two buffers should be enough. +int HIP_PININPLACE = 0; +int HIP_STREAM_SIGNALS = 2; /* number of signals to allocate at stream creation */ +static int HIP_VISIBLE_DEVICES = 0; /* Contains a comma-separated sequence of GPU identifiers */ + + + +//--- +// Chicken bits for disabling functionality to work around potential issues: +int HIP_DISABLE_HW_KERNEL_DEP = 1; +int HIP_DISABLE_HW_COPY_DEP = 1; + + +// Color defs for debug messages: +#define KNRM "\x1B[0m" +#define KRED "\x1B[31m" +#define KGRN "\x1B[32m" +#define KYEL "\x1B[33m" +#define KBLU "\x1B[34m" +#define KMAG "\x1B[35m" +#define KCYN "\x1B[36m" +#define KWHT "\x1B[37m" + +#define API_COLOR KGRN + + +#define HIP_HCC + +// If set, thread-safety is enforced on all stream functions. +// Stream functions will acquire a mutex before entering critical sections. +#define STREAM_THREAD_SAFE 1 + +// If FORCE_COPY_DEP=1 , HIP runtime will add +// synchronization for copy commands in the same stream, regardless of command type. +// If FORCE_COPY_DEP=0 data copies of the same kind (H2H, H2D, D2H, D2D) are assumed to be implicitly ordered. +// ROCR runtime implementation currently provides this guarantee when using SDMA queues but not +// when using shader queues. +// TODO - measure if this matters for performance, in particular for back-to-back small copies. +// If not, we can simplify the copy dependency tracking by collapsing to a single Copy type, and always forcing dependencies for copy commands. +#define FORCE_SAMEDIR_COPY_DEP 1 + + +// Compile debug trace mode - this prints debug messages to stderr when env var HIP_DB is set. +// May be set to 0 to remove debug if checks - possible code size and performance difference? +#define COMPILE_HIP_DB 1 + + +// Compile HIP tracing capability. +// 0x1 = print a string at function entry with arguments. +// 0x2 = prints a simple message with function name + return code when function exits. +// 0x3 = print both. +// Must be enabled at runtime with HIP_TRACE_API +#define COMPILE_HIP_TRACE_API 0x3 + + +// Compile code that generates trace markers for CodeXL ATP at HIP function begin/end. +// ATP is standard CodeXL format that includes timestamps for kernels, HSA RT APIs, and HIP APIs. +#ifndef COMPILE_TRACE_MARKER +#define COMPILE_TRACE_MARKER 0 +#endif + + +// #include CPP files to produce one object file +#define ONE_OBJECT_FILE 1 + + +// Compile support for trace markers that are displayed on CodeXL GUI at start/stop of each function boundary. +// TODO - currently we print the trace message at the beginning. if we waited, we could also include return codes, and any values returned +// through ptr-to-args (ie the pointers allocated by hipMalloc). +#if COMPILE_TRACE_MARKER +#include "AMDTActivityLogger.h" +#define SCOPED_MARKER(markerName,group,userString) amdtScopedMarker(markerName, group, userString) +#else +// Swallow scoped markers: +#define SCOPED_MARKER(markerName,group,userString) +#endif + + +#if COMPILE_TRACE_MARKER || (COMPILE_HIP_TRACE_API & 0x1) +#define API_TRACE(...)\ +{\ + std::string s = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')';\ + if (COMPILE_HIP_DB && HIP_TRACE_API) {\ + fprintf (stderr, API_COLOR "<>\n" KNRM, (_hip_status == 0) ? API_COLOR:KRED, __func__, _hip_status, ihipErrorString(_hip_status));\ + }\ + _hip_status;\ + }) + + + +extern thread_local hipError_t tls_lastHipError; +const char *ihipErrorString(hipError_t); +typedef uint64_t SIGSEQNUM; +// Used to remove lock, for performance or stimulating bugs. +class FakeMutex +{ + public: + void lock() { } + bool try_lock() {return true; } + void unlock() { } +}; + + +#if STREAM_THREAD_SAFE +typedef std::mutex StreamMutex; +#else +typedef FakeMutex StreamMutex; +#endif + + +enum ihipCommand_t { + ihipCommandCopyH2H, + ihipCommandCopyH2D, + ihipCommandCopyD2H, + ihipCommandCopyD2D, + ihipCommandKernel, +}; + +struct ihipSignal_t; +struct ihipStream_t; +struct ihipDevice_t; +extern ihipDevice_t *g_devices; + +//--- +// Small wrapper around signals. +// Designed to be used from stream. +// TODO-someday refactor this class so it can be stored in a vector<> +// we already store the index here so we can use for garbage collection. +struct ihipSignal_t { + hsa_signal_t _hsa_signal; // hsa signal handle + int _index; // Index in pool, used for garbage collection. + SIGSEQNUM _sig_id; // unique sequentially increasing ID. + + ihipSignal_t(); + ~ihipSignal_t(); + + inline void release(); +}; + +// Internal stream structure. + +class ihipStream_t { +public: +typedef uint64_t SeqNum_t ; + + ihipStream_t(unsigned device_index, hc::accelerator_view av, SeqNum_t id, unsigned int flags); + ~ihipStream_t(); + + + void copySync (void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind); + void copyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind); + + //--- + // Thread-safe accessors - these acquire / release mutex: + inline bool preKernelCommand(); + inline void postKernelCommand(hc::completion_future &kernel_future); + + inline int preCopyCommand(ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType); + + inline void reclaimSignals_ts(SIGSEQNUM sigNum); + inline void wait(bool assertQueueEmpty=false); + + + + // Non-threadsafe accessors - must be protected by high-level stream lock: + inline SIGSEQNUM lastCopySeqId() { return _last_copy_signal ? _last_copy_signal->_sig_id : 0; }; + ihipSignal_t * allocSignal(); + + + //-- Non-racy accessors: + // These functions access fields set at initialization time and are non-racy (so do not acquire mutex) + inline ihipDevice_t * getDevice() const; + StreamMutex & mutex() {return _mutex;}; + + //--- + //Member vars - these are set at initialization: + SeqNum_t _id; // monotonic sequence ID + hc::accelerator_view _av; + unsigned _flags; +private: + void enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal); + inline void waitCopy(ihipSignal_t *signal); + + + hipMemcpyKind resolveMemcpyDirection(bool srcInDeviceMem, bool dstInDeviceMem); + void setCopyAgents(hipMemcpyKind kind, ihipCommand_t *commandType, hsa_agent_t *srcAgent, hsa_agent_t *dstAgent); + + //--- + + unsigned _device_index; + ihipCommand_t _last_command_type; // type of the last command + + // signal of last copy command sent to the stream. + // May be NULL, indicating the previous command has completley finished and future commands don't need to create a dependency. + // Copy can be either H2D or D2H. + ihipSignal_t *_last_copy_signal; + hc::completion_future _last_kernel_future; // Completion future of last kernel command sent to GPU. + + int _signalCursor; + + SIGSEQNUM _stream_sig_id; // Monotonically increasing unique signal id. + SIGSEQNUM _oldest_live_sig_id; // oldest live seq_id, anything < this can be allocated. + std::deque _signalPool; // Pool of signals for use by this stream. + + StreamMutex _mutex; +}; + + +struct ihipDevice_t +{ + unsigned _device_index; // index into g_devices. + + hipDeviceProp_t _props; // saved device properties. + hc::accelerator _acc; + hsa_agent_t _hsa_agent; // hsa agent handle + + // The NULL stream is used if no other stream is specified. + // NULL has special synchronization properties with other streams. + ihipStream_t *_default_stream; + + std::list _streams; // streams associated with this device. + + unsigned _compute_units; + + StagingBuffer *_staging_buffer[2]; // one buffer for each direction. + + ihipStream_t::SeqNum_t _stream_id; + +public: + void init(unsigned device_index, hc::accelerator acc); + ~ihipDevice_t(); + void reset(); + hipError_t getProperties(hipDeviceProp_t* prop); + + inline void waitAllStreams(); + inline void syncDefaultStream(bool waitOnSelf); + +private: + +}; + + +#endif diff --git a/include/hcc_detail/hip_inline.h b/include/hcc_detail/hip_inline.h new file mode 100644 index 0000000000..3e2ae0fce7 --- /dev/null +++ b/include/hcc_detail/hip_inline.h @@ -0,0 +1,67 @@ +#ifndef HIP_INLINE_H +#define HIP_INLINE_H + +#include "trace_helper.h" + +#define INLINE static inline +extern ihipDevice_t *g_devices; +extern thread_local int tls_defaultDevice; +extern const hipStream_t hipStreamNull; + +INLINE bool ihipIsValidDevice(unsigned deviceIndex) +{ + // deviceIndex is unsigned so always > 0 + return (deviceIndex < g_deviceCnt); +} + +/*// check if the device ID is set as visible*/ +//INLINE bool ihipIsVisibleDevice(unsigned deviceIndex) +//{ + //return std::find(g_hip_visible_devices.begin(), g_hip_visible_devices.end(), + //(int)deviceIndex) != g_hip_visible_devices.end(); +/*}*/ + +//--- +INLINE ihipDevice_t *ihipGetTlsDefaultDevice() +{ + // If this is invalid, the TLS state is corrupt. + // This can fire if called before devices are initialized. + // TODO - consider replacing assert with error code + assert (ihipIsValidDevice(tls_defaultDevice)); + + return &g_devices[tls_defaultDevice]; +} + + +//--- +INLINE ihipDevice_t *ihipGetDevice(int deviceId) +{ + if ((deviceId >= 0) && (deviceId < g_deviceCnt)) { + return &g_devices[deviceId]; + } else { + return NULL; + } + +} + +inline hipStream_t ihipSyncAndResolveStream(hipStream_t stream) +{ + if (stream == hipStreamNull ) { + ihipDevice_t *device = ihipGetTlsDefaultDevice(); + +#ifndef HIP_API_PER_THREAD_DEFAULT_STREAM + device->syncDefaultStream(false); +#endif + return device->_default_stream; + } else { + // Have to wait for legacy default stream to be empty: + if (!(stream->_flags & hipStreamNonBlocking)) { + tprintf(DB_SYNC, "stream %p wait default stream\n", stream); + stream->getDevice()->_default_stream->wait(); + } + + return stream; + } +} + +#endif diff --git a/include/hcc_detail/hip_util.h b/include/hcc_detail/hip_util.h new file mode 100644 index 0000000000..2e89dd4e8d --- /dev/null +++ b/include/hcc_detail/hip_util.h @@ -0,0 +1,22 @@ +#ifndef HIP_UTIL_H +#define HIP_UTIL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "hip_runtime.h" + +#include "hsa_ext_amd.h" + +#endif diff --git a/include/hcc_detail/staging_buffer.h b/include/hcc_detail/staging_buffer.h index 23a748e2cc..f570842cc8 100644 --- a/include/hcc_detail/staging_buffer.h +++ b/include/hcc_detail/staging_buffer.h @@ -1,6 +1,6 @@ #pragma once -#include "hsa.h" +#include "hcc_detail/hip_util.h" //------------------------------------------------------------------------------------------------- diff --git a/include/hcc_detail/trace_helper.h b/include/hcc_detail/trace_helper.h index a4542f76da..43d37f9c4e 100644 --- a/include/hcc_detail/trace_helper.h +++ b/include/hcc_detail/trace_helper.h @@ -1,7 +1,7 @@ #include #include #include - +#include "hcc_detail/hip_common.h" //--- // Helper functions to convert HIP function arguments into strings. // Handles POD data types as well as enumerations (ie hipMemcpyKind). @@ -12,7 +12,6 @@ // Handy macro to convert an enumeration to a stringified version of same: #define CASE_STR(x) case x: return #x; - // Building block functions: template std::string ToHexString(T v) diff --git a/src/hip_device.cpp b/src/hip_device.cpp new file mode 100644 index 0000000000..8f176ab7e9 --- /dev/null +++ b/src/hip_device.cpp @@ -0,0 +1,278 @@ +#include "hcc_detail/hip_runtime.h" +//#include "hcc_detail/trace_helper.h" +#include "hcc_detail/hip_inline.h" +#include "hsa.h" +extern std::once_flag hip_initialized; +extern void ihipInit(); +extern thread_local int tls_defaultDevice; + +//--- +/** + * @returns #hipSuccess + */ +hipError_t hipDeviceGetCacheConfig ( hipFuncCache *cacheConfig ) +{ + std::call_once(hip_initialized, ihipInit); + + *cacheConfig = hipFuncCachePreferNone; + + return ihipLogStatus(hipSuccess); +} + + + +//--- +/** + * @returns #hipSuccess + */ +hipError_t hipDeviceGetSharedMemConfig ( hipSharedMemConfig * pConfig ) +{ + std::call_once(hip_initialized, ihipInit); + + *pConfig = hipSharedMemBankSizeFourByte; + + return ihipLogStatus(hipSuccess); +} + + +//--- +/** + * @return @ref hipSuccess + */ +hipError_t hipDeviceReset(void) +{ + HIP_INIT_API(); + + ihipDevice_t *device = ihipGetTlsDefaultDevice(); + + // TODO-HCC + // This function currently does a user-level cleanup of known resources. + // It could benefit from KFD support to perform a more "nuclear" clean that would include any associated kernel resources and page table entries. + + + if (device) { + //--- + //Wait for pending activity to complete? + //TODO - check if this is required behavior: + for (auto streamI=device->_streams.begin(); streamI!=device->_streams.end(); streamI++) { + ihipStream_t *stream = *streamI; + stream->wait(); + } + + // Release device resources (streams and memory): + device->reset(); + } + + return ihipLogStatus(hipSuccess); +} + + +//--- +/** + * @returns #hipSuccess + */ +hipError_t hipDeviceSetCacheConfig ( hipFuncCache cacheConfig ) +{ + std::call_once(hip_initialized, ihipInit); + + // Nop, AMD does not support variable cache configs. + + return ihipLogStatus(hipSuccess); +} + + +//--- +/** + * @returns #hipSuccess + */ +hipError_t hipDeviceSetSharedMemConfig ( hipSharedMemConfig config ) +{ + std::call_once(hip_initialized, ihipInit); + + // Nop, AMD does not support variable shared mem configs. + + return ihipLogStatus(hipSuccess); +} + +//--- +/** + * @return #hipSuccess + */ +hipError_t hipDeviceSynchronize(void) +{ + HIP_INIT_API(); + + ihipGetTlsDefaultDevice()->waitAllStreams(); // ignores non-blocking streams, this waits for all activity to finish. + + + return ihipLogStatus(hipSuccess); +} + +//--- +/** + * @return #hipSuccess + */ +hipError_t hipGetDevice(int *device) +{ + HIP_INIT_API(device); + + *device = tls_defaultDevice; + return ihipLogStatus(hipSuccess); +} + +//--- +/** + * @return #hipSuccess, #hipErrorNoDevice + */ +hipError_t hipGetDeviceCount(int *count) +{ + HIP_INIT_API(count); + + *count = g_deviceCnt; + + if (*count > 0) { + return ihipLogStatus(hipSuccess); + } else { + return ihipLogStatus(hipErrorNoDevice); + } +} + + +/** + * @return #hipSuccess, #hipErrorInvalidDevice + * @bug HCC always returns 0 for maxThreadsPerMultiProcessor + * @bug HCC always returns 0 for regsPerBlock + * @bug HCC always returns 0 for l2CacheSize + */ +hipError_t hipGetDeviceProperties(hipDeviceProp_t* props, int device) +{ + HIP_INIT_API(props, device); + + hipError_t e; + + ihipDevice_t * hipDevice = ihipGetDevice(device); + if (hipDevice) { + // copy saved props + *props = hipDevice->_props; + e = hipSuccess; + } else { + e = hipErrorInvalidDevice; + } + + return ihipLogStatus(e); +} + + +//--- +/** + * @return #hipSuccess, #hipErrorInvalidDevice + */ +hipError_t hipSetDevice(int device) +{ + HIP_INIT_API(device); + + if ((device < 0) || (device >= g_deviceCnt)) { + return ihipLogStatus(hipErrorInvalidDevice); + } else { + tls_defaultDevice = device; + return ihipLogStatus(hipSuccess); + } +} + +/** + * + */ +hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) +{ + std::call_once(hip_initialized, ihipInit); + + hipError_t e = hipSuccess; + + ihipDevice_t * hipDevice = ihipGetDevice(device); + hipDeviceProp_t *prop = &hipDevice->_props; + if (hipDevice) { + switch (attr) { + case hipDeviceAttributeMaxThreadsPerBlock: + *pi = prop->maxThreadsPerBlock; break; + case hipDeviceAttributeMaxBlockDimX: + *pi = prop->maxThreadsDim[0]; break; + case hipDeviceAttributeMaxBlockDimY: + *pi = prop->maxThreadsDim[1]; break; + case hipDeviceAttributeMaxBlockDimZ: + *pi = prop->maxThreadsDim[2]; break; + case hipDeviceAttributeMaxGridDimX: + *pi = prop->maxGridSize[0]; break; + case hipDeviceAttributeMaxGridDimY: + *pi = prop->maxGridSize[1]; break; + case hipDeviceAttributeMaxGridDimZ: + *pi = prop->maxGridSize[2]; break; + case hipDeviceAttributeMaxSharedMemoryPerBlock: + *pi = prop->sharedMemPerBlock; break; + case hipDeviceAttributeTotalConstantMemory: + *pi = prop->totalConstMem; break; + case hipDeviceAttributeWarpSize: + *pi = prop->warpSize; break; + case hipDeviceAttributeMaxRegistersPerBlock: + *pi = prop->regsPerBlock; break; + case hipDeviceAttributeClockRate: + *pi = prop->clockRate; break; + case hipDeviceAttributeMemoryClockRate: + *pi = prop->memoryClockRate; break; + case hipDeviceAttributeMemoryBusWidth: + *pi = prop->memoryBusWidth; break; + case hipDeviceAttributeMultiprocessorCount: + *pi = prop->multiProcessorCount; break; + case hipDeviceAttributeComputeMode: + *pi = prop->computeMode; break; + case hipDeviceAttributeL2CacheSize: + *pi = prop->l2CacheSize; break; + case hipDeviceAttributeMaxThreadsPerMultiProcessor: + *pi = prop->maxThreadsPerMultiProcessor; break; + case hipDeviceAttributeComputeCapabilityMajor: + *pi = prop->major; break; + case hipDeviceAttributeComputeCapabilityMinor: + *pi = prop->minor; break; + case hipDeviceAttributePciBusId: + *pi = prop->pciBusID; break; + case hipDeviceAttributeConcurrentKernels: + *pi = prop->concurrentKernels; break; + case hipDeviceAttributePciDeviceId: + *pi = prop->pciDeviceID; break; + case hipDeviceAttributeMaxSharedMemoryPerMultiprocessor: + *pi = prop->maxSharedMemoryPerMultiProcessor; break; + case hipDeviceAttributeIsMultiGpuBoard: + *pi = prop->isMultiGpuBoard; break; + default: + e = hipErrorInvalidValue; break; + } + } else { + e = hipErrorInvalidDevice; + } + return ihipLogStatus(e); +} + + +/** + * @return #hipSuccess, #hipErrorInvalidDevice + * @bug HCC always returns 0 for maxThreadsPerMultiProcessor + * @bug HCC always returns 0 for regsPerBlock + * @bug HCC always returns 0 for l2CacheSize + */ +hipError_t hipGetDeviceProperties(hipDeviceProp_t* props, int device) +{ + HIP_INIT_API(props, device); + + hipError_t e; + + ihipDevice_t * hipDevice = ihipGetDevice(device); + if (hipDevice) { + // copy saved props + *props = hipDevice->_props; + e = hipSuccess; + } else { + e = hipErrorInvalidDevice; + } + + return ihipLogStatus(e); +} + diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 0ef8b3407b..bc1f314574 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -25,191 +25,19 @@ THE SOFTWARE. * Contains definitions for functions that are large enough that we don't want to inline them everywhere. * This file is compiled and linked into apps running HIP / HCC path. */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "hip_runtime.h" - -#include "hsa_ext_amd.h" // HIP includes: #define HIP_HCC #include "hcc_detail/staging_buffer.h" - - +#include "hcc_detail/hip_common.h" +#include "hcc_detail/hip_util.h" +#include "hcc_detail/hip_inline.h" // TODO, re-org header order. extern const char *ihipErrorString(hipError_t hip_error); -#include "hcc_detail/trace_helper.h" // #define USE_MEMCPYTOSYMBOL // //Use the new HCC accelerator_view::copy instead of am_copy -#define USE_AV_COPY 0 - -#define INLINE static inline - -//--- -// Environment variables: - -// Intended to distinguish whether an environment variable should be visible only in debug mode, or in debug+release. -//static const int debug = 0; -static const int release = 1; - - -int HIP_LAUNCH_BLOCKING = 0; - -int HIP_PRINT_ENV = 0; -int HIP_TRACE_API= 0; -int HIP_DB= 0; -int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */ -int HIP_STAGING_BUFFERS = 2; // TODO - remove, two buffers should be enough. -int HIP_PININPLACE = 0; -int HIP_STREAM_SIGNALS = 2; /* number of signals to allocate at stream creation */ -int HIP_VISIBLE_DEVICES = 0; /* Contains a comma-separated sequence of GPU identifiers */ - - - -//--- -// Chicken bits for disabling functionality to work around potential issues: -int HIP_DISABLE_HW_KERNEL_DEP = 1; -int HIP_DISABLE_HW_COPY_DEP = 1; - - -// Color defs for debug messages: -#define KNRM "\x1B[0m" -#define KRED "\x1B[31m" -#define KGRN "\x1B[32m" -#define KYEL "\x1B[33m" -#define KBLU "\x1B[34m" -#define KMAG "\x1B[35m" -#define KCYN "\x1B[36m" -#define KWHT "\x1B[37m" - -#define API_COLOR KGRN - - -#define HIP_HCC - -// If set, thread-safety is enforced on all stream functions. -// Stream functions will acquire a mutex before entering critical sections. -#define STREAM_THREAD_SAFE 1 - -// If FORCE_COPY_DEP=1 , HIP runtime will add -// synchronization for copy commands in the same stream, regardless of command type. -// If FORCE_COPY_DEP=0 data copies of the same kind (H2H, H2D, D2H, D2D) are assumed to be implicitly ordered. -// ROCR runtime implementation currently provides this guarantee when using SDMA queues but not -// when using shader queues. -// TODO - measure if this matters for performance, in particular for back-to-back small copies. -// If not, we can simplify the copy dependency tracking by collapsing to a single Copy type, and always forcing dependencies for copy commands. -#define FORCE_SAMEDIR_COPY_DEP 1 - - -// Compile debug trace mode - this prints debug messages to stderr when env var HIP_DB is set. -// May be set to 0 to remove debug if checks - possible code size and performance difference? -#define COMPILE_HIP_DB 1 - - -// Compile HIP tracing capability. -// 0x1 = print a string at function entry with arguments. -// 0x2 = prints a simple message with function name + return code when function exits. -// 0x3 = print both. -// Must be enabled at runtime with HIP_TRACE_API -#define COMPILE_HIP_TRACE_API 0x3 - - -// Compile code that generates trace markers for CodeXL ATP at HIP function begin/end. -// ATP is standard CodeXL format that includes timestamps for kernels, HSA RT APIs, and HIP APIs. -#ifndef COMPILE_TRACE_MARKER -#define COMPILE_TRACE_MARKER 0 -#endif - - -// #include CPP files to produce one object file -#define ONE_OBJECT_FILE 1 - - -// Compile support for trace markers that are displayed on CodeXL GUI at start/stop of each function boundary. -// TODO - currently we print the trace message at the beginning. if we waited, we could also include return codes, and any values returned -// through ptr-to-args (ie the pointers allocated by hipMalloc). -#if COMPILE_TRACE_MARKER -#include "AMDTActivityLogger.h" -#define SCOPED_MARKER(markerName,group,userString) amdtScopedMarker(markerName, group, userString) -#else -// Swallow scoped markers: -#define SCOPED_MARKER(markerName,group,userString) -#endif - - -#if COMPILE_TRACE_MARKER || (COMPILE_HIP_TRACE_API & 0x1) -#define API_TRACE(...)\ -{\ - std::string s = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')';\ - if (COMPILE_HIP_DB && HIP_TRACE_API) {\ - fprintf (stderr, API_COLOR "< -// we already store the index here so we can use for garbage collection. -struct ihipSignal_t { - hsa_signal_t _hsa_signal; // hsa signal handle - int _index; // Index in pool, used for garbage collection. - SIGSEQNUM _sig_id; // unique sequentially increasing ID. - - ihipSignal_t(); - ~ihipSignal_t(); - - inline void release(); -}; - - -// Used to remove lock, for performance or stimulating bugs. -class FakeMutex -{ - public: - void lock() { } - bool try_lock() {return true; } - void unlock() { } -}; - - -#if STREAM_THREAD_SAFE -typedef std::mutex StreamMutex; -#else -typedef FakeMutex StreamMutex; -#endif // TODO - move async copy code into stream? Stream->async-copy. @@ -281,71 +67,7 @@ typedef FakeMutex StreamMutex; // Internal stream structure. -class ihipStream_t { -public: -typedef uint64_t SeqNum_t ; - ihipStream_t(unsigned device_index, hc::accelerator_view av, SeqNum_t id, unsigned int flags); - ~ihipStream_t(); - - - void copySync (void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind); - void copyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind); - - //--- - // Thread-safe accessors - these acquire / release mutex: - inline bool preKernelCommand(); - inline void postKernelCommand(hc::completion_future &kernel_future); - - inline int preCopyCommand(ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType); - - inline void reclaimSignals_ts(SIGSEQNUM sigNum); - inline void wait(bool assertQueueEmpty=false); - - - - // Non-threadsafe accessors - must be protected by high-level stream lock: - inline SIGSEQNUM lastCopySeqId() { return _last_copy_signal ? _last_copy_signal->_sig_id : 0; }; - ihipSignal_t * allocSignal(); - - - //-- Non-racy accessors: - // These functions access fields set at initialization time and are non-racy (so do not acquire mutex) - inline ihipDevice_t * getDevice() const; - StreamMutex & mutex() {return _mutex;}; - - //--- - //Member vars - these are set at initialization: - SeqNum_t _id; // monotonic sequence ID - hc::accelerator_view _av; - unsigned _flags; -private: - void enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal); - inline void waitCopy(ihipSignal_t *signal); - - - hipMemcpyKind resolveMemcpyDirection(bool srcInDeviceMem, bool dstInDeviceMem); - void setCopyAgents(hipMemcpyKind kind, ihipCommand_t *commandType, hsa_agent_t *srcAgent, hsa_agent_t *dstAgent); - - //--- - - unsigned _device_index; - ihipCommand_t _last_command_type; // type of the last command - - // signal of last copy command sent to the stream. - // May be NULL, indicating the previous command has completley finished and future commands don't need to create a dependency. - // Copy can be either H2D or D2H. - ihipSignal_t *_last_copy_signal; - hc::completion_future _last_kernel_future; // Completion future of last kernel command sent to GPU. - - int _signalCursor; - - SIGSEQNUM _stream_sig_id; // Monotonically increasing unique signal id. - SIGSEQNUM _oldest_live_sig_id; // oldest live seq_id, anything < this can be allocated. - std::deque _signalPool; // Pool of signals for use by this stream. - - StreamMutex _mutex; -}; @@ -377,38 +99,7 @@ struct ihipEvent_t { //------------------------------------------------------------------------------------------------- -struct ihipDevice_t -{ - unsigned _device_index; // index into g_devices. - hipDeviceProp_t _props; // saved device properties. - hc::accelerator _acc; - hsa_agent_t _hsa_agent; // hsa agent handle - - // The NULL stream is used if no other stream is specified. - // NULL has special synchronization properties with other streams. - ihipStream_t *_default_stream; - - std::list _streams; // streams associated with this device. - - unsigned _compute_units; - - StagingBuffer *_staging_buffer[2]; // one buffer for each direction. - - ihipStream_t::SeqNum_t _stream_id; - -public: - void init(unsigned device_index, hc::accelerator acc); - ~ihipDevice_t(); - void reset(); - hipError_t getProperties(hipDeviceProp_t* prop); - - inline void waitAllStreams(); - inline void syncDefaultStream(bool waitOnSelf); - -private: - -}; //================================================================================================= @@ -422,7 +113,7 @@ thread_local int tls_defaultDevice = 0; std::once_flag hip_initialized; ihipDevice_t *g_devices; // Array of all non-emulated (ie GPU) accelerators in the system. bool g_visible_device = false; // Set the flag when HIP_VISIBLE_DEVICES is set -unsigned g_deviceCnt; +//unsigned g_deviceCnt; std::vector g_hip_visible_devices; /* vector of integers that contains the visible device IDs */ hsa_agent_t g_cpu_agent ; // the CPU agent. //================================================================================================= @@ -431,7 +122,6 @@ hsa_agent_t g_cpu_agent ; // the CPU agent. //================================================================================================= //Forward Declarations: //================================================================================================= -INLINE bool ihipIsValidDevice(unsigned deviceIndex); //================================================================================================= // Implementation: @@ -489,6 +179,16 @@ ihipStream_t::~ihipStream_t() _signalPool.clear(); } +//--- +inline ihipDevice_t * ihipStream_t::getDevice() const +{ + if (ihipIsValidDevice(_device_index)) { + return &g_devices[_device_index]; + } else { + return NULL; + } +}; + //--- @@ -535,16 +235,6 @@ void ihipStream_t::wait(bool assertQueueEmpty) }; -//--- -inline ihipDevice_t * ihipStream_t::getDevice() const -{ - if (ihipIsValidDevice(_device_index)) { - return &g_devices[_device_index]; - } else { - return NULL; - } -}; - //--- // Allocate a new signal from the signal pool. @@ -1039,16 +729,6 @@ void ihipDevice_t::waitAllStreams() -#define ihipLogStatus(_hip_status) \ - ({\ - tls_lastHipError = _hip_status;\ - \ - if ((COMPILE_HIP_TRACE_API & 0x2) && HIP_TRACE_API) {\ - fprintf(stderr, " %ship-api: %-30s ret=%2d (%s)>>\n" KNRM, (_hip_status == 0) ? API_COLOR:KRED, __func__, _hip_status, ihipErrorString(_hip_status));\ - }\ - _hip_status;\ - }) - // Read environment variables. @@ -1233,67 +913,6 @@ void ihipInit() } -INLINE bool ihipIsValidDevice(unsigned deviceIndex) -{ - // deviceIndex is unsigned so always > 0 - return (deviceIndex < g_deviceCnt); -} - -/*// check if the device ID is set as visible*/ -//INLINE bool ihipIsVisibleDevice(unsigned deviceIndex) -//{ - //return std::find(g_hip_visible_devices.begin(), g_hip_visible_devices.end(), - //(int)deviceIndex) != g_hip_visible_devices.end(); -/*}*/ - -//--- -INLINE ihipDevice_t *ihipGetTlsDefaultDevice() -{ - // If this is invalid, the TLS state is corrupt. - // This can fire if called before devices are initialized. - // TODO - consider replacing assert with error code - assert (ihipIsValidDevice(tls_defaultDevice)); - - return &g_devices[tls_defaultDevice]; -} - - -//--- -INLINE ihipDevice_t *ihipGetDevice(int deviceId) -{ - if ((deviceId >= 0) && (deviceId < g_deviceCnt)) { - return &g_devices[deviceId]; - } else { - return NULL; - } - -} - -//--- -// Get the stream to use for a command submission. -// -// If stream==NULL synchronize appropriately with other streams and return the default av for the device. -// If stream is valid, return the AV to use. -inline hipStream_t ihipSyncAndResolveStream(hipStream_t stream) -{ - if (stream == hipStreamNull ) { - ihipDevice_t *device = ihipGetTlsDefaultDevice(); - -#ifndef HIP_API_PER_THREAD_DEFAULT_STREAM - device->syncDefaultStream(false); -#endif - return device->_default_stream; - } else { - // Have to wait for legacy default stream to be empty: - if (!(stream->_flags & hipStreamNonBlocking)) { - tprintf(DB_SYNC, "stream %p wait default stream\n", stream); - stream->getDevice()->_default_stream->wait(); - } - - return stream; - } -} - // TODO - data-up to data-down: // Called just before a kernel is launched from hipLaunchKernel. @@ -1336,270 +955,7 @@ void ihipPostLaunchKernel(hipStream_t stream, hc::completion_future &kernelFutur //--- -//------------------------------------------------------------------------------------------------- -//Devices -//------------------------------------------------------------------------------------------------- -//--- -/** - * @return #hipSuccess - */ -hipError_t hipGetDevice(int *device) -{ - HIP_INIT_API(device); - *device = tls_defaultDevice; - return ihipLogStatus(hipSuccess); -} - - -//--- -/** - * @return #hipSuccess, #hipErrorNoDevice - */ -hipError_t hipGetDeviceCount(int *count) -{ - HIP_INIT_API(count); - - *count = g_deviceCnt; - - if (*count > 0) { - return ihipLogStatus(hipSuccess); - } else { - return ihipLogStatus(hipErrorNoDevice); - } -} - - -//--- -/** - * @returns #hipSuccess - */ -hipError_t hipDeviceSetCacheConfig ( hipFuncCache cacheConfig ) -{ - std::call_once(hip_initialized, ihipInit); - - // Nop, AMD does not support variable cache configs. - - return ihipLogStatus(hipSuccess); -} - - -//--- -/** - * @returns #hipSuccess - */ -hipError_t hipDeviceGetCacheConfig ( hipFuncCache *cacheConfig ) -{ - std::call_once(hip_initialized, ihipInit); - - *cacheConfig = hipFuncCachePreferNone; - - return ihipLogStatus(hipSuccess); -} - - -//--- -/** - * @returns #hipSuccess - */ -hipError_t hipFuncSetCacheConfig ( hipFuncCache cacheConfig ) -{ - std::call_once(hip_initialized, ihipInit); - - // Nop, AMD does not support variable cache configs. - - return ihipLogStatus(hipSuccess); -} - - - -//--- -/** - * @returns #hipSuccess - */ -hipError_t hipDeviceSetSharedMemConfig ( hipSharedMemConfig config ) -{ - std::call_once(hip_initialized, ihipInit); - - // Nop, AMD does not support variable shared mem configs. - - return ihipLogStatus(hipSuccess); -} - - - -//--- -/** - * @returns #hipSuccess - */ -hipError_t hipDeviceGetSharedMemConfig ( hipSharedMemConfig * pConfig ) -{ - std::call_once(hip_initialized, ihipInit); - - *pConfig = hipSharedMemBankSizeFourByte; - - return ihipLogStatus(hipSuccess); -} - -//--- -/** - * @return #hipSuccess, #hipErrorInvalidDevice - */ -hipError_t hipSetDevice(int device) -{ - HIP_INIT_API(device); - - if ((device < 0) || (device >= g_deviceCnt)) { - return ihipLogStatus(hipErrorInvalidDevice); - } else { - tls_defaultDevice = device; - return ihipLogStatus(hipSuccess); - } -} - - -//--- -/** - * @return #hipSuccess - */ -hipError_t hipDeviceSynchronize(void) -{ - HIP_INIT_API(); - - ihipGetTlsDefaultDevice()->waitAllStreams(); // ignores non-blocking streams, this waits for all activity to finish. - - - return ihipLogStatus(hipSuccess); -} - - -//--- -/** - * @return @ref hipSuccess - */ -hipError_t hipDeviceReset(void) -{ - HIP_INIT_API(); - - ihipDevice_t *device = ihipGetTlsDefaultDevice(); - - // TODO-HCC - // This function currently does a user-level cleanup of known resources. - // It could benefit from KFD support to perform a more "nuclear" clean that would include any associated kernel resources and page table entries. - - - if (device) { - //--- - //Wait for pending activity to complete? - //TODO - check if this is required behavior: - for (auto streamI=device->_streams.begin(); streamI!=device->_streams.end(); streamI++) { - ihipStream_t *stream = *streamI; - stream->wait(); - } - - // Release device resources (streams and memory): - device->reset(); - } - - return ihipLogStatus(hipSuccess); -} - -/** - * - */ -hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) -{ - std::call_once(hip_initialized, ihipInit); - - hipError_t e = hipSuccess; - - ihipDevice_t * hipDevice = ihipGetDevice(device); - hipDeviceProp_t *prop = &hipDevice->_props; - if (hipDevice) { - switch (attr) { - case hipDeviceAttributeMaxThreadsPerBlock: - *pi = prop->maxThreadsPerBlock; break; - case hipDeviceAttributeMaxBlockDimX: - *pi = prop->maxThreadsDim[0]; break; - case hipDeviceAttributeMaxBlockDimY: - *pi = prop->maxThreadsDim[1]; break; - case hipDeviceAttributeMaxBlockDimZ: - *pi = prop->maxThreadsDim[2]; break; - case hipDeviceAttributeMaxGridDimX: - *pi = prop->maxGridSize[0]; break; - case hipDeviceAttributeMaxGridDimY: - *pi = prop->maxGridSize[1]; break; - case hipDeviceAttributeMaxGridDimZ: - *pi = prop->maxGridSize[2]; break; - case hipDeviceAttributeMaxSharedMemoryPerBlock: - *pi = prop->sharedMemPerBlock; break; - case hipDeviceAttributeTotalConstantMemory: - *pi = prop->totalConstMem; break; - case hipDeviceAttributeWarpSize: - *pi = prop->warpSize; break; - case hipDeviceAttributeMaxRegistersPerBlock: - *pi = prop->regsPerBlock; break; - case hipDeviceAttributeClockRate: - *pi = prop->clockRate; break; - case hipDeviceAttributeMemoryClockRate: - *pi = prop->memoryClockRate; break; - case hipDeviceAttributeMemoryBusWidth: - *pi = prop->memoryBusWidth; break; - case hipDeviceAttributeMultiprocessorCount: - *pi = prop->multiProcessorCount; break; - case hipDeviceAttributeComputeMode: - *pi = prop->computeMode; break; - case hipDeviceAttributeL2CacheSize: - *pi = prop->l2CacheSize; break; - case hipDeviceAttributeMaxThreadsPerMultiProcessor: - *pi = prop->maxThreadsPerMultiProcessor; break; - case hipDeviceAttributeComputeCapabilityMajor: - *pi = prop->major; break; - case hipDeviceAttributeComputeCapabilityMinor: - *pi = prop->minor; break; - case hipDeviceAttributePciBusId: - *pi = prop->pciBusID; break; - case hipDeviceAttributeConcurrentKernels: - *pi = prop->concurrentKernels; break; - case hipDeviceAttributePciDeviceId: - *pi = prop->pciDeviceID; break; - case hipDeviceAttributeMaxSharedMemoryPerMultiprocessor: - *pi = prop->maxSharedMemoryPerMultiProcessor; break; - case hipDeviceAttributeIsMultiGpuBoard: - *pi = prop->isMultiGpuBoard; break; - default: - e = hipErrorInvalidValue; break; - } - } else { - e = hipErrorInvalidDevice; - } - return ihipLogStatus(e); -} - - -/** - * @return #hipSuccess, #hipErrorInvalidDevice - * @bug HCC always returns 0 for maxThreadsPerMultiProcessor - * @bug HCC always returns 0 for regsPerBlock - * @bug HCC always returns 0 for l2CacheSize - */ -hipError_t hipGetDeviceProperties(hipDeviceProp_t* props, int device) -{ - HIP_INIT_API(props, device); - - hipError_t e; - - ihipDevice_t * hipDevice = ihipGetDevice(device); - if (hipDevice) { - // copy saved props - *props = hipDevice->_props; - e = hipSuccess; - } else { - e = hipErrorInvalidDevice; - } - - return ihipLogStatus(e); -} @@ -2908,4 +2264,5 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a #if ONE_OBJECT_FILE #include "staging_buffer.cpp" +//#include "hip_device.cpp" #endif diff --git a/src/hip_inline.h b/src/hip_inline.h new file mode 100644 index 0000000000..65ebe7ceb0 --- /dev/null +++ b/src/hip_inline.h @@ -0,0 +1,6 @@ +#ifndef HIP_INLINE_H +#define HIP_INLINE_H + + + +#endif diff --git a/src/staging_buffer.cpp b/src/staging_buffer.cpp index dc2f5d715a..c2d1210791 100644 --- a/src/staging_buffer.cpp +++ b/src/staging_buffer.cpp @@ -1,8 +1,5 @@ -#include - -#include "hsa_ext_amd.h" - #include "hcc_detail/staging_buffer.h" +#include "hcc_detail/hip_util.h" #ifdef HIP_HCC #define THROW_ERROR(e) throw ihipException(e) diff --git a/tests/src/CMakeLists.txt b/tests/src/CMakeLists.txt index 12c784cefc..bfc1845238 100644 --- a/tests/src/CMakeLists.txt +++ b/tests/src/CMakeLists.txt @@ -43,8 +43,8 @@ if (${HIP_PLATFORM} STREQUAL "hcc") # hip_hcc.o: #add_library(hip_hcc STATIC ${HIP_PATH}/src/hip_hcc.cpp ) - add_library(hip_hcc STATIC ${HIP_PATH}/src/hip_hcc.cpp ${HIP_PATH}/src/staging_buffer.cpp) - target_include_directories(hip_hcc PRIVATE ${HSA_PATH}/include ${CODEXL_SDK_ATAL_PATH}/include) +# add_library(hip_hcc STATIC ${HIP_PATH}/src/hip_hcc.cpp ${HIP_PATH}/src/staging_buffer.cpp) +# target_include_directories(hip_hcc PRIVATE ${HSA_PATH}/include ${CODEXL_SDK_ATAL_PATH}/include) elseif (${HIP_PLATFORM} STREQUAL "nvcc")