diff --git a/projects/hip/CMakeLists.txt b/projects/hip/CMakeLists.txt index 25459766ba..9b62c20e59 100644 --- a/projects/hip/CMakeLists.txt +++ b/projects/hip/CMakeLists.txt @@ -101,6 +101,7 @@ if(HIP_PLATFORM STREQUAL "hcc") src/hip_device.cpp src/hip_error.cpp src/hip_event.cpp + src/hip_ldg.cpp src/hip_memory.cpp src/hip_peer.cpp src/hip_stream.cpp diff --git a/projects/hip/bin/hipcc b/projects/hip/bin/hipcc index 22e2dbaa55..c62fd504a9 100755 --- a/projects/hip/bin/hipcc +++ b/projects/hip/bin/hipcc @@ -200,7 +200,7 @@ if ($needHipHcc) { if ($HIP_USE_SHARED_LIBRARY) { $HIPLDFLAGS .= " -L$HIP_PATH/lib -Wl,--rpath=$HIP_PATH/lib -lhip_hcc"; } else { - $HIPLDFLAGS .= " $HIP_PATH/lib/device_util.cpp.o $HIP_PATH/lib/hip_device.cpp.o $HIP_PATH/lib/hip_error.cpp.o $HIP_PATH/lib/hip_event.cpp.o $HIP_PATH/lib/hip_hcc.cpp.o $HIP_PATH/lib/hip_memory.cpp.o $HIP_PATH/lib/hip_peer.cpp.o $HIP_PATH/lib/hip_stream.cpp.o $HIP_PATH/lib/staging_buffer.cpp.o"; + $HIPLDFLAGS .= " $HIP_PATH/lib/device_util.cpp.o $HIP_PATH/lib/hip_device.cpp.o $HIP_PATH/lib/hip_error.cpp.o $HIP_PATH/lib/hip_event.cpp.o $HIP_PATH/lib/hip_hcc.cpp.o $HIP_PATH/lib/hip_memory.cpp.o $HIP_PATH/lib/hip_peer.cpp.o $HIP_PATH/lib/hip_stream.cpp.o $HIP_PATH/lib/staging_buffer.cpp.o $HIP_PATH/lib/hip_ldg.cpp.o"; } } diff --git a/projects/hip/include/hcc_detail/hip_ldg.h b/projects/hip/include/hcc_detail/hip_ldg.h new file mode 100644 index 0000000000..93ce12635c --- /dev/null +++ b/projects/hip/include/hcc_detail/hip_ldg.h @@ -0,0 +1,39 @@ +#ifndef HIP_LDG_H +#define HIP_LDG_H + +#if __HCC__ +#include"hip_vector_types.h" +#include"host_defines.h" +__device__ char __ldg(const char* ); +__device__ signed char __ldg(const signed char* ); +__device__ short __ldg(const short* ); +__device__ int __ldg(const int* ); +__device__ long __ldg(const long* ); +__device__ long long __ldg(const long long* ); +__device__ char2 __ldg(const char2* ); +__device__ char4 __ldg(const char4* ); +__device__ short2 __ldg(const short2* ); +__device__ short4 __ldg(const short4* ); +__device__ int2 __ldg(const int2* ); +__device__ int4 __ldg(const int4* ); +__device__ longlong2 __ldg(const longlong2* ); +__device__ unsigned char __ldg(const unsigned char* ); +__device__ unsigned short __ldg(const unsigned short* ); +__device__ unsigned int __ldg(const unsigned int* ); +__device__ unsigned long __ldg(const unsigned long* ); +__device__ unsigned long long __ldg(const unsigned long long* ); +__device__ uchar2 __ldg(const uchar2* ); +__device__ uchar4 __ldg(const uchar4* ); +__device__ ushort2 __ldg(const ushort2* ); +__device__ uint2 __ldg(const uint2* ); +__device__ uint4 __ldg(const uint4* ); +__device__ ulonglong2 __ldg(const ulonglong2* ); +__device__ float __ldg(const float* ); +__device__ double __ldg(const double* ); +__device__ float2 __ldg(const float2* ); +__device__ float4 __ldg(const float4* ); +__device__ double2 __ldg(const double2* ); + +#endif + +#endif diff --git a/projects/hip/include/hcc_detail/hip_runtime.h b/projects/hip/include/hcc_detail/hip_runtime.h index 003ededca0..232319c710 100644 --- a/projects/hip/include/hcc_detail/hip_runtime.h +++ b/projects/hip/include/hcc_detail/hip_runtime.h @@ -56,6 +56,7 @@ extern int HIP_TRACE_API; #define hipLaunchParm grid_launch_parm #ifdef __cplusplus #include +#include #endif #include // TODO-HCC remove old definitions ; ~1602 hcc supports __HCC_ACCELERATOR__ define. diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 04bc92421e..91a26bf971 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -1236,12 +1236,12 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const bool dstTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) == AM_SUCCESS); bool srcTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) == AM_SUCCESS); + bool srcInDeviceMem = srcPtrInfo._isInDeviceMem; + bool dstInDeviceMem = dstPtrInfo._isInDeviceMem; // Resolve default to a specific Kind so we know which algorithm to use: if (kind == hipMemcpyDefault) { - bool srcInDeviceMem = (srcTracked && srcPtrInfo._isInDeviceMem); - bool dstInDeviceMem = (dstTracked && dstPtrInfo._isInDeviceMem); - kind = resolveMemcpyDirection(srcTracked, dstTracked, srcPtrInfo._isInDeviceMem, dstPtrInfo._isInDeviceMem); + kind = resolveMemcpyDirection(srcTracked, dstTracked, srcInDeviceMem, dstInDeviceMem); }; hsa_signal_t depSignal; @@ -1259,26 +1259,26 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const if (kind == hipMemcpyHostToDevice) { int depSignalCnt = preCopyCommand(crit, NULL, &depSignal, ihipCommandCopyH2D); - if (HIP_STAGING_BUFFERS) { - tprintf(DB_COPY1, "D2H && !dstTracked: staged copy H2D dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); + if (HIP_STAGING_BUFFERS) { + tprintf(DB_COPY1, "D2H && !dstTracked: staged copy H2D dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); - if (HIP_PININPLACE) { - device->_staging_buffer[0]->CopyHostToDevicePinInPlace(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); - } else { - device->_staging_buffer[0]->CopyHostToDevice(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); - } + if (HIP_PININPLACE) { + device->_staging_buffer[0]->CopyHostToDevicePinInPlace(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + } else { + device->_staging_buffer[0]->CopyHostToDevice(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + } - // The copy waits for inputs and then completes before returning so can reset queue to empty: - this->wait(crit, true); - } else { - // TODO - remove, slow path. - tprintf(DB_COPY1, "H2D && ! srcTracked: am_copy dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); + // The copy waits for inputs and then completes before returning so can reset queue to empty: + this->wait(crit, true); + } else { + // TODO - remove, slow path. + tprintf(DB_COPY1, "H2D && ! srcTracked: am_copy dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); #if USE_AV_COPY - _av.copy(src,dst,sizeBytes); + _av.copy(src,dst,sizeBytes); #else - hc::am_copy(dst, src, sizeBytes); + hc::am_copy(dst, src, sizeBytes); #endif - } + } } else if (kind == hipMemcpyDeviceToHost) { int depSignalCnt = preCopyCommand(crit, NULL, &depSignal, ihipCommandCopyD2H); if (HIP_STAGING_BUFFERS) { diff --git a/projects/hip/src/hip_ldg.cpp b/projects/hip/src/hip_ldg.cpp new file mode 100644 index 0000000000..14c461f746 --- /dev/null +++ b/projects/hip/src/hip_ldg.cpp @@ -0,0 +1,42 @@ +#include"hcc_detail/hip_ldg.h" + +__device__ char __ldg(const char* ptr) +{ + return *ptr; +} + +__device__ signed char __ldg(const signed char* ptr) +{ + return ptr[0]; +} + +__device__ short __ldg(const short* ptr) +{ + return ptr[0]; +} + +__device__ int __ldg(const int* ptr) +{ + return ptr[0]; +} + +__device__ long long __ldg(const long long* ptr) +{ + return ptr[0]; +} + + +__device__ int2 __ldg(const int2* ptr) +{ + return ptr[0]; +} + +__device__ int4 __ldg(const int4* ptr) +{ + return ptr[0]; +} + +__device__ float __ldg(const float* ptr) +{ + return ptr[0]; +} diff --git a/projects/hip/tests/src/Makefile b/projects/hip/tests/src/Makefile index cd3025d387..829d45a08c 100644 --- a/projects/hip/tests/src/Makefile +++ b/projects/hip/tests/src/Makefile @@ -9,13 +9,13 @@ OBJECTS=$(SOURCES:.cpp=.o) EXECUTABLE=hipMemset $(EXECUTABLE): $(HIP_DEPS) $(OBJECTS) - $(HCC) $(HLDFLAGS) $(OBJECTS) -o $@ + $(HCC) $(HLDFLAGS) $(OBJECTS) -o $@ .cpp.o: - $(HCC) $(HCFLAGS) -c $< -o $@ - @$(CC) -MM -MT $@ $(CFLAGS) -c $< > $(@:.o=.d) + $(HCC) $(HCFLAGS) -c $< -o $@ + @$(CC) -MM -MT $@ $(CFLAGS) -c $< > $(@:.o=.d) clean: hip_clean - rm -rf $(EXECUTABLE) $(OBJECTS) + rm -rf $(EXECUTABLE) $(OBJECTS) include $(HIP_PATH)/examples/common/hip.epilogue.make diff --git a/projects/hip/tests/src/hip_ldg.cpp b/projects/hip/tests/src/hip_ldg.cpp index 2f281c5991..612eae32e8 100644 --- a/projects/hip/tests/src/hip_ldg.cpp +++ b/projects/hip/tests/src/hip_ldg.cpp @@ -30,18 +30,19 @@ THE SOFTWARE. #define HIP_ASSERT(x) (assert((x)==hipSuccess)) -#define WIDTH 1024 -#define HEIGHT 1024 +#define WIDTH 8 +#define HEIGHT 8 #define NUM (WIDTH*HEIGHT) -#define THREADS_PER_BLOCK_X 16 -#define THREADS_PER_BLOCK_Y 16 +#define THREADS_PER_BLOCK_X 8 +#define THREADS_PER_BLOCK_Y 8 #define THREADS_PER_BLOCK_Z 1 +template __global__ void vectoradd_float(hipLaunchParm lp, - float* a, const float* bm, const float* cm, int width, int height) + T* a, const T* bm, const T* cm, int width, int height) { int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; @@ -72,46 +73,35 @@ __kernel__ void vectoradd_float(float* a, const float* b, const float* c, int wi using namespace std; -int main() { - - float* hostA; - float* hostB; - float* hostC; - - float* deviceA; - float* deviceB; - float* deviceC; - - hipDeviceProp_t devProp; - hipGetDeviceProperties(&devProp, 0); - cout << " System minor " << devProp.minor << endl; - cout << " System major " << devProp.major << endl; - cout << " agent prop name " << devProp.name << endl; - - - - cout << "__ldg " << endl ; +template +bool dataTypesRun(){ + T* hostA; + T* hostB; + T* hostC; + T* deviceA; + T* deviceB; + T* deviceC; int i; int errors; - hostA = (float*)malloc(NUM * sizeof(float)); - hostB = (float*)malloc(NUM * sizeof(float)); - hostC = (float*)malloc(NUM * sizeof(float)); + hostA = (T*)malloc(NUM * sizeof(T)); + hostB = (T*)malloc(NUM * sizeof(T)); + hostC = (T*)malloc(NUM * sizeof(T)); // initialize the input data for (i = 0; i < NUM; i++) { - hostB[i] = (float)i; - hostC[i] = (float)i*100.0f; + hostB[i] = (T)i; + hostC[i] = (T)i; } - HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(float))); - HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(float))); - HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(float))); + HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(T))); + HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T))); + HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(T))); - HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(float), hipMemcpyHostToDevice)); - HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(float), hipMemcpyHostToDevice)); + HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice)); + HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(T), hipMemcpyHostToDevice)); hipLaunchKernel(vectoradd_float, @@ -121,8 +111,9 @@ int main() { deviceA ,deviceB ,deviceC ,WIDTH ,HEIGHT); - HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(float), hipMemcpyDeviceToHost)); + HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost)); + bool ret = false; // verify the results errors = 0; for (i = 0; i < NUM; i++) { @@ -132,8 +123,10 @@ int main() { } if (errors!=0) { printf("FAILED: %d errors\n",errors); + ret = false; } else { printf ("PASSED!\n"); + ret = true; } HIP_ASSERT(hipFree(deviceA)); @@ -144,6 +137,25 @@ int main() { free(hostB); free(hostC); + return ret; + +} + + +int main() { + + hipDeviceProp_t devProp; + hipGetDeviceProperties(&devProp, 0); + cout << " System minor " << devProp.minor << endl; + cout << " System major " << devProp.major << endl; + cout << " agent prop name " << devProp.name << endl; + + int errors; + errors = dataTypesRun(); + errors = dataTypesRun(); + errors = dataTypesRun(); + errors = dataTypesRun(); + cout << "__ldg " << endl ; //hipResetDefaultAccelerator(); return errors;