2
0

added support for __ldg

[ROCm/hip commit: 805b268ad4]
Este cometimento está contido em:
Aditya Atluri
2016-04-20 12:25:40 -05:00
ascendente 3a25eeca87
cometimento 15fc041fd6
8 ficheiros modificados com 153 adições e 58 eliminações
+1
Ver ficheiro
@@ -101,6 +101,7 @@ if(HIP_PLATFORM STREQUAL "hcc")
src/hip_device.cpp
src/hip_error.cpp
src/hip_event.cpp
src/hip_ldg.cpp
src/hip_memory.cpp
src/hip_peer.cpp
src/hip_stream.cpp
+1 -1
Ver ficheiro
@@ -200,7 +200,7 @@ if ($needHipHcc) {
if ($HIP_USE_SHARED_LIBRARY) {
$HIPLDFLAGS .= " -L$HIP_PATH/lib -Wl,--rpath=$HIP_PATH/lib -lhip_hcc";
} else {
$HIPLDFLAGS .= " $HIP_PATH/lib/device_util.cpp.o $HIP_PATH/lib/hip_device.cpp.o $HIP_PATH/lib/hip_error.cpp.o $HIP_PATH/lib/hip_event.cpp.o $HIP_PATH/lib/hip_hcc.cpp.o $HIP_PATH/lib/hip_memory.cpp.o $HIP_PATH/lib/hip_peer.cpp.o $HIP_PATH/lib/hip_stream.cpp.o $HIP_PATH/lib/staging_buffer.cpp.o";
$HIPLDFLAGS .= " $HIP_PATH/lib/device_util.cpp.o $HIP_PATH/lib/hip_device.cpp.o $HIP_PATH/lib/hip_error.cpp.o $HIP_PATH/lib/hip_event.cpp.o $HIP_PATH/lib/hip_hcc.cpp.o $HIP_PATH/lib/hip_memory.cpp.o $HIP_PATH/lib/hip_peer.cpp.o $HIP_PATH/lib/hip_stream.cpp.o $HIP_PATH/lib/staging_buffer.cpp.o $HIP_PATH/lib/hip_ldg.cpp.o";
}
}
+39
Ver ficheiro
@@ -0,0 +1,39 @@
#ifndef HIP_LDG_H
#define HIP_LDG_H
#if __HCC__
#include"hip_vector_types.h"
#include"host_defines.h"
__device__ char __ldg(const char* );
__device__ signed char __ldg(const signed char* );
__device__ short __ldg(const short* );
__device__ int __ldg(const int* );
__device__ long __ldg(const long* );
__device__ long long __ldg(const long long* );
__device__ char2 __ldg(const char2* );
__device__ char4 __ldg(const char4* );
__device__ short2 __ldg(const short2* );
__device__ short4 __ldg(const short4* );
__device__ int2 __ldg(const int2* );
__device__ int4 __ldg(const int4* );
__device__ longlong2 __ldg(const longlong2* );
__device__ unsigned char __ldg(const unsigned char* );
__device__ unsigned short __ldg(const unsigned short* );
__device__ unsigned int __ldg(const unsigned int* );
__device__ unsigned long __ldg(const unsigned long* );
__device__ unsigned long long __ldg(const unsigned long long* );
__device__ uchar2 __ldg(const uchar2* );
__device__ uchar4 __ldg(const uchar4* );
__device__ ushort2 __ldg(const ushort2* );
__device__ uint2 __ldg(const uint2* );
__device__ uint4 __ldg(const uint4* );
__device__ ulonglong2 __ldg(const ulonglong2* );
__device__ float __ldg(const float* );
__device__ double __ldg(const double* );
__device__ float2 __ldg(const float2* );
__device__ float4 __ldg(const float4* );
__device__ double2 __ldg(const double2* );
#endif
#endif
+1
Ver ficheiro
@@ -56,6 +56,7 @@ extern int HIP_TRACE_API;
#define hipLaunchParm grid_launch_parm
#ifdef __cplusplus
#include <hcc_detail/hip_texture.h>
#include <hcc_detail/hip_ldg.h>
#endif
#include <hcc_detail/host_defines.h>
// TODO-HCC remove old definitions ; ~1602 hcc supports __HCC_ACCELERATOR__ define.
+18 -18
Ver ficheiro
@@ -1236,12 +1236,12 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const
bool dstTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) == AM_SUCCESS);
bool srcTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) == AM_SUCCESS);
bool srcInDeviceMem = srcPtrInfo._isInDeviceMem;
bool dstInDeviceMem = dstPtrInfo._isInDeviceMem;
// Resolve default to a specific Kind so we know which algorithm to use:
if (kind == hipMemcpyDefault) {
bool srcInDeviceMem = (srcTracked && srcPtrInfo._isInDeviceMem);
bool dstInDeviceMem = (dstTracked && dstPtrInfo._isInDeviceMem);
kind = resolveMemcpyDirection(srcTracked, dstTracked, srcPtrInfo._isInDeviceMem, dstPtrInfo._isInDeviceMem);
kind = resolveMemcpyDirection(srcTracked, dstTracked, srcInDeviceMem, dstInDeviceMem);
};
hsa_signal_t depSignal;
@@ -1259,26 +1259,26 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const
if (kind == hipMemcpyHostToDevice) {
int depSignalCnt = preCopyCommand(crit, NULL, &depSignal, ihipCommandCopyH2D);
if (HIP_STAGING_BUFFERS) {
tprintf(DB_COPY1, "D2H && !dstTracked: staged copy H2D dst=%p src=%p sz=%zu\n", dst, src, sizeBytes);
if (HIP_STAGING_BUFFERS) {
tprintf(DB_COPY1, "D2H && !dstTracked: staged copy H2D dst=%p src=%p sz=%zu\n", dst, src, sizeBytes);
if (HIP_PININPLACE) {
device->_staging_buffer[0]->CopyHostToDevicePinInPlace(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL);
} else {
device->_staging_buffer[0]->CopyHostToDevice(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL);
}
if (HIP_PININPLACE) {
device->_staging_buffer[0]->CopyHostToDevicePinInPlace(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL);
} else {
device->_staging_buffer[0]->CopyHostToDevice(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL);
}
// The copy waits for inputs and then completes before returning so can reset queue to empty:
this->wait(crit, true);
} else {
// TODO - remove, slow path.
tprintf(DB_COPY1, "H2D && ! srcTracked: am_copy dst=%p src=%p sz=%zu\n", dst, src, sizeBytes);
// The copy waits for inputs and then completes before returning so can reset queue to empty:
this->wait(crit, true);
} else {
// TODO - remove, slow path.
tprintf(DB_COPY1, "H2D && ! srcTracked: am_copy dst=%p src=%p sz=%zu\n", dst, src, sizeBytes);
#if USE_AV_COPY
_av.copy(src,dst,sizeBytes);
_av.copy(src,dst,sizeBytes);
#else
hc::am_copy(dst, src, sizeBytes);
hc::am_copy(dst, src, sizeBytes);
#endif
}
}
} else if (kind == hipMemcpyDeviceToHost) {
int depSignalCnt = preCopyCommand(crit, NULL, &depSignal, ihipCommandCopyD2H);
if (HIP_STAGING_BUFFERS) {
+42
Ver ficheiro
@@ -0,0 +1,42 @@
#include"hcc_detail/hip_ldg.h"
__device__ char __ldg(const char* ptr)
{
return *ptr;
}
__device__ signed char __ldg(const signed char* ptr)
{
return ptr[0];
}
__device__ short __ldg(const short* ptr)
{
return ptr[0];
}
__device__ int __ldg(const int* ptr)
{
return ptr[0];
}
__device__ long long __ldg(const long long* ptr)
{
return ptr[0];
}
__device__ int2 __ldg(const int2* ptr)
{
return ptr[0];
}
__device__ int4 __ldg(const int4* ptr)
{
return ptr[0];
}
__device__ float __ldg(const float* ptr)
{
return ptr[0];
}
+4 -4
Ver ficheiro
@@ -9,13 +9,13 @@ OBJECTS=$(SOURCES:.cpp=.o)
EXECUTABLE=hipMemset
$(EXECUTABLE): $(HIP_DEPS) $(OBJECTS)
$(HCC) $(HLDFLAGS) $(OBJECTS) -o $@
$(HCC) $(HLDFLAGS) $(OBJECTS) -o $@
.cpp.o:
$(HCC) $(HCFLAGS) -c $< -o $@
@$(CC) -MM -MT $@ $(CFLAGS) -c $< > $(@:.o=.d)
$(HCC) $(HCFLAGS) -c $< -o $@
@$(CC) -MM -MT $@ $(CFLAGS) -c $< > $(@:.o=.d)
clean: hip_clean
rm -rf $(EXECUTABLE) $(OBJECTS)
rm -rf $(EXECUTABLE) $(OBJECTS)
include $(HIP_PATH)/examples/common/hip.epilogue.make
+47 -35
Ver ficheiro
@@ -30,18 +30,19 @@ THE SOFTWARE.
#define HIP_ASSERT(x) (assert((x)==hipSuccess))
#define WIDTH 1024
#define HEIGHT 1024
#define WIDTH 8
#define HEIGHT 8
#define NUM (WIDTH*HEIGHT)
#define THREADS_PER_BLOCK_X 16
#define THREADS_PER_BLOCK_Y 16
#define THREADS_PER_BLOCK_X 8
#define THREADS_PER_BLOCK_Y 8
#define THREADS_PER_BLOCK_Z 1
template<typename T>
__global__ void
vectoradd_float(hipLaunchParm lp,
float* a, const float* bm, const float* cm, int width, int height)
T* a, const T* bm, const T* cm, int width, int height)
{
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
@@ -72,46 +73,35 @@ __kernel__ void vectoradd_float(float* a, const float* b, const float* c, int wi
using namespace std;
int main() {
float* hostA;
float* hostB;
float* hostC;
float* deviceA;
float* deviceB;
float* deviceC;
hipDeviceProp_t devProp;
hipGetDeviceProperties(&devProp, 0);
cout << " System minor " << devProp.minor << endl;
cout << " System major " << devProp.major << endl;
cout << " agent prop name " << devProp.name << endl;
cout << "__ldg " << endl ;
template<typename T>
bool dataTypesRun(){
T* hostA;
T* hostB;
T* hostC;
T* deviceA;
T* deviceB;
T* deviceC;
int i;
int errors;
hostA = (float*)malloc(NUM * sizeof(float));
hostB = (float*)malloc(NUM * sizeof(float));
hostC = (float*)malloc(NUM * sizeof(float));
hostA = (T*)malloc(NUM * sizeof(T));
hostB = (T*)malloc(NUM * sizeof(T));
hostC = (T*)malloc(NUM * sizeof(T));
// initialize the input data
for (i = 0; i < NUM; i++) {
hostB[i] = (float)i;
hostC[i] = (float)i*100.0f;
hostB[i] = (T)i;
hostC[i] = (T)i;
}
HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(float)));
HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(float)));
HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(float)));
HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(T)));
HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T)));
HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(T)));
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(float), hipMemcpyHostToDevice));
HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(float), hipMemcpyHostToDevice));
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(T), hipMemcpyHostToDevice));
hipLaunchKernel(vectoradd_float,
@@ -121,8 +111,9 @@ int main() {
deviceA ,deviceB ,deviceC ,WIDTH ,HEIGHT);
HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(float), hipMemcpyDeviceToHost));
HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(T), hipMemcpyDeviceToHost));
bool ret = false;
// verify the results
errors = 0;
for (i = 0; i < NUM; i++) {
@@ -132,8 +123,10 @@ int main() {
}
if (errors!=0) {
printf("FAILED: %d errors\n",errors);
ret = false;
} else {
printf ("PASSED!\n");
ret = true;
}
HIP_ASSERT(hipFree(deviceA));
@@ -144,6 +137,25 @@ int main() {
free(hostB);
free(hostC);
return ret;
}
int main() {
hipDeviceProp_t devProp;
hipGetDeviceProperties(&devProp, 0);
cout << " System minor " << devProp.minor << endl;
cout << " System major " << devProp.major << endl;
cout << " agent prop name " << devProp.name << endl;
int errors;
errors = dataTypesRun<char>();
errors = dataTypesRun<signed char>();
errors = dataTypesRun<short>();
errors = dataTypesRun<int>();
cout << "__ldg " << endl ;
//hipResetDefaultAccelerator();
return errors;