Add hipHostMallocCoherent, hipHostMallocNonCoherent

Provide per-allocation control over coherent/non-coherent mem.
These overrid the default HIP_COHERENT_HOST_ALLOC setting.
Bu işleme şunda yer alıyor:
Ben Sander
2017-05-24 00:48:10 -05:00
ebeveyn d0ef9d8462
işleme 75f691ec2f
6 değiştirilmiş dosya ile 104 ekleme ve 26 silme
+7 -3
Dosyayı Görüntüle
@@ -111,17 +111,21 @@ enum hipLimit_t
//! Flags that can be used with hipHostMalloc
#define hipHostMallocDefault 0x0
#define hipHostMallocPortable 0x1
#define hipHostMallocMapped 0x2
#define hipHostMallocPortable 0x1 ///< Memory is considered allocated by all contexts.
#define hipHostMallocMapped 0x2 ///< Map the allocation into the address space for the current device. The device pointer can be obtained with #hipHostGetDevicePointer.
#define hipHostMallocWriteCombined 0x4
#define hipHostMallocCoherent 0x40000000 ///< Allocate coherent memory. Overrides HIP_COHERENT_HOST_ALLOC for specific allocation.
#define hipHostMallocNonCoherent 0x80000000 ///< Allocate non-coherent memory. Overrides HIP_COHERENT_HOST_ALLOC for specific allocation.
//! Flags that can be used with hipHostRegister
#define hipHostRegisterDefault 0x0 ///< Memory is Mapped and Portable
#define hipHostRegisterPortable 0x1 ///< Memory is considered registered by all contexts. HIP only supports one context so this is always assumed true.
#define hipHostRegisterPortable 0x1 ///< Memory is considered registered by all contexts.
#define hipHostRegisterMapped 0x2 ///< Map the allocation into the address space for the current device. The device pointer can be obtained with #hipHostGetDevicePointer.
#define hipHostRegisterIoMemory 0x4 ///< Not supported.
#define hipDeviceScheduleAuto 0x0 ///< Automatically select between Spin and Yield
#define hipDeviceScheduleSpin 0x1 ///< Dedicate a CPU core to spin-wait. Provides lowest latency, but burns a CPU core and may consume more power.
#define hipDeviceScheduleYield 0x2 ///< Yield the CPU to the operating system when waiting. May increase latency, but lowers power and is friendlier to other threads in the system.
+2
Dosyayı Görüntüle
@@ -65,6 +65,8 @@ hipMemcpyHostToHost
#define hipHostMallocPortable cudaHostAllocPortable
#define hipHostMallocMapped cudaHostAllocMapped
#define hipHostMallocWriteCombined cudaHostAllocWriteCombined
#define hipHostMallocCoherent 0x0
#define hipHostMallocNonCoherent 0x0
#define hipHostRegisterPortable cudaHostRegisterPortable
#define hipHostRegisterMapped cudaHostRegisterMapped
+1 -1
Dosyayı Görüntüle
@@ -74,7 +74,7 @@ int HIP_PROFILE_API= 0;
std::string HIP_DB_START_API;
std::string HIP_DB_STOP_API;
int HIP_DB= 0;
int HIP_VISIBLE_DEVICES = 0; /* Contains a comma-separated sequence of GPU identifiers */
int HIP_VISIBLE_DEVICES = 0;
int HIP_NUM_KERNELS_INFLIGHT = 128;
int HIP_WAIT_MODE = 0;
+25 -6
Dosyayı Görüntüle
@@ -267,17 +267,36 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
trueFlags = hipHostMallocMapped | hipHostMallocPortable;
}
const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped | hipHostMallocWriteCombined;
if (flags & ~supportedFlags) {
const unsigned supportedFlags = hipHostMallocPortable
| hipHostMallocMapped
| hipHostMallocWriteCombined
| hipHostMallocCoherent
| hipHostMallocNonCoherent;
const unsigned coherencyFlags = hipHostMallocCoherent | hipHostMallocNonCoherent;
if ((flags & ~supportedFlags) ||
((flags & coherencyFlags) == coherencyFlags)) {
*ptr = nullptr;
// can't specify unsupported flags, can't specify both Coherent + NonCoherent
hip_status = hipErrorInvalidValue;
}
else {
} else {
auto device = ctx->getWriteableDevice();
unsigned amFlags = HIP_COHERENT_HOST_ALLOC ? amHostCoherent : amHostPinned;
unsigned amFlags = 0;
if (flags & hipHostMallocCoherent) {
amFlags = amHostCoherent;
} else if (flags & hipHostMallocNonCoherent) {
amFlags = amHostPinned;
} else {
// depends on env variables:
amFlags = HIP_COHERENT_HOST_ALLOC ? amHostCoherent : amHostPinned;
}
*ptr = hip_internal::allocAndSharePtr(HIP_COHERENT_HOST_ALLOC ? "finegrained_host":"pinned_host",
*ptr = hip_internal::allocAndSharePtr((amFlags & amHostCoherent) ? "finegrained_host":"pinned_host",
sizeBytes, ctx, (trueFlags & hipHostMallocPortable) /*shareWithAll*/, amFlags, flags);
if(sizeBytes && (*ptr == NULL)){
+67 -16
Dosyayı Görüntüle
@@ -31,14 +31,19 @@
#define LEN 1024*1024
#define SIZE LEN*sizeof(float)
__global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd){
__global__ void Add(float *Ad, float *Bd, float *Cd){
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
Cd[tx] = Ad[tx] + Bd[tx];
}
__global__ void Set(int *Ad, int val){
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
Ad[tx] = val;
}
int main(){
float *A, *B, *C;
float *Ad, *Bd, *Cd;
hipDeviceProp_t prop;
int device;
@@ -49,26 +54,72 @@ int main(){
failed("Does support HostPinned Memory");
}
HIPCHECK(hipHostMalloc((void**)&A, SIZE, hipHostMallocWriteCombined | hipHostMallocMapped));
HIPCHECK(hipHostMalloc((void**)&B, SIZE, hipHostMallocDefault));
HIPCHECK(hipHostMalloc((void**)&C, SIZE, hipHostMallocMapped));
HIPCHECK(hipHostGetDevicePointer((void**)&Ad, A, 0));
HIPCHECK(hipHostGetDevicePointer((void**)&Cd, C, 0));
{
float *A, *B, *C;
float *Ad, *Bd, *Cd;
HIPCHECK(hipHostMalloc((void**)&A, SIZE, hipHostMallocWriteCombined | hipHostMallocMapped));
HIPCHECK(hipHostMalloc((void**)&B, SIZE, hipHostMallocDefault));
HIPCHECK(hipHostMalloc((void**)&C, SIZE, hipHostMallocMapped));
for(int i=0;i<LEN;i++){
A[i] = 1.0f;
B[i] = 2.0f;
HIPCHECK(hipHostGetDevicePointer((void**)&Ad, A, 0));
HIPCHECK(hipHostGetDevicePointer((void**)&Cd, C, 0));
for(int i=0;i<LEN;i++){
A[i] = 1.0f;
B[i] = 2.0f;
}
HIPCHECK(hipMalloc((void**)&Bd, SIZE));
HIPCHECK(hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice));
dim3 dimGrid(LEN/512,1,1);
dim3 dimBlock(512,1,1);
hipLaunchKernelGGL(Add, dimGrid, dimBlock, 0, 0, Ad, Bd, Cd);
HIPCHECK(hipDeviceSynchronize());
HIPCHECK(hipHostFree(A));
HIPCHECK(hipHostFree(B));
HIPCHECK(hipHostFree(C));
}
HIPCHECK(hipMalloc((void**)&Bd, SIZE));
HIPCHECK(hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice));
{
int *A, *B;
int numElements = 1024*16;
size_t sizeBytes = numElements * sizeof (int);
#ifdef __HIP_PLATFORM_HCC__
HIPCHECK_API(hipHostMalloc((void**)&A, sizeBytes, hipHostMallocCoherent|hipHostMallocNonCoherent), hipErrorInvalidValue);
dim3 dimGrid(LEN/512,1,1);
dim3 dimBlock(512,1,1);
assert (A == 0);
#endif
hipLaunchKernel(HIP_KERNEL_NAME(Add), dimGrid, dimBlock, 0, 0, Ad, Bd, Cd);
HIPCHECK(hipHostMalloc((void**)&A, sizeBytes, hipHostMallocCoherent));
hipStream_t s;
hipEvent_t e;
// Init:
HIPCHECK(hipStreamCreate(&s));
HIPCHECK(hipEventCreateWithFlags(&e, 0));
dim3 dimBlock(64,1,1);
dim3 dimGrid(numElements/dimBlock.x,1,1);
// Init array to know state:
hipLaunchKernelGGL(Set, dimGrid, dimBlock, 0, 0x0, A, -42);
HIPCHECK(hipDeviceSynchronize());
hipLaunchKernelGGL(Set, dimGrid, dimBlock, 0, s, A, 13);
HIPCHECK(hipEventRecord(e, s));
// Host waits for event :
HIPCHECK(hipEventSynchronize(e));
// check result?
HIPCHECK(hipHostMalloc((void**)&B, sizeBytes, hipHostMallocNonCoherent));
}
passed();
}
+2
Dosyayı Görüntüle
@@ -185,6 +185,8 @@ syn keyword hipFlags hipHostMallocDefault
syn keyword hipFlags hipHostMallocPortable
syn keyword hipFlags hipHostMallocMapped
syn keyword hipFlags hipHostMallocWriteCombined
syn keyword hipFlags hipHostMallocCoherent
syn keyword hipFlags hipHostMallocNonCoherent
syn keyword hipFlags hipHostRegisterDefault
syn keyword hipFlags hipHostRegisterPortable