Merge branch 'amd-develop' into amd-master
Change-Id: I1ec6b0bacf555d11adc5f67a55086dc6bd648efb
Этот коммит содержится в:
+11
-10
@@ -220,7 +220,7 @@ if($HIP_PLATFORM eq "hcc"){
|
||||
}
|
||||
}
|
||||
|
||||
if(($HIP_PLATFORM eq "hcc") and defined $ENV{HIP_EXPERIMENTAL}){
|
||||
if(($HIP_PLATFORM eq "hcc")){
|
||||
$EXPORT_LL=" ";
|
||||
$ENV{HCC_EXTRA_LIBRARIES}="$HIP_PATH/lib/hip_ir.ll\n";
|
||||
}
|
||||
@@ -244,6 +244,8 @@ my $toolArgs = ""; # arguments to pass to the hcc or nvcc tool
|
||||
|
||||
foreach $arg (@ARGV)
|
||||
{
|
||||
$trimarg = $arg;
|
||||
$trimarg =~ s/^\s+|\s+$//g; # Remive whitespace
|
||||
my $swallowArg = 0;
|
||||
if ($arg eq '-c') {
|
||||
$compileOnly = 1;
|
||||
@@ -254,38 +256,37 @@ foreach $arg (@ARGV)
|
||||
$needLDFLAGS = 1;
|
||||
}
|
||||
|
||||
if(($arg eq '-stdlib=libc++') and ($setStdLib eq 0))
|
||||
if(($trimarg eq '-stdlib=libc++') and ($setStdLib eq 0))
|
||||
{
|
||||
$HIPCXXFLAGS .= " -stdlib=libc++";
|
||||
$setStdLib = 1;
|
||||
}
|
||||
if(($arg eq '-stdlib=libstdc++') and ($setStdLib eq 0))
|
||||
if(($trimarg eq '-stdlib=libstdc++') and ($setStdLib eq 0))
|
||||
{
|
||||
$HIPCXXFLAGS .= " -stdlib=libstdc++";
|
||||
$HIPCXXFLAGS .= $HCC_WA_FLAGS;
|
||||
$setStdLib = 1;
|
||||
}
|
||||
if($arg eq '--version') {
|
||||
if($trimarg eq '--version') {
|
||||
$printHipVersion = 1;
|
||||
}
|
||||
if($arg eq '--short-version') {
|
||||
if($trimarg eq '--short-version') {
|
||||
$printHipVersion = 1;
|
||||
$runCmd = 0;
|
||||
}
|
||||
if($arg eq '-M') {
|
||||
if($trimarg eq '-M') {
|
||||
$compileOnly = 1;
|
||||
$buildDeps = 1;
|
||||
}
|
||||
if($arg eq '-use_fast_math') {
|
||||
print "In fast Math";
|
||||
if($trimarg eq '-use_fast_math') {
|
||||
$HIPCXXFLAGS .= " -DHIP_FAST_MATH ";
|
||||
}
|
||||
if(($arg eq '-use-staticlib') and ($setLinkType eq 0))
|
||||
if(($trimarg eq '-use-staticlib') and ($setLinkType eq 0))
|
||||
{
|
||||
$linkType = 0;
|
||||
$setLinkType = 1;
|
||||
}
|
||||
if(($arg eq '-use-sharedlib') and ($setLinkType eq 0))
|
||||
if(($trimarg eq '-use-sharedlib') and ($setLinkType eq 0))
|
||||
{
|
||||
$linkType = 1;
|
||||
$setLinkType = 1;
|
||||
|
||||
@@ -340,3 +340,25 @@ These options cause HCC to serialize. Useful if you have libraries or code whic
|
||||
- HSA_ENABLE_SDMA=0 : Causes host-to-device and device-to-host copies to use compute shader blit kernels rather than the dedicated DMA copy engines. Compute shader copies have low latency (typically < 5us) and can achieve approximately 80% of the bandwidth of the DMA copy engine. This flag is useful to isolate issues with the hardware copy engines.
|
||||
- HSA_ENABLE_INTERRUPT=0 : Causes completion signals to be detected with memory-based polling rather than interrupts. Can be useful to diagnose interrupt storm issues in the driver.
|
||||
- HSA_DISABLE_CACHE=1 : Disables the GPU L2 data cache.
|
||||
|
||||
### Debugging HIP Applications
|
||||
|
||||
- The variable "tls_tidInfo" contains the API sequence number (_apiSeqNum)- a monotonically increasing count of the HIP APIs called from this thread. This can be useful for setting conditional breakpoints. Also, each new HIP thread is mapped to monotically increasing shortTid ID. Both of these fields are displayed in the HIP debug info.
|
||||
```
|
||||
(gdb) p tls_tidInfo
|
||||
$32 = {_shortTid = 1, _apiSeqNum = 803}
|
||||
```
|
||||
|
||||
- HCC tracks all of the application memory allocations, including those from HIP and HC's "am_alloc". These can be printed by calling the function 'hc::am_memtracker_print()'.
|
||||
An optional argument specifies a void * targetPointer - the print routine will mark the allocation which contains the specified pointer with "-->" in the printed output.
|
||||
This example shows a sample GDB session where we print the memory allocated by this process and mark a specified address by using the gdb "call" function..
|
||||
The gdb syntax also supports using the variable name (in this case 'dst'):
|
||||
```
|
||||
(gdb) p dst
|
||||
$33 = (void *) 0x5ec7e9000
|
||||
(gdb) call hc::am_memtracker_print(dst)
|
||||
TargetAddress:0x5ec7e9000
|
||||
0x504cfc000-0x504cfc00f:: allocSeqNum:1 hostPointer:0x504cfc000 devicePointer:0x504cfc000 sizeBytes:16 isInDeviceMem:0 isAmManaged:1 appId:0 appAllocFlags:0 appPtr:(nil)
|
||||
...
|
||||
-->0x5ec7e9000-0x5f7e28fff:: allocSeqNum:488 hostPointer:(nil) devicePointer:0x5ec7e9000 sizeBytes:191102976 isInDeviceMem:1 isAmManaged:1 appId:0 appAllocFlags:0 appPtr:(nil)
|
||||
```
|
||||
|
||||
@@ -319,31 +319,107 @@ struct cuda2hipMap {
|
||||
cuda2hipRename["CUdevice_attribute_enum"] = {"hipDeviceAttribute_t", CONV_TYPE, API_DRIVER};
|
||||
cuda2hipRename["CUdevice_attribute"] = {"hipDeviceAttribute_t", CONV_TYPE, API_DRIVER};
|
||||
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK"] = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X"] = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y"] = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z"] = {"hipDeviceAttributeMaxBlockDimZ", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X"] = {"hipDeviceAttributeMaxGridDimX", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y"] = {"hipDeviceAttributeMaxGridDimY", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z"] = {"hipDeviceAttributeMaxGridDimZ", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK"] = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY"] = {"hipDeviceAttributeTotalConstantMemory", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_WARP_SIZE"] = {"hipDeviceAttributeWarpSize", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_CLOCK_RATE"] = {"hipDeviceAttributeClockRate", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE"] = {"hipDeviceAttributeMemoryClockRate", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH"] = {"hipDeviceAttributeMemoryBusWidth", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH"] = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_MODE"] = {"hipDeviceAttributeComputeMode", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE"] = {"hipDeviceAttributeL2CacheSize", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR"] = {"hipDeviceAttributeComputeCapabilityMajor", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR"] = {"hipDeviceAttributeComputeCapabilityMinor", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS"] = {"hipDeviceAttributeConcurrentKernels", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_BUS_ID"] = {"hipDeviceAttributePciBusId", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID"] = {"hipDeviceAttributePciDeviceId", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD"] = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK"] = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X"] = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y"] = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z"] = {"hipDeviceAttributeMaxBlockDimZ", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X"] = {"hipDeviceAttributeMaxGridDimX", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y"] = {"hipDeviceAttributeMaxGridDimY", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z"] = {"hipDeviceAttributeMaxGridDimZ", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK"] = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK"] = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY"] = {"hipDeviceAttributeTotalConstantMemory", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_WARP_SIZE"] = {"hipDeviceAttributeWarpSize", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_CLOCK_RATE"] = {"hipDeviceAttributeClockRate", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE"] = {"hipDeviceAttributeMemoryClockRate", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH"] = {"hipDeviceAttributeMemoryBusWidth", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_MODE"] = {"hipDeviceAttributeComputeMode", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE"] = {"hipDeviceAttributeL2CacheSize", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR"] = {"hipDeviceAttributeComputeCapabilityMajor", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR"] = {"hipDeviceAttributeComputeCapabilityMinor", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS"] = {"hipDeviceAttributeConcurrentKernels", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_BUS_ID"] = {"hipDeviceAttributePciBusId", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID"] = {"hipDeviceAttributePciDeviceId", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_DEV, API_DRIVER};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD"] = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV, API_DRIVER};
|
||||
// unsupported yet by HIP
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_PITCH"] = {"hipDeviceAttributeMaxPitch", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT"] = {"hipDeviceAttributeTextureAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT"] = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
// Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_GPU_OVERLAP"] = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT"] = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT"] = {"hipDeviceAttributeKernelExecTimeout", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_INTEGRATED"] = {"hipDeviceAttributeIntegrated", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY"] = {"hipDeviceAttributeCanMapHostMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH"] = {"hipDeviceAttributeMaxTexture1DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH"] = {"hipDeviceAttributeMaxTexture2DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH"] = {"hipDeviceAttributeMaxTexture3DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT"] = {"hipDeviceAttributeMaxTexture3DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH"] = {"hipDeviceAttributeMaxTexture3DDepth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES"] = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT"] = {"hipDeviceAttributeSurfaceAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_ECC_ENABLED"] = {"hipDeviceAttributeEccEnabled", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_TCC_DRIVER"] = {"hipDeviceAttributeTccDriver", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING"] = {"hipDeviceAttributeUnifiedAddressing", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTexture1DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTexture1DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH"] = {"hipDeviceAttributeMaxTexture2DGatherWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DGatherHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DWidthAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DHeightAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DDepthAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID"] = {"hipDeviceAttributePciDomainId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT"] = {"hipDeviceAttributeTexturePitchAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH"] = {"hipDeviceAttributeMaxTextureCubemapWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTextureCubemapLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTextureCubemapLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH"] = {"hipDeviceAttributeMaxSurface1DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH"] = {"hipDeviceAttributeMaxSurface2DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT"] = {"hipDeviceAttributeMaxSurface2DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH"] = {"hipDeviceAttributeMaxSurface3DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT"] = {"hipDeviceAttributeMaxSurface3DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH"] = {"hipDeviceAttributeMaxSurface3DDepth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurface1DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurface1DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurface2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT"] = {"hipDeviceAttributeMaxSurface2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurface2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH"] = {"hipDeviceAttributeMaxSurfaceCubemapWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH"] = {"hipDeviceAttributeMaxTexture1DLinearWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLinearWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLinearHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH"] = {"hipDeviceAttributeMaxTexture2DLinearPitch", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH"] = {"hipDeviceAttributeMaxTexture2DMipmappedWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DMipmappedHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH"] = {"hipDeviceAttributeMaxTexture1DMipmappedWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED"] = {"hipDeviceAttributeStreamPrioritiesSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED"] = {"hipDeviceAttributeGlobalL1CacheSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED"] = {"hipDeviceAttributeLocalL1CacheSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxRegistersPerMultiprocessor", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY"] = {"hipDeviceAttributeManagedMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID"] = {"hipDeviceAttributeMultiGpuBoardGroupId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX"] = {"hipDeviceAttributeMax", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
// deprecated, do not use
|
||||
// cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER"] = {"hipDeviceAttributeCanTex2DGather", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
// unsupported yet by HIP [CUDA 8.0.44]
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED"] = {"hipDeviceAttributeHostNativeAtomicSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO"] = {"hipDeviceAttributeSingleToDoublePrecisionPerfRatio", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS"] = {"hipDeviceAttributePageableMemoryAccess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS"] = {"hipDeviceAttributeConcurrentManagedAccess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED"] = {"hipDeviceAttributeComputePreemptionSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM"] = {"hipDeviceAttributeCanUseHostPointerForRegisteredMem", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
|
||||
|
||||
cuda2hipRename["CUdevprop_st"] = {"hipDeviceProp_t", CONV_TYPE, API_DRIVER};
|
||||
cuda2hipRename["CUdevprop"] = {"hipDeviceProp_t", CONV_TYPE, API_DRIVER};
|
||||
@@ -509,6 +585,35 @@ struct cuda2hipMap {
|
||||
cuda2hipRename["cuProfilerStop"] = {"hipProfilerStop", CONV_OTHER, API_DRIVER};
|
||||
|
||||
/////////////////////////////// CUDA RT API ///////////////////////////////
|
||||
// Data types
|
||||
// unsupported yet by HIP [CUDA 8.0.44]
|
||||
cuda2hipRename["cudaDataType_t"] = {"hipDataType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDataType"] = {"hipDataType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CUDA_R_16F"] = {"hipR16F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CUDA_C_16F"] = {"hipC16F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CUDA_R_32F"] = {"hipR32F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CUDA_C_32F"] = {"hipC32F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CUDA_R_64F"] = {"hipR64F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CUDA_C_64F"] = {"hipC64F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CUDA_R_8I"] = {"hipR8I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CUDA_C_8I"] = {"hipC8I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CUDA_R_8U"] = {"hipR8U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CUDA_C_8U"] = {"hipC8U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CUDA_R_32I"] = {"hipR32I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CUDA_C_32I"] = {"hipC32I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CUDA_R_32U"] = {"hipR32U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["CUDA_C_32U"] = {"hipC32U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
|
||||
// Library property types
|
||||
// IMPORTANT: no cuda prefix
|
||||
// TO_DO: new matcher is needed
|
||||
// unsupported yet by HIP [CUDA 8.0.44]
|
||||
cuda2hipRename["libraryPropertyType_t"] = {"hipLibraryPropertyType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["libraryPropertyType"] = {"hipLibraryPropertyType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["MAJOR_VERSION"] = {"hipLibraryMajorVersion", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["MINOR_VERSION"] = {"hipLibraryMinorVersion", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["PATCH_LEVEL"] = {"hipLibraryPatchVersion", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
|
||||
// Error API
|
||||
cuda2hipRename["cudaGetLastError"] = {"hipGetLastError", CONV_ERR, API_RUNTIME};
|
||||
cuda2hipRename["cudaPeekAtLastError"] = {"hipPeekAtLastError", CONV_ERR, API_RUNTIME};
|
||||
@@ -629,31 +734,98 @@ struct cuda2hipMap {
|
||||
cuda2hipRename["cudaDeviceAttr"] = {"hipDeviceAttribute_t", CONV_TYPE, API_RUNTIME};
|
||||
cuda2hipRename["cudaDeviceGetAttribute"] = {"hipDeviceGetAttribute", CONV_DEV, API_RUNTIME};
|
||||
|
||||
cuda2hipRename["cudaDevAttrMaxThreadsPerBlock"] = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrMaxBlockDimX"] = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrMaxBlockDimY"] = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrMaxBlockDimZ"] = {"hipDeviceAttributeMaxBlockDimZ", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrMaxGridDimX"] = {"hipDeviceAttributeMaxGridDimX", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrMaxGridDimY"] = {"hipDeviceAttributeMaxGridDimY", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrMaxGridDimZ"] = {"hipDeviceAttributeMaxGridDimZ", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrMaxSharedMemoryPerBlock"] = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrTotalConstantMemory"] = {"hipDeviceAttributeTotalConstantMemory", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrWarpSize"] = {"hipDeviceAttributeWarpSize", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrMaxRegistersPerBlock"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrClockRate"] = {"hipDeviceAttributeClockRate", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrMemoryClockRate"] = {"hipDeviceAttributeMemoryClockRate", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrGlobalMemoryBusWidth"] = {"hipDeviceAttributeMemoryBusWidth", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrMultiProcessorCount"] = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrComputeMode"] = {"hipDeviceAttributeComputeMode", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrL2CacheSize"] = {"hipDeviceAttributeL2CacheSize", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrMaxThreadsPerMultiProcessor"] = {"hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrComputeCapabilityMajor"] = {"hipDeviceAttributeComputeCapabilityMajor", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrComputeCapabilityMinor"] = {"hipDeviceAttributeComputeCapabilityMinor", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrConcurrentKernels"] = {"hipDeviceAttributeConcurrentKernels", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrPciBusId"] = {"hipDeviceAttributePciBusId", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrPciDeviceId"] = {"hipDeviceAttributePciDeviceId", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrMaxSharedMemoryPerMultiprocessor"] = {"hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrIsMultiGpuBoard"] = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrMaxThreadsPerBlock"] = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrMaxBlockDimX"] = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrMaxBlockDimY"] = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrMaxBlockDimZ"] = {"hipDeviceAttributeMaxBlockDimZ", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrMaxGridDimX"] = {"hipDeviceAttributeMaxGridDimX", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrMaxGridDimY"] = {"hipDeviceAttributeMaxGridDimY", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrMaxGridDimZ"] = {"hipDeviceAttributeMaxGridDimZ", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrMaxSharedMemoryPerBlock"] = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrTotalConstantMemory"] = {"hipDeviceAttributeTotalConstantMemory", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrWarpSize"] = {"hipDeviceAttributeWarpSize", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrMaxRegistersPerBlock"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrClockRate"] = {"hipDeviceAttributeClockRate", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrMemoryClockRate"] = {"hipDeviceAttributeMemoryClockRate", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrGlobalMemoryBusWidth"] = {"hipDeviceAttributeMemoryBusWidth", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrMultiProcessorCount"] = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrComputeMode"] = {"hipDeviceAttributeComputeMode", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrL2CacheSize"] = {"hipDeviceAttributeL2CacheSize", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrMaxThreadsPerMultiProcessor"] = {"hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrComputeCapabilityMajor"] = {"hipDeviceAttributeComputeCapabilityMajor", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrComputeCapabilityMinor"] = {"hipDeviceAttributeComputeCapabilityMinor", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrConcurrentKernels"] = {"hipDeviceAttributeConcurrentKernels", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrPciBusId"] = {"hipDeviceAttributePciBusId", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrPciDeviceId"] = {"hipDeviceAttributePciDeviceId", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrMaxSharedMemoryPerMultiprocessor"] = {"hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_DEV, API_RUNTIME};
|
||||
cuda2hipRename["cudaDevAttrIsMultiGpuBoard"] = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV, API_RUNTIME};
|
||||
// unsupported yet by HIP
|
||||
cuda2hipRename["cudaDevAttrMaxPitch"] = {"hipDeviceAttributeMaxPitch", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrTextureAlignment"] = {"hipDeviceAttributeTextureAlignment", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
// Is not deprecated as CUDA Driver's API analogue CU_DEVICE_ATTRIBUTE_GPU_OVERLAP
|
||||
cuda2hipRename["cudaDevAttrGpuOverlap"] = {"hipDeviceAttributeGpuOverlap", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrKernelExecTimeout"] = {"hipDeviceAttributeKernelExecTimeout", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrIntegrated"] = {"hipDeviceAttributeIntegrated", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrCanMapHostMemory"] = {"hipDeviceAttributeCanMapHostMemory", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxTexture1DWidth"] = {"hipDeviceAttributeMaxTexture1DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxTexture2DWidth"] = {"hipDeviceAttributeMaxTexture2DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxTexture2DHeight"] = {"hipDeviceAttributeMaxTexture2DHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxTexture3DWidth"] = {"hipDeviceAttributeMaxTexture3DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxTexture3DHeight"] = {"hipDeviceAttributeMaxTexture3DHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxTexture3DDepth"] = {"hipDeviceAttributeMaxTexture3DDepth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxTexture2DLayeredWidth"] = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxTexture2DLayeredHeight"] = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxTexture2DLayeredLayers"] = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrSurfaceAlignment"] = {"hipDeviceAttributeSurfaceAlignment", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrEccEnabled"] = {"hipDeviceAttributeEccEnabled", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrTccDriver"] = {"hipDeviceAttributeTccDriver", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrUnifiedAddressing"] = {"hipDeviceAttributeUnifiedAddressing", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxTexture1DLayeredWidth"] = {"hipDeviceAttributeMaxTexture1DLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxTexture1DLayeredLayers"] = {"hipDeviceAttributeMaxTexture1DLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxTexture2DGatherWidth"] = {"hipDeviceAttributeMaxTexture2DGatherWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxTexture2DGatherHeight"] = {"hipDeviceAttributeMaxTexture2DGatherHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxTexture3DWidthAlt"] = {"hipDeviceAttributeMaxTexture3DWidthAlternate", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxTexture3DHeightAlt"] = {"hipDeviceAttributeMaxTexture3DHeightAlternate", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxTexture3DDepthAlt"] = {"hipDeviceAttributeMaxTexture3DDepthAlternate", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrPciDomainId"] = {"hipDeviceAttributePciDomainId", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrTexturePitchAlignment"] = {"hipDeviceAttributeTexturePitchAlignment", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxTextureCubemapWidth"] = {"hipDeviceAttributeMaxTextureCubemapWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxTextureCubemapLayeredWidth"] = {"hipDeviceAttributeMaxTextureCubemapLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxTextureCubemapLayeredLayers"] = {"hipDeviceAttributeMaxTextureCubemapLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxSurface1DWidth"] = {"hipDeviceAttributeMaxSurface1DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxSurface2DWidth"] = {"hipDeviceAttributeMaxSurface2DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxSurface2DHeight"] = {"hipDeviceAttributeMaxSurface2DHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxSurface3DWidth"] = {"hipDeviceAttributeMaxSurface3DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxSurface3DHeight"] = {"hipDeviceAttributeMaxSurface3DHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxSurface3DDepth"] = {"hipDeviceAttributeMaxSurface3DDepth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxSurface1DLayeredWidth"] = {"hipDeviceAttributeMaxSurface1DLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxSurface1DLayeredLayers"] = {"hipDeviceAttributeMaxSurface1DLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxSurface2DLayeredWidth"] = {"hipDeviceAttributeMaxSurface2DLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxSurface2DLayeredHeight"] = {"hipDeviceAttributeMaxSurface2DLayeredHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxSurface2DLayeredLayers"] = {"hipDeviceAttributeMaxSurface2DLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxSurfaceCubemapWidth"] = {"hipDeviceAttributeMaxSurfaceCubemapWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxSurfaceCubemapLayeredWidth"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxSurfaceCubemapLayeredLayers"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxTexture1DLinearWidth"] = {"hipDeviceAttributeMaxTexture1DLinearWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxTexture2DLinearWidth"] = {"hipDeviceAttributeMaxTexture2DLinearWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxTexture2DLinearHeight"] = {"hipDeviceAttributeMaxTexture2DLinearHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxTexture2DLinearPitch"] = {"hipDeviceAttributeMaxTexture2DLinearPitch", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxTexture2DMipmappedWidth"] = {"hipDeviceAttributeMaxTexture2DMipmappedWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxTexture2DMipmappedHeight"] = {"hipDeviceAttributeMaxTexture2DMipmappedHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxTexture1DMipmappedWidth"] = {"hipDeviceAttributeMaxTexture1DMipmappedWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrStreamPrioritiesSupported"] = {"hipDeviceAttributeStreamPrioritiesSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrGlobalL1CacheSupported"] = {"hipDeviceAttributeGlobalL1CacheSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrLocalL1CacheSupported"] = {"hipDeviceAttributeLocalL1CacheSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMaxRegistersPerMultiprocessor"] = {"hipDeviceAttributeMaxRegistersPerMultiprocessor", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrManagedMemory"] = {"hipDeviceAttributeManagedMemory", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrMultiGpuBoardGroupID"] = {"hipDeviceAttributeMultiGpuBoardGroupID", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
// unsupported yet by HIP [CUDA 8.0.44]
|
||||
cuda2hipRename["cudaDevAttrHostNativeAtomicSupported"] = {"hipDeviceAttributeHostNativeAtomicSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrSingleToDoublePrecisionPerfRatio"] = {"hipDeviceAttributeSingleToDoublePrecisionPerfRatio", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrPageableMemoryAccess"] = {"hipDeviceAttributePageableMemoryAccess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrConcurrentManagedAccess"] = {"hipDeviceAttributeConcurrentManagedAccess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrComputePreemptionSupported"] = {"hipDeviceAttributeComputePreemptionSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
cuda2hipRename["cudaDevAttrCanUseHostPointerForRegisteredMem"] = {"hipDeviceAttributeCanUseHostPointerForRegisteredMem", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
|
||||
|
||||
// Pointer Attributes
|
||||
cuda2hipRename["cudaPointerAttributes"] = {"hipPointerAttribute_t", CONV_TYPE, API_RUNTIME};
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
@@ -26,8 +26,8 @@ THE SOFTWARE.
|
||||
#include <hip/hcc_detail/device_functions.h>
|
||||
#elif defined(__HIP_PLATFORM_NVCC__) && !defined (__HIP_PLATFORM_HCC__)
|
||||
#include <device_functions.h>
|
||||
#else
|
||||
#else
|
||||
#error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
|
||||
@@ -20,13 +20,18 @@ THE SOFTWARE.
|
||||
#ifndef HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H
|
||||
#define HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H
|
||||
|
||||
#include "hip_runtime.h"
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hip/hip_vector_types.h>
|
||||
|
||||
__device__ float __int_as_float (int x);
|
||||
|
||||
__device__ double __hiloint2double (int hi, int lo);
|
||||
|
||||
extern __HIP_DEVICE__ double __longlong_as_double(long long int x);
|
||||
extern __HIP_DEVICE__ long long int __double_as_longlong(double x);
|
||||
__device__ char4 __hip_hc_add8pk(char4, char4);
|
||||
__device__ char4 __hip_hc_sub8pk(char4, char4);
|
||||
__device__ char4 __hip_hc_mul8pk(char4, char4);
|
||||
|
||||
extern __device__ double __longlong_as_double(long long int x);
|
||||
extern __device__ long long int __double_as_longlong(double x);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -23,10 +23,8 @@ THE SOFTWARE.
|
||||
#ifndef HIPCOMPLEX_H
|
||||
#define HIPCOMPLEX_H
|
||||
|
||||
typedef struct{
|
||||
float x;
|
||||
float y;
|
||||
}hipFloatComplex;
|
||||
typedef float2 hipFloatComplex;
|
||||
typedef double2 hipDoubleComplex;
|
||||
|
||||
__device__ static inline float hipCrealf(hipFloatComplex z){
|
||||
return z.x;
|
||||
@@ -79,10 +77,6 @@ __device__ static inline float hipCabsf(hipFloatComplex z){
|
||||
}
|
||||
|
||||
|
||||
typedef struct{
|
||||
double x;
|
||||
double y;
|
||||
}hipDoubleComplex;
|
||||
|
||||
__device__ static inline double hipCreal(hipDoubleComplex z){
|
||||
return z.x;
|
||||
|
||||
@@ -25,213 +25,227 @@ THE SOFTWARE.
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
|
||||
#if 0
|
||||
#if __clang_major__ == 4
|
||||
|
||||
typedef __fp16 __half;
|
||||
|
||||
typedef struct __attribute__((aligned(4))){
|
||||
int a;
|
||||
union {
|
||||
__half p[2];
|
||||
unsigned int q;
|
||||
};
|
||||
} __half2;
|
||||
|
||||
extern "C" __half __hip_hadd_gfx803(__half a, __half b);
|
||||
extern "C" __half __hip_hfma_gfx803(__half a, __half b);
|
||||
extern "C" __half __hip_hmul_gfx803(__half a, __half b);
|
||||
extern "C" __half __hip_hsub_gfx803(__half a, __half b);
|
||||
struct holder{
|
||||
union {
|
||||
__half h;
|
||||
unsigned short s;
|
||||
};
|
||||
};
|
||||
|
||||
extern "C" int __hip_hadd2_gfx803(int a, int b);
|
||||
extern "C" int __hip_hfma2_gfx803(int a, int b);
|
||||
extern "C" int __hip_hmul2_gfx803(int a, int b);
|
||||
extern "C" int __hip_hsub2_gfx803(int a, int b);
|
||||
#define HINF 65504
|
||||
|
||||
__device__ inline __half __hadd(__half a, __half b) {
|
||||
return __hip_hadd_gfx803(a, b);
|
||||
static struct holder hInf = {HINF};
|
||||
|
||||
extern "C" __half __hip_hc_ir_hadd_half(__half, __half);
|
||||
extern "C" __half __hip_hc_ir_hfma_half(__half, __half, __half);
|
||||
extern "C" __half __hip_hc_ir_hmul_half(__half, __half);
|
||||
extern "C" __half __hip_hc_ir_hsub_half(__half, __half);
|
||||
|
||||
extern "C" int __hip_hc_ir_hadd2_int(int, int);
|
||||
extern "C" int __hip_hc_ir_hfma2_int(int, int, int);
|
||||
extern "C" int __hip_hc_ir_hmul2_int(int, int);
|
||||
extern "C" int __hip_hc_ir_hsub2_int(int, int);
|
||||
|
||||
__device__ static inline __half __hadd(const __half a, const __half b) {
|
||||
return __hip_hc_ir_hadd_half(a, b);
|
||||
}
|
||||
|
||||
__device__ inline __half __hadd_sat(__half a, __half b) {
|
||||
return __hip_hadd_gfx803(a, b);
|
||||
__device__ static inline __half __hadd_sat(__half a, __half b) {
|
||||
return __hip_hc_ir_hadd_half(a, b);
|
||||
}
|
||||
|
||||
__device__ inline __half __hfma(__half a, __half b) {
|
||||
return __hip_hfma_gfx803(a, b);
|
||||
__device__ static inline __half __hfma(__half a, __half b, __half c) {
|
||||
return __hip_hc_ir_hfma_half(a, b, c);
|
||||
}
|
||||
|
||||
__device__ inline __half __hfma_sat(__half a, __half b) {
|
||||
return __hip_hfma_gfx803(a, b);
|
||||
__device__ static inline __half __hfma_sat(__half a, __half b, __half c) {
|
||||
return __hip_hc_ir_hfma_half(a, b, c);
|
||||
}
|
||||
|
||||
__device__ inline __half __hmul(__half a, __half b) {
|
||||
return __hip_hmul_gfx803(a, b);
|
||||
__device__ static inline __half __hmul(__half a, __half b) {
|
||||
return __hip_hc_ir_hmul_half(a, b);
|
||||
}
|
||||
|
||||
__device__ inline __half __hmul_sat(__half a, __half b) {
|
||||
return __hip_hmul_gfx803(a, b);
|
||||
__device__ static inline __half __hmul_sat(__half a, __half b) {
|
||||
return __hip_hc_ir_hmul_half(a, b);
|
||||
}
|
||||
|
||||
__device__ inline __half __hsub(__half a, __half b) {
|
||||
return __hip_hsub_gfx803(a, b);
|
||||
__device__ static inline __half __hneg(__half a) {
|
||||
return -a;
|
||||
}
|
||||
|
||||
__device__ inline __half __hsub_sat(__half a, __half b) {
|
||||
return __hip_hsub_gfx803(a, b);
|
||||
__device__ static inline __half __hsub(__half a, __half b) {
|
||||
return __hip_hc_ir_hsub_half(a, b);
|
||||
}
|
||||
|
||||
|
||||
__device__ inline __half2 __hadd2(__half2 a, __half2 b) {
|
||||
__half2 ret;
|
||||
ret.a = __hip_hadd2_gfx803(a.a, b.a);
|
||||
return ret;
|
||||
__device__ static inline __half __hsub_sat(__half a, __half b) {
|
||||
return __hip_hc_ir_hsub_half(a, b);
|
||||
}
|
||||
|
||||
#else
|
||||
__device__ static inline __half hdiv(__half a, __half b) {
|
||||
return a/b;
|
||||
}
|
||||
|
||||
typedef struct{
|
||||
/*
|
||||
Half2 Arithmetic Functions
|
||||
*/
|
||||
|
||||
__device__ static inline __half2 __hadd2(__half2 a, __half2 b) {
|
||||
__half2 c;
|
||||
c.q = __hip_hc_ir_hadd2_int(a.q, b.q);
|
||||
return c;
|
||||
}
|
||||
|
||||
__device__ static inline __half2 __hadd2_sat(__half2 a, __half2 b) {
|
||||
__half2 c;
|
||||
c.q = __hip_hc_ir_hadd2_int(a.q, b.q);
|
||||
return c;
|
||||
}
|
||||
|
||||
__device__ static inline __half2 __hfma2(__half2 a, __half2 b, __half2 c) {
|
||||
__half2 d;
|
||||
d.q = __hip_hc_ir_hfma2_int(a.q, b.q, c.q);
|
||||
return d;
|
||||
}
|
||||
|
||||
__device__ static inline __half2 __hfma2_sat(__half2 a, __half2 b, __half2 c) {
|
||||
__half2 d;
|
||||
d.q = __hip_hc_ir_hfma2_int(a.q, b.q, c.q);
|
||||
return d;
|
||||
}
|
||||
|
||||
__device__ static inline __half2 __hmul2(__half2 a, __half2 b) {
|
||||
__half2 c;
|
||||
c.q = __hip_hc_ir_hmul2_int(a.q, b.q);
|
||||
return c;
|
||||
}
|
||||
|
||||
__device__ static inline __half2 __hmul2_sat(__half2 a, __half2 b) {
|
||||
__half2 c;
|
||||
c.q = __hip_hc_ir_hmul2_int(a.q, b.q);
|
||||
return c;
|
||||
}
|
||||
|
||||
__device__ static inline __half2 __hsub2(__half2 a, __half2 b) {
|
||||
__half2 c;
|
||||
c.q = __hip_hc_ir_hsub2_int(a.q, b.q);
|
||||
return c;
|
||||
}
|
||||
|
||||
__device__ static inline __half2 __hneg2(__half2 a) {
|
||||
__half2 c;
|
||||
c.p[0] = - a.p[0];
|
||||
c.p[1] = - a.p[1];
|
||||
return c;
|
||||
}
|
||||
|
||||
__device__ static inline __half2 __hsub2_sat(__half2 a, __half2 b) {
|
||||
__half2 c;
|
||||
c.q = __hip_hc_ir_hsub2_int(a.q, b.q);
|
||||
return c;
|
||||
}
|
||||
|
||||
__device__ static inline __half2 h2div(__half2 a, __half2 b) {
|
||||
__half2 c;
|
||||
c.p[0] = a.p[0] / b.p[0];
|
||||
c.p[1] = a.p[1] / b.p[1];
|
||||
return c;
|
||||
}
|
||||
|
||||
/*
|
||||
Half comparision Functions
|
||||
*/
|
||||
|
||||
__device__ static inline bool __heq(__half a, __half b) {
|
||||
return a == b ? true : false;
|
||||
}
|
||||
|
||||
__device__ static inline bool __hge(__half a, __half b) {
|
||||
return a >= b ? true : false;
|
||||
}
|
||||
|
||||
__device__ static inline bool __hgt(__half a, __half b) {
|
||||
return a > b ? true : false;
|
||||
}
|
||||
|
||||
__device__ static inline bool __hisinf(__half a) {
|
||||
return a == hInf.s ? true : false;
|
||||
}
|
||||
|
||||
__device__ static inline bool __hisnan(__half a) {
|
||||
return a > hInf.s ? true : false;
|
||||
}
|
||||
|
||||
__device__ static inline bool __hle(__half a, __half b) {
|
||||
return a <= b ? true : false;
|
||||
}
|
||||
|
||||
__device__ static inline bool __hlt(__half a, __half b) {
|
||||
return a < b ? true : false;
|
||||
}
|
||||
|
||||
__device__ static inline bool __hne(__half a, __half b) {
|
||||
return a != b ? true : false;
|
||||
}
|
||||
|
||||
/*
|
||||
Half2 Comparision Functions
|
||||
*/
|
||||
|
||||
__device__ static inline bool __hbeq2(__half2 a, __half2 b) {
|
||||
return (a.p[0] == b.p[0] ? true : false) && (a.p[1] == b.p[1] ? true : false);
|
||||
}
|
||||
|
||||
__device__ static inline bool __hbge2(__half2 a, __half2 b) {
|
||||
return (a.p[0] >= b.p[0] ? true : false) && (a.p[1] >= b.p[1] ? true : false);
|
||||
}
|
||||
|
||||
__device__ static inline bool __hbgt2(__half2 a, __half2 b) {
|
||||
return (a.p[0] > b.p[0] ? true : false) && (a.p[1] > b.p[1] ? true : false);
|
||||
}
|
||||
|
||||
__device__ static inline bool __hble2(__half2 a, __half2 b) {
|
||||
return (a.p[0] <= b.p[0] ? true : false) && (a.p[1] <= b.p[1] ? true : false);
|
||||
}
|
||||
|
||||
__device__ static inline bool __hblt2(__half2 a, __half2 b) {
|
||||
return (a.p[0] < b.p[0] ? true : false) && (a.p[1] < b.p[1] ? true : false);
|
||||
}
|
||||
|
||||
__device__ static inline bool __hbne2(__half2 a, __half2 b) {
|
||||
return (a.p[0] != b.p[0] ? true : false) && (a.p[1] != b.p[1] ? true : false);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if __clang_major__ == 3
|
||||
|
||||
typedef struct {
|
||||
unsigned x: 16;
|
||||
} __half;
|
||||
|
||||
|
||||
typedef struct __attribute__((aligned(4))){
|
||||
__half p,q;
|
||||
union {
|
||||
__half p[2];
|
||||
unsigned int q;
|
||||
};
|
||||
} __half2;
|
||||
|
||||
typedef __half half;
|
||||
typedef __half2 half2;
|
||||
|
||||
/*
|
||||
Arithmetic functions
|
||||
*/
|
||||
|
||||
__device__ __half __hadd(const __half a, const __half b);
|
||||
|
||||
__device__ __half __hadd_sat(const __half a, const __half b);
|
||||
|
||||
__device__ __half __hfma(const __half a, const __half b, const __half c);
|
||||
|
||||
__device__ __half __hfma_sat(const __half a, const __half b, const __half c);
|
||||
|
||||
__device__ __half __hmul(const __half a, const __half b);
|
||||
|
||||
__device__ __half __hmul_sat(const __half a, const __half b);
|
||||
|
||||
__device__ __half __hneq(const __half a);
|
||||
|
||||
__device__ __half __hsub(const __half a, const __half b);
|
||||
|
||||
__device__ __half __hsub_sat(const __half a, const __half b);
|
||||
|
||||
|
||||
|
||||
/*
|
||||
Half2 Arithmetic Instructions
|
||||
*/
|
||||
|
||||
__device__ __half2 __hadd2(const __half2 a, const __half2 b);
|
||||
|
||||
__device__ __half2 __hadd2_sat(const __half2 a, const __half2 b);
|
||||
|
||||
__device__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c);
|
||||
|
||||
__device__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c);
|
||||
|
||||
__device__ __half2 __hmul2(const __half2 a, const __half2 b);
|
||||
|
||||
__device__ __half2 __hmul2_sat(const __half2 a, const __half2 b);
|
||||
|
||||
__device__ __half2 __hneq2(const __half2 a);
|
||||
|
||||
__device__ __half2 __hsub2(const __half2 a, const __half2 b);
|
||||
|
||||
__device__ __half2 __hsub2_sat(const __half2 a, const __half2 b);
|
||||
|
||||
/*
|
||||
Half Cmps
|
||||
*/
|
||||
|
||||
__device__ bool __heq(const __half a, const __half b);
|
||||
|
||||
__device__ bool __hge(const __half a, const __half b);
|
||||
|
||||
__device__ bool __hgt(const __half a, const __half b);
|
||||
|
||||
__device__ bool __hisinf(const __half a);
|
||||
|
||||
__device__ bool __hisnan(const __half a);
|
||||
|
||||
__device__ bool __hle(const __half a, const __half b);
|
||||
|
||||
__device__ bool __hlt(const __half a, const __half b);
|
||||
|
||||
__device__ bool __hne(const __half a, const __half b);
|
||||
|
||||
/*
|
||||
Half2 Cmps
|
||||
*/
|
||||
|
||||
__device__ bool __hbeq2(const __half2 a, const __half2 b);
|
||||
|
||||
__device__ bool __hbge2(const __half2 a, const __half2 b);
|
||||
|
||||
__device__ bool __hbgt2(const __half2 a, const __half2 b);
|
||||
|
||||
__device__ bool __hble2(const __half2 a, const __half2 b);
|
||||
|
||||
__device__ bool __hblt2(const __half2 a, const __half2 b);
|
||||
|
||||
__device__ bool __hbne2(const __half2 a, const __half2 b);
|
||||
|
||||
__device__ __half2 __heq2(const __half2 a, const __half2 b);
|
||||
|
||||
__device__ __half2 __hge2(const __half2 a, const __half2 b);
|
||||
|
||||
__device__ __half2 __hgt2(const __half2 a, const __half2 b);
|
||||
|
||||
__device__ __half2 __hisnan2(const __half2 a);
|
||||
|
||||
__device__ __half2 __hle2(const __half2 a, const __half2 b);
|
||||
|
||||
__device__ __half2 __hlt2(const __half2 a, const __half2 b);
|
||||
|
||||
__device__ __half2 __hne2(const __half2 a, const __half2 b);
|
||||
|
||||
|
||||
/*
|
||||
Half Cnvs and Data Mvmnt
|
||||
*/
|
||||
|
||||
__device__ __half2 __float22half2_rn(const float2 a);
|
||||
|
||||
__device__ __half __float2half(const float a);
|
||||
|
||||
__device__ __half2 __float2half2_rn(const float a);
|
||||
|
||||
__device__ __half2 __floats2half2_rn(const float a, const float b);
|
||||
|
||||
__device__ float2 __half22float2(const __half2 a);
|
||||
|
||||
__device__ float __half2float(const __half a);
|
||||
|
||||
__device__ __half2 __half2half2(const __half a);
|
||||
|
||||
__device__ __half2 __halves2half2(const __half a, const __half b);
|
||||
|
||||
__device__ float __high2float(const __half2 a);
|
||||
|
||||
__device__ __half __high2half(const __half2 a);
|
||||
|
||||
__device__ __half2 __high2half2(const __half2 a);
|
||||
|
||||
__device__ __half2 __highs2half2(const __half2 a, const __half2 b);
|
||||
|
||||
__device__ float __low2float(const __half2 a);
|
||||
|
||||
__device__ __half __low2half(const __half2 a);
|
||||
|
||||
__device__ __half2 __low2half2(const __half2 a);
|
||||
|
||||
__device__ __half2 __lows2half2(const __half2 a, const __half2 b);
|
||||
|
||||
__device__ __half2 __lowhigh2highlow(const __half2 a);
|
||||
|
||||
__device__ __half2 __low2half2(const __half2 a, const __half2 b);
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -25,8 +25,8 @@ THE SOFTWARE.
|
||||
|
||||
#if __HCC__
|
||||
#if __hcc_workweek__ >= 16164
|
||||
#include "hip/hip_vector_types.h"
|
||||
#include "hip/hcc_detail/host_defines.h"
|
||||
#include "hip_vector_types.h"
|
||||
#include "host_defines.h"
|
||||
|
||||
|
||||
__device__ char __ldg(const char* );
|
||||
@@ -75,4 +75,3 @@ __device__ double2 __ldg(const double2* );
|
||||
#endif // __HCC__
|
||||
|
||||
#endif // HIP_LDG_H
|
||||
|
||||
|
||||
@@ -46,6 +46,7 @@ THE SOFTWARE.
|
||||
#define CUDA_SUCCESS hipSuccess
|
||||
|
||||
#include <hip/hip_runtime_api.h>
|
||||
|
||||
//#include "hip/hcc_detail/hip_hcc.h"
|
||||
//---
|
||||
// Remainder of this file only compiles with HCC
|
||||
@@ -815,9 +816,6 @@ extern "C" __device__ void* __hip_hc_free(void *ptr);
|
||||
//extern "C" __device__ void* malloc(size_t size);
|
||||
//extern "C" __device__ void* free(void *ptr);
|
||||
|
||||
extern "C" __device__ char4 __hip_hc_add8pk(char4, char4);
|
||||
extern "C" __device__ char4 __hip_hc_sub8pk(char4, char4);
|
||||
extern "C" __device__ char4 __hip_hc_mul8pk(char4, char4);
|
||||
|
||||
#define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)
|
||||
|
||||
|
||||
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -25,7 +25,7 @@ THE SOFTWARE.
|
||||
// Common code included at start of every hip file.
|
||||
// Auto enable __HIP_PLATFORM_HCC__ if compiling with HCC
|
||||
// Other compiler (GCC,ICC,etc) need to set one of these macros explicitly
|
||||
#if defined(__HCC__)
|
||||
#if defined(__HCC__)
|
||||
#define __HIP_PLATFORM_HCC__
|
||||
#define __HIPCC__
|
||||
|
||||
@@ -37,7 +37,7 @@ THE SOFTWARE.
|
||||
#endif
|
||||
|
||||
// Auto enable __HIP_PLATFORM_NVCC__ if compiling with NVCC
|
||||
#if defined(__NVCC__)
|
||||
#if defined(__NVCC__)
|
||||
#define __HIP_PLATFORM_NVCC__
|
||||
# ifdef __CUDACC__
|
||||
# define __HIPCC__
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -31,4 +31,3 @@ THE SOFTWARE.
|
||||
#else
|
||||
#error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
|
||||
#endif
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -32,7 +32,7 @@ THE SOFTWARE.
|
||||
#define HIP_BEGIN_MARKER(markerName, group) amdtBeginMarker(markerName, group, nullptr);
|
||||
#define HIP_END_MARKER() amdtEndMarker();
|
||||
#else
|
||||
#define HIP_SCOPED_MARKER(markerName, group)
|
||||
#define HIP_SCOPED_MARKER(markerName, group)
|
||||
#define HIP_BEGIN_MARKER(markerName, group)
|
||||
#define HIP_END_MARKER()
|
||||
#define HIP_END_MARKER()
|
||||
#endif
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -22,7 +22,7 @@ THE SOFTWARE.
|
||||
|
||||
//! HIP = Heterogeneous-compute Interface for Portability
|
||||
//!
|
||||
//! Define a extremely thin runtime layer that allows source code to be compiled unmodified
|
||||
//! Define a extremely thin runtime layer that allows source code to be compiled unmodified
|
||||
//! through either AMD HCC or NVCC. Key features tend to be in the spirit
|
||||
//! and terminology of CUDA, but with a portable path to other accelerators as well:
|
||||
//
|
||||
@@ -54,11 +54,10 @@ THE SOFTWARE.
|
||||
#include <hip/hcc_detail/hip_runtime.h>
|
||||
#elif defined(__HIP_PLATFORM_NVCC__) && !defined (__HIP_PLATFORM_HCC__)
|
||||
#include <hip/nvcc_detail/hip_runtime.h>
|
||||
#else
|
||||
#else
|
||||
#error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <hip/hip_vector_types.h>
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -181,6 +181,7 @@ typedef enum hipError_t {
|
||||
hipErrorSharedObjectSymbolNotFound = 302,
|
||||
hipErrorSharedObjectInitFailed = 303,
|
||||
hipErrorOperatingSystem = 304,
|
||||
hipErrorSetOnActiveProcess = 305,
|
||||
hipErrorInvalidHandle = 400,
|
||||
hipErrorNotFound = 500,
|
||||
hipErrorIllegalAddress = 700,
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -29,9 +29,9 @@ THE SOFTWARE.
|
||||
#include <hip/hcc_detail/hip_texture.h>
|
||||
#elif defined(__HIP_PLATFORM_NVCC__) && !defined (__HIP_PLATFORM_HCC__)
|
||||
#include <hip/nvcc_detail/hip_texture.h>
|
||||
#else
|
||||
#else
|
||||
#error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -33,6 +33,6 @@ THE SOFTWARE.
|
||||
#endif
|
||||
#elif defined(__HIP_PLATFORM_NVCC__) && !defined (__HIP_PLATFORM_HCC__)
|
||||
#include <vector_types.h>
|
||||
#else
|
||||
#else
|
||||
#error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
@@ -54,5 +54,3 @@ __device__ double __hiloint2double (int hi, int lo) {
|
||||
s.s2.lo = lo;
|
||||
return s.d;
|
||||
}
|
||||
|
||||
|
||||
|
||||
+23
-434
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -14,7 +14,7 @@ all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
@@ -129,34 +129,34 @@ __device__ int __hip_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask
|
||||
|
||||
__device__ char4 __hip_hc_add8pk(char4 in1, char4 in2) {
|
||||
char4 out;
|
||||
unsigned one1 = in1.val & MASK1;
|
||||
unsigned one2 = in2.val & MASK1;
|
||||
out.val = (one1 + one2) & MASK1;
|
||||
one1 = in1.val & MASK2;
|
||||
one2 = in2.val & MASK2;
|
||||
out.val = out.val | ((one1 + one2) & MASK2);
|
||||
unsigned one1 = in1.a & MASK1;
|
||||
unsigned one2 = in2.a & MASK1;
|
||||
out.a = (one1 + one2) & MASK1;
|
||||
one1 = in1.a & MASK2;
|
||||
one2 = in2.a & MASK2;
|
||||
out.a = out.a | ((one1 + one2) & MASK2);
|
||||
return out;
|
||||
}
|
||||
|
||||
__device__ char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
|
||||
char4 out;
|
||||
unsigned one1 = in1.val & MASK1;
|
||||
unsigned one2 = in2.val & MASK1;
|
||||
out.val = (one1 - one2) & MASK1;
|
||||
one1 = in1.val & MASK2;
|
||||
one2 = in2.val & MASK2;
|
||||
out.val = out.val | ((one1 - one2) & MASK2);
|
||||
unsigned one1 = in1.a & MASK1;
|
||||
unsigned one2 = in2.a & MASK1;
|
||||
out.a = (one1 - one2) & MASK1;
|
||||
one1 = in1.a & MASK2;
|
||||
one2 = in2.a & MASK2;
|
||||
out.a = out.a | ((one1 - one2) & MASK2);
|
||||
return out;
|
||||
}
|
||||
|
||||
__device__ char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
|
||||
char4 out;
|
||||
unsigned one1 = in1.val & MASK1;
|
||||
unsigned one2 = in2.val & MASK1;
|
||||
out.val = (one1 * one2) & MASK1;
|
||||
one1 = in1.val & MASK2;
|
||||
one2 = in2.val & MASK2;
|
||||
out.val = out.val | ((one1 * one2) & MASK2);
|
||||
unsigned one1 = in1.a & MASK1;
|
||||
unsigned one2 = in2.a & MASK1;
|
||||
out.a = (one1 * one2) & MASK1;
|
||||
one1 = in1.a & MASK2;
|
||||
one2 = in2.a & MASK2;
|
||||
out.a = out.a | ((one1 * one2) & MASK2);
|
||||
return out;
|
||||
}
|
||||
|
||||
@@ -2179,426 +2179,17 @@ __device__ double __hip_fast_dsqrt_rz(double x) {
|
||||
return hc::fast_math::sqrt(x);
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ char1 make_char1(signed char x)
|
||||
{
|
||||
char1 c1;
|
||||
c1.x = x;
|
||||
return c1;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ char2 make_char2(signed char x, signed char y)
|
||||
{
|
||||
char2 c2;
|
||||
c2.x = x;
|
||||
c2.y = y;
|
||||
return c2;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ char3 make_char3(signed char x, signed char y, signed char z)
|
||||
{
|
||||
char3 c3;
|
||||
c3.x = x;
|
||||
c3.y = y;
|
||||
c3.z = z;
|
||||
return c3;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ char4 make_char4(signed char x, signed char y, signed char z, signed char w)
|
||||
{
|
||||
char4 c4;
|
||||
c4.x = x;
|
||||
c4.y = y;
|
||||
c4.z = z;
|
||||
c4.w = w;
|
||||
return c4;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ short1 make_short1(short x)
|
||||
{
|
||||
short1 s1;
|
||||
s1.x = x;
|
||||
return s1;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ short2 make_short2(short x, short y)
|
||||
{
|
||||
short2 s2;
|
||||
s2.x = x;
|
||||
s2.y = y;
|
||||
return s2;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ short3 make_short3(short x, short y, short z)
|
||||
{
|
||||
short3 s3;
|
||||
s3.x = x;
|
||||
s3.y = y;
|
||||
s3.z = z;
|
||||
return s3;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ short4 make_short4(short x, short y, short z, short w)
|
||||
{
|
||||
short4 s4;
|
||||
s4.x = x;
|
||||
s4.y = y;
|
||||
s4.z = z;
|
||||
s4.w = w;
|
||||
return s4;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ int1 make_int1(int x)
|
||||
{
|
||||
int1 i1;
|
||||
i1.x = x;
|
||||
return i1;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ int2 make_int2(int x, int y)
|
||||
{
|
||||
int2 i2;
|
||||
i2.x = x;
|
||||
i2.y = y;
|
||||
return i2;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ int3 make_int3(int x, int y, int z)
|
||||
{
|
||||
int3 i3;
|
||||
i3.x = x;
|
||||
i3.y = y;
|
||||
i3.z = z;
|
||||
return i3;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ int4 make_int4(int x, int y, int z, int w)
|
||||
{
|
||||
int4 i4;
|
||||
i4.x = x;
|
||||
i4.y = y;
|
||||
i4.z = z;
|
||||
i4.w = w;
|
||||
return i4;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ long1 make_long1(long x)
|
||||
{
|
||||
long1 l1;
|
||||
l1.x = x;
|
||||
return l1;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ long2 make_long2(long x, long y)
|
||||
{
|
||||
long2 l2;
|
||||
l2.x = x;
|
||||
l2.y = y;
|
||||
return l2;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ long3 make_long3(long x, long y, long z)
|
||||
{
|
||||
long3 l3;
|
||||
l3.x = x;
|
||||
l3.y = y;
|
||||
l3.z = z;
|
||||
return l3;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ long4 make_long4(long x, long y, long z, long w)
|
||||
{
|
||||
long4 l4;
|
||||
l4.x = x;
|
||||
l4.y = y;
|
||||
l4.z = z;
|
||||
l4.w = w;
|
||||
return l4;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ longlong1 make_longlong1(long long x)
|
||||
{
|
||||
longlong1 l1;
|
||||
l1.x = x;
|
||||
return l1;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ longlong2 make_longlong2(long long x, long long y)
|
||||
{
|
||||
longlong2 l2;
|
||||
l2.x = x;
|
||||
l2.y = y;
|
||||
return l2;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ longlong3 make_longlong3(long long x, long long y, long long z)
|
||||
{
|
||||
longlong3 l3;
|
||||
l3.x = x;
|
||||
l3.y = y;
|
||||
l3.z = z;
|
||||
return l3;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ longlong4 make_longlong4(long long x, long long y, long long z, long long w)
|
||||
{
|
||||
longlong4 l4;
|
||||
l4.x = x;
|
||||
l4.y = y;
|
||||
l4.z = z;
|
||||
l4.w = w;
|
||||
return l4;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ uchar1 make_uchar1(unsigned char x)
|
||||
{
|
||||
uchar1 c1;
|
||||
c1.x = x;
|
||||
return c1;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ uchar2 make_uchar2(unsigned char x, unsigned char y)
|
||||
{
|
||||
uchar2 c2;
|
||||
c2.x = x;
|
||||
c2.y = y;
|
||||
return c2;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z)
|
||||
{
|
||||
uchar3 c3;
|
||||
c3.x = x;
|
||||
c3.y = y;
|
||||
c3.z = z;
|
||||
return c3;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w)
|
||||
{
|
||||
uchar4 c4;
|
||||
c4.x = x;
|
||||
c4.y = y;
|
||||
c4.z = z;
|
||||
c4.w = w;
|
||||
return c4;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ ushort1 make_ushort1(unsigned short x)
|
||||
{
|
||||
ushort1 s1;
|
||||
s1.x = x;
|
||||
return s1;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ ushort2 make_ushort2(unsigned short x, unsigned short y)
|
||||
{
|
||||
ushort2 s2;
|
||||
s2.x = x;
|
||||
s2.y = y;
|
||||
return s2;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z)
|
||||
{
|
||||
ushort3 s3;
|
||||
s3.x = x;
|
||||
s3.y = y;
|
||||
s3.z = z;
|
||||
return s3;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
|
||||
{
|
||||
ushort4 s4;
|
||||
s4.x = x;
|
||||
s4.y = y;
|
||||
s4.z = z;
|
||||
s4.w = w;
|
||||
return s4;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ uint1 make_uint1(unsigned int x)
|
||||
{
|
||||
uint1 i1;
|
||||
i1.x = x;
|
||||
return i1;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ uint2 make_uint2(unsigned int x, unsigned int y)
|
||||
{
|
||||
uint2 i2;
|
||||
i2.x = x;
|
||||
i2.y = y;
|
||||
return i2;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z)
|
||||
{
|
||||
uint3 i3;
|
||||
i3.x = x;
|
||||
i3.y = y;
|
||||
i3.z = z;
|
||||
return i3;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w)
|
||||
{
|
||||
uint4 i4;
|
||||
i4.x = x;
|
||||
i4.y = y;
|
||||
i4.z = z;
|
||||
i4.w = w;
|
||||
return i4;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ ulong1 make_ulong1(unsigned long x)
|
||||
{
|
||||
ulong1 l1;
|
||||
l1.x = x;
|
||||
return l1;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ ulong2 make_ulong2(unsigned long x, unsigned long y)
|
||||
{
|
||||
ulong2 l2;
|
||||
l2.x = x;
|
||||
l2.y = y;
|
||||
return l2;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ ulong3 make_ulong3(unsigned long x, unsigned long y, unsigned long z)
|
||||
{
|
||||
ulong3 l3;
|
||||
l3.x = x;
|
||||
l3.y = y;
|
||||
l3.z = z;
|
||||
return l3;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ ulong4 make_ulong4(unsigned long x, unsigned long y, unsigned long z, unsigned long w)
|
||||
{
|
||||
ulong4 l4;
|
||||
l4.x = x;
|
||||
l4.y = y;
|
||||
l4.z = z;
|
||||
l4.w = w;
|
||||
return l4;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ ulonglong1 make_ulonglong1(unsigned long long x)
|
||||
{
|
||||
ulonglong1 l1;
|
||||
l1.x = x;
|
||||
return l1;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ ulonglong2 make_ulonglong2(unsigned long long x, unsigned long long y)
|
||||
{
|
||||
ulonglong2 l2;
|
||||
l2.x = x;
|
||||
l2.y = y;
|
||||
return l2;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ ulonglong3 make_ulonglong3(unsigned long long x, unsigned long long y, unsigned long long z)
|
||||
{
|
||||
ulonglong3 l3;
|
||||
l3.x = x;
|
||||
l3.y = y;
|
||||
l3.z = z;
|
||||
return l3;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ ulonglong4 make_ulonglong4(unsigned long long x, unsigned long long y, unsigned long long z, unsigned long long w)
|
||||
{
|
||||
ulonglong4 l4;
|
||||
l4.x = x;
|
||||
l4.y = y;
|
||||
l4.z = z;
|
||||
l4.w = w;
|
||||
return l4;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ float1 make_float1(float x)
|
||||
{
|
||||
float1 f1;
|
||||
f1.x = x;
|
||||
return f1;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ float2 make_float2(float x, float y)
|
||||
{
|
||||
float2 f2;
|
||||
f2.x = x;
|
||||
f2.y = y;
|
||||
return f2;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ float3 make_float3(float x, float y, float z)
|
||||
{
|
||||
float3 f3;
|
||||
f3.x = x;
|
||||
f3.y = y;
|
||||
f3.z = z;
|
||||
return f3;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ float4 make_float4(float x, float y, float z, float w)
|
||||
{
|
||||
float4 f4;
|
||||
f4.x = x;
|
||||
f4.y = y;
|
||||
f4.z = z;
|
||||
f4.w = w;
|
||||
return f4;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ double1 make_double1(double x)
|
||||
{
|
||||
double1 d1;
|
||||
d1.x = x;
|
||||
return d1;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ double2 make_double2(double x, double y)
|
||||
{
|
||||
double2 d2;
|
||||
d2.x = x;
|
||||
d2.y = y;
|
||||
return d2;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ double3 make_double3(double x, double y, double z)
|
||||
{
|
||||
double3 d3;
|
||||
d3.x = x;
|
||||
d3.y = y;
|
||||
d3.z = z;
|
||||
return d3;
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ double4 make_double4(double x, double y, double z, double w)
|
||||
{
|
||||
double4 d4;
|
||||
d4.x = x;
|
||||
d4.y = y;
|
||||
d4.z = z;
|
||||
d4.w = w;
|
||||
return d4;
|
||||
}
|
||||
|
||||
|
||||
__HIP_DEVICE__ double __longlong_as_double(long long int x)
|
||||
__device__ double __longlong_as_double(long long int x)
|
||||
{
|
||||
return static_cast<double>(x);
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ long long __double_as_longlong(double x)
|
||||
__device__ long long __double_as_longlong(double x)
|
||||
{
|
||||
return static_cast<long long>(x);
|
||||
}
|
||||
|
||||
__HIP_DEVICE__ void __threadfence_system(void){
|
||||
__device__ void __threadfence_system(void){
|
||||
// no-op
|
||||
}
|
||||
|
||||
@@ -3391,5 +2982,3 @@ __host__ double norm4d(double a, double b, double c, double d)
|
||||
{
|
||||
return std::sqrt(a*a + b*b + c*c + d*d);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
|
||||
+45
-22
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -175,6 +175,24 @@ hipError_t hipDeviceReset(void)
|
||||
return ihipLogStatus(hipSuccess);
|
||||
}
|
||||
|
||||
hipError_t ihipDeviceSetState(void)
|
||||
{
|
||||
hipError_t e = hipErrorInvalidContext;
|
||||
auto *ctx = ihipGetTlsDefaultCtx();
|
||||
|
||||
if (ctx) {
|
||||
ihipDevice_t *deviceHandle = ctx->getWriteableDevice();
|
||||
if(deviceHandle->_state == 0)
|
||||
{
|
||||
deviceHandle->_state = 1;
|
||||
}
|
||||
e = hipSuccess;
|
||||
}
|
||||
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
|
||||
hipError_t ihipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device)
|
||||
{
|
||||
hipError_t e = hipSuccess;
|
||||
@@ -289,29 +307,35 @@ hipError_t hipSetDeviceFlags( unsigned int flags)
|
||||
// TODO : does this really OR in the flags or replaces previous flags:
|
||||
// TODO : Review error handling behavior for this function, it often returns ErrorSetOnActiveProcess
|
||||
if (ctx) {
|
||||
ctx->_ctxFlags = ctx->_ctxFlags | flags;
|
||||
if (flags & hipDeviceScheduleMask) {
|
||||
switch (hipDeviceScheduleMask) {
|
||||
case hipDeviceScheduleAuto:
|
||||
case hipDeviceScheduleSpin:
|
||||
case hipDeviceScheduleYield:
|
||||
case hipDeviceScheduleBlockingSync:
|
||||
e = hipSuccess;
|
||||
break;
|
||||
default:
|
||||
e = hipSuccess; // TODO - should this be error? Map to Auto?
|
||||
//e = hipErrorInvalidValue;
|
||||
break;
|
||||
auto *deviceHandle = ctx->getDevice();
|
||||
if(deviceHandle->_state == 0)
|
||||
{
|
||||
ctx->_ctxFlags = ctx->_ctxFlags | flags;
|
||||
if (flags & hipDeviceScheduleMask) {
|
||||
switch (hipDeviceScheduleMask) {
|
||||
case hipDeviceScheduleAuto:
|
||||
case hipDeviceScheduleSpin:
|
||||
case hipDeviceScheduleYield:
|
||||
case hipDeviceScheduleBlockingSync:
|
||||
e = hipSuccess;
|
||||
break;
|
||||
default:
|
||||
e = hipSuccess; // TODO - should this be error? Map to Auto?
|
||||
//e = hipErrorInvalidValue;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
unsigned supportedFlags = hipDeviceScheduleMask | hipDeviceMapHost | hipDeviceLmemResizeToMax;
|
||||
unsigned supportedFlags = hipDeviceScheduleMask | hipDeviceMapHost | hipDeviceLmemResizeToMax;
|
||||
|
||||
if (flags & (~supportedFlags)) {
|
||||
e = hipErrorInvalidValue;
|
||||
}
|
||||
} else {
|
||||
e = hipErrorInvalidDevice;
|
||||
if (flags & (~supportedFlags)) {
|
||||
e = hipErrorInvalidValue;
|
||||
}
|
||||
} else {
|
||||
e = hipErrorSetOnActiveProcess;
|
||||
}
|
||||
} else {
|
||||
e = hipErrorInvalidDevice;
|
||||
}
|
||||
|
||||
return ihipLogStatus(e);
|
||||
@@ -455,4 +479,3 @@ hipError_t hipChooseDevice( int* device, const hipDeviceProp_t* prop )
|
||||
}
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -43,7 +43,7 @@ hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags)
|
||||
eh->_stream = NULL;
|
||||
eh->_flags = flags;
|
||||
eh->_timestamp = 0;
|
||||
*event = eh;
|
||||
*event = eh;
|
||||
} else {
|
||||
e = hipErrorInvalidValue;
|
||||
}
|
||||
@@ -186,5 +186,3 @@ hipError_t hipEventQuery(hipEvent_t event)
|
||||
return ihipLogStatus(hipSuccess);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
+113
-109
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -35,6 +35,8 @@ typedef struct{
|
||||
};
|
||||
} struct_float;
|
||||
|
||||
#if __clang_major__ == 3
|
||||
|
||||
static __device__ float cvt_half_to_float(__half a){
|
||||
struct_float ret = {0};
|
||||
if(a.x == 0){
|
||||
@@ -64,44 +66,44 @@ static __device__ __half cvt_float_to_half(float b){
|
||||
}
|
||||
|
||||
|
||||
__device__ __half __hadd(const __half a, const __half b){
|
||||
__device__ __half __soft_hadd(const __half a, const __half b){
|
||||
return cvt_float_to_half(cvt_half_to_float(a)+cvt_half_to_float(b));
|
||||
}
|
||||
|
||||
__device__ __half __hadd_sat(const __half a, const __half b){
|
||||
__device__ __half __soft_hadd_sat(const __half a, const __half b){
|
||||
float f = cvt_half_to_float(a) + cvt_half_to_float(b);
|
||||
return (f < 0.0f ? __half_value_zero_float : (f > 1.0f ? __half_value_one_float: cvt_float_to_half(f)));
|
||||
}
|
||||
|
||||
__device__ __half __hfma(const __half a, const __half b, const __half c){
|
||||
__device__ __half __soft_hfma(const __half a, const __half b, const __half c){
|
||||
return cvt_float_to_half(fmaf(cvt_half_to_float(a), cvt_half_to_float(b), cvt_half_to_float(c)));
|
||||
}
|
||||
|
||||
__device__ __half __hfma_sat(const __half a, const __half b, const __half c){
|
||||
__device__ __half __soft_hfma_sat(const __half a, const __half b, const __half c){
|
||||
float f = fmaf(cvt_half_to_float(a), cvt_half_to_float(b), cvt_half_to_float(c));
|
||||
return (f < 0.0f ? __half_value_zero_float : (f > 1.0f ? __half_value_one_float: cvt_float_to_half(f)));
|
||||
}
|
||||
|
||||
__device__ __half __hmul(const __half a, const __half b){
|
||||
__device__ __half __soft_hmul(const __half a, const __half b){
|
||||
return cvt_float_to_half(cvt_half_to_float(a)*cvt_half_to_float(b));
|
||||
}
|
||||
|
||||
__device__ __half __hmul_sat(const __half a, const __half b){
|
||||
__device__ __half __soft_hmul_sat(const __half a, const __half b){
|
||||
float f = cvt_half_to_float(a) * cvt_half_to_float(b);
|
||||
return (f < 0.0f ? __half_value_zero_float : (f > 1.0f ? __half_value_one_float: cvt_float_to_half(f)));
|
||||
}
|
||||
|
||||
__device__ __half __hneq(const __half a){
|
||||
__device__ __half __soft_hneq(const __half a){
|
||||
__half ret = {a.x};
|
||||
ret.x ^= 1 << 15;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __half __hsub(const __half a, const __half b){
|
||||
__device__ __half __soft_hsub(const __half a, const __half b){
|
||||
return cvt_float_to_half(cvt_half_to_float(a)-cvt_half_to_float(b));
|
||||
}
|
||||
|
||||
__device__ __half __hsub_sat(const __half a, const __half b){
|
||||
__device__ __half __soft_hsub_sat(const __half a, const __half b){
|
||||
float f = cvt_half_to_float(a) - cvt_half_to_float(b);
|
||||
return (f < 0.0f ? __half_value_zero_float : (f > 1.0f ? __half_value_one_float: cvt_float_to_half(f)));
|
||||
}
|
||||
@@ -111,66 +113,66 @@ __device__ __half __hsub_sat(const __half a, const __half b){
|
||||
Half2 Arithmetic Instructions
|
||||
*/
|
||||
|
||||
__device__ __half2 __hadd2(const __half2 a, const __half2 b){
|
||||
__device__ __half2 __soft_hadd2(const __half2 a, const __half2 b){
|
||||
__half2 ret;
|
||||
ret.p = __hadd(a.p, b.p);
|
||||
ret.q = __hadd(a.q, b.q);
|
||||
ret.p[1] = __soft_hadd(a.p[1], b.p[1]);
|
||||
ret.p[0] = __soft_hadd(a.p[0], b.p[0]);
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __half2 __hadd2_sat(const __half2 a, const __half2 b){
|
||||
__device__ __half2 __soft_hadd2_sat(const __half2 a, const __half2 b){
|
||||
__half2 ret;
|
||||
ret.p = __hadd_sat(a.p, b.p);
|
||||
ret.q = __hadd_sat(a.q, b.q);
|
||||
ret.p[1] = __soft_hadd_sat(a.p[1], b.p[1]);
|
||||
ret.p[0] = __soft_hadd_sat(a.p[0], b.p[0]);
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c){
|
||||
__device__ __half2 __soft_hfma2(const __half2 a, const __half2 b, const __half2 c){
|
||||
__half2 ret;
|
||||
ret.p = __hfma(a.p, b.p, c.p);
|
||||
ret.q = __hfma(a.q, b.q, c.q);
|
||||
ret.p[1] = __soft_hfma(a.p[1], b.p[1], c.p[1]);
|
||||
ret.p[0] = __soft_hfma(a.p[0], b.p[0], c.p[0]);
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c){
|
||||
__device__ __half2 __soft_hfma2_sat(const __half2 a, const __half2 b, const __half2 c){
|
||||
__half2 ret;
|
||||
ret.p = __hfma_sat(a.p, b.p, c.p);
|
||||
ret.q = __hfma_sat(a.q, b.q, c.q);
|
||||
ret.p[1] = __soft_hfma_sat(a.p[1], b.p[1], c.p[1]);
|
||||
ret.p[0] = __soft_hfma_sat(a.p[0], b.p[0], c.p[0]);
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __half2 __hmul2(const __half2 a, const __half2 b){
|
||||
__device__ __half2 __soft_hmul2(const __half2 a, const __half2 b){
|
||||
__half2 ret;
|
||||
ret.p = __hmul(a.p, b.p);
|
||||
ret.q = __hmul(a.q, b.q);
|
||||
ret.p[1] = __soft_hmul(a.p[1], b.p[1]);
|
||||
ret.p[0] = __soft_hmul(a.p[0], b.p[0]);
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __half2 __hmul2_sat(const __half2 a, const __half2 b){
|
||||
__device__ __half2 __soft_hmul2_sat(const __half2 a, const __half2 b){
|
||||
__half2 ret;
|
||||
ret.p = __hmul_sat(a.p, b.p);
|
||||
ret.q = __hmul_sat(a.q, b.q);
|
||||
ret.p[1] = __soft_hmul_sat(a.p[1], b.p[1]);
|
||||
ret.p[0] = __soft_hmul_sat(a.p[0], b.p[0]);
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __half2 __hneq2(const __half2 a){
|
||||
__device__ __half2 __soft_hneq2(const __half2 a){
|
||||
__half2 ret;
|
||||
ret.p = __hneq(a.p);
|
||||
ret.q = __hneq(a.q);
|
||||
ret.p[1] = __soft_hneq(a.p[1]);
|
||||
ret.p[0] = __soft_hneq(a.p[0]);
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __half2 __hsub2(const __half2 a, const __half2 b){
|
||||
__device__ __half2 __soft_hsub2(const __half2 a, const __half2 b){
|
||||
__half2 ret;
|
||||
ret.p = __hsub(a.p, b.p);
|
||||
ret.q = __hsub(a.q, b.q);
|
||||
ret.p[1] = __soft_hsub(a.p[1], b.p[1]);
|
||||
ret.p[0] = __soft_hsub(a.p[0], b.p[0]);
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __half2 __hsub2_sat(const __half2 a, const __half2 b){
|
||||
__device__ __half2 __soft_hsub2_sat(const __half2 a, const __half2 b){
|
||||
__half2 ret;
|
||||
ret.p = __hsub_sat(a.p, b.p);
|
||||
ret.q = __hsub_sat(a.q, b.q);
|
||||
ret.p[1] = __soft_hsub_sat(a.p[1], b.p[1]);
|
||||
ret.p[0] = __soft_hsub_sat(a.p[0], b.p[0]);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -178,23 +180,23 @@ __device__ __half2 __hsub2_sat(const __half2 a, const __half2 b){
|
||||
Half Cmps
|
||||
*/
|
||||
|
||||
__device__ bool __heq(const __half a, const __half b){
|
||||
__device__ bool __soft_heq(const __half a, const __half b){
|
||||
return (a.x == b.x ? true:false);
|
||||
}
|
||||
|
||||
__device__ bool __hge(const __half a, const __half b){
|
||||
__device__ bool __soft_hge(const __half a, const __half b){
|
||||
return (cvt_half_to_float(a) >= cvt_half_to_float(b));
|
||||
}
|
||||
|
||||
__device__ bool __hgt(const __half a, const __half b){
|
||||
__device__ bool __soft_hgt(const __half a, const __half b){
|
||||
return (cvt_half_to_float(a) > cvt_half_to_float(b));
|
||||
}
|
||||
|
||||
__device__ bool __hisinf(const __half a){
|
||||
__device__ bool __soft_hisinf(const __half a){
|
||||
return ((a.x == __half_neg_inf) ? -1 : (a.x == __half_pos_inf) ? 1 : 0);
|
||||
}
|
||||
|
||||
__device__ bool __hisnan(const __half a){
|
||||
__device__ bool __soft_hisnan(const __half a){
|
||||
if(((a.x & __half_pos_inf) == a.x) || ((a.x & __half_neg_inf) == a.x)){
|
||||
return true;
|
||||
}else{
|
||||
@@ -202,15 +204,15 @@ __device__ bool __hisnan(const __half a){
|
||||
}
|
||||
}
|
||||
|
||||
__device__ bool __hle(const __half a, const __half b){
|
||||
__device__ bool __soft_hle(const __half a, const __half b){
|
||||
return (cvt_half_to_float(a) <= cvt_half_to_float(b));
|
||||
}
|
||||
|
||||
__device__ bool __hlt(const __half a, const __half b){
|
||||
__device__ bool __soft_hlt(const __half a, const __half b){
|
||||
return (cvt_half_to_float(a) < cvt_half_to_float(b));
|
||||
}
|
||||
|
||||
__device__ bool __hne(const __half a, const __half b){
|
||||
__device__ bool __soft_hne(const __half a, const __half b){
|
||||
return a.x == b.x ? false : true;
|
||||
}
|
||||
|
||||
@@ -218,78 +220,78 @@ __device__ bool __hne(const __half a, const __half b){
|
||||
Half2 Cmps
|
||||
*/
|
||||
|
||||
__device__ bool __hbeq2(const __half2 a, const __half2 b){
|
||||
return __heq(a.p, b.p) && __heq(a.q, b.q);
|
||||
__device__ bool __soft_hbeq2(const __half2 a, const __half2 b){
|
||||
return __soft_heq(a.p[1], b.p[1]) && __soft_heq(a.p[0], b.p[0]);
|
||||
}
|
||||
|
||||
__device__ bool __hbge2(const __half2 a, const __half2 b){
|
||||
return __hge(a.p, b.p) && __hge(a.q, b.q);
|
||||
__device__ bool __soft_hbge2(const __half2 a, const __half2 b){
|
||||
return __soft_hge(a.p[1], b.p[1]) && __soft_hge(a.p[0], b.p[0]);
|
||||
}
|
||||
|
||||
__device__ bool __hbgt2(const __half2 a, const __half2 b){
|
||||
return __hgt(a.p, b.p) && __hgt(a.q, b.q);
|
||||
__device__ bool __soft_hbgt2(const __half2 a, const __half2 b){
|
||||
return __soft_hgt(a.p[1], b.p[1]) && __soft_hgt(a.p[0], b.p[0]);
|
||||
}
|
||||
|
||||
__device__ bool __hble2(const __half2 a, const __half2 b){
|
||||
return __hle(a.p, b.p) && __hle(a.q, b.q);
|
||||
__device__ bool __soft_hble2(const __half2 a, const __half2 b){
|
||||
return __soft_hle(a.p[1], b.p[1]) && __soft_hle(a.p[0], b.p[0]);
|
||||
}
|
||||
|
||||
__device__ bool __hblt2(const __half2 a, const __half2 b){
|
||||
return __hlt(a.p, b.p) && __hlt(a.q, b.q);
|
||||
__device__ bool __soft_hblt2(const __half2 a, const __half2 b){
|
||||
return __soft_hlt(a.p[1], b.p[1]) && __soft_hlt(a.p[0], b.p[0]);
|
||||
}
|
||||
|
||||
__device__ bool __hbne2(const __half2 a, const __half2 b){
|
||||
return __hne(a.p, b.p) && __hne(a.q, b.q);
|
||||
__device__ bool __soft_hbne2(const __half2 a, const __half2 b){
|
||||
return __soft_hne(a.p[1], b.p[1]) && __soft_hne(a.p[0], b.p[0]);
|
||||
}
|
||||
|
||||
|
||||
|
||||
__device__ __half2 __heq2(const __half2 a, const __half2 b){
|
||||
__device__ __half2 __soft_heq2(const __half2 a, const __half2 b){
|
||||
__half2 ret = {0};
|
||||
ret.p = (__heq(a.p, b.p)) ? __half_value_one_float : __half_value_zero_float;
|
||||
ret.q = (__heq(a.q, b.q)) ? __half_value_one_float : __half_value_zero_float;
|
||||
ret.p[1] = (__soft_heq(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float;
|
||||
ret.p[0] = (__soft_heq(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __half2 __hge2(const __half2 a, const __half2 b){
|
||||
__device__ __half2 __soft_hge2(const __half2 a, const __half2 b){
|
||||
__half2 ret = {0};
|
||||
ret.p = (__hge(a.p, b.p)) ? __half_value_one_float : __half_value_zero_float;
|
||||
ret.q = (__hge(a.q, b.q)) ? __half_value_one_float : __half_value_zero_float;
|
||||
ret.p[1] = (__soft_hge(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float;
|
||||
ret.p[0] = (__soft_hge(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __half2 __hgt2(const __half2 a, const __half2 b){
|
||||
__device__ __half2 __soft_hgt2(const __half2 a, const __half2 b){
|
||||
__half2 ret = {0};
|
||||
ret.p = (__hgt(a.p, b.p)) ? __half_value_one_float : __half_value_zero_float;
|
||||
ret.q = (__hgt(a.q, b.q)) ? __half_value_one_float : __half_value_zero_float;
|
||||
ret.p[1] = (__soft_hgt(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float;
|
||||
ret.p[0] = (__soft_hgt(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __half2 __hisnan2(const __half2 a){
|
||||
__device__ __half2 __soft_hisnan2(const __half2 a){
|
||||
__half2 ret = {0};
|
||||
ret.p = __hisnan(a.p) ? __half_value_one_float : __half_value_zero_float;
|
||||
ret.q = __hisnan(a.q) ? __half_value_one_float : __half_value_zero_float;
|
||||
ret.p[1] = __soft_hisnan(a.p[1]) ? __half_value_one_float : __half_value_zero_float;
|
||||
ret.p[0] = __soft_hisnan(a.p[0]) ? __half_value_one_float : __half_value_zero_float;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __half2 __hle2(const __half2 a, const __half2 b){
|
||||
__device__ __half2 __soft_hle2(const __half2 a, const __half2 b){
|
||||
__half2 ret = {0};
|
||||
ret.p = (__hle(a.p, b.p)) ? __half_value_one_float : __half_value_zero_float;
|
||||
ret.q = (__hle(a.q, b.q)) ? __half_value_one_float : __half_value_zero_float;
|
||||
ret.p[1] = (__soft_hle(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float;
|
||||
ret.p[0] = (__soft_hle(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __half2 __hlt2(const __half2 a, const __half2 b){
|
||||
__device__ __half2 __soft_hlt2(const __half2 a, const __half2 b){
|
||||
__half2 ret = {0};
|
||||
ret.p = (__hlt(a.p, b.p)) ? __half_value_one_float : __half_value_zero_float;
|
||||
ret.q = (__hlt(a.q, b.q)) ? __half_value_one_float : __half_value_zero_float;
|
||||
ret.p[1] = (__soft_hlt(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float;
|
||||
ret.p[0] = (__soft_hlt(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __half2 __hne2(const __half2 a, const __half2 b){
|
||||
__device__ __half2 __soft_hne2(const __half2 a, const __half2 b){
|
||||
__half2 ret = {0};
|
||||
ret.p = (__hne(a.p, b.p)) ? __half_value_one_float : __half_value_zero_float;
|
||||
ret.q = (__hne(a.q, b.q)) ? __half_value_one_float : __half_value_zero_float;
|
||||
ret.p[1] = (__soft_hne(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float;
|
||||
ret.p[0] = (__soft_hne(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float;
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -297,78 +299,80 @@ __device__ __half2 __hne2(const __half2 a, const __half2 b){
|
||||
Half Cnvs and Data Mvmnt
|
||||
*/
|
||||
|
||||
__device__ __half2 __float22half2_rn(const float2 a){
|
||||
__device__ __half2 __soft_float22half2_rn(const float2 a){
|
||||
__half2 ret = {0};
|
||||
ret.p = cvt_float_to_half(a.x);
|
||||
ret.q = cvt_float_to_half(a.y);
|
||||
ret.p[1] = cvt_float_to_half(a.x);
|
||||
ret.p[0] = cvt_float_to_half(a.y);
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __half __float2half(const float a){
|
||||
__device__ __half __soft_float2half(const float a){
|
||||
return cvt_float_to_half(a);
|
||||
}
|
||||
|
||||
__device__ __half2 __float2half2_rn(const float a){
|
||||
__device__ __half2 __soft_float2half2_rn(const float a){
|
||||
__half ret = cvt_float_to_half(a);
|
||||
return {ret, ret};
|
||||
}
|
||||
|
||||
__device__ __half2 __floats2half2_rn(const float a, const float b){
|
||||
__device__ __half2 __soft_floats2half2_rn(const float a, const float b){
|
||||
return {cvt_float_to_half(a), cvt_float_to_half(b)};
|
||||
}
|
||||
|
||||
__device__ float2 __half22float2(const __half2 a){
|
||||
return {cvt_half_to_float(a.p), cvt_half_to_float(a.q)};
|
||||
__device__ float2 __soft_half22float2(const __half2 a){
|
||||
return {cvt_half_to_float(a.p[1]), cvt_half_to_float(a.p[0])};
|
||||
}
|
||||
|
||||
__device__ float __half2float(const __half a){
|
||||
__device__ float __soft_half2float(const __half a){
|
||||
return cvt_half_to_float(a);
|
||||
}
|
||||
|
||||
__device__ __half2 __half2half2(const __half a){
|
||||
__device__ __half2 __soft_half2half2(const __half a){
|
||||
return {a,a};
|
||||
}
|
||||
|
||||
__device__ __half2 __halves2half2(const __half a, const __half b){
|
||||
__device__ __half2 __soft_halves2half2(const __half a, const __half b){
|
||||
return {a,b};
|
||||
}
|
||||
|
||||
__device__ float __high2float(const __half2 a){
|
||||
return cvt_half_to_float(a.p);
|
||||
__device__ float __soft_high2float(const __half2 a){
|
||||
return cvt_half_to_float(a.p[1]);
|
||||
}
|
||||
|
||||
__device__ __half __high2half(const __half2 a){
|
||||
return a.p;
|
||||
__device__ __half __soft_high2half(const __half2 a){
|
||||
return a.p[1];
|
||||
}
|
||||
|
||||
__device__ __half2 __high2half2(const __half2 a){
|
||||
return {a.p, a.p};
|
||||
__device__ __half2 __soft_high2half2(const __half2 a){
|
||||
return {a.p[1], a.p[1]};
|
||||
}
|
||||
|
||||
__device__ __half2 __highs2half2(const __half2 a, const __half2 b){
|
||||
return {a.p, b.p};
|
||||
__device__ __half2 __soft_highs2half2(const __half2 a, const __half2 b){
|
||||
return {a.p[1], b.p[1]};
|
||||
}
|
||||
|
||||
__device__ float __low2float(const __half2 a){
|
||||
return cvt_half_to_float(a.q);
|
||||
__device__ float __soft_low2float(const __half2 a){
|
||||
return cvt_half_to_float(a.p[0]);
|
||||
}
|
||||
|
||||
__device__ __half __low2half(const __half2 a){
|
||||
return a.q;
|
||||
__device__ __half __soft_low2half(const __half2 a){
|
||||
return a.p[0];
|
||||
}
|
||||
|
||||
__device__ __half2 __low2half2(const __half2 a){
|
||||
return {a.q, a.q};
|
||||
__device__ __half2 __soft_low2half2(const __half2 a){
|
||||
return {a.p[0], a.p[0]};
|
||||
}
|
||||
|
||||
__device__ __half2 __lows2half2(const __half2 a, const __half2 b){
|
||||
return {a.q, b.q};
|
||||
__device__ __half2 __soft_lows2half2(const __half2 a, const __half2 b){
|
||||
return {a.p[0], b.p[0]};
|
||||
}
|
||||
|
||||
__device__ __half2 __lowhigh2highlow(const __half2 a){
|
||||
return {a.q, a.p};
|
||||
__device__ __half2 __soft_lowhigh2highlow(const __half2 a){
|
||||
return {a.p[0], a.p[1]};
|
||||
}
|
||||
|
||||
__device__ __half2 __low2half2(const __half2 a, const __half2 b){
|
||||
return {a.q, b.q};
|
||||
__device__ __half2 __soft_low2half2(const __half2 a, const __half2 b){
|
||||
return {a.p[0], b.p[0]};
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
+116
-25
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -69,6 +69,8 @@ std::string HIP_LAUNCH_BLOCKING_KERNELS;
|
||||
std::vector<std::string> g_hipLaunchBlockingKernels;
|
||||
int HIP_API_BLOCKING = 0;
|
||||
|
||||
int HIP_MAX_QUEUES = 0;
|
||||
|
||||
int HIP_PRINT_ENV = 0;
|
||||
int HIP_TRACE_API= 0;
|
||||
std::string HIP_TRACE_API_COLOR("green");
|
||||
@@ -241,7 +243,7 @@ ihipStream_t::ihipStream_t(ihipCtx_t *ctx, hc::accelerator_view av, unsigned int
|
||||
_id(0), // will be set by add function.
|
||||
_flags(flags),
|
||||
_ctx(ctx),
|
||||
_criticalData(av)
|
||||
_criticalData(this, av)
|
||||
{
|
||||
unsigned schedBits = ctx->_ctxFlags & hipDeviceScheduleMask;
|
||||
|
||||
@@ -254,7 +256,6 @@ ihipStream_t::ihipStream_t(ihipCtx_t *ctx, hc::accelerator_view av, unsigned int
|
||||
};
|
||||
|
||||
|
||||
tprintf(DB_SYNC, " streamCreate: stream=%p\n", this);
|
||||
};
|
||||
|
||||
|
||||
@@ -264,12 +265,38 @@ ihipStream_t::~ihipStream_t()
|
||||
}
|
||||
|
||||
|
||||
inline void ihipStream_t::ensureHaveQueue(LockedAccessor_StreamCrit_t &streamCrit)
|
||||
{
|
||||
if (HIP_MAX_QUEUES && !streamCrit->_hasQueue) {
|
||||
|
||||
// To avoid deadlock, we have to release the stream lock before acquiring context lock.
|
||||
// Else we can get hung if another thread has the context lock is trying to get lock for this stream.
|
||||
// We lock it again below.
|
||||
streamCrit->munlock();
|
||||
|
||||
// Obtain mutex access to the device critical data, release by destructor
|
||||
LockedAccessor_CtxCrit_t ctxCrit(this->_ctx->criticalData());
|
||||
// TODO
|
||||
auto needyCritPtr = this->_criticalData.mlock();
|
||||
|
||||
// Second test to ensure we still need to steal the queue - another thread may have
|
||||
// snuck in here and already solved the issue.
|
||||
if (!needyCritPtr->_hasQueue) {
|
||||
needyCritPtr->_av = this->_ctx->stealActiveQueue(ctxCrit, this);
|
||||
}
|
||||
|
||||
streamCrit->_hasQueue = true;
|
||||
}
|
||||
assert(streamCrit->_hasQueue);
|
||||
}
|
||||
|
||||
|
||||
//Wait for all kernel and data copy commands in this stream to complete.
|
||||
//This signature should be used in routines that already have locked the stream mutex
|
||||
void ihipStream_t::wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty)
|
||||
void ihipStream_t::wait(LockedAccessor_StreamCrit_t &crit)
|
||||
{
|
||||
if (! assertQueueEmpty) {
|
||||
tprintf (DB_SYNC, "stream %p wait for queue-empty..\n", this);
|
||||
if (crit->_hasQueue) {
|
||||
tprintf (DB_SYNC, "%s wait for queue-empty..\n", ToString(this).c_str());
|
||||
hc::hcWaitMode waitMode = hc::hcWaitModeActive;
|
||||
|
||||
if (_scheduleMode == Auto) {
|
||||
@@ -293,6 +320,8 @@ void ihipStream_t::wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty
|
||||
}
|
||||
|
||||
crit->_av.wait(waitMode);
|
||||
} else {
|
||||
tprintf (DB_SYNC, "%s wait for queue empty (done since stream has no physical queue).\n", ToString(this).c_str());
|
||||
}
|
||||
|
||||
crit->_kernelCnt = 0;
|
||||
@@ -300,11 +329,11 @@ void ihipStream_t::wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty
|
||||
|
||||
//---
|
||||
//Wait for all kernel and data copy commands in this stream to complete.
|
||||
void ihipStream_t::locked_wait(bool assertQueueEmpty)
|
||||
void ihipStream_t::locked_wait()
|
||||
{
|
||||
LockedAccessor_StreamCrit_t crit(_criticalData);
|
||||
|
||||
wait(crit, assertQueueEmpty);
|
||||
wait(crit);
|
||||
|
||||
};
|
||||
|
||||
@@ -313,6 +342,8 @@ void ihipStream_t::locked_waitEvent(hipEvent_t event)
|
||||
{
|
||||
LockedAccessor_StreamCrit_t crit(_criticalData);
|
||||
|
||||
this->ensureHaveQueue(crit);
|
||||
|
||||
crit->_av.create_blocking_marker(event->_marker);
|
||||
}
|
||||
|
||||
@@ -323,6 +354,7 @@ void ihipStream_t::locked_recordEvent(hipEvent_t event)
|
||||
// Lock the stream to prevent simultaneous access
|
||||
LockedAccessor_StreamCrit_t crit(_criticalData);
|
||||
|
||||
this->ensureHaveQueue(crit);
|
||||
event->_marker = crit->_av.create_marker();
|
||||
}
|
||||
|
||||
@@ -353,13 +385,17 @@ ihipCtx_t * ihipStream_t::getCtx() const
|
||||
// Lock the stream to prevent other threads from intervening.
|
||||
LockedAccessor_StreamCrit_t ihipStream_t::lockopen_preKernelCommand()
|
||||
{
|
||||
|
||||
LockedAccessor_StreamCrit_t crit(_criticalData, false/*no unlock at destruction*/);
|
||||
|
||||
if(crit->_kernelCnt > HIP_NUM_KERNELS_INFLIGHT){
|
||||
this->wait(crit);
|
||||
crit->_kernelCnt = 0;
|
||||
}
|
||||
crit->_kernelCnt++;
|
||||
|
||||
this->ensureHaveQueue(crit);
|
||||
|
||||
|
||||
|
||||
return crit;
|
||||
}
|
||||
@@ -391,6 +427,7 @@ void ihipStream_t::lockclose_postKernelCommand(const char * kernelName, hc::acce
|
||||
};
|
||||
|
||||
|
||||
|
||||
//=============================================================================
|
||||
// Recompute the peercnt and the packed _peerAgents whenever a peer is added or deleted.
|
||||
// The packed _peerAgents can efficiently be used on each memory allocation.
|
||||
@@ -474,6 +511,7 @@ void ihipCtxCriticalBase_t<CtxMutex>::addStream(ihipStream_t *stream)
|
||||
{
|
||||
stream->_id = _streams.size();
|
||||
_streams.push_back(stream);
|
||||
tprintf(DB_SYNC, " addStream: %s\n", ToString(stream).c_str());
|
||||
}
|
||||
//=============================================================================
|
||||
|
||||
@@ -482,7 +520,8 @@ void ihipCtxCriticalBase_t<CtxMutex>::addStream(ihipStream_t *stream)
|
||||
//=================================================================================================
|
||||
ihipDevice_t::ihipDevice_t(unsigned deviceId, unsigned deviceCnt, hc::accelerator &acc) :
|
||||
_deviceId(deviceId),
|
||||
_acc(acc)
|
||||
_acc(acc),
|
||||
_state(0)
|
||||
{
|
||||
hsa_agent_t *agent = static_cast<hsa_agent_t*> (acc.get_hsa_agent());
|
||||
if (agent) {
|
||||
@@ -811,11 +850,11 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop)
|
||||
ihipCtx_t::ihipCtx_t(ihipDevice_t *device, unsigned deviceCnt, unsigned flags) :
|
||||
_ctxFlags(flags),
|
||||
_device(device),
|
||||
_criticalData(deviceCnt)
|
||||
_criticalData(this, deviceCnt)
|
||||
{
|
||||
locked_reset();
|
||||
|
||||
tprintf(DB_SYNC, "created ctx with defaultStream=%p\n", _defaultStream);
|
||||
tprintf(DB_SYNC, "created ctx with defaultStream=%p (%s)\n", _defaultStream, ToString(_defaultStream).c_str());
|
||||
};
|
||||
|
||||
|
||||
@@ -845,7 +884,7 @@ void ihipCtx_t::locked_reset()
|
||||
for (auto streamI=crit->const_streams().begin(); streamI!=crit->const_streams().end(); streamI++) {
|
||||
ihipStream_t *stream = *streamI;
|
||||
(*streamI)->locked_wait();
|
||||
tprintf(DB_SYNC, " delete stream=%p\n", stream);
|
||||
tprintf(DB_SYNC, " delete %s\n", ToString(stream).c_str());
|
||||
|
||||
delete stream;
|
||||
}
|
||||
@@ -865,6 +904,7 @@ void ihipCtx_t::locked_reset()
|
||||
// Reset will remove peer mapping so don't need to do this explicitly.
|
||||
// FIXME - This is clearly a non-const action! Is this a context reset or a device reset - maybe should reference count?
|
||||
ihipDevice_t *device = getWriteableDevice();
|
||||
device->_state = 0;
|
||||
am_memtracker_reset(device->_acc);
|
||||
|
||||
};
|
||||
@@ -878,6 +918,56 @@ std::string ihipCtx_t::toString() const
|
||||
return ss.str();
|
||||
};
|
||||
|
||||
|
||||
hc::accelerator_view
|
||||
ihipCtx_t::stealActiveQueue(LockedAccessor_CtxCrit_t &ctxCrit, ihipStream_t *needyStream)
|
||||
{
|
||||
|
||||
// TODO - review handling if queue can't be found.
|
||||
while (1) {
|
||||
|
||||
for (auto iter=ctxCrit->streams().begin(); iter != ctxCrit->streams().end(); iter++) {
|
||||
if (*iter != needyStream) {
|
||||
auto victimCritPtr = (*iter)->_criticalData.mtry_lock();
|
||||
if (victimCritPtr) {
|
||||
// try-lock succeeded:
|
||||
if (victimCritPtr->_hasQueue && (victimCritPtr->_kernelCnt == 0)) {
|
||||
|
||||
victimCritPtr->_hasQueue = false;
|
||||
|
||||
tprintf(DB_SYNC, " stealActiveQueue from victim:%s to needy:%s\n",
|
||||
ToString(*iter).c_str(), ToString(needyStream).c_str());
|
||||
|
||||
hc::accelerator_view av = victimCritPtr->_av;
|
||||
|
||||
// TODO - cleanup to remove forced setting to N
|
||||
uint64_t *p = (uint64_t*)(&victimCritPtr->_av);
|
||||
*p = 0; // damage the victim av so attempt to use it will fault.
|
||||
|
||||
(*iter)->_criticalData.munlock();
|
||||
return av;
|
||||
}
|
||||
(*iter)->_criticalData.munlock();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
hc::accelerator_view
|
||||
ihipCtx_t::createOrStealQueue(LockedAccessor_CtxCrit_t &ctxCrit)
|
||||
{
|
||||
if (HIP_MAX_QUEUES && (ctxCrit->streams().size() >= HIP_MAX_QUEUES)) {
|
||||
// Steal a queue from an existing stream:
|
||||
hc::accelerator_view av = this->stealActiveQueue (ctxCrit, nullptr);
|
||||
return av;
|
||||
} else {
|
||||
// Create a new view
|
||||
return getWriteableDevice()->_acc.create_view();
|
||||
}
|
||||
}
|
||||
|
||||
//----
|
||||
|
||||
|
||||
@@ -919,13 +1009,6 @@ void ihipCtx_t::locked_syncDefaultStream(bool waitOnSelf)
|
||||
}
|
||||
}
|
||||
|
||||
//---
|
||||
void ihipCtx_t::locked_addStream(ihipStream_t *s)
|
||||
{
|
||||
LockedAccessor_CtxCrit_t crit(_criticalData);
|
||||
|
||||
crit->addStream(s);
|
||||
}
|
||||
|
||||
//---
|
||||
void ihipCtx_t::locked_removeStream(ihipStream_t *s)
|
||||
@@ -1213,9 +1296,10 @@ void ihipInit()
|
||||
tokenize(HIP_LAUNCH_BLOCKING_KERNELS, ',', &g_hipLaunchBlockingKernels);
|
||||
}
|
||||
READ_ENV_I(release, HIP_API_BLOCKING, 0, "Make HIP APIs 'host-synchronous', so they block until completed. Impacts hipMemcpyAsync, hipMemsetAsync." );
|
||||
|
||||
|
||||
|
||||
READ_ENV_I(release, HIP_MAX_QUEUES, 0, "Maximum number of queues that this app will use per-device. Additional streams will share the specified number of queues. 0=no limit.");
|
||||
|
||||
READ_ENV_C(release, HIP_DB, 0, "Print debug info. Bitmask (HIP_DB=0xff) or flags separated by '+' (HIP_DB=api+sync+mem+copy)", HIP_DB_callback);
|
||||
if ((HIP_DB & (1<<DB_API)) && (HIP_TRACE_API == 0)) {
|
||||
// Set HIP_TRACE_API default before we read it, so it is printed correctly.
|
||||
@@ -1236,8 +1320,8 @@ void ihipInit()
|
||||
|
||||
|
||||
READ_ENV_I(release, HIP_WAIT_MODE, 0, "Force synchronization mode. 1= force yield, 2=force spin, 0=defaults specified in application");
|
||||
READ_ENV_I(release, HIP_FORCE_P2P_HOST, 0, "Force use of host/staging copy for peer-to-peer copies.1=always use copies, 2=always return false for hipDeviceCanAccessPeer");
|
||||
READ_ENV_I(release, HIP_FORCE_SYNC_COPY, 0, "Force all copies (even hipMemcpyAsync) to use sync copies");
|
||||
READ_ENV_I(release, HIP_FORCE_P2P_HOST, 0, "Force use of host/staging copy for peer-to-peer copies.1=always use copies, 2=always return false for hipDeviceCanAccessPeer");
|
||||
READ_ENV_I(release, HIP_FORCE_SYNC_COPY, 0, "Force all copies (even hipMemcpyAsync) to use sync copies");
|
||||
|
||||
// TODO - review, can we remove this?
|
||||
READ_ENV_I(release, HIP_NUM_KERNELS_INFLIGHT, 128, "Max number of inflight kernels per stream before active synchronization is forced.");
|
||||
@@ -1366,7 +1450,7 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t stream)
|
||||
} else {
|
||||
// ALl streams have to wait for legacy default stream to be empty:
|
||||
if (!(stream->_flags & hipStreamNonBlocking)) {
|
||||
tprintf(DB_SYNC, "stream %p wait default stream\n", stream);
|
||||
tprintf(DB_SYNC, "%s wait default stream\n", ToString(stream).c_str());
|
||||
stream->getCtx()->_defaultStream->locked_wait();
|
||||
}
|
||||
|
||||
@@ -1553,6 +1637,7 @@ const char *ihipErrorString(hipError_t hip_error)
|
||||
case hipErrorSharedObjectSymbolNotFound : return "hipErrorSharedObjectSymbolNotFound";
|
||||
case hipErrorSharedObjectInitFailed : return "hipErrorSharedObjectInitFailed";
|
||||
case hipErrorOperatingSystem : return "hipErrorOperatingSystem";
|
||||
case hipErrorSetOnActiveProcess : return "hipErrorSetOnActiveProcess";
|
||||
case hipErrorInvalidHandle : return "hipErrorInvalidHandle";
|
||||
case hipErrorNotFound : return "hipErrorNotFound";
|
||||
case hipErrorIllegalAddress : return "hipErrorIllegalAddress";
|
||||
@@ -1779,6 +1864,7 @@ void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes,
|
||||
src, srcPtrInfo._hostPointer, srcPtrInfo._devicePointer, srcPtrInfo._sizeBytes,
|
||||
srcPtrInfo._appId, srcTracked, srcPtrInfo._isInDeviceMem);
|
||||
|
||||
this->ensureHaveQueue(crit);
|
||||
|
||||
#if USE_COPY_EXT_V2
|
||||
crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, copyDevice ? ©Device->getDevice()->_acc : nullptr, forceUnpinnedCopy);
|
||||
@@ -1843,6 +1929,8 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes
|
||||
|
||||
// Perform fast asynchronous copy - we know copyDevice != NULL based on check above
|
||||
try {
|
||||
this->ensureHaveQueue(crit);
|
||||
|
||||
if (HIP_FORCE_SYNC_COPY) {
|
||||
#if USE_COPY_EXT_V2
|
||||
crit->_av.copy_ext (src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, ©Device->getDevice()->_acc, forceUnpinnedCopy);
|
||||
@@ -1869,6 +1957,8 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes
|
||||
|
||||
} else {
|
||||
LockedAccessor_StreamCrit_t crit(_criticalData);
|
||||
|
||||
this->ensureHaveQueue(crit);
|
||||
#if USE_COPY_EXT_V2
|
||||
crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, copyDevice ? ©Device->getDevice()->_acc : nullptr, forceUnpinnedCopy);
|
||||
#else
|
||||
@@ -1926,6 +2016,7 @@ hipError_t hipHccGetAccelerator(int deviceId, hc::accelerator *acc)
|
||||
|
||||
|
||||
//---
|
||||
// Warning - with HIP_MAX_QUEUES!=0 there is no mechanism to prevent accelerator_view from being re-assigned...
|
||||
hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **av)
|
||||
{
|
||||
HIP_INIT_API(stream, av);
|
||||
@@ -1935,7 +2026,7 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a
|
||||
stream = device->_defaultStream;
|
||||
}
|
||||
|
||||
*av = stream->locked_getAv();
|
||||
*av = stream->locked_getAv(); // TODO - review.
|
||||
|
||||
hipError_t err = hipSuccess;
|
||||
return ihipLogStatus(err);
|
||||
|
||||
+85
-45
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -204,7 +204,8 @@ extern void recordApiTrace(std::string *fullStr, const std::string &apiStr);
|
||||
#define HIP_INIT()\
|
||||
std::call_once(hip_initialized, ihipInit);\
|
||||
ihipCtxStackUpdate();
|
||||
|
||||
#define HIP_SET_DEVICE()\
|
||||
ihipDeviceSetState();
|
||||
|
||||
// This macro should be called at the beginning of every HIP API.
|
||||
// It initialies the hip runtime (exactly once), and
|
||||
@@ -234,8 +235,7 @@ extern void recordApiTrace(std::string *fullStr, const std::string &apiStr);
|
||||
#define DB_SYNC 1 /* 0x02 - trace synchronization pieces */
|
||||
#define DB_MEM 2 /* 0x04 - trace memory allocation / deallocation */
|
||||
#define DB_COPY 3 /* 0x08 - trace memory copy and peer commands. . */
|
||||
#define DB_SIGNAL 4 /* 0x10 - trace signal pool commands */
|
||||
#define DB_MAX_FLAG 5
|
||||
#define DB_MAX_FLAG 4
|
||||
// When adding a new debug flag, also add to the char name table below.
|
||||
//
|
||||
|
||||
@@ -250,7 +250,6 @@ static const DbName dbName [] =
|
||||
{KYEL, "sync"},
|
||||
{KCYN, "mem"},
|
||||
{KMAG, "copy"},
|
||||
{KRED, "signal"},
|
||||
};
|
||||
|
||||
|
||||
@@ -293,6 +292,34 @@ extern "C" {
|
||||
const hipStream_t hipStreamNull = 0x0;
|
||||
|
||||
|
||||
/**
|
||||
* HIP IPC Handle Size
|
||||
*/
|
||||
#define HIP_IPC_HANDLE_SIZE 64
|
||||
class ihipIpcMemHandle_t
|
||||
{
|
||||
public:
|
||||
#if USE_IPC
|
||||
hsa_amd_ipc_memory_t ipc_handle; ///< ipc memory handle on ROCr
|
||||
#endif
|
||||
char reserved[HIP_IPC_HANDLE_SIZE];
|
||||
size_t psize;
|
||||
};
|
||||
|
||||
|
||||
class ihipModule_t {
|
||||
public:
|
||||
hsa_executable_t executable;
|
||||
hsa_code_object_t object;
|
||||
std::string fileName;
|
||||
void *ptr;
|
||||
size_t size;
|
||||
|
||||
ihipModule_t() : executable(), object(), fileName(), ptr(nullptr), size(0) {}
|
||||
};
|
||||
|
||||
|
||||
//---
|
||||
// Used to remove lock, for performance or stimulating bugs.
|
||||
class FakeMutex
|
||||
{
|
||||
@@ -331,21 +358,21 @@ public:
|
||||
_autoUnlock(autoUnlock)
|
||||
|
||||
{
|
||||
tprintf(DB_SYNC, "lock critical data %s.%p\n", typeid(T).name(), _criticalData);
|
||||
tprintf(DB_SYNC, "locking criticalData=%p for %s..\n", _criticalData, ToString(_criticalData->_parent).c_str());
|
||||
_criticalData->_mutex.lock();
|
||||
};
|
||||
|
||||
~LockedAccessor()
|
||||
{
|
||||
if (_autoUnlock) {
|
||||
tprintf(DB_SYNC, "auto-unlock critical data %s.%p\n",typeid(T).name(), _criticalData);
|
||||
tprintf(DB_SYNC, "auto-unlocking criticalData=%p for %s...\n", _criticalData, ToString(_criticalData->_parent).c_str());
|
||||
_criticalData->_mutex.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
void unlock()
|
||||
{
|
||||
tprintf(DB_SYNC, "unlock critical data %s.%p\n", typeid(T).name(), _criticalData);
|
||||
tprintf(DB_SYNC, "unlocking criticalData=%p for %s...\n", _criticalData, ToString(_criticalData->_parent).c_str());
|
||||
_criticalData->_mutex.unlock();
|
||||
}
|
||||
|
||||
@@ -365,43 +392,21 @@ struct LockedBase {
|
||||
// Most uses should use the lock-accessor.
|
||||
void lock() { _mutex.lock(); }
|
||||
void unlock() { _mutex.unlock(); }
|
||||
bool try_lock() { return _mutex.try_lock(); }
|
||||
|
||||
MUTEX_TYPE _mutex;
|
||||
};
|
||||
|
||||
/**
|
||||
* HIP IPC Handle Size
|
||||
*/
|
||||
#define HIP_IPC_HANDLE_SIZE 64
|
||||
class ihipIpcMemHandle_t
|
||||
{
|
||||
public:
|
||||
#if USE_IPC
|
||||
hsa_amd_ipc_memory_t ipc_handle; ///< ipc memory handle on ROCr
|
||||
#endif
|
||||
char reserved[HIP_IPC_HANDLE_SIZE];
|
||||
size_t psize;
|
||||
};
|
||||
|
||||
|
||||
class ihipModule_t {
|
||||
public:
|
||||
hsa_executable_t executable;
|
||||
hsa_code_object_t object;
|
||||
std::string fileName;
|
||||
void *ptr;
|
||||
size_t size;
|
||||
|
||||
ihipModule_t() : executable(), object(), fileName(), ptr(nullptr), size(0) {}
|
||||
};
|
||||
|
||||
template <typename MUTEX_TYPE>
|
||||
class ihipStreamCriticalBase_t : public LockedBase<MUTEX_TYPE>
|
||||
{
|
||||
public:
|
||||
ihipStreamCriticalBase_t(hc::accelerator_view av) :
|
||||
ihipStreamCriticalBase_t(ihipStream_t *parentStream, hc::accelerator_view av) :
|
||||
_kernelCnt(0),
|
||||
_av(av)
|
||||
_av(av),
|
||||
_hasQueue(true),
|
||||
_parent(parentStream)
|
||||
{
|
||||
};
|
||||
|
||||
@@ -410,10 +415,28 @@ public:
|
||||
|
||||
ihipStreamCriticalBase_t<StreamMutex> * mlock() { LockedBase<MUTEX_TYPE>::lock(); return this;};
|
||||
|
||||
void munlock() {
|
||||
tprintf(DB_SYNC, "munlocking criticalData=%p for %s...\n", this, ToString(this->_parent).c_str());
|
||||
LockedBase<MUTEX_TYPE>::unlock();
|
||||
};
|
||||
|
||||
ihipStreamCriticalBase_t<StreamMutex> * mtry_lock() {
|
||||
bool gotLock = LockedBase<MUTEX_TYPE>::try_lock() ;
|
||||
tprintf(DB_SYNC, "mtry_locking=%d criticalData=%p for %s...\n", gotLock, this, ToString(this->_parent).c_str());
|
||||
return gotLock ? this: nullptr;
|
||||
};
|
||||
|
||||
public:
|
||||
// TODO - remove _kernelCnt mechanism:
|
||||
ihipStream_t * _parent;
|
||||
uint32_t _kernelCnt; // Count of inflight kernels in this stream. Reset at ::wait().
|
||||
|
||||
hc::accelerator_view _av;
|
||||
|
||||
// True if the stream has an allocated queue (accelerato_view) for its use:
|
||||
// Always true at ihipStream creation but queue may later be stolen.
|
||||
// This acts as a valid bit for the _av.
|
||||
bool _hasQueue;
|
||||
private:
|
||||
};
|
||||
|
||||
|
||||
@@ -421,6 +444,7 @@ public:
|
||||
// for the ihipCtx_t and then for the individual streams. The locks should not be acquired in reverse order
|
||||
// or deadlock may occur. In some cases, it may be possible to reduce the range where the locks must be held.
|
||||
// HIP routines should avoid acquiring and releasing the same lock during the execution of a single HIP API.
|
||||
// Another option is to use try_lock in the innermost lock query.
|
||||
|
||||
|
||||
typedef ihipStreamCriticalBase_t<StreamMutex> ihipStreamCritical_t;
|
||||
@@ -435,6 +459,7 @@ public:
|
||||
enum ScheduleMode {Auto, Spin, Yield};
|
||||
typedef uint64_t SeqNum_t ;
|
||||
|
||||
// TODOD -make av a reference to avoid shared_ptr overhead?
|
||||
ihipStream_t(ihipCtx_t *ctx, hc::accelerator_view av, unsigned int flags);
|
||||
~ihipStream_t();
|
||||
|
||||
@@ -451,7 +476,7 @@ public:
|
||||
void lockclose_postKernelCommand(const char *kernelName, hc::accelerator_view *av);
|
||||
|
||||
|
||||
void locked_wait(bool assertQueueEmpty=false);
|
||||
void locked_wait();
|
||||
|
||||
hc::accelerator_view* locked_getAv() { LockedAccessor_StreamCrit_t crit(_criticalData); return &(crit->_av); };
|
||||
|
||||
@@ -462,7 +487,7 @@ public:
|
||||
//---
|
||||
|
||||
// Use this if we already have the stream critical data mutex:
|
||||
void wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty=false);
|
||||
void wait(LockedAccessor_StreamCrit_t &crit);
|
||||
|
||||
void launchModuleKernel(hc::accelerator_view av, hsa_signal_t signal,
|
||||
uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
|
||||
@@ -477,6 +502,7 @@ public:
|
||||
const ihipDevice_t * getDevice() const;
|
||||
ihipCtx_t * getCtx() const;
|
||||
|
||||
void ensureHaveQueue(LockedAccessor_StreamCrit_t &streamCrit);
|
||||
|
||||
public:
|
||||
//---
|
||||
@@ -498,11 +524,14 @@ private:
|
||||
|
||||
bool canSeeMemory(const ihipCtx_t *thisCtx, const hc::AmPointerInfo *dstInfo, const hc::AmPointerInfo *srcInfo);
|
||||
|
||||
|
||||
private: // Data
|
||||
public: // TODO - move private
|
||||
// Critical Data - MUST be accessed through LockedAccessor_StreamCrit_t
|
||||
ihipStreamCritical_t _criticalData;
|
||||
|
||||
private: // Data
|
||||
|
||||
std::mutex _hasQueueLock;
|
||||
|
||||
ihipCtx_t *_ctx; // parent context that owns this stream.
|
||||
|
||||
// Friends:
|
||||
@@ -566,6 +595,8 @@ public:
|
||||
|
||||
ihipCtx_t *_primaryCtx;
|
||||
|
||||
int _state; //1 if device is set otherwise 0
|
||||
|
||||
private:
|
||||
hipError_t initProperties(hipDeviceProp_t* prop);
|
||||
};
|
||||
@@ -579,8 +610,9 @@ template <typename MUTEX_TYPE>
|
||||
class ihipCtxCriticalBase_t : LockedBase<MUTEX_TYPE>
|
||||
{
|
||||
public:
|
||||
ihipCtxCriticalBase_t(unsigned deviceCnt) :
|
||||
_peerCnt(0)
|
||||
ihipCtxCriticalBase_t(ihipCtx_t *parentCtx, unsigned deviceCnt) :
|
||||
_parent(parentCtx),
|
||||
_peerCnt(0)
|
||||
{
|
||||
_peerAgents = new hsa_agent_t[deviceCnt];
|
||||
};
|
||||
@@ -599,6 +631,7 @@ public:
|
||||
const std::list<ihipStream_t*> &const_streams() const { return _streams; };
|
||||
|
||||
|
||||
|
||||
// Peer Accessor classes:
|
||||
bool isPeerWatcher(const ihipCtx_t *peer); // returns True if peer has access to memory physically located on this device.
|
||||
bool addPeerWatcher(const ihipCtx_t *thisCtx, ihipCtx_t *peer);
|
||||
@@ -615,6 +648,8 @@ public:
|
||||
|
||||
friend class LockedAccessor<ihipCtxCriticalBase_t>;
|
||||
private:
|
||||
ihipCtx_t * _parent;
|
||||
|
||||
//--- Stream Tracker:
|
||||
std::list< ihipStream_t* > _streams; // streams associated with this device.
|
||||
|
||||
@@ -649,16 +684,20 @@ public: // Functions:
|
||||
~ihipCtx_t();
|
||||
|
||||
// Functions which read or write the critical data are named locked_.
|
||||
// (might be better called "locking_"
|
||||
// ihipCtx_t does not use recursive locks so the ihip implementation must avoid calling a locked_ function from within a locked_ function.
|
||||
// External functions which call several locked_ functions will acquire and release the lock for each function. if this occurs in
|
||||
// performance-sensitive code we may want to refactor by adding non-locked functions and creating a new locked_ member function to call them all.
|
||||
void locked_addStream(ihipStream_t *s);
|
||||
void locked_removeStream(ihipStream_t *s);
|
||||
void locked_reset();
|
||||
void locked_waitAllStreams();
|
||||
void locked_syncDefaultStream(bool waitOnSelf);
|
||||
|
||||
ihipCtxCritical_t &criticalData() { return _criticalData; }; // TODO, move private. Fix P2P.
|
||||
// Will allocate a queue and assign it to the needyStream:
|
||||
hc::accelerator_view stealActiveQueue(LockedAccessor_CtxCrit_t &ctxCrit, ihipStream_t *needyStream);
|
||||
hc::accelerator_view createOrStealQueue(LockedAccessor_CtxCrit_t &ctxCrit);
|
||||
|
||||
ihipCtxCritical_t &criticalData() { return _criticalData; };
|
||||
|
||||
const ihipDevice_t *getDevice() const { return _device; };
|
||||
int getDeviceNum() const { return _device->_deviceId; };
|
||||
@@ -703,6 +742,7 @@ extern ihipCtx_t *ihipGetTlsDefaultCtx();
|
||||
extern void ihipSetTlsDefaultCtx(ihipCtx_t *ctx);
|
||||
extern hipError_t ihipSynchronize(void);
|
||||
extern void ihipCtxStackUpdate();
|
||||
extern hipError_t ihipDeviceSetState();
|
||||
|
||||
extern ihipDevice_t *ihipGetDevice(int);
|
||||
ihipCtx_t * ihipGetPrimaryCtx(unsigned deviceIndex);
|
||||
@@ -715,7 +755,7 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t);
|
||||
// Stream printf functions:
|
||||
inline std::ostream& operator<<(std::ostream& os, const ihipStream_t& s)
|
||||
{
|
||||
os << "stream#";
|
||||
os << "stream:";
|
||||
os << s.getDevice()->_deviceId;;
|
||||
os << '.';
|
||||
os << s._id;
|
||||
|
||||
@@ -12,6 +12,78 @@ define linkonce_odr spir_func void @__threadfence_block() #1 {
|
||||
ret void
|
||||
}
|
||||
|
||||
; Lightning does not support inline asm for 16-bit data types
|
||||
; So, bitcast half to short and then extend to 32bit i32
|
||||
; After inline asm, convert back to half
|
||||
define half @__hip_hc_ir_hadd_half(half %a, half %b) #1 {
|
||||
%1 = bitcast half %a to i16
|
||||
%2 = bitcast half %b to i16
|
||||
%3 = zext i16 %1 to i32
|
||||
%4 = zext i16 %2 to i32
|
||||
%5 = tail call i32 asm "v_add_f16 $0, $1, $2","=v,v,v"(i32 %3, i32 %4)
|
||||
%6 = trunc i32 %5 to i16
|
||||
%7 = bitcast i16 %6 to half
|
||||
ret half %7
|
||||
}
|
||||
|
||||
define half @__hip_hc_ir_hsub_half(half %a, half %b) #1 {
|
||||
%1 = bitcast half %a to i16
|
||||
%2 = bitcast half %b to i16
|
||||
%3 = zext i16 %1 to i32
|
||||
%4 = zext i16 %2 to i32
|
||||
%5 = tail call i32 asm "v_sub_f16 $0, $1, $2","=v,v,v"(i32 %3, i32 %4)
|
||||
%6 = trunc i32 %5 to i16
|
||||
%7 = bitcast i16 %6 to half
|
||||
ret half %7
|
||||
}
|
||||
|
||||
define half @__hip_hc_ir_hmul_half(half %a, half %b) #1 {
|
||||
%1 = bitcast half %a to i16
|
||||
%2 = bitcast half %b to i16
|
||||
%3 = zext i16 %1 to i32
|
||||
%4 = zext i16 %2 to i32
|
||||
%5 = tail call i32 asm "v_mul_f16 $0, $1, $2","=v,v,v"(i32 %3, i32 %4)
|
||||
%6 = trunc i32 %5 to i16
|
||||
%7 = bitcast i16 %6 to half
|
||||
ret half %7
|
||||
}
|
||||
|
||||
define half @__hip_hc_ir_hfma_half(half %a, half %b, half %c) #1 {
|
||||
%1 = bitcast half %a to i16
|
||||
%2 = bitcast half %b to i16
|
||||
%3 = bitcast half %c to i16
|
||||
%4 = zext i16 %1 to i32
|
||||
%5 = zext i16 %2 to i32
|
||||
%6 = zext i16 %3 to i32
|
||||
%7 = tail call i32 asm "v_mad_f16 $0, $1, $2, $3","=v,v,v,v"(i32 %4, i32 %5, i32 %6)
|
||||
%8 = trunc i32 %7 to i16
|
||||
%9 = bitcast i16 %8 to half
|
||||
ret half %9
|
||||
}
|
||||
|
||||
define i32 @__hip_hc_ir_hadd2_int(i32 %a, i32 %b) #1 {
|
||||
%1 = tail call i32 asm sideeffect "v_add_f16 $0, $1, $2","=v,v,v"(i32 %a, i32 %b)
|
||||
tail call void asm sideeffect "v_add_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b)
|
||||
ret i32 %1
|
||||
}
|
||||
|
||||
define i32 @__hip_hc_ir_hfma2_int(i32 %a, i32 %b, i32 %c) #1 {
|
||||
%1 = tail call i32 asm sideeffect "v_mad_f16 $0, $1, $2, $3","=v,v,v,v"(i32 %a, i32 %b, i32 %c)
|
||||
tail call void asm sideeffect "v_mul_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b)
|
||||
tail call void asm sideeffect "v_add_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %1, i32 %c)
|
||||
ret i32 %1
|
||||
}
|
||||
|
||||
define i32 @__hip_hc_ir_hmul2_int(i32 %a, i32 %b) #1 {
|
||||
%1 = tail call i32 asm sideeffect "v_mul_f16 $0, $1, $2","=v,v,v"(i32 %a, i32 %b)
|
||||
tail call void asm sideeffect "v_mul_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b)
|
||||
ret i32 %1
|
||||
}
|
||||
|
||||
define i32 @__hip_hc_ir_hsub2_int(i32 %a, i32 %b) #1 {
|
||||
%1 = tail call i32 asm sideeffect "v_sub_f16 $0, $1, $2","=v,v,v"(i32 %a, i32 %b)
|
||||
tail call void asm sideeffect "v_sub_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b)
|
||||
ret i32 %1
|
||||
}
|
||||
|
||||
attributes #1 = { alwaysinline nounwind }
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -20,23 +20,22 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hc.hpp>
|
||||
|
||||
#include "hip/hcc_detail/hip_ldg.h"
|
||||
#include "hip/hcc_detail/hip_vector_types.h"
|
||||
|
||||
__device__ char __ldg(const char* ptr)
|
||||
{
|
||||
return ptr[0];
|
||||
return *ptr;
|
||||
}
|
||||
|
||||
__device__ char2 __ldg(const char2* ptr)
|
||||
{
|
||||
return ptr[0];
|
||||
return *ptr;
|
||||
}
|
||||
|
||||
__device__ char4 __ldg(const char4* ptr)
|
||||
{
|
||||
return ptr[0];
|
||||
return *ptr;
|
||||
}
|
||||
|
||||
__device__ signed char __ldg(const signed char* ptr)
|
||||
@@ -169,6 +168,3 @@ __device__ double2 __ldg(const double2* ptr)
|
||||
{
|
||||
return ptr[0];
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
+12
-10
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -105,7 +105,7 @@ hipError_t hipHostGetDevicePointer(void **devicePointer, void *hostPointer, unsi
|
||||
hipError_t hipMalloc(void** ptr, size_t sizeBytes)
|
||||
{
|
||||
HIP_INIT_API(ptr, sizeBytes);
|
||||
|
||||
HIP_SET_DEVICE();
|
||||
hipError_t hip_status = hipSuccess;
|
||||
// return NULL pointer when malloc size is 0
|
||||
if (sizeBytes == 0)
|
||||
@@ -131,7 +131,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes)
|
||||
LockedAccessor_CtxCrit_t crit(ctx->criticalData());
|
||||
// the peerCnt always stores self so make sure the trace actually
|
||||
peerCnt = crit->peerCnt();
|
||||
tprintf(DB_MEM, " allocated device_mem ptr:%p size:%zu on dev:%d and allowed %d other peer(s) access\n",
|
||||
tprintf(DB_MEM, " allocated device_mem ptr:%p size:%zu on dev:%d and allow access to %d other peer(s)\n",
|
||||
*ptr, sizeBytes, device->_deviceId, peerCnt-1);
|
||||
if (peerCnt > 1) {
|
||||
|
||||
@@ -161,7 +161,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes)
|
||||
hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
|
||||
{
|
||||
HIP_INIT_API(ptr, sizeBytes, flags);
|
||||
|
||||
HIP_SET_DEVICE();
|
||||
hipError_t hip_status = hipSuccess;
|
||||
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
@@ -233,7 +233,7 @@ hipError_t hipHostAlloc(void** ptr, size_t sizeBytes, unsigned int flags)
|
||||
hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height)
|
||||
{
|
||||
HIP_INIT_API(ptr, pitch, width, height);
|
||||
|
||||
HIP_SET_DEVICE();
|
||||
hipError_t hip_status = hipSuccess;
|
||||
|
||||
if(width == 0 || height == 0)
|
||||
@@ -285,7 +285,7 @@ hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc,
|
||||
size_t width, size_t height, unsigned int flags)
|
||||
{
|
||||
HIP_INIT_API(array, desc, width, height, flags);
|
||||
|
||||
HIP_SET_DEVICE();
|
||||
hipError_t hip_status = hipSuccess;
|
||||
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
@@ -813,6 +813,8 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s
|
||||
if (stream) {
|
||||
auto crit = stream->lockopen_preKernelCommand();
|
||||
|
||||
stream->ensureHaveQueue(crit);
|
||||
|
||||
hc::completion_future cf ;
|
||||
|
||||
if ((sizeBytes & 0x3) == 0) {
|
||||
@@ -841,7 +843,6 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s
|
||||
if (HIP_API_BLOCKING) {
|
||||
tprintf (DB_SYNC, "%s LAUNCH_BLOCKING wait for hipMemsetAsync.\n", ToString(stream).c_str());
|
||||
cf.wait();
|
||||
//tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed [stream:%p].\n", __func__, (void*)stream);
|
||||
}
|
||||
} else {
|
||||
e = hipErrorInvalidValue;
|
||||
@@ -864,6 +865,7 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes )
|
||||
if (stream) {
|
||||
auto crit = stream->lockopen_preKernelCommand();
|
||||
|
||||
stream->ensureHaveQueue(crit);
|
||||
hc::completion_future cf ;
|
||||
|
||||
if ((sizeBytes & 0x3) == 0) {
|
||||
@@ -892,9 +894,9 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes )
|
||||
|
||||
|
||||
if (HIP_LAUNCH_BLOCKING) {
|
||||
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset [stream:%p].\n", __func__, (void*)stream);
|
||||
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str());
|
||||
cf.wait();
|
||||
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed [stream:%p].\n", __func__, (void*)stream);
|
||||
tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str());
|
||||
}
|
||||
} else {
|
||||
e = hipErrorInvalidValue;
|
||||
@@ -1085,7 +1087,7 @@ hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned
|
||||
hsa_amd_ipc_memory_attach(&handle->ipc_handle, handle->psize, 1, agent, devPtr);
|
||||
if(hsa_status != HSA_STATUS_SUCCESS)
|
||||
hipStatus = hipErrorMapBufferObjectFailed;
|
||||
#else
|
||||
#else
|
||||
hipStatus = hipErrorRuntimeOther;
|
||||
#endif
|
||||
return hipStatus;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -403,5 +403,3 @@ hipError_t hipModuleLoadData(hipModule_t *module, const void *image)
|
||||
}
|
||||
return ihipLogStatus(ret);
|
||||
}
|
||||
|
||||
|
||||
|
||||
+13
-13
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -31,7 +31,7 @@ THE SOFTWARE.
|
||||
// There are two flavors:
|
||||
// - one where contexts are specified with hipCtx_t type.
|
||||
// - one where contexts are specified with integer deviceIds, that are mapped to the primary context for that device.
|
||||
// The implementation contains a set of internal ihip* functions which operate on contexts. Then the
|
||||
// The implementation contains a set of internal ihip* functions which operate on contexts. Then the
|
||||
// public APIs are thin wrappers which call into this internal implementations.
|
||||
// TODO - actually not yet - currently the integer deviceId flavors just call the context APIs. need to fix.
|
||||
|
||||
@@ -46,16 +46,16 @@ hipError_t ihipDeviceCanAccessPeer (int* canAccessPeer, hipCtx_t thisCtx, hipCtx
|
||||
|
||||
if (thisCtx == peerCtx) {
|
||||
*canAccessPeer = 0;
|
||||
tprintf(DB_MEM, "Can't be peer to self. (this=%s, peer=%s)\n",
|
||||
thisCtx->toString().c_str(), peerCtx->toString().c_str());
|
||||
tprintf(DB_MEM, "Can't be peer to self. (this=%s, peer=%s)\n",
|
||||
thisCtx->toString().c_str(), peerCtx->toString().c_str());
|
||||
} else if (HIP_FORCE_P2P_HOST & 0x2) {
|
||||
*canAccessPeer = false;
|
||||
tprintf(DB_MEM, "HIP_FORCE_P2P_HOST denies peer access this=%s peer=%s canAccessPeer=%d\n",
|
||||
thisCtx->toString().c_str(), peerCtx->toString().c_str(), *canAccessPeer);
|
||||
tprintf(DB_MEM, "HIP_FORCE_P2P_HOST denies peer access this=%s peer=%s canAccessPeer=%d\n",
|
||||
thisCtx->toString().c_str(), peerCtx->toString().c_str(), *canAccessPeer);
|
||||
} else {
|
||||
*canAccessPeer = peerCtx->getDevice()->_acc.get_is_peer(thisCtx->getDevice()->_acc);
|
||||
tprintf(DB_MEM, "deviceCanAccessPeer this=%s peer=%s canAccessPeer=%d\n",
|
||||
thisCtx->toString().c_str(), peerCtx->toString().c_str(), *canAccessPeer);
|
||||
tprintf(DB_MEM, "deviceCanAccessPeer this=%s peer=%s canAccessPeer=%d\n",
|
||||
thisCtx->toString().c_str(), peerCtx->toString().c_str(), *canAccessPeer);
|
||||
}
|
||||
|
||||
} else {
|
||||
@@ -99,14 +99,14 @@ hipError_t ihipDisablePeerAccess (hipCtx_t peerCtx)
|
||||
LockedAccessor_CtxCrit_t peerCrit(peerCtx->criticalData());
|
||||
bool changed = peerCrit->removePeerWatcher(peerCtx, thisCtx);
|
||||
if (changed) {
|
||||
tprintf(DB_MEM, "device %s disable access to memory allocated on peer:%s\n",
|
||||
thisCtx->toString().c_str(), peerCtx->toString().c_str());
|
||||
tprintf(DB_MEM, "device %s disable access to memory allocated on peer:%s\n",
|
||||
thisCtx->toString().c_str(), peerCtx->toString().c_str());
|
||||
// Update the peers for all memory already saved in the tracker:
|
||||
am_memtracker_update_peers(peerCtx->getDevice()->_acc, peerCrit->peerCnt(), peerCrit->peerAgents());
|
||||
} else {
|
||||
err = hipErrorPeerAccessNotEnabled; // never enabled P2P access.
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
err = hipErrorInvalidDevice;
|
||||
}
|
||||
@@ -133,8 +133,8 @@ hipError_t ihipEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags)
|
||||
// Add thisCtx to peerCtx's access list so that new allocations on peer will be made visible to this device:
|
||||
bool isNewPeer = peerCrit->addPeerWatcher(peerCtx, thisCtx);
|
||||
if (isNewPeer) {
|
||||
tprintf(DB_MEM, "device=%s can now see all memory allocated on peer=%s\n",
|
||||
thisCtx->toString().c_str(), peerCtx->toString().c_str());
|
||||
tprintf(DB_MEM, "device=%s can now see all memory allocated on peer=%s\n",
|
||||
thisCtx->toString().c_str(), peerCtx->toString().c_str());
|
||||
am_memtracker_update_peers(peerCtx->getDevice()->_acc, peerCrit->peerCnt(), peerCrit->peerAgents());
|
||||
} else {
|
||||
err = hipErrorPeerAccessAlreadyEnabled;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -45,12 +45,17 @@ hipError_t ihipStreamCreate(hipStream_t *stream, unsigned int flags)
|
||||
//Note this is an execute_in_order queue, so all kernels submitted will atuomatically wait for prev to complete:
|
||||
//This matches CUDA stream behavior:
|
||||
|
||||
auto istream = new ihipStream_t(ctx, acc.create_view(), flags);
|
||||
{
|
||||
// Obtain mutex access to the device critical data, release by destructor
|
||||
LockedAccessor_CtxCrit_t ctxCrit(ctx->criticalData());
|
||||
|
||||
ctx->locked_addStream(istream);
|
||||
auto istream = new ihipStream_t(ctx, ctx->createOrStealQueue(ctxCrit), flags);
|
||||
|
||||
*stream = istream;
|
||||
tprintf(DB_SYNC, "hipStreamCreate, stream=%p\n", *stream);
|
||||
ctxCrit->addStream(istream);
|
||||
*stream = istream;
|
||||
}
|
||||
|
||||
tprintf(DB_SYNC, "hipStreamCreate, %s\n", ToString(*stream).c_str());
|
||||
} else {
|
||||
e = hipErrorInvalidDevice;
|
||||
}
|
||||
@@ -120,8 +125,14 @@ hipError_t hipStreamQuery(hipStream_t stream)
|
||||
stream = device->_defaultStream;
|
||||
}
|
||||
|
||||
LockedAccessor_StreamCrit_t crit(stream->_criticalData);
|
||||
int pendingOps = crit->_av.get_pending_async_ops();
|
||||
int pendingOps = 0;
|
||||
|
||||
{
|
||||
LockedAccessor_StreamCrit_t crit(stream->_criticalData);
|
||||
if (crit->_hasQueue) {
|
||||
pendingOps = crit->_av.get_pending_async_ops();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
hipError_t e = (pendingOps > 0) ? hipErrorNotReady : hipSuccess;
|
||||
@@ -203,7 +214,7 @@ hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallback_t callback
|
||||
{
|
||||
HIP_INIT_API(stream, callback, userData, flags);
|
||||
hipError_t e = hipSuccess;
|
||||
//--- explicitly synchronize stream to add callback routines
|
||||
//--- explicitly synchronize stream to add callback routines
|
||||
hipStreamSynchronize(stream);
|
||||
callback(stream, e, userData);
|
||||
return ihipLogStatus(e);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
|
||||
+10
-10
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -55,7 +55,7 @@ inline std::string ToHexString(T v)
|
||||
|
||||
// This is the default which works for most types:
|
||||
template <typename T>
|
||||
inline std::string ToString(T v)
|
||||
inline std::string ToString(T v)
|
||||
{
|
||||
std::ostringstream ss;
|
||||
ss << v;
|
||||
@@ -65,7 +65,7 @@ inline std::string ToString(T v)
|
||||
|
||||
// hipEvent_t specialization. TODO - maybe add an event ID for debug?
|
||||
template <>
|
||||
inline std::string ToString(hipEvent_t v)
|
||||
inline std::string ToString(hipEvent_t v)
|
||||
{
|
||||
std::ostringstream ss;
|
||||
ss << v;
|
||||
@@ -74,7 +74,7 @@ inline std::string ToString(hipEvent_t v)
|
||||
|
||||
// hipEvent_t specialization. TODO - maybe add an event ID for debug?
|
||||
template <>
|
||||
inline std::string ToString(hipFunction_t v)
|
||||
inline std::string ToString(hipFunction_t v)
|
||||
{
|
||||
std::ostringstream ss;
|
||||
ss << "0x" << std::hex << v._object;
|
||||
@@ -85,7 +85,7 @@ inline std::string ToString(hipFunction_t v)
|
||||
|
||||
// hipStream_t
|
||||
template <>
|
||||
inline std::string ToString(hipStream_t v)
|
||||
inline std::string ToString(hipStream_t v)
|
||||
{
|
||||
std::ostringstream ss;
|
||||
if (v == NULL) {
|
||||
@@ -99,7 +99,7 @@ inline std::string ToString(hipStream_t v)
|
||||
|
||||
// hipMemcpyKind specialization
|
||||
template <>
|
||||
inline std::string ToString(hipMemcpyKind v)
|
||||
inline std::string ToString(hipMemcpyKind v)
|
||||
{
|
||||
switch(v) {
|
||||
CASE_STR(hipMemcpyHostToHost);
|
||||
@@ -113,14 +113,14 @@ inline std::string ToString(hipMemcpyKind v)
|
||||
|
||||
|
||||
template <>
|
||||
inline std::string ToString(hipError_t v)
|
||||
inline std::string ToString(hipError_t v)
|
||||
{
|
||||
return ihipErrorString(v);
|
||||
};
|
||||
|
||||
|
||||
// Catch empty arguments case
|
||||
inline std::string ToString()
|
||||
inline std::string ToString()
|
||||
{
|
||||
return ("");
|
||||
}
|
||||
@@ -129,8 +129,8 @@ inline std::string ToString()
|
||||
//---
|
||||
// C++11 variadic template - peels off first argument, converts to string, and calls itself again to peel the next arg.
|
||||
// Strings are automatically separated by comma+space.
|
||||
template <typename T, typename... Args>
|
||||
inline std::string ToString(T first, Args... args)
|
||||
template <typename T, typename... Args>
|
||||
inline std::string ToString(T first, Args... args)
|
||||
{
|
||||
return ToString(first) + ", " + ToString(args...) ;
|
||||
}
|
||||
|
||||
@@ -0,0 +1,91 @@
|
||||
/*
|
||||
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../test_common.cpp
|
||||
* RUN: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include "test_common.h"
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "hip/hip_runtime_api.h"
|
||||
#include "hip/hip_fp16.h"
|
||||
|
||||
#define hInf 0x7C00
|
||||
#define hInfPK 0x7C007C00
|
||||
#define h65504 0xF7FF
|
||||
#define h65504PK 0xF7FFF7FF
|
||||
#define h27 0x4EC0
|
||||
#define h27PK 0x4EC04EC0
|
||||
#define h7 0x4700
|
||||
#define h7PK 0x47004700
|
||||
#define h3 0x4200
|
||||
#define h3PK 0x42004200
|
||||
#define h1 0x3C00
|
||||
#define h1PK 0x3C003C00
|
||||
#define hPoint5 0x3800
|
||||
#define hPoint5PK 0x38003800
|
||||
#define hZero 0x0000
|
||||
#define hNeg1 0xBC00
|
||||
#define hNeg1PK 0xBC00BC00
|
||||
|
||||
__global__ void CheckHalf(hipLaunchParm lp, __half* In1, __half* In2, __half* In3, __half* Out){
|
||||
Out[0] = __hadd(In1[0], In2[0]);
|
||||
Out[1] = __hadd_sat(In1[1], In2[1]);
|
||||
Out[2] = __hfma(In1[2], In2[2],In3[2]);
|
||||
Out[3] = __hfma_sat(In1[3], In2[3], In3[3]);
|
||||
Out[4] = __hmul(In1[4], In2[4]);
|
||||
Out[5] = __hmul_sat(In1[5], In2[5]);
|
||||
Out[6] = __hneg(In1[6]);
|
||||
Out[7] = __hsub(In1[7], In2[7]);
|
||||
Out[8] = __hsub_sat(In1[8], In2[8]);
|
||||
Out[9] = hdiv(In1[9], In2[9]);
|
||||
}
|
||||
|
||||
__global__ void CheckHalf2(hipLaunchParm lp, __half2* In1, __half2* In2, __half2* In3, __half2* Out){
|
||||
Out[0] = __hadd2(In1[0], In2[0]);
|
||||
Out[1] = __hadd2_sat(In1[1], In2[1]);
|
||||
Out[2] = __hfma2(In1[2], In2[2],In3[2]);
|
||||
Out[3] = __hfma2_sat(In1[3], In2[3], In3[3]);
|
||||
Out[4] = __hmul2(In1[4], In2[4]);
|
||||
Out[5] = __hmul2_sat(In1[5], In2[5]);
|
||||
Out[6] = __hneg2(In1[6]);
|
||||
Out[7] = __hsub2(In1[7], In2[7]);
|
||||
Out[8] = __hsub2_sat(In1[8], In2[8]);
|
||||
Out[9] = h2div(In1[9], In2[9]);
|
||||
}
|
||||
|
||||
__global__ void CheckCmpHalf(hipLaunchParm lp, __half* In1, __half* In2, bool* Out) {
|
||||
Out[0] = __heq(In1[0], In2[0]);
|
||||
Out[1] = __hge(In1[1], In2[1]);
|
||||
Out[2] = __hgt(In1[2], In2[2]);
|
||||
Out[3] = __hisinf(In1[3]);
|
||||
Out[4] = __hisnan(In1[4]);
|
||||
Out[5] = __hle(In1[5], In2[5]);
|
||||
Out[6] = __hlt(In1[6], In2[6]);
|
||||
Out[7] = __hne(In1[7], In2[7]);
|
||||
}
|
||||
|
||||
int main(){
|
||||
|
||||
}
|
||||
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@@ -32,6 +32,7 @@ THE SOFTWARE.
|
||||
#include <stdlib.h>
|
||||
#include<iostream>
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "hip/hip_vector_types.h"
|
||||
#include "test_common.h"
|
||||
|
||||
#if (__hcc_workweek__ >= 16164) || defined (__HIP_PLATFORM_NVCC__)
|
||||
@@ -389,4 +390,3 @@ int main() {
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@@ -33,7 +33,7 @@ void createThenDestroyStreams(int iterations, int burstSize)
|
||||
}
|
||||
}
|
||||
|
||||
delete streams;
|
||||
delete[] streams;
|
||||
}
|
||||
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user