diff --git a/hipamd/include/hip/amd_detail/hip_prof_str.h b/hipamd/include/hip/amd_detail/hip_prof_str.h index 27c8b26a1b..1bdd5ff70c 100644 --- a/hipamd/include/hip/amd_detail/hip_prof_str.h +++ b/hipamd/include/hip/amd_detail/hip_prof_str.h @@ -93,7 +93,7 @@ enum hip_api_id_t { HIP_API_ID_hipGetDevice = 78, HIP_API_ID_hipGetDeviceCount = 79, HIP_API_ID_hipGetDeviceFlags = 80, - HIP_API_ID_hipGetDeviceProperties = 81, + HIP_API_ID_hipGetDevicePropertiesR0600 = 81, HIP_API_ID_RESERVED_82 = 82, HIP_API_ID_hipGetErrorString = 83, HIP_API_ID_hipGetLastError = 84, @@ -518,7 +518,7 @@ static inline const char* hip_api_name(const uint32_t id) { case HIP_API_ID_hipGetDevice: return "hipGetDevice"; case HIP_API_ID_hipGetDeviceCount: return "hipGetDeviceCount"; case HIP_API_ID_hipGetDeviceFlags: return "hipGetDeviceFlags"; - case HIP_API_ID_hipGetDeviceProperties: return "hipGetDeviceProperties"; + case HIP_API_ID_hipGetDevicePropertiesR0600: return "hipGetDevicePropertiesR0600"; case HIP_API_ID_hipGetErrorString: return "hipGetErrorString"; case HIP_API_ID_hipGetLastError: return "hipGetLastError"; case HIP_API_ID_hipGetMipmappedArrayLevel: return "hipGetMipmappedArrayLevel"; @@ -885,7 +885,7 @@ static inline uint32_t hipApiIdByName(const char* name) { if (strcmp("hipGetDevice", name) == 0) return HIP_API_ID_hipGetDevice; if (strcmp("hipGetDeviceCount", name) == 0) return HIP_API_ID_hipGetDeviceCount; if (strcmp("hipGetDeviceFlags", name) == 0) return HIP_API_ID_hipGetDeviceFlags; - if (strcmp("hipGetDeviceProperties", name) == 0) return HIP_API_ID_hipGetDeviceProperties; + if (strcmp("hipGetDevicePropertiesR0600", name) == 0) return HIP_API_ID_hipGetDevicePropertiesR0600; if (strcmp("hipGetErrorString", name) == 0) return HIP_API_ID_hipGetErrorString; if (strcmp("hipGetLastError", name) == 0) return HIP_API_ID_hipGetLastError; if (strcmp("hipGetMipmappedArrayLevel", name) == 0) return HIP_API_ID_hipGetMipmappedArrayLevel; @@ -1213,8 +1213,8 @@ typedef struct hip_api_data_s { struct { int* device; int device__val; - const hipDeviceProp_t* prop; - hipDeviceProp_t prop__val; + const hipDeviceProp_tR0600* prop; + hipDeviceProp_tR0600 prop__val; } hipChooseDevice; struct { dim3 gridDim; @@ -1628,10 +1628,10 @@ typedef struct hip_api_data_s { unsigned int flags__val; } hipGetDeviceFlags; struct { - hipDeviceProp_t* props; - hipDeviceProp_t props__val; + hipDeviceProp_tR0600* props; + hipDeviceProp_tR0600 props__val; hipDevice_t device; - } hipGetDeviceProperties; + } hipGetDevicePropertiesR0600; struct { hipArray_t* levelArray; hipArray_t levelArray__val; @@ -3325,10 +3325,10 @@ typedef struct hip_api_data_s { cb_data.args.hipArrayGetInfo.flags = (unsigned int*)flags; \ cb_data.args.hipArrayGetInfo.array = (hipArray*)array; \ }; -// hipChooseDevice[('int*', 'device'), ('const hipDeviceProp_t*', 'prop')] +// hipChooseDevice[('int*', 'device'), ('const hipDeviceProp_tR0600*', 'prop')] #define INIT_hipChooseDevice_CB_ARGS_DATA(cb_data) { \ cb_data.args.hipChooseDevice.device = (int*)device; \ - cb_data.args.hipChooseDevice.prop = (const hipDeviceProp_t*)properties; \ + cb_data.args.hipChooseDevice.prop = (const hipDeviceProp_tR0600*)properties; \ }; // hipConfigureCall[('dim3', 'gridDim'), ('dim3', 'blockDim'), ('size_t', 'sharedMem'), ('hipStream_t', 'stream')] #define INIT_hipConfigureCall_CB_ARGS_DATA(cb_data) { \ @@ -3779,10 +3779,10 @@ typedef struct hip_api_data_s { #define INIT_hipGetDeviceFlags_CB_ARGS_DATA(cb_data) { \ cb_data.args.hipGetDeviceFlags.flags = (unsigned int*)flags; \ }; -// hipGetDeviceProperties[('hipDeviceProp_t*', 'props'), ('hipDevice_t', 'device')] -#define INIT_hipGetDeviceProperties_CB_ARGS_DATA(cb_data) { \ - cb_data.args.hipGetDeviceProperties.props = (hipDeviceProp_t*)props; \ - cb_data.args.hipGetDeviceProperties.device = (hipDevice_t)device; \ +// hipGetDeviceProperties[('hipDeviceProp_tR0600*', 'props'), ('hipDevice_t', 'device')] +#define INIT_hipGetDevicePropertiesR0600_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipGetDevicePropertiesR0600.props = (hipDeviceProp_tR0600*)props; \ + cb_data.args.hipGetDevicePropertiesR0600.device = (hipDevice_t)device; \ }; // hipGetErrorString[] #define INIT_hipGetErrorString_CB_ARGS_DATA(cb_data) { \ @@ -5573,7 +5573,7 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) { if (data->args.hipArrayGetInfo.flags) data->args.hipArrayGetInfo.flags__val = *(data->args.hipArrayGetInfo.flags); if (data->args.hipArrayGetInfo.array) data->args.hipArrayGetInfo.array__val = *(data->args.hipArrayGetInfo.array); break; -// hipChooseDevice[('int*', 'device'), ('const hipDeviceProp_t*', 'prop')] +// hipChooseDevice[('int*', 'device'), ('const hipDeviceProp_tR0600*', 'prop')] case HIP_API_ID_hipChooseDevice: if (data->args.hipChooseDevice.device) data->args.hipChooseDevice.device__val = *(data->args.hipChooseDevice.device); if (data->args.hipChooseDevice.prop) data->args.hipChooseDevice.prop__val = *(data->args.hipChooseDevice.prop); @@ -5908,9 +5908,9 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) { case HIP_API_ID_hipGetDeviceFlags: if (data->args.hipGetDeviceFlags.flags) data->args.hipGetDeviceFlags.flags__val = *(data->args.hipGetDeviceFlags.flags); break; -// hipGetDeviceProperties[('hipDeviceProp_t*', 'props'), ('hipDevice_t', 'device')] - case HIP_API_ID_hipGetDeviceProperties: - if (data->args.hipGetDeviceProperties.props) data->args.hipGetDeviceProperties.props__val = *(data->args.hipGetDeviceProperties.props); +// hipGetDevicePropertiesR0600[('hipDeviceProp_tR0600*', 'props'), ('hipDevice_t', 'device')] + case HIP_API_ID_hipGetDevicePropertiesR0600: + if (data->args.hipGetDevicePropertiesR0600.props) data->args.hipGetDevicePropertiesR0600.props__val = *(data->args.hipGetDevicePropertiesR0600.props); break; // hipGetErrorString[] case HIP_API_ID_hipGetErrorString: @@ -7626,11 +7626,11 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* da else { oss << "flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetDeviceFlags.flags__val); } oss << ")"; break; - case HIP_API_ID_hipGetDeviceProperties: - oss << "hipGetDeviceProperties("; - if (data->args.hipGetDeviceProperties.props == NULL) oss << "props=NULL"; - else { oss << "props="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetDeviceProperties.props__val); } - oss << ", device="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetDeviceProperties.device); + case HIP_API_ID_hipGetDevicePropertiesR0600: + oss << "hipGetDevicePropertiesR0600("; + if (data->args.hipGetDevicePropertiesR0600.props == NULL) oss << "props=NULL"; + else { oss << "props="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetDevicePropertiesR0600.props__val); } + oss << ", device="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetDevicePropertiesR0600.device); oss << ")"; break; case HIP_API_ID_hipGetErrorString: diff --git a/hipamd/include/hip/nvidia_detail/nvidia_hip_runtime.h b/hipamd/include/hip/nvidia_detail/nvidia_hip_runtime.h index c63e35700b..eabce14fa7 100644 --- a/hipamd/include/hip/nvidia_detail/nvidia_hip_runtime.h +++ b/hipamd/include/hip/nvidia_detail/nvidia_hip_runtime.h @@ -32,11 +32,11 @@ THE SOFTWARE. typedef int hipLaunchParm; #define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \ - do { \ - kernelName<<>>(__VA_ARGS__); \ - } while (0) + do { \ + kernelName<<>>(__VA_ARGS__); \ + } while (0) -#define hipLaunchKernelGGL(kernelName, ...) hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__) +#define hipLaunchKernelGGL(kernelName, ...) hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__) #define hipReadModeElementType cudaReadModeElementType @@ -105,15 +105,15 @@ typedef int hipLaunchParm; #define HIP_DYNAMIC_SHARED_ATTRIBUTE #ifdef __HIP_DEVICE_COMPILE__ -#define abort_() \ - { asm("trap;"); } +#define abort_() \ + { asm("trap;"); } #undef assert #define assert(COND) \ - { \ - if (!COND) { \ - abort_(); \ - } \ - } + { \ + if (!COND) { \ + abort_(); \ + } \ + } #endif #define __clock() clock() diff --git a/hipamd/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipamd/include/hip/nvidia_detail/nvidia_hip_runtime_api.h index c340cb0b01..89ae35428c 100644 --- a/hipamd/include/hip/nvidia_detail/nvidia_hip_runtime_api.h +++ b/hipamd/include/hip/nvidia_detail/nvidia_hip_runtime_api.h @@ -1980,87 +1980,143 @@ inline static hipError_t hipMemset3DAsync(hipPitchedPtr pitchedDevPtr, int valu } inline static hipError_t hipGetDeviceProperties(hipDeviceProp_t* p_prop, int device) { - if (p_prop == NULL) { - return hipErrorInvalidValue; + return hipErrorInvalidValue; } struct cudaDeviceProp cdprop; - cudaError_t cerror; - cerror = cudaGetDeviceProperties(&cdprop, device); + hipError_t error = hipCUDAErrorTohipError(cudaGetDeviceProperties(&cdprop, device)); + + if (error != hipSuccess) { + return error; + } strncpy(p_prop->name, cdprop.name, 256); + strncpy(p_prop->uuid.bytes, cdprop.uuid.bytes, 16); + strncpy(p_prop->luid, cdprop.luid, 8); + p_prop->luidDeviceNodeMask = cdprop.luidDeviceNodeMask; p_prop->totalGlobalMem = cdprop.totalGlobalMem; p_prop->sharedMemPerBlock = cdprop.sharedMemPerBlock; p_prop->regsPerBlock = cdprop.regsPerBlock; - p_prop->warpSize = cdprop.warpSize; + p_prop->memPitch = cdprop.memPitch; p_prop->maxThreadsPerBlock = cdprop.maxThreadsPerBlock; - for (int i = 0; i < 3; i++) { - p_prop->maxThreadsDim[i] = cdprop.maxThreadsDim[i]; - p_prop->maxGridSize[i] = cdprop.maxGridSize[i]; - } + p_prop->maxThreadsDim[0] = cdprop.maxThreadsDim[0]; + p_prop->maxThreadsDim[1] = cdprop.maxThreadsDim[1]; + p_prop->maxThreadsDim[2] = cdprop.maxThreadsDim[2]; + p_prop->maxGridSize[0] = cdprop.maxGridSize[0]; + p_prop->maxGridSize[1] = cdprop.maxGridSize[1]; + p_prop->maxGridSize[2] = cdprop.maxGridSize[2]; p_prop->clockRate = cdprop.clockRate; - p_prop->memoryClockRate = cdprop.memoryClockRate; - p_prop->memoryBusWidth = cdprop.memoryBusWidth; p_prop->totalConstMem = cdprop.totalConstMem; p_prop->major = cdprop.major; p_prop->minor = cdprop.minor; + p_prop->textureAlignment = cdprop.textureAlignment; + p_prop->texturePitchAlignment = cdprop.texturePitchAlignment; + p_prop->deviceOverlap = cdprop.deviceOverlap; p_prop->multiProcessorCount = cdprop.multiProcessorCount; - p_prop->l2CacheSize = cdprop.l2CacheSize; - p_prop->maxThreadsPerMultiProcessor = cdprop.maxThreadsPerMultiProcessor; - p_prop->computeMode = cdprop.computeMode; - p_prop->clockInstructionRate = cdprop.clockRate; // Same as clock-rate: - - int ccVers = p_prop->major * 100 + p_prop->minor * 10; - p_prop->arch.hasGlobalInt32Atomics = (ccVers >= 110); - p_prop->arch.hasGlobalFloatAtomicExch = (ccVers >= 110); - p_prop->arch.hasSharedInt32Atomics = (ccVers >= 120); - p_prop->arch.hasSharedFloatAtomicExch = (ccVers >= 120); - p_prop->arch.hasFloatAtomicAdd = (ccVers >= 200); - p_prop->arch.hasGlobalInt64Atomics = (ccVers >= 120); - p_prop->arch.hasSharedInt64Atomics = (ccVers >= 110); - p_prop->arch.hasDoubles = (ccVers >= 130); - p_prop->arch.hasWarpVote = (ccVers >= 120); - p_prop->arch.hasWarpBallot = (ccVers >= 200); - p_prop->arch.hasWarpShuffle = (ccVers >= 300); - p_prop->arch.hasFunnelShift = (ccVers >= 350); - p_prop->arch.hasThreadFenceSystem = (ccVers >= 200); - p_prop->arch.hasSyncThreadsExt = (ccVers >= 200); - p_prop->arch.hasSurfaceFuncs = (ccVers >= 200); - p_prop->arch.has3dGrid = (ccVers >= 200); - p_prop->arch.hasDynamicParallelism = (ccVers >= 350); - - p_prop->concurrentKernels = cdprop.concurrentKernels; - p_prop->pciDomainID = cdprop.pciDomainID; - p_prop->pciBusID = cdprop.pciBusID; - p_prop->pciDeviceID = cdprop.pciDeviceID; - p_prop->maxSharedMemoryPerMultiProcessor = cdprop.sharedMemPerMultiprocessor; - p_prop->isMultiGpuBoard = cdprop.isMultiGpuBoard; - p_prop->canMapHostMemory = cdprop.canMapHostMemory; - p_prop->gcnArch = 0; // Not a GCN arch + p_prop->kernelExecTimeoutEnabled = cdprop.kernelExecTimeoutEnabled; p_prop->integrated = cdprop.integrated; - p_prop->cooperativeLaunch = cdprop.cooperativeLaunch; - p_prop->cooperativeMultiDeviceLaunch = cdprop.cooperativeMultiDeviceLaunch; - p_prop->cooperativeMultiDeviceUnmatchedFunc = 0; - p_prop->cooperativeMultiDeviceUnmatchedGridDim = 0; - p_prop->cooperativeMultiDeviceUnmatchedBlockDim = 0; - p_prop->cooperativeMultiDeviceUnmatchedSharedMem = 0; - - p_prop->maxTexture1D = cdprop.maxTexture1D; + p_prop->canMapHostMemory = cdprop.canMapHostMemory; + p_prop->computeMode = cdprop.computeMode; + p_prop->maxTexture1D = cdprop.maxTexture1D; + p_prop->maxTexture1DMipmap = cdprop.maxTexture1DMipmap; + p_prop->maxTexture1DLinear = cdprop.maxTexture1DLinear; p_prop->maxTexture2D[0] = cdprop.maxTexture2D[0]; p_prop->maxTexture2D[1] = cdprop.maxTexture2D[1]; + p_prop->maxTexture2DMipmap[0] = cdprop.maxTexture2DMipmap[0]; + p_prop->maxTexture2DMipmap[1] = cdprop.maxTexture2DMipmap[1]; + p_prop->maxTexture2DLinear[0] = cdprop.maxTexture2DLinear[0]; + p_prop->maxTexture2DLinear[1] = cdprop.maxTexture2DLinear[1]; + p_prop->maxTexture2DLinear[2] = cdprop.maxTexture2DLinear[2]; + p_prop->maxTexture2DGather[0] = cdprop.maxTexture2DGather[0]; + p_prop->maxTexture2DGather[1] = cdprop.maxTexture2DGather[1]; p_prop->maxTexture3D[0] = cdprop.maxTexture3D[0]; p_prop->maxTexture3D[1] = cdprop.maxTexture3D[1]; p_prop->maxTexture3D[2] = cdprop.maxTexture3D[2]; + p_prop->maxTexture3DAlt[0] = cdprop.maxTexture3DAlt[0]; + p_prop->maxTexture3DAlt[1] = cdprop.maxTexture3DAlt[1]; + p_prop->maxTexture3DAlt[2] = cdprop.maxTexture3DAlt[2]; + p_prop->maxTextureCubemap = cdprop.maxTextureCubemap; + p_prop->maxTexture1DLayered[0] = cdprop.maxTexture1DLayered[0]; + p_prop->maxTexture1DLayered[1] = cdprop.maxTexture1DLayered[1]; + p_prop->maxTexture2DLayered[0] = cdprop.maxTexture2DLayered[0]; + p_prop->maxTexture2DLayered[1] = cdprop.maxTexture2DLayered[1]; + p_prop->maxTexture2DLayered[2] = cdprop.maxTexture2DLayered[2]; + p_prop->maxTextureCubemapLayered[0] = cdprop.maxTextureCubemapLayered[0]; + p_prop->maxTextureCubemapLayered[1] = cdprop.maxTextureCubemapLayered[1]; + p_prop->maxSurface1D = cdprop.maxSurface1D; + p_prop->maxSurface2D[0] = cdprop.maxSurface2D[0]; + p_prop->maxSurface2D[1] = cdprop.maxSurface2D[1]; + p_prop->maxSurface3D[0] = cdprop.maxSurface3D[0]; + p_prop->maxSurface3D[1] = cdprop.maxSurface3D[1]; + p_prop->maxSurface3D[2] = cdprop.maxSurface3D[2]; + p_prop->maxSurface1DLayered[0] = cdprop.maxSurface1DLayered[0]; + p_prop->maxSurface1DLayered[1] = cdprop.maxSurface1DLayered[1]; + p_prop->maxSurface2DLayered[0] = cdprop.maxSurface2DLayered[0]; + p_prop->maxSurface2DLayered[1] = cdprop.maxSurface2DLayered[1]; + p_prop->maxSurface2DLayered[2] = cdprop.maxSurface2DLayered[2]; + p_prop->maxSurfaceCubemap = cdprop.maxSurfaceCubemap; + p_prop->maxSurfaceCubemapLayered[0] = cdprop.maxSurfaceCubemapLayered[0]; + p_prop->maxSurfaceCubemapLayered[1] = cdprop.maxSurfaceCubemapLayered[1]; + p_prop->surfaceAlignment = cdprop.surfaceAlignment; + p_prop->concurrentKernels = cdprop.concurrentKernels; + p_prop->ECCEnabled = cdprop.ECCEnabled; + p_prop->pciBusID = cdprop.pciBusID; + p_prop->pciDeviceID = cdprop.pciDeviceID; + p_prop->pciDomainID = cdprop.pciDomainID; + p_prop->tccDriver = cdprop.tccDriver; + p_prop->asyncEngineCount = cdprop.asyncEngineCount; + p_prop->unifiedAddressing = cdprop.unifiedAddressing; + p_prop->memoryClockRate = cdprop.memoryClockRate; + p_prop->memoryBusWidth = cdprop.memoryBusWidth; + p_prop->l2CacheSize = cdprop.l2CacheSize; + p_prop->maxThreadsPerMultiProcessor = cdprop.maxThreadsPerMultiProcessor; + p_prop->streamPrioritiesSupported = cdprop.streamPrioritiesSupported; + p_prop->globalL1CacheSupported = cdprop.globalL1CacheSupported; + p_prop->localL1CacheSupported = cdprop.localL1CacheSupported; + p_prop->sharedMemPerMultiprocessor = cdprop.sharedMemPerMultiprocessor; + p_prop->regsPerMultiprocessor = cdprop.regsPerMultiprocessor; + p_prop->managedMemory = cdprop.managedMemory; + p_prop->isMultiGpuBoard = cdprop.isMultiGpuBoard; + p_prop->multiGpuBoardGroupID = cdprop.multiGpuBoardGroupID; + p_prop->hostNativeAtomicSupported = cdprop.hostNativeAtomicSupported; + p_prop->singleToDoublePrecisionPerfRatio = cdprop.singleToDoublePrecisionPerfRatio; + p_prop->pageableMemoryAccess = cdprop.pageableMemoryAccess; + p_prop->concurrentManagedAccess = cdprop.concurrentManagedAccess; + p_prop->computePreemptionSupported = cdprop.computePreemptionSupported; + p_prop->canUseHostPointerForRegisteredMem = cdprop.canUseHostPointerForRegisteredMem; + p_prop->cooperativeLaunch = cdprop.cooperativeLaunch; + p_prop->cooperativeMultiDeviceLaunch = cdprop.cooperativeMultiDeviceLaunch; + p_prop->sharedMemPerBlockOptin = cdprop.sharedMemPerBlockOptin; + p_prop->pageableMemoryAccessUsesHostPageTables = cdprop.pageableMemoryAccessUsesHostPageTables; + p_prop->directManagedMemAccessFromHost = cdprop.directManagedMemAccessFromHost; - p_prop->memPitch = cdprop.memPitch; - p_prop->textureAlignment = cdprop.textureAlignment; - p_prop->texturePitchAlignment = cdprop.texturePitchAlignment; - p_prop->kernelExecTimeoutEnabled = cdprop.kernelExecTimeoutEnabled; - p_prop->ECCEnabled = cdprop.ECCEnabled; - p_prop->tccDriver = cdprop.tccDriver; - return hipCUDAErrorTohipError(cerror); +#if CUDA_VERSION >= 11010 + p_prop->accessPolicyMaxWindowSize = cdprop.accessPolicyMaxWindowSize; + p_prop->maxBlocksPerMultiProcessor = cdprop.maxBlocksPerMultiProcessor; + p_prop->persistingL2CacheMaxSize = cdprop.persistingL2CacheMaxSize; + p_prop->reservedSharedMemPerBlock = cdprop.reservedSharedMemPerBlock; + p_prop->warpSize = cdprop.warpSize; +#endif + +#if CUDA_VERSION >= 12000 + p_prop->clusterLaunch = cdprop.clusterLaunch; + p_prop->deferredMappingHipArraySupported = cdprop.deferredMappingCudaArraySupported; + p_prop->gpuDirectRDMAFlushWritesOptions = cdprop.gpuDirectRDMAFlushWritesOptions; + p_prop->gpuDirectRDMASupported = cdprop.gpuDirectRDMASupported; + p_prop->gpuDirectRDMAWritesOrdering = cdprop.gpuDirectRDMAWritesOrdering; + p_prop->hostRegisterReadOnlySupported = cdprop.hostRegisterReadOnlySupported; + p_prop->hostRegisterSupported = cdprop.hostRegisterSupported; + p_prop->ipcEventSupported = cdprop.ipcEventSupported; + p_prop->memoryPoolSupportedHandleTypes = cdprop.memoryPoolSupportedHandleTypes; + p_prop->memoryPoolsSupported = cdprop.memoryPoolsSupported; + p_prop->sparseHipArraySupported = cdprop.sparseCudaArraySupported; + p_prop->timelineSemaphoreInteropSupported = cdprop.timelineSemaphoreInteropSupported; + p_prop->unifiedFunctionPointers = cdprop.unifiedFunctionPointers; +#endif + + return error; } inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) { @@ -2188,6 +2244,9 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att case hipDeviceAttributeCooperativeMultiDeviceLaunch: cdattr = cudaDevAttrCooperativeMultiDeviceLaunch; break; + case hipDeviceAttributeHostRegisterSupported: + cdattr = cudaDevAttrHostRegisterSupported; + break; case hipDeviceAttributeConcurrentManagedAccess: cdattr = cudaDevAttrConcurrentManagedAccess; break; diff --git a/hipamd/src/amdhip.def b/hipamd/src/amdhip.def index 7bac33593d..f3f1a64599 100644 --- a/hipamd/src/amdhip.def +++ b/hipamd/src/amdhip.def @@ -63,6 +63,7 @@ hipFuncSetSharedMemConfig hipGetDevice hipGetDeviceCount hipGetDeviceProperties +hipGetDevicePropertiesR0600 hipGetErrorName hipGetErrorString hipGetLastError diff --git a/hipamd/src/hip_device.cpp b/hipamd/src/hip_device.cpp index 268e2ca815..1bf374dd52 100644 --- a/hipamd/src/hip_device.cpp +++ b/hipamd/src/hip_device.cpp @@ -23,6 +23,9 @@ #include "hip_internal.hpp" #include "hip_mempool_impl.hpp" +#undef hipGetDeviceProperties +#undef hipDeviceProp_t + namespace hip { // ================================================================================================ @@ -139,12 +142,12 @@ Device::~Device() { graph_mem_pool_->release(); } - if (null_stream_!= nullptr) { + if (null_stream_ != nullptr) { hip::Stream::Destroy(null_stream_); } } -} +} // namespace hip void ihipDestroyDevice() { for (auto deviceHandle : g_devices) { @@ -171,8 +174,7 @@ hipError_t hipDeviceGet(hipDevice_t* device, int deviceId) { HIP_RETURN(ihipDeviceGet(device, deviceId)); } -hipError_t hipDeviceTotalMem (size_t *bytes, hipDevice_t device) { - +hipError_t hipDeviceTotalMem(size_t* bytes, hipDevice_t device) { HIP_INIT_API(hipDeviceTotalMem, bytes, device); if (device < 0 || static_cast(device) >= g_devices.size()) { @@ -191,8 +193,7 @@ hipError_t hipDeviceTotalMem (size_t *bytes, hipDevice_t device) { HIP_RETURN(hipSuccess); } -hipError_t hipDeviceComputeCapability(int *major, int *minor, hipDevice_t device) { - +hipError_t hipDeviceComputeCapability(int* major, int* minor, hipDevice_t device) { HIP_INIT_API(hipDeviceComputeCapability, major, minor, device); if (device < 0 || static_cast(device) >= g_devices.size()) { @@ -232,8 +233,7 @@ hipError_t ihipDeviceGetCount(int* count) { return hipSuccess; } -hipError_t hipDeviceGetName(char *name, int len, hipDevice_t device) { - +hipError_t hipDeviceGetName(char* name, int len, hipDevice_t device) { HIP_INIT_API(hipDeviceGetName, (void*)name, len, device); if (device < 0 || static_cast(device) >= g_devices.size()) { @@ -275,7 +275,7 @@ hipError_t hipDeviceGetUuid(hipUUID* uuid, hipDevice_t device) { HIP_RETURN(hipSuccess); } -hipError_t ihipGetDeviceProperties(hipDeviceProp_t* props, hipDevice_t device) { +hipError_t ihipGetDeviceProperties(hipDeviceProp_tR0600* props, hipDevice_t device) { if (props == nullptr) { return hipErrorInvalidValue; } @@ -286,7 +286,278 @@ hipError_t ihipGetDeviceProperties(hipDeviceProp_t* props, hipDevice_t device) { auto* deviceHandle = g_devices[device]->devices()[0]; constexpr auto int32_max = static_cast(std::numeric_limits::max()); - constexpr auto uint16_max = static_cast(std::numeric_limits::max())+1; + constexpr auto uint16_max = static_cast(std::numeric_limits::max()) + 1; + hipDeviceProp_tR0600 deviceProps = {0}; + + const auto& info = deviceHandle->info(); + const auto& isa = deviceHandle->isa(); + ::strncpy(deviceProps.name, info.boardName_, sizeof(info.boardName_)); + ::strncpy(deviceProps.uuid.bytes, info.uuid_, sizeof(info.uuid_)); + deviceProps.totalGlobalMem = info.globalMemSize_; + deviceProps.sharedMemPerBlock = info.localMemSizePerCU_; + deviceProps.sharedMemPerMultiprocessor = info.localMemSizePerCU_ * info.numRTCUs_; + deviceProps.regsPerBlock = info.availableRegistersPerCU_; + deviceProps.warpSize = info.wavefrontWidth_; + deviceProps.maxThreadsPerBlock = info.maxWorkGroupSize_; + deviceProps.maxThreadsDim[0] = info.maxWorkItemSizes_[0]; + deviceProps.maxThreadsDim[1] = info.maxWorkItemSizes_[1]; + deviceProps.maxThreadsDim[2] = info.maxWorkItemSizes_[2]; + deviceProps.maxGridSize[0] = int32_max; + deviceProps.maxGridSize[1] = uint16_max; + deviceProps.maxGridSize[2] = uint16_max; + deviceProps.clockRate = info.maxEngineClockFrequency_ * 1000; + deviceProps.memoryClockRate = info.maxMemoryClockFrequency_ * 1000; + deviceProps.memoryBusWidth = info.globalMemChannels_; + deviceProps.totalConstMem = std::min(info.maxConstantBufferSize_, int32_max); + deviceProps.major = isa.versionMajor(); + deviceProps.minor = isa.versionMinor(); + deviceProps.multiProcessorCount = info.maxComputeUnits_; + deviceProps.l2CacheSize = info.l2CacheSize_; + deviceProps.maxThreadsPerMultiProcessor = info.maxThreadsPerCU_; + deviceProps.maxBlocksPerMultiProcessor = int(info.maxThreadsPerCU_ / info.maxWorkGroupSize_); + deviceProps.computeMode = 0; + deviceProps.clockInstructionRate = info.timeStampFrequency_; + deviceProps.arch.hasGlobalInt32Atomics = 1; + deviceProps.arch.hasGlobalFloatAtomicExch = 1; + deviceProps.arch.hasSharedInt32Atomics = 1; + deviceProps.arch.hasSharedFloatAtomicExch = 1; + deviceProps.arch.hasFloatAtomicAdd = 1; + deviceProps.arch.hasGlobalInt64Atomics = 1; + deviceProps.arch.hasSharedInt64Atomics = 1; + deviceProps.hostNativeAtomicSupported = info.pcie_atomics_ ? 1 : 0; + deviceProps.arch.hasDoubles = 1; + deviceProps.arch.hasWarpVote = 1; + deviceProps.arch.hasWarpBallot = 1; + deviceProps.arch.hasWarpShuffle = 1; + deviceProps.arch.hasFunnelShift = 0; + deviceProps.arch.hasThreadFenceSystem = 1; + deviceProps.arch.hasSyncThreadsExt = 0; + deviceProps.arch.hasSurfaceFuncs = 0; + deviceProps.arch.has3dGrid = 1; + deviceProps.arch.hasDynamicParallelism = 0; + deviceProps.concurrentKernels = 1; + deviceProps.pciDomainID = info.pciDomainID; + deviceProps.pciBusID = info.deviceTopology_.pcie.bus; + deviceProps.pciDeviceID = info.deviceTopology_.pcie.device; + deviceProps.maxSharedMemoryPerMultiProcessor = info.localMemSizePerCU_; + deviceProps.canMapHostMemory = 1; + deviceProps.regsPerMultiprocessor = info.availableRegistersPerCU_; + sprintf(deviceProps.gcnArchName, "%s", isa.targetId()); + deviceProps.cooperativeLaunch = info.cooperativeGroups_; + deviceProps.cooperativeMultiDeviceLaunch = info.cooperativeMultiDeviceGroups_; + + deviceProps.cooperativeMultiDeviceUnmatchedFunc = info.cooperativeMultiDeviceGroups_; + deviceProps.cooperativeMultiDeviceUnmatchedGridDim = info.cooperativeMultiDeviceGroups_; + deviceProps.cooperativeMultiDeviceUnmatchedBlockDim = info.cooperativeMultiDeviceGroups_; + deviceProps.cooperativeMultiDeviceUnmatchedSharedMem = info.cooperativeMultiDeviceGroups_; + + deviceProps.maxTexture1DLinear = + std::min(16 * info.imageMaxBufferSize_, int32_max); // Max pixel size is 16 bytes + deviceProps.maxTexture1DMipmap = std::min(16 * info.imageMaxBufferSize_, int32_max); + deviceProps.maxTexture1D = deviceProps.maxSurface1D = std::min(info.image1DMaxWidth_, int32_max); + deviceProps.maxTexture2D[0] = deviceProps.maxSurface2D[0] = + std::min(info.image2DMaxWidth_, int32_max); + deviceProps.maxTexture2D[1] = deviceProps.maxSurface2D[1] = + std::min(info.image2DMaxHeight_, int32_max); + deviceProps.maxTexture3D[0] = deviceProps.maxSurface3D[0] = + std::min(info.image3DMaxWidth_, int32_max); + deviceProps.maxTexture3D[1] = deviceProps.maxSurface3D[1] = + std::min(info.image3DMaxHeight_, int32_max); + deviceProps.maxTexture3D[2] = deviceProps.maxSurface3D[2] = + std::min(info.image3DMaxDepth_, int32_max); + deviceProps.maxTexture1DLayered[0] = deviceProps.maxSurface1DLayered[0] = + std::min(info.image1DAMaxWidth_, int32_max); + deviceProps.maxTexture1DLayered[1] = deviceProps.maxSurface1DLayered[1] = + std::min(info.imageMaxArraySize_, int32_max); + deviceProps.maxTexture2DLayered[0] = deviceProps.maxSurface2DLayered[0] = + std::min(info.image2DAMaxWidth_[0], int32_max); + deviceProps.maxTexture2DLayered[1] = deviceProps.maxSurface2DLayered[1] = + std::min(info.image2DAMaxWidth_[1], int32_max); + deviceProps.maxTexture2DLayered[2] = deviceProps.maxSurface2DLayered[2] = + std::min(info.imageMaxArraySize_, int32_max); + deviceProps.hdpMemFlushCntl = info.hdpMemFlushCntl; + deviceProps.hdpRegFlushCntl = info.hdpRegFlushCntl; + + deviceProps.memPitch = std::min(info.maxMemAllocSize_, int32_max); + deviceProps.textureAlignment = deviceProps.surfaceAlignment = info.imageBaseAddressAlignment_; + deviceProps.texturePitchAlignment = info.imagePitchAlignment_; + deviceProps.kernelExecTimeoutEnabled = 0; + deviceProps.ECCEnabled = info.errorCorrectionSupport_ ? 1 : 0; + deviceProps.isLargeBar = info.largeBar_ ? 1 : 0; + deviceProps.asicRevision = info.asicRevision_; + deviceProps.ipcEventSupported = 1; + deviceProps.streamPrioritiesSupported = 1; + deviceProps.multiGpuBoardGroupID = info.deviceTopology_.pcie.device; + + // HMM capabilities + deviceProps.asyncEngineCount = info.numAsyncQueues_; + deviceProps.deviceOverlap = (info.numAsyncQueues_ > 0) ? 1 : 0; + deviceProps.unifiedAddressing = info.hmmDirectHostAccess_; + deviceProps.managedMemory = info.hmmSupported_; + deviceProps.concurrentManagedAccess = info.hmmSupported_; + deviceProps.directManagedMemAccessFromHost = info.hmmDirectHostAccess_; + deviceProps.canUseHostPointerForRegisteredMem = info.hostUnifiedMemory_; + deviceProps.pageableMemoryAccess = info.hmmCpuMemoryAccessible_; + deviceProps.hostRegisterSupported = info.hostUnifiedMemory_; + deviceProps.pageableMemoryAccessUsesHostPageTables = info.hostUnifiedMemory_; + + // Mem pool + deviceProps.memoryPoolsSupported = HIP_MEM_POOL_SUPPORT; + deviceProps.memoryPoolSupportedHandleTypes = 0; + + // Caching behavior + deviceProps.globalL1CacheSupported = 1; + deviceProps.localL1CacheSupported = 1; + deviceProps.persistingL2CacheMaxSize = info.l2CacheSize_; + deviceProps.reservedSharedMemPerBlock = 0; + deviceProps.sharedMemPerBlockOptin = 0; + + // Unsupported features + // Single to double precision perf ratio + deviceProps.singleToDoublePrecisionPerfRatio = 0; + // Flag hipHostRegisterReadOnly + deviceProps.hostRegisterReadOnlySupported = 0; + // Compute preemption + deviceProps.computePreemptionSupported = 0; + // Cubemaps + deviceProps.maxTextureCubemap = 0; + deviceProps.maxTextureCubemapLayered[0] = 0; + deviceProps.maxTextureCubemapLayered[1] = 0; + deviceProps.maxSurfaceCubemap = 0; + deviceProps.maxSurfaceCubemapLayered[0] = 0; + deviceProps.maxSurfaceCubemapLayered[1] = 0; + // Texture gather ops + deviceProps.maxTexture2DGather[0] = 0; + deviceProps.maxTexture2DGather[1] = 0; + // Textures bound to pitch memory + deviceProps.maxTexture2DLinear[0] = 0; + deviceProps.maxTexture2DLinear[1] = 0; + deviceProps.maxTexture2DLinear[2] = 0; + // Alternate 3D texture + deviceProps.maxTexture3DAlt[0] = 0; + deviceProps.maxTexture3DAlt[1] = 0; + deviceProps.maxTexture3DAlt[2] = 0; + // access policy + deviceProps.accessPolicyMaxWindowSize = 0; + // cluster launch + deviceProps.clusterLaunch = 0; + // Mapping HIP array + deviceProps.deferredMappingHipArraySupported = 0; + // RDMA options + deviceProps.gpuDirectRDMASupported = 0; + deviceProps.gpuDirectRDMAFlushWritesOptions = 0; + deviceProps.gpuDirectRDMAWritesOrdering = 0; + // luid - TODO populate it only on windows + std::memset(deviceProps.luid, 0, sizeof(deviceProps.luid)); + deviceProps.luidDeviceNodeMask = 0; + deviceProps.sparseHipArraySupported = 0; + deviceProps.timelineSemaphoreInteropSupported = 0; + deviceProps.unifiedFunctionPointers = 0; + + *props = deviceProps; + return hipSuccess; +} + +hipError_t hipGetDevicePropertiesR0600(hipDeviceProp_tR0600* props, hipDevice_t device) { + HIP_INIT_API(hipGetDevicePropertiesR0600, props, device); + + HIP_RETURN(ihipGetDeviceProperties(props, device)); +} + +extern "C" typedef struct hipDeviceProp_t { + char name[256]; ///< Device name. + size_t totalGlobalMem; ///< Size of global memory region (in bytes). + size_t sharedMemPerBlock; ///< Size of shared memory region (in bytes). + int regsPerBlock; ///< Registers per block. + int warpSize; ///< Warp size. + int maxThreadsPerBlock; ///< Max work items per work group or workgroup max size. + int maxThreadsDim[3]; ///< Max number of threads in each dimension (XYZ) of a block. + int maxGridSize[3]; ///< Max grid dimensions (XYZ). + int clockRate; ///< Max clock frequency of the multiProcessors in khz. + int memoryClockRate; ///< Max global memory clock frequency in khz. + int memoryBusWidth; ///< Global memory bus width in bits. + size_t totalConstMem; ///< Size of shared memory region (in bytes). + int major; ///< Major compute capability. On HCC, this is an approximation and features may + ///< differ from CUDA CC. See the arch feature flags for portable ways to query + ///< feature caps. + int minor; ///< Minor compute capability. On HCC, this is an approximation and features may + ///< differ from CUDA CC. See the arch feature flags for portable ways to query + ///< feature caps. + int multiProcessorCount; ///< Number of multi-processors (compute units). + int l2CacheSize; ///< L2 cache size. + int maxThreadsPerMultiProcessor; ///< Maximum resident threads per multi-processor. + int computeMode; ///< Compute mode. + int clockInstructionRate; ///< Frequency in khz of the timer used by the device-side "clock*" + ///< instructions. New for HIP. + hipDeviceArch_t arch; ///< Architectural feature flags. New for HIP. + int concurrentKernels; ///< Device can possibly execute multiple kernels concurrently. + int pciDomainID; ///< PCI Domain ID + int pciBusID; ///< PCI Bus ID. + int pciDeviceID; ///< PCI Device ID. + size_t maxSharedMemoryPerMultiProcessor; ///< Maximum Shared Memory Per Multiprocessor. + int isMultiGpuBoard; ///< 1 if device is on a multi-GPU board, 0 if not. + int canMapHostMemory; ///< Check whether HIP can map host memory + int gcnArch; ///< DEPRECATED: use gcnArchName instead + char gcnArchName[256]; ///< AMD GCN Arch Name. + int integrated; ///< APU vs dGPU + int cooperativeLaunch; ///< HIP device supports cooperative launch + int cooperativeMultiDeviceLaunch; ///< HIP device supports cooperative launch on multiple + ///< devices + int maxTexture1DLinear; ///< Maximum size for 1D textures bound to linear memory + int maxTexture1D; ///< Maximum number of elements in 1D images + int maxTexture2D[2]; ///< Maximum dimensions (width, height) of 2D images, in image elements + int maxTexture3D[3]; ///< Maximum dimensions (width, height, depth) of 3D images, in image + ///< elements + unsigned int* hdpMemFlushCntl; ///< Addres of HDP_MEM_COHERENCY_FLUSH_CNTL register + unsigned int* hdpRegFlushCntl; ///< Addres of HDP_REG_COHERENCY_FLUSH_CNTL register + size_t memPitch; ///< Maximum pitch in bytes allowed by memory copies + size_t textureAlignment; ///< Alignment requirement for textures + size_t texturePitchAlignment; ///< Pitch alignment requirement for texture references bound to + ///< pitched memory + int kernelExecTimeoutEnabled; ///< Run time limit for kernels executed on the device + int ECCEnabled; ///< Device has ECC support enabled + int tccDriver; ///< 1:If device is Tesla device using TCC driver, else 0 + int cooperativeMultiDeviceUnmatchedFunc; ///< HIP device supports cooperative launch on + ///< multiple + /// devices with unmatched functions + int cooperativeMultiDeviceUnmatchedGridDim; ///< HIP device supports cooperative launch on + ///< multiple + /// devices with unmatched grid dimensions + int cooperativeMultiDeviceUnmatchedBlockDim; ///< HIP device supports cooperative launch on + ///< multiple + /// devices with unmatched block dimensions + int cooperativeMultiDeviceUnmatchedSharedMem; ///< HIP device supports cooperative launch on + ///< multiple + /// devices with unmatched shared memories + int isLargeBar; ///< 1: if it is a large PCI bar device, else 0 + int asicRevision; ///< Revision of the GPU in this device + int managedMemory; ///< Device supports allocating managed memory on this system + int directManagedMemAccessFromHost; ///< Host can directly access managed memory on the device + ///< without migration + int concurrentManagedAccess; ///< Device can coherently access managed memory concurrently with + ///< the CPU + int pageableMemoryAccess; ///< Device supports coherently accessing pageable memory + ///< without calling hipHostRegister on it + int pageableMemoryAccessUsesHostPageTables; ///< Device accesses pageable memory via the host's + ///< page tables +} hipDeviceProp_t; + +extern "C" hipError_t hipGetDeviceProperties(hipDeviceProp_t* props, hipDevice_t device) { + // Removing this API from tracing. + // This API is now in backwards compatibility mode and is not callable from newly compiled apps. + HIP_INIT_VOID(); + + if (props == nullptr) { + return hipErrorInvalidValue; + } + + if (unsigned(device) >= g_devices.size()) { + return hipErrorInvalidDevice; + } + auto* deviceHandle = g_devices[device]->devices()[0]; + + constexpr auto int32_max = static_cast(std::numeric_limits::max()); + constexpr auto uint16_max = static_cast(std::numeric_limits::max()) + 1; hipDeviceProp_t deviceProps = {0}; const auto& info = deviceHandle->info(); @@ -348,8 +619,9 @@ hipError_t ihipGetDeviceProperties(hipDeviceProp_t* props, hipDevice_t device) { deviceProps.cooperativeMultiDeviceUnmatchedBlockDim = info.cooperativeMultiDeviceGroups_; deviceProps.cooperativeMultiDeviceUnmatchedSharedMem = info.cooperativeMultiDeviceGroups_; - deviceProps.maxTexture1DLinear = std::min(16 * info.imageMaxBufferSize_, int32_max); // Max pixel size is 16 bytes - deviceProps.maxTexture1D = std::min(info.image1DMaxWidth_, int32_max); + deviceProps.maxTexture1DLinear = + std::min(16 * info.imageMaxBufferSize_, int32_max); // Max pixel size is 16 bytes + deviceProps.maxTexture1D = std::min(info.image1DMaxWidth_, int32_max); deviceProps.maxTexture2D[0] = std::min(info.image2DMaxWidth_, int32_max); deviceProps.maxTexture2D[1] = std::min(info.image2DMaxHeight_, int32_max); deviceProps.maxTexture3D[0] = std::min(info.image3DMaxWidth_, int32_max); @@ -376,9 +648,3 @@ hipError_t ihipGetDeviceProperties(hipDeviceProp_t* props, hipDevice_t device) { *props = deviceProps; return hipSuccess; } - -hipError_t hipGetDeviceProperties(hipDeviceProp_t* props, hipDevice_t device) { - HIP_INIT_API(hipGetDeviceProperties, props, device); - - HIP_RETURN(ihipGetDeviceProperties(props, device)); -} diff --git a/hipamd/src/hip_device_runtime.cpp b/hipamd/src/hip_device_runtime.cpp index f6ce98ad45..3fbfce9d4e 100644 --- a/hipamd/src/hip_device_runtime.cpp +++ b/hipamd/src/hip_device_runtime.cpp @@ -23,7 +23,6 @@ #include "hip_internal.hpp" hipError_t hipChooseDevice(int* device, const hipDeviceProp_t* properties) { - HIP_INIT_API(hipChooseDevice, device, properties); if (device == nullptr || properties == nullptr) { @@ -35,99 +34,97 @@ hipError_t hipChooseDevice(int* device, const hipDeviceProp_t* properties) { int count = 0; HIP_RETURN_ONFAIL(ihipDeviceGetCount(&count)); - for (cl_int i = 0; i< count; ++i) { + for (cl_int i = 0; i < count; ++i) { hipDeviceProp_t currentProp = {0}; cl_uint validPropCount = 0; cl_uint matchedCount = 0; hipError_t err = ihipGetDeviceProperties(¤tProp, i); if (properties->major != 0) { validPropCount++; - if(currentProp.major >= properties->major) { + if (currentProp.major >= properties->major) { matchedCount++; } } if (properties->minor != 0) { validPropCount++; - if(currentProp.minor >= properties->minor) { + if (currentProp.minor >= properties->minor) { matchedCount++; } } - if(properties->totalGlobalMem != 0) { - validPropCount++; - if(currentProp.totalGlobalMem >= properties->totalGlobalMem) { - matchedCount++; - } + if (properties->totalGlobalMem != 0) { + validPropCount++; + if (currentProp.totalGlobalMem >= properties->totalGlobalMem) { + matchedCount++; + } } - if(properties->sharedMemPerBlock != 0) { - validPropCount++; - if(currentProp.sharedMemPerBlock >= properties->sharedMemPerBlock) { - matchedCount++; - } + if (properties->sharedMemPerBlock != 0) { + validPropCount++; + if (currentProp.sharedMemPerBlock >= properties->sharedMemPerBlock) { + matchedCount++; + } } - if(properties->maxThreadsPerBlock != 0) { - validPropCount++; - if(currentProp.maxThreadsPerBlock >= properties->maxThreadsPerBlock ) { - matchedCount++; - } + if (properties->maxThreadsPerBlock != 0) { + validPropCount++; + if (currentProp.maxThreadsPerBlock >= properties->maxThreadsPerBlock) { + matchedCount++; + } } - if(properties->totalConstMem != 0) { - validPropCount++; - if(currentProp.totalConstMem >= properties->totalConstMem ) { - matchedCount++; - } + if (properties->totalConstMem != 0) { + validPropCount++; + if (currentProp.totalConstMem >= properties->totalConstMem) { + matchedCount++; + } } - if(properties->multiProcessorCount != 0) { - validPropCount++; - if(currentProp.multiProcessorCount >= - properties->multiProcessorCount ) { - matchedCount++; - } + if (properties->multiProcessorCount != 0) { + validPropCount++; + if (currentProp.multiProcessorCount >= properties->multiProcessorCount) { + matchedCount++; + } } - if(properties->maxThreadsPerMultiProcessor != 0) { - validPropCount++; - if(currentProp.maxThreadsPerMultiProcessor >= - properties->maxThreadsPerMultiProcessor ) { - matchedCount++; - } + if (properties->maxThreadsPerMultiProcessor != 0) { + validPropCount++; + if (currentProp.maxThreadsPerMultiProcessor >= properties->maxThreadsPerMultiProcessor) { + matchedCount++; + } } - if(properties->memoryClockRate != 0) { - validPropCount++; - if(currentProp.memoryClockRate >= properties->memoryClockRate ) { - matchedCount++; - } + if (properties->memoryClockRate != 0) { + validPropCount++; + if (currentProp.memoryClockRate >= properties->memoryClockRate) { + matchedCount++; + } } - if(properties->memoryBusWidth != 0) { - validPropCount++; - if(currentProp.memoryBusWidth >= properties->memoryBusWidth ) { - matchedCount++; - } + if (properties->memoryBusWidth != 0) { + validPropCount++; + if (currentProp.memoryBusWidth >= properties->memoryBusWidth) { + matchedCount++; + } } - if(properties->l2CacheSize != 0) { - validPropCount++; - if(currentProp.l2CacheSize >= properties->l2CacheSize ) { - matchedCount++; - } + if (properties->l2CacheSize != 0) { + validPropCount++; + if (currentProp.l2CacheSize >= properties->l2CacheSize) { + matchedCount++; + } } - if(properties->regsPerBlock != 0) { - validPropCount++; - if(currentProp.regsPerBlock >= properties->regsPerBlock ) { - matchedCount++; - } + if (properties->regsPerBlock != 0) { + validPropCount++; + if (currentProp.regsPerBlock >= properties->regsPerBlock) { + matchedCount++; + } } - if(properties->maxSharedMemoryPerMultiProcessor != 0) { - validPropCount++; - if(currentProp.maxSharedMemoryPerMultiProcessor >= - properties->maxSharedMemoryPerMultiProcessor ) { - matchedCount++; - } + if (properties->maxSharedMemoryPerMultiProcessor != 0) { + validPropCount++; + if (currentProp.maxSharedMemoryPerMultiProcessor >= + properties->maxSharedMemoryPerMultiProcessor) { + matchedCount++; + } } - if(properties->warpSize != 0) { - validPropCount++; - if(currentProp.warpSize >= properties->warpSize ) { - matchedCount++; - } + if (properties->warpSize != 0) { + validPropCount++; + if (currentProp.warpSize >= properties->warpSize) { + matchedCount++; + } } - if(validPropCount == matchedCount) { + if (validPropCount == matchedCount) { *device = matchedCount > maxMatchedCount ? i : *device; maxMatchedCount = std::max(matchedCount, maxMatchedCount); } @@ -137,7 +134,6 @@ hipError_t hipChooseDevice(int* device, const hipDeviceProp_t* properties) { } hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) { - HIP_INIT_API(hipDeviceGetAttribute, pi, attr, device); if (pi == nullptr) { @@ -151,214 +147,282 @@ hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) HIP_RETURN(hipErrorInvalidDevice); } - //FIXME: should we cache the props, or just select from deviceHandle->info_? + // FIXME: should we cache the props, or just select from deviceHandle->info_? hipDeviceProp_t prop = {0}; HIP_RETURN_ONFAIL(ihipGetDeviceProperties(&prop, device)); constexpr auto int32_max = static_cast(std::numeric_limits::max()); switch (attr) { - case hipDeviceAttributeMaxThreadsPerBlock: - *pi = prop.maxThreadsPerBlock; - break; - case hipDeviceAttributeMaxBlockDimX: - *pi = prop.maxThreadsDim[0]; - break; - case hipDeviceAttributeMaxBlockDimY: - *pi = prop.maxThreadsDim[1]; - break; - case hipDeviceAttributeMaxBlockDimZ: - *pi = prop.maxThreadsDim[2]; - break; - case hipDeviceAttributeMaxGridDimX: - *pi = prop.maxGridSize[0]; - break; - case hipDeviceAttributeMaxGridDimY: - *pi = prop.maxGridSize[1]; - break; - case hipDeviceAttributeMaxGridDimZ: - *pi = prop.maxGridSize[2]; - break; - case hipDeviceAttributeMaxSharedMemoryPerBlock: - *pi = prop.sharedMemPerBlock; - break; - case hipDeviceAttributeTotalConstantMemory: - // size_t to int casting - *pi = std::min(prop.totalConstMem, int32_max); - break; - case hipDeviceAttributeWarpSize: - *pi = prop.warpSize; - break; - case hipDeviceAttributeMaxRegistersPerBlock: - *pi = prop.regsPerBlock; - break; - case hipDeviceAttributeClockRate: - *pi = prop.clockRate; - break; - case hipDeviceAttributeWallClockRate: - *pi = g_devices[device]->devices()[0]->info().wallClockFrequency_; - break; - case hipDeviceAttributeMemoryClockRate: - *pi = prop.memoryClockRate; - break; - case hipDeviceAttributeMemoryBusWidth: - *pi = prop.memoryBusWidth; - break; - case hipDeviceAttributeMultiprocessorCount: - *pi = prop.multiProcessorCount; - break; - case hipDeviceAttributeComputeMode: - *pi = prop.computeMode; - break; - case hipDeviceAttributeL2CacheSize: - *pi = prop.l2CacheSize; - break; - case hipDeviceAttributeMaxThreadsPerMultiProcessor: - *pi = prop.maxThreadsPerMultiProcessor; - break; - case hipDeviceAttributeComputeCapabilityMajor: - *pi = prop.major; - break; - case hipDeviceAttributeComputeCapabilityMinor: - *pi = prop.minor; - break; - case hipDeviceAttributePciBusId: - *pi = prop.pciBusID; - break; - case hipDeviceAttributeConcurrentKernels: - *pi = prop.concurrentKernels; - break; - case hipDeviceAttributePciDeviceId: - *pi = prop.pciDeviceID; - break; - case hipDeviceAttributeMaxSharedMemoryPerMultiprocessor: - *pi = prop.maxSharedMemoryPerMultiProcessor; - break; - case hipDeviceAttributeIsMultiGpuBoard: - *pi = prop.isMultiGpuBoard; - break; - case hipDeviceAttributeCooperativeLaunch: - *pi = prop.cooperativeLaunch; - break; - case hipDeviceAttributeCooperativeMultiDeviceLaunch: - *pi = prop.cooperativeMultiDeviceLaunch; - break; - case hipDeviceAttributeIntegrated: - *pi = prop.integrated; - break; - case hipDeviceAttributeMaxTexture1DWidth: - *pi = prop.maxTexture1D; - break; - case hipDeviceAttributeMaxTexture2DWidth: - *pi = prop.maxTexture2D[0]; - break; - case hipDeviceAttributeMaxTexture2DHeight: - *pi = prop.maxTexture2D[1]; - break; - case hipDeviceAttributeMaxTexture3DWidth: - *pi = prop.maxTexture3D[0]; - break; - case hipDeviceAttributeMaxTexture3DHeight: - *pi = prop.maxTexture3D[1]; - break; - case hipDeviceAttributeMaxTexture3DDepth: - *pi = prop.maxTexture3D[2]; - break; - case hipDeviceAttributeHdpMemFlushCntl: - *reinterpret_cast(pi) = prop.hdpMemFlushCntl; - break; - case hipDeviceAttributeHdpRegFlushCntl: - *reinterpret_cast(pi) = prop.hdpRegFlushCntl; - break; - case hipDeviceAttributeMaxPitch: - // size_t to int casting - *pi = std::min(prop.memPitch, int32_max); - break; - case hipDeviceAttributeTextureAlignment: - *pi = prop.textureAlignment; - break; - case hipDeviceAttributeTexturePitchAlignment: - *pi = prop.texturePitchAlignment; - break; - case hipDeviceAttributeKernelExecTimeout: - *pi = prop.kernelExecTimeoutEnabled; - break; - case hipDeviceAttributeCanMapHostMemory: - *pi = prop.canMapHostMemory; - break; - case hipDeviceAttributeEccEnabled: - *pi = prop.ECCEnabled; - break; - case hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc: - *pi = prop.cooperativeMultiDeviceUnmatchedFunc; - break; - case hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim: - *pi = prop.cooperativeMultiDeviceUnmatchedGridDim; - break; - case hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim: - *pi = prop.cooperativeMultiDeviceUnmatchedBlockDim; - break; - case hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem: - *pi = prop.cooperativeMultiDeviceUnmatchedSharedMem; - break; - case hipDeviceAttributeAsicRevision: - *pi = prop.asicRevision; - break; - case hipDeviceAttributeManagedMemory: - *pi = prop.managedMemory; - break; - case hipDeviceAttributeDirectManagedMemAccessFromHost: - *pi = prop.directManagedMemAccessFromHost; - break; - case hipDeviceAttributeConcurrentManagedAccess: - *pi = prop.concurrentManagedAccess; - break; - case hipDeviceAttributePageableMemoryAccess: - *pi = prop.pageableMemoryAccess; - break; - case hipDeviceAttributePageableMemoryAccessUsesHostPageTables: - *pi = prop.pageableMemoryAccessUsesHostPageTables; - break; - case hipDeviceAttributeIsLargeBar: - *pi = prop.isLargeBar; - break; - case hipDeviceAttributeUnifiedAddressing: - // HIP runtime always uses SVM for host memory allocations. - // Note: Host registered memory isn't covered by this feature - // and still requires hipMemHostGetDevicePointer() call - *pi = true; - break; - case hipDeviceAttributeCanUseStreamWaitValue: - // hipStreamWaitValue64() and hipStreamWaitValue32() support - *pi = g_devices[device]->devices()[0]->info().aqlBarrierValue_; - break; - case hipDeviceAttributeImageSupport: - *pi = static_cast(g_devices[device]->devices()[0]->info().imageSupport_); - break; - case hipDeviceAttributePhysicalMultiProcessorCount: - *pi = g_devices[device]->devices()[0]->info().maxPhysicalComputeUnits_; - break; - case hipDeviceAttributeFineGrainSupport: - *pi = static_cast(g_devices[device]->devices()[0]->isFineGrainSupported()); - break; - case hipDeviceAttributeMemoryPoolsSupported: - *pi = HIP_MEM_POOL_SUPPORT; - break; - case hipDeviceAttributeVirtualMemoryManagementSupported: - *pi = static_cast(g_devices[device]->devices()[0]->info().virtualMemoryManagement_); - break; - case hipDeviceAttributeHostRegisterSupported: - *pi = true; - break; - default: - HIP_RETURN(hipErrorInvalidValue); + case hipDeviceAttributeMaxThreadsPerBlock: + *pi = prop.maxThreadsPerBlock; + break; + case hipDeviceAttributeAsyncEngineCount: + *pi = prop.asyncEngineCount; + break; + case hipDeviceAttributeMaxBlockDimX: + *pi = prop.maxThreadsDim[0]; + break; + case hipDeviceAttributeMaxBlockDimY: + *pi = prop.maxThreadsDim[1]; + break; + case hipDeviceAttributeMaxBlockDimZ: + *pi = prop.maxThreadsDim[2]; + break; + case hipDeviceAttributeMaxGridDimX: + *pi = prop.maxGridSize[0]; + break; + case hipDeviceAttributeMaxGridDimY: + *pi = prop.maxGridSize[1]; + break; + case hipDeviceAttributeMaxGridDimZ: + *pi = prop.maxGridSize[2]; + break; + case hipDeviceAttributeMaxSurface1D: + *pi = prop.maxSurface1D; + break; + case hipDeviceAttributeMaxSharedMemoryPerBlock: + *pi = prop.sharedMemPerBlock; + break; + case hipDeviceAttributeSharedMemPerBlockOptin: + *pi = prop.sharedMemPerBlockOptin; + break; + case hipDeviceAttributeSharedMemPerMultiprocessor: + *pi = prop.sharedMemPerMultiprocessor; + break; + case hipDeviceAttributeStreamPrioritiesSupported: + *pi = prop.streamPrioritiesSupported; + break; + case hipDeviceAttributeSurfaceAlignment: + *pi = prop.surfaceAlignment; + break; + case hipDeviceAttributeTotalConstantMemory: + // size_t to int casting + *pi = std::min(prop.totalConstMem, int32_max); + break; + case hipDeviceAttributeTotalGlobalMem: + *pi = std::min(prop.totalGlobalMem, int32_max); + break; + case hipDeviceAttributeWarpSize: + *pi = prop.warpSize; + break; + case hipDeviceAttributeMaxRegistersPerBlock: + *pi = prop.regsPerBlock; + break; + case hipDeviceAttributeClockRate: + *pi = prop.clockRate; + break; + case hipDeviceAttributeWallClockRate: + *pi = g_devices[device]->devices()[0]->info().wallClockFrequency_; + break; + case hipDeviceAttributeMemoryClockRate: + *pi = prop.memoryClockRate; + break; + case hipDeviceAttributeMemoryBusWidth: + *pi = prop.memoryBusWidth; + break; + case hipDeviceAttributeMultiprocessorCount: + *pi = prop.multiProcessorCount; + break; + case hipDeviceAttributeComputeMode: + *pi = prop.computeMode; + break; + case hipDeviceAttributeComputePreemptionSupported: + *pi = prop.computePreemptionSupported; + break; + case hipDeviceAttributeL2CacheSize: + *pi = prop.l2CacheSize; + break; + case hipDeviceAttributeLocalL1CacheSupported: + *pi = prop.localL1CacheSupported; + break; + case hipDeviceAttributeLuidDeviceNodeMask: + *pi = prop.luidDeviceNodeMask; + break; + case hipDeviceAttributeMaxThreadsPerMultiProcessor: + *pi = prop.maxThreadsPerMultiProcessor; + break; + case hipDeviceAttributeComputeCapabilityMajor: + *pi = prop.major; + break; + case hipDeviceAttributeComputeCapabilityMinor: + *pi = prop.minor; + break; + case hipDeviceAttributeMultiGpuBoardGroupID: + *pi = prop.multiGpuBoardGroupID; + break; + case hipDeviceAttributePciBusId: + *pi = prop.pciBusID; + break; + case hipDeviceAttributeConcurrentKernels: + *pi = prop.concurrentKernels; + break; + case hipDeviceAttributePciDeviceId: + *pi = prop.pciDeviceID; + break; + case hipDeviceAttributePciDomainID: + *pi = prop.pciDomainID; + break; + case hipDeviceAttributePersistingL2CacheMaxSize: + *pi = prop.persistingL2CacheMaxSize; + break; + case hipDeviceAttributeMaxRegistersPerMultiprocessor: + *pi = prop.regsPerMultiprocessor; + break; + case hipDeviceAttributeReservedSharedMemPerBlock: + *pi = prop.reservedSharedMemPerBlock; + break; + case hipDeviceAttributeMaxSharedMemoryPerMultiprocessor: + *pi = prop.maxSharedMemoryPerMultiProcessor; + break; + case hipDeviceAttributeIsMultiGpuBoard: + *pi = prop.isMultiGpuBoard; + break; + case hipDeviceAttributeCooperativeLaunch: + *pi = prop.cooperativeLaunch; + break; + case hipDeviceAttributeHostRegisterSupported: + *pi = 1; // AMD GPUs allow you to register host memory regardless of the GPU + break; + case hipDeviceAttributeDeviceOverlap: + *pi = prop.asyncEngineCount > 0 ? 1 : 0; + break; + case hipDeviceAttributeCooperativeMultiDeviceLaunch: + *pi = prop.cooperativeMultiDeviceLaunch; + break; + case hipDeviceAttributeIntegrated: + *pi = prop.integrated; + break; + case hipDeviceAttributeMaxTexture1DWidth: + *pi = prop.maxTexture1D; + break; + case hipDeviceAttributeMaxTexture1DLinear: + *pi = prop.maxTexture1DLinear; + break; + case hipDeviceAttributeMaxTexture1DMipmap: + *pi = prop.maxTexture1DMipmap; + break; + case hipDeviceAttributeMaxTextureCubemap: + *pi = prop.maxTextureCubemap; + break; + case hipDeviceAttributeMaxTexture2DWidth: + *pi = prop.maxTexture2D[0]; + break; + case hipDeviceAttributeMaxTexture2DHeight: + *pi = prop.maxTexture2D[1]; + break; + case hipDeviceAttributeMaxTexture3DWidth: + *pi = prop.maxTexture3D[0]; + break; + case hipDeviceAttributeMaxTexture3DHeight: + *pi = prop.maxTexture3D[1]; + break; + case hipDeviceAttributeMaxTexture3DDepth: + *pi = prop.maxTexture3D[2]; + break; + case hipDeviceAttributeHdpMemFlushCntl: + *reinterpret_cast(pi) = prop.hdpMemFlushCntl; + break; + case hipDeviceAttributeHdpRegFlushCntl: + *reinterpret_cast(pi) = prop.hdpRegFlushCntl; + break; + case hipDeviceAttributeMaxPitch: + // size_t to int casting + *pi = std::min(prop.memPitch, int32_max); + break; + case hipDeviceAttributeTextureAlignment: + *pi = prop.textureAlignment; + break; + case hipDeviceAttributeTexturePitchAlignment: + *pi = prop.texturePitchAlignment; + break; + case hipDeviceAttributeKernelExecTimeout: + *pi = prop.kernelExecTimeoutEnabled; + break; + case hipDeviceAttributeCanMapHostMemory: + *pi = prop.canMapHostMemory; + break; + case hipDeviceAttributeCanUseHostPointerForRegisteredMem: + *pi = prop.canUseHostPointerForRegisteredMem; + break; + case hipDeviceAttributeEccEnabled: + *pi = prop.ECCEnabled; + break; + case hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc: + *pi = prop.cooperativeMultiDeviceUnmatchedFunc; + break; + case hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim: + *pi = prop.cooperativeMultiDeviceUnmatchedGridDim; + break; + case hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim: + *pi = prop.cooperativeMultiDeviceUnmatchedBlockDim; + break; + case hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem: + *pi = prop.cooperativeMultiDeviceUnmatchedSharedMem; + break; + case hipDeviceAttributeAsicRevision: + *pi = prop.asicRevision; + break; + case hipDeviceAttributeManagedMemory: + *pi = prop.managedMemory; + break; + case hipDeviceAttributeMaxBlocksPerMultiProcessor: + *pi = prop.maxBlocksPerMultiProcessor; + break; + case hipDeviceAttributeDirectManagedMemAccessFromHost: + *pi = prop.directManagedMemAccessFromHost; + break; + case hipDeviceAttributeGlobalL1CacheSupported: + *pi = prop.globalL1CacheSupported; + break; + case hipDeviceAttributeHostNativeAtomicSupported: + *pi = prop.hostNativeAtomicSupported; + break; + case hipDeviceAttributeConcurrentManagedAccess: + *pi = prop.concurrentManagedAccess; + break; + case hipDeviceAttributePageableMemoryAccess: + *pi = prop.pageableMemoryAccess; + break; + case hipDeviceAttributePageableMemoryAccessUsesHostPageTables: + *pi = prop.pageableMemoryAccessUsesHostPageTables; + break; + case hipDeviceAttributeIsLargeBar: + *pi = prop.isLargeBar; + break; + case hipDeviceAttributeUnifiedAddressing: + // HIP runtime always uses SVM for host memory allocations. + // Note: Host registered memory isn't covered by this feature + // and still requires hipMemHostGetDevicePointer() call + *pi = true; + break; + case hipDeviceAttributeCanUseStreamWaitValue: + // hipStreamWaitValue64() and hipStreamWaitValue32() support + *pi = g_devices[device]->devices()[0]->info().aqlBarrierValue_; + break; + case hipDeviceAttributeImageSupport: + *pi = static_cast(g_devices[device]->devices()[0]->info().imageSupport_); + break; + case hipDeviceAttributePhysicalMultiProcessorCount: + *pi = g_devices[device]->devices()[0]->info().maxPhysicalComputeUnits_; + break; + case hipDeviceAttributeFineGrainSupport: + *pi = static_cast(g_devices[device]->devices()[0]->isFineGrainSupported()); + break; + case hipDeviceAttributeMemoryPoolsSupported: + *pi = HIP_MEM_POOL_SUPPORT; + break; + case hipDeviceAttributeVirtualMemoryManagementSupported: + *pi = static_cast(g_devices[device]->devices()[0]->info().virtualMemoryManagement_); + break; + default: + HIP_RETURN(hipErrorInvalidValue); } HIP_RETURN(hipSuccess); } -hipError_t hipDeviceGetByPCIBusId(int* device, const char*pciBusIdstr) { - +hipError_t hipDeviceGetByPCIBusId(int* device, const char* pciBusIdstr) { HIP_INIT_API(hipDeviceGetByPCIBusId, device, pciBusIdstr); if (device == nullptr || pciBusIdstr == nullptr) { @@ -369,9 +433,9 @@ hipError_t hipDeviceGetByPCIBusId(int* device, const char*pciBusIdstr) { int pciDeviceID = -1; int pciDomainID = -1; bool found = false; - if (sscanf (pciBusIdstr, "%04x:%02x:%02x", reinterpret_cast(&pciDomainID), - reinterpret_cast(&pciBusID), - reinterpret_cast(&pciDeviceID)) == 0x3) { + if (sscanf(pciBusIdstr, "%04x:%02x:%02x", reinterpret_cast(&pciDomainID), + reinterpret_cast(&pciBusID), + reinterpret_cast(&pciDeviceID)) == 0x3) { int count = 0; HIP_RETURN_ONFAIL(ihipDeviceGetCount(&count)); for (cl_int i = 0; i < count; i++) { @@ -380,8 +444,8 @@ hipError_t hipDeviceGetByPCIBusId(int* device, const char*pciBusIdstr) { HIP_RETURN_ONFAIL(ihipDeviceGet(&dev, i)); HIP_RETURN_ONFAIL(ihipGetDeviceProperties(&prop, dev)); - if ((pciBusID == prop.pciBusID) && (pciDomainID == prop.pciDomainID) - && (pciDeviceID == prop.pciDeviceID)) { + if ((pciBusID == prop.pciBusID) && (pciDomainID == prop.pciDomainID) && + (pciDeviceID == prop.pciDeviceID)) { *device = i; found = true; break; @@ -395,10 +459,10 @@ hipError_t hipDeviceGetByPCIBusId(int* device, const char*pciBusIdstr) { HIP_RETURN(hipSuccess); } -hipError_t hipDeviceGetCacheConfig ( hipFuncCache_t * cacheConfig ) { +hipError_t hipDeviceGetCacheConfig(hipFuncCache_t* cacheConfig) { HIP_INIT_API(hipDeviceGetCacheConfig, cacheConfig); - if(cacheConfig == nullptr) { + if (cacheConfig == nullptr) { HIP_RETURN(hipErrorInvalidValue); } @@ -407,8 +471,7 @@ hipError_t hipDeviceGetCacheConfig ( hipFuncCache_t * cacheConfig ) { HIP_RETURN(hipSuccess); } -hipError_t hipDeviceGetLimit ( size_t* pValue, hipLimit_t limit ) { - +hipError_t hipDeviceGetLimit(size_t* pValue, hipLimit_t limit) { HIP_INIT_API(hipDeviceGetLimit, pValue, limit); if (pValue == nullptr || limit >= hipLimitRange) { @@ -431,8 +494,7 @@ hipError_t hipDeviceGetLimit ( size_t* pValue, hipLimit_t limit ) { HIP_RETURN(hipSuccess); } -hipError_t hipDeviceGetPCIBusId ( char* pciBusId, int len, int device ) { - +hipError_t hipDeviceGetPCIBusId(char* pciBusId, int len, int device) { HIP_INIT_API(hipDeviceGetPCIBusId, (void*)pciBusId, len, device); int count; @@ -442,24 +504,19 @@ hipError_t hipDeviceGetPCIBusId ( char* pciBusId, int len, int device ) { HIP_RETURN(hipErrorInvalidDevice); } - //pciBusId should be large enough to store 13 characters including the NULL-terminator. + // pciBusId should be large enough to store 13 characters including the NULL-terminator. if (pciBusId == nullptr) { HIP_RETURN(hipErrorInvalidValue); } hipDeviceProp_t prop; HIP_RETURN_ONFAIL(ihipGetDeviceProperties(&prop, device)); - auto* deviceHandle = g_devices[device]->devices()[0]; - snprintf (pciBusId, len, "%04x:%02x:%02x.%01x", - prop.pciDomainID, - prop.pciBusID, - prop.pciDeviceID, - deviceHandle->info().deviceTopology_.pcie.function); + snprintf(pciBusId, len, "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID); HIP_RETURN(len <= 12 ? hipErrorInvalidValue : hipSuccess); } -hipError_t hipDeviceGetSharedMemConfig ( hipSharedMemConfig * pConfig ) { +hipError_t hipDeviceGetSharedMemConfig(hipSharedMemConfig* pConfig) { HIP_INIT_API(hipDeviceGetSharedMemConfig, pConfig); if (pConfig == nullptr) { HIP_RETURN(hipErrorInvalidValue); @@ -469,7 +526,7 @@ hipError_t hipDeviceGetSharedMemConfig ( hipSharedMemConfig * pConfig ) { HIP_RETURN(hipSuccess); } -hipError_t hipDeviceReset ( void ) { +hipError_t hipDeviceReset(void) { HIP_INIT_API(hipDeviceReset); hip::getCurrentDevice()->Reset(); @@ -477,7 +534,7 @@ hipError_t hipDeviceReset ( void ) { HIP_RETURN(hipSuccess); } -hipError_t hipDeviceSetCacheConfig ( hipFuncCache_t cacheConfig ) { +hipError_t hipDeviceSetCacheConfig(hipFuncCache_t cacheConfig) { HIP_INIT_API(hipDeviceSetCacheConfig, cacheConfig); // No way to set cache config yet. @@ -485,34 +542,33 @@ hipError_t hipDeviceSetCacheConfig ( hipFuncCache_t cacheConfig ) { HIP_RETURN(hipSuccess); } -hipError_t hipDeviceSetLimit ( hipLimit_t limit, size_t value ) { +hipError_t hipDeviceSetLimit(hipLimit_t limit, size_t value) { HIP_INIT_API(hipDeviceSetLimit, limit, value); if (limit >= hipLimitRange) { HIP_RETURN(hipErrorInvalidValue); } - switch(limit) { - case hipLimitStackSize : - // need to query device size and take action - if (!hip::getCurrentDevice()->devices()[0]->UpdateStackSize(value)) { - HIP_RETURN(hipErrorInvalidValue); - } - break; - case hipLimitMallocHeapSize: - if (!hip::getCurrentDevice()->devices()[0]->UpdateInitialHeapSize(value)) { - HIP_RETURN(hipErrorInvalidValue); - } - break; - default: - LogPrintfError("UnsupportedLimit = %d is passed", limit); - HIP_RETURN(hipErrorUnsupportedLimit); + switch (limit) { + case hipLimitStackSize: + // need to query device size and take action + if (!hip::getCurrentDevice()->devices()[0]->UpdateStackSize(value)) { + HIP_RETURN(hipErrorInvalidValue); + } + break; + case hipLimitMallocHeapSize: + if (!hip::getCurrentDevice()->devices()[0]->UpdateInitialHeapSize(value)) { + HIP_RETURN(hipErrorInvalidValue); + } + break; + default: + LogPrintfError("UnsupportedLimit = %d is passed", limit); + HIP_RETURN(hipErrorUnsupportedLimit); } HIP_RETURN(hipSuccess); } -hipError_t hipDeviceSetSharedMemConfig ( hipSharedMemConfig config ) { +hipError_t hipDeviceSetSharedMemConfig(hipSharedMemConfig config) { HIP_INIT_API(hipDeviceSetSharedMemConfig, config); - if (config != hipSharedMemBankSizeDefault && - config != hipSharedMemBankSizeFourByte && + if (config != hipSharedMemBankSizeDefault && config != hipSharedMemBankSizeFourByte && config != hipSharedMemBankSizeEightByte) { HIP_RETURN(hipErrorInvalidValue); } @@ -530,13 +586,13 @@ hipError_t hipDeviceSynchronize() { int ihipGetDevice() { hip::Device* device = hip::getCurrentDevice(); - if(device == nullptr){ + if (device == nullptr) { return -1; } return device->deviceId(); } -hipError_t hipGetDevice ( int* deviceId ) { +hipError_t hipGetDevice(int* deviceId) { HIP_INIT_API(hipGetDevice, deviceId); if (deviceId != nullptr) { @@ -551,13 +607,13 @@ hipError_t hipGetDevice ( int* deviceId ) { } } -hipError_t hipGetDeviceCount ( int* count ) { +hipError_t hipGetDeviceCount(int* count) { HIP_INIT_API_NO_RETURN(hipGetDeviceCount, count); HIP_RETURN(ihipDeviceGetCount(count)); } -hipError_t hipGetDeviceFlags ( unsigned int* flags ) { +hipError_t hipGetDeviceFlags(unsigned int* flags) { HIP_INIT_API(hipGetDeviceFlags, flags); if (flags == nullptr) { HIP_RETURN(hipErrorInvalidValue); @@ -566,7 +622,7 @@ hipError_t hipGetDeviceFlags ( unsigned int* flags ) { HIP_RETURN(hipSuccess); } -hipError_t hipSetDevice ( int device ) { +hipError_t hipSetDevice(int device) { HIP_INIT_API_NO_RETURN(hipSetDevice, device); if (static_cast(device) < g_devices.size()) { hip::setCurrentDevice(device); @@ -578,7 +634,7 @@ hipError_t hipSetDevice ( int device ) { HIP_RETURN(hipErrorInvalidDevice); } -hipError_t hipSetDeviceFlags ( unsigned int flags ) { +hipError_t hipSetDeviceFlags(unsigned int flags) { HIP_INIT_API(hipSetDeviceFlags, flags); if (g_devices.empty()) { HIP_RETURN(hipErrorNoDevice); @@ -590,9 +646,10 @@ hipError_t hipSetDeviceFlags ( unsigned int flags ) { // Only one scheduling flag allowed a time uint32_t scheduleFlag = flags & hipDeviceScheduleMask; - if (((scheduleFlag & mutualExclusiveFlags) != hipDeviceScheduleSpin) && ((scheduleFlag & mutualExclusiveFlags) != hipDeviceScheduleYield) - && ((scheduleFlag & mutualExclusiveFlags) != hipDeviceScheduleBlockingSync) - && ((scheduleFlag & mutualExclusiveFlags) != hipDeviceScheduleAuto)) { + if (((scheduleFlag & mutualExclusiveFlags) != hipDeviceScheduleSpin) && + ((scheduleFlag & mutualExclusiveFlags) != hipDeviceScheduleYield) && + ((scheduleFlag & mutualExclusiveFlags) != hipDeviceScheduleBlockingSync) && + ((scheduleFlag & mutualExclusiveFlags) != hipDeviceScheduleAuto)) { HIP_RETURN(hipErrorInvalidValue); } @@ -625,7 +682,7 @@ hipError_t hipSetDeviceFlags ( unsigned int flags ) { HIP_RETURN(hipSuccess); } -hipError_t hipSetValidDevices ( int* device_arr, int len ) { +hipError_t hipSetValidDevices(int* device_arr, int len) { HIP_INIT_API(hipSetValidDevices, device_arr, len); assert(0 && "Unimplemented"); diff --git a/hipamd/src/hip_hcc.map.in b/hipamd/src/hip_hcc.map.in index 9494a3f44c..a0d22286af 100644 --- a/hipamd/src/hip_hcc.map.in +++ b/hipamd/src/hip_hcc.map.in @@ -525,4 +525,11 @@ global: hipArray3DGetDescriptor; local: *; -} hip_5.5; \ No newline at end of file +} hip_5.5; + +hip_6.0 { +global: + hipGetDevicePropertiesR0600; +local: + *; +} hip_5.6; \ No newline at end of file diff --git a/hipamd/src/hip_peer.cpp b/hipamd/src/hip_peer.cpp index 17dc65da05..cadb7c1035 100644 --- a/hipamd/src/hip_peer.cpp +++ b/hipamd/src/hip_peer.cpp @@ -151,7 +151,7 @@ hipError_t hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr, // Linear layout access is supported if P2P is enabled // Opaque Images are supported only on homogeneous systems // Might have more conditions to check, in future. - if (srcDeviceProp.gcnArch == dstDeviceProp.gcnArch) { + if (std::string(srcDeviceProp.gcnArchName) == std::string(dstDeviceProp.gcnArchName)) { HIP_RETURN_ONFAIL(canAccessPeer(value, srcDevice, dstDevice)); } else { *value = 0; diff --git a/hipamd/src/hiprtc/hiprtcInternal.cpp b/hipamd/src/hiprtc/hiprtcInternal.cpp index d423d78fc0..446379184f 100644 --- a/hipamd/src/hiprtc/hiprtcInternal.cpp +++ b/hipamd/src/hiprtc/hiprtcInternal.cpp @@ -76,7 +76,12 @@ bool RTCProgram::findIsa() { } void* sym_hipGetDevice = amd::Os::getSymbol(handle, "hipGetDevice"); - void* sym_hipGetDeviceProperties = amd::Os::getSymbol(handle, "hipGetDeviceProperties"); + void* sym_hipGetDeviceProperties = + amd::Os::getSymbol(handle, "hipGetDevicePropertiesR0600"); // Try to find the new symbol + if (sym_hipGetDeviceProperties == nullptr) { + sym_hipGetDeviceProperties = + amd::Os::getSymbol(handle, "hipGetDeviceProperties"); // Fall back to old one + } if (sym_hipGetDevice == nullptr || sym_hipGetDeviceProperties == nullptr) { LogInfo("ISA cannot be found to dlsym failure"); diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp index dc147aa775..a033fcd7eb 100644 --- a/rocclr/device/device.hpp +++ b/rocclr/device/device.hpp @@ -344,6 +344,9 @@ struct Info : public amd::EmbeddedObject { //! Max width of 2D image in pixels. size_t image2DMaxWidth_; + //! Max width of 2DA image in pixels. + size_t image2DAMaxWidth_[2]; + //! Max height of 2D image in pixels. size_t image2DMaxHeight_; @@ -480,6 +483,9 @@ struct Info : public amd::EmbeddedObject { //! Returns max number of pixels for a 1D image size_t image1DMaxWidth_; + //! Returns max number of pixels for a 1DA image + size_t image1DAMaxWidth_; + //! Returns max number of pixels for a 1D image created from a buffer object size_t imageMaxBufferSize_; @@ -1526,7 +1532,6 @@ class Isa { uint32_t memChannelBankWidth_; //!< Memory channel bank width. uint32_t localMemSizePerCU_; //!< Local memory size per CU. uint32_t localMemBanks_; //!< Number of banks of local memory. - }; // class Isa /*! \addtogroup Runtime diff --git a/rocclr/device/pal/paldevice.cpp b/rocclr/device/pal/paldevice.cpp index 5fd640bae4..d47346c922 100644 --- a/rocclr/device/pal/paldevice.cpp +++ b/rocclr/device/pal/paldevice.cpp @@ -563,6 +563,9 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp, info_.maxMemAllocSize_ / kPixelRgbaSize); info_.image1DMaxWidth_ = maxTextureSize; info_.imageMaxArraySize_ = MaxImageArraySize; + info_.image2DAMaxWidth_[0] = MaxImageArraySize; + info_.image2DAMaxWidth_[1] = MaxImageArraySize; + info_.image1DAMaxWidth_ = maxTextureSize; info_.preferredInteropUserSync_ = true; info_.printfBufferSize_ = PrintfDbg::WorkitemDebugSize * info().maxWorkGroupSize_; diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp index 2d17a71c84..58a6838948 100644 --- a/rocclr/device/rocm/rocdevice.cpp +++ b/rocclr/device/rocm/rocdevice.cpp @@ -1521,6 +1521,27 @@ bool Device::populateOCLDeviceConstants() { info_.imageMaxArraySize_ = max_array_size; + uint32_t max_image1da_width = 0; + if (HSA_STATUS_SUCCESS != + hsa_agent_get_info(bkendDevice_, + static_cast(HSA_EXT_AGENT_INFO_IMAGE_1DA_MAX_ELEMENTS), + &max_image1da_width)) { + return false; + } + + info_.image1DAMaxWidth_ = max_image1da_width; + + uint32_t max_image2da_width[2] = {0, 0}; + if (HSA_STATUS_SUCCESS != + hsa_agent_get_info(bkendDevice_, + static_cast(HSA_EXT_AGENT_INFO_IMAGE_2DA_MAX_ELEMENTS), + &max_image2da_width)) { + return false; + } + + info_.image2DAMaxWidth_[0] = max_image2da_width[0]; + info_.image2DAMaxWidth_[1] = max_image2da_width[1]; + uint32_t max_image1d_width = 0; if (HSA_STATUS_SUCCESS != hsa_agent_get_info(bkendDevice_,