From 6db9b782beef9296e1beb78bc0274d8d0b50d0b2 Mon Sep 17 00:00:00 2001 From: Aryan Salmanpour Date: Thu, 30 Jan 2020 15:05:53 -0500 Subject: [PATCH 01/50] [HIP][HIPIFY] Add some missing flags for cooperative launch and occupancy APIs --- hipamd/bin/hipify-perl | 5 +++++ .../CUDA_Driver_API_functions_supported_by_HIP.md | 5 +++-- .../CUDA_Runtime_API_functions_supported_by_HIP.md | 4 ++-- .../hipify-clang/src/CUDA2HIP_Driver_API_types.cpp | 2 ++ .../hipify-clang/src/CUDA2HIP_Runtime_API_types.cpp | 2 +- hipamd/include/hip/hcc_detail/hip_runtime_api.h | 11 ++++++++--- hipamd/include/hip/hip_runtime_api.h | 4 ++++ hipamd/include/hip/nvcc_detail/hip_runtime_api.h | 12 ++++++++++++ hipamd/src/hip_hcc.cpp | 2 ++ 9 files changed, 39 insertions(+), 8 deletions(-) diff --git a/hipamd/bin/hipify-perl b/hipamd/bin/hipify-perl index 4fe9b80349..d18e227c3d 100755 --- a/hipamd/bin/hipify-perl +++ b/hipamd/bin/hipify-perl @@ -1186,6 +1186,7 @@ sub simpleSubstitutions { $ft{'numeric_literal'} += s/\bCUDA_ERROR_INVALID_PTX\b/hipErrorInvalidKernelFile/g; $ft{'numeric_literal'} += s/\bCUDA_ERROR_INVALID_SOURCE\b/hipErrorInvalidSource/g; $ft{'numeric_literal'} += s/\bCUDA_ERROR_INVALID_VALUE\b/hipErrorInvalidValue/g; + $ft{'numeric_literal'} += s/\bCUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE\b/hipErrorCooperativeLaunchTooLarge/g; $ft{'numeric_literal'} += s/\bCUDA_ERROR_LAUNCH_FAILED\b/hipErrorLaunchFailure/g; $ft{'numeric_literal'} += s/\bCUDA_ERROR_LAUNCH_OUT_OF_RESOURCES\b/hipErrorLaunchOutOfResources/g; $ft{'numeric_literal'} += s/\bCUDA_ERROR_LAUNCH_TIMEOUT\b/hipErrorLaunchTimeOut/g; @@ -1659,6 +1660,7 @@ sub simpleSubstitutions { $ft{'numeric_literal'} += s/\bcudaErrorInvalidSource\b/hipErrorInvalidSource/g; $ft{'numeric_literal'} += s/\bcudaErrorInvalidSymbol\b/hipErrorInvalidSymbol/g; $ft{'numeric_literal'} += s/\bcudaErrorInvalidValue\b/hipErrorInvalidValue/g; + $ft{'numeric_literal'} += s/\bcudaErrorCooperativeLaunchTooLarge\b/hipErrorCooperativeLaunchTooLarge/g; $ft{'numeric_literal'} += s/\bcudaErrorLaunchFailure\b/hipErrorLaunchFailure/g; $ft{'numeric_literal'} += s/\bcudaErrorLaunchOutOfResources\b/hipErrorLaunchOutOfResources/g; $ft{'numeric_literal'} += s/\bcudaErrorLaunchTimeout\b/hipErrorLaunchTimeOut/g; @@ -1801,6 +1803,9 @@ sub simpleSubstitutions { $ft{'define'} += s/\bcudaTextureType3D\b/hipTextureType3D/g; $ft{'define'} += s/\bcudaTextureTypeCubemap\b/hipTextureTypeCubemap/g; $ft{'define'} += s/\bcudaTextureTypeCubemapLayered\b/hipTextureTypeCubemapLayered/g; + $ft{'define'} += s/\bcudaOccupancyDefault\b/hipOccupancyDefault/g; + $ft{'define'} += s/\bcudaCooperativeLaunchMultiDeviceNoPreSync\b/hipCooperativeLaunchMultiDeviceNoPreSync/g; + $ft{'define'} += s/\bcudaCooperativeLaunchMultiDeviceNoPostSync\b/hipCooperativeLaunchMultiDeviceNoPostSync/g; } # CUDA extern __shared__ syntax replace with HIP_DYNAMIC_SHARED() macro diff --git a/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md b/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md index ff14cf9bc1..5be6a4da8d 100644 --- a/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md +++ b/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md @@ -447,6 +447,7 @@ | 717 |*`CUDA_ERROR_INVALID_ADDRESS_SPACE`* | | | 718 |*`CUDA_ERROR_INVALID_PC`* | | | 719 |*`CUDA_ERROR_LAUNCH_FAILED`* |*`hipErrorLaunchFailure`* | +| 720 |*`CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE`* |*`hipErrorCooperativeLaunchTooLarge`* | | 800 |*`CUDA_ERROR_NOT_PERMITTED`* | | | 801 |*`CUDA_ERROR_NOT_SUPPORTED`* |*`hipErrorNotSupported`* | | 802 |*`CUDA_ERROR_SYSTEM_NOT_READY`* | | 10.0 | @@ -1063,8 +1064,8 @@ | `cuFuncSetSharedMemConfig` | | | `cuLaunchKernel` | `hipModuleLaunchKernel` | | `cuLaunchHostFunc` | | 10.0 | -| `cuLaunchCooperativeKernel` | | 9.0 | -| `cuLaunchCooperativeKernelMultiDevice` | | 9.0 | +| `cuLaunchCooperativeKernel` | `hipLaunchCooperativeKernel` | 9.0 | +| `cuLaunchCooperativeKernelMultiDevice` | `hipLaunchCooperativeKernelMultiDevice` | 9.0 | ## **19. Execution Control [DEPRECATED]** diff --git a/hipamd/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md b/hipamd/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md index 201d2aad0f..54e0c89e06 100644 --- a/hipamd/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md +++ b/hipamd/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md @@ -800,7 +800,7 @@ | 717 |*`cudaErrorInvalidAddressSpace`* | | | | 718 |*`cudaErrorInvalidPc`* | | | | 719 |*`cudaErrorLaunchFailure`* | |*`hipErrorLaunchFailure`* | -| 720 |*`cudaErrorCooperativeLaunchTooLarge`* | 9.0 | | +| 720 |*`cudaErrorCooperativeLaunchTooLarge`* | 9.0 |*`hipErrorCooperativeLaunchTooLarge`* | | 800 |*`cudaErrorNotPermitted`* | | | | 801 |*`cudaErrorNotSupported`* | |*`hipErrorNotSupported`* | | 802 |*`cudaErrorSystemNotReady`* | 10.0 | | @@ -1045,7 +1045,7 @@ | define |`cudaMemAttachGlobal` | |`hipMemAttachGlobal` | | define |`cudaMemAttachHost` | |`hipMemAttachHost` | | define |`cudaMemAttachSingle` | | | -| define |`cudaOccupancyDefault` | | | +| define |`cudaOccupancyDefault` | |`hipOccupancyDefault` | | define |`cudaOccupancyDisableCachingOverride` | | | | define |`cudaPeerAccessDefault` | | | | define |`cudaStreamDefault` | |`hipStreamDefault` | diff --git a/hipamd/hipify-clang/src/CUDA2HIP_Driver_API_types.cpp b/hipamd/hipify-clang/src/CUDA2HIP_Driver_API_types.cpp index 1c3c2634f7..a4d419384d 100644 --- a/hipamd/hipify-clang/src/CUDA2HIP_Driver_API_types.cpp +++ b/hipamd/hipify-clang/src/CUDA2HIP_Driver_API_types.cpp @@ -1284,6 +1284,8 @@ const std::map CUDA_DRIVER_TYPE_NAME_MAP{ {"CUDA_ERROR_INVALID_PC", {"hipErrorInvalidPc", "", CONV_NUMERIC_LITERAL, API_DRIVER, HIP_UNSUPPORTED}}, // 718 // cudaErrorLaunchFailure {"CUDA_ERROR_LAUNCH_FAILED", {"hipErrorLaunchFailure", "", CONV_NUMERIC_LITERAL, API_DRIVER}}, // 719 + // cudaErrorCooperativeLaunchTooLarge + {"CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE", {"hipErrorCooperativeLaunchTooLarge", "", CONV_NUMERIC_LITERAL, API_DRIVER, HIP_UNSUPPORTED}}, // 720 // cudaErrorNotPermitted {"CUDA_ERROR_NOT_PERMITTED", {"hipErrorNotPermitted", "", CONV_NUMERIC_LITERAL, API_DRIVER, HIP_UNSUPPORTED}}, // 800 // cudaErrorNotSupported diff --git a/hipamd/hipify-clang/src/CUDA2HIP_Runtime_API_types.cpp b/hipamd/hipify-clang/src/CUDA2HIP_Runtime_API_types.cpp index 5993f5d770..ade8b105f1 100644 --- a/hipamd/hipify-clang/src/CUDA2HIP_Runtime_API_types.cpp +++ b/hipamd/hipify-clang/src/CUDA2HIP_Runtime_API_types.cpp @@ -787,7 +787,7 @@ const std::map CUDA_RUNTIME_TYPE_NAME_MAP { {"cudaErrorInvalidPc", {"hipErrorInvalidPc", "", CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 718 // CUDA_ERROR_LAUNCH_FAILED {"cudaErrorLaunchFailure", {"hipErrorLaunchFailure", "", CONV_NUMERIC_LITERAL, API_RUNTIME}}, // 719 - // no analogue + // CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE {"cudaErrorCooperativeLaunchTooLarge", {"hipErrorCooperativeLaunchTooLarge", "", CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 720 // CUDA_ERROR_NOT_PERMITTED {"cudaErrorNotPermitted", {"hipErrorNotPermitted", "", CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 800 diff --git a/hipamd/include/hip/hcc_detail/hip_runtime_api.h b/hipamd/include/hip/hcc_detail/hip_runtime_api.h index 659a6c3c3a..8ee44be6ac 100644 --- a/hipamd/include/hip/hcc_detail/hip_runtime_api.h +++ b/hipamd/include/hip/hcc_detail/hip_runtime_api.h @@ -212,6 +212,11 @@ enum hipLimit_t { #define hipArrayCubemap 0x04 #define hipArrayTextureGather 0x08 +#define hipOccupancyDefault 0x00 + +#define hipCooperativeLaunchMultiDeviceNoPreSync 0x01 +#define hipCooperativeLaunchMultiDeviceNoPostSync 0x02 + /* * @brief hipJitOption * @enum @@ -2903,7 +2908,7 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX, unsigne * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case th * default stream is used with associated synchronization rules. * - * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue + * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue, hipErrorCooperativeLaunchTooLarge */ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDimX, void** kernelParams, unsigned int sharedMemBytes, @@ -2917,7 +2922,7 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDim * @param [in] numDevices Size of the launchParamsList array. * @param [in] flags Flags to control launch behavior. * - * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue + * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue, hipErrorCooperativeLaunchTooLarge */ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices, unsigned int flags); @@ -2960,7 +2965,7 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( * @param [in] flags Extra flags for occupancy calculation (currently ignored) */ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags); + int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags __dparm(hipOccupancyDefault)); #if __HIP_VDI__ && !defined(__HCC__) /** diff --git a/hipamd/include/hip/hip_runtime_api.h b/hipamd/include/hip/hip_runtime_api.h index 64b2a85d8a..025688e98c 100644 --- a/hipamd/include/hip/hip_runtime_api.h +++ b/hipamd/include/hip/hip_runtime_api.h @@ -255,6 +255,10 @@ typedef enum __HIP_NODISCARD hipError_t { 713, ///< Produced when trying to unlock a non-page-locked memory. hipErrorLaunchFailure = 719, ///< An exception occurred on the device while executing a kernel. + hipErrorCooperativeLaunchTooLarge = + 720, ///< This error indicates that the number of blocks launched per grid for a kernel + ///< that was launched via cooperative launch APIs exceeds the maximum number of + ///< allowed blocks for the current device hipErrorNotSupported = 801, ///< Produced when the hip API is not supported/implemented hipErrorUnknown = 999, //< Unknown error. // HSA Runtime Error Codes start here. diff --git a/hipamd/include/hip/nvcc_detail/hip_runtime_api.h b/hipamd/include/hip/nvcc_detail/hip_runtime_api.h index 05a162478a..1ee84b5057 100644 --- a/hipamd/include/hip/nvcc_detail/hip_runtime_api.h +++ b/hipamd/include/hip/nvcc_detail/hip_runtime_api.h @@ -140,6 +140,12 @@ typedef enum cudaChannelFormatKind hipChannelFormatKind; #define hipLimitMallocHeapSize cudaLimitMallocHeapSize #define hipIpcMemLazyEnablePeerAccess cudaIpcMemLazyEnablePeerAccess +#define hipOccupancyDefault cudaOccupancyDefault + +#define hipCooperativeLaunchMultiDeviceNoPreSync cudaCooperativeLaunchMultiDeviceNoPreSync +#define hipCooperativeLaunchMultiDeviceNoPostSync cudaCooperativeLaunchMultiDeviceNoPostSync + + // enum CUjit_option redefines #define hipJitOptionMaxRegisters CU_JIT_MAX_REGISTERS #define hipJitOptionThreadsPerBlock CU_JIT_THREADS_PER_BLOCK @@ -267,6 +273,8 @@ inline static hipError_t hipCUDAErrorTohipError(cudaError_t cuError) { return hipErrorNotInitialized; case cudaErrorLaunchFailure: return hipErrorLaunchFailure; + case cudaErrorCooperativeLaunchTooLarge: + return hipErrorCooperativeLaunchTooLarge; case cudaErrorPriorLaunchFailure: return hipErrorPriorLaunchFailure; case cudaErrorLaunchOutOfResources: @@ -445,6 +453,8 @@ inline static hipError_t hipCUResultTohipError(CUresult cuError) { return hipErrorHostMemoryNotRegistered; case CUDA_ERROR_LAUNCH_FAILED: return hipErrorLaunchFailure; + case CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE: + return hipErrorCooperativeLaunchTooLarge; case CUDA_ERROR_NOT_SUPPORTED: return hipErrorNotSupported; case CUDA_ERROR_UNKNOWN: @@ -601,6 +611,8 @@ inline static cudaError_t hipErrorToCudaError(hipError_t hError) { return cudaErrorSetOnActiveProcess; case hipErrorLaunchFailure: return cudaErrorLaunchFailure; + case hipErrorCooperativeLaunchTooLarge: + return cudaErrorCooperativeLaunchTooLarge; case hipErrorNotSupported: return cudaErrorNotSupported; // HSA: does not exist in CUDA diff --git a/hipamd/src/hip_hcc.cpp b/hipamd/src/hip_hcc.cpp index 175d301ee1..85c08943bf 100644 --- a/hipamd/src/hip_hcc.cpp +++ b/hipamd/src/hip_hcc.cpp @@ -1796,6 +1796,8 @@ const char* ihipErrorString(hipError_t hip_error) { return "hipErrorMissingConfiguration"; case hipErrorLaunchFailure: return "hipErrorLaunchFailure"; + case hipErrorCooperativeLaunchTooLarge: + return "hipErrorCooperativeLaunchTooLarge"; case hipErrorPriorLaunchFailure: return "hipErrorPriorLaunchFailure"; case hipErrorLaunchTimeOut: From 75572a41752ba95c29c48bfa4e4399c9fba89a2e Mon Sep 17 00:00:00 2001 From: Aryan Salmanpour Date: Fri, 31 Jan 2020 13:08:25 -0500 Subject: [PATCH 02/50] code clean up --- hipamd/bin/hipify-perl | 5 ----- .../CUDA_Driver_API_functions_supported_by_HIP.md | 4 ++-- hipamd/hipify-clang/src/CUDA2HIP_Driver_API_types.cpp | 8 ++++---- hipamd/hipify-clang/src/CUDA2HIP_Runtime_API_types.cpp | 8 ++++---- hipamd/include/hip/nvcc_detail/hip_runtime_api.h | 6 ++++-- 5 files changed, 14 insertions(+), 17 deletions(-) diff --git a/hipamd/bin/hipify-perl b/hipamd/bin/hipify-perl index d18e227c3d..4fe9b80349 100755 --- a/hipamd/bin/hipify-perl +++ b/hipamd/bin/hipify-perl @@ -1186,7 +1186,6 @@ sub simpleSubstitutions { $ft{'numeric_literal'} += s/\bCUDA_ERROR_INVALID_PTX\b/hipErrorInvalidKernelFile/g; $ft{'numeric_literal'} += s/\bCUDA_ERROR_INVALID_SOURCE\b/hipErrorInvalidSource/g; $ft{'numeric_literal'} += s/\bCUDA_ERROR_INVALID_VALUE\b/hipErrorInvalidValue/g; - $ft{'numeric_literal'} += s/\bCUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE\b/hipErrorCooperativeLaunchTooLarge/g; $ft{'numeric_literal'} += s/\bCUDA_ERROR_LAUNCH_FAILED\b/hipErrorLaunchFailure/g; $ft{'numeric_literal'} += s/\bCUDA_ERROR_LAUNCH_OUT_OF_RESOURCES\b/hipErrorLaunchOutOfResources/g; $ft{'numeric_literal'} += s/\bCUDA_ERROR_LAUNCH_TIMEOUT\b/hipErrorLaunchTimeOut/g; @@ -1660,7 +1659,6 @@ sub simpleSubstitutions { $ft{'numeric_literal'} += s/\bcudaErrorInvalidSource\b/hipErrorInvalidSource/g; $ft{'numeric_literal'} += s/\bcudaErrorInvalidSymbol\b/hipErrorInvalidSymbol/g; $ft{'numeric_literal'} += s/\bcudaErrorInvalidValue\b/hipErrorInvalidValue/g; - $ft{'numeric_literal'} += s/\bcudaErrorCooperativeLaunchTooLarge\b/hipErrorCooperativeLaunchTooLarge/g; $ft{'numeric_literal'} += s/\bcudaErrorLaunchFailure\b/hipErrorLaunchFailure/g; $ft{'numeric_literal'} += s/\bcudaErrorLaunchOutOfResources\b/hipErrorLaunchOutOfResources/g; $ft{'numeric_literal'} += s/\bcudaErrorLaunchTimeout\b/hipErrorLaunchTimeOut/g; @@ -1803,9 +1801,6 @@ sub simpleSubstitutions { $ft{'define'} += s/\bcudaTextureType3D\b/hipTextureType3D/g; $ft{'define'} += s/\bcudaTextureTypeCubemap\b/hipTextureTypeCubemap/g; $ft{'define'} += s/\bcudaTextureTypeCubemapLayered\b/hipTextureTypeCubemapLayered/g; - $ft{'define'} += s/\bcudaOccupancyDefault\b/hipOccupancyDefault/g; - $ft{'define'} += s/\bcudaCooperativeLaunchMultiDeviceNoPreSync\b/hipCooperativeLaunchMultiDeviceNoPreSync/g; - $ft{'define'} += s/\bcudaCooperativeLaunchMultiDeviceNoPostSync\b/hipCooperativeLaunchMultiDeviceNoPostSync/g; } # CUDA extern __shared__ syntax replace with HIP_DYNAMIC_SHARED() macro diff --git a/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md b/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md index 5be6a4da8d..8956533ed0 100644 --- a/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md +++ b/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md @@ -1064,8 +1064,8 @@ | `cuFuncSetSharedMemConfig` | | | `cuLaunchKernel` | `hipModuleLaunchKernel` | | `cuLaunchHostFunc` | | 10.0 | -| `cuLaunchCooperativeKernel` | `hipLaunchCooperativeKernel` | 9.0 | -| `cuLaunchCooperativeKernelMultiDevice` | `hipLaunchCooperativeKernelMultiDevice` | 9.0 | +| `cuLaunchCooperativeKernel` | | 9.0 | +| `cuLaunchCooperativeKernelMultiDevice` | | 9.0 | ## **19. Execution Control [DEPRECATED]** diff --git a/hipamd/hipify-clang/src/CUDA2HIP_Driver_API_types.cpp b/hipamd/hipify-clang/src/CUDA2HIP_Driver_API_types.cpp index a4d419384d..054de19800 100644 --- a/hipamd/hipify-clang/src/CUDA2HIP_Driver_API_types.cpp +++ b/hipamd/hipify-clang/src/CUDA2HIP_Driver_API_types.cpp @@ -1047,7 +1047,7 @@ const std::map CUDA_DRIVER_TYPE_NAME_MAP{ {"CUoccupancy_flags_enum", {"hipOccupancyFlags", "", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}}, // CUoccupancy_flags enum values // cudaOccupancyDefault - {"CU_OCCUPANCY_DEFAULT", {"hipOccupancyDefault", "", CONV_NUMERIC_LITERAL, API_DRIVER, HIP_UNSUPPORTED}}, // 0x00 + {"CU_OCCUPANCY_DEFAULT", {"hipOccupancyDefault", "", CONV_NUMERIC_LITERAL, API_DRIVER}}, // 0x00 // cudaOccupancyDisableCachingOverride {"CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE", {"hipOccupancyDisableCachingOverride", "", CONV_NUMERIC_LITERAL, API_DRIVER, HIP_UNSUPPORTED}}, // 0x01 @@ -1285,7 +1285,7 @@ const std::map CUDA_DRIVER_TYPE_NAME_MAP{ // cudaErrorLaunchFailure {"CUDA_ERROR_LAUNCH_FAILED", {"hipErrorLaunchFailure", "", CONV_NUMERIC_LITERAL, API_DRIVER}}, // 719 // cudaErrorCooperativeLaunchTooLarge - {"CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE", {"hipErrorCooperativeLaunchTooLarge", "", CONV_NUMERIC_LITERAL, API_DRIVER, HIP_UNSUPPORTED}}, // 720 + {"CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE", {"hipErrorCooperativeLaunchTooLarge", "", CONV_NUMERIC_LITERAL, API_DRIVER}}, // 720 // cudaErrorNotPermitted {"CUDA_ERROR_NOT_PERMITTED", {"hipErrorNotPermitted", "", CONV_NUMERIC_LITERAL, API_DRIVER, HIP_UNSUPPORTED}}, // 800 // cudaErrorNotSupported @@ -1600,9 +1600,9 @@ const std::map CUDA_DRIVER_TYPE_NAME_MAP{ // cudaArrayColorAttachment {"CUDA_ARRAY3D_COLOR_ATTACHMENT", {"hipArrayColorAttachment", "", CONV_DEFINE, API_DRIVER, HIP_UNSUPPORTED}}, // 0x20 // cudaCooperativeLaunchMultiDeviceNoPreSync - {"CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC", {"hipCooperativeLaunchMultiDeviceNoPreSync", "", CONV_DEFINE, API_DRIVER, HIP_UNSUPPORTED}}, // 0x01 + {"CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC", {"hipCooperativeLaunchMultiDeviceNoPreSync", "", CONV_DEFINE, API_DRIVER}}, // 0x01 // cudaCooperativeLaunchMultiDeviceNoPostSync - {"CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC", {"hipCooperativeLaunchMultiDeviceNoPostSync", "", CONV_DEFINE, API_DRIVER, HIP_UNSUPPORTED}}, // 0x02 + {"CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC", {"hipCooperativeLaunchMultiDeviceNoPostSync", "", CONV_DEFINE, API_DRIVER}}, // 0x02 // cudaExternalMemoryDedicated {"CUDA_EXTERNAL_MEMORY_DEDICATED", {"hipExternalMemoryDedicated", "", CONV_DEFINE, API_DRIVER, HIP_UNSUPPORTED}}, // 0x1 // cudaExternalSemaphoreSignalSkipNvSciBufMemSync diff --git a/hipamd/hipify-clang/src/CUDA2HIP_Runtime_API_types.cpp b/hipamd/hipify-clang/src/CUDA2HIP_Runtime_API_types.cpp index ade8b105f1..6eb9bfb2be 100644 --- a/hipamd/hipify-clang/src/CUDA2HIP_Runtime_API_types.cpp +++ b/hipamd/hipify-clang/src/CUDA2HIP_Runtime_API_types.cpp @@ -788,7 +788,7 @@ const std::map CUDA_RUNTIME_TYPE_NAME_MAP { // CUDA_ERROR_LAUNCH_FAILED {"cudaErrorLaunchFailure", {"hipErrorLaunchFailure", "", CONV_NUMERIC_LITERAL, API_RUNTIME}}, // 719 // CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE - {"cudaErrorCooperativeLaunchTooLarge", {"hipErrorCooperativeLaunchTooLarge", "", CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 720 + {"cudaErrorCooperativeLaunchTooLarge", {"hipErrorCooperativeLaunchTooLarge", "", CONV_NUMERIC_LITERAL, API_RUNTIME}}, // 720 // CUDA_ERROR_NOT_PERMITTED {"cudaErrorNotPermitted", {"hipErrorNotPermitted", "", CONV_NUMERIC_LITERAL, API_RUNTIME, HIP_UNSUPPORTED}}, // 800 // CUDA_ERROR_NOT_SUPPORTED @@ -1327,9 +1327,9 @@ const std::map CUDA_RUNTIME_TYPE_NAME_MAP { // CUDA_ARRAY3D_COLOR_ATTACHMENT {"cudaArrayColorAttachment", {"hipArrayColorAttachment", "", CONV_DEFINE, API_RUNTIME, HIP_UNSUPPORTED}}, // 0x20 // CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC - {"cudaCooperativeLaunchMultiDeviceNoPreSync", {"hipCooperativeLaunchMultiDeviceNoPreSync", "", CONV_DEFINE, API_RUNTIME, HIP_UNSUPPORTED}}, // 0x01 + {"cudaCooperativeLaunchMultiDeviceNoPreSync", {"hipCooperativeLaunchMultiDeviceNoPreSync", "", CONV_DEFINE, API_RUNTIME}}, // 0x01 // CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC - {"cudaCooperativeLaunchMultiDeviceNoPostSync", {"hipCooperativeLaunchMultiDeviceNoPostSync", "", CONV_DEFINE, API_RUNTIME, HIP_UNSUPPORTED}}, // 0x02 + {"cudaCooperativeLaunchMultiDeviceNoPostSync", {"hipCooperativeLaunchMultiDeviceNoPostSync", "", CONV_DEFINE, API_RUNTIME}}, // 0x02 // CU_DEVICE_CPU ((CUdevice)-1) {"cudaCpuDeviceId", {"hipCpuDeviceId", "", CONV_DEFINE, API_RUNTIME, HIP_UNSUPPORTED}}, // ((int)-1) // CU_DEVICE_INVALID ((CUdevice)-2) @@ -1412,7 +1412,7 @@ const std::map CUDA_RUNTIME_TYPE_NAME_MAP { // no analogue {"cudaTextureTypeCubemapLayered", {"hipTextureTypeCubemapLayered", "", CONV_DEFINE, API_RUNTIME}}, // 0xFC // CU_OCCUPANCY_DEFAULT - {"cudaOccupancyDefault", {"hipOccupancyDefault", "", CONV_DEFINE, API_RUNTIME, HIP_UNSUPPORTED}}, // 0x00 + {"cudaOccupancyDefault", {"hipOccupancyDefault", "", CONV_DEFINE, API_RUNTIME}}, // 0x00 // CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE {"cudaOccupancyDisableCachingOverride", {"hipOccupancyDisableCachingOverride", "", CONV_DEFINE, API_RUNTIME, HIP_UNSUPPORTED}}, // 0x01 // CU_STREAM_DEFAULT diff --git a/hipamd/include/hip/nvcc_detail/hip_runtime_api.h b/hipamd/include/hip/nvcc_detail/hip_runtime_api.h index 1ee84b5057..bcddce58b2 100644 --- a/hipamd/include/hip/nvcc_detail/hip_runtime_api.h +++ b/hipamd/include/hip/nvcc_detail/hip_runtime_api.h @@ -142,8 +142,10 @@ typedef enum cudaChannelFormatKind hipChannelFormatKind; #define hipOccupancyDefault cudaOccupancyDefault -#define hipCooperativeLaunchMultiDeviceNoPreSync cudaCooperativeLaunchMultiDeviceNoPreSync -#define hipCooperativeLaunchMultiDeviceNoPostSync cudaCooperativeLaunchMultiDeviceNoPostSync +#define hipCooperativeLaunchMultiDeviceNoPreSync + cudaCooperativeLaunchMultiDeviceNoPreSync +#define hipCooperativeLaunchMultiDeviceNoPostSync + cudaCooperativeLaunchMultiDeviceNoPostSync // enum CUjit_option redefines From d805e98fcb7d8a28ebf7ecebce14fcfc62108cc5 Mon Sep 17 00:00:00 2001 From: Aryan Salmanpour Date: Fri, 31 Jan 2020 14:36:37 -0500 Subject: [PATCH 03/50] update the hipify-perl --- hipamd/bin/hipify-perl | 8 ++++++++ 1 file changed, 8 insertions(+) mode change 100755 => 100644 hipamd/bin/hipify-perl diff --git a/hipamd/bin/hipify-perl b/hipamd/bin/hipify-perl old mode 100755 new mode 100644 index 4fe9b80349..f6de5abae4 --- a/hipamd/bin/hipify-perl +++ b/hipamd/bin/hipify-perl @@ -1172,6 +1172,7 @@ sub simpleSubstitutions { $ft{'numeric_literal'} += s/\bCUDA_ERROR_ASSERT\b/hipErrorAssert/g; $ft{'numeric_literal'} += s/\bCUDA_ERROR_CONTEXT_ALREADY_CURRENT\b/hipErrorContextAlreadyCurrent/g; $ft{'numeric_literal'} += s/\bCUDA_ERROR_CONTEXT_ALREADY_IN_USE\b/hipErrorContextAlreadyInUse/g; + $ft{'numeric_literal'} += s/\bCUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE\b/hipErrorCooperativeLaunchTooLarge/g; $ft{'numeric_literal'} += s/\bCUDA_ERROR_DEINITIALIZED\b/hipErrorDeinitialized/g; $ft{'numeric_literal'} += s/\bCUDA_ERROR_ECC_UNCORRECTABLE\b/hipErrorECCNotCorrectable/g; $ft{'numeric_literal'} += s/\bCUDA_ERROR_FILE_NOT_FOUND\b/hipErrorFileNotFound/g; @@ -1528,6 +1529,7 @@ sub simpleSubstitutions { $ft{'numeric_literal'} += s/\bCU_MEMORYTYPE_UNIFIED\b/hipMemoryTypeUnified/g; $ft{'numeric_literal'} += s/\bCU_MEM_ATTACH_GLOBAL\b/hipMemAttachGlobal/g; $ft{'numeric_literal'} += s/\bCU_MEM_ATTACH_HOST\b/hipMemAttachHost/g; + $ft{'numeric_literal'} += s/\bCU_OCCUPANCY_DEFAULT\b/hipOccupancyDefault/g; $ft{'numeric_literal'} += s/\bCU_RESOURCE_TYPE_ARRAY\b/hipResourceTypeArray/g; $ft{'numeric_literal'} += s/\bCU_RESOURCE_TYPE_LINEAR\b/hipResourceTypeLinear/g; $ft{'numeric_literal'} += s/\bCU_RESOURCE_TYPE_MIPMAPPED_ARRAY\b/hipResourceTypeMipmappedArray/g; @@ -1636,6 +1638,7 @@ sub simpleSubstitutions { $ft{'numeric_literal'} += s/\bcudaErrorAlreadyMapped\b/hipErrorAlreadyMapped/g; $ft{'numeric_literal'} += s/\bcudaErrorArrayIsMapped\b/hipErrorArrayIsMapped/g; $ft{'numeric_literal'} += s/\bcudaErrorAssert\b/hipErrorAssert/g; + $ft{'numeric_literal'} += s/\bcudaErrorCooperativeLaunchTooLarge\b/hipErrorCooperativeLaunchTooLarge/g; $ft{'numeric_literal'} += s/\bcudaErrorCudartUnloading\b/hipErrorDeinitialized/g; $ft{'numeric_literal'} += s/\bcudaErrorDeviceAlreadyInUse\b/hipErrorContextAlreadyInUse/g; $ft{'numeric_literal'} += s/\bcudaErrorDeviceUninitialized\b/hipErrorInvalidContext/g; @@ -1749,6 +1752,8 @@ sub simpleSubstitutions { $ft{'define'} += s/\bCUDA_ARRAY3D_LAYERED\b/hipArrayLayered/g; $ft{'define'} += s/\bCUDA_ARRAY3D_SURFACE_LDST\b/hipArraySurfaceLoadStore/g; $ft{'define'} += s/\bCUDA_ARRAY3D_TEXTURE_GATHER\b/hipArrayTextureGather/g; + $ft{'define'} += s/\bCUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC\b/hipCooperativeLaunchMultiDeviceNoPostSync/g; + $ft{'define'} += s/\bCUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC\b/hipCooperativeLaunchMultiDeviceNoPreSync/g; $ft{'define'} += s/\bCU_LAUNCH_PARAM_BUFFER_POINTER\b/HIP_LAUNCH_PARAM_BUFFER_POINTER/g; $ft{'define'} += s/\bCU_LAUNCH_PARAM_BUFFER_SIZE\b/HIP_LAUNCH_PARAM_BUFFER_SIZE/g; $ft{'define'} += s/\bCU_LAUNCH_PARAM_END\b/HIP_LAUNCH_PARAM_END/g; @@ -1769,6 +1774,8 @@ sub simpleSubstitutions { $ft{'define'} += s/\bcudaArrayLayered\b/hipArrayLayered/g; $ft{'define'} += s/\bcudaArraySurfaceLoadStore\b/hipArraySurfaceLoadStore/g; $ft{'define'} += s/\bcudaArrayTextureGather\b/hipArrayTextureGather/g; + $ft{'define'} += s/\bcudaCooperativeLaunchMultiDeviceNoPostSync\b/hipCooperativeLaunchMultiDeviceNoPostSync/g; + $ft{'define'} += s/\bcudaCooperativeLaunchMultiDeviceNoPreSync\b/hipCooperativeLaunchMultiDeviceNoPreSync/g; $ft{'define'} += s/\bcudaDeviceBlockingSync\b/hipDeviceScheduleBlockingSync/g; $ft{'define'} += s/\bcudaDeviceLmemResizeToMax\b/hipDeviceLmemResizeToMax/g; $ft{'define'} += s/\bcudaDeviceMapHost\b/hipDeviceMapHost/g; @@ -1792,6 +1799,7 @@ sub simpleSubstitutions { $ft{'define'} += s/\bcudaIpcMemLazyEnablePeerAccess\b/hipIpcMemLazyEnablePeerAccess/g; $ft{'define'} += s/\bcudaMemAttachGlobal\b/hipMemAttachGlobal/g; $ft{'define'} += s/\bcudaMemAttachHost\b/hipMemAttachHost/g; + $ft{'define'} += s/\bcudaOccupancyDefault\b/hipOccupancyDefault/g; $ft{'define'} += s/\bcudaStreamDefault\b/hipStreamDefault/g; $ft{'define'} += s/\bcudaStreamNonBlocking\b/hipStreamNonBlocking/g; $ft{'define'} += s/\bcudaTextureType1D\b/hipTextureType1D/g; From 1354a447e1306fce446459d5ad25cab303c865d7 Mon Sep 17 00:00:00 2001 From: Paul Date: Thu, 6 Feb 2020 11:56:17 -0600 Subject: [PATCH 04/50] Use deque instead of vector for code readers so that the iterators and references will be stable --- hipamd/src/program_state.inl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/hipamd/src/program_state.inl b/hipamd/src/program_state.inl index 272addd053..8df0f1cc0f 100644 --- a/hipamd/src/program_state.inl +++ b/hipamd/src/program_state.inl @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -202,7 +203,7 @@ public: std::function>; std::pair< std::mutex, - std::vector>> code_readers; + std::deque>> code_readers; program_state_impl() { // Create placeholder for each agent for the per-agent members. @@ -418,7 +419,7 @@ public: decltype(code_readers.second)::iterator it; { std::lock_guard lck{code_readers.first}; - it = code_readers.second.emplace(code_readers.second.end(), + it = code_readers.second.emplace_back(code_readers.second.end(), move(file), move(tmp)); } From bb26e99c735ca49b88ff23215c80faf89c828650 Mon Sep 17 00:00:00 2001 From: Paul Date: Thu, 6 Feb 2020 12:04:50 -0600 Subject: [PATCH 05/50] Fix compile error --- hipamd/src/program_state.inl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hipamd/src/program_state.inl b/hipamd/src/program_state.inl index 8df0f1cc0f..0442874900 100644 --- a/hipamd/src/program_state.inl +++ b/hipamd/src/program_state.inl @@ -419,8 +419,7 @@ public: decltype(code_readers.second)::iterator it; { std::lock_guard lck{code_readers.first}; - it = code_readers.second.emplace_back(code_readers.second.end(), - move(file), move(tmp)); + it = code_readers.second.emplace_back(move(file), move(tmp)); } auto check_hsa_error = [](hsa_status_t s) { From 84d6eb985d4983db208e2f3137f19fbdecc6b843 Mon Sep 17 00:00:00 2001 From: Paul Date: Thu, 6 Feb 2020 12:15:29 -0600 Subject: [PATCH 06/50] Assign the iterator --- hipamd/src/program_state.inl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hipamd/src/program_state.inl b/hipamd/src/program_state.inl index 0442874900..c8eb2f297c 100644 --- a/hipamd/src/program_state.inl +++ b/hipamd/src/program_state.inl @@ -419,7 +419,8 @@ public: decltype(code_readers.second)::iterator it; { std::lock_guard lck{code_readers.first}; - it = code_readers.second.emplace_back(move(file), move(tmp)); + code_readers.second.emplace_back(move(file), move(tmp)); + it = std::prev(code_readers.second.end()); } auto check_hsa_error = [](hsa_status_t s) { From bf69c7ae32fd86f95b471c38adc91df82bb873e4 Mon Sep 17 00:00:00 2001 From: Paul Date: Thu, 6 Feb 2020 16:21:40 -0600 Subject: [PATCH 07/50] Add multithreaded test --- .../module/hipModuleLoadDataMultThreaded.cpp | 144 ++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp diff --git a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp new file mode 100644 index 0000000000..f989a14a06 --- /dev/null +++ b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp @@ -0,0 +1,144 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 + * TEST: %t + * HIT_END + */ + +#include "hip/hip_runtime.h" +#include "hip/hip_runtime_api.h" +#include +#include +#include +#include +#include + +#include "test_common.h" + +#define LEN 64 +#define SIZE LEN << 2 +#define THREADS 64 + +#define FILENAME "vcpy_kernel.code" +#define kernel_name "hello_world" + +using ModuleFunction = std::pair; + +ModuleFunction load() { + hipModule_t Module; + hipFunction_t Function; + std::ifstream file(FILENAME, std::ios::binary | std::ios::ate); + std::streamsize fsize = file.tellg(); + file.seekg(0, std::ios::beg); + + std::vector buffer(fsize); + if (file.read(buffer.data(), fsize)) { + HIPCHECK(hipModuleLoadData(&Module, &buffer[0])); + HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name)); + } + else { + failed("could not open code object '%s'\n", FILENAME); + } + return {Module, Function}; +} + +void run(ModuleFunction mf) { + hipModule_t Module = mf.first; + hipFunction_t Function = mf.second; + float *A, *B, *Ad, *Bd; + A = new float[LEN]; + B = new float[LEN]; + + for (uint32_t i = 0; i < LEN; i++) { + A[i] = i * 1.0f; + B[i] = 0.0f; + } + + HIPCHECK(hipMalloc((void**)&Ad, SIZE)); + HIPCHECK(hipMalloc((void**)&Bd, SIZE)); + + HIPCHECK(hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice)); + HIPCHECK(hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice)); + + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); + + struct { + void* _Ad; + void* _Bd; + } args; + args._Ad = (void*) Ad; + args._Bd = (void*) Bd; + size_t size = sizeof(args); + + void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, + HIP_LAUNCH_PARAM_END}; + HIPCHECK(hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, stream, NULL, (void**)&config)); + + HIPCHECK(hipStreamDestroy(stream)); + + HIPCHECK(hipModuleUnload(Module)); + + HIPCHECK(hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost)); + + for (uint32_t i = 0; i < LEN; i++) { + assert(A[i] == B[i]); + } +} + +struct joinable_thread : std::thread +{ + template + joinable_thread(Xs&&... xs) : std::thread(std::forward(xs)...) // NOLINT + { + } + + joinable_thread& operator=(joinable_thread&& other) = default; + joinable_thread(joinable_thread&& other) = default; + + ~joinable_thread() + { + if(this->joinable()) + this->join(); + } +}; + +void run_multi_threads(uint32_t n) { + std::vector mf(n); + { + std::vector threads; + for (uint32_t i = 0; i < n; i++) { + threads.emplace_back(std::thread{[=, &mf] { + mf[i] = load(); + }}); + } + } + for(auto&& x:mf) + run(x); +} + +int main() { + + HIPCHECK(hipInit(0)); + run_multi_threads(THREADS); + + passed(); +} From fb8c7cb3eca35514dc45349e1d05cd5b79ad7302 Mon Sep 17 00:00:00 2001 From: Paul Date: Thu, 6 Feb 2020 16:23:29 -0600 Subject: [PATCH 08/50] Make threads a multiple of hardware concurrency --- .../src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp index f989a14a06..31e930086b 100644 --- a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp +++ b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp @@ -35,7 +35,7 @@ THE SOFTWARE. #define LEN 64 #define SIZE LEN << 2 -#define THREADS 64 +#define THREADS 4 #define FILENAME "vcpy_kernel.code" #define kernel_name "hello_world" @@ -138,7 +138,7 @@ void run_multi_threads(uint32_t n) { int main() { HIPCHECK(hipInit(0)); - run_multi_threads(THREADS); + run_multi_threads(THREADS * std::thread::hardware_concurrency()); passed(); } From 2405ab236f03e6d22e9ae59f9cd99aadb9702b74 Mon Sep 17 00:00:00 2001 From: Paul Date: Fri, 7 Feb 2020 10:13:28 -0600 Subject: [PATCH 09/50] Output on failure --- hipamd/Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hipamd/Jenkinsfile b/hipamd/Jenkinsfile index b8bd24cd74..734e875e03 100644 --- a/hipamd/Jenkinsfile +++ b/hipamd/Jenkinsfile @@ -177,7 +177,7 @@ def docker_build_inside_image( def build_image, String inside_args, String platf cd ${build_dir_rel} make install -j\$(nproc) make build_tests -i -j\$(nproc) - ctest -E "(hipMultiThreadDevice-pyramid|hipMemoryAllocateCoherentDriver)" + ctest --output-on-failure -E "(hipMultiThreadDevice-pyramid|hipMemoryAllocateCoherentDriver)" """ // If unit tests output a junit or xunit file in the future, jenkins can parse that file // to display test results on the dashboard From 504412c1e49d82564c2aa63075897990022a151c Mon Sep 17 00:00:00 2001 From: Aryan Salmanpour Date: Mon, 10 Feb 2020 11:44:40 -0500 Subject: [PATCH 10/50] Fix a typo causing a build error --- hipamd/include/hip/hcc_detail/hip_runtime_api.h | 1 + 1 file changed, 1 insertion(+) diff --git a/hipamd/include/hip/hcc_detail/hip_runtime_api.h b/hipamd/include/hip/hcc_detail/hip_runtime_api.h index 1636a782a0..76209ef6a7 100644 --- a/hipamd/include/hip/hcc_detail/hip_runtime_api.h +++ b/hipamd/include/hip/hcc_detail/hip_runtime_api.h @@ -2964,6 +2964,7 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block * @param [in] flags Extra flags for occupancy calculation (currently ignored) */ +hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags __dparm(hipOccupancyDefault)); #if __HIP_VDI__ && !defined(__HCC__) From 9494d0e3c9619a122b6de3bf19e9126a312c1297 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 10 Feb 2020 13:37:45 -0600 Subject: [PATCH 11/50] Add setDevice to try and initialize the context on cuda --- .../src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp index 31e930086b..ff2d5b1ef3 100644 --- a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp +++ b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp @@ -127,6 +127,7 @@ void run_multi_threads(uint32_t n) { std::vector threads; for (uint32_t i = 0; i < n; i++) { threads.emplace_back(std::thread{[=, &mf] { + hipSetDevice(0); mf[i] = load(); }}); } From bb145e77aedf960346ab97d7920a4672f7feb468 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 10 Feb 2020 15:52:34 -0600 Subject: [PATCH 12/50] Create context for cuda --- .../module/hipModuleLoadDataMultThreaded.cpp | 39 +++++++++++++------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp index ff2d5b1ef3..92a1cb77c7 100644 --- a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp +++ b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp @@ -35,28 +35,31 @@ THE SOFTWARE. #define LEN 64 #define SIZE LEN << 2 -#define THREADS 4 +#define THREADS 8 #define FILENAME "vcpy_kernel.code" #define kernel_name "hello_world" using ModuleFunction = std::pair; -ModuleFunction load() { - hipModule_t Module; - hipFunction_t Function; +std::vector load_file() +{ std::ifstream file(FILENAME, std::ios::binary | std::ios::ate); std::streamsize fsize = file.tellg(); file.seekg(0, std::ios::beg); std::vector buffer(fsize); - if (file.read(buffer.data(), fsize)) { - HIPCHECK(hipModuleLoadData(&Module, &buffer[0])); - HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name)); - } - else { + if (!file.read(buffer.data(), fsize)) { failed("could not open code object '%s'\n", FILENAME); } + return buffer; +} + +ModuleFunction load(const std::vector& buffer) { + hipModule_t Module; + hipFunction_t Function; + HIPCHECK(hipModuleLoadData(&Module, &buffer[0])); + HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name)); return {Module, Function}; } @@ -121,25 +124,37 @@ struct joinable_thread : std::thread } }; +hipCtx_t create_context() { + hipDevice_t device; + HIPCHECK(hipDeviceGet(&device, 0)); + + hipCtx_t ctx; + HIPCHECK(hipCtxCreate(&ctx, 0, device)); + return ctx; +} + void run_multi_threads(uint32_t n) { std::vector mf(n); { + auto buffer = load_file(); std::vector threads; for (uint32_t i = 0; i < n; i++) { - threads.emplace_back(std::thread{[=, &mf] { - hipSetDevice(0); - mf[i] = load(); + threads.emplace_back(std::thread{[&, i, buffer] { + mf[i] = load(buffer); }}); } } for(auto&& x:mf) run(x); + } int main() { HIPCHECK(hipInit(0)); + auto ctx = create_context(); run_multi_threads(THREADS * std::thread::hardware_concurrency()); + hipCtxDestroy(ctx); passed(); } From 7a6e88480bc6114df5f3bac5b0da43c21940e404 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 10 Feb 2020 16:01:53 -0600 Subject: [PATCH 13/50] Set context on each thread --- .../src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp index 92a1cb77c7..cdb4c81c94 100644 --- a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp +++ b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp @@ -134,27 +134,27 @@ hipCtx_t create_context() { } void run_multi_threads(uint32_t n) { + auto ctx = create_context(); std::vector mf(n); { auto buffer = load_file(); std::vector threads; for (uint32_t i = 0; i < n; i++) { threads.emplace_back(std::thread{[&, i, buffer] { + HIPCHECK(hipCtxSetCurrent(ctx)); mf[i] = load(buffer); }}); } } for(auto&& x:mf) run(x); - + hipCtxDestroy(ctx); } int main() { HIPCHECK(hipInit(0)); - auto ctx = create_context(); run_multi_threads(THREADS * std::thread::hardware_concurrency()); - hipCtxDestroy(ctx); passed(); } From 5d24a2beef3dd5b4e072206f548fc63becd1b302 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 10 Feb 2020 16:37:34 -0600 Subject: [PATCH 14/50] Reduce threads on cuda --- .../src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp index cdb4c81c94..3b22176927 100644 --- a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp +++ b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp @@ -35,7 +35,11 @@ THE SOFTWARE. #define LEN 64 #define SIZE LEN << 2 +#ifdef __CUDACC__ +#define THREADS 1 +#else #define THREADS 8 +#endif #define FILENAME "vcpy_kernel.code" #define kernel_name "hello_world" From accdd882dc6ea1dc75e50218c9f3bab3165cd6a6 Mon Sep 17 00:00:00 2001 From: Paul Date: Mon, 10 Feb 2020 17:23:58 -0600 Subject: [PATCH 15/50] Skip test on cuda --- .../module/hipModuleLoadDataMultThreaded.cpp | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp index 3b22176927..03a2b82b8b 100644 --- a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp +++ b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 + * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc * TEST: %t * HIT_END */ @@ -35,11 +35,7 @@ THE SOFTWARE. #define LEN 64 #define SIZE LEN << 2 -#ifdef __CUDACC__ -#define THREADS 1 -#else #define THREADS 8 -#endif #define FILENAME "vcpy_kernel.code" #define kernel_name "hello_world" @@ -128,31 +124,19 @@ struct joinable_thread : std::thread } }; -hipCtx_t create_context() { - hipDevice_t device; - HIPCHECK(hipDeviceGet(&device, 0)); - - hipCtx_t ctx; - HIPCHECK(hipCtxCreate(&ctx, 0, device)); - return ctx; -} - void run_multi_threads(uint32_t n) { - auto ctx = create_context(); std::vector mf(n); { auto buffer = load_file(); std::vector threads; for (uint32_t i = 0; i < n; i++) { threads.emplace_back(std::thread{[&, i, buffer] { - HIPCHECK(hipCtxSetCurrent(ctx)); mf[i] = load(buffer); }}); } } for(auto&& x:mf) run(x); - hipCtxDestroy(ctx); } int main() { From 1c013bab3e869e62111709b50842c735b22c39cf Mon Sep 17 00:00:00 2001 From: Aryan Salmanpour Date: Tue, 11 Feb 2020 12:16:51 -0500 Subject: [PATCH 16/50] fix build error in nvcc path --- hipamd/include/hip/nvcc_detail/hip_runtime_api.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hipamd/include/hip/nvcc_detail/hip_runtime_api.h b/hipamd/include/hip/nvcc_detail/hip_runtime_api.h index bcddce58b2..6e0d02d0c0 100644 --- a/hipamd/include/hip/nvcc_detail/hip_runtime_api.h +++ b/hipamd/include/hip/nvcc_detail/hip_runtime_api.h @@ -142,9 +142,9 @@ typedef enum cudaChannelFormatKind hipChannelFormatKind; #define hipOccupancyDefault cudaOccupancyDefault -#define hipCooperativeLaunchMultiDeviceNoPreSync +#define hipCooperativeLaunchMultiDeviceNoPreSync \ cudaCooperativeLaunchMultiDeviceNoPreSync -#define hipCooperativeLaunchMultiDeviceNoPostSync +#define hipCooperativeLaunchMultiDeviceNoPostSync \ cudaCooperativeLaunchMultiDeviceNoPostSync From 86cdb30195ed26f4f0c514d15f947dd0397bc366 Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 11 Feb 2020 11:26:24 -0600 Subject: [PATCH 17/50] Try to initialize the primary context on cuda --- .../runtimeApi/module/hipModuleLoadDataMultThreaded.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp index 03a2b82b8b..6612392064 100644 --- a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp +++ b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 * TEST: %t * HIT_END */ @@ -127,10 +127,16 @@ struct joinable_thread : std::thread void run_multi_threads(uint32_t n) { std::vector mf(n); { + hipDevice_t device; + HIPCHECK(hipDeviceGet(&device, 0)); + auto buffer = load_file(); std::vector threads; for (uint32_t i = 0; i < n; i++) { threads.emplace_back(std::thread{[&, i, buffer] { + hipCtx_t ctx; + HIPCHECK(hipDevicePrimaryCtxRetain(&ctx, device)); + mf[i] = load(buffer); }}); } From e8a7cc69579405b129b5fe2a70db83d7c12a0d29 Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 11 Feb 2020 11:46:29 -0600 Subject: [PATCH 18/50] Push ctx to the stack as current --- .../src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp index 6612392064..09fedc3660 100644 --- a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp +++ b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp @@ -136,7 +136,7 @@ void run_multi_threads(uint32_t n) { threads.emplace_back(std::thread{[&, i, buffer] { hipCtx_t ctx; HIPCHECK(hipDevicePrimaryCtxRetain(&ctx, device)); - + HIPCHECK(hipCtxPushCurrent(ctx)); mf[i] = load(buffer); }}); } From cd279cd5748c2667466c147c172ca744fc2efa92 Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 11 Feb 2020 12:34:10 -0600 Subject: [PATCH 19/50] Revert "Push ctx to the stack as current" This reverts commit e8a7cc69579405b129b5fe2a70db83d7c12a0d29. --- .../src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp index 09fedc3660..6612392064 100644 --- a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp +++ b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp @@ -136,7 +136,7 @@ void run_multi_threads(uint32_t n) { threads.emplace_back(std::thread{[&, i, buffer] { hipCtx_t ctx; HIPCHECK(hipDevicePrimaryCtxRetain(&ctx, device)); - HIPCHECK(hipCtxPushCurrent(ctx)); + mf[i] = load(buffer); }}); } From dd6676dadf886ce6a7a5d87d647dc0403706b20e Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 11 Feb 2020 12:34:11 -0600 Subject: [PATCH 20/50] Revert "Try to initialize the primary context on cuda" This reverts commit 86cdb30195ed26f4f0c514d15f947dd0397bc366. --- .../runtimeApi/module/hipModuleLoadDataMultThreaded.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp index 6612392064..03a2b82b8b 100644 --- a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp +++ b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 + * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc * TEST: %t * HIT_END */ @@ -127,16 +127,10 @@ struct joinable_thread : std::thread void run_multi_threads(uint32_t n) { std::vector mf(n); { - hipDevice_t device; - HIPCHECK(hipDeviceGet(&device, 0)); - auto buffer = load_file(); std::vector threads; for (uint32_t i = 0; i < n; i++) { threads.emplace_back(std::thread{[&, i, buffer] { - hipCtx_t ctx; - HIPCHECK(hipDevicePrimaryCtxRetain(&ctx, device)); - mf[i] = load(buffer); }}); } From 888a7f2a90379a9e73b9a1267bbd76be4e48ef12 Mon Sep 17 00:00:00 2001 From: ansurya <50609411+ansurya@users.noreply.github.com> Date: Thu, 13 Feb 2020 14:21:51 +0530 Subject: [PATCH 21/50] Reduce GPU copying based on arch it runs on (#1751) Implements SWDEV-213230. --- hipamd/CMakeLists.txt | 2 +- .../hip/hcc_detail/code_object_bundle.hpp | 20 +++++++++++-------- hipamd/lpl_ca/CMakeLists.txt | 4 ++-- hipamd/src/hip_hcc.cpp | 11 ++++++++++ hipamd/src/program_state.inl | 3 ++- 5 files changed, 28 insertions(+), 12 deletions(-) diff --git a/hipamd/CMakeLists.txt b/hipamd/CMakeLists.txt index 3eedf35b09..3a77735865 100644 --- a/hipamd/CMakeLists.txt +++ b/hipamd/CMakeLists.txt @@ -336,9 +336,9 @@ if(HIP_PLATFORM STREQUAL "hcc") target_link_libraries(hip_hcc_static PRIVATE hc_am) add_library(hiprtc SHARED src/hiprtc.cpp src/code_object_bundle.cpp) + target_compile_options(hiprtc PRIVATE -DDISABLE_REDUCED_GPU_BLOB_COPY) set_property ( TARGET hiprtc PROPERTY VERSION "${HIP_LIB_VERSION_STRING}" ) set_property ( TARGET hiprtc PROPERTY SOVERSION "${HIP_LIB_VERSION_MAJOR}" ) - target_include_directories( hiprtc SYSTEM PRIVATE ${PROJECT_SOURCE_DIR}/include ${HSA_PATH}/include) diff --git a/hipamd/include/hip/hcc_detail/code_object_bundle.hpp b/hipamd/include/hip/hcc_detail/code_object_bundle.hpp index f312d2e79b..77e0d706d6 100644 --- a/hipamd/include/hip/hcc_detail/code_object_bundle.hpp +++ b/hipamd/include/hip/hcc_detail/code_object_bundle.hpp @@ -31,9 +31,11 @@ THE SOFTWARE. #include #include #include - +#include namespace hip_impl { - +#if !defined(DISABLE_REDUCED_GPU_BLOB_COPY) +std::unordered_set& get_all_gpuarch(); +#endif inline std::string transmogrify_triple(const std::string& triple) { @@ -43,7 +45,6 @@ std::string transmogrify_triple(const std::string& triple) if (triple.find(old_prefix) == 0) { return new_prefix + triple.substr(sizeof(old_prefix) - 1); } - return (triple.find(new_prefix) == 0) ? triple : ""; } @@ -114,9 +115,7 @@ class Bundled_code_header { friend inline bool read(RandomAccessIterator f, RandomAccessIterator l, Bundled_code_header& x) { if (f == l) return false; - std::copy_n(f, sizeof(x.header_.cbuf_), x.header_.cbuf_); - if (valid(x)) { x.bundles_.resize(x.header_.bundle_cnt_); @@ -126,11 +125,16 @@ class Bundled_code_header { it += sizeof(y.header.cbuf); y.triple.assign(it, it + y.header.triple_sz); - + #ifdef DISABLE_REDUCED_GPU_BLOB_COPY std::copy_n(f + y.header.offset, y.header.bundle_sz, std::back_inserter(y.blob)); - + #else + auto& gpuArch = get_all_gpuarch(); + auto itgpuArch = std::find(gpuArch.begin(),gpuArch.end(),y.triple); + if (itgpuArch != gpuArch.end()){ + std::copy_n(f + y.header.offset, y.header.bundle_sz, std::back_inserter(y.blob)); + } + #endif it += y.header.triple_sz; - x.bundled_code_size = std::max(x.bundled_code_size, y.header.offset + y.header.bundle_sz); } diff --git a/hipamd/lpl_ca/CMakeLists.txt b/hipamd/lpl_ca/CMakeLists.txt index b36d73bbcb..ac01a6a0ab 100644 --- a/hipamd/lpl_ca/CMakeLists.txt +++ b/hipamd/lpl_ca/CMakeLists.txt @@ -26,7 +26,7 @@ target_include_directories(ca PUBLIC ${PROJECT_SOURCE_DIR}/src) find_library( hsart NAMES libhsa-runtime64.so libhsa-runtime64.so.1 HINTS ${HSA_PATH}/lib) target_link_libraries(ca PUBLIC ${hsart}) -target_compile_options(ca PUBLIC -Wall) +target_compile_options(ca PUBLIC -DDISABLE_REDUCED_GPU_BLOB_COPY -Wall) install(TARGETS ca RUNTIME DESTINATION bin) -#-------------------------------------CA---------------------------------------# \ No newline at end of file +#-------------------------------------CA---------------------------------------# diff --git a/hipamd/src/hip_hcc.cpp b/hipamd/src/hip_hcc.cpp index 85c08943bf..af40b29ea9 100644 --- a/hipamd/src/hip_hcc.cpp +++ b/hipamd/src/hip_hcc.cpp @@ -39,6 +39,7 @@ THE SOFTWARE. #include #include #include +#include #include #include @@ -2542,6 +2543,16 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view** a // TODO - add a contect sequence number for debug. Print operator<< ctx:0.1 (device.ctx) namespace hip_impl { + std::unordered_set& get_all_gpuarch() { + static std::unordered_set r{}; + static std::once_flag init; + std::call_once(init, []() { + for (int i=0; i < g_deviceCnt; i++){ + r.insert("hcc-amdgcn-amd-amdhsa--gfx"+std::to_string(g_deviceArray[i]->_props.gcnArch)); + }}); + return r; + } + std::vector all_hsa_agents() { std::vector r{}; std::vector visible_accelerators; diff --git a/hipamd/src/program_state.inl b/hipamd/src/program_state.inl index 272addd053..3581dea229 100644 --- a/hipamd/src/program_state.inl +++ b/hipamd/src/program_state.inl @@ -244,7 +244,8 @@ public: if (!valid(tmp)) break; for (auto&& bundle : bundles(tmp)) { - impl.code_object_blobs.second[elf][triple_to_hsa_isa(bundle.triple)].push_back(bundle.blob); + if(bundle.blob.size()) + impl.code_object_blobs.second[elf][triple_to_hsa_isa(bundle.triple)].push_back(bundle.blob); } blob_it += tmp.bundled_code_size; From e334d3d6ec3d5a100e44075aa131a6e0f6f8f000 Mon Sep 17 00:00:00 2001 From: Sarbojit2019 <52527887+SarbojitAMD@users.noreply.github.com> Date: Thu, 13 Feb 2020 14:22:11 +0530 Subject: [PATCH 22/50] ihipEnablePeerAccess return error if peer is not accessible (#1858) hipDeviceEnablePeerAccess returns success and adds peer into the list even if it is not accessible which creates problem in hipMalloc when it tries to share the ptr to peer device. Proposed change is to check the access status before updating the peer list and update only when it can access the peer. --- hipamd/src/hip_peer.cpp | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/hipamd/src/hip_peer.cpp b/hipamd/src/hip_peer.cpp index 7781af1dbe..8fd66a52bb 100644 --- a/hipamd/src/hip_peer.cpp +++ b/hipamd/src/hip_peer.cpp @@ -128,17 +128,24 @@ hipError_t ihipEnablePeerAccess(TlsData* tls, hipCtx_t peerCtx, unsigned int fla if (thisCtx == peerCtx) { err = hipErrorInvalidDevice; // Can't enable peer access to self. } else if ((thisCtx != NULL) && (peerCtx != NULL)) { - LockedAccessor_CtxCrit_t peerCrit(peerCtx->criticalData()); - // Add thisCtx to peerCtx's access list so that new allocations on peer will be made - // visible to this device: - bool isNewPeer = peerCrit->addPeerWatcher(peerCtx, thisCtx); - if (isNewPeer) { - tprintf(DB_MEM, "device=%s can now see all memory allocated on peer=%s\n", - thisCtx->toString().c_str(), peerCtx->toString().c_str()); - am_memtracker_update_peers(peerCtx->getDevice()->_acc, peerCrit->peerCnt(), - peerCrit->peerAgents()); + + int canAccess = 0; + if ((hipSuccess != ihipDeviceCanAccessPeer(&canAccess,thisCtx,peerCtx)) || (canAccess == 0)){ + tprintf(DB_MEM, "device=%s can't access peer=%s\n",thisCtx->toString().c_str(), peerCtx->toString().c_str()); + err = hipErrorInvalidDevice; } else { - err = hipErrorPeerAccessAlreadyEnabled; + LockedAccessor_CtxCrit_t peerCrit(peerCtx->criticalData()); + // Add thisCtx to peerCtx's access list so that new allocations on peer will be made + // visible to this device: + bool isNewPeer = peerCrit->addPeerWatcher(peerCtx, thisCtx); + if (isNewPeer) { + tprintf(DB_MEM, "device=%s can now see all memory allocated on peer=%s\n", + thisCtx->toString().c_str(), peerCtx->toString().c_str()); + am_memtracker_update_peers(peerCtx->getDevice()->_acc, peerCrit->peerCnt(), + peerCrit->peerAgents()); + } else { + err = hipErrorPeerAccessAlreadyEnabled; + } } } else { err = hipErrorInvalidDevice; From a03628335c34b468b8955180df3b37c85aad253b Mon Sep 17 00:00:00 2001 From: Sarbojit2019 <52527887+SarbojitAMD@users.noreply.github.com> Date: Thu, 13 Feb 2020 14:22:46 +0530 Subject: [PATCH 23/50] [hip] Fix for bug introduced in #1770 when blockSize is non-power of 2 (#1864) Fixes SWDEV-222161 --- hipamd/src/hip_module.cpp | 95 ++++++++++++++++++++++++--------------- 1 file changed, 60 insertions(+), 35 deletions(-) diff --git a/hipamd/src/hip_module.cpp b/hipamd/src/hip_module.cpp index 16606e8016..44f0f108a6 100644 --- a/hipamd/src/hip_module.cpp +++ b/hipamd/src/hip_module.cpp @@ -134,10 +134,10 @@ extern hipError_t ihipGetDeviceProperties(hipDeviceProp_t* props, int device); return ihipLogStatus(hipStatus); \ } -hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t gridSizeX, - uint32_t gridSizeY, uint32_t gridSizeZ, - uint32_t blockSizeX, uint32_t blockSizeY, - uint32_t blockSizeZ, size_t sharedMemBytes, +hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t globalWorkSizeX, + uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, + uint32_t localWorkSizeX, uint32_t localWorkSizeY, + uint32_t localWorkSizeZ, size_t sharedMemBytes, hipStream_t hStream, void** kernelParams, void** extra, hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0, void** impCoopParams = 0) { @@ -146,14 +146,6 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t gridSi auto ctx = ihipGetTlsDefaultCtx(); hipError_t ret = hipSuccess; - size_t globalWorkSizeX = (size_t)gridSizeX * (size_t)blockSizeX; - size_t globalWorkSizeY = (size_t)gridSizeY * (size_t)blockSizeY; - size_t globalWorkSizeZ = (size_t)gridSizeZ * (size_t)blockSizeZ; - if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX) - { - return hipErrorInvalidConfiguration; - } - if (ctx == nullptr) { ret = hipErrorInvalidDevice; @@ -211,8 +203,8 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t gridSi lp.dynamic_group_mem_bytes = sharedMemBytes; // TODO - this should be part of preLaunchKernel. hStream = ihipPreLaunchKernel( - hStream, dim3(globalWorkSizeX/blockSizeX, globalWorkSizeY/blockSizeY, globalWorkSizeZ/blockSizeZ), - dim3(blockSizeX, blockSizeY, blockSizeZ), &lp, f->_name.c_str(), isStreamLocked); + hStream, dim3(globalWorkSizeX/localWorkSizeX, globalWorkSizeY/localWorkSizeY, globalWorkSizeZ/localWorkSizeZ), + dim3(localWorkSizeX, localWorkSizeY, localWorkSizeZ), &lp, f->_name.c_str(), isStreamLocked); hsa_kernel_dispatch_packet_t aql; @@ -221,9 +213,9 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t gridSi // aql.completion_signal._handle = 0; // aql.kernarg_address = 0; - aql.workgroup_size_x = blockSizeX; - aql.workgroup_size_y = blockSizeY; - aql.workgroup_size_z = blockSizeZ; + aql.workgroup_size_x = localWorkSizeX; + aql.workgroup_size_y = localWorkSizeY; + aql.workgroup_size_z = localWorkSizeZ; aql.grid_size_x = globalWorkSizeX; aql.grid_size_y = globalWorkSizeY; aql.grid_size_z = globalWorkSizeZ; @@ -283,8 +275,17 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, uint32_t gridDimX, uint32_t gr void** kernelParams, void** extra) { HIP_INIT_API(hipModuleLaunchKernel, f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra); + + size_t globalWorkSizeX = (size_t)gridDimX * (size_t)blockDimX; + size_t globalWorkSizeY = (size_t)gridDimY * (size_t)blockDimY; + size_t globalWorkSizeZ = (size_t)gridDimZ * (size_t)blockDimZ; + if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX) + { + return hipErrorInvalidConfiguration; + } + return ihipLogStatus(ihipModuleLaunchKernel(tls, - f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, + f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra, nullptr, nullptr, 0)); } @@ -297,11 +298,8 @@ hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, HIP_INIT_API(hipExtModuleLaunchKernel, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY, localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra); - if(localWorkSizeX == 0 || localWorkSizeY == 0 || localWorkSizeZ == 0) - return hipErrorInvalidValue; - return ihipLogStatus(ihipModuleLaunchKernel(tls, - f, globalWorkSizeX/localWorkSizeX, globalWorkSizeY/localWorkSizeY, globalWorkSizeZ/localWorkSizeZ, localWorkSizeX, localWorkSizeY, + f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY, localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, flags)); } @@ -314,11 +312,8 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, HIP_INIT_API(hipHccModuleLaunchKernel, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY, localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra); - if(localWorkSizeX == 0 || localWorkSizeY == 0 || localWorkSizeZ == 0) - return hipErrorInvalidValue; - return ihipLogStatus(ihipModuleLaunchKernel(tls, - f, globalWorkSizeX/localWorkSizeX, globalWorkSizeY/localWorkSizeY, globalWorkSizeZ/localWorkSizeZ, localWorkSizeX, localWorkSizeY, + f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY, localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, 0)); } @@ -364,14 +359,26 @@ hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList } GET_TLS(); + + size_t globalWorkSizeX = 0, globalWorkSizeY = 0, globalWorkSizeZ = 0; + // launch kernels for each device for (int i = 0; i < numDevices; ++i) { const hipLaunchParams& lp = launchParamsList[i]; + globalWorkSizeX = (size_t)lp.gridDim.x * (size_t)lp.blockDim.x; + globalWorkSizeY = (size_t)lp.gridDim.y * (size_t)lp.blockDim.y; + globalWorkSizeZ = (size_t)lp.gridDim.z * (size_t)lp.blockDim.z; + + if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX) + { + return hipErrorInvalidConfiguration; + } + result = ihipModuleLaunchKernel(tls, kds[i], - lp.gridDim.x, - lp.gridDim.y, - lp.gridDim.z, + lp.gridDim.x * lp.blockDim.x, + lp.gridDim.y * lp.blockDim.y, + lp.gridDim.z * lp.blockDim.z, lp.blockDim.x, lp.blockDim.y, lp.blockDim.z, lp.sharedMem, lp.stream, lp.args, nullptr, nullptr, nullptr, 0, @@ -416,6 +423,14 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim, return hipErrorInvalidConfiguration; } + size_t globalWorkSizeX = (size_t)gridDim.x * (size_t)blockDimX.x; + size_t globalWorkSizeY = (size_t)gridDim.y * (size_t)blockDimX.y; + size_t globalWorkSizeZ = (size_t)gridDim.z * (size_t)blockDimX.z; + if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX) + { + return hipErrorInvalidConfiguration; + } + // Prepare the kernel descriptor for initializing the GWS hipFunction_t gwsKD = ps.kernel_descriptor( reinterpret_cast(&init_gws), @@ -475,9 +490,9 @@ hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim, // launch the main kernel result = ihipModuleLaunchKernel(tls, kd, - gridDim.x, - gridDim.y, - gridDim.z, + gridDim.x * blockDimX.x, + gridDim.y * blockDimX.y, + gridDim.z * blockDimX.z, blockDimX.x, blockDimX.y, blockDimX.z, sharedMemBytes, stream, kernelParams, nullptr, nullptr, nullptr, 0, true, impCoopParams); @@ -612,6 +627,8 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL void* impCoopParams[1]; ulong prev_sum = 0; + + size_t globalWorkSizeX = 0, globalWorkSizeY = 0, globalWorkSizeZ = 0; // launch the main kernels for each device for (int i = 0; i < numDevices; ++i) { const hipLaunchParams& lp = launchParamsList[i]; @@ -628,10 +645,18 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL impCoopParams[0] = &mg_info_ptr[i]; + globalWorkSizeX = (size_t)lp.gridDim.x * (size_t)lp.blockDim.x; + globalWorkSizeY = (size_t)lp.gridDim.y * (size_t)lp.blockDim.y; + globalWorkSizeZ = (size_t)lp.gridDim.z * (size_t)lp.blockDim.z; + if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX) + { + return hipErrorInvalidConfiguration; + } + result = ihipModuleLaunchKernel(tls, kds[i], - lp.gridDim.x, - lp.gridDim.y, - lp.gridDim.z, + lp.gridDim.x * lp.blockDim.x, + lp.gridDim.y * lp.blockDim.y, + lp.gridDim.z * lp.blockDim.z, lp.blockDim.x, lp.blockDim.y, lp.blockDim.z, lp.sharedMem, lp.stream, lp.args, nullptr, nullptr, nullptr, 0, From fe47fce496ca8aea02a7e50a5c6f2c40dae92258 Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Thu, 13 Feb 2020 00:52:56 -0800 Subject: [PATCH 24/50] missing break statement in hipDeviceGetAttribute (#1865) The break is missing for hipDeviceAttributeMaxTexture3DDepth. --- hipamd/src/hip_device.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/hipamd/src/hip_device.cpp b/hipamd/src/hip_device.cpp index aa89e62271..1bbdb10bbc 100644 --- a/hipamd/src/hip_device.cpp +++ b/hipamd/src/hip_device.cpp @@ -291,6 +291,7 @@ hipError_t ihipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device break; case hipDeviceAttributeMaxTexture3DDepth: *pi = prop->maxTexture3D[2]; + break; case hipDeviceAttributeHdpMemFlushCntl: { uint32_t** hdp = reinterpret_cast(pi); From 627d9a1f46686d0745e96a8bdb399d14401c5c1b Mon Sep 17 00:00:00 2001 From: Satyanvesh Dittakavi Date: Thu, 13 Feb 2020 16:34:05 +0530 Subject: [PATCH 25/50] updated test for nvidia path --- .../module/hipModuleLoadDataMultThreaded.cpp | 55 ++++++++++--------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp index 03a2b82b8b..6115125399 100644 --- a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp +++ b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp @@ -1,5 +1,5 @@ /* -Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc + * BUILD: %t %s ../../test_common.cpp * TEST: %t * HIT_END */ @@ -35,13 +35,11 @@ THE SOFTWARE. #define LEN 64 #define SIZE LEN << 2 -#define THREADS 8 +#define THREADS 2 #define FILENAME "vcpy_kernel.code" #define kernel_name "hello_world" -using ModuleFunction = std::pair; - std::vector load_file() { std::ifstream file(FILENAME, std::ios::binary | std::ios::ate); @@ -55,18 +53,18 @@ std::vector load_file() return buffer; } -ModuleFunction load(const std::vector& buffer) { +void run(const std::vector& buffer) { + hipDevice_t device; + HIPCHECK(hipDeviceGet(&device, 0)); + hipCtx_t context; + HIPCHECK(hipCtxCreate(&context, 0, device)); + hipModule_t Module; hipFunction_t Function; HIPCHECK(hipModuleLoadData(&Module, &buffer[0])); HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name)); - return {Module, Function}; -} - -void run(ModuleFunction mf) { - hipModule_t Module = mf.first; - hipFunction_t Function = mf.second; - float *A, *B, *Ad, *Bd; + + float *A, *B, *Ad, *Bd; A = new float[LEN]; B = new float[LEN]; @@ -105,6 +103,13 @@ void run(ModuleFunction mf) { for (uint32_t i = 0; i < LEN; i++) { assert(A[i] == B[i]); } + + hipFree(Ad); + hipFree(Bd); + delete A; + delete B; + hipCtxDestroy(context); + } struct joinable_thread : std::thread @@ -124,25 +129,23 @@ struct joinable_thread : std::thread } }; -void run_multi_threads(uint32_t n) { - std::vector mf(n); - { - auto buffer = load_file(); - std::vector threads; - for (uint32_t i = 0; i < n; i++) { - threads.emplace_back(std::thread{[&, i, buffer] { - mf[i] = load(buffer); - }}); - } +void run_multi_threads(uint32_t n, const std::vector& buffer) { + + std::vector threads; + + for (uint32_t i = 0; i < n; i++) { + threads.emplace_back(std::thread{[&, buffer] { + run(buffer); + }}); } - for(auto&& x:mf) - run(x); + } int main() { HIPCHECK(hipInit(0)); - run_multi_threads(THREADS * std::thread::hardware_concurrency()); + auto buffer = load_file(); + run_multi_threads(THREADS * std::thread::hardware_concurrency(), buffer); passed(); } From 2034ed35614bb4dc05179051dc43f59d1a8a37fc Mon Sep 17 00:00:00 2001 From: Satyanvesh Dittakavi Date: Thu, 13 Feb 2020 19:48:26 +0530 Subject: [PATCH 26/50] Add c++11 option for nvcc --- .../src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp index 6115125399..8591a748df 100644 --- a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp +++ b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 * TEST: %t * HIT_END */ From fb52a3973b14df58779aef4bb276447ad5f112a8 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Thu, 13 Feb 2020 18:34:10 +0300 Subject: [PATCH 27/50] [HIPIFY][doc] Update README.md: Windows tested configurations --- hipamd/hipify-clang/README.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/hipamd/hipify-clang/README.md b/hipamd/hipify-clang/README.md index 86a97dbe29..950482b7ed 100644 --- a/hipamd/hipify-clang/README.md +++ b/hipamd/hipify-clang/README.md @@ -393,19 +393,19 @@ Testing Time: 3.07s ``` ### hipify-clang: Windows -On Windows 10 the following configurations are tested: +*Tested configurations:* -LLVM 5.0.0 - 5.0.2, CUDA 8.0, cudnn 5.1.10 - 7.1.4.18 +| **LLVM** | **CUDA** | **cuDNN** | **Visual Studio** | **cmake** | **Python** | +|:--------------:|---------:|--------------------:|--------------------------:|----------:|-----------:| +| 5.0.0 - 5.0.2 | 8.0 | 5.1.10 - 7.1.4.18 | 2017.15.5.2 | 3.5.1 | 3.6.4 | +| 6.0.0 - 6.0.1 | 9.0 | 7.0.5.15 - 7.6.5.32 | 2017.15.5.5 | 3.6.0 | 3.7.2 | +| 7.0.0 - 7.1.0 | 9.2 | 7.6.5.32 | 2017.15.9.11 | 3.13.3 | 3.7.3 | +| 8.0.0 - 8.0.1 | 10.0 | 7.6.5.32 | 2017.15.9.15 | 3.14.2 | 3.7.4 | +| 9.0.0 - 9.0.1 | 10.1 | 7.6.5.32 | 2017.15.9.20, 2019.16.4.5 | 3.16.4 | 3.8.0 | +| 10.0.0-rc1 | 10.2 | 7.6.5.32 | 2017.15.9.20, 2019.16.4.5 | 3.16.4 | 3.8.1 | +| 11.0.0git | 10.2 | 7.6.5.32 | 2017.15.9.20, 2019.16.4.5 | 3.16.4 | 3.8.1 | -LLVM 6.0.0 - 6.0.1, CUDA 9.0, cudnn 7.0.5.15 - 7.6.5.32 - -LLVM 7.0.0 - 10.0.0-rc1, CUDA 7.5 - 10.2, cudnn 7.0.5.15 - 7.6.5.32 - -Build system requirements for the latest stable configuration LLVM 9.0.1/CUDA 10.1 Update 2: - -Python 3.6.0 - 3.8.1, cmake 3.5.1 - 3.16.3, Visual Studio 2017 (15.5.2) - 2019 (16.4.4). - -Here is an example of building `hipify-clang` with testing support on `Windows 10` by `Visual Studio 16 2019`: +*Building with testing support on `Windows 10` by `Visual Studio 16 2019`:* ```shell cmake From 893ccc4ddffc50ea736fd3873240de4fc6fb4c1c Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Thu, 13 Feb 2020 23:21:40 -0800 Subject: [PATCH 28/50] [sample] Add hipDispatchEnqueueRateMT (#1869) * [sample] Add hipDispatchEnqueueRateMT --- .../1_Utils/hipDispatchLatency/Makefile | 11 +- .../hipDispatchEnqueueRateMT.cpp | 167 ++++++++++++++++++ 2 files changed, 173 insertions(+), 5 deletions(-) create mode 100644 hipamd/samples/1_Utils/hipDispatchLatency/hipDispatchEnqueueRateMT.cpp diff --git a/hipamd/samples/1_Utils/hipDispatchLatency/Makefile b/hipamd/samples/1_Utils/hipDispatchLatency/Makefile index 0616f01f0d..74945dc515 100644 --- a/hipamd/samples/1_Utils/hipDispatchLatency/Makefile +++ b/hipamd/samples/1_Utils/hipDispatchLatency/Makefile @@ -4,16 +4,17 @@ ifeq (,$(HIP_PATH)) endif HIPCC=$(HIP_PATH)/bin/hipcc -std=c++11 -EXE=hipDispatchLatency - CXXFLAGS = -O3 -all: test_kernel.code ${EXE} +all: test_kernel.code hipDispatchLatency.out hipDispatchEnqueueRateMT.out -$(EXE): hipDispatchLatency.cpp +hipDispatchLatency.out: hipDispatchLatency.cpp $(HIPCC) $(CXXFLAGS) hipDispatchLatency.cpp -o $@ +hipDispatchEnqueueRateMT.out: hipDispatchEnqueueRateMT.cpp + $(HIPCC) $(CXXFLAGS) hipDispatchEnqueueRateMT.cpp -o $@ + test_kernel.code: test_kernel.cpp $(HIP_PATH)/bin/hipcc --genco $(GENCO_FLAGS) $^ -o $@ clean: - rm -f *.o $(EXE) + rm -f *.o *.out diff --git a/hipamd/samples/1_Utils/hipDispatchLatency/hipDispatchEnqueueRateMT.cpp b/hipamd/samples/1_Utils/hipDispatchLatency/hipDispatchEnqueueRateMT.cpp new file mode 100644 index 0000000000..d1b5c2f3b5 --- /dev/null +++ b/hipamd/samples/1_Utils/hipDispatchLatency/hipDispatchEnqueueRateMT.cpp @@ -0,0 +1,167 @@ +/* +Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include "hip/hip_runtime.h" +#ifdef __HIP_PLATFORM_HCC__ +#include "hip/hip_ext.h" +#endif +#include +#include +#include +#include +#include +#include +#include + +#define NUM_GROUPS 1 +#define GROUP_SIZE 1 +#define WARMUP_RUN_COUNT 10 +#define TIMING_RUN_COUNT 100 +#define TOTAL_RUN_COUNT WARMUP_RUN_COUNT + TIMING_RUN_COUNT + +__global__ void EmptyKernel() {} + +// Helper to print various timing metrics +void print_timing(std::string test, std::array &results, int batch = 1) +{ + + float total_us = 0.0f, mean_us = 0.0f, stddev_us = 0.0f; + + // remove top outliers due to nature of variability across large number of multi-threaded runs + std::sort(results.begin(), results.end(), std::greater()); + auto start_iter = std::next(results.begin(), WARMUP_RUN_COUNT); + auto end_iter = results.end(); + + // mean + std::for_each(start_iter, end_iter, [&](const float &run_ms) { + total_us += (run_ms * 1000) / batch; + }); + mean_us = total_us / TIMING_RUN_COUNT; + + // stddev + total_us = 0; + std::for_each(start_iter, end_iter, [&](const float &run_ms) { + float dev_us = ((run_ms * 1000) / batch) - mean_us; + total_us += dev_us * dev_us; + }); + stddev_us = sqrt(total_us / TIMING_RUN_COUNT); + + printf("\n %s: %.1f us, std: %.1f us\n", test.c_str(), mean_us, stddev_us); +} + +// Measure time taken to enqueue a kernel on the GPU using hipModuleLaunchKernel +void hipModuleLaunchKernel_enqueue_rate(std::atomic_int* shared, int max_threads) +{ + //resources necessary for this thread + hipStream_t stream; + hipStreamCreate(&stream); + hipModule_t module; + hipFunction_t function; + hipModuleLoad(&module, "test_kernel.code"); + hipModuleGetFunction(&function, module, "test"); + void* kernel_params = nullptr; + std::array results; + + //synchronize all threads, before running + int tid = shared->fetch_add(1, std::memory_order_release); + while (max_threads != shared->load(std::memory_order_acquire)) {} + + for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) { + auto start = std::chrono::high_resolution_clock::now(); + hipModuleLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, stream, &kernel_params, nullptr); + auto stop = std::chrono::high_resolution_clock::now(); + results[i] = std::chrono::duration(stop - start).count(); + } + print_timing("Thread ID : " + std::to_string(tid) + " , " + "hipModuleLaunchKernel enqueue rate", results); +} + +// Measure time taken to enqueue a kernel on the GPU using hipLaunchKernelGGL +void hipLaunchKernelGGL_enqueue_rate(std::atomic_int* shared, int max_threads) +{ + //resources necessary for this thread + hipStream_t stream; + hipStreamCreate(&stream); + std::array results; + + //synchronize all threads, before running + int tid = shared->fetch_add(1, std::memory_order_release); + while (max_threads != shared->load(std::memory_order_acquire)) {} + + for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) { + auto start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream); + auto stop = std::chrono::high_resolution_clock::now(); + results[i] = std::chrono::duration(stop - start).count(); + } + print_timing("Thread ID : " + std::to_string(tid) + " , " + "hipLaunchKernelGGL enqueue rate", results); +} + +// Simple thread pool +struct thread_pool { + thread_pool(int total_threads) : max_threads(total_threads) {} + void start(std::function f) { + for (int i = 0; i < max_threads; ++i) { + threads.push_back(std::async(std::launch::async, f, &shared, max_threads)); + } + } + void finish() { + for (auto&&thread : threads) { + thread.get(); + } + threads.clear(); + shared = {0}; + } + ~thread_pool() { + finish(); + } +private: + std::atomic_int shared {0}; + std::vector> threads; + int max_threads = 1; +}; + + +int main(int argc, char* argv[]) +{ + if (argc != 3) { + std::cerr << "Run test as 'hipDispatchEnqueueRateMT <0-hipModuleLaunchKernel /1-hipLaunchKernelGGL>'\n"; + return -1; + } + + int max_threads = atoi(argv[1]); + int run_module_test = atoi(argv[2]); + if(max_threads < 1 || run_module_test < 0 || run_module_test > 1) { + std::cerr << "Invalid Input.\n"; + std::cerr << "Run test as 'hipDispatchEnqueueRateMT <0-hipModuleLaunchKernel /1-hipLaunchKernelGGL>'\n"; + return -1; + } + thread_pool task(max_threads); + + if(run_module_test == 0) { + task.start(hipModuleLaunchKernel_enqueue_rate); + task.finish(); + } else { + task.start(hipLaunchKernelGGL_enqueue_rate); + task.finish(); + } + + return 0; +} + From 0315aefe78119a5a3c79213171c36b4f9a4cffe6 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 14 Feb 2020 13:09:31 +0300 Subject: [PATCH 29/50] [HIPIFY][doc] Update README.md: LLVM 10.0.0-rc2 - the latest supported LLVM Release --- hipamd/hipify-clang/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/hipamd/hipify-clang/README.md b/hipamd/hipify-clang/README.md index 950482b7ed..a375c6ab65 100644 --- a/hipamd/hipify-clang/README.md +++ b/hipamd/hipify-clang/README.md @@ -42,10 +42,10 @@ After applying all the matchers, the output HIP source is produced. `hipify-clang` requires: -1. [**LLVM+CLANG**](http://releases.llvm.org) of at least version [3.8.0](http://releases.llvm.org/download.html#3.8.0); the latest stable and recommended release: [**9.0.1**](http://releases.llvm.org/download.html#9.0.1), the latest release candidate: [10.0.0-rc1](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc1). +1. [**LLVM+CLANG**](http://releases.llvm.org) of at least version [3.8.0](http://releases.llvm.org/download.html#3.8.0); the latest stable and recommended release: [**9.0.1**](http://releases.llvm.org/download.html#9.0.1), the latest release candidate: [10.0.0-rc2](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc2). 2. [**CUDA**](https://developer.nvidia.com/cuda-downloads) of at least version [7.0](https://developer.nvidia.com/cuda-toolkit-70), the latest supported version is [**10.1 Update 2**](https://developer.nvidia.com/cuda-10.1-download-archive-base). -To use the latest CUDA version [10.2](https://developer.nvidia.com/cuda-downloads) please use the latest `LLVM` release candidate: [10.0.0-rc1](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc1). +To use the latest CUDA version [10.2](https://developer.nvidia.com/cuda-downloads) please use the latest `LLVM` release candidate: [10.0.0-rc2](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc2). | **LLVM release version** | **CUDA latest supported version** | **Windows** | **Linux** | |:----------------------------------------------------------:|:------------------------------------------------------------------------:|:-----------:|:---------:| @@ -67,7 +67,7 @@ To use the latest CUDA version [10.2](https://developer.nvidia.com/cuda-download | [8.0.1](http://releases.llvm.org/download.html#8.0.1) | [10.0](https://developer.nvidia.com/cuda-10.0-download-archive) | -
not working due to
the clang's bug [38811](https://bugs.llvm.org/show_bug.cgi?id=38811)
+
[patch](patches/patch_for_clang_8.0.1_bug_38811.zip)*
| + | | [9.0.0](http://releases.llvm.org/download.html#9.0.0) | [10.1](https://developer.nvidia.com/cuda-10.1-download-archive-base) | + | + | | [**9.0.1**](http://releases.llvm.org/download.html#9.0.1) | [**10.1**](https://developer.nvidia.com/cuda-10.1-download-archive-base) | +
**LATEST STABLE RELEASE** | +
**LATEST STABLE RELEASE** | -| [10.0.0-rc1](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc1) | [10.2](https://developer.nvidia.com/cuda-downloads) | + | + | +| [10.0.0-rc2](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc2) | [10.2](https://developer.nvidia.com/cuda-downloads) | + | + | `*` Download the patch and unpack it into your `LLVM` distributive directory; a few header files will be overwritten; rebuilding of `LLVM` is not needed. @@ -158,7 +158,7 @@ Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build pro **LLVM 10.0.0 or newer:** -1. download [`LLVM project`](https://github.com/llvm/llvm-project/archive/llvmorg-10.0.0-rc1.tar.gz) sources; +1. download [`LLVM project`](https://github.com/llvm/llvm-project/archive/llvmorg-10.0.0-rc2.tar.gz) sources; 2. build [`LLVM project`](http://llvm.org/docs/CMake.html): **Linux**: @@ -247,7 +247,7 @@ On Linux the following configurations are tested: Ubuntu 14: LLVM 5.0.0 - 6.0.1, CUDA 7.0 - 9.0, cudnn-5.0.5 - cudnn-7.6.5.32 -Ubuntu 16-18: LLVM 8.0.0 - 10.0.0-rc1, CUDA 8.0 - 10.2, cudnn-5.1.10 - cudnn-7.6.5.32 +Ubuntu 16-18: LLVM 8.0.0 - 10.0.0-rc2, CUDA 8.0 - 10.2, cudnn-5.1.10 - cudnn-7.6.5.32 Minimum build system requirements for the above configurations: @@ -402,7 +402,7 @@ Testing Time: 3.07s | 7.0.0 - 7.1.0 | 9.2 | 7.6.5.32 | 2017.15.9.11 | 3.13.3 | 3.7.3 | | 8.0.0 - 8.0.1 | 10.0 | 7.6.5.32 | 2017.15.9.15 | 3.14.2 | 3.7.4 | | 9.0.0 - 9.0.1 | 10.1 | 7.6.5.32 | 2017.15.9.20, 2019.16.4.5 | 3.16.4 | 3.8.0 | -| 10.0.0-rc1 | 10.2 | 7.6.5.32 | 2017.15.9.20, 2019.16.4.5 | 3.16.4 | 3.8.1 | +| 10.0.0-rc1,rc2 | 10.2 | 7.6.5.32 | 2017.15.9.20, 2019.16.4.5 | 3.16.4 | 3.8.1 | | 11.0.0git | 10.2 | 7.6.5.32 | 2017.15.9.20, 2019.16.4.5 | 3.16.4 | 3.8.1 | *Building with testing support on `Windows 10` by `Visual Studio 16 2019`:* From 13052f27c29c3f035235888eaec2b3ff5d5a94f8 Mon Sep 17 00:00:00 2001 From: Siu Chi Chan Date: Fri, 14 Feb 2020 13:20:12 -0500 Subject: [PATCH 30/50] Disabling HCC code object v3 generation by default. Some PyTorch unit tests have regression. Disabling cov3 to allow more time to debug and unblock PyTorch Change-Id: Iba7f425ef3499c20c42ec45d9152b5d27ce97d03 --- hipamd/bin/hipcc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hipamd/bin/hipcc b/hipamd/bin/hipcc index eb3f185424..1bdb466c68 100755 --- a/hipamd/bin/hipcc +++ b/hipamd/bin/hipcc @@ -342,7 +342,7 @@ my $runCmd = 1; my $buildDeps = 0; my $linkType = 1; my $setLinkType = 0; -my $coFormatv3 = 1; +my $coFormatv3 = 0; my @options = (); my @inputs = (); From 5b0f34c59bf94a0a2c414f33d249471b4b79e42d Mon Sep 17 00:00:00 2001 From: Nick Curtis Date: Fri, 14 Feb 2020 22:21:09 -0600 Subject: [PATCH 31/50] Implement long / long long shuffles (#1829) Implement additional data-types for shuffles (long and long long). Based upon the double implementation. --- .../include/hip/hcc_detail/device_functions.h | 114 +++++++++++++++ hipamd/tests/src/kernel/hipShflTests.cpp | 130 ++++++++++++++++++ 2 files changed, 244 insertions(+) create mode 100644 hipamd/tests/src/kernel/hipShflTests.cpp diff --git a/hipamd/include/hip/hcc_detail/device_functions.h b/hipamd/include/hip/hcc_detail/device_functions.h index 68e3277270..46ed53ff87 100644 --- a/hipamd/include/hip/hcc_detail/device_functions.h +++ b/hipamd/include/hip/hcc_detail/device_functions.h @@ -319,6 +319,36 @@ double __shfl(double var, int src_lane, int width = warpSize) { double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; } +__device__ +inline +long __shfl(long var, int src_lane, int width = warpSize) +{ + static_assert(sizeof(long) == 2 * sizeof(int), ""); + static_assert(sizeof(long) == sizeof(uint64_t), ""); + + int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); + tmp[0] = __shfl(tmp[0], src_lane, width); + tmp[1] = __shfl(tmp[1], src_lane, width); + + uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + return tmp1; +} +__device__ +inline +long long __shfl(long long var, int src_lane, int width = warpSize) +{ + static_assert(sizeof(long long) == 2 * sizeof(int), ""); + static_assert(sizeof(long long) == sizeof(uint64_t), ""); + + int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); + tmp[0] = __shfl(tmp[0], src_lane, width); + tmp[1] = __shfl(tmp[1], src_lane, width); + + uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + return tmp1; +} __device__ inline @@ -356,6 +386,34 @@ double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) { double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; } +__device__ +inline +long __shfl_up(long var, unsigned int lane_delta, int width = warpSize) +{ + static_assert(sizeof(long) == 2 * sizeof(int), ""); + static_assert(sizeof(long) == sizeof(uint64_t), ""); + + int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); + tmp[0] = __shfl_up(tmp[0], lane_delta, width); + tmp[1] = __shfl_up(tmp[1], lane_delta, width); + + uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + return tmp1; +} +__device__ +inline +long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize) +{ + static_assert(sizeof(long long) == 2 * sizeof(int), ""); + static_assert(sizeof(long long) == sizeof(uint64_t), ""); + int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); + tmp[0] = __shfl_up(tmp[0], lane_delta, width); + tmp[1] = __shfl_up(tmp[1], lane_delta, width); + uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + return tmp1; +} __device__ inline @@ -393,6 +451,34 @@ double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) { double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; } +__device__ +inline +long __shfl_down(long var, unsigned int lane_delta, int width = warpSize) +{ + static_assert(sizeof(long) == 2 * sizeof(int), ""); + static_assert(sizeof(long) == sizeof(uint64_t), ""); + + int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); + tmp[0] = __shfl_down(tmp[0], lane_delta, width); + tmp[1] = __shfl_down(tmp[1], lane_delta, width); + + uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + return tmp1; +} +__device__ +inline +long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize) +{ + static_assert(sizeof(long long) == 2 * sizeof(int), ""); + static_assert(sizeof(long long) == sizeof(uint64_t), ""); + int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); + tmp[0] = __shfl_down(tmp[0], lane_delta, width); + tmp[1] = __shfl_down(tmp[1], lane_delta, width); + uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + return tmp1; +} __device__ inline @@ -430,6 +516,34 @@ double __shfl_xor(double var, int lane_mask, int width = warpSize) { double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; } +__device__ +inline +long __shfl_xor(long var, int lane_mask, int width = warpSize) +{ + static_assert(sizeof(long) == 2 * sizeof(int), ""); + static_assert(sizeof(long) == sizeof(uint64_t), ""); + + int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); + tmp[0] = __shfl_xor(tmp[0], lane_mask, width); + tmp[1] = __shfl_xor(tmp[1], lane_mask, width); + + uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + return tmp1; +} +__device__ +inline +long long __shfl_xor(long long var, int lane_mask, int width = warpSize) +{ + static_assert(sizeof(long long) == 2 * sizeof(int), ""); + static_assert(sizeof(long long) == sizeof(uint64_t), ""); + int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); + tmp[0] = __shfl_xor(tmp[0], lane_mask, width); + tmp[1] = __shfl_xor(tmp[1], lane_mask, width); + uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + return tmp1; +} #define MASK1 0x00ff00ff #define MASK2 0xff00ff00 diff --git a/hipamd/tests/src/kernel/hipShflTests.cpp b/hipamd/tests/src/kernel/hipShflTests.cpp new file mode 100644 index 0000000000..9b1cc73248 --- /dev/null +++ b/hipamd/tests/src/kernel/hipShflTests.cpp @@ -0,0 +1,130 @@ +/* +Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +/* HIT_START + * BUILD: %t %s ../test_common.cpp + * TEST: %t + * HIT_END + */ + +#include +#include +#include "test_common.h" + +#define WIDTH 4 + +#define NUM (WIDTH * WIDTH) + +#define THREADS_PER_BLOCK_X 4 +#define THREADS_PER_BLOCK_Y 4 +#define THREADS_PER_BLOCK_Z 1 + +// Device (Kernel) function, it must be void +template +__global__ void matrixTranspose(T* out, T* in, const int width) { + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + T val = in[x]; + for (int i = 0; i < width; i++) { + for (int j = 0; j < width; j++) out[i * width + j] = __shfl(val, j * width + i); + } +} + +// CPU implementation of matrix transpose +template +void matrixTransposeCPUReference(T* output, T* input, const unsigned int width) { + for (unsigned int j = 0; j < width; j++) { + for (unsigned int i = 0; i < width; i++) { + output[i * width + j] = input[j * width + i]; + } + } +} + +template +void runTest() { + T* Matrix; + T* TransposeMatrix; + T* cpuTransposeMatrix; + + T* gpuMatrix; + T* gpuTransposeMatrix; + + hipDeviceProp_t devProp; + hipGetDeviceProperties(&devProp, 0); + + int i; + int errors; + + Matrix = (T*)malloc(NUM * sizeof(T)); + TransposeMatrix = (T*)malloc(NUM * sizeof(T)); + cpuTransposeMatrix = (T*)malloc(NUM * sizeof(T)); + + // initialize the input data + for (i = 0; i < NUM; i++) { + Matrix[i] = (T)i * 10l; + } + + // allocate the memory on the device side + hipMalloc((void**)&gpuMatrix, NUM * sizeof(T)); + hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(T)); + + // Memory transfer from host to device + hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(T), hipMemcpyHostToDevice); + + // Lauching kernel from host + hipLaunchKernelGGL(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y), 0, 0, + gpuTransposeMatrix, gpuMatrix, WIDTH); + + // Memory transfer from device to host + hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(T), hipMemcpyDeviceToHost); + + // CPU MatrixTranspose computation + matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH); + + // verify the results + errors = 0; + double eps = 1.0E-6; + for (i = 0; i < NUM; i++) { + if (TransposeMatrix[i] != cpuTransposeMatrix[i]) { + errors++; + } + } + + // free the resources on device side + hipFree(gpuMatrix); + hipFree(gpuTransposeMatrix); + + // free the resources on host side + free(Matrix); + free(TransposeMatrix); + free(cpuTransposeMatrix); + + if (errors != 0) { + failed("Mismatch present"); + } +} + +int main() { + runTest(); + runTest(); + runTest(); + runTest(); + passed(); +} From 5afe43b1aab4efd33ebd7da6d5efd470bac5cf60 Mon Sep 17 00:00:00 2001 From: vsytch Date: Fri, 14 Feb 2020 23:22:25 -0500 Subject: [PATCH 32/50] Add missing __hip_pinned_shadow__ attributes to the texture global vars. (#1866) --- hipamd/tests/src/texture/hipBindTex2DPitch.cpp | 4 ++++ .../tests/src/texture/hipNormalizedFloatValueTex.cpp | 3 +++ hipamd/tests/src/texture/simpleTexture2DLayered.cpp | 6 ++++++ hipamd/tests/src/texture/simpleTexture3D.cpp | 11 +++++++++++ 4 files changed, 24 insertions(+) diff --git a/hipamd/tests/src/texture/hipBindTex2DPitch.cpp b/hipamd/tests/src/texture/hipBindTex2DPitch.cpp index 905e24bce3..b01402c91d 100644 --- a/hipamd/tests/src/texture/hipBindTex2DPitch.cpp +++ b/hipamd/tests/src/texture/hipBindTex2DPitch.cpp @@ -27,6 +27,10 @@ THE SOFTWARE. #define SIZE_H 8 #define SIZE_W 12 #define TYPE_t float + +#if __HIP__ +__hip_pinned_shadow__ +#endif texture tex; // texture object is a kernel argument diff --git a/hipamd/tests/src/texture/hipNormalizedFloatValueTex.cpp b/hipamd/tests/src/texture/hipNormalizedFloatValueTex.cpp index 3179f7412e..609f6916f8 100644 --- a/hipamd/tests/src/texture/hipNormalizedFloatValueTex.cpp +++ b/hipamd/tests/src/texture/hipNormalizedFloatValueTex.cpp @@ -45,6 +45,9 @@ static float getNormalizedValue(const float value, } } +#if __HIP__ +__hip_pinned_shadow__ +#endif texture textureNormalizedVal_1D; __global__ void normalizedValTextureTest(unsigned int numElements, float* pDst) diff --git a/hipamd/tests/src/texture/simpleTexture2DLayered.cpp b/hipamd/tests/src/texture/simpleTexture2DLayered.cpp index e3ba6d9afe..fa545cb4ca 100644 --- a/hipamd/tests/src/texture/simpleTexture2DLayered.cpp +++ b/hipamd/tests/src/texture/simpleTexture2DLayered.cpp @@ -26,9 +26,15 @@ THE SOFTWARE. * HIT_END */ #include "test_common.h" + typedef float T; + // Texture reference for 2D Layered texture +#if __HIP__ +__hip_pinned_shadow__ +#endif texture tex2DL; + __global__ void simpleKernelLayeredArray(T* outputData,int width,int height,int layer) { unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; diff --git a/hipamd/tests/src/texture/simpleTexture3D.cpp b/hipamd/tests/src/texture/simpleTexture3D.cpp index 741b1671af..06833761c8 100644 --- a/hipamd/tests/src/texture/simpleTexture3D.cpp +++ b/hipamd/tests/src/texture/simpleTexture3D.cpp @@ -31,8 +31,19 @@ THE SOFTWARE. const char *sampleName = "simpleTexture3D"; // Texture reference for 3D texture +#if __HIP__ +__hip_pinned_shadow__ +#endif texture texf; + +#if __HIP__ +__hip_pinned_shadow__ +#endif texture texi; + +#if __HIP__ +__hip_pinned_shadow__ +#endif texture texc; template From 2ec55fde47f5dec911adc1c17892699781579808 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Mon, 17 Feb 2020 11:16:20 +0530 Subject: [PATCH 33/50] [dtests] Fix random timeout failures in hipModuleLoadDataMultThreaded (#1877) Limit the max threads that are launched to 16. --- .../src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp index 8591a748df..e73bbedba5 100644 --- a/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp +++ b/hipamd/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp @@ -36,6 +36,7 @@ THE SOFTWARE. #define LEN 64 #define SIZE LEN << 2 #define THREADS 2 +#define MAX_THREADS 16 #define FILENAME "vcpy_kernel.code" #define kernel_name "hello_world" @@ -145,7 +146,7 @@ int main() { HIPCHECK(hipInit(0)); auto buffer = load_file(); - run_multi_threads(THREADS * std::thread::hardware_concurrency(), buffer); + run_multi_threads(min(THREADS * std::thread::hardware_concurrency(), MAX_THREADS), buffer); passed(); } From ec84c16d7570f49746a1a39c4afb3f8694e7581d Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Mon, 17 Feb 2020 06:05:35 -0800 Subject: [PATCH 34/50] Fix hipMemcpy3D (#1798) Fixes #1790 and #1791. hipMemcpy3D still requires further refactoring for different input and output combinations. --- hipamd/include/hip/hcc_detail/driver_types.h | 9 +- hipamd/src/hip_memory.cpp | 205 ++++++++++-------- .../src/runtimeApi/memory/hipMemcpy3D.cpp | 110 ++++++++++ .../src/texture/simpleTexture2DLayered.cpp | 2 +- hipamd/tests/src/texture/simpleTexture3D.cpp | 39 ++-- 5 files changed, 249 insertions(+), 116 deletions(-) create mode 100644 hipamd/tests/src/runtimeApi/memory/hipMemcpy3D.cpp diff --git a/hipamd/include/hip/hcc_detail/driver_types.h b/hipamd/include/hip/hcc_detail/driver_types.h index 0c29542c7e..1941f44617 100644 --- a/hipamd/include/hip/hcc_detail/driver_types.h +++ b/hipamd/include/hip/hcc_detail/driver_types.h @@ -255,14 +255,14 @@ typedef struct hipMemcpy3DParms { hipArray_t srcArray; struct hipPos srcPos; struct hipPitchedPtr srcPtr; - hipArray_t dstArray; struct hipPos dstPos; struct hipPitchedPtr dstPtr; - struct hipExtent extent; enum hipMemcpyKind kind; +} hipMemcpy3DParms; +typedef struct HIP_MEMCPY3D { size_t Depth; size_t Height; size_t WidthInBytes; @@ -283,10 +283,7 @@ typedef struct hipMemcpy3DParms { size_t srcLOD; hipMemoryType srcMemoryType; size_t srcPitch; - size_t srcXInBytes; - size_t srcY; - size_t srcZ; -}hipMemcpy3DParms; +} HIP_MEMCPY3D; static inline struct hipPitchedPtr make_hipPitchedPtr(void* d, size_t p, size_t xsz, size_t ysz) { diff --git a/hipamd/src/hip_memory.cpp b/hipamd/src/hip_memory.cpp index 444e41107a..d965059923 100644 --- a/hipamd/src/hip_memory.cpp +++ b/hipamd/src/hip_memory.cpp @@ -1540,111 +1540,144 @@ hipError_t hipMemcpyAtoH(void* dst, hipArray* srcArray, size_t srcOffset, size_t return ihipLogStatus(e); } +int getByteSizeFromFormat(const hipChannelFormatDesc& desc){ + int byteSize =0; + switch (desc.f) { + case hipChannelFormatKindUnsigned: + switch (desc.x) { + case 32: + byteSize = sizeof(uint32_t); + break; + case 16: + byteSize = sizeof(uint16_t); + break; + case 8: + byteSize = sizeof(uint8_t); + break; + default: + byteSize = sizeof(uint32_t); + } + break; + case hipChannelFormatKindSigned: + switch (desc.x) { + case 32: + byteSize = sizeof(int32_t); + break; + case 16: + byteSize = sizeof(int16_t); + break; + case 8: + byteSize = sizeof(int8_t); + break; + default: + byteSize = sizeof(int32_t); + } + break; + case hipChannelFormatKindFloat: + switch (desc.x) { + case 32: + byteSize = sizeof(float); + break; + case 16: + byteSize = sizeof(_Float16); + break; + default: + byteSize = sizeof(float); + } + break; + case hipChannelFormatKindNone: + default: + break; + } + return byteSize; +} + hipError_t ihipMemcpy3D(const struct hipMemcpy3DParms* p, hipStream_t stream, bool isAsync) { hipError_t e = hipSuccess; if(p) { - size_t byteSize, width, height, depth, widthInBytes, srcPitch, dstPitch, ySize; - hipChannelFormatDesc desc; - void* srcPtr;void* dstPtr; + size_t dstByteSize, srcByteSize, copyWidth, copyHeight, copyDepth, widthInBytes, srcPitch, dstPitch, srcYsize, dstYsize; + size_t srcXoffset, srcYoffset, srcZoffset, dstXoffset, dstYoffset, dstZoffset; + size_t srcWidth, srcHeight, srcDepth, dstWidth, dstHeight, dstDepth; + + void* srcPtr, *dstPtr; + bool copyWidthUpdate= false; + copyDepth = p->extent.depth; + copyHeight = p->extent.height; + copyWidth = p->extent.width; // in bytes ? + dstXoffset = p->dstPos.x; + dstYoffset = p->dstPos.y; + dstZoffset = p->dstPos.z; + srcXoffset = p->srcPos.x; + srcYoffset = p->srcPos.y; + srcZoffset = p->srcPos.z; if (p->dstArray != nullptr) { - if (p->dstArray->isDrv == false) { - switch (p->dstArray->desc.f) { - case hipChannelFormatKindSigned: - byteSize = sizeof(int); - break; - case hipChannelFormatKindUnsigned: - byteSize = sizeof(unsigned int); - break; - case hipChannelFormatKindFloat: - byteSize = sizeof(float); - break; - case hipChannelFormatKindNone: - byteSize = sizeof(size_t); - break; - default: - byteSize = 0; - break; - } - depth = p->extent.depth; - height = p->extent.height; - width = p->extent.width; - widthInBytes = p->extent.width * byteSize; - srcPitch = p->srcPtr.pitch; - srcPtr = p->srcPtr.ptr; - ySize = p->srcPtr.ysize; - desc = p->dstArray->desc; - dstPtr = p->dstArray->data; - hsa_ext_image_data_info_t imageInfo; - if(hipTextureType2DLayered == p->dstArray->textureType) - GetImageInfo(HSA_EXT_IMAGE_GEOMETRY_2DA, width, height, 0, desc, imageInfo, depth); - else - GetImageInfo(HSA_EXT_IMAGE_GEOMETRY_3D, width, height, depth, desc, imageInfo); - dstPitch = imageInfo.size/(height == 0 ? 1 : height)/(depth == 0 ? 1 : depth); - } else { - depth = p->Depth; - height = p->Height; - widthInBytes = p->WidthInBytes; - width = p->dstArray->width; - hsa_ext_image_channel_order_t channelOrder; - switch(p->dstArray->NumChannels) { - case 2: - channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RG; - break; - case 3: - channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RGB; - break; - case 4: - channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA; - break; - case 1: - default: - channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_R; - break; - } - hsa_ext_image_channel_type_t channelType; - e = ihipArrayToImageFormat(p->dstArray->Format,channelType); - srcPitch = p->srcPitch; - srcPtr = (void*)p->srcHost; - ySize = p->srcHeight; - dstPtr = p->dstArray->data; - hsa_ext_image_data_info_t imageInfo; - if(hipTextureType2DLayered == p->dstArray->textureType) - GetImageInfo(HSA_EXT_IMAGE_GEOMETRY_2DA, width, height, 0, channelOrder, channelType, imageInfo, depth); - else - GetImageInfo(HSA_EXT_IMAGE_GEOMETRY_3D, width, height, depth, channelOrder, channelType, imageInfo); - dstPitch = imageInfo.size/(height == 0 ? 1 : height)/(depth == 0 ? 1 : depth); + if ((p->dstArray->isDrv == true) ||( p->dstPtr.ptr!= nullptr)){ + return hipErrorInvalidValue; + } + // Array destination + dstByteSize = getByteSizeFromFormat(p->dstArray->desc); + hipChannelFormatDesc desc; + desc = p->dstArray->desc; + dstPtr = p->dstArray->data; + dstWidth = p->dstArray->width; + dstHeight = p->dstArray->height; + dstDepth = p->dstArray->depth; + dstPitch = dstByteSize * alignUp(dstWidth, IMAGE_PITCH_ALIGNMENT); + if(!copyWidthUpdate) { + copyWidth = copyWidth * dstByteSize; + copyWidthUpdate = true; } } else { - // Non array destination - depth = p->extent.depth; - height = p->extent.height; - widthInBytes = p->extent.width; - srcPitch = p->srcPtr.pitch; - srcPtr = p->srcPtr.ptr; + //Non Array destination dstPtr = p->dstPtr.ptr; - ySize = p->srcPtr.ysize; + dstWidth = p->dstPtr.xsize; + dstHeight = p->dstPtr.ysize; dstPitch = p->dstPtr.pitch; } + if (p->srcArray != nullptr) { + if ((p->srcArray->isDrv == true) ||( p->srcPtr.ptr!= nullptr)){ + return hipErrorInvalidValue; + } + // Array source + srcByteSize = getByteSizeFromFormat(p->srcArray->desc); + hipChannelFormatDesc desc; + desc = p->srcArray->desc; + srcPtr = p->srcArray->data; + srcWidth = p->srcArray->width; + srcHeight = p->srcArray->height; + srcDepth = p->srcArray->depth; + srcPitch = srcByteSize * alignUp(srcWidth, IMAGE_PITCH_ALIGNMENT); + if(!copyWidthUpdate) { + copyWidth = copyWidth * srcByteSize; + copyWidthUpdate = true; + } + } else { + //Non Array source + srcPtr = p->srcPtr.ptr; + srcWidth = p->srcPtr.xsize; + srcHeight = p->srcPtr.ysize; + srcPitch = p->srcPtr.pitch; + } + stream = ihipSyncAndResolveStream(stream); try { - if((widthInBytes == dstPitch) && (widthInBytes == srcPitch)) { + if((copyWidth == dstPitch) && (copyWidth == srcPitch)&& (copyHeight == dstHeight) &&(copyHeight == srcHeight)) { if(isAsync) - stream->locked_copyAsync((void*)dstPtr, (void*)srcPtr, widthInBytes*height*depth, p->kind); + stream->locked_copyAsync((void*)dstPtr, (void*)srcPtr, copyWidth*copyHeight*copyDepth, p->kind); else - stream->locked_copySync((void*)dstPtr, (void*)srcPtr, widthInBytes*height*depth, p->kind, false); + stream->locked_copySync((void*)dstPtr, (void*)srcPtr, copyWidth*copyHeight*copyDepth, p->kind, false); } else { - for (int i = 0; i < depth; i++) { - for (int j = 0; j < height; j++) { - // TODO: p->srcPos or p->dstPos are not 0. + for (int i = 0; i < copyDepth; i++) { + for (int j = 0; j < copyHeight; j++) { unsigned char* src = - (unsigned char*)srcPtr + i * ySize * srcPitch + j * srcPitch; + (unsigned char*)srcPtr + (i + srcZoffset) * srcHeight * srcPitch + (j + srcYoffset) * srcPitch + srcXoffset; unsigned char* dst = - (unsigned char*)dstPtr + i * height * dstPitch + j * dstPitch; + (unsigned char*)dstPtr + (i + dstZoffset) * dstHeight * dstPitch + (j + dstYoffset) * dstPitch + dstXoffset; if(isAsync) - stream->locked_copyAsync(dst, src, widthInBytes, p->kind); + stream->locked_copyAsync(dst, src, copyWidth, p->kind); else - stream->locked_copySync(dst, src, widthInBytes, p->kind); + stream->locked_copySync(dst, src, copyWidth, p->kind); } } } diff --git a/hipamd/tests/src/runtimeApi/memory/hipMemcpy3D.cpp b/hipamd/tests/src/runtimeApi/memory/hipMemcpy3D.cpp new file mode 100644 index 0000000000..255a3490b6 --- /dev/null +++ b/hipamd/tests/src/runtimeApi/memory/hipMemcpy3D.cpp @@ -0,0 +1,110 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * TEST: %t + * HIT_END + */ + +#include "test_common.h" + +template +void runTest(int width,int height,int depth, hipChannelFormatKind formatKind) +{ + unsigned int size = width * height * depth * sizeof(T); + T* hData = (T*) malloc(size); + memset(hData, 0, size); + + for (int i = 0; i < depth; i++) { + for (int j = 0; j < height; j++) { + for (int k = 0; k < width; k++) { + hData[i*width*height + j*width +k] = i*width*height + j*width + k; + } + } + } + printf("test- sizeof(T) =%d\n", sizeof(T)); + hipChannelFormatDesc channelDesc = hipCreateChannelDesc(sizeof(T)*8, 0, 0, 0, formatKind); + hipArray *arr,*arr1; + + HIPCHECK(hipMalloc3DArray(&arr, &channelDesc, make_hipExtent(width, height, depth), hipArrayDefault)); + HIPCHECK(hipMalloc3DArray(&arr1, &channelDesc, make_hipExtent(width, height, depth), hipArrayDefault)); + hipMemcpy3DParms myparms = {0}; + myparms.srcPos = make_hipPos(0,0,0); + myparms.dstPos = make_hipPos(0,0,0); + myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T), width, height); + myparms.dstArray = arr; + myparms.extent = make_hipExtent(width , height, depth); +#ifdef __HIP_PLATFORM_NVCC__ + myparms.kind = cudaMemcpyHostToDevice; +#else + myparms.kind = hipMemcpyHostToDevice; +#endif + HIPCHECK(hipMemcpy3D(&myparms)); + HIPCHECK(hipDeviceSynchronize()); + //Array to Array + memset(&myparms,0x0, sizeof(hipMemcpy3DParms)); + myparms.srcPos = make_hipPos(0,0,0); + myparms.dstPos = make_hipPos(0,0,0); + myparms.srcArray = arr; + myparms.dstArray = arr1; + myparms.extent = make_hipExtent(width, height, depth); +#ifdef __HIP_PLATFORM_NVCC__ + myparms.kind = cudaMemcpyDeviceToDevice; +#else + myparms.kind = hipMemcpyDeviceToDevice; +#endif + HIPCHECK(hipMemcpy3D(&myparms)); + HIPCHECK(hipDeviceSynchronize()); + + T *hOutputData = (T*) malloc(size); + memset(hOutputData, 0, size); + //Device to host + memset(&myparms,0x0, sizeof(hipMemcpy3DParms)); + myparms.srcPos = make_hipPos(0,0,0); + myparms.dstPos = make_hipPos(0,0,0); + myparms.dstPtr = make_hipPitchedPtr(hOutputData, width * sizeof(T), width, height); + myparms.srcArray = arr1; + myparms.extent = make_hipExtent(width, height, depth); +#ifdef __HIP_PLATFORM_NVCC__ + myparms.kind = cudaMemcpyDeviceToHost; +#else + myparms.kind = hipMemcpyDeviceToHost; +#endif + HIPCHECK(hipMemcpy3D(&myparms)); + HIPCHECK(hipDeviceSynchronize()); + + // Check result + HipTest::checkArray(hData,hOutputData,width,height,depth); + hipFreeArray(arr); + hipFreeArray(arr1); + free(hData); + free(hOutputData); +} + +int main(int argc, char **argv) +{ + for(int i=1;i<25;i++) + { + runTest(i,i,i, hipChannelFormatKindFloat); + runTest(i+1,i,i, hipChannelFormatKindSigned); + runTest(i,i+1,i, hipChannelFormatKindSigned); + } + passed(); +} diff --git a/hipamd/tests/src/texture/simpleTexture2DLayered.cpp b/hipamd/tests/src/texture/simpleTexture2DLayered.cpp index fa545cb4ca..e5014dae6b 100644 --- a/hipamd/tests/src/texture/simpleTexture2DLayered.cpp +++ b/hipamd/tests/src/texture/simpleTexture2DLayered.cpp @@ -65,7 +65,7 @@ void runTest(int width,int height,int num_layers,texture texc; template -__global__ void simpleKernel3DArray(T* outputData, +__global__ void simpleKernel3DArray(T* outputData, int width, int height,int depth) { @@ -55,21 +52,18 @@ __global__ void simpleKernel3DArray(T* outputData, for (int j = 0; j < height; j++) { for (int k = 0; k < width; k++) { if(std::is_same::value) - outputData[i*width*height + j*width + k] = tex3D(texf, texf.textureObject, k, j, i); + outputData[i*width*height + j*width + k] = tex3D(texf, k, j, i); else if(std::is_same::value) - outputData[i*width*height + j*width + k] = tex3D(texi, texi.textureObject, k, j, i); + outputData[i*width*height + j*width + k] = tex3D(texi, k, j, i); else if(std::is_same::value) - outputData[i*width*height + j*width + k] = tex3D(texc, texc.textureObject, k, j, i); + outputData[i*width*height + j*width + k] = tex3D(texc, k, j, i); } } } } -//////////////////////////////////////////////////////////////////////////////// -//! Run a simple test for tex3D -//////////////////////////////////////////////////////////////////////////////// template -void runTest(int width,int height,int depth,texture *tex) +void runTest(int width,int height,int depth,texture *tex, hipChannelFormatKind formatKind) { unsigned int size = width * height * depth * sizeof(T); T* hData = (T*) malloc(size); @@ -84,17 +78,21 @@ void runTest(int width,int height,int depth,texture(i,i,i,&texf); - runTest(i+1,i,i,&texi); - runTest(i,i+1,i,&texc); + runTest(i,i,i,&texf, hipChannelFormatKindFloat); + runTest(i+1,i,i,&texi, hipChannelFormatKindSigned); + runTest(i,i+1,i,&texc, hipChannelFormatKindSigned); } passed(); } - From f4120043ed38967b954024378f429a346391be3c Mon Sep 17 00:00:00 2001 From: "Yaxun (Sam) Liu" Date: Mon, 17 Feb 2020 09:19:26 -0800 Subject: [PATCH 35/50] Let HIP-Clang inline all functions by default (#1875) This is a quick workaround to match HCC behavior for performance since inlining usually results in more optimization opportunities therefore better performance. We will fine tuning inline threashold later. --- hipamd/bin/hipcc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hipamd/bin/hipcc b/hipamd/bin/hipcc index 1bdb466c68..41f11a36b4 100755 --- a/hipamd/bin/hipcc +++ b/hipamd/bin/hipcc @@ -727,6 +727,10 @@ if ($HIP_PLATFORM eq "clang") { $HIPCXXFLAGS .= " -O3"; $HIPLDFLAGS .= " -O3"; } + if ($optArg ne "-O0") { + $HIPCXXFLAGS .= " -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false"; + $HIPLDFLAGS .= " -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false"; + } $HIP_DEVLIB_FLAGS = " --hip-device-lib-path=$DEVICE_LIB_PATH"; $HIPCXXFLAGS .= " $HIP_DEVLIB_FLAGS"; if (not $isWindows) { From 730f23829e7ab07d9414a81c114042537531cfde Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Tue, 18 Feb 2020 17:20:27 +0200 Subject: [PATCH 36/50] Tweak synchronous memcpy implementation (#1809) The existing one can have issues on certain systems, therefore this limits use of direct memcpy via largeBAR to sizes where it is unequivocally better. Also addresses SWDEV-220030 and SWDEV-222237. --- hipamd/src/hip_memory.cpp | 105 ++++++++++++++++---------------------- 1 file changed, 43 insertions(+), 62 deletions(-) diff --git a/hipamd/src/hip_memory.cpp b/hipamd/src/hip_memory.cpp index d965059923..1bcf10f982 100644 --- a/hipamd/src/hip_memory.cpp +++ b/hipamd/src/hip_memory.cpp @@ -37,7 +37,6 @@ __device__ uint32_t __hip_device_page_flag[__HIP_NUM_PAGES]; namespace hip_internal { namespace { - inline const char* hsa_to_string(hsa_status_t err) noexcept { @@ -149,13 +148,14 @@ namespace { const_cast(p), &r, nullptr, nullptr, nullptr), __FILE__, __func__, __LINE__); - r.size = is_large_BAR || (type(r.agentOwner) == HSA_DEVICE_TYPE_CPU) ? - UINT32_MAX : sizeof(hsa_amd_pointer_info_t); + if (is_large_BAR) r.size = UINT32_MAX; + else if (type(r.agentOwner) == HSA_DEVICE_TYPE_CPU) r.size = INT32_MAX; return r; } - constexpr size_t staging_sz{4 * 1024 * 1024}; // 2 Pages. + constexpr size_t staging_sz{4 * 1024 * 1024}; // 2 Pages. + constexpr size_t max_std_memcpy_sz{8 * 1024}; // 8 KiB. thread_local const std::unique_ptr staging_buffer{ []() { @@ -202,8 +202,8 @@ namespace { } // Unnamed namespace. inline -void do_copy(void* __restrict dst, const void* __restrict src, std::size_t n, - hsa_agent_t da, hsa_agent_t sa) { +void do_copy(void* __restrict dst, const void* __restrict src, size_t n, + hsa_agent_t da, hsa_agent_t sa) { hsa_signal_silent_store_relaxed(copy_signal, 1); throwing_result_check( hsa_amd_memory_async_copy(dst, da, src, sa, n, 0, nullptr, copy_signal), @@ -224,10 +224,10 @@ void do_std_memcpy( inline void d2h_copy(void* __restrict dst, const void* __restrict src, size_t n, hsa_amd_pointer_info_t si) { - // TODO: characterise direct largeBAR reads from agent-allocated memory. - // if (si.size == UINT32_MAX) { - // return do_std_memcpy(dst, src, n); - // } + if (si.size == INT32_MAX) return do_std_memcpy(dst, src, n); + if (si.size == UINT32_MAX && n <= max_std_memcpy_sz) { + return do_std_memcpy(dst, src, n); + } const auto di{info(dst)}; @@ -256,7 +256,8 @@ void d2h_copy(void* __restrict dst, const void* __restrict src, size_t n, inline void h2d_copy(void* __restrict dst, const void* __restrict src, size_t n, hsa_amd_pointer_info_t di) { - if (di.size == UINT32_MAX) { + if (di.size == INT32_MAX) return do_std_memcpy(dst, src, n); + if (di.size == UINT32_MAX && n <= max_std_memcpy_sz) { return do_std_memcpy(dst, src, n); } @@ -264,8 +265,8 @@ void h2d_copy(void* __restrict dst, const void* __restrict src, size_t n, if (si.type == HSA_EXT_POINTER_TYPE_LOCKED) { src = static_cast(si.agentBaseAddress) + - (static_cast(src) - - static_cast(si.hostBaseAddress)); + (static_cast(src) - + static_cast(si.hostBaseAddress)); do_copy(dst, src, n, di.agentOwner, di.agentOwner); } else if (n <= staging_sz) { @@ -288,53 +289,30 @@ void h2d_copy(void* __restrict dst, const void* __restrict src, size_t n, inline void generic_copy(void* __restrict dst, const void* __restrict src, size_t n, hsa_amd_pointer_info_t di, hsa_amd_pointer_info_t si) { - if (di.size == UINT32_MAX && si.size == UINT32_MAX) { + if (di.size == INT32_MAX && si.size == INT32_MAX) { + return do_std_memcpy(dst, src, n); + } + if (di.size == UINT32_MAX && si.size == UINT32_MAX && + n <= max_std_memcpy_sz) { return do_std_memcpy(dst, src, n); } - std::unique_ptr lck0{ - nullptr, [](void* p) { hsa_amd_memory_unlock(p); }}; - std::unique_ptr lck1{nullptr, lck0.get_deleter()}; - - switch (si.type) { - case HSA_EXT_POINTER_TYPE_HSA: - if (di.type == HSA_EXT_POINTER_TYPE_HSA) { - hsa_memory_copy(dst, src, n); - return; // TODO: do_copy(dst, src, n, di.agentOwner, si.agentOwner); + switch (type(si.agentOwner)) { + case HSA_DEVICE_TYPE_GPU: + if (type(di.agentOwner) == HSA_DEVICE_TYPE_GPU) { + throwing_result_check( + hsa_amd_agents_allow_access( + 1u, &si.agentOwner, nullptr, di.agentBaseAddress), + __FILE__, __func__, __LINE__); + return do_copy(dst, src, n, di.agentOwner, si.agentOwner); } - - if (di.type == HSA_EXT_POINTER_TYPE_UNKNOWN || - di.type == HSA_EXT_POINTER_TYPE_LOCKED) { - return d2h_copy(dst, src, n, si); - } - break; - case HSA_EXT_POINTER_TYPE_LOCKED: - if (di.type == HSA_EXT_POINTER_TYPE_UNKNOWN) { - std::memcpy(dst, si.hostBaseAddress, n); - - return; - } - if (di.type == HSA_EXT_POINTER_TYPE_LOCKED) { - std::memcpy(di.hostBaseAddress, si.hostBaseAddress, n); - - return; - } - src = si.agentBaseAddress; - si.agentOwner = di.agentOwner; - break; - case HSA_EXT_POINTER_TYPE_UNKNOWN: - if (di.type == HSA_EXT_POINTER_TYPE_UNKNOWN) { - std::memcpy(dst, src, n); - - return; - } - if (di.type == HSA_EXT_POINTER_TYPE_LOCKED) { - std::memcpy(di.hostBaseAddress, src, n); - - return; + return d2h_copy(dst, src, n, si); + case HSA_DEVICE_TYPE_CPU: + if (type(di.agentOwner) == HSA_DEVICE_TYPE_CPU) { + return do_std_memcpy(dst, src, n); } return h2d_copy(dst, src, n, di); - default: do_copy(dst, src, n, di.agentOwner, si.agentOwner); break; + default: throw std::runtime_error{"Unsupported copy type."}; } } @@ -343,14 +321,17 @@ void memcpy_impl(void* __restrict dst, const void* __restrict src, size_t n, hipMemcpyKind k) noexcept { switch (k) { case hipMemcpyHostToHost: std::memcpy(dst, src, n); break; - case hipMemcpyHostToDevice: - return is_large_BAR ? do_std_memcpy(dst, src, n) - : h2d_copy(dst, src, n, info(dst)); - case hipMemcpyDeviceToHost: - // TODO: characterise direct largeBAR reads from agent-allocated memory. - return /*is_large_BAR ? do_std_memcpy(dst, src, n) - : */d2h_copy(dst, src, n, info(src)); - case hipMemcpyDeviceToDevice: hsa_memory_copy(dst, src, n); break; + case hipMemcpyHostToDevice: return h2d_copy(dst, src, n, info(dst)); + case hipMemcpyDeviceToHost: return d2h_copy(dst, src, n, info(src)); + case hipMemcpyDeviceToDevice: { + const auto di{info(dst)}; + const auto si{info(src)}; + throwing_result_check( + hsa_amd_agents_allow_access( + 1u, &si.agentOwner, nullptr, di.agentBaseAddress), + __FILE__, __func__, __LINE__); + return do_copy(dst, src, n, di.agentOwner, si.agentOwner); + } default: return generic_copy(dst, src, n, info(dst), info(src)); } } From 82478dca652d17ea3f2fac2a5107128c61d2a903 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 19 Feb 2020 02:17:49 -0600 Subject: [PATCH 37/50] adding hipExtModuleLaunchKernel to tracing layer (#1880) --- hipamd/hip_prof_gen.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hipamd/hip_prof_gen.py b/hipamd/hip_prof_gen.py index 9e90c1558c..d2da7cd4df 100755 --- a/hipamd/hip_prof_gen.py +++ b/hipamd/hip_prof_gen.py @@ -447,7 +447,8 @@ if len(sys.argv) > 3: OUTPUT = sys.argv[3] # API declaration map api_map = { - 'hipHccModuleLaunchKernel': '' + 'hipHccModuleLaunchKernel': '', + 'hipExtModuleLaunchKernel': '' } # API options map opts_map = {} From d81a9a712a3c9aaec7d19385944df42833d24bba Mon Sep 17 00:00:00 2001 From: Sarbojit2019 <52527887+SarbojitAMD@users.noreply.github.com> Date: Wed, 19 Feb 2020 13:48:20 +0530 Subject: [PATCH 38/50] [HIPIFY] Add back missing execute permission to hipify-perl (#1881) hipify-perl script lost its executable permission hence "samples/0_Intro/square" was failing. Fixes SWDEV 223433. --- hipamd/bin/hipify-perl | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 hipamd/bin/hipify-perl diff --git a/hipamd/bin/hipify-perl b/hipamd/bin/hipify-perl old mode 100644 new mode 100755 From ff4479fcc72622150ec18c38afd0d25e16e29c41 Mon Sep 17 00:00:00 2001 From: "Yaxun (Sam) Liu" Date: Wed, 26 Feb 2020 02:23:43 -0800 Subject: [PATCH 39/50] Fix hipcc for extra -mllvm option (#1885) --- hipamd/bin/hipcc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hipamd/bin/hipcc b/hipamd/bin/hipcc index 41f11a36b4..22923e5ec4 100755 --- a/hipamd/bin/hipcc +++ b/hipamd/bin/hipcc @@ -729,7 +729,9 @@ if ($HIP_PLATFORM eq "clang") { } if ($optArg ne "-O0") { $HIPCXXFLAGS .= " -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false"; - $HIPLDFLAGS .= " -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false"; + if ($needLDFLAGS and not $needCXXFLAGS) { + $HIPLDFLAGS .= " -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false"; + } } $HIP_DEVLIB_FLAGS = " --hip-device-lib-path=$DEVICE_LIB_PATH"; $HIPCXXFLAGS .= " $HIP_DEVLIB_FLAGS"; From a094c87038d6e34cca7c726647f5825033cb6765 Mon Sep 17 00:00:00 2001 From: Nick Curtis Date: Wed, 26 Feb 2020 04:23:56 -0600 Subject: [PATCH 40/50] fix long shuffle implementations for windows (#1895) Fixes for SWDEV-223694 --- .../include/hip/hcc_detail/device_functions.h | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/hipamd/include/hip/hcc_detail/device_functions.h b/hipamd/include/hip/hcc_detail/device_functions.h index 46ed53ff87..7096841da8 100644 --- a/hipamd/include/hip/hcc_detail/device_functions.h +++ b/hipamd/include/hip/hcc_detail/device_functions.h @@ -323,6 +323,7 @@ __device__ inline long __shfl(long var, int src_lane, int width = warpSize) { + #ifndef _MSC_VER static_assert(sizeof(long) == 2 * sizeof(int), ""); static_assert(sizeof(long) == sizeof(uint64_t), ""); @@ -333,6 +334,10 @@ long __shfl(long var, int src_lane, int width = warpSize) uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; + #else + static_assert(sizeof(long) == sizeof(int), ""); + return static_cast(__shfl(static_cast(var), src_lane, width)); + #endif } __device__ inline @@ -390,6 +395,7 @@ __device__ inline long __shfl_up(long var, unsigned int lane_delta, int width = warpSize) { + #ifndef _MSC_VER static_assert(sizeof(long) == 2 * sizeof(int), ""); static_assert(sizeof(long) == sizeof(uint64_t), ""); @@ -400,6 +406,10 @@ long __shfl_up(long var, unsigned int lane_delta, int width = warpSize) uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; + #else + static_assert(sizeof(long) == sizeof(int), ""); + return static_cast(__shfl_up(static_cast(var), lane_delta, width)); + #endif } __device__ inline @@ -455,6 +465,7 @@ __device__ inline long __shfl_down(long var, unsigned int lane_delta, int width = warpSize) { + #ifndef _MSC_VER static_assert(sizeof(long) == 2 * sizeof(int), ""); static_assert(sizeof(long) == sizeof(uint64_t), ""); @@ -465,6 +476,10 @@ long __shfl_down(long var, unsigned int lane_delta, int width = warpSize) uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; + #else + static_assert(sizeof(long) == sizeof(int), ""); + return static_cast(__shfl_down(static_cast(var), lane_delta, width)); + #endif } __device__ inline @@ -520,6 +535,7 @@ __device__ inline long __shfl_xor(long var, int lane_mask, int width = warpSize) { + #ifndef _MSC_VER static_assert(sizeof(long) == 2 * sizeof(int), ""); static_assert(sizeof(long) == sizeof(uint64_t), ""); @@ -530,6 +546,10 @@ long __shfl_xor(long var, int lane_mask, int width = warpSize) uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; + #else + static_assert(sizeof(long) == sizeof(int), ""); + return static_cast(__shfl_down(static_cast(var), lane_delta, width)); + #endif } __device__ inline From 3b4b58bda6b7caf9cbd9dd1ae4b9a2e408ec9f59 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Thu, 27 Feb 2020 16:18:31 +0530 Subject: [PATCH 41/50] bump version to 3.2 (#1898) - Bump version to 3.2 - [ci] Enable tests on ROCm 3.1 --- hipamd/Jenkinsfile | 8 ++++---- hipamd/bin/hipconfig | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/hipamd/Jenkinsfile b/hipamd/Jenkinsfile index 734e875e03..e38f7824d2 100644 --- a/hipamd/Jenkinsfile +++ b/hipamd/Jenkinsfile @@ -295,13 +295,13 @@ def docker_upload_dockerhub( String local_org, String image_name, String remote_ String build_config = 'Release' String job_name = env.JOB_NAME.toLowerCase( ) -// The following launches 3 builds in parallel: rocm-head, rocm-3.0.x and cuda-10.x -parallel rocm_3_0: +// The following launches 3 builds in parallel: rocm-head, rocm-3.1.x and cuda-10.x +parallel rocm_3_1: { node('hip-rocm') { - String hcc_ver = 'rocm-3.0.x' - String from_image = 'ci_test_nodes/rocm-3.0.x/ubuntu-16.04:latest' + String hcc_ver = 'rocm-3.1.x' + String from_image = 'ci_test_nodes/rocm-3.1.x/ubuntu-16.04:latest' String inside_args = '--device=/dev/kfd --device=/dev/dri --group-add=video' // Checkout source code, dependencies and version files diff --git a/hipamd/bin/hipconfig b/hipamd/bin/hipconfig index a73e8af8b9..c56b56ecd8 100755 --- a/hipamd/bin/hipconfig +++ b/hipamd/bin/hipconfig @@ -1,7 +1,7 @@ #!/usr/bin/perl -w $HIP_BASE_VERSION_MAJOR = "3"; -$HIP_BASE_VERSION_MINOR = "1"; +$HIP_BASE_VERSION_MINOR = "2"; # Need perl > 5.10 to use logic-defined or use 5.006; use v5.10.1; From cea5489f00be50d2a8bbbad1cb91f80190607236 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Thu, 27 Feb 2020 12:51:12 +0200 Subject: [PATCH 42/50] Address post-staging issues in #1809 (#1894) Fixes SWDEV-223910 and SWDEV-223663 --- hipamd/src/hip_memory.cpp | 95 ++++++++++++++++++++------------------- 1 file changed, 48 insertions(+), 47 deletions(-) diff --git a/hipamd/src/hip_memory.cpp b/hipamd/src/hip_memory.cpp index 1bcf10f982..8159f22a97 100644 --- a/hipamd/src/hip_memory.cpp +++ b/hipamd/src/hip_memory.cpp @@ -139,6 +139,8 @@ namespace { return r; }()}; + constexpr std::uint32_t is_cpu_owned{UINT32_MAX}; + inline hsa_amd_pointer_info_t info(const void* p) { @@ -148,14 +150,14 @@ namespace { const_cast(p), &r, nullptr, nullptr, nullptr), __FILE__, __func__, __LINE__); - if (is_large_BAR) r.size = UINT32_MAX; - else if (type(r.agentOwner) == HSA_DEVICE_TYPE_CPU) r.size = INT32_MAX; + if (type(r.agentOwner) == HSA_DEVICE_TYPE_CPU) r.size = is_cpu_owned; return r; } - constexpr size_t staging_sz{4 * 1024 * 1024}; // 2 Pages. - constexpr size_t max_std_memcpy_sz{8 * 1024}; // 8 KiB. + constexpr size_t staging_sz{4 * 1024 * 1024}; // 2 Pages. + constexpr size_t max_h2d_std_memcpy_sz{8 * 1024}; // 8 KiB. + constexpr size_t max_d2h_std_memcpy_sz{64}; // 1 cacheline. thread_local const std::unique_ptr staging_buffer{ []() { @@ -203,7 +205,7 @@ namespace { inline void do_copy(void* __restrict dst, const void* __restrict src, size_t n, - hsa_agent_t da, hsa_agent_t sa) { + hsa_agent_t da, hsa_agent_t sa) { hsa_signal_silent_store_relaxed(copy_signal, 1); throwing_result_check( hsa_amd_memory_async_copy(dst, da, src, sa, n, 0, nullptr, copy_signal), @@ -224,14 +226,20 @@ void do_std_memcpy( inline void d2h_copy(void* __restrict dst, const void* __restrict src, size_t n, hsa_amd_pointer_info_t si) { - if (si.size == INT32_MAX) return do_std_memcpy(dst, src, n); - if (si.size == UINT32_MAX && n <= max_std_memcpy_sz) { + const auto di{info(dst)}; + const auto is_locked{di.type == HSA_EXT_POINTER_TYPE_LOCKED}; + + if (!is_locked && si.size == is_cpu_owned) { return do_std_memcpy(dst, src, n); } - - const auto di{info(dst)}; - - if (di.type == HSA_EXT_POINTER_TYPE_LOCKED) { + if (!is_locked && is_large_BAR && n <= max_d2h_std_memcpy_sz) { + return do_std_memcpy(dst, src, n); + } + if (di.type == HSA_EXT_POINTER_TYPE_HSA) { + return do_copy(dst, src, n, si.agentOwner, si.agentOwner); + } + + if (is_locked) { dst = static_cast(di.agentBaseAddress) + (static_cast(dst) - static_cast(di.hostBaseAddress)); @@ -247,7 +255,7 @@ void d2h_copy(void* __restrict dst, const void* __restrict src, size_t n, throwing_result_check(hsa_amd_memory_lock(dst, n, &si.agentOwner, 1, const_cast(&dst)), - __FILE__, __func__, __LINE__); + __FILE__, __func__, __LINE__); do_copy(dst, src, n, si.agentOwner, si.agentOwner); } @@ -256,14 +264,20 @@ void d2h_copy(void* __restrict dst, const void* __restrict src, size_t n, inline void h2d_copy(void* __restrict dst, const void* __restrict src, size_t n, hsa_amd_pointer_info_t di) { - if (di.size == INT32_MAX) return do_std_memcpy(dst, src, n); - if (di.size == UINT32_MAX && n <= max_std_memcpy_sz) { + const auto si{info(const_cast(src))}; + const auto is_locked{si.type == HSA_EXT_POINTER_TYPE_LOCKED}; + + if (!is_locked && di.size == is_cpu_owned) { return do_std_memcpy(dst, src, n); } + if (!is_locked && is_large_BAR && n <= max_h2d_std_memcpy_sz) { + return do_std_memcpy(dst, src, n); + } + if (si.type == HSA_EXT_POINTER_TYPE_HSA) { + return do_copy(dst, src, n, di.agentOwner, di.agentOwner); + } - const auto si{info(const_cast(src))}; - - if (si.type == HSA_EXT_POINTER_TYPE_LOCKED) { + if (is_locked) { src = static_cast(si.agentBaseAddress) + (static_cast(src) - static_cast(si.hostBaseAddress)); @@ -280,7 +294,7 @@ void h2d_copy(void* __restrict dst, const void* __restrict src, size_t n, throwing_result_check(hsa_amd_memory_lock(const_cast(src), n, &di.agentOwner, 1, const_cast(&src)), - __FILE__, __func__, __LINE__); + __FILE__, __func__, __LINE__); do_copy(dst, src, n, di.agentOwner, di.agentOwner); } @@ -289,36 +303,23 @@ void h2d_copy(void* __restrict dst, const void* __restrict src, size_t n, inline void generic_copy(void* __restrict dst, const void* __restrict src, size_t n, hsa_amd_pointer_info_t di, hsa_amd_pointer_info_t si) { - if (di.size == INT32_MAX && si.size == INT32_MAX) { - return do_std_memcpy(dst, src, n); - } - if (di.size == UINT32_MAX && si.size == UINT32_MAX && - n <= max_std_memcpy_sz) { + if (di.size == is_cpu_owned && si.size == is_cpu_owned) { return do_std_memcpy(dst, src, n); } + if (di.size == is_cpu_owned) return d2h_copy(dst, src, n, si); + if (si.size == is_cpu_owned) return h2d_copy(dst, src, n, di); - switch (type(si.agentOwner)) { - case HSA_DEVICE_TYPE_GPU: - if (type(di.agentOwner) == HSA_DEVICE_TYPE_GPU) { - throwing_result_check( - hsa_amd_agents_allow_access( - 1u, &si.agentOwner, nullptr, di.agentBaseAddress), - __FILE__, __func__, __LINE__); - return do_copy(dst, src, n, di.agentOwner, si.agentOwner); - } - return d2h_copy(dst, src, n, si); - case HSA_DEVICE_TYPE_CPU: - if (type(di.agentOwner) == HSA_DEVICE_TYPE_CPU) { - return do_std_memcpy(dst, src, n); - } - return h2d_copy(dst, src, n, di); - default: throw std::runtime_error{"Unsupported copy type."}; - } + throwing_result_check(hsa_amd_agents_allow_access(1u, &si.agentOwner, + nullptr, + di.agentBaseAddress), + __FILE__, __func__, __LINE__); + + return do_copy(dst, src, n, di.agentOwner, si.agentOwner); } inline void memcpy_impl(void* __restrict dst, const void* __restrict src, size_t n, - hipMemcpyKind k) noexcept { + hipMemcpyKind k) { switch (k) { case hipMemcpyHostToHost: std::memcpy(dst, src, n); break; case hipMemcpyHostToDevice: return h2d_copy(dst, src, n, info(dst)); @@ -326,10 +327,10 @@ void memcpy_impl(void* __restrict dst, const void* __restrict src, size_t n, case hipMemcpyDeviceToDevice: { const auto di{info(dst)}; const auto si{info(src)}; - throwing_result_check( - hsa_amd_agents_allow_access( - 1u, &si.agentOwner, nullptr, di.agentBaseAddress), - __FILE__, __func__, __LINE__); + throwing_result_check(hsa_amd_agents_allow_access(1u, &si.agentOwner, + nullptr, + di.agentBaseAddress), + __FILE__, __func__, __LINE__); return do_copy(dst, src, n, di.agentOwner, si.agentOwner); } default: return generic_copy(dst, src, n, info(dst), info(src)); @@ -1271,7 +1272,7 @@ hipError_t hipMemcpyToSymbolAsync(void* dst, const void* src, size_t count, if (dst == nullptr) { return ihipLogStatus(hipErrorInvalidSymbol); } - + if (kind == hipMemcpyDeviceToHost || kind == hipMemcpyHostToHost) { return ihipLogStatus(hipErrorInvalidMemcpyDirection); } else if (kind == hipMemcpyDeviceToDevice) { @@ -1303,7 +1304,7 @@ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* src, size_t count, if (src == nullptr || dst == nullptr) { return ihipLogStatus(hipErrorInvalidSymbol); } - + if (kind == hipMemcpyHostToDevice || kind == hipMemcpyHostToHost) { return ihipLogStatus(hipErrorInvalidMemcpyDirection); } else if (kind == hipMemcpyDeviceToDevice) { From e747efe2b4ca131a99acd59d20a48a36b53bed36 Mon Sep 17 00:00:00 2001 From: jiabaxie <57198431+jiabaxie@users.noreply.github.com> Date: Fri, 28 Feb 2020 06:16:12 -0500 Subject: [PATCH 43/50] Cleaned up error messages for HipEnvVarDriver test (#1825) There were several error messages that appeared even if the hipEnvVarDriver.exe test passes and executes successfully. Now it is cleaned up. The following are those instances: * When popen searches for directed_test directory but does not find it, it outputs an error, then finds the hipEnvVar at the same level. Currently the fix will prompt the test to only output an error if both searches for hipEnvVar fails. * When assertion is used towards the later half of the test, conditions were set to specifically hide the devices, resulting in No Hip Device detected in the latter half of the test. The fix will make these errors not appear as they are intended to not find any devices. Assertions themselves are untouched. HipEnvVarDriver.cpp has also been refactored. Reading HipEnvVar will now happen in a helper function for getDeviceNumber and getDevicePCIBusNumRemote, as the code to read HipEnvVar were really similar in them. --- hipamd/tests/src/hipEnvVarDriver.cpp | 72 ++++++++++++++++------------ hipamd/tests/src/test_common.cpp | 2 + hipamd/tests/src/test_common.h | 5 ++ 3 files changed, 48 insertions(+), 31 deletions(-) diff --git a/hipamd/tests/src/hipEnvVarDriver.cpp b/hipamd/tests/src/hipEnvVarDriver.cpp index 07379f0878..c970cb7674 100644 --- a/hipamd/tests/src/hipEnvVarDriver.cpp +++ b/hipamd/tests/src/hipEnvVarDriver.cpp @@ -34,42 +34,52 @@ OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWA using namespace std; -const string directed_dir = "directed_tests" + string(PATH_SEPERATOR_STR) + "hipEnvVar"; -const string dir = "." + string(PATH_SEPERATOR_STR) + "hipEnvVar"; +const string directed_dir = string(".") + PATH_SEPERATOR_STR + "directed_tests" + PATH_SEPERATOR_STR + "hipEnvVar"; +const string dir = string(".") + PATH_SEPERATOR_STR + "hipEnvVar"; -int getDeviceNumber() { - char buff[512]; - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - FILE* in = popen((directed_dir + " -c").c_str(), "r"); - if(fgets(buff, 512, in) == NULL){ - pclose(in); - //Check at same level - in = popen((dir + " -c").c_str(), "r"); +int readHipEnvVar(string flags, char* buff){ + + std::cout << "\nFinding hipEnvVar in " << directed_dir << "...\n"; + FILE* directed_in = popen((directed_dir + flags).c_str(), "r"); + + if(fgets(buff, 512, directed_in) == NULL){ + std::cout << "Finding hipEnvVar in " << dir << "...\n"; + FILE* in = popen((dir + flags).c_str(), "r"); if(fgets(buff, 512, in) == NULL){ + pclose(directed_in); pclose(in); return 1; } + pclose(in); + } + std::cout << "hipEnvVar Found!\n"; + pclose(directed_in); + return 0; +} + +int getDeviceNumber(bool print_err=true) { + char buff[512]; + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + if (readHipEnvVar(string(" -c"), buff)){ + strncpy(buff, "1", 512); + if (print_err){ + std::cerr << "The system cannot find hipEnvVar, using 1 as number of devices\n"; + } + } + if (print_err) { + std::cout << buff; } - cout << buff; - pclose(in); return atoi(buff); } // Query the current device ID remotely to hipEnvVar void getDevicePCIBusNumRemote(int deviceID, char* pciBusID) { std::this_thread::sleep_for(std::chrono::milliseconds(10)); - FILE* in = popen((directed_dir + " -d " + std::to_string(deviceID)).c_str(), "r"); - if(fgets(pciBusID, 100, in) == NULL){ - pclose(in); - //Check at same level - in = popen((dir + " -d").c_str(), "r"); - if(fgets(pciBusID, 100, in) == NULL){ - pclose(in); - return; - } + if (readHipEnvVar((" -d " + std::to_string(deviceID)), pciBusID)){ + std::cerr << "The system cannot find hipEnvVar\n"; } cout << pciBusID; - pclose(in); return; } @@ -78,15 +88,15 @@ void getDevicePCIBusNum(int deviceID, char* pciBusID) { hipDevice_t deviceT; hipDeviceGet(&deviceT, deviceID); - memset(pciBusID, 0, 100); - hipDeviceGetPCIBusId(pciBusID, 100, deviceT); + memset(pciBusID, 0, 512); + hipDeviceGetPCIBusId(pciBusID, 512, deviceT); } int main() { unsetenv(HIP_VISIBLE_DEVICES_STR); unsetenv(CUDA_VISIBLE_DEVICES_STR); std::vector devPCINum; - char pciBusID[100]; + char pciBusID[512]; // collect the device pci bus ID for all devices int totalDeviceNum = getDeviceNumber(); std::cout << "The total number of available devices is " << totalDeviceNum << std::endl @@ -116,27 +126,27 @@ int main() { // check when set an invalid device number setenv("HIP_VISIBLE_DEVICES", "1000,0,1", 1); setenv("CUDA_VISIBLE_DEVICES", "1000,0,1", 1); - assert(getDeviceNumber() == 0); + assert(getDeviceNumber(false) == 0); if (totalDeviceNum > 2) { setenv("HIP_VISIBLE_DEVICES", "0,1,1000,2", 1); setenv("CUDA_VISIBLE_DEVICES", "0,1,1000,2", 1); - assert(getDeviceNumber() == 2); + assert(getDeviceNumber(false) == 2); setenv("HIP_VISIBLE_DEVICES", "0,1,2", 1); setenv("CUDA_VISIBLE_DEVICES", "0,1,2", 1); - assert(getDeviceNumber() == 3); + assert(getDeviceNumber(false) == 3); // test if CUDA_VISIBLE_DEVICES will be accepted by the runtime unsetenv(HIP_VISIBLE_DEVICES_STR); unsetenv(CUDA_VISIBLE_DEVICES_STR); setenv("CUDA_VISIBLE_DEVICES", "0,1,2", 1); - assert(getDeviceNumber() == 3); + assert(getDeviceNumber(false) == 3); } setenv("HIP_VISIBLE_DEVICES", "-100,0,1", 1); setenv("CUDA_VISIBLE_DEVICES", "-100,0,1", 1); - assert(getDeviceNumber() == 0); + assert(getDeviceNumber(false) == 0); std::cout << "PASSED" << std::endl; return 0; -} +} \ No newline at end of file diff --git a/hipamd/tests/src/test_common.cpp b/hipamd/tests/src/test_common.cpp index e7a2622662..1c0dcc8c34 100644 --- a/hipamd/tests/src/test_common.cpp +++ b/hipamd/tests/src/test_common.cpp @@ -37,10 +37,12 @@ int p_tests = -1; /*which tests to run. Interpretation is left to each test. de const char* HIP_VISIBLE_DEVICES_STR = "HIP_VISIBLE_DEVICES="; const char* CUDA_VISIBLE_DEVICES_STR = "CUDA_VISIBLE_DEVICES="; const char* PATH_SEPERATOR_STR = "\\"; +const char* NULL_DEVICE = "NUL:"; #else const char* HIP_VISIBLE_DEVICES_STR = "HIP_VISIBLE_DEVICES"; const char* CUDA_VISIBLE_DEVICES_STR = "CUDA_VISIBLE_DEVICES"; const char* PATH_SEPERATOR_STR = "/"; +const char* NULL_DEVICE = "/dev/null"; #endif namespace HipTest { diff --git a/hipamd/tests/src/test_common.h b/hipamd/tests/src/test_common.h index 426ea846b1..7d8c39e74c 100644 --- a/hipamd/tests/src/test_common.h +++ b/hipamd/tests/src/test_common.h @@ -105,6 +105,10 @@ THE SOFTWARE. #define pclose(x) _pclose(x) #define setenv(x,y,z) _putenv_s(x,y) #define unsetenv _putenv +#define fileno(x) _fileno(x) +#define dup(x) _dup(x) +#define dup2(x,y) _dup2(x,y) +#define close(x) _close(x) #else #define aligned_free(x) free(x) #endif @@ -124,6 +128,7 @@ extern int p_tests; extern const char* HIP_VISIBLE_DEVICES_STR; extern const char* CUDA_VISIBLE_DEVICES_STR; extern const char* PATH_SEPERATOR_STR; +extern const char* NULL_DEVICE; // ********************* CPP section ********************* #ifdef __cplusplus From 1c794045e026411ca054aa2317744b47cdbb973f Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Fri, 28 Feb 2020 03:16:55 -0800 Subject: [PATCH 44/50] Add hipDrvOccupancyMaxActiveBlocksPerMultiprocessor[WithFlags] (#1854) Equivalent to cuOccupancyMaxActiveBlocksPerMultiprocessor[WithFlags]. --- ...A_Driver_API_functions_supported_by_HIP.md | 8 +++--- .../src/CUDA2HIP_Driver_API_functions.cpp | 4 +-- .../include/hip/hcc_detail/hip_runtime_api.h | 25 ++++++++++++++++++- hipamd/src/hip_module.cpp | 18 +++++++++++++ 4 files changed, 48 insertions(+), 7 deletions(-) diff --git a/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md b/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md index 8956533ed0..e77997b0b6 100644 --- a/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md +++ b/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md @@ -1126,10 +1126,10 @@ | **CUDA** | **HIP** |**CUDA version\***| |-----------------------------------------------------------|---------------------------------------------------------|------------------| -| `cuOccupancyMaxActiveBlocksPerMultiprocessor` |`hipOccupancyMaxActiveBlocksPerMultiprocessor` | -| `cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags` |`hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags` | -| `cuOccupancyMaxPotentialBlockSize` |`hipOccupancyMaxPotentialBlockSize` | -| `cuOccupancyMaxPotentialBlockSizeWithFlags` | | +| `cuOccupancyMaxActiveBlocksPerMultiprocessor` |`hipDrvOccupancyMaxActiveBlocksPerMultiprocessor` | +| `cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags` |`hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags` | +| `cuOccupancyMaxPotentialBlockSize` |`hipOccupancyMaxPotentialBlockSize` | +| `cuOccupancyMaxPotentialBlockSizeWithFlags` | | ## **22. Texture Reference Management [DEPRECATED]** diff --git a/hipamd/hipify-clang/src/CUDA2HIP_Driver_API_functions.cpp b/hipamd/hipify-clang/src/CUDA2HIP_Driver_API_functions.cpp index 5d700631c4..ab07a10e93 100644 --- a/hipamd/hipify-clang/src/CUDA2HIP_Driver_API_functions.cpp +++ b/hipamd/hipify-clang/src/CUDA2HIP_Driver_API_functions.cpp @@ -545,9 +545,9 @@ const std::map CUDA_DRIVER_FUNCTION_MAP{ // 5.21. Occupancy // cudaOccupancyMaxActiveBlocksPerMultiprocessor - {"cuOccupancyMaxActiveBlocksPerMultiprocessor", {"hipOccupancyMaxActiveBlocksPerMultiprocessor", "", CONV_OCCUPANCY, API_DRIVER}}, + {"cuOccupancyMaxActiveBlocksPerMultiprocessor", {"hipDrvOccupancyMaxActiveBlocksPerMultiprocessor", "", CONV_OCCUPANCY, API_DRIVER}}, // cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags - {"cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", {"hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "", CONV_OCCUPANCY, API_DRIVER}}, + {"cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", {"hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", "", CONV_OCCUPANCY, API_DRIVER}}, // cudaOccupancyMaxPotentialBlockSize {"cuOccupancyMaxPotentialBlockSize", {"hipOccupancyMaxPotentialBlockSize", "", CONV_OCCUPANCY, API_DRIVER}}, // cudaOccupancyMaxPotentialBlockSizeWithFlags diff --git a/hipamd/include/hip/hcc_detail/hip_runtime_api.h b/hipamd/include/hip/hcc_detail/hip_runtime_api.h index 76209ef6a7..2c6726c161 100644 --- a/hipamd/include/hip/hcc_detail/hip_runtime_api.h +++ b/hipamd/include/hip/hcc_detail/hip_runtime_api.h @@ -2959,7 +2959,18 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( * @brief Returns occupancy for a device function. * * @param [out] numBlocks Returned occupancy - * @param [in] func Kernel function for which occupancy is calulated + * @param [in] func Kernel function (hipFunction) for which occupancy is calulated + * @param [in] blockSize Block size the kernel is intended to be launched with + * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block + */ +hipError_t hipDrvOccupancyMaxActiveBlocksPerMultiprocessor( + int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk); + +/** + * @brief Returns occupancy for a device function. + * + * @param [out] numBlocks Returned occupancy + * @param [in] f Kernel function for which occupancy is calulated * @param [in] blockSize Block size the kernel is intended to be launched with * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block * @param [in] flags Extra flags for occupancy calculation (currently ignored) @@ -2967,6 +2978,18 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags __dparm(hipOccupancyDefault)); +/** + * @brief Returns occupancy for a device function. + * + * @param [out] numBlocks Returned occupancy + * @param [in] f Kernel function(hipFunction_t) for which occupancy is calulated + * @param [in] blockSize Block size the kernel is intended to be launched with + * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block + * @param [in] flags Extra flags for occupancy calculation (currently ignored) + */ +hipError_t hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags); + #if __HIP_VDI__ && !defined(__HCC__) /** * @brief Launches kernels on multiple devices and guarantees all specified kernels are dispatched diff --git a/hipamd/src/hip_module.cpp b/hipamd/src/hip_module.cpp index 44f0f108a6..a88abba9cb 100644 --- a/hipamd/src/hip_module.cpp +++ b/hipamd/src/hip_module.cpp @@ -1471,6 +1471,15 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( tls, numBlocks, f, blockSize, dynSharedMemPerBlk)); } +hipError_t hipDrvOccupancyMaxActiveBlocksPerMultiprocessor( + int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk) +{ + HIP_INIT_API(hipDrvOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynSharedMemPerBlk); + + return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor( + tls, (uint32_t*) numBlocks, f, blockSize, dynSharedMemPerBlk)); +} + hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags) @@ -1481,6 +1490,15 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( tls, numBlocks, f, blockSize, dynSharedMemPerBlk)); } +hipError_t hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk, + unsigned int flags) +{ + HIP_INIT_API(hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynSharedMemPerBlk, flags); + return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor( + tls, (uint32_t*) numBlocks, f, blockSize, dynSharedMemPerBlk)); +} + hipError_t hipLaunchKernel( const void* func_addr, dim3 numBlocks, dim3 dimBlocks, void** args, size_t sharedMemBytes, hipStream_t stream) From 6be7537cf92b950249057dbb44883dc0ed387a66 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Fri, 28 Feb 2020 03:17:15 -0800 Subject: [PATCH 45/50] Remove deprecated HIP markers (#1876) --- hipamd/CMakeLists.txt | 20 -- hipamd/bin/hipcc | 17 -- hipamd/docs/markdown/hip_profiling.md | 279 ------------------ hipamd/docs/markdown/hip_tracing.md | 72 +++++ hipamd/include/hip/hip_profile.h | 18 -- hipamd/packaging/hip-hcc.txt | 12 +- hipamd/samples/2_Cookbook/2_Profiler/Makefile | 53 ---- .../2_Cookbook/2_Profiler/MatrixTranspose.cpp | 219 -------------- .../samples/2_Cookbook/2_Profiler/Readme.md | 47 --- hipamd/src/hip_hcc.cpp | 44 +-- hipamd/src/hip_hcc_internal.h | 41 +-- 11 files changed, 77 insertions(+), 745 deletions(-) delete mode 100644 hipamd/docs/markdown/hip_profiling.md create mode 100644 hipamd/docs/markdown/hip_tracing.md delete mode 100644 hipamd/samples/2_Cookbook/2_Profiler/Makefile delete mode 100644 hipamd/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp delete mode 100644 hipamd/samples/2_Cookbook/2_Profiler/Readme.md diff --git a/hipamd/CMakeLists.txt b/hipamd/CMakeLists.txt index 3a77735865..07d23130c2 100644 --- a/hipamd/CMakeLists.txt +++ b/hipamd/CMakeLists.txt @@ -197,19 +197,6 @@ if (NOT CPACK_SET_DESTDIR) set(CPACK_PACKAGING_INSTALL_PREFIX "/opt/rocm/hip" CACHE PATH "Default installation path of hcc installer package") endif (CPACK_SET_DESTDIR) -# Check if we need to enable ATP marker -if(NOT DEFINED COMPILE_HIP_ATP_MARKER) - if(NOT DEFINED ENV{COMPILE_HIP_ATP_MARKER}) - set(COMPILE_HIP_ATP_MARKER 0) - else() - set(COMPILE_HIP_ATP_MARKER $ENV{COMPILE_HIP_ATP_MARKER}) - message(WARNING "HIP Markers are deprecated, please use roctracer/rocTX marker APIs.") - endif() -else() - message(WARNING "HIP Markers are deprecated, please use roctracer/rocTX marker APIs.") -endif() -add_to_config(_buildInfo COMPILE_HIP_ATP_MARKER) - ############################# # Profiling API support ############################# @@ -267,10 +254,6 @@ endif () if(HIP_PLATFORM STREQUAL "hcc") include_directories(${PROJECT_SOURCE_DIR}/include) set(HIP_HCC_BUILD_FLAGS) - if(COMPILE_HIP_ATP_MARKER) - include_directories(/opt/rocm/profiler/CXLActivityLogger/include) - set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -DCOMPILE_HIP_ATP_MARKER=1") - endif() # Add HIP_VERSION to CMAKE__FLAGS set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -DHIP_VERSION_MAJOR=${HIP_VERSION_MAJOR} -DHIP_VERSION_MINOR=${HIP_VERSION_MINOR} -DHIP_VERSION_PATCH=${HIP_VERSION_GITDATE}") @@ -321,9 +304,6 @@ if(HIP_PLATFORM STREQUAL "hcc") set (CMAKE_BUILD_WITH_INSTALL_RPATH TRUE ) set (CMAKE_SKIP_BUILD_RPATH TRUE ) endif () - if(COMPILE_HIP_ATP_MARKER) - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -L/opt/rocm/profiler/CXLActivityLogger/bin/x86_64 -lCXLActivityLogger") - endif() add_library(hip_hcc SHARED ${SOURCE_FILES_RUNTIME}) add_library(hip_hcc_static STATIC ${SOURCE_FILES_RUNTIME}) diff --git a/hipamd/bin/hipcc b/hipamd/bin/hipcc index 22923e5ec4..06a3f9e385 100755 --- a/hipamd/bin/hipcc +++ b/hipamd/bin/hipcc @@ -241,9 +241,6 @@ if ($HIP_PLATFORM eq "clang") { $HCC_VERSION_MAJOR=$HCC_VERSION; $HCC_VERSION_MAJOR=~s/\..*//; - $HIP_ATP_MARKER=$ENV{'HIP_ATP_MARKER'} // 1; - $marker_path = "$ROCM_PATH/profiler/CXLActivityLogger"; - # HCC* may be used to compile src/hip_hcc.o (and also feed the HIPCXXFLAGS below) $HCC = "$HCC_HOME/bin/hcc"; $HCCFLAGS = "-hc -D__HIPCC__ -isystem $HCC_HOME/include "; @@ -288,20 +285,6 @@ if ($HIP_PLATFORM eq "clang") { $HIPLDFLAGS .= " -L$HSA_PATH/lib -L$ROCM_PATH/lib -lhsa-runtime64 -lhc_am "; # $HIPLDFLAGS .= " -L$HCC_HOME/compiler/lib -lLLVMAMDGPUDesc -lLLVMAMDGPUUtils -lLLVMMC -lLLVMCore -lLLVMSupport "; - # Add trace marker library: - # TODO - once we cleanly separate the HIP API headers from HIP library headers this logic should move to CMakebuild option - apps do not need to see the marker library. - if ($HIP_ATP_MARKER) { - $marker_inc_path = "$marker_path/include"; - if (-e $marker_inc_path) { - $HIPCXXFLAGS .= " -isystem $marker_inc_path"; - } - } - - $marker_lib_path = "$marker_path/bin/x86_64"; - if (-e $marker_lib_path) { - $HIPLDFLAGS .= " -L$marker_lib_path -lCXLActivityLogger -Wl,--rpath=$marker_lib_path"; - } - if (not $isWindows) { $HIPLDFLAGS .= " -lm"; } diff --git a/hipamd/docs/markdown/hip_profiling.md b/hipamd/docs/markdown/hip_profiling.md deleted file mode 100644 index 28ed37e321..0000000000 --- a/hipamd/docs/markdown/hip_profiling.md +++ /dev/null @@ -1,279 +0,0 @@ -# Profiling HIP Code - -This section describes the profiling and debugging capabilities that HIP provides. -Profiling information can viewed in the CodeXL visualization tool or printed directly to stderr as the application runs. -This document starts with some of the general capabilities of CodeXL and then describes some of the additional HIP marker and debug features. - - - -- [CodeXL Profiling](#codexl-profiling) - * [Collecting and Viewing Traces](#collecting-and-viewing-traces) - + [Using rocm-profiler timestamp profiling](#using-rocm-profiler-timestamp-profiling) - + [Using rocm-profiler performance counter collection:](#using-rocm-profiler-performance-counter-collection) - + [Using CodeXL to view profiling results:](#using-codexl-to-view-profiling-results) - + [More information on CodeXL](#more-information-on-codexl) - * [HIP Markers](#hip-markers) - + [Profiling HIP APIs](#profiling-hip-apis) - + [Adding markers to applications](#adding-markers-to-applications) - * [Additional HIP Profiling Features](#additional-hip-profiling-features) - + [Demangling C++ Kernel Names](#demangling-c-kernel-names) - + [Controlling when profiling starts and ends](#controlling-when-profiling-starts-and-ends) - + [Reducing timeline trace output file size](#reducing-timeline-trace-output-file-size) - + [How to enable profiling at HIP build time](#how-to-enable-profiling-at-hip-build-time) -- [Tracing and Debug](#tracing-and-debug) - * [Tracing HIP APIs](#tracing-hip-apis) - + [Color](#color) - - - -## CodeXL Profiling - -### Collecting and Viewing Traces - -#### Using rocm-profiler timestamp profiling -rocm-profiler is a command-line tool for tracing any application that uses ROCr API, including HCC and HIP. -rocm-profiler's timeline trace will show the beginning and end for all kernel commands, data transfer commands, and HSA Runtime (ROCr) API calls. The trace results are saved into a file, which by convention uses the "atp" extension. Here is an example that shows how to run the command-line profiler: -```shell -$ /opt/rocm/bin/rocm-profiler -o -A -T -``` - -#### Using rocm-profiler performance counter collection: -rocm-profiler can record performance counter information to provide greater insight inside a kernel, such as the memory bandwidth, ALU busy percentage, and cache statistics. -Collecting the common set of useful counters requires passing the counter configuration files for two passes: -``` -$ /opt/rocm/bin/rocm-profiler -C -O --counterfile /opt/rocm/profiler/counterfiles/counters_HSA_Fiji_pass1 --counterfile /opt/rocm/profiler/counterfiles/counters_HSA_Fiji_pass2 -``` - - -#### Using CodeXL to view profiling results: -The trace can be loaded and viewed in the CodeXL visualization tool: - -- Open the CodeXL GUI, create an new project, and switch to "Profile Mode": - - $ CodeXL & - - [File->New Project, leave fields as is, just click "OK"] - - [Profile->Switch to Profile Mode] -- Load timestamp tracing results into a timeline view: - - Right click on the project in the CodeXL Explorer view - - Click "Import Session..." - - Select to $HOME/apitrace.atp (or appropriate .atp file if you used another file name) - -- Load the performance counter results - - Right click on the project in the CodeXL Explorer view - - Click "Import Session..." - - Select $HOME/Session1.csv (or appropriate .csv file if you used another file name) - - -#### More information on CodeXL -rocm-profiler --help will show additional options and usage guidelines. - -See this [blog](http://gpuopen.com/getting-up-to-speed-with-the-codexl-gpu-profiler-and-radeon-open-compute/) for more information on profiling ROCm apps (including HIP) with CodeXL. - -The 2.2 version of Windows CodeXL does not correctly handle Linux line-endings. If you are collecting a trace on Linux and then viewing it with the 2.2 Windows CodeXL, first convert the line ending in the .atp file to Windows-style line endings. - -### HIP Markers -#### Profiling HIP APIs -HIP can generate markers at function beginning and end which are displayed on the CodeXL timeline view. -HIP 1.0 compiles marker support by default, and you can enable it by setting the HIP_PROFILE_API environment variable and then running the rocm-profiler: - -```shell - -# Use profile to generate timeline view: -export HIP_PROFILE_API=1 -$ /opt/rocm/bin/rocm-profiler -A -T - -Or -$ /opt/rocm/bin/rocm-profiler -e HIP_PROFILE_API=1 -A -T -``` - -HIP_PROFILE_API supports two levels of information. -- HIP_PROFILE_API=1 : Short format. Print name of API but no arguments. For example: -`hipMemcpy` -- HIP_PROFILE_API=2 : Long format. Print name of API + values of all function arguments. For example: -`hipMemcpy (0x7f32154db010, 0x50446e000, 4000000, hipMemcpyDeviceToHost)` - -#### Adding markers to applications - -Markers can be used to define application-specific events that will be recorded in the ATP file and displayed in the CodeXL GUI. -This can be particularly useful for visualizing how the higher-level phases of application behavior relate to the lower level HIP APIs, kernel launches, and data transfers. -For example, an instrumented machine learning framework could show the beginning and ending of each layer in the network. - -Markers have a specific begin and end time, and can be nested. Nested calls are displayed hierarchically in the CodeXL GUI, with each level of the hierarchy occupying a different row. - -The HIP APis are defined in "hip_profile.h": -``` -#include - -HIP_BEGIN_MARKER(const char *markerName, const char *groupName); -HIP_END_MARKER(); - -HIP_BEGIN_MARKER("Setup", "MyAppGroup"); -// ... -// application code for setup -// ... -HIP_END_MARKER(); -``` - -For C++ codes, HIP also provides a scoped marker which records the start time when constructed and the end time when the scoped marker is destructed at the end of the scope. This provides a convenient, single-line mechanism to record an event that neatly corresponds to a region of code. - -```cxx -void FunctionFoo(...) -{ - HIP_SCOPED_MARKER("FunctionFoo", "MyAppGroup"); // Marker starts recording here. - - // ... - // Function implementation - // ... - - // Marker destroyed here and records end time stamp. -}; -``` - -The HIP marker API is only supported on ROCm platform. The marker macros are defined on CUDA platforms and will compile, but are silently ignored at runtime. - -This [HIP sample](https://github.com/ROCm-Developer-Tools/HIP/tree/master/samples/2_Cookbook/2_Profiler) shows the profiler marker API used in a small application. - -More information on the marker API can be found in the profiler header file and PDF in a ROCm installation: -- /opt/rocm/profiler/CXLActivityLogger/include/CXLActivityLogger.h -- /opt/rocm/profiler/CXLActivityLogger/doc/CXLActivityLogger.pdf - -### Additional HIP Profiling Features -#### Demangling C++ Kernel Names -HIP includes the `hipdemangleatp` tool which can post-process an ATP file to "demangle" C++ names. -Mangled kernel names encode the C++ arguments and other information, and are guaranteed to be unique even for cases such as operator overloading. However, the mangled names can be quite verbose. For example: - -`ZZ39gemm_NoTransA_MICRO_NBK_M_N_K_TS16XMTS4RN2hc16accelerator_viewEPKflS3_lPfliiiiiiffEN3_EC__719__cxxamp_trampolineElililiiiiiiS3_iS3_S4_ff` - -`hipdemangleatp` will convert this into the more readable: -`gemm_NoTransA_MICRO_NBK_M_N_K_TS16XMTS4` - -The `hipdemangleatp` tool operates on the ATP file "in-place" and thus replaces the input file with the demangled version. - -``` -$ hipdemangleatp myfile.atp -``` - -The kernel name is also shown in some of the summary htlm files (Top10 kernels). These can be regenerated from the demangled ATP file by re-running rocm-profiler: -``` -$ rocm-profiler -T --atpfile myfile.atp -``` - -A future version of CodeXL may directly integrate demangle functionality. - - -#### Controlling when profiling starts and ends -hipProfilerStart() and hipProfilerEnd() can be inserted into an application to control which phases of the applications are profiled. -These APIs can be used to skip initialization code or to focus profiling on a desired region, and are particularly useful for large long-running applications. -See the API documentation for more information. These APIs work on both ROCm and CUDA paths. - -On ROCm, the following environment variables can be used to control when profiling occurs: - -``` -HIP_DB_START_API : Comma-separated list of tid.api_seq_num for when to start debug and profiling. -HIP_DB_STOP_API : Comma-separated list of tid.api_seq_num for when to stop debug and profiling. -``` - -HIP/ROCm assigns a monotonically increasing sequence number to the APIs called from each thread. The thread and API sequence number can be used in the above API to control when tracing starts and stops. These flags also control the HIP_DB messages (described below). - -When using these options, start the profiler with profiling disabled: -``` -# ROCm: -$ rocm-profiler --startdisabled ... - -# CUDA: -$ nvprof --profile-from-start-off ... -``` - -This feature is under development. - -#### Reducing timeline trace output file size -If the application is already recording the HIP APIs, the HSA APIs are somewhat redundant and the ATP file size can be substantially reduced by not recording these APIs. HIP includes a text file that lists all of the HSA APIs and can assist in this filtering: - -``` -$ rocm-profiler -F hip/bin/hsa-api-filter-cxl.txt -``` - -This file can be copied and edited to provide more selective HSA event recording. - - -#### How to enable profiling at HIP build time -Pre-built packages of HIP are not built with profiling support enabled.You must enable marker support manually when compiling HIP. - -1. Build HIP with ATP markers enabled -HIP pre-built packages are enabled with ATP marker support by default. -To enable ATP marker support when building HIP from source, use the option ```-DCOMPILE_HIP_ATP_MARKER=1``` during the cmake configure step. Build and install HIP. -```shell -$ mkdir build && cd build -$ cmake .. -DCOMPILE_HIP_ATP_MARKER -$ make install -``` - -2. Install ROCm-Profiler -Installing HIP from the [rocm](http://gpuopen.com/getting-started-with-boltzmann-components-platforms-installation/) pre-built packages, installs the ROCm-Profiler as well. -Alternatively, you can build ROCm-Profiler using the instructions [here](https://github.com/RadeonOpenCompute/ROCm-Profiler#building-the-rocm-profiler). - -3. Recompile the target application - -Then follow the steps above to collect a marker-enabled trace. - - -## Tracing and Debug - -### Tracing HIP APIs -The HIP runtime can print the HIP function strings to stderr using HIP_TRACE_API environment variable. -The trace prints two messages for each API - one at the beginning of the API call (line starts with "<<") and one at the end of the API call (line ends with ">>"). -Here's an example for one API followed by a description for the sections of the trace: - -``` -<> -``` - -- `<> -info: running on device gfx803 -info: allocate host mem ( 7.63 MB) -info: allocate device mem ( 7.63 MB) -<> -<> -info: copy Host2Device -<> -info: launch 'vector_square' kernel -1.5 hipLaunchKernel 'HIP_KERNEL_NAME(vector_square)' gridDim:{512,1,1} groupDim:{256,1,1} sharedMem:+0 stream#0.0 -info: copy Device2Host -<> -info: check result -PASSED! -``` - -HIP_TRACE_API supports multiple levels of debug information: - - 0x1 = print all HIP APIs. This is the most verbose setting; the flags below allow selecting a subset. - - 0x2 = print HIP APIs which initiate GPU kernel commands. Includes hipLaunchKernel, hipLaunchModuleKernel - - 0x4 = print HIP APIs which initiate GPU memory commands. Includes hipMemcpy*, hipMemset*. - - 0x8 = print HIP APIs which allocate or free memory. Includes hipMalloc, hipHostMalloc, hipFree, hipHostFree. - -These can be combined. For example, HIP_TRACE_API=6 shows a concise view of the HIP commands (both kernel and memory) that are sent to the GPU. - - -#### Color -Note this trace mode uses colors. "less -r" can handle raw control characters and will display the debug output in proper colors. -You can change the color used for the trace mode with the HIP_TRACE_API_COLOR environment variable. Possible values are None/Red/Green/Yellow/Blue/Magenta/Cyan/White. -None will disable use of color control codes for both the opening and closing and may be useful when saving the trace file or when a pure text trace is desired. - - - diff --git a/hipamd/docs/markdown/hip_tracing.md b/hipamd/docs/markdown/hip_tracing.md new file mode 100644 index 0000000000..40513f4e3c --- /dev/null +++ b/hipamd/docs/markdown/hip_tracing.md @@ -0,0 +1,72 @@ +# Profiling HIP Code + +This section describes the tracing and debugging capabilities that HIP provides. + + +- [Tracing and Debug](#tracing-and-debug) + * [Tracing HIP APIs](#tracing-hip-apis) + + [Color](#color) + + + +## Tracing and Debug + +### Tracing HIP APIs +The HIP runtime can print the HIP function strings to stderr using HIP_TRACE_API environment variable. +The trace prints two messages for each API - one at the beginning of the API call (line starts with "<<") and one at the end of the API call (line ends with ">>"). +Here's an example for one API followed by a description for the sections of the trace: + +``` +<> +``` + +- `<> +info: running on device gfx803 +info: allocate host mem ( 7.63 MB) +info: allocate device mem ( 7.63 MB) +<> +<> +info: copy Host2Device +<> +info: launch 'vector_square' kernel +1.5 hipLaunchKernel 'HIP_KERNEL_NAME(vector_square)' gridDim:{512,1,1} groupDim:{256,1,1} sharedMem:+0 stream#0.0 +info: copy Device2Host +<> +info: check result +PASSED! +``` + +HIP_TRACE_API supports multiple levels of debug information: + - 0x1 = print all HIP APIs. This is the most verbose setting; the flags below allow selecting a subset. + - 0x2 = print HIP APIs which initiate GPU kernel commands. Includes hipLaunchKernel, hipLaunchModuleKernel + - 0x4 = print HIP APIs which initiate GPU memory commands. Includes hipMemcpy*, hipMemset*. + - 0x8 = print HIP APIs which allocate or free memory. Includes hipMalloc, hipHostMalloc, hipFree, hipHostFree. + +These can be combined. For example, HIP_TRACE_API=6 shows a concise view of the HIP commands (both kernel and memory) that are sent to the GPU. + + +#### Color +Note this trace mode uses colors. "less -r" can handle raw control characters and will display the debug output in proper colors. +You can change the color used for the trace mode with the HIP_TRACE_API_COLOR environment variable. Possible values are None/Red/Green/Yellow/Blue/Magenta/Cyan/White. +None will disable use of color control codes for both the opening and closing and may be useful when saving the trace file or when a pure text trace is desired. + + + diff --git a/hipamd/include/hip/hip_profile.h b/hipamd/include/hip/hip_profile.h index 95224af4a3..ff18239e44 100644 --- a/hipamd/include/hip/hip_profile.h +++ b/hipamd/include/hip/hip_profile.h @@ -1,16 +1,13 @@ /* Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. - Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -23,23 +20,8 @@ THE SOFTWARE. #ifndef HIP_INCLUDE_HIP_HIP_PROFILE_H #define HIP_INCLUDE_HIP_HIP_PROFILE_H -#warning "HIP Profiling through markers is deprecated, please check roctrace/rocTX support." - -#if not defined(ENABLE_HIP_PROFILE) -#define ENABLE_HIP_PROFILE 1 -#endif - -#if defined(__HIP_PLATFORM_HCC__) and (ENABLE_HIP_PROFILE == 1) -#warning "HIP Markers are deprecated and would be removed soon." -#include -#define HIP_SCOPED_MARKER(markerName, group) \ - amdtScopedMarker __scopedMarker(markerName, group, nullptr); -#define HIP_BEGIN_MARKER(markerName, group) amdtBeginMarker(markerName, group, nullptr); -#define HIP_END_MARKER() amdtEndMarker(); -#else #define HIP_SCOPED_MARKER(markerName, group) #define HIP_BEGIN_MARKER(markerName, group) #define HIP_END_MARKER() -#endif #endif diff --git a/hipamd/packaging/hip-hcc.txt b/hipamd/packaging/hip-hcc.txt index 63f3d73e67..21e138e1ed 100644 --- a/hipamd/packaging/hip-hcc.txt +++ b/hipamd/packaging/hip-hcc.txt @@ -36,11 +36,7 @@ set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR} set(CPACK_GENERATOR "TGZ;DEB;RPM") set(CPACK_BINARY_DEB "ON") set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm") -if(@COMPILE_HIP_ATP_MARKER@) - set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}), ${HCC_PACKAGE_NAME} (= @HCC_PACKAGE_VERSION@), rocm-profiler, comgr (>= 1.1)") -else() - set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}), ${HCC_PACKAGE_NAME} (= @HCC_PACKAGE_VERSION@), comgr (>= 1.1)") -endif() +set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}), ${HCC_PACKAGE_NAME} (= @HCC_PACKAGE_VERSION@), comgr (>= 1.1)") set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip_hcc") set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_hcc") set(CPACK_DEBIAN_PACKAGE_CONFLICTS "hip_hcc") @@ -50,11 +46,7 @@ set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst") set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION}) -if(@COMPILE_HIP_ATP_MARKER@) - set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, rocm-profiler, comgr >= 1.1") -else() - set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, comgr >= 1.1") -endif() +set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, comgr >= 1.1") set(CPACK_RPM_PACKAGE_OBSOLETES "hip_hcc") set(CPACK_RPM_PACKAGE_CONFLICTS "hip_hcc") set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") diff --git a/hipamd/samples/2_Cookbook/2_Profiler/Makefile b/hipamd/samples/2_Cookbook/2_Profiler/Makefile deleted file mode 100644 index db2d008182..0000000000 --- a/hipamd/samples/2_Cookbook/2_Profiler/Makefile +++ /dev/null @@ -1,53 +0,0 @@ -HIP_PATH?= $(wildcard /opt/rocm/hip) - -HIPCC=$(HIP_PATH)/bin/hipcc - - -HIPPROFILER=/opt/rocm/bin/rocm-profiler -PROFILER_OPT=-A -o MT.atp -e HIP_PROFILE_API=1 -HIPPROFILER_POST_CMD=$(HIP_PATH)/bin/hipdemangleatp MT.atp - -TARGET=hcc - -SOURCES = MatrixTranspose.cpp -OBJECTS = $(SOURCES:.cpp=.o) - -EXECUTABLE=./MatrixTranspose - -.PHONY: test - - -all: $(EXECUTABLE) profile - - - -OPT =-g -CXXFLAGS =$(OPT) -CXX=$(HIPCC) - - -$(EXECUTABLE): $(OBJECTS) - $(HIPCC) $(OBJECTS) -o $@ - - -profile: $(EXECUTABLE) - $(HIPPROFILER) $(PROFILER_OPT) $(EXECUTABLE) - $(HIPPROFILER_POST_CMD) - - -# Pass option to control start and stop iterations for profiling - see MatrixTranspose.cpp for implementation: -# Note we start profiler in --startdisabled mode - no timing collected until app enabled it via hipProfilerStart() -profile_trigger: $(EXECUTABLE) - $(HIPPROFILER) $(PROFILER_OPT) --startdisabled $(EXECUTABLE) 3 6 - $(HIPPROFILER_POST_CMD) - - -run: $(EXECUTABLE) - $(EXECUTABLE) - - -clean: - rm -f $(EXECUTABLE) - rm -f $(OBJECTS) - rm -f $(HIP_PATH)/src/*.o - diff --git a/hipamd/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp b/hipamd/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp deleted file mode 100644 index 69266e1288..0000000000 --- a/hipamd/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp +++ /dev/null @@ -1,219 +0,0 @@ -/* -Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include - -// hip header file -#include "hip/hip_runtime.h" -#include "hip/hip_profile.h" - -#define WIDTH 1024 - -#define NUM (WIDTH * WIDTH) - -#define THREADS_PER_BLOCK_X 4 -#define THREADS_PER_BLOCK_Y 4 -#define THREADS_PER_BLOCK_Z 1 - -#define ITERATIONS 10 - -// Cmdline parms to control start and stop triggers -int startTriggerIteration = -1; -int stopTriggerIteration = -1; - -// Device (Kernel) function, it must be void -__global__ void matrixTranspose(float* out, float* in, const int width) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; - - out[y * width + x] = in[x * width + y]; -} - -// CPU implementation of matrix transpose -void matrixTransposeCPUReference(float* output, float* input, const unsigned int width) { - for (unsigned int j = 0; j < width; j++) { - for (unsigned int i = 0; i < width; i++) { - output[i * width + j] = input[j * width + i]; - } - } -} - - -// Use a separate function to demonstrate how to use function name as part of scoped marker: -void runGPU(float* Matrix, float* TransposeMatrix, float* gpuMatrix, float* gpuTransposeMatrix) { - // __func__ is a standard C++ macro which expands to the name of the function, in this case - // "runGPU" - HIP_SCOPED_MARKER(__func__, "MyGroup"); - - for (int i = 0; i < ITERATIONS; i++) { - if (i == startTriggerIteration) { - hipProfilerStart(); - } - if (i == stopTriggerIteration) { - hipProfilerStop(); - } - - float eventMs = 0.0f; - - hipEvent_t start, stop; - hipEventCreate(&start); - hipEventCreate(&stop); - - - // Record the start event - hipEventRecord(start, NULL); - - // Memory transfer from host to device - hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice); - - // Record the stop event - hipEventRecord(stop, NULL); - hipEventSynchronize(stop); - - hipEventElapsedTime(&eventMs, start, stop); - - printf("hipMemcpyHostToDevice time taken = %6.3fms\n", eventMs); - - // Record the start event - hipEventRecord(start, NULL); - - // Lauching kernel from host - hipLaunchKernelGGL(matrixTranspose, - dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y), - dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix, - gpuMatrix, WIDTH); - - // Record the stop event - hipEventRecord(stop, NULL); - hipEventSynchronize(stop); - hipEventElapsedTime(&eventMs, start, stop); - - printf("kernel Execution time = %6.3fms\n", eventMs); - - // Record the start event - hipEventRecord(start, NULL); - - // Memory transfer from device to host - hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost); - - // Record the stop event - hipEventRecord(stop, NULL); - hipEventSynchronize(stop); - - hipEventElapsedTime(&eventMs, start, stop); - - printf("hipMemcpyDeviceToHost time taken = %6.3fms\n", eventMs); - } -}; - - -int main(int argc, char* argv[]) { - if (argc >= 2) { - startTriggerIteration = atoi(argv[1]); - printf("info : will start tracing at iteration:%d\n", startTriggerIteration); - } - if (argc >= 3) { - stopTriggerIteration = atoi(argv[2]); - printf("info : will stop tracing at iteration:%d\n", stopTriggerIteration); - } - - float* Matrix; - float* TransposeMatrix; - float* cpuTransposeMatrix; - - float* gpuMatrix; - float* gpuTransposeMatrix; - - hipDeviceProp_t devProp; - hipGetDeviceProperties(&devProp, 0); - - std::cout << "Device name " << devProp.name << std::endl; - - { - // Show example of how to create a "scoped marker". - // The scoped marker records the time spent inside the { scope } of the marker - the begin - // timestamp is at the beginning of the code scope, and the end is recorded when the SCOPE - // exits. This can be viewed in CodeXL timeline relative to other GPU and CPU events. This - // marker captures the time spent in setup including host allocation, initialization, and - // device memory allocation. - HIP_SCOPED_MARKER("Setup", "MyGroup"); - - - Matrix = (float*)malloc(NUM * sizeof(float)); - TransposeMatrix = (float*)malloc(NUM * sizeof(float)); - cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float)); - - // initialize the input data - for (int i = 0; i < NUM; i++) { - Matrix[i] = (float)i * 10.0f; - } - - - // allocate the memory on the device side - hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)); - hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)); - - // FYI, the scoped-marker will be destroyed here when the scope exits, and will record its - // "end" timestamp. - } - - runGPU(Matrix, TransposeMatrix, gpuMatrix, gpuTransposeMatrix); - - - // show how to use explicit begin/end markers: - // We begin the timed region with HIP_BEGIN_MARKER, passing in the markerName and group: - // The region will stop when HIP_END_MARKER is called - // This is another way to mark begin/end - as an alternative to scoped markers. - HIP_BEGIN_MARKER("Check&TearDown", "MyGroup"); - - int errors = 0; - - // CPU MatrixTranspose computation - matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH); - - // verify the results - double eps = 1.0E-6; - for (int i = 0; i < NUM; i++) { - if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) { - errors++; - } - } - if (errors != 0) { - printf("FAILED: %d errors\n", errors); - } else { - printf("PASSED!\n"); - } - - // free the resources on device side - hipFree(gpuMatrix); - hipFree(gpuTransposeMatrix); - - // free the resources on host side - free(Matrix); - free(TransposeMatrix); - free(cpuTransposeMatrix); - - // This ends the last marker started in this thread, in this case "Check&TearDown" - HIP_END_MARKER(); - - return errors; -} diff --git a/hipamd/samples/2_Cookbook/2_Profiler/Readme.md b/hipamd/samples/2_Cookbook/2_Profiler/Readme.md deleted file mode 100644 index 8b32beb348..0000000000 --- a/hipamd/samples/2_Cookbook/2_Profiler/Readme.md +++ /dev/null @@ -1,47 +0,0 @@ -## Using hipEvents to measure performance ### - -This tutorial is follow-up of the previous two tutorial where we learn how to write our first hip program, in which we compute Matrix Transpose and in second one, we added feature to measure time taken for memory transfer and kernel execution. In this tutorial, we'll explain how to use the codexl/rocm-profiler for hip timeline tracing. Also, we will augment the source code with additional markers so we can see the high-level application flow alongside the information that CodeXL automatically collects. - - -## Introduction: - -CodeXL and rocm-profiler are the tool used for profiling the application, which is of prominent use in optimizing the application by means of finding the memory bottlenecks and etc. - -## Requirement: -[CodeXL Installation](http://gpuopen.com/compute-product/codexl/) - -## prerequiste knowledge: - -Programmers familiar with CUDA, OpenCL will be able to quickly learn and start coding with the HIP API. In case you are not, don't worry. You choose to start with the best one. We'll be explaining everything assuming you are completely new to gpgpu programming. - -## Simple Matrix Transpose - -We will be using the Simple Matrix Transpose source code from the previous tutorial as it is. - -## Using CodeXL markers for HIP Functions - -HIP can generate markers at function being/end which are displayed on the CodeXL timeline view. To do this, you need to install ROCm-Profiler and enable HIP to generate the markers: - -1. Install ROCm-Profiler Installing HIP from the rocm pre-built packages, installs the ROCm-Profiler as well. Alternatively, you can build ROCm-Profiler using the instructions given below. - - -2. Run with profiler enabled to generate ATP file. -(These steps are also captured in the Makefile) -The HIP_PROFILE_API enables display of the HIP APIs on the CodeXL trimeline view. -`/opt/rocm/bin/rocm-profiler -o -A -e HIP_PROFILE_API=1 ` - -##Using HIP_TRACE_API - -You can also print the HIP function strings to stderr using HIP_TRACE_API environment variable. This can also be combined with the more detailed debug information provided by the HIP_DB switch. For example: -`HIP_TRACE_API=1 HIP_DB=0x2 ./myHipApp` -Note this trace mode uses colors. "less -r" can handle raw control characters and will display the debug output in proper colors. - -## More Info: -- [HIP FAQ](https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_faq.md) -- [HIP Kernel Language](https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_kernel_language.md) -- [HIP Runtime API (Doxygen)](http://rocm-developer-tools.github.io/HIP) -- [HIP Porting Guide](https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_porting_guide.md) -- [HIP Terminology](https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) -- [HIPIFY](https://github.com/ROCm-Developer-Tools/HIP/blob/master/hipify-clang/README.md) -- [Developer/CONTRIBUTING Info](https://github.com/ROCm-Developer-Tools/HIP/blob/master/CONTRIBUTING.md) -- [Release Notes](https://github.com/ROCm-Developer-Tools/HIP/blob/master/RELEASE.md) diff --git a/hipamd/src/hip_hcc.cpp b/hipamd/src/hip_hcc.cpp index af40b29ea9..c9688408c8 100644 --- a/hipamd/src/hip_hcc.cpp +++ b/hipamd/src/hip_hcc.cpp @@ -72,7 +72,6 @@ int HIP_API_BLOCKING = 0; int HIP_PRINT_ENV = 0; int HIP_TRACE_API = 0; std::string HIP_TRACE_API_COLOR("green"); -int HIP_PROFILE_API = 0; // TODO - DB_START/STOP need more testing. std::string HIP_DB_START_API; @@ -150,12 +149,10 @@ uint64_t recordApiTrace(TlsData *tls, std::string* fullStr, const std::string& a if ((tid < g_dbStartTriggers.size()) && (apiSeqNum >= g_dbStartTriggers[tid].nextTrigger())) { printf("info: resume profiling at %lu\n", apiSeqNum); - RESUME_PROFILING; g_dbStartTriggers.pop_back(); }; if ((tid < g_dbStopTriggers.size()) && (apiSeqNum >= g_dbStopTriggers[tid].nextTrigger())) { printf("info: stop profiling at %lu\n", apiSeqNum); - STOP_PROFILING; g_dbStopTriggers.pop_back(); }; @@ -1295,9 +1292,6 @@ void HipReadEnv() { "executes."); READ_ENV_S(release, HIP_TRACE_API_COLOR, 0, "Color to use for HIP_API. None/Red/Green/Yellow/Blue/Magenta/Cyan/White"); - READ_ENV_I(release, HIP_PROFILE_API, 0, - "Add HIP API markers to ATP file generated with CodeXL. 0x1=short API name, " - "0x2=full API name including args."); READ_ENV_S(release, HIP_DB_START_API, 0, "Comma-separated list of tid.api_seq_num for when to start debug and profiling."); READ_ENV_S(release, HIP_DB_STOP_API, 0, @@ -1373,14 +1367,6 @@ void HipReadEnv() { HIP_DB |= 0x1; } - if (HIP_PROFILE_API && !COMPILE_HIP_ATP_MARKER) { - fprintf(stderr, - "warning: env var HIP_PROFILE_API=0x%x but COMPILE_HIP_ATP_MARKER=0. (perhaps " - "enable COMPILE_HIP_ATP_MARKER in src code before compiling?)\n", - HIP_PROFILE_API); - HIP_PROFILE_API = 0; - } - if (HIP_DB) { fprintf(stderr, "HIP_DB=0x%x [%s]\n", HIP_DB, HIP_DB_string(HIP_DB).c_str()); } @@ -1424,11 +1410,6 @@ void HipReadEnv() { // This function creates a vector with only the GPU accelerators. // It is called with C++11 call_once, which provided thread-safety. void ihipInit() { -#if COMPILE_HIP_ATP_MARKER - amdtInitializeActivityLogger(); - amdtScopedMarker("ihipInit", "HIP", NULL); -#endif - HipReadEnv(); @@ -1618,7 +1599,7 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t stream, bool lockAcquired) { void ihipPrintKernelLaunch(const char* kernelName, const grid_launch_parm* lp, const hipStream_t stream) { - if ((HIP_TRACE_API & (1 << TRACE_KCMD)) || HIP_PROFILE_API || + if ((HIP_TRACE_API & (1 << TRACE_KCMD)) || (COMPILE_HIP_DB & HIP_TRACE_API)) { GET_TLS(); std::stringstream os; @@ -1631,14 +1612,6 @@ void ihipPrintKernelLaunch(const char* kernelName, const grid_launch_parm* lp, std::string fullStr; recordApiTrace(tls, &fullStr, os.str()); } - - if (HIP_PROFILE_API == 0x1) { - std::string shortAtpString("hipLaunchKernel:"); - shortAtpString += kernelName; - MARKER_BEGIN(shortAtpString.c_str(), "HIP"); - } else if (HIP_PROFILE_API == 0x2) { - MARKER_BEGIN(os.str().c_str(), "HIP"); - } } } @@ -1697,9 +1670,6 @@ void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, grid_launc tprintf(DB_SYNC, "ihipPostLaunchKernel, unlocking stream\n"); stream->lockclose_postKernelCommand(kernelName, lp.av, unlockPostponed); - if (HIP_PROFILE_API) { - MARKER_END(); - } } //================================================================================================= @@ -2481,29 +2451,17 @@ bool ihipStream_t::locked_copy2DAsync(void* dst, const void* src, size_t width, return retStatus; } -//------------------------------------------------------------------------------------------------- -//------------------------------------------------------------------------------------------------- -// Profiler, really these should live elsewhere: hipError_t hipProfilerStart() { HIP_INIT_API(hipProfilerStart); -#if COMPILE_HIP_ATP_MARKER - amdtResumeProfiling(AMDT_ALL_PROFILING); -#endif - return ihipLogStatus(hipSuccess); }; hipError_t hipProfilerStop() { HIP_INIT_API(hipProfilerStop); -#if COMPILE_HIP_ATP_MARKER - amdtStopProfiling(AMDT_ALL_PROFILING); -#endif - return ihipLogStatus(hipSuccess); }; - //------------------------------------------------------------------------------------------------- //------------------------------------------------------------------------------------------------- // HCC-specific accessor functions: diff --git a/hipamd/src/hip_hcc_internal.h b/hipamd/src/hip_hcc_internal.h index 0510015c42..ac63f49dba 100644 --- a/hipamd/src/hip_hcc_internal.h +++ b/hipamd/src/hip_hcc_internal.h @@ -63,7 +63,6 @@ extern int HIP_LAUNCH_BLOCKING; extern int HIP_API_BLOCKING; extern int HIP_PRINT_ENV; -extern int HIP_PROFILE_API; // extern int HIP_TRACE_API; extern int HIP_ATP; extern int HIP_DB; @@ -250,34 +249,6 @@ extern const char* API_COLOR_END; // Must be enabled at runtime with HIP_TRACE_API #define COMPILE_HIP_TRACE_API 0x3 - -// Compile code that generates trace markers for CodeXL ATP at HIP function begin/end. -// ATP is standard CodeXL format that includes timestamps for kernels, HSA RT APIs, and HIP APIs. -#ifndef COMPILE_HIP_ATP_MARKER -#define COMPILE_HIP_ATP_MARKER 0 -#endif - - -// Compile support for trace markers that are displayed on CodeXL GUI at start/stop of each function -// boundary. -// TODO - currently we print the trace message at the beginning. if we waited, we could also -// tls->tidInfo return codes, and any values returned through ptr-to-args (ie the pointers allocated -// by hipMalloc). -#if COMPILE_HIP_ATP_MARKER -#include "CXLActivityLogger.h" -#define MARKER_BEGIN(markerName, group) amdtBeginMarker(markerName, group, nullptr); -#define MARKER_END() amdtEndMarker(); -#define RESUME_PROFILING amdtResumeProfiling(AMDT_ALL_PROFILING); -#define STOP_PROFILING amdtStopProfiling(AMDT_ALL_PROFILING); -#else -// Swallow scoped markers: -#define MARKER_BEGIN(markerName, group) -#define MARKER_END() -#define RESUME_PROFILING -#define STOP_PROFILING -#endif - - //--- // HIP Trace modes - use with HIP_TRACE_API=... #define TRACE_ALL 0 // 0x01 @@ -336,22 +307,17 @@ static inline uint64_t getTicks() { return hc::get_system_ticks(); } //--- extern uint64_t recordApiTrace(TlsData *tls, std::string* fullStr, const std::string& apiStr); -#if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1) +#if (COMPILE_HIP_TRACE_API & 0x1) #define API_TRACE(forceTrace, ...) \ GET_TLS(); \ uint64_t hipApiStartTick = 0; \ { \ tls->tidInfo.incApiSeqNum(); \ if (forceTrace || \ - (HIP_PROFILE_API || (COMPILE_HIP_DB && (HIP_TRACE_API & (1 << TRACE_ALL))))) { \ + (COMPILE_HIP_DB && (HIP_TRACE_API & (1 << TRACE_ALL)))) { \ std::string apiStr = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')'; \ std::string fullStr; \ hipApiStartTick = recordApiTrace(tls, &fullStr, apiStr); \ - if (HIP_PROFILE_API == 0x1) { \ - MARKER_BEGIN(__func__, "HIP") \ - } else if (HIP_PROFILE_API == 0x2) { \ - MARKER_BEGIN(fullStr.c_str(), "HIP"); \ - } \ } \ } @@ -398,9 +364,6 @@ extern uint64_t recordApiTrace(TlsData *tls, std::string* fullStr, const std::st tls->tidInfo.apiSeqNum(), __func__, localHipStatus, \ ihipErrorString(localHipStatus), ticks, API_COLOR_END); \ } \ - if (HIP_PROFILE_API) { \ - MARKER_END(); \ - } \ localHipStatus; \ }) From 40a28e767e46e0dfaafd5b3f422bc91d6eb0379e Mon Sep 17 00:00:00 2001 From: saleelk Date: Fri, 28 Feb 2020 03:17:29 -0800 Subject: [PATCH 46/50] Fix HIPRTC headers to export C style symbols (#1879) --- hipamd/include/hip/hcc_detail/hiprtc.h | 66 +++----- hipamd/src/hiprtc.cpp | 64 ++++---- hipamd/tests/src/hiprtc/hiprtcGetTypeName.cpp | 147 ------------------ 3 files changed, 58 insertions(+), 219 deletions(-) delete mode 100644 hipamd/tests/src/hiprtc/hiprtcGetTypeName.cpp diff --git a/hipamd/include/hip/hcc_detail/hiprtc.h b/hipamd/include/hip/hcc_detail/hiprtc.h index 26d3129dbc..624f1ea157 100644 --- a/hipamd/include/hip/hcc_detail/hiprtc.h +++ b/hipamd/include/hip/hcc_detail/hiprtc.h @@ -19,10 +19,14 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#pragma once +#ifndef HIPRTC_H +#define HIPRTC_H -#include -#include +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#include enum hiprtcResult { HIPRTC_SUCCESS = 0, @@ -41,29 +45,22 @@ enum hiprtcResult { const char* hiprtcGetErrorString(hiprtcResult result); -inline -hiprtcResult hiprtcVersion(int* major, int* minor) noexcept -{ // TODO: NVRTC versioning is somewhat unclear. - if (!major || !minor) return HIPRTC_ERROR_INVALID_INPUT; - // TODO: this should be generic / set by the build infrastructure. - *major = 9; - *minor = 0; +hiprtcResult hiprtcVersion(int* major, int* minor); - return HIPRTC_SUCCESS; -} - -struct _hiprtcProgram; -using hiprtcProgram = _hiprtcProgram*; +typedef struct _hiprtcProgram* hiprtcProgram; hiprtcResult hiprtcAddNameExpression(hiprtcProgram prog, const char* name_expression); -hiprtcResult hiprtcCompileProgram(hiprtcProgram prog, int numOptions, +hiprtcResult hiprtcCompileProgram(hiprtcProgram prog, + int numOptions, const char** options); -hiprtcResult hiprtcCreateProgram(hiprtcProgram* prog, const char* src, - const char* name, int numHeaders, +hiprtcResult hiprtcCreateProgram(hiprtcProgram* prog, + const char* src, + const char* name, + int numHeaders, const char** headers, const char** includeNames); @@ -76,37 +73,14 @@ hiprtcResult hiprtcGetLoweredName(hiprtcProgram prog, hiprtcResult hiprtcGetProgramLog(hiprtcProgram prog, char* log); hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram prog, - std::size_t* logSizeRet); + size_t* logSizeRet); hiprtcResult hiprtcGetCode(hiprtcProgram prog, char* code); -hiprtcResult hiprtcGetCodeSize(hiprtcProgram prog, std::size_t* codeSizeRet); +hiprtcResult hiprtcGetCodeSize(hiprtcProgram prog, size_t* codeSizeRet); -namespace hip_impl -{ - char* demangle(const char* mangled_expression); +#ifdef __cplusplus } +#endif /* __cplusplus */ -#if defined(HIPRTC_GET_TYPE_NAME) - #include - - #if defined(_WIN32) - #include - - template - hiprtcResult hiprtcGetTypeName(std::string*) = delete; - #else - template - inline - hiprtcResult hiprtcGetTypeName(std::string* result) - { - if (!result) return HIPRTC_ERROR_INVALID_INPUT; - - char * res= hip_impl::demangle(typeid(T).name()); - result->assign(res == nullptr ? "" : res); - std::free(res); - return (result->empty()) ? HIPRTC_ERROR_INTERNAL_ERROR : - HIPRTC_SUCCESS; - } - #endif -#endif +#endif //HIPRTC_H diff --git a/hipamd/src/hiprtc.cpp b/hipamd/src/hiprtc.cpp index 5198bf0cbb..3c7fe6e78c 100644 --- a/hipamd/src/hiprtc.cpp +++ b/hipamd/src/hiprtc.cpp @@ -50,7 +50,7 @@ THE SOFTWARE. #include #include -const char* hiprtcGetErrorString(hiprtcResult x) +extern "C" const char* hiprtcGetErrorString(hiprtcResult x) { switch (x) { case HIPRTC_SUCCESS: @@ -95,6 +95,21 @@ inline bool fileExists (const std::string& name) { } } // namespace hip_impl +namespace +{ + char* demangle(const char* x) + { + if (!x) return nullptr; + + int s{}; + char* tmp = abi::__cxa_demangle(x, nullptr, nullptr, &s); + + if (s != 0) return nullptr; + + return tmp; + } +} // Unnamed namespace. + namespace { struct Symbol { @@ -158,7 +173,7 @@ struct _hiprtcProgram { { using namespace std; - char* demangled = hip_impl::demangle(name.c_str()); + char* demangled = demangle(name.c_str()); name.assign(demangled == nullptr ? "" : demangled); free(demangled); @@ -352,7 +367,7 @@ namespace } } // Unnamed namespace. -hiprtcResult hiprtcAddNameExpression(hiprtcProgram p, const char* n) +extern "C" hiprtcResult hiprtcAddNameExpression(hiprtcProgram p, const char* n) { if (!n) return HIPRTC_ERROR_INVALID_INPUT; if (!isValidProgram(p)) return HIPRTC_ERROR_INVALID_PROGRAM; @@ -413,21 +428,6 @@ namespace }; } // Unnamed namespace. -namespace hip_impl -{ - char* demangle(const char* x) - { - if (!x) return nullptr; - - int s{}; - char* tmp = abi::__cxa_demangle(x, nullptr, nullptr, &s); - - if (s != 0) return nullptr; - - return tmp; - } -} // Namespace hip_impl. - namespace { const std::string& defaultTarget() @@ -492,7 +492,7 @@ namespace } } // Unnamed namespace. -hiprtcResult hiprtcCompileProgram(hiprtcProgram p, int n, const char** o) +extern "C" hiprtcResult hiprtcCompileProgram(hiprtcProgram p, int n, const char** o) { using namespace std; @@ -530,7 +530,7 @@ hiprtcResult hiprtcCompileProgram(hiprtcProgram p, int n, const char** o) return HIPRTC_SUCCESS; } -hiprtcResult hiprtcCreateProgram(hiprtcProgram* p, const char* src, +extern "C" hiprtcResult hiprtcCreateProgram(hiprtcProgram* p, const char* src, const char* name, int n, const char** hdrs, const char** incs) { @@ -548,14 +548,14 @@ hiprtcResult hiprtcCreateProgram(hiprtcProgram* p, const char* src, return HIPRTC_SUCCESS; } -hiprtcResult hiprtcDestroyProgram(hiprtcProgram* p) +extern "C" hiprtcResult hiprtcDestroyProgram(hiprtcProgram* p) { if (!p) return HIPRTC_SUCCESS; return _hiprtcProgram::destroy(*p); } -hiprtcResult hiprtcGetLoweredName(hiprtcProgram p, const char* n, +extern "C" hiprtcResult hiprtcGetLoweredName(hiprtcProgram p, const char* n, const char** ln) { using namespace std; @@ -576,7 +576,7 @@ hiprtcResult hiprtcGetLoweredName(hiprtcProgram p, const char* n, return HIPRTC_SUCCESS; } -hiprtcResult hiprtcGetProgramLog(hiprtcProgram p, char* l) +extern "C" hiprtcResult hiprtcGetProgramLog(hiprtcProgram p, char* l) { if (!l) return HIPRTC_ERROR_INVALID_INPUT; if (!isValidProgram(p)) return HIPRTC_ERROR_INVALID_PROGRAM; @@ -588,7 +588,7 @@ hiprtcResult hiprtcGetProgramLog(hiprtcProgram p, char* l) return HIPRTC_SUCCESS; } -hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram p, std::size_t* sz) +extern "C" hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram p, std::size_t* sz) { if (!sz) return HIPRTC_ERROR_INVALID_INPUT; if (!isValidProgram(p)) return HIPRTC_ERROR_INVALID_PROGRAM; @@ -599,7 +599,7 @@ hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram p, std::size_t* sz) return HIPRTC_SUCCESS; } -hiprtcResult hiprtcGetCode(hiprtcProgram p, char* c) +extern "C" hiprtcResult hiprtcGetCode(hiprtcProgram p, char* c) { if (!c) return HIPRTC_ERROR_INVALID_INPUT; if (!isValidProgram(p)) return HIPRTC_ERROR_INVALID_PROGRAM; @@ -610,7 +610,7 @@ hiprtcResult hiprtcGetCode(hiprtcProgram p, char* c) return HIPRTC_SUCCESS; } -hiprtcResult hiprtcGetCodeSize(hiprtcProgram p, std::size_t* sz) +extern "C" hiprtcResult hiprtcGetCodeSize(hiprtcProgram p, std::size_t* sz) { if (!sz) return HIPRTC_ERROR_INVALID_INPUT; if (!isValidProgram(p)) return HIPRTC_ERROR_INVALID_PROGRAM; @@ -620,3 +620,15 @@ hiprtcResult hiprtcGetCodeSize(hiprtcProgram p, std::size_t* sz) return HIPRTC_SUCCESS; } + +extern "C" hiprtcResult hiprtcVersion(int* major, int* minor) +{ + if (major == nullptr || minor == nullptr) { + return HIPRTC_ERROR_INVALID_INPUT; + } + + *major = 9; + *minor = 0; + + return HIPRTC_SUCCESS; +} diff --git a/hipamd/tests/src/hiprtc/hiprtcGetTypeName.cpp b/hipamd/tests/src/hiprtc/hiprtcGetTypeName.cpp deleted file mode 100644 index 812229f81f..0000000000 --- a/hipamd/tests/src/hiprtc/hiprtcGetTypeName.cpp +++ /dev/null @@ -1,147 +0,0 @@ -/* -Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ -/* HIT_START - * BUILD: %t %s ../test_common.cpp LINK_OPTIONS hiprtc EXCLUDE_HIP_PLATFORM nvcc - * TEST: %t - * HIT_END - */ - -#include - -#define HIPRTC_GET_TYPE_NAME -#include -#include - -#include -#include -#include - -static constexpr auto gpu_program{ -R"( -#include - -namespace N1 { struct S1_t { int i; double d; }; } -template -__global__ void f3(int *result) { *result = sizeof(T); } -)"}; - -// note: this structure is also defined in GPU code string. Should ideally -// be in a header file included by both GPU code string and by CPU code. -namespace N1 { struct S1_t { int i; double d; }; }; - -template -std::string getKernelNameForType(void) -{ - std::string type_name; - hiprtcGetTypeName(&type_name); - return std::string{"f3<"} + type_name + '>'; -} - -int main() -{ - using namespace std; - - hiprtcProgram prog; - hiprtcCreateProgram(&prog, gpu_program, "gpu_program.cu", 0, nullptr, - nullptr); - - vector name_vec; - vector expected_result; - - name_vec.push_back(getKernelNameForType()); - expected_result.push_back(sizeof(int)); - name_vec.push_back(getKernelNameForType()); - expected_result.push_back(sizeof(double)); - name_vec.push_back(getKernelNameForType()); - expected_result.push_back(sizeof(N1::S1_t)); - - for (auto&& x : name_vec) hiprtcAddNameExpression(prog, x.c_str()); - - hipDeviceProp_t props; - int device = 0; - hipGetDeviceProperties(&props, device); - std::string gfxName = "gfx" + std::to_string(props.gcnArch); - std::string sarg = "--gpu-architecture=" + gfxName; - const char* options[] = { - sarg.c_str() - }; - - hiprtcResult compileResult = hiprtcCompileProgram(prog, 1, options); - - size_t logSize; - hiprtcGetProgramLogSize(prog, &logSize); - - if (logSize) { - string log(logSize, '\0'); - hiprtcGetProgramLog(prog, &log[0]); - - cout << log << '\n'; - } - - if (compileResult != HIPRTC_SUCCESS) { failed("Compilation failed."); } - - size_t codeSize; - hiprtcGetCodeSize(prog, &codeSize); - - vector code(codeSize); - hiprtcGetCode(prog, code.data()); - - hipModule_t module; - hipModuleLoadDataEx(&module, code.data(), 0, nullptr, nullptr); - - hipDeviceptr_t dResult; - int hResult = 0; - hipMalloc(&dResult, sizeof(hResult)); - hipMemcpyHtoD(dResult, &hResult, sizeof(hResult)); - - for (size_t i = 0; i < name_vec.size(); ++i) { - const char *name; - hiprtcGetLoweredName(prog, name_vec[i].c_str(), &name); - - hipFunction_t kernel; - hipModuleGetFunction(&kernel, module, name); - - struct { hipDeviceptr_t a_; } args{dResult}; - - auto size = sizeof(args); - void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, - HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, - HIP_LAUNCH_PARAM_END}; - - hipModuleLaunchKernel(kernel, - 1, 1, 1, - 1, 1, 1, - 0, nullptr, - nullptr, config); - - hipMemcpyDtoH(&hResult, dResult, sizeof(hResult)); - - if (expected_result[i] != hResult) { failed("Validation failed."); } - } - - hipFree(dResult); - hipModuleUnload(module); - - hiprtcDestroyProgram(&prog); - - passed(); -} From eebba4799ca122c23c95cd38bc81495fc7613ffa Mon Sep 17 00:00:00 2001 From: Siu Chi Chan Date: Fri, 28 Feb 2020 06:17:41 -0500 Subject: [PATCH 47/50] improve code object loading error message (#1889) --- hipamd/src/program_state.inl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/hipamd/src/program_state.inl b/hipamd/src/program_state.inl index bcac4d384b..27fa9f2b0e 100644 --- a/hipamd/src/program_state.inl +++ b/hipamd/src/program_state.inl @@ -426,7 +426,11 @@ public: auto check_hsa_error = [](hsa_status_t s) { if (s != HSA_STATUS_SUCCESS) { - hip_throw(std::runtime_error{"error when loading code object"}); + const char* hsa_err_msg; + hsa_status_string(s, &hsa_err_msg); + hip_throw(std::runtime_error{ + std::string("error when loading code object: ") + + hsa_err_msg}); } }; From dacc90f415ae894be2c08ab43d8184f646675c13 Mon Sep 17 00:00:00 2001 From: Jatin Chaudhary <51944368+cjatin@users.noreply.github.com> Date: Fri, 28 Feb 2020 16:48:15 +0530 Subject: [PATCH 48/50] [dtests] __shfl_up and __shfl_down tests (#1899) --- hipamd/tests/src/kernel/hipShflUpDownTest.cpp | 102 ++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 hipamd/tests/src/kernel/hipShflUpDownTest.cpp diff --git a/hipamd/tests/src/kernel/hipShflUpDownTest.cpp b/hipamd/tests/src/kernel/hipShflUpDownTest.cpp new file mode 100644 index 0000000000..553087ce45 --- /dev/null +++ b/hipamd/tests/src/kernel/hipShflUpDownTest.cpp @@ -0,0 +1,102 @@ +/* +Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +/* HIT_START + * BUILD: %t %s ../test_common.cpp + * TEST: %t + * HIT_END + */ + +#include +#include +#include "test_common.h" + +template +__global__ void shflDownSum(T* a, int size) { + T val = a[threadIdx.x]; + for (int i = size / 2; i > 0; i /= 2) { + val += __shfl_down(val, i, size); + } + a[threadIdx.x] = val; +} + +template +__global__ void shflUpSum(T* a, int size) { + T val = a[threadIdx.x]; + for (int i = size / 2; i > 0; i /= 2) { + val += __shfl_up(val, i, size); + } + a[threadIdx.x] = val; +} + +template +void runTestShflUp() { + const int size = 32; + T a[size]; + T cpuSum = 0; + for (int i = 0; i < size; i++) { + a[i] = i; + cpuSum += a[i]; + } + T* d_a; + hipMalloc(&d_a, sizeof(T) * size); + hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault); + hipLaunchKernelGGL(shflUpSum, 1, size, 0, 0, d_a, size); + hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault); + if (a[size - 1] != cpuSum) { + hipFree(d_a); + failed("Shfl Up Sum did not match."); + } + hipFree(d_a); +} + +template +void runTestShflDown() { + const int size = 32; + T a[size]; + T cpuSum = 0; + for (int i = 0; i < size; i++) { + a[i] = i; + cpuSum += a[i]; + } + T* d_a; + hipMalloc(&d_a, sizeof(T) * size); + hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault); + hipLaunchKernelGGL(shflDownSum, 1, size, 0, 0, d_a, size); + hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault); + if (a[0] != cpuSum) { + hipFree(d_a); + failed("Shfl Up Sum did not match."); + } + hipFree(d_a); +} +int main() { + runTestShflUp(); + runTestShflUp(); + runTestShflUp(); + runTestShflUp(); + + runTestShflDown(); + runTestShflDown(); + runTestShflDown(); + runTestShflDown(); + passed(); +} From ff7165a73c04cb8c4a783628ad30843b0212ee25 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Tue, 3 Mar 2020 12:07:13 +0300 Subject: [PATCH 49/50] [HIP][cmake] Remove dependency from hipify-clang [Reason] Upcoming hipify-clang's splitting out into a new repository https://github.com/ROCm-Developer-Tools/HIPIFY. --- hipamd/CMakeLists.txt | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/hipamd/CMakeLists.txt b/hipamd/CMakeLists.txt index 07d23130c2..4894168348 100644 --- a/hipamd/CMakeLists.txt +++ b/hipamd/CMakeLists.txt @@ -1,11 +1,6 @@ cmake_minimum_required(VERSION 3.4.3) project(hip) -############################# -# Options -############################# -option(BUILD_HIPIFY_CLANG "Enable building the CUDA->HIP converter" OFF) - ############################# # Setup config generation ############################# @@ -238,11 +233,6 @@ set(LIB_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib) set(INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/include) set(CONFIG_PACKAGE_INSTALL_DIR ${LIB_INSTALL_DIR}/cmake/hip) -# Build clang hipify if enabled -if (BUILD_HIPIFY_CLANG) - add_subdirectory(hipify-clang) -endif() - # Build LPL an CA (fat binary generation / fat binary decomposition tools) if # platform is hcc; do this before the ugly hijacking of the compiler, since no # HC code is involved. @@ -464,11 +454,6 @@ add_custom_target(pkg_hip_base COMMAND ${CMAKE_COMMAND} . WORKING_DIRECTORY ${BUILD_DIR} DEPENDS lpl ca) -# Packaging needs to wait for hipify-clang to build if it's enabled... -if (BUILD_HIPIFY_CLANG) - add_dependencies(pkg_hip_base hipify-clang) -endif() - # Package: hip_hcc set(BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/packages/hip_hcc) configure_file(packaging/hip-hcc.txt ${BUILD_DIR}/CMakeLists.txt @ONLY) From 278eff4c965aedf8b819a95cb3050f5bf05b13bd Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 6 Mar 2020 18:17:05 +0300 Subject: [PATCH 50/50] [HIPIFY][doc] Update README.md: LLVM 10.0.0-rc3 is supported + Add -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON for LLVM 10.0.0 or newer + Supported versions update --- hipamd/hipify-clang/README.md | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/hipamd/hipify-clang/README.md b/hipamd/hipify-clang/README.md index a375c6ab65..88d7a72ccd 100644 --- a/hipamd/hipify-clang/README.md +++ b/hipamd/hipify-clang/README.md @@ -42,10 +42,10 @@ After applying all the matchers, the output HIP source is produced. `hipify-clang` requires: -1. [**LLVM+CLANG**](http://releases.llvm.org) of at least version [3.8.0](http://releases.llvm.org/download.html#3.8.0); the latest stable and recommended release: [**9.0.1**](http://releases.llvm.org/download.html#9.0.1), the latest release candidate: [10.0.0-rc2](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc2). +1. [**LLVM+CLANG**](http://releases.llvm.org) of at least version [3.8.0](http://releases.llvm.org/download.html#3.8.0); the latest stable and recommended release: [**9.0.1**](http://releases.llvm.org/download.html#9.0.1), the latest release candidate: [10.0.0-rc3](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc3). 2. [**CUDA**](https://developer.nvidia.com/cuda-downloads) of at least version [7.0](https://developer.nvidia.com/cuda-toolkit-70), the latest supported version is [**10.1 Update 2**](https://developer.nvidia.com/cuda-10.1-download-archive-base). -To use the latest CUDA version [10.2](https://developer.nvidia.com/cuda-downloads) please use the latest `LLVM` release candidate: [10.0.0-rc2](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc2). +To use the latest CUDA version [10.2](https://developer.nvidia.com/cuda-downloads) please use the latest `LLVM` release candidate: [10.0.0-rc3](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc3). | **LLVM release version** | **CUDA latest supported version** | **Windows** | **Linux** | |:----------------------------------------------------------:|:------------------------------------------------------------------------:|:-----------:|:---------:| @@ -67,7 +67,7 @@ To use the latest CUDA version [10.2](https://developer.nvidia.com/cuda-download | [8.0.1](http://releases.llvm.org/download.html#8.0.1) | [10.0](https://developer.nvidia.com/cuda-10.0-download-archive) | -
not working due to
the clang's bug [38811](https://bugs.llvm.org/show_bug.cgi?id=38811)
+
[patch](patches/patch_for_clang_8.0.1_bug_38811.zip)*
| + | | [9.0.0](http://releases.llvm.org/download.html#9.0.0) | [10.1](https://developer.nvidia.com/cuda-10.1-download-archive-base) | + | + | | [**9.0.1**](http://releases.llvm.org/download.html#9.0.1) | [**10.1**](https://developer.nvidia.com/cuda-10.1-download-archive-base) | +
**LATEST STABLE RELEASE** | +
**LATEST STABLE RELEASE** | -| [10.0.0-rc2](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc2) | [10.2](https://developer.nvidia.com/cuda-downloads) | + | + | +| [10.0.0-rc3](https://github.com/llvm/llvm-project/releases/tag/llvmorg-10.0.0-rc3) | [10.2](https://developer.nvidia.com/cuda-downloads) | + | + | `*` Download the patch and unpack it into your `LLVM` distributive directory; a few header files will be overwritten; rebuilding of `LLVM` is not needed. @@ -158,7 +158,7 @@ Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build pro **LLVM 10.0.0 or newer:** -1. download [`LLVM project`](https://github.com/llvm/llvm-project/archive/llvmorg-10.0.0-rc2.tar.gz) sources; +1. download [`LLVM project`](https://github.com/llvm/llvm-project/archive/llvmorg-10.0.0-rc3.tar.gz) sources; 2. build [`LLVM project`](http://llvm.org/docs/CMake.html): **Linux**: @@ -168,6 +168,7 @@ Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build pro -DLLVM_SOURCE_DIR=../llvm-project \ -DLLVM_TARGETS_TO_BUILD="X86;NVPTX" \ -DLLVM_ENABLE_PROJECTS="clang" \ + -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON -DCMAKE_BUILD_TYPE=Release \ ../llvm-project/llvm make -j install @@ -181,6 +182,7 @@ Run `Visual Studio 16 2019`, open the generated `LLVM.sln`, build all, build pro -DLLVM_SOURCE_DIR=../llvm-project \ -DLLVM_TARGETS_TO_BUILD="NVPTX" \ -DLLVM_ENABLE_PROJECTS="clang" \ + -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON -DCMAKE_BUILD_TYPE=Release \ -Thost=x64 \ ../llvm-project/llvm @@ -247,7 +249,7 @@ On Linux the following configurations are tested: Ubuntu 14: LLVM 5.0.0 - 6.0.1, CUDA 7.0 - 9.0, cudnn-5.0.5 - cudnn-7.6.5.32 -Ubuntu 16-18: LLVM 8.0.0 - 10.0.0-rc2, CUDA 8.0 - 10.2, cudnn-5.1.10 - cudnn-7.6.5.32 +Ubuntu 16-18: LLVM 8.0.0 - 10.0.0-rc3, CUDA 8.0 - 10.2, cudnn-5.1.10 - cudnn-7.6.5.32 Minimum build system requirements for the above configurations: @@ -402,8 +404,8 @@ Testing Time: 3.07s | 7.0.0 - 7.1.0 | 9.2 | 7.6.5.32 | 2017.15.9.11 | 3.13.3 | 3.7.3 | | 8.0.0 - 8.0.1 | 10.0 | 7.6.5.32 | 2017.15.9.15 | 3.14.2 | 3.7.4 | | 9.0.0 - 9.0.1 | 10.1 | 7.6.5.32 | 2017.15.9.20, 2019.16.4.5 | 3.16.4 | 3.8.0 | -| 10.0.0-rc1,rc2 | 10.2 | 7.6.5.32 | 2017.15.9.20, 2019.16.4.5 | 3.16.4 | 3.8.1 | -| 11.0.0git | 10.2 | 7.6.5.32 | 2017.15.9.20, 2019.16.4.5 | 3.16.4 | 3.8.1 | +| 10.0.0-rc1-rc3 | 10.2 | 7.6.5.32 | 2017.15.9.20, 2019.16.4.5 | 3.16.4 | 3.8.1 | +| 11.0.0git | 10.2 | 7.6.5.32 | 2017.15.9.20, 2019.16.4.5 | 3.16.5 | 3.8.2 | *Building with testing support on `Windows 10` by `Visual Studio 16 2019`:* @@ -429,7 +431,7 @@ cmake -- - CMake module path: F:/LLVM/9.0.1/dist/lib/cmake/llvm -- - Include path : F:/LLVM/9.0.1/dist/include -- - Binary path : F:/LLVM/9.0.1/dist/bin --- Found PythonInterp: C:/Program Files/Python38/python.exe (found suitable version "3.8.1", minimum required is "3.6") +-- Found PythonInterp: C:/Program Files/Python38/python.exe (found suitable version "3.8.2", minimum required is "3.6") -- Found lit: C:/Program Files/Python38/Scripts/lit.exe -- Found FileCheck: F:/LLVM/9.0.1/dist/bin/FileCheck.exe -- Found CUDA: C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1 (found version "10.1")