diff --git a/.github/workflows/therock-ci-windows.yml b/.github/workflows/therock-ci-windows.yml index aed6fbbb75..86bcbc79c2 100644 --- a/.github/workflows/therock-ci-windows.yml +++ b/.github/workflows/therock-ci-windows.yml @@ -67,6 +67,7 @@ jobs: echo "$PATH;C:\Strawberry\c\bin" >> $GITHUB_PATH choco install --no-progress -y awscli echo "$PATH;C:\Program Files\Amazon\AWSCLIV2" >> $GITHUB_PATH + choco install --no-progress -y pkgconfiglite - uses: iterative/setup-dvc@4bdfd2b0f6f1ad7e08afadb03b1a895c352a5239 # v2.0.0 with: diff --git a/projects/clr/hipamd/packaging/CMakeLists.txt b/projects/clr/hipamd/packaging/CMakeLists.txt index 9d742177e1..4ec80af0a0 100644 --- a/projects/clr/hipamd/packaging/CMakeLists.txt +++ b/projects/clr/hipamd/packaging/CMakeLists.txt @@ -201,7 +201,7 @@ else() set(HIP_RUNTIME_DEB_STDPKG_DEPENDENCIES "${HIP_RUNTIME_DEB_STDPKG_DEPENDENCIES}, libgcc-s1") endif() -set(CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS "${HIP_RUNTIME_ROCM_PKG_DEPENDENCIES}, ${HIP_RUNTIME_DEB_STDPKG_DEPENDENCIES}") +set(CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS "${HIP_RUNTIME_ROCM_PKG_DEPENDENCIES}, ${HIP_RUNTIME_DEB_STDPKG_DEPENDENCIES}, libsimde-dev") set(CPACK_DEBIAN_BINARY_PACKAGE_PROVIDES "hip-rocclr (= ${CPACK_PACKAGE_VERSION})") set(CPACK_DEBIAN_BINARY_PACKAGE_REPLACES "hip-rocclr (= ${CPACK_PACKAGE_VERSION})") @@ -223,7 +223,7 @@ else() endif() endif() -set(CPACK_RPM_BINARY_PACKAGE_REQUIRES "${HIP_RUNTIME_ROCM_PKG_DEPENDENCIES}, ${HIP_RUNTIME_RPM_STDPKG_DEPENDENCIES}") +set(CPACK_RPM_BINARY_PACKAGE_REQUIRES "${HIP_RUNTIME_ROCM_PKG_DEPENDENCIES}, ${HIP_RUNTIME_RPM_STDPKG_DEPENDENCIES}, simde-devel") set(CPACK_RPM_BINARY_PACKAGE_PROVIDES "hip-rocclr = ${HIP_BASE_VERSION}") set(CPACK_RPM_BINARY_PACKAGE_OBSOLETES "hip-rocclr = ${HIP_BASE_VERSION}") diff --git a/projects/clr/hipamd/src/hip_embed_pch.sh b/projects/clr/hipamd/src/hip_embed_pch.sh index 6c92d43884..4593f76444 100755 --- a/projects/clr/hipamd/src/hip_embed_pch.sh +++ b/projects/clr/hipamd/src/hip_embed_pch.sh @@ -142,19 +142,20 @@ __hip_pch_wave64_size: .long __hip_pch_wave64_size - __hip_pch_wave64 EOF + host_triple="$(uname -m)" set -x $LLVM_DIR/bin/clang -O3 --hip-path=$HIP_INC_DIR/.. -std=c++17 -nogpulib -isystem $HIP_INC_DIR -isystem $HIP_BUILD_INC_DIR -isystem $HIP_AMD_INC_DIR --cuda-device-only --cuda-gpu-arch=gfx1030 -x hip $tmp/hip_pch.h -E >$tmp/pch_wave32.cui && cat $tmp/hip_macros.h >> $tmp/pch_wave32.cui && - $LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip_wave32.pch -x hip-cpp-output - <$tmp/pch_wave32.cui && + $LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple "$host_triple" -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip_wave32.pch -x hip-cpp-output - <$tmp/pch_wave32.cui && $LLVM_DIR/bin/clang -O3 --hip-path=$HIP_INC_DIR/.. -std=c++17 -nogpulib -isystem $HIP_INC_DIR -isystem $HIP_BUILD_INC_DIR -isystem $HIP_AMD_INC_DIR --cuda-device-only -x hip $tmp/hip_pch.h -E >$tmp/pch_wave64.cui && cat $tmp/hip_macros.h >> $tmp/pch_wave64.cui && - $LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip_wave64.pch -x hip-cpp-output - <$tmp/pch_wave64.cui && + $LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple "$host_triple" -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip_wave64.pch -x hip-cpp-output - <$tmp/pch_wave64.cui && $LLVM_DIR/bin/llvm-mc -o hip_pch.o $tmp/hip_pch.mcin --filetype=obj && diff --git a/projects/clr/hipamd/src/hip_graph_internal.cpp b/projects/clr/hipamd/src/hip_graph_internal.cpp index ab8d069988..253036753f 100644 --- a/projects/clr/hipamd/src/hip_graph_internal.cpp +++ b/projects/clr/hipamd/src/hip_graph_internal.cpp @@ -19,6 +19,9 @@ THE SOFTWARE. */ #include "hip_graph_internal.hpp" +#include +#include + #define CASE_STRING(X, C) \ case X: \ @@ -2093,9 +2096,9 @@ void GraphKernelArgManager::ReadBackOrFlush() { // Read-modify-write sequence with memory barriers volatile unsigned char kSentinel = *sentinel_ptr; - _mm_sfence(); + simde_mm_sfence(); *sentinel_ptr = kSentinel; - _mm_mfence(); + simde_mm_mfence(); kSentinel = *sentinel_ptr; (void)kSentinel; // Suppress unused variable warning } diff --git a/projects/clr/rocclr/cmake/FindSIMDe.cmake b/projects/clr/rocclr/cmake/FindSIMDe.cmake new file mode 100644 index 0000000000..086c3bd7b6 --- /dev/null +++ b/projects/clr/rocclr/cmake/FindSIMDe.cmake @@ -0,0 +1,63 @@ +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +include(FindPackageHandleStandardArgs) + +find_package(PkgConfig REQUIRED) +if(PkgConfig_FOUND) + pkg_check_modules(simde IMPORTED_TARGET simde) +endif() + +if(PkgConfig_FOUND AND simde_FOUND) + message(STATUS "Found SIMDe via pkg-config") + set(SIMDE_TARGET PkgConfig::simde) +else() + message(STATUS "SIMDe not found via pkg-config. Falling back to find_path...") + + if(WIN32) + find_path(SIMDE_INCLUDE_DIR + NAMES simde/simde-common.h + PATHS + "$ENV{DK_ROOT}/simde" + NO_DEFAULT_PATH + ) + elseif(UNIX) + find_path(SIMDE_INCLUDE_DIR + NAMES simde/simde-common.h + PATHS + /usr/include + /usr/local/include + NO_DEFAULT_PATH + ) + endif() + + find_package_handle_standard_args(SIMDe + REQUIRED_VARS SIMDE_INCLUDE_DIR) + if(SIMDE_FOUND) + message(STATUS "Found SIMDe headers at: ${SIMDE_INCLUDE_DIR}") + if(NOT TARGET SIMDE) + add_library(SIMDE INTERFACE) + target_include_directories(SIMDE INTERFACE ${SIMDE_INCLUDE_DIR}) + endif() + set(SIMDE_TARGET SIMDE) + else() + message(WARNING "could not find simde") + endif() +endif() \ No newline at end of file diff --git a/projects/clr/rocclr/cmake/ROCclr.cmake b/projects/clr/rocclr/cmake/ROCclr.cmake index e30e2de56f..8a1b132ae4 100644 --- a/projects/clr/rocclr/cmake/ROCclr.cmake +++ b/projects/clr/rocclr/cmake/ROCclr.cmake @@ -144,3 +144,7 @@ endif() if(ROCCLR_ENABLE_PAL) include(ROCclrPAL) endif() + +find_package(SIMDe REQUIRED) +target_link_libraries(rocclr PUBLIC ${SIMDE_TARGET}) + diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 9a74b599ef..7472ad82de 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -35,6 +35,15 @@ #include "utils/debug.hpp" #include "os/os.hpp" +#include +#include +#if defined(SIMDE_VERSION_MAJOR) && \ + ((SIMDE_VERSION_MAJOR > 0) || (SIMDE_VERSION_MAJOR == 0 && SIMDE_VERSION_MINOR >= 7)) + +#include +#endif + + #include #include #include @@ -44,14 +53,6 @@ #include #include -#if defined(__AVX__) -#if defined(__MINGW64__) -#include -#else -#include -#endif -#endif - /** * HSA image object size in bytes (see HSA spec) */ @@ -3536,49 +3537,44 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) { #if IS_LINUX __attribute__((optimize("unroll-all-loops"), always_inline)) static inline void nontemporalMemcpy( void* __restrict dst, const void* __restrict src, size_t size) { -#if defined(ATI_ARCH_X86) -#if defined(__AVX512F__) - for (auto i = 0u; i != size / sizeof(__m512i); ++i) { - _mm512_stream_si512(reinterpret_cast<__m512i* __restrict&>(dst)++, - *reinterpret_cast(src)++); +#if defined(__AVX512F__) && false // Disable until SIMDe adds support. + for (auto i = 0u; i != size / sizeof(simde__m512i); ++i) { + simde_mm512_stream_si512(reinterpret_cast(dst)++, + *reinterpret_cast(src)++); } - size = size % sizeof(__m512i); + size = size % sizeof(simde__m512i); #endif #if defined(__AVX__) - for (auto i = 0u; i != size / sizeof(__m256i); ++i) { - _mm256_stream_si256(reinterpret_cast<__m256i* __restrict&>(dst)++, - *reinterpret_cast(src)++); + for (auto i = 0u; i != size / sizeof(simde__m256i); ++i) { + simde_mm256_stream_si256(reinterpret_cast(dst)++, + *reinterpret_cast(src)++); } - size = size % sizeof(__m256i); + size = size % sizeof(simde__m256i); #endif - - for (auto i = 0u; i != size / sizeof(__m128i); ++i) { - _mm_stream_si128(reinterpret_cast<__m128i* __restrict&>(dst)++, - *(reinterpret_cast(src)++)); + for (auto i = 0u; i != size / sizeof(simde__m128i); ++i) { + simde_mm_stream_si128(reinterpret_cast(dst)++, + *(reinterpret_cast(src)++)); } - size = size % sizeof(__m128i); + size = size % sizeof(simde__m128i); - for (auto i = 0u; i != size / sizeof(long long); ++i) { - _mm_stream_si64(reinterpret_cast(dst)++, - *reinterpret_cast(src)++); + for (auto i = 0u; i != size / sizeof(int64_t); ++i) { + simde_mm_stream_si64(reinterpret_cast(dst)++, + *reinterpret_cast(src)++); } - size = size % sizeof(long long); + size = size % sizeof(int64_t); - for (auto i = 0u; i != size / sizeof(int); ++i) { - _mm_stream_si32(reinterpret_cast(dst)++, - *reinterpret_cast(src)++); + for (auto i = 0u; i != size / sizeof(int32_t); ++i) { + simde_mm_stream_si32(reinterpret_cast(dst)++, + *reinterpret_cast(src)++); } - size = size % sizeof(int); + size = size % sizeof(int32_t); // Copy remaining bytes for unaligned size std::memcpy(dst, src, size); // Add memory fence - _mm_sfence(); -#else - std::memcpy(dst, src, size); -#endif + simde_mm_sfence(); } #else static inline void nontemporalMemcpy(void* __restrict dst, const void* __restrict src, @@ -3835,9 +3831,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const *dev().info().hdpMemFlushCntl = 1u; auto kSentinel = *reinterpret_cast(dev().info().hdpMemFlushCntl); } else if (kernArgImpl == KernelArgImpl::DeviceKernelArgsReadback && argSize != 0) { - _mm_sfence(); + simde_mm_sfence(); *(argBuffer + argSize - 1) = *(parameters + argSize - 1); - _mm_mfence(); + simde_mm_mfence(); auto kSentinel = *reinterpret_cast(argBuffer + argSize - 1); } } diff --git a/projects/clr/rocclr/os/os.cpp b/projects/clr/rocclr/os/os.cpp index 35ba90c2fb..0a0ab91a03 100644 --- a/projects/clr/rocclr/os/os.cpp +++ b/projects/clr/rocclr/os/os.cpp @@ -30,10 +30,8 @@ #include #include #endif // !_WIN32 - -#if defined(ATI_ARCH_X86) -#include // for _mm_pause -#endif // ATI_ARCH_X86 +#include +#include namespace amd { @@ -120,13 +118,7 @@ size_t Os::pageSize_ = 0; int Os::processorCount_ = 0; -void Os::spinPause() { -#if defined(ATI_ARCH_X86) - _mm_pause(); -#elif defined(ATI_ARCH_ARM) - __asm__ __volatile__("yield"); -#endif -} +void Os::spinPause() { simde_mm_pause(); } void Os::sleep(long n) { // FIXME_lmoriche: Should be nano-seconds not seconds.