SWDEV-538607 - Add SIMDe as a build dependency, remove naked intrinsic use. (#500)

Co-authored-by: Alex Voicu <alexandru.voicu@amd.com>
Co-authored-by: Ioannis Assiouras <Ioannis.Assiouras@amd.com>
This commit is contained in:
systems-assistant[bot]
2025-12-15 17:40:51 +00:00
zatwierdzone przez GitHub
rodzic b9dc8f729a
commit b002c6a739
8 zmienionych plików z 114 dodań i 54 usunięć
@@ -67,6 +67,7 @@ jobs:
echo "$PATH;C:\Strawberry\c\bin" >> $GITHUB_PATH
choco install --no-progress -y awscli
echo "$PATH;C:\Program Files\Amazon\AWSCLIV2" >> $GITHUB_PATH
choco install --no-progress -y pkgconfiglite
- uses: iterative/setup-dvc@4bdfd2b0f6f1ad7e08afadb03b1a895c352a5239 # v2.0.0
with:
@@ -201,7 +201,7 @@ else()
set(HIP_RUNTIME_DEB_STDPKG_DEPENDENCIES "${HIP_RUNTIME_DEB_STDPKG_DEPENDENCIES}, libgcc-s1")
endif()
set(CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS "${HIP_RUNTIME_ROCM_PKG_DEPENDENCIES}, ${HIP_RUNTIME_DEB_STDPKG_DEPENDENCIES}")
set(CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS "${HIP_RUNTIME_ROCM_PKG_DEPENDENCIES}, ${HIP_RUNTIME_DEB_STDPKG_DEPENDENCIES}, libsimde-dev")
set(CPACK_DEBIAN_BINARY_PACKAGE_PROVIDES "hip-rocclr (= ${CPACK_PACKAGE_VERSION})")
set(CPACK_DEBIAN_BINARY_PACKAGE_REPLACES "hip-rocclr (= ${CPACK_PACKAGE_VERSION})")
@@ -223,7 +223,7 @@ else()
endif()
endif()
set(CPACK_RPM_BINARY_PACKAGE_REQUIRES "${HIP_RUNTIME_ROCM_PKG_DEPENDENCIES}, ${HIP_RUNTIME_RPM_STDPKG_DEPENDENCIES}")
set(CPACK_RPM_BINARY_PACKAGE_REQUIRES "${HIP_RUNTIME_ROCM_PKG_DEPENDENCIES}, ${HIP_RUNTIME_RPM_STDPKG_DEPENDENCIES}, simde-devel")
set(CPACK_RPM_BINARY_PACKAGE_PROVIDES "hip-rocclr = ${HIP_BASE_VERSION}")
set(CPACK_RPM_BINARY_PACKAGE_OBSOLETES "hip-rocclr = ${HIP_BASE_VERSION}")
@@ -142,19 +142,20 @@ __hip_pch_wave64_size:
.long __hip_pch_wave64_size - __hip_pch_wave64
EOF
host_triple="$(uname -m)"
set -x
$LLVM_DIR/bin/clang -O3 --hip-path=$HIP_INC_DIR/.. -std=c++17 -nogpulib -isystem $HIP_INC_DIR -isystem $HIP_BUILD_INC_DIR -isystem $HIP_AMD_INC_DIR --cuda-device-only --cuda-gpu-arch=gfx1030 -x hip $tmp/hip_pch.h -E >$tmp/pch_wave32.cui &&
cat $tmp/hip_macros.h >> $tmp/pch_wave32.cui &&
$LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip_wave32.pch -x hip-cpp-output - <$tmp/pch_wave32.cui &&
$LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple "$host_triple" -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip_wave32.pch -x hip-cpp-output - <$tmp/pch_wave32.cui &&
$LLVM_DIR/bin/clang -O3 --hip-path=$HIP_INC_DIR/.. -std=c++17 -nogpulib -isystem $HIP_INC_DIR -isystem $HIP_BUILD_INC_DIR -isystem $HIP_AMD_INC_DIR --cuda-device-only -x hip $tmp/hip_pch.h -E >$tmp/pch_wave64.cui &&
cat $tmp/hip_macros.h >> $tmp/pch_wave64.cui &&
$LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip_wave64.pch -x hip-cpp-output - <$tmp/pch_wave64.cui &&
$LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple "$host_triple" -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip_wave64.pch -x hip-cpp-output - <$tmp/pch_wave64.cui &&
$LLVM_DIR/bin/llvm-mc -o hip_pch.o $tmp/hip_pch.mcin --filetype=obj &&
@@ -19,6 +19,9 @@
THE SOFTWARE. */
#include "hip_graph_internal.hpp"
#include <cmath>
#include <simde/x86/sse2.h>
#define CASE_STRING(X, C) \
case X: \
@@ -2093,9 +2096,9 @@ void GraphKernelArgManager::ReadBackOrFlush() {
// Read-modify-write sequence with memory barriers
volatile unsigned char kSentinel = *sentinel_ptr;
_mm_sfence();
simde_mm_sfence();
*sentinel_ptr = kSentinel;
_mm_mfence();
simde_mm_mfence();
kSentinel = *sentinel_ptr;
(void)kSentinel; // Suppress unused variable warning
}
@@ -0,0 +1,63 @@
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
include(FindPackageHandleStandardArgs)
find_package(PkgConfig REQUIRED)
if(PkgConfig_FOUND)
pkg_check_modules(simde IMPORTED_TARGET simde)
endif()
if(PkgConfig_FOUND AND simde_FOUND)
message(STATUS "Found SIMDe via pkg-config")
set(SIMDE_TARGET PkgConfig::simde)
else()
message(STATUS "SIMDe not found via pkg-config. Falling back to find_path...")
if(WIN32)
find_path(SIMDE_INCLUDE_DIR
NAMES simde/simde-common.h
PATHS
"$ENV{DK_ROOT}/simde"
NO_DEFAULT_PATH
)
elseif(UNIX)
find_path(SIMDE_INCLUDE_DIR
NAMES simde/simde-common.h
PATHS
/usr/include
/usr/local/include
NO_DEFAULT_PATH
)
endif()
find_package_handle_standard_args(SIMDe
REQUIRED_VARS SIMDE_INCLUDE_DIR)
if(SIMDE_FOUND)
message(STATUS "Found SIMDe headers at: ${SIMDE_INCLUDE_DIR}")
if(NOT TARGET SIMDE)
add_library(SIMDE INTERFACE)
target_include_directories(SIMDE INTERFACE ${SIMDE_INCLUDE_DIR})
endif()
set(SIMDE_TARGET SIMDE)
else()
message(WARNING "could not find simde")
endif()
endif()
@@ -144,3 +144,7 @@ endif()
if(ROCCLR_ENABLE_PAL)
include(ROCclrPAL)
endif()
find_package(SIMDe REQUIRED)
target_link_libraries(rocclr PUBLIC ${SIMDE_TARGET})
@@ -35,6 +35,15 @@
#include "utils/debug.hpp"
#include "os/os.hpp"
#include <simde/x86/avx.h>
#include <simde/x86/sse2.h>
#if defined(SIMDE_VERSION_MAJOR) && \
((SIMDE_VERSION_MAJOR > 0) || (SIMDE_VERSION_MAJOR == 0 && SIMDE_VERSION_MINOR >= 7))
#include <simde/x86/avx512.h>
#endif
#include <fstream>
#include <limits>
#include <memory>
@@ -44,14 +53,6 @@
#include <atomic>
#include <cinttypes>
#if defined(__AVX__)
#if defined(__MINGW64__)
#include <intrin.h>
#else
#include <immintrin.h>
#endif
#endif
/**
* HSA image object size in bytes (see HSA spec)
*/
@@ -3536,49 +3537,44 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) {
#if IS_LINUX
__attribute__((optimize("unroll-all-loops"), always_inline)) static inline void nontemporalMemcpy(
void* __restrict dst, const void* __restrict src, size_t size) {
#if defined(ATI_ARCH_X86)
#if defined(__AVX512F__)
for (auto i = 0u; i != size / sizeof(__m512i); ++i) {
_mm512_stream_si512(reinterpret_cast<__m512i* __restrict&>(dst)++,
*reinterpret_cast<const __m512i* __restrict&>(src)++);
#if defined(__AVX512F__) && false // Disable until SIMDe adds support.
for (auto i = 0u; i != size / sizeof(simde__m512i); ++i) {
simde_mm512_stream_si512(reinterpret_cast<simde__m512i* __restrict&>(dst)++,
*reinterpret_cast<const simde__m512i* __restrict&>(src)++);
}
size = size % sizeof(__m512i);
size = size % sizeof(simde__m512i);
#endif
#if defined(__AVX__)
for (auto i = 0u; i != size / sizeof(__m256i); ++i) {
_mm256_stream_si256(reinterpret_cast<__m256i* __restrict&>(dst)++,
*reinterpret_cast<const __m256i* __restrict&>(src)++);
for (auto i = 0u; i != size / sizeof(simde__m256i); ++i) {
simde_mm256_stream_si256(reinterpret_cast<simde__m256i* __restrict&>(dst)++,
*reinterpret_cast<const simde__m256i* __restrict&>(src)++);
}
size = size % sizeof(__m256i);
size = size % sizeof(simde__m256i);
#endif
for (auto i = 0u; i != size / sizeof(__m128i); ++i) {
_mm_stream_si128(reinterpret_cast<__m128i* __restrict&>(dst)++,
*(reinterpret_cast<const __m128i* __restrict&>(src)++));
for (auto i = 0u; i != size / sizeof(simde__m128i); ++i) {
simde_mm_stream_si128(reinterpret_cast<simde__m128i* __restrict&>(dst)++,
*(reinterpret_cast<const simde__m128i* __restrict&>(src)++));
}
size = size % sizeof(__m128i);
size = size % sizeof(simde__m128i);
for (auto i = 0u; i != size / sizeof(long long); ++i) {
_mm_stream_si64(reinterpret_cast<long long* __restrict&>(dst)++,
*reinterpret_cast<const long long* __restrict&>(src)++);
for (auto i = 0u; i != size / sizeof(int64_t); ++i) {
simde_mm_stream_si64(reinterpret_cast<int64_t* __restrict&>(dst)++,
*reinterpret_cast<const int64_t* __restrict&>(src)++);
}
size = size % sizeof(long long);
size = size % sizeof(int64_t);
for (auto i = 0u; i != size / sizeof(int); ++i) {
_mm_stream_si32(reinterpret_cast<int* __restrict&>(dst)++,
*reinterpret_cast<const int* __restrict&>(src)++);
for (auto i = 0u; i != size / sizeof(int32_t); ++i) {
simde_mm_stream_si32(reinterpret_cast<int32_t* __restrict&>(dst)++,
*reinterpret_cast<const int32_t* __restrict&>(src)++);
}
size = size % sizeof(int);
size = size % sizeof(int32_t);
// Copy remaining bytes for unaligned size
std::memcpy(dst, src, size);
// Add memory fence
_mm_sfence();
#else
std::memcpy(dst, src, size);
#endif
simde_mm_sfence();
}
#else
static inline void nontemporalMemcpy(void* __restrict dst, const void* __restrict src,
@@ -3835,9 +3831,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
*dev().info().hdpMemFlushCntl = 1u;
auto kSentinel = *reinterpret_cast<volatile int*>(dev().info().hdpMemFlushCntl);
} else if (kernArgImpl == KernelArgImpl::DeviceKernelArgsReadback && argSize != 0) {
_mm_sfence();
simde_mm_sfence();
*(argBuffer + argSize - 1) = *(parameters + argSize - 1);
_mm_mfence();
simde_mm_mfence();
auto kSentinel = *reinterpret_cast<volatile unsigned char*>(argBuffer + argSize - 1);
}
}
+3 -11
Wyświetl plik
@@ -30,10 +30,8 @@
#include <time.h>
#include <unistd.h>
#endif // !_WIN32
#if defined(ATI_ARCH_X86)
#include <xmmintrin.h> // for _mm_pause
#endif // ATI_ARCH_X86
#include <cmath>
#include <simde/x86/sse2.h>
namespace amd {
@@ -120,13 +118,7 @@ size_t Os::pageSize_ = 0;
int Os::processorCount_ = 0;
void Os::spinPause() {
#if defined(ATI_ARCH_X86)
_mm_pause();
#elif defined(ATI_ARCH_ARM)
__asm__ __volatile__("yield");
#endif
}
void Os::spinPause() { simde_mm_pause(); }
void Os::sleep(long n) {
// FIXME_lmoriche: Should be nano-seconds not seconds.