SWDEV-538607 - Add SIMDe as a build dependency, remove naked intrinsic use. (#500)
Co-authored-by: Alex Voicu <alexandru.voicu@amd.com> Co-authored-by: Ioannis Assiouras <Ioannis.Assiouras@amd.com>
This commit is contained in:
committato da
GitHub
parent
b9dc8f729a
commit
b002c6a739
@@ -67,6 +67,7 @@ jobs:
|
||||
echo "$PATH;C:\Strawberry\c\bin" >> $GITHUB_PATH
|
||||
choco install --no-progress -y awscli
|
||||
echo "$PATH;C:\Program Files\Amazon\AWSCLIV2" >> $GITHUB_PATH
|
||||
choco install --no-progress -y pkgconfiglite
|
||||
|
||||
- uses: iterative/setup-dvc@4bdfd2b0f6f1ad7e08afadb03b1a895c352a5239 # v2.0.0
|
||||
with:
|
||||
|
||||
@@ -201,7 +201,7 @@ else()
|
||||
set(HIP_RUNTIME_DEB_STDPKG_DEPENDENCIES "${HIP_RUNTIME_DEB_STDPKG_DEPENDENCIES}, libgcc-s1")
|
||||
endif()
|
||||
|
||||
set(CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS "${HIP_RUNTIME_ROCM_PKG_DEPENDENCIES}, ${HIP_RUNTIME_DEB_STDPKG_DEPENDENCIES}")
|
||||
set(CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS "${HIP_RUNTIME_ROCM_PKG_DEPENDENCIES}, ${HIP_RUNTIME_DEB_STDPKG_DEPENDENCIES}, libsimde-dev")
|
||||
|
||||
set(CPACK_DEBIAN_BINARY_PACKAGE_PROVIDES "hip-rocclr (= ${CPACK_PACKAGE_VERSION})")
|
||||
set(CPACK_DEBIAN_BINARY_PACKAGE_REPLACES "hip-rocclr (= ${CPACK_PACKAGE_VERSION})")
|
||||
@@ -223,7 +223,7 @@ else()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set(CPACK_RPM_BINARY_PACKAGE_REQUIRES "${HIP_RUNTIME_ROCM_PKG_DEPENDENCIES}, ${HIP_RUNTIME_RPM_STDPKG_DEPENDENCIES}")
|
||||
set(CPACK_RPM_BINARY_PACKAGE_REQUIRES "${HIP_RUNTIME_ROCM_PKG_DEPENDENCIES}, ${HIP_RUNTIME_RPM_STDPKG_DEPENDENCIES}, simde-devel")
|
||||
|
||||
set(CPACK_RPM_BINARY_PACKAGE_PROVIDES "hip-rocclr = ${HIP_BASE_VERSION}")
|
||||
set(CPACK_RPM_BINARY_PACKAGE_OBSOLETES "hip-rocclr = ${HIP_BASE_VERSION}")
|
||||
|
||||
@@ -142,19 +142,20 @@ __hip_pch_wave64_size:
|
||||
.long __hip_pch_wave64_size - __hip_pch_wave64
|
||||
EOF
|
||||
|
||||
host_triple="$(uname -m)"
|
||||
set -x
|
||||
|
||||
$LLVM_DIR/bin/clang -O3 --hip-path=$HIP_INC_DIR/.. -std=c++17 -nogpulib -isystem $HIP_INC_DIR -isystem $HIP_BUILD_INC_DIR -isystem $HIP_AMD_INC_DIR --cuda-device-only --cuda-gpu-arch=gfx1030 -x hip $tmp/hip_pch.h -E >$tmp/pch_wave32.cui &&
|
||||
|
||||
cat $tmp/hip_macros.h >> $tmp/pch_wave32.cui &&
|
||||
|
||||
$LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip_wave32.pch -x hip-cpp-output - <$tmp/pch_wave32.cui &&
|
||||
$LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple "$host_triple" -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip_wave32.pch -x hip-cpp-output - <$tmp/pch_wave32.cui &&
|
||||
|
||||
$LLVM_DIR/bin/clang -O3 --hip-path=$HIP_INC_DIR/.. -std=c++17 -nogpulib -isystem $HIP_INC_DIR -isystem $HIP_BUILD_INC_DIR -isystem $HIP_AMD_INC_DIR --cuda-device-only -x hip $tmp/hip_pch.h -E >$tmp/pch_wave64.cui &&
|
||||
|
||||
cat $tmp/hip_macros.h >> $tmp/pch_wave64.cui &&
|
||||
|
||||
$LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip_wave64.pch -x hip-cpp-output - <$tmp/pch_wave64.cui &&
|
||||
$LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple "$host_triple" -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip_wave64.pch -x hip-cpp-output - <$tmp/pch_wave64.cui &&
|
||||
|
||||
$LLVM_DIR/bin/llvm-mc -o hip_pch.o $tmp/hip_pch.mcin --filetype=obj &&
|
||||
|
||||
|
||||
@@ -19,6 +19,9 @@
|
||||
THE SOFTWARE. */
|
||||
|
||||
#include "hip_graph_internal.hpp"
|
||||
#include <cmath>
|
||||
#include <simde/x86/sse2.h>
|
||||
|
||||
|
||||
#define CASE_STRING(X, C) \
|
||||
case X: \
|
||||
@@ -2093,9 +2096,9 @@ void GraphKernelArgManager::ReadBackOrFlush() {
|
||||
|
||||
// Read-modify-write sequence with memory barriers
|
||||
volatile unsigned char kSentinel = *sentinel_ptr;
|
||||
_mm_sfence();
|
||||
simde_mm_sfence();
|
||||
*sentinel_ptr = kSentinel;
|
||||
_mm_mfence();
|
||||
simde_mm_mfence();
|
||||
kSentinel = *sentinel_ptr;
|
||||
(void)kSentinel; // Suppress unused variable warning
|
||||
}
|
||||
|
||||
@@ -0,0 +1,63 @@
|
||||
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
|
||||
find_package(PkgConfig REQUIRED)
|
||||
if(PkgConfig_FOUND)
|
||||
pkg_check_modules(simde IMPORTED_TARGET simde)
|
||||
endif()
|
||||
|
||||
if(PkgConfig_FOUND AND simde_FOUND)
|
||||
message(STATUS "Found SIMDe via pkg-config")
|
||||
set(SIMDE_TARGET PkgConfig::simde)
|
||||
else()
|
||||
message(STATUS "SIMDe not found via pkg-config. Falling back to find_path...")
|
||||
|
||||
if(WIN32)
|
||||
find_path(SIMDE_INCLUDE_DIR
|
||||
NAMES simde/simde-common.h
|
||||
PATHS
|
||||
"$ENV{DK_ROOT}/simde"
|
||||
NO_DEFAULT_PATH
|
||||
)
|
||||
elseif(UNIX)
|
||||
find_path(SIMDE_INCLUDE_DIR
|
||||
NAMES simde/simde-common.h
|
||||
PATHS
|
||||
/usr/include
|
||||
/usr/local/include
|
||||
NO_DEFAULT_PATH
|
||||
)
|
||||
endif()
|
||||
|
||||
find_package_handle_standard_args(SIMDe
|
||||
REQUIRED_VARS SIMDE_INCLUDE_DIR)
|
||||
if(SIMDE_FOUND)
|
||||
message(STATUS "Found SIMDe headers at: ${SIMDE_INCLUDE_DIR}")
|
||||
if(NOT TARGET SIMDE)
|
||||
add_library(SIMDE INTERFACE)
|
||||
target_include_directories(SIMDE INTERFACE ${SIMDE_INCLUDE_DIR})
|
||||
endif()
|
||||
set(SIMDE_TARGET SIMDE)
|
||||
else()
|
||||
message(WARNING "could not find simde")
|
||||
endif()
|
||||
endif()
|
||||
@@ -144,3 +144,7 @@ endif()
|
||||
if(ROCCLR_ENABLE_PAL)
|
||||
include(ROCclrPAL)
|
||||
endif()
|
||||
|
||||
find_package(SIMDe REQUIRED)
|
||||
target_link_libraries(rocclr PUBLIC ${SIMDE_TARGET})
|
||||
|
||||
|
||||
@@ -35,6 +35,15 @@
|
||||
#include "utils/debug.hpp"
|
||||
#include "os/os.hpp"
|
||||
|
||||
#include <simde/x86/avx.h>
|
||||
#include <simde/x86/sse2.h>
|
||||
#if defined(SIMDE_VERSION_MAJOR) && \
|
||||
((SIMDE_VERSION_MAJOR > 0) || (SIMDE_VERSION_MAJOR == 0 && SIMDE_VERSION_MINOR >= 7))
|
||||
|
||||
#include <simde/x86/avx512.h>
|
||||
#endif
|
||||
|
||||
|
||||
#include <fstream>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
@@ -44,14 +53,6 @@
|
||||
#include <atomic>
|
||||
#include <cinttypes>
|
||||
|
||||
#if defined(__AVX__)
|
||||
#if defined(__MINGW64__)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/**
|
||||
* HSA image object size in bytes (see HSA spec)
|
||||
*/
|
||||
@@ -3536,49 +3537,44 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) {
|
||||
#if IS_LINUX
|
||||
__attribute__((optimize("unroll-all-loops"), always_inline)) static inline void nontemporalMemcpy(
|
||||
void* __restrict dst, const void* __restrict src, size_t size) {
|
||||
#if defined(ATI_ARCH_X86)
|
||||
#if defined(__AVX512F__)
|
||||
for (auto i = 0u; i != size / sizeof(__m512i); ++i) {
|
||||
_mm512_stream_si512(reinterpret_cast<__m512i* __restrict&>(dst)++,
|
||||
*reinterpret_cast<const __m512i* __restrict&>(src)++);
|
||||
#if defined(__AVX512F__) && false // Disable until SIMDe adds support.
|
||||
for (auto i = 0u; i != size / sizeof(simde__m512i); ++i) {
|
||||
simde_mm512_stream_si512(reinterpret_cast<simde__m512i* __restrict&>(dst)++,
|
||||
*reinterpret_cast<const simde__m512i* __restrict&>(src)++);
|
||||
}
|
||||
size = size % sizeof(__m512i);
|
||||
size = size % sizeof(simde__m512i);
|
||||
#endif
|
||||
|
||||
#if defined(__AVX__)
|
||||
for (auto i = 0u; i != size / sizeof(__m256i); ++i) {
|
||||
_mm256_stream_si256(reinterpret_cast<__m256i* __restrict&>(dst)++,
|
||||
*reinterpret_cast<const __m256i* __restrict&>(src)++);
|
||||
for (auto i = 0u; i != size / sizeof(simde__m256i); ++i) {
|
||||
simde_mm256_stream_si256(reinterpret_cast<simde__m256i* __restrict&>(dst)++,
|
||||
*reinterpret_cast<const simde__m256i* __restrict&>(src)++);
|
||||
}
|
||||
size = size % sizeof(__m256i);
|
||||
size = size % sizeof(simde__m256i);
|
||||
#endif
|
||||
|
||||
for (auto i = 0u; i != size / sizeof(__m128i); ++i) {
|
||||
_mm_stream_si128(reinterpret_cast<__m128i* __restrict&>(dst)++,
|
||||
*(reinterpret_cast<const __m128i* __restrict&>(src)++));
|
||||
for (auto i = 0u; i != size / sizeof(simde__m128i); ++i) {
|
||||
simde_mm_stream_si128(reinterpret_cast<simde__m128i* __restrict&>(dst)++,
|
||||
*(reinterpret_cast<const simde__m128i* __restrict&>(src)++));
|
||||
}
|
||||
size = size % sizeof(__m128i);
|
||||
size = size % sizeof(simde__m128i);
|
||||
|
||||
for (auto i = 0u; i != size / sizeof(long long); ++i) {
|
||||
_mm_stream_si64(reinterpret_cast<long long* __restrict&>(dst)++,
|
||||
*reinterpret_cast<const long long* __restrict&>(src)++);
|
||||
for (auto i = 0u; i != size / sizeof(int64_t); ++i) {
|
||||
simde_mm_stream_si64(reinterpret_cast<int64_t* __restrict&>(dst)++,
|
||||
*reinterpret_cast<const int64_t* __restrict&>(src)++);
|
||||
}
|
||||
size = size % sizeof(long long);
|
||||
size = size % sizeof(int64_t);
|
||||
|
||||
for (auto i = 0u; i != size / sizeof(int); ++i) {
|
||||
_mm_stream_si32(reinterpret_cast<int* __restrict&>(dst)++,
|
||||
*reinterpret_cast<const int* __restrict&>(src)++);
|
||||
for (auto i = 0u; i != size / sizeof(int32_t); ++i) {
|
||||
simde_mm_stream_si32(reinterpret_cast<int32_t* __restrict&>(dst)++,
|
||||
*reinterpret_cast<const int32_t* __restrict&>(src)++);
|
||||
}
|
||||
|
||||
size = size % sizeof(int);
|
||||
size = size % sizeof(int32_t);
|
||||
// Copy remaining bytes for unaligned size
|
||||
std::memcpy(dst, src, size);
|
||||
|
||||
// Add memory fence
|
||||
_mm_sfence();
|
||||
#else
|
||||
std::memcpy(dst, src, size);
|
||||
#endif
|
||||
simde_mm_sfence();
|
||||
}
|
||||
#else
|
||||
static inline void nontemporalMemcpy(void* __restrict dst, const void* __restrict src,
|
||||
@@ -3835,9 +3831,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
*dev().info().hdpMemFlushCntl = 1u;
|
||||
auto kSentinel = *reinterpret_cast<volatile int*>(dev().info().hdpMemFlushCntl);
|
||||
} else if (kernArgImpl == KernelArgImpl::DeviceKernelArgsReadback && argSize != 0) {
|
||||
_mm_sfence();
|
||||
simde_mm_sfence();
|
||||
*(argBuffer + argSize - 1) = *(parameters + argSize - 1);
|
||||
_mm_mfence();
|
||||
simde_mm_mfence();
|
||||
auto kSentinel = *reinterpret_cast<volatile unsigned char*>(argBuffer + argSize - 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -30,10 +30,8 @@
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#endif // !_WIN32
|
||||
|
||||
#if defined(ATI_ARCH_X86)
|
||||
#include <xmmintrin.h> // for _mm_pause
|
||||
#endif // ATI_ARCH_X86
|
||||
#include <cmath>
|
||||
#include <simde/x86/sse2.h>
|
||||
|
||||
namespace amd {
|
||||
|
||||
@@ -120,13 +118,7 @@ size_t Os::pageSize_ = 0;
|
||||
|
||||
int Os::processorCount_ = 0;
|
||||
|
||||
void Os::spinPause() {
|
||||
#if defined(ATI_ARCH_X86)
|
||||
_mm_pause();
|
||||
#elif defined(ATI_ARCH_ARM)
|
||||
__asm__ __volatile__("yield");
|
||||
#endif
|
||||
}
|
||||
void Os::spinPause() { simde_mm_pause(); }
|
||||
|
||||
void Os::sleep(long n) {
|
||||
// FIXME_lmoriche: Should be nano-seconds not seconds.
|
||||
|
||||
Fai riferimento in un nuovo problema
Block a user