Files

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

72 lines
3.0 KiB
Diff

diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index 5abfd73284..1fcc4d8217 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -3411,49 +3411,7 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) {
#if IS_LINUX
__attribute__((optimize("unroll-all-loops"), always_inline)) static inline void nontemporalMemcpy(
void* __restrict dst, const void* __restrict src, size_t size) {
-#if defined(ATI_ARCH_X86)
-#if defined(__AVX512F__)
- for (auto i = 0u; i != size / sizeof(__m512i); ++i) {
- _mm512_stream_si512(reinterpret_cast<__m512i* __restrict&>(dst)++,
- *reinterpret_cast<const __m512i* __restrict&>(src)++);
- }
- size = size % sizeof(__m512i);
-#endif
-
-#if defined(__AVX__)
- for (auto i = 0u; i != size / sizeof(__m256i); ++i) {
- _mm256_stream_si256(reinterpret_cast<__m256i* __restrict&>(dst)++,
- *reinterpret_cast<const __m256i* __restrict&>(src)++);
- }
- size = size % sizeof(__m256i);
-#endif
-
- for (auto i = 0u; i != size / sizeof(__m128i); ++i) {
- _mm_stream_si128(reinterpret_cast<__m128i* __restrict&>(dst)++,
- *(reinterpret_cast<const __m128i* __restrict&>(src)++));
- }
- size = size % sizeof(__m128i);
-
- for (auto i = 0u; i != size / sizeof(long long); ++i) {
- _mm_stream_si64(reinterpret_cast<long long* __restrict&>(dst)++,
- *reinterpret_cast<const long long* __restrict&>(src)++);
- }
- size = size % sizeof(long long);
-
- for (auto i = 0u; i != size / sizeof(int); ++i) {
- _mm_stream_si32(reinterpret_cast<int* __restrict&>(dst)++,
- *reinterpret_cast<const int* __restrict&>(src)++);
- }
-
- size = size % sizeof(int);
- // Copy remaining bytes for unaligned size
std::memcpy(dst, src, size);
-
- // Add memory fence
- _mm_sfence();
-#else
- std::memcpy(dst, src, size);
-#endif
}
#else
static inline void nontemporalMemcpy(void* __restrict dst, const void* __restrict src,
@@ -3708,12 +3666,12 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
const auto kernArgImpl = dev().settings().kernel_arg_impl_;
if (kernArgImpl == KernelArgImpl::DeviceKernelArgsHDP) {
*dev().info().hdpMemFlushCntl = 1u;
- auto kSentinel = *reinterpret_cast<volatile int*>(dev().info().hdpMemFlushCntl);
+ //auto kSentinel = *reinterpret_cast<volatile int*>(dev().info().hdpMemFlushCntl);
} else if (kernArgImpl == KernelArgImpl::DeviceKernelArgsReadback && argSize != 0) {
- _mm_sfence();
+ //_mm_sfence();
*(argBuffer + argSize - 1) = *(parameters + argSize - 1);
- _mm_mfence();
- auto kSentinel = *reinterpret_cast<volatile unsigned char*>(argBuffer + argSize - 1);
+ //_mm_mfence();
+ //auto kSentinel = *reinterpret_cast<volatile unsigned char*>(argBuffer + argSize - 1);
}
}
}