69 rader
2.7 KiB
Diff
69 rader
2.7 KiB
Diff
|
|
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
|
||
|
|
index 5abfd73284..ddabe0c192 100644
|
||
|
|
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
|
||
|
|
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
|
||
|
|
@@ -3411,49 +3411,7 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) {
|
||
|
|
#if IS_LINUX
|
||
|
|
__attribute__((optimize("unroll-all-loops"), always_inline)) static inline void nontemporalMemcpy(
|
||
|
|
void* __restrict dst, const void* __restrict src, size_t size) {
|
||
|
|
-#if defined(ATI_ARCH_X86)
|
||
|
|
-#if defined(__AVX512F__)
|
||
|
|
- for (auto i = 0u; i != size / sizeof(__m512i); ++i) {
|
||
|
|
- _mm512_stream_si512(reinterpret_cast<__m512i* __restrict&>(dst)++,
|
||
|
|
- *reinterpret_cast<const __m512i* __restrict&>(src)++);
|
||
|
|
- }
|
||
|
|
- size = size % sizeof(__m512i);
|
||
|
|
-#endif
|
||
|
|
-
|
||
|
|
-#if defined(__AVX__)
|
||
|
|
- for (auto i = 0u; i != size / sizeof(__m256i); ++i) {
|
||
|
|
- _mm256_stream_si256(reinterpret_cast<__m256i* __restrict&>(dst)++,
|
||
|
|
- *reinterpret_cast<const __m256i* __restrict&>(src)++);
|
||
|
|
- }
|
||
|
|
- size = size % sizeof(__m256i);
|
||
|
|
-#endif
|
||
|
|
-
|
||
|
|
- for (auto i = 0u; i != size / sizeof(__m128i); ++i) {
|
||
|
|
- _mm_stream_si128(reinterpret_cast<__m128i* __restrict&>(dst)++,
|
||
|
|
- *(reinterpret_cast<const __m128i* __restrict&>(src)++));
|
||
|
|
- }
|
||
|
|
- size = size % sizeof(__m128i);
|
||
|
|
-
|
||
|
|
- for (auto i = 0u; i != size / sizeof(long long); ++i) {
|
||
|
|
- _mm_stream_si64(reinterpret_cast<long long* __restrict&>(dst)++,
|
||
|
|
- *reinterpret_cast<const long long* __restrict&>(src)++);
|
||
|
|
- }
|
||
|
|
- size = size % sizeof(long long);
|
||
|
|
-
|
||
|
|
- for (auto i = 0u; i != size / sizeof(int); ++i) {
|
||
|
|
- _mm_stream_si32(reinterpret_cast<int* __restrict&>(dst)++,
|
||
|
|
- *reinterpret_cast<const int* __restrict&>(src)++);
|
||
|
|
- }
|
||
|
|
-
|
||
|
|
- size = size % sizeof(int);
|
||
|
|
- // Copy remaining bytes for unaligned size
|
||
|
|
std::memcpy(dst, src, size);
|
||
|
|
-
|
||
|
|
- // Add memory fence
|
||
|
|
- _mm_sfence();
|
||
|
|
-#else
|
||
|
|
- std::memcpy(dst, src, size);
|
||
|
|
-#endif
|
||
|
|
}
|
||
|
|
#else
|
||
|
|
static inline void nontemporalMemcpy(void* __restrict dst, const void* __restrict src,
|
||
|
|
@@ -3710,10 +3668,10 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||
|
|
*dev().info().hdpMemFlushCntl = 1u;
|
||
|
|
auto kSentinel = *reinterpret_cast<volatile int*>(dev().info().hdpMemFlushCntl);
|
||
|
|
} else if (kernArgImpl == KernelArgImpl::DeviceKernelArgsReadback && argSize != 0) {
|
||
|
|
- _mm_sfence();
|
||
|
|
+ //_mm_sfence();
|
||
|
|
*(argBuffer + argSize - 1) = *(parameters + argSize - 1);
|
||
|
|
- _mm_mfence();
|
||
|
|
- auto kSentinel = *reinterpret_cast<volatile unsigned char*>(argBuffer + argSize - 1);
|
||
|
|
+ //_mm_mfence();
|
||
|
|
+ //auto kSentinel = *reinterpret_cast<volatile unsigned char*>(argBuffer + argSize - 1);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|