diff --git a/projects/clr/rocclr/device/blit.cpp b/projects/clr/rocclr/device/blit.cpp index 33b04c5987..140ab00f57 100644 --- a/projects/clr/rocclr/device/blit.cpp +++ b/projects/clr/rocclr/device/blit.cpp @@ -41,7 +41,7 @@ bool HostBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, } // Copy memory - amd::Os::fastMemcpy(dstHost, reinterpret_cast(src) + origin[0], size[0]); + std::memcpy(dstHost, reinterpret_cast(src) + origin[0], size[0]); // Unmap device memory srcMemory.cpuUnmap(vDev_); @@ -69,8 +69,8 @@ bool HostBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost, dstOffset = hostRect.offset(0, y, z); // Copy memory line by line - amd::Os::fastMemcpy((reinterpret_cast
(dstHost) + dstOffset), - (reinterpret_cast(src) + srcOffset), size[0]); + std::memcpy((reinterpret_cast
(dstHost) + dstOffset), + (reinterpret_cast(src) + srcOffset), size[0]); } } @@ -133,8 +133,8 @@ bool HostBlitManager::readImage(device::Memory& srcMemory, void* dstHost, // Copy memory line by line for (size_t row = 0; row < size[1]; ++row) { // Copy memory - amd::Os::fastMemcpy((reinterpret_cast
(dstHost) + dstOffs), - (reinterpret_cast(src) + srcOffs), copySize); + std::memcpy((reinterpret_cast
(dstHost) + dstOffs), + (reinterpret_cast(src) + srcOffs), copySize); srcOffs += srcRowPitch; dstOffs += rowPitch; @@ -163,7 +163,7 @@ bool HostBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory } // Copy memory - amd::Os::fastMemcpy(reinterpret_cast
(dst) + origin[0], srcHost, size[0]); + std::memcpy(reinterpret_cast
(dst) + origin[0], srcHost, size[0]); // Unmap the device memory dstMemory.cpuUnmap(vDev_); @@ -191,8 +191,8 @@ bool HostBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMe dstOffset = bufRect.offset(0, y, z); // Copy memory line by line - amd::Os::fastMemcpy((reinterpret_cast
(dst) + dstOffset), - (reinterpret_cast(srcHost) + srcOffset), size[0]); + std::memcpy((reinterpret_cast
(dst) + dstOffset), + (reinterpret_cast(srcHost) + srcOffset), size[0]); } } @@ -258,8 +258,8 @@ bool HostBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory, // Copy memory line by line for (size_t row = 0; row < size[1]; ++row) { // Copy memory - amd::Os::fastMemcpy((reinterpret_cast
(dst) + dstOffs), - (reinterpret_cast(srcHost) + srcOffs), copySize); + std::memcpy((reinterpret_cast
(dst) + dstOffs), + (reinterpret_cast(srcHost) + srcOffs), copySize); dstOffs += dstRowPitch; srcOffs += rowPitch; @@ -293,8 +293,8 @@ bool HostBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstM } // Straight forward buffer copy - amd::Os::fastMemcpy((reinterpret_cast
(dst) + dstOrigin[0]), - (reinterpret_cast(src) + srcOrigin[0]), size[0]); + std::memcpy((reinterpret_cast
(dst) + dstOrigin[0]), + (reinterpret_cast(src) + srcOrigin[0]), size[0]); // Unmap source and destination memory dstMemory.cpuUnmap(vDev_); @@ -329,8 +329,8 @@ bool HostBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& size_t dstOffset = dstRect.offset(0, y, z); // Copy memory line by line - amd::Os::fastMemcpy((reinterpret_cast
(dst) + dstOffset), - (reinterpret_cast(src) + srcOffset), size[0]); + std::memcpy((reinterpret_cast
(dst) + dstOffset), + (reinterpret_cast(src) + srcOffset), size[0]); } } @@ -392,8 +392,8 @@ bool HostBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memor // Copy memory line by line for (size_t rows = 0; rows < size[1]; ++rows) { - amd::Os::fastMemcpy((reinterpret_cast
(dst) + dstOffs), - (reinterpret_cast(src) + srcOffs), copySize); + std::memcpy((reinterpret_cast
(dst) + dstOffs), + (reinterpret_cast(src) + srcOffs), copySize); srcOffs += srcRowPitch; dstOffs += copySize; @@ -458,8 +458,8 @@ bool HostBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memor // Copy memory line by line for (size_t rows = 0; rows < size[1]; ++rows) { - amd::Os::fastMemcpy((reinterpret_cast
(dst) + dstOffs), - (reinterpret_cast(src) + srcOffs), copySize); + std::memcpy((reinterpret_cast
(dst) + dstOffs), + (reinterpret_cast(src) + srcOffs), copySize); srcOffs += copySize; dstOffs += dstRowPitch; @@ -544,8 +544,8 @@ bool HostBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMe // Copy memory line by line for (size_t rows = 0; rows < size[1]; ++rows) { - amd::Os::fastMemcpy((reinterpret_cast
(dst) + dstOffs), - (reinterpret_cast(src) + srcOffs), copySize); + std::memcpy((reinterpret_cast
(dst) + dstOffs), + (reinterpret_cast(src) + srcOffs), copySize); srcOffs += srcRowPitch; dstOffs += dstRowPitch; diff --git a/projects/clr/rocclr/device/pal/palprogram.cpp b/projects/clr/rocclr/device/pal/palprogram.cpp index 665ebf4038..29e845b6d0 100644 --- a/projects/clr/rocclr/device/pal/palprogram.cpp +++ b/projects/clr/rocclr/device/pal/palprogram.cpp @@ -139,10 +139,10 @@ bool Segment::alloc(HSAILProgram& prog, amdgpu_hsa_elf_segment_t segment, size_t void Segment::copy(size_t offset, const void* src, size_t size) { if (cpuAccess_ != nullptr) { - amd::Os::fastMemcpy(cpuAddress(offset), src, size); + std::memcpy(cpuAddress(offset), src, size); } else { if (cpuMem_ != nullptr) { - amd::Os::fastMemcpy(cpuAddress(offset), src, size); + std::memcpy(cpuAddress(offset), src, size); } amd::ScopedLock k(gpuAccess_->dev().xferMgr().lockXfer()); VirtualGPU& gpu = *gpuAccess_->dev().xferQueue(); @@ -568,7 +568,7 @@ void* PALHSALoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment, hsa_ag bool PALHSALoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size) { if (program_->isNull()) { - amd::Os::fastMemcpy(reinterpret_cast
(dst) + offset, src, size); + std::memcpy(reinterpret_cast
(dst) + offset, src, size); return true; } Segment* s = reinterpret_cast(dst); diff --git a/projects/clr/rocclr/device/pal/palresource.cpp b/projects/clr/rocclr/device/pal/palresource.cpp index 3c9fc843f2..dd801157bc 100644 --- a/projects/clr/rocclr/device/pal/palresource.cpp +++ b/projects/clr/rocclr/device/pal/palresource.cpp @@ -1700,7 +1700,7 @@ bool Resource::hostWrite(VirtualGPU* gpu, const void* hostPtr, const amd::Coord3 dst = static_cast(static_cast(dst) + origin[0]); // Copy memory - amd::Os::fastMemcpy(dst, hostPtr, copySize); + std::memcpy(dst, hostPtr, copySize); } else { size_t dstOffsBase = origin[0] * elementSize_; @@ -1728,7 +1728,7 @@ bool Resource::hostWrite(VirtualGPU* gpu, const void* hostPtr, const amd::Coord3 // Copy memory line by line for (size_t row = 0; row < size[1]; ++row) { // Copy memory - amd::Os::fastMemcpy((reinterpret_cast
(dst) + dstOffs), + std::memcpy((reinterpret_cast
(dst) + dstOffs), (reinterpret_cast(hostPtr) + srcOffs), size[0] * elementSize_); @@ -1770,7 +1770,7 @@ bool Resource::hostRead(VirtualGPU* gpu, void* hostPtr, const amd::Coord3D& orig src = static_cast(static_cast(src) + origin[0]); // Copy memory - amd::Os::fastMemcpy(hostPtr, src, copySize); + std::memcpy(hostPtr, src, copySize); } else { size_t srcOffsBase = origin[0] * elementSize_; @@ -1798,9 +1798,9 @@ bool Resource::hostRead(VirtualGPU* gpu, void* hostPtr, const amd::Coord3D& orig // Copy memory line by line for (size_t row = 0; row < size[1]; ++row) { // Copy memory - amd::Os::fastMemcpy((reinterpret_cast
(hostPtr) + dstOffs), - (reinterpret_cast(src) + srcOffs), - size[0] * elementSize_); + std::memcpy((reinterpret_cast
(hostPtr) + dstOffs), + (reinterpret_cast(src) + srcOffs), + size[0] * elementSize_); srcOffs += desc().pitch_ * elementSize_; dstOffs += rowPitch; @@ -1939,7 +1939,7 @@ bool Resource::isPersistentDirectMap(bool writeMap) const { if (directMap && desc().tiled_) { // Latest HW does have tiling apertures directMap = false; - } + } if (memoryType() == View) { directMap = viewOwner_->isPersistentDirectMap(writeMap); } diff --git a/projects/clr/rocclr/device/pal/palvirtual.cpp b/projects/clr/rocclr/device/pal/palvirtual.cpp index a5228f9b08..bb49c3af9f 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/device/pal/palvirtual.cpp @@ -1569,7 +1569,7 @@ void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& vcmd) { } if (nullptr == srcMem && nullptr == dstMem) { // both not in svm space - amd::Os::fastMemcpy(vcmd.dst(), vcmd.src(), vcmd.srcSize()); + std::memcpy(vcmd.dst(), vcmd.src(), vcmd.srcSize()); result = true; } else if (nullptr == srcMem && nullptr != dstMem) { // src not in svm space Memory* memory = dev().getGpuMemory(dstMem); diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp index 1cd47fb0ab..d4a80ed308 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.hpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp @@ -618,7 +618,7 @@ class Device : public NullDevice { mutable std::mutex lock_allow_access_; //!< To serialize allow_access calls hsa_agent_t bkendDevice_; uint32_t pciDeviceId_; - hsa_agent_t* p2p_agents_list_; + hsa_agent_t* p2p_agents_list_ = nullptr; hsa_profile_t agent_profile_; hsa_amd_memory_pool_t group_segment_; hsa_amd_memory_pool_t system_segment_; diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index d86463d005..3bb892696b 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -959,8 +959,7 @@ bool VirtualGPU::dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_ if (capturing == true) { packet->header = header; packet->setup = rest; - amd::Os::fastMemcpy(const_cast(aqlPacket), packet, - sizeof(hsa_kernel_dispatch_packet_t)); + std::memcpy(const_cast(aqlPacket), packet, sizeof(hsa_kernel_dispatch_packet_t)); return true; } else { dispatchBlockingWait(); @@ -1995,7 +1994,7 @@ void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) { // If these are from different contexts, then one of them could be in the device memory // This is fine, since spec doesn't allow for copies with pointers from different contexts - amd::Os::fastMemcpy(cmd.dst(), cmd.src(), cmd.srcSize()); + std::memcpy(cmd.dst(), cmd.src(), cmd.srcSize()); result = true; } else if (nullptr == srcMem && nullptr != dstMem) { // src not in svm space Memory* memory = dev().getRocMemory(dstMem); @@ -2158,7 +2157,7 @@ void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) { // Wait on a kernel if one is outstanding releaseGpuMemoryFence(); const void* mappedPtr = hsaMapMemory->owner()->getHostMem(); - amd::Os::fastMemcpy(cmd.svmPtr(), mappedPtr, cmd.size()[0]); + std::memcpy(cmd.svmPtr(), mappedPtr, cmd.size()[0]); } } else { LogError("Unhandled svm map!"); @@ -2189,7 +2188,7 @@ void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) { Memory* hsaMapMemory = dev().getRocMemory(memory->mapMemory()); void* mappedPtr = hsaMapMemory->owner()->getHostMem(); - amd::Os::fastMemcpy(mappedPtr, cmd.svmPtr(), writeMapInfo->region_[0]); + std::memcpy(mappedPtr, cmd.svmPtr(), writeMapInfo->region_[0]); // Target is a remote resource, so copy if (!blitMgr().copyBuffer(*hsaMapMemory, *memory, writeMapInfo->origin_, writeMapInfo->origin_, writeMapInfo->region_, @@ -2277,7 +2276,7 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) { if ((svmPtr != nullptr) && (hostPtr != svmPtr)) { // Wait on a kernel if one is outstanding releaseGpuMemoryFence(); - amd::Os::fastMemcpy(svmPtr, hostPtr, size[0]); + std::memcpy(svmPtr, hostPtr, size[0]); } } else { result = blitMgr().readBuffer(*hsaMemory, static_cast(hostPtr) + origin[0], origin, @@ -2377,7 +2376,7 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& cmd) { if ((svmPtr != nullptr) && (hostPtr != svmPtr)) { // Wait on a kernel if one is outstanding releaseGpuMemoryFence(); - amd::Os::fastMemcpy(hostPtr, svmPtr, size[0]); + std::memcpy(hostPtr, svmPtr, size[0]); } result = blitMgr().copyBuffer(*hsaMapMemory, *devMemory, mapInfo->origin_, mapInfo->origin_, mapInfo->region_, mapInfo->isEntire()); @@ -2937,7 +2936,7 @@ static inline void nontemporalMemcpy( *reinterpret_cast(src)++); } #else - amd::Os::fastMemcpy(dst, src, size); + std::memcpy(dst, src, size); #endif } diff --git a/projects/clr/rocclr/os/os.hpp b/projects/clr/rocclr/os/os.hpp index c9bd0b99e5..8509cc3627 100644 --- a/projects/clr/rocclr/os/os.hpp +++ b/projects/clr/rocclr/os/os.hpp @@ -232,9 +232,6 @@ class Os : AllStatic { //! Deallocate an aligned chunk of memory. static void alignedFree(void* mem); - //! Platform-specific optimized memcpy() - static void* fastMemcpy(void* dest, const void* src, size_t n); - //! NUMA related settings static void setPreferredNumaNode(uint32_t node); diff --git a/projects/clr/rocclr/os/os_posix.cpp b/projects/clr/rocclr/os/os_posix.cpp index 739795e0cb..86c199b5b1 100644 --- a/projects/clr/rocclr/os/os_posix.cpp +++ b/projects/clr/rocclr/os/os_posix.cpp @@ -524,7 +524,7 @@ int Os::systemCall(const std::string& command) { #if 1 size_t len = command.size(); char* cmd = new char[len + 1]; - fastMemcpy(cmd, command.c_str(), len); + std::memcpy(cmd, command.c_str(), len); cmd[len] = 0; // Split the command into arguments. This is a very @@ -681,8 +681,6 @@ uint64_t Os::xgetbv(uint32_t ecx) { } #endif // ATI_ARCH_X86 -void* Os::fastMemcpy(void* dest, const void* src, size_t n) { return memcpy(dest, src, n); } - uint64_t Os::offsetToEpochNanos() { static uint64_t offset = 0; diff --git a/projects/clr/rocclr/os/os_win32.cpp b/projects/clr/rocclr/os/os_win32.cpp index 3923ec37df..08ba2a21f4 100644 --- a/projects/clr/rocclr/os/os_win32.cpp +++ b/projects/clr/rocclr/os/os_win32.cpp @@ -424,7 +424,7 @@ int Os::printf(const char* fmt, ...) { int Os::systemCall(const std::string& command) { #if 1 char* cmd = new char[command.size() + 1]; - fastMemcpy(cmd, command.c_str(), command.size()); + std::memcpy(cmd, command.c_str(), command.size()); cmd[command.size()] = 0; STARTUPINFO si = {0}; @@ -509,255 +509,6 @@ void Os::cpuid(int regs[4], int info) { return __cpuid(regs, info); } uint64_t Os::xgetbv(uint32_t ecx) { return (uint64_t)_xgetbv(ecx); } -// Various "fast" memcpy implementation (currently win32 only due to compiler limitations) - -// (dgladdin - "recent" below means MMX and later) - -// Very optimized memcpy() routine for all AMD Athlon and Duron family. -// This code uses any of FOUR different basic copy methods, depending -// on the transfer size. -// NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or -// "Streaming Store"), and also uses the software prefetchnta instructions, -// be sure youre running on Athlon/Duron or other recent CPU before calling! - -#define TINY_BLOCK_COPY 64 // upper limit for movsd type copy -// The smallest copy uses the X86 "movsd" instruction, in an optimized -// form which is an "unrolled loop". - -#define IN_CACHE_COPY 64 * 1024 // upper limit for movq/movq copy w/SW prefetch -// Next is a copy that uses the MMX registers to copy 8 bytes at a time, -// also using the "unrolled loop" optimization. This code uses -// the software prefetch instruction to get the data into the cache. - -#define UNCACHED_COPY 197 * 1024 // upper limit for movq/movntq w/SW prefetch -// For larger blocks, which will spill beyond the cache, its faster to -// use the Streaming Store instruction MOVNTQ. This write instruction -// bypasses the cache and writes straight to main memory. This code also -// uses the software prefetch instruction to pre-read the data. -// USE 64 * 1024 FOR THIS VALUE IF YOURE ALWAYS FILLING A "CLEAN CACHE" - -#define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch -#define CACHEBLOCK 80h // number of 64-byte blocks (cache lines) for block prefetch -// For the largest size blocks, a special technique called Block Prefetch -// can be used to accelerate the read operations. Block Prefetch reads -// one address per cache line, for a series of cache lines, in a short loop. -// This is faster than using software prefetch. The technique is great for -// getting maximum read bandwidth, especially in DDR memory systems. - -// Inline assembly syntax for use with Visual C++ - -void* Os::fastMemcpy(void* dest, const void* src, size_t n) { -#if !defined(_WIN64) - - __asm { - - mov ecx, [n] ; number of bytes to copy - mov edi, [dest] ; destination - mov esi, [src] ; source - mov ebx, ecx ; keep a copy of count - - cld - cmp ecx, TINY_BLOCK_COPY - jb $memcpy_ic_3 ; tiny? skip mmx copy - - cmp ecx, 32*1024 ; dont align between 32k-64k because - jbe $memcpy_do_align ; it appears to be slower - cmp ecx, 64*1024 - jbe $memcpy_align_done -$memcpy_do_align: - mov ecx, 8 ; a trick thats faster than rep movsb... - sub ecx, edi ; align destination to qword - and ecx, 111b ; get the low bits - sub ebx, ecx ; update copy count - neg ecx ; set up to jump into the array - add ecx, offset $memcpy_align_done - jmp ecx ; jump to array of movsbs - -align 4 - movsb - movsb - movsb - movsb - movsb - movsb - movsb - movsb - -$memcpy_align_done: ; destination is dword aligned - mov ecx, ebx ; number of bytes left to copy - shr ecx, 6 ; get 64-byte block count - jz $memcpy_ic_2 ; finish the last few bytes - - cmp ecx, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy - jae $memcpy_uc_test - - // This is small block copy that uses the MMX registers to copy 8 bytes - // at a time. It uses the "unrolled loop" optimization, and also uses - // the software prefetch instruction to get the data into the cache. -align 16 -$memcpy_ic_1: ; 64-byte block copies, in-cache copy - - prefetchnta [esi + (200*64/34+192)] ; start reading ahead - - movq mm0, [esi+0] ; read 64 bits - movq mm1, [esi+8] - movq [edi+0], mm0 ; write 64 bits - movq [edi+8], mm1 ; note: the normal movq writes the - movq mm2, [esi+16] ; data to cache; a cache line will be - movq mm3, [esi+24] ; allocated as needed, to store the data - movq [edi+16], mm2 - movq [edi+24], mm3 - movq mm0, [esi+32] - movq mm1, [esi+40] - movq [edi+32], mm0 - movq [edi+40], mm1 - movq mm2, [esi+48] - movq mm3, [esi+56] - movq [edi+48], mm2 - movq [edi+56], mm3 - - add esi, 64 ; update source pointer - add edi, 64 ; update destination pointer - dec ecx ; count down - jnz $memcpy_ic_1 ; last 64-byte block? - -$memcpy_ic_2: - mov ecx, ebx ; has valid low 6 bits of the byte count -$memcpy_ic_3: - shr ecx, 2 ; dword count - and ecx, 1111b ; only look at the "remainder" bits - neg ecx ; set up to jump into the array - add ecx, offset $memcpy_last_few - jmp ecx ; jump to array of movsds - -$memcpy_uc_test: - cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy - jae $memcpy_bp_1 - -$memcpy_64_test: - or ecx, ecx ; tail end of block prefetch will jump here - jz $memcpy_ic_2 ; no more 64-byte blocks left - - // For larger blocks, which will spill beyond the cache, its faster to - // use the Streaming Store instruction MOVNTQ. This write instruction - // bypasses the cache and writes straight to main memory. This code also - // uses the software prefetch instruction to pre-read the data. -align 16 -$memcpy_uc_1: ; 64-byte blocks, uncached copy - - prefetchnta [esi + (200*64/34+192)] ; start reading ahead - - movq mm0,[esi+0] ; read 64 bits - add edi,64 ; update destination pointer - movq mm1,[esi+8] - add esi,64 ; update source pointer - movq mm2,[esi-48] - movntq [edi-64], mm0 ; write 64 bits, bypassing the cache - movq mm0,[esi-40] ; note: movntq also prevents the CPU - movntq [edi-56], mm1 ; from READING the destination address - movq mm1,[esi-32] ; into the cache, only to be over-written - movntq [edi-48], mm2 ; so that also helps performance - movq mm2,[esi-24] - movntq [edi-40], mm0 - movq mm0,[esi-16] - movntq [edi-32], mm1 - movq mm1,[esi-8] - movntq [edi-24], mm2 - movntq [edi-16], mm0 - dec ecx - movntq [edi-8], mm1 - jnz $memcpy_uc_1 ; last 64-byte block? - - jmp $memcpy_ic_2 ; almost done - - // For the largest size blocks, a special technique called Block Prefetch - // can be used to accelerate the read operations. Block Prefetch reads - // one address per cache line, for a series of cache lines, in a short loop. - // This is faster than using software prefetch, in this case. - // The technique is great for getting maximum read bandwidth, - // especially in DDR memory systems. -$memcpy_bp_1: ; large blocks, block prefetch copy - - cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop? - jl $memcpy_64_test ; no, back to regular uncached copy - - mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X - add esi, CACHEBLOCK * 64 ; move to the top of the block -align 16 -$memcpy_bp_2: - mov edx, [esi-64] ; grab one address per cache line - mov edx, [esi-128] ; grab one address per cache line - sub esi, 128 ; go reverse order - dec eax ; count down the cache lines - jnz $memcpy_bp_2 ; keep grabbing more lines into cache - - mov eax, CACHEBLOCK ; now that its in cache, do the copy -align 16 -$memcpy_bp_3: - movq mm0, [esi ] ; read 64 bits - movq mm1, [esi+ 8] - movq mm2, [esi+16] - movq mm3, [esi+24] - movq mm4, [esi+32] - movq mm5, [esi+40] - movq mm6, [esi+48] - movq mm7, [esi+56] - add esi, 64 ; update source pointer - movntq [edi ], mm0 ; write 64 bits, bypassing cache - movntq [edi+ 8], mm1 ; note: movntq also prevents the CPU - movntq [edi+16], mm2 ; from READING the destination address - movntq [edi+24], mm3 ; into the cache, only to be over-written, - movntq [edi+32], mm4 ; so that also helps performance - movntq [edi+40], mm5 - movntq [edi+48], mm6 - movntq [edi+56], mm7 - add edi, 64 ; update dest pointer - - dec eax ; count down - - jnz $memcpy_bp_3 ; keep copying - sub ecx, CACHEBLOCK ; update the 64-byte block count - jmp $memcpy_bp_1 ; keep processing chunks - - // The smallest copy uses the X86 "movsd" instruction, in an optimized - // form which is an "unrolled loop". Then it handles the last few bytes. -align 4 - movsd - movsd ; perform last 1-15 dword copies - movsd - movsd - movsd - movsd - movsd - movsd - movsd - movsd ; perform last 1-7 dword copies - movsd - movsd - movsd - movsd - movsd - movsd - -$memcpy_last_few: ; dword aligned from before movsds - mov ecx, ebx ; has valid low 2 bits of the byte count - and ecx, 11b ; the last few cows must come home - jz $memcpy_final ; no more, lets leave - rep movsb ; the last 1, 2, or 3 bytes - -$memcpy_final: - emms ; clean up the MMX state - sfence ; flush the write buffer - mov eax, [dest] ; ret value = destination pointer - - } -#else // !defined(_WIN64)) - - return memcpy(dest, src, n); - -#endif -} - uint64_t Os::offsetToEpochNanos() { static uint64_t offset = 0; diff --git a/projects/clr/rocclr/platform/runtime.hpp b/projects/clr/rocclr/platform/runtime.hpp index a1c3d25e1a..d1953aedaf 100644 --- a/projects/clr/rocclr/platform/runtime.hpp +++ b/projects/clr/rocclr/platform/runtime.hpp @@ -56,37 +56,6 @@ class Runtime : AllStatic { } }; -#if 0 -class HostThread : public Thread -{ -private: - virtual void run(void* data) { ShouldNotCallThis(); } - -public: - HostThread() : Thread("HostThread", 0, false) - { - setHandle(NULL); - setCurrent(); - - if (!amd::Runtime::initialized() && !amd::Runtime::init()) { - return; - } - - Os::currentStackInfo(&stackBase_, &stackSize_); - setState(RUNNABLE); - } - - bool isHostThread() const { return true; }; - - static inline HostThread* current() - { - Thread* thread = Thread::current(); - assert(thread->isHostThread() && "just checking"); - return (HostThread*) thread; - } -}; -#endif - /*@}*/ inline bool Runtime::initialized() { return initialized_; }