SWDEV-440746 - Remove obsolete code
The "optimized" version of memcpy is outdated and
was used in win32 only.
Change-Id: I7f2e0e9051e37cec95438266824b5b0025c324c6
[ROCm/clr commit: 7448113cfc]
This commit is contained in:
@@ -41,7 +41,7 @@ bool HostBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
|
||||
}
|
||||
|
||||
// Copy memory
|
||||
amd::Os::fastMemcpy(dstHost, reinterpret_cast<const_address>(src) + origin[0], size[0]);
|
||||
std::memcpy(dstHost, reinterpret_cast<const_address>(src) + origin[0], size[0]);
|
||||
|
||||
// Unmap device memory
|
||||
srcMemory.cpuUnmap(vDev_);
|
||||
@@ -69,8 +69,8 @@ bool HostBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
|
||||
dstOffset = hostRect.offset(0, y, z);
|
||||
|
||||
// Copy memory line by line
|
||||
amd::Os::fastMemcpy((reinterpret_cast<address>(dstHost) + dstOffset),
|
||||
(reinterpret_cast<const_address>(src) + srcOffset), size[0]);
|
||||
std::memcpy((reinterpret_cast<address>(dstHost) + dstOffset),
|
||||
(reinterpret_cast<const_address>(src) + srcOffset), size[0]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -133,8 +133,8 @@ bool HostBlitManager::readImage(device::Memory& srcMemory, void* dstHost,
|
||||
// Copy memory line by line
|
||||
for (size_t row = 0; row < size[1]; ++row) {
|
||||
// Copy memory
|
||||
amd::Os::fastMemcpy((reinterpret_cast<address>(dstHost) + dstOffs),
|
||||
(reinterpret_cast<const_address>(src) + srcOffs), copySize);
|
||||
std::memcpy((reinterpret_cast<address>(dstHost) + dstOffs),
|
||||
(reinterpret_cast<const_address>(src) + srcOffs), copySize);
|
||||
|
||||
srcOffs += srcRowPitch;
|
||||
dstOffs += rowPitch;
|
||||
@@ -163,7 +163,7 @@ bool HostBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory
|
||||
}
|
||||
|
||||
// Copy memory
|
||||
amd::Os::fastMemcpy(reinterpret_cast<address>(dst) + origin[0], srcHost, size[0]);
|
||||
std::memcpy(reinterpret_cast<address>(dst) + origin[0], srcHost, size[0]);
|
||||
|
||||
// Unmap the device memory
|
||||
dstMemory.cpuUnmap(vDev_);
|
||||
@@ -191,8 +191,8 @@ bool HostBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMe
|
||||
dstOffset = bufRect.offset(0, y, z);
|
||||
|
||||
// Copy memory line by line
|
||||
amd::Os::fastMemcpy((reinterpret_cast<address>(dst) + dstOffset),
|
||||
(reinterpret_cast<const_address>(srcHost) + srcOffset), size[0]);
|
||||
std::memcpy((reinterpret_cast<address>(dst) + dstOffset),
|
||||
(reinterpret_cast<const_address>(srcHost) + srcOffset), size[0]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -258,8 +258,8 @@ bool HostBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory,
|
||||
// Copy memory line by line
|
||||
for (size_t row = 0; row < size[1]; ++row) {
|
||||
// Copy memory
|
||||
amd::Os::fastMemcpy((reinterpret_cast<address>(dst) + dstOffs),
|
||||
(reinterpret_cast<const_address>(srcHost) + srcOffs), copySize);
|
||||
std::memcpy((reinterpret_cast<address>(dst) + dstOffs),
|
||||
(reinterpret_cast<const_address>(srcHost) + srcOffs), copySize);
|
||||
|
||||
dstOffs += dstRowPitch;
|
||||
srcOffs += rowPitch;
|
||||
@@ -293,8 +293,8 @@ bool HostBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstM
|
||||
}
|
||||
|
||||
// Straight forward buffer copy
|
||||
amd::Os::fastMemcpy((reinterpret_cast<address>(dst) + dstOrigin[0]),
|
||||
(reinterpret_cast<const_address>(src) + srcOrigin[0]), size[0]);
|
||||
std::memcpy((reinterpret_cast<address>(dst) + dstOrigin[0]),
|
||||
(reinterpret_cast<const_address>(src) + srcOrigin[0]), size[0]);
|
||||
|
||||
// Unmap source and destination memory
|
||||
dstMemory.cpuUnmap(vDev_);
|
||||
@@ -329,8 +329,8 @@ bool HostBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory&
|
||||
size_t dstOffset = dstRect.offset(0, y, z);
|
||||
|
||||
// Copy memory line by line
|
||||
amd::Os::fastMemcpy((reinterpret_cast<address>(dst) + dstOffset),
|
||||
(reinterpret_cast<const_address>(src) + srcOffset), size[0]);
|
||||
std::memcpy((reinterpret_cast<address>(dst) + dstOffset),
|
||||
(reinterpret_cast<const_address>(src) + srcOffset), size[0]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -392,8 +392,8 @@ bool HostBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memor
|
||||
|
||||
// Copy memory line by line
|
||||
for (size_t rows = 0; rows < size[1]; ++rows) {
|
||||
amd::Os::fastMemcpy((reinterpret_cast<address>(dst) + dstOffs),
|
||||
(reinterpret_cast<const_address>(src) + srcOffs), copySize);
|
||||
std::memcpy((reinterpret_cast<address>(dst) + dstOffs),
|
||||
(reinterpret_cast<const_address>(src) + srcOffs), copySize);
|
||||
|
||||
srcOffs += srcRowPitch;
|
||||
dstOffs += copySize;
|
||||
@@ -458,8 +458,8 @@ bool HostBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memor
|
||||
|
||||
// Copy memory line by line
|
||||
for (size_t rows = 0; rows < size[1]; ++rows) {
|
||||
amd::Os::fastMemcpy((reinterpret_cast<address>(dst) + dstOffs),
|
||||
(reinterpret_cast<const_address>(src) + srcOffs), copySize);
|
||||
std::memcpy((reinterpret_cast<address>(dst) + dstOffs),
|
||||
(reinterpret_cast<const_address>(src) + srcOffs), copySize);
|
||||
|
||||
srcOffs += copySize;
|
||||
dstOffs += dstRowPitch;
|
||||
@@ -544,8 +544,8 @@ bool HostBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMe
|
||||
|
||||
// Copy memory line by line
|
||||
for (size_t rows = 0; rows < size[1]; ++rows) {
|
||||
amd::Os::fastMemcpy((reinterpret_cast<address>(dst) + dstOffs),
|
||||
(reinterpret_cast<const_address>(src) + srcOffs), copySize);
|
||||
std::memcpy((reinterpret_cast<address>(dst) + dstOffs),
|
||||
(reinterpret_cast<const_address>(src) + srcOffs), copySize);
|
||||
|
||||
srcOffs += srcRowPitch;
|
||||
dstOffs += dstRowPitch;
|
||||
|
||||
@@ -139,10 +139,10 @@ bool Segment::alloc(HSAILProgram& prog, amdgpu_hsa_elf_segment_t segment, size_t
|
||||
|
||||
void Segment::copy(size_t offset, const void* src, size_t size) {
|
||||
if (cpuAccess_ != nullptr) {
|
||||
amd::Os::fastMemcpy(cpuAddress(offset), src, size);
|
||||
std::memcpy(cpuAddress(offset), src, size);
|
||||
} else {
|
||||
if (cpuMem_ != nullptr) {
|
||||
amd::Os::fastMemcpy(cpuAddress(offset), src, size);
|
||||
std::memcpy(cpuAddress(offset), src, size);
|
||||
}
|
||||
amd::ScopedLock k(gpuAccess_->dev().xferMgr().lockXfer());
|
||||
VirtualGPU& gpu = *gpuAccess_->dev().xferQueue();
|
||||
@@ -568,7 +568,7 @@ void* PALHSALoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment, hsa_ag
|
||||
bool PALHSALoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent,
|
||||
void* dst, size_t offset, const void* src, size_t size) {
|
||||
if (program_->isNull()) {
|
||||
amd::Os::fastMemcpy(reinterpret_cast<address>(dst) + offset, src, size);
|
||||
std::memcpy(reinterpret_cast<address>(dst) + offset, src, size);
|
||||
return true;
|
||||
}
|
||||
Segment* s = reinterpret_cast<Segment*>(dst);
|
||||
|
||||
@@ -1700,7 +1700,7 @@ bool Resource::hostWrite(VirtualGPU* gpu, const void* hostPtr, const amd::Coord3
|
||||
dst = static_cast<void*>(static_cast<char*>(dst) + origin[0]);
|
||||
|
||||
// Copy memory
|
||||
amd::Os::fastMemcpy(dst, hostPtr, copySize);
|
||||
std::memcpy(dst, hostPtr, copySize);
|
||||
} else {
|
||||
size_t dstOffsBase = origin[0] * elementSize_;
|
||||
|
||||
@@ -1728,7 +1728,7 @@ bool Resource::hostWrite(VirtualGPU* gpu, const void* hostPtr, const amd::Coord3
|
||||
// Copy memory line by line
|
||||
for (size_t row = 0; row < size[1]; ++row) {
|
||||
// Copy memory
|
||||
amd::Os::fastMemcpy((reinterpret_cast<address>(dst) + dstOffs),
|
||||
std::memcpy((reinterpret_cast<address>(dst) + dstOffs),
|
||||
(reinterpret_cast<const_address>(hostPtr) + srcOffs),
|
||||
size[0] * elementSize_);
|
||||
|
||||
@@ -1770,7 +1770,7 @@ bool Resource::hostRead(VirtualGPU* gpu, void* hostPtr, const amd::Coord3D& orig
|
||||
src = static_cast<void*>(static_cast<char*>(src) + origin[0]);
|
||||
|
||||
// Copy memory
|
||||
amd::Os::fastMemcpy(hostPtr, src, copySize);
|
||||
std::memcpy(hostPtr, src, copySize);
|
||||
} else {
|
||||
size_t srcOffsBase = origin[0] * elementSize_;
|
||||
|
||||
@@ -1798,9 +1798,9 @@ bool Resource::hostRead(VirtualGPU* gpu, void* hostPtr, const amd::Coord3D& orig
|
||||
// Copy memory line by line
|
||||
for (size_t row = 0; row < size[1]; ++row) {
|
||||
// Copy memory
|
||||
amd::Os::fastMemcpy((reinterpret_cast<address>(hostPtr) + dstOffs),
|
||||
(reinterpret_cast<const_address>(src) + srcOffs),
|
||||
size[0] * elementSize_);
|
||||
std::memcpy((reinterpret_cast<address>(hostPtr) + dstOffs),
|
||||
(reinterpret_cast<const_address>(src) + srcOffs),
|
||||
size[0] * elementSize_);
|
||||
|
||||
srcOffs += desc().pitch_ * elementSize_;
|
||||
dstOffs += rowPitch;
|
||||
@@ -1939,7 +1939,7 @@ bool Resource::isPersistentDirectMap(bool writeMap) const {
|
||||
if (directMap && desc().tiled_) {
|
||||
// Latest HW does have tiling apertures
|
||||
directMap = false;
|
||||
}
|
||||
}
|
||||
if (memoryType() == View) {
|
||||
directMap = viewOwner_->isPersistentDirectMap(writeMap);
|
||||
}
|
||||
|
||||
@@ -1569,7 +1569,7 @@ void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& vcmd) {
|
||||
}
|
||||
|
||||
if (nullptr == srcMem && nullptr == dstMem) { // both not in svm space
|
||||
amd::Os::fastMemcpy(vcmd.dst(), vcmd.src(), vcmd.srcSize());
|
||||
std::memcpy(vcmd.dst(), vcmd.src(), vcmd.srcSize());
|
||||
result = true;
|
||||
} else if (nullptr == srcMem && nullptr != dstMem) { // src not in svm space
|
||||
Memory* memory = dev().getGpuMemory(dstMem);
|
||||
|
||||
@@ -618,7 +618,7 @@ class Device : public NullDevice {
|
||||
mutable std::mutex lock_allow_access_; //!< To serialize allow_access calls
|
||||
hsa_agent_t bkendDevice_;
|
||||
uint32_t pciDeviceId_;
|
||||
hsa_agent_t* p2p_agents_list_;
|
||||
hsa_agent_t* p2p_agents_list_ = nullptr;
|
||||
hsa_profile_t agent_profile_;
|
||||
hsa_amd_memory_pool_t group_segment_;
|
||||
hsa_amd_memory_pool_t system_segment_;
|
||||
|
||||
@@ -959,8 +959,7 @@ bool VirtualGPU::dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_
|
||||
if (capturing == true) {
|
||||
packet->header = header;
|
||||
packet->setup = rest;
|
||||
amd::Os::fastMemcpy(const_cast<uint8_t*>(aqlPacket), packet,
|
||||
sizeof(hsa_kernel_dispatch_packet_t));
|
||||
std::memcpy(const_cast<uint8_t*>(aqlPacket), packet, sizeof(hsa_kernel_dispatch_packet_t));
|
||||
return true;
|
||||
} else {
|
||||
dispatchBlockingWait();
|
||||
@@ -1995,7 +1994,7 @@ void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) {
|
||||
|
||||
// If these are from different contexts, then one of them could be in the device memory
|
||||
// This is fine, since spec doesn't allow for copies with pointers from different contexts
|
||||
amd::Os::fastMemcpy(cmd.dst(), cmd.src(), cmd.srcSize());
|
||||
std::memcpy(cmd.dst(), cmd.src(), cmd.srcSize());
|
||||
result = true;
|
||||
} else if (nullptr == srcMem && nullptr != dstMem) { // src not in svm space
|
||||
Memory* memory = dev().getRocMemory(dstMem);
|
||||
@@ -2158,7 +2157,7 @@ void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) {
|
||||
// Wait on a kernel if one is outstanding
|
||||
releaseGpuMemoryFence();
|
||||
const void* mappedPtr = hsaMapMemory->owner()->getHostMem();
|
||||
amd::Os::fastMemcpy(cmd.svmPtr(), mappedPtr, cmd.size()[0]);
|
||||
std::memcpy(cmd.svmPtr(), mappedPtr, cmd.size()[0]);
|
||||
}
|
||||
} else {
|
||||
LogError("Unhandled svm map!");
|
||||
@@ -2189,7 +2188,7 @@ void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) {
|
||||
Memory* hsaMapMemory = dev().getRocMemory(memory->mapMemory());
|
||||
|
||||
void* mappedPtr = hsaMapMemory->owner()->getHostMem();
|
||||
amd::Os::fastMemcpy(mappedPtr, cmd.svmPtr(), writeMapInfo->region_[0]);
|
||||
std::memcpy(mappedPtr, cmd.svmPtr(), writeMapInfo->region_[0]);
|
||||
// Target is a remote resource, so copy
|
||||
if (!blitMgr().copyBuffer(*hsaMapMemory, *memory, writeMapInfo->origin_,
|
||||
writeMapInfo->origin_, writeMapInfo->region_,
|
||||
@@ -2277,7 +2276,7 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) {
|
||||
if ((svmPtr != nullptr) && (hostPtr != svmPtr)) {
|
||||
// Wait on a kernel if one is outstanding
|
||||
releaseGpuMemoryFence();
|
||||
amd::Os::fastMemcpy(svmPtr, hostPtr, size[0]);
|
||||
std::memcpy(svmPtr, hostPtr, size[0]);
|
||||
}
|
||||
} else {
|
||||
result = blitMgr().readBuffer(*hsaMemory, static_cast<char*>(hostPtr) + origin[0], origin,
|
||||
@@ -2377,7 +2376,7 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& cmd) {
|
||||
if ((svmPtr != nullptr) && (hostPtr != svmPtr)) {
|
||||
// Wait on a kernel if one is outstanding
|
||||
releaseGpuMemoryFence();
|
||||
amd::Os::fastMemcpy(hostPtr, svmPtr, size[0]);
|
||||
std::memcpy(hostPtr, svmPtr, size[0]);
|
||||
}
|
||||
result = blitMgr().copyBuffer(*hsaMapMemory, *devMemory, mapInfo->origin_, mapInfo->origin_,
|
||||
mapInfo->region_, mapInfo->isEntire());
|
||||
@@ -2937,7 +2936,7 @@ static inline void nontemporalMemcpy(
|
||||
*reinterpret_cast<const int* __restrict&>(src)++);
|
||||
}
|
||||
#else
|
||||
amd::Os::fastMemcpy(dst, src, size);
|
||||
std::memcpy(dst, src, size);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -232,9 +232,6 @@ class Os : AllStatic {
|
||||
//! Deallocate an aligned chunk of memory.
|
||||
static void alignedFree(void* mem);
|
||||
|
||||
//! Platform-specific optimized memcpy()
|
||||
static void* fastMemcpy(void* dest, const void* src, size_t n);
|
||||
|
||||
//! NUMA related settings
|
||||
static void setPreferredNumaNode(uint32_t node);
|
||||
|
||||
|
||||
@@ -524,7 +524,7 @@ int Os::systemCall(const std::string& command) {
|
||||
#if 1
|
||||
size_t len = command.size();
|
||||
char* cmd = new char[len + 1];
|
||||
fastMemcpy(cmd, command.c_str(), len);
|
||||
std::memcpy(cmd, command.c_str(), len);
|
||||
cmd[len] = 0;
|
||||
|
||||
// Split the command into arguments. This is a very
|
||||
@@ -681,8 +681,6 @@ uint64_t Os::xgetbv(uint32_t ecx) {
|
||||
}
|
||||
#endif // ATI_ARCH_X86
|
||||
|
||||
void* Os::fastMemcpy(void* dest, const void* src, size_t n) { return memcpy(dest, src, n); }
|
||||
|
||||
uint64_t Os::offsetToEpochNanos() {
|
||||
static uint64_t offset = 0;
|
||||
|
||||
|
||||
@@ -424,7 +424,7 @@ int Os::printf(const char* fmt, ...) {
|
||||
int Os::systemCall(const std::string& command) {
|
||||
#if 1
|
||||
char* cmd = new char[command.size() + 1];
|
||||
fastMemcpy(cmd, command.c_str(), command.size());
|
||||
std::memcpy(cmd, command.c_str(), command.size());
|
||||
cmd[command.size()] = 0;
|
||||
|
||||
STARTUPINFO si = {0};
|
||||
@@ -509,255 +509,6 @@ void Os::cpuid(int regs[4], int info) { return __cpuid(regs, info); }
|
||||
|
||||
uint64_t Os::xgetbv(uint32_t ecx) { return (uint64_t)_xgetbv(ecx); }
|
||||
|
||||
// Various "fast" memcpy implementations (currently win32 only due to compiler limitations)
|
||||
|
||||
// (dgladdin - "recent" below means MMX and later)
|
||||
|
||||
// Very optimized memcpy() routine for all AMD Athlon and Duron family.
|
||||
// This code uses any of FOUR different basic copy methods, depending
|
||||
// on the transfer size.
|
||||
// NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
|
||||
// "Streaming Store"), and also uses the software prefetchnta instructions,
|
||||
// be sure you're running on Athlon/Duron or other recent CPU before calling!
|
||||
|
||||
#define TINY_BLOCK_COPY 64 // upper limit for movsd type copy
|
||||
// The smallest copy uses the X86 "movsd" instruction, in an optimized
|
||||
// form which is an "unrolled loop".
|
||||
|
||||
#define IN_CACHE_COPY 64 * 1024 // upper limit for movq/movq copy w/SW prefetch
|
||||
// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
|
||||
// also using the "unrolled loop" optimization. This code uses
|
||||
// the software prefetch instruction to get the data into the cache.
|
||||
|
||||
#define UNCACHED_COPY 197 * 1024 // upper limit for movq/movntq w/SW prefetch
|
||||
// For larger blocks, which will spill beyond the cache, it's faster to
|
||||
// use the Streaming Store instruction MOVNTQ. This write instruction
|
||||
// bypasses the cache and writes straight to main memory. This code also
|
||||
// uses the software prefetch instruction to pre-read the data.
|
||||
// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"
|
||||
|
||||
#define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch
|
||||
#define CACHEBLOCK 80h // number of 64-byte blocks (cache lines) for block prefetch
|
||||
// For the largest size blocks, a special technique called Block Prefetch
|
||||
// can be used to accelerate the read operations. Block Prefetch reads
|
||||
// one address per cache line, for a series of cache lines, in a short loop.
|
||||
// This is faster than using software prefetch. The technique is great for
|
||||
// getting maximum read bandwidth, especially in DDR memory systems.
|
||||
|
||||
// Inline assembly syntax for use with Visual C++
|
||||
|
||||
void* Os::fastMemcpy(void* dest, const void* src, size_t n) {
|
||||
#if !defined(_WIN64)
|
||||
|
||||
__asm {
|
||||
|
||||
mov ecx, [n] ; number of bytes to copy
|
||||
mov edi, [dest] ; destination
|
||||
mov esi, [src] ; source
|
||||
mov ebx, ecx ; keep a copy of count
|
||||
|
||||
cld
|
||||
cmp ecx, TINY_BLOCK_COPY
|
||||
jb $memcpy_ic_3 ; tiny? skip mmx copy
|
||||
|
||||
cmp ecx, 32*1024 ; dont align between 32k-64k because
|
||||
jbe $memcpy_do_align ; it appears to be slower
|
||||
cmp ecx, 64*1024
|
||||
jbe $memcpy_align_done
|
||||
$memcpy_do_align:
|
||||
mov ecx, 8 ; a trick thats faster than rep movsb...
|
||||
sub ecx, edi ; align destination to qword
|
||||
and ecx, 111b ; get the low bits
|
||||
sub ebx, ecx ; update copy count
|
||||
neg ecx ; set up to jump into the array
|
||||
add ecx, offset $memcpy_align_done
|
||||
jmp ecx ; jump to array of movsbs
|
||||
|
||||
align 4
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
|
||||
$memcpy_align_done: ; destination is dword aligned
|
||||
mov ecx, ebx ; number of bytes left to copy
|
||||
shr ecx, 6 ; get 64-byte block count
|
||||
jz $memcpy_ic_2 ; finish the last few bytes
|
||||
|
||||
cmp ecx, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy
|
||||
jae $memcpy_uc_test
|
||||
|
||||
// This is small block copy that uses the MMX registers to copy 8 bytes
|
||||
// at a time. It uses the "unrolled loop" optimization, and also uses
|
||||
// the software prefetch instruction to get the data into the cache.
|
||||
align 16
|
||||
$memcpy_ic_1: ; 64-byte block copies, in-cache copy
|
||||
|
||||
prefetchnta [esi + (200*64/34+192)] ; start reading ahead
|
||||
|
||||
movq mm0, [esi+0] ; read 64 bits
|
||||
movq mm1, [esi+8]
|
||||
movq [edi+0], mm0 ; write 64 bits
|
||||
movq [edi+8], mm1 ; note: the normal movq writes the
|
||||
movq mm2, [esi+16] ; data to cache; a cache line will be
|
||||
movq mm3, [esi+24] ; allocated as needed, to store the data
|
||||
movq [edi+16], mm2
|
||||
movq [edi+24], mm3
|
||||
movq mm0, [esi+32]
|
||||
movq mm1, [esi+40]
|
||||
movq [edi+32], mm0
|
||||
movq [edi+40], mm1
|
||||
movq mm2, [esi+48]
|
||||
movq mm3, [esi+56]
|
||||
movq [edi+48], mm2
|
||||
movq [edi+56], mm3
|
||||
|
||||
add esi, 64 ; update source pointer
|
||||
add edi, 64 ; update destination pointer
|
||||
dec ecx ; count down
|
||||
jnz $memcpy_ic_1 ; last 64-byte block?
|
||||
|
||||
$memcpy_ic_2:
|
||||
mov ecx, ebx ; has valid low 6 bits of the byte count
|
||||
$memcpy_ic_3:
|
||||
shr ecx, 2 ; dword count
|
||||
and ecx, 1111b ; only look at the "remainder" bits
|
||||
neg ecx ; set up to jump into the array
|
||||
add ecx, offset $memcpy_last_few
|
||||
jmp ecx ; jump to array of movsds
|
||||
|
||||
$memcpy_uc_test:
|
||||
cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy
|
||||
jae $memcpy_bp_1
|
||||
|
||||
$memcpy_64_test:
|
||||
or ecx, ecx ; tail end of block prefetch will jump here
|
||||
jz $memcpy_ic_2 ; no more 64-byte blocks left
|
||||
|
||||
// For larger blocks, which will spill beyond the cache, its faster to
|
||||
// use the Streaming Store instruction MOVNTQ. This write instruction
|
||||
// bypasses the cache and writes straight to main memory. This code also
|
||||
// uses the software prefetch instruction to pre-read the data.
|
||||
align 16
|
||||
$memcpy_uc_1: ; 64-byte blocks, uncached copy
|
||||
|
||||
prefetchnta [esi + (200*64/34+192)] ; start reading ahead
|
||||
|
||||
movq mm0,[esi+0] ; read 64 bits
|
||||
add edi,64 ; update destination pointer
|
||||
movq mm1,[esi+8]
|
||||
add esi,64 ; update source pointer
|
||||
movq mm2,[esi-48]
|
||||
movntq [edi-64], mm0 ; write 64 bits, bypassing the cache
|
||||
movq mm0,[esi-40] ; note: movntq also prevents the CPU
|
||||
movntq [edi-56], mm1 ; from READING the destination address
|
||||
movq mm1,[esi-32] ; into the cache, only to be over-written
|
||||
movntq [edi-48], mm2 ; so that also helps performance
|
||||
movq mm2,[esi-24]
|
||||
movntq [edi-40], mm0
|
||||
movq mm0,[esi-16]
|
||||
movntq [edi-32], mm1
|
||||
movq mm1,[esi-8]
|
||||
movntq [edi-24], mm2
|
||||
movntq [edi-16], mm0
|
||||
dec ecx
|
||||
movntq [edi-8], mm1
|
||||
jnz $memcpy_uc_1 ; last 64-byte block?
|
||||
|
||||
jmp $memcpy_ic_2 ; almost done
|
||||
|
||||
// For the largest size blocks, a special technique called Block Prefetch
|
||||
// can be used to accelerate the read operations. Block Prefetch reads
|
||||
// one address per cache line, for a series of cache lines, in a short loop.
|
||||
// This is faster than using software prefetch, in this case.
|
||||
// The technique is great for getting maximum read bandwidth,
|
||||
// especially in DDR memory systems.
|
||||
$memcpy_bp_1: ; large blocks, block prefetch copy
|
||||
|
||||
cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop?
|
||||
jl $memcpy_64_test ; no, back to regular uncached copy
|
||||
|
||||
mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X
|
||||
add esi, CACHEBLOCK * 64 ; move to the top of the block
|
||||
align 16
|
||||
$memcpy_bp_2:
|
||||
mov edx, [esi-64] ; grab one address per cache line
|
||||
mov edx, [esi-128] ; grab one address per cache line
|
||||
sub esi, 128 ; go reverse order
|
||||
dec eax ; count down the cache lines
|
||||
jnz $memcpy_bp_2 ; keep grabbing more lines into cache
|
||||
|
||||
mov eax, CACHEBLOCK ; now that its in cache, do the copy
|
||||
align 16
|
||||
$memcpy_bp_3:
|
||||
movq mm0, [esi ] ; read 64 bits
|
||||
movq mm1, [esi+ 8]
|
||||
movq mm2, [esi+16]
|
||||
movq mm3, [esi+24]
|
||||
movq mm4, [esi+32]
|
||||
movq mm5, [esi+40]
|
||||
movq mm6, [esi+48]
|
||||
movq mm7, [esi+56]
|
||||
add esi, 64 ; update source pointer
|
||||
movntq [edi ], mm0 ; write 64 bits, bypassing cache
|
||||
movntq [edi+ 8], mm1 ; note: movntq also prevents the CPU
|
||||
movntq [edi+16], mm2 ; from READING the destination address
|
||||
movntq [edi+24], mm3 ; into the cache, only to be over-written,
|
||||
movntq [edi+32], mm4 ; so that also helps performance
|
||||
movntq [edi+40], mm5
|
||||
movntq [edi+48], mm6
|
||||
movntq [edi+56], mm7
|
||||
add edi, 64 ; update dest pointer
|
||||
|
||||
dec eax ; count down
|
||||
|
||||
jnz $memcpy_bp_3 ; keep copying
|
||||
sub ecx, CACHEBLOCK ; update the 64-byte block count
|
||||
jmp $memcpy_bp_1 ; keep processing chunks
|
||||
|
||||
// The smallest copy uses the X86 "movsd" instruction, in an optimized
|
||||
// form which is an "unrolled loop". Then it handles the last few bytes.
|
||||
align 4
|
||||
movsd
|
||||
movsd ; perform last 1-15 dword copies
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd ; perform last 1-7 dword copies
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
|
||||
$memcpy_last_few: ; dword aligned from before movsds
|
||||
mov ecx, ebx ; has valid low 2 bits of the byte count
|
||||
and ecx, 11b ; the last few cows must come home
|
||||
jz $memcpy_final ; no more, lets leave
|
||||
rep movsb ; the last 1, 2, or 3 bytes
|
||||
|
||||
$memcpy_final:
|
||||
emms ; clean up the MMX state
|
||||
sfence ; flush the write buffer
|
||||
mov eax, [dest] ; ret value = destination pointer
|
||||
|
||||
}
|
||||
#else // !defined(_WIN64))
|
||||
|
||||
return memcpy(dest, src, n);
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
uint64_t Os::offsetToEpochNanos() {
|
||||
static uint64_t offset = 0;
|
||||
|
||||
|
||||
@@ -56,37 +56,6 @@ class Runtime : AllStatic {
|
||||
}
|
||||
};
|
||||
|
||||
#if 0
|
||||
class HostThread : public Thread
|
||||
{
|
||||
private:
|
||||
virtual void run(void* data) { ShouldNotCallThis(); }
|
||||
|
||||
public:
|
||||
HostThread() : Thread("HostThread", 0, false)
|
||||
{
|
||||
setHandle(NULL);
|
||||
setCurrent();
|
||||
|
||||
if (!amd::Runtime::initialized() && !amd::Runtime::init()) {
|
||||
return;
|
||||
}
|
||||
|
||||
Os::currentStackInfo(&stackBase_, &stackSize_);
|
||||
setState(RUNNABLE);
|
||||
}
|
||||
|
||||
bool isHostThread() const { return true; };
|
||||
|
||||
static inline HostThread* current()
|
||||
{
|
||||
Thread* thread = Thread::current();
|
||||
assert(thread->isHostThread() && "just checking");
|
||||
return (HostThread*) thread;
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
/*@}*/
|
||||
|
||||
inline bool Runtime::initialized() { return initialized_; }
|
||||
|
||||
Reference in New Issue
Block a user