SWDEV-440746 - Remove obsolete code

The "optimized" version of memcpy is outdated and was only ever used on 32-bit Windows builds; every other configuration already forwarded to plain memcpy. Change-Id: I7f2e0e9051e37cec95438266824b5b0025c324c6 [ROCm/clr commit: 7448113cfc]
2024-04-19 17:19:16 -04:00
parent 2335c92a1a
commit 74d80fb509
10 changed files with 41 additions and 327 deletions
@@ -41,7 +41,7 @@ bool HostBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
  }

  // Copy memory
-  amd::Os::fastMemcpy(dstHost, reinterpret_cast<const_address>(src) + origin[0], size[0]);
+  std::memcpy(dstHost, reinterpret_cast<const_address>(src) + origin[0], size[0]);

  // Unmap device memory
  srcMemory.cpuUnmap(vDev_);
@@ -69,8 +69,8 @@ bool HostBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
      dstOffset = hostRect.offset(0, y, z);

      // Copy memory line by line
-      amd::Os::fastMemcpy((reinterpret_cast<address>(dstHost) + dstOffset),
-                          (reinterpret_cast<const_address>(src) + srcOffset), size[0]);
+      std::memcpy((reinterpret_cast<address>(dstHost) + dstOffset),
+                  (reinterpret_cast<const_address>(src) + srcOffset), size[0]);
    }
  }

@@ -133,8 +133,8 @@ bool HostBlitManager::readImage(device::Memory& srcMemory, void* dstHost,
    // Copy memory line by line
    for (size_t row = 0; row < size[1]; ++row) {
      // Copy memory
-      amd::Os::fastMemcpy((reinterpret_cast<address>(dstHost) + dstOffs),
-                          (reinterpret_cast<const_address>(src) + srcOffs), copySize);
+      std::memcpy((reinterpret_cast<address>(dstHost) + dstOffs),
+                  (reinterpret_cast<const_address>(src) + srcOffs), copySize);

      srcOffs += srcRowPitch;
      dstOffs += rowPitch;
@@ -163,7 +163,7 @@ bool HostBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory
  }

  // Copy memory
-  amd::Os::fastMemcpy(reinterpret_cast<address>(dst) + origin[0], srcHost, size[0]);
+  std::memcpy(reinterpret_cast<address>(dst) + origin[0], srcHost, size[0]);

  // Unmap the device memory
  dstMemory.cpuUnmap(vDev_);
@@ -191,8 +191,8 @@ bool HostBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMe
      dstOffset = bufRect.offset(0, y, z);

      // Copy memory line by line
-      amd::Os::fastMemcpy((reinterpret_cast<address>(dst) + dstOffset),
-                          (reinterpret_cast<const_address>(srcHost) + srcOffset), size[0]);
+      std::memcpy((reinterpret_cast<address>(dst) + dstOffset),
+                  (reinterpret_cast<const_address>(srcHost) + srcOffset), size[0]);
    }
  }

@@ -258,8 +258,8 @@ bool HostBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory,
    // Copy memory line by line
    for (size_t row = 0; row < size[1]; ++row) {
      // Copy memory
-      amd::Os::fastMemcpy((reinterpret_cast<address>(dst) + dstOffs),
-                          (reinterpret_cast<const_address>(srcHost) + srcOffs), copySize);
+      std::memcpy((reinterpret_cast<address>(dst) + dstOffs),
+                  (reinterpret_cast<const_address>(srcHost) + srcOffs), copySize);

      dstOffs += dstRowPitch;
      srcOffs += rowPitch;
@@ -293,8 +293,8 @@ bool HostBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstM
  }

  // Straight forward buffer copy
-  amd::Os::fastMemcpy((reinterpret_cast<address>(dst) + dstOrigin[0]),
-                      (reinterpret_cast<const_address>(src) + srcOrigin[0]), size[0]);
+  std::memcpy((reinterpret_cast<address>(dst) + dstOrigin[0]),
+              (reinterpret_cast<const_address>(src) + srcOrigin[0]), size[0]);

  // Unmap source and destination memory
  dstMemory.cpuUnmap(vDev_);
@@ -329,8 +329,8 @@ bool HostBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory&
      size_t dstOffset = dstRect.offset(0, y, z);

      // Copy memory line by line
-      amd::Os::fastMemcpy((reinterpret_cast<address>(dst) + dstOffset),
-                          (reinterpret_cast<const_address>(src) + srcOffset), size[0]);
+      std::memcpy((reinterpret_cast<address>(dst) + dstOffset),
+                  (reinterpret_cast<const_address>(src) + srcOffset), size[0]);
    }
  }

@@ -392,8 +392,8 @@ bool HostBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memor

    // Copy memory line by line
    for (size_t rows = 0; rows < size[1]; ++rows) {
-      amd::Os::fastMemcpy((reinterpret_cast<address>(dst) + dstOffs),
-                          (reinterpret_cast<const_address>(src) + srcOffs), copySize);
+      std::memcpy((reinterpret_cast<address>(dst) + dstOffs),
+                  (reinterpret_cast<const_address>(src) + srcOffs), copySize);

      srcOffs += srcRowPitch;
      dstOffs += copySize;
@@ -458,8 +458,8 @@ bool HostBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memor

    // Copy memory line by line
    for (size_t rows = 0; rows < size[1]; ++rows) {
-      amd::Os::fastMemcpy((reinterpret_cast<address>(dst) + dstOffs),
-                          (reinterpret_cast<const_address>(src) + srcOffs), copySize);
+      std::memcpy((reinterpret_cast<address>(dst) + dstOffs),
+                  (reinterpret_cast<const_address>(src) + srcOffs), copySize);

      srcOffs += copySize;
      dstOffs += dstRowPitch;
@@ -544,8 +544,8 @@ bool HostBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMe

    // Copy memory line by line
    for (size_t rows = 0; rows < size[1]; ++rows) {
-      amd::Os::fastMemcpy((reinterpret_cast<address>(dst) + dstOffs),
-                          (reinterpret_cast<const_address>(src) + srcOffs), copySize);
+      std::memcpy((reinterpret_cast<address>(dst) + dstOffs),
+                  (reinterpret_cast<const_address>(src) + srcOffs), copySize);

      srcOffs += srcRowPitch;
      dstOffs += dstRowPitch;
@@ -139,10 +139,10 @@ bool Segment::alloc(HSAILProgram& prog, amdgpu_hsa_elf_segment_t segment, size_t

 void Segment::copy(size_t offset, const void* src, size_t size) {
  if (cpuAccess_ != nullptr) {
-    amd::Os::fastMemcpy(cpuAddress(offset), src, size);
+    std::memcpy(cpuAddress(offset), src, size);
  } else {
    if (cpuMem_ != nullptr) {
-      amd::Os::fastMemcpy(cpuAddress(offset), src, size);
+      std::memcpy(cpuAddress(offset), src, size);
    }
    amd::ScopedLock k(gpuAccess_->dev().xferMgr().lockXfer());
    VirtualGPU& gpu = *gpuAccess_->dev().xferQueue();
@@ -568,7 +568,7 @@ void* PALHSALoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment, hsa_ag
 bool PALHSALoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent,
                                      void* dst, size_t offset, const void* src, size_t size) {
  if (program_->isNull()) {
-    amd::Os::fastMemcpy(reinterpret_cast<address>(dst) + offset, src, size);
+    std::memcpy(reinterpret_cast<address>(dst) + offset, src, size);
    return true;
  }
  Segment* s = reinterpret_cast<Segment*>(dst);
@@ -1700,7 +1700,7 @@ bool Resource::hostWrite(VirtualGPU* gpu, const void* hostPtr, const amd::Coord3
    dst = static_cast<void*>(static_cast<char*>(dst) + origin[0]);

    // Copy memory
-    amd::Os::fastMemcpy(dst, hostPtr, copySize);
+    std::memcpy(dst, hostPtr, copySize);
  } else {
    size_t dstOffsBase = origin[0] * elementSize_;

@@ -1728,7 +1728,7 @@ bool Resource::hostWrite(VirtualGPU* gpu, const void* hostPtr, const amd::Coord3
      // Copy memory line by line
      for (size_t row = 0; row < size[1]; ++row) {
        // Copy memory
-        amd::Os::fastMemcpy((reinterpret_cast<address>(dst) + dstOffs),
+        std::memcpy((reinterpret_cast<address>(dst) + dstOffs),
                            (reinterpret_cast<const_address>(hostPtr) + srcOffs),
                            size[0] * elementSize_);

@@ -1770,7 +1770,7 @@ bool Resource::hostRead(VirtualGPU* gpu, void* hostPtr, const amd::Coord3D& orig
    src = static_cast<void*>(static_cast<char*>(src) + origin[0]);

    // Copy memory
-    amd::Os::fastMemcpy(hostPtr, src, copySize);
+    std::memcpy(hostPtr, src, copySize);
  } else {
    size_t srcOffsBase = origin[0] * elementSize_;

@@ -1798,9 +1798,9 @@ bool Resource::hostRead(VirtualGPU* gpu, void* hostPtr, const amd::Coord3D& orig
      // Copy memory line by line
      for (size_t row = 0; row < size[1]; ++row) {
        // Copy memory
-        amd::Os::fastMemcpy((reinterpret_cast<address>(hostPtr) + dstOffs),
-                            (reinterpret_cast<const_address>(src) + srcOffs),
-                            size[0] * elementSize_);
+        std::memcpy((reinterpret_cast<address>(hostPtr) + dstOffs),
+                    (reinterpret_cast<const_address>(src) + srcOffs),
+                    size[0] * elementSize_);

        srcOffs += desc().pitch_ * elementSize_;
        dstOffs += rowPitch;
@@ -1939,7 +1939,7 @@ bool Resource::isPersistentDirectMap(bool writeMap) const {
  if (directMap && desc().tiled_) {
    // Latest HW does have tiling apertures
    directMap = false;
-  } 
+  }
  if (memoryType() == View) {
    directMap = viewOwner_->isPersistentDirectMap(writeMap);
  }
@@ -1569,7 +1569,7 @@ void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& vcmd) {
    }

    if (nullptr == srcMem && nullptr == dstMem) {  // both not in svm space
-      amd::Os::fastMemcpy(vcmd.dst(), vcmd.src(), vcmd.srcSize());
+      std::memcpy(vcmd.dst(), vcmd.src(), vcmd.srcSize());
      result = true;
    } else if (nullptr == srcMem && nullptr != dstMem) {  // src not in svm space
      Memory* memory = dev().getGpuMemory(dstMem);
@@ -618,7 +618,7 @@ class Device : public NullDevice {
  mutable std::mutex lock_allow_access_; //!< To serialize allow_access calls
  hsa_agent_t bkendDevice_;
  uint32_t pciDeviceId_;
-  hsa_agent_t* p2p_agents_list_;
+  hsa_agent_t* p2p_agents_list_ = nullptr;
  hsa_profile_t agent_profile_;
  hsa_amd_memory_pool_t group_segment_;
  hsa_amd_memory_pool_t system_segment_;
@@ -959,8 +959,7 @@ bool VirtualGPU::dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_
  if (capturing == true) {
    packet->header = header;
    packet->setup = rest;
-    amd::Os::fastMemcpy(const_cast<uint8_t*>(aqlPacket), packet,
-                        sizeof(hsa_kernel_dispatch_packet_t));
+    std::memcpy(const_cast<uint8_t*>(aqlPacket), packet, sizeof(hsa_kernel_dispatch_packet_t));
    return true;
  } else {
    dispatchBlockingWait();
@@ -1995,7 +1994,7 @@ void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) {

      // If these are from different contexts, then one of them could be in the device memory
      // This is fine, since spec doesn't allow for copies with pointers from different contexts
-      amd::Os::fastMemcpy(cmd.dst(), cmd.src(), cmd.srcSize());
+      std::memcpy(cmd.dst(), cmd.src(), cmd.srcSize());
      result = true;
    } else if (nullptr == srcMem && nullptr != dstMem) {  // src not in svm space
      Memory* memory = dev().getRocMemory(dstMem);
@@ -2158,7 +2157,7 @@ void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) {
        // Wait on a kernel if one is outstanding
        releaseGpuMemoryFence();
        const void* mappedPtr = hsaMapMemory->owner()->getHostMem();
-        amd::Os::fastMemcpy(cmd.svmPtr(), mappedPtr, cmd.size()[0]);
+        std::memcpy(cmd.svmPtr(), mappedPtr, cmd.size()[0]);
      }
    } else {
      LogError("Unhandled svm map!");
@@ -2189,7 +2188,7 @@ void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) {
        Memory* hsaMapMemory = dev().getRocMemory(memory->mapMemory());

        void* mappedPtr = hsaMapMemory->owner()->getHostMem();
-        amd::Os::fastMemcpy(mappedPtr, cmd.svmPtr(), writeMapInfo->region_[0]);
+        std::memcpy(mappedPtr, cmd.svmPtr(), writeMapInfo->region_[0]);
        // Target is a remote resource, so copy
        if (!blitMgr().copyBuffer(*hsaMapMemory, *memory, writeMapInfo->origin_,
                                  writeMapInfo->origin_, writeMapInfo->region_,
@@ -2277,7 +2276,7 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) {
        if ((svmPtr != nullptr) && (hostPtr != svmPtr)) {
          // Wait on a kernel if one is outstanding
          releaseGpuMemoryFence();
-          amd::Os::fastMemcpy(svmPtr, hostPtr, size[0]);
+          std::memcpy(svmPtr, hostPtr, size[0]);
        }
      } else {
        result = blitMgr().readBuffer(*hsaMemory, static_cast<char*>(hostPtr) + origin[0], origin,
@@ -2377,7 +2376,7 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& cmd) {
          if ((svmPtr != nullptr) && (hostPtr != svmPtr)) {
            // Wait on a kernel if one is outstanding
            releaseGpuMemoryFence();
-            amd::Os::fastMemcpy(hostPtr, svmPtr, size[0]);
+            std::memcpy(hostPtr, svmPtr, size[0]);
          }
          result = blitMgr().copyBuffer(*hsaMapMemory, *devMemory, mapInfo->origin_, mapInfo->origin_,
                                        mapInfo->region_, mapInfo->isEntire());
@@ -2937,7 +2936,7 @@ static inline void nontemporalMemcpy(
                    *reinterpret_cast<const int* __restrict&>(src)++);
  }
 #else
-  amd::Os::fastMemcpy(dst, src, size);
+  std::memcpy(dst, src, size);
 #endif
 }

@@ -232,9 +232,6 @@ class Os : AllStatic {
  //! Deallocate an aligned chunk of memory.
  static void alignedFree(void* mem);

-  //! Platform-specific optimized memcpy()
-  static void* fastMemcpy(void* dest, const void* src, size_t n);
-
  //! NUMA related settings
  static void setPreferredNumaNode(uint32_t node);

@@ -524,7 +524,7 @@ int Os::systemCall(const std::string& command) {
 #if 1
  size_t len = command.size();
  char* cmd = new char[len + 1];
-  fastMemcpy(cmd, command.c_str(), len);
+  std::memcpy(cmd, command.c_str(), len);
  cmd[len] = 0;

  // Split the command into arguments. This is a very
@@ -681,8 +681,6 @@ uint64_t Os::xgetbv(uint32_t ecx) {
 }
 #endif  // ATI_ARCH_X86

-void* Os::fastMemcpy(void* dest, const void* src, size_t n) { return memcpy(dest, src, n); }
-
 uint64_t Os::offsetToEpochNanos() {
  static uint64_t offset = 0;

@@ -424,7 +424,7 @@ int Os::printf(const char* fmt, ...) {
 int Os::systemCall(const std::string& command) {
 #if 1
  char* cmd = new char[command.size() + 1];
-  fastMemcpy(cmd, command.c_str(), command.size());
+  std::memcpy(cmd, command.c_str(), command.size());
  cmd[command.size()] = 0;

  STARTUPINFO si = {0};
@@ -509,255 +509,6 @@ void Os::cpuid(int regs[4], int info) { return __cpuid(regs, info); }

 uint64_t Os::xgetbv(uint32_t ecx) { return (uint64_t)_xgetbv(ecx); }

-// Various "fast" memcpy implementation (currently win32 only due to compiler limitations)
-
-// (dgladdin - "recent" below means MMX and later)
-
-// Very optimized memcpy() routine for all AMD Athlon and Duron family.
-// This code uses any of FOUR different basic copy methods, depending
-// on the transfer size.
-// NOTE:  Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
-// "Streaming Store"), and also uses the software prefetchnta instructions,
-// be sure youre running on Athlon/Duron or other recent CPU before calling!
-
-#define TINY_BLOCK_COPY 64  // upper limit for movsd type copy
-// The smallest copy uses the X86 "movsd" instruction, in an optimized
-// form which is an "unrolled loop".
-
-#define IN_CACHE_COPY 64 * 1024  // upper limit for movq/movq copy w/SW prefetch
-// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
-// also using the "unrolled loop" optimization.   This code uses
-// the software prefetch instruction to get the data into the cache.
-
-#define UNCACHED_COPY 197 * 1024  // upper limit for movq/movntq w/SW prefetch
-// For larger blocks, which will spill beyond the cache, its faster to
-// use the Streaming Store instruction MOVNTQ.   This write instruction
-// bypasses the cache and writes straight to main memory.  This code also
-// uses the software prefetch instruction to pre-read the data.
-// USE 64 * 1024 FOR THIS VALUE IF YOURE ALWAYS FILLING A "CLEAN CACHE"
-
-#define BLOCK_PREFETCH_COPY infinity  // no limit for movq/movntq w/block prefetch
-#define CACHEBLOCK 80h                // number of 64-byte blocks (cache lines) for block prefetch
-// For the largest size blocks, a special technique called Block Prefetch
-// can be used to accelerate the read operations.   Block Prefetch reads
-// one address per cache line, for a series of cache lines, in a short loop.
-// This is faster than using software prefetch.  The technique is great for
-// getting maximum read bandwidth, especially in DDR memory systems.
-
-// Inline assembly syntax for use with Visual C++
-
-void* Os::fastMemcpy(void* dest, const void* src, size_t n) {
-#if !defined(_WIN64)
-
-  __asm {
-
-    mov     ecx, [n]        ; number of bytes to copy
-    mov     edi, [dest]     ; destination
-    mov     esi, [src]      ; source
-    mov     ebx, ecx        ; keep a copy of count
-
-    cld
-    cmp     ecx, TINY_BLOCK_COPY
-    jb      $memcpy_ic_3    ; tiny? skip mmx copy
-
-    cmp     ecx, 32*1024        ; dont align between 32k-64k because
-    jbe     $memcpy_do_align    ;  it appears to be slower
-    cmp     ecx, 64*1024
-    jbe     $memcpy_align_done
-$memcpy_do_align:
-    mov     ecx, 8          ; a trick thats faster than rep movsb...
-    sub     ecx, edi        ; align destination to qword
-    and     ecx, 111b       ; get the low bits
-    sub     ebx, ecx        ; update copy count
-    neg     ecx             ; set up to jump into the array
-    add     ecx, offset $memcpy_align_done
-    jmp     ecx             ; jump to array of movsbs
-
-align 4
-    movsb
-    movsb
-    movsb
-    movsb
-    movsb
-    movsb
-    movsb
-    movsb
-
-$memcpy_align_done:         ; destination is dword aligned
-    mov     ecx, ebx        ; number of bytes left to copy
-    shr     ecx, 6          ; get 64-byte block count
-    jz      $memcpy_ic_2    ; finish the last few bytes
-
-    cmp     ecx, IN_CACHE_COPY/64    ; too big 4 cache? use uncached copy
-    jae     $memcpy_uc_test
-
-        // This is small block copy that uses the MMX registers to copy 8 bytes
-        // at a time.  It uses the "unrolled loop" optimization, and also uses
-        // the software prefetch instruction to get the data into the cache.
-align 16
-$memcpy_ic_1:            ; 64-byte block copies, in-cache copy
-
-    prefetchnta [esi + (200*64/34+192)]        ; start reading ahead
-
-    movq    mm0, [esi+0]    ; read 64 bits
-    movq    mm1, [esi+8]
-    movq    [edi+0], mm0    ; write 64 bits
-    movq    [edi+8], mm1    ; note:  the normal movq writes the
-    movq    mm2, [esi+16]   ; data to cache; a cache line will be
-    movq    mm3, [esi+24]   ; allocated as needed, to store the data
-    movq    [edi+16], mm2
-    movq    [edi+24], mm3
-    movq    mm0, [esi+32]
-    movq    mm1, [esi+40]
-    movq    [edi+32], mm0
-    movq    [edi+40], mm1
-    movq    mm2, [esi+48]
-    movq    mm3, [esi+56]
-    movq    [edi+48], mm2
-    movq    [edi+56], mm3
-
-    add        esi, 64      ; update source pointer
-    add        edi, 64      ; update destination pointer
-    dec        ecx          ; count down
-    jnz        $memcpy_ic_1 ; last 64-byte block?
-
-$memcpy_ic_2:
-    mov        ecx, ebx     ; has valid low 6 bits of the byte count
-$memcpy_ic_3:
-    shr        ecx, 2       ; dword count
-    and        ecx, 1111b   ; only look at the "remainder" bits
-    neg        ecx          ; set up to jump into the array
-    add        ecx, offset $memcpy_last_few
-    jmp        ecx          ; jump to array of movsds
-
-$memcpy_uc_test:
-    cmp        ecx, UNCACHED_COPY/64    ; big enough? use block prefetch copy
-    jae        $memcpy_bp_1
-
-$memcpy_64_test:
-    or        ecx, ecx      ; tail end of block prefetch will jump here
-    jz        $memcpy_ic_2  ; no more 64-byte blocks left
-
-        // For larger blocks, which will spill beyond the cache, its faster to
-        // use the Streaming Store instruction MOVNTQ.   This write instruction
-        // bypasses the cache and writes straight to main memory.  This code also
-        // uses the software prefetch instruction to pre-read the data.
-align 16
-$memcpy_uc_1:               ; 64-byte blocks, uncached copy
-
-    prefetchnta [esi + (200*64/34+192)]        ; start reading ahead
-
-    movq    mm0,[esi+0]     ; read 64 bits
-    add     edi,64          ; update destination pointer
-    movq    mm1,[esi+8]
-    add     esi,64          ; update source pointer
-    movq    mm2,[esi-48]
-    movntq  [edi-64], mm0   ; write 64 bits, bypassing the cache
-    movq    mm0,[esi-40]    ; note: movntq also prevents the CPU
-    movntq  [edi-56], mm1   ; from READING the destination address
-    movq    mm1,[esi-32]    ; into the cache, only to be over-written
-    movntq  [edi-48], mm2   ; so that also helps performance
-    movq    mm2,[esi-24]
-    movntq  [edi-40], mm0
-    movq    mm0,[esi-16]
-    movntq  [edi-32], mm1
-    movq    mm1,[esi-8]
-    movntq  [edi-24], mm2
-    movntq  [edi-16], mm0
-    dec     ecx
-    movntq  [edi-8], mm1
-    jnz     $memcpy_uc_1    ; last 64-byte block?
-
-    jmp     $memcpy_ic_2    ; almost done
-
-    // For the largest size blocks, a special technique called Block Prefetch
-    // can be used to accelerate the read operations.   Block Prefetch reads
-    // one address per cache line, for a series of cache lines, in a short loop.
-    // This is faster than using software prefetch, in this case.
-    // The technique is great for getting maximum read bandwidth,
-    // especially in DDR memory systems.
-$memcpy_bp_1:               ; large blocks, block prefetch copy
-
-    cmp     ecx, CACHEBLOCK ; big enough to run another prefetch loop?
-    jl      $memcpy_64_test ; no, back to regular uncached copy
-
-    mov     eax, CACHEBLOCK / 2  ; block prefetch loop, unrolled 2X
-    add     esi, CACHEBLOCK * 64 ; move to the top of the block
-align 16
-$memcpy_bp_2:
-    mov     edx, [esi-64]   ; grab one address per cache line
-    mov     edx, [esi-128]  ; grab one address per cache line
-    sub     esi, 128        ; go reverse order
-    dec     eax             ; count down the cache lines
-    jnz     $memcpy_bp_2    ; keep grabbing more lines into cache
-
-    mov     eax, CACHEBLOCK ; now that its in cache, do the copy
-align 16
-$memcpy_bp_3:
-    movq    mm0, [esi   ]   ; read 64 bits
-    movq    mm1, [esi+ 8]
-    movq    mm2, [esi+16]
-    movq    mm3, [esi+24]
-    movq    mm4, [esi+32]
-    movq    mm5, [esi+40]
-    movq    mm6, [esi+48]
-    movq    mm7, [esi+56]
-    add     esi, 64         ; update source pointer
-    movntq  [edi   ], mm0   ; write 64 bits, bypassing cache
-    movntq  [edi+ 8], mm1   ; note: movntq also prevents the CPU
-    movntq  [edi+16], mm2   ; from READING the destination address
-    movntq  [edi+24], mm3   ; into the cache, only to be over-written,
-    movntq  [edi+32], mm4   ; so that also helps performance
-    movntq  [edi+40], mm5
-    movntq  [edi+48], mm6
-    movntq  [edi+56], mm7
-    add     edi, 64         ; update dest pointer
-
-    dec     eax             ; count down
-
-    jnz     $memcpy_bp_3    ; keep copying
-    sub     ecx, CACHEBLOCK ; update the 64-byte block count
-    jmp     $memcpy_bp_1    ; keep processing chunks
-
-    // The smallest copy uses the X86 "movsd" instruction, in an optimized
-    // form which is an "unrolled loop".   Then it handles the last few bytes.
-align 4
-    movsd
-    movsd            ; perform last 1-15 dword copies
-    movsd
-    movsd
-    movsd
-    movsd
-    movsd
-    movsd
-    movsd
-    movsd            ; perform last 1-7 dword copies
-    movsd
-    movsd
-    movsd
-    movsd
-    movsd
-    movsd
-
-$memcpy_last_few:           ; dword aligned from before movsds
-    mov     ecx, ebx        ; has valid low 2 bits of the byte count
-    and     ecx, 11b        ; the last few cows must come home
-    jz      $memcpy_final   ; no more, lets leave
-    rep     movsb           ; the last 1, 2, or 3 bytes
-
-$memcpy_final:
-    emms                    ; clean up the MMX state
-    sfence                  ; flush the write buffer
-    mov     eax, [dest]     ; ret value = destination pointer
-
-  }
-#else  // !defined(_WIN64))
-
-  return memcpy(dest, src, n);
-
-#endif
-}
-
 uint64_t Os::offsetToEpochNanos() {
  static uint64_t offset = 0;

@@ -56,37 +56,6 @@ class Runtime : AllStatic {
  }
 };

-#if 0
-class HostThread : public Thread
-{
-private:
-    virtual void run(void* data) { ShouldNotCallThis(); }
-
-public:
-    HostThread() : Thread("HostThread", 0, false)
-    {
-        setHandle(NULL);
-        setCurrent();
-
-        if (!amd::Runtime::initialized() && !amd::Runtime::init()) {
-            return;
-        }
-
-        Os::currentStackInfo(&stackBase_, &stackSize_);
-        setState(RUNNABLE);
-    }
-
-    bool isHostThread() const { return true; };
-
-    static inline HostThread* current()
-    {
-        Thread* thread = Thread::current();
-        assert(thread->isHostThread() && "just checking");
-        return (HostThread*) thread;
-    }
-};
-#endif
-
 /*@}*/

 inline bool Runtime::initialized() { return initialized_; }