diff --git a/projects/clr/hipamd/src/hip_graph_internal.cpp b/projects/clr/hipamd/src/hip_graph_internal.cpp
index ec42414f8d..dccf7044a7 100644
--- a/projects/clr/hipamd/src/hip_graph_internal.cpp
+++ b/projects/clr/hipamd/src/hip_graph_internal.cpp
@@ -395,19 +395,28 @@ hipError_t GraphExec::CaptureAQLPackets() {
       }
     }
 
-    if (device_kernarg_pool_ && !device->isXgmi()) {
-      if (device->info().hdpMemFlushCntl != nullptr) {
+    auto kernArgImpl = device->settings().kernel_arg_impl_;
+
+    const auto applyMemOrderingWA =
+        ((kernArgImpl == KernelArgImpl::DeviceKernelArgsReadback) ||
+         (kernArgImpl == KernelArgImpl::DeviceKernelArgsHDP)) &&
+        kernarg_pool_size_graph_ > 0;
+
+    if (device_kernarg_pool_ && applyMemOrderingWA) {
+      address dev_ptr = kernarg_pool_graph_ + kernarg_pool_size_graph_;
+      volatile char kSentinel = *(dev_ptr - 1);
+      // Memory ordering workaround for PCIe: issue an sfence, then rewrite
+      // the last byte of the kernarg pool.
+      _mm_sfence();
+      *(dev_ptr - 1) = kSentinel;
+      // HDP flush is required to guarantee ordering in Navi and MI100
+      if (kernArgImpl == KernelArgImpl::DeviceKernelArgsHDP) {
         *device->info().hdpMemFlushCntl = 1u;
-        if (*device->info().hdpMemFlushCntl != UINT32_MAX) {
-          LogError("Unexpected HDP Register readback value!");
-        }
-      } else {
-        amd::Command* command = new amd::Marker(*capture_stream_, true);
-        if (command != nullptr) {
-          command->enqueue();
-          command->release();
-        }
       }
+      // Memory ordering workaround for PCIe: issue an mfence, then read back
+      // the last byte of the kernarg pool.
+      _mm_mfence();
+      kSentinel = *(dev_ptr - 1);
     }
 
     ResetQueueIndex();
diff --git a/projects/clr/hipamd/src/hip_graph_internal.hpp b/projects/clr/hipamd/src/hip_graph_internal.hpp
index c2b7298722..2e8a90c441 100644
--- a/projects/clr/hipamd/src/hip_graph_internal.hpp
+++ b/projects/clr/hipamd/src/hip_graph_internal.hpp
@@ -647,6 +647,8 @@ struct GraphExec {
   // Capture GPU Packets from graph commands
   hipError_t CaptureAQLPackets();
   hipError_t UpdateAQLPacket(hip::GraphKernelNode* node);
+
+  using KernelArgImpl = device::Settings::KernelArgImpl;
 };
 
 struct ChildGraphNode : public GraphNode {
diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp
index b4b1afd1af..edea84f6f8 100644
--- a/projects/clr/rocclr/device/device.hpp
+++ b/projects/clr/rocclr/device/device.hpp
@@ -651,6 +651,19 @@ struct Info : public amd::EmbeddedObject {
 //! Device settings
 class Settings : public amd::HeapObject {
  public:
+  //! Kernel argument placement strategy; the value is stored in kernel_arg_impl_ (2 bits)
+  enum KernelArgImpl {
+    HostKernelArgs = 0,       //!< Kernel arguments are placed in host memory
+    DeviceKernelArgs,         //!< Kernel arguments in device memory, no memory
+                              //!< ordering workaround applied (e.g. XGMI)
+    DeviceKernelArgsReadback, //!< Kernel arguments in device memory with the
+                              //!< kernel argument readback workaround (works
+                              //!< only on ASICs >= MI200)
+    DeviceKernelArgsHDP       //!< Kernel arguments in device memory with the
+                              //!< readback workaround plus an HDP flush. Works on
+                              //!< all ASICs; requires a valid HDP flush register
+  };
+
   uint64_t extensions_;  //!< Supported OCL extensions
   union {
     struct {
@@ -675,7 +688,8 @@ class Settings : public amd::HeapObject {
       uint rocr_backend_ : 1;         //!< Device uses ROCr backend for submissions
       uint gwsInitSupported_:1;       //!< Check if GWS is supported on this machine.
       uint kernel_arg_opt_: 1;        //!< Enables kernel arg optimization for blit kernels
-      uint reserved_ : 9;
+      uint kernel_arg_impl_ : 2;      //!< Kernel argument implementation 
+      uint reserved_ : 7;
     };
     uint value_;
   };
diff --git a/projects/clr/rocclr/device/pal/palsettings.cpp b/projects/clr/rocclr/device/pal/palsettings.cpp
index c68eaff6f8..dd01e758d8 100644
--- a/projects/clr/rocclr/device/pal/palsettings.cpp
+++ b/projects/clr/rocclr/device/pal/palsettings.cpp
@@ -143,7 +143,9 @@ Settings::Settings() {
   alwaysResident_ = amd::IS_HIP ? true : false;
   prepinnedMinSize_ = 0;
   cpDmaCopySizeMax_ = GPU_CP_DMA_COPY_SIZE * Ki;
-  useDeviceKernelArg_ = flagIsDefault(HIP_FORCE_DEV_KERNARG) ? true : HIP_FORCE_DEV_KERNARG;
+  kernel_arg_impl_ = flagIsDefault(HIP_FORCE_DEV_KERNARG)
+                         ? KernelArgImpl::DeviceKernelArgs
+                         : HIP_FORCE_DEV_KERNARG;
 
   limit_blit_wg_ = 16;
   DEBUG_CLR_GRAPH_PACKET_CAPTURE = false; // disable graph performance optimizations for PAL
diff --git a/projects/clr/rocclr/device/pal/palsettings.hpp b/projects/clr/rocclr/device/pal/palsettings.hpp
index 942c5c91bd..e3cd2c7f39 100644
--- a/projects/clr/rocclr/device/pal/palsettings.hpp
+++ b/projects/clr/rocclr/device/pal/palsettings.hpp
@@ -79,8 +79,7 @@ class Settings : public device::Settings {
       uint imageBufferWar_ : 1;         //!< Image buffer workaround for Gfx10
       uint disableSdma_ : 1;            //!< Disable SDMA support
       uint alwaysResident_ : 1;         //!< Make resources resident at allocation time
-      uint useDeviceKernelArg_ : 1;     //!< Use persistent memory for kernel arguments
-      uint reserved_ : 9;
+      uint reserved_ : 10;
     };
     uint value_;
   };
@@ -139,6 +138,8 @@ class Settings : public device::Settings {
 
   //! Overrides current settings based on registry/environment
   void override();
+
+  using KernelArgImpl = device::Settings::KernelArgImpl;
 };
 
 /*@}*/  // namespace pal
diff --git a/projects/clr/rocclr/device/pal/palvirtual.cpp b/projects/clr/rocclr/device/pal/palvirtual.cpp
index a20577b24c..a5228f9b08 100644
--- a/projects/clr/rocclr/device/pal/palvirtual.cpp
+++ b/projects/clr/rocclr/device/pal/palvirtual.cpp
@@ -408,7 +408,7 @@ bool VirtualGPU::Queue::flush() {
   submitInfo.ppFences             = &iCmdFences_[cmdBufIdSlot_];
 
   if (iQueue_->Type() == Pal::QueueTypeCompute) {
-    if (settings.useDeviceKernelArg_) {
+    if (gpu_.dev().settings().kernel_arg_impl_ == KernelArgImpl::DeviceKernelArgs) {
       // If runtime uses device memory for kernel arguments, then perform a CPU read back on
       // submission. That will make sure NBIO puches all previous CPU write requests through PCIE
       gpu_.managedBuffer().CpuReadBack();
@@ -955,10 +955,12 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
   }
 
   // Create buffers for kernel arg management
-  if (!managedBuffer_.create(
-      dev().settings().useDeviceKernelArg_ ? Resource::Persistent : Resource::RemoteUSWC)) {
+  if (!managedBuffer_.create(dev().settings().kernel_arg_impl_ ==
+                                     KernelArgImpl::DeviceKernelArgs
+                                 ? Resource::Persistent
+                                 : Resource::RemoteUSWC)) {
     // Try just USWC if persistent memory failed
-    if (dev().settings().useDeviceKernelArg_) {
+    if (dev().settings().kernel_arg_impl_ == KernelArgImpl::DeviceKernelArgs) {
       if (!managedBuffer_.create(Resource::RemoteUSWC)) {
         return false;
       }
diff --git a/projects/clr/rocclr/device/pal/palvirtual.hpp b/projects/clr/rocclr/device/pal/palvirtual.hpp
index bf46db9272..58bdb307bc 100644
--- a/projects/clr/rocclr/device/pal/palvirtual.hpp
+++ b/projects/clr/rocclr/device/pal/palvirtual.hpp
@@ -721,6 +721,8 @@ class VirtualGPU : public device::VirtualDevice {
   MemoryRange sdmaRange_;                   //!< SDMA memory range for write access
 
   void* hostcallBuffer_;  //!< Hostcall buffer
+
+  using KernelArgImpl = device::Settings::KernelArgImpl;
 };
 
 inline void VirtualGPU::logVmMemory(const std::string name, const Memory* memory) {
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp
index ab0d00015d..e046700e92 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.cpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp
@@ -141,7 +141,7 @@ bool NullDevice::create(const amd::Isa &isa) {
   roc::Settings* hsaSettings = new roc::Settings();
   settings_ = hsaSettings;
   if (!hsaSettings ||
-      !hsaSettings->create(false, isa.versionMajor(), isa.versionMinor(), isa.versionStepping(),
+      !hsaSettings->create(false, isa,
                            isa.xnack() == amd::Isa::Feature::Enabled)) {
     LogPrintfError("Error creating settings for offline HSA device %s", isa.targetId());
     return false;
@@ -734,23 +734,17 @@ bool Device::create() {
 
   info_.hdpMemFlushCntl = hdpInfo.HDP_MEM_FLUSH_CNTL;
   info_.hdpRegFlushCntl = hdpInfo.HDP_REG_FLUSH_CNTL;
-
-  bool device_kernel_args = true;
-  if (!isXgmi_ && ((info_.hdpMemFlushCntl == nullptr) || (info_.hdpRegFlushCntl == nullptr))) {
-    LogWarning("Unable to determine HDP flush register address. "
-      "Device kernel arguments are not supported");
-    device_kernel_args = false;
-  }
+  bool hasValidHDPFlush =
+      (info_.hdpMemFlushCntl != nullptr) && (info_.hdpRegFlushCntl != nullptr);
 
   // Create HSA settings
   assert(!settings_);
   roc::Settings* hsaSettings = new roc::Settings();
   settings_ = hsaSettings;
   if (!hsaSettings ||
-      !hsaSettings->create((agent_profile_ == HSA_PROFILE_FULL), isa->versionMajor(),
-                           isa->versionMinor(), isa->versionStepping(),
+      !hsaSettings->create((agent_profile_ == HSA_PROFILE_FULL), *isa,
                            isa->xnack() == amd::Isa::Feature::Enabled,
-                           coop_groups, device_kernel_args)) {
+                           coop_groups, isXgmi_, hasValidHDPFlush)) {
     LogPrintfError("Unable to create settings for HSA device %s (PCI ID %x)", agent_name,
                    pciDeviceId_);
     return false;
diff --git a/projects/clr/rocclr/device/rocm/rocsettings.cpp b/projects/clr/rocclr/device/rocm/rocsettings.cpp
index fff1133f60..19bac547fd 100644
--- a/projects/clr/rocclr/device/rocm/rocsettings.cpp
+++ b/projects/clr/rocclr/device/rocm/rocsettings.cpp
@@ -95,15 +95,20 @@ Settings::Settings() {
   fgs_kernel_arg_ = false;
   barrier_value_packet_ = false;
 
-  device_kernel_args_ = false;
+  kernel_arg_impl_ = KernelArgImpl::HostKernelArgs;
   gwsInitSupported_ = true;
   limit_blit_wg_ = 16;
 }
 
 // ================================================================================================
-bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor,
-                      uint32_t gfxStepping, bool enableXNACK, bool coop_groups,
-                      bool device_kernel_args) {
+bool Settings::create(bool fullProfile, const amd::Isa& isa,
+                      bool enableXNACK, bool coop_groups, 
+                      bool isXgmi, bool hasValidHDPFlush) {
+
+  uint32_t gfxipMajor = isa.versionMajor();
+  uint32_t gfxipMinor = isa.versionMinor();
+  uint32_t gfxStepping = isa.versionStepping();
+
   customHostAllocator_ = false;
 
   if (fullProfile) {
@@ -166,12 +171,7 @@ bool Settings::create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor
     barrier_value_packet_ = true;
   }
 
-  // Enable device kernel args for MI300* for now
-  if (gfxipMajor == 9 && gfxipMinor == 4 &&
-      (gfxStepping == 0 || gfxStepping == 1 || gfxStepping == 2)) {
-    device_kernel_args_ = HIP_FORCE_DEV_KERNARG && device_kernel_args;
-    kernel_arg_opt_ = true;
-  }
+  setKernelArgImpl(isa, isXgmi, hasValidHDPFlush);
 
   if (gfxipMajor >= 10) {
      enableWave32Mode_ = true;
@@ -235,14 +235,51 @@ void Settings::override() {
     fgs_kernel_arg_ = ROC_USE_FGS_KERNARG;
   }
 
-  if (!flagIsDefault(HIP_FORCE_DEV_KERNARG)) {
-    device_kernel_args_ = HIP_FORCE_DEV_KERNARG;
-  }
-
   if (!flagIsDefault(DEBUG_CLR_BLIT_KERNARG_OPT)) {
     kernel_arg_opt_ = DEBUG_CLR_BLIT_KERNARG_OPT;
   }
 }
+
+// ================================================================================================
+void Settings::setKernelArgImpl(const amd::Isa& isa, bool isXgmi, bool hasValidHDPFlush) {
+  // Selects kernel_arg_impl_ (and kernel_arg_opt_ on MI300) from the ISA version,
+  // the XGMI flag, HDP flush register availability and HIP_FORCE_DEV_KERNARG.
+  const uint32_t major = isa.versionMajor();
+  const uint32_t minor = isa.versionMinor();
+  const uint32_t stepping = isa.versionStepping();
+
+  // ASIC families, keyed on the gfxip major.minor.stepping triple
+  const bool mi300 = (major == 9) && (minor == 4) && (stepping <= 2);
+  const bool mi200 = (major == 9) && (minor == 0) && (stepping == 10);
+  const bool mi100 = (major == 9) && (minor == 0) && (stepping == 8);
+  const bool navi = (major >= 10);
+
+  // Implementation this ASIC/link supports; host memory unless a case below applies
+  KernelArgImpl candidate = KernelArgImpl::HostKernelArgs;
+  if (isXgmi) {
+    // XGMI needs none of the manual memory ordering workarounds that PCIe does
+    candidate = KernelArgImpl::DeviceKernelArgs;
+  } else if (mi200 || mi300) {
+    // Kernel argument readback workaround: relies on the strict store
+    // ordering guarantee that only MI200 and MI300 provide
+    candidate = KernelArgImpl::DeviceKernelArgsReadback;
+  } else if ((navi || mi100) && hasValidHDPFlush) {
+    // gfx10+ and MI100 additionally flush HDP through MMIO, which needs a
+    // valid HDP flush register
+    candidate = KernelArgImpl::DeviceKernelArgsHDP;
+  }
+
+  // By default only MI300* takes the device kernel argument path for now
+  if (mi300) {
+    kernel_arg_impl_ = candidate;
+    kernel_arg_opt_ = true;
+  }
+
+  // Explicit HIP_FORCE_DEV_KERNARG: false -> host memory, true -> candidate on any ASIC
+  if (!flagIsDefault(HIP_FORCE_DEV_KERNARG)) {
+    kernel_arg_impl_ = HIP_FORCE_DEV_KERNARG ? candidate : KernelArgImpl::HostKernelArgs;
+  }
+}
 }  // namespace roc
 
 #endif  // WITHOUT_HSA_BACKEND
diff --git a/projects/clr/rocclr/device/rocm/rocsettings.hpp b/projects/clr/rocclr/device/rocm/rocsettings.hpp
index 4b6e384c48..7a250bb7be 100644
--- a/projects/clr/rocclr/device/rocm/rocsettings.hpp
+++ b/projects/clr/rocclr/device/rocm/rocsettings.hpp
@@ -52,8 +52,7 @@ class Settings : public device::Settings {
       uint system_scope_signal_ : 1;    //!< HSA signal is visibile to the entire system
       uint fgs_kernel_arg_ : 1;         //!< Use fine grain kernel arg segment
       uint barrier_value_packet_ : 1;   //!< Barrier value packet functionality
-      uint device_kernel_args_ : 1;     //!< Allocate kernel args in device memory
-      uint reserved_ : 20;
+      uint reserved_ : 21;
     };
     uint value_;
   };
@@ -83,9 +82,9 @@ class Settings : public device::Settings {
   Settings();
 
   //! Creates settings
-  bool create(bool fullProfile, uint32_t gfxipMajor, uint32_t gfxipMinor, uint32_t gfxStepping,
-              bool enableXNACK, bool coop_groups = false,
-              bool device_kernel_args = true);
+  bool create(bool fullProfile, const amd::Isa &isa, bool enableXNACK,
+              bool coop_groups = false, bool isXgmi = false,
+              bool hasValidHDPFlush = true);
 
  private:
   //! Disable copy constructor
@@ -96,6 +95,10 @@ class Settings : public device::Settings {
 
   //! Overrides current settings based on registry/environment
   void override();
+
+  //! Determine how kernel arguments should be implemented given ASIC (host
+  //! memory, device memory, device memory with memory ordering workaround)
+  void setKernelArgImpl(const amd::Isa& isa, bool isXgmi, bool hasValidHDPFlush);
 };
 
 /*@}*/} // namespace roc
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index 53a8451f9c..f9f74313e2 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -1363,7 +1363,8 @@ bool VirtualGPU::initPool(size_t kernarg_pool_size) {
   kernarg_pool_size_ = kernarg_pool_size;
   kernarg_pool_chunk_end_ = kernarg_pool_size_ / KernelArgPoolNumSignal;
   active_chunk_ = 0;
-  if (dev().settings().device_kernel_args_ && roc_device_.info().largeBar_) {
+  if ((dev().settings().kernel_arg_impl_ != KernelArgImpl::HostKernelArgs) &&
+      roc_device_.info().largeBar_) {
     kernarg_pool_base_ =
       reinterpret_cast<address>(roc_device_.deviceLocalAlloc(kernarg_pool_size_));
   } else {
@@ -3201,11 +3202,15 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
       }
     }
 
-    const auto pcieKernargs = !dev().isXgmi() &&
-                              dev().settings().device_kernel_args_ &&
-                              roc_device_.info().largeBar_;
     address argBuffer = hidden_arguments;
     bool isGraphCapture = vcmd != nullptr && vcmd->getCapturingState();
+    size_t argSize = std::min(gpuKernel.KernargSegmentByteSize(), signature.paramsSize());
+
+    const auto kernArgImpl = dev().settings().kernel_arg_impl_;
+    const auto applyMemOrderingWA =
+        ((kernArgImpl == KernelArgImpl::DeviceKernelArgsReadback) ||
+         (kernArgImpl == KernelArgImpl::DeviceKernelArgsHDP)) &&
+        roc_device_.info().largeBar_ && argSize > 0 && !isGraphCapture;
 
     // Find all parameters for the current kernel
     if (!kernel.parameters().deviceKernelArgs() || gpuKernel.isInternalKernel()) {
@@ -3213,16 +3218,23 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
       if (isGraphCapture) {
         argBuffer = vcmd->getKernArgOffset();
       } else {
-        const auto kernargSize = gpuKernel.KernargSegmentByteSize();
-        argBuffer = reinterpret_cast<address>(allocKernArg(kernargSize,
-                                              gpuKernel.KernargSegmentAlignment()));
+
+        argBuffer = reinterpret_cast<address>(
+            allocKernArg(gpuKernel.KernargSegmentByteSize(),
+                         gpuKernel.KernargSegmentAlignment()));
       }
-      // Load all kernel arguments
-      nontemporalMemcpy(argBuffer, parameters,
-                        std::min(gpuKernel.KernargSegmentByteSize(),
-                                 signature.paramsSize()));
-      if (pcieKernargs && !isGraphCapture) {
-        *dev().info().hdpMemFlushCntl = 1u;
+
+      nontemporalMemcpy(argBuffer, parameters, argSize);
+
+      if (applyMemOrderingWA) {
+        // Memory ordering workaround for PCIe: issue an sfence, then rewrite
+        // the last byte of the kernel arguments
+        _mm_sfence();
+        *(argBuffer + argSize - 1) = *(parameters + argSize - 1);
+         // HDP flush is required to guarantee ordering in Navi and MI100
+         if (kernArgImpl == KernelArgImpl::DeviceKernelArgsHDP) {
+            *dev().info().hdpMemFlushCntl = 1u;
+         }
       }
     }
 
@@ -3284,10 +3296,11 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
                            (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
       aql_packet->setup = sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
     }
-    if (pcieKernargs && !isGraphCapture) {
-      if (*dev().info().hdpMemFlushCntl != UINT32_MAX) {
-        LogError("Unexpected HDP Register readback value!");
-      }
+    if (applyMemOrderingWA) {
+      // Memory ordering workaround for PCIe: issue an mfence, then read back
+      // the last byte of the kernel arguments
+      _mm_mfence();
+      volatile char kSentinel = *(argBuffer + argSize - 1);
     }
     if (vcmd == nullptr) {
       // Dispatch the packet
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
index ea4e9943c9..f9f86207dc 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
@@ -570,5 +570,7 @@ class VirtualGPU : public device::VirtualDevice {
   bool fence_dirty_;                    //!< Fence modified flag
 
   std::atomic<uint> lastUsedSdmaEngineMask_;     //!< Last Used SDMA Engine mask
+
+  using KernelArgImpl = device::Settings::KernelArgImpl;
 };
 }