From c957454bcb8409365a00eafec3dc4b9baf1d3356 Mon Sep 17 00:00:00 2001
From: foreman
Date: Mon, 12 Jun 2017 13:31:09 -0400
Subject: [PATCH] P4 to Git Change 1421208 by gandryey@gera-w8 on 2017/06/12
13:15:22
SWDEV-124171 - adding support for p2p OCL in rocm stack
- Add the cl_amd_copy_buffer_p2p extension for P2P transfers. The extension adds a new API entry, clEnqueueCopyBufferP2PAMD(), which transfers CL buffers between different CL contexts on different GPUs. If P2P isn't possible, a double copy is performed instead.
- The app can also query a device's P2P support capabilities: a list of P2P-accessible devices can be returned for the current device.
http://ocltc.amd.com/reviews/r/12913/
Affected files ...
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_context.cpp#54 edit
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_device.cpp#62 edit
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_p2p_amd.cpp#1 add
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_p2p_amd.h#1 add
... //depot/stg/opencl/drivers/opencl/api/opencl/khronos/headers/opencl2.0/CL/cl_ext.h#29 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpuvirtual.hpp#14 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#287 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.hpp#141 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#26 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocblit.cpp#19 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#54 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#22 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmemory.cpp#24 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.cpp#19 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#39 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#12 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/command.cpp#79 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/command.hpp#84 edit
[ROCm/clr commit: 628503e964f4b4c249127160d2817828d59b9799]
---
.../opencl/api/opencl/amdocl/cl_context.cpp | 4 +
.../opencl/api/opencl/amdocl/cl_device.cpp | 18 ++++
.../opencl/api/opencl/amdocl/cl_p2p_amd.cpp | 88 +++++++++++++++++++
.../clr/opencl/api/opencl/amdocl/cl_p2p_amd.h | 19 ++++
.../khronos/headers/opencl2.0/CL/cl_ext.h | 20 +++++
5 files changed, 149 insertions(+)
create mode 100644 projects/clr/opencl/api/opencl/amdocl/cl_p2p_amd.cpp
create mode 100644 projects/clr/opencl/api/opencl/amdocl/cl_p2p_amd.h
diff --git a/projects/clr/opencl/api/opencl/amdocl/cl_context.cpp b/projects/clr/opencl/api/opencl/amdocl/cl_context.cpp
index 8472cac544..ffb4ac21ed 100644
--- a/projects/clr/opencl/api/opencl/amdocl/cl_context.cpp
+++ b/projects/clr/opencl/api/opencl/amdocl/cl_context.cpp
@@ -23,6 +23,7 @@
#include "cl_thread_trace_amd.h"
#include "cl_debugger_amd.h"
#include "cl_lqdflash_amd.h"
+#include "cl_p2p_amd.h"
#include
#include
@@ -503,6 +504,9 @@ CL_API_ENTRY void* CL_API_CALL clGetExtensionFunctionAddress(const char* func_na
#if cl_amd_liquid_flash
CL_EXTENSION_ENTRYPOINT_CHECK(clEnqueueReadSsgFileAMD);
CL_EXTENSION_ENTRYPOINT_CHECK(clEnqueueWriteSsgFileAMD);
#endif // cl_amd_liquid_flash
+#if cl_amd_copy_buffer_p2p  // entry point of the P2P buffer-copy extension
+ CL_EXTENSION_ENTRYPOINT_CHECK(clEnqueueCopyBufferP2PAMD);
+#endif // cl_amd_copy_buffer_p2p
break;
case 'G':
diff --git a/projects/clr/opencl/api/opencl/amdocl/cl_device.cpp b/projects/clr/opencl/api/opencl/amdocl/cl_device.cpp
index fe01c15661..e41211af94 100644
--- a/projects/clr/opencl/api/opencl/amdocl/cl_device.cpp
+++ b/projects/clr/opencl/api/opencl/amdocl/cl_device.cpp
@@ -557,6 +557,24 @@ RUNTIME_ENTRY(cl_int, clGetDeviceInfo,
#define CL_DEVICE_MAX_REAL_TIME_COMPUTE_UNITS_AMD 0x404E
CASE(CL_DEVICE_MAX_REAL_TIME_COMPUTE_QUEUES_AMD, numRTQueues_);
CASE(CL_DEVICE_MAX_REAL_TIME_COMPUTE_UNITS_AMD, numRTCUs_);
+    // Reports how many entries the device's p2pDevices_ list holds.
+    case CL_DEVICE_NUM_P2P_DEVICES_AMD: {
+      cl_uint num_p2p_devices = static_cast<cl_uint>(as_amd(device)->p2pDevices_.size());
+      return amd::clGetInfo(num_p2p_devices, param_value_size, param_value, param_value_size_ret);
+    }
+    // Copies the p2pDevices_ list (treated as an array of cl_device_id) to the caller.
+    // Returns CL_INVALID_VALUE when a buffer is supplied but is too small for the list.
+    case CL_DEVICE_P2P_DEVICES_AMD: {
+      const size_t valueSize = as_amd(device)->p2pDevices_.size() * sizeof(cl_device_id);
+      if (param_value != NULL) {
+        if (param_value_size < valueSize) {
+          return CL_INVALID_VALUE;
+        }
+        // An empty list may expose a NULL data() pointer, so only copy when there is payload.
+        if (valueSize != 0) {
+          ::memcpy(param_value, as_amd(device)->p2pDevices_.data(), valueSize);
+        }
+        // Zero the part of the caller's buffer that the list does not fill.
+        if (param_value_size > valueSize) {
+          ::memset(static_cast<char*>(param_value) + valueSize, '\0',
+                   param_value_size - valueSize);
+        }
+      }
+      // param_value == NULL is a size-only query: report the size without touching the buffer.
+      *not_null(param_value_size_ret) = valueSize;
+      return CL_SUCCESS;
+    }
default:
break;
}
diff --git a/projects/clr/opencl/api/opencl/amdocl/cl_p2p_amd.cpp b/projects/clr/opencl/api/opencl/amdocl/cl_p2p_amd.cpp
new file mode 100644
index 0000000000..3932b83c01
--- /dev/null
+++ b/projects/clr/opencl/api/opencl/amdocl/cl_p2p_amd.cpp
@@ -0,0 +1,88 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#include "cl_common.hpp"
+#include
+
+#include "cl_p2p_amd.h"
+#include "platform/object.hpp"
+
+// Enqueues a copy of cb bytes from src_buffer (starting at src_offset) into
+// dst_buffer (starting at dst_offset) as an amd::CopyMemoryP2PCommand on
+// command_queue. Only src_buffer's context is compared with the queue's
+// context; dst_buffer's context is not checked here.
+//
+// Returns CL_SUCCESS after the command has been enqueued, otherwise:
+//   CL_INVALID_COMMAND_QUEUE         - invalid handle, or not a host queue
+//   CL_INVALID_MEM_OBJECT            - invalid handle, or not a buffer
+//   CL_INVALID_CONTEXT               - queue and src_buffer contexts differ
+//   CL_INVALID_VALUE                 - a region fails Buffer::validateRegion()
+//   CL_MEM_COPY_OVERLAP              - same buffer and the byte ranges overlap
+//   CL_INVALID_EVENT_WAIT_LIST       - count/pointer mismatch or invalid event
+//   CL_OUT_OF_HOST_MEMORY            - the command object is NULL after new
+//   CL_MEM_OBJECT_ALLOCATION_FAILURE - command->validateMemory() failed
+RUNTIME_ENTRY(cl_int, clEnqueueCopyBufferP2PAMD,
+              (cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer,
+               size_t src_offset, size_t dst_offset, size_t cb, cl_uint num_events_in_wait_list,
+               const cl_event* event_wait_list, cl_event* event)) {
+  if (!is_valid(command_queue)) {
+    return CL_INVALID_COMMAND_QUEUE;
+  }
+
+  // Both handles must be valid memory objects that are also buffers.
+  if (!(is_valid(src_buffer) && is_valid(dst_buffer))) {
+    return CL_INVALID_MEM_OBJECT;
+  }
+  amd::Buffer* source = as_amd(src_buffer)->asBuffer();
+  amd::Buffer* target = as_amd(dst_buffer)->asBuffer();
+  if (NULL == source || NULL == target) {
+    return CL_INVALID_MEM_OBJECT;
+  }
+
+  // The command can only be submitted through a host queue.
+  amd::HostQueue* hostQueue = as_amd(command_queue)->asHostQueue();
+  if (hostQueue == NULL) {
+    return CL_INVALID_COMMAND_QUEUE;
+  }
+
+  if (hostQueue->context() != source->getContext()) {
+    return CL_INVALID_CONTEXT;
+  }
+
+  // Express the 1D byte ranges as 3D coordinates for the runtime.
+  amd::Coord3D srcOrigin(src_offset, 0, 0);
+  amd::Coord3D dstOrigin(dst_offset, 0, 0);
+  amd::Coord3D extent(cb, 1, 1);
+
+  if (!source->validateRegion(srcOrigin, extent)) {
+    return CL_INVALID_VALUE;
+  }
+  if (!target->validateRegion(dstOrigin, extent)) {
+    return CL_INVALID_VALUE;
+  }
+
+  // Copying within one buffer is rejected when either range starts inside the other.
+  if (source == target) {
+    const bool dstStartsInSrc = (src_offset <= dst_offset) && (dst_offset < src_offset + cb);
+    const bool srcStartsInDst = (dst_offset <= src_offset) && (src_offset < dst_offset + cb);
+    if (dstStartsInSrc || srcStartsInDst) {
+      return CL_MEM_COPY_OVERLAP;
+    }
+  }
+
+  // The count and the pointer must agree: both empty or both populated.
+  amd::Command::EventWaitList waitList;
+  const bool countIsZero = (num_events_in_wait_list == 0);
+  const bool listIsNull = (event_wait_list == NULL);
+  if (countIsZero != listIsNull) {
+    return CL_INVALID_EVENT_WAIT_LIST;
+  }
+
+  for (cl_uint i = 0; i < num_events_in_wait_list; ++i) {
+    cl_event waitHandle = event_wait_list[i];
+    if (!is_valid(waitHandle)) {
+      return CL_INVALID_EVENT_WAIT_LIST;
+    }
+    waitList.push_back(as_amd(waitHandle));
+  }
+
+  amd::CopyMemoryP2PCommand* command = new amd::CopyMemoryP2PCommand(
+      *hostQueue, CL_COMMAND_COPY_BUFFER, waitList, *source, *target, srcOrigin, dstOrigin,
+      extent);
+  if (NULL == command) {
+    return CL_OUT_OF_HOST_MEMORY;
+  }
+
+  // The command has not been enqueued yet, so it is simply deleted on failure.
+  if (!command->validateMemory()) {
+    delete command;
+    return CL_MEM_OBJECT_ALLOCATION_FAILURE;
+  }
+
+  command->enqueue();
+
+  // Store the command's event through not_null(event); when the caller passed
+  // event == NULL, release the command here instead.
+  *not_null(event) = as_cl(&command->event());
+  if (event == NULL) {
+    command->release();
+  }
+  return CL_SUCCESS;
+}
+RUNTIME_EXIT
+
diff --git a/projects/clr/opencl/api/opencl/amdocl/cl_p2p_amd.h b/projects/clr/opencl/api/opencl/amdocl/cl_p2p_amd.h
new file mode 100644
index 0000000000..7f9fa0c58c
--- /dev/null
+++ b/projects/clr/opencl/api/opencl/amdocl/cl_p2p_amd.h
@@ -0,0 +1,19 @@
+#ifndef __CL_P2P_AMD_H
+#define __CL_P2P_AMD_H
+
+#include "CL/cl_ext.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif /*__cplusplus*/
+
+/*
+ * cl_amd_copy_buffer_p2p entry point: enqueues a copy of cb bytes from
+ * src_buffer (at src_offset) to dst_buffer (at dst_offset) on command_queue.
+ * event_wait_list / num_events_in_wait_list name the events to wait on and
+ * event optionally receives the event of the enqueued command.
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBufferP2PAMD(cl_command_queue command_queue,
+                          cl_mem src_buffer,
+                          cl_mem dst_buffer,
+                          size_t src_offset,
+                          size_t dst_offset,
+                          size_t cb,
+                          cl_uint num_events_in_wait_list,
+                          const cl_event* event_wait_list,
+                          cl_event* event) CL_EXT_SUFFIX__VERSION_1_2;
+
+#ifdef __cplusplus
+} /*extern "C"*/
+#endif /*__cplusplus*/
+
+#endif /* __CL_P2P_AMD_H */
diff --git a/projects/clr/opencl/api/opencl/khronos/headers/opencl2.0/CL/cl_ext.h b/projects/clr/opencl/api/opencl/khronos/headers/opencl2.0/CL/cl_ext.h
index f7032f88a2..9e332016c1 100644
--- a/projects/clr/opencl/api/opencl/khronos/headers/opencl2.0/CL/cl_ext.h
+++ b/projects/clr/opencl/api/opencl/khronos/headers/opencl2.0/CL/cl_ext.h
@@ -513,6 +513,26 @@ typedef CL_API_ENTRY cl_int
const cl_event * /*event_wait_list*/,
cl_event * /*event*/) CL_EXT_SUFFIX__VERSION_1_2;
+/*************************
+* cl_amd_copy_buffer_p2p *
+**************************/
+#define cl_amd_copy_buffer_p2p 1
+
+/* clGetDeviceInfo tokens: number of P2P devices and the list of those devices */
+#define CL_DEVICE_NUM_P2P_DEVICES_AMD 0x4088
+#define CL_DEVICE_P2P_DEVICES_AMD 0x4089
+
+/* Function-pointer type matching the clEnqueueCopyBufferP2PAMD entry point */
+typedef CL_API_ENTRY cl_int
+(CL_API_CALL * clEnqueueCopyBufferP2PAMD_fn)(
+    cl_command_queue /*command_queue*/,
+    cl_mem           /*src_buffer*/,
+    cl_mem           /*dst_buffer*/,
+    size_t           /*src_offset*/,
+    size_t           /*dst_offset*/,
+    size_t           /*cb*/,
+    cl_uint          /*num_events_in_wait_list*/,
+    const cl_event*  /*event_wait_list*/,
+    cl_event*        /*event*/) CL_EXT_SUFFIX__VERSION_1_2;
+
+
#endif /* CL_VERSION_1_2 */