P4 to Git Change 1421208 by gandryey@gera-w8 on 2017/06/12 13:15:22

SWDEV-124171 - adding support for p2p OCL in rocm stack
	- Add the cl_amd_copy_buffer_p2p extension for P2P transfers. The extension adds a new API entry point, clEnqueueCopyBufferP2PAMD(), which allows CL buffers to be transferred between different CL contexts on different GPUs. If P2P isn't possible, a double copy is performed instead.
	- The app can also query the P2P support capabilities of a device: a list of the P2P-accessible devices can be returned for the current device.

	http://ocltc.amd.com/reviews/r/12913/

Affected files ...

... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_context.cpp#54 edit
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_device.cpp#62 edit
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_p2p_amd.cpp#1 add
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_p2p_amd.h#1 add
... //depot/stg/opencl/drivers/opencl/api/opencl/khronos/headers/opencl2.0/CL/cl_ext.h#29 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpuvirtual.hpp#14 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#287 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.hpp#141 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#26 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocblit.cpp#19 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#54 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#22 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmemory.cpp#24 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.cpp#19 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#39 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#12 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/command.cpp#79 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/command.hpp#84 edit


[ROCm/clr commit: 628503e964]
This commit is contained in:
foreman
2017-06-12 13:31:09 -04:00
parent 38ccff68fb
commit c957454bcb
5 changed files with 149 additions and 0 deletions
@@ -23,6 +23,7 @@
#include "cl_thread_trace_amd.h"
#include "cl_debugger_amd.h"
#include "cl_lqdflash_amd.h"
#include "cl_p2p_amd.h"
#include <GL/gl.h>
#include <GL/glext.h>
@@ -503,6 +504,9 @@ CL_API_ENTRY void* CL_API_CALL clGetExtensionFunctionAddress(const char* func_na
#if cl_amd_liquid_flash
CL_EXTENSION_ENTRYPOINT_CHECK(clEnqueueReadSsgFileAMD);
CL_EXTENSION_ENTRYPOINT_CHECK(clEnqueueWriteSsgFileAMD);
#endif // cl_amd_liquid_flash
#if cl_amd_copy_buffer_p2p
CL_EXTENSION_ENTRYPOINT_CHECK(clEnqueueCopyBufferP2PAMD);
#endif // cl_amd_copy_buffer_p2p
break;
case 'G':
@@ -557,6 +557,24 @@ RUNTIME_ENTRY(cl_int, clGetDeviceInfo,
#define CL_DEVICE_MAX_REAL_TIME_COMPUTE_UNITS_AMD 0x404E
CASE(CL_DEVICE_MAX_REAL_TIME_COMPUTE_QUEUES_AMD, numRTQueues_);
CASE(CL_DEVICE_MAX_REAL_TIME_COMPUTE_UNITS_AMD, numRTCUs_);
case CL_DEVICE_NUM_P2P_DEVICES_AMD: {
  // Report how many entries the device's P2P device list holds, as a cl_uint.
  cl_uint p2pDeviceCount = static_cast<cl_uint>(as_amd(device)->p2pDevices_.size());
  return amd::clGetInfo(p2pDeviceCount, param_value_size, param_value, param_value_size_ret);
}
case CL_DEVICE_P2P_DEVICES_AMD: {
  // Returns the device's P2P device list as an array of cl_device_id handles.
  //
  // - param_value == NULL is a size-only query: nothing is copied and only
  //   the required byte count is reported.
  // - A non-NULL param_value that is too small yields CL_INVALID_VALUE.
  // - Any bytes of the caller's buffer past the list are zero-filled.

  // Byte size of the complete list; kept as size_t so the count * handle-size
  // product is not narrowed.
  const size_t valueSize = as_amd(device)->p2pDevices_.size() * sizeof(cl_device_id);
  if (param_value != NULL) {
    if (param_value_size < valueSize) {
      return CL_INVALID_VALUE;
    }
    // Copy only when the caller supplied a buffer; memcpy to a null
    // destination is undefined behaviour. An empty list is skipped as well,
    // since the vector's data() may then be null.
    if (valueSize != 0) {
      memcpy(param_value, as_amd(device)->p2pDevices_.data(), valueSize);
    }
  }
  // NOTE(review): not_null() presumably substitutes a dummy location when the
  // caller passed NULL for param_value_size_ret - confirm against its definition.
  *not_null(param_value_size_ret) = valueSize;
  if (param_value != NULL && param_value_size > valueSize) {
    ::memset(static_cast<char*>(param_value) + valueSize, '\0', param_value_size - valueSize);
  }
  return CL_SUCCESS;
}
default:
break;
}
@@ -0,0 +1,88 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#include "cl_common.hpp"
#include <CL/cl_ext.h>
#include "cl_p2p_amd.h"
#include "platform/object.hpp"
// Enqueues a copy of `cb` bytes from `src_buffer` (starting at `src_offset`)
// into `dst_buffer` (starting at `dst_offset`) on `command_queue`.
//
// Validation performed here, in order:
//   - the queue handle is valid                  -> CL_INVALID_COMMAND_QUEUE
//   - both memory handles are valid buffers      -> CL_INVALID_MEM_OBJECT
//   - the queue is a host queue                  -> CL_INVALID_COMMAND_QUEUE
//   - the queue and the SOURCE buffer share a
//     context (the destination is not checked)   -> CL_INVALID_CONTEXT
//   - both regions fit inside their buffers      -> CL_INVALID_VALUE
//   - same buffer with overlapping ranges        -> CL_MEM_COPY_OVERLAP
//   - the wait list is consistent and valid      -> CL_INVALID_EVENT_WAIT_LIST
//
// On success the command is enqueued; its event is handed back through `event`
// when the caller asked for one, otherwise the command is released here.
RUNTIME_ENTRY(cl_int, clEnqueueCopyBufferP2PAMD,
              (cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer,
               size_t src_offset, size_t dst_offset, size_t cb, cl_uint num_events_in_wait_list,
               const cl_event* event_wait_list, cl_event* event)) {
  // --- Handle validation ---------------------------------------------------
  if (!is_valid(command_queue)) {
    return CL_INVALID_COMMAND_QUEUE;
  }

  if (!(is_valid(src_buffer) && is_valid(dst_buffer))) {
    return CL_INVALID_MEM_OBJECT;
  }

  amd::Buffer* const source = as_amd(src_buffer)->asBuffer();
  amd::Buffer* const destination = as_amd(dst_buffer)->asBuffer();
  if (source == NULL || destination == NULL) {
    return CL_INVALID_MEM_OBJECT;
  }

  amd::HostQueue* const queuePtr = as_amd(command_queue)->asHostQueue();
  if (queuePtr == NULL) {
    return CL_INVALID_COMMAND_QUEUE;
  }
  amd::HostQueue& hostQueue = *queuePtr;

  // Only the source buffer is required to live in the queue's context.
  if (hostQueue.context() != source->getContext()) {
    return CL_INVALID_CONTEXT;
  }

  // --- Region validation ---------------------------------------------------
  amd::Coord3D srcOrigin(src_offset, 0, 0);
  amd::Coord3D dstOrigin(dst_offset, 0, 0);
  amd::Coord3D extent(cb, 1, 1);

  if (!source->validateRegion(srcOrigin, extent) ||
      !destination->validateRegion(dstOrigin, extent)) {
    return CL_INVALID_VALUE;
  }

  // Copying within a single buffer must not use overlapping byte ranges.
  if (source == destination) {
    const bool dstStartsInsideSrc = (src_offset <= dst_offset) && (dst_offset < src_offset + cb);
    const bool srcStartsInsideDst = (dst_offset <= src_offset) && (src_offset < dst_offset + cb);
    if (dstStartsInsideSrc || srcStartsInsideDst) {
      return CL_MEM_COPY_OVERLAP;
    }
  }

  // --- Event wait list -----------------------------------------------------
  // A count and a list pointer must be supplied together, or not at all.
  const bool hasCount = (num_events_in_wait_list != 0);
  const bool hasList = (event_wait_list != NULL);
  if (hasCount != hasList) {
    return CL_INVALID_EVENT_WAIT_LIST;
  }

  amd::Command::EventWaitList waitList;
  for (cl_uint i = 0; i < num_events_in_wait_list; ++i) {
    cl_event waitEvent = event_wait_list[i];
    amd::Event* amdEvent = as_amd(waitEvent);
    if (!is_valid(waitEvent)) {
      return CL_INVALID_EVENT_WAIT_LIST;
    }
    waitList.push_back(amdEvent);
  }

  // --- Command creation and submission -------------------------------------
  amd::CopyMemoryP2PCommand* copyCmd =
      new amd::CopyMemoryP2PCommand(hostQueue, CL_COMMAND_COPY_BUFFER, waitList, *source,
                                    *destination, srcOrigin, dstOrigin, extent);
  if (copyCmd == NULL) {
    return CL_OUT_OF_HOST_MEMORY;
  }

  // Make sure we have memory for the command execution
  if (!copyCmd->validateMemory()) {
    delete copyCmd;
    return CL_MEM_OBJECT_ALLOCATION_FAILURE;
  }

  copyCmd->enqueue();

  // Hand the command's event to the caller; when no event was requested the
  // command is released immediately instead.
  *not_null(event) = as_cl(&copyCmd->event());
  if (event == NULL) {
    copyCmd->release();
  }
  return CL_SUCCESS;
}
RUNTIME_EXIT
@@ -0,0 +1,19 @@
// Declaration of the cl_amd_copy_buffer_p2p extension entry point.
#ifndef __CL_P2P_AMD_H
#define __CL_P2P_AMD_H
#include "CL/cl_ext.h"
#ifdef __cplusplus
extern "C" {
#endif /*__cplusplus*/
// Enqueues a copy of `cb` bytes from `src_buffer` at `src_offset` to
// `dst_buffer` at `dst_offset` on `command_queue`.
//
// `num_events_in_wait_list` / `event_wait_list` give the events the command
// waits on; they must be both zero/NULL or both non-zero/non-NULL.
// `event`, when non-NULL, receives the event of the enqueued command.
//
// Returns CL_SUCCESS, or one of CL_INVALID_COMMAND_QUEUE,
// CL_INVALID_MEM_OBJECT, CL_INVALID_CONTEXT, CL_INVALID_VALUE,
// CL_MEM_COPY_OVERLAP, CL_INVALID_EVENT_WAIT_LIST, CL_OUT_OF_HOST_MEMORY or
// CL_MEM_OBJECT_ALLOCATION_FAILURE.
extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueCopyBufferP2PAMD(
cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer,
size_t src_offset, size_t dst_offset, size_t cb, cl_uint num_events_in_wait_list,
const cl_event* event_wait_list, cl_event* event) CL_EXT_SUFFIX__VERSION_1_2;
#ifdef __cplusplus
} /*extern "C"*/
#endif /*__cplusplus*/
#endif // __CL_P2P_AMD_H
@@ -513,6 +513,26 @@ typedef CL_API_ENTRY cl_int
const cl_event * /*event_wait_list*/,
cl_event * /*event*/) CL_EXT_SUFFIX__VERSION_1_2;
/*************************
* cl_amd_copy_buffer_p2p *
**************************/
/* clGetDeviceInfo query: number of devices in the device's P2P list (cl_uint). */
#define CL_DEVICE_NUM_P2P_DEVICES_AMD 0x4088
/* clGetDeviceInfo query: the device's P2P list as an array of cl_device_id. */
#define CL_DEVICE_P2P_DEVICES_AMD 0x4089
/* Feature-test macro: non-zero when the extension declarations are available. */
#define cl_amd_copy_buffer_p2p 1
/* Function-pointer type with the same signature as clEnqueueCopyBufferP2PAMD();
 * intended for holding the address obtained by looking the entry point up by
 * name (e.g. via clGetExtensionFunctionAddress). */
typedef CL_API_ENTRY cl_int
(CL_API_CALL * clEnqueueCopyBufferP2PAMD_fn)(cl_command_queue /*command_queue*/,
cl_mem /*src_buffer*/,
cl_mem /*dst_buffer*/,
size_t /*src_offset*/,
size_t /*dst_offset*/,
size_t /*cb*/,
cl_uint /*num_events_in_wait_list*/,
const cl_event* /*event_wait_list*/,
cl_event* /*event*/) CL_EXT_SUFFIX__VERSION_1_2;
#endif /* CL_VERSION_1_2 */