16ebf68e43
EPR #409950 - [IV][OCL] Multiple OCL samples crashed on multiple machines for 32-bit OS. There are two issues: 1. The SC dll should be dynamically loaded only when it is available. This is to allow apps to run on CPU device without the SC dll. This CL fixes it. It also allows the user to use env var AMD_OCL_SC_LIB to provide the name or complete path of the SC dll to load. 2. The test fails because amdhsasc.dll is not included in the base driver for 32 bit OS. The proper solution should be to ask the package team to include amdhsasc.dll in the base driver. Also amdhsasc.dll should be renamed amdoclsc.dll since it is not only used for HSAIL but also used by AMDIL. The benefit of a separate SC component as a shared library is decreased build time, since changes in SC do not require a rebuild of amdocl.dll, and ease of debugging and regression analysis by allowing swapping of the SC component. However since 15.10 branch is close, there is not enough time to make changes to the package. Therefore this CL implements a workaround for this issue without change to the package. We will implement the proper fix in the next release. The workaround implemented by this CL embeds SC statically in amdocl.dll. The runtime loads the SC dll specified by env var AMD_OCL_SC_LIB only if it is available. If the SC dll is not available, it will use the embedded SC. Affected files ... ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/build/Makefile.api#96 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/api/v0_8/acl.cpp#22 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/api/v0_8/aclLoaders.cpp#9 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/gpu/Makefile#44 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/gpu/sclibdefs.opencl#20 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/include/v0_8/aclStructs.h#13 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/include/v0_8/aclTypes.h#4 edit ... 
//depot/stg/opencl/drivers/opencl/compiler/tools/aoc2/build/Makefile.aoc2#21 edit ... //depot/stg/opencl/drivers/opencl/opencldefs#148 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#485 edit ... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#220 edit
265 строки
15 KiB
C++
265 строки
15 KiB
C++
//
|
|
// Copyright (c) 2009 Advanced Micro Devices, Inc. All rights reserved.
|
|
//
|
|
|
|
#ifndef FLAGS_HPP_
|
|
#define FLAGS_HPP_
|
|
|
|
|
|
/*
 * RUNTIME_FLAGS(debug, release, release_on_stg)
 *
 * X-macro listing every runtime flag. Each entry has the form
 *
 *     kind(type, NAME, default_value, "help text")
 *
 * where `kind` is one of the three macro parameters. The caller supplies a
 * macro for each kind and the list expands to one invocation per flag, in the
 * order written here. Other parts of this header expand the list to build the
 * amd::Flag::Name enumeration, the cannotSet<NAME> constants and the flag
 * variable declarations, so the order of entries is significant.
 *
 * In this header, `debug` and `release_on_stg` entries are treated as not
 * settable in non-DEBUG builds, while `release` entries are always settable.
 *
 * NOTE(review): `Ki` and `IS_MAINLINE` are used in default values but are not
 * defined in this header; they must be visible wherever the value argument is
 * actually expanded.
 */
#define RUNTIME_FLAGS(debug,release,release_on_stg) \
debug(int, LOG_LEVEL, 0, \
        "The default log level") \
debug(bool, BREAK_ON_LOG_WARNING, false, \
        "Break each time a warning is logged") \
debug(bool, BREAK_ON_LOG_ERROR, false, \
        "Break each time an error is logged") \
debug(uint, DEBUG_GPU_FLAGS, 0, \
        "The debug options for GPU device") \
debug(uint, GPU_MAX_COMMAND_QUEUES, 70, \
        "The maximum number of concurrent Virtual GPUs") \
release(size_t, CQ_THREAD_STACK_SIZE, 256*Ki, /* @todo: that much! */ \
        "The default command queue thread stack size") \
release(size_t, CPU_WORKER_THREAD_STACK_SIZE, 64*Ki, \
        "The default CPU worker thread stack size") \
release(int, CPU_MAX_COMPUTE_UNITS, -1, \
        "Override the number of computation units per CPU device") \
release(int, GPU_MAX_WORKGROUP_SIZE, 0, \
        "Maximum number of workitems in a workgroup for GPU, 0 -use default") \
release(int, GPU_MAX_WORKGROUP_SIZE_2D_X, 0, \
        "Maximum number of workitems in a 2D workgroup for GPU, x component, 0 -use default") \
release(int, GPU_MAX_WORKGROUP_SIZE_2D_Y, 0, \
        "Maximum number of workitems in a 2D workgroup for GPU, y component, 0 -use default") \
release(int, GPU_MAX_WORKGROUP_SIZE_3D_X, 0, \
        "Maximum number of workitems in a 3D workgroup for GPU, x component, 0 -use default") \
release(int, GPU_MAX_WORKGROUP_SIZE_3D_Y, 0, \
        "Maximum number of workitems in a 3D workgroup for GPU, y component, 0 -use default") \
release(int, GPU_MAX_WORKGROUP_SIZE_3D_Z, 0, \
        "Maximum number of workitems in a 3D workgroup for GPU, z component, 0 -use default") \
release(int, CPU_MAX_WORKGROUP_SIZE, 1024, \
        "Maximum number of workitems in a workgroup for CPU") \
debug(bool, CPU_MEMORY_GUARD_PAGES, false, \
        "Use guard pages for CPU memory") \
debug(size_t, CPU_MEMORY_GUARD_PAGE_SIZE, 64, \
        "Size in KB of CPU memory guard page") \
debug(size_t, CPU_MEMORY_ALIGNMENT_SIZE, 256, \
        "Size in bytes for the default alignment for guarded memory on CPU") \
debug(size_t, PARAMETERS_MIN_ALIGNMENT, 16, \
        "Minimum alignment required for the abstract parameters stack") \
debug(size_t, MEMOBJ_BASE_ADDR_ALIGN, 4*Ki, \
        "Alignment of the base address of any allocate memory object") \
release(cstring, GPU_DEVICE_NAME, "", \
        "Select the device ordinal (will only report a single device)") \
release(cstring, GPU_DEVICE_ORDINAL, "", \
        "Select the device ordinal (comma separated list of available devices)") \
release(bool, REMOTE_ALLOC, false, \
        "Use remote memory for the global heap allocation") \
release(int, GPU_INITIAL_HEAP_SIZE, 16, \
        "Initial size of the GPU heap in MiB") \
release(uint, GPU_MAX_HEAP_SIZE, 100, \
        "Set maximum size of the GPU heap to % of board memory") \
release(int, GPU_HEAP_GROWTH_INCREMENT, 8, \
        "Amount to grow the GPU heap by in MiB") \
release(uint, GPU_STAGING_BUFFER_SIZE, 512, \
        "Size of the GPU staging buffer in KiB") \
release(bool, GPU_DUMP_BLIT_KERNELS, false, \
        "Dump the kernels for blit manager") \
release(uint, GPU_BLIT_ENGINE_TYPE, 0x0, \
        "Blit engine type: 0 - Default, 1 - Host, 2 - CAL, 3 - Kernel") \
release(bool, GPU_FLUSH_ON_EXECUTION, false, \
        "Submit commands to HW on every operation. 0 - Disable, 1 - Enable") \
release(bool, GPU_USE_SYNC_OBJECTS, true, \
        "If enabled, use sync objects instead of polling") \
release(bool, ENABLE_CAL_SHUTDOWN, false, \
        "Enable explicit CAL shutdown (for PM4 capture)") \
release(bool, CL_KHR_FP64, true, \
        "Enable/Disable support for double precision") \
release(uint, GPU_OPEN_VIDEO, 0, \
        "Non-zero value allows to report Open Video extension on GPU") \
release(cstring, AMD_OCL_BUILD_OPTIONS, 0, \
        "Set clBuildProgram() and clCompileProgram()'s options (override)") \
release(cstring, AMD_OCL_BUILD_OPTIONS_APPEND, 0, \
        "Append clBuildProgram() and clCompileProgram()'s options") \
release(cstring, AMD_OCL_LINK_OPTIONS, 0, \
        "Set clLinkProgram()'s options (override)") \
release(cstring, AMD_OCL_LINK_OPTIONS_APPEND, 0, \
        "Append clLinkProgram()'s options") \
release(cstring, AMD_OCL_SC_LIB, 0, \
        "Set shader compiler shared library name or path") \
debug(bool, AMD_OCL_SUPPRESS_MESSAGE_BOX, false, \
        "Suppress the error dialog on Windows") \
debug(bool, OCL_STRESS_BINARY_IMAGE, false, \
        "Exercise the binary image producer and consumer") \
release(cstring, GPU_PRE_RA_SCHED, "default", \
        "Allows setting of alternate pre-RA-sched") \
release(size_t, GPU_PINNED_XFER_SIZE, 16, \
        "The pinned buffer size for pinning in read/write transfers") \
release(size_t, GPU_PINNED_MIN_XFER_SIZE, 512, \
        "The minimal buffer size for pinned read/write transfers in KBytes") \
release(size_t, GPU_RESOURCE_CACHE_SIZE, 64, \
        "The resource cache size in MB") \
release(uint, GPU_ASYNC_MEM_COPY, 0, \
        "Enables async memory transfers with DRM engine") \
release(bool, GPU_FORCE_64BIT_PTR, 0, \
        "Forces 64 bit pointers on GPU") \
release(bool, GPU_FORCE_OCL20_32BIT, 0, \
        "Forces 32 bit apps to take CLANG/HSAIL path") \
release(bool, GPU_RAW_TIMESTAMP, 0, \
        "Reports GPU raw timestamps in GPU timeline") \
release(bool, CPU_IMAGE_SUPPORT, true, \
        "Turn on image support on the CPU device") \
release(bool, GPU_PARTIAL_DISPATCH, true, \
        "Enables partial dispatch on GPU") \
release(size_t, GPU_NUM_MEM_DEPENDENCY, 256, \
        "Number of memory objects for dependency tracking") \
release(size_t, GPU_XFER_BUFFER_SIZE, 0, \
        "Transfer buffer size for image copy optimization in KB") \
release(bool, GPU_IMAGE_DMA, true, \
        "Enable DRM DMA for image transfers") \
release(uint, CPU_MAX_ALLOC_PERCENT, 25, \
        "Maximum size of a single allocation as percentage of total") \
release(uint, GPU_SINGLE_ALLOC_PERCENT, 75, \
        "Maximum size of a single allocation as percentage of total") \
release(uint, GPU_NUM_COMPUTE_RINGS, 2, \
        "GPU number of compute rings. 0 - disabled, 1 , 2,.. - the number of compute rings") \
release_on_stg(bool, C1X_ATOMICS, !IS_MAINLINE, \
        "Runtime will report c1x atomics support") \
release(uint, GPU_WORKLOAD_SPLIT, 22, \
        "Workload split size") \
release(bool, GPU_USE_SINGLE_SCRATCH, false, \
        "Use single scratch buffer per device instead of per HW ring") \
release_on_stg(cstring, GPU_TARGET_INFO_ARCH, "amdil", \
        "Select the GPU TargetInfo arch (amdil|hsail)") \
release(bool, HSA_RUNTIME, 0, \
        "1 = Enable HSA Runtime, any other value or absence disables it.") \
release(bool, AMD_OCL_WAIT_COMMAND, false, \
        "1 = Enable a wait for every submitted command") \
debug(bool, AMD_OCL_DEBUG_LINKER, false, \
        "Enable debug output in linker") \
debug(bool, GPU_SPLIT_LIB, true, \
        "Enable splitting GPU 32/64 bit library") \
release(bool, GPU_STAGING_WRITE_PERSISTENT, false, \
        "Enable Persistent writes") \
release(bool, DRMDMA_FOR_LNX_CF, false, \
        "Enable DRMDMA for Linux CrossFire") \
release(bool, GPU_HSAIL_ENABLE, false, \
        "Enable HSAIL on dGPU stack (requires CI+ HW)") \
release(bool, GPU_ASSUME_ALIASES, false, \
        "Assume memory aliases in the compilation process") \
release(uint, GPU_PRINT_CHILD_KERNEL, 0, \
        "Prints the specified number of the child kernels") \
release(bool, GPU_DIRECT_SRD, false, \
        "Use indirect SRD access in HSAIL") \
release(bool, GPU_USE_DEVICE_QUEUE, false, \
        "Use a dedicated device queue for the actual submissions") \
release(bool, GPU_ENABLE_LARGE_ALLOCATION, false, \
        "Enable >4GB single allocations") \
release(bool, AMD_THREAD_TRACE_ENABLE, false, \
        "Enable thread trace extension") \
release(uint, OPENCL_VERSION, 200, \
        "Force GPU opencl version") \
release(uint, CPU_OPENCL_VERSION, 120, \
        "Force CPU opencl version") \
release(bool, ENVVAR_HSA_POLL_KERNEL_COMPLETION, false, \
        "Determines if Hsa runtime should use polling scheme") \
release(bool, HSA_LOCAL_MEMORY_ENABLE, false, \
        "Enable HSA device local memory usage") \
release(bool, HSA_ENABLE_ATOMICS_32B, false, \
        "1 = Enable SVM atomics in 32 bits (HSA backend-only). Any other value keeps them disabled.") \
release(bool, GPU_IFH_MODE, false, \
        "1 = Enable GPU IFH (infinitely fast hardware) mode. Any other value keeps setting disabled.") \
release(bool, HSAIL_IMAGE_HANDLE_ENABLE, false, \
        "Pass image/sampler SRD as pointer instead of blob") \
debug(bool, GPU_FORCE_SINGLE_FP_DENORM, false, \
        "Forces reporting CL_FP_DENORM bit for single precision") \
debug(bool, OCL_FORCE_CPU_SVM, false, \
        "force svm support for CPU")
|
|
|
|
|
|
|
|
namespace amd {
|
|
|
|
//! \addtogroup Utils
|
|
// @{
|
|
|
|
struct Flag
|
|
{
|
|
enum Type
|
|
{
|
|
Tinvalid = 0,
|
|
Tbool, //!< A boolean type flag (true, false).
|
|
Tint, //!< An integer type flag (signed).
|
|
Tuint, //!< An integer type flag (unsigned).
|
|
Tsize_t, //!< A size_t type flag.
|
|
Tcstring //!< A string type flag.
|
|
};
|
|
|
|
#define DEFINE_FLAG_NAME(type, name, value, help) k##name,
|
|
enum Name
|
|
{
|
|
RUNTIME_FLAGS(DEFINE_FLAG_NAME, DEFINE_FLAG_NAME, DEFINE_FLAG_NAME)
|
|
numFlags_
|
|
};
|
|
#undef DEFINE_FLAG_NAME
|
|
|
|
#define CAN_SET(type, name, v, h) static const bool cannotSet##name = false;
|
|
#define CANNOT_SET(type, name, v, h) static const bool cannotSet##name = true;
|
|
|
|
#ifdef DEBUG
|
|
RUNTIME_FLAGS(CAN_SET, CAN_SET, CAN_SET)
|
|
#else // !DEBUG
|
|
RUNTIME_FLAGS(CANNOT_SET, CAN_SET, CANNOT_SET)
|
|
#endif // !DEBUG
|
|
|
|
#undef CAN_SET
|
|
#undef CANNOT_SET
|
|
|
|
private:
|
|
|
|
static Flag flags_[];
|
|
|
|
public:
|
|
static char* envstr_;
|
|
const char* name_;
|
|
const void* value_;
|
|
Type type_;
|
|
bool isDefault_;
|
|
|
|
public:
|
|
|
|
static bool init();
|
|
|
|
static void tearDown();
|
|
|
|
bool setValue(const char* value);
|
|
|
|
static bool isDefault(Name name) { return flags_[name].isDefault_; }
|
|
};
|
|
|
|
#define flagIsDefault(name) \
|
|
(amd::Flag::cannotSet##name || amd::Flag::isDefault(amd::Flag::k##name))
|
|
|
|
// @}
|
|
|
|
} // namespace amd
|
|
|
|
#ifdef _WIN32
|
|
# define EXPORT_FLAG extern "C" __declspec(dllexport)
|
|
#else // !_WIN32
|
|
# define EXPORT_FLAG extern "C"
|
|
#endif // !_WIN32
|
|
|
|
#define DECLARE_RELEASE_FLAG(type, name, value, help) EXPORT_FLAG type name;
|
|
#ifdef DEBUG
|
|
# define DECLARE_DEBUG_FLAG(type, name, value, help) EXPORT_FLAG type name;
|
|
#else // !DEBUG
|
|
# define DECLARE_DEBUG_FLAG(type, name, value, help) const type name = value;
|
|
#endif // !DEBUG
|
|
|
|
RUNTIME_FLAGS(DECLARE_DEBUG_FLAG, DECLARE_RELEASE_FLAG, DECLARE_DEBUG_FLAG);
|
|
|
|
#undef DECLARE_DEBUG_FLAG
|
|
#undef DECLARE_RELEASE_FLAG
|
|
|
|
#endif /*FLAGS_HPP_*/
|