Files
rocm-systems/rocclr/runtime/device/cpu/cpudevice.cpp
T
foreman 45fd651d2f P4 to Git Change 1138941 by emankov@em-hsa-amd on 2015/04/09 08:28:04
ECR #333753 - ORCA RT: aclCompilerInit usage fix

	Passing pointers to CRT memory management functions to a dynamically loaded library is unsafe and may lead to memory corruption. However, the pointers are not actually passed.
	They are cut off by the struct size, sizeof(aclCompilerOptions_0_8), and hence are never called.

	P.S.
	If it were aclCompilerOptions_0_8_1, the pointers would be taken into account and heap corruption would occur. So the change is intended to eliminate a possible misunderstanding about the usage of the malloc & free functions, and doesn't change the behaviour of the aclCompilerInit function at all.

	Testing: pre check-in

	Reviewers: German Andreev, Stanislav Mekhanoshin

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpudevice.cpp#271 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#507 edit
2015-04-09 08:41:37 -04:00

1160 righe
34 KiB
C++

//
// Copyright 2011 Advanced Micro Devices, Inc. All rights reserved.
//
#include "device/cpu/cpudevice.hpp"
#include "device/cpu/cpuprogram.hpp"
#include "utils/versions.hpp"
#include "amdocl/cl_common.hpp"
#include <string>
#include <iostream>
#include <fstream>
#include <sstream>
#if defined(__linux__)
#if !defined(ATI_ARCH_ARM)
#include <sys/sysinfo.h>
#endif // ATI_ARCH_ARM
#include <unistd.h>
#endif
#if defined(_WIN32)
# include <windows.h>
# include <intrin.h>
extern BOOL (WINAPI *pfnGetNumaNodeProcessorMaskEx)(USHORT,PGROUP_AFFINITY);
#endif // _WIN32
namespace cpu {
// Compiler handle shared by all CPU devices; assigned from aclCompilerInit()
// in Device::init() and handed to aclCompilerFini() in Device::tearDown().
aclCompiler* Device::compiler_;
// Worker-thread limit; starts as the all-ones value and is recomputed in
// Device::init() from the amount of virtual memory.
size_t Device::maxWorkerThreads_ = static_cast<size_t>(-1);
// Releases the per-device mask object. With NUMA support on Linux a device
// that reports a NUMA mask frees numaMask_; every other device frees
// workerThreadsAffinity_. Deleting a NULL pointer is a no-op, so no explicit
// NULL tests are needed before the delete expressions.
Device::~Device()
{
#if defined(__linux__) && defined(NUMA_SUPPORT)
    if (getNumaMask() != NULL) {
        delete numaMask_;
        return;
    }
#endif
    delete workerThreadsAffinity_;
}
// Shuts down the compiler library handle created by Device::init().
// The handle is reset to NULL afterwards so that a repeated tearDown() (or
// any later access to compiler_) never touches an already finalized handle.
void
Device::tearDown()
{
    if (compiler_ != NULL) {
        aclCompilerFini(compiler_);
        compiler_ = NULL;
    }
}
// Initializes the compiler library, probes the host CPU and registers a
// single CPU device described by a freshly filled device::Info.
//
// Returns false when:
//   - the CPU device is disabled (CPU_MAX_COMPUTE_UNITS == 0),
//   - aclCompilerInit() reports an error,
//   - the system memory query fails (sysinfo / GlobalMemoryStatusEx),
//   - the Device object cannot be created.
// Returns true after the device has been registered.
bool
Device::init()
{
    // Allow disabling of the CPU device
    if (CPU_MAX_COMPUTE_UNITS == 0)
        return false;

    // The compiler library name can be overridden through the environment.
    const char *library = getenv("COMPILER_LIBRARY");
    aclCompilerOptions opts = {
        sizeof(aclCompilerOptions_0_8),
        library ? library : LINUX_ONLY("lib") "amdocl12cl" \
            LP64_SWITCH(LINUX_SWITCH("32",""),"64") LINUX_SWITCH(".so",".dll"),
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        NULL
    };
    acl_error error;
    compiler_ = aclCompilerInit(&opts, &error);
    if (error != ACL_SUCCESS) {
        LogError("Error initializing the compiler");
        return false;
    }

    // Start from an all-zero info structure; several string fields below rely
    // on this zero fill for their terminating NUL.
    device::Info info;
    ::memset(&info, '\0', sizeof(info));
    info.type_ = CL_DEVICE_TYPE_CPU;
    info.vendorId_ = 0x1002;

    // Compute units: all processors unless the flag requests a valid subset.
    int systemProcessorCount = amd::Os::processorCount();
    info.maxComputeUnits_ = systemProcessorCount;
    if (!flagIsDefault(CPU_MAX_COMPUTE_UNITS)) {
        if ((CPU_MAX_COMPUTE_UNITS <= 0) || (CPU_MAX_COMPUTE_UNITS > systemProcessorCount))
            info.maxComputeUnits_ = systemProcessorCount;
        else
            info.maxComputeUnits_ = CPU_MAX_COMPUTE_UNITS;
    }

    info.maxWorkItemDimensions_ = 3;
    info.maxWorkGroupSize_ = CPU_MAX_WORKGROUP_SIZE;
    info.maxWorkItemSizes_[0] = info.maxWorkGroupSize_;
    info.maxWorkItemSizes_[1] = info.maxWorkGroupSize_;
    info.maxWorkItemSizes_[2] = info.maxWorkGroupSize_;
    info.addressBits_ = LP64_SWITCH(32,64);

    if (CPU_IMAGE_SUPPORT) {
        info.imageSupport_ = CL_TRUE;
        info.maxReadImageArgs_ = MaxReadImage;
        info.maxWriteImageArgs_ = MaxWriteImage;
        info.image2DMaxWidth_ = 8 * Ki;
        info.image2DMaxHeight_ = 8 * Ki;
        info.image3DMaxWidth_ = 2 * Ki;
        info.image3DMaxHeight_ = 2 * Ki;
        info.image3DMaxDepth_ = 2 * Ki;
        info.maxSamplers_ = MaxSamplers;
        // OpenCL 1.2 device info fields
        info.imageMaxBufferSize_ = 64 * Ki;
        info.imageMaxArraySize_ = 2 * Ki;
        info.imagePitchAlignment_ = 0;
        info.imageBaseAddressAlignment_ = 0;
        info.bufferFromImageSupport_ = CL_FALSE;
    }

    info.maxParameterSize_ = 4*Ki;
    info.memBaseAddrAlign_ = 8 * (flagIsDefault(MEMOBJ_BASE_ADDR_ALIGN) ?
        sizeof(cl_long16) : MEMOBJ_BASE_ADDR_ALIGN);
    info.minDataTypeAlignSize_ = sizeof(cl_long16);

    // Double precision gets the base capability set; single precision
    // additionally reports correctly rounded divide/sqrt.
    info.singleFPConfig_ =
        CL_FP_DENORM | CL_FP_INF_NAN |
        CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO |
        CL_FP_ROUND_TO_INF | CL_FP_FMA;
    info.doubleFPConfig_ = info.singleFPConfig_;
    info.singleFPConfig_ |= CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT;

    info.affinityDomain_.value_ = 0;
    info.affinityDomain_.next_ = 1;
    info.globalMemCacheType_ = CL_READ_WRITE_CACHE;

#if defined(__linux__)
    // sysconf() returns -1 when a value is not available; record 0 in that
    // case instead of storing the negative result in the info fields (a
    // wrapped cache size would otherwise inflate localMemSize_ below).
    const long l1LineSize = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
    const long l1CacheSize = sysconf(_SC_LEVEL1_DCACHE_SIZE);
    info.globalMemCacheLineSize_ = (l1LineSize > 0) ? l1LineSize : 0;
    info.globalMemCacheSize_ = (l1CacheSize > 0) ? l1CacheSize : 0;
    info.affinityDomain_.cacheL1_ = 1;
    if (sysconf(_SC_LEVEL2_CACHE_SIZE) > 0) {
        info.affinityDomain_.cacheL2_ = 1;
    }
    if (sysconf(_SC_LEVEL3_CACHE_SIZE) > 0) {
        info.affinityDomain_.cacheL3_ = 1;
    }
    if (sysconf(_SC_LEVEL4_CACHE_SIZE) > 0) {
        info.affinityDomain_.cacheL4_ = 1;
    }
#if defined(NUMA_SUPPORT)
    // Fixed: the original used "=>", which is not a C++ operator.
    if (numa_available() != -1 && numa_max_node() >= 0) {
        info.affinityDomain_.numa_ = 1;
    }
#endif
#else // win32
    // First call only queries the required buffer length.
    DWORD length = 0;
    ::GetLogicalProcessorInformation(NULL, &length);
    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer =
        (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION) malloc(length);
    if (buffer != NULL && ::GetLogicalProcessorInformation(buffer, &length)) {
        bool found = false;
        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr, limit =
            &buffer[length / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION)];
        for (ptr = buffer; ptr < limit; ++ptr) {
            PCACHE_DESCRIPTOR cache = &ptr->Cache;
            if (ptr->Relationship == RelationCache && cache->Type != CacheInstruction) {
                // Map the cache level onto the matching affinity-domain bit.
                info.affinityDomain_.value_ |=
                    (device::AffinityDomain::AFFINITY_DOMAIN_L1_CACHE << 1) >>
                    cache->Level;
                // The first level-1 data/unified cache describes the
                // "global memory cache" of the device.
                if (!found && cache->Level == 1) {
                    info.globalMemCacheLineSize_ = cache->LineSize;
                    info.globalMemCacheSize_ = cache->Size;
                    found = true;
                }
            }
        }
    }
    free(buffer);
    ULONG highestNuma = 0;
    if (::GetNumaHighestNodeNumber(&highestNuma) && highestNuma != 0) {
        info.affinityDomain_.numa_ = 1;
    }
#endif

    uintptr_t virtualMemSize;
#if defined(__linux__)
#if !defined(ATI_ARCH_ARM)
    struct sysinfo si;
    if (sysinfo(&si) != 0) {
        return false;
    }
    if (si.mem_unit == 0) {
        // Linux kernels prior to 2.3.23 return sizes in bytes.
        si.mem_unit = 1;
    }
    info.globalMemSize_ = (cl_ulong) si.totalram * si.mem_unit;
#else
    info.globalMemSize_ = 0;
#endif
    virtualMemSize = (uintptr_t) info.globalMemSize_;
#else
    MEMORYSTATUSEX statex;
    statex.dwLength = sizeof (statex);
    if (GlobalMemoryStatusEx (&statex) == 0) {
        return false;
    }
    info.globalMemSize_ = (cl_ulong) statex.ullTotalPhys;
    virtualMemSize =
        (uintptr_t) std::min(statex.ullTotalPageFile, statex.ullTotalVirtual);
#endif

    // Worker threads are limited to 70% of what the virtual memory can hold,
    // counting each thread's stack plus its private memory.
    maxWorkerThreads_ = (size_t) (virtualMemSize /
        (uintptr_t) ((CPU_WORKER_THREAD_STACK_SIZE +
            CLK_PRIVATE_MEMORY_SIZE * (CPU_MAX_WORKGROUP_SIZE + 1))) *
        7 / 10);

#if defined(_LP64)
    // Cap at 8TiB for 64-bit
    const cl_ulong maxGlobalMemSize = 8ULL*Ki*Gi;
#elif defined(_WIN32)
    // Cap at 2GiB (see http://msdn.microsoft.com/en-us/library/aa366778.aspx)
    const cl_ulong maxGlobalMemSize = 2ULL*Gi;
#else // linux
    // Cap at 3.5GiB
    const cl_ulong maxGlobalMemSize = 3584ULL*Mi;
#endif
    info.globalMemSize_ = std::min(info.globalMemSize_, maxGlobalMemSize);
    info.maxMemAllocSize_ = info.globalMemSize_ * CPU_MAX_ALLOC_PERCENT / 100;
    if (flagIsDefault(CPU_MAX_ALLOC_PERCENT)) {
        // With the default percentage, guarantee a minimum allocation size
        // (bounded by the global memory size itself).
        const cl_ulong minAllocSize = LP64_SWITCH(1ULL*Gi, 2ULL*Gi);
        info.maxMemAllocSize_ = std::max(info.maxMemAllocSize_,
            std::min(info.globalMemSize_, minAllocSize));
    }

    info.maxConstantBufferSize_ = 64*Ki;
    info.maxConstantArgs_ = 8;
    info.localMemType_ = CL_GLOBAL;
    info.localMemSize_ = std::max((cl_ulong)32*Ki, info.globalMemCacheSize_/2);
    info.errorCorrectionSupport_ = CL_FALSE;
    info.hostUnifiedMemory_ = CL_TRUE;
    info.profilingTimerResolution_ = (size_t)amd::Os::timerResolutionNanos();
    info.profilingTimerOffset_ = amd::Os::offsetToEpochNanos();
    info.littleEndian_ = CL_TRUE;
    info.available_ = CL_TRUE;
    info.compilerAvailable_ = CL_TRUE;
    info.linkerAvailable_ = CL_TRUE;
    info.executionCapabilities_ = CL_EXEC_KERNEL | CL_EXEC_NATIVE_KERNEL;

    // Enable SVM only for OpenCL 2.0
    if (((OPENCL_MAJOR >= 2) && (CPU_OPENCL_VERSION >= 200)) || OCL_FORCE_CPU_SVM) {
        info.svmCapabilities_ = CL_DEVICE_SVM_COARSE_GRAIN_BUFFER |
            CL_DEVICE_SVM_FINE_GRAIN_BUFFER |
            CL_DEVICE_SVM_FINE_GRAIN_SYSTEM |
            CL_DEVICE_SVM_ATOMICS;
    }
    info.preferredPlatformAtomicAlignment_ = 0;
    info.preferredGlobalAtomicAlignment_ = 0;
    info.preferredLocalAtomicAlignment_ = 0;
    info.queueProperties_ = CL_QUEUE_PROFILING_ENABLE;
    info.platform_ = AMD_PLATFORM;

#if defined(__linux__)
    // Vendor, model name and clock frequency come from /proc/cpuinfo; the
    // scan stops as soon as all three have been found.
    std::ifstream ifs("/proc/cpuinfo", std::ios::in);
    if (ifs.is_open()) {
        std::string line;
        bool vendor = false;
        bool name = false;
        bool freq = false;
        while (std::getline(ifs, line) && !(vendor && name && freq)) {
            if (!vendor && (line.find("vendor_id\t: ")
                    != std::string::npos)) {
                // Bounded copy: info was zero-filled above, so copying at
                // most size-1 bytes always leaves a terminating NUL.
                ::strncpy(
                    info.vendor_,
                    line.substr(line.find_first_of(':') + 2).c_str(),
                    sizeof(info.vendor_) - 1);
                vendor = true;
            }
            else if (!name && (line.find("model name\t: ") != std::string::npos
                    || line.find("Processor\t: ") != std::string::npos)) {
                // Same bounded copy as for the vendor string.
                ::strncpy(
                    info.name_,
                    line.substr(line.find_first_of(':') + 2).c_str(),
                    sizeof(info.name_) - 1);
                name = true;
            }
            else if (!freq && (line.find("cpu MHz\t\t: ")
                    != std::string::npos)) {
                info.maxClockFrequency_ =
                    ::atoi(line.substr(line.find_first_of(':') + 2).c_str());
                freq = true;
            }
        }
        ifs.close();
    }
#elif defined(_WIN32)
    int CPUInfo[4] = {-1};
    unsigned nExIds, i;
    // cpuid with an InfoType argument of 0 returns the number of
    // valid Ids in CPUInfo[0] and the CPU identification string in
    // the other three array elements. The CPU identification string is
    // not in linear order. The code below arranges the information
    // in a human readable form.
    amd::Os::cpuid(CPUInfo, 0);
    memset(info.vendor_, 0, sizeof(info.vendor_));
    *((int*)(info.vendor_+0)) = CPUInfo[1];
    *((int*)(info.vendor_+4)) = CPUInfo[3];
    *((int*)(info.vendor_+8)) = CPUInfo[2];
    // Calling cpuid with 0x80000000 as the InfoType argument
    // gets the number of valid extended IDs.
    amd::Os::cpuid(CPUInfo, 0x80000000);
    nExIds = CPUInfo[0];
    memset(info.name_, 0, sizeof(info.name_));
    sprintf(info.name_, "Unknown Processor");
    // Get the information associated with each extended ID.
    for (i=0x80000000; i<=nExIds; ++i)
    {
        amd::Os::cpuid(CPUInfo, i);
        // Interpret CPU brand string and cache information.
        if (i == 0x80000002)
            memcpy(info.name_, CPUInfo, sizeof(CPUInfo));
        else if (i == 0x80000003)
            memcpy(info.name_ + 16, CPUInfo, sizeof(CPUInfo));
        else if (i == 0x80000004)
            memcpy(info.name_ + 32, CPUInfo, sizeof(CPUInfo));
    }
    info.maxClockFrequency_ = 0;
    HKEY hKey;
    // Open the key
    if (RegOpenKeyEx(
            HKEY_LOCAL_MACHINE,
            "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0\\",
            0, KEY_QUERY_VALUE, &hKey) == ERROR_SUCCESS) {
        // Read the value
        DWORD dwLen = 4;
        RegQueryValueEx(
            hKey, "~MHz", NULL, NULL,
            (LPBYTE)&info.maxClockFrequency_, &dwLen);
        // Cleanup
        RegCloseKey(hKey);
    }
#else
    ::strcpy(info.name_, "Unknown Processor");
    ::strcpy(info.vendor_, "Unknown Vendor");
    info.maxClockFrequency_ = 0;
#endif

#define OPENCL_VERSION_STR XSTR(OPENCL_MAJOR) "." XSTR(OPENCL_MINOR)
    info.profile_ = "FULL_PROFILE";
    if (CPU_OPENCL_VERSION < 200) {
        info.version_ = "OpenCL 1.2 " AMD_PLATFORM_INFO;
        info.oclcVersion_ = "OpenCL C 1.2 ";
    }
    else {
        info.version_ = "OpenCL " OPENCL_VERSION_STR " " AMD_PLATFORM_INFO;
        info.oclcVersion_ = "OpenCL C " OPENCL_VERSION_STR " ";
    }
    info.spirVersions_ = "1.2";
#if cl_amd_open_video
    info.openVideo_ = CL_FALSE;
#endif // cl_amd_open_video

    // Partitioning is only offered when there is more than one compute unit.
    info.partitionCreateInfo_.type_.value_ = 0;
    info.partitionProperties_.value_ = 0;
    if (info.maxComputeUnits_ > 1) {
        info.partitionProperties_.equally_ = 1;
        info.partitionProperties_.byCounts_ = 1;
        if (info.affinityDomain_.value_ != 0) {
            info.partitionProperties_.byAffinityDomain_ = 1;
        }
    }
    else {
        info.affinityDomain_.value_ = 0;
    }

    // Copy the name into the boardName data member for CPU implementation.
    // ::strncpy(info.boardName_, info.name_, sizeof(info.boardName_));
    memset(info.boardName_, 0, sizeof(info.boardName_));

    Device* device = new Device();
    if (device == NULL || !device->create()) {
        delete device;
        return false;
    }

    // driverVersion_ was zero-filled, so writing at most size-1 bytes keeps
    // the string terminated.
    ::snprintf(info.driverVersion_, sizeof(info.driverVersion_) - 1,
        "%s (%s%s%s)", AMD_BUILD_STRING,
#if defined(ATI_ARCH_X86)
        "sse2",
#else // !ATI_ARCH_X86
        "",
#endif // !ATI_ARCH_X86
        device->hasAVXInstructions() ? ",avx" : "",
        device->hasFMA4Instructions() ? ",fma4" : "");

    // These will need to change for AVX2
    info.preferredVectorWidthChar_ = 16;
    info.preferredVectorWidthShort_ = 8;
    info.preferredVectorWidthInt_ = 4;
    info.preferredVectorWidthLong_ = 2;
    if (device->hasAVXInstructions()) {
        info.preferredVectorWidthFloat_ = 8;
        info.preferredVectorWidthDouble_ = 4;
    } else {
        info.preferredVectorWidthFloat_ = 4;
        info.preferredVectorWidthDouble_ = 2;
    }
    info.preferredVectorWidthHalf_ = 0; // no half support

    // Same here, will need to change for AVX2
    info.nativeVectorWidthChar_ = 16;
    info.nativeVectorWidthShort_ = 8;
    info.nativeVectorWidthInt_ = 4;
    info.nativeVectorWidthLong_ = 2;
    if (device->hasAVXInstructions()) {
        info.nativeVectorWidthFloat_ = 8;
        info.nativeVectorWidthDouble_ = 4;
    } else {
        info.nativeVectorWidthFloat_ = 4;
        info.nativeVectorWidthDouble_ = 2;
    }
    info.nativeVectorWidthHalf_ = 0; // no half support

    // Find all supported device extensions
    info.extensions_ = device->getExtensionString();

    // OpenCL 1.2 device info fields
    info.builtInKernels_ = "";
    info.preferredInteropUserSync_ = true;
    info.printfBufferSize_ = 64*Ki;

    info.maxPipePacketSize_ = info.maxMemAllocSize_;
    info.maxPipeActiveReservations_ = 16;
    info.maxPipeArgs_ = 16;
    info.maxReadWriteImageArgs_ = MaxReadWriteImage;

    // Max size should not be bigger than 1.75 GB
    const cl_ulong maxSize = std::min(static_cast<cl_ulong>((Gi/4)*7),
        info.maxMemAllocSize_);
    info.maxGlobalVariableSize_ = static_cast<size_t>(maxSize);
    info.globalVariablePreferredTotalSize_ = static_cast<size_t>(maxSize);

    device->info_ = info;
    device->registerDevice();
    return true;
}
bool
Device::create()
{
// Create CPU settings
settings_ = new cpu::Settings();
cpu::Settings* cpuSettings = reinterpret_cast<cpu::Settings*>(settings_);
if ((cpuSettings == NULL) || !cpuSettings->create()) {
return false;
}
#if defined(ATI_ARCH_X86)
// Check that we have at least SSE2
if (settings().cpuFeatures_ == 0) {
return false;
}
#endif
return true;
}
// Configures this object as a sub-device.
//   info            - device info of the parent, copied into info_
//   maxComputeUnits - number of compute units assigned to this sub-device
//   create_info     - partition description recorded in the copied info
// Returns false only if an allocation yields NULL.
bool
Device::initSubDevice(
    device::Info& info,
    cl_uint maxComputeUnits,
    const device::CreateSubDevicesInfo& create_info)
{
    // Make sure an affinity mask object exists for this sub-device.
    if (workerThreadsAffinity_ == NULL) {
        workerThreadsAffinity_ = new amd::Os::ThreadAffinityMask;
        if (workerThreadsAffinity_ == NULL) {
            return false;
        }
    }

    info_ = info;
    info_.maxComputeUnits_ = maxComputeUnits;
    info_.partitionCreateInfo_ = create_info.p_;

    const bool partitionedByCounts =
        (create_info.p_.type_.value_ == device::PartitionType::BY_COUNTS);
    if (partitionedByCounts) {
        // Keep a private copy of the counts list in the stored info.
        cl_uint* const ownedCounts =
            new cl_uint[create_info.p_.byCounts_.listSize_];
        if (ownedCounts == NULL) {
            return false;
        }
        size_t idx = 0;
        while (idx < create_info.p_.byCounts_.listSize_) {
            ownedCounts[idx] = create_info.countsListAt(idx);
            ++idx;
        }
        info_.partitionCreateInfo_.byCounts_.countsList_ = ownedCounts;
    }

    // A single compute unit cannot be partitioned any further.
    if (maxComputeUnits == 1) {
        info_.partitionProperties_.value_ = 0;
        info_.affinityDomain_.value_ = 0;
    }
    return true;
}
// Marks numWorkerThreads cores in workerThreadsAffinity_.
//   threadsAffinityMask - NULL: consecutive core ids following baseCoreId are
//                         used; otherwise each core is obtained through
//                         getNextSet() on that mask.
//   baseCoreId          - in: last core id handed out so far;
//                         out: last core id assigned by this call.
void
Device::setWorkerThreadsAffinity(
    cl_uint numWorkerThreads,
    const amd::Os::ThreadAffinityMask* threadsAffinityMask,
    uint& baseCoreId)
{
    uint lastCore = baseCoreId;
    const bool unconstrained = (threadsAffinityMask == NULL);

    for (cl_uint remaining = numWorkerThreads; remaining > 0; --remaining) {
        if (unconstrained) {
            ++lastCore;
        }
        else {
            // Already has affinity, so filter accordingly
            lastCore = threadsAffinityMask->getNextSet(lastCore);
        }
        workerThreadsAffinity_->set(lastCore);
    }

    baseCoreId = lastCore;
}
// Dispatches a sub-device creation request to the partition routine that
// matches create_info.p_.type_. For affinity-domain partitioning the
// requested domain is first resolved/validated against the domains this
// device supports. Returns the status of the selected routine, or
// CL_INVALID_VALUE for an unknown partition type.
cl_int
Device::createSubDevices(
    device::CreateSubDevicesInfo& create_info,
    cl_uint num_entries,
    cl_device_id* devices,
    cl_uint* num_devices)
{
    switch (create_info.p_.type_.value_) {
    case device::PartitionType::EQUALLY:
        return partitionEqually(
            create_info, num_entries, devices, num_devices);
    case device::PartitionType::BY_COUNTS:
        return partitionByCounts(
            create_info, num_entries, devices, num_devices);
    case device::PartitionType::BY_AFFINITY_DOMAIN:
        break; // handled below
    default:
        return CL_INVALID_VALUE;
    }

    // Affinity-domain partitioning: the device must expose some domain.
    if (info_.affinityDomain_.value_ == 0) {
        return CL_DEVICE_PARTITION_FAILED;
    }

    if (!create_info.p_.byAffinityDomain_.next_) {
        // An explicit domain was requested; it has to be a supported one.
        if ((create_info.p_.byAffinityDomain_.value_ &
                info_.affinityDomain_.value_) == 0) {
            return CL_INVALID_VALUE;
        }
    }
    else {
        // "Next partitionable" request: replace it with the domain given by
        // the least significant bit set in the supported-domain mask.
        create_info.p_.byAffinityDomain_.next_ = 0;
        create_info.p_.byAffinityDomain_.value_ =
            (1 << amd::leastBitSet(info_.affinityDomain_.value_));
    }

    return create_info.p_.byAffinityDomain_.numa_
        ? partitionByAffinityDomainNUMA(
            create_info, num_entries, devices, num_devices)
        : partitionByAffinityDomainCacheLevel(
            create_info, num_entries, devices, num_devices);
}
// Splits this device into sub-devices of equal size.
//   create_info - carries the requested compute units per sub-device
//   num_entries - capacity of the devices array
//   devices     - optional output array receiving the new sub-devices
//   num_devices - optional output receiving the number of sub-devices
// Returns CL_INVALID_VALUE for a zero unit count or a too small devices
// array, CL_DEVICE_PARTITION_FAILED if not even one sub-device fits,
// CL_OUT_OF_HOST_MEMORY if a sub-device cannot be set up, else CL_SUCCESS.
cl_int
Device::partitionEqually(
    const device::CreateSubDevicesInfo& create_info,
    cl_uint num_entries,
    cl_device_id* devices,
    cl_uint* num_devices)
{
    const cl_uint unitsPerDevice =
        (cl_uint)create_info.p_.equally_.numComputeUnits_;
    if (unitsPerDevice == 0) {
        return CL_INVALID_VALUE;
    }

    const cl_uint deviceCount = info_.maxComputeUnits_ / unitsPerDevice;
    if (deviceCount == 0) {
        return CL_DEVICE_PARTITION_FAILED;
    }

    if (num_devices != NULL) {
        *num_devices = deviceCount;
    }

    // Query-only call: nothing to create.
    if (devices == NULL) {
        return CL_SUCCESS;
    }
    if (num_entries < deviceCount) {
        return CL_INVALID_VALUE;
    }

    // Running core id shared by all sub-devices; starts "before" core 0.
    uint coreId = (uint)-1;
    for (cl_uint created = 0; created < deviceCount; ++created) {
        Device* subDevice = new Device(this);
        if (subDevice == NULL) {
            return CL_OUT_OF_HOST_MEMORY;
        }

        const bool ready = subDevice->create() &&
            subDevice->initSubDevice(info_, unitsPerDevice, create_info);
        if (!ready) {
            subDevice->release();
            return CL_OUT_OF_HOST_MEMORY;
        }

        subDevice->setWorkerThreadsAffinity(
            unitsPerDevice, workerThreadsAffinity_, coreId);
        devices[created] = as_cl(static_cast<amd::Device*>(subDevice));
    }
    return CL_SUCCESS;
}
// Splits this device into sub-devices whose sizes are given by a counts list.
//   create_info - carries the counts list (one entry per sub-device)
//   num_entries - capacity of the devices array
//   devices     - optional output array receiving the new sub-devices
//   num_devices - optional output receiving the number of sub-devices
// Returns CL_INVALID_DEVICE_PARTITION_COUNT for an empty list or when the
// counts add up to more compute units than this device has,
// CL_INVALID_VALUE for a too small devices array, CL_OUT_OF_HOST_MEMORY if a
// sub-device cannot be set up, else CL_SUCCESS.
cl_int
Device::partitionByCounts(
    const device::CreateSubDevicesInfo& create_info,
    cl_uint num_entries,
    cl_device_id* devices,
    cl_uint* num_devices)
{
    cl_uint maxComputeUnits = 0;
    cl_uint numSubDevices = (cl_uint)create_info.p_.byCounts_.listSize_;

    // Sum every entry of the list. Valid indices are 0 .. listSize_-1; the
    // previous loop read indices listSize_ .. 1, i.e. one element past the
    // end while skipping element 0.
    for (size_t i = 0; i < (size_t)numSubDevices; ++i) {
        maxComputeUnits += create_info.countsListAt(i);
    }

    if (numSubDevices == 0 || maxComputeUnits > info_.maxComputeUnits_) {
        return CL_INVALID_DEVICE_PARTITION_COUNT;
    }

    if (num_devices != NULL) {
        *num_devices = numSubDevices;
    }

    if (devices != NULL) {
        if (num_entries < numSubDevices) {
            return CL_INVALID_VALUE;
        }

        // Running core id shared by all sub-devices; starts "before" core 0.
        uint coreId = (uint)-1;
        // NOTE(review): the list is walked from its last entry to its first,
        // so devices[0] is built from the last count - confirm this ordering
        // is intended.
        while (numSubDevices-- > 0) {
            Device* device = new Device(this);
            if (device == NULL) {
                return CL_OUT_OF_HOST_MEMORY;
            }

            cl_uint subComputeUnits =
                create_info.countsListAt((size_t)numSubDevices);

            if (!device->create() ||
                !device->initSubDevice(info_, subComputeUnits, create_info)) {
                device->release();
                return CL_OUT_OF_HOST_MEMORY;
            }

            device->setWorkerThreadsAffinity(
                subComputeUnits, workerThreadsAffinity_, coreId);

            *devices++ = as_cl(static_cast<amd::Device*>(device));
        }
    }
    return CL_SUCCESS;
}
// Creates one sub-device per NUMA node.
//   create_info - partition description forwarded to each sub-device
//   num_entries - capacity of the devices array
//   devices     - optional output array receiving the new sub-devices
//   num_devices - optional output receiving the number of sub-devices
// Returns CL_INVALID_VALUE when NUMA information is unavailable, when no
// node yields a usable processor set, or when the devices array is too
// small; CL_OUT_OF_HOST_MEMORY if a sub-device cannot be set up; otherwise
// CL_SUCCESS.
cl_int
Device::partitionByAffinityDomainNUMA(
    const device::CreateSubDevicesInfo& create_info,
    cl_uint num_entries,
    cl_device_id* devices,
    cl_uint* num_devices)
{
    cl_uint numSubDevices = 0;

#if defined(__linux__)
#if !defined(NUMA_SUPPORT)
    return CL_INVALID_VALUE;
#else
    int highestNuma = numa_max_node();
    if (highestNuma < 0) {
        return CL_INVALID_VALUE;
    }
    // Nodes are numbered 0 .. highestNuma, so there are highestNuma + 1 of
    // them (the loop below creates exactly that many sub-devices).
    numSubDevices = (cl_uint)highestNuma + 1;

    if (devices != NULL) {
        // Never write past the caller's array (same check as the other
        // partition routines).
        if (num_entries < numSubDevices) {
            return CL_INVALID_VALUE;
        }

        for (int node = 0; node <= highestNuma; ++node) {
            cl_uint subComputeUnits = 0;
            int len = 1;
            // Grow the CPU bitmap until numa_node_to_cpus() accepts it, then
            // count the CPUs that belong to the node.
            while (true) {
                ulong* cpus = (ulong*)alloca(sizeof(ulong)*len);
                if (numa_node_to_cpus(node, cpus, len * sizeof(ulong)) < 0) {
                    if (errno != ERANGE) {
                        return CL_INVALID_VALUE;
                    }
                    len *= 2;
                }
                else {
                    // Convert the length from words to bits.
                    len *= sizeof(ulong) * 8;
                    for (int i = 0; i < len; i++) {
                        if (test_bit(i, cpus)) {
                            ++subComputeUnits;
                        }
                    }
                    break;
                }
            }

            if (subComputeUnits == 0) {
                return CL_INVALID_VALUE;
            }

            Device* device = new Device(this);
            if (device == NULL) {
                return CL_OUT_OF_HOST_MEMORY;
            }

            if (!device->create() || NULL == (device->numaMask_ = new nodemask_t)) {
                device->release();
                return CL_OUT_OF_HOST_MEMORY;
            }

            if (!device->initSubDevice(
                    info_, subComputeUnits, create_info)) {
                delete device->numaMask_;
                device->numaMask_ = NULL;
                device->release();
                return CL_OUT_OF_HOST_MEMORY;
            }

            nodemask_zero(device->numaMask_);
            nodemask_set(device->numaMask_, node);

            // Need to remove this domain type
            device->info_.affinityDomain_.numa_ = 0;

            *devices++ = as_cl(static_cast<amd::Device*>(device));
        }
    }
#endif // NUMA_SUPPORT
#else // win32
    GROUP_AFFINITY numaNodeMask;
    ULONG highestNuma = 0;
    if (!::GetNumaHighestNodeNumber(&highestNuma)) {
        return CL_INVALID_VALUE;
    }

    for (ULONG node = 0; node <= highestNuma; ++node) {
        if (pfnGetNumaNodeProcessorMaskEx != NULL) {
            if (!pfnGetNumaNodeProcessorMaskEx((USHORT)node, &numaNodeMask)) {
                // Highest NUMA node number is not guaranteed to be the
                // number of nodes.
                continue;
            }
        }
        else {
            ULONGLONG tmpMask;
            if (!::GetNumaNodeProcessorMask((UCHAR)node, &tmpMask)) {
                // Highest NUMA node number is not guaranteed to be the
                // number of nodes.
                continue;
            }
            numaNodeMask.Group = 0;
            numaNodeMask.Mask = (KAFFINITY)tmpMask;
        }

        // Restrict the node mask by this device's own affinity, if any.
        if (workerThreadsAffinity_ != NULL) {
            workerThreadsAffinity_->adjust(0, numaNodeMask.Mask);
        }
        if (numaNodeMask.Mask == 0) {
            continue;
        }

        if (devices != NULL) {
            // Never write past the caller's array.
            if (numSubDevices >= num_entries) {
                return CL_INVALID_VALUE;
            }

            Device* device = new Device(this);
            if (device == NULL) {
                return CL_OUT_OF_HOST_MEMORY;
            }

            if (!device->create() || !device->initSubDevice(info_,
                    (cl_uint)amd::countBitsSet(numaNodeMask.Mask), create_info)) {
                device->release();
                return CL_OUT_OF_HOST_MEMORY;
            }

            device->workerThreadsAffinity_->set(
                numaNodeMask.Group, numaNodeMask.Mask);

            // Need to remove this domain type
            device->info_.affinityDomain_.numa_ = 0;

            *devices++ = as_cl(static_cast<amd::Device*>(device));
        }
        numSubDevices++;
    }
#endif // win32

    if (num_devices != NULL) {
        *num_devices = numSubDevices;
    }

    // Could not get a processor mask for any of the nodes
    if (numSubDevices == 0) {
        return CL_INVALID_VALUE;
    }
    return CL_SUCCESS;
}
#if defined(__linux__)
// Reads the beginning of a file into buf as a NUL-terminated string.
//   file    - path of the file to read
//   buf     - destination buffer; may alias file, because the path is only
//             used by open() before anything is written to buf
//   bufSize - capacity of buf in bytes, including the terminating NUL
// At most bufSize-1 bytes are read with a single read() call; longer files
// are truncated. Returns false if the buffer is unusable, the file cannot be
// opened, or nothing could be read.
//
// The reported file size is deliberately not consulted: clamping the read
// length to st_size dropped the last byte of files smaller than the buffer
// and rejected files that report a size of zero.
static bool
readFileString(const char* file, char* buf, size_t bufSize)
{
    if (buf == NULL || bufSize == 0) {
        return false;
    }

    int fd = open(file, O_RDONLY);
    if (fd < 0) {
        return false;
    }

    // Leave room for the terminator.
    ssize_t n = read(fd, buf, bufSize - 1);
    close(fd);

    if (n <= 0) {
        return false;
    }

    buf[n] = '\0';
    return true;
}
// Converts a comma separated list of hexadecimal groups, most significant
// group first (the text read from a "shared_cpu_map" file), into a CPU set.
//   cpuMap - NUL-terminated text; parsing of each group stops at the first
//            non-hex character
//   mask   - output set; cleared first, then filled 32 bits per group
// Groups are consumed from right to left so the last group lands in the
// lowest 32 bits. Parsing stops once the set is full, so groups beyond the
// capacity of cpu_set_t are ignored instead of being written past its end.
static void
parseSharedCpuMap(const char* cpuMap, cpu_set_t& mask)
{
    CPU_ZERO(&mask);

    uint32_t* bits = (uint32_t*)mask.__bits;
    uint32_t* const bitsEnd = bits + sizeof(mask.__bits) / sizeof(uint32_t);

    // s marks the end of the not yet parsed part of the text.
    const char* s = cpuMap + strlen(cpuMap);
    while (bits < bitsEnd) {
        // Find the start of the right-most unparsed group.
        s = (const char*)memrchr(cpuMap, ',', s - cpuMap);
        s = (s == NULL) ? cpuMap : s + 1;

        *bits++ = (uint32_t)strtoul(s, NULL, 16);
        if (s == cpuMap) {
            return; // the left-most group has been consumed
        }
        --s; // step back onto the separator
    }
}
#endif // linux
// Creates one sub-device per group of processors sharing a cache of the
// requested level (L1..L4, taken from create_info.p_.byAffinityDomain_).
//   create_info - partition description forwarded to each sub-device
//   num_entries - capacity of the devices array
//   devices     - optional output array receiving the new sub-devices
//   num_devices - optional output receiving the number of sub-devices
// Returns CL_INVALID_VALUE for an unknown domain, when the cache topology
// cannot be read, when no sub-device results, or when the devices array is
// too small; CL_OUT_OF_HOST_MEMORY if a sub-device cannot be set up;
// otherwise CL_SUCCESS.
cl_int
Device::partitionByAffinityDomainCacheLevel(
    const device::CreateSubDevicesInfo& create_info,
    cl_uint num_entries,
    cl_device_id* devices,
    cl_uint* num_devices)
{
    cl_uint cacheLevel = 0;
    switch (create_info.p_.byAffinityDomain_.value_) {
    case device::AffinityDomain::AFFINITY_DOMAIN_L4_CACHE:
        cacheLevel = 4;
        break;
    case device::AffinityDomain::AFFINITY_DOMAIN_L3_CACHE:
        cacheLevel = 3;
        break;
    case device::AffinityDomain::AFFINITY_DOMAIN_L2_CACHE:
        cacheLevel = 2;
        break;
    case device::AffinityDomain::AFFINITY_DOMAIN_L1_CACHE:
        cacheLevel = 1;
        break;
    default:
        return CL_INVALID_VALUE;
    }

    // Mask used to strip the consumed domain from each sub-device.
    const uint negAffinityDomain =
        ~create_info.p_.byAffinityDomain_.value_;

    cl_uint numSubDevices = 0;

#if defined(__linux__)
    // Processors still to be assigned: this device's own affinity if it has
    // one, otherwise the first maxComputeUnits_ processors.
    amd::Os::ThreadAffinityMask affinityMask;
    if (workerThreadsAffinity_ != NULL) {
        affinityMask = *workerThreadsAffinity_;
    }
    else {
        for (uint cpuId = 0; cpuId < (uint)info_.maxComputeUnits_; ++cpuId) {
            affinityMask.set(cpuId);
        }
    }

    amd::Os::ThreadAffinityMask currentMask;
    char buf[1024];
    for (uint cpuId = affinityMask.getFirstSet();
         cpuId != (uint)-1;
         cpuId = affinityMask.getNextSet(cpuId)) {
        // NOTE(review): the cache level is used directly as the sysfs
        // "index<N>" number - confirm that index numbers match cache levels
        // on the supported systems.
        ::snprintf(buf, sizeof(buf),
            "/sys/devices/system/cpu/cpu%u/cache/index%u/shared_cpu_map",
            cpuId, cacheLevel);
        // buf serves as both the path and the destination; the path is
        // consumed by open() before the buffer is overwritten.
        if (!readFileString(buf, buf, sizeof(buf))) {
            return CL_INVALID_VALUE;
        }
        parseSharedCpuMap(buf, currentMask.getNative());

        affinityMask.adjust(currentMask.getNative());
        if (currentMask.isEmpty()) {
            continue;
        }

        cl_uint maxComputeUnits;
        if (cacheLevel > 1) {
            // Count the entries of the next lower cache level that overlap
            // the current group.
            maxComputeUnits = 0;
            amd::Os::ThreadAffinityMask currentMaskSub;
            cl_uint cacheLevelSub = cacheLevel - 1;
            for (uint cpuIdSub = affinityMask.getFirstSet();
                 cpuIdSub != (uint)-1;
                 cpuIdSub = affinityMask.getNextSet(cpuIdSub)) {
                ::snprintf(buf, sizeof(buf),
                    "/sys/devices/system/cpu/cpu%u/cache/index%u/shared_cpu_map",
                    cpuIdSub, cacheLevelSub);
                if (!readFileString(buf, buf, sizeof(buf))) {
                    return CL_INVALID_VALUE;
                }
                parseSharedCpuMap(buf, currentMaskSub.getNative());

                currentMask.adjust(currentMaskSub.getNative());
                if (!currentMaskSub.isEmpty()) {
                    ++maxComputeUnits;
                }
            }
            if (maxComputeUnits == 0) {
                continue;
            }
        }
        else {
            maxComputeUnits = 1;
        }

        if (devices != NULL) {
            // Never write past the caller's array.
            if (numSubDevices >= num_entries) {
                return CL_INVALID_VALUE;
            }

            Device* device = new Device(this);
            if (device == NULL) {
                return CL_OUT_OF_HOST_MEMORY;
            }

            if (!device->create() ||
                !device->initSubDevice(info_, maxComputeUnits, create_info)) {
                device->release();
                return CL_OUT_OF_HOST_MEMORY;
            }

            device->workerThreadsAffinity_->set(currentMask.getNative());

            // Need to remove this domain type
            device->info_.affinityDomain_.value_ &= negAffinityDomain;

            *devices++ = as_cl(static_cast<amd::Device*>(device));
        }
        numSubDevices++;
        // The processors of this group are done.
        affinityMask.clear(currentMask.getNative());
    }
#else // win32
    // First call only queries the required buffer length.
    DWORD length = 0;
    ::GetLogicalProcessorInformation(NULL, &length);
    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer =
        (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION) malloc(length);
    if (buffer != NULL && ::GetLogicalProcessorInformation(buffer, &length)) {
        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr, limit =
            &buffer[length / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION)];
        for (ptr = buffer; ptr < limit; ++ptr) {
            PCACHE_DESCRIPTOR cache = &ptr->Cache;
            if (ptr->Relationship == RelationCache && cache->Type != CacheInstruction) {
                if (cache->Level == cacheLevel) {
                    KAFFINITY affinityMask = (KAFFINITY)ptr->ProcessorMask;
                    // Restrict the group by this device's own affinity.
                    if (workerThreadsAffinity_ != NULL) {
                        workerThreadsAffinity_->adjust(0, affinityMask);
                    }
                    if (affinityMask == 0) {
                        continue;
                    }

                    cl_uint maxComputeUnits;
                    if (cacheLevel > 1) {
                        // Count the caches of the next lower level that
                        // overlap the current group.
                        maxComputeUnits = 0;
                        cl_uint cacheLevelSub = cacheLevel - 1;
                        for (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION
                                ptrSub = buffer; ptrSub < limit; ++ptrSub) {
                            PCACHE_DESCRIPTOR cacheSub = &ptrSub->Cache;
                            if (ptrSub->Relationship == RelationCache &&
                                cacheSub->Type != CacheInstruction) {
                                if (cacheSub->Level == cacheLevelSub &&
                                    ((affinityMask & (KAFFINITY)ptrSub->ProcessorMask) != 0)) {
                                    ++maxComputeUnits;
                                }
                            }
                        }
                        if (maxComputeUnits == 0) {
                            continue;
                        }
                    }
                    else {
                        maxComputeUnits = 1;
                    }

                    if (devices != NULL) {
                        // Never write past the caller's array.
                        if (numSubDevices >= num_entries) {
                            free(buffer);
                            return CL_INVALID_VALUE;
                        }

                        Device* device = new Device(this);
                        if (device == NULL) {
                            free(buffer);
                            return CL_OUT_OF_HOST_MEMORY;
                        }

                        if (!device->create() || !device->initSubDevice(info_,
                                maxComputeUnits, create_info)) {
                            free(buffer);
                            device->release();
                            return CL_OUT_OF_HOST_MEMORY;
                        }

                        device->workerThreadsAffinity_->set(0, affinityMask);

                        // Need to remove this domain type
                        device->info_.affinityDomain_.value_ &= negAffinityDomain;

                        *devices++ = as_cl(static_cast<amd::Device*>(device));
                    }
                    numSubDevices++;
                    if (numSubDevices >= info_.maxComputeUnits_) {
                        break;
                    }
                }
            }
        }
    }
    free(buffer);
#endif

    if (num_devices != NULL) {
        *num_devices = numSubDevices;
    }

    if (numSubDevices == 0) {
        return CL_INVALID_VALUE;
    }
    return CL_SUCCESS;
}
// Allocates a CPU program object bound to this device.
// The oclVer argument is not used here. An allocation that yields NULL is
// logged and NULL is returned to the caller.
device::Program*
Device::createProgram(int oclVer)
{
    Program* const program = new Program(*this);
    if (program != NULL) {
        return program;
    }

    LogError("We failed memory allocation for program!");
    return program;
}
// Computes the host address at which a map request on mem starts.
//   mem        - memory object being mapped
//   origin     - offset of the mapped region (bytes for buffers, element
//                coordinates for images)
//   region     - not used here
//   mapFlags   - not used here
//   rowPitch   - images only: receives the image row pitch
//   slicePitch - images only, optional: receives the image slice pitch
// Returns the address inside the object's host memory, or NULL when mem is
// neither an image nor a buffer.
void*
Device::allocMapTarget(
    amd::Memory& mem,
    const amd::Coord3D& origin,
    const amd::Coord3D& region,
    uint mapFlags,
    size_t* rowPitch,
    size_t* slicePitch)
{
    if (mem.asImage() == NULL) {
        // Buffers are addressed by a plain byte offset.
        if (mem.asBuffer() != NULL) {
            return (address) mem.getHostMem() + origin[0];
        }
        return NULL;
    }

    amd::Image* img = mem.asImage();
    const size_t texelBytes = img->getImageFormat().getElementSize();
    const size_t rowBytes = img->getRowPitch();
    const size_t sliceBytes = img->getSlicePitch();

    *rowPitch = rowBytes;
    if (slicePitch) {
        *slicePitch = sliceBytes;
    }

    // Byte offset of the origin texel: x * element + y * row + z * slice.
    return (address) img->getHostMem()
        + (origin[0] * texelBytes + origin[1] * rowBytes + origin[2] * sliceBytes);
}
// Counterpart of allocMapTarget(). allocMapTarget() only computes an address
// inside the object's host memory and allocates nothing, so there is nothing
// to release here; both arguments are ignored.
void
Device::freeMapTarget(amd::Memory& mem, void* target)
{
// Intentionally empty: no work is needed for the CPU device.
}
} // namespace cpu