From 70aabc532576ba00d4eeea736ce55345b1051179 Mon Sep 17 00:00:00 2001 From: foreman Date: Wed, 13 May 2015 12:01:50 -0400 Subject: [PATCH] P4 to Git Change 1150348 by rayxiao@alit_opencl_rayxiao on 2015/05/13 10:49:22 EPR #396242 - Solution to cpu device alignment bug. Affected files ... ... //depot/stg/opencl/drivers/opencl/compiler/edg/src/amd_ocl_attribute.c#24 edit ... //depot/stg/opencl/drivers/opencl/compiler/edg/src/cmd_line.c#86 edit ... //depot/stg/opencl/drivers/opencl/compiler/edg/src/il.c#28 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpucommand.cpp#64 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpucommand.hpp#39 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpukernel.hpp#7 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpumapping.cpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpumapping.hpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpuprogram.cpp#65 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpuprogram.hpp#13 edit ... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#231 edit ... 
//depot/stg/opencl/drivers/opencl/tools/runocl/options.c#11 edit --- rocclr/runtime/device/cpu/cpucommand.cpp | 22 +- rocclr/runtime/device/cpu/cpukernel.hpp | 36 ++- rocclr/runtime/device/cpu/cpumapping.cpp | 326 +++++++++++++++++++++++ rocclr/runtime/device/cpu/cpumapping.hpp | 46 ++++ rocclr/runtime/device/cpu/cpuprogram.cpp | 40 ++- rocclr/runtime/utils/flags.hpp | 2 + 6 files changed, 462 insertions(+), 10 deletions(-) create mode 100644 rocclr/runtime/device/cpu/cpumapping.cpp create mode 100644 rocclr/runtime/device/cpu/cpumapping.hpp diff --git a/rocclr/runtime/device/cpu/cpucommand.cpp b/rocclr/runtime/device/cpu/cpucommand.cpp index 6511412a7d..2ef12af280 100644 --- a/rocclr/runtime/device/cpu/cpucommand.cpp +++ b/rocclr/runtime/device/cpu/cpucommand.cpp @@ -14,6 +14,7 @@ #include "thread/thread.hpp" #include "os/os.hpp" #include "utils/util.hpp" +#include "utils/options.hpp" #include @@ -302,7 +303,6 @@ NDRangeKernelBatch::patchParameters( size_t alignment = cpuKernel.getArgAlignment(i); effectiveOffset = amd::alignUp(effectiveOffset, std::min(alignment, size_t(16))); param = params + effectiveOffset; - if (desc.size_ == 0) { // __local memory parameter localMemPtr = amd::alignUp(localMemPtr, sizeof(cl_long16)); @@ -362,10 +362,24 @@ NDRangeKernelBatch::patchParameters( *reinterpret_cast(param) = (uint32_t)samplerArg->state(); } else { - ::memcpy(param, cmdParam, desc.size_); + //Using HCtoDCmap + HCtoDCmap arg_map = cpuKernel.getHCtoDCmap(i); + unsigned int arg_offset = effectiveOffset; + int err_code = 0; + int inStruct = 0; + int sys_64bit = LP64_SWITCH(0, 1); // Mapping only required for 32 bit targets + if (CPU_USE_ALIGNMENT_MAP == 0 && !sys_64bit) { + effectiveOffset += arg_map.copy_params(param, cmdParam, arg_offset, err_code, inStruct); + if (err_code) { + return false; + } + prmSize = arg_map.dc_size; + } + else { + ::memcpy(param, cmdParam, desc.size_); + } } - - effectiveOffset += cpuKernel.getArgSize(i); + effectiveOffset += prmSize; } 
localMemPtr = amd::alignUp(localMemPtr, sizeof(cl_long16)); diff --git a/rocclr/runtime/device/cpu/cpukernel.hpp b/rocclr/runtime/device/cpu/cpukernel.hpp index cf7d9eb473..5bfa1ff02b 100644 --- a/rocclr/runtime/device/cpu/cpukernel.hpp +++ b/rocclr/runtime/device/cpu/cpukernel.hpp @@ -9,6 +9,8 @@ #include "device/device.hpp" #include +#include "device/cpu/cpumapping.hpp" + //! \namespace cpu CPU Device Implementation namespace cpu { @@ -18,7 +20,9 @@ class Kernel : public device::Kernel private: const void* entryPoint_; //!< entry for the kernel - std::vector< std::pair > args_; + std::vector< std::pair > args_; + std::vector< std::pair < HCtoDCmap, size_t> > HCtoDCmaps_; + std::vector< HCtoDCmap > internal_maps_; public: uint nature_; //!< kernel's nature uint privateSize_; //!< WorkItem's private memory size (in bytes) @@ -42,6 +46,36 @@ public: return args_[argIndex].second; } + void addInternalMap(HCtoDCmap *new_map) { + if (new_map != NULL) { + internal_maps_.push_back(*new_map); + this->addInternalMap(new_map->internal_field_map); + this->addInternalMap(new_map->next_field_map); + } + else + return; + } + + void addHCtoDCmap(HCtoDCmap *new_map) { + if (new_map != NULL) { + if (HCtoDCmaps_.size() > 0) + HCtoDCmaps_.push_back(std::pair< HCtoDCmap, size_t >(*new_map, HCtoDCmaps_.back().second)); + else + HCtoDCmaps_.push_back(std::pair< HCtoDCmap, size_t >(*new_map, 0)); + } + else + return; + } + + HCtoDCmap getHCtoDCmap(int mapIndex) const { + return HCtoDCmaps_[mapIndex].first; + } + + + uint getArgNumber() { + return HCtoDCmaps_.size(); + } + //! Default constructor Kernel(const std::string& name) : device::Kernel(name), entryPoint_(NULL), nature_(0), diff --git a/rocclr/runtime/device/cpu/cpumapping.cpp b/rocclr/runtime/device/cpu/cpumapping.cpp new file mode 100644 index 0000000000..8b21cc4276 --- /dev/null +++ b/rocclr/runtime/device/cpu/cpumapping.cpp @@ -0,0 +1,326 @@ +// +// Copyright (c) 2011 Advanced Micro Devices, Inc. All rights reserved. 
+// + +#include "device/cpu/cpudevice.hpp" +#include "device/cpu/cpukernel.hpp" +#include "platform/program.hpp" +#include "os/os.hpp" +#include "device/cpu/cpumapping.hpp" +#include +#include +#include +#include +#if defined(_WIN32) +#include +#endif +// amdrt.o +#if defined(WITH_ONLINE_COMPILER) && !defined(_LP64) && !defined(ATI_ARCH_ARM) +#include "amdrt.inc" +#endif +#include "acl.h" +using std::min; +using std::max; + +namespace cpu { + HCtoDCmap::HCtoDCmap(const clk_parameter_descriptor_t* desc, unsigned int level_alignment, unsigned int index, unsigned int init_offset) + { + //Initialize fields + hc_offset = 0; + hc_size = 0; + dc_offset = 0; + dc_size = 0; + map_alignment = level_alignment; + internal_field_map = NULL; + next_field_map = NULL; + return; + } + + HCtoDCmap::~HCtoDCmap() + { + return; + } + + //Helper to find sizes of each scalar type + size_t HCtoDCmap::getHostScalarParamSize(const clk_value_type_t type) const + { + size_t size = 0; + switch (type) { + case T_CHAR: + size = 1; + break; + case T_SHORT: case T_CHAR2: + size = 2; + break; + case T_FLOAT: case T_INT: case T_CHAR4: + case T_SHORT2: case T_CHAR3: + size = 4; + break; + case T_SAMPLER: + size = 4; + break; + case T_LONG: case T_DOUBLE: case T_CHAR8: + case T_SHORT4: case T_INT2: case T_FLOAT2: + case T_SHORT3: + size = 8; + break; + case T_INT3: case T_FLOAT3: + case T_CHAR16: case T_SHORT8: case T_INT4: + case T_FLOAT4: case T_LONG2: case T_DOUBLE2: + size = 16; + break; + case T_LONG3: case T_DOUBLE3: + case T_SHORT16: case T_INT8: case T_FLOAT8: + case T_LONG4: case T_DOUBLE4: + size = 32; + break; + case T_INT16: case T_FLOAT16: case T_LONG8: + case T_DOUBLE8: + size = 64; + break; + case T_LONG16: case T_DOUBLE16: + size = 128; + break; + case T_POINTER: case T_VOID: + size = sizeof(void*); + break; + default: + assert(0 && "unknown scalar parameter size"); + break; + } + return size; + } + + size_t HCtoDCmap::getHostScalarAlignment(const clk_value_type_t type) const + { + 
size_t align = 0; + switch (type) { + case T_CHAR: + align = 1; + break; + case T_SHORT: case T_CHAR2: + align = 2; + break; + case T_FLOAT: case T_INT: case T_CHAR4: + case T_SHORT2: case T_CHAR3: + align = 4; + break; + case T_SAMPLER: + align = sizeof(uint32_t); + break; + case T_LONG: + align = LP64_SWITCH(4, 8); + break; + case T_DOUBLE: + align = LP64_SWITCH(4, 8); + break; + case T_CHAR8: + case T_SHORT4: case T_INT2: case T_FLOAT2: + case T_SHORT3: + align = 4; + break; + case T_INT3: case T_FLOAT3: + case T_CHAR16: case T_SHORT8: case T_INT4: + case T_FLOAT4: case T_LONG2: case T_DOUBLE2: + case T_LONG3: case T_DOUBLE3: + case T_SHORT16: case T_INT8: case T_FLOAT8: + case T_LONG4: case T_DOUBLE4: + case T_INT16: case T_FLOAT16: case T_LONG8: + case T_DOUBLE8: + case T_LONG16: case T_DOUBLE16: + align = LP64_SWITCH(4, 8); + break; + case T_POINTER: case T_VOID: + align = sizeof(void*); + break; + default: + assert(0 && "unknown scalar parameter alignment"); + break; + } + return align; + } + + // Align up arguments within each map; returns void, the resulting sizes are written back through outer_hc_size and outer_dc_size + // Input current alignment of the parameter, size of outer struct if it exists + void HCtoDCmap::align_map(unsigned alignment, unsigned &outer_hc_size, unsigned &outer_dc_size, int &inStruct) + { + unsigned map_param_size = 0; + if (internal_field_map != NULL) { + hc_size = 0; //Recalculate size to account for internal offsets + inStruct++; + internal_field_map->align_map(map_alignment, hc_size, dc_size, inStruct); // align internal struct, might alter size of this struct + } + // Use map_param_size to store current parameter size after adjusting alignment + if (alignment != 1 && hc_size % alignment != 0) { + map_param_size = max(alignment, hc_size - (hc_size%alignment) + alignment); + } + else { + map_param_size = max(alignment, hc_size); + } + if (next_field_map != NULL) { + next_field_map->hc_offset = this->next_offset(hc_offset, map_param_size, inStruct); + 
next_field_map->align_map(alignment, outer_hc_size, outer_dc_size, inStruct); + // Reset parameter size for char padding + if (next_field_map->type == T_CHAR) + map_param_size = 1; + } + else + { + // Moving out of struct + if (inStruct > 0) + inStruct--; + if (type == T_CHAR) + map_param_size = 1; + } + outer_hc_size = max(outer_hc_size, hc_offset+map_param_size); + outer_dc_size = max(outer_dc_size, dc_offset+dc_size); + return; + } + + // Return current size of map, calculate internal maps and process next args if in struct. + // Alignment: alignment flag for members in case of structs, alignment of scalar otherwise. + int HCtoDCmap::compute_map(const clk_parameter_descriptor_t* desc, unsigned int &alignment, unsigned int init_offset, int& inStruct, int& index_out) + { + unsigned internal_index; + internal_index = index_out; + unsigned int next_offset = init_offset; + unsigned struct_size = 0; + type = desc[internal_index].type; + + if (desc[internal_index].type == T_STRUCT) { + //Moving into struct, go to next index + inStruct++; + hc_offset = init_offset; + if (desc[index_out+1].type != T_VOID) { + index_out++; + internal_index = index_out; + internal_field_map = new HCtoDCmap(desc, 0, internal_index, init_offset); + hc_size = internal_field_map->compute_map(desc, map_alignment, next_offset, inStruct, index_out); + map_alignment = max(map_alignment, internal_field_map->map_alignment); // Adjust alignment to biggest member alignment + struct_size = hc_size; + internal_index = index_out; + alignment = max(alignment, map_alignment); + if (inStruct > 0) { + if (desc[index_out+1].type != T_VOID) { + //Still inside struct and not done + index_out++; + internal_index = index_out; + next_field_map = new HCtoDCmap(desc, 0, internal_index, next_offset); + struct_size = hc_size; + struct_size += next_field_map->compute_map(desc, alignment, next_offset, inStruct, index_out); + next_offset = max(next_field_map->hc_offset+next_field_map->hc_size, 
next_field_map->hc_offset+alignment); + // running count of struct_size = hc_size + size of next member + return struct_size; + } + else { + //Moving out of struct, go to next index + index_out++; + internal_index = index_out; + inStruct--; + return hc_size; //return last struct member size + } + } + } + } + else { + //Scalar parameter + hc_offset = init_offset; + hc_size = getHostScalarParamSize(desc[internal_index].type); + dc_size = hc_size; + map_alignment = getHostScalarAlignment(desc[internal_index].type); + alignment = max(alignment, map_alignment); //Adjust alignment of upper level struct if necessary, upper level alignment = max alignment of members + if (desc[internal_index].type == T_LONG) + alignment = max(alignment, (unsigned int)8); //Set struct alignment to 8 on outside if containing struct member of long + if (inStruct > 0) { + if (desc[index_out+1].type != T_VOID) { + //Still inside struct and not done + index_out++; + next_field_map = new HCtoDCmap(desc, alignment, internal_index, next_offset); + struct_size = hc_size; + struct_size += next_field_map->compute_map(desc, alignment, next_offset, inStruct, index_out); + next_offset = hc_offset+alignment; + alignment = max(alignment, next_field_map->map_alignment); + // running count of struct_size = hc_size + size of next member + return struct_size; + } + else { + //Moving out of struct, go to next index + index_out++; + inStruct--; + return hc_size; //return last struct member size + } + } + } + return hc_size; + } + + // Adjust offset for source and target, return next source offset + unsigned HCtoDCmap::next_offset(unsigned current_offset, unsigned &map_param_size, int& inStruct_flag) + { + unsigned next_offset = current_offset; + if (next_field_map == NULL) { + assert(0 && "invalid next struct field map"); + return next_offset; + } + else { + // Ignore alignment when a char occurs to account for padding + if (type != T_STRUCT && next_field_map->hc_size == 1 && map_param_size > 1 && 
inStruct_flag > 0) { + next_field_map->dc_offset = dc_offset + dc_size; + next_offset = current_offset + hc_size; + } + // + else { + if (this->next_field_map->type == T_LONG) { + if (dc_size % 4 != 0) { + this->next_field_map->dc_offset = dc_offset + dc_size - (dc_size % 4) + 4; // T_LONG aligned by 4 in target + } + else { + this->next_field_map->dc_offset = dc_offset + dc_size; // T_LONG aligned by 4 in target + } + if (dc_size % 8 != 0) { + next_offset = current_offset + dc_size - (dc_size % 8) + 8; //aligned by 8 in source + } + else { + next_offset = current_offset + dc_size; //aligned by 8 in source + } + } + else { + if ((dc_offset + dc_size) % next_field_map->map_alignment != 0) { + this->next_field_map->dc_offset = dc_offset + dc_size - (dc_size % next_field_map->map_alignment) + next_field_map->map_alignment; + } + else { + this->next_field_map->dc_offset = dc_offset + max(dc_size, next_field_map->map_alignment); + } + if ((hc_offset + hc_size) % next_field_map->map_alignment != 0) { + next_offset = hc_offset + hc_size - (hc_size % next_field_map->map_alignment) + next_field_map->map_alignment; + } + else { + next_offset = hc_offset + max(next_field_map->map_alignment, map_param_size); + } + } + } + return next_offset; + } + } + + // Copy memory according to mapping + unsigned int HCtoDCmap::copy_params(void *dst, const void *src, unsigned int &arg_offset, int& error_code, int &inStruct) const + { + unsigned int padding = 0; + // Pad offset to be aligned by 8 if parameter is double, not as struct field + if ((arg_offset+dc_offset) % 8 != 0 && (type == T_DOUBLE) && inStruct == 0) + padding = map_alignment-((arg_offset+dc_offset)%map_alignment); + ::memcpy(reinterpret_cast(reinterpret_cast(dst)+padding), src, hc_size); + if (internal_field_map != NULL) { + inStruct++; + internal_field_map->copy_params(dst, src, arg_offset, error_code, inStruct); + inStruct--; + } + if (next_field_map != NULL) { + void *next_dst = 
reinterpret_cast(reinterpret_cast(dst)+next_field_map->dc_offset); + const void *next_src = reinterpret_cast(reinterpret_cast(src)+next_field_map->hc_offset); + next_field_map->copy_params(next_dst, next_src, arg_offset, error_code, inStruct); + } + return padding; + } +} //namespace cpu \ No newline at end of file diff --git a/rocclr/runtime/device/cpu/cpumapping.hpp b/rocclr/runtime/device/cpu/cpumapping.hpp new file mode 100644 index 0000000000..105b76bde7 --- /dev/null +++ b/rocclr/runtime/device/cpu/cpumapping.hpp @@ -0,0 +1,46 @@ +// +// Copyright (c) 2011 Advanced Micro Devices, Inc. All rights reserved. +// +// HCtoDCmap provides a mapping of parameters from host compiler to device compiler +// The mapping can be used to copy parameters from host to device where field alignment +// is different in compilers +#ifndef CPUMAPPING_HPP_ +#define CPUMAPPING_HPP_ + +using std::min; +using std::max; + +namespace cpu { + +class HCtoDCmap +{ + +public: + unsigned int hc_offset, hc_size; // Offset and size of this parameter in host compiler + unsigned int dc_offset, dc_size; // Offset and size of this parameter in device compiler + unsigned int map_alignment; // Alignment of parameter in host compiler + clk_value_type_t type; // Type of parameter + HCtoDCmap *internal_field_map; // Pointer to internal mapping when current parameter is of type T_STRUCT + HCtoDCmap *next_field_map; // Pointer to next struct field when current parameter is a struct member + + HCtoDCmap(const clk_parameter_descriptor_t*, unsigned int, unsigned int, unsigned int); + virtual ~HCtoDCmap(); + int compute_map(const clk_parameter_descriptor_t*, unsigned int &, unsigned int, int&, int&); + unsigned next_offset(unsigned, unsigned &, int &); + size_t getHostScalarParamSize(const clk_value_type_t) const; + size_t getHostScalarAlignment(const clk_value_type_t) const; + void align_map(unsigned, unsigned&, unsigned&, int&); + unsigned int copy_params(void *, const void *, unsigned int&, int&, int&) 
const; + +private: +}; + + +} // namespace cpu + +#endif // CPUMAPPING_HPP_ +// Mapping rule +// Long types are treated with 8 byte alignment in runtime when passed in as arguments +// but they are treated with 4 byte alignment in compiler +// Double members have 8 byte alignment when passed as scalar argument +// but have 4 byte alignment as a field inside a struct \ No newline at end of file diff --git a/rocclr/runtime/device/cpu/cpuprogram.cpp b/rocclr/runtime/device/cpu/cpuprogram.cpp index 1c81261cfa..769e414a44 100644 --- a/rocclr/runtime/device/cpu/cpuprogram.cpp +++ b/rocclr/runtime/device/cpu/cpuprogram.cpp @@ -175,25 +175,36 @@ getParamSizeImpl(bool cpuLayer, const clk_parameter_descriptor_t* desc, if(desc[index].type == T_STRUCT) { size_t maxAlignment = 0; size_t structSize = 0; + size_t structAlignment = 0; index++; while(desc[index].type != T_VOID) { size_t elementAlignment = 0; size_t elementSize = getParamSizeImpl(cpuLayer, desc, index, qualifier, &elementAlignment, index_out); + if (desc[index].type == T_LONG) + structAlignment = cpuLayer? 
LP64_SWITCH(4, 8) : 8; + else + structAlignment = std::max(maxAlignment, elementAlignment); index = *index_out; structSize = amd::alignUp(structSize, std::min(elementAlignment, size_t(16))) + elementSize; - maxAlignment = std::max(maxAlignment, elementAlignment); + maxAlignment = std::max(maxAlignment, structAlignment); } *index_out = index + 1; *alignment = maxAlignment; size = amd::alignUp(structSize, std::min(maxAlignment, size_t(16))); } else { size = getScalarParamSize(cpuLayer, desc[index].type, qualifier); - *alignment = size; + if (desc[index].type == T_DOUBLE) { + *alignment = LP64_SWITCH(4, 8); + } else if (desc[index].type == T_LONG) { + *alignment = 8; + } else { + *alignment = size; + } *index_out = index + 1; } return size; @@ -204,8 +215,8 @@ getParamSize(bool cpuLayer, const clk_parameter_descriptor_t* desc, cl_kernel_arg_address_qualifier qualifier, size_t* alignment) { - unsigned index_out = 0; - return getParamSizeImpl(cpuLayer, desc, 0, qualifier, alignment, + unsigned index_out = 0; + return getParamSizeImpl(cpuLayer, desc, 0, qualifier, alignment, &index_out); } @@ -335,13 +346,32 @@ setKernelInfoCallback(std::string symbol, const void* value, void* data) getParamSize(true, desc, param.addressQualifier_, &cpuAlignment); kernel->addArg(cpuSize, cpuAlignment); + //Init for HCtoDCmap + unsigned int init_offset = 0; + unsigned int align = 0; + int inStruct = 0; + int end_index = 0; + HCtoDCmap *map_p = new HCtoDCmap(desc, align, 0, init_offset); + map_p->dc_size = map_p->compute_map(desc, map_p->map_alignment, init_offset, inStruct, end_index); + map_p->align_map(map_p->map_alignment, map_p->hc_size, map_p->dc_size, inStruct); + if (CPU_USE_ALIGNMENT_MAP == 0) { + kernel->addHCtoDCmap(map_p); + if (map_p->internal_field_map != NULL) { + kernel->addInternalMap(map_p->internal_field_map); + } + } + else { + delete(map_p); + } + //End of HCtoDCmap + desc = next_desc; params.push_back(param); size_t size = param.size_ == 0 ? 
sizeof(cl_mem) : param.size_; #if defined(USE_NATIVE_ABI) size = amd::alignUp(size, sizeof(size_t)); #endif // USE_NATIVE_ABI - offset = param.offset_ + size; + offset = param.offset_ + size; } // retrieve vector type hint metadata diff --git a/rocclr/runtime/utils/flags.hpp b/rocclr/runtime/utils/flags.hpp index 5cf5048099..1abe5feca2 100644 --- a/rocclr/runtime/utils/flags.hpp +++ b/rocclr/runtime/utils/flags.hpp @@ -24,6 +24,8 @@ release(size_t, CPU_WORKER_THREAD_STACK_SIZE, 64*Ki, \ "The default CPU worker thread stack size") \ release(int, CPU_MAX_COMPUTE_UNITS, -1, \ "Override the number of computation units per CPU device") \ +debug(bool, CPU_USE_ALIGNMENT_MAP, false, \ + "Use flag to enable alignment mapping for parameters for CPU") \ release(int, GPU_MAX_WORKGROUP_SIZE, 0, \ "Maximum number of workitems in a workgroup for GPU, 0 -use default") \ release(int, GPU_MAX_WORKGROUP_SIZE_2D_X, 0, \