From 70aabc532576ba00d4eeea736ce55345b1051179 Mon Sep 17 00:00:00 2001
From: foreman
Date: Wed, 13 May 2015 12:01:50 -0400
Subject: [PATCH] P4 to Git Change 1150348 by rayxiao@alit_opencl_rayxiao on
2015/05/13 10:49:22
EPR #396242 - Solution to cpu device alignment bug.
Affected files ...
... //depot/stg/opencl/drivers/opencl/compiler/edg/src/amd_ocl_attribute.c#24 edit
... //depot/stg/opencl/drivers/opencl/compiler/edg/src/cmd_line.c#86 edit
... //depot/stg/opencl/drivers/opencl/compiler/edg/src/il.c#28 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpucommand.cpp#64 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpucommand.hpp#39 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpukernel.hpp#7 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpumapping.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpumapping.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpuprogram.cpp#65 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpuprogram.hpp#13 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#231 edit
... //depot/stg/opencl/drivers/opencl/tools/runocl/options.c#11 edit
---
rocclr/runtime/device/cpu/cpucommand.cpp | 22 +-
rocclr/runtime/device/cpu/cpukernel.hpp | 36 ++-
rocclr/runtime/device/cpu/cpumapping.cpp | 326 +++++++++++++++++++++++
rocclr/runtime/device/cpu/cpumapping.hpp | 46 ++++
rocclr/runtime/device/cpu/cpuprogram.cpp | 40 ++-
rocclr/runtime/utils/flags.hpp | 2 +
6 files changed, 462 insertions(+), 10 deletions(-)
create mode 100644 rocclr/runtime/device/cpu/cpumapping.cpp
create mode 100644 rocclr/runtime/device/cpu/cpumapping.hpp
diff --git a/rocclr/runtime/device/cpu/cpucommand.cpp b/rocclr/runtime/device/cpu/cpucommand.cpp
index 6511412a7d..2ef12af280 100644
--- a/rocclr/runtime/device/cpu/cpucommand.cpp
+++ b/rocclr/runtime/device/cpu/cpucommand.cpp
@@ -14,6 +14,7 @@
#include "thread/thread.hpp"
#include "os/os.hpp"
#include "utils/util.hpp"
+#include "utils/options.hpp"
#include
@@ -302,7 +303,6 @@ NDRangeKernelBatch::patchParameters(
size_t alignment = cpuKernel.getArgAlignment(i);
effectiveOffset = amd::alignUp(effectiveOffset, std::min(alignment, size_t(16)));
param = params + effectiveOffset;
-
if (desc.size_ == 0) {
// __local memory parameter
localMemPtr = amd::alignUp(localMemPtr, sizeof(cl_long16));
@@ -362,10 +362,24 @@ NDRangeKernelBatch::patchParameters(
*reinterpret_cast(param) = (uint32_t)samplerArg->state();
}
else {
- ::memcpy(param, cmdParam, desc.size_);
+            //Using HCtoDCmap
+            int sys_64bit = LP64_SWITCH(0, 1); // Mapping only required for 32 bit targets
+            if (CPU_USE_ALIGNMENT_MAP == 0 && !sys_64bit) {
+                // Look the map up only on this path: none is recorded when CPU_USE_ALIGNMENT_MAP is set.
+                HCtoDCmap arg_map = cpuKernel.getHCtoDCmap(i);
+                unsigned int arg_offset = effectiveOffset;
+                int err_code = 0, inStruct = 0;
+                effectiveOffset += arg_map.copy_params(param, cmdParam, arg_offset, err_code, inStruct); // adds the padding copy_params reports
+                if (err_code) {
+                    return false;
+                }
+                prmSize = arg_map.dc_size;
+            }
+            else {
+                ::memcpy(param, cmdParam, desc.size_);
+            }
}
-
- effectiveOffset += cpuKernel.getArgSize(i);
+ effectiveOffset += prmSize;
}
localMemPtr = amd::alignUp(localMemPtr, sizeof(cl_long16));
diff --git a/rocclr/runtime/device/cpu/cpukernel.hpp b/rocclr/runtime/device/cpu/cpukernel.hpp
index cf7d9eb473..5bfa1ff02b 100644
--- a/rocclr/runtime/device/cpu/cpukernel.hpp
+++ b/rocclr/runtime/device/cpu/cpukernel.hpp
@@ -9,6 +9,8 @@
#include "device/device.hpp"
#include
+#include "device/cpu/cpumapping.hpp"
+
//! \namespace cpu CPU Device Implementation
namespace cpu {
@@ -18,7 +20,9 @@ class Kernel : public device::Kernel
private:
const void* entryPoint_; //!< entry for the kernel
- std::vector< std::pair > args_;
+ std::vector< std::pair > args_;
+ std::vector< std::pair < HCtoDCmap, size_t> > HCtoDCmaps_;
+ std::vector< HCtoDCmap > internal_maps_;
public:
uint nature_; //!< kernel's nature
uint privateSize_; //!< WorkItem's private memory size (in bytes)
@@ -42,6 +46,36 @@ public:
return args_[argIndex].second;
}
+    //! Record copies of \a new_map and of every map reachable through its
+    //! internal_field_map / next_field_map links, visiting depth-first.
+    void addInternalMap(HCtoDCmap *new_map) {
+        if (new_map == NULL)
+            return;
+        internal_maps_.push_back(*new_map);
+        addInternalMap(new_map->internal_field_map);
+        addInternalMap(new_map->next_field_map);
+    }
+
+    //! Append a copy of the top-level map for the next kernel argument.
+    //! The size_t stored with it is taken from the previous entry (0 for the
+    //! first one), so it stays 0 unless something else writes HCtoDCmaps_.
+    void addHCtoDCmap(HCtoDCmap *new_map) {
+        if (new_map == NULL)
+            return;
+        const size_t carried =
+            HCtoDCmaps_.empty() ? size_t(0) : HCtoDCmaps_.back().second;
+        HCtoDCmaps_.push_back(std::pair< HCtoDCmap, size_t >(*new_map, carried));
+    }
+
+    //! Map recorded for argument \a mapIndex, or an empty T_VOID map if there is none.
+    HCtoDCmap getHCtoDCmap(int mapIndex) const {
+        HCtoDCmap none(NULL, 0, 0, 0); none.type = T_VOID;  // fallback; the ctor may leave 'type' unset
+        return (mapIndex >= 0 && size_t(mapIndex) < HCtoDCmaps_.size()) ? HCtoDCmaps_[mapIndex].first : none;
+    }
+    uint getArgNumber() {  //!< number of argument maps recorded so far
+        return static_cast<uint>(HCtoDCmaps_.size());
+    }
+
//! Default constructor
Kernel(const std::string& name)
: device::Kernel(name), entryPoint_(NULL), nature_(0),
diff --git a/rocclr/runtime/device/cpu/cpumapping.cpp b/rocclr/runtime/device/cpu/cpumapping.cpp
new file mode 100644
index 0000000000..8b21cc4276
--- /dev/null
+++ b/rocclr/runtime/device/cpu/cpumapping.cpp
@@ -0,0 +1,326 @@
+//
+// Copyright (c) 2011 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#include "device/cpu/cpudevice.hpp"
+#include "device/cpu/cpukernel.hpp"
+#include "platform/program.hpp"
+#include "os/os.hpp"
+#include "device/cpu/cpumapping.hpp"
+#include
+#include
+#include
+#include
+#if defined(_WIN32)
+#include
+#endif
+// amdrt.o
+#if defined(WITH_ONLINE_COMPILER) && !defined(_LP64) && !defined(ATI_ARCH_ARM)
+#include "amdrt.inc"
+#endif
+#include "acl.h"
+using std::min;
+using std::max;
+
+namespace cpu {
+    // Build an empty map: zero offsets and sizes, no child maps, type T_VOID and
+    // map_alignment = level_alignment. desc, index and init_offset are not used
+    // here; compute_map() fills in the real layout afterwards.
+    HCtoDCmap::HCtoDCmap(const clk_parameter_descriptor_t* desc, unsigned int level_alignment, unsigned int index, unsigned int init_offset)
+        : hc_offset(0), hc_size(0),
+          dc_offset(0), dc_size(0),
+          map_alignment(level_alignment),
+          type(T_VOID),  // was left indeterminate until compute_map() ran
+          internal_field_map(NULL),
+          next_field_map(NULL)
+    {
+    }
+
+    // Releases nothing: internal_field_map and next_field_map are not deleted here.
+    HCtoDCmap::~HCtoDCmap()
+    {
+    }
+
+    // Size in bytes that one scalar or vector kernel parameter of the given
+    // type occupies on the host. 3-component vectors take the size of the
+    // matching 4-component vector; T_POINTER and T_VOID take sizeof(void*).
+    // An unlisted type trips the assert and, if that is compiled out, yields 0.
+    size_t HCtoDCmap::getHostScalarParamSize(const clk_value_type_t type) const
+    {
+        switch (type) {
+        case T_CHAR:
+            return 1;
+
+        case T_SHORT: case T_CHAR2:
+            return 2;
+
+        case T_FLOAT: case T_INT: case T_CHAR4:
+        case T_SHORT2: case T_CHAR3:
+        case T_SAMPLER:
+            return 4;
+
+        case T_LONG: case T_DOUBLE: case T_CHAR8:
+        case T_SHORT4: case T_INT2: case T_FLOAT2:
+        case T_SHORT3:
+            return 8;
+
+        case T_INT3: case T_FLOAT3:
+        case T_CHAR16: case T_SHORT8: case T_INT4:
+        case T_FLOAT4: case T_LONG2: case T_DOUBLE2:
+            return 16;
+
+        case T_LONG3: case T_DOUBLE3:
+        case T_SHORT16: case T_INT8: case T_FLOAT8:
+        case T_LONG4: case T_DOUBLE4:
+            return 32;
+
+        case T_INT16: case T_FLOAT16: case T_LONG8:
+        case T_DOUBLE8:
+            return 64;
+
+        case T_LONG16: case T_DOUBLE16:
+            return 128;
+
+        case T_POINTER: case T_VOID:
+            return sizeof(void*);
+
+        default:
+            break;
+        }
+        assert(0 && "unknown scalar parameter size");
+        return 0;
+    }
+
+    // Alignment in bytes the host layout gives a member of the given type.
+    // 1-, 2- and 4-byte types align to their size, 8-byte vectors align to 4,
+    // and T_LONG, T_DOUBLE and every vector of 16 bytes or more align to
+    // LP64_SWITCH(4, 8). T_POINTER and T_VOID align to sizeof(void*).
+    // An unlisted type trips the assert and, if that is compiled out, yields 0.
+    size_t HCtoDCmap::getHostScalarAlignment(const clk_value_type_t type) const
+    {
+        // Shared by the 8-byte scalars and all vectors of 16 bytes or more.
+        const size_t wideAlign = LP64_SWITCH(4, 8);
+        switch (type) {
+        case T_CHAR:
+            return 1;
+
+        case T_SHORT: case T_CHAR2:
+            return 2;
+
+        case T_FLOAT: case T_INT: case T_CHAR4:
+        case T_SHORT2: case T_CHAR3:
+        case T_CHAR8:
+        case T_SHORT4: case T_INT2: case T_FLOAT2:
+        case T_SHORT3:
+            return 4;
+
+        case T_SAMPLER:
+            return sizeof(uint32_t);
+
+        case T_LONG:
+        case T_DOUBLE:
+        case T_INT3: case T_FLOAT3:
+        case T_CHAR16: case T_SHORT8: case T_INT4:
+        case T_FLOAT4: case T_LONG2: case T_DOUBLE2:
+        case T_LONG3: case T_DOUBLE3:
+        case T_SHORT16: case T_INT8: case T_FLOAT8:
+        case T_LONG4: case T_DOUBLE4:
+        case T_INT16: case T_FLOAT16: case T_LONG8:
+        case T_DOUBLE8:
+        case T_LONG16: case T_DOUBLE16:
+            return wideAlign;
+
+        case T_POINTER: case T_VOID:
+            return sizeof(void*);
+
+        default:
+            break;
+        }
+        assert(0 && "unknown scalar parameter alignment");
+        return 0;
+    }
+
+ // Lays out this map and the field chain reachable from it. Nothing is returned: results are written to the maps and to the reference parameters.
+ // alignment: alignment applied at this level; outer_hc_size/outer_dc_size: running host/device sizes of the enclosing level; inStruct: struct nesting depth
+ void HCtoDCmap::align_map(unsigned alignment, unsigned &outer_hc_size, unsigned &outer_dc_size, int &inStruct)
+ {
+ unsigned map_param_size = 0;
+ if (internal_field_map != NULL) {
+ hc_size = 0; //Recalculate size to account for internal offsets
+ inStruct++;
+ internal_field_map->align_map(map_alignment, hc_size, dc_size, inStruct); // align internal struct; the nested chain grows this map's hc_size and dc_size
+ }
+ // map_param_size = hc_size rounded up to a multiple of alignment, and never less than alignment
+ if (alignment != 1 && hc_size % alignment != 0) { // NOTE(review): alignment == 0 would divide by zero here - confirm callers never pass 0
+ map_param_size = max(alignment, hc_size - (hc_size%alignment) + alignment);
+ }
+ else {
+ map_param_size = max(alignment, hc_size);
+ }
+ if (next_field_map != NULL) {
+ next_field_map->hc_offset = this->next_offset(hc_offset, map_param_size, inStruct); // next_offset() also sets the next field's dc_offset
+ next_field_map->align_map(alignment, outer_hc_size, outer_dc_size, inStruct);
+ // Reset parameter size for char padding
+ if (next_field_map->type == T_CHAR)
+ map_param_size = 1;
+ }
+ else
+ {
+ // Moving out of struct
+ if (inStruct > 0)
+ inStruct--;
+ if (type == T_CHAR)
+ map_param_size = 1;
+ }
+ outer_hc_size = max(outer_hc_size, hc_offset+map_param_size); // enclosing host size must reach the end of this field
+ outer_dc_size = max(outer_dc_size, dc_offset+dc_size); // same for the device-side size
+ return;
+ }
+
+ // Builds the map for desc[index_out] and, while inside a struct, for the fields that follow it; child maps are allocated with new and index_out advances as descriptors are walked.
+ // alignment: in/out, raised to the largest member alignment seen. Returns hc_size for a lone or last field, otherwise hc_size plus the value returned for the following field.
+ int HCtoDCmap::compute_map(const clk_parameter_descriptor_t* desc, unsigned int &alignment, unsigned int init_offset, int& inStruct, int& index_out)
+ {
+ unsigned internal_index;
+ internal_index = index_out;
+ unsigned int next_offset = init_offset;
+ unsigned struct_size = 0;
+ type = desc[internal_index].type;
+
+ if (desc[internal_index].type == T_STRUCT) {
+ //Moving into struct, go to next index
+ inStruct++;
+ hc_offset = init_offset; // only seeded here; align_map() assigns hc_offset of following fields
+ if (desc[index_out+1].type != T_VOID) { // NOTE(review): when the struct is immediately followed by T_VOID, inStruct stays incremented and index_out is not advanced - confirm
+ index_out++;
+ internal_index = index_out;
+ internal_field_map = new HCtoDCmap(desc, 0, internal_index, init_offset);
+ hc_size = internal_field_map->compute_map(desc, map_alignment, next_offset, inStruct, index_out);
+ map_alignment = max(map_alignment, internal_field_map->map_alignment); // Adjust alignment to biggest member alignment
+ struct_size = hc_size;
+ internal_index = index_out;
+ alignment = max(alignment, map_alignment);
+ if (inStruct > 0) { // still nested: this struct is itself a member of an enclosing struct
+ if (desc[index_out+1].type != T_VOID) {
+ //Still inside struct and not done
+ index_out++;
+ internal_index = index_out;
+ next_field_map = new HCtoDCmap(desc, 0, internal_index, next_offset);
+ struct_size = hc_size;
+ struct_size += next_field_map->compute_map(desc, alignment, next_offset, inStruct, index_out);
+ next_offset = max(next_field_map->hc_offset+next_field_map->hc_size, next_field_map->hc_offset+alignment); // NOTE(review): next_offset is a local that is not read after this line
+ // running count: struct_size = hc_size + size of next member
+ return struct_size;
+ }
+ else {
+ //Moving out of struct, go to next index
+ index_out++;
+ internal_index = index_out;
+ inStruct--;
+ return hc_size; //return last struct member size
+ }
+ }
+ }
+ }
+ else {
+ //Scalar parameter
+ hc_offset = init_offset; // only seeded here; align_map() assigns hc_offset of following fields
+ hc_size = getHostScalarParamSize(desc[internal_index].type);
+ dc_size = hc_size; // host and device sizes start out equal for scalars
+ map_alignment = getHostScalarAlignment(desc[internal_index].type);
+ alignment = max(alignment, map_alignment); //Adjust alignment of upper level struct if necessary, upper level alignment = max alignment of members
+ if (desc[internal_index].type == T_LONG)
+ alignment = max(alignment, (unsigned int)8); //Set struct alignment to 8 on outside if containing struct member of long
+ if (inStruct > 0) {
+ if (desc[index_out+1].type != T_VOID) {
+ //Still inside struct and not done
+ index_out++;
+ next_field_map = new HCtoDCmap(desc, alignment, internal_index, next_offset);
+ struct_size = hc_size;
+ struct_size += next_field_map->compute_map(desc, alignment, next_offset, inStruct, index_out);
+ next_offset = hc_offset+alignment; // NOTE(review): next_offset is a local that is not read after this line
+ alignment = max(alignment, next_field_map->map_alignment);
+ // running count: struct_size = hc_size + size of next member
+ return struct_size;
+ }
+ else {
+ //Moving out of struct, go to next index
+ index_out++;
+ inStruct--;
+ return hc_size; //return last struct member size
+ }
+ }
+ }
+ return hc_size; // top-level scalar, or a struct whose members have all been consumed
+ }
+
+ // Sets next_field_map->dc_offset (target side) and returns the source-side offset for the next field; with no next field it asserts and returns current_offset.
+ unsigned HCtoDCmap::next_offset(unsigned current_offset, unsigned &map_param_size, int& inStruct_flag)
+ {
+ unsigned next_offset = current_offset;
+ if (next_field_map == NULL) {
+ assert(0 && "invalid next struct field map");
+ return next_offset;
+ }
+ else {
+ // Ignore alignment when a char occurs to account for padding
+ if (type != T_STRUCT && next_field_map->hc_size == 1 && map_param_size > 1 && inStruct_flag > 0) {
+ next_field_map->dc_offset = dc_offset + dc_size; // 1-byte field packed directly behind this one
+ next_offset = current_offset + hc_size;
+ }
+ // General case: place the next field after this one, rounding up for its alignment
+ else {
+ if (this->next_field_map->type == T_LONG) {
+ if (dc_size % 4 != 0) {
+ this->next_field_map->dc_offset = dc_offset + dc_size - (dc_size % 4) + 4; // T_LONG aligned by 4 in target
+ }
+ else {
+ this->next_field_map->dc_offset = dc_offset + dc_size; // T_LONG aligned by 4 in target
+ }
+ if (dc_size % 8 != 0) {
+ next_offset = current_offset + dc_size - (dc_size % 8) + 8; //aligned by 8 in source; NOTE(review): uses dc_size rather than hc_size - confirm intended
+ }
+ else {
+ next_offset = current_offset + dc_size; //aligned by 8 in source
+ }
+ }
+ else {
+ if ((dc_offset + dc_size) % next_field_map->map_alignment != 0) { // NOTE(review): divides by zero if the next map's map_alignment is 0 - confirm it cannot be
+ this->next_field_map->dc_offset = dc_offset + dc_size - (dc_size % next_field_map->map_alignment) + next_field_map->map_alignment; // NOTE(review): test uses dc_offset + dc_size, rounding uses dc_size alone - confirm
+ }
+ else {
+ this->next_field_map->dc_offset = dc_offset + max(dc_size, next_field_map->map_alignment);
+ }
+ if ((hc_offset + hc_size) % next_field_map->map_alignment != 0) {
+ next_offset = hc_offset + hc_size - (hc_size % next_field_map->map_alignment) + next_field_map->map_alignment; // NOTE(review): same mismatch between tested and rounded quantity as above
+ }
+ else {
+ next_offset = hc_offset + max(next_field_map->map_alignment, map_param_size);
+ }
+ }
+ }
+ return next_offset;
+ }
+ }
+
+    // Copy this argument from the host layout (src) into the device layout (dst) as the map describes.
+    unsigned int HCtoDCmap::copy_params(void *dst, const void *src, unsigned int &arg_offset, int& error_code, int &inStruct) const
+    {
+        unsigned int padding = 0;  // returned: bytes skipped in dst in front of this argument
+        // A T_DOUBLE that is not a struct field and whose target offset is not a multiple of 8 is moved forward by map_alignment - (offset % map_alignment)
+        if ((arg_offset+dc_offset) % 8 != 0 && (type == T_DOUBLE) && inStruct == 0)
+            padding = map_alignment-((arg_offset+dc_offset)%map_alignment);
+        ::memcpy(reinterpret_cast<void*>(reinterpret_cast<char*>(dst)+padding), src, hc_size);
+        if (internal_field_map != NULL) {
+            inStruct++;  // marks the nested fields as struct members, which disables the padding above
+            internal_field_map->copy_params(dst, src, arg_offset, error_code, inStruct);
+            inStruct--;
+        }
+        if (next_field_map != NULL) {
+            void *next_dst = reinterpret_cast<void*>(reinterpret_cast<char*>(dst)+next_field_map->dc_offset);
+            const void *next_src = reinterpret_cast<const void*>(reinterpret_cast<const char*>(src)+next_field_map->hc_offset);
+            next_field_map->copy_params(next_dst, next_src, arg_offset, error_code, inStruct);
+        }
+        return padding;  // error_code is never written in this function
+    }
+} //namespace cpu
\ No newline at end of file
diff --git a/rocclr/runtime/device/cpu/cpumapping.hpp b/rocclr/runtime/device/cpu/cpumapping.hpp
new file mode 100644
index 0000000000..105b76bde7
--- /dev/null
+++ b/rocclr/runtime/device/cpu/cpumapping.hpp
@@ -0,0 +1,46 @@
+//
+// Copyright (c) 2011 Advanced Micro Devices, Inc. All rights reserved.
+//
+// HCtoDCmap provides a mapping of parameters from host compiler to device compiler
+// The mapping can be used to copy parameters from host to device where field alignment
+// is different in compilers
+#ifndef CPUMAPPING_HPP_
+#define CPUMAPPING_HPP_
+
+using std::min;
+using std::max;
+
+namespace cpu {
+
+class HCtoDCmap
+{
+ // One node of the layout map for a kernel argument; nodes are linked through internal_field_map and next_field_map.
+public:
+ unsigned int hc_offset, hc_size; // Offset and size of this parameter in host compiler
+ unsigned int dc_offset, dc_size; // Offset and size of this parameter in device compiler
+ unsigned int map_alignment; // Alignment of parameter in host compiler
+ clk_value_type_t type; // Type of parameter
+ HCtoDCmap *internal_field_map; // Pointer to internal mapping when current parameter is of type T_STRUCT
+ HCtoDCmap *next_field_map; // Pointer to next struct field when current parameter is a struct member
+
+ HCtoDCmap(const clk_parameter_descriptor_t*, unsigned int, unsigned int, unsigned int); // (desc, level_alignment, index, init_offset): only level_alignment is stored
+ virtual ~HCtoDCmap(); // does not delete the child maps
+ int compute_map(const clk_parameter_descriptor_t*, unsigned int &, unsigned int, int&, int&); // (desc, alignment, init_offset, inStruct, index_out): builds the map tree from the descriptors
+ unsigned next_offset(unsigned, unsigned &, int &); // (current_offset, map_param_size, inStruct_flag): sets the next field's dc_offset, returns its host offset
+ size_t getHostScalarParamSize(const clk_value_type_t) const; // size in bytes of a scalar/vector type
+ size_t getHostScalarAlignment(const clk_value_type_t) const; // alignment in bytes of a scalar/vector type
+ void align_map(unsigned, unsigned&, unsigned&, int&); // (alignment, outer_hc_size, outer_dc_size, inStruct): assigns offsets and accumulates sizes
+ unsigned int copy_params(void *, const void *, unsigned int&, int&, int&) const; // (dst, src, arg_offset, error_code, inStruct): copies one argument, returns padding added
+
+private:
+};
+
+
+} // namespace cpu
+
+#endif // CPUMAPPING_HPP_
+// Mapping rule
+// Long types are treated with 8 byte alignment in runtime when passed in as arguments
+// but they are treated with 4 byte alignment in compiler
+// Double members have 8 byte alignment when passed as scalar argument
+// but have 4 byte alignment as a field inside a struct
\ No newline at end of file
diff --git a/rocclr/runtime/device/cpu/cpuprogram.cpp b/rocclr/runtime/device/cpu/cpuprogram.cpp
index 1c81261cfa..769e414a44 100644
--- a/rocclr/runtime/device/cpu/cpuprogram.cpp
+++ b/rocclr/runtime/device/cpu/cpuprogram.cpp
@@ -175,25 +175,36 @@ getParamSizeImpl(bool cpuLayer, const clk_parameter_descriptor_t* desc,
if(desc[index].type == T_STRUCT) {
size_t maxAlignment = 0;
size_t structSize = 0;
+ size_t structAlignment = 0;
index++;
while(desc[index].type != T_VOID) {
size_t elementAlignment = 0;
size_t elementSize =
getParamSizeImpl(cpuLayer, desc, index, qualifier,
&elementAlignment, index_out);
+ if (desc[index].type == T_LONG)
+ structAlignment = cpuLayer? LP64_SWITCH(4, 8) : 8;
+ else
+ structAlignment = std::max(maxAlignment, elementAlignment);
index = *index_out;
structSize =
amd::alignUp(structSize,
std::min(elementAlignment, size_t(16))) +
elementSize;
- maxAlignment = std::max(maxAlignment, elementAlignment);
+ maxAlignment = std::max(maxAlignment, structAlignment);
}
*index_out = index + 1;
*alignment = maxAlignment;
size = amd::alignUp(structSize, std::min(maxAlignment, size_t(16)));
} else {
size = getScalarParamSize(cpuLayer, desc[index].type, qualifier);
- *alignment = size;
+ if (desc[index].type == T_DOUBLE) {
+ *alignment = LP64_SWITCH(4, 8);
+ } else if (desc[index].type == T_LONG) {
+ *alignment = 8;
+ } else {
+ *alignment = size;
+ }
*index_out = index + 1;
}
return size;
@@ -204,8 +215,8 @@ getParamSize(bool cpuLayer, const clk_parameter_descriptor_t* desc,
cl_kernel_arg_address_qualifier qualifier,
size_t* alignment)
{
- unsigned index_out = 0;
- return getParamSizeImpl(cpuLayer, desc, 0, qualifier, alignment,
+ unsigned index_out = 0;
+ return getParamSizeImpl(cpuLayer, desc, 0, qualifier, alignment,
&index_out);
}
@@ -335,13 +346,32 @@ setKernelInfoCallback(std::string symbol, const void* value, void* data)
getParamSize(true, desc, param.addressQualifier_, &cpuAlignment);
kernel->addArg(cpuSize, cpuAlignment);
+        //Init for HCtoDCmap
+        // The map is only used for copying when CPU_USE_ALIGNMENT_MAP is 0, so it is only
+        // built then. The kernel stores copies, so the top-level map can live on the stack;
+        // child maps allocated by compute_map() stay alive because those copies point at them.
+        if (CPU_USE_ALIGNMENT_MAP == 0) {
+            unsigned int init_offset = 0;
+            unsigned int align = 0;
+            int inStruct = 0;
+            int end_index = 0;
+            HCtoDCmap top_map(desc, align, 0, init_offset);
+            top_map.dc_size = top_map.compute_map(desc, top_map.map_alignment, init_offset, inStruct, end_index);
+            top_map.align_map(top_map.map_alignment, top_map.hc_size, top_map.dc_size, inStruct);
+            kernel->addHCtoDCmap(&top_map);
+            if (top_map.internal_field_map != NULL) {
+                kernel->addInternalMap(top_map.internal_field_map);
+            }
+        }
+        //End of HCtoDCmap
+
desc = next_desc;
params.push_back(param);
size_t size = param.size_ == 0 ? sizeof(cl_mem) : param.size_;
#if defined(USE_NATIVE_ABI)
size = amd::alignUp(size, sizeof(size_t));
#endif // USE_NATIVE_ABI
- offset = param.offset_ + size;
+ offset = param.offset_ + size;
}
// retrieve vector type hint metadata
diff --git a/rocclr/runtime/utils/flags.hpp b/rocclr/runtime/utils/flags.hpp
index 5cf5048099..1abe5feca2 100644
--- a/rocclr/runtime/utils/flags.hpp
+++ b/rocclr/runtime/utils/flags.hpp
@@ -24,6 +24,8 @@ release(size_t, CPU_WORKER_THREAD_STACK_SIZE, 64*Ki, \
"The default CPU worker thread stack size") \
release(int, CPU_MAX_COMPUTE_UNITS, -1, \
"Override the number of computation units per CPU device") \
+debug(bool, CPU_USE_ALIGNMENT_MAP, false, \
+ "Use flag to enable alignment mapping for parameters for CPU") \
release(int, GPU_MAX_WORKGROUP_SIZE, 0, \
"Maximum number of workitems in a workgroup for GPU, 0 -use default") \
release(int, GPU_MAX_WORKGROUP_SIZE_2D_X, 0, \