diff --git a/projects/clr/rocclr/runtime/device/cpu/cpumapping.cpp b/projects/clr/rocclr/runtime/device/cpu/cpumapping.cpp index 8b21cc4276..2d86272d43 100644 --- a/projects/clr/rocclr/runtime/device/cpu/cpumapping.cpp +++ b/projects/clr/rocclr/runtime/device/cpu/cpumapping.cpp @@ -30,7 +30,8 @@ namespace cpu { hc_size = 0; dc_offset = 0; dc_size = 0; - map_alignment = level_alignment; + hc_alignment = level_alignment; + dc_alignment = level_alignment; internal_field_map = NULL; next_field_map = NULL; return; @@ -91,7 +92,7 @@ namespace cpu { return size; } - size_t HCtoDCmap::getHostScalarAlignment(const clk_value_type_t type) const + size_t HCtoDCmap::getScalarAlignment(const clk_value_type_t type, bool isHost) const { size_t align = 0; switch (type) { @@ -109,10 +110,18 @@ namespace cpu { align = sizeof(uint32_t); break; case T_LONG: - align = LP64_SWITCH(4, 8); + #if defined(_WIN32) + align = 8; + #else + align = isHost? 8 : LP64_SWITCH(4, 8); + #endif break; case T_DOUBLE: + #if defined(_WIN32) + align = 8; + #else align = LP64_SWITCH(4, 8); + #endif break; case T_CHAR8: case T_SHORT4: case T_INT2: case T_FLOAT2: @@ -142,24 +151,28 @@ namespace cpu { // Align up arguments within each map, return the size of current map parameter // Input current alignment of the parameter, size of outer struct if it exists - void HCtoDCmap::align_map(unsigned alignment, unsigned &outer_hc_size, unsigned &outer_dc_size, int &inStruct) + void HCtoDCmap::align_map(unsigned outer_hc_alignment, unsigned outer_dc_alignment, unsigned &outer_hc_size, unsigned &outer_dc_size, int &inStruct) { unsigned map_param_size = 0; if (internal_field_map != NULL) { hc_size = 0; //Recalculate size to account for internal offsets inStruct++; - internal_field_map->align_map(map_alignment, hc_size, dc_size, inStruct); // align internal struct, might alter size of this struct + internal_field_map->align_map(hc_alignment, dc_alignment, hc_size, dc_size, inStruct); // align internal struct, might alter size of this struct + if (hc_alignment != 1 && hc_size%hc_alignment) + hc_size = max(hc_size, hc_size - (hc_size%hc_alignment) + hc_alignment); + if (dc_alignment != 1 && dc_size%dc_alignment) + dc_size = max(dc_size, dc_size - (dc_size%dc_alignment) + dc_alignment); } // Use map_param_size to store current parameter size after adjusting alignment - if (alignment != 1 && hc_size % alignment != 0) { - map_param_size = max(alignment, hc_size - (hc_size%alignment) + alignment); + if (hc_alignment != 1 && hc_size % hc_alignment != 0) { + map_param_size = max(hc_alignment, hc_size - (hc_size%hc_alignment) + hc_alignment); } else { - map_param_size = max(alignment, hc_size); + map_param_size = max(hc_alignment, hc_size); } if (next_field_map != NULL) { next_field_map->hc_offset = this->next_offset(hc_offset, map_param_size, inStruct); - next_field_map->align_map(alignment, outer_hc_size, outer_dc_size, inStruct); + next_field_map->align_map(outer_hc_alignment, outer_dc_alignment, outer_hc_size, outer_dc_size, inStruct); // Reset parameter size for char padding if (next_field_map->type == T_CHAR) map_param_size = 1; @@ -179,7 +192,7 @@ namespace cpu { // Return current size of map, calculate internal maps and process next args if in struct. // Alignment: alignment flag for members in case of structs, alignment of scalar otherwise. - int HCtoDCmap::compute_map(const clk_parameter_descriptor_t* desc, unsigned int &alignment, unsigned int init_offset, int& inStruct, int& index_out) + int HCtoDCmap::compute_map(const clk_parameter_descriptor_t* desc, unsigned int &outer_hc_alignment, unsigned int &outer_dc_alignment, unsigned int init_offset, int& inStruct, int& index_out) { unsigned internal_index; internal_index = index_out; @@ -195,11 +208,11 @@ namespace cpu { index_out++; internal_index = index_out; internal_field_map = new HCtoDCmap(desc, 0, internal_index, init_offset); - hc_size = internal_field_map->compute_map(desc, map_alignment, next_offset, inStruct, index_out); - map_alignment = max(map_alignment, internal_field_map->map_alignment); // Adjust alignment to biggest member alignment + hc_size = internal_field_map->compute_map(desc, hc_alignment, dc_alignment, next_offset, inStruct, index_out); + hc_alignment = max(hc_alignment, internal_field_map->hc_alignment); // Adjust alignment to biggest member alignment struct_size = hc_size; internal_index = index_out; - alignment = max(alignment, map_alignment); + outer_hc_alignment = max(outer_hc_alignment, hc_alignment); if (inStruct > 0) { if (desc[index_out+1].type != T_VOID) { //Still inside struct and not done @@ -207,8 +220,8 @@ namespace cpu { internal_index = index_out; next_field_map = new HCtoDCmap(desc, 0, internal_index, next_offset); struct_size = hc_size; - struct_size += next_field_map->compute_map(desc, alignment, next_offset, inStruct, index_out); - next_offset = max(next_field_map->hc_offset+next_field_map->hc_size, next_field_map->hc_offset+alignment); + struct_size += next_field_map->compute_map(desc, outer_hc_alignment, outer_dc_alignment, next_offset, inStruct, index_out); + next_offset = max(next_field_map->hc_offset+next_field_map->hc_size, next_field_map->hc_offset+hc_alignment); // running count of strucdc_size = hc_size + size of next member return struct_size; } @@ -227,19 +240,20 @@ namespace cpu { hc_offset = init_offset; hc_size = getHostScalarParamSize(desc[internal_index].type); dc_size = hc_size; - map_alignment = getHostScalarAlignment(desc[internal_index].type); - alignment = max(alignment, map_alignment); //Adjust alignment of upper level struct if necessary, upper level alignment = max alignment of members - if (desc[internal_index].type == T_LONG) - alignment = max(alignment, (unsigned int)8); //Set struct alignment to 8 on outside if containing struct member of long + hc_alignment = getScalarAlignment(desc[internal_index].type, true); + dc_alignment = getScalarAlignment(desc[internal_index].type, false); + outer_hc_alignment = max(outer_hc_alignment, hc_alignment); //Adjust alignment of upper level struct if necessary, upper level alignment = max alignment of members + outer_dc_alignment = max(outer_dc_alignment, dc_alignment); //Adjust alignment of upper level struct if necessary, upper level alignment = max alignment of members if (inStruct > 0) { if (desc[index_out+1].type != T_VOID) { //Still inside struct and not done index_out++; - next_field_map = new HCtoDCmap(desc, alignment, internal_index, next_offset); + next_field_map = new HCtoDCmap(desc, outer_hc_alignment, internal_index, next_offset); struct_size = hc_size; - struct_size += next_field_map->compute_map(desc, alignment, next_offset, inStruct, index_out); - next_offset = hc_offset+alignment; - alignment = max(alignment, next_field_map->map_alignment); + struct_size += next_field_map->compute_map(desc, outer_hc_alignment, outer_dc_alignment, next_offset, inStruct, index_out); + next_offset = hc_offset+hc_alignment; + outer_hc_alignment = max(outer_hc_alignment, next_field_map->hc_alignment); + outer_dc_alignment = max(outer_dc_alignment, next_field_map->dc_alignment); // running count of strucdc_size = hc_size + size of next member return struct_size; } @@ -268,35 +282,18 @@ namespace cpu { next_field_map->dc_offset = dc_offset + dc_size; next_offset = current_offset + hc_size; } - // else { - if (this->next_field_map->type == T_LONG) { - if (dc_size % 4 != 0) { - this->next_field_map->dc_offset = dc_offset + dc_size - (dc_size % 4) + 4; // T_LONG aligned by 4 in target - } - else { - this->next_field_map->dc_offset = dc_offset + dc_size; // T_LONG aligned by 4 in target - } - if (dc_size % 8 != 0) { - next_offset = current_offset + dc_size - (dc_size % 8) + 8; //aligned by 8 in source - } - else { - next_offset = current_offset + dc_size; //aligned by 8 in source - } + if ((dc_offset + dc_size) % next_field_map->dc_alignment != 0) { + this->next_field_map->dc_offset = dc_offset + dc_size - (dc_size % next_field_map->dc_alignment) + next_field_map->dc_alignment; } else { - if ((dc_offset + dc_size) % next_field_map->map_alignment != 0) { - this->next_field_map->dc_offset = dc_offset + dc_size - (dc_size % next_field_map->map_alignment) + next_field_map->map_alignment; - } - else { - this->next_field_map->dc_offset = dc_offset + max(dc_size, next_field_map->map_alignment); - } - if ((hc_offset + hc_size) % next_field_map->map_alignment != 0) { - next_offset = hc_offset + hc_size - (hc_size % next_field_map->map_alignment) + next_field_map->map_alignment; - } - else { - next_offset = hc_offset + max(next_field_map->map_alignment, map_param_size); - } + this->next_field_map->dc_offset = dc_offset + max(dc_size, next_field_map->dc_alignment); + } + if ((hc_offset + hc_size) % next_field_map->hc_alignment != 0) { + next_offset = hc_offset + hc_size - (hc_size % next_field_map->hc_alignment) + next_field_map->hc_alignment; + } + else { + next_offset = hc_offset + max(next_field_map->hc_alignment, map_param_size); } } return next_offset; @@ -304,13 +301,31 @@ namespace cpu { } // Copy memory according to mapping - unsigned int HCtoDCmap::copy_params(void *dst, const void *src, unsigned int &arg_offset, int& error_code, int &inStruct) const + unsigned int HCtoDCmap::copy_params(void *dst, const void *src, unsigned int arg_offset, int& error_code, int &inStruct) const { unsigned int padding = 0; // Pad offset to be aligned by 8 if parameter is double, not as struct field - if ((arg_offset+dc_offset) % 8 != 0 && (type == T_DOUBLE) && inStruct == 0) - padding = map_alignment-((arg_offset+dc_offset)%map_alignment); + if ((arg_offset) % 8 != 0 && (type == T_DOUBLE) && inStruct == 0) + padding = hc_alignment-((arg_offset+dc_offset)%hc_alignment); + #if defined(_WIN32) + // In windows, double is aligned by 8, add padding to struct if it contains double + if ((arg_offset+dc_offset) % 8 != 0 && hc_alignment == 8) + padding = hc_alignment-((arg_offset+dc_offset)%hc_alignment); + #endif ::memcpy(reinterpret_cast(reinterpret_cast(dst)+padding), src, hc_size); + #if defined(_WIN32) + if (internal_field_map != NULL) { + inStruct++; + void *internal_dst = reinterpret_cast(reinterpret_cast(dst)+padding); + internal_field_map->copy_params(internal_dst, src, arg_offset+padding, error_code, inStruct); + inStruct--; + } + if (next_field_map != NULL) { + void *next_dst = reinterpret_cast(reinterpret_cast(dst)+next_field_map->dc_offset); // Next field starts with padding + const void *next_src = reinterpret_cast(reinterpret_cast(src)+next_field_map->hc_offset); + next_field_map->copy_params(next_dst, next_src, arg_offset+next_field_map->dc_offset, error_code, inStruct); + } + #else if (internal_field_map != NULL) { inStruct++; internal_field_map->copy_params(dst, src, arg_offset, error_code, inStruct); @@ -321,6 +336,8 @@ namespace cpu { const void *next_src = reinterpret_cast(reinterpret_cast(src)+next_field_map->hc_offset); next_field_map->copy_params(next_dst, next_src, arg_offset, error_code, inStruct); } + #endif return padding; } + } //namespace cpu \ No newline at end of file diff --git a/projects/clr/rocclr/runtime/device/cpu/cpumapping.hpp b/projects/clr/rocclr/runtime/device/cpu/cpumapping.hpp index 105b76bde7..263d22a63e 100644 --- a/projects/clr/rocclr/runtime/device/cpu/cpumapping.hpp +++ b/projects/clr/rocclr/runtime/device/cpu/cpumapping.hpp @@ -18,19 +18,20 @@ class HCtoDCmap public: unsigned int hc_offset, hc_size; // Offset and size of this parameter in host compiler unsigned int dc_offset, dc_size; // Offset and size of this parameter in device compiler - unsigned int map_alignment; // Alignment of parameter in host compiler + unsigned int hc_alignment; // Alignment of parameter in host compiler + unsigned int dc_alignment; // Alignment of parameter in device compiler clk_value_type_t type; // Type of parameter HCtoDCmap *internal_field_map; // Pointer to internal mapping when current parameter is of type T_STRUCT HCtoDCmap *next_field_map; // Pointer to next struct field when current parameter is a struct member HCtoDCmap(const clk_parameter_descriptor_t*, unsigned int, unsigned int, unsigned int); virtual ~HCtoDCmap(); - int compute_map(const clk_parameter_descriptor_t*, unsigned int &, unsigned int, int&, int&); + int compute_map(const clk_parameter_descriptor_t*, unsigned int &, unsigned int &, unsigned int, int&, int&); unsigned next_offset(unsigned, unsigned &, int &); size_t getHostScalarParamSize(const clk_value_type_t) const; - size_t getHostScalarAlignment(const clk_value_type_t) const; - void align_map(unsigned, unsigned&, unsigned&, int&); - unsigned int copy_params(void *, const void *, unsigned int&, int&, int&) const; + size_t getScalarAlignment(const clk_value_type_t, bool) const; + void align_map(unsigned, unsigned, unsigned&, unsigned&, int&); + unsigned int copy_params(void *, const void *, unsigned int, int&, int&) const; private: }; diff --git a/projects/clr/rocclr/runtime/device/cpu/cpuprogram.cpp b/projects/clr/rocclr/runtime/device/cpu/cpuprogram.cpp index 769e414a44..0fc539d6d0 100644 --- a/projects/clr/rocclr/runtime/device/cpu/cpuprogram.cpp +++ b/projects/clr/rocclr/runtime/device/cpu/cpuprogram.cpp @@ -182,16 +182,22 @@ getParamSizeImpl(bool cpuLayer, const clk_parameter_descriptor_t* desc, size_t elementSize = getParamSizeImpl(cpuLayer, desc, index, qualifier, &elementAlignment, index_out); - if (desc[index].type == T_LONG) - structAlignment = cpuLayer? LP64_SWITCH(4, 8) : 8; - else - structAlignment = std::max(maxAlignment, elementAlignment); + #if defined(_WIN32) + maxAlignment = std::max(maxAlignment, elementAlignment); + #else + // In Linux, the alignment of long field is 4 for GCC, + // but it is 8 on LLVM side + if (desc[index].type == T_LONG) + structAlignment = cpuLayer? LP64_SWITCH(4, 8) : 8; + else + structAlignment = std::max(maxAlignment, elementAlignment); + maxAlignment = std::max(maxAlignment, structAlignment); + #endif index = *index_out; structSize = amd::alignUp(structSize, std::min(elementAlignment, size_t(16))) + elementSize; - maxAlignment = std::max(maxAlignment, structAlignment); } *index_out = index + 1; *alignment = maxAlignment; @@ -199,7 +205,11 @@ getParamSizeImpl(bool cpuLayer, const clk_parameter_descriptor_t* desc, } else { size = getScalarParamSize(cpuLayer, desc[index].type, qualifier); if (desc[index].type == T_DOUBLE) { + #if defined(_WIN32) + *alignment = 8; + #else *alignment = LP64_SWITCH(4, 8); + #endif } else if (desc[index].type == T_LONG) { *alignment = 8; } else { @@ -352,8 +362,8 @@ setKernelInfoCallback(std::string symbol, const void* value, void* data) int inStruct = 0; int end_index = 0; HCtoDCmap *map_p = new HCtoDCmap(desc, align, 0, init_offset); - map_p->dc_size = map_p->compute_map(desc, map_p->map_alignment, init_offset, inStruct, end_index); - map_p->align_map(map_p->map_alignment, map_p->hc_size, map_p->dc_size, inStruct); + map_p->dc_size = map_p->compute_map(desc, map_p->hc_alignment, map_p->dc_alignment, init_offset, inStruct, end_index); + map_p->align_map(map_p->hc_alignment, map_p->dc_alignment, map_p->hc_size, map_p->dc_size, inStruct); if (CPU_USE_ALIGNMENT_MAP == 0) { kernel->addHCtoDCmap(map_p); if (map_p->internal_field_map != NULL) {