diff --git a/samples/2_Cookbook/16_assembly_to_executable/Makefile b/samples/2_Cookbook/16_assembly_to_executable/Makefile new file mode 100644 index 0000000000..2de3efc656 --- /dev/null +++ b/samples/2_Cookbook/16_assembly_to_executable/Makefile @@ -0,0 +1,50 @@ +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. +endif + +HIPCC=$(HIP_PATH)/bin/hipcc +CLANG=$(HIP_PATH)/../llvm/bin/clang +LLVM_MC=$(HIP_PATH)/../llvm/bin/llvm-mc +CLANG_OFFLOAD_BUNDLER=$(HIP_PATH)/../llvm/bin/clang-offload-bundler + +SRCS=square.cpp + +# Extracting ASM code, then creating an executable with the modified asm. + +SQ_HOST_ASM=square_host.s +SQ_HOST_OBJ=square_host.o +SQ_DEVICE_HIPFB=offload_bundle.hipfb +SQ_DEVICE_OBJ=square_device.o +SQ_ASM_EXE=square_asm.out + +MCIN_OBJ_GEN=hip_obj_gen.mcin +GPU_ARCH1=gfx900 +GPU_ARCH2=gfx906 + +.PHONY: test + +all: src_to_asm asm_to_exec + +src_to_asm: + $(HIPCC) -c -S --cuda-host-only -target x86_64-linux-gnu -o $(SQ_HOST_ASM) $(SRCS) + $(HIPCC) -c -S --cuda-device-only --offload-arch=$(GPU_ARCH1) --offload-arch=$(GPU_ARCH2) $(SRCS) + +# You may modify the .s assembly files before the next step +# By default, their names will be: +# square-hip-amdgcn-amd-amdhsa-gfx900.s +# square-hip-amdgcn-amd-amdhsa-gfx906.s +# +# Note: hipcc does not work to convert .s to .o, use clang instead. + +asm_to_exec: + $(HIPCC) -c $(SQ_HOST_ASM) -o $(SQ_HOST_OBJ) + $(CLANG) -target amdgcn-amd-amdhsa -mcpu=$(GPU_ARCH1) square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH1).s -o square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH1).o + $(CLANG) -target amdgcn-amd-amdhsa -mcpu=$(GPU_ARCH2) square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH2).s -o square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH2).o + $(CLANG_OFFLOAD_BUNDLER) -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hip-amdgcn-amd-amdhsa-$(GPU_ARCH1),hip-amdgcn-amd-amdhsa-$(GPU_ARCH2) -inputs=/dev/null,square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH1).o,square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH2).o -outputs=$(SQ_DEVICE_HIPFB) + $(LLVM_MC) $(MCIN_OBJ_GEN) -o $(SQ_DEVICE_OBJ) --filetype=obj + $(HIPCC) $(SQ_HOST_OBJ) $(SQ_DEVICE_OBJ) -o $(SQ_ASM_EXE) + +clean: + rm -f *.o *.out *.hipfb *.s *.ll *.bc + diff --git a/samples/2_Cookbook/16_assembly_to_executable/README.md b/samples/2_Cookbook/16_assembly_to_executable/README.md new file mode 100644 index 0000000000..ef11329924 --- /dev/null +++ b/samples/2_Cookbook/16_assembly_to_executable/README.md @@ -0,0 +1,52 @@ +# Compile to assembly and create an executable from modified asm + +This sample shows how to generate the assembly code for a simple HIP source application, then re-compiling it and generating a valid HIP executable. + +This sample uses a previous HIP application sample, please see [0_Intro/square](https://github.com/ROCm-Developer-Tools/HIP/blob/master/samples/0_Intro/square). + +## Compiling the HIP source into assembly +Using HIP flags `-c -S` will help generate the host x86_64 and the device AMDGCN assembly code when paired with `--cuda-host-only` and `--cuda-device-only` respectively. In this sample we use these commands: +``` +/opt/rocm/hip/bin/hipcc -c -S --cuda-host-only -target x86_64-linux-gnu -o square_host.s square.cpp +/opt/rocm/hip/bin/hipcc -c -S --cuda-device-only --offload-arch=gfx900 --offload-arch=gfx906 square.cpp +``` + +The device assembly will be output into two separate files: +- square-hip-amdgcn-amd-amdhsa-gfx900.s +- square-hip-amdgcn-amd-amdhsa-gfx906.s + +You may modify `--offload-arch` flag to build other archs and choose to enable or disable xnack and sram-ecc. + +**Note:** At this point, you may evaluate the assembly code, and make modifications if you are familiar with the AMDGCN assembly language and architecture. + +## Compiling the assembly into a valid HIP executable +If valid, the modified host and device assembly may be compiled into a HIP executable. The host assembly can be compiled into an object using this command: +``` +/opt/rocm/hip/bin/hipcc -c square_host.s -o square_host.o +``` + +However, the device assembly code will require a few extra steps. The device assemblies needs to be compiled into device objects, then offload-bundled into a HIP fat binary using the clang-offload-bundler, then llvm-mc embeds the binary inside of a host object using the MC directives provided in `hip_obj_gen.mcin`. The output is a host object with an embedded device object. Here are the steps for device side compilation into an object: +``` +/opt/rocm/hip/../llvm/bin/clang -target amdgcn-amd-amdhsa -mcpu=gfx900 square-hip-amdgcn-amd-amdhsa-gfx900.s -o square-hip-amdgcn-amd-amdhsa-gfx900.o +/opt/rocm/hip/../llvm/bin/clang -target amdgcn-amd-amdhsa -mcpu=gfx906 square-hip-amdgcn-amd-amdhsa-gfx906.s -o square-hip-amdgcn-amd-amdhsa-gfx906.o +/opt/rocm/llvm/bin/clang-offload-bundler -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hip-amdgcn-amd-amdhsa-gfx900,hip-amdgcn-amd-amdhsa-gfx906 -inputs=/dev/null,square-hip-amdgcn-amd-amdhsa-gfx900.o,square-hip-amdgcn-amd-amdhsa-gfx906.o -outputs=offload_bundle.hipfb +/opt/rocm/llvm/bin/llvm-mc -triple x86_64-unknown-linux-gnu hip_obj_gen.mcin -o square_device.o --filetype=obj +``` + +**Note:** Using option `-bundle-align=4096` only works on ROCm 4.0 and newer compilers. Also, the architecture must match the same arch as when compiling to assembly. + +Finally, using the system linker, hipcc, or clang, link the host and device objects into an executable: +``` +/opt/rocm/hip/bin/hipcc square_host.o square_device.o -o square_asm.out +``` +If you haven't modified the GPU archs, this executable should run on both `gfx900` and `gfx906`. + +## How to build and run this sample: +Use these make commands to compile into assembly, compile assembly into executable, and execute it. +- To compile the HIP application into host and device assembly: `make src_to_asm`. +- To compile the assembly files into an executable: `make asm_to_exec`. +- To execute, run `./square_asm.out`. + +**Note:** The default arch is `gfx900` and `gfx906`, this can be modified with make argument `GPU_ARCH1` and `GPU_ARCH2`. + +## For More Information, please refer to the HIP FAQ. diff --git a/samples/2_Cookbook/16_assembly_to_executable/hip_obj_gen.mcin b/samples/2_Cookbook/16_assembly_to_executable/hip_obj_gen.mcin new file mode 100644 index 0000000000..4551e5a1c4 --- /dev/null +++ b/samples/2_Cookbook/16_assembly_to_executable/hip_obj_gen.mcin @@ -0,0 +1,20 @@ +# HIP Object Generator +# Use this generator to create a host bundled object file +# with the input of an offload bundled fat binary. +# +# Input: Bundled Object file .hipfb file +# Output: Host Bundled Object File .o +# +# Add MC directives to embed target binaries. We ensure that each +# section and image is 4096-byte aligned. This facilitates faster +# loading of device binaries. It has been verified this align does +# not cause significant overall file size increase. +# +# Note: log 2 of 4096 is 12. +# + .type __hip_fatbin,@object + .section .hip_fatbin,"a",@progbits + .globl __hip_fatbin + .p2align 12 +__hip_fatbin: + .incbin "offload_bundle.hipfb" diff --git a/samples/2_Cookbook/16_assembly_to_executable/square.cpp b/samples/2_Cookbook/16_assembly_to_executable/square.cpp new file mode 100644 index 0000000000..6f591092d0 --- /dev/null +++ b/samples/2_Cookbook/16_assembly_to_executable/square.cpp @@ -0,0 +1,96 @@ +/* +Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include + +#define CHECK(cmd) \ +{\ + hipError_t error = cmd;\ + if (error != hipSuccess) { \ + fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error,__FILE__, __LINE__); \ + exit(EXIT_FAILURE);\ + }\ +} + +/* This kernel is a placeholder for the kernel in assembly generated by this + * sample. It will be replaced by the kernel in assembly. + * + * Square each element in the array A and write to array C. + */ +template +__global__ void +vector_square(T *C_d, T *A_d, size_t N) +{ + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x ; + + for (size_t i=offset; i>> (C_d, A_d, N); + + printf ("info: copy Device2Host\n"); + CHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); + + printf ("info: check result\n"); + for (size_t i=0; i