Add HIP Sample 2_Cookbook/17_llvm_ir_to_executable
HIP supports compiling kernels from LLVM IR into executable.
The device LLVM IR needs to be compiled into a fat binary
object. This device object is embedded into a host object using
llvm-mc directives. Then, any host linker may link the host and
device objects together into an executable. A README was added.
Change-Id: I8ebb6ae86b7ab4290f7cba2eea5584d73a7c453e
[ROCm/hip-tests commit: 301d6d8f00]
Цей коміт міститься в:
зафіксовано
Aaron En Ye Shi
джерело
c24e1ee404
коміт
abbca4fc42
@@ -0,0 +1,64 @@
|
||||
HIP_PATH?= $(wildcard /opt/rocm/hip)
|
||||
ifeq (,$(HIP_PATH))
|
||||
HIP_PATH=../../..
|
||||
endif
|
||||
|
||||
HIPCC=$(HIP_PATH)/bin/hipcc
|
||||
CLANG=$(HIP_PATH)/../llvm/bin/clang
|
||||
LLVM_MC=$(HIP_PATH)/../llvm/bin/llvm-mc
|
||||
CLANG_OFFLOAD_BUNDLER=$(HIP_PATH)/../llvm/bin/clang-offload-bundler
|
||||
LLVM_AS=$(HIP_PATH)/../llvm/bin/llvm-as
|
||||
LLVM_DIS=$(HIP_PATH)/../llvm/bin/llvm-dis
|
||||
|
||||
SRCS=square.cpp
|
||||
|
||||
# Extracting the IR code, then creating an executable with the modified IR.
|
||||
|
||||
SQ_HOST_BC=square_host.bc
|
||||
SQ_HOST_LL=square_host.ll
|
||||
SQ_HOST_OBJ=square_host.o
|
||||
SQ_DEVICE_OBJ=square_device.o
|
||||
SQ_DEVICE_HIPFB=offload_bundle.hipfb
|
||||
SQ_IR_EXE=square_ir.out
|
||||
|
||||
MCIN_OBJ_GEN=hip_obj_gen.mcin
|
||||
GPU_ARCH1=gfx900
|
||||
GPU_ARCH2=gfx906
|
||||
|
||||
.PHONY: test
|
||||
|
||||
all: src_to_ir bc_to_ll ll_to_bc ir_to_exec
|
||||
|
||||
src_to_ir:
|
||||
$(HIPCC) -c -emit-llvm --cuda-host-only -target x86_64-linux-gnu -o $(SQ_HOST_BC) $(SRCS)
|
||||
$(HIPCC) -c -emit-llvm --cuda-device-only --offload-arch=$(GPU_ARCH1) --offload-arch=$(GPU_ARCH2) $(SRCS)
|
||||
|
||||
# By default, the LLVM IR Bitcode file names will be:
|
||||
# square-hip-amdgcn-amd-amdhsa-gfx900.bc
|
||||
# square-hip-amdgcn-amd-amdhsa-gfx906.bc
|
||||
|
||||
bc_to_ll:
|
||||
$(LLVM_DIS) $(SQ_HOST_BC) -o $(SQ_HOST_LL)
|
||||
$(LLVM_DIS) square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH1).bc -o square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH1).ll
|
||||
$(LLVM_DIS) square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH2).bc -o square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH2).ll
|
||||
|
||||
# You may modify the .ll LLVM IR files before the next step
|
||||
#
|
||||
# Note: hipcc does not work to convert .bc to .o, use clang instead.
|
||||
|
||||
ll_to_bc:
|
||||
$(LLVM_AS) $(SQ_HOST_LL) -o $(SQ_HOST_BC)
|
||||
$(LLVM_AS) square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH1).ll -o square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH1).bc
|
||||
$(LLVM_AS) square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH2).ll -o square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH2).bc
|
||||
|
||||
ir_to_exec:
|
||||
$(HIPCC) -c $(SQ_HOST_BC) -o $(SQ_HOST_OBJ)
|
||||
$(CLANG) -target amdgcn-amd-amdhsa -mcpu=$(GPU_ARCH1) square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH1).bc -o square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH1).o
|
||||
$(CLANG) -target amdgcn-amd-amdhsa -mcpu=$(GPU_ARCH2) square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH2).bc -o square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH2).o
|
||||
$(CLANG_OFFLOAD_BUNDLER) -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hip-amdgcn-amd-amdhsa-$(GPU_ARCH1),hip-amdgcn-amd-amdhsa-$(GPU_ARCH2) -inputs=/dev/null,square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH1).o,square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH2).o -outputs=$(SQ_DEVICE_HIPFB)
|
||||
$(LLVM_MC) $(MCIN_OBJ_GEN) -o $(SQ_DEVICE_OBJ) --filetype=obj
|
||||
$(HIPCC) $(SQ_HOST_OBJ) $(SQ_DEVICE_OBJ) -o $(SQ_IR_EXE)
|
||||
|
||||
clean:
|
||||
rm -f *.o *.out *.hipfb *.s *.ll *.bc
|
||||
|
||||
@@ -0,0 +1,67 @@
|
||||
# Compile to LLVM IR and create an executable from modified IR
|
||||
|
||||
This sample shows how to generate the LLVM IR for a simple HIP source application, then re-compiling it and generating a valid HIP executable.
|
||||
|
||||
This sample uses a previous HIP application sample, please see [0_Intro/square](https://github.com/ROCm-Developer-Tools/HIP/blob/master/samples/0_Intro/square).
|
||||
|
||||
## Compiling the HIP source into LLVM IR
|
||||
Using HIP flags `-c -emit-llvm` will help generate the host x86_64 and the device LLVM bitcode when paired with `--cuda-host-only` and `--cuda-device-only` respectively. In this sample we use these commands:
|
||||
```
|
||||
/opt/rocm/hip/bin/hipcc -c -emit-llvm --cuda-host-only -target x86_64-linux-gnu -o square_host.bc square.cpp
|
||||
/opt/rocm/hip/bin/hipcc -c -emit-llvm --cuda-device-only --offload-arch=gfx900 --offload-arch=gfx906 square.cpp
|
||||
```
|
||||
The device LLVM IR bitcode will be output into two separate files:
|
||||
- square-hip-amdgcn-amd-amdhsa-gfx900.bc
|
||||
- square-hip-amdgcn-amd-amdhsa-gfx906.bc
|
||||
|
||||
You may modify `--offload-arch` flag to build other archs and choose to enable or disable xnack and sram-ecc.
|
||||
|
||||
To transform the LLVM bitcode into human readable LLVM IR, use these commands:
|
||||
```
|
||||
/opt/rocm/llvm/bin/llvm-dis square-hip-amdgcn-amd-amdhsa-gfx900.bc -o square-hip-amdgcn-amd-amdhsa-gfx900.ll
|
||||
/opt/rocm/llvm/bin/llvm-dis square-hip-amdgcn-amd-amdhsa-gfx906.bc -o square-hip-amdgcn-amd-amdhsa-gfx906.ll
|
||||
```
|
||||
|
||||
**Warning:** We cannot ensure any compiler besides the ROCm hipcc and clang will be compatible with this process. Also, there is no guarantee that the starting IR produced with `-x cl` will run with HIP runtime. Experimenting with other compilers or starting IR will be the responsibility of the developer.
|
||||
|
||||
## Modifying the LLVM IR
|
||||
***Warning: The LLVM Language Specification may change across LLVM major releases, therefore the user must make sure the modified LLVM IR conforms to the LLVM Language Specification corresponding to the used LLVM version.***
|
||||
|
||||
At this point, you may evaluate the LLVM IR and make modifications if you are familiar with the LLVM IR language. Since the LLVM IR can vary between compiler versions, the safest approach would be to use the same compiler to consume the IR as the compiler producing it. It is the responsibility of the developer to ensure the IR is valid when manually modifying it.
|
||||
|
||||
## Compiling the LLVM IR into a valid HIP executable
|
||||
If valid, the modified host and device IR may be compiled into a HIP executable. First, the readable IR must be compiled back in LLVM bitcode. The host IR can be compiled into an object using this command:
|
||||
```
|
||||
/opt/rocm/llvm/bin/llvm-as square_host.ll -o square_host.bc
|
||||
/opt/rocm/hip/bin/hipcc -c square_host.bc -o square_host.o
|
||||
```
|
||||
|
||||
However, the device IR will require a few extra steps. The device bitcodes needs to be compiled into device objects, then offload-bundled into a HIP fat binary using the clang-offload-bundler, then llvm-mc embeds the binary inside of a host object using the MC directives provided in `hip_obj_gen.mcin`. The output is a host object with an embedded device object. Here are the steps for device side compilation into an object:
|
||||
```
|
||||
/opt/rocm/hip/../llvm/bin/llvm-as square-hip-amdgcn-amd-amdhsa-gfx900.ll -o square-hip-amdgcn-amd-amdhsa-gfx900.bc
|
||||
/opt/rocm/hip/../llvm/bin/llvm-as square-hip-amdgcn-amd-amdhsa-gfx906.ll -o square-hip-amdgcn-amd-amdhsa-gfx906.bc
|
||||
/opt/rocm/hip/../llvm/bin/clang -target amdgcn-amd-amdhsa -mcpu=gfx900 square-hip-amdgcn-amd-amdhsa-gfx900.bc -o square-hip-amdgcn-amd-amdhsa-gfx900.o
|
||||
/opt/rocm/hip/../llvm/bin/clang -target amdgcn-amd-amdhsa -mcpu=gfx906 square-hip-amdgcn-amd-amdhsa-gfx906.bc -o square-hip-amdgcn-amd-amdhsa-gfx906.o
|
||||
/opt/rocm/hip/../llvm/bin/clang-offload-bundler -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hip-amdgcn-amd-amdhsa-gfx900,hip-amdgcn-amd-amdhsa-gfx906 -inputs=/dev/null,square-hip-amdgcn-amd-amdhsa-gfx900.o,square-hip-amdgcn-amd-amdhsa-gfx906.o -outputs=offload_bundle.hipfb
|
||||
/opt/rocm/llvm/bin/llvm-mc hip_obj_gen.mcin -o square_device.o --filetype=obj
|
||||
```
|
||||
|
||||
**Note:** Using option `-bundle-align=4096` only works on ROCm 4.0 and newer compilers. Also, the architecture must match the same arch as when compiling to LLVM IR.
|
||||
|
||||
Finally, using the system linker, hipcc, or clang, link the host and device objects into an executable:
|
||||
```
|
||||
/opt/rocm/hip/bin/hipcc square_host.o square_device.o -o square_ir.out
|
||||
```
|
||||
If you haven't modified the GPU archs, this executable should run on both `gfx900` and `gfx906`.
|
||||
|
||||
## How to build and run this sample:
|
||||
Use these make commands to compile into LLVM IR, compile IR into executable, and execute it.
|
||||
- To compile the HIP application into host and device LLVM IR: `make src_to_ir`.
|
||||
- To disassembly the LLVM IR bitcode into human readable LLVM IR: `make bc_to_ll`.
|
||||
- To assembly the human readable LLVM IR bitcode back into LLVM IR bitcode: `make ll_to_bc`.
|
||||
- To compile the LLVM IR files into an executable: `make ir_to_exec`.
|
||||
- To execute, run `./square_ir.out`.
|
||||
|
||||
**Note:** The default arch is `gfx900` and `gfx906`, this can be modified with make argument `GPU_ARCH1` and `GPU_ARCH2`.
|
||||
|
||||
## For More Information, please refer to the HIP FAQ.
|
||||
+20
@@ -0,0 +1,20 @@
|
||||
# HIP Object Generator
|
||||
# Use this generator to create a host bundled object file
|
||||
# with the input of an offload bundled fat binary.
|
||||
#
|
||||
# Input: Bundled Object file .hipfb file
|
||||
# Output: Host Bundled Object File .o
|
||||
#
|
||||
# Add MC directives to embed target binaries. We ensure that each
|
||||
# section and image is 4096-byte aligned. This facilitates faster
|
||||
# loading of device binaries. It has been verified this align does
|
||||
# not cause significant overall file size increase.
|
||||
#
|
||||
# Note: log 2 of 4096 is 12.
|
||||
#
|
||||
.type __hip_fatbin,@object
|
||||
.section .hip_fatbin,"a",@progbits
|
||||
.globl __hip_fatbin
|
||||
.p2align 12
|
||||
__hip_fatbin:
|
||||
.incbin "offload_bundle.hipfb"
|
||||
@@ -0,0 +1,96 @@
|
||||
/*
|
||||
Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#define CHECK(cmd) \
|
||||
{\
|
||||
hipError_t error = cmd;\
|
||||
if (error != hipSuccess) { \
|
||||
fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error,__FILE__, __LINE__); \
|
||||
exit(EXIT_FAILURE);\
|
||||
}\
|
||||
}
|
||||
|
||||
/* This kernel is a placeholder for the kernel in LLVM IR generated by this
|
||||
* sample. It will be replaced by the kernel in LLVM IR.
|
||||
*
|
||||
* Square each element in the array A and write to array C.
|
||||
*/
|
||||
template <typename T>
|
||||
__global__ void
|
||||
vector_square(T *C_d, T *A_d, size_t N)
|
||||
{
|
||||
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
size_t stride = blockDim.x * gridDim.x ;
|
||||
|
||||
for (size_t i=offset; i<N; i+=stride) {
|
||||
C_d[i] = A_d[i] * A_d[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
float *A_d, *C_d;
|
||||
float *A_h, *C_h;
|
||||
size_t N = 1000000;
|
||||
size_t Nbytes = N * sizeof(float);
|
||||
|
||||
hipDeviceProp_t props;
|
||||
CHECK(hipGetDeviceProperties(&props, 0/*deviceID*/));
|
||||
printf ("info: running on device %s\n", props.name);
|
||||
|
||||
printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
|
||||
A_h = (float*)malloc(Nbytes);
|
||||
CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
|
||||
C_h = (float*)malloc(Nbytes);
|
||||
CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
|
||||
// Fill with Phi + i
|
||||
for (size_t i=0; i<N; i++)
|
||||
{
|
||||
A_h[i] = 1.618f + i;
|
||||
}
|
||||
|
||||
printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
|
||||
CHECK(hipMalloc(&A_d, Nbytes));
|
||||
CHECK(hipMalloc(&C_d, Nbytes));
|
||||
|
||||
|
||||
printf ("info: copy Host2Device\n");
|
||||
CHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
|
||||
|
||||
const unsigned blocks = 512;
|
||||
const unsigned threadsPerBlock = 256;
|
||||
|
||||
printf ("info: launch 'vector_square' kernel\n");
|
||||
vector_square <<<blocks, threadsPerBlock>>> (C_d, A_d, N);
|
||||
|
||||
printf ("info: copy Device2Host\n");
|
||||
CHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
|
||||
printf ("info: check result\n");
|
||||
for (size_t i=0; i<N; i++) {
|
||||
if (C_h[i] != A_h[i] * A_h[i]) {
|
||||
CHECK(hipErrorUnknown);
|
||||
}
|
||||
}
|
||||
printf ("PASSED!\n");
|
||||
}
|
||||
Посилання в новій задачі
Заблокувати користувача