Add HIP Sample 2_Cookbook/16_assembly_to_executable

HIP supports compiling kernels from assembly into exec.
The device assembly needs to be compiled into a fat binary
object. This device object is embedded into a host object using
llvm-mc directives. Then, any host linker may link the host and
device objects together into an executable. A README is added.

Change-Id: I59d3a8b5363073810ffc3aa0d57f21b0df272369
Этот коммит содержится в:
Aaron En Ye Shi
2020-11-20 00:31:00 +00:00
коммит произвёл Aaron En Ye Shi
родитель 0855d7158c
Коммит 96c330c1af
4 изменённых файлов: 218 добавлений и 0 удалений
+50
Просмотреть файл
@@ -0,0 +1,50 @@
HIP_PATH?= $(wildcard /opt/rocm/hip)
ifeq (,$(HIP_PATH))
HIP_PATH=../../..
endif
HIPCC=$(HIP_PATH)/bin/hipcc
CLANG=$(HIP_PATH)/../llvm/bin/clang
LLVM_MC=$(HIP_PATH)/../llvm/bin/llvm-mc
CLANG_OFFLOAD_BUNDLER=$(HIP_PATH)/../llvm/bin/clang-offload-bundler
SRCS=square.cpp
# Extracting ASM code, then creating an executable with the modified asm.
SQ_HOST_ASM=square_host.s
SQ_HOST_OBJ=square_host.o
SQ_DEVICE_HIPFB=offload_bundle.hipfb
SQ_DEVICE_OBJ=square_device.o
SQ_ASM_EXE=square_asm.out
MCIN_OBJ_GEN=hip_obj_gen.mcin
GPU_ARCH1=gfx900
GPU_ARCH2=gfx906
.PHONY: test
all: src_to_asm asm_to_exec
src_to_asm:
$(HIPCC) -c -S --cuda-host-only -target x86_64-linux-gnu -o $(SQ_HOST_ASM) $(SRCS)
$(HIPCC) -c -S --cuda-device-only --offload-arch=$(GPU_ARCH1) --offload-arch=$(GPU_ARCH2) $(SRCS)
# You may modify the .s assembly files before the next step
# By default, their names will be:
# square-hip-amdgcn-amd-amdhsa-gfx900.s
# square-hip-amdgcn-amd-amdhsa-gfx906.s
#
# Note: hipcc does not work to convert .s to .o, use clang instead.
asm_to_exec:
$(HIPCC) -c $(SQ_HOST_ASM) -o $(SQ_HOST_OBJ)
$(CLANG) -target amdgcn-amd-amdhsa -mcpu=$(GPU_ARCH1) square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH1).s -o square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH1).o
$(CLANG) -target amdgcn-amd-amdhsa -mcpu=$(GPU_ARCH2) square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH2).s -o square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH2).o
$(CLANG_OFFLOAD_BUNDLER) -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hip-amdgcn-amd-amdhsa-$(GPU_ARCH1),hip-amdgcn-amd-amdhsa-$(GPU_ARCH2) -inputs=/dev/null,square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH1).o,square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH2).o -outputs=$(SQ_DEVICE_HIPFB)
$(LLVM_MC) $(MCIN_OBJ_GEN) -o $(SQ_DEVICE_OBJ) --filetype=obj
$(HIPCC) $(SQ_HOST_OBJ) $(SQ_DEVICE_OBJ) -o $(SQ_ASM_EXE)
clean:
rm -f *.o *.out *.hipfb *.s *.ll *.bc
+52
Просмотреть файл
@@ -0,0 +1,52 @@
# Compile to assembly and create an executable from modified asm
This sample shows how to generate the assembly code for a simple HIP source application, then re-compiling it and generating a valid HIP executable.
This sample uses a previous HIP application sample, please see [0_Intro/square](https://github.com/ROCm-Developer-Tools/HIP/blob/master/samples/0_Intro/square).
## Compiling the HIP source into assembly
Using HIP flags `-c -S` will help generate the host x86_64 and the device AMDGCN assembly code when paired with `--cuda-host-only` and `--cuda-device-only` respectively. In this sample we use these commands:
```
/opt/rocm/hip/bin/hipcc -c -S --cuda-host-only -target x86_64-linux-gnu -o square_host.s square.cpp
/opt/rocm/hip/bin/hipcc -c -S --cuda-device-only --offload-arch=gfx900 --offload-arch=gfx906 square.cpp
```
The device assembly will be output into two separate files:
- square-hip-amdgcn-amd-amdhsa-gfx900.s
- square-hip-amdgcn-amd-amdhsa-gfx906.s
You may modify `--offload-arch` flag to build other archs and choose to enable or disable xnack and sram-ecc.
**Note:** At this point, you may evaluate the assembly code, and make modifications if you are familiar with the AMDGCN assembly language and architecture.
## Compiling the assembly into a valid HIP executable
If valid, the modified host and device assembly may be compiled into a HIP executable. The host assembly can be compiled into an object using this command:
```
/opt/rocm/hip/bin/hipcc -c square_host.s -o square_host.o
```
However, the device assembly code will require a few extra steps. The device assemblies needs to be compiled into device objects, then offload-bundled into a HIP fat binary using the clang-offload-bundler, then llvm-mc embeds the binary inside of a host object using the MC directives provided in `hip_obj_gen.mcin`. The output is a host object with an embedded device object. Here are the steps for device side compilation into an object:
```
/opt/rocm/hip/../llvm/bin/clang -target amdgcn-amd-amdhsa -mcpu=gfx900 square-hip-amdgcn-amd-amdhsa-gfx900.s -o square-hip-amdgcn-amd-amdhsa-gfx900.o
/opt/rocm/hip/../llvm/bin/clang -target amdgcn-amd-amdhsa -mcpu=gfx906 square-hip-amdgcn-amd-amdhsa-gfx906.s -o square-hip-amdgcn-amd-amdhsa-gfx906.o
/opt/rocm/llvm/bin/clang-offload-bundler -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hip-amdgcn-amd-amdhsa-gfx900,hip-amdgcn-amd-amdhsa-gfx906 -inputs=/dev/null,square-hip-amdgcn-amd-amdhsa-gfx900.o,square-hip-amdgcn-amd-amdhsa-gfx906.o -outputs=offload_bundle.hipfb
/opt/rocm/llvm/bin/llvm-mc -triple x86_64-unknown-linux-gnu hip_obj_gen.mcin -o square_device.o --filetype=obj
```
**Note:** Using option `-bundle-align=4096` only works on ROCm 4.0 and newer compilers. Also, the architecture must match the same arch as when compiling to assembly.
Finally, using the system linker, hipcc, or clang, link the host and device objects into an executable:
```
/opt/rocm/hip/bin/hipcc square_host.o square_device.o -o square_asm.out
```
If you haven't modified the GPU archs, this executable should run on both `gfx900` and `gfx906`.
## How to build and run this sample:
Use these make commands to compile into assembly, compile assembly into executable, and execute it.
- To compile the HIP application into host and device assembly: `make src_to_asm`.
- To compile the assembly files into an executable: `make asm_to_exec`.
- To execute, run `./square_asm.out`.
**Note:** The default arch is `gfx900` and `gfx906`, this can be modified with make argument `GPU_ARCH1` and `GPU_ARCH2`.
## For More Information, please refer to the HIP FAQ.
+20
Просмотреть файл
@@ -0,0 +1,20 @@
# HIP Object Generator
# Use this generator to create a host bundled object file
# with the input of an offload bundled fat binary.
#
# Input: Bundled Object file .hipfb file
# Output: Host Bundled Object File .o
#
# Add MC directives to embed target binaries. We ensure that each
# section and image is 4096-byte aligned. This facilitates faster
# loading of device binaries. It has been verified this align does
# not cause significant overall file size increase.
#
# Note: log 2 of 4096 is 12.
#
.type __hip_fatbin,@object
.section .hip_fatbin,"a",@progbits
.globl __hip_fatbin
.p2align 12
__hip_fatbin:
.incbin "offload_bundle.hipfb"
+96
Просмотреть файл
@@ -0,0 +1,96 @@
/*
Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <stdio.h>
#include <hip/hip_runtime.h>
#define CHECK(cmd) \
{\
hipError_t error = cmd;\
if (error != hipSuccess) { \
fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error,__FILE__, __LINE__); \
exit(EXIT_FAILURE);\
}\
}
/* This kernel is a placeholder for the kernel in assembly generated by this
* sample. It will be replaced by the kernel in assembly.
*
* Square each element in the array A and write to array C.
*/
template <typename T>
__global__ void
vector_square(T *C_d, T *A_d, size_t N)
{
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
for (size_t i=offset; i<N; i+=stride) {
C_d[i] = A_d[i] * A_d[i];
}
}
int main(int argc, char *argv[])
{
float *A_d, *C_d;
float *A_h, *C_h;
size_t N = 1000000;
size_t Nbytes = N * sizeof(float);
hipDeviceProp_t props;
CHECK(hipGetDeviceProperties(&props, 0/*deviceID*/));
printf ("info: running on device %s\n", props.name);
printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
A_h = (float*)malloc(Nbytes);
CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
C_h = (float*)malloc(Nbytes);
CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
// Fill with Phi + i
for (size_t i=0; i<N; i++)
{
A_h[i] = 1.618f + i;
}
printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
CHECK(hipMalloc(&A_d, Nbytes));
CHECK(hipMalloc(&C_d, Nbytes));
printf ("info: copy Host2Device\n");
CHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
const unsigned blocks = 512;
const unsigned threadsPerBlock = 256;
printf ("info: launch 'vector_square' kernel\n");
vector_square <<<blocks, threadsPerBlock>>> (C_d, A_d, N);
printf ("info: copy Device2Host\n");
CHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
printf ("info: check result\n");
for (size_t i=0; i<N; i++) {
if (C_h[i] != A_h[i] * A_h[i]) {
CHECK(hipErrorUnknown);
}
}
printf ("PASSED!\n");
}