Add HIP Sample 2_Cookbook/16_assembly_to_executable
HIP supports compiling kernels from assembly into exec. The device assembly needs to be compiled into a fat binary object. This device object is embedded into a host object using llvm-mc directives. Then, any host linker may link the host and device objects together into an executable. A README is added. Change-Id: I59d3a8b5363073810ffc3aa0d57f21b0df272369
Этот коммит содержится в:
коммит произвёл
Aaron En Ye Shi
родитель
0855d7158c
Коммит
96c330c1af
@@ -0,0 +1,50 @@
|
||||
HIP_PATH?= $(wildcard /opt/rocm/hip)
|
||||
ifeq (,$(HIP_PATH))
|
||||
HIP_PATH=../../..
|
||||
endif
|
||||
|
||||
HIPCC=$(HIP_PATH)/bin/hipcc
|
||||
CLANG=$(HIP_PATH)/../llvm/bin/clang
|
||||
LLVM_MC=$(HIP_PATH)/../llvm/bin/llvm-mc
|
||||
CLANG_OFFLOAD_BUNDLER=$(HIP_PATH)/../llvm/bin/clang-offload-bundler
|
||||
|
||||
SRCS=square.cpp
|
||||
|
||||
# Extracting ASM code, then creating an executable with the modified asm.
|
||||
|
||||
SQ_HOST_ASM=square_host.s
|
||||
SQ_HOST_OBJ=square_host.o
|
||||
SQ_DEVICE_HIPFB=offload_bundle.hipfb
|
||||
SQ_DEVICE_OBJ=square_device.o
|
||||
SQ_ASM_EXE=square_asm.out
|
||||
|
||||
MCIN_OBJ_GEN=hip_obj_gen.mcin
|
||||
GPU_ARCH1=gfx900
|
||||
GPU_ARCH2=gfx906
|
||||
|
||||
.PHONY: test
|
||||
|
||||
all: src_to_asm asm_to_exec
|
||||
|
||||
src_to_asm:
|
||||
$(HIPCC) -c -S --cuda-host-only -target x86_64-linux-gnu -o $(SQ_HOST_ASM) $(SRCS)
|
||||
$(HIPCC) -c -S --cuda-device-only --offload-arch=$(GPU_ARCH1) --offload-arch=$(GPU_ARCH2) $(SRCS)
|
||||
|
||||
# You may modify the .s assembly files before the next step
|
||||
# By default, their names will be:
|
||||
# square-hip-amdgcn-amd-amdhsa-gfx900.s
|
||||
# square-hip-amdgcn-amd-amdhsa-gfx906.s
|
||||
#
|
||||
# Note: hipcc does not work to convert .s to .o, use clang instead.
|
||||
|
||||
asm_to_exec:
|
||||
$(HIPCC) -c $(SQ_HOST_ASM) -o $(SQ_HOST_OBJ)
|
||||
$(CLANG) -target amdgcn-amd-amdhsa -mcpu=$(GPU_ARCH1) square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH1).s -o square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH1).o
|
||||
$(CLANG) -target amdgcn-amd-amdhsa -mcpu=$(GPU_ARCH2) square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH2).s -o square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH2).o
|
||||
$(CLANG_OFFLOAD_BUNDLER) -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hip-amdgcn-amd-amdhsa-$(GPU_ARCH1),hip-amdgcn-amd-amdhsa-$(GPU_ARCH2) -inputs=/dev/null,square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH1).o,square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH2).o -outputs=$(SQ_DEVICE_HIPFB)
|
||||
$(LLVM_MC) $(MCIN_OBJ_GEN) -o $(SQ_DEVICE_OBJ) --filetype=obj
|
||||
$(HIPCC) $(SQ_HOST_OBJ) $(SQ_DEVICE_OBJ) -o $(SQ_ASM_EXE)
|
||||
|
||||
clean:
|
||||
rm -f *.o *.out *.hipfb *.s *.ll *.bc
|
||||
|
||||
@@ -0,0 +1,52 @@
|
||||
# Compile to assembly and create an executable from modified asm
|
||||
|
||||
This sample shows how to generate the assembly code for a simple HIP source application, then re-compiling it and generating a valid HIP executable.
|
||||
|
||||
This sample uses a previous HIP application sample, please see [0_Intro/square](https://github.com/ROCm-Developer-Tools/HIP/blob/master/samples/0_Intro/square).
|
||||
|
||||
## Compiling the HIP source into assembly
|
||||
Using HIP flags `-c -S` will help generate the host x86_64 and the device AMDGCN assembly code when paired with `--cuda-host-only` and `--cuda-device-only` respectively. In this sample we use these commands:
|
||||
```
|
||||
/opt/rocm/hip/bin/hipcc -c -S --cuda-host-only -target x86_64-linux-gnu -o square_host.s square.cpp
|
||||
/opt/rocm/hip/bin/hipcc -c -S --cuda-device-only --offload-arch=gfx900 --offload-arch=gfx906 square.cpp
|
||||
```
|
||||
|
||||
The device assembly will be output into two separate files:
|
||||
- square-hip-amdgcn-amd-amdhsa-gfx900.s
|
||||
- square-hip-amdgcn-amd-amdhsa-gfx906.s
|
||||
|
||||
You may modify `--offload-arch` flag to build other archs and choose to enable or disable xnack and sram-ecc.
|
||||
|
||||
**Note:** At this point, you may evaluate the assembly code, and make modifications if you are familiar with the AMDGCN assembly language and architecture.
|
||||
|
||||
## Compiling the assembly into a valid HIP executable
|
||||
If valid, the modified host and device assembly may be compiled into a HIP executable. The host assembly can be compiled into an object using this command:
|
||||
```
|
||||
/opt/rocm/hip/bin/hipcc -c square_host.s -o square_host.o
|
||||
```
|
||||
|
||||
However, the device assembly code will require a few extra steps. The device assemblies needs to be compiled into device objects, then offload-bundled into a HIP fat binary using the clang-offload-bundler, then llvm-mc embeds the binary inside of a host object using the MC directives provided in `hip_obj_gen.mcin`. The output is a host object with an embedded device object. Here are the steps for device side compilation into an object:
|
||||
```
|
||||
/opt/rocm/hip/../llvm/bin/clang -target amdgcn-amd-amdhsa -mcpu=gfx900 square-hip-amdgcn-amd-amdhsa-gfx900.s -o square-hip-amdgcn-amd-amdhsa-gfx900.o
|
||||
/opt/rocm/hip/../llvm/bin/clang -target amdgcn-amd-amdhsa -mcpu=gfx906 square-hip-amdgcn-amd-amdhsa-gfx906.s -o square-hip-amdgcn-amd-amdhsa-gfx906.o
|
||||
/opt/rocm/llvm/bin/clang-offload-bundler -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hip-amdgcn-amd-amdhsa-gfx900,hip-amdgcn-amd-amdhsa-gfx906 -inputs=/dev/null,square-hip-amdgcn-amd-amdhsa-gfx900.o,square-hip-amdgcn-amd-amdhsa-gfx906.o -outputs=offload_bundle.hipfb
|
||||
/opt/rocm/llvm/bin/llvm-mc -triple x86_64-unknown-linux-gnu hip_obj_gen.mcin -o square_device.o --filetype=obj
|
||||
```
|
||||
|
||||
**Note:** Using option `-bundle-align=4096` only works on ROCm 4.0 and newer compilers. Also, the architecture must match the same arch as when compiling to assembly.
|
||||
|
||||
Finally, using the system linker, hipcc, or clang, link the host and device objects into an executable:
|
||||
```
|
||||
/opt/rocm/hip/bin/hipcc square_host.o square_device.o -o square_asm.out
|
||||
```
|
||||
If you haven't modified the GPU archs, this executable should run on both `gfx900` and `gfx906`.
|
||||
|
||||
## How to build and run this sample:
|
||||
Use these make commands to compile into assembly, compile assembly into executable, and execute it.
|
||||
- To compile the HIP application into host and device assembly: `make src_to_asm`.
|
||||
- To compile the assembly files into an executable: `make asm_to_exec`.
|
||||
- To execute, run `./square_asm.out`.
|
||||
|
||||
**Note:** The default arch is `gfx900` and `gfx906`, this can be modified with make argument `GPU_ARCH1` and `GPU_ARCH2`.
|
||||
|
||||
## For More Information, please refer to the HIP FAQ.
|
||||
@@ -0,0 +1,20 @@
|
||||
# HIP Object Generator
|
||||
# Use this generator to create a host bundled object file
|
||||
# with the input of an offload bundled fat binary.
|
||||
#
|
||||
# Input: Bundled Object file .hipfb file
|
||||
# Output: Host Bundled Object File .o
|
||||
#
|
||||
# Add MC directives to embed target binaries. We ensure that each
|
||||
# section and image is 4096-byte aligned. This facilitates faster
|
||||
# loading of device binaries. It has been verified this align does
|
||||
# not cause significant overall file size increase.
|
||||
#
|
||||
# Note: log 2 of 4096 is 12.
|
||||
#
|
||||
.type __hip_fatbin,@object
|
||||
.section .hip_fatbin,"a",@progbits
|
||||
.globl __hip_fatbin
|
||||
.p2align 12
|
||||
__hip_fatbin:
|
||||
.incbin "offload_bundle.hipfb"
|
||||
@@ -0,0 +1,96 @@
|
||||
/*
|
||||
Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#define CHECK(cmd) \
|
||||
{\
|
||||
hipError_t error = cmd;\
|
||||
if (error != hipSuccess) { \
|
||||
fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error,__FILE__, __LINE__); \
|
||||
exit(EXIT_FAILURE);\
|
||||
}\
|
||||
}
|
||||
|
||||
/* This kernel is a placeholder for the kernel in assembly generated by this
|
||||
* sample. It will be replaced by the kernel in assembly.
|
||||
*
|
||||
* Square each element in the array A and write to array C.
|
||||
*/
|
||||
template <typename T>
|
||||
__global__ void
|
||||
vector_square(T *C_d, T *A_d, size_t N)
|
||||
{
|
||||
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
size_t stride = blockDim.x * gridDim.x ;
|
||||
|
||||
for (size_t i=offset; i<N; i+=stride) {
|
||||
C_d[i] = A_d[i] * A_d[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
float *A_d, *C_d;
|
||||
float *A_h, *C_h;
|
||||
size_t N = 1000000;
|
||||
size_t Nbytes = N * sizeof(float);
|
||||
|
||||
hipDeviceProp_t props;
|
||||
CHECK(hipGetDeviceProperties(&props, 0/*deviceID*/));
|
||||
printf ("info: running on device %s\n", props.name);
|
||||
|
||||
printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
|
||||
A_h = (float*)malloc(Nbytes);
|
||||
CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
|
||||
C_h = (float*)malloc(Nbytes);
|
||||
CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
|
||||
// Fill with Phi + i
|
||||
for (size_t i=0; i<N; i++)
|
||||
{
|
||||
A_h[i] = 1.618f + i;
|
||||
}
|
||||
|
||||
printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
|
||||
CHECK(hipMalloc(&A_d, Nbytes));
|
||||
CHECK(hipMalloc(&C_d, Nbytes));
|
||||
|
||||
|
||||
printf ("info: copy Host2Device\n");
|
||||
CHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
|
||||
|
||||
const unsigned blocks = 512;
|
||||
const unsigned threadsPerBlock = 256;
|
||||
|
||||
printf ("info: launch 'vector_square' kernel\n");
|
||||
vector_square <<<blocks, threadsPerBlock>>> (C_d, A_d, N);
|
||||
|
||||
printf ("info: copy Device2Host\n");
|
||||
CHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
|
||||
printf ("info: check result\n");
|
||||
for (size_t i=0; i<N; i++) {
|
||||
if (C_h[i] != A_h[i] * A_h[i]) {
|
||||
CHECK(hipErrorUnknown);
|
||||
}
|
||||
}
|
||||
printf ("PASSED!\n");
|
||||
}
|
||||
Ссылка в новой задаче
Block a user