Add HIP Sample 2_Cookbook/16_assembly_to_executable

HIP supports compiling kernels from assembly into exec. The device assembly needs to be compiled into a fat binary object. This device object is embedded into a host object using llvm-mc directives. Then, any host linker may link the host and device objects together into an executable. A README is added. Change-Id: I59d3a8b5363073810ffc3aa0d57f21b0df272369
2020-11-20 00:31:00 +00:00
@@ -0,0 +1,50 @@
+HIP_PATH?= $(wildcard /opt/rocm/hip)
+ifeq (,$(HIP_PATH))
+	HIP_PATH=../../..
+endif
+
+HIPCC=$(HIP_PATH)/bin/hipcc
+CLANG=$(HIP_PATH)/../llvm/bin/clang
+LLVM_MC=$(HIP_PATH)/../llvm/bin/llvm-mc
+CLANG_OFFLOAD_BUNDLER=$(HIP_PATH)/../llvm/bin/clang-offload-bundler
+
+SRCS=square.cpp
+
+# Extracting ASM code, then creating an executable with the modified asm.
+
+SQ_HOST_ASM=square_host.s
+SQ_HOST_OBJ=square_host.o
+SQ_DEVICE_HIPFB=offload_bundle.hipfb
+SQ_DEVICE_OBJ=square_device.o
+SQ_ASM_EXE=square_asm.out
+
+MCIN_OBJ_GEN=hip_obj_gen.mcin
+GPU_ARCH1=gfx900
+GPU_ARCH2=gfx906
+
+.PHONY: test
+
+all: src_to_asm asm_to_exec
+
+src_to_asm:
+	$(HIPCC) -c -S --cuda-host-only -target x86_64-linux-gnu -o $(SQ_HOST_ASM) $(SRCS)
+	$(HIPCC) -c -S --cuda-device-only --offload-arch=$(GPU_ARCH1) --offload-arch=$(GPU_ARCH2) $(SRCS)
+
+# You may modify the .s assembly files before the next step
+# By default, their names will be:
+#   square-hip-amdgcn-amd-amdhsa-gfx900.s
+#   square-hip-amdgcn-amd-amdhsa-gfx906.s
+#
+# Note: hipcc does not work to convert .s to .o, use clang instead.
+
+asm_to_exec:
+	$(HIPCC) -c $(SQ_HOST_ASM) -o $(SQ_HOST_OBJ)
+	$(CLANG) -target amdgcn-amd-amdhsa -mcpu=$(GPU_ARCH1) square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH1).s -o square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH1).o
+	$(CLANG) -target amdgcn-amd-amdhsa -mcpu=$(GPU_ARCH2) square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH2).s -o square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH2).o
+	$(CLANG_OFFLOAD_BUNDLER) -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hip-amdgcn-amd-amdhsa-$(GPU_ARCH1),hip-amdgcn-amd-amdhsa-$(GPU_ARCH2) -inputs=/dev/null,square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH1).o,square-hip-amdgcn-amd-amdhsa-$(GPU_ARCH2).o -outputs=$(SQ_DEVICE_HIPFB)
+	$(LLVM_MC) $(MCIN_OBJ_GEN) -o $(SQ_DEVICE_OBJ) --filetype=obj
+	$(HIPCC) $(SQ_HOST_OBJ) $(SQ_DEVICE_OBJ) -o $(SQ_ASM_EXE)
+
+clean:
+	rm -f *.o *.out *.hipfb *.s *.ll *.bc
+
@@ -0,0 +1,52 @@
+# Compile to assembly and create an executable from modified asm
+
+This sample shows how to generate the assembly code for a simple HIP source application, then re-compiling it and generating a valid HIP executable.
+
+This sample uses a previous HIP application sample, please see [0_Intro/square](https://github.com/ROCm-Developer-Tools/HIP/blob/master/samples/0_Intro/square).
+
+## Compiling the HIP source into assembly
+Using HIP flags `-c -S` will help generate the host x86_64 and the device AMDGCN assembly code when paired with `--cuda-host-only` and `--cuda-device-only` respectively. In this sample we use these commands:
+```
+/opt/rocm/hip/bin/hipcc -c -S --cuda-host-only -target x86_64-linux-gnu -o square_host.s square.cpp
+/opt/rocm/hip/bin/hipcc -c -S --cuda-device-only --offload-arch=gfx900 --offload-arch=gfx906 square.cpp
+```
+
+The device assembly will be output into two separate files:
+- square-hip-amdgcn-amd-amdhsa-gfx900.s
+- square-hip-amdgcn-amd-amdhsa-gfx906.s
+
+You may modify `--offload-arch` flag to build other archs and choose to enable or disable xnack and sram-ecc.
+
+**Note:** At this point, you may evaluate the assembly code, and make modifications if you are familiar with the AMDGCN assembly language and architecture.
+
+## Compiling the assembly into a valid HIP executable
+If valid, the modified host and device assembly may be compiled into a HIP executable. The host assembly can be compiled into an object using this command:
+```
+/opt/rocm/hip/bin/hipcc -c square_host.s -o square_host.o
+```
+
+However, the device assembly code will require a few extra steps. The device assemblies needs to be compiled into device objects, then offload-bundled into a HIP fat binary using the clang-offload-bundler, then llvm-mc embeds the binary inside of a host object using the MC directives provided in `hip_obj_gen.mcin`. The output is a host object with an embedded device object. Here are the steps for device side compilation into an object:
+```
+/opt/rocm/hip/../llvm/bin/clang -target amdgcn-amd-amdhsa -mcpu=gfx900 square-hip-amdgcn-amd-amdhsa-gfx900.s -o square-hip-amdgcn-amd-amdhsa-gfx900.o
+/opt/rocm/hip/../llvm/bin/clang -target amdgcn-amd-amdhsa -mcpu=gfx906 square-hip-amdgcn-amd-amdhsa-gfx906.s -o square-hip-amdgcn-amd-amdhsa-gfx906.o
+/opt/rocm/llvm/bin/clang-offload-bundler -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hip-amdgcn-amd-amdhsa-gfx900,hip-amdgcn-amd-amdhsa-gfx906 -inputs=/dev/null,square-hip-amdgcn-amd-amdhsa-gfx900.o,square-hip-amdgcn-amd-amdhsa-gfx906.o -outputs=offload_bundle.hipfb
+/opt/rocm/llvm/bin/llvm-mc -triple x86_64-unknown-linux-gnu hip_obj_gen.mcin -o square_device.o --filetype=obj
+```
+
+**Note:** Using option `-bundle-align=4096` only works on ROCm 4.0 and newer compilers. Also, the architecture must match the same arch as when compiling to assembly.
+
+Finally, using the system linker, hipcc, or clang, link the host and device objects into an executable:
+```
+/opt/rocm/hip/bin/hipcc square_host.o square_device.o -o square_asm.out
+```
+If you haven't modified the GPU archs, this executable should run on both `gfx900` and `gfx906`.
+
+## How to build and run this sample:
+Use these make commands to compile into assembly, compile assembly into executable, and execute it.
+- To compile the HIP application into host and device assembly: `make src_to_asm`.
+- To compile the assembly files into an executable: `make asm_to_exec`.
+- To execute, run `./square_asm.out`.
+
+**Note:** The default arch is `gfx900` and `gfx906`, this can be modified with make argument `GPU_ARCH1` and `GPU_ARCH2`.
+
+## For More Information, please refer to the HIP FAQ.
@@ -0,0 +1,20 @@
+#       HIP Object Generator
+# Use this generator to create a host bundled object file
+# with the input of an offload bundled fat binary.
+#
+# Input: Bundled Object file .hipfb file
+# Output: Host Bundled Object File .o
+#
+# Add MC directives to embed target binaries. We ensure that each
+# section and image is 4096-byte aligned. This facilitates faster
+# loading of device binaries. It has been verified this align does
+# not cause significant overall file size increase.
+#
+# Note: log 2 of 4096 is 12.
+#
+  .type __hip_fatbin,@object
+  .section .hip_fatbin,"a",@progbits
+  .globl __hip_fatbin
+  .p2align 12
+__hip_fatbin:
+  .incbin "offload_bundle.hipfb"
@@ -0,0 +1,96 @@
+/*
+Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <stdio.h>
+#include <hip/hip_runtime.h>
+
+#define CHECK(cmd) \
+{\
+    hipError_t error  = cmd;\
+    if (error != hipSuccess) { \
+        fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error,__FILE__, __LINE__); \
+        exit(EXIT_FAILURE);\
+    }\
+}
+
+/* This kernel is a placeholder for the kernel in assembly generated by this
+ * sample. It will be replaced by the kernel in assembly.
+ *
+ * Square each element in the array A and write to array C.
+ */
+template <typename T>
+__global__ void
+vector_square(T *C_d, T *A_d, size_t N)
+{
+    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
+    size_t stride = blockDim.x * gridDim.x ;
+
+    for (size_t i=offset; i<N; i+=stride) {
+        C_d[i] = A_d[i] * A_d[i];
+    }
+}
+
+
+int main(int argc, char *argv[])
+{
+    float *A_d, *C_d;
+    float *A_h, *C_h;
+    size_t N = 1000000;
+    size_t Nbytes = N * sizeof(float);
+
+    hipDeviceProp_t props;
+    CHECK(hipGetDeviceProperties(&props, 0/*deviceID*/));
+    printf ("info: running on device %s\n", props.name);
+
+    printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+    A_h = (float*)malloc(Nbytes);
+    CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
+    C_h = (float*)malloc(Nbytes);
+    CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
+    // Fill with Phi + i
+    for (size_t i=0; i<N; i++)
+    {
+        A_h[i] = 1.618f + i;
+    }
+
+    printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+    CHECK(hipMalloc(&A_d, Nbytes));
+    CHECK(hipMalloc(&C_d, Nbytes));
+
+
+    printf ("info: copy Host2Device\n");
+    CHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
+
+    const unsigned blocks = 512;
+    const unsigned threadsPerBlock = 256;
+
+    printf ("info: launch 'vector_square' kernel\n");
+    vector_square <<<blocks, threadsPerBlock>>> (C_d, A_d, N);
+
+    printf ("info: copy Device2Host\n");
+    CHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
+
+    printf ("info: check result\n");
+    for (size_t i=0; i<N; i++)  {
+        if (C_h[i] != A_h[i] * A_h[i]) {
+            CHECK(hipErrorUnknown);
+        }
+    }
+    printf ("PASSED!\n");
+}