Merge branch 'amd-develop' into amd-master
Change-Id: I28abeb5f0c81dd0315dcb1c987f22704a9c0d18b
This commit is contained in:
@@ -117,6 +117,10 @@ Differences or limitations of HIP APIs as compared to CUDA APIs should be clearl
|
||||
- Keyword TODO refers to a note that should be addressed in long-term. Could be style issue, software architecture, or known bugs.
|
||||
- FIXME refers to a short-term bug that needs to be addressed.
|
||||
|
||||
- HIP_INIT_API() should be placed at the start of each top-level HIP API. This function will make sure the HIP runtime is initialized,
|
||||
and also constructs an appropriate API string for tracing and CodeXL marker tracing. The arguments to HIP_INIT_API should match
|
||||
those of the parent fucntion.
|
||||
|
||||
|
||||
#### Presubmit Testing:
|
||||
Before checking in or submitting a pull request, run all Rodinia tests and ensure pass results match starting point:
|
||||
|
||||
+36
-3
@@ -105,7 +105,7 @@ if ($HIP_PLATFORM eq "hcc") {
|
||||
}
|
||||
|
||||
# Satisfy HCC dependencies
|
||||
$HIPLDFLAGS .= " -lc++abi";
|
||||
$HIPLDFLAGS .= " -lc++abi -lsupc++";
|
||||
$HIPLDFLAGS .= " -L$HSA_PATH/lib -L$ROCM_PATH/lib -lhsa-runtime64 -lhc_am -lhsakmt";
|
||||
|
||||
# Handle ROCm target platform
|
||||
@@ -151,13 +151,12 @@ if ($HIP_PLATFORM eq "hcc") {
|
||||
$HIPCC="$CUDA_PATH/bin/nvcc";
|
||||
$HIPCXXFLAGS .= " -I$CUDA_PATH/include";
|
||||
|
||||
$HIPLDFLAGS = "";
|
||||
$HIPLDFLAGS = "-lcuda -lcudart";
|
||||
} else {
|
||||
printf ("error: unknown HIP_PLATFORM = '$HIP_PLATFORM'");
|
||||
exit (-1);
|
||||
}
|
||||
|
||||
|
||||
# Add paths to common HIP includes:
|
||||
$HIPCXXFLAGS .= " -I$HIP_PATH/include" ;
|
||||
|
||||
@@ -177,7 +176,41 @@ if ($verbose & 0x4) {
|
||||
print "hipcc-args: ", join (" ", @ARGV), "\n";
|
||||
}
|
||||
|
||||
# Handle code object generation
|
||||
my $ISACMD="";
|
||||
if($HIP_PLATFORM eq "hcc"){
|
||||
$ISACMD .= "$HIP_PATH/bin/hipgenisa.sh ";
|
||||
$ISACMD .= $ROCM_PATH;
|
||||
if($ARGV[0] eq "--gencodeobject"){
|
||||
foreach $isaarg (@ARGV[1..$#ARGV]){
|
||||
$ISACMD .= " ";
|
||||
$ISACMD .= $isaarg;
|
||||
}
|
||||
if ($verbose & 0x1) {
|
||||
print "hipcc-cmd: ", $ISACMD, "\n";
|
||||
}
|
||||
system($ISACMD) and die();
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
if($HIP_PLATFORM eq "nvcc"){
|
||||
$ISACMD .= "$HIP_PATH/bin/hipcc -ptx ";
|
||||
if($ARGV[0] eq "--gencodeobject"){
|
||||
foreach $isaarg (@ARGV[1..$#ARGV]){
|
||||
$ISACMD .= " ";
|
||||
$ISACMD .= $isaarg;
|
||||
}
|
||||
if ($verbose & 0x1) {
|
||||
print "hipcc-cmd: ", $ISACMD, "\n";
|
||||
}
|
||||
system($ISACMD) and die();
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
my $toolArgs = ""; # arguments to pass to the hcc or nvcc tool
|
||||
|
||||
foreach $arg (@ARGV)
|
||||
{
|
||||
my $swallowArg = 0;
|
||||
|
||||
Executable
@@ -0,0 +1,31 @@
|
||||
#!/bin/bash
|
||||
|
||||
if [ $1 = " " ]
|
||||
then
|
||||
exit
|
||||
fi
|
||||
|
||||
ROCM_PATH=$1
|
||||
GEN_ISA=$2
|
||||
FILE_NAMES=$3
|
||||
OUT=$4
|
||||
OUTPUT_FILE=$5
|
||||
TARGET=""
|
||||
if [ ${GEN_ISA:0:12} = "--target-isa" ]
|
||||
then
|
||||
TARGET=${GEN_ISA:13:12}
|
||||
fi
|
||||
|
||||
SOURCE="${BASH_SOURCE[0]}"
|
||||
HIP_PATH="$( command cd -P "$( dirname "$SOURCE" )/.." && pwd )"
|
||||
|
||||
export KMDUMPISA=1
|
||||
export KMDUMPLLVM=1
|
||||
mkdir /tmp/hipgenisa
|
||||
$HIP_PATH/bin/hipcc $FILE_NAMES -o /tmp/hipgenisa/a.out
|
||||
mv dump.* /tmp/hipgenisa/
|
||||
$ROCM_PATH/hcc-lc/bin/llvm-mc -arch=amdgcn -mcpu=$TARGET -filetype=obj /tmp/hipgenisa/dump.isa -o /tmp/hipgenisa/dump.o
|
||||
$ROCM_PATH/llvm/bin/clang -target amdgcn--amdhsa /tmp/hipgenisa/dump.o -o $OUTPUT_FILE
|
||||
rm -r /tmp/hipgenisa
|
||||
export KMDUMPISA=0
|
||||
export KMDUMPLLVM=0
|
||||
A különbségek nem kerülnek megjelenítésre, mivel a fájl túl nagy
Load Diff
@@ -42,12 +42,12 @@
|
||||
| **CUDA** | **HIP** | **CUDA description** |
|
||||
|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------|
|
||||
| `cudaStreamAddCallback` | | Add a callback to a compute stream. |
|
||||
| `cudaStreamAttachMemAsync` | | Attach memory to a stream asynchronously. |
|
||||
| `cudaStreamAttachMemAsync` | | Attach managed memory to a stream asynchronously. |
|
||||
| `cudaStreamCreate` | `hipStreamCreate` | Create an asynchronous stream. |
|
||||
| `cudaStreamCreateWithFlags` | `hipStreamCreateWithFlags` | Create an asynchronous stream. |
|
||||
| `cudaStreamCreateWithPriority` | | Create an asynchronous stream with the specified priority. |
|
||||
| `cudaStreamDestroy` | `hipStreamDestroy` | Destroys and cleans up an asynchronous stream. |
|
||||
| `cudaStreamGetFlags` | | Query the flags of a stream. |
|
||||
| `cudaStreamGetFlags` | `hipStreamGetFlags` | Query the flags of a stream. |
|
||||
| `cudaStreamGetPriority` | | Query the priority of a stream. |
|
||||
| `cudaStreamQuery` | | Queries an asynchronous stream for completion status. |
|
||||
| `cudaStreamSynchronize` | `hipStreamSynchronize` | Waits for stream tasks to complete. |
|
||||
@@ -100,8 +100,8 @@
|
||||
| `cudaHostAlloc` | `hipHostMalloc` | Allocates page-locked memory on the host. |
|
||||
| `cudaHostGetDevicePointer` | `hipHostGetDevicePointer` | Passes back device pointer of mapped host memory allocated by cudaHostAlloc or registered by cudaHostRegister. |
|
||||
| `cudaHostGetFlags` | `hipHostGetFlags` | Passes back flags used to allocate pinned host memory allocated by cudaHostAlloc. |
|
||||
| `cudaHostRegister` | | Registers an existing host memory range for use by CUDA. |
|
||||
| `cudaHostUnregister` | | Unregisters a memory range that was registered with cudaHostRegister. |
|
||||
| `cudaHostRegister` | `hipHostRegister` | Registers an existing host memory range for use by CUDA. |
|
||||
| `cudaHostUnregister` | `hipHostUnregister` | Unregisters a memory range that was registered with cudaHostRegister. |
|
||||
| `cudaMalloc` | `hipMalloc` | Allocate memory on the device. |
|
||||
| `cudaMalloc3D` | | Allocates logical 1D, 2D, or 3D memory objects on the device. |
|
||||
| `cudaMalloc3DArray` | | Allocate an array on the device. |
|
||||
@@ -231,7 +231,7 @@
|
||||
|
||||
| **CUDA** | **HIP** | **CUDA description** |
|
||||
|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------|
|
||||
| `cudaBindSurfaceToArra`y | | Binds an array to a surface. |
|
||||
| `cudaBindSurfaceToArray` | | Binds an array to a surface. |
|
||||
| `cudaBindTexture` | | Binds a memory area to a texture. |
|
||||
| `cudaBindTexture2D` | | Binds a 2D memory area to a texture. |
|
||||
| `cudaBindTextureToArray` | | Binds an array to a texture. |
|
||||
|
||||
@@ -32,36 +32,52 @@
|
||||
HIP provides the following:
|
||||
- Devices (hipSetDevice(), hipGetDeviceProperties(), etc.)
|
||||
- Memory management (hipMalloc(), hipMemcpy(), hipFree(), etc.)
|
||||
- Streams (hipStreamCreate(), etc.)
|
||||
- Streams (hipStreamCreate(),hipStreamSynchronize(), hipStreamWaitEvent(), etc.)
|
||||
- Events (hipEventRecord(), hipEventElapsedTime(), etc.)
|
||||
- Kernel launching (hipLaunchKernel is a standard C/C++ function that replaces <<< >>>)
|
||||
- HIP Module API to control when adn how code is loaded.
|
||||
- CUDA-style kernel coordinate functions (threadIdx, blockIdx, blockDim, gridDim)
|
||||
- Cross-lane instructions including shfl, ballot, any, all
|
||||
- Most device-side math built-ins
|
||||
- Error reporting (hipGetLastError(), hipGetErrorString())
|
||||
|
||||
The HIP API documentation describes each API and its limitations, if any, compared with the equivalent CUDA API.
|
||||
|
||||
### What is not supported?
|
||||
#### Run-time features
|
||||
#### Runtime/Driver API features
|
||||
At a high-level, the following features are not supported:
|
||||
- Textures
|
||||
- MemcpyToSymbol functions
|
||||
- Dynamic parallelism (CUDA 5.0)
|
||||
- Managed memory (CUDA 6.5)
|
||||
- Graphics interoperation with OpenGL or Direct3D
|
||||
- CUDA Driver API (Under Development)
|
||||
- CUDA IPC Functions (Under Development)
|
||||
|
||||
- CUDA array, mipmappedArray and pitched memory
|
||||
- CUDA Driver API
|
||||
- MemcpyToSymbol functions
|
||||
- Queue priority controls
|
||||
|
||||
See the [API Support Table](CUDA_Runtime_API_functions_supported_by_HIP.md) for more detailed information.
|
||||
|
||||
#### Kernel language features
|
||||
- Device-side dynamic memory allocations (malloc, free, new, delete) (CUDA 4.0)
|
||||
- Virtual functions, indirect functions and try/catch (CUDA 4.0)
|
||||
- `__prof_trigger`
|
||||
- PTX assembly (CUDA 4.0)
|
||||
- Several kernel features are under development. See the [HIP Kernel Language](hip_kernel_language.md) for more information.
|
||||
- PTX assembly (CUDA 4.0). HCC supports inline GCN assembly.
|
||||
- Several kernel features are under development. See the [HIP Kernel Language](hip_kernel_language.md) for more information. These include:
|
||||
- printf
|
||||
- assert
|
||||
- `__restrict__`
|
||||
- `__launch_bounds__`
|
||||
- `__threadfence*_`, `__syncthreads*`
|
||||
- Unbounded loop unroll
|
||||
|
||||
|
||||
|
||||
### Is HIP a drop-in replacement for CUDA?
|
||||
No. HIP provides porting tools which do most of the work do convert CUDA code into portable C++ code that uses the HIP APIs.
|
||||
Most developers will port their code from CUDA to HIP and then maintain the HIP version.
|
||||
HIP code provides the same performance as coding in native CUDA, plus the benefit that the code can also run on AMD platforms.
|
||||
HIP code provides the same performance as native CUDA code, plus the benefits of running on AMD platforms.
|
||||
|
||||
### What specific version of CUDA does HIP support?
|
||||
HIP APIs and features do not map to a specific CUDA version. HIP provides a strong subset of functionality provided in CUDA, and the hipify tools can
|
||||
|
||||
@@ -156,7 +156,7 @@ The `__constant__` keyword is supported. The host writes constant memory before
|
||||
### `__shared__`
|
||||
The `__shared__` keyword is supported.
|
||||
|
||||
`extern __shared__` allows the host to dynamically allocate shared memory and is specified as a launch parameter. This feature is under development.
|
||||
`extern __shared__` allows the host to dynamically allocate shared memory and is specified as a launch parameter. HIP uses an alternate syntax based on the HIP_DYNAMIC_SHARED macro.
|
||||
|
||||
### `__managed__`
|
||||
Managed memory, including the `__managed__` keyword, are not supported in HIP.
|
||||
@@ -537,7 +537,6 @@ HIP supports the following atomic operations.
|
||||
### Caveats and Features Under-Development:
|
||||
|
||||
- HIP enables atomic operations on 32-bit integers. Additionally, it supports an atomic float add. AMD hardware, however, implements the float add using a CAS loop, so this function may not perform efficiently.
|
||||
- wrapping increment and decrement are under development.
|
||||
|
||||
## Warp Cross-Lane Functions
|
||||
|
||||
@@ -573,8 +572,6 @@ Applications can test whether the target platform supports the any/all instructi
|
||||
|
||||
### Warp Shuffle Functions
|
||||
|
||||
The following warp shuffle instructions are under development.
|
||||
|
||||
Half-float shuffles are not supported. The default width is warpSize---see [Warp Cross-Lane Functions](#warp-cross-lane-functions). Applications should not assume the warpSize is 32 or 64.
|
||||
|
||||
```
|
||||
@@ -670,3 +667,22 @@ The following C++ features are not supported:
|
||||
- Run-time-type information (RTTI)
|
||||
- Virtual functions
|
||||
- Try/catch
|
||||
|
||||
## Kernel Compilation
|
||||
HIP now supports compiling C++/HIP kernels to binary. Eventhough HIP does not support fatbinary (yet), the user can specify the target for which the binary can be generated. The file format for binary is `.co` which means Code Object. The following command builds the binary using `hipcc`.
|
||||
|
||||
`hipcc --genisa --target-isa=[TARGET GPU] [INPUT FILE] -o [OUTPUT FILE]`
|
||||
```[TARGET GPU] = fiji/hawaii
|
||||
[INPUT FILE] = Name of the file containing kernels
|
||||
[OUTPUT FILE] = Name of the generated code object file```
|
||||
|
||||
Note that the kernel file should have `int main(){}` at the end it so that the binary is generated. This happens because HCC generates binaries at linking time rather than compilation
|
||||
|
||||
You need 3 things to run kernel in binary.
|
||||
1. Kernel Binary
|
||||
2. Name of kernel binary
|
||||
3. Name of the kernel
|
||||
|
||||
We already got first two of them. In order to get name of the kernel, try `objdump -x [OUTPUT FILE]`. OUTPUT FILE is file generated by hipcc during kernel compilation. The output from objdump has symbol to the kernel whose name is mangled with `grid_launch_parm`, `__functor`, `__cxxamp_trampoline`. An example of how it looks is `ZN12_GLOBAL__N_137_Z3Cpy16grid_launch_parmPfS0__functor19__cxxamp_trampolineEiiiiiiPKfPf` where `Cpy` is the name of the kernel written in C++.
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,42 @@
|
||||
# HIP Performance Optimizations
|
||||
|
||||
Please note that this document lists possible ways for experimenting with HIP stack to gain performance. Performance may vary from platform to platform.
|
||||
|
||||
### Unpinned Memory Transfer Optimizations
|
||||
|
||||
#### On Small BAR Setup
|
||||
|
||||
There are two possible ways to transfer data from Host to Device (H2D) and Device to Host(D2H)
|
||||
* Using Staging Buffers
|
||||
* Using PinInPlace
|
||||
|
||||
#### On Large BAR Setup
|
||||
|
||||
There are two possible ways to transfer data from Host to Device (H2D)
|
||||
* Using Staging Buffers
|
||||
* Using PinInPlace
|
||||
* Direct Memcpy
|
||||
|
||||
And there are two possible ways to transfer data from Device to Host (D2H)
|
||||
* Using Staging Buffers
|
||||
* Using PinInPlace
|
||||
|
||||
Some GPUs may not be able to directly access host memory, and in these cases we need to
|
||||
stage the copy through an optimized pinned staging buffer, to implement H2D and D2H copies.The copy is broken into buffer-sized chunks to limit the size of the buffer and also to provide better performance by overlapping the CPU copies with the DMA copies.
|
||||
|
||||
PinInPlace is another algorithm which pins the host memory "in-place", and copies it with the DMA
|
||||
engine.
|
||||
|
||||
By default staging buffers are used for unpinned memory transfers, however other ways can be used by enabling few environment variables (so no need to build the code again!!!)
|
||||
|
||||
Following environment variables can be used:
|
||||
|
||||
- HIP_PININPLACE - This environment variable forces the use of PinInPlace logic for all unpinned memory copies
|
||||
|
||||
- HIP_OPTIMAL_MEM_TRANSFER- This environment variable enables a hybrid memory copy logic based on thresholds. These thresholds can be managed with following environment variables:
|
||||
- HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE - Threshold in bytes for H2D copy. For sizes smaller than threshold staging buffers logic would be used else PinInPlace logic.
|
||||
- HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING - Threshold in bytes for H2D copy. For sizes smaller than threshold direct copy logic would be used else staging buffers logic.
|
||||
- HIP_D2H_MEM_TRANSFER_THRESHOLD - Threshold in bytes for D2H copy. For sizes smaller than threshold staging buffer logic would be used else PinInPlace logic.
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,257 @@
|
||||
# Porting CUDA Driver API
|
||||
|
||||
## Introduction to the CUDA Driver and Runtime APIs
|
||||
CUDA provides a separate CUDA Driver and Runtime APIs. The two APis have significant overlap in functionality:
|
||||
- Both APIs support events, streams, memory management, memory copy, and error handling.
|
||||
- Both APIs deliver similar performance.
|
||||
- Driver APIs calls begin with the prefix `cu` while Runtime APIs begin with the prefix `cuda`. For example, the Driver API API contains `cuEventCreate` while the Runtime API contains `cudaEventCreate`, with similar functionality.
|
||||
- The Driver API defines a different but largely overlapping error code code space than the Runtime API, and uses a different coding convention. For example, Driver API defines `CUDA_ERROR_INVALID_VALUE` while the Runtime API defines `cudaErrorInvalidValue`
|
||||
|
||||
|
||||
The Driver API offers two additional pieces of functionality not provided by the Runtime API: cuModule and cuCtx APIs.
|
||||
|
||||
### cuModule API
|
||||
The Module section of the Driver API provides additional control over how and when accelerator
|
||||
code objects are loaded.
|
||||
For example, the driver API allows code objects to be loaded from files or memory pointers.
|
||||
Symbols for kernels or global data can be extracted from the loaded code objects.
|
||||
In contrast, the Runtime API automatically loads and (if necessary) compiles all of the
|
||||
kernels from a executable binary when run.
|
||||
In this mode, NVCC must be used to compile kernel code so the automatic loading can function correctly.
|
||||
|
||||
Both Driver and Runtime APIs define a function for launching kernels (called `cuLaunchKernel` or `cudaLaunchKernel`.
|
||||
The kernel arguments and the execution configuration (grid dimensions, group dimensions, dynamic shared memory, and stream) are passed as arguments to the launch function.
|
||||
The Runtime additionally provides the `<<< >>>` syntax for launching kernels, which resembles
|
||||
a special function call and is easier to use than explicit launch API (in particular with respect to
|
||||
handling of kernel arguments).
|
||||
However, this syntax is not standard C++ and is available only when NVCC is used to compile the host code.
|
||||
|
||||
The Module features are useful in an environment which generate the code objects directly, such as a new
|
||||
accelerator language front-end. Here, NVCC is not used. Instead, the environment may have a
|
||||
different kernel language or different compilation flow.
|
||||
Other environments have many kernels and do not want them to be all loaded automatically.
|
||||
The Module functions can be used to load the generated code objects and launch kernels.
|
||||
As we will see below, HIP defines a Module API which provides similar explict control over code
|
||||
object managemenet.
|
||||
|
||||
### cuCtx API
|
||||
The Driver API defines "Context" and "Devices" as separate entities.
|
||||
Contexts contain a single device, and a device can theoretically have multiple contexts.
|
||||
Each context contains a set of streams and events specific to the context.
|
||||
Historically contexts also defined a unique address space for the GPU, though this may not longer be the case in Unified Memory platforms (since the CPU and all the devices in the same process share a single unified address space).
|
||||
The Context APIs also provide a mechanism to switch between devices, which allowed
|
||||
a single CPU thread to send commands to different GPUs. HIP as well as a recent versions
|
||||
of CUDA Runtime provide other mechanisms to accomplish this feat - for example using streams or
|
||||
`cudaSetDevice`.
|
||||
|
||||
The CUDA Runtime API unifies the Context API with the Device API. This simplifies the APIs
|
||||
and has little loss of functionality since each Context can contain a single device,
|
||||
and the benefits of multiple contexts has been replaced with other interfaces. HIP provides
|
||||
a context API to facilitate easy porting from existing Driver codes.
|
||||
In HIP, the Ctx functions largely provide an alternate syntax for changing the active device.
|
||||
Most new applications will prefer to use `hipSetDevice` or the stream APIs.
|
||||
|
||||
## HIP Module and Ctx APIs
|
||||
|
||||
Rather than present two separate APIs, HIP extends the HIP API with new APIs for Modules
|
||||
and Ctx control.
|
||||
|
||||
### hipModule API
|
||||
|
||||
Like the CUDA Driver API, the Module API provides additional control
|
||||
over how code is loaded, including options to load code from files or from in-memory pointers.
|
||||
NVCC and HCC target different architectures and use different code object formats : NVCC
|
||||
is `cubin` or `ptx` files, while the HCC path is the `hsaco` format. The external
|
||||
compilers which generate these code objects are responsible for generating and loading
|
||||
the correct code object for each platform. Notably, there is not a fat binary format that
|
||||
can contain code for both NVCC and HCC platforms. The following table summarizes the
|
||||
formats used on each platform:
|
||||
|
||||
| Format | APIs | NVCC | HCC |
|
||||
| --- | --- | --- | --- |
|
||||
| Code Object | hipModuleLoad, hipModuleLoadData | .cubin or PTX text | .hsaco |
|
||||
| Fat Binary | hipModuleLoadFatBin | .fatbin | Under Development |
|
||||
|
||||
hipcc uses NVCC and HCC to compile host codes. Both of these may embed code objects
|
||||
into the final executable, and these code objects will be automatically loaded when
|
||||
the application starts.
|
||||
The hipModule API can be used to load additional code objects, and in this way
|
||||
provides an extended capability to the automatically loaded code objects. HCC allows
|
||||
both of these capabilities to be used together, if desired. Of course it is possible
|
||||
to create a program with no kernels and thus no automatic loading.
|
||||
|
||||
|
||||
### hipCtx API
|
||||
HIP provides a `Ctx` API as a thin layer over the existing Device functions. This Ctx API
|
||||
can be used to set the current context, or to query properties of the device associated with
|
||||
the context. The current context is implicitly used by other APIs such as `hipStreamCreate`.
|
||||
|
||||
### hipify translation of CUDA Driver API
|
||||
The hipify tool will convert CUDA Driver APIs for streams, events, memory management to
|
||||
the equivalent HIP driver calls. For example, `cuEventCreate` will be translated to
|
||||
`hipEventCreate`. Hipify also converts error code from the Driver namespace and coding
|
||||
convention to the equivalent HIP error code. Thus, HIP unifies the APis for these common functions.
|
||||
[hipify support for translating driver API is Under Development]
|
||||
|
||||
The memory copy APIs require additional explanation. The CUDA driver includes the memory
|
||||
direction in the name of the API (ie `cuMemcpyH2D`) while the CUDA driver API provides
|
||||
a single memory copy API with a parameter that specifies the direction and additionally
|
||||
supports a "default" direction where the runtime determines the direction automatically.
|
||||
HIP provides APis with both styles: for example, `hipMemcpyH2D` as well as `hipMemcpy`.
|
||||
The first flavor may be faster in some cases since they avoid host overhead to detect the
|
||||
different memory directions.
|
||||
|
||||
HIP defines a single error space, and uses camel-case for all errors (i.e. `hipErrorInvalidValue`).
|
||||
|
||||
### HCC Implementation Notes
|
||||
#### .hsaco
|
||||
The .hsaco format used by HCC is described in more detail [here](https://github.com/RadeonOpenCompute/ROCm-ComputeABI-Doc).
|
||||
An example and blog that show how to use the format is [here](http://gpuopen.com/rocm-with-harmony-combining-opencl-hcc-hsa-in-a-single-program). hsaco can be generated by hcc + extractkernel tool,
|
||||
cloc, the GCN assembler, or other tools.
|
||||
|
||||
#### Address Spaces
|
||||
HCC defines a process-wide address space where the CPU and all devices allocate
|
||||
addresses from a single unified pool. Thus addresses may be shared between contexts, and
|
||||
unlike the original CUDA definition a new context does not create a new address space for
|
||||
the device.
|
||||
|
||||
#### Using hipModuleLaunchKernel
|
||||
`hipModuleLaunchKernel` is `cuLaunchKernel` in HIP world. It takes the same arguments as `cuLaunchKernel`. The argument `kernelParams` is not fully implemented for HCC. The workaround for it is, to use platform specific macros for each target. Or, `extra` argument can be used which works on both the platforms.
|
||||
|
||||
#### Additional Information
|
||||
- HCC allocates staging buffers (used for unpinned copies) on a per-device basis.
|
||||
- HCC creates a primary context when the HIP API is called. So in a pure driver API code, HIP/HCC will create a primary context while HIP/NVCC will have empty context stack. HIP/HCC will push primary context to context stack when it is empty. This can have subtle differences on applications which mix the runtime and driver APIs.
|
||||
|
||||
### NVCC Implementation Notes
|
||||
|
||||
#### Interoperation between HIP and CUDA Driver
|
||||
CUDA applications may want to mix CUDA driver code with HIP code (see example below). This
|
||||
table shows the type equivalence to enable this interaction.
|
||||
|
||||
|**HIP Type** |**CU Driver Type**| **CUDA Runtime Type** |
|
||||
| ---- | ---- | ---- |
|
||||
| hipModule | CUmodule | |
|
||||
| hipFunction | CUfunction | |
|
||||
| hipCtx_t | CUcontext | |
|
||||
| hipDevice_t | CUdevice | |
|
||||
| hipStream_t | CUstream | cudaStream_t |
|
||||
| hipEvent_t | CUevent | cudaEvent_t |
|
||||
| hipArray | CUarray | cudaArray |
|
||||
|
||||
|
||||
|
||||
#### Compilation Flags
|
||||
The hipModule interface does not support the hipModuleLoadEx function, which is used to control PTX compilaton options.
|
||||
HCC does not use PTX and does not support the same compilation options.
|
||||
In fact, HCC code objects always contain fully compiled ISA and do not require additional compilation as part of the load step.
|
||||
Code which requires this functionaly should use platform-specific coding, calling `cuModuleLoadEx`
|
||||
on the NVCC path and hipModuleLoad on the hcc path. For example:
|
||||
|
||||
```
|
||||
hipModule module;
|
||||
void *imagePtr = ... ; // Somehow populate data pointer with code object
|
||||
|
||||
#ifdef __HIP_PLATFORM_NVCC__
|
||||
// Use CUDA driver API but write to hipModule since they are same type:
|
||||
const int numOptions = 1;
|
||||
CUJit_option options[numOptions];
|
||||
void * optionValues[numOptions];
|
||||
|
||||
options[0] = CU_JIT_MAX_REGISTERS;
|
||||
unsigned maxRegs=15;
|
||||
optionValues[0] = (void*) (&maxRegs);
|
||||
|
||||
cuModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues);
|
||||
|
||||
#else // __HIP_PLATFORM_HCC__
|
||||
|
||||
// HCC path does not support or require JIT options, so just load the module.
|
||||
hipModuleLoadData(&module, imagePtr);
|
||||
|
||||
#endif
|
||||
|
||||
// Back to unified code - both paths above loaded the "module" variable.
|
||||
hipFunction k;
|
||||
hipModuleGetFunction(&k, module, "myKernel");
|
||||
```
|
||||
|
||||
The below sample shows how to use `hipModuleGetFunction`.
|
||||
|
||||
```
|
||||
#include<hip_runtime.h>
|
||||
#include<hip_runtime_api.h>
|
||||
#include<iostream>
|
||||
#include<fstream>
|
||||
#include<vector>
|
||||
|
||||
#define LEN 64
|
||||
#define SIZE LEN<<2
|
||||
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
#define fileName "vcpy_isa.co"
|
||||
#endif
|
||||
|
||||
#ifdef __HIP_PLATFORM_NVCC__
|
||||
#define fileName "vcpy_isa.ptx"
|
||||
#endif
|
||||
|
||||
#define kernel_name "hello_world"
|
||||
|
||||
int main(){
|
||||
float *A, *B;
|
||||
hipDeviceptr_t Ad, Bd;
|
||||
A = new float[LEN];
|
||||
B = new float[LEN];
|
||||
|
||||
for(uint32_t i=0;i<LEN;i++){
|
||||
A[i] = i*1.0f;
|
||||
B[i] = 0.0f;
|
||||
std::cout<<A[i] << " "<<B[i]<<std::endl;
|
||||
}
|
||||
|
||||
|
||||
#ifdef __HIP_PLATFORM_NVCC__
|
||||
hipInit(0);
|
||||
hipDevice_t device;
|
||||
hipCtx_t context;
|
||||
hipDeviceGet(&device, 0);
|
||||
hipCtxCreate(&context, 0, device);
|
||||
#endif
|
||||
|
||||
hipMalloc((void**)&Ad, SIZE);
|
||||
hipMalloc((void**)&Bd, SIZE);
|
||||
|
||||
hipMemcpyHtoD(Ad, A, SIZE);
|
||||
hipMemcpyHtoD(Bd, B, SIZE);
|
||||
hipModule_t Module;
|
||||
hipFunction_t Function;
|
||||
hipModuleLoad(&Module, fileName);
|
||||
hipModuleGetFunction(&Function, Module, kernel_name);
|
||||
|
||||
std::vector<void*>argBuffer(2);
|
||||
memcpy(&argBuffer[0], &Ad, sizeof(void*));
|
||||
memcpy(&argBuffer[1], &Bd, sizeof(void*));
|
||||
|
||||
size_t size = argBuffer.size()*sizeof(void*);
|
||||
|
||||
void *config[] = {
|
||||
HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0],
|
||||
HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
|
||||
HIP_LAUNCH_PARAM_END
|
||||
};
|
||||
|
||||
hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&con fig);
|
||||
|
||||
hipMemcpyDtoH(B, Bd, SIZE);
|
||||
for(uint32_t i=0;i<LEN;i++){
|
||||
std::cout<<A[i]<<" - "<<B[i]<<std::endl;
|
||||
}
|
||||
|
||||
#ifdef __HIP_PLATFORM_NVCC__
|
||||
hipCtxDetach(context);
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
```
|
||||
@@ -395,13 +395,53 @@ For new projects or ports which can be re-factored, we recommend the use of the
|
||||
This indicates that the code is standard C++ code, but also provides a unique indication for make tools to
|
||||
run hipcc when appropriate.
|
||||
|
||||
### Workarounds
|
||||
## Workarounds
|
||||
|
||||
#### warpSize
|
||||
### warpSize
|
||||
Code should not assume a warp size of 32 or 64. See [Warp Cross-Lane Functions](hip_kernel_language.md#warp-cross-lane-functions) for information on how to write portable wave-aware code.
|
||||
|
||||
## memcpyToSymbol
|
||||
|
||||
#### Textures and Cache Control
|
||||
HIP support for hipMemCpyToSymbol is under-development. This feature allows a kernel
|
||||
to define a device-side data symbol which can be accessed on the host side. The symbol
|
||||
can be in __constant or device space. As a workaround, programs can pass the symbol
|
||||
as an argument to the kernel, and use standard hipMemcpy routines to initialize it.
|
||||
|
||||
For example:
|
||||
|
||||
Device Code:
|
||||
```
|
||||
// Cuda Device Code
|
||||
__constant__ float Array[1024];
|
||||
__global__ void Inc(float *Out){
|
||||
Int tx = hipThreadIdx_x;
|
||||
Out[tx] = Array[tx] + 1;
|
||||
}
|
||||
|
||||
// HIP Device Code
|
||||
__global__ void Inc(hipLaunchParm lp, float *Array, float *Out){
|
||||
Int tx = hipThreadIdx_x;
|
||||
Out[tx] = Array[tx] + 1;
|
||||
}
|
||||
```
|
||||
|
||||
Host Code:
|
||||
```
|
||||
// CUDA Host Code
|
||||
cudaMemcpyToSymbol(Array, hostArray, sizeofArray);
|
||||
|
||||
// HIP Host Code
|
||||
hipMemcpy(Array, hostArray, sizeofArray);
|
||||
```
|
||||
|
||||
## threadfence_system
|
||||
Threadfence_system makes all device memory writes, all writes to mapped host memory, and all writes to peer memory visible to CPU and other GPU devices.
|
||||
Some implementations can provide this behavior by flushing the GPU L2 cache.
|
||||
HIP/HCC does not provide this functionality. As a workaround, users can set the environment variable `HSA_DISABLE_CACHE=1` to
|
||||
disable the GPU L2 cache. This will affect all accesses and for all kernels and so may have
|
||||
a performance impact.
|
||||
|
||||
### Textures and Cache Control
|
||||
|
||||
>Texture support is under-development and not yet supported by HIP.
|
||||
|
||||
|
||||
@@ -0,0 +1,172 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef HIPCOMPLEX_H
|
||||
#define HIPCOMPLEX_H
|
||||
|
||||
typedef struct{
|
||||
float x;
|
||||
float y;
|
||||
}hipFloatComplex;
|
||||
|
||||
__device__ static inline float hipCrealf(hipFloatComplex z){
|
||||
return z.x;
|
||||
}
|
||||
|
||||
__device__ static inline float hipCimagf(hipFloatComplex z){
|
||||
return z.y;
|
||||
}
|
||||
|
||||
__device__ static inline hipFloatComplex make_hipFloatComplex(float a, float b){
|
||||
hipFloatComplex z;
|
||||
z.x = a;
|
||||
z.y = b;
|
||||
return z;
|
||||
}
|
||||
|
||||
__device__ static inline hipFloatComplex hipConjf(hipFloatComplex z){
|
||||
hipFloatComplex ret;
|
||||
ret.x = z.x;
|
||||
ret.y = -z.y;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ static inline float hipCsqabsf(hipFloatComplex z){
|
||||
return z.x * z.x + z.y * z.y;
|
||||
}
|
||||
|
||||
__device__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q){
|
||||
return make_hipFloatComplex(p.x + q.x, p.y + q.y);
|
||||
}
|
||||
|
||||
__device__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q){
|
||||
return make_hipFloatComplex(p.x - q.x, p.y - q.y);
|
||||
}
|
||||
|
||||
__device__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q){
|
||||
return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
|
||||
}
|
||||
|
||||
__device__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q){
|
||||
float sqabs = hipCsqabsf(q);
|
||||
hipFloatComplex ret;
|
||||
ret.x = (p.x * q.x + p.y * q.y)/sqabs;
|
||||
ret.y = (p.y * q.x - p.x * q.y)/sqabs;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ static inline float hipCabsf(hipFloatComplex z){
|
||||
return sqrtf(hipCsqabsf(z));
|
||||
}
|
||||
|
||||
|
||||
typedef struct{
|
||||
double x;
|
||||
double y;
|
||||
}hipDoubleComplex;
|
||||
|
||||
__device__ static inline double hipCreal(hipDoubleComplex z){
|
||||
return z.x;
|
||||
}
|
||||
|
||||
__device__ static inline double hipCimag(hipDoubleComplex z){
|
||||
return z.y;
|
||||
}
|
||||
|
||||
__device__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b){
|
||||
hipDoubleComplex z;
|
||||
z.x = a;
|
||||
z.y = b;
|
||||
return z;
|
||||
}
|
||||
|
||||
__device__ static inline hipDoubleComplex hipConj(hipDoubleComplex z){
|
||||
hipDoubleComplex ret;
|
||||
ret.x = z.x;
|
||||
ret.y = z.y;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ static inline double hipCsqabs(hipDoubleComplex z){
|
||||
return z.x * z.x + z.y * z.y;
|
||||
}
|
||||
|
||||
__device__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q){
|
||||
return make_hipDoubleComplex(p.x + q.x, p.y + q.y);
|
||||
}
|
||||
|
||||
__device__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q){
|
||||
return make_hipDoubleComplex(p.x - q.x, p.y - q.y);
|
||||
}
|
||||
|
||||
__device__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q){
|
||||
return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
|
||||
}
|
||||
|
||||
__device__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q){
|
||||
double sqabs = hipCsqabs(q);
|
||||
hipDoubleComplex ret;
|
||||
ret.x = (p.x * q.x + p.y * q.y)/sqabs;
|
||||
ret.y = (p.y * q.x - p.x * q.y)/sqabs;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ static inline double hipCabs(hipDoubleComplex z){
|
||||
return sqrtf(hipCsqabs(z));
|
||||
}
|
||||
|
||||
typedef hipFloatComplex hipComplex;
|
||||
|
||||
__device__ static inline hipComplex make_hipComplex(float x,
|
||||
float y){
|
||||
return make_hipFloatComplex(x, y);
|
||||
}
|
||||
|
||||
__device__ static inline hipFloatComplex hipComplexDoubleToFloat
|
||||
(hipDoubleComplex z){
|
||||
return make_hipFloatComplex((float)z.x, (float)z.y);
|
||||
}
|
||||
|
||||
__device__ static inline hipDoubleComplex hipComplexFloatToDouble
|
||||
(hipFloatComplex z){
|
||||
return make_hipDoubleComplex((double)z.x, (double)z.y);
|
||||
}
|
||||
|
||||
__device__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r){
|
||||
float real = (p.x * q.x) + r.x;
|
||||
float imag = (q.x * p.y) + r.y;
|
||||
|
||||
real = -(p.y * q.y) + real;
|
||||
imag = (p.x * q.y) + imag;
|
||||
|
||||
return make_hipComplex(real, imag);
|
||||
}
|
||||
|
||||
__device__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q, hipDoubleComplex r){
|
||||
float real = (p.x * q.x) + r.x;
|
||||
float imag = (q.x * p.y) + r.y;
|
||||
|
||||
real = -(p.y * q.y) + real;
|
||||
imag = (p.x * q.y) + imag;
|
||||
|
||||
return make_hipDoubleComplex(real, imag);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -160,12 +160,17 @@ class ihipCtx_t;
|
||||
#endif
|
||||
|
||||
|
||||
// Just initialize the HIP runtime, but don't log any trace information.
|
||||
#define HIP_INIT()\
|
||||
std::call_once(hip_initialized, ihipInit);\
|
||||
ihipCtxStackUpdate();
|
||||
|
||||
|
||||
// This macro should be called at the beginning of every HIP API.
|
||||
// It initialies the hip runtime (exactly once), and
|
||||
// generate trace string that can be output to stderr or to ATP file.
|
||||
#define HIP_INIT_API(...) \
|
||||
std::call_once(hip_initialized, ihipInit);\
|
||||
HIP_INIT()\
|
||||
API_TRACE(__VA_ARGS__);
|
||||
|
||||
#define ihipLogStatus(hipStatus) \
|
||||
@@ -396,6 +401,22 @@ public:
|
||||
typedef ihipStreamCriticalBase_t<StreamMutex> ihipStreamCritical_t;
|
||||
typedef LockedAccessor<ihipStreamCritical_t> LockedAccessor_StreamCrit_t;
|
||||
|
||||
class ihipModule_t{
|
||||
public:
|
||||
hsa_executable_t executable;
|
||||
hsa_code_object_t object;
|
||||
std::string fileName;
|
||||
void *ptr;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
|
||||
class ihipFunction_t{
|
||||
public:
|
||||
hsa_executable_symbol_t kernel_symbol;
|
||||
uint64_t kernel;
|
||||
};
|
||||
|
||||
// Internal stream structure.
|
||||
class ihipStream_t {
|
||||
public:
|
||||
@@ -404,8 +425,9 @@ typedef uint64_t SeqNum_t ;
|
||||
~ihipStream_t();
|
||||
|
||||
// kind is hipMemcpyKind
|
||||
void copySync (LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind);
|
||||
void locked_copySync (void* dst, const void* src, size_t sizeBytes, unsigned kind);
|
||||
void copySync (LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn = true);
|
||||
void locked_copySync (void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn = true);
|
||||
|
||||
|
||||
void copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind);
|
||||
|
||||
@@ -423,7 +445,7 @@ typedef uint64_t SeqNum_t ;
|
||||
// Use this if we already have the stream critical data mutex:
|
||||
void wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty=false);
|
||||
|
||||
|
||||
void launchModuleKernel(hsa_signal_t signal, uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ, uint32_t sharedMemBytes, void *kernarg, size_t kernSize, uint64_t kernel);
|
||||
|
||||
// Non-threadsafe accessors - must be protected by high-level stream lock with accessor passed to function.
|
||||
SIGSEQNUM lastCopySeqId (LockedAccessor_StreamCrit_t &crit) const { return crit->_last_copy_signal ? crit->_last_copy_signal->_sigId : 0; };
|
||||
@@ -443,6 +465,7 @@ public:
|
||||
hc::accelerator_view _av;
|
||||
unsigned _flags;
|
||||
|
||||
|
||||
private:
|
||||
void enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal, ihipSignal_t *completionSignal);
|
||||
void waitCopy(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *signal);
|
||||
@@ -648,6 +671,8 @@ extern void ihipInit();
|
||||
extern const char *ihipErrorString(hipError_t);
|
||||
extern ihipCtx_t *ihipGetTlsDefaultCtx();
|
||||
extern void ihipSetTlsDefaultCtx(ihipCtx_t *ctx);
|
||||
extern hipError_t ihipSynchronize(void);
|
||||
extern hipError_t ihipCtxStackUpdate();
|
||||
|
||||
extern ihipDevice_t *ihipGetDevice(int);
|
||||
ihipCtx_t * ihipGetPrimaryCtx(unsigned deviceIndex);
|
||||
|
||||
@@ -568,16 +568,66 @@ __device__ void __threadfence_system(void);
|
||||
#define hipGridDim_y (hc_get_num_groups(1))
|
||||
#define hipGridDim_z (hc_get_num_groups(2))
|
||||
|
||||
// loop unrolling
|
||||
__device__ static inline void* memcpy(void* dst, void* src, size_t size)
|
||||
{
|
||||
uint64_t i = 0;
|
||||
uint64_t totalLength = size/sizeof(uint32_t);
|
||||
for(i=hipThreadIdx_x+hipBlockIdx_x*hipBlockDim_x;
|
||||
i<(totalLength/4);
|
||||
i = i + hipBlockDim_x * hipGridDim_x)
|
||||
{
|
||||
((uint32_t*)dst)[4*i] = ((uint32_t*)src)[4*i];
|
||||
((uint32_t*)dst)[4*i+1] = ((uint32_t*)src)[4*i+1];
|
||||
((uint32_t*)dst)[4*i+2] = ((uint32_t*)src)[4*i+2];
|
||||
((uint32_t*)dst)[4*i+3] = ((uint32_t*)src)[4*i+3];
|
||||
}
|
||||
if(4*i < totalLength){
|
||||
((uint32_t*)dst)[4*i] = ((uint32_t*)src)[4*i];
|
||||
((uint32_t*)dst)[4*i+1] = ((uint32_t*)src)[4*i+1];
|
||||
((uint32_t*)dst)[4*i+2] = ((uint32_t*)src)[4*i+2];
|
||||
((uint32_t*)dst)[4*i+3] = ((uint32_t*)src)[4*i+3];
|
||||
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
__device__ static inline void* memset(void* ptr, uint8_t val, size_t size)
|
||||
{
|
||||
uint32_t _val = 0;
|
||||
_val = (val | val << 8 | val << 16 | val << 24);
|
||||
uint64_t totalLength = size/sizeof(uint32_t);
|
||||
uint64_t i = 0;
|
||||
for(i=hipThreadIdx_x+hipBlockIdx_x*hipBlockDim_x;
|
||||
i<(totalLength/4);
|
||||
i = i + hipBlockDim_x * hipGridDim_x)
|
||||
{
|
||||
((uint32_t*)ptr)[4*i] = _val;
|
||||
((uint32_t*)ptr)[4*i+1] = _val;
|
||||
((uint32_t*)ptr)[4*i+2] = _val;
|
||||
((uint32_t*)ptr)[4*i+3] = _val;
|
||||
}
|
||||
if(4*i < totalLength){
|
||||
((uint32_t*)ptr)[4*i] = _val;
|
||||
((uint32_t*)ptr)[4*i+1] = _val;
|
||||
((uint32_t*)ptr)[4*i+2] = _val;
|
||||
((uint32_t*)ptr)[4*i+3] = _val;
|
||||
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
#define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)
|
||||
|
||||
#define HIP_KERNEL_NAME(...) __VA_ARGS__
|
||||
|
||||
#ifdef __HCC_CPP__
|
||||
hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm *lp);
|
||||
hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, grid_launch_parm *lp);
|
||||
hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, grid_launch_parm *lp);
|
||||
hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, grid_launch_parm *lp);
|
||||
void ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp);
|
||||
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm *lp);
|
||||
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, grid_launch_parm *lp);
|
||||
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, grid_launch_parm *lp);
|
||||
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, grid_launch_parm *lp);
|
||||
extern void ihipPrintKernelLaunch(const char *kernelName, const grid_launch_parm *lp, const hipStream_t stream);
|
||||
extern void ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp);
|
||||
|
||||
// TODO - move to common header file.
|
||||
#define KNRM "\x1B[0m"
|
||||
@@ -589,10 +639,9 @@ do {\
|
||||
grid_launch_parm lp;\
|
||||
lp.dynamic_group_mem_bytes = _groupMemBytes; \
|
||||
hipStream_t trueStream = (ihipPreLaunchKernel(_stream, _numBlocks3D, _blockDim3D, &lp)); \
|
||||
if (HIP_TRACE_API) {\
|
||||
fprintf(stderr, KGRN "<<hip-api: hipLaunchKernel '%s' gridDim:(%d,%d,%d) groupDim:(%d,%d,%d) groupMem:+%d stream=%p\n" KNRM, \
|
||||
#_kernelName, lp.grid_dim.x, lp.grid_dim.y, lp.grid_dim.z, lp.group_dim.x, lp.group_dim.y, lp.group_dim.z, lp.dynamic_group_mem_bytes, (void*)(_stream));\
|
||||
}\
|
||||
if (HIP_TRACE_API) {\
|
||||
ihipPrintKernelLaunch(#_kernelName, &lp, _stream); \
|
||||
}\
|
||||
_kernelName (lp, ##__VA_ARGS__);\
|
||||
ihipPostLaunchKernel(trueStream, lp);\
|
||||
} while(0)
|
||||
@@ -653,4 +702,6 @@ do {\
|
||||
*/
|
||||
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -52,13 +52,13 @@ typedef struct ihipDevice_t *hipDevice_t;
|
||||
|
||||
typedef struct ihipStream_t *hipStream_t;
|
||||
|
||||
typedef uint64_t hipFunction;
|
||||
typedef struct ihipModule_t *hipModule_t;
|
||||
|
||||
typedef uint64_t hipModule;
|
||||
typedef struct ihipFunction_t *hipFunction_t;
|
||||
|
||||
typedef struct hipEvent_t {
|
||||
struct ihipEvent_t *_handle;
|
||||
} hipEvent_t;
|
||||
typedef void* hipDeviceptr_t;
|
||||
|
||||
typedef struct ihipEvent_t *hipEvent_t;
|
||||
|
||||
|
||||
/**
|
||||
@@ -844,6 +844,14 @@ hipError_t hipHostFree(void* ptr);
|
||||
*/
|
||||
hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind);
|
||||
|
||||
hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes);
|
||||
|
||||
hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes);
|
||||
|
||||
hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes);
|
||||
|
||||
hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Copies @p sizeBytes bytes from the memory area pointed to by @p src to the memory area pointed to by @p offset bytes from the start of symbol @p symbol.
|
||||
@@ -1054,32 +1062,173 @@ hipError_t hipMemcpyPeerAsync(void* dst, int dstDevice, const void* src, int src
|
||||
hipError_t hipInit(unsigned int flags) ;
|
||||
|
||||
|
||||
/**
|
||||
*-------------------------------------------------------------------------------------------------
|
||||
*-------------------------------------------------------------------------------------------------
|
||||
* @defgroup Context Management
|
||||
* @{
|
||||
*/
|
||||
|
||||
// TODO-ctx
|
||||
/**
|
||||
* @brief Create a context and set it as current/ default context
|
||||
*
|
||||
* @param [out] ctx
|
||||
* @param [in] flags
|
||||
* @param [in] associated device handle
|
||||
*
|
||||
* @returns #hipSuccess, #hipErrorInvalidContext
|
||||
*/
|
||||
hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags, hipDevice_t device);
|
||||
|
||||
hipError_t hipCtxDestroy(hipCtx_t ctx);
|
||||
|
||||
/**
|
||||
* @brief Pop the current/default context and return the popped context.
|
||||
*
|
||||
* @param [out] ctx
|
||||
*
|
||||
* @returns #hipSuccess
|
||||
*/
|
||||
|
||||
hipError_t hipCtxPopCurrent(hipCtx_t* ctx);
|
||||
|
||||
/**
|
||||
* @brief Push the context to be set as current/ default context
|
||||
*
|
||||
* @param [in] ctx
|
||||
*
|
||||
* @returns #hipSuccess, #hipErrorInvalidContext
|
||||
*/
|
||||
|
||||
hipError_t hipCtxPushCurrent(hipCtx_t ctx);
|
||||
|
||||
/**
|
||||
* @brief Set the passed context as current/default
|
||||
*
|
||||
* @param [in] ctx
|
||||
*
|
||||
* @returns #hipSuccess
|
||||
*/
|
||||
|
||||
hipError_t hipCtxSetCurrent(hipCtx_t ctx);
|
||||
|
||||
/**
|
||||
* @brief Get the handle of the current/ default context
|
||||
*
|
||||
* @param [out] ctx
|
||||
*
|
||||
* @returns #hipSuccess
|
||||
*/
|
||||
|
||||
hipError_t hipCtxGetCurrent(hipCtx_t* ctx);
|
||||
|
||||
/**
|
||||
* @brief Get the handle of the device associated with current/default context
|
||||
*
|
||||
* @param [out] device
|
||||
*
|
||||
* @returns #hipSuccess, #hipErrorInvalidContext
|
||||
*/
|
||||
|
||||
hipError_t hipCtxGetDevice(hipDevice_t *device);
|
||||
|
||||
/**
|
||||
* @brief Returns the approximate HIP api version.
|
||||
*
|
||||
* @warning The HIP feature set does not correspond to an exact CUDA SDK api revision.
|
||||
* This function always set *apiVersion to 4 as an approximation though HIP supports
|
||||
* some features which were introduced in later CUDA SDK revisions.
|
||||
* HIP apps code should not rely on the api revision number here and should
|
||||
* use arch feature flags to test device capabilities or conditional compilation.
|
||||
*
|
||||
*/
|
||||
hipError_t hipCtxGetApiVersion (hipCtx_t ctx,int *apiVersion);
|
||||
|
||||
/**
|
||||
* @brief Set Cache configuration for a specific function
|
||||
*
|
||||
* Note: AMD devices and recent Nvidia GPUS do not support reconfigurable cache. This hint is ignored on those architectures.
|
||||
*
|
||||
*/
|
||||
hipError_t hipCtxGetCacheConfig ( hipFuncCache *cacheConfig );
|
||||
|
||||
/**
|
||||
* @brief Set L1/Shared cache partition.
|
||||
*
|
||||
* Note: AMD devices and recent Nvidia GPUS do not support reconfigurable cache. This hint is ignored on those architectures.
|
||||
*
|
||||
*/
|
||||
hipError_t hipCtxSetCacheConfig ( hipFuncCache cacheConfig );
|
||||
|
||||
/**
|
||||
* @brief Set Shared memory bank configuration.
|
||||
*
|
||||
* Note: AMD devices and recent Nvidia GPUS do not support shared cache banking, and the hint is ignored on those architectures.
|
||||
*
|
||||
*/
|
||||
hipError_t hipCtxSetSharedMemConfig ( hipSharedMemConfig config );
|
||||
|
||||
/**
|
||||
* @brief Get Shared memory bank configuration.
|
||||
*
|
||||
* Note: AMD devices and recent Nvidia GPUS do not support shared cache banking, and the hint is ignored on those architectures.
|
||||
*
|
||||
*/
|
||||
hipError_t hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig );
|
||||
|
||||
/**
|
||||
* @brief Blocks until the default context has completed all preceding requested tasks.
|
||||
*
|
||||
* This function waits for all streams on the default context to complete execution, and then returns.
|
||||
*
|
||||
* @returns #hipSuccess.
|
||||
*/
|
||||
hipError_t hipCtxSynchronize ( void );
|
||||
|
||||
/**
|
||||
* @brief Get flags used for creating current/default context.
|
||||
*
|
||||
* @param [out] flags
|
||||
*
|
||||
* @returns #hipSuccess.
|
||||
*/
|
||||
|
||||
hipError_t hipCtxGetFlags ( unsigned int* flags );
|
||||
|
||||
/**
|
||||
* @brief Enables direct access to memory allocations in a peer context.
|
||||
*
|
||||
* Memory which already allocated on peer device will be mapped into the address space of the current device. In addition, all
|
||||
* future memory allocations on peerDeviceId will be mapped into the address space of the current device when the memory is allocated.
|
||||
* The peer memory remains accessible from the current device until a call to hipDeviceDisablePeerAccess or hipDeviceReset.
|
||||
*
|
||||
*
|
||||
* @param [in] peerCtx
|
||||
* @param [in] flags
|
||||
*
|
||||
* Returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue,
|
||||
* @returns #hipErrorPeerAccessAlreadyEnabled if peer access is already enabled for this device.
|
||||
* @warning PeerToPeer support is experimental.
|
||||
*/
|
||||
hipError_t hipCtxEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags);
|
||||
|
||||
/**
|
||||
* @brief Disable direct access from current context's virtual address space to memory allocations physically located on a peer context.Disables direct access to memory allocations in a peer context and unregisters any registered allocations.
|
||||
*
|
||||
* Returns hipErrorPeerAccessNotEnabled if direct access to memory on peerDevice has not yet been enabled from the current device.
|
||||
*
|
||||
* @param [in] peerCtx
|
||||
*
|
||||
* @returns #hipSuccess, #hipErrorPeerAccessNotEnabled
|
||||
* @warning PeerToPeer support is experimental.
|
||||
*/
|
||||
hipError_t hipCtxDisablePeerAccess (hipCtx_t peerCtx);
|
||||
// doxygen end Context Management
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
|
||||
|
||||
// TODO-ctx
|
||||
/**
|
||||
* @return hipSuccess, hipErrorInvalidDevice
|
||||
@@ -1099,12 +1248,84 @@ hipError_t hipDeviceGetFromId(hipDevice_t *device, int deviceId);
|
||||
*/
|
||||
hipError_t hipDriverGetVersion(int *driverVersion) ;
|
||||
|
||||
/**
|
||||
* @brief Loads code object from file into a hipModule_t
|
||||
*
|
||||
* @param [in] fname
|
||||
* @param [out] module
|
||||
*
|
||||
* @returns hipSuccess, hipErrorInvalidValue, hipErrorInvalidContext, hipErrorFileNotFound, hipErrorOutOfMemory, hipErrorSharedObjectInitFailed, hipErrorNotInitialized
|
||||
*
|
||||
*
|
||||
*/
|
||||
hipError_t hipModuleLoad(hipModule_t *module, const char *fname);
|
||||
|
||||
hipError_t hipModuleLoad(hipModule *module, const char *fname);
|
||||
/**
|
||||
* @brief Freeing the module
|
||||
*
|
||||
* @param [in] module
|
||||
*
|
||||
* @returns hipSuccess, hipInvalidValue
|
||||
* module is freed and the code objects associated with it are destroyed
|
||||
*
|
||||
*/
|
||||
|
||||
hipError_t hipModuleGetFunction(hipFunction *function, hipModule module, const char *kname);
|
||||
hipError_t hipModuleUnload(hipModule_t module);
|
||||
|
||||
hipError_t hipLaunchModuleKernel(hipFunction f,
|
||||
/**
|
||||
* @brief Function with kname will be extracted present in module
|
||||
*
|
||||
* @param [in] module
|
||||
* @param [in] kname
|
||||
* @param [out] function
|
||||
*
|
||||
* @returns hipSuccess, hipErrorInvalidValue, hipErrorInvalidContext, hipErrorNotInitialized, hipErrorNotFound,
|
||||
*/
|
||||
hipError_t hipModuleGetFunction(hipFunction_t *function, hipModule_t module, const char *kname);
|
||||
|
||||
/**
|
||||
* @brief returns device memory pointer and size of the kernel present in the module with symbol - name
|
||||
*
|
||||
* @param [in] moodule
|
||||
* @param [in] name
|
||||
* @param [out] dptr
|
||||
* @param [out[ bytes
|
||||
*
|
||||
* @returns hipSuccess, hipErrorInvalidValue, hipErrorNotInitialized
|
||||
*/
|
||||
hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, hipModule_t hmod, const char *name);
|
||||
|
||||
/**
|
||||
* @brief builds module from code object which resides in host memory. And image is pointer to that location.
|
||||
*
|
||||
* @param [in] image
|
||||
* @param [out] module
|
||||
*
|
||||
* @returns hipSuccess, hipErrorNotInitialized, hipErrorOutOfMemory, hipErrorNotInitialized
|
||||
*/
|
||||
hipError_t hipModuleLoadData(hipModule_t *module, const void *image);
|
||||
|
||||
/**
|
||||
* @brief launches kernel f with launch parameters and shared memory on stream with arguments passed to kerneelparams or extra
|
||||
*
|
||||
* @param [in[ f
|
||||
* @param [in] gridDimX
|
||||
* @param [in] gridDimY
|
||||
* @param [in] gridDimZ
|
||||
* @param [in] blockDimX
|
||||
* @param [in] blockDimY
|
||||
* @param [in] blockDimZ
|
||||
* @param [in] sharedMemBytes
|
||||
* @param [in] stream
|
||||
* @param [in] kernelParams
|
||||
* @param [in] extraa
|
||||
*
|
||||
* The function takes the above arguments and run the kernel in hipFunction_t f. with launch parameters specified in gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY and blockDimmZ. The amount of shared memory is specificed and can be used with HIP_DYNAMIC_SHARED. The arguemt extra is used to pass in the arguments for the kernel.
|
||||
* @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
|
||||
*
|
||||
* @warning kernellParams argument is not yet implemented in HIP. Please use extra instead. Please refer to hip_porting_driver_api.md for sample usage.
|
||||
*/
|
||||
hipError_t hipModuleLaunchKernel(hipFunction_t f,
|
||||
unsigned int gridDimX,
|
||||
unsigned int gridDimY,
|
||||
unsigned int gridDimZ,
|
||||
@@ -1114,7 +1335,7 @@ hipError_t hipLaunchModuleKernel(hipFunction f,
|
||||
unsigned int sharedMemBytes,
|
||||
hipStream_t stream,
|
||||
void **kernelParams,
|
||||
void **extra) __attribute__((deprecated("kernelParams is not fully supported, use extra instead"))) ;
|
||||
void **extra) ;
|
||||
|
||||
// doxygen end Version Management
|
||||
/**
|
||||
|
||||
@@ -0,0 +1,31 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <hip/hip_common.h>
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) && !defined (__HIP_PLATFORM_NVCC__)
|
||||
#include <hip/hcc_detail/hipComplex.h>
|
||||
#elif defined(__HIP_PLATFORM_NVCC__) && !defined (__HIP_PLATFORM_HCC__)
|
||||
#include <hip/nvcc_detail/hipComplex.h>
|
||||
#else
|
||||
#error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,108 @@
|
||||
#ifndef HIPCOMPLEX_H
|
||||
#define HIPCOMPLEX_H
|
||||
|
||||
#include"cuComplex.h"
|
||||
|
||||
typedef cuFloatComplex hipFloatComplex;
|
||||
|
||||
__device__ __host__ static inline float hipCrealf(hipFloatComplex z){
|
||||
return cuCrealf(z);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline float hipCimagf(hipFloatComplex z){
|
||||
return cuCimagf(z);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b){
|
||||
return make_cuFloatComplex(a, b);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z){
|
||||
return cuConjf(z);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z){
|
||||
return cuCabsf(z) * cuCabsf(z);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q){
|
||||
return cuCaddf(p, q);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q){
|
||||
return cuCsubf(p, q);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q){
|
||||
return cuCmulf(p, q);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q){
|
||||
return cuCdivf(p, q);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline float hipCabsf(hipFloatComplex z){
|
||||
return cuCabsf(p, q);
|
||||
}
|
||||
|
||||
typedef cuDoubleComplex hipDoubleComplex;
|
||||
|
||||
__device__ __host__ static inline double hipCreal(hipDoubleComplex z){
|
||||
return cuCreal(z);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline double hipCimag(hipDoubleComplex z){
|
||||
return cuCimag(z);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b){
|
||||
return make_cuDoubleComplex(a, b);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z){
|
||||
return cuConj(z);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipDoubleComplex hipCsqabs(hipDoubleComplex z){
|
||||
return cuCabs(z) * cuCabs(z);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q){
|
||||
return cuCadd(p, q);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q){
|
||||
return cuCsub(p, q);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q){
|
||||
return cuCdiv(p, q);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline double hipCabs(hipDoubleComplex z){
|
||||
return cuCabs(z);
|
||||
}
|
||||
|
||||
typedef cuFloatComplex hipComplex;
|
||||
|
||||
__device__ __host__ static inline hipComplex make_Complex(float x, float y){
|
||||
return make_cuComplex(x, y);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z){
|
||||
return cuComplexDoubleToFloat(z);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z){
|
||||
return cuComplexFloatToDouble(z);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r){
|
||||
return cuCfmaf(p, q, r);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipDoubleComplex hipCfma(hipComplex p, hipComplex q, hipComplex r){
|
||||
return cuCfma(p, q, r);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -22,7 +22,7 @@ THE SOFTWARE.
|
||||
#pragma once
|
||||
|
||||
#include <cuda_runtime_api.h>
|
||||
|
||||
#include <cuda.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
@@ -58,8 +58,20 @@ hipMemcpyHostToHost
|
||||
#define hipHostRegisterPortable cudaHostRegisterPortable
|
||||
#define hipHostRegisterMapped cudaHostRegisterMapped
|
||||
|
||||
#define HIP_LAUNCH_PARAM_BUFFER_POINTER CU_LAUNCH_PARAM_BUFFER_POINTER
|
||||
#define HIP_LAUNCH_PARAM_BUFFER_SIZE CU_LAUNCH_PARAM_BUFFER_SIZE
|
||||
#define HIP_LAUNCH_PARAM_END CU_LAUNCH_PARAM_END
|
||||
|
||||
typedef cudaEvent_t hipEvent_t;
|
||||
typedef cudaStream_t hipStream_t;
|
||||
typedef CUcontext hipCtx_t;
|
||||
typedef CUsharedconfig hipSharedMemConfig;
|
||||
typedef CUfunc_cache hipFuncCache;
|
||||
typedef CUdevice hipDevice_t;
|
||||
typedef CUmodule hipModule_t;
|
||||
typedef CUfunction hipFunction_t;
|
||||
typedef CUdeviceptr hipDeviceptr_t;
|
||||
|
||||
//typedef cudaChannelFormatDesc hipChannelFormatDesc;
|
||||
#define hipChannelFormatDesc cudaChannelFormatDesc
|
||||
|
||||
@@ -85,6 +97,20 @@ switch(cuError) {
|
||||
};
|
||||
}
|
||||
|
||||
inline static hipError_t hipCUResultTohipError(CUresult cuError) { //TODO Populate further
|
||||
switch(cuError) {
|
||||
case CUDA_SUCCESS : return hipSuccess;
|
||||
case CUDA_ERROR_OUT_OF_MEMORY : return hipErrorMemoryAllocation ;
|
||||
case CUDA_ERROR_INVALID_VALUE : return hipErrorInvalidValue ;
|
||||
case CUDA_ERROR_INVALID_DEVICE : return hipErrorInvalidDevice ;
|
||||
case CUDA_ERROR_DEINITIALIZED : return hipErrorDeinitialized ;
|
||||
case CUDA_ERROR_NO_DEVICE : return hipErrorNoDevice ;
|
||||
case CUDA_ERROR_INVALID_CONTEXT : return hipErrorInvalidContext ;
|
||||
case CUDA_ERROR_NOT_INITIALIZED : return hipErrorNotInitialized ;
|
||||
default : return hipErrorUnknown; // Note - translated error.
|
||||
};
|
||||
}
|
||||
|
||||
// TODO match the error enum names of hip and cuda
|
||||
inline static cudaError_t hipErrorToCudaError(hipError_t hError) {
|
||||
switch(hError) {
|
||||
@@ -124,6 +150,11 @@ default:
|
||||
}
|
||||
}
|
||||
|
||||
inline static hipError_t hipInit(unsigned int flags)
|
||||
{
|
||||
return hipCUResultTohipError(cuInit(flags));
|
||||
}
|
||||
|
||||
inline static hipError_t hipDeviceReset() {
|
||||
return hipCUDAErrorTohipError(cudaDeviceReset());
|
||||
}
|
||||
@@ -182,6 +213,25 @@ inline static hipError_t hipHostFree(void* ptr) {
|
||||
inline static hipError_t hipSetDevice(int device) {
|
||||
return hipCUDAErrorTohipError(cudaSetDevice(device));
|
||||
}
|
||||
|
||||
inline static hipError_t hipMemcpyHtoD(hipDeviceptr_t dst,
|
||||
void* src, size_t size)
|
||||
{
|
||||
return hipCUResultTohipError(cuMemcpyHtoD(dst, src, size));
|
||||
}
|
||||
|
||||
inline static hipError_t hipMemcpyDtoH(void* dst,
|
||||
hipDeviceptr_t src, size_t size)
|
||||
{
|
||||
return hipCUResultTohipError(cuMemcpyDtoH(dst, src, size));
|
||||
}
|
||||
|
||||
inline static hipError_t hipMemcpyDtoD(hipDeviceptr_t dst,
|
||||
hipDeviceptr_t src, size_t size)
|
||||
{
|
||||
return hipCUResultTohipError(cuMemcpyDtoD(dst, src, size));
|
||||
}
|
||||
|
||||
inline static hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind copyKind) {
|
||||
return hipCUDAErrorTohipError(cudaMemcpy(dst, src, sizeBytes, hipMemcpyKindToCudaMemcpyKind(copyKind)));
|
||||
}
|
||||
@@ -347,20 +397,6 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att
|
||||
return hipCUDAErrorTohipError(cerror);
|
||||
}
|
||||
|
||||
template<class T>
|
||||
inline static hipError_t hipOccupancyMaxPotentialBlockSize(
|
||||
int *minGridSize,
|
||||
int *blockSize,
|
||||
T func,
|
||||
size_t dynamicSMemSize = 0,
|
||||
int blockSizeLimit = 0,
|
||||
unsigned int flags = 0
|
||||
){
|
||||
cudaError_t cerror;
|
||||
cerror = cudaOccupancyMaxPotentialBlockSize(minGridSize, blockSize, func, dynamicSMemSize, blockSizeLimit, flags);
|
||||
return hipCUDAErrorTohipError(cerror);
|
||||
}
|
||||
|
||||
inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
int *numBlocks,
|
||||
const void* func,
|
||||
@@ -458,7 +494,6 @@ inline static hipError_t hipDriverGetVersion(int *driverVersion)
|
||||
return hipCUDAErrorTohipError(err);
|
||||
}
|
||||
|
||||
|
||||
inline static hipError_t hipDeviceCanAccessPeer ( int* canAccessPeer, int device, int peerDevice )
|
||||
{
|
||||
return hipCUDAErrorTohipError(cudaDeviceCanAccessPeer(canAccessPeer, device, peerDevice));
|
||||
@@ -474,6 +509,16 @@ inline static hipError_t hipDeviceEnablePeerAccess ( int peerDevice, unsigned
|
||||
return hipCUDAErrorTohipError(cudaDeviceEnablePeerAccess ( peerDevice, flags ));
|
||||
}
|
||||
|
||||
inline static hipError_t hipCtxDisablePeerAccess ( hipCtx_t peerCtx )
|
||||
{
|
||||
return hipCUResultTohipError(cuCtxDisablePeerAccess ( peerCtx ));
|
||||
}
|
||||
|
||||
inline static hipError_t hipCtxEnablePeerAccess ( hipCtx_t peerCtx, unsigned int flags )
|
||||
{
|
||||
return hipCUResultTohipError(cuCtxEnablePeerAccess ( peerCtx, flags ));
|
||||
}
|
||||
|
||||
inline static hipError_t hipMemcpyPeer ( void* dst, int dstDevice, const void* src, int srcDevice, size_t count )
|
||||
{
|
||||
return hipCUDAErrorTohipError(cudaMemcpyPeer ( dst, dstDevice, src, srcDevice, count ));
|
||||
@@ -499,12 +544,145 @@ inline static hipError_t hipEventQuery(hipEvent_t event)
|
||||
return hipCUDAErrorTohipError(cudaEventQuery(event));
|
||||
}
|
||||
|
||||
inline static hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags, hipDevice_t device)
|
||||
{
|
||||
return hipCUResultTohipError(cuCtxCreate ( ctx,flags,device ));
|
||||
}
|
||||
|
||||
inline static hipError_t hipCtxDestroy(hipCtx_t ctx)
|
||||
{
|
||||
return hipCUResultTohipError(cuCtxDestroy ( ctx ));
|
||||
}
|
||||
|
||||
inline static hipError_t hipCtxPopCurrent(hipCtx_t* ctx)
|
||||
{
|
||||
return hipCUResultTohipError(cuCtxPopCurrent ( ctx ));
|
||||
}
|
||||
|
||||
inline static hipError_t hipCtxPushCurrent(hipCtx_t ctx)
|
||||
{
|
||||
return hipCUResultTohipError(cuCtxPushCurrent ( ctx ));
|
||||
}
|
||||
|
||||
inline static hipError_t hipCtxSetCurrent(hipCtx_t ctx)
|
||||
{
|
||||
return hipCUResultTohipError(cuCtxSetCurrent ( ctx ));
|
||||
}
|
||||
|
||||
inline static hipError_t hipCtxGetCurrent(hipCtx_t* ctx)
|
||||
{
|
||||
return hipCUResultTohipError(cuCtxGetCurrent ( ctx ));
|
||||
}
|
||||
|
||||
inline static hipError_t hipCtxGetDevice(hipDevice_t *device)
|
||||
{
|
||||
return hipCUResultTohipError(cuCtxGetDevice ( device ));
|
||||
}
|
||||
|
||||
inline static hipError_t hipCtxGetApiVersion (hipCtx_t ctx,int *apiVersion)
|
||||
{
|
||||
return hipCUResultTohipError(cuCtxGetApiVersion ( ctx,(unsigned int*)apiVersion ));
|
||||
}
|
||||
|
||||
inline static hipError_t hipCtxGetCacheConfig ( hipFuncCache *cacheConfig )
|
||||
{
|
||||
return hipCUResultTohipError(cuCtxGetCacheConfig ( cacheConfig ));
|
||||
}
|
||||
|
||||
inline static hipError_t hipCtxSetCacheConfig ( hipFuncCache cacheConfig )
|
||||
{
|
||||
return hipCUResultTohipError(cuCtxSetCacheConfig ( cacheConfig ));
|
||||
}
|
||||
|
||||
inline static hipError_t hipCtxSetSharedMemConfig ( hipSharedMemConfig config )
|
||||
{
|
||||
return hipCUResultTohipError(cuCtxSetSharedMemConfig ( config ));
|
||||
}
|
||||
|
||||
inline static hipError_t hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig )
|
||||
{
|
||||
return hipCUResultTohipError(cuCtxGetSharedMemConfig ( pConfig ));
|
||||
}
|
||||
|
||||
inline static hipError_t hipCtxSynchronize ( void )
|
||||
{
|
||||
return hipCUResultTohipError(cuCtxSynchronize ( ));
|
||||
}
|
||||
|
||||
inline static hipError_t hipCtxGetFlags ( unsigned int* flags )
|
||||
{
|
||||
return hipCUResultTohipError(cuCtxGetFlags ( flags ));
|
||||
}
|
||||
|
||||
inline static hipError_t hipCtxDetach(hipCtx_t ctx)
|
||||
{
|
||||
return hipCUResultTohipError(cuCtxDetach(ctx));
|
||||
}
|
||||
|
||||
inline static hipError_t hipDeviceGet(hipDevice_t *device, int ordinal)
|
||||
{
|
||||
return hipCUResultTohipError(cuDeviceGet(device, ordinal));
|
||||
}
|
||||
|
||||
inline static hipError_t hipModuleLoad(hipModule_t *module, const char* fname)
|
||||
{
|
||||
return hipCUResultTohipError(cuModuleLoad(module, fname));
|
||||
}
|
||||
|
||||
inline static hipError_t hipModuleUnload(hipModule_t hmod)
|
||||
{
|
||||
return hipCUResultTohipError(cuModuleUnload(hmod));
|
||||
}
|
||||
|
||||
inline static hipError_t hipModuleGetFunction(hipFunction_t *function,
|
||||
hipModule_t module, const char *kname)
|
||||
{
|
||||
return hipCUResultTohipError(cuModuleGetFunction(function, module, kname));
|
||||
}
|
||||
|
||||
inline static hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
|
||||
hipModule_t hmod, const char* name)
|
||||
{
|
||||
return hipCUResultTohipError(cuModuleGetGlobal(dptr, bytes, hmod, name));
|
||||
}
|
||||
|
||||
inline static hipError_t hipModuleLoadData(hipModule_t *module, const void *image)
|
||||
{
|
||||
return hipCUResultTohipError(cuModuleLoadData(module, image));
|
||||
}
|
||||
|
||||
inline static hipError_t hipModuleLaunchKernel(hipFunction_t f,
|
||||
unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
|
||||
unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
|
||||
unsigned int sharedMemBytes, hipStream_t stream,
|
||||
void **kernelParams, void **extra)
|
||||
{
|
||||
return hipCUResultTohipError(cuLaunchKernel(f,
|
||||
gridDimX, gridDimY, gridDimZ,
|
||||
blockDimX, blockDimY, blockDimZ,
|
||||
sharedMemBytes, stream, kernelParams, extra));
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __CUDACC__
|
||||
|
||||
template<class T>
|
||||
inline static hipError_t hipOccupancyMaxPotentialBlockSize(
|
||||
int *minGridSize,
|
||||
int *blockSize,
|
||||
T func,
|
||||
size_t dynamicSMemSize = 0,
|
||||
int blockSizeLimit = 0,
|
||||
unsigned int flags = 0
|
||||
){
|
||||
cudaError_t cerror;
|
||||
cerror = cudaOccupancyMaxPotentialBlockSize(minGridSize, blockSize, func, dynamicSMemSize, blockSizeLimit, flags);
|
||||
return hipCUDAErrorTohipError(cerror);
|
||||
}
|
||||
|
||||
template <class T, int dim, enum cudaTextureReadMode readMode>
|
||||
inline static hipError_t hipBindTexture(size_t *offset,
|
||||
const struct texture<T, dim, readMode> &tex,
|
||||
|
||||
@@ -0,0 +1,21 @@
|
||||
HIP_PATH?= $(wildcard /opt/rocm/hip)
|
||||
ifeq (,$(HIP_PATH))
|
||||
HIP_PATH=../../..
|
||||
endif
|
||||
HIPCC=$(HIP_PATH)/bin/hipcc
|
||||
HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --compiler)
|
||||
|
||||
ifeq (${HIP_PLATFORM}, hcc)
|
||||
GENCODEOBJECT_FLAGS=--target-isa-fiji
|
||||
endif
|
||||
|
||||
all: vcpy_isa.compile runKernel.hip.out
|
||||
|
||||
vcpy_isa.compile: vcpy_isa.cpp
|
||||
$(HIPCC) --gencodeobject $(GENCODEOBJECT_FLAGS) vcpy_isa.cpp -o vcpy_isa.co
|
||||
|
||||
runKernel.hip.out: runKernel.cpp
|
||||
$(HIPCC) runKernel.cpp -o runKernel.hip.out
|
||||
|
||||
clean:
|
||||
rm -f *.co *.out
|
||||
@@ -0,0 +1,105 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include<hip_runtime.h>
|
||||
#include<hip_runtime_api.h>
|
||||
#include<iostream>
|
||||
#include<fstream>
|
||||
#include<vector>
|
||||
|
||||
#define LEN 64
|
||||
#define SIZE LEN<<2
|
||||
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
#define fileName "vcpy_isa.co"
|
||||
#define kernel_name "ZN12_GLOBAL__N_146_Z11hello_world16grid_launch_parmPfS0__functor19__cxxamp_trampolineEiiiiiiPKfPf"
|
||||
#endif
|
||||
|
||||
#ifdef __HIP_PLATFORM_NVCC__
|
||||
#define fileName "vcpy_isa.ptx"
|
||||
#define kernel_name "hello_world"
|
||||
#endif
|
||||
|
||||
int main(){
|
||||
float *A, *B;
|
||||
hipDeviceptr_t Ad, Bd;
|
||||
A = new float[LEN];
|
||||
B = new float[LEN];
|
||||
|
||||
for(uint32_t i=0;i<LEN;i++){
|
||||
A[i] = i*1.0f;
|
||||
B[i] = 0.0f;
|
||||
}
|
||||
|
||||
|
||||
#ifdef __HIP_PLATFORM_NVCC__
|
||||
hipInit(0);
|
||||
hipDevice_t device;
|
||||
hipCtx_t context;
|
||||
hipDeviceGet(&device, 0);
|
||||
hipCtxCreate(&context, 0, device);
|
||||
#endif
|
||||
|
||||
hipMalloc((void**)&Ad, SIZE);
|
||||
hipMalloc((void**)&Bd, SIZE);
|
||||
|
||||
hipMemcpyHtoD(Ad, A, SIZE);
|
||||
hipMemcpyHtoD(Bd, B, SIZE);
|
||||
hipModule_t Module;
|
||||
hipFunction_t Function;
|
||||
hipModuleLoad(&Module, fileName);
|
||||
hipModuleGetFunction(&Function, Module, kernel_name);
|
||||
|
||||
uint32_t len = LEN;
|
||||
uint32_t one = 1;
|
||||
|
||||
std::vector<void*>argBuffer(5);
|
||||
uint32_t *ptr32_t = (uint32_t*)&argBuffer[0];
|
||||
memcpy(ptr32_t + 0, &one, sizeof(uint32_t));
|
||||
memcpy(ptr32_t + 1, &one, sizeof(uint32_t));
|
||||
memcpy(ptr32_t + 2, &one, sizeof(uint32_t));
|
||||
memcpy(ptr32_t + 3, &len, sizeof(uint32_t));
|
||||
memcpy(ptr32_t + 4, &one, sizeof(uint32_t));
|
||||
memcpy(ptr32_t + 5, &one, sizeof(uint32_t));
|
||||
memcpy(&argBuffer[3], &Ad, sizeof(void*));
|
||||
memcpy(&argBuffer[4], &Bd, sizeof(void*));
|
||||
|
||||
|
||||
|
||||
size_t size = argBuffer.size()*sizeof(void*);
|
||||
|
||||
void *config[] = {
|
||||
HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0],
|
||||
HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
|
||||
HIP_LAUNCH_PARAM_END
|
||||
};
|
||||
|
||||
hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config);
|
||||
|
||||
hipMemcpyDtoH(B, Bd, SIZE);
|
||||
for(uint32_t i=LEN-4;i<LEN;i++){
|
||||
std::cout<<A[i]<<" - "<<B[i]<<std::endl;
|
||||
}
|
||||
|
||||
#ifdef __HIP_PLATFORM_NVCC__
|
||||
hipCtxDetach(context);
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,9 @@
|
||||
#include<hip_runtime.h>
|
||||
|
||||
__global__ void hello_world(hipLaunchParm lp, float *a, float *b)
|
||||
{
|
||||
int tx = hipThreadIdx_x;
|
||||
b[tx] = a[tx];
|
||||
}
|
||||
|
||||
int main(){}
|
||||
@@ -0,0 +1,6 @@
|
||||
|
||||
extern "C" __global__ void hello_world(float *a, float *b)
|
||||
{
|
||||
int tx = threadIdx.x;
|
||||
b[tx] = a[tx];
|
||||
}
|
||||
@@ -0,0 +1,38 @@
|
||||
//
|
||||
// Generated by NVIDIA NVVM Compiler
|
||||
//
|
||||
// Compiler Build ID: CL-19856038
|
||||
// Cuda compilation tools, release 7.5, V7.5.17
|
||||
// Based on LLVM 3.4svn
|
||||
//
|
||||
|
||||
.version 4.3
|
||||
.target sm_20
|
||||
.address_size 64
|
||||
|
||||
// .globl hello_world
|
||||
|
||||
.visible .entry hello_world(
|
||||
.param .u64 hello_world_param_0,
|
||||
.param .u64 hello_world_param_1
|
||||
)
|
||||
{
|
||||
.reg .f32 %f<2>;
|
||||
.reg .b32 %r<2>;
|
||||
.reg .b64 %rd<8>;
|
||||
|
||||
|
||||
ld.param.u64 %rd1, [hello_world_param_0];
|
||||
ld.param.u64 %rd2, [hello_world_param_1];
|
||||
cvta.to.global.u64 %rd3, %rd2;
|
||||
cvta.to.global.u64 %rd4, %rd1;
|
||||
mov.u32 %r1, %tid.x;
|
||||
mul.wide.s32 %rd5, %r1, 4;
|
||||
add.s64 %rd6, %rd4, %rd5;
|
||||
ld.global.f32 %f1, [%rd6];
|
||||
add.s64 %rd7, %rd3, %rd5;
|
||||
st.global.f32 [%rd7], %f1;
|
||||
ret;
|
||||
}
|
||||
|
||||
|
||||
+116
-11
@@ -29,7 +29,22 @@ THE SOFTWARE.
|
||||
// Stack of contexts
|
||||
thread_local std::stack<ihipCtx_t *> tls_ctxStack;
|
||||
|
||||
hipError_t ihipCtxStackUpdate()
|
||||
{
|
||||
//HIP_INIT_API();
|
||||
hipError_t e = hipSuccess;
|
||||
|
||||
if(tls_ctxStack.empty()) {
|
||||
tls_ctxStack.push(ihipGetTlsDefaultCtx());
|
||||
}
|
||||
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return #hipSuccess, #hipErrorInvalidValue
|
||||
*/
|
||||
//---
|
||||
hipError_t hipInit(unsigned int flags)
|
||||
{
|
||||
HIP_INIT_API(flags);
|
||||
@@ -44,7 +59,10 @@ hipError_t hipInit(unsigned int flags)
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return #hipSuccess
|
||||
*/
|
||||
//---
|
||||
hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags, hipDevice_t device)
|
||||
{
|
||||
HIP_INIT_API(ctx, flags, device); // FIXME - review if we want to init
|
||||
@@ -57,7 +75,10 @@ hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags, hipDevice_t device)
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return #hipSuccess, #hipErrorInvalidDevice
|
||||
*/
|
||||
//---
|
||||
hipError_t hipDeviceGet(hipDevice_t *device, int deviceId)
|
||||
{
|
||||
HIP_INIT_API(device, deviceId); // FIXME - review if we want to init
|
||||
@@ -88,20 +109,39 @@ hipError_t hipDriverGetVersion(int *driverVersion)
|
||||
return ihipLogStatus(hipSuccess);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return #hipSuccess, #hipErrorInvalidValue
|
||||
*/
|
||||
//---
|
||||
hipError_t hipCtxDestroy(hipCtx_t ctx)
|
||||
{
|
||||
HIP_INIT_API(ctx);
|
||||
hipError_t e = hipSuccess;
|
||||
ihipCtx_t* currentCtx= ihipGetTlsDefaultCtx();
|
||||
if(currentCtx == ctx) {
|
||||
//need to destroy the ctx associated with calling thread
|
||||
tls_ctxStack.pop();
|
||||
ihipCtx_t* primaryCtx= ((ihipDevice_t*)ctx->getDevice())->_primaryCtx;
|
||||
if(primaryCtx== ctx)
|
||||
{
|
||||
e = hipErrorInvalidValue;
|
||||
}
|
||||
delete ctx; //As per CUDA docs , attempting to access ctx from those threads which has this ctx as current, will result in the error HIP_ERROR_CONTEXT_IS_DESTROYED.
|
||||
else
|
||||
{
|
||||
if(currentCtx == ctx) {
|
||||
//need to destroy the ctx associated with calling thread
|
||||
tls_ctxStack.pop();
|
||||
}
|
||||
delete ctx; //As per CUDA docs , attempting to access ctx from those threads which has this ctx as current, will result in the error HIP_ERROR_CONTEXT_IS_DESTROYED.
|
||||
}
|
||||
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return #hipSuccess
|
||||
*/
|
||||
//---
|
||||
hipError_t hipCtxPopCurrent(hipCtx_t* ctx)
|
||||
{
|
||||
HIP_INIT_API(ctx);
|
||||
hipError_t e = hipSuccess;
|
||||
ihipCtx_t* tempCtx;
|
||||
*ctx = ihipGetTlsDefaultCtx();
|
||||
@@ -119,8 +159,13 @@ hipError_t hipCtxPopCurrent(hipCtx_t* ctx)
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return #hipSuccess, #hipErrorInvalidContext
|
||||
*/
|
||||
//---
|
||||
hipError_t hipCtxPushCurrent(hipCtx_t ctx)
|
||||
{
|
||||
HIP_INIT_API(ctx);
|
||||
hipError_t e = hipSuccess;
|
||||
if(ctx != NULL) { //TODO- is this check needed?
|
||||
ihipSetTlsDefaultCtx(ctx);
|
||||
@@ -132,19 +177,30 @@ hipError_t hipCtxPushCurrent(hipCtx_t ctx)
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return #hipSuccess
|
||||
*/
|
||||
//---
|
||||
hipError_t hipCtxGetCurrent(hipCtx_t* ctx)
|
||||
{
|
||||
HIP_INIT_API(ctx);
|
||||
hipError_t e = hipSuccess;
|
||||
|
||||
*ctx = ihipGetTlsDefaultCtx();
|
||||
if(*ctx == nullptr) {
|
||||
*ctx = NULL; //TODO - is it required? Can return nullptr?
|
||||
if(!tls_ctxStack.empty()) {
|
||||
*ctx= tls_ctxStack.top();
|
||||
}
|
||||
else {
|
||||
*ctx = NULL;
|
||||
}
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return #hipSuccess
|
||||
*/
|
||||
//---
|
||||
hipError_t hipCtxSetCurrent(hipCtx_t ctx)
|
||||
{
|
||||
HIP_INIT_API(ctx);
|
||||
hipError_t e = hipSuccess;
|
||||
if(ctx == NULL) {
|
||||
tls_ctxStack.pop();
|
||||
@@ -156,8 +212,13 @@ hipError_t hipCtxSetCurrent(hipCtx_t ctx)
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return #hipSuccess, #hipErrorInvalidContext
|
||||
*/
|
||||
//---
|
||||
hipError_t hipCtxGetDevice(hipDevice_t *device)
|
||||
{
|
||||
HIP_INIT_API(device);
|
||||
hipError_t e = hipSuccess;
|
||||
|
||||
ihipCtx_t *ctx = ihipGetTlsDefaultCtx();
|
||||
@@ -171,6 +232,10 @@ hipError_t hipCtxGetDevice(hipDevice_t *device)
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return #hipSuccess
|
||||
*/
|
||||
//---
|
||||
hipError_t hipCtxGetApiVersion (hipCtx_t ctx,int *apiVersion)
|
||||
{
|
||||
HIP_INIT_API(apiVersion);
|
||||
@@ -182,6 +247,10 @@ hipError_t hipCtxGetApiVersion (hipCtx_t ctx,int *apiVersion)
|
||||
return ihipLogStatus(hipSuccess);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return #hipSuccess
|
||||
*/
|
||||
//---
|
||||
hipError_t hipCtxGetCacheConfig ( hipFuncCache *cacheConfig )
|
||||
{
|
||||
HIP_INIT_API(cacheConfig);
|
||||
@@ -191,6 +260,10 @@ hipError_t hipCtxGetCacheConfig ( hipFuncCache *cacheConfig )
|
||||
return ihipLogStatus(hipSuccess);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return #hipSuccess
|
||||
*/
|
||||
//---
|
||||
hipError_t hipCtxSetCacheConfig ( hipFuncCache cacheConfig )
|
||||
{
|
||||
HIP_INIT_API(cacheConfig);
|
||||
@@ -200,6 +273,10 @@ hipError_t hipCtxSetCacheConfig ( hipFuncCache cacheConfig )
|
||||
return ihipLogStatus(hipSuccess);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return #hipSuccess
|
||||
*/
|
||||
//---
|
||||
hipError_t hipCtxSetSharedMemConfig ( hipSharedMemConfig config )
|
||||
{
|
||||
HIP_INIT_API(config);
|
||||
@@ -209,6 +286,10 @@ hipError_t hipCtxSetSharedMemConfig ( hipSharedMemConfig config )
|
||||
return ihipLogStatus(hipSuccess);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return #hipSuccess
|
||||
*/
|
||||
//---
|
||||
hipError_t hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig )
|
||||
{
|
||||
HIP_INIT_API(pConfig);
|
||||
@@ -216,4 +297,28 @@ hipError_t hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig )
|
||||
*pConfig = hipSharedMemBankSizeFourByte;
|
||||
|
||||
return ihipLogStatus(hipSuccess);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return #hipSuccess
|
||||
*/
|
||||
//---
|
||||
hipError_t hipCtxSynchronize ( void )
|
||||
{
|
||||
HIP_INIT_API(1);
|
||||
return ihipSynchronize(); //TODP Shall check validity of ctx?
|
||||
}
|
||||
|
||||
/**
|
||||
* @return #hipSuccess
|
||||
*/
|
||||
//---
|
||||
hipError_t hipCtxGetFlags ( unsigned int* flags )
|
||||
{
|
||||
HIP_INIT_API(flags);
|
||||
hipError_t e = hipSuccess;
|
||||
ihipCtx_t* tempCtx;
|
||||
tempCtx = ihipGetTlsDefaultCtx();
|
||||
*flags = tempCtx->_ctxFlags;
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
@@ -160,11 +160,8 @@ hipError_t hipSetDevice(int deviceId)
|
||||
*/
|
||||
hipError_t hipDeviceSynchronize(void)
|
||||
{
|
||||
HIP_INIT_API();
|
||||
|
||||
ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits for all activity to finish.
|
||||
|
||||
return ihipLogStatus(hipSuccess);
|
||||
HIP_INIT_API(1);
|
||||
return ihipSynchronize();
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -33,13 +33,14 @@ hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags)
|
||||
|
||||
// TODO - support hipEventDefault, hipEventBlockingSync, hipEventDisableTiming
|
||||
if (flags == 0) {
|
||||
ihipEvent_t *eh = event->_handle = new ihipEvent_t();
|
||||
ihipEvent_t *eh = new ihipEvent_t();
|
||||
|
||||
eh->_state = hipEventStatusCreated;
|
||||
eh->_stream = NULL;
|
||||
eh->_flags = flags;
|
||||
eh->_timestamp = 0;
|
||||
eh->_copySeqId = 0;
|
||||
*event = eh;
|
||||
} else {
|
||||
e = hipErrorInvalidValue;
|
||||
}
|
||||
@@ -71,7 +72,7 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream)
|
||||
{
|
||||
HIP_INIT_API(event, stream);
|
||||
|
||||
ihipEvent_t *eh = event._handle;
|
||||
ihipEvent_t *eh = event;
|
||||
if (eh && eh->_state != hipEventStatusUnitialized) {
|
||||
eh->_stream = stream;
|
||||
|
||||
@@ -106,10 +107,10 @@ hipError_t hipEventDestroy(hipEvent_t event)
|
||||
{
|
||||
HIP_INIT_API(event);
|
||||
|
||||
event._handle->_state = hipEventStatusUnitialized;
|
||||
event->_state = hipEventStatusUnitialized;
|
||||
|
||||
delete event._handle;
|
||||
event._handle = NULL;
|
||||
delete event;
|
||||
event = NULL;
|
||||
|
||||
// TODO - examine return additional error codes
|
||||
return ihipLogStatus(hipSuccess);
|
||||
@@ -121,7 +122,7 @@ hipError_t hipEventSynchronize(hipEvent_t event)
|
||||
{
|
||||
HIP_INIT_API(event);
|
||||
|
||||
ihipEvent_t *eh = event._handle;
|
||||
ihipEvent_t *eh = event;
|
||||
|
||||
if (eh) {
|
||||
if (eh->_state == hipEventStatusUnitialized) {
|
||||
@@ -150,8 +151,8 @@ hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop)
|
||||
{
|
||||
HIP_INIT_API(ms, start, stop);
|
||||
|
||||
ihipEvent_t *start_eh = start._handle;
|
||||
ihipEvent_t *stop_eh = stop._handle;
|
||||
ihipEvent_t *start_eh = start;
|
||||
ihipEvent_t *stop_eh = stop;
|
||||
|
||||
ihipSetTs(start);
|
||||
ihipSetTs(stop);
|
||||
@@ -195,7 +196,7 @@ hipError_t hipEventQuery(hipEvent_t event)
|
||||
{
|
||||
HIP_INIT_API(event);
|
||||
|
||||
ihipEvent_t *eh = event._handle;
|
||||
ihipEvent_t *eh = event;
|
||||
|
||||
// TODO-stream - need to read state of signal here: The event may have become ready after recording..
|
||||
// TODO-HCC - use get_hsa_signal here.
|
||||
|
||||
+65
-8
@@ -158,7 +158,12 @@ ihipCtx_t *ihipGetTlsDefaultCtx()
|
||||
return tls_defaultCtx;
|
||||
}
|
||||
|
||||
hipError_t ihipSynchronize(void)
|
||||
{
|
||||
ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits for all activity to finish.
|
||||
|
||||
return ihipLogStatus(hipSuccess);
|
||||
}
|
||||
|
||||
//=================================================================================================
|
||||
// ihipSignal_t:
|
||||
@@ -482,6 +487,53 @@ int ihipStream_t::preCopyCommand(LockedAccessor_StreamCrit_t &crit, ihipSignal_t
|
||||
}
|
||||
|
||||
|
||||
void ihipStream_t::launchModuleKernel(hsa_signal_t signal,
|
||||
uint32_t blockDimX,
|
||||
uint32_t blockDimY,
|
||||
uint32_t blockDimZ,
|
||||
uint32_t gridDimX,
|
||||
uint32_t gridDimY,
|
||||
uint32_t gridDimZ,
|
||||
uint32_t sharedMemBytes,
|
||||
void *kernarg,
|
||||
size_t kernSize,
|
||||
uint64_t kernel){
|
||||
hsa_status_t status;
|
||||
void *kern;
|
||||
hsa_amd_memory_pool_t *pool = reinterpret_cast<hsa_amd_memory_pool_t*>(_av.get_hsa_kernarg_region());
|
||||
status = hsa_amd_memory_pool_allocate(*pool, kernSize, 0, &kern);
|
||||
status = hsa_amd_agents_allow_access(1, (hsa_agent_t*)_av.get_hsa_agent(), 0, kern);
|
||||
memcpy(kern, kernarg, kernSize);
|
||||
hsa_queue_t *Queue = (hsa_queue_t*)_av.get_hsa_queue();
|
||||
const uint32_t queue_mask = Queue->size-1;
|
||||
uint32_t packet_index = hsa_queue_load_write_index_relaxed(Queue);
|
||||
hsa_kernel_dispatch_packet_t *dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(Queue->base_address))[packet_index & queue_mask]);
|
||||
|
||||
dispatch_packet->completion_signal = signal;
|
||||
dispatch_packet->workgroup_size_x = blockDimX;
|
||||
dispatch_packet->workgroup_size_y = blockDimY;
|
||||
dispatch_packet->workgroup_size_z = blockDimZ;
|
||||
dispatch_packet->grid_size_x = blockDimX * gridDimX;
|
||||
dispatch_packet->grid_size_y = blockDimY * gridDimY;
|
||||
dispatch_packet->grid_size_z = blockDimZ * gridDimZ;
|
||||
dispatch_packet->group_segment_size = 0;
|
||||
dispatch_packet->private_segment_size = sharedMemBytes;
|
||||
dispatch_packet->kernarg_address = kern;
|
||||
dispatch_packet->kernel_object = kernel;
|
||||
uint16_t header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
|
||||
(1 << HSA_PACKET_HEADER_BARRIER) |
|
||||
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
|
||||
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
|
||||
|
||||
uint16_t setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
|
||||
uint32_t header32 = header | (setup << 16);
|
||||
|
||||
__atomic_store_n((uint32_t*)(dispatch_packet), header32, __ATOMIC_RELEASE);
|
||||
|
||||
hsa_queue_store_write_index_relaxed(Queue, packet_index + 1);
|
||||
hsa_signal_store_relaxed(Queue->doorbell_signal, packet_index);
|
||||
}
|
||||
|
||||
|
||||
//=============================================================================
|
||||
// Recompute the peercnt and the packed _peerAgents whenever a peer is added or deleted.
|
||||
@@ -1260,13 +1312,20 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t stream)
|
||||
}
|
||||
}
|
||||
|
||||
void ihipPrintKernelLaunch(const char *kernelName, const grid_launch_parm *lp, const hipStream_t stream)
|
||||
{
|
||||
std::string streamString = ToString(stream);
|
||||
fprintf(stderr, KGRN "<<hip-api: hipLaunchKernel '%s' gridDim:(%d,%d,%d) groupDim:(%d,%d,%d) groupMem:+%d %s\n" KNRM, \
|
||||
kernelName, lp->grid_dim.x, lp->grid_dim.y, lp->grid_dim.z, lp->group_dim.x, lp->group_dim.y, lp->group_dim.z,
|
||||
lp->dynamic_group_mem_bytes, streamString.c_str());\
|
||||
}
|
||||
|
||||
// TODO - data-up to data-down:
|
||||
// Called just before a kernel is launched from hipLaunchKernel.
|
||||
// Allows runtime to track some information about the stream.
|
||||
hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm *lp)
|
||||
{
|
||||
HIP_INIT_API(stream, grid, block, lp);
|
||||
HIP_INIT();
|
||||
stream = ihipSyncAndResolveStream(stream);
|
||||
#if USE_GRID_LAUNCH_20
|
||||
lp->grid_dim.x = grid.x;
|
||||
@@ -1439,7 +1498,7 @@ const char *ihipErrorString(hipError_t hip_error)
|
||||
|
||||
void ihipSetTs(hipEvent_t e)
|
||||
{
|
||||
ihipEvent_t *eh = e._handle;
|
||||
ihipEvent_t *eh = e;
|
||||
if (eh->_state == hipEventStatusRecorded) {
|
||||
// already recorded, done:
|
||||
return;
|
||||
@@ -1509,7 +1568,7 @@ void ihipStream_t::setAsyncCopyAgents(unsigned kind, ihipCommand_t *commandType,
|
||||
}
|
||||
|
||||
|
||||
void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind)
|
||||
void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn)
|
||||
{
|
||||
ihipCtx_t *ctx = this->getCtx();
|
||||
const ihipDevice_t *device = ctx->getDevice();
|
||||
@@ -1528,7 +1587,7 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const
|
||||
bool dstInDeviceMem = dstPtrInfo._isInDeviceMem;
|
||||
|
||||
// Resolve default to a specific Kind so we know which algorithm to use:
|
||||
if (kind == hipMemcpyDefault) {
|
||||
if (kind == hipMemcpyDefault && resolveOn) {
|
||||
kind = resolveMemcpyDirection(srcTracked, dstTracked, srcInDeviceMem, dstInDeviceMem);
|
||||
};
|
||||
|
||||
@@ -1699,14 +1758,12 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const
|
||||
|
||||
|
||||
// Sync copy that acquires lock:
|
||||
void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind)
|
||||
void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn)
|
||||
{
|
||||
LockedAccessor_StreamCrit_t crit (_criticalData);
|
||||
copySync(crit, dst, src, sizeBytes, kind);
|
||||
copySync(crit, dst, src, sizeBytes, kind, resolveOn);
|
||||
}
|
||||
|
||||
|
||||
|
||||
void ihipStream_t::copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind)
|
||||
{
|
||||
LockedAccessor_StreamCrit_t crit(_criticalData);
|
||||
|
||||
@@ -446,12 +446,96 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind
|
||||
e = ex._code;
|
||||
}
|
||||
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes)
|
||||
{
|
||||
HIP_INIT_API(dst, src, sizeBytes);
|
||||
|
||||
hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
|
||||
|
||||
hc::completion_future marker;
|
||||
|
||||
hipError_t e = hipSuccess;
|
||||
|
||||
try {
|
||||
|
||||
stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyHostToDevice, false);
|
||||
}
|
||||
catch (ihipException ex) {
|
||||
e = ex._code;
|
||||
}
|
||||
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
|
||||
hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes)
|
||||
{
|
||||
HIP_INIT_API(dst, src, sizeBytes);
|
||||
|
||||
hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
|
||||
|
||||
hc::completion_future marker;
|
||||
|
||||
hipError_t e = hipSuccess;
|
||||
|
||||
try {
|
||||
|
||||
stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyDeviceToHost, false);
|
||||
}
|
||||
catch (ihipException ex) {
|
||||
e = ex._code;
|
||||
}
|
||||
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes)
|
||||
{
|
||||
HIP_INIT_API(dst, src, sizeBytes);
|
||||
|
||||
hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
|
||||
|
||||
hc::completion_future marker;
|
||||
|
||||
hipError_t e = hipSuccess;
|
||||
|
||||
try {
|
||||
|
||||
stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyDeviceToDevice, false);
|
||||
}
|
||||
catch (ihipException ex) {
|
||||
e = ex._code;
|
||||
}
|
||||
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes)
|
||||
{
|
||||
HIP_INIT_API(dst, src, sizeBytes);
|
||||
|
||||
hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
|
||||
|
||||
hc::completion_future marker;
|
||||
|
||||
hipError_t e = hipSuccess;
|
||||
|
||||
try {
|
||||
|
||||
stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyHostToHost, false);
|
||||
}
|
||||
catch (ihipException ex) {
|
||||
e = ex._code;
|
||||
}
|
||||
|
||||
return ihipLogStatus(e);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* @result #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidMemcpyDirection,
|
||||
* @result #hipErrorInvalidValue : If dst==NULL or src==NULL, or other bad argument.
|
||||
@@ -786,7 +870,6 @@ hipError_t hipMemGetInfo (size_t *free, size_t *total)
|
||||
// TODO - replace with kernel-level for reporting free memory:
|
||||
size_t deviceMemSize, hostMemSize, userMemSize;
|
||||
hc::am_memtracker_sizeinfo(device->_acc, &deviceMemSize, &hostMemSize, &userMemSize);
|
||||
printf ("deviceMemSize=%zu\n", deviceMemSize);
|
||||
|
||||
*free = device->_props.totalGlobalMem - deviceMemSize;
|
||||
}
|
||||
|
||||
+200
-107
@@ -20,137 +20,188 @@ THE SOFTWARE.
|
||||
#include "hip_runtime.h"
|
||||
#include "hsa/hsa.h"
|
||||
#include "hsa/hsa_ext_amd.h"
|
||||
#include "hsa/amd_hsa_kernel_code.h"
|
||||
#include "hcc_detail/hip_hcc.h"
|
||||
#include "hcc_detail/trace_helper.h"
|
||||
#include <fstream>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <elf.h>
|
||||
|
||||
//TODO Use Pool APIs from HCC to get memory regions.
|
||||
|
||||
namespace hipdrv{
|
||||
hsa_status_t findSystemRegions(hsa_region_t region, void *data){
|
||||
hsa_region_segment_t segment_id;
|
||||
hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id);
|
||||
|
||||
if(segment_id != HSA_REGION_SEGMENT_GLOBAL){
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
hsa_status_t findSystemRegions(hsa_region_t region, void *data){
|
||||
hsa_region_segment_t segment_id;
|
||||
hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id);
|
||||
|
||||
hsa_region_global_flag_t flags;
|
||||
hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
|
||||
if(segment_id != HSA_REGION_SEGMENT_GLOBAL){
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_region_t *reg = (hsa_region_t*)data;
|
||||
hsa_region_global_flag_t flags;
|
||||
hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
|
||||
|
||||
if(flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED){
|
||||
*reg = region;
|
||||
}
|
||||
hsa_region_t *reg = (hsa_region_t*)data;
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
if(flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED){
|
||||
*reg = region;
|
||||
}
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
} // End namespace hipdrv
|
||||
|
||||
uint64_t PrintSymbolSizes(const void *emi, const char *name){
|
||||
const Elf64_Ehdr *ehdr = (const Elf64_Ehdr*)emi;
|
||||
if(NULL == ehdr || EV_CURRENT != ehdr->e_version){}
|
||||
const Elf64_Shdr * shdr = (const Elf64_Shdr*)((char*)emi + ehdr->e_shoff);
|
||||
for(uint16_t i=0;i<ehdr->e_shnum;++i){
|
||||
if(shdr[i].sh_type == SHT_SYMTAB){
|
||||
const Elf64_Sym *syms = (const Elf64_Sym*)((char*)emi + shdr[i].sh_offset);
|
||||
assert(syms);
|
||||
uint64_t numSyms = shdr[i].sh_size/shdr[i].sh_entsize;
|
||||
const char* strtab = (const char*)((char*)emi + shdr[shdr[i].sh_link].sh_offset);
|
||||
assert(strtab);
|
||||
for(uint64_t i=0;i<numSyms;++i){
|
||||
const char *symname = strtab + syms[i].st_name;
|
||||
assert(symname);
|
||||
uint64_t size = syms[i].st_size;
|
||||
if(strcmp(name, symname) == 0){
|
||||
return size;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
hsa_status_t findKernArgRegions(hsa_region_t region, void *data){
|
||||
hsa_region_segment_t segment_id;
|
||||
hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id);
|
||||
uint64_t ElfSize(const void *emi){
|
||||
const Elf64_Ehdr *ehdr = (const Elf64_Ehdr*)emi;
|
||||
const Elf64_Shdr *shdr = (const Elf64_Shdr*)((char*)emi + ehdr->e_shoff);
|
||||
|
||||
if(segment_id != HSA_REGION_SEGMENT_GLOBAL){
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
uint64_t max_offset = ehdr->e_shoff;
|
||||
uint64_t total_size = max_offset + ehdr->e_shentsize * ehdr->e_shnum;
|
||||
|
||||
hsa_region_global_flag_t flags;
|
||||
hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
|
||||
|
||||
hsa_region_t *reg = (hsa_region_t*)data;
|
||||
|
||||
if(flags & HSA_REGION_GLOBAL_FLAG_KERNARG){
|
||||
*reg = region;
|
||||
}
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
for(uint16_t i=0;i < ehdr->e_shnum;++i){
|
||||
uint64_t cur_offset = static_cast<uint64_t>(shdr[i].sh_offset);
|
||||
if(max_offset < cur_offset){
|
||||
max_offset = cur_offset;
|
||||
total_size = max_offset;
|
||||
if(SHT_NOBITS != shdr[i].sh_type){
|
||||
total_size += static_cast<uint64_t>(shdr[i].sh_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
return total_size;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
hipError_t hipModuleLoad(hipModule *module, const char *fname){
|
||||
hipError_t hipModuleLoad(hipModule_t *module, const char *fname){
|
||||
HIP_INIT_API(fname);
|
||||
hipError_t ret = hipSuccess;
|
||||
*module = new ihipModule_t;
|
||||
|
||||
if(module == NULL){
|
||||
ret = hipErrorInvalidValue;
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
if(ctx == nullptr){
|
||||
ret = hipErrorInvalidContext;
|
||||
|
||||
}else{
|
||||
int deviceId = ctx->getDevice()->_deviceId;
|
||||
ihipDevice_t *currentDevice = ihipGetDevice(deviceId);
|
||||
std::ifstream in(fname, std::ios::binary | std::ios::ate);
|
||||
|
||||
if(!in){
|
||||
return hipErrorFileNotFound;
|
||||
|
||||
}else{
|
||||
|
||||
*module = new ihipModule_t;
|
||||
size_t size = std::string::size_type(in.tellg());
|
||||
void *p = NULL;
|
||||
hsa_agent_t agent = currentDevice->_hsaAgent;
|
||||
hsa_region_t sysRegion;
|
||||
hsa_status_t status = hsa_agent_iterate_regions(agent, hipdrv::findSystemRegions, &sysRegion);
|
||||
status = hsa_memory_allocate(sysRegion, size, (void**)&p);
|
||||
|
||||
if(status != HSA_STATUS_SUCCESS){
|
||||
return hipErrorOutOfMemory;
|
||||
}
|
||||
|
||||
char *ptr = (char*)p;
|
||||
if(!ptr){
|
||||
return hipErrorOutOfMemory;
|
||||
std::cout<<"Error: failed to allocate memory for code object"<<std::endl;
|
||||
}
|
||||
hsa_code_object_t obj;
|
||||
(*module)->ptr = p;
|
||||
(*module)->size = size;
|
||||
in.seekg(0, std::ios::beg);
|
||||
std::copy(std::istreambuf_iterator<char>(in),
|
||||
std::istreambuf_iterator<char>(), ptr);
|
||||
status = hsa_code_object_deserialize(ptr, size, NULL, &obj);
|
||||
|
||||
status = hsa_code_object_deserialize(ptr, size, NULL, &(*module)->object);
|
||||
|
||||
if(status != HSA_STATUS_SUCCESS){
|
||||
return hipErrorSharedObjectInitFailed;
|
||||
}
|
||||
*module = obj.handle;
|
||||
|
||||
status = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, NULL, &(*module)->executable);
|
||||
if(status != HSA_STATUS_SUCCESS){
|
||||
return hipErrorNotInitialized;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
hipError_t hipModuleGetFunction(hipFunction *func, hipModule hmod, const char *name){
|
||||
HIP_INIT_API(name);
|
||||
hipError_t hipModuleUnload(hipModule_t hmod){
|
||||
hipError_t ret = hipSuccess;
|
||||
hsa_status_t status = hsa_executable_destroy(hmod->executable);
|
||||
if(status != HSA_STATUS_SUCCESS){ret = hipErrorInvalidValue; }
|
||||
status = hsa_code_object_destroy(hmod->object);
|
||||
if(status != HSA_STATUS_SUCCESS){ret = hipErrorInvalidValue; }
|
||||
delete hmod;
|
||||
return ret;
|
||||
}
|
||||
|
||||
hipError_t ihipModuleGetFunction(hipFunction_t *func, hipModule_t hmod, const char *name){
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
hipError_t ret = hipSuccess;
|
||||
if(name == nullptr || hmod == 0){
|
||||
|
||||
if(name == nullptr){
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
if(ctx == nullptr){
|
||||
ret = hipErrorInvalidContext;
|
||||
|
||||
}else{
|
||||
*func = new ihipFunction_t;
|
||||
int deviceId = ctx->getDevice()->_deviceId;
|
||||
ihipDevice_t *currentDevice = ihipGetDevice(deviceId);
|
||||
hsa_agent_t gpuAgent = (hsa_agent_t)currentDevice->_hsaAgent;
|
||||
|
||||
hsa_status_t status;
|
||||
hsa_executable_symbol_t kernel_symbol;
|
||||
hsa_executable_t executable;
|
||||
status = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, NULL, &executable);
|
||||
status = hsa_executable_load_code_object(hmod->executable, gpuAgent, hmod->object, NULL);
|
||||
if(status != HSA_STATUS_SUCCESS){
|
||||
return hipErrorNotInitialized;
|
||||
}
|
||||
hsa_code_object_t obj;
|
||||
obj.handle = hmod;
|
||||
status = hsa_executable_load_code_object(executable, gpuAgent, obj, NULL);
|
||||
if(status != HSA_STATUS_SUCCESS){
|
||||
return hipErrorNotInitialized;
|
||||
}
|
||||
status = hsa_executable_freeze(executable, NULL);
|
||||
status = hsa_executable_get_symbol(executable, NULL, name, gpuAgent, 0, &kernel_symbol);
|
||||
|
||||
status = hsa_executable_freeze(hmod->executable, NULL);
|
||||
status = hsa_executable_get_symbol(hmod->executable, NULL, name, gpuAgent, 0, &(*func)->kernel_symbol);
|
||||
if(status != HSA_STATUS_SUCCESS){
|
||||
return hipErrorNotFound;
|
||||
}
|
||||
status = hsa_executable_symbol_get_info(kernel_symbol,
|
||||
|
||||
status = hsa_executable_symbol_get_info((*func)->kernel_symbol,
|
||||
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
|
||||
func);
|
||||
&(*func)->kernel);
|
||||
|
||||
if(status != HSA_STATUS_SUCCESS){
|
||||
return hipErrorNotFound;
|
||||
}
|
||||
@@ -158,16 +209,24 @@ hipError_t hipModuleGetFunction(hipFunction *func, hipModule hmod, const char *n
|
||||
return ret;
|
||||
}
|
||||
|
||||
hipError_t hipLaunchModuleKernel(hipFunction f,
|
||||
hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod,
|
||||
const char *name){
|
||||
HIP_INIT_API(name);
|
||||
return ihipModuleGetFunction(hfunc, hmod, name);
|
||||
}
|
||||
|
||||
hipError_t hipModuleLaunchKernel(hipFunction_t f,
|
||||
uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ,
|
||||
uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
|
||||
uint32_t sharedMemBytes, hipStream_t hStream,
|
||||
void **kernelParams, void **extra){
|
||||
HIP_INIT_API(f);
|
||||
HIP_INIT_API(f->kernel);
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
hipError_t ret = hipSuccess;
|
||||
|
||||
if(ctx == nullptr){
|
||||
ret = hipErrorInvalidDevice;
|
||||
|
||||
}else{
|
||||
int deviceId = ctx->getDevice()->_deviceId;
|
||||
ihipDevice_t *currentDevice = ihipGetDevice(deviceId);
|
||||
@@ -189,66 +248,100 @@ hipError_t hipLaunchModuleKernel(hipFunction f,
|
||||
/*
|
||||
Kernel argument preparation.
|
||||
*/
|
||||
|
||||
hsa_region_t kernArg;
|
||||
hsa_status_t status = hsa_agent_iterate_regions(gpuAgent, hipdrv::findKernArgRegions, &kernArg);
|
||||
void *kern;
|
||||
status = hsa_memory_allocate(kernArg, kernSize, &kern);
|
||||
if(status != HSA_STATUS_SUCCESS){
|
||||
return hipErrorLaunchOutOfResources;
|
||||
}
|
||||
memcpy(kern, config[1], kernSize);
|
||||
|
||||
hsa_status_t status;
|
||||
grid_launch_parm lp;
|
||||
hStream = ihipPreLaunchKernel(hStream, 0, 0, &lp);
|
||||
|
||||
/*
|
||||
Pre kernel launch
|
||||
|
||||
stream = ihipSyncAndResolveStream(stream);
|
||||
stream->lockopen_preKernelCommand();
|
||||
hc::accelerator_view av = &stream->_av;
|
||||
hc::completion_future cf = new hc::completion_future;
|
||||
Create signal
|
||||
*/
|
||||
|
||||
hStream = ihipSyncAndResolveStream(hStream);
|
||||
hc::accelerator_view *av = &hStream->_av;
|
||||
hsa_queue_t *Queue = (hsa_queue_t*)av->get_hsa_queue();
|
||||
hsa_signal_t signal;
|
||||
status = hsa_signal_create(1, 0, NULL, &signal);
|
||||
|
||||
/*
|
||||
Creating the packets
|
||||
Launch AQL packet
|
||||
*/
|
||||
hStream->launchModuleKernel(signal, blockDimX, blockDimY, blockDimZ,
|
||||
gridDimX, gridDimY, gridDimZ, sharedMemBytes, config[1], kernSize, f->kernel);
|
||||
|
||||
/*
|
||||
Wait for signal
|
||||
*/
|
||||
|
||||
const uint32_t queue_mask = Queue->size-1;
|
||||
uint32_t packet_index = hsa_queue_load_write_index_relaxed(Queue);
|
||||
hsa_kernel_dispatch_packet_t *dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(Queue->base_address))[packet_index & queue_mask]);
|
||||
|
||||
dispatch_packet->completion_signal = signal;
|
||||
dispatch_packet->workgroup_size_x = blockDimX;
|
||||
dispatch_packet->workgroup_size_y = blockDimY;
|
||||
dispatch_packet->workgroup_size_z = blockDimZ;
|
||||
dispatch_packet->grid_size_x = blockDimX * gridDimX;
|
||||
dispatch_packet->grid_size_y = blockDimY * gridDimY;
|
||||
dispatch_packet->grid_size_z = blockDimZ * gridDimZ;
|
||||
|
||||
dispatch_packet->group_segment_size = 0;
|
||||
dispatch_packet->private_segment_size = sharedMemBytes;
|
||||
dispatch_packet->kernarg_address = kern;
|
||||
dispatch_packet->kernel_object = f;
|
||||
uint16_t header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
|
||||
(1 << HSA_PACKET_HEADER_BARRIER) |
|
||||
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
|
||||
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
|
||||
|
||||
|
||||
uint16_t setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
|
||||
uint32_t header32 = header | (setup << 16);
|
||||
|
||||
__atomic_store_n((uint32_t*)(dispatch_packet), header32, __ATOMIC_RELEASE);
|
||||
|
||||
hsa_queue_store_write_index_relaxed(Queue, packet_index+1);
|
||||
hsa_signal_store_relaxed(Queue->doorbell_signal, packet_index);
|
||||
hsa_signal_value_t value = hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
|
||||
|
||||
|
||||
ihipPostLaunchKernel(hStream, lp);
|
||||
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
|
||||
hipModule_t hmod, const char* name){
|
||||
HIP_INIT_API(name);
|
||||
hipError_t ret = hipSuccess;
|
||||
if(dptr == NULL || bytes == NULL){
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
if(name == NULL || hmod == NULL){
|
||||
return hipErrorNotInitialized;
|
||||
}
|
||||
else{
|
||||
hipFunction_t func;
|
||||
ihipModuleGetFunction(&func, hmod, name);
|
||||
*bytes = PrintSymbolSizes(hmod->ptr, name) + sizeof(amd_kernel_code_t);
|
||||
*dptr = reinterpret_cast<void*>(func->kernel);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
hipError_t hipModuleLoadData(hipModule_t *module, const void *image){
|
||||
HIP_INIT_API(image);
|
||||
hipError_t ret = hipSuccess;
|
||||
if(image == NULL || module == NULL){
|
||||
return hipErrorNotInitialized;
|
||||
}else{
|
||||
auto ctx = ihipGetTlsDefaultCtx();
|
||||
*module = new ihipModule_t;
|
||||
int deviceId = ctx->getDevice()->_deviceId;
|
||||
ihipDevice_t *currentDevice = ihipGetDevice(deviceId);
|
||||
|
||||
void *p;
|
||||
uint64_t size = ElfSize(image);
|
||||
hsa_agent_t agent = currentDevice->_hsaAgent;
|
||||
hsa_region_t sysRegion;
|
||||
hsa_status_t status = hsa_agent_iterate_regions(agent, hipdrv::findSystemRegions, &sysRegion);
|
||||
status = hsa_memory_allocate(sysRegion, size, (void**)&p);
|
||||
|
||||
if(status != HSA_STATUS_SUCCESS){
|
||||
return hipErrorOutOfMemory;
|
||||
}
|
||||
|
||||
char *ptr = (char*)p;
|
||||
if(!ptr){
|
||||
return hipErrorOutOfMemory;
|
||||
}
|
||||
(*module)->ptr = p;
|
||||
(*module)->size = size;
|
||||
|
||||
memcpy(ptr, image, size);
|
||||
|
||||
status = hsa_code_object_deserialize(ptr, size, NULL, &(*module)->object);
|
||||
|
||||
if(status != HSA_STATUS_SUCCESS){
|
||||
return hipErrorSharedObjectInitFailed;
|
||||
}
|
||||
|
||||
status = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, NULL, &(*module)->executable);
|
||||
if(status != HSA_STATUS_SUCCESS){
|
||||
return hipErrorNotInitialized;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -67,7 +67,7 @@ hipError_t hipDeviceCanAccessPeer (int* canAccessPeer, hipCtx_t thisCtx, hipCtx_
|
||||
//---
|
||||
// Disable visibility of this device into memory allocated on peer device.
|
||||
// Remove this device from peer device peerlist.
|
||||
hipError_t hipDeviceDisablePeerAccess (hipCtx_t peerCtx)
|
||||
hipError_t ihipDisablePeerAccess (hipCtx_t peerCtx)
|
||||
{
|
||||
HIP_INIT_API(peerCtx);
|
||||
|
||||
@@ -109,7 +109,7 @@ hipError_t hipDeviceDisablePeerAccess (hipCtx_t peerCtx)
|
||||
//---
|
||||
// Allow the current device to see all memory allocated on peerDevice.
|
||||
// This should add this device to the peer-device peer list.
|
||||
hipError_t hipDeviceEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags)
|
||||
hipError_t ihipEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags)
|
||||
{
|
||||
HIP_INIT_API(peerCtx, flags);
|
||||
|
||||
@@ -175,7 +175,7 @@ hipError_t hipDeviceDisablePeerAccess (int peerDeviceId)
|
||||
{
|
||||
HIP_INIT_API(peerDeviceId);
|
||||
|
||||
return hipDeviceDisablePeerAccess(ihipGetPrimaryCtx(peerDeviceId));
|
||||
return ihipDisablePeerAccess(ihipGetPrimaryCtx(peerDeviceId));
|
||||
}
|
||||
|
||||
|
||||
@@ -183,7 +183,7 @@ hipError_t hipDeviceEnablePeerAccess (int peerDeviceId, unsigned int flags)
|
||||
{
|
||||
HIP_INIT_API(peerDeviceId, flags);
|
||||
|
||||
return hipDeviceEnablePeerAccess(ihipGetPrimaryCtx(peerDeviceId), flags);
|
||||
return ihipEnablePeerAccess(ihipGetPrimaryCtx(peerDeviceId), flags);
|
||||
}
|
||||
|
||||
|
||||
@@ -200,6 +200,16 @@ hipError_t hipMemcpyPeerAsync (void* dst, int dstDevice, const void* src, int
|
||||
return hipMemcpyPeerAsync(dst, ihipGetPrimaryCtx(dstDevice), src, ihipGetPrimaryCtx(srcDevice), sizeBytes, stream);
|
||||
}
|
||||
|
||||
hipError_t hipCtxEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags)
|
||||
{
|
||||
HIP_INIT_API(peerCtx, flags);
|
||||
|
||||
return ihipEnablePeerAccess(peerCtx, flags);
|
||||
}
|
||||
|
||||
hipError_t hipCtxDisablePeerAccess (hipCtx_t peerCtx)
|
||||
{
|
||||
HIP_INIT_API(peerCtx);
|
||||
|
||||
return ihipDisablePeerAccess(peerCtx);
|
||||
}
|
||||
|
||||
@@ -0,0 +1,53 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
#include<iostream>
|
||||
#include<hip/hip_runtime.h>
|
||||
#include<hip/hip_runtime_api.h>
|
||||
#include<hip/hcc_detail/hipComplex.h>
|
||||
|
||||
#define LEN 64
|
||||
#define SIZE 64<<2
|
||||
|
||||
__global__ void getSqAbs(hipLaunchParm lp, float *A, float *B, float *C){
|
||||
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
C[tx] = hipCsqabsf(make_hipFloatComplex(A[tx], B[tx]));
|
||||
}
|
||||
|
||||
int main(){
|
||||
float *A, *Ad, *B, *Bd, *C, *Cd;
|
||||
A = new float[LEN];
|
||||
B = new float[LEN];
|
||||
C = new float[LEN];
|
||||
for(uint32_t i=0;i<LEN;i++){
|
||||
A[i] = i*1.0f;
|
||||
B[i] = i*1.0f;
|
||||
C[i] = i*1.0f;
|
||||
}
|
||||
|
||||
hipMalloc((void**)&Ad, SIZE);
|
||||
hipMalloc((void**)&Bd, SIZE);
|
||||
hipMalloc((void**)&Cd, SIZE);
|
||||
hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
|
||||
hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
|
||||
hipLaunchKernel(getSqAbs, dim3(1), dim3(LEN), 0, 0, Ad, Bd, Cd);
|
||||
hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost);
|
||||
std::cout<<A[11]<<" "<<B[11]<<" "<<C[11]<<std::endl;
|
||||
}
|
||||
@@ -0,0 +1,42 @@
|
||||
#include<iostream>
|
||||
#include<hip/hip_runtime.h>
|
||||
#include<hip/hip_runtime_api.h>
|
||||
|
||||
#define LEN 1030
|
||||
#define SIZE LEN << 2
|
||||
|
||||
__global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In, uint32_t *Vald)
|
||||
{
|
||||
memcpy(Out, In, SIZE, Vald);
|
||||
}
|
||||
|
||||
__global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size)
|
||||
{
|
||||
memset(ptr, val, size);
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
uint32_t *A, *Ad, *B, *Bd;
|
||||
uint32_t *Val, *Vald;
|
||||
A = new uint32_t[LEN];
|
||||
B = new uint32_t[LEN];
|
||||
Val = new uint32_t;
|
||||
*Val = 0;
|
||||
for(int i=0;i<LEN;i++){
|
||||
A[i] = i *1.0f;
|
||||
B[i] = 0.0f;
|
||||
}
|
||||
hipMalloc((void**)&Ad, SIZE);
|
||||
hipMalloc((void**)&Bd, SIZE);
|
||||
hipMalloc((void**)&Vald, sizeof(uint32_t));
|
||||
hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
|
||||
hipLaunchKernel(cpy, dim3(1), dim3(LEN/4), 0, 0, Bd, Ad, Vald);
|
||||
hipLaunchKernel(set, dim3(1), dim3(LEN/4), 0, 0, Bd, 0x1, SIZE);
|
||||
hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
|
||||
hipMemcpy(Val, Vald, sizeof(uint32_t), hipMemcpyDeviceToHost);
|
||||
for(int i=LEN-16;i<LEN;i++){
|
||||
std::cout<<A[i]<<" "<<B[i]<<std::endl;
|
||||
}
|
||||
std::cout<<*Val<<std::endl;
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
#include<iostream>
|
||||
#include<hip/hip_runtime.h>
|
||||
#include<hip/hip_runtime_api.h>
|
||||
|
||||
#define LEN 1024
|
||||
#define SIZE LEN<<2
|
||||
|
||||
int main(){
|
||||
int *A, *B, *C;
|
||||
hipDeviceptr Ad, Bd;
|
||||
A = new int[LEN];
|
||||
B = new int[LEN];
|
||||
C = new int[LEN];
|
||||
for(int i=0;i<LEN;i++){
|
||||
A[i] = i;
|
||||
}
|
||||
hipMalloc((void**)&Ad, SIZE);
|
||||
hipMalloc((void**)&Bd, SIZE);
|
||||
hipMemcpyHtoD(Ad, A, SIZE);
|
||||
hipMemcpyDtoD(Bd, Ad, SIZE);
|
||||
hipMemcpyDtoH(B, Bd, SIZE);
|
||||
hipMemcpyHtoH(C, B,SIZE);
|
||||
for(int i=0;i<16;i++){
|
||||
std::cout<<A[i]<<" "<<C[i]<<std::endl;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,85 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include<hip_runtime.h>
|
||||
#include<hip_runtime_api.h>
|
||||
#include<iostream>
|
||||
#include<fstream>
|
||||
#include<vector>
|
||||
|
||||
#define LEN 64
|
||||
#define SIZE LEN<<2
|
||||
|
||||
#define fileName "vcpy_isa.co"
|
||||
#define kernel_name "hello_world"
|
||||
|
||||
__global__ void Cpy(hipLaunchParm lp, float *Ad, float* Bd){
|
||||
int tx = hipThreadIdx_x;
|
||||
Bd[tx] = Ad[tx];
|
||||
}
|
||||
|
||||
int main(){
|
||||
float *A, *B, *Ad, *Bd;
|
||||
A = new float[LEN];
|
||||
B = new float[LEN];
|
||||
|
||||
for(uint32_t i=0;i<LEN;i++){
|
||||
A[i] = i*1.0f;
|
||||
B[i] = 0.0f;
|
||||
std::cout<<A[i] << " "<<B[i]<<std::endl;
|
||||
}
|
||||
|
||||
hipMalloc((void**)&Ad, SIZE);
|
||||
hipMalloc((void**)&Bd, SIZE);
|
||||
|
||||
hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
|
||||
hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
|
||||
hipModule_t Module;
|
||||
hipFunction_t Function;
|
||||
hipModuleLoad(&Module, fileName);
|
||||
hipModuleGetFunction(&Function, Module, kernel_name);
|
||||
hipStream_t stream;
|
||||
hipStreamCreate(&stream);
|
||||
void *args[2] = {&Ad, &Bd};
|
||||
|
||||
|
||||
std::vector<void*>argBuffer(2);
|
||||
memcpy(&argBuffer[0], &Ad, sizeof(void*));
|
||||
memcpy(&argBuffer[1], &Bd, sizeof(void*));
|
||||
|
||||
size_t size = argBuffer.size()*sizeof(void*);
|
||||
|
||||
void *config[] = {
|
||||
HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0],
|
||||
HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
|
||||
HIP_LAUNCH_PARAM_END
|
||||
};
|
||||
|
||||
hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, stream, NULL, (void**)&config);
|
||||
|
||||
hipStreamDestroy(stream);
|
||||
|
||||
hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
|
||||
|
||||
for(uint32_t i=0;i<LEN;i++){
|
||||
std::cout<<A[i]<<" - "<<B[i]<<std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,31 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include<hip/hip_runtime.h>
|
||||
#include<hip/hip_runtime_api.h>
|
||||
#include<iostream>
|
||||
|
||||
#define fileName "vcpy_isa.co"
|
||||
|
||||
int main(){
|
||||
hipModule_t module;
|
||||
hipModuleLoad(&module, fileName);
|
||||
hipModuleUnload(module);
|
||||
}
|
||||
|
||||
@@ -0,0 +1,94 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include<hip_runtime.h>
|
||||
#include<hip_runtime_api.h>
|
||||
#include<iostream>
|
||||
#include<fstream>
|
||||
#include<vector>
|
||||
|
||||
#define LEN 64
|
||||
#define SIZE LEN<<2
|
||||
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
#define fileName "vcpy_isa.co"
|
||||
#endif
|
||||
|
||||
#ifdef __HIP_PLATFORM_NVCC__
|
||||
#define fileName "vcpy_isa.ptx"
|
||||
#endif
|
||||
|
||||
#define kernel_name "hello_world"
|
||||
|
||||
int main(){
|
||||
float *A, *B;
|
||||
hipDeviceptr_t Ad, Bd;
|
||||
A = new float[LEN];
|
||||
B = new float[LEN];
|
||||
|
||||
for(uint32_t i=0;i<LEN;i++){
|
||||
A[i] = i*1.0f;
|
||||
B[i] = 0.0f;
|
||||
std::cout<<A[i] << " "<<B[i]<<std::endl;
|
||||
}
|
||||
|
||||
|
||||
#ifdef __HIP_PLATFORM_NVCC__
|
||||
hipInit(0);
|
||||
hipDevice_t device;
|
||||
hipCtx_t context;
|
||||
hipDeviceGet(&device, 0);
|
||||
hipCtxCreate(&context, 0, device);
|
||||
#endif
|
||||
|
||||
hipMalloc((void**)&Ad, SIZE);
|
||||
hipMalloc((void**)&Bd, SIZE);
|
||||
|
||||
hipMemcpyHtoD(Ad, A, SIZE);
|
||||
hipMemcpyHtoD(Bd, B, SIZE);
|
||||
hipModule_t Module;
|
||||
hipFunction_t Function;
|
||||
hipModuleLoad(&Module, fileName);
|
||||
hipModuleGetFunction(&Function, Module, kernel_name);
|
||||
|
||||
std::vector<void*>argBuffer(2);
|
||||
memcpy(&argBuffer[0], &Ad, sizeof(void*));
|
||||
memcpy(&argBuffer[1], &Bd, sizeof(void*));
|
||||
|
||||
size_t size = argBuffer.size()*sizeof(void*);
|
||||
|
||||
void *config[] = {
|
||||
HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0],
|
||||
HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
|
||||
HIP_LAUNCH_PARAM_END
|
||||
};
|
||||
|
||||
hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config);
|
||||
|
||||
hipMemcpyDtoH(B, Bd, SIZE);
|
||||
for(uint32_t i=0;i<LEN;i++){
|
||||
std::cout<<A[i]<<" - "<<B[i]<<std::endl;
|
||||
}
|
||||
|
||||
#ifdef __HIP_PLATFORM_NVCC__
|
||||
hipCtxDetach(context);
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
Executable
BINáris
Binary file not shown.
@@ -0,0 +1,6 @@
|
||||
|
||||
extern "C" __global__ void hello_world(float *a, float *b)
|
||||
{
|
||||
int tx = threadIdx.x;
|
||||
b[tx] = a[tx];
|
||||
}
|
||||
@@ -0,0 +1,38 @@
|
||||
//
|
||||
// Generated by NVIDIA NVVM Compiler
|
||||
//
|
||||
// Compiler Build ID: CL-19856038
|
||||
// Cuda compilation tools, release 7.5, V7.5.17
|
||||
// Based on LLVM 3.4svn
|
||||
//
|
||||
|
||||
.version 4.3
|
||||
.target sm_20
|
||||
.address_size 64
|
||||
|
||||
// .globl hello_world
|
||||
|
||||
.visible .entry hello_world(
|
||||
.param .u64 hello_world_param_0,
|
||||
.param .u64 hello_world_param_1
|
||||
)
|
||||
{
|
||||
.reg .f32 %f<2>;
|
||||
.reg .b32 %r<2>;
|
||||
.reg .b64 %rd<8>;
|
||||
|
||||
|
||||
ld.param.u64 %rd1, [hello_world_param_0];
|
||||
ld.param.u64 %rd2, [hello_world_param_1];
|
||||
cvta.to.global.u64 %rd3, %rd2;
|
||||
cvta.to.global.u64 %rd4, %rd1;
|
||||
mov.u32 %r1, %tid.x;
|
||||
mul.wide.s32 %rd5, %r1, 4;
|
||||
add.s64 %rd6, %rd4, %rd5;
|
||||
ld.global.f32 %f1, [%rd6];
|
||||
add.s64 %rd7, %rd3, %rd5;
|
||||
st.global.f32 [%rd7], %f1;
|
||||
ret;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user