SWDEV-470698 - fix formatting, add format check workflow (#657)
Αυτή η υποβολή περιλαμβάνεται σε:
υποβλήθηκε από
GitHub
γονέας
5840940caa
υποβολή
f7338717ae
@@ -0,0 +1,62 @@
|
||||
#!/usr/bin/env bash
#
# clang-format check for the HIP-related projects of this repository.
#
# Usage: clang-format-check.sh [--range <git-revision-range>]
#
# With --range, the C/C++ lines changed in that revision range are checked.
# Without it, the staged (index) changes are checked. Only files below the
# entries of `directories` are considered.
#
# Environment overrides:
#   CLANG_FORMAT       clang-format binary      (default: clang-format)
#   CLANG_FORMAT_DIFF  clang-format-diff script (default: clang-format-diff)
#
# Exit status:
#   0   nothing needs reformatting
#   1   clang-format-diff failed or proposed changes for at least one file
#   64  usage error

set -euo pipefail

RANGE=""

while [[ $# -gt 0 ]]; do
  case "$1" in
    --range)
      # Under `set -u` a missing value would otherwise abort with an
      # unhelpful "unbound variable" error.
      if [[ $# -lt 2 ]]; then
        echo "--range requires a value" >&2
        exit 64
      fi
      RANGE="$2"
      shift 2
      ;;
    *)
      echo "Unknown arg $1" >&2
      exit 64
      ;;
  esac
done

# Extensions treated as C/C++ sources.
regex='\.(c|cc|cpp|cxx|h|hh|hpp|hxx)$'

# Fall back to the default LLVM install location on Windows (Git Bash paths)
# when the tools are not on PATH.
clang_bin="${CLANG_FORMAT:-clang-format}"
if ! command -v "$clang_bin" >/dev/null 2>&1; then
  if [[ -x "/c/Program Files/LLVM/bin/clang-format.exe" ]]; then
    clang_bin="/c/Program Files/LLVM/bin/clang-format.exe"
  fi
fi

clang_format_diff="${CLANG_FORMAT_DIFF:-clang-format-diff}"
if ! command -v "$clang_format_diff" >/dev/null 2>&1; then
  if [[ -x "/c/Program Files/LLVM/share/clang/clang-format-diff.py" ]]; then
    clang_format_diff="/c/Program Files/LLVM/share/clang/clang-format-diff.py"
  fi
fi

directories=(projects/hip projects/clr projects/hipother projects/hip-tests)

# git prints changed paths relative to the repository root, and
# clang-format-diff (-p1) opens them relative to the current directory, so
# everything below runs from the root instead of cd-ing into each project.
cd "$(git rev-parse --show-toplevel)"

# Arguments selecting what is diffed: the given range, or the index.
if [[ -n $RANGE ]]; then
  diff_args=("$RANGE")
else
  diff_args=(--cached)
fi

status=0

for dir in "${directories[@]}"; do
  # Run git on its own so that a git failure aborts the script (set -e);
  # only grep's "no match" exit status is tolerated.
  # ACMR: deleted files have no lines left to format.
  changed=$(git diff --name-only --diff-filter=ACMR "${diff_args[@]}" -- "$dir")
  files=$(grep -E "$regex" <<<"$changed" || true)

  if [[ -z $files ]]; then
    echo "No changed C/C++ files in $dir"
    # Keep going: later directories may still contain changes.
    continue
  fi
  echo "Checking $files"

  # Read line by line so that paths containing spaces stay intact.
  while IFS= read -r file; do
    echo "Checking lines of $file"

    diff_output=$(git diff -U0 "${diff_args[@]}" -- "$file")

    # Depending on its version, clang-format-diff may or may not return a
    # non-zero status when it prints a diff, so treat both a failure and any
    # output as "needs formatting". All files are reported before exiting.
    if ! report=$(printf '%s\n' "$diff_output" |
      "$clang_format_diff" -binary "$clang_bin" -style=file -fallback-style=none -p1); then
      status=1
    fi
    if [[ -n $report ]]; then
      printf '%s\n' "$report"
      status=1
    fi
  done <<<"$files"
done

exit "$status"
|
||||
@@ -0,0 +1,2 @@
|
||||
#!/usr/bin/env bash
|
||||
# Replaces this process with the repository's clang-format check script,
# located via the git top-level directory. No arguments are forwarded.
# NOTE(review): presumably installed as a git hook (e.g. pre-commit) - confirm.
exec "$(git rev-parse --show-toplevel)/.github/hooks/clang-format-check.sh"
|
||||
@@ -0,0 +1,27 @@
|
||||
# Runs the repository's clang-format check on pull requests that touch the
# listed project directories.
name: Clang format check
on:
  pull_request:
    types: [synchronize, opened]
    paths:
      - 'projects/hip/**'
      - 'projects/clr/**'
      - 'projects/hipother/**'
      - 'projects/hip-tests/**'

jobs:
  format:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          # Full history, so both ends of the range and their merge base exist.
          fetch-depth: 0

      - name: Install clang-format
        run: |
          sudo apt update && sudo apt install -y clang-format

      - name: Run clang-format-check
        id: clang-format
        run: |
          chmod +x .github/hooks/clang-format-check.sh
          # Three-dot range: diff from the merge base to the PR head, so that
          # commits landed on the base branch after the PR branched off are
          # not attributed to the PR.
          ./.github/hooks/clang-format-check.sh --range "${{ github.event.pull_request.base.sha }}...${{ github.event.pull_request.head.sha }}"
|
||||
@@ -31,320 +31,273 @@ THE SOFTWARE.
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
extern "C" HIP_PUBLIC_API
|
||||
hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w, hipChannelFormatKind f);
|
||||
extern "C" HIP_PUBLIC_API hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w,
|
||||
hipChannelFormatKind f);
|
||||
|
||||
static inline hipChannelFormatDesc hipCreateChannelDescHalf() {
|
||||
int e = (int)sizeof(unsigned short) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
|
||||
int e = (int)sizeof(unsigned short) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
|
||||
}
|
||||
|
||||
static inline hipChannelFormatDesc hipCreateChannelDescHalf1() {
|
||||
int e = (int)sizeof(unsigned short) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
|
||||
int e = (int)sizeof(unsigned short) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
|
||||
}
|
||||
|
||||
static inline hipChannelFormatDesc hipCreateChannelDescHalf2() {
|
||||
int e = (int)sizeof(unsigned short) * 8;
|
||||
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindFloat);
|
||||
int e = (int)sizeof(unsigned short) * 8;
|
||||
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindFloat);
|
||||
}
|
||||
|
||||
static inline hipChannelFormatDesc hipCreateChannelDescHalf4() {
|
||||
int e = (int)sizeof(unsigned short) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindFloat);
|
||||
int e = (int)sizeof(unsigned short) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindFloat);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static inline hipChannelFormatDesc hipCreateChannelDesc() {
|
||||
return hipCreateChannelDesc(0, 0, 0, 0, hipChannelFormatKindNone);
|
||||
template <typename T> static inline hipChannelFormatDesc hipCreateChannelDesc() {
|
||||
return hipCreateChannelDesc(0, 0, 0, 0, hipChannelFormatKindNone);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<char>() {
|
||||
int e = (int)sizeof(char) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<char>() {
|
||||
int e = (int)sizeof(char) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<signed char>() {
|
||||
int e = (int)sizeof(signed char) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<signed char>() {
|
||||
int e = (int)sizeof(signed char) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<unsigned char>() {
|
||||
int e = (int)sizeof(unsigned char) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<unsigned char>() {
|
||||
int e = (int)sizeof(unsigned char) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<uchar1>() {
|
||||
int e = (int)sizeof(unsigned char) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<uchar1>() {
|
||||
int e = (int)sizeof(unsigned char) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<char1>() {
|
||||
int e = (int)sizeof(signed char) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<char1>() {
|
||||
int e = (int)sizeof(signed char) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<uchar2>() {
|
||||
int e = (int)sizeof(unsigned char) * 8;
|
||||
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<uchar2>() {
|
||||
int e = (int)sizeof(unsigned char) * 8;
|
||||
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<char2>() {
|
||||
int e = (int)sizeof(signed char) * 8;
|
||||
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<char2>() {
|
||||
int e = (int)sizeof(signed char) * 8;
|
||||
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
#ifndef __GNUC__ // vector3 is the same as vector4
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<uchar3>() {
|
||||
int e = (int)sizeof(unsigned char) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<uchar3>() {
|
||||
int e = (int)sizeof(unsigned char) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<char3>() {
|
||||
int e = (int)sizeof(signed char) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<char3>() {
|
||||
int e = (int)sizeof(signed char) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
|
||||
}
|
||||
#endif
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<uchar4>() {
|
||||
int e = (int)sizeof(unsigned char) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<uchar4>() {
|
||||
int e = (int)sizeof(unsigned char) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<char4>() {
|
||||
int e = (int)sizeof(signed char) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<char4>() {
|
||||
int e = (int)sizeof(signed char) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<unsigned short>() {
|
||||
int e = (int)sizeof(unsigned short) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<unsigned short>() {
|
||||
int e = (int)sizeof(unsigned short) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<signed short>() {
|
||||
int e = (int)sizeof(signed short) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<signed short>() {
|
||||
int e = (int)sizeof(signed short) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<ushort1>() {
|
||||
int e = (int)sizeof(unsigned short) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<ushort1>() {
|
||||
int e = (int)sizeof(unsigned short) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<short1>() {
|
||||
int e = (int)sizeof(signed short) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<short1>() {
|
||||
int e = (int)sizeof(signed short) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<ushort2>() {
|
||||
int e = (int)sizeof(unsigned short) * 8;
|
||||
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<ushort2>() {
|
||||
int e = (int)sizeof(unsigned short) * 8;
|
||||
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<short2>() {
|
||||
int e = (int)sizeof(signed short) * 8;
|
||||
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<short2>() {
|
||||
int e = (int)sizeof(signed short) * 8;
|
||||
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
#ifndef __GNUC__
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<ushort3>() {
|
||||
int e = (int)sizeof(unsigned short) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<ushort3>() {
|
||||
int e = (int)sizeof(unsigned short) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<short3>() {
|
||||
int e = (int)sizeof(signed short) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<short3>() {
|
||||
int e = (int)sizeof(signed short) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
|
||||
}
|
||||
#endif
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<ushort4>() {
|
||||
int e = (int)sizeof(unsigned short) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<ushort4>() {
|
||||
int e = (int)sizeof(unsigned short) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<short4>() {
|
||||
int e = (int)sizeof(signed short) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<short4>() {
|
||||
int e = (int)sizeof(signed short) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<unsigned int>() {
|
||||
int e = (int)sizeof(unsigned int) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<unsigned int>() {
|
||||
int e = (int)sizeof(unsigned int) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<signed int>() {
|
||||
int e = (int)sizeof(signed int) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<signed int>() {
|
||||
int e = (int)sizeof(signed int) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<uint1>() {
|
||||
int e = (int)sizeof(unsigned int) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<uint1>() {
|
||||
int e = (int)sizeof(unsigned int) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<int1>() {
|
||||
int e = (int)sizeof(signed int) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<int1>() {
|
||||
int e = (int)sizeof(signed int) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<uint2>() {
|
||||
int e = (int)sizeof(unsigned int) * 8;
|
||||
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<uint2>() {
|
||||
int e = (int)sizeof(unsigned int) * 8;
|
||||
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<int2>() {
|
||||
int e = (int)sizeof(signed int) * 8;
|
||||
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<int2>() {
|
||||
int e = (int)sizeof(signed int) * 8;
|
||||
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
#ifndef __GNUC__
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<uint3>() {
|
||||
int e = (int)sizeof(unsigned int) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<uint3>() {
|
||||
int e = (int)sizeof(unsigned int) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<int3>() {
|
||||
int e = (int)sizeof(signed int) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<int3>() {
|
||||
int e = (int)sizeof(signed int) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
|
||||
}
|
||||
#endif
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<uint4>() {
|
||||
int e = (int)sizeof(unsigned int) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<uint4>() {
|
||||
int e = (int)sizeof(unsigned int) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<int4>() {
|
||||
int e = (int)sizeof(signed int) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<int4>() {
|
||||
int e = (int)sizeof(signed int) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<float>() {
|
||||
int e = (int)sizeof(float) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<float>() {
|
||||
int e = (int)sizeof(float) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<float1>() {
|
||||
int e = (int)sizeof(float) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<float1>() {
|
||||
int e = (int)sizeof(float) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<float2>() {
|
||||
int e = (int)sizeof(float) * 8;
|
||||
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindFloat);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<float2>() {
|
||||
int e = (int)sizeof(float) * 8;
|
||||
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindFloat);
|
||||
}
|
||||
|
||||
#ifndef __GNUC__
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<float3>() {
|
||||
int e = (int)sizeof(float) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindFloat);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<float3>() {
|
||||
int e = (int)sizeof(float) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindFloat);
|
||||
}
|
||||
#endif
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<float4>() {
|
||||
int e = (int)sizeof(float) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindFloat);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<float4>() {
|
||||
int e = (int)sizeof(float) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindFloat);
|
||||
}
|
||||
|
||||
#if !defined(__LP64__)
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<unsigned long>() {
|
||||
int e = (int)sizeof(unsigned long) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<unsigned long>() {
|
||||
int e = (int)sizeof(unsigned long) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<signed long>() {
|
||||
int e = (int)sizeof(signed long) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<signed long>() {
|
||||
int e = (int)sizeof(signed long) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<ulong1>() {
|
||||
int e = (int)sizeof(unsigned long) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<ulong1>() {
|
||||
int e = (int)sizeof(unsigned long) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<long1>() {
|
||||
int e = (int)sizeof(signed long) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<long1>() {
|
||||
int e = (int)sizeof(signed long) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<ulong2>() {
|
||||
int e = (int)sizeof(unsigned long) * 8;
|
||||
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<ulong2>() {
|
||||
int e = (int)sizeof(unsigned long) * 8;
|
||||
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<long2>() {
|
||||
int e = (int)sizeof(signed long) * 8;
|
||||
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<long2>() {
|
||||
int e = (int)sizeof(signed long) * 8;
|
||||
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
|
||||
}
|
||||
|
||||
#ifndef __GNUC__
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<ulong3>() {
|
||||
int e = (int)sizeof(unsigned long) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<ulong3>() {
|
||||
int e = (int)sizeof(unsigned long) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<long3>() {
|
||||
int e = (int)sizeof(signed long) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<long3>() {
|
||||
int e = (int)sizeof(signed long) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
|
||||
}
|
||||
#endif
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<ulong4>() {
|
||||
int e = (int)sizeof(unsigned long) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<ulong4>() {
|
||||
int e = (int)sizeof(unsigned long) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc<long4>() {
|
||||
int e = (int)sizeof(signed long) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
|
||||
template <> inline hipChannelFormatDesc hipCreateChannelDesc<long4>() {
|
||||
int e = (int)sizeof(signed long) * 8;
|
||||
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
|
||||
}
|
||||
#endif /* !__LP64__ */
|
||||
|
||||
|
||||
Το diff αρχείου καταστέλλεται επειδή είναι πολύ μεγάλο
Φόρτωση Διαφορών
@@ -26,13 +26,17 @@ THE SOFTWARE.
|
||||
#include "amd_device_functions.h"
|
||||
#endif
|
||||
|
||||
template<bool B, typename T, typename F> struct Cond_t;
|
||||
template <bool B, typename T, typename F> struct Cond_t;
|
||||
|
||||
template<typename T, typename F> struct Cond_t<true, T, F> { using type = T; };
|
||||
template<typename T, typename F> struct Cond_t<false, T, F> { using type = F; };
|
||||
template <typename T, typename F> struct Cond_t<true, T, F> {
|
||||
using type = T;
|
||||
};
|
||||
template <typename T, typename F> struct Cond_t<false, T, F> {
|
||||
using type = F;
|
||||
};
|
||||
|
||||
#if !__HIP_DEVICE_COMPILE__
|
||||
//TODO: Remove this after compiler pre-defines the following Macros.
|
||||
// TODO: Remove this after compiler pre-defines the following Macros.
|
||||
#define __HIP_MEMORY_SCOPE_SINGLETHREAD 1
|
||||
#define __HIP_MEMORY_SCOPE_WAVEFRONT 2
|
||||
#define __HIP_MEMORY_SCOPE_WORKGROUP 3
|
||||
@@ -45,26 +49,17 @@ template<typename T, typename F> struct Cond_t<false, T, F> { using type = F; };
|
||||
#endif
|
||||
|
||||
// Atomic expanders
|
||||
template<
|
||||
int mem_order = __ATOMIC_SEQ_CST,
|
||||
int mem_scope= __HIP_MEMORY_SCOPE_SYSTEM,
|
||||
typename T,
|
||||
typename Op,
|
||||
typename F>
|
||||
inline
|
||||
__attribute__((always_inline, device))
|
||||
T hip_cas_expander(T* p, T x, Op op, F f) noexcept
|
||||
{
|
||||
template <int mem_order = __ATOMIC_SEQ_CST, int mem_scope = __HIP_MEMORY_SCOPE_SYSTEM, typename T,
|
||||
typename Op, typename F>
|
||||
inline __attribute__((always_inline, device)) T hip_cas_expander(T* p, T x, Op op, F f) noexcept {
|
||||
using FP = __attribute__((address_space(0))) const void*;
|
||||
|
||||
__device__
|
||||
extern bool is_shared_workaround(FP) asm("llvm.amdgcn.is.shared");
|
||||
__device__ extern bool is_shared_workaround(FP) asm("llvm.amdgcn.is.shared");
|
||||
|
||||
if (is_shared_workaround((FP)p))
|
||||
return f();
|
||||
if (is_shared_workaround((FP)p)) return f();
|
||||
|
||||
using U = typename Cond_t<
|
||||
sizeof(T) == sizeof(unsigned int), unsigned int, unsigned long long>::type;
|
||||
using U =
|
||||
typename Cond_t<sizeof(T) == sizeof(unsigned int), unsigned int, unsigned long long>::type;
|
||||
|
||||
auto q = reinterpret_cast<U*>(p);
|
||||
|
||||
@@ -74,204 +69,158 @@ T hip_cas_expander(T* p, T x, Op op, F f) noexcept
|
||||
tmp1 = tmp0;
|
||||
|
||||
op(reinterpret_cast<T&>(tmp1), x);
|
||||
} while (!__hip_atomic_compare_exchange_strong(q, &tmp0, tmp1, mem_order,
|
||||
mem_order, mem_scope));
|
||||
} while (!__hip_atomic_compare_exchange_strong(q, &tmp0, tmp1, mem_order, mem_order, mem_scope));
|
||||
|
||||
return reinterpret_cast<const T&>(tmp0);
|
||||
}
|
||||
|
||||
template<
|
||||
int mem_order = __ATOMIC_SEQ_CST,
|
||||
int mem_scope= __HIP_MEMORY_SCOPE_SYSTEM,
|
||||
typename T,
|
||||
typename Cmp,
|
||||
typename F>
|
||||
inline
|
||||
__attribute__((always_inline, device))
|
||||
T hip_cas_extrema_expander(T* p, T x, Cmp cmp, F f) noexcept
|
||||
{
|
||||
template <int mem_order = __ATOMIC_SEQ_CST, int mem_scope = __HIP_MEMORY_SCOPE_SYSTEM, typename T,
|
||||
typename Cmp, typename F>
|
||||
inline __attribute__((always_inline, device)) T hip_cas_extrema_expander(T* p, T x, Cmp cmp,
|
||||
F f) noexcept {
|
||||
using FP = __attribute__((address_space(0))) const void*;
|
||||
|
||||
__device__
|
||||
extern bool is_shared_workaround(FP) asm("llvm.amdgcn.is.shared");
|
||||
__device__ extern bool is_shared_workaround(FP) asm("llvm.amdgcn.is.shared");
|
||||
|
||||
if (is_shared_workaround((FP)p))
|
||||
return f();
|
||||
if (is_shared_workaround((FP)p)) return f();
|
||||
|
||||
using U = typename Cond_t<
|
||||
sizeof(T) == sizeof(unsigned int), unsigned int, unsigned long long>::type;
|
||||
using U =
|
||||
typename Cond_t<sizeof(T) == sizeof(unsigned int), unsigned int, unsigned long long>::type;
|
||||
|
||||
auto q = reinterpret_cast<U*>(p);
|
||||
|
||||
U tmp{__hip_atomic_load(q, mem_order, mem_scope)};
|
||||
while (cmp(x, reinterpret_cast<const T&>(tmp)) &&
|
||||
!__hip_atomic_compare_exchange_strong(q, &tmp, x, mem_order, mem_order,
|
||||
mem_scope));
|
||||
!__hip_atomic_compare_exchange_strong(q, &tmp, x, mem_order, mem_order, mem_scope));
|
||||
|
||||
return reinterpret_cast<const T&>(tmp);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned short int atomicCAS(unsigned short int* address, unsigned short int compare,
|
||||
unsigned short int val) {
|
||||
__device__ inline unsigned short int atomicCAS(unsigned short int* address,
|
||||
unsigned short int compare, unsigned short int val) {
|
||||
__hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
|
||||
__HIP_MEMORY_SCOPE_AGENT);
|
||||
return compare;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned short int atomicCAS_system(unsigned short int* address, unsigned short int compare,
|
||||
unsigned short int val) {
|
||||
__device__ inline unsigned short int atomicCAS_system(unsigned short int* address,
|
||||
unsigned short int compare,
|
||||
unsigned short int val) {
|
||||
__hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
|
||||
__HIP_MEMORY_SCOPE_SYSTEM);
|
||||
return compare;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int atomicCAS(int* address, int compare, int val) {
|
||||
__hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
|
||||
__HIP_MEMORY_SCOPE_AGENT);
|
||||
return compare;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int atomicCAS_system(int* address, int compare, int val) {
|
||||
__hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
|
||||
__HIP_MEMORY_SCOPE_SYSTEM);
|
||||
return compare;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicCAS(unsigned int* address, unsigned int compare, unsigned int val) {
|
||||
__device__ inline int atomicCAS(int* address, int compare, int val) {
|
||||
__hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
|
||||
__HIP_MEMORY_SCOPE_AGENT);
|
||||
return compare;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicCAS_system(unsigned int* address, unsigned int compare, unsigned int val) {
|
||||
__device__ inline int atomicCAS_system(int* address, int compare, int val) {
|
||||
__hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
|
||||
__HIP_MEMORY_SCOPE_SYSTEM);
|
||||
return compare;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long atomicCAS(unsigned long* address, unsigned long compare, unsigned long val) {
|
||||
__device__ inline unsigned int atomicCAS(unsigned int* address, unsigned int compare,
|
||||
unsigned int val) {
|
||||
__hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
|
||||
__HIP_MEMORY_SCOPE_AGENT);
|
||||
return compare;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long atomicCAS_system(unsigned long* address, unsigned long compare, unsigned long val) {
|
||||
__device__ inline unsigned int atomicCAS_system(unsigned int* address, unsigned int compare,
|
||||
unsigned int val) {
|
||||
__hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
|
||||
__HIP_MEMORY_SCOPE_SYSTEM);
|
||||
return compare;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long long atomicCAS(unsigned long long* address, unsigned long long compare,
|
||||
unsigned long long val) {
|
||||
__device__ inline unsigned long atomicCAS(unsigned long* address, unsigned long compare,
|
||||
unsigned long val) {
|
||||
__hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
|
||||
__HIP_MEMORY_SCOPE_AGENT);
|
||||
return compare;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long long atomicCAS_system(unsigned long long* address, unsigned long long compare,
|
||||
unsigned long long val) {
|
||||
__device__ inline unsigned long atomicCAS_system(unsigned long* address, unsigned long compare,
|
||||
unsigned long val) {
|
||||
__hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
|
||||
__HIP_MEMORY_SCOPE_SYSTEM);
|
||||
return compare;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
float atomicCAS(float* address, float compare, float val) {
|
||||
__device__ inline unsigned long long atomicCAS(unsigned long long* address,
|
||||
unsigned long long compare, unsigned long long val) {
|
||||
__hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
|
||||
__HIP_MEMORY_SCOPE_AGENT);
|
||||
return compare;
|
||||
return compare;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
float atomicCAS_system(float* address, float compare, float val) {
|
||||
__device__ inline unsigned long long atomicCAS_system(unsigned long long* address,
|
||||
unsigned long long compare,
|
||||
unsigned long long val) {
|
||||
__hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
|
||||
__HIP_MEMORY_SCOPE_SYSTEM);
|
||||
return compare;
|
||||
return compare;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
double atomicCAS(double* address, double compare, double val) {
|
||||
__device__ inline float atomicCAS(float* address, float compare, float val) {
|
||||
__hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
|
||||
__HIP_MEMORY_SCOPE_AGENT);
|
||||
return compare;
|
||||
return compare;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
double atomicCAS_system(double* address, double compare, double val) {
|
||||
__device__ inline float atomicCAS_system(float* address, float compare, float val) {
|
||||
__hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
|
||||
__HIP_MEMORY_SCOPE_SYSTEM);
|
||||
return compare;
|
||||
return compare;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int atomicAdd(int* address, int val) {
|
||||
__device__ inline double atomicCAS(double* address, double compare, double val) {
|
||||
__hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
|
||||
__HIP_MEMORY_SCOPE_AGENT);
|
||||
return compare;
|
||||
}
|
||||
|
||||
__device__ inline double atomicCAS_system(double* address, double compare, double val) {
|
||||
__hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
|
||||
__HIP_MEMORY_SCOPE_SYSTEM);
|
||||
return compare;
|
||||
}
|
||||
|
||||
__device__ inline int atomicAdd(int* address, int val) {
|
||||
return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int atomicAdd_system(int* address, int val) {
|
||||
__device__ inline int atomicAdd_system(int* address, int val) {
|
||||
return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicAdd(unsigned int* address, unsigned int val) {
|
||||
__device__ inline unsigned int atomicAdd(unsigned int* address, unsigned int val) {
|
||||
return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicAdd_system(unsigned int* address, unsigned int val) {
|
||||
__device__ inline unsigned int atomicAdd_system(unsigned int* address, unsigned int val) {
|
||||
return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long atomicAdd(unsigned long* address, unsigned long val) {
|
||||
__device__ inline unsigned long atomicAdd(unsigned long* address, unsigned long val) {
|
||||
return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long atomicAdd_system(unsigned long* address, unsigned long val) {
|
||||
__device__ inline unsigned long atomicAdd_system(unsigned long* address, unsigned long val) {
|
||||
return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long long atomicAdd(unsigned long long* address, unsigned long long val) {
|
||||
__device__ inline unsigned long long atomicAdd(unsigned long long* address,
|
||||
unsigned long long val) {
|
||||
return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long long atomicAdd_system(unsigned long long* address, unsigned long long val) {
|
||||
__device__ inline unsigned long long atomicAdd_system(unsigned long long* address,
|
||||
unsigned long long val) {
|
||||
return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
@@ -281,37 +230,26 @@ unsigned long long atomicAdd_system(unsigned long long* address, unsigned long l
|
||||
#define __HIP_FINE_GRAINED_MEMORY
|
||||
#endif
|
||||
|
||||
__device__
|
||||
inline
|
||||
float atomicAdd(float* address, float val) {
|
||||
__device__ inline float atomicAdd(float* address, float val) {
|
||||
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
|
||||
return unsafeAtomicAdd(address, val);
|
||||
#else
|
||||
__HIP_FINE_GRAINED_MEMORY {
|
||||
__HIP_FINE_GRAINED_MEMORY {
|
||||
return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
float atomicAdd_system(float* address, float val) {
|
||||
__device__ inline float atomicAdd_system(float* address, float val) {
|
||||
return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
#if !defined(__HIPCC_RTC__)
|
||||
HIP_DEPRECATED("use atomicAdd instead")
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
__device__
|
||||
inline
|
||||
void atomicAddNoRet(float* address, float val)
|
||||
{
|
||||
unsafeAtomicAdd(address, val);
|
||||
}
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
__device__ inline void atomicAddNoRet(float* address, float val) { unsafeAtomicAdd(address, val); }
|
||||
|
||||
__device__
|
||||
inline
|
||||
double atomicAdd(double* address, double val) {
|
||||
__device__ inline double atomicAdd(double* address, double val) {
|
||||
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
|
||||
return unsafeAtomicAdd(address, val);
|
||||
#else
|
||||
@@ -321,63 +259,45 @@ double atomicAdd(double* address, double val) {
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
double atomicAdd_system(double* address, double val) {
|
||||
__device__ inline double atomicAdd_system(double* address, double val) {
|
||||
return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int atomicSub(int* address, int val) {
|
||||
__device__ inline int atomicSub(int* address, int val) {
|
||||
return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int atomicSub_system(int* address, int val) {
|
||||
__device__ inline int atomicSub_system(int* address, int val) {
|
||||
return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicSub(unsigned int* address, unsigned int val) {
|
||||
__device__ inline unsigned int atomicSub(unsigned int* address, unsigned int val) {
|
||||
return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicSub_system(unsigned int* address, unsigned int val) {
|
||||
__device__ inline unsigned int atomicSub_system(unsigned int* address, unsigned int val) {
|
||||
return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long atomicSub(unsigned long* address, unsigned long val) {
|
||||
__device__ inline unsigned long atomicSub(unsigned long* address, unsigned long val) {
|
||||
return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long atomicSub_system(unsigned long* address, unsigned long val) {
|
||||
__device__ inline unsigned long atomicSub_system(unsigned long* address, unsigned long val) {
|
||||
return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long long atomicSub(unsigned long long* address, unsigned long long val) {
|
||||
__device__ inline unsigned long long atomicSub(unsigned long long* address,
|
||||
unsigned long long val) {
|
||||
return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long long atomicSub_system(unsigned long long* address, unsigned long long val) {
|
||||
__device__ inline unsigned long long atomicSub_system(unsigned long long* address,
|
||||
unsigned long long val) {
|
||||
return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
float atomicSub(float* address, float val) {
|
||||
__device__ inline float atomicSub(float* address, float val) {
|
||||
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
|
||||
return unsafeAtomicAdd(address, -val);
|
||||
#else
|
||||
@@ -387,15 +307,11 @@ float atomicSub(float* address, float val) {
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
float atomicSub_system(float* address, float val) {
|
||||
__device__ inline float atomicSub_system(float* address, float val) {
|
||||
return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
double atomicSub(double* address, double val) {
|
||||
__device__ inline double atomicSub(double* address, double val) {
|
||||
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
|
||||
return unsafeAtomicAdd(address, -val);
|
||||
#else
|
||||
@@ -405,147 +321,103 @@ double atomicSub(double* address, double val) {
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
double atomicSub_system(double* address, double val) {
|
||||
__device__ inline double atomicSub_system(double* address, double val) {
|
||||
return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int atomicExch(int* address, int val) {
|
||||
__device__ inline int atomicExch(int* address, int val) {
|
||||
return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int atomicExch_system(int* address, int val) {
|
||||
__device__ inline int atomicExch_system(int* address, int val) {
|
||||
return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicExch(unsigned int* address, unsigned int val) {
|
||||
__device__ inline unsigned int atomicExch(unsigned int* address, unsigned int val) {
|
||||
return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicExch_system(unsigned int* address, unsigned int val) {
|
||||
__device__ inline unsigned int atomicExch_system(unsigned int* address, unsigned int val) {
|
||||
return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long atomicExch(unsigned long* address, unsigned long val) {
|
||||
__device__ inline unsigned long atomicExch(unsigned long* address, unsigned long val) {
|
||||
return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long atomicExch_system(unsigned long* address, unsigned long val) {
|
||||
__device__ inline unsigned long atomicExch_system(unsigned long* address, unsigned long val) {
|
||||
return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long long atomicExch(unsigned long long* address, unsigned long long val) {
|
||||
__device__ inline unsigned long long atomicExch(unsigned long long* address,
|
||||
unsigned long long val) {
|
||||
return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long long atomicExch_system(unsigned long long* address, unsigned long long val) {
|
||||
__device__ inline unsigned long long atomicExch_system(unsigned long long* address,
|
||||
unsigned long long val) {
|
||||
return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
float atomicExch(float* address, float val) {
|
||||
__device__ inline float atomicExch(float* address, float val) {
|
||||
return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
float atomicExch_system(float* address, float val) {
|
||||
__device__ inline float atomicExch_system(float* address, float val) {
|
||||
return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
double atomicExch(double* address, double val) {
|
||||
__device__ inline double atomicExch(double* address, double val) {
|
||||
return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
double atomicExch_system(double* address, double val) {
|
||||
__device__ inline double atomicExch_system(double* address, double val) {
|
||||
return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int atomicMin(int* address, int val) {
|
||||
__device__ inline int atomicMin(int* address, int val) {
|
||||
return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int atomicMin_system(int* address, int val) {
|
||||
__device__ inline int atomicMin_system(int* address, int val) {
|
||||
return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicMin(unsigned int* address, unsigned int val) {
|
||||
__device__ inline unsigned int atomicMin(unsigned int* address, unsigned int val) {
|
||||
return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicMin_system(unsigned int* address, unsigned int val) {
|
||||
__device__ inline unsigned int atomicMin_system(unsigned int* address, unsigned int val) {
|
||||
return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long atomicMin(unsigned long* address, unsigned long val) {
|
||||
__device__ inline unsigned long atomicMin(unsigned long* address, unsigned long val) {
|
||||
return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long atomicMin_system(unsigned long* address, unsigned long val) {
|
||||
__device__ inline unsigned long atomicMin_system(unsigned long* address, unsigned long val) {
|
||||
return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long long atomicMin(unsigned long long* address, unsigned long long val) {
|
||||
__device__ inline unsigned long long atomicMin(unsigned long long* address,
|
||||
unsigned long long val) {
|
||||
return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long long atomicMin_system(unsigned long long* address, unsigned long long val) {
|
||||
__device__ inline unsigned long long atomicMin_system(unsigned long long* address,
|
||||
unsigned long long val) {
|
||||
return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
long long atomicMin(long long* address, long long val) {
|
||||
__device__ inline long long atomicMin(long long* address, long long val) {
|
||||
return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
long long atomicMin_system(long long* address, long long val) {
|
||||
__device__ inline long long atomicMin_system(long long* address, long long val) {
|
||||
return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
float atomicMin(float* addr, float val) {
|
||||
__device__ inline float atomicMin(float* addr, float val) {
|
||||
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
|
||||
return unsafeAtomicMin(addr, val);
|
||||
#else
|
||||
@@ -555,9 +427,7 @@ float atomicMin(float* addr, float val) {
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
float atomicMin_system(float* addr, float val) {
|
||||
__device__ inline float atomicMin_system(float* addr, float val) {
|
||||
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
|
||||
return unsafeAtomicMin(addr, val);
|
||||
#else
|
||||
@@ -567,9 +437,7 @@ float atomicMin_system(float* addr, float val) {
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
double atomicMin(double* addr, double val) {
|
||||
__device__ inline double atomicMin(double* addr, double val) {
|
||||
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
|
||||
return unsafeAtomicMin(addr, val);
|
||||
#else
|
||||
@@ -579,9 +447,7 @@ double atomicMin(double* addr, double val) {
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
double atomicMin_system(double* addr, double val) {
|
||||
__device__ inline double atomicMin_system(double* addr, double val) {
|
||||
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
|
||||
return unsafeAtomicMin(addr, val);
|
||||
#else
|
||||
@@ -591,68 +457,48 @@ double atomicMin_system(double* addr, double val) {
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int atomicMax(int* address, int val) {
|
||||
__device__ inline int atomicMax(int* address, int val) {
|
||||
return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int atomicMax_system(int* address, int val) {
|
||||
__device__ inline int atomicMax_system(int* address, int val) {
|
||||
return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicMax(unsigned int* address, unsigned int val) {
|
||||
__device__ inline unsigned int atomicMax(unsigned int* address, unsigned int val) {
|
||||
return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicMax_system(unsigned int* address, unsigned int val) {
|
||||
__device__ inline unsigned int atomicMax_system(unsigned int* address, unsigned int val) {
|
||||
return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long atomicMax(unsigned long* address, unsigned long val) {
|
||||
__device__ inline unsigned long atomicMax(unsigned long* address, unsigned long val) {
|
||||
return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long atomicMax_system(unsigned long* address, unsigned long val) {
|
||||
__device__ inline unsigned long atomicMax_system(unsigned long* address, unsigned long val) {
|
||||
return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long long atomicMax(unsigned long long* address, unsigned long long val) {
|
||||
__device__ inline unsigned long long atomicMax(unsigned long long* address,
|
||||
unsigned long long val) {
|
||||
return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long long atomicMax_system(unsigned long long* address, unsigned long long val) {
|
||||
__device__ inline unsigned long long atomicMax_system(unsigned long long* address,
|
||||
unsigned long long val) {
|
||||
return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
long long atomicMax(long long* address, long long val) {
|
||||
__device__ inline long long atomicMax(long long* address, long long val) {
|
||||
return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
long long atomicMax_system(long long* address, long long val) {
|
||||
__device__ inline long long atomicMax_system(long long* address, long long val) {
|
||||
return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
float atomicMax(float* addr, float val) {
|
||||
__device__ inline float atomicMax(float* addr, float val) {
|
||||
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
|
||||
return unsafeAtomicMax(addr, val);
|
||||
#else
|
||||
@@ -662,9 +508,7 @@ float atomicMax(float* addr, float val) {
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
float atomicMax_system(float* addr, float val) {
|
||||
__device__ inline float atomicMax_system(float* addr, float val) {
|
||||
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
|
||||
return unsafeAtomicMax(addr, val);
|
||||
#else
|
||||
@@ -674,9 +518,7 @@ float atomicMax_system(float* addr, float val) {
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
double atomicMax(double* addr, double val) {
|
||||
__device__ inline double atomicMax(double* addr, double val) {
|
||||
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
|
||||
return unsafeAtomicMax(addr, val);
|
||||
#else
|
||||
@@ -686,9 +528,7 @@ double atomicMax(double* addr, double val) {
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
double atomicMax_system(double* addr, double val) {
|
||||
__device__ inline double atomicMax_system(double* addr, double val) {
|
||||
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
|
||||
return unsafeAtomicMax(addr, val);
|
||||
#else
|
||||
@@ -698,160 +538,111 @@ double atomicMax_system(double* addr, double val) {
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicInc(unsigned int* address, unsigned int val)
|
||||
{
|
||||
__device__ inline unsigned int atomicInc(unsigned int* address, unsigned int val) {
|
||||
return __builtin_amdgcn_atomic_inc32(address, val, __ATOMIC_RELAXED, "agent");
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicDec(unsigned int* address, unsigned int val)
|
||||
{
|
||||
__device__ inline unsigned int atomicDec(unsigned int* address, unsigned int val) {
|
||||
return __builtin_amdgcn_atomic_dec32(address, val, __ATOMIC_RELAXED, "agent");
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int atomicAnd(int* address, int val) {
|
||||
__device__ inline int atomicAnd(int* address, int val) {
|
||||
return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int atomicAnd_system(int* address, int val) {
|
||||
__device__ inline int atomicAnd_system(int* address, int val) {
|
||||
return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicAnd(unsigned int* address, unsigned int val) {
|
||||
__device__ inline unsigned int atomicAnd(unsigned int* address, unsigned int val) {
|
||||
return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicAnd_system(unsigned int* address, unsigned int val) {
|
||||
__device__ inline unsigned int atomicAnd_system(unsigned int* address, unsigned int val) {
|
||||
return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long atomicAnd(unsigned long* address, unsigned long val) {
|
||||
__device__ inline unsigned long atomicAnd(unsigned long* address, unsigned long val) {
|
||||
return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long atomicAnd_system(unsigned long* address, unsigned long val) {
|
||||
__device__ inline unsigned long atomicAnd_system(unsigned long* address, unsigned long val) {
|
||||
return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long long atomicAnd(unsigned long long* address, unsigned long long val) {
|
||||
__device__ inline unsigned long long atomicAnd(unsigned long long* address,
|
||||
unsigned long long val) {
|
||||
return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long long atomicAnd_system(unsigned long long* address, unsigned long long val) {
|
||||
__device__ inline unsigned long long atomicAnd_system(unsigned long long* address,
|
||||
unsigned long long val) {
|
||||
return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int atomicOr(int* address, int val) {
|
||||
__device__ inline int atomicOr(int* address, int val) {
|
||||
return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int atomicOr_system(int* address, int val) {
|
||||
__device__ inline int atomicOr_system(int* address, int val) {
|
||||
return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicOr(unsigned int* address, unsigned int val) {
|
||||
__device__ inline unsigned int atomicOr(unsigned int* address, unsigned int val) {
|
||||
return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicOr_system(unsigned int* address, unsigned int val) {
|
||||
__device__ inline unsigned int atomicOr_system(unsigned int* address, unsigned int val) {
|
||||
return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long atomicOr(unsigned long* address, unsigned long val) {
|
||||
__device__ inline unsigned long atomicOr(unsigned long* address, unsigned long val) {
|
||||
return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long atomicOr_system(unsigned long* address, unsigned long val) {
|
||||
__device__ inline unsigned long atomicOr_system(unsigned long* address, unsigned long val) {
|
||||
return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long long atomicOr(unsigned long long* address, unsigned long long val) {
|
||||
__device__ inline unsigned long long atomicOr(unsigned long long* address, unsigned long long val) {
|
||||
return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long long atomicOr_system(unsigned long long* address, unsigned long long val) {
|
||||
__device__ inline unsigned long long atomicOr_system(unsigned long long* address,
|
||||
unsigned long long val) {
|
||||
return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int atomicXor(int* address, int val) {
|
||||
__device__ inline int atomicXor(int* address, int val) {
|
||||
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int atomicXor_system(int* address, int val) {
|
||||
__device__ inline int atomicXor_system(int* address, int val) {
|
||||
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicXor(unsigned int* address, unsigned int val) {
|
||||
__device__ inline unsigned int atomicXor(unsigned int* address, unsigned int val) {
|
||||
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicXor_system(unsigned int* address, unsigned int val) {
|
||||
__device__ inline unsigned int atomicXor_system(unsigned int* address, unsigned int val) {
|
||||
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long atomicXor(unsigned long* address, unsigned long val) {
|
||||
__device__ inline unsigned long atomicXor(unsigned long* address, unsigned long val) {
|
||||
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long atomicXor_system(unsigned long* address, unsigned long val) {
|
||||
__device__ inline unsigned long atomicXor_system(unsigned long* address, unsigned long val) {
|
||||
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long long atomicXor(unsigned long long* address, unsigned long long val) {
|
||||
__device__ inline unsigned long long atomicXor(unsigned long long* address,
|
||||
unsigned long long val) {
|
||||
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long long atomicXor_system(unsigned long long* address, unsigned long long val) {
|
||||
__device__ inline unsigned long long atomicXor_system(unsigned long long* address,
|
||||
unsigned long long val) {
|
||||
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
|
||||
}
|
||||
|
||||
@@ -112,12 +112,12 @@
|
||||
#include <hip/amd_detail/amd_hip_common.h>
|
||||
#include "amd_hip_vector_types.h" // float2 etc
|
||||
#include "device_library_decls.h" // ocml conversion functions
|
||||
#include "math_fwd.h" // ocml device functions
|
||||
#include "math_fwd.h" // ocml device functions
|
||||
#if defined(__clang__) and defined(__HIP__)
|
||||
#include <hip/amd_detail/amd_warp_functions.h> // define warpSize
|
||||
#include <hip/amd_detail/amd_warp_sync_functions.h> // Sync functions
|
||||
#endif
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
|
||||
#define __BF16_DEVICE__ __device__
|
||||
#if defined(__HIPCC_RTC__)
|
||||
@@ -394,7 +394,7 @@ struct __attribute__((aligned(4))) __hip_bfloat162 {
|
||||
/*! \brief return a vector of bf16 */
|
||||
__BF16_HOST_DEVICE__ operator __bf16_2() const { return __xy_bf162; }
|
||||
|
||||
/*! \brief return a vector of bf16 */
|
||||
/*! \brief return a vector of bf16 */
|
||||
__BF16_HOST_DEVICE__ __hip_bfloat162& operator=(const __bf16_2 in) {
|
||||
__xy_bf162 = in;
|
||||
return *this;
|
||||
@@ -623,9 +623,13 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __ushort_as_bfloat16(const unsigned s
|
||||
*/
|
||||
__BF16_DEVICE_STATIC__
|
||||
__hip_bfloat16 __shfl(MAYBE_UNDEF __hip_bfloat16 var, int src_lane, int width = warpSize) {
|
||||
union { int i; __hip_bfloat16 f; } tmp; tmp.f = var;
|
||||
tmp.i = __shfl(tmp.i, src_lane, width);
|
||||
return tmp.f;
|
||||
union {
|
||||
int i;
|
||||
__hip_bfloat16 f;
|
||||
} tmp;
|
||||
tmp.f = var;
|
||||
tmp.i = __shfl(tmp.i, src_lane, width);
|
||||
return tmp.f;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -633,11 +637,15 @@ __hip_bfloat16 __shfl(MAYBE_UNDEF __hip_bfloat16 var, int src_lane, int width =
|
||||
* \brief shfl up warp intrinsic for bfloat16
|
||||
*/
|
||||
__BF16_DEVICE_STATIC__
|
||||
__hip_bfloat16 __shfl_up(MAYBE_UNDEF __hip_bfloat16 var,
|
||||
unsigned int lane_delta, int width = warpSize) {
|
||||
union { int i; __hip_bfloat16 f; } tmp; tmp.f = var;
|
||||
tmp.i = __shfl_up(tmp.i, lane_delta, width);
|
||||
return tmp.f;
|
||||
__hip_bfloat16 __shfl_up(MAYBE_UNDEF __hip_bfloat16 var, unsigned int lane_delta,
|
||||
int width = warpSize) {
|
||||
union {
|
||||
int i;
|
||||
__hip_bfloat16 f;
|
||||
} tmp;
|
||||
tmp.f = var;
|
||||
tmp.i = __shfl_up(tmp.i, lane_delta, width);
|
||||
return tmp.f;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -645,11 +653,15 @@ __hip_bfloat16 __shfl_up(MAYBE_UNDEF __hip_bfloat16 var,
|
||||
* \brief shfl down warp intrinsic for bfloat16
|
||||
*/
|
||||
__BF16_DEVICE_STATIC__
|
||||
__hip_bfloat16 __shfl_down(MAYBE_UNDEF __hip_bfloat16 var,
|
||||
unsigned int lane_delta, int width = warpSize) {
|
||||
union { int i; __hip_bfloat16 f; } tmp; tmp.f = var;
|
||||
tmp.i = __shfl_down(tmp.i, lane_delta, width);
|
||||
return tmp.f;
|
||||
__hip_bfloat16 __shfl_down(MAYBE_UNDEF __hip_bfloat16 var, unsigned int lane_delta,
|
||||
int width = warpSize) {
|
||||
union {
|
||||
int i;
|
||||
__hip_bfloat16 f;
|
||||
} tmp;
|
||||
tmp.f = var;
|
||||
tmp.i = __shfl_down(tmp.i, lane_delta, width);
|
||||
return tmp.f;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -658,9 +670,13 @@ __hip_bfloat16 __shfl_down(MAYBE_UNDEF __hip_bfloat16 var,
|
||||
*/
|
||||
__BF16_DEVICE_STATIC__
|
||||
__hip_bfloat16 __shfl_xor(MAYBE_UNDEF __hip_bfloat16 var, int lane_mask, int width = warpSize) {
|
||||
union { int i; __hip_bfloat16 f; } tmp; tmp.f = var;
|
||||
tmp.i = __shfl_xor(tmp.i, lane_mask, width);
|
||||
return tmp.f;
|
||||
union {
|
||||
int i;
|
||||
__hip_bfloat16 f;
|
||||
} tmp;
|
||||
tmp.f = var;
|
||||
tmp.i = __shfl_xor(tmp.i, lane_mask, width);
|
||||
return tmp.f;
|
||||
}
|
||||
|
||||
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
|
||||
@@ -771,7 +787,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat162 __shfl_xor_sync(const unsigned long long
|
||||
u.ui = __shfl_xor_sync<unsigned long long, unsigned int>(mask, u.ui, delta, width);
|
||||
return u.bf162;
|
||||
}
|
||||
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
|
||||
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
|
||||
|
||||
/**
|
||||
* \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
|
||||
@@ -924,7 +940,7 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmul2(const __hip_bfloat162 a,
|
||||
* \brief Multiplies two bfloat162 values, will not fuse into fma
|
||||
*/
|
||||
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmul2_rn(const __hip_bfloat162 a,
|
||||
const __hip_bfloat162 b) {
|
||||
const __hip_bfloat162 b) {
|
||||
#pragma clang fp contract(off)
|
||||
return __hip_bfloat162{__bf16_2(a) * __bf16_2(b)};
|
||||
}
|
||||
@@ -951,7 +967,7 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hsub2(const __hip_bfloat162 a,
|
||||
* \brief Subtracts two bfloat162 values, will not fuse into fma
|
||||
*/
|
||||
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hsub2_rn(const __hip_bfloat162 a,
|
||||
const __hip_bfloat162 b) {
|
||||
const __hip_bfloat162 b) {
|
||||
#pragma clang fp contract(off)
|
||||
return __hip_bfloat162{__bf16_2(a) - __bf16_2(b)};
|
||||
}
|
||||
@@ -1894,17 +1910,18 @@ __BF16_DEVICE_STATIC__ __hip_bfloat162 unsafeAtomicAdd(__hip_bfloat162* address,
|
||||
return old_val.h2r;
|
||||
#endif
|
||||
}
|
||||
__BF16_DEVICE_STATIC__ __hip_bfloat16 unsafeAtomicAdd(__hip_bfloat16 *address,
|
||||
__BF16_DEVICE_STATIC__ __hip_bfloat16 unsafeAtomicAdd(__hip_bfloat16* address,
|
||||
__hip_bfloat16 value) {
|
||||
static_assert(sizeof(unsigned short int) == sizeof(__hip_bfloat16_raw));
|
||||
unsigned short int* address_as_short = reinterpret_cast<unsigned short int *>(address);
|
||||
unsigned short int* address_as_short = reinterpret_cast<unsigned short int*>(address);
|
||||
// Align to 4 bytes
|
||||
unsigned int* aligned_addr = __builtin_bit_cast(unsigned int*,
|
||||
__builtin_bit_cast(unsigned long long int, address_as_short) &
|
||||
(unsigned long long int)(~0x3));
|
||||
unsigned int* aligned_addr =
|
||||
__builtin_bit_cast(unsigned int*,
|
||||
__builtin_bit_cast(unsigned long long int, address_as_short) &
|
||||
(unsigned long long int)(~0x3));
|
||||
|
||||
bool is_lower = __builtin_bit_cast(unsigned long long int, aligned_addr) ==
|
||||
__builtin_bit_cast(unsigned long long int, address);
|
||||
__builtin_bit_cast(unsigned long long int, address);
|
||||
|
||||
__hip_bfloat162 fval;
|
||||
if (is_lower)
|
||||
@@ -1912,10 +1929,9 @@ __BF16_DEVICE_STATIC__ __hip_bfloat16 unsafeAtomicAdd(__hip_bfloat16 *address,
|
||||
else
|
||||
fval = __halves2bfloat162(__float2bfloat16(0.0f), value);
|
||||
|
||||
__hip_bfloat162 *in = (__hip_bfloat162 *)(aligned_addr);
|
||||
__hip_bfloat162 out = unsafeAtomicAdd(in , fval);
|
||||
if (is_lower)
|
||||
return __low2bfloat16(out);
|
||||
__hip_bfloat162* in = (__hip_bfloat162*)(aligned_addr);
|
||||
__hip_bfloat162 out = unsafeAtomicAdd(in, fval);
|
||||
if (is_lower) return __low2bfloat16(out);
|
||||
return __high2bfloat16(out);
|
||||
}
|
||||
#endif // defined(__clang__) && defined(__HIP__)
|
||||
|
||||
@@ -31,9 +31,9 @@
|
||||
|
||||
#include "host_defines.h"
|
||||
#if defined(__HIPCC_RTC__)
|
||||
#define __HOST_DEVICE__ __device__
|
||||
#define __HOST_DEVICE__ __device__
|
||||
#else
|
||||
#define __HOST_DEVICE__ __host__ __device__
|
||||
#define __HOST_DEVICE__ __host__ __device__
|
||||
#endif
|
||||
|
||||
#if __cplusplus < 201103L || !defined(__HIPCC__)
|
||||
@@ -43,129 +43,106 @@
|
||||
|
||||
#include <stdint.h>
|
||||
/*! \brief Struct to represent a 16 bit brain floating point number. */
|
||||
typedef struct
|
||||
{
|
||||
uint16_t data;
|
||||
typedef struct {
|
||||
uint16_t data;
|
||||
} hip_bfloat16;
|
||||
|
||||
#else // __cplusplus < 201103L || !defined(__HIPCC__)
|
||||
#else // __cplusplus < 201103L || !defined(__HIPCC__)
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wshadow"
|
||||
struct hip_bfloat16
|
||||
{
|
||||
__hip_uint16_t data;
|
||||
struct hip_bfloat16 {
|
||||
__hip_uint16_t data;
|
||||
|
||||
enum truncate_t
|
||||
{
|
||||
truncate
|
||||
};
|
||||
enum truncate_t { truncate };
|
||||
|
||||
__HOST_DEVICE__ hip_bfloat16() = default;
|
||||
__HOST_DEVICE__ hip_bfloat16() = default;
|
||||
|
||||
// round upper 16 bits of IEEE float to convert to bfloat16
|
||||
explicit __HOST_DEVICE__ hip_bfloat16(float f)
|
||||
: data(float_to_bfloat16(f))
|
||||
{
|
||||
// round upper 16 bits of IEEE float to convert to bfloat16
|
||||
explicit __HOST_DEVICE__ hip_bfloat16(float f) : data(float_to_bfloat16(f)) {}
|
||||
|
||||
explicit __HOST_DEVICE__ hip_bfloat16(float f, truncate_t)
|
||||
: data(truncate_float_to_bfloat16(f)) {}
|
||||
|
||||
// zero extend lower 16 bits of bfloat16 to convert to IEEE float
|
||||
__HOST_DEVICE__ operator float() const {
|
||||
union {
|
||||
__hip_uint32_t int32;
|
||||
float fp32;
|
||||
} u = {__hip_uint32_t(data) << 16};
|
||||
return u.fp32;
|
||||
}
|
||||
|
||||
__HOST_DEVICE__ hip_bfloat16& operator=(const float& f) {
|
||||
data = float_to_bfloat16(f);
|
||||
return *this;
|
||||
}
|
||||
|
||||
static __HOST_DEVICE__ hip_bfloat16 round_to_bfloat16(float f) {
|
||||
hip_bfloat16 output;
|
||||
output.data = float_to_bfloat16(f);
|
||||
return output;
|
||||
}
|
||||
|
||||
static __HOST_DEVICE__ hip_bfloat16 round_to_bfloat16(float f, truncate_t) {
|
||||
hip_bfloat16 output;
|
||||
output.data = truncate_float_to_bfloat16(f);
|
||||
return output;
|
||||
}
|
||||
|
||||
private:
|
||||
static __HOST_DEVICE__ __hip_uint16_t float_to_bfloat16(float f) {
|
||||
union {
|
||||
float fp32;
|
||||
__hip_uint32_t int32;
|
||||
} u = {f};
|
||||
if (~u.int32 & 0x7f800000) {
|
||||
// When the exponent bits are not all 1s, then the value is zero, normal,
|
||||
// or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
|
||||
// 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
|
||||
// This causes the bfloat16's mantissa to be incremented by 1 if the 16
|
||||
// least significant bits of the float mantissa are greater than 0x8000,
|
||||
// or if they are equal to 0x8000 and the least significant bit of the
|
||||
// bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
|
||||
// the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
|
||||
// has the value 0x7f, then incrementing it causes it to become 0x00 and
|
||||
// the exponent is incremented by one, which is the next higher FP value
|
||||
// to the unrounded bfloat16 value. When the bfloat16 value is subnormal
|
||||
// with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up
|
||||
// to a normal value with an exponent of 0x01 and a mantissa of 0x00.
|
||||
// When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
|
||||
// incrementing it causes it to become an exponent of 0xFF and a mantissa
|
||||
// of 0x00, which is Inf, the next higher value to the unrounded value.
|
||||
u.int32 += 0x7fff + ((u.int32 >> 16) & 1); // Round to nearest, round to even
|
||||
} else if (u.int32 & 0xffff) {
|
||||
// When all of the exponent bits are 1, the value is Inf or NaN.
|
||||
// Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
|
||||
// mantissa bit. Quiet NaN is indicated by the most significant mantissa
|
||||
// bit being 1. Signaling NaN is indicated by the most significant
|
||||
// mantissa bit being 0 but some other bit(s) being 1. If any of the
|
||||
// lower 16 bits of the mantissa are 1, we set the least significant bit
|
||||
// of the bfloat16 mantissa, in order to preserve signaling NaN in case
|
||||
// the bloat16's mantissa bits are all 0.
|
||||
u.int32 |= 0x10000; // Preserve signaling NaN
|
||||
}
|
||||
return __hip_uint16_t(u.int32 >> 16);
|
||||
}
|
||||
|
||||
explicit __HOST_DEVICE__ hip_bfloat16(float f, truncate_t)
|
||||
: data(truncate_float_to_bfloat16(f))
|
||||
{
|
||||
}
|
||||
|
||||
// zero extend lower 16 bits of bfloat16 to convert to IEEE float
|
||||
__HOST_DEVICE__ operator float() const
|
||||
{
|
||||
union
|
||||
{
|
||||
__hip_uint32_t int32;
|
||||
float fp32;
|
||||
} u = {__hip_uint32_t(data) << 16};
|
||||
return u.fp32;
|
||||
}
|
||||
|
||||
__HOST_DEVICE__ hip_bfloat16 &operator=(const float& f)
|
||||
{
|
||||
data = float_to_bfloat16(f);
|
||||
return *this;
|
||||
}
|
||||
|
||||
static __HOST_DEVICE__ hip_bfloat16 round_to_bfloat16(float f)
|
||||
{
|
||||
hip_bfloat16 output;
|
||||
output.data = float_to_bfloat16(f);
|
||||
return output;
|
||||
}
|
||||
|
||||
static __HOST_DEVICE__ hip_bfloat16 round_to_bfloat16(float f, truncate_t)
|
||||
{
|
||||
hip_bfloat16 output;
|
||||
output.data = truncate_float_to_bfloat16(f);
|
||||
return output;
|
||||
}
|
||||
|
||||
private:
|
||||
static __HOST_DEVICE__ __hip_uint16_t float_to_bfloat16(float f)
|
||||
{
|
||||
union
|
||||
{
|
||||
float fp32;
|
||||
__hip_uint32_t int32;
|
||||
} u = {f};
|
||||
if(~u.int32 & 0x7f800000)
|
||||
{
|
||||
// When the exponent bits are not all 1s, then the value is zero, normal,
|
||||
// or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
|
||||
// 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
|
||||
// This causes the bfloat16's mantissa to be incremented by 1 if the 16
|
||||
// least significant bits of the float mantissa are greater than 0x8000,
|
||||
// or if they are equal to 0x8000 and the least significant bit of the
|
||||
// bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
|
||||
// the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
|
||||
// has the value 0x7f, then incrementing it causes it to become 0x00 and
|
||||
// the exponent is incremented by one, which is the next higher FP value
|
||||
// to the unrounded bfloat16 value. When the bfloat16 value is subnormal
|
||||
// with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up
|
||||
// to a normal value with an exponent of 0x01 and a mantissa of 0x00.
|
||||
// When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
|
||||
// incrementing it causes it to become an exponent of 0xFF and a mantissa
|
||||
// of 0x00, which is Inf, the next higher value to the unrounded value.
|
||||
u.int32 += 0x7fff + ((u.int32 >> 16) & 1); // Round to nearest, round to even
|
||||
}
|
||||
else if(u.int32 & 0xffff)
|
||||
{
|
||||
// When all of the exponent bits are 1, the value is Inf or NaN.
|
||||
// Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
|
||||
// mantissa bit. Quiet NaN is indicated by the most significant mantissa
|
||||
// bit being 1. Signaling NaN is indicated by the most significant
|
||||
// mantissa bit being 0 but some other bit(s) being 1. If any of the
|
||||
// lower 16 bits of the mantissa are 1, we set the least significant bit
|
||||
// of the bfloat16 mantissa, in order to preserve signaling NaN in case
|
||||
// the bloat16's mantissa bits are all 0.
|
||||
u.int32 |= 0x10000; // Preserve signaling NaN
|
||||
}
|
||||
return __hip_uint16_t(u.int32 >> 16);
|
||||
}
|
||||
|
||||
// Truncate instead of rounding, preserving SNaN
|
||||
static __HOST_DEVICE__ __hip_uint16_t truncate_float_to_bfloat16(float f)
|
||||
{
|
||||
union
|
||||
{
|
||||
float fp32;
|
||||
__hip_uint32_t int32;
|
||||
} u = {f};
|
||||
return __hip_uint16_t(u.int32 >> 16) | (!(~u.int32 & 0x7f800000) && (u.int32 & 0xffff));
|
||||
}
|
||||
// Truncate instead of rounding, preserving SNaN
|
||||
static __HOST_DEVICE__ __hip_uint16_t truncate_float_to_bfloat16(float f) {
|
||||
union {
|
||||
float fp32;
|
||||
__hip_uint32_t int32;
|
||||
} u = {f};
|
||||
return __hip_uint16_t(u.int32 >> 16) | (!(~u.int32 & 0x7f800000) && (u.int32 & 0xffff));
|
||||
}
|
||||
};
|
||||
#pragma clang diagnostic pop
|
||||
|
||||
typedef struct
|
||||
{
|
||||
__hip_uint16_t data;
|
||||
typedef struct {
|
||||
__hip_uint16_t data;
|
||||
} hip_bfloat16_public;
|
||||
|
||||
static_assert(__hip_internal::is_standard_layout<hip_bfloat16>{},
|
||||
@@ -176,118 +153,77 @@ static_assert(__hip_internal::is_trivial<hip_bfloat16>{},
|
||||
"hip_bfloat16 is not a trivial type, and thus is "
|
||||
"incompatible with C.");
|
||||
#if !defined(__HIPCC_RTC__)
|
||||
static_assert(sizeof(hip_bfloat16) == sizeof(hip_bfloat16_public)
|
||||
&& offsetof(hip_bfloat16, data) == offsetof(hip_bfloat16_public, data),
|
||||
static_assert(sizeof(hip_bfloat16) == sizeof(hip_bfloat16_public) &&
|
||||
offsetof(hip_bfloat16, data) == offsetof(hip_bfloat16_public, data),
|
||||
"internal hip_bfloat16 does not match public hip_bfloat16");
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& os, const hip_bfloat16& bf16)
|
||||
{
|
||||
inline std::ostream& operator<<(std::ostream& os, const hip_bfloat16& bf16) {
|
||||
return os << float(bf16);
|
||||
}
|
||||
#endif
|
||||
|
||||
inline __HOST_DEVICE__ hip_bfloat16 operator+(hip_bfloat16 a)
|
||||
{
|
||||
return a;
|
||||
inline __HOST_DEVICE__ hip_bfloat16 operator+(hip_bfloat16 a) { return a; }
|
||||
inline __HOST_DEVICE__ hip_bfloat16 operator-(hip_bfloat16 a) {
|
||||
a.data ^= 0x8000;
|
||||
return a;
|
||||
}
|
||||
inline __HOST_DEVICE__ hip_bfloat16 operator-(hip_bfloat16 a)
|
||||
{
|
||||
a.data ^= 0x8000;
|
||||
return a;
|
||||
inline __HOST_DEVICE__ hip_bfloat16 operator+(hip_bfloat16 a, hip_bfloat16 b) {
|
||||
return hip_bfloat16(float(a) + float(b));
|
||||
}
|
||||
inline __HOST_DEVICE__ hip_bfloat16 operator+(hip_bfloat16 a, hip_bfloat16 b)
|
||||
{
|
||||
return hip_bfloat16(float(a) + float(b));
|
||||
inline __HOST_DEVICE__ hip_bfloat16 operator-(hip_bfloat16 a, hip_bfloat16 b) {
|
||||
return hip_bfloat16(float(a) - float(b));
|
||||
}
|
||||
inline __HOST_DEVICE__ hip_bfloat16 operator-(hip_bfloat16 a, hip_bfloat16 b)
|
||||
{
|
||||
return hip_bfloat16(float(a) - float(b));
|
||||
inline __HOST_DEVICE__ hip_bfloat16 operator*(hip_bfloat16 a, hip_bfloat16 b) {
|
||||
return hip_bfloat16(float(a) * float(b));
|
||||
}
|
||||
inline __HOST_DEVICE__ hip_bfloat16 operator*(hip_bfloat16 a, hip_bfloat16 b)
|
||||
{
|
||||
return hip_bfloat16(float(a) * float(b));
|
||||
inline __HOST_DEVICE__ hip_bfloat16 operator/(hip_bfloat16 a, hip_bfloat16 b) {
|
||||
return hip_bfloat16(float(a) / float(b));
|
||||
}
|
||||
inline __HOST_DEVICE__ hip_bfloat16 operator/(hip_bfloat16 a, hip_bfloat16 b)
|
||||
{
|
||||
return hip_bfloat16(float(a) / float(b));
|
||||
inline __HOST_DEVICE__ bool operator<(hip_bfloat16 a, hip_bfloat16 b) {
|
||||
return float(a) < float(b);
|
||||
}
|
||||
inline __HOST_DEVICE__ bool operator<(hip_bfloat16 a, hip_bfloat16 b)
|
||||
{
|
||||
return float(a) < float(b);
|
||||
inline __HOST_DEVICE__ bool operator==(hip_bfloat16 a, hip_bfloat16 b) {
|
||||
return float(a) == float(b);
|
||||
}
|
||||
inline __HOST_DEVICE__ bool operator==(hip_bfloat16 a, hip_bfloat16 b)
|
||||
{
|
||||
return float(a) == float(b);
|
||||
inline __HOST_DEVICE__ bool operator>(hip_bfloat16 a, hip_bfloat16 b) { return b < a; }
|
||||
inline __HOST_DEVICE__ bool operator<=(hip_bfloat16 a, hip_bfloat16 b) { return !(a > b); }
|
||||
inline __HOST_DEVICE__ bool operator!=(hip_bfloat16 a, hip_bfloat16 b) { return !(a == b); }
|
||||
inline __HOST_DEVICE__ bool operator>=(hip_bfloat16 a, hip_bfloat16 b) { return !(a < b); }
|
||||
inline __HOST_DEVICE__ hip_bfloat16& operator+=(hip_bfloat16& a, hip_bfloat16 b) {
|
||||
return a = a + b;
|
||||
}
|
||||
inline __HOST_DEVICE__ bool operator>(hip_bfloat16 a, hip_bfloat16 b)
|
||||
{
|
||||
return b < a;
|
||||
inline __HOST_DEVICE__ hip_bfloat16& operator-=(hip_bfloat16& a, hip_bfloat16 b) {
|
||||
return a = a - b;
|
||||
}
|
||||
inline __HOST_DEVICE__ bool operator<=(hip_bfloat16 a, hip_bfloat16 b)
|
||||
{
|
||||
return !(a > b);
|
||||
inline __HOST_DEVICE__ hip_bfloat16& operator*=(hip_bfloat16& a, hip_bfloat16 b) {
|
||||
return a = a * b;
|
||||
}
|
||||
inline __HOST_DEVICE__ bool operator!=(hip_bfloat16 a, hip_bfloat16 b)
|
||||
{
|
||||
return !(a == b);
|
||||
inline __HOST_DEVICE__ hip_bfloat16& operator/=(hip_bfloat16& a, hip_bfloat16 b) {
|
||||
return a = a / b;
|
||||
}
|
||||
inline __HOST_DEVICE__ bool operator>=(hip_bfloat16 a, hip_bfloat16 b)
|
||||
{
|
||||
return !(a < b);
|
||||
inline __HOST_DEVICE__ hip_bfloat16& operator++(hip_bfloat16& a) { return a += hip_bfloat16(1.0f); }
|
||||
inline __HOST_DEVICE__ hip_bfloat16& operator--(hip_bfloat16& a) { return a -= hip_bfloat16(1.0f); }
|
||||
inline __HOST_DEVICE__ hip_bfloat16 operator++(hip_bfloat16& a, int) {
|
||||
hip_bfloat16 orig = a;
|
||||
++a;
|
||||
return orig;
|
||||
}
|
||||
inline __HOST_DEVICE__ hip_bfloat16& operator+=(hip_bfloat16& a, hip_bfloat16 b)
|
||||
{
|
||||
return a = a + b;
|
||||
}
|
||||
inline __HOST_DEVICE__ hip_bfloat16& operator-=(hip_bfloat16& a, hip_bfloat16 b)
|
||||
{
|
||||
return a = a - b;
|
||||
}
|
||||
inline __HOST_DEVICE__ hip_bfloat16& operator*=(hip_bfloat16& a, hip_bfloat16 b)
|
||||
{
|
||||
return a = a * b;
|
||||
}
|
||||
inline __HOST_DEVICE__ hip_bfloat16& operator/=(hip_bfloat16& a, hip_bfloat16 b)
|
||||
{
|
||||
return a = a / b;
|
||||
}
|
||||
inline __HOST_DEVICE__ hip_bfloat16& operator++(hip_bfloat16& a)
|
||||
{
|
||||
return a += hip_bfloat16(1.0f);
|
||||
}
|
||||
inline __HOST_DEVICE__ hip_bfloat16& operator--(hip_bfloat16& a)
|
||||
{
|
||||
return a -= hip_bfloat16(1.0f);
|
||||
}
|
||||
inline __HOST_DEVICE__ hip_bfloat16 operator++(hip_bfloat16& a, int)
|
||||
{
|
||||
hip_bfloat16 orig = a;
|
||||
++a;
|
||||
return orig;
|
||||
}
|
||||
inline __HOST_DEVICE__ hip_bfloat16 operator--(hip_bfloat16& a, int)
|
||||
{
|
||||
hip_bfloat16 orig = a;
|
||||
--a;
|
||||
return orig;
|
||||
inline __HOST_DEVICE__ hip_bfloat16 operator--(hip_bfloat16& a, int) {
|
||||
hip_bfloat16 orig = a;
|
||||
--a;
|
||||
return orig;
|
||||
}
|
||||
|
||||
namespace std
|
||||
{
|
||||
constexpr __HOST_DEVICE__ bool isinf(hip_bfloat16 a)
|
||||
{
|
||||
return !(~a.data & 0x7f80) && !(a.data & 0x7f);
|
||||
}
|
||||
constexpr __HOST_DEVICE__ bool isnan(hip_bfloat16 a)
|
||||
{
|
||||
return !(~a.data & 0x7f80) && +(a.data & 0x7f);
|
||||
}
|
||||
constexpr __HOST_DEVICE__ bool iszero(hip_bfloat16 a)
|
||||
{
|
||||
return !(a.data & 0x7fff);
|
||||
}
|
||||
namespace std {
|
||||
constexpr __HOST_DEVICE__ bool isinf(hip_bfloat16 a) {
|
||||
return !(~a.data & 0x7f80) && !(a.data & 0x7f);
|
||||
}
|
||||
constexpr __HOST_DEVICE__ bool isnan(hip_bfloat16 a) {
|
||||
return !(~a.data & 0x7f80) && +(a.data & 0x7f);
|
||||
}
|
||||
constexpr __HOST_DEVICE__ bool iszero(hip_bfloat16 a) { return !(a.data & 0x7fff); }
|
||||
} // namespace std
|
||||
|
||||
#endif // __cplusplus < 201103L || !defined(__HIPCC__)
|
||||
#endif // __cplusplus < 201103L || !defined(__HIPCC__)
|
||||
|
||||
#endif // _HIP_BFLOAT16_H_
|
||||
#endif // _HIP_BFLOAT16_H_
|
||||
|
||||
@@ -29,4 +29,4 @@ SOFTWARE.
|
||||
#define __HIP_CLANG_ONLY__ 0
|
||||
#endif
|
||||
|
||||
#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMMON_H
|
||||
#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMMON_H
|
||||
|
||||
@@ -41,7 +41,7 @@ THE SOFTWARE.
|
||||
#else
|
||||
#include "math.h"
|
||||
#endif
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
|
||||
typedef float2 hipFloatComplex;
|
||||
|
||||
@@ -50,41 +50,39 @@ __HOST_DEVICE__ static inline float hipCrealf(hipFloatComplex z) { return z.x; }
|
||||
__HOST_DEVICE__ static inline float hipCimagf(hipFloatComplex z) { return z.y; }
|
||||
|
||||
__HOST_DEVICE__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) {
|
||||
hipFloatComplex z;
|
||||
z.x = a;
|
||||
z.y = b;
|
||||
return z;
|
||||
hipFloatComplex z;
|
||||
z.x = a;
|
||||
z.y = b;
|
||||
return z;
|
||||
}
|
||||
|
||||
__HOST_DEVICE__ static inline hipFloatComplex hipConjf(hipFloatComplex z) {
|
||||
hipFloatComplex ret;
|
||||
ret.x = z.x;
|
||||
ret.y = -z.y;
|
||||
return ret;
|
||||
hipFloatComplex ret;
|
||||
ret.x = z.x;
|
||||
ret.y = -z.y;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__HOST_DEVICE__ static inline float hipCsqabsf(hipFloatComplex z) {
|
||||
return z.x * z.x + z.y * z.y;
|
||||
}
|
||||
__HOST_DEVICE__ static inline float hipCsqabsf(hipFloatComplex z) { return z.x * z.x + z.y * z.y; }
|
||||
|
||||
__HOST_DEVICE__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
|
||||
return make_hipFloatComplex(p.x + q.x, p.y + q.y);
|
||||
return make_hipFloatComplex(p.x + q.x, p.y + q.y);
|
||||
}
|
||||
|
||||
__HOST_DEVICE__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
|
||||
return make_hipFloatComplex(p.x - q.x, p.y - q.y);
|
||||
return make_hipFloatComplex(p.x - q.x, p.y - q.y);
|
||||
}
|
||||
|
||||
__HOST_DEVICE__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) {
|
||||
return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
|
||||
return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
|
||||
}
|
||||
|
||||
__HOST_DEVICE__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) {
|
||||
float sqabs = hipCsqabsf(q);
|
||||
hipFloatComplex ret;
|
||||
ret.x = (p.x * q.x + p.y * q.y) / sqabs;
|
||||
ret.y = (p.y * q.x - p.x * q.y) / sqabs;
|
||||
return ret;
|
||||
float sqabs = hipCsqabsf(q);
|
||||
hipFloatComplex ret;
|
||||
ret.x = (p.x * q.x + p.y * q.y) / sqabs;
|
||||
ret.y = (p.y * q.x - p.x * q.y) / sqabs;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__HOST_DEVICE__ static inline float hipCabsf(hipFloatComplex z) { return sqrtf(hipCsqabsf(z)); }
|
||||
@@ -97,41 +95,39 @@ __HOST_DEVICE__ static inline double hipCreal(hipDoubleComplex z) { return z.x;
|
||||
__HOST_DEVICE__ static inline double hipCimag(hipDoubleComplex z) { return z.y; }
|
||||
|
||||
__HOST_DEVICE__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) {
|
||||
hipDoubleComplex z;
|
||||
z.x = a;
|
||||
z.y = b;
|
||||
return z;
|
||||
hipDoubleComplex z;
|
||||
z.x = a;
|
||||
z.y = b;
|
||||
return z;
|
||||
}
|
||||
|
||||
__HOST_DEVICE__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) {
|
||||
hipDoubleComplex ret;
|
||||
ret.x = z.x;
|
||||
ret.y = -z.y;
|
||||
return ret;
|
||||
hipDoubleComplex ret;
|
||||
ret.x = z.x;
|
||||
ret.y = -z.y;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__HOST_DEVICE__ static inline double hipCsqabs(hipDoubleComplex z) {
|
||||
return z.x * z.x + z.y * z.y;
|
||||
}
|
||||
__HOST_DEVICE__ static inline double hipCsqabs(hipDoubleComplex z) { return z.x * z.x + z.y * z.y; }
|
||||
|
||||
__HOST_DEVICE__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
|
||||
return make_hipDoubleComplex(p.x + q.x, p.y + q.y);
|
||||
return make_hipDoubleComplex(p.x + q.x, p.y + q.y);
|
||||
}
|
||||
|
||||
__HOST_DEVICE__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) {
|
||||
return make_hipDoubleComplex(p.x - q.x, p.y - q.y);
|
||||
return make_hipDoubleComplex(p.x - q.x, p.y - q.y);
|
||||
}
|
||||
|
||||
__HOST_DEVICE__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) {
|
||||
return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
|
||||
return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
|
||||
}
|
||||
|
||||
__HOST_DEVICE__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) {
|
||||
double sqabs = hipCsqabs(q);
|
||||
hipDoubleComplex ret;
|
||||
ret.x = (p.x * q.x + p.y * q.y) / sqabs;
|
||||
ret.y = (p.y * q.x - p.x * q.y) / sqabs;
|
||||
return ret;
|
||||
double sqabs = hipCsqabs(q);
|
||||
hipDoubleComplex ret;
|
||||
ret.x = (p.x * q.x + p.y * q.y) / sqabs;
|
||||
ret.y = (p.y * q.x - p.x * q.y) / sqabs;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__HOST_DEVICE__ static inline double hipCabs(hipDoubleComplex z) { return sqrt(hipCsqabs(z)); }
|
||||
@@ -139,36 +135,36 @@ __HOST_DEVICE__ static inline double hipCabs(hipDoubleComplex z) { return sqrt(h
|
||||
typedef hipFloatComplex hipComplex;
|
||||
|
||||
__HOST_DEVICE__ static inline hipComplex make_hipComplex(float x, float y) {
|
||||
return make_hipFloatComplex(x, y);
|
||||
return make_hipFloatComplex(x, y);
|
||||
}
|
||||
|
||||
__HOST_DEVICE__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) {
|
||||
return make_hipFloatComplex((float)z.x, (float)z.y);
|
||||
return make_hipFloatComplex((float)z.x, (float)z.y);
|
||||
}
|
||||
|
||||
__HOST_DEVICE__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) {
|
||||
return make_hipDoubleComplex((double)z.x, (double)z.y);
|
||||
return make_hipDoubleComplex((double)z.x, (double)z.y);
|
||||
}
|
||||
|
||||
__HOST_DEVICE__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) {
|
||||
float real = (p.x * q.x) + r.x;
|
||||
float imag = (q.x * p.y) + r.y;
|
||||
float real = (p.x * q.x) + r.x;
|
||||
float imag = (q.x * p.y) + r.y;
|
||||
|
||||
real = -(p.y * q.y) + real;
|
||||
imag = (p.x * q.y) + imag;
|
||||
real = -(p.y * q.y) + real;
|
||||
imag = (p.x * q.y) + imag;
|
||||
|
||||
return make_hipComplex(real, imag);
|
||||
return make_hipComplex(real, imag);
|
||||
}
|
||||
|
||||
__HOST_DEVICE__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q,
|
||||
hipDoubleComplex r) {
|
||||
double real = (p.x * q.x) + r.x;
|
||||
double imag = (q.x * p.y) + r.y;
|
||||
hipDoubleComplex r) {
|
||||
double real = (p.x * q.x) + r.x;
|
||||
double imag = (q.x * p.y) + r.y;
|
||||
|
||||
real = -(p.y * q.y) + real;
|
||||
imag = (p.x * q.y) + imag;
|
||||
real = -(p.y * q.y) + real;
|
||||
imag = (p.x * q.y) + imag;
|
||||
|
||||
return make_hipDoubleComplex(real, imag);
|
||||
return make_hipDoubleComplex(real, imag);
|
||||
}
|
||||
|
||||
#endif //HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMPLEX_H
|
||||
#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMPLEX_H
|
||||
|
||||
@@ -49,10 +49,10 @@ namespace cooperative_groups {
|
||||
*/
|
||||
class thread_group {
|
||||
protected:
|
||||
__hip_uint32_t _type; //! Type of the thread_group.
|
||||
__hip_uint32_t _type; //! Type of the thread_group.
|
||||
__hip_uint32_t _num_threads; //! Total number of threads in the thread_group.
|
||||
__hip_uint64_t _mask; //! Lanemask for coalesced and tiled partitioned group types,
|
||||
//! LSB represents lane 0, and MSB represents lane 63
|
||||
__hip_uint64_t _mask; //! Lanemask for coalesced and tiled partitioned group types,
|
||||
//! LSB represents lane 0, and MSB represents lane 63
|
||||
|
||||
//! Construct a thread group, and set thread group type and other essential
|
||||
//! thread group properties. This generic thread group is directly constructed
|
||||
@@ -103,9 +103,9 @@ class thread_group {
|
||||
*
|
||||
* \details Causes all threads in the group to wait at this synchronization point,
|
||||
* and for all shared and global memory accesses by the threads to complete,
|
||||
* before running synchronization. This guarantees the visibility of accessed data
|
||||
* before running synchronization. This guarantees the visibility of accessed data
|
||||
* for all threads in the group.
|
||||
*
|
||||
*
|
||||
* \note There are potential read-after-write (RAW), write-after-read (WAR), or
|
||||
* write-after-write (WAW) hazards, when threads in the group access the
|
||||
* same addresses in shared or global memory. The data hazards can
|
||||
@@ -146,7 +146,6 @@ class multi_grid_group : public thread_group {
|
||||
: thread_group(internal::cg_multi_grid, size) {}
|
||||
|
||||
public:
|
||||
|
||||
//! Number of invocations participating in this multi-grid group. In other
|
||||
//! words, the number of GPUs.
|
||||
__CG_QUALIFIER__ __hip_uint32_t num_grids() { return internal::multi_grid::num_grids(); }
|
||||
@@ -155,7 +154,9 @@ class multi_grid_group : public thread_group {
|
||||
//! [0, num_grids()) of the GPU that kernel is running on.
|
||||
__CG_QUALIFIER__ __hip_uint32_t grid_rank() { return internal::multi_grid::grid_rank(); }
|
||||
//! @copydoc thread_group::thread_rank
|
||||
__CG_QUALIFIER__ __hip_uint32_t thread_rank() const { return internal::multi_grid::thread_rank(); }
|
||||
__CG_QUALIFIER__ __hip_uint32_t thread_rank() const {
|
||||
return internal::multi_grid::thread_rank();
|
||||
}
|
||||
//! @copydoc thread_group::is_valid
|
||||
__CG_QUALIFIER__ bool is_valid() const { return internal::multi_grid::is_valid(); }
|
||||
//! @copydoc thread_group::sync
|
||||
@@ -163,8 +164,8 @@ class multi_grid_group : public thread_group {
|
||||
};
|
||||
|
||||
/** \addtogroup CooperativeGConstruct Construct functions of Cooperative groups
|
||||
* \ingroup CooperativeG
|
||||
* @{ */
|
||||
* \ingroup CooperativeG
|
||||
* @{ */
|
||||
|
||||
/** \brief User-exposed API interface to construct grid cooperative group type
|
||||
* object - `multi_grid_group`.
|
||||
@@ -196,7 +197,8 @@ class grid_group : public thread_group {
|
||||
|
||||
protected:
|
||||
//! Construct grid thread group (through the API this_grid())
|
||||
explicit __CG_QUALIFIER__ grid_group(__hip_uint32_t size) : thread_group(internal::cg_grid, size) {}
|
||||
explicit __CG_QUALIFIER__ grid_group(__hip_uint32_t size)
|
||||
: thread_group(internal::cg_grid, size) {}
|
||||
|
||||
public:
|
||||
//! @copydoc thread_group::thread_rank
|
||||
@@ -237,6 +239,7 @@ class thread_block : public thread_group {
|
||||
unsigned int tile_size);
|
||||
friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent,
|
||||
unsigned int tile_size);
|
||||
|
||||
protected:
|
||||
// Construct a workgroup thread group (through the API this_thread_block())
|
||||
explicit __CG_QUALIFIER__ thread_block(__hip_uint32_t size)
|
||||
@@ -269,9 +272,13 @@ class thread_block : public thread_group {
|
||||
//! Returns 3-dimensional thread index within the block.
|
||||
__CG_STATIC_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); }
|
||||
//! @copydoc thread_group::thread_rank
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() { return internal::workgroup::thread_rank(); }
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() {
|
||||
return internal::workgroup::thread_rank();
|
||||
}
|
||||
//! @copydoc thread_group::num_threads
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() { return internal::workgroup::num_threads(); }
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() {
|
||||
return internal::workgroup::num_threads();
|
||||
}
|
||||
//! @copydoc thread_group::size
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t size() { return num_threads(); }
|
||||
//! @copydoc thread_group::is_valid
|
||||
@@ -282,7 +289,7 @@ class thread_block : public thread_group {
|
||||
__CG_QUALIFIER__ dim3 group_dim() { return internal::workgroup::block_dim(); }
|
||||
};
|
||||
|
||||
/** \ingroup CooperativeGConstruct
|
||||
/** \ingroup CooperativeGConstruct
|
||||
* \brief User-exposed API interface to construct workgroup cooperative
|
||||
* group type object - `thread_block`.
|
||||
*
|
||||
@@ -335,7 +342,9 @@ class tiled_group : public thread_group {
|
||||
|
||||
public:
|
||||
//! @copydoc thread_group::num_threads
|
||||
__CG_QUALIFIER__ unsigned int num_threads() const { return (coalesced_info.tiled_info.num_threads); }
|
||||
__CG_QUALIFIER__ unsigned int num_threads() const {
|
||||
return (coalesced_info.tiled_info.num_threads);
|
||||
}
|
||||
|
||||
//! @copydoc thread_group::size
|
||||
__CG_QUALIFIER__ unsigned int size() const { return num_threads(); }
|
||||
@@ -346,9 +355,7 @@ class tiled_group : public thread_group {
|
||||
}
|
||||
|
||||
//! @copydoc thread_group::sync
|
||||
__CG_QUALIFIER__ void sync() const {
|
||||
internal::tiled_group::sync();
|
||||
}
|
||||
__CG_QUALIFIER__ void sync() const { internal::tiled_group::sync(); }
|
||||
};
|
||||
|
||||
template <unsigned int size, class ParentCGTy> class thread_block_tile;
|
||||
@@ -363,8 +370,10 @@ template <unsigned int size, class ParentCGTy> class thread_block_tile;
|
||||
class coalesced_group : public thread_group {
|
||||
private:
|
||||
friend __CG_QUALIFIER__ coalesced_group coalesced_threads();
|
||||
friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size);
|
||||
friend __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size);
|
||||
friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
|
||||
unsigned int tile_size);
|
||||
friend __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent,
|
||||
unsigned int tile_size);
|
||||
friend __CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, bool pred);
|
||||
template <unsigned int fsize, class fparent>
|
||||
friend __CG_QUALIFIER__ coalesced_group
|
||||
@@ -381,9 +390,11 @@ class coalesced_group : public thread_group {
|
||||
// prepare a mask for further partitioning it so that it stays coalesced.
|
||||
if (coalesced_info.tiled_info.is_tiled) {
|
||||
unsigned int base_offset = (thread_rank() & (~(tile_size - 1)));
|
||||
unsigned int masklength = min(static_cast<unsigned int>(num_threads()) - base_offset, tile_size);
|
||||
lane_mask full_mask = (static_cast<int>(warpSize) == 32) ? static_cast<lane_mask>((1u << 32) - 1)
|
||||
: static_cast<lane_mask>(-1ull);
|
||||
unsigned int masklength =
|
||||
min(static_cast<unsigned int>(num_threads()) - base_offset, tile_size);
|
||||
lane_mask full_mask = (static_cast<int>(warpSize) == 32)
|
||||
? static_cast<lane_mask>((1u << 32) - 1)
|
||||
: static_cast<lane_mask>(-1ull);
|
||||
lane_mask member_mask = full_mask >> (warpSize - masklength);
|
||||
|
||||
member_mask <<= (__lane_id() & ~(tile_size - 1));
|
||||
@@ -404,7 +415,7 @@ class coalesced_group : public thread_group {
|
||||
// Make sure the lane is active
|
||||
if (active) {
|
||||
if (lanes_to_skip <= 0 && tile_rank < tile_size) {
|
||||
// Prepare a member_mask that is appropriate for a tile
|
||||
// Prepare a member_mask that is appropriate for a tile
|
||||
member_mask |= active;
|
||||
tile_rank++;
|
||||
}
|
||||
@@ -414,59 +425,54 @@ class coalesced_group : public thread_group {
|
||||
coalesced_group coalesced_tile = coalesced_group(member_mask);
|
||||
coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
|
||||
coalesced_tile.coalesced_info.tiled_info.meta_group_size =
|
||||
(num_threads() + tile_size - 1) / tile_size;
|
||||
(num_threads() + tile_size - 1) / tile_size;
|
||||
return coalesced_tile;
|
||||
}
|
||||
return coalesced_group(0);
|
||||
return coalesced_group(0);
|
||||
}
|
||||
|
||||
protected:
|
||||
// Constructor
|
||||
// Constructor
|
||||
explicit __CG_QUALIFIER__ coalesced_group(lane_mask member_mask)
|
||||
: thread_group(internal::cg_coalesced_group) {
|
||||
coalesced_info.member_mask = member_mask; // Which threads are active
|
||||
coalesced_info.num_threads = __popcll(coalesced_info.member_mask); // How many threads are active
|
||||
coalesced_info.tiled_info.is_tiled = false; // Not a partitioned group
|
||||
coalesced_info.member_mask = member_mask; // Which threads are active
|
||||
coalesced_info.num_threads =
|
||||
__popcll(coalesced_info.member_mask); // How many threads are active
|
||||
coalesced_info.tiled_info.is_tiled = false; // Not a partitioned group
|
||||
coalesced_info.tiled_info.meta_group_rank = 0;
|
||||
coalesced_info.tiled_info.meta_group_size = 1;
|
||||
}
|
||||
|
||||
public:
|
||||
//! @copydoc thread_group::num_threads
|
||||
__CG_QUALIFIER__ unsigned int num_threads() const {
|
||||
return coalesced_info.num_threads;
|
||||
}
|
||||
//! @copydoc thread_group::num_threads
|
||||
__CG_QUALIFIER__ unsigned int num_threads() const { return coalesced_info.num_threads; }
|
||||
|
||||
//! @copydoc thread_group::size
|
||||
__CG_QUALIFIER__ unsigned int size() const {
|
||||
return num_threads();
|
||||
}
|
||||
//! @copydoc thread_group::size
|
||||
__CG_QUALIFIER__ unsigned int size() const { return num_threads(); }
|
||||
|
||||
//! @copydoc thread_group::thread_rank
|
||||
__CG_QUALIFIER__ unsigned int thread_rank() const {
|
||||
return internal::coalesced_group::masked_bit_count(coalesced_info.member_mask);
|
||||
}
|
||||
//! @copydoc thread_group::thread_rank
|
||||
__CG_QUALIFIER__ unsigned int thread_rank() const {
|
||||
return internal::coalesced_group::masked_bit_count(coalesced_info.member_mask);
|
||||
}
|
||||
|
||||
//! @copydoc thread_group::sync
|
||||
__CG_QUALIFIER__ void sync() const {
|
||||
internal::coalesced_group::sync();
|
||||
}
|
||||
//! @copydoc thread_group::sync
|
||||
__CG_QUALIFIER__ void sync() const { internal::coalesced_group::sync(); }
|
||||
|
||||
//! Returns the linear rank of the group within the set of tiles partitioned
|
||||
//! from a parent group (bounded by meta_group_size).
|
||||
__CG_QUALIFIER__ unsigned int meta_group_rank() const {
|
||||
return coalesced_info.tiled_info.meta_group_rank;
|
||||
}
|
||||
//! Returns the linear rank of the group within the set of tiles partitioned
|
||||
//! from a parent group (bounded by meta_group_size).
|
||||
__CG_QUALIFIER__ unsigned int meta_group_rank() const {
|
||||
return coalesced_info.tiled_info.meta_group_rank;
|
||||
}
|
||||
|
||||
//! Returns the number of groups created when the parent group was partitioned.
|
||||
__CG_QUALIFIER__ unsigned int meta_group_size() const {
|
||||
return coalesced_info.tiled_info.meta_group_size;
|
||||
}
|
||||
//! Returns the number of groups created when the parent group was partitioned.
|
||||
__CG_QUALIFIER__ unsigned int meta_group_size() const {
|
||||
return coalesced_info.tiled_info.meta_group_size;
|
||||
}
|
||||
|
||||
/** \brief Shuffle operation on group level.
|
||||
*
|
||||
* \details Exchanging variables between threads without use of shared memory.
|
||||
* Shuffle operation is a direct copy of ``var`` from ``srcRank``
|
||||
* Shuffle operation is a direct copy of ``var`` from ``srcRank``
|
||||
* thread ID of group.
|
||||
*
|
||||
* \tparam T The type can be a 32-bit integer or single-precision
|
||||
@@ -475,14 +481,13 @@ class coalesced_group : public thread_group {
|
||||
* group is copied to other threads.
|
||||
* \param srcRank [in] The source thread ID of the group for copy.
|
||||
*/
|
||||
template <class T>
|
||||
__CG_QUALIFIER__ T shfl(T var, int srcRank) const {
|
||||
|
||||
template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
|
||||
srcRank = srcRank % static_cast<int>(num_threads());
|
||||
|
||||
int lane = (num_threads() == warpSize) ? srcRank
|
||||
: (static_cast<int>(warpSize) == 64) ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
|
||||
: __fns32(coalesced_info.member_mask, 0, (srcRank + 1));
|
||||
: (static_cast<int>(warpSize) == 64)
|
||||
? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
|
||||
: __fns32(coalesced_info.member_mask, 0, (srcRank + 1));
|
||||
|
||||
return __shfl(var, lane, warpSize);
|
||||
}
|
||||
@@ -501,9 +506,7 @@ class coalesced_group : public thread_group {
|
||||
* between caller thread ID and source of copy thread
|
||||
* ID. sourceID = (threadID + lane_delta) % size()
|
||||
*/
|
||||
template <class T>
|
||||
__CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
|
||||
|
||||
template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
|
||||
// Note: The cuda implementation appears to use the remainder of lane_delta
|
||||
// and WARP_SIZE as the shift value rather than lane_delta itself.
|
||||
// This is not described in the documentation and is not done here.
|
||||
@@ -515,8 +518,7 @@ class coalesced_group : public thread_group {
|
||||
int lane;
|
||||
if (static_cast<int>(warpSize) == 64) {
|
||||
lane = __fns64(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
lane = __fns32(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
|
||||
}
|
||||
|
||||
@@ -541,9 +543,7 @@ class coalesced_group : public thread_group {
|
||||
* between caller thread ID and source of copy thread
|
||||
* ID. sourceID = (threadID - lane_delta) % size()
|
||||
*/
|
||||
template <class T>
|
||||
__CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
|
||||
|
||||
template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
|
||||
// Note: The cuda implementation appears to use the remainder of lane_delta
|
||||
// and WARP_SIZE as the shift value rather than lane_delta itself.
|
||||
// This is not described in the documentation and is not done here.
|
||||
@@ -555,8 +555,7 @@ class coalesced_group : public thread_group {
|
||||
int lane;
|
||||
if (static_cast<int>(warpSize) == 64) {
|
||||
lane = __fns64(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
|
||||
}
|
||||
else if (static_cast<int>(warpSize) == 32) {
|
||||
} else if (static_cast<int>(warpSize) == 32) {
|
||||
lane = __fns32(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
|
||||
}
|
||||
|
||||
@@ -575,11 +574,11 @@ class coalesced_group : public thread_group {
|
||||
*
|
||||
* \param pred [in] The predicate to evaluate on group threads.
|
||||
*/
|
||||
__CG_QUALIFIER__ unsigned long long ballot(int pred) const {
|
||||
return internal::helper::adjust_mask(
|
||||
coalesced_info.member_mask,
|
||||
__ballot_sync<unsigned long long>(coalesced_info.member_mask, pred));
|
||||
}
|
||||
__CG_QUALIFIER__ unsigned long long ballot(int pred) const {
|
||||
return internal::helper::adjust_mask(
|
||||
coalesced_info.member_mask,
|
||||
__ballot_sync<unsigned long long>(coalesced_info.member_mask, pred));
|
||||
}
|
||||
|
||||
/** \brief Any function on group level.
|
||||
*
|
||||
@@ -587,9 +586,9 @@ class coalesced_group : public thread_group {
|
||||
*
|
||||
* \param pred [in] The predicate to evaluate on group threads.
|
||||
*/
|
||||
__CG_QUALIFIER__ int any(int pred) const {
|
||||
return __any_sync(static_cast<unsigned long long>(coalesced_info.member_mask), pred);
|
||||
}
|
||||
__CG_QUALIFIER__ int any(int pred) const {
|
||||
return __any_sync(static_cast<unsigned long long>(coalesced_info.member_mask), pred);
|
||||
}
|
||||
|
||||
/** \brief All function on group level.
|
||||
*
|
||||
@@ -597,27 +596,27 @@ class coalesced_group : public thread_group {
|
||||
*
|
||||
* \param pred [in] The predicate to evaluate on group threads.
|
||||
*/
|
||||
__CG_QUALIFIER__ int all(int pred) const {
|
||||
return __all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), pred);
|
||||
}
|
||||
__CG_QUALIFIER__ int all(int pred) const {
|
||||
return __all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), pred);
|
||||
}
|
||||
|
||||
/** \brief Match any function on group level.
|
||||
*
|
||||
* \details Returns a bit mask containing a 1-bit for every participating
|
||||
* thread if that thread has the same value in ``value`` as the
|
||||
* thread if that thread has the same value in ``value`` as the
|
||||
* caller thread.
|
||||
*
|
||||
* \param value [in] The value to examine on the current thread in group.
|
||||
*/
|
||||
template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const {
|
||||
return internal::helper::adjust_mask(
|
||||
coalesced_info.member_mask,
|
||||
__match_any_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value));
|
||||
}
|
||||
template <typename T> __CG_QUALIFIER__ unsigned long long match_any(T value) const {
|
||||
return internal::helper::adjust_mask(
|
||||
coalesced_info.member_mask,
|
||||
__match_any_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value));
|
||||
}
|
||||
|
||||
/** \brief Match all function on group level.
|
||||
*
|
||||
* \details Returns a bit mask containing a 1-bit for every participating
|
||||
* \details Returns a bit mask containing a 1-bit for every participating
|
||||
* thread if they all have the same value in ``value`` as the caller
|
||||
* thread. The predicate ``pred`` is set to true if all
|
||||
* participating threads have the same value in ``value``.
|
||||
@@ -626,16 +625,16 @@ class coalesced_group : public thread_group {
|
||||
* \param pred [out] The predicate is set to true if all participating
|
||||
* threads in the thread group have the same value.
|
||||
*/
|
||||
template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const {
|
||||
return internal::helper::adjust_mask(
|
||||
coalesced_info.member_mask,
|
||||
__match_all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value,
|
||||
&pred));
|
||||
}
|
||||
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
|
||||
template <typename T> __CG_QUALIFIER__ unsigned long long match_all(T value, int& pred) const {
|
||||
return internal::helper::adjust_mask(
|
||||
coalesced_info.member_mask,
|
||||
__match_all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value,
|
||||
&pred));
|
||||
}
|
||||
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
|
||||
};
|
||||
|
||||
/** \ingroup CooperativeGConstruct
|
||||
/** \ingroup CooperativeGConstruct
|
||||
* \brief User-exposed API to create coalesced groups.
|
||||
*
|
||||
* \details A collective operation that groups all active lanes into a new
|
||||
@@ -644,7 +643,7 @@ class coalesced_group : public thread_group {
|
||||
* on Microsoft Windows.
|
||||
*/
|
||||
__CG_QUALIFIER__ coalesced_group coalesced_threads() {
|
||||
return cooperative_groups::coalesced_group(__builtin_amdgcn_read_exec());
|
||||
return cooperative_groups::coalesced_group(__builtin_amdgcn_read_exec());
|
||||
}
|
||||
|
||||
#ifndef DOXYGEN_SHOULD_SKIP_THIS
|
||||
@@ -743,31 +742,33 @@ __CG_QUALIFIER__ void thread_group::sync() const {
|
||||
#endif
|
||||
|
||||
/** \addtogroup CooperativeGAPI User-exposed API of Cooperative groups
|
||||
* \ingroup CooperativeG
|
||||
* @{ */
|
||||
* \ingroup CooperativeG
|
||||
* @{ */
|
||||
|
||||
/** \brief Returns the size of the group.
|
||||
*
|
||||
* \details Total number of threads in the thread group, and this serves the
|
||||
* purpose for all derived cooperative group types because their
|
||||
* \details Total number of threads in the thread group, and this serves the
|
||||
* purpose for all derived cooperative group types because their
|
||||
* `size` is directly saved during the construction.
|
||||
*
|
||||
*
|
||||
* \tparam CGTy The cooperative group class template parameter.
|
||||
* \param g [in] The cooperative group for size returns.
|
||||
*
|
||||
*
|
||||
* \note Implementation of publicly exposed `wrapper` API on top of basic
|
||||
* cooperative group type APIs. This function is implemented on Linux
|
||||
* and is under development on Microsoft Windows.
|
||||
*/
|
||||
template <class CGTy> __CG_QUALIFIER__ __hip_uint32_t group_size(CGTy const& g) { return g.num_threads(); }
|
||||
template <class CGTy> __CG_QUALIFIER__ __hip_uint32_t group_size(CGTy const& g) {
|
||||
return g.num_threads();
|
||||
}
|
||||
|
||||
/** \brief Returns the rank of thread of the group.
|
||||
*
|
||||
* \details Rank of the calling thread within [0, \link num_threads() num_threads() \endlink).
|
||||
*
|
||||
*
|
||||
* \tparam CGTy The cooperative group class template parameter.
|
||||
* \param g [in] The cooperative group for rank returns.
|
||||
*
|
||||
*
|
||||
* \note Implementation of publicly exposed `wrapper` API on top of basic
|
||||
* cooperative group type APIs. This function is implemented on Linux
|
||||
* and is under development on Microsoft Windows.
|
||||
@@ -780,7 +781,7 @@ template <class CGTy> __CG_QUALIFIER__ __hip_uint32_t thread_rank(CGTy const& g)
|
||||
*
|
||||
* \tparam CGTy The cooperative group class template parameter.
|
||||
* \param g [in] The cooperative group for validity check.
|
||||
*
|
||||
*
|
||||
* \note Implementation of publicly exposed `wrapper` API on top of basic
|
||||
* cooperative group type APIs. This function is implemented on Linux
|
||||
* and is under development on Microsoft Windows.
|
||||
@@ -788,10 +789,10 @@ template <class CGTy> __CG_QUALIFIER__ __hip_uint32_t thread_rank(CGTy const& g)
|
||||
template <class CGTy> __CG_QUALIFIER__ bool is_valid(CGTy const& g) { return g.is_valid(); }
|
||||
|
||||
/** \brief Synchronizes the threads in the group.
|
||||
*
|
||||
*
|
||||
* \tparam CGTy The cooperative group class template parameter.
|
||||
* \param g [in] The cooperative group for synchronization.
|
||||
*
|
||||
*
|
||||
* \note Implementation of publicly exposed `wrapper` API on top of basic
|
||||
* cooperative group type APIs. This function is implemented on Linux
|
||||
* and is under development on Microsoft Windows.
|
||||
@@ -842,16 +843,12 @@ template <unsigned int size> class thread_block_tile_base : public tile_base<siz
|
||||
__CG_QUALIFIER__ unsigned long long build_mask() const {
|
||||
unsigned long long mask = ~0ull >> (64 - numThreads);
|
||||
// thread_rank() gives thread id from 0..thread launch size.
|
||||
return mask << (((internal::workgroup::thread_rank() % warpSize) / numThreads) *
|
||||
numThreads);
|
||||
return mask << (((internal::workgroup::thread_rank() % warpSize) / numThreads) * numThreads);
|
||||
}
|
||||
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
|
||||
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
|
||||
|
||||
public:
|
||||
|
||||
__CG_STATIC_QUALIFIER__ void sync() {
|
||||
internal::tiled_group::sync();
|
||||
}
|
||||
__CG_STATIC_QUALIFIER__ void sync() { internal::tiled_group::sync(); }
|
||||
|
||||
template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
|
||||
return (__shfl(var, srcRank, numThreads));
|
||||
@@ -888,14 +885,13 @@ template <unsigned int size> class thread_block_tile_base : public tile_base<siz
|
||||
const auto mask = build_mask();
|
||||
return internal::helper::adjust_mask(mask, __match_all_sync(mask, value, &pred));
|
||||
}
|
||||
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
|
||||
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
|
||||
};
|
||||
|
||||
/** \brief User exposed API that captures the state of the parent group pre-partition
|
||||
*/
|
||||
template <unsigned int tileSize, typename ParentCGTy>
|
||||
class parent_group_info {
|
||||
public:
|
||||
template <unsigned int tileSize, typename ParentCGTy> class parent_group_info {
|
||||
public:
|
||||
//! Returns the linear rank of the group within the set of tiles partitioned
|
||||
//! from a parent group (bounded by meta_group_size)
|
||||
__CG_STATIC_QUALIFIER__ unsigned int meta_group_rank() {
|
||||
@@ -920,31 +916,32 @@ class thread_block_tile_type : public thread_block_tile_base<tileSize>,
|
||||
public parent_group_info<tileSize, ParentCGTy> {
|
||||
_CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
|
||||
typedef thread_block_tile_base<numThreads> tbtBase;
|
||||
protected:
|
||||
__CG_QUALIFIER__ thread_block_tile_type() : tiled_group(numThreads) {
|
||||
coalesced_info.tiled_info.num_threads = numThreads;
|
||||
coalesced_info.tiled_info.is_tiled = true;
|
||||
}
|
||||
public:
|
||||
using tbtBase::num_threads;
|
||||
using tbtBase::size;
|
||||
using tbtBase::sync;
|
||||
using tbtBase::thread_rank;
|
||||
|
||||
protected:
|
||||
__CG_QUALIFIER__ thread_block_tile_type() : tiled_group(numThreads) {
|
||||
coalesced_info.tiled_info.num_threads = numThreads;
|
||||
coalesced_info.tiled_info.is_tiled = true;
|
||||
}
|
||||
|
||||
public:
|
||||
using tbtBase::num_threads;
|
||||
using tbtBase::size;
|
||||
using tbtBase::sync;
|
||||
using tbtBase::thread_rank;
|
||||
};
|
||||
|
||||
// Partial template specialization
|
||||
template <unsigned int tileSize>
|
||||
class thread_block_tile_type<tileSize, void> : public thread_block_tile_base<tileSize>,
|
||||
public tiled_group
|
||||
{
|
||||
public tiled_group {
|
||||
_CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
|
||||
|
||||
typedef thread_block_tile_base<numThreads> tbtBase;
|
||||
|
||||
protected:
|
||||
|
||||
__CG_QUALIFIER__ thread_block_tile_type(unsigned int meta_group_rank, unsigned int meta_group_size)
|
||||
: tiled_group(numThreads) {
|
||||
__CG_QUALIFIER__ thread_block_tile_type(unsigned int meta_group_rank,
|
||||
unsigned int meta_group_size)
|
||||
: tiled_group(numThreads) {
|
||||
coalesced_info.tiled_info.num_threads = numThreads;
|
||||
coalesced_info.tiled_info.is_tiled = true;
|
||||
coalesced_info.tiled_info.meta_group_rank = meta_group_rank;
|
||||
@@ -967,10 +964,10 @@ class thread_block_tile_type<tileSize, void> : public thread_block_tile_base<til
|
||||
__CG_QUALIFIER__ unsigned int meta_group_size() const {
|
||||
return coalesced_info.tiled_info.meta_group_size;
|
||||
}
|
||||
// Doxygen end group CooperativeG
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
// Doxygen end group CooperativeG
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
};
|
||||
|
||||
__CG_QUALIFIER__ thread_group this_thread() {
|
||||
@@ -978,7 +975,7 @@ __CG_QUALIFIER__ thread_group this_thread() {
|
||||
return g;
|
||||
}
|
||||
|
||||
/** \ingroup CooperativeGConstruct
|
||||
/** \ingroup CooperativeGConstruct
|
||||
* \brief User-exposed API to partition groups.
|
||||
*
|
||||
* \details A collective operation that partitions the parent group into a
|
||||
@@ -989,12 +986,10 @@ __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsign
|
||||
if (parent.cg_type() == internal::cg_tiled_group) {
|
||||
const tiled_group* cg = static_cast<const tiled_group*>(&parent);
|
||||
return cg->new_tiled_group(tile_size);
|
||||
}
|
||||
else if(parent.cg_type() == internal::cg_coalesced_group) {
|
||||
} else if (parent.cg_type() == internal::cg_coalesced_group) {
|
||||
const coalesced_group* cg = static_cast<const coalesced_group*>(&parent);
|
||||
return cg->new_tiled_group(tile_size);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
const thread_block* tb = static_cast<const thread_block*>(&parent);
|
||||
return tb->new_tiled_group(tile_size);
|
||||
}
|
||||
@@ -1010,8 +1005,9 @@ __CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent, unsigned
|
||||
}
|
||||
|
||||
// If a coalesced group is passed to be partitioned, it should remain coalesced
|
||||
__CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size) {
|
||||
return (parent.new_tiled_group(tile_size));
|
||||
__CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent,
|
||||
unsigned int tile_size) {
|
||||
return (parent.new_tiled_group(tile_size));
|
||||
}
|
||||
|
||||
namespace impl {
|
||||
@@ -1034,7 +1030,7 @@ class thread_block_tile_internal : public thread_block_tile_type<size, ParentCGT
|
||||
*
|
||||
* \details Represents one tiled thread group in a wavefront.
|
||||
* This group type also supports sub-wave level intrinsics.
|
||||
*
|
||||
*
|
||||
* \note This type is implemented on Linux, under development
|
||||
* on Microsoft Windows.
|
||||
*/
|
||||
@@ -1067,7 +1063,7 @@ class thread_block_tile : public impl::thread_block_tile_internal<size, ParentCG
|
||||
/** \brief Shuffle operation on group level.
|
||||
*
|
||||
* \details Exchanging variables between threads without use of shared memory.
|
||||
* Shuffle operation is a direct copy of ``var`` from ``srcRank``
|
||||
* Shuffle operation is a direct copy of ``var`` from ``srcRank``
|
||||
* thread ID of group.
|
||||
*
|
||||
* \tparam T The type can be a 32-bit integer or single-precision
|
||||
@@ -1113,13 +1109,13 @@ class thread_block_tile : public impl::thread_block_tile_internal<size, ParentCG
|
||||
/** \brief Shuffle xor operation on group level.
|
||||
*
|
||||
* \details Exchanging variables between threads without use of shared memory.
|
||||
* Shuffle xor operation is copy of var from thread with thread ID
|
||||
* Shuffle xor operation is copy of var from thread with thread ID
|
||||
* of group based on laneMask XOR of the caller thread ID.
|
||||
*
|
||||
* \tparam T The type can be a 32-bit integer or single-precision
|
||||
* floating point.
|
||||
* \param var [in] The source variable to copy.
|
||||
* \param laneMask [in] The laneMask is the mask for XOR operation.
|
||||
* \param laneMask [in] The laneMask is the mask for XOR operation.
|
||||
* sourceID = threadID ^ laneMask
|
||||
*/
|
||||
template <class T> __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const;
|
||||
@@ -1152,7 +1148,7 @@ class thread_block_tile : public impl::thread_block_tile_internal<size, ParentCG
|
||||
/** \brief Match any function on group level.
|
||||
*
|
||||
* \details Returns a bit mask containing a 1-bit for every participating
|
||||
* thread if that thread has the same value in ``value`` as the
|
||||
* thread if that thread has the same value in ``value`` as the
|
||||
* caller thread.
|
||||
*
|
||||
* \param value [in] The value to examine on the current thread in group.
|
||||
@@ -1161,7 +1157,7 @@ class thread_block_tile : public impl::thread_block_tile_internal<size, ParentCG
|
||||
|
||||
/** \brief Match all function on group level.
|
||||
*
|
||||
* \details Returns a bit mask containing a 1-bit for every participating
|
||||
* \details Returns a bit mask containing a 1-bit for every participating
|
||||
* thread if they all have the same value in ``value`` as the caller
|
||||
* thread. The predicate ``pred`` is set to true if all
|
||||
* participating threads have the same value in ``value``.
|
||||
@@ -1199,16 +1195,16 @@ struct tiled_partition_internal<size, thread_block> : public thread_block_tile<s
|
||||
|
||||
} // namespace impl
|
||||
|
||||
/** \ingroup CooperativeGConstruct
|
||||
/** \ingroup CooperativeGConstruct
|
||||
* \brief Create a partition.
|
||||
*
|
||||
* \details This constructs a templated class derived from thread_group. The
|
||||
* template defines the tile size of the new thread group at compile
|
||||
* time.
|
||||
*
|
||||
*
|
||||
* \tparam size The new size of the partition.
|
||||
* \tparam ParentCGTy The cooperative group class template parameter of the input group.
|
||||
*
|
||||
*
|
||||
* \param g [in] The coalesced group for split.
|
||||
*/
|
||||
template <unsigned int size, class ParentCGTy>
|
||||
@@ -1242,10 +1238,10 @@ __CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, b
|
||||
* \brief Binary partition.
|
||||
*
|
||||
* \details This splits the input thread group into two partitions determined by predicate.
|
||||
*
|
||||
*
|
||||
* \tparam size The size of the input thread block tile group.
|
||||
* \tparam parent The cooperative group class template parameter of the input group.
|
||||
*
|
||||
*
|
||||
* \param tgrp [in] The thread block tile group for split.
|
||||
* \param pred [in] The predicate used during the group split up.
|
||||
*/
|
||||
|
||||
Το diff αρχείου καταστέλλεται επειδή είναι πολύ μεγάλο
Φόρτωση Διαφορών
@@ -51,16 +51,16 @@
|
||||
#endif
|
||||
|
||||
#if defined(__HIPCC_RTC__)
|
||||
#if HIP_FP8_TYPE_FNUZ
|
||||
#define ENABLE_FNUZ_HIPRTC 1
|
||||
#else
|
||||
#define ENABLE_FNUZ_HIPRTC 0
|
||||
#endif
|
||||
#if HIP_FP8_TYPE_OCP
|
||||
#define ENABLE_OCP_HIPRTC 1
|
||||
#else
|
||||
#define ENABLE_OCP_HIPRTC 0
|
||||
#endif
|
||||
#if HIP_FP8_TYPE_FNUZ
|
||||
#define ENABLE_FNUZ_HIPRTC 1
|
||||
#else
|
||||
#define ENABLE_FNUZ_HIPRTC 0
|
||||
#endif
|
||||
#if HIP_FP8_TYPE_OCP
|
||||
#define ENABLE_OCP_HIPRTC 1
|
||||
#else
|
||||
#define ENABLE_OCP_HIPRTC 0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Include it explicitly for HIPRTC
|
||||
@@ -411,7 +411,7 @@ __FP8_HOST_DEVICE_STATIC__ T cast_from_f8(__hip_fp8_storage_t x, int wm, int we,
|
||||
constexpr bool is_half = std::is_same<T, _Float16>::value;
|
||||
constexpr bool is_float = std::is_same<T, float>::value;
|
||||
constexpr bool is_double = std::is_same<T, double>::value;
|
||||
#endif // defined(__clang__) and defined(__HIP__)
|
||||
#endif // defined(__clang__) and defined(__HIP__)
|
||||
static_assert(is_half || is_float || is_double, "only half, float and double are supported");
|
||||
|
||||
constexpr int weo = is_half ? 5 : (is_float ? 8 : 11);
|
||||
@@ -482,7 +482,7 @@ __FP8_HOST_DEVICE_STATIC__ T cast_from_f8(__hip_fp8_storage_t x, int wm, int we,
|
||||
return fNaN;
|
||||
}
|
||||
} else if ((x & 0x7C) == 0x7C) { // e5m2 NaN/Inf
|
||||
if ((x & 0x3) == 0) { // Inf
|
||||
if ((x & 0x3) == 0) { // Inf
|
||||
if (clip) {
|
||||
return sign ? fmin : fmax;
|
||||
}
|
||||
@@ -1305,16 +1305,17 @@ struct __hip_fp8_e4m3_fnuz {
|
||||
#endif
|
||||
if (internal::hip_fp8_fnuz_is_nan(__x)) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
float fval = *this;
|
||||
auto llval = static_cast<long long>(fval);
|
||||
if (llval <= 0) {
|
||||
return 0;
|
||||
}
|
||||
return static_cast<unsigned short>(fval);
|
||||
}
|
||||
};
|
||||
float fval = *this;
|
||||
auto llval = static_cast<long long>(fval);
|
||||
if (llval <= 0) {
|
||||
return 0;
|
||||
}
|
||||
return static_cast<unsigned short>(fval);
|
||||
}
|
||||
}
|
||||
;
|
||||
|
||||
/**
|
||||
* \brief struct representing two fp8 numbers with e4m3 interpretation
|
||||
@@ -1393,8 +1394,9 @@ struct __hip_fp8x2_e4m3_fnuz {
|
||||
internal::cast_from_f8<float, true>(static_cast<__hip_fp8_storage_t>(__x >> 8),
|
||||
__wm, __we));
|
||||
#endif
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
;
|
||||
|
||||
/**
|
||||
* \brief struct representing four fp8 numbers with e4m3 interpretation
|
||||
@@ -1488,12 +1490,12 @@ struct __hip_fp8x4_e4m3_fnuz {
|
||||
#else
|
||||
__FP8_HOST__ operator float4() const {
|
||||
#endif
|
||||
auto x = __x; // bypass const
|
||||
auto fp8x2_low = *reinterpret_cast<__hip_fp8x2_storage_t*>(&x); // Little E
|
||||
auto fp8x2_high = *(reinterpret_cast<__hip_fp8x2_storage_t*>(&x) + 1);
|
||||
auto x = __x; // bypass const
|
||||
auto fp8x2_low = *reinterpret_cast<__hip_fp8x2_storage_t*>(&x); // Little E
|
||||
auto fp8x2_high = *(reinterpret_cast<__hip_fp8x2_storage_t*>(&x) + 1);
|
||||
#if HIP_FP8_CVT_FAST_PATH
|
||||
float2 high = internal::cast_to_f32x2_from_f8x2(fp8x2_high, __default_interpret);
|
||||
float2 low = internal::cast_to_f32x2_from_f8x2(fp8x2_low, __default_interpret);
|
||||
float2 high = internal::cast_to_f32x2_from_f8x2(fp8x2_high, __default_interpret);
|
||||
float2 low = internal::cast_to_f32x2_from_f8x2(fp8x2_low, __default_interpret);
|
||||
#else
|
||||
float2 high = float2(internal::cast_from_f8<float, true>(
|
||||
static_cast<__hip_fp8_storage_t>((fp8x2_high << 8) >> 8), __wm, __we),
|
||||
@@ -1504,9 +1506,10 @@ struct __hip_fp8x4_e4m3_fnuz {
|
||||
internal::cast_from_f8<float, true>(
|
||||
static_cast<__hip_fp8_storage_t>(fp8x2_low >> 8), __wm, __we));
|
||||
#endif
|
||||
return float4(low.x, low.y, high.x, high.y);
|
||||
}
|
||||
};
|
||||
return float4(low.x, low.y, high.x, high.y);
|
||||
}
|
||||
}
|
||||
;
|
||||
|
||||
/**
|
||||
* \brief struct representing one fp8 number with e5m2 interpretation
|
||||
@@ -1861,18 +1864,19 @@ struct __hip_fp8_e5m2_fnuz {
|
||||
#else
|
||||
__FP8_HOST__ operator unsigned short int() const {
|
||||
#endif
|
||||
if (internal::hip_fp8_fnuz_is_nan(__x)) {
|
||||
return 0;
|
||||
}
|
||||
if (internal::hip_fp8_fnuz_is_nan(__x)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
float fval = *this;
|
||||
auto llval = static_cast<long long>(fval);
|
||||
if (llval <= 0) {
|
||||
return 0;
|
||||
}
|
||||
return static_cast<unsigned short>(fval);
|
||||
}
|
||||
};
|
||||
float fval = *this;
|
||||
auto llval = static_cast<long long>(fval);
|
||||
if (llval <= 0) {
|
||||
return 0;
|
||||
}
|
||||
return static_cast<unsigned short>(fval);
|
||||
}
|
||||
}
|
||||
;
|
||||
|
||||
/**
|
||||
* \brief struct representing two fp8 numbers with e5m2 interpretation
|
||||
@@ -1944,15 +1948,16 @@ struct __hip_fp8x2_e5m2_fnuz {
|
||||
__FP8_HOST__ operator float2() const {
|
||||
#endif
|
||||
#if HIP_FP8_CVT_FAST_PATH
|
||||
return internal::cast_to_f32x2_from_f8x2(__x, __default_interpret);
|
||||
return internal::cast_to_f32x2_from_f8x2(__x, __default_interpret);
|
||||
#else
|
||||
return float2(internal::cast_from_f8<float, true>(static_cast<__hip_fp8_storage_t>(__x & 0xFF),
|
||||
__wm, __we),
|
||||
internal::cast_from_f8<float, true>(static_cast<__hip_fp8_storage_t>(__x >> 8),
|
||||
__wm, __we));
|
||||
#endif
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
;
|
||||
|
||||
/**
|
||||
* \brief struct representing four fp8 numbers with e5m2 interpretation
|
||||
@@ -2046,12 +2051,12 @@ struct __hip_fp8x4_e5m2_fnuz {
|
||||
#else
|
||||
__FP8_HOST__ operator float4() const {
|
||||
#endif
|
||||
auto x = __x; // bypass const
|
||||
auto fp8x2_low = *reinterpret_cast<__hip_fp8x2_storage_t*>(&x); // Little E
|
||||
auto fp8x2_high = *(reinterpret_cast<__hip_fp8x2_storage_t*>(&x) + 1);
|
||||
auto x = __x; // bypass const
|
||||
auto fp8x2_low = *reinterpret_cast<__hip_fp8x2_storage_t*>(&x); // Little E
|
||||
auto fp8x2_high = *(reinterpret_cast<__hip_fp8x2_storage_t*>(&x) + 1);
|
||||
#if HIP_FP8_CVT_FAST_PATH
|
||||
float2 high = internal::cast_to_f32x2_from_f8x2(fp8x2_high, __default_interpret);
|
||||
float2 low = internal::cast_to_f32x2_from_f8x2(fp8x2_low, __default_interpret);
|
||||
float2 high = internal::cast_to_f32x2_from_f8x2(fp8x2_high, __default_interpret);
|
||||
float2 low = internal::cast_to_f32x2_from_f8x2(fp8x2_low, __default_interpret);
|
||||
#else
|
||||
float2 high = float2(internal::cast_from_f8<float, true>(
|
||||
static_cast<__hip_fp8_storage_t>((fp8x2_high << 8) >> 8), __wm, __we),
|
||||
@@ -2062,11 +2067,12 @@ struct __hip_fp8x4_e5m2_fnuz {
|
||||
internal::cast_from_f8<float, true>(
|
||||
static_cast<__hip_fp8_storage_t>(fp8x2_low >> 8), __wm, __we));
|
||||
#endif
|
||||
return float4(low.x, low.y, high.x, high.y);
|
||||
}
|
||||
};
|
||||
return float4(low.x, low.y, high.x, high.y);
|
||||
}
|
||||
}
|
||||
;
|
||||
|
||||
#endif // ENABLE_FNUZ_HIPRTC
|
||||
#endif // ENABLE_FNUZ_HIPRTC
|
||||
|
||||
/**
|
||||
* \brief struct representing ocp fp8 numbers with e4m3 interpretation
|
||||
@@ -2419,18 +2425,19 @@ struct __hip_fp8_e4m3 {
|
||||
#else
|
||||
__FP8_HOST__ operator unsigned short int() const {
|
||||
#endif
|
||||
if (internal::hip_fp8_ocp_is_nan(__x, __default_interpret)) {
|
||||
return 0;
|
||||
}
|
||||
if (internal::hip_fp8_ocp_is_nan(__x, __default_interpret)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
float fval = *this;
|
||||
auto llval = static_cast<long long>(fval);
|
||||
if (llval <= 0) {
|
||||
return 0;
|
||||
}
|
||||
return static_cast<unsigned short>(fval);
|
||||
}
|
||||
};
|
||||
float fval = *this;
|
||||
auto llval = static_cast<long long>(fval);
|
||||
if (llval <= 0) {
|
||||
return 0;
|
||||
}
|
||||
return static_cast<unsigned short>(fval);
|
||||
}
|
||||
}
|
||||
;
|
||||
|
||||
/**
|
||||
* \brief struct representing two ocp fp8 numbers with e4m3 interpretation
|
||||
@@ -2503,15 +2510,16 @@ struct __hip_fp8x2_e4m3 {
|
||||
__FP8_HOST__ operator float2() const {
|
||||
#endif
|
||||
#if HIP_FP8_CVT_FAST_PATH
|
||||
return internal::cast_to_f32x2_from_f8x2(__x, __default_interpret);
|
||||
return internal::cast_to_f32x2_from_f8x2(__x, __default_interpret);
|
||||
#else
|
||||
return float2(internal::cast_from_f8<float, false>(static_cast<__hip_fp8_storage_t>(__x & 0xFF),
|
||||
return float2(internal::cast_from_f8<float, false>(static_cast<__hip_fp8_storage_t>(__x & 0xFF),
|
||||
__wm, __we),
|
||||
internal::cast_from_f8<float, false>(static_cast<__hip_fp8_storage_t>(__x >> 8),
|
||||
__wm, __we));
|
||||
#endif
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
;
|
||||
|
||||
/**
|
||||
* \brief struct representing four ocp fp8 numbers with e4m3 interpretation
|
||||
@@ -2531,7 +2539,7 @@ struct __hip_fp8x4_e4m3 {
|
||||
#else
|
||||
__FP8_HOST__ __hip_fp8x4_e4m3(const double4 val)
|
||||
#endif
|
||||
: __x{reinterpret_cast<__hip_fp8x4_storage_t>(
|
||||
: __x{reinterpret_cast<__hip_fp8x4_storage_t>(
|
||||
static_cast<unsigned int>(reinterpret_cast<unsigned char>(__hip_cvt_double_to_fp8(
|
||||
val.x, __default_saturation, __default_interpret)) |
|
||||
reinterpret_cast<unsigned char>(__hip_cvt_double_to_fp8(
|
||||
@@ -2606,12 +2614,12 @@ struct __hip_fp8x4_e4m3 {
|
||||
#else
|
||||
__FP8_HOST__ operator float4() const {
|
||||
#endif
|
||||
auto x = __x; // bypass const
|
||||
auto fp8x2_low = *reinterpret_cast<__hip_fp8x2_storage_t*>(&x); // Little E
|
||||
auto fp8x2_high = *(reinterpret_cast<__hip_fp8x2_storage_t*>(&x) + 1);
|
||||
auto x = __x; // bypass const
|
||||
auto fp8x2_low = *reinterpret_cast<__hip_fp8x2_storage_t*>(&x); // Little E
|
||||
auto fp8x2_high = *(reinterpret_cast<__hip_fp8x2_storage_t*>(&x) + 1);
|
||||
#if HIP_FP8_CVT_FAST_PATH
|
||||
float2 high = internal::cast_to_f32x2_from_f8x2(fp8x2_high, __default_interpret);
|
||||
float2 low = internal::cast_to_f32x2_from_f8x2(fp8x2_low, __default_interpret);
|
||||
float2 high = internal::cast_to_f32x2_from_f8x2(fp8x2_high, __default_interpret);
|
||||
float2 low = internal::cast_to_f32x2_from_f8x2(fp8x2_low, __default_interpret);
|
||||
#else
|
||||
float2 high = float2(internal::cast_from_f8<float, false>(
|
||||
static_cast<__hip_fp8_storage_t>((fp8x2_high << 8) >> 8), __wm, __we),
|
||||
@@ -2622,9 +2630,10 @@ struct __hip_fp8x4_e4m3 {
|
||||
internal::cast_from_f8<float, false>(
|
||||
static_cast<__hip_fp8_storage_t>(fp8x2_low >> 8), __wm, __we));
|
||||
#endif
|
||||
return float4(low.x, low.y, high.x, high.y);
|
||||
}
|
||||
};
|
||||
return float4(low.x, low.y, high.x, high.y);
|
||||
}
|
||||
}
|
||||
;
|
||||
|
||||
/**
|
||||
* \brief struct representing ocp fp8 numbers with e5m2 interpretation
|
||||
@@ -2981,18 +2990,19 @@ struct __hip_fp8_e5m2 {
|
||||
#else
|
||||
__FP8_HOST__ operator unsigned short int() const {
|
||||
#endif
|
||||
if (internal::hip_fp8_ocp_is_nan(__x, __default_interpret)) {
|
||||
return 0;
|
||||
if (internal::hip_fp8_ocp_is_nan(__x, __default_interpret)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
float fval = *this;
|
||||
auto llval = static_cast<long long>(fval);
|
||||
if (llval <= 0) {
|
||||
return 0;
|
||||
}
|
||||
return static_cast<unsigned short>(fval);
|
||||
}
|
||||
};
|
||||
float fval = *this;
|
||||
auto llval = static_cast<long long>(fval);
|
||||
if (llval <= 0) {
|
||||
return 0;
|
||||
}
|
||||
return static_cast<unsigned short>(fval);
|
||||
}
|
||||
}
|
||||
;
|
||||
|
||||
/**
|
||||
* \brief struct representing two ocp fp8 numbers with e5m2 interpretation
|
||||
@@ -3065,16 +3075,17 @@ struct __hip_fp8x2_e5m2 {
|
||||
__FP8_HOST__ operator float2() const {
|
||||
#endif
|
||||
#if HIP_FP8_CVT_FAST_PATH
|
||||
return internal::cast_to_f32x2_from_f8x2(__x, __default_interpret);
|
||||
return internal::cast_to_f32x2_from_f8x2(__x, __default_interpret);
|
||||
#else
|
||||
return float2(
|
||||
internal::cast_from_f8<float, false>(static_cast<__hip_fp8_storage_t>(__x & 0xFF), __wm,
|
||||
internal::cast_from_f8<float, false>(static_cast<__hip_fp8_storage_t>(__x & 0xFF), __wm,
|
||||
__we, __default_saturation == __HIP_SATFINITE),
|
||||
internal::cast_from_f8<float, false>(static_cast<__hip_fp8_storage_t>(__x >> 8), __wm, __we,
|
||||
internal::cast_from_f8<float, false>(static_cast<__hip_fp8_storage_t>(__x >> 8), __wm, __we,
|
||||
__default_saturation == __HIP_SATFINITE));
|
||||
#endif
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
;
|
||||
|
||||
/**
|
||||
* \brief struct representing four ocp fp8 numbers with e5m2 interpretation
|
||||
@@ -3168,12 +3179,12 @@ struct __hip_fp8x4_e5m2 {
|
||||
#else
|
||||
__FP8_HOST__ operator float4() const {
|
||||
#endif
|
||||
auto x = __x; // bypass const
|
||||
auto fp8x2_low = *reinterpret_cast<__hip_fp8x2_storage_t*>(&x); // Little E
|
||||
auto fp8x2_high = *(reinterpret_cast<__hip_fp8x2_storage_t*>(&x) + 1);
|
||||
auto x = __x; // bypass const
|
||||
auto fp8x2_low = *reinterpret_cast<__hip_fp8x2_storage_t*>(&x); // Little E
|
||||
auto fp8x2_high = *(reinterpret_cast<__hip_fp8x2_storage_t*>(&x) + 1);
|
||||
#if HIP_FP8_CVT_FAST_PATH
|
||||
float2 high = internal::cast_to_f32x2_from_f8x2(fp8x2_high, __default_interpret);
|
||||
float2 low = internal::cast_to_f32x2_from_f8x2(fp8x2_low, __default_interpret);
|
||||
float2 high = internal::cast_to_f32x2_from_f8x2(fp8x2_high, __default_interpret);
|
||||
float2 low = internal::cast_to_f32x2_from_f8x2(fp8x2_low, __default_interpret);
|
||||
#else
|
||||
float2 high = float2(
|
||||
internal::cast_from_f8<float, false>(
|
||||
@@ -3188,8 +3199,9 @@ struct __hip_fp8x4_e5m2 {
|
||||
internal::cast_from_f8<float, false>(static_cast<__hip_fp8_storage_t>(fp8x2_low >> 8), __wm,
|
||||
__we, __default_saturation == __HIP_SATFINITE));
|
||||
#endif
|
||||
return float4(low.x, low.y, high.x, high.y);
|
||||
}
|
||||
};
|
||||
#endif // ENABLE_OCP_HIPRTC
|
||||
#endif // _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP8_H_
|
||||
return float4(low.x, low.y, high.x, high.y);
|
||||
}
|
||||
}
|
||||
;
|
||||
#endif // ENABLE_OCP_HIPRTC
|
||||
#endif // _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP8_H_
|
||||
|
||||
@@ -38,10 +38,10 @@ extern "C" {
|
||||
* HIP Devices used by current OpenGL Context.
|
||||
*/
|
||||
typedef enum hipGLDeviceList {
|
||||
hipGLDeviceListAll = 1, ///< All hip devices used by current OpenGL context.
|
||||
hipGLDeviceListCurrentFrame = 2, ///< Hip devices used by current OpenGL context in current
|
||||
hipGLDeviceListAll = 1, ///< All hip devices used by current OpenGL context.
|
||||
hipGLDeviceListCurrentFrame = 2, ///< Hip devices used by current OpenGL context in current
|
||||
///< frame
|
||||
hipGLDeviceListNextFrame = 3 ///< Hip devices used by current OpenGL context in next
|
||||
hipGLDeviceListNextFrame = 3 ///< Hip devices used by current OpenGL context in next
|
||||
///< frame.
|
||||
} hipGLDeviceList;
|
||||
|
||||
@@ -51,8 +51,8 @@ typedef unsigned int GLuint;
|
||||
/** GLenum as uint.*/
|
||||
typedef unsigned int GLenum;
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
* @}
|
||||
*/
|
||||
|
||||
/**
|
||||
* @defgroup GL OpenGL Interoperability
|
||||
@@ -82,7 +82,7 @@ hipError_t hipGLGetDevices(unsigned int* pHipDeviceCount, int* pHipDevices,
|
||||
* @param [out] resource - Returns pointer of graphics resource.
|
||||
* @param [in] buffer - Buffer to be registered.
|
||||
* @param [in] flags - Register flags.
|
||||
*
|
||||
*
|
||||
* @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorUnknown, #hipErrorInvalidResourceHandle
|
||||
*
|
||||
*/
|
||||
@@ -99,11 +99,11 @@ hipError_t hipGraphicsGLRegisterBuffer(hipGraphicsResource** resource, GLuint bu
|
||||
* @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorUnknown, #hipErrorInvalidResourceHandle
|
||||
*
|
||||
*/
|
||||
hipError_t hipGraphicsGLRegisterImage(hipGraphicsResource** resource, GLuint image,
|
||||
GLenum target, unsigned int flags);
|
||||
hipError_t hipGraphicsGLRegisterImage(hipGraphicsResource** resource, GLuint image, GLenum target,
|
||||
unsigned int flags);
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
* @}
|
||||
*/
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif /* __cplusplus */
|
||||
|
||||
@@ -23,102 +23,102 @@ THE SOFTWARE.
|
||||
#define AMD_HIP_MATH_CONSTANTS_H
|
||||
|
||||
// single precision constants
|
||||
#define HIP_INF_F __int_as_float(0x7f800000U)
|
||||
#define HIP_NAN_F __int_as_float(0x7fffffffU)
|
||||
#define HIP_MIN_DENORM_F __int_as_float(0x00000001U)
|
||||
#define HIP_MAX_NORMAL_F __int_as_float(0x7f7fffffU)
|
||||
#define HIP_NEG_ZERO_F __int_as_float(0x80000000U)
|
||||
#define HIP_ZERO_F 0.0F
|
||||
#define HIP_ONE_F 1.0F
|
||||
#define HIP_SQRT_HALF_F 0.707106781F
|
||||
#define HIP_SQRT_HALF_HI_F 0.707106781F
|
||||
#define HIP_SQRT_HALF_LO_F 1.210161749e-08F
|
||||
#define HIP_SQRT_TWO_F 1.414213562F
|
||||
#define HIP_THIRD_F 0.333333333F
|
||||
#define HIP_PIO4_F 0.785398163F
|
||||
#define HIP_PIO2_F 1.570796327F
|
||||
#define HIP_3PIO4_F 2.356194490F
|
||||
#define HIP_2_OVER_PI_F 0.636619772F
|
||||
#define HIP_INF_F __int_as_float(0x7f800000U)
|
||||
#define HIP_NAN_F __int_as_float(0x7fffffffU)
|
||||
#define HIP_MIN_DENORM_F __int_as_float(0x00000001U)
|
||||
#define HIP_MAX_NORMAL_F __int_as_float(0x7f7fffffU)
|
||||
#define HIP_NEG_ZERO_F __int_as_float(0x80000000U)
|
||||
#define HIP_ZERO_F 0.0F
|
||||
#define HIP_ONE_F 1.0F
|
||||
#define HIP_SQRT_HALF_F 0.707106781F
|
||||
#define HIP_SQRT_HALF_HI_F 0.707106781F
|
||||
#define HIP_SQRT_HALF_LO_F 1.210161749e-08F
|
||||
#define HIP_SQRT_TWO_F 1.414213562F
|
||||
#define HIP_THIRD_F 0.333333333F
|
||||
#define HIP_PIO4_F 0.785398163F
|
||||
#define HIP_PIO2_F 1.570796327F
|
||||
#define HIP_3PIO4_F 2.356194490F
|
||||
#define HIP_2_OVER_PI_F 0.636619772F
|
||||
#define HIP_SQRT_2_OVER_PI_F 0.797884561F
|
||||
#define HIP_PI_F 3.141592654F
|
||||
#define HIP_L2E_F 1.442695041F
|
||||
#define HIP_L2T_F 3.321928094F
|
||||
#define HIP_LG2_F 0.301029996F
|
||||
#define HIP_LGE_F 0.434294482F
|
||||
#define HIP_LN2_F 0.693147181F
|
||||
#define HIP_LNT_F 2.302585093F
|
||||
#define HIP_LNPI_F 1.144729886F
|
||||
#define HIP_TWO_TO_M126_F 1.175494351e-38F
|
||||
#define HIP_TWO_TO_126_F 8.507059173e37F
|
||||
#define HIP_NORM_HUGE_F 3.402823466e38F
|
||||
#define HIP_TWO_TO_23_F 8388608.0F
|
||||
#define HIP_TWO_TO_24_F 16777216.0F
|
||||
#define HIP_TWO_TO_31_F 2147483648.0F
|
||||
#define HIP_TWO_TO_32_F 4294967296.0F
|
||||
#define HIP_REMQUO_BITS_F 3U
|
||||
#define HIP_REMQUO_MASK_F (~((~0U)<<HIP_REMQUO_BITS_F))
|
||||
#define HIP_TRIG_PLOSS_F 105615.0F
|
||||
#define HIP_PI_F 3.141592654F
|
||||
#define HIP_L2E_F 1.442695041F
|
||||
#define HIP_L2T_F 3.321928094F
|
||||
#define HIP_LG2_F 0.301029996F
|
||||
#define HIP_LGE_F 0.434294482F
|
||||
#define HIP_LN2_F 0.693147181F
|
||||
#define HIP_LNT_F 2.302585093F
|
||||
#define HIP_LNPI_F 1.144729886F
|
||||
#define HIP_TWO_TO_M126_F 1.175494351e-38F
|
||||
#define HIP_TWO_TO_126_F 8.507059173e37F
|
||||
#define HIP_NORM_HUGE_F 3.402823466e38F
|
||||
#define HIP_TWO_TO_23_F 8388608.0F
|
||||
#define HIP_TWO_TO_24_F 16777216.0F
|
||||
#define HIP_TWO_TO_31_F 2147483648.0F
|
||||
#define HIP_TWO_TO_32_F 4294967296.0F
|
||||
#define HIP_REMQUO_BITS_F 3U
|
||||
#define HIP_REMQUO_MASK_F (~((~0U) << HIP_REMQUO_BITS_F))
|
||||
#define HIP_TRIG_PLOSS_F 105615.0F
|
||||
|
||||
// double precision constants
|
||||
#define HIP_INF __longlong_as_double(0x7ff0000000000000ULL)
|
||||
#define HIP_NAN __longlong_as_double(0xfff8000000000000ULL)
|
||||
#define HIP_NEG_ZERO __longlong_as_double(0x8000000000000000ULL)
|
||||
#define HIP_MIN_DENORM __longlong_as_double(0x0000000000000001ULL)
|
||||
#define HIP_ZERO 0.0
|
||||
#define HIP_ONE 1.0
|
||||
#define HIP_SQRT_TWO 1.4142135623730951e+0
|
||||
#define HIP_SQRT_HALF 7.0710678118654757e-1
|
||||
#define HIP_SQRT_HALF_HI 7.0710678118654757e-1
|
||||
#define HIP_SQRT_HALF_LO (-4.8336466567264567e-17)
|
||||
#define HIP_THIRD 3.3333333333333333e-1
|
||||
#define HIP_TWOTHIRD 6.6666666666666667e-1
|
||||
#define HIP_PIO4 7.8539816339744828e-1
|
||||
#define HIP_PIO4_HI 7.8539816339744828e-1
|
||||
#define HIP_PIO4_LO 3.0616169978683830e-17
|
||||
#define HIP_PIO2 1.5707963267948966e+0
|
||||
#define HIP_PIO2_HI 1.5707963267948966e+0
|
||||
#define HIP_PIO2_LO 6.1232339957367660e-17
|
||||
#define HIP_3PIO4 2.3561944901923448e+0
|
||||
#define HIP_2_OVER_PI 6.3661977236758138e-1
|
||||
#define HIP_PI 3.1415926535897931e+0
|
||||
#define HIP_PI_HI 3.1415926535897931e+0
|
||||
#define HIP_PI_LO 1.2246467991473532e-16
|
||||
#define HIP_SQRT_2PI 2.5066282746310007e+0
|
||||
#define HIP_SQRT_2PI_HI 2.5066282746310007e+0
|
||||
#define HIP_SQRT_2PI_LO (-1.8328579980459167e-16)
|
||||
#define HIP_SQRT_PIO2 1.2533141373155003e+0
|
||||
#define HIP_SQRT_PIO2_HI 1.2533141373155003e+0
|
||||
#define HIP_SQRT_PIO2_LO (-9.1642899902295834e-17)
|
||||
#define HIP_SQRT_2OPI 7.9788456080286536e-1
|
||||
#define HIP_L2E 1.4426950408889634e+0
|
||||
#define HIP_L2E_HI 1.4426950408889634e+0
|
||||
#define HIP_L2E_LO 2.0355273740931033e-17
|
||||
#define HIP_L2T 3.3219280948873622e+0
|
||||
#define HIP_LG2 3.0102999566398120e-1
|
||||
#define HIP_LG2_HI 3.0102999566398120e-1
|
||||
#define HIP_LG2_LO (-2.8037281277851704e-18)
|
||||
#define HIP_LGE 4.3429448190325182e-1
|
||||
#define HIP_LGE_HI 4.3429448190325182e-1
|
||||
#define HIP_LGE_LO 1.09831965021676510e-17
|
||||
#define HIP_LN2 6.9314718055994529e-1
|
||||
#define HIP_LN2_HI 6.9314718055994529e-1
|
||||
#define HIP_LN2_LO 2.3190468138462996e-17
|
||||
#define HIP_LNT 2.3025850929940459e+0
|
||||
#define HIP_LNT_HI 2.3025850929940459e+0
|
||||
#define HIP_LNT_LO (-2.1707562233822494e-16)
|
||||
#define HIP_LNPI 1.1447298858494002e+0
|
||||
#define HIP_LN2_X_1024 7.0978271289338397e+2
|
||||
#define HIP_LN2_X_1025 7.1047586007394398e+2
|
||||
#define HIP_LN2_X_1075 7.4513321910194122e+2
|
||||
#define HIP_LG2_X_1024 3.0825471555991675e+2
|
||||
#define HIP_LG2_X_1075 3.2360724533877976e+2
|
||||
#define HIP_TWO_TO_23 8388608.0
|
||||
#define HIP_TWO_TO_52 4503599627370496.0
|
||||
#define HIP_TWO_TO_53 9007199254740992.0
|
||||
#define HIP_TWO_TO_54 18014398509481984.0
|
||||
#define HIP_TWO_TO_M54 5.5511151231257827e-17
|
||||
#define HIP_TWO_TO_M1022 2.22507385850720140e-308
|
||||
#define HIP_TRIG_PLOSS 2147483648.0
|
||||
#define HIP_DBL2INT_CVT 6755399441055744.0
|
||||
#define HIP_INF __longlong_as_double(0x7ff0000000000000ULL)
|
||||
#define HIP_NAN __longlong_as_double(0xfff8000000000000ULL)
|
||||
#define HIP_NEG_ZERO __longlong_as_double(0x8000000000000000ULL)
|
||||
#define HIP_MIN_DENORM __longlong_as_double(0x0000000000000001ULL)
|
||||
#define HIP_ZERO 0.0
|
||||
#define HIP_ONE 1.0
|
||||
#define HIP_SQRT_TWO 1.4142135623730951e+0
|
||||
#define HIP_SQRT_HALF 7.0710678118654757e-1
|
||||
#define HIP_SQRT_HALF_HI 7.0710678118654757e-1
|
||||
#define HIP_SQRT_HALF_LO (-4.8336466567264567e-17)
|
||||
#define HIP_THIRD 3.3333333333333333e-1
|
||||
#define HIP_TWOTHIRD 6.6666666666666667e-1
|
||||
#define HIP_PIO4 7.8539816339744828e-1
|
||||
#define HIP_PIO4_HI 7.8539816339744828e-1
|
||||
#define HIP_PIO4_LO 3.0616169978683830e-17
|
||||
#define HIP_PIO2 1.5707963267948966e+0
|
||||
#define HIP_PIO2_HI 1.5707963267948966e+0
|
||||
#define HIP_PIO2_LO 6.1232339957367660e-17
|
||||
#define HIP_3PIO4 2.3561944901923448e+0
|
||||
#define HIP_2_OVER_PI 6.3661977236758138e-1
|
||||
#define HIP_PI 3.1415926535897931e+0
|
||||
#define HIP_PI_HI 3.1415926535897931e+0
|
||||
#define HIP_PI_LO 1.2246467991473532e-16
|
||||
#define HIP_SQRT_2PI 2.5066282746310007e+0
|
||||
#define HIP_SQRT_2PI_HI 2.5066282746310007e+0
|
||||
#define HIP_SQRT_2PI_LO (-1.8328579980459167e-16)
|
||||
#define HIP_SQRT_PIO2 1.2533141373155003e+0
|
||||
#define HIP_SQRT_PIO2_HI 1.2533141373155003e+0
|
||||
#define HIP_SQRT_PIO2_LO (-9.1642899902295834e-17)
|
||||
#define HIP_SQRT_2OPI 7.9788456080286536e-1
|
||||
#define HIP_L2E 1.4426950408889634e+0
|
||||
#define HIP_L2E_HI 1.4426950408889634e+0
|
||||
#define HIP_L2E_LO 2.0355273740931033e-17
|
||||
#define HIP_L2T 3.3219280948873622e+0
|
||||
#define HIP_LG2 3.0102999566398120e-1
|
||||
#define HIP_LG2_HI 3.0102999566398120e-1
|
||||
#define HIP_LG2_LO (-2.8037281277851704e-18)
|
||||
#define HIP_LGE 4.3429448190325182e-1
|
||||
#define HIP_LGE_HI 4.3429448190325182e-1
|
||||
#define HIP_LGE_LO 1.09831965021676510e-17
|
||||
#define HIP_LN2 6.9314718055994529e-1
|
||||
#define HIP_LN2_HI 6.9314718055994529e-1
|
||||
#define HIP_LN2_LO 2.3190468138462996e-17
|
||||
#define HIP_LNT 2.3025850929940459e+0
|
||||
#define HIP_LNT_HI 2.3025850929940459e+0
|
||||
#define HIP_LNT_LO (-2.1707562233822494e-16)
|
||||
#define HIP_LNPI 1.1447298858494002e+0
|
||||
#define HIP_LN2_X_1024 7.0978271289338397e+2
|
||||
#define HIP_LN2_X_1025 7.1047586007394398e+2
|
||||
#define HIP_LN2_X_1075 7.4513321910194122e+2
|
||||
#define HIP_LG2_X_1024 3.0825471555991675e+2
|
||||
#define HIP_LG2_X_1075 3.2360724533877976e+2
|
||||
#define HIP_TWO_TO_23 8388608.0
|
||||
#define HIP_TWO_TO_52 4503599627370496.0
|
||||
#define HIP_TWO_TO_53 9007199254740992.0
|
||||
#define HIP_TWO_TO_54 18014398509481984.0
|
||||
#define HIP_TWO_TO_M54 5.5511151231257827e-17
|
||||
#define HIP_TWO_TO_M1022 2.22507385850720140e-308
|
||||
#define HIP_TRIG_PLOSS 2147483648.0
|
||||
#define HIP_DBL2INT_CVT 6755399441055744.0
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
Copyright © Advanced Micro Devices, Inc., or its affiliates.
|
||||
|
||||
|
||||
SPDX-License-Identifier: MIT
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
Copyright © Advanced Micro Devices, Inc., or its affiliates.
|
||||
|
||||
|
||||
SPDX-License-Identifier: MIT
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
@@ -719,8 +719,8 @@ struct __hipext_ocp_fp6x32_e2m3 {
|
||||
}
|
||||
#endif
|
||||
|
||||
__OCP_FP_HOST_DEVICE__
|
||||
__hipext_ocp_fp6x32_e2m3(const __amd_fp16x32_storage_t in, const __amd_scale_t scale)
|
||||
__OCP_FP_HOST_DEVICE__ __hipext_ocp_fp6x32_e2m3(const __amd_fp16x32_storage_t in,
|
||||
const __amd_scale_t scale)
|
||||
#if HIP_ENABLE_GFX950_OCP_BUILTINS
|
||||
: __x(__builtin_amdgcn_cvt_scalef32_pk32_fp6_f16(in, __amd_scale_to_float(scale))){}
|
||||
#else
|
||||
@@ -742,8 +742,8 @@ struct __hipext_ocp_fp6x32_e2m3 {
|
||||
}
|
||||
#endif
|
||||
|
||||
__OCP_FP_HOST_DEVICE__ __hipext_ocp_fp6x32_e2m3(const __amd_bf16x32_storage_t in,
|
||||
const __amd_scale_t scale)
|
||||
__OCP_FP_HOST_DEVICE__
|
||||
__hipext_ocp_fp6x32_e2m3(const __amd_bf16x32_storage_t in, const __amd_scale_t scale)
|
||||
#if HIP_ENABLE_GFX950_OCP_BUILTINS
|
||||
: __x(__builtin_amdgcn_cvt_scalef32_pk32_fp6_bf16(in, __amd_scale_to_float(scale))){}
|
||||
#else
|
||||
@@ -832,8 +832,8 @@ struct __hipext_ocp_fp6x32_e3m2 {
|
||||
}
|
||||
#endif
|
||||
|
||||
__OCP_FP_HOST_DEVICE__
|
||||
__hipext_ocp_fp6x32_e3m2(const __amd_fp16x32_storage_t in, const __amd_scale_t scale)
|
||||
__OCP_FP_HOST_DEVICE__ __hipext_ocp_fp6x32_e3m2(const __amd_fp16x32_storage_t in,
|
||||
const __amd_scale_t scale)
|
||||
#if HIP_ENABLE_GFX950_OCP_BUILTINS
|
||||
: __x(__builtin_amdgcn_cvt_scalef32_pk32_bf6_f16(in, __amd_scale_to_float(scale))){}
|
||||
#else
|
||||
@@ -855,8 +855,8 @@ struct __hipext_ocp_fp6x32_e3m2 {
|
||||
}
|
||||
#endif
|
||||
|
||||
__OCP_FP_HOST_DEVICE__
|
||||
__hipext_ocp_fp6x32_e3m2(const __amd_bf16x32_storage_t in, const __amd_scale_t scale)
|
||||
__OCP_FP_HOST_DEVICE__ __hipext_ocp_fp6x32_e3m2(const __amd_bf16x32_storage_t in,
|
||||
const __amd_scale_t scale)
|
||||
#if HIP_ENABLE_GFX950_OCP_BUILTINS
|
||||
: __x(__builtin_amdgcn_cvt_scalef32_pk32_bf6_bf16(in, __amd_scale_to_float(scale))){}
|
||||
#else
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
Copyright © Advanced Micro Devices, Inc., or its affiliates.
|
||||
|
||||
|
||||
SPDX-License-Identifier: MIT
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
@@ -592,7 +592,7 @@ __OCP_FP_HOST_DEVICE_STATIC__ uint32_t from_float_sr(T f, uint32_t seed, int8_t
|
||||
}();
|
||||
const auto& srcEnc = encodings[(size_t)srcE];
|
||||
|
||||
auto srcU32 = u.u32;// (srcE == Encoding::IEEE754) ? U32(f) : (uint32_t)f;
|
||||
auto srcU32 = u.u32; // (srcE == Encoding::IEEE754) ? U32(f) : (uint32_t)f;
|
||||
auto signBit = signbit<srcE, false>(srcU32);
|
||||
auto sign = signBit << (enc.ExpBits + enc.ManBits);
|
||||
|
||||
@@ -706,7 +706,7 @@ __OCP_FP_HOST_DEVICE_STATIC__ uint32_t from_float(T f, int8_t scale_exp) {
|
||||
}();
|
||||
const auto& srcEnc = encodings[(size_t)srcE];
|
||||
|
||||
auto srcU32 = u.u32; // (srcE == Encoding::IEEE754) ? U32(f) : (uint32_t)f;
|
||||
auto srcU32 = u.u32; // (srcE == Encoding::IEEE754) ? U32(f) : (uint32_t)f;
|
||||
auto signBit = signbit<srcE, false>(srcU32);
|
||||
auto sign = signBit << (enc.ExpBits + enc.ManBits);
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
Copyright © Advanced Micro Devices, Inc., or its affiliates.
|
||||
|
||||
|
||||
SPDX-License-Identifier: MIT
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
@@ -44,13 +44,13 @@ static_assert(sizeof(_Float16) == 2);
|
||||
// header which will act as a base abstraction, and will be maintained in the future, it makes sense
|
||||
// to keep these vector types separate from existing implementations. We can add conversion
|
||||
// functions in a different header using these functions.
|
||||
typedef uint8_t __amd_fp8_storage_t;
|
||||
typedef uint8_t __amd_fp8_storage_t;
|
||||
typedef uint16_t __amd_fp8x2_storage_t;
|
||||
typedef uint8_t __amd_fp4x2_storage_t;
|
||||
typedef uint8_t __amd_fp4x2_storage_t;
|
||||
typedef uint32_t __amd_fp4x8_storage_t;
|
||||
typedef __bf16 __amd_bf16_storage_t;
|
||||
typedef __bf16 __amd_bf16_storage_t;
|
||||
typedef _Float16 __amd_fp16_storage_t;
|
||||
typedef int8_t __amd_scale_t;
|
||||
typedef int8_t __amd_scale_t;
|
||||
|
||||
#if defined(__clang__) && (__clang_major__ > 17) && defined(__HIP__)
|
||||
typedef unsigned int __attribute__((ext_vector_type(2))) __amd_uintx2_storage_t;
|
||||
|
||||
@@ -25,7 +25,7 @@ THE SOFTWARE.
|
||||
* @brief Contains definitions of APIs for HIP runtime.
|
||||
*/
|
||||
|
||||
//#pragma once
|
||||
// #pragma once
|
||||
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
|
||||
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
|
||||
|
||||
@@ -86,8 +86,8 @@ size_t amd_dbgapi_get_build_id();
|
||||
#else
|
||||
#include <math.h>
|
||||
#include <stdint.h>
|
||||
#endif // __cplusplus
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
#endif // __cplusplus
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
|
||||
#if __HIP_CLANG_ONLY__
|
||||
|
||||
@@ -105,7 +105,7 @@ size_t amd_dbgapi_get_build_id();
|
||||
#include <hip/amd_detail/texture_fetch_functions.h>
|
||||
#include <hip/amd_detail/texture_indirect_functions.h>
|
||||
extern int HIP_TRACE_API;
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
#include <hip/amd_detail/hip_ldg.h>
|
||||
@@ -155,17 +155,17 @@ extern int HIP_TRACE_API;
|
||||
|
||||
|
||||
#define launch_bounds_impl0(requiredMaxThreadsPerBlock) \
|
||||
__attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
|
||||
__attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
|
||||
#define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) \
|
||||
__attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock), \
|
||||
amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
|
||||
__attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock), \
|
||||
amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
|
||||
#define select_impl_(_1, _2, impl_, ...) impl_
|
||||
#define __launch_bounds__(...) \
|
||||
select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0, )(__VA_ARGS__)
|
||||
|
||||
#if !defined(__HIPCC_RTC__)
|
||||
__host__ inline void* __get_dynamicgroupbaseptr() { return nullptr; }
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
|
||||
// End doxygen API:
|
||||
/**
|
||||
@@ -188,63 +188,64 @@ void pArgs(const std::tuple<Ts...>&, void*) {}
|
||||
template <std::size_t n, typename... Ts,
|
||||
typename std::enable_if<n != sizeof...(Ts)>::type* = nullptr>
|
||||
void pArgs(const std::tuple<Ts...>& formals, void** _vargs) {
|
||||
using T = typename std::tuple_element<n, std::tuple<Ts...> >::type;
|
||||
using T = typename std::tuple_element<n, std::tuple<Ts...>>::type;
|
||||
|
||||
static_assert(!std::is_reference<T>{},
|
||||
"A __global__ function cannot have a reference as one of its "
|
||||
"arguments.");
|
||||
static_assert(!std::is_reference<T>{},
|
||||
"A __global__ function cannot have a reference as one of its "
|
||||
"arguments.");
|
||||
#if defined(HIP_STRICT)
|
||||
static_assert(std::is_trivially_copyable<T>{},
|
||||
"Only TriviallyCopyable types can be arguments to a __global__ "
|
||||
"function");
|
||||
static_assert(std::is_trivially_copyable<T>{},
|
||||
"Only TriviallyCopyable types can be arguments to a __global__ "
|
||||
"function");
|
||||
#endif
|
||||
_vargs[n] = const_cast<void*>(reinterpret_cast<const void*>(&std::get<n>(formals)));
|
||||
return pArgs<n + 1>(formals, _vargs);
|
||||
_vargs[n] = const_cast<void*>(reinterpret_cast<const void*>(&std::get<n>(formals)));
|
||||
return pArgs<n + 1>(formals, _vargs);
|
||||
}
|
||||
|
||||
template <typename... Formals, typename... Actuals>
|
||||
std::tuple<Formals...> validateArgsCountType(void (*kernel)(Formals...), std::tuple<Actuals...>(actuals)) {
|
||||
static_assert(sizeof...(Formals) == sizeof...(Actuals), "Argument Count Mismatch");
|
||||
std::tuple<Formals...> to_formals{std::move(actuals)};
|
||||
return to_formals;
|
||||
std::tuple<Formals...> validateArgsCountType(void (*kernel)(Formals...),
|
||||
std::tuple<Actuals...>(actuals)) {
|
||||
static_assert(sizeof...(Formals) == sizeof...(Actuals), "Argument Count Mismatch");
|
||||
std::tuple<Formals...> to_formals{std::move(actuals)};
|
||||
return to_formals;
|
||||
}
|
||||
|
||||
#if defined(HIP_TEMPLATE_KERNEL_LAUNCH)
|
||||
template <typename... Args, typename F = void (*)(Args...)>
|
||||
void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
|
||||
std::uint32_t sharedMemBytes, hipStream_t stream, Args... args) {
|
||||
constexpr size_t count = sizeof...(Args);
|
||||
auto tup_ = std::tuple<Args...>{args...};
|
||||
auto tup = validateArgsCountType(kernel, tup_);
|
||||
void* _Args[count];
|
||||
pArgs<0>(tup, _Args);
|
||||
constexpr size_t count = sizeof...(Args);
|
||||
auto tup_ = std::tuple<Args...>{args...};
|
||||
auto tup = validateArgsCountType(kernel, tup_);
|
||||
void* _Args[count];
|
||||
pArgs<0>(tup, _Args);
|
||||
|
||||
auto k = reinterpret_cast<void*>(kernel);
|
||||
hipLaunchKernel(k, numBlocks, dimBlocks, _Args, sharedMemBytes, stream);
|
||||
auto k = reinterpret_cast<void*>(kernel);
|
||||
hipLaunchKernel(k, numBlocks, dimBlocks, _Args, sharedMemBytes, stream);
|
||||
}
|
||||
#else
|
||||
#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \
|
||||
do { \
|
||||
kernelName<<<(numBlocks), (numThreads), (memPerBlock), (streamId)>>>(__VA_ARGS__); \
|
||||
} while (0)
|
||||
do { \
|
||||
kernelName<<<(numBlocks), (numThreads), (memPerBlock), (streamId)>>>(__VA_ARGS__); \
|
||||
} while (0)
|
||||
|
||||
#define hipLaunchKernelGGL(kernelName, ...) hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__)
|
||||
#define hipLaunchKernelGGL(kernelName, ...) hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__)
|
||||
#endif
|
||||
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
|
||||
#if defined(__HIPCC_RTC__)
|
||||
typedef struct dim3 {
|
||||
__hip_uint32_t x; ///< x
|
||||
__hip_uint32_t y; ///< y
|
||||
__hip_uint32_t z; ///< z
|
||||
__hip_uint32_t x; ///< x
|
||||
__hip_uint32_t y; ///< y
|
||||
__hip_uint32_t z; ///< z
|
||||
#ifdef __cplusplus
|
||||
constexpr __device__ dim3(__hip_uint32_t _x = 1, __hip_uint32_t _y = 1, __hip_uint32_t _z = 1)
|
||||
: x(_x), y(_y), z(_z){};
|
||||
constexpr __device__ dim3(__hip_uint32_t _x = 1, __hip_uint32_t _y = 1, __hip_uint32_t _z = 1)
|
||||
: x(_x), y(_y), z(_z) {};
|
||||
#endif
|
||||
} dim3;
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
|
||||
#pragma push_macro("__DEVICE__")
|
||||
#define __DEVICE__ static __device__ __forceinline__
|
||||
@@ -269,43 +270,41 @@ __DEVICE__ unsigned int __hip_get_grid_dim_x() { return __ockl_get_num_groups(0)
|
||||
__DEVICE__ unsigned int __hip_get_grid_dim_y() { return __ockl_get_num_groups(1); }
|
||||
__DEVICE__ unsigned int __hip_get_grid_dim_z() { return __ockl_get_num_groups(2); }
|
||||
|
||||
#define __HIP_DEVICE_BUILTIN(DIMENSION, FUNCTION) \
|
||||
__declspec(property(get = __get_##DIMENSION)) unsigned int DIMENSION; \
|
||||
__DEVICE__ unsigned int __get_##DIMENSION(void) { \
|
||||
return FUNCTION; \
|
||||
}
|
||||
#define __HIP_DEVICE_BUILTIN(DIMENSION, FUNCTION) \
|
||||
__declspec(property(get = __get_##DIMENSION)) unsigned int DIMENSION; \
|
||||
__DEVICE__ unsigned int __get_##DIMENSION(void) { return FUNCTION; }
|
||||
|
||||
struct __hip_builtin_threadIdx_t {
|
||||
__HIP_DEVICE_BUILTIN(x,__hip_get_thread_idx_x());
|
||||
__HIP_DEVICE_BUILTIN(y,__hip_get_thread_idx_y());
|
||||
__HIP_DEVICE_BUILTIN(z,__hip_get_thread_idx_z());
|
||||
__HIP_DEVICE_BUILTIN(x, __hip_get_thread_idx_x());
|
||||
__HIP_DEVICE_BUILTIN(y, __hip_get_thread_idx_y());
|
||||
__HIP_DEVICE_BUILTIN(z, __hip_get_thread_idx_z());
|
||||
#ifdef __cplusplus
|
||||
__device__ operator dim3() const { return dim3(x, y, z); }
|
||||
#endif
|
||||
};
|
||||
|
||||
struct __hip_builtin_blockIdx_t {
|
||||
__HIP_DEVICE_BUILTIN(x,__hip_get_block_idx_x());
|
||||
__HIP_DEVICE_BUILTIN(y,__hip_get_block_idx_y());
|
||||
__HIP_DEVICE_BUILTIN(z,__hip_get_block_idx_z());
|
||||
__HIP_DEVICE_BUILTIN(x, __hip_get_block_idx_x());
|
||||
__HIP_DEVICE_BUILTIN(y, __hip_get_block_idx_y());
|
||||
__HIP_DEVICE_BUILTIN(z, __hip_get_block_idx_z());
|
||||
#ifdef __cplusplus
|
||||
__device__ operator dim3() const { return dim3(x, y, z); }
|
||||
#endif
|
||||
};
|
||||
|
||||
struct __hip_builtin_blockDim_t {
|
||||
__HIP_DEVICE_BUILTIN(x,__hip_get_block_dim_x());
|
||||
__HIP_DEVICE_BUILTIN(y,__hip_get_block_dim_y());
|
||||
__HIP_DEVICE_BUILTIN(z,__hip_get_block_dim_z());
|
||||
__HIP_DEVICE_BUILTIN(x, __hip_get_block_dim_x());
|
||||
__HIP_DEVICE_BUILTIN(y, __hip_get_block_dim_y());
|
||||
__HIP_DEVICE_BUILTIN(z, __hip_get_block_dim_z());
|
||||
#ifdef __cplusplus
|
||||
__device__ operator dim3() const { return dim3(x, y, z); }
|
||||
#endif
|
||||
};
|
||||
|
||||
struct __hip_builtin_gridDim_t {
|
||||
__HIP_DEVICE_BUILTIN(x,__hip_get_grid_dim_x());
|
||||
__HIP_DEVICE_BUILTIN(y,__hip_get_grid_dim_y());
|
||||
__HIP_DEVICE_BUILTIN(z,__hip_get_grid_dim_z());
|
||||
__HIP_DEVICE_BUILTIN(x, __hip_get_grid_dim_x());
|
||||
__HIP_DEVICE_BUILTIN(y, __hip_get_grid_dim_y());
|
||||
__HIP_DEVICE_BUILTIN(z, __hip_get_grid_dim_z());
|
||||
#ifdef __cplusplus
|
||||
__device__ operator dim3() const { return dim3(x, y, z); }
|
||||
#endif
|
||||
@@ -342,15 +341,15 @@ extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim;
|
||||
#if __HIP_HCC_COMPAT_MODE__
|
||||
// Define HCC work item functions in terms of HIP builtin variables.
|
||||
#pragma push_macro("__DEFINE_HCC_FUNC")
|
||||
#define __DEFINE_HCC_FUNC(hc_fun,hip_var) \
|
||||
inline __device__ __attribute__((always_inline)) unsigned int hc_get_##hc_fun(unsigned int i) { \
|
||||
if (i==0) \
|
||||
return hip_var.x; \
|
||||
else if(i==1) \
|
||||
return hip_var.y; \
|
||||
else \
|
||||
return hip_var.z; \
|
||||
}
|
||||
#define __DEFINE_HCC_FUNC(hc_fun, hip_var) \
|
||||
inline __device__ __attribute__((always_inline)) unsigned int hc_get_##hc_fun(unsigned int i) { \
|
||||
if (i == 0) \
|
||||
return hip_var.x; \
|
||||
else if (i == 1) \
|
||||
return hip_var.y; \
|
||||
else \
|
||||
return hip_var.z; \
|
||||
}
|
||||
|
||||
__DEFINE_HCC_FUNC(workitem_id, threadIdx)
|
||||
__DEFINE_HCC_FUNC(group_id, blockIdx)
|
||||
@@ -359,9 +358,7 @@ __DEFINE_HCC_FUNC(num_groups, gridDim)
|
||||
#pragma pop_macro("__DEFINE_HCC_FUNC")
|
||||
|
||||
extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_id(unsigned int);
|
||||
inline __device__ __attribute__((always_inline)) unsigned int
|
||||
hc_get_workitem_absolute_id(int dim)
|
||||
{
|
||||
inline __device__ __attribute__((always_inline)) unsigned int hc_get_workitem_absolute_id(int dim) {
|
||||
return (unsigned int)__ockl_get_global_id(dim);
|
||||
}
|
||||
|
||||
@@ -385,9 +382,9 @@ hc_get_workitem_absolute_id(int dim)
|
||||
#include <include/cuda_wrappers/new>
|
||||
#undef __CUDA__
|
||||
#pragma pop_macro("__CUDA__")
|
||||
#endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
|
||||
#endif // __HIP_CLANG_ONLY__
|
||||
#endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
|
||||
#endif // __HIP_CLANG_ONLY__
|
||||
|
||||
#endif // HIP_AMD_DETAIL_RUNTIME_H
|
||||
|
||||
@@ -29,62 +29,62 @@ THE SOFTWARE.
|
||||
|
||||
/// hipStreamPerThread implementation
|
||||
#if defined(HIP_API_PER_THREAD_DEFAULT_STREAM)
|
||||
#define __HIP_STREAM_PER_THREAD
|
||||
#define __HIP_API_SPT(api) api ## _spt
|
||||
#define __HIP_STREAM_PER_THREAD
|
||||
#define __HIP_API_SPT(api) api##_spt
|
||||
#else
|
||||
#define __HIP_API_SPT(api) api
|
||||
#define __HIP_API_SPT(api) api
|
||||
#endif
|
||||
|
||||
#if defined(__HIP_STREAM_PER_THREAD)
|
||||
// Memory APIs
|
||||
#define hipMemcpy __HIP_API_SPT(hipMemcpy)
|
||||
#define hipMemcpyToSymbol __HIP_API_SPT(hipMemcpyToSymbol)
|
||||
#define hipMemcpyFromSymbol __HIP_API_SPT(hipMemcpyFromSymbol)
|
||||
#define hipMemcpy2D __HIP_API_SPT(hipMemcpy2D)
|
||||
#define hipMemcpy2DFromArray __HIP_API_SPT(hipMemcpy2DFromArray)
|
||||
#define hipMemcpy3D __HIP_API_SPT(hipMemcpy3D)
|
||||
#define hipMemset __HIP_API_SPT(hipMemset)
|
||||
#define hipMemset2D __HIP_API_SPT(hipMemset2D)
|
||||
#define hipMemset3D __HIP_API_SPT(hipMemset3D)
|
||||
#define hipMemcpyAsync __HIP_API_SPT(hipMemcpyAsync)
|
||||
#define hipMemset3DAsync __HIP_API_SPT(hipMemset3DAsync)
|
||||
#define hipMemset2DAsync __HIP_API_SPT(hipMemset2DAsync)
|
||||
#define hipMemsetAsync __HIP_API_SPT(hipMemsetAsync)
|
||||
#define hipMemcpy3DAsync __HIP_API_SPT(hipMemcpy3DAsync)
|
||||
#define hipMemcpy2DAsync __HIP_API_SPT(hipMemcpy2DAsync)
|
||||
#define hipMemcpyFromSymbolAsync __HIP_API_SPT(hipMemcpyFromSymbolAsync)
|
||||
#define hipMemcpyToSymbolAsync __HIP_API_SPT(hipMemcpyToSymbolAsync)
|
||||
#define hipMemcpyFromArray __HIP_API_SPT(hipMemcpyFromArray)
|
||||
#define hipMemcpy2DToArray __HIP_API_SPT(hipMemcpy2DToArray)
|
||||
#define hipMemcpy2DFromArrayAsync __HIP_API_SPT(hipMemcpy2DFromArrayAsync)
|
||||
#define hipMemcpy2DToArrayAsync __HIP_API_SPT(hipMemcpy2DToArrayAsync)
|
||||
// Memory APIs
|
||||
#define hipMemcpy __HIP_API_SPT(hipMemcpy)
|
||||
#define hipMemcpyToSymbol __HIP_API_SPT(hipMemcpyToSymbol)
|
||||
#define hipMemcpyFromSymbol __HIP_API_SPT(hipMemcpyFromSymbol)
|
||||
#define hipMemcpy2D __HIP_API_SPT(hipMemcpy2D)
|
||||
#define hipMemcpy2DFromArray __HIP_API_SPT(hipMemcpy2DFromArray)
|
||||
#define hipMemcpy3D __HIP_API_SPT(hipMemcpy3D)
|
||||
#define hipMemset __HIP_API_SPT(hipMemset)
|
||||
#define hipMemset2D __HIP_API_SPT(hipMemset2D)
|
||||
#define hipMemset3D __HIP_API_SPT(hipMemset3D)
|
||||
#define hipMemcpyAsync __HIP_API_SPT(hipMemcpyAsync)
|
||||
#define hipMemset3DAsync __HIP_API_SPT(hipMemset3DAsync)
|
||||
#define hipMemset2DAsync __HIP_API_SPT(hipMemset2DAsync)
|
||||
#define hipMemsetAsync __HIP_API_SPT(hipMemsetAsync)
|
||||
#define hipMemcpy3DAsync __HIP_API_SPT(hipMemcpy3DAsync)
|
||||
#define hipMemcpy2DAsync __HIP_API_SPT(hipMemcpy2DAsync)
|
||||
#define hipMemcpyFromSymbolAsync __HIP_API_SPT(hipMemcpyFromSymbolAsync)
|
||||
#define hipMemcpyToSymbolAsync __HIP_API_SPT(hipMemcpyToSymbolAsync)
|
||||
#define hipMemcpyFromArray __HIP_API_SPT(hipMemcpyFromArray)
|
||||
#define hipMemcpy2DToArray __HIP_API_SPT(hipMemcpy2DToArray)
|
||||
#define hipMemcpy2DFromArrayAsync __HIP_API_SPT(hipMemcpy2DFromArrayAsync)
|
||||
#define hipMemcpy2DToArrayAsync __HIP_API_SPT(hipMemcpy2DToArrayAsync)
|
||||
|
||||
// Stream APIs
|
||||
#define hipStreamSynchronize __HIP_API_SPT(hipStreamSynchronize)
|
||||
#define hipStreamQuery __HIP_API_SPT(hipStreamQuery)
|
||||
#define hipStreamGetFlags __HIP_API_SPT(hipStreamGetFlags)
|
||||
#define hipStreamGetPriority __HIP_API_SPT(hipStreamGetPriority)
|
||||
#define hipStreamWaitEvent __HIP_API_SPT(hipStreamWaitEvent)
|
||||
#define hipStreamAddCallback __HIP_API_SPT(hipStreamAddCallback)
|
||||
#define hipLaunchHostFunc __HIP_API_SPT(hipLaunchHostFunc)
|
||||
// Stream APIs
|
||||
#define hipStreamSynchronize __HIP_API_SPT(hipStreamSynchronize)
|
||||
#define hipStreamQuery __HIP_API_SPT(hipStreamQuery)
|
||||
#define hipStreamGetFlags __HIP_API_SPT(hipStreamGetFlags)
|
||||
#define hipStreamGetPriority __HIP_API_SPT(hipStreamGetPriority)
|
||||
#define hipStreamWaitEvent __HIP_API_SPT(hipStreamWaitEvent)
|
||||
#define hipStreamAddCallback __HIP_API_SPT(hipStreamAddCallback)
|
||||
#define hipLaunchHostFunc __HIP_API_SPT(hipLaunchHostFunc)
|
||||
|
||||
// Event APIs
|
||||
#define hipEventRecord __HIP_API_SPT(hipEventRecord)
|
||||
// Event APIs
|
||||
#define hipEventRecord __HIP_API_SPT(hipEventRecord)
|
||||
|
||||
// Launch APIs
|
||||
#define hipLaunchKernel __HIP_API_SPT(hipLaunchKernel)
|
||||
#define hipLaunchCooperativeKernel __HIP_API_SPT(hipLaunchCooperativeKernel)
|
||||
// Launch APIs
|
||||
#define hipLaunchKernel __HIP_API_SPT(hipLaunchKernel)
|
||||
#define hipLaunchCooperativeKernel __HIP_API_SPT(hipLaunchCooperativeKernel)
|
||||
|
||||
// Graph APIs
|
||||
#define hipGraphLaunch __HIP_API_SPT(hipGraphLaunch)
|
||||
#define hipStreamBeginCapture __HIP_API_SPT(hipStreamBeginCapture)
|
||||
#define hipStreamEndCapture __HIP_API_SPT(hipStreamEndCapture)
|
||||
#define hipStreamIsCapturing __HIP_API_SPT(hipStreamIsCapturing)
|
||||
#define hipStreamGetCaptureInfo __HIP_API_SPT(hipStreamGetCaptureInfo)
|
||||
#define hipStreamGetCaptureInfo_v2 __HIP_API_SPT(hipStreamGetCaptureInfo_v2)
|
||||
// Graph APIs
|
||||
#define hipGraphLaunch __HIP_API_SPT(hipGraphLaunch)
|
||||
#define hipStreamBeginCapture __HIP_API_SPT(hipStreamBeginCapture)
|
||||
#define hipStreamEndCapture __HIP_API_SPT(hipStreamEndCapture)
|
||||
#define hipStreamIsCapturing __HIP_API_SPT(hipStreamIsCapturing)
|
||||
#define hipStreamGetCaptureInfo __HIP_API_SPT(hipStreamGetCaptureInfo)
|
||||
#define hipStreamGetCaptureInfo_v2 __HIP_API_SPT(hipStreamGetCaptureInfo_v2)
|
||||
|
||||
// Driver Entry Point API
|
||||
#define hipGetDriverEntryPoint __HIP_API_SPT(hipGetDriverEntryPoint)
|
||||
// Driver Entry Point API
|
||||
#define hipGetDriverEntryPoint __HIP_API_SPT(hipGetDriverEntryPoint)
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
@@ -97,66 +97,67 @@ hipError_t hipMemcpyToSymbol_spt(const void* symbol, const void* src, size_t siz
|
||||
size_t offset __dparm(0),
|
||||
hipMemcpyKind kind __dparm(hipMemcpyHostToDevice));
|
||||
|
||||
hipError_t hipMemcpyFromSymbol_spt(void* dst, const void* symbol,size_t sizeBytes,
|
||||
hipError_t hipMemcpyFromSymbol_spt(void* dst, const void* symbol, size_t sizeBytes,
|
||||
size_t offset __dparm(0),
|
||||
hipMemcpyKind kind __dparm(hipMemcpyDeviceToHost));
|
||||
|
||||
hipError_t hipMemcpy2D_spt(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width,
|
||||
size_t height, hipMemcpyKind kind);
|
||||
size_t height, hipMemcpyKind kind);
|
||||
|
||||
hipError_t hipMemcpy2DFromArray_spt( void* dst, size_t dpitch, hipArray_const_t src, size_t wOffset,
|
||||
size_t hOffset, size_t width, size_t height, hipMemcpyKind kind);
|
||||
hipError_t hipMemcpy2DFromArray_spt(void* dst, size_t dpitch, hipArray_const_t src, size_t wOffset,
|
||||
size_t hOffset, size_t width, size_t height,
|
||||
hipMemcpyKind kind);
|
||||
|
||||
hipError_t hipMemcpy3D_spt(const struct hipMemcpy3DParms* p);
|
||||
|
||||
hipError_t hipMemset_spt(void* dst, int value, size_t sizeBytes);
|
||||
|
||||
hipError_t hipMemsetAsync_spt(void* dst, int value, size_t sizeBytes,
|
||||
hipStream_t stream __dparm(hipStreamPerThread));
|
||||
hipStream_t stream __dparm(hipStreamPerThread));
|
||||
|
||||
hipError_t hipMemset2D_spt(void* dst, size_t pitch, int value, size_t width, size_t height);
|
||||
|
||||
hipError_t hipMemset2DAsync_spt(void* dst, size_t pitch, int value,
|
||||
size_t width, size_t height,
|
||||
hipStream_t stream __dparm(hipStreamPerThread));
|
||||
hipError_t hipMemset2DAsync_spt(void* dst, size_t pitch, int value, size_t width, size_t height,
|
||||
hipStream_t stream __dparm(hipStreamPerThread));
|
||||
|
||||
hipError_t hipMemset3DAsync_spt(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent,
|
||||
hipStream_t stream __dparm(hipStreamPerThread));
|
||||
hipStream_t stream __dparm(hipStreamPerThread));
|
||||
|
||||
hipError_t hipMemset3D_spt(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent );
|
||||
hipError_t hipMemset3D_spt(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent);
|
||||
|
||||
hipError_t hipMemcpyAsync_spt(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind,
|
||||
hipStream_t stream __dparm(hipStreamPerThread));
|
||||
hipStream_t stream __dparm(hipStreamPerThread));
|
||||
|
||||
hipError_t hipMemcpy3DAsync_spt(const hipMemcpy3DParms* p,
|
||||
hipStream_t stream __dparm(hipStreamPerThread));
|
||||
|
||||
hipError_t hipMemcpy2DAsync_spt(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width,
|
||||
size_t height, hipMemcpyKind kind,
|
||||
hipStream_t stream __dparm(hipStreamPerThread));
|
||||
hipError_t hipMemcpy2DAsync_spt(void* dst, size_t dpitch, const void* src, size_t spitch,
|
||||
size_t width, size_t height, hipMemcpyKind kind,
|
||||
hipStream_t stream __dparm(hipStreamPerThread));
|
||||
|
||||
hipError_t hipMemcpyFromSymbolAsync_spt(void* dst, const void* symbol, size_t sizeBytes,
|
||||
size_t offset, hipMemcpyKind kind,
|
||||
hipStream_t stream __dparm(hipStreamPerThread));
|
||||
size_t offset, hipMemcpyKind kind,
|
||||
hipStream_t stream __dparm(hipStreamPerThread));
|
||||
|
||||
hipError_t hipMemcpyToSymbolAsync_spt(const void* symbol, const void* src, size_t sizeBytes,
|
||||
size_t offset, hipMemcpyKind kind,
|
||||
hipStream_t stream __dparm(hipStreamPerThread));
|
||||
size_t offset, hipMemcpyKind kind,
|
||||
hipStream_t stream __dparm(hipStreamPerThread));
|
||||
|
||||
hipError_t hipMemcpyFromArray_spt(void* dst, hipArray_const_t src, size_t wOffsetSrc, size_t hOffset,
|
||||
size_t count, hipMemcpyKind kind);
|
||||
hipError_t hipMemcpyFromArray_spt(void* dst, hipArray_const_t src, size_t wOffsetSrc,
|
||||
size_t hOffset, size_t count, hipMemcpyKind kind);
|
||||
|
||||
hipError_t hipMemcpy2DToArray_spt(hipArray_t dst, size_t wOffset, size_t hOffset, const void* src,
|
||||
size_t spitch, size_t width, size_t height, hipMemcpyKind kind);
|
||||
|
||||
hipError_t hipMemcpy2DFromArrayAsync_spt(void* dst, size_t dpitch, hipArray_const_t src,
|
||||
size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height,
|
||||
hipMemcpyKind kind,
|
||||
hipStream_t stream __dparm(hipStreamPerThread));
|
||||
size_t wOffsetSrc, size_t hOffsetSrc, size_t width,
|
||||
size_t height, hipMemcpyKind kind,
|
||||
hipStream_t stream __dparm(hipStreamPerThread));
|
||||
|
||||
hipError_t hipMemcpy2DToArrayAsync_spt(hipArray_t dst, size_t wOffset, size_t hOffset, const void* src,
|
||||
size_t spitch, size_t width, size_t height, hipMemcpyKind kind,
|
||||
hipStream_t stream __dparm(hipStreamPerThread));
|
||||
hipError_t hipMemcpy2DToArrayAsync_spt(hipArray_t dst, size_t wOffset, size_t hOffset,
|
||||
const void* src, size_t spitch, size_t width, size_t height,
|
||||
hipMemcpyKind kind,
|
||||
hipStream_t stream __dparm(hipStreamPerThread));
|
||||
|
||||
hipError_t hipStreamQuery_spt(hipStream_t stream);
|
||||
|
||||
@@ -164,36 +165,35 @@ hipError_t hipStreamSynchronize_spt(hipStream_t stream);
|
||||
|
||||
hipError_t hipStreamGetPriority_spt(hipStream_t stream, int* priority);
|
||||
|
||||
hipError_t hipStreamWaitEvent_spt(hipStream_t stream, hipEvent_t event, unsigned int flags __dparm(0));
|
||||
hipError_t hipStreamWaitEvent_spt(hipStream_t stream, hipEvent_t event,
|
||||
unsigned int flags __dparm(0));
|
||||
|
||||
hipError_t hipStreamGetFlags_spt(hipStream_t stream, unsigned int* flags);
|
||||
|
||||
hipError_t hipStreamAddCallback_spt(hipStream_t stream, hipStreamCallback_t callback, void* userData,
|
||||
unsigned int flags);
|
||||
hipError_t hipStreamAddCallback_spt(hipStream_t stream, hipStreamCallback_t callback,
|
||||
void* userData, unsigned int flags);
|
||||
|
||||
hipError_t hipEventRecord_spt(hipEvent_t event, hipStream_t stream __dparm(hipStreamPerThread));
|
||||
|
||||
hipError_t hipLaunchCooperativeKernel_spt(const void* f,
|
||||
dim3 gridDim, dim3 blockDim,
|
||||
void **kernelParams, uint32_t sharedMemBytes,
|
||||
hipStream_t hStream __dparm(hipStreamPerThread));
|
||||
hipError_t hipLaunchCooperativeKernel_spt(const void* f, dim3 gridDim, dim3 blockDim,
|
||||
void** kernelParams, uint32_t sharedMemBytes,
|
||||
hipStream_t hStream __dparm(hipStreamPerThread));
|
||||
|
||||
hipError_t hipLaunchKernel_spt(const void* function_address,
|
||||
dim3 numBlocks,
|
||||
dim3 dimBlocks,
|
||||
void** args,
|
||||
size_t sharedMemBytes, hipStream_t stream __dparm(hipStreamPerThread));
|
||||
hipError_t hipLaunchKernel_spt(const void* function_address, dim3 numBlocks, dim3 dimBlocks,
|
||||
void** args, size_t sharedMemBytes,
|
||||
hipStream_t stream __dparm(hipStreamPerThread));
|
||||
|
||||
hipError_t hipGraphLaunch_spt(hipGraphExec_t graphExec, hipStream_t stream);
|
||||
hipError_t hipStreamBeginCapture_spt(hipStream_t stream, hipStreamCaptureMode mode);
|
||||
hipError_t hipStreamEndCapture_spt(hipStream_t stream, hipGraph_t* pGraph);
|
||||
hipError_t hipStreamIsCapturing_spt(hipStream_t stream, hipStreamCaptureStatus* pCaptureStatus);
|
||||
hipError_t hipStreamGetCaptureInfo_spt(hipStream_t stream, hipStreamCaptureStatus* pCaptureStatus,
|
||||
unsigned long long* pId);
|
||||
hipError_t hipStreamGetCaptureInfo_v2_spt(hipStream_t stream, hipStreamCaptureStatus* captureStatus_out,
|
||||
unsigned long long* id_out, hipGraph_t* graph_out,
|
||||
const hipGraphNode_t** dependencies_out,
|
||||
size_t* numDependencies_out);
|
||||
unsigned long long* pId);
|
||||
hipError_t hipStreamGetCaptureInfo_v2_spt(hipStream_t stream,
|
||||
hipStreamCaptureStatus* captureStatus_out,
|
||||
unsigned long long* id_out, hipGraph_t* graph_out,
|
||||
const hipGraphNode_t** dependencies_out,
|
||||
size_t* numDependencies_out);
|
||||
hipError_t hipLaunchHostFunc_spt(hipStream_t stream, hipHostFn_t fn, void* userData);
|
||||
hipError_t hipGetDriverEntryPoint_spt(const char* symbol, void** funcPtr, unsigned long long flags,
|
||||
hipDriverEntryPointQueryResult* status);
|
||||
@@ -201,7 +201,7 @@ hipError_t hipGetDriverEntryPoint_spt(const char* symbol, void** funcPtr, unsign
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif // extern "C"
|
||||
#endif // extern "C"
|
||||
|
||||
#endif //defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
|
||||
#endif //HIP_INCLUDE_HIP_HIP_RUNTIME_PT_API_H
|
||||
#endif // defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
|
||||
#endif // HIP_INCLUDE_HIP_HIP_RUNTIME_PT_API_H
|
||||
|
||||
@@ -58,21 +58,17 @@ THE SOFTWARE.
|
||||
* @return Original value contained in \p addr.
|
||||
*/
|
||||
__device__ inline float unsafeAtomicAdd(float* addr, float value) {
|
||||
#if defined(__gfx90a__) && \
|
||||
__has_builtin(__builtin_amdgcn_is_shared) && \
|
||||
__has_builtin(__builtin_amdgcn_is_private) && \
|
||||
__has_builtin(__builtin_amdgcn_ds_atomic_fadd_f32) && \
|
||||
#if defined(__gfx90a__) && __has_builtin(__builtin_amdgcn_is_shared) && \
|
||||
__has_builtin(__builtin_amdgcn_is_private) && \
|
||||
__has_builtin(__builtin_amdgcn_ds_atomic_fadd_f32) && \
|
||||
__has_builtin(__builtin_amdgcn_global_atomic_fadd_f32)
|
||||
if (__builtin_amdgcn_is_shared(
|
||||
(const __attribute__((address_space(0))) void*)addr))
|
||||
if (__builtin_amdgcn_is_shared((const __attribute__((address_space(0))) void*)addr))
|
||||
return __builtin_amdgcn_ds_atomic_fadd_f32(addr, value);
|
||||
else if (__builtin_amdgcn_is_private(
|
||||
(const __attribute__((address_space(0))) void*)addr)) {
|
||||
else if (__builtin_amdgcn_is_private((const __attribute__((address_space(0))) void*)addr)) {
|
||||
float temp = *addr;
|
||||
*addr = temp + value;
|
||||
return temp;
|
||||
}
|
||||
else
|
||||
} else
|
||||
return __builtin_amdgcn_global_atomic_fadd_f32(addr, value);
|
||||
#elif __has_builtin(__hip_atomic_fetch_add)
|
||||
__HIP_ATOMICS_IGNORE_DENORMAL_MODE {
|
||||
@@ -98,27 +94,26 @@ __device__ inline float unsafeAtomicAdd(float* addr, float value) {
|
||||
* @return Original value contained in \p addr.
|
||||
*/
|
||||
__device__ inline float unsafeAtomicMax(float* addr, float val) {
|
||||
#if __has_builtin(__hip_atomic_load) && \
|
||||
__has_builtin(__hip_atomic_compare_exchange_strong)
|
||||
#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
|
||||
__HIP_ATOMICS_IGNORE_DENORMAL_MODE {
|
||||
float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
bool done = false;
|
||||
while (!done && value < val) {
|
||||
done = __hip_atomic_compare_exchange_strong(addr, &value, val,
|
||||
__ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED,
|
||||
__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
#else
|
||||
unsigned int *uaddr = (unsigned int *)addr;
|
||||
#else
|
||||
unsigned int* uaddr = (unsigned int*)addr;
|
||||
unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
|
||||
bool done = false;
|
||||
while (!done && __uint_as_float(value) < val) {
|
||||
done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
|
||||
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
|
||||
done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false, __ATOMIC_RELAXED,
|
||||
__ATOMIC_RELAXED);
|
||||
}
|
||||
return __uint_as_float(value);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -136,27 +131,26 @@ __device__ inline float unsafeAtomicMax(float* addr, float val) {
|
||||
* @return Original value contained in \p addr.
|
||||
*/
|
||||
__device__ inline float unsafeAtomicMin(float* addr, float val) {
|
||||
#if __has_builtin(__hip_atomic_load) && \
|
||||
__has_builtin(__hip_atomic_compare_exchange_strong)
|
||||
#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
|
||||
__HIP_ATOMICS_IGNORE_DENORMAL_MODE {
|
||||
float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
bool done = false;
|
||||
while (!done && value > val) {
|
||||
done = __hip_atomic_compare_exchange_strong(addr, &value, val,
|
||||
__ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED,
|
||||
__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
#else
|
||||
unsigned int *uaddr = (unsigned int *)addr;
|
||||
#else
|
||||
unsigned int* uaddr = (unsigned int*)addr;
|
||||
unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
|
||||
bool done = false;
|
||||
while (!done && __uint_as_float(value) > val) {
|
||||
done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
|
||||
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
|
||||
done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false, __ATOMIC_RELAXED,
|
||||
__ATOMIC_RELAXED);
|
||||
}
|
||||
return __uint_as_float(value);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -188,7 +182,7 @@ __device__ inline float unsafeAtomicMin(float* addr, float val) {
|
||||
__device__ inline double unsafeAtomicAdd(double* addr, double value) {
|
||||
#if defined(__gfx90a__) && __has_builtin(__builtin_amdgcn_flat_atomic_fadd_f64)
|
||||
return __builtin_amdgcn_flat_atomic_fadd_f64(addr, value);
|
||||
#elif defined (__hip_atomic_fetch_add)
|
||||
#elif defined(__hip_atomic_fetch_add)
|
||||
__HIP_ATOMICS_IGNORE_DENORMAL_MODE {
|
||||
return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
@@ -224,31 +218,30 @@ __device__ inline double unsafeAtomicAdd(double* addr, double value) {
|
||||
* @return Original value contained at \p addr.
|
||||
*/
|
||||
__device__ inline double unsafeAtomicMax(double* addr, double val) {
|
||||
#if (defined(__gfx90a__) || defined(__gfx94plus_clr__)) && \
|
||||
__has_builtin(__builtin_amdgcn_flat_atomic_fmax_f64)
|
||||
#if (defined(__gfx90a__) || defined(__gfx94plus_clr__)) && \
|
||||
__has_builtin(__builtin_amdgcn_flat_atomic_fmax_f64)
|
||||
return __builtin_amdgcn_flat_atomic_fmax_f64(addr, val);
|
||||
#else
|
||||
#if __has_builtin(__hip_atomic_load) && \
|
||||
__has_builtin(__hip_atomic_compare_exchange_strong)
|
||||
#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
|
||||
__HIP_ATOMICS_IGNORE_DENORMAL_MODE {
|
||||
double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
bool done = false;
|
||||
while (!done && value < val) {
|
||||
done = __hip_atomic_compare_exchange_strong(addr, &value, val,
|
||||
__ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED,
|
||||
__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
#else
|
||||
unsigned long long *uaddr = (unsigned long long *)addr;
|
||||
#else
|
||||
unsigned long long* uaddr = (unsigned long long*)addr;
|
||||
unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
|
||||
bool done = false;
|
||||
while (!done && __longlong_as_double(value) < val) {
|
||||
done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
|
||||
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
|
||||
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
|
||||
}
|
||||
return __longlong_as_double(value);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -279,31 +272,30 @@ __device__ inline double unsafeAtomicMax(double* addr, double val) {
|
||||
* @return Original value contained at \p addr.
|
||||
*/
|
||||
__device__ inline double unsafeAtomicMin(double* addr, double val) {
|
||||
#if (defined(__gfx90a__) || defined(__gfx94plus_clr__)) && \
|
||||
__has_builtin(__builtin_amdgcn_flat_atomic_fmin_f64)
|
||||
#if (defined(__gfx90a__) || defined(__gfx94plus_clr__)) && \
|
||||
__has_builtin(__builtin_amdgcn_flat_atomic_fmin_f64)
|
||||
return __builtin_amdgcn_flat_atomic_fmin_f64(addr, val);
|
||||
#else
|
||||
#if __has_builtin(__hip_atomic_load) && \
|
||||
__has_builtin(__hip_atomic_compare_exchange_strong)
|
||||
#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
|
||||
__HIP_ATOMICS_IGNORE_DENORMAL_MODE {
|
||||
double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
bool done = false;
|
||||
while (!done && value > val) {
|
||||
done = __hip_atomic_compare_exchange_strong(addr, &value, val,
|
||||
__ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED,
|
||||
__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
#else
|
||||
unsigned long long *uaddr = (unsigned long long *)addr;
|
||||
#else
|
||||
unsigned long long* uaddr = (unsigned long long*)addr;
|
||||
unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
|
||||
bool done = false;
|
||||
while (!done && __longlong_as_double(value) > val) {
|
||||
done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
|
||||
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
|
||||
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
|
||||
}
|
||||
return __longlong_as_double(value);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -322,9 +314,9 @@ __device__ inline double unsafeAtomicMin(double* addr, double val) {
|
||||
* @return Original value contained in \p addr.
|
||||
*/
|
||||
__device__ inline float safeAtomicAdd(float* addr, float value) {
|
||||
#if defined(__gfx908__) \
|
||||
|| ((defined(__gfx90a__) || defined(__gfx942__) || \
|
||||
defined(__gfx950__)) && !__has_builtin(__hip_atomic_fetch_add))
|
||||
#if defined(__gfx908__) || \
|
||||
((defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) && \
|
||||
!__has_builtin(__hip_atomic_fetch_add))
|
||||
// On gfx908, we can generate unsafe FP32 atomic add that does not follow all
|
||||
// IEEE rules when -munsafe-fp-atomics is passed. Do a CAS loop emulation instead.
|
||||
// On gfx90a, gfx942 and gfx950 if we do not have the __hip_atomic_fetch_add builtin, we
|
||||
@@ -334,21 +326,22 @@ __device__ inline float safeAtomicAdd(float* addr, float value) {
|
||||
__HIP_ATOMICS_IGNORE_DENORMAL_MODE {
|
||||
old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
#else // !__has_builtin(__hip_atomic_load)
|
||||
old_val = __uint_as_float(__atomic_load_n(reinterpret_cast<unsigned int*>(addr), __ATOMIC_RELAXED));
|
||||
#endif // __has_builtin(__hip_atomic_load)
|
||||
#else // !__has_builtin(__hip_atomic_load)
|
||||
old_val =
|
||||
__uint_as_float(__atomic_load_n(reinterpret_cast<unsigned int*>(addr), __ATOMIC_RELAXED));
|
||||
#endif // __has_builtin(__hip_atomic_load)
|
||||
float expected, temp;
|
||||
do {
|
||||
temp = expected = old_val;
|
||||
#if __has_builtin(__hip_atomic_compare_exchange_strong)
|
||||
__HIP_ATOMICS_IGNORE_DENORMAL_MODE {
|
||||
__hip_atomic_compare_exchange_strong(addr, &expected, old_val + value, __ATOMIC_RELAXED,
|
||||
__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
#else // !__has_builtin(__hip_atomic_compare_exchange_strong)
|
||||
__atomic_compare_exchange_n(addr, &expected, old_val + value, false,
|
||||
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
|
||||
#endif // __has_builtin(__hip_atomic_compare_exchange_strong)
|
||||
#else // !__has_builtin(__hip_atomic_compare_exchange_strong)
|
||||
__atomic_compare_exchange_n(addr, &expected, old_val + value, false, __ATOMIC_RELAXED,
|
||||
__ATOMIC_RELAXED);
|
||||
#endif // __has_builtin(__hip_atomic_compare_exchange_strong)
|
||||
old_val = expected;
|
||||
} while (__float_as_uint(temp) != __float_as_uint(old_val));
|
||||
return old_val;
|
||||
@@ -384,27 +377,26 @@ __device__ inline float safeAtomicAdd(float* addr, float value) {
|
||||
* @return Original value contained in \p addr.
|
||||
*/
|
||||
__device__ inline float safeAtomicMax(float* addr, float val) {
|
||||
#if __has_builtin(__hip_atomic_load) && \
|
||||
__has_builtin(__hip_atomic_compare_exchange_strong)
|
||||
#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
|
||||
__HIP_ATOMICS_IGNORE_DENORMAL_MODE {
|
||||
float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
bool done = false;
|
||||
while (!done && value < val) {
|
||||
done = __hip_atomic_compare_exchange_strong(addr, &value, val,
|
||||
__ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED,
|
||||
__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
#else
|
||||
unsigned int *uaddr = (unsigned int *)addr;
|
||||
#else
|
||||
unsigned int* uaddr = (unsigned int*)addr;
|
||||
unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
|
||||
bool done = false;
|
||||
while (!done && __uint_as_float(value) < val) {
|
||||
done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
|
||||
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
|
||||
done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false, __ATOMIC_RELAXED,
|
||||
__ATOMIC_RELAXED);
|
||||
}
|
||||
return __uint_as_float(value);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -422,27 +414,26 @@ __device__ inline float safeAtomicMax(float* addr, float val) {
|
||||
* @return Original value contained in \p addr.
|
||||
*/
|
||||
__device__ inline float safeAtomicMin(float* addr, float val) {
|
||||
#if __has_builtin(__hip_atomic_load) && \
|
||||
__has_builtin(__hip_atomic_compare_exchange_strong)
|
||||
#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
|
||||
__HIP_ATOMICS_IGNORE_DENORMAL_MODE {
|
||||
float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
bool done = false;
|
||||
while (!done && value > val) {
|
||||
done = __hip_atomic_compare_exchange_strong(addr, &value, val,
|
||||
__ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED,
|
||||
__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
#else
|
||||
unsigned int *uaddr = (unsigned int *)addr;
|
||||
#else
|
||||
unsigned int* uaddr = (unsigned int*)addr;
|
||||
unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
|
||||
bool done = false;
|
||||
while (!done && __uint_as_float(value) > val) {
|
||||
done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
|
||||
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
|
||||
done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false, __ATOMIC_RELAXED,
|
||||
__ATOMIC_RELAXED);
|
||||
}
|
||||
return __uint_as_float(value);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -460,7 +451,7 @@ __device__ inline float safeAtomicMin(float* addr, float val) {
|
||||
* @return Original value contained in \p addr.
|
||||
*/
|
||||
__device__ inline double safeAtomicAdd(double* addr, double value) {
|
||||
#if defined(__gfx90a__) && __has_builtin(__hip_atomic_fetch_add)
|
||||
#if defined(__gfx90a__) && __has_builtin(__hip_atomic_fetch_add)
|
||||
// On gfx90a, with the __hip_atomic_fetch_add builtin, relaxed system-scope
|
||||
// atomics will produce safe CAS loops, but are otherwise not different than
|
||||
// agent-scope atomics. This logic is only applicable for gfx90a, and should
|
||||
@@ -476,32 +467,33 @@ __device__ inline double safeAtomicAdd(double* addr, double value) {
|
||||
__HIP_ATOMICS_IGNORE_DENORMAL_MODE {
|
||||
old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
#else // !__has_builtin(__hip_atomic_load)
|
||||
old_val = __longlong_as_double(__atomic_load_n(reinterpret_cast<unsigned long long*>(addr), __ATOMIC_RELAXED));
|
||||
#endif // __has_builtin(__hip_atomic_load)
|
||||
#else // !__has_builtin(__hip_atomic_load)
|
||||
old_val = __longlong_as_double(
|
||||
__atomic_load_n(reinterpret_cast<unsigned long long*>(addr), __ATOMIC_RELAXED));
|
||||
#endif // __has_builtin(__hip_atomic_load)
|
||||
double expected, temp;
|
||||
do {
|
||||
temp = expected = old_val;
|
||||
#if __has_builtin(__hip_atomic_compare_exchange_strong)
|
||||
__HIP_ATOMICS_IGNORE_DENORMAL_MODE {
|
||||
__hip_atomic_compare_exchange_strong(addr, &expected, old_val + value, __ATOMIC_RELAXED,
|
||||
__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
#else // !__has_builtin(__hip_atomic_compare_exchange_strong)
|
||||
__atomic_compare_exchange_n(addr, &expected, old_val + value, false,
|
||||
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
|
||||
#endif // __has_builtin(__hip_atomic_compare_exchange_strong)
|
||||
#else // !__has_builtin(__hip_atomic_compare_exchange_strong)
|
||||
__atomic_compare_exchange_n(addr, &expected, old_val + value, false, __ATOMIC_RELAXED,
|
||||
__ATOMIC_RELAXED);
|
||||
#endif // __has_builtin(__hip_atomic_compare_exchange_strong)
|
||||
old_val = expected;
|
||||
} while (__double_as_longlong(temp) != __double_as_longlong(old_val));
|
||||
return old_val;
|
||||
#else // !defined(__gfx90a__)
|
||||
#else // !defined(__gfx90a__)
|
||||
#if __has_builtin(__hip_atomic_fetch_add)
|
||||
__HIP_ATOMICS_IGNORE_DENORMAL_MODE {
|
||||
return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
#else // !__has_builtin(__hip_atomic_fetch_add)
|
||||
#else // !__has_builtin(__hip_atomic_fetch_add)
|
||||
return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
|
||||
#endif // __has_builtin(__hip_atomic_fetch_add)
|
||||
#endif // __has_builtin(__hip_atomic_fetch_add)
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -520,38 +512,36 @@ __device__ inline double safeAtomicAdd(double* addr, double value) {
|
||||
* @return Original value contained at \p addr.
|
||||
*/
|
||||
__device__ inline double safeAtomicMax(double* addr, double val) {
|
||||
#if __has_builtin(__builtin_amdgcn_is_private)
|
||||
if (__builtin_amdgcn_is_private(
|
||||
(const __attribute__((address_space(0))) void*)addr)) {
|
||||
#if __has_builtin(__builtin_amdgcn_is_private)
|
||||
if (__builtin_amdgcn_is_private((const __attribute__((address_space(0))) void*)addr)) {
|
||||
double old = *addr;
|
||||
*addr = __builtin_fmax(old, val);
|
||||
return old;
|
||||
} else {
|
||||
#endif
|
||||
#if __has_builtin(__hip_atomic_load) && \
|
||||
__has_builtin(__hip_atomic_compare_exchange_strong)
|
||||
__HIP_ATOMICS_IGNORE_DENORMAL_MODE {
|
||||
double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
bool done = false;
|
||||
while (!done && value < val) {
|
||||
done = __hip_atomic_compare_exchange_strong(addr, &value, val,
|
||||
__ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
#endif
|
||||
#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
|
||||
__HIP_ATOMICS_IGNORE_DENORMAL_MODE {
|
||||
double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
bool done = false;
|
||||
while (!done && value < val) {
|
||||
done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED,
|
||||
__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
return value;
|
||||
}
|
||||
#else
|
||||
unsigned long long *uaddr = (unsigned long long *)addr;
|
||||
#else
|
||||
unsigned long long* uaddr = (unsigned long long*)addr;
|
||||
unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
|
||||
bool done = false;
|
||||
while (!done && __longlong_as_double(value) < val) {
|
||||
done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
|
||||
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
|
||||
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
|
||||
}
|
||||
return __longlong_as_double(value);
|
||||
#endif
|
||||
#if __has_builtin(__builtin_amdgcn_is_private)
|
||||
#endif
|
||||
#if __has_builtin(__builtin_amdgcn_is_private)
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -569,38 +559,36 @@ __device__ inline double safeAtomicMax(double* addr, double val) {
|
||||
* @return Original value contained at \p addr.
|
||||
*/
|
||||
__device__ inline double safeAtomicMin(double* addr, double val) {
|
||||
#if __has_builtin(__builtin_amdgcn_is_private)
|
||||
if (__builtin_amdgcn_is_private(
|
||||
(const __attribute__((address_space(0))) void*)addr)) {
|
||||
#if __has_builtin(__builtin_amdgcn_is_private)
|
||||
if (__builtin_amdgcn_is_private((const __attribute__((address_space(0))) void*)addr)) {
|
||||
double old = *addr;
|
||||
*addr = __builtin_fmin(old, val);
|
||||
return old;
|
||||
} else {
|
||||
#endif
|
||||
#if __has_builtin(__hip_atomic_load) && \
|
||||
__has_builtin(__hip_atomic_compare_exchange_strong)
|
||||
__HIP_ATOMICS_IGNORE_DENORMAL_MODE {
|
||||
double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
bool done = false;
|
||||
while (!done && value > val) {
|
||||
done = __hip_atomic_compare_exchange_strong(addr, &value, val,
|
||||
__ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
#endif
|
||||
#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
|
||||
__HIP_ATOMICS_IGNORE_DENORMAL_MODE {
|
||||
double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
bool done = false;
|
||||
while (!done && value > val) {
|
||||
done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED,
|
||||
__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
return value;
|
||||
}
|
||||
#else
|
||||
unsigned long long *uaddr = (unsigned long long *)addr;
|
||||
#else
|
||||
unsigned long long* uaddr = (unsigned long long*)addr;
|
||||
unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
|
||||
bool done = false;
|
||||
while (!done && __longlong_as_double(value) > val) {
|
||||
done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
|
||||
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
|
||||
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
|
||||
}
|
||||
return __longlong_as_double(value);
|
||||
#endif
|
||||
#if __has_builtin(__builtin_amdgcn_is_private)
|
||||
#endif
|
||||
#if __has_builtin(__builtin_amdgcn_is_private)
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
#pragma pop_macro("__HIP_ATOMICS_IGNORE_DENORMAL_MODE")
|
||||
|
||||
Το diff αρχείου καταστέλλεται επειδή είναι πολύ μεγάλο
Φόρτωση Διαφορών
@@ -39,7 +39,7 @@ THE SOFTWARE.
|
||||
#include <limits.h>
|
||||
#include <limits>
|
||||
#include <stdint.h>
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
|
||||
#pragma push_macro("__DEVICE__")
|
||||
#pragma push_macro("__RETURN_TYPE")
|
||||
@@ -50,34 +50,28 @@ THE SOFTWARE.
|
||||
// DOT FUNCTIONS
|
||||
#if defined(__clang__) && defined(__HIP__)
|
||||
__DEVICE__
|
||||
inline
|
||||
int amd_mixed_dot(short2 a, short2 b, int c, bool saturate) {
|
||||
inline int amd_mixed_dot(short2 a, short2 b, int c, bool saturate) {
|
||||
return __ockl_sdot2(get_native_vector(a), get_native_vector(b), c, saturate);
|
||||
}
|
||||
__DEVICE__
|
||||
inline
|
||||
uint amd_mixed_dot(ushort2 a, ushort2 b, uint c, bool saturate) {
|
||||
inline uint amd_mixed_dot(ushort2 a, ushort2 b, uint c, bool saturate) {
|
||||
return __ockl_udot2(get_native_vector(a), get_native_vector(b), c, saturate);
|
||||
}
|
||||
__DEVICE__
|
||||
inline
|
||||
int amd_mixed_dot(char4 a, char4 b, int c, bool saturate) {
|
||||
inline int amd_mixed_dot(char4 a, char4 b, int c, bool saturate) {
|
||||
return __ockl_sdot4(get_native_vector(a), get_native_vector(b), c, saturate);
|
||||
}
|
||||
__DEVICE__
|
||||
inline
|
||||
uint amd_mixed_dot(uchar4 a, uchar4 b, uint c, bool saturate) {
|
||||
inline uint amd_mixed_dot(uchar4 a, uchar4 b, uint c, bool saturate) {
|
||||
return __ockl_udot4(get_native_vector(a), get_native_vector(b), c, saturate);
|
||||
}
|
||||
__DEVICE__
|
||||
inline
|
||||
int amd_mixed_dot(int a, int b, int c, bool saturate) {
|
||||
return __ockl_sdot8(a, b, c, saturate);
|
||||
inline int amd_mixed_dot(int a, int b, int c, bool saturate) {
|
||||
return __ockl_sdot8(a, b, c, saturate);
|
||||
}
|
||||
__DEVICE__
|
||||
inline
|
||||
uint amd_mixed_dot(uint a, uint b, uint c, bool saturate) {
|
||||
return __ockl_udot8(a, b, c, saturate);
|
||||
inline uint amd_mixed_dot(uint a, uint b, uint c, bool saturate) {
|
||||
return __ockl_udot8(a, b, c, saturate);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@@ -38,8 +38,8 @@ THE SOFTWARE.
|
||||
#define __HOST_DEVICE__ __host__ __device__
|
||||
#endif
|
||||
|
||||
#define __HIP_SURFACE_OBJECT_PARAMETERS_INIT \
|
||||
unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)surfObj;
|
||||
#define __HIP_SURFACE_OBJECT_PARAMETERS_INIT \
|
||||
unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)surfObj;
|
||||
|
||||
/**
|
||||
* @defgroup SurfaceAPI Surface API
|
||||
@@ -48,57 +48,57 @@ THE SOFTWARE.
|
||||
|
||||
// CUDA is using byte address, need map to pixel address for HIP
|
||||
static __HOST_DEVICE__ __forceinline__ int __hipGetPixelAddr(int x, int format, int order) {
|
||||
/*
|
||||
* use below format index to generate format LUT
|
||||
typedef enum {
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8 = 0,
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16 = 1,
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8 = 2,
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16 = 3,
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT24 = 4,
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 = 5,
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 = 6,
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010 = 7,
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8 = 8,
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16 = 9,
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32 = 10,
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 = 11,
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 = 12,
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 = 13,
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT = 14,
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT = 15
|
||||
} hsa_ext_image_channel_type_t;
|
||||
*/
|
||||
static const int FormatLUT[] = { 0, 1, 0, 1, 3, 1, 1, 1, 0, 1, 2, 0, 1, 2, 1, 2 };
|
||||
x = FormatLUT[format] == 3 ? x / FormatLUT[format] : x >> FormatLUT[format];
|
||||
/*
|
||||
* use below format index to generate format LUT
|
||||
typedef enum {
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8 = 0,
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16 = 1,
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8 = 2,
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16 = 3,
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT24 = 4,
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 = 5,
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 = 6,
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010 = 7,
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8 = 8,
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16 = 9,
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32 = 10,
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 = 11,
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 = 12,
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 = 13,
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT = 14,
|
||||
HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT = 15
|
||||
} hsa_ext_image_channel_type_t;
|
||||
*/
|
||||
static const int FormatLUT[] = {0, 1, 0, 1, 3, 1, 1, 1, 0, 1, 2, 0, 1, 2, 1, 2};
|
||||
x = FormatLUT[format] == 3 ? x / FormatLUT[format] : x >> FormatLUT[format];
|
||||
|
||||
/*
|
||||
* use below order index to generate order LUT
|
||||
typedef enum {
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_A = 0,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_R = 1,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_RX = 2,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_RG = 3,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_RGX = 4,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_RA = 5,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_RGB = 6,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_RGBX = 7,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA = 8,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA = 9,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB = 10,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR = 11,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB = 12,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX = 13,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA = 14,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA = 15,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY = 16,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE = 17,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH = 18,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL = 19
|
||||
} hsa_ext_image_channel_order_t;
|
||||
*/
|
||||
static const int OrderLUT[] = { 0, 0, 1, 1, 3, 1, 3, 2, 2, 2, 2, 2, 3, 2, 2, 2, 0, 0, 0, 0 };
|
||||
return x = OrderLUT[order] == 3 ? x / OrderLUT[order] : x >> OrderLUT[order];
|
||||
/*
|
||||
* use below order index to generate order LUT
|
||||
typedef enum {
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_A = 0,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_R = 1,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_RX = 2,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_RG = 3,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_RGX = 4,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_RA = 5,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_RGB = 6,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_RGBX = 7,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA = 8,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA = 9,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB = 10,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR = 11,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB = 12,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX = 13,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA = 14,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA = 15,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY = 16,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE = 17,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH = 18,
|
||||
HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL = 19
|
||||
} hsa_ext_image_channel_order_t;
|
||||
*/
|
||||
static const int OrderLUT[] = {0, 0, 1, 1, 3, 1, 3, 2, 2, 2, 2, 2, 3, 2, 2, 2, 0, 0, 0, 0};
|
||||
return x = OrderLUT[order] == 3 ? x / OrderLUT[order] : x >> OrderLUT[order];
|
||||
}
|
||||
|
||||
/** \brief Reads the value at coordinate x from the one-dimensional surface.
|
||||
@@ -113,7 +113,7 @@ template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void surf1Dread(T* data, hipSurfaceObject_t surfObj, int x,
|
||||
int boundaryMode = hipBoundaryModeZero) {
|
||||
int boundaryMode = hipBoundaryModeZero) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT;
|
||||
(void)boundaryMode;
|
||||
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i));
|
||||
@@ -132,10 +132,10 @@ template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void surf1Dwrite(T data, hipSurfaceObject_t surfObj, int x) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i));
|
||||
auto tmp = __hipMapTo<float4::Native_vec_>(data);
|
||||
__ockl_image_store_1D(i, x, tmp);
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i));
|
||||
auto tmp = __hipMapTo<float4::Native_vec_>(data);
|
||||
__ockl_image_store_1D(i, x, tmp);
|
||||
}
|
||||
|
||||
|
||||
@@ -150,12 +150,13 @@ static __device__ __hip_img_chk__ void surf1Dwrite(T data, hipSurfaceObject_t su
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void surf2Dread(T* data, hipSurfaceObject_t surfObj, int x, int y) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
|
||||
int2 coords{x, y};
|
||||
auto tmp = __ockl_image_load_2D(i, get_native_vector(coords));
|
||||
*data = __hipMapFrom<T>(tmp);
|
||||
static __device__ __hip_img_chk__ void surf2Dread(T* data, hipSurfaceObject_t surfObj, int x,
|
||||
int y) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
|
||||
int2 coords{x, y};
|
||||
auto tmp = __ockl_image_load_2D(i, get_native_vector(coords));
|
||||
*data = __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
/** \brief Writes the value data to the two-dimensional surface at coordinate
|
||||
@@ -170,12 +171,13 @@ static __device__ __hip_img_chk__ void surf2Dread(T* data, hipSurfaceObject_t su
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void surf2Dwrite(T data, hipSurfaceObject_t surfObj, int x, int y) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
|
||||
int2 coords{x, y};
|
||||
auto tmp = __hipMapTo<float4::Native_vec_>(data);
|
||||
__ockl_image_store_2D(i, get_native_vector(coords), tmp);
|
||||
static __device__ __hip_img_chk__ void surf2Dwrite(T data, hipSurfaceObject_t surfObj, int x,
|
||||
int y) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
|
||||
int2 coords{x, y};
|
||||
auto tmp = __hipMapTo<float4::Native_vec_>(data);
|
||||
__ockl_image_store_2D(i, get_native_vector(coords), tmp);
|
||||
}
|
||||
|
||||
/** \brief Reads the value from the three-dimensional surface at coordinate
|
||||
@@ -191,12 +193,13 @@ static __device__ __hip_img_chk__ void surf2Dwrite(T data, hipSurfaceObject_t su
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void surf3Dread(T* data, hipSurfaceObject_t surfObj, int x, int y, int z) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_3D(i), __ockl_image_channel_order_3D(i));
|
||||
int4 coords{x, y, z, 0};
|
||||
auto tmp = __ockl_image_load_3D(i, get_native_vector(coords));
|
||||
*data = __hipMapFrom<T>(tmp);
|
||||
static __device__ __hip_img_chk__ void surf3Dread(T* data, hipSurfaceObject_t surfObj, int x, int y,
|
||||
int z) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_3D(i), __ockl_image_channel_order_3D(i));
|
||||
int4 coords{x, y, z, 0};
|
||||
auto tmp = __ockl_image_load_3D(i, get_native_vector(coords));
|
||||
*data = __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
/** \brief Writes the value data to the three-dimensional surface at coordinate
|
||||
@@ -212,12 +215,13 @@ static __device__ __hip_img_chk__ void surf3Dread(T* data, hipSurfaceObject_t su
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void surf3Dwrite(T data, hipSurfaceObject_t surfObj, int x, int y, int z) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_3D(i), __ockl_image_channel_order_3D(i));
|
||||
int4 coords{x, y, z, 0};
|
||||
auto tmp = __hipMapTo<float4::Native_vec_>(data);
|
||||
__ockl_image_store_3D(i, get_native_vector(coords), tmp);
|
||||
static __device__ __hip_img_chk__ void surf3Dwrite(T data, hipSurfaceObject_t surfObj, int x, int y,
|
||||
int z) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_3D(i), __ockl_image_channel_order_3D(i));
|
||||
int4 coords{x, y, z, 0};
|
||||
auto tmp = __hipMapTo<float4::Native_vec_>(data);
|
||||
__ockl_image_store_3D(i, get_native_vector(coords), tmp);
|
||||
}
|
||||
|
||||
/** \brief Reads the value from the one-dimensional layered surface at
|
||||
@@ -232,11 +236,12 @@ static __device__ __hip_img_chk__ void surf3Dwrite(T data, hipSurfaceObject_t su
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void surf1DLayeredread(T* data, hipSurfaceObject_t surfObj, int x, int layer) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i));
|
||||
auto tmp = __ockl_image_load_lod_1D(i, x, layer);
|
||||
*data = __hipMapFrom<T>(tmp);
|
||||
static __device__ __hip_img_chk__ void surf1DLayeredread(T* data, hipSurfaceObject_t surfObj, int x,
|
||||
int layer) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i));
|
||||
auto tmp = __ockl_image_load_lod_1D(i, x, layer);
|
||||
*data = __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
/** \brief Writes the value data to the one-dimensional layered surface at
|
||||
@@ -251,11 +256,12 @@ static __device__ __hip_img_chk__ void surf1DLayeredread(T* data, hipSurfaceObje
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void surf1DLayeredwrite(T data, hipSurfaceObject_t surfObj, int x, int layer) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i));
|
||||
auto tmp = __hipMapTo<float4::Native_vec_>(data);
|
||||
__ockl_image_store_lod_1D(i, x, layer, tmp);
|
||||
static __device__ __hip_img_chk__ void surf1DLayeredwrite(T data, hipSurfaceObject_t surfObj, int x,
|
||||
int layer) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i));
|
||||
auto tmp = __hipMapTo<float4::Native_vec_>(data);
|
||||
__ockl_image_store_lod_1D(i, x, layer, tmp);
|
||||
}
|
||||
|
||||
/** \brief Reads the value from the two-dimensional layered surface at
|
||||
@@ -271,12 +277,13 @@ static __device__ __hip_img_chk__ void surf1DLayeredwrite(T data, hipSurfaceObje
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void surf2DLayeredread(T* data, hipSurfaceObject_t surfObj, int x, int y, int layer) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
|
||||
int2 coords{x, y};
|
||||
auto tmp = __ockl_image_load_lod_2D(i, get_native_vector(coords), layer);
|
||||
*data = __hipMapFrom<T>(tmp);
|
||||
static __device__ __hip_img_chk__ void surf2DLayeredread(T* data, hipSurfaceObject_t surfObj, int x,
|
||||
int y, int layer) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
|
||||
int2 coords{x, y};
|
||||
auto tmp = __ockl_image_load_lod_2D(i, get_native_vector(coords), layer);
|
||||
*data = __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
/** \brief Writes the value data to the two-dimensional layered surface at
|
||||
@@ -292,12 +299,13 @@ static __device__ __hip_img_chk__ void surf2DLayeredread(T* data, hipSurfaceObje
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void surf2DLayeredwrite(T data, hipSurfaceObject_t surfObj, int x, int y, int layer) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
|
||||
int2 coords{x, y};
|
||||
auto tmp = __hipMapTo<float4::Native_vec_>(data);
|
||||
__ockl_image_store_lod_2D(i, get_native_vector(coords), layer, tmp);
|
||||
static __device__ __hip_img_chk__ void surf2DLayeredwrite(T data, hipSurfaceObject_t surfObj, int x,
|
||||
int y, int layer) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
|
||||
int2 coords{x, y};
|
||||
auto tmp = __hipMapTo<float4::Native_vec_>(data);
|
||||
__ockl_image_store_lod_2D(i, get_native_vector(coords), layer, tmp);
|
||||
}
|
||||
|
||||
/** \brief Reads the value from the cubemap surface at coordinate x, y and
|
||||
@@ -313,12 +321,13 @@ static __device__ __hip_img_chk__ void surf2DLayeredwrite(T data, hipSurfaceObje
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void surfCubemapread(T* data, hipSurfaceObject_t surfObj, int x, int y, int face) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
|
||||
int2 coords{x, y};
|
||||
auto tmp = __ockl_image_load_CM(i, get_native_vector(coords), face);
|
||||
*data = __hipMapFrom<T>(tmp);
|
||||
static __device__ __hip_img_chk__ void surfCubemapread(T* data, hipSurfaceObject_t surfObj, int x,
|
||||
int y, int face) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
|
||||
int2 coords{x, y};
|
||||
auto tmp = __ockl_image_load_CM(i, get_native_vector(coords), face);
|
||||
*data = __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
/** \brief Writes the value data to the cubemap surface at coordinate x, y and
|
||||
@@ -334,12 +343,13 @@ static __device__ __hip_img_chk__ void surfCubemapread(T* data, hipSurfaceObject
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void surfCubemapwrite(T data, hipSurfaceObject_t surfObj, int x, int y, int face) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
|
||||
int2 coords{x, y};
|
||||
auto tmp = __hipMapTo<float4::Native_vec_>(data);
|
||||
__ockl_image_store_CM(i, get_native_vector(coords), face, tmp);
|
||||
static __device__ __hip_img_chk__ void surfCubemapwrite(T data, hipSurfaceObject_t surfObj, int x,
|
||||
int y, int face) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
|
||||
int2 coords{x, y};
|
||||
auto tmp = __hipMapTo<float4::Native_vec_>(data);
|
||||
__ockl_image_store_CM(i, get_native_vector(coords), face, tmp);
|
||||
}
|
||||
|
||||
/** \brief Reads the value from the layered cubemap surface at coordinate x, y
|
||||
@@ -356,13 +366,13 @@ static __device__ __hip_img_chk__ void surfCubemapwrite(T data, hipSurfaceObject
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void surfCubemapLayeredread(T* data, hipSurfaceObject_t surfObj, int x, int y, int face,
|
||||
int layer) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
|
||||
int2 coords{x, y};
|
||||
auto tmp = __ockl_image_load_lod_CM(i, get_native_vector(coords), face, layer);
|
||||
*data = __hipMapFrom<T>(tmp);
|
||||
static __device__ __hip_img_chk__ void surfCubemapLayeredread(T* data, hipSurfaceObject_t surfObj,
|
||||
int x, int y, int face, int layer) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
|
||||
int2 coords{x, y};
|
||||
auto tmp = __ockl_image_load_lod_CM(i, get_native_vector(coords), face, layer);
|
||||
*data = __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
/** \brief Writes the value data to the layered cubemap surface at coordinate
|
||||
@@ -379,19 +389,19 @@ static __device__ __hip_img_chk__ void surfCubemapLayeredread(T* data, hipSurfac
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void surfCubemapLayeredwrite(T* data, hipSurfaceObject_t surfObj, int x, int y, int face,
|
||||
int layer) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
|
||||
int2 coords{x, y};
|
||||
auto tmp = __hipMapTo<float4::Native_vec_>(data);
|
||||
__ockl_image_store_lod_CM(i, get_native_vector(coords), face, layer, tmp);
|
||||
static __device__ __hip_img_chk__ void surfCubemapLayeredwrite(T* data, hipSurfaceObject_t surfObj,
|
||||
int x, int y, int face, int layer) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
|
||||
int2 coords{x, y};
|
||||
auto tmp = __hipMapTo<float4::Native_vec_>(data);
|
||||
__ockl_image_store_lod_CM(i, get_native_vector(coords), face, layer, tmp);
|
||||
}
|
||||
|
||||
// Doxygen end group SurfaceAPI
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
* @}
|
||||
*/
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@@ -25,7 +25,7 @@ THE SOFTWARE.
|
||||
|
||||
#if !defined(__HIPCC_RTC__)
|
||||
#include "device_library_decls.h" // ockl warp functions
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
|
||||
#if defined(__has_attribute) && __has_attribute(maybe_undef)
|
||||
#define MAYBE_UNDEF __attribute__((maybe_undef))
|
||||
@@ -34,519 +34,570 @@ THE SOFTWARE.
|
||||
#endif
|
||||
|
||||
// Runs __builtin_amdgcn_ds_bpermute on an unsigned value: the int view of the
// value is passed to the builtin and the unsigned view of the result is returned.
__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
  union Word {
    int i;
    unsigned u;
    float f;
  };
  Word in;
  in.u = src;
  Word out;
  out.i = __builtin_amdgcn_ds_bpermute(index, in.i);
  return out.u;
}
|
||||
|
||||
// Float flavour of __hip_ds_bpermute: the int view of `src` is passed to
// __builtin_amdgcn_ds_bpermute and the float view of the result is returned.
__device__ static inline float __hip_ds_bpermutef(int index, float src) {
  union Word {
    int i;
    unsigned u;
    float f;
  };
  Word in;
  in.f = src;
  Word out;
  out.i = __builtin_amdgcn_ds_bpermute(index, in.i);
  return out.f;
}
|
||||
|
||||
// Runs __builtin_amdgcn_ds_permute on an unsigned value: the int view of the
// value is passed to the builtin and the unsigned view of the result is returned.
__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
  union Word {
    int i;
    unsigned u;
    float f;
  };
  Word in;
  in.u = src;
  Word out;
  out.i = __builtin_amdgcn_ds_permute(index, in.i);
  return out.u;
}
|
||||
|
||||
// Float flavour of __hip_ds_permute: the int view of `src` is passed to
// __builtin_amdgcn_ds_permute and the float view of the result is returned.
__device__ static inline float __hip_ds_permutef(int index, float src) {
  union Word {
    int i;
    unsigned u;
    float f;
  };
  Word in;
  in.f = src;
  Word out;
  out.i = __builtin_amdgcn_ds_permute(index, in.i);
  return out.f;
}
|
||||
|
||||
// Convenience macros that forward `pattern` as the template argument of
// __hip_ds_swizzle_N / __hip_ds_swizzlef_N and `src` as the call argument.
#define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
|
||||
#define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
|
||||
#define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))
|
||||
|
||||
template <int pattern>
|
||||
__device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
|
||||
union { int i; unsigned u; float f; } tmp; tmp.u = src;
|
||||
tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
|
||||
return tmp.u;
|
||||
template <int pattern> __device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
|
||||
union {
|
||||
int i;
|
||||
unsigned u;
|
||||
float f;
|
||||
} tmp;
|
||||
tmp.u = src;
|
||||
tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
|
||||
return tmp.u;
|
||||
}
|
||||
|
||||
template <int pattern>
|
||||
__device__ static inline float __hip_ds_swizzlef_N(float src) {
|
||||
union { int i; unsigned u; float f; } tmp; tmp.f = src;
|
||||
tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
|
||||
return tmp.f;
|
||||
template <int pattern> __device__ static inline float __hip_ds_swizzlef_N(float src) {
|
||||
union {
|
||||
int i;
|
||||
unsigned u;
|
||||
float f;
|
||||
} tmp;
|
||||
tmp.f = src;
|
||||
tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
|
||||
return tmp.f;
|
||||
}
|
||||
|
||||
// Convenience macro that forwards the control and mask arguments as the
// template arguments of __hip_move_dpp_N and `src` as the call argument.
#define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
|
||||
#define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
|
||||
__hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))
|
||||
|
||||
template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
|
||||
__device__ static inline int __hip_move_dpp_N(int src) {
|
||||
return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
|
||||
bound_ctrl);
|
||||
return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
|
||||
}
|
||||
|
||||
// `warpSize` is an object of an unnamed final struct whose only member is a
// conversion to int; each conversion yields __builtin_amdgcn_wavefrontsize().
inline __device__ const struct final {
  __device__ __attribute__((always_inline, const)) operator int() const noexcept {
    const int wavefront_size = __builtin_amdgcn_wavefrontsize();
    return wavefront_size;
  }
} warpSize{};
|
||||
|
||||
// warp vote function __all __any __ballot
|
||||
// __all: forwards `predicate` to __ockl_wfall_i32 and returns its result.
__device__ inline int __all(int predicate) {
  return __ockl_wfall_i32(predicate);
}

// __any: forwards `predicate` to __ockl_wfany_i32 and returns its result.
__device__ inline int __any(int predicate) {
  return __ockl_wfany_i32(predicate);
}

// __ballot: returns the result of __builtin_amdgcn_ballot_w64 for `predicate`.
__device__ inline unsigned long long int __ballot(int predicate) {
  return __builtin_amdgcn_ballot_w64(predicate);
}

// __ballot64: alias that simply delegates to __ballot.
__device__ inline unsigned long long int __ballot64(int predicate) {
  return __ballot(predicate);
}
|
||||
|
||||
// See amd_warp_sync_functions.h for an explanation of this preprocessor flag.
|
||||
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
|
||||
// Since threads in a wave do not make independent progress, __activemask()
|
||||
// always returns the exact active mask, i.e, all active threads in the wave.
|
||||
// Implemented as a ballot over an always-true predicate.
__device__ inline unsigned long long __activemask() {
  return __ballot(true);
}
|
||||
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
|
||||
|
||||
// Computes the calling lane's id from the mbcnt builtins: mbcnt_lo alone when
// warpSize is 32, otherwise mbcnt_hi applied on top of the mbcnt_lo result.
__device__ static inline unsigned int __lane_id() {
  const auto low_count = __builtin_amdgcn_mbcnt_lo(-1, 0);
  if (static_cast<int>(warpSize) == 32) {
    return low_count;
  }
  return __builtin_amdgcn_mbcnt_hi(-1, low_count);
}
|
||||
|
||||
// int shuffle: reads `var` from the lane selected by `src_lane` within the
// caller's width-sized group of lanes.
__device__ inline int __shfl(MAYBE_UNDEF int var, int src_lane, int width = warpSize) {
  const int lane = __lane_id();
  const int group_mask = width - 1;
  // Offset inside the group comes from src_lane, the group base from the caller's own lane.
  const int target = (src_lane & group_mask) + (lane & ~group_mask);
  // The index is shifted left by two before being handed to ds_bpermute.
  return __builtin_amdgcn_ds_bpermute(target << 2, var);
}
|
||||
// unsigned int shuffle: reinterprets the value as int, delegates to the int
// overload and returns the unsigned view of the result.
__device__ inline unsigned int __shfl(MAYBE_UNDEF unsigned int var, int src_lane,
                                      int width = warpSize) {
  union Word {
    int i;
    unsigned u;
    float f;
  };
  Word in;
  in.u = var;
  Word out;
  out.i = __shfl(in.i, src_lane, width);
  return out.u;
}
|
||||
// float shuffle: reinterprets the value as int, delegates to the int overload
// and returns the float view of the result.
__device__ inline float __shfl(MAYBE_UNDEF float var, int src_lane, int width = warpSize) {
  union Word {
    int i;
    unsigned u;
    float f;
  };
  Word in;
  in.f = var;
  Word out;
  out.i = __shfl(in.i, src_lane, width);
  return out.f;
}
|
||||
// double shuffle: splits the value into two ints, shuffles each with the int
// overload, then reassembles the 64-bit pattern.
__device__ inline double __shfl(MAYBE_UNDEF double var, int src_lane, int width = warpSize) {
  static_assert(sizeof(double) == 2 * sizeof(int), "");
  static_assert(sizeof(double) == sizeof(__hip_uint64_t), "");

  int halves[2];
  __builtin_memcpy(halves, &var, sizeof(halves));
  const int lo = __shfl(halves[0], src_lane, width);
  const int hi = __shfl(halves[1], src_lane, width);

  // halves[1] supplies bits 32..63, halves[0] the low 32 bits.
  const __hip_uint64_t packed =
      (static_cast<__hip_uint64_t>(hi) << 32ull) | static_cast<__hip_uint32_t>(lo);
  double result;
  __builtin_memcpy(&result, &packed, sizeof(packed));
  return result;
}
|
||||
// long shuffle: uses the int overload directly when _MSC_VER is defined (long
// has the size of int there); otherwise shuffles the two int halves.
__device__ inline long __shfl(MAYBE_UNDEF long var, int src_lane, int width = warpSize) {
#ifdef _MSC_VER
  static_assert(sizeof(long) == sizeof(int), "");
  return static_cast<long>(__shfl(static_cast<int>(var), src_lane, width));
#else
  static_assert(sizeof(long) == 2 * sizeof(int), "");
  static_assert(sizeof(long) == sizeof(__hip_uint64_t), "");

  int halves[2];
  __builtin_memcpy(halves, &var, sizeof(halves));
  const int lo = __shfl(halves[0], src_lane, width);
  const int hi = __shfl(halves[1], src_lane, width);

  // halves[1] supplies bits 32..63, halves[0] the low 32 bits.
  const __hip_uint64_t packed =
      (static_cast<__hip_uint64_t>(hi) << 32ull) | static_cast<__hip_uint32_t>(lo);
  long result;
  __builtin_memcpy(&result, &packed, sizeof(packed));
  return result;
#endif
}
|
||||
// unsigned long shuffle: uses the unsigned int overload directly when _MSC_VER
// is defined; otherwise shuffles the two unsigned int halves.
__device__ inline unsigned long __shfl(MAYBE_UNDEF unsigned long var, int src_lane,
                                       int width = warpSize) {
#ifdef _MSC_VER
  static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
  return static_cast<unsigned long>(__shfl(static_cast<unsigned int>(var), src_lane, width));
#else
  static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
  static_assert(sizeof(unsigned long) == sizeof(__hip_uint64_t), "");

  unsigned int halves[2];
  __builtin_memcpy(halves, &var, sizeof(halves));
  const unsigned int lo = __shfl(halves[0], src_lane, width);
  const unsigned int hi = __shfl(halves[1], src_lane, width);

  // halves[1] supplies bits 32..63, halves[0] the low 32 bits.
  const __hip_uint64_t packed =
      (static_cast<__hip_uint64_t>(hi) << 32ull) | static_cast<__hip_uint32_t>(lo);
  unsigned long result;
  __builtin_memcpy(&result, &packed, sizeof(packed));
  return result;
#endif
}
|
||||
// long long shuffle: splits the value into two ints, shuffles each with the
// int overload, then reassembles the 64-bit pattern.
__device__ inline long long __shfl(MAYBE_UNDEF long long var, int src_lane, int width = warpSize) {
  static_assert(sizeof(long long) == 2 * sizeof(int), "");
  static_assert(sizeof(long long) == sizeof(__hip_uint64_t), "");

  int halves[2];
  __builtin_memcpy(halves, &var, sizeof(halves));
  const int lo = __shfl(halves[0], src_lane, width);
  const int hi = __shfl(halves[1], src_lane, width);

  // halves[1] supplies bits 32..63, halves[0] the low 32 bits.
  const __hip_uint64_t packed =
      (static_cast<__hip_uint64_t>(hi) << 32ull) | static_cast<__hip_uint32_t>(lo);
  long long result;
  __builtin_memcpy(&result, &packed, sizeof(packed));
  return result;
}
|
||||
// unsigned long long shuffle: splits the value into two unsigned ints, shuffles
// each with the unsigned int overload, then reassembles the 64-bit pattern.
__device__ inline unsigned long long __shfl(MAYBE_UNDEF unsigned long long var, int src_lane,
                                            int width = warpSize) {
  static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
  static_assert(sizeof(unsigned long long) == sizeof(__hip_uint64_t), "");

  unsigned int halves[2];
  __builtin_memcpy(halves, &var, sizeof(halves));
  const unsigned int lo = __shfl(halves[0], src_lane, width);
  const unsigned int hi = __shfl(halves[1], src_lane, width);

  // halves[1] supplies bits 32..63, halves[0] the low 32 bits.
  const __hip_uint64_t packed =
      (static_cast<__hip_uint64_t>(hi) << 32ull) | static_cast<__hip_uint32_t>(lo);
  unsigned long long result;
  __builtin_memcpy(&result, &packed, sizeof(packed));
  return result;
}
|
||||
|
||||
// int shuffle-up: reads `var` from the lane `lane_delta` below the caller; if
// that lane falls below the start of the caller's width-sized group, the
// caller reads its own lane instead.
__device__ inline int __shfl_up(MAYBE_UNDEF int var, unsigned int lane_delta,
                                int width = warpSize) {
  const int lane = __lane_id();
  const int group_base = lane & ~(width - 1);
  int source = lane - lane_delta;
  if (source < group_base) {
    source = lane;
  }
  // The index is shifted left by two before being handed to ds_bpermute.
  return __builtin_amdgcn_ds_bpermute(source << 2, var);
}
|
||||
// unsigned int shuffle-up: reinterprets the value as int, delegates to the int
// overload and returns the unsigned view of the result.
__device__ inline unsigned int __shfl_up(MAYBE_UNDEF unsigned int var, unsigned int lane_delta,
                                         int width = warpSize) {
  union Word {
    int i;
    unsigned u;
    float f;
  };
  Word in;
  in.u = var;
  Word out;
  out.i = __shfl_up(in.i, lane_delta, width);
  return out.u;
}
|
||||
// float shuffle-up: reinterprets the value as int, delegates to the int
// overload and returns the float view of the result.
__device__ inline float __shfl_up(MAYBE_UNDEF float var, unsigned int lane_delta,
                                  int width = warpSize) {
  union Word {
    int i;
    unsigned u;
    float f;
  };
  Word in;
  in.f = var;
  Word out;
  out.i = __shfl_up(in.i, lane_delta, width);
  return out.f;
}
|
||||
// double shuffle-up: splits the value into two ints, shuffles each with the
// int overload, then reassembles the 64-bit pattern.
__device__ inline double __shfl_up(MAYBE_UNDEF double var, unsigned int lane_delta,
                                   int width = warpSize) {
  static_assert(sizeof(double) == 2 * sizeof(int), "");
  static_assert(sizeof(double) == sizeof(__hip_uint64_t), "");

  int halves[2];
  __builtin_memcpy(halves, &var, sizeof(halves));
  const int lo = __shfl_up(halves[0], lane_delta, width);
  const int hi = __shfl_up(halves[1], lane_delta, width);

  // halves[1] supplies bits 32..63, halves[0] the low 32 bits.
  const __hip_uint64_t packed =
      (static_cast<__hip_uint64_t>(hi) << 32ull) | static_cast<__hip_uint32_t>(lo);
  double result;
  __builtin_memcpy(&result, &packed, sizeof(packed));
  return result;
}
|
||||
// long shuffle-up: uses the int overload directly when _MSC_VER is defined
// (long has the size of int there); otherwise shuffles the two int halves.
__device__ inline long __shfl_up(MAYBE_UNDEF long var, unsigned int lane_delta,
                                 int width = warpSize) {
#ifdef _MSC_VER
  static_assert(sizeof(long) == sizeof(int), "");
  return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
#else
  static_assert(sizeof(long) == 2 * sizeof(int), "");
  static_assert(sizeof(long) == sizeof(__hip_uint64_t), "");

  int halves[2];
  __builtin_memcpy(halves, &var, sizeof(halves));
  const int lo = __shfl_up(halves[0], lane_delta, width);
  const int hi = __shfl_up(halves[1], lane_delta, width);

  // halves[1] supplies bits 32..63, halves[0] the low 32 bits.
  const __hip_uint64_t packed =
      (static_cast<__hip_uint64_t>(hi) << 32ull) | static_cast<__hip_uint32_t>(lo);
  long result;
  __builtin_memcpy(&result, &packed, sizeof(packed));
  return result;
#endif
}
|
||||
|
||||
// unsigned long shuffle-up: uses the unsigned int overload directly when
// _MSC_VER is defined; otherwise shuffles the two unsigned int halves.
__device__ inline unsigned long __shfl_up(MAYBE_UNDEF unsigned long var, unsigned int lane_delta,
                                          int width = warpSize) {
#ifdef _MSC_VER
  static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
  return static_cast<unsigned long>(__shfl_up(static_cast<unsigned int>(var), lane_delta, width));
#else
  static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
  static_assert(sizeof(unsigned long) == sizeof(__hip_uint64_t), "");

  unsigned int halves[2];
  __builtin_memcpy(halves, &var, sizeof(halves));
  const unsigned int lo = __shfl_up(halves[0], lane_delta, width);
  const unsigned int hi = __shfl_up(halves[1], lane_delta, width);

  // halves[1] supplies bits 32..63, halves[0] the low 32 bits.
  const __hip_uint64_t packed =
      (static_cast<__hip_uint64_t>(hi) << 32ull) | static_cast<__hip_uint32_t>(lo);
  unsigned long result;
  __builtin_memcpy(&result, &packed, sizeof(packed));
  return result;
#endif
}
|
||||
|
||||
// long long shuffle-up: splits the value into two ints, shuffles each with the
// int overload, then reassembles the 64-bit pattern.
__device__ inline long long __shfl_up(MAYBE_UNDEF long long var, unsigned int lane_delta,
                                      int width = warpSize) {
  static_assert(sizeof(long long) == 2 * sizeof(int), "");
  static_assert(sizeof(long long) == sizeof(__hip_uint64_t), "");
  int halves[2];
  __builtin_memcpy(halves, &var, sizeof(halves));
  const int lo = __shfl_up(halves[0], lane_delta, width);
  const int hi = __shfl_up(halves[1], lane_delta, width);
  // halves[1] supplies bits 32..63, halves[0] the low 32 bits.
  const __hip_uint64_t packed =
      (static_cast<__hip_uint64_t>(hi) << 32ull) | static_cast<__hip_uint32_t>(lo);
  long long result;
  __builtin_memcpy(&result, &packed, sizeof(packed));
  return result;
}
|
||||
|
||||
// unsigned long long shuffle-up: splits the value into two unsigned ints,
// shuffles each with the unsigned int overload, then reassembles the 64-bit pattern.
__device__ inline unsigned long long __shfl_up(MAYBE_UNDEF unsigned long long var,
                                               unsigned int lane_delta, int width = warpSize) {
  static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
  static_assert(sizeof(unsigned long long) == sizeof(__hip_uint64_t), "");
  unsigned int halves[2];
  __builtin_memcpy(halves, &var, sizeof(halves));
  const unsigned int lo = __shfl_up(halves[0], lane_delta, width);
  const unsigned int hi = __shfl_up(halves[1], lane_delta, width);
  // halves[1] supplies bits 32..63, halves[0] the low 32 bits.
  const __hip_uint64_t packed =
      (static_cast<__hip_uint64_t>(hi) << 32ull) | static_cast<__hip_uint32_t>(lo);
  unsigned long long result;
  __builtin_memcpy(&result, &packed, sizeof(packed));
  return result;
}
|
||||
|
||||
// int shuffle-down: reads `var` from the lane `lane_delta` above the caller; if
// the caller's offset within its width-sized group plus `lane_delta` reaches
// `width`, the caller reads its own lane instead.
__device__ inline int __shfl_down(MAYBE_UNDEF int var, unsigned int lane_delta,
                                  int width = warpSize) {
  const int lane = __lane_id();
  // The sum is computed in unsigned arithmetic and then converted to int.
  const int offset_in_group = static_cast<int>((lane & (width - 1)) + lane_delta);
  int source = lane + lane_delta;
  if (offset_in_group >= width) {
    source = lane;
  }
  // The index is shifted left by two before being handed to ds_bpermute.
  return __builtin_amdgcn_ds_bpermute(source << 2, var);
}
|
||||
__device__
|
||||
inline
|
||||
unsigned int __shfl_down(MAYBE_UNDEF unsigned int var, unsigned int lane_delta, int width = warpSize) {
|
||||
union { int i; unsigned u; float f; } tmp; tmp.u = var;
|
||||
tmp.i = __shfl_down(tmp.i, lane_delta, width);
|
||||
return tmp.u;
|
||||
__device__ inline unsigned int __shfl_down(MAYBE_UNDEF unsigned int var, unsigned int lane_delta,
|
||||
int width = warpSize) {
|
||||
union {
|
||||
int i;
|
||||
unsigned u;
|
||||
float f;
|
||||
} tmp;
|
||||
tmp.u = var;
|
||||
tmp.i = __shfl_down(tmp.i, lane_delta, width);
|
||||
return tmp.u;
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
float __shfl_down(MAYBE_UNDEF float var, unsigned int lane_delta, int width = warpSize) {
|
||||
union { int i; unsigned u; float f; } tmp; tmp.f = var;
|
||||
tmp.i = __shfl_down(tmp.i, lane_delta, width);
|
||||
return tmp.f;
|
||||
__device__ inline float __shfl_down(MAYBE_UNDEF float var, unsigned int lane_delta,
|
||||
int width = warpSize) {
|
||||
union {
|
||||
int i;
|
||||
unsigned u;
|
||||
float f;
|
||||
} tmp;
|
||||
tmp.f = var;
|
||||
tmp.i = __shfl_down(tmp.i, lane_delta, width);
|
||||
return tmp.f;
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
double __shfl_down(MAYBE_UNDEF double var, unsigned int lane_delta, int width = warpSize) {
|
||||
static_assert(sizeof(double) == 2 * sizeof(int), "");
|
||||
static_assert(sizeof(double) == sizeof(__hip_uint64_t), "");
|
||||
__device__ inline double __shfl_down(MAYBE_UNDEF double var, unsigned int lane_delta,
|
||||
int width = warpSize) {
|
||||
static_assert(sizeof(double) == 2 * sizeof(int), "");
|
||||
static_assert(sizeof(double) == sizeof(__hip_uint64_t), "");
|
||||
|
||||
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
|
||||
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
|
||||
int tmp[2];
|
||||
__builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
|
||||
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
|
||||
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
__hip_uint64_t tmp0 =
|
||||
(static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
double tmp1;
|
||||
__builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
long __shfl_down(MAYBE_UNDEF long var, unsigned int lane_delta, int width = warpSize)
|
||||
{
|
||||
#ifndef _MSC_VER
|
||||
static_assert(sizeof(long) == 2 * sizeof(int), "");
|
||||
static_assert(sizeof(long) == sizeof(__hip_uint64_t), "");
|
||||
__device__ inline long __shfl_down(MAYBE_UNDEF long var, unsigned int lane_delta,
|
||||
int width = warpSize) {
|
||||
#ifndef _MSC_VER
|
||||
static_assert(sizeof(long) == 2 * sizeof(int), "");
|
||||
static_assert(sizeof(long) == sizeof(__hip_uint64_t), "");
|
||||
|
||||
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
|
||||
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
|
||||
int tmp[2];
|
||||
__builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
|
||||
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
|
||||
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
#else
|
||||
static_assert(sizeof(long) == sizeof(int), "");
|
||||
return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
|
||||
#endif
|
||||
__hip_uint64_t tmp0 =
|
||||
(static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
long tmp1;
|
||||
__builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
#else
|
||||
static_assert(sizeof(long) == sizeof(int), "");
|
||||
return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
|
||||
#endif
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned long __shfl_down(MAYBE_UNDEF unsigned long var, unsigned int lane_delta, int width = warpSize)
|
||||
{
|
||||
#ifndef _MSC_VER
|
||||
static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long) == sizeof(__hip_uint64_t), "");
|
||||
__device__ inline unsigned long __shfl_down(MAYBE_UNDEF unsigned long var, unsigned int lane_delta,
|
||||
int width = warpSize) {
|
||||
#ifndef _MSC_VER
|
||||
static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long) == sizeof(__hip_uint64_t), "");
|
||||
|
||||
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
|
||||
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
|
||||
unsigned int tmp[2];
|
||||
__builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
|
||||
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
|
||||
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
#else
|
||||
static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
|
||||
return static_cast<unsigned long>(__shfl_down(static_cast<unsigned int>(var), lane_delta, width));
|
||||
#endif
|
||||
__hip_uint64_t tmp0 =
|
||||
(static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
unsigned long tmp1;
|
||||
__builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
#else
|
||||
static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
|
||||
return static_cast<unsigned long>(__shfl_down(static_cast<unsigned int>(var), lane_delta, width));
|
||||
#endif
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
long long __shfl_down(MAYBE_UNDEF long long var, unsigned int lane_delta, int width = warpSize)
|
||||
{
|
||||
static_assert(sizeof(long long) == 2 * sizeof(int), "");
|
||||
static_assert(sizeof(long long) == sizeof(__hip_uint64_t), "");
|
||||
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
|
||||
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
__device__ inline long long __shfl_down(MAYBE_UNDEF long long var, unsigned int lane_delta,
|
||||
int width = warpSize) {
|
||||
static_assert(sizeof(long long) == 2 * sizeof(int), "");
|
||||
static_assert(sizeof(long long) == sizeof(__hip_uint64_t), "");
|
||||
int tmp[2];
|
||||
__builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
|
||||
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
|
||||
__hip_uint64_t tmp0 =
|
||||
(static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
long long tmp1;
|
||||
__builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned long long __shfl_down(MAYBE_UNDEF unsigned long long var, unsigned int lane_delta, int width = warpSize)
|
||||
{
|
||||
static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long long) == sizeof(__hip_uint64_t), "");
|
||||
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
|
||||
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
__device__ inline unsigned long long __shfl_down(MAYBE_UNDEF unsigned long long var,
|
||||
unsigned int lane_delta, int width = warpSize) {
|
||||
static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long long) == sizeof(__hip_uint64_t), "");
|
||||
unsigned int tmp[2];
|
||||
__builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
|
||||
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
|
||||
__hip_uint64_t tmp0 =
|
||||
(static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
unsigned long long tmp1;
|
||||
__builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int __shfl_xor(MAYBE_UNDEF int var, int lane_mask, int width = warpSize) {
|
||||
int self = __lane_id();
|
||||
int index = self^lane_mask;
|
||||
index = index >= ((self+width)&~(width-1))?self:index;
|
||||
return __builtin_amdgcn_ds_bpermute(index<<2, var);
|
||||
__device__ inline int __shfl_xor(MAYBE_UNDEF int var, int lane_mask, int width = warpSize) {
|
||||
int self = __lane_id();
|
||||
int index = self ^ lane_mask;
|
||||
index = index >= ((self + width) & ~(width - 1)) ? self : index;
|
||||
return __builtin_amdgcn_ds_bpermute(index << 2, var);
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned int __shfl_xor(MAYBE_UNDEF unsigned int var, int lane_mask, int width = warpSize) {
|
||||
union { int i; unsigned u; float f; } tmp; tmp.u = var;
|
||||
tmp.i = __shfl_xor(tmp.i, lane_mask, width);
|
||||
return tmp.u;
|
||||
__device__ inline unsigned int __shfl_xor(MAYBE_UNDEF unsigned int var, int lane_mask,
|
||||
int width = warpSize) {
|
||||
union {
|
||||
int i;
|
||||
unsigned u;
|
||||
float f;
|
||||
} tmp;
|
||||
tmp.u = var;
|
||||
tmp.i = __shfl_xor(tmp.i, lane_mask, width);
|
||||
return tmp.u;
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
float __shfl_xor(MAYBE_UNDEF float var, int lane_mask, int width = warpSize) {
|
||||
union { int i; unsigned u; float f; } tmp; tmp.f = var;
|
||||
tmp.i = __shfl_xor(tmp.i, lane_mask, width);
|
||||
return tmp.f;
|
||||
__device__ inline float __shfl_xor(MAYBE_UNDEF float var, int lane_mask, int width = warpSize) {
|
||||
union {
|
||||
int i;
|
||||
unsigned u;
|
||||
float f;
|
||||
} tmp;
|
||||
tmp.f = var;
|
||||
tmp.i = __shfl_xor(tmp.i, lane_mask, width);
|
||||
return tmp.f;
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
double __shfl_xor(MAYBE_UNDEF double var, int lane_mask, int width = warpSize) {
|
||||
static_assert(sizeof(double) == 2 * sizeof(int), "");
|
||||
static_assert(sizeof(double) == sizeof(__hip_uint64_t), "");
|
||||
__device__ inline double __shfl_xor(MAYBE_UNDEF double var, int lane_mask, int width = warpSize) {
|
||||
static_assert(sizeof(double) == 2 * sizeof(int), "");
|
||||
static_assert(sizeof(double) == sizeof(__hip_uint64_t), "");
|
||||
|
||||
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
|
||||
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
|
||||
int tmp[2];
|
||||
__builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
|
||||
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
|
||||
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
__hip_uint64_t tmp0 =
|
||||
(static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
double tmp1;
|
||||
__builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
long __shfl_xor(MAYBE_UNDEF long var, int lane_mask, int width = warpSize)
|
||||
{
|
||||
#ifndef _MSC_VER
|
||||
static_assert(sizeof(long) == 2 * sizeof(int), "");
|
||||
static_assert(sizeof(long) == sizeof(__hip_uint64_t), "");
|
||||
__device__ inline long __shfl_xor(MAYBE_UNDEF long var, int lane_mask, int width = warpSize) {
|
||||
#ifndef _MSC_VER
|
||||
static_assert(sizeof(long) == 2 * sizeof(int), "");
|
||||
static_assert(sizeof(long) == sizeof(__hip_uint64_t), "");
|
||||
|
||||
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
|
||||
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
|
||||
int tmp[2];
|
||||
__builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
|
||||
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
|
||||
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
#else
|
||||
static_assert(sizeof(long) == sizeof(int), "");
|
||||
return static_cast<long>(__shfl_xor(static_cast<int>(var), lane_mask, width));
|
||||
#endif
|
||||
__hip_uint64_t tmp0 =
|
||||
(static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
long tmp1;
|
||||
__builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
#else
|
||||
static_assert(sizeof(long) == sizeof(int), "");
|
||||
return static_cast<long>(__shfl_xor(static_cast<int>(var), lane_mask, width));
|
||||
#endif
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned long __shfl_xor(MAYBE_UNDEF unsigned long var, int lane_mask, int width = warpSize)
|
||||
{
|
||||
#ifndef _MSC_VER
|
||||
static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long) == sizeof(__hip_uint64_t), "");
|
||||
__device__ inline unsigned long __shfl_xor(MAYBE_UNDEF unsigned long var, int lane_mask,
|
||||
int width = warpSize) {
|
||||
#ifndef _MSC_VER
|
||||
static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long) == sizeof(__hip_uint64_t), "");
|
||||
|
||||
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
|
||||
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
|
||||
unsigned int tmp[2];
|
||||
__builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
|
||||
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
|
||||
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
#else
|
||||
static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
|
||||
return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned int>(var), lane_mask, width));
|
||||
#endif
|
||||
__hip_uint64_t tmp0 =
|
||||
(static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
unsigned long tmp1;
|
||||
__builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
#else
|
||||
static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
|
||||
return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned int>(var), lane_mask, width));
|
||||
#endif
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
long long __shfl_xor(MAYBE_UNDEF long long var, int lane_mask, int width = warpSize)
|
||||
{
|
||||
static_assert(sizeof(long long) == 2 * sizeof(int), "");
|
||||
static_assert(sizeof(long long) == sizeof(__hip_uint64_t), "");
|
||||
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
|
||||
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
__device__ inline long long __shfl_xor(MAYBE_UNDEF long long var, int lane_mask,
|
||||
int width = warpSize) {
|
||||
static_assert(sizeof(long long) == 2 * sizeof(int), "");
|
||||
static_assert(sizeof(long long) == sizeof(__hip_uint64_t), "");
|
||||
int tmp[2];
|
||||
__builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
|
||||
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
|
||||
__hip_uint64_t tmp0 =
|
||||
(static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
long long tmp1;
|
||||
__builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned long long __shfl_xor(MAYBE_UNDEF unsigned long long var, int lane_mask, int width = warpSize)
|
||||
{
|
||||
static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long long) == sizeof(__hip_uint64_t), "");
|
||||
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
|
||||
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
__device__ inline unsigned long long __shfl_xor(MAYBE_UNDEF unsigned long long var, int lane_mask,
|
||||
int width = warpSize) {
|
||||
static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long long) == sizeof(__hip_uint64_t), "");
|
||||
unsigned int tmp[2];
|
||||
__builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
|
||||
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
|
||||
__hip_uint64_t tmp0 =
|
||||
(static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
unsigned long long tmp1;
|
||||
__builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -50,37 +50,41 @@ extern "C" __device__ __attribute__((const)) unsigned int __ockl_wfred_xor_u32(u
|
||||
#ifdef HIP_ENABLE_EXTRA_WARP_SYNC_TYPES
|
||||
// this macro enable types that are not in CUDA
|
||||
extern "C" __device__ __attribute__((const)) long long __ockl_wfred_add_i64(long long);
|
||||
extern "C" __device__ __attribute__((const)) unsigned long long __ockl_wfred_add_u64(unsigned long long);
|
||||
extern "C" __device__ __attribute__((const)) unsigned long long __ockl_wfred_add_u64(
|
||||
unsigned long long);
|
||||
extern "C" __device__ __attribute__((const)) float __ockl_wfred_add_f32(float);
|
||||
extern "C" __device__ __attribute__((const)) double __ockl_wfred_add_f64(double);
|
||||
|
||||
extern "C" __device__ __attribute__((const)) long long __ockl_wfred_min_i64(long long);
|
||||
extern "C" __device__ __attribute__((const)) unsigned long long __ockl_wfred_min_u64(unsigned long long);
|
||||
extern "C" __device__ __attribute__((const)) unsigned long long __ockl_wfred_min_u64(
|
||||
unsigned long long);
|
||||
extern "C" __device__ __attribute__((const)) float __ockl_wfred_min_f32(float);
|
||||
extern "C" __device__ __attribute__((const)) double __ockl_wfred_min_f64(double);
|
||||
|
||||
extern "C" __device__ __attribute__((const)) long long __ockl_wfred_max_i64(long long);
|
||||
extern "C" __device__ __attribute__((const)) unsigned long long __ockl_wfred_max_u64(unsigned long long);
|
||||
extern "C" __device__ __attribute__((const)) unsigned long long __ockl_wfred_max_u64(
|
||||
unsigned long long);
|
||||
extern "C" __device__ __attribute__((const)) float __ockl_wfred_max_f32(float);
|
||||
extern "C" __device__ __attribute__((const)) double __ockl_wfred_max_f64(double);
|
||||
|
||||
extern "C" __device__ __attribute__((const)) int __ockl_wfred_and_i32(int);
|
||||
extern "C" __device__ __attribute__((const)) long long __ockl_wfred_and_i64(long long);
|
||||
extern "C" __device__ __attribute__((const)) unsigned long long __ockl_wfred_and_u64(unsigned long long);
|
||||
extern "C" __device__ __attribute__((const)) unsigned long long __ockl_wfred_and_u64(
|
||||
unsigned long long);
|
||||
|
||||
extern "C" __device__ __attribute__((const)) int __ockl_wfred_or_i32(int);
|
||||
extern "C" __device__ __attribute__((const)) long long __ockl_wfred_or_i64(long long);
|
||||
extern "C" __device__ __attribute__((const)) unsigned long long __ockl_wfred_or_u64(unsigned long long);
|
||||
extern "C" __device__ __attribute__((const)) unsigned long long __ockl_wfred_or_u64(
|
||||
unsigned long long);
|
||||
|
||||
extern "C" __device__ __attribute__((const)) int __ockl_wfred_xor_i32(int);
|
||||
extern "C" __device__ __attribute__((const)) long long __ockl_wfred_xor_i64(long long);
|
||||
extern "C" __device__ __attribute__((const)) unsigned long long __ockl_wfred_xor_u64(unsigned long long);
|
||||
extern "C" __device__ __attribute__((const)) unsigned long long __ockl_wfred_xor_u64(
|
||||
unsigned long long);
|
||||
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
__device__ inline
|
||||
T __hip_readfirstlane(T val) {
|
||||
template <typename T> __device__ inline T __hip_readfirstlane(T val) {
|
||||
// In theory, behaviour is undefined when reading from a union member other
|
||||
// than the member that was last assigned to, but it works in practice because
|
||||
// we rely on the compiler to do the reasonable thing.
|
||||
@@ -92,16 +96,15 @@ T __hip_readfirstlane(T val) {
|
||||
// NOTE: The builtin returns int, so we first cast it to unsigned int and only
|
||||
// then extend it to 64 bits.
|
||||
unsigned long long lower = (unsigned)__builtin_amdgcn_readfirstlane(u.l);
|
||||
unsigned long long upper =
|
||||
(unsigned)__builtin_amdgcn_readfirstlane(u.l >> 32);
|
||||
unsigned long long upper = (unsigned)__builtin_amdgcn_readfirstlane(u.l >> 32);
|
||||
u.l = (upper << 32) | lower;
|
||||
return u.d;
|
||||
}
|
||||
|
||||
// When compiling for wave32 mode, ignore the upper half of the 64-bit mask.
|
||||
#define __hip_adjust_mask_for_wave32(MASK) \
|
||||
do { \
|
||||
if (static_cast<int>(warpSize) == 32) MASK &= 0xFFFFFFFF; \
|
||||
#define __hip_adjust_mask_for_wave32(MASK) \
|
||||
do { \
|
||||
if (static_cast<int>(warpSize) == 32) MASK &= 0xFFFFFFFF; \
|
||||
} while (0)
|
||||
|
||||
// We use a macro to expand each builtin into a waterfall that implements the
|
||||
@@ -129,40 +132,40 @@ T __hip_readfirstlane(T val) {
|
||||
// specifies itself in the mask; that is done by the later assertion where all
|
||||
// chosen lanes must be in the chosen mask.
|
||||
|
||||
#define __hip_check_mask(MASK) \
|
||||
do { \
|
||||
__hip_assert(MASK && "mask must be non-zero"); \
|
||||
bool done = false; \
|
||||
while (__any(!done)) { \
|
||||
if (!done) { \
|
||||
auto chosen_mask = __hip_readfirstlane(MASK); \
|
||||
if (MASK == chosen_mask) { \
|
||||
__hip_assert(MASK == __ballot(true) && \
|
||||
"all threads specified in the mask" \
|
||||
" must execute the same operation with the same mask"); \
|
||||
done = true; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} while(0)
|
||||
#define __hip_check_mask(MASK) \
|
||||
do { \
|
||||
__hip_assert(MASK && "mask must be non-zero"); \
|
||||
bool done = false; \
|
||||
while (__any(!done)) { \
|
||||
if (!done) { \
|
||||
auto chosen_mask = __hip_readfirstlane(MASK); \
|
||||
if (MASK == chosen_mask) { \
|
||||
__hip_assert(MASK == __ballot(true) && \
|
||||
"all threads specified in the mask" \
|
||||
" must execute the same operation with the same mask"); \
|
||||
done = true; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define __hip_do_sync(RETVAL, FUNC, MASK, ...) \
|
||||
do { \
|
||||
__hip_assert(MASK && "mask must be non-zero"); \
|
||||
bool done = false; \
|
||||
while (__any(!done)) { \
|
||||
if (!done) { \
|
||||
auto chosen_mask = __hip_readfirstlane(MASK); \
|
||||
if (MASK == chosen_mask) { \
|
||||
__hip_assert(MASK == __ballot(true) && \
|
||||
"all threads specified in the mask" \
|
||||
" must execute the same operation with the same mask"); \
|
||||
RETVAL = FUNC(__VA_ARGS__); \
|
||||
done = true; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} while(0)
|
||||
#define __hip_do_sync(RETVAL, FUNC, MASK, ...) \
|
||||
do { \
|
||||
__hip_assert(MASK && "mask must be non-zero"); \
|
||||
bool done = false; \
|
||||
while (__any(!done)) { \
|
||||
if (!done) { \
|
||||
auto chosen_mask = __hip_readfirstlane(MASK); \
|
||||
if (MASK == chosen_mask) { \
|
||||
__hip_assert(MASK == __ballot(true) && \
|
||||
"all threads specified in the mask" \
|
||||
" must execute the same operation with the same mask"); \
|
||||
RETVAL = FUNC(__VA_ARGS__); \
|
||||
done = true; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
__device__ inline void __syncwarp() {
|
||||
__builtin_amdgcn_fence(__ATOMIC_RELEASE, "wavefront");
|
||||
@@ -181,44 +184,34 @@ template <typename MaskT> __device__ inline void __syncwarp(MaskT mask) {
|
||||
// __all_sync, __any_sync, __ballot_sync
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline
|
||||
unsigned long long __ballot_sync(MaskT mask, int predicate) {
|
||||
static_assert(
|
||||
__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
|
||||
"The mask must be a 64-bit integer. "
|
||||
"Implicitly promoting a smaller integer is almost always an error.");
|
||||
__device__ inline unsigned long long __ballot_sync(MaskT mask, int predicate) {
|
||||
static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
|
||||
"The mask must be a 64-bit integer. "
|
||||
"Implicitly promoting a smaller integer is almost always an error.");
|
||||
__hip_adjust_mask_for_wave32(mask);
|
||||
__hip_check_mask(mask);
|
||||
return __ballot(predicate) & mask;
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline
|
||||
int __all_sync(MaskT mask, int predicate) {
|
||||
static_assert(
|
||||
__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
|
||||
"The mask must be a 64-bit integer. "
|
||||
"Implicitly promoting a smaller integer is almost always an error.");
|
||||
template <typename MaskT> __device__ inline int __all_sync(MaskT mask, int predicate) {
|
||||
static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
|
||||
"The mask must be a 64-bit integer. "
|
||||
"Implicitly promoting a smaller integer is almost always an error.");
|
||||
__hip_adjust_mask_for_wave32(mask);
|
||||
return __ballot_sync(mask, predicate) == mask;
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline
|
||||
int __any_sync(MaskT mask, int predicate) {
|
||||
static_assert(
|
||||
__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
|
||||
"The mask must be a 64-bit integer. "
|
||||
"Implicitly promoting a smaller integer is almost always an error.");
|
||||
template <typename MaskT> __device__ inline int __any_sync(MaskT mask, int predicate) {
|
||||
static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
|
||||
"The mask must be a 64-bit integer. "
|
||||
"Implicitly promoting a smaller integer is almost always an error.");
|
||||
__hip_adjust_mask_for_wave32(mask);
|
||||
return __ballot_sync(mask, predicate) != 0;
|
||||
}
|
||||
|
||||
// __match_any, __match_all and sync variants
|
||||
|
||||
template <typename T>
|
||||
__device__ inline
|
||||
unsigned long long __match_any(T value) {
|
||||
template <typename T> __device__ inline unsigned long long __match_any(T value) {
|
||||
static_assert(
|
||||
(__hip_internal::is_integral<T>::value || __hip_internal::is_floating_point<T>::value) &&
|
||||
(sizeof(T) == 4 || sizeof(T) == 8),
|
||||
@@ -241,20 +234,16 @@ unsigned long long __match_any(T value) {
|
||||
}
|
||||
|
||||
template <typename MaskT, typename T>
|
||||
__device__ inline
|
||||
unsigned long long __match_any_sync(MaskT mask, T value) {
|
||||
static_assert(
|
||||
__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
|
||||
"The mask must be a 64-bit integer. "
|
||||
"Implicitly promoting a smaller integer is almost always an error.");
|
||||
__device__ inline unsigned long long __match_any_sync(MaskT mask, T value) {
|
||||
static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
|
||||
"The mask must be a 64-bit integer. "
|
||||
"Implicitly promoting a smaller integer is almost always an error.");
|
||||
__hip_adjust_mask_for_wave32(mask);
|
||||
__hip_check_mask(mask);
|
||||
return __match_any(value) & mask;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ inline
|
||||
unsigned long long __match_all(T value, int* pred) {
|
||||
template <typename T> __device__ inline unsigned long long __match_all(T value, int* pred) {
|
||||
static_assert(
|
||||
(__hip_internal::is_integral<T>::value || __hip_internal::is_floating_point<T>::value) &&
|
||||
(sizeof(T) == 4 || sizeof(T) == 8),
|
||||
@@ -271,12 +260,10 @@ unsigned long long __match_all(T value, int* pred) {
|
||||
}
|
||||
|
||||
template <typename MaskT, typename T>
|
||||
__device__ inline
|
||||
unsigned long long __match_all_sync(MaskT mask, T value, int* pred) {
|
||||
static_assert(
|
||||
__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
|
||||
"The mask must be a 64-bit integer. "
|
||||
"Implicitly promoting a smaller integer is almost always an error.");
|
||||
__device__ inline unsigned long long __match_all_sync(MaskT mask, T value, int* pred) {
|
||||
static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
|
||||
"The mask must be a 64-bit integer. "
|
||||
"Implicitly promoting a smaller integer is almost always an error.");
|
||||
MaskT retval = 0;
|
||||
__hip_adjust_mask_for_wave32(mask);
|
||||
__hip_do_sync(retval, __match_all, mask, value, pred);
|
||||
@@ -286,67 +273,53 @@ unsigned long long __match_all_sync(MaskT mask, T value, int* pred) {
|
||||
// various variants of shfl
|
||||
|
||||
template <typename MaskT, typename T>
|
||||
__device__ inline
|
||||
T __shfl_sync(MaskT mask, T var, int srcLane,
|
||||
int width = warpSize) {
|
||||
static_assert(
|
||||
__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
|
||||
"The mask must be a 64-bit integer. "
|
||||
"Implicitly promoting a smaller integer is almost always an error.");
|
||||
__device__ inline T __shfl_sync(MaskT mask, T var, int srcLane, int width = warpSize) {
|
||||
static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
|
||||
"The mask must be a 64-bit integer. "
|
||||
"Implicitly promoting a smaller integer is almost always an error.");
|
||||
__hip_adjust_mask_for_wave32(mask);
|
||||
__hip_check_mask(mask);
|
||||
return __shfl(var, srcLane, width);
|
||||
}
|
||||
|
||||
template <typename MaskT, typename T>
|
||||
__device__ inline
|
||||
T __shfl_up_sync(MaskT mask, T var, unsigned int delta,
|
||||
int width = warpSize) {
|
||||
static_assert(
|
||||
__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
|
||||
"The mask must be a 64-bit integer. "
|
||||
"Implicitly promoting a smaller integer is almost always an error.");
|
||||
__device__ inline T __shfl_up_sync(MaskT mask, T var, unsigned int delta, int width = warpSize) {
|
||||
static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
|
||||
"The mask must be a 64-bit integer. "
|
||||
"Implicitly promoting a smaller integer is almost always an error.");
|
||||
__hip_adjust_mask_for_wave32(mask);
|
||||
__hip_check_mask(mask);
|
||||
return __shfl_up(var, delta, width);
|
||||
}
|
||||
|
||||
template <typename MaskT, typename T>
|
||||
__device__ inline
|
||||
T __shfl_down_sync(MaskT mask, T var, unsigned int delta,
|
||||
int width = warpSize) {
|
||||
static_assert(
|
||||
__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
|
||||
"The mask must be a 64-bit integer. "
|
||||
"Implicitly promoting a smaller integer is almost always an error.");
|
||||
__device__ inline T __shfl_down_sync(MaskT mask, T var, unsigned int delta, int width = warpSize) {
|
||||
static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
|
||||
"The mask must be a 64-bit integer. "
|
||||
"Implicitly promoting a smaller integer is almost always an error.");
|
||||
__hip_adjust_mask_for_wave32(mask);
|
||||
__hip_check_mask(mask);
|
||||
return __shfl_down(var, delta, width);
|
||||
}
|
||||
|
||||
template <typename MaskT, typename T>
|
||||
__device__ inline
|
||||
T __shfl_xor_sync(MaskT mask, T var, int laneMask,
|
||||
int width = warpSize) {
|
||||
static_assert(
|
||||
__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
|
||||
"The mask must be a 64-bit integer. "
|
||||
"Implicitly promoting a smaller integer is almost always an error.");
|
||||
__device__ inline T __shfl_xor_sync(MaskT mask, T var, int laneMask, int width = warpSize) {
|
||||
static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
|
||||
"The mask must be a 64-bit integer. "
|
||||
"Implicitly promoting a smaller integer is almost always an error.");
|
||||
__hip_adjust_mask_for_wave32(mask);
|
||||
__hip_check_mask(mask);
|
||||
return __shfl_xor(var, laneMask, width);
|
||||
}
|
||||
|
||||
template <typename MaskT, typename T, typename BinaryOp, typename WfReduce>
|
||||
__device__ inline T __reduce_op_sync(MaskT mask, T val, BinaryOp op, WfReduce wfReduce)
|
||||
{
|
||||
__device__ inline T __reduce_op_sync(MaskT mask, T val, BinaryOp op, WfReduce wfReduce) {
|
||||
using permuteType =
|
||||
typename __hip_internal::conditional<sizeof(T) == 4 || sizeof(T) == 2, T, unsigned int>::type;
|
||||
typename __hip_internal::conditional<sizeof(T) == 4 || sizeof(T) == 2, T, unsigned int>::type;
|
||||
static constexpr auto kMaskNumBits = sizeof(MaskT) * 8;
|
||||
static_assert(
|
||||
__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
|
||||
"The mask must be a 64-bit integer. "
|
||||
"Implicitly promoting a smaller integer is almost always an error.");
|
||||
static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
|
||||
"The mask must be a 64-bit integer. "
|
||||
"Implicitly promoting a smaller integer is almost always an error.");
|
||||
__hip_adjust_mask_for_wave32(mask);
|
||||
unsigned int laneId;
|
||||
unsigned int maskIdx;
|
||||
@@ -361,9 +334,12 @@ __device__ inline T __reduce_op_sync(MaskT mask, T val, BinaryOp op, WfReduce wf
|
||||
int maskNumBits;
|
||||
int numIterations;
|
||||
// unsigned int[2] is used when T is 64-bit wide
|
||||
typename __hip_internal::conditional<sizeof(T) == 4 || sizeof(T) == 2, permuteType, permuteType[2]>::type result, permuteResult;
|
||||
typename __hip_internal::conditional<sizeof(T) == 4 || sizeof(T) == 2, permuteType,
|
||||
permuteType[2]>::type result,
|
||||
permuteResult;
|
||||
auto backwardPermute = [](int index, permuteType val) {
|
||||
if constexpr (__hip_internal::is_integral<T>::value || __hip_internal::is_same<T, double>::value)
|
||||
if constexpr (__hip_internal::is_integral<T>::value ||
|
||||
__hip_internal::is_same<T, double>::value)
|
||||
return __hip_ds_bpermute(index, val);
|
||||
else
|
||||
return __hip_ds_bpermutef(index, val);
|
||||
@@ -372,7 +348,8 @@ __device__ inline T __reduce_op_sync(MaskT mask, T val, BinaryOp op, WfReduce wf
|
||||
__hip_check_mask(mask);
|
||||
maskNumBits = __popcll(mask);
|
||||
|
||||
#ifdef __OPTIMIZE__ // at the time of this writing the ockl wfred functions do not compile when using -O0
|
||||
#ifdef __OPTIMIZE__ // at the time of this writing the ockl wfred functions do not compile when
|
||||
// using -O0
|
||||
if (maskNumBits == lastLane + 1)
|
||||
// this means the mask "does not have holes", and starts from 0; we can use a specific intrinsic
|
||||
// to calculate the aggregated result
|
||||
@@ -393,7 +370,7 @@ __device__ inline T __reduce_op_sync(MaskT mask, T val, BinaryOp op, WfReduce wf
|
||||
mask >>= laneId;
|
||||
mask >>= 1ul;
|
||||
|
||||
if constexpr(sizeof(T) == 4 || sizeof(T) == 2)
|
||||
if constexpr (sizeof(T) == 4 || sizeof(T) == 2)
|
||||
result = val;
|
||||
else
|
||||
__builtin_memcpy(&result, &val, sizeof(T));
|
||||
@@ -419,7 +396,10 @@ __device__ inline T __reduce_op_sync(MaskT mask, T val, BinaryOp op, WfReduce wf
|
||||
}
|
||||
|
||||
if constexpr (sizeof(T) == 2) {
|
||||
union { int i; T f; } tmp;
|
||||
union {
|
||||
int i;
|
||||
T f;
|
||||
} tmp;
|
||||
|
||||
tmp.f = result;
|
||||
tmp.i = __hip_ds_bpermute(nextBit << 2, tmp.i);
|
||||
@@ -438,7 +418,8 @@ __device__ inline T __reduce_op_sync(MaskT mask, T val, BinaryOp op, WfReduce wf
|
||||
result = op(result, permuteResult);
|
||||
else {
|
||||
T tmp;
|
||||
unsigned long long rhs = (static_cast<unsigned long long>(permuteResult[1]) << 32) | permuteResult[0];
|
||||
unsigned long long rhs =
|
||||
(static_cast<unsigned long long>(permuteResult[1]) << 32) | permuteResult[0];
|
||||
|
||||
__builtin_memcpy(&tmp, &result, sizeof(T));
|
||||
tmp = op(tmp, *reinterpret_cast<T*>(&rhs));
|
||||
@@ -451,7 +432,10 @@ __device__ inline T __reduce_op_sync(MaskT mask, T val, BinaryOp op, WfReduce wf
|
||||
}
|
||||
|
||||
if constexpr (sizeof(T) == 2) {
|
||||
union { int i; T f; } tmp;
|
||||
union {
|
||||
int i;
|
||||
T f;
|
||||
} tmp;
|
||||
tmp.f = result;
|
||||
tmp.i = __hip_ds_bpermute(firstLane << 2, tmp.i);
|
||||
return tmp.f;
|
||||
@@ -459,14 +443,12 @@ __device__ inline T __reduce_op_sync(MaskT mask, T val, BinaryOp op, WfReduce wf
|
||||
return backwardPermute(firstLane << 2, result);
|
||||
else {
|
||||
auto tmp = (static_cast<unsigned long long>(backwardPermute(firstLane << 2, result[1])) << 32) |
|
||||
static_cast<unsigned int>(backwardPermute(firstLane << 2, result[0]));
|
||||
static_cast<unsigned int>(backwardPermute(firstLane << 2, result[0]));
|
||||
return *reinterpret_cast<T*>(&tmp);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline int __reduce_add_sync(MaskT mask, int val)
|
||||
{
|
||||
template <typename MaskT> __device__ inline int __reduce_add_sync(MaskT mask, int val) {
|
||||
// although C++ has std::plus and other functors, we do not use them because
|
||||
// they are in the header <functional> and they were causing problem with hipRTC
|
||||
// at this time
|
||||
@@ -477,53 +459,45 @@ __device__ inline int __reduce_add_sync(MaskT mask, int val)
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline unsigned int __reduce_add_sync(MaskT mask, unsigned int val)
|
||||
{
|
||||
__device__ inline unsigned int __reduce_add_sync(MaskT mask, unsigned int val) {
|
||||
auto op = [](decltype(val)& a, decltype(val)& b) { return a + b; };
|
||||
auto wfReduce = [](decltype(val) v) { return __ockl_wfred_add_u32(v); };
|
||||
|
||||
return __reduce_op_sync(mask, val, op, wfReduce);
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline int __reduce_min_sync(MaskT mask, int val)
|
||||
{
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return rhs < lhs? rhs : lhs; };
|
||||
template <typename MaskT> __device__ inline int __reduce_min_sync(MaskT mask, int val) {
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return rhs < lhs ? rhs : lhs; };
|
||||
auto wfReduce = [](decltype(val) v) { return __ockl_wfred_min_i32(v); };
|
||||
|
||||
return __reduce_op_sync(mask, val, op, wfReduce);
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline unsigned int __reduce_min_sync(MaskT mask, unsigned int val)
|
||||
{
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return rhs < lhs? rhs : lhs; };
|
||||
__device__ inline unsigned int __reduce_min_sync(MaskT mask, unsigned int val) {
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return rhs < lhs ? rhs : lhs; };
|
||||
auto wfReduce = [](decltype(val) v) { return __ockl_wfred_min_u32(v); };
|
||||
|
||||
return __reduce_op_sync(mask, val, op, wfReduce);
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline int __reduce_max_sync(MaskT mask, int val)
|
||||
{
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs < rhs? rhs : lhs; };
|
||||
template <typename MaskT> __device__ inline int __reduce_max_sync(MaskT mask, int val) {
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs < rhs ? rhs : lhs; };
|
||||
auto wfReduce = [](decltype(val) v) { return __ockl_wfred_max_i32(v); };
|
||||
|
||||
return __reduce_op_sync(mask, val, op, wfReduce);
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline unsigned int __reduce_max_sync(MaskT mask, unsigned int val)
|
||||
{
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs < rhs? rhs : lhs; };
|
||||
__device__ inline unsigned int __reduce_max_sync(MaskT mask, unsigned int val) {
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs < rhs ? rhs : lhs; };
|
||||
auto wfReduce = [](decltype(val) v) { return __ockl_wfred_max_u32(v); };
|
||||
|
||||
return __reduce_op_sync(mask, val, op, wfReduce);
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline unsigned int __reduce_or_sync(MaskT mask, unsigned int val)
|
||||
{
|
||||
__device__ inline unsigned int __reduce_or_sync(MaskT mask, unsigned int val) {
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs || rhs; };
|
||||
auto wfReduce = [](decltype(val) v) { return __ockl_wfred_or_u32(v); };
|
||||
|
||||
@@ -531,8 +505,7 @@ __device__ inline unsigned int __reduce_or_sync(MaskT mask, unsigned int val)
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline unsigned int __reduce_and_sync(MaskT mask, unsigned int val)
|
||||
{
|
||||
__device__ inline unsigned int __reduce_and_sync(MaskT mask, unsigned int val) {
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs && rhs; };
|
||||
auto wfReduce = [](decltype(val) v) { return __ockl_wfred_and_u32(v); };
|
||||
|
||||
@@ -540,8 +513,7 @@ __device__ inline unsigned int __reduce_and_sync(MaskT mask, unsigned int val)
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline unsigned int __reduce_xor_sync(MaskT mask, unsigned int val)
|
||||
{
|
||||
__device__ inline unsigned int __reduce_xor_sync(MaskT mask, unsigned int val) {
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return (!lhs) != (!rhs) == 1; };
|
||||
auto wfReduce = [](decltype(val) v) { return __ockl_wfred_xor_u32(v); };
|
||||
|
||||
@@ -549,9 +521,7 @@ __device__ inline unsigned int __reduce_xor_sync(MaskT mask, unsigned int val)
|
||||
}
|
||||
|
||||
#ifdef HIP_ENABLE_EXTRA_WARP_SYNC_TYPES
|
||||
template <typename MaskT>
|
||||
__device__ inline long long __reduce_add_sync(MaskT mask, long long val)
|
||||
{
|
||||
template <typename MaskT> __device__ inline long long __reduce_add_sync(MaskT mask, long long val) {
|
||||
auto op = [](decltype(val)& a, decltype(val)& b) { return a + b; };
|
||||
auto wfReduce = [](decltype(val) v) { return __ockl_wfred_add_i64(v); };
|
||||
|
||||
@@ -559,116 +529,93 @@ __device__ inline long long __reduce_add_sync(MaskT mask, long long val)
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline unsigned long long __reduce_add_sync(MaskT mask, unsigned long long val)
|
||||
{
|
||||
__device__ inline unsigned long long __reduce_add_sync(MaskT mask, unsigned long long val) {
|
||||
auto op = [](decltype(val)& a, decltype(val)& b) { return a + b; };
|
||||
auto wfReduce = [](decltype(val) v) { return __ockl_wfred_add_u64(v); };
|
||||
|
||||
return __reduce_op_sync(mask, val, op, wfReduce);
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline float __reduce_add_sync(MaskT mask, float val)
|
||||
{
|
||||
template <typename MaskT> __device__ inline float __reduce_add_sync(MaskT mask, float val) {
|
||||
auto op = [](decltype(val)& a, decltype(val)& b) { return a + b; };
|
||||
auto wfReduce = [](decltype(val) v) { return __ockl_wfred_add_f32(v); };
|
||||
|
||||
return __reduce_op_sync(mask, val, op, wfReduce);
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline double __reduce_add_sync(MaskT mask, double val)
|
||||
{
|
||||
template <typename MaskT> __device__ inline double __reduce_add_sync(MaskT mask, double val) {
|
||||
auto op = [](decltype(val)& a, decltype(val)& b) { return a + b; };
|
||||
auto wfReduce = [](decltype(val) v) { return __ockl_wfred_add_f64(v); };
|
||||
|
||||
return __reduce_op_sync(mask, val, op, wfReduce);
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline long long __reduce_min_sync(MaskT mask, long long val)
|
||||
{
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return rhs < lhs? rhs : lhs; };
|
||||
template <typename MaskT> __device__ inline long long __reduce_min_sync(MaskT mask, long long val) {
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return rhs < lhs ? rhs : lhs; };
|
||||
auto wfReduce = [](decltype(val) v) { return __ockl_wfred_min_i64(v); };
|
||||
|
||||
return __reduce_op_sync(mask, val, op, wfReduce);
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline unsigned long long __reduce_min_sync(MaskT mask, unsigned long long val)
|
||||
{
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return rhs < lhs? rhs : lhs; };
|
||||
__device__ inline unsigned long long __reduce_min_sync(MaskT mask, unsigned long long val) {
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return rhs < lhs ? rhs : lhs; };
|
||||
auto wfReduce = [](decltype(val) v) { return __ockl_wfred_min_u64(v); };
|
||||
|
||||
return __reduce_op_sync(mask, val, op, wfReduce);
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline float __reduce_min_sync(MaskT mask, float val)
|
||||
{
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return rhs < lhs? rhs : lhs; };
|
||||
template <typename MaskT> __device__ inline float __reduce_min_sync(MaskT mask, float val) {
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return rhs < lhs ? rhs : lhs; };
|
||||
auto wfReduce = [](decltype(val) v) { return __ockl_wfred_min_f32(v); };
|
||||
|
||||
return __reduce_op_sync(mask, val, op, wfReduce);
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline double __reduce_min_sync(MaskT mask, double val)
|
||||
{
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return rhs < lhs? rhs : lhs; };
|
||||
template <typename MaskT> __device__ inline double __reduce_min_sync(MaskT mask, double val) {
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return rhs < lhs ? rhs : lhs; };
|
||||
auto wfReduce = [](decltype(val) v) { return __ockl_wfred_min_f64(v); };
|
||||
|
||||
return __reduce_op_sync(mask, val, op, wfReduce);
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline long long __reduce_max_sync(MaskT mask, long long val)
|
||||
{
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs < rhs? rhs : lhs; };
|
||||
template <typename MaskT> __device__ inline long long __reduce_max_sync(MaskT mask, long long val) {
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs < rhs ? rhs : lhs; };
|
||||
auto wfReduce = [](decltype(val) v) { return __ockl_wfred_max_i64(v); };
|
||||
|
||||
return __reduce_op_sync(mask, val, op, wfReduce);
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline unsigned long long __reduce_max_sync(MaskT mask, unsigned long long val)
|
||||
{
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs < rhs? rhs : lhs; };
|
||||
__device__ inline unsigned long long __reduce_max_sync(MaskT mask, unsigned long long val) {
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs < rhs ? rhs : lhs; };
|
||||
auto wfReduce = [](decltype(val) v) { return __ockl_wfred_max_u64(v); };
|
||||
|
||||
return __reduce_op_sync(mask, val, op, wfReduce);
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline float __reduce_max_sync(MaskT mask, float val)
|
||||
{
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs < rhs? rhs : lhs; };
|
||||
template <typename MaskT> __device__ inline float __reduce_max_sync(MaskT mask, float val) {
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs < rhs ? rhs : lhs; };
|
||||
auto wfReduce = [](decltype(val) v) { return __ockl_wfred_max_f32(v); };
|
||||
|
||||
return __reduce_op_sync(mask, val, op, wfReduce);
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline double __reduce_max_sync(MaskT mask, double val)
|
||||
{
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs < rhs? rhs : lhs; };
|
||||
template <typename MaskT> __device__ inline double __reduce_max_sync(MaskT mask, double val) {
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs < rhs ? rhs : lhs; };
|
||||
auto wfReduce = [](decltype(val) v) { return __ockl_wfred_max_f64(v); };
|
||||
|
||||
return __reduce_op_sync(mask, val, op, wfReduce);
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline int __reduce_and_sync(MaskT mask, int val)
|
||||
{
|
||||
template <typename MaskT> __device__ inline int __reduce_and_sync(MaskT mask, int val) {
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs && rhs; };
|
||||
auto wfReduce = [](decltype(val) v) { return __ockl_wfred_and_i32(v); };
|
||||
|
||||
return __reduce_op_sync(mask, val, op, wfReduce);
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline long long __reduce_and_sync(MaskT mask, long long val)
|
||||
{
|
||||
template <typename MaskT> __device__ inline long long __reduce_and_sync(MaskT mask, long long val) {
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs && rhs; };
|
||||
auto wfReduce = [](decltype(val) v) { return __ockl_wfred_and_i64(v); };
|
||||
|
||||
@@ -676,26 +623,21 @@ __device__ inline long long __reduce_and_sync(MaskT mask, long long val)
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline unsigned long long __reduce_and_sync(MaskT mask, unsigned long long val)
|
||||
{
|
||||
__device__ inline unsigned long long __reduce_and_sync(MaskT mask, unsigned long long val) {
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs && rhs; };
|
||||
auto wfReduce = [](decltype(val) v) { return __ockl_wfred_and_u64(v); };
|
||||
|
||||
return __reduce_op_sync(mask, val, op, wfReduce);
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline int __reduce_or_sync(MaskT mask, int val)
|
||||
{
|
||||
template <typename MaskT> __device__ inline int __reduce_or_sync(MaskT mask, int val) {
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs || rhs; };
|
||||
auto wfReduce = [](decltype(val) v) { return __ockl_wfred_or_i32(v); };
|
||||
|
||||
return __reduce_op_sync(mask, val, op, wfReduce);
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline long long __reduce_or_sync(MaskT mask, long long val)
|
||||
{
|
||||
template <typename MaskT> __device__ inline long long __reduce_or_sync(MaskT mask, long long val) {
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs || rhs; };
|
||||
auto wfReduce = [](decltype(val) v) { return __ockl_wfred_or_i64(v); };
|
||||
|
||||
@@ -703,26 +645,21 @@ __device__ inline long long __reduce_or_sync(MaskT mask, long long val)
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline unsigned long long __reduce_or_sync(MaskT mask, unsigned long long val)
|
||||
{
|
||||
__device__ inline unsigned long long __reduce_or_sync(MaskT mask, unsigned long long val) {
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs || rhs; };
|
||||
auto wfReduce = [](decltype(val) v) { return __ockl_wfred_or_u64(v); };
|
||||
|
||||
return __reduce_op_sync(mask, val, op, wfReduce);
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline int __reduce_xor_sync(MaskT mask, int val)
|
||||
{
|
||||
template <typename MaskT> __device__ inline int __reduce_xor_sync(MaskT mask, int val) {
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return (!lhs) != (!rhs) == 1; };
|
||||
auto wfReduce = [](decltype(val) v) { return __ockl_wfred_xor_i32(v); };
|
||||
|
||||
return __reduce_op_sync(mask, val, op, wfReduce);
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline long long __reduce_xor_sync(MaskT mask, long long val)
|
||||
{
|
||||
template <typename MaskT> __device__ inline long long __reduce_xor_sync(MaskT mask, long long val) {
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return (!lhs) != (!rhs) == 1; };
|
||||
auto wfReduce = [](decltype(val) v) { return __ockl_wfred_xor_i64(v); };
|
||||
|
||||
@@ -730,9 +667,8 @@ __device__ inline long long __reduce_xor_sync(MaskT mask, long long val)
|
||||
}
|
||||
|
||||
template <typename MaskT>
|
||||
__device__ inline unsigned long long __reduce_xor_sync(MaskT mask, unsigned long long val)
|
||||
{
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return (!lhs) != (!rhs)== 1; };
|
||||
__device__ inline unsigned long long __reduce_xor_sync(MaskT mask, unsigned long long val) {
|
||||
auto op = [](decltype(val) lhs, decltype(val) rhs) { return (!lhs) != (!rhs) == 1; };
|
||||
auto wfReduce = [](decltype(val) v) { return __ockl_wfred_xor_u64(v); };
|
||||
|
||||
return __reduce_op_sync(mask, val, op, wfReduce);
|
||||
@@ -743,4 +679,4 @@ __device__ inline unsigned long long __reduce_xor_sync(MaskT mask, unsigned long
|
||||
#undef __hip_adjust_mask_for_wave32
|
||||
|
||||
#endif // HIP_ENABLE_EXTRA_WARP_SYNC_TYPES
|
||||
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
|
||||
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
|
||||
|
||||
@@ -111,23 +111,24 @@ extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_and_i32(int a
|
||||
extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_or_i32(int a);
|
||||
|
||||
extern "C" __device__ __hip_uint64_t __ockl_fprintf_stderr_begin();
|
||||
extern "C" __device__ __hip_uint64_t __ockl_fprintf_append_args(__hip_uint64_t msg_desc, __hip_uint32_t num_args,
|
||||
__hip_uint64_t value0, __hip_uint64_t value1,
|
||||
__hip_uint64_t value2, __hip_uint64_t value3,
|
||||
__hip_uint64_t value4, __hip_uint64_t value5,
|
||||
__hip_uint64_t value6, __hip_uint32_t is_last);
|
||||
extern "C" __device__ __hip_uint64_t __ockl_fprintf_append_string_n(__hip_uint64_t msg_desc, const char* data,
|
||||
__hip_uint64_t length, __hip_uint32_t is_last);
|
||||
extern "C" __device__ __hip_uint64_t __ockl_fprintf_append_args(
|
||||
__hip_uint64_t msg_desc, __hip_uint32_t num_args, __hip_uint64_t value0, __hip_uint64_t value1,
|
||||
__hip_uint64_t value2, __hip_uint64_t value3, __hip_uint64_t value4, __hip_uint64_t value5,
|
||||
__hip_uint64_t value6, __hip_uint32_t is_last);
|
||||
extern "C" __device__ __hip_uint64_t __ockl_fprintf_append_string_n(__hip_uint64_t msg_desc,
|
||||
const char* data,
|
||||
__hip_uint64_t length,
|
||||
__hip_uint32_t is_last);
|
||||
|
||||
// Introduce local address space
|
||||
#define __local __attribute__((address_space(3)))
|
||||
|
||||
#ifdef __HIP_DEVICE_COMPILE__
|
||||
__device__ inline static __local void* __to_local(unsigned x) { return (__local void*)x; }
|
||||
#endif //__HIP_DEVICE_COMPILE__
|
||||
#endif //__HIP_DEVICE_COMPILE__
|
||||
|
||||
// Using hip.amdgcn.bc - sync threads
|
||||
#define __CLK_LOCAL_MEM_FENCE 0x01
|
||||
#define __CLK_LOCAL_MEM_FENCE 0x01
|
||||
typedef unsigned __cl_mem_fence_flags;
|
||||
|
||||
#endif
|
||||
|
||||
@@ -37,182 +37,144 @@ THE SOFTWARE.
|
||||
hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices,
|
||||
unsigned int flags, hip_impl::program_state& ps);
|
||||
|
||||
hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim,
|
||||
dim3 blockDim, void** args,
|
||||
size_t sharedMem, hipStream_t stream,
|
||||
hip_impl::program_state& ps);
|
||||
hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDim, void** args,
|
||||
size_t sharedMem, hipStream_t stream,
|
||||
hip_impl::program_state& ps);
|
||||
|
||||
hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
|
||||
int numDevices,
|
||||
unsigned int flags,
|
||||
hip_impl::program_state& ps);
|
||||
hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices,
|
||||
unsigned int flags, hip_impl::program_state& ps);
|
||||
|
||||
#pragma GCC visibility push(hidden)
|
||||
|
||||
namespace hip_impl {
|
||||
template <typename T, typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>
|
||||
inline T round_up_to_next_multiple_nonnegative(T x, T y) {
|
||||
T tmp = x + y - 1;
|
||||
return tmp - tmp % y;
|
||||
T tmp = x + y - 1;
|
||||
return tmp - tmp % y;
|
||||
}
|
||||
|
||||
template <
|
||||
std::size_t n,
|
||||
typename... Ts,
|
||||
typename std::enable_if<n == sizeof...(Ts)>::type* = nullptr>
|
||||
inline hip_impl::kernarg make_kernarg(
|
||||
const std::tuple<Ts...>&,
|
||||
const kernargs_size_align&,
|
||||
hip_impl::kernarg kernarg) {
|
||||
return kernarg;
|
||||
template <std::size_t n, typename... Ts,
|
||||
typename std::enable_if<n == sizeof...(Ts)>::type* = nullptr>
|
||||
inline hip_impl::kernarg make_kernarg(const std::tuple<Ts...>&, const kernargs_size_align&,
|
||||
hip_impl::kernarg kernarg) {
|
||||
return kernarg;
|
||||
}
|
||||
|
||||
template <
|
||||
std::size_t n,
|
||||
typename... Ts,
|
||||
typename std::enable_if<n != sizeof...(Ts)>::type* = nullptr>
|
||||
inline hip_impl::kernarg make_kernarg(
|
||||
const std::tuple<Ts...>& formals,
|
||||
const kernargs_size_align& size_align,
|
||||
hip_impl::kernarg kernarg) {
|
||||
using T = typename std::tuple_element<n, std::tuple<Ts...>>::type;
|
||||
template <std::size_t n, typename... Ts,
|
||||
typename std::enable_if<n != sizeof...(Ts)>::type* = nullptr>
|
||||
inline hip_impl::kernarg make_kernarg(const std::tuple<Ts...>& formals,
|
||||
const kernargs_size_align& size_align,
|
||||
hip_impl::kernarg kernarg) {
|
||||
using T = typename std::tuple_element<n, std::tuple<Ts...>>::type;
|
||||
|
||||
static_assert(
|
||||
!std::is_reference<T>{},
|
||||
"A __global__ function cannot have a reference as one of its "
|
||||
"arguments.");
|
||||
#if defined(HIP_STRICT)
|
||||
static_assert(
|
||||
std::is_trivially_copyable<T>{},
|
||||
"Only TriviallyCopyable types can be arguments to a __global__ "
|
||||
static_assert(!std::is_reference<T>{},
|
||||
"A __global__ function cannot have a reference as one of its "
|
||||
"arguments.");
|
||||
#if defined(HIP_STRICT)
|
||||
static_assert(std::is_trivially_copyable<T>{},
|
||||
"Only TriviallyCopyable types can be arguments to a __global__ "
|
||||
"function");
|
||||
#endif
|
||||
#endif
|
||||
|
||||
kernarg.resize(round_up_to_next_multiple_nonnegative(
|
||||
kernarg.size(), size_align.alignment(n)) + size_align.size(n));
|
||||
kernarg.resize(round_up_to_next_multiple_nonnegative(kernarg.size(), size_align.alignment(n)) +
|
||||
size_align.size(n));
|
||||
|
||||
std::memcpy(
|
||||
kernarg.data() + kernarg.size() - size_align.size(n),
|
||||
&std::get<n>(formals),
|
||||
size_align.size(n));
|
||||
return make_kernarg<n + 1>(formals, size_align, std::move(kernarg));
|
||||
std::memcpy(kernarg.data() + kernarg.size() - size_align.size(n), &std::get<n>(formals),
|
||||
size_align.size(n));
|
||||
return make_kernarg<n + 1>(formals, size_align, std::move(kernarg));
|
||||
}
|
||||
|
||||
template <typename... Formals, typename... Actuals>
|
||||
inline hip_impl::kernarg make_kernarg(
|
||||
void (*kernel)(Formals...), std::tuple<Actuals...> actuals) {
|
||||
static_assert(sizeof...(Formals) == sizeof...(Actuals),
|
||||
"The count of formal arguments must match the count of actuals.");
|
||||
inline hip_impl::kernarg make_kernarg(void (*kernel)(Formals...), std::tuple<Actuals...> actuals) {
|
||||
static_assert(sizeof...(Formals) == sizeof...(Actuals),
|
||||
"The count of formal arguments must match the count of actuals.");
|
||||
|
||||
if (sizeof...(Formals) == 0) return {};
|
||||
if (sizeof...(Formals) == 0) return {};
|
||||
|
||||
std::tuple<Formals...> to_formals{std::move(actuals)};
|
||||
hip_impl::kernarg kernarg;
|
||||
kernarg.reserve(sizeof(to_formals));
|
||||
std::tuple<Formals...> to_formals{std::move(actuals)};
|
||||
hip_impl::kernarg kernarg;
|
||||
kernarg.reserve(sizeof(to_formals));
|
||||
|
||||
auto& ps = hip_impl::get_program_state();
|
||||
return make_kernarg<0>(to_formals,
|
||||
ps.get_kernargs_size_align(
|
||||
reinterpret_cast<std::uintptr_t>(kernel)),
|
||||
std::move(kernarg));
|
||||
auto& ps = hip_impl::get_program_state();
|
||||
return make_kernarg<0>(to_formals,
|
||||
ps.get_kernargs_size_align(reinterpret_cast<std::uintptr_t>(kernel)),
|
||||
std::move(kernarg));
|
||||
}
|
||||
|
||||
|
||||
HIP_INTERNAL_EXPORTED_API hsa_agent_t target_agent(hipStream_t stream);
|
||||
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
void hipLaunchKernelGGLImpl(
|
||||
std::uintptr_t function_address,
|
||||
const dim3& numBlocks,
|
||||
const dim3& dimBlocks,
|
||||
std::uint32_t sharedMemBytes,
|
||||
hipStream_t stream,
|
||||
void** kernarg) {
|
||||
inline __attribute__((visibility("hidden"))) void hipLaunchKernelGGLImpl(
|
||||
std::uintptr_t function_address, const dim3& numBlocks, const dim3& dimBlocks,
|
||||
std::uint32_t sharedMemBytes, hipStream_t stream, void** kernarg) {
|
||||
const auto& kd =
|
||||
hip_impl::get_program_state().kernel_descriptor(function_address, target_agent(stream));
|
||||
|
||||
const auto& kd = hip_impl::get_program_state().kernel_descriptor(function_address,
|
||||
target_agent(stream));
|
||||
|
||||
hipModuleLaunchKernel(kd, numBlocks.x, numBlocks.y, numBlocks.z,
|
||||
dimBlocks.x, dimBlocks.y, dimBlocks.z, sharedMemBytes,
|
||||
stream, nullptr, kernarg);
|
||||
hipModuleLaunchKernel(kd, numBlocks.x, numBlocks.y, numBlocks.z, dimBlocks.x, dimBlocks.y,
|
||||
dimBlocks.z, sharedMemBytes, stream, nullptr, kernarg);
|
||||
}
|
||||
} // Namespace hip_impl.
|
||||
} // Namespace hip_impl.
|
||||
|
||||
|
||||
template <class T>
|
||||
inline
|
||||
hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
|
||||
T kernel, size_t dynSharedMemPerBlk = 0, int blockSizeLimit = 0) {
|
||||
inline hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, T kernel,
|
||||
size_t dynSharedMemPerBlk = 0,
|
||||
int blockSizeLimit = 0) {
|
||||
using namespace hip_impl;
|
||||
|
||||
using namespace hip_impl;
|
||||
hip_impl::hip_init();
|
||||
auto f = get_program_state().kernel_descriptor(reinterpret_cast<std::uintptr_t>(kernel),
|
||||
target_agent(0));
|
||||
|
||||
hip_impl::hip_init();
|
||||
auto f = get_program_state().kernel_descriptor(reinterpret_cast<std::uintptr_t>(kernel),
|
||||
target_agent(0));
|
||||
|
||||
return hipModuleOccupancyMaxPotentialBlockSize(gridSize, blockSize, f,
|
||||
dynSharedMemPerBlk, blockSizeLimit);
|
||||
return hipModuleOccupancyMaxPotentialBlockSize(gridSize, blockSize, f, dynSharedMemPerBlk,
|
||||
blockSizeLimit);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline
|
||||
hipError_t hipOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize,
|
||||
T kernel, size_t dynSharedMemPerBlk = 0, int blockSizeLimit = 0, unsigned int flags = 0 ) {
|
||||
inline hipError_t hipOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize,
|
||||
T kernel,
|
||||
size_t dynSharedMemPerBlk = 0,
|
||||
int blockSizeLimit = 0,
|
||||
unsigned int flags = 0) {
|
||||
using namespace hip_impl;
|
||||
|
||||
using namespace hip_impl;
|
||||
hip_impl::hip_init();
|
||||
if (flags != hipOccupancyDefault) return hipErrorNotSupported;
|
||||
auto f = get_program_state().kernel_descriptor(reinterpret_cast<std::uintptr_t>(kernel),
|
||||
target_agent(0));
|
||||
|
||||
hip_impl::hip_init();
|
||||
if(flags != hipOccupancyDefault) return hipErrorNotSupported;
|
||||
auto f = get_program_state().kernel_descriptor(reinterpret_cast<std::uintptr_t>(kernel),
|
||||
target_agent(0));
|
||||
|
||||
return hipModuleOccupancyMaxPotentialBlockSize(gridSize, blockSize, f,
|
||||
dynSharedMemPerBlk, blockSizeLimit);
|
||||
return hipModuleOccupancyMaxPotentialBlockSize(gridSize, blockSize, f, dynSharedMemPerBlk,
|
||||
blockSizeLimit);
|
||||
}
|
||||
|
||||
template <typename... Args, typename F = void (*)(Args...)>
|
||||
inline
|
||||
void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
|
||||
std::uint32_t sharedMemBytes, hipStream_t stream,
|
||||
Args... args) {
|
||||
hip_impl::hip_init();
|
||||
auto kernarg = hip_impl::make_kernarg(kernel, std::tuple<Args...>{std::move(args)...});
|
||||
std::size_t kernarg_size = kernarg.size();
|
||||
inline void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
|
||||
std::uint32_t sharedMemBytes, hipStream_t stream, Args... args) {
|
||||
hip_impl::hip_init();
|
||||
auto kernarg = hip_impl::make_kernarg(kernel, std::tuple<Args...>{std::move(args)...});
|
||||
std::size_t kernarg_size = kernarg.size();
|
||||
|
||||
void* config[]{
|
||||
HIP_LAUNCH_PARAM_BUFFER_POINTER,
|
||||
kernarg.data(),
|
||||
HIP_LAUNCH_PARAM_BUFFER_SIZE,
|
||||
&kernarg_size,
|
||||
HIP_LAUNCH_PARAM_END};
|
||||
void* config[]{HIP_LAUNCH_PARAM_BUFFER_POINTER, kernarg.data(), HIP_LAUNCH_PARAM_BUFFER_SIZE,
|
||||
&kernarg_size, HIP_LAUNCH_PARAM_END};
|
||||
|
||||
hip_impl::hipLaunchKernelGGLImpl(reinterpret_cast<std::uintptr_t>(kernel),
|
||||
numBlocks, dimBlocks, sharedMemBytes,
|
||||
stream, &config[0]);
|
||||
hip_impl::hipLaunchKernelGGLImpl(reinterpret_cast<std::uintptr_t>(kernel), numBlocks, dimBlocks,
|
||||
sharedMemBytes, stream, &config[0]);
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
hipError_t hipLaunchCooperativeKernel(F f, dim3 gridDim, dim3 blockDim,
|
||||
void** args, size_t sharedMem,
|
||||
hipStream_t stream) {
|
||||
hip_impl::hip_init();
|
||||
auto& ps = hip_impl::get_program_state();
|
||||
return hipLaunchCooperativeKernel(reinterpret_cast<void*>(f), gridDim,
|
||||
blockDim, args, sharedMem, stream, ps);
|
||||
inline __attribute__((visibility("hidden"))) hipError_t hipLaunchCooperativeKernel(
|
||||
F f, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, hipStream_t stream) {
|
||||
hip_impl::hip_init();
|
||||
auto& ps = hip_impl::get_program_state();
|
||||
return hipLaunchCooperativeKernel(reinterpret_cast<void*>(f), gridDim, blockDim, args, sharedMem,
|
||||
stream, ps);
|
||||
}
|
||||
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
|
||||
int numDevices,
|
||||
unsigned int flags) {
|
||||
|
||||
hip_impl::hip_init();
|
||||
auto& ps = hip_impl::get_program_state();
|
||||
return hipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags, ps);
|
||||
inline __attribute__((visibility("hidden"))) hipError_t hipLaunchCooperativeKernelMultiDevice(
|
||||
hipLaunchParams* launchParamsList, int numDevices, unsigned int flags) {
|
||||
hip_impl::hip_init();
|
||||
auto& ps = hip_impl::get_program_state();
|
||||
return hipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags, ps);
|
||||
}
|
||||
|
||||
#pragma GCC visibility pop
|
||||
|
||||
@@ -7,61 +7,59 @@
|
||||
#define GRID_LAUNCH_VERSION 20
|
||||
|
||||
// Extern definitions
|
||||
namespace hc{
|
||||
namespace hc {
|
||||
class completion_future;
|
||||
class accelerator_view;
|
||||
}
|
||||
} // namespace hc
|
||||
|
||||
|
||||
// 3 dim structure for groups and grids.
|
||||
typedef struct gl_dim3
|
||||
{
|
||||
int x,y,z;
|
||||
gl_dim3(uint32_t _x=1, uint32_t _y=1, uint32_t _z=1) : x(_x), y(_y), z(_z) {};
|
||||
typedef struct gl_dim3 {
|
||||
int x, y, z;
|
||||
gl_dim3(uint32_t _x = 1, uint32_t _y = 1, uint32_t _z = 1) : x(_x), y(_y), z(_z) {};
|
||||
} gl_dim3;
|
||||
|
||||
typedef enum gl_barrier_bit {
|
||||
barrier_bit_queue_default,
|
||||
barrier_bit_none,
|
||||
barrier_bit_wait,
|
||||
barrier_bit_queue_default,
|
||||
barrier_bit_none,
|
||||
barrier_bit_wait,
|
||||
} gl_barrier_bit;
|
||||
|
||||
|
||||
// grid_launch_parm contains information used to launch the kernel.
|
||||
typedef struct grid_launch_parm
|
||||
{
|
||||
typedef struct grid_launch_parm {
|
||||
//! Grid dimensions
|
||||
gl_dim3 grid_dim;
|
||||
gl_dim3 grid_dim;
|
||||
|
||||
//! Group dimensions
|
||||
gl_dim3 group_dim;
|
||||
gl_dim3 group_dim;
|
||||
|
||||
//! Amount of dynamic group memory to use with the kernel launch.
|
||||
//! This memory is in addition to the amount used statically in the kernel.
|
||||
unsigned int dynamic_group_mem_bytes;
|
||||
unsigned int dynamic_group_mem_bytes;
|
||||
|
||||
//! Control setting of barrier bit on per-packet basis:
|
||||
//! See gl_barrier_bit description.
|
||||
//! See gl_barrier_bit description.
|
||||
//! Placeholder, is not used to control packet dispatch yet
|
||||
enum gl_barrier_bit barrier_bit;
|
||||
|
||||
//! Value of packet fences to apply to launch.
|
||||
//! The correspond to the value of bits 9:14 in the AQL packet,
|
||||
//! see HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE and hsa_fence_scope_t.
|
||||
unsigned int launch_fence;
|
||||
unsigned int launch_fence;
|
||||
|
||||
//! Pointer to the accelerator_view where the kernel should execute.
|
||||
//! If NULL, the default view on the default accelerator is used.
|
||||
hc::accelerator_view *av;
|
||||
hc::accelerator_view* av;
|
||||
|
||||
//! Pointer to the completion_future used to track the status of the command.
|
||||
//! If NULL, the command does not write status. In this case,
|
||||
//! synchronization can be enforced with queue-level waits or
|
||||
//! If NULL, the command does not write status. In this case,
|
||||
//! synchronization can be enforced with queue-level waits or
|
||||
//! waiting on younger commands.
|
||||
hc::completion_future *cf;
|
||||
hc::completion_future* cf;
|
||||
|
||||
grid_launch_parm() = default;
|
||||
} grid_launch_parm;
|
||||
|
||||
|
||||
extern void init_grid_launch(grid_launch_parm *gl);
|
||||
extern void init_grid_launch(grid_launch_parm* gl);
|
||||
|
||||
@@ -3,14 +3,12 @@
|
||||
#include "grid_launch.h"
|
||||
#include "hc.hpp"
|
||||
|
||||
class grid_launch_parm_cxx : public grid_launch_parm
|
||||
{
|
||||
public:
|
||||
class grid_launch_parm_cxx : public grid_launch_parm {
|
||||
public:
|
||||
grid_launch_parm_cxx() = default;
|
||||
|
||||
// customized serialization: don't need av and cf in kernel
|
||||
__attribute__((annotate("serialize")))
|
||||
void __cxxamp_serialize(Kalmar::Serialize& s) const {
|
||||
__attribute__((annotate("serialize"))) void __cxxamp_serialize(Kalmar::Serialize& s) const {
|
||||
s.Append(sizeof(int), &grid_dim.x);
|
||||
s.Append(sizeof(int), &grid_dim.y);
|
||||
s.Append(sizeof(int), &grid_dim.z);
|
||||
@@ -19,12 +17,14 @@ public:
|
||||
s.Append(sizeof(int), &group_dim.z);
|
||||
}
|
||||
|
||||
__attribute__((annotate("user_deserialize")))
|
||||
grid_launch_parm_cxx(int grid_dim_x, int grid_dim_y, int grid_dim_z,
|
||||
int group_dim_x, int group_dim_y, int group_dim_z) {
|
||||
grid_dim.x = grid_dim_x;
|
||||
grid_dim.y = grid_dim_y;
|
||||
grid_dim.z = grid_dim_z;
|
||||
__attribute__((annotate("user_deserialize"))) grid_launch_parm_cxx(int grid_dim_x, int grid_dim_y,
|
||||
int grid_dim_z,
|
||||
int group_dim_x,
|
||||
int group_dim_y,
|
||||
int group_dim_z) {
|
||||
grid_dim.x = grid_dim_x;
|
||||
grid_dim.y = grid_dim_y;
|
||||
grid_dim.z = grid_dim_z;
|
||||
group_dim.x = group_dim_x;
|
||||
group_dim.y = group_dim_y;
|
||||
group_dim.z = group_dim_z;
|
||||
@@ -32,7 +32,7 @@ public:
|
||||
};
|
||||
|
||||
|
||||
extern inline void grid_launch_init(grid_launch_parm *lp) {
|
||||
extern inline void grid_launch_init(grid_launch_parm* lp) {
|
||||
lp->grid_dim.x = lp->grid_dim.y = lp->grid_dim.z = 1;
|
||||
|
||||
lp->group_dim.x = lp->group_dim.y = lp->group_dim.z = 1;
|
||||
@@ -47,4 +47,3 @@ extern inline void grid_launch_init(grid_launch_parm *lp) {
|
||||
lp->av = &av;
|
||||
lp->cf = NULL;
|
||||
}
|
||||
|
||||
|
||||
@@ -27,16 +27,16 @@ THE SOFTWARE.
|
||||
// std::false_type, std result_of and std::true_type.
|
||||
#include <utility> // For std::declval.
|
||||
|
||||
#ifdef __has_include // Check if __has_include is present
|
||||
# if __has_include(<version>) // Check for version header
|
||||
# include <version>
|
||||
# if defined(__cpp_lib_is_invocable) && !defined(HIP_HAS_INVOCABLE)
|
||||
# define HIP_HAS_INVOCABLE __cpp_lib_is_invocable
|
||||
# endif
|
||||
# if defined(__cpp_lib_result_of_sfinae) && !defined(HIP_HAS_RESULT_OF_SFINAE)
|
||||
# define HIP_HAS_RESULT_OF_SFINAE __cpp_lib_result_of_sfinae
|
||||
# endif
|
||||
# endif
|
||||
#ifdef __has_include // Check if __has_include is present
|
||||
#if __has_include(<version>) // Check for version header
|
||||
#include <version>
|
||||
#if defined(__cpp_lib_is_invocable) && !defined(HIP_HAS_INVOCABLE)
|
||||
#define HIP_HAS_INVOCABLE __cpp_lib_is_invocable
|
||||
#endif
|
||||
#if defined(__cpp_lib_result_of_sfinae) && !defined(HIP_HAS_RESULT_OF_SFINAE)
|
||||
#define HIP_HAS_RESULT_OF_SFINAE __cpp_lib_result_of_sfinae
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef HIP_HAS_INVOCABLE
|
||||
@@ -50,88 +50,80 @@ THE SOFTWARE.
|
||||
namespace std { // TODO: these should be removed as soon as possible.
|
||||
#if (__cplusplus < 201406L)
|
||||
#if (__cplusplus < 201402L)
|
||||
template <bool cond, typename T = void>
|
||||
using enable_if_t = typename enable_if<cond, T>::type;
|
||||
template <bool cond, typename T = void> using enable_if_t = typename enable_if<cond, T>::type;
|
||||
template <bool cond, typename T, typename U>
|
||||
using conditional_t = typename conditional<cond, T, U>::type;
|
||||
template <typename T>
|
||||
using decay_t = typename decay<T>::type;
|
||||
template <typename T> using decay_t = typename decay<T>::type;
|
||||
template <FunctionalProcedure F, typename... Ts>
|
||||
using result_of_t = typename result_of<F(Ts...)>::type;
|
||||
template <typename T>
|
||||
using remove_reference_t = typename remove_reference<T>::type;
|
||||
template <typename T> using remove_reference_t = typename remove_reference<T>::type;
|
||||
#endif
|
||||
#endif
|
||||
} // namespace std
|
||||
|
||||
namespace hip_impl {
|
||||
template <typename...>
|
||||
using void_t_ = void;
|
||||
template <typename...> using void_t_ = void;
|
||||
|
||||
#if HIP_HAS_INVOCABLE
|
||||
template <typename, typename = void>
|
||||
struct is_callable_impl;
|
||||
template <typename, typename = void> struct is_callable_impl;
|
||||
|
||||
template <FunctionalProcedure F, typename... Ts>
|
||||
struct is_callable_impl<F(Ts...)> : std::is_invocable<F, Ts...> {};
|
||||
#elif HIP_HAS_RESULT_OF_SFINAE
|
||||
template <typename, typename = void>
|
||||
struct is_callable_impl : std::false_type {};
|
||||
template <typename, typename = void> struct is_callable_impl : std::false_type {};
|
||||
|
||||
template <FunctionalProcedure F, typename... Ts>
|
||||
struct is_callable_impl<F(Ts...), void_t_<typename std::result_of<F(Ts...)>::type > > : std::true_type {};
|
||||
struct is_callable_impl<F(Ts...), void_t_<typename std::result_of<F(Ts...)>::type> >
|
||||
: std::true_type {};
|
||||
#else
|
||||
template <class Base, class T, class Derived>
|
||||
auto simple_invoke(T Base::*pmd, Derived&& ref)
|
||||
-> decltype(static_cast<Derived&&>(ref).*pmd);
|
||||
auto simple_invoke(T Base::* pmd, Derived&& ref) -> decltype(static_cast<Derived&&>(ref).*pmd);
|
||||
|
||||
template <class PMD, class Pointer>
|
||||
auto simple_invoke(PMD&& pmd, Pointer&& ptr)
|
||||
-> decltype((*static_cast<Pointer&&>(ptr)).*static_cast<PMD&&>(pmd));
|
||||
-> decltype((*static_cast<Pointer&&>(ptr)).*static_cast<PMD&&>(pmd));
|
||||
|
||||
template <class Base, class T, class Derived>
|
||||
auto simple_invoke(T Base::*pmd, const std::reference_wrapper<Derived>& ref)
|
||||
-> decltype(ref.get().*pmd);
|
||||
auto simple_invoke(T Base::* pmd, const std::reference_wrapper<Derived>& ref)
|
||||
-> decltype(ref.get().*pmd);
|
||||
|
||||
template <class Base, class T, class Derived, class... Args>
|
||||
auto simple_invoke(T Base::*pmf, Derived&& ref, Args&&... args)
|
||||
-> decltype((static_cast<Derived&&>(ref).*pmf)(static_cast<Args&&>(args)...));
|
||||
auto simple_invoke(T Base::* pmf, Derived&& ref, Args&&... args)
|
||||
-> decltype((static_cast<Derived&&>(ref).*pmf)(static_cast<Args&&>(args)...));
|
||||
|
||||
template <class PMF, class Pointer, class... Args>
|
||||
auto simple_invoke(PMF&& pmf, Pointer&& ptr, Args&&... args)
|
||||
-> decltype(((*static_cast<Pointer&&>(ptr)).*static_cast<PMF&&>(pmf))(static_cast<Args&&>(args)...));
|
||||
-> decltype(((*static_cast<Pointer&&>(ptr)).*
|
||||
static_cast<PMF&&>(pmf))(static_cast<Args&&>(args)...));
|
||||
|
||||
template <class Base, class T, class Derived, class... Args>
|
||||
auto simple_invoke(T Base::*pmf, const std::reference_wrapper<Derived>& ref, Args&&... args)
|
||||
-> decltype((ref.get().*pmf)(static_cast<Args&&>(args)...));
|
||||
auto simple_invoke(T Base::* pmf, const std::reference_wrapper<Derived>& ref, Args&&... args)
|
||||
-> decltype((ref.get().*pmf)(static_cast<Args&&>(args)...));
|
||||
|
||||
template<class F, class... Ts>
|
||||
auto simple_invoke(F&& f, Ts&&... xs)
|
||||
-> decltype(f(static_cast<Ts&&>(xs)...));
|
||||
template <class F, class... Ts>
|
||||
auto simple_invoke(F&& f, Ts&&... xs) -> decltype(f(static_cast<Ts&&>(xs)...));
|
||||
|
||||
template <typename, typename = void>
|
||||
struct is_callable_impl : std::false_type {};
|
||||
template <typename, typename = void> struct is_callable_impl : std::false_type {};
|
||||
|
||||
template <FunctionalProcedure F, typename... Ts>
|
||||
struct is_callable_impl<F(Ts...), void_t_<decltype(simple_invoke(std::declval<F>(), std::declval<Ts>()...))> >
|
||||
struct is_callable_impl<F(Ts...),
|
||||
void_t_<decltype(simple_invoke(std::declval<F>(), std::declval<Ts>()...))> >
|
||||
: std::true_type {};
|
||||
|
||||
#endif
|
||||
|
||||
template <typename Call>
|
||||
struct is_callable : is_callable_impl<Call> {};
|
||||
template <typename Call> struct is_callable : is_callable_impl<Call> {};
|
||||
|
||||
#define count_macro_args_impl_hip_(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, \
|
||||
_14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, \
|
||||
_26, _27, _28, _29, _30, _31, _n, ...) \
|
||||
_n
|
||||
_n
|
||||
#define count_macro_args_hip_(...) \
|
||||
count_macro_args_impl_hip_(, ##__VA_ARGS__, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, \
|
||||
19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, \
|
||||
0)
|
||||
count_macro_args_impl_hip_(, ##__VA_ARGS__, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, \
|
||||
18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
|
||||
|
||||
#define overloaded_macro_expand_hip_(macro, arg_cnt) macro##arg_cnt
|
||||
#define overload_macro_impl_hip_(macro, arg_cnt) overloaded_macro_expand_hip_(macro, arg_cnt)
|
||||
#define overload_macro_hip_(macro, ...) \
|
||||
overload_macro_impl_hip_(macro, count_macro_args_hip_(__VA_ARGS__))(__VA_ARGS__)
|
||||
overload_macro_impl_hip_(macro, count_macro_args_hip_(__VA_ARGS__))(__VA_ARGS__)
|
||||
} // namespace hip_impl
|
||||
|
||||
@@ -78,12 +78,12 @@ typedef void (*t___hipRegisterFunction)(void** modules, const void* hostFunction
|
||||
dim3* blockDim, dim3* gridDim, int* wSize);
|
||||
typedef void (*t___hipRegisterManagedVar)(void* hipModule, void** pointer, void* init_value,
|
||||
const char* name, size_t size, unsigned align);
|
||||
typedef void (*t___hipRegisterSurface)(void** modules, void* var, char* hostVar,
|
||||
char* deviceVar, int type, int ext);
|
||||
typedef void (*t___hipRegisterTexture)(void** modules, void* var, char* hostVar,
|
||||
char* deviceVar, int type, int norm, int ext);
|
||||
typedef void (*t___hipRegisterVar)(void** modules, void* var, char* hostVar,
|
||||
char* deviceVar, int ext, size_t size, int constant, int global);
|
||||
typedef void (*t___hipRegisterSurface)(void** modules, void* var, char* hostVar, char* deviceVar,
|
||||
int type, int ext);
|
||||
typedef void (*t___hipRegisterTexture)(void** modules, void* var, char* hostVar, char* deviceVar,
|
||||
int type, int norm, int ext);
|
||||
typedef void (*t___hipRegisterVar)(void** modules, void* var, char* hostVar, char* deviceVar,
|
||||
int ext, size_t size, int constant, int global);
|
||||
typedef void (*t___hipUnregisterFatBinary)(void** modules);
|
||||
|
||||
// HIP tools dispatch functions
|
||||
@@ -663,10 +663,11 @@ typedef hipError_t (*t_hipModuleLoadDataEx)(hipModule_t* module, const void* ima
|
||||
unsigned int numOptions, hipJitOption* options,
|
||||
void** optionValues);
|
||||
typedef hipError_t (*t_hipLinkAddData)(hipLinkState_t state, hipJitInputType type, void* data,
|
||||
size_t size, const char* name, unsigned int numOptions,
|
||||
hipJitOption* options, void** optionValues);
|
||||
size_t size, const char* name, unsigned int numOptions,
|
||||
hipJitOption* options, void** optionValues);
|
||||
typedef hipError_t (*t_hipLinkAddFile)(hipLinkState_t state, hipJitInputType type, const char* path,
|
||||
unsigned int numOptions, hipJitOption* options, void** optionValues);
|
||||
unsigned int numOptions, hipJitOption* options,
|
||||
void** optionValues);
|
||||
typedef hipError_t (*t_hipLinkComplete)(hipLinkState_t state, void** hipBinOut, size_t* sizeOut);
|
||||
typedef hipError_t (*t_hipLinkCreate)(unsigned int numOptions, hipJitOption* options,
|
||||
void** optionValues, hipLinkState_t* stateOut);
|
||||
@@ -934,35 +935,34 @@ typedef hipError_t (*t_hipHccModuleLaunchKernel)(hipFunction_t f, uint32_t globa
|
||||
hipEvent_t stopEvent);
|
||||
typedef int (*t_hipGetStreamDeviceId)(hipStream_t stream);
|
||||
typedef hipError_t (*t_hipDrvGraphAddMemsetNode)(hipGraphNode_t* phGraphNode, hipGraph_t hGraph,
|
||||
const hipGraphNode_t* dependencies, size_t numDependencies,
|
||||
const hipMemsetParams* memsetParams, hipCtx_t ctx);
|
||||
typedef hipError_t (*t_hipGraphAddExternalSemaphoresWaitNode)(hipGraphNode_t* pGraphNode,
|
||||
hipGraph_t graph, const hipGraphNode_t* pDependencies,
|
||||
size_t numDependencies,
|
||||
const hipExternalSemaphoreWaitNodeParams* nodeParams);
|
||||
typedef hipError_t (*t_hipGraphAddExternalSemaphoresSignalNode)(hipGraphNode_t* pGraphNode,
|
||||
hipGraph_t graph, const hipGraphNode_t* pDependencies,
|
||||
size_t numDependencies,
|
||||
const hipExternalSemaphoreSignalNodeParams* nodeParams);
|
||||
typedef hipError_t (*t_hipGraphExternalSemaphoresSignalNodeSetParams)(hipGraphNode_t hNode,
|
||||
const hipExternalSemaphoreSignalNodeParams* nodeParams);
|
||||
typedef hipError_t (*t_hipGraphExternalSemaphoresWaitNodeSetParams)(hipGraphNode_t hNode,
|
||||
const hipExternalSemaphoreWaitNodeParams* nodeParams);
|
||||
typedef hipError_t (*t_hipGraphExternalSemaphoresSignalNodeGetParams)(hipGraphNode_t hNode,
|
||||
hipExternalSemaphoreSignalNodeParams* params_out);
|
||||
typedef hipError_t (*t_hipGraphExternalSemaphoresWaitNodeGetParams)(hipGraphNode_t hNode,
|
||||
hipExternalSemaphoreWaitNodeParams* params_out);
|
||||
typedef hipError_t (*t_hipGraphExecExternalSemaphoresSignalNodeSetParams)(hipGraphExec_t hGraphExec,
|
||||
hipGraphNode_t hNode,
|
||||
const hipExternalSemaphoreSignalNodeParams* nodeParams);
|
||||
typedef hipError_t (*t_hipGraphExecExternalSemaphoresWaitNodeSetParams)(hipGraphExec_t hGraphExec,
|
||||
hipGraphNode_t hNode,
|
||||
const hipExternalSemaphoreWaitNodeParams* nodeParams);
|
||||
typedef hipError_t (*t_hipGraphAddNode)(hipGraphNode_t *pGraphNode, hipGraph_t graph,
|
||||
const hipGraphNode_t *pDependencies, size_t numDependencies,
|
||||
hipGraphNodeParams *nodeParams);
|
||||
const hipGraphNode_t* dependencies,
|
||||
size_t numDependencies,
|
||||
const hipMemsetParams* memsetParams, hipCtx_t ctx);
|
||||
typedef hipError_t (*t_hipGraphAddExternalSemaphoresWaitNode)(
|
||||
hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies,
|
||||
size_t numDependencies, const hipExternalSemaphoreWaitNodeParams* nodeParams);
|
||||
typedef hipError_t (*t_hipGraphAddExternalSemaphoresSignalNode)(
|
||||
hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies,
|
||||
size_t numDependencies, const hipExternalSemaphoreSignalNodeParams* nodeParams);
|
||||
typedef hipError_t (*t_hipGraphExternalSemaphoresSignalNodeSetParams)(
|
||||
hipGraphNode_t hNode, const hipExternalSemaphoreSignalNodeParams* nodeParams);
|
||||
typedef hipError_t (*t_hipGraphExternalSemaphoresWaitNodeSetParams)(
|
||||
hipGraphNode_t hNode, const hipExternalSemaphoreWaitNodeParams* nodeParams);
|
||||
typedef hipError_t (*t_hipGraphExternalSemaphoresSignalNodeGetParams)(
|
||||
hipGraphNode_t hNode, hipExternalSemaphoreSignalNodeParams* params_out);
|
||||
typedef hipError_t (*t_hipGraphExternalSemaphoresWaitNodeGetParams)(
|
||||
hipGraphNode_t hNode, hipExternalSemaphoreWaitNodeParams* params_out);
|
||||
typedef hipError_t (*t_hipGraphExecExternalSemaphoresSignalNodeSetParams)(
|
||||
hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
|
||||
const hipExternalSemaphoreSignalNodeParams* nodeParams);
|
||||
typedef hipError_t (*t_hipGraphExecExternalSemaphoresWaitNodeSetParams)(
|
||||
hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
|
||||
const hipExternalSemaphoreWaitNodeParams* nodeParams);
|
||||
typedef hipError_t (*t_hipGraphAddNode)(hipGraphNode_t* pGraphNode, hipGraph_t graph,
|
||||
const hipGraphNode_t* pDependencies, size_t numDependencies,
|
||||
hipGraphNodeParams* nodeParams);
|
||||
typedef hipError_t (*t_hipGraphInstantiateWithParams)(hipGraphExec_t* pGraphExec, hipGraph_t graph,
|
||||
hipGraphInstantiateParams* instantiateParams);
|
||||
hipGraphInstantiateParams* instantiateParams);
|
||||
typedef hipError_t (*t_hipExtGetLastError)();
|
||||
typedef hipError_t (*t_hipTexRefGetBorderColor)(float* pBorderColor,
|
||||
const textureReference* texRef);
|
||||
@@ -971,7 +971,8 @@ typedef hipError_t (*t_hipTexRefGetArray)(hipArray_t* pArray, const textureRefer
|
||||
typedef hipError_t (*t_hipTexRefGetBorderColor)(float* pBorderColor,
|
||||
const textureReference* texRef);
|
||||
typedef hipError_t (*t_hipTexRefGetArray)(hipArray_t* pArray, const textureReference* texRef);
|
||||
typedef hipError_t (*t_hipGetProcAddress)(const char* symbol, void** pfn, int hipVersion, uint64_t flags,
|
||||
typedef hipError_t (*t_hipGetProcAddress)(const char* symbol, void** pfn, int hipVersion,
|
||||
uint64_t flags,
|
||||
hipDriverProcAddressQueryResult* symbolStatus);
|
||||
typedef hipError_t (*t_hipStreamBeginCaptureToGraph)(hipStream_t stream, hipGraph_t graph,
|
||||
const hipGraphNode_t* dependencies,
|
||||
@@ -980,16 +981,18 @@ typedef hipError_t (*t_hipStreamBeginCaptureToGraph)(hipStream_t stream, hipGrap
|
||||
hipStreamCaptureMode mode);
|
||||
typedef hipError_t (*t_hipGetFuncBySymbol)(hipFunction_t* functionPtr, const void* symbolPtr);
|
||||
typedef hipError_t (*t_hipDrvGraphAddMemFreeNode)(hipGraphNode_t* phGraphNode, hipGraph_t hGraph,
|
||||
const hipGraphNode_t* dependencies, size_t numDependencies,
|
||||
hipDeviceptr_t dptr);
|
||||
const hipGraphNode_t* dependencies,
|
||||
size_t numDependencies, hipDeviceptr_t dptr);
|
||||
|
||||
typedef hipError_t (*t_hipDrvGraphExecMemcpyNodeSetParams)(hipGraphExec_t hGraphExec,
|
||||
hipGraphNode_t hNode, const HIP_MEMCPY3D* copyParams,
|
||||
hipCtx_t ctx);
|
||||
hipGraphNode_t hNode,
|
||||
const HIP_MEMCPY3D* copyParams,
|
||||
hipCtx_t ctx);
|
||||
|
||||
typedef hipError_t (*t_hipDrvGraphExecMemsetNodeSetParams)(hipGraphExec_t hGraphExec,
|
||||
hipGraphNode_t hNode, const hipMemsetParams* memsetParams,
|
||||
hipCtx_t ctx);
|
||||
hipGraphNode_t hNode,
|
||||
const hipMemsetParams* memsetParams,
|
||||
hipCtx_t ctx);
|
||||
typedef hipError_t (*t_hipSetValidDevices)(int* device_arr, int len);
|
||||
typedef hipError_t (*t_hipMemcpyAtoD)(hipDeviceptr_t dstDevice, hipArray_t srcArray,
|
||||
size_t srcOffset, size_t ByteCount);
|
||||
@@ -1009,26 +1012,24 @@ typedef hipError_t (*t_hipMemcpy2DArrayToArray)(hipArray_t dst, size_t wOffsetDs
|
||||
|
||||
|
||||
typedef hipError_t (*t_hipGraphExecGetFlags)(hipGraphExec_t graphExec, unsigned long long* flags);
|
||||
typedef hipError_t (*t_hipGraphNodeSetParams)(hipGraphNode_t node, hipGraphNodeParams *nodeParams);
|
||||
typedef hipError_t (*t_hipGraphNodeSetParams)(hipGraphNode_t node, hipGraphNodeParams* nodeParams);
|
||||
typedef hipError_t (*t_hipGraphExecNodeSetParams)(hipGraphExec_t graphExec, hipGraphNode_t node,
|
||||
hipGraphNodeParams* nodeParams);
|
||||
|
||||
hipGraphNodeParams* nodeParams);
|
||||
|
||||
|
||||
typedef hipError_t (*t_hipExternalMemoryGetMappedMipmappedArray)(
|
||||
hipMipmappedArray_t* mipmap, hipExternalMemory_t extMem,
|
||||
const hipExternalMemoryMipmappedArrayDesc* mipmapDesc);
|
||||
typedef hipError_t (*t_hipDrvGraphMemcpyNodeGetParams)(hipGraphNode_t hNode,
|
||||
HIP_MEMCPY3D* nodeParams);
|
||||
HIP_MEMCPY3D* nodeParams);
|
||||
|
||||
typedef hipError_t (*t_hipDrvGraphMemcpyNodeSetParams)(hipGraphNode_t hNode,
|
||||
const HIP_MEMCPY3D* nodeParams);
|
||||
const HIP_MEMCPY3D* nodeParams);
|
||||
|
||||
typedef hipError_t (*t_hipExtHostAlloc)(void **ptr, size_t size,
|
||||
unsigned int flags);
|
||||
typedef hipError_t (*t_hipExtHostAlloc)(void** ptr, size_t size, unsigned int flags);
|
||||
|
||||
typedef hipError_t (*t_hipDeviceGetTexture1DLinearMaxWidth)(size_t *maxWidthInElements,
|
||||
const hipChannelFormatDesc *fmtDesc,
|
||||
typedef hipError_t (*t_hipDeviceGetTexture1DLinearMaxWidth)(size_t* maxWidthInElements,
|
||||
const hipChannelFormatDesc* fmtDesc,
|
||||
int device);
|
||||
|
||||
typedef hipError_t (*t_hipGraphAddBatchMemOpNode)(hipGraphNode_t* phGraphNode, hipGraph_t hGraph,
|
||||
@@ -1041,7 +1042,8 @@ typedef hipError_t (*t_hipGraphBatchMemOpNodeSetParams)(hipGraphNode_t hNode,
|
||||
hipBatchMemOpNodeParams* nodeParams);
|
||||
typedef hipError_t (*t_hipGraphExecBatchMemOpNodeSetParams)(
|
||||
hipGraphExec_t hGraphExec, hipGraphNode_t hNode, const hipBatchMemOpNodeParams* nodeParams);
|
||||
typedef hipError_t (*t_hipEventRecordWithFlags)(hipEvent_t event, hipStream_t stream, unsigned int flags);
|
||||
typedef hipError_t (*t_hipEventRecordWithFlags)(hipEvent_t event, hipStream_t stream,
|
||||
unsigned int flags);
|
||||
typedef hipError_t (*t_hipLaunchKernelExC)(const hipLaunchConfig_t* config, const void* fPtr,
|
||||
void** args);
|
||||
typedef hipError_t (*t_hipDrvLaunchKernelEx)(const HIP_LAUNCH_CONFIG* config, hipFunction_t f,
|
||||
@@ -1065,18 +1067,18 @@ typedef hipError_t (*t_hipMemsetD2D32)(hipDeviceptr_t dst, size_t dstPitch, unsi
|
||||
typedef hipError_t (*t_hipMemsetD2D32Async)(hipDeviceptr_t dst, size_t dstPitch, unsigned int value,
|
||||
size_t width, size_t height, hipStream_t stream);
|
||||
typedef hipError_t (*t_hipStreamSetAttribute)(hipStream_t stream, hipStreamAttrID attr,
|
||||
const hipStreamAttrValue *value);
|
||||
const hipStreamAttrValue* value);
|
||||
typedef hipError_t (*t_hipStreamGetAttribute)(hipStream_t stream, hipStreamAttrID attr,
|
||||
hipStreamAttrValue *value_out);
|
||||
hipStreamAttrValue* value_out);
|
||||
typedef hipError_t (*t_hipModuleLoadFatBinary)(hipModule_t* module, const void* fatbin);
|
||||
typedef hipError_t (*t_hipMemcpyBatchAsync) (void **dsts, void **srcs, size_t *sizes, size_t count,
|
||||
hipMemcpyAttributes *attrs, size_t *attrsIdxs,
|
||||
size_t numAttrs, size_t *failIdx, hipStream_t stream);
|
||||
typedef hipError_t (*t_hipMemcpy3DBatchAsync) (size_t numOps, struct hipMemcpy3DBatchOp *opList,
|
||||
size_t *failIdx, unsigned long long flags,
|
||||
hipStream_t stream);
|
||||
typedef hipError_t (*t_hipMemcpy3DPeer) (hipMemcpy3DPeerParms *p);
|
||||
typedef hipError_t (*t_hipMemcpy3DPeerAsync) (hipMemcpy3DPeerParms *p, hipStream_t stream);
|
||||
typedef hipError_t (*t_hipMemcpyBatchAsync)(void** dsts, void** srcs, size_t* sizes, size_t count,
|
||||
hipMemcpyAttributes* attrs, size_t* attrsIdxs,
|
||||
size_t numAttrs, size_t* failIdx, hipStream_t stream);
|
||||
typedef hipError_t (*t_hipMemcpy3DBatchAsync)(size_t numOps, struct hipMemcpy3DBatchOp* opList,
|
||||
size_t* failIdx, unsigned long long flags,
|
||||
hipStream_t stream);
|
||||
typedef hipError_t (*t_hipMemcpy3DPeer)(hipMemcpy3DPeerParms* p);
|
||||
typedef hipError_t (*t_hipMemcpy3DPeerAsync)(hipMemcpy3DPeerParms* p, hipStream_t stream);
|
||||
|
||||
typedef hipError_t (*t_hipGetDriverEntryPoint)(const char* symbol, void** funcPtr,
|
||||
unsigned long long flags,
|
||||
@@ -1559,8 +1561,10 @@ struct HipDispatchTable {
|
||||
t_hipGraphExternalSemaphoresWaitNodeSetParams hipGraphExternalSemaphoresWaitNodeSetParams_fn;
|
||||
t_hipGraphExternalSemaphoresSignalNodeGetParams hipGraphExternalSemaphoresSignalNodeGetParams_fn;
|
||||
t_hipGraphExternalSemaphoresWaitNodeGetParams hipGraphExternalSemaphoresWaitNodeGetParams_fn;
|
||||
t_hipGraphExecExternalSemaphoresSignalNodeSetParams hipGraphExecExternalSemaphoresSignalNodeSetParams_fn;
|
||||
t_hipGraphExecExternalSemaphoresWaitNodeSetParams hipGraphExecExternalSemaphoresWaitNodeSetParams_fn;
|
||||
t_hipGraphExecExternalSemaphoresSignalNodeSetParams
|
||||
hipGraphExecExternalSemaphoresSignalNodeSetParams_fn;
|
||||
t_hipGraphExecExternalSemaphoresWaitNodeSetParams
|
||||
hipGraphExecExternalSemaphoresWaitNodeSetParams_fn;
|
||||
t_hipGraphAddNode hipGraphAddNode_fn;
|
||||
t_hipGraphInstantiateWithParams hipGraphInstantiateWithParams_fn;
|
||||
t_hipExtGetLastError hipExtGetLastError_fn;
|
||||
|
||||
@@ -25,10 +25,7 @@ THE SOFTWARE.
|
||||
#if defined(__clang__) and defined(__HIP__)
|
||||
|
||||
// abort
|
||||
extern "C" __device__ inline __attribute__((weak))
|
||||
void abort() {
|
||||
__builtin_trap();
|
||||
}
|
||||
extern "C" __device__ inline __attribute__((weak)) void abort() { __builtin_trap(); }
|
||||
|
||||
// The noinline attribute helps encapsulate the printf expansion,
|
||||
// which otherwise has a performance impact just by increasing the
|
||||
@@ -36,18 +33,14 @@ void abort() {
|
||||
// allows the function to exist as a global although its definition is
|
||||
// included in every compilation unit.
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
|
||||
void _wassert(const wchar_t *_msg, const wchar_t *_file, unsigned _line) {
|
||||
// FIXME: Need `wchar_t` support to generate assertion message.
|
||||
__builtin_trap();
|
||||
extern "C" __device__ __attribute__((noinline)) __attribute__((weak)) void _wassert(
|
||||
const wchar_t* _msg, const wchar_t* _file, unsigned _line) {
|
||||
// FIXME: Need `wchar_t` support to generate assertion message.
|
||||
__builtin_trap();
|
||||
}
|
||||
#else /* defined(_WIN32) || defined(_WIN64) */
|
||||
extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
|
||||
void __assert_fail(const char *assertion,
|
||||
const char *file,
|
||||
unsigned int line,
|
||||
const char *function)
|
||||
{
|
||||
extern "C" __device__ __attribute__((noinline)) __attribute__((weak)) void __assert_fail(
|
||||
const char* assertion, const char* file, unsigned int line, const char* function) {
|
||||
const char fmt[] = "%s:%u: %s: Device-side assertion `%s' failed.\n";
|
||||
|
||||
// strlen is not available as a built-in yet, so we create our own
|
||||
@@ -60,11 +53,11 @@ void __assert_fail(const char *assertion,
|
||||
//
|
||||
// NOTE: The loop below includes the null terminator in the length
|
||||
// as required by append_string_n().
|
||||
#define __hip_get_string_length(LEN, STR) \
|
||||
do { \
|
||||
const char *tmp = STR; \
|
||||
while (*tmp++); \
|
||||
LEN = tmp - STR; \
|
||||
#define __hip_get_string_length(LEN, STR) \
|
||||
do { \
|
||||
const char* tmp = STR; \
|
||||
while (*tmp++); \
|
||||
LEN = tmp - STR; \
|
||||
} while (0)
|
||||
|
||||
auto msg = __ockl_fprintf_stderr_begin();
|
||||
@@ -84,22 +77,19 @@ void __assert_fail(const char *assertion,
|
||||
__builtin_trap();
|
||||
}
|
||||
|
||||
extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
|
||||
void __assertfail()
|
||||
{
|
||||
// ignore all the args for now.
|
||||
__builtin_trap();
|
||||
extern "C" __device__ __attribute__((noinline)) __attribute__((weak)) void __assertfail() {
|
||||
// ignore all the args for now.
|
||||
__builtin_trap();
|
||||
}
|
||||
#endif /* defined(_WIN32) || defined(_WIN64) */
|
||||
|
||||
#if defined(NDEBUG)
|
||||
#define __hip_assert(COND)
|
||||
#else
|
||||
#define __hip_assert(COND) \
|
||||
do { \
|
||||
if (!(COND)) \
|
||||
__builtin_trap(); \
|
||||
#define __hip_assert(COND) \
|
||||
do { \
|
||||
if (!(COND)) __builtin_trap(); \
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
#endif // defined(__clang__) and defined(__HIP__)
|
||||
#endif // defined(__clang__) and defined(__HIP__)
|
||||
|
||||
@@ -33,7 +33,7 @@ THE SOFTWARE.
|
||||
|
||||
#if __cplusplus
|
||||
#if !defined(__HIPCC_RTC__)
|
||||
#include <hip/amd_detail/amd_hip_runtime.h> // threadId, blockId
|
||||
#include <hip/amd_detail/amd_hip_runtime.h> // threadId, blockId
|
||||
#include <hip/amd_detail/amd_device_functions.h>
|
||||
#endif
|
||||
#if !defined(__align__)
|
||||
@@ -63,19 +63,19 @@ template <unsigned int size>
|
||||
using is_valid_wavefront = __hip_internal::integral_constant<bool, size <= 64>;
|
||||
|
||||
template <unsigned int size>
|
||||
using is_valid_tile_size =
|
||||
__hip_internal::integral_constant<bool, is_power_of_2<size>::value && is_valid_wavefront<size>::value>;
|
||||
using is_valid_tile_size = __hip_internal::integral_constant<
|
||||
bool, is_power_of_2<size>::value && is_valid_wavefront<size>::value>;
|
||||
|
||||
template <typename T>
|
||||
using is_valid_type =
|
||||
__hip_internal::integral_constant<bool, __hip_internal::is_integral<T>::value || __hip_internal::is_floating_point<T>::value>;
|
||||
using is_valid_type = __hip_internal::integral_constant<
|
||||
bool, __hip_internal::is_integral<T>::value || __hip_internal::is_floating_point<T>::value>;
|
||||
|
||||
namespace internal {
|
||||
|
||||
/**
|
||||
* @brief Enums representing different cooperative group types
|
||||
* @note This enum is only applicable on Linux.
|
||||
*
|
||||
* @brief Enums representing different cooperative group types
|
||||
* @note This enum is only applicable on Linux.
|
||||
*
|
||||
*/
|
||||
typedef enum {
|
||||
cg_invalid,
|
||||
@@ -110,8 +110,8 @@ namespace helper {
|
||||
* | | | | | | | |
|
||||
* output: 1 1 0 0
|
||||
*/
|
||||
__CG_STATIC_QUALIFIER__ unsigned long long adjust_mask(
|
||||
unsigned long long base_mask, unsigned long long input_mask) {
|
||||
__CG_STATIC_QUALIFIER__ unsigned long long adjust_mask(unsigned long long base_mask,
|
||||
unsigned long long input_mask) {
|
||||
unsigned long long out = 0;
|
||||
for (unsigned int i = 0, index = 0; i < warpSize; i++) {
|
||||
auto lane_active = base_mask & (1ull << i);
|
||||
@@ -133,15 +133,20 @@ __CG_STATIC_QUALIFIER__ unsigned long long adjust_mask(
|
||||
namespace multi_grid {
|
||||
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t num_grids() {
|
||||
return static_cast<__hip_uint32_t>(__ockl_multi_grid_num_grids()); }
|
||||
return static_cast<__hip_uint32_t>(__ockl_multi_grid_num_grids());
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t grid_rank() {
|
||||
return static_cast<__hip_uint32_t>(__ockl_multi_grid_grid_rank()); }
|
||||
return static_cast<__hip_uint32_t>(__ockl_multi_grid_grid_rank());
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() { return static_cast<__hip_uint32_t>(__ockl_multi_grid_size()); }
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() {
|
||||
return static_cast<__hip_uint32_t>(__ockl_multi_grid_size());
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() {
|
||||
return static_cast<__hip_uint32_t>(__ockl_multi_grid_thread_rank()); }
|
||||
return static_cast<__hip_uint32_t>(__ockl_multi_grid_thread_rank());
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast<bool>(__ockl_multi_grid_is_valid()); }
|
||||
|
||||
@@ -157,13 +162,13 @@ namespace grid {
|
||||
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() {
|
||||
return static_cast<__hip_uint32_t>((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y) *
|
||||
(blockDim.x * gridDim.x));
|
||||
(blockDim.x * gridDim.x));
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() {
|
||||
// Compute the global id of the workgroup to which the current thread belongs
|
||||
__hip_uint32_t blkIdx = static_cast<__hip_uint32_t>((blockIdx.z * gridDim.y * gridDim.x) +
|
||||
(blockIdx.y * gridDim.x) + (blockIdx.x));
|
||||
(blockIdx.y * gridDim.x) + (blockIdx.x));
|
||||
|
||||
// Compute total number of threads being passed to reach current workgroup
|
||||
// within grid
|
||||
@@ -171,8 +176,8 @@ __CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() {
|
||||
static_cast<__hip_uint32_t>(blkIdx * (blockDim.x * blockDim.y * blockDim.z));
|
||||
|
||||
// Compute thread local rank within current workgroup
|
||||
__hip_uint32_t local_thread_rank = static_cast<__hip_uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
|
||||
(threadIdx.y * blockDim.x) + (threadIdx.x));
|
||||
__hip_uint32_t local_thread_rank = static_cast<__hip_uint32_t>(
|
||||
(threadIdx.z * blockDim.y * blockDim.x) + (threadIdx.y * blockDim.x) + (threadIdx.x));
|
||||
|
||||
return (num_threads_till_current_workgroup + local_thread_rank);
|
||||
}
|
||||
@@ -206,18 +211,16 @@ __CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() {
|
||||
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() {
|
||||
return (static_cast<__hip_uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
|
||||
(threadIdx.y * blockDim.x) + (threadIdx.x)));
|
||||
(threadIdx.y * blockDim.x) + (threadIdx.x)));
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ bool is_valid() {
|
||||
return true;
|
||||
}
|
||||
__CG_STATIC_QUALIFIER__ bool is_valid() { return true; }
|
||||
|
||||
__CG_STATIC_QUALIFIER__ void sync() { __syncthreads(); }
|
||||
|
||||
__CG_STATIC_QUALIFIER__ dim3 block_dim() {
|
||||
return (dim3(static_cast<__hip_uint32_t>(blockDim.x), static_cast<__hip_uint32_t>(blockDim.y),
|
||||
static_cast<__hip_uint32_t>(blockDim.z)));
|
||||
static_cast<__hip_uint32_t>(blockDim.z)));
|
||||
}
|
||||
|
||||
} // namespace workgroup
|
||||
@@ -239,7 +242,7 @@ __CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "
|
||||
// For each thread, this function returns the number of active threads which
|
||||
// have i-th bit of x set and come before the current thread.
|
||||
__CG_STATIC_QUALIFIER__ unsigned int masked_bit_count(lane_mask x, unsigned int add = 0) {
|
||||
unsigned int counter=0;
|
||||
unsigned int counter = 0;
|
||||
if (static_cast<int>(warpSize) == 32) {
|
||||
counter = __builtin_amdgcn_mbcnt_lo(static_cast<unsigned int>(x), add);
|
||||
} else {
|
||||
@@ -259,8 +262,8 @@ __CG_STATIC_QUALIFIER__ unsigned int masked_bit_count(lane_mask x, unsigned int
|
||||
|
||||
} // namespace cooperative_groups
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
* @}
|
||||
*/
|
||||
|
||||
#endif // __cplusplus
|
||||
#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
|
||||
|
||||
@@ -1,260 +1,221 @@
|
||||
#pragma once
|
||||
|
||||
#if defined(__cplusplus)
|
||||
#include <cstring>
|
||||
#include <cstring>
|
||||
#endif
|
||||
|
||||
struct __half_raw {
|
||||
unsigned short x;
|
||||
unsigned short x;
|
||||
};
|
||||
|
||||
struct __half2_raw {
|
||||
unsigned short x;
|
||||
unsigned short y;
|
||||
unsigned short x;
|
||||
unsigned short y;
|
||||
};
|
||||
|
||||
#if defined(__cplusplus)
|
||||
struct __half;
|
||||
struct __half;
|
||||
|
||||
__half __float2half(float);
|
||||
float __half2float(__half);
|
||||
__half __float2half(float);
|
||||
float __half2float(__half);
|
||||
|
||||
// BEGIN STRUCT __HALF
|
||||
struct __half {
|
||||
protected:
|
||||
unsigned short __x;
|
||||
public:
|
||||
// CREATORS
|
||||
__half() = default;
|
||||
__half(const __half_raw& x) : __x{x.x} {}
|
||||
#if !defined(__HIP_NO_HALF_CONVERSIONS__)
|
||||
__half(float x) : __x{__float2half(x).__x} {}
|
||||
__half(double x) : __x{__float2half(x).__x} {}
|
||||
#endif
|
||||
__half(const __half&) = default;
|
||||
__half(__half&&) = default;
|
||||
~__half() = default;
|
||||
// BEGIN STRUCT __HALF
|
||||
struct __half {
|
||||
protected:
|
||||
unsigned short __x;
|
||||
|
||||
// MANIPULATORS
|
||||
__half& operator=(const __half&) = default;
|
||||
__half& operator=(__half&&) = default;
|
||||
__half& operator=(const __half_raw& x) { __x = x.x; return *this; }
|
||||
#if !defined(__HIP_NO_HALF_CONVERSIONS__)
|
||||
__half& operator=(float x)
|
||||
{
|
||||
__x = __float2half(x).__x;
|
||||
return *this;
|
||||
}
|
||||
__half& operator=(double x)
|
||||
{
|
||||
return *this = static_cast<float>(x);
|
||||
}
|
||||
#endif
|
||||
public:
|
||||
// CREATORS
|
||||
__half() = default;
|
||||
__half(const __half_raw& x) : __x{x.x} {}
|
||||
#if !defined(__HIP_NO_HALF_CONVERSIONS__)
|
||||
__half(float x) : __x{__float2half(x).__x} {}
|
||||
__half(double x) : __x{__float2half(x).__x} {}
|
||||
#endif
|
||||
__half(const __half&) = default;
|
||||
__half(__half&&) = default;
|
||||
~__half() = default;
|
||||
|
||||
// ACCESSORS
|
||||
operator float() const { return __half2float(*this); }
|
||||
operator __half_raw() const { return __half_raw{__x}; }
|
||||
};
|
||||
// END STRUCT __HALF
|
||||
// MANIPULATORS
|
||||
__half& operator=(const __half&) = default;
|
||||
__half& operator=(__half&&) = default;
|
||||
__half& operator=(const __half_raw& x) {
|
||||
__x = x.x;
|
||||
return *this;
|
||||
}
|
||||
#if !defined(__HIP_NO_HALF_CONVERSIONS__)
|
||||
__half& operator=(float x) {
|
||||
__x = __float2half(x).__x;
|
||||
return *this;
|
||||
}
|
||||
__half& operator=(double x) { return *this = static_cast<float>(x); }
|
||||
#endif
|
||||
|
||||
// BEGIN STRUCT __HALF2
|
||||
struct __half2 {
|
||||
public:
|
||||
__half x;
|
||||
__half y;
|
||||
// ACCESSORS
|
||||
operator float() const { return __half2float(*this); }
|
||||
operator __half_raw() const { return __half_raw{__x}; }
|
||||
};
|
||||
// END STRUCT __HALF
|
||||
|
||||
// CREATORS
|
||||
__half2() = default;
|
||||
__half2(const __half2_raw& ix)
|
||||
:
|
||||
x{reinterpret_cast<const __half&>(ix.x)},
|
||||
y{reinterpret_cast<const __half&>(ix.y)}
|
||||
{}
|
||||
__half2(const __half& ix, const __half& iy) : x{ix}, y{iy} {}
|
||||
__half2(const __half2&) = default;
|
||||
__half2(__half2&&) = default;
|
||||
~__half2() = default;
|
||||
// BEGIN STRUCT __HALF2
|
||||
struct __half2 {
|
||||
public:
|
||||
__half x;
|
||||
__half y;
|
||||
|
||||
// MANIPULATORS
|
||||
__half2& operator=(const __half2&) = default;
|
||||
__half2& operator=(__half2&&) = default;
|
||||
__half2& operator=(const __half2_raw& ix)
|
||||
{
|
||||
x = reinterpret_cast<const __half_raw&>(ix.x);
|
||||
y = reinterpret_cast<const __half_raw&>(ix.y);
|
||||
return *this;
|
||||
}
|
||||
// CREATORS
|
||||
__half2() = default;
|
||||
__half2(const __half2_raw& ix)
|
||||
: x{reinterpret_cast<const __half&>(ix.x)}, y{reinterpret_cast<const __half&>(ix.y)} {}
|
||||
__half2(const __half& ix, const __half& iy) : x{ix}, y{iy} {}
|
||||
__half2(const __half2&) = default;
|
||||
__half2(__half2&&) = default;
|
||||
~__half2() = default;
|
||||
|
||||
// ACCESSORS
|
||||
operator __half2_raw() const
|
||||
{
|
||||
return __half2_raw{
|
||||
reinterpret_cast<const unsigned short&>(x),
|
||||
reinterpret_cast<const unsigned short&>(y)};
|
||||
}
|
||||
};
|
||||
// END STRUCT __HALF2
|
||||
// MANIPULATORS
|
||||
__half2& operator=(const __half2&) = default;
|
||||
__half2& operator=(__half2&&) = default;
|
||||
__half2& operator=(const __half2_raw& ix) {
|
||||
x = reinterpret_cast<const __half_raw&>(ix.x);
|
||||
y = reinterpret_cast<const __half_raw&>(ix.y);
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline
|
||||
unsigned short __internal_float2half(
|
||||
float flt, unsigned int& sgn, unsigned int& rem)
|
||||
{
|
||||
unsigned int x{};
|
||||
std::memcpy(&x, &flt, sizeof(flt));
|
||||
// ACCESSORS
|
||||
operator __half2_raw() const {
|
||||
return __half2_raw{reinterpret_cast<const unsigned short&>(x),
|
||||
reinterpret_cast<const unsigned short&>(y)};
|
||||
}
|
||||
};
|
||||
// END STRUCT __HALF2
|
||||
|
||||
unsigned int u = (x & 0x7fffffffU);
|
||||
sgn = ((x >> 16) & 0x8000U);
|
||||
inline unsigned short __internal_float2half(float flt, unsigned int& sgn, unsigned int& rem) {
|
||||
unsigned int x{};
|
||||
std::memcpy(&x, &flt, sizeof(flt));
|
||||
|
||||
// NaN/+Inf/-Inf
|
||||
if (u >= 0x7f800000U) {
|
||||
rem = 0;
|
||||
return static_cast<unsigned short>(
|
||||
(u == 0x7f800000U) ? (sgn | 0x7c00U) : 0x7fffU);
|
||||
}
|
||||
// Overflows
|
||||
if (u > 0x477fefffU) {
|
||||
rem = 0x80000000U;
|
||||
return static_cast<unsigned short>(sgn | 0x7bffU);
|
||||
}
|
||||
// Normal numbers
|
||||
if (u >= 0x38800000U) {
|
||||
rem = u << 19;
|
||||
u -= 0x38000000U;
|
||||
return static_cast<unsigned short>(sgn | (u >> 13));
|
||||
}
|
||||
// +0/-0
|
||||
if (u < 0x33000001U) {
|
||||
rem = u;
|
||||
return static_cast<unsigned short>(sgn);
|
||||
}
|
||||
// Denormal numbers
|
||||
unsigned int exponent = u >> 23;
|
||||
unsigned int mantissa = (u & 0x7fffffU);
|
||||
unsigned int shift = 0x7eU - exponent;
|
||||
mantissa |= 0x800000U;
|
||||
rem = mantissa << (32 - shift);
|
||||
return static_cast<unsigned short>(sgn | (mantissa >> shift));
|
||||
unsigned int u = (x & 0x7fffffffU);
|
||||
sgn = ((x >> 16) & 0x8000U);
|
||||
|
||||
// NaN/+Inf/-Inf
|
||||
if (u >= 0x7f800000U) {
|
||||
rem = 0;
|
||||
return static_cast<unsigned short>((u == 0x7f800000U) ? (sgn | 0x7c00U) : 0x7fffU);
|
||||
}
|
||||
// Overflows
|
||||
if (u > 0x477fefffU) {
|
||||
rem = 0x80000000U;
|
||||
return static_cast<unsigned short>(sgn | 0x7bffU);
|
||||
}
|
||||
// Normal numbers
|
||||
if (u >= 0x38800000U) {
|
||||
rem = u << 19;
|
||||
u -= 0x38000000U;
|
||||
return static_cast<unsigned short>(sgn | (u >> 13));
|
||||
}
|
||||
// +0/-0
|
||||
if (u < 0x33000001U) {
|
||||
rem = u;
|
||||
return static_cast<unsigned short>(sgn);
|
||||
}
|
||||
// Denormal numbers
|
||||
unsigned int exponent = u >> 23;
|
||||
unsigned int mantissa = (u & 0x7fffffU);
|
||||
unsigned int shift = 0x7eU - exponent;
|
||||
mantissa |= 0x800000U;
|
||||
rem = mantissa << (32 - shift);
|
||||
return static_cast<unsigned short>(sgn | (mantissa >> shift));
|
||||
}
|
||||
|
||||
// Converts a float to __half, rounding to nearest with ties going to the even
// encoding. __internal_float2half supplies the truncated half bit pattern and
// reports a remainder word; 0x80000000U in that word marks the halfway point.
inline __half __float2half(float x) {
  unsigned int sign_bits{};
  unsigned int remainder{};
  __half_raw bits{__internal_float2half(x, sign_bits, remainder)};

  // Bump the encoding when the remainder is above the halfway mark, or exactly
  // on it while the truncated encoding is odd.
  const bool above_halfway = remainder > 0x80000000U;
  const bool tie_on_odd = (remainder == 0x80000000U) && ((bits.x & 0x1) != 0);
  if (above_halfway || tie_on_odd) {
    ++bits.x;
  }

  return bits;
}
|
||||
|
||||
// Explicit "_rn" spelling of the float -> __half conversion; it simply forwards
// to __float2half.
inline __half __float2half_rn(float x) {
  return __float2half(x);
}
|
||||
|
||||
// Converts a float to __half using the bit pattern returned by
// __internal_float2half unchanged: the reported remainder is ignored, so no
// rounding increment is ever applied.
inline __half __float2half_rz(float x) {
  unsigned int sign_bits{};
  unsigned int remainder{};
  const __half_raw truncated{__internal_float2half(x, sign_bits, remainder)};

  return truncated;
}
|
||||
|
||||
// Converts a float to __half, stepping the encoding up by one only when the
// helper reports a nonzero remainder AND a nonzero sign word (i.e. the input
// was negative), which moves negative inexact results away from zero.
inline __half __float2half_rd(float x) {
  unsigned int sign_bits{};
  unsigned int remainder{};
  __half_raw bits{__internal_float2half(x, sign_bits, remainder)};

  const bool inexact = remainder != 0;
  const bool negative = sign_bits != 0;
  if (inexact && negative) {
    ++bits.x;
  }

  return bits;
}
|
||||
|
||||
// Converts a float to __half, stepping the encoding up by one only when the
// helper reports a nonzero remainder AND a zero sign word (i.e. the input was
// non-negative), which moves positive inexact results away from zero.
inline __half __float2half_ru(float x) {
  unsigned int sign_bits{};
  unsigned int remainder{};
  __half_raw bits{__internal_float2half(x, sign_bits, remainder)};

  const bool inexact = remainder != 0;
  const bool non_negative = sign_bits == 0;
  if (inexact && non_negative) {
    ++bits.x;
  }

  return bits;
}
|
||||
|
||||
// Builds a __half2 whose two components both hold __float2half_rn(x).
inline __half2 __float2half2_rn(float x) {
  // The conversion is a pure function of x, so convert once and reuse the value.
  const __half converted = __float2half_rn(x);
  return __half2{converted, converted};
}
|
||||
|
||||
// Packs two floats into a __half2: x is converted into the first component and
// y into the second, each through __float2half_rn.
inline __half2 __floats2half2_rn(float x, float y) {
  const __half first = __float2half_rn(x);
  const __half second = __float2half_rn(y);
  return __half2{first, second};
}
|
||||
|
||||
inline float __internal_half2float(unsigned short x) {
|
||||
unsigned int sign = ((x >> 15) & 1);
|
||||
unsigned int exponent = ((x >> 10) & 0x1f);
|
||||
unsigned int mantissa = ((x & 0x3ff) << 13);
|
||||
|
||||
if (exponent == 0x1fU) { /* NaN or Inf */
|
||||
mantissa = (mantissa ? (sign = 0, 0x7fffffU) : 0);
|
||||
exponent = 0xffU;
|
||||
} else if (!exponent) { /* Denorm or Zero */
|
||||
if (mantissa) {
|
||||
unsigned int msb;
|
||||
exponent = 0x71U;
|
||||
do {
|
||||
msb = (mantissa & 0x400000U);
|
||||
mantissa <<= 1; /* normalize */
|
||||
--exponent;
|
||||
} while (!msb);
|
||||
mantissa &= 0x7fffffU; /* 1.mantissa is implicit */
|
||||
}
|
||||
} else {
|
||||
exponent += 0x70U;
|
||||
}
|
||||
unsigned int u = ((sign << 31) | (exponent << 23) | mantissa);
|
||||
float f;
|
||||
memcpy(&f, &u, sizeof(u));
|
||||
|
||||
inline
|
||||
__half __float2half(float x)
|
||||
{
|
||||
__half_raw r;
|
||||
unsigned int sgn{};
|
||||
unsigned int rem{};
|
||||
r.x = __internal_float2half(x, sgn, rem);
|
||||
if (rem > 0x80000000U || (rem == 0x80000000U && (r.x & 0x1))) ++r.x;
|
||||
return f;
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
// Converts a __half to float by decoding its raw 16-bit pattern with
// __internal_half2float.
inline float __half2float(__half x) {
  const __half_raw raw = static_cast<__half_raw>(x);
  return __internal_half2float(raw.x);
}
|
||||
// Converts both components of a __half2 to floats.
//
// Returns a float2 whose .x is the conversion of the input's x component and
// whose .y is the conversion of the input's y component.
//
// Fix: the second component used to be decoded from `.x` as well, so the y
// half was silently dropped and the x value was returned twice.
inline float2 __half22float2(__half2 x) {
  // Take the raw view once and decode each 16-bit lane from it.
  const __half2_raw raw = static_cast<__half2_raw>(x);
  return float2{__internal_half2float(raw.x), __internal_half2float(raw.y)};
}
|
||||
|
||||
inline
|
||||
__half __float2half_rn(float x) { return __float2half(x); }
|
||||
// Converts the x component of a __half2 to float.
inline float __low2float(__half2 x) {
  const __half2_raw raw = static_cast<__half2_raw>(x);
  return __internal_half2float(raw.x);
}
|
||||
|
||||
inline
|
||||
__half __float2half_rz(float x)
|
||||
{
|
||||
__half_raw r;
|
||||
unsigned int sgn{};
|
||||
unsigned int rem{};
|
||||
r.x = __internal_float2half(x, sgn, rem);
|
||||
// Converts the y component of a __half2 to float.
inline float __high2float(__half2 x) {
  const __half2_raw raw = static_cast<__half2_raw>(x);
  return __internal_half2float(raw.y);
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
__half __float2half_rd(float x)
|
||||
{
|
||||
__half_raw r;
|
||||
unsigned int sgn{};
|
||||
unsigned int rem{};
|
||||
r.x = __internal_float2half(x, sgn, rem);
|
||||
if (rem && sgn) ++r.x;
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
__half __float2half_ru(float x)
|
||||
{
|
||||
__half_raw r;
|
||||
unsigned int sgn{};
|
||||
unsigned int rem{};
|
||||
r.x = __internal_float2half(x, sgn, rem);
|
||||
if (rem && !sgn) ++r.x;
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
__half2 __float2half2_rn(float x)
|
||||
{
|
||||
return __half2{__float2half_rn(x), __float2half_rn(x)};
|
||||
}
|
||||
|
||||
inline
|
||||
__half2 __floats2half2_rn(float x, float y)
|
||||
{
|
||||
return __half2{__float2half_rn(x), __float2half_rn(y)};
|
||||
}
|
||||
|
||||
inline
|
||||
float __internal_half2float(unsigned short x)
|
||||
{
|
||||
unsigned int sign = ((x >> 15) & 1);
|
||||
unsigned int exponent = ((x >> 10) & 0x1f);
|
||||
unsigned int mantissa = ((x & 0x3ff) << 13);
|
||||
|
||||
if (exponent == 0x1fU) { /* NaN or Inf */
|
||||
mantissa = (mantissa ? (sign = 0, 0x7fffffU) : 0);
|
||||
exponent = 0xffU;
|
||||
} else if (!exponent) { /* Denorm or Zero */
|
||||
if (mantissa) {
|
||||
unsigned int msb;
|
||||
exponent = 0x71U;
|
||||
do {
|
||||
msb = (mantissa & 0x400000U);
|
||||
mantissa <<= 1; /* normalize */
|
||||
--exponent;
|
||||
} while (!msb);
|
||||
mantissa &= 0x7fffffU; /* 1.mantissa is implicit */
|
||||
}
|
||||
} else {
|
||||
exponent += 0x70U;
|
||||
}
|
||||
unsigned int u = ((sign << 31) | (exponent << 23) | mantissa);
|
||||
float f;
|
||||
memcpy(&f, &u, sizeof(u));
|
||||
|
||||
return f;
|
||||
}
|
||||
|
||||
inline
|
||||
float __half2float(__half x)
|
||||
{
|
||||
return __internal_half2float(static_cast<__half_raw>(x).x);
|
||||
}
|
||||
inline
|
||||
float2 __half22float2(__half2 x)
|
||||
{
|
||||
return float2{__internal_half2float(static_cast<__half2_raw>(x).x),
|
||||
__internal_half2float(static_cast<__half2_raw>(x).x)};
|
||||
}
|
||||
|
||||
inline
|
||||
float __low2float(__half2 x)
|
||||
{
|
||||
return __internal_half2float(static_cast<__half2_raw>(x).x);
|
||||
}
|
||||
|
||||
inline
|
||||
float __high2float(__half2 x)
|
||||
{
|
||||
return __internal_half2float(static_cast<__half2_raw>(x).y);
|
||||
}
|
||||
|
||||
#if !defined(HIP_NO_HALF)
|
||||
using half = __half;
|
||||
using half2 = __half2;
|
||||
#endif
|
||||
#endif // defined(__cplusplus)
|
||||
#if !defined(HIP_NO_HALF)
|
||||
using half = __half;
|
||||
using half2 = __half2;
|
||||
#endif
|
||||
#endif // defined(__cplusplus)
|
||||
|
||||
@@ -29,68 +29,65 @@ THE SOFTWARE.
|
||||
#include "host_defines.h"
|
||||
#endif
|
||||
#ifndef __CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
|
||||
extern "C"
|
||||
{
|
||||
__device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
|
||||
__device__ _Float16 __ocml_cos_f16(_Float16);
|
||||
__device__ __attribute__((pure)) _Float16 __ocml_exp_f16(_Float16);
|
||||
__device__ __attribute__((pure)) _Float16 __ocml_exp10_f16(_Float16);
|
||||
__device__ __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_floor_f16(_Float16);
|
||||
__device__ __attribute__((const))
|
||||
_Float16 __ocml_fma_f16(_Float16, _Float16, _Float16);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16);
|
||||
__device__ __attribute__((const)) int __ocml_isinf_f16(_Float16);
|
||||
__device__ __attribute__((const)) int __ocml_isnan_f16(_Float16);
|
||||
__device__ __attribute__((pure)) _Float16 __ocml_log_f16(_Float16);
|
||||
__device__ __attribute__((pure)) _Float16 __ocml_log10_f16(_Float16);
|
||||
__device__ __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16);
|
||||
__device__ __attribute__((pure)) _Float16 __ocml_pown_f16(_Float16, int);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_rint_f16(_Float16);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16);
|
||||
__device__ _Float16 __ocml_sin_f16(_Float16);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_trunc_f16(_Float16);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_fmin_f16(_Float16, _Float16);
|
||||
|
||||
typedef _Float16 __2f16 __attribute__((ext_vector_type(2)));
|
||||
typedef short __2i16 __attribute__((ext_vector_type(2)));
|
||||
|
||||
#if defined(__clang__) && defined(__HIP__)
|
||||
__device__ __attribute__((const)) float __ockl_fdot2(__2f16 a, __2f16 b, float c, bool s);
|
||||
#endif
|
||||
|
||||
__device__ __attribute__((const)) __2f16 __ocml_ceil_2f16(__2f16);
|
||||
__device__ __attribute__((const)) __2f16 __ocml_fabs_2f16(__2f16);
|
||||
__device__ __2f16 __ocml_cos_2f16(__2f16);
|
||||
__device__ __attribute__((pure)) __2f16 __ocml_exp_2f16(__2f16);
|
||||
__device__ __attribute__((pure)) __2f16 __ocml_exp10_2f16(__2f16);
|
||||
__device__ __attribute__((pure)) __2f16 __ocml_exp2_2f16(__2f16);
|
||||
__device__ __attribute__((const)) __2f16 __ocml_floor_2f16(__2f16);
|
||||
__device__ __attribute__((const)) __2f16 __ocml_fma_2f16(__2f16, __2f16, __2f16);
|
||||
__device__ __attribute__((const)) __2i16 __ocml_isinf_2f16(__2f16);
|
||||
__device__ __attribute__((const)) __2i16 __ocml_isnan_2f16(__2f16);
|
||||
__device__ __attribute__((pure)) __2f16 __ocml_log_2f16(__2f16);
|
||||
__device__ __attribute__((pure)) __2f16 __ocml_log10_2f16(__2f16);
|
||||
__device__ __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16);
|
||||
__device__ __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16);
|
||||
__device__ __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16);
|
||||
__device__ __2f16 __ocml_sin_2f16(__2f16);
|
||||
__device__ __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16);
|
||||
__device__ __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16);
|
||||
|
||||
__device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
|
||||
|
||||
}
|
||||
#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
|
||||
//TODO: remove these after they get into clang header __clang_hip_libdevice_declares.h'
|
||||
extern "C" {
|
||||
__device__ __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_fmin_f16(_Float16, _Float16);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
|
||||
__device__ _Float16 __ocml_cos_f16(_Float16);
|
||||
__device__ __attribute__((pure)) _Float16 __ocml_exp_f16(_Float16);
|
||||
__device__ __attribute__((pure)) _Float16 __ocml_exp10_f16(_Float16);
|
||||
__device__ __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_floor_f16(_Float16);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_fma_f16(_Float16, _Float16, _Float16);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16);
|
||||
__device__ __attribute__((const)) int __ocml_isinf_f16(_Float16);
|
||||
__device__ __attribute__((const)) int __ocml_isnan_f16(_Float16);
|
||||
__device__ __attribute__((pure)) _Float16 __ocml_log_f16(_Float16);
|
||||
__device__ __attribute__((pure)) _Float16 __ocml_log10_f16(_Float16);
|
||||
__device__ __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16);
|
||||
__device__ __attribute__((pure)) _Float16 __ocml_pown_f16(_Float16, int);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_rint_f16(_Float16);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16);
|
||||
__device__ _Float16 __ocml_sin_f16(_Float16);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_trunc_f16(_Float16);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_fmin_f16(_Float16, _Float16);
|
||||
|
||||
typedef _Float16 __2f16 __attribute__((ext_vector_type(2)));
|
||||
typedef short __2i16 __attribute__((ext_vector_type(2)));
|
||||
|
||||
#if defined(__clang__) && defined(__HIP__)
|
||||
__device__ __attribute__((const)) float __ockl_fdot2(__2f16 a, __2f16 b, float c, bool s);
|
||||
#endif
|
||||
|
||||
__device__ __attribute__((const)) __2f16 __ocml_ceil_2f16(__2f16);
|
||||
__device__ __attribute__((const)) __2f16 __ocml_fabs_2f16(__2f16);
|
||||
__device__ __2f16 __ocml_cos_2f16(__2f16);
|
||||
__device__ __attribute__((pure)) __2f16 __ocml_exp_2f16(__2f16);
|
||||
__device__ __attribute__((pure)) __2f16 __ocml_exp10_2f16(__2f16);
|
||||
__device__ __attribute__((pure)) __2f16 __ocml_exp2_2f16(__2f16);
|
||||
__device__ __attribute__((const)) __2f16 __ocml_floor_2f16(__2f16);
|
||||
__device__ __attribute__((const)) __2f16 __ocml_fma_2f16(__2f16, __2f16, __2f16);
|
||||
__device__ __attribute__((const)) __2i16 __ocml_isinf_2f16(__2f16);
|
||||
__device__ __attribute__((const)) __2i16 __ocml_isnan_2f16(__2f16);
|
||||
__device__ __attribute__((pure)) __2f16 __ocml_log_2f16(__2f16);
|
||||
__device__ __attribute__((pure)) __2f16 __ocml_log10_2f16(__2f16);
|
||||
__device__ __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16);
|
||||
__device__ __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16);
|
||||
__device__ __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16);
|
||||
__device__ __2f16 __ocml_sin_2f16(__2f16);
|
||||
__device__ __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16);
|
||||
__device__ __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16);
|
||||
|
||||
__device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
|
||||
}
|
||||
#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
|
||||
// TODO: remove these after they get into the clang header __clang_hip_libdevice_declares.h
|
||||
extern "C" {
|
||||
__device__ __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_fmin_f16(_Float16, _Float16);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
|
||||
}
|
||||
|
||||
Το diff αρχείου καταστέλλεται επειδή είναι πολύ μεγάλο
Φόρτωση Διαφορών
@@ -26,22 +26,22 @@ THE SOFTWARE.
|
||||
// HIP ROCclr Op IDs enumeration
|
||||
enum HipVdiOpId {
|
||||
kHipVdiOpIdDispatch = 0,
|
||||
kHipVdiOpIdCopy = 1,
|
||||
kHipVdiOpIdBarrier = 2,
|
||||
kHipVdiOpIdNumber = 3
|
||||
kHipVdiOpIdCopy = 1,
|
||||
kHipVdiOpIdBarrier = 2,
|
||||
kHipVdiOpIdNumber = 3
|
||||
};
|
||||
|
||||
// Types of ROCclr commands
|
||||
enum HipVdiCommandKind {
|
||||
kHipVdiCommandKernel = 0x11F0,
|
||||
kHipVdiCommandTask = 0x11F1,
|
||||
kHipVdiMemcpyDeviceToHost = 0x11F3,
|
||||
kHipHipVdiMemcpyHostToDevice = 0x11F4,
|
||||
kHipVdiMemcpyDeviceToDevice = 0x11F5,
|
||||
kHipVidMemcpyDeviceToHostRect = 0x1201,
|
||||
kHipVdiMemcpyHostToDeviceRect = 0x1202,
|
||||
kHipVdiCommandKernel = 0x11F0,
|
||||
kHipVdiCommandTask = 0x11F1,
|
||||
kHipVdiMemcpyDeviceToHost = 0x11F3,
|
||||
kHipHipVdiMemcpyHostToDevice = 0x11F4,
|
||||
kHipVdiMemcpyDeviceToDevice = 0x11F5,
|
||||
kHipVidMemcpyDeviceToHostRect = 0x1201,
|
||||
kHipVdiMemcpyHostToDeviceRect = 0x1202,
|
||||
kHipVdiMemcpyDeviceToDeviceRect = 0x1203,
|
||||
kHipVdiFillMemory = 0x1207,
|
||||
kHipVdiFillMemory = 0x1207,
|
||||
};
|
||||
|
||||
/**
|
||||
@@ -74,5 +74,4 @@ bool hipEnableActivityCallback(uint32_t op, bool enable);
|
||||
*/
|
||||
const char* hipGetCmdName(uint32_t id);
|
||||
|
||||
#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_PROF_H
|
||||
|
||||
#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_PROF_H
|
||||
|
||||
@@ -66,10 +66,12 @@ typedef bool_constant<true> true_type;
|
||||
typedef bool_constant<false> false_type;
|
||||
|
||||
template <bool __B, class __T = void> struct enable_if {};
|
||||
template <class __T> struct enable_if<true, __T> { typedef __T type; };
|
||||
template <class __T> struct enable_if<true, __T> {
|
||||
typedef __T type;
|
||||
};
|
||||
|
||||
template<bool _B> struct true_or_false_type : public false_type {};
|
||||
template<> struct true_or_false_type<true> : public true_type {};
|
||||
template <bool _B> struct true_or_false_type : public false_type {};
|
||||
template <> struct true_or_false_type<true> : public true_type {};
|
||||
|
||||
template <class _Tp> struct is_integral : public false_type {};
|
||||
template <> struct is_integral<bool> : public true_type {};
|
||||
@@ -103,108 +105,101 @@ template <> struct is_arithmetic<unsigned long long> : public true_type {};
|
||||
template <> struct is_arithmetic<float> : public true_type {};
|
||||
template <> struct is_arithmetic<double> : public true_type {};
|
||||
|
||||
template<typename _Tp> struct is_floating_point : public false_type {};
|
||||
template<> struct is_floating_point<float> : public true_type {};
|
||||
template<> struct is_floating_point<double> : public true_type {};
|
||||
template<> struct is_floating_point<long double> : public true_type {};
|
||||
template <typename _Tp> struct is_floating_point : public false_type {};
|
||||
template <> struct is_floating_point<float> : public true_type {};
|
||||
template <> struct is_floating_point<double> : public true_type {};
|
||||
template <> struct is_floating_point<long double> : public true_type {};
|
||||
|
||||
template <typename __T, typename __U> struct is_same : public false_type {};
|
||||
template <typename __T> struct is_same<__T, __T> : public true_type {};
|
||||
|
||||
template<typename _Tp, bool = is_arithmetic<_Tp>::value>
|
||||
struct is_signed : public false_type {};
|
||||
template<typename _Tp>
|
||||
struct is_signed<_Tp, true> : public true_or_false_type<_Tp(-1) < _Tp(0)> {};
|
||||
template <typename _Tp, bool = is_arithmetic<_Tp>::value> struct is_signed : public false_type {};
|
||||
template <typename _Tp>
|
||||
struct is_signed<_Tp, true> : public true_or_false_type<_Tp(-1) < _Tp(0)> {};
|
||||
|
||||
template<class T>
|
||||
auto test_returnable(int) -> decltype(
|
||||
void(static_cast<T(*)()>(nullptr)), true_type{});
|
||||
template<class>
|
||||
auto test_returnable(...) -> false_type;
|
||||
template <class T>
|
||||
auto test_returnable(int) -> decltype(void(static_cast<T (*)()>(nullptr)), true_type{});
|
||||
template <class> auto test_returnable(...) -> false_type;
|
||||
|
||||
template<class T>
|
||||
struct type_identity { using type = T; };
|
||||
template <class T> struct type_identity {
|
||||
using type = T;
|
||||
};
|
||||
|
||||
template<class T> // Note that `cv void&` is a substitution failure
|
||||
auto try_add_lvalue_reference(int) -> type_identity<T&>;
|
||||
template<class T> // Handle T = cv void case
|
||||
auto try_add_lvalue_reference(...) -> type_identity<T>;
|
||||
template <class T> // Note that `cv void&` is a substitution failure
|
||||
auto try_add_lvalue_reference(int) -> type_identity<T&>;
|
||||
template <class T> // Handle T = cv void case
|
||||
auto try_add_lvalue_reference(...) -> type_identity<T>;
|
||||
|
||||
template<class T>
|
||||
auto try_add_rvalue_reference(int) -> type_identity<T&&>;
|
||||
template<class T>
|
||||
auto try_add_rvalue_reference(...) -> type_identity<T>;
|
||||
template <class T> auto try_add_rvalue_reference(int) -> type_identity<T&&>;
|
||||
template <class T> auto try_add_rvalue_reference(...) -> type_identity<T>;
|
||||
|
||||
template<class T>
|
||||
struct add_lvalue_reference
|
||||
: decltype(try_add_lvalue_reference<T>(0)) {};
|
||||
template <class T> struct add_lvalue_reference : decltype(try_add_lvalue_reference<T>(0)) {};
|
||||
|
||||
template<class T>
|
||||
struct add_rvalue_reference
|
||||
: decltype(try_add_rvalue_reference<T>(0)) {};
|
||||
template <class T> struct add_rvalue_reference : decltype(try_add_rvalue_reference<T>(0)) {};
|
||||
|
||||
template<typename T>
|
||||
typename add_rvalue_reference<T>::type declval() noexcept;
|
||||
template <typename T> typename add_rvalue_reference<T>::type declval() noexcept;
|
||||
|
||||
template<class From, class To>
|
||||
auto test_implicitly_convertible(int) -> decltype(
|
||||
void(declval<void(&)(To)>()(declval<From>())), true_type{});
|
||||
template <class From, class To>
|
||||
auto test_implicitly_convertible(int)
|
||||
-> decltype(void(declval<void (&)(To)>()(declval<From>())), true_type{});
|
||||
|
||||
template<class, class>
|
||||
auto test_implicitly_convertible(...) -> false_type;
|
||||
template <class, class> auto test_implicitly_convertible(...) -> false_type;
|
||||
|
||||
template<class T> struct remove_cv { typedef T type; };
|
||||
template<class T> struct remove_cv<const T> { typedef T type; };
|
||||
template<class T> struct remove_cv<volatile T> { typedef T type; };
|
||||
template<class T> struct remove_cv<const volatile T> { typedef T type; };
|
||||
template <class T> struct remove_cv {
|
||||
typedef T type;
|
||||
};
|
||||
template <class T> struct remove_cv<const T> {
|
||||
typedef T type;
|
||||
};
|
||||
template <class T> struct remove_cv<volatile T> {
|
||||
typedef T type;
|
||||
};
|
||||
template <class T> struct remove_cv<const volatile T> {
|
||||
typedef T type;
|
||||
};
|
||||
|
||||
template<class T>
|
||||
struct is_void : public is_same<void, typename remove_cv<T>::type> {};
|
||||
template <class T> struct is_void : public is_same<void, typename remove_cv<T>::type> {};
|
||||
|
||||
template<class From, class To>
|
||||
struct is_convertible : public integral_constant<bool,
|
||||
(decltype(test_returnable<To>(0))::value &&
|
||||
decltype(test_implicitly_convertible<From, To>(0))::value) ||
|
||||
(is_void<From>::value && is_void<To>::value)> {};
|
||||
template <class From, class To>
|
||||
struct is_convertible
|
||||
: public integral_constant<bool,
|
||||
(decltype(test_returnable<To>(0))::value &&
|
||||
decltype(test_implicitly_convertible<From, To>(0))::value) ||
|
||||
(is_void<From>::value && is_void<To>::value)> {};
|
||||
|
||||
template<typename _CharT> struct char_traits;
|
||||
template<typename _CharT, typename _Traits = char_traits<_CharT>> class basic_istream;
|
||||
template<typename _CharT, typename _Traits = char_traits<_CharT>> class basic_ostream;
|
||||
template <typename _CharT> struct char_traits;
|
||||
template <typename _CharT, typename _Traits = char_traits<_CharT>> class basic_istream;
|
||||
template <typename _CharT, typename _Traits = char_traits<_CharT>> class basic_ostream;
|
||||
typedef basic_istream<char> istream;
|
||||
typedef basic_ostream<char> ostream;
|
||||
|
||||
template<typename _Tp>
|
||||
struct is_standard_layout
|
||||
: public integral_constant<bool, __is_standard_layout(_Tp)>
|
||||
{ };
|
||||
template <typename _Tp>
|
||||
struct is_standard_layout : public integral_constant<bool, __is_standard_layout(_Tp)> {};
|
||||
|
||||
template<typename _Tp>
|
||||
struct is_trivial
|
||||
: public integral_constant<bool, __is_trivial(_Tp)>
|
||||
{ };
|
||||
template <typename _Tp> struct is_trivial : public integral_constant<bool, __is_trivial(_Tp)> {};
|
||||
|
||||
|
||||
template <bool B, class T, class F> struct conditional { using type = T; };
|
||||
template <class T, class F> struct conditional<false, T, F> { using type = F; };
|
||||
|
||||
template<class T>
|
||||
struct alignment_of : integral_constant<size_t, alignof(T)> {};
|
||||
|
||||
template<typename T, T... Ints>
|
||||
struct integer_sequence {
|
||||
using value_type = T;
|
||||
static constexpr size_t size() noexcept { return sizeof...(Ints); }
|
||||
template <bool B, class T, class F> struct conditional {
|
||||
using type = T;
|
||||
};
|
||||
template <class T, class F> struct conditional<false, T, F> {
|
||||
using type = F;
|
||||
};
|
||||
|
||||
template<size_t... Ints>
|
||||
using index_sequence = integer_sequence<size_t, Ints...>;
|
||||
template <class T> struct alignment_of : integral_constant<size_t, alignof(T)> {};
|
||||
|
||||
template <typename T, T... Ints> struct integer_sequence {
|
||||
using value_type = T;
|
||||
static constexpr size_t size() noexcept { return sizeof...(Ints); }
|
||||
};
|
||||
|
||||
template <size_t... Ints> using index_sequence = integer_sequence<size_t, Ints...>;
|
||||
|
||||
template <size_t _hip_N, size_t... Ints>
|
||||
struct make_index_sequence_impl : make_index_sequence_impl<_hip_N - 1, _hip_N - 1, Ints...> {};
|
||||
|
||||
template<size_t... Ints>
|
||||
struct make_index_sequence_impl<0, Ints...> {
|
||||
using type = index_sequence<Ints...>;
|
||||
template <size_t... Ints> struct make_index_sequence_impl<0, Ints...> {
|
||||
using type = index_sequence<Ints...>;
|
||||
};
|
||||
|
||||
template <size_t _hip_N>
|
||||
@@ -212,9 +207,9 @@ using make_index_sequence = typename make_index_sequence_impl<_hip_N>::type;
|
||||
|
||||
template <size_t... Ints>
|
||||
constexpr index_sequence<Ints...> make_index_sequence_value(index_sequence<Ints...>) {
|
||||
return {};
|
||||
}
|
||||
return {};
|
||||
}
|
||||
} // namespace __hip_internal
|
||||
typedef __hip_internal::uint8_t __hip_uint8_t;
|
||||
typedef __hip_internal::uint16_t __hip_uint16_t;
|
||||
typedef __hip_internal::uint32_t __hip_uint32_t;
|
||||
@@ -223,7 +218,7 @@ typedef __hip_internal::int8_t __hip_int8_t;
|
||||
typedef __hip_internal::int16_t __hip_int16_t;
|
||||
typedef __hip_internal::int32_t __hip_int32_t;
|
||||
typedef __hip_internal::int64_t __hip_int64_t;
|
||||
#endif // defined(__cplusplus)
|
||||
#endif // defined(__cplusplus)
|
||||
|
||||
#if defined(__clang__) && defined(__HIP__)
|
||||
#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
|
||||
@@ -232,7 +227,7 @@ typedef __hip_internal::int64_t __hip_int64_t;
|
||||
#define __global__ __attribute__((global))
|
||||
#define __shared__ __attribute__((shared))
|
||||
#define __constant__ __attribute__((constant))
|
||||
#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
|
||||
#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
|
||||
|
||||
#if !defined(__has_feature) || !__has_feature(cuda_noinline_keyword)
|
||||
#define __noinline__ __attribute__((noinline))
|
||||
@@ -241,7 +236,8 @@ typedef __hip_internal::int64_t __hip_int64_t;
|
||||
#define __forceinline__ inline __attribute__((always_inline))
|
||||
|
||||
#if __HIP_NO_IMAGE_SUPPORT
|
||||
#define __hip_img_chk__ __attribute__((unavailable("The image/texture API not supported on the device")))
|
||||
#define __hip_img_chk__ \
|
||||
__attribute__((unavailable("The image/texture API not supported on the device")))
|
||||
#else
|
||||
#define __hip_img_chk__
|
||||
#endif
|
||||
|
||||
@@ -29,74 +29,75 @@ THE SOFTWARE.
|
||||
|
||||
namespace hip_impl {
|
||||
inline void* address(hsa_executable_symbol_t x) {
|
||||
void* r = nullptr;
|
||||
hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &r);
|
||||
void* r = nullptr;
|
||||
hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &r);
|
||||
|
||||
return r;
|
||||
return r;
|
||||
}
|
||||
|
||||
inline hsa_agent_t agent(hsa_executable_symbol_t x) {
|
||||
hsa_agent_t r = {};
|
||||
hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_AGENT, &r);
|
||||
hsa_agent_t r = {};
|
||||
hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_AGENT, &r);
|
||||
|
||||
return r;
|
||||
return r;
|
||||
}
|
||||
|
||||
inline std::uint32_t group_size(hsa_executable_symbol_t x) {
|
||||
std::uint32_t r = 0u;
|
||||
hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &r);
|
||||
std::uint32_t r = 0u;
|
||||
hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &r);
|
||||
|
||||
return r;
|
||||
return r;
|
||||
}
|
||||
|
||||
inline hsa_isa_t isa(hsa_agent_t x) {
|
||||
hsa_isa_t r = {};
|
||||
hsa_agent_iterate_isas(x,
|
||||
[](hsa_isa_t i, void* o) {
|
||||
*static_cast<hsa_isa_t*>(o) = i; // Pick the first.
|
||||
hsa_isa_t r = {};
|
||||
hsa_agent_iterate_isas(
|
||||
x,
|
||||
[](hsa_isa_t i, void* o) {
|
||||
*static_cast<hsa_isa_t*>(o) = i; // Pick the first.
|
||||
|
||||
return HSA_STATUS_INFO_BREAK;
|
||||
},
|
||||
&r);
|
||||
return HSA_STATUS_INFO_BREAK;
|
||||
},
|
||||
&r);
|
||||
|
||||
return r;
|
||||
return r;
|
||||
}
|
||||
|
||||
inline std::uint64_t kernel_object(hsa_executable_symbol_t x) {
|
||||
std::uint64_t r = 0u;
|
||||
hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &r);
|
||||
std::uint64_t r = 0u;
|
||||
hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &r);
|
||||
|
||||
return r;
|
||||
return r;
|
||||
}
|
||||
|
||||
inline std::string name(hsa_executable_symbol_t x) {
|
||||
std::uint32_t sz = 0u;
|
||||
hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &sz);
|
||||
std::uint32_t sz = 0u;
|
||||
hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &sz);
|
||||
|
||||
std::string r(sz, '\0');
|
||||
hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_NAME, &r.front());
|
||||
std::string r(sz, '\0');
|
||||
hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_NAME, &r.front());
|
||||
|
||||
return r;
|
||||
return r;
|
||||
}
|
||||
|
||||
inline std::uint32_t private_size(hsa_executable_symbol_t x) {
|
||||
std::uint32_t r = 0u;
|
||||
hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &r);
|
||||
std::uint32_t r = 0u;
|
||||
hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &r);
|
||||
|
||||
return r;
|
||||
return r;
|
||||
}
|
||||
|
||||
inline std::uint32_t size(hsa_executable_symbol_t x) {
|
||||
std::uint32_t r = 0;
|
||||
hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &r);
|
||||
std::uint32_t r = 0;
|
||||
hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &r);
|
||||
|
||||
return r;
|
||||
return r;
|
||||
}
|
||||
|
||||
inline hsa_symbol_kind_t type(hsa_executable_symbol_t x) {
|
||||
hsa_symbol_kind_t r = {};
|
||||
hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &r);
|
||||
hsa_symbol_kind_t r = {};
|
||||
hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &r);
|
||||
|
||||
return r;
|
||||
return r;
|
||||
}
|
||||
} // namespace hip_impl
|
||||
Το diff αρχείου καταστέλλεται επειδή είναι πολύ μεγάλο
Φόρτωση Διαφορών
@@ -28,671 +28,262 @@ THE SOFTWARE.
|
||||
#endif
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// DOT FUNCTIONS
|
||||
#if defined(__clang__) && defined(__HIP__)
|
||||
__device__
|
||||
__attribute__((const))
|
||||
int __ockl_sdot2(
|
||||
HIP_vector_base<short, 2>::Native_vec_,
|
||||
HIP_vector_base<short, 2>::Native_vec_,
|
||||
int, bool);
|
||||
__device__ __attribute__((const)) int __ockl_sdot2(HIP_vector_base<short, 2>::Native_vec_,
|
||||
HIP_vector_base<short, 2>::Native_vec_, int,
|
||||
bool);
|
||||
|
||||
__device__
|
||||
__attribute__((const))
|
||||
unsigned int __ockl_udot2(
|
||||
HIP_vector_base<unsigned short, 2>::Native_vec_,
|
||||
__device__ __attribute__((const)) unsigned int __ockl_udot2(
|
||||
HIP_vector_base<unsigned short, 2>::Native_vec_,
|
||||
HIP_vector_base<unsigned short, 2>::Native_vec_, unsigned int, bool);
|
||||
|
||||
__device__ __attribute__((const)) int __ockl_sdot4(HIP_vector_base<char, 4>::Native_vec_,
|
||||
HIP_vector_base<char, 4>::Native_vec_, int,
|
||||
bool);
|
||||
|
||||
__device__ __attribute__((const)) unsigned int __ockl_udot4(
|
||||
HIP_vector_base<unsigned char, 4>::Native_vec_, HIP_vector_base<unsigned char, 4>::Native_vec_,
|
||||
unsigned int, bool);
|
||||
|
||||
__device__
|
||||
__attribute__((const))
|
||||
int __ockl_sdot4(
|
||||
HIP_vector_base<char, 4>::Native_vec_,
|
||||
HIP_vector_base<char, 4>::Native_vec_,
|
||||
int, bool);
|
||||
__device__ __attribute__((const)) int __ockl_sdot8(int, int, int, bool);
|
||||
|
||||
__device__
|
||||
__attribute__((const))
|
||||
unsigned int __ockl_udot4(
|
||||
HIP_vector_base<unsigned char, 4>::Native_vec_,
|
||||
HIP_vector_base<unsigned char, 4>::Native_vec_,
|
||||
unsigned int, bool);
|
||||
|
||||
__device__
|
||||
__attribute__((const))
|
||||
int __ockl_sdot8(int, int, int, bool);
|
||||
|
||||
__device__
|
||||
__attribute__((const))
|
||||
unsigned int __ockl_udot8(unsigned int, unsigned int, unsigned int, bool);
|
||||
__device__ __attribute__((const)) unsigned int __ockl_udot8(unsigned int, unsigned int,
|
||||
unsigned int, bool);
|
||||
#endif
|
||||
|
||||
#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
|
||||
// BEGIN FLOAT
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_acos_f32(float);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
float __ocml_acosh_f32(float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_asin_f32(float);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
float __ocml_asinh_f32(float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_atan2_f32(float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_atan_f32(float);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
float __ocml_atanh_f32(float);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
float __ocml_cbrt_f32(float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_ceil_f32(float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
__device__
|
||||
float __ocml_copysign_f32(float, float);
|
||||
__device__
|
||||
float __ocml_cos_f32(float);
|
||||
__device__
|
||||
float __ocml_native_cos_f32(float);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
__device__
|
||||
float __ocml_cosh_f32(float);
|
||||
__device__
|
||||
float __ocml_cospi_f32(float);
|
||||
__device__
|
||||
float __ocml_i0_f32(float);
|
||||
__device__
|
||||
float __ocml_i1_f32(float);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
float __ocml_erfc_f32(float);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
float __ocml_erfcinv_f32(float);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
float __ocml_erfcx_f32(float);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
float __ocml_erf_f32(float);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
float __ocml_erfinv_f32(float);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
float __ocml_exp10_f32(float);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
float __ocml_native_exp10_f32(float);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
float __ocml_exp2_f32(float);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
float __ocml_exp_f32(float);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
float __ocml_native_exp_f32(float);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
float __ocml_expm1_f32(float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_fabs_f32(float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_fdim_f32(float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_floor_f32(float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_fma_f32(float, float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_fmax_f32(float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_fmin_f32(float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
__device__
|
||||
float __ocml_fmod_f32(float, float);
|
||||
__device__
|
||||
float __ocml_frexp_f32(float, __attribute__((address_space(5))) int*);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_hypot_f32(float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
int __ocml_ilogb_f32(float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
int __ocml_isfinite_f32(float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
int __ocml_isinf_f32(float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
int __ocml_isnan_f32(float);
|
||||
__device__
|
||||
float __ocml_j0_f32(float);
|
||||
__device__
|
||||
float __ocml_j1_f32(float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_ldexp_f32(float, int);
|
||||
__device__
|
||||
float __ocml_lgamma_f32(float);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
float __ocml_log10_f32(float);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
float __ocml_native_log10_f32(float);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
float __ocml_log1p_f32(float);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
float __ocml_log2_f32(float);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
float __ocml_native_log2_f32(float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_logb_f32(float);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
float __ocml_log_f32(float);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
float __ocml_native_log_f32(float);
|
||||
__device__
|
||||
float __ocml_modf_f32(float, __attribute__((address_space(5))) float*);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_nearbyint_f32(float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_nextafter_f32(float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_len3_f32(float, float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_len4_f32(float, float, float, float);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
float __ocml_ncdf_f32(float);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
float __ocml_ncdfinv_f32(float);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
float __ocml_pow_f32(float, float);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
float __ocml_pown_f32(float, int);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
float __ocml_rcbrt_f32(float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_remainder_f32(float, float);
|
||||
__device__
|
||||
float __ocml_remquo_f32(float, float, __attribute__((address_space(5))) int*);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_rhypot_f32(float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_rint_f32(float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_rlen3_f32(float, float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_rlen4_f32(float, float, float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_round_f32(float);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
float __ocml_rsqrt_f32(float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_scalb_f32(float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_scalbn_f32(float, int);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
int __ocml_signbit_f32(float);
|
||||
__device__
|
||||
float __ocml_sincos_f32(float, __attribute__((address_space(5))) float*);
|
||||
__device__
|
||||
float __ocml_sincospi_f32(float, __attribute__((address_space(5))) float*);
|
||||
__device__
|
||||
float __ocml_sin_f32(float);
|
||||
__device__
|
||||
float __ocml_native_sin_f32(float);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
float __ocml_sinh_f32(float);
|
||||
__device__
|
||||
float __ocml_sinpi_f32(float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_sqrt_f32(float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_native_sqrt_f32(float);
|
||||
__device__
|
||||
float __ocml_tan_f32(float);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
float __ocml_tanh_f32(float);
|
||||
__device__
|
||||
float __ocml_tgamma_f32(float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_trunc_f32(float);
|
||||
__device__
|
||||
float __ocml_y0_f32(float);
|
||||
__device__
|
||||
float __ocml_y1_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_acos_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_acosh_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_asin_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_asinh_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_atan2_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_atan_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_atanh_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_cbrt_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_ceil_f32(float);
|
||||
__device__ __attribute__((const)) __device__ float __ocml_copysign_f32(float, float);
|
||||
__device__ float __ocml_cos_f32(float);
|
||||
__device__ float __ocml_native_cos_f32(float);
|
||||
__device__ __attribute__((pure)) __device__ float __ocml_cosh_f32(float);
|
||||
__device__ float __ocml_cospi_f32(float);
|
||||
__device__ float __ocml_i0_f32(float);
|
||||
__device__ float __ocml_i1_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_erfc_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_erfcinv_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_erfcx_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_erf_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_erfinv_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_exp10_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_native_exp10_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_exp2_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_exp_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_native_exp_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_expm1_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_fabs_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_fdim_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_floor_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_fma_f32(float, float, float);
|
||||
__device__ __attribute__((const)) float __ocml_fmax_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_fmin_f32(float, float);
|
||||
__device__ __attribute__((const)) __device__ float __ocml_fmod_f32(float, float);
|
||||
__device__ float __ocml_frexp_f32(float, __attribute__((address_space(5))) int*);
|
||||
__device__ __attribute__((const)) float __ocml_hypot_f32(float, float);
|
||||
__device__ __attribute__((const)) int __ocml_ilogb_f32(float);
|
||||
__device__ __attribute__((const)) int __ocml_isfinite_f32(float);
|
||||
__device__ __attribute__((const)) int __ocml_isinf_f32(float);
|
||||
__device__ __attribute__((const)) int __ocml_isnan_f32(float);
|
||||
__device__ float __ocml_j0_f32(float);
|
||||
__device__ float __ocml_j1_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_ldexp_f32(float, int);
|
||||
__device__ float __ocml_lgamma_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_log10_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_native_log10_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_log1p_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_log2_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_native_log2_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_logb_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_log_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_native_log_f32(float);
|
||||
__device__ float __ocml_modf_f32(float, __attribute__((address_space(5))) float*);
|
||||
__device__ __attribute__((const)) float __ocml_nearbyint_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_nextafter_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_len3_f32(float, float, float);
|
||||
__device__ __attribute__((const)) float __ocml_len4_f32(float, float, float, float);
|
||||
__device__ __attribute__((pure)) float __ocml_ncdf_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_ncdfinv_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_pow_f32(float, float);
|
||||
__device__ __attribute__((pure)) float __ocml_pown_f32(float, int);
|
||||
__device__ __attribute__((pure)) float __ocml_rcbrt_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_remainder_f32(float, float);
|
||||
__device__ float __ocml_remquo_f32(float, float, __attribute__((address_space(5))) int*);
|
||||
__device__ __attribute__((const)) float __ocml_rhypot_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_rint_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_rlen3_f32(float, float, float);
|
||||
__device__ __attribute__((const)) float __ocml_rlen4_f32(float, float, float, float);
|
||||
__device__ __attribute__((const)) float __ocml_round_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_rsqrt_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_scalb_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_scalbn_f32(float, int);
|
||||
__device__ __attribute__((const)) int __ocml_signbit_f32(float);
|
||||
__device__ float __ocml_sincos_f32(float, __attribute__((address_space(5))) float*);
|
||||
__device__ float __ocml_sincospi_f32(float, __attribute__((address_space(5))) float*);
|
||||
__device__ float __ocml_sin_f32(float);
|
||||
__device__ float __ocml_native_sin_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_sinh_f32(float);
|
||||
__device__ float __ocml_sinpi_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_sqrt_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_native_sqrt_f32(float);
|
||||
__device__ float __ocml_tan_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_tanh_f32(float);
|
||||
__device__ float __ocml_tgamma_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_trunc_f32(float);
|
||||
__device__ float __ocml_y0_f32(float);
|
||||
__device__ float __ocml_y1_f32(float);
|
||||
|
||||
// BEGIN INTRINSICS
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_add_rte_f32(float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_add_rtn_f32(float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_add_rtp_f32(float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_add_rtz_f32(float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_sub_rte_f32(float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_sub_rtn_f32(float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_sub_rtp_f32(float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_sub_rtz_f32(float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_mul_rte_f32(float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_mul_rtn_f32(float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_mul_rtp_f32(float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_mul_rtz_f32(float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_div_rte_f32(float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_div_rtn_f32(float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_div_rtp_f32(float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_div_rtz_f32(float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_sqrt_rte_f32(float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_sqrt_rtn_f32(float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_sqrt_rtp_f32(float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_sqrt_rtz_f32(float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_fma_rte_f32(float, float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_fma_rtn_f32(float, float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_fma_rtp_f32(float, float, float);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
float __ocml_fma_rtz_f32(float, float, float);
|
||||
__device__ __attribute__((const)) float __ocml_add_rte_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_add_rtn_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_add_rtp_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_add_rtz_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_sub_rte_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_sub_rtn_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_sub_rtp_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_sub_rtz_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_mul_rte_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_mul_rtn_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_mul_rtp_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_mul_rtz_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_div_rte_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_div_rtn_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_div_rtp_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_div_rtz_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_sqrt_rte_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_sqrt_rtn_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_sqrt_rtp_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_sqrt_rtz_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_fma_rte_f32(float, float, float);
|
||||
__device__ __attribute__((const)) float __ocml_fma_rtn_f32(float, float, float);
|
||||
__device__ __attribute__((const)) float __ocml_fma_rtp_f32(float, float, float);
|
||||
__device__ __attribute__((const)) float __ocml_fma_rtz_f32(float, float, float);
|
||||
// END INTRINSICS
|
||||
// END FLOAT
|
||||
|
||||
// BEGIN DOUBLE
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_acos_f64(double);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
double __ocml_acosh_f64(double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_asin_f64(double);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
double __ocml_asinh_f64(double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_atan2_f64(double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_atan_f64(double);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
double __ocml_atanh_f64(double);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
double __ocml_cbrt_f64(double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_ceil_f64(double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_copysign_f64(double, double);
|
||||
__device__
|
||||
double __ocml_cos_f64(double);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
double __ocml_cosh_f64(double);
|
||||
__device__
|
||||
double __ocml_cospi_f64(double);
|
||||
__device__
|
||||
double __ocml_i0_f64(double);
|
||||
__device__
|
||||
double __ocml_i1_f64(double);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
double __ocml_erfc_f64(double);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
double __ocml_erfcinv_f64(double);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
double __ocml_erfcx_f64(double);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
double __ocml_erf_f64(double);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
double __ocml_erfinv_f64(double);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
double __ocml_exp10_f64(double);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
double __ocml_exp2_f64(double);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
double __ocml_exp_f64(double);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
double __ocml_expm1_f64(double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_fabs_f64(double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_fdim_f64(double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_floor_f64(double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_fma_f64(double, double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_fmax_f64(double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_fmin_f64(double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_fmod_f64(double, double);
|
||||
__device__
|
||||
double __ocml_frexp_f64(double, __attribute__((address_space(5))) int*);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_hypot_f64(double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
int __ocml_ilogb_f64(double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
int __ocml_isfinite_f64(double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
int __ocml_isinf_f64(double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
int __ocml_isnan_f64(double);
|
||||
__device__
|
||||
double __ocml_j0_f64(double);
|
||||
__device__
|
||||
double __ocml_j1_f64(double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_ldexp_f64(double, int);
|
||||
__device__
|
||||
double __ocml_lgamma_f64(double);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
double __ocml_log10_f64(double);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
double __ocml_log1p_f64(double);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
double __ocml_log2_f64(double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_logb_f64(double);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
double __ocml_log_f64(double);
|
||||
__device__
|
||||
double __ocml_modf_f64(double, __attribute__((address_space(5))) double*);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_nearbyint_f64(double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_nextafter_f64(double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_len3_f64(double, double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_len4_f64(double, double, double, double);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
double __ocml_ncdf_f64(double);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
double __ocml_ncdfinv_f64(double);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
double __ocml_pow_f64(double, double);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
double __ocml_pown_f64(double, int);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
double __ocml_rcbrt_f64(double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_remainder_f64(double, double);
|
||||
__device__
|
||||
double __ocml_remquo_f64(
|
||||
double, double, __attribute__((address_space(5))) int*);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_rhypot_f64(double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_rint_f64(double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_rlen3_f64(double, double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_rlen4_f64(double, double, double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_round_f64(double);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
double __ocml_rsqrt_f64(double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_scalb_f64(double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_scalbn_f64(double, int);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
int __ocml_signbit_f64(double);
|
||||
__device__
|
||||
double __ocml_sincos_f64(double, __attribute__((address_space(5))) double*);
|
||||
__device__
|
||||
double __ocml_sincospi_f64(double, __attribute__((address_space(5))) double*);
|
||||
__device__
|
||||
double __ocml_sin_f64(double);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
double __ocml_sinh_f64(double);
|
||||
__device__
|
||||
double __ocml_sinpi_f64(double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_sqrt_f64(double);
|
||||
__device__
|
||||
double __ocml_tan_f64(double);
|
||||
__device__
|
||||
__attribute__((pure))
|
||||
double __ocml_tanh_f64(double);
|
||||
__device__
|
||||
double __ocml_tgamma_f64(double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_trunc_f64(double);
|
||||
__device__
|
||||
double __ocml_y0_f64(double);
|
||||
__device__
|
||||
double __ocml_y1_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_acos_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_acosh_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_asin_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_asinh_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_atan2_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_atan_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_atanh_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_cbrt_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_ceil_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_copysign_f64(double, double);
|
||||
__device__ double __ocml_cos_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_cosh_f64(double);
|
||||
__device__ double __ocml_cospi_f64(double);
|
||||
__device__ double __ocml_i0_f64(double);
|
||||
__device__ double __ocml_i1_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_erfc_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_erfcinv_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_erfcx_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_erf_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_erfinv_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_exp10_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_exp2_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_exp_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_expm1_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_fabs_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_fdim_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_floor_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_fma_f64(double, double, double);
|
||||
__device__ __attribute__((const)) double __ocml_fmax_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_fmin_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_fmod_f64(double, double);
|
||||
__device__ double __ocml_frexp_f64(double, __attribute__((address_space(5))) int*);
|
||||
__device__ __attribute__((const)) double __ocml_hypot_f64(double, double);
|
||||
__device__ __attribute__((const)) int __ocml_ilogb_f64(double);
|
||||
__device__ __attribute__((const)) int __ocml_isfinite_f64(double);
|
||||
__device__ __attribute__((const)) int __ocml_isinf_f64(double);
|
||||
__device__ __attribute__((const)) int __ocml_isnan_f64(double);
|
||||
__device__ double __ocml_j0_f64(double);
|
||||
__device__ double __ocml_j1_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_ldexp_f64(double, int);
|
||||
__device__ double __ocml_lgamma_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_log10_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_log1p_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_log2_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_logb_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_log_f64(double);
|
||||
__device__ double __ocml_modf_f64(double, __attribute__((address_space(5))) double*);
|
||||
__device__ __attribute__((const)) double __ocml_nearbyint_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_nextafter_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_len3_f64(double, double, double);
|
||||
__device__ __attribute__((const)) double __ocml_len4_f64(double, double, double, double);
|
||||
__device__ __attribute__((pure)) double __ocml_ncdf_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_ncdfinv_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_pow_f64(double, double);
|
||||
__device__ __attribute__((pure)) double __ocml_pown_f64(double, int);
|
||||
__device__ __attribute__((pure)) double __ocml_rcbrt_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_remainder_f64(double, double);
|
||||
__device__ double __ocml_remquo_f64(double, double, __attribute__((address_space(5))) int*);
|
||||
__device__ __attribute__((const)) double __ocml_rhypot_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_rint_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_rlen3_f64(double, double, double);
|
||||
__device__ __attribute__((const)) double __ocml_rlen4_f64(double, double, double, double);
|
||||
__device__ __attribute__((const)) double __ocml_round_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_rsqrt_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_scalb_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_scalbn_f64(double, int);
|
||||
__device__ __attribute__((const)) int __ocml_signbit_f64(double);
|
||||
__device__ double __ocml_sincos_f64(double, __attribute__((address_space(5))) double*);
|
||||
__device__ double __ocml_sincospi_f64(double, __attribute__((address_space(5))) double*);
|
||||
__device__ double __ocml_sin_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_sinh_f64(double);
|
||||
__device__ double __ocml_sinpi_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_sqrt_f64(double);
|
||||
__device__ double __ocml_tan_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_tanh_f64(double);
|
||||
__device__ double __ocml_tgamma_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_trunc_f64(double);
|
||||
__device__ double __ocml_y0_f64(double);
|
||||
__device__ double __ocml_y1_f64(double);
|
||||
|
||||
// BEGIN INTRINSICS
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_add_rte_f64(double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_add_rtn_f64(double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_add_rtp_f64(double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_add_rtz_f64(double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_sub_rte_f64(double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_sub_rtn_f64(double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_sub_rtp_f64(double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_sub_rtz_f64(double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_mul_rte_f64(double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_mul_rtn_f64(double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_mul_rtp_f64(double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_mul_rtz_f64(double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_div_rte_f64(double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_div_rtn_f64(double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_div_rtp_f64(double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_div_rtz_f64(double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_sqrt_rte_f64(double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_sqrt_rtn_f64(double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_sqrt_rtp_f64(double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_sqrt_rtz_f64(double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_fma_rte_f64(double, double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_fma_rtn_f64(double, double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_fma_rtp_f64(double, double, double);
|
||||
__device__
|
||||
__attribute__((const))
|
||||
double __ocml_fma_rtz_f64(double, double, double);
|
||||
__device__ __attribute__((const)) double __ocml_add_rte_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_add_rtn_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_add_rtp_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_add_rtz_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_sub_rte_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_sub_rtn_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_sub_rtp_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_sub_rtz_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_mul_rte_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_mul_rtn_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_mul_rtp_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_mul_rtz_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_div_rte_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_div_rtn_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_div_rtp_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_div_rtz_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_sqrt_rte_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_sqrt_rtn_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_sqrt_rtp_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_sqrt_rtz_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_fma_rte_f64(double, double, double);
|
||||
__device__ __attribute__((const)) double __ocml_fma_rtn_f64(double, double, double);
|
||||
__device__ __attribute__((const)) double __ocml_fma_rtp_f64(double, double, double);
|
||||
__device__ __attribute__((const)) double __ocml_fma_rtz_f64(double, double, double);
|
||||
// END INTRINSICS
|
||||
// END DOUBLE
|
||||
|
||||
#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
|
||||
#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
|
||||
|
||||
#if defined(__cplusplus)
|
||||
} // extern "C"
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
@@ -30,109 +30,190 @@ extern "C" {
|
||||
|
||||
#define ADDRESS_SPACE_CONSTANT __attribute__((address_space(4)))
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_load_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c);
|
||||
__device__ float4::Native_vec_ __ockl_image_load_1D(unsigned int ADDRESS_SPACE_CONSTANT* i, int c);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_load_1Db(unsigned int ADDRESS_SPACE_CONSTANT*i, int c);
|
||||
__device__ float4::Native_vec_ __ockl_image_load_1Db(unsigned int ADDRESS_SPACE_CONSTANT* i, int c);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_load_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c);
|
||||
__device__ float4::Native_vec_ __ockl_image_load_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
int2::Native_vec_ c);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_load_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c);
|
||||
__device__ float4::Native_vec_ __ockl_image_load_2D(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
int2::Native_vec_ c);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_load_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c);
|
||||
__device__ float4::Native_vec_ __ockl_image_load_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
int4::Native_vec_ c);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_load_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c);
|
||||
__device__ float4::Native_vec_ __ockl_image_load_3D(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
int4::Native_vec_ c);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_load_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f);
|
||||
__device__ float4::Native_vec_ __ockl_image_load_CM(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
int2::Native_vec_ c, int f);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_load_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f);
|
||||
__device__ float4::Native_vec_ __ockl_image_load_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
int4::Native_vec_ c, int f);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_load_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c, int l);
|
||||
__device__ float4::Native_vec_ __ockl_image_load_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
int c, int l);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_load_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l);
|
||||
__device__ float4::Native_vec_ __ockl_image_load_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
int2::Native_vec_ c, int l);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_load_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l);
|
||||
__device__ float4::Native_vec_ __ockl_image_load_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
int2::Native_vec_ c, int l);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_load_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l);
|
||||
__device__ float4::Native_vec_ __ockl_image_load_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
int4::Native_vec_ c, int l);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_load_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l);
|
||||
__device__ float4::Native_vec_ __ockl_image_load_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
int4::Native_vec_ c, int l);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_load_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f, int l);
|
||||
__device__ float4::Native_vec_ __ockl_image_load_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
int2::Native_vec_ c, int f, int l);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_load_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f, int l);
|
||||
__device__ float4::Native_vec_ __ockl_image_load_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
int4::Native_vec_ c, int f, int l);
|
||||
|
||||
__device__ void __ockl_image_store_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c, float4::Native_vec_ p);
|
||||
__device__ void __ockl_image_store_1D(unsigned int ADDRESS_SPACE_CONSTANT* i, int c,
|
||||
float4::Native_vec_ p);
|
||||
|
||||
__device__ void __ockl_image_store_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, float4::Native_vec_ p);
|
||||
__device__ void __ockl_image_store_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i, int2::Native_vec_ c,
|
||||
float4::Native_vec_ p);
|
||||
|
||||
__device__ void __ockl_image_store_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, float4::Native_vec_ p);
|
||||
__device__ void __ockl_image_store_2D(unsigned int ADDRESS_SPACE_CONSTANT* i, int2::Native_vec_ c,
|
||||
float4::Native_vec_ p);
|
||||
|
||||
__device__ void __ockl_image_store_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, float4::Native_vec_ p);
|
||||
__device__ void __ockl_image_store_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i, int4::Native_vec_ c,
|
||||
float4::Native_vec_ p);
|
||||
|
||||
__device__ void __ockl_image_store_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, float4::Native_vec_ p);
|
||||
__device__ void __ockl_image_store_3D(unsigned int ADDRESS_SPACE_CONSTANT* i, int4::Native_vec_ c,
|
||||
float4::Native_vec_ p);
|
||||
|
||||
__device__ void __ockl_image_store_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f, float4::Native_vec_ p);
|
||||
__device__ void __ockl_image_store_CM(unsigned int ADDRESS_SPACE_CONSTANT* i, int2::Native_vec_ c,
|
||||
int f, float4::Native_vec_ p);
|
||||
|
||||
__device__ void __ockl_image_store_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f, float4::Native_vec_ p);
|
||||
__device__ void __ockl_image_store_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i, int4::Native_vec_ c,
|
||||
int f, float4::Native_vec_ p);
|
||||
|
||||
__device__ void __ockl_image_store_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c, int l, float4::Native_vec_ p);
|
||||
__device__ void __ockl_image_store_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT* i, int c, int l,
|
||||
float4::Native_vec_ p);
|
||||
|
||||
__device__ void __ockl_image_store_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l, float4::Native_vec_ p);
|
||||
__device__ void __ockl_image_store_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
int2::Native_vec_ c, int l, float4::Native_vec_ p);
|
||||
|
||||
__device__ void __ockl_image_store_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l, float4::Native_vec_ p);
|
||||
__device__ void __ockl_image_store_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
int2::Native_vec_ c, int l, float4::Native_vec_ p);
|
||||
|
||||
__device__ void __ockl_image_store_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l, float4::Native_vec_ p);
|
||||
__device__ void __ockl_image_store_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
int4::Native_vec_ c, int l, float4::Native_vec_ p);
|
||||
|
||||
__device__ void __ockl_image_store_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l, float4::Native_vec_ p);
|
||||
__device__ void __ockl_image_store_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
int4::Native_vec_ c, int l, float4::Native_vec_ p);
|
||||
|
||||
__device__ void __ockl_image_store_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f, int l, float4::Native_vec_ p);
|
||||
__device__ void __ockl_image_store_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
int2::Native_vec_ c, int f, int l, float4::Native_vec_ p);
|
||||
|
||||
__device__ void __ockl_image_store_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f, int l, float4::Native_vec_ p);
|
||||
__device__ void __ockl_image_store_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
int4::Native_vec_ c, int f, int l,
|
||||
float4::Native_vec_ p);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float c);
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_1D(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
unsigned int ADDRESS_SPACE_CONSTANT* s,
|
||||
float c);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
unsigned int ADDRESS_SPACE_CONSTANT* s,
|
||||
float2::Native_vec_ c);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_2D(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
unsigned int ADDRESS_SPACE_CONSTANT* s,
|
||||
float2::Native_vec_ c);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
unsigned int ADDRESS_SPACE_CONSTANT* s,
|
||||
float4::Native_vec_ c);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_3D(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
unsigned int ADDRESS_SPACE_CONSTANT* s,
|
||||
float4::Native_vec_ c);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_CM(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
unsigned int ADDRESS_SPACE_CONSTANT* s,
|
||||
float4::Native_vec_ c);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
unsigned int ADDRESS_SPACE_CONSTANT* s,
|
||||
float4::Native_vec_ c);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_grad_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float c, float dx, float dy);
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_grad_1D(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
unsigned int ADDRESS_SPACE_CONSTANT* s,
|
||||
float c, float dx, float dy);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_grad_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float dx, float dy);
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_grad_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
unsigned int ADDRESS_SPACE_CONSTANT* s,
|
||||
float2::Native_vec_ c, float dx,
|
||||
float dy);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_grad_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float2::Native_vec_ dx, float2::Native_vec_ dy);
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_grad_2D(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
unsigned int ADDRESS_SPACE_CONSTANT* s,
|
||||
float2::Native_vec_ c,
|
||||
float2::Native_vec_ dx,
|
||||
float2::Native_vec_ dy);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_grad_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float2::Native_vec_ dx, float2::Native_vec_ dy);
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_grad_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
unsigned int ADDRESS_SPACE_CONSTANT* s,
|
||||
float4::Native_vec_ c,
|
||||
float2::Native_vec_ dx,
|
||||
float2::Native_vec_ dy);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_grad_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float4::Native_vec_ dx, float4::Native_vec_ dy);
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_grad_3D(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
unsigned int ADDRESS_SPACE_CONSTANT* s,
|
||||
float4::Native_vec_ c,
|
||||
float4::Native_vec_ dx,
|
||||
float4::Native_vec_ dy);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float c, float l);
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
unsigned int ADDRESS_SPACE_CONSTANT* s,
|
||||
float c, float l);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float l);
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
unsigned int ADDRESS_SPACE_CONSTANT* s,
|
||||
float2::Native_vec_ c, float l);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float l);
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
unsigned int ADDRESS_SPACE_CONSTANT* s,
|
||||
float2::Native_vec_ c, float l);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
unsigned int ADDRESS_SPACE_CONSTANT* s,
|
||||
float4::Native_vec_ c, float l);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
unsigned int ADDRESS_SPACE_CONSTANT* s,
|
||||
float4::Native_vec_ c, float l);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
unsigned int ADDRESS_SPACE_CONSTANT* s,
|
||||
float4::Native_vec_ c, float l);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
|
||||
__device__ float4::Native_vec_ __ockl_image_sample_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
unsigned int ADDRESS_SPACE_CONSTANT* s,
|
||||
float4::Native_vec_ c, float l);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_gather4r_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
|
||||
__device__ float4::Native_vec_ __ockl_image_gather4r_2D(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
unsigned int ADDRESS_SPACE_CONSTANT* s,
|
||||
float2::Native_vec_ c);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_gather4g_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
|
||||
__device__ float4::Native_vec_ __ockl_image_gather4g_2D(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
unsigned int ADDRESS_SPACE_CONSTANT* s,
|
||||
float2::Native_vec_ c);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_gather4b_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
|
||||
__device__ float4::Native_vec_ __ockl_image_gather4b_2D(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
unsigned int ADDRESS_SPACE_CONSTANT* s,
|
||||
float2::Native_vec_ c);
|
||||
|
||||
__device__ float4::Native_vec_ __ockl_image_gather4a_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
|
||||
__device__ float4::Native_vec_ __ockl_image_gather4a_2D(unsigned int ADDRESS_SPACE_CONSTANT* i,
|
||||
unsigned int ADDRESS_SPACE_CONSTANT* s,
|
||||
float2::Native_vec_ c);
|
||||
|
||||
__device__ int __ockl_image_channel_data_type_1D(unsigned int ADDRESS_SPACE_CONSTANT* i);
|
||||
|
||||
@@ -173,5 +254,4 @@ __device__ int __ockl_image_channel_order_3D(unsigned int ADDRESS_SPACE_CONSTANT
|
||||
__device__ int __ockl_image_channel_order_CM(unsigned int ADDRESS_SPACE_CONSTANT* i);
|
||||
|
||||
__device__ int __ockl_image_channel_order_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i);
|
||||
|
||||
}
|
||||
|
||||
@@ -41,67 +41,63 @@ namespace hip_impl {
|
||||
// This section contains internal APIs that
|
||||
// needs to be exported
|
||||
#ifdef __GNUC__
|
||||
#pragma GCC visibility push (default)
|
||||
#pragma GCC visibility push(default)
|
||||
#endif
|
||||
|
||||
struct kernarg_impl;
|
||||
class kernarg {
|
||||
public:
|
||||
kernarg();
|
||||
kernarg(kernarg&&);
|
||||
~kernarg();
|
||||
std::uint8_t* data();
|
||||
std::size_t size();
|
||||
void reserve(std::size_t);
|
||||
void resize(std::size_t);
|
||||
private:
|
||||
kernarg_impl* impl;
|
||||
public:
|
||||
kernarg();
|
||||
kernarg(kernarg&&);
|
||||
~kernarg();
|
||||
std::uint8_t* data();
|
||||
std::size_t size();
|
||||
void reserve(std::size_t);
|
||||
void resize(std::size_t);
|
||||
|
||||
private:
|
||||
kernarg_impl* impl;
|
||||
};
|
||||
|
||||
class kernargs_size_align;
|
||||
class program_state_impl;
|
||||
class program_state {
|
||||
public:
|
||||
program_state();
|
||||
~program_state();
|
||||
program_state(const program_state&) = delete;
|
||||
public:
|
||||
program_state();
|
||||
~program_state();
|
||||
program_state(const program_state&) = delete;
|
||||
|
||||
hipFunction_t kernel_descriptor(std::uintptr_t,
|
||||
hsa_agent_t);
|
||||
hipFunction_t kernel_descriptor(std::uintptr_t, hsa_agent_t);
|
||||
|
||||
kernargs_size_align get_kernargs_size_align(std::uintptr_t);
|
||||
hsa_executable_t load_executable(const char*, const size_t,
|
||||
hsa_executable_t,
|
||||
hsa_agent_t);
|
||||
hsa_executable_t load_executable_no_copy(const char*, const size_t,
|
||||
hsa_executable_t,
|
||||
hsa_agent_t);
|
||||
kernargs_size_align get_kernargs_size_align(std::uintptr_t);
|
||||
hsa_executable_t load_executable(const char*, const size_t, hsa_executable_t, hsa_agent_t);
|
||||
hsa_executable_t load_executable_no_copy(const char*, const size_t, hsa_executable_t,
|
||||
hsa_agent_t);
|
||||
|
||||
void* global_addr_by_name(const char* name);
|
||||
void* global_addr_by_name(const char* name);
|
||||
|
||||
private:
|
||||
friend class agent_globals_impl;
|
||||
program_state_impl* impl;
|
||||
private:
|
||||
friend class agent_globals_impl;
|
||||
program_state_impl* impl;
|
||||
};
|
||||
|
||||
class kernargs_size_align {
|
||||
public:
|
||||
std::size_t size(std::size_t n) const;
|
||||
std::size_t alignment(std::size_t n) const;
|
||||
const void* getHandle() const {return handle;};
|
||||
private:
|
||||
const void* handle;
|
||||
friend kernargs_size_align program_state::get_kernargs_size_align(std::uintptr_t);
|
||||
public:
|
||||
std::size_t size(std::size_t n) const;
|
||||
std::size_t alignment(std::size_t n) const;
|
||||
const void* getHandle() const { return handle; };
|
||||
|
||||
private:
|
||||
const void* handle;
|
||||
friend kernargs_size_align program_state::get_kernargs_size_align(std::uintptr_t);
|
||||
};
|
||||
|
||||
#ifdef __GNUC__
|
||||
#pragma GCC visibility pop
|
||||
#endif
|
||||
|
||||
inline
|
||||
__attribute__((visibility("hidden")))
|
||||
program_state& get_program_state() {
|
||||
static program_state ps;
|
||||
return ps;
|
||||
inline __attribute__((visibility("hidden"))) program_state& get_program_state() {
|
||||
static program_state ps;
|
||||
return ps;
|
||||
}
|
||||
} // Namespace hip_impl.
|
||||
|
||||
@@ -29,95 +29,66 @@ THE SOFTWARE.
|
||||
#include <hip/hip_texture_types.h>
|
||||
#include <hip/amd_detail/ockl_image.h>
|
||||
#include <type_traits>
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
|
||||
#define TEXTURE_PARAMETERS_INIT \
|
||||
unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)t.textureObject; \
|
||||
unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD; \
|
||||
(void)s;
|
||||
|
||||
template<typename T>
|
||||
struct __hip_is_tex_surf_scalar_channel_type
|
||||
{
|
||||
static constexpr bool value =
|
||||
__hip_internal::is_same<T, char>::value ||
|
||||
__hip_internal::is_same<T, unsigned char>::value ||
|
||||
__hip_internal::is_same<T, short>::value ||
|
||||
__hip_internal::is_same<T, unsigned short>::value ||
|
||||
__hip_internal::is_same<T, int>::value ||
|
||||
__hip_internal::is_same<T, unsigned int>::value ||
|
||||
__hip_internal::is_same<T, float>::value;
|
||||
template <typename T> struct __hip_is_tex_surf_scalar_channel_type {
|
||||
static constexpr bool value = __hip_internal::is_same<T, char>::value ||
|
||||
__hip_internal::is_same<T, unsigned char>::value ||
|
||||
__hip_internal::is_same<T, short>::value ||
|
||||
__hip_internal::is_same<T, unsigned short>::value || __hip_internal::is_same<T, int>::value ||
|
||||
__hip_internal::is_same<T, unsigned int>::value || __hip_internal::is_same<T, float>::value;
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
struct __hip_is_tex_surf_channel_type
|
||||
{
|
||||
static constexpr bool value =
|
||||
__hip_is_tex_surf_scalar_channel_type<T>::value;
|
||||
template <typename T> struct __hip_is_tex_surf_channel_type {
|
||||
static constexpr bool value = __hip_is_tex_surf_scalar_channel_type<T>::value;
|
||||
};
|
||||
|
||||
template<
|
||||
typename T,
|
||||
unsigned int rank>
|
||||
struct __hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>>
|
||||
{
|
||||
static constexpr bool value =
|
||||
__hip_is_tex_surf_scalar_channel_type<T>::value &&
|
||||
((rank == 1) ||
|
||||
(rank == 2) ||
|
||||
(rank == 4));
|
||||
template <typename T, unsigned int rank>
|
||||
struct __hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>> {
|
||||
static constexpr bool value = __hip_is_tex_surf_scalar_channel_type<T>::value &&
|
||||
((rank == 1) || (rank == 2) || (rank == 4));
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
struct __hip_is_tex_normalized_channel_type
|
||||
{
|
||||
static constexpr bool value =
|
||||
__hip_internal::is_same<T, char>::value ||
|
||||
__hip_internal::is_same<T, unsigned char>::value ||
|
||||
__hip_internal::is_same<T, short>::value ||
|
||||
__hip_internal::is_same<T, unsigned short>::value;
|
||||
template <typename T> struct __hip_is_tex_normalized_channel_type {
|
||||
static constexpr bool value = __hip_internal::is_same<T, char>::value ||
|
||||
__hip_internal::is_same<T, unsigned char>::value ||
|
||||
__hip_internal::is_same<T, short>::value || __hip_internal::is_same<T, unsigned short>::value;
|
||||
};
|
||||
|
||||
template<
|
||||
typename T,
|
||||
unsigned int rank>
|
||||
struct __hip_is_tex_normalized_channel_type<HIP_vector_type<T, rank>>
|
||||
{
|
||||
static constexpr bool value =
|
||||
__hip_is_tex_normalized_channel_type<T>::value &&
|
||||
((rank == 1) ||
|
||||
(rank == 2) ||
|
||||
(rank == 4));
|
||||
template <typename T, unsigned int rank>
|
||||
struct __hip_is_tex_normalized_channel_type<HIP_vector_type<T, rank>> {
|
||||
static constexpr bool value =
|
||||
__hip_is_tex_normalized_channel_type<T>::value && ((rank == 1) || (rank == 2) || (rank == 4));
|
||||
};
|
||||
|
||||
template <
|
||||
typename T,
|
||||
hipTextureReadMode readMode,
|
||||
typename Enable = void>
|
||||
struct __hip_tex_ret
|
||||
{
|
||||
static_assert(__hip_internal::is_same<Enable, void>::value, "Invalid channel type!");
|
||||
template <typename T, hipTextureReadMode readMode, typename Enable = void> struct __hip_tex_ret {
|
||||
static_assert(__hip_internal::is_same<Enable, void>::value, "Invalid channel type!");
|
||||
};
|
||||
|
||||
/*
|
||||
* Map from device function return U to scalar texture type T
|
||||
*/
|
||||
template<typename T, typename U>
|
||||
template <typename T, typename U>
|
||||
__forceinline__ __device__
|
||||
typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_scalar_channel_type<T>::value, const T>::type
|
||||
__hipMapFrom(const U &u) {
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_scalar_channel_type<T>::value,
|
||||
const T>::type
|
||||
__hipMapFrom(const U& u) {
|
||||
if constexpr (sizeof(T) < sizeof(float)) {
|
||||
union {
|
||||
U u;
|
||||
int i;
|
||||
} d = { u };
|
||||
} d = {u};
|
||||
return static_cast<T>(d.i);
|
||||
} else { // sizeof(T) == sizeof(float)
|
||||
} else { // sizeof(T) == sizeof(float)
|
||||
union {
|
||||
U u;
|
||||
T t;
|
||||
} d = { u };
|
||||
} d = {u};
|
||||
return d.t;
|
||||
}
|
||||
}
|
||||
@@ -125,22 +96,21 @@ __hipMapFrom(const U &u) {
|
||||
/*
|
||||
* Map from device function return U to vector texture type T
|
||||
*/
|
||||
template<typename T, typename U>
|
||||
__forceinline__ __device__
|
||||
typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_scalar_channel_type<typename T::value_type>::value, const T>::type
|
||||
__hipMapFrom(const U &u) {
|
||||
template <typename T, typename U>
|
||||
__forceinline__ __device__ typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_scalar_channel_type<typename T::value_type>::value, const T>::type
|
||||
__hipMapFrom(const U& u) {
|
||||
if constexpr (sizeof(typename T::value_type) < sizeof(float)) {
|
||||
union {
|
||||
U u;
|
||||
int4 i4;
|
||||
} d = { u };
|
||||
return __hipMapVector<typename T::value_type, sizeof(T)/sizeof(typename T::value_type)>(d.i4);
|
||||
} else { // sizeof(typename T::value_type) == sizeof(float)
|
||||
} d = {u};
|
||||
return __hipMapVector<typename T::value_type, sizeof(T) / sizeof(typename T::value_type)>(d.i4);
|
||||
} else { // sizeof(typename T::value_type) == sizeof(float)
|
||||
union {
|
||||
U u;
|
||||
T t;
|
||||
} d = { u };
|
||||
} d = {u};
|
||||
return d.t;
|
||||
}
|
||||
}
|
||||
@@ -148,23 +118,23 @@ __hipMapFrom(const U &u) {
|
||||
/*
|
||||
* Map from scalar texture type T to device function input U
|
||||
*/
|
||||
template<typename U, typename T>
|
||||
template <typename U, typename T>
|
||||
__forceinline__ __device__
|
||||
typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_scalar_channel_type<T>::value, const U>::type
|
||||
__hipMapTo(const T &t) {
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_scalar_channel_type<T>::value,
|
||||
const U>::type
|
||||
__hipMapTo(const T& t) {
|
||||
if constexpr (sizeof(T) < sizeof(float)) {
|
||||
union {
|
||||
U u;
|
||||
int i;
|
||||
} d = { 0 };
|
||||
} d = {0};
|
||||
d.i = static_cast<int>(t);
|
||||
return d.u;
|
||||
} else { // sizeof(T) == sizeof(float)
|
||||
} else { // sizeof(T) == sizeof(float)
|
||||
union {
|
||||
U u;
|
||||
T t;
|
||||
} d = { 0 };
|
||||
} d = {0};
|
||||
d.t = t;
|
||||
return d.u;
|
||||
}
|
||||
@@ -173,337 +143,315 @@ __hipMapTo(const T &t) {
|
||||
/*
|
||||
* Map from vector texture type T to device function input U
|
||||
*/
|
||||
template<typename U, typename T>
|
||||
__forceinline__ __device__
|
||||
typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_scalar_channel_type<typename T::value_type>::value, const U>::type
|
||||
__hipMapTo(const T &t) {
|
||||
template <typename U, typename T>
|
||||
__forceinline__ __device__ typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_scalar_channel_type<typename T::value_type>::value, const U>::type
|
||||
__hipMapTo(const T& t) {
|
||||
if constexpr (sizeof(typename T::value_type) < sizeof(float)) {
|
||||
union {
|
||||
U u;
|
||||
int4 i4;
|
||||
} d = { 0 };
|
||||
} d = {0};
|
||||
d.i4 = __hipMapVector<int, 4>(t);
|
||||
return d.u;
|
||||
} else { // sizeof(typename T::value_type) == sizeof(float)
|
||||
} else { // sizeof(typename T::value_type) == sizeof(float)
|
||||
union {
|
||||
U u;
|
||||
T t;
|
||||
} d = { 0 };
|
||||
} d = {0};
|
||||
d.t = t;
|
||||
return d.u;
|
||||
}
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
hipTextureReadMode readMode>
|
||||
template <typename T, hipTextureReadMode readMode>
|
||||
using __hip_tex_ret_t = typename __hip_tex_ret<T, readMode, bool>::type;
|
||||
|
||||
template <typename T>
|
||||
struct __hip_tex_ret<
|
||||
T,
|
||||
hipReadModeElementType,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value, bool>::type>
|
||||
{
|
||||
using type = T;
|
||||
T, hipReadModeElementType,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value, bool>::type> {
|
||||
using type = T;
|
||||
};
|
||||
|
||||
template<
|
||||
typename T,
|
||||
unsigned int rank>
|
||||
template <typename T, unsigned int rank>
|
||||
struct __hip_tex_ret<
|
||||
HIP_vector_type<T, rank>,
|
||||
hipReadModeElementType,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>>::value, bool>::type>
|
||||
{
|
||||
using type = HIP_vector_type<__hip_tex_ret_t<T, hipReadModeElementType>, rank>;
|
||||
HIP_vector_type<T, rank>, hipReadModeElementType,
|
||||
typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>>::value, bool>::type> {
|
||||
using type = HIP_vector_type<__hip_tex_ret_t<T, hipReadModeElementType>, rank>;
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
template <typename T>
|
||||
struct __hip_tex_ret<T, hipReadModeNormalizedFloat,
|
||||
typename __hip_internal::enable_if<
|
||||
__hip_is_tex_normalized_channel_type<T>::value, bool>::type> {
|
||||
using type = float;
|
||||
};
|
||||
|
||||
template <typename T, unsigned int rank>
|
||||
struct __hip_tex_ret<
|
||||
T,
|
||||
hipReadModeNormalizedFloat,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_normalized_channel_type<T>::value, bool>::type>
|
||||
{
|
||||
using type = float;
|
||||
};
|
||||
|
||||
template<
|
||||
typename T,
|
||||
unsigned int rank>
|
||||
struct __hip_tex_ret<
|
||||
HIP_vector_type<T, rank>,
|
||||
hipReadModeNormalizedFloat,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_normalized_channel_type<HIP_vector_type<T, rank>>::value, bool>::type>
|
||||
{
|
||||
using type = HIP_vector_type<__hip_tex_ret_t<T, hipReadModeNormalizedFloat>, rank>;
|
||||
HIP_vector_type<T, rank>, hipReadModeNormalizedFloat,
|
||||
typename __hip_internal::enable_if<
|
||||
__hip_is_tex_normalized_channel_type<HIP_vector_type<T, rank>>::value, bool>::type> {
|
||||
using type = HIP_vector_type<__hip_tex_ret_t<T, hipReadModeNormalizedFloat>, rank>;
|
||||
};
|
||||
|
||||
|
||||
template <typename T, hipTextureReadMode readMode>
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1Dfetch(texture<T, hipTextureType1D, readMode> t, int x)
|
||||
{
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
auto tmp = __ockl_image_load_1Db(i, x);
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1Dfetch(
|
||||
texture<T, hipTextureType1D, readMode> t, int x) {
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
auto tmp = __ockl_image_load_1Db(i, x);
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
}
|
||||
|
||||
template <typename T, hipTextureReadMode readMode>
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1D(texture<T, hipTextureType1D, readMode> t, float x)
|
||||
{
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
auto tmp = __ockl_image_sample_1D(i, s, x);
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1D(
|
||||
texture<T, hipTextureType1D, readMode> t, float x) {
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
auto tmp = __ockl_image_sample_1D(i, s, x);
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
}
|
||||
|
||||
template <typename T, hipTextureReadMode readMode>
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2D(texture<T, hipTextureType2D, readMode> t, float x, float y)
|
||||
{
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float2 coords{x, y};
|
||||
auto tmp = __ockl_image_sample_2D(i, s, get_native_vector(coords));
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2D(
|
||||
texture<T, hipTextureType2D, readMode> t, float x, float y) {
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float2 coords{x, y};
|
||||
auto tmp = __ockl_image_sample_2D(i, s, get_native_vector(coords));
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
}
|
||||
|
||||
template <typename T, hipTextureReadMode readMode>
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLayered(texture<T, hipTextureType1DLayered, readMode> t, float x, int layer)
|
||||
{
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float2 coords{x, layer};
|
||||
auto tmp = __ockl_image_sample_1Da(i, s, get_native_vector(coords));
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLayered(
|
||||
texture<T, hipTextureType1DLayered, readMode> t, float x, int layer) {
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float2 coords{x, layer};
|
||||
auto tmp = __ockl_image_sample_1Da(i, s, get_native_vector(coords));
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
}
|
||||
|
||||
template <typename T, hipTextureReadMode readMode>
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLayered(texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer)
|
||||
{
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float4 coords{x, y, layer, 0.0f};
|
||||
auto tmp = __ockl_image_sample_2Da(i, s, get_native_vector(coords));
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLayered(
|
||||
texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer) {
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float4 coords{x, y, layer, 0.0f};
|
||||
auto tmp = __ockl_image_sample_2Da(i, s, get_native_vector(coords));
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
}
|
||||
|
||||
template <typename T, hipTextureReadMode readMode>
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex3D(texture<T, hipTextureType3D, readMode> t, float x, float y, float z)
|
||||
{
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float4 coords{x, y, z, 0.0f};
|
||||
auto tmp = __ockl_image_sample_3D(i, s, get_native_vector(coords));
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex3D(
|
||||
texture<T, hipTextureType3D, readMode> t, float x, float y, float z) {
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float4 coords{x, y, z, 0.0f};
|
||||
auto tmp = __ockl_image_sample_3D(i, s, get_native_vector(coords));
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
}
|
||||
|
||||
template <typename T, hipTextureReadMode readMode>
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemap(texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z)
|
||||
{
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float4 coords{x, y, z, 0.0f};
|
||||
auto tmp = __ockl_image_sample_CM(i, s, get_native_vector(coords));
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemap(
|
||||
texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z) {
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float4 coords{x, y, z, 0.0f};
|
||||
auto tmp = __ockl_image_sample_CM(i, s, get_native_vector(coords));
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
}
|
||||
|
||||
template <typename T, hipTextureReadMode readMode>
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLod(texture<T, hipTextureType1D, readMode> t, float x, float level)
|
||||
{
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
auto tmp = __ockl_image_sample_lod_1D(i, s, x, level);
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLod(
|
||||
texture<T, hipTextureType1D, readMode> t, float x, float level) {
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
auto tmp = __ockl_image_sample_lod_1D(i, s, x, level);
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
}
|
||||
|
||||
template <typename T, hipTextureReadMode readMode>
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLod(texture<T, hipTextureType2D, readMode> t, float x, float y, float level)
|
||||
{
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float2 coords{x, y};
|
||||
auto tmp = __ockl_image_sample_lod_2D(i, s, get_native_vector(coords), level);
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLod(
|
||||
texture<T, hipTextureType2D, readMode> t, float x, float y, float level) {
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float2 coords{x, y};
|
||||
auto tmp = __ockl_image_sample_lod_2D(i, s, get_native_vector(coords), level);
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
}
|
||||
|
||||
template <typename T, hipTextureReadMode readMode>
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLayeredLod(texture<T, hipTextureType1DLayered, readMode> t, float x, int layer, float level)
|
||||
{
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float2 coords{x, layer};
|
||||
auto tmp = __ockl_image_sample_lod_1Da(i, s, get_native_vector(coords), level);
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLayeredLod(
|
||||
texture<T, hipTextureType1DLayered, readMode> t, float x, int layer, float level) {
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float2 coords{x, layer};
|
||||
auto tmp = __ockl_image_sample_lod_1Da(i, s, get_native_vector(coords), level);
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
}
|
||||
|
||||
template <typename T, hipTextureReadMode readMode>
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLayeredLod(texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer, float level)
|
||||
{
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float4 coords{x, y, layer, 0.0f};
|
||||
auto tmp = __ockl_image_sample_lod_2Da(i, s, get_native_vector(coords), level);
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLayeredLod(
|
||||
texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer, float level) {
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float4 coords{x, y, layer, 0.0f};
|
||||
auto tmp = __ockl_image_sample_lod_2Da(i, s, get_native_vector(coords), level);
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
}
|
||||
|
||||
template <typename T, hipTextureReadMode readMode>
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex3DLod(texture<T, hipTextureType3D, readMode> t, float x, float y, float z, float level)
|
||||
{
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float4 coords{x, y, z, 0.0f};
|
||||
auto tmp = __ockl_image_sample_lod_3D(i, s, get_native_vector(coords), level);
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex3DLod(
|
||||
texture<T, hipTextureType3D, readMode> t, float x, float y, float z, float level) {
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float4 coords{x, y, z, 0.0f};
|
||||
auto tmp = __ockl_image_sample_lod_3D(i, s, get_native_vector(coords), level);
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
}
|
||||
|
||||
template <typename T, hipTextureReadMode readMode>
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLod(texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z, float level)
|
||||
{
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float4 coords{x, y, z, 0.0f};
|
||||
auto tmp = __ockl_image_sample_lod_CM(i, s, get_native_vector(coords), level);
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLod(
|
||||
texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z, float level) {
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float4 coords{x, y, z, 0.0f};
|
||||
auto tmp = __ockl_image_sample_lod_CM(i, s, get_native_vector(coords), level);
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
}
|
||||
|
||||
template <typename T, hipTextureReadMode readMode>
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLayered(texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer)
|
||||
{
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float4 coords{x, y, z, layer};
|
||||
auto tmp = __ockl_image_sample_CMa(i, s, get_native_vector(coords));
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLayered(
|
||||
texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer) {
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float4 coords{x, y, z, layer};
|
||||
auto tmp = __ockl_image_sample_CMa(i, s, get_native_vector(coords));
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
}
|
||||
|
||||
template <typename T, hipTextureReadMode readMode>
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLayeredLod(texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer, float level)
|
||||
{
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float4 coords{x, y, z, layer};
|
||||
auto tmp = __ockl_image_sample_lod_CMa(i, s, get_native_vector(coords), level);
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLayeredLod(
|
||||
texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer,
|
||||
float level) {
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float4 coords{x, y, z, layer};
|
||||
auto tmp = __ockl_image_sample_lod_CMa(i, s, get_native_vector(coords), level);
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
}
|
||||
|
||||
template <typename T, hipTextureReadMode readMode>
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapGrad(texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z, float4 dPdx, float4 dPdy)
|
||||
{
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
(void)x;
|
||||
(void)y;
|
||||
(void)z;
|
||||
(void)dPdx;
|
||||
(void)dPdy;
|
||||
// TODO missing in device libs.
|
||||
// auto tmp = __ockl_image_sample_grad_CM(i, s, get_native_vector(float4(x, y, z, 0.0f)),
|
||||
// get_native_vector(float4(dPdx.x, dPdx.y, dPdx.z, 0.0f)), get_native_vector(float4(dPdy.x,
|
||||
// dPdy.y, dPdy.z, 0.0f))); return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
return {};
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapGrad(
|
||||
texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z, float4 dPdx,
|
||||
float4 dPdy) {
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
(void)x;
|
||||
(void)y;
|
||||
(void)z;
|
||||
(void)dPdx;
|
||||
(void)dPdy;
|
||||
// TODO missing in device libs.
|
||||
// auto tmp = __ockl_image_sample_grad_CM(i, s, get_native_vector(float4(x, y, z, 0.0f)),
|
||||
// get_native_vector(float4(dPdx.x, dPdx.y, dPdx.z, 0.0f)), get_native_vector(float4(dPdy.x,
|
||||
// dPdy.y, dPdy.z, 0.0f))); return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
return {};
|
||||
}
|
||||
|
||||
template <typename T, hipTextureReadMode readMode>
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLayeredGrad(texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
|
||||
{
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
(void)x;
|
||||
(void)y;
|
||||
(void)z;
|
||||
(void)layer;
|
||||
(void)dPdx;
|
||||
(void)dPdy;
|
||||
// TODO missing in device libs.
|
||||
// auto tmp = __ockl_image_sample_grad_CMa(i, s, get_native_vector(float4(x, y, z, layer)),
|
||||
// get_native_vector(float4(dPdx.x, dPdx.y, dPdx.z, 0.0f)), get_native_vector(float4(dPdy.x,
|
||||
// dPdy.y, dPdy.z, 0.0f))); return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
return {};
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode>
|
||||
texCubemapLayeredGrad(texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y,
|
||||
float z, int layer, float4 dPdx, float4 dPdy) {
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
(void)x;
|
||||
(void)y;
|
||||
(void)z;
|
||||
(void)layer;
|
||||
(void)dPdx;
|
||||
(void)dPdy;
|
||||
// TODO missing in device libs.
|
||||
// auto tmp = __ockl_image_sample_grad_CMa(i, s, get_native_vector(float4(x, y, z, layer)),
|
||||
// get_native_vector(float4(dPdx.x, dPdx.y, dPdx.z, 0.0f)), get_native_vector(float4(dPdy.x,
|
||||
// dPdy.y, dPdy.z, 0.0f))); return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
return {};
|
||||
}
|
||||
|
||||
template <typename T, hipTextureReadMode readMode>
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DGrad(texture<T, hipTextureType1D, readMode> t, float x, float dPdx, float dPdy)
|
||||
{
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
auto tmp = __ockl_image_sample_grad_1D(i, s, x, dPdx, dPdy);
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DGrad(
|
||||
texture<T, hipTextureType1D, readMode> t, float x, float dPdx, float dPdy) {
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
auto tmp = __ockl_image_sample_grad_1D(i, s, x, dPdx, dPdy);
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
}
|
||||
|
||||
template <typename T, hipTextureReadMode readMode>
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DGrad(texture<T, hipTextureType2D, readMode> t, float x, float y, float2 dPdx, float2 dPdy)
|
||||
{
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float2 coords{x, y};
|
||||
auto tmp = __ockl_image_sample_grad_2D(i, s, get_native_vector(coords), get_native_vector(dPdx),
|
||||
get_native_vector(dPdy));
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DGrad(
|
||||
texture<T, hipTextureType2D, readMode> t, float x, float y, float2 dPdx, float2 dPdy) {
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float2 coords{x, y};
|
||||
auto tmp = __ockl_image_sample_grad_2D(i, s, get_native_vector(coords), get_native_vector(dPdx),
|
||||
get_native_vector(dPdy));
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
}
|
||||
|
||||
template <typename T, hipTextureReadMode readMode>
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLayeredGrad(texture<T, hipTextureType1DLayered, readMode> t, float x, int layer, float dPdx, float dPdy)
|
||||
{
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float2 coords{x, layer};
|
||||
auto tmp = __ockl_image_sample_grad_1Da(i, s, get_native_vector(coords), dPdx, dPdy);
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLayeredGrad(
|
||||
texture<T, hipTextureType1DLayered, readMode> t, float x, int layer, float dPdx, float dPdy) {
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float2 coords{x, layer};
|
||||
auto tmp = __ockl_image_sample_grad_1Da(i, s, get_native_vector(coords), dPdx, dPdy);
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
}
|
||||
|
||||
template <typename T, hipTextureReadMode readMode>
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLayeredGrad(texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer, float2 dPdx, float2 dPdy)
|
||||
{
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float4 coords{x, y, layer, 0.0f};
|
||||
auto tmp = __ockl_image_sample_grad_2Da(i, s, get_native_vector(coords),
|
||||
get_native_vector(dPdx), get_native_vector(dPdy));
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLayeredGrad(
|
||||
texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer, float2 dPdx,
|
||||
float2 dPdy) {
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float4 coords{x, y, layer, 0.0f};
|
||||
auto tmp = __ockl_image_sample_grad_2Da(i, s, get_native_vector(coords), get_native_vector(dPdx),
|
||||
get_native_vector(dPdy));
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
}
|
||||
|
||||
template <typename T, hipTextureReadMode readMode>
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex3DGrad(texture<T, hipTextureType3D, readMode> t, float x, float y, float z, float4 dPdx, float4 dPdy)
|
||||
{
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float4 coords{x, y, z, 0.0f};
|
||||
float4 gradx{dPdx.x, dPdx.y, dPdx.z, 0.0f};
|
||||
float4 grady{dPdy.x, dPdy.y, dPdy.z, 0.0f};
|
||||
auto tmp = __ockl_image_sample_grad_3D(i, s, get_native_vector(coords),
|
||||
get_native_vector(gradx), get_native_vector(grady));
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex3DGrad(
|
||||
texture<T, hipTextureType3D, readMode> t, float x, float y, float z, float4 dPdx, float4 dPdy) {
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float4 coords{x, y, z, 0.0f};
|
||||
float4 gradx{dPdx.x, dPdx.y, dPdx.z, 0.0f};
|
||||
float4 grady{dPdy.x, dPdy.y, dPdy.z, 0.0f};
|
||||
auto tmp = __ockl_image_sample_grad_3D(i, s, get_native_vector(coords), get_native_vector(gradx),
|
||||
get_native_vector(grady));
|
||||
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
hipTextureReadMode readMode,
|
||||
typename Enable = void>
|
||||
struct __hip_tex2dgather_ret
|
||||
{
|
||||
static_assert(__hip_internal::is_same<Enable, void>::value, "Invalid channel type!");
|
||||
template <typename T, hipTextureReadMode readMode, typename Enable = void>
|
||||
struct __hip_tex2dgather_ret {
|
||||
static_assert(__hip_internal::is_same<Enable, void>::value, "Invalid channel type!");
|
||||
};
|
||||
|
||||
template <
|
||||
typename T,
|
||||
hipTextureReadMode readMode>
|
||||
template <typename T, hipTextureReadMode readMode>
|
||||
using __hip_tex2dgather_ret_t = typename __hip_tex2dgather_ret<T, readMode, bool>::type;
|
||||
|
||||
template <typename T>
|
||||
struct __hip_tex2dgather_ret<
|
||||
T,
|
||||
hipReadModeElementType,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value, bool>::type>
|
||||
{
|
||||
using type = HIP_vector_type<T, 4>;
|
||||
T, hipReadModeElementType,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value, bool>::type> {
|
||||
using type = HIP_vector_type<T, 4>;
|
||||
};
|
||||
|
||||
template<
|
||||
typename T,
|
||||
unsigned int rank>
|
||||
template <typename T, unsigned int rank>
|
||||
struct __hip_tex2dgather_ret<
|
||||
HIP_vector_type<T, rank>,
|
||||
hipReadModeElementType,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>>::value, bool>::type>
|
||||
{
|
||||
using type = HIP_vector_type<T, 4>;
|
||||
HIP_vector_type<T, rank>, hipReadModeElementType,
|
||||
typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>>::value, bool>::type> {
|
||||
using type = HIP_vector_type<T, 4>;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct __hip_tex2dgather_ret<
|
||||
T,
|
||||
hipReadModeNormalizedFloat,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_normalized_channel_type<T>::value, bool>::type>
|
||||
{
|
||||
using type = float4;
|
||||
struct __hip_tex2dgather_ret<T, hipReadModeNormalizedFloat,
|
||||
typename __hip_internal::enable_if<
|
||||
__hip_is_tex_normalized_channel_type<T>::value, bool>::type> {
|
||||
using type = float4;
|
||||
};
|
||||
|
||||
template <typename T, hipTextureReadMode readMode>
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex2dgather_ret_t<T, readMode> tex2Dgather(texture<T, hipTextureType2D, readMode> t, float x, float y, int comp=0)
|
||||
{
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float2 coords{x, y};
|
||||
switch (comp) {
|
||||
static __forceinline__ __device__ __hip_img_chk__ __hip_tex2dgather_ret_t<T, readMode> tex2Dgather(
|
||||
texture<T, hipTextureType2D, readMode> t, float x, float y, int comp = 0) {
|
||||
TEXTURE_PARAMETERS_INIT;
|
||||
float2 coords{x, y};
|
||||
switch (comp) {
|
||||
case 1: {
|
||||
auto tmp = __ockl_image_gather4g_2D(i, s, get_native_vector(coords));
|
||||
return __hipMapFrom<__hip_tex2dgather_ret_t<T, readMode>>(tmp);
|
||||
@@ -520,8 +468,8 @@ static __forceinline__ __device__ __hip_img_chk__ __hip_tex2dgather_ret_t<T, rea
|
||||
auto tmp = __ockl_image_gather4r_2D(i, s, get_native_vector(coords));
|
||||
return __hipMapFrom<__hip_tex2dgather_ret_t<T, readMode>>(tmp);
|
||||
}
|
||||
}
|
||||
return {};
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -30,7 +30,7 @@ THE SOFTWARE.
|
||||
#include <hip/amd_detail/texture_fetch_functions.h>
|
||||
#include <hip/amd_detail/ockl_image.h>
|
||||
#include <type_traits>
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
|
||||
#define TEXTURE_OBJECT_PARAMETERS_INIT \
|
||||
unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)textureObject; \
|
||||
@@ -40,161 +40,156 @@ THE SOFTWARE.
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex1Dfetch(hipTextureObject_t textureObject, int x)
|
||||
{
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
auto tmp = __ockl_image_load_1Db(i, x);
|
||||
return __hipMapFrom<T>(tmp);
|
||||
static __device__ __hip_img_chk__ T tex1Dfetch(hipTextureObject_t textureObject, int x) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
auto tmp = __ockl_image_load_1Db(i, x);
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex1Dfetch(T *ptr, hipTextureObject_t textureObject, int x)
|
||||
{
|
||||
*ptr = tex1Dfetch<T>(textureObject, x);
|
||||
static __device__ __hip_img_chk__ void tex1Dfetch(T* ptr, hipTextureObject_t textureObject, int x) {
|
||||
*ptr = tex1Dfetch<T>(textureObject, x);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex1D(hipTextureObject_t textureObject, float x)
|
||||
{
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
auto tmp = __ockl_image_sample_1D(i, s, x);
|
||||
return __hipMapFrom<T>(tmp);
|
||||
static __device__ __hip_img_chk__ T tex1D(hipTextureObject_t textureObject, float x) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
auto tmp = __ockl_image_sample_1D(i, s, x);
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex1D(T *ptr, hipTextureObject_t textureObject, float x)
|
||||
{
|
||||
*ptr = tex1D<T>(textureObject, x);
|
||||
static __device__ __hip_img_chk__ void tex1D(T* ptr, hipTextureObject_t textureObject, float x) {
|
||||
*ptr = tex1D<T>(textureObject, x);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex2D(hipTextureObject_t textureObject, float x, float y)
|
||||
{
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
float2 coords{x, y};
|
||||
auto tmp = __ockl_image_sample_2D(i, s, get_native_vector(coords));
|
||||
return __hipMapFrom<T>(tmp);
|
||||
static __device__ __hip_img_chk__ T tex2D(hipTextureObject_t textureObject, float x, float y) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
float2 coords{x, y};
|
||||
auto tmp = __ockl_image_sample_2D(i, s, get_native_vector(coords));
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex2D(T *ptr, hipTextureObject_t textureObject, float x, float y)
|
||||
{
|
||||
*ptr = tex2D<T>(textureObject, x, y);
|
||||
static __device__ __hip_img_chk__ void tex2D(T* ptr, hipTextureObject_t textureObject, float x,
|
||||
float y) {
|
||||
*ptr = tex2D<T>(textureObject, x, y);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex3D(hipTextureObject_t textureObject, float x, float y, float z)
|
||||
{
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
float4 coords{x, y, z, 0.0f};
|
||||
auto tmp = __ockl_image_sample_3D(i, s, get_native_vector(coords));
|
||||
return __hipMapFrom<T>(tmp);
|
||||
static __device__ __hip_img_chk__ T tex3D(hipTextureObject_t textureObject, float x, float y,
|
||||
float z) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
float4 coords{x, y, z, 0.0f};
|
||||
auto tmp = __ockl_image_sample_3D(i, s, get_native_vector(coords));
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex3D(T *ptr, hipTextureObject_t textureObject, float x, float y, float z)
|
||||
{
|
||||
*ptr = tex3D<T>(textureObject, x, y, z);
|
||||
static __device__ __hip_img_chk__ void tex3D(T* ptr, hipTextureObject_t textureObject, float x,
|
||||
float y, float z) {
|
||||
*ptr = tex3D<T>(textureObject, x, y, z);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex1DLayered(hipTextureObject_t textureObject, float x, int layer)
|
||||
{
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
float2 coords{x, layer};
|
||||
auto tmp = __ockl_image_sample_1Da(i, s, get_native_vector(coords));
|
||||
return __hipMapFrom<T>(tmp);
|
||||
static __device__ __hip_img_chk__ T tex1DLayered(hipTextureObject_t textureObject, float x,
|
||||
int layer) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
float2 coords{x, layer};
|
||||
auto tmp = __ockl_image_sample_1Da(i, s, get_native_vector(coords));
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex1DLayered(T *ptr, hipTextureObject_t textureObject, float x, int layer)
|
||||
{
|
||||
*ptr = tex1DLayered<T>(textureObject, x, layer);
|
||||
static __device__ __hip_img_chk__ void tex1DLayered(T* ptr, hipTextureObject_t textureObject,
|
||||
float x, int layer) {
|
||||
*ptr = tex1DLayered<T>(textureObject, x, layer);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex2DLayered(hipTextureObject_t textureObject, float x, float y, int layer)
|
||||
{
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
float4 coords{x, y, layer, 0.0f};
|
||||
auto tmp = __ockl_image_sample_2Da(i, s, get_native_vector(coords));
|
||||
return __hipMapFrom<T>(tmp);
|
||||
static __device__ __hip_img_chk__ T tex2DLayered(hipTextureObject_t textureObject, float x, float y,
|
||||
int layer) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
float4 coords{x, y, layer, 0.0f};
|
||||
auto tmp = __ockl_image_sample_2Da(i, s, get_native_vector(coords));
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex2DLayered(T *ptr, hipTextureObject_t textureObject, float x, float y, int layer)
|
||||
{
|
||||
*ptr = tex1DLayered<T>(textureObject, x, y, layer);
|
||||
static __device__ __hip_img_chk__ void tex2DLayered(T* ptr, hipTextureObject_t textureObject,
|
||||
float x, float y, int layer) {
|
||||
*ptr = tex1DLayered<T>(textureObject, x, y, layer);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T texCubemap(hipTextureObject_t textureObject, float x, float y, float z)
|
||||
{
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
float4 coords{x, y, z, 0.0f};
|
||||
auto tmp = __ockl_image_sample_CM(i, s, get_native_vector(coords));
|
||||
return __hipMapFrom<T>(tmp);
|
||||
static __device__ __hip_img_chk__ T texCubemap(hipTextureObject_t textureObject, float x, float y,
|
||||
float z) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
float4 coords{x, y, z, 0.0f};
|
||||
auto tmp = __ockl_image_sample_CM(i, s, get_native_vector(coords));
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void texCubemap(T *ptr, hipTextureObject_t textureObject, float x, float y, float z)
|
||||
{
|
||||
*ptr = texCubemap<T>(textureObject, x, y, z);
|
||||
static __device__ __hip_img_chk__ void texCubemap(T* ptr, hipTextureObject_t textureObject, float x,
|
||||
float y, float z) {
|
||||
*ptr = texCubemap<T>(textureObject, x, y, z);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T texCubemapLayered(hipTextureObject_t textureObject, float x, float y, float z, int layer)
|
||||
{
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
float4 coords{x, y, z, layer};
|
||||
auto tmp = __ockl_image_sample_CMa(i, s, get_native_vector(coords));
|
||||
return __hipMapFrom<T>(tmp);
|
||||
static __device__ __hip_img_chk__ T texCubemapLayered(hipTextureObject_t textureObject, float x,
|
||||
float y, float z, int layer) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
float4 coords{x, y, z, layer};
|
||||
auto tmp = __ockl_image_sample_CMa(i, s, get_native_vector(coords));
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void texCubemapLayered(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer)
|
||||
{
|
||||
*ptr = texCubemapLayered<T>(textureObject, x, y, z, layer);
|
||||
static __device__ __hip_img_chk__ void texCubemapLayered(T* ptr, hipTextureObject_t textureObject,
|
||||
float x, float y, float z, int layer) {
|
||||
*ptr = texCubemapLayered<T>(textureObject, x, y, z, layer);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex2Dgather(hipTextureObject_t textureObject, float x, float y, int comp = 0)
|
||||
{
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
float2 coords{x, y};
|
||||
switch (comp) {
|
||||
static __device__ __hip_img_chk__ T tex2Dgather(hipTextureObject_t textureObject, float x, float y,
|
||||
int comp = 0) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
float2 coords{x, y};
|
||||
switch (comp) {
|
||||
case 1: {
|
||||
auto tmp = __ockl_image_gather4r_2D(i, s, get_native_vector(coords));
|
||||
return __hipMapFrom<T>(tmp);
|
||||
@@ -215,79 +210,79 @@ static __device__ __hip_img_chk__ T tex2Dgather(hipTextureObject_t textureObject
|
||||
return __hipMapFrom<T>(tmp);
|
||||
break;
|
||||
}
|
||||
}
|
||||
return {};
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex2Dgather(T *ptr, hipTextureObject_t textureObject, float x, float y, int comp = 0)
|
||||
{
|
||||
*ptr = texCubemapLayered<T>(textureObject, x, y, comp);
|
||||
static __device__ __hip_img_chk__ void tex2Dgather(T* ptr, hipTextureObject_t textureObject,
|
||||
float x, float y, int comp = 0) {
|
||||
*ptr = texCubemapLayered<T>(textureObject, x, y, comp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex1DLod(hipTextureObject_t textureObject, float x, float level)
|
||||
{
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
auto tmp = __ockl_image_sample_lod_1D(i, s, x, level);
|
||||
return __hipMapFrom<T>(tmp);
|
||||
static __device__ __hip_img_chk__ T tex1DLod(hipTextureObject_t textureObject, float x,
|
||||
float level) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
auto tmp = __ockl_image_sample_lod_1D(i, s, x, level);
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex1DLod(T *ptr, hipTextureObject_t textureObject, float x, float level)
|
||||
{
|
||||
*ptr = tex1DLod<T>(textureObject, x, level);
|
||||
static __device__ __hip_img_chk__ void tex1DLod(T* ptr, hipTextureObject_t textureObject, float x,
|
||||
float level) {
|
||||
*ptr = tex1DLod<T>(textureObject, x, level);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex2DLod(hipTextureObject_t textureObject, float x, float y, float level)
|
||||
{
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
float2 coords{x, y};
|
||||
auto tmp = __ockl_image_sample_lod_2D(i, s, get_native_vector(coords), level);
|
||||
return __hipMapFrom<T>(tmp);
|
||||
static __device__ __hip_img_chk__ T tex2DLod(hipTextureObject_t textureObject, float x, float y,
|
||||
float level) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
float2 coords{x, y};
|
||||
auto tmp = __ockl_image_sample_lod_2D(i, s, get_native_vector(coords), level);
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex2DLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float level)
|
||||
{
|
||||
*ptr = tex2DLod<T>(textureObject, x, y, level);
|
||||
static __device__ __hip_img_chk__ void tex2DLod(T* ptr, hipTextureObject_t textureObject, float x,
|
||||
float y, float level) {
|
||||
*ptr = tex2DLod<T>(textureObject, x, y, level);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex3DLod(hipTextureObject_t textureObject, float x, float y, float z, float level)
|
||||
{
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
float4 coords{x, y, z, 0.0f};
|
||||
auto tmp = __ockl_image_sample_lod_3D(i, s, get_native_vector(coords), level);
|
||||
return __hipMapFrom<T>(tmp);
|
||||
static __device__ __hip_img_chk__ T tex3DLod(hipTextureObject_t textureObject, float x, float y,
|
||||
float z, float level) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
float4 coords{x, y, z, 0.0f};
|
||||
auto tmp = __ockl_image_sample_lod_3D(i, s, get_native_vector(coords), level);
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex3DLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float level)
|
||||
{
|
||||
*ptr = tex3DLod<T>(textureObject, x, y, z, level);
|
||||
static __device__ __hip_img_chk__ void tex3DLod(T* ptr, hipTextureObject_t textureObject, float x,
|
||||
float y, float z, float level) {
|
||||
*ptr = tex3DLod<T>(textureObject, x, y, z, level);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex1DLayeredLod(hipTextureObject_t textureObject, float x, int layer, float level)
|
||||
{
|
||||
static __device__ __hip_img_chk__ T tex1DLayeredLod(hipTextureObject_t textureObject, float x,
|
||||
int layer, float level) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT;
|
||||
(void)level;
|
||||
float2 coords{x, layer};
|
||||
@@ -298,16 +293,16 @@ static __device__ __hip_img_chk__ T tex1DLayeredLod(hipTextureObject_t textureOb
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex1DLayeredLod(T *ptr, hipTextureObject_t textureObject, float x, int layer, float level)
|
||||
{
|
||||
*ptr = tex1DLayeredLod<T>(textureObject, x, layer, level);
|
||||
static __device__ __hip_img_chk__ void tex1DLayeredLod(T* ptr, hipTextureObject_t textureObject,
|
||||
float x, int layer, float level) {
|
||||
*ptr = tex1DLayeredLod<T>(textureObject, x, layer, level);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex2DLayeredLod(hipTextureObject_t textureObject, float x, float y, int layer, float level)
|
||||
{
|
||||
static __device__ __hip_img_chk__ T tex2DLayeredLod(hipTextureObject_t textureObject, float x,
|
||||
float y, int layer, float level) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT;
|
||||
(void)level;
|
||||
float4 coords{x, y, layer, 0.0f};
|
||||
@@ -318,35 +313,35 @@ static __device__ __hip_img_chk__ T tex2DLayeredLod(hipTextureObject_t textureO
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex2DLayeredLod(T *ptr, hipTextureObject_t textureObject, float x, float y, int layer, float level)
|
||||
{
|
||||
*ptr = tex2DLayeredLod<T>(textureObject, x, y, layer, level);
|
||||
static __device__ __hip_img_chk__ void tex2DLayeredLod(T* ptr, hipTextureObject_t textureObject,
|
||||
float x, float y, int layer, float level) {
|
||||
*ptr = tex2DLayeredLod<T>(textureObject, x, y, layer, level);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T texCubemapLod(hipTextureObject_t textureObject, float x, float y, float z, float level)
|
||||
{
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
float4 coords{x, y, z, 0.0f};
|
||||
auto tmp = __ockl_image_sample_lod_CM(i, s, get_native_vector(coords), level);
|
||||
return __hipMapFrom<T>(tmp);
|
||||
static __device__ __hip_img_chk__ T texCubemapLod(hipTextureObject_t textureObject, float x,
|
||||
float y, float z, float level) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
float4 coords{x, y, z, 0.0f};
|
||||
auto tmp = __ockl_image_sample_lod_CM(i, s, get_native_vector(coords), level);
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void texCubemapLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float level)
|
||||
{
|
||||
*ptr = texCubemapLod<T>(textureObject, x, y, z, level);
|
||||
static __device__ __hip_img_chk__ void texCubemapLod(T* ptr, hipTextureObject_t textureObject,
|
||||
float x, float y, float z, float level) {
|
||||
*ptr = texCubemapLod<T>(textureObject, x, y, z, level);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T texCubemapGrad(hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
|
||||
{
|
||||
static __device__ __hip_img_chk__ T texCubemapGrad(hipTextureObject_t textureObject, float x,
|
||||
float y, float z, float4 dPdx, float4 dPdy) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT;
|
||||
(void)x;
|
||||
(void)y;
|
||||
@@ -363,73 +358,76 @@ static __device__ __hip_img_chk__ T texCubemapGrad(hipTextureObject_t textureObj
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void texCubemapGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
|
||||
{
|
||||
*ptr = texCubemapGrad<T>(textureObject, x, y, z, dPdx, dPdy);
|
||||
static __device__ __hip_img_chk__ void texCubemapGrad(T* ptr, hipTextureObject_t textureObject,
|
||||
float x, float y, float z, float4 dPdx,
|
||||
float4 dPdy) {
|
||||
*ptr = texCubemapGrad<T>(textureObject, x, y, z, dPdx, dPdy);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T texCubemapLayeredLod(hipTextureObject_t textureObject, float x, float y, float z, int layer, float level)
|
||||
{
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
float4 coords{x, y, z, layer};
|
||||
auto tmp = __ockl_image_sample_lod_CMa(i, s, get_native_vector(coords), level);
|
||||
return __hipMapFrom<T>(tmp);
|
||||
static __device__ __hip_img_chk__ T texCubemapLayeredLod(hipTextureObject_t textureObject, float x,
|
||||
float y, float z, int layer, float level) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
float4 coords{x, y, z, layer};
|
||||
auto tmp = __ockl_image_sample_lod_CMa(i, s, get_native_vector(coords), level);
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void texCubemapLayeredLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer, float level)
|
||||
{
|
||||
*ptr = texCubemapLayeredLod<T>(textureObject, x, y, z, layer, level);
|
||||
static __device__ __hip_img_chk__ void texCubemapLayeredLod(T* ptr,
|
||||
hipTextureObject_t textureObject,
|
||||
float x, float y, float z, int layer,
|
||||
float level) {
|
||||
*ptr = texCubemapLayeredLod<T>(textureObject, x, y, z, layer, level);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex1DGrad(hipTextureObject_t textureObject, float x, float dPdx, float dPdy)
|
||||
{
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
auto tmp = __ockl_image_sample_grad_1D(i, s, x, dPdx, dPdy);
|
||||
return __hipMapFrom<T>(tmp);
|
||||
static __device__ __hip_img_chk__ T tex1DGrad(hipTextureObject_t textureObject, float x, float dPdx,
|
||||
float dPdy) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
auto tmp = __ockl_image_sample_grad_1D(i, s, x, dPdx, dPdy);
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex1DGrad(T *ptr, hipTextureObject_t textureObject, float x, float dPdx, float dPdy)
|
||||
{
|
||||
*ptr = tex1DGrad<T>(textureObject, x, dPdx, dPdy);
|
||||
static __device__ __hip_img_chk__ void tex1DGrad(T* ptr, hipTextureObject_t textureObject, float x,
|
||||
float dPdx, float dPdy) {
|
||||
*ptr = tex1DGrad<T>(textureObject, x, dPdx, dPdy);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex2DGrad(hipTextureObject_t textureObject, float x, float y, float2 dPdx, float2 dPdy)
|
||||
{
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
float2 coords{x, y};
|
||||
auto tmp = __ockl_image_sample_grad_2D(i, s, get_native_vector(coords), get_native_vector(dPdx),
|
||||
get_native_vector(dPdy));
|
||||
return __hipMapFrom<T>(tmp);
|
||||
static __device__ __hip_img_chk__ T tex2DGrad(hipTextureObject_t textureObject, float x, float y,
|
||||
float2 dPdx, float2 dPdy) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
float2 coords{x, y};
|
||||
auto tmp = __ockl_image_sample_grad_2D(i, s, get_native_vector(coords), get_native_vector(dPdx),
|
||||
get_native_vector(dPdy));
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex2DGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float2 dPdx, float2 dPdy)
|
||||
{
|
||||
*ptr = tex2DGrad<T>(textureObject, x, y, dPdx, dPdy);
|
||||
static __device__ __hip_img_chk__ void tex2DGrad(T* ptr, hipTextureObject_t textureObject, float x,
|
||||
float y, float2 dPdx, float2 dPdy) {
|
||||
*ptr = tex2DGrad<T>(textureObject, x, y, dPdx, dPdy);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex3DGrad(hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
|
||||
{
|
||||
static __device__ __hip_img_chk__ T tex3DGrad(hipTextureObject_t textureObject, float x, float y,
|
||||
float z, float4 dPdx, float4 dPdy) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT;
|
||||
(void)dPdx;
|
||||
float4 coords{x, y, z, 0.0f};
|
||||
@@ -443,55 +441,58 @@ static __device__ __hip_img_chk__ T tex3DGrad(hipTextureObject_t textureObject,
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex3DGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
|
||||
{
|
||||
*ptr = tex3DGrad<T>(textureObject, x, y, z, dPdx, dPdy);
|
||||
static __device__ __hip_img_chk__ void tex3DGrad(T* ptr, hipTextureObject_t textureObject, float x,
|
||||
float y, float z, float4 dPdx, float4 dPdy) {
|
||||
*ptr = tex3DGrad<T>(textureObject, x, y, z, dPdx, dPdy);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex1DLayeredGrad(hipTextureObject_t textureObject, float x, int layer, float dPdx, float dPdy)
|
||||
{
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
float2 coords{x, layer};
|
||||
auto tmp = __ockl_image_sample_grad_1Da(i, s, get_native_vector(coords), dPdx, dPdy);
|
||||
return __hipMapFrom<T>(tmp);
|
||||
static __device__ __hip_img_chk__ T tex1DLayeredGrad(hipTextureObject_t textureObject, float x,
|
||||
int layer, float dPdx, float dPdy) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
float2 coords{x, layer};
|
||||
auto tmp = __ockl_image_sample_grad_1Da(i, s, get_native_vector(coords), dPdx, dPdy);
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex1DLayeredGrad(T *ptr, hipTextureObject_t textureObject, float x, int layer, float dPdx, float dPdy)
|
||||
{
|
||||
*ptr = tex1DLayeredGrad<T>(textureObject, x, layer, dPdx, dPdy);
|
||||
static __device__ __hip_img_chk__ void tex1DLayeredGrad(T* ptr, hipTextureObject_t textureObject,
|
||||
float x, int layer, float dPdx,
|
||||
float dPdy) {
|
||||
*ptr = tex1DLayeredGrad<T>(textureObject, x, layer, dPdx, dPdy);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex2DLayeredGrad(hipTextureObject_t textureObject, float x, float y, int layer, float2 dPdx, float2 dPdy)
|
||||
{
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
float4 coords{x, y, layer, 0.0f};
|
||||
auto tmp = __ockl_image_sample_grad_2Da(i, s, get_native_vector(coords),
|
||||
get_native_vector(dPdx), get_native_vector(dPdy));
|
||||
return __hipMapFrom<T>(tmp);
|
||||
static __device__ __hip_img_chk__ T tex2DLayeredGrad(hipTextureObject_t textureObject, float x,
|
||||
float y, int layer, float2 dPdx, float2 dPdy) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
float4 coords{x, y, layer, 0.0f};
|
||||
auto tmp = __ockl_image_sample_grad_2Da(i, s, get_native_vector(coords), get_native_vector(dPdx),
|
||||
get_native_vector(dPdy));
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex2DLayeredGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, int layer, float2 dPdx, float2 dPdy)
|
||||
{
|
||||
*ptr = tex2DLayeredGrad<T>(textureObject, x, y, layer, dPdx, dPdy);
|
||||
static __device__ __hip_img_chk__ void tex2DLayeredGrad(T* ptr, hipTextureObject_t textureObject,
|
||||
float x, float y, int layer, float2 dPdx,
|
||||
float2 dPdy) {
|
||||
*ptr = tex2DLayeredGrad<T>(textureObject, x, y, layer, dPdx, dPdy);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T texCubemapLayeredGrad(hipTextureObject_t textureObject, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
|
||||
{
|
||||
static __device__ __hip_img_chk__ T texCubemapLayeredGrad(hipTextureObject_t textureObject, float x,
|
||||
float y, float z, int layer, float4 dPdx,
|
||||
float4 dPdy) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT;
|
||||
(void)x;
|
||||
(void)y;
|
||||
@@ -509,9 +510,11 @@ static __device__ __hip_img_chk__ T texCubemapLayeredGrad(hipTextureObject_t te
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void texCubemapLayeredGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
|
||||
{
|
||||
*ptr = texCubemapLayeredGrad<T>(textureObject, x, y, z, layer, dPdx, dPdy);
|
||||
static __device__ __hip_img_chk__ void texCubemapLayeredGrad(T* ptr,
|
||||
hipTextureObject_t textureObject,
|
||||
float x, float y, float z, int layer,
|
||||
float4 dPdx, float4 dPdy) {
|
||||
*ptr = texCubemapLayeredGrad<T>(textureObject, x, y, z, layer, dPdx, dPdy);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -63,60 +63,60 @@ enum : unsigned {
|
||||
EF_AMDGPU_MACH_R600_LAST = EF_AMDGPU_MACH_R600_TURKS,
|
||||
|
||||
// AMDGCN-based processors.
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX600 = 0x020,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX601 = 0x021,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX700 = 0x022,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX701 = 0x023,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX702 = 0x024,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX703 = 0x025,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX704 = 0x026,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX600 = 0x020,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX601 = 0x021,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX700 = 0x022,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX701 = 0x023,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX702 = 0x024,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX703 = 0x025,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX704 = 0x026,
|
||||
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X27 = 0x027,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX801 = 0x028,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX802 = 0x029,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX810 = 0x02b,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX902 = 0x02d,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX904 = 0x02e,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX909 = 0x031,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX90C = 0x032,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1010 = 0x033,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1011 = 0x034,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1012 = 0x035,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1031 = 0x037,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1032 = 0x038,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1033 = 0x039,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX602 = 0x03a,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX705 = 0x03b,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX805 = 0x03c,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1035 = 0x03d,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1034 = 0x03e,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX90A = 0x03f,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX940 = 0x040,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1013 = 0x042,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1150 = 0x043,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1103 = 0x044,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1036 = 0x045,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1101 = 0x046,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1102 = 0x047,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1200 = 0x048,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX801 = 0x028,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX802 = 0x029,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX810 = 0x02b,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX902 = 0x02d,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX904 = 0x02e,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX909 = 0x031,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX90C = 0x032,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1010 = 0x033,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1011 = 0x034,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1012 = 0x035,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1031 = 0x037,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1032 = 0x038,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1033 = 0x039,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX602 = 0x03a,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX705 = 0x03b,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX805 = 0x03c,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1035 = 0x03d,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1034 = 0x03e,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX90A = 0x03f,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX940 = 0x040,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1013 = 0x042,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1150 = 0x043,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1103 = 0x044,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1036 = 0x045,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1101 = 0x046,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1102 = 0x047,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1200 = 0x048,
|
||||
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X49 = 0x049,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1151 = 0x04a,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX941 = 0x04b,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX942 = 0x04c,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1151 = 0x04a,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX941 = 0x04b,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX942 = 0x04c,
|
||||
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4D = 0x04d,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1201 = 0x04e,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX950 = 0x04f,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1201 = 0x04e,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX950 = 0x04f,
|
||||
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X50 = 0x050,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC = 0x051,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC = 0x051,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC = 0x052,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC = 0x053,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC = 0x054,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1152 = 0x055,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1152 = 0x055,
|
||||
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X56 = 0x056,
|
||||
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X57 = 0x057,
|
||||
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X58 = 0x058,
|
||||
|
||||
@@ -123,8 +123,7 @@ hipError_t hipDeviceGetPCIBusId(char* pciBusId, int len, int device);
|
||||
hipError_t hipDeviceGetSharedMemConfig(hipSharedMemConfig* pConfig);
|
||||
hipError_t hipDeviceGetStreamPriorityRange(int* leastPriority, int* greatestPriority);
|
||||
hipError_t hipDeviceGetTexture1DLinearMaxWidth(size_t* maxWidthInElements,
|
||||
const hipChannelFormatDesc* fmtDesc,
|
||||
int device);
|
||||
const hipChannelFormatDesc* fmtDesc, int device);
|
||||
hipError_t hipDeviceGetUuid(hipUUID* uuid, hipDevice_t device);
|
||||
hipError_t hipDeviceGraphMemTrim(int device);
|
||||
hipError_t hipDevicePrimaryCtxGetState(hipDevice_t dev, unsigned int* flags, int* active);
|
||||
@@ -247,9 +246,9 @@ hipError_t hipGraphAddMemcpyNodeToSymbol(hipGraphNode_t* pGraphNode, hipGraph_t
|
||||
hipError_t hipGraphAddMemsetNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
|
||||
const hipGraphNode_t* pDependencies, size_t numDependencies,
|
||||
const hipMemsetParams* pMemsetParams);
|
||||
hipError_t hipGraphAddNode(hipGraphNode_t *pGraphNode, hipGraph_t graph,
|
||||
const hipGraphNode_t *pDependencies, size_t numDependencies,
|
||||
hipGraphNodeParams *nodeParams);
|
||||
hipError_t hipGraphAddNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
|
||||
const hipGraphNode_t* pDependencies, size_t numDependencies,
|
||||
hipGraphNodeParams* nodeParams);
|
||||
hipError_t hipGraphChildGraphNodeGetGraph(hipGraphNode_t node, hipGraph_t* pGraph);
|
||||
hipError_t hipGraphClone(hipGraph_t* pGraphClone, hipGraph_t originalGraph);
|
||||
hipError_t hipGraphCreate(hipGraph_t* pGraph, unsigned int flags);
|
||||
@@ -362,8 +361,8 @@ hipError_t hipImportExternalMemory(hipExternalMemory_t* extMem_out,
|
||||
hipError_t hipImportExternalSemaphore(hipExternalSemaphore_t* extSem_out,
|
||||
const hipExternalSemaphoreHandleDesc* semHandleDesc);
|
||||
hipError_t hipDrvGraphAddMemsetNode(hipGraphNode_t* phGraphNode, hipGraph_t hGraph,
|
||||
const hipGraphNode_t* dependencies, size_t numDependencies,
|
||||
const hipMemsetParams* memsetParams, hipCtx_t ctx);
|
||||
const hipGraphNode_t* dependencies, size_t numDependencies,
|
||||
const hipMemsetParams* memsetParams, hipCtx_t ctx);
|
||||
hipError_t hipInit(unsigned int flags);
|
||||
hipError_t hipIpcCloseMemHandle(void* devPtr);
|
||||
hipError_t hipIpcGetEventHandle(hipIpcEventHandle_t* handle, hipEvent_t event);
|
||||
@@ -549,13 +548,13 @@ hipError_t hipModuleLoadData(hipModule_t* module, const void* image);
|
||||
hipError_t hipModuleLoadDataEx(hipModule_t* module, const void* image, unsigned int numOptions,
|
||||
hipJitOption* options, void** optionValues);
|
||||
hipError_t hipLinkAddData(hipLinkState_t state, hipJitInputType type, void* data, size_t size,
|
||||
const char* name, unsigned int numOptions, hipJitOption* options,
|
||||
void** optionValues);
|
||||
hipError_t hipLinkAddFile(hipLinkState_t state, hipJitInputType type, const char* path, unsigned int numOptions,
|
||||
hipJitOption* options, void** optionValues);
|
||||
const char* name, unsigned int numOptions, hipJitOption* options,
|
||||
void** optionValues);
|
||||
hipError_t hipLinkAddFile(hipLinkState_t state, hipJitInputType type, const char* path,
|
||||
unsigned int numOptions, hipJitOption* options, void** optionValues);
|
||||
hipError_t hipLinkComplete(hipLinkState_t state, void** hipBinOut, size_t* sizeOut);
|
||||
hipError_t hipLinkCreate(unsigned int numOptions, hipJitOption* options,
|
||||
void** optionValues, hipLinkState_t* stateOut);
|
||||
hipError_t hipLinkCreate(unsigned int numOptions, hipJitOption* options, void** optionValues,
|
||||
hipLinkState_t* stateOut);
|
||||
hipError_t hipLinkDestroy(hipLinkState_t state);
|
||||
hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, hipFunction_t f,
|
||||
int blockSize,
|
||||
@@ -677,17 +676,17 @@ hipError_t hipWaitExternalSemaphoresAsync(const hipExternalSemaphore_t* extSemAr
|
||||
hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w, hipChannelFormatKind f);
|
||||
hipError_t hipCreateSurfaceObject(hipSurfaceObject_t* pSurfObject, const hipResourceDesc* pResDesc);
|
||||
hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
|
||||
uint32_t localWorkSizeX, uint32_t localWorkSizeY,
|
||||
uint32_t localWorkSizeZ, size_t sharedMemBytes,
|
||||
hipStream_t hStream, void** kernelParams, void** extra,
|
||||
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flag);
|
||||
uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
|
||||
uint32_t localWorkSizeX, uint32_t localWorkSizeY,
|
||||
uint32_t localWorkSizeZ, size_t sharedMemBytes,
|
||||
hipStream_t hStream, void** kernelParams, void** extra,
|
||||
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flag);
|
||||
hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
|
||||
uint32_t localWorkSizeX, uint32_t localWorkSizeY,
|
||||
uint32_t localWorkSizeZ, size_t sharedMemBytes,
|
||||
hipStream_t hStream, void** kernelParams, void** extra,
|
||||
hipEvent_t startEvent, hipEvent_t stopEvent);
|
||||
uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
|
||||
uint32_t localWorkSizeX, uint32_t localWorkSizeY,
|
||||
uint32_t localWorkSizeZ, size_t sharedMemBytes,
|
||||
hipStream_t hStream, void** kernelParams, void** extra,
|
||||
hipEvent_t startEvent, hipEvent_t stopEvent);
|
||||
hipError_t hipMemcpy_spt(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind);
|
||||
hipError_t hipMemcpyToSymbol_spt(const void* symbol, const void* src, size_t sizeBytes,
|
||||
size_t offset, hipMemcpyKind kind);
|
||||
@@ -795,12 +794,12 @@ hipError_t hipStreamBeginCaptureToGraph(hipStream_t stream, hipGraph_t graph,
|
||||
size_t numDependencies, hipStreamCaptureMode mode);
|
||||
hipError_t hipGetFuncBySymbol(hipFunction_t* functionPtr, const void* symbolPtr);
|
||||
hipError_t hipDrvGraphAddMemFreeNode(hipGraphNode_t* phGraphNode, hipGraph_t hGraph,
|
||||
const hipGraphNode_t* dependencies, size_t numDependencies,
|
||||
hipDeviceptr_t dptr);
|
||||
const hipGraphNode_t* dependencies, size_t numDependencies,
|
||||
hipDeviceptr_t dptr);
|
||||
hipError_t hipDrvGraphExecMemcpyNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
|
||||
const HIP_MEMCPY3D* copyParams, hipCtx_t ctx);
|
||||
const HIP_MEMCPY3D* copyParams, hipCtx_t ctx);
|
||||
hipError_t hipDrvGraphExecMemsetNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
|
||||
const hipMemsetParams* memsetParams, hipCtx_t ctx);
|
||||
const hipMemsetParams* memsetParams, hipCtx_t ctx);
|
||||
hipError_t hipSetValidDevices(int* device_arr, int len);
|
||||
hipError_t hipMemcpyAtoD(hipDeviceptr_t dstDevice, hipArray_t srcArray, size_t srcOffset,
|
||||
size_t ByteCount);
|
||||
@@ -816,9 +815,9 @@ hipError_t hipMemcpy2DArrayToArray(hipArray_t dst, size_t wOffsetDst, size_t hOf
|
||||
hipArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc,
|
||||
size_t width, size_t height, hipMemcpyKind kind);
|
||||
hipError_t hipGraphExecGetFlags(hipGraphExec_t graphExec, unsigned long long* flags);
|
||||
hipError_t hipGraphNodeSetParams(hipGraphNode_t node, hipGraphNodeParams *nodeParams);
|
||||
hipError_t hipGraphNodeSetParams(hipGraphNode_t node, hipGraphNodeParams* nodeParams);
|
||||
hipError_t hipGraphExecNodeSetParams(hipGraphExec_t graphExec, hipGraphNode_t node,
|
||||
hipGraphNodeParams* nodeParams);
|
||||
hipGraphNodeParams* nodeParams);
|
||||
hipError_t hipExternalMemoryGetMappedMipmappedArray(
|
||||
hipMipmappedArray_t* mipmap, hipExternalMemory_t extMem,
|
||||
const hipExternalMemoryMipmappedArrayDesc* mipmapDesc);
|
||||
@@ -842,8 +841,8 @@ hipError_t hipMemGetHandleForAddressRange(void* handle, hipDeviceptr_t dptr, siz
|
||||
unsigned long long flags);
|
||||
hipError_t hipMemsetD2D8(hipDeviceptr_t dst, size_t dstPitch, unsigned char value, size_t width,
|
||||
size_t height);
|
||||
hipError_t hipMemsetD2D8Async(hipDeviceptr_t dst, size_t dstPitch, unsigned char value, size_t width,
|
||||
size_t height, hipStream_t stream);
|
||||
hipError_t hipMemsetD2D8Async(hipDeviceptr_t dst, size_t dstPitch, unsigned char value,
|
||||
size_t width, size_t height, hipStream_t stream);
|
||||
hipError_t hipMemsetD2D16(hipDeviceptr_t dst, size_t dstPitch, unsigned short value, size_t width,
|
||||
size_t height);
|
||||
hipError_t hipMemsetD2D16Async(hipDeviceptr_t dst, size_t dstPitch, unsigned short value,
|
||||
@@ -853,16 +852,16 @@ hipError_t hipMemsetD2D32(hipDeviceptr_t dst, size_t dstPitch, unsigned int valu
|
||||
hipError_t hipMemsetD2D32Async(hipDeviceptr_t dst, size_t dstPitch, unsigned int value,
|
||||
size_t width, size_t height, hipStream_t stream);
|
||||
hipError_t hipStreamGetAttribute(hipStream_t stream, hipStreamAttrID attr,
|
||||
hipStreamAttrValue *value);
|
||||
hipStreamAttrValue* value);
|
||||
hipError_t hipStreamSetAttribute(hipStream_t stream, hipStreamAttrID attr,
|
||||
const hipStreamAttrValue *value);
|
||||
hipError_t hipMemcpyBatchAsync(void **dsts, void **srcs, size_t *sizes, size_t count,
|
||||
hipMemcpyAttributes *attrs, size_t *attrsIdxs, size_t numAttrs,
|
||||
size_t *failIdx, hipStream_t stream);
|
||||
hipError_t hipMemcpy3DBatchAsync(size_t numOps, struct hipMemcpy3DBatchOp *opList, size_t *failIdx,
|
||||
const hipStreamAttrValue* value);
|
||||
hipError_t hipMemcpyBatchAsync(void** dsts, void** srcs, size_t* sizes, size_t count,
|
||||
hipMemcpyAttributes* attrs, size_t* attrsIdxs, size_t numAttrs,
|
||||
size_t* failIdx, hipStream_t stream);
|
||||
hipError_t hipMemcpy3DBatchAsync(size_t numOps, struct hipMemcpy3DBatchOp* opList, size_t* failIdx,
|
||||
unsigned long long flags, hipStream_t stream);
|
||||
hipError_t hipMemcpy3DPeer(hipMemcpy3DPeerParms *p);
|
||||
hipError_t hipMemcpy3DPeerAsync(hipMemcpy3DPeerParms *p, hipStream_t stream);
|
||||
hipError_t hipMemcpy3DPeer(hipMemcpy3DPeerParms* p);
|
||||
hipError_t hipMemcpy3DPeerAsync(hipMemcpy3DPeerParms* p, hipStream_t stream);
|
||||
} // namespace hip
|
||||
|
||||
namespace hip {
|
||||
@@ -940,7 +939,8 @@ void UpdateDispatchTable(HipDispatchTable* ptrDispatchTable) {
|
||||
ptrDispatchTable->hipDeviceGetPCIBusId_fn = hip::hipDeviceGetPCIBusId;
|
||||
ptrDispatchTable->hipDeviceGetSharedMemConfig_fn = hip::hipDeviceGetSharedMemConfig;
|
||||
ptrDispatchTable->hipDeviceGetStreamPriorityRange_fn = hip::hipDeviceGetStreamPriorityRange;
|
||||
ptrDispatchTable->hipDeviceGetTexture1DLinearMaxWidth_fn = hip::hipDeviceGetTexture1DLinearMaxWidth;
|
||||
ptrDispatchTable->hipDeviceGetTexture1DLinearMaxWidth_fn =
|
||||
hip::hipDeviceGetTexture1DLinearMaxWidth;
|
||||
ptrDispatchTable->hipDeviceGetUuid_fn = hip::hipDeviceGetUuid;
|
||||
ptrDispatchTable->hipDeviceGraphMemTrim_fn = hip::hipDeviceGraphMemTrim;
|
||||
ptrDispatchTable->hipDevicePrimaryCtxGetState_fn = hip::hipDevicePrimaryCtxGetState;
|
||||
@@ -1353,7 +1353,7 @@ void UpdateDispatchTable(HipDispatchTable* ptrDispatchTable) {
|
||||
ptrDispatchTable->hipGetDriverEntryPoint_fn = hip::hipGetDriverEntryPoint;
|
||||
ptrDispatchTable->hipGetDriverEntryPoint_spt_fn = hip::hipGetDriverEntryPoint_spt;
|
||||
ptrDispatchTable->hipExtGetLastError_fn = hip::hipExtGetLastError;
|
||||
ptrDispatchTable->hipTexRefGetBorderColor_fn = hip::hipTexRefGetBorderColor;
|
||||
ptrDispatchTable->hipTexRefGetBorderColor_fn = hip::hipTexRefGetBorderColor;
|
||||
ptrDispatchTable->hipTexRefGetArray_fn = hip::hipTexRefGetArray;
|
||||
ptrDispatchTable->hipGetProcAddress_fn = hip::hipGetProcAddress;
|
||||
ptrDispatchTable->hipStreamBeginCaptureToGraph_fn = hip::hipStreamBeginCaptureToGraph;
|
||||
@@ -1464,8 +1464,7 @@ NO_VECTORIZE const HipDispatchTable* GetHipDispatchTable() {
|
||||
static auto* _v = &GetDispatchTableImpl<HipDispatchTable>();
|
||||
return _v;
|
||||
}
|
||||
NO_VECTORIZE const HipCompilerDispatchTable*
|
||||
GetHipCompilerDispatchTable() {
|
||||
NO_VECTORIZE const HipCompilerDispatchTable* GetHipCompilerDispatchTable() {
|
||||
static auto* _v = &GetDispatchTableImpl<HipCompilerDispatchTable>();
|
||||
return _v;
|
||||
}
|
||||
@@ -1485,7 +1484,8 @@ constexpr auto ComputeTableOffset(size_t num_funcs) {
|
||||
// update the table versioning value before changing the value in HIP_ENFORCE_ABI_VERSIONING to make
|
||||
// this static assert pass.
|
||||
//
|
||||
// HIP_ENFORCE_ABI will cause a compiler error if the order of the members in the API table change. Do not reorder member variables and change existing HIP_ENFORCE_ABI values -- always
|
||||
// HIP_ENFORCE_ABI will cause a compiler error if the order of the members in the API table change.
|
||||
// Do not reorder member variables and change existing HIP_ENFORCE_ABI values -- always
|
||||
//
|
||||
// Please note: rocprofiler will do very strict compile time checks to make
|
||||
// sure these versioning values are appropriately updated -- so commenting out this check, only
|
||||
@@ -1502,7 +1502,7 @@ constexpr auto ComputeTableOffset(size_t num_funcs) {
|
||||
#define HIP_ENFORCE_ABI(TABLE, ENTRY, NUM) \
|
||||
static_assert(offsetof(TABLE, ENTRY) == ComputeTableOffset(NUM), \
|
||||
"ABI break for " #TABLE "." #ENTRY \
|
||||
". Only add new function pointers to end of struct and do not rearrange them " );
|
||||
". Only add new function pointers to end of struct and do not rearrange them ");
|
||||
|
||||
// These ensure that function pointers are not re-ordered
|
||||
// HIP_COMPILER_API_TABLE_STEP_VERSION == 0
|
||||
@@ -2026,11 +2026,11 @@ HIP_ENFORCE_ABI(HipDispatchTable, hipGraphBatchMemOpNodeGetParams_fn, 465);
|
||||
HIP_ENFORCE_ABI(HipDispatchTable, hipGraphBatchMemOpNodeSetParams_fn, 466);
|
||||
HIP_ENFORCE_ABI(HipDispatchTable, hipGraphExecBatchMemOpNodeSetParams_fn, 467);
|
||||
// HIP_RUNTIME_API_TABLE_STEP_VERSION == 9
|
||||
HIP_ENFORCE_ABI(HipDispatchTable, hipLinkAddData_fn , 468)
|
||||
HIP_ENFORCE_ABI(HipDispatchTable, hipLinkAddFile_fn , 469)
|
||||
HIP_ENFORCE_ABI(HipDispatchTable, hipLinkComplete_fn , 470)
|
||||
HIP_ENFORCE_ABI(HipDispatchTable, hipLinkCreate_fn , 471)
|
||||
HIP_ENFORCE_ABI(HipDispatchTable, hipLinkDestroy_fn , 472)
|
||||
HIP_ENFORCE_ABI(HipDispatchTable, hipLinkAddData_fn, 468)
|
||||
HIP_ENFORCE_ABI(HipDispatchTable, hipLinkAddFile_fn, 469)
|
||||
HIP_ENFORCE_ABI(HipDispatchTable, hipLinkComplete_fn, 470)
|
||||
HIP_ENFORCE_ABI(HipDispatchTable, hipLinkCreate_fn, 471)
|
||||
HIP_ENFORCE_ABI(HipDispatchTable, hipLinkDestroy_fn, 472)
|
||||
// HIP_RUNTIME_API_TABLE_STEP_VERSION == 10
|
||||
HIP_ENFORCE_ABI(HipDispatchTable, hipEventRecordWithFlags_fn, 473)
|
||||
|
||||
|
||||
@@ -81,10 +81,10 @@ struct ClangOffloadBundleCompressedHeader {
|
||||
};
|
||||
} // namespace symbols
|
||||
|
||||
//Forward Declaration for friend usage
|
||||
// Forward Declaration for friend usage
|
||||
class PlatformState;
|
||||
|
||||
//Code Object base class
|
||||
// Code Object base class
|
||||
class CodeObject {
|
||||
public:
|
||||
virtual ~CodeObject() {}
|
||||
@@ -96,20 +96,20 @@ class CodeObject {
|
||||
friend const std::vector<hipModule_t>& modules();
|
||||
};
|
||||
|
||||
//Dynamic Code Object
|
||||
// Dynamic Code Object
|
||||
class DynCO : public CodeObject {
|
||||
// Guards Dynamic Code object
|
||||
amd::Monitor dclock_{true};
|
||||
|
||||
public:
|
||||
public:
|
||||
DynCO() : device_id_(ihipGetDevice()), fb_info_(nullptr), module_(nullptr) {}
|
||||
virtual ~DynCO();
|
||||
|
||||
//LoadsCodeObject and its data
|
||||
hipError_t loadCodeObject(const char* fname, const void* image=nullptr);
|
||||
// LoadsCodeObject and its data
|
||||
hipError_t loadCodeObject(const char* fname, const void* image = nullptr);
|
||||
hipModule_t getModule() const { return module_; };
|
||||
|
||||
//Gets GlobalVar/Functions from a dynamically loaded code object
|
||||
// Gets GlobalVar/Functions from a dynamically loaded code object
|
||||
hipError_t getDynFunc(hipFunction_t* hfunc, std::string func_name);
|
||||
hipError_t getFuncCount(unsigned int* count);
|
||||
bool isValidDynFunc(const void* hfunc);
|
||||
@@ -128,60 +128,62 @@ public:
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
private:
|
||||
private:
|
||||
int device_id_;
|
||||
FatBinaryInfo* fb_info_;
|
||||
hipModule_t module_;
|
||||
|
||||
//Maps for vars/funcs, could be keyed in with std::string name
|
||||
// Maps for vars/funcs, could be keyed in with std::string name
|
||||
std::unordered_map<std::string, Function*> functions_;
|
||||
std::unordered_map<std::string, Var*> vars_;
|
||||
|
||||
//Populate Global Vars/Funcs from an code object(@ module_load)
|
||||
// Populate Global Vars/Funcs from an code object(@ module_load)
|
||||
hipError_t populateDynGlobalFuncs();
|
||||
hipError_t populateDynGlobalVars();
|
||||
hipError_t initDynManagedVars(const std::string& managedVar);
|
||||
};
|
||||
|
||||
//Static Code Object
|
||||
class StatCO: public CodeObject {
|
||||
// Static Code Object
|
||||
class StatCO : public CodeObject {
|
||||
// Guards Static Code object
|
||||
amd::Monitor sclock_{true};
|
||||
public:
|
||||
|
||||
public:
|
||||
StatCO();
|
||||
virtual ~StatCO();
|
||||
|
||||
//Add/Remove/Digest Fat Binaries passed to us from "__hipRegisterFatBinary"
|
||||
// Add/Remove/Digest Fat Binaries passed to us from "__hipRegisterFatBinary"
|
||||
FatBinaryInfo** addFatBinary(const void* data, bool initialized, bool& success);
|
||||
hipError_t removeFatBinary(FatBinaryInfo** module);
|
||||
hipError_t digestFatBinary(const void* data, FatBinaryInfo*& programs);
|
||||
|
||||
//Register vars/funcs given to use from __hipRegister[Var/Func/ManagedVar]
|
||||
// Register vars/funcs given to use from __hipRegister[Var/Func/ManagedVar]
|
||||
hipError_t registerStatFunction(const void* hostFunction, Function* func);
|
||||
hipError_t registerStatGlobalVar(const void* hostVar, Var* var);
|
||||
hipError_t registerStatManagedVar(Var *var);
|
||||
hipError_t registerStatManagedVar(Var* var);
|
||||
|
||||
//Retrive Vars/Funcs for a given hostSidePtr(const void*), unless stated otherwise.
|
||||
// Retrive Vars/Funcs for a given hostSidePtr(const void*), unless stated otherwise.
|
||||
const char* getStatFuncName(const void* hostFunction);
|
||||
hipError_t getStatFunc(hipFunction_t* hfunc, const void* hostFunction, int deviceId);
|
||||
hipError_t getStatFuncAttr(hipFuncAttributes* func_attr, const void* hostFunction, int deviceId);
|
||||
hipError_t getStatGlobalVar(const void* hostVar, int deviceId, hipDeviceptr_t* dev_ptr,
|
||||
size_t* size_ptr);
|
||||
|
||||
//Managed variable is a defined symbol in code object
|
||||
//pointer to the alocated managed memory has to be copied to the address of symbol
|
||||
// Managed variable is a defined symbol in code object
|
||||
// pointer to the alocated managed memory has to be copied to the address of symbol
|
||||
hipError_t initStatManagedVarDevicePtr(int deviceId);
|
||||
private:
|
||||
|
||||
private:
|
||||
friend class hip::PlatformState;
|
||||
//Populated during __hipRegisterFatBinary
|
||||
// Populated during __hipRegisterFatBinary
|
||||
std::unordered_map<const void*, FatBinaryInfo*> modules_;
|
||||
//Populated during __hipRegisterFuncs
|
||||
// Populated during __hipRegisterFuncs
|
||||
std::unordered_map<const void*, Function*> functions_;
|
||||
//Populated during __hipRegisterVars
|
||||
// Populated during __hipRegisterVars
|
||||
std::unordered_map<const void*, Var*> vars_;
|
||||
//Populated during __hipRegisterManagedVar
|
||||
// Populated during __hipRegisterManagedVar
|
||||
std::unordered_map<FatBinaryInfo**, std::vector<Var*> > managedVars_;
|
||||
//Reverse mapping of modules to speed up removal
|
||||
// Reverse mapping of modules to speed up removal
|
||||
std::unordered_map<FatBinaryInfo**, const void*> module_to_hostModule_;
|
||||
std::unordered_map<FatBinaryInfo**, std::vector<const void*> > module_to_hostFunctions_;
|
||||
std::unordered_map<FatBinaryInfo**, std::vector<const void*> > module_to_hostVars_;
|
||||
|
||||
@@ -103,7 +103,8 @@ static bool getTargetIDValue(std::string& input, std::string& processor, char& s
|
||||
}
|
||||
|
||||
bool isCodeObjectCompatibleWithDevice(std::string co_triple_target_id,
|
||||
std::string agent_triple_target_id, unsigned& genericVersion) {
|
||||
std::string agent_triple_target_id,
|
||||
unsigned& genericVersion) {
|
||||
// Primitive Check
|
||||
if (co_triple_target_id == agent_triple_target_id) return true;
|
||||
|
||||
@@ -137,8 +138,7 @@ bool isCodeObjectCompatibleWithDevice(std::string co_triple_target_id,
|
||||
// Check for compatibility
|
||||
if (genericVersion >= EF_AMDGPU_GENERIC_VERSION_MIN) {
|
||||
// co_processor is generic target
|
||||
if (!IsCompatibleWithGenericTarget(co_processor, agent_isa_processor))
|
||||
return false;
|
||||
if (!IsCompatibleWithGenericTarget(co_processor, agent_isa_processor)) return false;
|
||||
} else if (agent_isa_processor != co_processor) {
|
||||
return false;
|
||||
}
|
||||
@@ -455,29 +455,27 @@ bool compileToBitCode(const amd_comgr_data_set_t compileInputs, const std::strin
|
||||
}
|
||||
|
||||
bool CheckIfBundled(std::vector<char>& llvm_bitcode) {
|
||||
std::string magic(llvm_bitcode.begin(),
|
||||
llvm_bitcode.begin() + bundle_magic_string_size);
|
||||
std::string magic(llvm_bitcode.begin(), llvm_bitcode.begin() + bundle_magic_string_size);
|
||||
|
||||
if (magic.compare(CLANG_OFFLOAD_BUNDLER_MAGIC_STR) == 0) {
|
||||
return true;
|
||||
}
|
||||
// File is not bundled
|
||||
return false;
|
||||
|
||||
}
|
||||
// Unbundle Bitcode using COMGR action
|
||||
// Supports only 1 Bundle Entry ID for now
|
||||
bool UnbundleUsingComgr(std::vector<char>& source, const std::string& isa,
|
||||
std::vector<std::string>& linkOptions, std::string& buildLog,
|
||||
std::vector<char>& unbundled_bitcode, const char *bundleEntryIDs[],
|
||||
std::vector<char>& unbundled_bitcode, const char* bundleEntryIDs[],
|
||||
size_t bundleEntryIDsCount) {
|
||||
amd_comgr_data_set_t linkinput;
|
||||
if (amd::Comgr::create_data_set(&linkinput) != AMD_COMGR_STATUS_SUCCESS) {
|
||||
return false;
|
||||
return false;
|
||||
}
|
||||
std::string name = "UnbundleCode.bc";
|
||||
if (!helpers::addCodeObjData(linkinput, source, name, AMD_COMGR_DATA_KIND_BC_BUNDLE)) {
|
||||
return false;
|
||||
return false;
|
||||
}
|
||||
|
||||
amd_comgr_action_info_t action;
|
||||
@@ -490,7 +488,8 @@ bool UnbundleUsingComgr(std::vector<char>& source, const std::string& isa,
|
||||
return false;
|
||||
}
|
||||
|
||||
if(amd::Comgr::action_info_set_bundle_entry_ids(action, bundleEntryIDs, bundleEntryIDsCount) != AMD_COMGR_STATUS_SUCCESS) {
|
||||
if (amd::Comgr::action_info_set_bundle_entry_ids(action, bundleEntryIDs, bundleEntryIDsCount) !=
|
||||
AMD_COMGR_STATUS_SUCCESS) {
|
||||
amd::Comgr::destroy_action_info(action);
|
||||
return false;
|
||||
}
|
||||
@@ -501,15 +500,14 @@ bool UnbundleUsingComgr(std::vector<char>& source, const std::string& isa,
|
||||
return false;
|
||||
}
|
||||
|
||||
if (auto res =
|
||||
amd::Comgr::do_action(AMD_COMGR_ACTION_UNBUNDLE, action, linkinput, output);
|
||||
if (auto res = amd::Comgr::do_action(AMD_COMGR_ACTION_UNBUNDLE, action, linkinput, output);
|
||||
res != AMD_COMGR_STATUS_SUCCESS) {
|
||||
amd::Comgr::destroy_action_info(action);
|
||||
amd::Comgr::destroy_data_set(output);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!extractBuildLog(output, buildLog)) {
|
||||
if (!extractBuildLog(output, buildLog)) {
|
||||
amd::Comgr::destroy_action_info(action);
|
||||
amd::Comgr::destroy_data_set(output);
|
||||
return false;
|
||||
@@ -533,8 +531,7 @@ bool linkLLVMBitcode(const amd_comgr_data_set_t linkInputs, const std::string& i
|
||||
const amd_comgr_language_t lang = AMD_COMGR_LANGUAGE_HIP;
|
||||
amd_comgr_action_info_t action;
|
||||
|
||||
if (auto res = createAction(action, linkOptions, isa, lang);
|
||||
res != AMD_COMGR_STATUS_SUCCESS) {
|
||||
if (auto res = createAction(action, linkOptions, isa, lang); res != AMD_COMGR_STATUS_SUCCESS) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -569,8 +566,8 @@ bool linkLLVMBitcode(const amd_comgr_data_set_t linkInputs, const std::string& i
|
||||
}
|
||||
|
||||
bool convertSPIRVToLLVMBC(const amd_comgr_data_set_t linkInputs, const std::string& isa,
|
||||
std::vector<std::string>& linkOptions, std::string& buildLog,
|
||||
std::vector<char>& LinkedLLVMBitcode) {
|
||||
std::vector<std::string>& linkOptions, std::string& buildLog,
|
||||
std::vector<char>& LinkedLLVMBitcode) {
|
||||
amd_comgr_action_info_t action;
|
||||
|
||||
if (auto res = createAction(action, linkOptions, isa, AMD_COMGR_LANGUAGE_NONE);
|
||||
@@ -832,39 +829,39 @@ bool fillMangledNames(std::vector<char>& dataVec, std::map<std::string, std::str
|
||||
const std::map<std::string, std::string>& GenericTargetMapping() {
|
||||
// The map is subject to change per removing policy
|
||||
static const std::map<std::string, std::string> genericTargetMap{
|
||||
// "gfx9-generic"
|
||||
{"gfx900", "gfx9-generic"},
|
||||
{"gfx902", "gfx9-generic"},
|
||||
{"gfx904", "gfx9-generic"},
|
||||
{"gfx906", "gfx9-generic"},
|
||||
{"gfx909", "gfx9-generic"},
|
||||
{"gfx90c", "gfx9-generic"},
|
||||
// "gfx9-4-generic"
|
||||
{"gfx942", "gfx9-4-generic"},
|
||||
{"gfx950", "gfx9-4-generic"},
|
||||
// "gfx10-1-generic"
|
||||
{"gfx1010", "gfx10-1-generic"},
|
||||
{"gfx1011", "gfx10-1-generic"},
|
||||
{"gfx1012", "gfx10-1-generic"},
|
||||
{"gfx1013", "gfx10-1-generic"},
|
||||
// "gfx10-3-generic"
|
||||
{"gfx1030", "gfx10-3-generic"},
|
||||
{"gfx1031", "gfx10-3-generic"},
|
||||
{"gfx1032", "gfx10-3-generic"},
|
||||
{"gfx1033", "gfx10-3-generic"},
|
||||
{"gfx1034", "gfx10-3-generic"},
|
||||
{"gfx1035", "gfx10-3-generic"},
|
||||
{"gfx1036", "gfx10-3-generic"},
|
||||
// "gfx11-generic"
|
||||
{"gfx1100", "gfx11-generic"},
|
||||
{"gfx1101", "gfx11-generic"},
|
||||
{"gfx1102", "gfx11-generic"},
|
||||
{"gfx1103", "gfx11-generic"},
|
||||
{"gfx1150", "gfx11-generic"},
|
||||
{"gfx1151", "gfx11-generic"},
|
||||
// "gfx12-generic"
|
||||
{"gfx1200", "gfx12-generic"},
|
||||
{"gfx1201", "gfx12-generic"},
|
||||
// "gfx9-generic"
|
||||
{"gfx900", "gfx9-generic"},
|
||||
{"gfx902", "gfx9-generic"},
|
||||
{"gfx904", "gfx9-generic"},
|
||||
{"gfx906", "gfx9-generic"},
|
||||
{"gfx909", "gfx9-generic"},
|
||||
{"gfx90c", "gfx9-generic"},
|
||||
// "gfx9-4-generic"
|
||||
{"gfx942", "gfx9-4-generic"},
|
||||
{"gfx950", "gfx9-4-generic"},
|
||||
// "gfx10-1-generic"
|
||||
{"gfx1010", "gfx10-1-generic"},
|
||||
{"gfx1011", "gfx10-1-generic"},
|
||||
{"gfx1012", "gfx10-1-generic"},
|
||||
{"gfx1013", "gfx10-1-generic"},
|
||||
// "gfx10-3-generic"
|
||||
{"gfx1030", "gfx10-3-generic"},
|
||||
{"gfx1031", "gfx10-3-generic"},
|
||||
{"gfx1032", "gfx10-3-generic"},
|
||||
{"gfx1033", "gfx10-3-generic"},
|
||||
{"gfx1034", "gfx10-3-generic"},
|
||||
{"gfx1035", "gfx10-3-generic"},
|
||||
{"gfx1036", "gfx10-3-generic"},
|
||||
// "gfx11-generic"
|
||||
{"gfx1100", "gfx11-generic"},
|
||||
{"gfx1101", "gfx11-generic"},
|
||||
{"gfx1102", "gfx11-generic"},
|
||||
{"gfx1103", "gfx11-generic"},
|
||||
{"gfx1150", "gfx11-generic"},
|
||||
{"gfx1151", "gfx11-generic"},
|
||||
// "gfx12-generic"
|
||||
{"gfx1200", "gfx12-generic"},
|
||||
{"gfx1201", "gfx12-generic"},
|
||||
};
|
||||
return genericTargetMap;
|
||||
}
|
||||
@@ -900,7 +897,6 @@ RTCProgram::RTCProgram(std::string name) : name_(name) {
|
||||
}
|
||||
|
||||
bool RTCProgram::findIsa() {
|
||||
|
||||
#ifdef BUILD_SHARED_LIBS
|
||||
const char* libName;
|
||||
#ifdef _WIN32
|
||||
@@ -1009,7 +1005,7 @@ bool LinkProgram::isLinkerValid(LinkProgram* link_program) {
|
||||
}
|
||||
|
||||
bool LinkProgram::AddLinkerOptions(unsigned int num_options, hipJitOption* options_ptr,
|
||||
void** options_vals_ptr) {
|
||||
void** options_vals_ptr) {
|
||||
for (size_t opt_idx = 0; opt_idx < num_options; ++opt_idx) {
|
||||
if (options_vals_ptr[opt_idx] == nullptr) {
|
||||
LogError("Options value can not be nullptr");
|
||||
@@ -1032,7 +1028,6 @@ bool LinkProgram::AddLinkerOptions(unsigned int num_options, hipJitOption* optio
|
||||
}
|
||||
|
||||
|
||||
|
||||
amd_comgr_data_kind_t LinkProgram::GetCOMGRDataKind(hipJitInputType input_type) {
|
||||
amd_comgr_data_kind_t data_kind = AMD_COMGR_DATA_KIND_UNDEF;
|
||||
|
||||
@@ -1061,7 +1056,7 @@ amd_comgr_data_kind_t LinkProgram::GetCOMGRDataKind(hipJitInputType input_type)
|
||||
|
||||
|
||||
bool LinkProgram::AddLinkerDataImpl(std::vector<char>& link_data, hipJitInputType input_type,
|
||||
std::string& link_file_name) {
|
||||
std::string& link_file_name) {
|
||||
std::vector<char> llvm_code_object;
|
||||
is_bundled_ = helpers::CheckIfBundled(link_data);
|
||||
|
||||
@@ -1079,20 +1074,20 @@ bool LinkProgram::AddLinkerDataImpl(std::vector<char>& link_data, hipJitInputTyp
|
||||
|
||||
llvm_code_object.assign(link_data.begin() + co_offset, link_data.begin() + co_offset + co_size);
|
||||
} else if (is_bundled_ && input_type == hipJitInputSpirv) {
|
||||
const char* bundleEntryIDs[] = { helpers::SPIRV_BUNDLE_ENTRY_ID };
|
||||
const char* bundleEntryIDs[] = {helpers::SPIRV_BUNDLE_ENTRY_ID};
|
||||
size_t bundleEntryIDsCount = sizeof(bundleEntryIDs) / sizeof(bundleEntryIDs[0]);
|
||||
if(!helpers::UnbundleUsingComgr(link_data, isa_, link_options_, build_log_, llvm_code_object,
|
||||
bundleEntryIDs, bundleEntryIDsCount)) {
|
||||
if (!helpers::UnbundleUsingComgr(link_data, isa_, link_options_, build_log_, llvm_code_object,
|
||||
bundleEntryIDs, bundleEntryIDsCount)) {
|
||||
LogError("Error in hip Linker: Unable to unbundle SPIRV Bitcode");
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
llvm_code_object.assign(link_data.begin(), link_data.end());
|
||||
llvm_code_object.assign(link_data.begin(), link_data.end());
|
||||
}
|
||||
|
||||
if ((data_kind_ = GetCOMGRDataKind(input_type)) == AMD_COMGR_DATA_KIND_UNDEF) {
|
||||
LogError("Cannot find the correct COMGR data kind");
|
||||
return false;
|
||||
LogError("Cannot find the correct COMGR data kind");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!helpers::addCodeObjData(link_input_, llvm_code_object, link_file_name, data_kind_)) {
|
||||
@@ -1126,7 +1121,7 @@ bool LinkProgram::AddLinkerFile(std::string file_path, hipJitInputType input_typ
|
||||
}
|
||||
|
||||
bool LinkProgram::AddLinkerData(void* image_ptr, size_t image_size, std::string link_file_name,
|
||||
hipJitInputType input_type) {
|
||||
hipJitInputType input_type) {
|
||||
char* image_char_buf = reinterpret_cast<char*>(image_ptr);
|
||||
std::vector<char> llvm_code_object(image_char_buf, image_char_buf + image_size);
|
||||
|
||||
@@ -1146,13 +1141,15 @@ bool LinkProgram::LinkComplete(void** bin_out, size_t* size_out) {
|
||||
if (data_kind_ == AMD_COMGR_DATA_KIND_SPIRV) {
|
||||
// Convert SPIRV Unbundled code object to LLVM Bitcode
|
||||
std::vector<char> llvmbc_from_spirv;
|
||||
if (!helpers::convertSPIRVToLLVMBC(link_input_, isa_, link_options_, build_log_, llvmbc_from_spirv)) {
|
||||
if (!helpers::convertSPIRVToLLVMBC(link_input_, isa_, link_options_, build_log_,
|
||||
llvmbc_from_spirv)) {
|
||||
LogError("Error in hip Linker: unable to convert SPIRV to BC");
|
||||
return false;
|
||||
}
|
||||
|
||||
std::string linkedFileName = "LLVMBitcodeFromSPIRV.bc";
|
||||
if (!helpers::addCodeObjData(link_input, llvmbc_from_spirv, linkedFileName, AMD_COMGR_DATA_KIND_BC)) {
|
||||
if (!helpers::addCodeObjData(link_input, llvmbc_from_spirv, linkedFileName,
|
||||
AMD_COMGR_DATA_KIND_BC)) {
|
||||
LogError("Error in hip Linker: unable to add linked LLVM bitcode");
|
||||
return false;
|
||||
}
|
||||
@@ -1182,7 +1179,7 @@ bool LinkProgram::LinkComplete(void** bin_out, size_t* size_out) {
|
||||
}()
|
||||
.c_str());
|
||||
if (!helpers::createExecutable(exec_input_, isa_, exe_options, build_log_, executable_,
|
||||
data_kind_ == AMD_COMGR_DATA_KIND_SPIRV)) {
|
||||
data_kind_ == AMD_COMGR_DATA_KIND_SPIRV)) {
|
||||
LogPrintfInfo("Error in hip linker: unable to create exectuable: %s", build_log_.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -124,7 +124,7 @@ void setCurrentDevice(unsigned int index) {
|
||||
}
|
||||
|
||||
hip::Stream* getStream(hipStream_t stream, bool wait) {
|
||||
if (stream == nullptr || stream == hipStreamLegacy) {
|
||||
if (stream == nullptr || stream == hipStreamLegacy) {
|
||||
return getNullStream(wait);
|
||||
} else {
|
||||
hip::Stream* hip_stream = reinterpret_cast<hip::Stream*>(stream);
|
||||
@@ -163,7 +163,7 @@ int getDeviceID(amd::Context& ctx) {
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
hip::Stream* getNullStream(bool wait ) {
|
||||
hip::Stream* getNullStream(bool wait) {
|
||||
Device* device = getCurrentDevice();
|
||||
if (device == nullptr) {
|
||||
LogError("Invalid device");
|
||||
|
||||
@@ -25,11 +25,9 @@ THE SOFTWARE.
|
||||
#include <hip/driver_types.h>
|
||||
#include <hip/texture_types.h>
|
||||
|
||||
namespace hip
|
||||
{
|
||||
inline
|
||||
cl_channel_type getCLChannelType(const hipArray_Format hipFormat,
|
||||
const hipTextureReadMode hipReadMode) {
|
||||
namespace hip {
|
||||
inline cl_channel_type getCLChannelType(const hipArray_Format hipFormat,
|
||||
const hipTextureReadMode hipReadMode) {
|
||||
if (hipReadMode == hipReadModeElementType) {
|
||||
switch (hipFormat) {
|
||||
case HIP_AD_FORMAT_UNSIGNED_INT8:
|
||||
@@ -70,13 +68,11 @@ cl_channel_type getCLChannelType(const hipArray_Format hipFormat,
|
||||
}
|
||||
}
|
||||
|
||||
//error scenario
|
||||
// error scenario
|
||||
return {};
|
||||
}
|
||||
|
||||
inline
|
||||
cl_channel_order getCLChannelOrder(const unsigned int hipNumChannels,
|
||||
const int sRGB) {
|
||||
inline cl_channel_order getCLChannelOrder(const unsigned int hipNumChannels, const int sRGB) {
|
||||
switch (hipNumChannels) {
|
||||
case 1:
|
||||
return CL_R;
|
||||
@@ -88,15 +84,14 @@ cl_channel_order getCLChannelOrder(const unsigned int hipNumChannels,
|
||||
break;
|
||||
}
|
||||
|
||||
//error scenario
|
||||
// error scenario
|
||||
return {};
|
||||
}
|
||||
|
||||
inline
|
||||
cl_mem_object_type getCLMemObjectType(const unsigned int hipWidth,
|
||||
const unsigned int hipHeight,
|
||||
const unsigned int hipDepth,
|
||||
const unsigned int flags) {
|
||||
inline cl_mem_object_type getCLMemObjectType(const unsigned int hipWidth,
|
||||
const unsigned int hipHeight,
|
||||
const unsigned int hipDepth,
|
||||
const unsigned int flags) {
|
||||
if ((flags & hipArrayLayered) == hipArrayLayered) {
|
||||
if ((hipWidth != 0) && (hipHeight == 0) && (hipDepth != 0)) {
|
||||
return CL_MEM_OBJECT_IMAGE1D_ARRAY;
|
||||
@@ -126,8 +121,7 @@ inline bool isLayered1D(const hipArray* arr) {
|
||||
return CL_MEM_OBJECT_IMAGE1D_ARRAY == getCLMemObjectType(arr);
|
||||
}
|
||||
|
||||
inline
|
||||
cl_addressing_mode getCLAddressingMode(const hipTextureAddressMode hipAddressMode) {
|
||||
inline cl_addressing_mode getCLAddressingMode(const hipTextureAddressMode hipAddressMode) {
|
||||
switch (hipAddressMode) {
|
||||
case hipAddressModeWrap:
|
||||
return CL_ADDRESS_REPEAT;
|
||||
@@ -139,12 +133,11 @@ cl_addressing_mode getCLAddressingMode(const hipTextureAddressMode hipAddressMod
|
||||
return CL_ADDRESS_CLAMP;
|
||||
}
|
||||
|
||||
//error scenario
|
||||
// error scenario
|
||||
return {};
|
||||
}
|
||||
|
||||
inline
|
||||
cl_filter_mode getCLFilterMode(const hipTextureFilterMode hipFilterMode) {
|
||||
inline cl_filter_mode getCLFilterMode(const hipTextureFilterMode hipFilterMode) {
|
||||
switch (hipFilterMode) {
|
||||
case hipFilterModePoint:
|
||||
return CL_FILTER_NEAREST;
|
||||
@@ -152,12 +145,11 @@ cl_filter_mode getCLFilterMode(const hipTextureFilterMode hipFilterMode) {
|
||||
return CL_FILTER_LINEAR;
|
||||
}
|
||||
|
||||
//error scenario
|
||||
// error scenario
|
||||
return {};
|
||||
}
|
||||
|
||||
inline
|
||||
cl_mem_object_type getCLMemObjectType(const hipResourceType hipResType) {
|
||||
inline cl_mem_object_type getCLMemObjectType(const hipResourceType hipResType) {
|
||||
switch (hipResType) {
|
||||
case hipResourceTypeLinear:
|
||||
return CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
@@ -167,12 +159,11 @@ cl_mem_object_type getCLMemObjectType(const hipResourceType hipResType) {
|
||||
break;
|
||||
}
|
||||
|
||||
//error scenario
|
||||
// error scenario
|
||||
return {};
|
||||
}
|
||||
|
||||
inline
|
||||
hipArray_Format getCL2hipArrayFormat(const cl_channel_type type) {
|
||||
inline hipArray_Format getCL2hipArrayFormat(const cl_channel_type type) {
|
||||
switch (type) {
|
||||
case CL_SNORM_INT8:
|
||||
case CL_SIGNED_INT8:
|
||||
@@ -200,8 +191,7 @@ hipArray_Format getCL2hipArrayFormat(const cl_channel_type type) {
|
||||
return HIP_AD_FORMAT_UNSIGNED_INT8;
|
||||
}
|
||||
}
|
||||
inline
|
||||
size_t getElementSize(const hipArray_const_t array) {
|
||||
inline size_t getElementSize(const hipArray_const_t array) {
|
||||
switch (array->Format) {
|
||||
case HIP_AD_FORMAT_UNSIGNED_INT8:
|
||||
case HIP_AD_FORMAT_SIGNED_INT8:
|
||||
@@ -216,13 +206,11 @@ size_t getElementSize(const hipArray_const_t array) {
|
||||
return 4 * array->NumChannels;
|
||||
}
|
||||
|
||||
//error scenario
|
||||
// error scenario
|
||||
return {};
|
||||
}
|
||||
|
||||
inline
|
||||
hipChannelFormatDesc getChannelFormatDesc(int numChannels,
|
||||
hipArray_Format arrayFormat) {
|
||||
inline hipChannelFormatDesc getChannelFormatDesc(int numChannels, hipArray_Format arrayFormat) {
|
||||
switch (arrayFormat) {
|
||||
case HIP_AD_FORMAT_UNSIGNED_INT8:
|
||||
switch (numChannels) {
|
||||
@@ -298,43 +286,39 @@ hipChannelFormatDesc getChannelFormatDesc(int numChannels,
|
||||
}
|
||||
}
|
||||
|
||||
//error scenario
|
||||
// error scenario
|
||||
return {};
|
||||
}
|
||||
|
||||
inline
|
||||
unsigned int getNumChannels(const hipChannelFormatDesc& desc) {
|
||||
inline unsigned int getNumChannels(const hipChannelFormatDesc& desc) {
|
||||
return ((desc.x != 0) + (desc.y != 0) + (desc.z != 0) + (desc.w != 0));
|
||||
}
|
||||
|
||||
inline
|
||||
bool CheckArrayFormat(const hipChannelFormatDesc& desc) {
|
||||
if(desc.x == 0) {
|
||||
inline bool CheckArrayFormat(const hipChannelFormatDesc& desc) {
|
||||
if (desc.x == 0) {
|
||||
return false;
|
||||
} else {
|
||||
if(desc.y != 0 && desc.y != desc.x) {
|
||||
if (desc.y != 0 && desc.y != desc.x) {
|
||||
return false;
|
||||
}
|
||||
if(desc.z !=0 && desc.z != desc.x) {
|
||||
if (desc.z != 0 && desc.z != desc.x) {
|
||||
return false;
|
||||
}
|
||||
if(desc.w !=0 && desc.w != desc.x) {
|
||||
if (desc.w != 0 && desc.w != desc.x) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
// The bit channel description should not allow any channels after a zero channel
|
||||
if (desc.y == 0) {
|
||||
return !(desc.z > 0 || desc.w > 0);
|
||||
}
|
||||
else if (desc.z == 0) {
|
||||
} else if (desc.z == 0) {
|
||||
return !(desc.w > 0);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
inline
|
||||
hipArray_Format getArrayFormat(const hipChannelFormatDesc& desc) {
|
||||
inline hipArray_Format getArrayFormat(const hipChannelFormatDesc& desc) {
|
||||
switch (desc.f) {
|
||||
case hipChannelFormatKindUnsigned:
|
||||
switch (desc.x) {
|
||||
@@ -365,12 +349,11 @@ hipArray_Format getArrayFormat(const hipChannelFormatDesc& desc) {
|
||||
break;
|
||||
}
|
||||
|
||||
//error scenario
|
||||
// error scenario
|
||||
return {};
|
||||
}
|
||||
|
||||
inline
|
||||
int getNumChannels(const hipResourceViewFormat hipFormat) {
|
||||
inline int getNumChannels(const hipResourceViewFormat hipFormat) {
|
||||
switch (hipFormat) {
|
||||
case hipResViewFormatUnsignedChar1:
|
||||
case hipResViewFormatSignedChar1:
|
||||
@@ -403,12 +386,11 @@ int getNumChannels(const hipResourceViewFormat hipFormat) {
|
||||
break;
|
||||
}
|
||||
|
||||
//error scenario
|
||||
// error scenario
|
||||
return {};
|
||||
}
|
||||
|
||||
inline
|
||||
hipArray_Format getArrayFormat(const hipResourceViewFormat hipFormat) {
|
||||
inline hipArray_Format getArrayFormat(const hipResourceViewFormat hipFormat) {
|
||||
switch (hipFormat) {
|
||||
case hipResViewFormatUnsignedChar1:
|
||||
case hipResViewFormatUnsignedChar2:
|
||||
@@ -446,12 +428,11 @@ hipArray_Format getArrayFormat(const hipResourceViewFormat hipFormat) {
|
||||
break;
|
||||
}
|
||||
|
||||
//error scenario
|
||||
// error scenario
|
||||
return {};
|
||||
}
|
||||
|
||||
inline
|
||||
hipResourceViewFormat getResourceViewFormat(const hipChannelFormatDesc& desc) {
|
||||
inline hipResourceViewFormat getResourceViewFormat(const hipChannelFormatDesc& desc) {
|
||||
switch (desc.f) {
|
||||
case hipChannelFormatKindUnsigned:
|
||||
switch (getNumChannels(desc)) {
|
||||
@@ -541,12 +522,11 @@ hipResourceViewFormat getResourceViewFormat(const hipChannelFormatDesc& desc) {
|
||||
break;
|
||||
}
|
||||
|
||||
//error scenario
|
||||
// error scenario
|
||||
return {};
|
||||
}
|
||||
|
||||
inline
|
||||
hipTextureDesc getTextureDesc(const textureReference* texRef) {
|
||||
inline hipTextureDesc getTextureDesc(const textureReference* texRef) {
|
||||
hipTextureDesc texDesc = {};
|
||||
std::memcpy(texDesc.addressMode, texRef->addressMode, sizeof(texDesc.addressMode));
|
||||
texDesc.filterMode = texRef->filterMode;
|
||||
@@ -562,9 +542,8 @@ hipTextureDesc getTextureDesc(const textureReference* texRef) {
|
||||
return texDesc;
|
||||
}
|
||||
|
||||
inline
|
||||
hipResourceViewDesc getResourceViewDesc(hipArray_const_t array,
|
||||
const hipResourceViewFormat format) {
|
||||
inline hipResourceViewDesc getResourceViewDesc(hipArray_const_t array,
|
||||
const hipResourceViewFormat format) {
|
||||
hipResourceViewDesc resViewDesc = {};
|
||||
resViewDesc.format = format;
|
||||
resViewDesc.width = array->width;
|
||||
@@ -578,9 +557,8 @@ hipResourceViewDesc getResourceViewDesc(hipArray_const_t array,
|
||||
return resViewDesc;
|
||||
}
|
||||
|
||||
inline
|
||||
hipResourceViewDesc getResourceViewDesc(hipMipmappedArray_const_t array,
|
||||
const hipResourceViewFormat format) {
|
||||
inline hipResourceViewDesc getResourceViewDesc(hipMipmappedArray_const_t array,
|
||||
const hipResourceViewFormat format) {
|
||||
hipResourceViewDesc resViewDesc = {};
|
||||
resViewDesc.format = format;
|
||||
resViewDesc.width = array->width;
|
||||
@@ -594,8 +572,7 @@ hipResourceViewDesc getResourceViewDesc(hipMipmappedArray_const_t array,
|
||||
return resViewDesc;
|
||||
}
|
||||
|
||||
inline
|
||||
std::pair<hipMemoryType, hipMemoryType> getMemoryType(const hipMemcpyKind kind) {
|
||||
inline std::pair<hipMemoryType, hipMemoryType> getMemoryType(const hipMemcpyKind kind) {
|
||||
switch (kind) {
|
||||
case hipMemcpyHostToHost:
|
||||
return {hipMemoryTypeHost, hipMemoryTypeHost};
|
||||
@@ -610,12 +587,11 @@ std::pair<hipMemoryType, hipMemoryType> getMemoryType(const hipMemcpyKind kind)
|
||||
return {hipMemoryTypeUnified, hipMemoryTypeUnified};
|
||||
}
|
||||
|
||||
//error scenario
|
||||
// error scenario
|
||||
return {};
|
||||
}
|
||||
|
||||
inline
|
||||
HIP_MEMCPY3D getDrvMemcpy3DDesc(const hip_Memcpy2D& desc2D) {
|
||||
inline HIP_MEMCPY3D getDrvMemcpy3DDesc(const hip_Memcpy2D& desc2D) {
|
||||
HIP_MEMCPY3D desc3D = {};
|
||||
|
||||
desc3D.srcXInBytes = desc2D.srcXInBytes;
|
||||
@@ -626,8 +602,7 @@ HIP_MEMCPY3D getDrvMemcpy3DDesc(const hip_Memcpy2D& desc2D) {
|
||||
desc3D.srcHost = desc2D.srcHost;
|
||||
desc3D.srcDevice = desc2D.srcDevice;
|
||||
desc3D.srcArray = desc2D.srcArray;
|
||||
desc3D.srcPitch = desc2D.srcPitch ? desc2D.srcPitch
|
||||
: (desc2D.srcXInBytes + desc2D.WidthInBytes);
|
||||
desc3D.srcPitch = desc2D.srcPitch ? desc2D.srcPitch : (desc2D.srcXInBytes + desc2D.WidthInBytes);
|
||||
desc3D.srcHeight = 0;
|
||||
|
||||
desc3D.dstXInBytes = desc2D.dstXInBytes;
|
||||
@@ -638,8 +613,7 @@ HIP_MEMCPY3D getDrvMemcpy3DDesc(const hip_Memcpy2D& desc2D) {
|
||||
desc3D.dstHost = desc2D.dstHost;
|
||||
desc3D.dstDevice = desc2D.dstDevice;
|
||||
desc3D.dstArray = desc2D.dstArray;
|
||||
desc3D.dstPitch = desc2D.dstPitch ? desc2D.dstPitch
|
||||
: (desc2D.dstXInBytes + desc2D.WidthInBytes);
|
||||
desc3D.dstPitch = desc2D.dstPitch ? desc2D.dstPitch : (desc2D.dstXInBytes + desc2D.WidthInBytes);
|
||||
desc3D.dstHeight = 0;
|
||||
|
||||
desc3D.WidthInBytes = desc2D.WidthInBytes;
|
||||
@@ -649,8 +623,7 @@ HIP_MEMCPY3D getDrvMemcpy3DDesc(const hip_Memcpy2D& desc2D) {
|
||||
return desc3D;
|
||||
}
|
||||
|
||||
inline
|
||||
HIP_MEMCPY3D getDrvMemcpy3DDesc(const hipMemcpy3DParms& desc) {
|
||||
inline HIP_MEMCPY3D getDrvMemcpy3DDesc(const hipMemcpy3DParms& desc) {
|
||||
HIP_MEMCPY3D descDrv = {};
|
||||
|
||||
descDrv.WidthInBytes = desc.extent.width;
|
||||
@@ -702,7 +675,8 @@ HIP_MEMCPY3D getDrvMemcpy3DDesc(const hipMemcpy3DParms& desc) {
|
||||
descDrv.dstHeight = desc.dstPtr.ysize;
|
||||
}
|
||||
|
||||
// If a HIP array is participating in the copy, the extent is defined in terms of that array's elements.
|
||||
// If a HIP array is participating in the copy, the extent is defined in terms of that array's
|
||||
// elements.
|
||||
if ((desc.srcArray != nullptr) && (desc.dstArray == nullptr)) {
|
||||
descDrv.WidthInBytes *= getElementSize(desc.srcArray);
|
||||
} else if ((desc.srcArray == nullptr) && (desc.dstArray != nullptr)) {
|
||||
@@ -733,108 +707,101 @@ HIP_MEMCPY3D getDrvMemcpy3DDesc(const hipMemcpy3DParms& desc) {
|
||||
return descDrv;
|
||||
}
|
||||
|
||||
inline
|
||||
hipResourceType getResourceType(const HIPresourcetype resType) {
|
||||
inline hipResourceType getResourceType(const HIPresourcetype resType) {
|
||||
// These two enums should be isomorphic.
|
||||
return static_cast<hipResourceType>(resType);
|
||||
}
|
||||
|
||||
inline
|
||||
HIPresourcetype getResourceType(const hipResourceType resType) {
|
||||
inline HIPresourcetype getResourceType(const hipResourceType resType) {
|
||||
// These two enums should be isomorphic.
|
||||
return static_cast<HIPresourcetype>(resType);
|
||||
}
|
||||
|
||||
inline
|
||||
hipResourceDesc getResourceDesc(const HIP_RESOURCE_DESC& resDesc) {
|
||||
inline hipResourceDesc getResourceDesc(const HIP_RESOURCE_DESC& resDesc) {
|
||||
hipResourceDesc desc;
|
||||
|
||||
desc.resType = getResourceType(resDesc.resType);
|
||||
switch (desc.resType) {
|
||||
case hipResourceTypeArray:
|
||||
desc.res.array.array = resDesc.res.array.hArray;
|
||||
break;
|
||||
case hipResourceTypeMipmappedArray:
|
||||
desc.res.mipmap.mipmap = resDesc.res.mipmap.hMipmappedArray;
|
||||
break;
|
||||
case hipResourceTypeLinear:
|
||||
desc.res.linear.devPtr = resDesc.res.linear.devPtr;
|
||||
desc.res.linear.desc = getChannelFormatDesc(resDesc.res.linear.numChannels, resDesc.res.linear.format);
|
||||
desc.res.linear.sizeInBytes = resDesc.res.linear.sizeInBytes;
|
||||
break;
|
||||
case hipResourceTypePitch2D:
|
||||
desc.res.pitch2D.devPtr = resDesc.res.pitch2D.devPtr;
|
||||
desc.res.pitch2D.desc = getChannelFormatDesc(resDesc.res.pitch2D.numChannels, resDesc.res.pitch2D.format);
|
||||
desc.res.pitch2D.width = resDesc.res.pitch2D.width;
|
||||
desc.res.pitch2D.height = resDesc.res.pitch2D.height;
|
||||
desc.res.pitch2D.pitchInBytes = resDesc.res.pitch2D.pitchInBytes;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
case hipResourceTypeArray:
|
||||
desc.res.array.array = resDesc.res.array.hArray;
|
||||
break;
|
||||
case hipResourceTypeMipmappedArray:
|
||||
desc.res.mipmap.mipmap = resDesc.res.mipmap.hMipmappedArray;
|
||||
break;
|
||||
case hipResourceTypeLinear:
|
||||
desc.res.linear.devPtr = resDesc.res.linear.devPtr;
|
||||
desc.res.linear.desc =
|
||||
getChannelFormatDesc(resDesc.res.linear.numChannels, resDesc.res.linear.format);
|
||||
desc.res.linear.sizeInBytes = resDesc.res.linear.sizeInBytes;
|
||||
break;
|
||||
case hipResourceTypePitch2D:
|
||||
desc.res.pitch2D.devPtr = resDesc.res.pitch2D.devPtr;
|
||||
desc.res.pitch2D.desc =
|
||||
getChannelFormatDesc(resDesc.res.pitch2D.numChannels, resDesc.res.pitch2D.format);
|
||||
desc.res.pitch2D.width = resDesc.res.pitch2D.width;
|
||||
desc.res.pitch2D.height = resDesc.res.pitch2D.height;
|
||||
desc.res.pitch2D.pitchInBytes = resDesc.res.pitch2D.pitchInBytes;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return desc;
|
||||
}
|
||||
|
||||
inline
|
||||
HIP_RESOURCE_DESC getResourceDesc(const hipResourceDesc& resDesc) {
|
||||
inline HIP_RESOURCE_DESC getResourceDesc(const hipResourceDesc& resDesc) {
|
||||
HIP_RESOURCE_DESC desc;
|
||||
|
||||
desc.resType = getResourceType(resDesc.resType);
|
||||
switch (desc.resType) {
|
||||
case HIP_RESOURCE_TYPE_ARRAY:
|
||||
desc.res.array.hArray = resDesc.res.array.array;
|
||||
break;
|
||||
case HIP_RESOURCE_TYPE_MIPMAPPED_ARRAY:
|
||||
desc.res.mipmap.hMipmappedArray = resDesc.res.mipmap.mipmap;
|
||||
break;
|
||||
case HIP_RESOURCE_TYPE_LINEAR:
|
||||
desc.res.linear.devPtr = resDesc.res.linear.devPtr;
|
||||
desc.res.linear.numChannels = getNumChannels(resDesc.res.linear.desc);
|
||||
desc.res.linear.format = getArrayFormat(resDesc.res.linear.desc);
|
||||
desc.res.linear.sizeInBytes = resDesc.res.linear.sizeInBytes;
|
||||
break;
|
||||
case HIP_RESOURCE_TYPE_PITCH2D:
|
||||
desc.res.pitch2D.devPtr = resDesc.res.pitch2D.devPtr;
|
||||
desc.res.pitch2D.numChannels = getNumChannels(resDesc.res.pitch2D.desc);
|
||||
desc.res.pitch2D.format = getArrayFormat(resDesc.res.pitch2D.desc);
|
||||
desc.res.pitch2D.width = resDesc.res.pitch2D.width;
|
||||
desc.res.pitch2D.height = resDesc.res.pitch2D.height;
|
||||
desc.res.pitch2D.pitchInBytes = resDesc.res.pitch2D.pitchInBytes;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
case HIP_RESOURCE_TYPE_ARRAY:
|
||||
desc.res.array.hArray = resDesc.res.array.array;
|
||||
break;
|
||||
case HIP_RESOURCE_TYPE_MIPMAPPED_ARRAY:
|
||||
desc.res.mipmap.hMipmappedArray = resDesc.res.mipmap.mipmap;
|
||||
break;
|
||||
case HIP_RESOURCE_TYPE_LINEAR:
|
||||
desc.res.linear.devPtr = resDesc.res.linear.devPtr;
|
||||
desc.res.linear.numChannels = getNumChannels(resDesc.res.linear.desc);
|
||||
desc.res.linear.format = getArrayFormat(resDesc.res.linear.desc);
|
||||
desc.res.linear.sizeInBytes = resDesc.res.linear.sizeInBytes;
|
||||
break;
|
||||
case HIP_RESOURCE_TYPE_PITCH2D:
|
||||
desc.res.pitch2D.devPtr = resDesc.res.pitch2D.devPtr;
|
||||
desc.res.pitch2D.numChannels = getNumChannels(resDesc.res.pitch2D.desc);
|
||||
desc.res.pitch2D.format = getArrayFormat(resDesc.res.pitch2D.desc);
|
||||
desc.res.pitch2D.width = resDesc.res.pitch2D.width;
|
||||
desc.res.pitch2D.height = resDesc.res.pitch2D.height;
|
||||
desc.res.pitch2D.pitchInBytes = resDesc.res.pitch2D.pitchInBytes;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return desc;
|
||||
}
|
||||
|
||||
inline
|
||||
hipTextureAddressMode getAddressMode(const HIPaddress_mode mode) {
|
||||
inline hipTextureAddressMode getAddressMode(const HIPaddress_mode mode) {
|
||||
// These two enums should be isomorphic.
|
||||
return static_cast<hipTextureAddressMode>(mode);
|
||||
}
|
||||
|
||||
inline
|
||||
HIPaddress_mode getAddressMode(const hipTextureAddressMode mode) {
|
||||
inline HIPaddress_mode getAddressMode(const hipTextureAddressMode mode) {
|
||||
// These two enums should be isomorphic.
|
||||
return static_cast<HIPaddress_mode>(mode);
|
||||
}
|
||||
|
||||
inline
|
||||
hipTextureFilterMode getFilterMode(const HIPfilter_mode mode) {
|
||||
inline hipTextureFilterMode getFilterMode(const HIPfilter_mode mode) {
|
||||
// These two enums should be isomorphic.
|
||||
return static_cast<hipTextureFilterMode>(mode);
|
||||
}
|
||||
|
||||
inline
|
||||
HIPfilter_mode getFilterMode(const hipTextureFilterMode mode) {
|
||||
inline HIPfilter_mode getFilterMode(const hipTextureFilterMode mode) {
|
||||
// These two enums should be isomorphic.
|
||||
return static_cast<HIPfilter_mode>(mode);
|
||||
}
|
||||
|
||||
inline
|
||||
hipTextureReadMode getReadMode(const unsigned int flags) {
|
||||
inline hipTextureReadMode getReadMode(const unsigned int flags) {
|
||||
if (flags & HIP_TRSF_READ_AS_INTEGER) {
|
||||
return hipReadModeElementType;
|
||||
} else {
|
||||
@@ -842,17 +809,15 @@ hipTextureReadMode getReadMode(const unsigned int flags) {
|
||||
}
|
||||
}
|
||||
|
||||
inline
|
||||
unsigned int getReadMode(const hipTextureReadMode mode) {
|
||||
if (mode == hipReadModeElementType) {
|
||||
inline unsigned int getReadMode(const hipTextureReadMode mode) {
|
||||
if (mode == hipReadModeElementType) {
|
||||
return HIP_TRSF_READ_AS_INTEGER;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
inline
|
||||
int getsRGB(const unsigned int flags) {
|
||||
inline int getsRGB(const unsigned int flags) {
|
||||
if (flags & HIP_TRSF_SRGB) {
|
||||
return 1;
|
||||
} else {
|
||||
@@ -860,8 +825,7 @@ int getsRGB(const unsigned int flags) {
|
||||
}
|
||||
}
|
||||
|
||||
inline
|
||||
unsigned int getsRGB(const int sRGB) {
|
||||
inline unsigned int getsRGB(const int sRGB) {
|
||||
if (sRGB == 1) {
|
||||
return HIP_TRSF_SRGB;
|
||||
} else {
|
||||
@@ -869,8 +833,7 @@ unsigned int getsRGB(const int sRGB) {
|
||||
}
|
||||
}
|
||||
|
||||
inline
|
||||
int getNormalizedCoords(const unsigned int flags) {
|
||||
inline int getNormalizedCoords(const unsigned int flags) {
|
||||
if (flags & HIP_TRSF_NORMALIZED_COORDINATES) {
|
||||
return 1;
|
||||
} else {
|
||||
@@ -878,8 +841,7 @@ int getNormalizedCoords(const unsigned int flags) {
|
||||
}
|
||||
}
|
||||
|
||||
inline
|
||||
unsigned int getNormalizedCoords(const int normalizedCoords) {
|
||||
inline unsigned int getNormalizedCoords(const int normalizedCoords) {
|
||||
if (normalizedCoords == 1) {
|
||||
return HIP_TRSF_NORMALIZED_COORDINATES;
|
||||
} else {
|
||||
@@ -887,8 +849,7 @@ unsigned int getNormalizedCoords(const int normalizedCoords) {
|
||||
}
|
||||
}
|
||||
|
||||
inline
|
||||
hipTextureDesc getTextureDesc(const HIP_TEXTURE_DESC& texDesc) {
|
||||
inline hipTextureDesc getTextureDesc(const HIP_TEXTURE_DESC& texDesc) {
|
||||
hipTextureDesc desc;
|
||||
|
||||
desc.addressMode[0] = getAddressMode(texDesc.addressMode[0]);
|
||||
@@ -908,8 +869,7 @@ hipTextureDesc getTextureDesc(const HIP_TEXTURE_DESC& texDesc) {
|
||||
return desc;
|
||||
}
|
||||
|
||||
inline
|
||||
HIP_TEXTURE_DESC getTextureDesc(const hipTextureDesc& texDesc) {
|
||||
inline HIP_TEXTURE_DESC getTextureDesc(const hipTextureDesc& texDesc) {
|
||||
HIP_TEXTURE_DESC desc;
|
||||
|
||||
desc.addressMode[0] = getAddressMode(texDesc.addressMode[0]);
|
||||
@@ -930,20 +890,17 @@ HIP_TEXTURE_DESC getTextureDesc(const hipTextureDesc& texDesc) {
|
||||
return desc;
|
||||
}
|
||||
|
||||
inline
|
||||
hipResourceViewFormat getResourceViewFormat(const HIPresourceViewFormat format) {
|
||||
inline hipResourceViewFormat getResourceViewFormat(const HIPresourceViewFormat format) {
|
||||
// These two enums should be isomorphic.
|
||||
return static_cast<hipResourceViewFormat>(format);
|
||||
}
|
||||
|
||||
inline
|
||||
HIPresourceViewFormat getResourceViewFormat(const hipResourceViewFormat format) {
|
||||
inline HIPresourceViewFormat getResourceViewFormat(const hipResourceViewFormat format) {
|
||||
// These two enums should be isomorphic.
|
||||
return static_cast<HIPresourceViewFormat>(format);
|
||||
}
|
||||
|
||||
inline
|
||||
hipResourceViewDesc getResourceViewDesc(const HIP_RESOURCE_VIEW_DESC& resViewDesc) {
|
||||
inline hipResourceViewDesc getResourceViewDesc(const HIP_RESOURCE_VIEW_DESC& resViewDesc) {
|
||||
hipResourceViewDesc desc;
|
||||
|
||||
desc.format = getResourceViewFormat(resViewDesc.format);
|
||||
@@ -958,8 +915,7 @@ hipResourceViewDesc getResourceViewDesc(const HIP_RESOURCE_VIEW_DESC& resViewDes
|
||||
return desc;
|
||||
}
|
||||
|
||||
inline
|
||||
HIP_RESOURCE_VIEW_DESC getResourceViewDesc(const hipResourceViewDesc& resViewDesc) {
|
||||
inline HIP_RESOURCE_VIEW_DESC getResourceViewDesc(const hipResourceViewDesc& resViewDesc) {
|
||||
HIP_RESOURCE_VIEW_DESC desc;
|
||||
|
||||
desc.format = getResourceViewFormat(resViewDesc.format);
|
||||
@@ -974,13 +930,11 @@ HIP_RESOURCE_VIEW_DESC getResourceViewDesc(const hipResourceViewDesc& resViewDes
|
||||
return desc;
|
||||
}
|
||||
|
||||
inline
|
||||
size_t getElementSize(const hipChannelFormatDesc &desc) {
|
||||
inline size_t getElementSize(const hipChannelFormatDesc& desc) {
|
||||
return (desc.x / 8) * getNumChannels(desc);
|
||||
}
|
||||
|
||||
inline
|
||||
hipMemcpy3DParms getMemcpy3DParms(const hipMemcpy3DBatchOp& desc) {
|
||||
inline hipMemcpy3DParms getMemcpy3DParms(const hipMemcpy3DBatchOp& desc) {
|
||||
hipMemcpy3DParms params;
|
||||
params.extent = desc.extent;
|
||||
params.kind = hipMemcpyDefault;
|
||||
@@ -1000,21 +954,13 @@ hipMemcpy3DParms getMemcpy3DParms(const hipMemcpy3DBatchOp& desc) {
|
||||
size_t spitch = (row ? row : desc.extent.width) * elementSize;
|
||||
size_t swidth = (row ? row : desc.extent.width);
|
||||
size_t sheight = (height ? height : desc.extent.height);
|
||||
params.srcPtr = make_hipPitchedPtr(
|
||||
desc.src.op.ptr.ptr,
|
||||
spitch,
|
||||
swidth,
|
||||
sheight
|
||||
);
|
||||
params.srcPos = make_hipPos(0,0,0);
|
||||
params.srcPtr = make_hipPitchedPtr(desc.src.op.ptr.ptr, spitch, swidth, sheight);
|
||||
params.srcPos = make_hipPos(0, 0, 0);
|
||||
params.srcArray = nullptr;
|
||||
} else if (desc.src.type == hipMemcpyOperandTypeArray) {
|
||||
params.srcArray = desc.src.op.array.array;
|
||||
params.srcPos = make_hipPos(
|
||||
desc.src.op.array.offset.x,
|
||||
desc.src.op.array.offset.y,
|
||||
desc.src.op.array.offset.z
|
||||
);
|
||||
params.srcPos = make_hipPos(desc.src.op.array.offset.x, desc.src.op.array.offset.y,
|
||||
desc.src.op.array.offset.z);
|
||||
params.srcPtr.ptr = nullptr;
|
||||
}
|
||||
// dest
|
||||
@@ -1024,28 +970,19 @@ hipMemcpy3DParms getMemcpy3DParms(const hipMemcpy3DBatchOp& desc) {
|
||||
size_t spitch = (row ? row : desc.extent.width) * elementSize;
|
||||
size_t swidth = (row ? row : desc.extent.width);
|
||||
size_t sheight = (height ? height : desc.extent.height);
|
||||
params.dstPtr = make_hipPitchedPtr(
|
||||
desc.dst.op.ptr.ptr,
|
||||
spitch,
|
||||
swidth,
|
||||
sheight
|
||||
);
|
||||
params.dstPos = make_hipPos(0,0,0);
|
||||
params.dstPtr = make_hipPitchedPtr(desc.dst.op.ptr.ptr, spitch, swidth, sheight);
|
||||
params.dstPos = make_hipPos(0, 0, 0);
|
||||
params.dstArray = nullptr;
|
||||
} else if (desc.dst.type == hipMemcpyOperandTypeArray) {
|
||||
params.dstArray = desc.dst.op.array.array;
|
||||
params.dstPos = make_hipPos(
|
||||
desc.dst.op.array.offset.x,
|
||||
desc.dst.op.array.offset.y,
|
||||
desc.dst.op.array.offset.z
|
||||
);
|
||||
params.dstPos = make_hipPos(desc.dst.op.array.offset.x, desc.dst.op.array.offset.y,
|
||||
desc.dst.op.array.offset.z);
|
||||
params.dstPtr.ptr = nullptr;
|
||||
}
|
||||
return params;
|
||||
}
|
||||
|
||||
inline
|
||||
hipMemcpy3DParms getMemcpy3DParms(const hipMemcpy3DPeerParms& desc) {
|
||||
inline hipMemcpy3DParms getMemcpy3DParms(const hipMemcpy3DPeerParms& desc) {
|
||||
hipMemcpy3DParms params;
|
||||
params.srcArray = desc.srcArray;
|
||||
params.srcPos = desc.srcPos;
|
||||
@@ -1057,4 +994,4 @@ hipMemcpy3DParms getMemcpy3DParms(const hipMemcpy3DPeerParms& desc) {
|
||||
params.kind = hipMemcpyDeviceToDevice;
|
||||
return params;
|
||||
}
|
||||
};
|
||||
}; // namespace hip
|
||||
@@ -166,10 +166,9 @@ void Device::WaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stre
|
||||
amd::Command::EventWaitList eventWaitList(0);
|
||||
bool submitMarker = 0;
|
||||
|
||||
auto waitForStream = [&submitMarker,
|
||||
&eventWaitList](hip::Stream* stream) {
|
||||
if (amd::Command *command = stream->getLastQueuedCommand(true)) {
|
||||
amd::Event &event = command->event();
|
||||
auto waitForStream = [&submitMarker, &eventWaitList](hip::Stream* stream) {
|
||||
if (amd::Command* command = stream->getLastQueuedCommand(true)) {
|
||||
amd::Event& event = command->event();
|
||||
// Check HW status of the ROCcrl event.
|
||||
// Note: not all ROCclr modes support HW status
|
||||
bool ready = stream->device().IsHwEventReady(event);
|
||||
@@ -196,10 +195,10 @@ void Device::WaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stre
|
||||
auto activeQueues = blocking_stream->device().getActiveQueues();
|
||||
for (const auto& command : activeQueues) {
|
||||
hip::Stream* active_stream = static_cast<hip::Stream*>(command);
|
||||
if (// Make sure it's a default stream
|
||||
((active_stream->Flags() & hipStreamNonBlocking) == 0) &&
|
||||
// and it's not the current stream
|
||||
(active_stream != blocking_stream)) {
|
||||
if ( // Make sure it's a default stream
|
||||
((active_stream->Flags() & hipStreamNonBlocking) == 0) &&
|
||||
// and it's not the current stream
|
||||
(active_stream != blocking_stream)) {
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_WAIT, "Waiting on active stream %p", active_stream);
|
||||
// Get the last valid command
|
||||
waitForStream(active_stream);
|
||||
@@ -230,13 +229,13 @@ void Device::AddStream(Stream* stream) {
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void Device::RemoveStream(Stream* stream){
|
||||
void Device::RemoveStream(Stream* stream) {
|
||||
std::unique_lock lock(streamSetLock);
|
||||
streamSet.erase(stream);
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool Device::StreamExists(Stream* stream){
|
||||
bool Device::StreamExists(Stream* stream) {
|
||||
std::shared_lock lock(streamSetLock);
|
||||
if (streamSet.find(stream) != streamSet.end()) {
|
||||
return true;
|
||||
@@ -250,7 +249,7 @@ void Device::destroyAllStreams() {
|
||||
{
|
||||
std::shared_lock lock(streamSetLock);
|
||||
for (auto& it : streamSet) {
|
||||
if (it->Null() == false ) {
|
||||
if (it->Null() == false) {
|
||||
toBeDeleted.push_back(it);
|
||||
}
|
||||
}
|
||||
@@ -300,7 +299,8 @@ void Device::SyncAllStreams(bool cpu_wait, bool wait_blocking_streams_only) {
|
||||
bool Device::StreamCaptureBlocking() {
|
||||
std::shared_lock lock(streamSetLock);
|
||||
for (auto& it : streamSet) {
|
||||
if (it->GetCaptureStatus() == hipStreamCaptureStatusActive && it->Flags() != hipStreamNonBlocking) {
|
||||
if (it->GetCaptureStatus() == hipStreamCaptureStatusActive &&
|
||||
it->Flags() != hipStreamNonBlocking) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@@ -536,8 +536,7 @@ hipError_t ihipGetDeviceProperties(hipDeviceProp_tR0600* props, int device) {
|
||||
deviceProps.cooperativeMultiDeviceUnmatchedBlockDim = info.cooperativeMultiDeviceGroups_;
|
||||
deviceProps.cooperativeMultiDeviceUnmatchedSharedMem = info.cooperativeMultiDeviceGroups_;
|
||||
|
||||
deviceProps.maxTexture1DLinear =
|
||||
std::min(pixel_size_max * info.imageMaxBufferSize_, int32_max);
|
||||
deviceProps.maxTexture1DLinear = std::min(pixel_size_max * info.imageMaxBufferSize_, int32_max);
|
||||
deviceProps.maxTexture1DMipmap = std::min(16 * info.imageMaxBufferSize_, int32_max);
|
||||
deviceProps.maxTexture1D = deviceProps.maxSurface1D = std::min(info.image1DMaxWidth_, int32_max);
|
||||
deviceProps.maxTexture2D[0] = deviceProps.maxSurface2D[0] =
|
||||
@@ -771,22 +770,22 @@ hipError_t hipGetProcAddress(const char* symbol, void** pfn, int hipVersion, uin
|
||||
HIP_INIT_API(hipGetProcAddress, symbol, pfn, hipVersion, flags, symbolStatus);
|
||||
|
||||
std::string symbolString = symbol;
|
||||
if(symbol == nullptr || symbolString == "" || pfn == nullptr){
|
||||
if (symbol == nullptr || symbolString == "" || pfn == nullptr) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
if (symbolString == "hipGetDeviceProperties"){
|
||||
if (hipVersion >= 600){
|
||||
if (symbolString == "hipGetDeviceProperties") {
|
||||
if (hipVersion >= 600) {
|
||||
symbolString = "hipGetDevicePropertiesR0600";
|
||||
}
|
||||
} else if (symbolString == "hipChooseDevice") {
|
||||
if (hipVersion >= 600){
|
||||
if (hipVersion >= 600) {
|
||||
symbolString = "hipChooseDeviceR0600";
|
||||
}
|
||||
}
|
||||
|
||||
void* handle = hip::PlatformState::instance().getDynamicLibraryHandle();
|
||||
if (handle == nullptr){
|
||||
if (handle == nullptr) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
|
||||
@@ -47,10 +47,9 @@ hipError_t ihipChooseDevice(int* device, const DeviceProp* properties) {
|
||||
cl_uint matchedCount = 0;
|
||||
hipError_t err = hipSuccess;
|
||||
|
||||
if constexpr (std::is_same_v<DeviceProp, hipDeviceProp_tR0600>){
|
||||
if constexpr (std::is_same_v<DeviceProp, hipDeviceProp_tR0600>) {
|
||||
err = ihipGetDeviceProperties(¤tProp, i);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
err = hip::hipGetDevicePropertiesR0000(¤tProp, i);
|
||||
}
|
||||
|
||||
@@ -448,13 +447,13 @@ hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device)
|
||||
break;
|
||||
case hipDeviceAttributeAccessPolicyMaxWindowSize:
|
||||
*pi = prop.accessPolicyMaxWindowSize;
|
||||
break;
|
||||
break;
|
||||
case hipDeviceAttributeNumberOfXccs:
|
||||
*pi = static_cast<int>(g_devices[device]->devices()[0]->info().numberOfXccs_);
|
||||
break;
|
||||
break;
|
||||
case hipDeviceAttributeMaxAvailableVgprsPerThread:
|
||||
*pi = static_cast<int>(g_devices[device]->devices()[0]->info().availableVGPRs_);
|
||||
break;
|
||||
break;
|
||||
default:
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
@@ -533,7 +532,8 @@ hipError_t hipDeviceGetLimit(size_t* pValue, hipLimit_t limit) {
|
||||
*pValue = hip::getCurrentDevice()->devices()[0]->info().scratchLimitMin;
|
||||
break;
|
||||
case hipExtLimitScratchMax:
|
||||
*pValue = hip::getCurrentDevice()->devices()[0]->info().scratchLimitMax;;
|
||||
*pValue = hip::getCurrentDevice()->devices()[0]->info().scratchLimitMax;
|
||||
;
|
||||
break;
|
||||
case hipExtLimitScratchCurrent:
|
||||
*pValue = hip::getCurrentDevice()->devices()[0]->ScratchLimitCurrent();
|
||||
@@ -563,11 +563,8 @@ hipError_t hipDeviceGetPCIBusId(char* pciBusId, int len, int device) {
|
||||
hipDeviceProp_tR0600 prop;
|
||||
HIP_RETURN_ONFAIL(ihipGetDeviceProperties(&prop, device));
|
||||
auto* deviceHandle = g_devices[device]->devices()[0];
|
||||
snprintf (pciBusId, len, "%04x:%02x:%02x.%01x",
|
||||
prop.pciDomainID,
|
||||
prop.pciBusID,
|
||||
prop.pciDeviceID,
|
||||
deviceHandle->info().deviceTopology_.pcie.function);
|
||||
snprintf(pciBusId, len, "%04x:%02x:%02x.%01x", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID,
|
||||
deviceHandle->info().deviceTopology_.pcie.function);
|
||||
|
||||
HIP_RETURN(len <= 12 ? hipErrorInvalidValue : hipSuccess);
|
||||
}
|
||||
@@ -653,7 +650,7 @@ hipError_t hipDeviceSetSharedMemConfig(hipSharedMemConfig config) {
|
||||
}
|
||||
|
||||
hipError_t hipDeviceGetTexture1DLinearMaxWidth(size_t* maxWidthInElements,
|
||||
const hipChannelFormatDesc* fmtDesc, int device) {
|
||||
const hipChannelFormatDesc* fmtDesc, int device) {
|
||||
HIP_INIT_API(hipDeviceGetTexture1DLinearMaxWidth, maxWidthInElements, fmtDesc, device);
|
||||
if (maxWidthInElements == nullptr || fmtDesc == nullptr) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
@@ -661,8 +658,8 @@ hipError_t hipDeviceGetTexture1DLinearMaxWidth(size_t* maxWidthInElements,
|
||||
hipDeviceProp_tR0600 prop = {0};
|
||||
HIP_RETURN_ONFAIL(ihipGetDeviceProperties(&prop, device));
|
||||
// Calculate element size according to fmtDesc
|
||||
size_t elementSize = (fmtDesc->x + fmtDesc->y
|
||||
+ fmtDesc->z + fmtDesc->w) / 8; // Convert from bits to bytes
|
||||
size_t elementSize =
|
||||
(fmtDesc->x + fmtDesc->y + fmtDesc->z + fmtDesc->w) / 8; // Convert from bits to bytes
|
||||
if (elementSize == 0) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
@@ -717,15 +714,16 @@ hipError_t hipGetDeviceFlags(unsigned int* flags) {
|
||||
HIP_RETURN(hipSuccess);
|
||||
}
|
||||
|
||||
hipError_t hipGetDriverEntryPoint_common(const char* symbol, void** funcPtr, unsigned long long flags,
|
||||
hipDriverEntryPointQueryResult* status) {
|
||||
hipError_t hipGetDriverEntryPoint_common(const char* symbol, void** funcPtr,
|
||||
unsigned long long flags,
|
||||
hipDriverEntryPointQueryResult* status) {
|
||||
std::string symbolString = symbol;
|
||||
if (symbol == nullptr || symbolString == "" || funcPtr == nullptr) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
if (flags != hipEnableDefault && flags != hipEnableLegacyStream
|
||||
&& flags != hipEnablePerThreadDefaultStream) {
|
||||
if (flags != hipEnableDefault && flags != hipEnableLegacyStream &&
|
||||
flags != hipEnablePerThreadDefaultStream) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
@@ -735,7 +733,7 @@ hipError_t hipGetDriverEntryPoint_common(const char* symbol, void** funcPtr, uns
|
||||
}
|
||||
|
||||
if (flags == hipEnablePerThreadDefaultStream) {
|
||||
symbolString += "_spt";
|
||||
symbolString += "_spt";
|
||||
}
|
||||
|
||||
*funcPtr = amd::Os::getSymbol(handle, symbolString.c_str());
|
||||
@@ -866,7 +864,7 @@ hipError_t hipSetValidDevices(int* device_arr, int len) {
|
||||
amd::Os::setPreferredNumaNode(preferredNumaNode);
|
||||
HIP_RETURN(hipSuccess);
|
||||
}
|
||||
} //namespace hip
|
||||
} // namespace hip
|
||||
|
||||
extern "C" hipError_t hipChooseDevice(int* device, const hipDeviceProp_tR0000* properties) {
|
||||
return hip::hipChooseDeviceR0000(device, properties);
|
||||
|
||||
@@ -23,370 +23,359 @@
|
||||
#include "hip_internal.hpp"
|
||||
|
||||
namespace hip {
|
||||
hipError_t hipExtGetLastError()
|
||||
{
|
||||
hipError_t hipExtGetLastError() {
|
||||
HIP_INIT_API(hipExtGetLastError);
|
||||
hipError_t err = hip::tls.last_command_error_;
|
||||
hip::tls.last_command_error_ = hipSuccess;
|
||||
return err;
|
||||
}
|
||||
|
||||
hipError_t hipGetLastError()
|
||||
{
|
||||
hipError_t hipGetLastError() {
|
||||
HIP_INIT_API(hipGetLastError);
|
||||
hipError_t err = hip::tls.last_error_;
|
||||
hip::tls.last_error_ = hipSuccess;
|
||||
return err;
|
||||
}
|
||||
|
||||
hipError_t hipPeekAtLastError()
|
||||
{
|
||||
hipError_t hipPeekAtLastError() {
|
||||
HIP_INIT_API(hipPeekAtLastError);
|
||||
hipError_t err = hip::tls.last_error_;
|
||||
HIP_RETURN(err);
|
||||
}
|
||||
|
||||
const char *ihipGetErrorName(hipError_t hip_error)
|
||||
{
|
||||
const char* ihipGetErrorName(hipError_t hip_error) {
|
||||
switch (hip_error) {
|
||||
case hipSuccess:
|
||||
return "hipSuccess";
|
||||
return "hipSuccess";
|
||||
case hipErrorInvalidValue:
|
||||
return "hipErrorInvalidValue";
|
||||
return "hipErrorInvalidValue";
|
||||
case hipErrorOutOfMemory:
|
||||
return "hipErrorOutOfMemory";
|
||||
return "hipErrorOutOfMemory";
|
||||
case hipErrorNotInitialized:
|
||||
return "hipErrorNotInitialized";
|
||||
return "hipErrorNotInitialized";
|
||||
case hipErrorDeinitialized:
|
||||
return "hipErrorDeinitialized";
|
||||
return "hipErrorDeinitialized";
|
||||
case hipErrorProfilerDisabled:
|
||||
return "hipErrorProfilerDisabled";
|
||||
return "hipErrorProfilerDisabled";
|
||||
case hipErrorProfilerNotInitialized:
|
||||
return "hipErrorProfilerNotInitialized";
|
||||
return "hipErrorProfilerNotInitialized";
|
||||
case hipErrorProfilerAlreadyStarted:
|
||||
return "hipErrorProfilerAlreadyStarted";
|
||||
return "hipErrorProfilerAlreadyStarted";
|
||||
case hipErrorProfilerAlreadyStopped:
|
||||
return "hipErrorProfilerAlreadyStopped";
|
||||
return "hipErrorProfilerAlreadyStopped";
|
||||
case hipErrorInvalidConfiguration:
|
||||
return "hipErrorInvalidConfiguration";
|
||||
return "hipErrorInvalidConfiguration";
|
||||
case hipErrorInvalidSymbol:
|
||||
return "hipErrorInvalidSymbol";
|
||||
return "hipErrorInvalidSymbol";
|
||||
case hipErrorInvalidDevicePointer:
|
||||
return "hipErrorInvalidDevicePointer";
|
||||
return "hipErrorInvalidDevicePointer";
|
||||
case hipErrorInvalidMemcpyDirection:
|
||||
return "hipErrorInvalidMemcpyDirection";
|
||||
return "hipErrorInvalidMemcpyDirection";
|
||||
case hipErrorInsufficientDriver:
|
||||
return "hipErrorInsufficientDriver";
|
||||
return "hipErrorInsufficientDriver";
|
||||
case hipErrorMissingConfiguration:
|
||||
return "hipErrorMissingConfiguration";
|
||||
return "hipErrorMissingConfiguration";
|
||||
case hipErrorPriorLaunchFailure:
|
||||
return "hipErrorPriorLaunchFailure";
|
||||
return "hipErrorPriorLaunchFailure";
|
||||
case hipErrorInvalidDeviceFunction:
|
||||
return "hipErrorInvalidDeviceFunction";
|
||||
return "hipErrorInvalidDeviceFunction";
|
||||
case hipErrorNoDevice:
|
||||
return "hipErrorNoDevice";
|
||||
return "hipErrorNoDevice";
|
||||
case hipErrorInvalidDevice:
|
||||
return "hipErrorInvalidDevice";
|
||||
return "hipErrorInvalidDevice";
|
||||
case hipErrorInvalidPitchValue:
|
||||
return "hipErrorInvalidPitchValue";
|
||||
return "hipErrorInvalidPitchValue";
|
||||
case hipErrorInvalidImage:
|
||||
return "hipErrorInvalidImage";
|
||||
return "hipErrorInvalidImage";
|
||||
case hipErrorInvalidContext:
|
||||
return "hipErrorInvalidContext";
|
||||
return "hipErrorInvalidContext";
|
||||
case hipErrorContextAlreadyCurrent:
|
||||
return "hipErrorContextAlreadyCurrent";
|
||||
return "hipErrorContextAlreadyCurrent";
|
||||
case hipErrorMapFailed:
|
||||
return "hipErrorMapFailed";
|
||||
return "hipErrorMapFailed";
|
||||
case hipErrorUnmapFailed:
|
||||
return "hipErrorUnmapFailed";
|
||||
return "hipErrorUnmapFailed";
|
||||
case hipErrorArrayIsMapped:
|
||||
return "hipErrorArrayIsMapped";
|
||||
return "hipErrorArrayIsMapped";
|
||||
case hipErrorAlreadyMapped:
|
||||
return "hipErrorAlreadyMapped";
|
||||
return "hipErrorAlreadyMapped";
|
||||
case hipErrorNoBinaryForGpu:
|
||||
return "hipErrorNoBinaryForGpu";
|
||||
return "hipErrorNoBinaryForGpu";
|
||||
case hipErrorAlreadyAcquired:
|
||||
return "hipErrorAlreadyAcquired";
|
||||
return "hipErrorAlreadyAcquired";
|
||||
case hipErrorNotMapped:
|
||||
return "hipErrorNotMapped";
|
||||
return "hipErrorNotMapped";
|
||||
case hipErrorNotMappedAsArray:
|
||||
return "hipErrorNotMappedAsArray";
|
||||
return "hipErrorNotMappedAsArray";
|
||||
case hipErrorNotMappedAsPointer:
|
||||
return "hipErrorNotMappedAsPointer";
|
||||
return "hipErrorNotMappedAsPointer";
|
||||
case hipErrorECCNotCorrectable:
|
||||
return "hipErrorECCNotCorrectable";
|
||||
return "hipErrorECCNotCorrectable";
|
||||
case hipErrorUnsupportedLimit:
|
||||
return "hipErrorUnsupportedLimit";
|
||||
return "hipErrorUnsupportedLimit";
|
||||
case hipErrorContextAlreadyInUse:
|
||||
return "hipErrorContextAlreadyInUse";
|
||||
return "hipErrorContextAlreadyInUse";
|
||||
case hipErrorPeerAccessUnsupported:
|
||||
return "hipErrorPeerAccessUnsupported";
|
||||
return "hipErrorPeerAccessUnsupported";
|
||||
case hipErrorInvalidKernelFile:
|
||||
return "hipErrorInvalidKernelFile";
|
||||
return "hipErrorInvalidKernelFile";
|
||||
case hipErrorInvalidGraphicsContext:
|
||||
return "hipErrorInvalidGraphicsContext";
|
||||
return "hipErrorInvalidGraphicsContext";
|
||||
case hipErrorInvalidSource:
|
||||
return "hipErrorInvalidSource";
|
||||
return "hipErrorInvalidSource";
|
||||
case hipErrorFileNotFound:
|
||||
return "hipErrorFileNotFound";
|
||||
return "hipErrorFileNotFound";
|
||||
case hipErrorSharedObjectSymbolNotFound:
|
||||
return "hipErrorSharedObjectSymbolNotFound";
|
||||
return "hipErrorSharedObjectSymbolNotFound";
|
||||
case hipErrorSharedObjectInitFailed:
|
||||
return "hipErrorSharedObjectInitFailed";
|
||||
return "hipErrorSharedObjectInitFailed";
|
||||
case hipErrorOperatingSystem:
|
||||
return "hipErrorOperatingSystem";
|
||||
return "hipErrorOperatingSystem";
|
||||
case hipErrorInvalidHandle:
|
||||
return "hipErrorInvalidHandle";
|
||||
return "hipErrorInvalidHandle";
|
||||
case hipErrorIllegalState:
|
||||
return "hipErrorIllegalState";
|
||||
return "hipErrorIllegalState";
|
||||
case hipErrorNotFound:
|
||||
return "hipErrorNotFound";
|
||||
return "hipErrorNotFound";
|
||||
case hipErrorNotReady:
|
||||
return "hipErrorNotReady";
|
||||
return "hipErrorNotReady";
|
||||
case hipErrorIllegalAddress:
|
||||
return "hipErrorIllegalAddress";
|
||||
return "hipErrorIllegalAddress";
|
||||
case hipErrorLaunchOutOfResources:
|
||||
return "hipErrorLaunchOutOfResources";
|
||||
return "hipErrorLaunchOutOfResources";
|
||||
case hipErrorLaunchTimeOut:
|
||||
return "hipErrorLaunchTimeOut";
|
||||
return "hipErrorLaunchTimeOut";
|
||||
case hipErrorPeerAccessAlreadyEnabled:
|
||||
return "hipErrorPeerAccessAlreadyEnabled";
|
||||
return "hipErrorPeerAccessAlreadyEnabled";
|
||||
case hipErrorPeerAccessNotEnabled:
|
||||
return "hipErrorPeerAccessNotEnabled";
|
||||
return "hipErrorPeerAccessNotEnabled";
|
||||
case hipErrorSetOnActiveProcess:
|
||||
return "hipErrorSetOnActiveProcess";
|
||||
return "hipErrorSetOnActiveProcess";
|
||||
case hipErrorContextIsDestroyed:
|
||||
return "hipErrorContextIsDestroyed";
|
||||
return "hipErrorContextIsDestroyed";
|
||||
case hipErrorAssert:
|
||||
return "hipErrorAssert";
|
||||
return "hipErrorAssert";
|
||||
case hipErrorHostMemoryAlreadyRegistered:
|
||||
return "hipErrorHostMemoryAlreadyRegistered";
|
||||
return "hipErrorHostMemoryAlreadyRegistered";
|
||||
case hipErrorHostMemoryNotRegistered:
|
||||
return "hipErrorHostMemoryNotRegistered";
|
||||
return "hipErrorHostMemoryNotRegistered";
|
||||
case hipErrorLaunchFailure:
|
||||
return "hipErrorLaunchFailure";
|
||||
return "hipErrorLaunchFailure";
|
||||
case hipErrorNotSupported:
|
||||
return "hipErrorNotSupported";
|
||||
return "hipErrorNotSupported";
|
||||
case hipErrorUnknown:
|
||||
return "hipErrorUnknown";
|
||||
return "hipErrorUnknown";
|
||||
case hipErrorRuntimeMemory:
|
||||
return "hipErrorRuntimeMemory";
|
||||
return "hipErrorRuntimeMemory";
|
||||
case hipErrorRuntimeOther:
|
||||
return "hipErrorRuntimeOther";
|
||||
return "hipErrorRuntimeOther";
|
||||
case hipErrorCooperativeLaunchTooLarge:
|
||||
return "hipErrorCooperativeLaunchTooLarge";
|
||||
return "hipErrorCooperativeLaunchTooLarge";
|
||||
case hipErrorStreamCaptureUnsupported:
|
||||
return "hipErrorStreamCaptureUnsupported";
|
||||
return "hipErrorStreamCaptureUnsupported";
|
||||
case hipErrorStreamCaptureInvalidated:
|
||||
return "hipErrorStreamCaptureInvalidated";
|
||||
return "hipErrorStreamCaptureInvalidated";
|
||||
case hipErrorStreamCaptureMerge:
|
||||
return "hipErrorStreamCaptureMerge";
|
||||
return "hipErrorStreamCaptureMerge";
|
||||
case hipErrorStreamCaptureUnmatched:
|
||||
return "hipErrorStreamCaptureUnmatched";
|
||||
return "hipErrorStreamCaptureUnmatched";
|
||||
case hipErrorStreamCaptureUnjoined:
|
||||
return "hipErrorStreamCaptureUnjoined";
|
||||
return "hipErrorStreamCaptureUnjoined";
|
||||
case hipErrorStreamCaptureIsolation:
|
||||
return "hipErrorStreamCaptureIsolation";
|
||||
return "hipErrorStreamCaptureIsolation";
|
||||
case hipErrorStreamCaptureImplicit:
|
||||
return "hipErrorStreamCaptureImplicit";
|
||||
return "hipErrorStreamCaptureImplicit";
|
||||
case hipErrorCapturedEvent:
|
||||
return "hipErrorCapturedEvent";
|
||||
return "hipErrorCapturedEvent";
|
||||
case hipErrorStreamCaptureWrongThread:
|
||||
return "hipErrorStreamCaptureWrongThread";
|
||||
return "hipErrorStreamCaptureWrongThread";
|
||||
case hipErrorGraphExecUpdateFailure:
|
||||
return "hipErrorGraphExecUpdateFailure";
|
||||
return "hipErrorGraphExecUpdateFailure";
|
||||
case hipErrorTbd:
|
||||
return "hipErrorTbd";
|
||||
return "hipErrorTbd";
|
||||
default:
|
||||
return "hipErrorUnknown";
|
||||
};
|
||||
return "hipErrorUnknown";
|
||||
};
|
||||
}
|
||||
|
||||
const char *ihipGetErrorString(hipError_t hip_error) {
|
||||
switch(hip_error) {
|
||||
case hipSuccess:
|
||||
return "no error";
|
||||
case hipErrorInvalidValue:
|
||||
return "invalid argument";
|
||||
case hipErrorOutOfMemory:
|
||||
return "out of memory";
|
||||
case hipErrorNotInitialized:
|
||||
return "initialization error";
|
||||
case hipErrorDeinitialized:
|
||||
return "driver shutting down";
|
||||
case hipErrorProfilerDisabled:
|
||||
return "profiler disabled while using external profiling tool";
|
||||
case hipErrorProfilerNotInitialized:
|
||||
return "profiler is not initialized";
|
||||
case hipErrorProfilerAlreadyStarted:
|
||||
return "profiler already started";
|
||||
case hipErrorProfilerAlreadyStopped:
|
||||
return "profiler already stopped";
|
||||
case hipErrorInvalidConfiguration:
|
||||
return "invalid configuration argument";
|
||||
case hipErrorInvalidPitchValue:
|
||||
return "invalid pitch argument";
|
||||
case hipErrorInvalidSymbol:
|
||||
return "invalid device symbol";
|
||||
case hipErrorInvalidDevicePointer:
|
||||
return "invalid device pointer";
|
||||
case hipErrorInvalidMemcpyDirection:
|
||||
return "invalid copy direction for memcpy";
|
||||
case hipErrorInsufficientDriver:
|
||||
return "driver version is insufficient for runtime version";
|
||||
case hipErrorMissingConfiguration:
|
||||
return "__global__ function call is not configured";
|
||||
case hipErrorPriorLaunchFailure:
|
||||
return "unspecified launch failure in prior launch";
|
||||
case hipErrorInvalidDeviceFunction:
|
||||
return "invalid device function";
|
||||
case hipErrorNoDevice:
|
||||
return "no ROCm-capable device is detected";
|
||||
case hipErrorInvalidDevice:
|
||||
return "invalid device ordinal";
|
||||
case hipErrorInvalidImage:
|
||||
return "device kernel image is invalid";
|
||||
case hipErrorInvalidContext:
|
||||
return "invalid device context";
|
||||
case hipErrorContextAlreadyCurrent:
|
||||
return "context is already current context";
|
||||
case hipErrorMapFailed:
|
||||
return "mapping of buffer object failed";
|
||||
case hipErrorUnmapFailed:
|
||||
return "unmapping of buffer object failed";
|
||||
case hipErrorArrayIsMapped:
|
||||
return "array is mapped";
|
||||
case hipErrorAlreadyMapped:
|
||||
return "resource already mapped";
|
||||
case hipErrorNoBinaryForGpu:
|
||||
return "no kernel image is available for execution on the device";
|
||||
case hipErrorAlreadyAcquired:
|
||||
return "resource already acquired";
|
||||
case hipErrorNotMapped:
|
||||
return "resource not mapped";
|
||||
case hipErrorNotMappedAsArray:
|
||||
return "resource not mapped as array";
|
||||
case hipErrorNotMappedAsPointer:
|
||||
return "resource not mapped as pointer";
|
||||
case hipErrorECCNotCorrectable:
|
||||
return "uncorrectable ECC error encountered";
|
||||
case hipErrorUnsupportedLimit:
|
||||
return "limit is not supported on this architecture";
|
||||
case hipErrorContextAlreadyInUse:
|
||||
return "exclusive-thread device already in use by a different thread";
|
||||
case hipErrorPeerAccessUnsupported:
|
||||
return "peer access is not supported between these two devices";
|
||||
case hipErrorInvalidKernelFile:
|
||||
return "invalid kernel file";
|
||||
case hipErrorInvalidGraphicsContext:
|
||||
return "invalid OpenGL or DirectX context";
|
||||
case hipErrorInvalidSource:
|
||||
return "device kernel image is invalid";
|
||||
case hipErrorFileNotFound:
|
||||
return "file not found";
|
||||
case hipErrorSharedObjectSymbolNotFound:
|
||||
return "shared object symbol not found";
|
||||
case hipErrorSharedObjectInitFailed:
|
||||
return "shared object initialization failed";
|
||||
case hipErrorOperatingSystem:
|
||||
return "OS call failed or operation not supported on this OS";
|
||||
case hipErrorInvalidHandle:
|
||||
return "invalid resource handle";
|
||||
case hipErrorIllegalState:
|
||||
return "the operation cannot be performed in the present state";
|
||||
case hipErrorNotFound:
|
||||
return "named symbol not found";
|
||||
case hipErrorNotReady:
|
||||
return "device not ready";
|
||||
case hipErrorIllegalAddress:
|
||||
return "an illegal memory access was encountered";
|
||||
case hipErrorLaunchOutOfResources:
|
||||
return "too many resources requested for launch";
|
||||
case hipErrorLaunchTimeOut:
|
||||
return "the launch timed out and was terminated";
|
||||
case hipErrorPeerAccessAlreadyEnabled:
|
||||
return "peer access is already enabled";
|
||||
case hipErrorPeerAccessNotEnabled:
|
||||
return "peer access has not been enabled";
|
||||
case hipErrorSetOnActiveProcess:
|
||||
return "cannot set while device is active in this process";
|
||||
case hipErrorContextIsDestroyed:
|
||||
return "context is destroyed";
|
||||
case hipErrorAssert:
|
||||
return "device-side assert triggered";
|
||||
case hipErrorHostMemoryAlreadyRegistered:
|
||||
return "part or all of the requested memory range is already mapped";
|
||||
case hipErrorHostMemoryNotRegistered:
|
||||
return "pointer does not correspond to a registered memory region";
|
||||
case hipErrorLaunchFailure:
|
||||
return "unspecified launch failure";
|
||||
case hipErrorCooperativeLaunchTooLarge:
|
||||
return "too many blocks in cooperative launch";
|
||||
case hipErrorNotSupported:
|
||||
return "operation not supported";
|
||||
case hipErrorStreamCaptureUnsupported:
|
||||
return "operation not permitted when stream is capturing";
|
||||
case hipErrorStreamCaptureInvalidated:
|
||||
return "operation failed due to a previous error during capture";
|
||||
case hipErrorStreamCaptureMerge:
|
||||
return "operation would result in a merge of separate capture sequences";
|
||||
case hipErrorStreamCaptureUnmatched:
|
||||
return "capture was not ended in the same stream as it began";
|
||||
case hipErrorStreamCaptureUnjoined:
|
||||
return "capturing stream has unjoined work";
|
||||
case hipErrorStreamCaptureIsolation:
|
||||
return "dependency created on uncaptured work in another stream";
|
||||
case hipErrorStreamCaptureImplicit:
|
||||
return "operation would make the legacy stream depend on a capturing blocking stream";
|
||||
case hipErrorCapturedEvent:
|
||||
return "operation not permitted on an event last recorded in a capturing stream";
|
||||
case hipErrorStreamCaptureWrongThread:
|
||||
return "attempt to terminate a thread-local capture sequence from another thread";
|
||||
case hipErrorGraphExecUpdateFailure:
|
||||
return "the graph update was not performed because it included changes which violated constraints specific to instantiated graph update";
|
||||
case hipErrorRuntimeMemory:
|
||||
return "runtime memory call returned error";
|
||||
case hipErrorRuntimeOther:
|
||||
return "runtime call other than memory returned error";
|
||||
case hipErrorUnknown:
|
||||
default:
|
||||
return "unknown error";
|
||||
}
|
||||
// Translates a HIP error code into its human-readable description.
//
// hip_error: the error code to describe.
// Returns a pointer to a string literal. Codes without a dedicated description, including
// hipErrorUnknown itself, yield "unknown error".
const char* ihipGetErrorString(hipError_t hip_error) {
  // Start from the fallback text; a matching case below overrides it.
  const char* text = "unknown error";
  switch (hip_error) {
    case hipSuccess:
      text = "no error";
      break;
    case hipErrorInvalidValue:
      text = "invalid argument";
      break;
    case hipErrorOutOfMemory:
      text = "out of memory";
      break;
    case hipErrorNotInitialized:
      text = "initialization error";
      break;
    case hipErrorDeinitialized:
      text = "driver shutting down";
      break;
    case hipErrorProfilerDisabled:
      text = "profiler disabled while using external profiling tool";
      break;
    case hipErrorProfilerNotInitialized:
      text = "profiler is not initialized";
      break;
    case hipErrorProfilerAlreadyStarted:
      text = "profiler already started";
      break;
    case hipErrorProfilerAlreadyStopped:
      text = "profiler already stopped";
      break;
    case hipErrorInvalidConfiguration:
      text = "invalid configuration argument";
      break;
    case hipErrorInvalidPitchValue:
      text = "invalid pitch argument";
      break;
    case hipErrorInvalidSymbol:
      text = "invalid device symbol";
      break;
    case hipErrorInvalidDevicePointer:
      text = "invalid device pointer";
      break;
    case hipErrorInvalidMemcpyDirection:
      text = "invalid copy direction for memcpy";
      break;
    case hipErrorInsufficientDriver:
      text = "driver version is insufficient for runtime version";
      break;
    case hipErrorMissingConfiguration:
      text = "__global__ function call is not configured";
      break;
    case hipErrorPriorLaunchFailure:
      text = "unspecified launch failure in prior launch";
      break;
    case hipErrorInvalidDeviceFunction:
      text = "invalid device function";
      break;
    case hipErrorNoDevice:
      text = "no ROCm-capable device is detected";
      break;
    case hipErrorInvalidDevice:
      text = "invalid device ordinal";
      break;
    case hipErrorInvalidImage:
      text = "device kernel image is invalid";
      break;
    case hipErrorInvalidContext:
      text = "invalid device context";
      break;
    case hipErrorContextAlreadyCurrent:
      text = "context is already current context";
      break;
    case hipErrorMapFailed:
      text = "mapping of buffer object failed";
      break;
    case hipErrorUnmapFailed:
      text = "unmapping of buffer object failed";
      break;
    case hipErrorArrayIsMapped:
      text = "array is mapped";
      break;
    case hipErrorAlreadyMapped:
      text = "resource already mapped";
      break;
    case hipErrorNoBinaryForGpu:
      text = "no kernel image is available for execution on the device";
      break;
    case hipErrorAlreadyAcquired:
      text = "resource already acquired";
      break;
    case hipErrorNotMapped:
      text = "resource not mapped";
      break;
    case hipErrorNotMappedAsArray:
      text = "resource not mapped as array";
      break;
    case hipErrorNotMappedAsPointer:
      text = "resource not mapped as pointer";
      break;
    case hipErrorECCNotCorrectable:
      text = "uncorrectable ECC error encountered";
      break;
    case hipErrorUnsupportedLimit:
      text = "limit is not supported on this architecture";
      break;
    case hipErrorContextAlreadyInUse:
      text = "exclusive-thread device already in use by a different thread";
      break;
    case hipErrorPeerAccessUnsupported:
      text = "peer access is not supported between these two devices";
      break;
    case hipErrorInvalidKernelFile:
      text = "invalid kernel file";
      break;
    case hipErrorInvalidGraphicsContext:
      text = "invalid OpenGL or DirectX context";
      break;
    case hipErrorInvalidSource:
      text = "device kernel image is invalid";
      break;
    case hipErrorFileNotFound:
      text = "file not found";
      break;
    case hipErrorSharedObjectSymbolNotFound:
      text = "shared object symbol not found";
      break;
    case hipErrorSharedObjectInitFailed:
      text = "shared object initialization failed";
      break;
    case hipErrorOperatingSystem:
      text = "OS call failed or operation not supported on this OS";
      break;
    case hipErrorInvalidHandle:
      text = "invalid resource handle";
      break;
    case hipErrorIllegalState:
      text = "the operation cannot be performed in the present state";
      break;
    case hipErrorNotFound:
      text = "named symbol not found";
      break;
    case hipErrorNotReady:
      text = "device not ready";
      break;
    case hipErrorIllegalAddress:
      text = "an illegal memory access was encountered";
      break;
    case hipErrorLaunchOutOfResources:
      text = "too many resources requested for launch";
      break;
    case hipErrorLaunchTimeOut:
      text = "the launch timed out and was terminated";
      break;
    case hipErrorPeerAccessAlreadyEnabled:
      text = "peer access is already enabled";
      break;
    case hipErrorPeerAccessNotEnabled:
      text = "peer access has not been enabled";
      break;
    case hipErrorSetOnActiveProcess:
      text = "cannot set while device is active in this process";
      break;
    case hipErrorContextIsDestroyed:
      text = "context is destroyed";
      break;
    case hipErrorAssert:
      text = "device-side assert triggered";
      break;
    case hipErrorHostMemoryAlreadyRegistered:
      text = "part or all of the requested memory range is already mapped";
      break;
    case hipErrorHostMemoryNotRegistered:
      text = "pointer does not correspond to a registered memory region";
      break;
    case hipErrorLaunchFailure:
      text = "unspecified launch failure";
      break;
    case hipErrorCooperativeLaunchTooLarge:
      text = "too many blocks in cooperative launch";
      break;
    case hipErrorNotSupported:
      text = "operation not supported";
      break;
    case hipErrorStreamCaptureUnsupported:
      text = "operation not permitted when stream is capturing";
      break;
    case hipErrorStreamCaptureInvalidated:
      text = "operation failed due to a previous error during capture";
      break;
    case hipErrorStreamCaptureMerge:
      text = "operation would result in a merge of separate capture sequences";
      break;
    case hipErrorStreamCaptureUnmatched:
      text = "capture was not ended in the same stream as it began";
      break;
    case hipErrorStreamCaptureUnjoined:
      text = "capturing stream has unjoined work";
      break;
    case hipErrorStreamCaptureIsolation:
      text = "dependency created on uncaptured work in another stream";
      break;
    case hipErrorStreamCaptureImplicit:
      text = "operation would make the legacy stream depend on a capturing blocking stream";
      break;
    case hipErrorCapturedEvent:
      text = "operation not permitted on an event last recorded in a capturing stream";
      break;
    case hipErrorStreamCaptureWrongThread:
      text = "attempt to terminate a thread-local capture sequence from another thread";
      break;
    case hipErrorGraphExecUpdateFailure:
      text =
          "the graph update was not performed because it included changes which violated "
          "constraints specific to instantiated graph update";
      break;
    case hipErrorRuntimeMemory:
      text = "runtime memory call returned error";
      break;
    case hipErrorRuntimeOther:
      text = "runtime call other than memory returned error";
      break;
    case hipErrorUnknown:
    default:
      // Keep the fallback text assigned above.
      break;
  }
  return text;
}
|
||||
|
||||
// Public entry point that forwards to ihipGetErrorName() and returns whatever name string
// that helper produces for hip_error.
const char* hipGetErrorName(hipError_t hip_error) {
  const char* const name = ihipGetErrorName(hip_error);
  return name;
}
|
||||
|
||||
// Public entry point that forwards to ihipGetErrorString() and returns the description
// string that helper produces for hip_error.
const char* hipGetErrorString(hipError_t hip_error) {
  const char* const description = ihipGetErrorString(hip_error);
  return description;
}
|
||||
|
||||
// Driver-style lookup of an error code's name.
//
// hip_error: the error code to look up.
// errStr:    output; receives the name string from ihipGetErrorName(). Must not be null.
//
// Returns hipErrorInvalidValue when errStr is null. Otherwise *errStr is always written, and
// the call reports hipErrorInvalidValue only when the name comes back as "hipErrorUnknown"
// for a code other than hipErrorUnknown; every other case returns hipSuccess.
hipError_t hipDrvGetErrorName(hipError_t hip_error, const char** errStr) {
  if (!errStr) {
    return hipErrorInvalidValue;
  }

  const char* const name = ihipGetErrorName(hip_error);
  *errStr = name;

  // Presumably ihipGetErrorName() falls back to "hipErrorUnknown" for codes it does not
  // recognise -- TODO confirm against its definition. The string comparison is only evaluated
  // when the caller did not ask about hipErrorUnknown itself.
  const bool unrecognised =
      (hip_error != hipErrorUnknown) && (strcmp(name, "hipErrorUnknown") == 0);
  return unrecognised ? hipErrorInvalidValue : hipSuccess;
}
|
||||
|
||||
// Driver-style lookup of an error code's description.
//
// hip_error: the error code to look up.
// errStr:    output; receives the description from ihipGetErrorString(). Must not be null.
//
// Returns hipErrorInvalidValue when errStr is null. Otherwise *errStr is always written, and
// the call reports hipErrorInvalidValue only when the description comes back as
// "unknown error" for a code other than hipErrorUnknown; every other case returns hipSuccess.
hipError_t hipDrvGetErrorString(hipError_t hip_error, const char** errStr) {
  if (!errStr) {
    return hipErrorInvalidValue;
  }

  const char* const description = ihipGetErrorString(hip_error);
  *errStr = description;

  // "unknown error" is the fallback text of ihipGetErrorString(); seeing it for any code other
  // than hipErrorUnknown means the code was not recognised. The string comparison is only
  // evaluated when the caller did not ask about hipErrorUnknown itself.
  const bool unrecognised =
      (hip_error != hipErrorUnknown) && (strcmp(description, "unknown error") == 0);
  return unrecognised ? hipErrorInvalidValue : hipSuccess;
}
|
||||
} //namespace hip
|
||||
} // namespace hip
|
||||
|
||||
@@ -77,8 +77,8 @@ hipError_t Event::synchronize() {
|
||||
auto hip_device = g_devices[deviceId()];
|
||||
// Check HW status of the ROCclr event. Note: not all ROCclr modes support HW status
|
||||
static constexpr bool kWaitCompletion = true;
|
||||
amd::SyncPolicy policy = (flags_ == hipEventBlockingSync) ? amd::SyncPolicy::Blocking :
|
||||
amd::SyncPolicy::Auto;
|
||||
amd::SyncPolicy policy =
|
||||
(flags_ == hipEventBlockingSync) ? amd::SyncPolicy::Blocking : amd::SyncPolicy::Auto;
|
||||
if (!hip_device->devices()[0]->IsHwEventReady(*event_, kWaitCompletion, policy)) {
|
||||
event_->awaitCompletion();
|
||||
}
|
||||
@@ -86,13 +86,11 @@ hipError_t Event::synchronize() {
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool Event::awaitEventCompletion() {
|
||||
return event_->awaitCompletion();
|
||||
}
|
||||
bool Event::awaitEventCompletion() { return event_->awaitCompletion(); }
|
||||
|
||||
bool EventDD::awaitEventCompletion() {
|
||||
amd::SyncPolicy policy = (flags_ == hipEventBlockingSync) ? amd::SyncPolicy::Blocking :
|
||||
amd::SyncPolicy::Auto;
|
||||
amd::SyncPolicy policy =
|
||||
(flags_ == hipEventBlockingSync) ? amd::SyncPolicy::Blocking : amd::SyncPolicy::Auto;
|
||||
return g_devices[deviceId()]->devices()[0]->IsHwEventReady(*event_, true, policy);
|
||||
}
|
||||
|
||||
@@ -135,7 +133,8 @@ hipError_t Event::elapsedTime(Event& eStop, float& ms) {
|
||||
amd::Command* command = new amd::Marker(*event_->command().queue(), kMarkerDisableFlush);
|
||||
command->enqueue();
|
||||
command->awaitCompletion();
|
||||
ms = static_cast<float>(static_cast<int64_t>(command->event().profilingInfo().end_) - time(false)) /
|
||||
ms = static_cast<float>(static_cast<int64_t>(command->event().profilingInfo().end_) -
|
||||
time(false)) /
|
||||
1000000.f;
|
||||
command->release();
|
||||
} else {
|
||||
@@ -208,12 +207,11 @@ hipError_t Event::streamWait(hip::Stream* stream, uint flags) {
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
hipError_t Event::recordCommand(amd::Command*& command, amd::HostQueue* stream,
|
||||
uint32_t ext_flags, bool batch_flush) {
|
||||
hipError_t Event::recordCommand(amd::Command*& command, amd::HostQueue* stream, uint32_t ext_flags,
|
||||
bool batch_flush) {
|
||||
if (command == nullptr) {
|
||||
int32_t releaseFlags = ((ext_flags == 0) ? flags_ : ext_flags) &
|
||||
(hipEventReleaseToDevice | hipEventReleaseToSystem |
|
||||
hipEventDisableSystemFence);
|
||||
(hipEventReleaseToDevice | hipEventReleaseToSystem | hipEventDisableSystemFence);
|
||||
if (releaseFlags & hipEventDisableSystemFence) {
|
||||
releaseFlags = amd::Device::kCacheStateIgnore;
|
||||
} else {
|
||||
@@ -242,8 +240,7 @@ hipError_t Event::enqueueRecordCommand(hip::Stream* stream, amd::Command* comman
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
hipError_t Event::addMarker(hip::Stream* hip_stream, amd::Command* command,
|
||||
bool batch_flush) {
|
||||
hipError_t Event::addMarker(hip::Stream* hip_stream, amd::Command* command, bool batch_flush) {
|
||||
// Keep the lock always at the beginning of this to avoid a race. SWDEV-277847
|
||||
amd::ScopedLock lock(lock_);
|
||||
hipError_t status = recordCommand(command, hip_stream, 0, batch_flush);
|
||||
@@ -272,22 +269,22 @@ bool isValid(hipEvent_t event) {
|
||||
// ================================================================================================
|
||||
hipError_t ihipEventCreateWithFlags(hipEvent_t* event, unsigned flags) {
|
||||
unsigned supportedFlags = hipEventDefault | hipEventBlockingSync | hipEventDisableTiming |
|
||||
hipEventReleaseToDevice | hipEventReleaseToSystem |
|
||||
hipEventInterprocess | hipEventDisableSystemFence;
|
||||
hipEventReleaseToDevice | hipEventReleaseToSystem | hipEventInterprocess |
|
||||
hipEventDisableSystemFence;
|
||||
|
||||
const unsigned releaseFlags = (hipEventReleaseToDevice | hipEventReleaseToSystem |
|
||||
hipEventDisableSystemFence);
|
||||
const unsigned releaseFlags =
|
||||
(hipEventReleaseToDevice | hipEventReleaseToSystem | hipEventDisableSystemFence);
|
||||
// can't set any unsupported flags.
|
||||
// can set only one of the release flags.
|
||||
// if hipEventInterprocess flag is set, then hipEventDisableTiming flag also must be set
|
||||
const bool illegalFlags = (flags & ~supportedFlags) ||
|
||||
([](unsigned int num){
|
||||
const bool illegalFlags = (flags & ~supportedFlags) || ([](unsigned int num) {
|
||||
unsigned int bitcount;
|
||||
for (bitcount = 0; num; bitcount++) {
|
||||
num &= num - 1;
|
||||
}
|
||||
return bitcount; } (flags & releaseFlags) > 1) ||
|
||||
((flags & hipEventInterprocess) && !(flags & hipEventDisableTiming));
|
||||
return bitcount;
|
||||
}(flags & releaseFlags) > 1) ||
|
||||
((flags & hipEventInterprocess) && !(flags & hipEventDisableTiming));
|
||||
if (!illegalFlags) {
|
||||
hip::Event* e = nullptr;
|
||||
if (flags & hipEventInterprocess) {
|
||||
@@ -347,7 +344,7 @@ hipError_t hipEventDestroy(hipEvent_t event) {
|
||||
}
|
||||
|
||||
std::unique_lock lock(hip::eventSetLock);
|
||||
if (hip::eventSet.erase(event) == 0 ) {
|
||||
if (hip::eventSet.erase(event) == 0) {
|
||||
return hipErrorContextIsDestroyed;
|
||||
}
|
||||
|
||||
@@ -387,7 +384,7 @@ hipError_t hipEventElapsedTime(float* ms, hipEvent_t start, hipEvent_t stop) {
|
||||
|
||||
// ================================================================================================
|
||||
hipError_t hipEventRecord_common(hipEvent_t event, hipStream_t stream, unsigned int flags) {
|
||||
if (!(flags == hipEventRecordDefault || flags == hipEventRecordExternal)){
|
||||
if (!(flags == hipEventRecordDefault || flags == hipEventRecordExternal)) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
hipError_t status = hipSuccess;
|
||||
|
||||
@@ -31,11 +31,11 @@
|
||||
// Internal structure for stream callback handler
|
||||
namespace hip {
|
||||
class StreamCallback {
|
||||
protected:
|
||||
protected:
|
||||
void* userData_;
|
||||
|
||||
public:
|
||||
StreamCallback(void* userData)
|
||||
: userData_(userData) {}
|
||||
StreamCallback(void* userData) : userData_(userData) {}
|
||||
|
||||
virtual void CL_CALLBACK callback() = 0;
|
||||
|
||||
@@ -45,7 +45,8 @@ protected:
|
||||
class StreamAddCallback : public StreamCallback {
|
||||
hipStreamCallback_t callBack_;
|
||||
hipStream_t stream_;
|
||||
public:
|
||||
|
||||
public:
|
||||
StreamAddCallback(hipStream_t stream, hipStreamCallback_t callback, void* userData)
|
||||
: StreamCallback(userData) {
|
||||
stream_ = stream;
|
||||
@@ -60,9 +61,9 @@ public:
|
||||
|
||||
class LaunchHostFuncCallback : public StreamCallback {
|
||||
hipHostFn_t callBack_;
|
||||
|
||||
public:
|
||||
// Records the host function to invoke; the user pointer is handed to the StreamCallback base.
LaunchHostFuncCallback(hipHostFn_t callback, void* userData)
    : StreamCallback(userData), callBack_(callback) {}
|
||||
|
||||
@@ -100,18 +101,19 @@ class Event {
|
||||
hipStream_t captureStream_ = nullptr;
|
||||
/// Previous captured nodes before event record
|
||||
std::vector<hip::GraphNode*> nodesPrevToRecorded_;
|
||||
|
||||
protected:
|
||||
bool CheckHwEvent() {
|
||||
amd::SyncPolicy policy = (flags_ == hipEventBlockingSync) ? amd::SyncPolicy::Blocking :
|
||||
amd::SyncPolicy::Auto;
|
||||
amd::SyncPolicy policy =
|
||||
(flags_ == hipEventBlockingSync) ? amd::SyncPolicy::Blocking : amd::SyncPolicy::Auto;
|
||||
return g_devices[deviceId()]->devices()[0]->IsHwEventReady(*event_, false, policy);
|
||||
}
|
||||
|
||||
public:
|
||||
constexpr static bool kBatchFlush = true; //!< Flushes CPU command batch in direct dispatch mode
|
||||
|
||||
Event(uint32_t flags) : flags_(flags), lock_(true) /* hipEvent_t lock*/,
|
||||
event_(nullptr), stream_(nullptr) {
|
||||
Event(uint32_t flags)
|
||||
: flags_(flags), lock_(true) /* hipEvent_t lock*/, event_(nullptr), stream_(nullptr) {
|
||||
device_id_ = hip::getCurrentDevice()->deviceId(); // Created in current device ctx
|
||||
}
|
||||
|
||||
@@ -120,7 +122,7 @@ class Event {
|
||||
event_->release();
|
||||
}
|
||||
}
|
||||
uint32_t flags_; //!< flags associated with the event
|
||||
uint32_t flags_; //!< flags associated with the event
|
||||
|
||||
virtual hipError_t query();
|
||||
virtual hipError_t synchronize();
|
||||
@@ -132,8 +134,7 @@ class Event {
|
||||
virtual hipError_t recordCommand(amd::Command*& command, amd::HostQueue* stream,
|
||||
uint32_t flags = 0, bool batch_flush = true);
|
||||
virtual hipError_t enqueueRecordCommand(hip::Stream* stream, amd::Command* command);
|
||||
hipError_t addMarker(hip::Stream* stream, amd::Command* command,
|
||||
bool batch_flush = true);
|
||||
hipError_t addMarker(hip::Stream* stream, amd::Command* command, bool batch_flush = true);
|
||||
|
||||
void BindCommand(amd::Command& command) {
|
||||
amd::ScopedLock lock(lock_);
|
||||
@@ -202,7 +203,7 @@ class IPCEvent : public Event {
|
||||
int owners = --ipc_evt_.ipc_shmem_->owners;
|
||||
// Make sure event is synchronized
|
||||
hipError_t status = synchronize();
|
||||
status = ihipHostUnregister(&ipc_evt_.ipc_shmem_->signal);
|
||||
status = ihipHostUnregister(&ipc_evt_.ipc_shmem_->signal);
|
||||
if (!amd::Os::MemoryUnmapFile(ipc_evt_.ipc_shmem_, sizeof(hip::ihipIpcEventShmem_t))) {
|
||||
// print hipErrorInvalidHandle;
|
||||
}
|
||||
@@ -226,8 +227,8 @@ class IPCEvent : public Event {
|
||||
|
||||
hipError_t streamWait(hip::Stream* stream, uint flags);
|
||||
|
||||
hipError_t recordCommand(amd::Command*& command, amd::HostQueue* queue,
|
||||
uint32_t flags = 0, bool batch_flush = true) override;
|
||||
hipError_t recordCommand(amd::Command*& command, amd::HostQueue* queue, uint32_t flags = 0,
|
||||
bool batch_flush = true) override;
|
||||
hipError_t enqueueRecordCommand(hip::Stream* stream, amd::Command* command);
|
||||
};
|
||||
|
||||
|
||||
@@ -63,9 +63,8 @@ bool IPCEvent::createIpcEventShmemIfNeeded() {
|
||||
}
|
||||
|
||||
// device sets 0 to this ptr when the ipc event is completed
|
||||
hipError_t status = ihipHostRegister(&ipc_evt_.ipc_shmem_->signal,
|
||||
sizeof(uint32_t) * IPC_SIGNALS_PER_EVENT,
|
||||
0);
|
||||
hipError_t status =
|
||||
ihipHostRegister(&ipc_evt_.ipc_shmem_->signal, sizeof(uint32_t) * IPC_SIGNALS_PER_EVENT, 0);
|
||||
if (status != hipSuccess) {
|
||||
return false;
|
||||
}
|
||||
@@ -110,15 +109,14 @@ hipError_t IPCEvent::streamWait(hip::Stream* stream, uint flags) {
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
// Creates the marker command used to record an IPC event.
//
// command:     output; overwritten with a newly allocated amd::Marker on `stream`.
// stream:      queue the marker is created on; dereferenced without a null check.
// flags:       not consulted by this override.
// batch_flush: not consulted by this override.
//
// Always returns hipSuccess.
hipError_t IPCEvent::recordCommand(amd::Command*& command, amd::HostQueue* stream, uint32_t flags,
                                   bool batch_flush) {
  amd::HostQueue& queue = *stream;
  command = new amd::Marker(queue, kMarkerDisableFlush);
  return hipSuccess;
}
|
||||
|
||||
// ================================================================================================
|
||||
hipError_t IPCEvent::enqueueRecordCommand(hip::Stream* stream, amd::Command* command) {
|
||||
|
||||
amd::Event& tEvent = command->event();
|
||||
createIpcEventShmemIfNeeded();
|
||||
int write_index = ipc_evt_.ipc_shmem_->write_index++;
|
||||
@@ -185,9 +183,8 @@ hipError_t IPCEvent::OpenHandle(ihipIpcEventHandle_t* handle) {
|
||||
ipc_evt_.ipc_shmem_->owners += 1;
|
||||
// device sets 0 to this ptr when the ipc event is completed
|
||||
hipError_t status = hipSuccess;
|
||||
status = ihipHostRegister(&ipc_evt_.ipc_shmem_->signal,
|
||||
sizeof(uint32_t) * IPC_SIGNALS_PER_EVENT,
|
||||
0);
|
||||
status =
|
||||
ihipHostRegister(&ipc_evt_.ipc_shmem_->signal, sizeof(uint32_t) * IPC_SIGNALS_PER_EVENT, 0);
|
||||
return status;
|
||||
}
|
||||
|
||||
|
||||
@@ -273,8 +273,7 @@ static bool UncompressAndPopulateCodeObject(
|
||||
bundle_ids.push_back(bundle_id_str.c_str());
|
||||
}
|
||||
|
||||
const auto obheader =
|
||||
reinterpret_cast<const symbols::ClangOffloadBundleCompressedHeader*>(image);
|
||||
const auto obheader = reinterpret_cast<const symbols::ClangOffloadBundleCompressedHeader*>(image);
|
||||
const size_t size = obheader->totalSize;
|
||||
|
||||
bool passed = false;
|
||||
@@ -720,7 +719,8 @@ hipError_t FatBinaryInfo::AddDevProgram(hip::Device* device, const void* binary_
|
||||
}
|
||||
if (CL_SUCCESS !=
|
||||
program->addDeviceProgram(*ctx->devices()[0], binary_image, binary_size, false, nullptr,
|
||||
nullptr, (ufd_ != nullptr ? ufd_->fdesc_ : amd::Os::FDescInit()), binary_offset, uri_)) {
|
||||
nullptr, (ufd_ != nullptr ? ufd_->fdesc_ : amd::Os::FDescInit()),
|
||||
binary_offset, uri_)) {
|
||||
return hipErrorInvalidKernelFile;
|
||||
}
|
||||
return hipSuccess;
|
||||
|
||||
@@ -35,7 +35,7 @@ namespace hip {
|
||||
|
||||
// Fat Binary Info
|
||||
class FatBinaryInfo {
|
||||
public:
|
||||
public:
|
||||
FatBinaryInfo(const char* fname, const void* image);
|
||||
~FatBinaryInfo();
|
||||
|
||||
@@ -71,25 +71,25 @@ public:
|
||||
//! Returns the lock for this fatbinary access
|
||||
amd::Monitor& FatBinaryLock() { return fb_lock_; }
|
||||
|
||||
private:
|
||||
void ReleaseImageAndFile();
|
||||
private:
|
||||
void ReleaseImageAndFile();
|
||||
|
||||
std::string fname_; //!< File name
|
||||
size_t foffset_; //!< File Offset where the fat binary is present.
|
||||
std::string fname_; //!< File name
|
||||
size_t foffset_; //!< File Offset where the fat binary is present.
|
||||
|
||||
// Even when file is passed image will be mmapped till ~destructor.
|
||||
const void* image_; //!< Image
|
||||
bool image_mapped_; //!< flag to detect if image is mapped
|
||||
// Even when file is passed image will be mmapped till ~destructor.
|
||||
const void* image_; //!< Image
|
||||
bool image_mapped_; //!< flag to detect if image is mapped
|
||||
|
||||
// Only used for FBs where image is directly passed
|
||||
std::string uri_; //!< Uniform resource indicator
|
||||
// Only used for FBs where image is directly passed
|
||||
std::string uri_; //!< Uniform resource indicator
|
||||
|
||||
std::vector<amd::Program*> dev_programs_; //!< Program info per Device
|
||||
std::vector<amd::Program*> dev_programs_; //!< Program info per Device
|
||||
|
||||
std::shared_ptr<UniqueFD> ufd_; //!< Unique file descriptor
|
||||
amd::Monitor fb_lock_{true}; //!< Lock for the fat binary access
|
||||
std::shared_ptr<UniqueFD> ufd_; //!< Unique file descriptor
|
||||
amd::Monitor fb_lock_{true}; //!< Lock for the fat binary access
|
||||
};
|
||||
|
||||
}; // namespace hip
|
||||
}; // namespace hip
|
||||
|
||||
#endif // HIP_FAT_BINARY_HPP
|
||||
#endif // HIP_FAT_BINARY_HPP
|
||||
|
||||
@@ -600,41 +600,11 @@ inline std::ostream& operator<<(std::ostream& os, const hip_api_id_t* s) {
|
||||
}
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& os, const hipTextureDesc& s) {
|
||||
os << '{'
|
||||
<< '{'
|
||||
<< s.addressMode[0]
|
||||
<< ','
|
||||
<< s.addressMode[1]
|
||||
<< ','
|
||||
<< s.addressMode[2]
|
||||
<< '}'
|
||||
<< ','
|
||||
<< s.filterMode
|
||||
<< ','
|
||||
<< s.readMode
|
||||
<< ','
|
||||
<< s.sRGB
|
||||
<< ','
|
||||
<< '{'
|
||||
<< s.borderColor[0]
|
||||
<< ','
|
||||
<< s.borderColor[1]
|
||||
<< ','
|
||||
<< s.borderColor[2]
|
||||
<< ','
|
||||
<< s.borderColor[3]
|
||||
<< '}'
|
||||
<< ','
|
||||
<< s.normalizedCoords
|
||||
<< ','
|
||||
<< s.mipmapFilterMode
|
||||
<< ','
|
||||
<< s.mipmapLevelBias
|
||||
<< ','
|
||||
<< s.minMipmapLevelClamp
|
||||
<< ','
|
||||
<< s.maxMipmapLevelClamp
|
||||
<< '}';
|
||||
os << '{' << '{' << s.addressMode[0] << ',' << s.addressMode[1] << ',' << s.addressMode[2] << '}'
|
||||
<< ',' << s.filterMode << ',' << s.readMode << ',' << s.sRGB << ',' << '{' << s.borderColor[0]
|
||||
<< ',' << s.borderColor[1] << ',' << s.borderColor[2] << ',' << s.borderColor[3] << '}' << ','
|
||||
<< s.normalizedCoords << ',' << s.mipmapFilterMode << ',' << s.mipmapLevelBias << ','
|
||||
<< s.minMipmapLevelClamp << ',' << s.maxMipmapLevelClamp << '}';
|
||||
return os;
|
||||
}
|
||||
|
||||
@@ -649,13 +619,7 @@ inline std::ostream& operator<<(std::ostream& os, const hipTextureDesc* s) {
|
||||
|
||||
|
||||
// Streams a dim3 as "{x,y,z}".
inline std::ostream& operator<<(std::ostream& os, const dim3& s) {
  os << '{' << s.x;
  os << ',' << s.y;
  os << ',' << s.z;
  os << '}';
  return os;
}
|
||||
|
||||
@@ -669,17 +633,7 @@ inline std::ostream& operator<<(std::ostream& os, const dim3* s) {
|
||||
}
|
||||
|
||||
// Streams a hipChannelFormatDesc as "{x,y,z,w,f}".
inline std::ostream& operator<<(std::ostream& os, const hipChannelFormatDesc& s) {
  os << '{' << s.x;
  os << ',' << s.y;
  os << ',' << s.z;
  os << ',' << s.w;
  os << ',' << s.f;
  os << '}';
  return os;
}
|
||||
|
||||
@@ -693,17 +647,8 @@ inline std::ostream& operator<<(std::ostream& os, const hipChannelFormatDesc* s)
|
||||
}
|
||||
|
||||
// Streams a hipMipmappedArray as "{data,desc,width,height,depth}".
inline std::ostream& operator<<(std::ostream& os, const hipMipmappedArray& s) {
  os << '{' << s.data;
  os << ',' << s.desc;
  os << ',' << s.width;
  os << ',' << s.height;
  os << ',' << s.depth;
  os << '}';
  return os;
}
|
||||
|
||||
@@ -718,38 +663,24 @@ inline std::ostream& operator<<(std::ostream& os, const hipMipmappedArray* s) {
|
||||
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& os, const hipResourceDesc& s) {
|
||||
os << '{'
|
||||
<< s.resType
|
||||
<< ','
|
||||
<< '{';
|
||||
os << '{' << s.resType << ',' << '{';
|
||||
|
||||
switch (s.resType) {
|
||||
case hipResourceTypeLinear:
|
||||
os << s.res.linear.devPtr
|
||||
<< ','
|
||||
<< s.res.linear.desc
|
||||
<< ','
|
||||
<< s.res.linear.sizeInBytes;
|
||||
break;
|
||||
case hipResourceTypePitch2D:
|
||||
os << s.res.pitch2D.devPtr
|
||||
<< ','
|
||||
<< s.res.pitch2D.desc
|
||||
<< ','
|
||||
<< s.res.pitch2D.width
|
||||
<< ','
|
||||
<< s.res.pitch2D.height
|
||||
<< ','
|
||||
<< s.res.pitch2D.pitchInBytes;
|
||||
break;
|
||||
case hipResourceTypeArray:
|
||||
os << s.res.array.array;
|
||||
break;
|
||||
case hipResourceTypeMipmappedArray:
|
||||
os <<s.res.mipmap.mipmap;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
case hipResourceTypeLinear:
|
||||
os << s.res.linear.devPtr << ',' << s.res.linear.desc << ',' << s.res.linear.sizeInBytes;
|
||||
break;
|
||||
case hipResourceTypePitch2D:
|
||||
os << s.res.pitch2D.devPtr << ',' << s.res.pitch2D.desc << ',' << s.res.pitch2D.width << ','
|
||||
<< s.res.pitch2D.height << ',' << s.res.pitch2D.pitchInBytes;
|
||||
break;
|
||||
case hipResourceTypeArray:
|
||||
os << s.res.array.array;
|
||||
break;
|
||||
case hipResourceTypeMipmappedArray:
|
||||
os << s.res.mipmap.mipmap;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
os << '}';
|
||||
@@ -767,37 +698,11 @@ inline std::ostream& operator<<(std::ostream& os, const hipResourceDesc* s) {
|
||||
}
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& os, const textureReference& s) {
|
||||
os << '{'
|
||||
<< s.normalized
|
||||
<< ','
|
||||
<< s.readMode
|
||||
<< ','
|
||||
<< s.filterMode
|
||||
<< ','
|
||||
<< '{'
|
||||
<< s.addressMode[0]
|
||||
<< ','
|
||||
<< s.addressMode[1]
|
||||
<< ','
|
||||
<< s.addressMode[2]
|
||||
<< '}'
|
||||
<< ','
|
||||
<< s.channelDesc
|
||||
<< ','
|
||||
<< s.sRGB
|
||||
<< ','
|
||||
<< s.maxAnisotropy
|
||||
<< ','
|
||||
<< s.mipmapFilterMode
|
||||
<< ','
|
||||
<< s.mipmapLevelBias
|
||||
<< ','
|
||||
<< s.minMipmapLevelClamp
|
||||
<< ','
|
||||
<< s.maxMipmapLevelClamp
|
||||
<< ','
|
||||
<< s.textureObject
|
||||
<< '}';
|
||||
os << '{' << s.normalized << ',' << s.readMode << ',' << s.filterMode << ',' << '{'
|
||||
<< s.addressMode[0] << ',' << s.addressMode[1] << ',' << s.addressMode[2] << '}' << ','
|
||||
<< s.channelDesc << ',' << s.sRGB << ',' << s.maxAnisotropy << ',' << s.mipmapFilterMode << ','
|
||||
<< s.mipmapLevelBias << ',' << s.minMipmapLevelClamp << ',' << s.maxMipmapLevelClamp << ','
|
||||
<< s.textureObject << '}';
|
||||
return os;
|
||||
}
|
||||
|
||||
@@ -826,23 +731,9 @@ inline std::ostream& operator<<(std::ostream& os, const hipError_t* s) {
|
||||
}
|
||||
|
||||
// Streams a hipResourceViewDesc as
// "{format,width,height,depth,firstMipmapLevel,lastMipmapLevel,firstLayer,lastLayer}".
inline std::ostream& operator<<(std::ostream& os, const hipResourceViewDesc& s) {
  os << '{' << s.format;
  os << ',' << s.width;
  os << ',' << s.height;
  os << ',' << s.depth;
  os << ',' << s.firstMipmapLevel;
  os << ',' << s.lastMipmapLevel;
  os << ',' << s.firstLayer;
  os << ',' << s.lastLayer;
  os << '}';
  return os;
}
|
||||
|
||||
@@ -856,15 +747,7 @@ inline std::ostream& operator<<(std::ostream& os, const hipResourceViewDesc* s)
|
||||
}
|
||||
|
||||
// Streams a HIP_ARRAY_DESCRIPTOR as "{Width,Height,Format,NumChannels}".
inline std::ostream& operator<<(std::ostream& os, const HIP_ARRAY_DESCRIPTOR& s) {
  os << '{' << s.Width;
  os << ',' << s.Height;
  os << ',' << s.Format;
  os << ',' << s.NumChannels;
  os << '}';
  return os;
}
|
||||
|
||||
@@ -878,19 +761,8 @@ inline std::ostream& operator<<(std::ostream& os, const HIP_ARRAY_DESCRIPTOR* s)
|
||||
}
|
||||
|
||||
// Streams a HIP_ARRAY3D_DESCRIPTOR as "{Width,Height,Depth,Format,NumChannels,Flags}".
inline std::ostream& operator<<(std::ostream& os, const HIP_ARRAY3D_DESCRIPTOR& s) {
  os << '{' << s.Width;
  os << ',' << s.Height;
  os << ',' << s.Depth;
  os << ',' << s.Format;
  os << ',' << s.NumChannels;
  os << ',' << s.Flags;
  os << '}';
  return os;
}
|
||||
|
||||
@@ -904,23 +776,17 @@ inline std::ostream& operator<<(std::ostream& os, const HIP_ARRAY3D_DESCRIPTOR*
|
||||
}
|
||||
|
||||
// Streams a hipExtent as "{width,height,depth}".
inline std::ostream& operator<<(std::ostream& os, const hipExtent& s) {
  os << '{' << s.width;
  os << ',' << s.height;
  os << ',' << s.depth;
  os << '}';
  return os;
}
|
||||
|
||||
// Placeholder stream insertion for hipIpcEventHandle_t: writes nothing and returns the
// stream unchanged.
inline std::ostream& operator<<(std::ostream& os, const hipIpcEventHandle_t& s) {
  // TODO fill in later
  static_cast<void>(s);  // handle contents are not printed yet
  return os;
}
|
||||
|
||||
// Placeholder stream insertion for a hipIpcEventHandle_t pointer: writes nothing (the pointer
// is never dereferenced) and returns the stream unchanged.
inline std::ostream& operator<<(std::ostream& os, const hipIpcEventHandle_t* s) {
  // TODO fill in later
  static_cast<void>(s);  // handle contents are not printed yet
  return os;
}
|
||||
|
||||
|
||||
@@ -38,7 +38,7 @@ namespace hip {
|
||||
void setupGLInteropOnce() {
|
||||
amd::Context* amdContext = hip::getCurrentDevice()->asContext();
|
||||
|
||||
//current context will be read in amdContext->create
|
||||
// current context will be read in amdContext->create
|
||||
cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,
|
||||
(cl_context_properties)AMD_PLATFORM,
|
||||
ROCCLR_HIP_GL_CONTEXT_KHR,
|
||||
@@ -66,7 +66,8 @@ void setupGLInteropOnce() {
|
||||
|
||||
static inline hipError_t hipSetInteropObjects(int num_objects, void** mem_objects,
|
||||
std::vector<amd::Memory*>& interopObjects) {
|
||||
if ((num_objects == 0 && mem_objects != nullptr) || (num_objects != 0 && mem_objects == nullptr)) {
|
||||
if ((num_objects == 0 && mem_objects != nullptr) ||
|
||||
(num_objects != 0 && mem_objects == nullptr)) {
|
||||
return hipErrorUnknown;
|
||||
}
|
||||
|
||||
@@ -179,7 +180,7 @@ static inline GLenum checkForGLError(const amd::Context& amdContext) {
|
||||
}
|
||||
|
||||
hipError_t hipGraphicsSubResourceGetMappedArray(hipArray_t* array, hipGraphicsResource_t resource,
|
||||
unsigned int arrayIndex, unsigned int mipLevel) {
|
||||
unsigned int arrayIndex, unsigned int mipLevel) {
|
||||
HIP_INIT_API(hipGraphicsSubResourceGetMappedArray, array, resource, arrayIndex, mipLevel);
|
||||
|
||||
amd::Context& amdContext = *(hip::getCurrentDevice()->asContext());
|
||||
@@ -197,17 +198,18 @@ hipError_t hipGraphicsSubResourceGetMappedArray(hipArray_t* array, hipGraphicsRe
|
||||
if (arrayIndex > 0) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
amd::Image * view = image->createView(amdContext, image->getImageFormat(), nullptr, mipLevel, 0);
|
||||
amd::Image* view = image->createView(amdContext, image->getImageFormat(), nullptr, mipLevel, 0);
|
||||
|
||||
hipArray* myarray = new hipArray();
|
||||
|
||||
myarray->data = as_cl<amd::Memory> (view);
|
||||
myarray->data = as_cl<amd::Memory>(view);
|
||||
|
||||
myarray->width = view->getWidth();
|
||||
myarray->height = view->getHeight();
|
||||
myarray->depth = view->getDepth();
|
||||
|
||||
const cl_mem_object_type image_type = hip::getCLMemObjectType(myarray->width, myarray->height, myarray->depth, hipArrayDefault);
|
||||
const cl_mem_object_type image_type =
|
||||
hip::getCLMemObjectType(myarray->width, myarray->height, myarray->depth, hipArrayDefault);
|
||||
myarray->type = image_type;
|
||||
amd::Image::Format f = image->getImageFormat();
|
||||
myarray->Format = hip::getCL2hipArrayFormat(f.image_channel_data_type);
|
||||
@@ -227,10 +229,10 @@ hipError_t hipGraphicsGLRegisterImage(hipGraphicsResource** resource, GLuint ima
|
||||
unsigned int flags) {
|
||||
HIP_INIT_API(hipGraphicsGLRegisterImage, resource, image, target, flags);
|
||||
|
||||
if (!((flags == hipGraphicsRegisterFlagsNone) || (flags & hipGraphicsRegisterFlagsReadOnly) ||
|
||||
if (!((flags == hipGraphicsRegisterFlagsNone) || (flags & hipGraphicsRegisterFlagsReadOnly) ||
|
||||
(flags & hipGraphicsRegisterFlagsWriteDiscard) ||
|
||||
(flags & hipGraphicsRegisterFlagsSurfaceLoadStore) ||
|
||||
(flags & hipGraphicsRegisterFlagsTextureGather))) {
|
||||
(flags & hipGraphicsRegisterFlagsSurfaceLoadStore) ||
|
||||
(flags & hipGraphicsRegisterFlagsTextureGather))) {
|
||||
LogError("invalid parameter \"flags\"");
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
@@ -404,7 +406,7 @@ hipError_t hipGraphicsGLRegisterImage(hipGraphicsResource** resource, GLuint ima
|
||||
// Now get CL format from GL format and bytes per pixel
|
||||
int iBytesPerPixel = 0;
|
||||
if (!amd::getCLFormatFromGL(amdContext, glInternalFormat, &clImageFormat, &iBytesPerPixel,
|
||||
0)) { //clFlags)) {
|
||||
0)) { // clFlags)) {
|
||||
LogWarning("\"texture\" format does not map to an appropriate CL image format");
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
@@ -448,8 +450,8 @@ hipError_t hipGraphicsGLRegisterImage(hipGraphicsResource** resource, GLuint ima
|
||||
// In case target is GL_TEXTURE_BUFFER
|
||||
GLint backingBuffer;
|
||||
clearGLErrors(amdContext);
|
||||
amdContext.glenv()->glGetTexLevelParameteriv_(
|
||||
glTarget, 0, GL_TEXTURE_BUFFER_DATA_STORE_BINDING, &backingBuffer);
|
||||
amdContext.glenv()->glGetTexLevelParameteriv_(glTarget, 0, GL_TEXTURE_BUFFER_DATA_STORE_BINDING,
|
||||
&backingBuffer);
|
||||
if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
|
||||
LogWarning("Cannot get backing buffer for GL \"texture buffer\" object");
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
@@ -459,7 +461,7 @@ hipError_t hipGraphicsGLRegisterImage(hipGraphicsResource** resource, GLuint ima
|
||||
// Get GL texture format and check if it's compatible with CL format
|
||||
clearGLErrors(amdContext);
|
||||
amdContext.glenv()->glGetIntegerv_(GL_TEXTURE_BUFFER_FORMAT_EXT,
|
||||
reinterpret_cast<GLint*>(&glInternalFormat));
|
||||
reinterpret_cast<GLint*>(&glInternalFormat));
|
||||
if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
|
||||
LogWarning("Cannot get internal format of \"miplevel\" of GL \"texture\" object");
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
@@ -468,7 +470,7 @@ hipError_t hipGraphicsGLRegisterImage(hipGraphicsResource** resource, GLuint ima
|
||||
// Now get CL format from GL format and bytes per pixel
|
||||
int iBytesPerPixel = 0;
|
||||
if (!amd::getCLFormatFromGL(amdContext, glInternalFormat, &clImageFormat, &iBytesPerPixel,
|
||||
flags)) {
|
||||
flags)) {
|
||||
LogWarning("\"texture\" format does not map to an appropriate CL image format");
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
@@ -483,7 +485,7 @@ hipError_t hipGraphicsGLRegisterImage(hipGraphicsResource** resource, GLuint ima
|
||||
gliTexWidth = size / iBytesPerPixel;
|
||||
}
|
||||
size_t imageSize = (clType == CL_MEM_OBJECT_IMAGE1D_ARRAY) ? static_cast<size_t>(gliTexHeight)
|
||||
: static_cast<size_t>(gliTexDepth);
|
||||
: static_cast<size_t>(gliTexDepth);
|
||||
|
||||
if (!amd::Image::validateDimensions(
|
||||
amdContext.devices(), clType, static_cast<size_t>(gliTexWidth),
|
||||
@@ -495,8 +497,8 @@ hipError_t hipGraphicsGLRegisterImage(hipGraphicsResource** resource, GLuint ima
|
||||
|
||||
pImageGL = new (amdContext)
|
||||
amd::ImageGL(amdContext, clType, flags, clImageFormat, static_cast<size_t>(gliTexWidth),
|
||||
static_cast<size_t>(gliTexHeight), static_cast<size_t>(gliTexDepth), glTarget,
|
||||
image, 0, glInternalFormat, clGLType, numSamples, target);
|
||||
static_cast<size_t>(gliTexHeight), static_cast<size_t>(gliTexDepth), glTarget,
|
||||
image, 0, glInternalFormat, clGLType, numSamples, target);
|
||||
|
||||
if (!pImageGL) {
|
||||
LogWarning("Cannot create class ImageGL - out of memory?");
|
||||
@@ -529,7 +531,6 @@ hipError_t hipGraphicsGLRegisterImage(hipGraphicsResource** resource, GLuint ima
|
||||
|
||||
*resource = reinterpret_cast<hipGraphicsResource*>(pImageGL);
|
||||
HIP_RETURN(hipSuccess);
|
||||
|
||||
}
|
||||
|
||||
hipError_t hipGraphicsGLRegisterBuffer(hipGraphicsResource** resource, GLuint buffer,
|
||||
@@ -772,4 +773,4 @@ hipError_t hipGraphicsUnregisterResource(hipGraphicsResource_t resource) {
|
||||
|
||||
HIP_RETURN(hipSuccess);
|
||||
}
|
||||
} // namespace hip
|
||||
} // namespace hip
|
||||
|
||||
@@ -28,24 +28,18 @@ THE SOFTWARE.
|
||||
#include "platform/program.hpp"
|
||||
#include <hip/hip_version.h>
|
||||
|
||||
const char* amd_dbgapi_get_build_name(void) {
|
||||
return HIP_VERSION_BUILD_NAME;
|
||||
}
|
||||
const char* amd_dbgapi_get_build_name(void) { return HIP_VERSION_BUILD_NAME; }
|
||||
|
||||
const char* amd_dbgapi_get_git_hash() {
|
||||
return HIP_VERSION_GITHASH;
|
||||
}
|
||||
const char* amd_dbgapi_get_git_hash() { return HIP_VERSION_GITHASH; }
|
||||
|
||||
size_t amd_dbgapi_get_build_id() {
|
||||
return HIP_VERSION_BUILD_ID;
|
||||
}
|
||||
size_t amd_dbgapi_get_build_id() { return HIP_VERSION_BUILD_ID; }
|
||||
|
||||
#ifdef __HIP_ENABLE_PCH
|
||||
extern const char __hip_pch_wave32[];
|
||||
extern const char __hip_pch_wave64[];
|
||||
extern unsigned __hip_pch_wave32_size;
|
||||
extern unsigned __hip_pch_wave64_size;
|
||||
void __hipGetPCH(const char** pch, unsigned int *size) {
|
||||
void __hipGetPCH(const char** pch, unsigned int* size) {
|
||||
hipDeviceProp_t deviceProp;
|
||||
int deviceId;
|
||||
hipError_t error = hipGetDevice(&deviceId);
|
||||
@@ -64,20 +58,15 @@ namespace hip {
|
||||
// forward declaration of methods required for managed variables
|
||||
hipError_t ihipMallocManaged(void** ptr, size_t size, size_t align = 0, bool use_host_ptr = 0);
|
||||
|
||||
//Device Vars
|
||||
DeviceVar::DeviceVar(std::string name,
|
||||
hipModule_t hmod,
|
||||
int deviceId) :
|
||||
shadowVptr(nullptr), name_(name),
|
||||
amd_mem_obj_(nullptr), device_ptr_(nullptr),
|
||||
size_(0) {
|
||||
// Device Vars
|
||||
DeviceVar::DeviceVar(std::string name, hipModule_t hmod, int deviceId)
|
||||
: shadowVptr(nullptr), name_(name), amd_mem_obj_(nullptr), device_ptr_(nullptr), size_(0) {
|
||||
amd::Program* program = as_amd(reinterpret_cast<cl_program>(hmod));
|
||||
device::Program* dev_program =
|
||||
program->getDeviceProgram(*g_devices.at(deviceId)->devices()[0]);
|
||||
device::Program* dev_program = program->getDeviceProgram(*g_devices.at(deviceId)->devices()[0]);
|
||||
|
||||
guarantee (dev_program != nullptr, "Cannot get Device Program for module: 0x%x", hmod);
|
||||
guarantee(dev_program != nullptr, "Cannot get Device Program for module: 0x%x", hmod);
|
||||
|
||||
if(!dev_program->createGlobalVarObj(&amd_mem_obj_, &device_ptr_, &size_, name.c_str())) {
|
||||
if (!dev_program->createGlobalVarObj(&amd_mem_obj_, &device_ptr_, &size_, name.c_str())) {
|
||||
guarantee(false, "Cannot create GlobalVar Obj for symbol: %s", name.c_str());
|
||||
}
|
||||
|
||||
@@ -96,7 +85,7 @@ DeviceVar::~DeviceVar() {
|
||||
// ihipFree in hip::StatCO::removeFatBinary however in DynCO path, it seems to bypass
|
||||
// ihipFree and hence it needs to be removed+released here. In order to avoid issue with
|
||||
// StatCO, It is better to check if mem obj is found.
|
||||
if (amd::MemObjMap::FindMemObj(device_ptr_) != nullptr && amd_mem_obj_ != nullptr) {
|
||||
if (amd::MemObjMap::FindMemObj(device_ptr_) != nullptr && amd_mem_obj_ != nullptr) {
|
||||
amd::MemObjMap::RemoveMemObj(device_ptr_);
|
||||
amd_mem_obj_->release();
|
||||
}
|
||||
@@ -111,12 +100,12 @@ DeviceVar::~DeviceVar() {
|
||||
size_ = 0;
|
||||
}
|
||||
|
||||
//Device Functions
|
||||
DeviceFunc::DeviceFunc(std::string name, hipModule_t hmod) : dflock_("function lock"),
|
||||
name_(name), kernel_(nullptr) {
|
||||
// Device Functions
|
||||
DeviceFunc::DeviceFunc(std::string name, hipModule_t hmod)
|
||||
: dflock_("function lock"), name_(name), kernel_(nullptr) {
|
||||
amd::Program* program = as_amd(reinterpret_cast<cl_program>(hmod));
|
||||
|
||||
const amd::Symbol *symbol = program->findSymbol(name.c_str());
|
||||
const amd::Symbol* symbol = program->findSymbol(name.c_str());
|
||||
guarantee(symbol != nullptr, "Cannot find Symbol with name: %s", name.c_str());
|
||||
|
||||
kernel_ = new amd::Kernel(*program, *symbol, name);
|
||||
@@ -129,9 +118,9 @@ DeviceFunc::~DeviceFunc() {
|
||||
}
|
||||
}
|
||||
|
||||
//Abstract functions
|
||||
// Abstract functions
|
||||
Function::Function(const std::string& name, FatBinaryInfo** modules)
|
||||
: name_(name), modules_(modules) {
|
||||
: name_(name), modules_(modules) {
|
||||
dFunc_.resize(g_devices.size());
|
||||
}
|
||||
|
||||
@@ -180,7 +169,6 @@ hipError_t Function::getStatFunc(hipFunction_t* hfunc, int deviceId) {
|
||||
}
|
||||
|
||||
hipError_t Function::getStatFuncAttr(hipFuncAttributes* func_attr, int deviceId) {
|
||||
|
||||
if (modules_ == nullptr || *modules_ == nullptr) {
|
||||
return hipErrorInvalidDeviceFunction;
|
||||
}
|
||||
@@ -198,9 +186,9 @@ hipError_t Function::getStatFuncAttr(hipFuncAttributes* func_attr, int deviceId)
|
||||
amd::Kernel* kernel = dFunc_[deviceId]->kernel();
|
||||
auto* device_handle = devices[deviceId];
|
||||
const device::Kernel::WorkGroupInfo* wginfo =
|
||||
kernel->getDeviceKernel(*device_handle)->workGroupInfo();
|
||||
int binaryVersion = device_handle->isa().versionMajor() * 10 +
|
||||
device_handle->isa().versionMinor();
|
||||
kernel->getDeviceKernel(*device_handle)->workGroupInfo();
|
||||
int binaryVersion =
|
||||
device_handle->isa().versionMajor() * 10 + device_handle->isa().versionMinor();
|
||||
func_attr->sharedSizeBytes = static_cast<int>(wginfo->localMemSize_);
|
||||
func_attr->binaryVersion = binaryVersion;
|
||||
func_attr->cacheModeCA = 0;
|
||||
@@ -214,10 +202,17 @@ hipError_t Function::getStatFuncAttr(hipFuncAttributes* func_attr, int deviceId)
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
//Abstract Vars
|
||||
// Abstract Vars
|
||||
Var::Var(const std::string& name, DeviceVarKind dVarKind, size_t size, int type, int norm,
|
||||
FatBinaryInfo** modules) : name_(name), dVarKind_(dVarKind), size_(size),
|
||||
type_(type), norm_(norm), modules_(modules), managedVarPtr_(nullptr), align_(0) {
|
||||
FatBinaryInfo** modules)
|
||||
: name_(name),
|
||||
dVarKind_(dVarKind),
|
||||
size_(size),
|
||||
type_(type),
|
||||
norm_(norm),
|
||||
modules_(modules),
|
||||
managedVarPtr_(nullptr),
|
||||
align_(0) {
|
||||
dVar_.resize(g_devices.size());
|
||||
}
|
||||
|
||||
@@ -246,8 +241,7 @@ hipError_t Var::getDeviceVarPtr(DeviceVar** dvar, int deviceId) {
|
||||
guarantee((deviceId >= 0), "Invalid DeviceId, less than zero");
|
||||
guarantee((static_cast<size_t>(deviceId) < g_devices.size()),
|
||||
"Invalid DeviceId, greater than no of code objects");
|
||||
guarantee((dVar_.size() == g_devices.size()),
|
||||
"Device Var not initialized to size");
|
||||
guarantee((dVar_.size() == g_devices.size()), "Device Var not initialized to size");
|
||||
*dvar = dVar_[deviceId];
|
||||
return hipSuccess;
|
||||
}
|
||||
@@ -256,8 +250,7 @@ hipError_t Var::getDeviceVar(DeviceVar** dvar, int deviceId, hipModule_t hmod) {
|
||||
guarantee((deviceId >= 0), "Invalid DeviceId, less than zero");
|
||||
guarantee((static_cast<size_t>(deviceId) < g_devices.size()),
|
||||
"Invalid DeviceId, greater than no of code objects");
|
||||
guarantee((dVar_.size() == g_devices.size()),
|
||||
"Device Var not initialized to size");
|
||||
guarantee((dVar_.size() == g_devices.size()), "Device Var not initialized to size");
|
||||
|
||||
if (dVar_[deviceId] == nullptr) {
|
||||
dVar_[deviceId] = new DeviceVar(name_, hmod, deviceId);
|
||||
@@ -268,7 +261,7 @@ hipError_t Var::getDeviceVar(DeviceVar** dvar, int deviceId, hipModule_t hmod) {
|
||||
}
|
||||
|
||||
hipError_t Var::getStatDeviceVar(DeviceVar** dvar, int deviceId) {
|
||||
guarantee((deviceId >= 0) , "Invalid DeviceId, less than zero");
|
||||
guarantee((deviceId >= 0), "Invalid DeviceId, less than zero");
|
||||
guarantee((static_cast<size_t>(deviceId) < g_devices.size()),
|
||||
"Invalid DeviceId, greater than no of code objects");
|
||||
if (dVar_[deviceId] == nullptr) {
|
||||
@@ -285,14 +278,14 @@ hipError_t Var::allocateManagedVarPtr() {
|
||||
void** pointer = static_cast<void**>(managedVarPtr_);
|
||||
// check if it is deffered allocation
|
||||
if (!allocFlag_) {
|
||||
// Allocate managed memory for this var
|
||||
const bool use_host_ptr = true;
|
||||
IHIP_RETURN_ONFAIL(ihipMallocManaged(pointer, size_, align_, use_host_ptr));
|
||||
allocFlag_ = true;
|
||||
// Allocate managed memory for this var
|
||||
const bool use_host_ptr = true;
|
||||
IHIP_RETURN_ONFAIL(ihipMallocManaged(pointer, size_, align_, use_host_ptr));
|
||||
allocFlag_ = true;
|
||||
}
|
||||
if (dVar_.empty()) {
|
||||
resize_dVar(g_devices.size());
|
||||
resize_dVar(g_devices.size());
|
||||
}
|
||||
return hipSuccess;
|
||||
}
|
||||
}; //namespace: hip
|
||||
}; // namespace hip
|
||||
|
||||
@@ -34,79 +34,74 @@ THE SOFTWARE.
|
||||
|
||||
namespace hip {
|
||||
|
||||
//Forward Declaration
|
||||
// Forward Declaration
|
||||
class CodeObject;
|
||||
|
||||
//Device Structures
|
||||
// Device Structures
|
||||
class DeviceVar {
|
||||
public:
|
||||
public:
|
||||
DeviceVar(std::string name, hipModule_t hmod, int deviceId);
|
||||
~DeviceVar();
|
||||
|
||||
//Accessors for device ptr and size, populated during constructor.
|
||||
// Accessors for device ptr and size, populated during constructor.
|
||||
hipDeviceptr_t device_ptr() const { return device_ptr_; }
|
||||
size_t size() const { return size_; }
|
||||
std::string name() const { return name_; }
|
||||
void* shadowVptr;
|
||||
|
||||
private:
|
||||
std::string name_; //Name of the var
|
||||
amd::Memory* amd_mem_obj_; //amd_mem_obj abstraction
|
||||
hipDeviceptr_t device_ptr_; //Device Pointer
|
||||
size_t size_; //Size of the var
|
||||
private:
|
||||
std::string name_; // Name of the var
|
||||
amd::Memory* amd_mem_obj_; // amd_mem_obj abstraction
|
||||
hipDeviceptr_t device_ptr_; // Device Pointer
|
||||
size_t size_; // Size of the var
|
||||
};
|
||||
|
||||
class DeviceFunc {
|
||||
public:
|
||||
public:
|
||||
DeviceFunc(std::string name, hipModule_t hmod);
|
||||
~DeviceFunc();
|
||||
|
||||
amd::Monitor dflock_;
|
||||
|
||||
//Converts DeviceFunc to hipFunction_t(used by app) and vice versa.
|
||||
// Converts DeviceFunc to hipFunction_t(used by app) and vice versa.
|
||||
hipFunction_t asHipFunction() { return reinterpret_cast<hipFunction_t>(this); }
|
||||
static DeviceFunc* asFunction(hipFunction_t f) { return reinterpret_cast<DeviceFunc*>(f); }
|
||||
|
||||
//Accessor for kernel_ and name_ populated during constructor.
|
||||
// Accessor for kernel_ and name_ populated during constructor.
|
||||
std::string name() const { return name_; }
|
||||
amd::Kernel* kernel() const { return kernel_; }
|
||||
|
||||
private:
|
||||
std::string name_; //name of the func(not unique identifier)
|
||||
amd::Kernel* kernel_; //Kernel ptr referencing to ROCclr Symbol
|
||||
private:
|
||||
std::string name_; // name of the func(not unique identifier)
|
||||
amd::Kernel* kernel_; // Kernel ptr referencing to ROCclr Symbol
|
||||
};
|
||||
|
||||
//Abstract Structures
|
||||
// Abstract Structures
|
||||
class Function {
|
||||
public:
|
||||
Function(const std::string& name, FatBinaryInfo** modules=nullptr);
|
||||
public:
|
||||
Function(const std::string& name, FatBinaryInfo** modules = nullptr);
|
||||
~Function();
|
||||
|
||||
//Return DeviceFunc for this this dynamically loaded module
|
||||
// Return DeviceFunc for this this dynamically loaded module
|
||||
hipError_t getDynFunc(hipFunction_t* hfunc, hipModule_t hmod);
|
||||
bool isValidDynFunc(const void* hfunc);
|
||||
//Return Device Func & attr . Generate/build if not already done so.
|
||||
hipError_t getStatFunc(hipFunction_t *hfunc, int deviceId);
|
||||
// Return Device Func & attr . Generate/build if not already done so.
|
||||
hipError_t getStatFunc(hipFunction_t* hfunc, int deviceId);
|
||||
hipError_t getStatFuncAttr(hipFuncAttributes* func_attr, int deviceId);
|
||||
void resize_dFunc(size_t size) { dFunc_.resize(size); }
|
||||
FatBinaryInfo** moduleInfo() { return modules_; }
|
||||
const std::string& name() const { return name_; }
|
||||
|
||||
private:
|
||||
private:
|
||||
std::vector<DeviceFunc*> dFunc_; //!< DeviceFuncObj per Device
|
||||
std::string name_; //!< name of the func(not unique identifier)
|
||||
FatBinaryInfo** modules_; //!< static module where it is referenced
|
||||
};
|
||||
|
||||
class Var {
|
||||
public:
|
||||
//Types of variable
|
||||
enum DeviceVarKind {
|
||||
DVK_Variable = 0,
|
||||
DVK_Surface,
|
||||
DVK_Texture,
|
||||
DVK_Managed
|
||||
};
|
||||
public:
|
||||
// Types of variable
|
||||
enum DeviceVarKind { DVK_Variable = 0, DVK_Surface, DVK_Texture, DVK_Managed };
|
||||
|
||||
Var(const std::string& name, DeviceVarKind dVarKind, size_t size, int type, int norm,
|
||||
FatBinaryInfo** modules = nullptr);
|
||||
@@ -116,10 +111,10 @@ public:
|
||||
|
||||
~Var();
|
||||
|
||||
//Return DeviceVar for this dynamically loaded module
|
||||
// Return DeviceVar for this dynamically loaded module
|
||||
hipError_t getDeviceVar(DeviceVar** dvar, int deviceId, hipModule_t hmod);
|
||||
|
||||
//Return DeviceVar for module Generate/build if not already done so.
|
||||
// Return DeviceVar for module Generate/build if not already done so.
|
||||
hipError_t getStatDeviceVar(DeviceVar** dvar, int deviceId);
|
||||
|
||||
hipError_t getDeviceVarPtr(DeviceVar** dvar, int deviceId);
|
||||
@@ -127,7 +122,7 @@ public:
|
||||
hipError_t allocateManagedVarPtr();
|
||||
|
||||
void resize_dVar(size_t size) { dVar_.resize(size); }
|
||||
//bool isEmpty_dVar() const { return dVar_.empty(); }
|
||||
// bool isEmpty_dVar() const { return dVar_.empty(); }
|
||||
|
||||
|
||||
FatBinaryInfo** moduleInfo() { return modules_; };
|
||||
@@ -146,17 +141,17 @@ public:
|
||||
void setAllocFlag(bool val) { allocFlag_ = val; }
|
||||
|
||||
private:
|
||||
std::vector<DeviceVar*> dVar_; // DeviceVarObj per Device
|
||||
std::string name_; // Variable name (not unique identifier)
|
||||
DeviceVarKind dVarKind_; // Variable kind
|
||||
size_t size_; // Size of the variable
|
||||
int type_; // Type(Textures/Surfaces only)
|
||||
int norm_; // Type(Textures/Surfaces only)
|
||||
FatBinaryInfo** modules_; // static module where it is referenced
|
||||
void* managedVarPtr_; // Managed memory pointer with size_ & align_
|
||||
size_t align_; // Managed memory alignment
|
||||
bool allocFlag_; // 0 : host alloc, 1: managed alloc
|
||||
std::vector<DeviceVar*> dVar_; // DeviceVarObj per Device
|
||||
std::string name_; // Variable name (not unique identifier)
|
||||
DeviceVarKind dVarKind_; // Variable kind
|
||||
size_t size_; // Size of the variable
|
||||
int type_; // Type(Textures/Surfaces only)
|
||||
int norm_; // Type(Textures/Surfaces only)
|
||||
FatBinaryInfo** modules_; // static module where it is referenced
|
||||
void* managedVarPtr_; // Managed memory pointer with size_ & align_
|
||||
size_t align_; // Managed memory alignment
|
||||
bool allocFlag_; // 0 : host alloc, 1: managed alloc
|
||||
};
|
||||
|
||||
}; //namespace: hip
|
||||
}; // namespace hip
|
||||
#endif /* HIP_GLOBAL_HPP */
|
||||
|
||||
Το diff αρχείου καταστέλλεται επειδή είναι πολύ μεγάλο
Φόρτωση Διαφορών
@@ -108,7 +108,8 @@ hipError_t capturehipMemset3DAsync(hipStream_t& stream, hipPitchedPtr& pitchedDe
|
||||
|
||||
hipError_t capturehipLaunchHostFunc(hipStream_t& stream, hipHostFn_t& fn, void*& userData);
|
||||
|
||||
hipError_t capturehipMallocAsync(hipStream_t stream, hipMemPool_t mem_pool, size_t size, void** dev_ptr);
|
||||
hipError_t capturehipMallocAsync(hipStream_t stream, hipMemPool_t mem_pool, size_t size,
|
||||
void** dev_ptr);
|
||||
|
||||
hipError_t capturehipFreeAsync(hipStream_t stream, void* dev_ptr);
|
||||
}
|
||||
} // namespace hip
|
||||
@@ -114,12 +114,12 @@ hipError_t ihipMemcpyAtoAValidate(hipArray_t srcArray, hipArray_t dstArray, amd:
|
||||
hipError_t ihipMemcpyHtoAValidate(const void* srcHost, hipArray_t dstArray, amd::Coord3D& srcOrigin,
|
||||
amd::Coord3D& dstOrigin, amd::Coord3D& copyRegion,
|
||||
size_t srcRowPitch, size_t srcSlicePitch, amd::Image*& dstImage,
|
||||
size_t &start);
|
||||
size_t& start);
|
||||
|
||||
hipError_t ihipMemcpyAtoHValidate(hipArray_t srcArray, void* dstHost, amd::Coord3D& srcOrigin,
|
||||
amd::Coord3D& dstOrigin, amd::Coord3D& copyRegion,
|
||||
size_t dstRowPitch, size_t dstSlicePitch, amd::Image*& srcImage,
|
||||
size_t &start);
|
||||
size_t& start);
|
||||
|
||||
hipError_t ihipGraphMemsetParams_validate(const hipMemsetParams* pNodeParams);
|
||||
|
||||
|
||||
@@ -48,7 +48,7 @@ const char* GetGraphNodeTypeString(uint32_t op) {
|
||||
};
|
||||
return case_string;
|
||||
};
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace hip {
|
||||
|
||||
@@ -71,7 +71,7 @@ amd::Monitor UserObject::UserObjectLock_{};
|
||||
amd::Monitor GraphNode::WorkerThreadLock_{};
|
||||
|
||||
hipError_t GraphMemcpyNode1D::ValidateParams(void* dst, const void* src, size_t count,
|
||||
hipMemcpyKind kind) {
|
||||
hipMemcpyKind kind) {
|
||||
hipError_t status = ihipMemcpy_validate(dst, src, count, kind);
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
@@ -196,7 +196,7 @@ void Graph::ScheduleOneNode(Node node, int stream_id) {
|
||||
reinterpret_cast<hip::ChildGraphNode*>(node)->GraphExec::TopologicalOrder();
|
||||
}
|
||||
}
|
||||
for (auto edge: node->GetEdges()) {
|
||||
for (auto edge : node->GetEdges()) {
|
||||
ScheduleOneNode(edge, stream_id);
|
||||
// 1. Each extra edge will get a new stream from the pool
|
||||
// 2. Streams will be reused if the number of edges > streams
|
||||
@@ -238,7 +238,7 @@ bool Graph::TopologicalOrder(std::vector<Node>& TopoOrder) {
|
||||
std::unordered_map<Node, int> inDegree;
|
||||
for (auto entry : vertices_) {
|
||||
// Update the dependencies if a signal is required
|
||||
for (auto dep: entry->GetDependencies()) {
|
||||
for (auto dep : entry->GetDependencies()) {
|
||||
// Check if the stream ID doesn't match and enable signal
|
||||
if (dep->stream_id_ != entry->stream_id_) {
|
||||
dep->signal_is_required_ = true;
|
||||
@@ -250,8 +250,7 @@ bool Graph::TopologicalOrder(std::vector<Node>& TopoOrder) {
|
||||
}
|
||||
inDegree[entry] = entry->GetInDegree();
|
||||
}
|
||||
while (!q.empty())
|
||||
{
|
||||
while (!q.empty()) {
|
||||
Node node = q.front();
|
||||
TopoOrder.push_back(node);
|
||||
q.pop();
|
||||
@@ -308,7 +307,7 @@ void Graph::clone(Graph* newGraph, bool cloneNodes) const {
|
||||
memcpy(&newGraph->roots_[0], &roots_[0], sizeof(Node) * roots_.size());
|
||||
}
|
||||
newGraph->memAllocNodePtrs_ = memAllocNodePtrs_;
|
||||
if(!cloneNodes) {
|
||||
if (!cloneNodes) {
|
||||
newGraph->clonedNodes_.clear();
|
||||
}
|
||||
}
|
||||
@@ -333,8 +332,8 @@ bool GraphExec::isGraphExecValid(GraphExec* pGraphExec) {
|
||||
hipError_t GraphExec::CreateStreams(uint32_t num_streams) {
|
||||
parallel_streams_.reserve(num_streams);
|
||||
for (uint32_t i = 0; i < num_streams; ++i) {
|
||||
auto stream = new hip::Stream(hip::getCurrentDevice(),
|
||||
hip::Stream::Priority::Normal, hipStreamNonBlocking);
|
||||
auto stream = new hip::Stream(hip::getCurrentDevice(), hip::Stream::Priority::Normal,
|
||||
hipStreamNonBlocking);
|
||||
if (stream == nullptr || !stream->Create()) {
|
||||
if (stream != nullptr) {
|
||||
hip::Stream::Destroy(stream);
|
||||
@@ -364,7 +363,7 @@ hipError_t GraphExec::Init() {
|
||||
}
|
||||
}
|
||||
instantiateDeviceId_ = hip::getCurrentDevice()->deviceId();
|
||||
static_cast<ReferenceCountedObject*>( hip::getCurrentDevice())->retain();
|
||||
static_cast<ReferenceCountedObject*>(hip::getCurrentDevice())->retain();
|
||||
return status;
|
||||
}
|
||||
|
||||
@@ -523,8 +522,8 @@ bool Graph::RunOneNode(Node node, bool wait) {
|
||||
if (depNode->stream_id_ != node->stream_id_) {
|
||||
// If there is no wait node on the stream, then assign one
|
||||
if ((wait_order_[depNode->stream_id_] == nullptr) ||
|
||||
// If another node executed on the same stream, then use the latest launch only,
|
||||
// since the same stream has in-order run
|
||||
// If another node executed on the same stream, then use the latest launch only,
|
||||
// since the same stream has in-order run
|
||||
(wait_order_[depNode->stream_id_]->launch_id_ < depNode->launch_id_)) {
|
||||
wait_order_[depNode->stream_id_] = depNode;
|
||||
}
|
||||
@@ -579,10 +578,11 @@ bool Graph::RunOneNode(Node node, bool wait) {
|
||||
node->launch_id_ = current_id_++;
|
||||
uint32_t i = 0;
|
||||
// Execute the nodes in the edges list
|
||||
for (auto edge: node->GetEdges()) {
|
||||
for (auto edge : node->GetEdges()) {
|
||||
// Don't wait in the nodes, executed on the same streams and if it has just one dependency
|
||||
bool wait = ((i < DEBUG_HIP_FORCE_GRAPH_QUEUES) ||
|
||||
(edge->GetDependencies().size() > 1)) ? true : false;
|
||||
bool wait = ((i < DEBUG_HIP_FORCE_GRAPH_QUEUES) || (edge->GetDependencies().size() > 1))
|
||||
? true
|
||||
: false;
|
||||
// Execute the edge node
|
||||
if (!RunOneNode(edge, wait)) {
|
||||
return false;
|
||||
@@ -599,11 +599,8 @@ bool Graph::RunOneNode(Node node, bool wait) {
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool Graph::RunNodes(
|
||||
int32_t base_stream,
|
||||
const std::vector<hip::Stream*>* parallel_streams,
|
||||
const amd::Command::EventWaitList* parent_waitlist) {
|
||||
|
||||
bool Graph::RunNodes(int32_t base_stream, const std::vector<hip::Stream*>* parallel_streams,
|
||||
const amd::Command::EventWaitList* parent_waitlist) {
|
||||
if (parallel_streams != nullptr) {
|
||||
streams_ = *parallel_streams;
|
||||
}
|
||||
@@ -700,9 +697,9 @@ hipError_t GraphExec::Run(hip::Stream* launch_stream) {
|
||||
// If this is a repeat launch, make sure corresponding MemFreeNode exists for a MemAlloc node
|
||||
if (repeatLaunch_ == true) {
|
||||
if (!topoOrder_.empty() && topoOrder_[0]->GetParentGraph()->GetMemAllocNodeCount() > 0) {
|
||||
return hipErrorInvalidValue;
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
} else {
|
||||
} else {
|
||||
repeatLaunch_ = true;
|
||||
}
|
||||
|
||||
|
||||
@@ -271,8 +271,8 @@ class GraphNode : public hipGraphNodeDOTAttribute {
|
||||
virtual void SetStream(hip::Stream* stream) { stream_ = stream; }
|
||||
//! Updates the grpah node with the execution stream
|
||||
void SetStream(
|
||||
const std::vector<hip::Stream*>& streams //!< A pool of streams to use in graph's execution
|
||||
) {
|
||||
const std::vector<hip::Stream*>& streams //!< A pool of streams to use in graph's execution
|
||||
) {
|
||||
assert(stream_id_ != -1 && "Stream ID wasn't initialized");
|
||||
stream_ = streams[stream_id_];
|
||||
// Reset the launch ID after the stream assignment
|
||||
@@ -594,31 +594,29 @@ class Graph {
|
||||
|
||||
//! Schedules one node on a vitual stream.
|
||||
//! It will also process the nodes in edges, using recursion
|
||||
void ScheduleOneNode(
|
||||
Node node, //!< Node for scheduling on a virtual stream
|
||||
int stream_id //!< Current active virtual stream to use for scheduling
|
||||
);
|
||||
void ScheduleOneNode(Node node, //!< Node for scheduling on a virtual stream
|
||||
int stream_id //!< Current active virtual stream to use for scheduling
|
||||
);
|
||||
|
||||
//! Schedules all nodes in the graph into different streams
|
||||
void ScheduleNodes();
|
||||
|
||||
//! Update streams for the graph execution
|
||||
void UpdateStreams(
|
||||
hip::Stream* launch_stream, //!< Launch stream from the application
|
||||
const std::vector<hip::Stream*>& parallel_stream //!< The list of parallel streams
|
||||
hip::Stream* launch_stream, //!< Launch stream from the application
|
||||
const std::vector<hip::Stream*>& parallel_stream //!< The list of parallel streams
|
||||
);
|
||||
|
||||
//! Runs one node on the assigned stream
|
||||
bool RunOneNode(
|
||||
Node node, //!< Node for the execution on GPU
|
||||
bool wait //!< Wait dependencies
|
||||
);
|
||||
bool RunOneNode(Node node, //!< Node for the execution on GPU
|
||||
bool wait //!< Wait dependencies
|
||||
);
|
||||
|
||||
//! Runs all nodes from the execution graph on the assigned streams
|
||||
bool RunNodes(
|
||||
int32_t base_stream = 0, //!< The base stream to run the graph on
|
||||
const std::vector<hip::Stream*>* streams = nullptr, //!< Streams to run the graph
|
||||
const amd::Command::EventWaitList* parent_waitlist = nullptr //!< Parent Graph waitlist
|
||||
int32_t base_stream = 0, //!< The base stream to run the graph on
|
||||
const std::vector<hip::Stream*>* streams = nullptr, //!< Streams to run the graph
|
||||
const amd::Command::EventWaitList* parent_waitlist = nullptr //!< Parent Graph waitlist
|
||||
);
|
||||
|
||||
bool TopologicalOrder(std::vector<Node>& TopoOrder);
|
||||
@@ -627,7 +625,7 @@ class Graph {
|
||||
Graph* clone() const;
|
||||
void GenerateDOT(std::ostream& fout, hipGraphDebugDotFlags flag) {
|
||||
fout << "subgraph cluster_" << GetID() << " {" << std::endl;
|
||||
fout << "label=\"graph_" << GetID() <<"\"graph[style=\"dashed\"];\n";
|
||||
fout << "label=\"graph_" << GetID() << "\"graph[style=\"dashed\"];\n";
|
||||
for (auto node : vertices_) {
|
||||
node->GenerateDOTNode(GetID(), fout, flag);
|
||||
}
|
||||
@@ -654,7 +652,7 @@ class Graph {
|
||||
size = amd::alignUp(size, dev_info.virtualMemAllocGranularity_);
|
||||
// Single virtual alloc would reserve for all devices.
|
||||
ptr = g_devices[0]->devices()[0]->virtualAlloc(startAddress, size,
|
||||
dev_info.virtualMemAllocGranularity_);
|
||||
dev_info.virtualMemAllocGranularity_);
|
||||
if (ptr == nullptr) {
|
||||
LogError("Failed to reserve Virtual Address");
|
||||
}
|
||||
@@ -693,17 +691,11 @@ class Graph {
|
||||
return false;
|
||||
}
|
||||
|
||||
void FreeAllMemory(hip::Stream* stream) {
|
||||
mem_pool_->FreeAllMemory(stream);
|
||||
}
|
||||
void FreeAllMemory(hip::Stream* stream) { mem_pool_->FreeAllMemory(stream); }
|
||||
|
||||
bool IsGraphInstantiated() const {
|
||||
return graphInstantiated_;
|
||||
}
|
||||
bool IsGraphInstantiated() const { return graphInstantiated_; }
|
||||
|
||||
void SetGraphInstantiated(bool graphInstantiate) {
|
||||
graphInstantiated_ = graphInstantiate;
|
||||
}
|
||||
void SetGraphInstantiated(bool graphInstantiate) { graphInstantiated_ = graphInstantiate; }
|
||||
|
||||
//! returns count of unreleased memalloc nodes
|
||||
uint32_t GetMemAllocNodeCount() const { return memalloc_nodes_; }
|
||||
@@ -798,9 +790,7 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
|
||||
void SetKernelArgManager(GraphKernelArgManager* kernArgManager) {
|
||||
kernArgManager_ = kernArgManager;
|
||||
}
|
||||
GraphKernelArgManager* GetKernelArgManager() {
|
||||
return kernArgManager_;
|
||||
}
|
||||
GraphKernelArgManager* GetKernelArgManager() { return kernArgManager_; }
|
||||
static void DecrementRefCount(cl_event event, cl_int command_exec_status, void* user_data);
|
||||
hipError_t AllocKernelArgForGraphNode();
|
||||
void GetKernelArgSizeForGraph(size_t& kernArgSizeForGraph);
|
||||
@@ -838,13 +828,9 @@ class ChildGraphNode : public GraphNode, public GraphExec {
|
||||
|
||||
bool GetGraphCaptureStatus() { return graphCaptureStatus_; }
|
||||
|
||||
std::vector<Node>& GetChildGraphNodeOrder() {
|
||||
return topoOrder_;
|
||||
}
|
||||
std::vector<Node>& GetChildGraphNodeOrder() { return topoOrder_; }
|
||||
|
||||
void SetStream(hip::Stream* stream) override {
|
||||
stream_ = stream;
|
||||
}
|
||||
void SetStream(hip::Stream* stream) override { stream_ = stream; }
|
||||
|
||||
bool TopologicalOrder(std::vector<Node>& TopoOrder) override {
|
||||
return Graph::TopologicalOrder(TopoOrder);
|
||||
@@ -856,8 +842,7 @@ class ChildGraphNode : public GraphNode, public GraphExec {
|
||||
} else if (max_streams_ == 1) {
|
||||
for (int i = 0; i < topoOrder_.size(); i++) {
|
||||
topoOrder_[i]->SetStream(stream_);
|
||||
hipError_t status =
|
||||
topoOrder_[i]->CreateCommand(topoOrder_[i]->GetQueue());
|
||||
hipError_t status = topoOrder_[i]->CreateCommand(topoOrder_[i]->GetQueue());
|
||||
topoOrder_[i]->EnqueueCommands(stream_);
|
||||
}
|
||||
}
|
||||
@@ -964,36 +949,30 @@ class GraphKernelNode : public GraphNode {
|
||||
"%u}\n| {priority | %d}\n}",
|
||||
label_, GetID(), function->name().c_str(), kernelParams_.gridDim.x,
|
||||
kernelParams_.gridDim.y, kernelParams_.gridDim.z, kernelParams_.blockDim.x,
|
||||
kernelParams_.blockDim.y, kernelParams_.blockDim.z,
|
||||
kernelParams_.sharedMemBytes, this, kernelParams_.func,
|
||||
kernelAttr_.accessPolicyWindow.base_ptr, kernelAttr_.accessPolicyWindow.num_bytes,
|
||||
kernelAttr_.accessPolicyWindow.hitRatio, kernelAttr_.accessPolicyWindow.hitProp,
|
||||
kernelAttr_.accessPolicyWindow.missProp, kernelAttr_.cooperative,
|
||||
kernelAttr_.priority);
|
||||
kernelParams_.blockDim.y, kernelParams_.blockDim.z, kernelParams_.sharedMemBytes,
|
||||
this, kernelParams_.func, kernelAttr_.accessPolicyWindow.base_ptr,
|
||||
kernelAttr_.accessPolicyWindow.num_bytes, kernelAttr_.accessPolicyWindow.hitRatio,
|
||||
kernelAttr_.accessPolicyWindow.hitProp, kernelAttr_.accessPolicyWindow.missProp,
|
||||
kernelAttr_.cooperative, kernelAttr_.priority);
|
||||
label = buffer;
|
||||
}
|
||||
else if (flag == hipGraphDebugDotFlagsKernelNodeAttributes) {
|
||||
} else if (flag == hipGraphDebugDotFlagsKernelNodeAttributes) {
|
||||
sprintf(buffer,
|
||||
"{\n%s\n| {ID | %d | %s}\n"
|
||||
"| {accessPolicyWindow | {base_ptr | num_bytes | "
|
||||
"hitRatio | hitProp | missProp} |\n| {%p | %zu | %f | %d | %d}}\n| {cooperative | "
|
||||
"%u}\n| {priority | %d}\n}",
|
||||
label_, GetID(), function->name().c_str(),
|
||||
kernelAttr_.accessPolicyWindow.base_ptr, kernelAttr_.accessPolicyWindow.num_bytes,
|
||||
kernelAttr_.accessPolicyWindow.hitRatio, kernelAttr_.accessPolicyWindow.hitProp,
|
||||
kernelAttr_.accessPolicyWindow.missProp, kernelAttr_.cooperative,
|
||||
kernelAttr_.priority);
|
||||
label_, GetID(), function->name().c_str(), kernelAttr_.accessPolicyWindow.base_ptr,
|
||||
kernelAttr_.accessPolicyWindow.num_bytes, kernelAttr_.accessPolicyWindow.hitRatio,
|
||||
kernelAttr_.accessPolicyWindow.hitProp, kernelAttr_.accessPolicyWindow.missProp,
|
||||
kernelAttr_.cooperative, kernelAttr_.priority);
|
||||
label = buffer;
|
||||
}
|
||||
else if (flag == hipGraphDebugDotFlagsKernelNodeParams) {
|
||||
sprintf(buffer, "%d\n%s\n\\<\\<\\<(%u,%u,%u),(%u,%u,%u),%u\\>\\>\\>",
|
||||
GetID(), function->name().c_str(), kernelParams_.gridDim.x,
|
||||
kernelParams_.gridDim.y, kernelParams_.gridDim.z,
|
||||
kernelParams_.blockDim.x, kernelParams_.blockDim.y,
|
||||
} else if (flag == hipGraphDebugDotFlagsKernelNodeParams) {
|
||||
sprintf(buffer, "%d\n%s\n\\<\\<\\<(%u,%u,%u),(%u,%u,%u),%u\\>\\>\\>", GetID(),
|
||||
function->name().c_str(), kernelParams_.gridDim.x, kernelParams_.gridDim.y,
|
||||
kernelParams_.gridDim.z, kernelParams_.blockDim.x, kernelParams_.blockDim.y,
|
||||
kernelParams_.blockDim.z, kernelParams_.sharedMemBytes);
|
||||
label = buffer;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
label = std::to_string(GetID()) + "\n" + function->name() + "\n";
|
||||
}
|
||||
return label;
|
||||
@@ -1096,7 +1075,7 @@ class GraphKernelNode : public GraphNode {
|
||||
GraphKernelNode(const hipKernelNodeParams* pNodeParams, const ihipExtKernelEvents* pEvents,
|
||||
int coopKernel = 0)
|
||||
: GraphNode(hipGraphNodeTypeKernel, "bold", "octagon", "KERNEL") {
|
||||
kernelEvents_ = { 0 };
|
||||
kernelEvents_ = {0};
|
||||
if (pEvents != nullptr) {
|
||||
kernelEvents_ = *pEvents;
|
||||
}
|
||||
@@ -1173,7 +1152,7 @@ class GraphKernelNode : public GraphNode {
|
||||
if (DEBUG_HIP_FORCE_ASYNC_QUEUE) {
|
||||
// If there is one dependency, but many edges, then execute this node in any order
|
||||
if (((dependencies_.size() == 1) && (dependencies_[0]->GetEdges().size() > 1) &&
|
||||
(DEBUG_HIP_FORCE_GRAPH_QUEUES == 1))) {
|
||||
(DEBUG_HIP_FORCE_GRAPH_QUEUES == 1))) {
|
||||
// Makes sure the first node in the edges will have a barrier always
|
||||
if (dependencies_[0]->GetEdges()[0] != this) {
|
||||
flags = hipExtAnyOrderLaunch;
|
||||
@@ -1217,7 +1196,7 @@ class GraphKernelNode : public GraphNode {
|
||||
return status;
|
||||
}
|
||||
if ((kernelParams_.kernelParams && kernelParams_.kernelParams == params->kernelParams) ||
|
||||
(kernelParams_.extra && kernelParams_.extra == params->extra)) {
|
||||
(kernelParams_.extra && kernelParams_.extra == params->extra)) {
|
||||
// params is copied from kernelParams_ and then updated, so just copy it back
|
||||
kernelParams_ = *params;
|
||||
return status;
|
||||
@@ -1235,14 +1214,13 @@ class GraphKernelNode : public GraphNode {
|
||||
// Update device ID since new params may require validation for the current device.
|
||||
dev_id_ = ihipGetDevice();
|
||||
hipError_t status = ihipGetDeviceProperties(&prop, dev_id_);
|
||||
if (hipSuccess != status){
|
||||
if (hipSuccess != status) {
|
||||
return status;
|
||||
}
|
||||
int accessPolicyMaxWindowSize = prop.accessPolicyMaxWindowSize;
|
||||
// updates kernel attr params
|
||||
if (attr == hipKernelNodeAttributeAccessPolicyWindow) {
|
||||
if (params->accessPolicyWindow.hitRatio > 1 ||
|
||||
params->accessPolicyWindow.hitRatio < 0) {
|
||||
if (params->accessPolicyWindow.hitRatio > 1 || params->accessPolicyWindow.hitRatio < 0) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
@@ -1268,7 +1246,7 @@ class GraphKernelNode : public GraphNode {
|
||||
kernelAttr_.cooperative = params->cooperative;
|
||||
} else if (attr == hipLaunchAttributePriority) {
|
||||
if (params->priority < hip::Stream::Priority::Low ||
|
||||
params->priority > hip::Stream::Priority::High){
|
||||
params->priority > hip::Stream::Priority::High) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
kernelAttr_.priority = params->priority;
|
||||
@@ -1327,9 +1305,8 @@ class GraphKernelNode : public GraphNode {
|
||||
return SetParams(&kernelNode->kernelParams_);
|
||||
}
|
||||
|
||||
static hipError_t validateKernelParams(const hipKernelNodeParams* pNodeParams,
|
||||
hipFunction_t func, int devId) {
|
||||
|
||||
static hipError_t validateKernelParams(const hipKernelNodeParams* pNodeParams, hipFunction_t func,
|
||||
int devId) {
|
||||
amd::HIPLaunchParams launch_params(pNodeParams->gridDim.x, pNodeParams->gridDim.y,
|
||||
pNodeParams->gridDim.z, pNodeParams->blockDim.x,
|
||||
pNodeParams->blockDim.y, pNodeParams->blockDim.z,
|
||||
@@ -1372,15 +1349,13 @@ class GraphMemcpyNode : public GraphNode {
|
||||
}
|
||||
~GraphMemcpyNode() {}
|
||||
|
||||
GraphMemcpyNode(const GraphMemcpyNode& rhs) : GraphNode(rhs) {
|
||||
copyParams_ = rhs.copyParams_;
|
||||
}
|
||||
GraphMemcpyNode(const GraphMemcpyNode& rhs) : GraphNode(rhs) { copyParams_ = rhs.copyParams_; }
|
||||
|
||||
GraphNode* clone() const override { return new GraphMemcpyNode(*this); }
|
||||
|
||||
virtual hipError_t CreateCommand(hip::Stream* stream) override {
|
||||
if ((copyParams_.kind == hipMemcpyHostToHost || copyParams_.kind == hipMemcpyDefault)
|
||||
&& IsHtoHMemcpy(copyParams_.dstPtr.ptr, copyParams_.srcPtr.ptr)) {
|
||||
if ((copyParams_.kind == hipMemcpyHostToHost || copyParams_.kind == hipMemcpyDefault) &&
|
||||
IsHtoHMemcpy(copyParams_.dstPtr.ptr, copyParams_.srcPtr.ptr)) {
|
||||
return hipSuccess;
|
||||
}
|
||||
hipError_t status = GraphNode::CreateCommand(stream);
|
||||
@@ -1395,11 +1370,11 @@ class GraphMemcpyNode : public GraphNode {
|
||||
}
|
||||
|
||||
virtual void EnqueueCommands(hip::Stream* stream) override {
|
||||
if ( (copyParams_.kind == hipMemcpyHostToHost || copyParams_.kind == hipMemcpyDefault) &&
|
||||
isEnabled_ && IsHtoHMemcpy(copyParams_.dstPtr.ptr, copyParams_.srcPtr.ptr)) {
|
||||
ihipHtoHMemcpy(copyParams_.dstPtr.ptr, copyParams_.srcPtr.ptr,
|
||||
copyParams_.extent.width * copyParams_.extent.height *
|
||||
copyParams_.extent.depth, *stream);
|
||||
if ((copyParams_.kind == hipMemcpyHostToHost || copyParams_.kind == hipMemcpyDefault) &&
|
||||
isEnabled_ && IsHtoHMemcpy(copyParams_.dstPtr.ptr, copyParams_.srcPtr.ptr)) {
|
||||
ihipHtoHMemcpy(
|
||||
copyParams_.dstPtr.ptr, copyParams_.srcPtr.ptr,
|
||||
copyParams_.extent.width * copyParams_.extent.height * copyParams_.extent.depth, *stream);
|
||||
return;
|
||||
}
|
||||
GraphNode::EnqueueCommands(stream);
|
||||
@@ -1493,9 +1468,8 @@ class GraphMemcpyNode : public GraphNode {
|
||||
copyParams_.srcPtr.ptr, copyParams_.srcPtr.xsize, copyParams_.srcPtr.ysize,
|
||||
copyParams_.dstPtr.pitch, copyParams_.dstPtr.ptr, copyParams_.dstPtr.xsize,
|
||||
copyParams_.dstPtr.ysize, copyParams_.srcPos.x, copyParams_.srcPos.y,
|
||||
copyParams_.srcPos.z, copyParams_.dstPos.x, copyParams_.dstPos.y,
|
||||
copyParams_.dstPos.z, copyParams_.extent.width, copyParams_.extent.height,
|
||||
copyParams_.extent.depth);
|
||||
copyParams_.srcPos.z, copyParams_.dstPos.x, copyParams_.dstPos.y, copyParams_.dstPos.z,
|
||||
copyParams_.extent.width, copyParams_.extent.height, copyParams_.extent.depth);
|
||||
label = buffer;
|
||||
} else {
|
||||
label = std::to_string(GetID()) + "\nMEMCPY\n(" + memcpyDirection + ")";
|
||||
@@ -1593,7 +1567,7 @@ class GraphMemcpyNode1D : public GraphMemcpyNode {
|
||||
assert(commands_.size() == 1 && "Invalid command size in GraphMemcpyNode1D");
|
||||
}
|
||||
if (isEnabled_) {
|
||||
//HtoH
|
||||
// HtoH
|
||||
if (isH2H) {
|
||||
ihipHtoHMemcpy(dst_, src_, count_, *stream);
|
||||
return;
|
||||
@@ -1641,9 +1615,7 @@ class GraphMemcpyNode1D : public GraphMemcpyNode {
|
||||
}
|
||||
}
|
||||
|
||||
hipMemcpyKind GetMemcpyKind() const override {
|
||||
return kind_;
|
||||
}
|
||||
hipMemcpyKind GetMemcpyKind() const override { return kind_; }
|
||||
|
||||
hipError_t SetParams(void* dst, const void* src, size_t count, hipMemcpyKind kind) {
|
||||
hipError_t status = ValidateParams(dst, src, count, kind);
|
||||
@@ -1699,9 +1671,9 @@ class GraphMemcpyNode1D : public GraphMemcpyNode {
|
||||
"| %zu}}\n| {{srcPos | {{x | %zu} | {y | %zu} | {z | %zu}}} | {dstPos | {{x | %zu} | {y "
|
||||
"| "
|
||||
"%zu} | {z | %zu}}} | {Extent | {{Width | %zu} | {Height | %zu} | {Depth | %zu}}}}\n}",
|
||||
label_, GetID(), this, memcpyDirection.c_str(), (size_t)0, src_, (size_t)0,
|
||||
(size_t)0, (size_t)0, dst_, (size_t)0, (size_t)0, (size_t)0, (size_t)0, (size_t)0,
|
||||
(size_t)0, (size_t)0, (size_t)0, count_, (size_t)1, (size_t)1);
|
||||
label_, GetID(), this, memcpyDirection.c_str(), (size_t)0, src_, (size_t)0, (size_t)0,
|
||||
(size_t)0, dst_, (size_t)0, (size_t)0, (size_t)0, (size_t)0, (size_t)0, (size_t)0,
|
||||
(size_t)0, (size_t)0, count_, (size_t)1, (size_t)1);
|
||||
label = buffer;
|
||||
} else {
|
||||
label = std::to_string(GetID()) + "\n" + label_ + "\n(" + memcpyDirection + "," +
|
||||
@@ -1738,7 +1710,7 @@ class GraphMemcpyNodeFromSymbol : public GraphMemcpyNode1D {
|
||||
|
||||
public:
|
||||
GraphMemcpyNodeFromSymbol(void* dst, const void* symbol, size_t count, size_t offset,
|
||||
hipMemcpyKind kind)
|
||||
hipMemcpyKind kind)
|
||||
: GraphMemcpyNode1D(dst, nullptr, count, kind, hipGraphNodeTypeMemcpy),
|
||||
symbol_(symbol),
|
||||
offset_(offset) {}
|
||||
@@ -1746,8 +1718,8 @@ class GraphMemcpyNodeFromSymbol : public GraphMemcpyNode1D {
|
||||
~GraphMemcpyNodeFromSymbol() {}
|
||||
|
||||
GraphMemcpyNodeFromSymbol(const GraphMemcpyNodeFromSymbol& rhs) : GraphMemcpyNode1D(rhs) {
|
||||
symbol_ = rhs.symbol_;
|
||||
offset_ = rhs.offset_;
|
||||
symbol_ = rhs.symbol_;
|
||||
offset_ = rhs.offset_;
|
||||
}
|
||||
|
||||
GraphNode* clone() const override { return new GraphMemcpyNodeFromSymbol(*this); }
|
||||
@@ -1778,9 +1750,9 @@ class GraphMemcpyNodeFromSymbol : public GraphMemcpyNode1D {
|
||||
hipMemcpyKind kind, bool isExec = false) {
|
||||
if (isExec) {
|
||||
size_t discardOffset = 0;
|
||||
amd::Memory *memObj = getMemoryObject(dst, discardOffset);
|
||||
amd::Memory* memObj = getMemoryObject(dst, discardOffset);
|
||||
if (memObj != nullptr) {
|
||||
amd::Memory *memObjOri = getMemoryObject(dst_, discardOffset);
|
||||
amd::Memory* memObjOri = getMemoryObject(dst_, discardOffset);
|
||||
if (memObjOri != nullptr) {
|
||||
if (memObjOri->getUserData().deviceId != memObj->getUserData().deviceId) {
|
||||
return hipErrorInvalidValue;
|
||||
@@ -1805,8 +1777,8 @@ class GraphMemcpyNodeFromSymbol : public GraphMemcpyNode1D {
|
||||
if (dstMemory == nullptr && kind != hipMemcpyDeviceToHost && kind != hipMemcpyDefault) {
|
||||
return hipErrorInvalidMemcpyDirection;
|
||||
} else if (dstMemory != nullptr && dstMemory->getMemFlags() == 0 &&
|
||||
kind != hipMemcpyDeviceToDevice && kind != hipMemcpyDeviceToDeviceNoCU
|
||||
&& kind != hipMemcpyDefault) {
|
||||
kind != hipMemcpyDeviceToDevice && kind != hipMemcpyDeviceToDeviceNoCU &&
|
||||
kind != hipMemcpyDefault) {
|
||||
return hipErrorInvalidMemcpyDirection;
|
||||
} else if (kind == hipMemcpyHostToHost || kind == hipMemcpyHostToDevice) {
|
||||
return hipErrorInvalidMemcpyDirection;
|
||||
@@ -1833,7 +1805,7 @@ class GraphMemcpyNodeToSymbol : public GraphMemcpyNode1D {
|
||||
|
||||
public:
|
||||
GraphMemcpyNodeToSymbol(const void* symbol, const void* src, size_t count, size_t offset,
|
||||
hipMemcpyKind kind)
|
||||
hipMemcpyKind kind)
|
||||
: GraphMemcpyNode1D(nullptr, src, count, kind, hipGraphNodeTypeMemcpy),
|
||||
symbol_(symbol),
|
||||
offset_(offset) {}
|
||||
@@ -1841,8 +1813,8 @@ class GraphMemcpyNodeToSymbol : public GraphMemcpyNode1D {
|
||||
~GraphMemcpyNodeToSymbol() {}
|
||||
|
||||
GraphMemcpyNodeToSymbol(const GraphMemcpyNodeToSymbol& rhs) : GraphMemcpyNode1D(rhs) {
|
||||
symbol_ = rhs.symbol_;
|
||||
offset_ = rhs.offset_;
|
||||
symbol_ = rhs.symbol_;
|
||||
offset_ = rhs.offset_;
|
||||
}
|
||||
|
||||
GraphNode* clone() const override { return new GraphMemcpyNodeToSymbol(*this); }
|
||||
@@ -1873,9 +1845,9 @@ class GraphMemcpyNodeToSymbol : public GraphMemcpyNode1D {
|
||||
hipMemcpyKind kind, bool isExec = false) {
|
||||
if (isExec) {
|
||||
size_t discardOffset = 0;
|
||||
amd::Memory *memObj = getMemoryObject(src, discardOffset);
|
||||
amd::Memory* memObj = getMemoryObject(src, discardOffset);
|
||||
if (memObj != nullptr) {
|
||||
amd::Memory *memObjOri = getMemoryObject(src_, discardOffset);
|
||||
amd::Memory* memObjOri = getMemoryObject(src_, discardOffset);
|
||||
if (memObjOri != nullptr) {
|
||||
if (memObjOri->getUserData().deviceId != memObj->getUserData().deviceId) {
|
||||
return hipErrorInvalidValue;
|
||||
@@ -1905,9 +1877,8 @@ class GraphMemcpyNodeToSymbol : public GraphMemcpyNode1D {
|
||||
}
|
||||
if (srcMemory == nullptr && kind != hipMemcpyHostToDevice && kind != hipMemcpyDefault) {
|
||||
return hipErrorInvalidValue;
|
||||
} else if (srcMemory != nullptr && srcFlag == 0 &&
|
||||
kind != hipMemcpyDeviceToDevice && kind != hipMemcpyDeviceToDeviceNoCU
|
||||
&& kind != hipMemcpyDefault) {
|
||||
} else if (srcMemory != nullptr && srcFlag == 0 && kind != hipMemcpyDeviceToDevice &&
|
||||
kind != hipMemcpyDeviceToDeviceNoCU && kind != hipMemcpyDefault) {
|
||||
return hipErrorInvalidValue;
|
||||
} else if (kind == hipMemcpyHostToHost || kind == hipMemcpyDeviceToHost) {
|
||||
return hipErrorInvalidValue;
|
||||
@@ -1921,8 +1892,7 @@ class GraphMemcpyNodeToSymbol : public GraphMemcpyNode1D {
|
||||
}
|
||||
|
||||
virtual hipError_t SetParams(GraphNode* node) override {
|
||||
const GraphMemcpyNodeToSymbol* memcpyNode =
|
||||
static_cast<GraphMemcpyNodeToSymbol const*>(node);
|
||||
const GraphMemcpyNodeToSymbol* memcpyNode = static_cast<GraphMemcpyNodeToSymbol const*>(node);
|
||||
return SetParams(memcpyNode->src_, memcpyNode->symbol_, memcpyNode->count_, memcpyNode->offset_,
|
||||
memcpyNode->kind_);
|
||||
}
|
||||
@@ -1932,6 +1902,7 @@ class GraphMemsetNode : public GraphNode {
|
||||
size_t depth_ = 1;
|
||||
size_t arrWidth_ = 1;
|
||||
size_t arrHeight_ = 1;
|
||||
|
||||
public:
|
||||
GraphMemsetNode(const hipMemsetParams* pMemsetParams, size_t depth = 1, size_t arrWidth = 1,
|
||||
size_t arrHeight = 1)
|
||||
@@ -1948,7 +1919,7 @@ class GraphMemsetNode : public GraphNode {
|
||||
}
|
||||
}
|
||||
|
||||
~GraphMemsetNode() { }
|
||||
~GraphMemsetNode() {}
|
||||
// Copy constructor
|
||||
GraphMemsetNode(const GraphMemsetNode& memsetNode) : GraphNode(memsetNode) {
|
||||
memsetParams_ = memsetNode.memsetParams_;
|
||||
@@ -1966,9 +1937,8 @@ class GraphMemsetNode : public GraphNode {
|
||||
sprintf(buffer,
|
||||
"{\n%s\n| {{ID | node handle | dptr | pitch | value | elementSize | width | "
|
||||
"height | depth} | {%u | %p | %p | %zu | %u | %u | %zu | %zu | %zu}}}",
|
||||
label_, GetID(), this, memsetParams_.dst, memsetParams_.pitch,
|
||||
memsetParams_.value, memsetParams_.elementSize, memsetParams_.width,
|
||||
memsetParams_.height, depth_);
|
||||
label_, GetID(), this, memsetParams_.dst, memsetParams_.pitch, memsetParams_.value,
|
||||
memsetParams_.elementSize, memsetParams_.width, memsetParams_.height, depth_);
|
||||
label = buffer;
|
||||
} else {
|
||||
size_t sizeBytes;
|
||||
@@ -2027,9 +1997,9 @@ class GraphMemsetNode : public GraphNode {
|
||||
}
|
||||
if (isExec) {
|
||||
size_t discardOffset = 0;
|
||||
amd::Memory *memObj = getMemoryObject(params->dst, discardOffset);
|
||||
amd::Memory* memObj = getMemoryObject(params->dst, discardOffset);
|
||||
if (memObj != nullptr) {
|
||||
amd::Memory *memObjOri = getMemoryObject(memsetParams_.dst, discardOffset);
|
||||
amd::Memory* memObjOri = getMemoryObject(memsetParams_.dst, discardOffset);
|
||||
if (memObjOri != nullptr) {
|
||||
if (memObjOri->getUserData().deviceId != memObj->getUserData().deviceId) {
|
||||
return hipErrorInvalidValue;
|
||||
@@ -2042,35 +2012,36 @@ class GraphMemsetNode : public GraphNode {
|
||||
// 1D - for hipGraphMemsetNodeSetParams & hipGraphExecMemsetNodeSetParams, They return
|
||||
// invalid value if new width is more than actual allocation.
|
||||
size_t discardOffset = 0;
|
||||
amd::Memory *memObj = getMemoryObject(params->dst, discardOffset);
|
||||
amd::Memory* memObj = getMemoryObject(params->dst, discardOffset);
|
||||
if (memObj != nullptr) {
|
||||
if (params->width * params->elementSize > memObj->getSize()) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
}
|
||||
}
|
||||
sizeBytes = params->width * params->elementSize;
|
||||
hip_error = ihipMemset_validate(params->dst, params->value, params->elementSize, sizeBytes);
|
||||
} else {
|
||||
if (isExec) {
|
||||
// 2D - hipGraphExecMemsetNodeSetParams returns invalid value if new width or new height is
|
||||
// not same as what memset node is added with.
|
||||
if (memsetParams_.width * memsetParams_.elementSize != params->width * params->elementSize
|
||||
|| memsetParams_.height != params->height || depth != depth_) {
|
||||
if (memsetParams_.width * memsetParams_.elementSize !=
|
||||
params->width * params->elementSize ||
|
||||
memsetParams_.height != params->height || depth != depth_) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
} else {
|
||||
// 2D - hipGraphMemsetNodeSetParams returns invalid value if new width or new height is
|
||||
// greter than actual allocation.
|
||||
size_t discardOffset = 0;
|
||||
amd::Memory *memObj = getMemoryObject(params->dst, discardOffset);
|
||||
amd::Memory* memObj = getMemoryObject(params->dst, discardOffset);
|
||||
if (memObj != nullptr) {
|
||||
if (params->width * params->elementSize > memObj->getUserData().width_
|
||||
|| params->height > memObj->getUserData().height_
|
||||
|| depth > memObj->getUserData().depth_) {
|
||||
if (params->width * params->elementSize > memObj->getUserData().width_ ||
|
||||
params->height > memObj->getUserData().height_ ||
|
||||
depth > memObj->getUserData().depth_) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
sizeBytes = params->width * params->elementSize * params->height * depth;
|
||||
hip_error = ihipMemset3D_validate(
|
||||
{params->dst, params->pitch, params->width * params->elementSize, params->height},
|
||||
@@ -2103,9 +2074,7 @@ class GraphEventRecordNode : public GraphNode {
|
||||
event_(event) {}
|
||||
~GraphEventRecordNode() {}
|
||||
|
||||
GraphEventRecordNode(const GraphEventRecordNode& rhs) : GraphNode(rhs) {
|
||||
event_ = rhs.event_;
|
||||
}
|
||||
GraphEventRecordNode(const GraphEventRecordNode& rhs) : GraphNode(rhs) { event_ = rhs.event_; }
|
||||
|
||||
GraphNode* clone() const override { return new GraphEventRecordNode(*this); }
|
||||
|
||||
@@ -2143,8 +2112,7 @@ class GraphEventRecordNode : public GraphNode {
|
||||
}
|
||||
|
||||
hipError_t SetParams(GraphNode* node) override {
|
||||
const GraphEventRecordNode* eventRecordNode =
|
||||
static_cast<GraphEventRecordNode const*>(node);
|
||||
const GraphEventRecordNode* eventRecordNode = static_cast<GraphEventRecordNode const*>(node);
|
||||
return SetParams(eventRecordNode->event_);
|
||||
}
|
||||
};
|
||||
@@ -2154,14 +2122,11 @@ class GraphEventWaitNode : public GraphNode {
|
||||
|
||||
public:
|
||||
GraphEventWaitNode(hipEvent_t event)
|
||||
: GraphNode(hipGraphNodeTypeWaitEvent, "solid", "rectangle", "EVENT_WAIT"),
|
||||
event_(event) {}
|
||||
: GraphNode(hipGraphNodeTypeWaitEvent, "solid", "rectangle", "EVENT_WAIT"), event_(event) {}
|
||||
|
||||
~GraphEventWaitNode() {}
|
||||
|
||||
GraphEventWaitNode(const GraphEventWaitNode& rhs) : GraphNode(rhs) {
|
||||
event_ = rhs.event_;
|
||||
}
|
||||
GraphEventWaitNode(const GraphEventWaitNode& rhs) : GraphNode(rhs) { event_ = rhs.event_; }
|
||||
|
||||
GraphNode* clone() const override { return new GraphEventWaitNode(*this); }
|
||||
|
||||
@@ -2207,7 +2172,7 @@ class GraphHostNode : public GraphNode {
|
||||
: GraphNode(hipGraphNodeTypeHost, "solid", "rectangle", "HOST") {
|
||||
NodeParams_ = *NodeParams;
|
||||
}
|
||||
~GraphHostNode() { }
|
||||
~GraphHostNode() {}
|
||||
|
||||
GraphHostNode(const GraphHostNode& hostNode) : GraphNode(hostNode) {
|
||||
NodeParams_ = hostNode.NodeParams_;
|
||||
@@ -2293,7 +2258,7 @@ class GraphEmptyNode : public GraphNode {
|
||||
// ================================================================================================
|
||||
class GraphMemAllocNode final : public GraphNode {
|
||||
hipMemAllocNodeParams node_params_; // Node parameters for memory allocation
|
||||
amd::Memory* va_ = nullptr; // Memory object, which holds a virtual address
|
||||
amd::Memory* va_ = nullptr; // Memory object, which holds a virtual address
|
||||
|
||||
// Derive the new class for VirtualMapCommand,
|
||||
// so runtime can allocate memory during the execution of command
|
||||
@@ -2302,7 +2267,8 @@ class GraphMemAllocNode final : public GraphNode {
|
||||
VirtualMemAllocNode(amd::HostQueue& queue, const amd::Event::EventWaitList& eventWaitList,
|
||||
amd::Memory* va, size_t size, amd::Memory* memory, Graph* graph)
|
||||
: VirtualMapCommand(queue, eventWaitList, va->getSvmPtr(), size, memory),
|
||||
va_(va), graph_(graph) {}
|
||||
va_(va),
|
||||
graph_(graph) {}
|
||||
|
||||
virtual void submit(device::VirtualDevice& device) final {
|
||||
// Remove VA reference from the global mapping. Runtime has to keep a dummy reference for
|
||||
@@ -2340,15 +2306,15 @@ class GraphMemAllocNode final : public GraphNode {
|
||||
queue()->device().SetMemAccess(vaddr_sub_obj->getSvmPtr(), aligned_size,
|
||||
amd::Device::VmmAccess::kReadWrite);
|
||||
va_->retain();
|
||||
graph_->IncrementMemAllocNodeCount(); // Increment count of unreleased mem alloc nodes
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL,
|
||||
"Graph MemAlloc execute [%p-%p], %p", vaddr_sub_obj->getSvmPtr(),
|
||||
graph_->IncrementMemAllocNodeCount(); // Increment count of unreleased mem alloc nodes
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemAlloc execute [%p-%p], %p",
|
||||
vaddr_sub_obj->getSvmPtr(),
|
||||
reinterpret_cast<char*>(vaddr_sub_obj->getSvmPtr()) + aligned_size, memory());
|
||||
}
|
||||
|
||||
private:
|
||||
amd::Memory* va_; // Memory object with the new virtual address for mapping
|
||||
Graph* graph_; // Graph which allocates/maps memory
|
||||
amd::Memory* va_; // Memory object with the new virtual address for mapping
|
||||
Graph* graph_; // Graph which allocates/maps memory
|
||||
};
|
||||
|
||||
public:
|
||||
@@ -2357,8 +2323,7 @@ class GraphMemAllocNode final : public GraphNode {
|
||||
node_params_ = *node_params;
|
||||
}
|
||||
|
||||
GraphMemAllocNode(const GraphMemAllocNode& rhs)
|
||||
: GraphNode(rhs) {
|
||||
GraphMemAllocNode(const GraphMemAllocNode& rhs) : GraphNode(rhs) {
|
||||
node_params_ = rhs.node_params_;
|
||||
if (HIP_MEM_POOL_USE_VM) {
|
||||
assert(rhs.va_ != nullptr && "Graph MemAlloc runtime can't clone an invalid node!");
|
||||
@@ -2392,8 +2357,8 @@ class GraphMemAllocNode final : public GraphNode {
|
||||
assert(va_ != nullptr && "Runtime can't create a command for an invalid node!");
|
||||
stream->GetDevice()->GetGraphMemoryPool()->SetGraphInUse();
|
||||
// Create command for memory mapping
|
||||
auto cmd = new VirtualMemAllocNode(*stream, amd::Event::EventWaitList{},
|
||||
va_, node_params_.bytesize, nullptr, graph);
|
||||
auto cmd = new VirtualMemAllocNode(*stream, amd::Event::EventWaitList{}, va_,
|
||||
node_params_.bytesize, nullptr, graph);
|
||||
commands_.push_back(cmd);
|
||||
size_t offset = 0;
|
||||
// Check if memory was already added after first reserve
|
||||
@@ -2405,8 +2370,7 @@ class GraphMemAllocNode final : public GraphNode {
|
||||
// be executed again
|
||||
amd::MemObjMap::AddMemObj(node_params_.dptr, va_);
|
||||
}
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemAlloc create: %p",
|
||||
node_params_.dptr);
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemAlloc create: %p", node_params_.dptr);
|
||||
}
|
||||
}
|
||||
return error;
|
||||
@@ -2421,8 +2385,7 @@ class GraphMemAllocNode final : public GraphNode {
|
||||
va_ = amd::MemObjMap::FindVirtualMemObj(node_params_.dptr);
|
||||
amd::MemObjMap::AddMemObj(node_params_.dptr, va_);
|
||||
}
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemAlloc reserve VA: %p",
|
||||
node_params_.dptr);
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemAlloc reserve VA: %p", node_params_.dptr);
|
||||
}
|
||||
return node_params_.dptr;
|
||||
}
|
||||
@@ -2451,16 +2414,18 @@ class GraphMemAllocNode final : public GraphNode {
|
||||
|
||||
// ================================================================================================
|
||||
class GraphMemFreeNode : public GraphNode {
|
||||
void* device_ptr_; // Device pointer of the freed memory
|
||||
void* device_ptr_; // Device pointer of the freed memory
|
||||
|
||||
// Derive the new class for VirtualMap command, since runtime has to free
|
||||
// real allocation after unmap is complete
|
||||
class VirtualMemFreeNode : public amd::VirtualMapCommand {
|
||||
public:
|
||||
VirtualMemFreeNode(Graph* graph, int device_id, amd::HostQueue& queue,
|
||||
const amd::Event::EventWaitList& eventWaitList, void* ptr, size_t size,
|
||||
amd::Memory* memory) : VirtualMapCommand(queue, eventWaitList, ptr, size, memory)
|
||||
, graph_(graph), device_id_(device_id) {}
|
||||
const amd::Event::EventWaitList& eventWaitList, void* ptr, size_t size,
|
||||
amd::Memory* memory)
|
||||
: VirtualMapCommand(queue, eventWaitList, ptr, size, memory),
|
||||
graph_(graph),
|
||||
device_id_(device_id) {}
|
||||
|
||||
virtual void submit(device::VirtualDevice& device) final {
|
||||
// Find memory object before unmap logic
|
||||
@@ -2484,23 +2449,20 @@ class GraphMemFreeNode : public GraphNode {
|
||||
LogError("Memory didn't belong to any pool!");
|
||||
}
|
||||
amd::MemObjMap::AddMemObj(ptr(), vaddr_mem_obj);
|
||||
graph_->DecrementMemAllocNodeCount(); // Decrement count of unreleased memalloc nodes
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemFree execute: %p, %p",
|
||||
ptr(), vaddr_sub_obj);
|
||||
graph_->DecrementMemAllocNodeCount(); // Decrement count of unreleased memalloc nodes
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemFree execute: %p, %p", ptr(),
|
||||
vaddr_sub_obj);
|
||||
}
|
||||
|
||||
private:
|
||||
Graph* graph_; // Graph, which has the execution of this command
|
||||
int device_id_; // Device ID where this command is executed
|
||||
Graph* graph_; // Graph, which has the execution of this command
|
||||
int device_id_; // Device ID where this command is executed
|
||||
};
|
||||
|
||||
public:
|
||||
GraphMemFreeNode(void* dptr)
|
||||
: GraphNode(hipGraphNodeTypeMemFree, "solid", "rectangle", "MEM_FREE")
|
||||
, device_ptr_(dptr) {}
|
||||
GraphMemFreeNode(const GraphMemFreeNode& rhs) : GraphNode(rhs) {
|
||||
device_ptr_ = rhs.device_ptr_;
|
||||
}
|
||||
: GraphNode(hipGraphNodeTypeMemFree, "solid", "rectangle", "MEM_FREE"), device_ptr_(dptr) {}
|
||||
GraphMemFreeNode(const GraphMemFreeNode& rhs) : GraphNode(rhs) { device_ptr_ = rhs.device_ptr_; }
|
||||
|
||||
virtual GraphNode* clone() const final { return new GraphMemFreeNode(*this); }
|
||||
|
||||
@@ -2514,8 +2476,8 @@ class GraphMemFreeNode : public GraphNode {
|
||||
const auto& dev_info = stream->device().info();
|
||||
auto va = amd::MemObjMap::FindVirtualMemObj(device_ptr_);
|
||||
// Unmap virtual address from memory
|
||||
amd::Command* cmd = new VirtualMemFreeNode(graph, stream->DeviceId(), *stream,
|
||||
amd::Command::EventWaitList{}, device_ptr_,
|
||||
amd::Command* cmd = new VirtualMemFreeNode(
|
||||
graph, stream->DeviceId(), *stream, amd::Command::EventWaitList{}, device_ptr_,
|
||||
amd::alignUp(va->getSize(), dev_info.virtualMemAllocGranularity_), nullptr);
|
||||
commands_.push_back(cmd);
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph FreeMem create: %p", device_ptr_);
|
||||
@@ -2531,9 +2493,7 @@ class GraphMemFreeNode : public GraphNode {
|
||||
}
|
||||
}
|
||||
|
||||
void GetParams(void** params) const {
|
||||
*params = device_ptr_;
|
||||
}
|
||||
void GetParams(void** params) const { *params = device_ptr_; }
|
||||
};
|
||||
|
||||
class GraphDrvMemcpyNode : public GraphNode {
|
||||
@@ -2553,9 +2513,9 @@ class GraphDrvMemcpyNode : public GraphNode {
|
||||
GraphNode* clone() const override { return new GraphDrvMemcpyNode(*this); }
|
||||
|
||||
hipError_t CreateCommand(hip::Stream* stream) override {
|
||||
if(copyParams_.srcMemoryType == hipMemoryTypeHost &&
|
||||
copyParams_.dstMemoryType == hipMemoryTypeHost &&
|
||||
IsHtoHMemcpy(copyParams_.dstHost, copyParams_.srcHost)) {
|
||||
if (copyParams_.srcMemoryType == hipMemoryTypeHost &&
|
||||
copyParams_.dstMemoryType == hipMemoryTypeHost &&
|
||||
IsHtoHMemcpy(copyParams_.dstHost, copyParams_.srcHost)) {
|
||||
return hipSuccess;
|
||||
}
|
||||
hipError_t status = GraphNode::CreateCommand(stream);
|
||||
@@ -2571,23 +2531,20 @@ class GraphDrvMemcpyNode : public GraphNode {
|
||||
|
||||
void EnqueueCommands(hip::Stream* stream) override {
|
||||
bool isHtoH = false;
|
||||
if(copyParams_.srcMemoryType == hipMemoryTypeHost &&
|
||||
copyParams_.dstMemoryType == hipMemoryTypeHost &&
|
||||
IsHtoHMemcpy(copyParams_.dstHost, copyParams_.srcHost)) {
|
||||
if (copyParams_.srcMemoryType == hipMemoryTypeHost &&
|
||||
copyParams_.dstMemoryType == hipMemoryTypeHost &&
|
||||
IsHtoHMemcpy(copyParams_.dstHost, copyParams_.srcHost)) {
|
||||
isHtoH = true;
|
||||
}
|
||||
if (isEnabled_ && isHtoH) {
|
||||
ihipHtoHMemcpy(copyParams_.dstHost, copyParams_.srcHost,
|
||||
copyParams_.WidthInBytes * copyParams_.Height *
|
||||
copyParams_.Depth, *stream);
|
||||
copyParams_.WidthInBytes * copyParams_.Height * copyParams_.Depth, *stream);
|
||||
return;
|
||||
}
|
||||
GraphNode::EnqueueCommands(stream);
|
||||
}
|
||||
|
||||
void GetParams(HIP_MEMCPY3D* params) {
|
||||
std::memcpy(params, &copyParams_, sizeof(HIP_MEMCPY3D));
|
||||
}
|
||||
void GetParams(HIP_MEMCPY3D* params) { std::memcpy(params, &copyParams_, sizeof(HIP_MEMCPY3D)); }
|
||||
hipError_t SetParams(const HIP_MEMCPY3D* params) {
|
||||
hipError_t status = ValidateParams(params);
|
||||
if (status != hipSuccess) {
|
||||
@@ -2608,7 +2565,6 @@ class GraphDrvMemcpyNode : public GraphNode {
|
||||
}
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
class hipGraphExternalSemSignalNode : public GraphNode {
|
||||
@@ -2616,13 +2572,12 @@ class hipGraphExternalSemSignalNode : public GraphNode {
|
||||
|
||||
public:
|
||||
hipGraphExternalSemSignalNode(const hipExternalSemaphoreSignalNodeParams* pNodeParams)
|
||||
: GraphNode(hipGraphNodeTypeExtSemaphoreSignal, "solid", "rectangle",
|
||||
"EXTERNAL_SEMAPHORE_SIGNAL") {
|
||||
externalSemaphorNodeParam_ = *pNodeParams;
|
||||
: GraphNode(hipGraphNodeTypeExtSemaphoreSignal, "solid", "rectangle",
|
||||
"EXTERNAL_SEMAPHORE_SIGNAL") {
|
||||
externalSemaphorNodeParam_ = *pNodeParams;
|
||||
}
|
||||
|
||||
hipGraphExternalSemSignalNode(const hipGraphExternalSemSignalNode& rhs)
|
||||
: GraphNode(rhs) {
|
||||
hipGraphExternalSemSignalNode(const hipGraphExternalSemSignalNode& rhs) : GraphNode(rhs) {
|
||||
externalSemaphorNodeParam_ = rhs.externalSemaphorNodeParam_;
|
||||
}
|
||||
|
||||
@@ -2639,10 +2594,10 @@ class hipGraphExternalSemSignalNode : public GraphNode {
|
||||
commands_.reserve(numExtSems);
|
||||
for (unsigned int i = 0; i < numExtSems; i++) {
|
||||
if (externalSemaphorNodeParam_.extSemArray[i] != nullptr) {
|
||||
amd::ExternalSemaphoreCmd* command = new amd::ExternalSemaphoreCmd(*stream,
|
||||
externalSemaphorNodeParam_.extSemArray[i],
|
||||
externalSemaphorNodeParam_.paramsArray[i].params.fence.value,
|
||||
amd::ExternalSemaphoreCmd::COMMAND_SIGNAL_EXTSEMAPHORE);
|
||||
amd::ExternalSemaphoreCmd* command = new amd::ExternalSemaphoreCmd(
|
||||
*stream, externalSemaphorNodeParam_.extSemArray[i],
|
||||
externalSemaphorNodeParam_.paramsArray[i].params.fence.value,
|
||||
amd::ExternalSemaphoreCmd::COMMAND_SIGNAL_EXTSEMAPHORE);
|
||||
if (command == nullptr) {
|
||||
return hipErrorOutOfMemory;
|
||||
}
|
||||
@@ -2671,9 +2626,9 @@ class hipGraphExternalSemWaitNode : public GraphNode {
|
||||
|
||||
public:
|
||||
hipGraphExternalSemWaitNode(const hipExternalSemaphoreWaitNodeParams* pNodeParams)
|
||||
: GraphNode(hipGraphNodeTypeExtSemaphoreWait, "solid",
|
||||
"rectangle", "EXTERNAL_SEMAPHORE_WAIT") {
|
||||
externalSemaphorNodeParam_ = *pNodeParams;
|
||||
: GraphNode(hipGraphNodeTypeExtSemaphoreWait, "solid", "rectangle",
|
||||
"EXTERNAL_SEMAPHORE_WAIT") {
|
||||
externalSemaphorNodeParam_ = *pNodeParams;
|
||||
}
|
||||
|
||||
hipGraphExternalSemWaitNode(const hipGraphExternalSemWaitNode& rhs) : GraphNode(rhs) {
|
||||
@@ -2687,16 +2642,15 @@ class hipGraphExternalSemWaitNode : public GraphNode {
|
||||
hipError_t status = GraphNode::CreateCommand(stream);
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
|
||||
}
|
||||
unsigned int numExtSems = externalSemaphorNodeParam_.numExtSems;
|
||||
commands_.reserve(numExtSems);
|
||||
for (unsigned int i = 0; i < numExtSems; i++) {
|
||||
if (externalSemaphorNodeParam_.extSemArray[i] != nullptr) {
|
||||
amd::ExternalSemaphoreCmd* command = new amd::ExternalSemaphoreCmd(*stream,
|
||||
externalSemaphorNodeParam_.extSemArray[i],
|
||||
externalSemaphorNodeParam_.paramsArray[i].params.fence.value,
|
||||
amd::ExternalSemaphoreCmd::COMMAND_WAIT_EXTSEMAPHORE);
|
||||
amd::ExternalSemaphoreCmd* command = new amd::ExternalSemaphoreCmd(
|
||||
*stream, externalSemaphorNodeParam_.extSemArray[i],
|
||||
externalSemaphorNodeParam_.paramsArray[i].params.fence.value,
|
||||
amd::ExternalSemaphoreCmd::COMMAND_WAIT_EXTSEMAPHORE);
|
||||
if (command == nullptr) {
|
||||
return hipErrorOutOfMemory;
|
||||
}
|
||||
|
||||
@@ -39,31 +39,41 @@ static_assert(hipCpuDeviceId == amd::CpuDeviceId, "CPU device ID mismatch with R
|
||||
static_assert(hipInvalidDeviceId == amd::InvalidDeviceId,
|
||||
"Invalid device ID mismatch with ROCclr!");
|
||||
|
||||
static_assert(static_cast<uint32_t>(hipMemAdviseSetReadMostly) ==
|
||||
amd::MemoryAdvice::SetReadMostly, "Enum mismatch with ROCclr!");
|
||||
static_assert(static_cast<uint32_t>(hipMemAdviseSetReadMostly) == amd::MemoryAdvice::SetReadMostly,
|
||||
"Enum mismatch with ROCclr!");
|
||||
static_assert(static_cast<uint32_t>(hipMemAdviseUnsetReadMostly) ==
|
||||
amd::MemoryAdvice::UnsetReadMostly, "Enum mismatch with ROCclr!");
|
||||
amd::MemoryAdvice::UnsetReadMostly,
|
||||
"Enum mismatch with ROCclr!");
|
||||
static_assert(static_cast<uint32_t>(hipMemAdviseSetPreferredLocation) ==
|
||||
amd::MemoryAdvice::SetPreferredLocation, "Enum mismatch with ROCclr!");
|
||||
amd::MemoryAdvice::SetPreferredLocation,
|
||||
"Enum mismatch with ROCclr!");
|
||||
static_assert(static_cast<uint32_t>(hipMemAdviseUnsetPreferredLocation) ==
|
||||
amd::MemoryAdvice::UnsetPreferredLocation, "Enum mismatch with ROCclr!");
|
||||
static_assert(static_cast<uint32_t>(hipMemAdviseSetAccessedBy) ==
|
||||
amd::MemoryAdvice::SetAccessedBy, "Enum mismatch with ROCclr!");
|
||||
amd::MemoryAdvice::UnsetPreferredLocation,
|
||||
"Enum mismatch with ROCclr!");
|
||||
static_assert(static_cast<uint32_t>(hipMemAdviseSetAccessedBy) == amd::MemoryAdvice::SetAccessedBy,
|
||||
"Enum mismatch with ROCclr!");
|
||||
static_assert(static_cast<uint32_t>(hipMemAdviseUnsetAccessedBy) ==
|
||||
amd::MemoryAdvice::UnsetAccessedBy, "Enum mismatch with ROCclr!");
|
||||
amd::MemoryAdvice::UnsetAccessedBy,
|
||||
"Enum mismatch with ROCclr!");
|
||||
static_assert(static_cast<uint32_t>(hipMemAdviseSetCoarseGrain) ==
|
||||
amd::MemoryAdvice::SetCoarseGrain, "Enum mismatch with ROCclr!");
|
||||
amd::MemoryAdvice::SetCoarseGrain,
|
||||
"Enum mismatch with ROCclr!");
|
||||
static_assert(static_cast<uint32_t>(hipMemAdviseUnsetCoarseGrain) ==
|
||||
amd::MemoryAdvice::UnsetCoarseGrain, "Enum mismatch with ROCclr!");
|
||||
amd::MemoryAdvice::UnsetCoarseGrain,
|
||||
"Enum mismatch with ROCclr!");
|
||||
|
||||
static_assert(static_cast<uint32_t>(hipMemRangeAttributeReadMostly) ==
|
||||
amd::MemRangeAttribute::ReadMostly, "Enum mismatch with ROCclr!");
|
||||
amd::MemRangeAttribute::ReadMostly,
|
||||
"Enum mismatch with ROCclr!");
|
||||
static_assert(static_cast<uint32_t>(hipMemRangeAttributePreferredLocation) ==
|
||||
amd::MemRangeAttribute::PreferredLocation, "Enum mismatch with ROCclr!");
|
||||
amd::MemRangeAttribute::PreferredLocation,
|
||||
"Enum mismatch with ROCclr!");
|
||||
static_assert(static_cast<uint32_t>(hipMemRangeAttributeAccessedBy) ==
|
||||
amd::MemRangeAttribute::AccessedBy, "Enum mismatch with ROCclr!");
|
||||
amd::MemRangeAttribute::AccessedBy,
|
||||
"Enum mismatch with ROCclr!");
|
||||
static_assert(static_cast<uint32_t>(hipMemRangeAttributeLastPrefetchLocation) ==
|
||||
amd::MemRangeAttribute::LastPrefetchLocation, "Enum mismatch with ROCclr!");
|
||||
amd::MemRangeAttribute::LastPrefetchLocation,
|
||||
"Enum mismatch with ROCclr!");
|
||||
|
||||
// ================================================================================================
|
||||
hipError_t hipMallocManaged(void** dev_ptr, size_t size, unsigned int flags) {
|
||||
@@ -84,8 +94,7 @@ hipError_t hipMallocManaged(void** dev_ptr, size_t size, unsigned int flags) {
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
hipError_t hipMemPrefetchAsync(const void* dev_ptr, size_t count, int device,
|
||||
hipStream_t stream) {
|
||||
hipError_t hipMemPrefetchAsync(const void* dev_ptr, size_t count, int device, hipStream_t stream) {
|
||||
HIP_INIT_API(hipMemPrefetchAsync, dev_ptr, count, device, stream);
|
||||
CHECK_STREAM_CAPTURE_SUPPORTED();
|
||||
hipMemLocation location;
|
||||
@@ -147,8 +156,8 @@ hipError_t hipMemRangeGetAttribute(void* data, size_t data_size, hipMemRangeAttr
|
||||
amd::Device* dev = g_devices[0]->devices()[0];
|
||||
|
||||
// Get the allocation attribute from AMD HMM
|
||||
if (!dev->GetSvmAttributes(&data, &data_size, reinterpret_cast<int*>(&attribute), 1,
|
||||
dev_ptr, count)) {
|
||||
if (!dev->GetSvmAttributes(&data, &data_size, reinterpret_cast<int*>(&attribute), 1, dev_ptr,
|
||||
count)) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
@@ -159,8 +168,8 @@ hipError_t hipMemRangeGetAttribute(void* data, size_t data_size, hipMemRangeAttr
|
||||
hipError_t hipMemRangeGetAttributes(void** data, size_t* data_sizes,
|
||||
hipMemRangeAttribute* attributes, size_t num_attributes,
|
||||
const void* dev_ptr, size_t count) {
|
||||
HIP_INIT_API(hipMemRangeGetAttributes, data, data_sizes,
|
||||
attributes, num_attributes, dev_ptr, count);
|
||||
HIP_INIT_API(hipMemRangeGetAttributes, data, data_sizes, attributes, num_attributes, dev_ptr,
|
||||
count);
|
||||
|
||||
if ((data == nullptr) || (data_sizes == nullptr) || (attributes == nullptr) ||
|
||||
(num_attributes == 0) || (dev_ptr == nullptr) || (count == 0)) {
|
||||
@@ -168,7 +177,7 @@ hipError_t hipMemRangeGetAttributes(void** data, size_t* data_sizes,
|
||||
}
|
||||
|
||||
if (*data_sizes > 0) {
|
||||
for (int i = 0 ; i<*data_sizes ; i++) {
|
||||
for (int i = 0; i < *data_sizes; i++) {
|
||||
if (!data[i]) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
@@ -188,8 +197,8 @@ hipError_t hipMemRangeGetAttributes(void** data, size_t* data_sizes,
|
||||
// Shouldn't matter for which device the interface is called
|
||||
amd::Device* dev = g_devices[0]->devices()[0];
|
||||
// Get the allocation attributes from AMD HMM
|
||||
if (!dev->GetSvmAttributes(data, data_sizes, reinterpret_cast<int*>(attributes),
|
||||
num_attributes, dev_ptr, count)) {
|
||||
if (!dev->GetSvmAttributes(data, data_sizes, reinterpret_cast<int*>(attributes), num_attributes,
|
||||
dev_ptr, count)) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
@@ -197,8 +206,8 @@ hipError_t hipMemRangeGetAttributes(void** data, size_t* data_sizes,
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
hipError_t hipStreamAttachMemAsync(hipStream_t stream, void* dev_ptr,
|
||||
size_t length, unsigned int flags) {
|
||||
hipError_t hipStreamAttachMemAsync(hipStream_t stream, void* dev_ptr, size_t length,
|
||||
unsigned int flags) {
|
||||
HIP_INIT_API(hipStreamAttachMemAsync, stream, dev_ptr, length, flags);
|
||||
// stream can be null, length can be 0.
|
||||
if (dev_ptr == nullptr) {
|
||||
@@ -217,8 +226,9 @@ hipError_t hipStreamAttachMemAsync(hipStream_t stream, void* dev_ptr,
|
||||
// host-accessible region of system-allocated pageable memory.
|
||||
// This type of memory may only be specified if the device associated with the
|
||||
// stream reports a non-zero value for the device attribute hipDevAttrPageableMemoryAccess.
|
||||
hip::Stream* hip_stream = (stream == nullptr || stream == hipStreamLegacy) ?
|
||||
hip::getCurrentDevice()->NullStream() : hip::getStream(stream);
|
||||
hip::Stream* hip_stream = (stream == nullptr || stream == hipStreamLegacy)
|
||||
? hip::getCurrentDevice()->NullStream()
|
||||
: hip::getStream(stream);
|
||||
size_t offset = 0;
|
||||
amd::Memory* memObj = getMemoryObject(dev_ptr, offset);
|
||||
if (memObj == nullptr) {
|
||||
@@ -258,9 +268,9 @@ hipError_t ihipMallocManaged(void** ptr, size_t size, size_t align, bool use_hos
|
||||
// allocation in the device driver
|
||||
if (use_host_ptr) {
|
||||
// If the host pointer is already allocated, map it to svm fine grain buffer
|
||||
*ptr = amd::SvmBuffer::malloc(ctx, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_USE_HOST_PTR, size,
|
||||
(align == 0) ? dev.info().memBaseAddrAlign_ : align, nullptr,
|
||||
*ptr);
|
||||
*ptr =
|
||||
amd::SvmBuffer::malloc(ctx, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_USE_HOST_PTR, size,
|
||||
(align == 0) ? dev.info().memBaseAddrAlign_ : align, nullptr, *ptr);
|
||||
} else {
|
||||
*ptr = amd::SvmBuffer::malloc(ctx, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_ALLOC_HOST_PTR, size,
|
||||
(align == 0) ? dev.info().memBaseAddrAlign_ : align);
|
||||
@@ -268,12 +278,12 @@ hipError_t ihipMallocManaged(void** ptr, size_t size, size_t align, bool use_hos
|
||||
if (*ptr == nullptr) {
|
||||
return hipErrorMemoryAllocation;
|
||||
}
|
||||
size_t offset = 0; //this is ignored
|
||||
size_t offset = 0; // this is ignored
|
||||
amd::Memory* memObj = getMemoryObject(*ptr, offset);
|
||||
if (memObj == nullptr) {
|
||||
return hipErrorMemoryAllocation;
|
||||
}
|
||||
//saves the current device id so that it can be accessed later
|
||||
// saves the current device id so that it can be accessed later
|
||||
memObj->getUserData().deviceId = hip::getCurrentDevice()->deviceId();
|
||||
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_API, "ihipMallocManaged ptr=0x%zx", *ptr);
|
||||
@@ -397,4 +407,4 @@ hipError_t ihipMemAdvise(const void* dev_ptr, size_t count, hipMemoryAdvise advi
|
||||
|
||||
return hipSuccess;
|
||||
}
|
||||
} //namespace hip
|
||||
} // namespace hip
|
||||
|
||||
@@ -47,22 +47,22 @@
|
||||
#define KCYN "\x1B[36m"
|
||||
#define KWHT "\x1B[37m"
|
||||
|
||||
namespace hip{
|
||||
extern std::once_flag g_ihipInitialized;
|
||||
namespace hip {
|
||||
extern std::once_flag g_ihipInitialized;
|
||||
}
|
||||
typedef struct hipArray {
|
||||
void* data; // FIXME: generalize this
|
||||
struct hipChannelFormatDesc desc;
|
||||
unsigned int type;
|
||||
unsigned int width;
|
||||
unsigned int height;
|
||||
unsigned int depth;
|
||||
enum hipArray_Format Format;
|
||||
unsigned int NumChannels;
|
||||
bool isDrv;
|
||||
unsigned int textureType;
|
||||
unsigned int flags;
|
||||
}hipArray;
|
||||
void* data; // FIXME: generalize this
|
||||
struct hipChannelFormatDesc desc;
|
||||
unsigned int type;
|
||||
unsigned int width;
|
||||
unsigned int height;
|
||||
unsigned int depth;
|
||||
enum hipArray_Format Format;
|
||||
unsigned int NumChannels;
|
||||
bool isDrv;
|
||||
unsigned int textureType;
|
||||
unsigned int flags;
|
||||
} hipArray;
|
||||
|
||||
namespace hip {
|
||||
enum MemcpyType {
|
||||
@@ -87,16 +87,16 @@ struct UserObject;
|
||||
class Stream;
|
||||
|
||||
#define IHIP_IPC_EVENT_HANDLE_SIZE 32
|
||||
#define IHIP_IPC_EVENT_RESERVED_SIZE LP64_SWITCH(28,24)
|
||||
#define IHIP_IPC_EVENT_RESERVED_SIZE LP64_SWITCH(28, 24)
|
||||
typedef struct ihipIpcEventHandle_st {
|
||||
//hsa_amd_ipc_signal_t ipc_handle; ///< ipc signal handle on ROCr
|
||||
//char ipc_handle[IHIP_IPC_EVENT_HANDLE_SIZE];
|
||||
//char reserved[IHIP_IPC_EVENT_RESERVED_SIZE];
|
||||
char shmem_name[IHIP_IPC_EVENT_HANDLE_SIZE];
|
||||
}ihipIpcEventHandle_t;
|
||||
// hsa_amd_ipc_signal_t ipc_handle; ///< ipc signal handle on ROCr
|
||||
// char ipc_handle[IHIP_IPC_EVENT_HANDLE_SIZE];
|
||||
// char reserved[IHIP_IPC_EVENT_RESERVED_SIZE];
|
||||
char shmem_name[IHIP_IPC_EVENT_HANDLE_SIZE];
|
||||
} ihipIpcEventHandle_t;
|
||||
|
||||
const char* ihipGetErrorName(hipError_t hip_error);
|
||||
}
|
||||
} // namespace hip
|
||||
|
||||
#define HIP_INIT(noReturn) \
|
||||
{ \
|
||||
@@ -122,15 +122,14 @@ const char* ihipGetErrorName(hipError_t hip_error);
|
||||
}
|
||||
|
||||
|
||||
#define HIP_API_PRINT(...) \
|
||||
uint64_t startTimeUs = 0; \
|
||||
HIPPrintDuration(amd::LOG_INFO, amd::LOG_API, &startTimeUs, \
|
||||
"%s %s ( %s ) %s", KGRN, \
|
||||
__func__, ToString( __VA_ARGS__ ).c_str(), KNRM);
|
||||
#define HIP_API_PRINT(...) \
|
||||
uint64_t startTimeUs = 0; \
|
||||
HIPPrintDuration(amd::LOG_INFO, amd::LOG_API, &startTimeUs, "%s %s ( %s ) %s", KGRN, __func__, \
|
||||
ToString(__VA_ARGS__).c_str(), KNRM);
|
||||
|
||||
#define HIP_ERROR_PRINT(err, ...) \
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_API, "%s: Returned %s : %s", \
|
||||
__func__, hip::ihipGetErrorName(err), ToString( __VA_ARGS__ ).c_str());
|
||||
#define HIP_ERROR_PRINT(err, ...) \
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_API, "%s: Returned %s : %s", __func__, \
|
||||
hip::ihipGetErrorName(err), ToString(__VA_ARGS__).c_str());
|
||||
|
||||
#define HIP_INIT_API_INTERNAL(noReturn, cid, ...) \
|
||||
HIP_INIT(noReturn) \
|
||||
@@ -139,16 +138,15 @@ const char* ihipGetErrorName(hipError_t hip_error);
|
||||
|
||||
// This macro should be called at the beginning of every HIP API.
|
||||
#define HIP_INIT_API(cid, ...) \
|
||||
if (amd::Device::IsGPUInError()) { \
|
||||
if (amd::Device::IsGPUInError()) { \
|
||||
HIP_RETURN(ConvertCLErrorIntoHIPError(amd::Device::GetGPUError())); \
|
||||
} \
|
||||
HIP_INIT_API_INTERNAL(0, cid, __VA_ARGS__) \
|
||||
if (hip::g_devices.size() == 0) { \
|
||||
HIP_RETURN(hipErrorNoDevice); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define HIP_INIT_API_NO_RETURN(cid, ...) \
|
||||
HIP_INIT_API_INTERNAL(1, cid, __VA_ARGS__)
|
||||
#define HIP_INIT_API_NO_RETURN(cid, ...) HIP_INIT_API_INTERNAL(1, cid, __VA_ARGS__)
|
||||
|
||||
#define HIP_RETURN_DURATION(ret, ...) \
|
||||
hip::tls.last_command_error_ = ret; \
|
||||
@@ -158,7 +156,7 @@ const char* ihipGetErrorName(hipError_t hip_error);
|
||||
hip::tls.last_command_error_ = hip_error; \
|
||||
} else { \
|
||||
if (hip::tls.last_command_error_ != hipSuccess && \
|
||||
hip::tls.last_command_error_ != hipErrorNotReady) { \
|
||||
hip::tls.last_command_error_ != hipErrorNotReady) { \
|
||||
hip::tls.last_error_ = hip::tls.last_command_error_; \
|
||||
} \
|
||||
} \
|
||||
@@ -175,29 +173,29 @@ const char* ihipGetErrorName(hipError_t hip_error);
|
||||
hip::tls.last_command_error_ = hip_error; \
|
||||
} else { \
|
||||
if (hip::tls.last_command_error_ != hipSuccess && \
|
||||
hip::tls.last_command_error_ != hipErrorNotReady) { \
|
||||
hip::tls.last_command_error_ != hipErrorNotReady) { \
|
||||
hip::tls.last_error_ = hip::tls.last_command_error_; \
|
||||
} \
|
||||
} \
|
||||
HIP_ERROR_PRINT(hip::tls.last_command_error_, __VA_ARGS__) \
|
||||
return hip::tls.last_command_error_;
|
||||
|
||||
#define HIP_RETURN_ONFAIL(func) \
|
||||
do { \
|
||||
hipError_t herror = (func); \
|
||||
if (herror != hipSuccess) { \
|
||||
HIP_RETURN(herror); \
|
||||
} \
|
||||
#define HIP_RETURN_ONFAIL(func) \
|
||||
do { \
|
||||
hipError_t herror = (func); \
|
||||
if (herror != hipSuccess) { \
|
||||
HIP_RETURN(herror); \
|
||||
} \
|
||||
} while (0);
|
||||
|
||||
// Cannot be used in place of HIP_RETURN.
|
||||
// Refrain from using for external HIP APIs
|
||||
#define IHIP_RETURN_ONFAIL(func) \
|
||||
do { \
|
||||
hipError_t herror = (func); \
|
||||
if (herror != hipSuccess) { \
|
||||
return herror; \
|
||||
} \
|
||||
#define IHIP_RETURN_ONFAIL(func) \
|
||||
do { \
|
||||
hipError_t herror = (func); \
|
||||
if (herror != hipSuccess) { \
|
||||
return herror; \
|
||||
} \
|
||||
} while (0);
|
||||
|
||||
// During stream capture some actions, such as a call to hipMalloc, may be unsafe and prohibited
|
||||
@@ -258,15 +256,15 @@ const char* ihipGetErrorName(hipError_t hip_error);
|
||||
return hipErrorStreamCaptureInvalidated; \
|
||||
}
|
||||
|
||||
#define PER_THREAD_DEFAULT_STREAM(stream) \
|
||||
if (stream == nullptr || stream == hipStreamLegacy) { \
|
||||
stream = getPerThreadDefaultStream(); \
|
||||
#define PER_THREAD_DEFAULT_STREAM(stream) \
|
||||
if (stream == nullptr || stream == hipStreamLegacy) { \
|
||||
stream = getPerThreadDefaultStream(); \
|
||||
}
|
||||
|
||||
namespace hc {
|
||||
class accelerator;
|
||||
class accelerator_view;
|
||||
};
|
||||
}; // namespace hc
|
||||
|
||||
struct ihipExec_t {
|
||||
dim3 gridDim_;
|
||||
@@ -278,418 +276,420 @@ struct ihipExec_t {
|
||||
|
||||
namespace hip {
|
||||
class stream_per_thread {
|
||||
private:
|
||||
private:
|
||||
std::vector<hipStream_t> m_streams;
|
||||
public:
|
||||
|
||||
public:
|
||||
stream_per_thread();
|
||||
stream_per_thread(const stream_per_thread& ) = delete;
|
||||
void operator=(const stream_per_thread& ) = delete;
|
||||
stream_per_thread(const stream_per_thread&) = delete;
|
||||
void operator=(const stream_per_thread&) = delete;
|
||||
~stream_per_thread();
|
||||
hipStream_t get();
|
||||
void clear_spt();
|
||||
};
|
||||
|
||||
class Device;
|
||||
class MemoryPool;
|
||||
class Event;
|
||||
class Stream : public amd::HostQueue {
|
||||
public:
|
||||
enum Priority : int { High = -1, Normal = 0, Low = 1 };
|
||||
class Device;
|
||||
class MemoryPool;
|
||||
class Event;
|
||||
class Stream : public amd::HostQueue {
|
||||
public:
|
||||
enum Priority : int { High = -1, Normal = 0, Low = 1 };
|
||||
|
||||
private:
|
||||
mutable amd::Monitor lock_;
|
||||
Device* device_;
|
||||
Priority priority_;
|
||||
unsigned int flags_;
|
||||
bool null_;
|
||||
const std::vector<uint32_t> cuMask_;
|
||||
private:
|
||||
mutable amd::Monitor lock_;
|
||||
Device* device_;
|
||||
Priority priority_;
|
||||
unsigned int flags_;
|
||||
bool null_;
|
||||
const std::vector<uint32_t> cuMask_;
|
||||
|
||||
/// Stream capture related parameters
|
||||
/// Stream capture related parameters
|
||||
|
||||
/// Current capture status of the stream
|
||||
hipStreamCaptureStatus captureStatus_;
|
||||
/// Graph that is constructed with capture
|
||||
hip::Graph* pCaptureGraph_;
|
||||
/// Based on mode stream capture places restrictions on API calls that can be made within or
|
||||
/// concurrently
|
||||
hipStreamCaptureMode captureMode_{hipStreamCaptureModeGlobal};
|
||||
bool originStream_;
|
||||
/// Origin stream has no parent. Parent stream for the derived captured streams with event
|
||||
/// dependencies
|
||||
hipStream_t parentStream_ = nullptr;
|
||||
/// Last graph node captured in the stream
|
||||
std::vector<hip::GraphNode*> lastCapturedNodes_;
|
||||
/// dependencies removed via API hipStreamUpdateCaptureDependencies
|
||||
std::vector<hip::GraphNode*> removedDependencies_;
|
||||
/// Derived streams/Parallel branches from the origin stream
|
||||
std::vector<hipStream_t> parallelCaptureStreams_;
|
||||
/// Capture events
|
||||
std::unordered_set<hipEvent_t> captureEvents_;
|
||||
unsigned long long captureID_;
|
||||
/// Current capture status of the stream
|
||||
hipStreamCaptureStatus captureStatus_;
|
||||
/// Graph that is constructed with capture
|
||||
hip::Graph* pCaptureGraph_;
|
||||
/// Based on mode stream capture places restrictions on API calls that can be made within or
|
||||
/// concurrently
|
||||
hipStreamCaptureMode captureMode_{hipStreamCaptureModeGlobal};
|
||||
bool originStream_;
|
||||
/// Origin stream has no parent. Parent stream for the derived captured streams with event
|
||||
/// dependencies
|
||||
hipStream_t parentStream_ = nullptr;
|
||||
/// Last graph node captured in the stream
|
||||
std::vector<hip::GraphNode*> lastCapturedNodes_;
|
||||
/// dependencies removed via API hipStreamUpdateCaptureDependencies
|
||||
std::vector<hip::GraphNode*> removedDependencies_;
|
||||
/// Derived streams/Parallel branches from the origin stream
|
||||
std::vector<hipStream_t> parallelCaptureStreams_;
|
||||
/// Capture events
|
||||
std::unordered_set<hipEvent_t> captureEvents_;
|
||||
unsigned long long captureID_;
|
||||
|
||||
static inline CommandQueue::Priority convertToQueuePriority(Priority p) {
|
||||
return p == Priority::High ? amd::CommandQueue::Priority::High : p == Priority::Low ?
|
||||
amd::CommandQueue::Priority::Low : amd::CommandQueue::Priority::Normal;
|
||||
}
|
||||
static inline CommandQueue::Priority convertToQueuePriority(Priority p) {
|
||||
return p == Priority::High ? amd::CommandQueue::Priority::High
|
||||
: p == Priority::Low ? amd::CommandQueue::Priority::Low
|
||||
: amd::CommandQueue::Priority::Normal;
|
||||
}
|
||||
|
||||
public:
|
||||
Stream(Device* dev, Priority p = Priority::Normal, unsigned int f = 0, bool null_stream = false,
|
||||
const std::vector<uint32_t>& cuMask = {},
|
||||
hipStreamCaptureStatus captureStatus = hipStreamCaptureStatusNone);
|
||||
public:
|
||||
Stream(Device* dev, Priority p = Priority::Normal, unsigned int f = 0, bool null_stream = false,
|
||||
const std::vector<uint32_t>& cuMask = {},
|
||||
hipStreamCaptureStatus captureStatus = hipStreamCaptureStatusNone);
|
||||
|
||||
/// Creates the hip stream object, including AMD host queue
|
||||
bool Create();
|
||||
/// Get device ID associated with the current stream;
|
||||
int DeviceId() const;
|
||||
/// Get HIP device associated with the stream
|
||||
Device* GetDevice() const { return device_; }
|
||||
/// Get device ID associated with a stream;
|
||||
static int DeviceId(const hipStream_t hStream);
|
||||
/// Returns if stream is null stream
|
||||
bool Null() const { return null_; }
|
||||
/// Returns the lock object for the current stream
|
||||
amd::Monitor& Lock() const { return lock_; }
|
||||
/// Returns the creation flags for the current stream
|
||||
unsigned int Flags() const { return flags_; }
|
||||
/// Returns the priority for the current stream
|
||||
Priority GetPriority() const { return priority_; }
|
||||
/// Returns the CU mask for the current stream
|
||||
const std::vector<uint32_t> GetCUMask() const { return cuMask_; }
|
||||
/// Creates the hip stream object, including AMD host queue
|
||||
bool Create();
|
||||
/// Get device ID associated with the current stream;
|
||||
int DeviceId() const;
|
||||
/// Get HIP device associated with the stream
|
||||
Device* GetDevice() const { return device_; }
|
||||
/// Get device ID associated with a stream;
|
||||
static int DeviceId(const hipStream_t hStream);
|
||||
/// Returns if stream is null stream
|
||||
bool Null() const { return null_; }
|
||||
/// Returns the lock object for the current stream
|
||||
amd::Monitor& Lock() const { return lock_; }
|
||||
/// Returns the creation flags for the current stream
|
||||
unsigned int Flags() const { return flags_; }
|
||||
/// Returns the priority for the current stream
|
||||
Priority GetPriority() const { return priority_; }
|
||||
/// Returns the CU mask for the current stream
|
||||
const std::vector<uint32_t> GetCUMask() const { return cuMask_; }
|
||||
|
||||
/// Check whether any blocking stream running
|
||||
static bool StreamCaptureBlocking();
|
||||
/// Check whether any blocking stream running
|
||||
static bool StreamCaptureBlocking();
|
||||
|
||||
static void Destroy(hip::Stream* stream, bool forceDestroy = false);
|
||||
static void Destroy(hip::Stream* stream, bool forceDestroy = false);
|
||||
|
||||
virtual bool terminate();
|
||||
virtual bool terminate();
|
||||
|
||||
/// Check Stream Capture status to make sure it is done
|
||||
static bool StreamCaptureOngoing(hipStream_t hStream);
|
||||
/// Check Stream Capture status to make sure it is done
|
||||
static bool StreamCaptureOngoing(hipStream_t hStream);
|
||||
|
||||
/// Returns capture status of the current stream
|
||||
hipStreamCaptureStatus GetCaptureStatus() const { return captureStatus_; }
|
||||
/// Returns capture mode of the current stream
|
||||
hipStreamCaptureMode GetCaptureMode() const { return captureMode_; }
|
||||
/// Returns if stream is origin stream
|
||||
bool IsOriginStream() const { return originStream_; }
|
||||
void SetOriginStream() { originStream_ = true; }
|
||||
/// Returns captured graph
|
||||
hip::Graph* GetCaptureGraph() const { return pCaptureGraph_; }
|
||||
/// Returns last captured graph node
|
||||
const std::vector<hip::GraphNode*>& GetLastCapturedNodes() const { return lastCapturedNodes_; }
|
||||
/// Set last captured graph node
|
||||
void SetLastCapturedNode(hip::GraphNode* graphNode) {
|
||||
/// Returns capture status of the current stream
|
||||
hipStreamCaptureStatus GetCaptureStatus() const { return captureStatus_; }
|
||||
/// Returns capture mode of the current stream
|
||||
hipStreamCaptureMode GetCaptureMode() const { return captureMode_; }
|
||||
/// Returns if stream is origin stream
|
||||
bool IsOriginStream() const { return originStream_; }
|
||||
void SetOriginStream() { originStream_ = true; }
|
||||
/// Returns captured graph
|
||||
hip::Graph* GetCaptureGraph() const { return pCaptureGraph_; }
|
||||
/// Returns last captured graph node
|
||||
const std::vector<hip::GraphNode*>& GetLastCapturedNodes() const { return lastCapturedNodes_; }
|
||||
/// Set last captured graph node
|
||||
void SetLastCapturedNode(hip::GraphNode* graphNode) {
|
||||
lastCapturedNodes_.clear();
|
||||
lastCapturedNodes_.push_back(graphNode);
|
||||
}
|
||||
/// returns updated dependencies removed
|
||||
const std::vector<hip::GraphNode*>& GetRemovedDependencies() { return removedDependencies_; }
|
||||
/// Append captured node via the wait event cross stream
|
||||
void AddCrossCapturedNode(std::vector<hip::GraphNode*> graphNodes, bool replace = false) {
|
||||
// replace dependencies as per flag hipStreamSetCaptureDependencies
|
||||
if (replace == true) {
|
||||
for (auto node : lastCapturedNodes_) {
|
||||
removedDependencies_.push_back(node);
|
||||
}
|
||||
lastCapturedNodes_.clear();
|
||||
lastCapturedNodes_.push_back(graphNode);
|
||||
}
|
||||
/// returns updated dependencies removed
|
||||
const std::vector<hip::GraphNode*>& GetRemovedDependencies() {
|
||||
return removedDependencies_;
|
||||
}
|
||||
/// Append captured node via the wait event cross stream
|
||||
void AddCrossCapturedNode(std::vector<hip::GraphNode*> graphNodes, bool replace = false) {
|
||||
// replace dependencies as per flag hipStreamSetCaptureDependencies
|
||||
if (replace == true) {
|
||||
for (auto node : lastCapturedNodes_) {
|
||||
removedDependencies_.push_back(node);
|
||||
}
|
||||
lastCapturedNodes_.clear();
|
||||
}
|
||||
for (auto node : graphNodes) {
|
||||
if (std::find(lastCapturedNodes_.begin(), lastCapturedNodes_.end(), node) ==
|
||||
lastCapturedNodes_.end()) {
|
||||
lastCapturedNodes_.push_back(node);
|
||||
}
|
||||
for (auto node : graphNodes) {
|
||||
if (std::find(lastCapturedNodes_.begin(), lastCapturedNodes_.end(), node) ==
|
||||
lastCapturedNodes_.end()) {
|
||||
lastCapturedNodes_.push_back(node);
|
||||
}
|
||||
}
|
||||
/// Set graph that is being captured
|
||||
// Stores pGraph as the capture graph and switches captureStatus_ to active.
void SetCaptureGraph(hip::Graph* pGraph) {
|
||||
pCaptureGraph_ = pGraph;
|
||||
// The status is set unconditionally; pGraph is not checked for nullptr here
captureStatus_ = hipStreamCaptureStatusActive;
|
||||
}
|
||||
/// Set graph that is being captured
|
||||
void SetCaptureGraph(hip::Graph* pGraph) {
|
||||
pCaptureGraph_ = pGraph;
|
||||
captureStatus_ = hipStreamCaptureStatusActive;
|
||||
}
|
||||
/// Reset graph to nullptr when capture is invalidated, but keep the status
|
||||
void ResetCaptureGraph() { pCaptureGraph_ = nullptr; }
|
||||
// Assigns captureID_ a new value obtained from GenerateCaptureID().
void SetCaptureId() {
|
||||
// ID is generated in Begin Capture, i.e. when capture status is active
|
||||
captureID_ = GenerateCaptureID();
|
||||
}
|
||||
void SetCaptureId(unsigned long long captureId) {
|
||||
// ID is given from parent stream
|
||||
captureID_ = captureId;
|
||||
}
|
||||
/// reset capture parameters
|
||||
hipError_t EndCapture();
|
||||
/// Set capture status
|
||||
void SetCaptureStatus(hipStreamCaptureStatus captureStatus) { captureStatus_ = captureStatus; }
|
||||
/// Set capture mode
|
||||
void SetCaptureMode(hipStreamCaptureMode captureMode) { captureMode_ = captureMode; }
|
||||
/// Set parent stream
|
||||
void SetParentStream(hipStream_t parentStream) { parentStream_ = parentStream; }
|
||||
/// Get parent stream
|
||||
hipStream_t GetParentStream() const { return parentStream_; }
|
||||
/// Generate ID for stream capture unique over the lifetime of the process
|
||||
static unsigned long long GenerateCaptureID() {
|
||||
// Function-local static: one counter shared by every caller; std::atomic makes the increment atomic
static std::atomic<unsigned long long> uid(0);
|
||||
// Pre-increment, so the first ID handed out is 1
return ++uid;
|
||||
}
|
||||
/// Get Capture ID
|
||||
unsigned long long GetCaptureID() { return captureID_; }
|
||||
// Records event e in captureEvents_.
void SetCaptureEvent(hipEvent_t e) {
|
||||
// Scoped lock on lock_ around the captureEvents_ update
amd::ScopedLock lock(lock_);
|
||||
captureEvents_.emplace(e);
|
||||
}
|
||||
bool IsEventCaptured(hipEvent_t e) {
|
||||
amd::ScopedLock lock(lock_);
|
||||
auto it = captureEvents_.find(e);
|
||||
if (it != captureEvents_.end()) {
|
||||
return true;
|
||||
}
|
||||
/// Reset graph to nullptr when capture is invalidated, but keep the status
|
||||
void ResetCaptureGraph() { pCaptureGraph_ = nullptr; }
|
||||
void SetCaptureId() {
|
||||
// ID is generated in Begin Capture, i.e. when capture status is active
|
||||
captureID_ = GenerateCaptureID();
|
||||
return false;
|
||||
}
|
||||
void EraseCaptureEvent(hipEvent_t e) {
|
||||
amd::ScopedLock lock(lock_);
|
||||
auto it = captureEvents_.find(e);
|
||||
if (it != captureEvents_.end()) {
|
||||
captureEvents_.erase(it);
|
||||
}
|
||||
void SetCaptureId(unsigned long long captureId) {
|
||||
// ID is given from parent stream
|
||||
captureID_ = captureId;
|
||||
}
|
||||
void SetParallelCaptureStream(hipStream_t s) {
|
||||
auto it = std::find(parallelCaptureStreams_.begin(), parallelCaptureStreams_.end(), s);
|
||||
if (it == parallelCaptureStreams_.end()) {
|
||||
parallelCaptureStreams_.push_back(s);
|
||||
}
|
||||
/// reset capture parameters
|
||||
hipError_t EndCapture();
|
||||
/// Set capture status
|
||||
void SetCaptureStatus(hipStreamCaptureStatus captureStatus) { captureStatus_ = captureStatus; }
|
||||
/// Set capture mode
|
||||
void SetCaptureMode(hipStreamCaptureMode captureMode) { captureMode_ = captureMode; }
|
||||
/// Set parent stream
|
||||
void SetParentStream(hipStream_t parentStream) { parentStream_ = parentStream; }
|
||||
/// Get parent stream
|
||||
hipStream_t GetParentStream() const { return parentStream_; }
|
||||
/// Generate ID for stream capture unique over the lifetime of the process
|
||||
static unsigned long long GenerateCaptureID() {
|
||||
static std::atomic<unsigned long long> uid(0);
|
||||
return ++uid;
|
||||
}
|
||||
/// Get Capture ID
|
||||
unsigned long long GetCaptureID() { return captureID_; }
|
||||
void SetCaptureEvent(hipEvent_t e) {
|
||||
amd::ScopedLock lock(lock_);
|
||||
captureEvents_.emplace(e); }
|
||||
// Returns true if event e is present in captureEvents_, false otherwise.
bool IsEventCaptured(hipEvent_t e) {
|
||||
// Scoped lock on lock_ around the captureEvents_ lookup
amd::ScopedLock lock(lock_);
|
||||
auto it = captureEvents_.find(e);
|
||||
if (it != captureEvents_.end()) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
// Removes event e from captureEvents_ if it is present; does nothing otherwise.
void EraseCaptureEvent(hipEvent_t e) {
|
||||
// Scoped lock on lock_ around the lookup and erase
amd::ScopedLock lock(lock_);
|
||||
auto it = captureEvents_.find(e);
|
||||
if (it != captureEvents_.end()) {
|
||||
captureEvents_.erase(it);
|
||||
}
|
||||
}
|
||||
// Appends stream s to parallelCaptureStreams_ unless it is already listed.
// NOTE(review): no lock is taken in this method - confirm callers serialize access.
void SetParallelCaptureStream(hipStream_t s) {
|
||||
// Linear search first so the same stream is never stored twice
auto it = std::find(parallelCaptureStreams_.begin(), parallelCaptureStreams_.end(), s);
|
||||
if (it == parallelCaptureStreams_.end()) {
|
||||
parallelCaptureStreams_.push_back(s);
|
||||
}
|
||||
}
|
||||
// Removes stream s from parallelCaptureStreams_ if it is listed; does nothing otherwise.
// NOTE(review): no lock is taken in this method - confirm callers serialize access.
void EraseParallelCaptureStream(hipStream_t s) {
|
||||
auto it = std::find(parallelCaptureStreams_.begin(), parallelCaptureStreams_.end(), s);
|
||||
if (it != parallelCaptureStreams_.end()) {
|
||||
parallelCaptureStreams_.erase(it);
|
||||
}
|
||||
}
|
||||
void EraseParallelCaptureStream(hipStream_t s) {
|
||||
auto it = std::find(parallelCaptureStreams_.begin(), parallelCaptureStreams_.end(), s);
|
||||
if (it != parallelCaptureStreams_.end()) {
|
||||
parallelCaptureStreams_.erase(it);
|
||||
}
|
||||
}
|
||||
|
||||
/// The stream should be destroyed via release() rather than delete
|
||||
private:
|
||||
~Stream() {};
|
||||
};
|
||||
/// The stream should be destroyed via release() rather than delete
|
||||
private:
|
||||
~Stream() {};
|
||||
};
|
||||
|
||||
/// HIP Device class
|
||||
class Device : public amd::ReferenceCountedObject {
|
||||
// Device lock
|
||||
amd::Monitor lock_{true};
|
||||
// Guards device stream set
|
||||
std::shared_mutex streamSetLock;
|
||||
std::unordered_set<hip::Stream*> streamSet;
|
||||
/// ROCclr context
|
||||
amd::Context* context_;
|
||||
/// Device's ID
|
||||
/// Store it here so we don't have to loop through the device list every time
|
||||
int deviceId_;
|
||||
/// ROCclr host queue for default streams
|
||||
Stream* null_stream_ = nullptr;
|
||||
/// Store device flags
|
||||
unsigned int flags_;
|
||||
/// Maintain list of user enabled peers
|
||||
std::list<int> userEnabledPeers;
|
||||
/// HIP Device class
|
||||
class Device : public amd::ReferenceCountedObject {
|
||||
// Device lock
|
||||
amd::Monitor lock_{true};
|
||||
// Guards device stream set
|
||||
std::shared_mutex streamSetLock;
|
||||
std::unordered_set<hip::Stream*> streamSet;
|
||||
/// ROCclr context
|
||||
amd::Context* context_;
|
||||
/// Device's ID
|
||||
/// Store it here so we don't have to loop through the device list every time
|
||||
int deviceId_;
|
||||
/// ROCclr host queue for default streams
|
||||
Stream* null_stream_ = nullptr;
|
||||
/// Store device flags
|
||||
unsigned int flags_;
|
||||
/// Maintain list of user enabled peers
|
||||
std::list<int> userEnabledPeers;
|
||||
|
||||
/// True if this device is active
|
||||
bool isActive_;
|
||||
/// True if this device is active
|
||||
bool isActive_;
|
||||
|
||||
|
||||
MemoryPool* default_mem_pool_; //!< Default memory pool for this device
|
||||
MemoryPool* current_mem_pool_;
|
||||
MemoryPool* graph_mem_pool_; //!< Memory pool, associated with graphs for this device
|
||||
MemoryPool* default_mem_pool_; //!< Default memory pool for this device
|
||||
MemoryPool* current_mem_pool_;
|
||||
MemoryPool* graph_mem_pool_; //!< Memory pool, associated with graphs for this device
|
||||
|
||||
std::set<MemoryPool*> mem_pools_;
|
||||
std::set<MemoryPool*> mem_pools_;
|
||||
|
||||
public:
|
||||
Device(amd::Context* ctx, int devId): context_(ctx),
|
||||
public:
|
||||
Device(amd::Context* ctx, int devId)
|
||||
: context_(ctx),
|
||||
deviceId_(devId),
|
||||
flags_(hipDeviceScheduleSpin),
|
||||
flags_(hipDeviceScheduleSpin),
|
||||
isActive_(false),
|
||||
default_mem_pool_(nullptr),
|
||||
current_mem_pool_(nullptr),
|
||||
graph_mem_pool_(nullptr)
|
||||
{ assert(ctx != nullptr); }
|
||||
~Device();
|
||||
graph_mem_pool_(nullptr) {
|
||||
assert(ctx != nullptr);
|
||||
}
|
||||
~Device();
|
||||
|
||||
bool Create();
|
||||
amd::Context* asContext() const { return context_; }
|
||||
int deviceId() const { return deviceId_; }
|
||||
void retain() const { context_->retain(); }
|
||||
void release() const { context_->release(); }
|
||||
const std::vector<amd::Device*>& devices() const { return context_->devices(); }
|
||||
hipError_t EnablePeerAccess(int peerDeviceId){
|
||||
amd::ScopedLock lock(lock_);
|
||||
bool found = (std::find(userEnabledPeers.begin(), userEnabledPeers.end(), peerDeviceId) != userEnabledPeers.end());
|
||||
if (found) {
|
||||
return hipErrorPeerAccessAlreadyEnabled;
|
||||
}
|
||||
userEnabledPeers.push_back(peerDeviceId);
|
||||
bool Create();
|
||||
amd::Context* asContext() const { return context_; }
|
||||
int deviceId() const { return deviceId_; }
|
||||
void retain() const { context_->retain(); }
|
||||
void release() const { context_->release(); }
|
||||
const std::vector<amd::Device*>& devices() const { return context_->devices(); }
|
||||
// Adds peerDeviceId to userEnabledPeers.
// Returns hipErrorPeerAccessAlreadyEnabled if the ID is already in the list,
// hipSuccess after it has been appended.
hipError_t EnablePeerAccess(int peerDeviceId) {
|
||||
// Scoped lock on lock_ around the check-then-insert on userEnabledPeers
amd::ScopedLock lock(lock_);
|
||||
bool found = (std::find(userEnabledPeers.begin(), userEnabledPeers.end(), peerDeviceId) !=
|
||||
userEnabledPeers.end());
|
||||
if (found) {
|
||||
return hipErrorPeerAccessAlreadyEnabled;
|
||||
}
|
||||
userEnabledPeers.push_back(peerDeviceId);
|
||||
return hipSuccess;
|
||||
}
|
||||
hipError_t DisablePeerAccess(int peerDeviceId) {
|
||||
amd::ScopedLock lock(lock_);
|
||||
bool found = (std::find(userEnabledPeers.begin(), userEnabledPeers.end(), peerDeviceId) !=
|
||||
userEnabledPeers.end());
|
||||
if (found) {
|
||||
userEnabledPeers.remove(peerDeviceId);
|
||||
return hipSuccess;
|
||||
} else {
|
||||
return hipErrorPeerAccessNotEnabled;
|
||||
}
|
||||
hipError_t DisablePeerAccess(int peerDeviceId) {
|
||||
amd::ScopedLock lock(lock_);
|
||||
bool found = (std::find(userEnabledPeers.begin(), userEnabledPeers.end(), peerDeviceId) != userEnabledPeers.end());
|
||||
if (found) {
|
||||
userEnabledPeers.remove(peerDeviceId);
|
||||
return hipSuccess;
|
||||
} else {
|
||||
return hipErrorPeerAccessNotEnabled;
|
||||
}
|
||||
}
|
||||
unsigned int getFlags() const { return flags_; }
|
||||
void setFlags(unsigned int flags) { flags_ = flags; }
|
||||
void Reset();
|
||||
}
|
||||
unsigned int getFlags() const { return flags_; }
|
||||
void setFlags(unsigned int flags) { flags_ = flags; }
|
||||
void Reset();
|
||||
|
||||
hip::Stream* NullStream(bool wait = true);
|
||||
Stream* GetNullStream() const {return null_stream_;};
|
||||
hip::Stream* NullStream(bool wait = true);
|
||||
Stream* GetNullStream() const { return null_stream_; };
|
||||
|
||||
void SetActiveStatus() {
|
||||
void SetActiveStatus() { isActive_ = true; }
|
||||
|
||||
bool GetActiveStatus() {
|
||||
amd::ScopedLock lock(lock_);
|
||||
/// Either stream is active or device is active
|
||||
if (isActive_) return true;
|
||||
if (existsActiveStreamForDevice()) {
|
||||
isActive_ = true;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Returns true if isActive_ is already set or existsActiveStreamForDevice() reports true;
// in the latter case isActive_ is set to true as well. Returns false otherwise.
bool GetActiveStatus() {
|
||||
amd::ScopedLock lock(lock_);
|
||||
/// Either stream is active or device is active
|
||||
if (isActive_) return true;
|
||||
if (existsActiveStreamForDevice()) {
|
||||
// Cache the result so later calls return from the isActive_ check above
isActive_ = true;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
/// Set the current memory pool on the device
|
||||
void SetCurrentMemoryPool(MemoryPool* pool = nullptr) {
|
||||
current_mem_pool_ = (pool == nullptr) ? default_mem_pool_ : pool;
|
||||
}
|
||||
|
||||
/// Set the current memory pool on the device
|
||||
void SetCurrentMemoryPool(MemoryPool* pool = nullptr) {
|
||||
// A nullptr argument (also the default) selects default_mem_pool_ instead
current_mem_pool_ = (pool == nullptr) ? default_mem_pool_ : pool;
|
||||
}
|
||||
/// Get the current memory pool on the device
|
||||
MemoryPool* GetCurrentMemoryPool() const { return current_mem_pool_; }
|
||||
|
||||
/// Get the current memory pool on the device
|
||||
MemoryPool* GetCurrentMemoryPool() const { return current_mem_pool_; }
|
||||
/// Get the default memory pool on the device
|
||||
MemoryPool* GetDefaultMemoryPool() const { return default_mem_pool_; }
|
||||
|
||||
/// Get the default memory pool on the device
|
||||
MemoryPool* GetDefaultMemoryPool() const { return default_mem_pool_; }
|
||||
/// Get the graph memory pool on the device
|
||||
MemoryPool* GetGraphMemoryPool() const { return graph_mem_pool_; }
|
||||
|
||||
/// Get the graph memory pool on the device
|
||||
MemoryPool* GetGraphMemoryPool() const { return graph_mem_pool_; }
|
||||
/// Add memory pool to the device
|
||||
void AddMemoryPool(MemoryPool* pool);
|
||||
|
||||
/// Add memory pool to the device
|
||||
void AddMemoryPool(MemoryPool* pool);
|
||||
/// Remove memory pool from the device
|
||||
void RemoveMemoryPool(MemoryPool* pool);
|
||||
|
||||
/// Remove memory pool from the device
|
||||
void RemoveMemoryPool(MemoryPool* pool);
|
||||
/// Free memory from the device
|
||||
bool FreeMemory(amd::Memory* memory, Stream* stream, Event* event = nullptr);
|
||||
|
||||
/// Free memory from the device
|
||||
bool FreeMemory(amd::Memory* memory, Stream* stream, Event* event = nullptr);
|
||||
/// Release freed memory from all pools on the current device
|
||||
void ReleaseFreedMemory();
|
||||
|
||||
/// Release freed memory from all pools on the current device
|
||||
void ReleaseFreedMemory();
|
||||
/// Removes a destroyed stream from the safe list of memory pools
|
||||
void RemoveStreamFromPools(Stream* stream);
|
||||
|
||||
/// Removes a destroyed stream from the safe list of memory pools
|
||||
void RemoveStreamFromPools(Stream* stream);
|
||||
/// Add safe streams into the memory pools for reuse
|
||||
void AddSafeStream(Stream* event_stream, Stream* wait_stream);
|
||||
|
||||
/// Add safe streams into the memory pools for reuse
|
||||
void AddSafeStream(Stream* event_stream, Stream* wait_stream);
|
||||
/// Returns true if memory pool is valid on this device
|
||||
bool IsMemoryPoolValid(MemoryPool* pool);
|
||||
void AddStream(Stream* stream);
|
||||
|
||||
/// Returns true if memory pool is valid on this device
|
||||
bool IsMemoryPoolValid(MemoryPool* pool);
|
||||
void AddStream(Stream* stream);
|
||||
void RemoveStream(Stream* stream);
|
||||
|
||||
void RemoveStream(Stream* stream);
|
||||
bool StreamExists(Stream* stream);
|
||||
|
||||
bool StreamExists(Stream* stream);
|
||||
void destroyAllStreams();
|
||||
|
||||
void destroyAllStreams();
|
||||
void SyncAllStreams(bool cpu_wait = true, bool wait_blocking_streams_only = false);
|
||||
|
||||
void SyncAllStreams( bool cpu_wait = true, bool wait_blocking_streams_only = false);
|
||||
bool StreamCaptureBlocking();
|
||||
|
||||
bool StreamCaptureBlocking();
|
||||
|
||||
bool existsActiveStreamForDevice();
|
||||
bool existsActiveStreamForDevice();
|
||||
/// Wait all active streams on the blocking queue. The method enqueues a wait command and
|
||||
/// doesn't stall the current thread
|
||||
void WaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stream = false);
|
||||
};
|
||||
void WaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stream = false);
|
||||
};
|
||||
|
||||
/// Thread Local Storage Variables Aggregator Class
|
||||
class TlsAggregator {
|
||||
public:
|
||||
Device* device_;
|
||||
std::stack<Device*> ctxt_stack_;
|
||||
hipError_t last_error_, last_command_error_;
|
||||
std::vector<hip::Stream*> capture_streams_;
|
||||
hipStreamCaptureMode stream_capture_mode_;
|
||||
std::stack<ihipExec_t> exec_stack_;
|
||||
stream_per_thread stream_per_thread_obj_;
|
||||
bool isSetDeviceCalled;
|
||||
/// Thread Local Storage Variables Aggregator Class
|
||||
class TlsAggregator {
|
||||
public:
|
||||
Device* device_;
|
||||
std::stack<Device*> ctxt_stack_;
|
||||
hipError_t last_error_, last_command_error_;
|
||||
std::vector<hip::Stream*> capture_streams_;
|
||||
hipStreamCaptureMode stream_capture_mode_;
|
||||
std::stack<ihipExec_t> exec_stack_;
|
||||
stream_per_thread stream_per_thread_obj_;
|
||||
bool isSetDeviceCalled;
|
||||
|
||||
TlsAggregator(): device_(nullptr),
|
||||
last_error_(hipSuccess),
|
||||
last_command_error_(hipSuccess),
|
||||
stream_capture_mode_(hipStreamCaptureModeGlobal),
|
||||
isSetDeviceCalled(false) {
|
||||
}
|
||||
~TlsAggregator() {
|
||||
}
|
||||
};
|
||||
extern thread_local TlsAggregator tls;
|
||||
TlsAggregator()
|
||||
: device_(nullptr),
|
||||
last_error_(hipSuccess),
|
||||
last_command_error_(hipSuccess),
|
||||
stream_capture_mode_(hipStreamCaptureModeGlobal),
|
||||
isSetDeviceCalled(false) {}
|
||||
~TlsAggregator() {}
|
||||
};
|
||||
extern thread_local TlsAggregator tls;
|
||||
|
||||
/// Device representing the host - for pinned memory
|
||||
extern amd::Context* host_context;
|
||||
/// Device representing the host - for pinned memory
|
||||
extern amd::Context* host_context;
|
||||
|
||||
extern void init(bool* status);
|
||||
extern void init(bool* status);
|
||||
|
||||
extern Device* getCurrentDevice();
|
||||
extern Device* getCurrentDevice();
|
||||
|
||||
extern void setCurrentDevice(unsigned int index);
|
||||
extern void setCurrentDevice(unsigned int index);
|
||||
|
||||
/// Get ROCclr queue associated with hipStream
|
||||
/// Note: This follows the CUDA spec to sync with default streams
|
||||
/// and Blocking streams
|
||||
extern hip::Stream* getStream(hipStream_t stream, bool wait = true);
|
||||
/// Get default stream associated with the ROCclr context
|
||||
extern hip::Stream* getNullStream(amd::Context&, bool wait = true);
|
||||
/// Get default stream of the thread
|
||||
extern hip::Stream* getNullStream(bool wait = true);
|
||||
/// Get device ID associated with the ROCclr context
|
||||
int getDeviceID(amd::Context& ctx);
|
||||
/// Check if stream is valid
|
||||
extern bool isValid(hipStream_t& stream);
|
||||
extern bool isValid(hipEvent_t event);
|
||||
extern amd::Monitor hipArraySetLock;
|
||||
extern std::unordered_set<hipArray*> hipArraySet;
|
||||
/// Get ROCclr queue associated with hipStream
|
||||
/// Note: This follows the CUDA spec to sync with default streams
|
||||
/// and Blocking streams
|
||||
extern hip::Stream* getStream(hipStream_t stream, bool wait = true);
|
||||
/// Get default stream associated with the ROCclr context
|
||||
extern hip::Stream* getNullStream(amd::Context&, bool wait = true);
|
||||
/// Get default stream of the thread
|
||||
extern hip::Stream* getNullStream(bool wait = true);
|
||||
/// Get device ID associated with the ROCclr context
|
||||
int getDeviceID(amd::Context& ctx);
|
||||
/// Check if stream is valid
|
||||
extern bool isValid(hipStream_t& stream);
|
||||
extern bool isValid(hipEvent_t event);
|
||||
extern amd::Monitor hipArraySetLock;
|
||||
extern std::unordered_set<hipArray*> hipArraySet;
|
||||
|
||||
extern void WaitThenDecrementSignal(hipStream_t stream, hipError_t status, void* user_data);
|
||||
extern void WaitThenDecrementSignal(hipStream_t stream, hipError_t status, void* user_data);
|
||||
|
||||
extern std::vector<hip::Device*> g_devices;
|
||||
extern hipError_t ihipDeviceGetCount(int* count);
|
||||
extern int ihipGetDevice();
|
||||
extern std::vector<hip::Device*> g_devices;
|
||||
extern hipError_t ihipDeviceGetCount(int* count);
|
||||
extern int ihipGetDevice();
|
||||
|
||||
extern hipError_t ihipMalloc(void** ptr, size_t sizeBytes, unsigned int flags);
|
||||
extern hipError_t ihipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags);
|
||||
extern amd::Memory* getMemoryObject(const void* ptr, size_t& offset, size_t size = 0);
|
||||
extern amd::Memory* getMemoryObjectWithOffset(const void* ptr, const size_t size = 0);
|
||||
extern void getStreamPerThread(hipStream_t& stream);
|
||||
extern hipStream_t getPerThreadDefaultStream();
|
||||
extern hipError_t ihipUnbindTexture(textureReference* texRef);
|
||||
extern hipError_t ihipHostRegister(void* hostPtr, size_t sizeBytes, unsigned int flags);
|
||||
extern hipError_t ihipHostUnregister(void* hostPtr);
|
||||
extern hipError_t ihipGetDeviceProperties(hipDeviceProp_t* props, hipDevice_t device);
|
||||
extern hipError_t ihipMalloc(void** ptr, size_t sizeBytes, unsigned int flags);
|
||||
extern hipError_t ihipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags);
|
||||
extern amd::Memory* getMemoryObject(const void* ptr, size_t& offset, size_t size = 0);
|
||||
extern amd::Memory* getMemoryObjectWithOffset(const void* ptr, const size_t size = 0);
|
||||
extern void getStreamPerThread(hipStream_t& stream);
|
||||
extern hipStream_t getPerThreadDefaultStream();
|
||||
extern hipError_t ihipUnbindTexture(textureReference* texRef);
|
||||
extern hipError_t ihipHostRegister(void* hostPtr, size_t sizeBytes, unsigned int flags);
|
||||
extern hipError_t ihipHostUnregister(void* hostPtr);
|
||||
extern hipError_t ihipGetDeviceProperties(hipDeviceProp_t* props, hipDevice_t device);
|
||||
|
||||
extern hipError_t ihipDeviceGet(hipDevice_t* device, int deviceId);
|
||||
extern hipError_t ihipStreamOperation(hipStream_t stream, cl_command_type cmdType, void* ptr,
|
||||
uint64_t value, uint64_t mask, unsigned int flags,
|
||||
size_t sizeBytes);
|
||||
hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind,
|
||||
hip::Stream& stream, bool isHostAsync = false, bool isGPUAsync = true);
|
||||
hipError_t ihipMemcpy3D(const hipMemcpy3DParms* p, hipStream_t stream = nullptr,
|
||||
bool isAsync = false);
|
||||
constexpr bool kOptionChangeable = true;
|
||||
constexpr bool kNewDevProg = false;
|
||||
extern hipError_t ihipDeviceGet(hipDevice_t* device, int deviceId);
|
||||
extern hipError_t ihipStreamOperation(hipStream_t stream, cl_command_type cmdType, void* ptr,
|
||||
uint64_t value, uint64_t mask, unsigned int flags,
|
||||
size_t sizeBytes);
|
||||
hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind,
|
||||
hip::Stream& stream, bool isHostAsync = false, bool isGPUAsync = true);
|
||||
hipError_t ihipMemcpy3D(const hipMemcpy3DParms* p, hipStream_t stream = nullptr,
|
||||
bool isAsync = false);
|
||||
constexpr bool kOptionChangeable = true;
|
||||
constexpr bool kNewDevProg = false;
|
||||
|
||||
constexpr bool kMarkerDisableFlush = true; //!< Avoids command batch flush in ROCclr
|
||||
constexpr bool kMarkerDisableFlush = true; //!< Avoids command batch flush in ROCclr
|
||||
|
||||
extern std::vector<hip::Stream*> g_captureStreams;
|
||||
extern amd::Monitor g_captureStreamsLock;
|
||||
extern amd::Monitor g_streamSetLock;
|
||||
extern std::unordered_set<hip::Stream*> g_allCapturingStreams;
|
||||
} // namespace hip
|
||||
extern std::vector<hip::Stream*> g_captureStreams;
|
||||
extern amd::Monitor g_captureStreamsLock;
|
||||
extern amd::Monitor g_streamSetLock;
|
||||
extern std::unordered_set<hip::Stream*> g_allCapturingStreams;
|
||||
} // namespace hip
|
||||
#endif // HIP_SRC_HIP_INTERNAL_H
|
||||
|
||||
Το diff αρχείου καταστέλλεται επειδή είναι πολύ μεγάλο
Φόρτωση Διαφορών
@@ -74,7 +74,7 @@ hipError_t hipDeviceSetMemPool(int device, hipMemPool_t mem_pool) {
|
||||
// ================================================================================================
|
||||
hipError_t hipDeviceGetMemPool(hipMemPool_t* mem_pool, int device) {
|
||||
HIP_INIT_API(hipDeviceGetMemPool, mem_pool, device);
|
||||
if ((mem_pool == nullptr) || (device >= g_devices.size())) {
|
||||
if ((mem_pool == nullptr) || (device >= g_devices.size())) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
*mem_pool = reinterpret_cast<hipMemPool_t>(g_devices[device]->GetCurrentMemoryPool());
|
||||
@@ -93,8 +93,8 @@ hipError_t hipMallocAsync(void** dev_ptr, size_t size, hipStream_t stream) {
|
||||
HIP_RETURN(hipSuccess);
|
||||
}
|
||||
hip::Stream* s = reinterpret_cast<hip::Stream*>(stream);
|
||||
auto hip_stream = (stream == nullptr || stream == hipStreamLegacy) ?
|
||||
hip::getCurrentDevice()->NullStream() : s;
|
||||
auto hip_stream =
|
||||
(stream == nullptr || stream == hipStreamLegacy) ? hip::getCurrentDevice()->NullStream() : s;
|
||||
auto device = hip_stream->GetDevice();
|
||||
auto mem_pool = device->GetCurrentMemoryPool();
|
||||
|
||||
@@ -120,8 +120,8 @@ hipError_t hipMallocAsync(void** dev_ptr, size_t size, hipStream_t stream) {
|
||||
// memory allocation in graph, which occurs in a worker thread, and host execution of hipFreeAsync
|
||||
class FreeAsyncCommand : public amd::Command {
|
||||
private:
|
||||
void* ptr_; //!< Virtual address for asynchronious free
|
||||
hip::Event* event_; //!< HIP event, associated with this memory release
|
||||
void* ptr_; //!< Virtual address for asynchronious free
|
||||
hip::Event* event_; //!< HIP event, associated with this memory release
|
||||
|
||||
public:
|
||||
FreeAsyncCommand(amd::HostQueue& queue, void* ptr, hip::Event* event)
|
||||
@@ -150,8 +150,8 @@ hipError_t hipFreeAsync(void* dev_ptr, hipStream_t stream) {
|
||||
getStreamPerThread(stream);
|
||||
|
||||
hip::Stream* s = reinterpret_cast<hip::Stream*>(stream);
|
||||
auto hip_stream = (stream == nullptr || stream == hipStreamLegacy) ?
|
||||
hip::getCurrentDevice()->NullStream(): s;
|
||||
auto hip_stream =
|
||||
(stream == nullptr || stream == hipStreamLegacy) ? hip::getCurrentDevice()->NullStream() : s;
|
||||
|
||||
auto device = hip_stream->GetDevice();
|
||||
// Return error if any stream other than the current stream is in capture mode
|
||||
@@ -199,8 +199,7 @@ hipError_t hipFreeAsync(void* dev_ptr, hipStream_t stream) {
|
||||
// may block the execution
|
||||
event = new hip::Event(0);
|
||||
if (event != nullptr) {
|
||||
if (hipSuccess !=
|
||||
event->addMarker(hip_stream, nullptr)) {
|
||||
if (hipSuccess != event->addMarker(hip_stream, nullptr)) {
|
||||
delete event;
|
||||
event = nullptr;
|
||||
} else {
|
||||
@@ -253,10 +252,8 @@ hipError_t hipMemPoolGetAttribute(hipMemPool_t mem_pool, hipMemPoolAttr attr, vo
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
hipError_t hipMemPoolSetAccess(
|
||||
hipMemPool_t mem_pool,
|
||||
const hipMemAccessDesc* desc_list,
|
||||
size_t count) {
|
||||
hipError_t hipMemPoolSetAccess(hipMemPool_t mem_pool, const hipMemAccessDesc* desc_list,
|
||||
size_t count) {
|
||||
HIP_INIT_API(hipMemPoolSetAccess, mem_pool, desc_list, count);
|
||||
if ((mem_pool == nullptr) || (desc_list == nullptr)) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
@@ -286,10 +283,8 @@ hipError_t hipMemPoolSetAccess(
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
hipError_t hipMemPoolGetAccess(
|
||||
hipMemAccessFlags* flags,
|
||||
hipMemPool_t mem_pool,
|
||||
hipMemLocation* location) {
|
||||
hipError_t hipMemPoolGetAccess(hipMemAccessFlags* flags, hipMemPool_t mem_pool,
|
||||
hipMemLocation* location) {
|
||||
HIP_INIT_API(hipMemPoolGetAccess, flags, mem_pool, location);
|
||||
if ((mem_pool == nullptr) || (location == nullptr) || (flags == nullptr)) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
@@ -370,11 +365,8 @@ hipError_t hipMemPoolDestroy(hipMemPool_t mem_pool) {
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
hipError_t hipMallocFromPoolAsync(
|
||||
void** dev_ptr,
|
||||
size_t size,
|
||||
hipMemPool_t mem_pool,
|
||||
hipStream_t stream) {
|
||||
hipError_t hipMallocFromPoolAsync(void** dev_ptr, size_t size, hipMemPool_t mem_pool,
|
||||
hipStream_t stream) {
|
||||
HIP_INIT_API(hipMallocFromPoolAsync, dev_ptr, size, mem_pool, stream);
|
||||
if ((dev_ptr == nullptr) || (mem_pool == nullptr)) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
@@ -387,8 +379,9 @@ hipError_t hipMallocFromPoolAsync(
|
||||
STREAM_CAPTURE(hipMallocAsync, stream, mem_pool, size, dev_ptr);
|
||||
|
||||
auto mpool = reinterpret_cast<hip::MemoryPool*>(mem_pool);
|
||||
auto hip_stream = (stream == nullptr || stream == hipStreamLegacy) ?
|
||||
hip::getCurrentDevice()->NullStream() : reinterpret_cast<hip::Stream*>(stream);
|
||||
auto hip_stream = (stream == nullptr || stream == hipStreamLegacy)
|
||||
? hip::getCurrentDevice()->NullStream()
|
||||
: reinterpret_cast<hip::Stream*>(stream);
|
||||
*dev_ptr = mpool->AllocateMemory(size, hip_stream);
|
||||
if (*dev_ptr == nullptr) {
|
||||
HIP_RETURN(hipErrorOutOfMemory);
|
||||
@@ -397,11 +390,9 @@ hipError_t hipMallocFromPoolAsync(
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
hipError_t hipMemPoolExportToShareableHandle(
|
||||
void* shared_handle,
|
||||
hipMemPool_t mem_pool,
|
||||
hipMemAllocationHandleType handle_type,
|
||||
unsigned int flags) {
|
||||
hipError_t hipMemPoolExportToShareableHandle(void* shared_handle, hipMemPool_t mem_pool,
|
||||
hipMemAllocationHandleType handle_type,
|
||||
unsigned int flags) {
|
||||
HIP_INIT_API(hipMemPoolExportToShareableHandle, shared_handle, mem_pool, handle_type, flags);
|
||||
if (mem_pool == nullptr || shared_handle == nullptr || flags != 0) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
@@ -421,11 +412,9 @@ hipError_t hipMemPoolExportToShareableHandle(
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
hipError_t hipMemPoolImportFromShareableHandle(
|
||||
hipMemPool_t* mem_pool,
|
||||
void* shared_handle,
|
||||
hipMemAllocationHandleType handle_type,
|
||||
unsigned int flags) {
|
||||
hipError_t hipMemPoolImportFromShareableHandle(hipMemPool_t* mem_pool, void* shared_handle,
|
||||
hipMemAllocationHandleType handle_type,
|
||||
unsigned int flags) {
|
||||
HIP_INIT_API(hipMemPoolImportFromShareableHandle, mem_pool, shared_handle, handle_type, flags);
|
||||
if (mem_pool == nullptr || shared_handle == nullptr || flags != 0) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
@@ -470,8 +459,8 @@ hipError_t hipMemPoolExportPointer(hipMemPoolPtrExportData* export_data, void* p
|
||||
// Note: export_data must point to 64 bytes of shared memory
|
||||
auto shared = reinterpret_cast<hip::SharedMemPointer*>(export_data);
|
||||
|
||||
if (!g_devices[id]->devices()[0]->IpcCreate(ptr,
|
||||
&shared->size_, &shared->handle_[0], &shared->offset_)) {
|
||||
if (!g_devices[id]->devices()[0]->IpcCreate(ptr, &shared->size_, &shared->handle_[0],
|
||||
&shared->offset_)) {
|
||||
HIP_RETURN(hipErrorOutOfMemory);
|
||||
}
|
||||
} else {
|
||||
@@ -481,18 +470,16 @@ hipError_t hipMemPoolExportPointer(hipMemPoolPtrExportData* export_data, void* p
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
hipError_t hipMemPoolImportPointer(
|
||||
void** ptr,
|
||||
hipMemPool_t mem_pool,
|
||||
hipMemPoolPtrExportData* export_data) {
|
||||
hipError_t hipMemPoolImportPointer(void** ptr, hipMemPool_t mem_pool,
|
||||
hipMemPoolPtrExportData* export_data) {
|
||||
HIP_INIT_API(hipMemPoolImportPointer, ptr, mem_pool, export_data);
|
||||
if (mem_pool == nullptr || export_data == nullptr || ptr == nullptr) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
auto mpool = reinterpret_cast<hip::MemoryPool*>(mem_pool);
|
||||
auto shared = reinterpret_cast<hip::SharedMemPointer*>(export_data);
|
||||
if (!mpool->Device()->devices()[0]->IpcAttach(
|
||||
&shared->handle_[0], shared->size_, shared->offset_, 0, ptr)) {
|
||||
if (!mpool->Device()->devices()[0]->IpcAttach(&shared->handle_[0], shared->size_, shared->offset_,
|
||||
0, ptr)) {
|
||||
HIP_RETURN(hipErrorOutOfMemory);
|
||||
}
|
||||
size_t offset = 0;
|
||||
|
||||
@@ -41,8 +41,8 @@ void Heap::AddMemory(amd::Memory* memory, const MemoryTimestamp& ts) {
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
amd::Memory* Heap::FindMemory(size_t size, Stream* stream, bool opportunistic,
|
||||
void* dptr, MemoryTimestamp* ts) {
|
||||
amd::Memory* Heap::FindMemory(size_t size, Stream* stream, bool opportunistic, void* dptr,
|
||||
MemoryTimestamp* ts) {
|
||||
amd::Memory* memory = nullptr;
|
||||
auto start = allocations_.lower_bound({size, nullptr});
|
||||
for (auto it = start; it != allocations_.end();) {
|
||||
@@ -201,11 +201,13 @@ void* MemoryPool::AllocateMemory(size_t size, Stream* stream, void* dptr) {
|
||||
dev_ptr = amd::SvmBuffer::malloc(*context, flags, size, dev_info.memBaseAddrAlign_, nullptr);
|
||||
}
|
||||
if (dev_ptr == nullptr) {
|
||||
size_t free = 0, total =0;
|
||||
size_t free = 0, total = 0;
|
||||
hipError_t err = hipMemGetInfo(&free, &total);
|
||||
if (err == hipSuccess) {
|
||||
LogPrintfError("Allocation failed : Device memory : required :\
|
||||
%zu | free :%zu | total :%zu", size, free, total);
|
||||
LogPrintfError(
|
||||
"Allocation failed : Device memory : required :\
|
||||
%zu | free :%zu | total :%zu",
|
||||
size, free, total);
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
@@ -231,8 +233,8 @@ void* MemoryPool::AllocateMemory(size_t size, Stream* stream, void* dptr) {
|
||||
ts.AddSafeStream(stream);
|
||||
busy_heap_.AddMemory(memory, ts);
|
||||
|
||||
max_total_size_ = std::max(max_total_size_, busy_heap_.GetTotalSize() +
|
||||
free_heap_.GetTotalSize());
|
||||
max_total_size_ =
|
||||
std::max(max_total_size_, busy_heap_.GetTotalSize() + free_heap_.GetTotalSize());
|
||||
// Increment the reference counter on the pool
|
||||
retain();
|
||||
|
||||
@@ -360,7 +362,7 @@ hipError_t MemoryPool::SetAttribute(hipMemPoolAttr attr, void* value) {
|
||||
// Enable/disable HIP event check for freed memory
|
||||
state_.opportunistic_ = *reinterpret_cast<int32_t*>(value);
|
||||
break;
|
||||
case hipMemPoolReuseAllowInternalDependencies:
|
||||
case hipMemPoolReuseAllowInternalDependencies:
|
||||
// Enable/disable internal extra dependencies introduced in runtime
|
||||
state_.internal_dependencies_ = *reinterpret_cast<int32_t*>(value);
|
||||
break;
|
||||
@@ -411,7 +413,7 @@ hipError_t MemoryPool::GetAttribute(hipMemPoolAttr attr, void* value) {
|
||||
// Enable/disable HIP event check for freed memory
|
||||
*reinterpret_cast<int32_t*>(value) = Opportunistic();
|
||||
break;
|
||||
case hipMemPoolReuseAllowInternalDependencies:
|
||||
case hipMemPoolReuseAllowInternalDependencies:
|
||||
// Enable/disable internal extra dependencies introduced in runtime
|
||||
*reinterpret_cast<int32_t*>(value) = InternalDependencies();
|
||||
break;
|
||||
@@ -420,13 +422,14 @@ hipError_t MemoryPool::GetAttribute(hipMemPoolAttr attr, void* value) {
|
||||
break;
|
||||
case hipMemPoolAttrReservedMemCurrent:
|
||||
// All allocated memory by the pool in OS
|
||||
*reinterpret_cast<uint64_t*>(value) = (state_.use_vm_heap_) ? MappedSize() :
|
||||
(busy_heap_.GetTotalSize() + free_heap_.GetTotalSize());
|
||||
*reinterpret_cast<uint64_t*>(value) = (state_.use_vm_heap_)
|
||||
? MappedSize()
|
||||
: (busy_heap_.GetTotalSize() + free_heap_.GetTotalSize());
|
||||
break;
|
||||
case hipMemPoolAttrReservedMemHigh:
|
||||
// High watermark of all allocated memory in OS, since the last reset
|
||||
*reinterpret_cast<uint64_t*>(value) = (state_.use_vm_heap_)
|
||||
? MaxMappedSize() : max_total_size_;
|
||||
*reinterpret_cast<uint64_t*>(value) =
|
||||
(state_.use_vm_heap_) ? MaxMappedSize() : max_total_size_;
|
||||
break;
|
||||
case hipMemPoolAttrUsedMemCurrent:
|
||||
// Total currently used memory by the pool
|
||||
@@ -505,14 +508,14 @@ amd::Os::FileDesc MemoryPool::Export() {
|
||||
// Note: Windows can accept an unnamed allocation
|
||||
snprintf(file_name, kFileNameSize, "%p", this);
|
||||
amd::Os::FileDesc handle{};
|
||||
shared_ = reinterpret_cast<SharedMemPool*>(amd::Os::CreateIpcMemory(
|
||||
file_name, sizeof(SharedMemPool), &handle));
|
||||
shared_ = reinterpret_cast<SharedMemPool*>(
|
||||
amd::Os::CreateIpcMemory(file_name, sizeof(SharedMemPool), &handle));
|
||||
if (shared_ != nullptr) {
|
||||
shared_->handle_ = handle;
|
||||
shared_->state_ = state_.value_;
|
||||
shared_->access_size_ = 0;
|
||||
memset(shared_->access_, 0, sizeof(SharedAccess) * kMaxMgpuAccess);
|
||||
assert((access_map_.size() <= kMaxMgpuAccess) && "Can't support more GPU(s) in shared access" );
|
||||
assert((access_map_.size() <= kMaxMgpuAccess) && "Can't support more GPU(s) in shared access");
|
||||
for (auto it : access_map_) {
|
||||
shared_->access_[shared_->access_size_] = SharedAccess{it.first->deviceId(), it.second};
|
||||
shared_->access_size_++;
|
||||
@@ -537,4 +540,4 @@ bool MemoryPool::Import(amd::Os::FileDesc handle) {
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
} // namespace hip
|
||||
|
||||
@@ -94,20 +94,20 @@ struct MemoryTimestamp {
|
||||
return result;
|
||||
}
|
||||
|
||||
std::unordered_set<hip::Stream*> safe_streams_; //!< Safe streams for memory reuse
|
||||
hip::Event* event_ = nullptr; //!< Last known HIP event, associated with the memory object
|
||||
std::unordered_set<hip::Stream*> safe_streams_; //!< Safe streams for memory reuse
|
||||
hip::Event* event_ = nullptr; //!< Last known HIP event, associated with the memory object
|
||||
};
|
||||
|
||||
class Heap : public amd::EmbeddedObject {
|
||||
public:
|
||||
public:
|
||||
typedef std::map<std::pair<size_t, amd::Memory*>, MemoryTimestamp> SortedMap;
|
||||
|
||||
Heap(hip::Device* device, amd::VmHeapArray& vm_heap)
|
||||
: total_size_(0)
|
||||
, max_total_size_(0)
|
||||
, release_threshold_(0)
|
||||
, device_(device)
|
||||
, vm_heap_(vm_heap) {}
|
||||
: total_size_(0),
|
||||
max_total_size_(0),
|
||||
release_threshold_(0),
|
||||
device_(device),
|
||||
vm_heap_(vm_heap) {}
|
||||
~Heap() {}
|
||||
|
||||
/// Adds allocation into the heap on a specific stream
|
||||
@@ -117,8 +117,8 @@ public:
|
||||
void AddMemory(amd::Memory* memory, const MemoryTimestamp& ts);
|
||||
|
||||
/// Finds memory object with the specified size
|
||||
amd::Memory* FindMemory(size_t size, Stream* stream, bool opportunistic,
|
||||
void* dptr, MemoryTimestamp* ts);
|
||||
amd::Memory* FindMemory(size_t size, Stream* stream, bool opportunistic, void* dptr,
|
||||
MemoryTimestamp* ts);
|
||||
|
||||
/// Removes allocation from the map
|
||||
bool RemoveMemory(amd::Memory* memory, MemoryTimestamp* ts = nullptr);
|
||||
@@ -179,7 +179,7 @@ public:
|
||||
|
||||
const auto& Allocations() { return allocations_; }
|
||||
|
||||
private:
|
||||
private:
|
||||
Heap() = delete;
|
||||
Heap(const Heap&) = delete;
|
||||
Heap& operator=(const Heap&) = delete;
|
||||
@@ -189,9 +189,9 @@ private:
|
||||
uint64_t max_total_size_; //!< Maximum heap allocation size
|
||||
uint64_t release_threshold_; //!< Threshold size in bytes for memory release from heap, default 0
|
||||
|
||||
hip::Device* device_; //!< Hip device the allocations will reside
|
||||
amd::VmHeapArray& vm_heap_; //!< Managed heap for memory allocations
|
||||
bool use_vm_heap_ = false; //!< Use virtual heap or direct allocations
|
||||
hip::Device* device_; //!< Hip device the allocations will reside
|
||||
amd::VmHeapArray& vm_heap_; //!< Managed heap for memory allocations
|
||||
bool use_vm_heap_ = false; //!< Use virtual heap or direct allocations
|
||||
};
|
||||
|
||||
/// Allocates memory in the pool on the specified stream and places the allocation into busy_heap_
|
||||
@@ -201,21 +201,21 @@ private:
|
||||
class MemoryPool : public amd::ReferenceCountedObject, amd::VmHeapArray {
|
||||
public:
|
||||
struct SharedAccess {
|
||||
int device_id_; //!< Device ID for access with a specified shared resource
|
||||
hipMemAccessFlags flags_; //!< Flags which define access type
|
||||
int device_id_; //!< Device ID for access with a specified shared resource
|
||||
hipMemAccessFlags flags_; //!< Flags which define access type
|
||||
};
|
||||
|
||||
static constexpr uint32_t kMaxMgpuAccess = 32;
|
||||
struct SharedMemPool {
|
||||
amd::Os::FileDesc handle_; //!< File descriptor for shared memory
|
||||
uint32_t state_; //!< Memory pool state
|
||||
uint32_t access_size_; //!< The number of entries in access array
|
||||
SharedAccess access_[kMaxMgpuAccess]; //!< The list of devices for access
|
||||
amd::Os::FileDesc handle_; //!< File descriptor for shared memory
|
||||
uint32_t state_; //!< Memory pool state
|
||||
uint32_t access_size_; //!< The number of entries in access array
|
||||
SharedAccess access_[kMaxMgpuAccess]; //!< The list of devices for access
|
||||
};
|
||||
|
||||
MemoryPool(hip::Device* device, const hipMemPoolProps* props = nullptr, bool phys_mem = false)
|
||||
: VmHeapArray(device->devices()[0],
|
||||
[this]()->amd::HostQueue&{ return *device_->NullStream(); }),
|
||||
[this]() -> amd::HostQueue& { return *device_->NullStream(); }),
|
||||
busy_heap_(device, *this),
|
||||
free_heap_(device, *this),
|
||||
lock_pool_ops_(true),
|
||||
@@ -280,9 +280,7 @@ class MemoryPool : public amd::ReferenceCountedObject, amd::VmHeapArray {
|
||||
void ReleaseAllMemory();
|
||||
|
||||
/// Place the allocated memory into the busy heap
|
||||
void AddBusyMemory(amd::Memory* memory) {
|
||||
busy_heap_.AddMemory(memory, nullptr);
|
||||
}
|
||||
void AddBusyMemory(amd::Memory* memory) { busy_heap_.AddMemory(memory, nullptr); }
|
||||
|
||||
/// Add a safe stream for quick look-ups if event dependencies option is enabled
|
||||
void AddSafeStream(Stream* event_stream, Stream* wait_stream) {
|
||||
@@ -334,30 +332,31 @@ class MemoryPool : public amd::ReferenceCountedObject, amd::VmHeapArray {
|
||||
MemoryPool(const MemoryPool&) = delete;
|
||||
MemoryPool& operator=(const MemoryPool&) = delete;
|
||||
|
||||
Heap busy_heap_; //!< Heap of busy allocations
|
||||
Heap free_heap_; //!< Heap of freed allocations
|
||||
Heap busy_heap_; //!< Heap of busy allocations
|
||||
Heap free_heap_; //!< Heap of freed allocations
|
||||
union {
|
||||
struct {
|
||||
uint32_t event_dependencies_ : 1; //!< Event dependencies tracking is enabled
|
||||
uint32_t opportunistic_ : 1; //!< HIP event check is enabled
|
||||
uint32_t internal_dependencies_ : 1; //!< Runtime adds internal events to handle memory
|
||||
//!< dependencies
|
||||
uint32_t interprocess_ : 1; //!< Memory pool can be used in interprocess communications
|
||||
uint32_t graph_in_use_ : 1; //!< Memory pool was used in a graph execution
|
||||
uint32_t phys_mem_ : 1; //!< Mempool is used for graphs and will have physical allocations
|
||||
uint32_t use_vm_heap_ : 1; //!< Use VM heap or direct allocations
|
||||
uint32_t interprocess_ : 1; //!< Memory pool can be used in interprocess communications
|
||||
uint32_t graph_in_use_ : 1; //!< Memory pool was used in a graph execution
|
||||
uint32_t phys_mem_ : 1; //!< Mempool is used for graphs and will have physical allocations
|
||||
uint32_t use_vm_heap_ : 1; //!< Use VM heap or direct allocations
|
||||
};
|
||||
uint32_t value_;
|
||||
} state_;
|
||||
|
||||
hipMemPoolProps properties_; //!< Properties of the memory pool
|
||||
amd::Monitor lock_pool_ops_; //!< Access to the pool must be lock protected
|
||||
std::map<hip::Device*, hipMemAccessFlags> access_map_; //!< Map of access to the pool from devices
|
||||
std::map<hip::Device*, hipMemAccessFlags>
|
||||
access_map_; //!< Map of access to the pool from devices
|
||||
|
||||
hip::Device* device_; //!< Hip device the heap will reside
|
||||
SharedMemPool* shared_; //!< Pointer to shared memory for IPC
|
||||
uint64_t max_total_size_; //!< Max of total reserved memory in the pool since last reset
|
||||
hip::Device* device_; //!< Hip device the heap will reside
|
||||
SharedMemPool* shared_; //!< Pointer to shared memory for IPC
|
||||
uint64_t max_total_size_; //!< Max of total reserved memory in the pool since last reset
|
||||
};
|
||||
|
||||
|
||||
} // Mamespace hip
|
||||
} // namespace hip
|
||||
|
||||
@@ -165,7 +165,7 @@ hipError_t hipFuncGetAttribute(int* value, hipFunction_attribute attrib, hipFunc
|
||||
case HIP_FUNC_ATTRIBUTE_PTX_VERSION:
|
||||
case HIP_FUNC_ATTRIBUTE_BINARY_VERSION:
|
||||
*value = hip::getCurrentDevice()->devices()[0]->isa().versionMajor() * 10 +
|
||||
hip::getCurrentDevice()->devices()[0]->isa().versionMinor();
|
||||
hip::getCurrentDevice()->devices()[0]->isa().versionMinor();
|
||||
break;
|
||||
case HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA:
|
||||
*value = 0;
|
||||
@@ -194,7 +194,7 @@ hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const void* func) {
|
||||
hipError_t hipFuncSetAttribute(const void* func, hipFuncAttribute attr, int value) {
|
||||
HIP_INIT_API(hipFuncSetAttribute, func, attr, value);
|
||||
|
||||
if (func == nullptr) {
|
||||
if (func == nullptr) {
|
||||
HIP_RETURN(hipErrorInvalidDeviceFunction);
|
||||
}
|
||||
if (attr < 0 || attr > hipFuncAttributeMax) {
|
||||
@@ -221,19 +221,19 @@ hipError_t hipFuncSetAttribute(const void* func, hipFuncAttribute attr, int valu
|
||||
HIP_RETURN(hipErrorInvalidDeviceFunction);
|
||||
}
|
||||
device::Kernel* d_kernel =
|
||||
(device::Kernel*)(kernel->getDeviceKernel(
|
||||
*(hip::getCurrentDevice()->devices()[0])));
|
||||
(device::Kernel*)(kernel->getDeviceKernel(*(hip::getCurrentDevice()->devices()[0])));
|
||||
|
||||
if (attr == hipFuncAttributeMaxDynamicSharedMemorySize) {
|
||||
if ((value < 0) || (value > (d_kernel->workGroupInfo()->availableLDSSize_ -
|
||||
d_kernel->workGroupInfo()->localMemSize_))) {
|
||||
if ((value < 0) ||
|
||||
(value > (d_kernel->workGroupInfo()->availableLDSSize_ -
|
||||
d_kernel->workGroupInfo()->localMemSize_))) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
d_kernel->workGroupInfo()->maxDynamicSharedSizeBytes_ = value;
|
||||
}
|
||||
|
||||
if (attr == hipFuncAttributePreferredSharedMemoryCarveout) {
|
||||
if (value < -1 || value > 100) {
|
||||
if (value < -1 || value > 100) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
}
|
||||
@@ -244,7 +244,9 @@ hipError_t hipFuncSetAttribute(const void* func, hipFuncAttribute attr, int valu
|
||||
hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t cacheConfig) {
|
||||
HIP_INIT_API(hipFuncSetCacheConfig, cacheConfig);
|
||||
|
||||
if (func == nullptr) { HIP_RETURN(hipErrorInvalidDeviceFunction); }
|
||||
if (func == nullptr) {
|
||||
HIP_RETURN(hipErrorInvalidDeviceFunction);
|
||||
}
|
||||
if (cacheConfig != hipFuncCachePreferNone && cacheConfig != hipFuncCachePreferShared &&
|
||||
cacheConfig != hipFuncCachePreferL1 && cacheConfig != hipFuncCachePreferEqual) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
@@ -257,7 +259,9 @@ hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t cacheConfig) {
|
||||
hipError_t hipFuncSetSharedMemConfig(const void* func, hipSharedMemConfig config) {
|
||||
HIP_INIT_API(hipFuncSetSharedMemConfig, func, config);
|
||||
|
||||
if (func == nullptr) { HIP_RETURN(hipErrorInvalidDeviceFunction); }
|
||||
if (func == nullptr) {
|
||||
HIP_RETURN(hipErrorInvalidDeviceFunction);
|
||||
}
|
||||
if (config != hipSharedMemBankSizeDefault && config != hipSharedMemBankSizeFourByte &&
|
||||
config != hipSharedMemBankSizeEightByte) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
@@ -281,19 +285,19 @@ hipError_t ihipLaunchKernel_validate(hipFunction_t f, const amd::LaunchParams& l
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
if (launch_params.global_[0] == 0 || launch_params.global_[1] == 0
|
||||
|| launch_params.global_[2] == 0) {
|
||||
if (launch_params.global_[0] == 0 || launch_params.global_[1] == 0 ||
|
||||
launch_params.global_[2] == 0) {
|
||||
return hipErrorInvalidConfiguration;
|
||||
}
|
||||
|
||||
if (launch_params.local_[0] == 0 || launch_params.local_[1] == 0
|
||||
|| launch_params.local_[2] == 0) {
|
||||
if (launch_params.local_[0] == 0 || launch_params.local_[1] == 0 ||
|
||||
launch_params.local_[2] == 0) {
|
||||
return hipErrorInvalidConfiguration;
|
||||
}
|
||||
|
||||
const amd::Device* device = g_devices[deviceId]->devices()[0];
|
||||
const auto& info = device->info();
|
||||
if (launch_params.sharedMemBytes_ > info.localMemSizePerCU_) { //sharedMemPerBlock
|
||||
if (launch_params.sharedMemBytes_ > info.localMemSizePerCU_) { // sharedMemPerBlock
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
// Make sure dispatch doesn't exceed max workgroup size limit
|
||||
@@ -304,7 +308,7 @@ hipError_t ihipLaunchKernel_validate(hipFunction_t f, const amd::LaunchParams& l
|
||||
amd::Kernel* kernel = function->kernel();
|
||||
const amd::KernelSignature& signature = kernel->signature();
|
||||
if ((signature.numParameters() > 0) && (kernelParams == nullptr) && (extra == nullptr)) {
|
||||
LogPrintfError("%s","At least one of kernelParams or extra Params should be provided");
|
||||
LogPrintfError("%s", "At least one of kernelParams or extra Params should be provided");
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
if (!kernel->getDeviceKernel(*device)) {
|
||||
@@ -368,9 +372,9 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f,
|
||||
params |= amd::NDRangeKernelCommand::AnyOrderLaunch;
|
||||
}
|
||||
|
||||
amd::NDRangeKernelCommand* kernelCommand = new amd::NDRangeKernelCommand(*stream, waitList,
|
||||
*kernel, ndrange, launch_params.sharedMemBytes_, params, gridId, numGrids, prevGridSum,
|
||||
allGridSum, firstDevice, profileNDRange);
|
||||
amd::NDRangeKernelCommand* kernelCommand = new amd::NDRangeKernelCommand(
|
||||
*stream, waitList, *kernel, ndrange, launch_params.sharedMemBytes_, params, gridId, numGrids,
|
||||
prevGridSum, allGridSum, firstDevice, profileNDRange);
|
||||
if (!kernelCommand) {
|
||||
return hipErrorOutOfMemory;
|
||||
}
|
||||
@@ -403,8 +407,8 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f,
|
||||
}
|
||||
|
||||
if (DEBUG_HIP_KERNARG_COPY_OPT) {
|
||||
if (CL_SUCCESS != kernelCommand->AllocCaptureSetValidate(kernelParams, kernargs,
|
||||
kernargs_size)) {
|
||||
if (CL_SUCCESS !=
|
||||
kernelCommand->AllocCaptureSetValidate(kernelParams, kernargs, kernargs_size)) {
|
||||
kernelCommand->release();
|
||||
return hipErrorOutOfMemory;
|
||||
}
|
||||
@@ -459,10 +463,10 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, amd::LaunchParams& launch_par
|
||||
}
|
||||
hip::DeviceFunc* function = hip::DeviceFunc::asFunction(f);
|
||||
amd::Kernel* kernel = function->kernel();
|
||||
amd::ScopedLock lock (DEBUG_HIP_KERNARG_COPY_OPT ? nullptr : &function->dflock_);
|
||||
amd::ScopedLock lock(DEBUG_HIP_KERNARG_COPY_OPT ? nullptr : &function->dflock_);
|
||||
|
||||
hipError_t status = ihipLaunchKernel_validate(f, launch_params, kernelParams, extra, deviceId,
|
||||
params);
|
||||
hipError_t status =
|
||||
ihipLaunchKernel_validate(f, launch_params, kernelParams, extra, deviceId, params);
|
||||
if (status != hipSuccess) {
|
||||
return status;
|
||||
}
|
||||
@@ -554,7 +558,7 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, uint32_t gridDimX, uint32_t gr
|
||||
amd::HIPLaunchParams launch_params(gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ,
|
||||
sharedMemBytes);
|
||||
if (!launch_params.IsValidConfig() ||
|
||||
launch_params.local_.product() > device->info().maxWorkGroupSize_) {
|
||||
launch_params.local_.product() > device->info().maxWorkGroupSize_) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
@@ -562,18 +566,18 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, uint32_t gridDimX, uint32_t gr
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
if (launch_params.global_[0] == 0 || launch_params.global_[1] == 0
|
||||
|| launch_params.global_[2] == 0) {
|
||||
if (launch_params.global_[0] == 0 || launch_params.global_[1] == 0 ||
|
||||
launch_params.global_[2] == 0) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
if (launch_params.local_[0] == 0 || launch_params.local_[1] == 0
|
||||
|| launch_params.local_[2] == 0) {
|
||||
if (launch_params.local_[0] == 0 || launch_params.local_[1] == 0 ||
|
||||
launch_params.local_[2] == 0) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
HIP_RETURN(ihipModuleLaunchKernel(f, launch_params, hStream, kernelParams, extra, nullptr,
|
||||
nullptr));
|
||||
HIP_RETURN(
|
||||
ihipModuleLaunchKernel(f, launch_params, hStream, kernelParams, extra, nullptr, nullptr));
|
||||
}
|
||||
|
||||
hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
@@ -594,9 +598,8 @@ hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
globalWorkSizeZ, localWorkSizeX, localWorkSizeY, localWorkSizeZ, sharedMemBytes,
|
||||
kernelParams, extra, startEvent, stopEvent, flags);
|
||||
|
||||
amd::LaunchParams launch_params(globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ,
|
||||
localWorkSizeX, localWorkSizeY, localWorkSizeZ,
|
||||
sharedMemBytes);
|
||||
amd::LaunchParams launch_params(globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX,
|
||||
localWorkSizeY, localWorkSizeZ, sharedMemBytes);
|
||||
|
||||
HIP_RETURN(ihipModuleLaunchKernel(f, launch_params, hStream, kernelParams, extra, startEvent,
|
||||
stopEvent, flags));
|
||||
@@ -641,7 +644,7 @@ hipError_t hipModuleLaunchCooperativeKernel(hipFunction_t f, unsigned int gridDi
|
||||
sharedMemBytes);
|
||||
|
||||
if (!launch_params.IsValidConfig() ||
|
||||
launch_params.local_.product() > device->info().maxWorkGroupSize_) {
|
||||
launch_params.local_.product() > device->info().maxWorkGroupSize_) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
@@ -649,13 +652,13 @@ hipError_t hipModuleLaunchCooperativeKernel(hipFunction_t f, unsigned int gridDi
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
if (launch_params.global_[0] == 0 || launch_params.global_[1] == 0
|
||||
|| launch_params.global_[2] == 0) {
|
||||
if (launch_params.global_[0] == 0 || launch_params.global_[1] == 0 ||
|
||||
launch_params.global_[2] == 0) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
if (launch_params.local_[0] == 0 || launch_params.local_[1] == 0
|
||||
|| launch_params.local_[2] == 0) {
|
||||
if (launch_params.local_[0] == 0 || launch_params.local_[1] == 0 ||
|
||||
launch_params.local_[2] == 0) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
@@ -664,9 +667,8 @@ hipError_t hipModuleLaunchCooperativeKernel(hipFunction_t f, unsigned int gridDi
|
||||
}
|
||||
|
||||
hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams* launchParamsList,
|
||||
unsigned int numDevices,
|
||||
unsigned int flags,
|
||||
uint32_t extFlags) {
|
||||
unsigned int numDevices, unsigned int flags,
|
||||
uint32_t extFlags) {
|
||||
int numActiveGPUs = 0;
|
||||
hipError_t result = hipSuccess;
|
||||
result = ihipDeviceGetCount(&numActiveGPUs);
|
||||
@@ -675,8 +677,8 @@ hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams*
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
if (flags > (hipCooperativeLaunchMultiDeviceNoPostSync +
|
||||
hipCooperativeLaunchMultiDeviceNoPreSync)) {
|
||||
if (flags >
|
||||
(hipCooperativeLaunchMultiDeviceNoPostSync + hipCooperativeLaunchMultiDeviceNoPreSync)) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
@@ -713,8 +715,7 @@ hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams*
|
||||
// Sync the execution streams on all devices
|
||||
if ((flags & hipCooperativeLaunchMultiDeviceNoPreSync) == 0) {
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
hip::Stream* hip_stream =
|
||||
reinterpret_cast<hip::Stream*>(launchParamsList[i].hStream);
|
||||
hip::Stream* hip_stream = reinterpret_cast<hip::Stream*>(launchParamsList[i].hStream);
|
||||
hip_stream->finish();
|
||||
}
|
||||
}
|
||||
@@ -759,10 +760,9 @@ hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams*
|
||||
return hipErrorInvalidConfiguration;
|
||||
}
|
||||
|
||||
result = ihipModuleLaunchKernel(
|
||||
launch.function, launch_params, launch.hStream, launch.kernelParams,
|
||||
nullptr, nullptr, nullptr, flags, extFlags,
|
||||
i, numDevices, prevGridSize, allGridSize, firstDevice);
|
||||
result = ihipModuleLaunchKernel(launch.function, launch_params, launch.hStream,
|
||||
launch.kernelParams, nullptr, nullptr, nullptr, flags, extFlags,
|
||||
i, numDevices, prevGridSize, allGridSize, firstDevice);
|
||||
if (result != hipSuccess) {
|
||||
break;
|
||||
}
|
||||
@@ -772,8 +772,7 @@ hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams*
|
||||
// Sync the execution streams on all devices
|
||||
if ((flags & hipCooperativeLaunchMultiDeviceNoPostSync) == 0) {
|
||||
for (int i = 0; i < numDevices; ++i) {
|
||||
hip::Stream* hip_stream =
|
||||
reinterpret_cast<hip::Stream*>(launchParamsList[i].hStream);
|
||||
hip::Stream* hip_stream = reinterpret_cast<hip::Stream*>(launchParamsList[i].hStream);
|
||||
hip_stream->finish();
|
||||
}
|
||||
}
|
||||
@@ -782,8 +781,8 @@ hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams*
|
||||
}
|
||||
|
||||
hipError_t hipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams* launchParamsList,
|
||||
unsigned int numDevices,
|
||||
unsigned int flags) {
|
||||
unsigned int numDevices,
|
||||
unsigned int flags) {
|
||||
HIP_INIT_API(hipModuleLaunchCooperativeKernelMultiDevice, launchParamsList, numDevices, flags);
|
||||
|
||||
if (launchParamsList == nullptr) {
|
||||
@@ -798,19 +797,16 @@ hipError_t hipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams*
|
||||
}
|
||||
|
||||
HIP_RETURN(ihipModuleLaunchCooperativeKernelMultiDevice(
|
||||
launchParamsList,
|
||||
numDevices,
|
||||
flags,
|
||||
launchParamsList, numDevices, flags,
|
||||
(amd::NDRangeKernelCommand::CooperativeGroups |
|
||||
amd::NDRangeKernelCommand::CooperativeMultiDeviceGroups)));
|
||||
|
||||
amd::NDRangeKernelCommand::CooperativeMultiDeviceGroups)));
|
||||
}
|
||||
|
||||
hipError_t hipGetFuncBySymbol(hipFunction_t* functionPtr, const void* symbolPtr) {
|
||||
HIP_INIT_API(hipGetFuncBySymbol, functionPtr, symbolPtr);
|
||||
|
||||
hipError_t hip_error = PlatformState::instance().getStatFunc(functionPtr,
|
||||
symbolPtr, ihipGetDevice());
|
||||
hipError_t hip_error =
|
||||
PlatformState::instance().getStatFunc(functionPtr, symbolPtr, ihipGetDevice());
|
||||
|
||||
if ((hip_error != hipSuccess) || (functionPtr == nullptr)) {
|
||||
HIP_RETURN(hipErrorInvalidDeviceFunction);
|
||||
@@ -819,31 +815,31 @@ hipError_t hipGetFuncBySymbol(hipFunction_t* functionPtr, const void* symbolPtr)
|
||||
}
|
||||
|
||||
hipError_t hipLaunchKernel_common(const void* hostFunction, dim3 gridDim, dim3 blockDim,
|
||||
void** args, size_t sharedMemBytes,
|
||||
hipStream_t stream) {
|
||||
void** args, size_t sharedMemBytes, hipStream_t stream) {
|
||||
STREAM_CAPTURE(hipLaunchKernel, stream, hostFunction, gridDim, blockDim, args, sharedMemBytes);
|
||||
return ihipLaunchKernel(hostFunction, gridDim, blockDim, args, sharedMemBytes, stream, nullptr,
|
||||
nullptr, 0);
|
||||
}
|
||||
|
||||
hipError_t hipLaunchKernel(const void* hostFunction, dim3 gridDim, dim3 blockDim,
|
||||
void** args, size_t sharedMemBytes, hipStream_t stream) {
|
||||
hipError_t hipLaunchKernel(const void* hostFunction, dim3 gridDim, dim3 blockDim, void** args,
|
||||
size_t sharedMemBytes, hipStream_t stream) {
|
||||
HIP_INIT_API(hipLaunchKernel, hostFunction, gridDim, blockDim, args, sharedMemBytes, stream);
|
||||
HIP_RETURN_DURATION(hipLaunchKernel_common(hostFunction, gridDim, blockDim, args, sharedMemBytes, stream));
|
||||
HIP_RETURN_DURATION(
|
||||
hipLaunchKernel_common(hostFunction, gridDim, blockDim, args, sharedMemBytes, stream));
|
||||
}
|
||||
|
||||
hipError_t hipLaunchKernel_spt(const void* hostFunction, dim3 gridDim, dim3 blockDim,
|
||||
void** args, size_t sharedMemBytes, hipStream_t stream) {
|
||||
hipError_t hipLaunchKernel_spt(const void* hostFunction, dim3 gridDim, dim3 blockDim, void** args,
|
||||
size_t sharedMemBytes, hipStream_t stream) {
|
||||
HIP_INIT_API(hipLaunchKernel, hostFunction, gridDim, blockDim, args, sharedMemBytes, stream);
|
||||
PER_THREAD_DEFAULT_STREAM(stream);
|
||||
HIP_RETURN(hipLaunchKernel_common(hostFunction, gridDim, blockDim, args, sharedMemBytes, stream));
|
||||
}
|
||||
|
||||
hipError_t hipExtLaunchKernel(const void* hostFunction, dim3 gridDim, dim3 blockDim,
|
||||
void** args, size_t sharedMemBytes, hipStream_t stream,
|
||||
hipEvent_t startEvent, hipEvent_t stopEvent, int flags) {
|
||||
HIP_INIT_API(hipExtLaunchKernel, hostFunction, gridDim, blockDim, args, sharedMemBytes,
|
||||
stream, startEvent, stopEvent, flags);
|
||||
hipError_t hipExtLaunchKernel(const void* hostFunction, dim3 gridDim, dim3 blockDim, void** args,
|
||||
size_t sharedMemBytes, hipStream_t stream, hipEvent_t startEvent,
|
||||
hipEvent_t stopEvent, int flags) {
|
||||
HIP_INIT_API(hipExtLaunchKernel, hostFunction, gridDim, blockDim, args, sharedMemBytes, stream,
|
||||
startEvent, stopEvent, flags);
|
||||
|
||||
if (!hip::isValid(startEvent) || !hip::isValid(stopEvent)) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
@@ -889,13 +885,13 @@ hipError_t hipLaunchCooperativeKernel_common(const void* f, dim3 gridDim, dim3 b
|
||||
return hipErrorCooperativeLaunchTooLarge;
|
||||
}
|
||||
|
||||
if (launch_params.global_[0] == 0 || launch_params.global_[1] == 0
|
||||
|| launch_params.global_[2] == 0) {
|
||||
if (launch_params.global_[0] == 0 || launch_params.global_[1] == 0 ||
|
||||
launch_params.global_[2] == 0) {
|
||||
return hipErrorInvalidConfiguration;
|
||||
}
|
||||
|
||||
return ihipModuleLaunchKernel(func, launch_params, hStream, kernelParams, nullptr,
|
||||
nullptr, nullptr, 0, amd::NDRangeKernelCommand::CooperativeGroups);
|
||||
return ihipModuleLaunchKernel(func, launch_params, hStream, kernelParams, nullptr, nullptr,
|
||||
nullptr, 0, amd::NDRangeKernelCommand::CooperativeGroups);
|
||||
}
|
||||
|
||||
hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDim,
|
||||
@@ -975,10 +971,8 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
|
||||
functionLaunchParamsList[i].kernelParams = launch.args;
|
||||
}
|
||||
|
||||
return ihipModuleLaunchCooperativeKernelMultiDevice(functionLaunchParamsList.data(),
|
||||
functionLaunchParamsList.size(),
|
||||
flags,
|
||||
extFlags);
|
||||
return ihipModuleLaunchCooperativeKernelMultiDevice(
|
||||
functionLaunchParamsList.data(), functionLaunchParamsList.size(), flags, extFlags);
|
||||
}
|
||||
|
||||
hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices,
|
||||
@@ -1039,8 +1033,8 @@ hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const
|
||||
hipError_t hipLinkAddData(hipLinkState_t hip_link_state, hipJitInputType input_type, void* image,
|
||||
size_t image_size, const char* name, unsigned int num_options,
|
||||
hipJitOption* options_ptr, void** option_values) {
|
||||
|
||||
HIP_INIT_API(hipLinkAddData, hip_link_state, image, image_size, name, num_options, options_ptr, option_values);
|
||||
HIP_INIT_API(hipLinkAddData, hip_link_state, image, image_size, name, num_options, options_ptr,
|
||||
option_values);
|
||||
|
||||
if (image == nullptr || image_size <= 0) {
|
||||
HIP_RETURN(hipErrorInvalidImage);
|
||||
@@ -1048,9 +1042,9 @@ hipError_t hipLinkAddData(hipLinkState_t hip_link_state, hipJitInputType input_t
|
||||
|
||||
if (input_type == hipJitInputCubin || input_type == hipJitInputPtx ||
|
||||
input_type == hipJitInputFatBinary || input_type == hipJitInputObject ||
|
||||
input_type == hipJitInputLibrary || input_type == hipJitInputNvvm ||
|
||||
input_type == hipJitInputLibrary || input_type == hipJitInputNvvm ||
|
||||
input_type == hipJitInputLLVMBitcode || input_type == hipJitInputLLVMBundledBitcode ||
|
||||
input_type == hipJitInputLLVMArchivesOfBundledBitcode ) {
|
||||
input_type == hipJitInputLLVMArchivesOfBundledBitcode) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
@@ -1059,8 +1053,7 @@ hipError_t hipLinkAddData(hipLinkState_t hip_link_state, hipJitInputType input_t
|
||||
input_name = name;
|
||||
}
|
||||
|
||||
LinkProgram* hip_link_prog_ptr =
|
||||
reinterpret_cast<LinkProgram*>(hip_link_state);
|
||||
LinkProgram* hip_link_prog_ptr = reinterpret_cast<LinkProgram*>(hip_link_state);
|
||||
|
||||
if (!LinkProgram::isLinkerValid(hip_link_prog_ptr)) {
|
||||
HIP_RETURN(hipErrorInvalidHandle);
|
||||
@@ -1073,9 +1066,11 @@ hipError_t hipLinkAddData(hipLinkState_t hip_link_state, hipJitInputType input_t
|
||||
HIP_RETURN(hipSuccess);
|
||||
}
|
||||
|
||||
hipError_t hipLinkAddFile(hipLinkState_t hip_link_state, hipJitInputType input_type, const char* file_path,
|
||||
unsigned int num_options, hipJitOption* options_ptr, void** option_values) {
|
||||
HIP_INIT_API(hipLinkAddFile, hip_link_state, input_type, file_path, num_options, options_ptr, option_values);
|
||||
hipError_t hipLinkAddFile(hipLinkState_t hip_link_state, hipJitInputType input_type,
|
||||
const char* file_path, unsigned int num_options,
|
||||
hipJitOption* options_ptr, void** option_values) {
|
||||
HIP_INIT_API(hipLinkAddFile, hip_link_state, input_type, file_path, num_options, options_ptr,
|
||||
option_values);
|
||||
|
||||
if (hip_link_state == nullptr) {
|
||||
HIP_RETURN(hipErrorInvalidHandle);
|
||||
@@ -1083,14 +1078,13 @@ hipError_t hipLinkAddFile(hipLinkState_t hip_link_state, hipJitInputType input_t
|
||||
|
||||
if (input_type == hipJitInputCubin || input_type == hipJitInputPtx ||
|
||||
input_type == hipJitInputFatBinary || input_type == hipJitInputObject ||
|
||||
input_type == hipJitInputLibrary || input_type == hipJitInputNvvm ||
|
||||
input_type == hipJitInputLibrary || input_type == hipJitInputNvvm ||
|
||||
input_type == hipJitInputLLVMBitcode || input_type == hipJitInputLLVMBundledBitcode ||
|
||||
input_type == hipJitInputLLVMArchivesOfBundledBitcode ) {
|
||||
input_type == hipJitInputLLVMArchivesOfBundledBitcode) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
LinkProgram* hip_link_prog_ptr =
|
||||
reinterpret_cast<LinkProgram*>(hip_link_state);
|
||||
LinkProgram* hip_link_prog_ptr = reinterpret_cast<LinkProgram*>(hip_link_state);
|
||||
|
||||
if (!LinkProgram::isLinkerValid(hip_link_prog_ptr)) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
@@ -1104,7 +1098,7 @@ hipError_t hipLinkAddFile(hipLinkState_t hip_link_state, hipJitInputType input_t
|
||||
}
|
||||
|
||||
hipError_t hipLinkCreate(unsigned int num_options, hipJitOption* options_ptr,
|
||||
void** options_vals_pptr, hipLinkState_t* hip_link_state_ptr) {
|
||||
void** options_vals_pptr, hipLinkState_t* hip_link_state_ptr) {
|
||||
HIP_INIT_API(hipLinkCreate, num_options, options_ptr, options_vals_pptr, hip_link_state_ptr);
|
||||
|
||||
if (hip_link_state_ptr == nullptr) {
|
||||
@@ -1175,8 +1169,7 @@ hipError_t hipLinkComplete(hipLinkState_t hip_link_state, void** bin_out, size_t
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
LinkProgram* hip_link_prog_ptr =
|
||||
reinterpret_cast<LinkProgram*>(hip_link_state);
|
||||
LinkProgram* hip_link_prog_ptr = reinterpret_cast<LinkProgram*>(hip_link_state);
|
||||
|
||||
if (!LinkProgram::isLinkerValid(hip_link_prog_ptr)) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
@@ -1192,8 +1185,7 @@ hipError_t hipLinkComplete(hipLinkState_t hip_link_state, void** bin_out, size_t
|
||||
hipError_t hipLinkDestroy(hipLinkState_t hip_link_state) {
|
||||
HIP_INIT_API(hipLinkDestroy, hip_link_state);
|
||||
|
||||
LinkProgram* hip_link_prog_ptr =
|
||||
reinterpret_cast<LinkProgram*>(hip_link_state);
|
||||
LinkProgram* hip_link_prog_ptr = reinterpret_cast<LinkProgram*>(hip_link_state);
|
||||
|
||||
if (!LinkProgram::isLinkerValid(hip_link_prog_ptr)) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
@@ -1215,7 +1207,7 @@ hipError_t hipLaunchKernelExC(const hipLaunchConfig_t* config, const void* fPtr,
|
||||
|
||||
if (config->numAttrs == 0) {
|
||||
HIP_RETURN_DURATION(hipLaunchKernel_common(fPtr, config->gridDim, config->blockDim, args,
|
||||
config->dynamicSmemBytes, config->stream));
|
||||
config->dynamicSmemBytes, config->stream));
|
||||
}
|
||||
|
||||
for (size_t attr_idx = 0; attr_idx < config->numAttrs; ++attr_idx) {
|
||||
@@ -1263,8 +1255,7 @@ hipError_t hipDrvLaunchKernelEx(const HIP_LAUNCH_CONFIG* config, hipFunction_t f
|
||||
for (size_t attr_idx = 0; attr_idx < config->numAttrs; ++attr_idx) {
|
||||
hipLaunchAttribute& attr = config->attrs[attr_idx];
|
||||
switch (attr.id) {
|
||||
case hipLaunchAttributeCooperative:
|
||||
{
|
||||
case hipLaunchAttributeCooperative: {
|
||||
if (attr.value.cooperative != 0) {
|
||||
HIP_RETURN(ihipModuleLaunchKernel(f, launch_params, config->hStream, kernelParams,
|
||||
nullptr, nullptr, nullptr, 0,
|
||||
|
||||
@@ -26,7 +26,7 @@
|
||||
|
||||
namespace hip {
|
||||
|
||||
hipError_t canAccessPeer(int* canAccessPeer, int deviceId, int peerDeviceId){
|
||||
hipError_t canAccessPeer(int* canAccessPeer, int deviceId, int peerDeviceId) {
|
||||
amd::Device* device = nullptr;
|
||||
amd::Device* peer_device = nullptr;
|
||||
if (canAccessPeer == nullptr) {
|
||||
@@ -38,21 +38,20 @@ hipError_t canAccessPeer(int* canAccessPeer, int deviceId, int peerDeviceId){
|
||||
return hipSuccess;
|
||||
}
|
||||
/* Cannot exceed the max number of devices */
|
||||
if (static_cast<size_t>(deviceId) >= g_devices.size()
|
||||
|| static_cast<size_t>(peerDeviceId) >= g_devices.size()) {
|
||||
if (static_cast<size_t>(deviceId) >= g_devices.size() ||
|
||||
static_cast<size_t>(peerDeviceId) >= g_devices.size()) {
|
||||
return hipErrorInvalidDevice;
|
||||
}
|
||||
device = g_devices[deviceId]->devices()[0];
|
||||
peer_device = g_devices[peerDeviceId]->devices()[0];
|
||||
*canAccessPeer = static_cast<int>(std::find(device->p2pDevices_.begin(),
|
||||
device->p2pDevices_.end(), as_cl(peer_device))
|
||||
!= device->p2pDevices_.end());
|
||||
*canAccessPeer =
|
||||
static_cast<int>(std::find(device->p2pDevices_.begin(), device->p2pDevices_.end(),
|
||||
as_cl(peer_device)) != device->p2pDevices_.end());
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
hipError_t findLinkInfo(int device1, int device2,
|
||||
std::vector<amd::Device::LinkAttrType>* link_attrs) {
|
||||
|
||||
amd::Device* amd_dev_obj1 = nullptr;
|
||||
amd::Device* amd_dev_obj2 = nullptr;
|
||||
const int numDevices = static_cast<int>(g_devices.size());
|
||||
@@ -71,12 +70,12 @@ hipError_t findLinkInfo(int device1, int device2,
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
hipError_t hipExtGetLinkTypeAndHopCount(int device1, int device2,
|
||||
uint32_t* linktype, uint32_t* hopcount) {
|
||||
hipError_t hipExtGetLinkTypeAndHopCount(int device1, int device2, uint32_t* linktype,
|
||||
uint32_t* hopcount) {
|
||||
HIP_INIT_API(hipExtGetLinkTypeAndHopCount, device1, device2, linktype, hopcount);
|
||||
|
||||
if (linktype == nullptr || hopcount == nullptr ||
|
||||
device1 == device2 || device1 < 0 || device2 < 0) {
|
||||
if (linktype == nullptr || hopcount == nullptr || device1 == device2 || device1 < 0 ||
|
||||
device2 < 0) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
// Fill out the list of LinkAttributes
|
||||
@@ -92,35 +91,35 @@ hipError_t hipExtGetLinkTypeAndHopCount(int device1, int device2,
|
||||
HIP_RETURN(hipSuccess);
|
||||
}
|
||||
|
||||
hipError_t hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr,
|
||||
int srcDevice, int dstDevice) {
|
||||
hipError_t hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr, int srcDevice,
|
||||
int dstDevice) {
|
||||
HIP_INIT_API(hipDeviceGetP2PAttribute, value, attr, srcDevice, dstDevice);
|
||||
|
||||
if (value == nullptr) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
if (srcDevice == dstDevice || srcDevice >= static_cast<int>(g_devices.size())
|
||||
|| dstDevice >= static_cast<int>(g_devices.size())) {
|
||||
if (srcDevice == dstDevice || srcDevice >= static_cast<int>(g_devices.size()) ||
|
||||
dstDevice >= static_cast<int>(g_devices.size())) {
|
||||
HIP_RETURN(hipErrorInvalidDevice);
|
||||
}
|
||||
|
||||
std::vector<amd::Device::LinkAttrType> link_attrs;
|
||||
|
||||
switch (attr) {
|
||||
case hipDevP2PAttrPerformanceRank : {
|
||||
case hipDevP2PAttrPerformanceRank: {
|
||||
link_attrs.push_back(std::make_pair(amd::Device::LinkAttribute::kLinkLinkType, 0));
|
||||
break;
|
||||
}
|
||||
case hipDevP2PAttrAccessSupported : {
|
||||
case hipDevP2PAttrAccessSupported: {
|
||||
HIP_RETURN_ONFAIL(canAccessPeer(value, srcDevice, dstDevice));
|
||||
break;
|
||||
}
|
||||
case hipDevP2PAttrNativeAtomicSupported : {
|
||||
case hipDevP2PAttrNativeAtomicSupported: {
|
||||
link_attrs.push_back(std::make_pair(amd::Device::LinkAttribute::kLinkAtomicSupport, 0));
|
||||
break;
|
||||
}
|
||||
case hipDevP2PAttrHipArrayAccessSupported : {
|
||||
case hipDevP2PAttrHipArrayAccessSupported: {
|
||||
hipDeviceProp_t srcDeviceProp;
|
||||
hipDeviceProp_t dstDeviceProp;
|
||||
HIP_RETURN_ONFAIL(hipGetDeviceProperties(&srcDeviceProp, srcDevice));
|
||||
@@ -136,7 +135,7 @@ hipError_t hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr,
|
||||
}
|
||||
break;
|
||||
}
|
||||
default : {
|
||||
default: {
|
||||
LogPrintfError("Invalid attribute attr: %d ", attr);
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
@@ -193,13 +192,12 @@ hipError_t hipMemcpyPeer(void* dst, int dstDevice, const void* src, int srcDevic
|
||||
HIP_INIT_API(hipMemcpyPeer, dst, dstDevice, src, srcDevice, sizeBytes);
|
||||
CHECK_STREAM_CAPTURING();
|
||||
if (srcDevice >= static_cast<int>(g_devices.size()) ||
|
||||
dstDevice >= static_cast<int>(g_devices.size()) ||
|
||||
srcDevice < 0 || dstDevice < 0) {
|
||||
dstDevice >= static_cast<int>(g_devices.size()) || srcDevice < 0 || dstDevice < 0) {
|
||||
HIP_RETURN(hipErrorInvalidDevice);
|
||||
}
|
||||
|
||||
HIP_RETURN(ihipMemcpy(dst, src, sizeBytes, hipMemcpyDeviceToDevice, *hip::getNullStream(),
|
||||
true, false));
|
||||
HIP_RETURN(
|
||||
ihipMemcpy(dst, src, sizeBytes, hipMemcpyDeviceToDevice, *hip::getNullStream(), true, false));
|
||||
}
|
||||
|
||||
hipError_t hipMemcpyPeerAsync(void* dst, int dstDevice, const void* src, int srcDevice,
|
||||
@@ -207,8 +205,7 @@ hipError_t hipMemcpyPeerAsync(void* dst, int dstDevice, const void* src, int src
|
||||
HIP_INIT_API(hipMemcpyPeerAsync, dst, dstDevice, src, srcDevice, sizeBytes, stream);
|
||||
|
||||
if (srcDevice >= static_cast<int>(g_devices.size()) ||
|
||||
dstDevice >= static_cast<int>(g_devices.size()) ||
|
||||
srcDevice < 0 || dstDevice < 0) {
|
||||
dstDevice >= static_cast<int>(g_devices.size()) || srcDevice < 0 || dstDevice < 0) {
|
||||
HIP_RETURN(hipErrorInvalidDevice);
|
||||
}
|
||||
getStreamPerThread(stream);
|
||||
@@ -219,7 +216,7 @@ hipError_t hipMemcpyPeerAsync(void* dst, int dstDevice, const void* src, int src
|
||||
HIP_RETURN(ihipMemcpy(dst, src, sizeBytes, hipMemcpyDeviceToDevice, *hip_stream, true, true));
|
||||
}
|
||||
|
||||
hipError_t hipMemcpy3DPeer(hipMemcpy3DPeerParms *p) {
|
||||
hipError_t hipMemcpy3DPeer(hipMemcpy3DPeerParms* p) {
|
||||
HIP_INIT_API(hipMemcpy3DPeer, p);
|
||||
if (p == NULL) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
@@ -232,7 +229,7 @@ hipError_t hipMemcpy3DPeer(hipMemcpy3DPeerParms *p) {
|
||||
HIP_RETURN(ihipMemcpy3D(©Parms, nullptr));
|
||||
}
|
||||
|
||||
hipError_t hipMemcpy3DPeerAsync(hipMemcpy3DPeerParms *p, hipStream_t stream) {
|
||||
hipError_t hipMemcpy3DPeerAsync(hipMemcpy3DPeerParms* p, hipStream_t stream) {
|
||||
HIP_INIT_API(hipMemcpy3DPeerAsync, p, stream);
|
||||
if (p == NULL) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
|
||||
@@ -50,11 +50,13 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes, hipModule_t h
|
||||
hipError_t ihipCreateGlobalVarObj(const char* name, hipModule_t hmod, amd::Memory** amd_mem_obj,
|
||||
hipDeviceptr_t* dptr, size_t* bytes);
|
||||
|
||||
extern hipError_t ihipModuleLaunchKernel(
|
||||
hipFunction_t f, amd::LaunchParams& launch_params, hipStream_t hStream, void** kernelParams,
|
||||
void** extra, hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0,
|
||||
uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0, uint64_t prevGridSum = 0,
|
||||
uint64_t allGridSum = 0, uint32_t firstDevice = 0);
|
||||
extern hipError_t ihipModuleLaunchKernel(hipFunction_t f, amd::LaunchParams& launch_params,
|
||||
hipStream_t hStream, void** kernelParams, void** extra,
|
||||
hipEvent_t startEvent, hipEvent_t stopEvent,
|
||||
uint32_t flags = 0, uint32_t params = 0,
|
||||
uint32_t gridId = 0, uint32_t numGrids = 0,
|
||||
uint64_t prevGridSum = 0, uint64_t allGridSum = 0,
|
||||
uint32_t firstDevice = 0);
|
||||
static bool isCompatibleCodeObject(const std::string& codeobj_target_id, const char* device_name) {
|
||||
// Workaround for device name mismatch.
|
||||
// Device name may contain feature strings delimited by '+', e.g.
|
||||
@@ -83,9 +85,8 @@ void** __hipRegisterFatBinary(const void* data) {
|
||||
}
|
||||
|
||||
void __hipRegisterFunction(hip::FatBinaryInfo** modules, const void* hostFunction,
|
||||
char* deviceFunction, const char* deviceName,
|
||||
unsigned int threadLimit, uint3* tid, uint3* bid,
|
||||
dim3* blockDim, dim3* gridDim, int* wSize) {
|
||||
char* deviceFunction, const char* deviceName, unsigned int threadLimit,
|
||||
uint3* tid, uint3* bid, dim3* blockDim, dim3* gridDim, int* wSize) {
|
||||
static int enable_deferred_loading{[]() {
|
||||
char* var = getenv("HIP_ENABLE_DEFERRED_LOADING");
|
||||
return var ? atoi(var) : 1;
|
||||
@@ -106,8 +107,7 @@ void __hipRegisterFunction(hip::FatBinaryInfo** modules, const void* hostFunctio
|
||||
|
||||
for (size_t dev_idx = 0; dev_idx < g_devices.size(); ++dev_idx) {
|
||||
hip_error = PlatformState::instance().getStatFunc(&hfunc, hostFunction, dev_idx);
|
||||
guarantee((hip_error == hipSuccess), "Cannot retrieve Static function, error: %d",
|
||||
hip_error);
|
||||
guarantee((hip_error == hipSuccess), "Cannot retrieve Static function, error: %d", hip_error);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -117,15 +117,14 @@ void __hipRegisterFunction(hip::FatBinaryInfo** modules, const void* hostFunctio
|
||||
// global variable in host code. The shadow host variable is used to keep
|
||||
// track of the value of the device side global variable between kernel
|
||||
// executions.
|
||||
void __hipRegisterVar(
|
||||
hip::FatBinaryInfo** modules, // The device modules containing code object
|
||||
void* var, // The shadow variable in host code
|
||||
char* hostVar, // Variable name in host code
|
||||
char* deviceVar, // Variable name in device code
|
||||
int ext, // Whether this variable is external
|
||||
size_t size, // Size of the variable
|
||||
int constant, // Whether this variable is constant
|
||||
int global) // Unknown, always 0
|
||||
void __hipRegisterVar(hip::FatBinaryInfo** modules, // The device modules containing code object
|
||||
void* var, // The shadow variable in host code
|
||||
char* hostVar, // Variable name in host code
|
||||
char* deviceVar, // Variable name in device code
|
||||
int ext, // Whether this variable is external
|
||||
size_t size, // Size of the variable
|
||||
int constant, // Whether this variable is constant
|
||||
int global) // Unknown, always 0
|
||||
{
|
||||
hip::Var* var_ptr = new hip::Var(std::string(hostVar), hip::Var::DeviceVarKind::DVK_Variable,
|
||||
size, 0, 0, modules);
|
||||
@@ -148,18 +147,17 @@ void __hipRegisterSurface(
|
||||
void __hipRegisterManagedVar(
|
||||
void* hipModule, // Pointer to hip module returned from __hipRegisterFatbinary
|
||||
void** pointer, // Pointer to a chunk of managed memory with size \p size and alignment \p
|
||||
// align HIP runtime allocates such managed memory and assign it to \p pointer
|
||||
// align HIP runtime allocates such managed memory and assign it to \p pointer
|
||||
void* init_value, // Initial value to be copied into \p pointer
|
||||
const char* name, // Name of the variable in code object
|
||||
size_t size, unsigned align) {
|
||||
|
||||
static int enable_deferred_loading{[]() {
|
||||
#ifdef _WIN32 // Don't defer loading for windows
|
||||
return 0;
|
||||
#else
|
||||
char* var = getenv("HIP_ENABLE_DEFERRED_LOADING");
|
||||
return var ? atoi(var) : 1;
|
||||
#endif
|
||||
#ifdef _WIN32 // Don't defer loading for windows
|
||||
return 0;
|
||||
#else
|
||||
char* var = getenv("HIP_ENABLE_DEFERRED_LOADING");
|
||||
return var ? atoi(var) : 1;
|
||||
#endif
|
||||
}()};
|
||||
hipError_t hip_error = hipSuccess;
|
||||
hip::Var* var_ptr = new hip::Var(std::string(name), hip::Var::DeviceVarKind::DVK_Managed, pointer,
|
||||
@@ -174,13 +172,13 @@ void __hipRegisterManagedVar(
|
||||
} else {
|
||||
HIP_INIT_VOID();
|
||||
hipError_t status = ihipMallocManaged(pointer, size, align, 0);
|
||||
var_ptr->setAllocFlag(true); // set flag true for managed alloc
|
||||
var_ptr->setAllocFlag(true); // set flag true for managed alloc
|
||||
if (status == hipSuccess) {
|
||||
hip::Stream* stream = hip::getNullStream();
|
||||
if (stream != nullptr) {
|
||||
status = ihipMemcpy(*pointer, init_value, size, hipMemcpyHostToDevice, *stream);
|
||||
guarantee((status == hipSuccess), "Error during memcpy to managed memory, error:%d!",
|
||||
status);
|
||||
status);
|
||||
} else {
|
||||
ClPrint(amd::LOG_ERROR, amd::LOG_API, "Host Queue is NULL");
|
||||
}
|
||||
@@ -206,7 +204,7 @@ void __hipUnregisterFatBinary(hip::FatBinaryInfo** modules) {
|
||||
static std::once_flag unregister_device_sync;
|
||||
// If SKIP ABORT is set and GPU is in error, dont need to sync streams.
|
||||
if (!HIP_SKIP_ABORT_ON_GPU_ERROR || !amd::Device::IsGPUInError()) {
|
||||
std::call_once(unregister_device_sync, [](){
|
||||
std::call_once(unregister_device_sync, []() {
|
||||
for (auto& hipDevice : g_devices) {
|
||||
// By synchronizing devices ensure that all HSA signal handlers
|
||||
// complete before removeFatBinary
|
||||
@@ -237,15 +235,14 @@ void __hipRegisterTexture(void** modules, void* var, char* hostVar, char* device
|
||||
}
|
||||
void __hipRegisterVar(void** modules, void* var, char* hostVar, char* deviceVar, int ext,
|
||||
size_t size, int constant, int global) {
|
||||
return __hipRegisterVar(reinterpret_cast<hip::FatBinaryInfo**>(modules), var, hostVar,
|
||||
deviceVar, ext, size, constant, global);
|
||||
return __hipRegisterVar(reinterpret_cast<hip::FatBinaryInfo**>(modules), var, hostVar, deviceVar,
|
||||
ext, size, constant, global);
|
||||
}
|
||||
void __hipUnregisterFatBinary(void** modules) {
|
||||
return __hipUnregisterFatBinary(reinterpret_cast<hip::FatBinaryInfo**>(modules));
|
||||
}
|
||||
|
||||
hipError_t hipConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem,
|
||||
hipStream_t stream) {
|
||||
hipError_t hipConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem, hipStream_t stream) {
|
||||
HIP_INIT_API(hipConfigureCall, gridDim, blockDim, sharedMem, stream);
|
||||
|
||||
PlatformState::instance().configureCall(gridDim, blockDim, sharedMem, stream);
|
||||
@@ -254,7 +251,7 @@ hipError_t hipConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem,
|
||||
}
|
||||
|
||||
hipError_t __hipPushCallConfiguration(dim3 gridDim, dim3 blockDim, size_t sharedMem,
|
||||
hipStream_t stream) {
|
||||
hipStream_t stream) {
|
||||
HIP_INIT_API(__hipPushCallConfiguration, gridDim, blockDim, sharedMem, stream);
|
||||
|
||||
PlatformState::instance().configureCall(gridDim, blockDim, sharedMem, stream);
|
||||
@@ -263,7 +260,7 @@ hipError_t __hipPushCallConfiguration(dim3 gridDim, dim3 blockDim, size_t shared
|
||||
}
|
||||
|
||||
hipError_t __hipPopCallConfiguration(dim3* gridDim, dim3* blockDim, size_t* sharedMem,
|
||||
hipStream_t* stream) {
|
||||
hipStream_t* stream) {
|
||||
HIP_INIT_API(__hipPopCallConfiguration, gridDim, blockDim, sharedMem, stream);
|
||||
|
||||
ihipExec_t exec;
|
||||
@@ -345,7 +342,6 @@ hipError_t hipGetSymbolSize(size_t* sizePtr, const void* symbol) {
|
||||
|
||||
hipError_t ihipCreateGlobalVarObj(const char* name, hipModule_t hmod, amd::Memory** amd_mem_obj,
|
||||
hipDeviceptr_t* dptr, size_t* bytes) {
|
||||
|
||||
/* Get Device Program pointer*/
|
||||
amd::Program* program = as_amd(reinterpret_cast<cl_program>(hmod));
|
||||
device::Program* dev_program = program->getDeviceProgram(*hip::getCurrentDevice()->devices()[0]);
|
||||
@@ -431,7 +427,7 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
// multiply the number of SIMDs by 2, to account for 2CUs in 1 WGP.
|
||||
uint32_t simdPerCU = device.isa().simdPerCU();
|
||||
if (wrkGrpInfo->isWGPMode_) {
|
||||
simdPerCU *= 2;
|
||||
simdPerCU *= 2;
|
||||
}
|
||||
|
||||
const size_t alu_occupancy = simdPerCU * std::min(MaxWavesPerSimd, GprWaves);
|
||||
@@ -674,8 +670,8 @@ hipError_t ihipLaunchKernel(const void* hostFunction, dim3 gridDim, dim3 blockDi
|
||||
return hipErrorInvalidConfiguration;
|
||||
}
|
||||
|
||||
return ihipModuleLaunchKernel(func, launch_params, stream, args, nullptr,
|
||||
startEvent, stopEvent, flags);
|
||||
return ihipModuleLaunchKernel(func, launch_params, stream, args, nullptr, startEvent, stopEvent,
|
||||
flags);
|
||||
}
|
||||
|
||||
// conversion routines between float and half precision
|
||||
@@ -761,7 +757,7 @@ void PlatformState::init() {
|
||||
it.second->resize_dVar(g_devices.size());
|
||||
}
|
||||
for (auto& it : statCO_.managedVars_) {
|
||||
for (auto& var: it.second) {
|
||||
for (auto& var : it.second) {
|
||||
var->resize_dVar(g_devices.size());
|
||||
}
|
||||
}
|
||||
@@ -1055,9 +1051,9 @@ void* PlatformState::getDynamicLibraryHandle() {
|
||||
return dynamicLibraryHandle_;
|
||||
}
|
||||
|
||||
void PlatformState::setDynamicLibraryHandle(void* handle){
|
||||
void PlatformState::setDynamicLibraryHandle(void* handle) {
|
||||
amd::ScopedLock lock(lock_);
|
||||
dynamicLibraryHandle_ = handle;
|
||||
}
|
||||
|
||||
} //namespace hip
|
||||
} // namespace hip
|
||||
|
||||
@@ -34,7 +34,7 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
// Unique file descriptor class
|
||||
struct UniqueFD {
|
||||
UniqueFD(const std::string& fpath, amd::Os::FileDesc fdesc, size_t fsize)
|
||||
: fpath_(fpath), fdesc_(fdesc), fsize_(fsize) {}
|
||||
: fpath_(fpath), fdesc_(fdesc), fsize_(fsize) {}
|
||||
|
||||
const std::string fpath_; //!< File path of this unique file
|
||||
const amd::Os::FileDesc fdesc_; //!< File Descriptor
|
||||
@@ -120,7 +120,7 @@ class PlatformState {
|
||||
bool initialized_{false};
|
||||
std::unordered_map<textureReference*, std::pair<hipModule_t, std::string>> texRef_map_;
|
||||
|
||||
std::unordered_map<std::string, std::shared_ptr<UniqueFD>> ufd_map_; //!< Unique File Desc Map
|
||||
std::unordered_map<std::string, std::shared_ptr<UniqueFD>> ufd_map_; //!< Unique File Desc Map
|
||||
|
||||
void* dynamicLibraryHandle_{nullptr};
|
||||
};
|
||||
|
||||
@@ -50,10 +50,9 @@ template <hip_api_id_t operation_id> class api_callbacks_spawner_t {
|
||||
static_assert(operation_id >= HIP_API_ID_FIRST && operation_id <= HIP_API_ID_LAST,
|
||||
"invalid HIP_API operation id");
|
||||
|
||||
if (auto function =
|
||||
amd::activity_prof::report_activity.load(std::memory_order_relaxed);
|
||||
function && (enabled_ = function(ACTIVITY_DOMAIN_HIP_API, operation_id,
|
||||
&trace_data_) == 0)) {
|
||||
if (auto function = amd::activity_prof::report_activity.load(std::memory_order_relaxed);
|
||||
function &&
|
||||
(enabled_ = function(ACTIVITY_DOMAIN_HIP_API, operation_id, &trace_data_) == 0)) {
|
||||
amd::activity_prof::correlation_id = trace_data_.api_data.correlation_id;
|
||||
|
||||
if (trace_data_.phase_enter != nullptr) {
|
||||
|
||||
@@ -39,4 +39,4 @@ hipError_t hipProfilerStop() {
|
||||
|
||||
HIP_RETURN(hipErrorNotSupported);
|
||||
}
|
||||
} //namespace hip
|
||||
} // namespace hip
|
||||
|
||||
@@ -67,9 +67,7 @@ hipError_t Stream::EndCapture() {
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool Stream::Create() {
|
||||
return create();
|
||||
}
|
||||
bool Stream::Create() { return create(); }
|
||||
|
||||
// ================================================================================================
|
||||
void Stream::Destroy(hip::Stream* stream, bool forceDestroy) {
|
||||
@@ -106,16 +104,14 @@ bool isValid(hipStream_t& stream) {
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
int Stream::DeviceId() const {
|
||||
return device_->deviceId();
|
||||
}
|
||||
int Stream::DeviceId() const { return device_->deviceId(); }
|
||||
|
||||
// ================================================================================================
|
||||
int Stream::DeviceId(const hipStream_t hStream) {
|
||||
// Copying locally into non-const variable just to get const away
|
||||
hipStream_t inputStream = hStream;
|
||||
if (!hip::isValid(inputStream)) {
|
||||
//return invalid device id
|
||||
// return invalid device id
|
||||
return -1;
|
||||
}
|
||||
bool isNullOrLegacyStream = (hStream == nullptr || hStream == hipStreamLegacy);
|
||||
@@ -181,9 +177,9 @@ void CL_CALLBACK ihipStreamCallback(cl_event event, cl_int command_exec_status,
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
static hipError_t ihipStreamCreate(hipStream_t* stream,
|
||||
unsigned int flags, hip::Stream::Priority priority,
|
||||
const std::vector<uint32_t>& cuMask = {}) {
|
||||
static hipError_t ihipStreamCreate(hipStream_t* stream, unsigned int flags,
|
||||
hip::Stream::Priority priority,
|
||||
const std::vector<uint32_t>& cuMask = {}) {
|
||||
if (flags != hipStreamDefault && flags != hipStreamNonBlocking) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
@@ -191,8 +187,7 @@ static hipError_t ihipStreamCreate(hipStream_t* stream,
|
||||
|
||||
if (hStream == nullptr) {
|
||||
return hipErrorOutOfMemory;
|
||||
}
|
||||
else if (!hStream->Create()) {
|
||||
} else if (!hStream->Create()) {
|
||||
hip::Stream::Destroy(hStream);
|
||||
return hipErrorOutOfMemory;
|
||||
}
|
||||
@@ -206,13 +201,13 @@ static hipError_t ihipStreamCreate(hipStream_t* stream,
|
||||
|
||||
stream_per_thread::stream_per_thread() {
|
||||
m_streams.resize(g_devices.size());
|
||||
for (auto &stream : m_streams) {
|
||||
for (auto& stream : m_streams) {
|
||||
stream = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
stream_per_thread::~stream_per_thread() {
|
||||
for (auto &stream:m_streams) {
|
||||
for (auto& stream : m_streams) {
|
||||
if (stream != nullptr && hip::isValid(stream)) {
|
||||
hip::Stream::Destroy(reinterpret_cast<hip::Stream*>(stream));
|
||||
stream = nullptr;
|
||||
@@ -226,15 +221,15 @@ hipStream_t stream_per_thread::get() {
|
||||
// This is to make sure m_streams is not empty
|
||||
if (m_streams.empty()) {
|
||||
m_streams.resize(g_devices.size());
|
||||
for (auto &stream : m_streams) {
|
||||
for (auto& stream : m_streams) {
|
||||
stream = nullptr;
|
||||
}
|
||||
}
|
||||
// There is a scenario where hipResetDevice destroys stream per thread
|
||||
// hence isValid check is required to make sure only valid stream is used
|
||||
if (m_streams[currDev] == nullptr || !hip::isValid(m_streams[currDev])) {
|
||||
hipError_t status = ihipStreamCreate(&m_streams[currDev], hipStreamDefault,
|
||||
hip::Stream::Priority::Normal);
|
||||
hipError_t status =
|
||||
ihipStreamCreate(&m_streams[currDev], hipStreamDefault, hip::Stream::Priority::Normal);
|
||||
if (status != hipSuccess) {
|
||||
DevLogError("Stream creation failed");
|
||||
}
|
||||
@@ -266,7 +261,7 @@ hipStream_t getPerThreadDefaultStream() {
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
hipError_t hipStreamCreateWithFlags(hipStream_t *stream, unsigned int flags) {
|
||||
hipError_t hipStreamCreateWithFlags(hipStream_t* stream, unsigned int flags) {
|
||||
HIP_INIT_API(hipStreamCreateWithFlags, stream, flags);
|
||||
|
||||
if (stream == nullptr) {
|
||||
@@ -277,7 +272,7 @@ hipError_t hipStreamCreateWithFlags(hipStream_t *stream, unsigned int flags) {
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
hipError_t hipStreamCreate(hipStream_t *stream) {
|
||||
hipError_t hipStreamCreate(hipStream_t* stream) {
|
||||
HIP_INIT_API(hipStreamCreate, stream);
|
||||
|
||||
if (stream == nullptr) {
|
||||
@@ -417,8 +412,8 @@ hipError_t hipStreamDestroy(hipStream_t stream) {
|
||||
g_allCapturingStreams.erase(g_it);
|
||||
}
|
||||
}
|
||||
const auto& l_it = std::find(hip::tls.capture_streams_.begin(),
|
||||
hip::tls.capture_streams_.end(), s);
|
||||
const auto& l_it =
|
||||
std::find(hip::tls.capture_streams_.begin(), hip::tls.capture_streams_.end(), s);
|
||||
if (l_it != hip::tls.capture_streams_.end()) {
|
||||
hip::tls.capture_streams_.erase(l_it);
|
||||
}
|
||||
@@ -429,7 +424,7 @@ hipError_t hipStreamDestroy(hipStream_t stream) {
|
||||
|
||||
// ================================================================================================
|
||||
void WaitThenDecrementSignal(hipStream_t stream, hipError_t status, void* user_data) {
|
||||
CallbackData* data = reinterpret_cast<CallbackData*>(user_data);
|
||||
CallbackData* data = reinterpret_cast<CallbackData*>(user_data);
|
||||
int offset = data->previous_read_index % IPC_SIGNALS_PER_EVENT;
|
||||
while (data->shmem->read_index < data->previous_read_index + IPC_SIGNALS_PER_EVENT &&
|
||||
data->shmem->signal[offset] != 0) {
|
||||
@@ -458,8 +453,8 @@ hipError_t hipStreamWaitEvent_common(hipStream_t stream, hipEvent_t event, unsig
|
||||
hip::Stream* eventStream = reinterpret_cast<hip::Stream*>(eventStreamHandle);
|
||||
if (eventStream != nullptr && eventStream->IsEventCaptured(event) == true) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_API,
|
||||
"[hipGraph] Current capture node StreamWaitEvent on stream : %p, Event %p", stream,
|
||||
event);
|
||||
"[hipGraph] Current capture node StreamWaitEvent on stream : %p, Event %p", stream,
|
||||
event);
|
||||
if (waitStream == nullptr) {
|
||||
return hipErrorInvalidHandle;
|
||||
}
|
||||
@@ -680,7 +675,8 @@ hipError_t hipExtStreamCreateWithCUMask(hipStream_t* stream, uint32_t cuMaskSize
|
||||
|
||||
const std::vector<uint32_t> cuMaskv(cuMask, cuMask + cuMaskSize);
|
||||
|
||||
HIP_RETURN(ihipStreamCreate(stream, hipStreamDefault, hip::Stream::Priority::Normal, cuMaskv), *stream);
|
||||
HIP_RETURN(ihipStreamCreate(stream, hipStreamDefault, hip::Stream::Priority::Normal, cuMaskv),
|
||||
*stream);
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
@@ -727,8 +723,7 @@ hipError_t hipExtStreamGetCUMask(hipStream_t stream, uint32_t cuMaskSize, uint32
|
||||
|
||||
// find the minimum cuMaskSize required to present the CU mask bit-array in a patch of 32 bits
|
||||
// and return error if the cuMaskSize argument is less than cuMaskSizeRequired
|
||||
uint32_t cuMaskSizeRequired = info.maxComputeUnits_ / 32 +
|
||||
((info.maxComputeUnits_ % 32) ? 1 : 0);
|
||||
uint32_t cuMaskSizeRequired = info.maxComputeUnits_ / 32 + ((info.maxComputeUnits_ % 32) ? 1 : 0);
|
||||
|
||||
if (cuMaskSize < cuMaskSizeRequired) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
@@ -763,11 +758,11 @@ hipError_t hipExtStreamGetCUMask(hipStream_t stream, uint32_t cuMaskSize, uint32
|
||||
std::copy(defaultCUMask.begin(), defaultCUMask.end(), cuMask);
|
||||
}
|
||||
} else {
|
||||
// if the stream is not null then get the stream's CU mask and return one of the below cases
|
||||
// case1 if globalCUMask_ is defined then return the AND of globalCUMask_ and stream's CU mask
|
||||
// case2 if globalCUMask_ is not defined then retuen AND of defaultCUMask and stream's CU mask
|
||||
// in both cases above if stream's CU mask is empty then either globalCUMask_ (for case1)
|
||||
// or defaultCUMask(for case2) will be returned
|
||||
// if the stream is not null then get the stream's CU mask and return one of the below cases
|
||||
// case1 if globalCUMask_ is defined then return the AND of globalCUMask_ and stream's CU mask
|
||||
// case2 if globalCUMask_ is not defined then retuen AND of defaultCUMask and stream's CU mask
|
||||
// in both cases above if stream's CU mask is empty then either globalCUMask_ (for case1)
|
||||
// or defaultCUMask(for case2) will be returned
|
||||
std::vector<uint32_t> streamCUMask;
|
||||
streamCUMask = reinterpret_cast<hip::Stream*>(stream)->GetCUMask();
|
||||
std::vector<uint32_t> mask = {};
|
||||
@@ -780,7 +775,7 @@ hipError_t hipExtStreamGetCUMask(hipStream_t stream, uint32_t cuMaskSize, uint32
|
||||
mask.push_back(streamCUMask[i] & defaultCUMask[i]);
|
||||
}
|
||||
// check to make sure after ANDing streamCUMask (custom-defined) with global CU mask,
|
||||
//we have non-zero mask, oterwise just return either globalCUMask_ or defaultCUMask
|
||||
// we have non-zero mask, oterwise just return either globalCUMask_ or defaultCUMask
|
||||
bool zeroCUMask = true;
|
||||
for (auto m : mask) {
|
||||
if (m != 0) {
|
||||
@@ -822,7 +817,7 @@ hipError_t hipStreamGetDevice(hipStream_t stream, hipDevice_t* device) {
|
||||
}
|
||||
// ================================================================================================
|
||||
hipError_t hipStreamSetAttribute(hipStream_t stream, hipStreamAttrID attr,
|
||||
const hipStreamAttrValue *value) {
|
||||
const hipStreamAttrValue* value) {
|
||||
HIP_INIT_API(hipStreamSetAttribute, stream, attr, value);
|
||||
hipError_t status = hipSuccess;
|
||||
if (value == nullptr) {
|
||||
@@ -861,7 +856,7 @@ hipError_t hipStreamSetAttribute(hipStream_t stream, hipStreamAttrID attr,
|
||||
}
|
||||
|
||||
hipError_t hipStreamGetAttribute(hipStream_t stream, hipStreamAttrID attr,
|
||||
hipStreamAttrValue *value_out) {
|
||||
hipStreamAttrValue* value_out) {
|
||||
HIP_INIT_API(hipStreamGetAttribute, stream, attr, value_out);
|
||||
|
||||
if (value_out == nullptr) {
|
||||
@@ -876,7 +871,7 @@ hipError_t hipStreamGetAttribute(hipStream_t stream, hipStreamAttrID attr,
|
||||
|
||||
hip::Stream* s = reinterpret_cast<hip::Stream*>(stream);
|
||||
|
||||
switch(attr) {
|
||||
switch (attr) {
|
||||
case hipStreamAttributeSynchronizationPolicy: {
|
||||
value_out->syncPolicy = static_cast<hipSynchronizationPolicy>(s->GetSyncPolicy());
|
||||
break;
|
||||
@@ -892,4 +887,4 @@ hipError_t hipStreamGetAttribute(hipStream_t stream, hipStreamAttrID attr,
|
||||
|
||||
HIP_RETURN(hipSuccess);
|
||||
}
|
||||
} // hip namespace
|
||||
} // namespace hip
|
||||
|
||||
@@ -79,26 +79,26 @@ hipError_t ihipStreamOperation(hipStream_t stream, cl_command_type cmdType, void
|
||||
// 'flags' for now used only for Wait, but in future there will usecases for Write too.
|
||||
|
||||
if (cmdType == ROCCLR_COMMAND_STREAM_WAIT_VALUE) {
|
||||
// Stream Wait on AQL barrier-value type packet is only supported on SignalMemory objects
|
||||
if (GPU_STREAMOPS_CP_WAIT && (!(memory->getMemFlags() & ROCCLR_MEM_HSA_SIGNAL_MEMORY))) {
|
||||
// Stream Wait on AQL barrier-value type packet is only supported on SignalMemory objects
|
||||
if (GPU_STREAMOPS_CP_WAIT && (!(memory->getMemFlags() & ROCCLR_MEM_HSA_SIGNAL_MEMORY))) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
switch (flags) {
|
||||
case hipStreamWaitValueGte:
|
||||
outFlags = ROCCLR_STREAM_WAIT_VALUE_GTE;
|
||||
break;
|
||||
break;
|
||||
case hipStreamWaitValueEq:
|
||||
outFlags = ROCCLR_STREAM_WAIT_VALUE_EQ;
|
||||
break;
|
||||
break;
|
||||
case hipStreamWaitValueAnd:
|
||||
outFlags = ROCCLR_STREAM_WAIT_VALUE_AND;
|
||||
break;
|
||||
break;
|
||||
case hipStreamWaitValueNor:
|
||||
outFlags = ROCCLR_STREAM_WAIT_VALUE_NOR;
|
||||
break;
|
||||
break;
|
||||
default:
|
||||
return hipErrorInvalidValue;
|
||||
break;
|
||||
break;
|
||||
}
|
||||
} else if (cmdType != ROCCLR_COMMAND_STREAM_WRITE_VALUE) {
|
||||
return hipErrorInvalidValue;
|
||||
@@ -108,8 +108,8 @@ hipError_t ihipStreamOperation(hipStream_t stream, cl_command_type cmdType, void
|
||||
amd::Command::EventWaitList waitList;
|
||||
|
||||
amd::StreamOperationCommand* command =
|
||||
new amd::StreamOperationCommand(*hip_stream, cmdType, waitList, *memory->asBuffer(),
|
||||
value, mask, outFlags, offset, sizeBytes);
|
||||
new amd::StreamOperationCommand(*hip_stream, cmdType, waitList, *memory->asBuffer(), value,
|
||||
mask, outFlags, offset, sizeBytes);
|
||||
|
||||
if (command == nullptr) {
|
||||
return hipErrorOutOfMemory;
|
||||
@@ -124,61 +124,39 @@ hipError_t hipStreamWaitValue32(hipStream_t stream, void* ptr, uint32_t value, u
|
||||
HIP_INIT_API(hipStreamWaitValue32, stream, ptr, value, mask, flags);
|
||||
// NOTE: ptr corresponds to a HSA Signal memeory which is 64 bits.
|
||||
// 32 bit value and mask are converted to 64-bit values.
|
||||
HIP_RETURN_DURATION(ihipStreamOperation(
|
||||
stream,
|
||||
ROCCLR_COMMAND_STREAM_WAIT_VALUE,
|
||||
ptr,
|
||||
value,
|
||||
mask,
|
||||
flags,
|
||||
sizeof(uint32_t)));
|
||||
HIP_RETURN_DURATION(ihipStreamOperation(stream, ROCCLR_COMMAND_STREAM_WAIT_VALUE, ptr, value,
|
||||
mask, flags, sizeof(uint32_t)));
|
||||
}
|
||||
|
||||
hipError_t hipStreamWaitValue64(hipStream_t stream, void* ptr, uint64_t value, unsigned int flags,
|
||||
uint64_t mask) {
|
||||
HIP_INIT_API(hipStreamWaitValue64, stream, ptr, value, mask, flags);
|
||||
HIP_RETURN_DURATION(ihipStreamOperation(
|
||||
stream,
|
||||
ROCCLR_COMMAND_STREAM_WAIT_VALUE,
|
||||
ptr,
|
||||
value,
|
||||
mask,
|
||||
flags,
|
||||
sizeof(uint64_t)));
|
||||
HIP_RETURN_DURATION(ihipStreamOperation(stream, ROCCLR_COMMAND_STREAM_WAIT_VALUE, ptr, value,
|
||||
mask, flags, sizeof(uint64_t)));
|
||||
}
|
||||
|
||||
hipError_t hipStreamWriteValue32(hipStream_t stream, void* ptr, uint32_t value, unsigned int flags) {
|
||||
hipError_t hipStreamWriteValue32(hipStream_t stream, void* ptr, uint32_t value,
|
||||
unsigned int flags) {
|
||||
HIP_INIT_API(hipStreamWriteValue32, stream, ptr, value, flags);
|
||||
HIP_RETURN_DURATION(ihipStreamOperation(
|
||||
stream,
|
||||
ROCCLR_COMMAND_STREAM_WRITE_VALUE,
|
||||
ptr,
|
||||
value,
|
||||
0, // mask un-used set it to 0
|
||||
0, // flags un-used for now set it to 0
|
||||
sizeof(uint32_t)));
|
||||
HIP_RETURN_DURATION(ihipStreamOperation(stream, ROCCLR_COMMAND_STREAM_WRITE_VALUE, ptr, value,
|
||||
0, // mask un-used set it to 0
|
||||
0, // flags un-used for now set it to 0
|
||||
sizeof(uint32_t)));
|
||||
}
|
||||
|
||||
hipError_t hipStreamWriteValue64(hipStream_t stream, void* ptr, uint64_t value, unsigned int flags) {
|
||||
hipError_t hipStreamWriteValue64(hipStream_t stream, void* ptr, uint64_t value,
|
||||
unsigned int flags) {
|
||||
HIP_INIT_API(hipStreamWriteValue64, stream, ptr, value, flags);
|
||||
HIP_RETURN_DURATION(ihipStreamOperation(
|
||||
stream,
|
||||
ROCCLR_COMMAND_STREAM_WRITE_VALUE,
|
||||
ptr,
|
||||
value,
|
||||
0, // mask un-used set it to 0
|
||||
0, // flags un-used for now set it to 0
|
||||
sizeof(uint64_t)));
|
||||
HIP_RETURN_DURATION(ihipStreamOperation(stream, ROCCLR_COMMAND_STREAM_WRITE_VALUE, ptr, value,
|
||||
0, // mask un-used set it to 0
|
||||
0, // flags un-used for now set it to 0
|
||||
sizeof(uint64_t)));
|
||||
}
|
||||
|
||||
hipError_t hipStreamBatchMemOp(hipStream_t stream, unsigned int count,
|
||||
hipStreamBatchMemOpParams* paramArray, unsigned int flags) {
|
||||
HIP_INIT_API(hipStreamBatchMemOp, count, paramArray, flags);
|
||||
HIP_RETURN_DURATION(ihipBatchMemOperation(
|
||||
stream,
|
||||
ROCCLR_COMMAND_BATCH_STREAM,
|
||||
count,
|
||||
paramArray,
|
||||
flags));
|
||||
}
|
||||
HIP_RETURN_DURATION(
|
||||
ihipBatchMemOperation(stream, ROCCLR_COMMAND_BATCH_STREAM, count, paramArray, flags));
|
||||
}
|
||||
} // namespace hip
|
||||
|
||||
@@ -62,7 +62,7 @@ hipError_t ihipCreateSurfaceObject(hipSurfaceObject_t* pSurfObject,
|
||||
}
|
||||
|
||||
if (pResDesc->res.array.array->flags != hipArrayDefault &&
|
||||
(pResDesc->res.array.array->flags & hipArraySurfaceLoadStore) == 0) {
|
||||
(pResDesc->res.array.array->flags & hipArraySurfaceLoadStore) == 0) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
@@ -78,8 +78,8 @@ hipError_t ihipCreateSurfaceObject(hipSurfaceObject_t* pSurfObject,
|
||||
image = as_amd(memObj)->asImage();
|
||||
|
||||
void* surfObjectBuffer = nullptr;
|
||||
hipError_t err = ihipMalloc(&surfObjectBuffer, sizeof(__hip_surface),
|
||||
CL_MEM_SVM_FINE_GRAIN_BUFFER);
|
||||
hipError_t err =
|
||||
ihipMalloc(&surfObjectBuffer, sizeof(__hip_surface), CL_MEM_SVM_FINE_GRAIN_BUFFER);
|
||||
if (surfObjectBuffer == nullptr || err != hipSuccess) {
|
||||
return hipErrorOutOfMemory;
|
||||
}
|
||||
@@ -112,4 +112,4 @@ hipError_t hipDestroySurfaceObject(hipSurfaceObject_t surfaceObject) {
|
||||
|
||||
HIP_RETURN(ihipDestroySurfaceObject(surfaceObject));
|
||||
}
|
||||
} //namespace hip
|
||||
} // namespace hip
|
||||
|
||||
@@ -421,13 +421,11 @@ extern "C" hipError_t hipGetDevicePropertiesR0000(hipDeviceProp_tR0000* prop, in
|
||||
}
|
||||
hipError_t hipGetDriverEntryPoint(const char* symbol, void** funcPtr, unsigned long long flags,
|
||||
hipDriverEntryPointQueryResult* status) {
|
||||
return hip::GetHipDispatchTable()->hipGetDriverEntryPoint_fn(symbol, funcPtr, flags,
|
||||
status);
|
||||
return hip::GetHipDispatchTable()->hipGetDriverEntryPoint_fn(symbol, funcPtr, flags, status);
|
||||
}
|
||||
hipError_t hipGetDriverEntryPoint_spt(const char* symbol, void** funcPtr, unsigned long long flags,
|
||||
hipDriverEntryPointQueryResult* status) {
|
||||
return hip::GetHipDispatchTable()->hipGetDriverEntryPoint_spt_fn(symbol, funcPtr, flags,
|
||||
status);
|
||||
hipDriverEntryPointQueryResult* status) {
|
||||
return hip::GetHipDispatchTable()->hipGetDriverEntryPoint_spt_fn(symbol, funcPtr, flags, status);
|
||||
}
|
||||
const char* hipGetErrorName(hipError_t hip_error) {
|
||||
return hip::GetHipDispatchTable()->hipGetErrorName_fn(hip_error);
|
||||
@@ -556,11 +554,11 @@ hipError_t hipGraphAddMemsetNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
|
||||
return hip::GetHipDispatchTable()->hipGraphAddMemsetNode_fn(pGraphNode, graph, pDependencies,
|
||||
numDependencies, pMemsetParams);
|
||||
}
|
||||
hipError_t hipGraphAddNode(hipGraphNode_t *pGraphNode, hipGraph_t graph,
|
||||
const hipGraphNode_t *pDependencies, size_t numDependencies,
|
||||
hipGraphNodeParams *nodeParams) {
|
||||
return hip::GetHipDispatchTable()->hipGraphAddNode_fn(pGraphNode, graph,
|
||||
pDependencies, numDependencies, nodeParams);
|
||||
hipError_t hipGraphAddNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
|
||||
const hipGraphNode_t* pDependencies, size_t numDependencies,
|
||||
hipGraphNodeParams* nodeParams) {
|
||||
return hip::GetHipDispatchTable()->hipGraphAddNode_fn(pGraphNode, graph, pDependencies,
|
||||
numDependencies, nodeParams);
|
||||
}
|
||||
hipError_t hipGraphChildGraphNodeGetGraph(hipGraphNode_t node, hipGraph_t* pGraph) {
|
||||
return hip::GetHipDispatchTable()->hipGraphChildGraphNodeGetGraph_fn(node, pGraph);
|
||||
@@ -599,8 +597,7 @@ hipError_t hipGraphExecChildGraphNodeSetParams(hipGraphExec_t hGraphExec, hipGra
|
||||
}
|
||||
hipError_t hipGraphExecNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t node,
|
||||
hipGraphNodeParams* nodeParams) {
|
||||
return hip::GetHipDispatchTable()->hipGraphExecNodeSetParams_fn(hGraphExec, node,
|
||||
nodeParams);
|
||||
return hip::GetHipDispatchTable()->hipGraphExecNodeSetParams_fn(hGraphExec, node, nodeParams);
|
||||
}
|
||||
hipError_t hipGraphExecDestroy(hipGraphExec_t graphExec) {
|
||||
return hip::GetHipDispatchTable()->hipGraphExecDestroy_fn(graphExec);
|
||||
@@ -686,9 +683,9 @@ hipError_t hipGraphInstantiateWithFlags(hipGraphExec_t* pGraphExec, hipGraph_t g
|
||||
return hip::GetHipDispatchTable()->hipGraphInstantiateWithFlags_fn(pGraphExec, graph, flags);
|
||||
}
|
||||
hipError_t hipGraphInstantiateWithParams(hipGraphExec_t* pGraphExec, hipGraph_t graph,
|
||||
hipGraphInstantiateParams* instantiateParams) {
|
||||
hipGraphInstantiateParams* instantiateParams) {
|
||||
return hip::GetHipDispatchTable()->hipGraphInstantiateWithParams_fn(pGraphExec, graph,
|
||||
instantiateParams);
|
||||
instantiateParams);
|
||||
}
|
||||
hipError_t hipGraphKernelNodeCopyAttributes(hipGraphNode_t hSrc, hipGraphNode_t hDst) {
|
||||
return hip::GetHipDispatchTable()->hipGraphKernelNodeCopyAttributes_fn(hSrc, hDst);
|
||||
@@ -840,10 +837,10 @@ hipError_t hipImportExternalSemaphore(hipExternalSemaphore_t* extSem_out,
|
||||
return hip::GetHipDispatchTable()->hipImportExternalSemaphore_fn(extSem_out, semHandleDesc);
|
||||
}
|
||||
hipError_t hipDrvGraphAddMemsetNode(hipGraphNode_t* phGraphNode, hipGraph_t hGraph,
|
||||
const hipGraphNode_t* dependencies, size_t numDependencies,
|
||||
const hipMemsetParams* memsetParams, hipCtx_t ctx) {
|
||||
return hip::GetHipDispatchTable()->hipDrvGraphAddMemsetNode_fn(phGraphNode, hGraph,
|
||||
dependencies, numDependencies, memsetParams, ctx);
|
||||
const hipGraphNode_t* dependencies, size_t numDependencies,
|
||||
const hipMemsetParams* memsetParams, hipCtx_t ctx) {
|
||||
return hip::GetHipDispatchTable()->hipDrvGraphAddMemsetNode_fn(
|
||||
phGraphNode, hGraph, dependencies, numDependencies, memsetParams, ctx);
|
||||
}
|
||||
hipError_t hipInit(unsigned int flags) { return hip::GetHipDispatchTable()->hipInit_fn(flags); }
|
||||
hipError_t hipIpcCloseMemHandle(void* devPtr) {
|
||||
@@ -1288,23 +1285,25 @@ hipError_t hipModuleLoadDataEx(hipModule_t* module, const void* image, unsigned
|
||||
optionValues);
|
||||
}
|
||||
|
||||
hipError_t hipLinkAddData(hipLinkState_t state, hipJitInputType type, void* data, size_t size, const char* name,
|
||||
unsigned int numOptions, hipJitOption* options, void** optionValues) {
|
||||
hipError_t hipLinkAddData(hipLinkState_t state, hipJitInputType type, void* data, size_t size,
|
||||
const char* name, unsigned int numOptions, hipJitOption* options,
|
||||
void** optionValues) {
|
||||
return hip::GetHipDispatchTable()->hipLinkAddData_fn(state, type, data, size, name, numOptions,
|
||||
options, optionValues);
|
||||
}
|
||||
|
||||
hipError_t hipLinkAddFile(hipLinkState_t state, hipJitInputType type, const char* path,
|
||||
unsigned int numOptions, hipJitOption* options, void** optionValues) {
|
||||
return hip::GetHipDispatchTable()->hipLinkAddFile_fn(state, type, path, numOptions, options,
|
||||
optionValues);
|
||||
return hip::GetHipDispatchTable()->hipLinkAddFile_fn(state, type, path, numOptions, options,
|
||||
optionValues);
|
||||
}
|
||||
|
||||
hipError_t hipLinkComplete(hipLinkState_t state, void** hipBinOut, size_t* sizeOut) {
|
||||
return hip::GetHipDispatchTable()->hipLinkComplete_fn(state, hipBinOut, sizeOut);
|
||||
}
|
||||
|
||||
hipError_t hipLinkCreate(unsigned int numOptions, hipJitOption* options, void** optionValues, hipLinkState_t* stateOut) {
|
||||
hipError_t hipLinkCreate(unsigned int numOptions, hipJitOption* options, void** optionValues,
|
||||
hipLinkState_t* stateOut) {
|
||||
return hip::GetHipDispatchTable()->hipLinkCreate_fn(numOptions, options, optionValues, stateOut);
|
||||
}
|
||||
|
||||
@@ -1600,28 +1599,30 @@ extern "C" hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w,
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
# define DllExport __declspec(dllexport)
|
||||
#else // !_WIN32
|
||||
# define DllExport
|
||||
#endif // !_WIN32
|
||||
#define DllExport __declspec(dllexport)
|
||||
#else // !_WIN32
|
||||
#define DllExport
|
||||
#endif // !_WIN32
|
||||
|
||||
DllExport hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
|
||||
uint32_t localWorkSizeX, uint32_t localWorkSizeY,
|
||||
uint32_t localWorkSizeZ, size_t sharedMemBytes,
|
||||
hipStream_t hStream, void** kernelParams, void** extra,
|
||||
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags) {
|
||||
uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
|
||||
uint32_t localWorkSizeX, uint32_t localWorkSizeY,
|
||||
uint32_t localWorkSizeZ, size_t sharedMemBytes,
|
||||
hipStream_t hStream, void** kernelParams,
|
||||
void** extra, hipEvent_t startEvent,
|
||||
hipEvent_t stopEvent, uint32_t flags) {
|
||||
return hip::GetHipDispatchTable()->hipExtModuleLaunchKernel_fn(
|
||||
f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY,
|
||||
localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, flags);
|
||||
}
|
||||
|
||||
DllExport hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
|
||||
uint32_t localWorkSizeX, uint32_t localWorkSizeY,
|
||||
uint32_t localWorkSizeZ, size_t sharedMemBytes,
|
||||
hipStream_t hStream, void** kernelParams, void** extra,
|
||||
hipEvent_t startEvent, hipEvent_t stopEvent) {
|
||||
uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
|
||||
uint32_t localWorkSizeX, uint32_t localWorkSizeY,
|
||||
uint32_t localWorkSizeZ, size_t sharedMemBytes,
|
||||
hipStream_t hStream, void** kernelParams,
|
||||
void** extra, hipEvent_t startEvent,
|
||||
hipEvent_t stopEvent) {
|
||||
return hip::GetHipDispatchTable()->hipHccModuleLaunchKernel_fn(
|
||||
f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY,
|
||||
localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent);
|
||||
@@ -1785,16 +1786,14 @@ hipError_t hipLaunchHostFunc_spt(hipStream_t stream, hipHostFn_t fn, void* userD
|
||||
extern "C" int hipGetStreamDeviceId(hipStream_t stream) {
|
||||
return hip::GetHipDispatchTable()->hipGetStreamDeviceId_fn(stream);
|
||||
}
|
||||
hipError_t hipExtGetLastError() {
|
||||
return hip::GetHipDispatchTable()->hipExtGetLastError_fn();
|
||||
}
|
||||
hipError_t hipExtGetLastError() { return hip::GetHipDispatchTable()->hipExtGetLastError_fn(); }
|
||||
hipError_t hipTexRefGetBorderColor(float* pBorderColor, const textureReference* texRef) {
|
||||
return hip::GetHipDispatchTable()->hipTexRefGetBorderColor_fn(pBorderColor, texRef);
|
||||
}
|
||||
hipError_t hipTexRefGetArray(hipArray_t* pArray, const textureReference* texRef) {
|
||||
return hip::GetHipDispatchTable()->hipTexRefGetArray_fn(pArray, texRef);
|
||||
}
|
||||
extern "C" hipError_t hipGetProcAddress(const char* symbol, void** pfn, int hipVersion,
|
||||
extern "C" hipError_t hipGetProcAddress(const char* symbol, void** pfn, int hipVersion,
|
||||
uint64_t flags,
|
||||
hipDriverProcAddressQueryResult* symbolStatus) {
|
||||
return hip::GetHipDispatchTable()->hipGetProcAddress_fn(symbol, pfn, hipVersion, flags,
|
||||
@@ -1811,32 +1810,30 @@ hipError_t hipGetFuncBySymbol(hipFunction_t* functionPtr, const void* symbolPtr)
|
||||
return hip::GetHipDispatchTable()->hipGetFuncBySymbol_fn(functionPtr, symbolPtr);
|
||||
}
|
||||
hipError_t hipDrvGraphExecMemsetNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
|
||||
const hipMemsetParams* memsetParams, hipCtx_t ctx) {
|
||||
const hipMemsetParams* memsetParams, hipCtx_t ctx) {
|
||||
return hip::GetHipDispatchTable()->hipDrvGraphExecMemsetNodeSetParams_fn(hGraphExec, hNode,
|
||||
memsetParams, ctx);
|
||||
memsetParams, ctx);
|
||||
}
|
||||
hipError_t hipGraphExecGetFlags(hipGraphExec_t graphExec, unsigned long long* flags) {
|
||||
return hip::GetHipDispatchTable()->hipGraphExecGetFlags_fn(graphExec, flags);
|
||||
return hip::GetHipDispatchTable()->hipGraphExecGetFlags_fn(graphExec, flags);
|
||||
}
|
||||
hipError_t hipDrvGraphAddMemFreeNode(hipGraphNode_t* phGraphNode, hipGraph_t hGraph,
|
||||
const hipGraphNode_t* dependencies, size_t numDependencies,
|
||||
hipDeviceptr_t dptr) {
|
||||
return hip::GetHipDispatchTable()->hipDrvGraphAddMemFreeNode_fn(phGraphNode, hGraph,
|
||||
dependencies, numDependencies,
|
||||
dptr);
|
||||
const hipGraphNode_t* dependencies, size_t numDependencies,
|
||||
hipDeviceptr_t dptr) {
|
||||
return hip::GetHipDispatchTable()->hipDrvGraphAddMemFreeNode_fn(phGraphNode, hGraph, dependencies,
|
||||
numDependencies, dptr);
|
||||
}
|
||||
hipError_t hipDrvGraphExecMemcpyNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
|
||||
const HIP_MEMCPY3D* copyParams, hipCtx_t ctx) {
|
||||
const HIP_MEMCPY3D* copyParams, hipCtx_t ctx) {
|
||||
return hip::GetHipDispatchTable()->hipDrvGraphExecMemcpyNodeSetParams_fn(hGraphExec, hNode,
|
||||
copyParams, ctx);
|
||||
copyParams, ctx);
|
||||
}
|
||||
hipError_t hipSetValidDevices(int* device_arr, int len) {
|
||||
return hip::GetHipDispatchTable()->hipSetValidDevices_fn(device_arr, len);
|
||||
}
|
||||
hipError_t hipMemcpyAtoD(hipDeviceptr_t dstDevice, hipArray_t srcArray, size_t srcOffset,
|
||||
size_t ByteCount) {
|
||||
return hip::GetHipDispatchTable()->hipMemcpyAtoD_fn(dstDevice, srcArray, srcOffset,
|
||||
ByteCount);
|
||||
return hip::GetHipDispatchTable()->hipMemcpyAtoD_fn(dstDevice, srcArray, srcOffset, ByteCount);
|
||||
}
|
||||
hipError_t hipMemcpyDtoA(hipArray_t dstArray, size_t dstOffset, hipDeviceptr_t srcDevice,
|
||||
size_t ByteCount) {
|
||||
@@ -1869,7 +1866,7 @@ hipError_t hipDrvGraphMemcpyNodeGetParams(hipGraphNode_t hNode, HIP_MEMCPY3D* no
|
||||
hipError_t hipDrvGraphMemcpyNodeSetParams(hipGraphNode_t hNode, const HIP_MEMCPY3D* nodeParams) {
|
||||
return hip::GetHipDispatchTable()->hipDrvGraphMemcpyNodeSetParams_fn(hNode, nodeParams);
|
||||
}
|
||||
hipError_t hipGraphNodeSetParams(hipGraphNode_t node, hipGraphNodeParams *nodeParams) {
|
||||
hipError_t hipGraphNodeSetParams(hipGraphNode_t node, hipGraphNodeParams* nodeParams) {
|
||||
return hip::GetHipDispatchTable()->hipGraphNodeSetParams_fn(node, nodeParams);
|
||||
}
|
||||
hipError_t hipGraphAddBatchMemOpNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
|
||||
@@ -1886,8 +1883,7 @@ hipError_t hipGraphBatchMemOpNodeSetParams(hipGraphNode_t hNode,
|
||||
hipBatchMemOpNodeParams* nodeParams) {
|
||||
return hip::GetHipDispatchTable()->hipGraphBatchMemOpNodeSetParams_fn(hNode, nodeParams);
|
||||
}
|
||||
hipError_t hipGraphExecBatchMemOpNodeSetParams(hipGraphExec_t hGraphExec,
|
||||
hipGraphNode_t hNode,
|
||||
hipError_t hipGraphExecBatchMemOpNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
|
||||
const hipBatchMemOpNodeParams* nodeParams) {
|
||||
return hip::GetHipDispatchTable()->hipGraphExecBatchMemOpNodeSetParams_fn(hGraphExec, hNode,
|
||||
nodeParams);
|
||||
@@ -1915,51 +1911,51 @@ hipError_t hipMemsetD2D8(hipDeviceptr_t dst, size_t dstPitch, unsigned char valu
|
||||
size_t height) {
|
||||
return hip::GetHipDispatchTable()->hipMemsetD2D8_fn(dst, dstPitch, value, width, height);
|
||||
}
|
||||
hipError_t hipMemsetD2D8Async(hipDeviceptr_t dst, size_t dstPitch, unsigned char value, size_t width,
|
||||
size_t height, hipStream_t stream) {
|
||||
hipError_t hipMemsetD2D8Async(hipDeviceptr_t dst, size_t dstPitch, unsigned char value,
|
||||
size_t width, size_t height, hipStream_t stream) {
|
||||
return hip::GetHipDispatchTable()->hipMemsetD2D8Async_fn(dst, dstPitch, value, width, height,
|
||||
stream);
|
||||
}
|
||||
hipError_t hipMemsetD2D16(hipDeviceptr_t dst, size_t dstPitch, unsigned short value, size_t width,
|
||||
size_t height) {
|
||||
size_t height) {
|
||||
return hip::GetHipDispatchTable()->hipMemsetD2D16_fn(dst, dstPitch, value, width, height);
|
||||
}
|
||||
hipError_t hipMemsetD2D16Async(hipDeviceptr_t dst, size_t dstPitch, unsigned short value, size_t width,
|
||||
size_t height, hipStream_t stream) {
|
||||
hipError_t hipMemsetD2D16Async(hipDeviceptr_t dst, size_t dstPitch, unsigned short value,
|
||||
size_t width, size_t height, hipStream_t stream) {
|
||||
return hip::GetHipDispatchTable()->hipMemsetD2D16Async_fn(dst, dstPitch, value, width, height,
|
||||
stream);
|
||||
stream);
|
||||
}
|
||||
hipError_t hipMemsetD2D32(hipDeviceptr_t dst, size_t dstPitch, unsigned int value, size_t width,
|
||||
size_t height) {
|
||||
size_t height) {
|
||||
return hip::GetHipDispatchTable()->hipMemsetD2D32_fn(dst, dstPitch, value, width, height);
|
||||
}
|
||||
hipError_t hipMemsetD2D32Async(hipDeviceptr_t dst, size_t dstPitch, unsigned int value, size_t width,
|
||||
size_t height, hipStream_t stream) {
|
||||
hipError_t hipMemsetD2D32Async(hipDeviceptr_t dst, size_t dstPitch, unsigned int value,
|
||||
size_t width, size_t height, hipStream_t stream) {
|
||||
return hip::GetHipDispatchTable()->hipMemsetD2D32Async_fn(dst, dstPitch, value, width, height,
|
||||
stream);
|
||||
stream);
|
||||
}
|
||||
hipError_t hipStreamSetAttribute(hipStream_t stream, hipStreamAttrID attr,
|
||||
const hipStreamAttrValue *value) {
|
||||
const hipStreamAttrValue* value) {
|
||||
return hip::GetHipDispatchTable()->hipStreamSetAttribute_fn(stream, attr, value);
|
||||
}
|
||||
hipError_t hipStreamGetAttribute(hipStream_t stream, hipStreamAttrID attr,
|
||||
hipStreamAttrValue *value) {
|
||||
hipStreamAttrValue* value) {
|
||||
return hip::GetHipDispatchTable()->hipStreamGetAttribute_fn(stream, attr, value);
|
||||
}
|
||||
hipError_t hipMemcpyBatchAsync(void **dsts, void **srcs, size_t *sizes, size_t count,
|
||||
hipMemcpyAttributes *attrs, size_t *attrsIdxs, size_t numAttrs,
|
||||
size_t *failIdx, hipStream_t stream) {
|
||||
return hip::GetHipDispatchTable()->hipMemcpyBatchAsync_fn(dsts, srcs, sizes, count, attrs,
|
||||
attrsIdxs, numAttrs, failIdx, stream);
|
||||
hipError_t hipMemcpyBatchAsync(void** dsts, void** srcs, size_t* sizes, size_t count,
|
||||
hipMemcpyAttributes* attrs, size_t* attrsIdxs, size_t numAttrs,
|
||||
size_t* failIdx, hipStream_t stream) {
|
||||
return hip::GetHipDispatchTable()->hipMemcpyBatchAsync_fn(dsts, srcs, sizes, count, attrs,
|
||||
attrsIdxs, numAttrs, failIdx, stream);
|
||||
}
|
||||
hipError_t hipMemcpy3DBatchAsync(size_t numOps, struct hipMemcpy3DBatchOp *opList, size_t *failIdx,
|
||||
hipError_t hipMemcpy3DBatchAsync(size_t numOps, struct hipMemcpy3DBatchOp* opList, size_t* failIdx,
|
||||
unsigned long long flags, hipStream_t stream) {
|
||||
return hip::GetHipDispatchTable()->hipMemcpy3DBatchAsync_fn(numOps, opList, failIdx, flags,
|
||||
stream);
|
||||
}
|
||||
hipError_t hipMemcpy3DPeer(hipMemcpy3DPeerParms *p) {
|
||||
hipError_t hipMemcpy3DPeer(hipMemcpy3DPeerParms* p) {
|
||||
return hip::GetHipDispatchTable()->hipMemcpy3DPeer_fn(p);
|
||||
}
|
||||
hipError_t hipMemcpy3DPeerAsync(hipMemcpy3DPeerParms *p, hipStream_t stream) {
|
||||
hipError_t hipMemcpy3DPeerAsync(hipMemcpy3DPeerParms* p, hipStream_t stream) {
|
||||
return hip::GetHipDispatchTable()->hipMemcpy3DPeerAsync_fn(p, stream);
|
||||
}
|
||||
@@ -27,27 +27,29 @@ const HipToolsDispatchTable* GetHipToolsDispatchTable();
|
||||
} // namespace hip
|
||||
|
||||
#ifdef _WIN32
|
||||
# define DllExport extern "C" __declspec(dllexport)
|
||||
#else // !_WIN32
|
||||
# define DllExport extern "C"
|
||||
#endif // !_WIN32
|
||||
#define DllExport extern "C" __declspec(dllexport)
|
||||
#else // !_WIN32
|
||||
#define DllExport extern "C"
|
||||
#endif // !_WIN32
|
||||
|
||||
DllExport hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
|
||||
uint32_t localWorkSizeX, uint32_t localWorkSizeY,
|
||||
uint32_t localWorkSizeZ, size_t sharedMemBytes,
|
||||
hipStream_t hStream, void** kernelParams, void** extra,
|
||||
hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags) {
|
||||
uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
|
||||
uint32_t localWorkSizeX, uint32_t localWorkSizeY,
|
||||
uint32_t localWorkSizeZ, size_t sharedMemBytes,
|
||||
hipStream_t hStream, void** kernelParams,
|
||||
void** extra, hipEvent_t startEvent,
|
||||
hipEvent_t stopEvent, uint32_t flags) {
|
||||
return hip::GetHipDispatchTable()->hipExtModuleLaunchKernel_fn(
|
||||
f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY,
|
||||
localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, flags);
|
||||
}
|
||||
DllExport hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
|
||||
uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
|
||||
uint32_t localWorkSizeX, uint32_t localWorkSizeY,
|
||||
uint32_t localWorkSizeZ, size_t sharedMemBytes,
|
||||
hipStream_t hStream, void** kernelParams, void** extra,
|
||||
hipEvent_t startEvent, hipEvent_t stopEvent) {
|
||||
uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
|
||||
uint32_t localWorkSizeX, uint32_t localWorkSizeY,
|
||||
uint32_t localWorkSizeZ, size_t sharedMemBytes,
|
||||
hipStream_t hStream, void** kernelParams,
|
||||
void** extra, hipEvent_t startEvent,
|
||||
hipEvent_t stopEvent) {
|
||||
return hip::GetHipDispatchTable()->hipHccModuleLaunchKernel_fn(
|
||||
f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY,
|
||||
localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent);
|
||||
|
||||
Το diff αρχείου καταστέλλεται επειδή είναι πολύ μεγάλο
Φόρτωση Διαφορών
@@ -23,14 +23,14 @@
|
||||
#include "hip_vm.hpp"
|
||||
namespace hip {
|
||||
|
||||
static_assert(static_cast<uint32_t>(hipMemAccessFlagsProtNone)
|
||||
== static_cast<uint32_t>(amd::Device::VmmAccess::kNone),
|
||||
static_assert(static_cast<uint32_t>(hipMemAccessFlagsProtNone) ==
|
||||
static_cast<uint32_t>(amd::Device::VmmAccess::kNone),
|
||||
"Mem Access Flag None mismatch with ROCclr!");
|
||||
static_assert(static_cast<uint32_t>(hipMemAccessFlagsProtRead)
|
||||
== static_cast<uint32_t>(amd::Device::VmmAccess::kReadOnly),
|
||||
static_assert(static_cast<uint32_t>(hipMemAccessFlagsProtRead) ==
|
||||
static_cast<uint32_t>(amd::Device::VmmAccess::kReadOnly),
|
||||
"Mem Access Flag Read mismatch with ROCclr!");
|
||||
static_assert(static_cast<uint32_t>(hipMemAccessFlagsProtReadWrite)
|
||||
== static_cast<uint32_t>(amd::Device::VmmAccess::kReadWrite),
|
||||
static_assert(static_cast<uint32_t>(hipMemAccessFlagsProtReadWrite) ==
|
||||
static_cast<uint32_t>(amd::Device::VmmAccess::kReadWrite),
|
||||
"Mem Access Flag Read Write mismatch with ROCclr!");
|
||||
|
||||
hipError_t hipMemAddressFree(void* devPtr, size_t size) {
|
||||
@@ -60,8 +60,8 @@ hipError_t hipMemAddressReserve(void** ptr, size_t size, size_t alignment, void*
|
||||
}
|
||||
|
||||
const auto& dev_info = g_devices[0]->devices()[0]->info();
|
||||
if (size == 0 || ((size % dev_info.virtualMemAllocGranularity_) != 0)
|
||||
|| ((alignment & (alignment - 1)) != 0)) {
|
||||
if (size == 0 || ((size % dev_info.virtualMemAllocGranularity_) != 0) ||
|
||||
((alignment & (alignment - 1)) != 0)) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
@@ -98,8 +98,8 @@ hipError_t hipMemCreate(hipMemGenericAllocationHandle_t* handle, size_t size,
|
||||
HIP_RETURN(hipErrorInvalidDevice);
|
||||
}
|
||||
|
||||
if (prop->requestedHandleTypes != hipMemHandleTypeNone
|
||||
&& prop->requestedHandleTypes != hipMemHandleTypePosixFileDescriptor) {
|
||||
if (prop->requestedHandleTypes != hipMemHandleTypeNone &&
|
||||
prop->requestedHandleTypes != hipMemHandleTypePosixFileDescriptor) {
|
||||
HIP_RETURN(hipErrorNotSupported);
|
||||
}
|
||||
|
||||
@@ -125,19 +125,21 @@ hipError_t hipMemCreate(hipMemGenericAllocationHandle_t* handle, size_t size,
|
||||
|
||||
// Handle out of memory cases,
|
||||
if (ptr == nullptr) {
|
||||
size_t free = 0, total =0;
|
||||
size_t free = 0, total = 0;
|
||||
hipError_t hip_error = hipMemGetInfo(&free, &total);
|
||||
if (hip_error == hipSuccess) {
|
||||
LogPrintfError("Allocation failed : Device memory : required :%zu | free :%zu"
|
||||
"| total :%zu", size, free, total);
|
||||
LogPrintfError(
|
||||
"Allocation failed : Device memory : required :%zu | free :%zu"
|
||||
"| total :%zu",
|
||||
size, free, total);
|
||||
}
|
||||
HIP_RETURN(hipErrorOutOfMemory);
|
||||
}
|
||||
|
||||
// Add this to amd::Memory object, so this ptr is accesible for other hipmemory operations.
|
||||
size_t offset = 0; //this is ignored
|
||||
size_t offset = 0; // this is ignored
|
||||
amd::Memory* phys_mem_obj = getMemoryObject(ptr, offset);
|
||||
//saves the current device id so that it can be accessed later
|
||||
// saves the current device id so that it can be accessed later
|
||||
phys_mem_obj->getUserData().deviceId = prop->location.id;
|
||||
phys_mem_obj->getUserData().data = new hip::GenericAllocation(*phys_mem_obj, size, *prop);
|
||||
*handle = reinterpret_cast<hipMemGenericAllocationHandle_t>(phys_mem_obj->getUserData().data);
|
||||
@@ -167,12 +169,12 @@ hipError_t hipMemExportToShareableHandle(void* shareableHandle,
|
||||
|
||||
if (ga->GetProperties().requestedHandleTypes != handleType) {
|
||||
LogPrintfError("HandleType mismatch memoryHandleType: %d, requestedHandleTypes: %d",
|
||||
ga->GetProperties().requestedHandleTypes, handleType);
|
||||
ga->GetProperties().requestedHandleTypes, handleType);
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
if (!ga->asAmdMemory().getContext().devices()[0]->ExportShareableVMMHandle(
|
||||
ga->asAmdMemory(), flags, shareableHandle)) {
|
||||
ga->asAmdMemory(), flags, shareableHandle)) {
|
||||
LogPrintfError("Exporting Handle failed with flags: %d", flags);
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
@@ -183,8 +185,8 @@ hipError_t hipMemExportToShareableHandle(void* shareableHandle,
|
||||
hipError_t hipMemGetAccess(unsigned long long* flags, const hipMemLocation* location, void* ptr) {
|
||||
HIP_INIT_API(hipMemGetAccess, flags, location, ptr);
|
||||
|
||||
if (flags == nullptr || location == nullptr || ptr == nullptr
|
||||
|| location->type != hipMemLocationTypeDevice || location->id >= g_devices.size()) {
|
||||
if (flags == nullptr || location == nullptr || ptr == nullptr ||
|
||||
location->type != hipMemLocationTypeDevice || location->id >= g_devices.size()) {
|
||||
HIP_RETURN(hipErrorInvalidValue)
|
||||
}
|
||||
|
||||
@@ -219,7 +221,8 @@ hipError_t hipMemGetAllocationGranularity(size_t* granularity, const hipMemAlloc
|
||||
HIP_RETURN(hipSuccess);
|
||||
}
|
||||
|
||||
hipError_t hipMemGetAllocationPropertiesFromHandle(hipMemAllocationProp* prop, hipMemGenericAllocationHandle_t handle) {
|
||||
hipError_t hipMemGetAllocationPropertiesFromHandle(hipMemAllocationProp* prop,
|
||||
hipMemGenericAllocationHandle_t handle) {
|
||||
HIP_INIT_API(hipMemGetAllocationPropertiesFromHandle, prop, handle);
|
||||
|
||||
if (handle == nullptr || prop == nullptr) {
|
||||
@@ -247,7 +250,7 @@ hipError_t hipMemImportFromShareableHandle(hipMemGenericAllocationHandle_t* hand
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
hipMemAllocationProp prop {};
|
||||
hipMemAllocationProp prop{};
|
||||
prop.type = hipMemAllocationTypePinned;
|
||||
prop.location.type = hipMemLocationTypeDevice;
|
||||
prop.location.id = hip::getCurrentDevice()->deviceId();
|
||||
@@ -287,7 +290,8 @@ hipError_t hipMemMap(void* ptr, size_t size, size_t offset, hipMemGenericAllocat
|
||||
HIP_RETURN(hipSuccess);
|
||||
}
|
||||
|
||||
hipError_t hipMemMapArrayAsync(hipArrayMapInfo* mapInfoList, unsigned int count, hipStream_t stream) {
|
||||
hipError_t hipMemMapArrayAsync(hipArrayMapInfo* mapInfoList, unsigned int count,
|
||||
hipStream_t stream) {
|
||||
HIP_INIT_API(hipMemMapArrayAsync, mapInfoList, count, stream);
|
||||
|
||||
if (mapInfoList == nullptr || count == 0) {
|
||||
@@ -325,7 +329,7 @@ hipError_t hipMemRetainAllocationHandle(hipMemGenericAllocationHandle_t* handle,
|
||||
}
|
||||
|
||||
*handle = reinterpret_cast<hipMemGenericAllocationHandle_t>(
|
||||
mem->getUserData().phys_mem_obj->getUserData().data);
|
||||
mem->getUserData().phys_mem_obj->getUserData().data);
|
||||
|
||||
if (*handle == nullptr) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
|
||||
@@ -32,12 +32,12 @@ hipError_t ihipFree(void* ptr);
|
||||
|
||||
class GenericAllocation : public amd::RuntimeObject {
|
||||
amd::Memory& phys_mem_ref_; //<! Physical memory object
|
||||
size_t size_; //<! Allocated size
|
||||
hipMemAllocationProp properties_; //<! Allocation Properties
|
||||
size_t size_; //<! Allocated size
|
||||
hipMemAllocationProp properties_; //<! Allocation Properties
|
||||
|
||||
public:
|
||||
public:
|
||||
GenericAllocation(amd::Memory& phys_mem_ref, size_t size, const hipMemAllocationProp& prop)
|
||||
: phys_mem_ref_(phys_mem_ref), size_(size), properties_(prop) {}
|
||||
: phys_mem_ref_(phys_mem_ref), size_(size), properties_(prop) {}
|
||||
~GenericAllocation() {
|
||||
amd::Context* amdContext = g_devices[properties_.location.id]->asContext();
|
||||
amd::SvmBuffer::free(*amdContext, phys_mem_ref_.getSvmPtr());
|
||||
@@ -47,12 +47,10 @@ public:
|
||||
hipMemGenericAllocationHandle_t asMemGenericAllocationHandle() {
|
||||
return reinterpret_cast<hipMemGenericAllocationHandle_t>(this);
|
||||
}
|
||||
amd::Memory& asAmdMemory() {
|
||||
return phys_mem_ref_;
|
||||
}
|
||||
amd::Memory& asAmdMemory() { return phys_mem_ref_; }
|
||||
|
||||
virtual ObjectType objectType() const { return ObjectTypeVMMAlloc; }
|
||||
};
|
||||
};
|
||||
}; // namespace hip
|
||||
|
||||
#endif //HIP_SRC_HIP_VM_H
|
||||
#endif // HIP_SRC_HIP_VM_H
|
||||
|
||||
@@ -309,8 +309,7 @@ hiprtcResult hiprtcLinkCreate(unsigned int num_options, hiprtcJIT_option* option
|
||||
|
||||
std::string name("LinkerProgram");
|
||||
hip::LinkProgram* rtc_link_prog_ptr = new hip::LinkProgram(name);
|
||||
if (!rtc_link_prog_ptr->AddLinkerOptions(num_options, options_ptr,
|
||||
options_vals_pptr)) {
|
||||
if (!rtc_link_prog_ptr->AddLinkerOptions(num_options, options_ptr, options_vals_pptr)) {
|
||||
HIPRTC_RETURN(HIPRTC_ERROR_INVALID_OPTION);
|
||||
}
|
||||
|
||||
@@ -331,12 +330,11 @@ hiprtcResult hiprtcLinkAddFile(hiprtcLinkState hip_link_state, hiprtcJITInputTyp
|
||||
if (input_type == HIPRTC_JIT_INPUT_CUBIN || input_type == HIPRTC_JIT_INPUT_PTX ||
|
||||
input_type == HIPRTC_JIT_INPUT_FATBINARY || input_type == HIPRTC_JIT_INPUT_OBJECT ||
|
||||
input_type == HIPRTC_JIT_INPUT_LIBRARY || input_type == HIPRTC_JIT_INPUT_NVVM ||
|
||||
input_type == HIPRTC_JIT_INPUT_SPIRV ) {
|
||||
input_type == HIPRTC_JIT_INPUT_SPIRV) {
|
||||
HIPRTC_RETURN(HIPRTC_ERROR_INVALID_INPUT);
|
||||
}
|
||||
|
||||
hip::LinkProgram* rtc_link_prog_ptr =
|
||||
reinterpret_cast<hip::LinkProgram*>(hip_link_state);
|
||||
hip::LinkProgram* rtc_link_prog_ptr = reinterpret_cast<hip::LinkProgram*>(hip_link_state);
|
||||
|
||||
if (!hip::LinkProgram::isLinkerValid(rtc_link_prog_ptr)) {
|
||||
HIPRTC_RETURN(HIPRTC_ERROR_INVALID_INPUT);
|
||||
@@ -361,7 +359,7 @@ hiprtcResult hiprtcLinkAddData(hiprtcLinkState hip_link_state, hiprtcJITInputTyp
|
||||
|
||||
if (input_type == HIPRTC_JIT_INPUT_CUBIN || input_type == HIPRTC_JIT_INPUT_PTX ||
|
||||
input_type == HIPRTC_JIT_INPUT_FATBINARY || input_type == HIPRTC_JIT_INPUT_OBJECT ||
|
||||
input_type == HIPRTC_JIT_INPUT_LIBRARY || input_type == HIPRTC_JIT_INPUT_NVVM ||
|
||||
input_type == HIPRTC_JIT_INPUT_LIBRARY || input_type == HIPRTC_JIT_INPUT_NVVM ||
|
||||
input_type == HIPRTC_JIT_INPUT_SPIRV) {
|
||||
HIPRTC_RETURN(HIPRTC_ERROR_INVALID_INPUT);
|
||||
}
|
||||
@@ -371,8 +369,7 @@ hiprtcResult hiprtcLinkAddData(hiprtcLinkState hip_link_state, hiprtcJITInputTyp
|
||||
input_name = name;
|
||||
}
|
||||
|
||||
hip::LinkProgram* rtc_link_prog_ptr =
|
||||
reinterpret_cast<hip::LinkProgram*>(hip_link_state);
|
||||
hip::LinkProgram* rtc_link_prog_ptr = reinterpret_cast<hip::LinkProgram*>(hip_link_state);
|
||||
|
||||
if (!hip::LinkProgram::isLinkerValid(rtc_link_prog_ptr)) {
|
||||
HIPRTC_RETURN(HIPRTC_ERROR_INVALID_INPUT);
|
||||
@@ -392,8 +389,7 @@ hiprtcResult hiprtcLinkComplete(hiprtcLinkState hip_link_state, void** bin_out,
|
||||
HIPRTC_RETURN(HIPRTC_ERROR_INVALID_INPUT);
|
||||
}
|
||||
|
||||
hip::LinkProgram* rtc_link_prog_ptr =
|
||||
reinterpret_cast<hip::LinkProgram*>(hip_link_state);
|
||||
hip::LinkProgram* rtc_link_prog_ptr = reinterpret_cast<hip::LinkProgram*>(hip_link_state);
|
||||
|
||||
if (!hip::LinkProgram::isLinkerValid(rtc_link_prog_ptr)) {
|
||||
HIPRTC_RETURN(HIPRTC_ERROR_INVALID_INPUT);
|
||||
@@ -409,8 +405,7 @@ hiprtcResult hiprtcLinkComplete(hiprtcLinkState hip_link_state, void** bin_out,
|
||||
hiprtcResult hiprtcLinkDestroy(hiprtcLinkState hip_link_state) {
|
||||
HIPRTC_INIT_API(hip_link_state);
|
||||
|
||||
hip::LinkProgram* rtc_link_prog_ptr =
|
||||
reinterpret_cast<hip::LinkProgram*>(hip_link_state);
|
||||
hip::LinkProgram* rtc_link_prog_ptr = reinterpret_cast<hip::LinkProgram*>(hip_link_state);
|
||||
|
||||
if (!hip::LinkProgram::isLinkerValid(rtc_link_prog_ptr)) {
|
||||
HIPRTC_RETURN(HIPRTC_ERROR_INVALID_INPUT);
|
||||
|
||||
@@ -90,7 +90,8 @@ bool RTCCompileProgram::addSource(const std::string& source, const std::string&
|
||||
// objects
|
||||
bool RTCCompileProgram::addSource_impl() {
|
||||
std::vector<char> vsource(source_code_.begin(), source_code_.end());
|
||||
if (!hip::helpers::addCodeObjData(compile_input_, vsource, source_name_, AMD_COMGR_DATA_KIND_SOURCE)) {
|
||||
if (!hip::helpers::addCodeObjData(compile_input_, vsource, source_name_,
|
||||
AMD_COMGR_DATA_KIND_SOURCE)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
@@ -118,7 +119,7 @@ bool RTCCompileProgram::addBuiltinHeader() {
|
||||
}
|
||||
|
||||
bool RTCCompileProgram::findExeOptions(const std::vector<std::string>& options,
|
||||
std::vector<std::string>& exe_options) {
|
||||
std::vector<std::string>& exe_options) {
|
||||
for (size_t i = 0; i < options.size(); ++i) {
|
||||
// -mllvm options passed by the app such as "-mllvm" "-amdgpu-early-inline-all=true"
|
||||
if (options[i] == "-mllvm") {
|
||||
@@ -201,14 +202,15 @@ bool RTCCompileProgram::compile(const std::vector<std::string>& options, bool fg
|
||||
}
|
||||
|
||||
if (fgpu_rdc_) {
|
||||
if (!hip::helpers::compileToBitCode(compile_input_, isa_, compileOpts, build_log_, LLVMBitcode_)) {
|
||||
if (!hip::helpers::compileToBitCode(compile_input_, isa_, compileOpts, build_log_,
|
||||
LLVMBitcode_)) {
|
||||
LogError("Error in hiprtc: unable to compile source to bitcode");
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
LogInfo("Using the new path of comgr");
|
||||
if (!hip::helpers::compileToExecutable(compile_input_, isa_, compileOpts, link_options_, build_log_,
|
||||
executable_)) {
|
||||
if (!hip::helpers::compileToExecutable(compile_input_, isa_, compileOpts, link_options_,
|
||||
build_log_, executable_)) {
|
||||
LogError("Failing to compile to realloc");
|
||||
return false;
|
||||
}
|
||||
@@ -234,7 +236,6 @@ void RTCCompileProgram::stripNamedExpression(std::string& strippedName) {
|
||||
if (strippedName.front() == '&') {
|
||||
strippedName.erase(0, 1);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
bool RTCCompileProgram::trackMangledName(std::string& name) {
|
||||
@@ -249,8 +250,10 @@ bool RTCCompileProgram::trackMangledName(std::string& name) {
|
||||
|
||||
std::string gcn_expr = "__amdgcn_name_expr_";
|
||||
std::string size = std::to_string(mangled_names_.size());
|
||||
const auto var1{"\n static __device__ const void* " + gcn_expr + size + "[]= {\"" + strippedName + "\", (void*)&" + strippedName + "};"};
|
||||
const auto var2{"\n static auto __amdgcn_name_expr_stub_" + size + " = " + gcn_expr + size + ";\n"};
|
||||
const auto var1{"\n static __device__ const void* " + gcn_expr + size + "[]= {\"" + strippedName +
|
||||
"\", (void*)&" + strippedName + "};"};
|
||||
const auto var2{"\n static auto __amdgcn_name_expr_stub_" + size + " = " + gcn_expr + size +
|
||||
";\n"};
|
||||
const auto code{var1 + var2};
|
||||
|
||||
source_code_ += code;
|
||||
|
||||
@@ -37,112 +37,108 @@
|
||||
|
||||
// Handy macro to convert an enumeration to a stringified version of same:
|
||||
#define CASE_STR(x) \
|
||||
case x: \
|
||||
return #x;
|
||||
case x: \
|
||||
return #x;
|
||||
|
||||
inline const char* ihipErrorString(hipError_t hip_error) {
|
||||
switch (hip_error) {
|
||||
CASE_STR(hipSuccess);
|
||||
CASE_STR(hipErrorOutOfMemory);
|
||||
CASE_STR(hipErrorNotInitialized);
|
||||
CASE_STR(hipErrorDeinitialized);
|
||||
CASE_STR(hipErrorProfilerDisabled);
|
||||
CASE_STR(hipErrorProfilerNotInitialized);
|
||||
CASE_STR(hipErrorProfilerAlreadyStarted);
|
||||
CASE_STR(hipErrorProfilerAlreadyStopped);
|
||||
CASE_STR(hipErrorInvalidImage);
|
||||
CASE_STR(hipErrorInvalidContext);
|
||||
CASE_STR(hipErrorContextAlreadyCurrent);
|
||||
CASE_STR(hipErrorMapFailed);
|
||||
CASE_STR(hipErrorUnmapFailed);
|
||||
CASE_STR(hipErrorArrayIsMapped);
|
||||
CASE_STR(hipErrorAlreadyMapped);
|
||||
CASE_STR(hipErrorNoBinaryForGpu);
|
||||
CASE_STR(hipErrorAlreadyAcquired);
|
||||
CASE_STR(hipErrorNotMapped);
|
||||
CASE_STR(hipErrorNotMappedAsArray);
|
||||
CASE_STR(hipErrorNotMappedAsPointer);
|
||||
CASE_STR(hipErrorECCNotCorrectable);
|
||||
CASE_STR(hipErrorUnsupportedLimit);
|
||||
CASE_STR(hipErrorContextAlreadyInUse);
|
||||
CASE_STR(hipErrorPeerAccessUnsupported);
|
||||
CASE_STR(hipErrorInvalidKernelFile);
|
||||
CASE_STR(hipErrorInvalidGraphicsContext);
|
||||
CASE_STR(hipErrorInvalidSource);
|
||||
CASE_STR(hipErrorFileNotFound);
|
||||
CASE_STR(hipErrorSharedObjectSymbolNotFound);
|
||||
CASE_STR(hipErrorSharedObjectInitFailed);
|
||||
CASE_STR(hipErrorOperatingSystem);
|
||||
CASE_STR(hipErrorSetOnActiveProcess);
|
||||
CASE_STR(hipErrorInvalidHandle);
|
||||
CASE_STR(hipErrorNotFound);
|
||||
CASE_STR(hipErrorIllegalAddress);
|
||||
CASE_STR(hipErrorMissingConfiguration);
|
||||
CASE_STR(hipErrorLaunchFailure);
|
||||
CASE_STR(hipErrorPriorLaunchFailure);
|
||||
CASE_STR(hipErrorLaunchTimeOut);
|
||||
CASE_STR(hipErrorLaunchOutOfResources);
|
||||
CASE_STR(hipErrorInvalidDeviceFunction);
|
||||
CASE_STR(hipErrorInvalidConfiguration);
|
||||
CASE_STR(hipErrorInvalidDevice);
|
||||
CASE_STR(hipErrorInvalidValue);
|
||||
CASE_STR(hipErrorInvalidPitchValue);
|
||||
CASE_STR(hipErrorInvalidDevicePointer);
|
||||
CASE_STR(hipErrorInvalidMemcpyDirection);
|
||||
CASE_STR(hipErrorUnknown);
|
||||
CASE_STR(hipErrorNotReady);
|
||||
CASE_STR(hipErrorNoDevice);
|
||||
CASE_STR(hipErrorPeerAccessAlreadyEnabled);
|
||||
CASE_STR(hipErrorPeerAccessNotEnabled);
|
||||
CASE_STR(hipErrorRuntimeMemory);
|
||||
CASE_STR(hipErrorRuntimeOther);
|
||||
CASE_STR(hipErrorHostMemoryAlreadyRegistered);
|
||||
CASE_STR(hipErrorHostMemoryNotRegistered);
|
||||
CASE_STR(hipErrorTbd);
|
||||
default:
|
||||
return "hipErrorUnknown";
|
||||
};
|
||||
switch (hip_error) {
|
||||
CASE_STR(hipSuccess);
|
||||
CASE_STR(hipErrorOutOfMemory);
|
||||
CASE_STR(hipErrorNotInitialized);
|
||||
CASE_STR(hipErrorDeinitialized);
|
||||
CASE_STR(hipErrorProfilerDisabled);
|
||||
CASE_STR(hipErrorProfilerNotInitialized);
|
||||
CASE_STR(hipErrorProfilerAlreadyStarted);
|
||||
CASE_STR(hipErrorProfilerAlreadyStopped);
|
||||
CASE_STR(hipErrorInvalidImage);
|
||||
CASE_STR(hipErrorInvalidContext);
|
||||
CASE_STR(hipErrorContextAlreadyCurrent);
|
||||
CASE_STR(hipErrorMapFailed);
|
||||
CASE_STR(hipErrorUnmapFailed);
|
||||
CASE_STR(hipErrorArrayIsMapped);
|
||||
CASE_STR(hipErrorAlreadyMapped);
|
||||
CASE_STR(hipErrorNoBinaryForGpu);
|
||||
CASE_STR(hipErrorAlreadyAcquired);
|
||||
CASE_STR(hipErrorNotMapped);
|
||||
CASE_STR(hipErrorNotMappedAsArray);
|
||||
CASE_STR(hipErrorNotMappedAsPointer);
|
||||
CASE_STR(hipErrorECCNotCorrectable);
|
||||
CASE_STR(hipErrorUnsupportedLimit);
|
||||
CASE_STR(hipErrorContextAlreadyInUse);
|
||||
CASE_STR(hipErrorPeerAccessUnsupported);
|
||||
CASE_STR(hipErrorInvalidKernelFile);
|
||||
CASE_STR(hipErrorInvalidGraphicsContext);
|
||||
CASE_STR(hipErrorInvalidSource);
|
||||
CASE_STR(hipErrorFileNotFound);
|
||||
CASE_STR(hipErrorSharedObjectSymbolNotFound);
|
||||
CASE_STR(hipErrorSharedObjectInitFailed);
|
||||
CASE_STR(hipErrorOperatingSystem);
|
||||
CASE_STR(hipErrorSetOnActiveProcess);
|
||||
CASE_STR(hipErrorInvalidHandle);
|
||||
CASE_STR(hipErrorNotFound);
|
||||
CASE_STR(hipErrorIllegalAddress);
|
||||
CASE_STR(hipErrorMissingConfiguration);
|
||||
CASE_STR(hipErrorLaunchFailure);
|
||||
CASE_STR(hipErrorPriorLaunchFailure);
|
||||
CASE_STR(hipErrorLaunchTimeOut);
|
||||
CASE_STR(hipErrorLaunchOutOfResources);
|
||||
CASE_STR(hipErrorInvalidDeviceFunction);
|
||||
CASE_STR(hipErrorInvalidConfiguration);
|
||||
CASE_STR(hipErrorInvalidDevice);
|
||||
CASE_STR(hipErrorInvalidValue);
|
||||
CASE_STR(hipErrorInvalidPitchValue);
|
||||
CASE_STR(hipErrorInvalidDevicePointer);
|
||||
CASE_STR(hipErrorInvalidMemcpyDirection);
|
||||
CASE_STR(hipErrorUnknown);
|
||||
CASE_STR(hipErrorNotReady);
|
||||
CASE_STR(hipErrorNoDevice);
|
||||
CASE_STR(hipErrorPeerAccessAlreadyEnabled);
|
||||
CASE_STR(hipErrorPeerAccessNotEnabled);
|
||||
CASE_STR(hipErrorRuntimeMemory);
|
||||
CASE_STR(hipErrorRuntimeOther);
|
||||
CASE_STR(hipErrorHostMemoryAlreadyRegistered);
|
||||
CASE_STR(hipErrorHostMemoryNotRegistered);
|
||||
CASE_STR(hipErrorTbd);
|
||||
default:
|
||||
return "hipErrorUnknown";
|
||||
};
|
||||
};
|
||||
|
||||
// Building block functions:
|
||||
template <typename T>
|
||||
inline std::string ToHexString(T v) {
|
||||
std::ostringstream ss;
|
||||
ss << "0x" << std::hex << v;
|
||||
return ss.str();
|
||||
template <typename T> inline std::string ToHexString(T v) {
|
||||
std::ostringstream ss;
|
||||
ss << "0x" << std::hex << v;
|
||||
return ss.str();
|
||||
};
|
||||
|
||||
//---
|
||||
// Template overloads for ToString to handle specific types
|
||||
|
||||
template <typename T>
|
||||
inline std::string ToString(T* v) {
|
||||
std::ostringstream ss;
|
||||
if (v == NULL) {
|
||||
ss << "char array:<null>";
|
||||
} else {
|
||||
ss << v;
|
||||
}
|
||||
return ss.str();
|
||||
template <typename T> inline std::string ToString(T* v) {
|
||||
std::ostringstream ss;
|
||||
if (v == NULL) {
|
||||
ss << "char array:<null>";
|
||||
} else {
|
||||
ss << v;
|
||||
}
|
||||
return ss.str();
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
inline std::string ToString(T** v) {
|
||||
std::ostringstream ss;
|
||||
if (v == NULL) {
|
||||
ss << "char array:<null>";
|
||||
} else {
|
||||
ss << v;
|
||||
}
|
||||
return ss.str();
|
||||
template <typename T> inline std::string ToString(T** v) {
|
||||
std::ostringstream ss;
|
||||
if (v == NULL) {
|
||||
ss << "char array:<null>";
|
||||
} else {
|
||||
ss << v;
|
||||
}
|
||||
return ss.str();
|
||||
};
|
||||
|
||||
// This is the default which works for most types:
|
||||
template <typename T>
|
||||
inline std::string ToString(T v) {
|
||||
std::ostringstream ss;
|
||||
ss << v;
|
||||
return ss.str();
|
||||
template <typename T> inline std::string ToString(T v) {
|
||||
std::ostringstream ss;
|
||||
ss << v;
|
||||
return ss.str();
|
||||
};
|
||||
|
||||
// Catch empty arguments case
|
||||
@@ -151,33 +147,32 @@ inline std::string ToString() { return (""); }
|
||||
//---
|
||||
// C++11 variadic template - peels off first argument, converts to string, and calls itself again to
|
||||
// peel the next arg. Strings are automatically separated by comma+space.
|
||||
template <typename T, typename... Args>
|
||||
inline std::string ToString(T first, Args... args) {
|
||||
return ToString(first) + ", " + ToString(args...);
|
||||
template <typename T, typename... Args> inline std::string ToString(T first, Args... args) {
|
||||
return ToString(first) + ", " + ToString(args...);
|
||||
}
|
||||
|
||||
inline hipError_t ConvertCLErrorIntoHIPError(cl_int cl_error) {
|
||||
hipError_t hip_error = hipSuccess;
|
||||
switch (cl_error) {
|
||||
case CL_INVALID_OPERATION :
|
||||
case CL_INVALID_OPERATION:
|
||||
hip_error = hipErrorLaunchFailure;
|
||||
break;
|
||||
case CL_MEM_OBJECT_ALLOCATION_FAILURE :
|
||||
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
|
||||
hip_error = hipErrorIllegalAddress;
|
||||
break;
|
||||
case CL_INVALID_PROGRAM :
|
||||
case CL_INVALID_PROGRAM:
|
||||
hip_error = hipErrorInvalidSource;
|
||||
break;
|
||||
case CL_INVALID_ARG_VALUE :
|
||||
case CL_INVALID_ARG_VALUE:
|
||||
hip_error = hipErrorInvalidValue;
|
||||
break;
|
||||
case CL_INVALID_KERNEL :
|
||||
case CL_INVALID_KERNEL:
|
||||
hip_error = hipErrorInvalidKernelFile;
|
||||
break;
|
||||
case CL_BUILD_PROGRAM_FAILURE :
|
||||
case CL_BUILD_PROGRAM_FAILURE:
|
||||
hip_error = hipErrorLaunchFailure;
|
||||
break;
|
||||
case CL_INVALID_MEM_OBJECT :
|
||||
case CL_INVALID_MEM_OBJECT:
|
||||
hip_error = hipErrorIllegalAddress;
|
||||
break;
|
||||
case CL_DEVICE_NOT_AVAILABLE:
|
||||
|
||||
@@ -141,8 +141,8 @@ RUNTIME_ENTRY_RET(cl_command_queue, clCreateCommandQueueWithProperties,
|
||||
}
|
||||
|
||||
if ((queueRTCUs != amd::CommandQueue::RealTimeDisabled) &&
|
||||
((queueRTCUs > amdDevice.info().numRTCUs_) || (queueRTCUs == 0)
|
||||
|| (queueRTCUs < amdDevice.info().granularityRTCUs_))) {
|
||||
((queueRTCUs > amdDevice.info().numRTCUs_) || (queueRTCUs == 0) ||
|
||||
(queueRTCUs < amdDevice.info().granularityRTCUs_))) {
|
||||
*not_null(errcode_ret) = CL_INVALID_VALUE;
|
||||
return (cl_command_queue)0;
|
||||
}
|
||||
@@ -231,7 +231,7 @@ RUNTIME_ENTRY(cl_int, clSetDefaultDeviceCommandQueue,
|
||||
|
||||
amd::DeviceQueue* deviceQueue = as_amd(command_queue)->asDeviceQueue();
|
||||
if ((deviceQueue == NULL) || (amdContext != &deviceQueue->context()) ||
|
||||
(amdDevice != &deviceQueue->device())) {
|
||||
(amdDevice != &deviceQueue->device())) {
|
||||
return CL_INVALID_COMMAND_QUEUE;
|
||||
}
|
||||
|
||||
|
||||
@@ -32,129 +32,108 @@
|
||||
#include "vdi_common.hpp"
|
||||
|
||||
//! Helper function to check "properties" parameter in various functions
|
||||
int checkContextProperties(
|
||||
const cl_context_properties *properties,
|
||||
bool* offlineDevices);
|
||||
int checkContextProperties(const cl_context_properties* properties, bool* offlineDevices);
|
||||
|
||||
namespace amd {
|
||||
|
||||
template <typename T>
|
||||
static inline cl_int
|
||||
clGetInfo(
|
||||
T& field,
|
||||
size_t param_value_size,
|
||||
void* param_value,
|
||||
size_t* param_value_size_ret)
|
||||
{
|
||||
const void *valuePtr;
|
||||
size_t valueSize;
|
||||
static inline cl_int clGetInfo(T& field, size_t param_value_size, void* param_value,
|
||||
size_t* param_value_size_ret) {
|
||||
const void* valuePtr;
|
||||
size_t valueSize;
|
||||
|
||||
std::tie(valuePtr, valueSize)
|
||||
= detail::ParamInfo<typename std::remove_const<T>::type>::get(field);
|
||||
std::tie(valuePtr, valueSize) =
|
||||
detail::ParamInfo<typename std::remove_const<T>::type>::get(field);
|
||||
|
||||
*not_null(param_value_size_ret) = valueSize;
|
||||
*not_null(param_value_size_ret) = valueSize;
|
||||
|
||||
cl_int ret = CL_SUCCESS;
|
||||
if (param_value != NULL && param_value_size < valueSize) {
|
||||
if ((param_value_size == 0) || !std::is_pointer<T>() || !std::is_same<typename
|
||||
std::remove_const<typename std::remove_pointer<T>::type>::type, char>()) {
|
||||
return CL_INVALID_VALUE;
|
||||
}
|
||||
// For char* and char[] params, we will at least fill up to
|
||||
// param_value_size, then return an error.
|
||||
valueSize = param_value_size;
|
||||
static_cast<char*>(param_value)[--valueSize] = '\0';
|
||||
ret = CL_INVALID_VALUE;
|
||||
cl_int ret = CL_SUCCESS;
|
||||
if (param_value != NULL && param_value_size < valueSize) {
|
||||
if ((param_value_size == 0) || !std::is_pointer<T>() ||
|
||||
!std::is_same<typename std::remove_const<typename std::remove_pointer<T>::type>::type,
|
||||
char>()) {
|
||||
return CL_INVALID_VALUE;
|
||||
}
|
||||
// For char* and char[] params, we will at least fill up to
|
||||
// param_value_size, then return an error.
|
||||
valueSize = param_value_size;
|
||||
static_cast<char*>(param_value)[--valueSize] = '\0';
|
||||
ret = CL_INVALID_VALUE;
|
||||
}
|
||||
|
||||
if (param_value != NULL) {
|
||||
::memcpy(param_value, valuePtr, valueSize);
|
||||
if (param_value_size > valueSize) {
|
||||
::memset(static_cast<address>(param_value) + valueSize,
|
||||
'\0', param_value_size - valueSize);
|
||||
}
|
||||
if (param_value != NULL) {
|
||||
::memcpy(param_value, valuePtr, valueSize);
|
||||
if (param_value_size > valueSize) {
|
||||
::memset(static_cast<address>(param_value) + valueSize, '\0', param_value_size - valueSize);
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline cl_int
|
||||
clSetEventWaitList(
|
||||
Command::EventWaitList& eventWaitList,
|
||||
const amd::HostQueue& hostQueue,
|
||||
cl_uint num_events_in_wait_list,
|
||||
const cl_event* event_wait_list)
|
||||
{
|
||||
if ((num_events_in_wait_list == 0 && event_wait_list != NULL)
|
||||
|| (num_events_in_wait_list != 0 && event_wait_list == NULL)) {
|
||||
return CL_INVALID_EVENT_WAIT_LIST;
|
||||
}
|
||||
static inline cl_int clSetEventWaitList(Command::EventWaitList& eventWaitList,
|
||||
const amd::HostQueue& hostQueue,
|
||||
cl_uint num_events_in_wait_list,
|
||||
const cl_event* event_wait_list) {
|
||||
if ((num_events_in_wait_list == 0 && event_wait_list != NULL) ||
|
||||
(num_events_in_wait_list != 0 && event_wait_list == NULL)) {
|
||||
return CL_INVALID_EVENT_WAIT_LIST;
|
||||
}
|
||||
|
||||
while (num_events_in_wait_list-- > 0) {
|
||||
cl_event event = *event_wait_list++;
|
||||
Event* amdEvent = as_amd(event);
|
||||
if (!is_valid(event)) {
|
||||
return CL_INVALID_EVENT_WAIT_LIST;
|
||||
}
|
||||
if (&hostQueue.context() != &amdEvent->context()) {
|
||||
return CL_INVALID_CONTEXT;
|
||||
}
|
||||
if ((amdEvent->command().queue() != &hostQueue) && !amdEvent->notifyCmdQueue()) {
|
||||
return CL_INVALID_EVENT_WAIT_LIST;
|
||||
}
|
||||
eventWaitList.push_back(amdEvent);
|
||||
while (num_events_in_wait_list-- > 0) {
|
||||
cl_event event = *event_wait_list++;
|
||||
Event* amdEvent = as_amd(event);
|
||||
if (!is_valid(event)) {
|
||||
return CL_INVALID_EVENT_WAIT_LIST;
|
||||
}
|
||||
return CL_SUCCESS;
|
||||
if (&hostQueue.context() != &amdEvent->context()) {
|
||||
return CL_INVALID_CONTEXT;
|
||||
}
|
||||
if ((amdEvent->command().queue() != &hostQueue) && !amdEvent->notifyCmdQueue()) {
|
||||
return CL_INVALID_EVENT_WAIT_LIST;
|
||||
}
|
||||
eventWaitList.push_back(amdEvent);
|
||||
}
|
||||
return CL_SUCCESS;
|
||||
}
|
||||
|
||||
//! Common function declarations for CL-external graphics API interop
|
||||
cl_int clEnqueueAcquireExtObjectsAMD(cl_command_queue command_queue,
|
||||
cl_uint num_objects, const cl_mem* mem_objects,
|
||||
cl_uint num_events_in_wait_list, const cl_event* event_wait_list,
|
||||
cl_event* event, cl_command_type cmd_type);
|
||||
cl_int clEnqueueReleaseExtObjectsAMD(cl_command_queue command_queue,
|
||||
cl_uint num_objects, const cl_mem* mem_objects,
|
||||
cl_uint num_events_in_wait_list, const cl_event* event_wait_list,
|
||||
cl_event* event, cl_command_type cmd_type);
|
||||
cl_int clEnqueueAcquireExtObjectsAMD(cl_command_queue command_queue, cl_uint num_objects,
|
||||
const cl_mem* mem_objects, cl_uint num_events_in_wait_list,
|
||||
const cl_event* event_wait_list, cl_event* event,
|
||||
cl_command_type cmd_type);
|
||||
cl_int clEnqueueReleaseExtObjectsAMD(cl_command_queue command_queue, cl_uint num_objects,
|
||||
const cl_mem* mem_objects, cl_uint num_events_in_wait_list,
|
||||
const cl_event* event_wait_list, cl_event* event,
|
||||
cl_command_type cmd_type);
|
||||
static inline cl_int clDXTranslateErrorCode(cl_int err) {
|
||||
return err == CL_INVALID_GL_OBJECT ? CL_INVALID_MEM_OBJECT : err;
|
||||
return err == CL_INVALID_GL_OBJECT ? CL_INVALID_MEM_OBJECT : err;
|
||||
}
|
||||
|
||||
} // namespace amd
|
||||
} // namespace amd
|
||||
|
||||
extern "C" {
|
||||
|
||||
#if defined(CL_VERSION_1_1)
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clSetCommandQueueProperty(
|
||||
cl_command_queue command_queue,
|
||||
cl_command_queue_properties properties,
|
||||
cl_bool enable,
|
||||
cl_command_queue_properties *old_properties) CL_API_SUFFIX__VERSION_1_0;
|
||||
#endif // CL_VERSION_1_1
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL clSetCommandQueueProperty(
|
||||
cl_command_queue command_queue, cl_command_queue_properties properties, cl_bool enable,
|
||||
cl_command_queue_properties* old_properties) CL_API_SUFFIX__VERSION_1_0;
|
||||
#endif // CL_VERSION_1_1
|
||||
|
||||
extern CL_API_ENTRY cl_mem CL_API_CALL
|
||||
clConvertImageAMD(
|
||||
cl_context context,
|
||||
cl_mem image,
|
||||
const cl_image_format * image_format,
|
||||
cl_int * errcode_ret);
|
||||
extern CL_API_ENTRY cl_mem CL_API_CALL clConvertImageAMD(cl_context context, cl_mem image,
|
||||
const cl_image_format* image_format,
|
||||
cl_int* errcode_ret);
|
||||
|
||||
extern CL_API_ENTRY cl_mem CL_API_CALL
|
||||
clCreateBufferFromImageAMD(
|
||||
cl_context context,
|
||||
cl_mem image,
|
||||
cl_int * errcode_ret);
|
||||
extern CL_API_ENTRY cl_mem CL_API_CALL clCreateBufferFromImageAMD(cl_context context, cl_mem image,
|
||||
cl_int* errcode_ret);
|
||||
|
||||
extern CL_API_ENTRY cl_program CL_API_CALL
|
||||
clCreateProgramWithAssemblyAMD(
|
||||
cl_context context,
|
||||
cl_uint count,
|
||||
const char ** strings,
|
||||
const size_t * lengths,
|
||||
cl_int * errcode_ret);
|
||||
extern CL_API_ENTRY cl_program CL_API_CALL clCreateProgramWithAssemblyAMD(cl_context context,
|
||||
cl_uint count,
|
||||
const char** strings,
|
||||
const size_t* lengths,
|
||||
cl_int* errcode_ret);
|
||||
|
||||
} // extern "C"
|
||||
} // extern "C"
|
||||
|
||||
//! \endcond
|
||||
|
||||
|
||||
@@ -484,8 +484,8 @@ CL_API_ENTRY void* CL_API_CALL clGetExtensionFunctionAddress(const char* func_na
|
||||
CL_EXTENSION_ENTRYPOINT_CHECK(clConvertImageAMD);
|
||||
CL_EXTENSION_ENTRYPOINT_CHECK(clCreateBufferFromImageAMD);
|
||||
#if defined(cl_khr_il_program) || defined(CL_VERSION_2_1)
|
||||
CL_EXTENSION_ENTRYPOINT_CHECK2(clCreateProgramWithILKHR,clCreateProgramWithIL);
|
||||
#endif // defined(cl_khr_il_program) || defined(CL_VERSION_2_1)
|
||||
CL_EXTENSION_ENTRYPOINT_CHECK2(clCreateProgramWithILKHR, clCreateProgramWithIL);
|
||||
#endif // defined(cl_khr_il_program) || defined(CL_VERSION_2_1)
|
||||
#if cl_amd_assembly_program
|
||||
CL_EXTENSION_ENTRYPOINT_CHECK(clCreateProgramWithAssemblyAMD);
|
||||
#endif // cl_amd_assembly_program
|
||||
@@ -525,8 +525,8 @@ CL_API_ENTRY void* CL_API_CALL clGetExtensionFunctionAddress(const char* func_na
|
||||
CL_EXTENSION_ENTRYPOINT_CHECK(clGetPlaneFromImageAMD);
|
||||
#endif //_WIN32
|
||||
#if defined(cl_khr_sub_groups) || defined(CL_VERSION_2_1)
|
||||
CL_EXTENSION_ENTRYPOINT_CHECK2(clGetKernelSubGroupInfoKHR,clGetKernelSubGroupInfo);
|
||||
#endif // defined(cl_khr_sub_groups) || defined(CL_VERSION_2_1)
|
||||
CL_EXTENSION_ENTRYPOINT_CHECK2(clGetKernelSubGroupInfoKHR, clGetKernelSubGroupInfo);
|
||||
#endif // defined(cl_khr_sub_groups) || defined(CL_VERSION_2_1)
|
||||
break;
|
||||
case 'I':
|
||||
CL_EXTENSION_ENTRYPOINT_CHECK(clIcdGetPlatformIDsKHR);
|
||||
|
||||
@@ -396,7 +396,6 @@ RUNTIME_ENTRY(cl_int, clEnqueueReleaseD3D10ObjectsKHR,
|
||||
RUNTIME_EXIT
|
||||
|
||||
|
||||
|
||||
/*! @}
|
||||
* \addtogroup CL-D3D10 interop helper functions
|
||||
* @{
|
||||
@@ -412,7 +411,7 @@ RUNTIME_EXIT
|
||||
// clCreateBufferFromD3D10ResourceAMD
|
||||
//
|
||||
cl_mem amd::clCreateBufferFromD3D10ResourceAMD(Context& amdContext, cl_mem_flags flags,
|
||||
ID3D10Resource* pD3DResource, int* errcode_ret) {
|
||||
ID3D10Resource* pD3DResource, int* errcode_ret) {
|
||||
// Verify pD3DResource is a buffer
|
||||
D3D10_RESOURCE_DIMENSION rType;
|
||||
pD3DResource->GetType(&rType);
|
||||
@@ -491,8 +490,8 @@ cl_mem amd::clCreateImage1DFromD3D10ResourceAMD(
|
||||
// clCreateImage2DFromD3D10ResourceAMD
|
||||
//
|
||||
cl_mem amd::clCreateImage2DFromD3D10ResourceAMD(Context& amdContext, cl_mem_flags flags,
|
||||
ID3D10Resource* pD3DResource, UINT subresource,
|
||||
int* errcode_ret) {
|
||||
ID3D10Resource* pD3DResource, UINT subresource,
|
||||
int* errcode_ret) {
|
||||
// Verify the resource is a 2D texture
|
||||
D3D10_RESOURCE_DIMENSION rType;
|
||||
pD3DResource->GetType(&rType);
|
||||
@@ -527,8 +526,8 @@ cl_mem amd::clCreateImage2DFromD3D10ResourceAMD(Context& amdContext, cl_mem_flag
|
||||
// clCreateImage2DFromD3D10ResourceAMD
|
||||
//
|
||||
cl_mem amd::clCreateImage3DFromD3D10ResourceAMD(Context& amdContext, cl_mem_flags flags,
|
||||
ID3D10Resource* pD3DResource, UINT subresource,
|
||||
int* errcode_ret) {
|
||||
ID3D10Resource* pD3DResource, UINT subresource,
|
||||
int* errcode_ret) {
|
||||
// Verify the resource is a 2D texture
|
||||
D3D10_RESOURCE_DIMENSION rType;
|
||||
pD3DResource->GetType(&rType);
|
||||
@@ -590,6 +589,4 @@ void amd::SyncD3D10Objects(std::vector<amd::Memory*>& memObjects) {
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
#endif //_WIN32
|
||||
|
||||
@@ -28,34 +28,20 @@
|
||||
|
||||
#include <utility>
|
||||
|
||||
namespace amd
|
||||
{
|
||||
namespace amd {
|
||||
|
||||
//! Functions for executing the D3D10 related stuff
|
||||
cl_mem clCreateBufferFromD3D10ResourceAMD(
|
||||
Context& amdContext,
|
||||
cl_mem_flags flags,
|
||||
ID3D10Resource* pD3DResource,
|
||||
int* errcode_ret);
|
||||
cl_mem clCreateImage1DFromD3D10ResourceAMD(
|
||||
Context& amdContext,
|
||||
cl_mem_flags flags,
|
||||
ID3D10Resource* pD3DResource,
|
||||
UINT subresource,
|
||||
int* errcode_ret);
|
||||
cl_mem clCreateImage2DFromD3D10ResourceAMD(
|
||||
Context& amdContext,
|
||||
cl_mem_flags flags,
|
||||
ID3D10Resource* pD3DResource,
|
||||
UINT subresource,
|
||||
int* errcode_ret);
|
||||
cl_mem clCreateImage3DFromD3D10ResourceAMD(
|
||||
Context& amdContext,
|
||||
cl_mem_flags flags,
|
||||
ID3D10Resource* pD3DResource,
|
||||
UINT subresource,
|
||||
int* errcode_ret);
|
||||
cl_mem clCreateBufferFromD3D10ResourceAMD(Context& amdContext, cl_mem_flags flags,
|
||||
ID3D10Resource* pD3DResource, int* errcode_ret);
|
||||
cl_mem clCreateImage1DFromD3D10ResourceAMD(Context& amdContext, cl_mem_flags flags,
|
||||
ID3D10Resource* pD3DResource, UINT subresource,
|
||||
int* errcode_ret);
|
||||
cl_mem clCreateImage2DFromD3D10ResourceAMD(Context& amdContext, cl_mem_flags flags,
|
||||
ID3D10Resource* pD3DResource, UINT subresource,
|
||||
int* errcode_ret);
|
||||
cl_mem clCreateImage3DFromD3D10ResourceAMD(Context& amdContext, cl_mem_flags flags,
|
||||
ID3D10Resource* pD3DResource, UINT subresource,
|
||||
int* errcode_ret);
|
||||
void SyncD3D10Objects(std::vector<amd::Memory*>& memObjects);
|
||||
|
||||
} //namespace amd
|
||||
|
||||
} // namespace amd
|
||||
|
||||
@@ -469,7 +469,7 @@ RUNTIME_EXIT
|
||||
// clCreateBufferFromD3D11ResourceAMD
|
||||
//
|
||||
cl_mem amd::clCreateBufferFromD3D11ResourceAMD(Context& amdContext, cl_mem_flags flags,
|
||||
ID3D11Resource* pD3DResource, int* errcode_ret) {
|
||||
ID3D11Resource* pD3DResource, int* errcode_ret) {
|
||||
// Verify pD3DResource is a buffer
|
||||
D3D11_RESOURCE_DIMENSION rType;
|
||||
pD3DResource->GetType(&rType);
|
||||
@@ -504,8 +504,8 @@ cl_mem amd::clCreateBufferFromD3D11ResourceAMD(Context& amdContext, cl_mem_flags
|
||||
// clCreateImage2DFromD3D11ResourceAMD
|
||||
//
|
||||
cl_mem amd::clCreateImage2DFromD3D11ResourceAMD(Context& amdContext, cl_mem_flags flags,
|
||||
ID3D11Resource* pD3DResource, UINT subresource,
|
||||
int* errcode_ret) {
|
||||
ID3D11Resource* pD3DResource, UINT subresource,
|
||||
int* errcode_ret) {
|
||||
// Verify the resource is a 2D texture
|
||||
D3D11_RESOURCE_DIMENSION rType;
|
||||
pD3DResource->GetType(&rType);
|
||||
@@ -540,8 +540,8 @@ cl_mem amd::clCreateImage2DFromD3D11ResourceAMD(Context& amdContext, cl_mem_flag
|
||||
// clCreateImage2DFromD3D11ResourceAMD
|
||||
//
|
||||
cl_mem amd::clCreateImage3DFromD3D11ResourceAMD(Context& amdContext, cl_mem_flags flags,
|
||||
ID3D11Resource* pD3DResource, UINT subresource,
|
||||
int* errcode_ret) {
|
||||
ID3D11Resource* pD3DResource, UINT subresource,
|
||||
int* errcode_ret) {
|
||||
// Verify the resource is a 2D texture
|
||||
D3D11_RESOURCE_DIMENSION rType;
|
||||
pD3DResource->GetType(&rType);
|
||||
@@ -623,6 +623,4 @@ void amd::SyncD3D11Objects(std::vector<amd::Memory*>& memObjects) {
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
#endif //_WIN32
|
||||
|
||||
@@ -28,41 +28,24 @@
|
||||
|
||||
#include <utility>
|
||||
|
||||
extern CL_API_ENTRY cl_mem CL_API_CALL
|
||||
clGetPlaneFromImageAMD(
|
||||
cl_context /* context */,
|
||||
cl_mem /* mem */,
|
||||
cl_uint /* plane */,
|
||||
cl_int* /* errcode_ret */);
|
||||
extern CL_API_ENTRY cl_mem CL_API_CALL clGetPlaneFromImageAMD(cl_context /* context */,
|
||||
cl_mem /* mem */, cl_uint /* plane */,
|
||||
cl_int* /* errcode_ret */);
|
||||
|
||||
namespace amd
|
||||
{
|
||||
namespace amd {
|
||||
|
||||
//! Functions for executing the D3D11 related stuff
|
||||
cl_mem clCreateBufferFromD3D11ResourceAMD(
|
||||
Context& amdContext,
|
||||
cl_mem_flags flags,
|
||||
ID3D11Resource* pD3DResource,
|
||||
int* errcode_ret);
|
||||
cl_mem clCreateImage1DFromD3D11ResourceAMD(
|
||||
Context& amdContext,
|
||||
cl_mem_flags flags,
|
||||
ID3D11Resource* pD3DResource,
|
||||
UINT subresource,
|
||||
int* errcode_ret);
|
||||
cl_mem clCreateImage2DFromD3D11ResourceAMD(
|
||||
Context& amdContext,
|
||||
cl_mem_flags flags,
|
||||
ID3D11Resource* pD3DResource,
|
||||
UINT subresource,
|
||||
int* errcode_ret);
|
||||
cl_mem clCreateImage3DFromD3D11ResourceAMD(
|
||||
Context& amdContext,
|
||||
cl_mem_flags flags,
|
||||
ID3D11Resource* pD3DResource,
|
||||
UINT subresource,
|
||||
int* errcode_ret);
|
||||
cl_mem clCreateBufferFromD3D11ResourceAMD(Context& amdContext, cl_mem_flags flags,
|
||||
ID3D11Resource* pD3DResource, int* errcode_ret);
|
||||
cl_mem clCreateImage1DFromD3D11ResourceAMD(Context& amdContext, cl_mem_flags flags,
|
||||
ID3D11Resource* pD3DResource, UINT subresource,
|
||||
int* errcode_ret);
|
||||
cl_mem clCreateImage2DFromD3D11ResourceAMD(Context& amdContext, cl_mem_flags flags,
|
||||
ID3D11Resource* pD3DResource, UINT subresource,
|
||||
int* errcode_ret);
|
||||
cl_mem clCreateImage3DFromD3D11ResourceAMD(Context& amdContext, cl_mem_flags flags,
|
||||
ID3D11Resource* pD3DResource, UINT subresource,
|
||||
int* errcode_ret);
|
||||
void SyncD3D11Objects(std::vector<amd::Memory*>& memObjects);
|
||||
|
||||
} //namespace amd
|
||||
|
||||
} // namespace amd
|
||||
|
||||
Ορισμένα αρχεία δεν εμφανίστηκαν επειδή έχουν αλλάξει πάρα πολλά αρχεία σε αυτή τη διαφορά Εμφάνιση Περισσότερων
Αναφορά σε νέο ζήτημα
Block a user