Files
rocm-systems/rocclr/runtime/device/gpu/gpuprogram.hpp
T
foreman d09ca72f74 P4 to Git Change 1536925 by vsytchen@vsytchen-ocl-win10 on 2018/04/04 17:20:38
SWDEV-79445 - OCL generic changes and code clean-up

	1. This change replaces the use of std::map with std::unordered_map to improve lookup/insert time.
	2. Replace the use of std::make_pair and std::pair constructor with uniform initialization for cleaner code.
	3. Replace the use of std::Container::iterator type with the auto keyword for cleaner code.
	4. Use range based for loops where needed.

	ReviewBoardURL = http://ocltc.amd.com/reviews/r/14517/diff/

Affected files ...

... //depot/stg/opencl/drivers/opencl/api/hip/hip_platform.cpp#4 edit
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_context.cpp#58 edit
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_d3d10.cpp#16 edit
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_d3d10_amd.hpp#9 edit
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_d3d11.cpp#24 edit
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_d3d11_amd.hpp#13 edit
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_d3d9.cpp#34 edit
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_d3d9_amd.hpp#17 edit
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_gl.cpp#57 edit
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_pipe.cpp#7 edit
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_program.cpp#46 edit
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_svm.cpp#23 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/appprofile.hpp#14 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpuprogram.cpp#72 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpuvirtual.cpp#27 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#216 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#297 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuappprofile.cpp#13 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpubinary.cpp#59 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpucompiler.cpp#158 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#587 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#322 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuprintf.cpp#46 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuprogram.cpp#237 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuprogram.hpp#70 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuresource.cpp#242 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#415 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.hpp#143 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palappprofile.cpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palcompiler.cpp#22 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#79 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprintf.cpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#59 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#60 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#84 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#46 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/CMakeLists.txt#11 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/pro/prodevice.cpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/pro/prodevice.hpp#5 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocbinary.hpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/roccompiler.cpp#42 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/roccounters.cpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocprintf.cpp#10 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocprogram.cpp#81 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/command.cpp#81 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/command.hpp#89 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/commandqueue.cpp#24 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/context.cpp#49 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/context.hpp#29 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/memory.cpp#129 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/memory.hpp#102 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/perfctr.hpp#7 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/program.cpp#91 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/program.hpp#43 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/sampler.hpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.cpp#17 edit
2018-04-04 18:00:17 -04:00

578 lines
21 KiB
C++

//
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
//
#ifndef GPUPROGRAM_HPP_
#define GPUPROGRAM_HPP_
#include "device/gpu/gpukernel.hpp"
#include "device/gpu/gpubinary.hpp"
#include "amd_hsa_loader.hpp"
// Forward declarations of the compiler-option and HSA loader types that are
// named by the class declarations further down in this header.
namespace amd {
namespace option {
class Options;
} // option
namespace hsa {
namespace loader {
class Loader;
class Executable;
class Context;
} // loader
} // hsa
} // amd
//! \namespace gpu GPU Device Implementation
namespace gpu {
/*! \addtogroup GPU GPU Device Implementation
* @{
*/
//! \struct ILFunc for the opencl program processing
struct ILFunc : public amd::HeapObject {
public:
//! \struct CodeRange for the code ranges
struct SourceRange : public amd::EmbeddedObject {
size_t begin_; //!< start code position
size_t end_; //!< end code position
};
//! \enum IL function state
enum State {
Unknown = 0x00000000, //! unknown function
Regular = 0x00000001, //! regular function from the program
Kernel = 0x00000002 //! kernel function from the program
};
//! Default constructor
ILFunc()
: name_(""),
index_(0),
state_(Unknown),
privateSize_(0),
localSize_(0),
hwPrivateSize_(0),
hwLocalSize_(0),
flags_(0),
totalHwPrivateSize_(-1) {
code_.begin_ = code_.end_ = 0;
metadata_.begin_ = metadata_.end_ = 0;
}
//! Copy constructor
ILFunc(const ILFunc& func) { *this = func; }
//! Destructor
~ILFunc() {}
//! Overloads operator=
ILFunc& operator=(const ILFunc& func) {
name_ = func.name_;
index_ = func.index_;
code_ = func.code_;
metadata_ = func.metadata_;
state_ = func.state_;
privateSize_ = func.privateSize_;
localSize_ = func.localSize_;
hwPrivateSize_ = func.hwPrivateSize_;
hwLocalSize_ = func.hwLocalSize_;
flags_ = func.flags_;
totalHwPrivateSize_ = func.totalHwPrivateSize_;
// Note: we don't copy calls_ and macros_
return *this;
}
std::string name_; //!< kernel's name
uint index_; //!< kernel's index
SourceRange code_; //!< the entire function range in the source
SourceRange metadata_; //!< the metadata range
State state_; //!< the function is real, and not intrinsic
uint privateSize_; //!< private ring allocation by the function
uint localSize_; //!< local ring allocation by the function
uint hwPrivateSize_; //!< HW private ring allocation by the function
uint hwLocalSize_; //!< HW local ring allocation by the function
uint flags_; //!< The IL func flags/properties
long long totalHwPrivateSize_; //!< total HW private usage including called functions
std::vector<ILFunc*> calls_; //! Functions called from the current
std::vector<uint> macros_; //! Macros, used in the IL function
uint totalHwPrivateUsage(); //!< total HW private usage including called functions
};
//! \class NullProgram Empty program, built for a NullDevice
class NullProgram : public device::Program {
  friend class ClBinary;

 public:
  //! Binds the program to \p nullDev; the patch position starts at 0
  NullProgram(NullDevice& nullDev) : device::Program(nullDev), patch_(0) {}

  //! Default destructor
  ~NullProgram();

  //! Initialize Binary for GPU
  virtual bool initClBinary();

  //! Release Binary for GPU
  virtual void releaseClBinary();

  //! Returns global constant buffers
  const std::vector<uint>& glbCb() const { return glbCb_; }

 protected:
  //! Pre-compile setup for GPU
  virtual bool initBuild(amd::option::Options* options);

  //! Post-compile setup for GPU
  virtual bool finiBuild(bool isBuildGood);

  /*! \brief Compiles GPU CL program to LLVM binary (compiler frontend)
   *
   *  \return True if we successfully compiled a GPU program
   */
  virtual bool compileImpl(
      const std::string& sourceCode,                   //!< the program's source code
      const std::vector<const std::string*>& headers,  //!< header source codes
      const char** headerIncludeNames,                 //!< include names of headers
      amd::option::Options* options                    //!< compile options's object
  );

  /*! \brief Compiles LLVM binary to IL code (compiler backend: link+opt+codegen)
   *
   *  \return The build error code
   */
  int compileBinaryToIL(amd::option::Options* options  //!< options for compilation
  );

  /*! \brief Links the compiled IL program with HW
   *
   *  \return True if we successfully linked a GPU program
   */
  virtual bool linkImpl(amd::option::Options* options = nullptr  //!< options object
  );

  //! Link overload that takes a list of input programs
  virtual bool linkImpl(const std::vector<device::Program*>& inputPrograms,
                        amd::option::Options* options = nullptr,  //!< options object
                        bool createLibrary = false);

  //! Creates the binary, given the build options
  virtual bool createBinary(amd::option::Options* options);

  /*! \brief Parses the GPU program and finds all available kernels
   *
   *  \return True if we successfully parsed the GPU program
   */
  bool parseKernels(const std::string& source  //!< the program's source code
  );

  /*! \brief Parse all functions in the program
   *
   *  \return True if we successfully parsed all functions
   */
  bool parseAllILFuncs(const std::string& source  //!< the program's source code
  );

  /*! \brief Parse a function's metadata given as source[posBegin:posEnd-1]
   *
   *  \return True if we successfully parsed the given metadata
   */
  bool parseFuncMetadata(const std::string& source,  //!< string that contains metadata
                         size_t posBegin,            //!< begin of metadata in 'source'
                         size_t posEnd               //!< end of metadata in 'source'
  );

  /*! \brief Finds functions with the given start and end string in the
   *  program
   *
   *  \return True if we successfully found all functions
   */
  bool findILFuncs(const std::string& source,      //!< the program's source code
                   const std::string& func_start,  //!< the start string of a function
                   const std::string& func_end,    //!< the end string of a function
                   size_t& lastFuncPos  //!< pos to the end of the last func in 'source'
  );

  /*! \brief Finds all functions in the program
   *
   *  \return True if we successfully found all functions
   */
  bool findAllILFuncs(const std::string& source,  //!< the program's source code
                      size_t& lastFuncPos  //!< pos to the end of the last func in 'source'
  );

  /*! \brief Finds the function that corresponds to the provided unique index
   *
   *  \return Pointer to the ILFunc structure
   */
  ILFunc* findILFunc(uint index  //!< the function unique index
  );

  //! Destroys all objects, associated with the IL functions
  void freeAllILFuncs();

  /*! \brief Finds if a provided function is called from the base function
   *
   *  \return True if a function is used from the base one
   */
  bool isCalled(const ILFunc* base,  //!< The base function
                const ILFunc* func   //!< Function to check for usage
  );

  //! Patches the "main" function with the call to the current kernel
  void patchMain(std::string& kernel,  //!< The current kernel's code for compilation
                 uint index            //!< Index of the current kernel in the program
  );

  //! Adds the IL function object into the list of functions
  void addFunc(ILFunc* func) { funcs_.push_back(func); }

  //! Empty implementation, since we don't have real HW: only \p index is
  //! recorded in the global constant buffer list, the data itself is ignored
  virtual bool allocGlobalData(
      const void* globalData,  //!< Pointer to the global data
      size_t dataSize,         //!< The global data size
      uint index               //!< Index for the global data store (0 - global heap)
  ) {
    glbCb_.push_back(index);
    return true;
  }

  //! Load binary for offline device.
  virtual bool loadBinary(bool* hasRecompiled);

  //! Create NullKernel for compiling to isa.
  virtual NullKernel* createKernel(
      const std::string& name,           //!< The kernel's name
      const Kernel::InitData* initData,  //!< Initialization data
      const std::string& code,           //!< IL source code
      const std::string& metadata,       //!< the kernel metadata structure
      bool* created,                     //!< True if the object was created
      const void* binaryCode = nullptr,  //!< binary machine code for CAL
      size_t binarySize = 0              //!< the machine code size
  );

  //! Returns the base program's binary object typecasted to gpu::ClBinary
  ClBinary* clBinary() { return static_cast<ClBinary*>(device::Program::clBinary()); }

  //! Const flavour of clBinary()
  const ClBinary* clBinary() const {
    return static_cast<const ClBinary*>(device::Program::clBinary());
  }

  /*! Get all per-kernel IL from programIL, where programIL is the IL for the
   *  whole compilation unit.
   */
  bool getAllKernelILs(std::unordered_map<std::string, std::string>& allKernelILs,
                       std::string& programIL, const char* ilKernelName);

 protected:
  std::vector<PrintfInfo> printf_;  //!< Format strings for GPU printf support
  std::vector<uint> glbCb_;         //!< Global constant buffers

  //! Returns the result of amd::isElfMagic() for \p bin
  virtual bool isElf(const char* bin) const { return amd::isElfMagic(bin); }

  //! Returns the compiler library target info (see the definition for \p str)
  virtual const aclTargetInfo& info(const char* str = "");

 private:
  //! Copying is disabled; deleted so misuse fails at compile time rather than
  //! at link time
  NullProgram(const NullProgram&) = delete;

  //! Assignment is disabled
  NullProgram& operator=(const NullProgram&) = delete;

  //! Initializes the global data store
  bool initGlobalData(const std::string& source,  //!< the program's source code
                      size_t start  //!< start position for the global data search
  );

  //! Return a typecasted GPU device
  gpu::NullDevice& dev() {
    return const_cast<gpu::NullDevice&>(static_cast<const gpu::NullDevice&>(device()));
  }

  size_t patch_;                //!< Patch call position in the source code.
  std::vector<ILFunc*> funcs_;  //!< list of all functions.
  std::string ilProgram_;       //!< IL program after compilation
};
//! \class GPU program
class Program : public NullProgram {
public:
//! GPU program constructor
Program(Device& gpuDev) : NullProgram(gpuDev), glbData_(NULL) {}
//! GPU program destructor
~Program();
//! Get the global data store for this program
gpu::Memory* glbData() const { return glbData_; }
//! Returns TRUE if we successfully allocated the global data store
//! in video memory
bool allocGlobalData(const void* globalData, //!< Pointer to the global data
size_t dataSize, //!< The global data size
uint index //!< Index for the global data store (0 - global heap)
);
//! Returns TRUE if we could
virtual bool loadBinary(bool* hasRecompiled);
//! Creates the GPU kernel (return base type)
virtual NullKernel* createKernel(const std::string& name, //!< The kernel's name
const Kernel::InitData* initData, //!< Initialization data
const std::string& code, //!< IL source code
const std::string& metadata, //!< the kernel metadata structure
bool* created, //!< True if the object was created
const void* binaryCode = NULL, //!< binary machine code for CAL
size_t binarySize = 0 //!< the machine code size
);
typedef std::unordered_map<uint, gpu::Memory*> HwConstBuffers;
//! Global HW constant buffers
const HwConstBuffers& glbHwCb() const { return constBufs_; }
//! Returns pritnf info array
const std::vector<PrintfInfo>& printfInfo() const { return printf_; }
//! Return a typecasted GPU device
gpu::Device& dev() { return const_cast<gpu::Device&>(static_cast<const gpu::Device&>(device())); }
protected:
private:
//! Disable copy constructor
Program(const Program&);
//! Disable operator=
Program& operator=(const Program&);
HwConstBuffers constBufs_; //!< Constant buffers for the global store
gpu::Memory* glbData_; //!< Global data store
};
// Makes the amd::hsa::loader names visible unqualified inside namespace gpu;
// the ORCAHSALoaderContext declaration below names its base class as plain
// `Context` and therefore depends on this directive.
using namespace amd::hsa::loader;
// Forward declaration: ORCAHSALoaderContext stores an HSAILProgram pointer
// before the HSAILProgram class itself is defined.
class HSAILProgram;
//! \class ORCAHSALoaderContext HSA loader Context implementation that is
//! attached to a single HSAILProgram
class ORCAHSALoaderContext final : public Context {
 public:
  //! Stores the pointer to \p program; the program object itself is not copied
  ORCAHSALoaderContext(HSAILProgram* program) : program_(program) {}

  virtual ~ORCAHSALoaderContext() {}

  hsa_isa_t IsaFromName(const char* name) override;

  bool IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) override;

  void* SegmentAlloc(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, size_t size,
                     size_t align, bool zero) override;

  bool SegmentCopy(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* dst, size_t offset,
                   const void* src, size_t size) override;

  void SegmentFree(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg,
                   size_t size = 0) override;

  void* SegmentAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg,
                       size_t offset) override;

  //! Stub: always returns nullptr
  void* SegmentHostAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg,
                           size_t offset) override {
    return nullptr;
  }

  //! Stub: always returns false
  bool SegmentFreeze(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg,
                     size_t size) override {
    return false;
  }

  //! Image extension is reported as unsupported
  bool ImageExtensionSupported() override { return false; }

  //! Not supported: asserts and returns HSA_STATUS_ERROR
  hsa_status_t ImageCreate(hsa_agent_t agent, hsa_access_permission_t image_permission,
                           const hsa_ext_image_descriptor_t* image_descriptor,
                           const void* image_data, hsa_ext_image_t* image_handle) override {
    // not supported
    assert(false);
    return HSA_STATUS_ERROR;
  }

  //! Not supported: asserts and returns HSA_STATUS_ERROR
  hsa_status_t ImageDestroy(hsa_agent_t agent, hsa_ext_image_t image_handle) override {
    // not supported
    assert(false);
    return HSA_STATUS_ERROR;
  }

  hsa_status_t SamplerCreate(hsa_agent_t agent,
                             const hsa_ext_sampler_descriptor_t* sampler_descriptor,
                             hsa_ext_sampler_t* sampler_handle) override;

  //! All samplers are owned by HSAILProgram and are deleted in its destructor.
  hsa_status_t SamplerDestroy(hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) override;

 private:
  //! Agent-global helpers: thin forwards to the GpuMem* routines
  void* AgentGlobalAlloc(hsa_agent_t agent, size_t size, size_t align, bool zero) {
    return GpuMemAlloc(size, align, zero);
  }

  bool AgentGlobalCopy(void* dst, size_t offset, const void* src, size_t size) {
    return GpuMemCopy(dst, offset, src, size);
  }

  void AgentGlobalFree(void* ptr, size_t size) { GpuMemFree(ptr, size); }

  //! Kernel-code helpers: thin forwards to the CpuMem* routines
  void* KernelCodeAlloc(hsa_agent_t agent, size_t size, size_t align, bool zero) {
    return CpuMemAlloc(size, align, zero);
  }

  bool KernelCodeCopy(void* dst, size_t offset, const void* src, size_t size) {
    return CpuMemCopy(dst, offset, src, size);
  }

  void KernelCodeFree(void* ptr, size_t size) { CpuMemFree(ptr, size); }

  void* CpuMemAlloc(size_t size, size_t align, bool zero);

  bool CpuMemCopy(void* dst, size_t offset, const void* src, size_t size);

  //! Releases \p ptr through amd::Os::alignedFree(); \p size is not used
  void CpuMemFree(void* ptr, size_t size) { amd::Os::alignedFree(ptr); }

  void* GpuMemAlloc(size_t size, size_t align, bool zero);

  bool GpuMemCopy(void* dst, size_t offset, const void* src, size_t size);

  void GpuMemFree(void* ptr, size_t size = 0);

  //! Copying is disabled; deleted so misuse fails at compile time rather than
  //! at link time
  ORCAHSALoaderContext(const ORCAHSALoaderContext& c) = delete;

  //! Assignment is disabled
  ORCAHSALoaderContext& operator=(const ORCAHSALoaderContext& c) = delete;

  gpu::HSAILProgram* program_;  //!< Program passed to the constructor
};
//! \class HSAIL program
class HSAILProgram : public device::Program {
friend class ClBinary;
public:
//! Default constructor
HSAILProgram(Device& device);
HSAILProgram(NullDevice& device);
//! Default destructor
~HSAILProgram();
//! Returns the aclBinary associated with the progrm
aclBinary* binaryElf() const { return static_cast<aclBinary*>(binaryElf_); }
void addGlobalStore(Memory* mem) { globalStores_.push_back(mem); }
const std::vector<Memory*>& globalStores() const { return globalStores_; }
//! Return a typecasted GPU device
gpu::Device& dev() { return const_cast<gpu::Device&>(static_cast<const gpu::Device&>(device())); }
//! Returns GPU kernel table
const Memory* kernelTable() const { return kernels_; }
//! Adds all kernels to the mem handle lists
void fillResListWithKernels(std::vector<const Memory*>& memList) const;
//! Returns the maximum number of scratch regs used in the program
uint maxScratchRegs() const { return maxScratchRegs_; }
//! Add internal static sampler
void addSampler(Sampler* sampler) { staticSamplers_.push_back(sampler); }
//! Returns TRUE if the program just compiled
bool isNull() const { return isNull_; }
//! Returns TRUE if the program contains static samplers
bool isStaticSampler() const { return (staticSamplers_.size() != 0); }
protected:
//! pre-compile setup for GPU
virtual bool initBuild(amd::option::Options* options);
//! post-compile setup for GPU
virtual bool finiBuild(bool isBuildGood);
/*! \brief Compiles GPU CL program to LLVM binary (compiler frontend)
*
* \return True if we successefully compiled a GPU program
*/
virtual bool compileImpl(const std::string& sourceCode, //!< the program's source code
const std::vector<const std::string*>& headers,
const char** headerIncludeNames,
amd::option::Options* options //!< compile options's object
);
/* \brief Returns the next stage to compile from, based on sections in binary,
* also returns completeStages in a vector, which contains at least ACL_TYPE_DEFAULT,
* sets needOptionsCheck to true if options check is needed to decide whether or not to recompile
*/
aclType getCompilationStagesFromBinary(std::vector<aclType>& completeStages,
bool& needOptionsCheck);
/* \brief Returns the next stage to compile from, based on sections and options in binary
*/
aclType getNextCompilationStageFromBinary(amd::option::Options* options);
bool saveBinaryAndSetType(type_t type);
virtual bool linkImpl(amd::option::Options* options);
//! Link the device programs.
virtual bool linkImpl(const std::vector<device::Program*>& inputPrograms,
amd::option::Options* options, bool createLibrary);
virtual bool createBinary(amd::option::Options* options);
//! Initialize Binary
virtual bool initClBinary();
//! Release the Binary
virtual void releaseClBinary();
virtual const aclTargetInfo& info(const char* str = "");
virtual bool isElf(const char* bin) const {
return amd::isElfMagic(bin);
// return false;
}
//! Returns the binary
// This should ensure that the binary is updated with all the kernels
// ClBinary& clBinary() { return binary_; }
ClBinary* clBinary() { return static_cast<ClBinary*>(device::Program::clBinary()); }
const ClBinary* clBinary() const {
return static_cast<const ClBinary*>(device::Program::clBinary());
}
private:
//! Disable default copy constructor
HSAILProgram(const HSAILProgram&);
//! Disable operator=
HSAILProgram& operator=(const HSAILProgram&);
//! Returns all the options to be appended while passing to the
// compiler library
std::string hsailOptions();
//! Allocate kernel table
bool allocKernelTable();
std::string openCLSource_; //!< Original OpenCL source
std::string HSAILProgram_; //!< FSAIL program after compilation
std::string llvmBinary_; //!< LLVM IR binary code
aclBinary* binaryElf_; //!< Binary for the new compiler library
void* rawBinary_; //!< Pointer to the raw binary
aclBinaryOptions binOpts_; //!< Binary options to create aclBinary
std::vector<Memory*> globalStores_; //!< Global memory for the program
Memory* kernels_; //!< Table with kernel object pointers
uint
maxScratchRegs_; //!< Maximum number of scratch regs used in the program by individual kernel
std::list<Sampler*> staticSamplers_; //!< List od internal static samplers
bool isNull_; //!< Null program no memory allocations
amd::hsa::loader::Loader* loader_; //!< Loader object
amd::hsa::loader::Executable* executable_; //!< Executable for HSA Loader
ORCAHSALoaderContext loaderContext_; //!< Context for HSA Loader
};
/*@}*/} // namespace gpu
#endif /*GPUPROGRAM_HPP_*/