fix merging conflicts

2016-02-08 15:35:49 -06:00
parent 30dc087238 ce2fc0f7fe
commit b3b0bbdddc
@@ -13,7 +13,7 @@ $(HIP_OBJECTS): HIPCC_FLAGS += -I$(HSA_PATH)/include
 $(HIP_OBJECTS): 

 %.o:: %.cpp
-	    $(HIPCC) $(HIPCC_FLAGS) $< -c -o $@
+	    $(HIPCC) $(HIPCC_FLAGS) $< -c -O3 -o $@


 clean:
@@ -14,6 +14,7 @@ GetOptions(
    ,"compiler|c" => \$p_compiler
    ,"platform|P" => \$p_platform
    ,"cpp_config|cxx_config|C" => \$p_cpp_config
+    ,"full|f" => \$p_full,
    ,"newline|n" => \$p_newline
 );

@@ -23,6 +24,7 @@ if ($p_help) {
    print "  --cpp_config, -C   : print C++ compiler options\n";
    print "  --compiler, -c     : print compiler (hcc or nvcc)\n";
    print "  --platform, -P     : print platform (hcc or nvcc)\n";
+    print "  --full, -f         : print full config\n";
    print "  --newline, -n      : print newline\n";
    print "  --help, -h         : print help message\n";
    exit();
@@ -47,8 +49,13 @@ $HIP_PATH=$ENV{'HIP_PATH'};
 $HIP_PATH=Cwd::realpath (dirname (dirname $0)) unless defined $HIP_PATH;        # use parent directory of this tool


+if ($HIP_PLATFORM eq "hcc") {
+    $CPP_CONFIG= " -D__HIP_PLATFORM_HCC__=   -I$HIP_PATH/include -I$HCC_HOME/include";
+}
+if ($HIP_PLATFORM eq "nvcc") {
+    $CPP_CONFIG = " -D__HIP_PLATFORM_NVCC__=  -I$HIP_PATH/include -I$CUDA_PATH/include";
+};

-$printed = 0;
 if ($p_path) {
    print "$HIP_PATH";
    $printed = 1;
@@ -56,15 +63,43 @@ if ($p_path) {
    

 if ($p_cpp_config) {
-    if ($HIP_PLATFORM eq "hcc") {
-        print " -D__HIP_PLATFORM_HCC__=   -I$HIP_PATH/include -I$HCC_HOME/include";
-    }
-    if ($HIP_PLATFORM eq "nvcc") {
-        print " -D__HIP_PLATFORM_NVCC__=  -I$HIP_PATH/include -I$CUDA_PATH/include";
-    };
+    print $CPP_CONFIG;
    $printed = 1;
 }

+if ($p_full) {
+    print "== hipconfig\n";
+    print "HIP_PATH     : ", $HIP_PATH, "\n";
+    print "HIP_PLATFORM : ", $HIP_PLATFORM, "\n";
+    print "CPP_CONFIG   : ", $CPP_CONFIG, "\n";
+    if ($HIP_PLATFORM eq "hcc") 
+    {
+        print "\n" ;
+        print "== hcc\n";
+        print ("HCC_HOME     : $HCC_HOME\n");
+        system("$HCC_HOME/bin/hcc --version");
+        print ("HCC-cxxflags: "); 
+        system("$HCC_HOME/bin/hcc-config --cxxflags");
+        print ("HCC-ldflags : "); 
+        system("$HCC_HOME/bin/hcc-config --ldflags");
+    }
+    if ($HIP_PLATFORM eq "nvcc")  {
+        print "\n" ;
+        print "== nvcc\n";
+        #print "CUDA_PATH   :", $CUDA_PATH";
+        system("nvcc --version");
+
+    }
+    print "\n" ;
+
+    print "=== Environment Variables\n";
+    system("env | egrep '^HIP|^HSA|^HCC|^CUDA'");
+
+    print "\n" ;
+    print "== Linux Kernel\n";
+    system ("uname -a");
+    $printed = 1;
+}


 if (!$printed or $p_compiler or $p_platform) {
@@ -4,10 +4,10 @@

 #hipify "inplace" all code files in specified directory.    
 # This can be quite handy when dealing with an existing CUDA code base since the script
-# preseeves the existing directory structure.
+# preserves the existing directory structure.

 #  For each code file, this script will:
-#   - If ".prehip file does not exist, copy the original code to a new file withextension ".prehip".  Then Hipify the code file.
+#   - If ".prehip file does not exist, copy the original code to a new file with extension ".prehip".  Then Hipify the code file.
 #   - If ".prehip" file exists, this is used as input to hipify.
 # (this is useful for testing improvements to the hipify toolset).

@@ -345,6 +345,28 @@ while (@ARGV) {
        $ft{'dev'} += s/\bcudaDeviceProp\b/hipDeviceProp_t/g;
        $ft{'dev'} += s/\bcudaGetDeviceProperties\b/hipDeviceGetProperties/g;

+        # Attribute
+        $ft{'err'} += s/\bcudaDevAttrMaxThreadsPerBlock\b/hipDeviceAttributeMaxThreadsPerBlock/g;
+        $ft{'err'} += s/\bcudaDevAttrMaxBlockDimX\b/hipDeviceAttributeMaxBlockDimX/g;
+        $ft{'err'} += s/\bcudaDevAttrMaxBlockDimY\b/hipDeviceAttributeMaxBlockDimY/g;
+        $ft{'err'} += s/\bcudaDevAttrMaxBlockDimZ\b/hipDeviceAttributeMaxBlockDimZ/g;
+        $ft{'err'} += s/\bcudaDevAttrMaxGridDimX\b/hipDeviceAttributeMaxGridDimX/g;
+        $ft{'err'} += s/\bcudaDevAttrMaxGridDimY\b/hipDeviceAttributeMaxGridDimY/g;
+        $ft{'err'} += s/\bcudaDevAttrMaxGridDimZ\b/hipDeviceAttributeMaxGridDimZ/g;
+        $ft{'err'} += s/\bcudaDevAttrMaxSharedMemoryPerBlock\b/hipDeviceAttributeMaxSharedMemoryPerBlock/g;
+        $ft{'err'} += s/\bcudaDevAttrTotalConstantMemory\b/hipDeviceAttributeTotalConstantMemory/g;
+        $ft{'err'} += s/\bcudaDevAttrWarpSize\b/hipDeviceAttributeWarpSize/g;
+        $ft{'err'} += s/\bcudaDevAttrMaxRegistersPerBlock\b/hipDeviceAttributeMaxRegistersPerBlock/g;
+        $ft{'err'} += s/\bcudaDevAttrClockRate\b/hipDeviceAttributeClockRate/g;
+        $ft{'err'} += s/\bcudaDevAttrMultiProcessorCount\b/hipDeviceAttributeMultiprocessorCount/g;
+        $ft{'err'} += s/\bcudaDevAttrComputeMode\b/hipDeviceAttributeComputeMode/g;
+        $ft{'err'} += s/\bcudaDevAttrL2CacheSize\b/hipDeviceAttributeL2CacheSize/g;
+        $ft{'err'} += s/\bcudaDevAttrMaxThreadsPerMultiProcessor\b/hipDeviceAttributeMaxThreadsPerMultiProcessor/g;
+        $ft{'err'} += s/\bcudaDevAttrComputeCapabilityMajor\b/hipDeviceAttributeComputeCapabilityMajor/g;
+        $ft{'err'} += s/\bcudaDevAttrComputeCapabilityMinor\b/hipDeviceAttributeComputeCapabilityMinor/g;
+        $ft{'dev'} += s/\bcudaDeviceAttr\b/hipDeviceAttribute_t/g;
+        $ft{'dev'} += s/\bcudaDeviceGetAttribute\b/hipDeviceGetAttribute/g;
+
        # Cache config
        $ft{'dev'} += s/\bcudaDeviceSetCacheConfig\b/hipDeviceSetCacheConfig/g;
        $ft{'dev'} += s/\bcudaThreadSetCacheConfig\b/hipDeviceSetCacheConfig/g; # translate deprecated
@@ -1,7 +1,32 @@
 # FAQ

+<!-- START doctoc generated TOC please keep comment here to allow auto update -->
+<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
+**Table of Contents**  *generated with [DocToc](https://github.com/thlorenz/doctoc)*

-### What APIs does HIP support?
+- [FAQ](#faq)
+- [Table of Contents](#table-of-contents)
+    - [What APIs does HIP support?](#what-apis-does-hip-support)
+    - [What is not supported?](#what-is-not-supported)
+      - [Run-time features:](#run-time-features)
+    - [How does HIP compare with OpenCL?](#how-does-hip-compare-with-opencl)
+    - [What hardware does HIP support?](#what-hardware-does-hip-support)
+    - [Does Hipify automatically convert all source code?](#does-hipify-automatically-convert-all-source-code)
+    - [What is NVCC?](#what-is-nvcc)
+    - [What is HCC?](#what-is-hcc)
+    - [Why use HIP rather than supporting CUDA directly?](#why-use-hip-rather-than-supporting-cuda-directly)
+    - [Can I develop HIP code on an Nvidia CUDA platform?](#can-i-develop-hip-code-on-an-nvidia-cuda-platform)
+    - [Can I develop HIP code on an AMD HCC platform?](#can-i-develop-hip-code-on-an-amd-hcc-platform)
+    - [Can a HIP binary run on both AMD and Nvidia platforms?](#can-a-hip-binary-run-on-both-amd-and-nvidia-platforms)
+    - [Hmmm](#hmmm)
+    - [Link2 Is it ready?](#link2-is-it-ready)
+    - [What's the difference between HIP and hc?](#whats-the-difference-between-hip-and-hc)
+
+<!-- END doctoc generated TOC please keep comment here to allow auto update -->
+=================
+
+
+### What APIs does HIP support ?
 HIP provides the following:
 - Devices (hipSetDevice(), hipGetDeviceProperties(), etc)
 - Memory management (hipMalloc(), hipMemcpy(), hipFree())
@@ -15,7 +40,7 @@ HIP provides the following:
 The HIP documentation describes each API and its limitations, if any, compared with the equivalent CUDA API.

 ### What is not supported?
-#### Run-time features:
+#### Run-time features
 - Textures 
 - Dynamic parallelism
 - Managed memory
@@ -23,7 +48,7 @@ The HIP documentation describes each API and its limitations, if any, compared w
 - CUDA array, mipmappedArray and pitched memory
 - CUDA Driver API
   
-#### Kernel language features:
+#### Kernel language features
 - Device-side dynamic memory allocations (malloc, free, new, delete)
 - Virtual functions, indirect functions and try/catch
 - `__prof_trigger` 
@@ -58,9 +83,10 @@ NVCC is Nvidia's compiler driver for compiling "CUDA C++" code into PTX or devic
 ### What is HCC?
 HCC is AMD's compiler driver which compiles "heterogenous C++" code into HSAIL or GCN device code for AMD GPUs.  HCC is an open-source compiler based on recent versions of CLANG/LLVM.

-### Why use HIP rather than supporting CUDA run time directly?
+### Why use HIP rather than supporting CUDA directly?
 While HIP is a strong subset of the CUDA, it is a subset.  The HIP layer allows that subset to be clearly defined and documented.
-Developers who code to the HIP API can be assured there code will remain portable across Nvidia and AMD platforms.
+Developers who code to the HIP API can be assured there code will remain portable across Nvidia and AMD platforms.  
+In addition, HIP defines portable mechanisms to query architectural features, and supports a larger 64-bit wavesize which expands the return type for cross-lane functions like ballot and shuffle from 32-bit ints to 64-bit ints.  

 ### Can I develop HIP code on an Nvidia CUDA platform?
 Yes!  HIP's CUDA path only exposes the APIs and functionality that work on both NVCC and HCC back-ends.
@@ -75,6 +101,7 @@ Yes! HIP's HCC path only exposes the APIs and functions that work on both NVCC a
 ### Can a HIP binary run on both AMD and Nvidia platforms?
 HIP is a source-portable language that can be compiled to run on either the HCC or NVCC platform. HIP tools don't create a "fat binary" that can run on either platform, however.

+
 ### What's the difference between HIP and hc?
 HIP is a portable C++ language that supports a strong subset of the CUDA run-time APIs and device-kernel language. It's designed to simplify CUDA conversion to portable C++. HIP provides a C-compatible run-time API, C-compatible kernel-launch mechanism, C++ kernel language and pointer-based memory management.

@@ -1,53 +1,54 @@
-# HIP Kernel Language
+<!-- START doctoc generated TOC please keep comment here to allow auto update -->
+<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
+**Table of Contents**  *generated with [DocToc](https://github.com/thlorenz/doctoc)*
+
+- [Introduction](#introduction)
+- [Function-Type Qualifiers](#function-type-qualifiers)
+  - [`__device__`](#__device__)
+  - [`__global__`](#__global__)
+  - [`__host__`](#__host__)
+- [Calling `__global__` Functions](#calling-__global__-functions)
+- [Kernel-Launch Example](#kernel-launch-example)
+- [Variable-Type Qualifiers](#variable-type-qualifiers)
+  - [`__constant__`](#__constant__)
+  - [`__shared__`](#__shared__)
+  - [`__managed__`](#__managed__)
+  - [`__restrict__`](#__restrict__)
+- [Built-In Variables](#built-in-variables)
+  - [Coordinate Built-Ins](#coordinate-built-ins)
+  - [warpSize](#warpsize)
+- [Vector Types](#vector-types)
+  - [Short Vector Types](#short-vector-types)
+  - [dim3](#dim3)
+- [Memory-Fence Instructions](#memory-fence-instructions)
+- [Synchronization Functions](#synchronization-functions)
+- [Math Functions](#math-functions)
+  - [Single Precision Mathematical Functions](#single-precision-mathematical-functions)
+  - [Double Precision Mathematical Functions](#double-precision-mathematical-functions)
+  - [Integer Intrinsics](#integer-intrinsics)
+- [Texture Functions](#texture-functions)
+- [Surface Functions](#surface-functions)
+- [Timer Functions](#timer-functions)
+- [Atomic Functions](#atomic-functions)
+  - [Caveats and Features Under-Development:](#caveats-and-features-under-development)
+- [Warp Cross-Lane Functions](#warp-cross-lane-functions)
+  - [Warp Vote and Ballot Functions](#warp-vote-and-ballot-functions)
+  - [Warp Shuffle Functions](#warp-shuffle-functions)
+- [Profiler Counter Function](#profiler-counter-function)
+- [Assert](#assert)
+- [Printf](#printf)
+- [Device-Side Dynamic Global Memory Allocation](#device-side-dynamic-global-memory-allocation)
+- [`__launch_bounds__`](#__launch_bounds__)
+- [Register Keyword](#register-keyword)
+- [Pragma Unroll](#pragma-unroll)
+- [In-Line Assembly](#in-line-assembly)
+- [C++ Support](#c-support)
+
+<!-- END doctoc generated TOC please keep comment here to allow auto update -->


-###Table of Contents
-=================
-
-  * [HIP Kernel Language](#hip-kernel-language" aria-hidden="true"><span aria-hidden="true)
-    * [Table of Contents](#table-of-contents" aria-hidden="true"><span aria-hidden="true)
-    * [Function-Type Qualifiers](#function-type-qualifiers" aria-hidden="true"><span aria-hidden="true)
-      * [__device__ ](#__device__" aria-hidden="true"><span aria-hidden="true)
-      * [__global__ ](#__global__" aria-hidden="true"><span aria-hidden="true)
-      * [__host__ ](#__host__" aria-hidden="true"><span aria-hidden="true)
-    * [Calling __global__ Functions](#calling-__global__-functions" aria-hidden="true"><span aria-hidden="true)
-    * [Kernel-Launch Example](#kernel-launch-example" aria-hidden="true"><span aria-hidden="true)
-    * [Variable-Type Qualifiers](#variable-type-qualifiers" aria-hidden="true"><span aria-hidden="true)
-      * [__constant__ ](#__constant__" aria-hidden="true"><span aria-hidden="true)
-      * [__shared__ ](#__shared__" aria-hidden="true"><span aria-hidden="true)
-      * [__managed__ ](#__managed__" aria-hidden="true"><span aria-hidden="true)
-      * [__restrict__ ](#__restrict__" aria-hidden="true"><span aria-hidden="true)
-    * [Built-In Variables](#built-in-variables" aria-hidden="true"><span aria-hidden="true)
-      * [Coordinate Built-Ins](#coordinate-built-ins" aria-hidden="true"><span aria-hidden="true)
-      * [warpSize](#warpsize" aria-hidden="true"><span aria-hidden="true)
-    * [Vector Types](#vector-types" aria-hidden="true"><span aria-hidden="true)
-      * [Short Vector Types](#short-vector-types" aria-hidden="true"><span aria-hidden="true)
-      * [dim3](#dim3" aria-hidden="true"><span aria-hidden="true)
-    * [Memory-Fence Instructions](#memory-fence-instructions" aria-hidden="true"><span aria-hidden="true)
-    * [Synchronization Functions](#synchronization-functions" aria-hidden="true"><span aria-hidden="true)
-    * [Math Functions](#math-functions" aria-hidden="true"><span aria-hidden="true)
-      * [Single Precision Mathematical Functions](#single-precision-mathematical-functions" aria-hidden="true"><span aria-hidden="true)
-      * [Double Precision Mathematical Functions](#double-precision-mathematical-functions" aria-hidden="true"><span aria-hidden="true)
-      * [Integer Intrinsics](#integer-intrinsics" aria-hidden="true"><span aria-hidden="true)
-    * [Texture Functions](#texture-functions" aria-hidden="true"><span aria-hidden="true)
-    * [Surface Functions](#surface-functions" aria-hidden="true"><span aria-hidden="true)
-    * [Timer Functions](#timer-functions" aria-hidden="true"><span aria-hidden="true)
-    * [Atomic Functions](#atomic-functions" aria-hidden="true"><span aria-hidden="true)
-      * [Caveats and Features Under-Development:](#caveats-and-features-under-development" aria-hidden="true"><span aria-hidden="true)
-    * [Warp Cross-Lane Functions](#warp-cross-lane-functions" aria-hidden="true"><span aria-hidden="true)
-      * [Warp Vote and Ballot Functions](#warp-vote-and-ballot-functions" aria-hidden="true"><span aria-hidden="true)
-      * [Warp Shuffle Functions](#warp-shuffle-functions" aria-hidden="true"><span aria-hidden="true)
-    * [Profiler Counter Function](#profiler-counter-function" aria-hidden="true"><span aria-hidden="true)
-    * [Assert](#assert" aria-hidden="true"><span aria-hidden="true)
-    * [Printf](#printf" aria-hidden="true"><span aria-hidden="true)
-    * [Device-Side Dynamic Global Memory Allocation](#device-side-dynamic-global-memory-allocation" aria-hidden="true"><span aria-hidden="true)
-    * [__launch_bounds__ ](#__launch_bounds__" aria-hidden="true"><span aria-hidden="true)
-    * [Register Keyword](#register-keyword" aria-hidden="true"><span aria-hidden="true)
-    * [Pragma Unroll](#pragma-unroll" aria-hidden="true"><span aria-hidden="true)
-    * [In-Line Assembly](#in-line-assembly" aria-hidden="true"><span aria-hidden="true)
-    * [C   Support](#c-support" aria-hidden="true"><span aria-hidden="true)
-

+## Introduction

 HIP provides a C++ syntax that is suitable for compiling most code that commonly appears in compute kernels, including classes, namespaces, operator overloading, templates and more. Additionally, it defines other language features designed specifically to target accelerators, such as the following:
 - A kernel-launch syntax that uses standard C++, resembles a function call and is portable to all HIP targets
@@ -6,9 +6,9 @@ and provides practical suggestions on how to port CUDA code and work through com
 ###Table of Contents
 =================

- * [HIP Porting Guide](#hip-porting-guide" aria-hidden="true"><span aria-hidden="true)
-      * [Table of Contents](#table-of-contents" aria-hidden="true"><span aria-hidden="true)
-    * [Porting a New Cuda Project](#porting-a-new-cuda-project" aria-hidden="true"><span aria-hidden="true)
+ * [HIP Porting Guide](#hip-porting-guide)
+      * [Table of Contents](#table-of-contents)
+    * [Porting a New Cuda Project TO](#porting-a-new-cuda-project)
      * [General Tips](#general-tips" aria-hidden="true"><span aria-hidden="true)
      * [Scanning existing CUDA code to scope the porting effort](#scanning-existing-cuda-code-to-scope-the-porting-effort" aria-hidden="true"><span aria-hidden="true)
    * [Distinguishing Compiler Modes](#distinguishing-compiler-modes" aria-hidden="true"><span aria-hidden="true)
@@ -42,7 +42,7 @@ and provides practical suggestions on how to port CUDA code and work through com
      * [What Does This Error Mean?](#what-does-this-error-mean" aria-hidden="true"><span aria-hidden="true)
        * [/usr/include/c  /v1/memory:5172:15: error: call to implicitly deleted default constructor of 'std::__1::bad_weak_ptr' throw bad_weak_ptr();](#usrincludecv1memory517215-error-call-to-implicitly-deleted-default-constructor-of-std__1bad_weak_ptr-throw-bad_weak_ptr" aria-hidden="true"><span aria-hidden="true)
        * [grid_launch kernel dispatch - fallback](#grid_launch-kernel-dispatch---fallback" aria-hidden="true"><span aria-hidden="true)
-        * [Editor Highlighting](#editor-highlighting" aria-hidden="true"><span aria-hidden="true)
+        * [Editor Highlighting](#editor-highlighting)


 ## Porting a New Cuda Project
@@ -95,6 +95,32 @@ info: TOTAL-converted 89 CUDA->HIP refs( dev:3 mem:32 kern:2 builtin:37 math:0 s
  kernels (1 total) :   kmeansPoint(1)
 ```
 
+### Converting a project "in-place"
+
+```shell
+> hipify --inplace
+```
+
+For each input file FILE, this script will:
+  - If "FILE.prehip file does not exist, copy the original code to a new file with extension ".prehip".  Then Hipify the code file.
+  - If "FILE.prehip" file exists, hipify FILE.prehip and save to FILE.  
+
+This is useful for testing improvements to the hipify toolset.
+
+
+The "hipconvertinplace.sh" script will perform inplace conversion for all code files in the specified directory.
+This can be quite handy when dealing with an existing CUDA code base since the script preserves the existing directory structure
+and filenames - so includes work.  After converting in-place, you can review the code to add additional parameters to
+directory names.
+
+
+```shell
+> hipconverinplace.sh MY_SRC_DIR
+```
+
+
+
+
 
 ## Distinguishing Compiler Modes
 
@@ -34,9 +34,9 @@
 |Vector|`float4`|`float4`|`hc::`<br>`short_vector::float4`|`concurrency::`<br>`graphics::float_4`|`float4`

 ###Notes
-1. For HC and C++AMP, assume captured _tiled_ext_ t_ext and captured _extent_ ext.  These languages use captured variables to pass information to the kernel rather than special built-in functions so variable name may vary.
-2. The indexig functions (starting with thread-index) show the terminology for a 1D grid.  Some APIs use reverse order of xyz / 012 indexing for 3D grids.
-3. HC allos  tile dimensions to be specified at runtime while C++AMP requires that tile dimensions are specified at compile-time.  Thus hc syntax for tile dims is `t_ext.tile_dim[0]` while C++AMP is t_ext.tile_dim0.
+1. For HC and C++AMP, assume a captured _tiled_ext_ named "t_ext" and captured _extent_ named "ext".  These languages use captured variables to pass information to the kernel rather than using special built-in functions so the exact variable name may vary.
+2. The indexing functions (starting with `thread-index`) show the terminology for a 1D grid.  Some APIs use reverse order of xyz / 012 indexing for 3D grids.
+3. HC allows tile dimensions to be specified at runtime while C++AMP requires that tile dimensions be specified at compile-time.  Thus hc syntax for tile dims is `t_ext.tile_dim[0]` while C++AMP is t_ext.tile_dim0.



@@ -325,6 +325,16 @@ __device__ inline unsigned int __ffsll(unsigned long long int input)
 	return hc::__lastbit_u32_u64( input)+1;
 }

+__device__ inline unsigned int __ffs(int input)
+{
+	return hc::__lastbit_u32_s32( input)+1;
+}
+
+__device__ inline unsigned int __ffsll(long long int input)
+{
+	return hc::__lastbit_u32_s64( input)+1;
+}
+
 __device__ inline unsigned int __brev( unsigned int input)
 {
 	return hc::__bitrev_b32( input);
@@ -336,7 +346,6 @@ __device__ inline unsigned long long int __brevll( unsigned long long int input)
 }

 // warp vote function __all __any __ballot
-
 __device__ inline int __all(  int input)
 {
 	return hc::__all( input);
@@ -230,6 +230,13 @@ hipError_t hipGetDevice(int *device);
 */
 hipError_t hipGetDeviceCount(int *count);

+/**
+ * @brief Query device attribute.
+ * @param [out] pi pointer to value to return
+ * @param [in] attr attribute to query
+ * @param [in] device which device to query for information
+ */
+hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device);

 /**
 * @brief Returns device properties.
@@ -687,11 +694,11 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind


 /**
- *  @brief Copies count bytes from the memory area pointed to by src to the memory area pointed to by offset bytes from the start of symbol symbol.
+ *  @brief Copies @p sizeBytes bytes from the memory area pointed to by @p src to the memory area pointed to by @p offset bytes from the start of symbol @p symbol.
 *
 *  The memory areas may not overlap. Symbol can either be a variable that resides in global or constant memory space, or it can be a character string,
 *  naming a variable that resides in global or constant memory space. Kind can be either hipMemcpyHostToDevice or hipMemcpyDeviceToDevice
- *  TODO: cudaErrorInvalidSymbol and cudaErrorInvalidMemcpyDirection is not supported, use hipErrorUnknown for now
+ *  TODO: cudaErrorInvalidSymbol and cudaErrorInvalidMemcpyDirection is not supported, use hipErrorUnknown for now.  
 *
 *  @param[in]  symbolName - Symbol destination on device
 *  @param[in]  src - Data being copy from
@@ -706,8 +713,6 @@ hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t siz
 /**
 *  @brief Copy data from src to dst asynchronously.
 *
- *  It supports memory from host to device,
- *  device to host, device to device and host to host.
 *  TODO: cudaErrorInvalidMemcpyDirection error code is not supported right now, use hipErrorUnknown for now
 *
 *  @param[out] dst Data being copy to
@@ -735,7 +740,7 @@ hipError_t hipMemset(void* dst, int  value, size_t sizeBytes );


 /**
- *  @brief Fills the first count bytes of the memory area pointed to by dev with the constant byte value value.
+ *  @brief Fills the first sizeBytes bytes of the memory area pointed to by dev with the constant byte value value.
 *
 *  hipMemsetAsync() is asynchronous with respect to the host, so the call may return before the memset is complete.
 *  The operation can optionally be associated to a stream by passing a non-zero stream argument.
@@ -743,7 +748,7 @@ hipError_t hipMemset(void* dst, int  value, size_t sizeBytes );
 *
 *  @param[out] dst Pointer to device memory
 *  @param[in]  value - Value to set for each byte of specified memory
- *  @param[in]  count - Size in bytes to set
+ *  @param[in]  sizeBytes - Size in bytes to set
 *  @param[in]  stream - Stream identifier
 *  @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree
 */
@@ -817,7 +822,7 @@ hipError_t  hipDeviceEnablePeerAccess ( int  peerDevice, unsigned int  flags );
 * @param [in] dstDevice - Destination device
 * @param [in] src - Source device pointer
 * @param [in] srcDevice - Source device
- * @param [in] count - Size of memory copy in bytes
+ * @param [in] sizeBytes - Size of memory copy in bytes
 *
 * Returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDevice
 */
@@ -830,7 +835,7 @@ hipError_t hipMemcpyPeer ( void* dst, int  dstDevice, const void* src, int  srcD
 * @param [in] dstDevice - Destination device
 * @param [in] src - Source device pointer
 * @param [in] srcDevice - Source device
- * @param [in] count - Size of memory copy in bytes
+ * @param [in] sizeBytes - Size of memory copy in bytes
 * @param [in] stream - Stream identifier
 *
 * Returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDevice
@@ -125,7 +125,31 @@ typedef enum hipError_t {
  ,hipErrorTbd                     ///< Marker that more error codes are needed.
 } hipError_t;

-
+/*
+ * @brief hipDeviceAttribute_t
+ * @enum
+ * @ingroup Enumerations
+ */
+typedef enum hipDeviceAttribute_t {
+  hipDeviceAttributeMaxThreadsPerBlock,                       ///< Maximum number of threads per block.
+  hipDeviceAttributeMaxBlockDimX,                             ///< Maximum x-dimension of a block.
+  hipDeviceAttributeMaxBlockDimY,                             ///< Maximum y-dimension of a block.
+  hipDeviceAttributeMaxBlockDimZ,                             ///< Maximum z-dimension of a block.
+  hipDeviceAttributeMaxGridDimX,                              ///< Maximum x-dimension of a grid.
+  hipDeviceAttributeMaxGridDimY,                              ///< Maximum y-dimension of a grid.
+  hipDeviceAttributeMaxGridDimZ,                              ///< Maximum z-dimension of a grid.
+  hipDeviceAttributeMaxSharedMemoryPerBlock,                  ///< Maximum shared memory available per block in bytes.
+  hipDeviceAttributeTotalConstantMemory,                      ///< Constant memory size in bytes.
+  hipDeviceAttributeWarpSize,                                 ///< Warp size in threads.
+  hipDeviceAttributeMaxRegistersPerBlock,                     ///< Maximum number of 32-bit registers available to a thread block. This number is shared by all thread blocks simultaneously resident on a multiprocessor.
+  hipDeviceAttributeClockRate,                                ///< Peak clock frequency in kilohertz.
+  hipDeviceAttributeMultiprocessorCount,                      ///< Number of multiprocessors on the device.
+  hipDeviceAttributeComputeMode,                              ///< Compute mode that device is currently in.
+  hipDeviceAttributeL2CacheSize,                              ///< Size of L2 cache in bytes. 0 if the device doesn't have L2 cache.
+  hipDeviceAttributeMaxThreadsPerMultiProcessor,              ///< Maximum resident threads per multiprocessor.
+  hipDeviceAttributeComputeCapabilityMajor,                   ///< Major compute capability version number.
+  hipDeviceAttributeComputeCapabilityMinor,                   ///< Minor compute capability version number.
+} hipDeviceAttribute_t;

 /**
 *     @}
@@ -211,6 +211,57 @@ inline static hipError_t hipDeviceGetProperties(hipDeviceProp_t *p_prop, int dev
    return hipCUDAErrorTohipError(cerror);
 }

+inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device)
+{
+    cudaDeviceAttr cdattr;
+    cudaError_t cerror;
+    
+    switch (attr) {
+    case hipDeviceAttributeMaxThreadsPerBlock:
+        cdattr = cudaDevAttrMaxThreadsPerBlock; break;
+    case hipDeviceAttributeMaxBlockDimX:
+        cdattr = cudaDevAttrMaxBlockDimX; break;
+    case hipDeviceAttributeMaxBlockDimY:
+        cdattr = cudaDevAttrMaxBlockDimY; break;
+    case hipDeviceAttributeMaxBlockDimZ:
+        cdattr = cudaDevAttrMaxBlockDimZ; break;
+    case hipDeviceAttributeMaxGridDimX:
+        cdattr = cudaDevAttrMaxGridDimX; break;
+    case hipDeviceAttributeMaxGridDimY:
+        cdattr = cudaDevAttrMaxGridDimY; break;
+    case hipDeviceAttributeMaxGridDimZ:
+        cdattr = cudaDevAttrMaxGridDimZ; break;
+    case hipDeviceAttributeMaxSharedMemoryPerBlock:
+        cdattr = cudaDevAttrMaxSharedMemoryPerBlock; break;
+    case hipDeviceAttributeTotalConstantMemory:
+        cdattr = cudaDevAttrTotalConstantMemory; break;
+    case hipDeviceAttributeWarpSize:
+        cdattr = cudaDevAttrWarpSize; break;
+    case hipDeviceAttributeMaxRegistersPerBlock:
+        cdattr = cudaDevAttrMaxRegistersPerBlock; break;
+    case hipDeviceAttributeClockRate:
+        cdattr = cudaDevAttrClockRate; break;
+    case hipDeviceAttributeMultiprocessorCount:
+        cdattr = cudaDevAttrMultiProcessorCount; break;
+    case hipDeviceAttributeComputeMode:
+        cdattr = cudaDevAttrComputeMode; break;
+    case hipDeviceAttributeL2CacheSize:
+        cdattr = cudaDevAttrL2CacheSize; break;
+    case hipDeviceAttributeMaxThreadsPerMultiProcessor:
+        cdattr = cudaDevAttrMaxThreadsPerMultiProcessor; break;
+    case hipDeviceAttributeComputeCapabilityMajor:
+        cdattr = cudaDevAttrComputeCapabilityMajor; break;
+    case hipDeviceAttributeComputeCapabilityMinor:
+        cdattr = cudaDevAttrComputeCapabilityMinor; break;
+    default:
+        cerror = cudaErrorInvalidValue; break;
+    }
+
+    cerror = cudaDeviceGetAttribute(pi, cdattr, device);
+
+    return hipCUDAErrorTohipError(cerror);
+}
+
 inline static hipError_t hipMemGetInfo( size_t* free, size_t* total)
 {
    return hipCUDAErrorTohipError(cudaMemGetInfo(free,total));
@@ -789,7 +789,63 @@ hipError_t hipDeviceReset(void)
    return ihipLogStatus(hipSuccess);
 }

+/**
+ *
+ */
+hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device)
+{
+    std::call_once(hip_initialized, ihipInit);

+    hipError_t e = hipSuccess;
+
+    ihipDevice_t * hipDevice = ihipGetDevice(device);
+    hipDeviceProp_t *prop = &hipDevice->_props;
+    if (hipDevice) {
+        switch (attr) {
+        case hipDeviceAttributeMaxThreadsPerBlock:
+            *pi = prop->maxThreadsPerBlock; break;
+        case hipDeviceAttributeMaxBlockDimX:
+            *pi = prop->maxThreadsDim[0]; break;
+        case hipDeviceAttributeMaxBlockDimY:
+            *pi = prop->maxThreadsDim[1]; break;
+        case hipDeviceAttributeMaxBlockDimZ:
+            *pi = prop->maxThreadsDim[2]; break;
+        case hipDeviceAttributeMaxGridDimX:
+            *pi = prop->maxGridSize[0]; break;
+        case hipDeviceAttributeMaxGridDimY:
+            *pi = prop->maxGridSize[1]; break;
+        case hipDeviceAttributeMaxGridDimZ:
+            *pi = prop->maxGridSize[2]; break;
+        case hipDeviceAttributeMaxSharedMemoryPerBlock:
+            *pi = prop->sharedMemPerBlock; break;
+        case hipDeviceAttributeTotalConstantMemory:
+            *pi = prop->totalConstMem; break;
+        case hipDeviceAttributeWarpSize:
+            *pi = prop->warpSize; break;
+        case hipDeviceAttributeMaxRegistersPerBlock:
+            *pi = prop->regsPerBlock; break;
+        case hipDeviceAttributeClockRate:
+            *pi = prop->clockRate; break;
+        case hipDeviceAttributeMultiprocessorCount:
+            *pi = prop->multiProcessorCount; break;
+        case hipDeviceAttributeComputeMode:
+            *pi = prop->computeMode; break;
+        case hipDeviceAttributeL2CacheSize:
+            *pi = prop->l2CacheSize; break;
+        case hipDeviceAttributeMaxThreadsPerMultiProcessor:
+            *pi = prop->maxThreadsPerMultiProcessor; break;
+        case hipDeviceAttributeComputeCapabilityMajor:
+            *pi = prop->major; break;
+        case hipDeviceAttributeComputeCapabilityMinor:
+            *pi = prop->minor; break;
+        default:
+            e = hipErrorInvalidValue; break;
+        }
+    } else {
+        e = hipErrorInvalidDevice;
+    }
+    return ihipLogStatus(e);
+}


 /**
@@ -1367,8 +1423,8 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind
    } else {
        e = hipErrorInvalidResourceHandle;
    }
-
        
+
 #else
    // TODO-hsart - what synchronization does hsa_copy provide?
    hc::am_copy(dst, src, sizeBytes);
@@ -8,33 +8,20 @@ include_directories( ${PROJECT_SOURCE_DIR}/include )
 set (HIP_Unit_Test_VERSION_MAJOR 1)
 set (HIP_Unit_Test_VERSION_MINOR 0)

-set (CUDA_PATH $ENV{CUDA_PATH})
-if (NOT DEFINED CUDA_PATH)
-	set( CUDA_PATH /usr/local/cuda)
-endif()
-
 set (HIP_PATH $ENV{HIP_PATH})
 if (NOT DEFINED HIP_PATH)
 	set (HIP_PATH ../..)
 endif()

-set (HIP_PLATFORM $ENV{HIP_PLATFORM})
-if (NOT DEFINED HIP_PLATFORM) 
-	if (EXISTS $CUDA_PATH)	
-        set (HIP_PLATFORM nvcc)
-	else()
-        set (HIP_PLATFORM hcc)
-	endif()
-endif()
-
+execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)

+MESSAGE ("HIP_PATH=" ${HIP_PATH})

 if (${HIP_PLATFORM} STREQUAL "hcc")
-	MESSAGE ("HCC")
+	MESSAGE ("HIP_PLATFORM=hcc")
 	set (HC_PATH ${HIP_PATH}/hc)
 	set (HSA_PATH /opt/hsa)

-
 	#---
 	# Add HSA library:
 	add_library(hsa-runtime64 SHARED IMPORTED)
@@ -51,12 +38,13 @@ if (${HIP_PLATFORM} STREQUAL "hcc")


 elseif (${HIP_PLATFORM} STREQUAL "nvcc")
-	MESSAGE ("NVCC")
+	MESSAGE ("HIP_PLATFORM=nvcc")
 	# NVCC does not not support -rdynamic option
 	set(CMAKE_SHARED_LIBRARY_LINK_CXX_FLAGS )
 	set(CMAKE_SHARED_LIBRARY_LINK_C_FLAGS )
+
 else()
-    MESSAGE ("UNKNOWN HIP_PLATFORM=" ${HIP_PLATFORM})
+	MESSAGE (FATAL_ERROR "UNKNOWN HIP_PLATFORM=" ${HIP_PLATFORM})
 endif()

 set (HIPCC ${HIP_PATH}/bin/hipcc)
@@ -109,19 +97,15 @@ macro (make_test_matches exe match_string)
    )
 endmacro()

-
-#set(CMAKE_INSTALL_PREFIX "./install")
-#install (TARGETS hipMemset DESTINATION bin)
-#install (TARGETS hipEventRecord DESTINATION bin)
-
 make_hip_executable (hip_ballot hip_ballot.cpp)
 make_hip_executable (hip_anyall hip_anyall.cpp) 
 make_hip_executable (hip_popc hip_popc.cpp) 
 make_hip_executable (hip_clz hip_clz.cpp) 
 make_hip_executable (hip_brev hip_brev.cpp) 
 make_hip_executable (hip_ffs hip_ffs.cpp) 
-make_hip_executable (hipMemset hipMemset.cpp) 
+make_hip_executable (hipInfo hipInfo.cpp) 
 make_hip_executable (hipMemcpy hipMemcpy.cpp) 
+make_hip_executable (hipMemset hipMemset.cpp) 
 make_hip_executable (hipEventRecord hipEventRecord.cpp) 
 make_hip_executable (hipLanguageExtensions hipLanguageExtensions.cpp) 
 make_hip_executable (hipGridLaunch hipGridLaunch.cpp) 
@@ -0,0 +1,80 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+// Test the device info API extensions for HIP:
+
+#include <stdio.h>
+#include <iostream>
+#include <hip_runtime.h>
+
+#include "test_common.h"
+
+#define CHECK(error) \
+    if (error != hipSuccess) { \
+      fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error,__FILE__, __LINE__); \
+    exit(EXIT_FAILURE);\
+    }
+
+hipError_t test_hipDeviceGetAttribute(int deviceId, hipDeviceAttribute_t attr, int expectedValue = 0)
+{
+    int value = 0;
+    std::cout << "Test hipDeviceGetAttribute attribute " << attr;
+    if (expectedValue) { std::cout << " expected value " << expectedValue; }
+    hipError_t e = hipDeviceGetAttribute(&value, attr, deviceId);
+    std::cout << " actual value " << value << std::endl;
+    if (expectedValue && value != expectedValue) {
+        std::cout << "fail" << std::endl;
+        return hipErrorInvalidValue;
+    }
+    return hipSuccess;
+}
+
+int main(int argc, char *argv[])
+{
+    int deviceId;
+    CHECK (hipGetDevice(&deviceId));
+    hipDeviceProp_t props;
+    CHECK(hipDeviceGetProperties(&props, deviceId));
+    printf ("info: running on device #%d %s\n", deviceId, props.name);
+
+    CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMaxThreadsPerBlock, props.maxThreadsPerBlock));
+    CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMaxBlockDimX, props.maxThreadsDim[0]));
+    CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMaxBlockDimY, props.maxThreadsDim[1]));
+    CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMaxBlockDimZ, props.maxThreadsDim[2]));
+    CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMaxGridDimX, props.maxGridSize[0]));
+    CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMaxGridDimY, props.maxGridSize[1]));
+    CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMaxGridDimZ, props.maxGridSize[2]));
+    CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMaxSharedMemoryPerBlock, props.sharedMemPerBlock));
+    CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeTotalConstantMemory, props.totalConstMem));
+    CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeWarpSize, props.warpSize));
+    CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMaxRegistersPerBlock, props.regsPerBlock));
+    CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeClockRate, props.clockRate));
+    CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMultiprocessorCount, props.multiProcessorCount));
+    CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeComputeMode, props.computeMode));
+    CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeL2CacheSize, props.l2CacheSize));
+    CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMaxThreadsPerMultiProcessor, props.maxThreadsPerMultiProcessor));
+    CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeComputeCapabilityMajor, props.major));
+    CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeComputeCapabilityMinor, props.minor));
+
+    passed();
+
+};
+
@@ -31,9 +31,11 @@ int main(int argc, char *argv[])

    HipTest::parseStandardArguments(argc, argv, true);

+    HIPCHECK(hipSetDevice(p_gpuDevice));
+
    size_t Nbytes = N*sizeof(char);

-    printf ("N=%zu  memsetval=%2x\n", N, memsetval);
+    printf ("N=%zu  memsetval=%2x device=%d\n", N, memsetval, p_gpuDevice);

    char *A_d;
    char *A_h;
@@ -27,6 +27,7 @@ char memsetval=0x42;
 int iterations = 1;
 unsigned blocksPerCU = 6; // to hide latency
 unsigned threadsPerBlock = 256; 
+int p_gpuDevice = 0;



@@ -105,7 +106,12 @@ int parseStandardArguments(int argc, char *argv[], bool failOnUndefinedArg)
            memsetval = ex;
        } else if (!strcmp(arg, "--iterations") || (!strcmp(arg, "-i"))) {
            if (++i >= argc || !HipTest::parseInt(argv[i], &iterations)) {
-               failed("Bad itertions argument"); 
+               failed("Bad iterations argument"); 
+            }
+
+        } else if (!strcmp(arg, "--gpu") || (!strcmp(arg, "-g"))) {
+            if (++i >= argc || !HipTest::parseInt(argv[i], &p_gpuDevice)) {
+               failed("Bad gpuDevice argument"); 
            }

        } 
@@ -29,12 +29,15 @@


 #define HIPCHECK(error) \
-    if (error != hipSuccess) { \
+{\
+    hipError_t localError = error; \
+    if (localError != hipSuccess) { \
      printf("%serror: '%s'(%d) at %s:%d%s\n", \
-          KRED,hipGetErrorString(error), error,\
+          KRED,hipGetErrorString(localError), localError,\
      __FILE__, __LINE__,KNRM); \
        failed("API returned error code.");\
-    }
+    }\
+}

 #define HIPASSERT(condition) \
    if (! (condition) ) { \
@@ -49,6 +52,7 @@ extern char memsetval;
 extern int iterations;
 extern unsigned blocksPerCU;
 extern unsigned threadsPerBlock;
+extern int p_gpuDevice;

 namespace HipTest {