kernel iteration filtering for counter collection (#911)

* kernel filtering for counter collection * fixing trace tests * removing print statements * fix CI fail * handling preload and updating docs * minor fix * misc fix * misc fix * Typo fix * Update rocprofv3 + input schema - "application_passes" -> "jobs" - removed nesting in YAML/JSON inputs - improved customAction (now booleanArgAction) - supports --<name> (defaults to true) - supports --<name>=<truth-value> - supports --<name> <truth-value> - added --kernel-iteration-range to command-line - automatically support new command-line options in YAML/JSON input - standardized PMC return from text input to match PMC from YAML/JSON input - added support for --log-level env - updated various input*.(yml|json) to modified schema * Update config.cpp - added recommended code to get_kernel_filter_range * Fixing iteration * misc fix * support only [-] for iteration * bug fix * Fix using-rocprofv3.rst * Update config.cpp - patch get_kernel_filter_range --------- Co-authored-by: Jonathan R. Madsen <jonathanrmadsen@gmail.com>
2024-07-26 21:46:53 -05:00
parent dc671497da
commit ace34abd11
23 changed files with 1303 additions and 275 deletions
@@ -6,12 +6,60 @@ import argparse
 import subprocess


+class dotdict(dict):
+    """dot.notation access to dictionary attributes"""
+
+    __getattr__ = dict.get
+    __setattr__ = dict.__setitem__
+    __delattr__ = dict.__delitem__
+
+    def __init__(self, d):
+        super(dotdict, self).__init__(d)
+        for k, v in self.items():
+            if isinstance(v, dict):
+                self.__setitem__(k, dotdict(v))
+            elif isinstance(v, (list, tuple)):
+                self.__setitem__(
+                    k,
+                    [dotdict(i) if isinstance(i, (list, tuple, dict)) else i for i in v],
+                )
+
+
 def fatal_error(msg, exit_code=1):
    sys.stderr.write(f"Fatal error: {msg}\n")
    sys.stderr.flush()
    sys.exit(exit_code)


+def strtobool(val):
+    """Convert a string representation of truth to true or false.
+    True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values
+    are 'n', 'no', 'f', 'false', 'off', and '0'.  Raises ValueError if
+    'val' is anything else.
+    """
+    if isinstance(val, (list, tuple)):
+        if len(val) > 1:
+            val_type = type(val).__name__
+            raise ValueError(f"invalid truth value {val} (type={val_type})")
+        else:
+            val = val[0]
+
+    if isinstance(val, bool):
+        return val
+    elif isinstance(val, str) and val.lower() in ("y", "yes", "t", "true", "on", "1"):
+        return True
+    elif isinstance(val, str) and val.lower() in ("n", "no", "f", "false", "off", "0"):
+        return False
+    else:
+        val_type = type(val).__name__
+        raise ValueError(f"invalid truth value {val} (type={val_type})")
+
+
+class booleanArgAction(argparse.Action):
+    def __call__(self, parser, args, value, option_string=None):
+        setattr(args, self.dest, strtobool(value))
+
+
 def parse_arguments(args=None):

    usage_examples = """
@@ -35,112 +83,90 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins
        formatter_class=argparse.RawTextHelpFormatter,
    )

+    def add_parser_bool_argument(*args, **kwargs):
+        parser.add_argument(
+            *args,
+            **kwargs,
+            action=booleanArgAction,
+            nargs="?",
+            const=True,
+            type=str,
+            required=False,
+            metavar="BOOL",
+        )
+
    # Add the arguments
-    parser.add_argument(
+    add_parser_bool_argument(
        "--hip-trace",
-        action="store_true",
        help="For Collecting HIP Traces (runtime + compiler)",
-        required=False,
    )
-    parser.add_argument(
+    add_parser_bool_argument(
        "--hip-runtime-trace",
-        action="store_true",
        help="For Collecting HIP Runtime API Traces",
-        required=False,
    )
-    parser.add_argument(
+    add_parser_bool_argument(
        "--hip-compiler-trace",
-        action="store_true",
        help="For Collecting HIP Compiler generated code Traces",
-        required=False,
    )
-    parser.add_argument(
+    add_parser_bool_argument(
        "--marker-trace",
-        action="store_true",
        help="For Collecting Marker (ROCTx) Traces",
-        required=False,
    )
-    parser.add_argument(
+    add_parser_bool_argument(
        "--kernel-trace",
-        action="store_true",
        help="For Collecting Kernel Dispatch Traces",
-        required=False,
    )
-    parser.add_argument(
+    add_parser_bool_argument(
        "--memory-copy-trace",
-        action="store_true",
        help="For Collecting Memory Copy Traces",
-        required=False,
    )
-    parser.add_argument(
+    add_parser_bool_argument(
        "--scratch-memory-trace",
-        action="store_true",
        help="For Collecting Scratch Memory operations Traces",
-        required=False,
    )
-    parser.add_argument(
+    add_parser_bool_argument(
        "--stats",
-        action="store_true",
        help="For Collecting statistics of enabled tracing types",
-        required=False,
    )
-    parser.add_argument(
+    add_parser_bool_argument(
        "--hsa-trace",
-        action="store_true",
        help="For Collecting HSA Traces (core + amd + image + finalizer)",
-        required=False,
    )
-    parser.add_argument(
+    add_parser_bool_argument(
        "--hsa-core-trace",
-        action="store_true",
        help="For Collecting HSA API Traces (core API)",
-        required=False,
    )
-    parser.add_argument(
+    add_parser_bool_argument(
        "--hsa-amd-trace",
-        action="store_true",
        help="For Collecting HSA API Traces (AMD-extension API)",
-        required=False,
    )
-    parser.add_argument(
+    add_parser_bool_argument(
        "--hsa-image-trace",
-        action="store_true",
        help="For Collecting HSA API Traces (Image-extenson API)",
-        required=False,
    )
-    parser.add_argument(
+    add_parser_bool_argument(
        "--hsa-finalizer-trace",
-        action="store_true",
        help="For Collecting HSA API Traces (Finalizer-extension API)",
-        required=False,
    )
-    parser.add_argument(
+    add_parser_bool_argument(
        "-s",
        "--sys-trace",
-        action="store_true",
        help="For Collecting HIP, HSA, Marker (ROCTx), Memory copy, Scratch memory, and Kernel dispatch traces",
-        required=False,
    )
-    parser.add_argument(
+    add_parser_bool_argument(
        "-M",
        "--mangled-kernels",
-        action="store_true",
        help="Do not demangle the kernel names",
-        required=False,
    )
-    parser.add_argument(
+    add_parser_bool_argument(
        "-T",
        "--truncate-kernels",
-        action="store_true",
        help="Truncate the demangled kernel names",
-        required=False,
    )
-    parser.add_argument(
+    add_parser_bool_argument(
        "-L",
        "--list-metrics",
-        action="store_true",
        help="List metrics for counter collection",
-        required=False,
    )
    parser.add_argument(
        "-i",
@@ -168,7 +194,7 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins
        "--output-format",
        help="For adding output format (supported formats: csv, json, pftrace)",
        nargs="+",
-        default=["csv"],
+        default=None,
        choices=("csv", "json", "pftrace"),
        type=str.lower,
    )
@@ -176,15 +202,29 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins
        "--log-level",
        help="Set the log level",
        default=None,
-        choices=("fatal", "error", "warning", "info", "trace"),
+        choices=("fatal", "error", "warning", "info", "trace", "env"),
        type=str.lower,
    )
    parser.add_argument(
-        "--kernel-names",
-        help="Filter kernel names",
+        "--kernel-include-regex",
+        help="Include the kernels matching this filter",
        default=None,
        type=str,
+        metavar="REGULAR_EXPRESSION",
+    )
+    parser.add_argument(
+        "--kernel-exclude-regex",
+        help="Exclude the kernels matching this filter",
+        default=None,
+        type=str,
+        metavar="REGULAR_EXPRESSION",
+    )
+    parser.add_argument(
+        "--kernel-iteration-range",
+        help="Iteration range",
        nargs="+",
+        default=None,
+        type=str,
    )
    parser.add_argument(
        "--preload",
@@ -197,6 +237,7 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins
        args = sys.argv[1:]

    rocp_args = args[:]
+
    app_args = []

    for idx, itr in enumerate(args):
@@ -215,11 +256,19 @@ def parse_yaml(yaml_file):
        fatal_error(
            f"{e}\n\nYAML package is not installed. Run '{sys.executable} -m pip install pyyaml' or use JSON or text format"
        )
-
    try:
+        lst = []
        with open(yaml_file, "r") as file:
            data = yaml.safe_load(file)
-        return [" ".join(itr["pmc"]) for itr in data["metrics"]]
+        for itr in data["jobs"]:
+            # TODO: support naming jobs
+            # if isinstance(itr, str):
+            #     itr = data["jobs"][itr]
+            itr["sub_directory"] = "pass_"
+            lst.append(itr)
+
+        return [dotdict(itr) for itr in lst]
+
    except yaml.YAMLError as exc:
        fatal_error(f"{exc}")

@@ -230,9 +279,15 @@ def parse_json(json_file):
    import json

    try:
+        lst = []
        with open(json_file, "r") as file:
            data = json.load(file)
-        return [" ".join(itr["pmc"]) for itr in data["metrics"]]
+        for itr in data["jobs"]:
+            itr["sub_directory"] = "pass_"
+            lst.append(itr)
+
+        return [dotdict(itr) for itr in lst]
+
    except Exception as e:
        fatal_error(f"{e}")

@@ -252,10 +307,10 @@ def parse_text(text_file):
        def _dedup(_line, _sep):
            for itr in _sep:
                _line = " ".join(_line.split(itr))
-            return _line
+            return _line.strip()

        # remove tabs and duplicate spaces
-        return _dedup(line.replace("pmc:", ""), ["\t", " "]).strip()
+        return _dedup(line.replace("pmc:", ""), ["\n", "\t", " "]).split(" ")

    try:
        with open(text_file, "r") as file:
@@ -271,25 +326,81 @@ def parse_text(text_file):


 def parse_input(input_file):
-    pmc_lines = []
+
    _, extension = os.path.splitext(input_file)
    if extension == ".txt":
-        pmc_lines = parse_text(input_file)
+        text_input = parse_text(input_file)
+        text_input_lst = [{"pmc": itr, "sub_directory": "pmc_"} for itr in text_input]
+        return [dotdict(itr) for itr in text_input_lst]
    elif extension in (".yaml", ".yml"):
-        pmc_lines = parse_yaml(input_file)
+        return parse_yaml(input_file)
    elif extension == ".json":
-        pmc_lines = parse_json(input_file)
+        return parse_json(input_file)
    else:
        fatal_error(
            f"Input file '{input_file}' does not have a recognized extension (.txt, .json, .yaml, .yml)\n"
        )

-    return pmc_lines
+    return None


-def main(argv=None):
+def has_set_attr(obj, key):
+    if obj and hasattr(obj, key) and getattr(obj, key) is not None:
+        return True
+    else:
+        return False
+
+
+def patch_args(data):
+    """Used to handle certain fields which might be specified as a string instead of an array or vice-versa"""
+
+    if hasattr(data, "kernel_iteration_range") and isinstance(
+        data.kernel_iteration_range, str
+    ):
+        data.kernel_iteration_range = [data.kernel_iteration_range]
+    return data
+
+
+def get_args(cmd_args, inp_args):
+
+    def ensure_type(name, var, type_id):
+        if not isinstance(var, type_id):
+            raise TypeError(
+                f"{name} is of type {type(var).__name__}, expected {type(type_id).__name__}"
+            )
+
+    ensure_type("cmd_args", cmd_args, argparse.Namespace)
+    ensure_type("inp_args", inp_args, dotdict)
+
+    cmd_keys = list(cmd_args.__dict__.keys())
+    inp_keys = list(inp_args.keys())
+    data = {}
+
+    def get_attr(key):
+        if has_set_attr(cmd_args, key):
+            return getattr(cmd_args, key)
+        elif has_set_attr(inp_args, key):
+            return getattr(inp_args, key)
+        return None
+
+    for itr in set(cmd_keys + inp_keys):
+        if (
+            has_set_attr(cmd_args, itr)
+            and has_set_attr(inp_args, itr)
+            and getattr(cmd_args, itr) != getattr(inp_args, itr)
+        ):
+            raise RuntimeError(f"conflicting value for {itr}")
+        else:
+            data[itr] = get_attr(itr)
+
+    return patch_args(dotdict(data))
+
+
+def run(app_args, args, **kwargs):

    app_env = dict(os.environ)
+    use_execv = kwargs.get("use_execv", True)
+    app_pass = kwargs.get("pass_id", None)

    def update_env(env_var, env_val, **kwargs):
        """Local function for updating application environment which supports
@@ -361,12 +472,12 @@ def main(argv=None):
    ROCPROF_TOOL_LIBRARY = f"{ROCM_DIR}/lib/rocprofiler-sdk/librocprofiler-sdk-tool.so"
    ROCPROF_SDK_LIBRARY = f"{ROCM_DIR}/lib/librocprofiler-sdk.so"

-    args, app_args = parse_arguments(argv)
+    args.preload = [itr for itr in args.preload if itr]
+    if args.preload:
+        update_env("LD_PRELOAD", ":".join(args.preload), prepend=True)

-    _preload = ":".join(args.preload) if args.preload else None
-
-    update_env("LD_PRELOAD", _preload, prepend=True)
    update_env("LD_PRELOAD", f"{ROCPROF_TOOL_LIBRARY}:{ROCPROF_SDK_LIBRARY}", append=True)
+
    update_env(
        "ROCP_TOOL_LIBRARIES",
        f"{ROCPROF_TOOL_LIBRARY}",
@@ -385,17 +496,21 @@ def main(argv=None):

    update_env("ROCPROF_OUTPUT_FILE_NAME", _output_file)
    update_env("ROCPROF_OUTPUT_PATH", _output_path)
+    if app_pass is not None:
+        app_env["ROCPROF_OUTPUT_PATH"] = os.path.join(
+            f"{_output_path}", f"{args.sub_directory}{app_pass}"
+        )

    if args.output_file is not None or args.output_directory is not None:
        update_env("ROCPROF_OUTPUT_LIST_METRICS_FILE", True)

+    if not args.output_format:
+        args.output_format = ["csv"]
+
    update_env(
        "ROCPROF_OUTPUT_FORMAT", ",".join(args.output_format), append=True, join_char=","
    )

-    _kernel_names = ",".join(args.kernel_names) if args.kernel_names else None
-    update_env("ROCPROF_KERNEL_NAMES", _kernel_names, append=True, join_char=",")
-
    if args.sys_trace:
        for itr in (
            "hip_trace",
@@ -445,7 +560,9 @@ def main(argv=None):

    update_env("ROCPROF_STATS", args.stats, overwrite_if_true=True)
    update_env(
-        "ROCPROF_DEMANGLE_KERNELS", not args.mangled_kernels, overwrite_if_false=True
+        "ROCPROF_DEMANGLE_KERNELS",
+        not args.mangled_kernels,
+        overwrite_if_false=True,
    )
    update_env(
        "ROCPROF_TRUNCATE_KERNELS",
@@ -458,61 +575,96 @@ def main(argv=None):
        overwrite_if_true=True,
    )

-    for itr in ("ROCPROF", "ROCPROFILER", "ROCTX"):
-        update_env(
-            f"{itr}_LOG_LEVEL",
-            args.log_level,
-        )
+    if args.log_level and args.log_level not in ("env"):
+        for itr in ("ROCPROF", "ROCPROFILER", "ROCTX"):
+            update_env(
+                f"{itr}_LOG_LEVEL",
+                args.log_level,
+            )

    def log_config(_env):
        existing_env = dict(os.environ)
-        init_message = "- rocprofv3 configuration:\n"
+        init_message = "\n- rocprofv3 configuration{}:\n".format(
+            "" if app_pass is None else f" (pass {app_pass})"
+        )
        for key, itr in _env.items():
            if key not in existing_env.keys():
                if init_message:
                    sys.stderr.write(init_message)
                    init_message = None
                sys.stderr.write(f"\t- {key}={itr}\n")
+        if init_message is None:
+            sys.stderr.write("\n")
        sys.stderr.flush()

    if args.list_metrics:
        app_args = [f"{ROCM_DIR}/lib/rocprofiler-sdk/rocprofv3-trigger-list-metrics"]
+
    elif not app_args:
        log_config(app_env)
        fatal_error("No application provided")

-    pmc_lines = []
-    if args.input:
-        pmc_lines = parse_input(args.input)
+    if args.kernel_include_regex:
+        update_env(
+            "ROCPROF_KERNEL_FILTER_INCLUDE_REGEX",
+            args.kernel_include_regex,
+        )

-    if pmc_lines:
-        exit_code = 0
-        update_env("ROCPROF_COUNTER_COLLECTION", True, overwrite_if_true=True)
+    if args.kernel_exclude_regex:
+        update_env(
+            "ROCPROF_KERNEL_FILTER_EXCLUDE_REGEX",
+            args.kernel_exclude_regex,
+        )

-        for idx, pmc_line in enumerate(pmc_lines):
-            COUNTER = idx + 1
-            pmc_env = dict(app_env)
-            pmc_env["ROCPROF_COUNTERS"] = f"pmc: {pmc_line}"
-            pmc_env["ROCPROF_OUTPUT_PATH"] = os.path.join(
-                f"{_output_path}", f"pmc_{COUNTER}"
-            )
+    if args.kernel_iteration_range:
+        update_env("ROCPROF_KERNEL_FILTER_RANGE", ", ".join(args.kernel_iteration_range))

-            if args.log_level in ("info", "trace"):
-                log_config(pmc_env)
-
-            try:
-                exit_code = subprocess.check_call(app_args, env=pmc_env)
-                if exit_code != 0:
-                    fatal_error("Application exited with non-zero exit code", exit_code)
-            except Exception as e:
-                fatal_error(f"{e}\n")
-
-        return exit_code
+    if args.pmc:
+        update_env("ROCPROF_COUNTER_COLLECTION", True, overwrite=True)
+        update_env(
+            "ROCPROF_COUNTERS", "pmc: {}".format(" ".join(args.pmc)), overwrite=True
+        )
    else:
-        if args.log_level in ("info", "trace"):
-            log_config(app_env)
+        update_env("ROCPROF_COUNTER_COLLECTION", False, overwrite=True)
+
+    if args.log_level in ("info", "trace", "env"):
+        log_config(app_env)
+
+    if use_execv:
        # does not return
        os.execvpe(app_args[0], app_args, env=app_env)
+    else:
+        try:
+            exit_code = subprocess.check_call(app_args, env=app_env)
+            if exit_code != 0:
+                fatal_error("Application exited with non-zero exit code", exit_code)
+        except Exception as e:
+            fatal_error(f"{e}\n")
+        return exit_code
+
+
+def main(argv=None):
+
+    cmd_args, app_args = parse_arguments(argv)
+    inp_args = (
+        parse_input(cmd_args.input) if getattr(cmd_args, "input") else [dotdict({})]
+    )
+
+    if len(inp_args) == 1:
+        args = get_args(cmd_args, inp_args[0])
+        pass_idx = None
+        if hasattr(args, "pmc") and args.pmc is not None and len(args.pmc) > 0:
+            pass_idx = 1
+        run(app_args, args, pass_id=pass_idx)
+    else:
+        for idx, itr in enumerate(inp_args):
+            args = get_args(cmd_args, itr)
+            run(
+                app_args,
+                args,
+                pass_id=(idx + 1),
+                use_execv=False,
+            )


 if __name__ == "__main__":
@@ -1,5 +1,5 @@
 .. meta::
-  :description: Documentation of the installation, configuration, use of the ROCProfiler SDK, and rocprofv3 command-line tool 
+  :description: Documentation of the installation, configuration, use of the ROCProfiler SDK, and rocprofv3 command-line tool
  :keywords: ROCProfiler SDK tool, ROCProfiler SDK library, rocprofv3, ROCm, API, reference

 .. _using-rocprofv3:
@@ -8,7 +8,7 @@
 Using rocprofv3
 ======================

-``rocprofv3`` is a CLI tool that helps you quickly optimize applications and understand the low-level kernel details without requiring any modification in the source code. 
+``rocprofv3`` is a CLI tool that helps you quickly optimize applications and understand the low-level kernel details without requiring any modification in the source code.
 It is being developed to be backward compatible with its predecessor, ``rocprof``, and to provide more features for application profiling with better accuracy.

 The following sections demonstrate the use of ``rocprofv3`` for application tracing and kernel profiling using various command-line options.
@@ -37,7 +37,7 @@ Here is the list of ``rocprofv3`` command-line options. Some options are used fo
  * - Option
    - Description
    - Use
-  
+
  * - ``--hip-trace``
    - Collects HIP runtime traces.
    - Application tracing
@@ -113,7 +113,7 @@ Here is the list of ``rocprofv3`` command-line options. Some options are used fo
  * - ``-o`` \| ``--output-file``
    - Specifies the name of the output file. Note that this name is appended to the default names (_api_trace or counter_collection.csv) of the generated files'.
    - Output control
-    
+
  * - ``-M`` \| ``--mangled-kernels``
    - Overrides the default demangling of kernel names.
    - Output control
@@ -125,7 +125,7 @@ Here is the list of ``rocprofv3`` command-line options. Some options are used fo
  * - ``--output-format``
    - For adding output format (supported formats: csv, json, pftrace)
    - Output control
-  
+
  * - ``--preload``
    - Libraries to prepend to LD_PRELOAD (usually for sanitizers)
    - Extension
@@ -170,9 +170,9 @@ The above command generates a `hip_api_trace.csv` file prefixed with the process
 Here are the contents of `hip_api_trace.csv` file:

 .. csv-table:: HIP runtime api trace
-   :file: /data/hip_compile_trace.csv 
-   :widths: 10,10,10,10,10,20,20  
-   :header-rows: 1  
+   :file: /data/hip_compile_trace.csv
+   :widths: 10,10,10,10,10,20,20
+   :header-rows: 1

 To trace HIP compile time APIs, use:

@@ -189,9 +189,9 @@ The above command generates a `hip_api_trace.csv` file prefixed with the process
 Here are the contents of `hip_api_trace.csv` file:

 .. csv-table:: HIP compile time api trace
-   :file: /data/hip_compile_trace.csv 
-   :widths: 10,10,10,10,10,20,20   
-   :header-rows: 1  
+   :file: /data/hip_compile_trace.csv
+   :widths: 10,10,10,10,10,20,20
+   :header-rows: 1

 For the description of the fields in the output file, see :ref:`output-file-fields`.

@@ -214,7 +214,7 @@ The HIP runtime library is implemented with the low-level HSA runtime. HSA API t
 HSA trace contains the start and end time of HSA runtime API calls and their asynchronous activities.

 .. code-block:: bash
-  
+
  rocprofv3 --hsa-trace -- < app_relative_path >

 The above command generates a `hsa_api_trace.csv` file prefixed with process ID. Note that the contents of this file have been truncated for demonstration purposes.
@@ -226,9 +226,9 @@ The above command generates a `hsa_api_trace.csv` file prefixed with process ID.
 Here are the contents of `hsa_api_trace.csv` file:

 .. csv-table:: HSA api trace
-   :file: /data/hsa_trace.csv 
-   :widths: 10,10,10,10,10,20,20   
-   :header-rows: 1  
+   :file: /data/hsa_trace.csv
+   :widths: 10,10,10,10,10,20,20
+   :header-rows: 1

 For the description of the fields in the output file, see :ref:`output-file-fields`.

@@ -284,9 +284,9 @@ Running the preceding command generates a `marker_api_trace.csv` file prefixed w
 Here are the contents of `marker_api_trace.csv` file:

 .. csv-table:: Marker api trace
-   :file: /data/marker_api_trace.csv 
-   :widths: 10,10,10,10,10,20,20   
-   :header-rows: 1  
+   :file: /data/marker_api_trace.csv
+   :widths: 10,10,10,10,10,20,20
+   :header-rows: 1

 For the description of the fields in the output file, see :ref:`output-file-fields`.

@@ -308,10 +308,10 @@ The above command generates a `kernel_trace.csv` file prefixed with the process
 Here are the contents of `kernel_trace.csv` file:

 .. csv-table:: Kernel trace
-   :file: /data/kernel_trace.csv 
-   :widths: 10,10,10,10,10,10,20,20,10,10,10,10,10,10,10,10   
+   :file: /data/kernel_trace.csv
+   :widths: 10,10,10,10,10,10,20,20,10,10,10,10,10,10,10,10
   :header-rows: 1
-  
+
 For the description of the fields in the output file, see :ref:`output-file-fields`.

 Memory copy trace
@@ -332,8 +332,8 @@ The above command generates a `memory_copy_trace.csv` file prefixed with the pro
 Here are the contents of `memory_copy_trace.csv` file:

 .. csv-table:: Memory copy trace
-   :file: /data/memory_copy_trace.csv 
-   :widths: 10,10,10,10,10,20,20  
+   :file: /data/memory_copy_trace.csv
+   :widths: 10,10,10,10,10,20,20
   :header-rows: 1

 For the description of the fields in the output file, see :ref:`output-file-fields`.
@@ -377,8 +377,8 @@ The above command generates a `hip_stats.csv` and `hip_api_trace` file prefixed
 Here are the contents of `hip_stats.csv` file:

 .. csv-table:: HIP stats
-   :file: /data/hip_stats.csv 
-   :widths: 10,10,20,20,10,10,10,10   
+   :file: /data/hip_stats.csv
+   :widths: 10,10,20,20,10,10,10,10
   :header-rows: 1


@@ -392,46 +392,140 @@ For a comprehensive list of counters available on MI200, see `MI200 performance
 Input file
 ++++++++++++

-To collect the desired basic counters or derived metrics, mention them in an input file. In the input file, the line consisting of the counter or metric names must begin with ``pmc``. The input file could be in text (.txt), yaml (.yaml/.yml), or JSON (.json) format.
+``rocprofv3`` supports three input file formats: text (.txt), YAML (.yaml/.yml), and JSON (.json).

-.. code-block:: shell
+Text input is used to collect the desired basic counters or derived metrics. In the input file, the line consisting of the counter or metric names must begin with ``pmc``.
+JSON and YAML input files support all command-line options, so each run can be configured with a different set of options.
+The schema supported by JSON and YAML input files is given below:

-  $ cat input.txt
+*Schema for the rocprofv3 JSON/YAML input*

-  pmc: GPUBusy SQ_WAVES
-  pmc: GRBM_GUI_ACTIVE
+Properties
++++++++++++
+
+-  **``jobs``** *(array)*: rocprofv3 input data per application run.
+
+   -  **Items** *(object)*: data for rocprofv3.
+
+      -  **``pmc``** *(array)*: list of counters to collect.
+      -  **``kernel_include_regex``** *(string)*: regex string.
+      -  **``kernel_exclude_regex``** *(string)*: regex string.
+      -  **``kernel_iteration_range``** *(string)*: iteration range for
+         each kernel that matches the filter [start-stop].
+      -  **``hip_trace``** *(boolean)*: For Collecting HIP Traces
+         (runtime + compiler).
+      -  **``hip_runtime_trace``** *(boolean)*: For Collecting HIP
+         Runtime API Traces.
+      -  **``hip_compiler_trace``** *(boolean)*: For Collecting HIP
+         Compiler generated code Traces.
+      -  **``marker_trace``** *(boolean)*: For Collecting Marker (ROCTx)
+         Traces.
+      -  **``kernel_trace``** *(boolean)*: For Collecting Kernel
+         Dispatch Traces.
+      -  **``memory_copy_trace``** *(boolean)*: For Collecting Memory
+         Copy Traces.
+      -  **``scratch_memory_trace``** *(boolean)*: For Collecting
+         Scratch Memory operations Traces.
+      -  **``stats``** *(boolean)*: For Collecting statistics of enabled
+         tracing types.
+      -  **``hsa_trace``** *(boolean)*: For Collecting HSA Traces (core
+         + amd + image + finalizer).
+      -  **``hsa_core_trace``** *(boolean)*: For Collecting HSA API
+         Traces (core API).
+      -  **``hsa_amd_trace``** *(boolean)*: For Collecting HSA API
+         Traces (AMD-extension API).
+      -  **``hsa_finalizer_trace``** *(boolean)*: For Collecting HSA API
+         Traces (Finalizer-extension API).
+      -  **``hsa_image_trace``** *(boolean)*: For Collecting HSA API
+         Traces (Image-extension API).
+      -  **``sys_trace``** *(boolean)*: For Collecting HIP, HSA, Marker
+         (ROCTx), Memory copy, Scratch memory, and Kernel dispatch
+         traces.
+      -  **``mangled_kernels``** *(boolean)*: Do not demangle the kernel
+         names.
+      -  **``truncate_kernels``** *(boolean)*: Truncate the demangled
+         kernel names.
+      -  **``output_file``** *(string)*: For the output file name.
+      -  **``output_directory``** *(string)*: For adding output path
+         where the output files will be saved.
+      -  **``output_format``** *(array)*: For adding output format
+         (supported formats: csv, json, pftrace).
+      -  **``list_metrics``** *(boolean)*: List the metrics.
+      -  **``log_level``** *(string)*: fatal, error, warning, info,
+         trace, or env.
+      -  **``preload``** *(array)*: Libraries to prepend to LD_PRELOAD
+         (usually for sanitizers).
+
+The number of basic counters or derived metrics that can be collected in one profiling run is limited by the GPU hardware resources. If too many counters or metrics are selected, the kernels need to be executed multiple times to collect them.
+For multi-pass execution with a text input file, include multiple ``pmc`` rows; the counters or metrics in each ``pmc`` row are collected in a separate run. JSON/YAML input files instead contain a list of jobs, and each job corresponds to one pass/run.

 .. code-block:: shell

  $ cat input.json

-  {
-    "metrics": [
-      {
-        "pmc": ["SQ_WAVES", "GRBM_COUNT", "GUI_ACTIVE"]
-      },
-      {
-        "pmc": ["FETCH_SIZE", "WRITE_SIZE"]
-      }
-    ]
-  }
+   {
+    "jobs": [
+        {
+            "hsa_trace": true,
+            "kernel_trace": true,
+            "memory_copy_trace": true,
+            "marker_trace": true,
+            "output_file": "out",
+            "output_format": [
+                "csv",
+                "json",
+                "pftrace"
+            ]
+        },
+        {
+            "pmc": [
+                "SQ_WAVES"
+            ],
+            "kernel_include_regex": ".*_kernel",
+            "kernel_exclude_regex": "multiply",
+            "kernel_iteration_range": "[1-2]",
+            "output_file": "out",
+            "output_format": [
+                "csv",
+                "json"
+            ],
+            "truncate_kernels": true
+        }
+     ]
+   }

 .. code-block:: shell

-  $ cat input.yaml
+  $ cat input.txt

-  metrics:
-    - pmc:
-        - SQ_WAVES
-        - GRBM_COUNT
-        - GUI_ACTIVE
-        - 'TCC_HIT[1]'
-        - 'TCC_HIT[2]'
-    - pmc:
-        - FETCH_SIZE
-        - WRITE_SIZE
+   pmc: GPUBusy SQ_WAVES
+   pmc: GRBM_GUI_ACTIVE
+
+.. code-block:: shell
+
+  $ cat input.yml
+
+  jobs:
+
+  - "hsa_trace": true
+    "kernel_trace": true
+    "memory_copy_trace": true
+    "marker_trace": true
+    "output_file": "out"
+    "output_format":
+      - "csv"
+      - "json"
+      - "pftrace"
+
+  - pmc:
+      - SQ_WAVES
+    kernel_include_regex: "addition"
+    kernel_exclude_regex: "multiply"
+    kernel_iteration_range:
+    - "[1-2]"
+    - "[3-4]"
+    - "[5-6]"

-The number of basic counters or derived metrics that can be collected in one run of profiling are limited by the GPU hardware resources. If too many counters or metrics are selected, the kernels need to be executed multiple times to collect them. For multi-pass execution, include multiple ``pmc`` rows in the input file. Counters or metrics in each ``pmc`` row can be collected in each kernel run.

 Kernel profiling output
 +++++++++++++++++++++++++
@@ -511,16 +605,26 @@ The following table lists the various fields or the columns in the output CSV fi
  * - VGPR_Count
    - Kernel's Vector General Purpose Register (VGPR) count.

-Kernel names
-++++++++++++++
+Kernel Filtering
+++++++++++++++++

-To target a kernel name during countr collection.
+rocprofv3 supports kernel filtering. A kernel filter consists of an include regex string (only kernels whose names match it are considered), an exclude regex string (kernels whose names match it are removed from the included set),
+and an iteration range (the set of iterations of the included kernels to profile). If the iteration range is not provided, then all iterations of the included kernels are profiled.

 .. code-block:: shell

-  rocprofv3 -i input.txt --kernel-names divide_kernel -- <app_relative_path>
- 
-  $ cat pmc_1/312_counter_collection.csv
+  $ cat input.yml
+   jobs:
+    - pmc: [SQ_WAVES]
+      kernel_include_regex: "divide"
+      kernel_exclude_regex: ""
+
+
+.. code-block:: shell
+
+  rocprofv3 -i input.yml -- <app_relative_path>
+
+  $ cat pass_1/312_counter_collection.csv
  "Correlation_Id","Dispatch_Id","Agent_Id","Queue_Id","Process_Id","Thread_Id","Grid_Size","Kernel_Name","Workgroup_Size","LDS_Block_Size","Scratch_Size","VGPR_Count","SGPR_Count","Counter_Name","Counter_Value"
  4,4,1,1,36499,36499,1048576,"divide_kernel(float*, float const*, float const*, int, int)",64,0,0,12,16,"SQ_WAVES",16384
  8,8,1,2,36499,36499,1048576,"divide_kernel(float*, float const*, float const*, int, int)",64,0,0,12,16,"SQ_WAVES",16384
@@ -0,0 +1,147 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "type": "object",
+  "description": "Schema for the rocprofv3 JSON input",
+  "properties": {
+      "jobs": {
+        "type": "array",
+        "description": "rocprofv3 input data per application run",
+        "items": {
+            "type" : "object",
+             "description": "data for rocprofv3",
+              "properties": {
+
+                    "pmc": {
+                      "type" : "array",
+                      "description": "list of counters to collect"
+                     },
+
+                     "kernel_include_regex":{
+                        "type": "string",
+                        "description": "regex string"
+                     },
+
+                     "kernel_exclude_regex": {
+                      "type": "string",
+                      "description": "regex string"
+                     },
+
+                     "kernel_iteration_range": {
+                        "type": "string",
+                        "description": "iteration range for each kernel that matches the filter [start-stop]"
+                     },
+
+                     "hip_trace": {
+                          "type": "boolean",
+                          "description": "For Collecting HIP Traces (runtime + compiler)"
+                     },
+
+                     "hip_runtime_trace": {
+                          "type": "boolean",
+                          "description": "For Collecting HIP Runtime API Traces"
+                     },
+
+                     "hip_compiler_trace": {
+                          "type": "boolean",
+                          "description": "For Collecting HIP Compiler generated code Traces"
+                     },
+
+                     "marker_trace": {
+                          "type": "boolean",
+                          "description": "For Collecting Marker (ROCTx) Traces"
+                     },
+
+                     "kernel_trace": {
+                          "type": "boolean",
+                          "description": "For Collecting Kernel Dispatch Traces"
+                     },
+
+                     "memory_copy_trace": {
+                          "type": "boolean",
+                          "description": "For Collecting Memory Copy Traces"
+                     },
+
+                     "scratch_memory_trace": {
+                          "type": "boolean",
+                          "description": "For Collecting Scratch Memory operations Traces"
+                     },
+
+                     "stats": {
+                          "type": "boolean",
+                          "description": "For Collecting statistics of enabled tracing types"
+                     },
+
+                     "hsa_trace": {
+                          "type": "boolean",
+                          "description":"For Collecting HSA Traces (core + amd + image + finalizer)"
+                     },
+
+                     "hsa_core_trace": {
+                            "type": "boolean",
+                            "description": "For Collecting HSA API Traces (core API)"
+                     },
+
+                     "hsa_amd_trace": {
+                              "type": "boolean",
+                              "description": "For Collecting HSA API Traces (AMD-extension API)"
+                     },
+
+                     "hsa_finalize_trace": {
+                              "type": "boolean",
+                              "description": "For Collecting HSA API Traces (Finalizer-extension API)"
+                     },
+
+                     "hsa_image_trace": {
+                              "type": "boolean",
+                              "description": "For Collecting HSA API Traces (Image-extension API)"
+                     },
+
+                     "sys_trace" : {
+                              "type": "boolean",
+                              "description": "For Collecting HIP, HSA, Marker (ROCTx), Memory copy, Scratch memory, and Kernel dispatch traces"
+                     },
+
+                     "mangled_kernels": {
+                      "type": "boolean",
+                      "description": "Do not demangle the kernel names"
+                     },
+
+                     "truncate_kernels": {
+                      "type": "boolean",
+                      "description": "Truncate the demangled kernel names"
+                     },
+
+                     "output_file":{
+                      "type": "string",
+                      "description": "For the output file name"
+                     },
+
+                     "output_directory":{
+                      "type": "string",
+                      "description": "For adding output path where the output files will be saved"
+                     },
+
+                     "output_format":{
+                      "type": "array",
+                      "description": "For adding output format (supported formats: csv, json, pftrace)"
+                     },
+
+                     "list_metrics" : {
+                      "type" : "boolean",
+                      "description": "List the metrics"
+                     },
+
+                     "log_level":{
+                      "type": "string",
+                      "description": "fatal, error, warning, info, trace"
+                     },
+
+                     "preload":{
+                      "type": "array",
+                      "description": "Libraries to prepend to LD_PRELOAD (usually for sanitizers)"
+                     }
+                 }
+              }
+           }
+        }
+}
@@ -27,8 +27,11 @@
 #include "lib/common/demangle.hpp"
 #include "lib/common/environment.hpp"
 #include "lib/common/filesystem.hpp"
+#include "lib/common/logging.hpp"
 #include "lib/common/utility.hpp"

+#include <rocprofiler-sdk/cxx/details/delimit.hpp>
+
 #include <fmt/core.h>

 #include <unistd.h>
@@ -132,14 +135,6 @@ handle_special_chars(std::string& str)
        str.at(pos) = ' ';
 }

-bool
-has_kernel_name_format(std::string const& str)
-{
-    return std::find_if(str.begin(), str.end(), [](unsigned char ch) {
-               return (isalnum(ch) != 0 || ch == '_');
-           }) != str.end();
-}
-
 bool
 has_counter_format(std::string const& str)
 {
@@ -149,34 +144,40 @@ has_counter_format(std::string const& str)
 }

 // validate kernel names
-auto
-parse_kernel_names(const std::string& line)
+std::unordered_set<uint32_t>
+get_kernel_filter_range(const std::string& kernel_filter)
 {
-    auto kernel_names_v = std::vector<std::string>{};
-    if(line.empty()) return kernel_names_v;
+    if(kernel_filter.empty()) return {};

-    auto kernel_names = std::set<std::string>{};
-    trim(line);
-    auto input_line  = std::stringstream{line};
-    auto kernel_name = std::string{};
-    while(getline(input_line, kernel_name, ','))
+    auto delim     = rocprofiler::sdk::parse::tokenize(kernel_filter, ",");
+    auto range_set = std::unordered_set<uint32_t>{};
+    for(const auto& itr : delim)
    {
-        if(has_kernel_name_format(kernel_name))
+        if(itr.find('-') != std::string::npos && itr.find('[') != std::string::npos &&
+           itr.find(']') != std::string::npos)
        {
-            ROCP_INFO << "kernel name " << kernel_names.size() << ": " << kernel_name;
-            kernel_names.emplace(kernel_name);
+            auto drange = rocprofiler::sdk::parse::tokenize(itr, "[-] ");
+            ROCP_FATAL_IF(drange.size() != 2)
+                << "bad range format for '" << itr << "'. Expected [A-B] where A and B are numbers";
+
+            uint32_t start_range = std::stoul(drange.front());
+            uint32_t end_range   = std::stoul(drange.back());
+            for(auto i = start_range; i <= end_range; i++)
+                range_set.emplace(i);
        }
        else
        {
-            ROCP_ERROR << "invalid kernel name: " << kernel_name;
+            auto dval = rocprofiler::sdk::parse::tokenize(itr, " ");
+            ROCP_ERROR_IF(dval.empty()) << "kernel range value '" << itr << "' produced no numbers";
+            for(const auto& ditr : dval)
+            {
+                ROCP_FATAL_IF(ditr.find_first_not_of("0123456789") != std::string::npos)
+                    << "expected integer for " << itr << ". Non-integer value detected";
+                range_set.emplace(std::stoul(ditr));
+            }
        }
    }
-
-    kernel_names_v.reserve(kernel_names.size());
-    for(const auto& itr : kernel_names)
-        kernel_names_v.emplace_back(itr);
-
-    return kernel_names_v;
+    return range_set;
 }

 std::set<std::string>
@@ -240,7 +241,8 @@ get_mpi_rank()
 }

 config::config()
-: kernel_names{parse_kernel_names(get_env("ROCPROF_KERNEL_NAMES", std::string{}))}
+: kernel_filter_range{get_kernel_filter_range(
+      get_env("ROCPROF_KERNEL_FILTER_RANGE", std::string{}))}
 , counters{parse_counters(get_env("ROCPROF_COUNTERS", std::string{}))}
 {
    auto output_format = get_env("ROCPROF_OUTPUT_FORMAT", "CSV");
@@ -281,6 +283,7 @@ config::config()
        LOG_IF(FATAL, supported_formats.count(itr) == 0)
            << "Unsupported output format type: " << itr;
    }
+    if(kernel_filter_include.empty()) kernel_filter_include = std::string(".*");
 }

 std::vector<output_key>
@@ -28,6 +28,7 @@

 #include <set>
 #include <string>
+#include <unordered_set>
 #include <vector>

 namespace rocprofiler
@@ -81,8 +82,13 @@ struct config
    std::string output_path   = get_env("ROCPROF_OUTPUT_PATH", fs::current_path().string());
    std::string output_file   = get_env("ROCPROF_OUTPUT_FILE_NAME", std::to_string(getpid()));
    std::string tmp_directory = get_env("ROCPROF_TMPDIR", output_path);
-    std::vector<std::string> kernel_names = {};
-    std::set<std::string>    counters     = {};
+
+    std::string kernel_filter_include =
+        get_env("ROCPROF_KERNEL_FILTER_INCLUDE_REGEX", std::string{".*"});
+    std::string kernel_filter_exclude =
+        get_env("ROCPROF_KERNEL_FILTER_EXCLUDE_REGEX", std::string{});
+    std::unordered_set<uint32_t> kernel_filter_range = {};
+    std::set<std::string>        counters            = {};
 };

 template <config_context ContextT = config_context::global>
@@ -177,40 +177,79 @@ as_pointer()
 }

 using code_object_data_map_t = std::unordered_map<uint64_t, rocprofiler_code_object_data_t>;
-using targeted_kernels_set_t = std::unordered_set<rocprofiler_kernel_id_t>;
+using targeted_kernels_map_t =
+    std::unordered_map<rocprofiler_kernel_id_t, std::unordered_set<uint32_t>>;
 using counter_dimension_info_map_t =
    std::unordered_map<uint64_t, std::vector<rocprofiler_record_dimension_info_t>>;
-using agent_info_map_t = std::unordered_map<rocprofiler_agent_id_t, rocprofiler_agent_t>;
+using agent_info_map_t   = std::unordered_map<rocprofiler_agent_id_t, rocprofiler_agent_t>;
+using kernel_iteration_t = std::unordered_map<rocprofiler_kernel_id_t, uint32_t>;

 auto  code_obj_data          = as_pointer<common::Synchronized<code_object_data_map_t, true>>();
 auto* kernel_data            = as_pointer<common::Synchronized<kernel_symbol_data_map_t, true>>();
 auto* marker_msg_data        = as_pointer<common::Synchronized<marker_message_map_t, true>>();
 auto  counter_dimension_data = common::Synchronized<counter_dimension_info_map_t, true>{};
-auto  target_kernels         = common::Synchronized<targeted_kernels_set_t>{};
+auto  target_kernels         = common::Synchronized<targeted_kernels_map_t>{};
 auto* buffered_name_info     = as_pointer(get_buffer_id_names());
 auto* callback_name_info     = as_pointer(get_callback_id_names());
 auto* agent_info             = as_pointer(agent_info_map_t{});
 auto* tool_functions         = as_pointer(tool_table{});
 auto* stats_timestamp        = as_pointer(timestamps_t{});
+auto  kernel_iteration       = common::Synchronized<kernel_iteration_t, true>{};

 bool
-add_kernel_target(uint64_t _kern_id)
+add_kernel_target(uint64_t _kern_id, const std::unordered_set<uint32_t>& range)
 {
    return target_kernels
-        .wlock([](targeted_kernels_set_t& _targets_v,
-                  uint64_t                _kern_id_v) { return _targets_v.emplace(_kern_id_v); },
-               _kern_id)
+        .wlock(
+            [](targeted_kernels_map_t&      _targets_v,
+               uint64_t                     _kern_id_v,
+               std::unordered_set<uint32_t> _range) {
+                return _targets_v.emplace(_kern_id_v, _range);
+            },
+            _kern_id,
+            range)
        .second;
 }

 bool
 is_targeted_kernel(uint64_t _kern_id)
 {
-    return target_kernels.rlock(
-        [](const targeted_kernels_set_t& _targets_v, uint64_t _kern_id_v) {
-            return (_targets_v.count(_kern_id_v) > 0);
+    bool                         is_target_kernel = false;
+    std::unordered_set<uint32_t> range            = {};
+    is_target_kernel                              = target_kernels.rlock(
+        [&range](const auto& _targets_v, uint64_t _kern_id_v) {
+            if(_targets_v.find(_kern_id_v) != _targets_v.end())
+            {
+                range = _targets_v.at(_kern_id_v);
+                return true;
+            }
+            return false;
        },
        _kern_id);
+
+    if(is_target_kernel)
+    {
+        kernel_iteration.rlock(
+            [&](const auto&                  _kernel_iter,
+                uint64_t                     _kernel_id,
+                std::unordered_set<uint32_t> _range) {
+                auto itr = _kernel_iter.at(_kernel_id);
+
+                // If the iteration range is not given then all iterations of the kernel is profiled
+                if(_range.empty())
+                    is_target_kernel = true;
+
+                else if(_range.find(itr) != _range.end())
+                {
+                    is_target_kernel = true;
+                }
+                else
+                    is_target_kernel = false;
+            },
+            _kern_id,
+            range);
+    }
+    return is_target_kernel;
 }

 auto&
@@ -496,34 +535,19 @@ code_object_tracing_callback(rocprofiler_callback_tracing_record_t record,
            {
                // if kernel name is provided by user then by default all kernels in the application
                // are targeted
-                if(tool::get_config().kernel_names.empty())
+                const auto& kernel_info           = itr.first->second;
+                auto        kernel_filter_include = tool::get_config().kernel_filter_include;
+                auto        kernel_filter_exclude = tool::get_config().kernel_filter_exclude;
+                auto        kernel_filter_range   = tool::get_config().kernel_filter_range;
+
+                std::regex include_regex(kernel_filter_include);
+                std::regex exclude_regex(kernel_filter_exclude);
+                if(std::regex_search(kernel_info.formatted_kernel_name, include_regex))
                {
-                    add_kernel_target(sym_data->kernel_id);
-                }
-                else
-                {
-                    const auto& kernel_info = itr.first->second;
-                    for(const auto& name : tool::get_config().kernel_names)
-                    {
-                        if(name == kernel_info.truncated_kernel_name)
-                        {
-                            add_kernel_target(itr.first->first);
-                            break;
-                        }
-                        else
-                        {
-                            auto dkernel_name = std::string_view{kernel_info.demangled_kernel_name};
-                            auto pos          = dkernel_name.find(name);
-                            // if the demangled kernel name contains name and the next character is
-                            // '(' then mark as found
-                            if(pos != std::string::npos && (pos + 1) < dkernel_name.size() &&
-                               dkernel_name.at(pos + 1) == '(')
-                            {
-                                add_kernel_target(itr.first->first);
-                                break;
-                            }
-                        }
-                    }
+                    if(kernel_filter_exclude.empty())
+                        add_kernel_target(sym_data->kernel_id, kernel_filter_range);
+                    else if(!std::regex_search(kernel_info.formatted_kernel_name, exclude_regex))
+                        add_kernel_target(sym_data->kernel_id, kernel_filter_range);
                }
            }
        }
@@ -882,6 +906,18 @@ dispatch_callback(rocprofiler_profile_counting_dispatch_data_t dispatch_data,
    auto kernel_id = dispatch_data.dispatch_info.kernel_id;
    auto agent_id  = dispatch_data.dispatch_info.agent_id;

+    kernel_iteration.wlock(
+        [](auto& _kernel_iter, rocprofiler_kernel_id_t _kernel_id) {
+            auto itr = _kernel_iter.find(_kernel_id);
+            if(itr == _kernel_iter.end())
+                _kernel_iter.emplace(_kernel_id, 1);
+            else
+            {
+                itr->second++;
+            }
+        },
+        kernel_id);
+
    if(!is_targeted_kernel(kernel_id))
    {
        return;
@@ -274,7 +274,7 @@ main()
    HIP_API_CALL(hipGetDeviceCount(&device_count));

    for(int i = 0; i < device_count; ++i)
-        run(4, i);
+        run(8, i);

    return 0;
 }
@@ -6,3 +6,4 @@ add_subdirectory(input1)
 add_subdirectory(input2)
 add_subdirectory(input3)
 add_subdirectory(list_metrics)
+add_subdirectory(kernel_filtering)
@@ -43,20 +43,21 @@ add_test(
    NAME rocprofv3-test-counter-collection-json-pmc1-validate
    COMMAND
        ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --agent-input
-        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pmc_1/out_json_agent_info.csv
+        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pass_1/out_json_agent_info.csv
        --counter-input
-        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pmc_1/out_json_counter_collection.csv
+        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pass_1/out_json_counter_collection.csv
        --agent-input
-        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pmc_2/out_json_agent_info.csv
+        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pass_2/out_json_agent_info.csv
        --counter-input
-        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pmc_2/out_json_counter_collection.csv
+        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pass_2/out_json_counter_collection.csv
    )

 set(JSON_VALIDATION_FILES
-    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pmc_1/out_json_agent_info.csv
-    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pmc_1/out_json_counter_collection.csv
-    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pmc_2/out_json_agent_info.csv
-    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pmc_2/out_json_counter_collection.csv)
+    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pass_1/out_json_agent_info.csv
+    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pass_1/out_json_counter_collection.csv
+    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pass_2/out_json_agent_info.csv
+    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pass_2/out_json_counter_collection.csv
+    )

 set_tests_properties(
    rocprofv3-test-counter-collection-json-pmc1-validate
@@ -75,20 +76,21 @@ add_test(
    NAME rocprofv3-test-counter-collection-yaml-pmc1-validate
    COMMAND
        ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --agent-input
-        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pmc_1/out_yaml_agent_info.csv
+        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pass_1/out_yaml_agent_info.csv
        --counter-input
-        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pmc_1/out_yaml_counter_collection.csv
+        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pass_1/out_yaml_counter_collection.csv
        --agent-input
-        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pmc_2/out_yaml_agent_info.csv
+        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pass_2/out_yaml_agent_info.csv
        --counter-input
-        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pmc_2/out_yaml_counter_collection.csv
+        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pass_2/out_yaml_counter_collection.csv
    )

 set(YAML_VALIDATION_FILES
-    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pmc_1/out_yaml_agent_info.csv
-    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pmc_1/out_yaml_counter_collection.csv
-    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pmc_2/out_yaml_agent_info.csv
-    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pmc_2/out_yaml_counter_collection.csv)
+    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pass_1/out_yaml_agent_info.csv
+    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pass_1/out_yaml_counter_collection.csv
+    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pass_2/out_yaml_agent_info.csv
+    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-cc/pass_2/out_yaml_counter_collection.csv
+    )

 set_tests_properties(
    rocprofv3-test-counter-collection-yaml-pmc1-validate
@@ -1,15 +1,15 @@
 {
-  "metrics": [
-    {
-      "pmc": [
-        "SQ_WAVES",
-        "GRBM_COUNT"
-      ]
-    },
-    {
-      "pmc": [
-        "GRBM_GUI_ACTIVE"
-      ]
-    }
-  ]
+    "jobs": [
+        {
+            "pmc": [
+                "SQ_WAVES",
+                "GRBM_COUNT"
+            ]
+        },
+        {
+            "pmc": [
+                "GRBM_GUI_ACTIVE"
+            ]
+        }
+    ]
 }
@@ -1,6 +1,6 @@
-metrics:
+jobs:
  - pmc:
-      - SQ_WAVES
-      - GRBM_COUNT
+    - SQ_WAVES
+    - GRBM_COUNT
  - pmc:
-      - GRBM_GUI_ACTIVE
+    - GRBM_GUI_ACTIVE
@@ -0,0 +1,114 @@
+#
+# rocprofv3 tool test
+#
+cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
+
+project(
+    rocprofiler-tests-counter-collection
+    LANGUAGES CXX
+    VERSION 0.0.0)
+
+find_package(rocprofiler-sdk REQUIRED)
+
+rocprofiler_configure_pytest_files(CONFIG pytest.ini COPY validate.py conftest.py
+                                                          input.txt input.json input.yml)
+
+# pmc1
+add_test(
+    NAME rocprofv3-test-cc-kernel-filtering-input-json-execute
+    COMMAND
+        $<TARGET_FILE:rocprofiler-sdk::rocprofv3> -i
+        ${CMAKE_CURRENT_BINARY_DIR}/input.json -d ${CMAKE_CURRENT_BINARY_DIR}/json_input
+        -- $<TARGET_FILE:vector-ops>)
+
+string(REPLACE "LD_PRELOAD=" "ROCPROF_PRELOAD=" PRELOAD_ENV
+               "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}")
+
+set(cc-env-kernel-filtering "${PRELOAD_ENV}")
+
+set_tests_properties(
+    rocprofv3-test-cc-kernel-filtering-input-json-execute
+    PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT
+               "${cc-env-kernel-filtering}" FAIL_REGULAR_EXPRESSION
+               "${ROCPROFILER_DEFAULT_FAIL_REGEX}")
+
+add_test(
+    NAME rocprofv3-test-cc-kernel-filtering-input-cmd-execute
+    COMMAND
+        $<TARGET_FILE:rocprofiler-sdk::rocprofv3> -i
+        ${CMAKE_CURRENT_BINARY_DIR}/input.txt --kernel-include-regex ".*kernel"
+        --kernel-exclude-regex "multiply" -T -d ${CMAKE_CURRENT_BINARY_DIR}/cmd_input -o
+        out --output-format csv -- $<TARGET_FILE:vector-ops>)
+
+set_tests_properties(
+    rocprofv3-test-cc-kernel-filtering-input-cmd-execute
+    PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT
+               "${cc-env-kernel-filtering}" FAIL_REGULAR_EXPRESSION
+               "${ROCPROFILER_DEFAULT_FAIL_REGEX}")
+
+# pmc1
+add_test(
+    NAME rocprofv3-test-cc-kernel-filtering-input-yaml-execute
+    COMMAND
+        $<TARGET_FILE:rocprofiler-sdk::rocprofv3> -i
+        ${CMAKE_CURRENT_BINARY_DIR}/input.yml -T -d
+        ${CMAKE_CURRENT_BINARY_DIR}/yaml_input -o out --output-format csv json --
+        $<TARGET_FILE:vector-ops>)
+
+set_tests_properties(
+    rocprofv3-test-cc-kernel-filtering-input-yaml-execute
+    PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT
+               "${cc-env-kernel-filtering}" FAIL_REGULAR_EXPRESSION
+               "${ROCPROFILER_DEFAULT_FAIL_REGEX}")
+add_test(
+    NAME rocprofv3-test-cc-kernel-filtering-input-json-validate
+    COMMAND
+        ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py -k _pass
+        --input-csv-pass1
+        ${CMAKE_CURRENT_BINARY_DIR}/json_input/pass_1/out_counter_collection.csv
+        --input-json-pass1 ${CMAKE_CURRENT_BINARY_DIR}/json_input/pass_1/out_results.json
+        --input-csv-pass2
+        ${CMAKE_CURRENT_BINARY_DIR}/json_input/pass_2/out_counter_collection.csv
+        --input-json-pass2 ${CMAKE_CURRENT_BINARY_DIR}/json_input/pass_2/out_results.json
+        --input-csv-pass3
+        ${CMAKE_CURRENT_BINARY_DIR}/json_input/pass_3/out_counter_collection.csv
+        --input-json-pass3 ${CMAKE_CURRENT_BINARY_DIR}/json_input/pass_3/out_results.json)
+
+add_test(
+    NAME rocprofv3-test-cc-kernel-filtering-input-yaml-validate
+    COMMAND
+        ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py -k _pass
+        --input-csv-pass1
+        ${CMAKE_CURRENT_BINARY_DIR}/yaml_input/pass_1/out_counter_collection.csv
+        --input-json-pass1 ${CMAKE_CURRENT_BINARY_DIR}/yaml_input/pass_1/out_results.json
+        --input-csv-pass2
+        ${CMAKE_CURRENT_BINARY_DIR}/yaml_input/pass_2/out_counter_collection.csv
+        --input-json-pass2 ${CMAKE_CURRENT_BINARY_DIR}/yaml_input/pass_2/out_results.json
+        --input-csv-pass3
+        ${CMAKE_CURRENT_BINARY_DIR}/yaml_input/pass_3/out_counter_collection.csv
+        --input-json-pass3 ${CMAKE_CURRENT_BINARY_DIR}/yaml_input/pass_3/out_results.json)
+
+add_test(
+    NAME rocprofv3-test-cc-kernel-filtering-input-cmd-validate
+    COMMAND
+        ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py -k
+        test_validate_counter_collection_csv_pmc1 --input-csv-pmc1
+        ${CMAKE_CURRENT_BINARY_DIR}/cmd_input/pmc_1/out_counter_collection.csv)
+
+set_tests_properties(
+    rocprofv3-test-cc-kernel-filtering-input-json-validate
+    PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS
+               "rocprofv3-test-cc-kernel-filtering-input-json-execute"
+               FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}")
+
+set_tests_properties(
+    rocprofv3-test-cc-kernel-filtering-input-cmd-validate
+    PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS
+               "rocprofv3-test-cc-kernel-filtering-input-cmd-execute"
+               FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}")
+
+set_tests_properties(
+    rocprofv3-test-cc-kernel-filtering-input-yaml-validate
+    PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS
+               "rocprofv3-test-cc-kernel-filtering-input-yaml-execute"
+               FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}")
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+
+import json
+import pytest
+import pandas as pd
+
+from rocprofiler_sdk.pytest_utils.dotdict import dotdict
+from rocprofiler_sdk.pytest_utils import collapse_dict_list
+
+
+def pytest_addoption(parser):
+
+    parser.addoption(
+        "--input-json-pass1",
+        action="store",
+        help="Path to JSON file.",
+    )
+
+    parser.addoption(
+        "--input-csv-pass1",
+        action="store",
+        help="Path to CSV file.",
+    )
+
+    parser.addoption(
+        "--input-json-pass2",
+        action="store",
+        help="Path to JSON file.",
+    )
+
+    parser.addoption(
+        "--input-csv-pass2",
+        action="store",
+        help="Path to CSV file.",
+    )
+
+    parser.addoption(
+        "--input-json-pass3",
+        action="store",
+        help="Path to JSON file.",
+    )
+
+    parser.addoption(
+        "--input-csv-pass3",
+        action="store",
+        help="Path to CSV file.",
+    )
+
+    parser.addoption(
+        "--input-csv-pmc1",
+        action="store",
+        help="Path to CSV file.",
+    )
+
+
+@pytest.fixture
+def input_csv_pass1(request):
+    filename = request.config.getoption("--input-csv-pass1")
+    with open(filename, "r") as inp:
+        return pd.read_csv(inp)
+
+
+@pytest.fixture
+def input_csv_pass2(request):
+    filename = request.config.getoption("--input-csv-pass2")
+    with open(filename, "r") as inp:
+        return pd.read_csv(inp)
+
+
+@pytest.fixture
+def input_csv_pass3(request):
+    filename = request.config.getoption("--input-csv-pass3")
+    with open(filename, "r") as inp:
+        return pd.read_csv(inp)
+
+
+@pytest.fixture
+def input_csv_pmc1(request):
+    filename = request.config.getoption("--input-csv-pmc1")
+    with open(filename, "r") as inp:
+        return pd.read_csv(inp)
+
+
+@pytest.fixture
+def input_json_pass1(request):
+    filename = request.config.getoption("--input-json-pass1")
+    with open(filename, "r") as inp:
+        return dotdict(collapse_dict_list(json.load(inp)))
+
+
+@pytest.fixture
+def input_json_pass2(request):
+    filename = request.config.getoption("--input-json-pass2")
+    with open(filename, "r") as inp:
+        return dotdict(collapse_dict_list(json.load(inp)))
+
+
+@pytest.fixture
+def input_json_pass3(request):
+    filename = request.config.getoption("--input-json-pass3")
+    with open(filename, "r") as inp:
+        return dotdict(collapse_dict_list(json.load(inp)))
@@ -0,0 +1,44 @@
+{
+    "jobs": [
+        {
+            "pmc": [
+                "SQ_WAVES"
+            ],
+            "kernel_include_regex": ".*_kernel",
+            "kernel_exclude_regex": "multiply",
+            "kernel_iteration_range": "[1-2]",
+            "output_file": "out",
+            "output_format": [
+                "csv",
+                "json"
+            ],
+            "truncate_kernels": true
+        },
+        {
+            "pmc": [
+                "GRBM_COUNT"
+            ],
+            "kernel_include_regex": ".*_kernel",
+            "kernel_iteration_range": "[2-3]",
+            "output_file": "out",
+            "output_format": [
+                "csv",
+                "json"
+            ],
+            "truncate_kernels": true
+        },
+        {
+            "pmc": [
+                "GRBM_GUI_ACTIVE"
+            ],
+            "kernel_include_regex": ".*_kernel",
+            "kernel_exclude_regex": "",
+            "output_file": "out",
+            "output_format": [
+                "csv",
+                "json"
+            ],
+            "truncate_kernels": true
+        }
+    ]
+}
@@ -0,0 +1 @@
+pmc: SQ_WAVES
@@ -0,0 +1,15 @@
+jobs:
+  - pmc: [SQ_WAVES]
+    kernel_include_regex: ".*_kernel"
+    kernel_exclude_regex: "multiply"
+    kernel_iteration_range:
+    - "[1-2]"
+    - "[3-4]"
+    - "[5-6]"
+  - pmc:
+    - GRBM_COUNT
+    kernel_include_regex: ".*_kernel"
+    kernel_iteration_range: "[2-3]"
+  - pmc: [GRBM_GUI_ACTIVE]
+    kernel_include_regex: ".*_kernel"
+    kernel_exclude_regex: ""
@@ -0,0 +1,5 @@
+
+[pytest]
+addopts = --durations=20 -rA -s -vv
+testpaths = validate.py
+pythonpath = @ROCPROFILER_SDK_TESTS_BINARY_DIR@/pytest-packages
@@ -0,0 +1,138 @@
+#!/usr/bin/env python3
+
+import sys
+import pytest
+import numpy as np
+import pandas as pd
+import re
+
+
+def unique(lst):
+    """Return the distinct elements of ``lst`` as a list (order not guaranteed)."""
+    distinct = set(lst)
+    return list(distinct)
+
+
+def validate_csv(df, kernel_list, counter_name):
+    """Check a counter-collection CSV table against the expected kernels.
+
+    Args:
+        df: DataFrame with the columns ``Agent_Id``, ``Queue_Id``,
+            ``Process_Id``, ``Kernel_Name``, ``Counter_Name`` and
+            ``Counter_Value``.
+        kernel_list: sorted list of the kernel names expected in ``df``;
+            kernels matching ``__amd_rocclr_`` are ignored.
+        counter_name: pattern every ``Counter_Name`` entry must contain.
+
+    Raises:
+        AssertionError: if any of the checks fails.
+    """
+    runtime_kernel = re.compile(r"__amd_rocclr_.*")
+
+    assert not df.empty
+    for id_column in ("Agent_Id", "Queue_Id", "Process_Id"):
+        assert (df[id_column].astype(int).values > 0).all()
+    assert len(df["Kernel_Name"]) > 0
+
+    # the set of (non-runtime) kernels in the table must be exactly the expected one
+    observed_kernels = []
+    for name in sorted(df["Kernel_Name"].unique().tolist()):
+        if runtime_kernel.search(name) is None:
+            observed_kernels.append(name)
+
+    assert kernel_list == observed_kernels
+
+    # every expected kernel must appear the same number of times
+    rows_per_kernel = {name: 0 for name in kernel_list}
+    assert len(rows_per_kernel) == len(kernel_list)
+    for name in df["Kernel_Name"]:
+        if runtime_kernel.search(name) is None:
+            rows_per_kernel[name] += 1
+    row_counts = list(rows_per_kernel.values())
+    assert min(row_counts) == max(row_counts) and len(set(row_counts)) == 1
+
+    assert len(df["Counter_Value"]) > 0
+    assert df["Counter_Name"].str.contains(counter_name).all()
+    assert (df["Counter_Value"].astype(int).values > 0).all()
+
+
+def validate_json(json_data, counter_name, check_dispatch):
+    """Check the counter-collection records of a JSON output document.
+
+    Args:
+        json_data: parsed JSON document; its ``"rocprofiler-sdk-tool"`` entry
+            must provide ``agents``, ``counters``, ``kernel_symbols`` and
+            ``callback_records.counter_collection``.
+        counter_name: name every collected counter must have. Records of
+            kernels matching ``__amd_rocclr_`` are not checked.
+        check_dispatch: when true, additionally require the dispatch ids to be
+            exactly ``1..N`` with no duplicates, where ``N`` is the number of
+            counter-collection entries.
+
+    Raises:
+        AssertionError: if any of the checks fails.
+    """
+    data = json_data["rocprofiler-sdk-tool"]
+    counter_collection_data = data["callback_records"]["counter_collection"]
+    dispatch_ids = []
+    # at present, AQLProfile has bugs when reporting the counters for below architectures
+    skip_gfx = ("gfx1101", "gfx1102")
+
+    def get_kernel_name(kernel_id):
+        return data["kernel_symbols"][kernel_id]["formatted_kernel_name"]
+
+    def get_agent(agent_id):
+        for agent in data["agents"]:
+            if agent["id"]["handle"] == agent_id["handle"]:
+                return agent
+        return None
+
+    def get_counter(counter_id):
+        for counter in data["counters"]:
+            if counter["id"]["handle"] == counter_id["handle"]:
+                return counter
+        return None
+
+    # "entry" (not "counter") so the per-record counter lookup below does not
+    # rebind the loop variable
+    for entry in counter_collection_data:
+        dispatch_data = entry["dispatch_data"]["dispatch_info"]
+
+        assert dispatch_data["dispatch_id"] > 0
+        assert dispatch_data["agent_id"]["handle"] > 0
+        assert dispatch_data["queue_id"]["handle"] > 0
+
+        agent = get_agent(dispatch_data["agent_id"])
+        kernel_name = get_kernel_name(dispatch_data["kernel_id"])
+
+        assert agent is not None
+        assert len(kernel_name) > 0
+
+        dispatch_ids.append(dispatch_data["dispatch_id"])
+        if re.search(r"__amd_rocclr_.*", kernel_name):
+            continue
+
+        for record in entry["records"]:
+            counter = get_counter(record["counter_id"])
+            assert counter is not None, f"record:\n\t{record}"
+            assert (
+                counter["name"] == counter_name
+            ), f"record:\n\t{record}\ncounter:\n\t{counter}"
+            if agent["name"] not in skip_gfx:
+                assert (
+                    record["value"] > 0
+                ), f"record: {record}\ncounter: {counter}\nagent: {agent}"
+
+    if check_dispatch:
+        # de-duplicate first, then sort: a set has no defined iteration order,
+        # so sorting before building the set would not guarantee ordering
+        di_uniq = sorted(set(dispatch_ids))
+        # make sure the dispatch ids are unique and ordered; the expected
+        # length is the raw count so that duplicates make the lists differ
+        di_expect = list(range(1, len(dispatch_ids) + 1))
+        assert di_expect == di_uniq
+
+
+def test_validate_counter_collection_csv_pass1(input_csv_pass1: pd.DataFrame):
+    """Pass 1 CSV: expect SQ_WAVES rows for the addition, divide and subtract kernels."""
+    # written out in sorted order, which is what validate_csv is given
+    expected_kernels = ["addition_kernel", "divide_kernel", "subtract_kernel"]
+    validate_csv(input_csv_pass1, expected_kernels, "SQ_WAVES")
+
+
+def test_validate_counter_collection_csv_pmc1(input_csv_pmc1: pd.DataFrame):
+    """PMC1 CSV: expect SQ_WAVES rows for the addition, divide and subtract kernels."""
+    # written out in sorted order, which is what validate_csv is given
+    expected_kernels = ["addition_kernel", "divide_kernel", "subtract_kernel"]
+    validate_csv(input_csv_pmc1, expected_kernels, "SQ_WAVES")
+
+
+def test_validate_counter_collection_csv_pass2(input_csv_pass2: pd.DataFrame):
+    """Pass 2 CSV: expect GRBM_COUNT rows for all four arithmetic kernels."""
+    # written out in sorted order, which is what validate_csv is given
+    expected_kernels = [
+        "addition_kernel",
+        "divide_kernel",
+        "multiply_kernel",
+        "subtract_kernel",
+    ]
+    validate_csv(input_csv_pass2, expected_kernels, "GRBM_COUNT")
+
+
+def test_validate_counter_collection_csv_pass3(input_csv_pass3: pd.DataFrame):
+    """Pass 3 CSV: expect GRBM_GUI_ACTIVE rows for all four arithmetic kernels."""
+    # written out in sorted order, which is what validate_csv is given
+    expected_kernels = [
+        "addition_kernel",
+        "divide_kernel",
+        "multiply_kernel",
+        "subtract_kernel",
+    ]
+    validate_csv(input_csv_pass3, expected_kernels, "GRBM_GUI_ACTIVE")
+
+
+def test_validate_counter_collection_json_pass1(input_json_pass1):
+    """Pass 1 JSON: every counter must be SQ_WAVES; dispatch ids are not checked."""
+    validate_json(input_json_pass1, counter_name="SQ_WAVES", check_dispatch=False)
+
+
+def test_validate_counter_collection_json_pass2(input_json_pass2):
+    """Pass 2 JSON: every counter must be GRBM_COUNT; dispatch ids are not checked."""
+    validate_json(input_json_pass2, counter_name="GRBM_COUNT", check_dispatch=False)
+
+
+def test_validate_counter_collection_json_pass3(input_json_pass3):
+    """Pass 3 JSON: every counter must be GRBM_GUI_ACTIVE; dispatch ids are checked."""
+    validate_json(input_json_pass3, counter_name="GRBM_GUI_ACTIVE", check_dispatch=True)
+
+
+if __name__ == "__main__":
+    # run this file under pytest, stopping at the first failure (-x) and
+    # forwarding any extra command-line arguments
+    pytest_args = ["-x", __file__, *sys.argv[1:]]
+    sys.exit(pytest.main(pytest_args))
@@ -10,7 +10,8 @@ project(

 find_package(rocprofiler-sdk REQUIRED)

-rocprofiler_configure_pytest_files(CONFIG pytest.ini COPY validate.py conftest.py)
+rocprofiler_configure_pytest_files(CONFIG pytest.ini COPY validate.py conftest.py
+                                                          input.json)

 # basic-metrics
 add_test(NAME rocprofv3-test-list-metrics-execute
@@ -19,7 +20,8 @@ add_test(NAME rocprofv3-test-list-metrics-execute

 # list-metrics-stdout
 add_test(NAME rocprofv3-test-list-metrics-std-out-execute
-         COMMAND $<TARGET_FILE:rocprofiler-sdk::rocprofv3> --list-metrics)
+         COMMAND $<TARGET_FILE:rocprofiler-sdk::rocprofv3> -i
+                 ${CMAKE_CURRENT_BINARY_DIR}/input.json)

 string(REPLACE "LD_PRELOAD=" "ROCPROF_PRELOAD=" PRELOAD_ENV
               "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}")
@@ -0,0 +1,7 @@
+{
+    "jobs": [
+        {
+            "list_metrics": true
+        }
+    ]
+}
@@ -16,6 +16,14 @@ else()
    set(LOG_LEVEL "info")
 endif()

+# derive the environment for the execute tests: the memcheck preload setting is
+# passed under the name ROCPROF_PRELOAD instead of LD_PRELOAD
+string(REPLACE "LD_PRELOAD=" "ROCPROF_PRELOAD=" PRELOAD_ENV
+               "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}")
+
+set(tracing-env "${PRELOAD_ENV}")
+
+# pytest configuration, validation scripts and the JSON input files used by the
+# *-input-json-* tests (presumably placed in CMAKE_CURRENT_BINARY_DIR, which is
+# where the tests below look for them -- confirm against the helper)
+rocprofiler_configure_pytest_files(
+    CONFIG pytest.ini COPY validate.py conftest.py input_systrace.json input_trace.json)
+
 add_test(
    NAME rocprofv3-test-trace-execute
    COMMAND
@@ -24,11 +32,6 @@ add_test(
        out --output-format pftrace csv json --log-level ${LOG_LEVEL} --
        $<TARGET_FILE:simple-transpose>)

-string(REPLACE "LD_PRELOAD=" "ROCPROF_PRELOAD=" PRELOAD_ENV
-               "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}")
-
-set(tracing-env "${PRELOAD_ENV}")
-
 set_tests_properties(
    rocprofv3-test-trace-execute
    PROPERTIES
@@ -42,7 +45,26 @@ set_tests_properties(
        "HSA_API|HIP_API|HIP_COMPILER_API|MARKER_CORE_API|MARKER_CONTROL_API|MARKER_NAME_API|KERNEL_DISPATCH|CODE_OBJECT"
    )

-rocprofiler_configure_pytest_files(CONFIG pytest.ini COPY validate.py conftest.py)
+# trace test with the tracing options supplied through the JSON input file (-i)
+# rather than as individual command-line flags
+add_test(
+    NAME rocprofv3-test-trace-input-json-execute
+    COMMAND
+        $<TARGET_FILE:rocprofiler-sdk::rocprofv3> -i
+        ${CMAKE_CURRENT_BINARY_DIR}/input_trace.json -d
+        ${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace-input-json --log-level ${LOG_LEVEL} --
+        $<TARGET_FILE:simple-transpose>)
+
+# the test fails if any of the names in FAIL_REGULAR_EXPRESSION shows up in its output
+set_tests_properties(
+    rocprofv3-test-trace-input-json-execute
+    PROPERTIES
+        TIMEOUT
+        45
+        LABELS
+        "integration-tests"
+        ENVIRONMENT
+        "${tracing-env}"
+        FAIL_REGULAR_EXPRESSION
+        "HSA_API|HIP_API|HIP_COMPILER_API|MARKER_CORE_API|MARKER_CONTROL_API|MARKER_NAME_API|KERNEL_DISPATCH|CODE_OBJECT"
+    )

 add_test(
    NAME rocprofv3-test-trace-validate
@@ -61,6 +83,24 @@ add_test(
        --pftrace-input
        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-trace/out_results.pftrace)

+# validate the files in simple-transpose-trace-input-json (presumably the
+# expansion of %argt%-trace-input-json used by the execute test -- confirm)
+add_test(
+    NAME rocprofv3-test-trace-input-json-validate
+    COMMAND
+        ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --hsa-input
+        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-trace-input-json/out_hsa_api_trace.csv
+        --kernel-input
+        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-trace-input-json/out_kernel_trace.csv
+        --memory-copy-input
+        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-trace-input-json/out_memory_copy_trace.csv
+        --marker-input
+        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-trace-input-json/out_marker_api_trace.csv
+        --agent-input
+        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-trace-input-json/out_agent_info.csv
+        --json-input
+        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-trace-input-json/out_results.json
+        --pftrace-input
+        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-trace-input-json/out_results.pftrace)
+
+
 set(VALIDATION_FILES
    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-trace/out_results.pftrace
    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-trace/out_results.json
@@ -68,7 +108,14 @@ set(VALIDATION_FILES
    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-trace/out_hsa_api_trace.csv
    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-trace/out_kernel_trace.csv
    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-trace/out_marker_api_trace.csv
-    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-trace/out_agent_info.csv)
+    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-trace/out_agent_info.csv
+    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-trace-input-json/out_results.pftrace
+    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-trace-input-json/out_results.json
+    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-trace-input-json/out_memory_copy_trace.csv
+    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-trace-input-json/out_hsa_api_trace.csv
+    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-trace-input-json/out_kernel_trace.csv
+    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-trace-input-json/out_marker_api_trace.csv
+    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-trace-input-json/out_agent_info.csv)

 set_tests_properties(
    rocprofv3-test-trace-validate
@@ -83,6 +130,19 @@ set_tests_properties(
               ATTACHED_FILES_ON_FAIL
               "${VALIDATION_FILES}")

+# ordered after the matching execute test (DEPENDS); any "AssertionError" in the
+# output fails the test and the files in VALIDATION_FILES are attached on failure
+set_tests_properties(
+    rocprofv3-test-trace-input-json-validate
+    PROPERTIES TIMEOUT
+               45
+               LABELS
+               "integration-tests"
+               DEPENDS
+               "rocprofv3-test-trace-input-json-execute"
+               FAIL_REGULAR_EXPRESSION
+               "AssertionError"
+               ATTACHED_FILES_ON_FAIL
+               "${VALIDATION_FILES}")
+
+
 # sys-trace test: tests --sys-trace command with mangled kernel names and validates
 # generated files

@@ -106,6 +166,26 @@ set_tests_properties(
        "HSA_API|HIP_API|HIP_COMPILER_API|MARKER_CORE_API|MARKER_CONTROL_API|MARKER_NAME_API|KERNEL_DISPATCH|CODE_OBJECT"
    )

+# sys-trace test with the options supplied through the JSON input file (-i)
+add_test(
+    NAME rocprofv3-test-systrace-input-json-execute
+    COMMAND
+        $<TARGET_FILE:rocprofiler-sdk::rocprofv3> -i
+        ${CMAKE_CURRENT_BINARY_DIR}/input_systrace.json -d
+        ${CMAKE_CURRENT_BINARY_DIR}/%argt%-systrace-input-json --
+        $<TARGET_FILE:simple-transpose>)
+
+# the test fails if any of the names in FAIL_REGULAR_EXPRESSION shows up in its output
+set_tests_properties(
+    rocprofv3-test-systrace-input-json-execute
+    PROPERTIES
+        TIMEOUT
+        45
+        LABELS
+        "integration-tests"
+        ENVIRONMENT
+        "${tracing-env}"
+        FAIL_REGULAR_EXPRESSION
+        "HSA_API|HIP_API|HIP_COMPILER_API|MARKER_CORE_API|MARKER_CONTROL_API|MARKER_NAME_API|KERNEL_DISPATCH|CODE_OBJECT"
+    )
 add_test(
    NAME rocprofv3-test-systrace-validate
    COMMAND
@@ -125,6 +205,26 @@ add_test(
        --pftrace-input
        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-systrace/out_results.pftrace)

+# validate the files in simple-transpose-systrace-input-json; the -k expression
+# deselects the pytest tests whose names match "test_hsa_api_trace"
+add_test(
+    NAME rocprofv3-test-systrace-input-json-validate
+    COMMAND
+        ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py -k
+        "not test_hsa_api_trace" --hsa-input
+        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-systrace-input-json/out_hsa_api_trace.csv
+        --kernel-input
+        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-systrace-input-json/out_kernel_trace.csv
+        --memory-copy-input
+        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-systrace-input-json/out_memory_copy_trace.csv
+        --marker-input
+        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-systrace-input-json/out_marker_api_trace.csv
+        --agent-input
+        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-systrace-input-json/out_agent_info.csv
+        --json-input
+        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-systrace-input-json/out_results.json
+        --pftrace-input
+        ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-systrace-input-json/out_results.pftrace
+    )
+
+
 set(SYS_VALIDATION_FILES
    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-systrace/out_results.pftrace
    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-systrace/out_results.json
@@ -132,7 +232,14 @@ set(SYS_VALIDATION_FILES
    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-systrace/out_hsa_api_trace.csv
    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-systrace/out_kernel_trace.csv
    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-systrace/out_marker_api_trace.csv
-    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-systrace/out_agent_info.csv)
+    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-systrace/out_agent_info.csv
+    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-systrace-input-json/out_results.pftrace
+    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-systrace-input-json/out_results.json
+    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-systrace-input-json/out_memory_copy_trace.csv
+    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-systrace-input-json/out_hsa_api_trace.csv
+    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-systrace-input-json/out_kernel_trace.csv
+    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-systrace-input-json/out_marker_api_trace.csv
+    ${CMAKE_CURRENT_BINARY_DIR}/simple-transpose-systrace-input-json/out_agent_info.csv)

 set_tests_properties(
    rocprofv3-test-systrace-validate
@@ -146,3 +253,16 @@ set_tests_properties(
               "AssertionError"
               ATTACHED_FILES_ON_FAIL
               "${SYS_VALIDATION_FILES}")
+
+# ordered after the matching execute test (DEPENDS); any "AssertionError" in the
+# output fails the test and the files in SYS_VALIDATION_FILES are attached on failure
+set_tests_properties(
+    rocprofv3-test-systrace-input-json-validate
+    PROPERTIES TIMEOUT
+               45
+               LABELS
+               "integration-tests"
+               DEPENDS
+               "rocprofv3-test-systrace-input-json-execute"
+               FAIL_REGULAR_EXPRESSION
+               "AssertionError"
+               ATTACHED_FILES_ON_FAIL
+               "${SYS_VALIDATION_FILES}")
@@ -0,0 +1,13 @@
+{
+    "jobs": [
+        {
+            "sys_trace": true,
+            "output_file": "out",
+            "output_format": [
+                "csv",
+                "json",
+                "pftrace"
+            ]
+        }
+    ]
+}
@@ -0,0 +1,16 @@
+{
+    "jobs": [
+        {
+            "hsa_trace": true,
+            "kernel_trace": true,
+            "memory_copy_trace": true,
+            "marker_trace": true,
+            "output_file": "out",
+            "output_format": [
+                "csv",
+                "json",
+                "pftrace"
+            ]
+        }
+    ]
+}