Dosyalar
rocm-systems/projects/rocprofiler-sdk/source/lib/python/rocpd/importer.py
T
systems-assistant[bot] 061948a5ec [rocpd] Adding merge and package submodules for rocpd (#164)
* adding ROCpd database merge

* adding ROCpd database merge concatenating all tables

* update merge script

  - copy all tables from files

* fix merge format

* Add package submodule, initial POC.  Need to refine

* Minor fixes and clean up duplicated code in package.py

* Revamp metadata layout, add wildcard and .rpdb parsing

* Add auto merge & package when > 5 DBs, add examples, don't use auto_merge when using sub-commands merge & package

* - Extend package/yaml inputs to all rocpd modules
- Improve handling more corner cases for bad input files when parsing input parameters (bad yaml files, bad .rpdb folder, folders as input)
- Changed to use UUID in merged filename instead of the time, in auto-merge algorithm

* Minor text fixes for consistancy between modules

* Add more wildcard support and add package, merge tests

* Make changes based on review suggestions

* Move parsing packages into importer.py, simplified adding required params to a function

* fix package test by flattening input list before processing

* Integrate merge.py changes from Jonathan to add name-collision checks, recreating indexes, foreign key check (disabled for now, due to processing time)

* Rework rocpd.<submodule>.{add_args,process_args}

- add_args function returns a functor which accepts input and args
- time_window functor returned from add_args automatically applies time windowing of input

* change merge&package limit to 1, merge should create data views

* Move files by default instead of making copies

- copying can be enabled by passing "copy=True" or --copy cmdline argument

* refactor package to make the logic cleaner, set merge limit back to 5

* Allow automerge-limit param to override limit, change default back to 1.  Tests updated to use query, much quicker

* Update --help instructions for package

---------

Co-authored-by: acanadas <acanadas@amd.com>
Co-authored-by: a-canadasruiz <Araceli.CanadasRuiz@amd.com>
Co-authored-by: Young Hui <young.hui@amd.com>
Co-authored-by: Jonathan R. Madsen <jonathanrmadsen@gmail.com>
2025-11-12 17:07:12 -05:00

206 satır
7.3 KiB
Python

###############################################################################
# MIT License
#
# Copyright (c) 2023 Advanced Micro Devices, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
###############################################################################
#
# Utility classes to simplify generating rpd files
#
#
import sys
import os
import sqlite3
from .schema import RocpdSchema
from . import libpyrocpd
__all__ = ["RocpdImportData", "execute_statement"]
def internal_init(_input, _output, skip_auto_merge, automerge_limit):
from . import package
_input = package.flatten_rocpd_yaml_input_file(
_input, skip_auto_merge=skip_auto_merge, automerge_limit=automerge_limit
)
assert not os.path.isdir(_output), "Output database name must not be a directory"
assert _check_for_valid_dbs(
_input
), "RocpdImportData error, invalid SQLite3 database provided"
_connection = libpyrocpd.connect(_output)
_connection.execute("PRAGMA foreign_keys = ON")
_table_info = _create_temp_views(_connection, _input)
_create_meta_views(_connection)
return (_connection, _input, _table_info)
class RocpdImportData(libpyrocpd.RocpdImportData):
def __init__(
self, input, skip_auto_merge=False, automerge_limit=None, dbname=":memory:"
):
from . import package
if automerge_limit is None:
automerge_limit = package.IDEAL_NUMBER_OF_DATABASE_FILES
if isinstance(input, RocpdImportData):
super(RocpdImportData, self).__init__(input)
self.table_info = input.table_info
else:
if isinstance(input, sqlite3.Connection):
raise ValueError(
"RocpdImportData does not accept existing sqlite3 connections"
)
elif isinstance(input, str) or (
isinstance(input, list) and len(input) > 0 and isinstance(input[0], str)
):
_connection, _filenames, _table_info = internal_init(
input, dbname, skip_auto_merge, automerge_limit
)
self.table_info = _table_info
else:
raise ValueError(
f"input is unsupported type. Expected sqlite3.Connection, string, or (non-empty) list of strings. type={type(input).__name__}"
)
super(RocpdImportData, self).__init__(_connection, _filenames)
def __getattr__(self, name):
# any attribute or method not found in RocpdImportData will be looked up on self.connection
return getattr(self.connection, name)
def __enter__(self):
# support "with RocpdImportData(...) as db:":
return self
def __exit__(self, exc_type, exc, tb):
return self.connection.__exit__(exc_type, exc, tb)
def _is_sqlite_db(file_path):
with open(file_path, "rb") as f:
header = f.read(16)
return header == b"SQLite format 3\x00"
def _check_for_valid_dbs(input_files) -> bool:
# check the list of .db files to confirm they are SQLite3 DBs
for file in input_files:
sqlite_db = _is_sqlite_db(file)
if not sqlite_db:
print(f"Error: {file} is not an SQLite3 database. File not supported.")
return False
return True
def execute_statement(conn, statement, is_script=False):
if isinstance(conn, RocpdImportData):
_conn = conn.connection
else:
_conn = conn
assert isinstance(_conn, sqlite3.Connection)
try:
if is_script:
return _conn.executescript(statement)
return _conn.execute(f"{statement}")
except sqlite3.Error as err:
sys.stderr.write(f"SQLite3 error: {err}\nStatement:\n\t{statement}\n")
sys.stderr.flush()
raise err
def _create_temp_views(connection, input):
"""Create temporary unified views from multiple database files."""
assert isinstance(connection, sqlite3.Connection)
assert isinstance(input, list)
# Attach each database and extract the uuid from each database
dbinfo = []
uuids = []
for i, inp in enumerate(input):
execute_statement(connection, f"ATTACH DATABASE '{inp}' AS db{i}")
_uuids = [
itr[0]
for itr in execute_statement(
connection,
f"SELECT value FROM db{i}.rocpd_metadata WHERE tag='uuid'",
).fetchall()
]
dbinfo += [f"db{i}"]
uuids += [itr for itr in _uuids if itr not in uuids]
# unique set of universal process identifiers
uuids = list(set(uuids))
all_tables = {}
for ditr in dbinfo:
# get the tables for the given attached database
tables = [
itr[0]
for itr in execute_statement(
connection,
f"SELECT name FROM {ditr}.sqlite_master WHERE type='table' AND name LIKE 'rocpd_%'",
).fetchall()
]
# loop over the tables
for itr in tables:
# loop over the UUIDs
for uitr in uuids:
# skip the tables without the UUID suffix
if f"{uitr}" not in itr:
continue
# strip the UUID suffix to create a base table name, e.g. 'rocpd_string_03daf93' -> 'rocpd_string'
base = itr.replace(f"{uitr}", "")
# create a list of attached databases which have the base table name
if base not in all_tables.keys():
all_tables[base] = []
# create the SELECT statement from this database
select = f"SELECT * FROM {ditr}.{base}"
# make sure that we don't duplicate SELECT statements of same table from same attached database
if select in all_tables[base]:
continue
# add this to list
all_tables[base] += [select]
# create the temporary view that is a union of all the attached databases
for key, itr in all_tables.items():
stmt = "CREATE TEMPORARY VIEW {} AS {}".format(key, " UNION ALL ".join(itr))
execute_statement(connection, stmt)
return all_tables
def _create_meta_views(connection):
schema = RocpdSchema()
sql_script = schema.views.replace("CREATE VIEW", "CREATE TEMPORARY VIEW")
execute_statement(connection, sql_script, is_script=True)