Files
rocm-systems/src/utils/csv_processor.py
T
coleramos425 3ae0198f8e Renaming csv_converter to make more succinct
Signed-off-by: coleramos425 <colramos@amd.com>
2023-08-16 16:16:46 -05:00

250 rader
8.8 KiB
Python

##############################################################################bl
# MIT License
#
# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
##############################################################################el
import argparse
import collections
import os
import subprocess
import sys
import re
import pandas as pd
import getpass
from pymongo import MongoClient
from tqdm import tqdm
import glob
# Memo of kernel-name translations shared by every CSV handled in this process:
# original name as found in the CSV -> shortened (or demangled) name.
cache = {}

# GPU architecture string read from sysinfo.csv ("gpu_soc") -> SoC label used
# when composing the database name.
supported_arch = {
    "gfx906": "mi50",
    "gfx908": "mi100",
    "gfx90a": "mi200",
}

# Value passed to MongoClient as serverSelectionTimeoutMS (5 second timeout).
MAX_SERVER_SEL_DELAY = 5000
# Splits a demangled C++ name into (identifier, separator run, optional "::")
# triples. Compiled once at import time instead of once per kernel name.
_NAMES_AND_ARGS = re.compile(r"(?P<name>[( )A-Za-z0-9_]+)([ ,*<>()]+)(::)?")

# Demangler executable used to turn mangled kernel symbols into C++ names.
_CXXFILT = "/opt/rocm/llvm/bin/llvm-cxxfilt"


def _demangle(mangled_name):
    """Return *mangled_name* as printed by llvm-cxxfilt, stripped of whitespace.

    Falls back to *mangled_name* itself when the tool prints nothing, so a
    failed demangle never blanks out the kernel name.

    Raises:
        OSError: if the llvm-cxxfilt executable cannot be launched.
        UnicodeDecodeError: if the tool's output is not valid UTF-8.
    """
    proc = subprocess.run(
        [_CXXFILT, mangled_name],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    demangled = proc.stdout.decode("UTF-8").strip()
    return demangled or mangled_name


def _shorten_demangled_name(demangled_name, level):
    """Trim template detail from *demangled_name* beyond nesting depth *level*.

    The name is scanned as (identifier, separators, scope) fragments. The
    running depth goes up with every '<' and down with every '>' found in the
    separators; fragments are copied to the result only while the depth is
    below *level*. If nothing matches, or nothing is kept, the demangled name
    is returned unchanged.
    """
    fragments = _NAMES_AND_ARGS.findall(demangled_name)
    if not fragments:
        # e.g. '__amd_rocclr_fillBuffer.kd': nothing to split, keep as-is
        return demangled_name

    new_name = ""
    current_level = 0
    for identifier, separators, scope in fragments:
        # NOTE: this also drops a genuine function or argument named 'clone'
        if identifier == "clone":
            continue
        # namespace qualifiers (fragments followed by '::') are discarded
        if scope == "::":
            continue

        if current_level < level:
            new_name += identifier

        closing = separators.count(">")
        opening = separators.count("<")

        # separators holding a '>' are handled by the loop below
        if closing == 0:
            if current_level < level:
                # do not emit the '<' that would open the first hidden level
                if not (current_level == level - 1 and opening > 0):
                    new_name += separators
            current_level += opening

        # Runs such as '>' or '> >, ' are walked character by character so
        # that neither commas nor the current depth are lost.
        curr_index = 0
        while closing > 0 and curr_index < len(separators):
            if current_level < level:
                remainder = separators[curr_index:]
                new_name += remainder
                current_level -= remainder.count(">")
                curr_index = len(separators)
            elif separators[curr_index] == ">":
                current_level -= 1
            curr_index += 1

    return new_name or demangled_name


def _shorten_column(df, level):
    """Shorten the kernel-name column of *df* in place and return *df*.

    The column is "Name" when present, otherwise "KernelName"; a frame with
    neither column is returned untouched. Translations are memoized in the
    module-level ``cache``, which is keyed by the original name only.
    """
    column_name = ""
    if "KernelName" in df:
        column_name = "KernelName"
    if "Name" in df:
        column_name = "Name"
    if not column_name:
        return df

    # One demangler call per distinct name rather than one lookup per row.
    for original_name in df[column_name].unique():
        # Blank cells (NaN) and other non-string values cannot be demangled.
        if not isinstance(original_name, str) or original_name in cache:
            continue
        cache[original_name] = _shorten_demangled_name(
            _demangle(original_name), level
        )

    # Non-string cells are passed through unchanged instead of becoming NaN.
    df[column_name] = df[column_name].map(
        lambda value: cache[value] if isinstance(value, str) else value
    )
    return df


def kernel_name_shortener(workload_dir, level):
    """Rewrite every CSV in *workload_dir* with shortened kernel names.

    Args:
        workload_dir: Directory whose ``*.csv`` files are rewritten in place.
        level: Template nesting depth to keep. Values of 5 or more disable
            shortening and leave all files untouched.

    Every readable CSV is written back (without the index column) even when it
    has no kernel-name column. Empty CSV files are reported and skipped.

    Raises:
        OSError: if a kernel name must be demangled and llvm-cxxfilt cannot
            be launched.
    """
    # Only shorten if valid shortening level
    if level >= 5:
        return

    for fpath in glob.glob(workload_dir + "/*.csv"):
        try:
            orig_df = pd.read_csv(
                fpath,
                on_bad_lines="skip",
                engine="python",
            )
        except pd.errors.EmptyDataError:
            print("Skipping empty csv " + str(fpath))
            continue
        _shorten_column(orig_df, level).to_csv(fpath, index=False)
# Verify target directory and setup connection
def parse(args, profileAndExport):
    """Validate the workload directory and collect database connection details.

    Args:
        args: Parsed command-line namespace. Reads ``host``, ``port``,
            ``username``, ``password``, ``team``, ``workload``,
            ``kernelVerbose`` and, when *profileAndExport* is truthy,
            ``target``.
        profileAndExport: When truthy the workload directory is
            ``args.workload/args.target/``; otherwise ``args.workload``.

    Returns:
        A ``(connectionInfo, Extractionlvl)`` tuple. ``connectionInfo`` is a
        dict with the keys ``username``, ``password``, ``host``, ``port`` (as
        a string), ``workload`` and ``db``; ``Extractionlvl`` is
        ``args.kernelVerbose``.

    Raises:
        argparse.ArgumentTypeError: The workload directory does not exist.
        KeyError: The ``gpu_soc`` value is not a key of ``supported_arch``.
        ValueError: The composed database name contains '-' or '.'.
        SystemExit: ``sysinfo.csv`` is missing, or the password prompt failed.
    """
    host = args.host
    port = str(args.port)
    username = args.username
    Extractionlvl = args.kernelVerbose

    if profileAndExport:
        workload = args.workload + "/" + args.target + "/"
    else:
        workload = args.workload

    # Verify directory path is valid
    print("Pulling data from ", workload)
    if not os.path.isdir(workload):
        raise argparse.ArgumentTypeError("Directory does not exist")
    print("The directory exists")

    sysInfoPath = workload + "/sysinfo.csv"
    if not os.path.isfile(sysInfoPath):
        print("Unable to parse SoC or workload name from sysinfo.csv")
        sys.exit(1)
    print("Found sysinfo file")
    sysInfo = pd.read_csv(sysInfoPath)
    # SoC label and workload name both come from the first row of the file
    soc = supported_arch[sysInfo["gpu_soc"][0]]
    name = sysInfo["workload_name"][0]

    db = "omniperf_" + str(args.team) + "_" + str(name) + "_" + soc
    # Reject a bad name before prompting, so the user is not asked for a
    # password that could never be used.
    if "." in db or "-" in db:
        raise ValueError("'-' and '.' are not permited in workload name", db)

    if args.password == "":
        try:
            password = getpass.getpass()
        except Exception as error:
            print("PASSWORD ERROR", error)
            # Without a password there is nothing to connect with; stop here
            # rather than failing later on an unbound variable.
            sys.exit(1)
        print("Password recieved")
    else:
        password = args.password

    connectionInfo = {
        "username": username,
        "password": password,
        "host": host,
        "port": port,
        "workload": workload,
        "db": db,
    }
    return connectionInfo, Extractionlvl
def convert_folder(connectionInfo, Extractionlvl):
    """Import every CSV of the workload into MongoDB and register its name.

    Each ``*.csv`` file in ``connectionInfo["workload"]`` is loaded with the
    ``mongoimport`` tool into database ``connectionInfo["db"]``. The collection
    is named after the file name up to its first '.', and any existing
    collection of that name is dropped. Afterwards the database name is
    upserted into the ``names`` collection of the ``workload_names`` database.

    Args:
        connectionInfo: Dict with the keys ``username``, ``password``,
            ``host``, ``port``, ``workload`` and ``db``.
        Extractionlvl: Not used by this function; kept so existing callers
            keep working.

    Raises:
        SystemExit: The server cannot be reached, or ``mongoimport`` cannot
            be launched.
    """
    from urllib.parse import quote

    # User name and password are percent-encoded so that characters such as
    # '@', ':' or '/' cannot corrupt the URI.
    authority = "{}:{}@{}:{}".format(
        quote(connectionInfo["username"], safe=""),
        quote(connectionInfo["password"], safe=""),
        connectionInfo["host"],
        connectionInfo["port"],
    )
    connection_str = "mongodb://" + authority + "/?authSource=admin"
    import_uri = "mongodb://{}/{}?authSource=admin".format(
        authority, connectionInfo["db"]
    )

    client = MongoClient(connection_str, serverSelectionTimeoutMS=MAX_SERVER_SEL_DELAY)
    try:
        # Test connection
        try:
            client.server_info()
        except Exception:
            print("ERROR: Unable to connect to the server")
            sys.exit(1)

        imported = 0
        for csv_name in tqdm(os.listdir(connectionInfo["workload"])):
            if not csv_name.endswith(".csv"):
                continue
            csv_path = connectionInfo["workload"] + "/" + csv_name
            print(csv_path)

            # collection name is everything before the first '.'
            collection = csv_name[0 : csv_name.find(".")]
            # Argument list with no shell involved: paths and credentials are
            # passed verbatim and cannot be reinterpreted as shell syntax.
            # NOTE(security): the URI, including the password, is still part
            # of the mongoimport command line.
            cmd = [
                "mongoimport",
                "--quiet",
                "--uri",
                import_uri,
                "--file",
                csv_path,
                "-c",
                collection,
                "--drop",
                "--type",
                "csv",
                "--headerline",
            ]
            try:
                result = subprocess.run(cmd)
            except OSError as error:
                # The tool itself cannot be started, so no file can succeed.
                print("ERROR: Unable to run mongoimport:", error)
                sys.exit(1)

            if result.returncode == 0:
                imported += 1
            else:
                print("WARNING: mongoimport failed for " + csv_path)

        # Record the workload's database name (inserted if not yet present).
        names = client["workload_names"]["names"]
        entry = {"name": connectionInfo["db"]}
        names.replace_one(entry, dict(entry), upsert=True)
    finally:
        client.close()

    print("{} collections added.".format(imported))
    print("Workload name uploaded")