RDC REST API (Sample code)
Please follow the README file Update README_rdc_rest_api.txt Update RDC_REST_API.py Error handling updates Updates for error handling Updates Updates for rdc_field_watch/rdc_field_unwatch and delete query Updates for rdc_field_watch/rdc_field_unwatch and delete query SWDEV-479738 [RDC] - Rest API Delete python_binding/RDC_REST_API.py new rdc_rest_api.py file for SWDEV-479738 [RDC] - Rest API
Этот коммит содержится в:
коммит произвёл
Galantsev, Dmitrii
родитель
e847f74f78
Коммит
cf566ebd31
@@ -0,0 +1,102 @@
|
||||
# RDC REST API
|
||||
|
||||
## Overview
|
||||
This REST API lets you:
|
||||
- Discover available GPUs on a node.
|
||||
- Configure and manage GPU monitoring queries.
|
||||
- Retrieve GPU metrics based on configured queries.
|
||||
|
||||
The API is built using Flask and interacts with the RDC library to monitor GPU usage and performance metrics.
|
||||
|
||||
## Installation
|
||||
### Prerequisites
|
||||
- Python 3.x
|
||||
- Flask
|
||||
- RDC Library (`librdc_bootstrap.so` must be available and accessible)
|
||||
|
||||
### Install Dependencies
|
||||
```sh
|
||||
pip install flask
|
||||
```
|
||||
|
||||
## Running the API
|
||||
1. Ensure the directory that contains `librdc_bootstrap.so` is on the dynamic linker's library search path:
|
||||
```sh
|
||||
export LD_LIBRARY_PATH=/path/to/directory/containing/librdc_bootstrap:$LD_LIBRARY_PATH
|
||||
```
|
||||
2. Run the API:
|
||||
```sh
|
||||
python rdc_rest_api.py
|
||||
```
|
||||
|
||||
The API will start and listen on port `50052` on all network interfaces (`http://0.0.0.0:50052`).
|
||||
|
||||
## API Endpoints
|
||||
|
||||
### 1. Discover GPUs
|
||||
**GET** `/rdc/discovery`
|
||||
#### Response:
|
||||
```json
|
||||
{
|
||||
"0": "GPU Name",
|
||||
"1": "GPU Name"
|
||||
}
|
||||
```
|
||||
|
||||
### 2. Create Query Criteria
|
||||
**POST** `/rdc/query_criteria`
|
||||
#### Request Body:
|
||||
```json
|
||||
{
|
||||
"gpu_index": [0,1],
|
||||
"metrics": ["RDC_FI_GPU_CLOCK", "RDC_FI_GPU_TEMP"]
|
||||
}
|
||||
```
|
||||
#### Response:
|
||||
```json
|
||||
{
|
||||
"query_id": "G-1-F-2"
|
||||
}
|
||||
```
|
||||
|
||||
### 3. Get Query Criteria
|
||||
**GET** `/rdc/query_criteria/<query_id>`
|
||||
#### Response:
|
||||
```json
|
||||
{
|
||||
"gpu_index": [0,1],
|
||||
"metrics": ["RDC_FI_GPU_CLOCK", "RDC_FI_GPU_TEMP"],
|
||||
"query_id": "G-1-F-2"
|
||||
}
|
||||
```
|
||||
|
||||
### 4. Delete Query Criteria
|
||||
**DELETE** `/rdc/query_criteria/<query_id>`
|
||||
#### Response:
|
||||
```json
|
||||
{
|
||||
"message": "Deleted successfully"
|
||||
}
|
||||
```
|
||||
|
||||
### 5. Retrieve GPU Metrics
|
||||
**GET** `/rdc/gpu_metrics/<query_id>`
|
||||
#### Response:
|
||||
```json
|
||||
[
|
||||
{
|
||||
"gpu_index": 0,
|
||||
"RDC_FI_GPU_CLOCK": 1450,
|
||||
"RDC_FI_GPU_TEMP": 32
|
||||
},
|
||||
{
|
||||
"gpu_index": 1,
|
||||
"RDC_FI_GPU_CLOCK": 736,
|
||||
"RDC_FI_GPU_TEMP": 35
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
## Notes
|
||||
- Ensure `librdc_bootstrap.so` is properly linked.
|
||||
- The API should be run on a system with RDC installed and GPUs accessible.
|
||||
@@ -0,0 +1,132 @@
|
||||
from flask import Flask, request, jsonify
|
||||
from RdcReader import RdcReader
|
||||
from RdcUtil import RdcUtil
|
||||
from rdc_bootstrap import *
|
||||
|
||||
# Flask application object; every endpoint below is registered on it
# through the @app.route decorator.
app = Flask(__name__)

# Shared RDC objects used by all endpoints. Each handler passes
# rdc_reader.rdc_handle to the RDC calls it makes.
# NOTE(review): ip_port=None presumably selects RDC embedded mode rather
# than connecting to a remote rdcd daemon - confirm against RdcReader.
rdc_reader = RdcReader(ip_port=None)
rdc_util = RdcUtil()

# In-memory registry of query criteria, keyed by query_id. Each value is a
# dict with the keys "gpu_index", "metrics" and "query_id". Being a plain
# module-level dict, its contents do not survive a process restart.
gpu_queries = {}
|
||||
|
||||
# Endpoint to discover available GPUs
|
||||
@app.route('/rdc/discovery', methods=['GET'])
def discover_gpus():
    """Return a JSON object mapping each GPU index to its device name.

    Any exception raised while talking to RDC or building the response is
    reported as a JSON ``{"error": <message>}`` body with HTTP status 500.
    """
    def _device_name(index):
        # Fill a fresh attributes struct for this GPU and decode the name,
        # which is stored as bytes.
        attributes = rdc_device_attributes_t()
        rdc.rdc_device_get_attributes(rdc_reader.rdc_handle, index, attributes)
        return attributes.device_name.decode('utf-8')

    try:
        names_by_index = {
            index: _device_name(index)
            for index in rdc_util.get_all_gpu_indexes(rdc_reader.rdc_handle)
        }
        return jsonify(names_by_index)
    except Exception as e:
        return jsonify({"error": str(e)}), 500
|
||||
|
||||
# Endpoint to create a new query criteria
|
||||
@app.route('/rdc/query_criteria', methods=['POST'])
def create_query_criteria():
    """Create a query: a GPU group plus a field group that RDC watches.

    Request body (JSON object):
        metrics:   required list of RDC field names, e.g. "RDC_FI_GPU_TEMP".
        gpu_index: optional list of GPU indexes; when absent (or null) all
                   GPUs reported by RDC are used.

    Returns:
        200 with ``{"query_id": "G-<gpu_group>-F-<field_group>"}`` on success.
        400 when the body is missing, is not valid JSON, is not a JSON
            object, or has no "metrics" key.
        500 when RDC refuses to watch the fields or any other error occurs.
    """
    # Arguments forwarded to rdc_field_watch.
    # NOTE(review): per the RDC API these are the update frequency in
    # microseconds, the maximum sample age in seconds and the maximum number
    # of samples kept - confirm against the installed RDC headers.
    update_freq = 1000000
    max_keep_age = 3600.0
    max_keep_samples = 1000

    try:
        # silent=True yields None for a missing or malformed JSON body, so the
        # client receives the 400 below instead of a 500 from the catch-all.
        data = request.get_json(silent=True)
        if not isinstance(data, dict) or "metrics" not in data:
            return jsonify({"error": "Invalid request payload"}), 400

        handle = rdc_reader.rdc_handle
        metrics = data["metrics"]

        # Only enumerate all GPUs when the caller did not choose any.
        gpu_indexes = data.get("gpu_index")
        if gpu_indexes is None:
            gpu_indexes = rdc_util.get_all_gpu_indexes(handle)

        # Resolve the field ids before creating any RDC group so that a bad
        # metric entry cannot leave a half-built query behind.
        field_ids = [
            rdc.get_field_id_from_name(m.encode('utf-8')).value for m in metrics
        ]

        # Create rdc group and fieldgroup
        gpu_group_id, _ = rdc_util.create_gpu_group(
            handle, b"query_gpu_group", gpu_indexes)
        field_group_id, _ = rdc_util.create_field_group(
            handle, b"query_field_group", field_ids)

        # Call rdc_field_watch to start fetching metrics into cache
        result = rdc.rdc_field_watch(
            handle, gpu_group_id, field_group_id,
            update_freq, max_keep_age, max_keep_samples)
        if rdc_status_t(result) != rdc_status_t.RDC_ST_OK:
            # The query is never registered, so nobody could delete these
            # groups later; release them now instead of leaking them.
            rdc.rdc_group_field_destroy(handle, field_group_id)
            rdc.rdc_group_gpu_destroy(handle, gpu_group_id)
            return jsonify({"error": "Failed to watch fields"}), 500

        query_id = f"G-{gpu_group_id.value}-F-{field_group_id.value}"
        gpu_queries[query_id] = {
            "gpu_index": gpu_indexes,
            "metrics": metrics,
            "query_id": query_id,
        }
        return jsonify({"query_id": query_id})
    except Exception as e:
        return jsonify({"error": str(e)}), 500
|
||||
|
||||
# Endpoint to get all query criteria
|
||||
@app.route('/rdc/query_criteria', methods=['GET'])
def get_all_query_criteria():
    """List stored query criteria, optionally narrowed to one query.

    With a non-empty ``query_id`` URL parameter the response is that query's
    criteria, or an empty JSON object when the id is unknown. Without it the
    response is a JSON list of every stored query. Unexpected errors produce
    ``{"error": <message>}`` with HTTP status 500.
    """
    try:
        wanted = request.args.get("query_id")
        if not wanted:
            payload = list(gpu_queries.values())
        else:
            payload = gpu_queries.get(wanted, {})
        return jsonify(payload)
    except Exception as e:
        return jsonify({"error": str(e)}), 500
|
||||
|
||||
# Endpoint to retrieve a specific query criteria
|
||||
@app.route('/rdc/query_criteria/<query_id>', methods=['GET'])
def get_query_criteria(query_id):
    """Return the stored criteria for ``query_id``.

    Responds with 404 and ``{"error": "Query ID not found"}`` when the id is
    not registered, and with 500 and ``{"error": <message>}`` on any
    unexpected error.
    """
    try:
        # Guard clause: unknown ids are rejected before any lookup.
        if query_id not in gpu_queries:
            return jsonify({"error": "Query ID not found"}), 404
        criteria = gpu_queries[query_id]
        return jsonify(criteria)
    except Exception as e:
        return jsonify({"error": str(e)}), 500
|
||||
|
||||
# Endpoint to delete a specific query criteria
|
||||
@app.route('/rdc/query_criteria/<query_id>', methods=['DELETE'])
def delete_query_criteria(query_id):
    """Stop watching a query's fields, destroy its RDC groups and forget it.

    Args:
        query_id: Identifier of the form ``G-<gpu_group>-F-<field_group>``
            as handed out when the query was created.

    Returns:
        200 with ``{"message": "Deleted successfully"}`` on success.
        404 when ``query_id`` is not registered.
        500 when RDC fails to unwatch the fields or any other error occurs;
            the query stays registered in that case.
    """
    try:
        if query_id not in gpu_queries:
            return jsonify({"error": "Query ID not found"}), 404

        # Recover the ids of the groups that belong to THIS query from its
        # id. The previous code read rdc_reader.field_group_id for both
        # values, so it passed a field-group id as the GPU group and acted on
        # the shared reader's groups instead of the query's own.
        _, gpu_part, _, field_part = query_id.split("-")
        # Plain ints are passed; ctypes converts Python ints for
        # integer-typed foreign-function arguments.
        gpu_group_id = int(gpu_part)
        field_group_id = int(field_part)

        handle = rdc_reader.rdc_handle

        # Call rdc_field_unwatch to stop fetching metrics
        result = rdc.rdc_field_unwatch(handle, gpu_group_id, field_group_id)
        if rdc_status_t(result) != rdc_status_t.RDC_ST_OK:
            return jsonify({"error": "Failed to unwatch fields"}), 500

        # Delete GPU and field groups. Their status codes are not checked,
        # so the teardown is best-effort.
        rdc.rdc_group_gpu_destroy(handle, gpu_group_id)
        rdc.rdc_group_field_destroy(handle, field_group_id)

        # Remove the query from storage
        del gpu_queries[query_id]
        return jsonify({"message": "Deleted successfully"})
    except Exception as e:
        return jsonify({"error": str(e)}), 500
|
||||
|
||||
# Endpoint to fetch GPU metrics for a specific query ID
|
||||
@app.route('/rdc/gpu_metrics/<query_id>', methods=['GET'])
def get_gpu_metrics(query_id):
    """Return the latest value of every metric for every GPU of a query.

    The response is a JSON list with one object per GPU. Each object holds
    "gpu_index" plus one entry per metric whose read returned RDC_ST_OK;
    metrics that could not be read are left out. Responds with 404 when
    ``query_id`` is not registered and with 500 on any unexpected error.
    """
    try:
        query = gpu_queries.get(query_id)
        if query is None:
            return jsonify({"error": "Query ID not found"}), 404

        def _collect(gpu):
            # Build the response entry for one GPU.
            entry = {"gpu_index": gpu}
            for name in query["metrics"]:
                field_id = rdc.get_field_id_from_name(name.encode('utf-8')).value
                latest = rdc_field_value()
                status = rdc.rdc_field_get_latest_value(
                    rdc_reader.rdc_handle, gpu, field_id, latest)
                if rdc_status_t(status) != rdc_status_t.RDC_ST_OK:
                    continue
                # NOTE(review): always reads the l_int member; confirm that
                # every supported metric is stored as an integer.
                entry[name] = latest.value.l_int
            return entry

        return jsonify([_collect(gpu) for gpu in query["gpu_index"]])
    except Exception as e:
        return jsonify({"error": str(e)}), 500
|
||||
|
||||
# Main entry point to start the Flask server
|
||||
if __name__ == '__main__':
    # Binding to 0.0.0.0 makes the server reachable on every network
    # interface of the host, not only on localhost.
    listen_host = '0.0.0.0'
    listen_port = 50052
    app.run(host=listen_host, port=listen_port)
|
||||
Ссылка в новой задаче
Block a user