diff --git a/python_binding/README_rdc_rest_api.txt b/python_binding/README_rdc_rest_api.txt
new file mode 100644
index 0000000000..8c22b9b978
--- /dev/null
+++ b/python_binding/README_rdc_rest_api.txt
@@ -0,0 +1,102 @@
+# RDC REST API
+
+## Overview
+This REST API lets you:
+- Discover available GPUs on a node.
+- Configure and manage GPU monitoring queries.
+- Retrieve GPU metrics based on configured queries.
+
+The API is built using Flask and interacts with the RDC library to monitor GPU usage and performance metrics.
+
+## Installation
+### Prerequisites
+- Python 3.x
+- Flask
+- RDC Library (`librdc_bootstrap.so` must be available and accessible)
+
+### Install Dependencies
+```sh
+pip install flask
+```
+
+## Running the API
+1. Ensure the directory containing `librdc_bootstrap.so` is in the library path:
+   ```sh
+   export LD_LIBRARY_PATH=/path/to/rdc/lib:$LD_LIBRARY_PATH
+   ```
+2. Run the API:
+   ```sh
+   python rdc_rest_api.py
+   ```
+
+The API listens on all network interfaces on port 50052 (`http://0.0.0.0:50052`).
+
+## API Endpoints
+
+### 1. Discover GPUs
+**GET** `/rdc/discovery`
+#### Response:
+```json
+{
+  "0": "GPU Name",
+  "1": "GPU Name"
+}
+```
+
+### 2. Create Query Criteria
+**POST** `/rdc/query_criteria`
+#### Request Body:
+```json
+{
+  "gpu_index": [0,1],
+  "metrics": ["RDC_FI_GPU_CLOCK", "RDC_FI_GPU_TEMP"]
+}
+```
+#### Response:
+```json
+{
+  "query_id": "G-1-F-2"
+}
+```
+
+### 3. Get Query Criteria
+**GET** `/rdc/query_criteria/<query_id>`
+#### Response:
+```json
+{
+  "gpu_index": [0,1],
+  "metrics": ["RDC_FI_GPU_CLOCK", "RDC_FI_GPU_TEMP"],
+  "query_id": "G-1-F-2"
+}
+```
+
+### 4. Delete Query Criteria
+**DELETE** `/rdc/query_criteria/<query_id>`
+#### Response:
+```json
+{
+  "message": "Deleted successfully"
+}
+```
+
+### 5.
"""REST API exposing RDC GPU discovery and metric queries over HTTP.

Endpoints (all responses are JSON):

- ``GET    /rdc/discovery``                 -- map of GPU index to device name.
- ``POST   /rdc/query_criteria``            -- create a query; returns its ``query_id``.
- ``GET    /rdc/query_criteria``            -- list all queries, or one via ``?query_id=``.
- ``GET    /rdc/query_criteria/<query_id>`` -- fetch one query.
- ``DELETE /rdc/query_criteria/<query_id>`` -- stop watching and delete one query.
- ``GET    /rdc/gpu_metrics/<query_id>``    -- latest metric values for one query.

Example ``GET /rdc/gpu_metrics/<query_id>`` response::

    [
        {"gpu_index": 0, "RDC_FI_GPU_CLOCK": 1450, "RDC_FI_GPU_TEMP": 32},
        {"gpu_index": 1, "RDC_FI_GPU_CLOCK": 736, "RDC_FI_GPU_TEMP": 35}
    ]

Notes:
- Ensure ``librdc_bootstrap.so`` is properly linked.
- The API should be run on a system with RDC installed and GPUs accessible.
"""
import functools

from flask import Flask, request, jsonify
from RdcReader import RdcReader
from RdcUtil import RdcUtil
from rdc_bootstrap import *  # noqa: F401,F403 -- supplies rdc, rdc_status_t, rdc_field_value, ...

# Initialize Flask app
app = Flask(__name__)

# Initialize RDC Reader and Utilities for handling GPU queries.
# NOTE(review): ip_port=None presumably selects an in-process (embedded) RDC
# session rather than a remote rdcd -- confirm against RdcReader.
rdc_reader = RdcReader(ip_port=None)
rdc_util = RdcUtil()

# Client-visible query criteria, keyed by query_id. Values are JSON-serialisable.
gpu_queries = {}

# Server-side only: query_id -> (gpu_group_id, field_group_id) exactly as
# returned by RdcUtil when the query was created. Kept apart from gpu_queries
# because those handles are not JSON-serialisable and must never be sent out.
_query_groups = {}

# Arguments passed to rdc_field_watch for every query.
# NOTE(review): units follow the RDC API reference for rdc_field_watch
# (update frequency in microseconds, max age in seconds) -- confirm.
_WATCH_UPDATE_FREQ = 1000000
_WATCH_MAX_KEEP_AGE = 3600.0
_WATCH_MAX_KEEP_SAMPLES = 1000


def _json_errors(view):
    """Wrap *view* so an uncaught exception is logged and returned as a JSON 500."""
    @functools.wraps(view)  # keeps the view's name, which Flask uses as the endpoint name
    def wrapper(*args, **kwargs):
        try:
            return view(*args, **kwargs)
        except Exception as e:
            app.logger.exception("Unhandled error in %s", view.__name__)
            return jsonify({"error": str(e)}), 500
    return wrapper


# Endpoint to discover available GPUs
@app.route('/rdc/discovery', methods=['GET'])
@_json_errors
def discover_gpus():
    """Return a JSON object mapping each GPU index to its device name.

    Responds with 500 if the attributes of any GPU cannot be read.
    """
    gpus = {}
    for gpu in rdc_util.get_all_gpu_indexes(rdc_reader.rdc_handle):
        device_attr = rdc_device_attributes_t()
        result = rdc.rdc_device_get_attributes(rdc_reader.rdc_handle, gpu, device_attr)
        if rdc_status_t(result) != rdc_status_t.RDC_ST_OK:
            # Do not report a GPU with an unpopulated (empty) name.
            return jsonify({"error": f"Failed to read attributes of GPU {gpu}"}), 500
        gpus[gpu] = device_attr.device_name.decode('utf-8')  # Decode GPU name from bytes
    return jsonify(gpus)


# Endpoint to create a new query criteria
@app.route('/rdc/query_criteria', methods=['POST'])
@_json_errors
def create_query_criteria():
    """Create a query from a JSON body and start watching its fields.

    Body: ``{"metrics": [<field name>, ...], "gpu_index": [<int>, ...]}``.
    ``gpu_index`` is optional; when absent (or null) all GPUs are used.

    Returns ``{"query_id": "G-<gpu group>-F-<field group>"}`` on success,
    400 for a malformed payload, 500 if the fields cannot be watched.
    """
    # silent=True: a missing or non-JSON body yields None (-> 400) instead of raising.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or "metrics" not in data:
        return jsonify({"error": "Invalid request payload"}), 400

    metrics = data["metrics"]
    if (not isinstance(metrics, list) or not metrics
            or not all(isinstance(m, str) for m in metrics)):
        return jsonify({"error": '"metrics" must be a non-empty list of field names'}), 400

    gpu_indexes = data.get("gpu_index")
    if gpu_indexes is None:
        # Only query RDC for the full GPU list when the client did not supply one.
        gpu_indexes = rdc_util.get_all_gpu_indexes(rdc_reader.rdc_handle)
    elif (not isinstance(gpu_indexes, list)
            or not all(isinstance(i, int) and not isinstance(i, bool) for i in gpu_indexes)):
        return jsonify({"error": '"gpu_index" must be a list of integers'}), 400

    handle = rdc_reader.rdc_handle

    # Resolve field ids before creating any group so a failure here leaks nothing.
    field_ids = [rdc.get_field_id_from_name(m.encode('utf-8')).value for m in metrics]

    # Create rdc group and fieldgroup
    gpu_group_id, _ = rdc_util.create_gpu_group(handle, b"query_gpu_group", gpu_indexes)
    try:
        field_group_id, _ = rdc_util.create_field_group(handle, b"query_field_group", field_ids)
    except Exception:
        rdc.rdc_group_gpu_destroy(handle, gpu_group_id)  # roll back the half-built query
        raise

    # Call rdc_field_watch to start fetching metrics into cache
    result = rdc.rdc_field_watch(handle, gpu_group_id, field_group_id,
                                 _WATCH_UPDATE_FREQ, _WATCH_MAX_KEEP_AGE,
                                 _WATCH_MAX_KEEP_SAMPLES)
    if rdc_status_t(result) != rdc_status_t.RDC_ST_OK:
        # The query is never registered, so nothing else could clean these up.
        rdc.rdc_group_gpu_destroy(handle, gpu_group_id)
        rdc.rdc_group_field_destroy(handle, field_group_id)
        return jsonify({"error": "Failed to watch fields"}), 500

    query_id = f"G-{gpu_group_id.value}-F-{field_group_id.value}"
    gpu_queries[query_id] = {"gpu_index": gpu_indexes, "metrics": metrics, "query_id": query_id}
    _query_groups[query_id] = (gpu_group_id, field_group_id)
    return jsonify({"query_id": query_id})


# Endpoint to get all query criteria
@app.route('/rdc/query_criteria', methods=['GET'])
@_json_errors
def get_all_query_criteria():
    """Return every stored query, or a single one when ``?query_id=`` is given.

    With ``query_id`` set, an unknown id yields an empty object (not a 404).
    """
    query_id = request.args.get("query_id")
    if query_id:
        return jsonify(gpu_queries.get(query_id, {}))
    return jsonify(list(gpu_queries.values()))


# Endpoint to retrieve a specific query criteria
@app.route('/rdc/query_criteria/<query_id>', methods=['GET'])
@_json_errors
def get_query_criteria(query_id):
    """Return the stored criteria for *query_id*, or 404 if it is unknown."""
    query = gpu_queries.get(query_id)
    if query is None:
        return jsonify({"error": "Query ID not found"}), 404
    return jsonify(query)


# Endpoint to delete a specific query criteria
@app.route('/rdc/query_criteria/<query_id>', methods=['DELETE'])
@_json_errors
def delete_query_criteria(query_id):
    """Stop watching, destroy the RDC groups of, and forget *query_id*.

    Returns 404 for an unknown id and 500 (leaving the query in place) if the
    fields cannot be unwatched.
    """
    if query_id not in gpu_queries:
        return jsonify({"error": "Query ID not found"}), 404

    # Use the groups created for THIS query, not the reader's own groups.
    gpu_group_id, field_group_id = _query_groups[query_id]
    handle = rdc_reader.rdc_handle

    # Call rdc_field_unwatch to stop fetching metrics
    result = rdc.rdc_field_unwatch(handle, gpu_group_id, field_group_id)
    if rdc_status_t(result) != rdc_status_t.RDC_ST_OK:
        return jsonify({"error": "Failed to unwatch fields"}), 500

    # Delete GPU and field groups
    rdc.rdc_group_gpu_destroy(handle, gpu_group_id)
    rdc.rdc_group_field_destroy(handle, field_group_id)

    # Remove the query from storage
    del gpu_queries[query_id]
    del _query_groups[query_id]
    return jsonify({"message": "Deleted successfully"})


# Endpoint to fetch GPU metrics for a specific query ID
@app.route('/rdc/gpu_metrics/<query_id>', methods=['GET'])
@_json_errors
def get_gpu_metrics(query_id):
    """Return the latest value of each queried metric for each queried GPU.

    The response is a list with one object per GPU:
    ``{"gpu_index": <gpu>, "<metric>": <value>, ...}``. A metric whose latest
    value cannot be read is omitted from that GPU's object. Returns 404 for an
    unknown *query_id*.
    """
    query = gpu_queries.get(query_id)
    if query is None:
        return jsonify({"error": "Query ID not found"}), 404

    # Field ids do not depend on the GPU, so resolve them once up front.
    fields = [(metric, rdc.get_field_id_from_name(metric.encode('utf-8')).value)
              for metric in query["metrics"]]

    gpu_metrics = []  # List to store GPU metric results
    for gpu in query["gpu_index"]:
        gpu_data = {"gpu_index": gpu}  # Store GPU index in the response
        for metric, field_id in fields:
            value = rdc_field_value()
            result = rdc.rdc_field_get_latest_value(rdc_reader.rdc_handle, gpu, field_id, value)
            if rdc_status_t(result) == rdc_status_t.RDC_ST_OK:
                # NOTE(review): always reads the integer member; if a field can be
                # floating-point or string typed this value is wrong -- confirm and
                # branch on the field value's type tag if so.
                gpu_data[metric] = value.value.l_int  # Store metric value
        gpu_metrics.append(gpu_data)  # Append GPU data to results
    return jsonify(gpu_metrics)


# Main entry point to start the Flask server
if __name__ == '__main__':
    # Runs the API server, making it accessible on all network interfaces.
    # There is no authentication on any endpoint, so restrict access at the
    # network level when running outside a trusted environment.
    app.run(host='0.0.0.0', port=50052)