import subprocess

import pandas as pd


# Run a YCSB command ("load" or "run") against a database and capture its
# summary output.
def run_ycsb(database, workload, operation_type):
    command = [
        "ycsb", operation_type, database,
        "-P", f"workloads/{workload}",
        "-p", "recordcount=100000",    # Number of records to load for the test
        "-p", "operationcount=10000",  # Number of operations to perform
    ]
    # YCSB prints its summary metrics to stdout; check=True surfaces failures.
    result = subprocess.run(command, stdout=subprocess.PIPE, text=True, check=True)
    return result.stdout


# Parse YCSB's comma-separated summary lines into a flat metrics dict.
# YCSB reports one metric per line, e.g. "[READ], AverageLatency(us), 526.98".
# Writes appear as [UPDATE]/[INSERT] sections (there is no [WRITE] section),
# and throughput is reported under [OVERALL], not per operation.
def parse_ycsb_output(ycsb_output):
    metrics = {}
    sections = ("[OVERALL]", "[READ]", "[UPDATE]", "[INSERT]")
    for line in ycsb_output.splitlines():
        if line.startswith(sections):
            parts = [p.strip() for p in line.split(",")]
            if len(parts) != 3:
                continue
            operation, key, value = parts
            try:
                metrics[f"{operation}_{key}"] = float(value)
            except ValueError:
                pass  # Skip non-numeric fields
    return metrics


# Initialize results storage
results = []

# Databases to compare. "jdbc" is the YCSB binding used for MySQL; it also
# needs connection properties (db.driver, db.url, db.user, db.passwd) supplied
# via -p flags or a properties file.
databases = ["mongodb", "jdbc"]

# Workloads to run
workloads = ["workloada", "workloadb", "workloadc"]

# Run YCSB tests and collect results
for db in databases:
    for workload in workloads:
        # Load data into the database
        print(f"Loading data for {db} with {workload}...")
        load_output = run_ycsb(db, workload, "load")
        load_metrics = parse_ycsb_output(load_output)

        # Run the workload
        print(f"Running workload {workload} for {db}...")
        run_output = run_ycsb(db, workload, "run")
        run_metrics = parse_ycsb_output(run_output)

        # Record results
        results.append({
            "Database": db,
            "Workload": workload,
            "LoadMetrics": load_metrics,
            "RunMetrics": run_metrics,
        })

# Flatten the nested metric dicts into columns such as
# "RunMetrics.[READ]_AverageLatency(us)" for analysis.
df_results = pd.json_normalize(results)
print(df_results.head())

# ----------------------------------------
# Extract and analyze specific metrics. Throughput is only reported under
# [OVERALL]; update latency is NaN for read-only workloads such as workloadc.
throughput_df = df_results[[
    "Database", "Workload",
    "RunMetrics.[OVERALL]_Throughput(ops/sec)",
]]
latency_df = df_results[[
    "Database", "Workload",
    "RunMetrics.[READ]_AverageLatency(us)",
    "RunMetrics.[UPDATE]_AverageLatency(us)",
]]

# Display throughput comparison
print("Throughput Comparison:")
print(throughput_df)

# Display latency comparison
print("\nLatency Comparison:")
print(latency_df)

# Visualization (optional)
import matplotlib.pyplot as plt
import seaborn as sns

# Throughput Comparison Plot
sns.barplot(data=throughput_df, x="Workload",
            y="RunMetrics.[OVERALL]_Throughput(ops/sec)", hue="Database")
plt.title("Overall Throughput Comparison")
plt.show()

# Latency Comparison Plots
sns.barplot(data=latency_df, x="Workload",
            y="RunMetrics.[READ]_AverageLatency(us)", hue="Database")
plt.title("Read Latency Comparison")
plt.show()

sns.barplot(data=latency_df, x="Workload",
            y="RunMetrics.[UPDATE]_AverageLatency(us)", hue="Database")
plt.title("Update Latency Comparison")
plt.show()
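
# ----------------------------------------
# Sanity check for parse_ycsb_output: a minimal sketch, assuming YCSB's
# standard comma-separated summary format. The numbers below are made up
# for illustration only; they are not real benchmark results.
_sample_output = """\
[OVERALL], RunTime(ms), 10110
[OVERALL], Throughput(ops/sec), 989.11
[READ], Operations, 4948
[READ], AverageLatency(us), 860.89
[UPDATE], Operations, 5052
[UPDATE], AverageLatency(us), 914.37
"""
print("\nParser sanity check:")
print(parse_ycsb_output(_sample_output))
# Expected keys include "[OVERALL]_Throughput(ops/sec)" and
# "[READ]_AverageLatency(us)", matching the column names selected above.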