Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[flake8]
extend-ignore = E203
exclude = .git,__pycache__,batches,resources
max-complexity = 10
max-line-length = 88
16 changes: 3 additions & 13 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,20 +1,10 @@
/logs*.d/**/*
.env
/Cargo.lock
/tasks.d/**/*
/jobs.d/**/*
/results*.d/**/*
/scripts*.d/**/*
/inventories*.d/**/*
/backup/**/*
/test_results/**/*
/jobs*.yaml
/menage.sh
/resources
/*.tar.*
/*ipynb*
/batches/**
/batches
/.ssh_g5k.pub
/__pycache__
/*.png

# Added by cargo

Expand Down
286 changes: 286 additions & 0 deletions analysis/data_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,286 @@
# IMPORTS
import os
import sys
import polars as pl
import schemas
import extract
import load
import rq1
import rq2
import rq3
import rq34
import visualization


# Static lookup keyed by the CPU marketing name found in the node
# inventories' "processor_version" column. Each entry carries the
# microarchitecture name, vendor, a generation index, and launch quarter;
# energy_for_os() uses it to enrich the nodes DataFrame.
# NOTE(review): the generation indices look hand-assigned (e.g. both
# Cascade Lake and Ice Lake map to 10) — confirm the intended scheme.
vendor_generation_map = {
    "E5-2620 v4": {
        "architecture": "Broadwell-E",
        "vendor": "Intel",
        "generation": 6,
        "launch_date": "Q1 2016",
    },
    "E5-2630L v4": {
        "architecture": "Broadwell-E",
        "vendor": "Intel",
        "generation": 6,
        "launch_date": "Q1 2016",
    },
    "E5-2698 v4": {
        "architecture": "Broadwell-E",
        "vendor": "Intel",
        "generation": 6,
        "launch_date": "Q1 2016",
    },
    "E5-2630 v3": {
        "architecture": "Haswell-E",
        "vendor": "Intel",
        "generation": 5,
        "launch_date": "Q3 2014",
    },
    "Gold 5220": {
        "architecture": "Cascade Lake-SP",
        "vendor": "Intel",
        "generation": 10,
        "launch_date": "Q2 2019",
    },
    "Gold 5218": {
        "architecture": "Cascade Lake-SP",
        "vendor": "Intel",
        "generation": 10,
        "launch_date": "Q2 2019",
    },
    "i7-9750H": {
        "architecture": "Coffee Lake",
        "vendor": "Intel",
        "generation": 9,
        "launch_date": "Q2 2019",
    },
    "Silver 4314": {
        "architecture": "Ice Lake-SP",
        "vendor": "Intel",
        "generation": 10,
        "launch_date": "Q2 2021",
    },
    "Gold 5320": {
        "architecture": "Ice Lake-SP",
        "vendor": "Intel",
        "generation": 10,
        "launch_date": "Q2 2021",
    },
    "Gold 6126": {
        "architecture": "Skylake-SP",
        "vendor": "Intel",
        "generation": 6,
        "launch_date": "Q3 2017",
    },
    "Gold 6130": {
        "architecture": "Skylake-SP",
        "vendor": "Intel",
        "generation": 6,
        "launch_date": "Q3 2017",
    },
    "E5-2620": {
        "architecture": "Sandy Bridge-EP",
        "vendor": "Intel",
        "generation": 3,
        "launch_date": "Q1 2012",
    },
    "E5-2630": {
        "architecture": "Sandy Bridge-EP",
        "vendor": "Intel",
        "generation": 3,
        "launch_date": "Q1 2012",
    },
    "E5-2630L": {
        "architecture": "Sandy Bridge-EP",
        "vendor": "Intel",
        "generation": 3,
        "launch_date": "Q1 2012",
    },
    "E5-2660": {
        "architecture": "Sandy Bridge-EP",
        "vendor": "Intel",
        "generation": 3,
        "launch_date": "Q1 2012",
    },
    "7301": {
        "architecture": "Zen",
        "vendor": "AMD",
        "generation": 1,
        "launch_date": "Q2 2017",
    },
    "7352": {
        "architecture": "Zen 2",
        "vendor": "AMD",
        "generation": 2,
        "launch_date": "Q3 2019",
    },
    "7452": {
        "architecture": "Zen 2",
        "vendor": "AMD",
        "generation": 2,
        "launch_date": "Q3 2019",
    },
    "7642": {
        "architecture": "Zen 2",
        "vendor": "AMD",
        "generation": 2,
        "launch_date": "Q3 2019",
    },
    "7742": {
        "architecture": "Zen 2",
        "vendor": "AMD",
        "generation": 2,
        "launch_date": "Q3 2019",
    },
}


def main():
    """Run the full energy-measurement analysis pipeline.

    Pass the string "test" as the first command-line argument to analyse
    the small sample CSVs instead of the full result sets.
    """
    # FIX: the original read sys.argv[1] unconditionally and crashed with
    # IndexError when the script was launched without arguments.
    test = len(sys.argv) > 1 and sys.argv[1] == "test"

    # Build (or load from cache) one energy-statistics frame per OS. The
    # regex carries four capture groups over the result-file paths; their
    # interpretation is delegated to load.load_results (see energy_for_os).
    debian11_energy_stats_df = energy_for_os(
        "debian11-5.10-0",
        r"batches/debian11-5\.10-0\.d/results-debian11-5\.10-0\.d/([^/]+)/([^/]+)/([^/]+)/[^_]*_([^_]+).*",
        test,
    )
    ubuntu2404_energy_stats_df = energy_for_os(
        "ubuntu2404nfs-6.8-0",
        r"batches/ubuntu2404nfs-6\.8-0\.d/results-ubuntu2404nfs-6\.8-0\.d/([^/]+)/([^/]+)/([^/]+)/[^_]*_([^_]+).*",
        test,
    )
    # Invoked only for its CSV-caching side effect: the returned frame was
    # never used by any research question below.
    energy_for_os(
        "powerapi",
        r"batches/powerapi\.d/results-powerapi\.d/([^/]+)/([^/]+)/([^/]+)/[^_]*_([^_]+).*",
        test,
    )

    # RQ1/RQ3: coefficient-of-variation correlations per OS and job mode.
    rq3.correlation_perf_perf_hwpc_hwpc_cv_os(
        ubuntu2404_energy_stats_df, debian11_energy_stats_df, "alone"
    )
    rq1.correlation_perf_hwpc_cv(debian11_energy_stats_df, "alone", "debian11 Kernel 5.10")
    rq1.correlation_perf_hwpc_cv(debian11_energy_stats_df, "not_alone", "debian11 Kernel 5.10")
    rq1.correlation_perf_hwpc_cv(ubuntu2404_energy_stats_df, "alone", "ubuntu2404 Kernel 6.8")
    rq1.correlation_perf_hwpc_cv(ubuntu2404_energy_stats_df, "not_alone", "ubuntu2404 Kernel 6.8")

    # RQ2: per-processor CV distributions at 25 000 operations.
    rq2.boxplots_perf_hwpc_cv_processor(
        debian11_energy_stats_df,
        "processor_detail",
        "pkg_coefficient_of_variation",
        "job",
        "25 000 Operations",
    )

    # NOTE(review): a pl.concat of both frames (filtered to
    # nb_ops_per_core > 25) was computed here and never used; the dead
    # computation has been removed.

    # Pair Ubuntu and Debian measurements of the same (node, workload) cell.
    joined_df = ubuntu2404_energy_stats_df.join(
        debian11_energy_stats_df,
        on=["node", "nb_ops_per_core", "nb_core", "job"],
        suffix="_debian",
    )

    # Get rid of 25 OPS as it may be irrelevant
    joined_df = joined_df.sql("SELECT * FROM self WHERE nb_ops_per_core > 25")

    # RQ3/4: OS-comparison plots.
    rq34.os_comparison_boxplots_processor_versions_pkg_all(
        [debian11_energy_stats_df, ubuntu2404_energy_stats_df]
    )
    rq34.os_comparison_boxplots_processor_versions_ram_all(
        [debian11_energy_stats_df, ubuntu2404_energy_stats_df]
    )
    print("Heatmaps pkg perf alone")
    rq34.os_comparison_heatmap_processor_versions_pkg_nb_ops(
        joined_df.sql("SELECT * FROM self WHERE job = 'perf_alone'"), "PERF"
    )
    print("Heatmaps pkg hwpc alone")
    rq34.os_comparison_heatmap_processor_versions_pkg_nb_ops(
        joined_df.sql("SELECT * FROM self WHERE job = 'hwpc_alone'"), "HWPC"
    )
    print("Heatmaps ram perf alone")
    rq34.os_comparison_heatmap_processor_versions_ram_nb_ops(
        joined_df.sql("SELECT * FROM self WHERE job = 'perf_alone'"), "PERF"
    )
    print("Heatmaps ram hwpc alone")
    rq34.os_comparison_heatmap_processor_versions_ram_nb_ops(
        joined_df.sql("SELECT * FROM self WHERE job = 'hwpc_alone'"), "HWPC"
    )
    rq34.os_comparison_heatmap_processor_versions_pkg_percent_used(joined_df)
    rq34.os_comparison_heatmap_processor_versions_ram_percent_used(joined_df)

    # Facet grids per OS, again excluding the 25-ops runs.
    rq34.debian_facetgrid_processor_versions_pkg_cv_nb_ops(
        debian11_energy_stats_df.sql("SELECT * FROM self WHERE nb_ops_per_core > 25")
    )
    rq34.debian_facetgrid_processor_versions_ram_cv_nb_ops(
        debian11_energy_stats_df.sql("SELECT * FROM self WHERE nb_ops_per_core > 25")
    )
    rq34.ubuntu_facetgrid_processor_versions_pkg_cv_nb_ops(
        ubuntu2404_energy_stats_df.sql("SELECT * FROM self WHERE nb_ops_per_core > 25")
    )
    rq34.ubuntu_facetgrid_processor_versions_ram_cv_nb_ops(
        ubuntu2404_energy_stats_df.sql("SELECT * FROM self WHERE nb_ops_per_core > 25")
    )


def energy_for_os(os_flavor, results_directory_match, test):
    """Return the energy-statistics DataFrame for one OS flavour.

    The frame is read from a cached CSV when one exists; otherwise it is
    rebuilt from the raw HWPC/perf result files and the node inventories,
    written back to the cache, and returned.

    Args:
        os_flavor: Batch directory stem, e.g. "debian11-5.10-0".
        results_directory_match: Regex with four capture groups applied to
            result-file paths; its interpretation is delegated to
            load.load_results.
        test: When True, use the small "*_energy_stats_sample.csv" cache.
    """
    if test:
        energy_stats_csv_file = (
            f"batches/{os_flavor}.d/{os_flavor}_energy_stats_sample.csv"
        )
    else:
        energy_stats_csv_file = f"batches/{os_flavor}.d/{os_flavor}_energy_stats.csv"
    # Fast path: reuse previously computed statistics.
    if os.path.exists(energy_stats_csv_file):
        return pl.read_csv(energy_stats_csv_file)

    results_directory: str = f"batches/{os_flavor}.d/results-{os_flavor}.d/"
    inventories_directory: str = f"batches/{os_flavor}.d/inventories-{os_flavor}.d/"
    (hwpc_files, perf_files) = extract.extract_csv_files(results_directory)

    nodes_df = extract.extract_json_files(
        directory=inventories_directory, schema=schemas.nodes_configuration_columns
    )

    # Enrich the node inventory with CPU detail/generation/vendor columns
    # derived from the static vendor_generation_map lookup.
    nodes_df = nodes_df.with_columns(
        [
            (
                pl.col("processor_version")
                .map_elements(
                    lambda x: f"{x}\n{vendor_generation_map[x]['architecture']}",
                    return_dtype=pl.String,
                )
                .alias("processor_detail")
            ),
            (
                pl.col("processor_version")
                .map_elements(
                    # FIX: the "generation" values are ints, but the declared
                    # return_dtype is pl.String — stringify them so polars
                    # does not reject the mapped values.
                    lambda x: str(vendor_generation_map[x]["generation"]),
                    return_dtype=pl.String,
                )
                .alias("processor_generation")
            ),
            (
                pl.col("processor_version")
                .map_elements(
                    lambda x: vendor_generation_map[x]["vendor"], return_dtype=pl.String
                )
                .alias("processor_vendor")
            ),
        ]
    )

    print("Nodes Configuration glimpse:\n", nodes_df.head())

    # Data Exploration
    (hwpc_results, perf_results) = load.load_results(
        hwpc_files, perf_files, results_directory_match, test
    )
    print(
        "HWPC Results glimpse:\n",
        hwpc_results.head(),
        "\nHWPC Results stats:\n",
        hwpc_results.describe(),
    )
    print(hwpc_results.sql("select energy_pkg from self").describe())
    print(
        "Perf Results glimpse:\n",
        perf_results.head(),
        "\nPerf Results stats:\n",
        perf_results.describe(),
    )

    # Aggregate per-run energy statistics and cache them for next time.
    energy_stats_df = load.load_energy(hwpc_results, perf_results, nodes_df, os_flavor)
    energy_stats_df.write_csv(energy_stats_csv_file, separator=",")

    return energy_stats_df


if __name__ == "__main__":
main()
71 changes: 71 additions & 0 deletions analysis/execution_time.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import glob
import os
import statistics

import pandas as pd

def compute_mean_std(directory, nb_ops):
    """Print and return mean/std of ``time_elapsed`` for one NB_OPS value.

    Recursively collects every ``perf_*_{nb_ops}.csv`` under *directory*,
    pools their ``time_elapsed`` columns, and reports the mean and the
    population standard deviation.

    Args:
        directory: Root directory searched recursively for result CSVs.
        nb_ops: Operation count embedded in the target file names.

    Returns:
        ``(mean, std_dev)`` in seconds, or ``None`` when no matching file
        or no valid ``time_elapsed`` value was found (the original
        implementation also returned ``None`` on those paths, so callers
        that ignore the return value are unaffected).
    """
    pattern = os.path.join(directory, f"**/perf_*_{nb_ops}.csv")
    files = glob.glob(pattern, recursive=True)

    if not files:
        print(f"No files found for NB_OPS={nb_ops}")
        return None

    time_elapsed_values = []
    for file in files:
        try:
            df = pd.read_csv(file)
            time_elapsed_values.extend(df["time_elapsed"].dropna())
        except Exception as e:
            # Best-effort: a single malformed file must not abort the scan.
            print(f"Error reading file {file}: {e}")

    if not time_elapsed_values:
        print(f"No valid time_elapsed values found in files for NB_OPS={nb_ops}")
        return None

    # Use the stdlib instead of the original hand-rolled mean/variance;
    # pstdev is the population std dev, matching the original /len formula.
    mean_time = statistics.fmean(time_elapsed_values)
    std_dev_time = statistics.pstdev(time_elapsed_values, mu=mean_time)

    print(f"Results for NB_OPS={nb_ops}:")
    print(f"  Mean time_elapsed: {mean_time:.6f} seconds")
    print(f"  Standard deviation: {std_dev_time:.6f} seconds")
    return mean_time, std_dev_time

# Report timing statistics for every (platform, NB_OPS) combination.
# The original repeated the same four compute_mean_std calls per platform
# with stale "Change this ..." comments; a data-driven loop prints the
# exact same sequence of results.
NB_OPS_VALUES = (25, 250, 2500, 25000)
PLATFORM_DIRECTORIES = {
    "Ubuntu": "./batches/ubuntu2404nfs-6.8-0.d/results-ubuntu2404nfs-6.8-0.d/",
    "Debian": "./batches/debian11-5.10-0.d/results-debian11-5.10-0.d/",
    "Powerapi": "./results_powerapi2u",
}

for platform, directory in PLATFORM_DIRECTORIES.items():
    print(f"For {platform}")
    for nb_ops in NB_OPS_VALUES:
        compute_mean_std(directory, nb_ops)
Loading