Skip to content

Commit 8ddfbf7

Browse files
author
miranov25
committed
adding perfmonitor
1 parent 8337e9a commit 8ddfbf7

File tree

2 files changed

+268
-0
lines changed

2 files changed

+268
-0
lines changed

UTILS/perfmonitor/README.md

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
# Performance Monitor
2+
3+
Lightweight logging and analysis utility for tracking performance (execution time and memory) of scripts or processing pipelines.
4+
5+
## Features
6+
7+
* Logs elapsed time and memory (RSS) per step
8+
* Supports multi-level index tags for loop tracking
9+
* Saves logs in delimiter-separated format (default: `|`)
10+
* Parses logs to `pandas.DataFrame` for analysis
11+
* Summarizes stats (mean, max, min) with configurable grouping
12+
* Plots memory/time using `matplotlib`
13+
* Optionally saves plots to a PDF
14+
* Combines logs from multiple files
15+
16+
## Installation
17+
18+
This is a self-contained utility: just place the `perfmonitor/` directory on your Python path. It requires the third-party packages `psutil`, `pandas` and `matplotlib`.
19+
20+
## Example Usage
21+
22+
```python
23+
import time
24+
import pandas as pd
25+
import matplotlib.pyplot as plt
26+
from perfmonitor import PerformanceLogger, default_plot_config, default_summary_config
27+
28+
# Initialize logger
29+
logger = PerformanceLogger("perf_log.txt")
30+
logger.log("setup::start")
31+
32+
# Simulate steps with increasing delays
33+
for i, delay in enumerate([0.1, 0.2, 0.3]):
34+
time.sleep(delay)
35+
logger.log("loop::step", index=[i])
36+
37+
# Parse logs from one or more files
38+
df = PerformanceLogger.log_to_dataframe(["perf_log.txt"])
39+
print(df.head())
40+
```
41+
42+
### Expected Output
43+
44+
Example output from `print(df.head())`:
45+
46+
```
47+
timestamp step elapsed_sec rss_gb user host logfile index_0
48+
0 2025-05-31 09:12:01,120 setup::start 0.00 0.13 user123 host.local perf_log.txt NaN
49+
1 2025-05-31 09:12:01,220 loop::step 0.10 0.14 user123 host.local perf_log.txt 0.0
50+
2 2025-05-31 09:12:01,420 loop::step 0.30 0.15 user123 host.local perf_log.txt 1.0
51+
3 2025-05-31 09:12:01,720 loop::step 0.60 0.15 user123 host.local perf_log.txt 2.0
52+
```
53+
54+
## Summary Statistics
55+
56+
```python
57+
summary = PerformanceLogger.summarize_with_config(df, default_summary_config)
58+
print(summary)
59+
```
60+
61+
### Example Summary Output
62+
63+
```
64+
elapsed_sec rss_gb
65+
mean max min mean max min
66+
step
67+
loop::step 0.33 0.60 0.10 0.15 0.15 0.14
68+
setup::start 0.00 0.00 0.00 0.13 0.13 0.13
71+
```
72+
73+
## Plotting
74+
75+
```python
76+
# Show plots
77+
PerformanceLogger.plot(df, default_plot_config)
78+
79+
# Save plots to PDF
80+
PerformanceLogger.plot(df, default_plot_config, output_pdf="perf_plots.pdf")
81+
```
82+
83+
## Multi-Level Index Extraction
84+
85+
Step IDs can include index metadata like:
86+
87+
```
88+
load::data[1,2]
89+
```
90+
91+
When the log is parsed, the bracketed suffix is removed from `step` (leaving `load::data`) and each numeric index is stored in its own DataFrame column:
92+
93+
* `index_0` → 1
94+
* `index_1` → 2
95+
96+
## Advanced: Custom Configuration
97+
98+
```python
99+
custom_summary = {
100+
"by": ["step", "index_0"],
101+
"stats": ["mean", "max"]
102+
}
103+
104+
custom_plots = {
105+
"RSS Over Time": {
106+
"kind": "line",
107+
"varX": "timestamp",
108+
"varY": "rss_gb",
109+
"title": "RSS vs Time",
110+
"sort": "timestamp",
111+
}
112+
}
113+
114+
PerformanceLogger.plot(df, custom_plots)
115+
```
116+
117+
## License
118+
TBD — no license has been specified yet.
Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
import time
2+
import psutil
3+
import socket
4+
import getpass
5+
import pandas as pd
6+
import matplotlib.pyplot as plt
7+
from typing import Union, List, Dict, Optional
8+
9+
class PerformanceLogger:
    """Record time and memory checkpoints in a delimiter-separated log file.

    Each call to :meth:`log` appends one line holding a timestamp, the step
    name (optionally suffixed with ``[i,j,...]`` indices), the seconds
    elapsed since the logger was created, the resident set size of the
    current process divided by ``1024 ** 3``, the user name and the host
    name.  The static helpers read such files back into a
    ``pandas.DataFrame``, aggregate them and plot them.
    """

    def __init__(self, log_path: str, sep: str = "|"):
        """Store the output path and start the elapsed-time clock.

        Args:
            log_path: File that :meth:`log` appends to.
            sep: Field separator written between the columns of a log line.
        """
        self.log_path = log_path
        self.start_time = time.time()
        self.sep = sep
        self.user = getpass.getuser()
        self.host = socket.gethostname()

    def log(self, step: str, index: Optional[List[int]] = None):
        """Append one checkpoint line to the log file and echo it to stdout.

        Args:
            step: Name of the step being recorded.
            index: Optional loop indices; they are appended to the step name
                as ``[i,j,...]``.
        """
        # Read the clock once so that the timestamp, its millisecond part
        # and the elapsed time all describe the same instant.
        now = time.time()
        elapsed = now - self.start_time
        mem_gb = psutil.Process().memory_info().rss / (1024 ** 3)
        index_str = "" if index is None else f"[{','.join(map(str, index))}]"
        step_full = f"{step}{index_str}"

        stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(now))
        millis = int(now * 1000) % 1000
        fields = [
            f"{stamp},{millis:03d}",
            step_full,
            f"{elapsed:.2f}",
            f"{mem_gb:.2f}",
            self.user,
            self.host,
        ]
        line = f" {self.sep} ".join(fields) + "\n"

        with open(self.log_path, "a") as f:
            f.write(line)
        print(f"{step_full} | {elapsed:.2f} | {mem_gb:.2f} | {self.user} | {self.host}")

    @staticmethod
    def log_to_dataframe(log_paths: Union[str, List[str]], sep: str = "|") -> pd.DataFrame:
        """Parse one or more log files into a single DataFrame.

        Lines with fewer than six separator-delimited fields are skipped.
        A step written as ``name[i,j,...]`` is stored as ``name`` and every
        purely numeric index is stored in an ``index_<position>`` column.

        Args:
            log_paths: A single path or a list of paths to read.
            sep: Field separator used in the log files.

        Returns:
            A DataFrame with the columns ``timestamp``, ``step``,
            ``elapsed_sec``, ``rss_gb``, ``user``, ``host``, ``logfile``
            and, where present, ``index_0``, ``index_1``, ...

        Raises:
            ValueError: If the elapsed-time or memory field of a line is
                not a valid float.
        """
        if isinstance(log_paths, str):
            log_paths = [log_paths]

        rows = []
        for path in log_paths:
            with open(path) as f:
                for line in f:
                    parts = [x.strip() for x in line.strip().split(sep)]
                    # Six fields are unpacked below, so anything shorter
                    # (blank or truncated lines) cannot be a valid record.
                    if len(parts) < 6:
                        continue
                    timestamp, step, elapsed_str, rss_str, user, host = parts[:6]
                    row = {
                        "timestamp": timestamp,
                        "step": step,
                        "elapsed_sec": float(elapsed_str),
                        "rss_gb": float(rss_str),
                        "user": user,
                        "host": host,
                        "logfile": path,
                    }

                    if "[" in step and "]" in step:
                        # Split on the first "[" only, so a step name with
                        # several brackets does not break the unpacking.
                        base, _, idx = step.partition("[")
                        row["step"] = base
                        idx = idx.rstrip("]")
                        for i, val in enumerate(idx.split(",")):
                            if val.isdigit():
                                row[f"index_{i}"] = int(val)
                    rows.append(row)

        return pd.DataFrame(rows)

    @staticmethod
    def summarize_with_config(df: pd.DataFrame, config: Dict) -> pd.DataFrame:
        """Aggregate ``elapsed_sec`` and ``rss_gb`` per group.

        Args:
            df: DataFrame as returned by :meth:`log_to_dataframe`.
            config: Mapping with the optional keys ``"by"`` (grouping
                columns, default ``["step"]``) and ``"stats"`` (aggregation
                names, default ``["mean", "max", "min"]``).

        Returns:
            The grouped DataFrame with one column per (metric, statistic).
        """
        group_cols = config.get("by", ["step"])
        stats = config.get("stats", ["mean", "max", "min"])
        agg = {col: stats for col in ("elapsed_sec", "rss_gb")}
        return df.groupby(group_cols).agg(agg)

    @staticmethod
    def plot(df: pd.DataFrame,
             config_dict: Dict[str, Dict],
             filter_expr: Optional[str] = None,
             output_pdf: Optional[str] = None):
        """Draw one figure per entry of ``config_dict``.

        Each entry may define ``kind`` (``"line"`` or ``"bar"``), ``varX``,
        ``varY``, ``title``, ``xlabel``, ``ylabel``, ``sort`` and
        ``filter``; the entry name is used as title when none is given.

        Args:
            df: Data to plot.
            config_dict: Mapping of plot name to plot configuration.
            filter_expr: Optional ``DataFrame.query`` expression applied to
                ``df`` before any plot is made.
            output_pdf: If given, figures are saved as pages of this PDF
                file instead of being shown.

        Raises:
            ValueError: If a configuration requests an unsupported ``kind``.
        """
        if filter_expr:
            df = df.query(filter_expr)

        pdf = None
        if output_pdf:
            from matplotlib.backends.backend_pdf import PdfPages
            pdf = PdfPages(output_pdf)

        try:
            for name, config in config_dict.items():
                subdf = df.copy()
                if "filter" in config:
                    subdf = subdf.query(config["filter"])

                if "sort" in config:
                    subdf = subdf.sort_values(config["sort"])

                x = subdf[config.get("varX", "timestamp")]
                y = subdf[config.get("varY", "elapsed_sec")]
                kind = config.get("kind", "line")
                # Reject unknown kinds before a figure is opened so that no
                # empty figure is left behind.
                if kind not in ("line", "bar"):
                    raise ValueError(f"Unsupported plot kind: {kind}")

                plt.figure()
                if kind == "line":
                    plt.plot(x, y, marker="o")
                else:
                    plt.bar(x, y)

                plt.title(config.get("title", name))
                plt.xlabel(config.get("xlabel", config.get("varX", "timestamp")))
                plt.ylabel(config.get("ylabel", config.get("varY", "elapsed_sec")))
                plt.xticks(rotation=45)
                plt.tight_layout()

                if pdf is not None:
                    pdf.savefig()
                    plt.close()
                else:
                    plt.show()
        finally:
            # Close the PDF even when a plot fails, so the file handle is
            # released.
            if pdf is not None:
                pdf.close()
119+
120+
121+
# Default configurations

# Plot definitions for PerformanceLogger.plot, keyed by plot name.  Each
# entry selects the plot kind, the DataFrame columns used for the x and y
# axes, the figure title and, optionally, the column to sort by.
default_plot_config = {
    "RSS vs Time": {
        "kind": "line",
        "varX": "timestamp",
        "varY": "rss_gb",
        "title": "RSS over Time",
        "sort": "timestamp",
    },
    "RSS vs step": {
        "kind": "line",
        "varX": "step",
        "varY": "rss_gb",
        # The x axis is the step name, so the title must not say "over Time".
        "title": "RSS per Step",
    },
    "Elapsed Time vs Step": {
        "kind": "bar",
        "varX": "step",
        "varY": "elapsed_sec",
        "title": "Elapsed Time per Step",
        "sort": "step",
    },
}

# Grouping columns and statistics for PerformanceLogger.summarize_with_config.
default_summary_config = {
    "by": ["step"],
    "stats": ["mean", "max", "min"],
}
150+

0 commit comments

Comments
 (0)