Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
# seff-array

An extension of the Slurm command 'seff' designed to handle job arrays and offers the option to display information in a histogram.
An extension of the Slurm command 'seff' that handles job arrays and can optionally display the results as histograms. Note that GPU statistics are only available when [Job Summaries](https://princetonuniversity.github.io/jobstats/setup/summaries/) from [Princeton Jobstats](https://princetonuniversity.github.io/jobstats/) are stored in the AdminComment field reported by sacct.

seff-array generates four types of histograms:

1. CPU Efficiency (utilization vs runtime)
1. Maximum memory usage versus the requested memory
2. Runtime of each job compared to the requested wall-time
1. CPU Efficiency (utilization vs runtime)
2. GPU Efficiency (from [Princeton Jobstats](https://princetonuniversity.github.io/jobstats/))
3. Maximum memory usage versus the requested memory
4. Runtime of each job compared to the requested wall-time

## Usage:

Expand Down
73 changes: 69 additions & 4 deletions seff-array.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@

import termplotlib as tpl

import json
import gzip
import base64
from typing import Optional

__version__ = 0.4
debug = False

Expand All @@ -35,14 +40,51 @@ def time_to_float(time):

return days + hours + mins + secs

def get_stats_dict(ss64: Optional[str]) -> dict:
    """Decode a jobstats summary taken from sacct's AdminComment column.

    The summary is expected to be a 4-character prefix (e.g. ``JS1:``)
    followed by base64-encoded, gzip-compressed JSON.

    Args:
        ss64: Raw AdminComment value. May be ``None``, NaN, a numeric
            placeholder (e.g. ``0.0`` left behind by ``fillna``), or any
            string.

    Returns:
        The decoded summary as a dict, or an empty dict when the value is
        missing, is one of the ``JS1:Short`` / ``JS1:None`` placeholders,
        or cannot be decoded as a jobstats summary.
    """
    # Local import: zlib is only needed to name the error gzip may raise on
    # corrupt deflate data.
    import zlib

    # Anything that is not a non-empty string (None, NaN, 0.0, ...) carries
    # no summary. NOTE(review): the two literals are presumably written by
    # jobstats when no full summary exists -- confirm against its docs.
    if not isinstance(ss64, str) or not ss64:
        return {}
    if ss64 in ("JS1:Short", "JS1:None"):
        return {}

    try:
        # Drop the 4-character version prefix before decoding.
        stats = json.loads(gzip.decompress(base64.b64decode(ss64[4:])))
    except (ValueError, OSError, EOFError, zlib.error):
        # AdminComment is a free-form field: text that is not a valid
        # summary (bad base64, not gzip, truncated, not JSON) must not
        # abort the whole report.
        return {}

    # Honour the declared return type even if the payload is valid JSON of
    # another shape (list, number, ...).
    return stats if isinstance(stats, dict) else {}

def gpu_count(js):
    """Count the GPUs listed in a decoded jobstats summary.

    Args:
        js: Decoded summary dict (as returned by ``get_stats_dict``); may be
            empty or ``None``.

    Returns:
        Total number of entries under ``gpu_utilization`` across all nodes
        in ``js['nodes']``. Nodes without a ``gpu_utilization`` mapping are
        skipped; 0 is returned when there is no usable data.
    """
    if not js or not isinstance(js, dict):
        return 0

    nodes = js.get('nodes')
    if not isinstance(nodes, dict):
        # A summary without per-node data has no GPUs to count.
        return 0

    total = 0
    for node_stats in nodes.values():
        if not isinstance(node_stats, dict):
            continue
        per_gpu = node_stats.get('gpu_utilization')
        # Nodes that ran no GPU work simply lack this mapping.
        if isinstance(per_gpu, dict):
            total += len(per_gpu)
    return total

def gpu_util(js):
    """Sum the per-GPU utilization fractions in a decoded jobstats summary.

    Each value under ``js['nodes'][node]['gpu_utilization']`` is divided by
    100 and the results are added together over every GPU on every node.

    Args:
        js: Decoded summary dict (as returned by ``get_stats_dict``); may be
            empty or ``None``.

    Returns:
        The summed utilization (one fully used GPU contributes 1.0), or 0
        when there is no usable data. Nodes without a ``gpu_utilization``
        mapping are skipped.
    """
    if not js or not isinstance(js, dict):
        return 0

    nodes = js.get('nodes')
    if not isinstance(nodes, dict):
        # A summary without per-node data contributes no utilization.
        return 0

    # Single running accumulator, named so it does not shadow the function.
    total = 0
    for node_stats in nodes.values():
        if not isinstance(node_stats, dict):
            continue
        per_gpu = node_stats.get('gpu_utilization')
        # Nodes that ran no GPU work simply lack this mapping.
        if not isinstance(per_gpu, dict):
            continue
        for util in per_gpu.values():
            total = total + util / 100.0
    return total

#@profile
def job_eff(job_id=0, cluster=os.getenv('SLURM_CLUSTER_NAME')):

if job_id==0:
df_short = pd.read_csv('seff_test_oneline.csv', sep='|')
df_long = pd.read_csv('seff_test.csv', sep='|')
else:
fmt = '--format=JobID,JobName,Elapsed,ReqMem,ReqCPUS,Timelimit,State,TotalCPU,NNodes,User,Group,Cluster'
fmt = '--format=JobID,JobName,Elapsed,ReqMem,ReqCPUS,Timelimit,State,TotalCPU,NNodes,User,Group,Cluster,AdminComment'
if cluster != None:
q = f'sacct -X --units=G -P {fmt} -j {job_id} --cluster {cluster}'
else:
Expand All @@ -51,7 +93,7 @@ def job_eff(job_id=0, cluster=os.getenv('SLURM_CLUSTER_NAME')):
res = str(res, 'utf-8')
df_short = pd.read_csv(StringIO(res), sep='|')

fmt = '--format=JobID,JobName,Elapsed,ReqMem,ReqCPUS,Timelimit,State,TotalCPU,NNodes,User,Group,Cluster,MaxVMSize'
fmt = '--format=JobID,JobName,Elapsed,ReqMem,ReqCPUS,Timelimit,State,TotalCPU,NNodes,User,Group,Cluster,MaxRSS,AdminComment'
if cluster != None:
q = f'sacct --units=G -P {fmt} -j {job_id} --cluster {cluster}'
else:
Expand All @@ -73,12 +115,19 @@ def job_eff(job_id=0, cluster=os.getenv('SLURM_CLUSTER_NAME')):
df_short = df_short.fillna(0.)
df_long = df_long.fillna(0.)

df_long['MaxRSS'] = df_long.MaxRSS.astype('str')
df_long['ReqMem'] = df_long.ReqMem.astype('str')

df_long['Timelimit'] = df_long.Timelimit.replace('UNLIMITED','365-00:00:00').replace('Partition_Limit','365-00:00:00')

df_long['JobID'] = df_long.JobID.map(lambda x: x.split('.')[0])
df_long['MaxVMSize'] = df_long.MaxVMSize.str.replace('G', '').astype('float')
df_long['MaxRSS'] = df_long.MaxRSS.str.replace('G', '').astype('float')
df_long['ReqMem'] = df_long.ReqMem.str.replace('G', '').astype('float')
df_long['TotalCPU'] = df_long.TotalCPU.map(lambda x: time_to_float(x))
df_long['Elapsed'] = df_long.Elapsed.map(lambda x: time_to_float(x))
df_long['Timelimit'] = df_long.Timelimit.map(lambda x: time_to_float(x))
df_short['AdminComment'] = df_short.AdminComment.map(lambda x: get_stats_dict(x))
gpu_req = df_short.AdminComment.map(lambda x: gpu_count(x))

# job info
if isinstance(df_short['JobID'][0], np.int64):
Expand All @@ -94,6 +143,10 @@ def job_eff(job_id=0, cluster=os.getenv('SLURM_CLUSTER_NAME')):
group = df_short['Group'][0]
nodes = df_short['NNodes'][0]
cores = df_short['ReqCPUS'][0]
if len(gpu_req[gpu_req != 0]) != 0:
gpus = gpu_req[gpu_req != 0].mean()
else:
gpus = 0
req_mem = df_short['ReqMem'][0]
req_time = df_short['Timelimit'][0]

Expand All @@ -104,6 +157,7 @@ def job_eff(job_id=0, cluster=os.getenv('SLURM_CLUSTER_NAME')):
print(f"Cluster: {cluster}")
print(f"User/Group: {user}/{group}")
print(f"Requested CPUs: {cores} cores on {nodes} node(s)")
print(f"Average Requested GPUs: {gpus:.2f}")
print(f"Requested Memory: {req_mem}")
print(f"Requested Time: {req_time}")
print("--------------------------------------------------------")
Expand All @@ -123,14 +177,18 @@ def job_eff(job_id=0, cluster=os.getenv('SLURM_CLUSTER_NAME')):
return -1

cpu_use = df_long_finished.TotalCPU.loc[df_long_finished.groupby('JobID')['TotalCPU'].idxmax()]
gpu_use = df_short.AdminComment.map(lambda x: gpu_util(x))
time_use = df_long_finished.Elapsed.loc[df_long_finished.groupby('JobID')['Elapsed'].idxmax()]
mem_use = df_long_finished.MaxVMSize.loc[df_long_finished.groupby('JobID')['MaxVMSize'].idxmax()]
mem_use = df_long_finished.MaxRSS.loc[df_long_finished.groupby('JobID')['MaxRSS'].idxmax()]
cpu_eff = np.divide(np.divide(cpu_use.to_numpy(), time_use.to_numpy()),cores)
gpu_eff = np.divide(gpu_use[gpu_req != 0].to_numpy(), gpu_req[gpu_req != 0].to_numpy()).clip(0,1.0)

print("--------------------------------------------------------")
print("Finished Job Statistics")
print("(excludes pending, running, and cancelled jobs)")
print(f"Average CPU Efficiency {cpu_eff.mean()*100:.2f}%")
if len(gpu_eff) != 0:
print(f"Average GPU Efficiency {gpu_eff.mean()*100:.2f}%")
print(f"Average Memory Usage {mem_use.mean():.2f}G")
print(f"Average Run-time {time_use.mean():.2f}s")
print("---------------------")
Expand All @@ -141,6 +199,13 @@ def job_eff(job_id=0, cluster=os.getenv('SLURM_CLUSTER_NAME')):
h, bin_edges = np.histogram(cpu_eff*100, bins=np.linspace(0,100,num=11))
fig.hist(h, bin_edges, orientation='horizontal')
fig.show()

if len(gpu_eff) != 0:
print('\nGPU Efficiency (%)\n---------------------')
fig = tpl.figure()
h, bin_edges = np.histogram(gpu_eff*100, bins=np.linspace(0,100,num=11))
fig.hist(h, bin_edges, orientation='horizontal')
fig.show()

print('\nMemory Efficiency (%)\n---------------------')
fig = tpl.figure()
Expand Down