diff --git a/README.md b/README.md index a6d282b..2de23fd 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,13 @@ # seff-array -An extension of the Slurm command 'seff' designed to handle job arrays and offers the option to display information in a histogram. +An extension of the Slurm command 'seff' that handles job arrays and offers the option to display information in a histogram. Note that getting GPU stats depends on having [Job Summaries](https://princetonuniversity.github.io/jobstats/setup/summaries/) from [Princeton Jobstats](https://princetonuniversity.github.io/jobstats/) stored in the AdminComment field of sacct. -seff-array generates three types of histograms: +seff-array generates four types of histograms: - 1. CPU Efficiency (utilization vs runtime) - 1. Maximum memory usage versus the requested memory - 2. Runtime of each job compared to the requested wall-time + 1. CPU Efficiency (utilization vs runtime) + 2. GPU Efficiency (from [Princeton Jobstats](https://princetonuniversity.github.io/jobstats/)) + 3. Maximum memory usage versus the requested memory + 4. 
# Imports required by the jobstats helpers below.
import base64
import gzip
import json
import zlib
from typing import Optional


def get_stats_dict(ss64: Optional[str]) -> dict:
    """Convert the base64-encoded jobstats summary statistics to a dict.

    The summary is read from sacct's AdminComment column.  A usable value
    is the 4-character prefix ``JS1:`` followed by base64-encoded,
    gzip-compressed JSON.

    Args:
        ss64: Raw AdminComment cell.  May be ``None``, NaN, a numeric fill
            value, one of the sentinels ``"JS1:Short"`` / ``"JS1:None"``,
            or an unrelated free-form comment.

    Returns:
        The decoded summary, or ``{}`` whenever the value does not hold a
        decodable jobstats summary.
    """
    # Anything that is not a string (None, NaN, the 0.0 left by fillna)
    # cannot carry a summary.
    if not isinstance(ss64, str):
        return {}
    if ss64 in ("JS1:Short", "JS1:None") or not ss64.startswith("JS1:"):
        return {}
    try:
        stats = json.loads(gzip.decompress(base64.b64decode(ss64[4:])))
    except (ValueError, OSError, EOFError, zlib.error):
        # ValueError covers bad base64, bad UTF-8 and bad JSON; OSError
        # covers a missing gzip header; EOFError/zlib.error cover a
        # truncated or corrupt stream.  AdminComment is free-form text, so
        # one unreadable value must not abort the whole report.
        return {}
    return stats if isinstance(stats, dict) else {}


def _gpu_utilization_maps(js):
    """Yield the ``gpu_utilization`` mapping of every node in *js* that has one."""
    nodes = js.get('nodes') if isinstance(js, dict) else None
    if not isinstance(nodes, dict):
        return
    for node_stats in nodes.values():
        if not isinstance(node_stats, dict):
            continue
        per_gpu = node_stats.get('gpu_utilization')
        # Nodes without GPU data simply contribute nothing.
        if isinstance(per_gpu, dict):
            yield per_gpu


def gpu_count(js):
    """Pull gpu count from jobstats data.

    Args:
        js: Decoded jobstats summary (as returned by ``get_stats_dict``);
            an empty dict or ``None`` means "no data".

    Returns:
        Total number of entries across every node's ``gpu_utilization``
        mapping; ``0`` when there is no GPU data.
    """
    return sum(len(per_gpu) for per_gpu in _gpu_utilization_maps(js))


def gpu_util(js):
    """Pull gpu utilization from jobstats data.

    Args:
        js: Decoded jobstats summary (as returned by ``get_stats_dict``);
            an empty dict or ``None`` means "no data".

    Returns:
        Sum over all GPUs of ``utilization / 100.0`` (the stored values
        are presumably percentages -- confirm against the jobstats
        format); ``0`` when there is no GPU data.
    """
    total = 0
    for per_gpu in _gpu_utilization_maps(js):
        for util in per_gpu.values():
            total = total + util / 100.0
    return total
'utf-8') df_short = pd.read_csv(StringIO(res), sep='|') - fmt = '--format=JobID,JobName,Elapsed,ReqMem,ReqCPUS,Timelimit,State,TotalCPU,NNodes,User,Group,Cluster,MaxVMSize' + fmt = '--format=JobID,JobName,Elapsed,ReqMem,ReqCPUS,Timelimit,State,TotalCPU,NNodes,User,Group,Cluster,MaxRSS,AdminComment' if cluster != None: q = f'sacct --units=G -P {fmt} -j {job_id} --cluster {cluster}' else: @@ -73,12 +115,19 @@ def job_eff(job_id=0, cluster=os.getenv('SLURM_CLUSTER_NAME')): df_short = df_short.fillna(0.) df_long = df_long.fillna(0.) + df_long['MaxRSS'] = df_long.MaxRSS.astype('str') + df_long['ReqMem'] = df_long.ReqMem.astype('str') + + df_long['Timelimit'] = df_long.Timelimit.replace('UNLIMITED','365-00:00:00').replace('Partition_Limit','365-00:00:00') + df_long['JobID'] = df_long.JobID.map(lambda x: x.split('.')[0]) - df_long['MaxVMSize'] = df_long.MaxVMSize.str.replace('G', '').astype('float') + df_long['MaxRSS'] = df_long.MaxRSS.str.replace('G', '').astype('float') df_long['ReqMem'] = df_long.ReqMem.str.replace('G', '').astype('float') df_long['TotalCPU'] = df_long.TotalCPU.map(lambda x: time_to_float(x)) df_long['Elapsed'] = df_long.Elapsed.map(lambda x: time_to_float(x)) df_long['Timelimit'] = df_long.Timelimit.map(lambda x: time_to_float(x)) + df_short['AdminComment'] = df_short.AdminComment.map(lambda x: get_stats_dict(x)) + gpu_req = df_short.AdminComment.map(lambda x: gpu_count(x)) # job info if isinstance(df_short['JobID'][0], np.int64): @@ -94,6 +143,10 @@ def job_eff(job_id=0, cluster=os.getenv('SLURM_CLUSTER_NAME')): group = df_short['Group'][0] nodes = df_short['NNodes'][0] cores = df_short['ReqCPUS'][0] + if len(gpu_req[gpu_req != 0]) != 0: + gpus = gpu_req[gpu_req != 0].mean() + else: + gpus = 0 req_mem = df_short['ReqMem'][0] req_time = df_short['Timelimit'][0] @@ -104,6 +157,7 @@ def job_eff(job_id=0, cluster=os.getenv('SLURM_CLUSTER_NAME')): print(f"Cluster: {cluster}") print(f"User/Group: {user}/{group}") print(f"Requested CPUs: {cores} cores on 
{nodes} node(s)") + print(f"Average Requested GPUs: {gpus:.2f}") print(f"Requested Memory: {req_mem}") print(f"Requested Time: {req_time}") print("--------------------------------------------------------") @@ -123,14 +177,18 @@ def job_eff(job_id=0, cluster=os.getenv('SLURM_CLUSTER_NAME')): return -1 cpu_use = df_long_finished.TotalCPU.loc[df_long_finished.groupby('JobID')['TotalCPU'].idxmax()] + gpu_use = df_short.AdminComment.map(lambda x: gpu_util(x)) time_use = df_long_finished.Elapsed.loc[df_long_finished.groupby('JobID')['Elapsed'].idxmax()] - mem_use = df_long_finished.MaxVMSize.loc[df_long_finished.groupby('JobID')['MaxVMSize'].idxmax()] + mem_use = df_long_finished.MaxRSS.loc[df_long_finished.groupby('JobID')['MaxRSS'].idxmax()] cpu_eff = np.divide(np.divide(cpu_use.to_numpy(), time_use.to_numpy()),cores) + gpu_eff = np.divide(gpu_use[gpu_req != 0].to_numpy(), gpu_req[gpu_req != 0].to_numpy()).clip(0,1.0) print("--------------------------------------------------------") print("Finished Job Statistics") print("(excludes pending, running, and cancelled jobs)") print(f"Average CPU Efficiency {cpu_eff.mean()*100:.2f}%") + if len(gpu_eff) != 0: + print(f"Average GPU Efficiency {gpu_eff.mean()*100:.2f}%") print(f"Average Memory Usage {mem_use.mean():.2f}G") print(f"Average Run-time {time_use.mean():.2f}s") print("---------------------") @@ -141,6 +199,13 @@ def job_eff(job_id=0, cluster=os.getenv('SLURM_CLUSTER_NAME')): h, bin_edges = np.histogram(cpu_eff*100, bins=np.linspace(0,100,num=11)) fig.hist(h, bin_edges, orientation='horizontal') fig.show() + + if len(gpu_eff) != 0: + print('\nGPU Efficiency (%)\n---------------------') + fig = tpl.figure() + h, bin_edges = np.histogram(gpu_eff*100, bins=np.linspace(0,100,num=11)) + fig.hist(h, bin_edges, orientation='horizontal') + fig.show() print('\nMemory Efficiency (%)\n---------------------') fig = tpl.figure()