@@ -326,6 +326,7 @@ def _query_exit_code(self, job):
326326 # Torque does not provide a way to retrieve the history of jobs
327327 return None
328328
329+
329330@register_scheduler ('pbspro' )
330331class PbsProJobScheduler (PbsJobScheduler ):
331332 def poll (self , * jobs ):
@@ -337,41 +338,42 @@ def poll(self, *jobs):
337338 return
338339
339340 # query status of all jobs
340- job_status = osext .run_command (
341+ completed = osext .run_command (
341342 f"qstat -xf -F json { ' ' .join (job .jobid for job in jobs )} "
342343 )
343344
344345 # from Table 14-1: Error Codes in
345346 # https://help.altair.com/2024.1.0/PBS%20Professional/PBSReferenceGuide2024.1.pdf,
346- # we have the codes PBS returns in case of an error with exit(error_code),
347- # like exit(15001) for unknown Job ID. however, only the last 8 bits
348- # of the exit code are returned, so what we get as the actual error code
349- # is exit_code % 256, which is for example 153 for Unknown Job Identifier.
350- # 153 is returned if any job id in the list is unknown, even if some others
351- # are known. these unknown jobids will be caught in the loop over jobs
352- # below so we can pass on for now. previously 35 was checked here,
353- # but we only get that for a "History job ID" (when qstat -f is used
354- # on a jobid that has already ended. Since above we use "-x" we should not
355- # get exit code 35 anymore)
356- if job_status .returncode in [153 , 0 ]:
347+ # we have the codes PBS returns in case of an error with
348+ # exit(error_code), like exit(15001) for unknown Job ID. However, only
349+ # the last 8 bits of the exit code are returned, so what we get as the
350+ # actual error code is `exit_code % 256`, which is for example 153 for
351+ # "Unknown Job Identifier". 153 is returned if any job id in the list
352+ # is unknown, even if some others are known. These unknown job ids
353+ # will be caught in the loop over jobs below so we can pass on for
354+ # now. Previously 35 was checked here, but we only get that for a
355+ # "History job ID" (when qstat -f is used on a jobid that has already
356+ # ended). Since above we use "-x", we should not get exit code 35
357+ # anymore.
358+ if completed .returncode in [153 , 0 ]:
357359 pass
358- elif job_status .returncode == 255 :
360+ elif completed .returncode == 255 :
359361
360362 # try again, qstat is having a problem
361363 self .log (f'qstat failed with exit code { completed .returncode } '
362- f'(standard error follows):\n { completed .stderr } \n retrying' )
364+ f'(standard error follows):\n { completed .stderr } \n '
365+ 'retrying' )
363366 return
364367 else :
365368 raise JobSchedulerError (
366369 f'qstat failed with exit code { completed .returncode } '
367370 f'(standard error follows):\n { completed .stderr } '
368371 )
369372
370- job_status_json = json .loads (job_status .stdout )
373+ job_status_json = json .loads (completed .stdout )
371374
372375 # loop over each job
373376 for job in jobs :
374-
375377 # check if the job is in the json
376378 if job .jobid in job_status_json ["Jobs" ]:
377379
@@ -381,8 +383,9 @@ def poll(self, *jobs):
381383 self .log (f"Job { job .jobid } known to scheduler, state: { state } " )
382384 job ._state = JOB_STATES [state ]
383385
384- # check if exec_host is in the ouput since exec_host is only in
385- # the output if job has started to run (not if it's just queued)
386+ # check if exec_host is in the output since exec_host is only
387+ # in the output if job has started to run (not if it's just
388+ # queued)
386389 if "exec_host" in job_info :
387390 nodespec = job_info ["exec_host" ]
388391 self ._update_nodelist (job , nodespec )
@@ -393,8 +396,9 @@ def poll(self, *jobs):
393396 job ._exitcode = int (exit_code )
394397 job ._completed = True
395398 elif job .state in ["QUEUED" , "HELD" , "WAITING" ]:
399+ pending_time = time .time () - job .submit_time
396400 if (job .max_pending_time and
397- ( time . time () - job . submit_time ) >= job .max_pending_time ):
401+ pending_time >= job .max_pending_time ):
398402 self .cancel (job )
399403 job ._exception = JobError (
400404 "maximum pending time exceeded" , job .jobid
@@ -404,4 +408,3 @@ def poll(self, *jobs):
404408 job ._state = "COMPLETED"
405409 self .log (f"Assuming job { job .jobid } completed" )
406410 job ._completed = True
407-
0 commit comments