
Source Code for Module madgraph.various.cluster

   1  ################################################################################ 
   2  # Copyright (c) 2009 The MadGraph5_aMC@NLO Development team and Contributors              
   3  # 
   4  # This file is a part of the MadGraph5_aMC@NLO project, an application which            
   5  # automatically generates Feynman diagrams and matrix elements for arbitrary     
   6  # high-energy processes in the Standard Model and beyond.                        
   7  # 
   8  # It is subject to the MadGraph5_aMC@NLO license which should accompany this              
   9  # distribution.                                                                  
  10  #                                                                                
  11  # For more information, visit madgraph.phys.ucl.ac.be and amcatnlo.web.cern.ch             
  12  #                                                                                
  13  ################################################################################ 
  14  import subprocess 
  15  import logging 
  16  import os 
  17  import time 
  18  import re 
  19  import glob 
  20  import inspect 
  21  import sys 
  22   
  23  logger = logging.getLogger('madgraph.cluster')  
  24   
  25  try: 
  26      from madgraph import MadGraph5Error 
  27      import madgraph.various.misc as misc 
  28  except Exception, error: 
  29      if __debug__: 
  30          print  str(error) 
  31      from internal import MadGraph5Error 
  32      import internal.misc as misc 
  33   
  34  pjoin = os.path.join 
35 36 -class ClusterManagmentError(MadGraph5Error):
37 pass
38
39 -class NotImplemented(MadGraph5Error):
40 pass
41 42 43 multiple_try = misc.multiple_try 44 pjoin = os.path.join
45 46 47 -def check_interupt(error=KeyboardInterrupt):
48 49 def deco_interupt(f): 50 def deco_f_interupt(self, *args, **opt): 51 try: 52 return f(self, *args, **opt) 53 except error: 54 try: 55 self.remove(*args, **opt) 56 except Exception: 57 pass 58 raise error
59 return deco_f_interupt 60 return deco_interupt 61
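The decorator above is used throughout this module: any method it wraps will call the object's remove() for cleanup before re-raising when the chosen exception (KeyboardInterrupt by default) interrupts the call. A minimal standalone sketch, not part of the module, using a hypothetical ToyCluster class:

         # Illustrative only: a toy class whose remove() must run when the wrapped call is interrupted.
         class ToyCluster(object):
             def __init__(self):
                 self.cleaned = False

             def remove(self, *args, **opts):
                 # cleanup hook invoked by the decorator before the exception propagates
                 self.cleaned = True

             @check_interupt()            # defaults to KeyboardInterrupt
             def long_job(self):
                 raise KeyboardInterrupt  # simulate the user pressing ctrl-C

         #   toy = ToyCluster()
         #   try:
         #       toy.long_job()
         #   except KeyboardInterrupt:
         #       assert toy.cleaned       # remove() ran before the exception escaped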
62 -def store_input(arg=''):
63 64 def deco_store(f): 65 def deco_f_store(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 66 input_files=[], output_files=[], required_output=[], nb_submit=0): 67 frame = inspect.currentframe() 68 args, _, _, values = inspect.getargvalues(frame) 69 args = dict([(i, values[i]) for i in args if i != 'self']) 70 id = f(self, **args) 71 if self.nb_retry > 0: 72 self.retry_args[id] = args 73 return id
74 return deco_f_store 75 return deco_store 76
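store_input records the exact arguments of a decorated submit2 call so that a failed job can later be resubmitted identically. A hedged sketch of that bookkeeping with a hypothetical FakeCluster (the real users are the Cluster subclasses below):

         # Illustrative only: what the decorator leaves behind when nb_retry > 0.
         class FakeCluster(object):
             nb_retry = 1
             retry_args = {}

             @store_input()
             def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
                         input_files=[], output_files=[], required_output=[], nb_submit=0):
                 return 'job-1'   # pretend the scheduler returned this id

         #   fake = FakeCluster()
         #   fake.submit2('run.sh', argument=['1'], cwd='/tmp/job')
         #   fake.retry_args['job-1']['prog']  ->  'run.sh'
         #   fake.retry_args['job-1']['cwd']   ->  '/tmp/job'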
77 -def need_transfer(options):
78 """ This function checks whether compression of input files are necessary 79 given the running options given. """ 80 81 if options['run_mode'] != 1 and options['cluster_temp_path'] is None: 82 return False 83 else: 84 return True
85
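In other words, file transfer (and hence packing of the inputs) is only needed when running in cluster mode (run_mode == 1) or when a scratch directory is configured. A hedged usage sketch with made-up option dictionaries:

   #   need_transfer({'run_mode': 0, 'cluster_temp_path': None})        ->  False  (local run, shared disk)
   #   need_transfer({'run_mode': 1, 'cluster_temp_path': None})        ->  True   (cluster run)
   #   need_transfer({'run_mode': 2, 'cluster_temp_path': '/scratch'})  ->  True   (scratch directory requested)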
86 -class Cluster(object):
87 """Basic Class for all cluster type submission""" 88 name = 'mother class' 89 identifier_length = 14 90
91 - def __init__(self,*args, **opts):
92 """Init the cluster""" 93 94 self.submitted = 0 95 self.submitted_ids = [] 96 self.finish = 0 97 self.submitted_dirs = [] #HTCaaS 98 self.submitted_exes = [] #HTCaaS 99 self.submitted_args = [] #HTCaaS 100 101 if 'cluster_queue' in opts: 102 self.cluster_queue = opts['cluster_queue'] 103 else: 104 self.cluster_queue = 'madgraph' 105 if 'cluster_temp_path' in opts: 106 self.temp_dir = opts['cluster_temp_path'] 107 else: 108 self.temp_dir = None 109 self.options = {'cluster_status_update': (600, 30)} 110 for key,value in opts.items(): 111 self.options[key] = value 112 self.nb_retry = opts['cluster_nb_retry'] if 'cluster_nb_retry' in opts else 0 113 self.cluster_retry_wait = float(opts['cluster_retry_wait']) if 'cluster_retry_wait' in opts else 300 114 self.options = dict(opts) 115 self.retry_args = {} 116 # controlling jobs in controlled type submision 117 self.packet = {} 118 self.id_to_packet = {}
119
120 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 121 log=None, required_output=[], nb_submit=0):
122 """How to make one submission. Return status id on the cluster.""" 123 raise NotImplemented, 'No implementation of how to submit a job to cluster \'%s\'' % self.name
124 125 126 @store_input()
127 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 128 log=None, input_files=[], output_files=[], required_output=[], 129 nb_submit=0):
130 """How to make one submission. Return status id on the cluster. 131 NO SHARE DISK""" 132 133 if cwd is None: 134 cwd = os.getcwd() 135 if not os.path.exists(prog): 136 prog = os.path.join(cwd, prog) 137 138 if not required_output and output_files: 139 required_output = output_files 140 141 if not hasattr(self, 'temp_dir') or not self.temp_dir or \ 142 (input_files == [] == output_files): 143 144 return self.submit(prog, argument, cwd, stdout, stderr, log, 145 required_output=required_output, nb_submit=nb_submit) 146 147 if not input_files and not output_files: 148 # not input/output so not using submit2 149 return self.submit(prog, argument, cwd, stdout, stderr, log, 150 required_output=required_output, nb_submit=nb_submit) 151 152 if cwd is None: 153 cwd = os.getcwd() 154 if not os.path.exists(prog): 155 prog = os.path.join(cwd, prog) 156 temp_file_name = "sub." + os.path.basename(prog) + '.'.join(argument) 157 158 text = """#!/bin/bash 159 MYTMP=%(tmpdir)s/run$%(job_id)s 160 MYPWD=%(cwd)s 161 mkdir -p $MYTMP 162 cd $MYPWD 163 input_files=( %(input_files)s ) 164 for i in ${input_files[@]} 165 do 166 cp -R -L $i $MYTMP 167 done 168 cd $MYTMP 169 echo '%(arguments)s' > arguments 170 chmod +x ./%(script)s 171 %(program)s ./%(script)s %(arguments)s 172 exit=$? 173 output_files=( %(output_files)s ) 174 for i in ${output_files[@]} 175 do 176 cp -r $MYTMP/$i $MYPWD 177 done 178 # if [ "$exit" -eq "0" ] 179 # then 180 rm -rf $MYTMP 181 # fi 182 """ 183 184 dico = {'tmpdir' : self.temp_dir, 'script': os.path.basename(prog), 185 'cwd': cwd, 'job_id': self.job_id, 186 'input_files': ' '.join(input_files + [prog]), 187 'output_files': ' '.join(output_files), 188 'arguments': ' '.join([str(a) for a in argument]), 189 'program': ' ' if '.py' in prog else 'bash'} 190 191 # writing a new script for the submission 192 new_prog = pjoin(cwd, temp_file_name) 193 open(new_prog, 'w').write(text % dico) 194 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 195 196 return self.submit(new_prog, argument, cwd, stdout, stderr, log, 197 required_output=required_output, nb_submit=nb_submit)
198 199
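When a cluster_temp_path is configured and input/output files are listed, submit2 therefore writes a small sub.<script> wrapper in cwd which copies the inputs to the scratch area, runs the job there, and copies the requested outputs back; otherwise it falls through to the plain submit() and relies on a shared file system. A hedged usage sketch (the paths, the script name and the my_cluster instance are hypothetical):

         #   my_cluster.temp_dir = '/scratch'
         #   job_id = my_cluster.submit2('run.sh', argument=['2', '0.1'],
         #                               cwd='/work/SubProcesses/P1',
         #                               input_files=['input.dat'],
         #                               output_files=['results.dat'],
         #                               required_output=['results.dat'])
         #   # with no input_files/output_files (or no temp_dir) this is equivalent to submit()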
200 - def cluster_submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 201 log=None, input_files=[], output_files=[], required_output=[], 202 nb_submit=0, packet_member=None):
203 """This function wrap the cluster submition with cluster independant 204 method should not be overwritten (but for DAG type submission)""" 205 206 id = self.submit2(prog, argument, cwd, stdout, stderr, log, input_files, 207 output_files, required_output, nb_submit) 208 209 210 if not packet_member: 211 return id 212 else: 213 if isinstance(packet_member, Packet): 214 self.id_to_packet[id] = packet_member 215 packet_member.put(id) 216 if packet_member.tag not in self.packet: 217 self.packet[packet_member.tag] = packet_member 218 else: 219 if packet_member in self.packet: 220 packet = self.packet[packet_member] 221 packet.put(id) 222 self.id_to_packet[id] = packet 223 return id
224
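Grouping jobs into a Packet lets the controller run a callback once every member of the group has finished (see check_termination below): each submission registers its id in the packet, and the packet's function is executed when the last id is removed. A hedged sketch, in which combine_results and the paths are hypothetical:

         #   def combine_results(channel):
         #       pass   # e.g. merge the results.dat files of that channel
         #
         #   pack = Packet('channel_1', combine_results, ('channel_1',))
         #   for seed in range(4):
         #       my_cluster.cluster_submit('run.sh', argument=[str(seed)],
         #                                 cwd='/work/channel_1', packet_member=pack)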
225 - def control(self, me_dir=None):
226 """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)""" 227 if not self.submitted_ids: 228 raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name 229 idle, run, fail = 0, 0, 0 230 for pid in self.submitted_ids[:]: 231 status = self.control_one_job(id) 232 if status == 'I': 233 idle += 1 234 elif status == 'R': 235 run += 1 236 elif status == 'F': 237 self.finish +=1 238 self.submitted_ids.remove(pid) 239 else: 240 fail += 1 241 242 return idle, run, self.finish, fail
243
244 - def control_one_job(self, pid):
245 """ control the status of a single job with it's cluster id """ 246 raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name
247
248 - def get_jobs_identifier(self, path, second_path=None):
249 """get a unique run_name for all the jobs helps to identify the runs 250 in the controller for some cluster.""" 251 252 if second_path: 253 path = os.path.realpath(pjoin(path, second_path)) 254 elif not os.path.exists(path): 255 return path # job already done 256 257 if 'SubProcesses' in path: 258 target = path.rsplit('/SubProcesses',1)[0] 259 elif 'MCatNLO' in path: 260 target = path.rsplit('/MCatNLO',1)[0] 261 elif 'PY8_parallelization' in path: 262 target = path.rsplit('/PY8_parallelization',1)[0] 263 elif second_path: 264 target=path 265 logger.warning("cluster.get_job_identifier runs unexpectedly. This should be fine but report this message if you have problem.") 266 else: 267 target = path 268 269 if target.endswith('/'): 270 target = target[:-1] 271 272 target = misc.digest(target)[-self.identifier_length:] 273 if not target[0].isalpha(): 274 target = 'a' + target[1:] 275 276 return target
277 278 279 @check_interupt()
280 - def wait(self, me_dir, fct, minimal_job=0, update_first=None):
281 """Wait that all job are finish. 282 if minimal_job set, then return if idle + run is lower than that number""" 283 284 285 mode = 1 # 0 is long waiting/ 1 is short waiting 286 nb_iter = 0 287 nb_short = 0 288 change_at = 5 # number of iteration from which we wait longer between update. 289 290 if update_first: 291 idle, run, finish, fail = self.control(me_dir) 292 update_first(idle, run, finish) 293 294 #usefull shortcut for readibility 295 longtime, shorttime = self.options['cluster_status_update'] 296 297 nb_job = 0 298 299 if self.options['cluster_type'] == 'htcaas2': 300 me_dir = self.metasubmit(self) 301 302 while 1: 303 old_mode = mode 304 nb_iter += 1 305 idle, run, finish, fail = self.control(me_dir) 306 if nb_job: 307 if idle + run + finish + fail != nb_job: 308 nb_job = idle + run + finish + fail 309 nb_iter = 1 # since some packet finish prevent to pass in long waiting mode 310 else: 311 nb_job = idle + run + finish + fail 312 if fail: 313 raise ClusterManagmentError('Some Jobs are in a Hold/... state. Please try to investigate or contact the IT team') 314 if idle + run == 0: 315 #time.sleep(20) #security to ensure that the file are really written on the disk 316 logger.info('All jobs finished') 317 fct(idle, run, finish) 318 break 319 if idle + run < minimal_job: 320 return 321 fct(idle, run, finish) 322 #Determine how much we have to wait (mode=0->long time, mode=1->short time) 323 if nb_iter < change_at: 324 mode = 1 325 elif idle < run: 326 if old_mode == 0: 327 if nb_short: 328 mode = 0 #we already be back from short to long so stay in long 329 #check if we need to go back to short mode 330 elif idle: 331 if nb_iter > change_at + int(longtime)//shorttime: 332 mode = 0 #stay in long waiting mode 333 else: 334 mode = 1 # pass in short waiting mode 335 nb_short =0 336 else: 337 mode = 1 # pass in short waiting mode 338 nb_short = 0 339 elif old_mode == 1: 340 nb_short +=1 341 if nb_short > 3* max(change_at, int(longtime)//shorttime): 342 mode = 0 #go back in slow waiting 343 else: 344 mode = 0 345 346 #if pass from fast(mode=1) to slow(mode=0) make a print statement: 347 if old_mode > mode: 348 logger.info('''Start to wait %ss between checking status. 349 Note that you can change this time in the configuration file. 350 Press ctrl-C to force the update.''' % self.options['cluster_status_update'][0]) 351 352 #now Waiting! 353 if mode == 0: 354 try: 355 time.sleep(self.options['cluster_status_update'][0]) 356 except KeyboardInterrupt: 357 logger.info('start to update the status') 358 nb_iter = min(0, change_at -2) 359 nb_short = 0 360 else: 361 time.sleep(self.options['cluster_status_update'][1]) 362 363 364 self.submitted = 0 365 self.submitted_ids = []
366
367 - def check_termination(self, job_id):
368 """Check the termination of the jobs with job_id and relaunch it if needed.""" 369 370 371 if job_id not in self.retry_args: 372 if job_id in self.id_to_packet: 373 nb_in_packet = self.id_to_packet[job_id].remove_one() 374 if nb_in_packet == 0: 375 # packet done run the associate function 376 packet = self.id_to_packet[job_id] 377 # fully ensure that the packet is finished (thread safe) 378 packet.queue.join() 379 #running the function 380 packet.fct(*packet.args) 381 del self.id_to_packet[job_id] 382 return 'resubmit' 383 else: 384 return True 385 386 args = self.retry_args[job_id] 387 if 'time_check' in args: 388 time_check = args['time_check'] 389 else: 390 time_check = 0 391 392 for path in args['required_output']: 393 if args['cwd']: 394 path = pjoin(args['cwd'], path) 395 # check that file exists and is not empty. 396 if not (os.path.exists(path) and os.stat(path).st_size != 0) : 397 break 398 else: 399 # all requested output are present 400 if time_check > 0: 401 logger.info('Job %s Finally found the missing output.' % (job_id)) 402 del self.retry_args[job_id] 403 self.submitted_ids.remove(job_id) 404 # check if the job_id is in a packet 405 if job_id in self.id_to_packet: 406 nb_in_packet = self.id_to_packet[job_id].remove_one() 407 if nb_in_packet == 0: 408 # packet done run the associate function 409 packet = self.id_to_packet[job_id] 410 # fully ensure that the packet is finished (thread safe) 411 packet.queue.join() 412 #running the function 413 packet.fct(*packet.args) 414 del self.id_to_packet[job_id] 415 return 'resubmit' 416 417 return 'done' 418 419 if time_check == 0: 420 logger.debug('''Job %s: missing output:%s''' % (job_id,path)) 421 args['time_check'] = time.time() 422 return 'wait' 423 elif self.cluster_retry_wait > time.time() - time_check: 424 return 'wait' 425 426 #jobs failed to be completed even after waiting time!! 427 if self.nb_retry < 0: 428 logger.critical('''Fail to run correctly job %s. 429 with option: %s 430 file missing: %s''' % (job_id, args, path)) 431 raw_input('press enter to continue.') 432 elif self.nb_retry == 0: 433 logger.critical('''Fail to run correctly job %s. 434 with option: %s 435 file missing: %s. 436 Stopping all runs.''' % (job_id, args, path)) 437 self.remove() 438 elif args['nb_submit'] >= self.nb_retry: 439 logger.critical('''Fail to run correctly job %s. 440 with option: %s 441 file missing: %s 442 Fails %s times 443 No resubmition. ''' % (job_id, args, path, args['nb_submit'])) 444 self.remove() 445 else: 446 args['nb_submit'] += 1 447 logger.warning('resubmit job (for the %s times)' % args['nb_submit']) 448 del self.retry_args[job_id] 449 self.submitted_ids.remove(job_id) 450 if 'time_check' in args: 451 del args['time_check'] 452 if job_id in self.id_to_packet: 453 self.id_to_packet[job_id].remove_one() 454 args['packet_member'] = self.id_to_packet[job_id] 455 del self.id_to_packet[job_id] 456 self.cluster_submit(**args) 457 else: 458 self.submit2(**args) 459 return 'resubmit' 460 return 'done'
461 462 @check_interupt()
463 - def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None, 464 stderr=None, log=None, required_output=[], nb_submit=0, 465 input_files=[], output_files=[]):
466 """launch one job on the cluster and wait for it""" 467 468 special_output = False # tag for concatenate the error with the output. 469 if stderr == -2 and stdout: 470 #We are suppose to send the output to stdout 471 special_output = True 472 stderr = stdout + '.err' 473 474 id = self.submit2(prog, argument, cwd, stdout, stderr, log, 475 required_output=required_output, input_files=input_files, 476 output_files=output_files) 477 478 if self.options['cluster_type']=='htcaas2': 479 if self.submitted == self.submitted_ids[-1]: 480 id = self.metasubmit(self) 481 482 frame = inspect.currentframe() 483 args, _, _, values = inspect.getargvalues(frame) 484 args = dict([(i, values[i]) for i in args if i != 'self']) 485 self.retry_args[id] = args 486 487 nb_wait=0 488 while 1: 489 nb_wait+=1 490 status = self.control_one_job(id) 491 if not status in ['R','I']: 492 status = self.check_termination(id) 493 if status in ['wait']: 494 time.sleep(30) 495 continue 496 elif status in ['resubmit']: 497 id = self.submitted_ids[0] 498 time.sleep(30) 499 continue 500 #really stop! 501 time.sleep(30) #security to ensure that the file are really written on the disk 502 break 503 time.sleep(self.options['cluster_status_update'][1]) 504 505 if required_output: 506 status = self.check_termination(id) 507 if status == 'wait': 508 run += 1 509 elif status == 'resubmit': 510 idle += 1 511 512 513 if special_output: 514 # combine the stdout and the stderr 515 #wait up to 50 s to see if those files exists 516 for i in range(5): 517 if os.path.exists(stdout): 518 if not os.path.exists(stderr): 519 time.sleep(5) 520 if os.path.exists(stderr): 521 err_text = open(stderr).read() 522 if not err_text: 523 return 524 logger.warning(err_text) 525 text = open(stdout).read() 526 open(stdout,'w').write(text + err_text) 527 else: 528 return 529 time.sleep(10)
530
531 - def remove(self, *args, **opts):
532 """ """ 533 logger.warning("""This cluster didn't support job removal, 534 the jobs are still running on the cluster.""")
535 536 @store_input()
537 - def metasubmit(self, me_dir):
  538          logger.warning("""This cluster does not support metajob submission.""") 
  539          return 0 
540
541 - def modify_interface(self, run_interface):
542 """routine which allow to modify the run_card/mg5cmd object to change the 543 default behavior of the runs. 544 This is called at the time of the compilation of the run_card. 545 Note that this function can be called multiple times by run. 546 """ 547 #run_card = run_interface.run_card 548 return
549
550 -class Packet(object):
551 """ an object for handling packet of job, it is designed to be thread safe 552 """ 553
554 - def __init__(self, name, fct, args, opts={}):
555 import Queue 556 import threading 557 self.queue = Queue.Queue() 558 self.tag = name 559 self.fct = fct 560 self.args = args 561 self.opts = opts 562 self.done = threading.Event()
563
564 - def put(self, *args, **opts):
565 self.queue.put(*args, **opts)
566 567 append = put 568
569 - def remove_one(self):
570 self.queue.get(True) 571 self.queue.task_done() 572 return self.queue.qsize()
573
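A minimal standalone sketch of the Packet bookkeeping (illustrative only; the report function is made up): ids are put() into the internal queue as jobs are submitted, and remove_one() is called as they finish; when it returns 0 the caller is expected to run pack.fct(*pack.args).

     #   def report(name):
     #       logger.info('packet %s is complete' % name)
     #
     #   pack = Packet('demo', report, ('demo',))
     #   pack.put(1)
     #   pack.put(2)              # two jobs registered in the packet
     #   pack.remove_one()        # -> 1, one job still pending
     #   pack.remove_one()        # -> 0, last job done: caller may now run pack.fct(*pack.args)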
574 -class MultiCore(Cluster):
575 """class for dealing with the submission in multiple node""" 576 577 job_id = "$" 578
579 - def __init__(self, *args, **opt):
580 """Init the cluster """ 581 582 583 super(MultiCore, self).__init__(self, *args, **opt) 584 585 import Queue 586 import threading 587 import thread 588 self.queue = Queue.Queue() # list of job to do 589 self.done = Queue.Queue() # list of job finisned 590 self.submitted = Queue.Queue() # one entry by job submitted 591 self.stoprequest = threading.Event() #flag to ensure everything to close 592 self.demons = [] 593 self.nb_done =0 594 if 'nb_core' in opt: 595 self.nb_core = opt['nb_core'] 596 elif isinstance(args[0],int): 597 self.nb_core = args[0] 598 else: 599 self.nb_core = 1 600 self.update_fct = None 601 602 self.lock = threading.Event() # allow nice lock of the main thread 603 self.pids = Queue.Queue() # allow to clean jobs submit via subprocess 604 self.done_pid = [] # list of job finisned 605 self.done_pid_queue = Queue.Queue() 606 self.fail_msg = None 607 608 # starting the worker node 609 for _ in range(self.nb_core): 610 self.start_demon()
611 612
613 - def start_demon(self):
614 import threading 615 t = threading.Thread(target=self.worker) 616 t.daemon = True 617 t.start() 618 self.demons.append(t)
619 620
621 - def worker(self):
622 import Queue 623 import thread 624 while not self.stoprequest.isSet(): 625 try: 626 args = self.queue.get() 627 tag, exe, arg, opt = args 628 try: 629 # check for executable case 630 if isinstance(exe,str): 631 if os.path.exists(exe) and not exe.startswith('/'): 632 exe = './' + exe 633 if isinstance(opt['stdout'],str): 634 opt['stdout'] = open(opt['stdout'],'w') 635 if opt['stderr'] == None: 636 opt['stderr'] = subprocess.STDOUT 637 if arg: 638 proc = misc.Popen([exe] + arg, **opt) 639 else: 640 proc = misc.Popen(exe, **opt) 641 pid = proc.pid 642 self.pids.put(pid) 643 proc.wait() 644 if proc.returncode not in [0, 143, -15] and not self.stoprequest.isSet(): 645 fail_msg = 'program %s launch ends with non zero status: %s. Stop all computation' % \ 646 (' '.join([exe]+arg), proc.returncode) 647 logger.warning(fail_msg) 648 self.stoprequest.set() 649 self.remove(fail_msg) 650 # handle the case when this is a python function. Note that 651 # this use Thread so they are NO built-in parralelization this is 652 # going to work on a single core! (but this is fine for IO intensive 653 # function. for CPU intensive fct this will slow down the computation 654 else: 655 pid = tag 656 self.pids.put(pid) 657 # the function should return 0 if everything is fine 658 # the error message otherwise 659 returncode = exe(*arg, **opt) 660 if returncode != 0: 661 logger.warning("fct %s does not return 0. Stopping the code in a clean way. The error was:\n%s", exe, returncode) 662 self.stoprequest.set() 663 self.remove("fct %s does not return 0:\n %s" % (exe, returncode)) 664 except Exception,error: 665 self.fail_msg = sys.exc_info() 666 logger.warning(str(error)) 667 self.stoprequest.set() 668 self.remove(error) 669 670 if __debug__: 671 raise self.fail_msg[0], self.fail_msg[1],self.fail_msg[2] 672 673 self.queue.task_done() 674 self.done.put(tag) 675 self.done_pid_queue.put(pid) 676 #release the mother to print the status on the screen 677 try: 678 self.lock.set() 679 except thread.error: 680 continue 681 except Queue.Empty: 682 continue
683 684 685 686
687 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 688 log=None, required_output=[], nb_submit=0):
689 """submit a job on multicore machine""" 690 691 tag = (prog, tuple(argument), cwd, nb_submit) 692 if isinstance(prog, str): 693 694 opt = {'cwd': cwd, 695 'stdout':stdout, 696 'stderr': stderr} 697 self.queue.put((tag, prog, argument, opt)) 698 self.submitted.put(1) 699 return tag 700 else: 701 # python function 702 self.queue.put((tag, prog, argument, {})) 703 self.submitted.put(1) 704 return tag
705
706 - def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None, 707 stderr=None, log=None, **opts):
708 """launch one job and wait for it""" 709 if isinstance(stdout, str): 710 stdout = open(stdout, 'w') 711 if isinstance(stderr, str): 712 stdout = open(stderr, 'w') 713 return misc.call([prog] + argument, stdout=stdout, stderr=stderr, cwd=cwd)
714
715 - def remove(self, error=None):
716 """Ensure that all thread are killed""" 717 718 # ensure the worker to stop 719 self.stoprequest.set() 720 if error and not self.fail_msg: 721 self.fail_msg = error 722 723 # cleaning the queue done_pid_queue and move them to done_pid 724 while not self.done_pid_queue.empty(): 725 pid = self.done_pid_queue.get() 726 self.done_pid.append(pid) 727 # self.done_pid_queue.task_done() 728 729 while not self.pids.empty(): 730 pid = self.pids.get() 731 self.pids.task_done() 732 if isinstance(pid, tuple): 733 continue 734 if pid in self.done_pid: 735 continue 736 out = os.system('CPIDS=$(pgrep -P %(pid)s); kill -15 $CPIDS > /dev/null 2>&1' \ 737 % {'pid':pid} ) 738 out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid':pid} )
739 740
741 - def wait(self, me_dir, update_status, update_first=None):
742 """Waiting that all the jobs are done. This function also control that 743 the submission by packet are handle correctly (i.e. submit the function)""" 744 745 import Queue 746 import threading 747 748 try: # to catch KeyBoardInterupt to see which kind of error to display 749 last_status = (0, 0, 0) 750 sleep_time = 1 751 use_lock = True 752 first = True 753 while True: 754 force_one_more_loop = False # some security 755 756 # Loop over the job tagged as done to check if some packet of jobs 757 # are finished in case, put the associate function in the queue 758 while self.done.qsize(): 759 try: 760 tag = self.done.get(True, 1) 761 except Queue.Empty: 762 pass 763 else: 764 if self.id_to_packet and tuple(tag) in self.id_to_packet: 765 packet = self.id_to_packet[tuple(tag)] 766 remaining = packet.remove_one() 767 if remaining == 0: 768 # fully ensure that the packet is finished (thread safe) 769 packet.queue.join() 770 self.submit(packet.fct, packet.args) 771 force_one_more_loop = True 772 self.nb_done += 1 773 self.done.task_done() 774 775 # Get from the various queue the Idle/Done/Running information 776 # Those variable should be thread safe but approximate. 777 Idle = self.queue.qsize() 778 Done = self.nb_done + self.done.qsize() 779 Running = max(0, self.submitted.qsize() - Idle - Done) 780 781 if Idle + Running <= 0 and not force_one_more_loop: 782 update_status(Idle, Running, Done) 783 # Going the quit since everything is done 784 # Fully Ensure that everything is indeed done. 785 self.queue.join() 786 break 787 788 if (Idle, Running, Done) != last_status: 789 if first and update_first: 790 update_first(Idle, Running, Done) 791 first = False 792 else: 793 update_status(Idle, Running, Done) 794 last_status = (Idle, Running, Done) 795 796 # cleaning the queue done_pid_queue and move them to done_pid 797 while not self.done_pid_queue.empty(): 798 pid = self.done_pid_queue.get() 799 self.done_pid.append(pid) 800 self.done_pid_queue.task_done() 801 802 803 # Define how to wait for the next iteration 804 if use_lock: 805 # simply wait that a worker release the lock 806 use_lock = self.lock.wait(300) 807 self.lock.clear() 808 if not use_lock and Idle > 0: 809 use_lock = True 810 else: 811 # to be sure that we will never fully lock at the end pass to 812 # a simple time.sleep() 813 time.sleep(sleep_time) 814 sleep_time = min(sleep_time + 2, 180) 815 if update_first: 816 update_first(Idle, Running, Done) 817 818 if self.stoprequest.isSet(): 819 if isinstance(self.fail_msg, Exception): 820 raise self.fail_msg 821 elif isinstance(self.fail_msg, str): 822 raise Exception, self.fail_msg 823 else: 824 misc.sprint(self.fail_msg) 825 raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2] 826 # reset variable for next submission 827 try: 828 self.lock.clear() 829 except Exception: 830 pass 831 self.done = Queue.Queue() 832 self.done_pid = [] 833 self.done_pid_queue = Queue.Queue() 834 self.nb_done = 0 835 self.submitted = Queue.Queue() 836 self.pids = Queue.Queue() 837 self.stoprequest.clear() 838 839 except KeyboardInterrupt: 840 # if one of the node fails -> return that error 841 if isinstance(self.fail_msg, Exception): 842 raise self.fail_msg 843 elif isinstance(self.fail_msg, str): 844 raise Exception, self.fail_msg 845 elif self.fail_msg: 846 raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2] 847 # else return orignal error 848 raise
849
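A hedged usage sketch of MultiCore (assuming /bin/sleep exists; the update function is made up): shell commands and python callables share the same queue, the worker threads pick them up, and wait() blocks until all of them are done.

     #   def update(idle, running, done):
     #       logger.info('%s idle, %s running, %s done' % (idle, running, done))
     #
     #   mc = MultiCore(2)                      # two worker threads
     #   mc.submit('/bin/sleep', argument=['1'])
     #   mc.submit('/bin/sleep', argument=['2'])
     #   mc.submit(lambda: 0, argument=[])      # a python callable must return 0 on success
     #   mc.wait(None, update)                  # blocks until the queue is empty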
850 -class CondorCluster(Cluster):
851 """Basic class for dealing with cluster submission""" 852 853 name = 'condor' 854 job_id = 'CONDOR_ID' 855 856 857 858 @multiple_try()
859 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 860 required_output=[], nb_submit=0):
861 """Submit a job prog to a Condor cluster""" 862 863 text = """Executable = %(prog)s 864 output = %(stdout)s 865 error = %(stderr)s 866 log = %(log)s 867 %(argument)s 868 environment = CONDOR_ID=$(Cluster).$(Process) 869 Universe = vanilla 870 notification = Error 871 Initialdir = %(cwd)s 872 %(requirement)s 873 getenv=True 874 queue 1 875 """ 876 877 if self.cluster_queue not in ['None', None]: 878 requirement = 'Requirements = %s=?=True' % self.cluster_queue 879 else: 880 requirement = '' 881 882 if cwd is None: 883 cwd = os.getcwd() 884 if stdout is None: 885 stdout = '/dev/null' 886 if stderr is None: 887 stderr = '/dev/null' 888 if log is None: 889 log = '/dev/null' 890 if not os.path.exists(prog): 891 prog = os.path.join(cwd, prog) 892 if argument: 893 argument = 'Arguments = %s' % ' '.join(argument) 894 else: 895 argument = '' 896 897 898 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout, 899 'stderr': stderr,'log': log,'argument': argument, 900 'requirement': requirement} 901 902 #open('submit_condor','w').write(text % dico) 903 a = misc.Popen(['condor_submit'], stdout=subprocess.PIPE, 904 stdin=subprocess.PIPE) 905 output, _ = a.communicate(text % dico) 906 #output = a.stdout.read() 907 #Submitting job(s). 908 #Logging submit event(s). 909 #1 job(s) submitted to cluster 2253622. 910 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE) 911 try: 912 id = pat.search(output).groups()[0] 913 except: 914 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 915 % output 916 self.submitted += 1 917 self.submitted_ids.append(id) 918 return id
919 920 @store_input() 921 @multiple_try()
922 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 923 log=None, input_files=[], output_files=[], required_output=[], 924 nb_submit=0):
925 """Submit the job on the cluster NO SHARE DISK 926 input/output file should be give relative to cwd 927 """ 928 929 if not required_output and output_files: 930 required_output = output_files 931 932 if (input_files == [] == output_files): 933 return self.submit(prog, argument, cwd, stdout, stderr, log, 934 required_output=required_output, nb_submit=nb_submit) 935 936 text = """Executable = %(prog)s 937 output = %(stdout)s 938 error = %(stderr)s 939 log = %(log)s 940 %(argument)s 941 should_transfer_files = YES 942 when_to_transfer_output = ON_EXIT 943 transfer_input_files = %(input_files)s 944 %(output_files)s 945 Universe = vanilla 946 notification = Error 947 Initialdir = %(cwd)s 948 %(requirement)s 949 getenv=True 950 queue 1 951 """ 952 953 if self.cluster_queue not in ['None', None]: 954 requirement = 'Requirements = %s=?=True' % self.cluster_queue 955 else: 956 requirement = '' 957 958 if cwd is None: 959 cwd = os.getcwd() 960 if stdout is None: 961 stdout = '/dev/null' 962 if stderr is None: 963 stderr = '/dev/null' 964 if log is None: 965 log = '/dev/null' 966 if not os.path.exists(prog): 967 prog = os.path.join(cwd, prog) 968 if argument: 969 argument = 'Arguments = %s' % ' '.join([str(a) for a in argument]) 970 else: 971 argument = '' 972 # input/output file treatment 973 if input_files: 974 input_files = ','.join(input_files) 975 else: 976 input_files = '' 977 if output_files: 978 output_files = 'transfer_output_files = %s' % ','.join(output_files) 979 else: 980 output_files = '' 981 982 983 984 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout, 985 'stderr': stderr,'log': log,'argument': argument, 986 'requirement': requirement, 'input_files':input_files, 987 'output_files':output_files} 988 989 #open('submit_condor','w').write(text % dico) 990 a = subprocess.Popen(['condor_submit'], stdout=subprocess.PIPE, 991 stdin=subprocess.PIPE) 992 output, _ = a.communicate(text % dico) 993 #output = a.stdout.read() 994 #Submitting job(s). 995 #Logging submit event(s). 996 #1 job(s) submitted to cluster 2253622. 997 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE) 998 try: 999 id = pat.search(output).groups()[0] 1000 except: 1001 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1002 % output 1003 self.submitted += 1 1004 self.submitted_ids.append(id) 1005 return id
1006 1007 1008 1009 1010 1011 @multiple_try(nb_try=10, sleep=10)
1012 - def control_one_job(self, id):
1013 """ control the status of a single job with it's cluster id """ 1014 cmd = 'condor_q '+str(id)+" -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'" 1015 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1016 stderr=subprocess.PIPE) 1017 1018 error = status.stderr.read() 1019 if status.returncode or error: 1020 raise ClusterManagmentError, 'condor_q returns error: %s' % error 1021 1022 return status.stdout.readline().strip()
1023 1024 jobstatus = {'0':'U', '1':'I','2':'R','3':'X','4':'C','5':'H','6':'E'} 1025 @check_interupt() 1026 @multiple_try(nb_try=10, sleep=10)
1027 - def control(self, me_dir):
1028 """ control the status of a single job with it's cluster id """ 1029 1030 if not self.submitted_ids: 1031 return 0, 0, 0, 0 1032 1033 packet = 15000 1034 idle, run, fail = 0, 0, 0 1035 ongoing = [] 1036 for i in range(1+(len(self.submitted_ids)-1)//packet): 1037 start = i * packet 1038 stop = (i+1) * packet 1039 cmd = "condor_q " + ' '.join(self.submitted_ids[start:stop]) + \ 1040 " -format \"%d \" ClusterId " + \ 1041 " -format \"%d\\n\" JobStatus " 1042 1043 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1044 stderr=subprocess.PIPE) 1045 error = status.stderr.read() 1046 if status.returncode or error: 1047 raise ClusterManagmentError, 'condor_q returns error: %s' % error 1048 1049 for line in status.stdout: 1050 id, status = line.strip().split() 1051 status = self.jobstatus[status] 1052 ongoing.append(id) 1053 if status in ['I','U']: 1054 idle += 1 1055 elif status == 'R': 1056 run += 1 1057 elif status != 'C': 1058 fail += 1 1059 1060 for id in list(self.submitted_ids): 1061 if id not in ongoing: 1062 status = self.check_termination(id) 1063 if status == 'wait': 1064 run += 1 1065 elif status == 'resubmit': 1066 idle += 1 1067 1068 return idle, run, self.submitted - (idle+run+fail), fail
1069 1070 @multiple_try()
1071 - def remove(self, *args, **opts):
1072 """Clean the jobson the cluster""" 1073 1074 if not self.submitted_ids: 1075 return 1076 cmd = "condor_rm %s" % ' '.join(self.submitted_ids) 1077 1078 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1079 self.submitted_ids = []
1080
1081 -class PBSCluster(Cluster):
1082 """Basic class for dealing with cluster submission""" 1083 1084 name = 'pbs' 1085 job_id = 'PBS_JOBID' 1086 idle_tag = ['Q'] 1087 running_tag = ['T','E','R'] 1088 complete_tag = ['C'] 1089 1090 maximum_submited_jobs = 2500 1091 1092 @multiple_try()
1093 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1094 required_output=[], nb_submit=0):
1095 """Submit a job prog to a PBS cluster""" 1096 1097 me_dir = self.get_jobs_identifier(cwd, prog) 1098 1099 if len(self.submitted_ids) > self.maximum_submited_jobs: 1100 fct = lambda idle, run, finish: logger.info('Waiting for free slot: %s %s %s' % (idle, run, finish)) 1101 self.wait(me_dir, fct, self.maximum_submited_jobs) 1102 1103 1104 text = "" 1105 if cwd is None: 1106 cwd = os.getcwd() 1107 else: 1108 text = " cd %s;" % cwd 1109 if stdout is None: 1110 stdout = '/dev/null' 1111 if stderr is None: 1112 stderr = '/dev/null' 1113 elif stderr == -2: # -2 is subprocess.STDOUT 1114 stderr = stdout 1115 if log is None: 1116 log = '/dev/null' 1117 1118 if not os.path.isabs(prog): 1119 text += "./%s" % prog 1120 else: 1121 text+= prog 1122 1123 if argument: 1124 text += ' ' + ' '.join(argument) 1125 1126 command = ['qsub','-o', stdout, 1127 '-N', me_dir, 1128 '-e', stderr, 1129 '-V'] 1130 1131 if self.cluster_queue and self.cluster_queue != 'None': 1132 command.extend(['-q', self.cluster_queue]) 1133 1134 a = misc.Popen(command, stdout=subprocess.PIPE, 1135 stderr=subprocess.STDOUT, 1136 stdin=subprocess.PIPE, cwd=cwd) 1137 1138 output = a.communicate(text)[0] 1139 id = output.split('.')[0] 1140 if not id.isdigit() or a.returncode !=0: 1141 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1142 % output 1143 1144 self.submitted += 1 1145 self.submitted_ids.append(id) 1146 return id
1147 1148 @multiple_try()
1149 - def control_one_job(self, id):
1150 """ control the status of a single job with it's cluster id """ 1151 cmd = 'qstat '+str(id) 1152 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1153 stderr=subprocess.STDOUT) 1154 1155 for line in status.stdout: 1156 line = line.strip() 1157 if 'cannot connect to server' in line or 'cannot read reply' in line: 1158 raise ClusterManagmentError, 'server disconnected' 1159 if 'Unknown' in line: 1160 return 'F' 1161 elif line.startswith(str(id)): 1162 jobstatus = line.split()[4] 1163 else: 1164 jobstatus="" 1165 1166 if status.returncode != 0 and status.returncode is not None: 1167 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode 1168 if jobstatus in self.idle_tag: 1169 return 'I' 1170 elif jobstatus in self.running_tag: 1171 return 'R' 1172 return 'F'
1173 1174 1175 @multiple_try()
1176 - def control(self, me_dir):
1177 """ control the status of a single job with it's cluster id """ 1178 cmd = "qstat" 1179 status = misc.Popen([cmd], stdout=subprocess.PIPE) 1180 1181 me_dir = self.get_jobs_identifier(me_dir) 1182 1183 ongoing = [] 1184 1185 idle, run, fail = 0, 0, 0 1186 for line in status.stdout: 1187 if 'cannot connect to server' in line or 'cannot read reply' in line: 1188 raise ClusterManagmentError, 'server disconnected' 1189 if me_dir in line: 1190 ongoing.append(line.split()[0].split('.')[0]) 1191 status2 = line.split()[4] 1192 if status2 in self.idle_tag: 1193 idle += 1 1194 elif status2 in self.running_tag: 1195 run += 1 1196 elif status2 in self.complete_tag: 1197 if not self.check_termination(line.split()[0].split('.')[0]): 1198 idle += 1 1199 else: 1200 fail += 1 1201 1202 if status.returncode != 0 and status.returncode is not None: 1203 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode 1204 1205 for id in list(self.submitted_ids): 1206 if id not in ongoing: 1207 status2 = self.check_termination(id) 1208 if status2 == 'wait': 1209 run += 1 1210 elif status2 == 'resubmit': 1211 idle += 1 1212 1213 return idle, run, self.submitted - (idle+run+fail), fail
1214 1215 @multiple_try()
1216 - def remove(self, *args, **opts):
1217 """Clean the jobs on the cluster""" 1218 1219 if not self.submitted_ids: 1220 return 1221 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1222 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1223 self.submitted_ids = []
1224
1225 1226 -class SGECluster(Cluster):
1227 """Basic class for dealing with cluster submission""" 1228 # Class written by Arian Abrahantes. 1229 1230 name = 'sge' 1231 job_id = 'JOB_ID' 1232 idle_tag = ['qw', 'hqw','hRqw','w'] 1233 running_tag = ['r','t','Rr','Rt'] 1234 identifier_length = 10 1235
1236 - def def_get_path(self,location):
1237 """replace string for path issues""" 1238 location = os.path.realpath(location) 1239 homePath = os.getenv("HOME") 1240 if homePath: 1241 location = location.replace(homePath,'$HOME') 1242 return location
1243 1244 @multiple_try()
1245 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1246 required_output=[], nb_submit=0):
1247 """Submit a job prog to an SGE cluster""" 1248 1249 me_dir = self.get_jobs_identifier(cwd, prog) 1250 1251 1252 if cwd is None: 1253 #cwd = os.getcwd() 1254 cwd = self.def_get_path(os.getcwd()) 1255 cwd1 = self.def_get_path(cwd) 1256 text = " cd %s;" % cwd1 1257 if stdout is None: 1258 stdout = '/dev/null' 1259 else: 1260 stdout = self.def_get_path(stdout) 1261 if stderr is None: 1262 stderr = '/dev/null' 1263 elif stderr == -2: # -2 is subprocess.STDOUT 1264 stderr = stdout 1265 else: 1266 stderr = self.def_get_path(stderr) 1267 1268 if log is None: 1269 log = '/dev/null' 1270 else: 1271 log = self.def_get_path(log) 1272 1273 text += prog 1274 if argument: 1275 text += ' ' + ' '.join(argument) 1276 1277 #if anything slips through argument 1278 #print "!=== inteded change ",text.replace('/srv/nfs','') 1279 #text = text.replace('/srv/nfs','') 1280 homePath = os.getenv("HOME") 1281 if homePath: 1282 text = text.replace(homePath,'$HOME') 1283 1284 logger.debug("!=== input %s" % text) 1285 logger.debug("!=== output %s" % stdout) 1286 logger.debug("!=== error %s" % stderr) 1287 logger.debug("!=== logs %s" % log) 1288 1289 command = ['qsub','-o', stdout, 1290 '-N', me_dir, 1291 '-e', stderr, 1292 '-V'] 1293 1294 if self.cluster_queue and self.cluster_queue != 'None': 1295 command.extend(['-q', self.cluster_queue]) 1296 1297 a = misc.Popen(command, stdout=subprocess.PIPE, 1298 stderr=subprocess.STDOUT, 1299 stdin=subprocess.PIPE, cwd=cwd) 1300 1301 output = a.communicate(text)[0] 1302 id = output.split(' ')[2] 1303 if not id.isdigit(): 1304 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1305 % output 1306 self.submitted += 1 1307 self.submitted_ids.append(id) 1308 logger.debug(output) 1309 1310 return id
1311 1312 @multiple_try()
1313 - def control_one_job(self, id):
1314 """ control the status of a single job with it's cluster id """ 1315 #cmd = 'qstat '+str(id) 1316 cmd = 'qstat ' 1317 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1318 for line in status.stdout: 1319 #print "!==",line 1320 #line = line.strip() 1321 #if 'Unknown' in line: 1322 # return 'F' 1323 #elif line.startswith(str(id)): 1324 # status = line.split()[4] 1325 if str(id) in line: 1326 status = line.split()[4] 1327 #print "!=status", status 1328 if status in self.idle_tag: 1329 return 'I' 1330 elif status in self.running_tag: 1331 return 'R' 1332 return 'F'
1333 1334 @multiple_try()
1335 - def control(self, me_dir):
1336 """ control the status of a single job with it's cluster id """ 1337 cmd = "qstat " 1338 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1339 1340 me_dir = self.get_jobs_identifier(me_dir) 1341 1342 finished = list(self.submitted_ids) 1343 1344 idle, run, fail = 0, 0, 0 1345 for line in status.stdout: 1346 if me_dir in line: 1347 id,_,_,_,status = line.split()[:5] 1348 if status in self.idle_tag: 1349 idle += 1 1350 finished.remove(id) 1351 elif status in self.running_tag: 1352 run += 1 1353 finished.remove(id) 1354 else: 1355 logger.debug(line) 1356 fail += 1 1357 finished.remove(id) 1358 1359 for id in finished: 1360 self.check_termination(id) 1361 1362 return idle, run, self.submitted - (idle+run+fail), fail
1363 1364 1365 1366 @multiple_try()
1367 - def remove(self, *args, **opts):
1368 """Clean the jobs on the cluster""" 1369 1370 if not self.submitted_ids: 1371 return 1372 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1373 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1374 self.submitted_ids = []
1375
1376 1377 -class LSFCluster(Cluster):
1378 """Basic class for dealing with cluster submission""" 1379 1380 name = 'lsf' 1381 job_id = 'LSB_JOBID' 1382 1383 @multiple_try()
1384 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1385 required_output=[], nb_submit=0):
1386 """Submit the job prog to an LSF cluster""" 1387 1388 1389 me_dir = self.get_jobs_identifier(cwd, prog) 1390 1391 text = "" 1392 command = ['bsub', '-C0', '-J', me_dir] 1393 if cwd is None: 1394 cwd = os.getcwd() 1395 else: 1396 text = " cd %s;" % cwd 1397 if stdout and isinstance(stdout, str): 1398 command.extend(['-o', stdout]) 1399 if stderr and isinstance(stdout, str): 1400 command.extend(['-e', stderr]) 1401 elif stderr == -2: # -2 is subprocess.STDOUT 1402 pass 1403 if log is None: 1404 log = '/dev/null' 1405 1406 text += prog 1407 if argument: 1408 text += ' ' + ' '.join(argument) 1409 1410 if self.cluster_queue and self.cluster_queue != 'None': 1411 command.extend(['-q', self.cluster_queue]) 1412 1413 a = misc.Popen(command, stdout=subprocess.PIPE, 1414 stderr=subprocess.STDOUT, 1415 stdin=subprocess.PIPE, cwd=cwd) 1416 1417 output = a.communicate(text)[0] 1418 #Job <nnnn> is submitted to default queue <normal>. 1419 try: 1420 id = output.split('>',1)[0].split('<')[1] 1421 except: 1422 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1423 % output 1424 if not id.isdigit(): 1425 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1426 % output 1427 self.submitted += 1 1428 self.submitted_ids.append(id) 1429 return id
1430 1431 1432 @multiple_try()
1433 - def control_one_job(self, id):
1434 """ control the status of a single job with it's cluster id """ 1435 1436 cmd = 'bjobs '+str(id) 1437 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1438 1439 for line in status.stdout: 1440 line = line.strip().upper() 1441 if 'JOBID' in line: 1442 continue 1443 elif str(id) not in line: 1444 continue 1445 status = line.split()[2] 1446 if status == 'RUN': 1447 return 'R' 1448 elif status == 'PEND': 1449 return 'I' 1450 elif status == 'DONE': 1451 return 'F' 1452 else: 1453 return 'H' 1454 return 'F'
1455 1456 @multiple_try()
1457 - def control(self, me_dir):
1458 """ control the status of a single job with it's cluster id """ 1459 1460 if not self.submitted_ids: 1461 return 0, 0, 0, 0 1462 1463 cmd = "bjobs " + ' '.join(self.submitted_ids) 1464 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1465 1466 jobstatus = {} 1467 for line in status.stdout: 1468 line = line.strip() 1469 if 'JOBID' in line: 1470 continue 1471 splitline = line.split() 1472 id = splitline[0] 1473 if id not in self.submitted_ids: 1474 continue 1475 jobstatus[id] = splitline[2] 1476 1477 idle, run, fail = 0, 0, 0 1478 for id in self.submitted_ids[:]: 1479 if id in jobstatus: 1480 status = jobstatus[id] 1481 else: 1482 status = 'MISSING' 1483 if status == 'RUN': 1484 run += 1 1485 elif status == 'PEND': 1486 idle += 1 1487 else: 1488 status = self.check_termination(id) 1489 if status == 'wait': 1490 run += 1 1491 elif status == 'resubmit': 1492 idle += 1 1493 1494 return idle, run, self.submitted - (idle+run+fail), fail
1495 1496 @multiple_try()
1497 - def remove(self, *args,**opts):
1498 """Clean the jobs on the cluster""" 1499 1500 if not self.submitted_ids: 1501 return 1502 cmd = "bkill %s" % ' '.join(self.submitted_ids) 1503 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1504 self.submitted_ids = []
1505
1506 -class GECluster(Cluster):
1507 """Class for dealing with cluster submission on a GE cluster""" 1508 1509 name = 'ge' 1510 job_id = 'JOB_ID' 1511 idle_tag = ['qw'] 1512 running_tag = ['r'] 1513 1514 @multiple_try()
1515 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1516 required_output=[], nb_submit=0):
1517 """Submit a job prog to a GE cluster""" 1518 1519 text = "" 1520 if cwd is None: 1521 cwd = os.getcwd() 1522 else: 1523 text = " cd %s; bash " % cwd 1524 if stdout is None: 1525 stdout = os.path.join(cwd, "log.%s" % prog.split('/')[-1]) 1526 if stderr is None: 1527 stderr = os.path.join(cwd, "err.%s" % prog.split('/')[-1]) 1528 elif stderr == -2: # -2 is subprocess.STDOUT 1529 stderr = stdout 1530 if log is None: 1531 log = '/dev/null' 1532 1533 text += prog 1534 if argument: 1535 text += ' ' + ' '.join(argument) 1536 text += '\n' 1537 tmp_submit = os.path.join(cwd, 'tmp_submit') 1538 open(tmp_submit,'w').write(text) 1539 1540 a = misc.Popen(['qsub','-o', stdout, 1541 '-e', stderr, 1542 tmp_submit], 1543 stdout=subprocess.PIPE, 1544 stderr=subprocess.STDOUT, 1545 stdin=subprocess.PIPE, cwd=cwd) 1546 1547 output = a.communicate()[0] 1548 #Your job 874511 ("test.sh") has been submitted 1549 pat = re.compile("Your job (\d*) \(",re.MULTILINE) 1550 try: 1551 id = pat.search(output).groups()[0] 1552 except: 1553 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1554 % output 1555 self.submitted += 1 1556 self.submitted_ids.append(id) 1557 return id
1558 1559 @multiple_try()
1560 - def control_one_job(self, id):
1561 """ control the status of a single job with it's cluster id """ 1562 cmd = 'qstat | grep '+str(id) 1563 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1564 if not status: 1565 return 'F' 1566 #874516 0.00000 test.sh alwall qw 03/04/2012 22:30:35 1 1567 pat = re.compile("^(\d+)\s+[\d\.]+\s+[\w\d\.]+\s+[\w\d\.]+\s+(\w+)\s") 1568 stat = '' 1569 for line in status.stdout.read().split('\n'): 1570 if not line: 1571 continue 1572 line = line.strip() 1573 try: 1574 groups = pat.search(line).groups() 1575 except: 1576 raise ClusterManagmentError, 'bad syntax for stat: \n\"%s\"' % line 1577 if groups[0] != id: continue 1578 stat = groups[1] 1579 if not stat: 1580 return 'F' 1581 if stat in self.idle_tag: 1582 return 'I' 1583 if stat in self.running_tag: 1584 return 'R'
1585 1586 @multiple_try()
1587 - def control(self, me_dir=None):
1588 """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)""" 1589 if not self.submitted_ids: 1590 return 0, 0, 0, 0 1591 idle, run, fail = 0, 0, 0 1592 ongoing = [] 1593 for statusflag in ['p', 'r', 'sh']: 1594 cmd = 'qstat -s %s' % statusflag 1595 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1596 #874516 0.00000 test.sh alwall qw 03/04/2012 22:30:35 1 1597 pat = re.compile("^(\d+)") 1598 for line in status.stdout.read().split('\n'): 1599 line = line.strip() 1600 try: 1601 id = pat.search(line).groups()[0] 1602 except Exception: 1603 pass 1604 else: 1605 if id not in self.submitted_ids: 1606 continue 1607 ongoing.append(id) 1608 if statusflag == 'p': 1609 idle += 1 1610 if statusflag == 'r': 1611 run += 1 1612 if statusflag == 'sh': 1613 fail += 1 1614 for id in list(self.submitted_ids): 1615 if id not in ongoing: 1616 self.check_termination(id) 1617 #self.submitted_ids = ongoing 1618 1619 return idle, run, self.submitted - idle - run - fail, fail
1620 1621 @multiple_try()
1622 - def remove(self, *args, **opts):
1623 """Clean the jobs on the cluster""" 1624 1625 if not self.submitted_ids: 1626 return 1627 cmd = "qdel %s" % ' '.join(self.submitted_ids) 1628 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1629 self.submitted_ids = []
1630
1631 -def asyncrone_launch(exe, cwd=None, stdout=None, argument = [], **opt):
1632 """start a computation and not wait for it to finish. 1633 this fonction returns a lock which is locked as long as the job is 1634 running.""" 1635 1636 mc = MultiCore(1) 1637 mc.submit(exe, argument, cwd, stdout, **opt) 1638 mc.need_waiting = True 1639 return mc.lock
1640
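A hedged usage sketch (the command is hypothetical): the returned object is the MultiCore lock, a threading.Event that the worker sets once the job has completed, so callers can wait on it.

 #   lock = asyncrone_launch('/bin/sleep', argument=['5'])
 #   # ... do other work ...
 #   lock.wait()        # blocks until the background job has finished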
1641 1642 -class SLURMCluster(Cluster):
1643 """Basic class for dealing with cluster submission""" 1644 1645 name = 'slurm' 1646 job_id = 'SLURM_JOBID' 1647 idle_tag = ['Q','PD','S','CF'] 1648 running_tag = ['R', 'CG'] 1649 complete_tag = ['C'] 1650 identifier_length = 8 1651 1652 @multiple_try()
1653 - def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None, 1654 required_output=[], nb_submit=0):
1655 """Submit a job prog to a SLURM cluster""" 1656 1657 me_dir = self.get_jobs_identifier(cwd, prog) 1658 1659 1660 if cwd is None: 1661 cwd = os.getcwd() 1662 if stdout is None: 1663 stdout = '/dev/null' 1664 if stderr is None: 1665 stderr = '/dev/null' 1666 elif stderr == -2: # -2 is subprocess.STDOUT 1667 stderr = stdout 1668 if log is None: 1669 log = '/dev/null' 1670 1671 command = ['sbatch', '-o', stdout, 1672 '-J', me_dir, 1673 '-e', stderr, prog] + argument 1674 1675 if self.cluster_queue and self.cluster_queue != 'None': 1676 command.insert(1, '-p') 1677 command.insert(2, self.cluster_queue) 1678 1679 a = misc.Popen(command, stdout=subprocess.PIPE, 1680 stderr=subprocess.STDOUT, 1681 stdin=subprocess.PIPE, cwd=cwd) 1682 1683 output = a.communicate() 1684 output_arr = output[0].split(' ') 1685 id = output_arr[3].rstrip() 1686 1687 if not id.isdigit(): 1688 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \ 1689 % (output[0] + '\n' + output[1]) 1690 1691 self.submitted += 1 1692 self.submitted_ids.append(id) 1693 return id
1694 1695 @multiple_try()
1696 - def control_one_job(self, id):
1697 """ control the status of a single job with it's cluster id """ 1698 cmd = 'squeue j'+str(id) 1699 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE, 1700 stderr=open(os.devnull,'w')) 1701 1702 for line in status.stdout: 1703 line = line.strip() 1704 if 'Invalid' in line: 1705 return 'F' 1706 elif line.startswith(str(id)): 1707 status = line.split()[4] 1708 if status in self.idle_tag: 1709 return 'I' 1710 elif status in self.running_tag: 1711 return 'R' 1712 return 'F'
1713 1714 @multiple_try()
1715 - def control(self, me_dir):
1716 """ control the status of a single job with it's cluster id """ 1717 cmd = "squeue" 1718 pstatus = misc.Popen([cmd], stdout=subprocess.PIPE) 1719 1720 me_dir = self.get_jobs_identifier(me_dir) 1721 1722 idle, run, fail = 0, 0, 0 1723 ongoing=[] 1724 for line in pstatus.stdout: 1725 if me_dir in line: 1726 id, _, _,_ , status,_ = line.split(None,5) 1727 ongoing.append(id) 1728 if status in self.idle_tag: 1729 idle += 1 1730 elif status in self.running_tag: 1731 run += 1 1732 elif status in self.complete_tag: 1733 status = self.check_termination(id) 1734 if status == 'wait': 1735 run += 1 1736 elif status == 'resubmit': 1737 idle += 1 1738 else: 1739 fail += 1 1740 1741 #control other finished job 1742 for id in list(self.submitted_ids): 1743 if id not in ongoing: 1744 status = self.check_termination(id) 1745 if status == 'wait': 1746 run += 1 1747 elif status == 'resubmit': 1748 idle += 1 1749 1750 1751 return idle, run, self.submitted - (idle+run+fail), fail
1752 1753 @multiple_try()
1754 - def remove(self, *args, **opts):
1755 """Clean the jobs on the cluster""" 1756 1757 if not self.submitted_ids: 1758 return 1759 cmd = "scancel %s" % ' '.join(self.submitted_ids) 1760 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w')) 1761 self.submitted_ids = []
1762
1763 -class HTCaaSCluster(Cluster):
1764 """Class for dealing with cluster submission on a HTCaaS cluster using GPFS """ 1765 1766 name= 'htcaas' 1767 job_id = 'HTCAAS_JOBID' 1768 idle_tag = ['waiting'] 1769 running_tag = ['preparing','running'] 1770 complete_tag = ['done'] 1771 1772 @store_input() 1773 @multiple_try()
1774 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None, 1775 log=None, input_files=[], output_files=[], required_output=[], 1776 nb_submit=0):
1777 """Submit the HTCaaS job on the cluster with NO SHARE DISK 1778 input/output file should be given as relative to CWd 1779 """ 1780 # To make workspace name(temp) 1781 cur_usr = os.getenv('USER') 1782 1783 if cwd is None: 1784 cwd = os.getcwd() 1785 1786 cwd_cp = cwd.rsplit("/",2) 1787 1788 if not stdout is None: 1789 print "stdout: %s" % stdout 1790 1791 if not os.path.exists(prog): 1792 prog = os.path.join(cwd, prog) 1793 1794 if not required_output and output_files: 1795 required_output = output_files 1796 1797 logger.debug(prog) 1798 if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog : 1799 cwd_arg = cwd+"/arguments" 1800 temp = ' '.join([str(a) for a in argument]) 1801 arg_cmd="echo '"+temp+"' > " + cwd_arg 1802 command = ['htcaas-mgjob-submit','-d',cwd,'-e',os.path.basename(prog)] 1803 if argument : 1804 command.extend(['-a ', '='.join([str(a) for a in argument])]) 1805 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1806 id = a.stdout.read().strip() 1807 1808 else: 1809 cwd_arg = cwd+"/arguments" 1810 temp = ' '.join([str(a) for a in argument]) 1811 temp_file_name = "sub." + os.path.basename(prog) 1812 text = """#!/bin/bash 1813 MYPWD=%(cwd)s 1814 cd $MYPWD 1815 input_files=(%(input_files)s ) 1816 for i in ${input_files[@]} 1817 do 1818 chmod -f +x $i 1819 done 1820 /bin/bash %(prog)s %(arguments)s > %(stdout)s 1821 """ 1822 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog, 1823 'arguments': ' '.join([str(a) for a in argument]), 1824 'program': ' ' if '.py' in prog else 'bash'} 1825 1826 # writing a new script for the submission 1827 new_prog = pjoin(cwd, temp_file_name) 1828 open(new_prog, 'w').write(text % dico) 1829 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 1830 command = ['htcaas-mgjob-submit','-d',cwd,'-e',temp_file_name] 1831 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1832 id = a.stdout.read().strip() 1833 logger.debug(id) 1834 1835 nb_try=0 1836 nb_limit=5 1837 if not id.isdigit() : 1838 print "[ID is not digit]:" + id 1839 1840 while not id.isdigit() : 1841 nb_try+=1 1842 print "[fail_retry]:"+ nb_try 1843 a=misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1844 id = a.stdout.read().strip() 1845 if nb_try > nb_limit : 1846 raise ClusterManagementError, 'fail to submit to the HTCaaS cluster: \n %s' % id 1847 break 1848 1849 self.submitted += 1 1850 self.submitted_ids.append(id) 1851 1852 return id
1853 1854 @multiple_try(nb_try=10, sleep=5)
1855 - def control_one_job(self, id):
1856 """ control the status of a single job with it's cluster id """ 1857 1858 if id == 0 : 1859 status_out ='C' 1860 else : 1861 cmd = 'htcaas-job-status -m '+str(id)+ " -s | grep Status " 1862 status = misc.Popen([cmd], shell=True,stdout=subprocess.PIPE, 1863 stderr=subprocess.PIPE) 1864 error = status.stderr.read() 1865 if status.returncode or error: 1866 raise ClusterManagmentError, 'htcaas-job-submit returns error: %s' % error 1867 status_out= status.stdout.read().strip() 1868 status_out= status_out.split(":",1)[1] 1869 if status_out == 'waiting': 1870 status_out='I' 1871 elif status_out == 'preparing' or status_out == 'running': 1872 status_out = 'R' 1873 elif status_out != 'done': 1874 status_out = 'F' 1875 elif status_out == 'done': 1876 status_out = 'C' 1877 1878 return status_out
1879 
1880 @multiple_try()
1881 - def control(self, me_dir):
1882 """ control the status of a single job with it's cluster id """ 1883 if not self.submitted_ids: 1884 logger.debug("self.submitted_ids not exists") 1885 return 0, 0, 0, 0 1886 1887 ongoing = [] 1888 idle, run, fail = 0, 0, 0 1889 1890 start = self.submitted_ids[0] 1891 end = self.submitted_ids[-1] 1892 1893 cmd = "htcaas-job-status -c "+str(start)+"-"+str(end)#+" -ac" 1894 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 1895 1896 for line in status.stdout: 1897 #ongoing.append(line.split()[0].strip()) 1898 status2 = line.split()[-1] 1899 if status2 is not 'null' or line.split()[0].strip() is not '0': 1900 ongoing.append(line.split()[0].strip()) 1901 logger.debug("["+line.split()[0].strip()+"]"+status2) 1902 if status2 is 'null' or line.split()[0].strip() is '0': 1903 idle += 1 1904 elif status2 in self.idle_tag: 1905 idle += 1 1906 elif status2 in self.running_tag: 1907 run += 1 1908 elif status2 in self.complete_tag: 1909 if not self.check_termination(line.split()[0]): 1910 idle +=1 1911 else: 1912 fail += 1 1913 1914 return idle, run, self.submitted - (idle+run+fail), fail
1915 
1916 @multiple_try()
1917 - def remove(self, *args, **opts):
1918 """Clean the jobson the cluster""" 1919 1920 if not self.submitted_ids: 1921 return 1922 for i in range(len(self.submitted_ids)): 1923 cmd = "htcaas-job-cancel -m %s" % self.submitted_ids[i] 1924 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1925
1926 -class HTCaaS2Cluster(Cluster):
1927 """Class for dealing with cluster submission on a HTCaaS cluster without GPFS """ 1928 1929 name= 'htcaas2' 1930 job_id = 'HTCAAS2_JOBID' 1931 idle_tag = ['waiting'] 1932 running_tag = ['preparing','running'] 1933 complete_tag = ['done'] 1934 1935 @store_input() 1936 @multiple_try()
1937 - def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
1938 log=None, input_files=[], output_files=[], required_output=[],
1939 nb_submit=0):
1940 1941 """Submit the HTCaaS job on the cluster with NO SHARE DISK 1942 input/output file should be given as relative to CWD 1943 """ 1944 if cwd is None: 1945 cwd = os.getcwd() 1946 1947 if not os.path.exists(prog): 1948 prog = os.path.join(cwd, prog) 1949 1950 if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog : 1951 if cwd or prog : 1952 self.submitted_dirs.append(cwd) 1953 self.submitted_exes.append(prog) 1954 else: 1955 logger.debug("cwd and prog not exist->"+cwd+" / "+ os.path.basename(prog)) 1956 1957 if argument : 1958 self.submitted_args.append('='.join([str(a) for a in argument])) 1959 1960 if cwd or prog : 1961 self.submitted += 1 1962 id = self.submitted 1963 self.submitted_ids.append(id) 1964 else: 1965 logger.debug("cwd and prog are not exist! ") 1966 id = 0 1967 1968 else: 1969 temp_file_name = "sub."+ os.path.basename(prog) 1970 text = """#!/bin/bash 1971 MYPWD=%(cwd)s 1972 cd $MYPWD 1973 input_files=(%(input_files)s ) 1974 for i in ${input_files[@]} 1975 do 1976 chmod -f +x $i 1977 done 1978 /bin/bash %(prog)s %(arguments)s > %(stdout)s 1979 """ 1980 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog, 1981 'arguments': ' '.join([str(a) for a in argument]), 1982 'program': ' ' if '.py' in prog else 'bash'} 1983 # writing a new script for the submission 1984 new_prog = pjoin(cwd, temp_file_name) 1985 open(new_prog, 'w').write(text % dico) 1986 misc.Popen(['chmod','+x',new_prog],cwd=cwd) 1987 command = ['htcaas-mgjob-submit','-d',cwd,'-e',new_prog] 1988 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd) 1989 id = a.stdout.read().strip() 1990 logger.debug("[mode2]-["+str(id)+"]") 1991 if cwd and prog : 1992 self.submitted += 1 1993 self.submitted_ids.append(id) 1994 else: 1995 logger.debug("cwd and prog are not exist! ") 1996 id = 0 1997 1998 return id
1999 
2000 @multiple_try()
2001 - def metasubmit(self, me_dir=None):
2002 if self.submitted > 1100 and self.submitted == len(self.submitted_ids):  # very large batches are split into two meta-jobs
2003     tmp_leng = len(self.submitted_ids)/2
2004     tmp_dirs1 = self.submitted_dirs[0:tmp_leng]
2005     tmp_dirs2 = self.submitted_dirs[tmp_leng:]
2006     tmp_exes1 = self.submitted_exes[0:tmp_leng]
2007     tmp_exes2 = self.submitted_exes[tmp_leng:]
2008     command1 = ['htcaas-mgjob-submit', '-d', ":".join([str(a) for a in tmp_dirs1 if a and a != ' ']),
2009                 '-e', ":".join([str(a) for a in tmp_exes1 if a and a != ' '])]
2010     command2 = ['htcaas-mgjob-submit', '-d', ":".join([str(a) for a in tmp_dirs2 if a and a != ' ']),
2011                 '-e', ":".join([str(a) for a in tmp_exes2 if a and a != ' '])]
2012     if len(self.submitted_args) > 0:
2013         tmp_args1 = self.submitted_args[0:tmp_leng]
2014         tmp_args2 = self.submitted_args[tmp_leng:]
2015         command1.extend(['-a', ':'.join([str(a) for a in tmp_args1])])
2016         command2.extend(['-a', ':'.join([str(a) for a in tmp_args2])])
2017     result1 = misc.Popen(command1, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
2018     result2 = misc.Popen(command2, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
2019     me_dir = str(result1.stdout.read().strip()) + "//" + str(result2.stdout.read().strip())
2020 
2021 elif self.submitted > 0 and self.submitted == self.submitted_ids[-1]:
2022     command = ['htcaas-mgjob-submit', '-d', ":".join([str(a) for a in self.submitted_dirs if a and a != ' ']),
2023                '-e', ":".join([str(a) for a in self.submitted_exes if a and a != ' '])]
2024     if len(self.submitted_args) > 0:
2025         command.extend(['-a', ':'.join([str(a) for a in self.submitted_args])])
2026     if self.submitted_dirs[0] or self.submitted_exes[0]:
2027         result = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
2028         me_dir = result.stdout.read().strip()
2029         self.submitted_ids[0] = me_dir
2030     else:
2031         me_dir = self.submitted_ids[-1]
2032 elif self.submitted > 0 and self.submitted != self.submitted_ids[-1]:
2033     me_dir = self.submitted_ids[0]
2034 else:
2035     me_dir = -1
2036 
2037 logger.debug("[" + str(me_dir) + "]")
2038 
2039 self.submitted_dirs = []
2040 self.submitted_exes = []
2041 self.submitted_args = []
2042 
2043 return me_dir
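metasubmit bundles everything recorded by submit2 into one htcaas-mgjob-submit call, colon-joining the per-job directories, executables and (optionally) arguments; once more than 1100 jobs have accumulated the lists are halved and submitted as two meta-jobs whose ids are glued together with '//'. A rough sketch of the command shape, with hypothetical paths (not part of the module):

    dirs = ['/tmp/P1_example/G1', '/tmp/P1_example/G2']
    exes = ['/tmp/P1_example/G1/ajob1', '/tmp/P1_example/G2/ajob2']
    command = ['htcaas-mgjob-submit', '-d', ':'.join(dirs), '-e', ':'.join(exes)]
    # equivalent shell form: htcaas-mgjob-submit -d <dir1>:<dir2> -e <exe1>:<exe2>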
2044 
2045 
2046 @multiple_try(nb_try=10, sleep=5)
2047 - def control_one_job(self, id):
2048 """ control the status of a single job with it's cluster id """ 2049 #logger.debug("CONTROL ONE JOB MODE") 2050 if self.submitted == self.submitted_ids[-1] : 2051 id = self.metasubmit(self) 2052 tempid = self.submitted_ids[-1] 2053 self.submitted_ids.remove(self.submitted_ids[-1]) 2054 self.submitted_ids.append(id) 2055 logger.debug(str(id)+" // "+str(self.submitted_ids[-1])) 2056 2057 if id == 0 : 2058 status_out ='C' 2059 else: 2060 cmd = 'htcaas-job-status -m '+ str(id) + " -s | grep Status " 2061 status = misc.Popen([cmd],shell=True,stdout=subprocess.PIPE, 2062 stderr=subprocess.PIPE) 2063 error = status.stderr.read() 2064 if status.returncode or error: 2065 raise ClusterManagmentError, 'htcaas-job-status returns error: %s' % error 2066 status_out= status.stdout.read().strip() 2067 status_out= status_out.split(":",1)[1] 2068 logger.debug("[["+str(id)+"]]"+status_out) 2069 if status_out == 'waiting': 2070 status_out='I' 2071 elif status_out == 'preparing' or status_out == 'running': 2072 status_out = 'R' 2073 elif status_out != 'done': 2074 status_out = 'F' 2075 elif status_out == 'done': 2076 status_out = 'C' 2077 self.submitted -= 1 2078 2079 return status_out
2080 
2081 @multiple_try()
2082 - def control(self, me_dir):
2083 """ control the status of a single job with it's cluster id """ 2084 if not self.submitted_ids: 2085 logger.debug("self.submitted_ids not exists") 2086 return 0, 0, 0, 0 2087 2088 if "//" in me_dir : 2089 if int(me_dir.split("//")[0]) < int(me_dir.split("//")[1]) : 2090 start = me_dir.split("//")[0] 2091 end = me_dir.split("//")[1] 2092 else : 2093 start = me_dir.split("//")[1] 2094 end = me_dir.split("//")[0] 2095 elif "/" in me_dir : # update 2096 start = 0 2097 end = 0 2098 elif me_dir.isdigit(): 2099 start = me_dir 2100 end = me_dir 2101 elif not me_dir.isdigit(): 2102 me_dir = self.submitted_ids[0] 2103 logger.debug("Meta_ID is not digit(control), self.submitted_ids[0]: "+str(me_dir) ) 2104 2105 ongoing = [] 2106 idle, run, fail, done = 0, 0, 0, 0 2107 2108 cmd = "htcaas-job-status -c "+str(start)+"-"+str(end) +" -ac" 2109 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE) 2110 2111 for line in status.stdout: 2112 status2 = line.split()[-1] 2113 if status2 is not 'null' or line.split()[0].strip() is not '0': 2114 ongoing.append(str(line.split()[0].strip())+"-"+str(line.split()[1].strip())) 2115 logger.debug("["+line.split()[0].strip()+"-"+line.split()[1].strip()+"]"+status2) 2116 2117 if status2 is 'null' or line.split()[0].strip() is '0': 2118 idle += 1 2119 elif status2 in self.idle_tag: 2120 idle += 1 2121 elif status2 in self.running_tag: 2122 run += 1 2123 elif status2 in self.complete_tag: 2124 done += 1 2125 self.submitted -= 1 2126 if not self.check_termination(line.split()[1]): 2127 idle +=1 2128 else: 2129 fail += 1 2130 2131 return idle, run, self.submitted - (idle+run+fail), fail
2132 
2133 @multiple_try()
2134 - def remove(self, *args, **opts):
2135 """Clean the jobson the cluster""" 2136 2137 if not self.submitted_ids: 2138 return 2139 id = self.submitted_ids[0] 2140 if id is not 0 : 2141 cmd = "htcaas-job-cancel -m %s" % str(id) 2142 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
2143 
2144 from_name = {'condor': CondorCluster, 'pbs': PBSCluster, 'sge': SGECluster,
2145              'lsf': LSFCluster, 'ge': GECluster, 'slurm': SLURMCluster,
2146              'htcaas': HTCaaSCluster, 'htcaas2': HTCaaS2Cluster}
2147 
2148 onecore = MultiCore(1)  # create a thread to run simple bash job without having to
2149                         # fork the main process
2150 
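A closing illustration (not part of the module): the from_name mapping is how a scheduler back-end is typically picked from the run options before any job is submitted; the option values below are hypothetical.

    opts = {'cluster_type': 'htcaas2', 'cluster_queue': 'madgraph', 'cluster_nb_retry': 1}
    mycluster = from_name[opts['cluster_type']](**opts)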