14 import subprocess
15 import logging
16 import os
17 import time
18 import re
19 import glob
20 import inspect
21 import sys
22
23 logger = logging.getLogger('madgraph.cluster')
24
25 try:
26 from madgraph import MadGraph5Error
27 import madgraph.various.misc as misc
28 except Exception, error:
29 if __debug__:
30 print str(error)
31 from internal import MadGraph5Error
32 import internal.misc as misc
33
34 pjoin = os.path.join
36 class ClusterManagmentError(MadGraph5Error):
37 pass
38
39 class NotImplemented(MadGraph5Error):
40 pass
41
42
43 multiple_try = misc.multiple_try
44 pjoin = os.path.join
47 def check_interupt(error=KeyboardInterrupt):
48
49 def deco_interupt(f):
50 def deco_f_interupt(self, *args, **opt):
51 try:
52 return f(self, *args, **opt)
53 except error:
54 try:
55 self.remove(*args, **opt)
56 except Exception:
57 pass
58 raise error
59 return deco_f_interupt
60 return deco_interupt
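# check_interupt: if the wrapped call is interrupted, the jobs are removed from
# the scheduler (via self.remove) before the interrupt is re-raised.
# store_input (used below) records the arguments of a submission in
# self.retry_args so that check_termination can resubmit a failed job.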
61
74 return deco_f_store
75 return deco_store
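# File transfer to a scratch area is only needed when running in cluster mode
# (run_mode == 1) or when a cluster_temp_path has been configured.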
76
77 def need_transfer(options):
78 """ This function checks whether compression of the input files is
79 necessary, given the running options. """
80
81 if options['run_mode'] != 1 and options['cluster_temp_path'] is None:
82 return False
83 else:
84 return True
85
86 class Cluster(object):
87 """Basic Class for all cluster type submission"""
88 name = 'mother class'
89 identifier_length = 14
90
91 def __init__(self, *args, **opts):
92 """Init the cluster"""
93
94 self.submitted = 0
95 self.submitted_ids = []
96 self.finish = 0
97 self.submitted_dirs = []
98 self.submitted_exes = []
99 self.submitted_args = []
100
101 if 'cluster_queue' in opts:
102 self.cluster_queue = opts['cluster_queue']
103 else:
104 self.cluster_queue = 'madgraph'
105 if 'cluster_temp_path' in opts:
106 self.temp_dir = opts['cluster_temp_path']
107 else:
108 self.temp_dir = None
109 self.options = {'cluster_status_update': (600, 30)}
110 for key,value in opts.items():
111 self.options[key] = value
112 self.nb_retry = opts['cluster_nb_retry'] if 'cluster_nb_retry' in opts else 0
113 self.cluster_retry_wait = float(opts['cluster_retry_wait']) if 'cluster_retry_wait' in opts else 300
114 self.options = dict(opts)
115 self.retry_args = {}
116
117 self.packet = {}
118 self.id_to_packet = {}
119
120 def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
121 log=None, required_output=[], nb_submit=0):
122 """How to make one submission. Return status id on the cluster."""
123 raise NotImplemented, 'No implementation of how to submit a job to cluster \'%s\'' % self.name
124
125
126 @store_input()
127 def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
128 log=None, input_files=[], output_files=[], required_output=[],
129 nb_submit=0):
130 """How to make one submission. Return status id on the cluster.
131 NO SHARE DISK"""
132
133 if cwd is None:
134 cwd = os.getcwd()
135 if not os.path.exists(prog):
136 prog = os.path.join(cwd, prog)
137
138 if not required_output and output_files:
139 required_output = output_files
140
141 if not hasattr(self, 'temp_dir') or not self.temp_dir or \
142 (input_files == [] == output_files):
143
144 return self.submit(prog, argument, cwd, stdout, stderr, log,
145 required_output=required_output, nb_submit=nb_submit)
146
147 if not input_files and not output_files:
148
149 return self.submit(prog, argument, cwd, stdout, stderr, log,
150 required_output=required_output, nb_submit=nb_submit)
151
152 if cwd is None:
153 cwd = os.getcwd()
154 if not os.path.exists(prog):
155 prog = os.path.join(cwd, prog)
156 temp_file_name = "sub." + os.path.basename(prog) + '.'.join(argument)
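# The wrapper script below copies the input files to a job-local scratch
# directory under cluster_temp_path, runs the executable there, and copies the
# requested output files back to the submission directory when the job ends.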
157
158 text = """#!/bin/bash
159 MYTMP=%(tmpdir)s/run$%(job_id)s
160 MYPWD=%(cwd)s
161 mkdir -p $MYTMP
162 cd $MYPWD
163 input_files=( %(input_files)s )
164 for i in ${input_files[@]}
165 do
166 cp -R -L $i $MYTMP
167 done
168 cd $MYTMP
169 echo '%(arguments)s' > arguments
170 chmod +x ./%(script)s
171 %(program)s ./%(script)s %(arguments)s
172 exit=$?
173 output_files=( %(output_files)s )
174 for i in ${output_files[@]}
175 do
176 cp -r $MYTMP/$i $MYPWD
177 done
178 # if [ "$exit" -eq "0" ]
179 # then
180 rm -rf $MYTMP
181 # fi
182 """
183
184 dico = {'tmpdir' : self.temp_dir, 'script': os.path.basename(prog),
185 'cwd': cwd, 'job_id': self.job_id,
186 'input_files': ' '.join(input_files + [prog]),
187 'output_files': ' '.join(output_files),
188 'arguments': ' '.join([str(a) for a in argument]),
189 'program': ' ' if '.py' in prog else 'bash'}
190
191
192 new_prog = pjoin(cwd, temp_file_name)
193 open(new_prog, 'w').write(text % dico)
194 misc.Popen(['chmod','+x',new_prog],cwd=cwd)
195
196 return self.submit(new_prog, argument, cwd, stdout, stderr, log,
197 required_output=required_output, nb_submit=nb_submit)
198
199
200 def cluster_submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
201 log=None, input_files=[], output_files=[], required_output=[],
202 nb_submit=0, packet_member=None):
203 """This function wraps the cluster submission in a cluster-independent way.
204 It should not be overwritten (except for DAG-type submission)."""
205
206 id = self.submit2(prog, argument, cwd, stdout, stderr, log, input_files,
207 output_files, required_output, nb_submit)
208
209
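# Packet bookkeeping: when a packet_member is given, the job id is attached to
# that Packet so that the packet callback can be triggered once every job of
# the packet has terminated (see check_termination and wait).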
210 if not packet_member:
211 return id
212 else:
213 if isinstance(packet_member, Packet):
214 self.id_to_packet[id] = packet_member
215 packet_member.put(id)
216 if packet_member.tag not in self.packet:
217 self.packet[packet_member.tag] = packet_member
218 else:
219 if packet_member in self.packet:
220 packet = self.packet[packet_member]
221 packet.put(id)
222 self.id_to_packet[id] = packet
223 return id
224
225 def control(self, me_dir=None):
226 """Check the status of the jobs associated with directory me_dir. Return (idle, run, finish, fail)"""
227 if not self.submitted_ids:
228 raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name
229 idle, run, fail = 0, 0, 0
230 for pid in self.submitted_ids[:]:
231 status = self.control_one_job(pid)
232 if status == 'I':
233 idle += 1
234 elif status == 'R':
235 run += 1
236 elif status == 'F':
237 self.finish +=1
238 self.submitted_ids.remove(pid)
239 else:
240 fail += 1
241
242 return idle, run, self.finish, fail
243
244 def control_one_job(self, id):
245 """ control the status of a single job with it's cluster id """
246 raise NotImplemented, 'No implementation of how to control the job status to cluster \'%s\'' % self.name
247
248 def get_jobs_identifier(self, path, second_path=None):
249 """get a unique run_name for all the jobs helps to identify the runs
250 in the controller for some cluster."""
251
252 if second_path:
253 path = os.path.realpath(pjoin(path, second_path))
254 elif not os.path.exists(path):
255 return path
256
257 if 'SubProcesses' in path:
258 target = path.rsplit('/SubProcesses',1)[0]
259 elif 'MCatNLO' in path:
260 target = path.rsplit('/MCatNLO',1)[0]
261 elif 'PY8_parallelization' in path:
262 target = path.rsplit('/PY8_parallelization',1)[0]
263 elif second_path:
264 target=path
265 logger.warning("cluster.get_job_identifier runs unexpectedly. This should be fine but report this message if you have problem.")
266 else:
267 target = path
268
269 if target.endswith('/'):
270 target = target[:-1]
271
272 target = misc.digest(target)[-self.identifier_length:]
273 if not target[0].isalpha():
274 target = 'a' + target[1:]
275
276 return target
277
278
279 @check_interupt()
280 def wait(self, me_dir, fct, minimal_job=0, update_first=None):
281 """Wait until all jobs are finished.
282 If minimal_job is set, return once idle + run drops below that number"""
283
284
285 mode = 1
286 nb_iter = 0
287 nb_short = 0
288 change_at = 5
289
290 if update_first:
291 idle, run, finish, fail = self.control(me_dir)
292 update_first(idle, run, finish)
293
294
295 longtime, shorttime = self.options['cluster_status_update']
296
297 nb_job = 0
298
299 if self.options['cluster_type'] == 'htcaas2':
300 me_dir = self.metasubmit(self)
301
302 while 1:
303 old_mode = mode
304 nb_iter += 1
305 idle, run, finish, fail = self.control(me_dir)
306 if nb_job:
307 if idle + run + finish + fail != nb_job:
308 nb_job = idle + run + finish + fail
309 nb_iter = 1
310 else:
311 nb_job = idle + run + finish + fail
312 if fail:
313 raise ClusterManagmentError('Some Jobs are in a Hold/... state. Please try to investigate or contact the IT team')
314 if idle + run == 0:
315
316 logger.info('All jobs finished')
317 fct(idle, run, finish)
318 break
319 if idle + run < minimal_job:
320 return
321 fct(idle, run, finish)
322
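# Polling strategy: mode 1 sleeps for the short cluster_status_update interval,
# mode 0 for the long one. The loop starts with short checks and switches to
# the long interval once the job counts have been stable for a while; Ctrl-C
# during a long sleep forces a prompt re-check.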
323 if nb_iter < change_at:
324 mode = 1
325 elif idle < run:
326 if old_mode == 0:
327 if nb_short:
328 mode = 0
329
330 elif idle:
331 if nb_iter > change_at + int(longtime)//shorttime:
332 mode = 0
333 else:
334 mode = 1
335 nb_short =0
336 else:
337 mode = 1
338 nb_short = 0
339 elif old_mode == 1:
340 nb_short +=1
341 if nb_short > 3* max(change_at, int(longtime)//shorttime):
342 mode = 0
343 else:
344 mode = 0
345
346
347 if old_mode > mode:
348 logger.info('''Start to wait %ss between checking status.
349 Note that you can change this time in the configuration file.
350 Press ctrl-C to force the update.''' % self.options['cluster_status_update'][0])
351
352
353 if mode == 0:
354 try:
355 time.sleep(self.options['cluster_status_update'][0])
356 except KeyboardInterrupt:
357 logger.info('start to update the status')
358 nb_iter = min(0, change_at -2)
359 nb_short = 0
360 else:
361 time.sleep(self.options['cluster_status_update'][1])
362
363
364 self.submitted = 0
365 self.submitted_ids = []
366
367 def check_termination(self, job_id):
368 """Check the termination of the jobs with job_id and relaunch it if needed."""
369
370
371 if job_id not in self.retry_args:
372 if job_id in self.id_to_packet:
373 nb_in_packet = self.id_to_packet[job_id].remove_one()
374 if nb_in_packet == 0:
375
376 packet = self.id_to_packet[job_id]
377
378 packet.queue.join()
379
380 packet.fct(*packet.args)
381 del self.id_to_packet[job_id]
382 return 'resubmit'
383 else:
384 return True
385
386 args = self.retry_args[job_id]
387 if 'time_check' in args:
388 time_check = args['time_check']
389 else:
390 time_check = 0
391
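# for/else idiom: the else branch below only runs when the loop did not break,
# i.e. when none of the required output files is missing or empty.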
392 for path in args['required_output']:
393 if args['cwd']:
394 path = pjoin(args['cwd'], path)
395
396 if not (os.path.exists(path) and os.stat(path).st_size != 0) :
397 break
398 else:
399
400 if time_check > 0:
401 logger.info('Job %s Finally found the missing output.' % (job_id))
402 del self.retry_args[job_id]
403 self.submitted_ids.remove(job_id)
404
405 if job_id in self.id_to_packet:
406 nb_in_packet = self.id_to_packet[job_id].remove_one()
407 if nb_in_packet == 0:
408
409 packet = self.id_to_packet[job_id]
410
411 packet.queue.join()
412
413 packet.fct(*packet.args)
414 del self.id_to_packet[job_id]
415 return 'resubmit'
416
417 return 'done'
418
419 if time_check == 0:
420 logger.debug('''Job %s: missing output:%s''' % (job_id,path))
421 args['time_check'] = time.time()
422 return 'wait'
423 elif self.cluster_retry_wait > time.time() - time_check:
424 return 'wait'
425
426
427 if self.nb_retry < 0:
428 logger.critical('''Fail to run correctly job %s.
429 with option: %s
430 file missing: %s''' % (job_id, args, path))
431 raw_input('press enter to continue.')
432 elif self.nb_retry == 0:
433 logger.critical('''Fail to run correctly job %s.
434 with option: %s
435 file missing: %s.
436 Stopping all runs.''' % (job_id, args, path))
437 self.remove()
438 elif args['nb_submit'] >= self.nb_retry:
439 logger.critical('''Fail to run correctly job %s.
440 with option: %s
441 file missing: %s
442 Fails %s times
443 No resubmition. ''' % (job_id, args, path, args['nb_submit']))
444 self.remove()
445 else:
446 args['nb_submit'] += 1
447 logger.warning('resubmit job (for the %s times)' % args['nb_submit'])
448 del self.retry_args[job_id]
449 self.submitted_ids.remove(job_id)
450 if 'time_check' in args:
451 del args['time_check']
452 if job_id in self.id_to_packet:
453 self.id_to_packet[job_id].remove_one()
454 args['packet_member'] = self.id_to_packet[job_id]
455 del self.id_to_packet[job_id]
456 self.cluster_submit(**args)
457 else:
458 self.submit2(**args)
459 return 'resubmit'
460 return 'done'
461
462 @check_interupt()
463 def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None,
464 stderr=None, log=None, required_output=[], nb_submit=0,
465 input_files=[], output_files=[]):
466 """launch one job on the cluster and wait for it"""
467
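# stderr == -2 is used as a sentinel meaning "merge stderr into stdout": the
# job writes to a separate .err file which is appended to stdout afterwards.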
468 special_output = False
469 if stderr == -2 and stdout:
470
471 special_output = True
472 stderr = stdout + '.err'
473
474 id = self.submit2(prog, argument, cwd, stdout, stderr, log,
475 required_output=required_output, input_files=input_files,
476 output_files=output_files)
477
478 if self.options['cluster_type']=='htcaas2':
479 if self.submitted == self.submitted_ids[-1]:
480 id = self.metasubmit(self)
481
482 frame = inspect.currentframe()
483 args, _, _, values = inspect.getargvalues(frame)
484 args = dict([(i, values[i]) for i in args if i != 'self'])
485 self.retry_args[id] = args
486
487 nb_wait=0
488 while 1:
489 nb_wait+=1
490 status = self.control_one_job(id)
491 if not status in ['R','I']:
492 status = self.check_termination(id)
493 if status in ['wait']:
494 time.sleep(30)
495 continue
496 elif status in ['resubmit']:
497 id = self.submitted_ids[0]
498 time.sleep(30)
499 continue
500
501 time.sleep(30)
502 break
503 time.sleep(self.options['cluster_status_update'][1])
504
505 if required_output:
506 status = self.check_termination(id)
507 if status == 'wait':
508 run += 1
509 elif status == 'resubmit':
510 idle += 1
511
512
513 if special_output:
514
515
516 for i in range(5):
517 if os.path.exists(stdout):
518 if not os.path.exists(stderr):
519 time.sleep(5)
520 if os.path.exists(stderr):
521 err_text = open(stderr).read()
522 if not err_text:
523 return
524 logger.warning(err_text)
525 text = open(stdout).read()
526 open(stdout,'w').write(text + err_text)
527 else:
528 return
529 time.sleep(10)
530
531 def remove(self, *args, **opts):
532 """ """
533 logger.warning("""This cluster does not support job removal,
534 the jobs are still running on the cluster.""")
535
536 @store_input()
540
541 def modify_interface(self, run_interface):
542 """routine which allow to modify the run_card/mg5cmd object to change the
543 default behavior of the runs.
544 This is called at the time of the compilation of the run_card.
545 Note that this function can be called multiple times by run.
546 """
547
548 return
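# Packet groups several cluster jobs together; when the last job of the packet
# finishes, the stored callback fct(*args) is executed by the cluster code.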
549
550 class Packet(object):
551 """ an object for handling packet of job, it is designed to be thread safe
552 """
553
554 def __init__(self, name, fct, args, opts={}):
555 import Queue
556 import threading
557 self.queue = Queue.Queue()
558 self.tag = name
559 self.fct = fct
560 self.args = args
561 self.opts = opts
562 self.done = threading.Event()
563
564 def put(self, *args, **opts):
565 self.queue.put(*args, **opts)
566
567 append = put
568
569 def remove_one(self):
570 self.queue.get(True)
571 self.queue.task_done()
572 return self.queue.qsize()
573
574 class MultiCore(Cluster):
575 """class for dealing with the submission in multiple node"""
576
577 job_id = "$"
579 def __init__(self, *args, **opt):
578
580 """Init the cluster """
581
582
583 super(MultiCore, self).__init__(self, *args, **opt)
584
585 import Queue
586 import threading
587 import thread
588 self.queue = Queue.Queue()
589 self.done = Queue.Queue()
590 self.submitted = Queue.Queue()
591 self.stoprequest = threading.Event()
592 self.demons = []
593 self.nb_done =0
594 if 'nb_core' in opt:
595 self.nb_core = opt['nb_core']
596 elif isinstance(args[0],int):
597 self.nb_core = args[0]
598 else:
599 self.nb_core = 1
600 self.update_fct = None
601
602 self.lock = threading.Event()
603 self.pids = Queue.Queue()
604 self.done_pid = []
605 self.done_pid_queue = Queue.Queue()
606 self.fail_msg = None
607
608
609 for _ in range(self.nb_core):
610 self.start_demon()
611
612
613 def start_demon(self):
614 import threading
615 t = threading.Thread(target=self.worker)
616 t.daemon = True
617 t.start()
618 self.demons.append(t)
619
620
621 def worker(self):
622 import Queue
623 import thread
624 while not self.stoprequest.isSet():
625 try:
626 args = self.queue.get()
627 tag, exe, arg, opt = args
628 try:
629
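# A string 'exe' is treated as an external program and run through misc.Popen;
# any other 'exe' is assumed to be a Python callable executed in this thread.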
630 if isinstance(exe,str):
631 if os.path.exists(exe) and not exe.startswith('/'):
632 exe = './' + exe
633 if isinstance(opt['stdout'],str):
634 opt['stdout'] = open(opt['stdout'],'w')
635 if opt['stderr'] == None:
636 opt['stderr'] = subprocess.STDOUT
637 if arg:
638 proc = misc.Popen([exe] + arg, **opt)
639 else:
640 proc = misc.Popen(exe, **opt)
641 pid = proc.pid
642 self.pids.put(pid)
643 proc.wait()
644 if proc.returncode not in [0, 143, -15] and not self.stoprequest.isSet():
645 fail_msg = 'program %s launch ends with non zero status: %s. Stop all computation' % \
646 (' '.join([exe]+arg), proc.returncode)
647 logger.warning(fail_msg)
648 self.stoprequest.set()
649 self.remove(fail_msg)
650
651
652
653
654 else:
655 pid = tag
656 self.pids.put(pid)
657
658
659 returncode = exe(*arg, **opt)
660 if returncode != 0:
661 logger.warning("fct %s does not return 0. Stopping the code in a clean way. The error was:\n%s", exe, returncode)
662 self.stoprequest.set()
663 self.remove("fct %s does not return 0:\n %s" % (exe, returncode))
664 except Exception,error:
665 self.fail_msg = sys.exc_info()
666 logger.warning(str(error))
667 self.stoprequest.set()
668 self.remove(error)
669
670 if __debug__:
671 raise self.fail_msg[0], self.fail_msg[1],self.fail_msg[2]
672
673 self.queue.task_done()
674 self.done.put(tag)
675 self.done_pid_queue.put(pid)
676
677 try:
678 self.lock.set()
679 except thread.error:
680 continue
681 except Queue.Empty:
682 continue
683
684
685
686
687 def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
688 log=None, required_output=[], nb_submit=0):
689 """submit a job on multicore machine"""
690
691 tag = (prog, tuple(argument), cwd, nb_submit)
692 if isinstance(prog, str):
693
694 opt = {'cwd': cwd,
695 'stdout':stdout,
696 'stderr': stderr}
697 self.queue.put((tag, prog, argument, opt))
698 self.submitted.put(1)
699 return tag
700 else:
701
702 self.queue.put((tag, prog, argument, {}))
703 self.submitted.put(1)
704 return tag
705
706 def launch_and_wait(self, prog, argument=[], cwd=None, stdout=None,
707 stderr=None, log=None, **opts):
708 """launch one job and wait for it"""
709 if isinstance(stdout, str):
710 stdout = open(stdout, 'w')
711 if isinstance(stderr, str):
712 stderr = open(stderr, 'w')
713 return misc.call([prog] + argument, stdout=stdout, stderr=stderr, cwd=cwd)
714
715 def remove(self, error=None):
716 """Ensure that all thread are killed"""
717
718
719 self.stoprequest.set()
720 if error and not self.fail_msg:
721 self.fail_msg = error
722
723
724 while not self.done_pid_queue.empty():
725 pid = self.done_pid_queue.get()
726 self.done_pid.append(pid)
727
728
729 while not self.pids.empty():
730 pid = self.pids.get()
731 self.pids.task_done()
732 if isinstance(pid, tuple):
733 continue
734 if pid in self.done_pid:
735 continue
736 out = os.system('CPIDS=$(pgrep -P %(pid)s); kill -15 $CPIDS > /dev/null 2>&1' \
737 % {'pid':pid} )
738 out = os.system('kill -15 %(pid)s > /dev/null 2>&1' % {'pid':pid} )
739
740
741 def wait(self, me_dir, update_status, update_first=None):
742 """Wait until all the jobs are done. This function also ensures that
743 submissions by packet are handled correctly (i.e. submits the packet function)"""
744
745 import Queue
746 import threading
747
748 try:
749 last_status = (0, 0, 0)
750 sleep_time = 1
751 use_lock = True
752 first = True
753 while True:
754 force_one_more_loop = False
755
756
757
758 while self.done.qsize():
759 try:
760 tag = self.done.get(True, 1)
761 except Queue.Empty:
762 pass
763 else:
764 if self.id_to_packet and tuple(tag) in self.id_to_packet:
765 packet = self.id_to_packet[tuple(tag)]
766 remaining = packet.remove_one()
767 if remaining == 0:
768
769 packet.queue.join()
770 self.submit(packet.fct, packet.args)
771 force_one_more_loop = True
772 self.nb_done += 1
773 self.done.task_done()
774
775
776
777 Idle = self.queue.qsize()
778 Done = self.nb_done + self.done.qsize()
779 Running = max(0, self.submitted.qsize() - Idle - Done)
780
781 if Idle + Running <= 0 and not force_one_more_loop:
782 update_status(Idle, Running, Done)
783
784
785 self.queue.join()
786 break
787
788 if (Idle, Running, Done) != last_status:
789 if first and update_first:
790 update_first(Idle, Running, Done)
791 first = False
792 else:
793 update_status(Idle, Running, Done)
794 last_status = (Idle, Running, Done)
795
796
797 while not self.done_pid_queue.empty():
798 pid = self.done_pid_queue.get()
799 self.done_pid.append(pid)
800 self.done_pid_queue.task_done()
801
802
803
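# Two waiting schemes: while jobs are still queued, block on the lock that the
# workers set after each completion (300 s timeout); otherwise fall back to a
# plain sleep whose duration slowly grows up to 180 s between status updates.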
804 if use_lock:
805
806 use_lock = self.lock.wait(300)
807 self.lock.clear()
808 if not use_lock and Idle > 0:
809 use_lock = True
810 else:
811
812
813 time.sleep(sleep_time)
814 sleep_time = min(sleep_time + 2, 180)
815 if update_first:
816 update_first(Idle, Running, Done)
817
818 if self.stoprequest.isSet():
819 if isinstance(self.fail_msg, Exception):
820 raise self.fail_msg
821 elif isinstance(self.fail_msg, str):
822 raise Exception, self.fail_msg
823 else:
824 misc.sprint(self.fail_msg)
825 raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2]
826
827 try:
828 self.lock.clear()
829 except Exception:
830 pass
831 self.done = Queue.Queue()
832 self.done_pid = []
833 self.done_pid_queue = Queue.Queue()
834 self.nb_done = 0
835 self.submitted = Queue.Queue()
836 self.pids = Queue.Queue()
837 self.stoprequest.clear()
838
839 except KeyboardInterrupt:
840
841 if isinstance(self.fail_msg, Exception):
842 raise self.fail_msg
843 elif isinstance(self.fail_msg, str):
844 raise Exception, self.fail_msg
845 elif self.fail_msg:
846 raise self.fail_msg[0], self.fail_msg[1], self.fail_msg[2]
847
848 raise
849
850 class CondorCluster(Cluster):
851 """Basic class for dealing with cluster submission"""
852
853 name = 'condor'
854 job_id = 'CONDOR_ID'
855
856
857
858 @multiple_try()
859 def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
860 required_output=[], nb_submit=0):
861 """Submit a job prog to a Condor cluster"""
862
863 text = """Executable = %(prog)s
864 output = %(stdout)s
865 error = %(stderr)s
866 log = %(log)s
867 %(argument)s
868 environment = CONDOR_ID=$(Cluster).$(Process)
869 Universe = vanilla
870 notification = Error
871 Initialdir = %(cwd)s
872 %(requirement)s
873 getenv=True
874 queue 1
875 """
876
877 if self.cluster_queue not in ['None', None]:
878 requirement = 'Requirements = %s=?=True' % self.cluster_queue
879 else:
880 requirement = ''
881
882 if cwd is None:
883 cwd = os.getcwd()
884 if stdout is None:
885 stdout = '/dev/null'
886 if stderr is None:
887 stderr = '/dev/null'
888 if log is None:
889 log = '/dev/null'
890 if not os.path.exists(prog):
891 prog = os.path.join(cwd, prog)
892 if argument:
893 argument = 'Arguments = %s' % ' '.join(argument)
894 else:
895 argument = ''
896
897
898 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout,
899 'stderr': stderr,'log': log,'argument': argument,
900 'requirement': requirement}
901
902
903 a = misc.Popen(['condor_submit'], stdout=subprocess.PIPE,
904 stdin=subprocess.PIPE)
905 output, _ = a.communicate(text % dico)
906
907
908
909
910 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE)
911 try:
912 id = pat.search(output).groups()[0]
913 except:
914 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
915 % output
916 self.submitted += 1
917 self.submitted_ids.append(id)
918 return id
919
920 @store_input()
921 @multiple_try()
922 def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
923 log=None, input_files=[], output_files=[], required_output=[],
924 nb_submit=0):
925 """Submit the job on the cluster NO SHARE DISK
926 input/output files should be given relative to cwd
927 """
928
929 if not required_output and output_files:
930 required_output = output_files
931
932 if (input_files == [] == output_files):
933 return self.submit(prog, argument, cwd, stdout, stderr, log,
934 required_output=required_output, nb_submit=nb_submit)
935
936 text = """Executable = %(prog)s
937 output = %(stdout)s
938 error = %(stderr)s
939 log = %(log)s
940 %(argument)s
941 should_transfer_files = YES
942 when_to_transfer_output = ON_EXIT
943 transfer_input_files = %(input_files)s
944 %(output_files)s
945 Universe = vanilla
946 notification = Error
947 Initialdir = %(cwd)s
948 %(requirement)s
949 getenv=True
950 queue 1
951 """
952
953 if self.cluster_queue not in ['None', None]:
954 requirement = 'Requirements = %s=?=True' % self.cluster_queue
955 else:
956 requirement = ''
957
958 if cwd is None:
959 cwd = os.getcwd()
960 if stdout is None:
961 stdout = '/dev/null'
962 if stderr is None:
963 stderr = '/dev/null'
964 if log is None:
965 log = '/dev/null'
966 if not os.path.exists(prog):
967 prog = os.path.join(cwd, prog)
968 if argument:
969 argument = 'Arguments = %s' % ' '.join([str(a) for a in argument])
970 else:
971 argument = ''
972
973 if input_files:
974 input_files = ','.join(input_files)
975 else:
976 input_files = ''
977 if output_files:
978 output_files = 'transfer_output_files = %s' % ','.join(output_files)
979 else:
980 output_files = ''
981
982
983
984 dico = {'prog': prog, 'cwd': cwd, 'stdout': stdout,
985 'stderr': stderr,'log': log,'argument': argument,
986 'requirement': requirement, 'input_files':input_files,
987 'output_files':output_files}
988
989
990 a = subprocess.Popen(['condor_submit'], stdout=subprocess.PIPE,
991 stdin=subprocess.PIPE)
992 output, _ = a.communicate(text % dico)
993
994
995
996
997 pat = re.compile("submitted to cluster (\d*)",re.MULTILINE)
998 try:
999 id = pat.search(output).groups()[0]
1000 except:
1001 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1002 % output
1003 self.submitted += 1
1004 self.submitted_ids.append(id)
1005 return id
1006
1007
1008
1009
1010
1011 @multiple_try(nb_try=10, sleep=10)
1012 def control_one_job(self, id):
1013 """ control the status of a single job with it's cluster id """
1014 cmd = 'condor_q '+str(id)+" -format \'%-2s \\n\' \'ifThenElse(JobStatus==0,\"U\",ifThenElse(JobStatus==1,\"I\",ifThenElse(JobStatus==2,\"R\",ifThenElse(JobStatus==3,\"X\",ifThenElse(JobStatus==4,\"C\",ifThenElse(JobStatus==5,\"H\",ifThenElse(JobStatus==6,\"E\",string(JobStatus))))))))\'"
1015 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
1016 stderr=subprocess.PIPE)
1017
1018 error = status.stderr.read()
1019 if status.returncode or error:
1020 raise ClusterManagmentError, 'condor_q returns error: %s' % error
1021
1022 return status.stdout.readline().strip()
1023
1024 jobstatus = {'0':'U', '1':'I','2':'R','3':'X','4':'C','5':'H','6':'E'}
1025 @check_interupt()
1026 @multiple_try(nb_try=10, sleep=10)
1027 def control(self, me_dir=None):
1028 """Check the status of all submitted jobs. Return (idle, run, finish, fail)."""
1029
1030 if not self.submitted_ids:
1031 return 0, 0, 0, 0
1032
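# condor_q is called on chunks of at most 15000 job ids so that the generated
# command line stays within the shell's argument-length limits.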
1033 packet = 15000
1034 idle, run, fail = 0, 0, 0
1035 ongoing = []
1036 for i in range(1+(len(self.submitted_ids)-1)//packet):
1037 start = i * packet
1038 stop = (i+1) * packet
1039 cmd = "condor_q " + ' '.join(self.submitted_ids[start:stop]) + \
1040 " -format \"%d \" ClusterId " + \
1041 " -format \"%d\\n\" JobStatus "
1042
1043 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
1044 stderr=subprocess.PIPE)
1045 error = status.stderr.read()
1046 if status.returncode or error:
1047 raise ClusterManagmentError, 'condor_q returns error: %s' % error
1048
1049 for line in status.stdout:
1050 id, status = line.strip().split()
1051 status = self.jobstatus[status]
1052 ongoing.append(id)
1053 if status in ['I','U']:
1054 idle += 1
1055 elif status == 'R':
1056 run += 1
1057 elif status != 'C':
1058 fail += 1
1059
1060 for id in list(self.submitted_ids):
1061 if id not in ongoing:
1062 status = self.check_termination(id)
1063 if status == 'wait':
1064 run += 1
1065 elif status == 'resubmit':
1066 idle += 1
1067
1068 return idle, run, self.submitted - (idle+run+fail), fail
1069
1070 @multiple_try()
1071 def remove(self, *args, **opts):
1072 """Clean the jobs on the cluster"""
1073
1074 if not self.submitted_ids:
1075 return
1076 cmd = "condor_rm %s" % ' '.join(self.submitted_ids)
1077
1078 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1079 self.submitted_ids = []
1080
1081 class PBSCluster(Cluster):
1082 """Basic class for dealing with cluster submission"""
1083
1084 name = 'pbs'
1085 job_id = 'PBS_JOBID'
1086 idle_tag = ['Q']
1087 running_tag = ['T','E','R']
1088 complete_tag = ['C']
1089
1090 maximum_submited_jobs = 2500
1091
1092 @multiple_try()
1093 def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1094 required_output=[], nb_submit=0):
1095 """Submit a job prog to a PBS cluster"""
1096
1097 me_dir = self.get_jobs_identifier(cwd, prog)
1098
1099 if len(self.submitted_ids) > self.maximum_submited_jobs:
1100 fct = lambda idle, run, finish: logger.info('Waiting for free slot: %s %s %s' % (idle, run, finish))
1101 self.wait(me_dir, fct, self.maximum_submited_jobs)
1102
1103
1104 text = ""
1105 if cwd is None:
1106 cwd = os.getcwd()
1107 else:
1108 text = " cd %s;" % cwd
1109 if stdout is None:
1110 stdout = '/dev/null'
1111 if stderr is None:
1112 stderr = '/dev/null'
1113 elif stderr == -2:
1114 stderr = stdout
1115 if log is None:
1116 log = '/dev/null'
1117
1118 if not os.path.isabs(prog):
1119 text += "./%s" % prog
1120 else:
1121 text+= prog
1122
1123 if argument:
1124 text += ' ' + ' '.join(argument)
1125
1126 command = ['qsub','-o', stdout,
1127 '-N', me_dir,
1128 '-e', stderr,
1129 '-V']
1130
1131 if self.cluster_queue and self.cluster_queue != 'None':
1132 command.extend(['-q', self.cluster_queue])
1133
1134 a = misc.Popen(command, stdout=subprocess.PIPE,
1135 stderr=subprocess.STDOUT,
1136 stdin=subprocess.PIPE, cwd=cwd)
1137
1138 output = a.communicate(text)[0]
1139 id = output.split('.')[0]
1140 if not id.isdigit() or a.returncode !=0:
1141 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1142 % output
1143
1144 self.submitted += 1
1145 self.submitted_ids.append(id)
1146 return id
1147
1148 @multiple_try()
1149 def control_one_job(self, id):
1150 """ control the status of a single job with it's cluster id """
1151 cmd = 'qstat '+str(id)
1152 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
1153 stderr=subprocess.STDOUT)
1154
1155 for line in status.stdout:
1156 line = line.strip()
1157 if 'cannot connect to server' in line or 'cannot read reply' in line:
1158 raise ClusterManagmentError, 'server disconnected'
1159 if 'Unknown' in line:
1160 return 'F'
1161 elif line.startswith(str(id)):
1162 jobstatus = line.split()[4]
1163 else:
1164 jobstatus=""
1165
1166 if status.returncode != 0 and status.returncode is not None:
1167 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode
1168 if jobstatus in self.idle_tag:
1169 return 'I'
1170 elif jobstatus in self.running_tag:
1171 return 'R'
1172 return 'F'
1173
1174
1175 @multiple_try()
1176 def control(self, me_dir):
1177 """Check the status of all submitted jobs. Return (idle, run, finish, fail)."""
1178 cmd = "qstat"
1179 status = misc.Popen([cmd], stdout=subprocess.PIPE)
1180
1181 me_dir = self.get_jobs_identifier(me_dir)
1182
1183 ongoing = []
1184
1185 idle, run, fail = 0, 0, 0
1186 for line in status.stdout:
1187 if 'cannot connect to server' in line or 'cannot read reply' in line:
1188 raise ClusterManagmentError, 'server disconnected'
1189 if me_dir in line:
1190 ongoing.append(line.split()[0].split('.')[0])
1191 status2 = line.split()[4]
1192 if status2 in self.idle_tag:
1193 idle += 1
1194 elif status2 in self.running_tag:
1195 run += 1
1196 elif status2 in self.complete_tag:
1197 if not self.check_termination(line.split()[0].split('.')[0]):
1198 idle += 1
1199 else:
1200 fail += 1
1201
1202 if status.returncode != 0 and status.returncode is not None:
1203 raise ClusterManagmentError, 'server fails in someway (errorcode %s)' % status.returncode
1204
1205 for id in list(self.submitted_ids):
1206 if id not in ongoing:
1207 status2 = self.check_termination(id)
1208 if status2 == 'wait':
1209 run += 1
1210 elif status2 == 'resubmit':
1211 idle += 1
1212
1213 return idle, run, self.submitted - (idle+run+fail), fail
1214
1215 @multiple_try()
1216 def remove(self, *args, **opts):
1217 """Clean the jobs on the cluster"""
1218
1219 if not self.submitted_ids:
1220 return
1221 cmd = "qdel %s" % ' '.join(self.submitted_ids)
1222 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1223 self.submitted_ids = []
1224
1226 class SGECluster(Cluster):
1227 """Basic class for dealing with cluster submission"""
1228
1229
1230 name = 'sge'
1231 job_id = 'JOB_ID'
1232 idle_tag = ['qw', 'hqw','hRqw','w']
1233 running_tag = ['r','t','Rr','Rt']
1234 identifier_length = 10
1235
1237 """replace string for path issues"""
1238 location = os.path.realpath(location)
1239 homePath = os.getenv("HOME")
1240 if homePath:
1241 location = location.replace(homePath,'$HOME')
1242 return location
1243
1244 @multiple_try()
1245 def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1246 required_output=[], nb_submit=0):
1247 """Submit a job prog to an SGE cluster"""
1248
1249 me_dir = self.get_jobs_identifier(cwd, prog)
1250
1251
1252 if cwd is None:
1253
1254 cwd = self.def_get_path(os.getcwd())
1255 cwd1 = self.def_get_path(cwd)
1256 text = " cd %s;" % cwd1
1257 if stdout is None:
1258 stdout = '/dev/null'
1259 else:
1260 stdout = self.def_get_path(stdout)
1261 if stderr is None:
1262 stderr = '/dev/null'
1263 elif stderr == -2:
1264 stderr = stdout
1265 else:
1266 stderr = self.def_get_path(stderr)
1267
1268 if log is None:
1269 log = '/dev/null'
1270 else:
1271 log = self.def_get_path(log)
1272
1273 text += prog
1274 if argument:
1275 text += ' ' + ' '.join(argument)
1276
1277
1278
1279
1280 homePath = os.getenv("HOME")
1281 if homePath:
1282 text = text.replace(homePath,'$HOME')
1283
1284 logger.debug("!=== input %s" % text)
1285 logger.debug("!=== output %s" % stdout)
1286 logger.debug("!=== error %s" % stderr)
1287 logger.debug("!=== logs %s" % log)
1288
1289 command = ['qsub','-o', stdout,
1290 '-N', me_dir,
1291 '-e', stderr,
1292 '-V']
1293
1294 if self.cluster_queue and self.cluster_queue != 'None':
1295 command.extend(['-q', self.cluster_queue])
1296
1297 a = misc.Popen(command, stdout=subprocess.PIPE,
1298 stderr=subprocess.STDOUT,
1299 stdin=subprocess.PIPE, cwd=cwd)
1300
1301 output = a.communicate(text)[0]
1302 id = output.split(' ')[2]
1303 if not id.isdigit():
1304 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1305 % output
1306 self.submitted += 1
1307 self.submitted_ids.append(id)
1308 logger.debug(output)
1309
1310 return id
1311
1312 @multiple_try()
1313 def control_one_job(self, id):
1314 """ control the status of a single job with it's cluster id """
1315
1316 cmd = 'qstat '
1317 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1318 for line in status.stdout:
1319
1320
1321
1322
1323
1324
1325 if str(id) in line:
1326 status = line.split()[4]
1327
1328 if status in self.idle_tag:
1329 return 'I'
1330 elif status in self.running_tag:
1331 return 'R'
1332 return 'F'
1333
1334 @multiple_try()
1335 def control(self, me_dir):
1336 """Check the status of all submitted jobs. Return (idle, run, finish, fail)."""
1337 cmd = "qstat "
1338 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1339
1340 me_dir = self.get_jobs_identifier(me_dir)
1341
1342 finished = list(self.submitted_ids)
1343
1344 idle, run, fail = 0, 0, 0
1345 for line in status.stdout:
1346 if me_dir in line:
1347 id,_,_,_,status = line.split()[:5]
1348 if status in self.idle_tag:
1349 idle += 1
1350 finished.remove(id)
1351 elif status in self.running_tag:
1352 run += 1
1353 finished.remove(id)
1354 else:
1355 logger.debug(line)
1356 fail += 1
1357 finished.remove(id)
1358
1359 for id in finished:
1360 self.check_termination(id)
1361
1362 return idle, run, self.submitted - (idle+run+fail), fail
1363
1364
1365
1366 @multiple_try()
1367 def remove(self, *args, **opts):
1368 """Clean the jobs on the cluster"""
1369
1370 if not self.submitted_ids:
1371 return
1372 cmd = "qdel %s" % ' '.join(self.submitted_ids)
1373 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1374 self.submitted_ids = []
1375
1377 class LSFCluster(Cluster):
1378 """Basic class for dealing with cluster submission"""
1379
1380 name = 'lsf'
1381 job_id = 'LSB_JOBID'
1382
1383 @multiple_try()
1384 def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1385 required_output=[], nb_submit=0):
1386 """Submit the job prog to an LSF cluster"""
1387
1388
1389 me_dir = self.get_jobs_identifier(cwd, prog)
1390
1391 text = ""
1392 command = ['bsub', '-C0', '-J', me_dir]
1393 if cwd is None:
1394 cwd = os.getcwd()
1395 else:
1396 text = " cd %s;" % cwd
1397 if stdout and isinstance(stdout, str):
1398 command.extend(['-o', stdout])
1399 if stderr and isinstance(stderr, str):
1400 command.extend(['-e', stderr])
1401 elif stderr == -2:
1402 pass
1403 if log is None:
1404 log = '/dev/null'
1405
1406 text += prog
1407 if argument:
1408 text += ' ' + ' '.join(argument)
1409
1410 if self.cluster_queue and self.cluster_queue != 'None':
1411 command.extend(['-q', self.cluster_queue])
1412
1413 a = misc.Popen(command, stdout=subprocess.PIPE,
1414 stderr=subprocess.STDOUT,
1415 stdin=subprocess.PIPE, cwd=cwd)
1416
1417 output = a.communicate(text)[0]
1418
1419 try:
1420 id = output.split('>',1)[0].split('<')[1]
1421 except:
1422 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1423 % output
1424 if not id.isdigit():
1425 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1426 % output
1427 self.submitted += 1
1428 self.submitted_ids.append(id)
1429 return id
1430
1431
1432 @multiple_try()
1433 def control_one_job(self, id):
1434 """ control the status of a single job with it's cluster id """
1435
1436 cmd = 'bjobs '+str(id)
1437 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1438
1439 for line in status.stdout:
1440 line = line.strip().upper()
1441 if 'JOBID' in line:
1442 continue
1443 elif str(id) not in line:
1444 continue
1445 status = line.split()[2]
1446 if status == 'RUN':
1447 return 'R'
1448 elif status == 'PEND':
1449 return 'I'
1450 elif status == 'DONE':
1451 return 'F'
1452 else:
1453 return 'H'
1454 return 'F'
1455
1456 @multiple_try()
1457 def control(self, me_dir=None):
1458 """Check the status of all submitted jobs. Return (idle, run, finish, fail)."""
1459
1460 if not self.submitted_ids:
1461 return 0, 0, 0, 0
1462
1463 cmd = "bjobs " + ' '.join(self.submitted_ids)
1464 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1465
1466 jobstatus = {}
1467 for line in status.stdout:
1468 line = line.strip()
1469 if 'JOBID' in line:
1470 continue
1471 splitline = line.split()
1472 id = splitline[0]
1473 if id not in self.submitted_ids:
1474 continue
1475 jobstatus[id] = splitline[2]
1476
1477 idle, run, fail = 0, 0, 0
1478 for id in self.submitted_ids[:]:
1479 if id in jobstatus:
1480 status = jobstatus[id]
1481 else:
1482 status = 'MISSING'
1483 if status == 'RUN':
1484 run += 1
1485 elif status == 'PEND':
1486 idle += 1
1487 else:
1488 status = self.check_termination(id)
1489 if status == 'wait':
1490 run += 1
1491 elif status == 'resubmit':
1492 idle += 1
1493
1494 return idle, run, self.submitted - (idle+run+fail), fail
1495
1496 @multiple_try()
1497 def remove(self, *args,**opts):
1498 """Clean the jobs on the cluster"""
1499
1500 if not self.submitted_ids:
1501 return
1502 cmd = "bkill %s" % ' '.join(self.submitted_ids)
1503 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1504 self.submitted_ids = []
1505
1506 class GECluster(Cluster):
1507 """Class for dealing with cluster submission on a GE cluster"""
1508
1509 name = 'ge'
1510 job_id = 'JOB_ID'
1511 idle_tag = ['qw']
1512 running_tag = ['r']
1513
1514 @multiple_try()
1515 def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1516 required_output=[], nb_submit=0):
1517 """Submit a job prog to a GE cluster"""
1518
1519 text = ""
1520 if cwd is None:
1521 cwd = os.getcwd()
1522 else:
1523 text = " cd %s; bash " % cwd
1524 if stdout is None:
1525 stdout = os.path.join(cwd, "log.%s" % prog.split('/')[-1])
1526 if stderr is None:
1527 stderr = os.path.join(cwd, "err.%s" % prog.split('/')[-1])
1528 elif stderr == -2:
1529 stderr = stdout
1530 if log is None:
1531 log = '/dev/null'
1532
1533 text += prog
1534 if argument:
1535 text += ' ' + ' '.join(argument)
1536 text += '\n'
1537 tmp_submit = os.path.join(cwd, 'tmp_submit')
1538 open(tmp_submit,'w').write(text)
1539
1540 a = misc.Popen(['qsub','-o', stdout,
1541 '-e', stderr,
1542 tmp_submit],
1543 stdout=subprocess.PIPE,
1544 stderr=subprocess.STDOUT,
1545 stdin=subprocess.PIPE, cwd=cwd)
1546
1547 output = a.communicate()[0]
1548
1549 pat = re.compile("Your job (\d*) \(",re.MULTILINE)
1550 try:
1551 id = pat.search(output).groups()[0]
1552 except:
1553 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1554 % output
1555 self.submitted += 1
1556 self.submitted_ids.append(id)
1557 return id
1558
1559 @multiple_try()
1560 def control_one_job(self, id):
1561 """ control the status of a single job with it's cluster id """
1562 cmd = 'qstat | grep '+str(id)
1563 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1564 if not status:
1565 return 'F'
1566
1567 pat = re.compile("^(\d+)\s+[\d\.]+\s+[\w\d\.]+\s+[\w\d\.]+\s+(\w+)\s")
1568 stat = ''
1569 for line in status.stdout.read().split('\n'):
1570 if not line:
1571 continue
1572 line = line.strip()
1573 try:
1574 groups = pat.search(line).groups()
1575 except:
1576 raise ClusterManagmentError, 'bad syntax for stat: \n\"%s\"' % line
1577 if groups[0] != id: continue
1578 stat = groups[1]
1579 if not stat:
1580 return 'F'
1581 if stat in self.idle_tag:
1582 return 'I'
1583 if stat in self.running_tag:
1584 return 'R'
1585
1586 @multiple_try()
1587 def control(self, me_dir=None):
1588 """Check the status of job associated to directory me_dir. return (idle, run, finish, fail)"""
1589 if not self.submitted_ids:
1590 return 0, 0, 0, 0
1591 idle, run, fail = 0, 0, 0
1592 ongoing = []
1593 for statusflag in ['p', 'r', 'sh']:
1594 cmd = 'qstat -s %s' % statusflag
1595 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1596
1597 pat = re.compile("^(\d+)")
1598 for line in status.stdout.read().split('\n'):
1599 line = line.strip()
1600 try:
1601 id = pat.search(line).groups()[0]
1602 except Exception:
1603 pass
1604 else:
1605 if id not in self.submitted_ids:
1606 continue
1607 ongoing.append(id)
1608 if statusflag == 'p':
1609 idle += 1
1610 if statusflag == 'r':
1611 run += 1
1612 if statusflag == 'sh':
1613 fail += 1
1614 for id in list(self.submitted_ids):
1615 if id not in ongoing:
1616 self.check_termination(id)
1617
1618
1619 return idle, run, self.submitted - idle - run - fail, fail
1620
1621 @multiple_try()
1622 def remove(self, *args, **opts):
1623 """Clean the jobs on the cluster"""
1624
1625 if not self.submitted_ids:
1626 return
1627 cmd = "qdel %s" % ' '.join(self.submitted_ids)
1628 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1629 self.submitted_ids = []
1630
1631 def asyncrone_run(exe, argument=[], cwd=None, stdout=None, **opt):
1632 """start a computation and not wait for it to finish.
1633 this fonction returns a lock which is locked as long as the job is
1634 running."""
1635
1636 mc = MultiCore(1)
1637 mc.submit(exe, argument, cwd, stdout, **opt)
1638 mc.need_waiting = True
1639 return mc.lock
1640
1642 class SLURMCluster(Cluster):
1643 """Basic class for dealing with cluster submission"""
1644
1645 name = 'slurm'
1646 job_id = 'SLURM_JOBID'
1647 idle_tag = ['Q','PD','S','CF']
1648 running_tag = ['R', 'CG']
1649 complete_tag = ['C']
1650 identifier_length = 8
1651
1652 @multiple_try()
1653 def submit(self, prog, argument=[], cwd=None, stdout=None, stderr=None, log=None,
1654 required_output=[], nb_submit=0):
1655 """Submit a job prog to a SLURM cluster"""
1656
1657 me_dir = self.get_jobs_identifier(cwd, prog)
1658
1659
1660 if cwd is None:
1661 cwd = os.getcwd()
1662 if stdout is None:
1663 stdout = '/dev/null'
1664 if stderr is None:
1665 stderr = '/dev/null'
1666 elif stderr == -2:
1667 stderr = stdout
1668 if log is None:
1669 log = '/dev/null'
1670
1671 command = ['sbatch', '-o', stdout,
1672 '-J', me_dir,
1673 '-e', stderr, prog] + argument
1674
1675 if self.cluster_queue and self.cluster_queue != 'None':
1676 command.insert(1, '-p')
1677 command.insert(2, self.cluster_queue)
1678
1679 a = misc.Popen(command, stdout=subprocess.PIPE,
1680 stderr=subprocess.STDOUT,
1681 stdin=subprocess.PIPE, cwd=cwd)
1682
1683 output = a.communicate()
1684 output_arr = output[0].split(' ')
1685 id = output_arr[3].rstrip()
1686
1687 if not id.isdigit():
1688 raise ClusterManagmentError, 'fail to submit to the cluster: \n%s' \
1689 % (output[0] + '\n' + output[1])
1690
1691 self.submitted += 1
1692 self.submitted_ids.append(id)
1693 return id
1694
1695 @multiple_try()
1696 def control_one_job(self, id):
1697 """ control the status of a single job with it's cluster id """
1698 cmd = 'squeue -j '+str(id)
1699 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE,
1700 stderr=open(os.devnull,'w'))
1701
1702 for line in status.stdout:
1703 line = line.strip()
1704 if 'Invalid' in line:
1705 return 'F'
1706 elif line.startswith(str(id)):
1707 status = line.split()[4]
1708 if status in self.idle_tag:
1709 return 'I'
1710 elif status in self.running_tag:
1711 return 'R'
1712 return 'F'
1713
1714 @multiple_try()
1715 def control(self, me_dir):
1716 """Check the status of all submitted jobs. Return (idle, run, finish, fail)."""
1717 cmd = "squeue"
1718 pstatus = misc.Popen([cmd], stdout=subprocess.PIPE)
1719
1720 me_dir = self.get_jobs_identifier(me_dir)
1721
1722 idle, run, fail = 0, 0, 0
1723 ongoing=[]
1724 for line in pstatus.stdout:
1725 if me_dir in line:
1726 id, _, _,_ , status,_ = line.split(None,5)
1727 ongoing.append(id)
1728 if status in self.idle_tag:
1729 idle += 1
1730 elif status in self.running_tag:
1731 run += 1
1732 elif status in self.complete_tag:
1733 status = self.check_termination(id)
1734 if status == 'wait':
1735 run += 1
1736 elif status == 'resubmit':
1737 idle += 1
1738 else:
1739 fail += 1
1740
1741
1742 for id in list(self.submitted_ids):
1743 if id not in ongoing:
1744 status = self.check_termination(id)
1745 if status == 'wait':
1746 run += 1
1747 elif status == 'resubmit':
1748 idle += 1
1749
1750
1751 return idle, run, self.submitted - (idle+run+fail), fail
1752
1753 @multiple_try()
1754 def remove(self, *args, **opts):
1755 """Clean the jobs on the cluster"""
1756
1757 if not self.submitted_ids:
1758 return
1759 cmd = "scancel %s" % ' '.join(self.submitted_ids)
1760 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1761 self.submitted_ids = []
1762
1763 class HTCaaSCluster(Cluster):
1764 """Class for dealing with cluster submission on a HTCaaS cluster using GPFS """
1765
1766 name= 'htcaas'
1767 job_id = 'HTCAAS_JOBID'
1768 idle_tag = ['waiting']
1769 running_tag = ['preparing','running']
1770 complete_tag = ['done']
1771
1772 @store_input()
1773 @multiple_try()
1774 def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
1775 log=None, input_files=[], output_files=[], required_output=[],
1776 nb_submit=0):
1777 """Submit the HTCaaS job on the cluster with NO SHARE DISK
1778 input/output files should be given relative to cwd
1779 """
1780
1781 cur_usr = os.getenv('USER')
1782
1783 if cwd is None:
1784 cwd = os.getcwd()
1785
1786 cwd_cp = cwd.rsplit("/",2)
1787
1788 if not stdout is None:
1789 print "stdout: %s" % stdout
1790
1791 if not os.path.exists(prog):
1792 prog = os.path.join(cwd, prog)
1793
1794 if not required_output and output_files:
1795 required_output = output_files
1796
1797 logger.debug(prog)
1798 if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog :
1799 cwd_arg = cwd+"/arguments"
1800 temp = ' '.join([str(a) for a in argument])
1801 arg_cmd="echo '"+temp+"' > " + cwd_arg
1802 command = ['htcaas-mgjob-submit','-d',cwd,'-e',os.path.basename(prog)]
1803 if argument :
1804 command.extend(['-a ', '='.join([str(a) for a in argument])])
1805 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1806 id = a.stdout.read().strip()
1807
1808 else:
1809 cwd_arg = cwd+"/arguments"
1810 temp = ' '.join([str(a) for a in argument])
1811 temp_file_name = "sub." + os.path.basename(prog)
1812 text = """#!/bin/bash
1813 MYPWD=%(cwd)s
1814 cd $MYPWD
1815 input_files=(%(input_files)s )
1816 for i in ${input_files[@]}
1817 do
1818 chmod -f +x $i
1819 done
1820 /bin/bash %(prog)s %(arguments)s > %(stdout)s
1821 """
1822 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog,
1823 'arguments': ' '.join([str(a) for a in argument]),
1824 'program': ' ' if '.py' in prog else 'bash'}
1825
1826
1827 new_prog = pjoin(cwd, temp_file_name)
1828 open(new_prog, 'w').write(text % dico)
1829 misc.Popen(['chmod','+x',new_prog],cwd=cwd)
1830 command = ['htcaas-mgjob-submit','-d',cwd,'-e',temp_file_name]
1831 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1832 id = a.stdout.read().strip()
1833 logger.debug(id)
1834
1835 nb_try=0
1836 nb_limit=5
1837 if not id.isdigit() :
1838 print "[ID is not digit]:" + id
1839
1840 while not id.isdigit() :
1841 nb_try+=1
1842 print "[fail_retry]:"+ str(nb_try)
1843 a=misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1844 id = a.stdout.read().strip()
1845 if nb_try > nb_limit :
1846 raise ClusterManagmentError, 'fail to submit to the HTCaaS cluster: \n %s' % id
1847 break
1848
1849 self.submitted += 1
1850 self.submitted_ids.append(id)
1851
1852 return id
1853
1854 @multiple_try(nb_try=10, sleep=5)
1855 def control_one_job(self, id):
1856 """ control the status of a single job with it's cluster id """
1857
1858 if id == 0 :
1859 status_out ='C'
1860 else :
1861 cmd = 'htcaas-job-status -m '+str(id)+ " -s | grep Status "
1862 status = misc.Popen([cmd], shell=True,stdout=subprocess.PIPE,
1863 stderr=subprocess.PIPE)
1864 error = status.stderr.read()
1865 if status.returncode or error:
1866 raise ClusterManagmentError, 'htcaas-job-submit returns error: %s' % error
1867 status_out= status.stdout.read().strip()
1868 status_out= status_out.split(":",1)[1]
1869 if status_out == 'waiting':
1870 status_out='I'
1871 elif status_out == 'preparing' or status_out == 'running':
1872 status_out = 'R'
1873 elif status_out != 'done':
1874 status_out = 'F'
1875 elif status_out == 'done':
1876 status_out = 'C'
1877
1878 return status_out
1879
1880 @multiple_try()
1881 def control(self, me_dir=None):
1882 """Check the status of all submitted jobs. Return (idle, run, finish, fail)."""
1883 if not self.submitted_ids:
1884 logger.debug("self.submitted_ids not exists")
1885 return 0, 0, 0, 0
1886
1887 ongoing = []
1888 idle, run, fail = 0, 0, 0
1889
1890 start = self.submitted_ids[0]
1891 end = self.submitted_ids[-1]
1892
1893 cmd = "htcaas-job-status -c "+str(start)+"-"+str(end)
1894 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
1895
1896 for line in status.stdout:
1897
1898 status2 = line.split()[-1]
1899 if status2 != 'null' or line.split()[0].strip() != '0':
1900 ongoing.append(line.split()[0].strip())
1901 logger.debug("["+line.split()[0].strip()+"]"+status2)
1902 if status2 == 'null' or line.split()[0].strip() == '0':
1903 idle += 1
1904 elif status2 in self.idle_tag:
1905 idle += 1
1906 elif status2 in self.running_tag:
1907 run += 1
1908 elif status2 in self.complete_tag:
1909 if not self.check_termination(line.split()[0]):
1910 idle +=1
1911 else:
1912 fail += 1
1913
1914 return idle, run, self.submitted - (idle+run+fail), fail
1915
1916 @multiple_try()
1917 def remove(self, *args, **opts):
1918 """Clean the jobs on the cluster"""
1919
1920 if not self.submitted_ids:
1921 return
1922 for i in range(len(self.submitted_ids)):
1923 cmd = "htcaas-job-cancel -m %s" % self.submitted_ids[i]
1924 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
1925
1926 class HTCaaS2Cluster(Cluster):
1927 """Class for dealing with cluster submission on a HTCaaS cluster without GPFS """
1928
1929 name= 'htcaas2'
1930 job_id = 'HTCAAS2_JOBID'
1931 idle_tag = ['waiting']
1932 running_tag = ['preparing','running']
1933 complete_tag = ['done']
1934
1935 @store_input()
1936 @multiple_try()
1937 def submit2(self, prog, argument=[], cwd=None, stdout=None, stderr=None,
1938 log=None, input_files=[], output_files=[], required_output=[],
1939 nb_submit=0):
1940
1941 """Submit the HTCaaS job on the cluster with NO SHARE DISK
1942 input/output file should be given as relative to CWD
1943 """
1944 if cwd is None:
1945 cwd = os.getcwd()
1946
1947 if not os.path.exists(prog):
1948 prog = os.path.join(cwd, prog)
1949
1950 if 'combine' not in prog and 'pythia' not in prog and 'shower' not in prog :
1951 if cwd or prog :
1952 self.submitted_dirs.append(cwd)
1953 self.submitted_exes.append(prog)
1954 else:
1955 logger.debug("cwd and prog not exist->"+cwd+" / "+ os.path.basename(prog))
1956
1957 if argument :
1958 self.submitted_args.append('='.join([str(a) for a in argument]))
1959
1960 if cwd or prog :
1961 self.submitted += 1
1962 id = self.submitted
1963 self.submitted_ids.append(id)
1964 else:
1965 logger.debug("cwd and prog are not exist! ")
1966 id = 0
1967
1968 else:
1969 temp_file_name = "sub."+ os.path.basename(prog)
1970 text = """#!/bin/bash
1971 MYPWD=%(cwd)s
1972 cd $MYPWD
1973 input_files=(%(input_files)s )
1974 for i in ${input_files[@]}
1975 do
1976 chmod -f +x $i
1977 done
1978 /bin/bash %(prog)s %(arguments)s > %(stdout)s
1979 """
1980 dico = {'cwd':cwd, 'input_files': ' '.join(input_files + [prog]), 'stdout': stdout, 'prog':prog,
1981 'arguments': ' '.join([str(a) for a in argument]),
1982 'program': ' ' if '.py' in prog else 'bash'}
1983
1984 new_prog = pjoin(cwd, temp_file_name)
1985 open(new_prog, 'w').write(text % dico)
1986 misc.Popen(['chmod','+x',new_prog],cwd=cwd)
1987 command = ['htcaas-mgjob-submit','-d',cwd,'-e',new_prog]
1988 a = misc.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, cwd=cwd)
1989 id = a.stdout.read().strip()
1990 logger.debug("[mode2]-["+str(id)+"]")
1991 if cwd and prog :
1992 self.submitted += 1
1993 self.submitted_ids.append(id)
1994 else:
1995 logger.debug("cwd and prog are not exist! ")
1996 id = 0
1997
1998 return id
1999
2000 @multiple_try()
2044
2045
2046 @multiple_try(nb_try=10, sleep=5)
2047 def control_one_job(self, id):
2048 """ control the status of a single job with it's cluster id """
2049
2050 if self.submitted == self.submitted_ids[-1] :
2051 id = self.metasubmit(self)
2052 tempid = self.submitted_ids[-1]
2053 self.submitted_ids.remove(self.submitted_ids[-1])
2054 self.submitted_ids.append(id)
2055 logger.debug(str(id)+" // "+str(self.submitted_ids[-1]))
2056
2057 if id == 0 :
2058 status_out ='C'
2059 else:
2060 cmd = 'htcaas-job-status -m '+ str(id) + " -s | grep Status "
2061 status = misc.Popen([cmd],shell=True,stdout=subprocess.PIPE,
2062 stderr=subprocess.PIPE)
2063 error = status.stderr.read()
2064 if status.returncode or error:
2065 raise ClusterManagmentError, 'htcaas-job-status returns error: %s' % error
2066 status_out= status.stdout.read().strip()
2067 status_out= status_out.split(":",1)[1]
2068 logger.debug("[["+str(id)+"]]"+status_out)
2069 if status_out == 'waiting':
2070 status_out='I'
2071 elif status_out == 'preparing' or status_out == 'running':
2072 status_out = 'R'
2073 elif status_out != 'done':
2074 status_out = 'F'
2075 elif status_out == 'done':
2076 status_out = 'C'
2077 self.submitted -= 1
2078
2079 return status_out
2080
2081 @multiple_try()
2082 def control(self, me_dir):
2083 """Check the status of all submitted jobs. Return (idle, run, finish, fail)."""
2084 if not self.submitted_ids:
2085 logger.debug("self.submitted_ids not exists")
2086 return 0, 0, 0, 0
2087
2088 if "//" in me_dir :
2089 if int(me_dir.split("//")[0]) < int(me_dir.split("//")[1]) :
2090 start = me_dir.split("//")[0]
2091 end = me_dir.split("//")[1]
2092 else :
2093 start = me_dir.split("//")[1]
2094 end = me_dir.split("//")[0]
2095 elif "/" in me_dir :
2096 start = 0
2097 end = 0
2098 elif me_dir.isdigit():
2099 start = me_dir
2100 end = me_dir
2101 elif not me_dir.isdigit():
2102 me_dir = self.submitted_ids[0]
2103 logger.debug("Meta_ID is not digit(control), self.submitted_ids[0]: "+str(me_dir) )
2104
2105 ongoing = []
2106 idle, run, fail, done = 0, 0, 0, 0
2107
2108 cmd = "htcaas-job-status -c "+str(start)+"-"+str(end) +" -ac"
2109 status = misc.Popen([cmd], shell=True, stdout=subprocess.PIPE)
2110
2111 for line in status.stdout:
2112 status2 = line.split()[-1]
2113 if status2 != 'null' or line.split()[0].strip() != '0':
2114 ongoing.append(str(line.split()[0].strip())+"-"+str(line.split()[1].strip()))
2115 logger.debug("["+line.split()[0].strip()+"-"+line.split()[1].strip()+"]"+status2)
2116
2117 if status2 == 'null' or line.split()[0].strip() == '0':
2118 idle += 1
2119 elif status2 in self.idle_tag:
2120 idle += 1
2121 elif status2 in self.running_tag:
2122 run += 1
2123 elif status2 in self.complete_tag:
2124 done += 1
2125 self.submitted -= 1
2126 if not self.check_termination(line.split()[1]):
2127 idle +=1
2128 else:
2129 fail += 1
2130
2131 return idle, run, self.submitted - (idle+run+fail), fail
2132
2133 @multiple_try()
2134 def remove(self, *args, **opts):
2135 """Clean the jobs on the cluster"""
2136
2137 if not self.submitted_ids:
2138 return
2139 id = self.submitted_ids[0]
2140 if id != 0 :
2141 cmd = "htcaas-job-cancel -m %s" % str(id)
2142 status = misc.Popen([cmd], shell=True, stdout=open(os.devnull,'w'))
2143
2144 from_name = {'condor':CondorCluster, 'pbs': PBSCluster, 'sge': SGECluster,
2145 'lsf': LSFCluster, 'ge':GECluster, 'slurm': SLURMCluster,
2146 'htcaas':HTCaaSCluster, 'htcaas2':HTCaaS2Cluster}
2147
2148 onecore=MultiCore(1)
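# Illustrative sketch (option names follow the keys read in Cluster.__init__
# above; the queue, paths and script name are examples only):
#   cluster = from_name['condor'](cluster_queue=None, cluster_nb_retry=1,
#                                 cluster_retry_wait=300)
#   job_id = cluster.submit('./ajob1', cwd='SubProcesses/P0_qq_ll',
#                           stdout='/dev/null', stderr='/dev/null')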
2149
2150