Package PyFoam :: Package Infrastructure :: Module ClusterJob
[hide private]
[frames] | [no frames]

Source Code for Module PyFoam.Infrastructure.ClusterJob

  1  #  ICE Revision: $Id$ 
  2  """Encapsulates all necessary things for a cluster-job, like setting 
  3  up, running, restarting""" 
  4   
  5  import os,sys,subprocess 
  6  from os import path,unlink 
  7  from threading import Thread,Lock,Timer 
  8   
  9  from PyFoam.Applications.Decomposer import Decomposer 
 10  from PyFoam.Applications.Runner import Runner 
 11  from PyFoam.Applications.SteadyRunner import SteadyRunner 
 12  from PyFoam.Applications.CloneCase import CloneCase 
 13  from PyFoam.Applications.FromTemplate import FromTemplate 
 14  from PyFoam.Applications.PrepareCase import PrepareCase 
 15  from PyFoam.Applications.RunParameterVariation import RunParameterVariation 
 16   
 17  from PyFoam.FoamInformation import changeFoamVersion 
 18  from PyFoam.FoamInformation import foamVersion as getFoamVersion 
 19  from PyFoam.Error import error,warning 
 20  from PyFoam import configuration as config 
 21  from PyFoam.FoamInformation import oldAppConvention as oldApp 
 22  from PyFoam.RunDictionary.SolutionDirectory import SolutionDirectory 
 23   
 24  from PyFoam.ThirdParty.six import print_,iteritems 
 25   
def checkForMessageFromAbove(job):
    """Timer callback that polls for control files written by a supervisor.

    If the job's stop-file exists the job is stopped and polling ends; if
    the checkpoint-file exists a checkpoint is written. In every other case
    the poll is re-armed one second later.

    @param job: the ClusterJob instance being supervised"""
    if not job.listenToTimer:
        # job asked us to stop listening - let the timer chain die out
        return

    if path.exists(job.stopFile()):
        # supervisor requested termination - no further polling
        job.stopJob()
        return

    if path.exists(job.checkpointFile()):
        job.writeCheckpoint()

    # re-arm the one-second poll
    nextPoll = Timer(1., checkForMessageFromAbove, args=[job])
    job.timer = nextPoll
    nextPoll.start()
39 40
class ClusterJob(object):
    """All Cluster-jobs are to be derived from this base-class

    The actual jobs are implemented by overriding methods

    There is a number of variables in this class that are used to
    'communicate' information between the various stages"""

    def __init__(self,
                 basename,
                 arrayJob=False,
                 hardRestart=False,
                 autoParallel=True,
                 doAutoReconstruct=None,
                 foamVersion=None,
                 compileOption=None,
                 useFoamMPI=False,
                 multiRegion=False,
                 parameters=None,
                 isDecomposed=False):
        """Initializes the Job
        @param basename: Basis name of the job
        @param arrayJob: this job is a parameter variation. The tasks
        are identified by their task-id
        @param hardRestart: treat the job as restarted
        @param autoParallel: Parallelization is handled by the base-class
        @param doAutoReconstruct: Automatically reconstruct the case if
        autoParalellel is set. If the value is None then it is looked up from
        the configuration
        @param foamVersion: The foam-Version that is to be used
        @param compileOption: Forces compile-option (usually 'Opt' or 'Debug')
        @param useFoamMPI: Use the OpenMPI supplied with OpenFOAM
        @param multiRegion: This job consists of multiple regions
        @param parameters: Dictionary with parameters that are being passed
        to the Runner. None (the default) means an empty dictionary
        @param isDecomposed: Assume that the job is already decomposed"""

        # fix: 'parameters' used to be a mutable default argument ({}) that
        # was shared (and mutated in doIt) across all instances
        if parameters is None:
            parameters = {}

        if "JOB_ID" not in os.environ:
            error("Not an SGE-job. Environment variable JOB_ID is missing")
        self.jobID=int(os.environ["JOB_ID"])
        self.jobName=os.environ["JOB_NAME"]

        self.basename=path.join(path.abspath(path.curdir),basename)

        # SGE sets RESTARTED != 0 when it restarts a job (e.g. after migration)
        sgeRestarted=False
        if "RESTARTED" in os.environ:
            sgeRestarted=(int(os.environ["RESTARTED"])!=0)

        self.restarted=bool(sgeRestarted or hardRestart)

        if foamVersion is None:
            foamVersion=config().get("OpenFOAM","Version")

        changeFoamVersion(foamVersion,compileOption=compileOption)

        if "WM_PROJECT_VERSION" not in os.environ:
            error("No OpenFOAM-Version seems to be configured. Set the foamVersion-parameter")

        self.autoParallel=autoParallel

        self.doAutoReconstruct=doAutoReconstruct
        if self.doAutoReconstruct is None:
            self.doAutoReconstruct=config().getboolean("ClusterJob","doAutoReconstruct")

        self.multiRegion=multiRegion

        self.parameters=parameters

        self.hostfile=None
        self.nproc=1

        if "NSLOTS" in os.environ:
            self.nproc=int(os.environ["NSLOTS"])
            self.message("Running on",self.nproc,"CPUs")
            if self.nproc>1:
                # the machinefile is expected in the job's TMP directory
                self.hostfile=path.join(os.environ["TMP"],"machines")
                if config().getboolean("ClusterJob","useMachineFile"):
                    self.message("Using the machinefile",self.hostfile)
                    # close the handle instead of leaking it
                    with open(self.hostfile) as machines:
                        self.message("Contents of the machinefile:",machines.readlines())
                else:
                    self.message("No machinefile used because switched off with 'useMachineFile'")

        self.ordinaryEnd=True
        self.listenToTimer=False

        self.taskID=None
        self.arrayJob=arrayJob

        if self.arrayJob:
            self.taskID=int(os.environ["SGE_TASK_ID"])

        # NOTE(review): eval() of a configuration value - acceptable only
        # because the configuration file is trusted input
        if not useFoamMPI and foamVersion not in eval(config().get("ClusterJob","useFoamMPI",default='[]')):
            ## prepend special paths for the cluster
            self.message("Adding Cluster-specific paths")
            os.environ["PATH"]=config().get("ClusterJob","path")+":"+os.environ["PATH"]
            os.environ["LD_LIBRARY_PATH"]=config().get("ClusterJob","ldpath")+":"+os.environ["LD_LIBRARY_PATH"]

        self.isDecomposed=isDecomposed

    def fullJobId(self):
        """Return a string with the full job-ID (jobID plus task-ID for array jobs)"""
        result=str(self.jobID)
        if self.arrayJob:
            result+=":"+str(self.taskID)
        return result

    def message(self,*txt):
        """Print the arguments wrapped in '===' markers and flush stdout
        so the output appears promptly in the cluster log"""
        print_("=== CLUSTERJOB: ",end="")
        for t in txt:
            print_(t,end=" ")
        print_(" ===")
        sys.stdout.flush()

    def setState(self,txt):
        """Record the current job state in the 'ClusterJobState' file
        inside the case directory
        @param txt: the state description"""
        self.message("Setting Job state to",txt)
        fName=path.join(self.casedir(),"ClusterJobState")
        # context manager guarantees the file is closed even on error
        with open(fName,"w") as f:
            f.write(txt+"\n")

    def jobFile(self):
        """The file with the job information"""
        jobfile="%s.%d" % (self.jobName,self.jobID)
        if self.arrayJob:
            jobfile+=".%d" % self.taskID
        jobfile+=".pyFoam.clusterjob"
        jobfile=path.join(path.dirname(self.basename),jobfile)

        return jobfile

    def checkpointFile(self):
        """The file that makes the job write a checkpoint"""
        return self.jobFile()+".checkpoint"

    def stopFile(self):
        """The file that makes the job write a checkpoint and end"""
        return self.jobFile()+".stop"

    def doIt(self):
        """The central logic. Runs the job, sets it up etc"""

        with open(self.jobFile(),"w") as f:
            f.write(path.basename(self.basename)+"\n")

        self.message()
        self.message("Running on directory",self.casename())
        self.message()
        self.setState("Starting up")

        if self.arrayJob:
            # merge the per-task parameters into the job parameters
            for k,v in list(self.taskParameters(self.taskID).items()):
                self.parameters[k]=v

        self.parameters.update(self.additionalParameters())

        self.message("Parameters:",self.parameters)
        if not self.restarted:
            self.setState("Setting up")
            self.setup(self.parameters)
            if self.autoParallel and self.nproc>1 and not self.isDecomposed:
                self.setState("Decomposing")
                self.autoDecompose()

            self.isDecomposed=True

            self.setState("Setting up 2")
            self.postDecomposeSetup(self.parameters)
        else:
            self.setState("Restarting")

        self.isDecomposed=True

        self.setState("Running")
        # start listening for stop/checkpoint files written by a supervisor
        self.listenToTimer=True
        self.timer=Timer(1.,checkForMessageFromAbove,args=[self])
        self.timer.start()

        self.run(self.parameters)
        self.listenToTimer=False

        if path.exists(self.jobFile()):
            unlink(self.jobFile())

        if self.ordinaryEnd:
            self.setState("Post Running")
            self.preReconstructCleanup(self.parameters)

            if self.autoParallel and self.nproc>1:
                self.setState("Reconstructing")
                self.autoReconstruct()

            if self.nproc>0:
                self.additionalReconstruct(self.parameters)

            self.setState("Cleaning")
            self.cleanup(self.parameters)
            self.setState("Finished")
        else:
            # stopJob() was triggered - leave the case decomposed
            self.setState("Suspended")

        if path.exists(self.stopFile()):
            unlink(self.stopFile())
        if path.exists(self.checkpointFile()):
            unlink(self.checkpointFile())

    def casedir(self):
        """Returns the actual directory of the case
        To be overridden if appropriate"""
        if self.arrayJob:
            # each task of an array job runs in its own numbered directory
            return "%s.%05d" % (self.basename,self.taskID)
        else:
            return self.basename

    def casename(self):
        """Returns just the name of the case"""
        return path.basename(self.casedir())

    def execute(self,cmd):
        """Execute a shell command in the case directory. No checking done
        @param cmd: the command as a string"""
        oldDir=os.getcwd()
        self.message("Changing directory to",self.casedir())
        os.chdir(self.casedir())
        try:
            self.message("Executing",cmd)
            try:
                retcode = subprocess.call(cmd,shell=True)
                if retcode < 0:
                    self.message(cmd,"was terminated by signal", -retcode)
                else:
                    self.message(cmd,"returned", retcode)
            except OSError:
                e = sys.exc_info()[1] # Needed because python 2.5 does not support 'as e'
                self.message(cmd,"Execution failed:", e)

            # fix: message used to read 'Executiong of'
            self.message("Execution of",cmd,"ended")
        finally:
            # always restore the working directory, even on unexpected errors
            self.message("Changing directory back to",oldDir)
            os.chdir(oldDir)

    def templateFile(self,fileName):
        """Looks for a template file and evaluates the template using
        the usual parameters
        @param fileName: the name of the file that will be
        constructed. The template file is the same plus the extension '.template'"""

        self.message("Building file",fileName,"from template with parameters",
                     self.parameters)

        argList=["--output-file=%s" % path.join(self.casedir(),fileName),
                 "--dump-used-values"
                 ]

        # constructing the FromTemplate application executes it
        FromTemplate(args=argList,
                     parameters=self.parameters)

    def foamRun(self,application,
                args=None,
                foamArgs=None,
                steady=False,
                multiRegion=True,
                progress=False,
                compress=False,
                noLog=False):
        """Runs a foam utility on the case.
        If it is a parallel job and the grid has
        already been decomposed (and not yet reconstructed) it is run in
        parallel
        @param application: the Foam-Application that is to be run
        @param foamArgs: A list with the additional arguments for the
        Foam-Application. None (the default) means no additional arguments
        @param compress: Compress the log-file
        @param args: A list with additional arguments for the Runner-object.
        None (the default) means no additional arguments
        @param steady: Use the steady-runner
        @param multiRegion: Run this on multiple regions (if None: I don't have an opinion on this)
        @param progress: Only output the time and nothing else
        @param noLog: Do not generate a logfile"""

        # fix: 'args' and 'foamArgs' used to be mutable default arguments
        arglist=[] if args is None else args[:]
        arglist+=["--job-id=%s" % self.fullJobId()]
        for k,v in iteritems(self.parameters):
            arglist+=["--parameter=%s:%s" % (str(k),str(v))]

        if self.isDecomposed and self.nproc>1:
            arglist+=["--procnr=%d" % self.nproc]
            if config().getboolean("ClusterJob","useMachineFile"):
                arglist+=["--machinefile=%s" % self.hostfile]

        arglist+=["--echo-command-prefix='=== Executing'"]

        if progress:
            arglist+=["--progress"]
        if noLog:
            arglist+=["--no-log"]
        if compress:
            arglist+=["--compress"]

        if self.multiRegion:
            if multiRegion:
                arglist+=["--all-regions"]
        elif multiRegion:
            warning("This is not a multi-region case, so trying to run stuff multi-region won't do any good")

        if self.restarted:
            arglist+=["--restart"]

        arglist+=[application]
        if oldApp():
            # old OpenFOAM convention: <app> <root> <case>
            arglist+=[".",self.casename()]
        else:
            arglist+=["-case",self.casename()]

        if foamArgs is not None:
            arglist+=foamArgs

        self.message("Executing",arglist)

        # constructing the Runner application executes it
        if steady:
            self.message("Running Steady")
            SteadyRunner(args=arglist)
        else:
            Runner(args=arglist)

    def autoDecompose(self):
        """Automatically decomposes the grid with a metis-algorithm"""

        if path.isdir(path.join(self.casedir(),"processor0")):
            warning("A processor directory already exists. There might be a problem")

        defaultMethod="metis"

        # from OpenFOAM 1.6 on scotch replaces metis as the free decomposer
        if getFoamVersion()>=(1,6):
            defaultMethod="scotch"

        # NOTE(review): self.nproc is passed as an int here - presumably the
        # Decomposer argument handling converts it; confirm before changing
        args=["--method="+defaultMethod,
              "--clear",
              self.casename(),
              self.nproc,
              "--job-id=%s" % self.fullJobId()]

        if self.multiRegion:
            args.append("--all-regions")

        # constructing the Decomposer application executes it
        Decomposer(args=args)

    def autoReconstruct(self):
        """Default reconstruction of a parallel run"""

        if self.doAutoReconstruct:
            self.isDecomposed=False

            self.foamRun("reconstructPar",
                         args=["--logname=ReconstructPar"])
        else:
            self.message("No reconstruction (because asked to)")

    def setup(self,parameters):
        """Set up the job. Called in the beginning if the
        job has not been restarted

        Usual tasks include grid conversion/setup, mesh decomposition etc

        @param parameters: a dictionary with parameters"""

        pass

    def postDecomposeSetup(self,parameters):
        """Additional setup, to be executed when the grid is already decomposed

        Usually for tasks that can be done on a decomposed grid

        @param parameters: a dictionary with parameters"""

        pass

    def run(self,parameters):
        """Run the actual job. Usually the solver.
        @param parameters: a dictionary with parameters"""

        pass

    def preReconstructCleanup(self,parameters):
        """Additional cleanup, to be executed when the grid is still decomposed

        Usually for tasks that can be done on a decomposed grid

        @param parameters: a dictionary with parameters"""

        pass

    def cleanup(self,parameters):
        """Clean up after a job
        @param parameters: a dictionary with parameters"""

        pass

    def additionalReconstruct(self,parameters):
        """Additional reconstruction of parallel runs (Stuff that the
        OpenFOAM-reconstructPar doesn't do
        @param parameters: a dictionary with parameters"""

        pass

    def taskParameters(self,id):
        """Parameters for a specific task
        @param id: the id of the task
        @return: a dictionary with parameters for this task"""

        error("taskParameter not implemented. Not a parameterized job")

        return {}

    def additionalParameters(self):
        """Additional parameters
        @return: a dictionary with parameters for this task"""

        warning("Method 'additionalParameters' not implemented. Not a problem. Just saying")

        return {}

    def writeCheckpoint(self):
        """Ask the running solver to write its fields by creating a
        'write' file in the case, then remove the checkpoint request"""
        if self.listenToTimer:
            with open(path.join(self.basename,"write"),"w") as f:
                f.write("Jetzt will ich's wissen")
            unlink(self.checkpointFile())
        else:
            warning("I'm not listening to your callbacks")

        # NOTE(review): this Timer is created but never started here; the
        # polling chain in checkForMessageFromAbove re-arms itself anyway,
        # so this assignment looks redundant - confirm before removing
        self.timer=Timer(1.,checkForMessageFromAbove,args=[self])

    def stopJob(self):
        """Ask the running solver to stop by creating a 'stop' file in the
        case and mark the job as not ending ordinarily"""
        if self.listenToTimer:
            self.ordinaryEnd=False
            with open(path.join(self.basename,"stop"),"w") as f:
                f.write("Geh z'haus")
            unlink(self.stopFile())
        else:
            warning("I'm not listening to your callbacks")
484
class SolverJob(ClusterJob):
    """A Cluster-Job that executes a solver. It implements the run-function.
    If a template-case is specified, the case is copied"""

    def __init__(self,basename,solver,
                 template=None,
                 cloneParameters=None,
                 arrayJob=False,
                 hardRestart=False,
                 autoParallel=True,
                 doAutoReconstruct=None,
                 foamVersion=None,
                 compileOption=None,
                 useFoamMPI=False,
                 steady=False,
                 multiRegion=False,
                 parameters=None,
                 progress=False,
                 solverArgs=None,
                 solverProgress=False,
                 solverNoLog=False,
                 solverLogCompress=False,
                 isDecomposed=False):
        """@param template: Name of the template-case. It is assumed that
        it resides in the same directory as the actual case
        @param cloneParameters: a list with additional parameters for the
        CloneCase-object that copies the template. None (the default)
        means no additional parameters
        @param solverProgress: Only writes the current time of the solver"""

        # fix: 'cloneParameters', 'parameters' and 'solverArgs' used to be
        # mutable default arguments shared across all instances; the copy of
        # cloneParameters also prevents mutating a caller-supplied list below
        cloneParameters=[] if cloneParameters is None else list(cloneParameters)
        if parameters is None:
            parameters={}
        if solverArgs is None:
            solverArgs=[]

        ClusterJob.__init__(self,basename,
                            arrayJob=arrayJob,
                            hardRestart=hardRestart,
                            autoParallel=autoParallel,
                            doAutoReconstruct=doAutoReconstruct,
                            foamVersion=foamVersion,
                            compileOption=compileOption,
                            useFoamMPI=useFoamMPI,
                            multiRegion=multiRegion,
                            parameters=parameters,
                            isDecomposed=isDecomposed)
        self.solver=solver
        self.steady=steady
        if template is not None and not self.restarted:
            template=path.join(path.dirname(self.casedir()),template)
            if path.abspath(basename)==path.abspath(template):
                error("The basename",basename,"and the template",template,"are the same directory")
            if isDecomposed:
                cloneParameters+=["--parallel"]
            # constructing the CloneCase application performs the copy
            CloneCase(
                args=cloneParameters+[template,self.casedir(),"--follow-symlinks"])
        self.solverProgress=solverProgress
        self.solverNoLog=solverNoLog
        self.solverLogCompress=solverLogCompress
        self.solverArgs=solverArgs

    def run(self,parameters):
        """Run the configured solver on the case via foamRun
        @param parameters: a dictionary with parameters (unused here;
        foamRun reads self.parameters)"""
        self.foamRun(self.solver,
                     steady=self.steady,
                     foamArgs=self.solverArgs,
                     multiRegion=False,
                     progress=self.solverProgress,
                     noLog=self.solverNoLog,
                     compress=self.solverLogCompress)
548
class PrepareCaseJob(SolverJob):
    """Assumes that the case is prepared to be set up with
    =pyFoamPrepareCase.py= and automatically sets it up with
    this. Needs one parameterfile to be specified and then a list of
    name/value-pairs
    """

    def __init__(self,basename,solver,
                 parameterfile,
                 arguments,
                 parameters=None,
                 **kwargs):
        """@param parameterfile: name of the parameter file for pyFoamPrepareCase
        @param arguments: either a flat list of alternating name/value
        entries or a dictionary with the parameter values
        @param parameters: additional parameters passed on to SolverJob.
        None (the default) means an empty dictionary"""
        self.__parameterfile=parameterfile

        # fix: 'parameters' used to be a mutable default argument ({}) that
        # was mutated with update() below, leaking values between instances
        # (and into the caller's dictionary)
        parameters={} if parameters is None else dict(parameters)

        para={}
        if isinstance(arguments,list):
            if len(arguments) % 2 !=0:
                error("Length of arguments should be an even number. Is",len(arguments),
                      ":",arguments)

            # make all string arguments that could be boolean boolean values
            from PyFoam.Basics.DataStructures import BoolProxy

            # try, in order: boolean, int, float, python expression;
            # fall back to a quoted string
            for k,v in dict(zip(arguments[::2],arguments[1::2])).items():
                try:
                    try:
                        para[k]=BoolProxy(textual=v).val
                    except TypeError:
                        para[k]=int(v)
                except ValueError:
                    try:
                        para[k]=float(v)
                    except ValueError:
                        try:
                            # NOTE(review): eval() of a job-script argument -
                            # acceptable only because these values come from
                            # the job author, not from untrusted users
                            para[k]=eval(v)
                        except (SyntaxError,NameError):
                            para[k]="'"+v+"'"
        elif isinstance(arguments,dict):
            para=arguments
        else:
            error("Type of arguments is ",type(arguments),"Should be 'dict' or 'list':",arguments)

        self.__parametervalues=para

        parameters.update(self.__parametervalues)

        print_("Parameter file",self.__parameterfile)
        print_("Parameter values",self.__parametervalues)

        SolverJob.__init__(self,basename,solver,
                           parameters=parameters,
                           **kwargs)

    def setup(self,parameters):
        """Set the case up with pyFoamPrepareCase using the collected values
        @param parameters: a dictionary with parameters"""
        parameterString=",".join(["'%s':%s"%i for i in parameters.items()])
        # constructing the PrepareCase application executes it
        PrepareCase(args=[self.casedir(),
                          "--allow-exec",
                          "--parameter="+path.join(self.casedir(),self.__parameterfile),
                          "--values={"+parameterString+"}"])
608
class VariationCaseJob(SolverJob):
    """Assumes that the case is prepared to be set up with
    =pyFoamRunParameterVariation.py= and automatically sets it up with
    this. Needs one parameterfile and a variation-file
    """

    def __init__(self,basename,
                 parameterfile,
                 variationfile,
                 template=None,
                 **kwargs):
        """@param parameterfile: name of the default-parameter file
        @param variationfile: name of the file with the parameter variations
        @param template: name of the template case that is cloned"""
        self.__parameterfile=parameterfile
        self.__variationfile=variationfile

        print_("Parameter file",self.__parameterfile)
        print_("Variation file",self.__variationfile)

        # list the variations of the template case to find out which solver
        # this task is supposed to run
        data=RunParameterVariation(args=[template,
                                         path.join(template,self.__variationfile),
                                         "--parameter="+path.join(template,self.__parameterfile),
                                         "--list-variations"]).getData()
        taskID=int(os.environ["SGE_TASK_ID"])-1
        variation=data["variations"][taskID]
        if "solver" in variation:
            solver=variation["solver"]
        else:
            # fall back to the solver fixed for the whole variation
            solver=data["fixed"]["solver"]

        SolverJob.__init__(self,basename,solver,
                           arrayJob=True,
                           template=template,
                           **kwargs)

    def taskParameters(self,id):
        """No per-task parameters here - the variation mechanism handles them
        @param id: the id of the task (ignored)
        @return: an empty dictionary"""
        return {}

    def setup(self,parameters):
        """Set up this task's variation with pyFoamRunParameterVariation
        @param parameters: a dictionary with parameters"""
        variationArgs=[self.casedir(),
                       path.join(self.casedir(),self.__variationfile),
                       "--allow-exec",
                       "--parameter-file="+path.join(self.casedir(),self.__parameterfile),
                       "--single-variation=%d" % (self.taskID-1),
                       "--no-execute-solver",
                       "--auto-create-database",
                       "--no-database-write",
                       "--inplace-execution"]
        # constructing the application executes it
        RunParameterVariation(args=variationArgs)
654 655 # Should work with Python3 and Python2 656