Package PyFoam :: Package Infrastructure :: Module ClusterJob
[hide private]
[frames] | [no frames]

Source Code for Module PyFoam.Infrastructure.ClusterJob

  1  #  ICE Revision: $Id$ 
  2  """Encapsulates all necessary things for a cluster-job, like setting 
  3  up, running, restarting""" 
  4   
  5  import os,sys,subprocess 
  6  from os import path,unlink 
  7  from threading import Thread,Lock,Timer 
  8   
  9  from PyFoam.Applications.Decomposer import Decomposer 
 10  from PyFoam.Applications.Runner import Runner 
 11  from PyFoam.Applications.SteadyRunner import SteadyRunner 
 12  from PyFoam.Applications.CloneCase import CloneCase 
 13  from PyFoam.FoamInformation import changeFoamVersion 
 14  from PyFoam.FoamInformation import foamVersion as getFoamVersion 
 15  from PyFoam.Error import error,warning 
 16  from PyFoam import configuration as config 
 17  from PyFoam.FoamInformation import oldAppConvention as oldApp 
 18  from PyFoam.RunDictionary.SolutionDirectory import SolutionDirectory 
 19   
 20  from PyFoam.ThirdParty.six import print_,iteritems 
 21   
def checkForMessageFromAbove(job):
    """Timer callback that polls for control files dropped by a supervisor.

    If the job's stop-file exists the job is stopped and polling ends.
    Otherwise a pending checkpoint-file triggers a checkpoint and the
    one-second timer is re-armed so polling continues.
    @param job: the ClusterJob instance being supervised"""
    if job.listenToTimer:
        if path.exists(job.stopFile()):
            # stop requested: act once, do not reschedule
            job.stopJob()
        else:
            if path.exists(job.checkpointFile()):
                job.writeCheckpoint()
            # re-arm the poll for the next second
            job.timer = Timer(1., checkForMessageFromAbove, args=[job])
            job.timer.start()
35 36
class ClusterJob(object):
    """All Cluster-jobs are to be derived from this base-class

    The actual jobs are implemented by overriding methods

    There is a number of variables in this class that are used to
    'communicate' information between the various stages"""

    def __init__(self,
                 basename,
                 arrayJob=False,
                 hardRestart=False,
                 autoParallel=True,
                 doAutoReconstruct=None,
                 foamVersion=None,
                 compileOption=None,
                 useFoamMPI=False,
                 multiRegion=False,
                 parameters=None,
                 isDecomposed=False):
        """Initializes the Job
        @param basename: Basis name of the job
        @param arrayJob: this job is a parameter variation. The tasks
        are identified by their task-id
        @param hardRestart: treat the job as restarted
        @param autoParallel: Parallelization is handled by the base-class
        @param doAutoReconstruct: Automatically reconstruct the case if
        autoParalellel is set. If the value is None then it is looked up from
        the configuration
        @param foamVersion: The foam-Version that is to be used
        @param compileOption: Forces compile-option (usually 'Opt' or 'Debug')
        @param useFoamMPI: Use the OpenMPI supplied with OpenFOAM
        @param multiRegion: This job consists of multiple regions
        @param parameters: Dictionary with parameters that are being passed
        to the Runner (None is treated as an empty dictionary)
        @param isDecomposed: Assume that the job is already decomposed"""

        # A mutable default argument ({}) would be shared between all
        # instances, so the empty dictionary is created per call instead
        if parameters is None:
            parameters = {}

        # print_(os.environ)

        if "JOB_ID" not in os.environ:
            error("Not an SGE-job. Environment variable JOB_ID is missing")
        self.jobID = int(os.environ["JOB_ID"])
        self.jobName = os.environ["JOB_NAME"]

        self.basename = path.join(path.abspath(path.curdir), basename)

        # SGE sets RESTARTED to a non-zero value if the job was requeued
        sgeRestarted = False
        if "RESTARTED" in os.environ:
            sgeRestarted = (int(os.environ["RESTARTED"]) != 0)

        self.restarted = bool(sgeRestarted or hardRestart)

        if foamVersion is None:
            foamVersion = config().get("OpenFOAM", "Version")

        changeFoamVersion(foamVersion, compileOption=compileOption)

        if "WM_PROJECT_VERSION" not in os.environ:
            error("No OpenFOAM-Version seems to be configured. Set the foamVersion-parameter")

        self.autoParallel = autoParallel

        self.doAutoReconstruct = doAutoReconstruct
        if self.doAutoReconstruct is None:
            self.doAutoReconstruct = config().getboolean("ClusterJob", "doAutoReconstruct")

        self.multiRegion = multiRegion

        self.parameters = parameters

        self.hostfile = None
        self.nproc = 1

        if "NSLOTS" in os.environ:
            self.nproc = int(os.environ["NSLOTS"])
            self.message("Running on", self.nproc, "CPUs")
            if self.nproc > 1:
                # self.hostfile=os.environ["PE_HOSTFILE"]
                self.hostfile = path.join(os.environ["TMP"], "machines")
                self.message("Using the machinefile", self.hostfile)
                # 'with' makes sure the machinefile handle is closed again
                with open(self.hostfile) as mf:
                    self.message("Contents of the machinefile:", mf.readlines())

        self.ordinaryEnd = True
        self.listenToTimer = False

        self.taskID = None
        self.arrayJob = arrayJob

        if self.arrayJob:
            self.taskID = int(os.environ["SGE_TASK_ID"])

        # NOTE(review): eval() of a configuration value - the config file is
        # trusted, but a malformed entry executes arbitrary code. Confirm
        # whether a literal parse would suffice here.
        if not useFoamMPI and not foamVersion in eval(config().get("ClusterJob", "useFoamMPI", default='[]')):
            ## prepend special paths for the cluster
            self.message("Adding Cluster-specific paths")
            os.environ["PATH"] = config().get("ClusterJob", "path") + ":" + os.environ["PATH"]
            os.environ["LD_LIBRARY_PATH"] = config().get("ClusterJob", "ldpath") + ":" + os.environ["LD_LIBRARY_PATH"]

        self.isDecomposed = isDecomposed

    def fullJobId(self):
        """Return a string with the full job-ID"""
        result = str(self.jobID)
        if self.arrayJob:
            result += ":" + str(self.taskID)
        return result

    def message(self, *txt):
        """Prints a marked status message and flushes stdout so it shows
        up immediately in the cluster log-file"""
        print_("=== CLUSTERJOB: ", end="")
        for t in txt:
            print_(t, end="")
        print_(" ===")
        sys.stdout.flush()

    def setState(self, txt):
        """Writes the current job state to the ClusterJobState-file in
        the case directory
        @param txt: the state description"""
        self.message("Setting Job state to", txt)
        fName = path.join(self.casedir(), "ClusterJobState")
        with open(fName, "w") as f:
            f.write(txt + "\n")

    def jobFile(self):
        """The file with the job information"""
        jobfile = "%s.%d" % (self.jobName, self.jobID)
        if self.arrayJob:
            jobfile += ".%d" % self.taskID
        jobfile += ".pyFoam.clusterjob"
        jobfile = path.join(path.dirname(self.basename), jobfile)

        return jobfile

    def checkpointFile(self):
        """The file that makes the job write a checkpoint"""
        return self.jobFile() + ".checkpoint"

    def stopFile(self):
        """The file that makes the job write a checkpoint and end"""
        return self.jobFile() + ".stop"

    def doIt(self):
        """The central logic. Runs the job, sets it up etc"""

        with open(self.jobFile(), "w") as f:
            f.write(path.basename(self.basename) + "\n")

        self.message()
        self.message("Running on directory", self.casename())
        self.message()
        self.setState("Starting up")

        if self.arrayJob:
            # merge the task-specific parameters into the general ones
            for k, v in list(self.taskParameters(self.taskID).items()):
                self.parameters[k] = v

        self.message("Parameters:", self.parameters)
        if not self.restarted:
            self.setState("Setting up")
            self.setup(self.parameters)
            if self.autoParallel and self.nproc > 1 and not self.isDecomposed:
                self.setState("Decomposing")
                self.autoDecompose()

            self.isDecomposed = True

            self.setState("Setting up 2")
            self.postDecomposeSetup(self.parameters)
        else:
            self.setState("Restarting")

        self.isDecomposed = True

        self.setState("Running")
        # start listening for stop/checkpoint-files written by a supervisor
        self.listenToTimer = True
        self.timer = Timer(1., checkForMessageFromAbove, args=[self])
        self.timer.start()

        self.run(self.parameters)
        self.listenToTimer = False

        if path.exists(self.jobFile()):
            unlink(self.jobFile())

        if self.ordinaryEnd:
            self.setState("Post Running")
            self.preReconstructCleanup(self.parameters)

            if self.autoParallel and self.nproc > 1:
                self.setState("Reconstructing")
                self.autoReconstruct()

            if self.nproc > 0:
                self.additionalReconstruct(self.parameters)

            self.setState("Cleaning")
            self.cleanup(self.parameters)
            self.setState("Finished")
        else:
            # stopJob() set ordinaryEnd to False
            self.setState("Suspended")

        if path.exists(self.stopFile()):
            unlink(self.stopFile())
        if path.exists(self.checkpointFile()):
            unlink(self.checkpointFile())

    def casedir(self):
        """Returns the actual directory of the case
        To be overridden if appropriate"""
        if self.arrayJob:
            return "%s.%05d" % (self.basename, self.taskID)
        else:
            return self.basename

    def casename(self):
        """Returns just the name of the case"""
        return path.basename(self.casedir())

    def execute(self, cmd):
        """Execute a shell command in the case directory. No checking done
        @param cmd: the command as a string"""
        oldDir = os.getcwd()
        self.message("Changing directory to", self.casedir())
        os.chdir(self.casedir())
        self.message("Executing", cmd)
        try:
            retcode = subprocess.call(cmd, shell=True)
            if retcode < 0:
                self.message(cmd, "was terminated by signal", -retcode)
            else:
                self.message(cmd, "returned", retcode)
        except OSError:
            e = sys.exc_info()[1]  # Needed because python 2.5 does not support 'as e'
            self.message(cmd, "Execution failed:", e)

        self.message("Executiong of", cmd, "ended")
        self.message("Changing directory back to", oldDir)
        os.chdir(oldDir)

    def foamRun(self, application,
                args=None,
                foamArgs=None,
                steady=False,
                multiRegion=None,
                progress=False,
                noLog=False):
        """Runs a foam utility on the case.
        If it is a parallel job and the grid has
        already been decomposed (and not yet reconstructed) it is run in
        parallel
        @param application: the Foam-Application that is to be run
        @param foamArgs: A list if with the additional arguments for the
        Foam-Application
        @param args: A list with additional arguments for the Runner-object
        @param steady: Use the steady-runner
        @param multiRegion: Run this on multiple regions (if None: I don't have an opinion on this)
        @param progress: Only output the time and nothing else
        @param noLog: Do not generate a logfile"""

        # replace the old mutable []-defaults with fresh lists per call
        if args is None:
            args = []
        if foamArgs is None:
            foamArgs = []

        arglist = args[:]
        arglist += ["--job-id=%s" % self.fullJobId()]
        for k, v in iteritems(self.parameters):
            arglist += ["--parameter=%s:%s" % (str(k), str(v))]

        if self.isDecomposed and self.nproc > 1:
            arglist += ["--procnr=%d" % self.nproc,
                        "--machinefile=%s" % self.hostfile]

        if progress:
            arglist += ["--progress"]
        if noLog:
            arglist += ["--no-log"]

        if self.multiRegion:
            # None means "no opinion": default to all regions
            if multiRegion is None or multiRegion == True:
                arglist += ["--all-regions"]
        elif multiRegion and not self.multiRegion:
            warning("This is not a multi-region case, so trying to run stuff multi-region won't do any good")

        if self.restarted:
            arglist += ["--restart"]

        arglist += [application]
        if oldApp():
            arglist += [".", self.casename()]
        else:
            arglist += ["-case", self.casename()]

        arglist += foamArgs

        self.message("Executing", arglist)

        if steady:
            self.message("Running Steady")
            runner = SteadyRunner(args=arglist)
        else:
            runner = Runner(args=arglist)

    def autoDecompose(self):
        """Automatically decomposes the grid with a metis-algorithm"""

        if path.isdir(path.join(self.casedir(), "processor0")):
            warning("A processor directory already exists. There might be a problem")

        defaultMethod = "metis"

        # scotch replaced metis as the bundled method from OpenFOAM 1.6 on
        if getFoamVersion() >= (1, 6):
            defaultMethod = "scotch"

        args = ["--method=" + defaultMethod,
                "--clear",
                self.casename(),
                self.nproc,
                "--job-id=%s" % self.fullJobId()]

        if self.multiRegion:
            args.append("--all-regions")

        deco = Decomposer(args=args)

    def autoReconstruct(self):
        """Default reconstruction of a parallel run"""

        if self.doAutoReconstruct:
            self.isDecomposed = False

            self.foamRun("reconstructPar",
                         args=["--logname=ReconstructPar"])
        else:
            self.message("No reconstruction (because asked to)")

    def setup(self, parameters):
        """Set up the job. Called in the beginning if the
        job has not been restarted

        Usual tasks include grid conversion/setup, mesh decomposition etc

        @param parameters: a dictionary with parameters"""

        pass

    def postDecomposeSetup(self, parameters):
        """Additional setup, to be executed when the grid is already decomposed

        Usually for tasks that can be done on a decomposed grid

        @param parameters: a dictionary with parameters"""

        pass

    def run(self, parameters):
        """Run the actual job. Usually the solver.
        @param parameters: a dictionary with parameters"""

        pass

    def preReconstructCleanup(self, parameters):
        """Additional cleanup, to be executed when the grid is still decomposed

        Usually for tasks that can be done on a decomposed grid

        @param parameters: a dictionary with parameters"""

        pass

    def cleanup(self, parameters):
        """Clean up after a job
        @param parameters: a dictionary with parameters"""

        pass

    def additionalReconstruct(self, parameters):
        """Additional reconstruction of parallel runs (Stuff that the
        OpenFOAM-reconstructPar doesn't do
        @param parameters: a dictionary with parameters"""

        pass

    def taskParameters(self, id):
        """Parameters for a specific task
        @param id: the id of the task
        @return: a dictionary with parameters for this task"""

        error("taskParameter not implemented. Not a parameterized job")

        return {}

    def writeCheckpoint(self):
        """Reacts to the checkpoint-file: tells the running solver (via a
        'write'-file in the case) to write the current fields to disk"""
        if self.listenToTimer:
            with open(path.join(self.basename, "write"), "w") as f:
                f.write("Jetzt will ich's wissen")
            unlink(self.checkpointFile())
        else:
            warning("I'm not listening to your callbacks")

        # NOTE(review): this timer is created but never start()ed -
        # presumably checkForMessageFromAbove re-arms itself; confirm
        # whether this assignment is dead code
        self.timer = Timer(1., checkForMessageFromAbove, args=[self])

    def stopJob(self):
        """Reacts to the stop-file: tells the running solver (via a
        'stop'-file in the case) to write and stop, and marks the job
        so that doIt() suspends instead of finishing ordinarily"""
        if self.listenToTimer:
            self.ordinaryEnd = False
            with open(path.join(self.basename, "stop"), "w") as f:
                f.write("Geh z'haus")
            unlink(self.stopFile())
        else:
            warning("I'm not listening to your callbacks")
444
class SolverJob(ClusterJob):
    """A Cluster-Job that executes a solver. It implements the run-function.
    If a template-case is specified, the case is copied"""

    def __init__(self, basename, solver,
                 template=None,
                 cloneParameters=None,
                 arrayJob=False,
                 hardRestart=False,
                 autoParallel=True,
                 doAutoReconstruct=None,
                 foamVersion=None,
                 compileOption=None,
                 useFoamMPI=False,
                 steady=False,
                 multiRegion=False,
                 parameters=None,
                 progress=False,
                 solverProgress=False,
                 solverNoLog=False,
                 isDecomposed=False):
        """@param template: Name of the template-case. It is assumed that
        it resides in the same directory as the actual case
        @param cloneParameters: a list with additional parameters for the
        CloneCase-object that copies the template (None means no extra
        parameters)
        @param solverProgress: Only writes the current time of the solver"""

        # The old []/{} defaults were mutable and shared between calls;
        # in particular cloneParameters was appended to below, which
        # polluted the default list for every later instance
        if cloneParameters is None:
            cloneParameters = []
        if parameters is None:
            parameters = {}

        ClusterJob.__init__(self, basename,
                            arrayJob=arrayJob,
                            hardRestart=hardRestart,
                            autoParallel=autoParallel,
                            doAutoReconstruct=doAutoReconstruct,
                            foamVersion=foamVersion,
                            compileOption=compileOption,
                            useFoamMPI=useFoamMPI,
                            multiRegion=multiRegion,
                            parameters=parameters,
                            isDecomposed=isDecomposed)
        self.solver = solver
        self.steady = steady
        if template is not None and not self.restarted:
            template = path.join(path.dirname(self.casedir()), template)
            if path.abspath(basename) == path.abspath(template):
                error("The basename", basename, "and the template", template, "are the same directory")
            if isDecomposed:
                # build a new list instead of += so the caller's list
                # (or the default) is never mutated
                cloneParameters = cloneParameters + ["--parallel"]
            clone = CloneCase(
                args=cloneParameters + [template, self.casedir(), "--follow-symlinks"])
        self.solverProgress = solverProgress
        self.solverNoLog = solverNoLog

    def run(self, parameters):
        """Runs the configured solver on the case
        @param parameters: a dictionary with parameters (unused here,
        the solver options were fixed in the constructor)"""
        self.foamRun(self.solver,
                     steady=self.steady,
                     multiRegion=False,
                     progress=self.solverProgress,
                     noLog=self.solverNoLog)
502 503 # Should work with Python3 and Python2 504