Package PyFoam :: Package Infrastructure :: Module ClusterJob
[hide private]
[frames] | [no frames]

Source Code for Module PyFoam.Infrastructure.ClusterJob

  1  #  ICE Revision: $Id: /local/openfoam/Python/PyFoam/PyFoam/Infrastructure/ClusterJob.py 7722 2012-01-18T17:50:53.943725Z bgschaid  $  
  2  """Encapsulates all necessary things for a cluster-job, like setting 
  3  up, running, restarting""" 
  4   
  5  import os,sys 
  6  from os import path,unlink 
  7  from threading import Thread,Lock,Timer 
  8   
  9  from PyFoam.Applications.Decomposer import Decomposer 
 10  from PyFoam.Applications.Runner import Runner 
 11  from PyFoam.Applications.SteadyRunner import SteadyRunner 
 12  from PyFoam.Applications.CloneCase import CloneCase 
 13  from PyFoam.FoamInformation import changeFoamVersion 
 14  from PyFoam.Error import error,warning 
 15  from PyFoam import configuration as config 
 16  from PyFoam.FoamInformation import oldAppConvention as oldApp 
 17  from PyFoam.RunDictionary.SolutionDirectory import SolutionDirectory 
 18   
def checkForMessageFromAbove(job):
    """Timer callback that polls for the control files of a cluster job.

    Does nothing unless the job is currently listening. A stop-file ends
    the polling after telling the job to stop; otherwise a pending
    checkpoint request is honoured and the one-second poll timer is
    re-armed."""
    if not job.listenToTimer:
        return

    wantsStop = path.exists(job.stopFile())
    if wantsStop:
        job.stopJob()
    else:
        if path.exists(job.checkpointFile()):
            job.writeCheckpoint()
        # re-arm the poll timer for the next check in one second
        job.timer = Timer(1., checkForMessageFromAbove, args=[job])
        job.timer.start()
32 33
class ClusterJob(object):
    """All cluster jobs are to be derived from this base class.

    The actual jobs are implemented by overriding methods.

    There is a number of variables in this class that are used to
    'communicate' information between the various stages."""

    def __init__(self,
                 basename,
                 arrayJob=False,
                 hardRestart=False,
                 autoParallel=True,
                 doAutoReconstruct=None,
                 foamVersion=None,
                 compileOption=None,
                 useFoamMPI=False,
                 multiRegion=False,
                 isDecomposed=False):
        """Initializes the Job
        @param basename: Basis name of the job
        @param arrayJob: this job is a parameter variation. The tasks
        are identified by their task-id
        @param hardRestart: treat the job as restarted
        @param autoParallel: Parallelization is handled by the base-class
        @param doAutoReconstruct: Automatically reconstruct the case if
        autoParallel is set. If the value is None then it is looked up from
        the configuration
        @param foamVersion: The foam-Version that is to be used
        @param compileOption: Forces compile-option (usually 'Opt' or 'Debug')
        @param useFoamMPI: Use the OpenMPI supplied with OpenFOAM
        @param multiRegion: This job consists of multiple regions
        @param isDecomposed: Assume that the job is already decomposed"""

        # 'key in dict' instead of the Python-2-only dict.has_key()
        if "JOB_ID" not in os.environ:
            error("Not an SGE-job. Environment variable JOB_ID is missing")
        self.jobID=int(os.environ["JOB_ID"])
        self.jobName=os.environ["JOB_NAME"]

        self.basename=path.join(path.abspath(path.curdir),basename)

        # SGE sets RESTARTED to a non-zero value if the scheduler restarted us
        sgeRestarted=False
        if "RESTARTED" in os.environ:
            sgeRestarted=(int(os.environ["RESTARTED"])!=0)

        self.restarted=bool(sgeRestarted or hardRestart)

        if foamVersion is None:
            foamVersion=config().get("OpenFOAM","Version")

        changeFoamVersion(foamVersion,compileOption=compileOption)

        if "WM_PROJECT_VERSION" not in os.environ:
            error("No OpenFOAM-Version seems to be configured. Set the foamVersion-parameter")

        self.autoParallel=autoParallel

        self.doAutoReconstruct=doAutoReconstruct
        if self.doAutoReconstruct is None:
            self.doAutoReconstruct=config().getboolean("ClusterJob","doAutoReconstruct")

        self.multiRegion=multiRegion

        self.hostfile=None
        self.nproc=1

        if "NSLOTS" in os.environ:
            self.nproc=int(os.environ["NSLOTS"])
            self.message("Running on",self.nproc,"CPUs")
            if self.nproc>1:
                # SGE writes the machine file into the job's TMP directory
                self.hostfile=path.join(os.environ["TMP"],"machines")
                self.message("Using the machinefile",self.hostfile)
                self.message("Contents of the machinefile:",open(self.hostfile).readlines())

        self.ordinaryEnd=True
        self.listenToTimer=False

        self.taskID=None
        self.arrayJob=arrayJob

        if self.arrayJob:
            self.taskID=int(os.environ["SGE_TASK_ID"])

        # NOTE(review): eval() of a configuration value - the configuration
        # file is assumed to be trusted input
        if not useFoamMPI and not foamVersion in eval(config().get("ClusterJob","useFoamMPI",default='[]')):
            # prepend special paths for the cluster
            self.message("Adding Cluster-specific paths")
            os.environ["PATH"]=config().get("ClusterJob","path")+":"+os.environ["PATH"]
            os.environ["LD_LIBRARY_PATH"]=config().get("ClusterJob","ldpath")+":"+os.environ["LD_LIBRARY_PATH"]

        self.isDecomposed=isDecomposed

    def fullJobId(self):
        """Return a string with the full job-ID (jobID or jobID:taskID)"""
        result=str(self.jobID)
        if self.arrayJob:
            result+=":"+str(self.taskID)
        return result

    def message(self,*txt):
        """Print a marked status message and flush stdout immediately.
        Output is identical to the old per-item 'print x,' sequence but
        works on Python 2 and 3 and is emitted as one write."""
        print(" ".join(["=== CLUSTERJOB: "]+[str(t) for t in txt]+[" ==="]))
        sys.stdout.flush()

    def setState(self,txt):
        """Write the current job state into the ClusterJobState file
        in the case directory so that it can be monitored from outside
        @param txt: the state description"""
        self.message("Setting Job state to",txt)
        fName=path.join(self.casedir(),"ClusterJobState")
        f=open(fName,"w")
        f.write(txt+"\n")
        f.close()

    def jobFile(self):
        """The file with the job information"""
        jobfile="%s.%d" % (self.jobName,self.jobID)
        if self.arrayJob:
            jobfile+=".%d" % self.taskID
        jobfile+=".pyFoam.clusterjob"
        jobfile=path.join(path.dirname(self.basename),jobfile)

        return jobfile

    def checkpointFile(self):
        """The file that makes the job write a checkpoint"""
        return self.jobFile()+".checkpoint"

    def stopFile(self):
        """The file that makes the job write a checkpoint and end"""
        return self.jobFile()+".stop"

    def doIt(self):
        """The central logic. Runs the job, sets it up etc"""

        # marker file that tells the outside world which case this job runs
        f=open(self.jobFile(),"w")
        f.write(path.basename(self.basename)+"\n")
        f.close()

        self.message()
        self.message("Running on directory",self.casename())
        self.message()
        self.setState("Starting up")

        parameters=None
        if self.arrayJob:
            parameters=self.taskParameters(self.taskID)
            self.message("Parameters:",parameters)
        if not self.restarted:
            self.setState("Setting up")
            self.setup(parameters)
            if self.autoParallel and self.nproc>1 and not self.isDecomposed:
                self.setState("Decomposing")
                self.autoDecompose()

            self.isDecomposed=True

            self.setState("Setting up 2")
            self.postDecomposeSetup(parameters)
        else:
            self.setState("Restarting")

        self.isDecomposed=True

        self.setState("Running")
        # start listening for stop/checkpoint control files
        self.listenToTimer=True
        self.timer=Timer(1.,checkForMessageFromAbove,args=[self])
        self.timer.start()

        self.run(parameters)
        self.listenToTimer=False

        if path.exists(self.jobFile()):
            unlink(self.jobFile())

        if self.ordinaryEnd:
            self.setState("Post Running")
            self.preReconstructCleanup(parameters)

            if self.autoParallel and self.nproc>1:
                self.setState("Reconstructing")
                self.autoReconstruct()

            if self.nproc>0:
                self.additionalReconstruct(parameters)

            self.setState("Cleaning")
            self.cleanup(parameters)
            self.setState("Finished")
        else:
            # stopJob() cleared ordinaryEnd: job was suspended, not finished
            self.setState("Suspended")

        # remove any leftover control files
        if path.exists(self.stopFile()):
            unlink(self.stopFile())
        if path.exists(self.checkpointFile()):
            unlink(self.checkpointFile())

    def casedir(self):
        """Returns the actual directory of the case
        To be overridden if appropriate"""
        if self.arrayJob:
            return "%s.%05d" % (self.basename,self.taskID)
        else:
            return self.basename

    def casename(self):
        """Returns just the name of the case"""
        return path.basename(self.casedir())

    def foamRun(self,application,
                args=None,
                foamArgs=None,
                steady=False,
                multiRegion=None,
                progress=False,
                noLog=False):
        """Runs a foam utility on the case.
        If it is a parallel job and the grid has
        already been decomposed (and not yet reconstructed) it is run in
        parallel
        @param application: the Foam-Application that is to be run
        @param foamArgs: A list with the additional arguments for the
        Foam-Application
        @param args: A list with additional arguments for the Runner-object
        @param steady: Use the steady-runner
        @param multiRegion: Run this on multiple regions (if None: I don't have an opinion on this)
        @param progress: Only output the time and nothing else
        @param noLog: Do not generate a logfile"""

        # None-sentinels instead of mutable default arguments
        arglist=[] if args is None else args[:]
        arglist+=["--job-id=%s" % self.fullJobId()]

        if self.isDecomposed and self.nproc>1:
            arglist+=["--procnr=%d" % self.nproc,
                      "--machinefile=%s" % self.hostfile]

        if progress:
            arglist+=["--progress"]
        if noLog:
            arglist+=["--no-log"]

        if self.multiRegion:
            # None means "no opinion" -> default to all regions
            if multiRegion is None or multiRegion:
                arglist+=["--all-regions"]
        elif multiRegion:
            warning("This is not a multi-region case, so trying to run stuff multi-region won't do any good")

        if self.restarted:
            arglist+=["--restart"]

        arglist+=[application]
        if oldApp():
            # pre-1.5 OpenFOAM applications take <root> <case> arguments
            arglist+=[".",self.casename()]
        else:
            arglist+=["-case",self.casename()]

        if foamArgs is not None:
            arglist+=foamArgs

        self.message("Executing",arglist)

        # the Runner/SteadyRunner constructors execute the application
        if steady:
            self.message("Running Steady")
            SteadyRunner(args=arglist)
        else:
            Runner(args=arglist)

    def autoDecompose(self):
        """Automatically decomposes the grid with a metis-algorithm"""

        if path.isdir(path.join(self.casedir(),"processor0")):
            warning("A processor directory already exists. There might be a problem")
        args=["--method=metis",
              "--clear",
              self.casename(),
              self.nproc,
              "--job-id=%s" % self.fullJobId()]

        if self.multiRegion:
            args.append("--all-regions")

        # the Decomposer constructor performs the decomposition
        Decomposer(args=args)

    def autoReconstruct(self):
        """Default reconstruction of a parallel run"""

        if self.doAutoReconstruct:
            self.isDecomposed=False

            self.foamRun("reconstructPar",
                         args=["--logname=ReconstructPar"])
        else:
            self.message("No reconstruction (because asked to)")

    def setup(self,parameters):
        """Set up the job. Called in the beginning if the
        job has not been restarted

        Usual tasks include grid conversion/setup, mesh decomposition etc

        @param parameters: a dictionary with parameters"""

        pass

    def postDecomposeSetup(self,parameters):
        """Additional setup, to be executed when the grid is already decomposed

        Usually for tasks that can be done on a decomposed grid

        @param parameters: a dictionary with parameters"""

        pass

    def run(self,parameters):
        """Run the actual job. Usually the solver.
        @param parameters: a dictionary with parameters"""

        pass

    def preReconstructCleanup(self,parameters):
        """Additional cleanup, to be executed when the grid is still decomposed

        Usually for tasks that can be done on a decomposed grid

        @param parameters: a dictionary with parameters"""

        pass

    def cleanup(self,parameters):
        """Clean up after a job
        @param parameters: a dictionary with parameters"""

        pass

    def additionalReconstruct(self,parameters):
        """Additional reconstruction of parallel runs (stuff that the
        OpenFOAM-reconstructPar doesn't do)
        @param parameters: a dictionary with parameters"""

        pass

    def taskParameters(self,id):
        """Parameters for a specific task
        @param id: the id of the task
        @return: a dictionary with parameters for this task"""

        error("taskParameter not implemented. Not a parameterized job")

        return {}

    def writeCheckpoint(self):
        """Make the running solver write a checkpoint (a 'write' file in
        the case directory) and remove the checkpoint request file"""
        if self.listenToTimer:
            f=open(path.join(self.basename,"write"),"w")
            f.write("Jetzt will ich's wissen")
            f.close()
            unlink(self.checkpointFile())
        else:
            warning("I'm not listening to your callbacks")

        # NOTE(review): this timer is created but never started here; the
        # actual re-arming happens in checkForMessageFromAbove
        self.timer=Timer(1.,checkForMessageFromAbove,args=[self])

    def stopJob(self):
        """Make the running solver stop gracefully (a 'stop' file in the
        case directory) and mark the job as not ordinarily ended"""
        if self.listenToTimer:
            self.ordinaryEnd=False
            f=open(path.join(self.basename,"stop"),"w")
            f.write("Geh z'haus")
            f.close()
            unlink(self.stopFile())
        else:
            warning("I'm not listening to your callbacks")
class SolverJob(ClusterJob):
    """A cluster job that executes a solver. It implements the run-function.
    If a template-case is specified, the case is copied"""

    def __init__(self,basename,solver,
                 template=None,
                 cloneParameters=None,
                 arrayJob=False,
                 hardRestart=False,
                 autoParallel=True,
                 doAutoReconstruct=None,
                 foamVersion=None,
                 compileOption=None,
                 useFoamMPI=False,
                 steady=False,
                 multiRegion=False,
                 progress=False,
                 solverProgress=False,
                 solverNoLog=False,
                 isDecomposed=False):
        """@param template: Name of the template-case. It is assumed that
        it resides in the same directory as the actual case
        @param cloneParameters: a list with additional parameters for the
        CloneCase-object that copies the template
        @param solverProgress: Only writes the current time of the solver"""

        ClusterJob.__init__(self,basename,
                            arrayJob=arrayJob,
                            hardRestart=hardRestart,
                            autoParallel=autoParallel,
                            doAutoReconstruct=doAutoReconstruct,
                            foamVersion=foamVersion,
                            compileOption=compileOption,
                            useFoamMPI=useFoamMPI,
                            multiRegion=multiRegion,
                            isDecomposed=isDecomposed)
        self.solver=solver
        self.steady=steady
        # copy into a fresh list: the old mutable default argument was
        # mutated in place by '+=' below and accumulated '--parallel'
        # entries across instantiations
        cloneParameters=[] if cloneParameters is None else list(cloneParameters)
        if template is not None and not self.restarted:
            template=path.join(path.dirname(self.casedir()),template)
            if path.abspath(basename)==path.abspath(template):
                error("The basename",basename,"and the template",template,"are the same directory")
            if isDecomposed:
                cloneParameters+=["--parallel"]
            # the CloneCase constructor performs the copy
            CloneCase(
                args=cloneParameters+[template,self.casedir(),"--follow-symlinks"])
        self.solverProgress=solverProgress
        self.solverNoLog=solverNoLog

    def run(self,parameters):
        """Run the solver on the case (parallel-aware via foamRun)
        @param parameters: a dictionary with parameters"""
        self.foamRun(self.solver,
                     steady=self.steady,
                     multiRegion=False,
                     progress=self.solverProgress,
                     noLog=self.solverNoLog)