Package PyFoam :: Package Infrastructure :: Module ClusterJob
[hide private]
[frames] | [no frames]

Source Code for Module PyFoam.Infrastructure.ClusterJob

  1  #  ICE Revision: $Id: /local/openfoam/Python/PyFoam/PyFoam/Infrastructure/Logging.py 1906 2007-08-28T16:16:19.392553Z bgschaid  $  
  2  """Encapsulates all necessary things for a cluster-job, like setting up, running, restarting""" 
  3   
  4  import os,sys 
  5  from os import path,unlink 
  6  from threading import Thread,Lock,Timer 
  7   
  8  from PyFoam.Applications.Decomposer import Decomposer 
  9  from PyFoam.Applications.Runner import Runner 
 10  from PyFoam.Applications.SteadyRunner import SteadyRunner 
 11  from PyFoam.Applications.CloneCase import CloneCase 
 12  from PyFoam.FoamInformation import changeFoamVersion 
 13  from PyFoam.Error import error,warning 
 14  from PyFoam import configuration as config 
 15  from PyFoam.FoamInformation import oldAppConvention as oldApp 
 16  from PyFoam.RunDictionary.SolutionDirectory import SolutionDirectory 
 17   
def checkForMessageFromAbove(job):
    """Timer callback that polls for control files of a running job.

    Checks once per invocation whether the controlling process asked the
    job to stop or to write a checkpoint, acts on the request, and then
    re-arms itself for another poll one second later.

    @param job: the ClusterJob instance being supervised"""
    # Polling was switched off (job finished or not yet running): let the
    # timer chain die out
    if not job.listenToTimer:
        return

    # A stop-request ends the job; no further polling after that
    if path.exists(job.stopFile()):
        job.stopJob()
        return

    # A checkpoint-request is handled but polling continues
    if path.exists(job.checkpointFile()):
        job.writeCheckpoint()

    # re-arm: schedule the next poll in one second
    nextPoll=Timer(1.,checkForMessageFromAbove,args=[job])
    job.timer=nextPoll
    nextPoll.start()
class ClusterJob:
    """All Cluster-jobs are to be derived from this base-class

    The actual jobs are implemented by overriding methods

    There is a number of variables in this class that are used to
    'communicate' information between the various stages"""

    def __init__(self,
                 basename,
                 arrayJob=False,
                 hardRestart=False,
                 autoParallel=True,
                 doAutoReconstruct=None,
                 foamVersion=None,
                 compileOption=None,
                 useFoamMPI=False,
                 multiRegion=False,
                 isDecomposed=False):
        """Initializes the Job
        @param basename: Basis name of the job
        @param arrayJob: this job is a parameter variation. The tasks
        are identified by their task-id
        @param hardRestart: treat the job as restarted
        @param autoParallel: Parallelization is handled by the base-class
        @param doAutoReconstruct: Automatically reconstruct the case if
        autoParalellel is set. If the value is None then it is looked up from
        the configuration
        @param foamVersion: The foam-Version that is to be used
        @param compileOption: Forces compile-option (usually 'Opt' or 'Debug')
        @param useFoamMPI: Use the OpenMPI supplied with OpenFOAM
        @param multiRegion: This job consists of multiple regions
        @param isDecomposed: Assume that the job is already decomposed"""

        # JOB_ID and JOB_NAME are set by the SGE batch system; without
        # them this is not running under SGE and the job cannot proceed
        if not os.environ.has_key("JOB_ID"):
            error("Not an SGE-job. Environment variable JOB_ID is missing")
        self.jobID=int(os.environ["JOB_ID"])
        self.jobName=os.environ["JOB_NAME"]

        # absolute path of the case; basename is taken relative to the
        # current working directory
        self.basename=path.join(path.abspath(path.curdir),basename)

        # SGE sets RESTARTED to a non-zero value if the job was restarted
        sgeRestarted=False
        if os.environ.has_key("RESTARTED"):
            sgeRestarted=(int(os.environ["RESTARTED"])!=0)

        if sgeRestarted or hardRestart:
            self.restarted=True
        else:
            self.restarted=False

        # fall back to the configured default OpenFOAM-version
        if foamVersion==None:
            foamVersion=config().get("OpenFOAM","Version")

        changeFoamVersion(foamVersion,compileOption=compileOption)

        # changeFoamVersion should have set up the environment; if
        # WM_PROJECT_VERSION is still missing the switch failed
        if not os.environ.has_key("WM_PROJECT_VERSION"):
            error("No OpenFOAM-Version seems to be configured. Set the foamVersion-parameter")

        self.autoParallel=autoParallel

        self.doAutoReconstruct=doAutoReconstruct
        if self.doAutoReconstruct==None:
            self.doAutoReconstruct=config().getboolean("ClusterJob","doAutoReconstruct")

        self.multiRegion=multiRegion

        # machinefile and processor count for a parallel run; defaults
        # describe a serial job
        self.hostfile=None
        self.nproc=1

        # NSLOTS is the number of slots granted by SGE for this job
        if os.environ.has_key("NSLOTS"):
            self.nproc=int(os.environ["NSLOTS"])
            self.message("Running on",self.nproc,"CPUs")
            if self.nproc>1:
                # the machinefile written by the parallel environment is
                # expected in $TMP (PE_HOSTFILE is not used here)
                self.hostfile=path.join(os.environ["TMP"],"machines")
                self.message("Using the machinefile",self.hostfile)
                self.message("Contents of the machinefile:",open(self.hostfile).readlines())

        # ordinaryEnd is cleared by stopJob() when the run is aborted;
        # listenToTimer enables the checkForMessageFromAbove-polling
        self.ordinaryEnd=True
        self.listenToTimer=False

        self.taskID=None
        self.arrayJob=arrayJob

        # for array-jobs SGE identifies the task via SGE_TASK_ID
        if self.arrayJob:
            self.taskID=int(os.environ["SGE_TASK_ID"])

        # unless the OpenFOAM-supplied MPI is to be used, prepend the
        # cluster-specific MPI paths from the configuration
        # NOTE(review): eval() on a configuration value - make sure the
        # configuration file is trusted
        if not useFoamMPI and not foamVersion in eval(config().get("ClusterJob","useFoamMPI",default='[]')):
            self.message("Adding Cluster-specific paths")
            os.environ["PATH"]=config().get("ClusterJob","path")+":"+os.environ["PATH"]
            os.environ["LD_LIBRARY_PATH"]=config().get("ClusterJob","ldpath")+":"+os.environ["LD_LIBRARY_PATH"]

        self.isDecomposed=isDecomposed

    def fullJobId(self):
        """Return a string with the full job-ID (jobID, plus the task-ID
        for array-jobs)"""
        result=str(self.jobID)
        if self.arrayJob:
            result+=":"+str(self.taskID)
        return result

    def message(self,*txt):
        """Prints a message prefixed with a CLUSTERJOB-marker and flushes
        stdout so the message appears immediately in the job output"""
        print "=== CLUSTERJOB: ",
        for t in txt:
            print t,
        print " ==="
        sys.stdout.flush()

    def setState(self,txt):
        """Writes the current job state to the ClusterJobState-file in the
        case directory (so it can be inspected from outside)
        @param txt: the state description"""
        self.message("Setting Job state to",txt)
        fName=path.join(self.casedir(),"ClusterJobState")
        f=open(fName,"w")
        f.write(txt+"\n")
        f.close()

    def jobFile(self):
        """The file with the job information"""
        # name pattern: <jobName>.<jobID>[.<taskID>].pyFoam.clusterjob,
        # located next to the case directory
        jobfile="%s.%d" % (self.jobName,self.jobID)
        if self.arrayJob:
            jobfile+=".%d" % self.taskID
        jobfile+=".pyFoam.clusterjob"
        jobfile=path.join(path.dirname(self.basename),jobfile)

        return jobfile

    def checkpointFile(self):
        """The file that makes the job write a checkpoint"""
        return self.jobFile()+".checkpoint"

    def stopFile(self):
        """The file that makes the job write a checkpoint and end"""
        return self.jobFile()+".stop"

    def doIt(self):
        """The central logic. Runs the job, sets it up etc"""

        # announce this job via its job-file
        f=open(self.jobFile(),"w")
        f.write(path.basename(self.basename)+"\n")
        f.close()

        self.message()
        self.message("Running on directory",self.casename())
        self.message()
        self.setState("Starting up")

        parameters=None
        if self.arrayJob:
            parameters=self.taskParameters(self.taskID)
            self.message("Parameters:",parameters)
        if not self.restarted:
            # fresh run: set up the case and (optionally) decompose it
            self.setState("Setting up")
            self.setup(parameters)
            if self.autoParallel and self.nproc>1 and not self.isDecomposed:
                self.setState("Decomposing")
                self.autoDecompose()

            self.isDecomposed=True

            self.setState("Setting up 2")
            self.postDecomposeSetup(parameters)
        else:
            # restarted run: the case is assumed to be already set up
            # and decomposed
            self.setState("Restarting")

            self.isDecomposed=True

        self.setState("Running")
        # start polling for stop/checkpoint-requests while the solver runs
        self.listenToTimer=True
        self.timer=Timer(1.,checkForMessageFromAbove,args=[self])
        self.timer.start()

        self.run(parameters)
        self.listenToTimer=False

        if path.exists(self.jobFile()):
            unlink(self.jobFile())

        if self.ordinaryEnd:
            # normal termination: reconstruct and clean up
            self.setState("Post Running")
            self.preReconstructCleanup(parameters)

            if self.autoParallel and self.nproc>1:
                self.setState("Reconstructing")
                self.autoReconstruct()

            if self.nproc>0:
                self.additionalReconstruct(parameters)

            self.setState("Cleaning")
            self.cleanup(parameters)
            self.setState("Finished")
        else:
            # the run was ended by a stop-request
            self.setState("Suspended")

        # remove any leftover control files
        if path.exists(self.stopFile()):
            unlink(self.stopFile())
        if path.exists(self.checkpointFile()):
            unlink(self.checkpointFile())

    def casedir(self):
        """Returns the actual directory of the case
        To be overridden if appropriate"""
        # array-jobs get one case directory per task, suffixed with the
        # zero-padded task-ID
        if self.arrayJob:
            return "%s.%05d" % (self.basename,self.taskID)
        else:
            return self.basename

    def casename(self):
        """Returns just the name of the case"""
        return path.basename(self.casedir())

    def foamRun(self,application,
                args=[],
                foamArgs=[],
                steady=False,
                multiRegion=None,
                progress=False,
                noLog=False):
        """Runs a foam utility on the case.
        If it is a parallel job and the grid has
        already been decomposed (and not yet reconstructed) it is run in
        parallel
        @param application: the Foam-Application that is to be run
        @param foamArgs: A list if with the additional arguments for the
        Foam-Application
        @param args: A list with additional arguments for the Runner-object
        @param steady: Use the steady-runner
        @param multiRegion: Run this on multiple regions (if None: I don't have an opinion on this)
        @param progress: Only output the time and nothing else
        @param noLog: Do not generate a logfile"""

        # copy so the (mutable) default argument is never modified
        arglist=args[:]
        arglist+=["--job-id=%s" % self.fullJobId()]

        # run in parallel only while the case is actually decomposed
        if self.isDecomposed and self.nproc>1:
            arglist+=["--procnr=%d" % self.nproc,
                      "--machinefile=%s" % self.hostfile]

        if progress:
            arglist+=["--progress"]
        if noLog:
            arglist+=["--no-log"]

        if self.multiRegion:
            if multiRegion==None or multiRegion==True:
                arglist+=["--all-regions"]
        elif multiRegion and not self.multiRegion:
            warning("This is not a multi-region case, so trying to run stuff multi-region won't do any good")

        if self.restarted:
            arglist+=["--restart"]

        arglist+=[application]
        # old OpenFOAM-versions take '<root> <case>', newer ones '-case <dir>'
        if oldApp():
            arglist+=[".",self.casename()]
        else:
            arglist+=["-case",self.casename()]

        arglist+=foamArgs

        self.message("Executing",arglist)

        if steady:
            self.message("Running Steady")
            runner=SteadyRunner(args=arglist)
        else:
            runner=Runner(args=arglist)

    def autoDecompose(self):
        """Automatically decomposes the grid with a metis-algorithm"""

        if path.isdir(path.join(self.casedir(),"processor0")):
            warning("A processor directory already exists. There might be a problem")
        args=["--method=metis",
              "--clear",
              self.casename(),
              self.nproc,
              "--job-id=%s" % self.fullJobId()]

        if self.multiRegion:
            args.append("--all-regions")

        deco=Decomposer(args=args)

    def autoReconstruct(self):
        """Default reconstruction of a parallel run"""

        if self.doAutoReconstruct:
            # after reconstruction further utilities run serially
            self.isDecomposed=False

            self.foamRun("reconstructPar",
                         args=["--logname=ReconstructPar"])
        else:
            self.message("No reconstruction (because asked to)")

    def setup(self,parameters):
        """Set up the job. Called in the beginning if the
        job has not been restarted

        Usual tasks include grid conversion/setup, mesh decomposition etc

        @param parameters: a dictionary with parameters"""

        pass

    def postDecomposeSetup(self,parameters):
        """Additional setup, to be executed when the grid is already decomposed

        Usually for tasks that can be done on a decomposed grid

        @param parameters: a dictionary with parameters"""

        pass

    def run(self,parameters):
        """Run the actual job. Usually the solver.
        @param parameters: a dictionary with parameters"""

        pass

    def preReconstructCleanup(self,parameters):
        """Additional cleanup, to be executed when the grid is still decomposed

        Usually for tasks that can be done on a decomposed grid

        @param parameters: a dictionary with parameters"""

        pass

    def cleanup(self,parameters):
        """Clean up after a job
        @param parameters: a dictionary with parameters"""

        pass

    def additionalReconstruct(self,parameters):
        """Additional reconstruction of parallel runs (Stuff that the
        OpenFOAM-reconstructPar doesn't do
        @param parameters: a dictionary with parameters"""

        pass

    def taskParameters(self,id):
        """Parameters for a specific task
        @param id: the id of the task
        @return: a dictionary with parameters for this task"""

        error("taskParameter not implemented. Not a parameterized job")

        return {}

    def writeCheckpoint(self):
        """Reacts to a checkpoint-request: writes a 'write'-file into the
        case (presumably picked up by the running solver - verify against
        the Runner implementation) and removes the request file"""
        if self.listenToTimer:
            f=open(path.join(self.basename,"write"),"w")
            f.write("Jetzt will ich's wissen")
            f.close()
            unlink(self.checkpointFile())
        else:
            warning("I'm not listening to your callbacks")

        # NOTE(review): this Timer is created but never start()-ed and is
        # immediately overwritten by checkForMessageFromAbove when it
        # re-arms itself - this line looks like dead code
        self.timer=Timer(1.,checkForMessageFromAbove,args=[self])

    def stopJob(self):
        """Reacts to a stop-request: marks the run as not ordinarily ended,
        writes a 'stop'-file into the case and removes the request file"""
        if self.listenToTimer:
            self.ordinaryEnd=False
            f=open(path.join(self.basename,"stop"),"w")
            f.write("Geh z'haus")
            f.close()
            unlink(self.stopFile())
        else:
            warning("I'm not listening to your callbacks")
class SolverJob(ClusterJob):
    """A Cluster-Job that executes a solver. It implements the run-function.
    If a template-case is specified, the case is copied"""

    def __init__(self,basename,solver,
                 template=None,
                 cloneParameters=[],
                 arrayJob=False,
                 hardRestart=False,
                 autoParallel=True,
                 doAutoReconstruct=None,
                 foamVersion=None,
                 compileOption=None,
                 useFoamMPI=False,
                 steady=False,
                 multiRegion=False,
                 progress=False,
                 solverProgress=False,
                 solverNoLog=False,
                 isDecomposed=False):
        """@param template: Name of the template-case. It is assumed that
        it resides in the same directory as the actual case
        @param cloneParameters: a list with additional parameters for the
        CloneCase-object that copies the template
        @param solverProgress: Only writes the current time of the solver"""

        ClusterJob.__init__(self,basename,
                            arrayJob=arrayJob,
                            hardRestart=hardRestart,
                            autoParallel=autoParallel,
                            doAutoReconstruct=doAutoReconstruct,
                            foamVersion=foamVersion,
                            compileOption=compileOption,
                            useFoamMPI=useFoamMPI,
                            multiRegion=multiRegion,
                            isDecomposed=isDecomposed)
        self.solver=solver
        self.steady=steady
        # only clone the template on a fresh (non-restarted) run
        if template!=None and not self.restarted:
            # the template is expected next to the case directory
            template=path.join(path.dirname(self.casedir()),template)
            if path.abspath(basename)==path.abspath(template):
                error("The basename",basename,"and the template",template,"are the same directory")
            if isDecomposed:
                # also copy the processorX-directories of the template
                cloneParameters+=["--parallel"]
            clone=CloneCase(
                args=cloneParameters+[template,self.casedir(),"--follow-symlinks"])
        self.solverProgress=solverProgress
        self.solverNoLog=solverNoLog

    def run(self,parameters):
        """Runs the solver on the case (the actual work of this job)
        @param parameters: a dictionary with parameters"""
        self.foamRun(self.solver,
                     steady=self.steady,
                     multiRegion=False,
                     progress=self.solverProgress,
                     noLog=self.solverNoLog)