Package PyFoam :: Package Infrastructure :: Module ClusterJob
[hide private]
[frames] | [no frames]

Source Code for Module PyFoam.Infrastructure.ClusterJob

  1  #  ICE Revision: $Id: /local/openfoam/Python/PyFoam/PyFoam/Infrastructure/Logging.py 1906 2007-08-28T16:16:19.392553Z bgschaid  $  
  2  """Encapsulates all necessary things for a cluster-job, like setting up, running, restarting""" 
  3   
  4  import os,sys 
  5  from os import path,unlink 
  6  from threading import Thread,Lock,Timer 
  7   
  8  from PyFoam.Applications.Decomposer import Decomposer 
  9  from PyFoam.Applications.Runner import Runner 
 10  from PyFoam.Applications.SteadyRunner import SteadyRunner 
 11  from PyFoam.Applications.CloneCase import CloneCase 
 12  from PyFoam.FoamInformation import changeFoamVersion 
 13  from PyFoam.Error import error,warning 
 14  from PyFoam import configuration as config 
 15  from PyFoam.FoamInformation import oldAppConvention as oldApp 
 16   
def checkForMessageFromAbove(job):
    """Timer-callback that polls for control-files next to the job-file.

    A stop-file ends the job, a checkpoint-file makes it write a
    checkpoint.  As long as the job is still listening (and no stop was
    requested) the poll is re-armed after one second.
    @param job: the ClusterJob that is being watched"""
    if job.listenToTimer:
        if path.exists(job.stopFile()):
            # stop requested: act and do NOT re-arm the timer
            job.stopJob()
        else:
            if path.exists(job.checkpointFile()):
                job.writeCheckpoint()
            job.timer=Timer(1.,checkForMessageFromAbove,args=[job])
            job.timer.start()
class ClusterJob:
    """Base class of all cluster-jobs.

    Concrete jobs are implemented by deriving from this class and
    overriding the hook-methods.

    A number of instance variables are used to 'communicate'
    information between the various stages of the job."""
40 - def __init__(self,basename, 41 arrayJob=False, 42 hardRestart=False, 43 autoParallel=True, 44 foamVersion=None, 45 useFoamMPI=False, 46 multiRegion=False):
47 """Initializes the Job 48 @param basename: Basis name of the job 49 @param arrayJob: this job is a parameter variation. The tasks 50 are identified by their task-id 51 @param hardRestart: treat the job as restarted 52 @param autoParallel: Parallelization is handled by the base-class 53 @param foamVersion: The foam-Version that is to be used 54 @param useFoamMPI: Use the OpenMPI supplied with OpenFOAM 55 @param multiRegion: This job consists of multiple regions""" 56 57 # print os.environ 58 59 if not os.environ.has_key("JOB_ID"): 60 error("Not an SGE-job. Environment variable JOB_ID is missing") 61 self.jobID=int(os.environ["JOB_ID"]) 62 self.jobName=os.environ["JOB_NAME"] 63 64 self.basename=path.join(path.abspath(path.curdir),basename) 65 66 sgeRestarted=False 67 if os.environ.has_key("RESTARTED"): 68 sgeRestarted=(int(os.environ["RESTARTED"])!=0) 69 70 if sgeRestarted or hardRestart: 71 self.restarted=True 72 else: 73 self.restarted=False 74 75 if foamVersion==None: 76 foamVersion=config().get("OpenFOAM","Version") 77 78 changeFoamVersion(foamVersion) 79 80 if not os.environ.has_key("WM_PROJECT_VERSION"): 81 error("No OpenFOAM-Version seems to be configured. 
Set the foamVersion-parameter") 82 83 self.autoParallel=autoParallel 84 self.multiRegion=multiRegion 85 86 self.hostfile=None 87 self.nproc=1 88 89 if os.environ.has_key("NSLOTS"): 90 self.nproc=int(os.environ["NSLOTS"]) 91 self.message("Running on",self.nproc,"CPUs") 92 if self.nproc>1: 93 # self.hostfile=os.environ["PE_HOSTFILE"] 94 self.hostfile=path.join(os.environ["TMP"],"machines") 95 self.message("Using the machinefile",self.hostfile) 96 self.message("Contents of the machinefile:",open(self.hostfile).readlines()) 97 98 self.ordinaryEnd=True 99 self.listenToTimer=False 100 101 self.taskID=None 102 self.arrayJob=arrayJob 103 104 if self.arrayJob: 105 self.taskID=int(os.environ["SGE_TASK_ID"]) 106 107 if not useFoamMPI and not foamVersion in eval(config().get("ClusterJob","useFoamMPI",default='[]')): 108 ## prepend special paths for the cluster 109 self.message("Adding Cluster-specific paths") 110 os.environ["PATH"]=config().get("ClusterJob","path")+":"+os.environ["PATH"] 111 os.environ["LD_LIBRARY_PATH"]=config().get("ClusterJob","ldpath")+":"+os.environ["LD_LIBRARY_PATH"] 112 113 self.isDecomposed=False
114
115 - def fullJobId(self):
116 """Return a string with the full job-ID""" 117 result=str(self.jobID) 118 if self.arrayJob: 119 result+=":"+str(self.taskID) 120 return result
121
    def message(self,*txt):
        """Print a message framed by '===' markers and flush stdout so
        that it shows up immediately in the cluster log-file
        @param txt: the parts of the message; printed space-separated"""
        print "=== CLUSTERJOB: ",
        for t in txt:
            print t,
        print " ==="
        sys.stdout.flush()
129 - def setState(self,txt):
130 self.message("Setting Job state to",txt) 131 fName=path.join(self.casedir(),"ClusterJobState") 132 f=open(fName,"w") 133 f.write(txt+"\n") 134 f.close()
135
136 - def jobFile(self):
137 """The file with the job information""" 138 jobfile="%s.%d" % (self.jobName,self.jobID) 139 if self.arrayJob: 140 jobfile+=".%d" % self.taskID 141 jobfile+=".pyFoam.clusterjob" 142 jobfile=path.join(path.dirname(self.basename),jobfile) 143 144 return jobfile
145
146 - def checkpointFile(self):
147 """The file that makes the job write a checkpoint""" 148 return self.jobFile()+".checkpoint"
149
150 - def stopFile(self):
151 """The file that makes the job write a checkpoint and end""" 152 return self.jobFile()+".stop"
153
    def doIt(self):
        """The central logic. Runs the job, sets it up etc"""

        # announce the job via its job-file (removed again on a clean end)
        f=open(self.jobFile(),"w")
        f.write(path.basename(self.basename)+"\n")
        f.close()

        self.message()
        self.message("Running on directory",self.casename())
        self.message()
        self.setState("Starting up")

        # for array-jobs fetch the parameters belonging to this task
        parameters=None
        if self.arrayJob:
            parameters=self.taskParameters(self.taskID)
            self.message("Parameters:",parameters)
        if not self.restarted:
            # fresh run: set the case up and (optionally) decompose it
            self.setState("Setting up")
            self.setup(parameters)
            if self.autoParallel and self.nproc>1:
                self.setState("Decomposing")
                self.autoDecompose()

            self.isDecomposed=True

            self.setState("Setting up 2")
            self.postDecomposeSetup(parameters)
        else:
            # restarted run: the case is assumed to be set up already
            self.setState("Restarting")

            self.isDecomposed=True

        self.setState("Running")
        # start polling for stop/checkpoint control-files while running
        self.listenToTimer=True
        self.timer=Timer(1.,checkForMessageFromAbove,args=[self])
        self.timer.start()

        self.run(parameters)
        self.listenToTimer=False

        if path.exists(self.jobFile()):
            unlink(self.jobFile())

        if self.ordinaryEnd:
            # normal end: reconstruct the parallel run and clean up
            self.setState("Post Running")
            self.preReconstructCleanup(parameters)

            self.isDecomposed=False

            if self.autoParallel and self.nproc>1:
                self.setState("Reconstructing")
                self.autoReconstruct()

            if self.nproc>0:
                self.additionalReconstruct(parameters)

            self.setState("Cleaning")
            self.cleanup(parameters)
            self.setState("Finished")
        else:
            # stopJob() was triggered from outside: skip reconstruction
            self.setState("Suspended")

        # remove any left-over control-files
        if path.exists(self.stopFile()):
            unlink(self.stopFile())
        if path.exists(self.checkpointFile()):
            unlink(self.checkpointFile())
221 - def casedir(self):
222 """Returns the actual directory of the case 223 To be overridden if appropriate""" 224 if self.arrayJob: 225 return "%s.%05d" % (self.basename,self.taskID) 226 else: 227 return self.basename
228
229 - def casename(self):
230 """Returns just the name of the case""" 231 return path.basename(self.casedir())
232
233 - def foamRun(self,application, 234 args=[], 235 foamArgs=[], 236 steady=False, 237 multiRegion=None, 238 progress=False, 239 noLog=False):
240 """Runs a foam utility on the case. 241 If it is a parallel job and the grid has 242 already been decomposed (and not yet reconstructed) it is run in 243 parallel 244 @param application: the Foam-Application that is to be run 245 @param foamArgs: A list if with the additional arguments for the 246 Foam-Application 247 @param args: A list with additional arguments for the Runner-object 248 @param steady: Use the steady-runner 249 @param multiRegion: Run this on multiple regions (if None: I don't have an opinion on this) 250 @param progress: Only output the time and nothing else 251 @param noLog: Do not generate a logfile""" 252 253 arglist=args[:] 254 arglist+=["--job-id=%s" % self.fullJobId()] 255 256 if self.isDecomposed and self.nproc>1: 257 arglist+=["--procnr=%d" % self.nproc, 258 "--machinefile=%s" % self.hostfile] 259 if progress: 260 arglist+=["--progress"] 261 if noLog: 262 arglist+=["--no-log"] 263 264 if self.multiRegion: 265 if multiRegion==None or multiRegion==True: 266 arglist+=["--all-regions"] 267 elif multiRegion and not self.multiRegion: 268 warning("This is not a multi-region case, so trying to run stuff multi-region won't do any good") 269 270 if self.restarted: 271 arglist+=["--restart"] 272 273 arglist+=[application] 274 if oldApp(): 275 arglist+=[".",self.casename()] 276 else: 277 arglist+=["-case",self.casename()] 278 279 arglist+=foamArgs 280 281 self.message("Executing",arglist) 282 283 if steady: 284 self.message("Running Steady") 285 runner=SteadyRunner(args=arglist) 286 else: 287 runner=Runner(args=arglist)
288
289 - def autoDecompose(self):
290 """Automatically decomposes the grid with a metis-algorithm""" 291 292 if path.isdir(path.join(self.casedir(),"processor0")): 293 warning("A processor directory already exists. There might be a problem") 294 args=["--method=metis", 295 "--clear", 296 self.casename(), 297 self.nproc, 298 "--job-id=%s" % self.fullJobId()] 299 300 if self.multiRegion: 301 args.append("--all-regions") 302 303 deco=Decomposer(args=args)
304
305 - def autoReconstruct(self):
306 """Default reconstruction of a parallel run""" 307 308 self.foamRun("reconstructPar", 309 args=["--logname=ReconstructPar"])
310
311 - def setup(self,parameters):
312 """Set up the job. Called in the beginning if the 313 job has not been restarted 314 315 Usual tasks include grid conversion/setup, mesh decomposition etc 316 317 @param parameters: a dictionary with parameters""" 318 319 pass
320
321 - def postDecomposeSetup(self,parameters):
322 """Additional setup, to be executed when the grid is already decomposed 323 324 Usually for tasks that can be done on a decomposed grid 325 326 @param parameters: a dictionary with parameters""" 327 328 pass
329
330 - def run(self,parameters):
331 """Run the actual job. Usually the solver. 332 @param parameters: a dictionary with parameters""" 333 334 pass
335
336 - def preReconstructCleanup(self,parameters):
337 """Additional cleanup, to be executed when the grid is still decomposed 338 339 Usually for tasks that can be done on a decomposed grid 340 341 @param parameters: a dictionary with parameters""" 342 343 pass
344
345 - def cleanup(self,parameters):
346 """Clean up after a job 347 @param parameters: a dictionary with parameters""" 348 349 pass
350
351 - def additionalReconstruct(self,parameters):
352 """Additional reconstruction of parallel runs (Stuff that the 353 OpenFOAM-reconstructPar doesn't do 354 @param parameters: a dictionary with parameters""" 355 356 pass
357
    def taskParameters(self,id):
        """Parameters for a specific task.
        Meant to be overridden by parameterized (array) jobs
        @param id: the id of the task
        @return: a dictionary with parameters for this task"""

        # error() aborts; the return below is only reached if a
        # subclass forgot to override but error() was made non-fatal
        error("taskParameter not implemented. Not a parameterized job")

        return {}
    def writeCheckpoint(self):
        """React to the checkpoint control-file: drop a 'write'-file
        into the case directory (presumably picked up by the running
        solver-wrapper to write the current time-step - TODO confirm)
        and remove the checkpoint control-file"""
        if self.listenToTimer:
            f=open(path.join(self.basename,"write"),"w")
            f.write("Jetzt will ich's wissen")
            f.close()
            unlink(self.checkpointFile())
        else:
            warning("I'm not listening to your callbacks")

        # NOTE(review): this Timer is created but never started, and
        # checkForMessageFromAbove re-arms its own timer right after this
        # call returns - this line looks like dead code; confirm before
        # removing
        self.timer=Timer(1.,checkForMessageFromAbove,args=[self])
378 - def stopJob(self):
379 if self.listenToTimer: 380 self.ordinaryEnd=False 381 f=open(path.join(self.basename,"stop"),"w") 382 f.write("Geh z'haus") 383 f.close() 384 unlink(self.stopFile()) 385 else: 386 warning("I'm not listening to your callbacks")
387
class SolverJob(ClusterJob):
    """A Cluster-Job that executes a solver. It implements the run-function.
    If a template-case is specified, the case is copied"""

    def __init__(self,basename,solver,
                 template=None,
                 cloneParameters=None,
                 arrayJob=False,
                 hardRestart=False,
                 autoParallel=True,
                 foamVersion=None,
                 useFoamMPI=False,
                 steady=False,
                 multiRegion=False,
                 progress=False,
                 solverProgress=False,
                 solverNoLog=False):
        """@param solver: the name of the solver-application to run
        @param template: Name of the template-case. It is assumed that
        it resides in the same directory as the actual case
        @param cloneParameters: a list with additional parameters for the
        CloneCase-object that copies the template
        @param steady: use the SteadyRunner to run the solver
        @param solverProgress: Only writes the current time of the solver
        @param solverNoLog: do not generate a logfile for the solver"""

        # None-sentinel instead of a mutable default argument
        if cloneParameters is None:
            cloneParameters=[]

        ClusterJob.__init__(self,basename,
                            arrayJob=arrayJob,
                            hardRestart=hardRestart,
                            autoParallel=autoParallel,
                            foamVersion=foamVersion,
                            useFoamMPI=useFoamMPI,
                            multiRegion=multiRegion)
        self.solver=solver
        self.steady=steady
        if template is not None and not self.restarted:
            # the template is expected next to the case directory
            template=path.join(path.dirname(self.casedir()),template)
            if path.abspath(basename)==path.abspath(template):
                error("The basename",basename,"and the template",template,"are the same directory")
            # CloneCase copies the template on construction
            CloneCase(
                args=cloneParameters+[template,self.casedir(),"--follow-symlinks"])
        # NOTE(review): the 'progress' parameter is accepted but never
        # used here - kept for interface compatibility
        self.solverProgress=solverProgress
        self.solverNoLog=solverNoLog

    def run(self,parameters):
        """Run the solver on the case
        @param parameters: a dictionary with parameters (unused here)"""
        self.foamRun(self.solver,
                     steady=self.steady,
                     multiRegion=False,
                     progress=self.solverProgress,
                     noLog=self.solverNoLog)