Package PyFoam :: Package Infrastructure :: Module ClusterJob
[hide private]
[frames] | [no frames]

Source Code for Module PyFoam.Infrastructure.ClusterJob

  1  #  ICE Revision: $Id: /local/openfoam/Python/PyFoam/PyFoam/Infrastructure/Logging.py 1906 2007-08-28T16:16:19.392553Z bgschaid  $  
  2  """Encapsulates all necessary things for a cluster-job, like setting up, running, restarting""" 
  3   
  4  import os 
  5  from os import path,unlink 
  6  from threading import Thread,Lock,Timer 
  7   
  8  from PyFoam.Applications.Decomposer import Decomposer 
  9  from PyFoam.Applications.Runner import Runner 
 10  from PyFoam.Applications.SteadyRunner import SteadyRunner 
 11  from PyFoam.Applications.CloneCase import CloneCase 
 12  from PyFoam.FoamInformation import changeFoamVersion 
 13  from PyFoam.Error import error,warning 
 14  from PyFoam import configuration as config 
 15  from PyFoam.FoamInformation import oldAppConvention as oldApp 
 16   
def checkForMessageFromAbove(job):
    # Timer callback: polls for control files through which an outside
    # process can ask the running cluster-job to checkpoint or to stop.
    # Reschedules itself every second for as long as the job listens.
    if not job.listenToTimer:
        # the job is not (or no longer) interested in callbacks: do not re-arm
        return

    if path.exists(job.stopFile()):
        # a stop-marker was placed: end the job; stopJob removes the marker
        job.stopJob()
        return

    if path.exists(job.checkpointFile()):
        # a checkpoint-marker was placed: write checkpoint data, keep running
        job.writeCheckpoint()

    # re-arm the one-shot timer so the marker files are polled again in 1s
    job.timer=Timer(1.,checkForMessageFromAbove,args=[job])
    job.timer.start()
class ClusterJob:
    """ All Cluster-jobs are to be derived from this base-class

    The actual jobs are implemented by overriding methods

    There is a number of variables in this class that are used to
    'communicate' information between the various stages"""

    def __init__(self,basename,arrayJob=False,hardRestart=False,autoParallel=True,foamVersion=None,multiRegion=False):
        """Initializes the Job
        @param basename: Basis name of the job
        @param arrayJob: this job is a parameter variation. The tasks
        are identified by their task-id
        @param hardRestart: treat the job as restarted
        @param autoParallel: Parallelization is handled by the base-class
        @param foamVersion: The foam-Version that is to be used
        @param multiRegion: This job consists of multiple regions"""

        # JOB_ID is set by SGE (Sun Grid Engine) for every job it starts
        if "JOB_ID" not in os.environ:
            error("Not an SGE-job. Environment variable JOB_ID is missing")
        self.jobID=int(os.environ["JOB_ID"])
        self.jobName=os.environ["JOB_NAME"]

        self.basename=path.join(path.abspath(path.curdir),basename)

        # SGE sets RESTARTED to a non-zero value if the job was requeued
        sgeRestarted=False
        if "RESTARTED" in os.environ:
            sgeRestarted=(int(os.environ["RESTARTED"])!=0)

        # a job counts as restarted if either SGE says so or the caller does
        self.restarted=bool(sgeRestarted or hardRestart)

        if foamVersion is None:
            foamVersion=config().get("OpenFOAM","Version")

        changeFoamVersion(foamVersion)

        # changeFoamVersion should have set up the OpenFOAM environment
        if "WM_PROJECT_VERSION" not in os.environ:
            error("No OpenFOAM-Version seems to be configured. Set the foamVersion-parameter")

        self.autoParallel=autoParallel
        self.multiRegion=multiRegion

        self.hostfile=None
        self.nproc=1

        # NSLOTS is the number of slots SGE granted to this job
        if "NSLOTS" in os.environ:
            self.nproc=int(os.environ["NSLOTS"])
            if self.nproc>1:
                # SGE writes the machine-file for parallel environments to $TMP
                self.hostfile=path.join(os.environ["TMP"],"machines")

        self.ordinaryEnd=True
        self.listenToTimer=False

        self.taskID=None
        self.arrayJob=arrayJob

        if self.arrayJob:
            # SGE_TASK_ID identifies this task of an array job
            self.taskID=int(os.environ["SGE_TASK_ID"])

        ## prepend special paths for the cluster
        os.environ["PATH"]=config().get("ClusterJob","path")+":"+os.environ["PATH"]
        os.environ["LD_LIBRARY_PATH"]=config().get("ClusterJob","ldpath")+":"+os.environ["LD_LIBRARY_PATH"]

        self.isDecomposed=False
102
    def message(self,*txt):
        """Writes a status message to stdout, framed by '=== ... ===' markers.
        @param txt: the parts of the message; printed space-separated"""
        print "=== CLUSTERJOB: ",
        for t in txt:
            print t,
        print " ==="
108
109 - def setState(self,txt):
110 self.message("Setting Job state to",txt) 111 fName=path.join(self.casedir(),"ClusterJobState") 112 f=open(fName,"w") 113 f.write(txt+"\n") 114 f.close()
115
116 - def jobFile(self):
117 """The file with the job information""" 118 jobfile="%s.%d" % (self.jobName,self.jobID) 119 if self.arrayJob: 120 jobfile+=".%d" % self.taskID 121 jobfile+=".pyFoam.clusterjob" 122 jobfile=path.join(path.dirname(self.basename),jobfile) 123 124 return jobfile
125
126 - def checkpointFile(self):
127 """The file that makes the job write a checkpoint""" 128 return self.jobFile()+".checkpoint"
129
130 - def stopFile(self):
131 """The file that makes the job write a checkpoint and end""" 132 return self.jobFile()+".stop"
133
    def doIt(self):
        """The central logic. Runs the job, sets it up etc

        Drives the whole job lifecycle: write the job-file, set up the
        case (unless restarted), decompose, run, reconstruct and clean
        up, updating the externally visible state at every stage"""

        # announce this job via its job-file (removed again after the run)
        f=open(self.jobFile(),"w")
        f.write(path.basename(self.basename)+"\n")
        f.close()

        self.message()
        self.message("Running on directory",self.casename())
        self.message()
        self.setState("Starting up")

        parameters=None
        if self.arrayJob:
            # fetch the task-specific parameters of this array-job task
            parameters=self.taskParameters(self.taskID)
            self.message("Parameters:",parameters)
        if not self.restarted:
            self.setState("Setting up")
            self.setup(parameters)
            if self.autoParallel and self.nproc>1:
                self.setState("Decomposing")
                self.autoDecompose()

            self.isDecomposed=True

            self.setState("Setting up 2")
            self.postDecomposeSetup(parameters)
        else:
            # a restarted job is assumed to be already set up and decomposed
            self.setState("Restarting")

        # set unconditionally so that both paths agree on the flag
        self.isDecomposed=True

        self.setState("Running")
        # start listening for stop/checkpoint requests from outside
        self.listenToTimer=True
        self.timer=Timer(1.,checkForMessageFromAbove,args=[self])
        self.timer.start()

        self.run(parameters)
        self.listenToTimer=False

        if path.exists(self.jobFile()):
            unlink(self.jobFile())

        if self.ordinaryEnd:
            # the run finished normally (was not stopped from outside)
            self.setState("Post Running")
            self.preReconstructCleanup(parameters)

            self.isDecomposed=False

            if self.autoParallel and self.nproc>1:
                self.setState("Reconstructing")
                self.autoReconstruct()

            # NOTE(review): nproc is always >=1 so this always runs;
            # possibly '>1' was intended - confirm before changing
            if self.nproc>0:
                self.additionalReconstruct(parameters)

            self.setState("Cleaning")
            self.cleanup(parameters)
            self.setState("Finished")
        else:
            self.setState("Suspended")

        # remove any leftover control files
        if path.exists(self.stopFile()):
            unlink(self.stopFile())
        if path.exists(self.checkpointFile()):
            unlink(self.checkpointFile())
200
201 - def casedir(self):
202 """Returns the actual directory of the case 203 To be overridden if appropriate""" 204 if self.arrayJob: 205 return "%s.%05d" % (self.basename,self.taskID) 206 else: 207 return self.basename
208
209 - def casename(self):
210 """Returns just the name of the case""" 211 return path.basename(self.casedir())
212
213 - def foamRun(self,application,args=[],foamArgs=[],steady=False,multiRegion=None):
214 """Runs a foam utility on the case. 215 If it is a parallel job and the grid has 216 already been decomposed (and not yet reconstructed) it is run in 217 parallel 218 @param application: the Foam-Application that is to be run 219 @param foamArgs: A list if with the additional arguments for the 220 Foam-Application 221 @param args: A list with additional arguments for the Runner-object 222 @param steady: Use the steady-runner 223 @param multiRegion: Run this on multiple regions (if None: I don't have an opinion on this)""" 224 225 arglist=args[:] 226 if self.isDecomposed and self.nproc>1: 227 arglist+=["--procnr=%d" % self.nproc, 228 "--machinefile=%s" % self.hostfile] 229 230 if self.multiRegion: 231 if multiRegion==None or multiRegion==True: 232 arglist+=["--all-regions"] 233 elif multiRegion and not self.multiRegion: 234 warning("This is not a multi-region case, so trying to run stuff multi-region won't do any good") 235 236 if self.restarted: 237 arglist+=["--restart"] 238 239 arglist+=[application] 240 if oldApp(): 241 arglist+=[".",self.casename()] 242 else: 243 arglist+=["-case",self.casename()] 244 245 arglist+=foamArgs 246 247 self.message("Executing",arglist) 248 249 if steady: 250 self.message("Running Steady") 251 runner=SteadyRunner(args=arglist) 252 else: 253 runner=Runner(args=arglist)
254
255 - def autoDecompose(self):
256 """Automatically decomposes the grid with a metis-algorithm""" 257 258 if path.isdir(path.join(self.casedir(),"processor0")): 259 warning("A processor directory already exists. There might be a problem") 260 args=["--method=metis", 261 "--clear", 262 self.casename(), 263 self.nproc] 264 265 if self.multiRegion: 266 args.append("--all-regions") 267 268 deco=Decomposer(args=args)
269
270 - def autoReconstruct(self):
271 """Default reconstruction of a parallel run""" 272 273 self.foamRun("reconstructPar", 274 args=["--logname=ReconstructPar"])
275
276 - def setup(self,parameters):
277 """Set up the job. Called in the beginning if the 278 job has not been restarted 279 280 Usual tasks include grid conversion/setup, mesh decomposition etc 281 282 @param parameters: a dictionary with parameters""" 283 284 pass
285
286 - def postDecomposeSetup(self,parameters):
287 """Additional setup, to be executed when the grid is already decomposed 288 289 Usually for tasks that can be done on a decomposed grid 290 291 @param parameters: a dictionary with parameters""" 292 293 pass
294
295 - def run(self,parameters):
296 """Run the actual job. Usually the solver. 297 @param parameters: a dictionary with parameters""" 298 299 pass
300
301 - def preReconstructCleanup(self,parameters):
302 """Additional cleanup, to be executed when the grid is still decomposed 303 304 Usually for tasks that can be done on a decomposed grid 305 306 @param parameters: a dictionary with parameters""" 307 308 pass
309
310 - def cleanup(self,parameters):
311 """Clean up after a job 312 @param parameters: a dictionary with parameters""" 313 314 pass
315
316 - def additionalReconstruct(self,parameters):
317 """Additional reconstruction of parallel runs (Stuff that the 318 OpenFOAM-reconstructPar doesn't do 319 @param parameters: a dictionary with parameters""" 320 321 pass
322
    def taskParameters(self,id):
        """Parameters for a specific task
        @param id: the id of the task
        @return: a dictionary with parameters for this task"""

        # NOTE(review): 'id' shadows the builtin; kept because it is part
        # of the public signature that subclasses override
        # reaching this means the subclass did not override the hook
        error("taskParameter not implemented. Not a parameterized job")

        return {}
331
332 - def writeCheckpoint(self):
333 if self.listenToTimer: 334 f=open(path.join(self.basename,"write"),"w") 335 f.write("Jetzt will ich's wissen") 336 f.close() 337 unlink(self.checkpointFile()) 338 else: 339 warning("I'm not listening to your callbacks") 340 341 self.timer=Timer(1.,checkForMessageFromAbove,args=[self])
342
343 - def stopJob(self):
344 if self.listenToTimer: 345 self.ordinaryEnd=False 346 f=open(path.join(self.basename,"stop"),"w") 347 f.write("Geh z'haus") 348 f.close() 349 unlink(self.stopFile()) 350 else: 351 warning("I'm not listening to your callbacks")
352
class SolverJob(ClusterJob):
    """A Cluster-Job that executes a solver. It implements the run-function.
    If a template-case is specified, the case is copied"""

    def __init__(self,basename,solver,template=None,cloneParameters=None,arrayJob=False,hardRestart=False,autoParallel=True,foamVersion=None,steady=False,multiRegion=False):
        """@param solver: the name of the solver application to run
        @param template: Name of the template-case. It is assumed that
        it resides in the same directory as the actual case
        @param cloneParameters: a list with additional parameters for the
        CloneCase-object that copies the template"""

        ClusterJob.__init__(self,basename,arrayJob=arrayJob,hardRestart=hardRestart,autoParallel=autoParallel,foamVersion=foamVersion,multiRegion=multiRegion)
        self.solver=solver
        self.steady=steady
        # None instead of a mutable list default
        if cloneParameters is None:
            cloneParameters=[]
        if template is not None and not self.restarted:
            # the template is expected next to the actual case
            template=path.join(path.dirname(self.casedir()),template)
            if path.abspath(basename)==path.abspath(template):
                error("The basename",basename,"and the template",template,"are the same directory")
            clone=CloneCase(args=cloneParameters+[template,self.casedir()])
371
372 - def run(self,parameters):
373 self.foamRun(self.solver,steady=self.steady,multiRegion=False)
374