Package PyFoam :: Package Infrastructure :: Module ClusterJob
[hide private]
[frames] | [no frames]

Source Code for Module PyFoam.Infrastructure.ClusterJob

  1  #  ICE Revision: $Id: /local/openfoam/Python/PyFoam/PyFoam/Infrastructure/Logging.py 1906 2007-08-28T16:16:19.392553Z bgschaid  $  
  2  """Encapsulates all necessary things for a cluster-job, like setting up, running, restarting""" 
  3   
  4  import os 
  5  from os import path,unlink 
  6  from threading import Thread,Lock,Timer 
  7   
  8  from PyFoam.Applications.Decomposer import Decomposer 
  9  from PyFoam.Applications.Runner import Runner 
 10  from PyFoam.Applications.SteadyRunner import SteadyRunner 
 11  from PyFoam.Applications.CloneCase import CloneCase 
 12  from PyFoam.FoamInformation import changeFoamVersion 
 13  from PyFoam.Error import error,warning 
 14  from PyFoam import configuration as config 
 15   
def checkForMessageFromAbove(job):
    """Timer callback that polls the job's control files.

    Does nothing unless the job is currently listening. A stop-file
    ends the job (and breaks the polling chain); a checkpoint-file
    triggers writing a checkpoint. In all non-stopping cases the check
    is re-armed after one second on a fresh Timer stored in job.timer.
    @param job: the ClusterJob instance that is being watched"""
    if job.listenToTimer:
        if path.exists(job.stopFile()):
            job.stopJob()
        else:
            if path.exists(job.checkpointFile()):
                job.writeCheckpoint()
            nextCheck=Timer(1.,checkForMessageFromAbove,args=[job])
            job.timer=nextCheck
            nextCheck.start()
29 30
class ClusterJob:
    """Base class from which all cluster-jobs are to be derived.

    The actual jobs are implemented by overriding the hook methods
    (setup, postDecomposeSetup, run, preReconstructCleanup, cleanup
    and - for array-jobs - taskParameters); doIt() drives the whole
    life-cycle.

    There is a number of instance variables in this class (restarted,
    isDecomposed, nproc, taskID, ordinaryEnd, ...) that are used to
    'communicate' information between the various stages"""
    def __init__(self,basename,arrayJob=False,hardRestart=False,autoParallel=True,foamVersion=None,multiRegion=False):
        """Initializes the Job
        @param basename: Basis name of the job
        @param arrayJob: this job is a parameter variation. The tasks
        are identified by their task-id
        @param hardRestart: treat the job as restarted
        @param autoParallel: Parallelization is handled by the base-class
        @param foamVersion: The foam-Version that is to be used
        @param multiRegion: This job consists of multiple regions"""

        # print os.environ

        # refuse to run outside of an SGE batch environment
        if not os.environ.has_key("JOB_ID"):
            error("Not an SGE-job. Environment variable JOB_ID is missing")
        self.jobID=int(os.environ["JOB_ID"])
        self.jobName=os.environ["JOB_NAME"]

        # absolute path of the case (base name of the task-cases for array-jobs)
        self.basename=path.join(path.abspath(path.curdir),basename)

        # RESTARTED!=0 in the environment marks a queue-initiated restart
        sgeRestarted=False
        if os.environ.has_key("RESTARTED"):
            sgeRestarted=(int(os.environ["RESTARTED"])!=0)

        if sgeRestarted or hardRestart:
            self.restarted=True
        else:
            self.restarted=False

        # fall back to the configured default OpenFOAM-version
        if foamVersion==None:
            foamVersion=config().get("OpenFOAM","Version")

        changeFoamVersion(foamVersion)

        # sanity check: changeFoamVersion should have set up the environment
        if not os.environ.has_key("WM_PROJECT_VERSION"):
            error("No OpenFOAM-Version seems to be configured. Set the foamVersion-parameter")

        self.autoParallel=autoParallel
        self.multiRegion=multiRegion

        self.hostfile=None
        self.nproc=1

        # NSLOTS is the slot count of the parallel environment
        if os.environ.has_key("NSLOTS"):
            self.nproc=int(os.environ["NSLOTS"])
            if self.nproc>1:
                # self.hostfile=os.environ["PE_HOSTFILE"]
                # assumes a 'machines'-file was written to $TMP -- TODO confirm
                self.hostfile=path.join(os.environ["TMP"],"machines")

        self.ordinaryEnd=True        # set to False when stopJob() ends the run
        self.listenToTimer=False     # control-files are only honored while running

        self.taskID=None
        self.arrayJob=arrayJob

        if self.arrayJob:
            self.taskID=int(os.environ["SGE_TASK_ID"])

        ## prepend special paths for the cluster
        os.environ["PATH"]=config().get("ClusterJob","path")+":"+os.environ["PATH"]
        os.environ["LD_LIBRARY_PATH"]=config().get("ClusterJob","ldpath")+":"+os.environ["LD_LIBRARY_PATH"]

        # True only while the grid is decomposed (see doIt/foamRun)
        self.isDecomposed=False
101
102 - def message(self,*txt):
103 print "=== CLUSTERJOB: ", 104 for t in txt: 105 print t, 106 print " ==="
107
108 - def setState(self,txt):
109 self.message("Setting Job state to",txt) 110 fName=path.join(self.casedir(),"ClusterJobState") 111 f=open(fName,"w") 112 f.write(txt+"\n") 113 f.close()
114
115 - def jobFile(self):
116 """The file with the job information""" 117 jobfile="%s.%d" % (self.jobName,self.jobID) 118 if self.arrayJob: 119 jobfile+=".%d" % self.taskID 120 jobfile+=".pyFoam.clusterjob" 121 jobfile=path.join(path.dirname(self.basename),jobfile) 122 123 return jobfile
124
125 - def checkpointFile(self):
126 """The file that makes the job write a checkpoint""" 127 return self.jobFile()+".checkpoint"
128
129 - def stopFile(self):
130 """The file that makes the job write a checkpoint and end""" 131 return self.jobFile()+".stop"
132
    def doIt(self):
        """The central logic. Runs the job, sets it up etc

        Life-cycle: write the job-file, set up (and decompose) the case
        unless restarted, run the solver while listening for control
        files, then reconstruct and clean up on an ordinary end."""

        # announce the job by writing its job-file
        f=open(self.jobFile(),"w")
        f.write(path.basename(self.basename)+"\n")
        f.close()

        self.message()
        self.message("Running on directory",self.casename())
        self.message()
        self.setState("Starting up")

        parameters=None
        if self.arrayJob:
            parameters=self.taskParameters(self.taskID)
            self.message("Parameters:",parameters)
        if not self.restarted:
            # fresh job: full setup (and decomposition for parallel runs)
            self.setState("Setting up")
            self.setup(parameters)
            if self.autoParallel and self.nproc>1:
                self.setState("Decomposing")
                self.autoDecompose()

            self.isDecomposed=True

            self.setState("Setting up 2")
            self.postDecomposeSetup(parameters)
        else:
            # restarted job: the case is assumed to be set up and decomposed
            self.setState("Restarting")

            self.isDecomposed=True

        self.setState("Running")
        # start listening for stop/checkpoint-files written from 'above'
        self.listenToTimer=True
        self.timer=Timer(1.,checkForMessageFromAbove,args=[self])
        self.timer.start()

        self.run(parameters)
        self.listenToTimer=False

        if path.exists(self.jobFile()):
            unlink(self.jobFile())

        if self.ordinaryEnd:
            self.setState("Post Running")
            self.preReconstructCleanup(parameters)

            self.isDecomposed=False

            if self.autoParallel and self.nproc>1:
                self.setState("Reconstructing")
                self.autoReconstruct()

            self.setState("Cleaning")
            self.cleanup(parameters)
            self.setState("Finished")
        else:
            # stopJob() was used: leave the case as it is
            self.setState("Suspended")

        # remove left-over control-files
        if path.exists(self.stopFile()):
            unlink(self.stopFile())
        if path.exists(self.checkpointFile()):
            unlink(self.checkpointFile())
196
197 - def casedir(self):
198 """Returns the actual directory of the case 199 To be overridden if appropriate""" 200 if self.arrayJob: 201 return "%s.%05d" % (self.basename,self.taskID) 202 else: 203 return self.basename
204
205 - def casename(self):
206 """Returns just the name of the case""" 207 return path.basename(self.casedir())
208
209 - def foamRun(self,application,args=[],foamArgs=[],steady=False,multiRegion=None):
210 """Runs a foam utility on the case. 211 If it is a parallel job and the grid has 212 already been decomposed (and not yet reconstructed) it is run in 213 parallel 214 @param application: the Foam-Application that is to be run 215 @param foamArgs: A list if with the additional arguments for the 216 Foam-Application 217 @param args: A list with additional arguments for the Runner-object 218 @param steady: Use the steady-runner 219 @param multiRegion: Run this on multiple regions (if None: I don't have an opinion on this)""" 220 221 arglist=args[:] 222 if self.isDecomposed and self.nproc>1: 223 arglist+=["--procnr=%d" % self.nproc, 224 "--machinefile=%s" % self.hostfile] 225 226 if self.multiRegion: 227 if multiRegion==None or multiRegion==True: 228 arglist+=["--all-regions"] 229 elif multiRegion and not self.multiRegion: 230 warning("This is not a multi-region case, so trying to run stuff multi-region won't do any good") 231 232 if self.restarted: 233 arglist+=["--restart"] 234 235 arglist+=[application,".",self.casename()] 236 arglist+=foamArgs 237 238 self.message("Executing",arglist) 239 240 if steady: 241 self.message("Running Steady") 242 runner=SteadyRunner(args=arglist) 243 else: 244 runner=Runner(args=arglist)
245
246 - def autoDecompose(self):
247 """Automatically decomposes the grid with a metis-algorithm""" 248 249 args=["--method=metis", 250 "--clear", 251 self.casename(), 252 self.nproc] 253 254 if self.multiRegion: 255 args.append("--all-regions") 256 257 deco=Decomposer(args=args)
258
259 - def autoReconstruct(self):
260 """Default reconstruction of a parallel run""" 261 262 self.foamRun("reconstructPar", 263 args=["--logname=ReconstructPar"])
264
    def setup(self,parameters):
        """Set up the job. Called in the beginning if the
        job has not been restarted

        Usual tasks include grid conversion/setup, mesh decomposition etc

        To be overridden in subclasses; the default implementation does nothing

        @param parameters: a dictionary with parameters"""

        pass
274
    def postDecomposeSetup(self,parameters):
        """Additional setup, to be executed when the grid is already decomposed

        Usually for tasks that can be done on a decomposed grid

        To be overridden in subclasses; the default implementation does nothing

        @param parameters: a dictionary with parameters"""

        pass
283
    def run(self,parameters):
        """Run the actual job. Usually the solver.

        To be overridden in subclasses; the default implementation does nothing

        @param parameters: a dictionary with parameters"""

        pass
289
    def preReconstructCleanup(self,parameters):
        """Additional cleanup, to be executed when the grid is still decomposed

        Usually for tasks that can be done on a decomposed grid

        To be overridden in subclasses; the default implementation does nothing

        @param parameters: a dictionary with parameters"""

        pass
298
    def cleanup(self,parameters):
        """Clean up after a job

        To be overridden in subclasses; the default implementation does nothing

        @param parameters: a dictionary with parameters"""

        pass
304
305 - def taskParameters(self,id):
306 """Parameters for a specific task 307 @param id: the id of the task 308 @return: a dictionary with parameters for this task""" 309 310 error("taskParameter not implemented. Not a parameterized job") 311 312 return {}
313
314 - def writeCheckpoint(self):
315 if self.listenToTimer: 316 f=open(path.join(self.basename,"write"),"w") 317 f.write("Jetzt will ich's wissen") 318 f.close() 319 unlink(self.checkpointFile()) 320 else: 321 warning("I'm not listening to your callbacks") 322 323 self.timer=Timer(1.,checkForMessageFromAbove,args=[self])
324
325 - def stopJob(self):
326 if self.listenToTimer: 327 self.ordinaryEnd=False 328 f=open(path.join(self.basename,"stop"),"w") 329 f.write("Geh z'haus") 330 f.close() 331 unlink(self.stopFile()) 332 else: 333 warning("I'm not listening to your callbacks")
334
class SolverJob(ClusterJob):
    """A Cluster-Job that executes a solver. It implements the run-function.
    If a template-case is specified, the case is created by copying the
    template"""
339 - def __init__(self,basename,solver,template=None,cloneParameters=[],arrayJob=False,hardRestart=False,autoParallel=True,foamVersion=None,steady=False,multiRegion=False):
340 """@param template: Name of the template-case. It is assumed that 341 it resides in the same directory as the actual case 342 @param cloneParameters: a list with additional parameters for the 343 CloneCase-object that copies the template""" 344 345 ClusterJob.__init__(self,basename,arrayJob=arrayJob,hardRestart=hardRestart,autoParallel=autoParallel,foamVersion=foamVersion,multiRegion=multiRegion) 346 self.solver=solver 347 self.steady=steady 348 if template!=None and not self.restarted: 349 template=path.join(path.dirname(self.casedir()),template) 350 if path.abspath(basename)==path.abspath(template): 351 error("The basename",basename,"and the template",template,"are the same directory") 352 clone=CloneCase(args=cloneParameters+[template,self.casedir()])
353
    def run(self,parameters):
        """Run the configured solver on the case
        @param parameters: a dictionary with parameters (unused here)"""
        # NOTE(review): multiRegion=False suppresses the --all-regions flag
        # even for multi-region cases - confirm this is intended for solvers
        self.foamRun(self.solver,steady=self.steady,multiRegion=False)
356