
Source Code for Module PyFoam.Basics.SpreadsheetData

#  ICE Revision: $Id: $
"""
Data that can go into a spreadsheet (title line and rectangular data)
"""

try:
    import numpy
except ImportError:
    # assume this is pypy and retry
    import numpypy
    import numpy

import copy
import re

from PyFoam.Error import error,FatalErrorPyFoamException,warning

from PyFoam.ThirdParty.six import PY3
from PyFoam.ThirdParty.six import b as toByte

class WrongDataSize(FatalErrorPyFoamException):
    def __init__(self):
        FatalErrorPyFoamException.__init__(self,"Size of the arrays differs")

class SpreadsheetData(object):
    """
    Collects data that could go into a spreadsheet. The focus of this class is on
    storing all the data at once
    """
    def __init__(self,
                 timeName=None,
                 validData=None,
                 validMatchRegexp=False,
                 csvName=None,
                 txtName=None,
                 excelName=None,
                 data=None,
                 names=None,
                 title=None):
        """Either this is constructed from a file or from the data and the column headers

        @param timeName: the data column that is to be considered the time in this file
        @param validData: names of the valid data columns (all others should be discarded)
        @param validMatchRegexp: Should the validData be interpreted as regular expressions
        @param csvName: name of the CSV-file the data should be constructed from
        @param txtName: name of a file the data should be constructed from
        @param excelName: name of an Excel-file the data should be constructed from (uses the first sheet in the file)
        @param data: the actual data to use
        @param names: the names for the column header
        @param title: a name that is used to make header names unique"""

        self.title=title

        nrFileSpec=len([1 for i in [csvName,txtName,excelName] if not i is None])

        if (nrFileSpec>0) and not data is None:
            error("SpreadsheetData is either constructed from data or from a file")

        if data is None and nrFileSpec>1:
            error("Only one file specification allowed")

        if csvName:
            try:
                rec=numpy.recfromcsv(csvName)
                data=[tuple(float(x) for x in i) for i in rec]
                names=list(rec.dtype.names)
            except AttributeError:
                # for old numpy-versions
                data=list(map(tuple,numpy.loadtxt(csvName,
                                                  delimiter=',',
                                                  skiprows=1)))
                names=open(csvName).readline().strip().split(',')

            # redo this to make sure that everything is float
            self.data=numpy.array(data,dtype=list(zip(names,['f8']*len(names))))
        elif txtName:
            try:
                rec=numpy.recfromtxt(txtName,names=True)
                data=[tuple(float(x) for x in i) for i in rec]
                if names is None:
                    names=list(rec.dtype.names)
                else:
                    nr=len(list(rec.dtype.names))
                    if title is None:
                        off=len(names)-nr+1
                        self.title="_".join(names[:off])
                        names=names[:off]+["index"]+names[off:]
                    names=names[-nr:]

            except AttributeError:
                # for old numpy-versions
                data=list(map(tuple,numpy.loadtxt(txtName)))
                names=open(txtName).readline().strip().split()[1:]

            # redo this to make sure that everything is float
            self.data=numpy.array(data,dtype=list(zip(names,['f8']*len(names))))
        elif excelName:
            import pandas
            rec=pandas.read_excel(excelName).to_records()
            data=[tuple(float(x) for x in i) for i in rec]
            names=list(rec.dtype.names)

            self.data=numpy.array(data,dtype=list(zip(names,['f8']*len(names))))
        else:
            if data is not None and names is None:
                error("No names given for the data")

            self.data=numpy.array(list(map(tuple,data)),
                                  dtype=list(zip(names,['f8']*len(names))))

        if timeName:
            try:
                index=list(self.data.dtype.names).index(timeName)
            except ValueError:
                error("Time name",timeName,"not in",self.data.dtype.names)
        else:
            index=0
        self.time=self.data.dtype.names[index]

        if validData:
            usedData=[]
            usedNames=[]

            for n in self.data.dtype.names:
                if n==self.time or self.validName(n,validData,validMatchRegexp):
                    usedData.append(tuple(self.data[n]))
                    usedNames.append(n)

            usedData=numpy.array(usedData).transpose()
            self.data=numpy.array(list(map(tuple,usedData)),
                                  dtype=list(zip(usedNames,['f8']*len(usedNames))))
            index=list(self.data.dtype.names).index(self.time)

        if self.title!=None:
            self.data.dtype.names=[self.title+" "+x for x in self.data.dtype.names[0:index]]+[self.data.dtype.names[index]]+[self.title+" "+x for x in self.data.dtype.names[index+1:]]

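    # A minimal construction sketch (editor's illustration, not part of the
    # original source). A SpreadsheetData can be built directly from a list of
    # rows plus column names; the first column is taken as the time unless
    # timeName says otherwise. The names and values below are invented:
    #
    #   sd=SpreadsheetData(names=["t","p"],
    #                      data=[(0.0,1.0),(0.5,1.5),(1.0,2.0)])
    #   sd.time      -> "t"
    #   sd.names()   -> ("t","p")
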
    def validName(self,n,validData,validMatchRegexp=False):
        if n in validData:
            return True
        elif validMatchRegexp:
            for r in validData:
                exp=None
                try:
                    exp=re.compile(r)
                except:
                    pass
                if not exp is None:
                    if exp.search(n):
                        return True
        return False

    def names(self):
        return copy.copy(self.data.dtype.names)

    def size(self):
        return self.data.size

    def writeCSV(self,fName,
                 delimiter=","):
        """Write data to a CSV-file
        @param fName: Name of the file
        @param delimiter: Delimiter to be used in the CSV-file"""

        f=open(fName,"wb")
        if PY3:
            f.write(toByte(delimiter.join(self.names())+"\n"))
        else:
            f.write(delimiter.join(self.names())+"\n")

        numpy.savetxt(f,self.data,delimiter=delimiter)

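    # Usage sketch (illustration only): writing the example data from above to
    # disk; "out.csv" is an arbitrary file name.
    #
    #   sd.writeCSV("out.csv")                  # comma-separated, header line first
    #   sd.writeCSV("out.txt",delimiter=" ")    # space-separated variant
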
    def tRange(self,time=None):
        """Return the range of times
        @param time: name of the time. If None the first column is used"""
        if time==None:
            time=self.time
        t=self.data[time]

        return (t[0],t[-1])

    def join(self,other,time=None,prefix=None):
        """Join this object with another. Assumes that they have the same
        number of rows and that they have one column that designates the
        time, is named the same and has the same values
        @param other: the other array
        @param time: name of the time. If None the first column is used
        @param prefix: String that is added to the other names. If none is given then
        the title is used"""
        if time==None:
            time=self.time
        if prefix==None:
            prefix=other.title
        if prefix==None:
            prefix="other_"
        else:
            prefix+="_"

        t1=self.data[time]
        t2=other.data[time]
        if len(t1)!=len(t2):
            raise WrongDataSize()
        if max(abs(t1-t2))>1e-10:
            error("Times do not have the same values")

        names=[]
        data=[]
        for n in self.names():
            names.append(n)
            data.append(self.data[n])

        for n in other.names():
            if n!=time:
                if n in self.names():
                    names.append(prefix+n)
                else:
                    names.append(n)
                data.append(other.data[n])

        return SpreadsheetData(names=names,
                               data=numpy.array(data).transpose())

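    # Usage sketch (illustration only): joining two datasets that share the
    # same time values. Clashing column names from the other dataset get its
    # title (or "other" if it has none) as a prefix:
    #
    #   other=SpreadsheetData(names=["t","p"],
    #                         data=[(0.0,0.9),(0.5,1.4),(1.0,2.1)])
    #   joined=sd.join(other)        # columns: t, p, other_p
    #   joined=sd+other              # __add__ is a shorthand for join()
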
    def __add__(self,other):
        """Convenience function for joining data"""
        return self.join(other)

    def recalcData(self,name,expr,create=False):
        """Recalculate or add a column of the data
        @param name: the column (must exist if it is not created. Otherwise it must not exist)
        @param expr: the expression to calculate. All present column names are usable as variables.
        There is also a variable data for subscripting if the data is not a valid variable name. If
        the column is not created then there is also a variable this that is an alias for the name
        @param create: whether a new data item should be created"""
        if create and name in self.names():
            error("Item",name,"already exists in names",self.names())
        elif not create and not name in self.names():
            error("Item",name,"not in names",self.names())

        result=eval(expr,dict([(n,self.data[n]) for n in self.names()]+[("data",self.data)]+
                              ([("this",self.data[name] if not create else [])])))

        if not create:
            self.data[name]=result
        else:
            self.append(name,result)

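    # Usage sketch (illustration only): the expression is evaluated with every
    # column name bound as a variable, so a derived column can be added or an
    # existing one recalculated in place:
    #
    #   sd.recalcData("p2","p*p",create=True)   # new column from an existing one
    #   sd.recalcData("p","this*2")             # rescale the existing column
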
    def append(self,
               name,
               data,
               allowDuplicates=False):
        """Add another column to the data. Assumes that the number of rows is right
        @param name: the name of the column
        @param data: the actual data
        @param allowDuplicates: If the name already exists make it unique by appending _1, _2 ..."""

        arr = numpy.asarray(data)
        newname=name
        if newname in self.names() and allowDuplicates:
            cnt=1
            while newname in self.names():
                newname="%s_%d" % (name,cnt)
                cnt+=1
            warning("Changing name",name,"to",newname,"because it already exists in the data")
        newdtype = numpy.dtype(self.data.dtype.descr + [(newname, 'f8')])
        newrec = numpy.empty(self.data.shape, dtype=newdtype)
        for field in self.data.dtype.fields:
            newrec[field] = self.data[field]
        # use the possibly renamed column here (the original name would not be
        # part of the new dtype if a duplicate had to be renamed)
        newrec[newname] = arr

        self.data=newrec

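    # Usage sketch (illustration only): appending a column with one value per
    # existing row. With allowDuplicates=True a clashing name would be stored
    # as p_1, p_2, ...:
    #
    #   sd.append("T",[300.0,310.0,320.0])
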
    def __call__(self,
                 t,
                 name,
                 time=None,
                 invalidExtend=False,
                 noInterpolation=False):
        """'Evaluate' the data at a specific time by linear interpolation
        @param t: the time at which the data should be evaluated
        @param name: name of the data column to be evaluated. Assumes that that column
        is ordered in ascending order
        @param time: name of the time column. If none is given then the first column is assumed
        @param invalidExtend: if t is out of the valid range then use the smallest or the biggest value. If False use nan
        @param noInterpolation: if t doesn't exactly fit a data-point return 'nan'"""

        if time==None:
            time=self.time

        x=self.data[time]
        y=self.data[name]

        # get extremes
        if t<x[0]:
            if invalidExtend:
                return y[0]
            else:
                return float('nan')
        elif t>x[-1]:
            if invalidExtend:
                return y[-1]
            else:
                return float('nan')

        if noInterpolation:
            if t==x[0]:
                return y[0]
            elif t==x[-1]:
                return y[-1]

        iLow=0
        iHigh=len(x)-1

        while (iHigh-iLow)>1:
            # integer division so that the result is usable as an index
            # (plain '/' would give a float under Python 3)
            iNew = iLow + (iHigh-iLow)//2

            if x[iNew]==t:
                # we got lucky
                return y[iNew]
            elif t < x[iNew]:
                iHigh=iNew
            else:
                iLow=iNew
        if noInterpolation:
            return float('nan')
        else:
            return y[iLow] + (y[iHigh]-y[iLow])*(t-x[iLow])/(x[iHigh]-x[iLow])

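    # Usage sketch (illustration only): evaluating a column at an arbitrary
    # time. Inside the data range the value is linearly interpolated between
    # the two neighbouring rows; outside it 'nan' (or the boundary value if
    # invalidExtend=True) is returned:
    #
    #   sd(0.25,"p")                      -> 1.25
    #   sd(2.0,"p")                       -> nan
    #   sd(2.0,"p",invalidExtend=True)    -> value of the last row
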
    def addTimes(self,times,time=None,interpolate=False,invalidExtend=False):
        """Extend the data so that all new times are represented (add rows
        if they are not there)
        @param time: the name of the column with the time
        @param times: the times that should be there
        @param interpolate: interpolate the data in new rows. Otherwise
        insert 'nan'
        @param invalidExtend: if t is out of the valid range then use
        the smallest or the biggest value. If False use nan"""

        if time==None:
            time=self.time

        if len(times)==len(self.data[time]):
            same=True
            for i in range(len(times)):
                if times[i]!=self.data[time][i]:
                    same=False
                    break
            if same:
                # No difference between the times
                return

        newData=[]
        otherI=0
        originalI=0
        while otherI<len(times):
            goOn=originalI<len(self.data[time])
            while goOn and times[otherI]>self.data[time][originalI]:
                newData.append(self.data[originalI])
                originalI+=1
                goOn=originalI<len(self.data[time])

            append=True
            if originalI<len(self.data[time]):
                if times[otherI]==self.data[time][originalI]:
                    newData.append(self.data[originalI])
                    originalI+=1
                    otherI+=1
                    append=False

            if append:
                t=times[otherI]
                newRow=[]
                for n in self.names():
                    if n==time:
                        newRow.append(t)
                    elif interpolate:
                        newRow.append(self(t,n,time=time,invalidExtend=invalidExtend))
                    else:
                        newRow.append(float('nan'))
                newData.append(newRow)
                otherI+=1

        while originalI<len(self.data[time]):
            newData.append(self.data[originalI])
            originalI+=1

        self.data=numpy.array(list(map(tuple,newData)),dtype=self.data.dtype)

    def resample(self,
                 other,
                 name,
                 otherName=None,
                 time=None,
                 invalidExtend=False,
                 extendData=False,
                 noInterpolation=False):
        """Calculate values from another dataset at the same times as in this data-set
        @param other: the other data-set
        @param name: name of the data column to be evaluated. Assumes that that column
        is ordered in ascending order
        @param time: name of the time column. If none is given then the first column is assumed
        @param invalidExtend: see __call__
        @param extendData: if the time range of the other data is bigger than this range then extend this range before resampling
        @param noInterpolation: if t doesn't exactly fit a data-point return 'nan'"""
        if time==None:
            time=self.time

        if extendData and (
                self.data[time][0] > other.data[time][0] or \
                self.data[time][-1] < other.data[time][-1]):
            pre=[]
            i=0
            while other.data[time][i] < self.data[time][0]:
                data=[]
                for n in self.names():
                    if n==time:
                        data.append(other.data[time][i])
                    else:
                        data.append(float('nan'))
                pre.append(data)
                i+=1
                if i>=len(other.data[time]):
                    break
            if len(pre)>0:
                self.data=numpy.concatenate((numpy.array(list(map(tuple,pre)),
                                                         dtype=self.data.dtype),
                                             self.data))

            post=[]
            i=-1
            while other.data[time][i] > self.data[time][-1]:
                data=[]
                for n in self.names():
                    if n==time:
                        data.append(other.data[time][i])
                    else:
                        data.append(float('nan'))
                post.append(data)
                i-=1
                if abs(i)>=len(other.data[time])+1:
                    break

            post.reverse()
            if len(post)>0:
                self.data=numpy.concatenate((self.data,numpy.array(list(map(tuple,post)),
                                                                   dtype=self.data.dtype)))

        result=[]

        for t in self.data[time]:
            nm=name
            if otherName:
                nm=otherName
            result.append(other(t,nm,
                                time=time,
                                invalidExtend=invalidExtend,
                                noInterpolation=noInterpolation))

        return result

    def compare(self,
                other,
                name,
                otherName=None,
                time=None,
                common=False,
                minTime=None,
                maxTime=None):
        """Compare this data-set with another. The time-points of this dataset are used as
        a reference. Returns a dictionary with a number of norms: maximum absolute
        difference, average absolute difference
        on all timepoints, average absolute difference weighted by time
        @param other: the other data-set
        @param name: name of the data column to be evaluated. Assumes that that column
        is ordered in ascending order
        @param time: name of the time column. If none is given then the first column is assumed
        @param common: cut off the parts where not both data sets are defined
        @param minTime: first time which should be compared
        @param maxTime: last time to compare"""

        if time==None:
            time=self.time

        x=self.data[time]
        y=self.data[name]
        y2=self.resample(other,name,otherName=otherName,time=time,invalidExtend=True)

        minT,maxT=minTime,maxTime
        if common:
            minTmp,maxTmp=max(x[0],other.data[time][0]),min(x[-1],other.data[time][-1])
            for i in range(len(x)):
                if minTmp<=x[i]:
                    minT=x[i]
                    break
            for i in range(len(x)):
                val=x[-(i+1)]
                if maxTmp>=val:
                    maxT=val
                    break
        else:
            minT,maxT=x[0],x[-1]

        result = { "max" : None,
                   "maxPos" : None,
                   "average" : None,
                   "wAverage" : None,
                   "tMin": None,
                   "tMax": None }

        if minT==None or maxT==None:
            return result

        if minTime:
            if minTime>minT:
                minT=minTime

        if maxTime:
            if maxTime<maxT:
                maxT=maxTime

        if maxT<minT:
            return result

        maxDiff=0
        maxPos=x[0]
        sumDiff=0
        sumWeighted=0
        cnt=0

        for i,t in enumerate(x):
            if t<minT or t>maxT:
                continue
            cnt+=1

            val1=y[i]
            val2=y2[i]
            diff=abs(val1-val2)
            if diff>maxDiff:
                maxDiff=diff
                maxPos=x[i]
            sumDiff+=diff
            weight=0
            if t>minT:
                weight+=(t-x[i-1])/2
            if t<maxT:
                weight+=(x[i+1]-t)/2
            sumWeighted+=weight*diff

        return { "max" : maxDiff,
                 "maxPos" : maxPos,
                 "average" : sumDiff/cnt,
                 "wAverage" : sumWeighted/(maxT-minT),
                 "tMin": minT,
                 "tMax": maxT}

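    # Usage sketch (illustration only): comparing a column of this dataset with
    # the same column of another one (resampled onto this time base). The
    # result is a dictionary of norms:
    #
    #   sd.compare(other,"p")
    #   -> {"max": ..., "maxPos": ..., "average": ..., "wAverage": ...,
    #       "tMin": ..., "tMax": ...}
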
    def metrics(self,
                name,
                time=None,
                minTime=None,
                maxTime=None):
        """Calculates the metrics for a data set. Returns a dictionary
        with a number of norms: minimum, maximum, average, average weighted by time
        @param name: name of the data column to be evaluated. Assumes that that column
        is ordered in ascending order
        @param time: name of the time column. If none is given then the first column is assumed
        @param minTime: first time to take metrics from
        @param maxTime: latest time to take metrics from"""

        if time==None:
            time=self.time

        x=self.data[time]
        y=self.data[name]

        minVal=1e40
        maxVal=-1e40
        sum=0
        sumWeighted=0

        minT,maxT=x[0],x[-1]

        if minTime:
            if minTime>minT:
                minT=minTime

        if maxTime:
            if maxTime<maxT:
                maxT=maxTime

        cnt=0

        for i,t in enumerate(x):
            if t<minT or t>maxT:
                continue
            cnt+=1
            val=y[i]
            maxVal=max(val,maxVal)
            minVal=min(val,minVal)
            sum+=val
            weight=0
            if i>0:
                weight+=(t-x[i-1])/2
            if i<(len(x)-1):
                weight+=(x[i+1]-t)/2
            sumWeighted+=weight*val

        return { "max" : maxVal,
                 "min" : minVal,
                 "average" : sum/max(cnt,1),
                 "wAverage" : sumWeighted/(maxT-minT),
                 "tMin": x[0],
                 "tMax": x[-1]}

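    # Usage sketch (illustration only): metrics of a single column over an
    # optional time window:
    #
    #   sd.metrics("p")                   -> {"min": ..., "max": ..., ...}
    #   sd.metrics("p",minTime=0.5)       -> restricted to t >= 0.5
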
    def getData(self,reindex=True):
        """Return the data in the DataFrame format of pandas
        @param reindex: drop duplicate times (setting it to False might break certain Pandas-operations)"""
        try:
            from PyFoam.Wrappers.Pandas import PyFoamDataFrame
        except ImportError:
            warning("No pandas-library installed. Returning None")
            return None

        return PyFoamDataFrame(self.getSeries(reindex=reindex))

    def getSeries(self,reindex=True):
        """Return a dictionary of the data-columns in the Series format of pandas
        @param reindex: drop duplicate times (setting it to False might break certain Pandas-operations)"""
        try:
            import pandas
        except ImportError:
            warning("No pandas-library installed. Returning None")
            return None
        data={}

        if reindex:
            realindex=numpy.unique(self.data[self.time])

        for n in self.names():
            if n!=self.time:
                data[n]=pandas.Series(self.data[n],
                                      index=self.data[self.time],
                                      name=n)
                if reindex:
                    if len(data[n])!=len(realindex):
                        data[n].axes[0].is_unique=True
                        data[n]=data[n].reindex_axis(realindex)

        return data

# Should work with Python3 and Python2
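
# Illustrative usage sketch added for this listing (not part of the original
# module). Column names and values are invented; it only runs when the file is
# executed directly:
if __name__ == "__main__":
    a=SpreadsheetData(names=["t","p"],
                      data=[(0.0,1.0),(0.5,1.5),(1.0,2.0)])
    b=SpreadsheetData(names=["t","p"],
                      data=[(0.0,0.9),(1.0,2.1)])
    print(a(0.25,"p"))          # linear interpolation -> 1.25
    print(a.compare(b,"p"))     # dict with 'max', 'average', 'wAverage', ...
    print(a.getSeries())        # pandas Series per column, or None without pandas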