Package pylal :: Module ligolw_dataUtils
[hide private]
[frames] | [no frames]

Source Code for Module pylal.ligolw_dataUtils

  1  # 
  2  # ============================================================================= 
  3  # 
  4  #                                   Preamble 
  5  # 
  6  # ============================================================================= 
  7  # 
  8   
  9  ''' 
 10  A collection of utilities to assist in storing and sorting data queried from a database 
 11  or xml document. 
 12  ''' 
 13   
import sys, re, math
import time, datetime
import bisect
 16   
 17  from glue.ligolw.utils import print_tables 
 18  from glue.ligolw import ligolw 
 19  from glue.ligolw import table 
 20  from glue.ligolw import lsctables 
 21  from glue import git_version 
 22   
 23  from pylal.xlal.date import XLALGPSToUTC 
 24  try: 
 25      from pylal.xlal.datatypes.ligotimegps import LIGOTimeGPS 
 26  except ImportError: 
 27      # s6 code 
 28      from pylal.xlal.date import LIGOTimeGPS 
 29  from pylal import tools 
 30  from pylal import ligolw_sqlutils as sqlutils 
 31   
 32   
# Module metadata: maintainer contact and the version identifier supplied by
# glue.git_version at import time.
__author__ = "Collin Capano <collin.capano@ligo.org>"
__version__ = git_version.id
# =============================================================================
#
#                                  Utilities
#
# =============================================================================

#
#
#   Tools for manipulating statistics
#
#

def get_row_stat(row, arg):
    """
    Evaluate the desired operation on columns from a row in a table.

    The desired operation can be either a pre-defined function (if it exists
    in the row's namespace) or a python expression written in terms of the
    elements in the row's namespace. The namespace available to the
    expression is limited to the columns of the row plus the functions in
    the math module.

    @row: the row object whose columns/methods are to be evaluated
    @arg: the name of a column or zero-argument method of row, or a python
        expression in terms of the row's columns
    """
    # speed up: if arg is a plain attribute (or zero-argument method) of the
    # row, just use it directly rather than going through eval
    try:
        value = getattr(row, arg)
        try:
            # maybe it is a zero-argument method
            return value()
        except TypeError:
            # not callable without arguments; treat it as a plain value
            return value
    except AttributeError:
        # arg is not a simple attribute of row, so evaluate it as an
        # expression; restrict the namespace to the row's attributes and the
        # math module (note: eval of an untrusted string is still unsafe)
        try:
            namespace = dict(row.__dict__)
        except AttributeError:
            # __slots__-based rows have no __dict__; collect attributes
            namespace = dict((name, getattr(row, name)) for name in dir(row))
        namespace.update(math.__dict__)
        return eval(arg, {"__builtins__": None}, namespace)
76
def get_needed_columns(column_list, function):
    """
    Returns a list of columns from the given column list that are needed by
    the given function string. This can be used to reduce the number of
    columns that are passed to get_row_stat.

    Parameters
    ----------
    column_list: list
        A list of strings given the possible columns to pull out.
    function: str
        A string specifying the match criteria. Can either be:
         * "endTime" or "startTime": in this case, all columns in
           column_list with "end_time", "start_time", or "ifo" in their name
           will be retrieved.
         * "eThinca": in this case, all columns with mass1, mass2, mchirp,
           eta, tau[0-9], time, or [Gg]amma[0-9] in their name will be
           retrieved.
         * a python string that is a function of one or more of the columns
           in column_list.

    Returns
    -------
    needed_columns: list
        The subset of columns needed by the function.
    """
    if function == 'eThinca':
        # eThinca needs the mass/time parameters and the metric Gamma
        # components
        pattern = re.compile(
            'ifo|mass1|mass2|mchirp|eta|time|tau[0-9]|[Gg]amma[0-9]')
        return [col for col in column_list if pattern.search(col)]
    if function in ('endTime', 'startTime'):
        # grab every end/start time column: this picks up both the plain
        # end_time of a sngl_inspiral row and the {site}_end_time columns of
        # a sim_inspiral row; ditto for start times
        pattern = re.compile('ifo|end_time|start_time')
        return [col for col in column_list if pattern.search(col)]
    # otherwise, keep every column whose name appears somewhere in the
    # function string
    return [col for col in column_list
        if re.search('(%s)' % col, function) is not None]
119
def createDataRowClass( classTableName, baseClass = None, columns = None ):
    """
    Creates a DataRow class. If classTableName is the same as a table in
    lsctables, and baseClass is not specified, the DataRow class will inherit
    from that table's RowType. If baseClass is specified, the DataRow class
    that is created will inherit from that base class. If baseClass is not
    specified and classTableName is not in lsctables, then DataRow will just
    inherit from object. Regardless of inheritance, the DataRow will have a
    __slots__ attribute. Any columns that are specified in columns will be
    added to the __slots__ class. The DataRow will also have a tableName
    attribute. This will be whatever classTableName is, regardless of what
    baseClass is set to. Think of the DataRow class as a more arbitrary
    version of an lsctable row.

    @classTableName: a string specifying the DataRow's tableName. If
    baseClass is not specified, and tableName is the same as a table in
    lsctables, the DataRow will inherit from that table's RowType. Example:
    'sngl_inspiral'

    @baseClass: specify what class the DataRow should inherit from. Example:
    lsctables.SnglInspiral

    @columns: a list of strings specifying columns to add to the DataRow's
    __slots__ attribute. All columns in __slots__ will also be an attribute
    of the DataRow class. Only columns not in the base class's __slots__
    attribute will be added; the order in which the columns are given is
    preserved.

    Note that this function returns a class, not an instance of a class.
    """
    # avoid a mutable default argument
    if columns is None:
        columns = []
    # determine the base class
    if baseClass is not None:
        base = baseClass
    elif classTableName in lsctables.TableByName:
        base = lsctables.TableByName[ classTableName ].RowType
    else:
        base = object

    # build the slot list before defining the class; iterate columns in the
    # given order (iterating a set difference here would make the slot order
    # non-deterministic), skipping duplicates and columns already in the base
    if '__slots__' in dir( base ):
        seen = set(base.__slots__)
        extra = []
        for col in columns:
            if col not in seen:
                seen.add(col)
                extra.append(col)
        slots = base.__slots__ + extra
    else:
        slots = columns

    # define the class
    class DataRow( base ):

        tableName = classTableName
        __slots__ = slots

        def __init__(self):
            """
            If all slots are not populated, we will get an AttributeError
            when using get_value. To avoid this, initialize all slots as
            None.
            """
            for column in self.__slots__:
                setattr(self, column, None)

        def store(self, dataTuple):
            """
            Takes a list of tuples of (column_name, data) and assigns the
            values to the object's variables. The column_name must be in
            self.__slots__.
            @dataTuple: a list of tuples in which the first element is the
            column name and the second is the value to assign.
            """
            for col, val in dataTuple:
                setattr( self, col, val )

        def get_value(self, arg):
            """
            Returns the result of some operation on the elements in self.
            @arg: can be the name of any defined function in self's base
            class, a slot in self, or a function of either or both. See
            get_row_stat for more info.

            Example:
            >>> from pylal.ligolw_dataUtils import createDataRowClass
            >>> SnglInspRow = createDataRowClass('sngl_inspiral')
            >>> test = SnglInspRow()
            >>> test.store([('snr', 6.), ('chisq', 32.), ('chisq_dof', 16.)])
            >>> test.get_value('snr**2.')
            36.0
            >>> test.get_value('get_new_snr')
            5.8993671171391338
            >>> test.get_value('log(get_new_snr())')
            1.7748450768765174
            """
            return get_row_stat( self, arg )

    return DataRow
208 209 -def combineRowStats( function, rows ):
210 """ 211 Performs the desired function on the list of single statistics. Note: this 212 can only combine one statistic from each row. 213 214 @function: can be either a known pre-set (see below) or an arbitrary 215 function. If an arbitrary function, it must be in terms of the ifo names. 216 217 @rows: a dictionary of statistics keyed by the ifos 218 """ 219 # check if the function is a known pre-sets 220 if function == 'sum': 221 return sum(rows.values()) 222 if function == 'quad_sum': 223 return math.sqrt(sum([x**2. for x in rows.values()])) 224 if function == 'min': 225 return min(rows.values()) 226 if function == 'max': 227 return max(rows.values()) 228 if function == 'mean': 229 return numpy.mean(numpy.array(rows.values())) 230 if function == 'median': 231 return numpy.median(numpy.array(rows.values())) 232 if function == 'alpha_min': 233 return rows[min(rows.keys())] 234 if function == 'sorted_keys': 235 return ','.join(sorted(rows.keys())) 236 if function == 'sorted_values': 237 return ';'.join(sorted(map( str, rows.values() ))) 238 if function == 'echo': 239 return rows 240 241 # otherwise, evaulate the function explicitly 242 safe_dict = dict([ [name,val] for name,val in rows.items() + math.__dict__.items() if not name.startswith('__') ]) 243 244 try: 245 return eval( function, {"__builtins__":None}, safe_dict ) 246 except NameError: 247 # this can happen if an ifo that's specified in the combining function is not in the coincident ifos; in this case, just return None 248 return None
249
def createCombineRowsMethod( tableName, columns, functionList ):
    """
    Creates a CombineRows class that can be used in a sqlite database to
    combine rows on the fly. Takes in a sngl_function, which is the function
    used to combine columns within a single row, and a combining_function,
    which is the function used to combine the results of the sngl_functions
    across rows.

    @tableName: the name of the table that will be reading from. If it is a
    table in lsctables.py, all methods and columns from that table will be
    inherited.
    @columns: the list of columns that will be storing data to. This list
    must be in the same order that will be reading data in from the database
    with.
    @functionList: a list of tuples. The first item should be the combining
    function to use, in terms of the ifos to combine, and the second item
    should be the sngl function to use, in terms of columns or methods of
    the sngl_row.
    """

    # columns must be passed by keyword: the second positional argument of
    # createDataRowClass is the base class, not the column list
    sngl_row = createDataRowClass( tableName, columns = columns )

    class CombineRows:
        def __init__(self):
            """
            Initializes variables needed for the step process.
            """
            # maps (combine_func, sngl_func) -> {ifo: single-ifo statistic}
            self.this_coinc = dict([ [x, {}] for x in functionList ])

        def step(self, *args):
            """
            Populates self.this_coinc with this row's single-ifo statistics.
            """
            # DataRow.__init__ takes no arguments; the column names are
            # already baked into the class
            this_row = sngl_row()
            this_row.store(zip(columns, args))
            for combine_func, sngl_function in functionList:
                self.this_coinc[(combine_func, sngl_function)][this_row.ifo] = this_row.get_value(sngl_function)

        def finalize(self):
            """
            Once all the singles for the coinc have been gathered, applies
            the desired combining function(s) to them and returns the result.
            Results are returned as a comma separated string.
            """
            return ','.join([str(combineRowStats( cfunc, self.this_coinc[(cfunc, sfunc)] )) for cfunc, sfunc in functionList])

    return CombineRows
#
#
#   Utilities for storing data
#
#
class dbDataRow:
    """
    A class to assist in loading data from and performing functions on
    tables in a SQLite database.
    """
    def __init__(self, connection, tableName, baseClass = None ):
        """
        @connection: an open connection to a SQLite database
        @tableName: the name of the table that rows will be read from
        @baseClass: optional base class passed on to createDataRowClass
        """
        self.connection = connection
        # save the table name; it was previously read back via
        # self.tableName before ever being set, raising an AttributeError
        self.tableName = tableName
        self.columns = sqlutils.get_column_names_from_table( self.connection, tableName )
        self.rowClass = createDataRowClass( self.tableName, baseClass, self.columns )
        self._function = None

    def set_function( self, function ):
        # the column name or expression that get_db_value will evaluate
        self._function = function

    def store( self, *rowData):
        """
        Returns a row instance populated with rowData; rowData must be
        given in the same order as self.columns.
        """
        thisRow = self.rowClass()
        thisRow.store( zip( self.columns, rowData ) )
        return thisRow

    def get_db_value( self, *rowData ):
        # unpack: store takes the values as separate arguments; passing the
        # tuple itself would mis-pair columns with values
        thisRow = self.store( *rowData )
        return thisRow.get_value( self._function )

    def create_db_func( self, function, funcName ):
        """
        Registers get_db_value in the database as SQL function
        funcName(col1, ..., colN), evaluating the given function on each row.
        """
        self.set_function( function )
        self.connection.create_function( funcName, len(self.columns), self.get_db_value )
322
class CompareDataRows:
    """
    A class that can perform various types of comparison tests between
    arbitrary DataRow classes. The class has the following attributes:
     *classA: A DataRow class. Note: this is a class, not an instance
      of a class.
     *classB: A DataRow class. This can be the same type of class as
      classA, or different. Like classA, this is a class, not an instance
      of that class.
     *matchCriteriaA: What column, or function of columns, to get from
      classA when doing a comparison between an instance of classA and
      an instance of classB.
     *matchCriteriaB: What column, or function of columns, to get from
      classB when doing a comparison between an instance of classA and
      an instance of classB.
     *_diffFunc: What function to perform to differentiate classA from
      classB. This function should be one of the functions below; it takes
      data to populate an instance of classA and an instance of classB, and
      returns a numerical value >= 0 representing the difference between
      these instances of classA and classB. This value can then be compared
      to the window size to determine if A and B are the same or not.
     *window: How large of a window to use to consider an instance of
      classA equal to an instance of classB.

    Example:
    >>> classA = createDataRowClass( 'sngl_inspiral' )
    >>> classB = createDataRowClass( 'sngl_inspiral' )
    >>> compF = CompareDataRows( classA, classB )
    >>> compF.set_diffFunc( compF.diffRowARowB )
    >>> compF.set_window( 0.1 )
    >>> compF.set_matchCriteriaA('mass1/mass2')
    >>> compF.set_matchCriteriaB('mass1/mass2')
    >>> dataA = [('mass1', '10.0'), ('mass2', '5.0')]
    >>> dataB = [('mass1', '10.1'), ('mass2', '5.0')]
    >>> compF.compare( dataA, dataB )
    True
    >>> compF.set_window(0)
    >>> compF.compare( dataA, dataB )
    False
    """
    def __init__(self, RowClassA = None, RowClassB = None):
        # the DataRow classes to compare (classes, not instances)
        self.classA = RowClassA
        self.classB = RowClassB
        # match criteria and the columns they need are kept in sync by
        # the set_matchCriteria(A|B) methods
        self._matchCriteriaA = None
        self._matchCriteriaB = None
        self._neededColumnsA = None
        self._neededColumnsB = None
        # the difference function and window used by compare()
        self.diffFunc = None
        self.window = None

    def set_classA(self, DataRowClass):
        self.classA = DataRowClass

    def set_classB(self, DataRowClass):
        self.classB = DataRowClass

    def set_matchCriteriaA(self, match_criteria):
        """
        Sets the match criteria for classA. Also sets the columns needed for
        the given match criteria.
        """
        self._matchCriteriaA = match_criteria
        # set the needed columns for the given match criteria
        self.set_neededColumnsA()

    @property
    def matchCriteriaA(self):
        return self._matchCriteriaA

    def set_matchCriteriaB(self, match_criteria):
        """
        Sets the match criteria for classB. Also sets the columns needed for
        the given match criteria.
        """
        self._matchCriteriaB = match_criteria
        # set the needed columns for the given match criteria
        self.set_neededColumnsB()

    @property
    def matchCriteriaB(self):
        return self._matchCriteriaB

    def get_needed_columnsAB(self, AorB):
        """
        Retrieves which columns in the desired class is needed for the match
        criteria.

        Parameters
        ----------
        AorB: str
            Either 'A' or 'B'; which class to get the columns for.

        Returns
        -------
        needed_cols: list
            The list of needed columns; see get_needed_columns for
            details.
        """
        return get_needed_columns(
            getattr(self, 'class%s' % AorB).__slots__,
            getattr(self, 'matchCriteria%s' %AorB))

    def set_neededColumnsA(self):
        self._neededColumnsA = self.get_needed_columnsAB('A')

    @property
    def neededColumnsA(self):
        return self._neededColumnsA

    def set_neededColumnsB(self):
        self._neededColumnsB = self.get_needed_columnsAB('B')

    @property
    def neededColumnsB(self):
        return self._neededColumnsB

    def set_diffFunc( self, function ):
        self.diffFunc = function

    def set_window(self, window):
        self.window = window
    #
    #   Functions
    #
    def _diff( self, a, b ):
        """
        Returns the absolute value of the difference between a and b.

        Parameters
        ----------
        a: float or integer
        b: float or integer

        Returns
        -------
        difference: float or integer
            The abs difference between a and b.
        """
        return abs(a - b)

    def compare(self, a, b):
        """
        Runs self.diffFunc on a and b and checks that that is <= self.window.

        Parameters
        ----------
        a: instance of classA row
            The data passed to the first argument of self.diffFunc.
        b: instance of classB row
            The data passed to the second argument of self.diffFunc.

        Returns
        -------
        comparison: bool
            True if self.diffFunc(a, b) is <= self.window; False otherwise.
        """
        return self.diffFunc(a, b) <= self.window

    def dbWrapper(self, *args):
        """
        A database wrapper for the compare functions.

        Parameters
        ----------
        args: list
            A list of values. The first len(self.neededColumnsA) is assumed
            to be the data for classA, in the order that neededColumnsA is
            in. The rest of the values are assumed to be the data for
            classB, in the order that neededColumnsB is in.

        Returns
        -------
        comparison: bool
            The result of self.compare, where the first argument passed is
            the data from classA and the second is data from classB.
        """
        # split the flat argument list at the classA/classB boundary
        dataA = [args[i] for i in range(len(self.neededColumnsA))]
        dataB = [args[i] for i in range(len(self.neededColumnsA), len(args))]
        dataA = zip(self.neededColumnsA, dataA)
        dataB = zip(self.neededColumnsB, dataB)
        return self.compare(dataA, dataB)

    def create_dbCompF(self, connection, diffFunc, compFuncName, window):
        """
        Creates a function in the given connection to a database that allows
        the given diffFunc to be performed on classA and classB on the fly.
        The matchCriteria and the neededColumns for each class must be
        already set (this should happen simultaneously by using
        set_matchCriteria(A|B)).

        Parameters
        ----------
        connection: sqlite3.connection
            A connection to SQLite database.
        diffFunc: function
            The function to use to do comparisons; must be one of the
            functions defined in this class.
        compFuncName: str
            What to call the call function in the database; must be unique.
        window: float
            The size of the window to use when determining whether or not
            classA and classB are the same.
        """
        if self._matchCriteriaA is None:
            raise ValueError("matchCriteriaA not set! " +\
                "Run self.set_matchCriteriaA with appropriate arguments.")
        if self._neededColumnsA is None:
            raise ValueError("neededColumnsA not set! " +\
                "Run self.set_matchCriteriaA to set the needed columns and " +\
                "the match criteria.")
        if self._matchCriteriaB is None:
            raise ValueError("matchCriteriaB not set! " +\
                "Run self.set_matchCriteriaB with appropriate arguments.")
        if self._neededColumnsB is None:
            raise ValueError("neededColumnsB not set! " +\
                "Run self.set_matchCriteriaB to set the needed columns and " +\
                "the match criteria.")
        self.set_diffFunc(diffFunc)
        self.set_window(window)
        # the SQL function takes one argument per needed column of each class
        connection.create_function(compFuncName,
            len(self.neededColumnsA)+len(self.neededColumnsB), self.dbWrapper)

    def diffRowARowB(self, dataA, dataB):
        """
        Runs self._diff on self.classA and self.classB using
        self.matchCriteriaA and self.matchCriteriaB. A or B can be any
        DataRow class; the only requirement is that their match criteria
        (set by self.matchCriteria(A|B)) be a function of their slots.
        Special match criteria are 'startTime' and 'endTime'. In this case,
        (start|end)_time+1e-9*(start|end)_time_ns will be calculated.

        Parameters
        ----------
        dataA: list
            A list of tuples with data to populate this instance of classA.
            The first value of each tuple is the column name, the second the
            value, e.g., ('ifo', 'H1').
        dataB: list
            A list of data tuples to populate this instance of classB.

        Returns
        -------
        diff: float
            The return of self._diff(a,b), where a(b) is the
            matchCriteriaA(B) function run on dataA(B).
        """
        # store the data
        rowA = self.classA()
        rowA.store(dataA)
        rowB = self.classB()
        rowB.store(dataB)
        # analyze the sngl functions
        if self.matchCriteriaA == 'startTime':
            a = rowA.start_time + 1e-9*rowA.start_time_ns
        elif self.matchCriteriaA == 'endTime':
            a = rowA.end_time + 1e-9*rowA.end_time_ns
        else:
            a = rowA.get_value( self.matchCriteriaA )
        if self.matchCriteriaB == 'startTime':
            b = rowB.start_time + 1e-9*rowB.start_time_ns
        elif self.matchCriteriaB == 'endTime':
            b = rowB.end_time + 1e-9*rowB.end_time_ns
        else:
            b = rowB.get_value( self.matchCriteriaB )
        return self._diff(a, b)

    def diffSimSngl( self, simData, snglData ):
        """
        Same as diffRowARowB, except that classA is assumed to be some sort
        of simulation table (e.g., sim_inspiral) and classB is assumed to be
        some sort of single-IFO table (e.g., sngl_inspiral). This assumption
        only matters if 'startTime' or 'endTime' are the match criteria for
        classA. In that case, the observatory that recorded the event in
        classB is retrieved from classB.ifo. This is then used to pick out
        the appropriate end|start time to use from classA. For example, if
        H1 is the ifo in the snglData, then
        h_(end|start)_time+1e-9*h_(end|start)_time_ns will be retrieved from
        the simData.
        @simData: a list of tuples with data to populate this instance of
        classA. If self.matchCriteriaA is 'endTime' or 'startTime', classA
        is assumed to be a row in a simulation table, and must have
        {site}_(start|end)_time(_ns) columns.
        @snglData: a list of tuples with data to populate this instance of
        classB. If self.matchCriteriaB is 'endTime' or 'startTime', classB
        is assumed to be a row in a single-IFO table, and must have an ifo
        column.
        """
        # store the data
        simRow = self.classA()
        simRow.store(simData)
        snglRow = self.classB()
        snglRow.store(snglData)
        # analyze the sim function
        if self.matchCriteriaA == 'startTime':
            # the site prefix is the first letter of the sngl row's ifo
            site = snglRow.ifo.lower()[0]
            a = getattr( simRow, '%s_start_time' % site ) + 1e-9*getattr( simRow, '%s_start_time_ns' % site )
        elif self.matchCriteriaA == 'endTime':
            site = snglRow.ifo.lower()[0]
            a = getattr( simRow, '%s_end_time' % site ) + 1e-9*getattr( simRow, '%s_end_time_ns' % site )
        else:
            a = simRow.get_value( self.matchCriteriaA )
        # analyze the sngl function
        if self.matchCriteriaB == 'startTime':
            b = snglRow.start_time + 1e-9*snglRow.start_time_ns
        elif self.matchCriteriaB == 'endTime':
            b = snglRow.end_time + 1e-9*snglRow.end_time_ns
        else:
            b = snglRow.get_value( self.matchCriteriaB )
        return self._diff(a, b)

    def eThincaSim( self, simData, snglData):
        """
        Computes the eThinca distance between an instance of self.classA and
        an instance of self.classB. This assumes that classA inherited from
        the SimInspiral class and classB inherited from the SnglInspiral
        class.
        @simData: List of data tuples (column_name, value) with which to
        populate this instance of self.classA.
        @snglData: List of data tuples (column_name, value) with which to
        populate this instance of self.classB.
        """
        simRow = self.classA()
        simRow.store(simData)
        snglRow = self.classB()
        snglRow.store(snglData)
        # lal expects the event_id and simulation_id to be integers
        simRow.simulation_id = 0
        snglRow.event_id = 0
        return tools.XLALEThincaParameterForInjection( simRow, snglRow )

    def eThincaSngl( self, snglDataA, snglDataB ):
        """
        Computes the eThinca distance between an instance of self.classA and
        an instance of self.classB. This assumes that both classA and classB
        inherited from the SnglInspiral class.
        @snglDataA: List of data tuples (column_name, value) with which to
        populate this instance of self.classA.
        @snglDataB: List of data tuples (column_name, value) with which to
        populate this instance of self.classB.
        """
        snglRowA = self.classA()
        snglRowA.store(snglDataA)
        snglRowB = self.classB()
        snglRowB.store(snglDataB)
        # lal expects the event_ids to be integers
        snglRowA.event_id = 0
        snglRowB.event_id = 0
        try:
            ethincaVal = tools.XLALCalculateEThincaParameter( snglRowA, snglRowB )
        except ValueError:
            # not coincident, just return inf
            ethincaVal = float('inf')
        return ethincaVal
676
class OffsetVector(dict):
    """
    A dictionary mapping ifo -> time-slide offset, with a configurable
    notion of equality.
    """
    # when True, equality only compares the offsets of ifos that appear in
    # both vectors
    weak_equality = False

    def __init__(self, offset_dict):
        # copy the given mapping into self
        for ifo, offset in offset_dict.items():
            self[ifo] = offset

    def __eq__(self, other):
        """
        The default equality test is to consider two vectors to be equal
        only if all ifos are the same and all offsets are the same. If one
        vector is a subset of the other vector, they will not be considered
        equal. However, if the class attribute weak_equality is set to True,
        only offsets of the ifos that are both in self and other will be
        checked. For example:
        >>> a = OffsetVector({'H1': 0, 'L1': 5})
        >>> b = OffsetVector({'H1': 0, 'L1': 5, 'V1': 10})
        >>> a == b
        False
        >>> OffsetVector.weak_equality = True
        >>> a == b
        True
        """
        if type(other) != type(self):
            return False
        if OffsetVector.weak_equality:
            shared = set(self.keys()) & set(other.keys())
            return all( self[ifo] == other[ifo] for ifo in shared )
        return self.__hash__() == other.__hash__()

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        # under weak equality every vector hashes alike, forcing hash-based
        # containers to fall back on __eq__
        if OffsetVector.weak_equality:
            return 1
        return hash(tuple(sorted(self.items())))
710
class Category:
    """
    Class to store category information.
    """
    # attributes compared by the default __eq__/__hash__
    default_match_criteria = ['offset_vector', 'datatype', 'veto_cat', 'on_instruments', 'ifos', 'param_group']

    def __init__(self, offset_vector = None, datatype = None, veto_cat = None, on_instruments = frozenset(['ALL']), ifos = frozenset(['ALL']), param_group = None):
        """
        @offset_vector: dictionary of ifo -> time-slide offset; copied into
            an OffsetVector. Defaults to an empty vector.
        @datatype: e.g. 'slide', 'background', ...
        @veto_cat: the veto category label
        @on_instruments: instruments that were on; stored as a frozenset
        @ifos: the coincident ifos; stored as a frozenset
        @param_group: parameter-bin label
        """
        # use a None sentinel rather than a mutable default argument
        if offset_vector is None:
            offset_vector = {}
        self.offset_vector = OffsetVector(offset_vector)
        self.datatype = datatype
        self.veto_cat = veto_cat
        self.on_instruments = frozenset(on_instruments)
        self.ifos = frozenset(ifos)
        self.param_group = param_group
        # accumulated livetime, in seconds
        self.livetime = 0

    def add_livetime(self, time):
        self.livetime += time

    def get_livetime(self, time_units = 'yr'):
        # convert the stored seconds into the requested units
        return sqlutils.convert_duration( self.livetime, time_units )

    def selective_eq(self, other, check_me):
        """
        Only checks the values listed in check_me to figure out whether or
        not self is equal to other.
        """
        if type(other) != type(self):
            return False
        return all(getattr(self,x) == getattr(other,x) for x in check_me)

    def __eq__(self, other):
        """
        For default equality check, uses class attribute
        default_match_criteria to check what parameters should be
        considered.
        """
        b = type(self) == type(other) and self.__hash__() == other.__hash__()
        if b and OffsetVector.weak_equality and 'offset_vector' in Category.default_match_criteria:
            # hashes collide under weak equality, so compare the offset
            # vectors explicitly
            b = self.offset_vector == other.offset_vector
        return b

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        return hash(tuple(getattr(self,x) for x in Category.default_match_criteria))
755
756 757 -class Data( dict ):
758 """ 759 Class to store statistics and livetime for plotting. 760 """
761 - class DataElement:
762 """ 763 Sub-class to store individual data elements. 764 765 @categories: a list of instances of the Category class defining which categories this data element belongs to 766 @data: an instance of the DataRow class listing statistics and methods associated with this element 767 """
768 - def __init__(self, thisid, data):
769 self._id = thisid 770 self.data = data 771 self.cumrates = {}
772
773 - def update(self, _id = None, data = None):
774 # update id 775 if _id is not None: 776 self._id = _id 777 # update data 778 if data is not None: 779 self.data = data
780
781 - def __init__(self):
782 """ 783 A list of all the data elements is kept as an index. 784 """ 785 self.data_index = {}
786
787 - def add_data(self, _id, categories, data):
788 """ 789 Adds a new DataElement to self. 790 791 @_id: some unique value to identify the data element 792 @categories: a list of categories that this data falls in. If one or more of these categories are equal (equality determined by the default Category match_criteria) to a category already in all_categories, the category is set to that category. This results in distinct categories only being saved once in memory, with all DataElements that share that category pointing to the same memory address. 793 """ 794 d = self.DataElement( _id, data ) 795 self.data_index[d._id] = d 796 for c in categories: 797 self.setdefault(c, []) 798 self[c].append( d )
799
800 - def update(self, _id, categories = [], data = None, addToExistingCat = True, errOnMissing = True):
801 """ 802 Updates all DataElements in self that have the given id. If no DataElement is found with the given id and errOnMissing is False, adds a new entry. 803 """ 804 if _id not in self.data_index: 805 if errOnMissing: 806 raise ValueError, "An element with id %s could not be found." % str(_id) 807 else: 808 self.add_data( _id, categories, data ) 809 else: 810 self.data_index[_id].update( data = data) 811 self.refresh_categories( [self.data_index[_id]] )
812
813 - def add_livetime(self, livetime, category, match_criteria = []):
814 """ 815 Adds livetime to all categories in self that match the given criteria. 816 """ 817 if match_criteria == []: 818 match_criteria = Category.default_match_criteria 819 for cat in [cat for cat in self if cat.selective_eq(category, match_criteria)]: 820 cat.livetime += livetime
821
822 - def get_livetime(self, category, match_criteria = [], time_units = 'yr'):
823 """ 824 Returns the sum of all the livetimes of categories that match the given category via the given match_criteria. 825 """ 826 if match_criteria == []: 827 match_criteria = Category.default_match_criteria 828 return sqlutils.convert_duration(sum([cat.livetime for cat in self if cat.selective_eq(category, match_criteria)]), time_units)
829
830 - def create_background(self, match_criteria = []):
831 """ 832 Creates background categories out of the slide categories and adds this to all slide elements' categories lists. Default action is to create a background for each veto-category, on_instruments, ifos, and param_group. However, this can be overridden with the match_criteria argument. 833 """ 834 if match_criteria == []: 835 match_criteria = ['veto_cat', 'on_instruments', 'ifos', 'param_group'] 836 for vals in set([ tuple(getattr(c, x) for x in match_criteria) for c in self if c.datatype == 'slide' ]): 837 # create the background category 838 bkg_cat = Category( offset_vector = {}, datatype = 'background' ) 839 [setattr(bkg_cat, x, y) for x, y in zip(match_criteria, vals)] 840 bkg_cat.livetime = sum([c.livetime for c in self if c.datatype == 'slide' and bkg_cat.selective_eq(c, match_criteria) ]) 841 # add this background category to each matching slide's categories 842 self[bkg_cat] = list(set([x for c in self if c.datatype == 'slide' and bkg_cat.selective_eq(c, match_criteria) for x in self[c]]))
843
844 - def compute_cumrates(self, stat, foreground_datatype, rank_by = 'max', group_by = [], num_slides = 100.):
845 """ 846 Computes the cumulative rates for all the distinct groups that exist in self. Distinct groups are determined by group_by. 847 """ 848 if group_by == []: 849 group_by = ['datatype', 'veto_cat', 'on_instruments', 'ifos', 'param_group'] 850 distinct_groups = set([ tuple(getattr(c,x) for x in group_by) for c in self]) 851 for group in distinct_groups: 852 this_group = Category() 853 [setattr(this_group, x, y) for (x,y) in zip( group_by, group )] 854 this_group.livetime = self.get_livetime( this_group, group_by, time_units = 's' ) 855 # get the list of all stats that fall in this category 856 this_data = [] 857 for c in self: 858 if c.selective_eq(this_group, group_by): 859 this_data.extend( self[c] ) 860 this_data = sorted(set(this_data), key = lambda x: getattr(x.data, stat), reverse = rank_by == 'min') 861 d = [getattr(x.data, stat) for x in this_data] 862 # we need to figure out the number of trials that were done in this category; we do this by taking the ratio 863 # of the this category's livetime to the foreground datatype associated with this category's livetime 864 # temporarily set this_group's datatype to foreground_datatype in order to get the right livetime 865 orig_dt = this_group.datatype 866 this_group.datatype = foreground_datatype 867 fg_livetime = self.get_livetime( this_group, match_criteria = 'datatype' not in group_by and group_by+['datatype'] or group_by, time_units = 's' ) 868 nTrials = float(this_group.livetime) / fg_livetime 869 # set datatype back to correct 870 this_group.datatype = orig_dt 871 # compute the cum-rates 872 these_cumrates = [ (len(d) - bisect.bisect_left(d, x))/nTrials for x in d ] 873 # assign to data 874 for data_elem, rate in zip( this_data, these_cumrates ): 875 data_elem.cumrates[this_group] = rate
876
def get_cumrates(self, group, stat, rank_by ='max'):
    """
    Returns a list of (stat, cumrate, id) tuples, drawn from every stored
    element whose cumrates dictionary has an entry for the given group, sorted
    by stat (descending when rank_by is 'min', ascending otherwise).
    """
    results = []
    for elem in self.data_index.values():
        if group in elem.cumrates:
            results.append((getattr(elem.data, stat), elem.cumrates[group], elem._id))
    results.sort(reverse = rank_by == 'min')
    return results
882
def get_data(self, _id = None, category = None, category_match_criteria = []):
    """
    Returns a set of DataElements that match a given id, a given category, or
    both. If category_match_criteria is specified, data matching the specified
    elements in category is returned; otherwise, Category.default_match_criteria
    is used when comparing category to the stored categories.
    """
    if category_match_criteria == []:
        category_match_criteria = Category.default_match_criteria
    matches = set()
    for this_cat in self:
        if category is not None and not this_cat.selective_eq(category, category_match_criteria):
            continue
        for elem in self[this_cat]:
            if _id is None or _id == elem._id:
                matches.add(elem)
    return matches
890
def get_categories(self, category, match_criteria = []):
    """
    Returns a list of the categories stored in self that match the given
    category via the match_criteria (Category.default_match_criteria when none
    are given).
    """
    if match_criteria == []:
        match_criteria = Category.default_match_criteria
    matching = []
    for this_cat in self:
        if this_cat.selective_eq(category, match_criteria):
            matching.append(this_cat)
    return matching
898
def collapse(self, args):
    """
    Cycles over the DataElements in self, replacing each element's data with a
    collapsed row containing only the requested values.

    @param args: A list of tuples. In each tuple, the first element is the
        name to give the new collapsed value and the second element is the
        argument to carry out (either a column name or a function of columns)
        on the uncollapsed row to get the collapsed value.
    """
    # removed the unused enumerate counter from the original loop
    cols = [name for name, _ in args]
    fns = [fn for _, fn in args]
    collapsedRow = createDataRowClass('collapsedRow')
    for origElement in self.data_index.values():
        newRow = collapsedRow(cols)
        newRow.store([(col, origElement.data.get_value(fn)) for col, fn in zip(cols, fns)])
        origElement.data = newRow
912
913 -def combineData(dataObj, match_column, args, param_grouping_function, verbose = False):
914 """ 915 Cycles over the DataElements in dataObj, combining any DataElements with the same match_column value via the given args and returns a new Data object in which the element's ids are the values of the match_column. Note: the categories of the DataElements in the new Data object are just the concatenation of the older objects individual categories. These might need to be updated depending on the paramters of the newer category. 916 917 @dataObj: the instace of Data to carry the combination on 918 @match_column: name of column in the DataElements to use to match rows to combine; e.g., 'coinc_event_id' 919 @args: a list of tuples. In each tuple the first element is the name to give the new combined value, the second element is the column in each row to identify that row by, the third is the column or function of columns in each row to combine, and the final element is the way to combine them, which can be either a predefined method or a function in terms of values of the first element. 
For example, if you wanted the average chirp mass and the sum of the squares of new_snr over H1 and L1, the args should look like: 920 args = [ (combined_newsnr_sq, ifo, get_new_snr, H1**2.+L1**2.), (combined_mchirp, ifo, mchirp, mean) ] 921 """ 922 cols = [arg[0] for arg in args] 923 colkeys = [arg[1] for arg in args] 924 sngl_stats = [arg[2] for arg in args] 925 cmb_fncs = [arg[3] for arg in args] 926 newData = Data() 927 combinedRow = createDataRowClass( 'combinedRow' ) 928 # get the unique match values 929 match_vals = {} 930 for d in dataObj.data_index.values(): 931 this_id = d.data.get_value(match_column) 932 match_vals.setdefault(this_id, []) 933 match_vals[this_id].append(d) 934 ii = 0 935 for idcol, combine_data in match_vals.items(): 936 ii += 1 937 if verbose: 938 if ii != len(match_vals): 939 print "%i/%i (%.2f%%)\r" % (ii, len(match_vals), 100*float(ii)/len(match_vals)), 940 else: 941 print '' 942 newRow = combinedRow( cols ) 943 stats = [ dict([ [x.data.get_value(colkey), x.data.get_value(snglstat)] for x in combine_data ]) for colkey, snglstat in zip(colkeys, sngl_stats) ] 944 newRow.store( [( col, combineRowStats( fnc, stat_dict )) for col, fnc, stat_dict in zip(cols, cmb_fncs, stats)] ) 945 orig_cats = [y for x in combine_data for y in x.categories] 946 ifos_param = 'ifos' in dir(newRow) and 'ifos' or 'ifo' 947 new_cats = [Category( c.offset_vector, c.datatype, c.veto_cat, c.on_instruments, getattr(newRow, ifos_param), param_grouping_function(newRow.param) ) for c in orig_cats] 948 newData.add_data( id(newRow), new_cats, newRow ) 949 950 return newData
951