1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 from __future__ import division
18
19 import itertools
20 import os
21 import sys
22 import random
23
24 import numpy as np
25 from scipy import optimize
26 from scipy import special
27 from scipy import stats
28
29 from glue import iterutils
30 from pylal import plotutils
31 from lal import rate
32 from pylal.stats import rankdata
33
34
35
36
38
39
40 - def __init__(self, grb_name, inj_by_trial, off_by_trial, on_source):
41 """
42 Initializes this object by passing the injection/offsource
43 and onource data
44 """
45
46
47 self.grb_name = grb_name
48
49
50 self.inj_by_trial = inj_by_trial
51 self.off_by_trial = off_by_trial
52 self.onsource = on_source
53
54
55 self.inj_by_trial.sort()
56 self.off_by_trial.sort()
57
58
60 """
61 Returns a particluar number from any sample
62 """
63
64 if type=='box':
65 return self.onsource
66 elif type=='random':
67 index = random.randrange(len(self.off_by_trial))
68 return self.off_by_trial[index]
69 elif type=='max':
70 return max(self.off_by_trial)
71 elif type=='injectMax':
72 return max(self.inj_by_trial)
73 elif type=='injectUpper':
74 n = len(self.inj_by_trial)
75 index = random.randrange(int(0.7*n), n)
76 return self.inj_by_trial[index]
77 elif type=='injectMedian':
78 n = len(self.inj_by_trial)
79 index = random.randrange(int(0.3*n), int(0.5*n))
80 return self.inj_by_trial[index]
81 elif type=='injectLower':
82 n = len(self.inj_by_trial)
83 index = random.randrange(0, int(0.4*n))
84 return self.inj_by_trial[index]
85 elif type=='injectRange':
86 pseudo_list = [x for x in self.inj_by_trial if x>3.5 and x<5.5]
87 return random.choice(pseudo_list)
88
89 elif type=='injectRandom':
90 index = random.randrange(len(self.inj_by_trial))
91 return self.inj_by_trial[index]
92 else:
93 raise ValueError, "Unknown type %s for trigger selection. " % type
94
95
96
97
98
100
101
102 - def __init__(self, grb_data, name_suffix):
103 """
104 Initializes the class with the GRB data
105 """
106
107
108 self.off_list = []
109 self.on_list = []
110
111
112 self.lik_by_grb = []
113 self.ifar_by_grb = []
114
115
116 self.stat = None
117
118
119
120 self.type = None
121
122
123 self.list_grbs = []
124
125
126 self.grb_data = grb_data
127
128
129 self.n_grb = 0
130 self.n_off = 0
131
132
133 self.name_suffix = name_suffix
134
135
136 self.p_one_sided = None
137 self.p_two_sided = None
138 self.u = None
139 self.z = None
140
141 self.colors = itertools.cycle(['b', 'g', 'r', 'c', 'm', 'y', 'k'])
142
143
144 - def add_data(self, grb_name, pop_injections_by_trial, \
145 pop_background_by_trial, pop_onsource):
146
147
148 self.list_grbs.append(grb_name)
149
150
151 data_lik = GRBdata(grb_name, pop_injections_by_trial, \
152 pop_background_by_trial, pop_onsource)
153
154
155 off_ifar = self.calculate_sample_to_ifar(pop_background_by_trial,\
156 pop_background_by_trial)
157 inj_ifar = self.calculate_sample_to_ifar(pop_injections_by_trial,\
158 pop_background_by_trial)
159 on_ifar = self.calculate_sample_to_ifar([pop_onsource], \
160 pop_background_by_trial)
161
162
163 data_ifar = GRBdata(grb_name, inj_ifar, off_ifar, on_ifar[0])
164
165
166 self.lik_by_grb.append(data_lik)
167 self.ifar_by_grb.append(data_ifar)
168
169
171 """
172 Just sets some numbers
173 """
174
175 self.n_grb = len(self.lik_by_grb)
176
177 self.n_off = 0
178 for data_list in self.lik_by_grb:
179 self.n_off += len(data_list.off_by_trial)
180
181
183 """
184 Calculates a list of FAR given the items in the sample
185 """
186
187 vector_ifar = []
188 for item in sample:
189 ifar = self.calculate_ifar(item, sample_ref)
190 vector_ifar.append(ifar)
191
192 return vector_ifar
193
194
196
197 n_sample = float(len(sample))
198 count_louder = (sample >= value).sum(axis=0)
199
200 count_louder = max(count_louder, 1)
201 return n_sample/count_louder
202
203
205
206 data_list_by_grb = self.lik_by_grb
207 if far:
208 data_list_by_grb = self.ifar_by_grb
209
210
211 if far:
212 tag = 'log(IFAR)'
213 sname = 'far'
214 else:
215 tag = 'Likelihood'
216 sname = 'lik'
217 plot = plotutils.SimplePlot(tag, r"cumulative sum",\
218 r"Cumulative distribution offsource")
219
220
221 nbins = 20
222 bins = rate.LinearBins(pop_min, pop_max, nbins)
223 px = bins.lower()
224
225 for data_list in data_list_by_grb:
226
227 grb_name = data_list.grb_name
228
229 tmp_pop = data_list.off_by_trial
230 tmp_arr = np.array(tmp_pop, dtype=float)
231 off_pop = tmp_arr[~np.isinf(tmp_arr)]
232 off_pop.sort()
233 py = range(len(off_pop), 0, -1)
234 if far:
235 off_pop = np.log10(off_pop)
236
237
238 ifos = self.grb_data[grb_name]['ifos']
239 if ifos=='H1L1':
240 linestyle = '-'
241 elif ifos == 'H1H2':
242 linestyle = '-.'
243 else:
244 linestyle = ':'
245
246
247
248 plot.add_content(off_pop, py, color = self.colors.next(),\
249 linestyle = linestyle, label=grb_name)
250 plot.finalize()
251 plot.ax.set_yscale("log")
252 return plot
253
254
255
258
259
261 return self.check_off_distribution(0.0, 3.0, far = True)
262
263
265 """
266 Choose your statistics, either lik or ifar
267 """
268 self.stat = stat
269
270
272 """
273 Selecting fake trials from the set of offsource
274 trials for each GRB.
275 """
276
277
278 if self.stat=='ifar':
279 data_list_by_grb = self.ifar_by_grb
280 elif self.stat=='lik':
281 data_list_by_grb = self.lik_by_grb
282 else:
283 raise ValueError, "Unknown statistics %s in select_onsource" %\
284 self.stat
285
286
287 self.on_list = []
288 self.off_list = []
289 self.type = type
290
291 for counter, data_list in \
292 enumerate(data_list_by_grb):
293
294 if type=='box':
295 value = data_list.choice(type)
296
297 elif 'inject' in type:
298 number_inj = int(type[-1])
299 if counter<number_inj:
300 if 'Max' in type:
301 value = data_list.choice('injectMax')
302 print "injectMax lik: ", value
303 elif 'Random' in type:
304 value = data_list.choice('injectRandom')
305 print "injectRandom lik: ", value
306 elif 'Upper' in type:
307 value = data_list.choice('injectUpper')
308 print "injectUpper lik: ", value
309 elif 'Median' in type:
310 value = data_list.choice('injectMedian')
311 print "injectMedian lik: ", value
312 elif 'Lower' in type:
313 value = data_list.choice('injectLower')
314 print "injectLower lik: ", value
315 elif 'Range' in type:
316 value = data_list.choice('injectRange')
317 print "injectRange lik: ", value
318 else:
319 raise ValueError,"Unknown injection type: ", type
320 else:
321 value = data_list.choice('box')
322 print "box lik: ", value
323
324 elif type=='random' or (type=='single' and counter>0):
325 value = data_list.choice('random')
326
327 elif type=='max' or (type=='single' and counter==0):
328 value = data_list.choice('max')
329
330 else:
331 raise ValueError,"Unknown selection type: ", type
332
333
334 self.on_list.append(value)
335 self.off_list.extend(data_list.off_by_trial)
336
337
338 self.on_list = np.asarray(self.on_list)
339 self.off_list = np.asarray(self.off_list)
340
341
342 if reject_empty:
343 self.on_list = self.remove_empty_trials(self.on_list)
344 self.off_list = self.remove_empty_trials(self.off_list)
345
346
347 self.n_grb = len(self.on_list)
348 self.n_off = len(self.off_list)
349
350
352 """
353 Function to remove any infinite value from a given
354 list of data.
355 """
356 inf_ind = np.isinf(data)
357 return data[~inf_ind]
358
359
361 """
362 Return the Mann-Whitney U statistic on the provided scores. Copied from
363 scipy.stats.mannwhitneyu except that we only return the U such that
364 large U means that population x was systematically larger than population
365 y, rather than the smaller U between x and y. The two possible U values
366 one can report are related by U' = n1*n2 - U.
367 """
368 x = np.asarray(x)
369 y = np.asarray(y)
370 if x.ndim != 1 or y.ndim != 1:
371 raise ValueError, "populations must be rank 1 collections"
372 n1 = len(x)
373 n2 = len(y)
374
375 ranked = rankdata(np.concatenate((x,y)))
376 rankx = ranked[0:n1]
377 u1 = n1 * n2 + (n1 * (n1 + 1)) / 2.0 - rankx.sum()
378 self.u = n1 * n2 - u1
379 return self.u
380
381
383 """
384 Return the z-score of a given sample.
385 Not appropriate for n1 + n2 < ~20.
386 """
387
388 n1 = len(pop_test)
389 n2 = len(pop_ref)
390 u_value = self.mannwhitney_u(pop_test, pop_ref)
391 mean_U = n1 * n2 / 2
392 stdev_U = np.sqrt(n1 * n2 * (n1 + n2 + 1) / 12)
393 self.z = (u_value - mean_U) / stdev_U
394
395 return self.z
396
397
399 """
400 Computes the WMU z-score for the both cases
401 using likelihood and the IFAR
402 """
403
404
405 z_value = self.mannwhitney_u_zscore(self.on_list, self.off_list)
406
407
408 self.p_one_sided = stats.distributions.norm.sf(z_value)
409
410
411 self.p_two_sided = stats.erfc(abs(z_value) / np.sqrt(2.))
412
413 return z_value
414
415
417 """
418 Convert a floating point number to a latex representation. In particular,
419 scientific notation is handled gracefully: e -> 10^
420 """
421 base_str = format % x
422 if "e" not in base_str:
423 return base_str
424 mantissa, exponent = base_str.split("e")
425 exponent = str(int(exponent))
426
427
429
430
431
432
433 plot_title = r"$m_2 \in [%s), U=%d, z_U=%s, p_1=%s, p_2=%s$" \
434 % (self.name_suffix , int(self.u), self.float_to_latex(self.z,\
435 "%5.2g"),
436 self.float_to_latex(self.p_one_sided, "%5.2g"),
437 self.float_to_latex(self.p_two_sided, "%5.2g"))
438 plot = plotutils.VerticalBarHistogram(r"$IFAR(m_2 \in [%s))$" %\
439 self.name_suffix, "PDF", \
440 plot_title)
441 plot.add_content(self.on_list, color='r', label = r'On source', \
442 bottom = 1.0e-4)
443 plot.add_content(self.off_list, color='b', label = r'Off source', \
444 bottom = 1.0e-4)
445 plot.finalize(normed=True)
446 plot.ax.set_yscale('log')
447 return plot
448
449
451
452
453
454
455 plot_title = r"$m_2 \in [%s), U=%d, z_U=%s, p_1=%s, p_2=%s$" \
456 % (self.name_suffix , int(self.u), \
457 self.float_to_latex(self.z, "%5.2g"),
458 self.float_to_latex(self.p_one_sided, "%5.2g"),
459 self.float_to_latex(self.p_two_sided, "%5.2g"))
460 if self.type=="box":
461 plot_title = ""
462 plot = plotutils.QQPlot(r"self quantile", "combined quantile", \
463 plot_title)
464 plot.add_bg(self.off_list, linewidth = 3, label="\"Off source\"")
465 plot.add_fg(self.on_list, color='r', marker = 'o',\
466 label = r'On source',\
467 linestyle='None',markersize=10)
468 plot.finalize()
469 return plot
470
471
473
474
475 self.off_list.sort()
476 on_data = np.asarray(self.on_list)
477 off_data = np.asarray(self.off_list)
478
479 y_on = []
480 y_off = []
481 x_onoff = []
482 counter = 0
483 for value in off_data[::-1]:
484 counter += 1
485 x_onoff.append(value)
486 y_off.append(counter)
487 y_on.append((on_data>value).sum())
488
489
490 x_onoff = np.asarray(x_onoff)
491 y_on = np.asarray(y_on)
492 y_off = np.asarray(y_off)
493 inf_ind = np.isinf(x_onoff)
494 x_onoff[inf_ind] = 0.0
495
496
497 scale = y_on.max()/y_off.max()
498 y_off_scale = y_off*scale
499
500
501 plot = plotutils.SimplePlot(r"Likelihood", r"Cumulative sum",\
502 r"Cumulative distribution")
503 plot.add_content(x_onoff, y_off_scale, color = 'r', \
504 linewidth = 2, label = 'off-source')
505 plot.add_content(x_onoff, y_on, color = 'b',\
506 linewidth = 3, label = 'on-source')
507
508 plot.finalize()
509
510
511 plot.ax.set_xscale('log')
512 plot.ax.axis([2, 20, 0.0, 22.0])
513 plot.ax.set_xticks([2,3,5,10])
514 plot.ax.set_xticklabels(['2','3','5','10'])
515
516 return plot
517
518
520 """
521 Create a pdf plot of the used data.
522 """
523
524
525 data_off = np.sort(self.off_list)[::-1]
526 data_on = np.sort(self.on_list)[::-1]
527
528
529 data_on = self.remove_empty_trials(data_on)
530 data_off = self.remove_empty_trials(data_off)
531 n_grb = len(data_on)
532
533
534 pdf_x = []
535 pdf_off_y = []
536 pdf_on_y = []
537 pdf_on_r = []
538
539
540 index = 0
541 sum_off = 0.0
542 sum_tot = 0
543 for d in data_off:
544
545
546 sum_off += 1
547 sum_tot +=1
548
549
550 if d<=data_on[index]:
551
552
553 sum_scale = sum_off/self.n_off
554 y = 1.0/n_grb
555 r = y-sum_scale
556
557
558 pdf_x.append(d)
559 pdf_off_y.append(sum_scale)
560 pdf_on_y.append(y)
561 pdf_on_r.append(r)
562
563
564 if index==n_grb-1:
565 break
566
567
568 index += 1
569 sum_off = 0.0
570
571
572 remain_data = data_off[data_off<=d]
573 sum_off = sum(pdf_off_y)+len(remain_data)/self.n_off
574 sum_on = sum(pdf_on_y)
575 print "Sum of the onsource: %.1f offsource: %.1f" % \
576 (sum_on, sum_off)
577
578
579 plot = plotutils.SimplePlot(r"Likelihood", r"pdf",\
580 r"PDF Distribution")
581 plot.add_content(pdf_x, pdf_off_y, color = 'r', \
582 linewidth = 2, label = 'off-source')
583 plot.add_content(pdf_x, pdf_on_y, color = 'b', marker = 'o',\
584 markersize = 10.0, label = 'on-source')
585 plot.finalize()
586
587 return plot
588
589
591
592 def create_area(x, dx, y, dy):
593 px = [x-dx/2, x+dx/2, x+dx/2, x-dx/2, x-dx/2]
594 py = [y-dy/2, y-dy/2, y+dy/2, y+dy/2, y-dy/2]
595 return px, py
596
597 def draw_error_boxes(plot, x, dx, y, dy, col):
598
599 bx, by = create_area(x, dx, y, dy )
600 plot.ax.fill(bx, by, ec='w', fc = col, alpha = 0.2)
601
602 bx, by = create_area(x, dx, y, 2*dy )
603 plot.ax.fill(bx, by, ec='w', fc = col, alpha = 0.2)
604
605 bx, by = create_area(x, dx, y, 3*dy )
606 plot.ax.fill(bx, by, ec='k', fc = col, alpha = 0.2)
607
608 return plot
609
610
611
612 if range is None:
613 data_on = np.asarray(self.on_list)
614 inf_ind = np.isinf(data_on)
615 val_min = data_on[~inf_ind].min()
616 val_max = data_on.max()
617 else:
618 val_min = range[0]
619 val_max = range[1]
620
621
622 hist_on = np.zeros(n_bin)
623 hist_off = np.zeros(n_bin)
624
625
626 lik_bins = rate.LinearBins(val_min, val_max, n_bin)
627
628
629 for x in self.off_list:
630 if x>=val_min and x<=val_max:
631 hist_off[lik_bins[x]] += 1
632
633 for x in self.on_list:
634 if x>=val_min and x<=val_max:
635 hist_on[lik_bins[x]] += 1
636
637
638 px = lik_bins.centres()
639 dx = px[1]-px[0]
640
641
642 norm = self.n_grb/self.n_off
643 hist_on_norm = hist_on
644 hist_off_norm = norm*hist_off
645
646
647 plot = plotutils.SimplePlot(r"Likelihood", r"counts",\
648 r"Histogramm of on/offsource with %d bins"%\
649 (n_bin))
650
651
652 plot.add_content(px, hist_on, color = 'r', marker = 'o',\
653 markersize = 10.0, label = 'on-source')
654 plot.add_content(px, hist_off_norm, color = 'b',marker = 's', \
655 markersize = 10, label = 'off-source')
656
657
658 for x,n in zip(px, hist_on):
659
660
661 dn = np.sqrt(n)
662 plot = draw_error_boxes(plot, x, dx, n, dn, 'r')
663
664 plot.finalize()
665 plot.ax.axis([val_min, val_max, 0.0, 20.0])
666
667
668
669 for x in self.on_list:
670 if x>=val_min and x<=val_max:
671 plot.ax.plot([x,x],[0.0, 1.0],'k')
672 plot.ax.axis([val_min, val_max, 0.0, 20.0])
673
674 return plot
675
676
677
678
679 from scipy import stats
680
def mannwhitney_u(x, y):
    """
    Return the Mann-Whitney U statistic on the provided scores. Copied from
    scipy.stats.mannwhitneyu except that we only return the U such that
    large U means that population x was systematically larger than population
    y, rather than the smaller U between x and y. The two possible U values
    one can report are related by U' = n1*n2 - U.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    if x.ndim != 1 or y.ndim != 1:
        raise ValueError("populations must be rank 1 collections")
    n1 = len(x)
    n2 = len(y)

    # ranks of the x entries within the pooled sample
    rankx = rankdata(np.concatenate((x, y)))[:n1]
    u1 = n1 * n2 + (n1 * (n1 + 1)) / 2.0 - rankx.sum()
    return n1 * n2 - u1
700
def mannwhitney_p(U, n1, n2):
    """
    Return the right-tailed probability of the U value, assuming that
    N is sufficiently high.
    """
    # normal approximation to the distribution of U
    u_mean = n1 * n2 / 2.
    u_stdev = np.sqrt(n1 * n2 * (n1 + n2 + 1) / 12.)
    return stats.norm.sf((U - u_mean) / u_stdev)
709
def grbbinomial_Pmin_raw(localProb, Ndraws):
    """
    Find the tail of FAP values with the smallest cumulative binomial
    probability.

    @param localProb: *sorted* array of per-GRB FAP values
    @param Ndraws: total number of GRBs analyzed
    @return: tuple (Pmin_raw, Nmin): the smallest cumulative binomial
        probability over the tails, and the tail length at which it occurs
    """
    localProb = np.asarray(localProb)
    Ntail = len(localProb)

    # P[k]: probability of k+1 or more "successes" in Ndraws trials,
    # each with per-trial probability localProb[k]
    P = special.bdtrc(np.arange(Ntail), Ndraws, localProb)
    index = P.argmin()
    Pmin_raw = P[index]

    return Pmin_raw, index + 1
721
def grbbinomialtest(localProb, Ndraws, Nmc, discreteness=None):
    """
    Adapted from https://trac.ligo.caltech.edu/xpipeline/browser/trunk/utilities/grbbinomialtest.m

    localProb is a *sorted* array of FAP values, one per GRB to be tested
    Ndraws is a scalar saying how many GRBs were analyzed in total
    Nmc is the number of Monte-Carlo simulations to perform in assessing
      significance.
    discreteness is optional, but allows you to draw FAP values uniformly
      from multiples of 1 / discreteness

    Pmin_raw  Lowest cumulative binomial probability of the input set
              localProb. Note that this number does not account for the
              trials factor when length(localProb)>1.
    Pmin      Probability that the tail of length(localProb) of a set of
              Ndraws uniformly distributed random numbers will give a
              cumulative binomial probability less than or equal to
              Pmin_raw.
    Nmin      Number of tail values to include at which the binomial
              probability Pmin_raw occurs.
    """
    Ntail = len(localProb)
    Pmin_raw, Nmin = grbbinomial_Pmin_raw(localProb, Ndraws)

    # Monte-Carlo draws of Ndraws FAP values each, either continuous or
    # discretized to multiples of 1 / discreteness
    if discreteness is None:
        localProbMC = stats.uniform.rvs(size=(Nmc, Ndraws))
    else:
        localProbMC = stats.randint.rvs(0, discreteness + 1,
                                        size=(Nmc, Ndraws)) / discreteness

    # keep only the Ntail smallest FAPs of each simulation
    localProbMC.sort(axis=1)
    localProbMC = localProbMC[:, :Ntail]

    # minimum cumulative binomial probability of each simulation
    PMC = special.bdtrc(np.arange(Ntail)[None, :], Ndraws, localProbMC)
    PminMC = PMC.min(axis=1)
    # fraction of simulations at least as significant as the observation
    Pmin = (PminMC <= Pmin_raw).mean()

    return Pmin_raw, Pmin, Nmin
761
def grbbinomialtest_threshold(Ndraws, Ntail, percentile, Nmc,
                              discreteness=None, blocksize=10**4):
    """
    Adapted from https://trac.ligo.caltech.edu/xpipeline/browser/trunk/utilities/grbbinomialtest_threshold.m

    Ndraws is a scalar saying how many GRBs were analyzed in total
    Ntail is the number of loudest GRB events kept
    percentile is the desired percentile of the binomial probability
      distribution (This should be between 0 and 100!)
    Nmc is the number of Monte-Carlo simulations to perform in assessing
      significance
    discreteness is optional, but allows you to draw FAP values uniformly
      from multiples of 1 / discreteness
    blocksize bounds how many Monte-Carlo rows are drawn at once, to
      limit peak memory use

    Return the threshold on Pmin for the given percentile.
    NOTE(review): the original docstring also promises an array of the
    FAPs corresponding to that threshold, but only the threshold is
    computed and returned here.
    """
    assert Ntail <= Ndraws
    # factory producing one block of Monte-Carlo FAP draws
    if discreteness is None:
        draw = lambda n: stats.uniform.rvs(size=(n, Ndraws))
    else:
        draw = lambda n: stats.randint.rvs(0, discreteness + 1,
                                           size=(n, Ndraws)) / discreteness

    PminMC = []
    num_drawn = 0
    while num_drawn < Nmc:
        # draw in blocks to limit memory consumption
        num_to_draw = min(Nmc - num_drawn, blocksize)
        localProbMC = draw(num_to_draw)

        # keep only the Ntail smallest FAPs of each simulation
        localProbMC.sort(axis=1)
        localProbMC = localProbMC[:, :Ntail]

        # minimum cumulative binomial probability of each simulation
        PMC = special.bdtrc(np.arange(Ntail)[None, :], Ndraws, localProbMC)
        PminMC.extend(PMC.min(axis=1))
        num_drawn += num_to_draw

    # threshold at the requested percentile of the Pmin distribution
    PminMC = np.asarray(PminMC)
    Pmin_thresh = stats.scoreatpercentile(PminMC, percentile)
    return Pmin_thresh
804