gstlal-inspiral  0.4.2
 All Classes Namespaces Files Functions Variables Pages
rate_estimation.py
Go to the documentation of this file.
1 # Copyright (C) 2011--2014 Kipp Cannon, Chad Hanna, Drew Keppel
2 # Copyright (C) 2013 Jacob Peoples
3 #
4 # This program is free software; you can redistribute it and/or modify it
5 # under the terms of the GNU General Public License as published by the
6 # Free Software Foundation; either version 2 of the License, or (at your
7 # option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful, but
10 # WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
12 # Public License for more details.
13 #
14 # You should have received a copy of the GNU General Public License along
15 # with this program; if not, write to the Free Software Foundation, Inc.,
16 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 
18 ##
19 # @file
20 #
21 # A file that contains the rate estimation code.
22 #
23 # Review Status: Reviewed with actions
24 #
25 # | Names | Hash | Date |
26 # | ------------------------------------------- | ------------------------------------------- | ---------- |
27 # | Sathya, Duncan Me, Jolien, Kipp, Chad | 2fb185eda0edb9d49d79b8185f7b35457cafa06b | 2015-05-14 |
28 #
29 # #### Actions
30 # - Increase nbins to at least 10,000
31 # - Check max acceptance
32 
33 #
34 # =============================================================================
35 #
36 # Preamble
37 #
38 # =============================================================================
39 #
40 
41 
42 try:
43  from fpconst import NaN, NegInf, PosInf
44 except ImportError:
45  # fpconst is not part of the standard library and might not be
46  # available
47  NaN = float("nan")
48  NegInf = float("-inf")
49  PosInf = float("+inf")
50 import math
51 import numpy
52 from scipy import optimize
53 import sys
54 
55 
56 from glue.text_progress_bar import ProgressBar
57 from pylal import rate
58 
59 
60 from gstlal import emcee
61 from gstlal._rate_estimation import *
62 
63 
64 #
65 # =============================================================================
66 #
67 # Event Rate Posteriors
68 #
69 # =============================================================================
70 #
71 
72 
73 def maximum_likelihood_rates(ln_f_over_b):
74  # the upper bound is chosen to include N + \sqrt{N}
75  return optimize.fmin((lambda x: -RatesLnPDF(x, ln_f_over_b)), (1.0, len(ln_f_over_b) + len(ln_f_over_b)**.5), disp = True)
76 
77 
78 def run_mcmc(n_walkers, n_dim, n_samples_per_walker, lnprobfunc, pos0 = None, args = (), n_burn = 100, progressbar = None):
79  """
80  A generator function that yields samples distributed according to a
81  user-supplied probability density function that need not be
82  normalized. lnprobfunc computes and returns the natural logarithm
83  of the probability density, up to a constant offset. n_dim sets
84  the number of dimensions of the parameter space over which the PDF
85  is defined and args gives any additional arguments to be passed to
86  lnprobfunc, whose signature must be
87 
88  ln(P) = lnprobfunc(X, *args)
89 
90  where X is a numpy array of length n_dim.
91 
92  The generator yields a total of n_samples_per_walker arrays each of
93  which is n_walkers by n_dim in size. Each row is a sample drawn
94  from the n_dim-dimensional parameter space.
95 
96  pos0 is an n_walkers by n_dim array giving the initial positions of
97  the walkers (this parameter is currently not optional). n_burn
98  iterations of the MCMC sampler will be executed and discarded to
99  allow the system to stabilize before samples are yielded to the
100  calling code. A chain can be continued by passing the last return
101  value as pos0 and setting n_burn = 0.
102 
103  NOTE: the samples yielded by this generator are not drawn from the
104  PDF independently of one another. The correlation length is not
105  known at this time.
106  """
107  #
108  # construct a sampler
109  #
110 
111  sampler = emcee.EnsembleSampler(n_walkers, n_dim, lnprobfunc, args = args)
112 
113  #
114  # set walkers at initial positions
115  #
116 
117  # FIXME: implement
118  assert pos0 is not None, "auto-selection of initial positions not implemented"
119 
120  #
121  # burn-in: run for a while to get better initial positions.
122  # run_mcmc() doesn't like being asked for 0 iterations, so need to
123  # check for that
124  #
125 
126  if n_burn:
127  pos0, ignored, ignored = sampler.run_mcmc(pos0, n_burn, storechain = False)
128  if progressbar is not None:
129  progressbar.increment(delta = n_burn)
130  if n_burn and sampler.acceptance_fraction.min() < 0.4:
131  print >>sys.stderr, "\nwarning: low burn-in acceptance fraction (min = %g)" % sampler.acceptance_fraction.min()
132 
133  #
134  # reset and yield positions distributed according to the supplied
135  # PDF
136  #
137 
138  sampler.reset()
139  for coordslist, ignored, ignored in sampler.sample(pos0, iterations = n_samples_per_walker, storechain = False):
140  yield coordslist
141  if progressbar is not None:
142  progressbar.increment()
143  if n_samples_per_walker and sampler.acceptance_fraction.min() < 0.5:
144  print >>sys.stderr, "\nwarning: low sampler acceptance fraction (min %g)" % sampler.acceptance_fraction.min()
145 
146 
147 def binned_rates_from_samples(samples):
148  """
149  Construct and return a BinnedArray containing a histogram of a
150  sequence of samples. If limits is None (default) then the limits
151  of the binning will be determined automatically from the sequence,
152  otherwise limits is expected to be a tuple or other two-element
153  sequence providing the (low, high) limits, and in that case the
154  sequence can be a generator.
155  """
156  lo, hi = math.floor(samples.min()), math.ceil(samples.max())
157  nbins = len(samples) // 729
158  binnedarray = rate.BinnedArray(rate.NDBins((rate.LinearBins(lo, hi, nbins),)))
159  for sample in samples:
160  binnedarray[sample,] += 1.
161  rate.filter_array(binnedarray.array, rate.gaussian_window(5))
162  numpy.clip(binnedarray.array, 0.0, PosInf, binnedarray.array)
163  return binnedarray
164 
165 
166 def calculate_rate_posteriors(ranking_data, ln_likelihood_ratios, progressbar = None, chain_file = None, nsample = 400000):
167  """
168  FIXME: document this
169  """
170  #
171  # check for bad input
172  #
173 
174  if any(math.isnan(ln_lr) for ln_lr in ln_likelihood_ratios):
175  raise ValueError("NaN log likelihood ratio encountered")
176  if nsample < 0:
177  raise ValueError("nsample < 0: %d" % nsample)
178 
179  #
180  # for each sample of the ranking statistic, evaluate the ratio of
181  # the signal ranking statistic PDF to background ranking statistic
182  # PDF.
183  #
184 
185  if ranking_data is not None:
186  f = ranking_data.signal_likelihood_pdfs[None]
187  b = ranking_data.background_likelihood_pdfs[None]
188  ln_f_over_b = numpy.log(numpy.array([f[ln_lr,] / b[ln_lr,] for ln_lr in ln_likelihood_ratios]))
189 
190  # remove NaNs. these occur because the ranking statistic
191  # PDFs have been zeroed at the cut-off and some events get
192  # pulled out of the database with ranking statistic values
193  # in that bin
194  #
195  # FIXME: re-investigate the decision to zero the bin at
196  # threshold. the original motivation for doing it might
197  # not be there any longer
198  ln_f_over_b = ln_f_over_b[~numpy.isnan(ln_f_over_b)]
199  # safety check
200  if numpy.isinf(ln_f_over_b).any():
201  raise ValueError("infinity encountered in ranking statistic PDF ratios")
202  elif nsample > 0:
203  raise ValueError("must supply ranking data to run MCMC sampler")
204  else:
205  # no-op path
206  ln_f_over_b = numpy.array([])
207 
208  #
209  # initialize MCMC chain. try loading a chain from a chain file if
210  # provided, otherwise seed the walkers for a burn-in period
211  #
212 
213  ndim = 2
214  nwalkers = 10 * 2 * ndim # must be even and >= 2 * ndim
215  nburn = 1000
216 
217  pos0 = numpy.zeros((nwalkers, ndim), dtype = "double")
218 
219  if progressbar is not None:
220  progressbar.max = nsample + nburn
221  progressbar.show()
222 
223  i = 0
224  if chain_file is not None and "chain" in chain_file:
225  chain = chain_file["chain"].values()
226  length = sum(sample.shape[0] for sample in chain)
227  samples = numpy.empty((max(nsample, length), nwalkers, ndim), dtype = "double")
228  if progressbar is not None:
229  progressbar.max = samples.shape[0]
230  # load chain from HDF file
231  for sample in chain:
232  samples[i:i+sample.shape[0],:,:] = sample
233  i += sample.shape[0]
234  if progressbar is not None:
235  progressbar.update(i)
236  if i:
237  # skip burn-in, restart chain from last position
238  nburn = 0
239  pos0 = samples[i - 1,:,:]
240  elif nsample <= 0:
241  raise ValueError("no chain file to load or invalid chain file, and no new samples requested")
242  if not i:
243  # no chain file provided, or file does not contain sample
244  # chain or sample chain is empty. still need burn-in.
245  # seed signal rate walkers from exponential distribution,
246  # background rate walkers from poisson distribution
247  samples = numpy.empty((nsample, nwalkers, ndim), dtype = "double")
248  pos0[:,0] = numpy.random.exponential(scale = 1., size = (nwalkers,))
249  pos0[:,1] = numpy.random.poisson(lam = len(ln_likelihood_ratios), size = (nwalkers,))
250  i = 0
251 
252  #
253  # run MCMC sampler to generate (foreground rate, background rate)
254  # samples. to improve the measurement of the tails of the PDF
255  # using the MCMC sampler, we draw from the square root of the PDF
256  # and then correct the histogram of the samples (see below).
257  #
258 
259  log_posterior = posterior(ln_f_over_b)
260 
261  for j, coordslist in enumerate(run_mcmc(nwalkers, ndim, max(0, nsample - i), (lambda x: log_posterior(x) / 2.), n_burn = nburn, pos0 = pos0, progressbar = progressbar), i):
262  # coordslist is nwalkers x ndim
263  samples[j,:,:] = coordslist
264  # dump samples to the chain file every 2048 steps
265  if j + 1 >= i + 2048 and chain_file is not None:
266  chain_file["chain/%08d" % i] = samples[i:j+1,:,:]
267  chain_file.flush()
268  i = j + 1
269  # dump any remaining samples to chain file
270  if chain_file is not None and i < samples.shape[0]:
271  chain_file["chain/%08d" % i] = samples[i:,:,:]
272  chain_file.flush()
273 
274  #
275  # safety check
276  #
277 
278  if samples.min() < 0:
279  raise ValueError("MCMC sampler yielded negative rate(s)")
280 
281  #
282  # compute marginalized PDFs for the foreground and background
283  # rates. for each PDF, the samples from the MCMC are histogrammed
284  # and convolved with a density estimation kernel. the samples have
285  # been drawn from the square root of the correct PDF, so the counts
286  # in the bins must be corrected before converting to a normalized
287  # PDF. how to correct count:
288  #
289  # correct count = (correct PDF) * (bin size)
290  # = (measured PDF)^2 * (bin size)
291  # = (measured count / (bin size))^2 * (bin size)
292  # = (measured count)^2 / (bin size)
293  #
294  # this assumes small bin sizes.
295  #
296 
297  Rf_pdf = binned_rates_from_samples(samples[:,:,0].flatten())
298  Rf_pdf.array = Rf_pdf.array**2. / Rf_pdf.bins.volumes()
299  Rf_pdf.to_pdf()
300 
301  Rb_pdf = binned_rates_from_samples(samples[:,:,1].flatten())
302  Rb_pdf.array = Rb_pdf.array**2. / Rb_pdf.bins.volumes()
303  Rb_pdf.to_pdf()
304 
305  #
306  # done
307  #
308 
309  return Rf_pdf, Rb_pdf
310 
311 
312 def moment_from_pdf(binnedarray, moment, c = 0.):
313  if len(binnedarray.bins) != 1:
314  raise ValueError("BinnedArray PDF must have 1 dimension")
315  x = binnedarray.bins.centres()[0]
316  dx = binnedarray.bins.upper()[0] - binnedarray.bins.lower()[0]
317  return ((x - c)**moment * binnedarray.array * dx).sum()
318 
319 
320 def mean_from_pdf(binnedarray):
321  return moment_from_pdf(binnedarray, 1.)
322 
323 
324 def variance_from_pdf(binnedarray):
325  return moment_from_pdf(binnedarray, 2., mean_from_pdf(binnedarray))
326 
327 
328 def confidence_interval_from_binnedarray(binned_array, confidence = 0.95):
329  """
330  Constructs a confidence interval based on a BinnedArray object
331  containing a normalized 1-D PDF. Returns the tuple (mode, lower bound,
332  upper bound).
333  """
334  # check for funny input
335  if numpy.isnan(binned_array.array).any():
336  raise ValueError("NaNs encountered in rate PDF")
337  if numpy.isinf(binned_array.array).any():
338  raise ValueError("infinities encountered in rate PDF")
339  if (binned_array.array < 0.).any():
340  raise ValueError("negative values encountered in rate PDF")
341  if not 0.0 <= confidence <= 1.0:
342  raise ValueError("confidence must be in [0, 1]")
343 
344  mode_index = numpy.argmax(binned_array.array)
345 
346  centres, = binned_array.centres()
347  upper = binned_array.bins[0].upper()
348  lower = binned_array.bins[0].lower()
349  bin_widths = upper - lower
350  if (bin_widths <= 0.).any():
351  raise ValueError("rate PDF bin sizes must be positive")
352  assert not numpy.isinf(bin_widths).any(), "infinite bin sizes not supported"
353  P = binned_array.array * bin_widths
354  if abs(P.sum() - 1.0) > 1e-13:
355  raise ValueError("rate PDF is not normalized")
356 
357  li = ri = mode_index
358  P_sum = P[mode_index]
359  while P_sum < confidence:
360  if li <= 0 and ri >= len(P) - 1:
361  raise ValueError("failed to achieve requested confidence")
362  P_li = P[li - 1] if li > 0 else 0.
363  P_ri = P[ri + 1] if ri < len(P) - 1 else 0.
364  assert P_li >= 0. and P_ri >= 0.
365  if P_li > P_ri:
366  P_sum += P_li
367  li -= 1
368  elif P_ri > P_li:
369  P_sum += P_ri
370  ri += 1
371  else:
372  P_sum += P_li + P_ri
373  li = max(li - 1, 0)
374  ri = min(ri + 1, len(P) - 1)
375  return centres[mode_index], lower[li], upper[ri]