gstlal-inspiral  0.4.2
gstlal_inspiral_rate_posterior
#!/usr/bin/env python
#
# Copyright (C) 2013,2014 Kipp Cannon, Chad Hanna, Jacob Peoples
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

## @file
# A program to compute the signal and noise rate posteriors of a gstlal inspiral analysis
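#
# Example invocation (the file names and sample count below are
# illustrative, not part of the program):
#
#     gstlal_inspiral_rate_posterior --likelihood-cache likelihood.cache --samples 400000 --verbose triggers.sqlite
#
# The rate posteriors are written to rate_posteriors.xml.gz, and the signal
# rate posterior is plotted in rate_posteriors.png and rate_posteriors.pdf.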


#
# =============================================================================
#
#                                   Preamble
#
# =============================================================================
#


import bisect
try:
    from fpconst import NegInf
except ImportError:
    NegInf = float("-inf")
import h5py
import math
import matplotlib
matplotlib.rcParams.update({
    "font.size": 8.0,
    "axes.titlesize": 10.0,
    "axes.labelsize": 10.0,
    "xtick.labelsize": 8.0,
    "ytick.labelsize": 8.0,
    "legend.fontsize": 8.0,
    "figure.dpi": 600,
    "savefig.dpi": 600,
    "text.usetex": True
})
from matplotlib import figure
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
import numpy
from optparse import OptionParser
import sqlite3
import sys


from glue import segments
from glue.lal import CacheEntry
from glue.ligolw import ligolw
from glue.ligolw import dbtables
from glue.ligolw import lsctables
from glue.ligolw import utils as ligolw_utils
from glue.ligolw.utils import process as ligolw_process
from glue.text_progress_bar import ProgressBar
from pylal import ligolw_thinca


from gstlal import far
from gstlal import rate_estimation


golden_ratio = (1. + math.sqrt(5.)) / 2.


process_name = u"gstlal_inspiral_rate_posterior"


__author__ = "Kipp Cannon <kipp.cannon@ligo.org>"
__version__ = "git id %s" % "" # FIXME
__date__ = "" # FIXME


#
# =============================================================================
#
#                                 Command Line
#
# =============================================================================
#


def parse_command_line():
    parser = OptionParser(
        version = "Name: %%prog\n%s" % "" # FIXME
    )
    parser.add_option("-c", "--confidence-intervals", metavar = "confidence[,...]", default = "0.68,0.95,0.999999", help = "Compute and report confidence intervals in the signal count for these confidences (default = \"0.68,0.95,0.999999\", clear to disable).")
    parser.add_option("-i", "--input-cache", metavar = "filename", help = "Also process the files named in this LAL cache. See lalapps_path2cache for information on how to produce a LAL cache file.")
    parser.add_option("--chain-file", metavar = "filename", help = "Read chain from this file, save chain to this file.")
    parser.add_option("--likelihood-file", metavar = "filename", action = "append", help = "Load ranking statistic PDFs from this file. Can be given multiple times.")
    parser.add_option("--likelihood-cache", metavar = "filename", help = "Load ranking statistic PDFs from the files in this LAL cache.")
    parser.add_option("-t", "--tmp-space", metavar = "path", help = "Path to a directory suitable for use as a work area while manipulating the database file. The database file will be worked on in this directory, and then moved to the final location when complete. This option is intended to improve performance when running in a networked environment, where there might be a local disk with higher bandwidth than is available to the filesystem on which the final output will reside.")
    parser.add_option("--with-background", action = "store_true", help = "Show background posterior on plot.")
    parser.add_option("--samples", metavar = "count", type = "int", help = "Run this many samples. Set to 0 to load and plot the contents of a previously-recorded chain file without doing any additional samples.")
    parser.add_option("--threshold", metavar = "log likelihood ratio", type = "float", help = "Derive the rate posterior by considering only events ranked at or above this value of the log likelihood ratio ranking statistic (default = use all events).")
    parser.add_option("-v", "--verbose", action = "store_true", help = "Be verbose.")
    options, filenames = parser.parse_args()

    paramdict = options.__dict__

    options.confidence_intervals = map(float, options.confidence_intervals.split(","))

    options.likelihood_filenames = []
    if options.likelihood_cache is not None:
        options.likelihood_filenames += [CacheEntry(line).path for line in open(options.likelihood_cache)]
    if options.likelihood_file is not None:
        options.likelihood_filenames += options.likelihood_file
    if not options.likelihood_filenames and options.samples > 0:
        raise ValueError("must provide --likelihood-cache and/or one or more --likelihood-file options")

    if options.input_cache:
        filenames += [CacheEntry(line).path for line in open(options.input_cache)]

    return options, paramdict, filenames


#
# =============================================================================
#
#                              Support Functions
#
# =============================================================================
#


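#
# Sum the ranking statistic PDFs from the input files; if a ranking
# statistic threshold is given, the histograms are zeroed below the
# threshold before .finish() is called.
#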
def load_ranking_data(filenames, ln_likelihood_ratio_threshold = None, verbose = False):
    if not filenames:
        raise ValueError("no likelihood files!")
    ranking_data = None
    for n, filename in enumerate(filenames, start = 1):
        if verbose:
            print >>sys.stderr, "%d/%d:" % (n, len(filenames)),
        xmldoc = ligolw_utils.load_filename(filename, contenthandler = far.ThincaCoincParamsDistributions.LIGOLWContentHandler, verbose = verbose)
        ignored, this_ranking_data, ignored = far.parse_likelihood_control_doc(xmldoc)
        xmldoc.unlink()
        if this_ranking_data is None:
            raise ValueError("'%s' does not contain ranking statistic PDFs" % filename)
        if ranking_data is None:
            ranking_data = this_ranking_data
        else:
            ranking_data += this_ranking_data
    # effect the zeroing of the PDFs below threshold by hacking the
    # histograms before running .finish().  do the indexing ourselves so
    # that the bin containing the threshold itself is not zeroed
    if ln_likelihood_ratio_threshold is not None:
        for binnedarray in ranking_data.background_likelihood_rates.values():
            binnedarray.array[:binnedarray.bins[0][ln_likelihood_ratio_threshold]] = 0.
        for binnedarray in ranking_data.signal_likelihood_rates.values():
            binnedarray.array[:binnedarray.bins[0][ln_likelihood_ratio_threshold]] = 0.
        for binnedarray in ranking_data.zero_lag_likelihood_rates.values():
            binnedarray.array[:binnedarray.bins[0][ln_likelihood_ratio_threshold]] = 0.

    ranking_data.finish(verbose = verbose)

    return ranking_data


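#
# Collect the log likelihood ratios of the candidates found in the input
# databases, separating background candidates (non-zero time slide
# offsets) from zero-lag candidates.
#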
def load_search_results(filenames, ln_likelihood_ratio_threshold = None, tmp_path = None, verbose = False):
    background_ln_likelihood_ratios = []
    zerolag_ln_likelihood_ratios = []

    for n, filename in enumerate(filenames, 1):
        if verbose:
            print >>sys.stderr, "%d/%d: %s" % (n, len(filenames), filename)
        working_filename = dbtables.get_connection_filename(filename, tmp_path = tmp_path, verbose = verbose)
        connection = sqlite3.connect(working_filename)

        xmldoc = dbtables.get_xml(connection)
        definer_id = lsctables.CoincDefTable.get_table(xmldoc).get_coinc_def_id(ligolw_thinca.InspiralCoincDef.search, ligolw_thinca.InspiralCoincDef.search_coinc_type, create_new = False)

        for ln_likelihood_ratio, is_background in connection.cursor().execute("""
SELECT
    coinc_event.likelihood,
    EXISTS (
        SELECT
            *
        FROM
            time_slide
        WHERE
            time_slide.time_slide_id == coinc_event.time_slide_id
            AND time_slide.offset != 0
    )
FROM
    coinc_event
    JOIN coinc_inspiral ON (
        coinc_inspiral.coinc_event_id == coinc_event.coinc_event_id
    )
WHERE
    coinc_event.coinc_def_id == ?
    AND coinc_event.likelihood >= ?"""
        , (definer_id, (ln_likelihood_ratio_threshold if ln_likelihood_ratio_threshold is not None else NegInf))):
            if is_background:
                background_ln_likelihood_ratios.append(ln_likelihood_ratio)
            else:
                zerolag_ln_likelihood_ratios.append(ln_likelihood_ratio)

        connection.close()
        dbtables.discard_connection_filename(filename, working_filename, verbose = verbose)

    return background_ln_likelihood_ratios, zerolag_ln_likelihood_ratios


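#
# Plot the signal rate posterior, shading the requested confidence
# intervals, with darker shading for the smaller (nested) intervals.
#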
def plot_rates(signal_rate, confidence_intervals = None):
    fig = figure.Figure()
    FigureCanvas(fig)
    fig.set_size_inches((4., 4. / golden_ratio))
    signal_axes = fig.gca()

    x = signal_rate.bins[0].centres()
    y = signal_rate.array
    line1, = signal_axes.plot(x, y, color = "k", linestyle = "-", label = "Signal")
    signal_axes.set_title("Event Rate Posterior")
    signal_axes.set_xlabel(r"Event Rate ($\mathrm{signals} / \mathrm{experiment}$)")
    signal_axes.set_ylabel(r"$P(\mathrm{signals} / \mathrm{experiment})$")
    signal_axes.loglog()

    signal_axes.set_ylim((1e-8, 1.))

    if confidence_intervals is not None:
        alphas = dict(zip(sorted(confidence_intervals), [.6, .4, .2]))

        # convert lo and hi bounds to co-ordinate index segments
        confidence_intervals = sorted((conf, segments.segmentlist([segments.segment(bisect.bisect_left(x, lo), bisect.bisect_right(x, hi))])) for conf, (mode, lo, hi) in confidence_intervals.items())

        # remove from each the indexes spanned by lower confidence regions
        confidence_intervals = [(conf, indexes - sum((segs for conf, segs in confidence_intervals[:i]), segments.segmentlist())) for i, (conf, indexes) in enumerate(confidence_intervals)]

        for conf, segs in confidence_intervals:
            for lo, hi in segs:
                signal_axes.fill_between(x[lo:hi], y[lo:hi], 1e-8, color = "k", alpha = alphas[conf])

    try:
        fig.tight_layout()
    except AttributeError:
        # old matplotlib; auto-layout not available
        pass
    return fig


#
# =============================================================================
#
#                                     Main
#
# =============================================================================
#


#
# command line
#


options, paramdict, filenames = parse_command_line()


#
# load ranking statistic PDFs
#


if options.likelihood_filenames:
    ranking_data = load_ranking_data(options.likelihood_filenames, ln_likelihood_ratio_threshold = options.threshold, verbose = options.verbose)
else:
    ranking_data = None


#
# load search results
#


background_ln_likelihood_ratios, zerolag_ln_likelihood_ratios = load_search_results(filenames, ln_likelihood_ratio_threshold = options.threshold, tmp_path = options.tmp_space, verbose = options.verbose)


#
# calculate rate posteriors
#


if options.verbose:
    print >>sys.stderr, "calculating rate posteriors using %d likelihood ratios ..." % len(zerolag_ln_likelihood_ratios)
    progressbar = ProgressBar()
else:
    progressbar = None
kwargs = {}
if options.chain_file is not None:
    kwargs["chain_file"] = h5py.File(options.chain_file)
if options.samples is not None:
    kwargs["nsamples"] = options.samples
signal_rate_pdf, noise_rate_pdf = rate_estimation.calculate_rate_posteriors(ranking_data, zerolag_ln_likelihood_ratios, progressbar = progressbar, **kwargs)
del progressbar


#
# find confidence intervals
#


if options.confidence_intervals:
    if options.verbose:
        print >>sys.stderr, "determining confidence intervals ..."
    confidence_intervals = dict((conf, rate_estimation.confidence_interval_from_binnedarray(signal_rate_pdf, conf)) for conf in options.confidence_intervals)
else:
    confidence_intervals = None
if options.verbose and confidence_intervals is not None:
    print >>sys.stderr, "rate posterior mean = %g signals/experiment" % rate_estimation.mean_from_pdf(signal_rate_pdf)
    # all modes are the same; pick one and report it
    print >>sys.stderr, "maximum-likelihood rate = %g signals/experiment" % confidence_intervals.values()[0][0]
    for conf, (mode, lo, hi) in sorted(confidence_intervals.items()):
        print >>sys.stderr, "%g%% confidence interval = [%g, %g] signals/experiment" % (conf * 100., lo, hi)


#
# save results
#


filename = "rate_posteriors.xml.gz"
xmldoc = ligolw.Document()
xmldoc.appendChild(ligolw.LIGO_LW())
process = ligolw_process.register_to_xmldoc(xmldoc, process_name, paramdict)
xmldoc.childNodes[-1].appendChild(signal_rate_pdf.to_xml(u"%s:signal_pdf" % process_name))
xmldoc.childNodes[-1].appendChild(noise_rate_pdf.to_xml(u"%s:noise_pdf" % process_name))
ligolw_utils.write_filename(xmldoc, filename, gz = (filename or "stdout").endswith(".gz"), verbose = options.verbose)


fig = plot_rates(signal_rate_pdf, confidence_intervals = confidence_intervals)
for filename in ("rate_posteriors.png", "rate_posteriors.pdf"):
    if options.verbose:
        print >>sys.stderr, "writing %s ..." % filename
    fig.savefig(filename)

if options.verbose:
    print >>sys.stderr, "done"